Python中的类实现(TF-IDF-CF)

我想使用 TF-IDF-CF 方法计算词语权重。我在 GitHub 上找到了下面这段代码,但仍然不清楚如何把它应用到我的数据框(DataFrame)上。我的数据集是一个共 1000 行的文本集合。代码如下:

 import math


"""
"""
class FrequencyCalc:
    """Stateless helpers for TF, IDF, TF-IDF and TF-IDF-CF term weighting.

    Every method takes plain Python sequences and returns a new list whose
    entries line up positionally with the input words.
    """

    def tfidfcf(self, tfidfZip, classWordLists):
        """Scale each TF-IDF weight by the share of class documents holding the word.

        Args:
            tfidfZip: Iterable of ``(word, tfidf_weight)`` pairs.
            classWordLists: Sized collection of word containers; each one is
                probed with ``word in container``. Presumably these are the
                documents of a single class -- confirm against the caller.

        Returns:
            A list with ``weight * (ncij / N)`` for every pair, where ``ncij``
            is the number of containers holding the word and ``N`` is
            ``len(classWordLists)``. When ``N`` is 0 every entry is ``0.0``
            instead of raising ``ZeroDivisionError``.
        """
        N = len(classWordLists)
        if N == 0:
            # No class documents at all: the class-frequency factor is
            # undefined, so the word carries no weight.
            return [0.0 for _ in tfidfZip]
        return [
            f * (sum(1 for words in classWordLists if w in words) / N)
            for (w, f) in tfidfZip
        ]

    def tfidf(self, tf, idf):
        """Multiply term frequencies by inverse document frequencies.

        Args:
            tf: Sequence of term-frequency values.
            idf: Sequence of IDF values, indexed in the same order as ``tf``.

        Returns:
            A list with ``tf[i] * idf[i]`` for every index of ``tf``.

        Raises:
            IndexError: If ``idf`` is shorter than ``tf``.
        """
        # Index into idf (rather than zip) so a too-short idf still fails
        # loudly instead of silently truncating the result.
        return [t * idf[i] for i, t in enumerate(tf)]

    def tf(self, wordCount):
        """Compute the relative frequency of each word.

        Args:
            wordCount: Re-iterable collection of ``(word, count)`` pairs;
                each count is passed through ``int()``.

        Returns:
            A list with ``count / total`` per pair, where ``total`` is the sum
            of all counts. When the total is 0 every entry is ``0.0`` instead
            of raising ``ZeroDivisionError``.
        """
        total = self.__totalWords(wordCount)
        if total == 0:
            return [0.0 for _ in wordCount]
        return [int(n) / total for (_, n) in wordCount]

    def idf(self, docWords, wordLists):
        """Compute the base-10 inverse document frequency of each word.

        Args:
            docWords: Iterable of words to score.
            wordLists: Sized collection of word containers; each one is
                probed with ``word in container``.

        Returns:
            A list with ``log10(N / nt)`` per word, where ``N`` is
            ``len(wordLists)`` and ``nt`` is the number of containers holding
            the word. A word found in no container gets ``0.0`` instead of
            raising ``ZeroDivisionError``.
        """
        N = len(wordLists)
        weights = []
        for w in docWords:
            nt = sum(1 for words in wordLists if w in words)
            # nt == 0 means the word was never seen; give it zero weight
            # rather than dividing by zero.
            weights.append(math.log(N / nt, 10) if nt else 0.0)
        return weights

    def __totalWords(self, wordCount):
        """Return the sum of all counts in ``wordCount`` as an int.

        Args:
            wordCount: Iterable of ``(word, count)`` pairs; each count is
                passed through ``int()``.
        """
        return sum(int(n) for (_, n) in wordCount)

请给我一个使用该类的例子。谢谢

zjfshy 回答:Python中的类实现(TF-IDF-CF)

我找到了一个基于类别的 TF-IDF(class-based TF-IDF)实现,代码如下:

def c_tf_idf(documents, m, ngram_range=(1, 1)):
    """Calculate a class-based TF-IDF where m is the number of total documents.

    Args:
        documents: Iterable of strings handed to ``CountVectorizer``.
            Presumably one joined string per class -- confirm against the
            caller.
        m: Numerator of the IDF ratio ``m / term_total``.
        ngram_range: ``(min_n, max_n)`` forwarded to ``CountVectorizer``.

    Returns:
        A ``(tf_idf, count)`` tuple: ``tf_idf`` is an array with one row per
        vocabulary term and one column per entry of ``documents``; ``count``
        is the fitted ``CountVectorizer``.
    """
    count = CountVectorizer(ngram_range=ngram_range, stop_words="english").fit(documents)
    t = count.transform(documents).toarray()

    # Total counted terms per document, used to normalise counts to frequencies.
    w = t.sum(axis=1)
    # A document with no counted terms (e.g. only stop words) has w == 0;
    # leave its column at 0 instead of producing NaN from 0 / 0.
    tf = np.divide(t.T, w, out=np.zeros(t.T.shape, dtype=float), where=w != 0)

    # Total occurrences of each term across all documents.
    sum_t = t.sum(axis=0)
    idf = np.log(np.divide(m, sum_t)).reshape(-1, 1)
    tf_idf = np.multiply(tf, idf)

    return tf_idf, count
本文链接:https://www.f2er.com/2766434.html

大家都在问