def ComputeFreq(wordlist, text):
    """Compute the frequency of every word of ``wordlist`` within ``text``.

    The frequency of a word is the number of non-overlapping substring
    occurrences of that word in ``text`` (``str.count``) divided by the
    number of tokens ``nltk.word_tokenize`` produces for ``text``. Note that
    the numerator is a substring count, not a token match, so ``"cat"`` is
    also counted inside ``"category"``.

    Args:
        wordlist: Iterable of words (strings) to score.
        text: The document, as a single string.

    Returns:
        A list holding one ``{'word': word, 'freq': freq}`` dict per entry
        of ``wordlist``, in the same order. ``freq`` is a float; it is
        ``0.0`` when ``text`` yields no tokens.
    """
    if not wordlist:
        # Nothing to score, so skip tokenizing altogether.
        return []

    # The token count depends only on ``text``: compute it once instead of
    # re-tokenizing the whole document for every word.
    token_total = len(nltk.word_tokenize(text))

    result = []
    for word in wordlist:
        if token_total:
            # float() keeps this a true division on Python 2 as well, where
            # int / int would truncate almost every frequency to 0.
            freq = float(text.count(word)) / token_total
        else:
            # Empty text: report 0.0 rather than raise ZeroDivisionError.
            freq = 0.0
        result.append({'word': word, 'freq': freq})
    return result


def Computetfidf(wordfreq, corpus):
    """Compute TF-IDF scores for words against a corpus of documents.

    For every item, ``idf = log10(len(corpus) / count)`` where ``count`` is
    1 plus the number of documents that contain the word as a substring, and
    ``tfidf = item['freq'] * idf``. Because ``count`` starts at 1, a word
    that occurs in every document gets a negative score.

    Args:
        wordfreq: Iterable of dicts with the keys ``'word'`` and ``'freq'``,
            i.e. the output of ``ComputeFreq``.
        corpus: Sized collection (e.g. a list) of documents, each a string.

    Returns:
        A list of ``{'word': word, 'tfidf': score}`` dicts sorted by
        ``tfidf`` in descending order; items with equal scores keep their
        input order. Every score is ``0.0`` when ``corpus`` is empty.
    """
    doc_total = len(corpus)

    result = []
    for item in wordfreq:
        word = item['word']
        # Starting the count at 1 keeps the ratio below well defined for
        # words that appear in no document at all.
        doc_count = 1 + sum(1 for line in corpus if word in line)
        if doc_total:
            # float() avoids Python 2 integer division, which would turn the
            # ratio into 0 (and log10(0) into a ValueError) for words found
            # in every document.
            idf = math.log10(float(doc_total) / doc_count)
        else:
            # No documents: log10(0) is undefined, so score the word as 0.
            idf = 0.0
        result.append({'word': word, 'tfidf': item['freq'] * idf})

    # key=/reverse= replaces the cmp-style comparator, which Python 3 no
    # longer accepts; the sort stays stable for equal scores.
    result.sort(key=lambda entry: entry['tfidf'], reverse=True)
    return result
第一个函数:计算 word 在 text 中的词频
wordlist 是由 word 组成的 list,text 是对应的 document(Python 中的 string 格式)
第二个函数:计算word在语料库中的TF-IDF
wordfreq 是第一个函数的输出结果,corpus 是由 document(string 格式)组成的 list