1 # -*- coding: UTF-8-*-
2 import numpy
3 import math
4 import string
5 import matplotlib.pyplot as plt
6 import re
7
def dictionary_found(wordlist):
    """Turn the words produced by a trained model into a word -> probability dict.

    ``wordlist`` is a flat sequence that alternates between a word and its
    probability: ``[word0, prob0, word1, prob1, ...]``. When a word occurs
    more than once, its probabilities are added together.

    Args:
        wordlist: sequence whose even positions hold the words (used as
            dictionary keys) and whose odd positions hold values accepted
            by ``float()``.

    Returns:
        A dict mapping every word to the sum of its probabilities. Values
        are always floats.

    Raises:
        ValueError: if ``wordlist`` has an odd number of items (a word is
            missing its probability) or a probability is not numeric.
    """
    if len(wordlist) % 2 != 0:
        # Fail up front instead of half-way through with an IndexError.
        raise ValueError("wordlist must alternate word, probability")

    word_dictionary1 = {}
    # Pair every even-positioned word with the probability that follows it.
    for word, probability in zip(wordlist[0::2], wordlist[1::2]):
        # The key is the individual word, not the whole list (a list is
        # unhashable and cannot be used as a dictionary key).
        word_dictionary1[word] = word_dictionary1.get(word, 0.0) + float(probability)
    return word_dictionary1
21
def look_into_dic(dictionary, testset):
    """Sum the dictionary probabilities of the distinct words in the test set.

    Every whitespace-separated word of ``testset`` is looked up in
    ``dictionary`` once, no matter how often it occurs. Words that are not
    in the dictionary contribute nothing.

    Args:
        dictionary: mapping of word -> probability; values must be
            accepted by ``float()``.
        testset: the test-set text as a single string.

    Returns:
        The float sum of the probabilities of the distinct test-set words
        found in ``dictionary`` (0.0 when none is found).
    """
    seen = set()  # set membership is O(1); a list here made the loop quadratic
    a = 0.0
    for letter in testset.split():
        if letter in seen:
            continue
        seen.add(letter)
        letter_frequency = dictionary.get(letter)
        if letter_frequency is not None:
            a += float(letter_frequency)
    return a
40
41
def f_testset_word_count(testset):
    """Return the test-set word count, the denominator of the perplexity formula.

    The result is the number of whitespace-separated tokens in ``testset``
    minus the number of newline characters it contains.
    """
    # NOTE(review): subtracting the newline count presumably compensates for
    # one non-word token per line in the test file -- confirm against the
    # actual file format.
    token_total = len(testset.split())
    newline_total = testset.count("\n")
    return token_total - newline_total
46
def f_perplexity(word_frequency, word_count):
    """Compute the perplexity value for one model.

    Evaluates ``exp(-log(word_frequency) / word_count)``.

    Args:
        word_frequency: summed word probability; must be positive because
            its natural logarithm is taken.
        word_count: number of words in the test set; must be non-zero
            because it is used as the divisor.

    Returns:
        The perplexity as a float.
    """
    negative_log = -math.log(word_frequency)
    return math.exp(negative_log / word_count)
54
def graph_draw(topic, perplexity):
    """Draw a line chart of perplexity against the number of topics.

    Args:
        topic: x values, the topic counts.
        perplexity: y values, one perplexity per topic count.
    """
    plt.plot(topic, perplexity, color="red", linewidth=2)
    plt.xlabel("Number of Topic")
    plt.ylabel("Perplexity")
    plt.show()
62
63
# Topic counts that were evaluated (x axis) and the perplexity obtained for
# each of them (y axis).
topic = []
perplexity_list = []

# Test-set file.
with open('/home/alber/lda/GibbsLDA/jd/test.txt', 'r') as f1:
    testset = f1.read()
# The test-set word count is the same for every model, so compute it once.
testset_word_count = f_testset_word_count(testset)

for i in range(14):
    # The topic count recorded for the plot must be the same number that
    # appears in the model file name: 5, 10, ..., 70.
    topic_count = 5 * (i + 1)
    topic.append(topic_count)
    # Model file for this topic count.
    trace = "/home/alber/lda/GibbsLDA/jd/stats/model-final-" + str(topic_count) + ".txt"
    word_list = []
    with open(trace, 'r') as f:
        for line in f:
            # Lines containing "Topic" are skipped; every other line is
            # split into tokens that are appended to the flat word list.
            if "Topic" not in line:
                word_list.extend(line.split())
    word_dictionary = dictionary_found(word_list)
    frequency = look_into_dic(word_dictionary, testset)
    perplexity = f_perplexity(frequency, testset_word_count)
    perplexity_list.append(perplexity)

# Plot once, after every model has been evaluated.
graph_draw(topic, perplexity_list)
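# The script plots a line chart of perplexity against the number of topics.
# Fine-tune the topic count around the inflection point of the curve to find
# the optimal number of topics (the exact position depends on the test set,
# as the chart shows). Experiments show that as long as the number of topics
# is chosen near that point, topic extraction is generally satisfactory.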