现在的位置: 首页 > 综合 > 正文

在分词结果中过滤停止词和数字。

2012年08月14日 ⁄ 综合 ⁄ 共 1292字 ⁄ 字号 评论关闭

BUG质量分析,通过分词,得到了大量的关键词,但是其中存在很多垃圾数据,譬如大量的停止词和数字,因此进行过滤是必不可少的。

#-*- coding: utf-8 -*-

import os
import sys

def func_filter(stopword, bugkey, keyword):
    """Filter stop words and numeric/ASCII-only tokens out of a keyword list.

    Reads the stop-word file and the segmented bug keyword file, drops the
    unwanted lines, and appends the surviving lines to ``keyword``.

    A line of ``bugkey`` is dropped when:
      * it is empty;
      * its second space-separated field is listed in ``stopword``;
      * the whole line is ``<digits> <ASCII letters/digits>`` (a number or a
        plain ASCII token).
    Lines with no second field are kept unless they are empty.
    (Presumably each line is ``<count> <word>`` -- confirm against the
    segmenter output.)

    Files are processed as raw bytes, so no particular text encoding is
    assumed. Output lines are always terminated with ``\\n``.

    Args:
        stopword: Path of the stop-word file, one stop word per line.
        bugkey: Path of the segmented keyword file to filter.
        keyword: Path of the output file; results are appended to it.

    Returns:
        None. If either input file is missing, an error message is printed
        and nothing is written.
    """
    # Local import so this function is self-contained: the module itself only
    # imports os and sys.
    import re

    if not os.path.isfile(stopword) or not os.path.isfile(bugkey):
        print("ERROR : The file , stopword or bugkey doesn't exist !")
        return

    # Strip line endings so matching does not depend on whether a stop word
    # happens to be on the last (newline-less) line, and use a set for O(1)
    # membership tests.
    with open(stopword, 'rb') as stop_file:
        stopwords = set(line.rstrip(b'\r\n') for line in stop_file)
    # A blank line in the stop-word file must not act as a stop word.
    stopwords.discard(b'')

    # Same pattern the former awk pipeline blanked out.
    noise = re.compile(br'^[0-9]+ [0-9A-Za-z]+$')

    kept = []
    with open(bugkey, 'rb') as bug_file:
        for line in bug_file:
            record = line.rstrip(b'\r\n')
            if not record:
                continue
            fields = record.split(b' ')
            # Guard the index: lines without a space have no keyword field.
            if len(fields) > 1 and fields[1] in stopwords:
                continue
            if noise.match(record):
                continue
            kept.append(record + b'\n')

    # Append mode mirrors the former shell ">>" redirection, without passing
    # the path through a shell.
    with open(keyword, 'ab') as out_file:
        out_file.writelines(kept)

    print("CWS_TrimStopWord is OK! Stopword: %s %s ==> %s" % (stopword, bugkey, keyword))


if __name__ == '__main__':
    # Command-line entry point. Exactly three arguments are expected after the
    # script name: the stop-word file, the segmented keyword file, and the
    # output file. Anything else prints the usage line instead.
    if len(sys.argv) != 4:
        # Single-argument print call: valid on both Python 2 and Python 3.
        print("Usage: python CWS_TrimStopWord_new.py [stopword] [bugwordlist] [keyword]")
    else:
        func_filter(sys.argv[1], sys.argv[2], sys.argv[3])

抱歉!评论已关闭.