# -*- coding: utf-8 -*-
"""Toy question answering over a small fact base.

A question is split into words by forward maximum matching against a fixed
dictionary.  ``Query.answer`` then scans the words and returns every fact
whose place equals a word that follows the "I want to go to" phrase.

This file contains non-ASCII string literals and must be saved as UTF-8 so
that it matches the encoding declaration on its first line.  Text literals
are written with the ``u`` prefix so the module behaves the same on Python 2
and Python 3.
"""

# Words known to the tokenizer.  Built once at import time instead of on every
# parse() call, and stored as a set so membership tests are hash lookups.
_DICTIONARY = frozenset([
    u"十七楼",
    u"我",
    u"我要去",
    u"有什么建议吗",
    u"二十楼",
    u"三十楼",
])

# Phrase announcing that the following words name a destination.
_GO_TO = u"我要去"

# States of the scanner in Query.answer().
_STATE_INITIAL = 0
_STATE_EXPECT_PLACE = 1


def u(str):
    """Return *str* as text, decoding byte strings as GBK.

    Args:
        str: A GBK-encoded byte string, or text that is already decoded.
            (The parameter name shadows the builtin; it is kept for
            backward compatibility with keyword callers.)

    Returns:
        The decoded text.  Input that is not a byte string is returned
        unchanged, so already-decoded text is safe to pass in.

    Raises:
        UnicodeDecodeError: If a byte string is not valid GBK.
    """
    if isinstance(str, bytes):
        return str.decode("gbk")
    return str


def recognize_one_token(str, dictionary):
    """Return the length of the longest prefix of *str* found in *dictionary*.

    Args:
        str: The text to match from its first character.
        dictionary: A container of known words (anything supporting ``in``).

    Returns:
        The length of the longest matching prefix, or 0 when no non-empty
        prefix is a known word (including when *str* is empty).
    """
    text = str
    # Try the longest candidate first and shrink one character at a time.
    # A loop rather than recursion, so long inputs cannot exhaust the stack.
    for end in range(len(text), 0, -1):
        if text[:end] in dictionary:
            return end
    return 0


def tokenize(str, dictionary):
    """Split *str* into known words using forward maximum matching.

    Characters that do not start any known word are skipped silently.

    Args:
        str: The text to split.
        dictionary: An iterable container of known words.

    Returns:
        A list of the matched words, in order of appearance.
    """
    text = str
    # No match can be longer than the longest known word, so only that many
    # characters need to be examined at each position.
    longest = max([len(word) for word in dictionary] or [0])
    tokens = []
    position = 0
    while position < len(text):
        window = text[position:position + longest]
        matched = recognize_one_token(window, dictionary)
        if matched == 0:
            # Unknown character: drop it and retry from the next one.
            position += 1
        else:
            tokens.append(text[position:position + matched])
            position += matched
    return tokens


def parse(str):
    """Tokenize a question against the built-in dictionary.

    No grammatical structure is built yet; the result is just the flat list
    of recognized words.

    Args:
        str: The question as decoded (unicode) text.

    Returns:
        A list of recognized words, matched front to back, longest first.
    """
    return tokenize(str, _DICTIONARY)


class Literal(object):
    """A piece of text that can be compared against a condition."""

    def __init__(self, stringval):
        self.m_stringval = stringval

    def content(self):
        """Return the wrapped text."""
        return self.m_stringval

    def match(self, condition):
        """Return True when the wrapped text equals *condition*."""
        return self.content() == condition


class When(Literal):
    """Literal holding a time."""

    def __init__(self, timestring):
        Literal.__init__(self, timestring)


class Where(Literal):
    """Literal holding a place."""

    def __init__(self, placestring):
        Literal.__init__(self, placestring)


class Who(Literal):
    """Literal holding a person."""

    def __init__(self, namestring):
        Literal.__init__(self, namestring)


class Cause(Literal):
    """Literal holding a cause."""

    def __init__(self, causestring):
        Literal.__init__(self, causestring)


class Course(Literal):
    """Literal holding a course of events."""

    def __init__(self, coursestring):
        Literal.__init__(self, coursestring)


class Result(Literal):
    """Literal holding a result."""

    def __init__(self, resultstring):
        Literal.__init__(self, resultstring)


class Fact(object):
    """A fact described by six elements.

    The six-element model is not necessarily a good fit: cause, course and
    result may need to be broken down further.
    """

    def __init__(self, when, where, who, cause, course, result):
        self.m_when = when
        self.m_where = where
        self.m_who = who
        self.m_cause = cause
        self.m_course = course
        self.m_result = result

    def when(self):
        """Return the time element."""
        return self.m_when

    def where(self):
        """Return the place element."""
        return self.m_where

    def who(self):
        """Return the person element."""
        return self.m_who

    def cause(self):
        """Return the cause element."""
        return self.m_cause

    def course(self):
        """Return the course-of-events element."""
        return self.m_course

    def result(self):
        """Return the result element."""
        return self.m_result


class Query(object):
    """A question together with the fact base it is answered from."""

    def __init__(self, question, facts):
        """Tokenize *question* and remember *facts*.

        Args:
            question: The question as decoded (unicode) text.
            facts: A list of Fact objects to search.
        """
        self.m_question = question        # question text
        self.m_tokens = parse(question)   # recognized words of the question
        self.m_facts = facts              # fact base

    def answer(self):
        """Return the facts whose place is named after the "go to" phrase.

        The words are scanned front to back by a two-state machine.  It
        starts in the initial state, where words are ignored.  Seeing the
        "go to" phrase switches it to the expect-place state, in which every
        later word is compared with each fact's ``where()``.  The machine
        never leaves that state once entered.

        Returns:
            A list of matching Fact objects, in scan order; a fact appears
            once for every word it matches.
        """
        state = _STATE_INITIAL
        result = []
        for token in self.m_tokens:
            if token == _GO_TO:
                state = _STATE_EXPECT_PLACE
            elif state == _STATE_EXPECT_PLACE:
                result.extend(
                    fact for fact in self.m_facts if fact.where() == token
                )
        return result


def _demo():
    """Run a small end-to-end example and print the matching facts."""
    facts = [
        Fact(u"4月17号", u"十七楼", u"我", u"需要钱", u"找张三借10块钱",
             u"张三借给了我"),
        Fact(u"4月18号", u"二十楼", u"我", u"要开会", u"要过去", u""),
    ]
    query = Query(u"我要去二十楼", facts)
    for fact in query.answer():
        # A single joined argument prints identically on Python 2 and 3.
        print(u" ".join([
            fact.when(), fact.where(), fact.who(),
            fact.cause(), fact.course(), fact.result(),
        ]))


if __name__ == '__main__':
    _demo()