为文本建立索引是文本信息处理中的一项重要任务。给定一个由英文单词构成的文件,为文件中的所有单词建立索引,记录每个单词出现的行号以及在每行中出现的次数,并将索引存入一个文件。
%%% Word index builder.
%%%
%%% Scans a text file line by line and, for every word matched by ?PATTERN,
%%% records the line numbers on which the word occurs together with the number
%%% of occurrences on each of those lines. The index is kept in the named ETS
%%% table ?DBNAME as #data{} records keyed by the word.
-module(text_index).
-compile(export_all).
-import(re, [run/2, replace/4]).

%% One index entry: `word' is the key, `line' is a list of
%% {LineNumber, Occurrences} tuples sorted by line number.
-record(data, {word, line = []}).

-define(DBNAME, dataDB).
%% A word is a run of ASCII letters and dots (the dot is inside the character
%% class, so "end." and "end" are indexed as different words).
-define(PATTERN, "[a-zA-Z\.]+").

%%% Database operations

%% Creates the public, named ETS set that holds the index.
%% Raises badarg if the table already exists.
createDB() ->
    ets:new(?DBNAME, [public, named_table, set, {keypos, #data.word}]).

%% Deletes the index table together with its contents.
closeDB() ->
    ets:delete(?DBNAME).

%% Indexes the file `File' into the table created by createDB/0.
%%
%% Returns `ok' when the file was indexed, and also (after printing a message)
%% when the file could not be opened. Returns `{error, Reason}' when reading
%% the opened file failed.
start(File) ->
    case file:open(File, read) of
        {ok, IoDevice} ->
            case readAllText(IoDevice) of
                {error, _Reason} = Error ->
                    %% readAllText/1 has already reported the failure; do not
                    %% try to split an error tuple into lines.
                    Error;
                Content ->
                    processLine(splitLines(Content))
            end;
        _ ->
            io:format("Open the file failed!")
    end.

%% Splits the file content into lines on "\r\n", "\n" or "\r".
%% Empty lines are kept so that the position of a line in the result is its
%% real line number in the file.
splitLines(Content) ->
    re:split(Content, "\r\n|\n|\r", [{return, list}]).

%%% Reads the whole content of the file into memory.

%% Reads everything from `IoDevice', closes the device and returns the content
%% as a flat string, or `{error, Reason}' if a read fails.
readAllText(IoDevice) ->
    readAllText(IoDevice, []).

%% `Content' accumulates the chunks read so far as a nested list; it is
%% flattened once, when end of file is reached.
readAllText(IoDevice, Content) ->
    case file:read(IoDevice, 1024) of
        {ok, Text} ->
            readAllText(IoDevice, [Content, Text]);
        eof ->
            file:close(IoDevice),
            lists:flatten(Content);
        {error, Reason} ->
            io:format("Read file failed! The reason is:~p~n", [Reason]),
            file:close(IoDevice),
            {error, Reason}
    end.

%%% Processes every line of the file and extracts each word.

%% Indexes `Lines', numbering them from 1.
processLine(Lines) ->
    processLine(Lines, 1).

%% Indexes the words of each line under `LineNumber', then moves on to the
%% next line. Lines without any word are skipped but still counted.
processLine([Line | Rest], LineNumber) ->
    case re:run(Line, ?PATTERN, [global]) of
        {match, Matches} ->
            %% With `global' each match is a one-element list [{Start, Len}];
            %% flatten them into a single list of positions.
            Words = splitWords(Line, lists:flatten(Matches)),
            writeDB(Words, LineNumber);
        nomatch ->
            ok
    end,
    processLine(Rest, LineNumber + 1);
processLine([], _LineNumber) ->
    ok.

%% Cuts the words out of `Line'. `MatchItem' is a list of zero-based
%% {Start, Length} positions; the words are returned in the same order.
splitWords(Line, MatchItem) ->
    splitWords(Line, MatchItem, []).

splitWords(Line, [{Start, Length} | Rest], Words) ->
    %% re:run/3 offsets are zero-based, string:substr/3 is one-based.
    Word = string:substr(Line, Start + 1, Length),
    splitWords(Line, Rest, [Word | Words]);
splitWords(_Line, [], Words) ->
    lists:reverse(Words).

%%% Writes the data into the ETS table.

%% Records one occurrence on line `LineNumber' for every word in the list.
%% A word that appears several times in the list is counted several times.
writeDB([Key | Rest], LineNumber) ->
    Lines =
        case ets:lookup(?DBNAME, Key) of
            [#data{line = Value}] -> updateData(Value, LineNumber);
            [] -> [{LineNumber, 1}]
        end,
    ets:insert(?DBNAME, #data{word = Key, line = Lines}),
    writeDB(Rest, LineNumber);
writeDB([], _LineNumber) ->
    ok.

%%% Updates the per-line counters of one word.

%% Returns `LineValue' with the counter of `LineNumber' incremented, or with a
%% new {LineNumber, 1} entry when the line is not present yet. The result is
%% sorted by line number.
updateData(LineValue, LineNumber) ->
    updateData(LineValue, LineNumber, [], false).

%% Walks the list once. `Res' collects the visited entries in reverse order and
%% the last argument tells whether the entry for `LineNumber' was already
%% found and incremented.
updateData([{LineNumber, Times} | Tail], LineNumber, Res, false) ->
    updateData(Tail, LineNumber, [{LineNumber, Times + 1} | Res], true);
updateData([], LineNumber, Res, false) ->
    lists:keysort(1, [{LineNumber, 1} | Res]);
updateData([], _LineNumber, Res, true) ->
    lists:keysort(1, Res);
updateData([H | T], LineNumber, Res, Found) ->
    updateData(T, LineNumber, [H | Res], Found).

%% Indexes the default sample file. Returns `true'.
main() ->
    main("C:\\Users\\elqstux\\Desktop\\wy.py").

%% Creates the index table, indexes `FileName' and deletes the table again,
%% even when indexing raises. Returns `true'.
main(FileName) ->
    createDB(),
    try
        start(FileName)
    after
        closeDB()
    end,
    true.
下面是 updateData/2 的简洁版本,利用 lists 模块的内建函数实现。
%% Concise updateData/2 built on the lists module.
%%
%% `LineValue' is a list of {LineNumber, Occurrences} tuples. Returns the list
%% with the counter of `LineNumber' incremented, or with a new {LineNumber, 1}
%% entry when that line is not present yet. The result is sorted by line
%% number, so a newly added line does not end up in front of earlier lines.
updateData(LineValue, LineNumber) ->
    Times =
        case lists:keyfind(LineNumber, 1, LineValue) of
            {_, Previous} -> Previous + 1;
            false -> 1
        end,
    %% keystore/4 replaces the existing entry or appends a new one.
    Updated = lists:keystore(LineNumber, 1, LineValue, {LineNumber, Times}),
    lists:keysort(1, Updated).