Clucene索引建立剖析

现在的位置: 首页 > 综合 > 正文

RSS

Clucene索引建立剖析

2013年09月07日 ⁄ 综合 ⁄ 共 4292字 ⁄ 字号小中大 ⁄ 评论关闭

下面我们结合代码,对CLucene建立索引的过程进行剖析。

(一). main函数中调用建立索引的过程

(1). void IndexFiles(char* path, char* target, const bool clearIndex)方法:

// Builds or updates the index stored at `target` from the files under `path`.
//
// Parameters:
//   path       - file or directory to index; handed unchanged to indexDocs().
//   target     - location of the index.
//   clearIndex - when false and an index already exists at `target`, that
//                index is opened (IndexWriter created with `false`);
//                otherwise the IndexWriter is created with `true`.
void IndexFiles(char* path, char* target, const bool clearIndex)
{
    IndexWriter* writer = NULL;
    lucene::analysis::standard::StandardAnalyzer an;

    if (!clearIndex && IndexReader::indexExists(target)) {
        // Original author's note: the index directory is created inside this
        // call. NOTE(review): not verifiable from this snippet — confirm.
        if (IndexReader::isLocked(target)) {
            printf("Index was locked... unlocking it.\n");
            IndexReader::unlock(target);
        }
        writer = _CLNEW IndexWriter(target, &an, false);
    } else {
        writer = _CLNEW IndexWriter(target, &an, true);
    }

    writer->setMaxFieldLength(IndexWriter::DEFAULT_MAX_FIELD_LENGTH);
    writer->setUseCompoundFile(false); // do not use the compound index file format

    const uint64_t startMs = lucene::util::Misc::currentTimeMillis();

    indexDocs(writer, path);
    writer->optimize();
    writer->close();
    _CLDELETE(writer);

    // The elapsed time is 64-bit: print it with a matching conversion instead
    // of "%d", and terminate the line with a real newline escape.
    const uint64_t elapsedMs = lucene::util::Misc::currentTimeMillis() - startMs;
    printf("Indexing took: %llu ms.\n\n", static_cast<unsigned long long>(elapsedMs));
}

(2).void IndexFiles()方法中调用indexDocs(writer, path)方法:

// Adds every file found under `directory` to the index behind `writer`.
//
// When `directory` can be opened with opendir(), each entry except "." and
// ".." is visited: entries whose mode has S_IFDIR set are processed
// recursively, all others are converted with FileDocument() and added to the
// writer. When opendir() fails, `directory` itself is indexed as one file.
void indexDocs(IndexWriter* writer, char* directory)
{
    DIR* dir = opendir(directory);
    if (dir != NULL) {
        struct dirent* fl;
        struct fileStat buf;
        char path[CL_MAX_DIR];

        // Refuse prefixes that would not fit into `path` together with the
        // terminating NUL instead of overflowing the buffer.
        const size_t prefixLen = strlen(directory) + strlen(PATH_DELIMITERA);
        if (prefixLen >= CL_MAX_DIR) {
            printf("skipping (path too long): %s\n", directory);
            closedir(dir);
            return;
        }

        strcpy(path, directory);
        strcat(path, PATH_DELIMITERA);
        char* pathP = path + strlen(path); // where each entry name is written

        for (fl = readdir(dir); fl != NULL; fl = readdir(dir)) {
            // strcmp() returns 0 on equality, so this skips "." and "..".
            if (!strcmp(fl->d_name, ".") || !strcmp(fl->d_name, "..")) {
                continue;
            }

            if (prefixLen + strlen(fl->d_name) >= CL_MAX_DIR) {
                printf("skipping (path too long): %s\n", fl->d_name);
                continue;
            }

            // Overwrite the previous entry name with the current one.
            pathP[0] = 0;
            strcat(pathP, fl->d_name);

            // NOTE(review): assumes fileStat() reports success with 0, like
            // stat() — confirm. On failure `buf` must not be inspected.
            int32_t ret = fileStat(path, &buf);
            if (ret != 0) {
                printf("skipping (cannot stat): %s\n", path);
                continue;
            }

            if (buf.st_mode & S_IFDIR) {
                indexDocs(writer, path);
            } else {
                // Index each regular entry of the directory.
                Document* doc = FileDocument(path);
                writer->addDocument(doc);
                _CLDELETE(doc);
            }
        }
        closedir(dir);
    } else {
        printf("adding: %s\n", directory);

        Document* doc = FileDocument(directory);
        writer->addDocument(doc);
        _CLDELETE(doc);
    }
}

(3). Document* FileDocument(const char* f)方法:

// Creates the Document for the file `f`. Fields are added to the document
// here; the caller then adds the document to the IndexWriter.
//
// Fields:
//   "path"     - the file name, added with STORE_YES | INDEX_UNTOKENIZED.
//   "contents" - the text read from the file, added with STORE_YES |
//                INDEX_TOKENIZED | TERMVECTOR_WITH_OFFSETS. Only present when
//                the file could be opened.
//
// Returns a document allocated with _CLNEW; the callers in this file release
// it with _CLDELETE.
Document* FileDocument(const char* f)
{
    Document* doc = _CLNEW Document();

    TCHAR tf[CL_MAX_DIR];
    STRCPY_AtoT(tf, f, CL_MAX_DIR);
    doc->add(*_CLNEW Field(_T("path"), tf, Field::STORE_YES | Field::INDEX_UNTOKENIZED));

    FILE* fh = fopen(f, "r");
    if (fh != NULL) {
        StringBuffer str;

        // Pre-size the buffer from the file size, but only when fstat()
        // succeeded; otherwise st_size is indeterminate.
        struct stat filestat;
        if (fstat(fileno(fh), &filestat) == 0) {
            str.reserve(filestat.st_size);
        }

        char abuf[1024];
        TCHAR tbuf[1024];
        size_t r;

        // Read up to 1023 bytes per pass so there is always room for the
        // terminator. The final pass reads 0 bytes and appends an empty string.
        do {
            r = fread(abuf, 1, 1023, fh);
            abuf[r] = 0;
            STRCPY_AtoT(tbuf, abuf, r);
            tbuf[r] = 0;
            str.append(tbuf);
        } while (r > 0);

        fclose(fh);

        doc->add(*_CLNEW Field(_T("contents"), str.getBuffer(), Field::STORE_YES | Field::INDEX_TOKENIZED | Field::TERMVECTOR_WITH_OFFSETS));
    }

    return doc;
}

(二).进入建立索引的细节

(1). Document类与Field类

void Document::add(Field& field)
{
    // Push the field onto the front of the list: the new node's `next` is
    // whatever the head was before this call (empty on the very first add),
    // so successive adds chain the nodes together newest-first.
    DocumentFieldEnumeration::DocumentFieldList* const formerHead = fieldList;
    fieldList = _CLNEW DocumentFieldEnumeration::DocumentFieldList(&field, formerHead);
}

文档字段迭代器DocumentFieldEnumeration类

// Enumerator over the fields of a document.

class DocumentFieldEnumeration :LUCENE_BASE{

// Node of the singly linked field list; Document is a friend and builds the
// list from these nodes.
class DocumentFieldList :LUCENE_BASE{

public:

// Constructor. Document::add() passes the field being added as `f` and the
// current head of the list as `n`.
DocumentFieldList(Field* f, DocumentFieldList* n); // constructor

~DocumentFieldList();

// The field held by this node.
Field* field;

DocumentFieldList* next; // arguably better named "prev": new nodes are added at the head, so this links to the node added before this one

};

friend class Document;

private:

// List node held by this enumeration.
const DocumentFieldList* fields;

public:

DocumentFieldEnumeration(const DocumentFieldList* fl);

~DocumentFieldEnumeration();

bool hasMoreElements() const;

Field* nextElement();

};

void Document::removeFields(const TCHAR* name)方法:

//从链表中删除多个重名的字段

void Document::removeFields(const TCHAR* name)

{

CND_PRECONDITION(name != NULL, "name is NULL");

DocumentFieldEnumeration::DocumentFieldList* previous = NULL;

DocumentFieldEnumeration::DocumentFieldList* current = fieldList;

while (current != NULL) {

if ( _tcscmp(current->field->name(),name) == 0 ){

if (previous){

previous->next = current->next; //删除当前指针,修改指针指向

}else

fieldList = current->next;

current->next=NULL;

_CLDELETE(current);

if ( previous )

current = previous->next; //重新设置当前指针

else

current = fieldList;

}else{

previous = current;

current = current->next;

【上篇】笑话之-猴子拣到ip卡
【下篇】Java算法学习（栈操作实例）

作者: cao69893490

该日志由 cao69893490 于11年前发表在综合分类下，最后更新于 2013年09月07日.
转载请注明: Clucene索引建立剖析 | 学步园 +复制链接

抱歉!评论已关闭.

返回首页

（其他合作也可洽谈）

必威体育

必威电竞

学步园