下面,我们将结合代码,对 CLucene 建立索引的过程进行剖析。
(一). main函数中调用建立索引的过程
(1).void IndexFiles()方法:
//参数:索引文件路径,索引后的目标路径 void IndexFiles(char* path, char* target, const bool clearIndex) { IndexWriter* writer = NULL; lucene::analysis::standard::StandardAnalyzer an; if (!clearIndex && IndexReader::indexExists(target)){ if (IndexReader::isLocked(target) ){ //在函数调用里面执行了创建了索引的目录 printf("Index was locked... unlocking it./n"); IndexReader::unlock(target); } writer = _CLNEW IndexWriter( target, &an, false); }else{ writer = _CLNEW IndexWriter( target ,&an, true); } writer->setMaxFieldLength(IndexWriter::DEFAULT_MAX_FIELD_LENGTH); writer->setUseCompoundFile(false); //设置不使用复合索引 uint64_t str = lucene::util::Misc::currentTimeMillis(); indexDocs(writer, path); writer->optimize(); writer->close(); _CLDELETE(writer); printf("Indexing took: %d ms./n/n", lucene::util::Misc::currentTimeMillis() - str); } |
(2).void IndexFiles()方法中调用indexDocs(writer, path)方法:
/**
 * Adds the file at `directory` to the index, or, when `directory` can be
 * opened as a directory, every file found beneath it (recursively).
 *
 * @param writer     index writer that receives one document per file
 * @param directory  path of a directory to walk, or of a single file
 */
void indexDocs(IndexWriter* writer, char* directory)
{
    DIR* dir = opendir(directory);
    if ( dir == NULL ){
        // Could not be opened as a directory: index it as a single file.
        printf( "adding: %s\n", directory);

        Document* doc = FileDocument( directory );
        writer->addDocument( doc );
        _CLDELETE(doc);
        return;
    }

    // path = directory + delimiter; entry names are written at pathP.
    char path[CL_MAX_DIR];
    const size_t prefixLen = strlen(directory) + strlen(PATH_DELIMITERA);
    if ( prefixLen >= static_cast<size_t>(CL_MAX_DIR) ){
        // The prefix alone would overflow the path buffer.
        printf( "path too long, skipping: %s\n", directory);
        closedir(dir);
        return;
    }
    strcpy(path,directory);
    strcat(path,PATH_DELIMITERA);
    char* pathP = path + prefixLen;

    struct fileStat buf;
    for ( struct dirent* fl = readdir(dir); fl != NULL; fl = readdir(dir) ){
        // Skip the self and parent entries.
        if ( strcmp(fl->d_name, ".") == 0 || strcmp(fl->d_name, "..") == 0 )
            continue;

        // Skip entries whose full path would not fit into the buffer.
        if ( prefixLen + strlen(fl->d_name) >= static_cast<size_t>(CL_MAX_DIR) ){
            printf( "path too long, skipping: %s\n", fl->d_name);
            continue;
        }
        strcpy(pathP,fl->d_name);

        // Assumes fileStat follows the stat() convention of returning 0 on
        // success - confirm against the CLucene platform headers. On failure
        // `buf` must not be inspected.
        int32_t ret = fileStat(path,&buf);
        if ( ret != 0 ){
            printf( "cannot stat, skipping: %s\n", path);
            continue;
        }

        if ( buf.st_mode & S_IFDIR ) {
            indexDocs(writer, path );
        }else{
            // Regular entry: build a document for it and add it to the index.
            Document* doc = FileDocument( path );
            writer->addDocument(doc);
            _CLDELETE(doc);
        }
    }
    closedir(dir);
}
(3). Document* FileDocument(const char* f)方法:
//先将字段加入到文档,在将文档加入到IndexWriter中 Document* FileDocument(const char* f) { Document* doc = _CLNEW Document(); TCHAR tf[CL_MAX_DIR]; STRCPY_AtoT(tf,f,CL_MAX_DIR); doc->add( *_CLNEW Field(_T("path"), tf, Field::STORE_YES | Field::INDEX_UNTOKENIZED ) ); FILE* fh = fopen(f,"r"); if ( fh != NULL ){ StringBuffer str; int fn = fileno(fh); struct stat filestat; fstat(fn, &filestat); str.reserve(filestat.st_size); char abuf[1024]; TCHAR tbuf[1024]; size_t r; //每次读取1023字节 do{ r = fread(abuf,1,1023,fh); abuf[r]=0; STRCPY_AtoT(tbuf,abuf,r); tbuf[r]=0; str.append(tbuf); }while(r>0); fclose(fh); doc->add( *_CLNEW Field(_T("contents"),str.getBuffer(), Field::STORE_YES | Field::INDEX_TOKENIZED|Field::TERMVECTOR_WITH_OFFSETS) ); } return doc; } |
(二).进入建立索引的细节
(1). Document类与Field类
// Adds `field` to this document by prepending a new node to the field list.
// The first call finds fieldList empty; every later call links the new node to
// the previous head, so the newest field is always at the front of the list.
void Document::add(Field& field)
{
    DocumentFieldEnumeration::DocumentFieldList* const head =
        _CLNEW DocumentFieldEnumeration::DocumentFieldList(&field, fieldList);
    fieldList = head;
}
文档字段迭代器DocumentFieldEnumeration类
//文档字段迭代器 class DocumentFieldEnumeration :LUCENE_BASE{ class DocumentFieldList :LUCENE_BASE{ public: DocumentFieldList(Field* f, DocumentFieldList* n); //构造函数 ~DocumentFieldList(); Field* field; DocumentFieldList* next; //应该叫做之前的pre指针 }; friend class Document; private: const DocumentFieldList* fields; public: DocumentFieldEnumeration(const DocumentFieldList* fl); ~DocumentFieldEnumeration(); bool hasMoreElements() const; Field* nextElement(); }; |
void Document::removeFields()方法:
//从链表中删除多个重名的字段 void Document::removeFields(const TCHAR* name) { CND_PRECONDITION(name != NULL, "name is NULL"); DocumentFieldEnumeration::DocumentFieldList* previous = NULL; DocumentFieldEnumeration::DocumentFieldList* current = fieldList; while (current != NULL) { if ( _tcscmp(current->field->name(),name) == 0 ){ if (previous){ previous->next = current->next; //删除当前指针,修改指针指向 }else fieldList = current->next; current->next=NULL; _CLDELETE(current); if ( previous ) current = previous->next; //重新设置当前指针 else current = fieldList; }else{ previous = current; current = current->next;
|