现在的位置: 首页 > 综合 > 正文

Clucene索引建立剖析

2013年09月07日 ⁄ 综合 ⁄ 共 4292字 ⁄ 字号 评论关闭

  下面,我们将结合代码,对Clucene建立索引的过程进行剖析。

  (一). main函数中调用建立索引的过程

(1).void IndexFiles()方法:

//参数:索引文件路径,索引后的目标路径

void IndexFiles(char* path, char* target, const bool clearIndex)

{

    IndexWriter* writer = NULL;

    lucene::analysis::standard::StandardAnalyzer an;

    if (!clearIndex && IndexReader::indexExists(target)){

        if (IndexReader::isLocked(target) ){  //在函数调用里面执行了创建了索引的目录

            printf("Index was locked... unlocking it./n");

            IndexReader::unlock(target);

        }

        writer = _CLNEW IndexWriter( target, &an, false);

    }else{

        writer = _CLNEW IndexWriter( target ,&an, true);

    }

    writer->setMaxFieldLength(IndexWriter::DEFAULT_MAX_FIELD_LENGTH);

    writer->setUseCompoundFile(false);  //设置不使用复合索引

    uint64_t str = lucene::util::Misc::currentTimeMillis();

    indexDocs(writer, path);

    writer->optimize();

    writer->close();

    _CLDELETE(writer);

    printf("Indexing took: %d ms./n/n", lucene::util::Misc::currentTimeMillis() - str);

}

   (2).void IndexFiles()方法中调用indexDocs(writer, path)方法:

void indexDocs(IndexWriter* writer, char* directory)

{

    DIR* dir = opendir(directory);

    if ( dir != NULL ){

        struct dirent* fl;

        struct fileStat buf;

        char path[CL_MAX_DIR];

        strcpy(path,directory);

        strcat(path,PATH_DELIMITERA);

        char* pathP = path + strlen(path);

        fl = readdir(dir);

        while ( fl != NULL ){

            if ( (strcmp(fl->d_name, ".")) && (strcmp(fl->d_name, "..")) ) {

                pathP[0]=0;

                strcat(pathP,fl->d_name);

                int32_t ret = fileStat(path,&buf);

                if ( buf.st_mode & S_IFDIR ) {

                    indexDocs(writer, path );

                }else{

                    //处理目录下面的每个文档

                    Document* doc = FileDocument( path );

                    writer->addDocument(doc);

                    _CLDELETE(doc);

                }

            }

            fl = readdir(dir);

        }

        closedir(dir);

    }else{

        printf( "adding: %s/n", directory);

        Document* doc = FileDocument( directory );

        writer->addDocument( doc );

        _CLDELETE(doc);

    }

   (3). Document* FileDocument(const char* f)方法:

//先将字段加入到文档,在将文档加入到IndexWriter

Document* FileDocument(const char* f)

{

    Document* doc = _CLNEW Document();

    TCHAR tf[CL_MAX_DIR];

    STRCPY_AtoT(tf,f,CL_MAX_DIR);

    doc->add( *_CLNEW Field(_T("path"), tf, Field::STORE_YES | Field::INDEX_UNTOKENIZED ) );

    FILE* fh = fopen(f,"r");

    if ( fh != NULL ){

        StringBuffer str;

        int fn = fileno(fh);

        struct stat filestat;

        fstat(fn, &filestat);

        str.reserve(filestat.st_size);

        char abuf[1024];

        TCHAR tbuf[1024];

        size_t r;

        //每次读取1023字节

        do{

            r = fread(abuf,1,1023,fh);

            abuf[r]=0;

            STRCPY_AtoT(tbuf,abuf,r);

            tbuf[r]=0;

            str.append(tbuf);

        }while(r>0);

        fclose(fh);

        doc->add( *_CLNEW Field(_T("contents"),str.getBuffer(), Field::STORE_YES | Field::INDEX_TOKENIZED|Field::TERMVECTOR_WITH_OFFSETS) );

    }

    return doc;

}

 (二).进入建立索引的细节

   (1). Document类与Field

/**
 * Prepends `field` to this document's field list.
 *
 * The list starts out empty; each call allocates a new node whose `next`
 * is the previous head, so the most recently added field is always first.
 */
void Document::add(Field& field)
{
    DocumentFieldEnumeration::DocumentFieldList* const previousHead = fieldList;
    fieldList = _CLNEW DocumentFieldEnumeration::DocumentFieldList(&field, previousHead);
}

      文档字段迭代器DocumentFieldEnumeration

// Iterator over the fields of a document.

class DocumentFieldEnumeration :LUCENE_BASE{  

    // Node of the singly linked field list. Declared before any access
    // specifier, so it is private to DocumentFieldEnumeration.
    class DocumentFieldList :LUCENE_BASE{

    public:

        // Constructor: takes the field for this node and the node to link to.
        DocumentFieldList(Field* f, DocumentFieldList* n);  // constructor

        ~DocumentFieldList();

        // Field held by this node.
        Field* field;

        // Link to the next node. Because Document::add() prepends, this is the
        // node that was the list head before this one was added; the original
        // author notes it would arguably be better named "prev".
        DocumentFieldList* next;    // arguably a "previous" pointer

    };

    // Document refers to the private DocumentFieldList type (it allocates
    // nodes in Document::add()), hence the friendship.
    friend class Document;

private:

    // List node this enumeration refers to.
    // NOTE(review): presumably the current iteration position -- confirm in the implementation.
    const DocumentFieldList* fields;

public:

    // Creates an enumeration from the list node `fl`.
    DocumentFieldEnumeration(const DocumentFieldList* fl);

    ~DocumentFieldEnumeration();

    // Declared here, defined elsewhere.
    // NOTE(review): presumably reports whether another field is available -- confirm.
    bool hasMoreElements() const;

    // Declared here, defined elsewhere.
    // NOTE(review): presumably returns the current field and advances -- confirm.
    Field* nextElement();

};

     void Document::removeFields()方法:

//从链表中删除多个重名的字段

void Document::removeFields(const TCHAR* name)

{

    CND_PRECONDITION(name != NULL, "name is NULL");

    DocumentFieldEnumeration::DocumentFieldList* previous = NULL;

    DocumentFieldEnumeration::DocumentFieldList* current = fieldList;

    while (current != NULL) {

        if ( _tcscmp(current->field->name(),name) == 0 ){

            if (previous){

                previous->next = current->next;  //删除当前指针,修改指针指向

            }else

                fieldList = current->next;

            current->next=NULL;

            _CLDELETE(current);

            if ( previous )

                current = previous->next;  //重新设置当前指针

            else

                current = fieldList;

        }else{

            previous = current;

            current = current->next;

抱歉!评论已关闭.