
An Analysis of Index Building in CLucene


Reposted from: http://blog.csdn.net/bingfox/archive/2010/07/19/5745363.aspx

 

 Below, we walk through CLucene's index-building process alongside the code.

  (I). Calling the indexing routine from the main function
(1). The IndexFiles() function:
// parameters: the path of the files to index, and the destination path of the index
void IndexFiles(char* path, char* target, const bool clearIndex)

{

    IndexWriter* writer = NULL;

    lucene::analysis::standard::StandardAnalyzer an;

    if (!clearIndex && IndexReader::indexExists(target)){

        if (IndexReader::isLocked(target) ){  // the index directory is created inside this call
            printf("Index was locked... unlocking it.\n");

            IndexReader::unlock(target);

        }

        writer = _CLNEW IndexWriter( target, &an, false);

    }else{

        writer = _CLNEW IndexWriter( target ,&an, true);

    }

    writer->setMaxFieldLength(IndexWriter::DEFAULT_MAX_FIELD_LENGTH);

    writer->setUseCompoundFile(false);  // do not use the compound index file format
    uint64_t str = lucene::util::Misc::currentTimeMillis();

    indexDocs(writer, path);

    writer->optimize();

    writer->close();

    _CLDELETE(writer);

    printf("Indexing took: %d ms.\n\n", (int)(lucene::util::Misc::currentTimeMillis() - str));

}
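
The original post does not show the surrounding main(); below is a minimal sketch of how IndexFiles() might be driven. The command-line handling and the clearIndex value are assumptions for illustration only.

int main(int argc, char* argv[])
{
    if (argc < 3) {
        printf("Usage: indexfiles <source-dir> <index-dir>\n");  // hypothetical usage message
        return 1;
    }
    // argv[1]: directory holding the files to be indexed
    // argv[2]: directory where the index will be written
    IndexFiles(argv[1], argv[2], true /* clearIndex: rebuild from scratch */);
    return 0;
}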
 

   (2). The indexDocs(writer, path) function called from IndexFiles():

void indexDocs(IndexWriter* writer, char* directory)

{

    DIR* dir = opendir(directory);

    if ( dir != NULL ){

        struct dirent* fl;

        struct fileStat buf;

        char path[CL_MAX_DIR];

        strcpy(path,directory);

        strcat(path,PATH_DELIMITERA);

        char* pathP = path + strlen(path);

        fl = readdir(dir);

        while ( fl != NULL ){

            if ( (strcmp(fl->d_name, ".")) && (strcmp(fl->d_name, "..")) ) {

                pathP[0]=0;

                strcat(pathP,fl->d_name);

                int32_t ret = fileStat(path,&buf);

                if ( buf.st_mode & S_IFDIR ) {

                    indexDocs(writer, path );

                }else{

                    // process each document found under the directory
                    Document* doc = FileDocument( path );

                    writer->addDocument(doc);

                    _CLDELETE(doc);

                }

            }

            fl = readdir(dir);

        }

        closedir(dir);

    }else{

        printf( "adding: %s\n", directory);

        Document* doc = FileDocument( directory );

        writer->addDocument( doc );

        _CLDELETE(doc);

    }
}
 

   (3). The Document* FileDocument(const char* f) function:

// first add the fields to the document, then add the document to the IndexWriter
Document* FileDocument(const char* f)

{

    Document* doc = _CLNEW Document();

    TCHAR tf[CL_MAX_DIR];

    STRCPY_AtoT(tf,f,CL_MAX_DIR);

    doc->add( *_CLNEW Field(_T("path"), tf, Field::STORE_YES | Field::INDEX_UNTOKENIZED ) );

    FILE* fh = fopen(f,"r");

    if ( fh != NULL ){

        StringBuffer str;

        int fn = fileno(fh);

        struct stat filestat;

        fstat(fn, &filestat);

        str.reserve(filestat.st_size);

        char abuf[1024];

        TCHAR tbuf[1024];

        size_t r;

        // read up to 1023 bytes per iteration
        do{

            r = fread(abuf,1,1023,fh);

            abuf[r]=0;

            STRCPY_AtoT(tbuf,abuf,r);

            tbuf[r]=0;

            str.append(tbuf);

        }while(r>0);

        fclose(fh);

        doc->add( *_CLNEW Field(_T("contents"),str.getBuffer(), Field::STORE_YES | Field::INDEX_TOKENIZED|Field::TERMVECTOR_WITH_OFFSETS) );

    }

    return doc;

}
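
As a quick illustration of what FileDocument() produces, the stored fields can be read back from the returned Document; Document::get() with this signature is an assumption here, and the file path is purely hypothetical.

Document* doc = FileDocument("/tmp/example.txt");   // hypothetical input file
const TCHAR* path     = doc->get(_T("path"));       // stored, untokenized field
const TCHAR* contents = doc->get(_T("contents"));   // stored, tokenized field
// ... pass the document to IndexWriter::addDocument(), or inspect the values ...
_CLDELETE(doc);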
 

 (II). Into the details of index building

   (1). The Document and Field classes
void Document::add(Field& field)

{

    // fieldList is NULL when the document is first created; once the next field is added it is
    // no longer NULL, which is how the next links between nodes are formed
    // each newly created node is placed at the head of the list
    fieldList = _CLNEW DocumentFieldEnumeration::DocumentFieldList(&field, fieldList);

}
 

      The DocumentFieldEnumeration class (iterator over a document's fields)
// iterator over the fields of a document
class DocumentFieldEnumeration :LUCENE_BASE{  

    class DocumentFieldList :LUCENE_BASE{

    public:

        DocumentFieldList(Field* f, DocumentFieldList* n);  // constructor
        ~DocumentFieldList();

        Field* field;

        DocumentFieldList* next;    // points to the node added before it, so 'prev' would be a more accurate name
    };

    friend class Document;

private:

    const DocumentFieldList* fields;

public:

    DocumentFieldEnumeration(const DocumentFieldList* fl);

    ~DocumentFieldEnumeration();

    bool hasMoreElements() const;

    Field* nextElement();

};
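
hasMoreElements() and nextElement() are only declared above. A sketch of how they can walk the singly linked DocumentFieldList built by Document::add() (not necessarily the library's exact code):

bool DocumentFieldEnumeration::hasMoreElements() const {
    return fields != NULL;                 // more nodes left to visit
}

Field* DocumentFieldEnumeration::nextElement() {
    if (fields == NULL)
        return NULL;
    Field* f = fields->field;              // field of the current node
    fields = fields->next;                 // advance to the node added before it
    return f;
}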
 

     The Document::removeFields() method:

// remove every field with the given name from the list
void Document::removeFields(const TCHAR* name)

{

    CND_PRECONDITION(name != NULL, "name is NULL");

    DocumentFieldEnumeration::DocumentFieldList* previous = NULL;

    DocumentFieldEnumeration::DocumentFieldList* current = fieldList;

    while (current != NULL) {

        if ( _tcscmp(current->field->name(),name) == 0 ){

            if (previous){

                previous->next = current->next;  // unlink the current node by repointing the previous node
            }else

                fieldList = current->next;

            current->next=NULL;
            _CLDELETE(current);

            if ( previous )

                current = previous->next;  // re-establish the current pointer
            else

                current = fieldList;

        }else{

            previous = current;

            current = current->next;

        }

    }

}
 

(2). The IndexWriter::addDocument() method
void IndexWriter::addDocument(Document* doc, Analyzer* analyzer)

{

    CND_PRECONDITION(ramDirectory != NULL,"ramDirectory is NULL");

    if ( analyzer == NULL )

    {

        analyzer = this->analyzer;

    }

    ramDirectory->transStart();

    try {

        // each document added gets a fresh segment name
        char* segmentName = newSegmentName();

        CND_CONDITION(segmentName != NULL, "segmentName is NULL");

        try {

            // ramDirectory: a transactional in-memory directory
            DocumentWriter* dw = _CLNEW DocumentWriter(ramDirectory, analyzer, this );

            CND_CONDITION(dw != NULL, "dw is NULL");

            try {

                dw->addDocument(segmentName, doc);

            } _CLFINALLY(

                _CLDELETE(dw);

            );

            // every document added while indexing produces a new SegmentInfo
            SegmentInfo* si = _CLNEW SegmentInfo(segmentName, 1, ramDirectory);

            CND_CONDITION(si != NULL, "Si is NULL");

            {

                SCOPED_LOCK_MUTEX(THIS_LOCK)

                segmentInfos->add(si);

                // merge segments when necessary
                maybeMergeSegments();

            }

        } _CLFINALLY(

            _CLDELETE_CaARRAY(segmentName);

        );

    } catch (...) {

        ramDirectory->transAbort();

        throw;

    }

    ramDirectory->transCommit();

}
 

(3). The DocumentWriter::addDocument() method invoked above
// add the document into the new segment
void DocumentWriter::addDocument(const char* segment, Document* doc)

{

    CND_PRECONDITION(fieldInfos==NULL, "fieldInfos!=NULL")

        // write field names

        fieldInfos = _CLNEW FieldInfos();

    fieldInfos->add(doc);

    // .fnm: the file that records the field names
    const char* buf = Misc::segmentname(segment, ".fnm");

    fieldInfos->write(directory, buf);

    _CLDELETE_CaARRAY(buf);

    // write field values

    FieldsWriter fieldsWriter(directory, segment, fieldInfos);

    try {

        fieldsWriter.addDocument(doc);

    } _CLFINALLY(fieldsWriter.close());

    //invert doc into postingTable

    clearPostingTable();              // clear postingTable

    // number of fields in the document
    size_t size = fieldInfos->size();

    fieldLengths = _CL_NEWARRAY(int32_t,size);    // init fieldLengths

    fieldPositions = _CL_NEWARRAY(int32_t,size);  // init fieldPositions

    fieldOffsets = _CL_NEWARRAY(int32_t,size);    // init fieldOffsets

    memset(fieldPositions, 0, sizeof(int32_t) * size);

    //initialise fieldBoost array with default boost

    int32_t fbl = fieldInfos->size();

    float_t fbd = doc->getBoost();  // defaults to 1.0f

    fieldBoosts = _CL_NEWARRAY(float_t,fbl);      // init fieldBoosts

    {

        for ( int32_t i=0;i<fbl;i++ )

            fieldBoosts[i] = fbd;

    }

    {

        for ( int32_t i=0;i<fieldInfos->size();i++ )

            fieldLengths[i] = 0;

    }

    // build the inverted postings
    invertDocument(doc);

    // sort postingTable into an array

    Posting** postings = NULL;

    int32_t postingsLength = 0;

    // sort the terms in postingTable and return them as a sorted Posting[] array
    sortPostingTable(postings,postingsLength);

    //write postings

    // write the sorted Posting[] array into the segment files (<segment>.frq and <segment>.prx)

    writePostings(postings,postingsLength, segment);

    //write norms of indexed fields

    // write the norm information of the indexed fields
    writeNorms(segment);

    _CLDELETE_ARRAY( postings );

}
 

(4). The FieldInfos::write() method
// write the field information
void FieldInfos::write(IndexOutput* output) const

{

    // first write the number of fields
    output->writeVInt(size());

    FieldInfo* fi;

    uint8_t bits;

    for (int32_t i = 0; i < size(); ++i) {

        fi = fieldInfo(i);

        bits = 0x0;

        if (fi->isIndexed) bits |= IS_INDEXED;  // bitwise OR: a result bit is 1 if either operand bit is 1, otherwise 0
        if (fi->storeTermVector) bits |= STORE_TERMVECTOR;

        if (fi->storePositionWithTermVector) bits |= STORE_POSITIONS_WITH_TERMVECTOR;

        if (fi->storeOffsetWithTermVector) bits |= STORE_OFFSET_WITH_TERMVECTOR;

        if (fi->omitNorms) bits |= OMIT_NORMS;

        output->writeString(fi->name,_tcslen(fi->name));  // write the field name and its length
        output->writeByte(bits); // write the flag byte
    }

}
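
For reference, the reverse of the flag byte written above: a hedged sketch of how a reader could decode it (the real FieldInfos::read also handles the field count and name; this fragment is illustrative only, and 'input' is a hypothetical IndexInput*).

uint8_t bits = input->readByte();
bool isIndexed                   = (bits & IS_INDEXED) != 0;
bool storeTermVector             = (bits & STORE_TERMVECTOR) != 0;
bool storePositionWithTermVector = (bits & STORE_POSITIONS_WITH_TERMVECTOR) != 0;
bool storeOffsetWithTermVector   = (bits & STORE_OFFSET_WITH_TERMVECTOR) != 0;
bool omitNorms                   = (bits & OMIT_NORMS) != 0;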

 
 

(5). The FieldsWriter::addDocument() method
// write the field values
void FieldsWriter::addDocument(Document* doc)

{

    CND_PRECONDITION(indexStream != NULL,"indexStream is NULL");

    CND_PRECONDITION(fieldsStream != NULL,"fieldsStream is NULL");

    printf("%s=%d\n","fieldsStream->getFilePointer()",(int)fieldsStream->getFilePointer());

    // the index stream records the current file pointer of the fields stream
    indexStream->writeLong(fieldsStream->getFilePointer());

    int32_t storedCount = 0;

    DocumentFieldEnumeration* fields = doc->fields();

    while (fields->hasMoreElements()) {

        Field* field = fields->nextElement();

        if (field->isStored())

        {

            storedCount++;

        }

    }

    _CLDELETE(fields);

    // the fields stream records how many stored fields follow
    fieldsStream->writeVInt(storedCount);

    fields = doc->fields();

    while (fields->hasMoreElements())

    {

        Field* field = fields->nextElement();

        if (field->isStored())

        {

            // write the field number
            fieldsStream->writeVInt(fieldInfos->fieldNumber(field->name())); 

            uint8_t bits = 0;

            if (field->isTokenized())

                bits |= FieldsWriter::FIELD_IS_TOKENIZED;

            if (field->isBinary())

                bits |= FieldsWriter::FIELD_IS_BINARY;

            if (field->isCompressed())

                bits |= FieldsWriter::FIELD_IS_COMPRESSED;

            // write one flag byte: tokenized / binary / compressed
            fieldsStream->writeByte(bits);

            if ( field->isCompressed() ){

                _CLTHROWA(CL_ERR_Runtime, "CLucene does not directly support compressed fields. Write a compressed byte array instead");

            }else{

                if (field->isBinary()) {

                    jstreams::StreamBase<char>* stream = field->streamValue();

                    const char* sd;

                    // read from the stream
                    int32_t rl = stream->read(sd,10000000,0);

                    if ( rl < 0 ){

                        fieldsStream->writeVInt(0);
                    }else{

                        fieldsStream->writeVInt(rl);

                        fieldsStream->writeBytes((uint8_t*)sd, rl);

                    }

                }else if ( field->stringValue() == NULL ){
                    CND_PRECONDITION(!field->isIndexed(), "Cannot store reader if it is indexed too")

                        Reader* r = field->readerValue();

                    const TCHAR* rv;

                    int64_t rl = r->read(rv, LUCENE_INT32_MAX_SHOULDBE);

                    if ( rl > LUCENE_INT32_MAX_SHOULDBE )

                        _CLTHROWA(CL_ERR_Runtime,"Field length too long");

                    else if ( rl < 0 )

                        rl = 0;

                    fieldsStream->writeString( rv, (int32_t)rl);

                }else if ( field->stringValue() != NULL ){

                    // write the string value
                    fieldsStream->writeString(field->stringValue(),_tcslen(field->stringValue()));

                }else

                    _CLTHROWA(CL_ERR_Runtime, "No values are set for the field");

            }

        }

    }

    _CLDELETE(fields);

}
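
Because addDocument() appends exactly one 8-byte long per document to the index stream, a reader can in principle locate a document's stored fields as sketched below. This is an assumption-based illustration, not the actual FieldsReader code; 'indexInput' and 'fieldsInput' are hypothetical IndexInput streams over the index and fields files.

int32_t docID = 5;                              // hypothetical document number
indexInput->seek(docID * 8);                    // each index entry is one fixed-size long
int64_t fieldsPointer = indexInput->readLong(); // pointer into the fields stream
fieldsInput->seek(fieldsPointer);
int32_t storedCount = fieldsInput->readVInt();  // number of stored fields written above
// ... then read field number, flag byte and value for each stored field ...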
 

(6). The DocumentWriter::invertDocument() method
// build the inverted postings for a document
void DocumentWriter::invertDocument(const Document* doc)

{

    DocumentFieldEnumeration* fields = doc->fields();

    try {

        while (fields->hasMoreElements())

        {

            Field* field = (Field*)fields->nextElement();

            const TCHAR*  fieldName = field->name();

            const int32_t fieldNumber = fieldInfos->fieldNumber(fieldName);

            // both are 0 initially
            int32_t length = fieldLengths[fieldNumber];     // length of the field, looked up by the field's number
            int32_t position = fieldPositions[fieldNumber]; // position within the field, looked up by the field's number
            if (length>0)

            {

                position+=analyzer->getPositionIncrementGap(fieldName);

            }

            int32_t offset = fieldOffsets[fieldNumber];       // current offset of the field, looked up by the field's number

            if (field->isIndexed())

            {   // if the field is indexed
                if (!field->isTokenized())

                {   // if the field is not tokenized
                    const TCHAR* charBuf = NULL;

                    int64_t dataLen = 0;

                    if (field->stringValue() == NULL && !field->isStored() )

                    {

                        CL_NS(util)::Reader* r = field->readerValue();

                        dataLen = r->read(charBuf, LUCENE_INT32_MAX_SHOULDBE);

                        if (dataLen == -1)

                            dataLen = 0;

                    } else {

                        charBuf = field->stringValue();

                        dataLen = _tcslen(charBuf);

                    }

                    // the whole field value is stored in the postingTable as a single term;
                    // check whether its offsets should also be kept with the term vector
                    if(field->isStoreOffsetWithTermVector()){

                        TermVectorOffsetInfo tio;

                        tio.setStartOffset(offset);

                        tio.setEndOffset(offset + dataLen);

                        addPosition(fieldName, charBuf, position++, &tio );

                    }

                    else

                    {

                        addPosition(fieldName, charBuf, position++, NULL);

                    }

                    offset += dataLen;  // advance the offset by the data length
                    length++;

                } else { // field must be tokenized
                    CL_NS(util)::Reader* reader; // find or make Reader

                    bool delReader = false;

                    if (field->readerValue() != NULL) {  // the field already supplies a Reader

                        reader = field->readerValue();

                    } else if (field->stringValue() != NULL) {  // build a Reader from the field's string value
                        reader = _CLNEW CL_NS(util)::StringReader(field->stringValue(),_tcslen(field->stringValue()),false);

                        delReader = true;

                    } else {

                        _CLTHROWA(CL_ERR_IO,"field must have either String or Reader value");

                    }

 

                    try {

                        // Tokenize field and add to postingTable.

                        // tokenize the field and add each token to the postingTable
                        CL_NS(analysis)::TokenStream* stream = analyzer->tokenStream(fieldName, reader);

                        try

                        {

                            CL_NS(analysis)::Token t;

                            int32_t lastTokenEndOffset = -1;  // end offset of the previous token
                            while (stream->next(&t))

                            {

                                position += (t.getPositionIncrement() - 1);  // adjust the position by the token's position increment (position++ follows below)
                                // if offsets are stored with the term vector, record them for this token
                                if(field->isStoreOffsetWithTermVector()){

                                    TermVectorOffsetInfo tio;

                                    tio.setStartOffset(offset + t.startOffset());

                                    tio.setEndOffset(offset + t.endOffset());

                                    addPosition(fieldName, t.termText(), position++, &tio);

                                }

                                else

                                {

                                    addPosition(fieldName, t.termText(), position++, NULL);

                                }

                                lastTokenEndOffset = t.endOffset();

                                length++;

                                // Apply field truncation policy.

                                // length: the number of tokens emitted from this field so far
                                if (maxFieldLength != IndexWriter::FIELD_TRUNC_POLICY__WARN) {

                                    if ( length > maxFieldLength) {  // stop once the token count reaches the field's maximum length
                                        break;

                                    }

                                } else if (length > IndexWriter::DEFAULT_MAX_FIELD_LENGTH) {

                                    const TCHAR* errMsgBase =

                                        _T("Indexing a huge number of tokens from a single")

                                        _T(" field (\"%s\", in this case) can cause CLucene")

                                        _T(" to use memory excessively.")

                                        _T("  By default, CLucene will accept only %s tokens")

                                        _T(" from a single field before forcing the")

                                        _T(" client programmer to specify a threshold at")

                                        _T(" which to truncate the token stream.")

                                        _T("  You should set this threshold via")

                                        _T(" IndexReader::maxFieldLength (set to LUCENE_INT32_MAX")

                                        _T(" to disable truncation, or a value to specify maximum number of fields).");

                                    TCHAR defaultMaxAsChar[34];

                                    _i64tot(IndexWriter::DEFAULT_MAX_FIELD_LENGTH,defaultMaxAsChar, 10);

                                    int32_t errMsgLen = _tcslen(errMsgBase)+ _tcslen(fieldName)+ _tcslen(defaultMaxAsChar);

                                    TCHAR* errMsg = _CL_NEWARRAY(TCHAR,errMsgLen+1);

                                    _sntprintf(errMsg, errMsgLen,errMsgBase, fieldName, defaultMaxAsChar);

                                    _CLTHROWT_DEL(CL_ERR_Runtime,errMsg);

                                }

                            } // while token->next

                            if(lastTokenEndOffset != -1 )

                            {

                                offset += lastTokenEndOffset + 1;

                            }

                        } _CLFINALLY (

                            stream->close();

                        _CLDELETE(stream);

                        );

                    } _CLFINALLY (

                        if (delReader) {

                            _CLDELETE(reader);

                        }

                        );

                } // if/else field is to be tokenized

                // save back the per-field length, position, boost and offset bookkeeping
                fieldLengths[fieldNumber] = length;       // save field length

                fieldPositions[fieldNumber] = position;   // save field position

                fieldBoosts[fieldNumber] *= field->getBoost();

                fieldOffsets[fieldNumber] = offset;  // effectively the end offset of this field
            } // if field is to be indexed

        } // while more fields available

    } _CLFINALLY (

        _CLDELETE(fields);

    );

}
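
To make the bookkeeping above concrete, here is a worked example with a hypothetical tokenized field whose value is "hello world hello" (whitespace tokenization assumed, all position increments equal to 1):

//   token    position   startOffset   endOffset
//   hello        0           0             5
//   world        1           6            11
//   hello        2          12            17
//
// addPosition() is called three times; both "hello" tokens end up in the same
// Posting (freq = 2, positions {0, 2}), while "world" gets its own Posting
// (freq = 1, positions {1}). Afterwards length = 3, position = 3 and offset is
// advanced past the last token's end offset.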
 

(7). The DocumentWriter::addPosition() method
void DocumentWriter::addPosition(const TCHAR* field,const TCHAR* text,const int32_t position,TermVectorOffsetInfo* offset)

{

    // set the term buffer to (field, text)
    //typedef CL_NS(util)::CLHashtable<Term*,Posting*,Term::Compare, Term::Equals> PostingTableType;

    termBuffer->set(field,text,false);

    Posting* ti = postingTable.get(termBuffer);

    if (ti != NULL)

    {          
        int32_t freq = ti->freq;

        if (ti->positions.length == freq) {

            // positions array is full, realloc its size

            // grow the array; when the term was first added, freq was 1 with positions.values[0] = position and positions.length = 1

            ti->positions.length = freq*2;

            ti->positions.values = (int32_t*)realloc(ti->positions.values, ti->positions.length * sizeof(int32_t));

        }

        ti->positions.values[freq] = position;        // add new position

        if (offset != NULL)

        {

            if (ti->offsets.length == freq)

            {

                //offsets are grown in the same way as positions
                ti->offsets.length = freq*2;

                ti->offsets.values = (TermVectorOffsetInfo*)realloc(ti->offsets.values, ti->offsets.length * sizeof(TermVectorOffsetInfo));

            }

            ti->offsets[freq] = *offset;

        }

        ti->freq = freq + 1;      // update the term frequency
    } else {                      // word not seen before

        Term* term = _CLNEW Term( field, text, false);

        postingTable.put(term, _CLNEW Posting(term, position, offset));

    }

}
 

     (8). The DocumentWriter::Posting::Posting() constructor
/* Posting constructor */

DocumentWriter::Posting::Posting(Term* t, const int32_t position, TermVectorOffsetInfo* offset)

{

    // initialise a freshly created posting
    freq = 1;  // frequency starts at 1
    term = _CL_POINTER(t);

    positions.values = (int32_t*)malloc(sizeof(int32_t));  // array of stored positions
    positions.values[0] = position;

    positions.length = 1;    // the array length is 1 as well
    if ( offset != NULL )

    {

        this->offsets.values = (TermVectorOffsetInfo*)malloc(sizeof(TermVectorOffsetInfo));

        this->offsets.values[0] = *offset; // store the single offset entry
        this->offsets.length = 1;  // the offsets array length is 1 as well
    }

}
 

   (9). The DocumentWriter::writePostings() method
void DocumentWriter::writePostings(Posting** postings, const int32_t postingsLength, const char* segment)

{

#define __DOCLOSE(obj) if(obj!=NULL){ try{ obj->close(); _CLDELETE(obj);} catch(CLuceneError &e){ierr=e.number();err=e.what();} catch(...){err="Unknown error while closing posting tables";} }

    IndexOutput* freq = NULL;

    IndexOutput* prox = NULL;

    TermInfosWriter* tis = NULL;

    TermVectorsWriter* termVectorWriter = NULL;

    try {

        //open files for inverse index storage

        // .frq: term frequency file
        const char* buf = Misc::segmentname( segment, ".frq");

        freq = directory->createOutput( buf );

        _CLDELETE_CaARRAY( buf );

        // .prx: term position file
        buf = Misc::segmentname( segment, ".prx");

        prox = directory->createOutput( buf );

        _CLDELETE_CaARRAY( buf );

        // TermInfosWriter constructor; termIndexInterval is the term grouping interval
        tis = _CLNEW TermInfosWriter(directory, segment, fieldInfos,termIndexInterval);

        TermInfo* ti = _CLNEW TermInfo();

        const TCHAR* currentField = NULL;

        for (int32_t i = 0; i < postingsLength; i++) {

            Posting* posting = postings[i];

            // write into the term dictionary (.tis) and the fast-lookup table (.tii)
            ti->set(1, freq->getFilePointer(), prox->getFilePointer(), -1);

            tis->add(posting->term, ti);

            int32_t postingFreq = posting->freq;

            if (postingFreq == 1)                 // optimize freq=1

                freq->writeVInt(1);           // set low bit of doc num.

            else {

                freq->writeVInt(0);           // the document number

                freq->writeVInt(postingFreq); // frequency in doc

            }

            int32_t lastPosition = 0;             // write positions

            // positions are written as deltas
            for (int32_t j = 0; j < postingFreq; ++j) {       // use delta-encoding

                prox->writeVInt(posting->positions.values[j] - lastPosition);

                lastPosition = posting->positions.values[j];

            }

            // check to see if we switched to a new field

            const TCHAR* termField = posting->term->field();

            // handle the terms belonging to the current field
            // handle switching to a different field
            if (currentField==NULL||_tcscmp(currentField,termField)!= 0) { //todo, can we do an intern'd check?

                // changing field - see if there is something to save

                currentField = termField;

                FieldInfo* fi = fieldInfos->fieldInfo(currentField);

                //posting information of a field that stores term vectors is written by the TermVectorsWriter class
                if (fi->storeTermVector)

                {

                    if (termVectorWriter == NULL) {

                        //TermVectorsWriter constructor
                        termVectorWriter =_CLNEW TermVectorsWriter(directory, segment, fieldInfos);

                        termVectorWriter->openDocument();

                    }

                    termVectorWriter->openField(currentField);

                } else if (termVectorWriter != NULL) {

                    termVectorWriter->closeField();

                }

            }

            if (termVectorWriter != NULL && termVectorWriter->isFieldOpen())

            {

                termVectorWriter->addTerm(posting->term->text(), postingFreq, &posting->positions, &posting->offsets);

            }

        }

        if (termVectorWriter != NULL)

        {

            termVectorWriter->closeDocument();

        }

        _CLDELETE(ti);

    }_CLFINALLY (

        const char* err=NULL;

    int32_t ierr=0;

    __DOCLOSE(freq);

    __DOCLOSE(prox);

    __DOCLOSE(tis);

    __DOCLOSE(termVectorWriter);

    if ( err != NULL )

        _CLTHROWA(ierr,err);

    );

}
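
A worked example of the delta encoding used above, with hypothetical numbers: suppose a term occurs in the (single) document of this segment with freq = 3 at positions 4, 9 and 13.

//   .frq:  writeVInt(0)            // doc number with low bit 0, i.e. the frequency follows
//          writeVInt(3)            // frequency in the doc
//   .prx:  writeVInt(4 - 0)  = 4
//          writeVInt(9 - 4)  = 5
//          writeVInt(13 - 9) = 4   // only the gaps between positions are stored
//
// Had freq been 1, a single writeVInt(1) in .frq would have encoded the document
// number and the "freq == 1" flag together in the low bit.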
 

   (10). The TermInfosWriter::TermInfosWriter() constructor
TermInfosWriter::TermInfosWriter(Directory* directory, const char* segment, FieldInfos* fis, int32_t interval):

fieldInfos(fis)

{

    CND_PRECONDITION(segment != NULL, "segment is NULL");

    initialise(directory,segment,interval, false);

    // this 'other' writer fills in the .tii file
    other = _CLNEW TermInfosWriter(directory, segment,fieldInfos, interval, true);

    CND_CONDITION(other != NULL, "other is NULL");

    other->other = this;

}
 

   (11). The TermInfosWriter::initialise() method
void TermInfosWriter::initialise(Directory* directory, const char* segment, int32_t interval, bool IsIndex)

{

// the term dictionary is made up of term entries: the .tis file holds the term infos, the .tii file is the fast-lookup (index) table
   // the terms written to .tis are counted; each time the count reaches a grouping interval, that term's info is also saved into the .tii file
    lastTerm = _CLNEW Term;

    CND_CONDITION(lastTerm != NULL, "Could not allocate memory for lastTerm");

    lastTi  = _CLNEW TermInfo();

    CND_CONDITION(lastTi != NULL, "Could not allocate memory for lastTi");

    lastIndexPointer = 0;

    size             = 0;

    isIndex          = IsIndex;

    indexInterval = interval;

    skipInterval = LUCENE_DEFAULT_TERMDOCS_SKIP_INTERVAL;

    //the 'other' writer has isIndex=true; this writer itself is false

    const char* buf = Misc::segmentname(segment, (isIndex ? ".tii" : ".tis"));

    output = directory->createOutput(buf);

    _CLDELETE_CaARRAY(buf);

    output->writeInt(FORMAT);                      // write format

    output->writeLong(0);                          // leave space for size

    output->writeInt(indexInterval);// write indexInterval

    output->writeInt(skipInterval); // write skipInterval

    //Set other to NULL by Default

    other = NULL;

}
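
For reference, the header that initialise() lays down at the start of both the .tis and the .tii file (taken from the calls above; the size placeholder is presumably rewritten with the final term count when the writer is closed):

// writeInt(FORMAT)          -> file format version
// writeLong(0)              -> placeholder for the number of terms
// writeInt(indexInterval)   -> every indexInterval-th .tis term is copied to .tii
// writeInt(skipInterval)    -> skip data is written for terms with docFreq >= skipInterval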
 

(12). The TermInfosWriter::add() method
void TermInfosWriter::add(Term* term, const TermInfo* ti)

{

    CND_PRECONDITION(isIndex || (!isIndex  && term->compareTo(lastTerm) > 0),"term out of order");

    CND_PRECONDITION(ti->freqPointer >= lastTi->freqPointer,"freqPointer out of order");

    CND_PRECONDITION(ti->proxPointer >= lastTi->proxPointer,"proxPointer out of order");

    if (!isIndex && size % indexInterval == 0){

        //this writer has isIndex=false and the term grouping interval has been reached
        other->add(lastTerm, lastTi);

    }

    // write the term
    writeTerm(term);                     

    // write the document frequency
    output->writeVInt(ti->docFreq);     

    //write pointers: the deltas of the freq and prox file pointers, both 0 initially
    output->writeVLong(ti->freqPointer - lastTi->freqPointer);

    output->writeVLong(ti->proxPointer - lastTi->proxPointer);

    if (ti->docFreq >= skipInterval)

    {

        output->writeVInt(ti->skipOffset);

    }

    //the fast-lookup (.tii) writer additionally records a pointer into the other writer's file
    if (isIndex)

    {

        output->writeVLong(other->output->getFilePointer() - lastIndexPointer);

        lastIndexPointer = other->output->getFilePointer(); // write pointer

    }

    lastTi->set(ti); // remember this TermInfo as the previous one
    size++;

}
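
To make the grouping concrete, a worked example under the assumption that indexInterval is 128 (the usual default):

// isIndex == false (the .tis writer), indexInterval == 128:
//   size == 0    -> other->add(lastTerm, lastTi)   // the initial, empty term
//   size == 128  -> other->add(...)                // the 128th term is copied to .tii
//   size == 256  -> other->add(...)
// so roughly one out of every 128 dictionary terms also lands in the .tii fast table.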
 

(13). The TermInfosWriter::writeTerm() method
void TermInfosWriter::writeTerm(Term* term)

{

    // terms arrive in sorted order, so they are written with prefix (delta) encoding
    int32_t start = Misc::stringDifference(lastTerm->text(),lastTerm->textLength(), term->text(),term->textLength());

    int32_t length = term->textLength() - start;

    output->writeVInt(start);             // write the length of the shared prefix
    output->writeVInt(length);            // write the length of the differing suffix
    output->writeChars(term->text(), start, length);  // write the differing characters
    int32_t fieldnum = fieldInfos->fieldNumber(term->field()); // number of the field the term belongs to (written below)
    CND_PRECONDITION(fieldnum>=-1&&fieldnum<fieldInfos->size(),"Fieldnum is out of range");

    output->writeVInt(fieldnum); // write field num

    if ( lastTerm->__cl_refcount == 1 ){

        lastTerm->set(term,term->text());

    }else{

        _CLDECDELETE(lastTerm);

        lastTerm = _CL_POINTER(term);

    }

}
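
Misc::stringDifference is not shown in the post; a hedged sketch of what it presumably computes (the length of the common prefix of the previous and current term text, which is what the prefix encoding above relies on):

int32_t stringDifference(const TCHAR* s1, int32_t len1,
                         const TCHAR* s2, int32_t len2)
{
    int32_t len = len1 < len2 ? len1 : len2;
    for (int32_t i = 0; i < len; ++i) {
        if (s1[i] != s2[i])
            return i;          // first index where the two strings differ
    }
    return len;                // one string is a prefix of the other
}

// Example: lastTerm "index", term "indexes" -> start = 5, length = 2,
// so only the suffix "es" is written to the .tis file.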
 

(14). The TermVectorsWriter::TermVectorsWriter() constructor
TermVectorsWriter::TermVectorsWriter(CL_NS(store)::Directory* directory,

                                     const char* segment,FieldInfos* fieldInfos)

{

    //.tvx: stores a pointer per document into the document data in .tvd
    //.tvd: stores, per document, a table of pointers into the field data in .tvf
    //.tvf: stores each field's terms along with frequencies, positions and offsets
    char fbuf[CL_MAX_NAME];

    strcpy(fbuf,segment);

    char* fpbuf=fbuf+strlen(fbuf);

    strcpy(fpbuf,LUCENE_TVX_EXTENSION);

    tvx = directory->createOutput(fbuf);

    tvx->writeInt(FORMAT_VERSION);

    strcpy(fpbuf,LUCENE_TVD_EXTENSION);

    tvd = directory->createOutput(fbuf);

    tvd->writeInt(FORMAT_VERSION);

    strcpy(fpbuf,LUCENE_TVF_EXTENSION);

    tvf = directory->createOutput(fbuf);

    tvf->writeInt(FORMAT_VERSION);

    this->fieldInfos = fieldInfos;

    currentField = NULL;      // used to tell whether a field is currently open
    currentDocPointer = -1;   // used to tell whether a document is currently open
}
 

(15). The TermVectorsWriter::writeField() method
void TermVectorsWriter::writeField() 

{

    //.tvx: stores a pointer per document into the document data in .tvd
    //.tvd: stores, per document, a table of pointers into the field data in .tvf
    //.tvf: stores each field's terms along with frequencies, positions and offsets
    currentField->tvfPointer = tvf->getFilePointer();

    //System.out.println("Field Pointer: " + currentField.tvfPointer);

    // write the number of terms
    int32_t size = terms.size();

    tvf->writeVInt(size);

    // whether positions / offsets are stored with the term vector
    bool storePositions = currentField->storePositions;

    bool storeOffsets = currentField->storeOffsets;

    uint8_t bits = 0x0;

    if (storePositions)

        bits |= STORE_POSITIONS_WITH_TERMVECTOR;

    if (storeOffsets)

        bits |= STORE_OFFSET_WITH_TERMVECTOR;

    tvf->writeByte(bits);

    const TCHAR* lastTermText = LUCENE_BLANK_STRING;  // the empty string ""

    int32_t lastTermTextLen = 0;

    for (int32_t i = 0; i < size; ++i)

    {

        TVTerm* term = terms[i];

        int32_t start = CL_NS(util)::Misc::stringDifference(lastTermText, lastTermTextLen, term->getTermText(),term->getTermTextLen());

        int32_t length = term->getTermTextLen() - start;

        tvf->writeVInt(start);            // write the length of the shared prefix
        tvf->writeVInt(length);           // write the length of the differing suffix
        tvf->writeChars(term->getTermText(), start, length);  // write the differing characters
        tvf->writeVInt(term->freq);  // write the term frequency
        lastTermText = term->getTermText();

        lastTermTextLen = term->getTermTextLen();

        // the difference between positions and offsets: positions count how many terms lie
        // between two occurrences, offsets count how many characters lie between them
        if(storePositions){

            // store positions with the term vector
            if(term->positions == NULL)

            {

                _CLTHROWA(CL_ERR_IllegalState, "Trying to write positions that are NULL!");

            }

            // use delta encoding for positions

            int32_t position = 0;

            for (int32_t j = 0; j < term->freq; ++j){

                tvf->writeVInt((*term->positions)[j] - position); // only the position delta is stored
                position = (*term->positions)[j];

            }

        }

        if(storeOffsets){

            // store offsets with the term vector
            if(term->offsets == NULL)

            {

                _CLTHROWA(CL_ERR_IllegalState, "Trying to write offsets that are NULL!");

            }

            int32_t position = 0;

            for (int32_t j = 0; j < term->freq; ++j) {

                tvf->writeVInt((*term->offsets)[j].getStartOffset() - position);

                tvf->writeVInt((*term->offsets)[j].getEndOffset() - (*term->offsets)[j].getStartOffset()); //Save the diff between the two.

                position = (*term->offsets)[j].getEndOffset();

            }

        }

    }

}
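
A worked example of the two delta streams above, with a hypothetical term that has freq = 2, positions {3, 7} and offsets {(10,15), (40,45)}:

//   positions:  writeVInt(3 - 0) = 3
//               writeVInt(7 - 3) = 4
//   offsets:    writeVInt(10 - 0)  = 10   // delta from the previous end offset
//               writeVInt(15 - 10) = 5    // length of this occurrence
//               writeVInt(40 - 15) = 25
//               writeVInt(45 - 40) = 5
//
// Positions count intervening terms while offsets count characters, which is
// exactly the distinction drawn in the comment at the top of the loop.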
 

 

 

