现在的位置: 首页 > 综合 > 正文

C语言实现简单的倒排文件索引,TF-IDF

2018年02月20日 ⁄ 综合 ⁄ 共 2225字 ⁄ 字号 评论关闭

inver.h文件

#ifndef INVERT_FILE_H
#define INVERT_FILE_H
#include<stdio.h>
#include<stdlib.h>

typedef struct _invertfile_ {
    unsigned int tablelen;
    void **table;
    //unsigned int offset;	
	unsigned int nodecount;

}if_t;
typedef struct _word_{
	unsigned int id;
	unsigned int refered;//
	void *link;
}word_t;
typedef struct _word_frequency_{
	unsigned int d_id;
	unsigned int refered;//the num of referenced in the document
	void *next;
}wf_t;

if_t* invertfile_create(int length);
void invertfile_insert(if_t *h,int w_id,int d_id);
wf_t* invertfile_search(if_t *h,int w_id,int d_id);
void invertfile_traverse(if_t *h);
void invertfile_free(if_t *h);
#endif

invert.cpp

#include"invert.h"
if_t* invertfile_create(int length){
	if_t *h;
	
	h = (if_t *)calloc(1,sizeof(if_t));
    if (NULL == h) return NULL; 	
	
	h->table =(void **)calloc(length,sizeof(void *));
	h->tablelen=length;
	h->nodecount=0;
	word_t *w;
	for(int i=0;i<length;i++){
		h->table[i]=malloc(sizeof(word_t));
		w=(word_t*)h->table[i];
		w->id=i;
		w->refered=0;
		w->link=NULL;
	}
	return h;
}
//check if document d_id have word w_id
wf_t* invertfile_search(if_t *h,int w_id,int d_id){
	word_t *w;
	wf_t	*wf;
	w=(word_t*)h->table[w_id];
	if(w->refered>0){
		wf=(wf_t*)w->link;
		while(wf){
			if(wf->d_id==d_id)return wf;
			wf=(wf_t*)wf->next;
		}
	}
	return NULL;
}
void invertfile_insert(if_t *h,int w_id,int d_id){
	word_t * w=(word_t*)h->table[w_id];
	wf_t * wf;
	if((wf=invertfile_search(h,w_id,d_id))!=NULL){
		wf->refered++;
	}
	else{
		wf=(wf_t *)malloc(sizeof(wf_t));
		wf->next=w->link;
		w->link=wf;
		w->refered++;
		wf->refered++;
		wf->d_id=d_id;
		h->nodecount++;
	}
}
void invertfile_free(if_t *h){
	word_t *w;
	wf_t* wf,*cur;
	for(int i=0;i<h->tablelen;i++){
		w=(word_t*)h->table[i];
		wf=(wf_t*)w->link;
		while(wf!=NULL){
			cur=wf;
			wf=(wf_t*)wf->next;
			free(cur);
		}
		free(w);
	}
	free(h->table);
}
void invertfile_traverse(if_t *h){
	word_t *w;
	wf_t* wf,*cur;
	for(int i=0;i<h->tablelen;i++){
		w=(word_t*)h->table[i];
		wf=(wf_t*)w->link;
		printf("word_id:%d;",w->id);
		while(wf!=NULL){
			cur=wf;
			wf=(wf_t*)wf->next;
			printf("d_id:%d,freq:%d;",cur->d_id,cur->refered);
		}
		printf("\n");
	}
}

测试文件main.cpp

#include"invert.h"
int main(){
	if_t *f=invertfile_create(10);
	invertfile_insert(f,1,1);
	invertfile_insert(f,1,1);
	invertfile_insert(f,1,3);
	invertfile_insert(f,2,5);
	invertfile_traverse(f);
	invertfile_free(f);
}

实验结果:

word_id:0;
word_id:1;d_id:3,freq:1;d_id:1,freq:2;
word_id:2;d_id:5,freq:1;
word_id:3;
word_id:4;
word_id:5;
word_id:6;
word_id:7;
word_id:8;
word_id:9;

抱歉!评论已关闭.