现在的位置: 首页 > 综合 > 正文

搜索引擎的那些事(摘取价格数据)

2013年09月03日 ⁄ 综合 ⁄ 共 3853字 ⁄ 字号 评论关闭

【 声明:版权所有,欢迎转载,请勿用于商业用途。  联系信箱:feixiaoxing @163.com】 

    下载网页不难,提取数据其实也不难。前面,我们说到了如何在当当网页中提取title。当然了,不仅仅是当当网页可以提取title,几乎所有的网页都可以提取标题。因为当当是一家电商网站,所以基本上其标题信息和它卖的商品是分不开的。但是,现在我们已经不满足于此了,我们还想从网页中提取价格信息,那应该怎么做呢?

    要从网页中提取价格信息,关键是要寻找规律,怎么样又好又快地将价格信息找出来。我们可以随便找一个当当的网页,查看一下它的源代码信息,就会发现这样的数据,大家可以看一下:

     <p>当 当 价:<b id="d_price" class="d_price "><span class="yen">&yen;</span>171.00</b><span class="break"></span></p>

    几乎所有当当网页商品中都会保留这样格式的信息,但是数字当然不一样了。我们可以想到一个比较简单的提取方法,就是分成下面两个步骤:(1)寻找“当 当 价”的起始位置;(2)寻找到起始位置后,查找第一个数字信息,就可以发现171.00这些数据了,也就是我们需要的定价数据信息。

#include <assert.h>
#include <limits.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <windows.h>
#include <wininet.h>

/* Discard any ERROR macro already defined by the headers included above so
 * that it can be redefined below. */
#ifdef ERROR
#undef ERROR
#endif

/* Short aliases for the unsigned types used in this file. */
#define U8 unsigned char
#define U32 unsigned int
/* Result type of the helper functions in this file; they return OK or ERROR. */
#define STATUS unsigned int

#define OK 0
/* All bits set. NOTE(review): this constant has type long while STATUS is
 * unsigned int, so `ERROR == status` presumably only behaves as intended when
 * both types have the same width -- confirm for the target compiler. */
#define ERROR (~0L)
/* Size in bytes of the buffer used while downloading a page. */
#define MAX_BLOCK_SIZE 1024
/* Product page that main() downloads. */
#define HTTP_NAME_ADDRESS "http://product.dangdang.com/main/product.aspx?product_id=22560249&ref=book-11712-3032_1-63349-0"
/* Compiler-specific request to link against wininet.lib. */
#pragma comment(lib, "wininet.lib")

/*
 * Print the first `size` bytes of `buffer` to stdout.
 *
 * The data does not have to be NUL-terminated: exactly `size` bytes are
 * written, embedded '\0' bytes included.  A NULL buffer or a non-positive
 * size prints nothing (a negative size used to run far past the buffer).
 */
static void show_file_content(char* buffer, int size)
{
	if(NULL == buffer || size <= 0)
	{
		return;
	}

	/* One bulk write instead of one printf("%c") call per byte. */
	fwrite(buffer, sizeof(char), (size_t)size, stdout);
}

/*
 * Locate the text enclosed by two delimiter strings.
 *
 * Looks for the first occurrence of `start` in `buffer`, then for the first
 * occurrence of `end` after it.  On success *pp_buffer points at the byte
 * that follows `start` (inside `buffer`; nothing is copied) and *size holds
 * the number of bytes between the two delimiters.
 *
 * Returns OK on success.  Returns ERROR when any argument is NULL or when
 * either delimiter cannot be found; the outputs are left untouched then.
 */
static STATUS find_pattern_content(char* buffer, char* start, char* end,  char** pp_buffer, int* size)
{
	char* content;
	char* tail;

	if(NULL == buffer || NULL == start || NULL == end)
	{
		return ERROR;
	}

	if(NULL == pp_buffer || NULL == size)
	{
		return ERROR;
	}

	content = strstr(buffer, start);
	if(NULL == content)
	{
		return ERROR;
	}

	/* Skip the opening delimiter; the payload begins right after it. */
	content += strlen(start);

	tail = strstr(content, end);
	if(NULL == tail)
	{
		return ERROR;
	}

	*pp_buffer = content;
	*size = tail - content;
	return OK;
}

/* get length of html file */
static int get_file_size(const char* path)
{
	HANDLE hFile;
	int size = 0;
		
	hFile = CreateFile(path, FILE_READ_EA, FILE_SHARE_READ, 0, OPEN_EXISTING, 0, 0);
	if (hFile != INVALID_HANDLE_VALUE)
    {
		size = GetFileSize(hFile, NULL);
        CloseHandle(hFile);
    }

	return size;
}

/*
 * Read a whole file into a freshly allocated, NUL-terminated buffer.
 *
 * path      - file to read.
 * pp_buffer - receives the buffer on success; the caller owns it and must
 *             release it with free().
 * size      - receives the number of bytes actually read (terminator not
 *             counted).
 *
 * Returns OK on success.  Returns ERROR when an argument is NULL, the file
 * is missing or empty, memory cannot be allocated, or reading fails; the
 * outputs are left untouched in that case.
 */
static STATUS get_file_content(const char* path, void** pp_buffer, int* size)
{
	int length;
	size_t count;
	char* buffer;
	FILE* hFile;

	if(NULL == path || NULL == pp_buffer || NULL == size)
	{
		return ERROR;
	}

	/* Reject negative sizes too: they would otherwise reach malloc() and the
	 * terminator write below. */
	length = get_file_size(path);
	if(length <= 0)
	{
		return ERROR;
	}

	buffer = malloc((size_t)length + 1);
	if(NULL == buffer)
	{
		return ERROR;
	}

	/* Read-only binary mode: no write access is needed. */
	hFile = fopen(path, "rb");
	if(NULL == hFile)
	{
		free(buffer);
		return ERROR;
	}

	count = fread(buffer, 1, (size_t)length, hFile);
	if(0 == count || ferror(hFile))
	{
		fclose(hFile);
		free(buffer);
		return ERROR;
	}

	fclose(hFile);

	/* Terminate after the bytes really read, so a short read never exposes
	 * uninitialised memory to the string functions used by the callers. */
	buffer[count] = '\0';

	*pp_buffer = buffer;
	*size = (int)count;
	return OK;
}

/* implement page download */
static STATUS download_web_page(const char* url, const char* path)
{
	U8 buffer[MAX_BLOCK_SIZE];
	U32 iNumber;
	FILE* hFile;
	HINTERNET hSession;
	HINTERNET hUrl;
	STATUS result;
	
	hSession = InternetOpen("RookIE/1.0", INTERNET_OPEN_TYPE_PRECONFIG, NULL, NULL, 0);
	if(NULL == hSession)
	{
		return ERROR;
	}
	
	hUrl = InternetOpenUrl(hSession, url, NULL, 0, INTERNET_FLAG_DONT_CACHE, 0);
	if(NULL == hUrl)
	{
		result = ERROR;
		goto error1;
	}
	
	hFile = fopen(path, "wb");
	if(NULL == hFile)
	{
		result = ERROR;
		goto error2;
	}

	iNumber = 1;
	while(iNumber > 0)
	{
		InternetReadFile(hUrl, buffer, MAX_BLOCK_SIZE -1, &iNumber);
		fwrite(buffer, sizeof(char), iNumber, hFile);
	}
	
	fclose(hFile);
	result = OK;
	
error2:
	InternetCloseHandle(hUrl);
	
error1:
	InternetCloseHandle(hSession);

	return result;
}


/* Tell whether `value` is a decimal digit: OK for '0'..'9', ERROR otherwise. */
static STATUS is_char_digital(char value)
{
	if(value < '0' || value > '9')
	{
		return ERROR;
	}

	return OK;
}

/*
 * Extract the product price from a downloaded page.
 *
 * buffer - NUL-terminated page content.
 * str    - output buffer; it is zero-filled first and receives the price
 *          text as a NUL-terminated string on success.
 * len    - capacity of `str` in bytes, terminator included.
 *
 * The price is taken to be the text that starts at the first decimal digit
 * after the price label and ends just before the next '<'.
 *
 * Returns OK on success.  Returns ERROR when an argument is invalid, the
 * label, a digit, or the closing '<' is missing, or the price text does not
 * fit into `str`; `str` is left empty in those cases (when it is usable).
 */
static STATUS find_product_price(char* buffer,  char* str, int len)
{
	static const char label[] = "当 当 价:";
	char* prev;
	char* next;
	size_t count;

	if(NULL == buffer)
	{
		return ERROR;
	}

	if(NULL == str || len <= 0)
	{
		return ERROR;
	}

	memset(str, 0, (size_t)len);
	next = strstr(buffer, label);
	if(NULL == next)
	{
		return ERROR;
	}

	/* Skip the label and any markup up to the first digit, but never walk
	 * past the end of the page. */
	next += sizeof(label) - 1;
	while('\0' != *next && OK != is_char_digital(*next))
	{
		next ++;
	}

	if('\0' == *next)
	{
		return ERROR;
	}

	prev = next;
	while('\0' != *next && '<' != *next)
	{
		next ++;
	}

	/* Without the closing '<' the page is cut off and the number may be
	 * incomplete, so report failure instead of a partial price. */
	if('\0' == *next)
	{
		return ERROR;
	}

	/* Keep one byte for the terminator already written by memset. */
	count = (size_t)(next - prev);
	if(count >= (size_t)len)
	{
		return ERROR;
	}

	memmove(str, prev, count);
	return OK;
}


/*
 * Program entry: download the product page, then print its title and price.
 *
 * Returns 0 when the page was downloaded and loaded, 1 when the download or
 * the file read failed.  A title or price that cannot be found is simply not
 * printed.
 */
int main(int argc, char* argv[])
{
	void* content = NULL;
	char* buffer;
	char* begin;
	int length;
	int size;
	char price[16] = {0};

	(void)argc;
	(void)argv;

	/* 0.html is just the start page */
	if(OK != download_web_page(HTTP_NAME_ADDRESS, "E:/0.html"))
	{
		return 1;
	}

	/* get_file_content expects a void**, so go through a void* temporary
	 * instead of passing the address of a char*. */
	if(OK != get_file_content("E:/0.html", &content, &size))
	{
		return 1;
	}

	buffer = (char*) content;

	if(OK == find_pattern_content(buffer, "<title>", "</title>", &begin, &length))
	{
		printf("商品:");
		show_file_content(begin, length);
		printf("\n");
	}

	if(OK == find_product_price(buffer, price, (int) sizeof(price)))
	{
		printf("当当价:%s\n", price);
	}

	free(buffer);
	return 0;
}

抱歉!评论已关闭.