【 声明:版权所有,欢迎转载,请勿用于商业用途。 联系信箱:feixiaoxing @163.com】
下载网页不难,提取数据其实也不难。前面,我们说到了如何在当当网页中提取title。当然了,不仅仅是当当网页可以提取title,几乎所有的网页都可以提取标题。因为当当是一家电商网站,所以基本上其标题信息和它卖的商品是分不开的。但是,现在我们已经不满足于此了,我们还想从网页中提取价格信息,那应该怎么做呢?
要从网页中提取价格信息,关键是要寻找规律,怎么样又好又快地将价格信息找出来。我们可以随便找一个当当的网页,查看一下它的源代码信息,就会发现这样的数据,大家可以看一下:
<p>当 当 价:<b id="d_price" class="d_price "><span class="yen">¥</span>171.00</b><span class="break"></span></p>
几乎所有当当网页商品中都会保留这样格式的信息,但是数字当然不一样了。我们可以想到一个比较简单的提取方法,就是分成下面两个步骤:(1)寻找“当 当 价”的起始位置;(2)寻找到起始位置后,查找第一个数字信息,就可以发现171.00这些数据了,也就是我们需要的定价数据信息。
#include <stdio.h> #include <windows.h> #include <wininet.h> #include <assert.h> #ifdef ERROR #undef ERROR #endif #define U8 unsigned char #define U32 unsigned int #define STATUS unsigned int #define OK 0 #define ERROR (~0L) #define MAX_BLOCK_SIZE 1024 #define HTTP_NAME_ADDRESS "http://product.dangdang.com/main/product.aspx?product_id=22560249&ref=book-11712-3032_1-63349-0" #pragma comment(lib, "wininet.lib") /* show file content */ static void show_file_content(char* buffer, int size) { while(size --) { printf("%c", *buffer++); } } /* find pattern content */ static STATUS find_pattern_content(char* buffer, char* start, char* end, char** pp_buffer, int* size) { char* prev; char* next; if(NULL == buffer) { return ERROR; } if(NULL == start || NULL == end) { return ERROR; } if(NULL == pp_buffer || 0 == size) { return ERROR; } next = strstr(buffer, start); if(NULL == next) { return ERROR; } prev = next; next += strlen(start); next = strstr(next, end); if(NULL == next) { return ERROR; } *pp_buffer = prev + strlen(start); *size = next - (prev + strlen(start)); return OK; } /* get length of html file */ static int get_file_size(const char* path) { HANDLE hFile; int size = 0; hFile = CreateFile(path, FILE_READ_EA, FILE_SHARE_READ, 0, OPEN_EXISTING, 0, 0); if (hFile != INVALID_HANDLE_VALUE) { size = GetFileSize(hFile, NULL); CloseHandle(hFile); } return size; } /* get all data from html file */ static STATUS get_file_content(const char* path, void** pp_buffer, int* size) { int length; char* buffer; HANDLE hFile; if(NULL == path) { return ERROR; } if(NULL == pp_buffer) { return ERROR; } if(NULL == size) { return ERROR; } length = get_file_size(path); if(0 == length) { return ERROR; } buffer = (char*) malloc(length +1); if(NULL == buffer) { return ERROR; } buffer[length] = '\0'; hFile = fopen(path, "r+b"); if(NULL == hFile) { free(buffer); return ERROR; } fread(buffer, 1, length, hFile); fclose(hFile); *pp_buffer = buffer; *size = length; return OK; } /* implement page 
download */ static STATUS download_web_page(const char* url, const char* path) { U8 buffer[MAX_BLOCK_SIZE]; U32 iNumber; FILE* hFile; HINTERNET hSession; HINTERNET hUrl; STATUS result; hSession = InternetOpen("RookIE/1.0", INTERNET_OPEN_TYPE_PRECONFIG, NULL, NULL, 0); if(NULL == hSession) { return ERROR; } hUrl = InternetOpenUrl(hSession, url, NULL, 0, INTERNET_FLAG_DONT_CACHE, 0); if(NULL == hUrl) { result = ERROR; goto error1; } hFile = fopen(path, "wb"); if(NULL == hFile) { result = ERROR; goto error2; } iNumber = 1; while(iNumber > 0) { InternetReadFile(hUrl, buffer, MAX_BLOCK_SIZE -1, &iNumber); fwrite(buffer, sizeof(char), iNumber, hFile); } fclose(hFile); result = OK; error2: InternetCloseHandle(hUrl); error1: InternetCloseHandle(hSession); return result; } static STATUS is_char_digital(char value) { if(value >= '0' && value <= '9') { return OK; } return ERROR; } /* get product price */ static STATUS find_product_price(char* buffer, char* str, int len) { char* prev; char* next; if(NULL == buffer) { return ERROR; } if(NULL == str || 0 == len) { return ERROR; } memset(str, 0, len); next = strstr(buffer, "当 当 价:"); if(NULL == next) { return ERROR; } next += strlen("当 当 价:"); while(ERROR == is_char_digital(*next)) { next ++; } prev = next; while('<' != *next) { next ++; } memmove(str, prev, next - prev); return OK; } /* entry of programme */ int main(int argc, char* argv[]) { char* buffer; char* begin; int length; int size; char price[16]; /* 0.html is just the start page */ download_web_page(HTTP_NAME_ADDRESS, "E:/0.html"); if(OK == get_file_content("E:/0.html", &buffer, &size)) { memset(price, 0, 16); if(OK == find_pattern_content(buffer, "<title>", "</title>", &begin, &length)) { printf("商品:"); show_file_content(begin, length); printf("\n"); } if(OK == find_product_price(buffer, price, 16)) { printf("当当价:%s\n", price); } free(buffer); } return 1; }