现在的位置: 首页 > 综合 > 正文

VC之网页获取(能区分GB2312与UTF8)

2012年12月07日 ⁄ 综合 ⁄ 共 4074字 ⁄ 字号 评论关闭

 

测试版,使用时还需要优化个别地方
代码

int FindCodePage(PBYTE p,int nLen,CString theUrl)
{
    
int nResult = -1;
    UINT u[
4];
    UINT uUTF8Count 
= 0;
    UINT uACPCount 
= 0;
    nResult 
= -1;
    
if(nLen < 8)
        
return nResult;
    
if (p[0== 0xFF && p[1== 0xFE && p[2!= 0xFF)//Unicode
    {
        nResult 
= CP_UTF8;
    }
    
else if (p[0== 0xEF && p[1== 0xBB && p[2== 0xBF)//UTF8
    {
        nResult 
= CP_UTF8;
    }
    
else
    {
        
for(DWORD i=4;i<nLen-4;i++)
        {
            u[
0= p[i];
            u[
1= p[i+1];
            u[
2= p[i+2];
            u[
3= p[i+3];
            
if((u[0]&248==240)   //& B11111000     must be:B11110XXX
            {   
                
if((u[1]&192== 128
                
&&(u[2]&192== 128
                
&&(u[3]&192== 128)
                {
                    nResult 
= CP_UTF8;
                    uUTF8Count
++;
                    i 
+=3;
                    
//break;
                }
                
else
                {
                    nResult 
= CP_ACP;
                    i 
++;
                    uACPCount
++;
                    
break;
                }
            }
            
else if((u[0]&240==224)   //& B11110000     must be:B1110XXXX
            {
                
                
//if((p[i+1] & 192 ==128)
                 
//&&(p[i+2] & 192 ==128))
                if((u[1]&192== 128
                
&&(u[2]&192== 128)
                {
                    nResult 
= CP_UTF8;
                    uUTF8Count
++;
                    i 
+=2;
                    
//break;
                }
                
else// if(u[0]>=128 && u[1] >=128)
                {
                    nResult 
= CP_ACP;
                    i 
++;
                    uACPCount
++;
                    
break;
                }
            }
            
//else if((u[0]&224) ==192)   //& B11100000     must be:B110XXXXX
            
//{
            
//    if((u[1]&192) == 128)
            
//    {
            
//        nResult = CP_UTF8;
            
//        break;
            
//    }
            
//}
            /*else if(p[i]>160)
            {
                if((p[i+1]>160))
                {
                    nResult = CP_ACP;
                    break;
                }
            }
*/
            
        }
    }
    
if(nResult<0)
        nResult 
= CP_ACP;
    
if(uUTF8Count+uACPCount>0)
        TRACE(theUrl
+CString("  PageCode = %d  \n"),nResult);
    
return nResult;
}
//获取网页内容
CString GetSourceHtml(CString theUrl) 
{
    CString retVal;
    CInternetSession session;
    CInternetFile
* file = NULL;
    
    
try
    {
        
// 试着连接到指定URL
        file = (CInternetFile*) session.OpenURL(theUrl);
    }
    
catch (CInternetException* m_pException)
    {
        
// 如果有错误的话,置文件为空
        file = NULL; 
        m_pException
->Delete();
        
return retVal;
    }
    
    
if (file)
    {
        DWORD dwFileLen 
= 2097152;// 2 M
        
//BYTE* pBuf =new byte[81920];
        BYTE* pBuf =new byte[dwFileLen];
        
        DWORD dwReadBytes 
= 0;
        CString  somecode; 
//也可采用LPTSTR类型,将不会删除文本中的\n回车符
        
        
int nCodePage = -1;
        
// 读写网页文件,直到为空
        DWORD dwPos = 0;
        
while(1)
        {
            dwReadBytes 
= file->Read(pBuf+dwPos,4096);
            
if(dwReadBytes <1)
                
break;
            
else
                dwPos 
+= dwReadBytes;
        }
        dwReadBytes 
= dwPos;

        //nCodePage = FindCodePage(pBuf,dwReadBytes);
        nCodePage = FindCodePage(pBuf,dwReadBytes,theUrl);
        
        
//预转换,得到所需空间的大小
        int oldLen = retVal.GetLength();
                
        
int wcsLen = ::MultiByteToWideChar(nCodePage, NULL, (LPCSTR)pBuf,dwReadBytes, NULL, 0);
        
//分配空间要给'\0'留个空间,MultiByteToWideChar不会给'\0'空间
        wchar_t* wszString = new wchar_t[wcsLen + 1];
        memset(wszString,
0,sizeof(wchar_t)*(wcsLen + 1));
        
//转换
        ::MultiByteToWideChar(nCodePage, NULL, (LPCSTR)pBuf, dwReadBytes, wszString, wcsLen);
        
//最后加上'\0'
        
//wszString[wcsLen] = '\0';
        retVal = CString(wszString);
        delete[] wszString;

        file->Close();
        delete file;
        delete pBuf;
    }
    
else
    {
        
return retVal;
    }

    return retVal;
}

 

 

抱歉!评论已关闭.