Unicode，UTF8，GB2312，UCS2，GBK之间的转换

现在的位置: 首页 > 综合 > 正文

Unicode，UTF8，GB2312，UCS2，GBK之间的转换

2013年01月17日 ⁄ 综合 ⁄ 共 3801字 ⁄ 字号小中大 ⁄ 评论关闭

Unicode，UTF8，GB2312，UCS2，GBK之间的转换

下面整理了几种常用的字符编码转换函数。这类转换平时用得不多，但在实现短信协议时会经常遇到。最近在做短信平台接口，顺手总结了几个，并不全面。

//////////////////////////////////////////////////////////////////////////////

// Form-style URL-encodes a NUL-terminated byte string.
//
// ASCII letters and digits are copied unchanged, a space becomes '+', and
// every other byte (including each byte of a multi-byte Chinese character)
// is written as '%' followed by toHex(high nibble) and toHex(low nibble).
// toHex() is defined elsewhere; it is assumed to map 0..15 to a hex digit.
//
// pInBuf - NUL-terminated input, read one byte at a time.
// szOut  - receives the NUL-terminated result. No buffer size is passed in,
//          so the caller must provide room for the worst case of three
//          output bytes per input byte plus the terminator.
//
// Returns the number of bytes written, not counting the terminator.
int URLEncode(LPCTSTR pInBuf,LPTSTR szOut)
{
    const unsigned char *pIn = reinterpret_cast<const unsigned char *>(pInBuf);
    unsigned char *pOut = reinterpret_cast<unsigned char *>(szOut);

    for (; *pIn; ++pIn) {
        const unsigned char ch = *pIn;

        // Explicit ASCII ranges: isalnum() is locale dependent and could let
        // bytes >= 0x80 through unencoded.
        const bool isAsciiAlnum = (ch >= '0' && ch <= '9') ||
                                  (ch >= 'A' && ch <= 'Z') ||
                                  (ch >= 'a' && ch <= 'z');

        if (isAsciiAlnum) {
            *pOut++ = ch;
        } else if (ch == ' ') {
            // Only a real space may become '+'. Tabs and line breaks must be
            // percent-encoded, otherwise they decode back as a space.
            *pOut++ = '+';
        } else {
            *pOut++ = '%';
            *pOut++ = toHex(ch >> 4);
            *pOut++ = toHex(ch & 0xF);
        }
    }

    *pOut = '\0';   // was '/0', a multi-character constant that stored '0'
    return (int)(pOut - reinterpret_cast<unsigned char *>(szOut));
}

///////////////////////////////////////////////////////////////////

// Encodes one UTF-16 code unit as a three-byte UTF-8 sequence.
//
// pOut   - buffer of at least four elements; receives three encoded bytes
//          followed by a NUL terminator.
// wcText - the code unit to encode.
//
// Returns pOut.
//
// The output is always exactly three bytes long because GB2312ToUTF8 copies
// three bytes from the result. NOTE(review): for values below U+0800 this is
// an overlong form that strict UTF-8 decoders reject; callers needing
// shortest-form output should use WideCharToMultiByte(CP_UTF8) instead.
LPCTSTR UnicodeToUTF8Char(LPTSTR pOut,WCHAR wcText)
{
    // Work on the numeric value rather than on the bytes of wcText in
    // memory, so the result does not depend on byte order.
    const unsigned int cp = wcText;

    pOut[0] = 0xE0 | ((cp >> 12) & 0x0F);   // 1110xxxx : bits 15..12
    pOut[1] = 0x80 | ((cp >> 6) & 0x3F);    // 10xxxxxx : bits 11..6
    pOut[2] = 0x80 | (cp & 0x3F);           // 10xxxxxx : bits 5..0
    pOut[3] = '\0';                         // was '/0', which stored '0'
    return pOut;
}

// Converts a GB2312 (double-byte) string to UTF-8.
//
// pUTF8Out     - receives the NUL-terminated UTF-8 text. No size is passed
//                in; the caller must provide up to 3 bytes for every 2 input
//                bytes, plus the terminator.
// pGB2312Input - input bytes; need not be NUL-terminated.
// GB2312Len    - number of input bytes to convert.
//
// Returns the start of the output buffer.
//
// Decoding uses CP_ACP, the system ANSI code page, so the input is only
// interpreted as GB2312 when that code page is a Chinese one (e.g. 936).
LPCTSTR GB2312ToUTF8(LPTSTR pUTF8Out,LPCTSTR pGB2312Input, int GB2312Len)
{
    LPCTSTR lpReturn = pUTF8Out;
    LPCTSTR pCursor = pGB2312Input;
    LPCTSTR pEnd = pCursor + GB2312Len;

    while (pCursor < pEnd) {
        // Bytes 0x00..0x7F are ASCII and identical in UTF-8. The unsigned
        // compare also keeps an embedded NUL on this path.
        if (static_cast<unsigned char>(*pCursor) < 0x80) {
            *pUTF8Out++ = *pCursor++;
            continue;
        }

        // A lead byte with no trail byte left: do not read past the input.
        if (pEnd - pCursor < 2) {
            *pUTF8Out++ = '?';
            break;
        }

        WCHAR wcBuffer = 0;
        int written = 0;
        if (::MultiByteToWideChar(CP_ACP, MB_PRECOMPOSED, pCursor, 2, &wcBuffer, 1) == 1) {
            // One UTF-16 code unit needs at most three UTF-8 bytes; the API
            // picks the shortest valid form and returns the byte count.
            written = ::WideCharToMultiByte(CP_UTF8, 0, &wcBuffer, 1, pUTF8Out, 3, NULL, NULL);
        }

        if (written > 0) {
            pUTF8Out += written;
        } else {
            *pUTF8Out++ = '?';   // undecodable pair
        }
        pCursor += 2;
    }

    *pUTF8Out = '\0';   // was '/0', a multi-character constant that stored '0'
    return lpReturn;
}

int UTF8ToGB(const char* str,char *out)
{
WCHAR *strSrc;
TCHAR *szRes;
int len;

//获得临时变量的大小
int i = MultiByteToWideChar(CP_UTF8, 0, str, -1, NULL, 0);
strSrc = new WCHAR[i+1];
MultiByteToWideChar(CP_UTF8, 0, str, -1, strSrc, i);

//获得临时变量的大小
i = WideCharToMultiByte(CP_ACP, 0, strSrc, -1, NULL, 0, NULL, NULL);
szRes = new TCHAR[i+1];
WideCharToMultiByte(CP_ACP, 0, strSrc, -1, szRes, i, NULL, NULL);

len = (i+1)*sizeof(CHAR);
memcpy(out,szRes,len);
out[len+1] ='/0';

delete []strSrc;
delete []szRes;

return len;
}

// Converts a GB2312 (double-byte) string to ASCII-safe web text: ASCII bytes
// are copied unchanged and every double-byte character is written as a
// decimal numeric character reference of the form "&#NNNNN;".
//
// pWebGB2312Out - receives the NUL-terminated result. No size is passed in;
//                 the caller must provide up to 8 bytes for every 2 input
//                 bytes, plus the terminator.
// pGB2312Input  - input bytes; need not be NUL-terminated.
// GB2312Len     - number of input bytes to convert.
//
// Returns the start of the output buffer.
//
// Decoding uses CP_ACP, the system ANSI code page, so the input is only
// interpreted as GB2312 when that code page is a Chinese one (e.g. 936).
LPCTSTR GB2312ToWebGB2312(LPTSTR pWebGB2312Out,LPCTSTR pGB2312Input, int GB2312Len)
{
    LPCTSTR lpReturn = pWebGB2312Out;
    LPCTSTR pCursor = pGB2312Input;
    LPCTSTR pEnd = pCursor + GB2312Len;

    while (pCursor < pEnd) {
        // Bytes 0x00..0x7F are ASCII and are copied as they are. The
        // unsigned compare also keeps an embedded NUL on this path.
        if (static_cast<unsigned char>(*pCursor) < 0x80) {
            *pWebGB2312Out++ = *pCursor++;
            continue;
        }

        // A lead byte with no trail byte left: do not read past the input.
        if (pEnd - pCursor < 2) {
            *pWebGB2312Out++ = '?';
            break;
        }

        WCHAR wcBuffer = 0;
        if (::MultiByteToWideChar(CP_ACP, MB_PRECOMPOSED, pCursor, 2, &wcBuffer, 1) == 1) {
            // sprintf returns the number of characters written, which
            // advances the output cursor past the reference.
            pWebGB2312Out += sprintf(pWebGB2312Out, "&#%d;", static_cast<int>(wcBuffer));
        } else {
            *pWebGB2312Out++ = '?';   // undecodable pair
        }
        pCursor += 2;
    }

    *pWebGB2312Out = '\0';   // was '/0', a multi-character constant that stored '0'
    return lpReturn;
}

// Converts an array of UCS-2 code units to a NUL-terminated UTF-8 string.
//
// ucs2  - input code units; may be NULL, in which case nothing is written.
// count - number of code units to convert; zero or less yields an empty
//         string.
// utf8  - output buffer; may be NULL, in which case nothing is written.
//         No size is passed in, so the caller must provide room for up to
//         three bytes per code unit plus the terminator.
//
// Each code unit is encoded on its own (1, 2 or 3 bytes); surrogate pairs
// are not combined.
void UCS2toUTF8(unsigned short *ucs2, int count, char *utf8)
{
    if (!ucs2 || !utf8) {
        return;
    }

    int len = 0;
    for (int i = 0; i < count; i++) {
        const unsigned int cp = ucs2[i];

        if (cp < 0x80) {
            // 0xxxxxxx
            utf8[len++] = static_cast<char>(cp);
        } else if (cp < 0x800) {
            // 110xxxxx 10xxxxxx
            utf8[len++] = static_cast<char>(0xC0 | (cp >> 6));
            utf8[len++] = static_cast<char>(0x80 | (cp & 0x3F));
        } else {
            // 1110xxxx 10xxxxxx 10xxxxxx
            utf8[len++] = static_cast<char>(0xE0 | (cp >> 12));
            utf8[len++] = static_cast<char>(0x80 | ((cp >> 6) & 0x3F));
            utf8[len++] = static_cast<char>(0x80 | (cp & 0x3F));
        }
    }

    utf8[len] = '\0';   // was '/0', a multi-character constant that stored '0'
}