现在的位置: 首页 > 综合 > 正文

GB2312和Utf-8编码的转换[2]

2013年07月30日 ⁄ 综合 ⁄ 共 5828字 ⁄ 字号 评论关闭


字符转换函数


encoding.c


/************************************************************************
*
*  File name:      encoding.c
*
*  Description:    Convert character encoding between GB2312
*                  and Unicode (UTF-8).
*
*  Author:         JiangPeifu
*  Creation date:  2009-02-20
*
************************************************************************/

#include <stdio.h>
#include "encoding.h"

/*
 * Debug tracing.
 *
 * ENCODING_DEBUG defaults to 1 (tracing enabled) and can be overridden
 * from the build command line, e.g. -DENCODING_DEBUG=0.
 *
 * debug(...) takes printf-style arguments.  When tracing is disabled it
 * expands to a no-op and its arguments are not evaluated, so it must only
 * be given side-effect-free expressions.
 */
#ifndef ENCODING_DEBUG
#define ENCODING_DEBUG 1
#endif

#if ENCODING_DEBUG
#define debug(...) printf(__VA_ARGS__)
#else
#define debug(...) ((void)0)
#endif /* ENCODING_DEBUG */

/*
 * Forward declarations for the functions defined in this file.
 * The two static helpers convert a single character; the two *Str*
 * functions convert a buffer of nBytes input bytes.
 */
const unsigned char *GBCodeToUnicode(unsigned char *gbCode);
const unsigned char *UnicodeToGBCode(unsigned char *unicode);
static void UnicodeToUtf8(char* utf8, char *unicode);
static void Utf8ToUnicode(char* unicode, char *utf8);
void GB2312StrToUtf8(
        char *utf8Str,   
    /* Output Utf-8 chars */
        char* gbStr,   
    /* Input GB2312 chars */
        int nBytes   
        /* Size of input GB2312 chars, in bytes */
        );
void Utf8StrToGB2312(
        char *gbStr,   
    /* Output GB2312 chars */
        char* utf8Str,   
    /* Input Utf-8 chars */
        int nBytes   
        /* Size of input Utf-8 chars, in bytes */
        );

/************************************************************************
 *  Function: GBCodeToUnicode
 *      Convert one GB2312 character to one Unicode
character
 ************************************************************************/
const unsigned char *GBCodeToUnicode(unsigned char *gbCode)
{
   const unsigned char *mapped = 0;
   unsigned int i = 0;

   if ((*(gbCode + 1) >= 0xa1) && (*(gbCode + 1) <=
0xfe))
   {
      if ((*gbCode >= 0xa1) && (*gbCode
<= 0xa9))
      {
         i = ((*gbCode - 0xa1) * 94 +
(*(gbCode + 1) - 0xa1)) * 2;
         mapped = &gb2uTable[i];
      } /* end of if */
      else
      {
          if ((*gbCode >= 0xb0)
&& (*gbCode <= 0xf7))
          {
             i = ((*gbCode -
0xb0 + 9) * 94 + (*(gbCode + 1) - 0xa1)) * 2;
             mapped =
&gb2uTable[i];
          } /* end of if */
          else
          {
             
debug("ERROR: GB2312 convert to unicode!!!/n");
          }
      }
   } /* end of if */
   else
   {
       debug("ERROR: GB2312 convert to
unicode!!!/n");
   }

   return mapped;
}

/************************************************************************
 *  Function: UnicodeToGBCode
 *      Convert single Unicode character to single GB2312 character.
 *
 *  Parameters:
 *      unicode - two bytes, high byte first, forming a 16-bit value.
 *
 *  Returns:
 *      Address of the u2gbTable element at offset (16-bit value * 2).
 *      No range or validity check is performed on the input.
 ************************************************************************/
const unsigned char *UnicodeToGBCode(unsigned char *unicode)
{
    const unsigned int highByte = unicode[0];
    const unsigned int lowByte  = unicode[1];
    const unsigned int offset   = ((highByte << 8) + lowByte) * 2;

    return &u2gbTable[offset];
}

/************************************************************************
 *  Function: UnicodeToUtf8
 *      Encode one 16-bit Unicode value as three UTF-8 bytes.
 *
 *  Parameters:
 *      utf8    - output, receives exactly three bytes (no terminator).
 *      unicode - input, two bytes with the high byte first.  When this
 *                pointer is null an error is traced and utf8 is left
 *                untouched.
 *
 *  Note: the three-byte form (1110xxxx 10xxxxxx 10xxxxxx) is always
 *  written, regardless of the value; values below 0x0800 are therefore
 *  not emitted in their shortest form.
 ************************************************************************/
static void UnicodeToUtf8(char* utf8, char *unicode)
{
    unsigned char high;
    unsigned char low;

    if (unicode == 0)
    {
        debug("ERROR: Unicode convert to utf8, unicode=0\n");
        return;
    }

    high = (unsigned char)unicode[0];
    low  = (unsigned char)unicode[1];

    /* Bits 15-12 -> first byte, bits 11-6 -> second, bits 5-0 -> third. */
    utf8[0] = (char)(0xE0 | (high >> 4));
    utf8[1] = (char)(0x80 | ((high & 0x0F) << 2) | (low >> 6));
    utf8[2] = (char)(0x80 | (low & 0x3F));
}

/************************************************************************
 *  Function: GB2312StrToUtf8
 *      Convert a buffer of GB2312 text to a NUL-terminated UTF-8 string.
 *
 *  Parameters:
 *      utf8Str - output buffer.  Every two-byte GB2312 character becomes
 *                three output bytes and a terminating NUL is appended, so
 *                the caller must provide at least nBytes * 3 / 2 + 1
 *                bytes.
 *      gbStr   - input GB2312 bytes.
 *      nBytes  - number of input bytes to convert.
 *
 *  Bytes below 0x80 are copied unchanged.  A byte pair that
 *  GBCodeToUnicode() rejects, or a lead byte with no following byte
 *  inside the input, is replaced by a single '?'.
 ***********************************************************************/
void GB2312StrToUtf8(
        char *utf8Str,      /* Output Utf-8 chars */
        char* gbStr,        /* Input GB2312 chars */
        int nBytes          /* size of input GB2312 chars */
        )
{
    char buf[3];
    int i = 0;
    int j = 0;

    if (utf8Str == NULL)
    {
        return;
    }
    if (gbStr == NULL)
    {
        utf8Str[0] = '\0';
        return;
    }

    while (i < nBytes)
    {
        const unsigned char *mapped;

        /* Test the byte as unsigned: plain char may be signed or unsigned. */
        if ((unsigned char)gbStr[i] < 0x80)
        {
            utf8Str[j++] = gbStr[i++];
            debug("GB2312Str[%d]=%c\n", i-1, gbStr[i-1]);
            debug(" utf8Str[%d]=%c\n", j-1, utf8Str[j-1]);
            continue;
        }

        if (i + 1 >= nBytes)
        {
            /* Lead byte without its trail byte: do not read past the input. */
            utf8Str[j++] = '?';
            i++;
            continue;
        }

        mapped = GBCodeToUnicode((unsigned char *)gbStr + i);
        if (mapped == NULL)
        {
            /* Unsupported byte pair: substitute and skip both bytes. */
            utf8Str[j++] = '?';
            i += 2;
            continue;
        }
        debug("unicode [0]=%x, [1]=%x\n",
              (unsigned int)mapped[0], (unsigned int)mapped[1]);

        /* UnicodeToUtf8() only reads through its second argument. */
        UnicodeToUtf8(buf, (char *)mapped);

        utf8Str[j++] = buf[0];
        debug(" utf8Str[%d]=%x\n", j-1, (unsigned int)(unsigned char)utf8Str[j-1]);
        utf8Str[j++] = buf[1];
        debug(" utf8Str[%d]=%x\n", j-1, (unsigned int)(unsigned char)utf8Str[j-1]);
        utf8Str[j++] = buf[2];
        debug(" utf8Str[%d]=%x\n", j-1, (unsigned int)(unsigned char)utf8Str[j-1]);

        i += 2;
    }
    utf8Str[j] = '\0';
}

/************************************************************************
 *  Function: Utf8ToUnicode
 *      Decode one UTF-8 character into a 16-bit Unicode value.
 *
 *  Parameters:
 *      unicode - output, always receives two bytes, high byte first.
 *      utf8    - input, points at the first byte of the character.
 *
 *  Only single-byte (0xxxxxxx) and three-byte (1110xxxx 10xxxxxx
 *  10xxxxxx) forms are decoded.  Any other lead byte, or a three-byte
 *  lead not followed by two continuation bytes, is traced as an error
 *  and decoded as '?' (0x003F).
 ************************************************************************/
static void Utf8ToUnicode(char* unicode, char *utf8)
{
    const unsigned char *in = (const unsigned char *)utf8;

    if (0 == (in[0] & 0x80))
    {
        /*
         * single-byte char
         */
        unicode[0] = 0;
        unicode[1] = (char)in[0];
        return;
    }

    /*
     * 3-byte char (chinese char); later bytes are only inspected once the
     * preceding ones have been validated.
     */
    if (((in[0] & 0xf0) == 0xe0)
        && ((in[1] & 0xc0) == 0x80)
        && ((in[2] & 0xc0) == 0x80))
    {
        unicode[0] = (char)(((in[0] & 0x0f) << 4) + ((in[1] & 0x3c) >> 2));
        unicode[1] = (char)(((in[1] & 0x03) << 6) + (in[2] & 0x3f));
        return;
    }

    debug("ERROR: utf-8 to unicode, nBytes !=3\n");
    unicode[0] = 0;
    unicode[1] = '?';
}

/************************************************************************
 *  Function: Utf8StrToGB2312
 *      Convert a buffer of UTF-8 text to a NUL-terminated GB2312 string.
 *
 *  Parameters:
 *      gbStr   - output buffer.  At most one output byte is produced per
 *                input byte, plus a terminating NUL, so nBytes + 1 bytes
 *                are sufficient.
 *      utf8Str - input UTF-8 bytes.
 *      nBytes  - number of input bytes to convert.
 *
 *  Bytes below 0x80 are copied unchanged.  A complete three-byte sequence
 *  is decoded and translated through UnicodeToGBCode() into two output
 *  bytes.  Anything else (other lead bytes, stray continuation bytes, a
 *  sequence cut off by the end of the input) is replaced by one '?'.
 *
 *  NOTE(review): the two bytes returned by UnicodeToGBCode() are copied
 *  as-is; what u2gbTable holds for unmapped code points is not visible
 *  here - confirm against the table.
 ***********************************************************************/
void Utf8StrToGB2312(
        char *gbStr,        /* Output GB2312 chars */
        char* utf8Str,      /* Input Utf-8 chars */
        int nBytes          /* Size of input Utf-8 chars */
        )
{
    char buf[2] = { 0, '?' };
    int i = 0;
    int j = 0;

    if (gbStr == NULL)
    {
        return;
    }
    if (utf8Str == NULL)
    {
        gbStr[0] = 0;
        return;
    }

    while (i < nBytes)
    {
        const unsigned char *pbuffer;
        const unsigned char lead = (unsigned char)utf8Str[i];

        if (0 == (lead & 0x80))
        {
            gbStr[j++] = utf8Str[i++];
            debug(" utf8Str[%d]=%c\n", i-1, utf8Str[i-1]);
            debug("GB2312Str[%d]=%c\n", j-1, gbStr[j-1]);
            continue;
        }

        if (((lead & 0xf0) != 0xe0) || (i + 2 >= nBytes))
        {
            /*
             * Not a complete three-byte sequence: substitute one '?' and
             * resynchronise on the next byte that is not a continuation
             * byte, without reading past the input.
             */
            gbStr[j++] = '?';
            i++;
            while ((i < nBytes) && (((unsigned char)utf8Str[i] & 0xc0) == 0x80))
            {
                i++;
            }
            continue;
        }

        Utf8ToUnicode(buf, utf8Str + i);
        debug(" utf8Str[%d]=%x\n", i, (unsigned int)(unsigned char)utf8Str[i]);
        debug(" utf8Str[%d]=%x\n", i+1, (unsigned int)(unsigned char)utf8Str[i+1]);
        debug(" utf8Str[%d]=%x\n", i+2, (unsigned int)(unsigned char)utf8Str[i+2]);
        debug("unicode [0]=%x, [1]=%x\n",
              (unsigned int)(unsigned char)buf[0],
              (unsigned int)(unsigned char)buf[1]);

        pbuffer = UnicodeToGBCode((unsigned char *)buf);
        gbStr[j++] = (char)pbuffer[0];
        debug("GB2312[%d]=%x\n", j-1, (unsigned int)(unsigned char)gbStr[j-1]);
        gbStr[j++] = (char)pbuffer[1];
        debug("GB2312[%d]=%x\n", j-1, (unsigned int)(unsigned char)gbStr[j-1]);

        i += 3;
    }
    gbStr[j] = 0;
}





至此，一切 OK！



抱歉!评论已关闭.