unicode utf8 utf16轉換說明及代碼實現(包含emoji表情編碼轉換)

文字:

Z 饕餮🇨🇳❤️☎️😀 圞壜奱

(這裏第4個“CN”是一個國旗的字符,這裏顯示不出來,在手機上輸入國旗可以顯示,見下圖)

 

    \u005a    Z                             ->utf8: 5A
    \u0020  空格                            ->utf8: 20
    \u9955    饕                             ->utf8: E9 A5 95
    \u992e    餮                            ->utf8: E9 A4 AE
    \ud83c\udde8\ud83c\uddf3  國旗🇨🇳        ->utf8: F0 9F 87 A8 F0 9F 87 B3
    \u2764\ufe0f 心形❤️                    ->utf8: E2 9D A4 EF B8 8F
    \u260e\ufe0f ☎️                         ->utf8: E2 98 8E EF B8 8F
    \ud83d\ude00 😀                        ->utf8: F0 9F 98 80
    \u0020 空格                            ->utf8: 20
    \u571e 圞                             ->utf8: E5 9C 9E
    \u58dc 壜                            ->utf8: E5 A3 9C
    \u5971 奱                            ->utf8: E5 A5 B1

 

utf16(小端字節序 Little Endian,按字節列出):
5a 00 20 00 55 99 2e 99 3c d8 e8 dd 3c d8 f3 dd 64 27 0f fe 0e 26 0f fe 3d d8 00 de 20 00 1e 57 dc 58 71 59

utf16(按 16 位碼元列出):
\u005a \u0020 \u9955 \u992e \ud83c \udde8 \ud83c \uddf3 \u2764 \ufe0f \u260e \ufe0f \ud83d \ude00 \u0020 \u571e \u58dc \u5971


utf-8:

5a 20 e9 a5 95 e9 a4 ae F0 9F 87 A8 F0 9F 87 B3 e2 9d a4 ef b8 8f e2 98 8e ef b8 8f f0 9f 98 80 20 e5 9c 9e e5 a3 9c e5 a5 b1
 

代碼實現:

/*------------------------------------------------------------------------------*/
/*Name : StrUnicodeToUtf8                                                       */
/*Role : Converts a little-endian UTF-16 string (UCS-2 code units plus          */
/*       surrogate pairs) to a UTF-8 string                                     */
/*Interface :                                                                   */
/*  - IN     : UnicBytes: UTF-16 string bytes, Little Endian                    */
/*  - IN     : UnicChLen: number of 16-bit code units in UnicBytes              */
/*  - IN/OUT : Utf8Buf: buffer that receives the UTF-8 bytes                    */
/*  - IN     : BufSize: size of Utf8Buf in bytes                                */
/*  - OUT    : OutBytesSize: number of UTF-8 bytes written (terminator excluded)*/
/*  - return : result of convert, 0:convert success, 1:param error, 2:out of buf*/
/*Notes :                                                                       */
/*  - A high surrogate followed by a low surrogate is merged into one code      */
/*    point before encoding.                                                    */
/*  - An unpaired surrogate is replaced by U+FFFD, because encoding a lone      */
/*    surrogate would produce ill-formed UTF-8.                                 */
/*  - The output is NUL-terminated only when a spare byte remains in Utf8Buf;   */
/*    the function never writes at or beyond Utf8Buf[BufSize]. Use              */
/*    OutBytesSize as the authoritative length.                                 */
/*  - On return value 2 the characters that fitted are kept and OutBytesSize    */
/*    reports their length. On return value 1 nothing is written.               */
/*Pre-condition : UnicBytes holds at least UnicChLen*2 bytes, Little Endian     */
/*------------------------------------------------------------------------------*/
int StrUnicodeToUtf8(ubyte *UnicBytes, int UnicChLen, ubyte *Utf8Buf, int BufSize, int *OutBytesSize)
{
  int ChCnt = 0;
  ulong UnicCh = 0;
  ulong UnicChUTF16Low = 0;
  ubyte Utf8Ch[6] = {0};
  int BytesCnt = 0;
  int Utf8ChBytesLen = 0;
  int ret = 0;

  if(NULL==UnicBytes || NULL==Utf8Buf || NULL==OutBytesSize)
  {
    ret = 1;
  }
  else
  {
    for(ChCnt=0; ChCnt<UnicChLen; ChCnt++)
    {
      /* Assemble one 16-bit code unit from its little-endian byte pair */
      UnicCh = UnicBytes[ChCnt*2+1];
      UnicCh <<= 8;
      UnicCh |= UnicBytes[ChCnt*2];

      /*UTF-16*/
      /*
       * High-half zone of UTF-16 = ((code point)-0x10000)/0x400+0xD800
       * Low-half zone of UTF-16 =((code point)-0x10000)%0x400+0xDC00
       **/
      if(UnicCh>=0xD800 && UnicCh<=0xDBFF)
      {
        /* 0 is outside the low-surrogate range, so it marks "no partner" */
        UnicChUTF16Low = 0;
        if(ChCnt+1<UnicChLen)
        {
          UnicChUTF16Low = UnicBytes[ChCnt*2+3];
          UnicChUTF16Low <<= 8;
          UnicChUTF16Low |= UnicBytes[ChCnt*2+2];
        }

        if(UnicChUTF16Low>=0xDC00 && UnicChUTF16Low<=0xDFFF)
        {
          UnicCh = (UnicCh-0xD800)*0x400;
          UnicCh += (UnicChUTF16Low-0xDC00+0x10000);
          ChCnt++; /* the low surrogate has been consumed as well */
        }
        else
        {
          /* High surrogate without a partner: substitute U+FFFD. The next */
          /* code unit is not consumed and is converted on its own.        */
          UnicCh = 0xFFFD;
        }
      }
      else if(UnicCh>=0xDC00 && UnicCh<=0xDFFF)
      {
        /* Low surrogate that does not follow a high surrogate */
        UnicCh = 0xFFFD;
      }

      /*Convert*/
      Utf8ChBytesLen = UnicodeToUtf8(UnicCh, Utf8Ch, sizeof(Utf8Ch));

      /* Written as a subtraction so the comparison cannot overflow int */
      if(Utf8ChBytesLen <= BufSize-BytesCnt)
      {
        memcpy(Utf8Buf+BytesCnt, Utf8Ch, Utf8ChBytesLen);
        BytesCnt += Utf8ChBytesLen;
      }
      else
      {
        ret = 2;
        break;
      }
    }/*end for*/

    *OutBytesSize = BytesCnt;

    /* Add the end signal of the string only if it fits: when the buffer is */
    /* exactly full (or BufSize <= 0) the write would land out of bounds.   */
    if(BytesCnt < BufSize)
    {
      Utf8Buf[BytesCnt] = 0;
    }
  }

  return ret;
}

/*----------------------------------------------------------------------------*/
/*Name : UnicodeToUtf8                                                        */
/*Role : Encodes one Unicode value (UCS-2 or UCS-4) as a UTF-8 byte sequence  */
/*Interface :                                                                 */
/*  - IN  : unic: Unicode value to encode, 0 .. 0x7FFFFFFF                    */
/*  - OUT : pOutput: receives the UTF-8 bytes (1 to 6 of them)                */
/*  - IN  : outSize: size of pOutput in bytes; must be at least 6             */
/*  - return : number of UTF-8 bytes written, or 0 when pOutput is NULL,      */
/*             outSize is below 6, or unic is above 0x7FFFFFFF                */
/*Pre-condition :                                                             */
/*----------------------------------------------------------------------------*/
int UnicodeToUtf8(ulong unic, ubyte *pOutput, int outSize)
{
  /* Indexed by sequence length (2..6): marker bits of the lead byte and the */
  /* mask selecting the payload bits that the lead byte carries.             */
  static const ubyte LeadMarker[7] = { 0x00, 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC };
  static const ubyte LeadMask[7]   = { 0x00, 0x00, 0x1F, 0x0F, 0x07, 0x03, 0x01 };
  ulong Remaining = unic;
  int SeqLen = 0;
  int Pos = 0;

  /* The 6-byte worst case is demanded up front, whatever the value is */
  if( (NULL == pOutput) || (outSize < 6) )
  {
    return 0;
  }

  // * U-00000000 - U-0000007F:  0xxxxxxx
  if ( unic <= 0x0000007F )
  {
    pOutput[0] = (ubyte)(unic & 0x7F);
    return 1;
  }

  // * U-00000080 - U-000007FF:  110xxxxx 10xxxxxx
  // * U-00000800 - U-0000FFFF:  1110xxxx 10xxxxxx 10xxxxxx
  // * U-00010000 - U-001FFFFF:  11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
  // * U-00200000 - U-03FFFFFF:  111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
  // * U-04000000 - U-7FFFFFFF:  1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
  if ( unic <= 0x000007FF )
  {
    SeqLen = 2;
  }
  else if ( unic <= 0x0000FFFF )
  {
    SeqLen = 3;
  }
  else if ( unic <= 0x001FFFFF )
  {
    SeqLen = 4;
  }
  else if ( unic <= 0x03FFFFFF )
  {
    SeqLen = 5;
  }
  else if ( unic <= 0x7FFFFFFF )
  {
    SeqLen = 6;
  }
  else
  {
    return 0; /* value cannot be represented */
  }

  /* Continuation bytes are filled from the last one backwards, each taking */
  /* the lowest six bits still pending.                                      */
  for ( Pos = SeqLen - 1; Pos > 0; Pos-- )
  {
    pOutput[Pos] = (ubyte)((Remaining & 0x3F) | 0x80);
    Remaining >>= 6;
  }

  /* The bits left over go into the lead byte under its length marker */
  pOutput[0] = (ubyte)((Remaining & LeadMask[SeqLen]) | LeadMarker[SeqLen]);

  return SeqLen;
}

 

 

 


 編碼轉換
 http://tool.chinaz.com/tools/unicode.aspx
 
 utf-8轉換
 https://www.qqxiuzi.cn/bianma/Unicode-UTF.php 
 
 emoji表情大全
 http://bbs.52svip.cn/emoji/
 
 Unicode編碼表
 https://www.cnblogs.com/csguo/p/7401874.html
 
 變量選擇器
 http://www.seotest.cn/jishu/34607.html

發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章