UTF-8 字符串是變長編碼的字符串,日常處理中截取字符串時,如果處理不好就會出現亂碼。針對這個問題,下面給出一個通用的 UTF-8 字符串截取示例,分別用 Python 和 PHP 實現;根據同樣的原理,也可以用任何其他語言來實現。
- # python
- # utf8 string length
def safestrlength_utf8(sourcestr):  # {{{
    """Return the number of characters in *sourcestr*.

    A byte string is treated as UTF-8: the value of each lead byte decides
    how many bytes the character occupies, and every sequence counts as one
    character. No validation is performed; a byte in the range 128-191 found
    where a lead byte is expected is counted as a one-byte character.

    A text (unicode) string is already indexed by character, so its
    ``len()`` is returned directly.

    Args:
        sourcestr: UTF-8 encoded byte string, or a text string.

    Returns:
        The character count; 0 for an empty string.
    """
    if not isinstance(sourcestr, bytes):
        return len(sourcestr)

    # (smallest lead-byte value, bytes in the sequence), widest form first.
    # 192 = 110xxxxx, 224 = 1110xxxx, 240 = 11110xxx, 248 = 111110xx,
    # 252 = 1111110x.
    widths = ((252, 6), (248, 5), (240, 4), (224, 3), (192, 2))

    data = bytearray(sourcestr)  # yields ints on both Python 2 and 3
    total = len(data)
    pos = 0
    count = 0
    while pos < total:
        lead = data[pos]
        pos += next((width for floor, width in widths if lead >= floor), 1)
        count += 1
    # The loop stops strictly before the end of the data, so no "extra
    # iteration" correction is applied to the count.
    return count
- # utf8 substr
def safesubstr_utf8(sourcestr, cutlength):
    """Return the first *cutlength* characters of *sourcestr*.

    A byte string is treated as UTF-8 and is only ever cut on a character
    boundary: the value of each lead byte decides how many bytes belong to
    the character. No validation is performed; a byte in the range 128-191
    found where a lead byte is expected is taken as a one-byte character.

    A text (unicode) string is already indexed by character and is simply
    sliced.

    Args:
        sourcestr: UTF-8 encoded byte string, or a text string.
        cutlength: Maximum number of characters to keep.

    Returns:
        A prefix of *sourcestr*, of the same type as the input. The whole
        string is returned when it holds fewer than *cutlength* characters;
        an empty string is returned when *cutlength* is zero or negative.
    """
    if cutlength <= 0:
        # Guard first: a negative bound must not turn into a slice that
        # counts from the end of the string.
        return sourcestr[:0]
    if not isinstance(sourcestr, bytes):
        return sourcestr[:cutlength]

    # (smallest lead-byte value, bytes in the sequence), widest form first.
    widths = ((252, 6), (248, 5), (240, 4), (224, 3), (192, 2))

    data = bytearray(sourcestr)  # yields ints on both Python 2 and 3
    total = len(data)
    end = 0
    taken = 0
    # Strict bound: there is no byte to classify at offset ``total``.
    while taken < cutlength and end < total:
        lead = data[end]
        end += next((width for floor, width in widths if lead >= floor), 1)
        taken += 1
    # The kept characters are contiguous from offset 0, so one slice is
    # enough; slicing also clamps a truncated trailing sequence.
    return sourcestr[:end]
- // php
- // substr for utf8 string, then utf8 word is 1 length
/**
 * Return the first $cutlength characters of a UTF-8 encoded string.
 *
 * The string is walked byte by byte; the value of each lead byte decides
 * how many bytes belong to the character, so the result is only ever cut
 * on a character boundary. No validation is performed: a byte in the range
 * 128-191 found where a lead byte is expected is taken as a one-byte
 * character.
 *
 * @param string $sourcestr UTF-8 encoded string.
 * @param int    $cutlength Maximum number of characters to keep.
 * @return string Prefix of $sourcestr; the whole string when it holds fewer
 *                than $cutlength characters, '' when $cutlength <= 0.
 */
public static function safesubstr_utf8($sourcestr, $cutlength) // {{{
{
    $returnstr = '';
    $i = 0;
    $n = 0;
    $str_length = strlen($sourcestr);
    // Strict bound: there is no byte to classify at offset $str_length.
    while (($n < $cutlength) && ($i < $str_length))
    {
        $ascnum = ord(substr($sourcestr, $i, 1));
        if ($ascnum >= 252)        // 1111110x: 6-byte sequence
        {
            $width = 6;
        }
        elseif ($ascnum >= 248)    // 111110xx: 5-byte sequence
        {
            $width = 5;
        }
        elseif ($ascnum >= 240)    // 11110xxx: 4-byte sequence
        {
            $width = 4;
        }
        elseif ($ascnum >= 224)    // 1110xxxx: 3-byte sequence
        {
            $width = 3;
        }
        elseif ($ascnum >= 192)    // 110xxxxx: 2-byte sequence
        {
            $width = 2;
        }
        else                       // any other byte counts as one character
        {
            $width = 1;
        }
        // $i < $str_length here, so substr() always yields a string.
        $returnstr .= substr($sourcestr, $i, $width);
        $i += $width;
        $n++;
    }
    return $returnstr;
} // }}}
- // get length for utf8 string, then utf8 word is 1 length
/**
 * Return the number of characters in a UTF-8 encoded string.
 *
 * The string is walked byte by byte; the value of each lead byte decides
 * how many bytes are skipped, and every sequence counts as one character.
 * No validation is performed: a byte in the range 128-191 found where a
 * lead byte is expected is counted as a one-byte character.
 *
 * @param string $sourcestr UTF-8 encoded string.
 * @return int Character count; 0 for an empty string.
 */
public static function safestrlength_utf8($sourcestr) // {{{
{
    $i = 0;
    $n = 0;
    $str_length = strlen($sourcestr);
    // Strict bound, no trailing decrement: counting one sentinel iteration
    // at offset $str_length and subtracting it afterwards under-counts when
    // a truncated final sequence jumps past that offset.
    while ($i < $str_length)
    {
        $ascnum = ord(substr($sourcestr, $i, 1));
        if ($ascnum >= 252)        // 1111110x: 6-byte sequence
        {
            $i += 6;
        }
        elseif ($ascnum >= 248)    // 111110xx: 5-byte sequence
        {
            $i += 5;
        }
        elseif ($ascnum >= 240)    // 11110xxx: 4-byte sequence
        {
            $i += 4;
        }
        elseif ($ascnum >= 224)    // 1110xxxx: 3-byte sequence
        {
            $i += 3;
        }
        elseif ($ascnum >= 192)    // 110xxxxx: 2-byte sequence
        {
            $i += 2;
        }
        else                       // any other byte counts as one character
        {
            $i += 1;
        }
        $n++;
    }
    return $n;
} // }}}
U-00000000 - U-0000007F:
0xxxxxxx
U-00000080 - U-000007FF:
110xxxxx 10xxxxxx
U-00000800 - U-0000FFFF:
1110xxxx 10xxxxxx 10xxxxxx
U-00010000 - U-001FFFFF:
11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
U-00200000 - U-03FFFFFF:
111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
U-04000000 - U-7FFFFFFF:
1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx