/**
 * Removes Unicode noncharacter code points and non-printable C0 control
 * characters from a string.
 *
 * <p>The following code points are dropped:
 * <ul>
 *   <li>the last two code points of every plane, i.e. any code point whose
 *       low 16 bits are {@code 0xFFFE} or {@code 0xFFFF}
 *       (U+FFFE, U+FFFF, U+1FFFE, U+1FFFF, ... U+10FFFE, U+10FFFF);</li>
 *   <li>the contiguous noncharacter block U+FDD0..U+FDEF, both ends
 *       inclusive;</li>
 *   <li>control characters below U+0020, except tab (U+0009), line feed
 *       (U+000A) and carriage return (U+000D).</li>
 * </ul>
 * Every other code point is copied through unchanged, in its original order.
 *
 * @param input the text to filter; must not be {@code null}
 * @return a new string containing only the code points that were kept
 * @throws NullPointerException if {@code input} is {@code null}
 * @see <a href="http://unicode.org/cldr/utility/list-unicodeset.jsp?a=[:Noncharacter_Code_Point=True:]">
 *      Unicode set of noncharacter code points</a>
 */
public static String stripNonCharCodepoints(String input) {
    StringBuilder retval = new StringBuilder(input.length());
    int i = 0;
    while (i < input.length()) {
        // Walk by code point, not by UTF-16 char: the supplementary-plane
        // noncharacters (U+1FFFE, ...) are encoded as surrogate pairs and can
        // only be recognised once the pair has been combined.
        int cp = input.codePointAt(i);
        i += Character.charCount(cp);

        // Last two code points of each of the 17 planes.
        int lowBits = cp & 0xFFFF;
        boolean planeEndNonChar = lowBits == 0xFFFE || lowBits == 0xFFFF;

        // U+FDD0..U+FDEF is inclusive on both ends.
        boolean arabicBlockNonChar = cp >= 0xFDD0 && cp <= 0xFDEF;

        // C0 controls other than TAB, LF and CR.
        boolean strippedControl = cp <= 0x1F && cp != 0x9 && cp != 0xA && cp != 0xD;

        if (!planeEndNonChar && !arabicBlockNonChar && !strippedControl) {
            retval.appendCodePoint(cp);
        }
    }
    return retval.toString();
}
java如何過濾無效的utf-8字符?
之前遇到過幾次 Nutch/Solr 報這樣的錯誤:Invalid UTF-8 character。原來 Nutch 1.3 版本在過濾 UTF-8 非字符碼位(Strip UTF-8 non-character codepoints)時有 bug,在 1.4 版本就修復了。
於是這裏把nutch裏如何過濾無效utf-8字符的代碼找出來給小夥伴們看看。直接上代碼了:
發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.