通過java寫的UTF-8文件,使用Java可以正確的讀,但是如果用記事本將相同的內容使用UTF-8格式保存,則在使用程序讀取是會從文件中多讀出一個不可見字符,多一個?
實例:
新建一個文本:內容爲“測試BOM”的txt文本,另存爲UTF-8.
處理帶BOM的類UnicodeReader
package com.java.io; import java.io.IOException; import java.io.InputStream; import java.io.InputStreamReader; import java.io.PushbackInputStream; import java.io.Reader; /** version: 1.1 / 2007-01-25 - changed BOM recognition ordering (longer boms first) 網絡地址:http://koti.mbnet.fi/akini/java/unicodereader/UnicodeReader.java.txt Original pseudocode : Thomas Weidenfeller Implementation tweaked: Aki Nieminen http://www.unicode.org/unicode/faq/utf_bom.html BOMs: 00 00 FE FF = UTF-32, big-endian FF FE 00 00 = UTF-32, little-endian EF BB BF = UTF-8, FE FF = UTF-16, big-endian FF FE = UTF-16, little-endian Win2k Notepad: Unicode format = UTF-16LE ***/ /** * Generic unicode textreader, which will use BOM mark * to identify the encoding to be used. If BOM is not found * then use a given default or system encoding. */ public class UnicodeReader extends Reader { PushbackInputStream internalIn; InputStreamReader internalIn2 = null; String defaultEnc; private static final int BOM_SIZE = 4; /** * * @param in inputstream to be read * @param defaultEnc default encoding if stream does not have * BOM marker. Give NULL to use system-level default. */ UnicodeReader(InputStream in, String defaultEnc) { internalIn = new PushbackInputStream(in, BOM_SIZE); this.defaultEnc = defaultEnc; } public String getDefaultEncoding() { return defaultEnc; } /** * Get stream encoding or NULL if stream is uninitialized. * Call init() or read() method to initialize it. */ public String getEncoding() { if (internalIn2 == null) return null; return internalIn2.getEncoding(); } /** * Read-ahead four bytes and check for BOM marks. Extra bytes are * unread back to the stream, only BOM bytes are skipped. */ protected void init() throws IOException { if (internalIn2 != null) return; String encoding; byte bom[] = new byte[BOM_SIZE]; int n, unread; n = internalIn.read(bom, 0, bom.length); if ( (bom[0] == (byte)0x00) && (bom[1] == (byte)0x00) && (bom[2] == (byte)0xFE) && (bom[3] == (byte)0xFF) ) { encoding = "UTF-32BE"; unread = n - 4; } else if ( (bom[0] == (byte)0xFF) && (bom[1] == (byte)0xFE) && (bom[2] == (byte)0x00) && (bom[3] == (byte)0x00) ) { encoding = "UTF-32LE"; unread = n - 4; } else if ( (bom[0] == (byte)0xEF) && (bom[1] == (byte)0xBB) && (bom[2] == (byte)0xBF) ) { encoding = "UTF-8"; unread = n - 3; } else if ( (bom[0] == (byte)0xFE) && (bom[1] == (byte)0xFF) ) { encoding = "UTF-16BE"; unread = n - 2; } else if ( (bom[0] == (byte)0xFF) && (bom[1] == (byte)0xFE) ) { encoding = "UTF-16LE"; unread = n - 2; } else { // Unicode BOM mark not found, unread all bytes encoding = defaultEnc; unread = n; } //System.out.println("read=" + n + ", unread=" + unread); if (unread > 0) internalIn.unread(bom, (n - unread), unread); // Use given encoding if (encoding == null) { internalIn2 = new InputStreamReader(internalIn); } else { internalIn2 = new InputStreamReader(internalIn, encoding); } } public void close() throws IOException { init(); internalIn2.close(); } public int read(char[] cbuf, int off, int len) throws IOException { init(); return internalIn2.read(cbuf, off, len); } }
測試類
package com.java.io; import java.io.BufferedReader; import java.io.File; import java.io.FileInputStream; import java.io.InputStreamReader; import java.nio.charset.Charset; public class BomRead { /** * 讀帶有BOM的UTF-8文件亂碼 * @param args */ public static void main(String[] args)throws Exception { File file = new File("E:\\JS_Exercise\\JavaExercise\\BOM.txt"); FileInputStream in = new FileInputStream(file); BufferedReader br = new BufferedReader(new InputStreamReader(in, "UTF-8")); String line = null; System.out.println("處理前:"); while((line = br.readLine()) != null){ System.out.println(line); } File file2 = new File("E:\\JS_Exercise\\JavaExercise\\BOM.txt"); FileInputStream in2 = new FileInputStream(file2); BufferedReader br2 = new BufferedReader(new UnicodeReader(in2, "UTF-8")); String line2 = null; System.out.println("處理後:"); while((line2 = br2.readLine()) != null){ System.out.println(line2); } } }
輸出結果
處理前:
?測試BOM
處理後:
測試BOM
另一種解決方式
從目前來看1.6只是解決了讀取帶有BOM文件失敗的問題,還是不能區別處理有BOM和無BOM的UTF-8編碼的文件,從Bug ID:4508058裏的描述可以看出,這個問題將作爲一個不會修改的問題關閉,對於BOM編碼的識別將由應用程序自己來處理,原因可從另處一個bug處查看到,因爲Unicode對於BOM的編碼的規定可能發生變化。也就是說對於一個UTF-8的文件,應用程序需要知道這個文件有沒有寫BOM,然後自己決定處理BOM的方式。
因此在遇到此問題的時候可以特殊問題特殊處理。