package org.x3.cloud.file.util;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.*;
/**
 * Toolbox for plain-text (.txt) files.
 *
 * <p>Created by Rock-Ayl on 2020-05-07.
 */
public class TxtUtils {

    protected static final Logger logger = LoggerFactory.getLogger(TxtUtils.class);

    /** Charset assumed when the file has no BOM and no UTF-8 sequence is recognised. */
    private static final String DEFAULT_CHARSET = "GBK";

    /** Charset used to write the generated HTML file. */
    private static final String HTML_CHARSET = "UTF-8";

    /** The byte order mark as it appears after decoding (U+FEFF). */
    private static final char BYTE_ORDER_MARK = '\uFEFF';

    /** Number of leading bytes inspected when looking for a BOM. */
    private static final int BOM_PROBE_LENGTH = 3;

    /**
     * Converts a txt file to an HTML file.
     *
     * <p>The source encoding is detected with {@link #getFileCoding(String)}; the output is
     * always written as UTF-8. Every source line is written as three spaces, the line text
     * (with {@code &}, {@code <} and {@code >} escaped so the text is shown literally instead
     * of being interpreted as markup) and a trailing {@code </br>}. A leading byte order mark
     * is not copied into the output.
     *
     * @param txtPath  location of the source txt file
     * @param htmlPath location of the HTML file to write
     * @return {@code true} if the HTML file was written completely; {@code false} if a path is
     *         {@code null}, the source is missing or not a regular file, or an I/O error
     *         occurred (the error is logged)
     */
    public static boolean txtToHtml(String txtPath, String htmlPath) {
        if (txtPath == null || htmlPath == null) {
            return false;
        }
        File file = new File(txtPath);
        if (!file.exists() || !file.isFile()) {
            return false;
        }
        // Detect the encoding only once we know the file is there, so a missing file
        // does not produce an error log entry from the detector.
        String encoding = getFileCoding(txtPath);
        // Each raw stream is its own resource so it is closed even if a wrapper
        // constructor throws (e.g. unsupported charset name).
        try (FileInputStream in = new FileInputStream(file);
             BufferedReader reader = new BufferedReader(new InputStreamReader(in, encoding));
             FileOutputStream out = new FileOutputStream(new File(htmlPath));
             BufferedWriter writer = new BufferedWriter(new OutputStreamWriter(out, HTML_CHARSET))) {
            boolean firstLine = true;
            String lineTxt;
            while ((lineTxt = reader.readLine()) != null) {
                if (firstLine) {
                    firstLine = false;
                    // The UTF-8 / UTF-16LE / UTF-16BE decoders hand the BOM through as U+FEFF.
                    if (!lineTxt.isEmpty() && lineTxt.charAt(0) == BYTE_ORDER_MARK) {
                        lineTxt = lineTxt.substring(1);
                    }
                }
                writer.write("   " + escapeHtml(lineTxt) + "</br>");
            }
        } catch (IOException e) {
            // Also reached when flushing/closing the writer fails, so a truncated
            // output file is never reported as success.
            logger.error("txt轉html出現異常:{}", e.getMessage(), e);
            return false;
        }
        return true;
    }

    /**
     * Escapes the characters that are significant in HTML text content.
     *
     * @param text raw text, not {@code null}
     * @return {@code text} with {@code &}, {@code <} and {@code >} replaced by entities
     */
    private static String escapeHtml(String text) {
        StringBuilder escaped = new StringBuilder(text.length() + 16);
        for (int i = 0; i < text.length(); i++) {
            char c = text.charAt(i);
            switch (c) {
                case '&':
                    escaped.append("&amp;");
                    break;
                case '<':
                    escaped.append("&lt;");
                    break;
                case '>':
                    escaped.append("&gt;");
                    break;
                default:
                    escaped.append(c);
                    break;
            }
        }
        return escaped.toString();
    }

    /**
     * Guesses the character encoding of a txt file.
     *
     * <p>A BOM in the first bytes selects UTF-16LE, UTF-16BE or UTF-8. Without a BOM the file
     * is scanned from the start: the first complete three-byte UTF-8 sequence selects UTF-8;
     * anything else that ends the scan leaves the default, GBK. The scan is a heuristic and
     * can misjudge files whose GBK bytes happen to look like UTF-8.
     *
     * @param path path of the file to inspect
     * @return the charset name, e.g. {@code UTF-8}; {@code GBK} when the file is empty, when
     *         nothing else is recognised, or when the file cannot be read (the error is logged)
     */
    private static String getFileCoding(String path) {
        String charset = DEFAULT_CHARSET;
        byte[] first3Bytes = new byte[BOM_PROBE_LENGTH];
        try (BufferedInputStream bis = new BufferedInputStream(new FileInputStream(new File(path)))) {
            // The read limit must cover the bytes consumed before reset().
            bis.mark(BOM_PROBE_LENGTH);
            int read = bis.read(first3Bytes, 0, BOM_PROBE_LENGTH);
            if (read == -1) {
                // Empty file: keep the default.
                return charset;
            }
            if (first3Bytes[0] == (byte) 0xFF && first3Bytes[1] == (byte) 0xFE) {
                return "UTF-16LE";
            }
            if (first3Bytes[0] == (byte) 0xFE && first3Bytes[1] == (byte) 0xFF) {
                return "UTF-16BE";
            }
            if (first3Bytes[0] == (byte) 0xEF
                    && first3Bytes[1] == (byte) 0xBB
                    && first3Bytes[2] == (byte) 0xBF) {
                return "UTF-8";
            }
            // No BOM: rewind and scan the content from the first byte.
            bis.reset();
            while ((read = bis.read()) != -1) {
                if (read >= 0xF0) {
                    break;
                }
                // A lone byte in 0x80-0xBF is not a UTF-8 lead byte; treat the file as GBK.
                if (isContinuationByte(read)) {
                    break;
                }
                if (0xC0 <= read && read <= 0xDF) {
                    // Looks like a two-byte sequence, but that shape is also valid GBK,
                    // so it decides nothing: keep scanning if well-formed, stop otherwise.
                    if (!isContinuationByte(bis.read())) {
                        break;
                    }
                } else if (0xE0 <= read && read <= 0xEF) {
                    // Three-byte lead: two continuation bytes settle on UTF-8
                    // (can still be wrong, but rarely); either way the scan ends here.
                    if (isContinuationByte(bis.read()) && isContinuationByte(bis.read())) {
                        charset = "UTF-8";
                    }
                    break;
                }
            }
        } catch (IOException e) {
            logger.error("txt識別編碼錯誤:{}", e.getMessage(), e);
        }
        return charset;
    }

    /**
     * Tells whether a value returned by {@code InputStream.read()} lies in the range
     * 0x80-0xBF. End of stream ({@code -1}) is not in that range.
     */
    private static boolean isContinuationByte(int value) {
        return 0x80 <= value && value <= 0xBF;
    }
}