package com.ysz.search.utils;
import com.alibaba.fastjson.JSONObject;
import com.ysz.search.utils.log.Logger;
import com.ysz.search.utils.log.LoggerFactory;
import org.apache.commons.io.FileUtils;
import org.springframework.util.ResourceUtils;
import java.io.*;
import java.net.HttpURLConnection;
import java.net.URL;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
 * Classifies a search keyword as Chinese, full pinyin, pinyin abbreviation, or a mixture.
 *
 * <p>English words are not considered: the input is assumed to contain only lower-case letters,
 * digits and Chinese characters. ASCII spaces and digits are stripped before classification.
 *
 * <p>The single-letter syllables {@code a}, {@code o} and {@code e} can be read either as full
 * pinyin or as an abbreviation. When a string starts with a pinyin syllable, cannot be split
 * completely into syllables, and consists only of abbreviation letters, it is classified as an
 * abbreviation. Adjust this rule to the needs of the business.
 *
 * <p>Design goal stated by the author: one classification should stay within 1 ms.
 *
 * <p>Date: 2020/4/27
 *
 * @author [email protected]
 */
public class WordUtils {
    // Initials (shengmu).
    static String[] sm = {"b", "p", "m", "f", "d", "t", "n", "l", "g", "k", "h", "j", "q", "x", "zh", "ch", "sh", "r", "z", "c", "s", "y", "w"};
    // Finals (yunmu). NOTE(review): "v" presumably stands for the letter u-umlaut - confirm.
    static String[] ym = {"a", "o", "e", "i", "u", "v", "ai", "ei", "ui", "ao", "ou", "iu", "ie", "ve", "er", "an", "en", "in", "un", "vn", "ang", "eng", "ing", "ong"};
    // Syllables that are matched as a whole unit.
    static String[] zt = {"zhi", "chi", "shi", "ri", "wu", "yu", "ye", "yue", "yuan", "yin", "yun", "ying"};
    // Finals that form a syllable on their own, without an initial.
    static String[] smym = {"a", "o", "e", "ai", "ei", "ao", "ou", "er", "an", "en", "ang", "eng"};
    // Letters accepted in an abbreviation: the single-letter initials plus a, o, e.
    static String[] jp = {"b", "p", "m", "f", "d", "t", "n", "l", "g", "k", "h", "j", "q", "x", "r", "z", "c", "s", "y", "w", "a", "o", "e"};

    // Compiled once: Pattern is immutable and thread-safe, and compiling per call is wasted work.
    private static final Pattern DIGITS = Pattern.compile("[\\d]");
    private static final Pattern CN_WORD = Pattern.compile("^[0-9\\u4E00-\\u9FA5]+$");
    private static final Pattern PY_WORD = Pattern.compile("^[a-zA-Z0-9]+$");

    private static final int TYPE_CHINESE = 1;
    private static final int TYPE_FULL_PINYIN = 2;
    private static final int TYPE_ABBREVIATION = 3;
    private static final int TYPE_PINYIN_AND_ABBREVIATION = 4;
    private static final int TYPE_MIXED = 5;

    /** Outcome of trying to split a letters-only word into pinyin syllables. */
    private enum PinyinMatch {
        /** The whole word splits into syllables. */
        FULL,
        /** At least one syllable starts the word, but the word cannot be split completely. */
        PREFIX_ONLY,
        /** No syllable starts the word. */
        NONE
    }

    /**
     * Detects the type of a keyword.
     *
     * <p>ASCII spaces and digits are removed first so that they do not disturb the matching.
     *
     * @param word the keyword; {@code null} is treated like an empty string
     * @return 1 = Chinese only, 2 = full pinyin only, 3 = pinyin abbreviation only,
     *         4 = full pinyin followed by abbreviation, 5 = anything that is neither all Chinese
     *         nor all ASCII letters (Chinese mixed with pinyin; also the result for input that
     *         is empty once spaces and digits are removed)
     */
    public static int discernWordType(String word) {
        String cleaned = word == null ? "" : DIGITS.matcher(word.replace(" ", "")).replaceAll("");

        if (CN_WORD.matcher(cleaned).matches()) {
            return TYPE_CHINESE;
        }
        if (!PY_WORD.matcher(cleaned).matches()) {
            // Neither all Chinese nor all letters: treated as a Chinese/pinyin mixture.
            return TYPE_MIXED;
        }

        switch (matchFullPinyin(cleaned)) {
            case FULL:
                return TYPE_FULL_PINYIN;
            case NONE:
                // Does not even start with a syllable: treated as an abbreviation.
                return TYPE_ABBREVIATION;
            default:
                // Starts with pinyin but does not split completely. Because a, o and e are both
                // syllables and abbreviation letters, a word made only of abbreviation letters
                // is still reported as an abbreviation.
                return isAbbreviation(cleaned) ? TYPE_ABBREVIATION : TYPE_PINYIN_AND_ABBREVIATION;
        }
    }

    /**
     * Decides whether {@code word} can be split into pinyin syllables.
     *
     * <p>Every syllable length that the tables allow is followed from every reachable position,
     * so a long first match cannot hide a valid split: "tanguo" is found as tan|gu|o even though
     * "tang" also matches at the start.
     */
    private static PinyinMatch matchFullPinyin(String word) {
        int length = word.length();
        if (length == 0) {
            return PinyinMatch.NONE;
        }
        // reachable[p] is true when word[0, p) can be split into syllables.
        boolean[] reachable = new boolean[length + 1];
        reachable[0] = true;
        boolean syllableAtStart = markSyllableEnds(word, 0, reachable);
        for (int pos = 1; pos < length; pos++) {
            if (reachable[pos]) {
                markSyllableEnds(word, pos, reachable);
            }
        }
        if (reachable[length]) {
            return PinyinMatch.FULL;
        }
        return syllableAtStart ? PinyinMatch.PREFIX_ONLY : PinyinMatch.NONE;
    }

    /**
     * Marks in {@code reachable} the end position of every syllable that starts at {@code pos}.
     *
     * <p>A syllable is a whole-unit syllable (2-4 letters), an initial (1-2 letters) followed by
     * a final (1-3 letters), or a stand-alone final (1-3 letters).
     *
     * @return {@code true} if at least one syllable starts at {@code pos}
     */
    private static boolean markSyllableEnds(String word, int pos, boolean[] reachable) {
        boolean found = false;
        for (int len = 4; len >= 2; len--) {
            if (regionIn(zt, word, pos, len)) {
                reachable[pos + len] = true;
                found = true;
            }
        }
        for (int initialLen = 2; initialLen >= 1; initialLen--) {
            if (!regionIn(sm, word, pos, initialLen)) {
                continue;
            }
            for (int finalLen = 3; finalLen >= 1; finalLen--) {
                if (regionIn(ym, word, pos + initialLen, finalLen)) {
                    reachable[pos + initialLen + finalLen] = true;
                    found = true;
                }
            }
        }
        for (int len = 3; len >= 1; len--) {
            if (regionIn(smym, word, pos, len)) {
                reachable[pos + len] = true;
                found = true;
            }
        }
        return found;
    }

    /** Returns whether every letter of {@code word} is an abbreviation letter. */
    private static boolean isAbbreviation(String word) {
        if (word.isEmpty()) {
            return false;
        }
        for (int index = 0; index < word.length(); index++) {
            if (!regionIn(jp, word, index, 1)) {
                return false;
            }
        }
        return true;
    }

    /**
     * Returns whether the {@code len} characters of {@code word} starting at {@code from} equal
     * an entry of {@code table}; a region that runs past the end of the word never matches.
     */
    private static boolean regionIn(String[] table, String word, int from, int len) {
        if (len <= 0 || from + len > word.length()) {
            return false;
        }
        for (String entry : table) {
            if (entry.length() == len && word.startsWith(entry, from)) {
                return true;
            }
        }
        return false;
    }

    /** Demo: classifies a few sample keywords and prints the type and elapsed milliseconds. */
    public static void main(String[] args) {
        String str = "王文勝";
        String str1 = "王wensehng";
        String str2 = "wang文勝";
        String str3 = "wws";
        String str4 = "wangws";
        String str5 = "wangwens";
        String str6 = "wangwensheng";
        String str7 = "amxl";
        List<String> strings = Arrays.asList(str, str1, str2, str3, str4, str5, str6, str7);
        strings.forEach(word -> {
            // 1 = Chinese, 2 = full pinyin, 3 = abbreviation, 4 = pinyin + abbreviation, 5 = mixed
            long million = System.currentTimeMillis();
            int i = discernWordType(word);
            switch (i) {
                case 1:
                    System.err.println(word + ":純中文");
                    break;
                case 2:
                    System.err.println(word + ":純拼音全拼");
                    break;
                case 3:
                    System.err.println(word + ":純拼音縮寫");
                    break;
                case 4:
                    System.err.println(word + ":全拼+簡拼");
                    break;
                case 5:
                    System.err.println(word + ":中文+拼音");
                    break;
                default:
                    System.err.println(word + ":未知類型");
                    break;
            }
            long million2 = System.currentTimeMillis();
            System.err.println("million :" + (million2 - million));
        });
    }
}