// Detects whether a string is Chinese, pinyin, a pinyin abbreviation, or a Chinese/pinyin mix.

package com.ysz.search.utils;


import com.alibaba.fastjson.JSONObject;
import com.ysz.search.utils.log.Logger;
import com.ysz.search.utils.log.LoggerFactory;
import org.apache.commons.io.FileUtils;
import org.springframework.util.ResourceUtils;

import java.io.*;
import java.net.HttpURLConnection;
import java.net.URL;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.Locale;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Classifies a keyword as pure Chinese, full pinyin, pinyin abbreviation,
 * full pinyin followed by an abbreviation, or a mixture of Chinese and letters.
 *
 * <p>English words are not considered: the input is expected to contain only letters,
 * digits and Chinese characters. Spaces and digits are removed and letters are
 * lower-cased before classification.
 *
 * <p>Single-letter syllables such as "a", "o" and "e" can be read either as a full
 * syllable or as an abbreviation. When a "full pinyin + abbreviation" string also
 * satisfies the pure-abbreviation rule (every letter is a valid abbreviation letter),
 * it is reported as a pure abbreviation. Adjust this to your own business needs.
 *
 * <p>The syllable tables are deliberately small: a syllable is accepted as soon as it
 * is a whole-syllable entry, an initial followed by a listed final, or a listed
 * stand-alone final. Initial/final combinations are not validated any further.
 *
 * <p>The original author's stated goal is to keep one classification under 1 ms.
 * Original date stamp: 2020/4/27.
 */
public class WordUtils {

    // Initials (shengmu).
    static final String[] sm = {"b", "p", "m", "f", "d", "t", "n", "l", "g", "k", "h", "j", "q", "x", "zh", "ch", "sh", "r", "z", "c", "s", "y", "w"};

    // Finals (yunmu) that may follow an initial.
    static final String[] ym = {"a", "o", "e", "i", "u", "v", "ai", "ei", "ui", "ao", "ou", "iu", "ie", "ve", "er", "an", "en", "in", "un", "vn", "ang", "eng", "ing", "ong"};

    // Syllables recognised as a whole, checked before the initial/final split.
    static final String[] zt = {"zhi", "chi", "shi", "ri", "wu", "yu", "ye", "yue", "yuan", "yin", "yun", "ying"};

    // Finals that form a syllable on their own, without an initial.
    static final String[] smym = {"a", "o", "e", "ai", "ei", "ao", "ou", "er", "an", "en", "ang", "eng"};

    // Letters that are valid in an abbreviation: single-letter initials plus a, o, e.
    static final String[] jp = {"b", "p", "m", "f", "d", "t", "n", "l", "g", "k", "h", "j", "q", "x", "r", "z", "c", "s", "y", "w", "a", "o", "e"};

    // Patterns are compiled once instead of on every call.
    private static final Pattern DIGIT = Pattern.compile("[\\d]");
    private static final Pattern CN_WORD = Pattern.compile("^[0-9\\u4E00-\\u9FA5]+$");
    private static final Pattern PY_WORD = Pattern.compile("^[a-zA-Z0-9]+$");

    // Number of characters handed to lengPinYin for each syllable lookup.
    private static final int MAX_WINDOW = 5;

    /**
     * Detects the type of the given keyword.
     *
     * <p>Spaces and digits are removed first so they do not disturb the syllable scan,
     * and letters are lower-cased because all lookup tables are lowercase.
     *
     * @param word the keyword to classify; {@code null} is treated like an empty string
     * @return one of:
     *         <ul>
     *         <li>1 - pure Chinese</li>
     *         <li>2 - pure full pinyin</li>
     *         <li>3 - pure pinyin abbreviation; also returned for a letters-only string
     *             that does not start with a recognisable syllable</li>
     *         <li>4 - full pinyin followed by an abbreviation</li>
     *         <li>5 - neither pure Chinese nor pure letters (e.g. Chinese mixed with
     *             pinyin); also returned when nothing is left after stripping</li>
     *         </ul>
     */
    public static int discernWordType(String word) {
        if (word == null) {
            // Same result the method gives for an empty string.
            return 5;
        }
        String cleaned = DIGIT.matcher(word.replace(" ", ""))
                .replaceAll("")
                .toLowerCase(Locale.ROOT);

        if (isCNWord(cleaned)) {
            return 1;
        }
        if (!isPYWord(cleaned)) {
            // Neither all Chinese nor all letters: treated as a Chinese/pinyin mix.
            return 5;
        }
        Boolean fullPinyin = isPYQPWord(cleaned);
        if (fullPinyin == null) {
            // Syllables followed by something else. Because single-letter finals can be
            // read as a syllable or as an abbreviation, check the abbreviation rule too.
            return isPYSXWord(cleaned) ? 3 : 4;
        }
        return fullPinyin ? 2 : 3;
    }

    /**
     * Tells whether the word consists only of digits and characters in the
     * range U+4E00..U+9FA5.
     */
    private static boolean isCNWord(String word) {
        return CN_WORD.matcher(word).matches();
    }

    /**
     * Tells whether the word consists only of ASCII letters and digits.
     */
    private static boolean isPYWord(String word) {
        return PY_WORD.matcher(word).matches();
    }

    /**
     * Tells whether the word is made of full pinyin syllables.
     *
     * @return {@code TRUE} if the whole word splits into syllables, {@code FALSE} if it
     *         does not even start with one, {@code null} if it starts with at least one
     *         syllable but the remainder cannot be split
     */
    private static Boolean isPYQPWord(String word) {
        if (word.length() < 2) {
            // A single letter is a full syllable only if it is a stand-alone final.
            return smymFind(word);
        }
        return scanSyllables(word);
    }

    /**
     * Consumes syllables from the start of the word, one window at a time.
     *
     * <p>Implemented as a loop so that long inputs cannot exhaust the call stack.
     * An abbreviation typed before full pinyin is reported as {@code FALSE}; the
     * original author assumes users do not type in that order.
     *
     * @return {@code TRUE} if every character was consumed, {@code FALSE} if the very
     *         first window is not a syllable, {@code null} if a later window is not
     */
    private static Boolean scanSyllables(String word) {
        int total = word.length();
        int pos = 0;
        while (pos < total) {
            String window = word.substring(pos, Math.min(total, pos + MAX_WINDOW));
            int consumed = lengPinYin(window);
            if (consumed == -1) {
                if (pos > 0) {
                    // Syllables were found earlier: pinyin followed by something else.
                    return null;
                }
                return Boolean.FALSE;
            }
            pos += consumed;
        }
        return Boolean.TRUE;
    }

    /**
     * Returns the length of the syllable at the start of {@code word}, or -1 if none.
     *
     * <p>Lookup order, longest candidate first within each step:
     * <ol>
     * <li>whole-syllable entries of 4, 3 and 2 letters;</li>
     * <li>an initial (two letters before one) followed by a final of 3, 2 or 1 letters;</li>
     * <li>if no initial matches, a stand-alone final of 3, 2 or 1 letters.</li>
     * </ol>
     */
    private static int lengPinYin(String word) {
        int n = word.length();

        // 1. Whole syllables.
        for (int len = Math.min(4, n); len >= 2; len--) {
            if (ztFind(word.substring(0, len))) {
                return len;
            }
        }

        // 2. Initial: try the two-letter initials before the single-letter ones.
        int initialLength = 0;
        if (n > 1 && smFind(word.substring(0, 2))) {
            initialLength = 2;
        } else if (n > 0 && smFind(word.substring(0, 1))) {
            initialLength = 1;
        }

        // 3. Final: after an initial use the finals table, otherwise the stand-alone
        //    finals. A one-letter stand-alone final can be a syllable or an abbreviation.
        String[] finals = initialLength > 0 ? ym : smym;
        for (int len = Math.min(3, n - initialLength); len >= 1; len--) {
            String candidate = word.substring(initialLength, initialLength + len);
            if (contains(finals, candidate)) {
                return initialLength + len;
            }
        }
        return -1;
    }

    /**
     * Tells whether every character of the word is a valid abbreviation letter.
     * An empty word is not an abbreviation.
     */
    private static boolean isPYSXWord(String word) {
        if (word.isEmpty()) {
            return false;
        }
        for (int index = 0; index < word.length(); index++) {
            if (!jpFind(String.valueOf(word.charAt(index)))) {
                return false;
            }
        }
        return true;
    }

    /** Tells whether {@code word} is a listed initial. */
    private static boolean smFind(String word) {
        return contains(sm, word);
    }

    /** Tells whether {@code word} is a listed final. */
    private static boolean ymFind(String word) {
        return contains(ym, word);
    }

    /** Tells whether {@code word} is a listed whole syllable. */
    private static boolean ztFind(String word) {
        return contains(zt, word);
    }

    /** Tells whether {@code word} is a listed stand-alone final. */
    private static boolean smymFind(String word) {
        return contains(smym, word);
    }

    /** Tells whether {@code word} is a listed abbreviation letter. */
    private static boolean jpFind(String word) {
        return contains(jp, word);
    }

    /** Linear search shared by all table lookups; the tables are tiny. */
    private static boolean contains(String[] table, String word) {
        for (String entry : table) {
            if (entry.equals(word)) {
                return true;
            }
        }
        return false;
    }

    /**
     * Small manual demo: classifies a few sample keywords and prints the result and
     * the elapsed milliseconds for each to standard error.
     */
    public static void main(String[] args) {
        List<String> samples = Arrays.asList(
                "王文勝",
                "王wensehng",
                "wang文勝",
                "wws",
                "wangws",
                "wangwens",
                "wangwensheng",
                "amxl");

        samples.forEach(word -> {
            long startedAt = System.currentTimeMillis();
            int type = discernWordType(word);
            switch (type) {
                case 1:
                    System.err.println(word + ":純中文");
                    break;
                case 2:
                    System.err.println(word + ":純拼音全拼");
                    break;
                case 3:
                    System.err.println(word + ":純拼音縮寫");
                    break;
                case 4:
                    System.err.println(word + ":全拼+簡拼");
                    break;
                case 5:
                    System.err.println(word + ":中文+拼音");
                    break;
                default:
                    System.err.println(word + ":未知類型");
                    break;
            }
            long finishedAt = System.currentTimeMillis();
            System.err.println("million :" + (finishedAt - startedAt));
        });
    }


}

 

發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章