import java.io.UnsupportedEncodingException; import java.net.URLDecoder; import java.util.regex.Matcher; import java.util.regex.Pattern; public class SeoURLKeyword { public static String keywordReg = "^(?:http|https)://.+(?:\\.baidu\\.com.*[\\&|\\?](?:wd|word)=" + "|\\.soso\\.com.*[\\&|\\?]w=" + "|\\.sogou\\.com.*[\\&|\\?]query=" + "|\\.bing\\.com.*[\\&|\\?]q=" + "|\\.youdao\\.com.*[\\&|\\?]q=" + "|\\.google\\.com.*[\\&|\\?]q=" + "|\\.360\\.cn.*[\\&|\\?][kw|q]=" + "|\\.360sou\\.com.*[\\&|\\?](?:kw|q)=" + "|\\.so\\.com.*[\\&|\\?](?:kw|q)=)([^&]*)"; public static String encodeReg = "^(?:[\\x00-\\x7f]|[\\xfc-\\xff][\\x80-\\xbf]{5}|[\\xf8-\\xfb][\\x80-\\xbf]{4}|[\\xf0-\\xf7][\\x80-\\xbf]{3}|[\\xe0-\\xef][\\x80-\\xbf]{2}|[\\xc0-\\xdf][\\x80-\\xbf])+$"; public static String parsePercent(String dataStr) { if(dataStr.length() <= 1 ){ return "%".equals(dataStr)? "%25":dataStr; } StringBuffer buffer = new StringBuffer(); String[] arr = new String[]{}; int sIdx = dataStr.indexOf("%"); int eIdx = dataStr.lastIndexOf("%"); if (sIdx == -1){ return dataStr; }else{ buffer.append(dataStr.substring(0,sIdx)); arr = dataStr.substring(sIdx+1).split("%"); } for (int i = 0; i < arr.length; i++) { if (arr[i].length() < 2) { buffer.append("%25"); } else { if(Util.isShiLiu(arr[i].substring(0,2))){ buffer.append("%"); }else{ buffer.append("%25"); } } buffer.append(arr[i]); } if (eIdx == dataStr.length()-1) { buffer.append("%25"); } return buffer.toString(); }//end parsePercent public static String getKeyword(String url, String defaultKw) { Pattern keywordPatt = Pattern.compile(keywordReg); StringBuffer keyword = new StringBuffer(20); Matcher keywordMat = keywordPatt.matcher(url); while (keywordMat.find()) { keywordMat.appendReplacement(keyword, "$1"); } if (!keyword.toString().equals("")) { String keywordsTmp = filterPercent25(keyword.toString()); keywordsTmp = parsePercent(keywordsTmp); Pattern encodePatt = Pattern.compile(encodeReg); String unescapeString = ParseURLKeyword.unescape(keywordsTmp); Matcher encodeMat = encodePatt.matcher(unescapeString); String encodeString = "gbk"; if (encodeMat.matches()) encodeString = "utf-8"; try { return decode(keywordsTmp, encodeString).trim(); } catch (UnsupportedEncodingException e) { return defaultKw; } } return defaultKw; }//end getKeyword public static String decode(String kw, String encode) throws UnsupportedEncodingException{ if(kw.startsWith("%u")){ kw = unescape(kw); } return URLDecoder.decode(kw, encode); }//end decode public static String filterPercent25(String str){ String strUnicode = str; //%25 int n = 0; while (strUnicode.contains("%25")) { strUnicode = strUnicode.replace("%25", "%"); n++; if (n == 3) break; } //+ strUnicode = strUnicode.replace("+", ""); return strUnicode.trim(); }// end filterPercent25 public static String unescape(String src) { StringBuffer tmp = new StringBuffer(); tmp.ensureCapacity(src.length()); int lastPos = 0, pos = 0; char ch; while (lastPos < src.length()) { pos = src.indexOf("%", lastPos); if (pos == lastPos) { if (src.charAt(pos + 1) == 'u') { ch = (char) Integer.parseInt(src.substring(pos + 2, pos + 6), 16); tmp.append(ch); lastPos = pos + 6; } else { ch = (char) Integer.parseInt(src.substring(pos + 1, pos + 3), 16); tmp.append(ch); lastPos = pos + 3; } } else { if (pos == -1) { tmp.append(src.substring(lastPos)); lastPos = src.length(); } else { tmp.append(src.substring(lastPos, pos)); lastPos = pos; } } } return tmp.toString(); }//end unescape }
通用解析搜索關鍵詞類
發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.