概述
前提:數據源不會爆內存
使用HashMap做去重,使用TreeMap做詞頻統計(按關鍵詞排序),再用Collections.sort按詞頻做降序排序
原代碼
KeyWordCount.java
import util.TimeUtil;
import java.io.*;
import java.util.*;
/**
 * Deduplicates search-log records, counts how often each keyword occurs and
 * writes the keywords to a report file in descending order of frequency.
 *
 * <p>The whole data source is held in memory, so it must fit in the heap.
 */
public class KeyWordCount {
    /** Run statistics accumulated as text; written at the top of the report file. */
    static String log = "";

    /**
     * Runs the pipeline: read and deduplicate the data source, then count,
     * sort and write the report.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        String fileStr = "數據源路徑1";
        // Alternative data source; currently not used by the pipeline.
        String fileStr1 = "數據源路徑2";
        long start_1 = System.currentTimeMillis();
        HashMap<String, String> map = Read2Map(fileStr);
        long end_1 = System.currentTimeMillis();
        System.out.printf("%s:%dms %n", "讀取+分割+去重字符串耗時", end_1 - start_1);
        log = log + "讀取+分割+去重字符串耗時:" + (end_1 - start_1) + "ms" + "\n";
        CountForMap(map);
    }

    /**
     * Reads a GBK-encoded, tab-separated file line by line and deduplicates its records.
     *
     * <p>For every line the second column and the third column (with its first and
     * last character stripped, then trimmed) are joined with a tab and used as a map
     * key; the map values are always {@code null}, i.e. the map serves as a set.
     * Lines with fewer than three columns, or whose third column is shorter than two
     * characters, are skipped instead of aborting the run.
     *
     * <p>This method resets {@link #log} and appends the line statistics to it.
     * If an {@link IOException} occurs, the stack trace is printed and the marker key
     * {@code "null"} is added to whatever was read so far.
     *
     * @param fileStr path of the data source
     * @return the deduplicated records as map keys
     */
    public static HashMap<String, String> Read2Map(String fileStr) {
        HashMap<String, String> rdMap = new HashMap<>();
        int lineCount = 0;
        int skipped = 0;
        // Each stream is its own resource so the file handle is released even if
        // constructing the reader fails (e.g. the GBK charset is unavailable).
        try (
                FileInputStream fis = new FileInputStream(new File(fileStr));
                InputStreamReader isr = new InputStreamReader(fis, "GBK"); // source is GBK-encoded Chinese text
                BufferedReader br = new BufferedReader(isr)
        ) {
            String line;
            while ((line = br.readLine()) != null) {
                lineCount++;
                String[] cols = line.split("\t");
                // Need columns 1 and 2, and column 2 must have both wrapping characters.
                if (cols.length < 3 || cols[2].length() < 2) {
                    skipped++;
                    continue;
                }
                String keyword = cols[2].substring(1, cols[2].length() - 1).trim();
                rdMap.put(cols[1] + "\t" + keyword, null);
            }
        } catch (IOException e) {
            e.printStackTrace();
            // Failure marker kept for existing callers; it has no tab, so CountForMap ignores it.
            rdMap.put("null", null);
        }
        System.out.println("原數據條數:" + lineCount);
        log = "原數據條數:" + lineCount + "\n";
        if (skipped > 0) {
            System.out.println("格式錯誤已跳過條數:" + skipped);
            log = log + "格式錯誤已跳過條數:" + skipped + "\n";
        }
        System.out.println("去重後數據條數:" + rdMap.size());
        log = log + "去重後數據條數:" + rdMap.size() + "\n";
        return rdMap;
    }

    /**
     * Counts keyword frequency, sorts by frequency in descending order and writes
     * the statistics log followed by {@code keyword:count} lines to
     * {@code src/testFile/input_<timestamp>.txt}.
     *
     * <p>The keyword is everything after the first tab of each key. Keys without a
     * tab (such as the {@code "null"} failure marker) are ignored. Keywords with
     * equal counts keep the key order of the {@link TreeMap}, because the sort is stable.
     *
     * @param rdMap deduplicated records as produced by {@link #Read2Map(String)}
     */
    public static void CountForMap(HashMap<String, String> rdMap) {
        long start = System.currentTimeMillis();
        TreeMap<String, Integer> countMap = new TreeMap<>();
        for (String key : rdMap.keySet()) {
            // indexOf/substring instead of split: split("\t") drops a trailing empty
            // keyword, which made [1] throw ArrayIndexOutOfBoundsException.
            int tab = key.indexOf('\t');
            if (tab < 0) {
                continue;
            }
            String keyStr = key.substring(tab + 1);
            Integer seen = countMap.get(keyStr);
            countMap.put(keyStr, seen == null ? 1 : seen + 1);
        }
        long end = System.currentTimeMillis();
        System.out.printf("%s:%dms %n", "字符串頻度統計耗時", end - start);
        log = log + "字符串頻度統計耗時:" + (end - start) + "ms" + "\n";
        System.out.println("統計數據條數:" + countMap.size());
        log = log + "統計數據條數:" + countMap.size() + "\n";

        start = System.currentTimeMillis();
        // Copy the entries into a list so they can be ordered by value.
        List<Map.Entry<String, Integer>> countList = new ArrayList<>(countMap.entrySet());
        Collections.sort(countList, new Comparator<Map.Entry<String, Integer>>() {
            @Override
            public int compare(Map.Entry<String, Integer> o1, Map.Entry<String, Integer> o2) {
                return o2.getValue().compareTo(o1.getValue()); // descending by count
            }
        });
        end = System.currentTimeMillis();
        System.out.printf("%s:%dms %n", "字符串頻度排序耗時", end - start);
        log = log + "字符串頻度排序耗時:" + (end - start) + "ms" + "\n";
        System.out.println("");

        start = System.currentTimeMillis();
        File f = new File("src/testFile/input_" + TimeUtil.getTime() + ".txt");
        File parentDir = f.getParentFile();
        if (parentDir != null) {
            // Best effort: if this fails, opening the writer below reports the IOException.
            parentDir.mkdirs();
        }
        // NOTE(review): FileWriter encodes with the platform default charset — confirm
        // that is the encoding readers of the report expect.
        try (BufferedWriter out = new BufferedWriter(new FileWriter(f))) {
            out.write(log);
            out.write("\n");
            for (Map.Entry<String, Integer> entry : countList) {
                out.write(entry.getKey() + ":" + entry.getValue() + "\n");
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
        end = System.currentTimeMillis();
        // The format string already supplies the colon.
        System.out.printf("%s:%dms %n", "結果寫入耗時", end - start);
    }
}
TimeUtil.java
package util;
import java.text.DateFormat;
import java.text.SimpleDateFormat;
import java.util.Date;
import java.util.Locale;
/**
 * Static helpers for producing and reformatting compact timestamps.
 */
public class TimeUtil {
    /**
     * Returns the current time as a compact timestamp.
     *
     * @return the current time as a 14-character {@code yyyyMMddHHmmss} string
     */
    public static String getTime() {
        // Locale.ROOT instead of the JVM default locale, so the default locale's
        // calendar system or digit set cannot leak into the timestamp.
        DateFormat df = new SimpleDateFormat("yyyyMMddHHmmss", Locale.ROOT);
        return df.format(new Date());
    }

    /**
     * Converts a compact timestamp into a short, human-readable form.
     *
     * <p>The century and the seconds are dropped: {@code "20240131123456"} becomes
     * {@code "24/01/31 12:34"}. The input is not validated beyond its length.
     *
     * @param timeCode 14-character timestamp in {@code yyyyMMddHHmmss} form; may be {@code null}
     * @return the timestamp as {@code yy/MM/dd HH:mm}, or the placeholder {@code "無數據"}
     *         when {@code timeCode} is {@code null} or not exactly 14 characters long
     */
    public static String timeC2S(String timeCode) {
        if (timeCode == null || timeCode.length() != 14) {
            return "無數據";
        }
        return timeCode.substring(2, 4) + "/" + timeCode.substring(4, 6) + "/" + timeCode.substring(6, 8)
                + " " + timeCode.substring(8, 10) + ":" + timeCode.substring(10, 12);
    }
}