用 Java 實現大數據去重、詞頻統計、排序

概述

前提:數據源不會爆內存

使用HashMap做去重,使用TreeMap做詞頻統計,最後把統計結果轉成List,用Collections.sort按詞頻做降序排序

 

原代碼

KeyWordCount.java

import util.TimeUtil;

import java.io.*;
import java.util.*;

/**
 * 搜索關鍵詞去重、統計、降序
 */
public class KeyWordCount {
    /** Run report accumulated by the processing steps; written at the top of the result file. */
    static String log = "";

    /**
     * Runs the pipeline: read and deduplicate the data source, then count, sort and write out.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        String fileStr = "數據源路徑1";

        long start_1 = System.currentTimeMillis();
        HashMap<String, String> map = Read2Map(fileStr);
        long end_1 = System.currentTimeMillis();
        System.out.printf("%s:%dms %n", "讀取+分割+去重字符串耗時", end_1 - start_1);
        log = log + "讀取+分割+去重字符串耗時:" + (end_1 - start_1) + "ms" + "\n";
        CountForMap(map);
    }

    /**
     * Reads a GBK-encoded, tab-separated file and deduplicates its rows.
     *
     * <p>Each row contributes the key {@code column[1] + "\t" + keyword}, where the keyword is
     * column[2] with its first and last character removed and then trimmed (presumably the
     * column is wrapped in a pair of delimiters such as brackets; confirm against the data).
     * The map is used as a set: every value is {@code null}.
     *
     * <p>Rows with fewer than three columns, or whose third column is shorter than two
     * characters, are skipped instead of aborting the whole read. If the file cannot be read,
     * the rows collected so far are returned; no placeholder key is inserted, because
     * {@link #CountForMap(HashMap)} cannot parse one.
     *
     * <p>Side effect: resets {@link #log} to the two summary lines produced here.
     *
     * @param fileStr path of the data source
     * @return map whose keys are the deduplicated {@code column[1] \t keyword} pairs
     */
    public static HashMap<String, String> Read2Map(String fileStr) {
        HashMap<String, String> rdMap = new HashMap<>();
        File f = new File(fileStr);
        int i = 0;        // number of lines read, including skipped ones
        int skipped = 0;  // number of lines that did not have the expected shape
        try (
                // GBK avoids garbled Chinese characters in the source data
                InputStreamReader isr = new InputStreamReader(new FileInputStream(f), "GBK");
                BufferedReader br = new BufferedReader(isr)
        ) {
            String line;
            while ((line = br.readLine()) != null) {
                i++;
                String[] lineArr = line.split("\t");
                // substring(1, length - 1) needs at least two characters in the third column
                if (lineArr.length < 3 || lineArr[2].length() < 2) {
                    skipped++;
                    continue;
                }
                String keyword = lineArr[2].substring(1, lineArr[2].length() - 1).trim();
                rdMap.put(lineArr[1] + "\t" + keyword, null);
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
        if (skipped > 0) {
            System.err.println("跳過格式不符的行數:" + skipped);
        }
        System.out.println("原數據條數:" + i);
        log = "原數據條數:" + i + "\n";
        System.out.println("去重後數據條數:" + rdMap.size());
        log = log + "去重後數據條數:" + rdMap.size() + "\n";

        return rdMap;
    }

    /**
     * Counts how many distinct keys share each keyword, sorts the keywords by that count in
     * descending order and writes the report plus the result to a timestamped file under
     * {@code src/testFile/}.
     *
     * <p>The keyword is the second tab-separated field of each key. Keys that are {@code null}
     * or contain no tab are ignored. An empty keyword is counted under the empty string.
     *
     * @param rdMap deduplicated data as produced by {@link #Read2Map(String)}; only the keys
     *              are read
     */
    public static void CountForMap(HashMap<String, String> rdMap) {
        long start = System.currentTimeMillis();
        TreeMap<String, Integer> countMap = new TreeMap<>();
        for (String key : rdMap.keySet()) {
            String keyStr = keywordOf(key);
            if (keyStr == null) {
                continue;
            }
            Integer seen = countMap.get(keyStr);
            countMap.put(keyStr, seen == null ? 1 : seen + 1);
        }
        long end = System.currentTimeMillis();
        System.out.printf("%s:%dms %n", "字符串頻度統計耗時", end - start);
        log = log + "字符串頻度統計耗時:" + (end - start) + "ms" + "\n";
        System.out.println("統計數據條數:" + countMap.size());
        log = log + "統計數據條數:" + countMap.size() + "\n";

        start = System.currentTimeMillis();
        // The TreeMap yields entries in key order and Collections.sort is stable, so keywords
        // with the same count stay in key order.
        List<Map.Entry<String, Integer>> countList = new ArrayList<>(countMap.entrySet());
        Collections.sort(countList, new Comparator<Map.Entry<String, Integer>>() {
            @Override
            public int compare(Map.Entry<String, Integer> o1, Map.Entry<String, Integer> o2) {
                return o2.getValue().compareTo(o1.getValue()); // descending
            }
        });
        end = System.currentTimeMillis();
        System.out.printf("%s:%dms %n", "字符串頻度排序耗時", end - start);
        log = log + "字符串頻度排序耗時:" + (end - start) + "ms" + "\n";

        System.out.println("");

        start = System.currentTimeMillis();
        File f = new File("src/testFile/input_" + TimeUtil.getTime() + ".txt");
        try (
                FileWriter fr = new FileWriter(f)
        ) {
            fr.write(log);
            fr.write("\n");
            for (Map.Entry<String, Integer> mapping : countList) {
                fr.write(mapping.getKey() + ":" + mapping.getValue() + "\n");
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
        end = System.currentTimeMillis();
        // The format string already supplies the colon after the label.
        System.out.printf("%s:%dms %n", "結果寫入耗時", end - start);
    }

    /**
     * Extracts the second tab-separated field of a key without relying on
     * {@code String.split}, which drops trailing empty fields and would therefore lose an
     * empty keyword.
     *
     * @return the field between the first and second tab (or the end of the key), or
     *         {@code null} when the key is {@code null} or contains no tab
     */
    private static String keywordOf(String key) {
        if (key == null) {
            return null;
        }
        int first = key.indexOf('\t');
        if (first < 0) {
            return null;
        }
        int second = key.indexOf('\t', first + 1);
        return second < 0 ? key.substring(first + 1) : key.substring(first + 1, second);
    }
}

TimeUtil.java

package util;

import java.text.DateFormat;
import java.text.SimpleDateFormat;
import java.util.Date;

public class TimeUtil {

    /**
     * Returns the current time as a compact timestamp.
     *
     * @return the current time formatted with the pattern {@code yyyyMMddHHmmss}
     */
    public static String getTime() {
        // A new formatter per call: SimpleDateFormat must not be shared between threads.
        DateFormat df = new SimpleDateFormat("yyyyMMddHHmmss");
        return df.format(new Date());
    }

    /**
     * Converts a compact timestamp into a short, separated display form.
     *
     * <p>The century and the seconds are dropped: {@code "20240102030405"} becomes
     * {@code "24/01/02 03:04"}. Only the length of the input is checked; the characters are
     * copied as they are.
     *
     * @param timeCode 14-character timestamp in {@code yyyyMMddHHmmss} format; may be
     *                 {@code null}
     * @return the timestamp as {@code yy/MM/dd HH:mm}, or the placeholder {@code "無數據"}
     *         when {@code timeCode} is {@code null} or not exactly 14 characters long
     */
    public static String timeC2S(String timeCode) {
        if (timeCode == null || timeCode.length() != 14) {
            return "無數據";
        }
        return timeCode.substring(2, 4) + "/" + timeCode.substring(4, 6) + "/" + timeCode.substring(6, 8) +
                " " + timeCode.substring(8, 10) + ":" + timeCode.substring(10, 12);
    }
}

測試

發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章