批量插入百萬數量級的數據到mysql的解決方案

項目場景：需要從上報的單個文件中解析出百萬條數據並入庫。項目中無論使用 JPA 還是 MyBatis，當數據量達到 10000 條時速度明顯變慢，達到 100000 條時就慢得讓人難以接受。因此考慮使用存儲過程或原生 JDBC 來實現，本案例使用原生 JDBC 實現。

單線程實現案例：

import lombok.Data;
import lombok.extern.slf4j.Slf4j;

import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.PreparedStatement;
import java.sql.SQLException;
import java.util.ArrayList;
import java.util.List;

@Slf4j
@Data
public abstract class BigDataInsert<T> {
    //     String driverClassName = "com.mysql.cj.jdbc.Driver";
    String driverClassName = "";
    //     String url = "jdbc:mysql://localhost:3306/bigdata?useServerPrepStmts=false&rewriteBatchedStatements=true&useUnicode=true&characterEncoding=UTF-8";
    String url = "";
    //     String user = "root";
    String user = "";
    //     String password = "123456";
    String password = "";
    //     String sql = "";
    String sql = "";

    public BigDataInsert() {
        init();
    }

    public void insertBigData(List<T> list) {
        //定義連接、statement對象
        Connection conn = null;
        PreparedStatement pstm = null;
        try {
            // 檢查初始化參數
            checkInit();
            //加載jdbc驅動
            Class.forName(driverClassName);
            //連接mysql
            conn = DriverManager.getConnection(url, user, password);
            //將自動提交關閉
            conn.setAutoCommit(false);
            //預編譯sql
            pstm = conn.prepareStatement(sql);
            //開始總計時
            long bTime1 = System.currentTimeMillis();

            List<List<T>> listList = null;
            if (list.size() > 100000) {
                listList = fixedGrouping(list, 100000);
            } else {
                listList.add(list);
            }
            //循環10次，每次十萬數據，一共100萬
            for (int i = 0; i < listList.size(); i++) {
                //開啓分段計時，計1W數據耗時
                long bTime = System.currentTimeMillis();
                //開始循環
                for (T object : listList.get(i)) {
                    //賦值
                    pstmToSetValue(pstm, object);
                    //添加到同一個批處理中
                    pstm.addBatch();
                }
                //執行批處理
                pstm.executeBatch();
                //提交事務
                conn.commit();
                //關閉分段計時
                long eTime = System.currentTimeMillis();
                //輸出
                System.out.println("成功插入" + listList.get(i).size() + "條數據耗時：" + (eTime - bTime));
            }
            //關閉總計時
            long eTime1 = System.currentTimeMillis();
            //輸出
            System.out.println("插入" + list.size() + "數據共耗時：" + (eTime1 - bTime1));
        } catch (SQLException e) {
            e.printStackTrace();
        } catch (ClassNotFoundException e1) {
            e1.printStackTrace();
        }finally {
            try {
                pstm.close();
                conn.close();
            } catch (SQLException e) {
                e.printStackTrace();
            }
        }
    }

    private void checkInit() {
        if ("".equals(driverClassName)) {
            log.warn("driverClassName未初始化！");
        }
        if ("".equals(url)) {
            log.warn("url未初始化！");
        }
        if ("".equals(user)) {
            log.warn("user未初始化！");
        }
        if ("".equals(password)) {
            log.warn("password未初始化！");
        }
        if ("".equals(sql)) {
            log.warn("sql未設置！");
        }
    }

    /**
     * 將一組數據固定分組，每組n個元素
     *
     * @param source 要分組的數據源
     * @param n      每組n個元素
     * @param <T>
     * @return
     */
    public static <T> List<List<T>> fixedGrouping(List<T> source, int n) {

        if (null == source || source.size() == 0 || n <= 0)
            return null;
        List<List<T>> result = new ArrayList<List<T>>();
        int remainder = source.size() % n;
        int size = (source.size() / n);
        for (int i = 0; i < size; i++) {
            List<T> subset = null;
            subset = source.subList(i * n, (i + 1) * n);
            result.add(subset);
        }
        if (remainder > 0) {
            List<T> subset = null;
            subset = source.subList(size * n, size * n + remainder);
            result.add(subset);
        }
        return result;
    }

    public abstract PreparedStatement pstmToSetValue(PreparedStatement pstm, T object);

    public abstract void init();

}

上面的代碼是批量保存大數據的單線程封裝，使用方法如下：

import *****.NetCollectData;
import org.springframework.stereotype.Component;

import java.sql.Date;
import java.sql.PreparedStatement;
import java.sql.SQLException;

/**
 * {@link BigDataInsert} implementation that stores {@code NetCollectData} records in the
 * {@code net_collect_data} table.
 */
@Component("netCollectDataBigDataInsert")
public class NetCollectDataBigDataInsertImpl extends BigDataInsert<NetCollectData> {

    /**
     * Supplies the JDBC driver, connection URL, credentials and the 15-column INSERT statement.
     * The URL sets {@code rewriteBatchedStatements=true} and {@code useServerPrepStmts=false}.
     */
    @Override
    public void init() {
        // NOTE(review): host, user and password are hard-coded in source; they should be moved
        // to external configuration rather than kept under version control.
        this.driverClassName = "com.mysql.cj.jdbc.Driver";
        this.url = "jdbc:mysql://10.1.1.149:3306/cnleb139?useServerPrepStmts=false&rewriteBatchedStatements=true&useUnicode=true&characterEncoding=UTF-8";
        this.user = "root";
        this.password = "Root123!!!";
        this.sql = "INSERT INTO net_collect_data(collect_time,mac,edu_id,ip,time,TS_UP_4G,TS_UP,TS_DOWN_4G,TS_DOWN,PKG_UP_4G,PKG_UP,PKG_DOWN_4G,PKG_DOWN,status_code,created_date) " +
                "VALUES (?,?,?,?,?,?,?,?,?,?,?,?,?,?,?)";
    }

    /**
     * Binds one record to the 15 placeholders of the INSERT statement, in column order.
     *
     * @param pstm           statement prepared from the INSERT in {@link #init()}
     * @param netCollectData record to bind
     * @return {@code pstm}
     * @throws IllegalStateException if a parameter cannot be bound; the original
     *         {@link SQLException} is kept as the cause
     */
    @Override
    public PreparedStatement pstmToSetValue(PreparedStatement pstm, NetCollectData netCollectData) {
        try {
            // Columns 1-5: text values.
            pstm.setString(1, netCollectData.getCOLLECT_TIME());
            pstm.setString(2, netCollectData.getMAC());
            pstm.setString(3, netCollectData.getEduId());
            pstm.setString(4, netCollectData.getIP());
            pstm.setString(5, netCollectData.getTIME());

            // Columns 6-13: TS_* and PKG_* counters.
            pstm.setLong(6, netCollectData.getTS_UP_4G());
            pstm.setLong(7, netCollectData.getTS_UP());
            pstm.setLong(8, netCollectData.getTS_DOWN_4G());
            pstm.setLong(9, netCollectData.getTS_DOWN());
            pstm.setLong(10, netCollectData.getPKG_UP_4G());
            pstm.setLong(11, netCollectData.getPKG_UP());
            pstm.setLong(12, netCollectData.getPKG_DOWN_4G());
            pstm.setLong(13, netCollectData.getPKG_DOWN());
            pstm.setInt(14, netCollectData.getStatus());
            // NOTE(review): bound through setDate with a java.sql.Date; if created_date is meant
            // to keep the time of day, setTimestamp would be needed - confirm the column type.
            pstm.setDate(15, new Date(netCollectData.getDate().getTime()));
        } catch (SQLException e) {
            // Do not swallow: the caller would otherwise add a half-bound row (still carrying
            // values of the previous record) to the batch.
            throw new IllegalStateException("Failed to bind NetCollectData to the insert statement", e);
        }
        return pstm;
    }


}

繼承BigDataInsert實現init()和pstmToSetValue()方法

init(): 初始化鏈接信息的方法（包括插入語句），在實例被創建出來的時候會被調用

pstmToSetValue(): 為預編譯的 SQL 語句設置參數值；調用實例的 insertBigData() 時，會對每一條數據執行一次

根據檢查，這裏是對16個字段的表插入數據，字段越多越慢，50w數據需要15分鐘完成

多線程版本：

import lombok.Data;
import lombok.extern.slf4j.Slf4j;

import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.PreparedStatement;
import java.sql.SQLException;
import java.util.ArrayList;
import java.util.List;
import java.util.concurrent.CountDownLatch;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;

@Slf4j
@Data
public abstract class BigDataInsert<T> {
    //     String driverClassName = "com.mysql.cj.jdbc.Driver";
    String driverClassName = "";
    //     String url = "jdbc:mysql://localhost:3306/bigdata?useServerPrepStmts=false&rewriteBatchedStatements=true&useUnicode=true&characterEncoding=UTF-8";
    String url = "";
    //     String user = "root";
    String user = "";
    //     String password = "123456";
    String password = "";
    //     String sql = "";
    String sql = "";

    int groupCount = 100000;

    int threadPoolCount = 5;

    // 創建一個固定大小的線程池
    private ExecutorService service = null;

    public BigDataInsert() {
        init();
    }

    public void insertBigData(List<T> list) {
        // 檢查初始化參數
        checkInit();
        // 創建線程池對象
        service = Executors.newFixedThreadPool(threadPoolCount);
        // 將需保存集合分組
        List<List<T>> listList = new ArrayList<>();
        if (list.size() > groupCount) {
            listList = fixedGrouping(list, groupCount);
        } else {
            listList.add(list);
        }
        // 計數器
        final CountDownLatch latch = new CountDownLatch(listList.size());

        //開始總計時
        long bTime1 = System.currentTimeMillis();
        //循環10次，每次十萬數據，一共100萬
        for (int i = 0; i < listList.size(); i++) {
            int finalI = i;
            List<List<T>> finalListList = listList;
            // 多線程保存
            service.execute(() -> {
                Connection conn = null;
                PreparedStatement pstm = null;
                try {
                    //加載jdbc驅動
                    Class.forName(driverClassName);
                    //連接mysql
                    conn = DriverManager.getConnection(url, user, password);
                    //將自動提交關閉
                    conn.setAutoCommit(false);
                    //預編譯sql
                    pstm = conn.prepareStatement(sql);
                    //開啓分段計時，計1W數據耗時
                    long bTime = System.currentTimeMillis();
                    //開始循環
                    for (T object : finalListList.get(finalI)) {
                        //賦值
                        pstmToSetValue(pstm, object);
                        //添加到同一個批處理中
                        pstm.addBatch();
                    }
                    //執行批處理
                    pstm.executeBatch();
                    //提交事務
                    conn.commit();
                    //關閉分段計時
                    long eTime = System.currentTimeMillis();
                    //輸出
                    System.out.println("成功插入" + finalListList.get(finalI).size() + "條數據耗時：" + (eTime - bTime));
                } catch (Exception e) {
                    log.error("批量保存失敗！");
                } finally {
                    latch.countDown();
                    try {
                        pstm.close();
                        conn.close();
                    } catch (SQLException e) {
                        e.printStackTrace();
                    }
                }
            });
        }

        try {
            latch.await();
        } catch (Exception e) {
            log.error("多線程分析數據中途異常!,{}", e);
        }
        //關閉總計時
        long eTime1 = System.currentTimeMillis();
        //輸出
        System.out.println("插入" + list.size() + "數據共耗時：" + (eTime1 - bTime1));

    }

    private void checkInit() {
        if ("".equals(driverClassName)) {
            log.warn("driverClassName未初始化！");
        }
        if ("".equals(url)) {
            log.warn("url未初始化！");
        }
        if ("".equals(user)) {
            log.warn("user未初始化！");
        }
        if ("".equals(password)) {
            log.warn("password未初始化！");
        }
        if ("".equals(sql)) {
            log.warn("sql未設置！");
        }
    }

    /**
     * 將一組數據固定分組，每組n個元素
     *
     * @param source 要分組的數據源
     * @param n      每組n個元素
     * @param <T>
     * @return
     */
    public static <T> List<List<T>> fixedGrouping(List<T> source, int n) {

        if (null == source || source.size() == 0 || n <= 0)
            return null;
        List<List<T>> result = new ArrayList<List<T>>();
        int remainder = source.size() % n;
        int size = (source.size() / n);
        for (int i = 0; i < size; i++) {
            List<T> subset = null;
            subset = source.subList(i * n, (i + 1) * n);
            result.add(subset);
        }
        if (remainder > 0) {
            List<T> subset = null;
            subset = source.subList(size * n, size * n + remainder);
            result.add(subset);
        }
        return result;
    }

    public abstract PreparedStatement pstmToSetValue(PreparedStatement pstm, T object);

    public abstract void init();

}

多線程版本允許在實現init()方法時，修改線程數量（默認5）以及分組保存的數量（默認100000）

經過檢查：50w數據3分鐘左右能夠完成（本人電腦一般，電腦性能好的會更快）

參考blog: https://blog.csdn.net/q6834850/article/details/73726707

批量插入百萬數量級的數據到mysql的解決方案

DAPPER 事務 TRANSACTION

批量插入百萬數量級的數據到mysql的解決方案

在CentOS7系統上安裝mysql

vue中使用wangeditor的簡單實踐

java逐行讀取超大文件

使用mycat後jpa踩到的坑（記錄）

https://yachay.unat.edu.pe/blog/index.php?comment_area=format_blog&comment_component=blog&comment_co

linux以太網驅動總結