解析Office文件文本內容,掃描文件是否涉敏(doc、docx、xls、xlsx、ppt、pptx、pdf、txt)

解析Office文件文本內容,掃描文件是否涉敏(doc、docx、xls、xlsx、ppt、pptx、pdf、txt)

實現思路:先解析出文件的文本內容,再用每條策略的正則表達式逐段(行/單元格/頁)匹配;只要命中任意一條策略,該文件即視爲涉敏文件。

加入主要的依賴(下方只列出 POI;PDF 掃描另外需要 iText 5 的 com.itextpdf:itextpdf,日誌使用 SLF4J)

		<!-- OOXML formats: supplies the XSSF (xlsx), XWPF (docx) and XSLF (pptx) classes imported by the scanner -->
		<dependency>
			<groupId>org.apache.poi</groupId>
			<artifactId>poi-ooxml</artifactId>
			<version>4.1.2</version>
		</dependency>
		<!-- Legacy binary formats: supplies the HWPF (doc) and HSLF (ppt) classes imported by the scanner -->
		<dependency>
			<groupId>org.apache.poi</groupId>
			<artifactId>poi-scratchpad</artifactId>
			<version>4.1.2</version>
		</dependency>

Excel掃描文件是否爲涉敏文件

    /**
     * Routes an Excel scan to the parser that matches the file extension.
     *
     * <p>The extension is compared case-insensitively so that values such as
     * {@code "XLSX"} are not silently treated as unsupported.
     *
     * @param fileType   file extension without the dot ({@code xls} or {@code xlsx})
     * @param filePath   path of the workbook to scan
     * @param list       rules to evaluate against the workbook
     * @param ployEntity scan-range policy, forwarded unchanged
     * @return the rules that matched, or an empty list for any other extension
     */
    private static List<SensitiveEntity> excelScanFile(String fileType, String filePath, List<SensitiveEntity> list, PloyEntity ployEntity) {
        if ("xls".equalsIgnoreCase(fileType)) {
            return excelScanXlsFile(filePath, list, ployEntity);
        }
        if ("xlsx".equalsIgnoreCase(fileType)) {
            return excelScanXlsxFile(filePath, list, ployEntity);
        }
        return new ArrayList<>();
    }

	/**
     * Scans a legacy Excel workbook (.xls) and reports which rules match its cell text.
     *
     * <p>Every sheet is scanned. The row range comes from
     * {@code calculationDiscoveryStrategy}, which is given the sheet's row count
     * ({@code getLastRowNum() + 1}) and yields a half-open range {@code [start, end)},
     * the same convention the other scanners in this class use.
     *
     * <p>Each rule is evaluated against the unmodified cell text with a single
     * {@link Matcher}, so the loop always terminates and one rule's hits cannot
     * hide text from the next rule.
     *
     * @param filePath   path of the .xls file
     * @param list       rules to evaluate; each rule's {@code getRules()} is used as a regex
     * @param ployEntity scan-range policy
     * @return the rules that matched at least once, in first-hit order; on an error the
     *         rules matched so far are returned
     */
    private static List<SensitiveEntity> excelScanXlsFile(String filePath, List<SensitiveEntity> list,PloyEntity ployEntity){
        List<SensitiveEntity> matchedSensitive = new ArrayList<>();
        Map<String, Integer> mapDiscoveryStrategy = new HashMap<>();
        // try-with-resources closes both the stream and the workbook even when parsing fails.
        try (InputStream in = new FileInputStream(new File(filePath));
             HSSFWorkbook sheets = new HSSFWorkbook(in)) {
            for (int sheetIndex = 0; sheetIndex < sheets.getNumberOfSheets(); sheetIndex++) {
                HSSFSheet sheetAt = sheets.getSheetAt(sheetIndex);
                // getLastRowNum() is the last row index, so +1 turns it into a row count.
                calculationDiscoveryStrategy(mapDiscoveryStrategy, sheetAt.getLastRowNum() + 1, ployEntity);
                int start = mapDiscoveryStrategy.get("start");
                int end = mapDiscoveryStrategy.get("end");
                logger.info("掃描文件從{}行開始掃描到{}行結束",start,end);
                for (int rowIndex = start; rowIndex < end; rowIndex++) {
                    HSSFRow row = sheetAt.getRow(rowIndex);
                    if (row == null) {
                        continue;
                    }
                    // getLastCellNum() is "last index + 1"; getPhysicalNumberOfCells() only
                    // counts defined cells and would cut off the tail of a sparse row.
                    int cellEnd = row.getLastCellNum();
                    for (int cellIndex = 0; cellIndex < cellEnd; cellIndex++) {
                        Cell cell = row.getCell(cellIndex);
                        if (cell == null) {
                            continue;
                        }
                        String cellText = getCellValString(cell);
                        // NOTE(review): this writes raw cell content to the log at INFO level.
                        logger.info("表格文本內容:{}",cellText);
                        for (SensitiveEntity sensitiveEntity : list) {
                            Matcher matcher = matcherTxt(sensitiveEntity.getRules(), cellText);
                            boolean hit = false;
                            while (matcher.find()) {
                                hit = true;
                                logger.info("規則:{}    涉敏信息:{}",sensitiveEntity.getRules(),matcher.group(0));
                            }
                            if (hit && !matchedSensitive.contains(sensitiveEntity)) {
                                matchedSensitive.add(sensitiveEntity);
                            }
                        }
                    }
                }
            }
        } catch (Exception e) {
            // Trailing throwable argument makes SLF4J print the stack trace as well.
            logger.error("掃描【{}】文件是否爲涉敏文件異常:{}",filePath,e.getMessage(),e);
        }
        return matchedSensitive;
    }

    /**
     * Scans an OOXML Excel workbook (.xlsx) and reports which rules match its cell text.
     *
     * <p>Every sheet is scanned. The row range comes from
     * {@code calculationDiscoveryStrategy}, which is given the sheet's row count
     * ({@code getLastRowNum() + 1}) and yields a half-open range {@code [start, end)},
     * the same convention the other scanners in this class use.
     *
     * <p>Each rule is evaluated against the unmodified cell text with a single
     * {@link Matcher}, so the loop always terminates and one rule's hits cannot
     * hide text from the next rule.
     *
     * @param filePath   path of the .xlsx file
     * @param list       rules to evaluate; each rule's {@code getRules()} is used as a regex
     * @param ployEntity scan-range policy
     * @return the rules that matched at least once, in first-hit order; on an error the
     *         rules matched so far are returned
     */
    private static List<SensitiveEntity> excelScanXlsxFile(String filePath, List<SensitiveEntity> list,PloyEntity ployEntity){
        List<SensitiveEntity> matchedSensitive = new ArrayList<>();
        Map<String, Integer> mapDiscoveryStrategy = new HashMap<>();
        // try-with-resources closes both the stream and the workbook even when parsing fails.
        try (InputStream in = new FileInputStream(new File(filePath));
             XSSFWorkbook sheets = new XSSFWorkbook(in)) {
            for (int sheetIndex = 0; sheetIndex < sheets.getNumberOfSheets(); sheetIndex++) {
                XSSFSheet sheetAt = sheets.getSheetAt(sheetIndex);
                // getLastRowNum() is the last row index, so +1 turns it into a row count.
                calculationDiscoveryStrategy(mapDiscoveryStrategy, sheetAt.getLastRowNum() + 1, ployEntity);
                int start = mapDiscoveryStrategy.get("start");
                int end = mapDiscoveryStrategy.get("end");
                logger.info("掃描文件從{}行開始掃描到{}行結束",start,end);
                for (int rowIndex = start; rowIndex < end; rowIndex++) {
                    XSSFRow row = sheetAt.getRow(rowIndex);
                    if (row == null) {
                        continue;
                    }
                    // getLastCellNum() is "last index + 1"; getPhysicalNumberOfCells() only
                    // counts defined cells and would cut off the tail of a sparse row.
                    int cellEnd = row.getLastCellNum();
                    for (int cellIndex = 0; cellIndex < cellEnd; cellIndex++) {
                        XSSFCell cell = row.getCell(cellIndex);
                        if (cell == null) {
                            continue;
                        }
                        String cellText = getCellValString(cell);
                        // NOTE(review): this writes raw cell content to the log at INFO level.
                        logger.info("表格文本內容:{}",cellText);
                        for (SensitiveEntity sensitiveEntity : list) {
                            Matcher matcher = matcherTxt(sensitiveEntity.getRules(), cellText);
                            boolean hit = false;
                            while (matcher.find()) {
                                hit = true;
                                logger.info("規則:{}    涉敏信息:{}",sensitiveEntity.getRules(),matcher.group(0));
                            }
                            if (hit && !matchedSensitive.contains(sensitiveEntity)) {
                                matchedSensitive.add(sensitiveEntity);
                            }
                        }
                    }
                }
            }
        } catch (Exception e) {
            // Trailing throwable argument makes SLF4J print the stack trace as well.
            logger.error("掃描【{}】文件是否爲涉敏文件異常:{}",filePath,e.getMessage(),e);
        }
        return matchedSensitive;
    }

Word掃描文件是否爲涉敏文件

	/**
     * Routes a Word scan to the parser that matches the file extension.
     *
     * <p>The extension is compared case-insensitively so that values such as
     * {@code "DOCX"} are not silently treated as unsupported.
     *
     * @param fileType   file extension without the dot ({@code doc} or {@code docx})
     * @param filePath   path of the document to scan
     * @param list       rules to evaluate against the document
     * @param ployEntity scan-range policy, forwarded unchanged
     * @return the rules that matched, or an empty list for any other extension
     */
    private static List<SensitiveEntity> wordScanFile(String fileType, String filePath, List<SensitiveEntity> list, PloyEntity ployEntity) {
        if ("doc".equalsIgnoreCase(fileType)) {
            return wordScanDocFile(filePath, list, ployEntity);
        }
        if ("docx".equalsIgnoreCase(fileType)) {
            return wordScanDocxFile(filePath, list, ployEntity);
        }
        return new ArrayList<>();
    }
	
	/**
     * Scans a legacy Word document (.doc) paragraph by paragraph and reports which rules match.
     *
     * <p>The paragraph range {@code [start, end)} comes from
     * {@code calculationDiscoveryStrategy}, fed with the document's paragraph count.
     * Each rule is evaluated against the unmodified paragraph text with a single
     * {@link Matcher}, so the loop always terminates and one rule's hits cannot
     * hide text from the next rule.
     *
     * @param filePath   path of the .doc file
     * @param list       rules to evaluate; each rule's {@code getRules()} is used as a regex
     * @param ployEntity scan-range policy
     * @return the rules that matched at least once, in first-hit order; on an error the
     *         rules matched so far are returned
     */
    private static List<SensitiveEntity> wordScanDocFile(String filePath, List<SensitiveEntity> list, PloyEntity ployEntity) {
        List<SensitiveEntity> matchedSensitive = new ArrayList<>();
        Map<String, Integer> mapDiscoveryStrategy = new HashMap<>();
        // try-with-resources closes both the stream and the document even when parsing fails.
        try (InputStream in = new FileInputStream(new File(filePath));
             HWPFDocument doc = new HWPFDocument(in)) {
            Range range = doc.getRange();
            int rowNum = range.numParagraphs();
            calculationDiscoveryStrategy(mapDiscoveryStrategy,rowNum,ployEntity);
            int start = mapDiscoveryStrategy.get("start");
            int end = mapDiscoveryStrategy.get("end");
            logger.info("掃描文件從{}行開始掃描到{}行結束",start,end);
            for (int rowIndex = start; rowIndex < end; rowIndex++) {
                String text = range.getParagraph(rowIndex).text();
                // NOTE(review): this writes raw document content to the log at INFO level.
                logger.info("文本內容:{}", text);
                for (SensitiveEntity sensitiveEntity : list) {
                    Matcher matcher = matcherTxt(sensitiveEntity.getRules(), text);
                    boolean hit = false;
                    while (matcher.find()) {
                        hit = true;
                        logger.info("規則:{}    涉敏信息:{}", sensitiveEntity.getRules(), matcher.group(0));
                    }
                    if (hit && !matchedSensitive.contains(sensitiveEntity)) {
                        matchedSensitive.add(sensitiveEntity);
                    }
                }
            }
        } catch (Exception e) {
            // Trailing throwable argument makes SLF4J print the stack trace as well.
            logger.error("掃描【{}】文件是否爲涉敏文件異常:{}",filePath,e.getMessage(),e);
        }
        return matchedSensitive;
    }

    /**
     * Scans an OOXML Word document (.docx) paragraph by paragraph and reports which rules match.
     *
     * <p>Only the paragraphs returned by {@code XWPFDocument.getParagraphs()} are examined.
     * NOTE(review): text inside tables, headers and footers is presumably not covered by
     * that list; confirm whether it needs to be scanned too.
     *
     * <p>The paragraph range {@code [start, end)} comes from
     * {@code calculationDiscoveryStrategy}. Each rule is evaluated against the paragraph
     * text with a single {@link Matcher}, so the loop always terminates.
     *
     * @param filePath   path of the .docx file
     * @param list       rules to evaluate; each rule's {@code getRules()} is used as a regex
     * @param ployEntity scan-range policy
     * @return the rules that matched at least once, in first-hit order; on an error the
     *         rules matched so far are returned
     */
    private static List<SensitiveEntity> wordScanDocxFile(String filePath, List<SensitiveEntity> list, PloyEntity ployEntity) {
        List<SensitiveEntity> matchedSensitive = new ArrayList<>();
        Map<String, Integer> mapDiscoveryStrategy = new HashMap<>();
        // try-with-resources closes both the stream and the document even when parsing fails.
        try (InputStream in = new FileInputStream(new File(filePath));
             XWPFDocument doc = new XWPFDocument(in)) {
            List<XWPFParagraph> paragraphs = doc.getParagraphs();
            calculationDiscoveryStrategy(mapDiscoveryStrategy,paragraphs.size(),ployEntity);
            int start = mapDiscoveryStrategy.get("start");
            int end = mapDiscoveryStrategy.get("end");
            logger.info("掃描文件從{}行開始掃描到{}行結束",start,end);
            for (int rowIndex = start; rowIndex < end; rowIndex++) {
                // Extract the paragraph text once instead of once per rule.
                String runText = paragraphs.get(rowIndex).getParagraphText();
                // NOTE(review): this writes raw document content to the log at INFO level.
                logger.info("runs文本內容:{}",runText);
                for (SensitiveEntity sensitiveEntity : list) {
                    Matcher matcher = matcherTxt(sensitiveEntity.getRules(), runText);
                    boolean hit = false;
                    while (matcher.find()) {
                        hit = true;
                        logger.info("規則:{}    涉敏信息:{}",sensitiveEntity.getRules(),matcher.group(0));
                    }
                    if (hit && !matchedSensitive.contains(sensitiveEntity)) {
                        matchedSensitive.add(sensitiveEntity);
                    }
                }
            }
        } catch (Exception e) {
            // Trailing throwable argument makes SLF4J print the stack trace as well.
            logger.error("掃描【{}】文件是否爲涉敏文件異常:{}",filePath,e.getMessage(),e);
        }
        return matchedSensitive;
    }

TXT掃描文件是否爲涉敏文件

	/**
     * Scans a plain-text file line by line and reports which rules match.
     *
     * <p>The whole file is read into memory first so that the line count can be passed to
     * {@code calculationDiscoveryStrategy}; only lines in the resulting range
     * {@code [start, end)} are then matched. Each rule is evaluated against the unmodified
     * line with a single {@link Matcher}, so the loop always terminates and one rule's
     * hits cannot hide text from the next rule.
     *
     * <p>NOTE(review): {@link FileReader} decodes with the platform default charset;
     * confirm that this matches the encoding of the files being scanned.
     *
     * @param filePath   path of the text file
     * @param list       rules to evaluate; each rule's {@code getRules()} is used as a regex
     * @param ployEntity scan-range policy
     * @return the rules that matched at least once, in first-hit order; on an error the
     *         rules matched so far are returned
     */
    private static List<SensitiveEntity> txtScanFile(String filePath, List<SensitiveEntity> list, PloyEntity ployEntity) {
        List<SensitiveEntity> matchedSensitive = new ArrayList<>();
        List<String> listStr = new ArrayList<>();
        Map<String, Integer> mapDiscoveryStrategy = new HashMap<>();
        // try-with-resources releases the file handle even when reading fails.
        try (BufferedReader bufferedReader = new BufferedReader(new FileReader(filePath))) {
            String lineStr;
            while ((lineStr = bufferedReader.readLine()) != null) {
                listStr.add(lineStr);
            }
            calculationDiscoveryStrategy(mapDiscoveryStrategy,listStr.size(),ployEntity);
            int start = mapDiscoveryStrategy.get("start");
            int end = mapDiscoveryStrategy.get("end");
            logger.info("掃描文件從{}行開始掃描到{}行結束",start,end);
            for (int rowIndex = start; rowIndex < end; rowIndex++) {
                lineStr = listStr.get(rowIndex);
                // NOTE(review): this writes raw file content to the log at INFO level.
                logger.info("文本內容:{}",lineStr);
                for (SensitiveEntity sensitiveEntity : list) {
                    Matcher matcher = matcherTxt(sensitiveEntity.getRules(), lineStr);
                    boolean hit = false;
                    while (matcher.find()) {
                        hit = true;
                        logger.info("規則:{}    涉敏信息:{}",sensitiveEntity.getRules(),matcher.group(0));
                    }
                    if (hit && !matchedSensitive.contains(sensitiveEntity)) {
                        matchedSensitive.add(sensitiveEntity);
                    }
                }
            }
        } catch (Exception e) {
            // Trailing throwable argument makes SLF4J print the stack trace as well.
            logger.error("掃描【{}】文件是否爲涉敏文件異常:{}",filePath,e.getMessage(),e);
        }
        return matchedSensitive;
    }

PPT掃描文件是否爲涉敏文件

	/**
     * Scans a PowerPoint file (.ppt or .pptx) line by line and reports which rules match.
     *
     * <p>The text is pulled out with the matching POI extractor and split on {@code "\n"};
     * the resulting line count is passed to {@code calculationDiscoveryStrategy} to obtain
     * the range {@code [start, end)}. Any other {@code fileType} leaves the line array
     * empty. Each rule is evaluated against the unmodified line with a single
     * {@link Matcher}, so the loop always terminates and one rule's hits cannot hide text
     * from the next rule.
     *
     * @param fileType   file extension without the dot, compared case-insensitively
     * @param filePath   path of the presentation
     * @param list       rules to evaluate; each rule's {@code getRules()} is used as a regex
     * @param ployEntity scan-range policy
     * @return the rules that matched at least once, in first-hit order; on an error the
     *         rules matched so far are returned
     */
    private static List<SensitiveEntity> pptScanFile(String fileType, String filePath, List<SensitiveEntity> list, PloyEntity ployEntity) {
        List<SensitiveEntity> matchedSensitive = new ArrayList<>();
        Map<String, Integer> mapDiscoveryStrategy = new HashMap<>();
        try {
            String[] texts = new String[]{};
            if ("ppt".equalsIgnoreCase(fileType)) {
                logger.info("掃描文件類型爲PPT");
                // try-with-resources closes the stream and extractor even if getText() fails.
                try (InputStream in = new FileInputStream(new File(filePath));
                     PowerPointExtractor extractor = new PowerPointExtractor(in)) {
                    texts = extractor.getText().split("\n");
                }
            } else if ("pptx".equalsIgnoreCase(fileType)) {
                logger.info("掃描文件類型爲PPTX");
                try (XSLFPowerPointExtractor xslfExtractor = new XSLFPowerPointExtractor(POIXMLDocument.openPackage(filePath))) {
                    texts = xslfExtractor.getText().split("\n");
                }
            }
            calculationDiscoveryStrategy(mapDiscoveryStrategy,texts.length,ployEntity);
            int start = mapDiscoveryStrategy.get("start");
            int end = mapDiscoveryStrategy.get("end");
            logger.info("掃描文件從{}行開始掃描到{}行結束",start,end);
            for (int rowIndex = start; rowIndex < end; rowIndex++) {
                String lineStr = texts[rowIndex];
                // NOTE(review): this writes raw slide content to the log at INFO level.
                logger.info("文本內容:{}",lineStr);
                for (SensitiveEntity sensitiveEntity : list) {
                    Matcher matcher = matcherTxt(sensitiveEntity.getRules(), lineStr);
                    boolean hit = false;
                    while (matcher.find()) {
                        hit = true;
                        logger.info("規則:{}    涉敏信息:{}",sensitiveEntity.getRules(),matcher.group(0));
                    }
                    if (hit && !matchedSensitive.contains(sensitiveEntity)) {
                        matchedSensitive.add(sensitiveEntity);
                    }
                }
            }
        } catch (Exception e) {
            // The logger receives the throwable, so no separate printStackTrace() is needed.
            logger.error("掃描【{}】文件是否爲涉敏文件異常:{}",filePath,e.getMessage(),e);
        }
        return matchedSensitive;
    }

PDF掃描文件是否爲涉敏文件

	/**
     * Scans a PDF page by page and reports which rules match the extracted text.
     *
     * <p>The scan-range policy works in lines, so each page is treated as 30 lines.
     * The half-open line range {@code [start, end)} from
     * {@code calculationDiscoveryStrategy} is converted to 1-based pages: the first page
     * is the one containing line {@code start}, the last page is the one containing line
     * {@code end - 1} (rounded up, capped at the page count).
     *
     * <p>Each rule is evaluated against the unmodified page text with a single
     * {@link Matcher}, so the loop always terminates and one rule's hits cannot hide text
     * from the next rule.
     *
     * @param filePath   path of the PDF file
     * @param list       rules to evaluate; each rule's {@code getRules()} is used as a regex
     * @param ployEntity scan-range policy
     * @return the rules that matched at least once, in first-hit order; on an error the
     *         rules matched so far are returned
     */
    private static List<SensitiveEntity> pdfScanFile(String filePath, List<SensitiveEntity> list, PloyEntity ployEntity) {
        logger.info("==============pdfScanFile==========");
        List<SensitiveEntity> matchedSensitive = new ArrayList<>();
        Map<String, Integer> mapDiscoveryStrategy = new HashMap<>();
        // Assumed number of text lines per page, used only to map the line policy onto pages.
        final int linesPerPage = 30;
        com.itextpdf.text.pdf.PdfReader pdfReader = null;
        try {
            pdfReader = new com.itextpdf.text.pdf.PdfReader(filePath);
            com.itextpdf.text.pdf.parser.PdfReaderContentParser pdfReaderContentParser = new com.itextpdf.text.pdf.parser.PdfReaderContentParser(pdfReader);
            int pageCount = pdfReader.getNumberOfPages();
            calculationDiscoveryStrategy(mapDiscoveryStrategy, pageCount * linesPerPage, ployEntity);
            // Line index -> 1-based page number; the end bound is rounded up so that the
            // page holding the last requested line is still scanned.
            int start = mapDiscoveryStrategy.get("start") / linesPerPage + 1;
            int end = Math.min(pageCount, (mapDiscoveryStrategy.get("end") + linesPerPage - 1) / linesPerPage);
            logger.info("掃描文件從{}頁開始掃描到{}頁結束",start,end);
            for (int i = start; i <= end; i++) {
                com.itextpdf.text.pdf.parser.TextExtractionStrategy simpleTextExtractionStrategy = pdfReaderContentParser.processContent(i, new com.itextpdf.text.pdf.parser.SimpleTextExtractionStrategy());
                String resultantText = simpleTextExtractionStrategy.getResultantText();
                // NOTE(review): this writes raw page content to the log at INFO level.
                logger.info("PDF每頁文本內容:{}",resultantText);
                for (SensitiveEntity sensitiveEntity : list) {
                    Matcher matcher = matcherTxt(sensitiveEntity.getRules(), resultantText);
                    boolean hit = false;
                    while (matcher.find()) {
                        hit = true;
                        logger.info("規則:{}    涉敏信息:{}",sensitiveEntity.getRules(),matcher.group(0));
                    }
                    if (hit && !matchedSensitive.contains(sensitiveEntity)) {
                        matchedSensitive.add(sensitiveEntity);
                    }
                }
            }
        } catch (Exception e) {
            // Catch broadly, like the other scanners, so partial matches are still returned.
            logger.error("掃描【{}】文件是否爲涉敏文件異常:{}",filePath,e.getMessage(),e);
        } finally {
            if (pdfReader != null) {
                try {
                    pdfReader.close();
                } catch (Exception closeError) {
                    logger.warn("關閉PDF文件【{}】異常:{}",filePath,closeError.getMessage(),closeError);
                }
            }
        }
        return matchedSensitive;
    }

Main測試及補充方法

package com.zxl.demo.utiles;

import com.zxl.demo.entity.PloyEntity;
import com.zxl.demo.entity.SensitiveEntity;
import org.apache.poi.hslf.extractor.PowerPointExtractor;
import org.apache.poi.hssf.usermodel.HSSFRow;
import org.apache.poi.hssf.usermodel.HSSFSheet;
import org.apache.poi.hssf.usermodel.HSSFWorkbook;
import org.apache.poi.hwpf.HWPFDocument;
import org.apache.poi.hwpf.usermodel.Range;
import org.apache.poi.ooxml.POIXMLDocument;
import org.apache.poi.ss.usermodel.Cell;
import org.apache.poi.ss.usermodel.CellType;
import org.apache.poi.ss.util.NumberToTextConverter;
import org.apache.poi.xslf.extractor.XSLFPowerPointExtractor;
import org.apache.poi.xssf.usermodel.XSSFCell;
import org.apache.poi.xssf.usermodel.XSSFRow;
import org.apache.poi.xssf.usermodel.XSSFSheet;
import org.apache.poi.xssf.usermodel.XSSFWorkbook;
import org.apache.poi.xwpf.usermodel.XWPFDocument;
import org.apache.poi.xwpf.usermodel.XWPFParagraph;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.*;
import java.util.*;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * @Describe: Utility class that scans office-type files for sensitive content.
 * @Author: zml
 * @Date: 2020-4-27 11:25:30
 */
public class OfficeScanDesensitizationUtils {

    // SLF4J logger shared by all static scan methods in this class.
    private static Logger logger = LoggerFactory.getLogger(OfficeScanDesensitizationUtils.class);
    
    /**
     * Demo entry point: scans one hard-coded .docx file with two sample rules
     * (a literal phone number and an e-mail regex) under the "top 100" policy
     * and logs the outcome.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        // Rule 1: a literal phone number.
        SensitiveEntity phoneRule = new SensitiveEntity();
        phoneRule.setRulename("策略1");
        phoneRule.setRules("17711131114");
        phoneRule.setNode("掃描包含17711131114手機號的文件");

        // Rule 2: e-mail addresses.
        SensitiveEntity emailRule = new SensitiveEntity();
        emailRule.setRulename("策略2");
        emailRule.setRules("\\w+([-+.]\\w+)*@\\w+([-.]\\w+)*\\.\\w+([-.]\\w+)*");
        emailRule.setNode("掃描Email地址");

        // Policy: restrict the scan to the leading part of the file.
        PloyEntity ployEntity = new PloyEntity();
        ployEntity.setTop100(true);

        List<SensitiveEntity> hits = scanFile(
                "docx",
                "D:\\liang\\office掃描\\word-docx掃描.docx",
                Arrays.asList(phoneRule, emailRule),
                ployEntity);

        if (hits.isEmpty()) {
            logger.info("不是涉敏文件");
            return;
        }
        logger.info("該文件爲涉敏文件");
        for (SensitiveEntity hit : hits) {
            logger.info("涉敏策略爲:{}  描述:{}",hit.getRulename(),hit.getNode());
        }
    }
    
    /**
     * Scans a file for sensitive content, choosing the parser from the file type.
     *
     * <p>The type is normalised once (trimmed, leading dot removed, lower-cased) and the
     * normalised value is what gets forwarded to the format-specific scanners, so
     * {@code "DOCX"} or {@code ".xlsx"} behave the same as {@code "docx"} / {@code "xlsx"}.
     * Dispatch is by substring: {@code doc}, {@code xls}, {@code txt}, {@code ppt},
     * {@code pdf}, checked in that order.
     *
     * @param fileType   file extension, e.g. {@code docx}
     * @param filePath   path of the file to scan
     * @param list       rules to evaluate
     * @param ployEntity scan-range policy
     * @return the rules that matched; an empty list when the type is unsupported or an
     *         unexpected error occurs
     */
    public static List<SensitiveEntity> scanFile(String fileType, String filePath, List<SensitiveEntity> list, PloyEntity ployEntity){
        List<SensitiveEntity> sensitiveEntities = new ArrayList<>();
        try {
            String type = fileType.trim().toLowerCase(Locale.ROOT);
            if (type.startsWith(".")) {
                type = type.substring(1);
            }
            if (type.contains("doc")) {
                sensitiveEntities = wordScanFile(type, filePath, list, ployEntity);
            } else if (type.contains("xls")) {
                sensitiveEntities = excelScanFile(type, filePath, list, ployEntity);
            } else if (type.contains("txt")) {
                sensitiveEntities = txtScanFile(filePath, list, ployEntity);
            } else if (type.contains("ppt")) {
                sensitiveEntities = pptScanFile(type, filePath, list, ployEntity);
            } else if (type.contains("pdf")) {
                logger.info("掃描PDF類型文件");
                sensitiveEntities = pdfScanFile(filePath, list, ployEntity);
            }
        } catch (Exception e) {
            // The logger receives the throwable, so no separate printStackTrace() is needed.
            logger.error("掃描文件是否爲涉敏文件異常:{}",e.getMessage(),e);
        }
        return sensitiveEntities;
    }

 	/**
     * Computes the scan range for the given policy and stores it in the map under the
     * keys {@code "start"} and {@code "end"}.
     *
     * <p>Policies are checked in this order: full scan, custom range, first 100, last 100.
     * For a custom range the start is clamped to be non-negative and the end is capped at
     * {@code size}.
     *
     * @param mapDiscoveryStrategy output map that receives {@code "start"} and {@code "end"}
     * @param size                 number of scannable units (rows, paragraphs, lines) in the source
     * @param ployEntity           policy flags and, for a custom range, its bounds
     * @throws IllegalStateException if no policy flag is set; previously the map was left
     *         without keys and callers failed later with a message-less
     *         {@link NullPointerException} while unboxing
     */
    private static void calculationDiscoveryStrategy(Map<String, Integer> mapDiscoveryStrategy, int size, PloyEntity ployEntity) {
        if (ployEntity.isAll()) {
            // Full scan.
            mapDiscoveryStrategy.put("start", 0);
            mapDiscoveryStrategy.put("end", size);
        } else if (ployEntity.isCustomize()) {
            // Caller-supplied range, kept inside the valid bounds.
            mapDiscoveryStrategy.put("start", Math.max(0, ployEntity.getStart()));
            mapDiscoveryStrategy.put("end", Math.min(ployEntity.getEnd(), size));
        } else if (ployEntity.isTop100()) {
            // First 100 units, or fewer when the source is shorter.
            mapDiscoveryStrategy.put("start", 0);
            mapDiscoveryStrategy.put("end", Math.min(100, size));
        } else if (ployEntity.isLast100()) {
            // Last 100 units, or everything when the source is shorter.
            mapDiscoveryStrategy.put("start", 100 > size ? 0 : size - 100);
            mapDiscoveryStrategy.put("end", size);
        } else {
            throw new IllegalStateException(
                    "No scan range policy selected (expected one of: all, customize, top100, last100)");
        }
    }

    /**
     * Converts a cell's value to text for regex matching.
     *
     * <p>String, boolean and numeric cells are converted directly. Formula cells are
     * resolved through their cached result type, so the value the spreadsheet displays
     * is scanned rather than being skipped. Blank, error and other cell types yield an
     * empty string.
     *
     * @param cell the cell to read; {@code null} is tolerated
     * @return the cell value as text, never {@code null}
     */
    public static String getCellValString(Cell cell){
        if (cell == null) {
            return "";
        }
        CellType cellType = cell.getCellType();
        if (CellType.FORMULA.equals(cellType)) {
            // The typed getters below return the cached result when the cell is a formula.
            cellType = cell.getCachedFormulaResultType();
        }
        switch (cellType) {
            case STRING:
                return cell.getStringCellValue();
            case BOOLEAN:
                return String.valueOf(cell.getBooleanCellValue());
            case NUMERIC:
                return NumberToTextConverter.toText(cell.getNumericCellValue());
            default:
                return "";
        }
    }

    /**
     * Builds a case-insensitive matcher for the given regex over the given text.
     *
     * @param regex regular expression to compile
     * @param str   text to match against
     * @return a fresh, not yet advanced {@link Matcher}
     */
    private static Matcher matcherTxt(String regex,String str) {
        return Pattern.compile(regex, Pattern.CASE_INSENSITIVE).matcher(str);
    }
發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章