解析Office文件文本內容,掃描文件是否涉敏(doc、docx、xls、xlsx、ppt、pptx、pdf、txt)
實現思路:將解析到的文本內容利用正則表達式去匹配
加入主要的依賴(以下爲POI依賴;PDF掃描部分另外使用了iText 5的com.itextpdf.text.pdf包,需自行引入對應依賴):
<dependency>
<groupId>org.apache.poi</groupId>
<artifactId>poi-ooxml</artifactId>
<version>4.1.2</version>
</dependency>
<dependency>
<groupId>org.apache.poi</groupId>
<artifactId>poi-scratchpad</artifactId>
<version>4.1.2</version>
</dependency>
Excel掃描文件是否爲涉敏文件
/**
 * Dispatches an Excel scan to the reader that matches the file extension.
 *
 * <p>The extension is compared ignoring letter case so that a value such as
 * {@code "XLSX"} is not silently treated as an unsupported type.
 *
 * @param fileType   file extension without the dot ({@code "xls"} or {@code "xlsx"})
 * @param filePath   path of the workbook to scan
 * @param list       rules to test against the workbook text
 * @param ployEntity scan policy that selects the row range
 * @return the rules that matched; an empty list when the extension is neither xls nor xlsx
 */
private static List<SensitiveEntity> excelScanFile(String fileType, String filePath, List<SensitiveEntity> list, PloyEntity ployEntity) {
    if ("xls".equalsIgnoreCase(fileType)) {
        return excelScanXlsFile(filePath, list, ployEntity);
    }
    if ("xlsx".equalsIgnoreCase(fileType)) {
        return excelScanXlsxFile(filePath, list, ployEntity);
    }
    return new ArrayList<>();
}
/**
 * Scans a legacy Excel workbook (.xls) and reports which rules match at least one cell.
 *
 * <p>Every sheet is scanned. The row range of each sheet comes from
 * {@code calculationDiscoveryStrategy}, which is given {@code getLastRowNum()}; the upper
 * bound is therefore treated as inclusive. Each rule is evaluated against the unmodified
 * cell text, so one rule matching cannot hide a match of another rule.
 *
 * @param filePath   path of the .xls file
 * @param list       rules whose {@code getRules()} regex is searched for in every cell
 * @param ployEntity scan policy that selects the row range
 * @return the matched rules in first-match order without duplicates; when reading fails the
 *         error is logged and the rules matched so far are returned
 */
private static List<SensitiveEntity> excelScanXlsFile(String filePath, List<SensitiveEntity> list,PloyEntity ployEntity){
    // Rules that matched at least once.
    List<SensitiveEntity> matchedSensitive = new ArrayList<>();
    Map<String, Integer> mapDiscoveryStrategy = new HashMap<>();
    // try-with-resources releases the stream and the workbook even when scanning fails.
    try (FileInputStream input = new FileInputStream(new File(filePath));
         HSSFWorkbook sheets = new HSSFWorkbook(input)) {
        for (int sheetIndex = 0; sheetIndex < sheets.getNumberOfSheets(); sheetIndex++) {
            HSSFSheet sheetAt = sheets.getSheetAt(sheetIndex);
            // Work out which rows of this sheet the policy asks for.
            calculationDiscoveryStrategy(mapDiscoveryStrategy, sheetAt.getLastRowNum(), ployEntity);
            int start = mapDiscoveryStrategy.get("start");
            int end = mapDiscoveryStrategy.get("end");
            logger.info("掃描文件從{}行開始掃描到{}行結束",start,end);
            for (int rowIndex = start; rowIndex <= end; rowIndex++) {
                HSSFRow row = sheetAt.getRow(rowIndex);
                if (row == null) {
                    continue;
                }
                // getLastCellNum() is the last cell index plus one (-1 for a row without cells),
                // so cells that follow a gap are visited too; the physical cell count would
                // stop early on sparse rows.
                for (int cellIndex = 0; cellIndex < row.getLastCellNum(); cellIndex++) {
                    Cell cell = row.getCell(cellIndex);
                    if (cell == null) {
                        continue;
                    }
                    String cellText = getCellValString(cell);
                    logger.info("表格文本內容:{}",cellText);
                    for (SensitiveEntity sensitiveEntity : list) {
                        // A single matcher walks all occurrences. Re-matching after
                        // replaceFirst(group, ...) interprets the hit as a regex and can loop
                        // forever when the hit contains metacharacters or is empty.
                        Matcher matcher = matcherTxt(sensitiveEntity.getRules(), cellText);
                        boolean hit = false;
                        while (matcher.find()) {
                            hit = true;
                            logger.info("規則:{} 涉敏信息:{}",sensitiveEntity.getRules(),matcher.group(0));
                        }
                        if (hit && !matchedSensitive.contains(sensitiveEntity)) {
                            matchedSensitive.add(sensitiveEntity);
                        }
                    }
                }
            }
        }
    } catch (Exception e) {
        // Best effort: report the failure (with stack trace) and return what was found so far.
        logger.error("掃描【{}】文件是否爲涉敏文件異常:{}",filePath,e.getMessage(),e);
    }
    return matchedSensitive;
}
/**
 * Scans an OOXML Excel workbook (.xlsx) and reports which rules match at least one cell.
 *
 * <p>Every sheet is scanned. The row range of each sheet comes from
 * {@code calculationDiscoveryStrategy}, which is given {@code getLastRowNum()}; the upper
 * bound is therefore treated as inclusive. Each rule is evaluated against the unmodified
 * cell text, so one rule matching cannot hide a match of another rule.
 *
 * @param filePath   path of the .xlsx file
 * @param list       rules whose {@code getRules()} regex is searched for in every cell
 * @param ployEntity scan policy that selects the row range
 * @return the matched rules in first-match order without duplicates; when reading fails the
 *         error is logged and the rules matched so far are returned
 */
private static List<SensitiveEntity> excelScanXlsxFile(String filePath, List<SensitiveEntity> list,PloyEntity ployEntity){
    // Rules that matched at least once.
    List<SensitiveEntity> matchedSensitive = new ArrayList<>();
    Map<String, Integer> mapDiscoveryStrategy = new HashMap<>();
    // try-with-resources releases the stream and the workbook even when scanning fails.
    try (FileInputStream input = new FileInputStream(new File(filePath));
         XSSFWorkbook sheets = new XSSFWorkbook(input)) {
        for (int sheetIndex = 0; sheetIndex < sheets.getNumberOfSheets(); sheetIndex++) {
            XSSFSheet sheetAt = sheets.getSheetAt(sheetIndex);
            // Work out which rows of this sheet the policy asks for.
            calculationDiscoveryStrategy(mapDiscoveryStrategy, sheetAt.getLastRowNum(), ployEntity);
            int start = mapDiscoveryStrategy.get("start");
            int end = mapDiscoveryStrategy.get("end");
            logger.info("掃描文件從{}行開始掃描到{}行結束",start,end);
            for (int rowIndex = start; rowIndex <= end; rowIndex++) {
                XSSFRow row = sheetAt.getRow(rowIndex);
                if (row == null) {
                    continue;
                }
                // getLastCellNum() is the last cell index plus one (-1 for a row without cells),
                // so cells that follow a gap are visited too; the physical cell count would
                // stop early on sparse rows.
                for (int cellIndex = 0; cellIndex < row.getLastCellNum(); cellIndex++) {
                    XSSFCell cell = row.getCell(cellIndex);
                    if (cell == null) {
                        continue;
                    }
                    String cellText = getCellValString(cell);
                    logger.info("表格文本內容:{}",cellText);
                    for (SensitiveEntity sensitiveEntity : list) {
                        // A single matcher walks all occurrences. Re-matching after
                        // replaceFirst(group, ...) interprets the hit as a regex and can loop
                        // forever when the hit contains metacharacters or is empty.
                        Matcher matcher = matcherTxt(sensitiveEntity.getRules(), cellText);
                        boolean hit = false;
                        while (matcher.find()) {
                            hit = true;
                            logger.info("規則:{} 涉敏信息:{}",sensitiveEntity.getRules(),matcher.group(0));
                        }
                        if (hit && !matchedSensitive.contains(sensitiveEntity)) {
                            matchedSensitive.add(sensitiveEntity);
                        }
                    }
                }
            }
        }
    } catch (Exception e) {
        // Best effort: report the failure (with stack trace) and return what was found so far.
        logger.error("掃描【{}】文件是否爲涉敏文件異常:{}",filePath,e.getMessage(),e);
    }
    return matchedSensitive;
}
Word掃描文件是否爲涉敏文件
/**
 * Dispatches a Word scan to the reader that matches the file extension.
 *
 * <p>The extension is compared ignoring letter case so that a value such as
 * {@code "DOCX"} is not silently treated as an unsupported type.
 *
 * @param fileType   file extension without the dot ({@code "doc"} or {@code "docx"})
 * @param filePath   path of the document to scan
 * @param list       rules to test against the document text
 * @param ployEntity scan policy that selects the paragraph range
 * @return the rules that matched; an empty list when the extension is neither doc nor docx
 */
private static List<SensitiveEntity> wordScanFile(String fileType, String filePath, List<SensitiveEntity> list, PloyEntity ployEntity) {
    if ("doc".equalsIgnoreCase(fileType)) {
        return wordScanDocFile(filePath, list, ployEntity);
    }
    if ("docx".equalsIgnoreCase(fileType)) {
        return wordScanDocxFile(filePath, list, ployEntity);
    }
    return new ArrayList<>();
}
/**
 * Scans a legacy Word document (.doc) paragraph by paragraph and reports which rules match.
 *
 * <p>The paragraph range comes from {@code calculationDiscoveryStrategy} with the paragraph
 * count as size; the upper bound is exclusive. Each rule is evaluated against the unmodified
 * paragraph text.
 *
 * @param filePath   path of the .doc file
 * @param list       rules whose {@code getRules()} regex is searched for in every paragraph
 * @param ployEntity scan policy that selects the paragraph range
 * @return the matched rules in first-match order without duplicates; when reading fails the
 *         error is logged and the rules matched so far are returned
 */
private static List<SensitiveEntity> wordScanDocFile(String filePath, List<SensitiveEntity> list, PloyEntity ployEntity) {
    // Rules that matched at least once.
    List<SensitiveEntity> matchedSensitive = new ArrayList<>();
    Map<String, Integer> mapDiscoveryStrategy = new HashMap<>();
    // try-with-resources releases the stream and the document even when scanning fails.
    try (FileInputStream input = new FileInputStream(new File(filePath));
         HWPFDocument doc = new HWPFDocument(input)) {
        Range range = doc.getRange();
        int rowNum = range.numParagraphs();
        // Work out which paragraphs the policy asks for.
        calculationDiscoveryStrategy(mapDiscoveryStrategy, rowNum, ployEntity);
        int start = mapDiscoveryStrategy.get("start");
        int end = mapDiscoveryStrategy.get("end");
        logger.info("掃描文件從{}行開始掃描到{}行結束",start,end);
        for (int rowIndex = start; rowIndex < end; rowIndex++) {
            String text = range.getParagraph(rowIndex).text();
            logger.info("文本內容:{}", text);
            for (SensitiveEntity sensitiveEntity : list) {
                // A single matcher walks all occurrences. Re-matching after
                // replaceFirst(group, ...) interprets the hit as a regex and can loop
                // forever when the hit contains metacharacters or is empty.
                Matcher matcher = matcherTxt(sensitiveEntity.getRules(), text);
                boolean hit = false;
                while (matcher.find()) {
                    hit = true;
                    logger.info("規則:{} 涉敏信息:{}", sensitiveEntity.getRules(), matcher.group(0));
                }
                if (hit && !matchedSensitive.contains(sensitiveEntity)) {
                    matchedSensitive.add(sensitiveEntity);
                }
            }
        }
    } catch (Exception e) {
        // Best effort: report the failure (with stack trace) and return what was found so far.
        logger.error("掃描【{}】文件是否爲涉敏文件異常:{}",filePath,e.getMessage(),e);
    }
    return matchedSensitive;
}
/**
 * Scans an OOXML Word document (.docx) paragraph by paragraph and reports which rules match.
 *
 * <p>Only the paragraphs returned by {@code XWPFDocument.getParagraphs()} are examined. The
 * paragraph range comes from {@code calculationDiscoveryStrategy} with the paragraph count as
 * size; the upper bound is exclusive.
 *
 * @param filePath   path of the .docx file
 * @param list       rules whose {@code getRules()} regex is searched for in every paragraph
 * @param ployEntity scan policy that selects the paragraph range
 * @return the matched rules in first-match order without duplicates; when reading fails the
 *         error is logged and the rules matched so far are returned
 */
private static List<SensitiveEntity> wordScanDocxFile(String filePath, List<SensitiveEntity> list, PloyEntity ployEntity) {
    // Rules that matched at least once.
    List<SensitiveEntity> matchedSensitive = new ArrayList<>();
    Map<String, Integer> mapDiscoveryStrategy = new HashMap<>();
    // try-with-resources releases the stream and the document even when scanning fails.
    try (FileInputStream input = new FileInputStream(new File(filePath));
         XWPFDocument doc = new XWPFDocument(input)) {
        List<XWPFParagraph> paragraphs = doc.getParagraphs();
        // Work out which paragraphs the policy asks for.
        calculationDiscoveryStrategy(mapDiscoveryStrategy, paragraphs.size(), ployEntity);
        int start = mapDiscoveryStrategy.get("start");
        int end = mapDiscoveryStrategy.get("end");
        logger.info("掃描文件從{}行開始掃描到{}行結束",start,end);
        for (int rowIndex = start; rowIndex < end; rowIndex++) {
            // Extract the paragraph text once instead of once per rule.
            String paragraphText = paragraphs.get(rowIndex).getParagraphText();
            logger.info("runs文本內容:{}",paragraphText);
            for (SensitiveEntity sensitiveEntity : list) {
                // A single matcher walks all occurrences. Re-matching after
                // replaceFirst(group, ...) interprets the hit as a regex and can loop
                // forever when the hit contains metacharacters or is empty.
                Matcher matcher = matcherTxt(sensitiveEntity.getRules(), paragraphText);
                boolean hit = false;
                while (matcher.find()) {
                    hit = true;
                    logger.info("規則:{} 涉敏信息:{}",sensitiveEntity.getRules(),matcher.group(0));
                }
                if (hit && !matchedSensitive.contains(sensitiveEntity)) {
                    matchedSensitive.add(sensitiveEntity);
                }
            }
        }
    } catch (Exception e) {
        // Best effort: report the failure (with stack trace) and return what was found so far.
        logger.error("掃描【{}】文件是否爲涉敏文件異常:{}",filePath,e.getMessage(),e);
    }
    return matchedSensitive;
}
TXT掃描文件是否爲涉敏文件
/**
 * Scans a plain-text file line by line and reports which rules match.
 *
 * <p>All lines are read first because the scan policy needs the total line count to compute
 * its range; the upper bound of that range is exclusive. Each rule is evaluated against the
 * unmodified line, so one rule matching cannot hide a match of another rule.
 *
 * @param filePath   path of the text file
 * @param list       rules whose {@code getRules()} regex is searched for in every line
 * @param ployEntity scan policy that selects the line range
 * @return the matched rules in first-match order without duplicates; when reading fails the
 *         error is logged and the rules matched so far are returned
 */
private static List<SensitiveEntity> txtScanFile(String filePath, List<SensitiveEntity> list, PloyEntity ployEntity) {
    // Rules that matched at least once.
    List<SensitiveEntity> matchedSensitive = new ArrayList<>();
    List<String> listStr = new ArrayList<>();
    Map<String, Integer> mapDiscoveryStrategy = new HashMap<>();
    try {
        // try-with-resources closes the reader even when reading fails.
        // NOTE(review): FileReader decodes with the platform default charset - confirm that this
        // matches the encoding of the scanned files.
        try (BufferedReader bufferedReader = new BufferedReader(new FileReader(filePath))) {
            String lineStr;
            while ((lineStr = bufferedReader.readLine()) != null) {
                listStr.add(lineStr);
            }
        }
        // Work out which lines the policy asks for.
        calculationDiscoveryStrategy(mapDiscoveryStrategy, listStr.size(), ployEntity);
        int start = mapDiscoveryStrategy.get("start");
        int end = mapDiscoveryStrategy.get("end");
        logger.info("掃描文件從{}行開始掃描到{}行結束",start,end);
        for (int rowIndex = start; rowIndex < end; rowIndex++) {
            String currentLine = listStr.get(rowIndex);
            logger.info("文本內容:{}",currentLine);
            for (SensitiveEntity sensitiveEntity : list) {
                // A single matcher walks all occurrences. Re-matching after
                // replaceFirst(group, ...) interprets the hit as a regex and can loop
                // forever when the hit contains metacharacters or is empty.
                Matcher matcher = matcherTxt(sensitiveEntity.getRules(), currentLine);
                boolean hit = false;
                while (matcher.find()) {
                    hit = true;
                    logger.info("規則:{} 涉敏信息:{}",sensitiveEntity.getRules(),matcher.group(0));
                }
                if (hit && !matchedSensitive.contains(sensitiveEntity)) {
                    matchedSensitive.add(sensitiveEntity);
                }
            }
        }
    } catch (Exception e) {
        // Best effort: report the failure (with stack trace) and return what was found so far.
        logger.error("掃描【{}】文件是否爲涉敏文件異常:{}",filePath,e.getMessage(),e);
    }
    return matchedSensitive;
}
PPT掃描文件是否爲涉敏文件
/**
 * Scans a PowerPoint file (.ppt or .pptx) and reports which rules match.
 *
 * <p>The text produced by the POI extractor is split on {@code "\n"} and the resulting lines
 * are scanned; the range comes from {@code calculationDiscoveryStrategy} with the line count
 * as size, upper bound exclusive. For any other {@code fileType} no text is extracted, so the
 * result is empty. Each rule is evaluated against the unmodified line.
 *
 * @param fileType   file extension without the dot ({@code "ppt"} or {@code "pptx"}, any case)
 * @param filePath   path of the presentation
 * @param list       rules whose {@code getRules()} regex is searched for in every line
 * @param ployEntity scan policy that selects the line range
 * @return the matched rules in first-match order without duplicates; when reading fails the
 *         error is logged and the rules matched so far are returned
 */
private static List<SensitiveEntity> pptScanFile(String fileType, String filePath, List<SensitiveEntity> list, PloyEntity ployEntity) {
    // Rules that matched at least once.
    List<SensitiveEntity> matchedSensitive = new ArrayList<>();
    Map<String, Integer> mapDiscoveryStrategy = new HashMap<>();
    try {
        String[] texts = new String[]{};
        if ("ppt".equalsIgnoreCase(fileType)) {
            logger.info("掃描文件類型爲PPT");
            // try-with-resources closes the stream and the extractor even when extraction fails.
            try (FileInputStream input = new FileInputStream(new File(filePath));
                 PowerPointExtractor extractor = new PowerPointExtractor(input)) {
                texts = extractor.getText().split("\n");
            }
        } else if ("pptx".equalsIgnoreCase(fileType)) {
            logger.info("掃描文件類型爲PPTX");
            try (XSLFPowerPointExtractor xslfExtractor = new XSLFPowerPointExtractor(POIXMLDocument.openPackage(filePath))) {
                texts = xslfExtractor.getText().split("\n");
            }
        }
        // Work out which lines the policy asks for.
        calculationDiscoveryStrategy(mapDiscoveryStrategy, texts.length, ployEntity);
        int start = mapDiscoveryStrategy.get("start");
        int end = mapDiscoveryStrategy.get("end");
        logger.info("掃描文件從{}行開始掃描到{}行結束",start,end);
        for (int rowIndex = start; rowIndex < end; rowIndex++) {
            String lineStr = texts[rowIndex];
            logger.info("文本內容:{}",lineStr);
            for (SensitiveEntity sensitiveEntity : list) {
                // A single matcher walks all occurrences. Re-matching after
                // replaceFirst(group, ...) interprets the hit as a regex and can loop
                // forever when the hit contains metacharacters or is empty.
                Matcher matcher = matcherTxt(sensitiveEntity.getRules(), lineStr);
                boolean hit = false;
                while (matcher.find()) {
                    hit = true;
                    logger.info("規則:{} 涉敏信息:{}",sensitiveEntity.getRules(),matcher.group(0));
                }
                if (hit && !matchedSensitive.contains(sensitiveEntity)) {
                    matchedSensitive.add(sensitiveEntity);
                }
            }
        }
    } catch (Exception e) {
        // Best effort: report the failure through the logger (with stack trace) instead of
        // printStackTrace, and return what was found so far.
        logger.error("掃描【{}】文件是否爲涉敏文件異常:{}",filePath,e.getMessage(),e);
    }
    return matchedSensitive;
}
PDF掃描文件是否爲涉敏文件
/**
 * Scans a PDF page by page and reports which rules match.
 *
 * <p>The scan policy works in lines, so each page is counted as 30 lines (a placeholder
 * estimate) and the resulting line range is converted back to pages. The last page is rounded
 * up so that a range ending inside a page still includes that page; the first page keeps the
 * rounded-down value, with 0 mapped to page 1. Each rule is evaluated against the unmodified
 * page text.
 *
 * @param filePath   path of the PDF file
 * @param list       rules whose {@code getRules()} regex is searched for in every page
 * @param ployEntity scan policy that selects the range
 * @return the matched rules in first-match order without duplicates; when reading fails the
 *         error is logged and the rules matched so far are returned
 */
private static List<SensitiveEntity> pdfScanFile(String filePath, List<SensitiveEntity> list, PloyEntity ployEntity) {
    logger.info("==============pdfScanFile==========");
    // Assumed number of text lines per page, used only to translate the line-based policy.
    final int linesPerPage = 30;
    // Rules that matched at least once.
    List<SensitiveEntity> matchedSensitive = new ArrayList<>();
    Map<String, Integer> mapDiscoveryStrategy = new HashMap<>();
    com.itextpdf.text.pdf.PdfReader pdfReader = null;
    try {
        pdfReader = new com.itextpdf.text.pdf.PdfReader(filePath);
        com.itextpdf.text.pdf.parser.PdfReaderContentParser pdfReaderContentParser = new com.itextpdf.text.pdf.parser.PdfReaderContentParser(pdfReader);
        int pageCount = pdfReader.getNumberOfPages();
        calculationDiscoveryStrategy(mapDiscoveryStrategy, pageCount * linesPerPage, ployEntity);
        int start = mapDiscoveryStrategy.get("start") / linesPerPage;
        // Pages are 1-based in iText.
        int firstPage = (start == 0 ? 1 : start);
        // Round up: an end line inside a page must still include that page. Flooring here
        // made any range ending before line 30 scan no page at all.
        int end = Math.min((mapDiscoveryStrategy.get("end") + linesPerPage - 1) / linesPerPage, pageCount);
        logger.info("掃描文件從{}頁開始掃描到{}頁結束",firstPage,end);
        for (int i = firstPage; i <= end; i++) {
            com.itextpdf.text.pdf.parser.TextExtractionStrategy simpleTextExtractionStrategy = pdfReaderContentParser.processContent(i, new com.itextpdf.text.pdf.parser.SimpleTextExtractionStrategy());
            String resultantText = simpleTextExtractionStrategy.getResultantText();
            logger.info("PDF每頁文本內容:{}",resultantText);
            for (SensitiveEntity sensitiveEntity : list) {
                // A single matcher walks all occurrences. Re-matching after
                // replaceFirst(group, ...) interprets the hit as a regex and can loop
                // forever when the hit contains metacharacters or is empty.
                Matcher matcher = matcherTxt(sensitiveEntity.getRules(), resultantText);
                boolean hit = false;
                while (matcher.find()) {
                    hit = true;
                    logger.info("規則:{} 涉敏信息:{}",sensitiveEntity.getRules(),matcher.group(0));
                }
                if (hit && !matchedSensitive.contains(sensitiveEntity)) {
                    matchedSensitive.add(sensitiveEntity);
                }
            }
        }
    } catch (Exception e) {
        // Same best-effort contract as the other scanners: log (with stack trace) and return
        // what was found so far.
        logger.error("掃描【{}】文件是否爲涉敏文件異常:{}",filePath,e.getMessage(),e);
    } finally {
        // Release the reader on every path, including failures in the middle of the scan.
        if (pdfReader != null) {
            pdfReader.close();
        }
    }
    return matchedSensitive;
}
Main測試及補充方法
package com.zxl.demo.utiles;
import com.zxl.demo.entity.PloyEntity;
import com.zxl.demo.entity.SensitiveEntity;
import org.apache.poi.hslf.extractor.PowerPointExtractor;
import org.apache.poi.hssf.usermodel.HSSFRow;
import org.apache.poi.hssf.usermodel.HSSFSheet;
import org.apache.poi.hssf.usermodel.HSSFWorkbook;
import org.apache.poi.hwpf.HWPFDocument;
import org.apache.poi.hwpf.usermodel.Range;
import org.apache.poi.ooxml.POIXMLDocument;
import org.apache.poi.ss.usermodel.Cell;
import org.apache.poi.ss.usermodel.CellType;
import org.apache.poi.ss.util.NumberToTextConverter;
import org.apache.poi.xslf.extractor.XSLFPowerPointExtractor;
import org.apache.poi.xssf.usermodel.XSSFCell;
import org.apache.poi.xssf.usermodel.XSSFRow;
import org.apache.poi.xssf.usermodel.XSSFSheet;
import org.apache.poi.xssf.usermodel.XSSFWorkbook;
import org.apache.poi.xwpf.usermodel.XWPFDocument;
import org.apache.poi.xwpf.usermodel.XWPFParagraph;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.*;
import java.util.*;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
* @Describe: 掃描脫敏工具類
* @Author: zml
* @Date: 2020-4-27 11:25:30
*/
public class OfficeScanDesensitizationUtils {
// Class-wide SLF4J logger used by the scan methods of this utility class.
private static Logger logger = LoggerFactory.getLogger(OfficeScanDesensitizationUtils.class);
/**
 * Manual demo: scans one hard-coded .docx file with two sample rules (a literal phone number
 * and an e-mail regex) under the "first 100 lines" policy and logs the outcome.
 *
 * @param args unused
 */
public static void main(String[] args) {
    // Rule 1: literal phone number.
    SensitiveEntity phoneRule = new SensitiveEntity();
    phoneRule.setRulename("策略1");
    phoneRule.setRules("17711131114");
    phoneRule.setNode("掃描包含17711131114手機號的文件");

    // Rule 2: e-mail address pattern.
    SensitiveEntity emailRule = new SensitiveEntity();
    emailRule.setRulename("策略2");
    emailRule.setRules("\\w+([-+.]\\w+)*@\\w+([-.]\\w+)*\\.\\w+([-.]\\w+)*");
    emailRule.setNode("掃描Email地址");

    List<SensitiveEntity> rules = Arrays.asList(phoneRule, emailRule);

    // Scan policy: only the first 100 lines.
    PloyEntity policy = new PloyEntity();
    policy.setTop100(true);

    String type = "docx";
    String path = "D:\\liang\\office掃描\\word-docx掃描.docx";
    List<SensitiveEntity> hits = scanFile(type, path, rules, policy);

    if (hits.isEmpty()) {
        logger.info("不是涉敏文件");
        return;
    }
    logger.info("該文件爲涉敏文件");
    for (SensitiveEntity hit : hits) {
        logger.info("涉敏策略爲:{} 描述:{}",hit.getRulename(),hit.getNode());
    }
}
/**
 * Scans a file for sensitive content, choosing the scanner from the file type.
 *
 * <p>The type is trimmed, stripped of a leading dot and lower-cased before routing, and the
 * normalised value is what the format-specific scanners receive, so {@code "DOCX"} and
 * {@code ".docx"} behave like {@code "docx"}. Supported families: doc, xls, txt, ppt, pdf.
 *
 * @param fileType   file extension, e.g. {@code "docx"}
 * @param filePath   path of the file to scan
 * @param list       rules to test against the file text
 * @param ployEntity scan policy that selects which part of the file is examined
 * @return the rules that matched; an empty list for an unsupported or {@code null} type or when
 *         scanning throws (the error is logged)
 */
public static List<SensitiveEntity> scanFile(String fileType, String filePath, List<SensitiveEntity> list, PloyEntity ployEntity){
    List<SensitiveEntity> sensitiveEntities = new ArrayList<>();
    if (fileType == null) {
        logger.error("掃描文件是否爲涉敏文件異常:{}","fileType is null");
        return sensitiveEntities;
    }
    // Normalise once; Locale.ROOT keeps the lower-casing independent of the default locale.
    String type = fileType.trim().toLowerCase(Locale.ROOT);
    if (type.startsWith(".")) {
        type = type.substring(1);
    }
    try {
        if (type.contains("doc")) {
            sensitiveEntities = wordScanFile(type, filePath, list, ployEntity);
        } else if (type.contains("xls")) {
            sensitiveEntities = excelScanFile(type, filePath, list, ployEntity);
        } else if (type.contains("txt")) {
            sensitiveEntities = txtScanFile(filePath, list, ployEntity);
        } else if (type.contains("ppt")) {
            sensitiveEntities = pptScanFile(type, filePath, list, ployEntity);
        } else if (type.contains("pdf")) {
            logger.info("掃描PDF類型文件");
            sensitiveEntities = pdfScanFile(filePath, list, ployEntity);
        }
    } catch (Exception e) {
        // Boundary catch: report through the logger (with stack trace) instead of printStackTrace.
        logger.error("掃描文件是否爲涉敏文件異常:{}",e.getMessage(),e);
    }
    return sensitiveEntities;
}
/**
 * Computes the scan range for a policy and stores it under the keys {@code "start"} and
 * {@code "end"} of the given map.
 *
 * <p>Policy flags are checked in this order: all, customize, top100, last100. When none is
 * set the whole content is selected, so callers always find both keys and never unbox a
 * missing value. Neither bound is ever negative and {@code end} never exceeds {@code size}.
 *
 * @param mapDiscoveryStrategy output map that receives {@code "start"} and {@code "end"}
 * @param size                 upper limit of the range as supplied by the caller (line,
 *                             paragraph or row figure)
 * @param ployEntity           scan policy
 */
private static void calculationDiscoveryStrategy(Map<String, Integer> mapDiscoveryStrategy, int size, PloyEntity ployEntity) {
    int start;
    int end;
    if (ployEntity.isAll()) {
        // Full scan.
        start = 0;
        end = size;
    } else if (ployEntity.isCustomize()) {
        // Caller-defined range, capped at the available size.
        start = ployEntity.getStart();
        end = ployEntity.getEnd() > size ? size : ployEntity.getEnd();
    } else if (ployEntity.isTop100()) {
        // First 100 entries.
        start = 0;
        end = 100 > size ? size : 100;
    } else if (ployEntity.isLast100()) {
        // Last 100 entries.
        start = 100 > size ? 0 : size - 100;
        end = size;
    } else {
        // No flag set: scan everything rather than leave the map without a range.
        start = 0;
        end = size;
    }
    // A negative bound would be used as an index by the callers.
    mapDiscoveryStrategy.put("start", Math.max(start, 0));
    mapDiscoveryStrategy.put("end", Math.max(end, 0));
}
/**
 * Converts a cell value to text.
 *
 * <p>String, boolean and numeric cells are converted directly. A formula cell is converted
 * according to its cached result type, so the value the formula last evaluated to is
 * returned rather than an empty string. Every other cell type yields an empty string.
 *
 * @param cell the cell to read; may be {@code null}
 * @return the textual value, or {@code ""} for {@code null}, blank, error or unsupported cells
 */
public static String getCellValString(Cell cell){
    if (cell == null) {
        return "";
    }
    CellType cellType = cell.getCellType();
    if (CellType.FORMULA.equals(cellType)) {
        // Use the type of the value the formula last evaluated to.
        cellType = cell.getCachedFormulaResultType();
    }
    if (CellType.STRING.equals(cellType)) {
        return cell.getStringCellValue();
    }
    if (CellType.BOOLEAN.equals(cellType)) {
        return String.valueOf(cell.getBooleanCellValue());
    }
    if (CellType.NUMERIC.equals(cellType)) {
        return NumberToTextConverter.toText(cell.getNumericCellValue());
    }
    return "";
}
/**
 * Builds a case-insensitive matcher for the given regular expression over the given text.
 *
 * @param regex regular expression to compile
 * @param str   text to search
 * @return a fresh matcher positioned at the start of {@code str}
 */
private static Matcher matcherTxt(String regex,String str) {
    return Pattern.compile(regex, Pattern.CASE_INSENSITIVE).matcher(str);
}