package com.chinaztt.mes.docx.util;
|
|
import cn.hutool.core.io.FileUtil;
|
import com.chinaztt.mes.docx.dto.GetFileDto;
|
import com.chinaztt.mes.docx.dto.ThicknessData;
|
import net.sourceforge.tess4j.Tesseract;
|
import net.sourceforge.tess4j.TesseractException;
|
import org.apache.commons.lang3.ObjectUtils;
|
import org.apache.commons.lang3.StringUtils;
|
import org.apache.poi.POIXMLDocument;
|
import org.apache.poi.POIXMLTextExtractor;
|
import org.apache.poi.hssf.usermodel.HSSFPicture;
|
import org.apache.poi.hssf.usermodel.HSSFPictureData;
|
import org.apache.poi.hssf.usermodel.HSSFSheet;
|
import org.apache.poi.hssf.usermodel.HSSFWorkbook;
|
import org.apache.poi.hwpf.extractor.WordExtractor;
|
import org.apache.poi.openxml4j.exceptions.InvalidFormatException;
|
import org.apache.poi.openxml4j.opc.OPCPackage;
|
import org.apache.poi.ss.usermodel.*;
|
import org.apache.poi.xssf.usermodel.XSSFSheet;
|
import org.apache.poi.xssf.usermodel.XSSFWorkbook;
|
import org.apache.poi.xwpf.extractor.XWPFWordExtractor;
|
import java.awt.Color;
|
import java.awt.image.BufferedImage;
|
import javax.imageio.ImageIO;
|
|
import java.io.*;
|
import java.nio.file.Files;
|
import java.sql.*;
|
import java.util.*;
|
import java.util.regex.Matcher;
|
import java.util.regex.Pattern;
|
|
public class TakeWords {
|
|
private static final String splitIdentifier = "@-@"; // Custom unique delimiter placed between cell/column values in the text built by the Excel and CSV readers
|
|
public static Object readWordFile(File file) {
|
String result = "";
|
//首先判断文件中的是doc/docx
|
try {
|
if (file.getName().endsWith(".doc")) {
|
InputStream is = new FileInputStream(file);
|
WordExtractor re = new WordExtractor(is);
|
result = re.getText();
|
re.close();
|
} else if (file.getName().endsWith(".docx")) {
|
OPCPackage opcPackage = POIXMLDocument.openPackage(file.getPath());
|
POIXMLTextExtractor extractor = new XWPFWordExtractor(opcPackage);
|
result = extractor.getText();
|
extractor.close();
|
}
|
} catch (Exception e) {
|
e.printStackTrace();
|
}
|
return result;
|
}
|
|
public static Object readExcelFile(File file) throws FileNotFoundException, IOException {
|
StringBuilder result = new StringBuilder();
|
//创建工作簿对象
|
XSSFWorkbook xssfWorkbook = new XSSFWorkbook(new FileInputStream(file));
|
//获取工作簿下sheet的个数 只读取第一个sheet
|
// int sheetNum = xssfWorkbook.getNumberOfSheets();
|
//遍历工作簿中的所有数据
|
for (int i = 0; i < 1; i++) {
|
XSSFSheet sheet = xssfWorkbook.getSheetAt(i);
|
//获取最后一行的num,即总行数。此处从0开始
|
int maxRow = sheet.getLastRowNum();
|
for (int row = 0; row <= maxRow; row++) {
|
//获取最后单元格num,即总单元格数 ***注意:此处从1开始计数***
|
int maxRol = sheet.getRow(row).getLastCellNum();
|
StringBuilder aLine = new StringBuilder();
|
for (int rol = 0; rol < maxRol; rol++) {
|
aLine.append(sheet.getRow(row).getCell(rol)).append(splitIdentifier);
|
}
|
result.append(aLine).append("\n");
|
}
|
}
|
return result.toString();
|
}
|
|
public static Object readExcelxlsFile(File file) throws IOException {
|
StringBuilder result = new StringBuilder();
|
try (FileInputStream fis = new FileInputStream(file);
|
Workbook workbook = new HSSFWorkbook(fis)) {
|
// 获取第一个工作表
|
Sheet sheet = workbook.getSheetAt(0);
|
// 读取图片信息
|
if (workbook instanceof HSSFWorkbook) {
|
HSSFWorkbook hssfWorkbook = (HSSFWorkbook) workbook;
|
List<HSSFPictureData> pictures = hssfWorkbook.getAllPictures();
|
for (HSSFPictureData picture : pictures) {
|
// 获取图片类型
|
String pictureType = picture.suggestFileExtension();
|
// 获取图片数据
|
byte[] pictureData = picture.getData();
|
// 创建临时文件
|
File tempFile = File.createTempFile(UUID.randomUUID().toString(), "." + pictureType);
|
try (FileOutputStream fos = new FileOutputStream(tempFile)) {
|
fos.write(pictureData);
|
}
|
// 图片预处理
|
// File processedFile = preprocessImage(tempFile, pictureType);
|
// 调用 readPngFile1 方法读取图片文字信息
|
String ocrResult = "";
|
try {
|
ocrResult = (String) readPngFile1(tempFile);
|
// ocrResult = (String) readPngFile(tempFile);
|
// ocrResult = (String) readPngFile(processedFile);
|
} catch (TesseractException e) {
|
ocrResult = "OCR识别失败: " + e.getMessage();
|
} finally {
|
// 删除临时文件
|
tempFile.delete();
|
// processedFile.delete();
|
}
|
|
// 将图片信息添加到结果中
|
// result.append("Picture Type: ").append(pictureType)
|
// .append(", Picture Size: ").append(pictureData.length)
|
// .append(" bytes")
|
// .append(", OCR Result: ").append(ocrResult)
|
// .append(",");
|
String ocrText = fixOcrText(ocrResult);
|
result.append("OCR Result:").append(ocrText).append(",");
|
}
|
}
|
//
|
// // 遍历每一行
|
// for (Row row : sheet) {
|
// // 遍历每一列
|
// for (Cell cell : row) {
|
// CellType cellType = CellType.forInt(cell.getCellType());
|
// switch (cellType) {
|
// case STRING:
|
// result.append(cell.getStringCellValue());
|
// break;
|
// case NUMERIC:
|
// if (DateUtil.isCellDateFormatted(cell)) {
|
// result.append(cell.getDateCellValue());
|
// } else {
|
// result.append(cell.getNumericCellValue());
|
// }
|
// break;
|
// case BOOLEAN:
|
// result.append(cell.getBooleanCellValue());
|
// break;
|
// case FORMULA:
|
// result.append(cell.getCellFormula());
|
// break;
|
// default:
|
// result.append("");
|
// }
|
// result.append("\t");
|
// }
|
// result.append("\n");
|
// }
|
}
|
return result;
|
|
}
|
|
// 修正 OCR 识别文本中的错误关键词
|
public static String fixOcrText(String ocrText) {
|
// 定义错误关键词和正确内容的映射,这里处理“击 宇 强 庞”修正为“击穿强度”
|
// 考虑到可能有空格分隔,用正则匹配包含这些字的内容
|
ocrText = ocrText.replaceAll("击\\s*宇\\s*强\\s*庞", "击穿强度");
|
// 还可以继续添加其他错误修正,比如下面假设“电 压 \\(HV\\)”里的空格影响,也修正下
|
ocrText = ocrText.replaceAll("电\\s*压\\s*\\(HV\\)", "电压(KV)");
|
ocrText = ocrText.replaceAll("电\\s*流\\s*\\(nt\\)", "电流(mA)");
|
return ocrText;
|
}
|
|
public static Object readPngFile1(File file) throws IOException, TesseractException {
|
// 获取 tessdata 目录的绝对路径
|
String arch = System.getProperty("sun.arch.data.model");
|
File tessDataDir;
|
if (arch.contains("32")) {
|
tessDataDir = FileUtil.file(".", "/jre_32/tessdata");
|
} else {
|
tessDataDir = FileUtil.file(".", "/jre_64/tessdata");
|
}
|
String path = tessDataDir.getCanonicalPath();
|
// 检查 chi_sim.traineddata 文件是否存在
|
File chiSimFile = new File(path, "chi_sim.traineddata");
|
if (!chiSimFile.exists()) {
|
throw new FileNotFoundException("chi_sim.traineddata 文件未找到,请检查路径: " + chiSimFile.getAbsolutePath());
|
}
|
// 设置配置文件夹、识别语言、识别模式
|
Tesseract tesseract = new Tesseract();
|
tesseract.setDatapath(path);
|
// 设置识别语言为中文简体和英文(如果要设置为英文可改为 "eng")
|
tesseract.setLanguage("chi_sim+eng");
|
// 使用 OSD 进行自动页面分割以进行图像处理
|
tesseract.setPageSegMode(1);
|
// 设置引擎模式是神经网络 LSTM 引擎
|
tesseract.setOcrEngineMode(1);
|
// 开始识别整张图片中的文字
|
return tesseract.doOCR(file);
|
}
|
|
|
|
|
public static Object readTxtFile(File file) throws IOException {
|
FileInputStream fin = new FileInputStream(file);
|
InputStreamReader reader = new InputStreamReader(fin);
|
BufferedReader buffReader = new BufferedReader(reader);
|
StringBuilder stringBuilder = new StringBuilder();
|
String strTmp = "";
|
while ((strTmp = buffReader.readLine()) != null) {
|
stringBuilder.append(strTmp).append("\n");
|
}
|
buffReader.close();
|
return stringBuilder.toString();
|
}
|
|
public static Object readCsvFile(File file) {
|
|
StringBuilder stringBuilder = new StringBuilder();
|
// 创建 reader
|
try (BufferedReader br = Files.newBufferedReader(file.toPath())) {
|
// CSV文件的分隔符
|
String DELIMITER = ",";
|
// 按行读取
|
String line;
|
while ((line = br.readLine()) != null) {
|
// 分割
|
String[] columns = line.split(DELIMITER);
|
// 打印行
|
stringBuilder.append(String.join(splitIdentifier, columns)).append("\n");
|
}
|
} catch (IOException ex) {
|
ex.printStackTrace();
|
}
|
return stringBuilder.toString();
|
}
|
|
public static Object readMdbFile(File file, GetFileDto getFileDto) throws SQLException, ClassNotFoundException, InstantiationException, IllegalAccessException {
|
Map<String, Object> tableMap = new HashMap<>(16);
|
Properties prop = new Properties();
|
//设置编码
|
prop.put("charSet", "UTF-8");
|
prop.put("user", "");
|
prop.put("password", "");
|
//数据地址
|
String dbUrl = "jdbc:ucanaccess://" + file.getPath();
|
//引入驱动
|
Class.forName("net.ucanaccess.jdbc.UcanaccessDriver").newInstance();
|
Connection conn = null;
|
PreparedStatement preparedStatement = null;
|
ResultSet rs = null;
|
|
//连接数据库资源
|
conn = DriverManager.getConnection(dbUrl, prop);
|
try {
|
List<Object> list = new ArrayList<>();
|
//遍历获取多张表数据
|
// String s = "select * from " + getFileDto.getDbFileName() + " where 1=1" + getFileDto.getMdbEntrustCode() + " = '" + getFileDto.getEntrustCode() +
|
// "' and " + getFileDto.getMdbSampleCode() + " = '" + getFileDto.getSampleCode() + "'";
|
String s = "select * from " + getFileDto.getDbFileName() + " where 1=1";
|
if(StringUtils.isNotBlank(getFileDto.getMdbEntrustCode())){
|
s+=" and " + getFileDto.getMdbEntrustCode() + " = '" + getFileDto.getEntrustCode()+ "'";
|
}
|
if(StringUtils.isNotBlank(getFileDto.getMdbSampleCode())){
|
s+=" and " + getFileDto.getMdbSampleCode() + " = '" + getFileDto.getSampleCode() + "'";
|
}
|
preparedStatement = conn.prepareStatement(s);
|
rs = preparedStatement.executeQuery();
|
ResultSetMetaData data = rs.getMetaData();
|
while (rs.next()) {
|
Map<String, String> map = new HashMap<>();
|
for (int i = 1; i <= data.getColumnCount(); i++) {
|
//列名
|
String columnName = data.getColumnName(i);
|
map.put(columnName, rs.getString(i));
|
}
|
list.add(map);
|
}
|
tableMap.put("data", list);
|
} catch (Exception e) {
|
} finally {
|
closeA1l(conn, preparedStatement, rs);
|
}
|
return tableMap;
|
}
|
|
private static void closeA1l(Connection conn, PreparedStatement preparedStatement, ResultSet rs) {
|
try {
|
if (null != rs) {
|
rs.close();
|
}
|
if (null != preparedStatement) {
|
preparedStatement.close();
|
}
|
if (null != conn) {
|
conn.close();
|
}
|
} catch (Exception ignore) {
|
}
|
}
|
public static Object getmysqlFile(GetFileDto getFileDto) throws SQLException, ClassNotFoundException, InstantiationException, IllegalAccessException {
|
Map<String, Object> tableMap = new HashMap<>(16);
|
// 数据库连接信息
|
String url = "jdbc:mysql://localhost:3306/"+getFileDto.getDbFileName()+"?useSSL=false&serverTimezone=UTC&allowPublicKeyRetrieval=true";
|
String user = getFileDto.getDbUserName();
|
String password = getFileDto.getDbPassword();
|
List<ThicknessData> dataList = new ArrayList<>();
|
|
try (
|
// 建立连接
|
Connection connection = DriverManager.getConnection(url, user, password);
|
// 创建 Statement 对象执行 SQL
|
Statement statement = connection.createStatement()
|
) {
|
String sql = "SELECT ThinnestPoint, AverageThickness FROM model1records";
|
ResultSet resultSet = statement.executeQuery(sql);
|
|
// 遍历结果集获取数据
|
while (resultSet.next()) {
|
double thinnestPoint = resultSet.getDouble("ThinnestPoint");
|
double averageThickness = resultSet.getDouble("AverageThickness");
|
dataList.add(new ThicknessData(thinnestPoint, averageThickness));
|
}
|
tableMap.put("data", dataList);
|
} catch (Exception e) {
|
e.printStackTrace();
|
return R.failed("数据库查询出错: " + e.getMessage());
|
}
|
return tableMap;
|
}
|
|
public static Object readDbFile(File file, GetFileDto getFileDto) throws SQLException, ClassNotFoundException, InstantiationException, IllegalAccessException {
|
Map<String, Object> tableMap = new HashMap<>(16);
|
Properties prop = new Properties();
|
//设置编码
|
prop.put("charSet", "GBK");
|
prop.put("user", "");
|
prop.put("password", "");
|
//数据地址
|
String dbUrl = "jdbc:sqlite:" + file.getPath();
|
//引入驱动
|
Class.forName("org.sqlite.JDBC").newInstance();
|
Connection conn = null;
|
PreparedStatement preparedStatement = null;
|
ResultSet rs = null;
|
//连接数据库资源
|
conn = DriverManager.getConnection(dbUrl, prop);
|
try {
|
List<Object> list = new ArrayList<>();
|
//转义中文
|
String sampleCode = changeCharset(getFileDto.getSampleCode(), "GBK");
|
String sql = "select * from " + getFileDto.getDbFileName() +
|
" where " + getFileDto.getMdbEntrustCode() + " = '" + getFileDto.getEntrustCode() +
|
"' and " + getFileDto.getMdbSampleCode() + " = '" + sampleCode + "'";
|
if (ObjectUtils.allNotNull(getFileDto.getFiberOpticRibbonCode(), getFileDto.getFiberOpticRibbon()) && !getFileDto.getFiberOpticRibbonCode().equals("null")) {
|
sql = sql + " and " + getFileDto.getFiberOpticRibbonCode() + " = '" + getFileDto.getFiberOpticRibbon() + "'";
|
}
|
//遍历获取多张表数据
|
preparedStatement = conn.prepareStatement(sql);
|
rs = preparedStatement.executeQuery();
|
ResultSetMetaData data = rs.getMetaData();
|
while (rs.next()) {
|
Map<String, String> map = new HashMap<>();
|
for (int i = 1; i <= data.getColumnCount(); i++) {
|
//列名
|
String columnName = data.getColumnName(i);
|
//将返回的结果转换从utf-8编码
|
map.put(columnName, changeCharset(rs.getString(i),"UTF-8"));
|
}
|
list.add(map);
|
}
|
tableMap.put("data", list);
|
} catch (Exception e) {
|
} finally {
|
closeA1l(conn, preparedStatement, rs);
|
}
|
return tableMap;
|
}
|
|
public static Object readPngFile(File file) throws IOException, TesseractException {
|
String canonicalPath32 = FileUtil.file(".", "/jre_32/tessdata").getCanonicalPath();
|
String canonicalPath64 = FileUtil.file(".", "/jre_64/tessdata").getCanonicalPath();
|
String path;
|
// 判断电脑是32位还是64位
|
String arch = System.getProperty("sun.arch.data.model");
|
if (arch.contains("32")) {
|
path = canonicalPath32.replaceAll("/chi_sim.traineddata", "").replaceAll("\\\\", "/");
|
} else {
|
path = canonicalPath64.replaceAll("/chi_sim.traineddata", "").replaceAll("\\\\", "/");
|
}
|
// 设置 TESSDATA_PREFIX 环境变量
|
// System.setProperty("TESSDATA_PREFIX", path);
|
//设置配置文件夹微视、识别语言、识别模式
|
Tesseract tesseract = new Tesseract();
|
tesseract.setDatapath(path);
|
//设置识别语言为中文简体,(如果要设置为英文可改为"eng")
|
tesseract.setLanguage("chi_sim");
|
//使用 OSD 进行自动页面分割以进行图像处理
|
tesseract.setPageSegMode(1);
|
//设置引擎模式是神经网络LSTM引擎
|
tesseract.setOcrEngineMode(1);
|
//开始识别整张图片中的文字
|
return tesseract.doOCR(file);
|
}
|
|
/**
|
* 字符串编码转换
|
*
|
* @param str 要转换的字符串
|
* @param unicode 字符编码
|
* @return
|
*/
|
public static String changeCharset(String str, String unicode) {
|
if(Objects.isNull(str)){
|
return "";
|
}
|
try {
|
return new String(str.getBytes(), unicode);
|
} catch (UnsupportedEncodingException e) {
|
throw new RuntimeException(e);
|
}
|
}
|
|
}
|