package com.chinaztt.mes.docx.util; import cn.hutool.core.io.FileUtil; import com.chinaztt.mes.docx.dto.GetFileDto; import com.chinaztt.mes.docx.dto.ThicknessData; import net.sourceforge.tess4j.Tesseract; import net.sourceforge.tess4j.TesseractException; import org.apache.commons.lang3.ObjectUtils; import org.apache.commons.lang3.StringUtils; import org.apache.poi.POIXMLDocument; import org.apache.poi.POIXMLTextExtractor; import org.apache.poi.hssf.usermodel.HSSFPicture; import org.apache.poi.hssf.usermodel.HSSFPictureData; import org.apache.poi.hssf.usermodel.HSSFSheet; import org.apache.poi.hssf.usermodel.HSSFWorkbook; import org.apache.poi.hwpf.extractor.WordExtractor; import org.apache.poi.openxml4j.exceptions.InvalidFormatException; import org.apache.poi.openxml4j.opc.OPCPackage; import org.apache.poi.ss.usermodel.*; import org.apache.poi.xssf.usermodel.XSSFSheet; import org.apache.poi.xssf.usermodel.XSSFWorkbook; import org.apache.poi.xwpf.extractor.XWPFWordExtractor; import java.awt.Color; import java.awt.image.BufferedImage; import javax.imageio.ImageIO; import java.io.*; import java.nio.file.Files; import java.sql.*; import java.util.*; import java.util.regex.Matcher; import java.util.regex.Pattern; public class TakeWords { private static final String splitIdentifier = "@-@"; // 自定义唯一标识符 public static Object readWordFile(File file) { String result = ""; //首先判断文件中的是doc/docx try { if (file.getName().endsWith(".doc")) { InputStream is = new FileInputStream(file); WordExtractor re = new WordExtractor(is); result = re.getText(); re.close(); } else if (file.getName().endsWith(".docx")) { OPCPackage opcPackage = POIXMLDocument.openPackage(file.getPath()); POIXMLTextExtractor extractor = new XWPFWordExtractor(opcPackage); result = extractor.getText(); extractor.close(); } } catch (Exception e) { e.printStackTrace(); } return result; } public static Object readExcelFile(File file) throws FileNotFoundException, IOException { StringBuilder result = new StringBuilder(); //创建工作簿对象 XSSFWorkbook xssfWorkbook = new XSSFWorkbook(new FileInputStream(file)); //获取工作簿下sheet的个数 只读取第一个sheet // int sheetNum = xssfWorkbook.getNumberOfSheets(); //遍历工作簿中的所有数据 for (int i = 0; i < 1; i++) { XSSFSheet sheet = xssfWorkbook.getSheetAt(i); //获取最后一行的num,即总行数。此处从0开始 int maxRow = sheet.getLastRowNum(); for (int row = 0; row <= maxRow; row++) { //获取最后单元格num,即总单元格数 ***注意:此处从1开始计数*** int maxRol = sheet.getRow(row).getLastCellNum(); StringBuilder aLine = new StringBuilder(); for (int rol = 0; rol < maxRol; rol++) { aLine.append(sheet.getRow(row).getCell(rol)).append(splitIdentifier); } result.append(aLine).append("\n"); } } return result.toString(); } public static Object readExcelxlsFile(File file) throws IOException { StringBuilder result = new StringBuilder(); try (FileInputStream fis = new FileInputStream(file); Workbook workbook = new HSSFWorkbook(fis)) { // 获取第一个工作表 Sheet sheet = workbook.getSheetAt(0); // 读取图片信息 if (workbook instanceof HSSFWorkbook) { HSSFWorkbook hssfWorkbook = (HSSFWorkbook) workbook; List pictures = hssfWorkbook.getAllPictures(); for (HSSFPictureData picture : pictures) { // 获取图片类型 String pictureType = picture.suggestFileExtension(); // 获取图片数据 byte[] pictureData = picture.getData(); // 创建临时文件 File tempFile = File.createTempFile(UUID.randomUUID().toString(), "." + pictureType); try (FileOutputStream fos = new FileOutputStream(tempFile)) { fos.write(pictureData); } // 图片预处理 // File processedFile = preprocessImage(tempFile, pictureType); // 调用 readPngFile1 方法读取图片文字信息 String ocrResult = ""; try { ocrResult = (String) readPngFile1(tempFile); // ocrResult = (String) readPngFile(tempFile); // ocrResult = (String) readPngFile(processedFile); } catch (TesseractException e) { ocrResult = "OCR识别失败: " + e.getMessage(); } finally { // 删除临时文件 tempFile.delete(); // processedFile.delete(); } // 将图片信息添加到结果中 // result.append("Picture Type: ").append(pictureType) // .append(", Picture Size: ").append(pictureData.length) // .append(" bytes") // .append(", OCR Result: ").append(ocrResult) // .append(","); String ocrText = fixOcrText(ocrResult); result.append("OCR Result:").append(ocrText).append(","); } } // // // 遍历每一行 // for (Row row : sheet) { // // 遍历每一列 // for (Cell cell : row) { // CellType cellType = CellType.forInt(cell.getCellType()); // switch (cellType) { // case STRING: // result.append(cell.getStringCellValue()); // break; // case NUMERIC: // if (DateUtil.isCellDateFormatted(cell)) { // result.append(cell.getDateCellValue()); // } else { // result.append(cell.getNumericCellValue()); // } // break; // case BOOLEAN: // result.append(cell.getBooleanCellValue()); // break; // case FORMULA: // result.append(cell.getCellFormula()); // break; // default: // result.append(""); // } // result.append("\t"); // } // result.append("\n"); // } } return result; } // 修正 OCR 识别文本中的错误关键词 public static String fixOcrText(String ocrText) { // 定义错误关键词和正确内容的映射,这里处理“击 宇 强 庞”修正为“击穿强度” // 考虑到可能有空格分隔,用正则匹配包含这些字的内容 ocrText = ocrText.replaceAll("击\\s*宇\\s*强\\s*庞", "击穿强度"); // 还可以继续添加其他错误修正,比如下面假设“电 压 \\(HV\\)”里的空格影响,也修正下 ocrText = ocrText.replaceAll("电\\s*压\\s*\\(HV\\)", "电压(KV)"); ocrText = ocrText.replaceAll("电\\s*流\\s*\\(nt\\)", "电流(mA)"); return ocrText; } public static Object readPngFile1(File file) throws IOException, TesseractException { // 获取 tessdata 目录的绝对路径 String arch = System.getProperty("sun.arch.data.model"); File tessDataDir; if (arch.contains("32")) { tessDataDir = FileUtil.file(".", "/jre_32/tessdata"); } else { tessDataDir = FileUtil.file(".", "/jre_64/tessdata"); } String path = tessDataDir.getCanonicalPath(); // 检查 chi_sim.traineddata 文件是否存在 File chiSimFile = new File(path, "chi_sim.traineddata"); if (!chiSimFile.exists()) { throw new FileNotFoundException("chi_sim.traineddata 文件未找到,请检查路径: " + chiSimFile.getAbsolutePath()); } // 设置配置文件夹、识别语言、识别模式 Tesseract tesseract = new Tesseract(); tesseract.setDatapath(path); // 设置识别语言为中文简体和英文(如果要设置为英文可改为 "eng") tesseract.setLanguage("chi_sim+eng"); // 使用 OSD 进行自动页面分割以进行图像处理 tesseract.setPageSegMode(1); // 设置引擎模式是神经网络 LSTM 引擎 tesseract.setOcrEngineMode(1); // 开始识别整张图片中的文字 return tesseract.doOCR(file); } public static Object readTxtFile(File file) throws IOException { FileInputStream fin = new FileInputStream(file); InputStreamReader reader = new InputStreamReader(fin); BufferedReader buffReader = new BufferedReader(reader); StringBuilder stringBuilder = new StringBuilder(); String strTmp = ""; while ((strTmp = buffReader.readLine()) != null) { stringBuilder.append(strTmp).append("\n"); } buffReader.close(); return stringBuilder.toString(); } public static Object readCsvFile(File file) { StringBuilder stringBuilder = new StringBuilder(); // 创建 reader try (BufferedReader br = Files.newBufferedReader(file.toPath())) { // CSV文件的分隔符 String DELIMITER = ","; // 按行读取 String line; while ((line = br.readLine()) != null) { // 分割 String[] columns = line.split(DELIMITER); // 打印行 stringBuilder.append(String.join(splitIdentifier, columns)).append("\n"); } } catch (IOException ex) { ex.printStackTrace(); } return stringBuilder.toString(); } public static Object readMdbFile(File file, GetFileDto getFileDto) throws SQLException, ClassNotFoundException, InstantiationException, IllegalAccessException { Map tableMap = new HashMap<>(16); Properties prop = new Properties(); //设置编码 prop.put("charSet", "UTF-8"); prop.put("user", ""); prop.put("password", ""); //数据地址 String dbUrl = "jdbc:ucanaccess://" + file.getPath(); //引入驱动 Class.forName("net.ucanaccess.jdbc.UcanaccessDriver").newInstance(); Connection conn = null; PreparedStatement preparedStatement = null; ResultSet rs = null; //连接数据库资源 conn = DriverManager.getConnection(dbUrl, prop); try { List list = new ArrayList<>(); //遍历获取多张表数据 // String s = "select * from " + getFileDto.getDbFileName() + " where 1=1" + getFileDto.getMdbEntrustCode() + " = '" + getFileDto.getEntrustCode() + // "' and " + getFileDto.getMdbSampleCode() + " = '" + getFileDto.getSampleCode() + "'"; String s = "select * from " + getFileDto.getDbFileName() + " where 1=1"; if(StringUtils.isNotBlank(getFileDto.getMdbEntrustCode())){ s+=" and " + getFileDto.getMdbEntrustCode() + " = '" + getFileDto.getEntrustCode()+ "'"; } if(StringUtils.isNotBlank(getFileDto.getMdbSampleCode())){ s+=" and " + getFileDto.getMdbSampleCode() + " = '" + getFileDto.getSampleCode() + "'"; } preparedStatement = conn.prepareStatement(s); rs = preparedStatement.executeQuery(); ResultSetMetaData data = rs.getMetaData(); while (rs.next()) { Map map = new HashMap<>(); for (int i = 1; i <= data.getColumnCount(); i++) { //列名 String columnName = data.getColumnName(i); map.put(columnName, rs.getString(i)); } list.add(map); } tableMap.put("data", list); } catch (Exception e) { } finally { closeA1l(conn, preparedStatement, rs); } return tableMap; } private static void closeA1l(Connection conn, PreparedStatement preparedStatement, ResultSet rs) { try { if (null != rs) { rs.close(); } if (null != preparedStatement) { preparedStatement.close(); } if (null != conn) { conn.close(); } } catch (Exception ignore) { } } public static Object getMysqlFile(GetFileDto getFileDto){ Map tableMap = new HashMap<>(16); // 从 GetFileDto 获取数据库名,对应【文件名称】字段 String dbName = getFileDto.getDbFileName(); String user = getFileDto.getDbUserName(); String password = getFileDto.getDbPassword(); // 从 GetFileDto 获取数据表名,对应【数据库表名】字段 String table = getFileDto.getDbTable(); // 检查数据库名和表名是否为空 if (dbName == null || dbName.isEmpty() || table == null || table.isEmpty()) { return R.failed("数据库名或表名不能为空"); } // 数据库连接信息 String url = "jdbc:mysql://localhost:3306/"+dbName+"?useSSL=false&serverTimezone=UTC&allowPublicKeyRetrieval=true"; Connection connection = null; PreparedStatement preparedStatement = null; ResultSet resultSet = null; List> dataList = new ArrayList<>(); try { // 建立连接 connection = DriverManager.getConnection(url, user, password); // 构建基础 SQL StringBuilder sql = new StringBuilder("SELECT * FROM ").append(table); // 创建 PreparedStatement 对象执行 SQL preparedStatement = connection.prepareStatement(sql.toString()); resultSet = preparedStatement.executeQuery(); ResultSetMetaData metaData = resultSet.getMetaData(); int columnCount = metaData.getColumnCount(); // 遍历结果集获取数据 while (resultSet.next()) { Map rowData = new HashMap<>(); for (int i = 1; i <= columnCount; i++) { String columnName = metaData.getColumnName(i); rowData.put(columnName, resultSet.getObject(i)); } dataList.add(rowData); } // while (resultSet.next()) { // double thinnestPoint = resultSet.getDouble("ThinnestPoint"); // double averageThickness = resultSet.getDouble("AverageThickness"); // dataList.add(new ThicknessData(thinnestPoint, averageThickness)); // } tableMap.put("data", dataList); } catch (Exception e) { e.printStackTrace(); // 假设 R 类有 failed 方法,若没有需补充实现 return R.failed("数据库查询出错: " + e.getMessage()); } finally { try { if (resultSet != null) { resultSet.close(); } if (preparedStatement != null) { preparedStatement.close(); } if (connection != null) { connection.close(); } } catch (SQLException e) { e.printStackTrace(); } } return tableMap; } public static Object readDbFile(File file, GetFileDto getFileDto) throws SQLException, ClassNotFoundException, InstantiationException, IllegalAccessException { Map tableMap = new HashMap<>(16); Properties prop = new Properties(); //设置编码 prop.put("charSet", "GBK"); prop.put("user", ""); prop.put("password", ""); //数据地址 String dbUrl = "jdbc:sqlite:" + file.getPath(); //引入驱动 Class.forName("org.sqlite.JDBC").newInstance(); Connection conn = null; PreparedStatement preparedStatement = null; ResultSet rs = null; //连接数据库资源 conn = DriverManager.getConnection(dbUrl, prop); try { List list = new ArrayList<>(); //转义中文 String sampleCode = changeCharset(getFileDto.getSampleCode(), "GBK"); String sql = "select * from " + getFileDto.getDbFileName() + " where " + getFileDto.getMdbEntrustCode() + " = '" + getFileDto.getEntrustCode() + "' and " + getFileDto.getMdbSampleCode() + " = '" + sampleCode + "'"; if (ObjectUtils.allNotNull(getFileDto.getFiberOpticRibbonCode(), getFileDto.getFiberOpticRibbon()) && !getFileDto.getFiberOpticRibbonCode().equals("null")) { sql = sql + " and " + getFileDto.getFiberOpticRibbonCode() + " = '" + getFileDto.getFiberOpticRibbon() + "'"; } //遍历获取多张表数据 preparedStatement = conn.prepareStatement(sql); rs = preparedStatement.executeQuery(); ResultSetMetaData data = rs.getMetaData(); while (rs.next()) { Map map = new HashMap<>(); for (int i = 1; i <= data.getColumnCount(); i++) { //列名 String columnName = data.getColumnName(i); //将返回的结果转换从utf-8编码 map.put(columnName, changeCharset(rs.getString(i),"UTF-8")); } list.add(map); } tableMap.put("data", list); } catch (Exception e) { } finally { closeA1l(conn, preparedStatement, rs); } return tableMap; } public static Object readPngFile(File file) throws IOException, TesseractException { String canonicalPath32 = FileUtil.file(".", "/jre_32/tessdata").getCanonicalPath(); String canonicalPath64 = FileUtil.file(".", "/jre_64/tessdata").getCanonicalPath(); String path; // 判断电脑是32位还是64位 String arch = System.getProperty("sun.arch.data.model"); if (arch.contains("32")) { path = canonicalPath32.replaceAll("/chi_sim.traineddata", "").replaceAll("\\\\", "/"); } else { path = canonicalPath64.replaceAll("/chi_sim.traineddata", "").replaceAll("\\\\", "/"); } // 设置 TESSDATA_PREFIX 环境变量 // System.setProperty("TESSDATA_PREFIX", path); //设置配置文件夹微视、识别语言、识别模式 Tesseract tesseract = new Tesseract(); tesseract.setDatapath(path); //设置识别语言为中文简体,(如果要设置为英文可改为"eng") tesseract.setLanguage("chi_sim"); //使用 OSD 进行自动页面分割以进行图像处理 tesseract.setPageSegMode(1); //设置引擎模式是神经网络LSTM引擎 tesseract.setOcrEngineMode(1); //开始识别整张图片中的文字 return tesseract.doOCR(file); } /** * 字符串编码转换 * * @param str 要转换的字符串 * @param unicode 字符编码 * @return */ public static String changeCharset(String str, String unicode) { if(Objects.isNull(str)){ return ""; } try { return new String(str.getBytes(), unicode); } catch (UnsupportedEncodingException e) { throw new RuntimeException(e); } } }