From 9ff6b2fa38167d93de00e6d730d2b038d7731776 Mon Sep 17 00:00:00 2001 From: yaowanxin <3588231647@qq.com> Date: Thu, 24 Jul 2025 13:21:52 +0800 Subject: [PATCH] Merge remote-tracking branch 'origin/ywx' into ywx --- src/main/java/com/chinaztt/mes/docx/util/TakeWords.java | 269 +++++++++++++++++++++++++++++++++++++++++++++++++++-- 1 files changed, 256 insertions(+), 13 deletions(-) diff --git a/src/main/java/com/chinaztt/mes/docx/util/TakeWords.java b/src/main/java/com/chinaztt/mes/docx/util/TakeWords.java index 3d77d75..c3469d4 100644 --- a/src/main/java/com/chinaztt/mes/docx/util/TakeWords.java +++ b/src/main/java/com/chinaztt/mes/docx/util/TakeWords.java @@ -2,26 +2,46 @@ import cn.hutool.core.io.FileUtil; import com.chinaztt.mes.docx.dto.GetFileDto; +import com.opencsv.CSVReader; +import com.opencsv.CSVReaderBuilder; +import com.opencsv.exceptions.CsvValidationException; +import com.chinaztt.mes.docx.dto.ThicknessData; import net.sourceforge.tess4j.Tesseract; import net.sourceforge.tess4j.TesseractException; import org.apache.commons.lang3.ObjectUtils; import org.apache.commons.lang3.StringUtils; import org.apache.poi.POIXMLDocument; import org.apache.poi.POIXMLTextExtractor; +import org.apache.poi.hssf.usermodel.HSSFPicture; +import org.apache.poi.hssf.usermodel.HSSFPictureData; +import org.apache.poi.hssf.usermodel.HSSFSheet; +import org.apache.poi.hssf.usermodel.HSSFWorkbook; import org.apache.poi.hwpf.extractor.WordExtractor; +import org.apache.poi.openxml4j.exceptions.InvalidFormatException; import org.apache.poi.openxml4j.opc.OPCPackage; +import org.apache.poi.ss.usermodel.*; import org.apache.poi.xssf.usermodel.XSSFSheet; import org.apache.poi.xssf.usermodel.XSSFWorkbook; import org.apache.poi.xwpf.extractor.XWPFWordExtractor; +import java.awt.Color; +import java.awt.image.BufferedImage; +import javax.imageio.ImageIO; import java.io.*; -import java.nio.file.Files; import java.sql.*; import java.util.*; +import java.util.regex.Pattern; +import 
java.util.regex.Matcher; +import java.util.regex.Pattern; public class TakeWords { private static final String splitIdentifier = "@-@"; // 自定义唯一标识符 + + // 科学计数法正则模式 + private static final Pattern SCIENTIFIC_PATTERN = Pattern.compile( + "^[+-]?\\d+(\\.\\d+)?[eE][+-]?\\d+$" + ); public static Object readWordFile(File file) { String result = ""; @@ -68,6 +88,129 @@ return result.toString(); } + public static Object readExcelxlsFile(File file) throws IOException { + StringBuilder result = new StringBuilder(); + try (FileInputStream fis = new FileInputStream(file); + Workbook workbook = new HSSFWorkbook(fis)) { + // 获取第一个工作表 + Sheet sheet = workbook.getSheetAt(0); + // 读取图片信息 + if (workbook instanceof HSSFWorkbook) { + HSSFWorkbook hssfWorkbook = (HSSFWorkbook) workbook; + List<HSSFPictureData> pictures = hssfWorkbook.getAllPictures(); + for (HSSFPictureData picture : pictures) { + // 获取图片类型 + String pictureType = picture.suggestFileExtension(); + // 获取图片数据 + byte[] pictureData = picture.getData(); + // 创建临时文件 + File tempFile = File.createTempFile(UUID.randomUUID().toString(), "." 
+ pictureType); + try (FileOutputStream fos = new FileOutputStream(tempFile)) { + fos.write(pictureData); + } + // 鍥剧墖棰勫鐞� +// File processedFile = preprocessImage(tempFile, pictureType); + // 璋冪敤 readPngFile1 鏂规硶璇诲彇鍥剧墖鏂囧瓧淇℃伅 + String ocrResult = ""; + try { + ocrResult = (String) readPngFile1(tempFile); +// ocrResult = (String) readPngFile(tempFile); +// ocrResult = (String) readPngFile(processedFile); + } catch (TesseractException e) { + ocrResult = "OCR璇嗗埆澶辫触: " + e.getMessage(); + } finally { + // 鍒犻櫎涓存椂鏂囦欢 + tempFile.delete(); +// processedFile.delete(); + } + + // 灏嗗浘鐗囦俊鎭坊鍔犲埌缁撴灉涓� +// result.append("Picture Type: ").append(pictureType) +// .append(", Picture Size: ").append(pictureData.length) +// .append(" bytes") +// .append(", OCR Result: ").append(ocrResult) +// .append(","); + String ocrText = fixOcrText(ocrResult); + result.append("OCR Result:").append(ocrText).append(","); + } + } +// +// // 閬嶅巻姣忎竴琛� +// for (Row row : sheet) { +// // 閬嶅巻姣忎竴鍒� +// for (Cell cell : row) { +// CellType cellType = CellType.forInt(cell.getCellType()); +// switch (cellType) { +// case STRING: +// result.append(cell.getStringCellValue()); +// break; +// case NUMERIC: +// if (DateUtil.isCellDateFormatted(cell)) { +// result.append(cell.getDateCellValue()); +// } else { +// result.append(cell.getNumericCellValue()); +// } +// break; +// case BOOLEAN: +// result.append(cell.getBooleanCellValue()); +// break; +// case FORMULA: +// result.append(cell.getCellFormula()); +// break; +// default: +// result.append(""); +// } +// result.append("\t"); +// } +// result.append("\n"); +// } + } + return result; + + } + + // 淇 OCR 璇嗗埆鏂囨湰涓殑閿欒鍏抽敭璇� + public static String fixOcrText(String ocrText) { + // 瀹氫箟閿欒鍏抽敭璇嶅拰姝g‘鍐呭鐨勬槧灏勶紝杩欓噷澶勭悊鈥滃嚮 瀹� 寮� 搴炩�濅慨姝d负鈥滃嚮绌垮己搴︹�� + // 鑰冭檻鍒板彲鑳芥湁绌烘牸鍒嗛殧锛岀敤姝e垯鍖归厤鍖呭惈杩欎簺瀛楃殑鍐呭 + ocrText = ocrText.replaceAll("鍑籠\s*瀹嘰\s*寮篭\s*搴�", "鍑荤┛寮哄害"); + // 杩樺彲浠ョ户缁坊鍔犲叾浠栭敊璇慨姝o紝姣斿涓嬮潰鍋囪鈥滅數 鍘� \\(HV\\)鈥濋噷鐨勭┖鏍煎奖鍝嶏紝涔熶慨姝d笅 + ocrText = ocrText.replaceAll("鐢礬\s*鍘媆\s*\\(HV\\)", "鐢靛帇(KV)"); + 
ocrText = ocrText.replaceAll("鐢礬\s*娴乗\s*\\(nt\\)", "鐢垫祦(mA)"); + return ocrText; + } + + public static Object readPngFile1(File file) throws IOException, TesseractException { + // 鑾峰彇 tessdata 鐩綍鐨勭粷瀵硅矾寰� + String arch = System.getProperty("sun.arch.data.model"); + File tessDataDir; + if (arch.contains("32")) { + tessDataDir = FileUtil.file(".", "/jre_32/tessdata"); + } else { + tessDataDir = FileUtil.file(".", "/jre_64/tessdata"); + } + String path = tessDataDir.getCanonicalPath(); + // 妫�鏌� chi_sim.traineddata 鏂囦欢鏄惁瀛樺湪 + File chiSimFile = new File(path, "chi_sim.traineddata"); + if (!chiSimFile.exists()) { + throw new FileNotFoundException("chi_sim.traineddata 鏂囦欢鏈壘鍒帮紝璇锋鏌ヨ矾寰�: " + chiSimFile.getAbsolutePath()); + } + // 璁剧疆閰嶇疆鏂囦欢澶广�佽瘑鍒瑷�銆佽瘑鍒ā寮� + Tesseract tesseract = new Tesseract(); + tesseract.setDatapath(path); + // 璁剧疆璇嗗埆璇█涓轰腑鏂囩畝浣撳拰鑻辨枃锛堝鏋滆璁剧疆涓鸿嫳鏂囧彲鏀逛负 "eng"锛� + tesseract.setLanguage("chi_sim+eng"); + // 浣跨敤 OSD 杩涜鑷姩椤甸潰鍒嗗壊浠ヨ繘琛屽浘鍍忓鐞� + tesseract.setPageSegMode(1); + // 璁剧疆寮曟搸妯″紡鏄缁忕綉缁� LSTM 寮曟搸 + tesseract.setOcrEngineMode(1); + // 寮�濮嬭瘑鍒暣寮犲浘鐗囦腑鐨勬枃瀛� + return tesseract.doOCR(file); + } + + + + public static Object readTxtFile(File file) throws IOException { FileInputStream fin = new FileInputStream(file); InputStreamReader reader = new InputStreamReader(fin); @@ -82,23 +225,55 @@ } public static Object readCsvFile(File file) { + StringBuilder stringBuilder = new StringBuilder(); // 鍒涘缓 reader - try (BufferedReader br = Files.newBufferedReader(file.toPath())) { - // CSV鏂囦欢鐨勫垎闅旂 - String DELIMITER = ","; - // 鎸夎璇诲彇 - String line; - while ((line = br.readLine()) != null) { - // 鍒嗗壊 - String[] columns = line.split(DELIMITER); - // 鎵撳嵃琛� - stringBuilder.append(String.join(splitIdentifier, columns)).append("\n"); +// try (BufferedReader br = Files.newBufferedReader(file.toPath())) { +// // CSV鏂囦欢鐨勫垎闅旂 +// String DELIMITER = ","; +// // 鎸夎璇诲彇 +// String line; +// System.out.println(br.readLine()); +// while ((line = br.readLine()) != null) { +// // 鍒嗗壊 +// String[] 
columns = line.split(DELIMITER); +// // 鎵撳嵃琛� +// stringBuilder.append(String.join(splitIdentifier, columns)).append("\n"); +// } +// } catch (IOException ex) { +// ex.printStackTrace(); +// } + try (FileReader fileReader = new FileReader(file); + CSVReader csvReader = new CSVReaderBuilder(fileReader).build()) { + + String[] nextLine; + while ((nextLine = csvReader.readNext()) != null) { + // 澶勭悊姣忎竴琛屾暟鎹� + for (String cell : nextLine) { + if(StringUtils.isNotBlank(cell)){ + stringBuilder.append(scientificToNumber(cell)).append(splitIdentifier); + } + } + stringBuilder.append("\n"); } - } catch (IOException ex) { - ex.printStackTrace(); + } catch (IOException e) { + e.printStackTrace(); + } catch (CsvValidationException e) { + throw new RuntimeException(e); } return stringBuilder.toString(); + } + + /** + * 灏嗙瀛﹁鏁版硶杞崲涓烘暟瀛� + * @param cell + * @return + */ + public static String scientificToNumber(String cell){ + if(SCIENTIFIC_PATTERN.matcher(cell).matches()){ + return String.valueOf(Double.parseDouble(cell)); + } + return cell; } public static Object readMdbFile(File file, GetFileDto getFileDto) throws SQLException, ClassNotFoundException, InstantiationException, IllegalAccessException { @@ -164,6 +339,72 @@ } catch (Exception ignore) { } } + public static Object getMysqlFile(GetFileDto getFileDto){ + Map<String, Object> tableMap = new HashMap<>(16); + // 浠� GetFileDto 鑾峰彇鏁版嵁搴撳悕锛屽搴斻�愭枃浠跺悕绉般�戝瓧娈� + String dbName = getFileDto.getDbFileName(); + String user = getFileDto.getDbUserName(); + String password = getFileDto.getDbPassword(); + // 浠� GetFileDto 鑾峰彇鏁版嵁琛ㄥ悕锛屽搴斻�愭暟鎹簱琛ㄥ悕銆戝瓧娈� + String table = getFileDto.getDbTable(); + // 妫�鏌ユ暟鎹簱鍚嶅拰琛ㄥ悕鏄惁涓虹┖ + if (dbName == null || dbName.isEmpty() || table == null || table.isEmpty()) { + return R.failed("鏁版嵁搴撳悕鎴栬〃鍚嶄笉鑳戒负绌�"); + } + // 鏁版嵁搴撹繛鎺ヤ俊鎭� + String url = "jdbc:mysql://localhost:3306/"+dbName+"?useSSL=false&serverTimezone=UTC&allowPublicKeyRetrieval=true"; + Connection connection = null; + PreparedStatement preparedStatement = 
null; + ResultSet resultSet = null; + List<Map<String, Object>> dataList = new ArrayList<>(); + + try { + // 寤虹珛杩炴帴 + connection = DriverManager.getConnection(url, user, password); + // 鏋勫缓鍩虹 SQL + + StringBuilder sql = new StringBuilder("SELECT * FROM ").append(table); + // 鍒涘缓 PreparedStatement 瀵硅薄鎵ц SQL + preparedStatement = connection.prepareStatement(sql.toString()); + resultSet = preparedStatement.executeQuery(); + ResultSetMetaData metaData = resultSet.getMetaData(); + int columnCount = metaData.getColumnCount(); + // 閬嶅巻缁撴灉闆嗚幏鍙栨暟鎹� + while (resultSet.next()) { + Map<String, Object> rowData = new HashMap<>(); + for (int i = 1; i <= columnCount; i++) { + String columnName = metaData.getColumnName(i); + rowData.put(columnName, resultSet.getObject(i)); + } + dataList.add(rowData); + } +// while (resultSet.next()) { +// double thinnestPoint = resultSet.getDouble("ThinnestPoint"); +// double averageThickness = resultSet.getDouble("AverageThickness"); +// dataList.add(new ThicknessData(thinnestPoint, averageThickness)); +// } + tableMap.put("data", dataList); + } catch (Exception e) { + e.printStackTrace(); + // 鍋囪 R 绫绘湁 failed 鏂规硶锛岃嫢娌℃湁闇�琛ュ厖瀹炵幇 + return R.failed("鏁版嵁搴撴煡璇㈠嚭閿�: " + e.getMessage()); + } finally { + try { + if (resultSet != null) { + resultSet.close(); + } + if (preparedStatement != null) { + preparedStatement.close(); + } + if (connection != null) { + connection.close(); + } + } catch (SQLException e) { + e.printStackTrace(); + } + } + return tableMap; + } public static Object readDbFile(File file, GetFileDto getFileDto) throws SQLException, ClassNotFoundException, InstantiationException, IllegalAccessException { Map<String, Object> tableMap = new HashMap<>(16); @@ -224,6 +465,8 @@ } else { path = canonicalPath64.replaceAll("/chi_sim.traineddata", "").replaceAll("\\\\", "/"); } + // 璁剧疆 TESSDATA_PREFIX 鐜鍙橀噺 +// System.setProperty("TESSDATA_PREFIX", path); //璁剧疆閰嶇疆鏂囦欢澶瑰井瑙嗐�佽瘑鍒瑷�銆佽瘑鍒ā寮� Tesseract tesseract = new Tesseract(); 
tesseract.setDatapath(path); -- Gitblit v1.9.3