| | |
| | | |
| | | import cn.hutool.core.io.FileUtil; |
| | | import com.chinaztt.mes.docx.dto.GetFileDto; |
| | | import com.chinaztt.mes.docx.dto.ThicknessData; |
| | | import com.opencsv.CSVReader; |
| | | import com.opencsv.CSVReaderBuilder; |
| | | import com.opencsv.exceptions.CsvValidationException; |
| | | import net.sourceforge.tess4j.Tesseract; |
| | | import net.sourceforge.tess4j.TesseractException; |
| | | import org.apache.commons.lang3.ObjectUtils; |
| | | import org.apache.commons.lang3.StringUtils; |
| | | import org.apache.poi.POIXMLDocument; |
| | | import org.apache.poi.POIXMLTextExtractor; |
| | | import org.apache.poi.hssf.usermodel.HSSFPictureData; |
| | | import org.apache.poi.hssf.usermodel.HSSFWorkbook; |
| | | import org.apache.poi.hwpf.extractor.WordExtractor; |
| | | import org.apache.poi.openxml4j.opc.OPCPackage; |
| | | import org.apache.poi.ss.usermodel.Sheet; |
| | | import org.apache.poi.ss.usermodel.Workbook; |
| | | import org.apache.poi.xssf.usermodel.XSSFSheet; |
| | | import org.apache.poi.xssf.usermodel.XSSFWorkbook; |
| | | import org.apache.poi.xwpf.extractor.XWPFWordExtractor; |
| | | |
| | | import java.io.*; |
| | | import java.nio.file.Files; |
| | | import java.sql.*; |
| | | import java.util.*; |
| | | import java.util.regex.Pattern; |
| | | |
| | | public class TakeWords { |
| | | |
| | | private static final String splitIdentifier = "@-@"; // Custom delimiter token written between extracted cell values |
| | | |
| | | // Regex for scientific notation: optional sign, digits, optional fraction, e/E, optionally signed exponent |
| | | private static final Pattern SCIENTIFIC_PATTERN = Pattern.compile( |
| | | "^[+-]?\\d+(\\.\\d+)?[eE][+-]?\\d+$" |
| | | ); |
| | | |
| | | public static Object readWordFile(File file) { |
| | | String result = ""; |
| | |
| | | return result.toString(); |
| | | } |
| | | |
| | | public static Object readExcelxlsFile(File file) throws IOException { |
| | | StringBuilder result = new StringBuilder(); |
| | | try (FileInputStream fis = new FileInputStream(file); |
| | | Workbook workbook = new HSSFWorkbook(fis)) { |
| | | // 获取第一个工作表 |
| | | Sheet sheet = workbook.getSheetAt(0); |
| | | // 读取图片信息 |
| | | if (workbook instanceof HSSFWorkbook) { |
| | | HSSFWorkbook hssfWorkbook = (HSSFWorkbook) workbook; |
| | | List<HSSFPictureData> pictures = hssfWorkbook.getAllPictures(); |
| | | for (HSSFPictureData picture : pictures) { |
| | | // 获取图片类型 |
| | | String pictureType = picture.suggestFileExtension(); |
| | | // 获取图片数据 |
| | | byte[] pictureData = picture.getData(); |
| | | // 创建临时文件 |
| | | File tempFile = File.createTempFile(UUID.randomUUID().toString(), "." + pictureType); |
| | | try (FileOutputStream fos = new FileOutputStream(tempFile)) { |
| | | fos.write(pictureData); |
| | | } |
| | | // 图片预处理 |
| | | // File processedFile = preprocessImage(tempFile, pictureType); |
| | | // 调用 readPngFile1 方法读取图片文字信息 |
| | | String ocrResult = ""; |
| | | try { |
| | | ocrResult = (String) readPngFile1(tempFile); |
| | | // ocrResult = (String) readPngFile(tempFile); |
| | | // ocrResult = (String) readPngFile(processedFile); |
| | | } catch (TesseractException e) { |
| | | ocrResult = "OCR识别失败: " + e.getMessage(); |
| | | } finally { |
| | | // 删除临时文件 |
| | | tempFile.delete(); |
| | | // processedFile.delete(); |
| | | } |
| | | |
| | | // 将图片信息添加到结果中 |
| | | // result.append("Picture Type: ").append(pictureType) |
| | | // .append(", Picture Size: ").append(pictureData.length) |
| | | // .append(" bytes") |
| | | // .append(", OCR Result: ").append(ocrResult) |
| | | // .append(","); |
| | | String ocrText = fixOcrText(ocrResult); |
| | | result.append("OCR Result:").append(ocrText).append(","); |
| | | } |
| | | } |
| | | // |
| | | // // 遍历每一行 |
| | | // for (Row row : sheet) { |
| | | // // 遍历每一列 |
| | | // for (Cell cell : row) { |
| | | // CellType cellType = CellType.forInt(cell.getCellType()); |
| | | // switch (cellType) { |
| | | // case STRING: |
| | | // result.append(cell.getStringCellValue()); |
| | | // break; |
| | | // case NUMERIC: |
| | | // if (DateUtil.isCellDateFormatted(cell)) { |
| | | // result.append(cell.getDateCellValue()); |
| | | // } else { |
| | | // result.append(cell.getNumericCellValue()); |
| | | // } |
| | | // break; |
| | | // case BOOLEAN: |
| | | // result.append(cell.getBooleanCellValue()); |
| | | // break; |
| | | // case FORMULA: |
| | | // result.append(cell.getCellFormula()); |
| | | // break; |
| | | // default: |
| | | // result.append(""); |
| | | // } |
| | | // result.append("\t"); |
| | | // } |
| | | // result.append("\n"); |
| | | // } |
| | | } |
| | | return result; |
| | | |
| | | } |
| | | |
| | | // 修正 OCR 识别文本中的错误关键词 |
| | | public static String fixOcrText(String ocrText) { |
| | | // 定义错误关键词和正确内容的映射,这里处理“击 宇 强 庞”修正为“击穿强度” |
| | | // 考虑到可能有空格分隔,用正则匹配包含这些字的内容 |
| | | ocrText = ocrText.replaceAll("击\\s*宇\\s*强\\s*庞", "击穿强度"); |
| | | // 还可以继续添加其他错误修正,比如下面假设“电 压 \\(HV\\)”里的空格影响,也修正下 |
| | | ocrText = ocrText.replaceAll("电\\s*压\\s*\\(HV\\)", "电压(KV)"); |
| | | ocrText = ocrText.replaceAll("电\\s*流\\s*\\(nt\\)", "电流(mA)"); |
| | | return ocrText; |
| | | } |
| | | |
| | | public static Object readPngFile1(File file) throws IOException, TesseractException { |
| | | // 获取 tessdata 目录的绝对路径 |
| | | String arch = System.getProperty("sun.arch.data.model"); |
| | | File tessDataDir; |
| | | if (arch.contains("32")) { |
| | | tessDataDir = FileUtil.file(".", "/jre_32/tessdata"); |
| | | } else { |
| | | tessDataDir = FileUtil.file(".", "/jre_64/tessdata"); |
| | | } |
| | | String path = tessDataDir.getCanonicalPath(); |
| | | // 检查 chi_sim.traineddata 文件是否存在 |
| | | File chiSimFile = new File(path, "chi_sim.traineddata"); |
| | | if (!chiSimFile.exists()) { |
| | | throw new FileNotFoundException("chi_sim.traineddata 文件未找到,请检查路径: " + chiSimFile.getAbsolutePath()); |
| | | } |
| | | // 设置配置文件夹、识别语言、识别模式 |
| | | Tesseract tesseract = new Tesseract(); |
| | | tesseract.setDatapath(path); |
| | | // 设置识别语言为中文简体和英文(如果要设置为英文可改为 "eng") |
| | | tesseract.setLanguage("chi_sim+eng"); |
| | | // 使用 OSD 进行自动页面分割以进行图像处理 |
| | | tesseract.setPageSegMode(1); |
| | | // 设置引擎模式是神经网络 LSTM 引擎 |
| | | tesseract.setOcrEngineMode(1); |
| | | // 开始识别整张图片中的文字 |
| | | return tesseract.doOCR(file); |
| | | } |
| | | |
| | | |
| | | |
| | | |
| | | public static Object readTxtFile(File file) throws IOException { |
| | | FileInputStream fin = new FileInputStream(file); |
| | | InputStreamReader reader = new InputStreamReader(fin); |
| | |
| | | } |
| | | |
| | | public static Object readCsvFile(File file) { |
| | | |
| | | StringBuilder stringBuilder = new StringBuilder(); |
| | | // 创建 reader |
| | | try (BufferedReader br = Files.newBufferedReader(file.toPath())) { |
| | | // CSV文件的分隔符 |
| | | String DELIMITER = ","; |
| | | // 按行读取 |
| | | String line; |
| | | while ((line = br.readLine()) != null) { |
| | | // 分割 |
| | | String[] columns = line.split(DELIMITER); |
| | | // 打印行 |
| | | stringBuilder.append(String.join(splitIdentifier, columns)).append("\n"); |
| | | // try (BufferedReader br = Files.newBufferedReader(file.toPath())) { |
| | | // // CSV文件的分隔符 |
| | | // String DELIMITER = ","; |
| | | // // 按行读取 |
| | | // String line; |
| | | // System.out.println(br.readLine()); |
| | | // while ((line = br.readLine()) != null) { |
| | | // // 分割 |
| | | // String[] columns = line.split(DELIMITER); |
| | | // // 打印行 |
| | | // stringBuilder.append(String.join(splitIdentifier, columns)).append("\n"); |
| | | // } |
| | | // } catch (IOException ex) { |
| | | // ex.printStackTrace(); |
| | | // } |
| | | try (FileReader fileReader = new FileReader(file); |
| | | CSVReader csvReader = new CSVReaderBuilder(fileReader).build()) { |
| | | |
| | | String[] nextLine; |
| | | while ((nextLine = csvReader.readNext()) != null) { |
| | | // 处理每一行数据 |
| | | for (String cell : nextLine) { |
| | | if(StringUtils.isNotBlank(cell)){ |
| | | stringBuilder.append(scientificToNumber(cell)).append(splitIdentifier); |
| | | } |
| | | } |
| | | stringBuilder.append("\n"); |
| | | } |
| | | } catch (IOException ex) { |
| | | ex.printStackTrace(); |
| | | } catch (IOException e) { |
| | | e.printStackTrace(); |
| | | } catch (CsvValidationException e) { |
| | | throw new RuntimeException(e); |
| | | } |
| | | return stringBuilder.toString(); |
| | | } |
| | | |
| | | /** |
| | | * 将科学计数法转换为数字 |
| | | * @param cell |
| | | * @return |
| | | */ |
| | | public static String scientificToNumber(String cell){ |
| | | if(SCIENTIFIC_PATTERN.matcher(cell).matches()){ |
| | | return String.valueOf(Double.parseDouble(cell)); |
| | | } |
| | | return cell; |
| | | } |
| | | |
| | | public static Object readMdbFile(File file, GetFileDto getFileDto) throws SQLException, ClassNotFoundException, InstantiationException, IllegalAccessException { |
| | |
| | | } catch (Exception ignore) { |
| | | } |
| | | } |
| | | public static Object getmysqlFile(GetFileDto getFileDto) throws SQLException, ClassNotFoundException, InstantiationException, IllegalAccessException { |
| | | Map<String, Object> tableMap = new HashMap<>(16); |
| | | // 数据库连接信息 |
| | | String url = "jdbc:mysql://localhost:3306/"+getFileDto.getDbFileName()+"?useSSL=false&serverTimezone=UTC&allowPublicKeyRetrieval=true"; |
| | | String user = getFileDto.getDbUserName(); |
| | | String password = getFileDto.getDbPassword(); |
| | | List<ThicknessData> dataList = new ArrayList<>(); |
| | | |
| | | try ( |
| | | // 建立连接 |
| | | Connection connection = DriverManager.getConnection(url, user, password); |
| | | // 创建 Statement 对象执行 SQL |
| | | Statement statement = connection.createStatement() |
| | | ) { |
| | | String sql = "SELECT ThinnestPoint, AverageThickness FROM model1records"; |
| | | ResultSet resultSet = statement.executeQuery(sql); |
| | | |
| | | // 遍历结果集获取数据 |
| | | while (resultSet.next()) { |
| | | double thinnestPoint = resultSet.getDouble("ThinnestPoint"); |
| | | double averageThickness = resultSet.getDouble("AverageThickness"); |
| | | dataList.add(new ThicknessData(thinnestPoint, averageThickness)); |
| | | } |
| | | tableMap.put("data", dataList); |
| | | } catch (Exception e) { |
| | | e.printStackTrace(); |
| | | return R.failed("数据库查询出错: " + e.getMessage()); |
| | | } |
| | | return tableMap; |
| | | } |
| | | |
| | | public static Object readDbFile(File file, GetFileDto getFileDto) throws SQLException, ClassNotFoundException, InstantiationException, IllegalAccessException { |
| | | Map<String, Object> tableMap = new HashMap<>(16); |
| | |
| | | } else { |
| | | path = canonicalPath64.replaceAll("/chi_sim.traineddata", "").replaceAll("\\\\", "/"); |
| | | } |
| | | // 设置 TESSDATA_PREFIX 环境变量 |
| | | // System.setProperty("TESSDATA_PREFIX", path); |
| | | //设置配置文件夹微视、识别语言、识别模式 |
| | | Tesseract tesseract = new Tesseract(); |
| | | tesseract.setDatapath(path); |