// data-acquisition.git

package com.chinaztt.mes.docx.util;
 
import cn.hutool.core.io.FileUtil;
import com.chinaztt.mes.docx.dto.GetFileDto;
import com.chinaztt.mes.docx.dto.ThicknessData;
import net.sourceforge.tess4j.Tesseract;
import net.sourceforge.tess4j.TesseractException;
import org.apache.commons.lang3.ObjectUtils;
import org.apache.commons.lang3.StringUtils;
import org.apache.poi.POIXMLDocument;
import org.apache.poi.POIXMLTextExtractor;
import org.apache.poi.hssf.usermodel.HSSFPicture;
import org.apache.poi.hssf.usermodel.HSSFPictureData;
import org.apache.poi.hssf.usermodel.HSSFSheet;
import org.apache.poi.hssf.usermodel.HSSFWorkbook;
import org.apache.poi.hwpf.extractor.WordExtractor;
import org.apache.poi.openxml4j.exceptions.InvalidFormatException;
import org.apache.poi.openxml4j.opc.OPCPackage;
import org.apache.poi.ss.usermodel.*;
import org.apache.poi.xssf.usermodel.XSSFSheet;
import org.apache.poi.xssf.usermodel.XSSFWorkbook;
import org.apache.poi.xwpf.extractor.XWPFWordExtractor;
import java.awt.Color;
import java.awt.image.BufferedImage;
import javax.imageio.ImageIO;
 
import java.io.*;
import java.nio.file.Files;
import java.sql.*;
import java.util.*;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
 
public class TakeWords {
 
    private static final String splitIdentifier = "@-@"; // custom unique delimiter written between cell/column values
 
    public static Object readWordFile(File file) {
        String result = "";
        //首先判断文件中的是doc/docx
        try {
            if (file.getName().endsWith(".doc")) {
                InputStream is = new FileInputStream(file);
                WordExtractor re = new WordExtractor(is);
                result = re.getText();
                re.close();
            } else if (file.getName().endsWith(".docx")) {
                OPCPackage opcPackage = POIXMLDocument.openPackage(file.getPath());
                POIXMLTextExtractor extractor = new XWPFWordExtractor(opcPackage);
                result = extractor.getText();
                extractor.close();
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
        return result;
    }
 
    public static Object readExcelFile(File file) throws FileNotFoundException, IOException {
        StringBuilder result = new StringBuilder();
        //创建工作簿对象
        XSSFWorkbook xssfWorkbook = new XSSFWorkbook(new FileInputStream(file));
        //获取工作簿下sheet的个数 只读取第一个sheet
//            int sheetNum = xssfWorkbook.getNumberOfSheets();
        //遍历工作簿中的所有数据
        for (int i = 0; i < 1; i++) {
            XSSFSheet sheet = xssfWorkbook.getSheetAt(i);
            //获取最后一行的num，即总行数。此处从0开始
            int maxRow = sheet.getLastRowNum();
            for (int row = 0; row <= maxRow; row++) {
                //获取最后单元格num，即总单元格数 ***注意：此处从1开始计数***
                int maxRol = sheet.getRow(row).getLastCellNum();
                StringBuilder aLine = new StringBuilder();
                for (int rol = 0; rol < maxRol; rol++) {
                    aLine.append(sheet.getRow(row).getCell(rol)).append(splitIdentifier);
                }
                result.append(aLine).append("\n");
            }
        }
        return result.toString();
    }
 
    public static Object readExcelxlsFile(File file) throws IOException {
        StringBuilder result = new StringBuilder();
        try (FileInputStream fis = new FileInputStream(file);
             Workbook workbook = new HSSFWorkbook(fis)) {
            // 获取第一个工作表
            Sheet sheet = workbook.getSheetAt(0);
            // 读取图片信息
            if (workbook instanceof HSSFWorkbook) {
                HSSFWorkbook hssfWorkbook = (HSSFWorkbook) workbook;
                List<HSSFPictureData> pictures = hssfWorkbook.getAllPictures();
                for (HSSFPictureData picture : pictures) {
                    // 获取图片类型
                    String pictureType = picture.suggestFileExtension();
                    // 获取图片数据
                    byte[] pictureData = picture.getData();
                    // 创建临时文件
                    File tempFile = File.createTempFile(UUID.randomUUID().toString(), "." + pictureType);
                    try (FileOutputStream fos = new FileOutputStream(tempFile)) {
                        fos.write(pictureData);
                    }
                    // 图片预处理
//                    File processedFile = preprocessImage(tempFile, pictureType);
                    // 调用 readPngFile1 方法读取图片文字信息
                    String ocrResult = "";
                    try {
                        ocrResult = (String) readPngFile1(tempFile);
//                        ocrResult = (String) readPngFile(tempFile);
//                        ocrResult = (String) readPngFile(processedFile);
                    } catch (TesseractException e) {
                        ocrResult = "OCR识别失败: " + e.getMessage();
                    } finally {
                        // 删除临时文件
                        tempFile.delete();
//                        processedFile.delete();
                    }
 
                    // 将图片信息添加到结果中
//                    result.append("Picture Type: ").append(pictureType)
//                            .append(", Picture Size: ").append(pictureData.length)
//                            .append(" bytes")
//                            .append(", OCR Result: ").append(ocrResult)
//                            .append(",");
                    String ocrText = fixOcrText(ocrResult);
                    result.append("OCR Result:").append(ocrText).append(",");
                }
            }
//
//            // 遍历每一行
//            for (Row row : sheet) {
//                // 遍历每一列
//                for (Cell cell : row) {
//                    CellType cellType = CellType.forInt(cell.getCellType());
//                    switch (cellType) {
//                        case STRING:
//                            result.append(cell.getStringCellValue());
//                            break;
//                        case NUMERIC:
//                            if (DateUtil.isCellDateFormatted(cell)) {
//                                result.append(cell.getDateCellValue());
//                            } else {
//                                result.append(cell.getNumericCellValue());
//                            }
//                            break;
//                        case BOOLEAN:
//                            result.append(cell.getBooleanCellValue());
//                            break;
//                        case FORMULA:
//                            result.append(cell.getCellFormula());
//                            break;
//                        default:
//                            result.append("");
//                    }
//                    result.append("\t");
//                }
//                result.append("\n");
//            }
        }
        return result;
 
    }
 
    // 修正 OCR 识别文本中的错误关键词
    public static String fixOcrText(String ocrText) {
        // 定义错误关键词和正确内容的映射，这里处理“击 宇 强 庞”修正为“击穿强度”
        // 考虑到可能有空格分隔，用正则匹配包含这些字的内容
        ocrText = ocrText.replaceAll("击\\s*宇\\s*强\\s*庞", "击穿强度");
        // 还可以继续添加其他错误修正，比如下面假设“电 压 \\(HV\\)”里的空格影响，也修正下
        ocrText = ocrText.replaceAll("电\\s*压\\s*\\(HV\\)", "电压(KV)");
        ocrText = ocrText.replaceAll("电\\s*流\\s*\\(nt\\)", "电流(mA)");
        return ocrText;
    }
 
    public static Object readPngFile1(File file) throws IOException, TesseractException {
        // 获取 tessdata 目录的绝对路径
        String arch = System.getProperty("sun.arch.data.model");
        File tessDataDir;
        if (arch.contains("32")) {
            tessDataDir = FileUtil.file(".", "/jre_32/tessdata");
        } else {
            tessDataDir = FileUtil.file(".", "/jre_64/tessdata");
        }
        String path = tessDataDir.getCanonicalPath();
        // 检查 chi_sim.traineddata 文件是否存在
        File chiSimFile = new File(path, "chi_sim.traineddata");
        if (!chiSimFile.exists()) {
            throw new FileNotFoundException("chi_sim.traineddata 文件未找到，请检查路径: " + chiSimFile.getAbsolutePath());
        }
        // 设置配置文件夹、识别语言、识别模式
        Tesseract tesseract = new Tesseract();
        tesseract.setDatapath(path);
        // 设置识别语言为中文简体和英文（如果要设置为英文可改为 "eng"）
        tesseract.setLanguage("chi_sim+eng");
        // 使用 OSD 进行自动页面分割以进行图像处理
        tesseract.setPageSegMode(1);
        // 设置引擎模式是神经网络 LSTM 引擎
        tesseract.setOcrEngineMode(1);
        // 开始识别整张图片中的文字
        return tesseract.doOCR(file);
    }
 
 
 
 
    public static Object readTxtFile(File file) throws IOException {
        FileInputStream fin = new FileInputStream(file);
        InputStreamReader reader = new InputStreamReader(fin);
        BufferedReader buffReader = new BufferedReader(reader);
        StringBuilder stringBuilder = new StringBuilder();
        String strTmp = "";
        while ((strTmp = buffReader.readLine()) != null) {
            stringBuilder.append(strTmp).append("\n");
        }
        buffReader.close();
        return stringBuilder.toString();
    }
 
    public static Object readCsvFile(File file) {
 
        StringBuilder stringBuilder = new StringBuilder();
        // 创建 reader
        try (BufferedReader br = Files.newBufferedReader(file.toPath())) {
            // CSV文件的分隔符
            String DELIMITER = ",";
            // 按行读取
            String line;
            while ((line = br.readLine()) != null) {
                // 分割
                String[] columns = line.split(DELIMITER);
                // 打印行
                stringBuilder.append(String.join(splitIdentifier, columns)).append("\n");
            }
        } catch (IOException ex) {
            ex.printStackTrace();
        }
        return stringBuilder.toString();
    }
 
    public static Object readMdbFile(File file, GetFileDto getFileDto) throws SQLException, ClassNotFoundException, InstantiationException, IllegalAccessException {
        Map<String, Object> tableMap = new HashMap<>(16);
        Properties prop = new Properties();
        //设置编码
        prop.put("charSet", "UTF-8");
        prop.put("user", "");
        prop.put("password", "");
        //数据地址
        String dbUrl = "jdbc:ucanaccess://" + file.getPath();
        //引入驱动
        Class.forName("net.ucanaccess.jdbc.UcanaccessDriver").newInstance();
        Connection conn = null;
        PreparedStatement preparedStatement = null;
        ResultSet rs = null;
 
        //连接数据库资源
        conn = DriverManager.getConnection(dbUrl, prop);
        try {
            List<Object> list = new ArrayList<>();
            //遍历获取多张表数据
//            String s = "select * from " + getFileDto.getDbFileName() + " where 1=1" + getFileDto.getMdbEntrustCode() + " = '" + getFileDto.getEntrustCode() +
//                    "' and " + getFileDto.getMdbSampleCode() + " = '" + getFileDto.getSampleCode() + "'";
            String s = "select * from " + getFileDto.getDbFileName() + " where 1=1";
            if(StringUtils.isNotBlank(getFileDto.getMdbEntrustCode())){
                s+=" and " + getFileDto.getMdbEntrustCode() + " = '" + getFileDto.getEntrustCode()+ "'";
            }
            if(StringUtils.isNotBlank(getFileDto.getMdbSampleCode())){
                s+=" and " + getFileDto.getMdbSampleCode() + " = '" + getFileDto.getSampleCode() + "'";
            }
            preparedStatement = conn.prepareStatement(s);
            rs = preparedStatement.executeQuery();
            ResultSetMetaData data = rs.getMetaData();
            while (rs.next()) {
                Map<String, String> map = new HashMap<>();
                for (int i = 1; i <= data.getColumnCount(); i++) {
                    //列名
                    String columnName = data.getColumnName(i);
                    map.put(columnName, rs.getString(i));
                }
                list.add(map);
            }
            tableMap.put("data", list);
        } catch (Exception e) {
        } finally {
            closeA1l(conn, preparedStatement, rs);
        }
        return tableMap;
    }
 
    private static void closeA1l(Connection conn, PreparedStatement preparedStatement, ResultSet rs) {
        try {
            if (null != rs) {
                rs.close();
            }
            if (null != preparedStatement) {
                preparedStatement.close();
            }
            if (null != conn) {
                conn.close();
            }
        } catch (Exception ignore) {
        }
    }
    public static Object getmysqlFile(GetFileDto getFileDto) throws SQLException, ClassNotFoundException, InstantiationException, IllegalAccessException {
        Map<String, Object> tableMap = new HashMap<>(16);
        // 数据库连接信息
        String url = "jdbc:mysql://localhost:3306/"+getFileDto.getDbFileName()+"?useSSL=false&serverTimezone=UTC&allowPublicKeyRetrieval=true";
        String user = getFileDto.getDbUserName();
        String password = getFileDto.getDbPassword();
        List<ThicknessData> dataList = new ArrayList<>();
 
        try (
                // 建立连接
                Connection connection = DriverManager.getConnection(url, user, password);
                // 创建 Statement 对象执行 SQL
                Statement statement = connection.createStatement()
        ) {
            String sql = "SELECT ThinnestPoint, AverageThickness FROM model1records";
            ResultSet resultSet = statement.executeQuery(sql);
 
            // 遍历结果集获取数据
            while (resultSet.next()) {
                double thinnestPoint = resultSet.getDouble("ThinnestPoint");
                double averageThickness = resultSet.getDouble("AverageThickness");
                dataList.add(new ThicknessData(thinnestPoint, averageThickness));
            }
            tableMap.put("data", dataList);
        } catch (Exception e) {
            e.printStackTrace();
            return R.failed("数据库查询出错: " + e.getMessage());
        }
        return tableMap;
    }
 
    public static Object readDbFile(File file, GetFileDto getFileDto) throws SQLException, ClassNotFoundException, InstantiationException, IllegalAccessException {
        Map<String, Object> tableMap = new HashMap<>(16);
        Properties prop = new Properties();
        //设置编码
        prop.put("charSet", "GBK");
        prop.put("user", "");
        prop.put("password", "");
        //数据地址
        String dbUrl = "jdbc:sqlite:" + file.getPath();
        //引入驱动
        Class.forName("org.sqlite.JDBC").newInstance();
        Connection conn = null;
        PreparedStatement preparedStatement = null;
        ResultSet rs = null;
        //连接数据库资源
        conn = DriverManager.getConnection(dbUrl, prop);
        try {
            List<Object> list = new ArrayList<>();
            //转义中文
            String sampleCode = changeCharset(getFileDto.getSampleCode(), "GBK");
            String sql = "select * from " + getFileDto.getDbFileName() +
                    " where " + getFileDto.getMdbEntrustCode() + " = '" + getFileDto.getEntrustCode() +
                    "' and " + getFileDto.getMdbSampleCode() + " = '" + sampleCode + "'";
            if (ObjectUtils.allNotNull(getFileDto.getFiberOpticRibbonCode(), getFileDto.getFiberOpticRibbon()) && !getFileDto.getFiberOpticRibbonCode().equals("null")) {
                sql = sql + " and " + getFileDto.getFiberOpticRibbonCode() + " = '" + getFileDto.getFiberOpticRibbon() + "'";
            }
            //遍历获取多张表数据
            preparedStatement = conn.prepareStatement(sql);
            rs = preparedStatement.executeQuery();
            ResultSetMetaData data = rs.getMetaData();
            while (rs.next()) {
                Map<String, String> map = new HashMap<>();
                for (int i = 1; i <= data.getColumnCount(); i++) {
                    //列名
                    String columnName = data.getColumnName(i);
                    //将返回的结果转换从utf-8编码
                    map.put(columnName, changeCharset(rs.getString(i),"UTF-8"));
                }
                list.add(map);
            }
            tableMap.put("data", list);
        } catch (Exception e) {
        } finally {
            closeA1l(conn, preparedStatement, rs);
        }
        return tableMap;
    }
 
    public static Object readPngFile(File file) throws IOException, TesseractException {
        String canonicalPath32 = FileUtil.file(".", "/jre_32/tessdata").getCanonicalPath();
        String canonicalPath64 = FileUtil.file(".", "/jre_64/tessdata").getCanonicalPath();
        String path;
        // 判断电脑是32位还是64位
        String arch = System.getProperty("sun.arch.data.model");
        if (arch.contains("32")) {
            path = canonicalPath32.replaceAll("/chi_sim.traineddata", "").replaceAll("\\\\", "/");
        } else {
            path = canonicalPath64.replaceAll("/chi_sim.traineddata", "").replaceAll("\\\\", "/");
        }
        // 设置 TESSDATA_PREFIX 环境变量
//        System.setProperty("TESSDATA_PREFIX", path);
        //设置配置文件夹微视、识别语言、识别模式
        Tesseract tesseract = new Tesseract();
        tesseract.setDatapath(path);
        //设置识别语言为中文简体，（如果要设置为英文可改为"eng"）
        tesseract.setLanguage("chi_sim");
        //使用 OSD 进行自动页面分割以进行图像处理
        tesseract.setPageSegMode(1);
        //设置引擎模式是神经网络LSTM引擎
        tesseract.setOcrEngineMode(1);
        //开始识别整张图片中的文字
        return tesseract.doOCR(file);
    }
 
    /**
     * 字符串编码转换
     *
     * @param str 要转换的字符串
     * @param unicode 字符编码
     * @return
     */
    public static String changeCharset(String str, String unicode) {
        if(Objects.isNull(str)){
            return "";
        }
        try {
            return new String(str.getBytes(), unicode);
        } catch (UnsupportedEncodingException e) {
            throw new RuntimeException(e);
        }
    }
 
}