package com.ruoyi.ai.service;

import com.ruoyi.common.utils.StringUtils;
import org.apache.poi.hssf.usermodel.HSSFWorkbook;
import org.apache.poi.ss.usermodel.DataFormatter;
import org.apache.poi.ss.usermodel.Row;
import org.apache.poi.ss.usermodel.Sheet;
import org.apache.poi.xssf.usermodel.XSSFWorkbook;
import org.apache.poi.xwpf.extractor.XWPFWordExtractor;
import org.apache.poi.xwpf.usermodel.XWPFDocument;
import org.springframework.stereotype.Component;
import org.springframework.web.multipart.MultipartFile;

import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.nio.ByteBuffer;
import java.nio.charset.CharacterCodingException;
import java.nio.charset.Charset;
import java.nio.charset.CodingErrorAction;
import java.nio.charset.StandardCharsets;
import java.util.Locale;

/**
 * Turns an uploaded file into plain text.
 *
 * <p>The file type is decided solely from the extension of the original file name.
 * Plain-text files are decoded, {@code docx}/{@code xlsx}/{@code xls} files are read
 * with Apache POI, and image files yield a fixed notice string instead of their content.
 */
@Component
public class AiFileTextExtractor {

    /** Upper bound on the accepted upload size: 10 MiB. */
    private static final long MAX_FILE_SIZE = 10L * 1024 * 1024;

    /** Byte order mark that some editors put at the start of UTF-8 files. */
    private static final char BYTE_ORDER_MARK = '\uFEFF';

    /**
     * Extracts the textual content of an uploaded file.
     *
     * @param file the uploaded file
     * @return the decoded or extracted text; for image extensions, a notice string that
     *         contains the original file name
     * @throws IllegalArgumentException if the file is {@code null} or empty, is larger than
     *                                  {@link #MAX_FILE_SIZE}, or has an unsupported extension
     * @throws IOException              if the upload cannot be read or parsed
     */
    public String extractText(MultipartFile file) throws IOException {
        if (file == null || file.isEmpty()) {
            throw new IllegalArgumentException("文件不能为空");
        }
        if (file.getSize() > MAX_FILE_SIZE) {
            throw new IllegalArgumentException("文件过大,请控制在10MB以内");
        }

        String filename = file.getOriginalFilename();
        String ext = getExtension(filename);

        // Images are not parsed here, so there is no need to pull their bytes into memory.
        if (isImage(ext)) {
            return "图片文件:" + filename + ",已上传,请结合图片内容识别采购单据、表格和产品明细。";
        }
        // The upload is only read inside a branch that actually consumes it, so an
        // unsupported type is rejected without buffering the whole file.
        if (isPlainText(ext)) {
            return decodeText(file.getBytes());
        }
        if ("docx".equals(ext)) {
            return extractDocx(file.getBytes());
        }
        if ("xlsx".equals(ext)) {
            return extractXlsx(file.getBytes());
        }
        if ("xls".equals(ext)) {
            return extractXls(file.getBytes());
        }
        throw new IllegalArgumentException("暂不支持该文件类型: " + ext);
    }

    /**
     * Tells whether the upload has one of the recognised image extensions.
     *
     * @param file the uploaded file, may be {@code null}
     * @return {@code true} if the original file name ends in a recognised image extension;
     *         {@code false} otherwise or when {@code file} is {@code null}
     */
    public boolean isImageFile(MultipartFile file) {
        if (file == null) {
            return false;
        }
        return isImage(getExtension(file.getOriginalFilename()));
    }

    /** Reads the text of a {@code .docx} document through POI's Word extractor. */
    private String extractDocx(byte[] bytes) throws IOException {
        try (ByteArrayInputStream inputStream = new ByteArrayInputStream(bytes);
             XWPFDocument document = new XWPFDocument(inputStream);
             XWPFWordExtractor extractor = new XWPFWordExtractor(document)) {
            return extractor.getText();
        }
    }

    /** Reads every sheet of an {@code .xlsx} workbook as tab-separated text. */
    private String extractXlsx(byte[] bytes) throws IOException {
        try (ByteArrayInputStream inputStream = new ByteArrayInputStream(bytes);
             XSSFWorkbook workbook = new XSSFWorkbook(inputStream)) {
            return extractWorkbook(workbook);
        }
    }

    /** Reads every sheet of a legacy {@code .xls} workbook as tab-separated text. */
    private String extractXls(byte[] bytes) throws IOException {
        try (ByteArrayInputStream inputStream = new ByteArrayInputStream(bytes);
             HSSFWorkbook workbook = new HSSFWorkbook(inputStream)) {
            return extractWorkbook(workbook);
        }
    }

    /**
     * Renders a workbook as text: a {@code "Sheet: <name>"} header line per sheet, followed
     * by one line per row with the formatted cell values separated by tabs.
     *
     * @param workbook the opened workbook
     * @return the concatenated text of all sheets
     */
    private String extractWorkbook(org.apache.poi.ss.usermodel.Workbook workbook) {
        StringBuilder text = new StringBuilder();
        DataFormatter formatter = new DataFormatter();
        int sheetCount = workbook.getNumberOfSheets();
        for (int i = 0; i < sheetCount; i++) {
            Sheet sheet = workbook.getSheetAt(i);
            text.append("Sheet: ").append(sheet.getSheetName()).append('\n');
            for (Row row : sheet) {
                // A row without cells reports a non-positive last cell number; the loop below
                // then emits nothing and the row still contributes an empty line.
                short lastCellNum = row.getLastCellNum();
                for (int c = 0; c < lastCellNum; c++) {
                    if (c > 0) {
                        text.append('\t');
                    }
                    text.append(formatter.formatCellValue(row.getCell(c)));
                }
                text.append('\n');
            }
        }
        return text.toString();
    }

    /**
     * Decodes raw bytes as text, preferring UTF-8 and falling back to GBK.
     *
     * <p>UTF-8 is decoded strictly, so the fallback is taken only when the bytes are not
     * well-formed UTF-8. A valid UTF-8 file that happens to contain the replacement
     * character U+FFFD is therefore kept as UTF-8. A leading byte order mark is removed
     * from UTF-8 input.
     *
     * @param bytes the raw file content
     * @return the decoded text
     */
    private String decodeText(byte[] bytes) {
        try {
            String utf8 = StandardCharsets.UTF_8.newDecoder()
                    .onMalformedInput(CodingErrorAction.REPORT)
                    .onUnmappableCharacter(CodingErrorAction.REPORT)
                    .decode(ByteBuffer.wrap(bytes))
                    .toString();
            if (!utf8.isEmpty() && utf8.charAt(0) == BYTE_ORDER_MARK) {
                return utf8.substring(1);
            }
            return utf8;
        } catch (CharacterCodingException notUtf8) {
            // Not well-formed UTF-8: treat the content as GBK instead.
            return new String(bytes, Charset.forName("GBK"));
        }
    }

    /**
     * Returns the lower-cased text after the last {@code '.'} of a file name.
     *
     * @param filename the original file name, may be {@code null}
     * @return the extension without the dot, or an empty string when the name has no text
     *         or contains no dot
     */
    private String getExtension(String filename) {
        if (!StringUtils.hasText(filename) || !filename.contains(".")) {
            return "";
        }
        // Locale.ROOT keeps the result independent of the server's default locale.
        return filename.substring(filename.lastIndexOf('.') + 1).toLowerCase(Locale.ROOT);
    }

    /** Tells whether the extension is one of the file types decoded directly as text. */
    private boolean isPlainText(String ext) {
        return StringUtils.inStringIgnoreCase(ext, "txt", "md", "markdown", "json", "xml", "yaml", "yml",
                "csv", "log", "properties", "java", "js", "ts", "vue", "html", "css", "sql", "py", "go",
                "sh", "bat");
    }

    /** Tells whether the extension is one of the recognised image types. */
    private boolean isImage(String ext) {
        return StringUtils.inStringIgnoreCase(ext, "png", "jpg", "jpeg", "webp", "bmp");
    }
}