package com.ruoyi.ai.service;
|
|
import com.ruoyi.common.utils.StringUtils;
import org.apache.poi.hssf.usermodel.HSSFWorkbook;
import org.apache.poi.ss.usermodel.DataFormatter;
import org.apache.poi.ss.usermodel.Row;
import org.apache.poi.ss.usermodel.Sheet;
import org.apache.poi.xssf.usermodel.XSSFWorkbook;
import org.apache.poi.xwpf.extractor.XWPFWordExtractor;
import org.apache.poi.xwpf.usermodel.XWPFDocument;
import org.springframework.stereotype.Component;
import org.springframework.web.multipart.MultipartFile;

import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.nio.ByteBuffer;
import java.nio.charset.CharacterCodingException;
import java.nio.charset.Charset;
import java.nio.charset.CodingErrorAction;
import java.nio.charset.StandardCharsets;
import java.util.Locale;
|
|
@Component
|
public class AiFileTextExtractor {
|
|
private static final long MAX_FILE_SIZE = 10L * 1024 * 1024;
|
|
public String extractText(MultipartFile file) throws IOException {
|
if (file == null || file.isEmpty()) {
|
throw new IllegalArgumentException("文件不能为空");
|
}
|
if (file.getSize() > MAX_FILE_SIZE) {
|
throw new IllegalArgumentException("文件过大,请控制在10MB以内");
|
}
|
|
String filename = file.getOriginalFilename();
|
String ext = getExtension(filename);
|
byte[] bytes = file.getBytes();
|
|
if (isPlainText(ext)) {
|
return decodeText(bytes);
|
}
|
if ("docx".equals(ext)) {
|
return extractDocx(bytes);
|
}
|
if ("xlsx".equals(ext)) {
|
return extractXlsx(bytes);
|
}
|
if ("xls".equals(ext)) {
|
return extractXls(bytes);
|
}
|
if (isImage(ext)) {
|
return "图片文件:" + filename + ",已上传,请结合图片内容识别采购单据、表格和产品明细。";
|
}
|
throw new IllegalArgumentException("暂不支持该文件类型: " + ext);
|
}
|
|
public boolean isImageFile(MultipartFile file) {
|
if (file == null) {
|
return false;
|
}
|
return isImage(getExtension(file.getOriginalFilename()));
|
}
|
|
private String extractDocx(byte[] bytes) throws IOException {
|
try (ByteArrayInputStream inputStream = new ByteArrayInputStream(bytes);
|
XWPFDocument document = new XWPFDocument(inputStream);
|
XWPFWordExtractor extractor = new XWPFWordExtractor(document)) {
|
return extractor.getText();
|
}
|
}
|
|
private String extractXlsx(byte[] bytes) throws IOException {
|
try (ByteArrayInputStream inputStream = new ByteArrayInputStream(bytes);
|
XSSFWorkbook workbook = new XSSFWorkbook(inputStream)) {
|
return extractWorkbook(workbook);
|
}
|
}
|
|
private String extractXls(byte[] bytes) throws IOException {
|
try (ByteArrayInputStream inputStream = new ByteArrayInputStream(bytes);
|
HSSFWorkbook workbook = new HSSFWorkbook(inputStream)) {
|
return extractWorkbook(workbook);
|
}
|
}
|
|
private String extractWorkbook(org.apache.poi.ss.usermodel.Workbook workbook) {
|
StringBuilder text = new StringBuilder();
|
DataFormatter formatter = new DataFormatter();
|
for (int i = 0; i < workbook.getNumberOfSheets(); i++) {
|
Sheet sheet = workbook.getSheetAt(i);
|
text.append("Sheet: ").append(sheet.getSheetName()).append("\n");
|
for (Row row : sheet) {
|
short lastCellNum = row.getLastCellNum();
|
if (lastCellNum <= 0) {
|
text.append("\n");
|
continue;
|
}
|
for (int c = 0; c < lastCellNum; c++) {
|
String cellText = formatter.formatCellValue(row.getCell(c));
|
text.append(cellText);
|
if (c < lastCellNum - 1) {
|
text.append('\t');
|
}
|
}
|
text.append('\n');
|
}
|
}
|
return text.toString();
|
}
|
|
private String decodeText(byte[] bytes) {
|
String utf8 = new String(bytes, StandardCharsets.UTF_8);
|
if (utf8.contains("�")) {
|
return new String(bytes, java.nio.charset.Charset.forName("GBK"));
|
}
|
return utf8;
|
}
|
|
private String getExtension(String filename) {
|
if (!StringUtils.hasText(filename) || !filename.contains(".")) {
|
return "";
|
}
|
return filename.substring(filename.lastIndexOf('.') + 1).toLowerCase();
|
}
|
|
private boolean isPlainText(String ext) {
|
return StringUtils.inStringIgnoreCase(ext,
|
"txt", "md", "markdown", "json", "xml", "yaml", "yml", "csv", "log", "properties",
|
"java", "js", "ts", "vue", "html", "css", "sql", "py", "go", "sh", "bat");
|
}
|
|
private boolean isImage(String ext) {
|
return StringUtils.inStringIgnoreCase(ext, "png", "jpg", "jpeg", "webp", "bmp");
|
}
|
}
|