From 1ca5584d7e3200a9af65a099bd26d3593e2ba702 Mon Sep 17 00:00:00 2001
From: liyong <18434998025@163.com>
Date: 星期四, 07 五月 2026 14:36:08 +0800
Subject: [PATCH] 迁移pro
---
src/main/java/com/ruoyi/ai/service/AiFileTextExtractor.java | 190 +++++++++++++++++++++++++++++++++++++++++++
1 files changed, 190 insertions(+), 0 deletions(-)
diff --git a/src/main/java/com/ruoyi/ai/service/AiFileTextExtractor.java b/src/main/java/com/ruoyi/ai/service/AiFileTextExtractor.java
new file mode 100644
index 0000000..6e37451
--- /dev/null
+++ b/src/main/java/com/ruoyi/ai/service/AiFileTextExtractor.java
@@ -0,0 +1,190 @@
+package com.ruoyi.ai.service;
+
+import com.ruoyi.common.utils.StringUtils;
+import org.apache.poi.hssf.usermodel.HSSFWorkbook;
+import org.apache.poi.ss.usermodel.DataFormatter;
+import org.apache.poi.ss.usermodel.Row;
+import org.apache.poi.ss.usermodel.Sheet;
+import org.apache.poi.ss.usermodel.Workbook;
+import org.apache.poi.xssf.usermodel.XSSFWorkbook;
+import org.apache.poi.xwpf.extractor.XWPFWordExtractor;
+import org.apache.poi.xwpf.usermodel.XWPFDocument;
+import org.springframework.stereotype.Component;
+import org.springframework.web.multipart.MultipartFile;
+
+import java.io.ByteArrayInputStream;
+import java.io.IOException;
+import java.nio.ByteBuffer;
+import java.nio.charset.CharacterCodingException;
+import java.nio.charset.Charset;
+import java.nio.charset.CodingErrorAction;
+import java.nio.charset.StandardCharsets;
+import java.util.Locale;
+
+/**
+ * Extracts plain text from uploaded files.
+ *
+ * <p>Plain-text formats are decoded directly, {@code docx}, {@code xlsx} and {@code xls} are read
+ * with Apache POI, and images are answered with a placeholder sentence because their pixels are
+ * not analysed here.
+ */
+@Component
+public class AiFileTextExtractor {
+
+    /** Upload size limit in bytes (10 MiB). */
+    private static final long MAX_FILE_SIZE = 10L * 1024 * 1024;
+
+    /**
+     * Extracts the textual content of an uploaded file, choosing the strategy by file extension.
+     *
+     * @param file the uploaded file; must be non-null, non-empty and at most 10 MiB
+     * @return the decoded or extracted text; for an image, a placeholder sentence naming the file
+     * @throws IllegalArgumentException if the file is missing, empty, too large, or of an
+     *         unsupported type
+     * @throws IOException if reading the upload or parsing the document fails with an I/O error
+     */
+    public String extractText(MultipartFile file) throws IOException {
+        if (file == null || file.isEmpty()) {
+            throw new IllegalArgumentException("文件不能为空");
+        }
+        if (file.getSize() > MAX_FILE_SIZE) {
+            throw new IllegalArgumentException("文件过大，请控制在10MB以内");
+        }
+
+        String filename = file.getOriginalFilename();
+        String ext = getExtension(filename);
+
+        // Images and unsupported types never need the payload, so decide before reading it.
+        if (isImage(ext)) {
+            return "图片文件：" + filename + "，已上传，请结合图片内容识别采购单据、表格和产品明细。";
+        }
+        if (isPlainText(ext)) {
+            return decodeText(file.getBytes());
+        }
+        switch (ext) {
+            case "docx":
+                return extractDocx(file.getBytes());
+            case "xlsx":
+                return extractXlsx(file.getBytes());
+            case "xls":
+                return extractXls(file.getBytes());
+            default:
+                throw new IllegalArgumentException("暂不支持该文件类型: " + ext);
+        }
+    }
+
+    /**
+     * Tells whether the upload's filename carries an image extension.
+     *
+     * @param file the uploaded file, may be {@code null}
+     * @return {@code true} if the extension is png, jpg, jpeg, webp or bmp (in any letter case);
+     *         {@code false} otherwise, including for a {@code null} file
+     */
+    public boolean isImageFile(MultipartFile file) {
+        return file != null && isImage(getExtension(file.getOriginalFilename()));
+    }
+
+    /** Reads the text of a {@code .docx} document held in memory. */
+    private String extractDocx(byte[] bytes) throws IOException {
+        try (XWPFDocument document = new XWPFDocument(new ByteArrayInputStream(bytes));
+             XWPFWordExtractor extractor = new XWPFWordExtractor(document)) {
+            return extractor.getText();
+        }
+    }
+
+    /** Renders an {@code .xlsx} workbook held in memory as tab-separated text. */
+    private String extractXlsx(byte[] bytes) throws IOException {
+        try (XSSFWorkbook workbook = new XSSFWorkbook(new ByteArrayInputStream(bytes))) {
+            return extractWorkbook(workbook);
+        }
+    }
+
+    /** Renders a legacy {@code .xls} workbook held in memory as tab-separated text. */
+    private String extractXls(byte[] bytes) throws IOException {
+        try (HSSFWorkbook workbook = new HSSFWorkbook(new ByteArrayInputStream(bytes))) {
+            return extractWorkbook(workbook);
+        }
+    }
+
+    /**
+     * Renders a workbook as text: a {@code "Sheet: <name>"} line per sheet, followed by one line per
+     * row with the formatted cell values separated by tabs.
+     *
+     * @param workbook the opened workbook
+     * @return the rendered text, each line terminated by {@code '\n'}
+     */
+    private String extractWorkbook(Workbook workbook) {
+        StringBuilder text = new StringBuilder();
+        // NOTE(review): per the POI docs this overload does not evaluate formula cells - confirm
+        // that formula text (rather than the computed value) is what callers want.
+        DataFormatter formatter = new DataFormatter();
+        for (int i = 0; i < workbook.getNumberOfSheets(); i++) {
+            Sheet sheet = workbook.getSheetAt(i);
+            text.append("Sheet: ").append(sheet.getSheetName()).append('\n');
+            for (Row row : sheet) {
+                // A row without cells reports a non-positive count and yields just an empty line.
+                int cellCount = row.getLastCellNum();
+                for (int c = 0; c < cellCount; c++) {
+                    if (c > 0) {
+                        text.append('\t');
+                    }
+                    text.append(formatter.formatCellValue(row.getCell(c)));
+                }
+                text.append('\n');
+            }
+        }
+        return text.toString();
+    }
+
+    /**
+     * Decodes raw bytes as text, preferring UTF-8 and falling back to GBK.
+     *
+     * <p>UTF-8 is decoded strictly, so malformed input is detected by the decoder itself rather than
+     * by searching the result for replacement characters, which valid text may legitimately contain.
+     *
+     * @param bytes the raw file content
+     * @return the UTF-8 decoding when the bytes are well-formed UTF-8, otherwise the GBK decoding
+     */
+    private String decodeText(byte[] bytes) {
+        try {
+            return StandardCharsets.UTF_8.newDecoder()
+                    .onMalformedInput(CodingErrorAction.REPORT)
+                    .onUnmappableCharacter(CodingErrorAction.REPORT)
+                    .decode(ByteBuffer.wrap(bytes))
+                    .toString();
+        } catch (CharacterCodingException notUtf8) {
+            return new String(bytes, Charset.forName("GBK"));
+        }
+    }
+
+    /**
+     * Returns the lower-cased text after the last dot of a filename.
+     *
+     * @param filename the original filename, may be {@code null}
+     * @return the extension without the dot, or {@code ""} when {@code StringUtils.hasText}
+     *         rejects the name or it contains no dot
+     */
+    private String getExtension(String filename) {
+        if (!StringUtils.hasText(filename)) {
+            return "";
+        }
+        int dot = filename.lastIndexOf('.');
+        if (dot < 0) {
+            return "";
+        }
+        // Locale.ROOT keeps the result independent of the server's default locale.
+        return filename.substring(dot + 1).toLowerCase(Locale.ROOT);
+    }
+
+    /** Tells whether the extension names a format that is decoded as plain text. */
+    private boolean isPlainText(String ext) {
+        return StringUtils.inStringIgnoreCase(ext,
+                "txt", "md", "markdown", "json", "xml", "yaml", "yml", "csv", "log", "properties",
+                "java", "js", "ts", "vue", "html", "css", "sql", "py", "go", "sh", "bat");
+    }
+
+    /** Tells whether the extension names a supported image format. */
+    private boolean isImage(String ext) {
+        return StringUtils.inStringIgnoreCase(ext, "png", "jpg", "jpeg", "webp", "bmp");
+    }
+}
--
Gitblit v1.9.3