| 对比新文件 |
| | |
| | | package com.ruoyi.ai.service; |
| | | |
| | | import com.ruoyi.common.utils.StringUtils; |
| | | import org.apache.poi.hssf.usermodel.HSSFWorkbook; |
| | | import org.apache.poi.ss.usermodel.DataFormatter; |
| | | import org.apache.poi.ss.usermodel.Row; |
| | | import org.apache.poi.ss.usermodel.Sheet; |
| | | import org.apache.poi.xssf.usermodel.XSSFWorkbook; |
| | | import org.apache.poi.xwpf.extractor.XWPFWordExtractor; |
| | | import org.apache.poi.xwpf.usermodel.XWPFDocument; |
| | | import org.springframework.stereotype.Component; |
| | | import org.springframework.web.multipart.MultipartFile; |
| | | |
| | | import java.io.ByteArrayInputStream; |
| | | import java.io.IOException; |
| | | import java.nio.charset.StandardCharsets; |
| | | |
| | | @Component |
| | | public class AiFileTextExtractor { |
| | | |
| | | private static final long MAX_FILE_SIZE = 10L * 1024 * 1024; |
| | | |
| | | public String extractText(MultipartFile file) throws IOException { |
| | | if (file == null || file.isEmpty()) { |
| | | throw new IllegalArgumentException("æä»¶ä¸è½ä¸ºç©º"); |
| | | } |
| | | if (file.getSize() > MAX_FILE_SIZE) { |
| | | throw new IllegalArgumentException("æä»¶è¿å¤§ï¼è¯·æ§å¶å¨10MB以å
"); |
| | | } |
| | | |
| | | String filename = file.getOriginalFilename(); |
| | | String ext = getExtension(filename); |
| | | byte[] bytes = file.getBytes(); |
| | | |
| | | if (isPlainText(ext)) { |
| | | return decodeText(bytes); |
| | | } |
| | | if ("docx".equals(ext)) { |
| | | return extractDocx(bytes); |
| | | } |
| | | if ("xlsx".equals(ext)) { |
| | | return extractXlsx(bytes); |
| | | } |
| | | if ("xls".equals(ext)) { |
| | | return extractXls(bytes); |
| | | } |
| | | if (isImage(ext)) { |
| | | return "å¾çæä»¶ï¼" + filename + "ï¼å·²ä¸ä¼ ï¼è¯·ç»åå¾çå
容è¯å«éè´åæ®ãè¡¨æ ¼å产åæç»ã"; |
| | | } |
| | | throw new IllegalArgumentException("æä¸æ¯æè¯¥æä»¶ç±»å: " + ext); |
| | | } |
| | | |
| | | public boolean isImageFile(MultipartFile file) { |
| | | if (file == null) { |
| | | return false; |
| | | } |
| | | return isImage(getExtension(file.getOriginalFilename())); |
| | | } |
| | | |
| | | private String extractDocx(byte[] bytes) throws IOException { |
| | | try (ByteArrayInputStream inputStream = new ByteArrayInputStream(bytes); |
| | | XWPFDocument document = new XWPFDocument(inputStream); |
| | | XWPFWordExtractor extractor = new XWPFWordExtractor(document)) { |
| | | return extractor.getText(); |
| | | } |
| | | } |
| | | |
| | | private String extractXlsx(byte[] bytes) throws IOException { |
| | | try (ByteArrayInputStream inputStream = new ByteArrayInputStream(bytes); |
| | | XSSFWorkbook workbook = new XSSFWorkbook(inputStream)) { |
| | | return extractWorkbook(workbook); |
| | | } |
| | | } |
| | | |
| | | private String extractXls(byte[] bytes) throws IOException { |
| | | try (ByteArrayInputStream inputStream = new ByteArrayInputStream(bytes); |
| | | HSSFWorkbook workbook = new HSSFWorkbook(inputStream)) { |
| | | return extractWorkbook(workbook); |
| | | } |
| | | } |
| | | |
| | | private String extractWorkbook(org.apache.poi.ss.usermodel.Workbook workbook) { |
| | | StringBuilder text = new StringBuilder(); |
| | | DataFormatter formatter = new DataFormatter(); |
| | | for (int i = 0; i < workbook.getNumberOfSheets(); i++) { |
| | | Sheet sheet = workbook.getSheetAt(i); |
| | | text.append("Sheet: ").append(sheet.getSheetName()).append("\n"); |
| | | for (Row row : sheet) { |
| | | short lastCellNum = row.getLastCellNum(); |
| | | if (lastCellNum <= 0) { |
| | | text.append("\n"); |
| | | continue; |
| | | } |
| | | for (int c = 0; c < lastCellNum; c++) { |
| | | String cellText = formatter.formatCellValue(row.getCell(c)); |
| | | text.append(cellText); |
| | | if (c < lastCellNum - 1) { |
| | | text.append('\t'); |
| | | } |
| | | } |
| | | text.append('\n'); |
| | | } |
| | | } |
| | | return text.toString(); |
| | | } |
| | | |
| | | private String decodeText(byte[] bytes) { |
| | | String utf8 = new String(bytes, StandardCharsets.UTF_8); |
| | | if (utf8.contains("�")) { |
| | | return new String(bytes, java.nio.charset.Charset.forName("GBK")); |
| | | } |
| | | return utf8; |
| | | } |
| | | |
| | | private String getExtension(String filename) { |
| | | if (!StringUtils.hasText(filename) || !filename.contains(".")) { |
| | | return ""; |
| | | } |
| | | return filename.substring(filename.lastIndexOf('.') + 1).toLowerCase(); |
| | | } |
| | | |
| | | private boolean isPlainText(String ext) { |
| | | return StringUtils.inStringIgnoreCase(ext, |
| | | "txt", "md", "markdown", "json", "xml", "yaml", "yml", "csv", "log", "properties", |
| | | "java", "js", "ts", "vue", "html", "css", "sql", "py", "go", "sh", "bat"); |
| | | } |
| | | |
| | | private boolean isImage(String ext) { |
| | | return StringUtils.inStringIgnoreCase(ext, "png", "jpg", "jpeg", "webp", "bmp"); |
| | | } |
| | | } |