| | |
| | | import com.opencsv.CSVReader; |
| | | import com.opencsv.CSVReaderBuilder; |
| | | import com.opencsv.exceptions.CsvValidationException; |
| | | import lombok.extern.slf4j.Slf4j; |
| | | import net.sourceforge.tess4j.Tesseract; |
| | | import net.sourceforge.tess4j.TesseractException; |
| | | import org.apache.commons.lang3.ObjectUtils; |
| | |
| | | import java.util.*; |
| | | import java.util.regex.Pattern; |
| | | |
| | | @Slf4j |
| | | public class TakeWords { |
| | | |
| | | private static final String splitIdentifier = "@-@"; // Custom unique identifier string (presumably a split delimiter, going by the name; usage not visible here) |
| | | // Matches text in scientific notation, anchored at both ends: optional sign, integer digits, optional fraction, e/E, optionally signed exponent (e.g. 1.5e-3). |
| | | private static final Pattern SCIENTIFIC_PATTERN = Pattern.compile( |
| | | "^[+-]?\\d+(\\.\\d+)?[eE][+-]?\\d+$" |
| | | ); |
| | | |
| | | /** |
| | | * Column class names excluded when processing MDB databases. |
| | | * The row-reading code visible in this file leaves the value null for any column |
| | | * whose class name (as reported by the result-set metadata) is in this list. |
| | | */ |
| | | private final static List<String> MDB_EXCLUDE_TYPES = Arrays.asList("java.sql.Blob"); |
| | | |
| | | public static Object readWordFile(File file) { |
| | | String result = ""; |
| | |
| | | rs = preparedStatement.executeQuery(); |
| | | ResultSetMetaData data = rs.getMetaData(); |
| | | while (rs.next()) { |
| | | Map<String, String> map = new HashMap<>(); |
| | | Map<String, Object> map = new HashMap<>(); |
| | | for (int i = 1; i <= data.getColumnCount(); i++) { |
| | | //列名 |
| | | String columnName = data.getColumnName(i); |
| | | map.put(columnName, rs.getString(i)); |
| | | //列字段类型 |
| | | String columnClassName = data.getColumnClassName(i); |
| | | Object columnValue = null; |
| | | if(!MDB_EXCLUDE_TYPES.contains(columnClassName)){ |
| | | columnValue = rs.getObject(i); |
| | | } |
| | | map.put(columnName, columnValue); |
| | | } |
| | | list.add(map); |
| | | } |