From f34f57d81cca8d498f396ee4d93c0db98a6c7401 Mon Sep 17 00:00:00 2001 From: Alexandre Oliveira Date: Sat, 7 Sep 2024 11:24:00 -0300 Subject: [PATCH 01/10] - TurFileAttributes --- .../com/viglet/turing/api/llm/TurLlmAPI.java | 4 +- .../turing/api/ml/data/TurMLDataAPI.java | 3 +- .../turing/api/nlp/TurNLPInstanceAPI.java | 14 +++--- .../com/viglet/turing/api/ocr/TurOcrAPI.java | 3 +- .../turing/api/sn/bean/TurSolrCoreExists.java | 4 +- .../turing/api/sn/job/TurSNImportAPI.java | 8 ++-- .../commons/file/TurFileAttributes.java | 37 +++++++++++++++ .../turing/commons/file/TurFileSize.java | 46 +++++++++++++++++++ .../turing/connector/db/TurDbImportTool.java | 30 ++++++------ .../filesystem/commons/TurFileUtils.java | 29 ++++++++---- ...ibutes.java => TurTikaFileAttributes.java} | 4 +- .../com/viglet/turing/client/ocr/TurOcr.java | 7 +-- 12 files changed, 142 insertions(+), 47 deletions(-) create mode 100644 turing-commons/src/main/java/com/viglet/turing/commons/file/TurFileAttributes.java create mode 100644 turing-commons/src/main/java/com/viglet/turing/commons/file/TurFileSize.java rename turing-filesystem/fs-commons/src/main/java/com/viglet/turing/filesystem/commons/{TurFileAttributes.java => TurTikaFileAttributes.java} (90%) diff --git a/turing-app/src/main/java/com/viglet/turing/api/llm/TurLlmAPI.java b/turing-app/src/main/java/com/viglet/turing/api/llm/TurLlmAPI.java index f63cf2849a2..bfb06577f48 100644 --- a/turing-app/src/main/java/com/viglet/turing/api/llm/TurLlmAPI.java +++ b/turing-app/src/main/java/com/viglet/turing/api/llm/TurLlmAPI.java @@ -45,6 +45,7 @@ import java.net.http.HttpRequest; import java.net.http.HttpResponse; import java.util.List; +import java.util.Objects; import java.util.Optional; @Slf4j @@ -98,8 +99,7 @@ public RedactionScript validateText(@RequestParam("text") String text) @PostMapping(value = "/entity/file/blazon") public RedactionScript validateFile(@RequestParam("file") MultipartFile multipartFile) throws IOException, InterruptedException { - final String text = TurFileUtils.documentToText(multipartFile); - return getEntities(text); + return getEntities(Objects.requireNonNull(TurFileUtils.documentToText(multipartFile)).getContent()); } diff --git a/turing-app/src/main/java/com/viglet/turing/api/ml/data/TurMLDataAPI.java b/turing-app/src/main/java/com/viglet/turing/api/ml/data/TurMLDataAPI.java index b6943fc40e5..50634fff4d1 100644 --- a/turing-app/src/main/java/com/viglet/turing/api/ml/data/TurMLDataAPI.java +++ b/turing-app/src/main/java/com/viglet/turing/api/ml/data/TurMLDataAPI.java @@ -40,6 +40,7 @@ import java.util.Arrays; import java.util.List; +import java.util.Objects; @Slf4j @RestController @@ -104,7 +105,7 @@ public TurData turDataAdd(@RequestBody TurData turData) { @Transactional public String turDataImport(@RequestParam("file") MultipartFile multipartFile) { String[] sentences = turOpenNLPConnector.sentenceDetect(turNLPProcess.getDefaultNLPInstance(), - TurFileUtils.documentToText(multipartFile)); + Objects.requireNonNull(TurFileUtils.documentToText(multipartFile)).getContent()); TurData turData = new TurData(); turData.setName(multipartFile.getOriginalFilename()); turData.setType(FilenameUtils.getExtension(multipartFile.getOriginalFilename())); diff --git a/turing-app/src/main/java/com/viglet/turing/api/nlp/TurNLPInstanceAPI.java b/turing-app/src/main/java/com/viglet/turing/api/nlp/TurNLPInstanceAPI.java index 2cc1b63b152..0c44fa3f1d3 100644 --- a/turing-app/src/main/java/com/viglet/turing/api/nlp/TurNLPInstanceAPI.java +++ b/turing-app/src/main/java/com/viglet/turing/api/nlp/TurNLPInstanceAPI.java @@ -27,9 +27,8 @@ import com.viglet.turing.api.nlp.bean.TurNLPEntityValidateResponse; import com.viglet.turing.api.nlp.bean.TurNLPValidateDocument; import com.viglet.turing.api.nlp.bean.TurNLPValidateResponse; -import com.viglet.turing.commons.utils.TurCommonsUtils; -import com.viglet.turing.filesystem.commons.TurFileAttributes; import com.viglet.turing.filesystem.commons.TurFileUtils; +import com.viglet.turing.filesystem.commons.TurTikaFileAttributes; import com.viglet.turing.nlp.TurNLPProcess; import com.viglet.turing.nlp.TurNLPResponse; import com.viglet.turing.nlp.TurNLPUtils; @@ -54,6 +53,7 @@ import java.io.File; import java.util.Collections; import java.util.List; +import java.util.Objects; import java.util.Optional; import java.util.stream.Collectors; @@ -134,7 +134,7 @@ public TurNLPInstance turNLPInstanceAdd(@RequestBody TurNLPInstance turNLPInstan @PostMapping(value = "/{id}/validate/file/blazon", produces = MediaType.APPLICATION_XML_VALUE) public RedactionScript validateFile(@RequestParam("file") MultipartFile multipartFile, @PathVariable String id) { - final String text = TurFileUtils.documentToText(multipartFile); + final String text = Objects.requireNonNull(TurFileUtils.documentToText(multipartFile)).getContent(); TurNLPTextValidate textValidate = new TurNLPTextValidate(); textValidate.setText(text); return this.turNLPInstanceRepository.findById(id).map(turNLPInstance -> @@ -149,18 +149,18 @@ public TurNLPValidateResponse validateDocument(@PathVariable String id, @RequestParam("config") String turNLPValidateDocumentRequest) { File file = TurSpringUtils.getFileFromMultipart(multipartFile); - TurFileAttributes turFileAttributes = TurFileUtils.readFile(file); + TurTikaFileAttributes turTikaFileAttributes = TurFileUtils.readFile(file); return this.turNLPInstanceRepository.findById(id) .map(turNLPInstance -> { try { TurNLPValidateDocument turNLPValidateDocument = new ObjectMapper().readValue(turNLPValidateDocumentRequest, TurNLPValidateDocument.class); - if (turFileAttributes != null && turNLPValidateDocument != null) { + if (turTikaFileAttributes != null && turNLPValidateDocument != null) { TurNLPResponse turNLPResponse = turNLPProcess.processTextByNLP(turNLPInstance, - turFileAttributes.getContent(), turNLPValidateDocument.getEntities()); + turTikaFileAttributes.getContent(), turNLPValidateDocument.getEntities()); List terms = getNLPTerms(turNLPResponse); turNLPUtils.redactPdf(file, terms); - return createNLPValidateResponse(turNLPInstance, turNLPResponse, turFileAttributes.getContent()); + return createNLPValidateResponse(turNLPInstance, turNLPResponse, turTikaFileAttributes.getContent()); } } catch (JsonProcessingException e) { log.error(e.getMessage(), e); diff --git a/turing-app/src/main/java/com/viglet/turing/api/ocr/TurOcrAPI.java b/turing-app/src/main/java/com/viglet/turing/api/ocr/TurOcrAPI.java index 63d23a3a865..83eb5b81e00 100644 --- a/turing-app/src/main/java/com/viglet/turing/api/ocr/TurOcrAPI.java +++ b/turing-app/src/main/java/com/viglet/turing/api/ocr/TurOcrAPI.java @@ -1,5 +1,6 @@ package com.viglet.turing.api.ocr; +import com.viglet.turing.commons.file.TurFileAttributes; import com.viglet.turing.filesystem.commons.TurFileUtils; import io.swagger.v3.oas.annotations.tags.Tag; import lombok.extern.slf4j.Slf4j; @@ -16,7 +17,7 @@ public class TurOcrAPI { @PostMapping - public String convertToText(@RequestParam("file") MultipartFile multipartFile) { + public TurFileAttributes convertToText(@RequestParam("file") MultipartFile multipartFile) { return TurFileUtils.documentToText(multipartFile); } } diff --git a/turing-app/src/main/java/com/viglet/turing/api/sn/bean/TurSolrCoreExists.java b/turing-app/src/main/java/com/viglet/turing/api/sn/bean/TurSolrCoreExists.java index 99327c66444..a0eaf58a4a8 100644 --- a/turing-app/src/main/java/com/viglet/turing/api/sn/bean/TurSolrCoreExists.java +++ b/turing-app/src/main/java/com/viglet/turing/api/sn/bean/TurSolrCoreExists.java @@ -9,8 +9,8 @@ @Setter @Builder public class TurSolrCoreExists { - String name; - boolean exists; + private String name; + private boolean exists; @Tolerate public TurSolrCoreExists() { diff --git a/turing-app/src/main/java/com/viglet/turing/api/sn/job/TurSNImportAPI.java b/turing-app/src/main/java/com/viglet/turing/api/sn/job/TurSNImportAPI.java index 6801113da07..6fd0fbbef5d 100644 --- a/turing-app/src/main/java/com/viglet/turing/api/sn/job/TurSNImportAPI.java +++ b/turing-app/src/main/java/com/viglet/turing/api/sn/job/TurSNImportAPI.java @@ -27,7 +27,7 @@ import com.viglet.turing.client.sn.job.TurSNJobItem; import com.viglet.turing.client.sn.job.TurSNJobItems; import com.viglet.turing.commons.utils.TurCommonsUtils; -import com.viglet.turing.filesystem.commons.TurFileAttributes; +import com.viglet.turing.filesystem.commons.TurTikaFileAttributes; import com.viglet.turing.filesystem.commons.TurFileUtils; import com.viglet.turing.persistence.repository.sn.TurSNSiteRepository; import com.viglet.turing.sn.TurSNConstants; @@ -110,9 +110,9 @@ private void extractTextOfFileAttribute(File extractFolder, Map.Entry attribute.setValue(TurCommonsUtils.cleanTextContent(content))); } catch (IOException e) { log.error(e.getMessage(), e); diff --git a/turing-commons/src/main/java/com/viglet/turing/commons/file/TurFileAttributes.java b/turing-commons/src/main/java/com/viglet/turing/commons/file/TurFileAttributes.java new file mode 100644 index 00000000000..bdfddde2fd4 --- /dev/null +++ b/turing-commons/src/main/java/com/viglet/turing/commons/file/TurFileAttributes.java @@ -0,0 +1,37 @@ +/* + * + * Copyright (C) 2016-2024 the original author or authors. + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . + */ +package com.viglet.turing.commons.file; + +import lombok.*; + +/** +* +* @author Alexandre Oliveira +* +* @since 0.3.9 +* +**/ + +@Setter +@Getter +public class TurFileAttributes { + private String content; + private String name; + private String extension; + private TurFileSize size; +} diff --git a/turing-commons/src/main/java/com/viglet/turing/commons/file/TurFileSize.java b/turing-commons/src/main/java/com/viglet/turing/commons/file/TurFileSize.java new file mode 100644 index 00000000000..0605032c270 --- /dev/null +++ b/turing-commons/src/main/java/com/viglet/turing/commons/file/TurFileSize.java @@ -0,0 +1,46 @@ +/* + * + * Copyright (C) 2016-2024 the original author or authors. + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . + */ + +package com.viglet.turing.commons.file; + +import lombok.Getter; + +import java.math.BigDecimal; +import java.math.RoundingMode; + +/** + * @author Alexandre Oliveira + * @since 0.3.9 + **/ +@Getter +public class TurFileSize { + private final float bytes; + private final float kiloBytes; + private final float megaBytes; + + public TurFileSize(float bytes) { + this.bytes = twoDecimalFloat(bytes); + this.kiloBytes = twoDecimalFloat(this.bytes / 1024); + this.megaBytes = twoDecimalFloat(this.kiloBytes / 1024); + } + + private float twoDecimalFloat(float value) { + return new BigDecimal(value).setScale(2, RoundingMode.HALF_UP).floatValue(); + } + +} diff --git a/turing-db/db-app/src/main/java/com/viglet/turing/connector/db/TurDbImportTool.java b/turing-db/db-app/src/main/java/com/viglet/turing/connector/db/TurDbImportTool.java index 1a751933296..a9404edc01e 100644 --- a/turing-db/db-app/src/main/java/com/viglet/turing/connector/db/TurDbImportTool.java +++ b/turing-db/db-app/src/main/java/com/viglet/turing/connector/db/TurDbImportTool.java @@ -27,7 +27,7 @@ import com.viglet.turing.commons.cache.TurCustomClassCache; import com.viglet.turing.connector.db.ext.TurDbExtCustomImpl; import com.viglet.turing.connector.db.format.TurDbFormatValue; -import com.viglet.turing.filesystem.commons.TurFileAttributes; +import com.viglet.turing.filesystem.commons.TurTikaFileAttributes; import com.viglet.turing.filesystem.commons.TurFileUtils; import lombok.Getter; import lombok.Setter; @@ -302,23 +302,23 @@ private void addDBFieldsAsAttributes(ResultSet rs, Map attribute private void addFileAttributes(Map attributes) { if (filePathField != null && attributes.containsKey(filePathField)) { - TurFileAttributes turFileAttributes = TurFileUtils.readFile((String) attributes.get(filePathField)); - if (turFileAttributes != null) { - addFileSizeAttribute(attributes, turFileAttributes); - addFileContentAttribute(attributes, turFileAttributes); + TurTikaFileAttributes turTikaFileAttributes = TurFileUtils.readFile((String) attributes.get(filePathField)); + if (turTikaFileAttributes != null) { + addFileSizeAttribute(attributes, turTikaFileAttributes); + addFileContentAttribute(attributes, turTikaFileAttributes); } } } - private void addFileContentAttribute(Map attributes, TurFileAttributes turFileAttributes) { + private void addFileContentAttribute(Map attributes, TurTikaFileAttributes turTikaFileAttributes) { if (fileContentField != null) { long maxContentByteSize = maxContentMegaByteSize * MEGA_BYTE; - if (turFileAttributes.getContent().getBytes().length <= maxContentByteSize) { - attributes.put(fileContentField, turFileAttributes.getContent()); + if (turTikaFileAttributes.getContent().getBytes().length <= maxContentByteSize) { + attributes.put(fileContentField, turTikaFileAttributes.getContent()); } else { attributes.put(fileContentField, - turFileAttributes.getContent().substring(0, Math.toIntExact(maxContentByteSize))); + turTikaFileAttributes.getContent().substring(0, Math.toIntExact(maxContentByteSize))); if (log.isDebugEnabled()) { log.debug("File size greater than {}, truncating content ...:", FileUtils.byteCountToDisplaySize(maxContentByteSize)); @@ -329,14 +329,14 @@ private void addFileContentAttribute(Map attributes, TurFileAttr } } - private void addFileSizeAttribute(Map attributes, TurFileAttributes turFileAttributes) { - if (fileSizeField != null && turFileAttributes.getFile() != null) { - attributes.put(fileSizeField, turFileAttributes.getFile().length()); + private void addFileSizeAttribute(Map attributes, TurTikaFileAttributes turTikaFileAttributes) { + if (fileSizeField != null && turTikaFileAttributes.getFile() != null) { + attributes.put(fileSizeField, turTikaFileAttributes.getFile().length()); if (log.isDebugEnabled()) { - log.debug("File: {}", turFileAttributes.getFile().getAbsolutePath()); - log.debug("File size: {}", FileUtils.byteCountToDisplaySize(turFileAttributes.getFile().length())); + log.debug("File: {}", turTikaFileAttributes.getFile().getAbsolutePath()); + log.debug("File size: {}", FileUtils.byteCountToDisplaySize(turTikaFileAttributes.getFile().length())); log.debug("File - Content size: {}", - FileUtils.byteCountToDisplaySize(turFileAttributes.getContent().getBytes().length)); + FileUtils.byteCountToDisplaySize(turTikaFileAttributes.getContent().getBytes().length)); } } else { log.debug("File without size: {}", filePathField); diff --git a/turing-filesystem/fs-commons/src/main/java/com/viglet/turing/filesystem/commons/TurFileUtils.java b/turing-filesystem/fs-commons/src/main/java/com/viglet/turing/filesystem/commons/TurFileUtils.java index a5770288b88..1481c4c376b 100644 --- a/turing-filesystem/fs-commons/src/main/java/com/viglet/turing/filesystem/commons/TurFileUtils.java +++ b/turing-filesystem/fs-commons/src/main/java/com/viglet/turing/filesystem/commons/TurFileUtils.java @@ -1,7 +1,10 @@ package com.viglet.turing.filesystem.commons; +import com.viglet.turing.commons.file.TurFileAttributes; +import com.viglet.turing.commons.file.TurFileSize; import com.viglet.turing.commons.utils.TurCommonsUtils; import lombok.extern.slf4j.Slf4j; +import org.apache.commons.io.FilenameUtils; import org.apache.tika.exception.TikaException; import org.apache.tika.extractor.EmbeddedDocumentExtractor; import org.apache.tika.metadata.Metadata; @@ -31,11 +34,11 @@ private TurFileUtils() { throw new IllegalStateException("Turing File Utilities class"); } - public static TurFileAttributes readFile(String filePath) { + public static TurTikaFileAttributes readFile(String filePath) { return readFile(new File(filePath)); } - public static TurFileAttributes readFile(File file) { + public static TurTikaFileAttributes readFile(File file) { if (file.exists()) { return parseFile(file); } else { @@ -44,7 +47,7 @@ public static TurFileAttributes readFile(File file) { } } - public static TurFileAttributes parseFile(File file) { + public static TurTikaFileAttributes parseFile(File file) { try (InputStream fileInputStreamAttribute = new FileInputStream(file)) { return parseFile(fileInputStreamAttribute, file); @@ -54,7 +57,7 @@ public static TurFileAttributes parseFile(File file) { return null; } - public static TurFileAttributes parseFile(InputStream inputStream, File file) { + public static TurTikaFileAttributes parseFile(InputStream inputStream, File file) { try (inputStream) { StringBuilder contentFile = new StringBuilder(); AutoDetectParser parser = new AutoDetectParser(); @@ -78,7 +81,7 @@ public void parseEmbedded(InputStream stream, ContentHandler handler, Metadata m parseContext.set(EmbeddedDocumentExtractor.class, embeddedDocumentExtractor); parser.parse(inputStream, handler, metadata, parseContext); contentFile.append(handler); - return new TurFileAttributes(file, contentFile.toString(), metadata); + return new TurTikaFileAttributes(file, contentFile.toString(), metadata); } catch (IOException | SAXException | TikaException e) { log.error(e.getMessage(), e); } @@ -96,12 +99,18 @@ private static ParseContext getParseContext(AutoDetectParser parser) { return parseContext; } - public static String documentToText(MultipartFile multipartFile) { + public static TurFileAttributes documentToText(MultipartFile multipartFile) { + try (InputStream inputStream = multipartFile.getInputStream()) { - TurFileAttributes turFileAttributes = parseFile(inputStream, null); - return Optional.ofNullable(turFileAttributes) - .map(TurFileAttributes::getContent) - .orElse(null); + TurTikaFileAttributes turTikaFileAttributes = parseFile(inputStream, null); + TurFileAttributes turFileAttributes = new TurFileAttributes(); + Optional.ofNullable(turTikaFileAttributes).ifPresent(attributes -> { + turFileAttributes.setContent(attributes.getContent()); + turFileAttributes.setName(multipartFile.getOriginalFilename()); + turFileAttributes.setExtension(FilenameUtils.getExtension(multipartFile.getOriginalFilename())); + turFileAttributes.setSize(new TurFileSize(multipartFile.getSize())); + }); + return turFileAttributes; } catch (IOException e) { log.error(e.getMessage(), e); } diff --git a/turing-filesystem/fs-commons/src/main/java/com/viglet/turing/filesystem/commons/TurFileAttributes.java b/turing-filesystem/fs-commons/src/main/java/com/viglet/turing/filesystem/commons/TurTikaFileAttributes.java similarity index 90% rename from turing-filesystem/fs-commons/src/main/java/com/viglet/turing/filesystem/commons/TurFileAttributes.java rename to turing-filesystem/fs-commons/src/main/java/com/viglet/turing/filesystem/commons/TurTikaFileAttributes.java index 46031a50127..3720689844e 100644 --- a/turing-filesystem/fs-commons/src/main/java/com/viglet/turing/filesystem/commons/TurFileAttributes.java +++ b/turing-filesystem/fs-commons/src/main/java/com/viglet/turing/filesystem/commons/TurTikaFileAttributes.java @@ -31,12 +31,12 @@ **/ @Setter @Getter -public class TurFileAttributes { +public class TurTikaFileAttributes { private File file; private String content; private Metadata metadata; - public TurFileAttributes(File file, String content, Metadata metadata) { + public TurTikaFileAttributes(File file, String content, Metadata metadata) { super(); this.file = file; this.content = content; diff --git a/turing-java-sdk/src/main/java/com/viglet/turing/client/ocr/TurOcr.java b/turing-java-sdk/src/main/java/com/viglet/turing/client/ocr/TurOcr.java index d6a846059a0..77ce32c0d73 100644 --- a/turing-java-sdk/src/main/java/com/viglet/turing/client/ocr/TurOcr.java +++ b/turing-java-sdk/src/main/java/com/viglet/turing/client/ocr/TurOcr.java @@ -1,7 +1,9 @@ package com.viglet.turing.client.ocr; +import com.fasterxml.jackson.databind.ObjectMapper; import com.viglet.turing.client.auth.TurServer; import com.viglet.turing.client.utils.TurClientUtils; +import com.viglet.turing.commons.file.TurFileAttributes; import lombok.extern.slf4j.Slf4j; import org.apache.hc.client5.http.classic.methods.HttpPost; import org.apache.hc.client5.http.entity.mime.*; @@ -11,14 +13,13 @@ import org.apache.hc.core5.http.HttpEntity; import org.apache.hc.core5.http.io.entity.EntityUtils; -import java.io.ByteArrayInputStream; import java.io.File; import java.io.IOException; @Slf4j public class TurOcr { - public static String processFile(TurServer turServer, File file, boolean showOutput) { + public static TurFileAttributes processFile(TurServer turServer, File file, boolean showOutput) { try (CloseableHttpClient client = HttpClients.createDefault()) { FileBody fileBody = new FileBody(file, ContentType.DEFAULT_BINARY); String url = String.format("%s/api/ocr", turServer.getServerUrl()); @@ -34,7 +35,7 @@ public static String processFile(TurServer turServer, File file, boolean showOut if (showOutput) { System.out.println(responseBody); } - return responseBody; + return new ObjectMapper().readValue(responseBody, TurFileAttributes.class); } catch (IOException e) { log.error(e.getMessage(), e); return null; From 5dcb04c7b95fa2963c6b468f8f64941544ee5aa7 Mon Sep 17 00:00:00 2001 From: Alexandre Oliveira Date: Sat, 7 Sep 2024 11:47:24 -0300 Subject: [PATCH 02/10] - OCR: Metadata and Title --- .../turing/commons/file/TurFileAttributes.java | 5 +++++ .../turing/filesystem/commons/TurFileUtils.java | 14 ++++++++++++-- 2 files changed, 17 insertions(+), 2 deletions(-) diff --git a/turing-commons/src/main/java/com/viglet/turing/commons/file/TurFileAttributes.java b/turing-commons/src/main/java/com/viglet/turing/commons/file/TurFileAttributes.java index bdfddde2fd4..77496a9c4f9 100644 --- a/turing-commons/src/main/java/com/viglet/turing/commons/file/TurFileAttributes.java +++ b/turing-commons/src/main/java/com/viglet/turing/commons/file/TurFileAttributes.java @@ -19,6 +19,9 @@ import lombok.*; +import java.util.Date; +import java.util.Map; + /** * * @author Alexandre Oliveira @@ -32,6 +35,8 @@ public class TurFileAttributes { private String content; private String name; + private String title; private String extension; private TurFileSize size; + private Map metadata; } diff --git a/turing-filesystem/fs-commons/src/main/java/com/viglet/turing/filesystem/commons/TurFileUtils.java b/turing-filesystem/fs-commons/src/main/java/com/viglet/turing/filesystem/commons/TurFileUtils.java index 1481c4c376b..8bf0c317783 100644 --- a/turing-filesystem/fs-commons/src/main/java/com/viglet/turing/filesystem/commons/TurFileUtils.java +++ b/turing-filesystem/fs-commons/src/main/java/com/viglet/turing/filesystem/commons/TurFileUtils.java @@ -24,12 +24,13 @@ import java.io.InputStream; import java.nio.file.Files; import java.nio.file.StandardCopyOption; -import java.util.Optional; -import java.util.UUID; +import java.util.*; @Slf4j public class TurFileUtils { + public static final String PDF_DOCINFO_TITLE = "pdf:docinfo:title"; + private TurFileUtils() { throw new IllegalStateException("Turing File Utilities class"); } @@ -103,12 +104,21 @@ public static TurFileAttributes documentToText(MultipartFile multipartFile) { try (InputStream inputStream = multipartFile.getInputStream()) { TurTikaFileAttributes turTikaFileAttributes = parseFile(inputStream, null); + TurFileAttributes turFileAttributes = new TurFileAttributes(); Optional.ofNullable(turTikaFileAttributes).ifPresent(attributes -> { turFileAttributes.setContent(attributes.getContent()); turFileAttributes.setName(multipartFile.getOriginalFilename()); turFileAttributes.setExtension(FilenameUtils.getExtension(multipartFile.getOriginalFilename())); turFileAttributes.setSize(new TurFileSize(multipartFile.getSize())); + turFileAttributes.setTitle(Optional.ofNullable(turTikaFileAttributes + .getMetadata() + .get(PDF_DOCINFO_TITLE)) + .orElseGet(turFileAttributes::getName)); + Map metadataMap = new HashMap<>(); + Arrays.stream(turTikaFileAttributes.getMetadata().names()).forEach(name -> + metadataMap.put(name, turTikaFileAttributes.getMetadata().get(name))); + turFileAttributes.setMetadata(metadataMap); }); return turFileAttributes; } catch (IOException e) { From 7b89ba6add4a76a29bce63e55f64afde6bbfd047 Mon Sep 17 00:00:00 2001 From: Alexandre Oliveira Date: Mon, 16 Sep 2024 17:25:39 -0300 Subject: [PATCH 03/10] - OCR: URL --- .../com/viglet/turing/api/ocr/TurOcrAPI.java | 44 ++++- .../viglet/turing/api/ocr/TurOcrFromUrl.java | 32 ++++ .../turing/api/sn/job/TurSNImportAPI.java | 17 +- .../security/TurSecurityConfigProduction.java | 2 +- .../commons/file/TurFileAttributes.java | 4 + .../turing/commons/file/TurFileSize.java | 9 +- .../filesystem/commons/TurFileUtils.java | 166 ++++++++++++------ turing-java-sdk/conf/log4j.properties | 4 + turing-java-sdk/pom.xml | 19 +- .../com/viglet/turing/client/ocr/TurOcr.java | 64 +++++-- .../client/sn/sample/TurClientOcrSample.java | 52 ++++++ .../client/sn/sample/TurSNClientSample.java | 8 +- .../turing/client/utils/TurClientUtils.java | 5 - 13 files changed, 331 insertions(+), 95 deletions(-) create mode 100644 turing-app/src/main/java/com/viglet/turing/api/ocr/TurOcrFromUrl.java create mode 100644 turing-java-sdk/conf/log4j.properties create mode 100644 turing-java-sdk/src/main/java/com/viglet/turing/client/sn/sample/TurClientOcrSample.java diff --git a/turing-app/src/main/java/com/viglet/turing/api/ocr/TurOcrAPI.java b/turing-app/src/main/java/com/viglet/turing/api/ocr/TurOcrAPI.java index 83eb5b81e00..4ef1a4821a4 100644 --- a/turing-app/src/main/java/com/viglet/turing/api/ocr/TurOcrAPI.java +++ b/turing-app/src/main/java/com/viglet/turing/api/ocr/TurOcrAPI.java @@ -1,23 +1,55 @@ +/* + * + * Copyright (C) 2016-2024 the original author or authors. + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . + */ + package com.viglet.turing.api.ocr; import com.viglet.turing.commons.file.TurFileAttributes; import com.viglet.turing.filesystem.commons.TurFileUtils; import io.swagger.v3.oas.annotations.tags.Tag; import lombok.extern.slf4j.Slf4j; -import org.springframework.web.bind.annotation.PostMapping; -import org.springframework.web.bind.annotation.RequestMapping; -import org.springframework.web.bind.annotation.RequestParam; -import org.springframework.web.bind.annotation.RestController; +import org.springframework.web.bind.annotation.*; import org.springframework.web.multipart.MultipartFile; +import java.net.MalformedURLException; +import java.net.URI; +/** + * @author Alexandre Oliveira + * @since 0.3.9 + */ @Slf4j @RestController @RequestMapping("/api/ocr") @Tag(name = "OCR", description = "OCR API") public class TurOcrAPI { - @PostMapping - public TurFileAttributes convertToText(@RequestParam("file") MultipartFile multipartFile) { + @PostMapping("/file") + public TurFileAttributes fileToText(@RequestParam("file") MultipartFile multipartFile) { return TurFileUtils.documentToText(multipartFile); } + + @PostMapping("/url") + public TurFileAttributes urlToText(@RequestBody TurOcrFromUrl turOcrFromUrl) { + try { + return TurFileUtils.urlContentToText(URI.create(turOcrFromUrl.getUrl()).toURL()); + } + catch (MalformedURLException e) { + log.error(e.getMessage(), e); + } + return new TurFileAttributes(); + } } diff --git a/turing-app/src/main/java/com/viglet/turing/api/ocr/TurOcrFromUrl.java b/turing-app/src/main/java/com/viglet/turing/api/ocr/TurOcrFromUrl.java new file mode 100644 index 00000000000..e09b52023e5 --- /dev/null +++ b/turing-app/src/main/java/com/viglet/turing/api/ocr/TurOcrFromUrl.java @@ -0,0 +1,32 @@ +/* + * + * Copyright (C) 2016-2024 the original author or authors. + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . + */ + +package com.viglet.turing.api.ocr; + +import lombok.Getter; +import lombok.Setter; + +/** + * @author Alexandre Oliveira + * @since 0.3.9 + */ +@Getter +@Setter +public class TurOcrFromUrl { + private String url; +} diff --git a/turing-app/src/main/java/com/viglet/turing/api/sn/job/TurSNImportAPI.java b/turing-app/src/main/java/com/viglet/turing/api/sn/job/TurSNImportAPI.java index 6fd0fbbef5d..ac399856698 100644 --- a/turing-app/src/main/java/com/viglet/turing/api/sn/job/TurSNImportAPI.java +++ b/turing-app/src/main/java/com/viglet/turing/api/sn/job/TurSNImportAPI.java @@ -27,8 +27,8 @@ import com.viglet.turing.client.sn.job.TurSNJobItem; import com.viglet.turing.client.sn.job.TurSNJobItems; import com.viglet.turing.commons.utils.TurCommonsUtils; -import com.viglet.turing.filesystem.commons.TurTikaFileAttributes; import com.viglet.turing.filesystem.commons.TurFileUtils; +import com.viglet.turing.filesystem.commons.TurTikaFileAttributes; import com.viglet.turing.persistence.repository.sn.TurSNSiteRepository; import com.viglet.turing.sn.TurSNConstants; import com.viglet.turing.spring.utils.TurSpringUtils; @@ -108,16 +108,11 @@ public boolean turSNImportZipFileBroker(@RequestParam("file") MultipartFile mult private void extractTextOfFileAttribute(File extractFolder, Map.Entry attribute) { if (attribute.getValue().toString().startsWith(TurSNConstants.FILE_PROTOCOL)) { String fileName = attribute.getValue().toString().replace(TurSNConstants.FILE_PROTOCOL, ""); - try (FileInputStream fileInputStreamAttribute = new FileInputStream( - extractFolder.getAbsolutePath() + File.separator + fileName)) { - TurTikaFileAttributes turTikaFileAttributes = TurFileUtils.parseFile(fileInputStreamAttribute, null); - Optional.ofNullable(turTikaFileAttributes) - .map(TurTikaFileAttributes::getContent) - .ifPresent(content -> attribute.setValue(TurCommonsUtils.cleanTextContent(content))); - } catch (IOException e) { - log.error(e.getMessage(), e); - } - + File file = new File(extractFolder.getAbsolutePath().concat(File.separator).concat(fileName)); + TurTikaFileAttributes turTikaFileAttributes = TurFileUtils.parseFile(file); + Optional.ofNullable(turTikaFileAttributes) + .map(TurTikaFileAttributes::getContent) + .ifPresent(content -> attribute.setValue(TurCommonsUtils.cleanTextContent(content))); } } diff --git a/turing-app/src/main/java/com/viglet/turing/spring/security/TurSecurityConfigProduction.java b/turing-app/src/main/java/com/viglet/turing/spring/security/TurSecurityConfigProduction.java index c915c01e14b..96afd00580e 100644 --- a/turing-app/src/main/java/com/viglet/turing/spring/security/TurSecurityConfigProduction.java +++ b/turing-app/src/main/java/com/viglet/turing/spring/security/TurSecurityConfigProduction.java @@ -85,7 +85,7 @@ SecurityFilterChain filterChain(HttpSecurity http, MvcRequestMatcher.Builder mvc mvc.pattern("/error/**"), mvc.pattern("/logout"), mvc.pattern("/api/nlp/**"), - mvc.pattern("/api/ocr"), + mvc.pattern("/api/ocr/**"), mvc.pattern("/api/llm/**"), mvc.pattern("/api/v2/guest/**"), AntPathRequestMatcher.antMatcher("/h2/**"))) diff --git a/turing-commons/src/main/java/com/viglet/turing/commons/file/TurFileAttributes.java b/turing-commons/src/main/java/com/viglet/turing/commons/file/TurFileAttributes.java index 77496a9c4f9..e929628d394 100644 --- a/turing-commons/src/main/java/com/viglet/turing/commons/file/TurFileAttributes.java +++ b/turing-commons/src/main/java/com/viglet/turing/commons/file/TurFileAttributes.java @@ -32,11 +32,15 @@ @Setter @Getter +@ToString public class TurFileAttributes { private String content; private String name; private String title; private String extension; private TurFileSize size; + private Date lastModified; private Map metadata; + + } diff --git a/turing-commons/src/main/java/com/viglet/turing/commons/file/TurFileSize.java b/turing-commons/src/main/java/com/viglet/turing/commons/file/TurFileSize.java index 0605032c270..6ee2be59d25 100644 --- a/turing-commons/src/main/java/com/viglet/turing/commons/file/TurFileSize.java +++ b/turing-commons/src/main/java/com/viglet/turing/commons/file/TurFileSize.java @@ -18,7 +18,7 @@ package com.viglet.turing.commons.file; -import lombok.Getter; +import lombok.*; import java.math.BigDecimal; import java.math.RoundingMode; @@ -27,12 +27,17 @@ * @author Alexandre Oliveira * @since 0.3.9 **/ +@Setter @Getter +@AllArgsConstructor +@ToString public class TurFileSize { private final float bytes; private final float kiloBytes; private final float megaBytes; - + public TurFileSize() { + this(0f); + } public TurFileSize(float bytes) { this.bytes = twoDecimalFloat(bytes); this.kiloBytes = twoDecimalFloat(this.bytes / 1024); diff --git a/turing-filesystem/fs-commons/src/main/java/com/viglet/turing/filesystem/commons/TurFileUtils.java b/turing-filesystem/fs-commons/src/main/java/com/viglet/turing/filesystem/commons/TurFileUtils.java index 8bf0c317783..b8b5961d8b1 100644 --- a/turing-filesystem/fs-commons/src/main/java/com/viglet/turing/filesystem/commons/TurFileUtils.java +++ b/turing-filesystem/fs-commons/src/main/java/com/viglet/turing/filesystem/commons/TurFileUtils.java @@ -4,6 +4,7 @@ import com.viglet.turing.commons.file.TurFileSize; import com.viglet.turing.commons.utils.TurCommonsUtils; import lombok.extern.slf4j.Slf4j; +import org.apache.commons.io.FileUtils; import org.apache.commons.io.FilenameUtils; import org.apache.tika.exception.TikaException; import org.apache.tika.extractor.EmbeddedDocumentExtractor; @@ -22,6 +23,8 @@ import java.io.FileInputStream; import java.io.IOException; import java.io.InputStream; +import java.net.HttpURLConnection; +import java.net.URL; import java.nio.file.Files; import java.nio.file.StandardCopyOption; import java.util.*; @@ -29,7 +32,7 @@ @Slf4j public class TurFileUtils { - public static final String PDF_DOCINFO_TITLE = "pdf:docinfo:title"; + public static final String PDF_DOC_INFO_TITLE = "pdf:docinfo:title"; private TurFileUtils() { throw new IllegalStateException("Turing File Utilities class"); @@ -49,43 +52,50 @@ public static TurTikaFileAttributes readFile(File file) { } public static TurTikaFileAttributes parseFile(File file) { - try (InputStream fileInputStreamAttribute = new FileInputStream(file)) { - return parseFile(fileInputStreamAttribute, file); - + try (InputStream inputStream = new FileInputStream(file)) { + return getTurTikaFileAttributes(file, inputStream); } catch (IOException e) { log.error(e.getMessage(), e); } return null; } - public static TurTikaFileAttributes parseFile(InputStream inputStream, File file) { - try (inputStream) { - StringBuilder contentFile = new StringBuilder(); - AutoDetectParser parser = new AutoDetectParser(); - // -1 = no limit of number of characters - BodyContentHandler handler = new BodyContentHandler(-1); - Metadata metadata = new Metadata(); - EmbeddedDocumentExtractor embeddedDocumentExtractor = new EmbeddedDocumentExtractor() { - @Override - public boolean shouldParseEmbedded(Metadata metadata) { - return true; - } - - @Override - public void parseEmbedded(InputStream stream, ContentHandler handler, Metadata metadata, - boolean outputHtml) throws IOException { - parseDocument(stream).ifPresent(contentFile::append); - - } - }; - final ParseContext parseContext = getParseContext(parser); - parseContext.set(EmbeddedDocumentExtractor.class, embeddedDocumentExtractor); + private static TurTikaFileAttributes getTurTikaFileAttributes(File file, InputStream inputStream) { + StringBuilder contentFile = new StringBuilder(); + AutoDetectParser parser = new AutoDetectParser(); + // -1 = no limit of number of characters + BodyContentHandler handler = new BodyContentHandler(-1); + Metadata metadata = new Metadata(); + EmbeddedDocumentExtractor embeddedDocumentExtractor = new EmbeddedDocumentExtractor() { + @Override + public boolean shouldParseEmbedded(Metadata metadata) { + return true; + } + + @Override + public void parseEmbedded(InputStream stream, ContentHandler handler, Metadata metadata, + boolean outputHtml) throws IOException { + parseDocument(stream).ifPresent(contentFile::append); + + } + }; + ParseContext parseContext = getParseContext(parser); + parseContext.set(EmbeddedDocumentExtractor.class, embeddedDocumentExtractor); + try { parser.parse(inputStream, handler, metadata, parseContext); - contentFile.append(handler); - return new TurTikaFileAttributes(file, contentFile.toString(), metadata); } catch (IOException | SAXException | TikaException e) { log.error(e.getMessage(), e); } + contentFile.append(handler); + return new TurTikaFileAttributes(file, contentFile.toString(), metadata); + } + + public static TurTikaFileAttributes parseFile(MultipartFile multipartFile) { + try (InputStream inputStream = multipartFile.getInputStream()) { + return getTurTikaFileAttributes(null, inputStream); + } catch (IOException e) { + log.error(e.getMessage(), e); + } return null; } @@ -101,30 +111,81 @@ private static ParseContext getParseContext(AutoDetectParser parser) { } public static TurFileAttributes documentToText(MultipartFile multipartFile) { + return getTurFileAttributes(parseFile(multipartFile), + multipartFile.getOriginalFilename(), + FilenameUtils.getExtension(multipartFile.getOriginalFilename()), + multipartFile.getSize(), + new Date()); + } - try (InputStream inputStream = multipartFile.getInputStream()) { - TurTikaFileAttributes turTikaFileAttributes = parseFile(inputStream, null); - - TurFileAttributes turFileAttributes = new TurFileAttributes(); - Optional.ofNullable(turTikaFileAttributes).ifPresent(attributes -> { - turFileAttributes.setContent(attributes.getContent()); - turFileAttributes.setName(multipartFile.getOriginalFilename()); - turFileAttributes.setExtension(FilenameUtils.getExtension(multipartFile.getOriginalFilename())); - turFileAttributes.setSize(new TurFileSize(multipartFile.getSize())); - turFileAttributes.setTitle(Optional.ofNullable(turTikaFileAttributes - .getMetadata() - .get(PDF_DOCINFO_TITLE)) - .orElseGet(turFileAttributes::getName)); - Map metadataMap = new HashMap<>(); - Arrays.stream(turTikaFileAttributes.getMetadata().names()).forEach(name -> - metadataMap.put(name, turTikaFileAttributes.getMetadata().get(name))); - turFileAttributes.setMetadata(metadataMap); - }); - return turFileAttributes; + public static TurFileAttributes urlContentToText(URL url) { + File file = getFile(url); + file.deleteOnExit(); + return getTurFileAttributes(parseFile(file), + FilenameUtils.getName(url.getPath()), + FilenameUtils.getExtension(url.getPath()), + file.length(), + getLastModified(url)); + } + + private static Date getLastModified(URL url) { + Date date = new Date(); + try { + HttpURLConnection httpUrlConnection = (HttpURLConnection) url.openConnection(); + httpUrlConnection.setRequestMethod("HEAD"); + date = new Date(httpUrlConnection.getLastModified()); + httpUrlConnection.disconnect(); } catch (IOException e) { log.error(e.getMessage(), e); } - return null; + return date; + } + + private static File getFile(URL url) { + File tempFile = null; + try { + tempFile = createTempFile(); + FileUtils.copyURLToFile( + url, + tempFile, + 5000, + 5000); + } catch (IOException e) { + log.error(e.getMessage(), e); + } + return tempFile; + } + + private static TurFileAttributes getTurFileAttributes(TurTikaFileAttributes file, + String fileName, + String fileExtension, + long fileSize, + Date lastModified) { + TurFileAttributes turFileAttributes = new TurFileAttributes(); + Optional.ofNullable(file).ifPresent(attributes -> { + turFileAttributes.setContent(attributes.getContent()); + turFileAttributes.setName(fileName); + turFileAttributes.setExtension(fileExtension); + turFileAttributes.setSize(new TurFileSize(fileSize)); + turFileAttributes.setTitle(getTitle(file, turFileAttributes)); + turFileAttributes.setLastModified(lastModified); + turFileAttributes.setMetadata(getMetadataMap(file)); + }); + return turFileAttributes; + } + + private static String getTitle(TurTikaFileAttributes file, TurFileAttributes turFileAttributes) { + return Optional.ofNullable(file + .getMetadata() + .get(PDF_DOC_INFO_TITLE)) + .orElseGet(turFileAttributes::getName); + } + + private static Map getMetadataMap(TurTikaFileAttributes file) { + Map metadataMap = new HashMap<>(); + Arrays.stream(file.getMetadata().names()).forEach(name -> + metadataMap.put(name, file.getMetadata().get(name))); + return metadataMap; } public static Optional parseDocument(InputStream stream) throws IOException { @@ -138,8 +199,7 @@ public static Optional parseDocument(InputStream stream) throws IOExcept public static Optional getFileContent(InputStream stream, BodyContentHandler handlerInner, AutoDetectParser parserInner, Metadata metadataInner, ParseContext parseContextInner) throws IOException { - File tempFile = File.createTempFile(UUID.randomUUID().toString(), null, - TurCommonsUtils.addSubDirToStoreDir("tmp")); + File tempFile = createTempFile(); Files.copy(stream, tempFile.toPath(), StandardCopyOption.REPLACE_EXISTING); try (FileInputStream fileInputStreamInner = new FileInputStream(tempFile)) { parserInner.parse(fileInputStreamInner, handlerInner, metadataInner, parseContextInner); @@ -152,4 +212,10 @@ public static Optional getFileContent(InputStream stream, BodyContentHan return Optional.empty(); } + private static File createTempFile() throws IOException { + return File.createTempFile(UUID.randomUUID().toString(), null, + TurCommonsUtils.addSubDirToStoreDir("tmp")); + } + + } diff --git a/turing-java-sdk/conf/log4j.properties b/turing-java-sdk/conf/log4j.properties new file mode 100644 index 00000000000..b17329c192d --- /dev/null +++ b/turing-java-sdk/conf/log4j.properties @@ -0,0 +1,4 @@ +log4j.rootLogger=debug, stdout +log4j.appender.stdout=org.apache.log4j.ConsoleAppender +log4j.appender.stdout.layout=org.apache.log4j.PatternLayout +log4j.appender.stdout.layout.ConversionPattern=%-4r [%t] %-5p %c %x - %m%n diff --git a/turing-java-sdk/pom.xml b/turing-java-sdk/pom.xml index c85e694a187..82fb97420a7 100644 --- a/turing-java-sdk/pom.xml +++ b/turing-java-sdk/pom.xml @@ -38,7 +38,6 @@ compile - @@ -51,6 +50,24 @@ 11 + + + maven-shade-plugin + 3.6.0 + + + + shade + + none + + turing-java-sdk + true + indexer + + + + diff --git a/turing-java-sdk/src/main/java/com/viglet/turing/client/ocr/TurOcr.java b/turing-java-sdk/src/main/java/com/viglet/turing/client/ocr/TurOcr.java index 77ce32c0d73..d2371a24489 100644 --- a/turing-java-sdk/src/main/java/com/viglet/turing/client/ocr/TurOcr.java +++ b/turing-java-sdk/src/main/java/com/viglet/turing/client/ocr/TurOcr.java @@ -6,39 +6,75 @@ import com.viglet.turing.commons.file.TurFileAttributes; import lombok.extern.slf4j.Slf4j; import org.apache.hc.client5.http.classic.methods.HttpPost; -import org.apache.hc.client5.http.entity.mime.*; +import org.apache.hc.client5.http.entity.mime.FileBody; +import org.apache.hc.client5.http.entity.mime.MultipartEntityBuilder; import org.apache.hc.client5.http.impl.classic.CloseableHttpClient; import org.apache.hc.client5.http.impl.classic.HttpClients; import org.apache.hc.core5.http.ContentType; import org.apache.hc.core5.http.HttpEntity; import org.apache.hc.core5.http.io.entity.EntityUtils; +import org.apache.hc.core5.http.io.entity.StringEntity; +import org.json.JSONObject; import java.io.File; import java.io.IOException; +import java.net.URL; @Slf4j public class TurOcr { + public static final String API_OCR_URL = "%s/api/ocr/url"; + public static final String API_OCR_FILE = "%s/api/ocr/file"; + public static final String FILE = "file"; + public static final String URL = "url"; + public static TurFileAttributes processFile(TurServer turServer, File file, boolean showOutput) { - try (CloseableHttpClient client = HttpClients.createDefault()) { - FileBody fileBody = new FileBody(file, ContentType.DEFAULT_BINARY); - String url = String.format("%s/api/ocr", turServer.getServerUrl()); - HttpPost httpPost = new HttpPost(url); - HttpEntity requestEntity = MultipartEntityBuilder.create().addPart("file", fileBody).build(); - httpPost.setEntity(requestEntity); + return getTurFileAttributes(turServer, + getRequestEntity(file), + String.format(API_OCR_FILE, turServer.getServerUrl()), + showOutput); + } + + private static TurFileAttributes getTurFileAttributes(TurServer turServer, HttpEntity requestEntity, + String endpoint, + boolean showOutput) { + try (CloseableHttpClient client = HttpClients.createDefault(); + HttpEntity entity = requestEntity) { + HttpPost httpPost = new HttpPost(endpoint); + httpPost.setEntity(entity); TurClientUtils.authentication(httpPost, turServer.getApiKey()); - String responseBody = client.execute(httpPost, response-> { - log.info("Request Status {} - {}", response.getCode(), url); - HttpEntity entity = response.getEntity(); - return entity != null ? EntityUtils.toString(entity) : null; + String responseBody = client.execute(httpPost, response -> { + log.info("Request Status {} - {}", response.getCode(), endpoint); + HttpEntity responseEntity = response.getEntity(); + return responseEntity != null ? EntityUtils.toString(responseEntity) : null; }); + TurFileAttributes turFileAttributes = new ObjectMapper().readValue(responseBody, TurFileAttributes.class); if (showOutput) { - System.out.println(responseBody); + System.out.println(turFileAttributes.toString()); } - return new ObjectMapper().readValue(responseBody, TurFileAttributes.class); + return turFileAttributes; } catch (IOException e) { log.error(e.getMessage(), e); - return null; + return new TurFileAttributes(); } } + + private static HttpEntity getRequestEntity(File file) { + return MultipartEntityBuilder.create() + .addPart(FILE, new FileBody(file, ContentType.DEFAULT_BINARY)) + .build(); + } + + public static TurFileAttributes processUrl(TurServer turServer, URL url, boolean showOutput) { + return getTurFileAttributes(turServer, + getRequestEntity(url), + String.format(API_OCR_URL, turServer.getServerUrl()), + showOutput); + } + + private static StringEntity getRequestEntity(URL url) { + return new StringEntity( + new JSONObject().put(URL, url.toString()).toString(), + ContentType.APPLICATION_JSON); + } } diff --git a/turing-java-sdk/src/main/java/com/viglet/turing/client/sn/sample/TurClientOcrSample.java b/turing-java-sdk/src/main/java/com/viglet/turing/client/sn/sample/TurClientOcrSample.java new file mode 100644 index 00000000000..b6fbfc38c89 --- /dev/null +++ b/turing-java-sdk/src/main/java/com/viglet/turing/client/sn/sample/TurClientOcrSample.java @@ -0,0 +1,52 @@ +/* + * Copyright (C) 2016-2024 the original author or authors. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.viglet.turing.client.sn.sample; + +import com.viglet.turing.client.auth.TurServer; +import com.viglet.turing.client.auth.credentials.TurApiKeyCredentials; +import com.viglet.turing.client.ocr.TurOcr; +import com.viglet.turing.commons.file.TurFileAttributes; +import lombok.extern.slf4j.Slf4j; + +import java.net.MalformedURLException; +import java.net.URI; + +/** + * Sample code to use this SDK. + * + * @author Alexandre Oliveira + * @since 0.3.9 + */ +@Slf4j +public class TurClientOcrSample { + private static final String TURING_URL = "http://localhost:2700"; + private static final String TURING_API_KEY = "9c29eb9697a642349ddedcec9"; + public static final String URL = "https://www.princexml.com/samples/invoice/invoicesample.pdf"; + + public static void main(String[] args) { + try { + TurServer turSNServer = new TurServer(URI.create(TURING_URL).toURL(), + new TurApiKeyCredentials(TURING_API_KEY)); + log.info("--- Ocr Url"); + TurFileAttributes turFileAttributes = TurOcr.processUrl(turSNServer, URI.create(URL).toURL(), + true); + log.info(turFileAttributes.toString()); + } catch (MalformedURLException e) { + log.error(e.getMessage(), e); + } + } +} diff --git a/turing-java-sdk/src/main/java/com/viglet/turing/client/sn/sample/TurSNClientSample.java b/turing-java-sdk/src/main/java/com/viglet/turing/client/sn/sample/TurSNClientSample.java index a69d6d84138..66b9d37abda 100644 --- a/turing-java-sdk/src/main/java/com/viglet/turing/client/sn/sample/TurSNClientSample.java +++ b/turing-java-sdk/src/main/java/com/viglet/turing/client/sn/sample/TurSNClientSample.java @@ -23,6 +23,7 @@ import com.viglet.turing.client.sn.pagination.TurSNPagination; import com.viglet.turing.client.sn.response.QueryTurSNResponse; import com.viglet.turing.client.sn.spotlight.TurSNSpotlightDocument; +import lombok.extern.slf4j.Slf4j; import java.net.MalformedURLException; import java.net.URI; @@ -30,17 +31,14 @@ import java.util.List; import java.util.Locale; import java.util.Map.Entry; -import java.util.logging.Level; -import java.util.logging.Logger; /** * Sample code to use this SDK. * * @since 0.3.4 */ +@Slf4j public class TurSNClientSample { - private static final Logger logger = Logger.getLogger(TurSNClientSample.class.getName()); - private static final String TURING_URL = "http://localhost:2700"; private static final String TURING_SITE = "Sample"; private static final Locale TURING_LOCALE = Locale.US; @@ -85,7 +83,7 @@ public static void main(String[] args) { groupBy(args, turSNServer); } catch (MalformedURLException e) { - logger.log(Level.SEVERE, e.getMessage(), e); + log.error(e.getMessage(), e); } } diff --git a/turing-java-sdk/src/main/java/com/viglet/turing/client/utils/TurClientUtils.java b/turing-java-sdk/src/main/java/com/viglet/turing/client/utils/TurClientUtils.java index ca1a42695ec..3946ba36cad 100644 --- a/turing-java-sdk/src/main/java/com/viglet/turing/client/utils/TurClientUtils.java +++ b/turing-java-sdk/src/main/java/com/viglet/turing/client/utils/TurClientUtils.java @@ -15,12 +15,7 @@ */ package com.viglet.turing.client.utils; -import com.viglet.turing.client.auth.credentials.TurUsernamePasswordCredentials; import org.apache.hc.client5.http.classic.methods.HttpPost; -import org.apache.hc.core5.http.HttpHeaders; - -import java.nio.charset.StandardCharsets; -import java.util.Base64; /** * Client Utils From 3d55e808ceec235bf278c668ad17a67d7d74d518 Mon Sep 17 00:00:00 2001 From: Alexandre Oliveira Date: Mon, 16 Sep 2024 17:27:55 -0300 Subject: [PATCH 04/10] Update TurClientOcrSample.java --- .../sample/TurClientOcrSample.java | 22 ++++++++++--------- 1 file changed, 12 insertions(+), 10 deletions(-) rename turing-java-sdk/src/main/java/com/viglet/turing/client/{sn => ocr}/sample/TurClientOcrSample.java (65%) diff --git a/turing-java-sdk/src/main/java/com/viglet/turing/client/sn/sample/TurClientOcrSample.java b/turing-java-sdk/src/main/java/com/viglet/turing/client/ocr/sample/TurClientOcrSample.java similarity index 65% rename from turing-java-sdk/src/main/java/com/viglet/turing/client/sn/sample/TurClientOcrSample.java rename to turing-java-sdk/src/main/java/com/viglet/turing/client/ocr/sample/TurClientOcrSample.java index b6fbfc38c89..58e5479ee77 100644 --- a/turing-java-sdk/src/main/java/com/viglet/turing/client/sn/sample/TurClientOcrSample.java +++ b/turing-java-sdk/src/main/java/com/viglet/turing/client/ocr/sample/TurClientOcrSample.java @@ -1,20 +1,22 @@ /* + * * Copyright (C) 2016-2024 the original author or authors. * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. * - * http://www.apache.org/licenses/LICENSE-2.0 + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . */ -package com.viglet.turing.client.sn.sample; +package com.viglet.turing.client.ocr.sample; import com.viglet.turing.client.auth.TurServer; import com.viglet.turing.client.auth.credentials.TurApiKeyCredentials; From fd5459807878c05500b5f8e82e809892c98e2b1a Mon Sep 17 00:00:00 2001 From: Alexandre Oliveira Date: Tue, 17 Sep 2024 10:27:26 -0300 Subject: [PATCH 05/10] - Ocr: Code Smells --- .../main/java/com/viglet/turing/commons/file/TurFileSize.java | 2 +- .../src/main/java/com/viglet/turing/client/ocr/TurOcr.java | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/turing-commons/src/main/java/com/viglet/turing/commons/file/TurFileSize.java b/turing-commons/src/main/java/com/viglet/turing/commons/file/TurFileSize.java index 6ee2be59d25..88f8ef9d5c0 100644 --- a/turing-commons/src/main/java/com/viglet/turing/commons/file/TurFileSize.java +++ b/turing-commons/src/main/java/com/viglet/turing/commons/file/TurFileSize.java @@ -45,7 +45,7 @@ public TurFileSize(float bytes) { } private float twoDecimalFloat(float value) { - return new BigDecimal(value).setScale(2, RoundingMode.HALF_UP).floatValue(); + return BigDecimal.valueOf(value).setScale(2, RoundingMode.HALF_UP).floatValue(); } } diff --git a/turing-java-sdk/src/main/java/com/viglet/turing/client/ocr/TurOcr.java b/turing-java-sdk/src/main/java/com/viglet/turing/client/ocr/TurOcr.java index 7f93656e024..87689178594 100644 --- a/turing-java-sdk/src/main/java/com/viglet/turing/client/ocr/TurOcr.java +++ b/turing-java-sdk/src/main/java/com/viglet/turing/client/ocr/TurOcr.java @@ -53,7 +53,7 @@ private static TurFileAttributes getTurFileAttributes(TurServer turServer, HttpE }); TurFileAttributes turFileAttributes = new ObjectMapper().readValue(responseBody, TurFileAttributes.class); if (showOutput) { - System.out.println(turFileAttributes.toString()); + log.info(turFileAttributes.toString()); } return turFileAttributes; } catch (IOException e) { From 511e15deb3095972144baeae75907e8089dde99b Mon Sep 17 00:00:00 2001 From: Alexandre Oliveira Date: Tue, 17 Sep 2024 10:46:52 -0300 Subject: [PATCH 06/10] - OCR Client: Timeout --- .../com/viglet/turing/client/ocr/TurOcr.java | 37 ++++++++++++++++--- 1 file changed, 31 insertions(+), 6 deletions(-) diff --git a/turing-java-sdk/src/main/java/com/viglet/turing/client/ocr/TurOcr.java b/turing-java-sdk/src/main/java/com/viglet/turing/client/ocr/TurOcr.java index 87689178594..ac927f897f2 100644 --- a/turing-java-sdk/src/main/java/com/viglet/turing/client/ocr/TurOcr.java +++ b/turing-java-sdk/src/main/java/com/viglet/turing/client/ocr/TurOcr.java @@ -6,14 +6,19 @@ import com.viglet.turing.commons.file.TurFileAttributes; import lombok.extern.slf4j.Slf4j; import org.apache.hc.client5.http.classic.methods.HttpPost; +import org.apache.hc.client5.http.config.ConnectionConfig; import org.apache.hc.client5.http.entity.mime.FileBody; import org.apache.hc.client5.http.entity.mime.MultipartEntityBuilder; import org.apache.hc.client5.http.impl.classic.CloseableHttpClient; import org.apache.hc.client5.http.impl.classic.HttpClients; +import org.apache.hc.client5.http.impl.io.PoolingHttpClientConnectionManager; +import org.apache.hc.client5.http.impl.io.PoolingHttpClientConnectionManagerBuilder; import org.apache.hc.core5.http.ContentType; import org.apache.hc.core5.http.HttpEntity; import org.apache.hc.core5.http.io.entity.EntityUtils; import org.apache.hc.core5.http.io.entity.StringEntity; +import org.apache.hc.core5.util.TimeValue; +import org.apache.hc.core5.util.Timeout; import org.json.JSONObject; import java.io.File; @@ -22,15 +27,17 @@ @Slf4j public class TurOcr { - private TurOcr() { - throw new IllegalStateException("OCR Utility class"); - } + public static final int TIMEOUT_MINUTES = 5; public static final String API_OCR_URL = "%s/api/ocr/url"; public static final String API_OCR_FILE = "%s/api/ocr/file"; public static final String FILE = "file"; public static final String URL = "url"; + private TurOcr() { + throw new IllegalStateException("OCR Utility class"); + } + public static TurFileAttributes processFile(TurServer turServer, File file, boolean showOutput) { return getTurFileAttributes(turServer, getRequestEntity(file), @@ -41,8 +48,14 @@ public static TurFileAttributes processFile(TurServer turServer, File file, bool private static TurFileAttributes getTurFileAttributes(TurServer turServer, HttpEntity requestEntity, String endpoint, boolean showOutput) { - try (CloseableHttpClient client = HttpClients.createDefault(); - HttpEntity entity = requestEntity) { + + try (PoolingHttpClientConnectionManager pool = setConnectionManager(); + CloseableHttpClient client = HttpClients + .custom() + .setConnectionManager(pool) + .build(); + HttpEntity entity = requestEntity + ) { HttpPost httpPost = new HttpPost(endpoint); httpPost.setEntity(entity); TurClientUtils.authentication(httpPost, turServer.getApiKey()); @@ -53,7 +66,7 @@ private static TurFileAttributes getTurFileAttributes(TurServer turServer, HttpE }); TurFileAttributes turFileAttributes = new ObjectMapper().readValue(responseBody, TurFileAttributes.class); if (showOutput) { - log.info(turFileAttributes.toString()); + log.info(turFileAttributes.toString()); } return turFileAttributes; } catch (IOException e) { @@ -62,6 +75,18 @@ private static TurFileAttributes getTurFileAttributes(TurServer turServer, HttpE } } + private static PoolingHttpClientConnectionManager setConnectionManager() { + ConnectionConfig config = ConnectionConfig.custom() + .setSocketTimeout(Timeout.ofMinutes(TIMEOUT_MINUTES)) + .setConnectTimeout(Timeout.ofMinutes(TIMEOUT_MINUTES)) + .setTimeToLive(TimeValue.ofMinutes(TIMEOUT_MINUTES)) + .build(); + return PoolingHttpClientConnectionManagerBuilder + .create() + .setDefaultConnectionConfig(config) + .build(); + } + private static HttpEntity getRequestEntity(File file) { return MultipartEntityBuilder.create() .addPart(FILE, new FileBody(file, ContentType.DEFAULT_BINARY)) From 9cb50a3beb23fc4ea8638130a523a62d0b6cb2d1 Mon Sep 17 00:00:00 2001 From: Alexandre Oliveira Date: Tue, 17 Sep 2024 10:53:48 -0300 Subject: [PATCH 07/10] - OCR: Reused HttpClient Pooling --- .../turing/tool/filesystem/TurFSImportTool.java | 3 ++- .../java/com/viglet/turing/client/ocr/TurOcr.java | 15 +++++++-------- .../client/ocr/sample/TurClientOcrSample.java | 3 ++- 3 files changed, 11 insertions(+), 10 deletions(-) diff --git a/turing-filesystem/fs-connector/src/main/java/com/viglet/turing/tool/filesystem/TurFSImportTool.java b/turing-filesystem/fs-connector/src/main/java/com/viglet/turing/tool/filesystem/TurFSImportTool.java index 05d1588576a..936398a20ef 100644 --- a/turing-filesystem/fs-connector/src/main/java/com/viglet/turing/tool/filesystem/TurFSImportTool.java +++ b/turing-filesystem/fs-connector/src/main/java/com/viglet/turing/tool/filesystem/TurFSImportTool.java @@ -153,7 +153,8 @@ private TurSNJobItem createJobItem(File file) { attributes.put(fileSizeField, file.length()); attributes.put("url", fileURL); try { - attributes.put("text", TurOcr.processFile(new TurServer(URI.create(turingServer).toURL(), + TurOcr ocr = new TurOcr(); + attributes.put("text", ocr.processFile(new TurServer(URI.create(turingServer).toURL(), new TurApiKeyCredentials(turingApiKey)), file, showOutput)); } catch (MalformedURLException e) { log.error(e.getMessage(), e); diff --git a/turing-java-sdk/src/main/java/com/viglet/turing/client/ocr/TurOcr.java b/turing-java-sdk/src/main/java/com/viglet/turing/client/ocr/TurOcr.java index ac927f897f2..cabdf4f2ef6 100644 --- a/turing-java-sdk/src/main/java/com/viglet/turing/client/ocr/TurOcr.java +++ b/turing-java-sdk/src/main/java/com/viglet/turing/client/ocr/TurOcr.java @@ -33,24 +33,23 @@ public class TurOcr { public static final String API_OCR_FILE = "%s/api/ocr/file"; public static final String FILE = "file"; public static final String URL = "url"; - - private TurOcr() { - throw new IllegalStateException("OCR Utility class"); + private final PoolingHttpClientConnectionManager pool; + public TurOcr() { + pool = setConnectionManager(); } - public static TurFileAttributes processFile(TurServer turServer, File file, boolean showOutput) { + public TurFileAttributes processFile(TurServer turServer, File file, boolean showOutput) { return getTurFileAttributes(turServer, getRequestEntity(file), String.format(API_OCR_FILE, turServer.getServerUrl()), showOutput); } - private static TurFileAttributes getTurFileAttributes(TurServer turServer, HttpEntity requestEntity, + private TurFileAttributes getTurFileAttributes(TurServer turServer, HttpEntity requestEntity, String endpoint, boolean showOutput) { - try (PoolingHttpClientConnectionManager pool = setConnectionManager(); - CloseableHttpClient client = HttpClients + try (CloseableHttpClient client = HttpClients .custom() .setConnectionManager(pool) .build(); @@ -93,7 +92,7 @@ private static HttpEntity getRequestEntity(File file) { .build(); } - public static TurFileAttributes processUrl(TurServer turServer, URL url, boolean showOutput) { + public TurFileAttributes processUrl(TurServer turServer, URL url, boolean showOutput) { return getTurFileAttributes(turServer, getRequestEntity(url), String.format(API_OCR_URL, turServer.getServerUrl()), diff --git a/turing-java-sdk/src/main/java/com/viglet/turing/client/ocr/sample/TurClientOcrSample.java b/turing-java-sdk/src/main/java/com/viglet/turing/client/ocr/sample/TurClientOcrSample.java index 58e5479ee77..b9cb34e36af 100644 --- a/turing-java-sdk/src/main/java/com/viglet/turing/client/ocr/sample/TurClientOcrSample.java +++ b/turing-java-sdk/src/main/java/com/viglet/turing/client/ocr/sample/TurClientOcrSample.java @@ -44,7 +44,8 @@ public static void main(String[] args) { TurServer turSNServer = new TurServer(URI.create(TURING_URL).toURL(), new TurApiKeyCredentials(TURING_API_KEY)); log.info("--- Ocr Url"); - TurFileAttributes turFileAttributes = TurOcr.processUrl(turSNServer, URI.create(URL).toURL(), + TurOcr turOcr = new TurOcr(); + TurFileAttributes turFileAttributes = turOcr.processUrl(turSNServer, URI.create(URL).toURL(), true); log.info(turFileAttributes.toString()); } catch (MalformedURLException e) { From f5a42c4cb415a5e43bf5ec8731a6045c9deb7be5 Mon Sep 17 00:00:00 2001 From: Alexandre Oliveira Date: Tue, 17 Sep 2024 11:01:25 -0300 Subject: [PATCH 08/10] - OCR Sample: Parameters --- .../client/ocr/sample/TurClientOcrSample.java | 31 ++++++++++--------- 1 file changed, 17 insertions(+), 14 deletions(-) diff --git a/turing-java-sdk/src/main/java/com/viglet/turing/client/ocr/sample/TurClientOcrSample.java b/turing-java-sdk/src/main/java/com/viglet/turing/client/ocr/sample/TurClientOcrSample.java index b9cb34e36af..31ffa3267b6 100644 --- a/turing-java-sdk/src/main/java/com/viglet/turing/client/ocr/sample/TurClientOcrSample.java +++ b/turing-java-sdk/src/main/java/com/viglet/turing/client/ocr/sample/TurClientOcrSample.java @@ -35,21 +35,24 @@ */ @Slf4j public class TurClientOcrSample { - private static final String TURING_URL = "http://localhost:2700"; - private static final String TURING_API_KEY = "9c29eb9697a642349ddedcec9"; - public static final String URL = "https://www.princexml.com/samples/invoice/invoicesample.pdf"; - public static void main(String[] args) { - try { - TurServer turSNServer = new TurServer(URI.create(TURING_URL).toURL(), - new TurApiKeyCredentials(TURING_API_KEY)); - log.info("--- Ocr Url"); - TurOcr turOcr = new TurOcr(); - TurFileAttributes turFileAttributes = turOcr.processUrl(turSNServer, URI.create(URL).toURL(), - true); - log.info(turFileAttributes.toString()); - } catch (MalformedURLException e) { - log.error(e.getMessage(), e); + if (args.length == 3) { + String turingUrl = args[0]; + String apiKey = args[1]; + String fileUrl = args[2]; + try { + TurServer turSNServer = new TurServer(URI.create(turingUrl).toURL(), + new TurApiKeyCredentials(apiKey)); + log.info("--- Ocr Url"); + TurOcr turOcr = new TurOcr(); + TurFileAttributes turFileAttributes = turOcr.processUrl(turSNServer, URI.create(fileUrl).toURL(), + true); + log.info(turFileAttributes.toString()); + } catch (MalformedURLException e) { + log.error(e.getMessage(), e); + } + } else { + log.info("Parameters: turingUrl apiKey fileUrl"); } } } From b095d13b72a8e3709c2dcf50e29d143e61a5f9c7 Mon Sep 17 00:00:00 2001 From: Alexandre Oliveira Date: Tue, 17 Sep 2024 11:18:24 -0300 Subject: [PATCH 09/10] - OCR: isValidUrl --- turing-filesystem/fs-commons/pom.xml | 5 +++ .../filesystem/commons/TurFileUtils.java | 43 ++++++++++++------- 2 files changed, 32 insertions(+), 16 deletions(-) diff --git a/turing-filesystem/fs-commons/pom.xml b/turing-filesystem/fs-commons/pom.xml index 0d5aa91b53a..0560e0ff81f 100644 --- a/turing-filesystem/fs-commons/pom.xml +++ b/turing-filesystem/fs-commons/pom.xml @@ -87,5 +87,10 @@ spring-web 6.1.13 + + commons-validator + commons-validator + 1.9.0 + diff --git a/turing-filesystem/fs-commons/src/main/java/com/viglet/turing/filesystem/commons/TurFileUtils.java b/turing-filesystem/fs-commons/src/main/java/com/viglet/turing/filesystem/commons/TurFileUtils.java index b8b5961d8b1..e59b1f4c1eb 100644 --- a/turing-filesystem/fs-commons/src/main/java/com/viglet/turing/filesystem/commons/TurFileUtils.java +++ b/turing-filesystem/fs-commons/src/main/java/com/viglet/turing/filesystem/commons/TurFileUtils.java @@ -6,6 +6,7 @@ import lombok.extern.slf4j.Slf4j; import org.apache.commons.io.FileUtils; import org.apache.commons.io.FilenameUtils; +import org.apache.commons.validator.routines.UrlValidator; import org.apache.tika.exception.TikaException; import org.apache.tika.extractor.EmbeddedDocumentExtractor; import org.apache.tika.metadata.Metadata; @@ -130,28 +131,38 @@ public static TurFileAttributes urlContentToText(URL url) { private static Date getLastModified(URL url) { Date date = new Date(); - try { - HttpURLConnection httpUrlConnection = (HttpURLConnection) url.openConnection(); - httpUrlConnection.setRequestMethod("HEAD"); - date = new Date(httpUrlConnection.getLastModified()); - httpUrlConnection.disconnect(); - } catch (IOException e) { - log.error(e.getMessage(), e); + + if (isValidUrl(url)) { + try { + HttpURLConnection httpUrlConnection = (HttpURLConnection) url.openConnection(); + httpUrlConnection.setRequestMethod("HEAD"); + date = new Date(httpUrlConnection.getLastModified()); + httpUrlConnection.disconnect(); + } catch (IOException e) { + log.error(e.getMessage(), e); + } } return date; } + private static boolean isValidUrl(URL url) { + UrlValidator urlValidator = new UrlValidator(); + return urlValidator.isValid(url.getPath()); + } + private static File getFile(URL url) { File tempFile = null; - try { - tempFile = createTempFile(); - FileUtils.copyURLToFile( - url, - tempFile, - 5000, - 5000); - } catch (IOException e) { - log.error(e.getMessage(), e); + if (isValidUrl(url)) { + try { + tempFile = createTempFile(); + FileUtils.copyURLToFile( + url, + tempFile, + 5000, + 5000); + } catch (IOException e) { + log.error(e.getMessage(), e); + } } return tempFile; } From 608d38312abc0454b8e252f5d6f9fa4cac1d3bcb Mon Sep 17 00:00:00 2001 From: Alexandre Oliveira Date: Tue, 17 Sep 2024 11:26:50 -0300 Subject: [PATCH 10/10] - OCR: TurFileAttributes Builder --- .../commons/file/TurFileAttributes.java | 3 ++ .../filesystem/commons/TurFileUtils.java | 41 ++++++++++--------- 2 files changed, 25 insertions(+), 19 deletions(-) diff --git a/turing-commons/src/main/java/com/viglet/turing/commons/file/TurFileAttributes.java b/turing-commons/src/main/java/com/viglet/turing/commons/file/TurFileAttributes.java index e929628d394..009c1011d4d 100644 --- a/turing-commons/src/main/java/com/viglet/turing/commons/file/TurFileAttributes.java +++ b/turing-commons/src/main/java/com/viglet/turing/commons/file/TurFileAttributes.java @@ -30,6 +30,9 @@ * **/ +@Builder(toBuilder = true) +@NoArgsConstructor +@AllArgsConstructor @Setter @Getter @ToString diff --git a/turing-filesystem/fs-commons/src/main/java/com/viglet/turing/filesystem/commons/TurFileUtils.java b/turing-filesystem/fs-commons/src/main/java/com/viglet/turing/filesystem/commons/TurFileUtils.java index e59b1f4c1eb..8e33c4445b3 100644 --- a/turing-filesystem/fs-commons/src/main/java/com/viglet/turing/filesystem/commons/TurFileUtils.java +++ b/turing-filesystem/fs-commons/src/main/java/com/viglet/turing/filesystem/commons/TurFileUtils.java @@ -34,6 +34,9 @@ public class TurFileUtils { public static final String PDF_DOC_INFO_TITLE = "pdf:docinfo:title"; + public static final int CONNECTION_TIMEOUT_MILLIS = 5000; + public static final String TMP = "tmp"; + public static final String HEAD = "HEAD"; private TurFileUtils() { throw new IllegalStateException("Turing File Utilities class"); @@ -135,7 +138,7 @@ private static Date getLastModified(URL url) { if (isValidUrl(url)) { try { HttpURLConnection httpUrlConnection = (HttpURLConnection) url.openConnection(); - httpUrlConnection.setRequestMethod("HEAD"); + httpUrlConnection.setRequestMethod(HEAD); date = new Date(httpUrlConnection.getLastModified()); httpUrlConnection.disconnect(); } catch (IOException e) { @@ -158,8 +161,8 @@ private static File getFile(URL url) { FileUtils.copyURLToFile( url, tempFile, - 5000, - 5000); + CONNECTION_TIMEOUT_MILLIS, + CONNECTION_TIMEOUT_MILLIS); } catch (IOException e) { log.error(e.getMessage(), e); } @@ -172,24 +175,24 @@ private static TurFileAttributes getTurFileAttributes(TurTikaFileAttributes file String fileExtension, long fileSize, Date lastModified) { - TurFileAttributes turFileAttributes = new TurFileAttributes(); - Optional.ofNullable(file).ifPresent(attributes -> { - turFileAttributes.setContent(attributes.getContent()); - turFileAttributes.setName(fileName); - turFileAttributes.setExtension(fileExtension); - turFileAttributes.setSize(new TurFileSize(fileSize)); - turFileAttributes.setTitle(getTitle(file, turFileAttributes)); - turFileAttributes.setLastModified(lastModified); - turFileAttributes.setMetadata(getMetadataMap(file)); - }); - return turFileAttributes; - } - - private static String getTitle(TurTikaFileAttributes file, TurFileAttributes turFileAttributes) { + return Optional.ofNullable(file).map(attributes -> + TurFileAttributes.builder() + .content(attributes.getContent()) + .name(fileName) + .extension(fileExtension) + .size(new TurFileSize(fileSize)) + .title(getTitle(file, fileName)) + .lastModified(lastModified) + .metadata(getMetadataMap(file)) + .build()) + .orElseGet(TurFileAttributes::new); + } + + private static String getTitle(TurTikaFileAttributes file, String fileName) { return Optional.ofNullable(file .getMetadata() .get(PDF_DOC_INFO_TITLE)) - .orElseGet(turFileAttributes::getName); + .orElse(fileName); } private static Map getMetadataMap(TurTikaFileAttributes file) { @@ -225,7 +228,7 @@ public static Optional getFileContent(InputStream stream, BodyContentHan private static File createTempFile() throws IOException { return File.createTempFile(UUID.randomUUID().toString(), null, - TurCommonsUtils.addSubDirToStoreDir("tmp")); + TurCommonsUtils.addSubDirToStoreDir(TMP)); }