diff --git a/turing-app/src/main/java/com/viglet/turing/api/llm/TurLlmAPI.java b/turing-app/src/main/java/com/viglet/turing/api/llm/TurLlmAPI.java index f63cf2849a2..bfb06577f48 100644 --- a/turing-app/src/main/java/com/viglet/turing/api/llm/TurLlmAPI.java +++ b/turing-app/src/main/java/com/viglet/turing/api/llm/TurLlmAPI.java @@ -45,6 +45,7 @@ import java.net.http.HttpRequest; import java.net.http.HttpResponse; import java.util.List; +import java.util.Objects; import java.util.Optional; @Slf4j @@ -98,8 +99,7 @@ public RedactionScript validateText(@RequestParam("text") String text) @PostMapping(value = "/entity/file/blazon") public RedactionScript validateFile(@RequestParam("file") MultipartFile multipartFile) throws IOException, InterruptedException { - final String text = TurFileUtils.documentToText(multipartFile); - return getEntities(text); + return getEntities(Objects.requireNonNull(TurFileUtils.documentToText(multipartFile)).getContent()); } diff --git a/turing-app/src/main/java/com/viglet/turing/api/ml/data/TurMLDataAPI.java b/turing-app/src/main/java/com/viglet/turing/api/ml/data/TurMLDataAPI.java index b6943fc40e5..50634fff4d1 100644 --- a/turing-app/src/main/java/com/viglet/turing/api/ml/data/TurMLDataAPI.java +++ b/turing-app/src/main/java/com/viglet/turing/api/ml/data/TurMLDataAPI.java @@ -40,6 +40,7 @@ import java.util.Arrays; import java.util.List; +import java.util.Objects; @Slf4j @RestController @@ -104,7 +105,7 @@ public TurData turDataAdd(@RequestBody TurData turData) { @Transactional public String turDataImport(@RequestParam("file") MultipartFile multipartFile) { String[] sentences = turOpenNLPConnector.sentenceDetect(turNLPProcess.getDefaultNLPInstance(), - TurFileUtils.documentToText(multipartFile)); + Objects.requireNonNull(TurFileUtils.documentToText(multipartFile)).getContent()); TurData turData = new TurData(); turData.setName(multipartFile.getOriginalFilename()); turData.setType(FilenameUtils.getExtension(multipartFile.getOriginalFilename())); diff --git a/turing-app/src/main/java/com/viglet/turing/api/nlp/TurNLPInstanceAPI.java b/turing-app/src/main/java/com/viglet/turing/api/nlp/TurNLPInstanceAPI.java index 2cc1b63b152..0c44fa3f1d3 100644 --- a/turing-app/src/main/java/com/viglet/turing/api/nlp/TurNLPInstanceAPI.java +++ b/turing-app/src/main/java/com/viglet/turing/api/nlp/TurNLPInstanceAPI.java @@ -27,9 +27,8 @@ import com.viglet.turing.api.nlp.bean.TurNLPEntityValidateResponse; import com.viglet.turing.api.nlp.bean.TurNLPValidateDocument; import com.viglet.turing.api.nlp.bean.TurNLPValidateResponse; -import com.viglet.turing.commons.utils.TurCommonsUtils; -import com.viglet.turing.filesystem.commons.TurFileAttributes; import com.viglet.turing.filesystem.commons.TurFileUtils; +import com.viglet.turing.filesystem.commons.TurTikaFileAttributes; import com.viglet.turing.nlp.TurNLPProcess; import com.viglet.turing.nlp.TurNLPResponse; import com.viglet.turing.nlp.TurNLPUtils; @@ -54,6 +53,7 @@ import java.io.File; import java.util.Collections; import java.util.List; +import java.util.Objects; import java.util.Optional; import java.util.stream.Collectors; @@ -134,7 +134,7 @@ public TurNLPInstance turNLPInstanceAdd(@RequestBody TurNLPInstance turNLPInstan @PostMapping(value = "/{id}/validate/file/blazon", produces = MediaType.APPLICATION_XML_VALUE) public RedactionScript validateFile(@RequestParam("file") MultipartFile multipartFile, @PathVariable String id) { - final String text = TurFileUtils.documentToText(multipartFile); + final String text = Objects.requireNonNull(TurFileUtils.documentToText(multipartFile)).getContent(); TurNLPTextValidate textValidate = new TurNLPTextValidate(); textValidate.setText(text); return this.turNLPInstanceRepository.findById(id).map(turNLPInstance -> @@ -149,18 +149,18 @@ public TurNLPValidateResponse validateDocument(@PathVariable String id, @RequestParam("config") String turNLPValidateDocumentRequest) { File file = TurSpringUtils.getFileFromMultipart(multipartFile); - TurFileAttributes turFileAttributes = TurFileUtils.readFile(file); + TurTikaFileAttributes turTikaFileAttributes = TurFileUtils.readFile(file); return this.turNLPInstanceRepository.findById(id) .map(turNLPInstance -> { try { TurNLPValidateDocument turNLPValidateDocument = new ObjectMapper().readValue(turNLPValidateDocumentRequest, TurNLPValidateDocument.class); - if (turFileAttributes != null && turNLPValidateDocument != null) { + if (turTikaFileAttributes != null && turNLPValidateDocument != null) { TurNLPResponse turNLPResponse = turNLPProcess.processTextByNLP(turNLPInstance, - turFileAttributes.getContent(), turNLPValidateDocument.getEntities()); + turTikaFileAttributes.getContent(), turNLPValidateDocument.getEntities()); List terms = getNLPTerms(turNLPResponse); turNLPUtils.redactPdf(file, terms); - return createNLPValidateResponse(turNLPInstance, turNLPResponse, turFileAttributes.getContent()); + return createNLPValidateResponse(turNLPInstance, turNLPResponse, turTikaFileAttributes.getContent()); } } catch (JsonProcessingException e) { log.error(e.getMessage(), e); diff --git a/turing-app/src/main/java/com/viglet/turing/api/ocr/TurOcrAPI.java b/turing-app/src/main/java/com/viglet/turing/api/ocr/TurOcrAPI.java index 63d23a3a865..4ef1a4821a4 100644 --- a/turing-app/src/main/java/com/viglet/turing/api/ocr/TurOcrAPI.java +++ b/turing-app/src/main/java/com/viglet/turing/api/ocr/TurOcrAPI.java @@ -1,22 +1,55 @@ +/* + * + * Copyright (C) 2016-2024 the original author or authors. + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . + */ + package com.viglet.turing.api.ocr; +import com.viglet.turing.commons.file.TurFileAttributes; import com.viglet.turing.filesystem.commons.TurFileUtils; import io.swagger.v3.oas.annotations.tags.Tag; import lombok.extern.slf4j.Slf4j; -import org.springframework.web.bind.annotation.PostMapping; -import org.springframework.web.bind.annotation.RequestMapping; -import org.springframework.web.bind.annotation.RequestParam; -import org.springframework.web.bind.annotation.RestController; +import org.springframework.web.bind.annotation.*; import org.springframework.web.multipart.MultipartFile; +import java.net.MalformedURLException; +import java.net.URI; +/** + * @author Alexandre Oliveira + * @since 0.3.9 + */ @Slf4j @RestController @RequestMapping("/api/ocr") @Tag(name = "OCR", description = "OCR API") public class TurOcrAPI { - @PostMapping - public String convertToText(@RequestParam("file") MultipartFile multipartFile) { + @PostMapping("/file") + public TurFileAttributes fileToText(@RequestParam("file") MultipartFile multipartFile) { return TurFileUtils.documentToText(multipartFile); } + + @PostMapping("/url") + public TurFileAttributes urlToText(@RequestBody TurOcrFromUrl turOcrFromUrl) { + try { + return TurFileUtils.urlContentToText(URI.create(turOcrFromUrl.getUrl()).toURL()); + } + catch (MalformedURLException e) { + log.error(e.getMessage(), e); + } + return new TurFileAttributes(); + } } diff --git a/turing-app/src/main/java/com/viglet/turing/api/ocr/TurOcrFromUrl.java b/turing-app/src/main/java/com/viglet/turing/api/ocr/TurOcrFromUrl.java new file mode 100644 index 00000000000..e09b52023e5 --- /dev/null +++ b/turing-app/src/main/java/com/viglet/turing/api/ocr/TurOcrFromUrl.java @@ -0,0 +1,32 @@ +/* + * + * Copyright (C) 2016-2024 the original author or authors. + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . + */ + +package com.viglet.turing.api.ocr; + +import lombok.Getter; +import lombok.Setter; + +/** + * @author Alexandre Oliveira + * @since 0.3.9 + */ +@Getter +@Setter +public class TurOcrFromUrl { + private String url; +} diff --git a/turing-app/src/main/java/com/viglet/turing/api/sn/bean/TurSolrCoreExists.java b/turing-app/src/main/java/com/viglet/turing/api/sn/bean/TurSolrCoreExists.java index 99327c66444..a0eaf58a4a8 100644 --- a/turing-app/src/main/java/com/viglet/turing/api/sn/bean/TurSolrCoreExists.java +++ b/turing-app/src/main/java/com/viglet/turing/api/sn/bean/TurSolrCoreExists.java @@ -9,8 +9,8 @@ @Setter @Builder public class TurSolrCoreExists { - String name; - boolean exists; + private String name; + private boolean exists; @Tolerate public TurSolrCoreExists() { diff --git a/turing-app/src/main/java/com/viglet/turing/api/sn/job/TurSNImportAPI.java b/turing-app/src/main/java/com/viglet/turing/api/sn/job/TurSNImportAPI.java index 6801113da07..ac399856698 100644 --- a/turing-app/src/main/java/com/viglet/turing/api/sn/job/TurSNImportAPI.java +++ b/turing-app/src/main/java/com/viglet/turing/api/sn/job/TurSNImportAPI.java @@ -27,8 +27,8 @@ import com.viglet.turing.client.sn.job.TurSNJobItem; import com.viglet.turing.client.sn.job.TurSNJobItems; import com.viglet.turing.commons.utils.TurCommonsUtils; -import com.viglet.turing.filesystem.commons.TurFileAttributes; import com.viglet.turing.filesystem.commons.TurFileUtils; +import com.viglet.turing.filesystem.commons.TurTikaFileAttributes; import com.viglet.turing.persistence.repository.sn.TurSNSiteRepository; import com.viglet.turing.sn.TurSNConstants; import com.viglet.turing.spring.utils.TurSpringUtils; @@ -108,16 +108,11 @@ public boolean turSNImportZipFileBroker(@RequestParam("file") MultipartFile mult private void extractTextOfFileAttribute(File extractFolder, Map.Entry attribute) { if (attribute.getValue().toString().startsWith(TurSNConstants.FILE_PROTOCOL)) { String fileName = attribute.getValue().toString().replace(TurSNConstants.FILE_PROTOCOL, ""); - try (FileInputStream fileInputStreamAttribute = new FileInputStream( - extractFolder.getAbsolutePath() + File.separator + fileName)) { - TurFileAttributes turFileAttributes = TurFileUtils.parseFile(fileInputStreamAttribute, null); - Optional.ofNullable(turFileAttributes) - .map(TurFileAttributes::getContent) - .ifPresent(content -> attribute.setValue(TurCommonsUtils.cleanTextContent(content))); - } catch (IOException e) { - log.error(e.getMessage(), e); - } - + File file = new File(extractFolder.getAbsolutePath().concat(File.separator).concat(fileName)); + TurTikaFileAttributes turTikaFileAttributes = TurFileUtils.parseFile(file); + Optional.ofNullable(turTikaFileAttributes) + .map(TurTikaFileAttributes::getContent) + .ifPresent(content -> attribute.setValue(TurCommonsUtils.cleanTextContent(content))); } } diff --git a/turing-app/src/main/java/com/viglet/turing/spring/security/TurSecurityConfigProduction.java b/turing-app/src/main/java/com/viglet/turing/spring/security/TurSecurityConfigProduction.java index c915c01e14b..96afd00580e 100644 --- a/turing-app/src/main/java/com/viglet/turing/spring/security/TurSecurityConfigProduction.java +++ b/turing-app/src/main/java/com/viglet/turing/spring/security/TurSecurityConfigProduction.java @@ -85,7 +85,7 @@ SecurityFilterChain filterChain(HttpSecurity http, MvcRequestMatcher.Builder mvc mvc.pattern("/error/**"), mvc.pattern("/logout"), mvc.pattern("/api/nlp/**"), - mvc.pattern("/api/ocr"), + mvc.pattern("/api/ocr/**"), mvc.pattern("/api/llm/**"), mvc.pattern("/api/v2/guest/**"), AntPathRequestMatcher.antMatcher("/h2/**"))) diff --git a/turing-commons/src/main/java/com/viglet/turing/commons/file/TurFileAttributes.java b/turing-commons/src/main/java/com/viglet/turing/commons/file/TurFileAttributes.java new file mode 100644 index 00000000000..009c1011d4d --- /dev/null +++ b/turing-commons/src/main/java/com/viglet/turing/commons/file/TurFileAttributes.java @@ -0,0 +1,49 @@ +/* + * + * Copyright (C) 2016-2024 the original author or authors. + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . + */ +package com.viglet.turing.commons.file; + +import lombok.*; + +import java.util.Date; +import java.util.Map; + +/** +* +* @author Alexandre Oliveira +* +* @since 0.3.9 +* +**/ + +@Builder(toBuilder = true) +@NoArgsConstructor +@AllArgsConstructor +@Setter +@Getter +@ToString +public class TurFileAttributes { + private String content; + private String name; + private String title; + private String extension; + private TurFileSize size; + private Date lastModified; + private Map metadata; + + +} diff --git a/turing-commons/src/main/java/com/viglet/turing/commons/file/TurFileSize.java b/turing-commons/src/main/java/com/viglet/turing/commons/file/TurFileSize.java new file mode 100644 index 00000000000..88f8ef9d5c0 --- /dev/null +++ b/turing-commons/src/main/java/com/viglet/turing/commons/file/TurFileSize.java @@ -0,0 +1,51 @@ +/* + * + * Copyright (C) 2016-2024 the original author or authors. + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . + */ + +package com.viglet.turing.commons.file; + +import lombok.*; + +import java.math.BigDecimal; +import java.math.RoundingMode; + +/** + * @author Alexandre Oliveira + * @since 0.3.9 + **/ +@Setter +@Getter +@AllArgsConstructor +@ToString +public class TurFileSize { + private final float bytes; + private final float kiloBytes; + private final float megaBytes; + public TurFileSize() { + this(0f); + } + public TurFileSize(float bytes) { + this.bytes = twoDecimalFloat(bytes); + this.kiloBytes = twoDecimalFloat(this.bytes / 1024); + this.megaBytes = twoDecimalFloat(this.kiloBytes / 1024); + } + + private float twoDecimalFloat(float value) { + return BigDecimal.valueOf(value).setScale(2, RoundingMode.HALF_UP).floatValue(); + } + +} diff --git a/turing-db/db-app/src/main/java/com/viglet/turing/connector/db/TurDbImportTool.java b/turing-db/db-app/src/main/java/com/viglet/turing/connector/db/TurDbImportTool.java index 9bfc5b303f8..6f92e42ae1e 100644 --- a/turing-db/db-app/src/main/java/com/viglet/turing/connector/db/TurDbImportTool.java +++ b/turing-db/db-app/src/main/java/com/viglet/turing/connector/db/TurDbImportTool.java @@ -27,7 +27,7 @@ import com.viglet.turing.commons.cache.TurCustomClassCache; import com.viglet.turing.connector.db.ext.TurDbExtCustomImpl; import com.viglet.turing.connector.db.format.TurDbFormatValue; -import com.viglet.turing.filesystem.commons.TurFileAttributes; +import com.viglet.turing.filesystem.commons.TurTikaFileAttributes; import com.viglet.turing.filesystem.commons.TurFileUtils; import lombok.Getter; import lombok.Setter; @@ -302,23 +302,23 @@ private void addDBFieldsAsAttributes(ResultSet rs, Map attribute private void addFileAttributes(Map attributes) { if (filePathField != null && attributes.containsKey(filePathField)) { - TurFileAttributes turFileAttributes = TurFileUtils.readFile((String) attributes.get(filePathField)); - if (turFileAttributes != null) { - addFileSizeAttribute(attributes, turFileAttributes); - addFileContentAttribute(attributes, turFileAttributes); + TurTikaFileAttributes turTikaFileAttributes = TurFileUtils.readFile((String) attributes.get(filePathField)); + if (turTikaFileAttributes != null) { + addFileSizeAttribute(attributes, turTikaFileAttributes); + addFileContentAttribute(attributes, turTikaFileAttributes); } } } - private void addFileContentAttribute(Map attributes, TurFileAttributes turFileAttributes) { + private void addFileContentAttribute(Map attributes, TurTikaFileAttributes turTikaFileAttributes) { if (fileContentField != null) { long maxContentByteSize = maxContentMegaByteSize * MEGA_BYTE; - if (turFileAttributes.getContent().getBytes().length <= maxContentByteSize) { - attributes.put(fileContentField, turFileAttributes.getContent()); + if (turTikaFileAttributes.getContent().getBytes().length <= maxContentByteSize) { + attributes.put(fileContentField, turTikaFileAttributes.getContent()); } else { attributes.put(fileContentField, - turFileAttributes.getContent().substring(0, Math.toIntExact(maxContentByteSize))); + turTikaFileAttributes.getContent().substring(0, Math.toIntExact(maxContentByteSize))); if (log.isDebugEnabled()) { log.debug("File size greater than {}, truncating content ...:", FileUtils.byteCountToDisplaySize(maxContentByteSize)); @@ -329,14 +329,14 @@ private void addFileContentAttribute(Map attributes, TurFileAttr } } - private void addFileSizeAttribute(Map attributes, TurFileAttributes turFileAttributes) { - if (fileSizeField != null && turFileAttributes.getFile() != null) { - attributes.put(fileSizeField, turFileAttributes.getFile().length()); + private void addFileSizeAttribute(Map attributes, TurTikaFileAttributes turTikaFileAttributes) { + if (fileSizeField != null && turTikaFileAttributes.getFile() != null) { + attributes.put(fileSizeField, turTikaFileAttributes.getFile().length()); if (log.isDebugEnabled()) { - log.debug("File: {}", turFileAttributes.getFile().getAbsolutePath()); - log.debug("File size: {}", FileUtils.byteCountToDisplaySize(turFileAttributes.getFile().length())); + log.debug("File: {}", turTikaFileAttributes.getFile().getAbsolutePath()); + log.debug("File size: {}", FileUtils.byteCountToDisplaySize(turTikaFileAttributes.getFile().length())); log.debug("File - Content size: {}", - FileUtils.byteCountToDisplaySize(turFileAttributes.getContent().getBytes().length)); + FileUtils.byteCountToDisplaySize(turTikaFileAttributes.getContent().getBytes().length)); } } else { log.debug("File without size: {}", filePathField); diff --git a/turing-filesystem/fs-commons/pom.xml b/turing-filesystem/fs-commons/pom.xml index 0d5aa91b53a..0560e0ff81f 100644 --- a/turing-filesystem/fs-commons/pom.xml +++ b/turing-filesystem/fs-commons/pom.xml @@ -87,5 +87,10 @@ spring-web 6.1.13 + + commons-validator + commons-validator + 1.9.0 + diff --git a/turing-filesystem/fs-commons/src/main/java/com/viglet/turing/filesystem/commons/TurFileUtils.java b/turing-filesystem/fs-commons/src/main/java/com/viglet/turing/filesystem/commons/TurFileUtils.java index a5770288b88..8e33c4445b3 100644 --- a/turing-filesystem/fs-commons/src/main/java/com/viglet/turing/filesystem/commons/TurFileUtils.java +++ b/turing-filesystem/fs-commons/src/main/java/com/viglet/turing/filesystem/commons/TurFileUtils.java @@ -1,7 +1,12 @@ package com.viglet.turing.filesystem.commons; +import com.viglet.turing.commons.file.TurFileAttributes; +import com.viglet.turing.commons.file.TurFileSize; import com.viglet.turing.commons.utils.TurCommonsUtils; import lombok.extern.slf4j.Slf4j; +import org.apache.commons.io.FileUtils; +import org.apache.commons.io.FilenameUtils; +import org.apache.commons.validator.routines.UrlValidator; import org.apache.tika.exception.TikaException; import org.apache.tika.extractor.EmbeddedDocumentExtractor; import org.apache.tika.metadata.Metadata; @@ -19,23 +24,29 @@ import java.io.FileInputStream; import java.io.IOException; import java.io.InputStream; +import java.net.HttpURLConnection; +import java.net.URL; import java.nio.file.Files; import java.nio.file.StandardCopyOption; -import java.util.Optional; -import java.util.UUID; +import java.util.*; @Slf4j public class TurFileUtils { + public static final String PDF_DOC_INFO_TITLE = "pdf:docinfo:title"; + public static final int CONNECTION_TIMEOUT_MILLIS = 5000; + public static final String TMP = "tmp"; + public static final String HEAD = "HEAD"; + private TurFileUtils() { throw new IllegalStateException("Turing File Utilities class"); } - public static TurFileAttributes readFile(String filePath) { + public static TurTikaFileAttributes readFile(String filePath) { return readFile(new File(filePath)); } - public static TurFileAttributes readFile(File file) { + public static TurTikaFileAttributes readFile(File file) { if (file.exists()) { return parseFile(file); } else { @@ -44,44 +55,51 @@ public static TurFileAttributes readFile(File file) { } } - public static TurFileAttributes parseFile(File file) { - try (InputStream fileInputStreamAttribute = new FileInputStream(file)) { - return parseFile(fileInputStreamAttribute, file); - + public static TurTikaFileAttributes parseFile(File file) { + try (InputStream inputStream = new FileInputStream(file)) { + return getTurTikaFileAttributes(file, inputStream); } catch (IOException e) { log.error(e.getMessage(), e); } return null; } - public static TurFileAttributes parseFile(InputStream inputStream, File file) { - try (inputStream) { - StringBuilder contentFile = new StringBuilder(); - AutoDetectParser parser = new AutoDetectParser(); - // -1 = no limit of number of characters - BodyContentHandler handler = new BodyContentHandler(-1); - Metadata metadata = new Metadata(); - EmbeddedDocumentExtractor embeddedDocumentExtractor = new EmbeddedDocumentExtractor() { - @Override - public boolean shouldParseEmbedded(Metadata metadata) { - return true; - } - - @Override - public void parseEmbedded(InputStream stream, ContentHandler handler, Metadata metadata, - boolean outputHtml) throws IOException { - parseDocument(stream).ifPresent(contentFile::append); - - } - }; - final ParseContext parseContext = getParseContext(parser); - parseContext.set(EmbeddedDocumentExtractor.class, embeddedDocumentExtractor); + private static TurTikaFileAttributes getTurTikaFileAttributes(File file, InputStream inputStream) { + StringBuilder contentFile = new StringBuilder(); + AutoDetectParser parser = new AutoDetectParser(); + // -1 = no limit of number of characters + BodyContentHandler handler = new BodyContentHandler(-1); + Metadata metadata = new Metadata(); + EmbeddedDocumentExtractor embeddedDocumentExtractor = new EmbeddedDocumentExtractor() { + @Override + public boolean shouldParseEmbedded(Metadata metadata) { + return true; + } + + @Override + public void parseEmbedded(InputStream stream, ContentHandler handler, Metadata metadata, + boolean outputHtml) throws IOException { + parseDocument(stream).ifPresent(contentFile::append); + + } + }; + ParseContext parseContext = getParseContext(parser); + parseContext.set(EmbeddedDocumentExtractor.class, embeddedDocumentExtractor); + try { parser.parse(inputStream, handler, metadata, parseContext); - contentFile.append(handler); - return new TurFileAttributes(file, contentFile.toString(), metadata); } catch (IOException | SAXException | TikaException e) { log.error(e.getMessage(), e); } + contentFile.append(handler); + return new TurTikaFileAttributes(file, contentFile.toString(), metadata); + } + + public static TurTikaFileAttributes parseFile(MultipartFile multipartFile) { + try (InputStream inputStream = multipartFile.getInputStream()) { + return getTurTikaFileAttributes(null, inputStream); + } catch (IOException e) { + log.error(e.getMessage(), e); + } return null; } @@ -96,16 +114,92 @@ private static ParseContext getParseContext(AutoDetectParser parser) { return parseContext; } - public static String documentToText(MultipartFile multipartFile) { - try (InputStream inputStream = multipartFile.getInputStream()) { - TurFileAttributes turFileAttributes = parseFile(inputStream, null); - return Optional.ofNullable(turFileAttributes) - .map(TurFileAttributes::getContent) - .orElse(null); - } catch (IOException e) { - log.error(e.getMessage(), e); + public static TurFileAttributes documentToText(MultipartFile multipartFile) { + return getTurFileAttributes(parseFile(multipartFile), + multipartFile.getOriginalFilename(), + FilenameUtils.getExtension(multipartFile.getOriginalFilename()), + multipartFile.getSize(), + new Date()); + } + + public static TurFileAttributes urlContentToText(URL url) { + File file = getFile(url); + file.deleteOnExit(); + return getTurFileAttributes(parseFile(file), + FilenameUtils.getName(url.getPath()), + FilenameUtils.getExtension(url.getPath()), + file.length(), + getLastModified(url)); + } + + private static Date getLastModified(URL url) { + Date date = new Date(); + + if (isValidUrl(url)) { + try { + HttpURLConnection httpUrlConnection = (HttpURLConnection) url.openConnection(); + httpUrlConnection.setRequestMethod(HEAD); + date = new Date(httpUrlConnection.getLastModified()); + httpUrlConnection.disconnect(); + } catch (IOException e) { + log.error(e.getMessage(), e); + } } - return null; + return date; + } + + private static boolean isValidUrl(URL url) { + UrlValidator urlValidator = new UrlValidator(); + return urlValidator.isValid(url.getPath()); + } + + private static File getFile(URL url) { + File tempFile = null; + if (isValidUrl(url)) { + try { + tempFile = createTempFile(); + FileUtils.copyURLToFile( + url, + tempFile, + CONNECTION_TIMEOUT_MILLIS, + CONNECTION_TIMEOUT_MILLIS); + } catch (IOException e) { + log.error(e.getMessage(), e); + } + } + return tempFile; + } + + private static TurFileAttributes getTurFileAttributes(TurTikaFileAttributes file, + String fileName, + String fileExtension, + long fileSize, + Date lastModified) { + return Optional.ofNullable(file).map(attributes -> + TurFileAttributes.builder() + .content(attributes.getContent()) + .name(fileName) + .extension(fileExtension) + .size(new TurFileSize(fileSize)) + .title(getTitle(file, fileName)) + .lastModified(lastModified) + .metadata(getMetadataMap(file)) + .build()) + .orElseGet(TurFileAttributes::new); + } + + private static String getTitle(TurTikaFileAttributes file, String fileName) { + return Optional.ofNullable(file + .getMetadata() + .get(PDF_DOC_INFO_TITLE)) + .orElse(fileName); + } + + private static Map getMetadataMap(TurTikaFileAttributes file) { + Map metadataMap = new HashMap<>(); + Arrays.stream(file.getMetadata().names()).forEach(name -> + metadataMap.put(name, file.getMetadata().get(name))); + return metadataMap; } public static Optional parseDocument(InputStream stream) throws IOException { @@ -119,8 +213,7 @@ public static Optional parseDocument(InputStream stream) throws IOExcept public static Optional getFileContent(InputStream stream, BodyContentHandler handlerInner, AutoDetectParser parserInner, Metadata metadataInner, ParseContext parseContextInner) throws IOException { - File tempFile = File.createTempFile(UUID.randomUUID().toString(), null, - TurCommonsUtils.addSubDirToStoreDir("tmp")); + File tempFile = createTempFile(); Files.copy(stream, tempFile.toPath(), StandardCopyOption.REPLACE_EXISTING); try (FileInputStream fileInputStreamInner = new FileInputStream(tempFile)) { parserInner.parse(fileInputStreamInner, handlerInner, metadataInner, parseContextInner); @@ -133,4 +226,10 @@ public static Optional getFileContent(InputStream stream, BodyContentHan return Optional.empty(); } + private static File createTempFile() throws IOException { + return File.createTempFile(UUID.randomUUID().toString(), null, + TurCommonsUtils.addSubDirToStoreDir(TMP)); + } + + } diff --git a/turing-filesystem/fs-commons/src/main/java/com/viglet/turing/filesystem/commons/TurFileAttributes.java b/turing-filesystem/fs-commons/src/main/java/com/viglet/turing/filesystem/commons/TurTikaFileAttributes.java similarity index 90% rename from turing-filesystem/fs-commons/src/main/java/com/viglet/turing/filesystem/commons/TurFileAttributes.java rename to turing-filesystem/fs-commons/src/main/java/com/viglet/turing/filesystem/commons/TurTikaFileAttributes.java index 46031a50127..3720689844e 100644 --- a/turing-filesystem/fs-commons/src/main/java/com/viglet/turing/filesystem/commons/TurFileAttributes.java +++ b/turing-filesystem/fs-commons/src/main/java/com/viglet/turing/filesystem/commons/TurTikaFileAttributes.java @@ -31,12 +31,12 @@ **/ @Setter @Getter -public class TurFileAttributes { +public class TurTikaFileAttributes { private File file; private String content; private Metadata metadata; - public TurFileAttributes(File file, String content, Metadata metadata) { + public TurTikaFileAttributes(File file, String content, Metadata metadata) { super(); this.file = file; this.content = content; diff --git a/turing-filesystem/fs-connector/src/main/java/com/viglet/turing/tool/filesystem/TurFSImportTool.java b/turing-filesystem/fs-connector/src/main/java/com/viglet/turing/tool/filesystem/TurFSImportTool.java index 05d1588576a..936398a20ef 100644 --- a/turing-filesystem/fs-connector/src/main/java/com/viglet/turing/tool/filesystem/TurFSImportTool.java +++ b/turing-filesystem/fs-connector/src/main/java/com/viglet/turing/tool/filesystem/TurFSImportTool.java @@ -153,7 +153,8 @@ private TurSNJobItem createJobItem(File file) { attributes.put(fileSizeField, file.length()); attributes.put("url", fileURL); try { - attributes.put("text", TurOcr.processFile(new TurServer(URI.create(turingServer).toURL(), + TurOcr ocr = new TurOcr(); + attributes.put("text", ocr.processFile(new TurServer(URI.create(turingServer).toURL(), new TurApiKeyCredentials(turingApiKey)), file, showOutput)); } catch (MalformedURLException e) { log.error(e.getMessage(), e); diff --git a/turing-java-sdk/conf/log4j.properties b/turing-java-sdk/conf/log4j.properties new file mode 100644 index 00000000000..b17329c192d --- /dev/null +++ b/turing-java-sdk/conf/log4j.properties @@ -0,0 +1,4 @@ +log4j.rootLogger=debug, stdout +log4j.appender.stdout=org.apache.log4j.ConsoleAppender +log4j.appender.stdout.layout=org.apache.log4j.PatternLayout +log4j.appender.stdout.layout.ConversionPattern=%-4r [%t] %-5p %c %x - %m%n diff --git a/turing-java-sdk/pom.xml b/turing-java-sdk/pom.xml index c85e694a187..82fb97420a7 100644 --- a/turing-java-sdk/pom.xml +++ b/turing-java-sdk/pom.xml @@ -38,7 +38,6 @@ compile - @@ -51,6 +50,24 @@ 11 + + + maven-shade-plugin + 3.6.0 + + + + shade + + none + + turing-java-sdk + true + indexer + + + + diff --git a/turing-java-sdk/src/main/java/com/viglet/turing/client/ocr/TurOcr.java b/turing-java-sdk/src/main/java/com/viglet/turing/client/ocr/TurOcr.java index f8a623683ca..cabdf4f2ef6 100644 --- a/turing-java-sdk/src/main/java/com/viglet/turing/client/ocr/TurOcr.java +++ b/turing-java-sdk/src/main/java/com/viglet/turing/client/ocr/TurOcr.java @@ -1,46 +1,107 @@ package com.viglet.turing.client.ocr; +import com.fasterxml.jackson.databind.ObjectMapper; import com.viglet.turing.client.auth.TurServer; import com.viglet.turing.client.utils.TurClientUtils; +import com.viglet.turing.commons.file.TurFileAttributes; import lombok.extern.slf4j.Slf4j; import org.apache.hc.client5.http.classic.methods.HttpPost; +import org.apache.hc.client5.http.config.ConnectionConfig; import org.apache.hc.client5.http.entity.mime.FileBody; import org.apache.hc.client5.http.entity.mime.MultipartEntityBuilder; import org.apache.hc.client5.http.impl.classic.CloseableHttpClient; import org.apache.hc.client5.http.impl.classic.HttpClients; +import org.apache.hc.client5.http.impl.io.PoolingHttpClientConnectionManager; +import org.apache.hc.client5.http.impl.io.PoolingHttpClientConnectionManagerBuilder; import org.apache.hc.core5.http.ContentType; import org.apache.hc.core5.http.HttpEntity; import org.apache.hc.core5.http.io.entity.EntityUtils; +import org.apache.hc.core5.http.io.entity.StringEntity; +import org.apache.hc.core5.util.TimeValue; +import org.apache.hc.core5.util.Timeout; +import org.json.JSONObject; import java.io.File; import java.io.IOException; +import java.net.URL; @Slf4j public class TurOcr { - private TurOcr() { - throw new IllegalStateException("OCR Utility class"); + + public static final int TIMEOUT_MINUTES = 5; + public static final String API_OCR_URL = "%s/api/ocr/url"; + public static final String API_OCR_FILE = "%s/api/ocr/file"; + public static final String FILE = "file"; + public static final String URL = "url"; + private final PoolingHttpClientConnectionManager pool; + public TurOcr() { + pool = setConnectionManager(); + } + + public TurFileAttributes processFile(TurServer turServer, File file, boolean showOutput) { + return getTurFileAttributes(turServer, + getRequestEntity(file), + String.format(API_OCR_FILE, turServer.getServerUrl()), + showOutput); } - public static String processFile(TurServer turServer, File file, boolean showOutput) { - try (CloseableHttpClient client = HttpClients.createDefault()) { - FileBody fileBody = new FileBody(file, ContentType.DEFAULT_BINARY); - String url = String.format("%s/api/ocr", turServer.getServerUrl()); - HttpPost httpPost = new HttpPost(url); - HttpEntity requestEntity = MultipartEntityBuilder.create().addPart("file", fileBody).build(); - httpPost.setEntity(requestEntity); + private TurFileAttributes getTurFileAttributes(TurServer turServer, HttpEntity requestEntity, + String endpoint, + boolean showOutput) { + + try (CloseableHttpClient client = HttpClients + .custom() + .setConnectionManager(pool) + .build(); + HttpEntity entity = requestEntity + ) { + HttpPost httpPost = new HttpPost(endpoint); + httpPost.setEntity(entity); TurClientUtils.authentication(httpPost, turServer.getApiKey()); String responseBody = client.execute(httpPost, response -> { - log.info("Request Status {} - {}", response.getCode(), url); - HttpEntity entity = response.getEntity(); - return entity != null ? EntityUtils.toString(entity) : null; + log.info("Request Status {} - {}", response.getCode(), endpoint); + HttpEntity responseEntity = response.getEntity(); + return responseEntity != null ? EntityUtils.toString(responseEntity) : null; }); + TurFileAttributes turFileAttributes = new ObjectMapper().readValue(responseBody, TurFileAttributes.class); if (showOutput) { - log.info(responseBody); + log.info(turFileAttributes.toString()); } - return responseBody; + return turFileAttributes; } catch (IOException e) { log.error(e.getMessage(), e); - return null; + return new TurFileAttributes(); } } + + private static PoolingHttpClientConnectionManager setConnectionManager() { + ConnectionConfig config = ConnectionConfig.custom() + .setSocketTimeout(Timeout.ofMinutes(TIMEOUT_MINUTES)) + .setConnectTimeout(Timeout.ofMinutes(TIMEOUT_MINUTES)) + .setTimeToLive(TimeValue.ofMinutes(TIMEOUT_MINUTES)) + .build(); + return PoolingHttpClientConnectionManagerBuilder + .create() + .setDefaultConnectionConfig(config) + .build(); + } + + private static HttpEntity getRequestEntity(File file) { + return MultipartEntityBuilder.create() + .addPart(FILE, new FileBody(file, ContentType.DEFAULT_BINARY)) + .build(); + } + + public TurFileAttributes processUrl(TurServer turServer, URL url, boolean showOutput) { + return getTurFileAttributes(turServer, + getRequestEntity(url), + String.format(API_OCR_URL, turServer.getServerUrl()), + showOutput); + } + + private static StringEntity getRequestEntity(URL url) { + return new StringEntity( + new JSONObject().put(URL, url.toString()).toString(), + ContentType.APPLICATION_JSON); + } } diff --git a/turing-java-sdk/src/main/java/com/viglet/turing/client/ocr/sample/TurClientOcrSample.java b/turing-java-sdk/src/main/java/com/viglet/turing/client/ocr/sample/TurClientOcrSample.java new file mode 100644 index 00000000000..31ffa3267b6 --- /dev/null +++ b/turing-java-sdk/src/main/java/com/viglet/turing/client/ocr/sample/TurClientOcrSample.java @@ -0,0 +1,58 @@ +/* + * + * Copyright (C) 2016-2024 the original author or authors. + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . + */ + +package com.viglet.turing.client.ocr.sample; + +import com.viglet.turing.client.auth.TurServer; +import com.viglet.turing.client.auth.credentials.TurApiKeyCredentials; +import com.viglet.turing.client.ocr.TurOcr; +import com.viglet.turing.commons.file.TurFileAttributes; +import lombok.extern.slf4j.Slf4j; + +import java.net.MalformedURLException; +import java.net.URI; + +/** + * Sample code to use this SDK. + * + * @author Alexandre Oliveira + * @since 0.3.9 + */ +@Slf4j +public class TurClientOcrSample { + public static void main(String[] args) { + if (args.length == 3) { + String turingUrl = args[0]; + String apiKey = args[1]; + String fileUrl = args[2]; + try { + TurServer turSNServer = new TurServer(URI.create(turingUrl).toURL(), + new TurApiKeyCredentials(apiKey)); + log.info("--- Ocr Url"); + TurOcr turOcr = new TurOcr(); + TurFileAttributes turFileAttributes = turOcr.processUrl(turSNServer, URI.create(fileUrl).toURL(), + true); + log.info(turFileAttributes.toString()); + } catch (MalformedURLException e) { + log.error(e.getMessage(), e); + } + } else { + log.info("Parameters: turingUrl apiKey fileUrl"); + } + } +} diff --git a/turing-java-sdk/src/main/java/com/viglet/turing/client/sn/sample/TurSNClientSample.java b/turing-java-sdk/src/main/java/com/viglet/turing/client/sn/sample/TurSNClientSample.java index f10193ad22d..b4d5b2a9aed 100644 --- a/turing-java-sdk/src/main/java/com/viglet/turing/client/sn/sample/TurSNClientSample.java +++ b/turing-java-sdk/src/main/java/com/viglet/turing/client/sn/sample/TurSNClientSample.java @@ -31,19 +31,14 @@ import java.util.List; import java.util.Locale; import java.util.Map.Entry; -import java.util.logging.Level; -import java.util.logging.Logger; /** * Sample code to use this SDK. * * @since 0.3.4 */ - @Slf4j public class TurSNClientSample { - private static final Logger logger = Logger.getLogger(TurSNClientSample.class.getName()); - private static final String TURING_URL = "http://localhost:2700"; private static final String TURING_SITE = "Sample"; private static final Locale TURING_LOCALE = Locale.US; @@ -88,7 +83,7 @@ public static void main(String[] args) { groupBy(args, turSNServer); } catch (MalformedURLException e) { - logger.log(Level.SEVERE, e.getMessage(), e); + log.error(e.getMessage(), e); } }