Skip to content
Merged
Show file tree
Hide file tree
Changes from 5 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,7 @@
import java.net.http.HttpRequest;
import java.net.http.HttpResponse;
import java.util.List;
import java.util.Objects;
import java.util.Optional;

@Slf4j
Expand Down Expand Up @@ -98,8 +99,7 @@ public RedactionScript validateText(@RequestParam("text") String text)
@PostMapping(value = "/entity/file/blazon")
public RedactionScript validateFile(@RequestParam("file") MultipartFile multipartFile)
throws IOException, InterruptedException {
final String text = TurFileUtils.documentToText(multipartFile);
return getEntities(text);
return getEntities(Objects.requireNonNull(TurFileUtils.documentToText(multipartFile)).getContent());

}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,7 @@

import java.util.Arrays;
import java.util.List;
import java.util.Objects;

@Slf4j
@RestController
Expand Down Expand Up @@ -104,7 +105,7 @@ public TurData turDataAdd(@RequestBody TurData turData) {
@Transactional
public String turDataImport(@RequestParam("file") MultipartFile multipartFile) {
String[] sentences = turOpenNLPConnector.sentenceDetect(turNLPProcess.getDefaultNLPInstance(),
TurFileUtils.documentToText(multipartFile));
Objects.requireNonNull(TurFileUtils.documentToText(multipartFile)).getContent());
TurData turData = new TurData();
turData.setName(multipartFile.getOriginalFilename());
turData.setType(FilenameUtils.getExtension(multipartFile.getOriginalFilename()));
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -27,9 +27,8 @@
import com.viglet.turing.api.nlp.bean.TurNLPEntityValidateResponse;
import com.viglet.turing.api.nlp.bean.TurNLPValidateDocument;
import com.viglet.turing.api.nlp.bean.TurNLPValidateResponse;
import com.viglet.turing.commons.utils.TurCommonsUtils;
import com.viglet.turing.filesystem.commons.TurFileAttributes;
import com.viglet.turing.filesystem.commons.TurFileUtils;
import com.viglet.turing.filesystem.commons.TurTikaFileAttributes;
import com.viglet.turing.nlp.TurNLPProcess;
import com.viglet.turing.nlp.TurNLPResponse;
import com.viglet.turing.nlp.TurNLPUtils;
Expand All @@ -54,6 +53,7 @@
import java.io.File;
import java.util.Collections;
import java.util.List;
import java.util.Objects;
import java.util.Optional;
import java.util.stream.Collectors;

Expand Down Expand Up @@ -134,7 +134,7 @@ public TurNLPInstance turNLPInstanceAdd(@RequestBody TurNLPInstance turNLPInstan

@PostMapping(value = "/{id}/validate/file/blazon", produces = MediaType.APPLICATION_XML_VALUE)
public RedactionScript validateFile(@RequestParam("file") MultipartFile multipartFile, @PathVariable String id) {
final String text = TurFileUtils.documentToText(multipartFile);
final String text = Objects.requireNonNull(TurFileUtils.documentToText(multipartFile)).getContent();
TurNLPTextValidate textValidate = new TurNLPTextValidate();
textValidate.setText(text);
return this.turNLPInstanceRepository.findById(id).map(turNLPInstance ->
Expand All @@ -149,18 +149,18 @@ public TurNLPValidateResponse validateDocument(@PathVariable String id,
@RequestParam("config") String turNLPValidateDocumentRequest) {

File file = TurSpringUtils.getFileFromMultipart(multipartFile);
TurFileAttributes turFileAttributes = TurFileUtils.readFile(file);
TurTikaFileAttributes turTikaFileAttributes = TurFileUtils.readFile(file);
return this.turNLPInstanceRepository.findById(id)
.map(turNLPInstance -> {
try {
TurNLPValidateDocument turNLPValidateDocument = new ObjectMapper().readValue(turNLPValidateDocumentRequest,
TurNLPValidateDocument.class);
if (turFileAttributes != null && turNLPValidateDocument != null) {
if (turTikaFileAttributes != null && turNLPValidateDocument != null) {
TurNLPResponse turNLPResponse = turNLPProcess.processTextByNLP(turNLPInstance,
turFileAttributes.getContent(), turNLPValidateDocument.getEntities());
turTikaFileAttributes.getContent(), turNLPValidateDocument.getEntities());
List<String> terms = getNLPTerms(turNLPResponse);
turNLPUtils.redactPdf(file, terms);
return createNLPValidateResponse(turNLPInstance, turNLPResponse, turFileAttributes.getContent());
return createNLPValidateResponse(turNLPInstance, turNLPResponse, turTikaFileAttributes.getContent());
}
} catch (JsonProcessingException e) {
log.error(e.getMessage(), e);
Expand Down
45 changes: 39 additions & 6 deletions turing-app/src/main/java/com/viglet/turing/api/ocr/TurOcrAPI.java
Original file line number Diff line number Diff line change
@@ -1,22 +1,55 @@
/*
*
* Copyright (C) 2016-2024 the original author or authors.
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <https://www.gnu.org/licenses/>.
*/

package com.viglet.turing.api.ocr;

import com.viglet.turing.commons.file.TurFileAttributes;
import com.viglet.turing.filesystem.commons.TurFileUtils;
import io.swagger.v3.oas.annotations.tags.Tag;
import lombok.extern.slf4j.Slf4j;
import org.springframework.web.bind.annotation.PostMapping;
import org.springframework.web.bind.annotation.RequestMapping;
import org.springframework.web.bind.annotation.RequestParam;
import org.springframework.web.bind.annotation.RestController;
import org.springframework.web.bind.annotation.*;
import org.springframework.web.multipart.MultipartFile;
import java.net.MalformedURLException;
import java.net.URI;

/**
* @author Alexandre Oliveira
* @since 0.3.9
*/
@Slf4j
@RestController
@RequestMapping("/api/ocr")
@Tag(name = "OCR", description = "OCR API")
public class TurOcrAPI {

@PostMapping
public String convertToText(@RequestParam("file") MultipartFile multipartFile) {
/**
 * Converts an uploaded document into its file attributes.
 *
 * <p>The work is delegated entirely to {@code TurFileUtils.documentToText}; the
 * result is handed back to the client unchanged.</p>
 *
 * @param multipartFile the uploaded document, bound from the {@code file} request parameter
 * @return the value produced by {@code TurFileUtils.documentToText} for the upload
 */
@PostMapping("/file")
public TurFileAttributes fileToText(@RequestParam("file") MultipartFile multipartFile) {
    // NOTE(review): other callers in this change wrap documentToText in
    // Objects.requireNonNull, so this may be null — confirm how that should be reported here.
    final TurFileAttributes extracted = TurFileUtils.documentToText(multipartFile);
    return extracted;
}

/**
 * Converts the content found at a caller-supplied URL into its file attributes.
 *
 * <p>A URL that cannot be used never escapes as an exception: a missing or blank
 * value, a string that is not a valid URI, a URI that is not absolute, and a URI
 * whose protocol is unknown all produce an empty {@link TurFileAttributes}, which
 * is the same fallback this endpoint already used for {@link MalformedURLException}.</p>
 *
 * @param turOcrFromUrl request body carrying the URL to read
 * @return the value produced by {@code TurFileUtils.urlContentToText}, or an empty
 *         {@link TurFileAttributes} when the URL is unusable
 */
@PostMapping("/url")
public TurFileAttributes urlToText(@RequestBody TurOcrFromUrl turOcrFromUrl) {
    final String url = turOcrFromUrl.getUrl();
    if (url == null || url.isBlank()) {
        // URI.create(null) would throw NullPointerException; treat it like any other bad URL.
        log.error("OCR from URL requested without a URL");
        return new TurFileAttributes();
    }
    // NOTE(review): the URL is taken straight from the request body. Confirm that
    // TurFileUtils.urlContentToText restricts schemes and hosts (e.g. file:, internal
    // addresses) before this endpoint is reachable by untrusted callers.
    try {
        return TurFileUtils.urlContentToText(URI.create(url).toURL());
    } catch (MalformedURLException | IllegalArgumentException e) {
        // URI.create rejects invalid syntax and URI.toURL rejects non-absolute URIs,
        // both with IllegalArgumentException; toURL reports unknown protocols with
        // MalformedURLException.
        log.error(e.getMessage(), e);
    }
    return new TurFileAttributes();
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
/*
*
* Copyright (C) 2016-2024 the original author or authors.
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <https://www.gnu.org/licenses/>.
*/

package com.viglet.turing.api.ocr;

import lombok.Getter;
import lombok.Setter;

/**
 * Request body of the OCR {@code /url} endpoint: carries the URL whose content
 * should be converted to text. Accessors are generated by Lombok.
 *
 * @author Alexandre Oliveira
 * @since 0.3.9
 */
@Setter
@Getter
public class TurOcrFromUrl {
    /** URL supplied by the caller, exactly as received. */
    private String url;
}
Original file line number Diff line number Diff line change
Expand Up @@ -9,8 +9,8 @@
@Setter
@Builder
public class TurSolrCoreExists {
String name;
boolean exists;
private String name;
private boolean exists;

@Tolerate
public TurSolrCoreExists() {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -27,8 +27,8 @@
import com.viglet.turing.client.sn.job.TurSNJobItem;
import com.viglet.turing.client.sn.job.TurSNJobItems;
import com.viglet.turing.commons.utils.TurCommonsUtils;
import com.viglet.turing.filesystem.commons.TurFileAttributes;
import com.viglet.turing.filesystem.commons.TurFileUtils;
import com.viglet.turing.filesystem.commons.TurTikaFileAttributes;
import com.viglet.turing.persistence.repository.sn.TurSNSiteRepository;
import com.viglet.turing.sn.TurSNConstants;
import com.viglet.turing.spring.utils.TurSpringUtils;
Expand Down Expand Up @@ -108,16 +108,11 @@ public boolean turSNImportZipFileBroker(@RequestParam("file") MultipartFile mult
private void extractTextOfFileAttribute(File extractFolder, Map.Entry<String, Object> attribute) {
if (attribute.getValue().toString().startsWith(TurSNConstants.FILE_PROTOCOL)) {
String fileName = attribute.getValue().toString().replace(TurSNConstants.FILE_PROTOCOL, "");
try (FileInputStream fileInputStreamAttribute = new FileInputStream(
extractFolder.getAbsolutePath() + File.separator + fileName)) {
TurFileAttributes turFileAttributes = TurFileUtils.parseFile(fileInputStreamAttribute, null);
Optional.ofNullable(turFileAttributes)
.map(TurFileAttributes::getContent)
.ifPresent(content -> attribute.setValue(TurCommonsUtils.cleanTextContent(content)));
} catch (IOException e) {
log.error(e.getMessage(), e);
}

File file = new File(extractFolder.getAbsolutePath().concat(File.separator).concat(fileName));
TurTikaFileAttributes turTikaFileAttributes = TurFileUtils.parseFile(file);
Optional.ofNullable(turTikaFileAttributes)
.map(TurTikaFileAttributes::getContent)
.ifPresent(content -> attribute.setValue(TurCommonsUtils.cleanTextContent(content)));
}
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -85,7 +85,7 @@ SecurityFilterChain filterChain(HttpSecurity http, MvcRequestMatcher.Builder mvc
mvc.pattern("/error/**"),
mvc.pattern("/logout"),
mvc.pattern("/api/nlp/**"),
mvc.pattern("/api/ocr"),
mvc.pattern("/api/ocr/**"),
mvc.pattern("/api/llm/**"),
mvc.pattern("/api/v2/guest/**"),
AntPathRequestMatcher.antMatcher("/h2/**")))
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
/*
*
* Copyright (C) 2016-2024 the original author or authors.
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <https://www.gnu.org/licenses/>.
*/
package com.viglet.turing.commons.file;

import lombok.*;

import java.util.Date;
import java.util.Map;

/**
 * Mutable data holder describing a file: its content, naming information, size,
 * last-modified date and a string-to-string metadata map.
 *
 * <p>Lombok generates a getter and a setter for every field and a
 * {@code toString()} that lists the fields in declaration order, so the order
 * below is part of the printed form.</p>
 *
 * @author Alexandre Oliveira
 * @since 0.3.9
 */
@Getter
@Setter
@ToString
public class TurFileAttributes {
    private String content;
    private String name;
    private String title;
    private String extension;
    private TurFileSize size;
    private Date lastModified;
    private Map<String, String> metadata;
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
/*
*
* Copyright (C) 2016-2024 the original author or authors.
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <https://www.gnu.org/licenses/>.
*/

package com.viglet.turing.commons.file;

import lombok.*;

import java.math.BigDecimal;
import java.math.RoundingMode;

/**
 * A file size expressed in bytes, kilobytes and megabytes, each rounded to two
 * decimal places (half-up).
 *
 * <p>The single-argument constructor derives the larger units step by step:
 * kilobytes come from the already rounded byte value and megabytes from the
 * already rounded kilobyte value.</p>
 *
 * @author Alexandre Oliveira
 * @since 0.3.9
 **/
@Getter
@Setter
@ToString
@AllArgsConstructor
public class TurFileSize {
    private final float bytes;
    private final float kiloBytes;
    private final float megaBytes;

    /** Creates a size of zero in every unit. */
    public TurFileSize() {
        this(0f);
    }

    /**
     * Creates a size from a byte count and derives the kilobyte and megabyte values.
     *
     * @param bytes the size in bytes
     */
    public TurFileSize(float bytes) {
        final float roundedBytes = roundToTwoDecimals(bytes);
        final float roundedKiloBytes = roundToTwoDecimals(roundedBytes / 1024);
        this.bytes = roundedBytes;
        this.kiloBytes = roundedKiloBytes;
        this.megaBytes = roundToTwoDecimals(roundedKiloBytes / 1024);
    }

    /** Rounds the value to two decimal places using {@link RoundingMode#HALF_UP}. */
    private static float roundToTwoDecimals(float value) {
        final BigDecimal exact = new BigDecimal(value);
        final BigDecimal scaled = exact.setScale(2, RoundingMode.HALF_UP);
        return scaled.floatValue();
    }
}
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@
import com.viglet.turing.commons.cache.TurCustomClassCache;
import com.viglet.turing.connector.db.ext.TurDbExtCustomImpl;
import com.viglet.turing.connector.db.format.TurDbFormatValue;
import com.viglet.turing.filesystem.commons.TurFileAttributes;
import com.viglet.turing.filesystem.commons.TurTikaFileAttributes;
import com.viglet.turing.filesystem.commons.TurFileUtils;
import lombok.Getter;
import lombok.Setter;
Expand Down Expand Up @@ -302,23 +302,23 @@ private void addDBFieldsAsAttributes(ResultSet rs, Map<String, Object> attribute

private void addFileAttributes(Map<String, Object> attributes) {
if (filePathField != null && attributes.containsKey(filePathField)) {
TurFileAttributes turFileAttributes = TurFileUtils.readFile((String) attributes.get(filePathField));
if (turFileAttributes != null) {
addFileSizeAttribute(attributes, turFileAttributes);
addFileContentAttribute(attributes, turFileAttributes);
TurTikaFileAttributes turTikaFileAttributes = TurFileUtils.readFile((String) attributes.get(filePathField));
if (turTikaFileAttributes != null) {
addFileSizeAttribute(attributes, turTikaFileAttributes);
addFileContentAttribute(attributes, turTikaFileAttributes);
}
}
}

private void addFileContentAttribute(Map<String, Object> attributes, TurFileAttributes turFileAttributes) {
private void addFileContentAttribute(Map<String, Object> attributes, TurTikaFileAttributes turTikaFileAttributes) {
if (fileContentField != null) {
long maxContentByteSize = maxContentMegaByteSize * MEGA_BYTE;

if (turFileAttributes.getContent().getBytes().length <= maxContentByteSize) {
attributes.put(fileContentField, turFileAttributes.getContent());
if (turTikaFileAttributes.getContent().getBytes().length <= maxContentByteSize) {
attributes.put(fileContentField, turTikaFileAttributes.getContent());
} else {
attributes.put(fileContentField,
turFileAttributes.getContent().substring(0, Math.toIntExact(maxContentByteSize)));
turTikaFileAttributes.getContent().substring(0, Math.toIntExact(maxContentByteSize)));
if (log.isDebugEnabled()) {
log.debug("File size greater than {}, truncating content ...:",
FileUtils.byteCountToDisplaySize(maxContentByteSize));
Expand All @@ -329,14 +329,14 @@ private void addFileContentAttribute(Map<String, Object> attributes, TurFileAttr
}
}

private void addFileSizeAttribute(Map<String, Object> attributes, TurFileAttributes turFileAttributes) {
if (fileSizeField != null && turFileAttributes.getFile() != null) {
attributes.put(fileSizeField, turFileAttributes.getFile().length());
private void addFileSizeAttribute(Map<String, Object> attributes, TurTikaFileAttributes turTikaFileAttributes) {
if (fileSizeField != null && turTikaFileAttributes.getFile() != null) {
attributes.put(fileSizeField, turTikaFileAttributes.getFile().length());
if (log.isDebugEnabled()) {
log.debug("File: {}", turFileAttributes.getFile().getAbsolutePath());
log.debug("File size: {}", FileUtils.byteCountToDisplaySize(turFileAttributes.getFile().length()));
log.debug("File: {}", turTikaFileAttributes.getFile().getAbsolutePath());
log.debug("File size: {}", FileUtils.byteCountToDisplaySize(turTikaFileAttributes.getFile().length()));
log.debug("File - Content size: {}",
FileUtils.byteCountToDisplaySize(turFileAttributes.getContent().getBytes().length));
FileUtils.byteCountToDisplaySize(turTikaFileAttributes.getContent().getBytes().length));
}
} else {
log.debug("File without size: {}", filePathField);
Expand Down
Loading
Loading