diff --git a/turing-filesystem/fs-commons/src/main/java/com/viglet/turing/filesystem/commons/TurFileUtils.java b/turing-filesystem/fs-commons/src/main/java/com/viglet/turing/filesystem/commons/TurFileUtils.java index 8e33c4445b3..60c4200de48 100644 --- a/turing-filesystem/fs-commons/src/main/java/com/viglet/turing/filesystem/commons/TurFileUtils.java +++ b/turing-filesystem/fs-commons/src/main/java/com/viglet/turing/filesystem/commons/TurFileUtils.java @@ -9,6 +9,7 @@ import org.apache.commons.validator.routines.UrlValidator; import org.apache.tika.exception.TikaException; import org.apache.tika.extractor.EmbeddedDocumentExtractor; +import org.apache.tika.metadata.DublinCore; import org.apache.tika.metadata.Metadata; import org.apache.tika.parser.AutoDetectParser; import org.apache.tika.parser.ParseContext; @@ -115,26 +116,48 @@ private static ParseContext getParseContext(AutoDetectParser parser) { } public static TurFileAttributes documentToText(MultipartFile multipartFile) { - return getTurFileAttributes(parseFile(multipartFile), - multipartFile.getOriginalFilename(), - FilenameUtils.getExtension(multipartFile.getOriginalFilename()), - multipartFile.getSize(), - new Date()); + return Optional.ofNullable(parseFile(multipartFile)).map(tikaFileAttributes -> + getTurFileAttributes(parseFile(multipartFile), + multipartFile.getOriginalFilename(), + FilenameUtils.getExtension(multipartFile.getOriginalFilename()), + multipartFile.getSize(), + getTikaLastModified(tikaFileAttributes) + .orElseGet(Date::new))) + .orElseGet(TurFileAttributes::new); + } + + private static Optional getTikaLastModified(TurTikaFileAttributes tikaFileAttributes) { + return Optional.ofNullable(tikaFileAttributes) + .flatMap(t -> Optional.ofNullable(t.getMetadata()) + .map(m -> m.getDate(DublinCore.MODIFIED))); + } + + private static void ocrDocumentLog(String documentName) { + log.info("Processing {} document to text", documentName); } public static TurFileAttributes urlContentToText(URL url) { - File file = getFile(url); - file.deleteOnExit(); - return getTurFileAttributes(parseFile(file), - FilenameUtils.getName(url.getPath()), - FilenameUtils.getExtension(url.getPath()), - file.length(), - getLastModified(url)); + ocrDocumentLog(url.toString()); + return Optional.ofNullable(getFile(url)).map(f -> { + f.deleteOnExit(); + return Optional.ofNullable(parseFile(f)).map(tikaFileAttributes -> + getTurFileAttributes(tikaFileAttributes, + FilenameUtils.getName(url.getPath()), + FilenameUtils.getExtension(url.getPath()), + f.length(), + getLastModified(tikaFileAttributes, url))) + .orElseGet(TurFileAttributes::new); + }) + .orElseGet(TurFileAttributes::new); } - private static Date getLastModified(URL url) { - Date date = new Date(); + private static Date getLastModified(TurTikaFileAttributes tikaFileAttributes, URL url) { + return getTikaLastModified(tikaFileAttributes) + .orElseGet(() -> getLastModifiedFromUrl(url)); + } + private static Date getLastModifiedFromUrl(URL url) { + Date date = new Date(); if (isValidUrl(url)) { try { HttpURLConnection httpUrlConnection = (HttpURLConnection) url.openConnection(); @@ -150,7 +173,7 @@ private static Date getLastModified(URL url) { private static boolean isValidUrl(URL url) { UrlValidator urlValidator = new UrlValidator(); - return urlValidator.isValid(url.getPath()); + return urlValidator.isValid(url.toString()); } private static File getFile(URL url) { @@ -170,26 +193,26 @@ private static File getFile(URL url) { return tempFile; } - private static TurFileAttributes getTurFileAttributes(TurTikaFileAttributes file, + private static TurFileAttributes getTurFileAttributes(TurTikaFileAttributes tikaFileAttributes, String fileName, String fileExtension, long fileSize, Date lastModified) { - return Optional.ofNullable(file).map(attributes -> + return Optional.ofNullable(tikaFileAttributes).map(attributes -> TurFileAttributes.builder() .content(attributes.getContent()) .name(fileName) .extension(fileExtension) .size(new TurFileSize(fileSize)) - .title(getTitle(file, fileName)) + .title(getTitle(tikaFileAttributes, fileName)) .lastModified(lastModified) - .metadata(getMetadataMap(file)) + .metadata(getMetadataMap(tikaFileAttributes)) .build()) .orElseGet(TurFileAttributes::new); } - private static String getTitle(TurTikaFileAttributes file, String fileName) { - return Optional.ofNullable(file + private static String getTitle(TurTikaFileAttributes tikaFileAttributes, String fileName) { + return Optional.ofNullable(tikaFileAttributes .getMetadata() .get(PDF_DOC_INFO_TITLE)) .orElse(fileName); diff --git a/turing-sprinklr/sprinklr-app/src/main/java/com/viglet/turing/connector/sprinklr/TurSprinklrProcess.java b/turing-sprinklr/sprinklr-app/src/main/java/com/viglet/turing/connector/sprinklr/TurSprinklrProcess.java index 1fc8cdfb3aa..dabde765ac4 100644 --- a/turing-sprinklr/sprinklr-app/src/main/java/com/viglet/turing/connector/sprinklr/TurSprinklrProcess.java +++ b/turing-sprinklr/sprinklr-app/src/main/java/com/viglet/turing/connector/sprinklr/TurSprinklrProcess.java @@ -98,12 +98,12 @@ public void start(TurSprinklrSource turSprinklrSource) { TurSprinklrAccessToken turSprinklrAccessToken = turSprinklrTokenService.getAccessToken(); if (turSprinklrAccessToken != null) { + final var fileAssetExtractor = new FileAssetsExtractor(turingUrl, turingApiKey); while (true) { TurSprinklrKBSearch turSprinklrKBSearch = TurSprinklrKBService.run(turSprinklrAccessToken, kbPage.get()); if (turSprinklrKBSearch != null) { List results = turSprinklrKBSearch.getData().getSearchResults(); - if (results.isEmpty()) { break; } else { @@ -115,7 +115,7 @@ public void start(TurSprinklrSource turSprinklrSource) { getArticle(turSprinklrSource, searchResult, turSprinklrAccessToken); // Gets the assets attached to the search result and inserts into turSNJobItems. - List assets = getFileAssets(searchResult); + List assets = getFileAssets(searchResult, fileAssetExtractor); addFileAssetsToJobItens(assets, resultLocale, turSites); // Quando o tamanho de turSNJobItems alcançar o JobSize definido, envia para o turing. @@ -139,8 +139,7 @@ public void start(TurSprinklrSource turSprinklrSource) { /** * Extracts the file assets from the search result and returns a list of FileAsset objects. */ - private List getFileAssets(TurSprinklrSearchResult searchResult) { - final var fileAssetExtractor = new FileAssetsExtractor(turingUrl, turingApiKey); + private List getFileAssets(TurSprinklrSearchResult searchResult, FileAssetsExtractor fileAssetExtractor) { final var fileAssets = fileAssetExtractor.extractFromLinkedAssets(searchResult); if (fileAssets == null || fileAssets.isEmpty()) { diff --git a/turing-sprinklr/sprinklr-app/src/main/java/com/viglet/turing/connector/sprinklr/utils/FileAsset.java b/turing-sprinklr/sprinklr-app/src/main/java/com/viglet/turing/connector/sprinklr/utils/FileAsset.java index deb7d56a20d..3542ac33063 100644 --- a/turing-sprinklr/sprinklr-app/src/main/java/com/viglet/turing/connector/sprinklr/utils/FileAsset.java +++ b/turing-sprinklr/sprinklr-app/src/main/java/com/viglet/turing/connector/sprinklr/utils/FileAsset.java @@ -24,10 +24,8 @@ public class FileAsset { private Date indexingDate; private Date modificationDate; private URL url; - private long fileSize; + private float fileSize; private String extension; - private String assetType; - private String assetCategory; /** * Converts this FileAsset to an attribute map. @@ -51,6 +49,7 @@ public Map toMapAttributes() { attributes.put("source_apps", List.of("SPRINKLR")); attributes.put("type", "Static File"); + return attributes; } } diff --git a/turing-sprinklr/sprinklr-app/src/main/java/com/viglet/turing/connector/sprinklr/utils/FileAssetsExtractor.java b/turing-sprinklr/sprinklr-app/src/main/java/com/viglet/turing/connector/sprinklr/utils/FileAssetsExtractor.java index 24bbdc0377e..71ccb6a4248 100644 --- a/turing-sprinklr/sprinklr-app/src/main/java/com/viglet/turing/connector/sprinklr/utils/FileAssetsExtractor.java +++ b/turing-sprinklr/sprinklr-app/src/main/java/com/viglet/turing/connector/sprinklr/utils/FileAssetsExtractor.java @@ -3,23 +3,20 @@ import com.viglet.turing.client.auth.TurServer; import com.viglet.turing.client.auth.credentials.TurApiKeyCredentials; import com.viglet.turing.client.ocr.TurOcr; +import com.viglet.turing.commons.file.TurFileAttributes; import com.viglet.turing.sprinklr.client.service.kb.response.TurSprinklrAsset; import com.viglet.turing.sprinklr.client.service.kb.response.TurSprinklrSearchResult; import lombok.extern.log4j.Log4j2; -import org.apache.commons.io.FileUtils; -import java.io.File; -import java.io.IOException; import java.net.MalformedURLException; import java.net.URI; import java.net.URISyntaxException; import java.net.URL; -import java.nio.file.Files; import java.util.ArrayList; import java.util.Collections; import java.util.Date; +import java.util.HashSet; import java.util.List; -import java.util.UUID; // Estou nomeando de File extractor ao invés de asset extractor pois talvez linked Assets pode ser mais do que da categoria "file-attachment" @@ -28,6 +25,7 @@ /** * Extracts File Assets from Sprinklr Knowledge Base search result. Has turing URL and turing API key just to use the * OCR API. + * * @author Gabriel F. Gomazako * @see FileAsset * @since 0.3.9 @@ -43,6 +41,8 @@ public class FileAssetsExtractor { */ final String turingApiKey; + final HashSet alreadyProcessedIds = new HashSet<>(); + public FileAssetsExtractor(String turingUrl, String turingApiKey) { this.turingUrl = turingUrl; this.turingApiKey = turingApiKey; @@ -51,26 +51,28 @@ public FileAssetsExtractor(String turingUrl, String turingApiKey) { /** * Extract all Files and its metadata from Linked Asset key from searchResult. + * * @param searchResult Knowledge Base search API result. */ public List extractFromLinkedAssets(TurSprinklrSearchResult searchResult) { // Extracting LinkedAssets from the search API result of Sprinkler Knowledge Base. List linkedAssets = searchResult.getLinkedAssets(); - // If there are no linked assets, return an empty list. if (linkedAssets == null || linkedAssets.isEmpty()) { return Collections.emptyList(); } - + if(linkedAssets.removeIf(asset -> !asset.getAssetType().equals("file-attachment"))){ + log.warn("Removed assets that are not file-attachment"); + log.warn("The assets on this iteration are: ",linkedAssets.toString()); + } List fileAssets = new ArrayList<>(); - // Vamos extrair informações a partir de cada asset, vamos baixar o documento para conseguir seu tamanho e informações sobre a data - // Também vamos usar OCR para extrair o conteúdo do arquivo. + // For each asset, uses turing OCR API to extract data. for (var asset : linkedAssets) { + log.info("Processing asset - AssetId : {}", asset.getAssetId()); String id = null; - String filename = null; - String extension = null; URL url = null; + try { // assetId it's the complete URI of the file. URI assetURI = new URI(asset.getAssetId()); // ex: google.com/files/text.pdf. @@ -78,70 +80,56 @@ public List extractFromLinkedAssets(TurSprinklrSearchResult searchRes id = id.substring(id.lastIndexOf('/') + 1); // text.pdf url = assetURI.toURL(); - - int lastDotIndex = id.lastIndexOf('.'); - filename = id.substring(0, lastDotIndex); - extension = id.substring(lastDotIndex + 1); - - } catch (URISyntaxException | MalformedURLException e) { + } catch (URISyntaxException | MalformedURLException | IllegalArgumentException e) { + if (e instanceof IllegalArgumentException || e instanceof MalformedURLException) { + log.error("Invalid URL, AssetId: {}, AssetType: {}, AssetCategory {}", asset.getAssetId(), asset.getAssetType(), asset.getAssetCategory()); + } log.error(e); + log.error("Asset coming from article: {}, {}", searchResult.getContent().getTitle(), searchResult.getMappingDetails().get(0).getCommunityPermalink()); + log.info("Skipping asset"); + continue; + } + + // If the asset has already been processed, skip it. + if (alreadyProcessedIds.contains(id)) { + log.info("Asset already processed, skipping: {}", id); + continue; } - File downloadedFile = downloadFile(url); - String contentFromDownloadedFile = null; // Tries to use turing OCR API to extract content from the downloaded file. + TurFileAttributes ocrResult = null; try { - log.info("Sending documento to OCR api in: {}", URI.create(turingUrl).toURL()); - log.info("file type={}", asset.getAssetType()); - + log.info("Sending File Asset to turing OCR API"); TurServer turingServer = new TurServer(URI.create(turingUrl).toURL(), new TurApiKeyCredentials(turingApiKey)); TurOcr ocrProcessor = new TurOcr(); - contentFromDownloadedFile = ocrProcessor.processFile(turingServer, downloadedFile, false).toString(); + ocrResult = ocrProcessor.processUrl(turingServer, url, false); + log.debug("OCR result: {}", ocrResult); + } catch (MalformedURLException e) { log.error(e); } - + String extension = ocrResult.getExtension(); + String filename = ocrResult.getName(); + String content = ocrResult.getContent(); Date indexingDate = new Date(); - Date modificationDate = null; - long fileSize = -1; - try { - long dateFromFile = Files.getLastModifiedTime(downloadedFile.toPath()).toMillis(); - modificationDate = new Date(dateFromFile); - fileSize = Files.size(downloadedFile.toPath()); - } catch (IOException e) { - log.error(e); - } - - var assetType = asset.getAssetType(); - var assetCategory = asset.getAssetCategory(); + Date modificationDate = ocrResult.getLastModified(); + float fileSize = ocrResult.getSize().getBytes(); var fileAsset = new FileAsset( ("sprinklr" + id), filename, - contentFromDownloadedFile, + content, indexingDate, modificationDate, url, fileSize, - extension, - assetType, - assetCategory); + extension); fileAssets.add(fileAsset); + alreadyProcessedIds.add(id); } + return fileAssets; } - private File downloadFile(URL url) { - try { - File file = new File("/store/tmp/" + UUID.randomUUID() + ".pdf"); - FileUtils.copyURLToFile(url, file, 5000, 5000); - - return file; - - } catch (IOException e) { - log.error(e); - } - return null; - } }