From dbc39373a42216c9d64846d94bbf931a930f12af Mon Sep 17 00:00:00 2001 From: Gabriel Date: Tue, 17 Sep 2024 16:02:01 -0300 Subject: [PATCH 1/5] refactors FileExtractor logic to use TurFileAttributes. --- .../connector/sprinklr/utils/FileAsset.java | 18 ++--- .../sprinklr/utils/FileAssetsExtractor.java | 75 +++++-------------- 2 files changed, 28 insertions(+), 65 deletions(-) diff --git a/turing-sprinklr/sprinklr-app/src/main/java/com/viglet/turing/connector/sprinklr/utils/FileAsset.java b/turing-sprinklr/sprinklr-app/src/main/java/com/viglet/turing/connector/sprinklr/utils/FileAsset.java index 05a7cea01d9..3542ac33063 100644 --- a/turing-sprinklr/sprinklr-app/src/main/java/com/viglet/turing/connector/sprinklr/utils/FileAsset.java +++ b/turing-sprinklr/sprinklr-app/src/main/java/com/viglet/turing/connector/sprinklr/utils/FileAsset.java @@ -24,10 +24,8 @@ public class FileAsset { private Date indexingDate; private Date modificationDate; private URL url; - private long fileSize; + private float fileSize; private String extension; - private String assetType; - private String assetCategory; /** * Converts this FileAsset to an attribute map. @@ -41,17 +39,17 @@ public Map toMapAttributes() { String formatedModificationDate = formatter.format(modificationDate); attributes.put("id", id); - attributes.put("filename", filename); - attributes.put("ocr_content", ocrContent); - // TODO: Change TypeDef in Solr. - attributes.put("indexing_date", formatedIndexingDate); + attributes.put("title", filename); + attributes.put("text", ocrContent); + attributes.put("publication_date", formatedIndexingDate); attributes.put("modification_date", formatedModificationDate); attributes.put("url", url); - attributes.put("file_size", fileSize); + attributes.put("filesize", fileSize); attributes.put("extension", extension); - attributes.put("asset_type", assetType); - attributes.put("asset_category", assetCategory); attributes.put("source_apps", List.of("SPRINKLR")); + attributes.put("type", "Static File"); + + return attributes; } } diff --git a/turing-sprinklr/sprinklr-app/src/main/java/com/viglet/turing/connector/sprinklr/utils/FileAssetsExtractor.java b/turing-sprinklr/sprinklr-app/src/main/java/com/viglet/turing/connector/sprinklr/utils/FileAssetsExtractor.java index fe37d563196..b39ac89715b 100644 --- a/turing-sprinklr/sprinklr-app/src/main/java/com/viglet/turing/connector/sprinklr/utils/FileAssetsExtractor.java +++ b/turing-sprinklr/sprinklr-app/src/main/java/com/viglet/turing/connector/sprinklr/utils/FileAssetsExtractor.java @@ -3,23 +3,19 @@ import com.viglet.turing.client.auth.TurServer; import com.viglet.turing.client.auth.credentials.TurApiKeyCredentials; import com.viglet.turing.client.ocr.TurOcr; +import com.viglet.turing.commons.file.TurFileAttributes; import com.viglet.turing.sprinklr.client.service.kb.response.TurSprinklrAsset; import com.viglet.turing.sprinklr.client.service.kb.response.TurSprinklrSearchResult; import lombok.extern.log4j.Log4j2; -import org.apache.commons.io.FileUtils; -import java.io.File; -import java.io.IOException; import java.net.MalformedURLException; import java.net.URI; import java.net.URISyntaxException; import java.net.URL; -import java.nio.file.Files; import java.util.ArrayList; import java.util.Collections; import java.util.Date; import java.util.List; -import java.util.UUID; // Estou nomeando de File extractor ao invés de asset extractor pois talvez linked Assets pode ser mais do que da categoria "file-attachment" @@ -52,97 +48,66 @@ public FileAssetsExtractor(String turingUrl, String turingApiKey) { /** * Extract all Files and its metadata from Linked Asset key from searchResult. + * * @param searchResult Knowledge Base search API result. */ public List extractFromLinkedAssets(TurSprinklrSearchResult searchResult) { - // Extraindo LinkedAssets do resultado da chamada de API de search do Knowledge Base do Sprinkler + // Extracting LinkedAssets from the search API result of Sprinkler Knowledge Base. List linkedAssets = searchResult.getLinkedAssets(); - // Se não houver resultados. + // If there are no linked assets, return an empty list. if (linkedAssets == null || linkedAssets.isEmpty()) { return Collections.emptyList(); } - List fileAssets = new ArrayList<>(); - // Vamos extrair informações a partir de cada asset, vamos baixar o documento para conseguir seu tamanho e informações sobre a data - // Também vamos usar OCR para extrair o conteúdo do arquivo. + // For each asset, uses turing OCR API to extract data. for (var asset : linkedAssets) { String id = null; - String filename = null; - String extension = null; URL url = null; try { - // assetId em arquivos é a completa do URL do arquivo + // assetId it's the complete URI of the file. URI assetURI = new URI(asset.getAssetId()); // ex: google.com/files/text.pdf. id = assetURI.getPath();// /files/text.pdf id = id.substring(id.lastIndexOf('/') + 1); // text.pdf - url = assetURI.toURL(); - - int lastDotIndex = id.lastIndexOf('.'); - filename = id.substring(0, lastDotIndex); - extension = id.substring(lastDotIndex + 1); - } catch (URISyntaxException | MalformedURLException e) { log.error(e); } - File downloadedFile = downloadFile(url); - String contentFromDownloadedFile = null; - // Usa OCR para converter o arquivo para string. + // Tries to use turing OCR API to extract content from the downloaded file. + TurFileAttributes ocrResult = null; try { - log.info("Sending documento to OCR api in: {}", URI.create(turingUrl).toURL()); + log.info("Sending document to OCR api in: {}", URI.create(turingUrl).toURL()); log.info("file type={}", asset.getAssetType()); - + log.info("The document url is {}", url); TurServer turingServer = new TurServer(URI.create(turingUrl).toURL(), new TurApiKeyCredentials(turingApiKey)); TurOcr ocrProcessor = new TurOcr(); - contentFromDownloadedFile = ocrProcessor.processFile(turingServer, downloadedFile, false).toString(); + ocrResult = ocrProcessor.processUrl(turingServer, url, false); + log.info("OCR result: {}", ocrResult); + } catch (MalformedURLException e) { log.error(e); } - + String extension = ocrResult.getExtension(); + String filename = ocrResult.getName(); + String content = ocrResult.getContent(); Date indexingDate = new Date(); - Date modificationDate = null; - long fileSize = -1; - try { - long dateFromFile = Files.getLastModifiedTime(downloadedFile.toPath()).toMillis(); - modificationDate = new Date(dateFromFile); - fileSize = Files.size(downloadedFile.toPath()); - } catch (IOException e) { - log.error(e); - } - - var assetType = asset.getAssetType(); - var assetCategory = asset.getAssetCategory(); + Date modificationDate = ocrResult.getLastModified(); + float fileSize = ocrResult.getSize().getBytes(); var fileAsset = new FileAsset( ("sprinklr" + id), filename, - contentFromDownloadedFile, + content, indexingDate, modificationDate, url, fileSize, - extension, - assetType, - assetCategory); + extension); fileAssets.add(fileAsset); } return fileAssets; } - - private File downloadFile(URL url) { - try { - File file = new File("/store/tmp/" + UUID.randomUUID() + ".pdf"); - FileUtils.copyURLToFile(url, file, 5000, 5000); - - return file; - - } catch (IOException e) { - log.error(e); - } - return null; - } } From 1f3928441f7e2b088f0ab7f3d6cef09b51569a48 Mon Sep 17 00:00:00 2001 From: Alexandre Oliveira Date: Thu, 19 Sep 2024 18:20:23 -0300 Subject: [PATCH 2/5] - OCR: isValidUrl fixed URL and better validation of file --- .../filesystem/commons/TurFileUtils.java | 25 ++++++++++++------- 1 file changed, 16 insertions(+), 9 deletions(-) diff --git a/turing-filesystem/fs-commons/src/main/java/com/viglet/turing/filesystem/commons/TurFileUtils.java b/turing-filesystem/fs-commons/src/main/java/com/viglet/turing/filesystem/commons/TurFileUtils.java index 8e33c4445b3..1f994650bc0 100644 --- a/turing-filesystem/fs-commons/src/main/java/com/viglet/turing/filesystem/commons/TurFileUtils.java +++ b/turing-filesystem/fs-commons/src/main/java/com/viglet/turing/filesystem/commons/TurFileUtils.java @@ -115,6 +115,7 @@ private static ParseContext getParseContext(AutoDetectParser parser) { } public static TurFileAttributes documentToText(MultipartFile multipartFile) { + return getTurFileAttributes(parseFile(multipartFile), multipartFile.getOriginalFilename(), FilenameUtils.getExtension(multipartFile.getOriginalFilename()), @@ -122,19 +123,25 @@ public static TurFileAttributes documentToText(MultipartFile multipartFile) { new Date()); } + private static void ocrDocumentLog(String documentName) { + log.info("Processing {} document to text", documentName); + } + public static TurFileAttributes urlContentToText(URL url) { - File file = getFile(url); - file.deleteOnExit(); - return getTurFileAttributes(parseFile(file), - FilenameUtils.getName(url.getPath()), - FilenameUtils.getExtension(url.getPath()), - file.length(), - getLastModified(url)); + ocrDocumentLog(url.toString()); + return Optional.ofNullable(getFile(url)).map(f -> { + f.deleteOnExit(); + return getTurFileAttributes(parseFile(f), + FilenameUtils.getName(url.getPath()), + FilenameUtils.getExtension(url.getPath()), + f.length(), + getLastModified(url)); + }) + .orElseGet(TurFileAttributes::new); } private static Date getLastModified(URL url) { Date date = new Date(); - if (isValidUrl(url)) { try { HttpURLConnection httpUrlConnection = (HttpURLConnection) url.openConnection(); @@ -150,7 +157,7 @@ private static Date getLastModified(URL url) { private static boolean isValidUrl(URL url) { UrlValidator urlValidator = new UrlValidator(); - return urlValidator.isValid(url.getPath()); + return urlValidator.isValid(url.toString()); } private static File getFile(URL url) { From d9a4219b1ffd01e74c4792a52b47d299e1ef6937 Mon Sep 17 00:00:00 2001 From: Alexandre Oliveira Date: Thu, 19 Sep 2024 19:02:48 -0300 Subject: [PATCH 3/5] - OCR: DublinCore.MODIFIED --- .../filesystem/commons/TurFileUtils.java | 52 ++++++++++++------- 1 file changed, 34 insertions(+), 18 deletions(-) diff --git a/turing-filesystem/fs-commons/src/main/java/com/viglet/turing/filesystem/commons/TurFileUtils.java b/turing-filesystem/fs-commons/src/main/java/com/viglet/turing/filesystem/commons/TurFileUtils.java index 1f994650bc0..60c4200de48 100644 --- a/turing-filesystem/fs-commons/src/main/java/com/viglet/turing/filesystem/commons/TurFileUtils.java +++ b/turing-filesystem/fs-commons/src/main/java/com/viglet/turing/filesystem/commons/TurFileUtils.java @@ -9,6 +9,7 @@ import org.apache.commons.validator.routines.UrlValidator; import org.apache.tika.exception.TikaException; import org.apache.tika.extractor.EmbeddedDocumentExtractor; +import org.apache.tika.metadata.DublinCore; import org.apache.tika.metadata.Metadata; import org.apache.tika.parser.AutoDetectParser; import org.apache.tika.parser.ParseContext; @@ -115,32 +116,47 @@ private static ParseContext getParseContext(AutoDetectParser parser) { } public static TurFileAttributes documentToText(MultipartFile multipartFile) { + return Optional.ofNullable(parseFile(multipartFile)).map(tikaFileAttributes -> + getTurFileAttributes(parseFile(multipartFile), + multipartFile.getOriginalFilename(), + FilenameUtils.getExtension(multipartFile.getOriginalFilename()), + multipartFile.getSize(), + getTikaLastModified(tikaFileAttributes) + .orElseGet(Date::new))) + .orElseGet(TurFileAttributes::new); + } - return getTurFileAttributes(parseFile(multipartFile), - multipartFile.getOriginalFilename(), - FilenameUtils.getExtension(multipartFile.getOriginalFilename()), - multipartFile.getSize(), - new Date()); + private static Optional getTikaLastModified(TurTikaFileAttributes tikaFileAttributes) { + return Optional.ofNullable(tikaFileAttributes) + .flatMap(t -> Optional.ofNullable(t.getMetadata()) + .map(m -> m.getDate(DublinCore.MODIFIED))); } private static void ocrDocumentLog(String documentName) { - log.info("Processing {} document to text", documentName); + log.info("Processing {} document to text", documentName); } public static TurFileAttributes urlContentToText(URL url) { ocrDocumentLog(url.toString()); return Optional.ofNullable(getFile(url)).map(f -> { f.deleteOnExit(); - return getTurFileAttributes(parseFile(f), - FilenameUtils.getName(url.getPath()), - FilenameUtils.getExtension(url.getPath()), - f.length(), - getLastModified(url)); + return Optional.ofNullable(parseFile(f)).map(tikaFileAttributes -> + getTurFileAttributes(tikaFileAttributes, + FilenameUtils.getName(url.getPath()), + FilenameUtils.getExtension(url.getPath()), + f.length(), + getLastModified(tikaFileAttributes, url))) + .orElseGet(TurFileAttributes::new); }) .orElseGet(TurFileAttributes::new); } - private static Date getLastModified(URL url) { + private static Date getLastModified(TurTikaFileAttributes tikaFileAttributes, URL url) { + return getTikaLastModified(tikaFileAttributes) + .orElseGet(() -> getLastModifiedFromUrl(url)); + } + + private static Date getLastModifiedFromUrl(URL url) { Date date = new Date(); if (isValidUrl(url)) { try { @@ -177,26 +193,26 @@ private static File getFile(URL url) { return tempFile; } - private static TurFileAttributes getTurFileAttributes(TurTikaFileAttributes file, + private static TurFileAttributes getTurFileAttributes(TurTikaFileAttributes tikaFileAttributes, String fileName, String fileExtension, long fileSize, Date lastModified) { - return Optional.ofNullable(file).map(attributes -> + return Optional.ofNullable(tikaFileAttributes).map(attributes -> TurFileAttributes.builder() .content(attributes.getContent()) .name(fileName) .extension(fileExtension) .size(new TurFileSize(fileSize)) - .title(getTitle(file, fileName)) + .title(getTitle(tikaFileAttributes, fileName)) .lastModified(lastModified) - .metadata(getMetadataMap(file)) + .metadata(getMetadataMap(tikaFileAttributes)) .build()) .orElseGet(TurFileAttributes::new); } - private static String getTitle(TurTikaFileAttributes file, String fileName) { - return Optional.ofNullable(file + private static String getTitle(TurTikaFileAttributes tikaFileAttributes, String fileName) { + return Optional.ofNullable(tikaFileAttributes .getMetadata() .get(PDF_DOC_INFO_TITLE)) .orElse(fileName); From c067507c43c4ddc7b119e1058f50ea26afa86070 Mon Sep 17 00:00:00 2001 From: Gabriel Date: Fri, 20 Sep 2024 10:51:30 -0300 Subject: [PATCH 4/5] - Sprinklr: Alter Logs in Asset Extractor --- .../connector/sprinklr/utils/FileAssetsExtractor.java | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/turing-sprinklr/sprinklr-app/src/main/java/com/viglet/turing/connector/sprinklr/utils/FileAssetsExtractor.java b/turing-sprinklr/sprinklr-app/src/main/java/com/viglet/turing/connector/sprinklr/utils/FileAssetsExtractor.java index b39ac89715b..36c43b1cb5f 100644 --- a/turing-sprinklr/sprinklr-app/src/main/java/com/viglet/turing/connector/sprinklr/utils/FileAssetsExtractor.java +++ b/turing-sprinklr/sprinklr-app/src/main/java/com/viglet/turing/connector/sprinklr/utils/FileAssetsExtractor.java @@ -78,13 +78,11 @@ public List extractFromLinkedAssets(TurSprinklrSearchResult searchRes // Tries to use turing OCR API to extract content from the downloaded file. TurFileAttributes ocrResult = null; try { - log.info("Sending document to OCR api in: {}", URI.create(turingUrl).toURL()); - log.info("file type={}", asset.getAssetType()); - log.info("The document url is {}", url); + log.info("Sending File Asset to turing OCR API, document url is {}", url); TurServer turingServer = new TurServer(URI.create(turingUrl).toURL(), new TurApiKeyCredentials(turingApiKey)); TurOcr ocrProcessor = new TurOcr(); ocrResult = ocrProcessor.processUrl(turingServer, url, false); - log.info("OCR result: {}", ocrResult); + log.debug("OCR result: {}", ocrResult); } catch (MalformedURLException e) { log.error(e); From 9de481cb15ac57974eca2bdb5a3624606647cb7a Mon Sep 17 00:00:00 2001 From: Gabriel Date: Fri, 20 Sep 2024 16:58:55 -0300 Subject: [PATCH 5/5] - New Asset Cache, file-attachment test, improve error treatment --- .../sprinklr/TurSprinklrProcess.java | 7 ++--- .../sprinklr/utils/FileAssetsExtractor.java | 30 +++++++++++++++++-- 2 files changed, 30 insertions(+), 7 deletions(-) diff --git a/turing-sprinklr/sprinklr-app/src/main/java/com/viglet/turing/connector/sprinklr/TurSprinklrProcess.java b/turing-sprinklr/sprinklr-app/src/main/java/com/viglet/turing/connector/sprinklr/TurSprinklrProcess.java index 1fc8cdfb3aa..dabde765ac4 100644 --- a/turing-sprinklr/sprinklr-app/src/main/java/com/viglet/turing/connector/sprinklr/TurSprinklrProcess.java +++ b/turing-sprinklr/sprinklr-app/src/main/java/com/viglet/turing/connector/sprinklr/TurSprinklrProcess.java @@ -98,12 +98,12 @@ public void start(TurSprinklrSource turSprinklrSource) { TurSprinklrAccessToken turSprinklrAccessToken = turSprinklrTokenService.getAccessToken(); if (turSprinklrAccessToken != null) { + final var fileAssetExtractor = new FileAssetsExtractor(turingUrl, turingApiKey); while (true) { TurSprinklrKBSearch turSprinklrKBSearch = TurSprinklrKBService.run(turSprinklrAccessToken, kbPage.get()); if (turSprinklrKBSearch != null) { List results = turSprinklrKBSearch.getData().getSearchResults(); - if (results.isEmpty()) { break; } else { @@ -115,7 +115,7 @@ public void start(TurSprinklrSource turSprinklrSource) { getArticle(turSprinklrSource, searchResult, turSprinklrAccessToken); // Gets the assets attached to the search result and inserts into turSNJobItems. - List assets = getFileAssets(searchResult); + List assets = getFileAssets(searchResult, fileAssetExtractor); addFileAssetsToJobItens(assets, resultLocale, turSites); // Quando o tamanho de turSNJobItems alcançar o JobSize definido, envia para o turing. @@ -139,8 +139,7 @@ public void start(TurSprinklrSource turSprinklrSource) { /** * Extracts the file assets from the search result and returns a list of FileAsset objects. */ - private List getFileAssets(TurSprinklrSearchResult searchResult) { - final var fileAssetExtractor = new FileAssetsExtractor(turingUrl, turingApiKey); + private List getFileAssets(TurSprinklrSearchResult searchResult, FileAssetsExtractor fileAssetExtractor) { final var fileAssets = fileAssetExtractor.extractFromLinkedAssets(searchResult); if (fileAssets == null || fileAssets.isEmpty()) { diff --git a/turing-sprinklr/sprinklr-app/src/main/java/com/viglet/turing/connector/sprinklr/utils/FileAssetsExtractor.java b/turing-sprinklr/sprinklr-app/src/main/java/com/viglet/turing/connector/sprinklr/utils/FileAssetsExtractor.java index 36c43b1cb5f..71ccb6a4248 100644 --- a/turing-sprinklr/sprinklr-app/src/main/java/com/viglet/turing/connector/sprinklr/utils/FileAssetsExtractor.java +++ b/turing-sprinklr/sprinklr-app/src/main/java/com/viglet/turing/connector/sprinklr/utils/FileAssetsExtractor.java @@ -15,6 +15,7 @@ import java.util.ArrayList; import java.util.Collections; import java.util.Date; +import java.util.HashSet; import java.util.List; @@ -40,6 +41,8 @@ public class FileAssetsExtractor { */ final String turingApiKey; + final HashSet alreadyProcessedIds = new HashSet<>(); + public FileAssetsExtractor(String turingUrl, String turingApiKey) { this.turingUrl = turingUrl; this.turingApiKey = turingApiKey; @@ -54,31 +57,49 @@ public FileAssetsExtractor(String turingUrl, String turingApiKey) { public List extractFromLinkedAssets(TurSprinklrSearchResult searchResult) { // Extracting LinkedAssets from the search API result of Sprinkler Knowledge Base. List linkedAssets = searchResult.getLinkedAssets(); - // If there are no linked assets, return an empty list. if (linkedAssets == null || linkedAssets.isEmpty()) { return Collections.emptyList(); } + if(linkedAssets.removeIf(asset -> !asset.getAssetType().equals("file-attachment"))){ + log.warn("Removed assets that are not file-attachment"); + log.warn("The assets on this iteration are: ",linkedAssets.toString()); + } List fileAssets = new ArrayList<>(); // For each asset, uses turing OCR API to extract data. for (var asset : linkedAssets) { + log.info("Processing asset - AssetId : {}", asset.getAssetId()); String id = null; URL url = null; + try { // assetId it's the complete URI of the file. URI assetURI = new URI(asset.getAssetId()); // ex: google.com/files/text.pdf. id = assetURI.getPath();// /files/text.pdf id = id.substring(id.lastIndexOf('/') + 1); // text.pdf + url = assetURI.toURL(); - } catch (URISyntaxException | MalformedURLException e) { + } catch (URISyntaxException | MalformedURLException | IllegalArgumentException e) { + if (e instanceof IllegalArgumentException || e instanceof MalformedURLException) { + log.error("Invalid URL, AssetId: {}, AssetType: {}, AssetCategory {}", asset.getAssetId(), asset.getAssetType(), asset.getAssetCategory()); + } log.error(e); + log.error("Asset coming from article: {}, {}", searchResult.getContent().getTitle(), searchResult.getMappingDetails().get(0).getCommunityPermalink()); + log.info("Skipping asset"); + continue; + } + + // If the asset has already been processed, skip it. + if (alreadyProcessedIds.contains(id)) { + log.info("Asset already processed, skipping: {}", id); + continue; } // Tries to use turing OCR API to extract content from the downloaded file. TurFileAttributes ocrResult = null; try { - log.info("Sending File Asset to turing OCR API, document url is {}", url); + log.info("Sending File Asset to turing OCR API"); TurServer turingServer = new TurServer(URI.create(turingUrl).toURL(), new TurApiKeyCredentials(turingApiKey)); TurOcr ocrProcessor = new TurOcr(); ocrResult = ocrProcessor.processUrl(turingServer, url, false); @@ -105,7 +126,10 @@ public List extractFromLinkedAssets(TurSprinklrSearchResult searchRes extension); fileAssets.add(fileAsset); + alreadyProcessedIds.add(id); } + return fileAssets; } + }