Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
import org.apache.commons.validator.routines.UrlValidator;
import org.apache.tika.exception.TikaException;
import org.apache.tika.extractor.EmbeddedDocumentExtractor;
import org.apache.tika.metadata.DublinCore;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.ParseContext;
Expand Down Expand Up @@ -115,26 +116,48 @@ private static ParseContext getParseContext(AutoDetectParser parser) {
}

/**
 * Converts an uploaded file into a {@link TurFileAttributes}.
 *
 * <p>The file is parsed exactly once; the parsed result supplies both the
 * attributes and the last-modified date. The date is taken from the parsed
 * metadata when available and falls back to the current time otherwise.
 *
 * @param multipartFile the uploaded file to parse
 * @return the extracted attributes, or an empty {@link TurFileAttributes}
 *         when {@code parseFile} yields {@code null}
 */
public static TurFileAttributes documentToText(MultipartFile multipartFile) {
    return Optional.ofNullable(parseFile(multipartFile))
            // Reuse the already-parsed attributes instead of parsing the upload a second time.
            .map(tikaFileAttributes -> getTurFileAttributes(tikaFileAttributes,
                    multipartFile.getOriginalFilename(),
                    FilenameUtils.getExtension(multipartFile.getOriginalFilename()),
                    multipartFile.getSize(),
                    getTikaLastModified(tikaFileAttributes).orElseGet(Date::new)))
            .orElseGet(TurFileAttributes::new);
}

/**
 * Looks up the {@code DublinCore.MODIFIED} date in the parsed file's metadata.
 *
 * @param tikaFileAttributes parsed file attributes; may be {@code null}
 * @return the modification date, or an empty Optional when the attributes,
 *         their metadata, or the date itself is {@code null}
 */
private static Optional<Date> getTikaLastModified(TurTikaFileAttributes tikaFileAttributes) {
    return Optional.ofNullable(tikaFileAttributes)
            .map(attributes -> attributes.getMetadata())
            .map(metadata -> metadata.getDate(DublinCore.MODIFIED));
}

/**
 * Writes an info-level log entry stating that the named document is being
 * processed to text.
 *
 * @param documentName label inserted into the log message
 */
private static void ocrDocumentLog(String documentName) {
log.info("Processing {} document to text", documentName);
}

/**
 * Converts the content behind a URL into a {@link TurFileAttributes}.
 *
 * <p>The file returned by {@code getFile(url)} is marked for deletion on JVM
 * exit, parsed once, and described using the name and extension taken from
 * the URL path. The last-modified date is resolved by
 * {@code getLastModified(tikaFileAttributes, url)}.
 *
 * @param url location of the document to process
 * @return the extracted attributes, or an empty {@link TurFileAttributes}
 *         when either {@code getFile} or {@code parseFile} yields {@code null}
 */
public static TurFileAttributes urlContentToText(URL url) {
    ocrDocumentLog(url.toString());
    final File file = getFile(url);
    // Guard before touching the file: without this check deleteOnExit() would throw on a null file.
    if (file == null) {
        return new TurFileAttributes();
    }
    file.deleteOnExit();
    return Optional.ofNullable(parseFile(file))
            .map(tikaFileAttributes -> getTurFileAttributes(tikaFileAttributes,
                    FilenameUtils.getName(url.getPath()),
                    FilenameUtils.getExtension(url.getPath()),
                    file.length(),
                    getLastModified(tikaFileAttributes, url)))
            .orElseGet(TurFileAttributes::new);
}

private static Date getLastModified(URL url) {
Date date = new Date();
/**
 * Resolves the last-modified date for a document fetched from a URL.
 *
 * @param tikaFileAttributes parsed file attributes consulted first
 * @param url                source URL, consulted only when the parsed
 *                           attributes provide no date
 * @return the date from the parsed metadata when present, otherwise the
 *         result of {@code getLastModifiedFromUrl(url)}
 */
private static Date getLastModified(TurTikaFileAttributes tikaFileAttributes, URL url) {
    final Optional<Date> fromMetadata = getTikaLastModified(tikaFileAttributes);
    // orElseGet keeps the URL lookup lazy: it runs only when the metadata has no date.
    return fromMetadata.orElseGet(() -> getLastModifiedFromUrl(url));
}

private static Date getLastModifiedFromUrl(URL url) {
Date date = new Date();
if (isValidUrl(url)) {
try {
HttpURLConnection httpUrlConnection = (HttpURLConnection) url.openConnection();
Expand All @@ -150,7 +173,7 @@ private static Date getLastModified(URL url) {

/**
 * Checks whether the given URL passes {@link UrlValidator} with its default settings.
 *
 * @param url the URL to validate
 * @return {@code true} when the full URL string is accepted by the validator
 */
private static boolean isValidUrl(URL url) {
    // Validate the complete URL: the path component alone carries no scheme,
    // so checking only url.getPath() would not be a meaningful URL check.
    final UrlValidator urlValidator = new UrlValidator();
    return urlValidator.isValid(url.toString());
}

private static File getFile(URL url) {
Expand All @@ -170,26 +193,26 @@ private static File getFile(URL url) {
return tempFile;
}

private static TurFileAttributes getTurFileAttributes(TurTikaFileAttributes file,
private static TurFileAttributes getTurFileAttributes(TurTikaFileAttributes tikaFileAttributes,
String fileName,
String fileExtension,
long fileSize,
Date lastModified) {
return Optional.ofNullable(file).map(attributes ->
return Optional.ofNullable(tikaFileAttributes).map(attributes ->
TurFileAttributes.builder()
.content(attributes.getContent())
.name(fileName)
.extension(fileExtension)
.size(new TurFileSize(fileSize))
.title(getTitle(file, fileName))
.title(getTitle(tikaFileAttributes, fileName))
.lastModified(lastModified)
.metadata(getMetadataMap(file))
.metadata(getMetadataMap(tikaFileAttributes))
.build())
.orElseGet(TurFileAttributes::new);
}

private static String getTitle(TurTikaFileAttributes file, String fileName) {
return Optional.ofNullable(file
private static String getTitle(TurTikaFileAttributes tikaFileAttributes, String fileName) {
return Optional.ofNullable(tikaFileAttributes
.getMetadata()
.get(PDF_DOC_INFO_TITLE))
.orElse(fileName);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -98,12 +98,12 @@ public void start(TurSprinklrSource turSprinklrSource) {
TurSprinklrAccessToken turSprinklrAccessToken = turSprinklrTokenService.getAccessToken();

if (turSprinklrAccessToken != null) {
final var fileAssetExtractor = new FileAssetsExtractor(turingUrl, turingApiKey);
while (true) {
TurSprinklrKBSearch turSprinklrKBSearch = TurSprinklrKBService.run(turSprinklrAccessToken, kbPage.get());

if (turSprinklrKBSearch != null) {
List<TurSprinklrSearchResult> results = turSprinklrKBSearch.getData().getSearchResults();

if (results.isEmpty()) {
break;
} else {
Expand All @@ -115,7 +115,7 @@ public void start(TurSprinklrSource turSprinklrSource) {
getArticle(turSprinklrSource, searchResult, turSprinklrAccessToken);

// Gets the assets attached to the search result and inserts into turSNJobItems.
List<FileAsset> assets = getFileAssets(searchResult);
List<FileAsset> assets = getFileAssets(searchResult, fileAssetExtractor);
addFileAssetsToJobItens(assets, resultLocale, turSites);

// When the size of turSNJobItems reaches the configured JobSize, send the batch to Turing.
Expand All @@ -139,8 +139,7 @@ public void start(TurSprinklrSource turSprinklrSource) {
/**
* Extracts the file assets from the search result and returns a list of FileAsset objects.
*/
private List<FileAsset> getFileAssets(TurSprinklrSearchResult searchResult) {
final var fileAssetExtractor = new FileAssetsExtractor(turingUrl, turingApiKey);
private List<FileAsset> getFileAssets(TurSprinklrSearchResult searchResult, FileAssetsExtractor fileAssetExtractor) {
final var fileAssets = fileAssetExtractor.extractFromLinkedAssets(searchResult);

if (fileAssets == null || fileAssets.isEmpty()) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -24,10 +24,8 @@ public class FileAsset {
private Date indexingDate;
private Date modificationDate;
private URL url;
private long fileSize;
private float fileSize;
private String extension;
private String assetType;
private String assetCategory;

/**
* Converts this FileAsset to an attribute map.
Expand All @@ -51,6 +49,7 @@ public Map<String, Object> toMapAttributes() {
attributes.put("source_apps", List.of("SPRINKLR"));
attributes.put("type", "Static File");


return attributes;
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -3,23 +3,20 @@
import com.viglet.turing.client.auth.TurServer;
import com.viglet.turing.client.auth.credentials.TurApiKeyCredentials;
import com.viglet.turing.client.ocr.TurOcr;
import com.viglet.turing.commons.file.TurFileAttributes;
import com.viglet.turing.sprinklr.client.service.kb.response.TurSprinklrAsset;
import com.viglet.turing.sprinklr.client.service.kb.response.TurSprinklrSearchResult;
import lombok.extern.log4j.Log4j2;
import org.apache.commons.io.FileUtils;

import java.io.File;
import java.io.IOException;
import java.net.MalformedURLException;
import java.net.URI;
import java.net.URISyntaxException;
import java.net.URL;
import java.nio.file.Files;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Date;
import java.util.HashSet;
import java.util.List;
import java.util.UUID;


// Named "File extractor" rather than "asset extractor" because linked assets may include more than the "file-attachment" category.
Expand All @@ -28,6 +25,7 @@
/**
* Extracts File Assets from Sprinklr Knowledge Base search result. Has turing URL and turing API key just to use the
* OCR API.
*
* @author Gabriel F. Gomazako
* @see FileAsset
* @since 0.3.9
Expand All @@ -43,6 +41,8 @@ public class FileAssetsExtractor {
*/
final String turingApiKey;

final HashSet<String> alreadyProcessedIds = new HashSet<>();

public FileAssetsExtractor(String turingUrl, String turingApiKey) {
this.turingUrl = turingUrl;
this.turingApiKey = turingApiKey;
Expand All @@ -51,97 +51,85 @@ public FileAssetsExtractor(String turingUrl, String turingApiKey) {

/**
* Extract all Files and its metadata from Linked Asset key from searchResult.
*
* @param searchResult Knowledge Base search API result.
*/
public List<FileAsset> extractFromLinkedAssets(TurSprinklrSearchResult searchResult) {
// Extracting LinkedAssets from the search API result of Sprinkler Knowledge Base.
List<TurSprinklrAsset> linkedAssets = searchResult.getLinkedAssets();

// If there are no linked assets, return an empty list.
if (linkedAssets == null || linkedAssets.isEmpty()) {
return Collections.emptyList();
}

if(linkedAssets.removeIf(asset -> !asset.getAssetType().equals("file-attachment"))){
log.warn("Removed assets that are not file-attachment");
log.warn("The assets on this iteration are: ",linkedAssets.toString());
}
List<FileAsset> fileAssets = new ArrayList<>();

// Vamos extrair informações a partir de cada asset, vamos baixar o documento para conseguir seu tamanho e informações sobre a data
// Também vamos usar OCR para extrair o conteúdo do arquivo.
// For each asset, uses turing OCR API to extract data.
for (var asset : linkedAssets) {
log.info("Processing asset - AssetId : {}", asset.getAssetId());
String id = null;
String filename = null;
String extension = null;
URL url = null;

try {
// assetId it's the complete URI of the file.
URI assetURI = new URI(asset.getAssetId()); // ex: google.com/files/text.pdf.
id = assetURI.getPath();// /files/text.pdf
id = id.substring(id.lastIndexOf('/') + 1); // text.pdf

url = assetURI.toURL();

int lastDotIndex = id.lastIndexOf('.');
filename = id.substring(0, lastDotIndex);
extension = id.substring(lastDotIndex + 1);

} catch (URISyntaxException | MalformedURLException e) {
} catch (URISyntaxException | MalformedURLException | IllegalArgumentException e) {
if (e instanceof IllegalArgumentException || e instanceof MalformedURLException) {
log.error("Invalid URL, AssetId: {}, AssetType: {}, AssetCategory {}", asset.getAssetId(), asset.getAssetType(), asset.getAssetCategory());
}
log.error(e);
log.error("Asset coming from article: {}, {}", searchResult.getContent().getTitle(), searchResult.getMappingDetails().get(0).getCommunityPermalink());
log.info("Skipping asset");
continue;
}

// If the asset has already been processed, skip it.
if (alreadyProcessedIds.contains(id)) {
log.info("Asset already processed, skipping: {}", id);
continue;
}

File downloadedFile = downloadFile(url);
String contentFromDownloadedFile = null;
// Tries to use turing OCR API to extract content from the downloaded file.
TurFileAttributes ocrResult = null;
try {
log.info("Sending documento to OCR api in: {}", URI.create(turingUrl).toURL());
log.info("file type={}", asset.getAssetType());

log.info("Sending File Asset to turing OCR API");
TurServer turingServer = new TurServer(URI.create(turingUrl).toURL(), new TurApiKeyCredentials(turingApiKey));
TurOcr ocrProcessor = new TurOcr();
contentFromDownloadedFile = ocrProcessor.processFile(turingServer, downloadedFile, false).toString();
ocrResult = ocrProcessor.processUrl(turingServer, url, false);
log.debug("OCR result: {}", ocrResult);

} catch (MalformedURLException e) {
log.error(e);
}

String extension = ocrResult.getExtension();
String filename = ocrResult.getName();
String content = ocrResult.getContent();
Date indexingDate = new Date();
Date modificationDate = null;
long fileSize = -1;
try {
long dateFromFile = Files.getLastModifiedTime(downloadedFile.toPath()).toMillis();
modificationDate = new Date(dateFromFile);
fileSize = Files.size(downloadedFile.toPath());
} catch (IOException e) {
log.error(e);
}

var assetType = asset.getAssetType();
var assetCategory = asset.getAssetCategory();
Date modificationDate = ocrResult.getLastModified();
float fileSize = ocrResult.getSize().getBytes();

var fileAsset = new FileAsset(
("sprinklr" + id),
filename,
contentFromDownloadedFile,
content,
indexingDate,
modificationDate,
url,
fileSize,
extension,
assetType,
assetCategory);
extension);

fileAssets.add(fileAsset);
alreadyProcessedIds.add(id);
}

return fileAssets;
}

/**
 * Copies the content at the given URL into a new file under {@code /store/tmp/}.
 *
 * <p>The file name is a random UUID with a {@code .pdf} suffix. Both timeout
 * arguments passed to {@code FileUtils.copyURLToFile} are 5000.
 *
 * @param url source to copy from
 * @return the written file, or {@code null} when the copy fails with an
 *         {@link IOException} (the exception is logged)
 */
private File downloadFile(URL url) {
    final File target = new File("/store/tmp/" + UUID.randomUUID() + ".pdf");
    try {
        FileUtils.copyURLToFile(url, target, 5000, 5000);
    } catch (IOException e) {
        log.error(e);
        return null;
    }
    return target;
}
}
Loading