Skip to content

✨ feat: Tika OCR #7474

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 6 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion .github/workflows/backend-review.yml
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
name: Backend Unit Tests
on:
workflow_dispatch:
pull_request:
branches:
- main
Expand Down Expand Up @@ -67,4 +68,4 @@ jobs:
run: cd packages/data-provider && npm run test:ci

- name: Run librechat-mcp unit tests
run: cd packages/mcp && npm run test:ci
run: cd packages/mcp && npm run test:ci
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -112,6 +112,9 @@ auth.json
# User uploads
uploads/

# Ollama
ollama/

# owner
release/

Expand Down
109 changes: 109 additions & 0 deletions api/server/services/Files/TikaOCR/crud.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,109 @@
// ~/server/services/Files/TikaOCR/crud.js
const fs = require('fs');
const path = require('path');
const FormData = require('form-data');
const { FileSources, envVarRegex, extractEnvVariable } = require('librechat-data-provider');
const { loadAuthValues } = require('~/server/services/Tools/credentials');
const { logger, createAxiosInstance } = require('~/config');
const { logAxiosError } = require('~/utils/axios');

const axios = createAxiosInstance();

/**
 * Pulls the variable name out of an environment-variable reference.
 *
 * @param {string} str - Candidate string, matched against `envVarRegex`.
 * @returns {string | null} The first capture group of the match, or `null` when `str` does not match.
 */
function extractVariableName(str) {
  const captured = str.match(envVarRegex);
  if (!captured) {
    return null;
  }
  return captured[1];
}

/**
 * Uploads a document to Tika and returns the extracted text.
 *
 * DOES NOT FILE STREAM: the whole file is read into memory with a synchronous
 * read before it is sent, so large documents are held in RAM for the duration
 * of the request.
 *
 * @param {Object} params Upload parameters
 * @param {string} params.filePath The path to the file on disk
 * @param {string} [params.baseURL=http://tika:9998] Tika API base URL if using docker
 * @param {string} [params.mimeType=application/pdf] Value sent as the `Content-Type` header;
 *   falls back to PDF when the caller does not know the file type
 * @returns {Promise<string>} The response body from Tika (requested as `text/plain`)
 * @throws Rethrows the request error after logging its message
 */
async function uploadDocumentToTika({
  filePath,
  baseURL = 'http://tika:9998',
  mimeType = 'application/pdf',
}) {
  const fileData = fs.readFileSync(filePath); // Read the entire file into memory :(

  try {
    const res = await axios.put(`${baseURL}/tika`, fileData, {
      headers: {
        'Content-Type': mimeType,
        Accept: 'text/plain',
      },
      // Documents can be arbitrarily large; do not let axios cap either direction.
      maxBodyLength: Infinity,
      maxContentLength: Infinity,
    });
    return res.data;
  } catch (error) {
    logger.error('Error uploading document to Tika:', error.message);
    throw error;
  }
}

/**
 * Uploads a file to the Tika OCR API and processes the OCR result.
 *
 * The Tika base URL is taken from `req.app.locals.ocr.baseURL`. When that value
 * is an environment-variable reference (matches `envVarRegex`) or is blank, the
 * URL is looked up through `loadAuthValues` instead (variable `OCR_BASEURL` for
 * the blank case). If nothing is resolved, the upload helper's default URL is used.
 *
 * @param {Object} params - The params object.
 * @param {ServerRequest} params.req - The request object from Express. It should have a `user` property with an `id`
 *                       representing the user
 * @param {Express.Multer.File} params.file - The file object, which is part of the request. Its `path` is read from
 *                       disk, `originalname` is echoed back, and `mimetype` (when present) is sent as the content type
 * @param {string} params.file_id - The file ID, not used here but passed for consistency.
 * @param {string} [params.entity_id] - The entity ID, not used here but passed for consistency.
 * @returns {Promise<{ filename: string, bytes: number, filepath: string, text: string, images: Array }>} -
 *                       The extracted `text`, an always-empty `images` list, the original `filename`,
 *                       an estimated `bytes` size and `filepath` set to the Tika OCR file source.
 * @throws {Error} Wraps any failure with the message produced by `logAxiosError`.
 */
const uploadTikaOCR = async ({ req, file, file_id, entity_id }) => {
  try {
    /** @type {TCustomConfig['ocr']} */
    const ocrConfig = req.app.locals?.ocr;

    // `ocr` is read with optional chaining above, so it may be undefined here.
    const baseURLConfig = ocrConfig?.baseURL || '';
    const isBaseURLEnvVar = envVarRegex.test(baseURLConfig);
    const isBaseURLEmpty = !baseURLConfig.trim();

    let baseURL = baseURLConfig;

    if (isBaseURLEnvVar || isBaseURLEmpty) {
      const baseURLVarName = isBaseURLEnvVar ? extractVariableName(baseURLConfig) : 'OCR_BASEURL';

      const authValues = await loadAuthValues({
        userId: req.user.id,
        authFields: [baseURLVarName],
        optional: new Set([baseURLVarName]),
      });

      // The variable is optional, so the lookup may legitimately yield nothing.
      baseURL = authValues?.[baseURLVarName];
    }

    const extractedText = await uploadDocumentToTika({
      filePath: file.path,
      // A blank/missing URL becomes `undefined` so the helper's default applies
      // instead of building a relative "/tika" URL.
      baseURL: baseURL || undefined,
      mimeType: file.mimetype,
    });

    return {
      filename: file.originalname,
      // Size estimate derived from the text length, not an exact byte count.
      bytes: extractedText.length * 4,
      filepath: FileSources.tika_ocr,
      text: extractedText,
      images: [], // Not used in this implementation
    };
  } catch (error) {
    const message = 'Error uploading document to Tika OCR API';
    throw new Error(logAxiosError({ error, message }));
  }
};

// Only the OCR entry point is exported; `uploadDocumentToTika` and
// `extractVariableName` are not part of this module's exports.
module.exports = {
uploadTikaOCR
};
174 changes: 174 additions & 0 deletions api/server/services/Files/TikaOCR/crud.spec.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,174 @@
const fs = require('fs');
// Shared axios stand-in: `put` resolves with a fixed Tika response body, and
// `interceptors.response.use` is stubbed so code that registers interceptors
// does not fail. The same object is returned for both the `axios` module and
// `createAxiosInstance()` from `~/config`, so assertions on `mockAxios.put`
// observe the calls made by the code under test.
const mockAxios = {
put: jest.fn().mockResolvedValue({ data: 'Extracted text from Tika' }),
interceptors: {
response: {
use: jest.fn(),
},
},
};
jest.mock('axios', () => mockAxios);
// Automock `fs`; tests set the return value of `fs.readFileSync` themselves.
jest.mock('fs');
// Replace the project config module with just the pieces used here: a logger
// whose `error` can be asserted on and an axios factory returning `mockAxios`.
jest.mock('~/config', () => ({
logger: {
error: jest.fn(),
},
createAxiosInstance: () => mockAxios,
}));
// Credential lookup is stubbed; individual tests decide what it resolves to.
jest.mock('~/server/services/Tools/credentials', () => ({
loadAuthValues: jest.fn(),
}));

const { uploadTikaOCR } = require('./crud');

describe('TikaOCR Service', () => {
  const { loadAuthValues } = require('~/server/services/Tools/credentials');
  const { logger } = require('~/config');

  const mockFilePath = '/path/to/test.pdf';
  const mockFileData = Buffer.from('mock file data');

  /**
   * Builds the argument object for `uploadTikaOCR`; only the configured OCR
   * base URL differs between test cases.
   *
   * @param {string} baseURL - Value placed at `req.app.locals.ocr.baseURL`.
   */
  const buildParams = (baseURL) => ({
    req: {
      user: { id: 'user123' },
      app: {
        locals: {
          ocr: {
            baseURL,
          },
        },
      },
    },
    file: {
      path: mockFilePath,
      originalname: 'test.pdf',
    },
    file_id: 'file123',
    entity_id: 'entity123',
  });

  beforeEach(() => {
    // `jest.clearAllMocks()` keeps mock implementations, so every test gets
    // explicit defaults here instead of inheriting whatever an earlier test set.
    fs.readFileSync.mockReturnValue(mockFileData);
    loadAuthValues.mockResolvedValue({});
  });

  afterEach(() => {
    jest.clearAllMocks();
  });

  describe('uploadTikaOCR', () => {
    it('should upload a document to Tika and return extracted text', async () => {
      const result = await uploadTikaOCR(buildParams('http://tika:9998'));

      expect(fs.readFileSync).toHaveBeenCalledWith(mockFilePath);
      expect(mockAxios.put).toHaveBeenCalledWith(
        'http://tika:9998/tika',
        mockFileData,
        expect.objectContaining({
          headers: {
            'Content-Type': 'application/pdf',
            Accept: 'text/plain',
          },
          maxBodyLength: Infinity,
          maxContentLength: Infinity,
        }),
      );
      expect(result).toEqual({
        filename: 'test.pdf',
        bytes: 'Extracted text from Tika'.length * 4,
        filepath: 'tika_ocr',
        text: 'Extracted text from Tika',
        images: [],
      });
    });

    it('should handle errors during document upload', async () => {
      const errorMessage = 'Tika API error';
      mockAxios.put.mockRejectedValueOnce(new Error(errorMessage));

      await expect(uploadTikaOCR(buildParams('http://tika:9998'))).rejects.toThrow(
        'Error uploading document to Tika OCR API',
      );

      expect(logger.error).toHaveBeenCalledWith(
        expect.stringContaining('Error uploading document to Tika:'),
        expect.any(String),
      );
    });

    it('should resolve baseURL from environment variables when configured', async () => {
      loadAuthValues.mockResolvedValue({
        OCR_BASEURL: 'http://tika:9998',
      });

      const result = await uploadTikaOCR(buildParams('${OCR_BASEURL}'));

      expect(loadAuthValues).toHaveBeenCalledWith({
        userId: 'user123',
        authFields: ['OCR_BASEURL'],
        optional: expect.any(Set),
      });
      expect(mockAxios.put).toHaveBeenCalledWith(
        'http://tika:9998/tika',
        expect.any(Buffer),
        expect.any(Object),
      );
      expect(result.text).toEqual('Extracted text from Tika');
    });

    it('should handle empty baseURL and use default', async () => {
      // Nothing is resolved for OCR_BASEURL, so the built-in default URL must be used.
      loadAuthValues.mockResolvedValue({});

      const result = await uploadTikaOCR(buildParams(''));

      expect(loadAuthValues).toHaveBeenCalledWith({
        userId: 'user123',
        authFields: ['OCR_BASEURL'],
        optional: expect.any(Set),
      });
      expect(mockAxios.put).toHaveBeenCalledWith(
        'http://tika:9998/tika',
        expect.any(Buffer),
        expect.any(Object),
      );
      expect(result.text).toEqual('Extracted text from Tika');
    });
  });
});
5 changes: 5 additions & 0 deletions api/server/services/Files/TikaOCR/index.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
// Entry point for the TikaOCR service directory: re-exports everything that
// `./crud` exports so callers can require the directory itself.
const crud = require('./crud');

module.exports = {
...crud,
};
4 changes: 2 additions & 2 deletions api/server/services/Files/process.js
Original file line number Diff line number Diff line change
Expand Up @@ -522,7 +522,7 @@ const processAgentFileUpload = async ({ req, res, metadata }) => {
throw new Error('OCR capability is not enabled for Agents');
}

const { handleFileUpload: uploadMistralOCR } = getStrategyFunctions(
const { handleFileUpload: uploadOCR } = getStrategyFunctions(
req.app.locals?.ocr?.strategy ?? FileSources.mistral_ocr,
);
const { file_id, temp_file_id } = metadata;
Expand All @@ -534,7 +534,7 @@ const processAgentFileUpload = async ({ req, res, metadata }) => {
images,
filename,
filepath: ocrFileURL,
} = await uploadMistralOCR({ req, file, file_id, entity_id: agent_id, basePath });
} = await uploadOCR({ req, file, file_id, entity_id: agent_id, basePath });

const fileInfo = removeNullishValues({
text,
Expand Down
23 changes: 23 additions & 0 deletions api/server/services/Files/strategies.js
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,7 @@ const { uploadOpenAIFile, deleteOpenAIFile, getOpenAIFileStream } = require('./O
const { getCodeOutputDownloadStream, uploadCodeEnvFile } = require('./Code');
const { uploadVectors, deleteVectors } = require('./VectorDB');
const { uploadMistralOCR } = require('./MistralOCR');
const { uploadTikaOCR } = require('./TikaOCR');

/**
* Firebase Storage Strategy Functions
Expand Down Expand Up @@ -202,6 +203,26 @@ const mistralOCRStrategy = () => ({
handleFileUpload: uploadMistralOCR,
});

/**
 * Tika OCR Strategy Functions
 *
 * Builds the strategy object for the Tika OCR file source. Only
 * `handleFileUpload` is implemented; every other slot is `null`.
 */
const tikaOCRStrategy = () => {
  const strategy = {
    /** @type {typeof saveFileFromURL | null} */
    saveURL: null,
    /** @type {typeof getLocalFileURL | null} */
    getFileURL: null,
    /** @type {typeof saveLocalBuffer | null} */
    saveBuffer: null,
    // NOTE(review): the original author was unsure this type annotation is correct — confirm.
    /** @type {typeof processLocalAvatar | null} */
    processAvatar: null,
    /** @type {typeof uploadLocalImage | null} */
    handleImageUpload: null,
    /** @type {typeof prepareImagesLocal | null} */
    prepareImagePayload: null,
    /** @type {typeof deleteLocalFile | null} */
    deleteFile: null,
    /** @type {typeof getLocalFileStream | null} */
    getDownloadStream: null,
    handleFileUpload: uploadTikaOCR,
  };
  return strategy;
};

// Strategy Selector
const getStrategyFunctions = (fileSource) => {
if (fileSource === FileSources.firebase) {
Expand All @@ -222,6 +243,8 @@ const getStrategyFunctions = (fileSource) => {
return codeOutputStrategy();
} else if (fileSource === FileSources.mistral_ocr) {
return mistralOCRStrategy();
} else if (fileSource === FileSources.tika_ocr) {
return tikaOCRStrategy();
} else {
throw new Error('Invalid file source');
}
Expand Down
14 changes: 14 additions & 0 deletions docker-compose.override.yml.example
Original file line number Diff line number Diff line change
Expand Up @@ -123,6 +123,20 @@
# volumes:
# - ./ollama:/root/.ollama

# # ADD TIKA
# tika:
# image: apache/tika:latest-full
# container_name: tika
# ports:
# - "${TIKA_PORT-9998}:9998"
# configs:
# - source: tika_config_3
# target: /tika-config.xml
#
# configs:
# tika_config_3:
# file: ./tika-config.xml

# # ADD LITELLM BASIC - NEED TO CONFIGURE litellm-config.yaml, ONLY NEED ENV TO ENABLE REDIS FOR CACHING OR LANGFUSE FOR MONITORING
# litellm:
# image: ghcr.io/berriai/litellm:main-latest
Expand Down
Loading