From 3ba845059f39426f2a10e7063ff044051c34f1be Mon Sep 17 00:00:00 2001 From: Tristan H <108023962+captn-hook@users.noreply.github.com> Date: Tue, 20 May 2025 12:07:54 -0700 Subject: [PATCH 1/5] Update backend-review.yml --- .github/workflows/backend-review.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.github/workflows/backend-review.yml b/.github/workflows/backend-review.yml index b7bccecae84e..a054d348eec3 100644 --- a/.github/workflows/backend-review.yml +++ b/.github/workflows/backend-review.yml @@ -1,5 +1,6 @@ name: Backend Unit Tests on: + workflow_dispatch: pull_request: branches: - main @@ -67,4 +68,4 @@ jobs: run: cd packages/data-provider && npm run test:ci - name: Run librechat-mcp unit tests - run: cd packages/mcp && npm run test:ci \ No newline at end of file + run: cd packages/mcp && npm run test:ci From 295eb95cb4c9e68b986bddfdd2a90613d4a0767c Mon Sep 17 00:00:00 2001 From: thook Date: Tue, 20 May 2025 13:56:33 -0700 Subject: [PATCH 2/5] Tika --- .gitignore | 3 + api/server/services/Files/TikaOCR/crud.js | 104 +++++++++++ .../services/Files/TikaOCR/crud.spec.js | 169 ++++++++++++++++++ api/server/services/Files/TikaOCR/index.js | 5 + api/server/services/Files/process.js | 4 +- api/server/services/Files/strategies.js | 23 +++ docker-compose.override.yml.example | 14 ++ librechat.example.yaml | 24 +++ packages/data-provider/src/config.ts | 1 + packages/data-provider/src/types/files.ts | 1 + tika-config.xml | 36 ++++ 11 files changed, 382 insertions(+), 2 deletions(-) create mode 100644 api/server/services/Files/TikaOCR/crud.js create mode 100644 api/server/services/Files/TikaOCR/crud.spec.js create mode 100644 api/server/services/Files/TikaOCR/index.js create mode 100644 tika-config.xml diff --git a/.gitignore b/.gitignore index 0b64a284b523..d04edaf524a3 100644 --- a/.gitignore +++ b/.gitignore @@ -111,6 +111,9 @@ auth.json # User uploads uploads/ +# Ollama +ollama/ + # owner release/ diff --git a/api/server/services/Files/TikaOCR/crud.js b/api/server/services/Files/TikaOCR/crud.js new file mode 100644 index 000000000000..58651b6a6de2 --- /dev/null +++ b/api/server/services/Files/TikaOCR/crud.js @@ -0,0 +1,104 @@ +// ~/server/services/Files/TikaOCR/crud.js +const fs = require('fs'); +const path = require('path'); +const FormData = require('form-data'); +const { FileSources, envVarRegex, extractEnvVariable } = require('librechat-data-provider'); +const { loadAuthValues } = require('~/server/services/Tools/credentials'); +const { logger, createAxiosInstance } = require('~/config'); +const { logAxiosError } = require('~/utils/axios'); + +const axios = createAxiosInstance(); + +/** + * Uploads a document to Tika. DOES NOT FILE STREAM. + * + * @param {Object} params Upload parameters + * @param {string} params.filePath The path to the file on disk + * @param {string} [params.baseURL=http://tika:9998] Tika API base URL if using docker + * @returns {Promise} The response from Tika + */ +async function uploadDocumentToTika({ + filePath, + baseURL = 'http://tika:9998', +}) { + const fileData = fs.readFileSync(filePath); // Read the entire file into memory :( + + return axios + .put(`${baseURL}/tika`, fileData, { + headers: { + 'Content-Type': 'application/pdf', // This should be dynamic based on the file type + 'Accept': 'text/plain', + }, + maxBodyLength: Infinity, + maxContentLength: Infinity, + }) + .then((res) => res.data) + .catch((error) => { + logger.error('Error uploading document to Tika:', error.message); + throw error; + }); +} + +/** + * Uploads a file to the Tika OCR API and processes the OCR result. + * + * @param {Object} params - The params object. + * @param {ServerRequest} params.req - The request object from Express. It should have a `user` property with an `id` + * representing the user + * @param {Express.Multer.File} params.file - The file object, which is part of the request. The file object should + * have a `mimetype` property that tells us the file type + * @param {string} params.file_id - The file ID. + * @param {string} [params.entity_id] - The entity ID, not used here but passed for consistency. + * @returns {Promise<{ filepath: string, bytes: number }>} - The result object containing the processed `text` and `images` (not currently used), + * along with the `filename` and `bytes` properties. + */ +const uploadTikaOCR = async ({ req, file, file_id, entity_id }) => { + try { + /** @type {TCustomConfig['ocr']} */ + const ocrConfig = req.app.locals?.ocr; + + const baseURLConfig = ocrConfig.baseURL || ''; + + const isBaseURLEnvVar = envVarRegex.test(baseURLConfig); + + const isBaseURLEmpty = !baseURLConfig.trim(); + + let baseURL; + + if (isBaseURLEnvVar || isBaseURLEmpty) { + const baseURLVarName = isBaseURLEnvVar ? extractVariableName(baseURLConfig) : 'OCR_BASEURL'; + + const authValues = await loadAuthValues({ + userId: req.user.id, + authFields: [baseURLVarName], + optional: new Set([baseURLVarName]), + }); + + baseURL = authValues[baseURLVarName]; + } else { + baseURL = baseURLConfig; + } + + const extractedText = await uploadDocumentToTika({ + filePath: file.path, + baseURL, + }); + + + + return { + filename: file.originalname, + bytes: extractedText.length * 4, + filepath: FileSources.tika_ocr, + text: extractedText, + images: [] // Not used in this implementation + }; + } catch (error) { + const message = 'Error uploading document to Tika OCR API'; + throw new Error(logAxiosError({ error, message })); + } +}; + +module.exports = { + uploadTikaOCR +}; diff --git a/api/server/services/Files/TikaOCR/crud.spec.js b/api/server/services/Files/TikaOCR/crud.spec.js new file mode 100644 index 000000000000..fe781099914c --- /dev/null +++ b/api/server/services/Files/TikaOCR/crud.spec.js @@ -0,0 +1,169 @@ +const fs = require('fs'); +const mockAxios = { + put: jest.fn().mockResolvedValue({ data: 'Extracted text from Tika' }), +}; +jest.mock('axios', () => mockAxios); +jest.mock('fs'); +jest.mock('~/config', () => ({ + logger: { + error: jest.fn(), + }, + createAxiosInstance: () => mockAxios, +})); +jest.mock('~/server/services/Tools/credentials', () => ({ + loadAuthValues: jest.fn(), +})); + +const { uploadTikaOCR } = require('./crud'); + +describe('TikaOCR Service', () => { + afterEach(() => { + jest.clearAllMocks(); + }); + + describe('uploadDocumentToTika', () => { + it('should upload a document to Tika and return extracted text', async () => { + const mockFilePath = '/path/to/test.pdf'; + const mockFileData = Buffer.from('mock file data'); + fs.readFileSync.mockReturnValue(mockFileData); + + const result = await uploadTikaOCR({ + req: { + user: { id: 'user123' }, + app: { + locals: { + ocr: { + baseURL: 'http://tika:9998', + }, + }, + }, + }, + file: { + path: mockFilePath, + originalname: 'test.pdf', + }, + file_id: 'file123', + entity_id: 'entity123', + }); + + expect(fs.readFileSync).toHaveBeenCalledWith(mockFilePath); + expect(mockAxios.put).toHaveBeenCalledWith( + 'http://tika:9998/tika', + mockFileData, + expect.objectContaining({ + headers: { + 'Content-Type': 'application/pdf', + Accept: 'text/plain', + }, + maxBodyLength: Infinity, + maxContentLength: Infinity, + }), + ); + expect(result).toEqual({ + filename: 'test.pdf', + bytes: 'Extracted text from Tika'.length * 4, + filepath: 'tika_ocr', + text: 'Extracted text from Tika', + images: [], + }); + }); + + it('should handle errors during document upload', async () => { + const errorMessage = 'Tika API error'; + mockAxios.put.mockRejectedValueOnce(new Error(errorMessage)); + + await expect( + uploadTikaOCR({ + req: { + user: { id: 'user123' }, + app: { + locals: { + ocr: { + baseURL: 'http://tika:9998', + }, + }, + }, + }, + file: { + path: '/path/to/test.pdf', + originalname: 'test.pdf', + }, + file_id: 'file123', + entity_id: 'entity123', + }), + ).rejects.toThrow('Error uploading document to Tika OCR API'); + + const { logger } = require('~/config'); + expect(logger.error).toHaveBeenCalledWith( + expect.stringContaining('Error uploading document to Tika:'), + expect.any(String), + ); + }); + + it('should resolve baseURL from environment variables when configured', async () => { + const { loadAuthValues } = require('~/server/services/Tools/credentials'); + loadAuthValues.mockResolvedValue({ + OCR_BASEURL: 'http://env-tika:9998', + }); + + const result = await uploadTikaOCR({ + req: { + user: { id: 'user123' }, + app: { + locals: { + ocr: { + baseURL: '${OCR_BASEURL}', + }, + }, + }, + }, + file: { + path: '/path/to/test.pdf', + originalname: 'test.pdf', + }, + file_id: 'file123', + entity_id: 'entity123', + }); + + expect(loadAuthValues).toHaveBeenCalledWith({ + userId: 'user123', + authFields: ['OCR_BASEURL'], + optional: expect.any(Set), + }); + expect(mockAxios.put).toHaveBeenCalledWith( + 'http://env-tika:9998/tika', + expect.any(Buffer), + expect.any(Object), + ); + expect(result.text).toEqual('Extracted text from Tika'); + }); + + it('should handle empty baseURL and use default', async () => { + const result = await uploadTikaOCR({ + req: { + user: { id: 'user123' }, + app: { + locals: { + ocr: { + baseURL: '', + }, + }, + }, + }, + file: { + path: '/path/to/test.pdf', + originalname: 'test.pdf', + }, + file_id: 'file123', + entity_id: 'entity123', + }); + + expect(mockAxios.put).toHaveBeenCalledWith( + 'http://tika:9998/tika', + expect.any(Buffer), + expect.any(Object), + ); + expect(result.text).toEqual('Extracted text from Tika'); + }); + }); +}); \ No newline at end of file diff --git a/api/server/services/Files/TikaOCR/index.js b/api/server/services/Files/TikaOCR/index.js new file mode 100644 index 000000000000..a6223d1ee5d2 --- /dev/null +++ b/api/server/services/Files/TikaOCR/index.js @@ -0,0 +1,5 @@ +const crud = require('./crud'); + +module.exports = { + ...crud, +}; diff --git a/api/server/services/Files/process.js b/api/server/services/Files/process.js index 94b1bc4dadc3..b3a98a49c723 100644 --- a/api/server/services/Files/process.js +++ b/api/server/services/Files/process.js @@ -522,7 +522,7 @@ const processAgentFileUpload = async ({ req, res, metadata }) => { throw new Error('OCR capability is not enabled for Agents'); } - const { handleFileUpload: uploadMistralOCR } = getStrategyFunctions( + const { handleFileUpload: uploadOCR } = getStrategyFunctions( req.app.locals?.ocr?.strategy ?? FileSources.mistral_ocr, ); const { file_id, temp_file_id } = metadata; @@ -534,7 +534,7 @@ const processAgentFileUpload = async ({ req, res, metadata }) => { images, filename, filepath: ocrFileURL, - } = await uploadMistralOCR({ req, file, file_id, entity_id: agent_id, basePath }); + } = await uploadOCR({ req, file, file_id, entity_id: agent_id, basePath }); const fileInfo = removeNullishValues({ text, diff --git a/api/server/services/Files/strategies.js b/api/server/services/Files/strategies.js index c6cfe77069ed..58953f0f7e5c 100644 --- a/api/server/services/Files/strategies.js +++ b/api/server/services/Files/strategies.js @@ -47,6 +47,7 @@ const { uploadOpenAIFile, deleteOpenAIFile, getOpenAIFileStream } = require('./O const { getCodeOutputDownloadStream, uploadCodeEnvFile } = require('./Code'); const { uploadVectors, deleteVectors } = require('./VectorDB'); const { uploadMistralOCR } = require('./MistralOCR'); +const { uploadTikaOCR } = require('./TikaOCR'); /** * Firebase Storage Strategy Functions @@ -202,6 +203,26 @@ const mistralOCRStrategy = () => ({ handleFileUpload: uploadMistralOCR, }); +const tikaOCRStrategy = () => ({ + /** @type {typeof saveFileFromURL | null} */ + saveURL: null, + /** @type {typeof getLocalFileURL | null} */ + getFileURL: null, + /** @type {typeof saveLocalBuffer | null} */ + saveBuffer: null, + /** @type {typeof processLocalAvatar | null} */ // I am not sure if this is correct + processAvatar: null, + /** @type {typeof uploadLocalImage | null} */ + handleImageUpload: null, + /** @type {typeof prepareImagesLocal | null} */ + prepareImagePayload: null, + /** @type {typeof deleteLocalFile | null} */ + deleteFile: null, + /** @type {typeof getLocalFileStream | null} */ + getDownloadStream: null, + handleFileUpload: uploadTikaOCR, +}); + // Strategy Selector const getStrategyFunctions = (fileSource) => { if (fileSource === FileSources.firebase) { @@ -222,6 +243,8 @@ const getStrategyFunctions = (fileSource) => { return codeOutputStrategy(); } else if (fileSource === FileSources.mistral_ocr) { return mistralOCRStrategy(); + } else if (fileSource === FileSources.tika_ocr) { + return tikaOCRStrategy(); } else { throw new Error('Invalid file source'); } diff --git a/docker-compose.override.yml.example b/docker-compose.override.yml.example index 3799341ce696..443dd2243ee3 100644 --- a/docker-compose.override.yml.example +++ b/docker-compose.override.yml.example @@ -116,6 +116,20 @@ # volumes: # - ./ollama:/root/.ollama +# # ADD TIKA +# tika: +# image: apache/tika:latest-full +# container_name: tika +# ports: +# - "${TIKA_PORT-9998}:9998" +# configs: +# - source: tika_config_3 +# target: /tika-config.xml +# +# configs: +# tika_config_3: +# file: ./tika-config.xml + # # ADD LITELLM BASIC - NEED TO CONFIGURE litellm-config.yaml, ONLY NEED ENV TO ENABLE REDIS FOR CACHING OR LANGFUSE FOR MONITORING # litellm: # image: ghcr.io/berriai/litellm:main-latest diff --git a/librechat.example.yaml b/librechat.example.yaml index dfa8626eccce..b6577ca1c01b 100644 --- a/librechat.example.yaml +++ b/librechat.example.yaml @@ -156,6 +156,14 @@ actions: # - "mcp-obsidian" # - /path/to/obsidian/vault +# Example Tika OCR configuration + +# ocr: +# apiKey: "none" +# baseURL: "http://tika:9998" +# strategy: "tika_ocr" + + # Definition of custom endpoints endpoints: # assistants: @@ -181,6 +189,22 @@ endpoints: # # (optional) Agent Capabilities available to all users. Omit the ones you wish to exclude. Defaults to list below. # capabilities: ["execute_code", "file_search", "actions", "tools"] custom: + # Ollama example + - name: "ollama" + apiKey: "ollama" + baseURL: "http://ollama:11434/v1/" + models: + default: + [ + "llama3.2:3b", + ] + fetch: true + titleConvo: true + titleModel: "current_model" + summarize: false + summaryModel: "current_model" + forcePrompt: false + modelDisplayLabel: "Ollama" # Groq Example - name: 'groq' apiKey: '${GROQ_API_KEY}' diff --git a/packages/data-provider/src/config.ts b/packages/data-provider/src/config.ts index e40a662fdca0..463af07d0b53 100644 --- a/packages/data-provider/src/config.ts +++ b/packages/data-provider/src/config.ts @@ -572,6 +572,7 @@ export type TStartupConfig = { export enum OCRStrategy { MISTRAL_OCR = 'mistral_ocr', CUSTOM_OCR = 'custom_ocr', + TIKA_OCR = 'tika_ocr', } export const ocrSchema = z.object({ diff --git a/packages/data-provider/src/types/files.ts b/packages/data-provider/src/types/files.ts index 927002630fb8..52d39dafe632 100644 --- a/packages/data-provider/src/types/files.ts +++ b/packages/data-provider/src/types/files.ts @@ -10,6 +10,7 @@ export enum FileSources { vectordb = 'vectordb', execute_code = 'execute_code', mistral_ocr = 'mistral_ocr', + tika_ocr = 'tika_ocr', text = 'text', } diff --git a/tika-config.xml b/tika-config.xml new file mode 100644 index 000000000000..26f126c4c7e7 --- /dev/null +++ b/tika-config.xml @@ -0,0 +1,36 @@ + + + + + + application/pdf + + + + + application/pdf + + + + + + OCR_AND_TEXT_EXTRACTION + + + + application/xml + text/xml + + + + + + + debug + + true + true + + + \ No newline at end of file From 598082e53903180f9ea7139d7149f7628ee87a7e Mon Sep 17 00:00:00 2001 From: thook Date: Tue, 20 May 2025 14:10:51 -0700 Subject: [PATCH 3/5] Tika --- api/server/services/Files/TikaOCR/crud.spec.js | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/api/server/services/Files/TikaOCR/crud.spec.js b/api/server/services/Files/TikaOCR/crud.spec.js index fe781099914c..4818b1ce88bc 100644 --- a/api/server/services/Files/TikaOCR/crud.spec.js +++ b/api/server/services/Files/TikaOCR/crud.spec.js @@ -1,6 +1,11 @@ const fs = require('fs'); const mockAxios = { - put: jest.fn().mockResolvedValue({ data: 'Extracted text from Tika' }), + put: jest.fn().mockResolvedValue({ data: {} }), + interceptors: { + response: { + use: jest.fn(), + }, + }, }; jest.mock('axios', () => mockAxios); jest.mock('fs'); From 9dcec03ad1ee4ec5369da27deb37a9d0c7c9d58b Mon Sep 17 00:00:00 2001 From: thook Date: Tue, 20 May 2025 14:21:37 -0700 Subject: [PATCH 4/5] Tika --- api/server/services/Files/TikaOCR/crud.js | 5 +++++ api/server/services/Files/TikaOCR/crud.spec.js | 2 +- 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/api/server/services/Files/TikaOCR/crud.js b/api/server/services/Files/TikaOCR/crud.js index 58651b6a6de2..f6f99800afee 100644 --- a/api/server/services/Files/TikaOCR/crud.js +++ b/api/server/services/Files/TikaOCR/crud.js @@ -9,6 +9,11 @@ const { logAxiosError } = require('~/utils/axios'); const axios = createAxiosInstance(); +function extractVariableName(str) { + const match = str.match(envVarRegex); + return match ? match[1] : null; +} + /** * Uploads a document to Tika. DOES NOT FILE STREAM. * diff --git a/api/server/services/Files/TikaOCR/crud.spec.js b/api/server/services/Files/TikaOCR/crud.spec.js index 4818b1ce88bc..34adaaab454e 100644 --- a/api/server/services/Files/TikaOCR/crud.spec.js +++ b/api/server/services/Files/TikaOCR/crud.spec.js @@ -1,6 +1,6 @@ const fs = require('fs'); const mockAxios = { - put: jest.fn().mockResolvedValue({ data: {} }), + put: jest.fn().mockResolvedValue({ data: 'Extracted text from Tika' }), interceptors: { response: { use: jest.fn(), From 7b85c6909556e41be6b374613a4d82fdd80852ae Mon Sep 17 00:00:00 2001 From: thook Date: Tue, 20 May 2025 14:36:44 -0700 Subject: [PATCH 5/5] Tika --- api/server/services/Files/TikaOCR/crud.spec.js | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/api/server/services/Files/TikaOCR/crud.spec.js b/api/server/services/Files/TikaOCR/crud.spec.js index 34adaaab454e..9cc7445a98c3 100644 --- a/api/server/services/Files/TikaOCR/crud.spec.js +++ b/api/server/services/Files/TikaOCR/crud.spec.js @@ -108,7 +108,7 @@ describe('TikaOCR Service', () => { it('should resolve baseURL from environment variables when configured', async () => { const { loadAuthValues } = require('~/server/services/Tools/credentials'); loadAuthValues.mockResolvedValue({ - OCR_BASEURL: 'http://env-tika:9998', + OCR_BASEURL: 'http://tika:9998', }); const result = await uploadTikaOCR({ @@ -136,7 +136,7 @@ describe('TikaOCR Service', () => { optional: expect.any(Set), }); expect(mockAxios.put).toHaveBeenCalledWith( - 'http://env-tika:9998/tika', + 'http://tika:9998/tika', expect.any(Buffer), expect.any(Object), );