From 6782cd19ae4415ab2fbb4493a2404742b80aa9e6 Mon Sep 17 00:00:00 2001 From: Valentin Chmara Date: Tue, 20 May 2025 16:44:31 +0200 Subject: [PATCH 1/2] fix: search with exact: true --- packages/orama/package.json | 9 ++++-- packages/orama/src/components/index.ts | 4 +-- packages/orama/src/methods/search-fulltext.ts | 28 ++++++++++++++++--- packages/orama/src/package.json | 3 -- packages/orama/src/types.ts | 7 ++++- packages/orama/tests/dataset.test.ts | 21 +++++++------- packages/orama/tests/search.test.ts | 12 ++++---- packages/tokenizers/package.json | 9 ++++-- 8 files changed, 62 insertions(+), 31 deletions(-) delete mode 100644 packages/orama/src/package.json diff --git a/packages/orama/package.json b/packages/orama/package.json index 044657f61..7e98077d5 100644 --- a/packages/orama/package.json +++ b/packages/orama/package.json @@ -81,7 +81,9 @@ } }, "types": "./dist/commonjs/index.d.ts", - "files": ["dist"], + "files": [ + "dist" + ], "repository": { "type": "git", "url": "https://github.com/oramasearch/orama" @@ -169,7 +171,10 @@ "./components": "./src/components.ts", "./trees": "./src/trees.ts" }, - "esmDialects": ["deno", "browser"] + "esmDialects": [ + "deno", + "browser" + ] }, "module": "./dist/esm/index.js" } diff --git a/packages/orama/src/components/index.ts b/packages/orama/src/components/index.ts index efab99b1e..71da52b36 100644 --- a/packages/orama/src/components/index.ts +++ b/packages/orama/src/components/index.ts @@ -459,8 +459,8 @@ export function search( tokenizer: Tokenizer, language: string | undefined, propertiesToSearch: string[], - exact: boolean, tolerance: number, + exactToken: boolean, boost: Record, relevance: Required, docsCount: number, @@ -500,7 +500,7 @@ export function search( const tokenLength = tokens.length for (let i = 0; i < tokenLength; i++) { const token = tokens[i] - const searchResult = tree.node.find({ term: token, exact, tolerance }) + const searchResult = tree.node.find({ term: token, exact: exactToken, tolerance }) // See if this token was found (for threshold=0 filtering) const termsFound = Object.keys(searchResult) diff --git a/packages/orama/src/methods/search-fulltext.ts b/packages/orama/src/methods/search-fulltext.ts index b413d7301..68745eeb5 100644 --- a/packages/orama/src/methods/search-fulltext.ts +++ b/packages/orama/src/methods/search-fulltext.ts @@ -1,9 +1,10 @@ import { getFacets } from '../components/facets.js' import { getGroups } from '../components/groups.js' import { runAfterSearch, runBeforeSearch } from '../components/hooks.js' -import { getInternalDocumentId } from '../components/internal-document-id-store.js' +import { getInternalDocumentId, InternalDocumentID } from '../components/internal-document-id-store.js' import { Language } from '../components/tokenizer/languages.js' import { createError } from '../errors.js' +import { getNested } from '../utils.js' import type { AnyOrama, BM25Params, @@ -22,7 +23,7 @@ export function innerFullTextSearch( orama: T, params: Pick< SearchParamsFullText, - 'term' | 'properties' | 'where' | 'exact' | 'tolerance' | 'boost' | 'relevance' | 'threshold' + 'term' | 'properties' | 'where' | 'exact' | 'tolerance' | 'boost' | 'relevance' | 'threshold' | 'exactToken' >, language: Language | undefined ) { @@ -66,7 +67,26 @@ export function innerFullTextSearch( // in this case, we need to return all the documents that contains at least one of the given properties const threshold = params.threshold !== undefined && params.threshold !== null ? params.threshold : 1 - if (term || properties) { + if (params.exact && term) { + const docs = orama.documentsStore.getAll(orama.data.docs) as Record> + const normalizeTerm= term.toLowerCase() + + uniqueDocsIDs = Object.entries(docs) + .filter(([, doc]) => { + return propertiesToSearch.some((prop) => { + const value = getNested(doc, prop) + if (typeof value === 'string') { + return value.toLowerCase() === normalizeTerm + } + if (Array.isArray(value)) { + return value.some((v) => typeof v === 'string' && v.toLowerCase() === normalizeTerm) + } + return false + }) + }) + .map(([id,]) => [+id, 0] as TokenScore) + } + else if (term || properties) { const docsCount = count(orama) uniqueDocsIDs = orama.index.search( index, @@ -74,8 +94,8 @@ export function innerFullTextSearch( orama.tokenizer, language, propertiesToSearch, - params.exact || false, params.tolerance || 0, + params.exactToken || false, params.boost || {}, applyDefault(params.relevance), docsCount, diff --git a/packages/orama/src/package.json b/packages/orama/src/package.json deleted file mode 100644 index 3dbc1ca59..000000000 --- a/packages/orama/src/package.json +++ /dev/null @@ -1,3 +0,0 @@ -{ - "type": "module" -} diff --git a/packages/orama/src/types.ts b/packages/orama/src/types.ts index 38d768b4f..aae4bdb49 100644 --- a/packages/orama/src/types.ts +++ b/packages/orama/src/types.ts @@ -318,6 +318,11 @@ export interface SearchParamsFullText { tokenizer: Tokenizer, language: string | undefined, propertiesToSearch: string[], - exact: boolean, tolerance: number, + exactToken: boolean, boost: Partial[]>, number>>, relevance: Required, docsCount: number, diff --git a/packages/orama/tests/dataset.test.ts b/packages/orama/tests/dataset.test.ts index 1ce04277e..1c8085c20 100644 --- a/packages/orama/tests/dataset.test.ts +++ b/packages/orama/tests/dataset.test.ts @@ -108,9 +108,9 @@ t.test("orama.dataset", async (t) => { Object.keys((db.data.docs as DocumentsStore).docs).length, (dataset as EventJson).result.events.length, ); - t.equal(s1.count, 1117); - t.equal(s2.count, 7314); - t.equal(s3.count, 7314); + t.equal(s1.count, 1081); + t.equal(s2.count, 0); + t.equal(s3.count, 1842); t.end(); }); @@ -143,7 +143,7 @@ t.test("orama.dataset", async (t) => { const s1 = removeVariadicData( await search(db, { term: "war", - exact: true, + exactToken: true, // eslint-disable-next-line // @ts-ignore properties: ["description"], @@ -155,7 +155,7 @@ t.test("orama.dataset", async (t) => { const s2 = removeVariadicData( await search(db, { term: "war", - exact: true, + exactToken: true, properties: ["description"], limit: 10, offset: 10, @@ -165,7 +165,7 @@ t.test("orama.dataset", async (t) => { const s3 = removeVariadicData( await search(db, { term: "war", - exact: true, + exactToken: true, properties: ["description"], limit: 10, offset: 20, @@ -174,7 +174,7 @@ t.test("orama.dataset", async (t) => { const s4 = await search(db, { term: "war", - exact: true, + exactToken: true, properties: ["description"], limit: 2240, offset: 0, @@ -182,7 +182,7 @@ t.test("orama.dataset", async (t) => { const s5 = await search(db, { term: "war", - exact: true, + exactToken: true, properties: ["description"], limit: 10, offset: 2239, @@ -223,7 +223,7 @@ t.test("orama.dataset", async (t) => { t.test("should correctly delete documents", async (t) => { const documentsToDelete = await search(db, { term: "war", - exact: true, + exactToken: true, properties: ["description"], limit: 10, offset: 0, @@ -235,13 +235,12 @@ t.test("orama.dataset", async (t) => { const newSearch = await search(db, { term: "war", - exact: true, properties: ["description"], limit: 10, offset: 0, }); - t.equal(newSearch.count, 2347); + t.equal(newSearch.count, 2743); t.end(); }); diff --git a/packages/orama/tests/search.test.ts b/packages/orama/tests/search.test.ts index efcc1947b..2dbb45dbd 100644 --- a/packages/orama/tests/search.test.ts +++ b/packages/orama/tests/search.test.ts @@ -84,11 +84,11 @@ t.test('search method', async (t) => { await insert(db, { quote: 'I like cats. They are the best.', author: 'Jane Doe' }) // Exact search - const result1 = await search(db, { term: 'fox', exact: true }) + const result1 = await search(db, { term: 'John Doe', properties: ["author"], exact: true }) const result2 = await search(db, { term: 'dog', exact: true }) t.equal(result1.count, 2) - t.equal(result2.count, 3) + t.equal(result2.count, 0) // Prefix search const result3 = await search(db, { term: 'fox', exact: false }) @@ -193,7 +193,7 @@ t.test('search method', async (t) => { const partialSearch = await search(db, { term: 'alr', - exact: true + exactToken: true }) t.equal(partialSearch.count, 0) @@ -201,7 +201,7 @@ t.test('search method', async (t) => { const exactSearch = await search(db, { term: 'already', - exact: true + exactToken: true }) t.equal(exactSearch.count, 1) @@ -719,11 +719,11 @@ t.test('search method', async (t) => { const result1 = await search(db, { term: 'foxes', exact: true }) const result2 = await search(db, { term: 'cats', exact: true }) - const result3 = await search(db, { term: 'brown', exact: true }) + const result3 = await search(db, { term: 'John Doe', exact: true }) t.equal(result1.count, 0) t.equal(result2.count, 0) - t.equal(result3.count, 1) + t.equal(result3.count, 2) t.end() }) diff --git a/packages/tokenizers/package.json b/packages/tokenizers/package.json index d566abf80..903ac6498 100644 --- a/packages/tokenizers/package.json +++ b/packages/tokenizers/package.json @@ -30,7 +30,9 @@ "dependencies": { "@orama/orama": "workspace:*" }, - "files": ["dist"], + "files": [ + "dist" + ], "repository": { "type": "git", "url": "https://github.com/oramasearch/orama" @@ -65,7 +67,10 @@ "node": ">= 20.0.0" }, "tshy": { - "dialects": ["esm", "commonjs"], + "dialects": [ + "esm", + "commonjs" + ], "exports": { "./japanese": "./src/japanese.ts", "./mandarin": "./src/mandarin.ts", From 9379cecf4052662e72aaf2c0190338baf576fe3f Mon Sep 17 00:00:00 2001 From: Vachmara Date: Fri, 6 Jun 2025 17:09:43 +0200 Subject: [PATCH 2/2] test: fix plugin-pt15 tests --- packages/plugin-pt15/src/index.ts | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/packages/plugin-pt15/src/index.ts b/packages/plugin-pt15/src/index.ts index acb5cbf35..a2087b91b 100644 --- a/packages/plugin-pt15/src/index.ts +++ b/packages/plugin-pt15/src/index.ts @@ -110,11 +110,11 @@ function createComponents(schema: AnySchema): Partial {throw new Error()}, removeTokenScoreParameters: () => {throw new Error()}, calculateResultScores: () => {throw new Error()}, - search: function search(index: PT15IndexStore, term: string, tokenizer: Tokenizer, language: string | undefined, propertiesToSearch: string[], exact: boolean, tolerance: number, boost: Partial[]>, number>>, relevance: Required, docsCount: number, whereFiltersIDs: Set | undefined): TokenScore[] { + search: function search(index: PT15IndexStore, term: string, tokenizer: Tokenizer, language: string | undefined, propertiesToSearch: string[], tolerance: number, exactToken: boolean, boost: Partial[]>, number>>, relevance: Required, docsCount: number, whereFiltersIDs: Set | undefined): TokenScore[] { if (tolerance !== 0) { throw new Error('Tolerance not implemented yet') } - if (exact === true) { + if (exactToken === true) { throw new Error('Exact not implemented yet') }