Clarify how to perform exact match searches in Quickwit (#5797)

rdettai · web-flow · commit 077d747a2a25 · 2025-06-12T11:54:04.000Z
* Add test using range to emulate keyword query

* Add docs

* Fix prefix test case
diff --git a/docs/configuration/index-config.md b/docs/configuration/index-config.md
@@ -141,7 +141,7 @@ fast:
 
 | Tokenizer     | Description   |
 | ------------- | ------------- |
-| `raw`         | Does not process nor tokenize text. Filters out tokens larger than 255 bytes.  |
+| `raw`         | Does not process nor tokenize text. Filters out tokens larger than 255 bytes. This is similar to the `keyword` type in Elasticsearch. |
 | `raw_lowercase` | Does not tokenize text, but lowercase it. Filters out tokens larger than 255 bytes.  |
 | `default`     | Chops the text on according to whitespace and punctuation, removes tokens that are too long, and converts to lowercase. Filters out tokens larger than 255 bytes. |
 | `en_stem`     | Like `default`, but also applies stemming on the resulting tokens. Filters out tokens larger than 255 bytes.  |
diff --git a/docs/reference/es_compatible_api.md b/docs/reference/es_compatible_api.md
@@ -672,6 +672,12 @@ Moreover, while Quickwit does not support `best_fields` or `cross_fields`, it wi
 
 [Elasticsearch reference documentation](https://www.elastic.co/guide/en/elasticsearch/reference/8.8/query-dsl-term-query.html)
 
+:::note
+
+When querying text fields, it is recommended to use `term` queries only on fields configured with `tokenizer: raw`. This is the Quickwit equivalent of the Elasticsearch `keyword` type.
+
+:::
+
 #### Example
 
 ```json
diff --git a/quickwit/rest-api-tests/scenarii/qw_search_api/0004_exact_string.yaml b/quickwit/rest-api-tests/scenarii/qw_search_api/0004_exact_string.yaml
@@ -0,0 +1,85 @@
+## Exact-match queries on `text_raw`, an indexed field using the raw tokenizer
+endpoint: nested/search
+method: POST
+json:
+  query: "text_raw:indexed-with-raw-tokenizer-dashes"  # full value, as ingested by the setup scenario
+expected:
+  num_hits: 1
+---
+endpoint: nested/search
+method: POST
+json:
+  query: "text_raw:indexed_with_raw_tokenizer_dashes"  # underscores instead of dashes: not an ingested value
+expected:
+  num_hits: 0
+---
+endpoint: nested/search
+method: POST
+json:
+  query: "text_raw:indexed-with-raw"  # only a prefix of the ingested value
+expected:
+  num_hits: 0
+---
+endpoint: nested/search
+method: POST
+json:
+  query: 'text_raw:"indexed with raw tokenizer dashes"'  # quoted: full ingested value containing whitespace
+expected:
+  num_hits: 1
+---
+endpoint: nested/search
+method: POST
+json:
+  query: 'text_raw:"indexed with raw"'  # quoted: only a prefix of the whitespace value
+expected:
+  num_hits: 0
+---
+---
+## Exact-match queries on `text_fast`, a fast field that is not indexed (a range query is needed to hit the fast field)
+endpoint: nested/search
+method: POST
+json:
+  query: "text_fast:fast-text-value-dashes"  # plain term query: expected to be rejected, see message below
+status_code: 400
+expected:
+  message: "invalid query: query is incompatible with schema. field text_fast is not full-text searchable)"
+---
+endpoint: nested/search
+method: POST
+json:
+  query: "text_fast:[fast-text-value-dashes TO fast-text-value-dashes]"  # range with identical bounds = full ingested value
+expected:
+  num_hits: 1
+---
+endpoint: nested/search
+method: POST
+json:
+  query: "text_fast:[fast_text_value_dashes TO fast_text_value_dashes]"  # underscores instead of dashes: not an ingested value
+expected:
+  num_hits: 0
+---
+endpoint: nested/search
+method: POST
+json:
+  query: "text_fast:[fast-text-value TO fast-text-value]"  # only a prefix of the ingested value
+expected:
+  num_hits: 0
+---
+---
+# Unfortunately, the query parser does not support whitespace in range bounds;
+# use the Elasticsearch-compatible API instead. Each variant below expects a 400.
+endpoint: nested/search
+method: POST
+json:
+  query: 'text_fast:["fast text value whitespaces" TO "fast text value whitespacesd"]'  # variant 1: quoted bounds
+status_code: 400
+---
+endpoint: nested/search
+method: POST
+json:
+  query: "text_fast:[fast text value whitespaces TO fast text value whitespaces]"  # variant 2: bare whitespace
+status_code: 400
+---
+endpoint: nested/search
+method: POST
+json:
+  query: 'text_fast:[fast\ text\ value\ whitespaces TO fast\ text\ value\ whitespaces]'  # variant 3: backslash-escaped; single-quoted because YAML double quotes decode `\ ` to a plain space
+status_code: 400
diff --git a/quickwit/rest-api-tests/scenarii/qw_search_api/_setup.quickwit.yaml b/quickwit/rest-api-tests/scenarii/qw_search_api/_setup.quickwit.yaml
@@ -72,7 +72,16 @@ json:
           - name: object_fast_field
             type: u64
             fast: true
-            
+      - name: text_fast  # text field stored as a fast field only (not indexed)
+        type: text
+        fast: true
+        indexed: false
+      - name: text_raw  # text field indexed with the raw tokenizer, without a fast field
+        type: text
+        fast: false
+        indexed: true
+        tokenizer: raw
+
 ---
 method: POST
 endpoint: nested/ingest
@@ -85,3 +94,7 @@ ndjson:
   - {"object_multi": {"object_text_field": "multi hello"}}
   - {"object_multi": {"object_fast_field": 1}}
   - {"object_multi": {"object_fast_field": 2}}
+  - {"text_raw": "indexed-with-raw-tokenizer-dashes"}
+  - {"text_raw": "indexed with raw tokenizer dashes"}
+  - {"text_fast": "fast-text-value-dashes"}
+  - {"text_fast": "fast text value whitespaces"}