From 7ad7e92e3258e2cd9f9d47deed2d89cbe83cf0df Mon Sep 17 00:00:00 2001 From: Anton Rubin Date: Mon, 19 May 2025 22:34:17 +0100 Subject: [PATCH 1/2] adding rank_feature dsl query docs Signed-off-by: Anton Rubin --- _query-dsl/specialized/index.md | 2 +- _query-dsl/specialized/rank-feature.md | 547 +++++++++++++++++++++++++ 2 files changed, 548 insertions(+), 1 deletion(-) create mode 100644 _query-dsl/specialized/rank-feature.md diff --git a/_query-dsl/specialized/index.md b/_query-dsl/specialized/index.md index d28451cfa8d..fd89887e397 100644 --- a/_query-dsl/specialized/index.md +++ b/_query-dsl/specialized/index.md @@ -22,7 +22,7 @@ OpenSearch supports the following specialized queries: - `percolate`: Finds queries (stored as documents) that match the provided document. -- `rank_feature`: Calculates scores based on the values of numeric features. This query can skip non-competitive hits. +- [`rank_feature`]({{site.url}}{{site.baseurl}}/query-dsl/specialized/rank-feature/): Calculates scores based on the values of numeric features. This query can skip non-competitive hits. - `script`: Uses a script as a filter. diff --git a/_query-dsl/specialized/rank-feature.md b/_query-dsl/specialized/rank-feature.md new file mode 100644 index 00000000000..73c13f4460a --- /dev/null +++ b/_query-dsl/specialized/rank-feature.md @@ -0,0 +1,547 @@ +--- +layout: default +title: Rank feature +parent: Specialized queries +nav_order: 75 +--- + +# Rank feature + +Use the `rank_feature` query to boost document scores based on numeric values in the document, such as relevance scores, popularity, or freshness. This query is ideal if you want to fine-tune relevance ranking using numerical features. Unlike [full-text queries]({{site.url}}{{site.baseurl}}/query-dsl/full-text/index/), `rank_feature` focuses solely on a numeric signal, and is most effective when combined with other queries in a compound query like `bool`. 
+ +The `rank_feature` query expects the target field to be mapped as a [`rank_feature` field type]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/rank/). This enables internally optimized scoring for fast and efficient boosting. + +The score impact depends on the field value and the optional `saturation`, `log` or `sigmoid` function used. + +## Parameters + +| Parameter | Required/Optional | Description | +| ----------------------- | ----------------- | ------------------------------------------------------------------------------------------------------------------------ | +| `field` | Required | A `rank_feature` or `rank_features` field that contributes to document scoring.| +| `boost` | Optional | A multiplier applied to the score. Default is `1.0`. Values between 0 and 1 reduce the score, values above 1 amplify it. | +| `saturation` | Optional | Applies a saturation function to the feature value. Boost grows with value but levels off beyond the `pivot`. (Default function if no other function is provided)| +| `log` | Optional | Uses a logarithmic scoring function based on the field value. Best for large ranges of values.| +| `sigmoid` | Optional | Applies a sigmoid (S-shaped) curve to score impact, controlled by `pivot` and `exponent`.| +| `positive_score_impact` | Optional | When `false`, lower values score higher. Useful for features like price where smaller is better. Defined as part of the mapping. (Default is `true`)| + +Only one function out of `saturation`, `log`, or `sigmoid` may be used at a time. 
+{: .note} + +## Create an index with rank feature field + +Define an index with a `rank_feature` field to represent a signal like `popularity`: + +```json +PUT /products +{ + "mappings": { + "properties": { + "title": { "type": "text" }, + "popularity": { "type": "rank_feature" } + } + } +} +``` +{% include copy-curl.html %} + +## Index example documents + +Add sample products with varying popularity values: + +```json +POST /products/_bulk +{ "index": { "_id": 1 } } +{ "title": "Wireless Earbuds", "popularity": 1 } +{ "index": { "_id": 2 } } +{ "title": "Bluetooth Speaker", "popularity": 10 } +{ "index": { "_id": 3 } } +{ "title": "Portable Charger", "popularity": 25 } +{ "index": { "_id": 4 } } +{ "title": "Smartwatch", "popularity": 50 } +{ "index": { "_id": 5 } } +{ "title": "Noise Cancelling Headphones", "popularity": 100 } +{ "index": { "_id": 6 } } +{ "title": "Gaming Laptop", "popularity": 250 } +{ "index": { "_id": 7 } } +{ "title": "4K Monitor", "popularity": 500 } +``` +{% include copy-curl.html %} + +## Basic rank feature query + +You can boost results based on the `popularity` score using `rank_feature`: + +```json +POST /products/_search +{ + "query": { + "rank_feature": { + "field": "popularity" + } + } +} +``` +{% include copy-curl.html %} + +This query alone does not perform filtering, rather it scores all documents based on the value of `popularity`. Higher values yield higher scores: + +```json +{ + ... 
+ "hits": { + "total": { + "value": 7, + "relation": "eq" + }, + "max_score": 0.9252834, + "hits": [ + { + "_index": "products", + "_id": "7", + "_score": 0.9252834, + "_source": { + "title": "4K Monitor", + "popularity": 500 + } + }, + { + "_index": "products", + "_id": "6", + "_score": 0.86095566, + "_source": { + "title": "Gaming Laptop", + "popularity": 250 + } + }, + { + "_index": "products", + "_id": "5", + "_score": 0.71237755, + "_source": { + "title": "Noise Cancelling Headphones", + "popularity": 100 + } + }, + { + "_index": "products", + "_id": "4", + "_score": 0.5532503, + "_source": { + "title": "Smartwatch", + "popularity": 50 + } + }, + { + "_index": "products", + "_id": "3", + "_score": 0.38240916, + "_source": { + "title": "Portable Charger", + "popularity": 25 + } + }, + { + "_index": "products", + "_id": "2", + "_score": 0.19851118, + "_source": { + "title": "Bluetooth Speaker", + "popularity": 10 + } + }, + { + "_index": "products", + "_id": "1", + "_score": 0.024169207, + "_source": { + "title": "Wireless Earbuds", + "popularity": 1 + } + } + ] + } +} +``` + +## Combine with full-text search + +To filter relevant results and boost them based on popularity use the following request: + +```json +POST /products/_search +{ + "query": { + "bool": { + "must": { + "match": { + "title": "headphones" + } + }, + "should": { + "rank_feature": { + "field": "popularity" + } + } + } + } +} +``` +{% include copy-curl.html %} + +This ranks all documents matching "headphones" and boosts those with higher popularity. + +## Boost parameter + +The `boost` parameter allows you to scale the score contribution of the rank_feature clause. It's especially useful in compound queries such as bool, where you want to control the influence of a feature relative to other conditions. 
+ +In the following example, the `bool` query matches documents with the term "headphones" in the `title`, and boosts more popular results with a `rank_feature` clause using a `boost` of `2.0`: + +```json +POST /products/_search +{ + "query": { + "bool": { + "must": { + "match": { + "title": "headphones" + } + }, + "should": { + "rank_feature": { + "field": "popularity", + "boost": 2.0 + } + } + } + } +} +``` +{% include copy-curl.html %} + +This will double the contribution of the rank_feature score in the overall document score. A `boost` less than `1.0` would down-weight its influence. + +## Configure score function + +By default, the `rank_feature` query uses a `saturation` function with a `pivot` value derived from the field. You can explicitly control this with the `saturation`, `log` or `sigmoid` functions. + +### Saturation function + +The `saturation` function is the default scoring method used in `rank_feature` queries. It assigns higher scores to documents with larger feature values, but the increase in score becomes more gradual as the value exceeds a specified pivot. This is useful when you want to give diminishing returns to very large values, for example, boosting `popularity` while avoiding over-rewarding extremely high numbers. The formulae for calculating score is: `value of the rank_feature field / (value of the rank_feature field + pivot)`. The produced score is always between `0` and `1`. + +```json +POST /products/_search +{ + "query": { + "rank_feature": { + "field": "popularity", + "saturation": { + "pivot": 50 + } + } + } +} +``` +{% include copy-curl.html %} + +The `pivot` defines the point at which scoring growth slows down. Values higher than `pivot` still increase the score, but with diminishing returns, as can be seen in the returned hits: + +```json +{ + ... 
+ "hits": { + "total": { + "value": 7, + "relation": "eq" + }, + "max_score": 0.9090909, + "hits": [ + { + "_index": "products", + "_id": "7", + "_score": 0.9090909, + "_source": { + "title": "4K Monitor", + "popularity": 500 + } + }, + { + "_index": "products", + "_id": "6", + "_score": 0.8333333, + "_source": { + "title": "Gaming Laptop", + "popularity": 250 + } + }, + { + "_index": "products", + "_id": "5", + "_score": 0.6666666, + "_source": { + "title": "Noise Cancelling Headphones", + "popularity": 100 + } + }, + { + "_index": "products", + "_id": "4", + "_score": 0.5, + "_source": { + "title": "Smartwatch", + "popularity": 50 + } + }, + { + "_index": "products", + "_id": "3", + "_score": 0.3333333, + "_source": { + "title": "Portable Charger", + "popularity": 25 + } + }, + { + "_index": "products", + "_id": "2", + "_score": 0.16666669, + "_source": { + "title": "Bluetooth Speaker", + "popularity": 10 + } + }, + { + "_index": "products", + "_id": "1", + "_score": 0.019607842, + "_source": { + "title": "Wireless Earbuds", + "popularity": 1 + } + } + ] + } +} +``` + +If the pivot is not provided, approximate geometric mean of all rank_feature values in the index is used. + +### Log function + +The log function is helpful when the range of values in your `rank_feature` field varies significantly. It applies a logarithmic scale to the `score`, which reduces the effect of extremely high values and helps normalize scoring across wide value distributions. This is especially useful when a small difference between low values should be more impactful than a large difference between high values. The score is derived using formulae: `log(scaling_factor + rank_feature field)`, see following example: + +```json +POST /products/_search +{ + "query": { + "rank_feature": { + "field": "popularity", + "log": { + "scaling_factor": 2 + } + } + } +} +``` +{% include copy-curl.html %} + +In the example dataset, the `popularity` field ranges from `1` to `500`. 
The `log` function compresses the `score` contribution from large values like `250` and `500`, while still allowing documents with `10` or `25` to have meaningful scores. This is unlike `saturation`, where documents above the pivot rapidly approach the same maximum score. + +```json +{ + ... + "hits": { + "total": { + "value": 7, + "relation": "eq" + }, + "max_score": 6.2186003, + "hits": [ + { + "_index": "products", + "_id": "7", + "_score": 6.2186003, + "_source": { + "title": "4K Monitor", + "popularity": 500 + } + }, + { + "_index": "products", + "_id": "6", + "_score": 5.529429, + "_source": { + "title": "Gaming Laptop", + "popularity": 250 + } + }, + { + "_index": "products", + "_id": "5", + "_score": 4.624973, + "_source": { + "title": "Noise Cancelling Headphones", + "popularity": 100 + } + }, + { + "_index": "products", + "_id": "4", + "_score": 3.9512436, + "_source": { + "title": "Smartwatch", + "popularity": 50 + } + }, + { + "_index": "products", + "_id": "3", + "_score": 3.295837, + "_source": { + "title": "Portable Charger", + "popularity": 25 + } + }, + { + "_index": "products", + "_id": "2", + "_score": 2.4849067, + "_source": { + "title": "Bluetooth Speaker", + "popularity": 10 + } + }, + { + "_index": "products", + "_id": "1", + "_score": 1.0986123, + "_source": { + "title": "Wireless Earbuds", + "popularity": 1 + } + } + ] + } +} +``` + +### Sigmoid function + +The `sigmoid` function provides a smooth, S-shaped scoring curve which is especially useful when you want to control the steepness and midpoint of the scoring impact. The score is derived using formulae: `rank feature field value^exp / (rank feature field value^exp + pivot^exp)`, see following example: + +```json +POST /products/_search +{ + "query": { + "rank_feature": { + "field": "popularity", + "sigmoid": { + "pivot": 50, + "exponent": 0.5 + } + } + } +} +``` +{% include copy-curl.html %} + +* `pivot` defines the value at which the score is 0.5. 
+* `exponent` controls how steep the curve is. Higher values result in a sharper transition around the pivot; lower values produce a more gradual curve. + +The sigmoid function smoothly boosts scores around the `pivot` (`50` in this case), giving moderate preference to values near the pivot while flattening out both high and low extremes: + +```json +{ + ... + "hits": { + "total": { + "value": 7, + "relation": "eq" + }, + "max_score": 0.7597469, + "hits": [ + { + "_index": "products", + "_id": "7", + "_score": 0.7597469, + "_source": { + "title": "4K Monitor", + "popularity": 500 + } + }, + { + "_index": "products", + "_id": "6", + "_score": 0.690983, + "_source": { + "title": "Gaming Laptop", + "popularity": 250 + } + }, + { + "_index": "products", + "_id": "5", + "_score": 0.58578646, + "_source": { + "title": "Noise Cancelling Headphones", + "popularity": 100 + } + }, + { + "_index": "products", + "_id": "4", + "_score": 0.5, + "_source": { + "title": "Smartwatch", + "popularity": 50 + } + }, + { + "_index": "products", + "_id": "3", + "_score": 0.41421357, + "_source": { + "title": "Portable Charger", + "popularity": 25 + } + }, + { + "_index": "products", + "_id": "2", + "_score": 0.309017, + "_source": { + "title": "Bluetooth Speaker", + "popularity": 10 + } + }, + { + "_index": "products", + "_id": "1", + "_score": 0.12389934, + "_source": { + "title": "Wireless Earbuds", + "popularity": 1 + } + } + ] + } +} +``` + +### Invert score impact + +By default, higher values lead to higher scores. 
If you want lower values to yield higher scores (e.g., lower prices are more relevant), set `positive_score_impact` to `false` during index creation: + +```json +PUT /products_new +{ + "mappings": { + "properties": { + "popularity": { + "type": "rank_feature", + "positive_score_impact": false + } + } + } +} +``` +{% include copy-curl.html %} From 4595b4529cc38a832521643b9cdbf06d54a6baab Mon Sep 17 00:00:00 2001 From: Anton Rubin Date: Tue, 8 Jul 2025 12:07:50 +0100 Subject: [PATCH 2/2] addressing PR comments Signed-off-by: Anton Rubin --- _query-dsl/specialized/rank-feature.md | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/_query-dsl/specialized/rank-feature.md b/_query-dsl/specialized/rank-feature.md index 73c13f4460a..b11fb63d6d9 100644 --- a/_query-dsl/specialized/rank-feature.md +++ b/_query-dsl/specialized/rank-feature.md @@ -11,7 +11,7 @@ Use the `rank_feature` query to boost document scores based on numeric values in The `rank_feature` query expects the target field to be mapped as a [`rank_feature` field type]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/rank/). This enables internally optimized scoring for fast and efficient boosting. -The score impact depends on the field value and the optional `saturation`, `log` or `sigmoid` function used. +The score impact depends on the field value and the optional `saturation`, `log`, or `sigmoid` function used. These functions are applied dynamically at query time to compute the final document score; they do not alter or store any values in the document itself. ## Parameters @@ -192,9 +192,9 @@ This ranks all documents matching "headphones" and boosts those with higher popu ## Boost parameter -The `boost` parameter allows you to scale the score contribution of the rank_feature clause. It's especially useful in compound queries such as bool, where you want to control the influence of a feature relative to other conditions. 
+The `boost` parameter allows you to scale the score contribution of the `rank_feature` clause. It's especially useful in compound queries such as `bool`, where you want to control how much influence a numeric field (such as popularity, freshness, or relevance score) has on the final document ranking. -In the following example, the `bool` query matches documents with the term "headphones" in the `title`, and boosts more popular results with a `rank_feature` clause using a `boost` of `2.0`: +In the following example, the `bool` query matches documents with the term "headphones" in the `title`, and boosts more popular results using a `rank_feature` clause with a `boost` of `2.0`: ```json POST /products/_search @@ -226,7 +226,7 @@ By default, the `rank_feature` query uses a `saturation` function with a `pivot` ### Saturation function -The `saturation` function is the default scoring method used in `rank_feature` queries. It assigns higher scores to documents with larger feature values, but the increase in score becomes more gradual as the value exceeds a specified pivot. This is useful when you want to give diminishing returns to very large values, for example, boosting `popularity` while avoiding over-rewarding extremely high numbers. The formulae for calculating score is: `value of the rank_feature field / (value of the rank_feature field + pivot)`. The produced score is always between `0` and `1`. +The `saturation` function is the default scoring method used in `rank_feature` queries. It assigns higher scores to documents with larger feature values, but the increase in score becomes more gradual as the value exceeds a specified pivot. This is useful when you want to give diminishing returns to very large values, for example, boosting `popularity` while avoiding over-rewarding extremely high numbers. The formula for calculating the score is: `value of the rank_feature field / (value of the rank_feature field + pivot)`. The produced score is always between `0` and `1`. 
If the pivot is not provided, the approximate geometric mean of all `rank_feature` values in the index is used. See the following example using `saturation` with `pivot` configured to `50`: ```json POST /products/_search @@ -323,8 +323,6 @@ The `pivot` defines the point at which scoring growth slows down. Values higher } ``` -If the pivot is not provided, approximate geometric mean of all rank_feature values in the index is used. - ### Log function The log function is helpful when the range of values in your `rank_feature` field varies significantly. It applies a logarithmic scale to the `score`, which reduces the effect of extremely high values and helps normalize scoring across wide value distributions. This is especially useful when a small difference between low values should be more impactful than a large difference between high values. The score is derived using formulae: `log(scaling_factor + rank_feature field)`, see following example: @@ -426,7 +424,7 @@ In the example dataset, the `popularity` field ranges from `1` to `500`. The `lo ### Sigmoid function -The `sigmoid` function provides a smooth, S-shaped scoring curve which is especially useful when you want to control the steepness and midpoint of the scoring impact. The score is derived using formulae: `rank feature field value^exp / (rank feature field value^exp + pivot^exp)`, see following example: +The `sigmoid` function provides a smooth, S-shaped scoring curve which is especially useful when you want to control the steepness and midpoint of the scoring impact. The score is derived using the formula `rank feature field value^exp / (rank feature field value^exp + pivot^exp)`. The following example shows a query using the `sigmoid` function with a configured `pivot` and `exponent`: ```json POST /products/_search @@ -529,7 +527,7 @@ The sigmoid function smoothly boosts scores around the `pivot` (`50` in this cas ### Invert score impact -By default, higher values lead to higher scores. 
If you want lower values to yield higher scores (e.g., lower prices are more relevant), set `positive_score_impact` to `false` during index creation: +By default, higher values lead to higher scores. If you want lower values to yield higher scores (for example, lower prices are more relevant), set `positive_score_impact` to `false` during index creation: ```json PUT /products_new