Skip to content

Commit 923e09a

Browse files
GOODBOY008 authored and ilayaperumalg committed
feat(ollama): Add min_p parameter for improved sampling control
- Add min_p option
- Add qwq model

Signed-off-by: gongzhongqiang <gongzhongqiang@apache.org>
1 parent 9a428f4 commit 923e09a

File tree

4 files changed

+35
-3
lines changed

4 files changed

+35
-3
lines changed

models/spring-ai-ollama/src/main/java/org/springframework/ai/ollama/api/OllamaModel.java

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,11 @@ public enum OllamaModel implements ChatModelDescription {
3232
*/
3333
QWEN_2_5_7B("qwen2.5"),
3434

35+
/**
36+
* QwQ is the reasoning model of the Qwen series.
37+
*/
38+
QWQ("qwq"),
39+
3540
/**
3641
* Llama 2 is a collection of language models ranging from 7B to 70B parameters.
3742
*/

models/spring-ai-ollama/src/main/java/org/springframework/ai/ollama/api/OllamaOptions.java

Lines changed: 28 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -191,6 +191,16 @@ public class OllamaOptions implements ToolCallingChatOptions, EmbeddingOptions {
191191
@JsonProperty("top_p")
192192
private Double topP;
193193

194+
/**
195+
* Alternative to the top_p, and aims to ensure a balance of quality and variety.
196+
* The parameter p represents the minimum probability for a token to be considered,
197+
* relative to the probability of the most likely token. For example, with p=0.05 and
198+
* the most likely token having a probability of 0.9, logits with a value
199+
* less than 0.045 are filtered out. (Default: 0.0)
200+
*/
201+
@JsonProperty("min_p")
202+
private Double minP;
203+
194204
/**
195205
* Tail free sampling is used to reduce the impact of less probable tokens
196206
* from the output. A higher value (e.g., 2.0) will reduce the impact more, while a
@@ -372,6 +382,7 @@ public static OllamaOptions fromOptions(OllamaOptions fromOptions) {
372382
.numPredict(fromOptions.getNumPredict())
373383
.topK(fromOptions.getTopK())
374384
.topP(fromOptions.getTopP())
385+
.minP(fromOptions.getMinP())
375386
.tfsZ(fromOptions.getTfsZ())
376387
.typicalP(fromOptions.getTypicalP())
377388
.repeatLastN(fromOptions.getRepeatLastN())
@@ -567,6 +578,14 @@ public void setTopP(Double topP) {
567578
this.topP = topP;
568579
}
569580

581+
public Double getMinP() {
582+
return this.minP;
583+
}
584+
585+
public void setMinP(Double minP) {
586+
this.minP = minP;
587+
}
588+
570589
public Float getTfsZ() {
571590
return this.tfsZ;
572591
}
@@ -819,8 +838,9 @@ public boolean equals(Object o) {
819838
&& Objects.equals(this.useMLock, that.useMLock) && Objects.equals(this.numThread, that.numThread)
820839
&& Objects.equals(this.numKeep, that.numKeep) && Objects.equals(this.seed, that.seed)
821840
&& Objects.equals(this.numPredict, that.numPredict) && Objects.equals(this.topK, that.topK)
822-
&& Objects.equals(this.topP, that.topP) && Objects.equals(this.tfsZ, that.tfsZ)
823-
&& Objects.equals(this.typicalP, that.typicalP) && Objects.equals(this.repeatLastN, that.repeatLastN)
841+
&& Objects.equals(this.topP, that.topP) && Objects.equals(this.minP, that.minP)
842+
&& Objects.equals(this.tfsZ, that.tfsZ) && Objects.equals(this.typicalP, that.typicalP)
843+
&& Objects.equals(this.repeatLastN, that.repeatLastN)
824844
&& Objects.equals(this.temperature, that.temperature)
825845
&& Objects.equals(this.repeatPenalty, that.repeatPenalty)
826846
&& Objects.equals(this.presencePenalty, that.presencePenalty)
@@ -838,7 +858,7 @@ public int hashCode() {
838858
return Objects.hash(this.model, this.format, this.keepAlive, this.truncate, this.useNUMA, this.numCtx,
839859
this.numBatch, this.numGPU, this.mainGPU, this.lowVRAM, this.f16KV, this.logitsAll, this.vocabOnly,
840860
this.useMMap, this.useMLock, this.numThread, this.numKeep, this.seed, this.numPredict, this.topK,
841-
this.topP, this.tfsZ, this.typicalP, this.repeatLastN, this.temperature, this.repeatPenalty,
861+
this.topP, this.minP, this.tfsZ, this.typicalP, this.repeatLastN, this.temperature, this.repeatPenalty,
842862
this.presencePenalty, this.frequencyPenalty, this.mirostat, this.mirostatTau, this.mirostatEta,
843863
this.penalizeNewline, this.stop, this.toolCallbacks, this.toolNames, this.internalToolExecutionEnabled,
844864
this.toolContext);
@@ -958,6 +978,11 @@ public Builder topP(Double topP) {
958978
return this;
959979
}
960980

981+
public Builder minP(Double minP) {
982+
this.options.minP = minP;
983+
return this;
984+
}
985+
961986
public Builder tfsZ(Float tfsZ) {
962987
this.options.tfsZ = tfsZ;
963988
return this;

spring-ai-docs/src/main/antora/modules/ROOT/pages/api/chat/ollama-chat.adoc

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -137,6 +137,7 @@ The remaining `options` properties are based on the link:https://github.com/olla
137137
| spring.ai.ollama.chat.options.num-predict | Maximum number of tokens to predict when generating text. (-1 = infinite generation, -2 = fill context) | -1
138138
| spring.ai.ollama.chat.options.top-k | Reduces the probability of generating nonsense. A higher value (e.g., 100) will give more diverse answers, while a lower value (e.g., 10) will be more conservative. | 40
139139
| spring.ai.ollama.chat.options.top-p | Works together with top-k. A higher value (e.g., 0.95) will lead to more diverse text, while a lower value (e.g., 0.5) will generate more focused and conservative text. | 0.9
140+
| spring.ai.ollama.chat.options.min-p | Alternative to the top_p, and aims to ensure a balance of quality and variety. The parameter p represents the minimum probability for a token to be considered, relative to the probability of the most likely token. For example, with p=0.05 and the most likely token having a probability of 0.9, logits with a value less than 0.045 are filtered out. | 0.0
140141
| spring.ai.ollama.chat.options.tfs-z | Tail-free sampling is used to reduce the impact of less probable tokens from the output. A higher value (e.g., 2.0) will reduce the impact more, while a value of 1.0 disables this setting. | 1.0
141142
| spring.ai.ollama.chat.options.typical-p | - | 1.0
142143
| spring.ai.ollama.chat.options.repeat-last-n | Sets how far back for the model to look back to prevent repetition. (Default: 64, 0 = disabled, -1 = num_ctx) | 64

spring-ai-docs/src/main/antora/modules/ROOT/pages/api/embeddings/ollama-embeddings.adoc

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -142,6 +142,7 @@ The remaining `options` properties are based on the link:https://github.com/olla
142142
| spring.ai.ollama.embedding.options.num-predict | Maximum number of tokens to predict when generating text. (-1 = infinite generation, -2 = fill context) | -1
143143
| spring.ai.ollama.embedding.options.top-k | Reduces the probability of generating nonsense. A higher value (e.g., 100) will give more diverse answers, while a lower value (e.g., 10) will be more conservative. | 40
144144
| spring.ai.ollama.embedding.options.top-p | Works together with top-k. A higher value (e.g., 0.95) will lead to more diverse text, while a lower value (e.g., 0.5) will generate more focused and conservative text. | 0.9
145+
| spring.ai.ollama.embedding.options.min-p | Alternative to the top_p, and aims to ensure a balance of quality and variety. The parameter p represents the minimum probability for a token to be considered, relative to the probability of the most likely token. For example, with p=0.05 and the most likely token having a probability of 0.9, logits with a value less than 0.045 are filtered out. | 0.0
145146
| spring.ai.ollama.embedding.options.tfs-z | Tail-free sampling is used to reduce the impact of less probable tokens from the output. A higher value (e.g., 2.0) will reduce the impact more, while a value of 1.0 disables this setting. | 1.0
146147
| spring.ai.ollama.embedding.options.typical-p | - | 1.0
147148
| spring.ai.ollama.embedding.options.repeat-last-n | Sets how far back for the model to look back to prevent repetition. (Default: 64, 0 = disabled, -1 = num_ctx) | 64

0 commit comments

Comments (0)