Decouple smoothing from temp_ext

Silver267 · Silver267 · commit fecd48ac1172 · 2025-05-10T21:17:42.000-04:00
diff --git a/common/arg.cpp b/common/arg.cpp
@@ -1760,6 +1760,20 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             params.sampling.xtc_threshold = std::stof(value);
         }
     ).set_sparam());
+    add_opt(common_arg(
+        {"--smoothing-factor"}, "N",
+        string_format("smoothing factor (default: %.1f, 0.0 = disabled)", (double)params.sampling.smoothing_factor),
+        [](common_params & params, const std::string & value) {
+            params.sampling.smoothing_factor = std::stof(value);
+        }
+    ).set_sparam());
+    add_opt(common_arg(
+        {"--smoothing-curve"}, "N",
+        string_format("smoothing curve (default: %.1f, 1.0 = disabled)", (double)params.sampling.smoothing_curve),
+        [](common_params & params, const std::string & value) {
+            params.sampling.smoothing_curve = std::stof(value);
+        }
+    ).set_sparam());
     add_opt(common_arg(
         {"--typical"}, "N",
         string_format("locally typical sampling, parameter p (default: %.1f, 1.0 = disabled)", (double)params.sampling.typ_p),
diff --git a/common/common.h b/common/common.h
@@ -96,6 +96,7 @@ enum common_sampler_type {
     COMMON_SAMPLER_TYPE_INFILL      = 9,
     COMMON_SAMPLER_TYPE_PENALTIES   = 10,
     COMMON_SAMPLER_TYPE_TOP_N_SIGMA = 11,
+    COMMON_SAMPLER_TYPE_SMOOTHING   = 12,
 };
 
 // dimensionality reduction methods, used by cvector-generator
@@ -139,7 +140,7 @@ struct common_params_sampling {
     float   temp               = 0.80f; // <= 0.0 to sample greedily, 0.0 to not output probabilities
     float   dynatemp_range     = 0.00f; // 0.0 = disabled
     float   dynatemp_exponent  = 1.00f; // controls how entropy maps to temperature in dynamic temperature sampler
-    float   smoothing_factor   = 0.0f;  // controls the quadratic adjustment in smooth / quadratic sampling
+    float   smoothing_factor   = 0.0f;  // controls the quadratic adjustment in smooth / quadratic sampling (0.0 = disabled)
     float   smoothing_curve    = 1.0f;  // controls the quadratic adjustment in smooth / quadratic sampling
     int32_t penalty_last_n     = 64;    // last n tokens to penalize (0 = disable penalty, -1 = context size)
     float   penalty_repeat     = 1.00f; // 1.0 = disabled
@@ -169,6 +170,7 @@ struct common_params_sampling {
         COMMON_SAMPLER_TYPE_TOP_P,
         COMMON_SAMPLER_TYPE_MIN_P,
         COMMON_SAMPLER_TYPE_XTC,
+        COMMON_SAMPLER_TYPE_SMOOTHING,
         COMMON_SAMPLER_TYPE_TEMPERATURE,
     };
 
diff --git a/common/sampling.cpp b/common/sampling.cpp
@@ -136,11 +136,11 @@ std::string common_params_sampling::print() const {
     snprintf(result, sizeof(result),
             "\trepeat_last_n = %d, repeat_penalty = %.3f, frequency_penalty = %.3f, presence_penalty = %.3f\n"
             "\tdry_multiplier = %.3f, dry_base = %.3f, dry_allowed_length = %d, dry_penalty_last_n = %d\n"
-            "\ttop_k = %d, top_p = %.3f, min_p = %.3f, xtc_probability = %.3f, xtc_threshold = %.3f, typical_p = %.3f, top_n_sigma = %.3f, temp = %.3f\n"
+            "\ttop_k = %d, top_p = %.3f, min_p = %.3f, xtc_probability = %.3f, xtc_threshold = %.3f, smoothing_factor = %.3f, smoothing_curve = %.3f, typical_p = %.3f, top_n_sigma = %.3f, temp = %.3f\n"
             "\tmirostat = %d, mirostat_lr = %.3f, mirostat_ent = %.3f",
             penalty_last_n, penalty_repeat, penalty_freq, penalty_present,
             dry_multiplier, dry_base, dry_allowed_length, dry_penalty_last_n,
-            top_k, top_p, min_p, xtc_probability, xtc_threshold, typ_p, top_n_sigma, temp,
+            top_k, top_p, min_p, xtc_probability, xtc_threshold, smoothing_factor, smoothing_curve, typ_p, top_n_sigma, temp,
             mirostat, mirostat_eta, mirostat_tau);
 
     return std::string(result);
@@ -258,11 +258,14 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, co
                 case COMMON_SAMPLER_TYPE_XTC:
                     llama_sampler_chain_add(result->chain, llama_sampler_init_xtc         (params.xtc_probability, params.xtc_threshold, params.min_keep, params.seed));
                     break;
+                case COMMON_SAMPLER_TYPE_SMOOTHING:
+                    llama_sampler_chain_add(result->chain, llama_sampler_init_smoothing   (params.smoothing_factor, params.smoothing_curve));
+                    break;
                 case COMMON_SAMPLER_TYPE_TYPICAL_P:
                     llama_sampler_chain_add(result->chain, llama_sampler_init_typical     (params.typ_p, params.min_keep));
                     break;
                 case COMMON_SAMPLER_TYPE_TEMPERATURE:
-                    llama_sampler_chain_add(result->chain, llama_sampler_init_temp_ext    (params.temp, params.dynatemp_range, params.dynatemp_exponent, params.smoothing_factor, params.smoothing_curve));
+                    llama_sampler_chain_add(result->chain, llama_sampler_init_temp_ext    (params.temp, params.dynatemp_range, params.dynatemp_exponent));
                     break;
                 case COMMON_SAMPLER_TYPE_INFILL:
                     llama_sampler_chain_add(result->chain, llama_sampler_init_infill      (vocab));
@@ -479,6 +482,7 @@ char common_sampler_type_to_chr(enum common_sampler_type cnstr) {
         case COMMON_SAMPLER_TYPE_XTC:         return 'x';
         case COMMON_SAMPLER_TYPE_INFILL:      return 'i';
         case COMMON_SAMPLER_TYPE_PENALTIES:   return 'e';
+        case COMMON_SAMPLER_TYPE_SMOOTHING:   return 'q';
         default : return '?';
     }
 }
@@ -495,6 +499,7 @@ std::string common_sampler_type_to_str(enum common_sampler_type cnstr) {
         case COMMON_SAMPLER_TYPE_XTC:         return "xtc";
         case COMMON_SAMPLER_TYPE_INFILL:      return "infill";
         case COMMON_SAMPLER_TYPE_PENALTIES:   return "penalties";
+        case COMMON_SAMPLER_TYPE_SMOOTHING:   return "smoothing";
         default : return "";
     }
 }
@@ -509,6 +514,7 @@ std::vector<common_sampler_type> common_sampler_types_from_names(const std::vect
         { "min_p",       COMMON_SAMPLER_TYPE_MIN_P },
         { "temperature", COMMON_SAMPLER_TYPE_TEMPERATURE },
         { "xtc",         COMMON_SAMPLER_TYPE_XTC },
+        { "smoothing",   COMMON_SAMPLER_TYPE_SMOOTHING},
         { "infill",      COMMON_SAMPLER_TYPE_INFILL },
         { "penalties",   COMMON_SAMPLER_TYPE_PENALTIES },
     };
@@ -525,6 +531,7 @@ std::vector<common_sampler_type> common_sampler_types_from_names(const std::vect
         { "typ-p",       COMMON_SAMPLER_TYPE_TYPICAL_P },
         { "typ",         COMMON_SAMPLER_TYPE_TYPICAL_P },
         { "min-p",       COMMON_SAMPLER_TYPE_MIN_P },
+        { "quadratic",   COMMON_SAMPLER_TYPE_SMOOTHING},
         { "temp",        COMMON_SAMPLER_TYPE_TEMPERATURE },
     };
 
@@ -560,6 +567,7 @@ std::vector<common_sampler_type> common_sampler_types_from_chars(const std::stri
         { common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_MIN_P),       COMMON_SAMPLER_TYPE_MIN_P },
         { common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_TEMPERATURE), COMMON_SAMPLER_TYPE_TEMPERATURE },
         { common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_XTC),         COMMON_SAMPLER_TYPE_XTC },
+        { common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_SMOOTHING),   COMMON_SAMPLER_TYPE_SMOOTHING},
         { common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_INFILL),      COMMON_SAMPLER_TYPE_INFILL },
         { common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_PENALTIES),   COMMON_SAMPLER_TYPE_PENALTIES },
     };
diff --git a/include/llama.h b/include/llama.h
@@ -1251,12 +1251,15 @@ extern "C" {
     /// #details Updates the logits l_i` = l_i/t. When t <= 0.0f, the maximum logit is kept at it's original value, the rest are set to -inf
     LLAMA_API struct llama_sampler * llama_sampler_init_temp       (float   t);
 
-    /// @details Dynamic temperature (a.k.a. entropy) + Smooth Sampling implementations wrapped into one function, no research papers available.
-    LLAMA_API struct llama_sampler * llama_sampler_init_temp_ext   (float   t, float   delta, float exponent, float   smoothing_factor, float   smoothing_curve);
+    /// @details Dynamic temperature implementation (a.k.a. entropy) described in the paper https://arxiv.org/abs/2309.02772.
+    LLAMA_API struct llama_sampler * llama_sampler_init_temp_ext   (float   t, float   delta, float exponent);
 
     /// @details XTC sampler as described in https://github.com/oobabooga/text-generation-webui/pull/6335
     LLAMA_API struct llama_sampler * llama_sampler_init_xtc        (float   p, float   t,     size_t min_keep, uint32_t seed);
 
+    /// @details Smoothing sampling as described in https://github.com/ggml-org/llama.cpp/pull/6445
+    LLAMA_API struct llama_sampler * llama_sampler_init_smoothing  (float   factor, float curve);
+
     /// @details Top n sigma sampling as described in academic paper "Top-nσ: Not All Logits Are You Need" https://arxiv.org/pdf/2411.07641
     LLAMA_API struct llama_sampler * llama_sampler_init_top_n_sigma(float   n);
 
diff --git a/src/llama-sampling.cpp b/src/llama-sampling.cpp
@@ -1005,8 +1005,6 @@ struct llama_sampler_temp_ext {
     const float temp;
     const float delta;
     const float exponent;
-    const float smoothing_factor;
-    const float smoothing_curve;
 };
 
 static const char * llama_sampler_temp_ext_name(const struct llama_sampler * /*smpl*/) {
@@ -1021,21 +1019,6 @@ static void llama_sampler_temp_ext_apply(struct llama_sampler * smpl, llama_toke
         return;
     }
 
-    // Apply smoothing if smoothing_factor is > 0. Do not change base implementation otherwise.
-    if (ctx->smoothing_factor > 0.0f) {
-        llama_sampler_softmax_impl(cur_p);
-        float h = cur_p->data[0].logit; // Find the maximum logit for h to be added after the transformation
-
-        // Apply the modified quadratic transformation using the smoothing_factor and smoothing_curve
-        for (size_t i = 0; i < cur_p->size; ++i) {
-            float logit_shifted = cur_p->data[i].logit - h;
-            float k = (3 - ctx->smoothing_curve) / 2;
-            float s = (ctx->smoothing_curve - 1) / 2;
-            cur_p->data[i].logit = -(k * ctx->smoothing_factor * logit_shifted * logit_shifted) + (s * ctx->smoothing_factor * logit_shifted * logit_shifted * logit_shifted) + h;
-        }
-        llama_sampler_softmax_impl(cur_p);
-    }
-
     if (ctx->delta > 0) {
         const float min_temp = std::max(0.0f, ctx->temp - ctx->delta);
         const float max_temp = ctx->temp + ctx->delta;
@@ -1102,7 +1085,7 @@ static void llama_sampler_temp_ext_apply(struct llama_sampler * smpl, llama_toke
 
 static struct llama_sampler * llama_sampler_temp_ext_clone(const struct llama_sampler * smpl) {
     const auto * ctx = (const llama_sampler_temp_ext *) smpl->ctx;
-    return llama_sampler_init_temp_ext(ctx->temp, ctx->delta, ctx->exponent, ctx->smoothing_factor, ctx->smoothing_curve);
+    return llama_sampler_init_temp_ext(ctx->temp, ctx->delta, ctx->exponent);
 }
 
 static void llama_sampler_temp_ext_free(struct llama_sampler * smpl) {
@@ -1118,15 +1101,13 @@ static struct llama_sampler_i llama_sampler_temp_ext_i = {
     /* .free   = */ llama_sampler_temp_ext_free,
 };
 
-struct llama_sampler * llama_sampler_init_temp_ext(float temp, float delta, float exponent, float smoothing_factor, float smoothing_curve) {
+struct llama_sampler * llama_sampler_init_temp_ext(float temp, float delta, float exponent) {
     return llama_sampler_init(
         /* .iface = */ &llama_sampler_temp_ext_i,
         /* .ctx   = */ new llama_sampler_temp_ext {
             /* .temp     = */ temp,
             /* .delta    = */ delta,
-            /* .exponent = */ exponent,
-            /* .smoothing_factor = */ smoothing_factor,
-            /* .smoothing_curve = */ smoothing_curve
+            /* .exponent = */ exponent
         }
     );
 }
@@ -1226,6 +1207,68 @@ struct llama_sampler * llama_sampler_init_xtc(float p, float t, size_t min_keep,
     );
 }
 
+// smoothing
+
+struct llama_sampler_smoothing {
+    const float factor; // strength of the transformation; apply() leaves the candidates untouched unless this is > 0
+    const float curve;  // blends the quadratic and cubic terms: k = (3 - curve) / 2, s = (curve - 1) / 2
+};
+
+static const char * llama_sampler_smoothing_name(const struct llama_sampler * /*smpl*/) {
+    return "smoothing"; // fixed name; the sampler argument is not used
+}
+
+static void llama_sampler_smoothing_apply(struct llama_sampler * smpl, llama_token_data_array * cur_p) { // rewrites the candidate logits in cur_p in place
+    const auto * ctx = (llama_sampler_smoothing *) smpl->ctx;
+
+    // nothing to reshape when there is at most one candidate
+    if (cur_p->size <= 1) {
+        return;
+    }
+
+    if (ctx->factor > 0.0f) { // a non-positive factor leaves the candidates untouched
+        llama_sampler_softmax_impl(cur_p);
+        float h = cur_p->data[0].logit; // reference logit, re-added after the transformation (assumes softmax_impl left the maximum logit first - TODO confirm)
+
+        // new logit = h - k * factor * d^2 + s * factor * d^3, where d = logit - h
+        for (size_t i = 0; i < cur_p->size; ++i) {
+            float logit_shifted = cur_p->data[i].logit - h;
+            float k = (3 - ctx->curve) / 2; // weight of the quadratic term (1 when curve == 1)
+            float s = (ctx->curve - 1) / 2; // weight of the cubic term (0 when curve == 1)
+            cur_p->data[i].logit = -(k * ctx->factor * logit_shifted * logit_shifted) + (s * ctx->factor * logit_shifted * logit_shifted * logit_shifted) + h;
+        }
+        llama_sampler_softmax_impl(cur_p); // second pass over the rewritten logits (presumably refreshes the probabilities - confirm)
+    }
+}
+
+static struct llama_sampler * llama_sampler_smoothing_clone(const struct llama_sampler * smpl) {
+    const auto * ctx = (const llama_sampler_smoothing *) smpl->ctx;
+    return llama_sampler_init_smoothing(ctx->factor, ctx->curve); // new sampler built from the same two parameters
+}
+
+static void llama_sampler_smoothing_free(struct llama_sampler * smpl) {
+    delete (llama_sampler_smoothing *) smpl->ctx; // releases the context allocated with new in llama_sampler_init_smoothing
+}
+
+static struct llama_sampler_i llama_sampler_smoothing_i = { // callback table passed to llama_sampler_init for the smoothing sampler
+    /* .name   = */ llama_sampler_smoothing_name,
+    /* .accept = */ nullptr, // no accept callback is provided
+    /* .apply  = */ llama_sampler_smoothing_apply,
+    /* .reset  = */ nullptr, // no reset callback is provided; the context fields are const
+    /* .clone  = */ llama_sampler_smoothing_clone,
+    /* .free   = */ llama_sampler_smoothing_free,
+};
+
+struct llama_sampler * llama_sampler_init_smoothing(float factor, float curve) {
+    return llama_sampler_init(
+        /* .iface = */ &llama_sampler_smoothing_i,
+        /* .ctx   = */ new llama_sampler_smoothing {
+            /* .factor = */ factor,
+            /* .curve  = */ curve
+        }
+    );
+}
+
 // mirostat
 
 struct llama_sampler_mirostat {
diff --git a/tests/test-sampling.cpp b/tests/test-sampling.cpp
@@ -72,11 +72,11 @@ static void test_temp(const std::vector<float> & probs, const std::vector<float>
     tester.check();
 }
 
-static void test_temp_ext(const std::vector<float> & probs, const std::vector<float> & probs_expected, float temp, float delta, float exponent, float smoothing_factor, float smoothing_curve) {
+static void test_temp_ext(const std::vector<float> & probs, const std::vector<float> & probs_expected, float temp, float delta, float exponent) {
     sampler_tester tester(probs, probs_expected);
 
     DUMP(&tester.cur_p);
-    tester.apply(llama_sampler_init_temp_ext(temp, delta, exponent, smoothing_factor, smoothing_curve));
+    tester.apply(llama_sampler_init_temp_ext(temp, delta, exponent));
     tester.apply(llama_sampler_init_dist (0));
     DUMP(&tester.cur_p);
 
@@ -126,6 +126,17 @@ static void test_xtc(const std::vector<float> & probs, const std::vector<float>
     tester.check();
 }
 
+static void test_smoothing(const std::vector<float> & probs, const std::vector<float> & probs_expected, float factor, float curve) {
+    sampler_tester tester(probs, probs_expected);
+
+    DUMP(&tester.cur_p);
+    tester.apply(llama_sampler_init_smoothing(factor, curve)); // sampler under test
+    tester.apply(llama_sampler_init_dist (0)); // same follow-up step that test_temp_ext uses
+    DUMP(&tester.cur_p);
+
+    tester.check();
+}
+
 static void test_typical(const std::vector<float> & probs, const std::vector<float> & probs_expected, float p) {
     sampler_tester tester(probs, probs_expected);
 
@@ -311,11 +322,8 @@ int main(void) {
     test_temp({0.1f, 0.2f, 0.3f, 0.4f}, {0.4f, 0.3f, 0.2f, 0.1f}, 1.0f);
     test_temp({0.1f, 0.2f, 0.3f, 0.4f}, {1.0f, 0.0f, 0.0f, 0.0f}, 0.0f);
 
-    test_temp_ext({0.1f, 0.2f, 0.3f, 0.4f}, {0.4f, 0.3f, 0.2f, 0.1f}, 1.0f, 0.0f, 1.0f, 0.0f, 1.0f);
-    test_temp_ext({0.1f, 0.2f, 0.3f, 0.4f}, {1.0f, 0.0f, 0.0f, 0.0f}, 0.0f, 0.0f, 1.0f, 0.0f, 1.0f);
-
-    test_temp_ext({0.1f, 0.2f, 0.3f, 0.4f}, {0.372382f, 0.342804f, 0.230319f, 0.054495f}, 1.0f, 0.0f, 1.0f, 1.0f, 1.0f);
-    test_temp_ext({0.1f, 0.2f, 0.3f, 0.4f}, {0.368339f, 0.349226f, 0.245247f, 0.037188f}, 1.0f, 0.0f, 1.0f, 1.0f, 2.0f);
+    test_temp_ext({0.1f, 0.2f, 0.3f, 0.4f}, {0.4f, 0.3f, 0.2f, 0.1f}, 1.0f, 0.0f, 1.0f);
+    test_temp_ext({0.1f, 0.2f, 0.3f, 0.4f}, {1.0f, 0.0f, 0.0f, 0.0f}, 0.0f, 0.0f, 1.0f);
 
     test_top_k({0.1f, 0.2f, 0.3f, 0.4f}, {1.0f}, 1);
     test_top_k({0.1f, 0.2f, 0.3f, 0.4f}, {0.44444f, 0.33333f, 0.22222f}, 3);
@@ -344,6 +352,9 @@ int main(void) {
     printf("XTC should not:\n");
     test_xtc({0.4f, 0.3f, 0.2f, 0.1f},   {0.4f, 0.3f, 0.2f, 0.1f},              0.99f, 0.39f);
 
+    test_smoothing({0.1f, 0.2f, 0.3f, 0.4f}, {0.372382f, 0.342804f, 0.230319f, 0.054495f}, 1.0f, 1.0f);
+    test_smoothing({0.1f, 0.2f, 0.3f, 0.4f}, {0.368339f, 0.349226f, 0.245247f, 0.037188f}, 1.0f, 2.0f);
+
     test_typical({0.97f, 0.01f, 0.01f, 0.01f}, {0.97f}, 0.5f);
     test_typical({0.4f, 0.2f, 0.2f, 0.2f}, {0.2f, 0.2f, 0.2f}, 0.5f);
 
diff --git a/tools/main/README.md b/tools/main/README.md
@@ -296,6 +296,13 @@ Being experimental and unique, XTC is disabled by default. The recommended combi
 
 Example usage: `--xtc-probability 0.5 --xtc-threshold 0.1`
 
+### Smoothing / Quadratic Sampling
+
+-   `--smoothing-factor N`: Set the smoothing factor for smoothing / quadratic sampling (default: 0.0, 0.0 = disabled).
+-   `--smoothing-curve N`: Set the cubic transformation curve for smoothing / quadratic sampling (default: 1.0, 1.0 = plain quadratic, no cubic term).
+
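+Smoothing / Quadratic Sampling reshapes the probability of each token instead of removing tokens, similar to what temperature does: every logit is replaced by a quadratic (and, when `--smoothing-curve` is not 1.0, cubic) function of its distance from the top logit, scaled by the smoothing factor. (TODO: finish this part)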
+
 ### Top-nσ Sampling
 
 -   `--top-nsigma N`: Limit the next token selection to a subset of tokens with pre-softmax logits that are within n * σ less than the max logit (default: -1, -1 = disabled).
diff --git a/tools/server/README.md b/tools/server/README.md
@@ -115,6 +115,8 @@ The project is under active development, and we are [looking for feedback and co
 | `--min-p N` | min-p sampling (default: 0.1, 0.0 = disabled) |
 | `--xtc-probability N` | xtc probability (default: 0.0, 0.0 = disabled) |
 | `--xtc-threshold N` | xtc threshold (default: 0.1, 1.0 = disabled) |
+| `--smoothing-factor N` | smoothing factor (default: 0.0, 0.0 = disabled) |
+| `--smoothing-curve N` | smoothing curve (default: 1.0, 1.0 = disabled) |
 | `--typical N` | locally typical sampling, parameter p (default: 1.0, 1.0 = disabled) |
 | `--repeat-last-n N` | last n tokens to consider for penalize (default: 64, 0 = disabled, -1 = ctx_size) |
 | `--repeat-penalty N` | penalize repeat sequence of tokens (default: 1.0, 1.0 = disabled) |
@@ -447,6 +449,10 @@ These words will not be included in the completion, so make sure to add them to
 
 `xtc_threshold`: Set a minimum probability threshold for tokens to be removed via XTC sampler. Default: `0.1` (> `0.5` disables XTC)
 
+`smoothing_factor`: Set the smoothing factor for smoothing / quadratic sampling. Default: `0.0`, which is disabled.
+
+`smoothing_curve`: Set the cubic transformation curve for smoothing / quadratic sampling. Default: `1.0`, which makes it behave like the original quadratic sampler.
+
 `mirostat`: Enable Mirostat sampling, controlling perplexity during text generation. Default: `0`, where `0` is disabled, `1` is Mirostat, and `2` is Mirostat 2.0.
 
 `mirostat_tau`: Set the Mirostat target entropy, parameter tau. Default: `5.0`