
Commit 9d4e140

Parent: 7e6f733


57 files changed: +4012 / -2107 lines

Makefile

Lines changed: 2 additions & 1 deletion
@@ -486,8 +486,9 @@ OBJS_GGUF_LLAMA = \
 	$(TMP)$(PREFIX)_llama-io.o \
 	$(TMP)$(PREFIX)_llama-kv-cache-unified.o \
 	$(TMP)$(PREFIX)_llama-kv-cache-unified-iswa.o \
-	$(TMP)$(PREFIX)_llama-kv-cache-recurrent.o \
 	$(TMP)$(PREFIX)_llama-memory.o \
+	$(TMP)$(PREFIX)_llama-memory-hybrid.o \
+	$(TMP)$(PREFIX)_llama-memory-recurrent.o \
 	$(TMP)$(PREFIX)_llama-mmap.o \
 	$(TMP)$(PREFIX)_llama-model-loader.o \
 	$(TMP)$(PREFIX)_llama-model-saver.o \

base_sampling2/chat_layer.h

Lines changed: 3 additions & 3 deletions
@@ -1790,7 +1790,7 @@ class chat
         for (auto l : logit_bias_tokens_start) {
             ++checks;
             if (id == l) {
-                checks = 0;
+                //checks = 0;
                 std::string c_restricted_tkn_string = common_token_to_piece(ctx, id);
                 writeTextFile("logit_biasing.txt", std::format("{}: Found '{}';", params.sparams.seed, c_restricted_tkn_string));

@@ -2341,8 +2341,8 @@ class chat
         //process_prompt(false); // do not forget to include it elsewhere after loading the model
         //inputOnly(input); // MOVED

-        // std::string bit = getBit(emptyMessage, shortMessage);
-        std::string bit = getMultiBit(2, emptyMessage, shortMessage);
+        std::string bit = getBit(emptyMessage, shortMessage);
+        // std::string bit = getMultiBit(2, emptyMessage, shortMessage);

        if ((int) std::size(embd_inp) <= n_consumed) {
            if (debug) printf("-cso");

base_sampling2/common.cpp

Lines changed: 40 additions & 33 deletions
@@ -705,6 +705,8 @@ bool fs_validate_filename(const std::string & filename) {
     return true;
 }

+#include <iostream>
+
 // returns true if successful, false otherwise
 bool fs_create_directory_with_parents(const std::string & path) {
 #ifdef _WIN32

@@ -722,9 +724,16 @@ bool fs_create_directory_with_parents(const std::string & path) {
     // process path from front to back, procedurally creating directories
     while ((pos_slash = path.find('\\', pos_slash)) != std::string::npos) {
         const std::wstring subpath = wpath.substr(0, pos_slash);
-        const wchar_t * test = subpath.c_str();

-        const bool success = CreateDirectoryW(test, NULL);
+        pos_slash += 1;
+
+        // skip the drive letter, in some systems it can return an access denied error
+        if (subpath.length() == 2 && subpath[1] == ':') {
+            continue;
+        }
+
+        const bool success = CreateDirectoryW(subpath.c_str(), NULL);
+
         if (!success) {
             const DWORD error = GetLastError();

@@ -739,7 +748,6 @@ bool fs_create_directory_with_parents(const std::string & path) {
             }
         }

-        pos_slash += 1;
     }

     return true;
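The Windows fix above has two parts: `pos_slash` now advances at the top of the loop body (so the new `continue` cannot spin forever), and bare drive-letter prefixes such as `C:` are skipped, since `CreateDirectoryW` on a drive root can reportedly fail with an access-denied error rather than the benign `ERROR_ALREADY_EXISTS`, which made the whole function bail out. (The added `#include <iostream>` appears unused in the hunks shown, likely a debugging leftover.) A minimal Windows-only sketch of the failure mode the skip avoids, assuming an affected system (illustrative, not part of the commit):

// Illustrative repro; "C:" is what wpath.substr(0, pos_slash) yields for "C:\\some\\dir".
#include <windows.h>
#include <cstdio>

int main() {
    if (!CreateDirectoryW(L"C:", NULL)) {
        const DWORD err = GetLastError();
        // Usually ERROR_ALREADY_EXISTS (harmless), but some systems report
        // ERROR_ACCESS_DENIED here, which the old code treated as fatal.
        printf("CreateDirectoryW(\"C:\") failed with error %lu\n", (unsigned long) err);
    }
    return 0;
}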
@@ -841,31 +849,6 @@ struct common_init_result common_init_from_params(common_params & params) {

     const llama_vocab * vocab = llama_model_get_vocab(model);

-    if (params.reranking) {
-        bool ok = true;
-
-        if (llama_vocab_bos(vocab) == LLAMA_TOKEN_NULL) {
-            printf("%s: warning: vocab does not have a BOS token, reranking will not work\n", __func__);
-            ok = false;
-        }
-
-        if (llama_vocab_eos(vocab) == LLAMA_TOKEN_NULL) {
-            printf("%s: warning: vocab does not have an EOS token, reranking will not work\n", __func__);
-            ok = false;
-        }
-
-        if (llama_vocab_sep(vocab) == LLAMA_TOKEN_NULL) {
-            printf("%s: warning: vocab does not have a SEP token, reranking will not work\n", __func__);
-            ok = false;
-        }
-
-        if (!ok) {
-            llama_model_free(model);
-
-            return iparams;
-        }
-    }
-
     auto cparams = common_context_params_to_llama(params);

     llama_context * lctx = llama_new_context_with_model(model, cparams);
@@ -908,6 +891,35 @@ struct common_init_result common_init_from_params(common_params & params) {
         } else printf("%s: vectors applied \n", __func__);
     }

+    if (llama_pooling_type(lctx) == LLAMA_POOLING_TYPE_RANK) {
+        bool ok = true;
+
+        if (llama_vocab_bos(vocab) == LLAMA_TOKEN_NULL) {
+            printf("%s: warning: vocab does not have a BOS token, reranking will not work\n", __func__);
+            ok = false;
+        }
+
+        bool has_eos = llama_vocab_eos(vocab) != LLAMA_TOKEN_NULL;
+        bool has_sep = llama_vocab_sep(vocab) != LLAMA_TOKEN_NULL;
+
+        if (!has_eos && !has_sep) {
+            printf("%s: warning: vocab does not have an EOS token or SEP token, reranking will not work\n", __func__);
+            ok = false;
+        } else if (!has_eos) {
+            printf("%s: warning: vocab does not have an EOS token, using SEP token as fallback\n", __func__);
+        } else if (!has_sep) {
+            printf("%s: warning: vocab does not have a SEP token, reranking will not work\n", __func__);
+            ok = false;
+        }
+
+        if (!ok) {
+            llama_free(lctx);
+            llama_model_free(model);
+
+            return iparams;
+        }
+    }
+
     // load and optionally apply lora adapters
     for (auto & la : params.lora_adapters) {
         llama_adapter_lora_ptr lora;
@@ -1160,11 +1172,6 @@ struct llama_context_params common_context_params_to_llama(const common_params &
     cparams.no_perf = params.no_perf;
     cparams.swa_full = params.swa_full;

-    if (params.reranking) {
-        cparams.embeddings = true;
-        cparams.pooling_type = LLAMA_POOLING_TYPE_RANK;
-    }
-
     cparams.type_k = kv_cache_type_from_str(params.cache_type_k);
     cparams.type_v = kv_cache_type_from_str(params.cache_type_v);

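Taken together, the last three hunks (plus the common.h change below) make the pooling type the single source of truth for reranking instead of the removed `params.reranking` flag: the vocab sanity check now runs after the context exists and keys off `llama_pooling_type(lctx)`, a missing EOS token is tolerated when a SEP token can stand in as fallback, and the failure path now frees the context as well as the model (the old check ran before the context was created, so it only had the model to free). A caller-side sketch, assuming this fork keeps upstream llama.cpp's `embedding`/`pooling_type` fields on `common_params` (those names are not shown in this commit):

// Hypothetical caller: request reranking via the pooling type instead of
// the removed `reranking` flag. Field names assumed from upstream llama.cpp.
common_params params;
params.embedding    = true;                    // assumed field name
params.pooling_type = LLAMA_POOLING_TYPE_RANK; // mapped by common_context_params_to_llama()

common_init_result iparams = common_init_from_params(params);
// Inside common_init_from_params(), llama_pooling_type(lctx) now reports
// LLAMA_POOLING_TYPE_RANK, so the BOS/EOS/SEP checks above are exercised.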
base_sampling2/common.h

Lines changed: 0 additions & 1 deletion
@@ -367,7 +367,6 @@ struct common_params {
     int32_t embd_normalize = 2; // normalisation for embendings (-1=none, 0=max absolute int16, 1=taxicab, 2=euclidean, >2=p-norm)
     std::string embd_out = ""; // empty = default, "array" = [[],[]...], "json" = openai style, "json+" = same "json" + cosine similarity matrix
     std::string embd_sep = "\n"; // separator of embendings
-    bool reranking = false; // enable reranking support on server

     // server params
     int32_t port = 8080; // server listens on this network port

base_sampling2/llama-addon.cpp

Lines changed: 111 additions & 0 deletions
@@ -681,6 +681,8 @@ static void llama_sampler_min_p_addon_apply(struct llama_sampler * smpl, llama_t
     if (!filtered_tokens.empty() && filtered_tokens.size() >= ctx->min_keep) {
         memcpy(cur_p->data, filtered_tokens.data(), filtered_tokens.size()*sizeof(llama_token_data));
         cur_p->size = filtered_tokens.size();
+        // Guard against a single choice
+        if (cur_p->size < 2) cur_p->size = 2;
         min_p_applied = true;
     }
 }

@@ -706,6 +708,9 @@ static void llama_sampler_min_p_addon_apply(struct llama_sampler * smpl, llama_t
         }
     }

+    // Guard against a single choice
+    if (i < 2) i = 2;
+
     // Resize the output vector to keep only the matching tokens
     cur_p->size = i;
 }
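Both guards have the same intent: never let min-p truncation collapse the candidate list to a single token, so a stochastic sampler further down the chain still has a real choice instead of degenerating into greedy selection. Bumping the size back to 2 re-exposes the token already sitting in the next slot of `cur_p->data` (the runner-up when the list is sorted); this assumes the buffer actually holds at least two entries, which it does when the sampler runs over a full-vocabulary candidate array. A toy trace of the second guard, with invented values:

// Toy trace (invented values): truncation index after the min-p scan.
size_t i = 1;        // only one token cleared the min-p threshold this step
if (i < 2) i = 2;    // re-admit the next candidate in the sorted buffer
// cur_p->size = i;  // at least two candidates remain for the next sampler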
@@ -2190,5 +2195,111 @@ struct llama_sampler * llama_sampler_init_logit_bias_addon(
     );
 }

+// logit-bias-start
+
+struct llama_sampler_logit_bias_start_addon {
+    const int32_t n_vocab;
+
+    const std::vector<llama_logit_bias> logit_bias;
+
+    std::vector<llama_logit_bias> to_search;
+};
+
+static const char * llama_sampler_logit_bias_start_addon_name(const struct llama_sampler * /*smpl*/) {
+    return "logit-bias";
+}
+
+static void llama_sampler_logit_bias_start_addon_apply(struct llama_sampler * smpl, llama_token_data_array * cur_p) {
+    auto * ctx = (llama_sampler_logit_bias_start_addon *) smpl->ctx;
+
+    if (ctx->logit_bias.empty()) {
+        // std::string logits_orig = "\nLOGITS: EMPTY\n";
+        // if (test_dumbed_logits_biased == false) {
+        //     writeToFile("logit_biasing.txt", logits_orig);
+        //     test_dumbed_logits_biased = true;
+        // }
+        return;
+    }
+
+    ctx->to_search.clear();
+
+    // std::string logits_orig = "\nLOGITS:\n";
+    // std::string logits_positive = "\nLOGITS POS:\n";
+    // update the candidates that have not been shuffled in the vocabulary (i.e. idx == id)
+    for (const auto & lb : ctx->logit_bias) {
+        if (lb.token >= 0 && cur_p->size > (size_t) lb.token && cur_p->data[lb.token].id == lb.token) {
+            // if (lb.bias < 0) {
+            //     logits_orig += std::to_string(cur_p->data[lb.token].id) + ": " + std::to_string(cur_p->data[lb.token].logit) + " -> ";
+            // } else logits_positive += std::to_string(cur_p->data[lb.token].id) + ": " + std::to_string(cur_p->data[lb.token].logit) + " -> ";
+            cur_p->data[lb.token].logit += lb.bias;
+            // if (lb.bias < 0) {
+            //     logits_orig += std::to_string(cur_p->data[lb.token].logit) + ";\n";
+            // } else logits_positive += std::to_string(cur_p->data[lb.token].logit) + ";\n";
+        } else {
+            ctx->to_search.push_back(lb);
+        }
+    }
+
+    if (ctx->to_search.empty()) {
+        // if (test_dumbed_logits_biased == false) {
+        //     logits_orig += logits_positive + "\nNO SEARCH\n";
+        //     writeToFile("logit_biasing.txt", logits_orig);
+        //     test_dumbed_logits_biased = true;
+        // }
+        return;
+    }
+
+    // search for the remaining candidates that were not found in the previous step
+    // logits_orig += "\nSEARCH:\n";
+    for (size_t i = 0; i < cur_p->size; ++i) {
+        for (const auto & lb : ctx->to_search) {
+            if (cur_p->data[i].id == lb.token) {
+                // logits_orig += std::to_string(cur_p->data[i].logit) + "->";
+                cur_p->data[i].logit += lb.bias;
+                // logits_orig += std::to_string(cur_p->data[i].logit) + ";\n";
+                break;
+            }
+        }
+    }
+
+    // if (test_dumbed_logits_biased == false) {
+    //     logits_orig += logits_positive;
+    //     writeToFile("logit_biasing.txt", logits_orig);
+    //     test_dumbed_logits_biased = true;
+    // }
+}
+
+static struct llama_sampler * llama_sampler_logit_bias_start_addon_clone(const struct llama_sampler * smpl) {
+    const auto * ctx = (const llama_sampler_logit_bias_start_addon *) smpl->ctx;
+    return llama_sampler_init_logit_bias_start_addon(ctx->n_vocab, ctx->logit_bias.size(), ctx->logit_bias.data());
+}
+
+static void llama_sampler_logit_bias_start_addon_free(struct llama_sampler * smpl) {
+    delete (llama_sampler_logit_bias_start_addon *) smpl->ctx;
+}
+
+static struct llama_sampler_i llama_sampler_logit_bias_start_addon_i = {
+    /* .name   = */ llama_sampler_logit_bias_start_addon_name,
+    /* .accept = */ nullptr,
+    /* .apply  = */ llama_sampler_logit_bias_start_addon_apply,
+    /* .reset  = */ nullptr,
+    /* .clone  = */ llama_sampler_logit_bias_start_addon_clone,
+    /* .free   = */ llama_sampler_logit_bias_start_addon_free,
+};
+
+struct llama_sampler * llama_sampler_init_logit_bias_start_addon(
+        int32_t n_vocab,
+        int32_t n_logit_bias,
+        const llama_logit_bias * logit_bias) {
+    return llama_sampler_init(
+        /* .iface = */ &llama_sampler_logit_bias_start_addon_i,
+        /* .ctx   = */ new llama_sampler_logit_bias_start_addon {
+            /* .n_vocab    = */ n_vocab,
+            /* .logit_bias = */ std::vector<llama_logit_bias>(logit_bias, logit_bias + n_logit_bias),
+            /* .to_search  = */ {},
+        }
+    );
+}

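The new sampler mirrors `llama_sampler_init_logit_bias_addon` but keeps a reusable `to_search` scratch vector: biases whose tokens are still at their vocabulary index (idx == id) are applied in O(1), and only the rest fall back to a linear scan over the candidates. Note the `name` callback still returns "logit-bias", so the two samplers are indistinguishable in a chain printout. A hypothetical usage sketch, assuming a standard llama.cpp sampler chain (the chain calls, token id, and bias value are illustrative, not from this commit):

// Hypothetical setup: bias a "start" token's logit at sampling time.
// The token id (42) and bias (+5.0f) are invented for illustration.
std::vector<llama_logit_bias> start_bias = {
    { /* .token = */ 42, /* .bias = */ +5.0f },
};

llama_sampler * smpl = llama_sampler_init_logit_bias_start_addon(
    llama_vocab_n_tokens(vocab),        // assumed vocab handle from the loaded model
    (int32_t) start_bias.size(),
    start_bias.data());

// attach to an existing chain (assumed llama.cpp chain API)
llama_sampler_chain_add(chain, smpl);   // chain from llama_sampler_chain_init()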
base_sampling2/llama-addon.h

Lines changed: 5 additions & 0 deletions
@@ -154,4 +154,9 @@ void llama_set_time_impl(struct llama_sampling * smpl, const int64_t t_start_sam
 LLAMA_API struct llama_sampler * llama_sampler_init_logit_bias_addon(
         int32_t n_vocab,
         int32_t n_logit_bias,
+        const llama_logit_bias * logit_bias);
+
+LLAMA_API struct llama_sampler * llama_sampler_init_logit_bias_start_addon(
+        int32_t n_vocab,
+        int32_t n_logit_bias,
         const llama_logit_bias * logit_bias);

base_sampling2/master/ggml/src/ggml-backend-reg.cpp

Lines changed: 5 additions & 0 deletions
@@ -69,6 +69,9 @@
 #if defined(__clang__)
 # pragma clang diagnostic push
 # pragma clang diagnostic ignored "-Wdeprecated-declarations"
+#elif defined(__GNUC__)
+# pragma GCC diagnostic push
+# pragma GCC diagnostic ignored "-Wdeprecated-declarations"
 #endif

 namespace fs = std::filesystem;

@@ -91,6 +94,8 @@ static std::string path_str(const fs::path & path) {

 #if defined(__clang__)
 # pragma clang diagnostic pop
+#elif defined(__GNUC__)
+# pragma GCC diagnostic pop
 #endif

 #ifdef _WIN32

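The same push/ignore/pop pattern now covers GCC as well as Clang. The branch order matters: Clang also defines `__GNUC__`, so the `__clang__` check must come first or Clang would take the GCC branch (harmless in practice, since Clang accepts `#pragma GCC diagnostic`, but the explicit split keeps the intent clear). A self-contained sketch of the pattern; the deprecated function is invented for illustration:

// Standalone illustration of the suppression pattern; old_api() is invented.
#include <cstdio>

[[deprecated("use new_api() instead")]]
static void old_api() { std::puts("old"); }

#if defined(__clang__)
# pragma clang diagnostic push
# pragma clang diagnostic ignored "-Wdeprecated-declarations"
#elif defined(__GNUC__)   // must come after __clang__: Clang defines __GNUC__ too
# pragma GCC diagnostic push
# pragma GCC diagnostic ignored "-Wdeprecated-declarations"
#endif

int main() {
    old_api(); // no -Wdeprecated-declarations warning on Clang or GCC
    return 0;
}

#if defined(__clang__)
# pragma clang diagnostic pop
#elif defined(__GNUC__)
# pragma GCC diagnostic pop
#endif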