@@ -279,60 +279,7 @@ void llm_graph_input_cross_embd::set_input(const llama_ubatch * ubatch) {
 void llm_graph_input_attn_no_cache::set_input(const llama_ubatch * ubatch) {
     if (kq_mask) {
         // Check if we're using sliding window attention
-        if (n_swa > 0) {
-            const int64_t n_tokens     = ubatch->n_tokens;
-            const int64_t n_seq_tokens = ubatch->n_seq_tokens;
-            const int64_t n_seqs       = ubatch->n_seqs;
-            const int64_t n_stride     = ubatch->n_tokens;
-            const int64_t half_n_swa   = n_swa / 2;
-
-            GGML_ASSERT(ggml_backend_buffer_is_host(kq_mask->buffer));
-            float * data = (float *) kq_mask->data;
-
-            // Implement symmetric sliding window attention
-            // token i attends to tokens [i - n_swa/2, i + n_swa/2]
-            for (int h = 0; h < 1; ++h) {
-                for (int s1 = 0; s1 < n_seqs; ++s1) {
-                    const llama_seq_id seq_id = ubatch->seq_id[s1][0];
-
-                    for (int j = 0; j < n_seq_tokens; ++j) {
-                        const int32_t tj = s1*n_seq_tokens + j;
-                        const int64_t pos_j = ubatch->pos[tj];
-
-                        for (int s0 = 0; s0 < n_seqs; ++s0) {
-                            for (int i = 0; i < n_seq_tokens; ++i) {
-                                const int32_t ti = s0*n_seq_tokens + i;
-                                float f = -INFINITY;
-
-                                for (int s = 0; s < ubatch->n_seq_id[s0]; ++s) {
-                                    if (ubatch->seq_id[s0][s] == seq_id) {
-                                        const int64_t pos_i = ubatch->pos[ti];
-                                        const int64_t pos_diff = pos_j - pos_i;
-
-                                        // Apply sliding window constraint
-                                        // [i - n_swa/2, i + n_swa/2]
-                                        if (pos_diff >= -half_n_swa && pos_diff <= half_n_swa) {
-                                            if (hparams.use_alibi) {
-                                                f = -std::abs(pos_diff);
-                                            } else {
-                                                f = 0.0f;
-                                            }
-                                        }
-                                        break;
-                                    }
-                                }
-
-                                data[h*(n_tokens*n_tokens) + tj*n_stride + ti] = f;
-                            }
-                        }
-
-                        for (int i = n_tokens; i < n_stride; ++i) {
-                            data[h*(n_tokens*n_tokens) + tj*n_stride + i] = -INFINITY;
-                        }
-                    }
-                }
-            }
-        } else if (cparams.causal_attn) {
+        if (cparams.causal_attn) {
             const int64_t n_kv         = ubatch->n_tokens;
             const int64_t n_tokens     = ubatch->n_tokens;
             const int64_t n_seq_tokens = ubatch->n_seq_tokens;
@@ -375,6 +322,7 @@ void llm_graph_input_attn_no_cache::set_input(const llama_ubatch * ubatch) {
             const int64_t n_seq_tokens = ubatch->n_seq_tokens;
             const int64_t n_seqs       = ubatch->n_seqs;
             const int64_t n_stride     = ubatch->n_tokens;
+            const int64_t half_n_swa   = hparams.n_swa / 2;
 
             GGML_ASSERT(ggml_backend_buffer_is_host(kq_mask->buffer));
 
@@ -386,6 +334,7 @@ void llm_graph_input_attn_no_cache::set_input(const llama_ubatch * ubatch) {
 
                     for (int j = 0; j < n_seq_tokens; ++j) {
                         const int32_t tj = s1*n_seq_tokens + j;
+                        const int64_t pos_j = ubatch->pos[tj];
 
                         for (int s0 = 0; s0 < n_seqs; ++s0) {
                             for (int i = 0; i < n_seq_tokens; ++i) {
@@ -394,7 +343,11 @@ void llm_graph_input_attn_no_cache::set_input(const llama_ubatch * ubatch) {
 
                                 for (int s = 0; s < ubatch->n_seq_id[s0]; ++s) {
                                     if (ubatch->seq_id[s0][s] == seq_id) {
-                                        if (hparams.use_alibi) {
+                                        const int64_t pos_i = ubatch->pos[ti];
+                                        const int64_t pos_diff = pos_j - pos_i;
+
+                                        if (hparams.use_alibi &&
+                                            (pos_diff >= -half_n_swa && pos_diff <= half_n_swa)) {
                                             f = -std::abs(ubatch->pos[ti] - ubatch->pos[tj]);
                                         } else {
                                             f = 0.0f;
@@ -1242,22 +1195,6 @@ llm_graph_input_attn_no_cache * llm_graph_context::build_attn_inp_no_cache() con
     return (llm_graph_input_attn_no_cache *) res->add_input(std::move(inp));
 }
 
-llm_graph_input_attn_no_cache * llm_graph_context::build_attn_inp_no_cache_iswa() const {
-    // Use the sliding window size from hyperparameters
-    // If hparams.n_swa is 0, use a default value (128)
-    const int n_swa = hparams.n_swa > 0 ? hparams.n_swa : 128;
-
-    auto inp = std::make_unique<llm_graph_input_attn_no_cache>(hparams, cparams, n_swa);
-
-    // note: there is no KV cache, so the number of KV values is equal to the number of tokens in the batch
-    inp->kq_mask = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_tokens, GGML_PAD(n_tokens, GGML_KQ_MASK_PAD));
-    ggml_set_input(inp->kq_mask);
-
-    inp->kq_mask_cnv = cparams.flash_attn ? ggml_cast(ctx0, inp->kq_mask, GGML_TYPE_F16) : inp->kq_mask;
-
-    return (llm_graph_input_attn_no_cache *) res->add_input(std::move(inp));
-}
-
 ggml_tensor * llm_graph_context::build_attn(
         llm_graph_input_attn_no_cache * inp,
         ggml_cgraph * gf,
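
For context, both the removed branch and the added window check implement the rule spelled out in the deleted comment ("token i attends to tokens [i - n_swa/2, i + n_swa/2]"). Below is a minimal standalone sketch of that symmetric sliding-window masking for a single sequence; the function name, the flat row-major `std::vector` mask, and the single-sequence setting are illustrative assumptions, not llama.cpp API.

```cpp
// Illustrative sketch only (not part of this patch or of llama.cpp):
// symmetric sliding-window mask for one sequence. Token j may attend to
// token i only when |pos[j] - pos[i]| <= n_swa/2; every other entry stays
// masked at -INFINITY.
#include <cmath>
#include <cstdint>
#include <vector>

static std::vector<float> build_symmetric_swa_mask(
        const std::vector<int64_t> & pos, int64_t n_swa, bool use_alibi) {
    const size_t  n_tokens   = pos.size();
    const int64_t half_n_swa = n_swa / 2;

    // row-major [n_tokens x n_tokens] mask, fully masked by default
    std::vector<float> mask(n_tokens*n_tokens, -INFINITY);

    for (size_t j = 0; j < n_tokens; ++j) {
        for (size_t i = 0; i < n_tokens; ++i) {
            const int64_t pos_diff = pos[j] - pos[i];
            if (pos_diff >= -half_n_swa && pos_diff <= half_n_swa) {
                // inside the window: unmasked (0.0f), or an ALiBi-style distance bias
                mask[j*n_tokens + i] = use_alibi ? -std::abs((float) pos_diff) : 0.0f;
            }
        }
    }

    return mask;
}
```

With `n_swa = 4` and consecutive positions, each row of the returned mask leaves only the five entries within ±2 of the diagonal unmasked, which is the symmetric window the patch folds into the existing causal/non-causal paths.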