From e92d43245dac469fc0e2e3c81164f36d1358b49d Mon Sep 17 00:00:00 2001 From: Seunghyuk Park Date: Fri, 25 Apr 2025 00:47:30 +0300 Subject: [PATCH 01/31] Fix gemma3 workload execution failure --- vllm/model_executor/models/gemma3_mm.py | 4 ++++ vllm/worker/hpu_model_runner.py | 15 +++++++++++++-- 2 files changed, 17 insertions(+), 2 deletions(-) diff --git a/vllm/model_executor/models/gemma3_mm.py b/vllm/model_executor/models/gemma3_mm.py index 65c177f8c5a..8afad9ac819 100644 --- a/vllm/model_executor/models/gemma3_mm.py +++ b/vllm/model_executor/models/gemma3_mm.py @@ -639,6 +639,10 @@ def prepare_attn_masks( **kwargs, ): kwargs["has_images"] = True + + input_ids = input_ids.flatten() + positions = positions.flatten() + # NOTE(woosuk): Here, we distinguish the sequences by the position id 0. # This is a HACK. Fix this. start_idices = (positions == 0).cpu().nonzero() diff --git a/vllm/worker/hpu_model_runner.py b/vllm/worker/hpu_model_runner.py index 2dccaf85db8..ca5e4b3e28a 100755 --- a/vllm/worker/hpu_model_runner.py +++ b/vllm/worker/hpu_model_runner.py @@ -1117,6 +1117,17 @@ def move_to_device(self, tensor): return tensor if tensor is None else tensor.to(self.device, non_blocking=True) + def _get_position_pad(self) -> int: + """ + For gemma3 models, + due to the Hack in Gemma3ForConditionalGeneration::prepare_attn_masks, + '0' can't be used as pad for input position tensor. + In case, it might have '0's for bucketing, those '0' will be counted as + new sequence in the prepare_attn_masks() which is wrong. + """ + model_type = getattr(self.model_config.hf_config, 'model_type', '') + return -1 if model_type == 'gemma3' else 0 + def _prepare_prompt( self, seq_group_metadata_list: List[SequenceGroupMetadata], @@ -1326,11 +1337,11 @@ def _prepare_prompt( make_mrope_positions_tensor_with_pad(input_positions=input_positions, input_mrope_positions=input_mrope_positions, max_prompt_len=max_prompt_len, - pad=0) + pad=self._get_position_pad()) else: input_positions = make_cpu_tensor(input_positions, max_len=max_prompt_len, - pad=0, + pad=self._get_position_pad(), dtype=torch.long, flat=self.use_merged_prefill) From bd751099799843613ca42892a780323b0ffd0584 Mon Sep 17 00:00:00 2001 From: Sayantan Sarkar Date: Wed, 21 May 2025 16:27:23 -0700 Subject: [PATCH 02/31] add run scripts --- gemma3.py | 84 ++++++++++++++++ gemma3.sh | 13 +++ vllm/model_executor/models/gemma3.py | 7 +- vllm/model_executor/models/gemma3_mm.py | 124 ++++++++++++++++++++++++ vllm/model_executor/models/siglip.py | 6 +- 5 files changed, 231 insertions(+), 3 deletions(-) create mode 100644 gemma3.py create mode 100755 gemma3.sh diff --git a/gemma3.py b/gemma3.py new file mode 100644 index 00000000000..64b55188a89 --- /dev/null +++ b/gemma3.py @@ -0,0 +1,84 @@ +import os +from argparse import Namespace +from dataclasses import asdict +from typing import NamedTuple, Optional +#from huggingface_hub import snapshot_download +from PIL import Image +from transformers import AutoProcessor, AutoTokenizer +from vllm import LLM, EngineArgs, SamplingParams +from vllm.lora.request import LoRARequest +from vllm.multimodal.utils import fetch_image +from vllm.utils import FlexibleArgumentParser +import sys + +num_imgs = int(sys.argv[1]) + +class ModelRequestData(NamedTuple): + engine_args: EngineArgs + prompt: str + image_data: list[Image] + stop_token_ids: Optional[list[int]] = None + chat_template: Optional[str] = None + lora_requests: Optional[list[LoRARequest]] = None +image_urls = [] +if num_imgs == 1: + image_urls = 
[Image.open('jr.png').convert("RGB")] +elif num_imgs == 2: + image_urls = [Image.open('jr.png').convert("RGB") for _ in range(2)] +#question = "What is the name of the person in the form?" +question = XXXXX +model_name = "google/gemma-3-27b-it" +#model_name = "Qwen/Qwen2.5-VL-3B-Instruct" +engine_args = EngineArgs( + model=model_name, + max_model_len=8192, + max_num_seqs=2, + tensor_parallel_size=1, + gpu_memory_utilization=0.9, + enforce_eager=True, + limit_mm_per_prompt={"image": len(image_urls)}, +) +placeholders = [{"type": "image", "image": url} for url in image_urls] +messages = [{ + "role": + "user", + "content": [ + *placeholders, + { + "type": "text", + "text": question + }, + ], +}] +processor = AutoProcessor.from_pretrained(model_name) +prompt = processor.apply_chat_template(messages, + tokenize=False, + add_generation_prompt=True) +req_data = ModelRequestData( + engine_args=engine_args, + prompt=prompt, + image_data=[url for url in image_urls], +) +engine_args = asdict(req_data.engine_args) +llm = LLM(**engine_args) +sampling_params = SamplingParams(temperature=0.0, + max_tokens=1024, + stop_token_ids=req_data.stop_token_ids) +num_prompts = 1 +for i in range(num_prompts): + outputs = llm.generate( + { + "prompt": req_data.prompt, + "multi_modal_data": { + "image": req_data.image_data + }, + }, + sampling_params=sampling_params, +# lora_request=req_data.lora_requests, + ) + print("-" * 50) + for o in outputs: + generated_text = o.outputs[0].text + print(generated_text) + print("-" * 50) + diff --git a/gemma3.sh b/gemma3.sh new file mode 100755 index 00000000000..33bc272a719 --- /dev/null +++ b/gemma3.sh @@ -0,0 +1,13 @@ +export VLLM_SKIP_WARMUP=true +export LLM_MODEL_ID=google/gemma-3-27b-it +export HF_TOKEN="xxx" +export PT_HPUGRAPH_DISABLE_TENSOR_CACHE=false +export DATA_PATH=~/data +export MAX_TOTAL_TOKENS=500 +export VLLM_USE_V1=0 +export PT_HPU_LAZY_MODE=1 +export VLLM_FP32_SOFTMAX=1 +#python vllm-gemma3-offline.py + + +python gemma3.py 0 diff --git a/vllm/model_executor/models/gemma3.py b/vllm/model_executor/models/gemma3.py index 4e0d4f84ca6..05df1521463 100644 --- a/vllm/model_executor/models/gemma3.py +++ b/vllm/model_executor/models/gemma3.py @@ -397,7 +397,12 @@ def forward( assert intermediate_tensors is not None hidden_states = intermediate_tensors["hidden_states"] residual = intermediate_tensors["residual"] - for layer in self.layers[self.start_layer:self.end_layer]: + for idx, layer in enumerate(self.layers[self.start_layer:self.end_layer]): + #print(f'text: {idx}, {hidden_states.sum()} {residual}') + #if idx == 0 and hidden_states.shape[-1] < 1300: # 1280 vs 1244 + # # when compared with hidden_states[...,:1244,:].. 
matches GPU exactly + # breakpoint() + # print() hidden_states, residual = layer( positions, hidden_states, diff --git a/vllm/model_executor/models/gemma3_mm.py b/vllm/model_executor/models/gemma3_mm.py index 8afad9ac819..c7307a2b501 100644 --- a/vllm/model_executor/models/gemma3_mm.py +++ b/vllm/model_executor/models/gemma3_mm.py @@ -566,6 +566,126 @@ def _process_image_input( self.vision_tower, pixel_values, ) + #breakpoint() + ''' + pixel_values: 1x3x896x896 for HPU, 2x3x896x896 for GPU, ...its a warmup thing + In second step, both HPU and GPU have same values + + however image_features is diff in hpu vs gpu (shape [1, 4096, 1152])) + + printing values b4 each layer in siglip: + GPU + 0 -> 4718592.0 +1 -> 5275648.0 +2 -> 5505024.0 +3 -> 5505024.0 +4 -> 5537792.0 +5 -> 5537792.0 +6 -> 5570560.0 +7 -> 5603328.0 +8 -> 5603328.0 +9 -> 5603328.0 +10 -> 5570560.0 +11 -> 5505024.0 +12 -> 5439488.0 +13 -> 5373952.0 +14 -> 5275648.0 +15 -> 5210112.0 +16 -> 5111808.0 +17 -> 5046272.0 +18 -> 5013504.0 +19 -> 4980736.0 +20 -> 4980736.0 +21 -> 5013504.0 +22 -> 5046272.0 +23 -> 5013504.0 +24 -> 5013504.0 +25 -> 5013504.0 +26 -> 5079040.0 + +HPU: +0 -> 4718592.0 +1 -> 5275648.0 +2 -> 5505024.0 +3 -> 5505024.0 +4 -> 5537792.0 +5 -> 5537792.0 +6 -> 5570560.0 +7 -> 5603328.0 +8 -> 5603328.0 +9 -> 5603328.0 +10 -> 5570560.0 +11 -> 5505024.0 +12 -> 5439488.0 +13 -> 5373952.0 +14 -> 5308416.0 +15 -> 5210112.0 +16 -> 5111808.0 +17 -> 5046272.0 +18 -> 5013504.0 +19 -> 5013504.0 +20 -> 5013504.0 +21 -> 5013504.0 +22 -> 5046272.0 +23 -> 5046272.0 +24 -> 5013504.0 +25 -> 5013504.0 +26 -> 5079040.0 + +however final sum is diff: + image_features.sum() +tensor(18432., device='hpu:0', dtype=torch.bfloat16) + image_features.sum() +tensor(18304., device='cuda:0', dtype=torch.bfloat16) + + +GPU: +25 -> 5013504.0 ... tensor([[ 2.8125, 0.7070, 3.8594, -5.2188], + [-0.9336, 0.4375, 1.2812, -0.1250], + [-1.1250, 0.5859, 1.7422, 0.9141], + [ 3.2188, 1.4609, 1.6250, 0.9844]], device='cuda:0', + dtype=torch.bfloat16) +26 -> 5079040.0 ... tensor([[ 3.8125, 1.6406, 5.1250, -5.7500], + [-2.2812, 0.1641, 0.6094, -0.8711], + [-3.0000, 0.6172, 1.2031, 0.1328], + [ 3.3594, 1.3281, 2.3281, 0.6172]], device='cuda:0', + dtype=torch.bfloat16) +BEFORE resolve_visual_encoder_outputs tensor(7274496., device='cuda:0', dtype=torch.bfloat16) tensor([[ -44.2500, -39.0000, -17.7500, -124.5000], + [ -2.1250, 0.4512, -0.7734, -2.6406], + [ -2.2812, 1.5781, 0.1875, 0.5000], + [ -49.2500, -40.5000, -20.1250, -127.0000]], device='cuda:0', + dtype=torch.bfloat16) +AFTER resolve_visual_encoder_outputs tensor(18304., device='cuda:0', dtype=torch.bfloat16) tensor([[-0.0119, 0.0135, 0.0248, 0.0549], + [-1.1094, -0.0742, -0.7266, -0.6953], + [-3.0000, 0.3496, -0.9766, 0.1963], + [-0.0295, 0.0193, 0.0148, 0.1006]], device='cuda:0', + + +HPU: +25 -> 5013504.0... tensor([[ 2.9531, 1.2188, 4.1250, -5.0625], + [-1.0625, 0.4258, 1.3281, -0.1250], + [-1.1562, 0.5703, 1.6484, 0.8438], + [ 3.2188, 1.4375, 1.7188, 0.8906]], device='hpu:0', + dtype=torch.bfloat16) +26 -> 5079040.0... 
tensor([[ 3.5938, 2.1406, 4.7812, -5.3125], + [-2.5000, 0.1211, 0.6367, -0.7578], + [-3.0000, 0.5781, 1.0938, 0.0938], + [ 3.4844, 1.3125, 2.4844, 0.4922]], device='hpu:0', + dtype=torch.bfloat16) +BEFORE resolve_visual_encoder_outputs tensor(7241728., device='hpu:0', dtype=torch.bfloat16) tensor([[-2.2500e+01, -2.4000e+01, -1.0562e+01, -7.8500e+01], + [-2.0938e+00, 5.3125e-01, -5.8984e-01, -2.0000e+00], + [-2.3125e+00, 1.5000e+00, 5.4688e-02, 4.1406e-01], + [-4.9250e+01, -4.0500e+01, -2.0125e+01, -1.2700e+02]], device='hpu:0', + dtype=torch.bfloat16) +AFTER resolve_visual_encoder_outputs tensor(18432., device='hpu:0', dtype=torch.bfloat16) tensor([[ 3.4180e-02, -1.2207e-03, 1.7944e-02, -1.6602e-02], + [-1.4062e+00, -1.1914e-01, -8.3203e-01, -7.4219e-01], + [-3.0156e+00, 2.8906e-01, -1.0859e+00, 1.1572e-01], + [-2.6978e-02, 2.1118e-02, 1.5747e-02, 1.0791e-01]], device='hpu:0', + dtype=torch.bfloat16) + +HPU vs GPU is about the same.... kind of sort of + + ''' image_embeds = self.multi_modal_projector(image_features) return [ @@ -604,12 +724,14 @@ def forward(self, intermediate_tensors: Optional[IntermediateTensors] = None, inputs_embeds: Optional[torch.Tensor] = None, **kwargs: object) -> IntermediateTensors: + #breakpoint() if intermediate_tensors is not None: inputs_embeds = None # NOTE: In v1, inputs_embeds is always generated at model runner, this # condition is for v0 compatibility. elif inputs_embeds is None: + #breakpoint() vision_embeddings = self.get_multimodal_embeddings(**kwargs) inputs_embeds = self.get_input_embeddings(input_ids, @@ -623,6 +745,8 @@ def forward(self, ) input_ids = None + + #breakpoint() hidden_states = self.language_model.model(input_ids, positions, intermediate_tensors, diff --git a/vllm/model_executor/models/siglip.py b/vllm/model_executor/models/siglip.py index 75fcf540b0b..2eba5065f39 100644 --- a/vllm/model_executor/models/siglip.py +++ b/vllm/model_executor/models/siglip.py @@ -313,7 +313,8 @@ def forward( hidden_states_pool = [inputs_embeds] hidden_states = inputs_embeds - for encoder_layer in self.layers: + for idx, encoder_layer in enumerate(self.layers): + #print(f'{idx} -> {hidden_states.sum()}... {hidden_states[0,:4,:4]}') hidden_states, _ = encoder_layer(hidden_states) if return_all_hidden_states: hidden_states_pool.append(hidden_states) @@ -431,10 +432,11 @@ def forward( ) # Handle post-norm (if applicable) and stacks feature layers if needed + #print(f'BEFORE resolve_visual_encoder_outputs', encoder_outputs.sum(), encoder_outputs[0, :4, :4]) encoder_outputs = resolve_visual_encoder_outputs( encoder_outputs, feature_sample_layers, self.post_layernorm, self.config.num_hidden_layers) - + #print(f'AFTER resolve_visual_encoder_outputs', encoder_outputs.sum(), encoder_outputs[0, :4, :4]) # TODO: add this back when pooled_output is used in inference. 
# if self.use_head: # pooled_output = self.head(encoder_outputs) From 90a3ae188401c19b0454d06abaf071bc7b018bf4 Mon Sep 17 00:00:00 2001 From: Sayantan Sarkar Date: Fri, 23 May 2025 09:38:41 -0700 Subject: [PATCH 03/31] minor --- gemma3.py | 25 +- gemma3.sh | 20 +- prompts.txt | 784 ++++++++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 822 insertions(+), 7 deletions(-) create mode 100644 prompts.txt diff --git a/gemma3.py b/gemma3.py index 64b55188a89..8da266b536d 100644 --- a/gemma3.py +++ b/gemma3.py @@ -10,6 +10,7 @@ from vllm.multimodal.utils import fetch_image from vllm.utils import FlexibleArgumentParser import sys +import torch num_imgs = int(sys.argv[1]) @@ -26,8 +27,15 @@ class ModelRequestData(NamedTuple): elif num_imgs == 2: image_urls = [Image.open('jr.png').convert("RGB") for _ in range(2)] #question = "What is the name of the person in the form?" -question = XXXXX -model_name = "google/gemma-3-27b-it" +if num_imgs > 0: + question = xxx +else: + question = "You are an AI designed to generate extremely long, detailed worldbuilding content. Your goal is to write a fictional encyclopedia with at least 4000 words of content. Do not stop early. Start by describing a fictional planet in detail. Include: \n1. Geography and climate zones (with rich, varied description).\n2. The history of all civilizations, from ancient to modern times.\n3. Cultures, belief systems, and mythologies along with rich detail about where such beliefs came from.\n4. Political structures and conflicts along with their history.\n5. Technology and magic systems (if any) spanning the last 1000 years, highlighting significant discoveries and figures.\n6. Major historical events and characters along with their geneology.\n\n Be descriptive, verbose, and never summarize. Write in a factual tone like an academic encyclopedia. 
Begin your entry below:" + question00 = "Generate a list by repeating this 10000 times: hello, world, cat, dog" + question2 = "Generate all numbers from 1 to 6000, separate them with commas" + +model_name = "google/gemma-3-4b-it" +#model_name = 'google/gemma-2-2b-it' #model_name = "Qwen/Qwen2.5-VL-3B-Instruct" engine_args = EngineArgs( model=model_name, @@ -36,14 +44,15 @@ class ModelRequestData(NamedTuple): tensor_parallel_size=1, gpu_memory_utilization=0.9, enforce_eager=True, - limit_mm_per_prompt={"image": len(image_urls)}, + limit_mm_per_prompt={"image": len(image_urls)}, # remove for gemma2 + #dtype="float32" ## remove ) placeholders = [{"type": "image", "image": url} for url in image_urls] messages = [{ "role": "user", "content": [ - *placeholders, + *placeholders, # remove for gemma2 { "type": "text", "text": question @@ -59,17 +68,20 @@ class ModelRequestData(NamedTuple): prompt=prompt, image_data=[url for url in image_urls], ) +#breakpoint() engine_args = asdict(req_data.engine_args) llm = LLM(**engine_args) +#breakpoint() sampling_params = SamplingParams(temperature=0.0, - max_tokens=1024, + max_tokens=8192, stop_token_ids=req_data.stop_token_ids) num_prompts = 1 for i in range(num_prompts): + #breakpoint() outputs = llm.generate( { "prompt": req_data.prompt, - "multi_modal_data": { + "multi_modal_data": { # remove for gemma2 "image": req_data.image_data }, }, @@ -80,5 +92,6 @@ class ModelRequestData(NamedTuple): for o in outputs: generated_text = o.outputs[0].text print(generated_text) + breakpoint() print("-" * 50) diff --git a/gemma3.sh b/gemma3.sh index 33bc272a719..29ba2329f95 100755 --- a/gemma3.sh +++ b/gemma3.sh @@ -1,13 +1,31 @@ export VLLM_SKIP_WARMUP=true export LLM_MODEL_ID=google/gemma-3-27b-it -export HF_TOKEN="xxx" +export HF_TOKEN=xxx export PT_HPUGRAPH_DISABLE_TENSOR_CACHE=false export DATA_PATH=~/data export MAX_TOTAL_TOKENS=500 export VLLM_USE_V1=0 export PT_HPU_LAZY_MODE=1 export VLLM_FP32_SOFTMAX=1 +export VLLM_PROMPT_USE_FUSEDSDPA=False #python vllm-gemma3-offline.py +#export VLLM_PROMPT_USE_FUSEDSDPA=False + + +#export VLLM_PROMPT_BS_BUCKET_MIN=1 +#export VLLM_PROMPT_BS_BUCKET_STEP=1 +#export VLLM_PROMPT_BS_BUCKET_MAX=1 +#export VLLM_DECODE_BS_BUCKET_MIN=1 +#export VLLM_DECODE_BS_BUCKET_STEP=1 +#export VLLM_DECODE_BS_BUCKET_MAX=1 + +#export VLLM_PROMPT_SEQ_BUCKET_MIN=128 +#export VLLM_PROMPT_SEQ_BUCKET_STEP=512 +#export VLLM_PROMPT_SEQ_BUCKET_MAX=2048 +#export VLLM_DECODE_BLOCK_BUCKET_MIN=128 +#export VLLM_DECODE_BLOCK_BUCKET_STEP=128 +#export VLLM_DECODE_BLOCK_BUCKET_MAX=2048 + python gemma3.py 0 diff --git a/prompts.txt b/prompts.txt new file mode 100644 index 00000000000..927c620ad8e --- /dev/null +++ b/prompts.txt @@ -0,0 +1,784 @@ +question = "You are an AI designed to generate extremely long, detailed worldbuilding content. Your goal is to write a fictional encyclopedia with at least 4000 words of content. Do not stop early. Start by describing a fictional planet in detail. Include: \n1. Geography and climate zones (with rich, varied description).\n2. The history of all civilizations, from ancient to modern times.\n3. Cultures, belief systems, and mythologies along with rich detail about where such beliefs came from.\n4. Political structures and conflicts along with their history.\n5. Technology and magic systems (if any) spanning the last 1000 years, highlighting significant discoveries and figures.\n6. Major historical events and characters along with their geneology.\n\n Be descriptive, verbose, and never summarize. 
Write in a factual tone like an academic encyclopedia. Begin your entry below:" + question00 = "Generate a list by repeating this 10000 times: hello, world, cat, dog" + question2 = "Generate all numbers from 1 to 6000, separate them with commas" + question3 = '''Continue this story for 5000 words, do not stop, keep generating a long rambling story: + +Call me Ishmael. Some years ago--never mind how long +precisely--having little or no money in my purse, and nothing +particular to interest me on shore, I thought I would sail about a +little and see the watery part of the world. It is a way I have of +driving off the spleen and regulating the circulation. Whenever I +find myself growing grim about the mouth; whenever it is a damp, +drizzly November in my soul; whenever I find myself involuntarily +pausing before coffin warehouses, and bringing up the rear of every +funeral I meet; and especially whenever my hypos get such an upper +hand of me, that it requires a strong moral principle to prevent me +from deliberately stepping into the street, and methodically knocking +people's hats off--then, I account it high time to get to sea as soon +as I can. This is my substitute for pistol and ball. With a +philosophical flourish Cato throws himself upon his sword; I quietly +take to the ship. There is nothing surprising in this. If they but +knew it, almost all men in their degree, some time or other, cherish +very nearly the same feelings towards the ocean with me. + +There now is your insular city of the Manhattoes, belted round by +wharves as Indian isles by coral reefs--commerce surrounds it with +her surf. Right and left, the streets take you waterward. Its +extreme downtown is the battery, where that noble mole is washed by +waves, and cooled by breezes, which a few hours previous were out of +sight of land. Look at the crowds of water-gazers there. + +Circumambulate the city of a dreamy Sabbath afternoon. Go from +Corlears Hook to Coenties Slip, and from thence, by Whitehall, +northward. What do you see?--Posted like silent sentinels all around +the town, stand thousands upon thousands of mortal men fixed in ocean +reveries. Some leaning against the spiles; some seated upon the +pier-heads; some looking over the bulwarks of ships from China; some +high aloft in the rigging, as if striving to get a still better +seaward peep. But these are all landsmen; of week days pent up in +lath and plaster--tied to counters, nailed to benches, clinched to +desks. How then is this? Are the green fields gone? What do they +here? + +But look! here come more crowds, pacing straight for the water, and +seemingly bound for a dive. Strange! Nothing will content them but +the extremest limit of the land; loitering under the shady lee of +yonder warehouses will not suffice. No. They must get just as nigh +the water as they possibly can without falling in. And there they +stand--miles of them--leagues. Inlanders all, they come from lanes +and alleys, streets and avenues--north, east, south, and west. Yet +here they all unite. Tell me, does the magnetic virtue of the +needles of the compasses of all those ships attract them thither? + +Once more. Say you are in the country; in some high land of lakes. +Take almost any path you please, and ten to one it carries you down +in a dale, and leaves you there by a pool in the stream. There is +magic in it. 
Let the most absent-minded of men be plunged in his +deepest reveries--stand that man on his legs, set his feet a-going, +and he will infallibly lead you to water, if water there be in all +that region. Should you ever be athirst in the great American +desert, try this experiment, if your caravan happen to be supplied +with a metaphysical professor. Yes, as every one knows, meditation +and water are wedded for ever. + +But here is an artist. He desires to paint you the dreamiest, +shadiest, quietest, most enchanting bit of romantic landscape in all +the valley of the Saco. What is the chief element he employs? There +stand his trees, each with a hollow trunk, as if a hermit and a +crucifix were within; and here sleeps his meadow, and there sleep his +cattle; and up from yonder cottage goes a sleepy smoke. Deep into +distant woodlands winds a mazy way, reaching to overlapping spurs of +mountains bathed in their hill-side blue. But though the picture +lies thus tranced, and though this pine-tree shakes down its sighs +like leaves upon this shepherd's head, yet all were vain, unless the +shepherd's eye were fixed upon the magic stream before him. Go visit +the Prairies in June, when for scores on scores of miles you wade +knee-deep among Tiger-lilies--what is the one charm +wanting?--Water--there is not a drop of water there! Were Niagara +but a cataract of sand, would you travel your thousand miles to see +it? Why did the poor poet of Tennessee, upon suddenly receiving two +handfuls of silver, deliberate whether to buy him a coat, which he +sadly needed, or invest his money in a pedestrian trip to Rockaway +Beach? Why is almost every robust healthy boy with a robust healthy +soul in him, at some time or other crazy to go to sea? Why upon your +first voyage as a passenger, did you yourself feel such a mystical +vibration, when first told that you and your ship were now out of +sight of land? Why did the old Persians hold the sea holy? Why did +the Greeks give it a separate deity, and own brother of Jove? Surely +all this is not without meaning. And still deeper the meaning of +that story of Narcissus, who because he could not grasp the +tormenting, mild image he saw in the fountain, plunged into it and +was drowned. But that same image, we ourselves see in all rivers and +oceans. It is the image of the ungraspable phantom of life; and this +is the key to it all. + +Now, when I say that I am in the habit of going to sea whenever I +begin to grow hazy about the eyes, and begin to be over conscious of +my lungs, I do not mean to have it inferred that I ever go to sea as +a passenger. For to go as a passenger you must needs have a purse, +and a purse is but a rag unless you have something in it. Besides, +passengers get sea-sick--grow quarrelsome--don't sleep of nights--do +not enjoy themselves much, as a general thing;--no, I never go as a +passenger; nor, though I am something of a salt, do I ever go to sea +as a Commodore, or a Captain, or a Cook. I abandon the glory and +distinction of such offices to those who like them. For my part, I +abominate all honourable respectable toils, trials, and tribulations +of every kind whatsoever. It is quite as much as I can do to take +care of myself, without taking care of ships, barques, brigs, +schooners, and what not. 
And as for going as cook,--though I confess +there is considerable glory in that, a cook being a sort of officer +on ship-board--yet, somehow, I never fancied broiling fowls;--though +once broiled, judiciously buttered, and judgmatically salted and +peppered, there is no one who will speak more respectfully, not to +say reverentially, of a broiled fowl than I will. It is out of the +idolatrous dotings of the old Egyptians upon broiled ibis and roasted +river horse, that you see the mummies of those creatures in their +huge bake-houses the pyramids. + +No, when I go to sea, I go as a simple sailor, right before the mast, +plumb down into the forecastle, aloft there to the royal mast-head. +True, they rather order me about some, and make me jump from spar to +spar, like a grasshopper in a May meadow. And at first, this sort of +thing is unpleasant enough. It touches one's sense of honour, +particularly if you come of an old established family in the land, +the Van Rensselaers, or Randolphs, or Hardicanutes. And more than +all, if just previous to putting your hand into the tar-pot, you have +been lording it as a country schoolmaster, making the tallest boys +stand in awe of you. The transition is a keen one, I assure you, +from a schoolmaster to a sailor, and requires a strong decoction of +Seneca and the Stoics to enable you to grin and bear it. But even +this wears off in time. + +What of it, if some old hunks of a sea-captain orders me to get a +broom and sweep down the decks? What does that indignity amount to, +weighed, I mean, in the scales of the New Testament? Do you think +the archangel Gabriel thinks anything the less of me, because I +promptly and respectfully obey that old hunks in that particular +instance? Who ain't a slave? Tell me that. Well, then, however the +old sea-captains may order me about--however they may thump and punch +me about, I have the satisfaction of knowing that it is all right; +that everybody else is one way or other served in much the same +way--either in a physical or metaphysical point of view, that is; and +so the universal thump is passed round, and all hands should rub each +other's shoulder-blades, and be content. + +Again, I always go to sea as a sailor, because they make a point of +paying me for my trouble, whereas they never pay passengers a single +penny that I ever heard of. On the contrary, passengers themselves +must pay. And there is all the difference in the world between +paying and being paid. The act of paying is perhaps the most +uncomfortable infliction that the two orchard thieves entailed upon +us. But BEING PAID,--what will compare with it? The urbane activity +with which a man receives money is really marvellous, considering +that we so earnestly believe money to be the root of all earthly +ills, and that on no account can a monied man enter heaven. Ah! how +cheerfully we consign ourselves to perdition! + +Finally, I always go to sea as a sailor, because of the wholesome +exercise and pure air of the fore-castle deck. For as in this world, +head winds are far more prevalent than winds from astern (that is, if +you never violate the Pythagorean maxim), so for the most part the +Commodore on the quarter-deck gets his atmosphere at second hand from +the sailors on the forecastle. He thinks he breathes it first; but +not so. In much the same way do the commonalty lead their leaders in +many other things, at the same time that the leaders little suspect +it. 
But wherefore it was that after having repeatedly smelt the sea +as a merchant sailor, I should now take it into my head to go on a +whaling voyage; this the invisible police officer of the Fates, who +has the constant surveillance of me, and secretly dogs me, and +influences me in some unaccountable way--he can better answer than +any one else. And, doubtless, my going on this whaling voyage, +formed part of the grand programme of Providence that was drawn up a +long time ago. It came in as a sort of brief interlude and solo +between more extensive performances. I take it that this part of the +bill must have run something like this: + + +"GRAND CONTESTED ELECTION FOR THE PRESIDENCY OF THE UNITED STATES. +"WHALING VOYAGE BY ONE ISHMAEL. +"BLOODY BATTLE IN AFFGHANISTAN." + + +Though I cannot tell why it was exactly that those stage managers, +the Fates, put me down for this shabby part of a whaling voyage, when +others were set down for magnificent parts in high tragedies, and +short and easy parts in genteel comedies, and jolly parts in +farces--though I cannot tell why this was exactly; yet, now that I +recall all the circumstances, I think I can see a little into the +springs and motives which being cunningly presented to me under +various disguises, induced me to set about performing the part I did, +besides cajoling me into the delusion that it was a choice resulting +from my own unbiased freewill and discriminating judgment. + +Chief among these motives was the overwhelming idea of the great +whale himself. Such a portentous and mysterious monster roused all +my curiosity. Then the wild and distant seas where he rolled his +island bulk; the undeliverable, nameless perils of the whale; these, +with all the attending marvels of a thousand Patagonian sights and +sounds, helped to sway me to my wish. With other men, perhaps, such +things would not have been inducements; but as for me, I am tormented +with an everlasting itch for things remote. I love to sail forbidden +seas, and land on barbarous coasts. Not ignoring what is good, I am +quick to perceive a horror, and could still be social with it--would +they let me--since it is but well to be on friendly terms with all +the inmates of the place one lodges in. + +By reason of these things, then, the whaling voyage was welcome; the +great flood-gates of the wonder-world swung open, and in the wild +conceits that swayed me to my purpose, two and two there floated into +my inmost soul, endless processions of the whale, and, mid most of +them all, one grand hooded phantom, like a snow hill in the air. + + +I stuffed a shirt or two into my old carpet-bag, tucked it under my +arm, and started for Cape Horn and the Pacific. Quitting the good +city of old Manhatto, I duly arrived in New Bedford. It was a +Saturday night in December. Much was I disappointed upon learning +that the little packet for Nantucket had already sailed, and that no +way of reaching that place would offer, till the following Monday. + +As most young candidates for the pains and penalties of whaling stop +at this same New Bedford, thence to embark on their voyage, it may as +well be related that I, for one, had no idea of so doing. For my +mind was made up to sail in no other than a Nantucket craft, because +there was a fine, boisterous something about everything connected +with that famous old island, which amazingly pleased me. 
Besides +though New Bedford has of late been gradually monopolising the +business of whaling, and though in this matter poor old Nantucket is +now much behind her, yet Nantucket was her great original--the Tyre +of this Carthage;--the place where the first dead American whale was +stranded. Where else but from Nantucket did those aboriginal +whalemen, the Red-Men, first sally out in canoes to give chase to the +Leviathan? And where but from Nantucket, too, did that first +adventurous little sloop put forth, partly laden with imported +cobblestones--so goes the story--to throw at the whales, in order to +discover when they were nigh enough to risk a harpoon from the +bowsprit? + +Now having a night, a day, and still another night following before +me in New Bedford, ere I could embark for my destined port, it +became a matter of concernment where I was to eat and sleep +meanwhile. It was a very dubious-looking, nay, a very dark and +dismal night, bitingly cold and cheerless. I knew no one in the +place. With anxious grapnels I had sounded my pocket, and only +brought up a few pieces of silver,--So, wherever you go, Ishmael, +said I to myself, as I stood in the middle of a dreary street +shouldering my bag, and comparing the gloom towards the north with +the darkness towards the south--wherever in your wisdom you may +conclude to lodge for the night, my dear Ishmael, be sure to inquire +the price, and don't be too particular. + +With halting steps I paced the streets, and passed the sign of "The +Crossed Harpoons"--but it looked too expensive and jolly there. +Further on, from the bright red windows of the "Sword-Fish Inn," +there came such fervent rays, that it seemed to have melted the +packed snow and ice from before the house, for everywhere else the +congealed frost lay ten inches thick in a hard, asphaltic +pavement,--rather weary for me, when I struck my foot against the +flinty projections, because from hard, remorseless service the soles +of my boots were in a most miserable plight. Too expensive and +jolly, again thought I, pausing one moment to watch the broad glare +in the street, and hear the sounds of the tinkling glasses within. +But go on, Ishmael, said I at last; don't you hear? get away from +before the door; your patched boots are stopping the way. So on I +went. I now by instinct followed the streets that took me waterward, +for there, doubtless, were the cheapest, if not the cheeriest inns. +''' + + question1 = '''Continue this story for 5000 words, do not stop, keep generating a long rambling story: + +Call me Ishmael. Some years ago--never mind how long +precisely--having little or no money in my purse, and nothing +particular to interest me on shore, I thought I would sail about a +little and see the watery part of the world. It is a way I have of +driving off the spleen and regulating the circulation. Whenever I +find myself growing grim about the mouth; whenever it is a damp, +drizzly November in my soul; whenever I find myself involuntarily +pausing before coffin warehouses, and bringing up the rear of every +funeral I meet; and especially whenever my hypos get such an upper +hand of me, that it requires a strong moral principle to prevent me +from deliberately stepping into the street, and methodically knocking +people's hats off--then, I account it high time to get to sea as soon +as I can. This is my substitute for pistol and ball. With a +philosophical flourish Cato throws himself upon his sword; I quietly +take to the ship. There is nothing surprising in this. 
If they but +knew it, almost all men in their degree, some time or other, cherish +very nearly the same feelings towards the ocean with me. + +There now is your insular city of the Manhattoes, belted round by +wharves as Indian isles by coral reefs--commerce surrounds it with +her surf. Right and left, the streets take you waterward. Its +extreme downtown is the battery, where that noble mole is washed by +waves, and cooled by breezes, which a few hours previous were out of +sight of land. Look at the crowds of water-gazers there. + +Circumambulate the city of a dreamy Sabbath afternoon. Go from +Corlears Hook to Coenties Slip, and from thence, by Whitehall, +northward. What do you see?--Posted like silent sentinels all around +the town, stand thousands upon thousands of mortal men fixed in ocean +reveries. Some leaning against the spiles; some seated upon the +pier-heads; some looking over the bulwarks of ships from China; some +high aloft in the rigging, as if striving to get a still better +seaward peep. But these are all landsmen; of week days pent up in +lath and plaster--tied to counters, nailed to benches, clinched to +desks. How then is this? Are the green fields gone? What do they +here? + +But look! here come more crowds, pacing straight for the water, and +seemingly bound for a dive. Strange! Nothing will content them but +the extremest limit of the land; loitering under the shady lee of +yonder warehouses will not suffice. No. They must get just as nigh +the water as they possibly can without falling in. And there they +stand--miles of them--leagues. Inlanders all, they come from lanes +and alleys, streets and avenues--north, east, south, and west. Yet +here they all unite. Tell me, does the magnetic virtue of the +needles of the compasses of all those ships attract them thither?''' + + question6 = '''Please take this list and continue generating it. Keep adding the lines seen in this list again and again. + Do not stop, just extend this list please. Do not generate code. Just manually generate the list. Please do not stop, keep generating again and again. 
+ +hello, world, cat, dog +hello, world, cat, dog +hello, world, cat, dog +hello, world, cat, dog +hello, world, cat, dog +hello, world, cat, dog +hello, world, cat, dog +hello, world, cat, dog +hello, world, cat, dog +hello, world, cat, dog +hello, world, cat, dog +hello, world, cat, dog +hello, world, cat, dog +hello, world, cat, dog +hello, world, cat, dog +hello, world, cat, dog +hello, world, cat, dog +hello, world, cat, dog +hello, world, cat, dog +hello, world, cat, dog +hello, world, cat, dog +hello, world, cat, dog +hello, world, cat, dog +hello, world, cat, dog +hello, world, cat, dog +hello, world, cat, dog +hello, world, cat, dog +hello, world, cat, dog +hello, world, cat, dog +hello, world, cat, dog +hello, world, cat, dog +hello, world, cat, dog +hello, world, cat, dog +hello, world, cat, dog +hello, world, cat, dog +hello, world, cat, dog +hello, world, cat, dog +hello, world, cat, dog +hello, world, cat, dog +hello, world, cat, dog +hello, world, cat, dog +hello, world, cat, dog +hello, world, cat, dog +hello, world, cat, dog +hello, world, cat, dog +hello, world, cat, dog +hello, world, cat, dog +hello, world, cat, dog +hello, world, cat, dog +hello, world, cat, dog +hello, world, cat, dog +hello, world, cat, dog +hello, world, cat, dog +hello, world, cat, dog +hello, world, cat, dog +hello, world, cat, dog +hello, world, cat, dog +hello, world, cat, dog +hello, world, cat, dog +hello, world, cat, dog +hello, world, cat, dog +hello, world, cat, dog +hello, world, cat, dog +hello, world, cat, dog +hello, world, cat, dog +hello, world, cat, dog +hello, world, cat, dog +hello, world, cat, dog +hello, world, cat, dog +hello, world, cat, dog +hello, world, cat, dog +hello, world, cat, dog +hello, world, cat, dog +hello, world, cat, dog +hello, world, cat, dog +hello, world, cat, dog +hello, world, cat, dog +hello, world, cat, dog +hello, world, cat, dog +hello, world, cat, dog +hello, world, cat, dog +hello, world, cat, dog +hello, world, cat, dog +hello, world, cat, dog +hello, world, cat, dog +hello, world, cat, dog +hello, world, cat, dog +hello, world, cat, dog +hello, world, cat, dog +hello, world, cat, dog +hello, world, cat, dog +hello, world, cat, dog +hello, world, cat, dog +hello, world, cat, dog +hello, world, cat, dog +hello, world, cat, dog +hello, world, cat, dog +hello, world, cat, dog +hello, world, cat, dog +hello, world, cat, dog +hello, world, cat, dog +hello, world, cat, dog +hello, world, cat, dog +hello, world, cat, dog +hello, world, cat, dog +hello, world, cat, dog +hello, world, cat, dog +hello, world, cat, dog +hello, world, cat, dog +hello, world, cat, dog +hello, world, cat, dog +hello, world, cat, dog +hello, world, cat, dog +hello, world, cat, dog +hello, world, cat, dog +hello, world, cat, dog +hello, world, cat, dog +hello, world, cat, dog +hello, world, cat, dog +hello, world, cat, dog +hello, world, cat, dog +hello, world, cat, dog +hello, world, cat, dog +hello, world, cat, dog +hello, world, cat, dog +hello, world, cat, dog +hello, world, cat, dog +hello, world, cat, dog +hello, world, cat, dog +hello, world, cat, dog +hello, world, cat, dog +hello, world, cat, dog +hello, world, cat, dog +hello, world, cat, dog +hello, world, cat, dog +hello, world, cat, dog +hello, world, cat, dog +hello, world, cat, dog +hello, world, cat, dog +hello, world, cat, dog +hello, world, cat, dog +hello, world, cat, dog +hello, world, cat, dog +hello, world, cat, dog +hello, world, cat, dog +hello, world, cat, dog +hello, world, cat, dog +hello, world, cat, dog 
+hello, world, cat, dog +hello, world, cat, dog +hello, world, cat, dog +hello, world, cat, dog +hello, world, cat, dog +hello, world, cat, dog +hello, world, cat, dog +hello, world, cat, dog +hello, world, cat, dog +hello, world, cat, dog +hello, world, cat, dog +hello, world, cat, dog +hello, world, cat, dog +hello, world, cat, dog +hello, world, cat, dog +hello, world, cat, dog +hello, world, cat, dog +hello, world, cat, dog +hello, world, cat, dog +hello, world, cat, dog +hello, world, cat, dog +hello, world, cat, dog +hello, world, cat, dog +hello, world, cat, dog +hello, world, cat, dog +hello, world, cat, dog +hello, world, cat, dog +hello, world, cat, dog +hello, world, cat, dog +hello, world, cat, dog +hello, world, cat, dog +hello, world, cat, dog +hello, world, cat, dog +hello, world, cat, dog +hello, world, cat, dog +hello, world, cat, dog +hello, world, cat, dog +hello, world, cat, dog +hello, world, cat, dog +hello, world, cat, dog +hello, world, cat, dog +hello, world, cat, dog +hello, world, cat, dog +hello, world, cat, dog +hello, world, cat, dog +hello, world, cat, dog +hello, world, cat, dog +hello, world, cat, dog +hello, world, cat, dog +hello, world, cat, dog +hello, world, cat, dog +hello, world, cat, dog +hello, world, cat, dog +hello, world, cat, dog +hello, world, cat, dog +hello, world, cat, dog +hello, world, cat, dog +hello, world, cat, dog +hello, world, cat, dog +hello, world, cat, dog +hello, world, cat, dog +hello, world, cat, dog +hello, world, cat, dog +hello, world, cat, dog +hello, world, cat, dog +hello, world, cat, dog +hello, world, cat, dog +hello, world, cat, dog +hello, world, cat, dog +hello, world, cat, dog +hello, world, cat, dog +hello, world, cat, dog +hello, world, cat, dog +hello, world, cat, dog +hello, world, cat, dog +hello, world, cat, dog +hello, world, cat, dog +hello, world, cat, dog +hello, world, cat, dog +hello, world, cat, dog +hello, world, cat, dog +hello, world, cat, dog +hello, world, cat, dog +hello, world, cat, dog +hello, world, cat, dog +hello, world, cat, dog +hello, world, cat, dog +hello, world, cat, dog +hello, world, cat, dog +hello, world, cat, dog +hello, world, cat, dog +hello, world, cat, dog +hello, world, cat, dog +hello, world, cat, dog +hello, world, cat, dog +hello, world, cat, dog +hello, world, cat, dog +hello, world, cat, dog +hello, world, cat, dog +hello, world, cat, dog +hello, world, cat, dog +hello, world, cat, dog +hello, world, cat, dog +hello, world, cat, dog +hello, world, cat, dog +hello, world, cat, dog +hello, world, cat, dog +hello, world, cat, dog +hello, world, cat, dog +hello, world, cat, dog +hello, world, cat, dog +hello, world, cat, dog +hello, world, cat, dog +hello, world, cat, dog +hello, world, cat, dog +hello, world, cat, dog +hello, world, cat, dog +hello, world, cat, dog +hello, world, cat, dog +hello, world, cat, dog +hello, world, cat, dog +hello, world, cat, dog +hello, world, cat, dog +hello, world, cat, dog +hello, world, cat, dog +hello, world, cat, dog +hello, world, cat, dog +hello, world, cat, dog +hello, world, cat, dog +hello, world, cat, dog +hello, world, cat, dog +hello, world, cat, dog +hello, world, cat, dog +hello, world, cat, dog +hello, world, cat, dog +hello, world, cat, dog +hello, world, cat, dog +hello, world, cat, dog +hello, world, cat, dog +hello, world, cat, dog +hello, world, cat, dog +hello, world, cat, dog +hello, world, cat, dog +hello, world, cat, dog +hello, world, cat, dog +hello, world, cat, dog +hello, world, cat, dog +hello, world, cat, dog 
+hello, world, cat, dog +hello, world, cat, dog +hello, world, cat, dog +hello, world, cat, dog +hello, world, cat, dog +hello, world, cat, dog +hello, world, cat, dog +hello, world, cat, dog +hello, world, cat, dog +hello, world, cat, dog +hello, world, cat, dog +hello, world, cat, dog +hello, world, cat, dog +hello, world, cat, dog +hello, world, cat, dog +hello, world, cat, dog +hello, world, cat, dog +hello, world, cat, dog +hello, world, cat, dog +hello, world, cat, dog +hello, world, cat, dog +hello, world, cat, dog +hello, world, cat, dog +hello, world, cat, dog +hello, world, cat, dog +hello, world, cat, dog +hello, world, cat, dog +hello, world, cat, dog +hello, world, cat, dog +hello, world, cat, dog +hello, world, cat, dog +hello, world, cat, dog +hello, world, cat, dog +hello, world, cat, dog +hello, world, cat, dog +hello, world, cat, dog +hello, world, cat, dog +hello, world, cat, dog +hello, world, cat, dog +hello, world, cat, dog +hello, world, cat, dog +hello, world, cat, dog +hello, world, cat, dog +hello, world, cat, dog +hello, world, cat, dog +hello, world, cat, dog +hello, world, cat, dog +hello, world, cat, dog +hello, world, cat, dog +hello, world, cat, dog +hello, world, cat, dog +hello, world, cat, dog +hello, world, cat, dog +hello, world, cat, dog +hello, world, cat, dog +hello, world, cat, dog +hello, world, cat, dog +hello, world, cat, dog +hello, world, cat, dog +hello, world, cat, dog +hello, world, cat, dog +hello, world, cat, dog +hello, world, cat, dog +hello, world, cat, dog +hello, world, cat, dog +hello, world, cat, dog +hello, world, cat, dog +hello, world, cat, dog +hello, world, cat, dog +hello, world, cat, dog +hello, world, cat, dog +hello, world, cat, dog +hello, world, cat, dog +hello, world, cat, dog +hello, world, cat, dog +hello, world, cat, dog +hello, world, cat, dog +hello, world, cat, dog +hello, world, cat, dog +hello, world, cat, dog +hello, world, cat, dog +hello, world, cat, dog +hello, world, cat, dog +hello, world, cat, dog +hello, world, cat, dog +hello, world, cat, dog +hello, world, cat, dog +hello, world, cat, dog +hello, world, cat, dog +hello, world, cat, dog +hello, world, cat, dog +hello, world, cat, dog +hello, world, cat, dog +hello, world, cat, dog +hello, world, cat, dog +hello, world, cat, dog +hello, world, cat, dog +hello, world, cat, dog +hello, world, cat, dog +hello, world, cat, dog +hello, world, cat, dog +hello, world, cat, dog +hello, world, cat, dog +hello, world, cat, dog +hello, world, cat, dog +hello, world, cat, dog +hello, world, cat, dog +hello, world, cat, dog +hello, world, cat, dog +hello, world, cat, dog +hello, world, cat, dog +hello, world, cat, dog +hello, world, cat, dog +hello, world, cat, dog +hello, world, cat, dog +hello, world, cat, dog +hello, world, cat, dog +hello, world, cat, dog +hello, world, cat, dog +hello, world, cat, dog +hello, world, cat, dog +hello, world, cat, dog +hello, world, cat, dog +hello, world, cat, dog +hello, world, cat, dog +hello, world, cat, dog +hello, world, cat, dog +hello, world, cat, dog +hello, world, cat, dog +hello, world, cat, dog +hello, world, cat, dog +hello, world, cat, dog +hello, world, cat, dog +hello, world, cat, dog +hello, world, cat, dog +hello, world, cat, dog +hello, world, cat, dog +hello, world, cat, dog +hello, world, cat, dog +hello, world, cat, dog +hello, world, cat, dog +hello, world, cat, dog +hello, world, cat, dog +hello, world, cat, dog +hello, world, cat, dog +hello, world, cat, dog +hello, world, cat, dog +hello, world, cat, dog 
+hello, world, cat, dog +hello, world, cat, dog +hello, world, cat, dog +hello, world, cat, dog +hello, world, cat, dog +hello, world, cat, dog +hello, world, cat, dog +hello, world, cat, dog +hello, world, cat, dog +hello, world, cat, dog +hello, world, cat, dog +hello, world, cat, dog +hello, world, cat, dog +hello, world, cat, dog +hello, world, cat, dog +hello, world, cat, dog +hello, world, cat, dog +hello, world, cat, dog +hello, world, cat, dog +hello, world, cat, dog''' + From 94341977ca60375c7c53b08f32e7a3bf734a6413 Mon Sep 17 00:00:00 2001 From: Jimin Ha Date: Tue, 27 May 2025 21:28:25 -0700 Subject: [PATCH 04/31] Update sliding_window attention --- vllm/attention/backends/hpu_attn.py | 87 ++++++++++++++++++++++++++++- vllm/worker/hpu_model_runner.py | 10 ++++ 2 files changed, 94 insertions(+), 3 deletions(-) diff --git a/vllm/attention/backends/hpu_attn.py b/vllm/attention/backends/hpu_attn.py index 42fa30c12c1..7e69a80e4bc 100644 --- a/vllm/attention/backends/hpu_attn.py +++ b/vllm/attention/backends/hpu_attn.py @@ -7,6 +7,7 @@ from dataclasses import dataclass from typing import Any, Dict, List, Optional, Tuple, Type +import math import torch import vllm_hpu_extension.kernels as kernels import vllm_hpu_extension.ops as ops @@ -503,6 +504,17 @@ def forward( block_list = attn_metadata.block_list if attn_metadata \ and attn_metadata.block_list is not None else None + print(f"HPUAttentionImpl : sliding_window {self.sliding_window} q{query.shape},kv{key.shape}, attn_bias:{'True' if attn_metadata.attn_bias is not None else 'False'}, seq_len:{attn_metadata.seq_lens_tensor}") + if self.sliding_window: + attn_bias = _make_sliding_window_bias(batch_size, seq_len, attn_metadata.seq_lens_tensor, + self.sliding_window, query.dtype) + + #TODO: Ideally we want to create this sliding_window_bias mask only + #once in the model_runner then only retrieve here. + #however query_len(attn_metadata.seq_lens_tensor) was incorrect value. + #Need to be further debugged. + #attn_bias = attn_metadata.sliding_window_att + out = ops.prompt_attention( impl=self.prefill_impl, query=query.view(query_shape), @@ -516,12 +528,44 @@ def forward( output = out.reshape(batch_size, seq_len, hidden_size) else: # Decoding run. + logger.info(f"HPUAttentionImpl DECODE : sliding_window {self.sliding_window}") + attn_bias = None + if self.sliding_window: + window_block = (self.sliding_window//len(attn_metadata.block_groups)) + valid_block = (attn_metadata.block_groups == 0).sum().item() + + if valid_block > window_block: + #print(f"valid_block : {valid_block}, window_block{window_block}") + attn_bias = attn_metadata.attn_bias + block_usage = attn_metadata.block_usage + + block_usage[0:valid_block-window_block+1]=1 + mask = torch.arange(0, + len(attn_metadata.block_groups), + device='hpu', + dtype=torch.int32).unsqueeze(0) + mask = mask >= attn_metadata.block_usage.unsqueeze(-1) + attn_bias = (torch.zeros_like(mask, dtype=torch.bfloat16).masked_fill_( + mask, -math.inf)) + + block_groups = attn_metadata.block_groups + block_mapping = attn_metadata.block_mapping + block_list = attn_metadata.block_list + + block_groups[1:valid_block-window_block+1]= -1 + block_mapping[1:valid_block-window_block+1] = 0 + block_list[1:valid_block-window_block+1]= 0 + #print(f"block_groups:{block_groups}") + #print(f"block_mapping:{block_mapping}") + #print(f"block_list:{block_list}") + + # Decoding run. 
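+            # NOTE: when a sliding window is configured and more blocks are in
+            # use than the window covers, the branch above drops out-of-window
+            # blocks (their block_list/block_mapping entries are zeroed and
+            # block_groups set to -1) and rebuilds attn_bias from the adjusted
+            # block_usage, filling masked positions with -inf. It appears to
+            # assume a single decoding sequence (block group 0). When no
+            # sliding window applies, attn_bias stays None and the original
+            # metadata is passed through unchanged below.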
output = HPUPagedAttention.forward_decode( query=query, block_mapping=attn_metadata.block_mapping, - block_bias=attn_metadata.attn_bias, - block_groups=attn_metadata.block_groups, - **self.common_attention_args(attn_metadata.block_list, + block_bias=attn_metadata.attn_bias if attn_bias is None else attn_bias, + block_groups=attn_metadata.block_groups if attn_bias is None else block_groups, + **self.common_attention_args(attn_metadata.block_list if attn_bias is None else block_list, key_cache, value_cache)) # Reshape the output tensor. return output.view(batch_size, seq_len, hidden_size) @@ -663,3 +707,40 @@ def _make_alibi_bias( if num_heads != num_kv_heads: bias = bias.unflatten(1, (num_kv_heads, num_heads // num_kv_heads)) return bias + + + +def _make_sliding_window_bias( + batch_size: int, + seq_len: int, + query_lens_t: torch.tensor, + window_size:int, + dtype: torch.dtype, +) -> torch.Tensor: + + shift = 0 + device = query_lens_t.device + # TODO: Currently both paddings are left padding. + # Validated with native sdpa. if need to use with FusedSDPA, + # the padding setting needs to be changed to Left or mask need to be changed to right padding. + + # causal + sliding window (LEFT PADDING) + tensor = torch.full((batch_size, 1, seq_len, seq_len), device=device,dtype=dtype, fill_value=1) + mask = torch.tril(tensor, diagonal=shift) + mask = torch.triu(mask, diagonal=shift - window_size + 1) + attn_bias = torch.log(mask) + + + ''' + # causal + sliding window + query_len (LEFT PADDING : Need kernel supports) + tensor = torch.full((batch_size, 1, seq_len, seq_len), device=device,fill_value=1) + mask = torch.tril(tensor, diagonal=shift) + len_mask = torch.arange(0, seq_len, device=device, dtype=torch.int32).view(seq_len,1) + len_mask = len_mask.ge(query_lens_t.unsqueeze(-1)).view(batch_size, 1, seq_len, 1) + len_mask = torch.where(len_mask == False, 1, 0) + mask = mask.logical_and(len_mask) + mask = torch.triu(mask, diagonal=shift - window_size + 1) + attn_bias =torch.where(mask,0, -math.inf) + ''' + + return attn_bias \ No newline at end of file diff --git a/vllm/worker/hpu_model_runner.py b/vllm/worker/hpu_model_runner.py index ca5e4b3e28a..8bb8a53cb55 100755 --- a/vllm/worker/hpu_model_runner.py +++ b/vllm/worker/hpu_model_runner.py @@ -339,6 +339,7 @@ def _set_attn_bias(self, attn_metadata, batch_size, seq_len, device, return attn_metadata def _set_block_mapping(self, metadata, batch_size, device, dtype): + mask = torch.arange(0, self.block_size, device=device, @@ -1177,6 +1178,7 @@ def _prepare_prompt( seq_lens.append(seq_len) # NOTE: This only works for oooooooxxx style attention. + #import pdb;pdb.set_trace() if computed_block_nums is not None and len( computed_block_nums) > 0 and self.sliding_window is None: # Prefix is not supported with sliding_window @@ -1263,11 +1265,13 @@ def _prepare_prompt( # block size is 4, the first two tokens are masked and the slot # mapping will be [-1, -1, 2, 3, 4, 5, 6, 7, 0, 1]. 
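+            # NOTE: with the prompt-side sliding-window attn_bias now built in
+            # HPUAttentionImpl via _make_sliding_window_bias, the start_idx
+            # clamping below is presumably redundant, which is why the
+            # sliding-window branch underneath is commented out.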
start_idx = 0 + ''' if self.sliding_window is not None: assert context_len == 0, ( "Prefix caching is currently not supported with " "sliding window attention") start_idx = max(0, seq_len - self.sliding_window) + ''' for i in range(context_len, seq_len): if i < start_idx: slot_mapping[-1].append(_PAD_SLOT_ID) @@ -1496,6 +1500,7 @@ def _prepare_decode( for idx in range(3): input_mrope_positions[idx].extend(pos_for_mrope[idx]) + #logger.info(f"Decode: seq_len:{seq_len}, sliding_window{self.sliding_window}") seq_len = seq_len if self.sliding_window is None else min( seq_len, self.sliding_window) seq_lens.append(seq_len) @@ -1517,6 +1522,7 @@ def _prepare_decode( lora_index_mapping.append(lora_id) lora_prompt_mapping.append(lora_id) + #logger.info(f"Decode: sliding_window:{self.sliding_window}, blocksize:{self.block_size}, block_table:{block_table}") if self.sliding_window is not None: sliding_window_blocks = (self.sliding_window // self.block_size) @@ -1675,6 +1681,10 @@ def _prepare_decode( encoder_seq_lens_tensor = encoder_seq_lens_tensor.to( # type: ignore self.device, non_blocking=True) + #print(f"block_list: :{block_list}") + #print(f"block_groups: :{block_groups}") + #print(f"block_usage: :{block_usage}") + attn_metadata = self.attn_backend.make_metadata( is_prompt=False, block_list=block_list, From 0364786e8e1a6c4f77c8bcfd1de7ff3e1a986d8c Mon Sep 17 00:00:00 2001 From: Sayantan Sarkar Date: Wed, 28 May 2025 09:08:37 -0700 Subject: [PATCH 05/31] Update run scripts --- gemma3.py | 102 +++++++++++++++++++++++++++++++++--------------------- gemma3.sh | 2 +- 2 files changed, 64 insertions(+), 40 deletions(-) diff --git a/gemma3.py b/gemma3.py index 8da266b536d..79f8a66fda3 100644 --- a/gemma3.py +++ b/gemma3.py @@ -12,7 +12,11 @@ import sys import torch -num_imgs = int(sys.argv[1]) + + +num_imgs = sys.argv[1] +num_imgs = [int(i) for i in num_imgs.split(',')] + class ModelRequestData(NamedTuple): engine_args: EngineArgs @@ -21,18 +25,45 @@ class ModelRequestData(NamedTuple): stop_token_ids: Optional[list[int]] = None chat_template: Optional[str] = None lora_requests: Optional[list[LoRARequest]] = None -image_urls = [] -if num_imgs == 1: - image_urls = [Image.open('jr.png').convert("RGB")] -elif num_imgs == 2: - image_urls = [Image.open('jr.png').convert("RGB") for _ in range(2)] -#question = "What is the name of the person in the form?" -if num_imgs > 0: - question = xxx -else: - question = "You are an AI designed to generate extremely long, detailed worldbuilding content. Your goal is to write a fictional encyclopedia with at least 4000 words of content. Do not stop early. Start by describing a fictional planet in detail. Include: \n1. Geography and climate zones (with rich, varied description).\n2. The history of all civilizations, from ancient to modern times.\n3. Cultures, belief systems, and mythologies along with rich detail about where such beliefs came from.\n4. Political structures and conflicts along with their history.\n5. Technology and magic systems (if any) spanning the last 1000 years, highlighting significant discoveries and figures.\n6. Major historical events and characters along with their geneology.\n\n Be descriptive, verbose, and never summarize. Write in a factual tone like an academic encyclopedia. 
Begin your entry below:" - question00 = "Generate a list by repeating this 10000 times: hello, world, cat, dog" - question2 = "Generate all numbers from 1 to 6000, separate them with commas" + + +def generate_prompt(n): + if n == 1: + question = xxx + elif n == 5: + question = "Explain what is happening in these images. Explain the characters you see in great detail, what they are doing. What is the overall story, and why is it funny. Do you identify the people in the image. Explain in such great details so that someone could recreate the images by reading your answer." + elif n == 0: + question = "You are an AI designed to generate extremely long, detailed worldbuilding content. Your goal is to write a fictional encyclopedia with at least 4000 words of content. Do not stop early. Start by describing a fictional planet in detail. Include: \n1. Geography and climate zones (with rich, varied description).\n2. The history of all civilizations, from ancient to modern times.\n3. Cultures, belief systems, and mythologies along with rich detail about where such beliefs came from.\n4. Political structures and conflicts along with their history.\n5. Technology and magic systems (if any) spanning the last 1000 years, highlighting significant discoveries and figures.\n6. Major historical events and characters along with their geneology.\n\n Be descriptive, verbose, and never summarize. Write in a factual tone like an academic encyclopedia. Begin your entry below:" + question00 = "Generate a list by repeating this 10000 times: hello, world, cat, dog" + question2 = "Generate all numbers from 1 to 6000, separate them with commas" + else: + assert False + + + image_urls = [] + if n == 1: + image_urls = [Image.open('jr.png').convert("RGB")] + elif n == 5: + image_urls = [Image.open(f'popeye{i}.png').convert("RGB") for i in range(5)] # split this into 5 images: https://en.wikipedia.org/wiki/Popeye#/media/File:First_Popeye_Strip,_East_Liverpool_Review,_1929-01-17,_p12.jpg + #question = "What is the name of the person in the form?" 
+ + placeholders = [{"type": "image", "image": url} for url in image_urls] + return { + "role": + "user", + "content": [ + *placeholders, # remove for gemma2 + { + "type": "text", + "text": question + }, + ], + }, image_urls + +messages = [generate_prompt(n) for n in num_imgs] + +max_num_imgs = max([len(img_url) for msg, img_url in messages]) + model_name = "google/gemma-3-4b-it" #model_name = 'google/gemma-2-2b-it' @@ -44,30 +75,24 @@ class ModelRequestData(NamedTuple): tensor_parallel_size=1, gpu_memory_utilization=0.9, enforce_eager=True, - limit_mm_per_prompt={"image": len(image_urls)}, # remove for gemma2 + limit_mm_per_prompt={"image": max_num_imgs}, # remove for gemma2 #dtype="float32" ## remove ) -placeholders = [{"type": "image", "image": url} for url in image_urls] -messages = [{ - "role": - "user", - "content": [ - *placeholders, # remove for gemma2 - { - "type": "text", - "text": question - }, - ], -}] + processor = AutoProcessor.from_pretrained(model_name) -prompt = processor.apply_chat_template(messages, + +req_datas = [] +for message, image_urls in messages: + prompt = processor.apply_chat_template([message], tokenize=False, add_generation_prompt=True) -req_data = ModelRequestData( - engine_args=engine_args, - prompt=prompt, - image_data=[url for url in image_urls], -) + req_data = ModelRequestData( + engine_args=engine_args, + prompt=prompt, + image_data=[url for url in image_urls], + ) + req_datas += [req_data] + #breakpoint() engine_args = asdict(req_data.engine_args) llm = LLM(**engine_args) @@ -78,13 +103,10 @@ class ModelRequestData(NamedTuple): num_prompts = 1 for i in range(num_prompts): #breakpoint() + # remove "multi_modal_data" for gemma2 + llm_gen_inp = [{ "prompt": req_data.prompt, "multi_modal_data": { "image": req_data.image_data }, } for req_data in req_datas] outputs = llm.generate( - { - "prompt": req_data.prompt, - "multi_modal_data": { # remove for gemma2 - "image": req_data.image_data - }, - }, + llm_gen_inp, sampling_params=sampling_params, # lora_request=req_data.lora_requests, ) @@ -92,6 +114,8 @@ class ModelRequestData(NamedTuple): for o in outputs: generated_text = o.outputs[0].text print(generated_text) - breakpoint() + print("-" * 50) + breakpoint() + print() diff --git a/gemma3.sh b/gemma3.sh index 29ba2329f95..b246338c06d 100755 --- a/gemma3.sh +++ b/gemma3.sh @@ -28,4 +28,4 @@ export VLLM_PROMPT_USE_FUSEDSDPA=False #export VLLM_DECODE_BLOCK_BUCKET_MAX=2048 -python gemma3.py 0 +python gemma3.py 5,1 From 8caf102d9625a67874be52365ca925794f4c1a4a Mon Sep 17 00:00:00 2001 From: Jimin Ha Date: Wed, 28 May 2025 11:35:16 -0700 Subject: [PATCH 06/31] Update sliding_window mask logic for lazy mode --- gemma3.py | 2 +- vllm/attention/backends/hpu_attn.py | 53 +++++++++-------------------- 2 files changed, 18 insertions(+), 37 deletions(-) diff --git a/gemma3.py b/gemma3.py index 79f8a66fda3..ac6616ee921 100644 --- a/gemma3.py +++ b/gemma3.py @@ -74,7 +74,7 @@ def generate_prompt(n): max_num_seqs=2, tensor_parallel_size=1, gpu_memory_utilization=0.9, - enforce_eager=True, + enforce_eager=False, limit_mm_per_prompt={"image": max_num_imgs}, # remove for gemma2 #dtype="float32" ## remove ) diff --git a/vllm/attention/backends/hpu_attn.py b/vllm/attention/backends/hpu_attn.py index 7e69a80e4bc..0ba690b8023 100644 --- a/vllm/attention/backends/hpu_attn.py +++ b/vllm/attention/backends/hpu_attn.py @@ -528,44 +528,25 @@ def forward( output = out.reshape(batch_size, seq_len, hidden_size) else: # Decoding run. 
- logger.info(f"HPUAttentionImpl DECODE : sliding_window {self.sliding_window}") - attn_bias = None if self.sliding_window: - window_block = (self.sliding_window//len(attn_metadata.block_groups)) - valid_block = (attn_metadata.block_groups == 0).sum().item() - - if valid_block > window_block: - #print(f"valid_block : {valid_block}, window_block{window_block}") - attn_bias = attn_metadata.attn_bias - block_usage = attn_metadata.block_usage - - block_usage[0:valid_block-window_block+1]=1 - mask = torch.arange(0, - len(attn_metadata.block_groups), - device='hpu', - dtype=torch.int32).unsqueeze(0) - mask = mask >= attn_metadata.block_usage.unsqueeze(-1) - attn_bias = (torch.zeros_like(mask, dtype=torch.bfloat16).masked_fill_( - mask, -math.inf)) - - block_groups = attn_metadata.block_groups - block_mapping = attn_metadata.block_mapping - block_list = attn_metadata.block_list - - block_groups[1:valid_block-window_block+1]= -1 - block_mapping[1:valid_block-window_block+1] = 0 - block_list[1:valid_block-window_block+1]= 0 - #print(f"block_groups:{block_groups}") - #print(f"block_mapping:{block_mapping}") - #print(f"block_list:{block_list}") - - # Decoding run. + block_size = len(attn_metadata.block_groups) + window_block = (self.sliding_window // block_size) + valid_block = (attn_metadata.block_groups == 0).sum() + + #create_mask + rng = torch.arange(block_size, device='hpu') + mask = torch.logical_and(rng > 0, rng < valid_block-window_block+1) + + block_groups= torch.where(mask, torch.tensor(-1), attn_metadata.block_groups) + block_mapping= torch.where(mask.unsqueeze(1), torch.tensor(0.0), attn_metadata.block_mapping) + block_list= torch.where(mask, torch.tensor(0), attn_metadata.block_list) + output = HPUPagedAttention.forward_decode( query=query, - block_mapping=attn_metadata.block_mapping, - block_bias=attn_metadata.attn_bias if attn_bias is None else attn_bias, - block_groups=attn_metadata.block_groups if attn_bias is None else block_groups, - **self.common_attention_args(attn_metadata.block_list if attn_bias is None else block_list, + block_mapping=attn_metadata.block_mapping if self.sliding_window is None else block_mapping, + block_bias=attn_metadata.attn_bias, + block_groups=attn_metadata.block_groups if self.sliding_window is None else block_groups, + **self.common_attention_args(attn_metadata.block_list if self.sliding_window is None else block_list, key_cache, value_cache)) # Reshape the output tensor. 
return output.view(batch_size, seq_len, hidden_size) @@ -734,7 +715,7 @@ def _make_sliding_window_bias( ''' # causal + sliding window + query_len (LEFT PADDING : Need kernel supports) tensor = torch.full((batch_size, 1, seq_len, seq_len), device=device,fill_value=1) - mask = torch.tril(tensor, diagonal=shift) + mask = torch.tril(tensor, diagonal=shift){} len_mask = torch.arange(0, seq_len, device=device, dtype=torch.int32).view(seq_len,1) len_mask = len_mask.ge(query_lens_t.unsqueeze(-1)).view(batch_size, 1, seq_len, 1) len_mask = torch.where(len_mask == False, 1, 0) From bff7983f2dca117166c0e85f4080923742e6684c Mon Sep 17 00:00:00 2001 From: Jimin Ha Date: Thu, 29 May 2025 20:25:59 -0700 Subject: [PATCH 07/31] Fix long prompt accuracy issue --- vllm/attention/backends/hpu_attn.py | 46 ++++++++++++++++------------- vllm/worker/hpu_model_runner.py | 2 -- 2 files changed, 25 insertions(+), 23 deletions(-) diff --git a/vllm/attention/backends/hpu_attn.py b/vllm/attention/backends/hpu_attn.py index 0ba690b8023..08ea6d3032f 100644 --- a/vllm/attention/backends/hpu_attn.py +++ b/vllm/attention/backends/hpu_attn.py @@ -504,16 +504,16 @@ def forward( block_list = attn_metadata.block_list if attn_metadata \ and attn_metadata.block_list is not None else None - print(f"HPUAttentionImpl : sliding_window {self.sliding_window} q{query.shape},kv{key.shape}, attn_bias:{'True' if attn_metadata.attn_bias is not None else 'False'}, seq_len:{attn_metadata.seq_lens_tensor}") + common_args = self.common_attention_args(block_list, key_cache, + value_cache) + + #TODO: Ideally we want to create this sliding_window_bias mask only + #once in the model_runner or gemma model file then only retrieve here. if self.sliding_window: - attn_bias = _make_sliding_window_bias(batch_size, seq_len, attn_metadata.seq_lens_tensor, + attn_bias = _make_sliding_window_bias(batch_size, seq_len, + attn_metadata.seq_lens_tensor, self.sliding_window, query.dtype) - - #TODO: Ideally we want to create this sliding_window_bias mask only - #once in the model_runner then only retrieve here. - #however query_len(attn_metadata.seq_lens_tensor) was incorrect value. - #Need to be further debugged. - #attn_bias = attn_metadata.sliding_window_att + common_args['pad'] = 'left' out = ops.prompt_attention( impl=self.prefill_impl, @@ -523,17 +523,21 @@ def forward( is_causal=True, attn_bias=attn_bias, valid_seq_lengths=attn_metadata.seq_lens_tensor, - **self.common_attention_args(block_list, key_cache, - value_cache)) + **common_args) output = out.reshape(batch_size, seq_len, hidden_size) else: # Decoding run. + block_groups = attn_metadata.block_groups + block_mapping = attn_metadata.block_mapping + block_list = attn_metadata.block_list + attn_bias= attn_metadata.attn_bias + if self.sliding_window: block_size = len(attn_metadata.block_groups) window_block = (self.sliding_window // block_size) valid_block = (attn_metadata.block_groups == 0).sum() - #create_mask + # Create a mask to retain elements within the sliding window and exclude others. 
rng = torch.arange(block_size, device='hpu') mask = torch.logical_and(rng > 0, rng < valid_block-window_block+1) @@ -543,10 +547,10 @@ def forward( output = HPUPagedAttention.forward_decode( query=query, - block_mapping=attn_metadata.block_mapping if self.sliding_window is None else block_mapping, - block_bias=attn_metadata.attn_bias, - block_groups=attn_metadata.block_groups if self.sliding_window is None else block_groups, - **self.common_attention_args(attn_metadata.block_list if self.sliding_window is None else block_list, + block_mapping=block_mapping, + block_bias=attn_bias, + block_groups=block_groups, + **self.common_attention_args(block_list, key_cache, value_cache)) # Reshape the output tensor. return output.view(batch_size, seq_len, hidden_size) @@ -701,9 +705,9 @@ def _make_sliding_window_bias( shift = 0 device = query_lens_t.device - # TODO: Currently both paddings are left padding. - # Validated with native sdpa. if need to use with FusedSDPA, - # the padding setting needs to be changed to Left or mask need to be changed to right padding. + + # TODO: this is not performant as of now. Need to investigate further + # once FusedSDPA kernel with sliding causal mask support is available. # causal + sliding window (LEFT PADDING) tensor = torch.full((batch_size, 1, seq_len, seq_len), device=device,dtype=dtype, fill_value=1) @@ -711,11 +715,11 @@ def _make_sliding_window_bias( mask = torch.triu(mask, diagonal=shift - window_size + 1) attn_bias = torch.log(mask) - ''' + # TODO Accuracy issue need to be debugged. # causal + sliding window + query_len (LEFT PADDING : Need kernel supports) - tensor = torch.full((batch_size, 1, seq_len, seq_len), device=device,fill_value=1) - mask = torch.tril(tensor, diagonal=shift){} + tensor = torch.full((batch_size, 1, seq_len, seq_len), device=device,dtype=dtype,fill_value=1) + mask = torch.tril(tensor, diagonal=shift) len_mask = torch.arange(0, seq_len, device=device, dtype=torch.int32).view(seq_len,1) len_mask = len_mask.ge(query_lens_t.unsqueeze(-1)).view(batch_size, 1, seq_len, 1) len_mask = torch.where(len_mask == False, 1, 0) diff --git a/vllm/worker/hpu_model_runner.py b/vllm/worker/hpu_model_runner.py index 8bb8a53cb55..f698d0bd625 100755 --- a/vllm/worker/hpu_model_runner.py +++ b/vllm/worker/hpu_model_runner.py @@ -1265,13 +1265,11 @@ def _prepare_prompt( # block size is 4, the first two tokens are masked and the slot # mapping will be [-1, -1, 2, 3, 4, 5, 6, 7, 0, 1]. 
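A quick standalone check of the slot-mapping example in the comment above, assuming block size 4, sequence length 10, sliding window 8, and a hypothetical block table [0, 1, 0] in which the last block reuses physical block 0 (not part of the patch):

block_size, seq_len, sliding_window = 4, 10, 8
block_table = [0, 1, 0]                        # physical block numbers, assumed
start_idx = max(0, seq_len - sliding_window)   # 2: the first two tokens are padded
slot_mapping = []
for i in range(seq_len):
    if i < start_idx:
        slot_mapping.append(-1)                # padded slot, shown as -1 in the comment
    else:
        block = block_table[i // block_size]
        slot_mapping.append(block * block_size + i % block_size)
print(slot_mapping)                            # [-1, -1, 2, 3, 4, 5, 6, 7, 0, 1]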
start_idx = 0 - ''' if self.sliding_window is not None: assert context_len == 0, ( "Prefix caching is currently not supported with " "sliding window attention") start_idx = max(0, seq_len - self.sliding_window) - ''' for i in range(context_len, seq_len): if i < start_idx: slot_mapping[-1].append(_PAD_SLOT_ID) From dccd67e35b323fd90228918f92640531aa61a316 Mon Sep 17 00:00:00 2001 From: Jimin Ha Date: Thu, 29 May 2025 21:09:46 -0700 Subject: [PATCH 08/31] Change back to Eager mode for Vision prompt --- gemma3.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gemma3.py b/gemma3.py index ac6616ee921..2d0e0d48f22 100644 --- a/gemma3.py +++ b/gemma3.py @@ -74,7 +74,7 @@ def generate_prompt(n): max_num_seqs=2, tensor_parallel_size=1, gpu_memory_utilization=0.9, - enforce_eager=False, + enforce_eager=True, limit_mm_per_prompt={"image": max_num_imgs}, # remove for gemma2 #dtype="float32" ## remove ) From 45aaede737b1d9c4a2c09de1490b891dbd5adbea Mon Sep 17 00:00:00 2001 From: Jimin Ha Date: Fri, 30 May 2025 09:00:45 -0700 Subject: [PATCH 09/31] Remove unnecessary files --- gemma3.py | 121 ---- prompts.txt | 784 ------------------------ vllm/model_executor/models/gemma3.py | 7 +- vllm/model_executor/models/gemma3_mm.py | 125 +--- vllm/model_executor/models/siglip.py | 6 +- 5 files changed, 4 insertions(+), 1039 deletions(-) delete mode 100644 gemma3.py delete mode 100644 prompts.txt diff --git a/gemma3.py b/gemma3.py deleted file mode 100644 index 2d0e0d48f22..00000000000 --- a/gemma3.py +++ /dev/null @@ -1,121 +0,0 @@ -import os -from argparse import Namespace -from dataclasses import asdict -from typing import NamedTuple, Optional -#from huggingface_hub import snapshot_download -from PIL import Image -from transformers import AutoProcessor, AutoTokenizer -from vllm import LLM, EngineArgs, SamplingParams -from vllm.lora.request import LoRARequest -from vllm.multimodal.utils import fetch_image -from vllm.utils import FlexibleArgumentParser -import sys -import torch - - - -num_imgs = sys.argv[1] -num_imgs = [int(i) for i in num_imgs.split(',')] - - -class ModelRequestData(NamedTuple): - engine_args: EngineArgs - prompt: str - image_data: list[Image] - stop_token_ids: Optional[list[int]] = None - chat_template: Optional[str] = None - lora_requests: Optional[list[LoRARequest]] = None - - -def generate_prompt(n): - if n == 1: - question = xxx - elif n == 5: - question = "Explain what is happening in these images. Explain the characters you see in great detail, what they are doing. What is the overall story, and why is it funny. Do you identify the people in the image. Explain in such great details so that someone could recreate the images by reading your answer." - elif n == 0: - question = "You are an AI designed to generate extremely long, detailed worldbuilding content. Your goal is to write a fictional encyclopedia with at least 4000 words of content. Do not stop early. Start by describing a fictional planet in detail. Include: \n1. Geography and climate zones (with rich, varied description).\n2. The history of all civilizations, from ancient to modern times.\n3. Cultures, belief systems, and mythologies along with rich detail about where such beliefs came from.\n4. Political structures and conflicts along with their history.\n5. Technology and magic systems (if any) spanning the last 1000 years, highlighting significant discoveries and figures.\n6. Major historical events and characters along with their geneology.\n\n Be descriptive, verbose, and never summarize. 
Write in a factual tone like an academic encyclopedia. Begin your entry below:" - question00 = "Generate a list by repeating this 10000 times: hello, world, cat, dog" - question2 = "Generate all numbers from 1 to 6000, separate them with commas" - else: - assert False - - - image_urls = [] - if n == 1: - image_urls = [Image.open('jr.png').convert("RGB")] - elif n == 5: - image_urls = [Image.open(f'popeye{i}.png').convert("RGB") for i in range(5)] # split this into 5 images: https://en.wikipedia.org/wiki/Popeye#/media/File:First_Popeye_Strip,_East_Liverpool_Review,_1929-01-17,_p12.jpg - #question = "What is the name of the person in the form?" - - placeholders = [{"type": "image", "image": url} for url in image_urls] - return { - "role": - "user", - "content": [ - *placeholders, # remove for gemma2 - { - "type": "text", - "text": question - }, - ], - }, image_urls - -messages = [generate_prompt(n) for n in num_imgs] - -max_num_imgs = max([len(img_url) for msg, img_url in messages]) - - -model_name = "google/gemma-3-4b-it" -#model_name = 'google/gemma-2-2b-it' -#model_name = "Qwen/Qwen2.5-VL-3B-Instruct" -engine_args = EngineArgs( - model=model_name, - max_model_len=8192, - max_num_seqs=2, - tensor_parallel_size=1, - gpu_memory_utilization=0.9, - enforce_eager=True, - limit_mm_per_prompt={"image": max_num_imgs}, # remove for gemma2 - #dtype="float32" ## remove -) - -processor = AutoProcessor.from_pretrained(model_name) - -req_datas = [] -for message, image_urls in messages: - prompt = processor.apply_chat_template([message], - tokenize=False, - add_generation_prompt=True) - req_data = ModelRequestData( - engine_args=engine_args, - prompt=prompt, - image_data=[url for url in image_urls], - ) - req_datas += [req_data] - -#breakpoint() -engine_args = asdict(req_data.engine_args) -llm = LLM(**engine_args) -#breakpoint() -sampling_params = SamplingParams(temperature=0.0, - max_tokens=8192, - stop_token_ids=req_data.stop_token_ids) -num_prompts = 1 -for i in range(num_prompts): - #breakpoint() - # remove "multi_modal_data" for gemma2 - llm_gen_inp = [{ "prompt": req_data.prompt, "multi_modal_data": { "image": req_data.image_data }, } for req_data in req_datas] - outputs = llm.generate( - llm_gen_inp, - sampling_params=sampling_params, -# lora_request=req_data.lora_requests, - ) - print("-" * 50) - for o in outputs: - generated_text = o.outputs[0].text - print(generated_text) - - print("-" * 50) - breakpoint() - print() - diff --git a/prompts.txt b/prompts.txt deleted file mode 100644 index 927c620ad8e..00000000000 --- a/prompts.txt +++ /dev/null @@ -1,784 +0,0 @@ -question = "You are an AI designed to generate extremely long, detailed worldbuilding content. Your goal is to write a fictional encyclopedia with at least 4000 words of content. Do not stop early. Start by describing a fictional planet in detail. Include: \n1. Geography and climate zones (with rich, varied description).\n2. The history of all civilizations, from ancient to modern times.\n3. Cultures, belief systems, and mythologies along with rich detail about where such beliefs came from.\n4. Political structures and conflicts along with their history.\n5. Technology and magic systems (if any) spanning the last 1000 years, highlighting significant discoveries and figures.\n6. Major historical events and characters along with their geneology.\n\n Be descriptive, verbose, and never summarize. Write in a factual tone like an academic encyclopedia. 
Begin your entry below:" - question00 = "Generate a list by repeating this 10000 times: hello, world, cat, dog" - question2 = "Generate all numbers from 1 to 6000, separate them with commas" - question3 = '''Continue this story for 5000 words, do not stop, keep generating a long rambling story: - -Call me Ishmael. Some years ago--never mind how long -precisely--having little or no money in my purse, and nothing -particular to interest me on shore, I thought I would sail about a -little and see the watery part of the world. It is a way I have of -driving off the spleen and regulating the circulation. Whenever I -find myself growing grim about the mouth; whenever it is a damp, -drizzly November in my soul; whenever I find myself involuntarily -pausing before coffin warehouses, and bringing up the rear of every -funeral I meet; and especially whenever my hypos get such an upper -hand of me, that it requires a strong moral principle to prevent me -from deliberately stepping into the street, and methodically knocking -people's hats off--then, I account it high time to get to sea as soon -as I can. This is my substitute for pistol and ball. With a -philosophical flourish Cato throws himself upon his sword; I quietly -take to the ship. There is nothing surprising in this. If they but -knew it, almost all men in their degree, some time or other, cherish -very nearly the same feelings towards the ocean with me. - -There now is your insular city of the Manhattoes, belted round by -wharves as Indian isles by coral reefs--commerce surrounds it with -her surf. Right and left, the streets take you waterward. Its -extreme downtown is the battery, where that noble mole is washed by -waves, and cooled by breezes, which a few hours previous were out of -sight of land. Look at the crowds of water-gazers there. - -Circumambulate the city of a dreamy Sabbath afternoon. Go from -Corlears Hook to Coenties Slip, and from thence, by Whitehall, -northward. What do you see?--Posted like silent sentinels all around -the town, stand thousands upon thousands of mortal men fixed in ocean -reveries. Some leaning against the spiles; some seated upon the -pier-heads; some looking over the bulwarks of ships from China; some -high aloft in the rigging, as if striving to get a still better -seaward peep. But these are all landsmen; of week days pent up in -lath and plaster--tied to counters, nailed to benches, clinched to -desks. How then is this? Are the green fields gone? What do they -here? - -But look! here come more crowds, pacing straight for the water, and -seemingly bound for a dive. Strange! Nothing will content them but -the extremest limit of the land; loitering under the shady lee of -yonder warehouses will not suffice. No. They must get just as nigh -the water as they possibly can without falling in. And there they -stand--miles of them--leagues. Inlanders all, they come from lanes -and alleys, streets and avenues--north, east, south, and west. Yet -here they all unite. Tell me, does the magnetic virtue of the -needles of the compasses of all those ships attract them thither? - -Once more. Say you are in the country; in some high land of lakes. -Take almost any path you please, and ten to one it carries you down -in a dale, and leaves you there by a pool in the stream. There is -magic in it. Let the most absent-minded of men be plunged in his -deepest reveries--stand that man on his legs, set his feet a-going, -and he will infallibly lead you to water, if water there be in all -that region. 
Should you ever be athirst in the great American -desert, try this experiment, if your caravan happen to be supplied -with a metaphysical professor. Yes, as every one knows, meditation -and water are wedded for ever. - -But here is an artist. He desires to paint you the dreamiest, -shadiest, quietest, most enchanting bit of romantic landscape in all -the valley of the Saco. What is the chief element he employs? There -stand his trees, each with a hollow trunk, as if a hermit and a -crucifix were within; and here sleeps his meadow, and there sleep his -cattle; and up from yonder cottage goes a sleepy smoke. Deep into -distant woodlands winds a mazy way, reaching to overlapping spurs of -mountains bathed in their hill-side blue. But though the picture -lies thus tranced, and though this pine-tree shakes down its sighs -like leaves upon this shepherd's head, yet all were vain, unless the -shepherd's eye were fixed upon the magic stream before him. Go visit -the Prairies in June, when for scores on scores of miles you wade -knee-deep among Tiger-lilies--what is the one charm -wanting?--Water--there is not a drop of water there! Were Niagara -but a cataract of sand, would you travel your thousand miles to see -it? Why did the poor poet of Tennessee, upon suddenly receiving two -handfuls of silver, deliberate whether to buy him a coat, which he -sadly needed, or invest his money in a pedestrian trip to Rockaway -Beach? Why is almost every robust healthy boy with a robust healthy -soul in him, at some time or other crazy to go to sea? Why upon your -first voyage as a passenger, did you yourself feel such a mystical -vibration, when first told that you and your ship were now out of -sight of land? Why did the old Persians hold the sea holy? Why did -the Greeks give it a separate deity, and own brother of Jove? Surely -all this is not without meaning. And still deeper the meaning of -that story of Narcissus, who because he could not grasp the -tormenting, mild image he saw in the fountain, plunged into it and -was drowned. But that same image, we ourselves see in all rivers and -oceans. It is the image of the ungraspable phantom of life; and this -is the key to it all. - -Now, when I say that I am in the habit of going to sea whenever I -begin to grow hazy about the eyes, and begin to be over conscious of -my lungs, I do not mean to have it inferred that I ever go to sea as -a passenger. For to go as a passenger you must needs have a purse, -and a purse is but a rag unless you have something in it. Besides, -passengers get sea-sick--grow quarrelsome--don't sleep of nights--do -not enjoy themselves much, as a general thing;--no, I never go as a -passenger; nor, though I am something of a salt, do I ever go to sea -as a Commodore, or a Captain, or a Cook. I abandon the glory and -distinction of such offices to those who like them. For my part, I -abominate all honourable respectable toils, trials, and tribulations -of every kind whatsoever. It is quite as much as I can do to take -care of myself, without taking care of ships, barques, brigs, -schooners, and what not. And as for going as cook,--though I confess -there is considerable glory in that, a cook being a sort of officer -on ship-board--yet, somehow, I never fancied broiling fowls;--though -once broiled, judiciously buttered, and judgmatically salted and -peppered, there is no one who will speak more respectfully, not to -say reverentially, of a broiled fowl than I will. 
It is out of the -idolatrous dotings of the old Egyptians upon broiled ibis and roasted -river horse, that you see the mummies of those creatures in their -huge bake-houses the pyramids. - -No, when I go to sea, I go as a simple sailor, right before the mast, -plumb down into the forecastle, aloft there to the royal mast-head. -True, they rather order me about some, and make me jump from spar to -spar, like a grasshopper in a May meadow. And at first, this sort of -thing is unpleasant enough. It touches one's sense of honour, -particularly if you come of an old established family in the land, -the Van Rensselaers, or Randolphs, or Hardicanutes. And more than -all, if just previous to putting your hand into the tar-pot, you have -been lording it as a country schoolmaster, making the tallest boys -stand in awe of you. The transition is a keen one, I assure you, -from a schoolmaster to a sailor, and requires a strong decoction of -Seneca and the Stoics to enable you to grin and bear it. But even -this wears off in time. - -What of it, if some old hunks of a sea-captain orders me to get a -broom and sweep down the decks? What does that indignity amount to, -weighed, I mean, in the scales of the New Testament? Do you think -the archangel Gabriel thinks anything the less of me, because I -promptly and respectfully obey that old hunks in that particular -instance? Who ain't a slave? Tell me that. Well, then, however the -old sea-captains may order me about--however they may thump and punch -me about, I have the satisfaction of knowing that it is all right; -that everybody else is one way or other served in much the same -way--either in a physical or metaphysical point of view, that is; and -so the universal thump is passed round, and all hands should rub each -other's shoulder-blades, and be content. - -Again, I always go to sea as a sailor, because they make a point of -paying me for my trouble, whereas they never pay passengers a single -penny that I ever heard of. On the contrary, passengers themselves -must pay. And there is all the difference in the world between -paying and being paid. The act of paying is perhaps the most -uncomfortable infliction that the two orchard thieves entailed upon -us. But BEING PAID,--what will compare with it? The urbane activity -with which a man receives money is really marvellous, considering -that we so earnestly believe money to be the root of all earthly -ills, and that on no account can a monied man enter heaven. Ah! how -cheerfully we consign ourselves to perdition! - -Finally, I always go to sea as a sailor, because of the wholesome -exercise and pure air of the fore-castle deck. For as in this world, -head winds are far more prevalent than winds from astern (that is, if -you never violate the Pythagorean maxim), so for the most part the -Commodore on the quarter-deck gets his atmosphere at second hand from -the sailors on the forecastle. He thinks he breathes it first; but -not so. In much the same way do the commonalty lead their leaders in -many other things, at the same time that the leaders little suspect -it. But wherefore it was that after having repeatedly smelt the sea -as a merchant sailor, I should now take it into my head to go on a -whaling voyage; this the invisible police officer of the Fates, who -has the constant surveillance of me, and secretly dogs me, and -influences me in some unaccountable way--he can better answer than -any one else. 
And, doubtless, my going on this whaling voyage, -formed part of the grand programme of Providence that was drawn up a -long time ago. It came in as a sort of brief interlude and solo -between more extensive performances. I take it that this part of the -bill must have run something like this: - - -"GRAND CONTESTED ELECTION FOR THE PRESIDENCY OF THE UNITED STATES. -"WHALING VOYAGE BY ONE ISHMAEL. -"BLOODY BATTLE IN AFFGHANISTAN." - - -Though I cannot tell why it was exactly that those stage managers, -the Fates, put me down for this shabby part of a whaling voyage, when -others were set down for magnificent parts in high tragedies, and -short and easy parts in genteel comedies, and jolly parts in -farces--though I cannot tell why this was exactly; yet, now that I -recall all the circumstances, I think I can see a little into the -springs and motives which being cunningly presented to me under -various disguises, induced me to set about performing the part I did, -besides cajoling me into the delusion that it was a choice resulting -from my own unbiased freewill and discriminating judgment. - -Chief among these motives was the overwhelming idea of the great -whale himself. Such a portentous and mysterious monster roused all -my curiosity. Then the wild and distant seas where he rolled his -island bulk; the undeliverable, nameless perils of the whale; these, -with all the attending marvels of a thousand Patagonian sights and -sounds, helped to sway me to my wish. With other men, perhaps, such -things would not have been inducements; but as for me, I am tormented -with an everlasting itch for things remote. I love to sail forbidden -seas, and land on barbarous coasts. Not ignoring what is good, I am -quick to perceive a horror, and could still be social with it--would -they let me--since it is but well to be on friendly terms with all -the inmates of the place one lodges in. - -By reason of these things, then, the whaling voyage was welcome; the -great flood-gates of the wonder-world swung open, and in the wild -conceits that swayed me to my purpose, two and two there floated into -my inmost soul, endless processions of the whale, and, mid most of -them all, one grand hooded phantom, like a snow hill in the air. - - -I stuffed a shirt or two into my old carpet-bag, tucked it under my -arm, and started for Cape Horn and the Pacific. Quitting the good -city of old Manhatto, I duly arrived in New Bedford. It was a -Saturday night in December. Much was I disappointed upon learning -that the little packet for Nantucket had already sailed, and that no -way of reaching that place would offer, till the following Monday. - -As most young candidates for the pains and penalties of whaling stop -at this same New Bedford, thence to embark on their voyage, it may as -well be related that I, for one, had no idea of so doing. For my -mind was made up to sail in no other than a Nantucket craft, because -there was a fine, boisterous something about everything connected -with that famous old island, which amazingly pleased me. Besides -though New Bedford has of late been gradually monopolising the -business of whaling, and though in this matter poor old Nantucket is -now much behind her, yet Nantucket was her great original--the Tyre -of this Carthage;--the place where the first dead American whale was -stranded. Where else but from Nantucket did those aboriginal -whalemen, the Red-Men, first sally out in canoes to give chase to the -Leviathan? 
And where but from Nantucket, too, did that first -adventurous little sloop put forth, partly laden with imported -cobblestones--so goes the story--to throw at the whales, in order to -discover when they were nigh enough to risk a harpoon from the -bowsprit? - -Now having a night, a day, and still another night following before -me in New Bedford, ere I could embark for my destined port, it -became a matter of concernment where I was to eat and sleep -meanwhile. It was a very dubious-looking, nay, a very dark and -dismal night, bitingly cold and cheerless. I knew no one in the -place. With anxious grapnels I had sounded my pocket, and only -brought up a few pieces of silver,--So, wherever you go, Ishmael, -said I to myself, as I stood in the middle of a dreary street -shouldering my bag, and comparing the gloom towards the north with -the darkness towards the south--wherever in your wisdom you may -conclude to lodge for the night, my dear Ishmael, be sure to inquire -the price, and don't be too particular. - -With halting steps I paced the streets, and passed the sign of "The -Crossed Harpoons"--but it looked too expensive and jolly there. -Further on, from the bright red windows of the "Sword-Fish Inn," -there came such fervent rays, that it seemed to have melted the -packed snow and ice from before the house, for everywhere else the -congealed frost lay ten inches thick in a hard, asphaltic -pavement,--rather weary for me, when I struck my foot against the -flinty projections, because from hard, remorseless service the soles -of my boots were in a most miserable plight. Too expensive and -jolly, again thought I, pausing one moment to watch the broad glare -in the street, and hear the sounds of the tinkling glasses within. -But go on, Ishmael, said I at last; don't you hear? get away from -before the door; your patched boots are stopping the way. So on I -went. I now by instinct followed the streets that took me waterward, -for there, doubtless, were the cheapest, if not the cheeriest inns. -''' - - question1 = '''Continue this story for 5000 words, do not stop, keep generating a long rambling story: - -Call me Ishmael. Some years ago--never mind how long -precisely--having little or no money in my purse, and nothing -particular to interest me on shore, I thought I would sail about a -little and see the watery part of the world. It is a way I have of -driving off the spleen and regulating the circulation. Whenever I -find myself growing grim about the mouth; whenever it is a damp, -drizzly November in my soul; whenever I find myself involuntarily -pausing before coffin warehouses, and bringing up the rear of every -funeral I meet; and especially whenever my hypos get such an upper -hand of me, that it requires a strong moral principle to prevent me -from deliberately stepping into the street, and methodically knocking -people's hats off--then, I account it high time to get to sea as soon -as I can. This is my substitute for pistol and ball. With a -philosophical flourish Cato throws himself upon his sword; I quietly -take to the ship. There is nothing surprising in this. If they but -knew it, almost all men in their degree, some time or other, cherish -very nearly the same feelings towards the ocean with me. - -There now is your insular city of the Manhattoes, belted round by -wharves as Indian isles by coral reefs--commerce surrounds it with -her surf. Right and left, the streets take you waterward. 
Its -extreme downtown is the battery, where that noble mole is washed by -waves, and cooled by breezes, which a few hours previous were out of -sight of land. Look at the crowds of water-gazers there. - -Circumambulate the city of a dreamy Sabbath afternoon. Go from -Corlears Hook to Coenties Slip, and from thence, by Whitehall, -northward. What do you see?--Posted like silent sentinels all around -the town, stand thousands upon thousands of mortal men fixed in ocean -reveries. Some leaning against the spiles; some seated upon the -pier-heads; some looking over the bulwarks of ships from China; some -high aloft in the rigging, as if striving to get a still better -seaward peep. But these are all landsmen; of week days pent up in -lath and plaster--tied to counters, nailed to benches, clinched to -desks. How then is this? Are the green fields gone? What do they -here? - -But look! here come more crowds, pacing straight for the water, and -seemingly bound for a dive. Strange! Nothing will content them but -the extremest limit of the land; loitering under the shady lee of -yonder warehouses will not suffice. No. They must get just as nigh -the water as they possibly can without falling in. And there they -stand--miles of them--leagues. Inlanders all, they come from lanes -and alleys, streets and avenues--north, east, south, and west. Yet -here they all unite. Tell me, does the magnetic virtue of the -needles of the compasses of all those ships attract them thither?''' - - question6 = '''Please take this list and continue generating it. Keep adding the lines seen in this list again and again. - Do not stop, just extend this list please. Do not generate code. Just manually generate the list. Please do not stop, keep generating again and again. - -hello, world, cat, dog -hello, world, cat, dog -hello, world, cat, dog -hello, world, cat, dog -hello, world, cat, dog -hello, world, cat, dog -hello, world, cat, dog -hello, world, cat, dog -hello, world, cat, dog -hello, world, cat, dog -hello, world, cat, dog -hello, world, cat, dog -hello, world, cat, dog -hello, world, cat, dog -hello, world, cat, dog -hello, world, cat, dog -hello, world, cat, dog -hello, world, cat, dog -hello, world, cat, dog -hello, world, cat, dog -hello, world, cat, dog -hello, world, cat, dog -hello, world, cat, dog -hello, world, cat, dog -hello, world, cat, dog -hello, world, cat, dog -hello, world, cat, dog -hello, world, cat, dog -hello, world, cat, dog -hello, world, cat, dog -hello, world, cat, dog -hello, world, cat, dog -hello, world, cat, dog -hello, world, cat, dog -hello, world, cat, dog -hello, world, cat, dog -hello, world, cat, dog -hello, world, cat, dog -hello, world, cat, dog -hello, world, cat, dog -hello, world, cat, dog -hello, world, cat, dog -hello, world, cat, dog -hello, world, cat, dog -hello, world, cat, dog -hello, world, cat, dog -hello, world, cat, dog -hello, world, cat, dog -hello, world, cat, dog -hello, world, cat, dog -hello, world, cat, dog -hello, world, cat, dog -hello, world, cat, dog -hello, world, cat, dog -hello, world, cat, dog -hello, world, cat, dog -hello, world, cat, dog -hello, world, cat, dog -hello, world, cat, dog -hello, world, cat, dog -hello, world, cat, dog -hello, world, cat, dog -hello, world, cat, dog -hello, world, cat, dog -hello, world, cat, dog -hello, world, cat, dog -hello, world, cat, dog -hello, world, cat, dog -hello, world, cat, dog -hello, world, cat, dog -hello, world, cat, dog -hello, world, cat, dog -hello, world, cat, dog -hello, world, cat, dog -hello, 
world, cat, dog -hello, world, cat, dog -hello, world, cat, dog -hello, world, cat, dog -hello, world, cat, dog -hello, world, cat, dog -hello, world, cat, dog -hello, world, cat, dog -hello, world, cat, dog -hello, world, cat, dog -hello, world, cat, dog -hello, world, cat, dog -hello, world, cat, dog -hello, world, cat, dog -hello, world, cat, dog -hello, world, cat, dog -hello, world, cat, dog -hello, world, cat, dog -hello, world, cat, dog -hello, world, cat, dog -hello, world, cat, dog -hello, world, cat, dog -hello, world, cat, dog -hello, world, cat, dog -hello, world, cat, dog -hello, world, cat, dog -hello, world, cat, dog -hello, world, cat, dog -hello, world, cat, dog -hello, world, cat, dog -hello, world, cat, dog -hello, world, cat, dog -hello, world, cat, dog -hello, world, cat, dog -hello, world, cat, dog -hello, world, cat, dog -hello, world, cat, dog -hello, world, cat, dog -hello, world, cat, dog -hello, world, cat, dog -hello, world, cat, dog -hello, world, cat, dog -hello, world, cat, dog -hello, world, cat, dog -hello, world, cat, dog -hello, world, cat, dog -hello, world, cat, dog -hello, world, cat, dog -hello, world, cat, dog -hello, world, cat, dog -hello, world, cat, dog -hello, world, cat, dog -hello, world, cat, dog -hello, world, cat, dog -hello, world, cat, dog -hello, world, cat, dog -hello, world, cat, dog -hello, world, cat, dog -hello, world, cat, dog -hello, world, cat, dog -hello, world, cat, dog -hello, world, cat, dog -hello, world, cat, dog -hello, world, cat, dog -hello, world, cat, dog -hello, world, cat, dog -hello, world, cat, dog -hello, world, cat, dog -hello, world, cat, dog -hello, world, cat, dog -hello, world, cat, dog -hello, world, cat, dog -hello, world, cat, dog -hello, world, cat, dog -hello, world, cat, dog -hello, world, cat, dog -hello, world, cat, dog -hello, world, cat, dog -hello, world, cat, dog -hello, world, cat, dog -hello, world, cat, dog -hello, world, cat, dog -hello, world, cat, dog -hello, world, cat, dog -hello, world, cat, dog -hello, world, cat, dog -hello, world, cat, dog -hello, world, cat, dog -hello, world, cat, dog -hello, world, cat, dog -hello, world, cat, dog -hello, world, cat, dog -hello, world, cat, dog -hello, world, cat, dog -hello, world, cat, dog -hello, world, cat, dog -hello, world, cat, dog -hello, world, cat, dog -hello, world, cat, dog -hello, world, cat, dog -hello, world, cat, dog -hello, world, cat, dog -hello, world, cat, dog -hello, world, cat, dog -hello, world, cat, dog -hello, world, cat, dog -hello, world, cat, dog -hello, world, cat, dog -hello, world, cat, dog -hello, world, cat, dog -hello, world, cat, dog -hello, world, cat, dog -hello, world, cat, dog -hello, world, cat, dog -hello, world, cat, dog -hello, world, cat, dog -hello, world, cat, dog -hello, world, cat, dog -hello, world, cat, dog -hello, world, cat, dog -hello, world, cat, dog -hello, world, cat, dog -hello, world, cat, dog -hello, world, cat, dog -hello, world, cat, dog -hello, world, cat, dog -hello, world, cat, dog -hello, world, cat, dog -hello, world, cat, dog -hello, world, cat, dog -hello, world, cat, dog -hello, world, cat, dog -hello, world, cat, dog -hello, world, cat, dog -hello, world, cat, dog -hello, world, cat, dog -hello, world, cat, dog -hello, world, cat, dog -hello, world, cat, dog -hello, world, cat, dog -hello, world, cat, dog -hello, world, cat, dog -hello, world, cat, dog -hello, world, cat, dog -hello, world, cat, dog -hello, world, cat, dog -hello, world, cat, dog -hello, world, cat, dog -hello, 
world, cat, dog -hello, world, cat, dog -hello, world, cat, dog -hello, world, cat, dog -hello, world, cat, dog -hello, world, cat, dog -hello, world, cat, dog -hello, world, cat, dog -hello, world, cat, dog -hello, world, cat, dog -hello, world, cat, dog -hello, world, cat, dog -hello, world, cat, dog -hello, world, cat, dog -hello, world, cat, dog -hello, world, cat, dog -hello, world, cat, dog -hello, world, cat, dog -hello, world, cat, dog -hello, world, cat, dog -hello, world, cat, dog -hello, world, cat, dog -hello, world, cat, dog -hello, world, cat, dog -hello, world, cat, dog -hello, world, cat, dog -hello, world, cat, dog -hello, world, cat, dog -hello, world, cat, dog -hello, world, cat, dog -hello, world, cat, dog -hello, world, cat, dog -hello, world, cat, dog -hello, world, cat, dog -hello, world, cat, dog -hello, world, cat, dog -hello, world, cat, dog -hello, world, cat, dog -hello, world, cat, dog -hello, world, cat, dog -hello, world, cat, dog -hello, world, cat, dog -hello, world, cat, dog -hello, world, cat, dog -hello, world, cat, dog -hello, world, cat, dog -hello, world, cat, dog -hello, world, cat, dog -hello, world, cat, dog -hello, world, cat, dog -hello, world, cat, dog -hello, world, cat, dog -hello, world, cat, dog -hello, world, cat, dog -hello, world, cat, dog -hello, world, cat, dog -hello, world, cat, dog -hello, world, cat, dog -hello, world, cat, dog -hello, world, cat, dog -hello, world, cat, dog -hello, world, cat, dog -hello, world, cat, dog -hello, world, cat, dog -hello, world, cat, dog -hello, world, cat, dog -hello, world, cat, dog -hello, world, cat, dog -hello, world, cat, dog -hello, world, cat, dog -hello, world, cat, dog -hello, world, cat, dog -hello, world, cat, dog -hello, world, cat, dog -hello, world, cat, dog -hello, world, cat, dog -hello, world, cat, dog -hello, world, cat, dog -hello, world, cat, dog -hello, world, cat, dog -hello, world, cat, dog -hello, world, cat, dog -hello, world, cat, dog -hello, world, cat, dog -hello, world, cat, dog -hello, world, cat, dog -hello, world, cat, dog -hello, world, cat, dog -hello, world, cat, dog -hello, world, cat, dog -hello, world, cat, dog -hello, world, cat, dog -hello, world, cat, dog -hello, world, cat, dog -hello, world, cat, dog -hello, world, cat, dog -hello, world, cat, dog -hello, world, cat, dog -hello, world, cat, dog -hello, world, cat, dog -hello, world, cat, dog -hello, world, cat, dog -hello, world, cat, dog -hello, world, cat, dog -hello, world, cat, dog -hello, world, cat, dog -hello, world, cat, dog -hello, world, cat, dog -hello, world, cat, dog -hello, world, cat, dog -hello, world, cat, dog -hello, world, cat, dog -hello, world, cat, dog -hello, world, cat, dog -hello, world, cat, dog -hello, world, cat, dog -hello, world, cat, dog -hello, world, cat, dog -hello, world, cat, dog -hello, world, cat, dog -hello, world, cat, dog -hello, world, cat, dog -hello, world, cat, dog -hello, world, cat, dog -hello, world, cat, dog -hello, world, cat, dog -hello, world, cat, dog -hello, world, cat, dog -hello, world, cat, dog -hello, world, cat, dog -hello, world, cat, dog -hello, world, cat, dog -hello, world, cat, dog -hello, world, cat, dog -hello, world, cat, dog -hello, world, cat, dog -hello, world, cat, dog -hello, world, cat, dog -hello, world, cat, dog -hello, world, cat, dog -hello, world, cat, dog -hello, world, cat, dog -hello, world, cat, dog -hello, world, cat, dog -hello, world, cat, dog -hello, world, cat, dog -hello, world, cat, dog -hello, world, cat, dog -hello, 
world, cat, dog -hello, world, cat, dog -hello, world, cat, dog -hello, world, cat, dog -hello, world, cat, dog -hello, world, cat, dog -hello, world, cat, dog -hello, world, cat, dog -hello, world, cat, dog -hello, world, cat, dog -hello, world, cat, dog -hello, world, cat, dog -hello, world, cat, dog -hello, world, cat, dog -hello, world, cat, dog -hello, world, cat, dog -hello, world, cat, dog -hello, world, cat, dog -hello, world, cat, dog -hello, world, cat, dog -hello, world, cat, dog -hello, world, cat, dog -hello, world, cat, dog -hello, world, cat, dog -hello, world, cat, dog -hello, world, cat, dog -hello, world, cat, dog -hello, world, cat, dog -hello, world, cat, dog -hello, world, cat, dog -hello, world, cat, dog -hello, world, cat, dog -hello, world, cat, dog -hello, world, cat, dog -hello, world, cat, dog -hello, world, cat, dog -hello, world, cat, dog -hello, world, cat, dog -hello, world, cat, dog -hello, world, cat, dog -hello, world, cat, dog -hello, world, cat, dog -hello, world, cat, dog -hello, world, cat, dog -hello, world, cat, dog -hello, world, cat, dog -hello, world, cat, dog -hello, world, cat, dog -hello, world, cat, dog -hello, world, cat, dog -hello, world, cat, dog -hello, world, cat, dog -hello, world, cat, dog -hello, world, cat, dog -hello, world, cat, dog -hello, world, cat, dog -hello, world, cat, dog -hello, world, cat, dog -hello, world, cat, dog -hello, world, cat, dog -hello, world, cat, dog -hello, world, cat, dog -hello, world, cat, dog -hello, world, cat, dog -hello, world, cat, dog -hello, world, cat, dog -hello, world, cat, dog -hello, world, cat, dog -hello, world, cat, dog -hello, world, cat, dog -hello, world, cat, dog -hello, world, cat, dog -hello, world, cat, dog -hello, world, cat, dog -hello, world, cat, dog -hello, world, cat, dog -hello, world, cat, dog -hello, world, cat, dog -hello, world, cat, dog -hello, world, cat, dog -hello, world, cat, dog -hello, world, cat, dog -hello, world, cat, dog -hello, world, cat, dog -hello, world, cat, dog -hello, world, cat, dog -hello, world, cat, dog -hello, world, cat, dog -hello, world, cat, dog -hello, world, cat, dog -hello, world, cat, dog -hello, world, cat, dog -hello, world, cat, dog -hello, world, cat, dog''' - diff --git a/vllm/model_executor/models/gemma3.py b/vllm/model_executor/models/gemma3.py index 05df1521463..4e0d4f84ca6 100644 --- a/vllm/model_executor/models/gemma3.py +++ b/vllm/model_executor/models/gemma3.py @@ -397,12 +397,7 @@ def forward( assert intermediate_tensors is not None hidden_states = intermediate_tensors["hidden_states"] residual = intermediate_tensors["residual"] - for idx, layer in enumerate(self.layers[self.start_layer:self.end_layer]): - #print(f'text: {idx}, {hidden_states.sum()} {residual}') - #if idx == 0 and hidden_states.shape[-1] < 1300: # 1280 vs 1244 - # # when compared with hidden_states[...,:1244,:].. 
matches GPU exactly - # breakpoint() - # print() + for layer in self.layers[self.start_layer:self.end_layer]: hidden_states, residual = layer( positions, hidden_states, diff --git a/vllm/model_executor/models/gemma3_mm.py b/vllm/model_executor/models/gemma3_mm.py index c7307a2b501..2b0fbc6f3ed 100644 --- a/vllm/model_executor/models/gemma3_mm.py +++ b/vllm/model_executor/models/gemma3_mm.py @@ -566,126 +566,7 @@ def _process_image_input( self.vision_tower, pixel_values, ) - #breakpoint() - ''' - pixel_values: 1x3x896x896 for HPU, 2x3x896x896 for GPU, ...its a warmup thing - In second step, both HPU and GPU have same values - - however image_features is diff in hpu vs gpu (shape [1, 4096, 1152])) - - printing values b4 each layer in siglip: - GPU - 0 -> 4718592.0 -1 -> 5275648.0 -2 -> 5505024.0 -3 -> 5505024.0 -4 -> 5537792.0 -5 -> 5537792.0 -6 -> 5570560.0 -7 -> 5603328.0 -8 -> 5603328.0 -9 -> 5603328.0 -10 -> 5570560.0 -11 -> 5505024.0 -12 -> 5439488.0 -13 -> 5373952.0 -14 -> 5275648.0 -15 -> 5210112.0 -16 -> 5111808.0 -17 -> 5046272.0 -18 -> 5013504.0 -19 -> 4980736.0 -20 -> 4980736.0 -21 -> 5013504.0 -22 -> 5046272.0 -23 -> 5013504.0 -24 -> 5013504.0 -25 -> 5013504.0 -26 -> 5079040.0 - -HPU: -0 -> 4718592.0 -1 -> 5275648.0 -2 -> 5505024.0 -3 -> 5505024.0 -4 -> 5537792.0 -5 -> 5537792.0 -6 -> 5570560.0 -7 -> 5603328.0 -8 -> 5603328.0 -9 -> 5603328.0 -10 -> 5570560.0 -11 -> 5505024.0 -12 -> 5439488.0 -13 -> 5373952.0 -14 -> 5308416.0 -15 -> 5210112.0 -16 -> 5111808.0 -17 -> 5046272.0 -18 -> 5013504.0 -19 -> 5013504.0 -20 -> 5013504.0 -21 -> 5013504.0 -22 -> 5046272.0 -23 -> 5046272.0 -24 -> 5013504.0 -25 -> 5013504.0 -26 -> 5079040.0 - -however final sum is diff: - image_features.sum() -tensor(18432., device='hpu:0', dtype=torch.bfloat16) - image_features.sum() -tensor(18304., device='cuda:0', dtype=torch.bfloat16) - - -GPU: -25 -> 5013504.0 ... tensor([[ 2.8125, 0.7070, 3.8594, -5.2188], - [-0.9336, 0.4375, 1.2812, -0.1250], - [-1.1250, 0.5859, 1.7422, 0.9141], - [ 3.2188, 1.4609, 1.6250, 0.9844]], device='cuda:0', - dtype=torch.bfloat16) -26 -> 5079040.0 ... tensor([[ 3.8125, 1.6406, 5.1250, -5.7500], - [-2.2812, 0.1641, 0.6094, -0.8711], - [-3.0000, 0.6172, 1.2031, 0.1328], - [ 3.3594, 1.3281, 2.3281, 0.6172]], device='cuda:0', - dtype=torch.bfloat16) -BEFORE resolve_visual_encoder_outputs tensor(7274496., device='cuda:0', dtype=torch.bfloat16) tensor([[ -44.2500, -39.0000, -17.7500, -124.5000], - [ -2.1250, 0.4512, -0.7734, -2.6406], - [ -2.2812, 1.5781, 0.1875, 0.5000], - [ -49.2500, -40.5000, -20.1250, -127.0000]], device='cuda:0', - dtype=torch.bfloat16) -AFTER resolve_visual_encoder_outputs tensor(18304., device='cuda:0', dtype=torch.bfloat16) tensor([[-0.0119, 0.0135, 0.0248, 0.0549], - [-1.1094, -0.0742, -0.7266, -0.6953], - [-3.0000, 0.3496, -0.9766, 0.1963], - [-0.0295, 0.0193, 0.0148, 0.1006]], device='cuda:0', - - -HPU: -25 -> 5013504.0... tensor([[ 2.9531, 1.2188, 4.1250, -5.0625], - [-1.0625, 0.4258, 1.3281, -0.1250], - [-1.1562, 0.5703, 1.6484, 0.8438], - [ 3.2188, 1.4375, 1.7188, 0.8906]], device='hpu:0', - dtype=torch.bfloat16) -26 -> 5079040.0... 
tensor([[ 3.5938, 2.1406, 4.7812, -5.3125], - [-2.5000, 0.1211, 0.6367, -0.7578], - [-3.0000, 0.5781, 1.0938, 0.0938], - [ 3.4844, 1.3125, 2.4844, 0.4922]], device='hpu:0', - dtype=torch.bfloat16) -BEFORE resolve_visual_encoder_outputs tensor(7241728., device='hpu:0', dtype=torch.bfloat16) tensor([[-2.2500e+01, -2.4000e+01, -1.0562e+01, -7.8500e+01], - [-2.0938e+00, 5.3125e-01, -5.8984e-01, -2.0000e+00], - [-2.3125e+00, 1.5000e+00, 5.4688e-02, 4.1406e-01], - [-4.9250e+01, -4.0500e+01, -2.0125e+01, -1.2700e+02]], device='hpu:0', - dtype=torch.bfloat16) -AFTER resolve_visual_encoder_outputs tensor(18432., device='hpu:0', dtype=torch.bfloat16) tensor([[ 3.4180e-02, -1.2207e-03, 1.7944e-02, -1.6602e-02], - [-1.4062e+00, -1.1914e-01, -8.3203e-01, -7.4219e-01], - [-3.0156e+00, 2.8906e-01, -1.0859e+00, 1.1572e-01], - [-2.6978e-02, 2.1118e-02, 1.5747e-02, 1.0791e-01]], device='hpu:0', - dtype=torch.bfloat16) - -HPU vs GPU is about the same.... kind of sort of - - ''' + image_embeds = self.multi_modal_projector(image_features) return [ @@ -724,14 +605,12 @@ def forward(self, intermediate_tensors: Optional[IntermediateTensors] = None, inputs_embeds: Optional[torch.Tensor] = None, **kwargs: object) -> IntermediateTensors: - #breakpoint() if intermediate_tensors is not None: inputs_embeds = None # NOTE: In v1, inputs_embeds is always generated at model runner, this # condition is for v0 compatibility. elif inputs_embeds is None: - #breakpoint() vision_embeddings = self.get_multimodal_embeddings(**kwargs) inputs_embeds = self.get_input_embeddings(input_ids, @@ -745,8 +624,6 @@ def forward(self, ) input_ids = None - - #breakpoint() hidden_states = self.language_model.model(input_ids, positions, intermediate_tensors, diff --git a/vllm/model_executor/models/siglip.py b/vllm/model_executor/models/siglip.py index 2eba5065f39..75fcf540b0b 100644 --- a/vllm/model_executor/models/siglip.py +++ b/vllm/model_executor/models/siglip.py @@ -313,8 +313,7 @@ def forward( hidden_states_pool = [inputs_embeds] hidden_states = inputs_embeds - for idx, encoder_layer in enumerate(self.layers): - #print(f'{idx} -> {hidden_states.sum()}... {hidden_states[0,:4,:4]}') + for encoder_layer in self.layers: hidden_states, _ = encoder_layer(hidden_states) if return_all_hidden_states: hidden_states_pool.append(hidden_states) @@ -432,11 +431,10 @@ def forward( ) # Handle post-norm (if applicable) and stacks feature layers if needed - #print(f'BEFORE resolve_visual_encoder_outputs', encoder_outputs.sum(), encoder_outputs[0, :4, :4]) encoder_outputs = resolve_visual_encoder_outputs( encoder_outputs, feature_sample_layers, self.post_layernorm, self.config.num_hidden_layers) - #print(f'AFTER resolve_visual_encoder_outputs', encoder_outputs.sum(), encoder_outputs[0, :4, :4]) + # TODO: add this back when pooled_output is used in inference. 
# if self.use_head: # pooled_output = self.head(encoder_outputs) From 7df18113b438d8c1efd922dab7a773c28d83513b Mon Sep 17 00:00:00 2001 From: Jimin Ha Date: Fri, 30 May 2025 09:02:30 -0700 Subject: [PATCH 10/31] Remove test file --- gemma3.sh | 31 ------------------------------- 1 file changed, 31 deletions(-) delete mode 100755 gemma3.sh diff --git a/gemma3.sh b/gemma3.sh deleted file mode 100755 index b246338c06d..00000000000 --- a/gemma3.sh +++ /dev/null @@ -1,31 +0,0 @@ -export VLLM_SKIP_WARMUP=true -export LLM_MODEL_ID=google/gemma-3-27b-it -export HF_TOKEN=xxx -export PT_HPUGRAPH_DISABLE_TENSOR_CACHE=false -export DATA_PATH=~/data -export MAX_TOTAL_TOKENS=500 -export VLLM_USE_V1=0 -export PT_HPU_LAZY_MODE=1 -export VLLM_FP32_SOFTMAX=1 -export VLLM_PROMPT_USE_FUSEDSDPA=False -#python vllm-gemma3-offline.py - -#export VLLM_PROMPT_USE_FUSEDSDPA=False - - -#export VLLM_PROMPT_BS_BUCKET_MIN=1 -#export VLLM_PROMPT_BS_BUCKET_STEP=1 -#export VLLM_PROMPT_BS_BUCKET_MAX=1 -#export VLLM_DECODE_BS_BUCKET_MIN=1 -#export VLLM_DECODE_BS_BUCKET_STEP=1 -#export VLLM_DECODE_BS_BUCKET_MAX=1 - -#export VLLM_PROMPT_SEQ_BUCKET_MIN=128 -#export VLLM_PROMPT_SEQ_BUCKET_STEP=512 -#export VLLM_PROMPT_SEQ_BUCKET_MAX=2048 -#export VLLM_DECODE_BLOCK_BUCKET_MIN=128 -#export VLLM_DECODE_BLOCK_BUCKET_STEP=128 -#export VLLM_DECODE_BLOCK_BUCKET_MAX=2048 - - -python gemma3.py 5,1 From 6088039599abbae79919d10bdda1dc827e41d441 Mon Sep 17 00:00:00 2001 From: Sayantan Sarkar Date: Tue, 3 Jun 2025 17:10:33 +0000 Subject: [PATCH 11/31] Enable bs>1 --- vllm/attention/backends/hpu_attn.py | 14 +++- vllm/model_executor/models/gemma3.py | 5 +- vllm/worker/hpu_model_runner.py | 108 +++++++++++++++++++++++---- 3 files changed, 107 insertions(+), 20 deletions(-) diff --git a/vllm/attention/backends/hpu_attn.py b/vllm/attention/backends/hpu_attn.py index 580c3b465fc..56f71e755e5 100644 --- a/vllm/attention/backends/hpu_attn.py +++ b/vllm/attention/backends/hpu_attn.py @@ -136,6 +136,12 @@ class HPUAttentionMetadata(HPUPagedAttentionMetadata, AttentionMetadata): cross_block_groups: Optional[torch.Tensor] = None cross_block_usage: Optional[torch.Tensor] = None cross_attn_bias: Optional[torch.Tensor] = None + window_block_list: Optional[torch.Tensor] = None + window_slot_mapping: Optional[torch.Tensor] = None + window_block_mapping: Optional[torch.Tensor] = None + window_block_groups: Optional[torch.Tensor] = None + window_block_usage: Optional[torch.Tensor] = None + window_attn_bias: Optional[torch.Tensor] = None @dataclass @@ -530,10 +536,10 @@ def forward( output = out.reshape(batch_size, seq_len, hidden_size) else: # Decoding run. 
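+            # NOTE: for sliding-window layers the decode path below switches to
+            # the window_* metadata; those tensors are built in the model runner
+            # from only the last interleaved_sliding_window // block_size blocks
+            # of each sequence's block table (e.g. 1024 // 128 = 8 blocks,
+            # assuming the default HPU block size of 128).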
- block_groups = attn_metadata.block_groups - block_mapping = attn_metadata.block_mapping - block_list = attn_metadata.block_list - attn_bias= attn_metadata.attn_bias + block_list = attn_metadata.block_list if not self.sliding_window else attn_metadata.window_block_list + block_groups = attn_metadata.block_groups if not self.sliding_window else attn_metadata.window_block_groups + block_mapping = attn_metadata.block_mapping if not self.sliding_window else attn_metadata.window_block_mapping + attn_bias = attn_metadata.attn_bias if not self.sliding_window else attn_metadata.window_attn_bias if self.sliding_window: block_size = len(attn_metadata.block_groups) diff --git a/vllm/model_executor/models/gemma3.py b/vllm/model_executor/models/gemma3.py index 4e0d4f84ca6..94ce4ea20d8 100644 --- a/vllm/model_executor/models/gemma3.py +++ b/vllm/model_executor/models/gemma3.py @@ -250,7 +250,7 @@ def naive_attn_with_masks( seq_lens = kwargs["seq_lens"] start_idx = 0 - for seq_len, attn_mask in zip(seq_lens, attn_masks): + for cnt, (seq_len, attn_mask) in enumerate(zip(seq_lens, attn_masks)): end_idx = start_idx + seq_len query = q[start_idx:end_idx].unsqueeze(0) key = k[start_idx:end_idx].unsqueeze(0) @@ -269,7 +269,8 @@ def naive_attn_with_masks( self.scaling, ) output = output.transpose(1, 2).flatten(-2, -1) - out[start_idx:end_idx] = output + #out[start_idx:end_idx] = output + out[cnt:cnt+1, start_idx:end_idx, :] = output start_idx = end_idx return out diff --git a/vllm/worker/hpu_model_runner.py b/vllm/worker/hpu_model_runner.py index 9883b47ba2a..f1b8bb24fdf 100755 --- a/vllm/worker/hpu_model_runner.py +++ b/vllm/worker/hpu_model_runner.py @@ -357,38 +357,53 @@ def _set_attn_bias(self, attn_metadata, batch_size, seq_len, device, attn_bias=attn_bias) return attn_metadata - def _set_block_mapping(self, metadata, batch_size, device, dtype): + def _set_block_mapping(self, metadata, batch_size, device, dtype, is_window_block): + + block_usage = metadata.block_usage if not is_window_block else metadata.window_block_usage + block_groups = metadata.block_groups if not is_window_block else metadata.window_block_groups mask = torch.arange(0, self.block_size, device=device, dtype=torch.int32).unsqueeze(0) - mask = mask >= metadata.block_usage.unsqueeze(-1) + mask = mask >= block_usage.unsqueeze(-1) attn_bias = (torch.zeros_like(mask, dtype=dtype).masked_fill_( mask, -math.inf)) if not is_fake_hpu(): - block_mapping = torch.nn.functional.one_hot(metadata.block_groups, + block_mapping = torch.nn.functional.one_hot(block_groups, num_classes=batch_size) else: # Unfortunately one_hot on CPU # doesn't handle out of bounds classes so we need to convert # all negative values to 0 (block_mapping) or bs (block_groups) - block_groups = metadata.block_groups.to(torch.long) + block_groups = block_groups.to(torch.long) block_mapping = torch.nn.functional.relu(block_groups) block_mapping = torch.nn.functional.one_hot(block_mapping, num_classes=batch_size) oob_values = block_groups.lt(0) block_mapping.masked_fill_(oob_values.unsqueeze(-1), 0) block_groups.masked_fill_(oob_values, batch_size) + if not is_window_block: + metadata = custom_tuple_replace(metadata, + "TrimmedAttentionMetadata", + block_groups=block_groups) + else: + metadata = custom_tuple_replace(metadata, + "TrimmedAttentionMetadata", + window_block_groups=block_groups) + + block_mapping = block_mapping.to(dtype) + if not is_window_block: metadata = custom_tuple_replace(metadata, "TrimmedAttentionMetadata", - block_groups=block_groups) - block_mapping = 
block_mapping.to(dtype) - metadata = custom_tuple_replace(metadata, - "TrimmedAttentionMetadata", - block_mapping=block_mapping, - attn_bias=attn_bias) + block_mapping=block_mapping, + attn_bias=attn_bias) + else: + metadata = custom_tuple_replace(metadata, + "TrimmedAttentionMetadata", + window_block_mapping=block_mapping, + window_attn_bias=attn_bias) return metadata def _update_metadata(self, attn_metadata, batch_size, seq_len, device, @@ -399,7 +414,11 @@ def _update_metadata(self, attn_metadata, batch_size, seq_len, device, seq_len, device, dtype) else: attn_metadata = self._set_block_mapping(attn_metadata, batch_size, - device, dtype) + device, dtype, False) + if attn_metadata.window_block_list is not None: + attn_metadata = self._set_block_mapping(attn_metadata, batch_size, + device, dtype, + True) return attn_metadata def _prepare_cos_sin(self, positions): @@ -1472,6 +1491,7 @@ def _prepare_decode( encoder_seq_lens: List[int] = [] cross_block_tables: List[List[int]] = [] block_tables: List[List[int]] = [] + window_block_tables: List[List[int]] = [] lora_index_mapping: List[List[int]] = [] lora_prompt_mapping: List[List[int]] = [] lora_requests: Set[LoRARequest] = set() @@ -1555,6 +1575,14 @@ def _prepare_decode( block_table = block_table[-sliding_window_blocks:] block_tables.append(block_table) + #TODO: There are many places which checks this config parameter, however this is + #very specific config to gemma3, we should first check if this parameter even exist before check. + if self.model_config.hf_text_config.interleaved_sliding_window is not None: + sliding_window_blocks = (self.model_config.hf_text_config.interleaved_sliding_window // + self.block_size) + window_block_table = block_table[-sliding_window_blocks:] + window_block_tables.append(window_block_table) + if output is None: input_tokens = torch.tensor(input_tokens, dtype=torch.long, @@ -1585,6 +1613,23 @@ def _prepare_decode( assert len(block_list) == len(block_groups) assert len(block_list) == len(block_usage) + if self.model_config.hf_text_config.interleaved_sliding_window is not None: + window_block_groups = [[i] * len(bt) for i, bt in enumerate(window_block_tables)] + window_block_usage = [[self.block_size] * (len(bt) - 1) + [lbu] + for bt, lbu in zip(block_tables, last_block_usage) + if bt] + + window_block_list = flatten(window_block_tables) + window_block_groups = flatten(window_block_groups) + window_block_usage = flatten(window_block_usage) + + assert len(window_block_list) == len(window_block_groups) + assert len(window_block_list) == len(window_block_list) + else: + window_block_list = None + window_block_groups = None + window_block_usage = None + if is_enc_dec_model: last_cross_block_usage = [ (encoder_seq_len - 1) % self.block_size + 1 @@ -1620,6 +1665,14 @@ def _prepare_decode( indices[bid] = i padding_fn = lambda tensor, pad_value: gather_list( tensor, indices, pad_value) + if self.model_config.hf_text_config.interleaved_sliding_window is not None: + window_indices: List[Any] + window_indices = [None] * block_bucket_size + for i, bid in enumerate(window_block_list): + window_indices[bid] = i + window_padding_fn = lambda tensor, pad_value: gather_list( + tensor, window_indices, pad_value) + else: block_bucket_size = self.bucketing_ctx.get_padded_decode_num_blocks( len(block_list)) @@ -1630,6 +1683,12 @@ def _prepare_decode( block_groups = padding_fn(block_groups, -1) block_usage = padding_fn(block_usage, 1) + if self.model_config.hf_text_config.interleaved_sliding_window is not None: + window_block_list = 
window_padding_fn(window_block_list, _PAD_BLOCK_ID) + window_block_groups = window_padding_fn(window_block_groups, -1) + #window_block_usage = window_padding_fn(window_block_usage, 1) + window_block_usage = [1 if i == 0 else block_usage[idx] for idx, (i, j) in enumerate(zip(window_block_list, block_usage))] + if is_enc_dec_model: if self.use_contiguous_pa: cross_block_bucket_size = max( @@ -1707,9 +1766,21 @@ def _prepare_decode( encoder_seq_lens_tensor = encoder_seq_lens_tensor.to( # type: ignore self.device, non_blocking=True) - #print(f"block_list: :{block_list}") - #print(f"block_groups: :{block_groups}") - #print(f"block_usage: :{block_usage}") + if self.model_config.hf_text_config.interleaved_sliding_window is not None: + window_block_list = torch.tensor(window_block_list, dtype=torch.int, device='cpu') + window_block_groups = torch.tensor(window_block_groups, + dtype=torch.int, + device='cpu') + window_block_usage = torch.tensor(window_block_usage, + dtype=self.model_config.dtype, + device='cpu') + + window_block_list = window_block_list.to( # type: ignore + self.device, non_blocking=True) + window_block_groups = window_block_groups.to( # type: ignore + self.device, non_blocking=True) + window_block_usage = window_block_usage.to( # type: ignore + self.device, non_blocking=True) attn_metadata = self.attn_backend.make_metadata( is_prompt=False, @@ -1718,6 +1789,10 @@ def _prepare_decode( block_mapping=None, block_usage=block_usage, block_groups=block_groups, + window_block_list=window_block_list, + window_block_mapping=None, + window_block_usage=window_block_usage, + window_block_groups=window_block_groups, attn_bias=None, seq_lens_tensor=None, encoder_seq_lens=encoder_seq_lens, @@ -2046,6 +2121,11 @@ def trim_attn_metadata(self, metadata: AttentionMetadata) -> object: 'block_size', 'block_groups', 'input_positions', + 'window_block_list', + 'window_block_mapping', + 'window_block_usage', + 'window_block_groups', + 'window_attn_bias' ]) return attention_metadata From 8b13980267bd5c7b2ba373354ee37e29b4fe6a1d Mon Sep 17 00:00:00 2001 From: maktukmak Date: Wed, 4 Jun 2025 17:03:35 +0000 Subject: [PATCH 12/31] enable hpu graph model --- vllm/model_executor/models/gemma3.py | 40 +++++++++++----------------- 1 file changed, 16 insertions(+), 24 deletions(-) diff --git a/vllm/model_executor/models/gemma3.py b/vllm/model_executor/models/gemma3.py index 94ce4ea20d8..6544f5976a4 100644 --- a/vllm/model_executor/models/gemma3.py +++ b/vllm/model_executor/models/gemma3.py @@ -233,45 +233,37 @@ def naive_attn_with_masks( out: torch.Tensor, **kwargs, ) -> torch.Tensor: - # NOTE(woosuk): As described in the comment above, this code is not - # meant to be performant. It is only meant to be correct. - q = q.view(-1, self.num_heads, self.head_dim) - # Expand the key and value to handle GQA. 
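+        # The batched path below assumes prompt bucketing pads every sequence
+        # in the batch to the same length s, so q/k/v can be viewed as
+        # [bs, s, num_heads, head_dim] and one scaled_dot_product_attention
+        # call over the stacked per-sequence masks replaces the old
+        # per-sequence loop.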
+ + + s = q.shape[1] num_queries_per_kv = self.num_heads // self.num_kv_heads - k = k.view(-1, self.num_kv_heads, self.head_dim) - k = k.repeat_interleave(num_queries_per_kv, dim=-2) - v = v.view(-1, self.num_kv_heads, self.head_dim) - v = v.repeat_interleave(num_queries_per_kv, dim=-2) + query = q.view(-1, s, self.num_heads, self.head_dim) + key = k.view(-1, s, self.num_kv_heads, self.head_dim) + key = key.repeat_interleave(num_queries_per_kv, dim=-2) + value = v.view(-1, s, self.num_kv_heads, self.head_dim) + value = value.repeat_interleave(num_queries_per_kv, dim=-2) if self.is_sliding: attn_masks = kwargs["local_attn_masks"] else: attn_masks = kwargs["global_attn_masks"] - seq_lens = kwargs["seq_lens"] - start_idx = 0 - for cnt, (seq_len, attn_mask) in enumerate(zip(seq_lens, attn_masks)): - end_idx = start_idx + seq_len - query = q[start_idx:end_idx].unsqueeze(0) - key = k[start_idx:end_idx].unsqueeze(0) - value = v[start_idx:end_idx].unsqueeze(0) + query = query.transpose(1, 2) + key = key.transpose(1, 2) + value = value.transpose(1, 2) - # Transpose. - query = query.transpose(1, 2) - key = key.transpose(1, 2) - value = value.transpose(1, 2) + attn_mask = torch.vstack(attn_masks) - output = F.scaled_dot_product_attention( + output = F.scaled_dot_product_attention( query, key, value, attn_mask, self.scaling, ) - output = output.transpose(1, 2).flatten(-2, -1) - #out[start_idx:end_idx] = output - out[cnt:cnt+1, start_idx:end_idx, :] = output - start_idx = end_idx + + out = output.transpose(1, 2).flatten(-2, -1) + return out From a9e5a7dfe600433648ad7519248ab47ad33f8f3b Mon Sep 17 00:00:00 2001 From: Sayantan Sarkar Date: Wed, 4 Jun 2025 23:41:10 +0000 Subject: [PATCH 13/31] Add temporary test scripts --- gemma3_offline.py | 201 ++++++++++++++++++++++++++++++++++++++++++++++ gemma3_offline.sh | 17 ++++ 2 files changed, 218 insertions(+) create mode 100644 gemma3_offline.py create mode 100755 gemma3_offline.sh diff --git a/gemma3_offline.py b/gemma3_offline.py new file mode 100644 index 00000000000..abb8cae0b19 --- /dev/null +++ b/gemma3_offline.py @@ -0,0 +1,201 @@ +# Example adapted from: https://docs.vllm.ai/en/latest/getting_started/examples/vision_language_multi_image.html +#eg PT_HPUGRAPH_DISABLE_TENSOR_CACHE=false VLLM_PROMPT_BS_BUCKET_MIN=1 VLLM_PROMPT_BS_BUCKET_STEP=1 VLLM_PROMPT_BS_BUCKET_MAX=1 VLLM_PROMPT_SEQ_BUCKET_MIN=384 VLLM_PROMPT_SEQ_BUCKET_MAX=384 VLLM_DECODE_BS_BUCKET_MIN=1 VLLM_DECODE_BS_BUCKET_MAX=1 VLLM_DECODE_BLOCK_BUCKET_MIN=512 VLLM_DECODE_BLOCK_BUCKET_MAX=512 python multi_image_example.py --model google/gemma-3-27b-it --tensor-parallel-size 1 --num-images 1 +# SPDX-License-Identifier: Apache-2.0 +""" +This example shows how to use vLLM for running offline inference with +multi-image input on vision language models for text generation, +using the chat template defined by the model. +""" +import os +from argparse import Namespace +from dataclasses import asdict +from typing import NamedTuple, Optional + +from huggingface_hub import snapshot_download +from PIL import Image +from transformers import AutoProcessor, AutoTokenizer + +from vllm import LLM, EngineArgs, SamplingParams +from vllm.lora.request import LoRARequest +from vllm.multimodal.utils import fetch_image +from vllm.utils import FlexibleArgumentParser +import numpy as np +"""Images are: +1. Medical form +2. Duck +3. Lion +4. Blue Bird +5. Whale +6. Starfish +7. Snail +8. Bee on Purple Flower +9. 2 Dogs +10. Orange Cat +11. Gerbil +12. Rabbit +13. 
Horse and foal +""" +IMAGE_URLS = [ + "https://upload.wikimedia.org/wikipedia/commons/d/da/2015_Kaczka_krzy%C5%BCowka_w_wodzie_%28samiec%29.jpg", + "https://upload.wikimedia.org/wikipedia/commons/7/77/002_The_lion_king_Snyggve_in_the_Serengeti_National_Park_Photo_by_Giles_Laurent.jpg", + "https://upload.wikimedia.org/wikipedia/commons/2/26/Ultramarine_Flycatcher_%28Ficedula_superciliaris%29_Naggar%2C_Himachal_Pradesh%2C_2013_%28cropped%29.JPG", + "https://upload.wikimedia.org/wikipedia/commons/thumb/e/e5/Anim1754_-_Flickr_-_NOAA_Photo_Library_%281%29.jpg/2560px-Anim1754_-_Flickr_-_NOAA_Photo_Library_%281%29.jpg", + "https://upload.wikimedia.org/wikipedia/commons/d/d4/Starfish%2C_Caswell_Bay_-_geograph.org.uk_-_409413.jpg", + "https://upload.wikimedia.org/wikipedia/commons/6/69/Grapevinesnail_01.jpg", + "https://upload.wikimedia.org/wikipedia/commons/thumb/0/0b/Texas_invasive_Musk_Thistle_1.jpg/1920px-Texas_invasive_Musk_Thistle_1.jpg", + "https://upload.wikimedia.org/wikipedia/commons/thumb/7/7a/Huskiesatrest.jpg/2880px-Huskiesatrest.jpg", + "https://upload.wikimedia.org/wikipedia/commons/thumb/6/68/Orange_tabby_cat_sitting_on_fallen_leaves-Hisashi-01A.jpg/1920px-Orange_tabby_cat_sitting_on_fallen_leaves-Hisashi-01A.jpg", + "https://upload.wikimedia.org/wikipedia/commons/3/30/George_the_amazing_guinea_pig.jpg", + "https://upload.wikimedia.org/wikipedia/commons/thumb/1/1f/Oryctolagus_cuniculus_Rcdo.jpg/1920px-Oryctolagus_cuniculus_Rcdo.jpg", + "https://upload.wikimedia.org/wikipedia/commons/9/98/Horse-and-pony.jpg", +] + + +class ModelRequestData(NamedTuple): + engine_args: EngineArgs + prompt: str + image_data: list[Image.Image] + stop_token_ids: Optional[list[int]] = None + chat_template: Optional[str] = None + lora_requests: Optional[list[LoRARequest]] = None + + +# NOTE: The default `max_num_seqs` and `max_model_len` may result in OOM on +# lower-end GPUs. +# Unless specified, these settings have been tested to work on a single L4. 
+ +def load_model(model_name: str, tp_size: int, max_model_len:int, question: str, batch_size:int, image_urls: list[str]) -> ModelRequestData: + + engine_args = EngineArgs( + model=model_name, + max_model_len=max_model_len, + max_num_batched_tokens=max_model_len, + max_num_seqs=len(image_urls), + tensor_parallel_size=tp_size, + #gpu_memory_utilization=0.9, + enforce_eager=False, + limit_mm_per_prompt={"image": len(image_urls)}, + ) + processor = AutoProcessor.from_pretrained(model_name) + if batch_size==1: + placeholders = [{"type": "image", "image": (url)} for url in image_urls] + messages = [{ + "role": + "user", + "content": [ + *placeholders, + { + "type": "text", + "text": question + }, + ], + }] + prompt = processor.apply_chat_template(messages,tokenize=False,add_generation_prompt=True) + #breakpoint() + # [Image.open('jr.png').convert("RGB")] + requests = {"prompt":prompt,"multi_modal_data":{"image":[fetch_image(url) if type(url)==str else url for url in image_urls]}} + + else: + chunks = np.array_split(image_urls, batch_size) + requests = [] + for chunk in chunks: + placeholders = [{"type": "image", "image": (url)} for url in chunk] + messages = [{ + "role": + "user", + "content": [ + *placeholders, + { + "type": "text", + "text": question + }, + ], + }] + prompt = processor.apply_chat_template(messages,tokenize=False,add_generation_prompt=True) + import numpy + + requests.append({"prompt":prompt,"multi_modal_data":{"image":[fetch_image(url) if (type(url)==str) or (type(url)==numpy.str_) else url for url in chunk]}}) + #breakpoint() + #print() + return engine_args,requests + +def run_generate(model_name: str, tp_size: int, max_model_len: int, question: str, batch_size: int, image_urls: list[str]): + engine_args,requests = load_model(model_name, tp_size, max_model_len, question, batch_size, image_urls) + + engine_args = asdict(engine_args) + #breakpoint() + llm = LLM(**engine_args) + sampling_params = SamplingParams(temperature=0.0, + max_tokens=8192) + + #breakpoint() + outputs = llm.generate(requests, + sampling_params=sampling_params + ) + print("-" * 50) + for o in outputs: + generated_text = o.outputs[0].text + print(len(o.outputs[0].token_ids)) + print(generated_text) + print("-" * 50) + +def parse_args(): + parser = FlexibleArgumentParser( + description='Demo on using vLLM for offline inference with ' + 'vision language models that support multi-image input for text ' + 'generation') + parser.add_argument('--model-name', + '-m', + type=str, + default="google/gemma-3-4b-it", + choices=['google/gemma-3-4b-it','google/gemma-3-27b-it'], + help='Huggingface "model_type".') + parser.add_argument('--tensor-parallel-size', + '-tp', + type=int, + default=1, + help='tensor parallel size.') + parser.add_argument( + "--num-images", + "-n", + type=int, + choices=list(range(0, + len(IMAGE_URLS) + 1)), # the max number of images + default=2, + help="Number of images to use for the demo.") + parser.add_argument( + "--batch-size", + "-b", + type=int, + default=1, + help="Batches in which the images will be sent.") + parser.add_argument('--max-model-len', + '-ml', + type=int, + default=8192, + help='Max-Model-Len.') + return parser.parse_args() + + +def main(args: Namespace): + model = args.model_name + tp_size = args.tensor_parallel_size + max_model_len = args.max_model_len + + image_urls = IMAGE_URLS[:args.num_images] + if args.num_images==1: + QUESTION = "Extract all information from the provided image and provide it in a json format:" + batch_size=1 + elif args.num_images==0: + QUESTION = 
"You are an AI designed to generate extremely long, detailed worldbuilding content. Your goal is to write a fictional encyclopedia with at least 4000 words of content. Do not stop early. Start by describing a fictional planet in detail. Include: \n1. Geography and climate zones (with rich, varied description).\n2. The history of all civilizations, from ancient to modern times.\n3. Cultures, belief systems, and mythologies along with rich detail about where such beliefs came from.\n4. Political structures and conflicts along with their history.\n5. Technology and magic systems (if any) spanning the last 1000 years, highlighting significant discoveries and figures.\n6. Major historical events and characters along with their geneology.\n\n Be descriptive, verbose, and never summarize. Write in a factual tone like an academic encyclopedia. Begin your entry below:" + batch_size=1 + else: + QUESTION = "What is the content of each image? Once done, write a story that combines them all." + batch_size = args.batch_size + + run_generate(model, tp_size, max_model_len, QUESTION, batch_size, image_urls) + + +if __name__ == "__main__": + args = parse_args() + main(args) diff --git a/gemma3_offline.sh b/gemma3_offline.sh new file mode 100755 index 00000000000..01b258b210f --- /dev/null +++ b/gemma3_offline.sh @@ -0,0 +1,17 @@ +export PT_HPUGRAPH_DISABLE_TENSOR_CACHE=false +export VLLM_PROMPT_BS_BUCKET_MIN=1 +export VLLM_PROMPT_BS_BUCKET_STEP=1 +export VLLM_PROMPT_BS_BUCKET_MA=1 +export VLLM_PROMPT_SEQ_BUCKET_MIN=384 +export VLLM_PROMPT_SEQ_BUCKET_MAX=384 +export VLLM_DECODE_BS_BUCKET_MIN=1 +export VLLM_DECODE_BS_BUCKET_MAX=1 +export VLLM_DECODE_BLOCK_BUCKET_MIN=512 +export VLLM_DECODE_BLOCK_BUCKET_MAX=512 + + +#export PT_HPU_LAZY_MODE=1 +#export VLLM_FP32_SOFTMAX=1 +#export VLLM_PROMPT_USE_FUSEDSDPA=False + +python gemma3_offline.py --model google/gemma-3-4b-it --tensor-parallel-size 1 --num-images 2 --batch-size 1 From f783955da43f9539b7e669fff2c59a42e667839e Mon Sep 17 00:00:00 2001 From: Sayantan Sarkar Date: Wed, 4 Jun 2025 22:35:21 -0700 Subject: [PATCH 14/31] Fix for missing image --- gemma3_offline.py | 28 +++++++++++++++++++--------- 1 file changed, 19 insertions(+), 9 deletions(-) diff --git a/gemma3_offline.py b/gemma3_offline.py index abb8cae0b19..8a35d5ee280 100644 --- a/gemma3_offline.py +++ b/gemma3_offline.py @@ -65,7 +65,7 @@ class ModelRequestData(NamedTuple): # Unless specified, these settings have been tested to work on a single L4. 
def load_model(model_name: str, tp_size: int, max_model_len:int, question: str, batch_size:int, image_urls: list[str]) -> ModelRequestData: - + assert len(image_urls) % batch_size == 0 engine_args = EngineArgs( model=model_name, max_model_len=max_model_len, @@ -73,9 +73,10 @@ def load_model(model_name: str, tp_size: int, max_model_len:int, question: str, max_num_seqs=len(image_urls), tensor_parallel_size=tp_size, #gpu_memory_utilization=0.9, - enforce_eager=False, - limit_mm_per_prompt={"image": len(image_urls)}, + enforce_eager=True, + limit_mm_per_prompt={"image": int(len(image_urls)/batch_size)}, ) + processor = AutoProcessor.from_pretrained(model_name) if batch_size==1: placeholders = [{"type": "image", "image": (url)} for url in image_urls] @@ -99,6 +100,7 @@ def load_model(model_name: str, tp_size: int, max_model_len:int, question: str, chunks = np.array_split(image_urls, batch_size) requests = [] for chunk in chunks: + print("chunk....", chunk) placeholders = [{"type": "image", "image": (url)} for url in chunk] messages = [{ "role": @@ -137,7 +139,7 @@ def run_generate(model_name: str, tp_size: int, max_model_len: int, question: st generated_text = o.outputs[0].text print(len(o.outputs[0].token_ids)) print(generated_text) - print("-" * 50) + print("-*." * 50) def parse_args(): parser = FlexibleArgumentParser( @@ -160,7 +162,7 @@ def parse_args(): "-n", type=int, choices=list(range(0, - len(IMAGE_URLS) + 1)), # the max number of images + len(IMAGE_URLS))), # the max number of images default=2, help="Number of images to use for the demo.") parser.add_argument( @@ -182,16 +184,24 @@ def main(args: Namespace): tp_size = args.tensor_parallel_size max_model_len = args.max_model_len - image_urls = IMAGE_URLS[:args.num_images] + + image_urls = [IMAGE_URLS[idx % len(IMAGE_URLS)] for idx, i in enumerate(range(args.num_images * args.batch_size))] + + ''' if args.num_images==1: QUESTION = "Extract all information from the provided image and provide it in a json format:" - batch_size=1 + #batch_size=1 elif args.num_images==0: QUESTION = "You are an AI designed to generate extremely long, detailed worldbuilding content. Your goal is to write a fictional encyclopedia with at least 4000 words of content. Do not stop early. Start by describing a fictional planet in detail. Include: \n1. Geography and climate zones (with rich, varied description).\n2. The history of all civilizations, from ancient to modern times.\n3. Cultures, belief systems, and mythologies along with rich detail about where such beliefs came from.\n4. Political structures and conflicts along with their history.\n5. Technology and magic systems (if any) spanning the last 1000 years, highlighting significant discoveries and figures.\n6. Major historical events and characters along with their geneology.\n\n Be descriptive, verbose, and never summarize. Write in a factual tone like an academic encyclopedia. Begin your entry below:" - batch_size=1 + #batch_size=1 + else: + QUESTION = "What is the content of each image? Once done, write a story that combines them all." + ''' + if args.num_images==0: + QUESTION = "You are an AI designed to generate extremely long, detailed worldbuilding content. Your goal is to write a fictional encyclopedia with at least 4000 words of content. Do not stop early. Start by describing a fictional planet in detail. Include: \n1. Geography and climate zones (with rich, varied description).\n2. The history of all civilizations, from ancient to modern times.\n3. 
Cultures, belief systems, and mythologies along with rich detail about where such beliefs came from.\n4. Political structures and conflicts along with their history.\n5. Technology and magic systems (if any) spanning the last 1000 years, highlighting significant discoveries and figures.\n6. Major historical events and characters along with their geneology.\n\n Be descriptive, verbose, and never summarize. Write in a factual tone like an academic encyclopedia. Begin your entry below:" else: QUESTION = "What is the content of each image? Once done, write a story that combines them all." - batch_size = args.batch_size + batch_size = args.batch_size run_generate(model, tp_size, max_model_len, QUESTION, batch_size, image_urls) From 1297154433f0cb563f7431ad447a22ee1da6b6ad Mon Sep 17 00:00:00 2001 From: Sayantan Sarkar Date: Thu, 5 Jun 2025 09:47:40 -0700 Subject: [PATCH 15/31] Bring back +1 --- gemma3_offline.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/gemma3_offline.py b/gemma3_offline.py index 8a35d5ee280..7c341d7f3c2 100644 --- a/gemma3_offline.py +++ b/gemma3_offline.py @@ -73,7 +73,7 @@ def load_model(model_name: str, tp_size: int, max_model_len:int, question: str, max_num_seqs=len(image_urls), tensor_parallel_size=tp_size, #gpu_memory_utilization=0.9, - enforce_eager=True, + enforce_eager=True, limit_mm_per_prompt={"image": int(len(image_urls)/batch_size)}, ) @@ -162,7 +162,7 @@ def parse_args(): "-n", type=int, choices=list(range(0, - len(IMAGE_URLS))), # the max number of images + len(IMAGE_URLS)+1)), # the max number of images default=2, help="Number of images to use for the demo.") parser.add_argument( From be41114feb41a4b533de2fdcbfee5650e86a4ab7 Mon Sep 17 00:00:00 2001 From: Sayantan Sarkar Date: Thu, 5 Jun 2025 12:23:00 -0700 Subject: [PATCH 16/31] Switch to lazy+hpugraphs, add v0 mode --- gemma3_offline.py | 2 +- gemma3_offline.sh | 5 +++-- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/gemma3_offline.py b/gemma3_offline.py index 7c341d7f3c2..ba582090672 100644 --- a/gemma3_offline.py +++ b/gemma3_offline.py @@ -73,7 +73,7 @@ def load_model(model_name: str, tp_size: int, max_model_len:int, question: str, max_num_seqs=len(image_urls), tensor_parallel_size=tp_size, #gpu_memory_utilization=0.9, - enforce_eager=True, + enforce_eager=False, limit_mm_per_prompt={"image": int(len(image_urls)/batch_size)}, ) diff --git a/gemma3_offline.sh b/gemma3_offline.sh index 01b258b210f..d060835c980 100755 --- a/gemma3_offline.sh +++ b/gemma3_offline.sh @@ -8,10 +8,11 @@ export VLLM_DECODE_BS_BUCKET_MIN=1 export VLLM_DECODE_BS_BUCKET_MAX=1 export VLLM_DECODE_BLOCK_BUCKET_MIN=512 export VLLM_DECODE_BLOCK_BUCKET_MAX=512 +export VLLM_USE_V1=0 -#export PT_HPU_LAZY_MODE=1 +export PT_HPU_LAZY_MODE=1 #export VLLM_FP32_SOFTMAX=1 #export VLLM_PROMPT_USE_FUSEDSDPA=False -python gemma3_offline.py --model google/gemma-3-4b-it --tensor-parallel-size 1 --num-images 2 --batch-size 1 +python gemma3_offline.py --model google/gemma-3-4b-it --tensor-parallel-size 1 --num-images 12 --batch-size 1 From 74e4cfb2786832c44cc90c6a715bf7b8748a217e Mon Sep 17 00:00:00 2001 From: Sayantan Sarkar Date: Thu, 5 Jun 2025 20:31:44 -0700 Subject: [PATCH 17/31] Fix masks. 
Remove cross attn between images --- vllm/model_executor/models/gemma3_mm.py | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/vllm/model_executor/models/gemma3_mm.py b/vllm/model_executor/models/gemma3_mm.py index 2b0fbc6f3ed..6ec5053e3b6 100644 --- a/vllm/model_executor/models/gemma3_mm.py +++ b/vllm/model_executor/models/gemma3_mm.py @@ -680,10 +680,16 @@ def prepare_attn_masks( # Consider the bidirectional attention between image tokens. img_mask = torch.zeros_like(global_attn_mask) - img_pos = (input_token_ids == self.config.image_token_index) - img_mask[:, :, :, img_pos] += 1 - img_mask[:, :, img_pos, :] += 1 + img_pos = (input_token_ids == self.config.image_token_index) # this is doing bidirectional attn between 2 images.. why + img_tokens_cumsum = torch.cumsum(img_pos, 0) + num_imgs = img_tokens_cumsum[-1] // 256 + img_start_pos = torch.arange(0, img_tokens_cumsum[-1], 256)+1 + for i in img_start_pos: + img_mask[:,:,i:i+256, i:i+256] = 2 + #img_mask[:, :, :, img_pos] += 1 + #img_mask[:, :, img_pos, :] += 1 global_attn_mask = torch.where(img_mask == 2, 0, global_attn_mask) + #breakpoint() global_attn_masks.append(global_attn_mask) if self.sliding_window is not None: From 347e965b34ed0a6deb17d8d2aa6d3412f3da86cd Mon Sep 17 00:00:00 2001 From: Sayantan Sarkar Date: Fri, 6 Jun 2025 14:15:50 -0700 Subject: [PATCH 18/31] Script for variable batches --- config0.json | 3 + config1.json | 5 ++ gemma3_offline_dyn.py | 152 ++++++++++++++++++++++++++++++++++++++++++ gemma3_offline_dyn.sh | 22 ++++++ 4 files changed, 182 insertions(+) create mode 100644 config0.json create mode 100644 config1.json create mode 100644 gemma3_offline_dyn.py create mode 100755 gemma3_offline_dyn.sh diff --git a/config0.json b/config0.json new file mode 100644 index 00000000000..5306999163f --- /dev/null +++ b/config0.json @@ -0,0 +1,3 @@ +[ +[{"prompt": 0, "images" : [0,1,2]}, {"prompt": 0, "images": [3,4,5]}] +] diff --git a/config1.json b/config1.json new file mode 100644 index 00000000000..cc7590d1278 --- /dev/null +++ b/config1.json @@ -0,0 +1,5 @@ +[ +[{"prompt": 1}, {"prompt": 0, "images": [0,1,2]}], +[{"prompt": 0, "images" : [0,2]}], +[{"prompt": 2, "images" : [1]}] +] diff --git a/gemma3_offline_dyn.py b/gemma3_offline_dyn.py new file mode 100644 index 00000000000..39b7cd0ea05 --- /dev/null +++ b/gemma3_offline_dyn.py @@ -0,0 +1,152 @@ +import torch +from vllm.multimodal.utils import fetch_image +import json +from argparse import Namespace +from vllm.utils import FlexibleArgumentParser +from vllm import LLM, EngineArgs, SamplingParams +import argparse +from dataclasses import asdict +from transformers import AutoProcessor, AutoTokenizer + + +PROMPTS = [ +# 19 +"What is the content of each image? Once done, write a story that combines them all.", +# 183 +"You are an AI designed to generate extremely long, detailed worldbuilding content. Your goal is to write a fictional encyclopedia with at least 4000 words of content. Do not stop early. Start by describing a fictional planet in detail. Include: \n1. Geography and climate zones (with rich, varied description).\n2. The history of all civilizations, from ancient to modern times.\n3. Cultures, belief systems, and mythologies along with rich detail about where such beliefs came from.\n4. Political structures and conflicts along with their history.\n5. Technology and magic systems (if any) spanning the last 1000 years, highlighting significant discoveries and figures.\n6. 
Major historical events and characters along with their geneology.\n\n Be descriptive, verbose, and never summarize. Write in a factual tone like an academic encyclopedia. Begin your entry below:", +# 265 +'''Here is a short story: It contains some animals as its main characters. Rewrite this story by replacing the animals in this story with any of the animals shown in the images. Do not change the contents of the story, just the characters: One day a shepherd discovered a fat Pig in the meadow where his Sheep were pastured. He very quickly captured the porker, which squealed at the top of its voice the moment the Shepherd laid his hands on it. You would have thought, to hear the loud squealing, that the Pig was being cruelly hurt. But in spite of its squeals and struggles to escape, the Shepherd tucked his prize under his arm and started off to the butcher's in the market place. The Sheep in the pasture were much astonished and amused at the Pig's behavior, and followed the Shepherd and his charge to the pasture gate. "What makes you squeal like that?" asked one of the Sheep. "The Shepherd often catches and carries off one of us. But we should feel very much ashamed to make such a terrible fuss about it like you do." "That is all very well," replied the Pig, with a squeal and a frantic kick. "When he catches you he is only after your wool. But he wants my bacon! gree-ee-ee!"''' +# 611 +"Here is a list of creatures. Analyse the images and if there are creatures present in the images and the pick the closest creature from this list for each creature in the images: African Elephant, Bengal Tiger, Arctic Fox, Blue Whale, Brown Bear, Cheetah, Cougar, Dingo, Dolphin, Elk, Flying Fox, Giraffe, Gorilla, Grizzly Bear, Hedgehog, Hippopotamus, Hyena, Indian Elephant, Jaguar, Kangaroo, Koala, Lemur, Leopard, Lion, Lynx, Manatee, Mole, Moose, Mountain Goat, Narwhal, Okapi, Orangutan, Otter, Panda, Platypus, Polar Bear, Porcupine, Possum, Prairie Dog, Puma, Quokka, Rabbit, Raccoon, Red Panda, Reindeer, Rhinoceros, Sea Lion, Seal, Sheep, Skunk, Sloth, Squirrel, Tapir, Tasmanian Devil, Walrus, Weasel, Whale, Wild Boar, Wombat, Yak, Zebra, Albatross, American Robin, Bald Eagle, Barn Owl, Blue Jay, Budgerigar, Canary, Cardinal, Cassowary, Chickadee, Cockatoo, Cormorant, Crane, Crow, Cuckoo, Dove, Duck, Eagle, Egret, Falcon, Finch, Flamingo, Goldfinch, Goose, Great Horned Owl, Gull, Hawk, Heron, Hummingbird, Ibis, Jay, Kestrel, Kingfisher, Kiwi, Lark, Macaw, Magpie, Mockingbird, Nightingale, Nuthatch, Oriole, Ostrich, Owl, Parrot, Partridge, Peacock, Pelican, Penguin, Peregrine Falcon, Pigeon, Puffin, Quail, Raven, Roadrunner, Robin, Rooster, Sparrow, Starling, Stork, Swallow, Swan, Toucan, Turkey, Vulture, Warbler, Woodpecker, Wren, Angelfish, Anglerfish, Barracuda, Betta Fish, Blue Tang, Catfish, Clownfish, Cod, Eel, Flounder, Flying Fish, Goldfish, Grouper, Guppy, Haddock, Halibut, Hammerhead Shark, Herring, Jellyfish, Koi, Lionfish, Lobster, Mackerel, Manta Ray, Marlin, Moray Eel, Octopus, Orca, Piranha, Pufferfish, Rainbow Trout, Salmon, Sardine, Seahorse, Shark, Shrimp, Squid, Starfish, Stingray, Swordfish, Tilapia, Tuna, Walrus, Whale Shark, Zebra Fish, Alligator, Anole, Boa Constrictor, Box Turtle, Chameleon, Cobra, Crocodile, Frog, Gecko, Gila Monster, Green Iguana, Komodo Dragon, Lizard, Monitor Lizard, Newt, Python, Rattlesnake, Salamander, Sea Turtle, Skink, Snake, Toad, Tortoise, Tree Frog, Viper, Ant, Bee, Beetle, Butterfly, Centipede, Cicada, Cricket, Dragonfly, Earthworm, Firefly, 
Grasshopper, Ladybug, Leech, Millipede, Moth, Praying Mantis, Scorpion, Snail, Spider, Termite, Tick, Wasp" +] + + +IMAGE_URLS = [ + "https://upload.wikimedia.org/wikipedia/commons/d/da/2015_Kaczka_krzy%C5%BCowka_w_wodzie_%28samiec%29.jpg", + "https://upload.wikimedia.org/wikipedia/commons/7/77/002_The_lion_king_Snyggve_in_the_Serengeti_National_Park_Photo_by_Giles_Laurent.jpg", + "https://upload.wikimedia.org/wikipedia/commons/2/26/Ultramarine_Flycatcher_%28Ficedula_superciliaris%29_Naggar%2C_Himachal_Pradesh%2C_2013_%28cropped%29.JPG", + "https://upload.wikimedia.org/wikipedia/commons/thumb/e/e5/Anim1754_-_Flickr_-_NOAA_Photo_Library_%281%29.jpg/2560px-Anim1754_-_Flickr_-_NOAA_Photo_Library_%281%29.jpg", + "https://upload.wikimedia.org/wikipedia/commons/d/d4/Starfish%2C_Caswell_Bay_-_geograph.org.uk_-_409413.jpg", + "https://upload.wikimedia.org/wikipedia/commons/6/69/Grapevinesnail_01.jpg", + "https://upload.wikimedia.org/wikipedia/commons/thumb/0/0b/Texas_invasive_Musk_Thistle_1.jpg/1920px-Texas_invasive_Musk_Thistle_1.jpg", + "https://upload.wikimedia.org/wikipedia/commons/thumb/7/7a/Huskiesatrest.jpg/2880px-Huskiesatrest.jpg", + "https://upload.wikimedia.org/wikipedia/commons/thumb/6/68/Orange_tabby_cat_sitting_on_fallen_leaves-Hisashi-01A.jpg/1920px-Orange_tabby_cat_sitting_on_fallen_leaves-Hisashi-01A.jpg", + "https://upload.wikimedia.org/wikipedia/commons/3/30/George_the_amazing_guinea_pig.jpg", + "https://upload.wikimedia.org/wikipedia/commons/thumb/1/1f/Oryctolagus_cuniculus_Rcdo.jpg/1920px-Oryctolagus_cuniculus_Rcdo.jpg", + "https://upload.wikimedia.org/wikipedia/commons/9/98/Horse-and-pony.jpg", +] + + + + +def parse_args(): + parser = argparse.ArgumentParser(description='Demo on using vLLM for offline inference with ' + 'vision language models that support multi-image input for text ' + 'generation') + parser.add_argument('--model-name', + '-m', + type=str, + default="google/gemma-3-4b-it", + choices=['google/gemma-3-4b-it','google/gemma-3-27b-it'], + help='Huggingface "model_type".') + parser.add_argument('--tensor-parallel-size', + '-tp', + type=int, + default=1, + help='tensor parallel size.') + parser.add_argument( + "--batchconfig", + "-b", + type=str, + required=True, + help=''' + #Sample json input + batch0_bs2 = [{"prompt": 1}, {"prompt": 0, "images": [0,1,2]}] + batch0_bs3 = [{"prompt": 2, "images" : [6,7,8]}, {"prompt": 0, "images": [3,2,1,0]}, {"prompt": 3, "images": [4]}] + inputs = [batch0_bs2, batch0_bs3] + + so inp json should be: + [ + [{"prompt": 1}, {"prompt": 0, "images": [0,1,2]}], + [{"prompt": 2, "images" : [6,7,8]}, {"prompt": 0, "images": [3,2,1,0]}, {"prompt": 3, "images": [4]}] + ] + ''') + parser.add_argument('--max-model-len', + '-ml', + type=int, + default=8192, + help='Max-Model-Len.') + return parser.parse_args() + + +def make_model(model_name, max_model_len, tp_size, max_num_seqs, limit_mm_per_prompt): + engine_args = EngineArgs( + model=model_name, + max_model_len=max_model_len, + max_num_batched_tokens=max_model_len, + max_num_seqs=max_num_seqs, + tensor_parallel_size=tp_size, + #gpu_memory_utilization=0.9, + enforce_eager=False, + limit_mm_per_prompt={"image": limit_mm_per_prompt}, + ) + engine_args = asdict(engine_args) + llm = LLM(**engine_args) + processor = AutoProcessor.from_pretrained(model_name) + return llm, processor + + +def create_inp_from_batchconfig(processor, batch): + requests = [] + for prompt in batch: + placeholders = [{"type": "image", "image": (url)} for url in prompt.get('images', [])] + messages = [{ + "role": + "user", 
+ "content": [ + *placeholders, + { + "type": "text", + "text": PROMPTS[prompt["prompt"]] + }, + ], + }] + final_prompt = processor.apply_chat_template(messages,tokenize=False,add_generation_prompt=True) + requests.append({"prompt":final_prompt,"multi_modal_data":{"image":[fetch_image(IMAGE_URLS[urlid]) for urlid in prompt.get('images', [])]}}) + return requests + + +def run_generate(llm, processor, batch): + requests = create_inp_from_batchconfig(processor, batch) + sampling_params = SamplingParams(temperature=0.0, + max_tokens=8192) + + outputs = llm.generate(requests, + sampling_params=sampling_params + ) + print("-" * 50) + for o in outputs: + generated_text = o.outputs[0].text + print(len(o.outputs[0].token_ids)) + print(generated_text) + print("-*." * 50) + + +def main(args: Namespace): + with open(args.batchconfig) as f: + config = json.load(f) + + limit_mm_per_prompt = max(max([len(prompt.get("images", [])) for prompt in batch]) for batch in config) + max_num_seqs = max(sum([len(prompt.get("images", [])) for prompt in batch]) for batch in config) # TODO: this one not sure if this is what it means? + + llm, processor = make_model(args.model_name, args.max_model_len, args.tensor_parallel_size, max_num_seqs=max_num_seqs, limit_mm_per_prompt=limit_mm_per_prompt) + for batchidx, batch in enumerate(config): + run_generate(llm, processor, batch) + print(f'Done batch {batchidx}, of bs={len(batch)}. config: {batch}') + + +if __name__ == "__main__": + args = parse_args() + main(args) + + diff --git a/gemma3_offline_dyn.sh b/gemma3_offline_dyn.sh new file mode 100755 index 00000000000..5f31a0cd31f --- /dev/null +++ b/gemma3_offline_dyn.sh @@ -0,0 +1,22 @@ +export PT_HPUGRAPH_DISABLE_TENSOR_CACHE=false +export VLLM_PROMPT_BS_BUCKET_MIN=1 +export VLLM_PROMPT_BS_BUCKET_STEP=1 +export VLLM_PROMPT_BS_BUCKET_MA=1 +export VLLM_PROMPT_SEQ_BUCKET_MIN=384 +export VLLM_PROMPT_SEQ_BUCKET_MAX=384 +export VLLM_DECODE_BS_BUCKET_MIN=1 +export VLLM_DECODE_BS_BUCKET_MAX=1 +export VLLM_DECODE_BLOCK_BUCKET_MIN=512 +export VLLM_DECODE_BLOCK_BUCKET_MAX=512 +export VLLM_USE_V1=0 + + +export PT_HPU_LAZY_MODE=1 +#export VLLM_FP32_SOFTMAX=1 +#export VLLM_PROMPT_USE_FUSEDSDPA=False + + +export VLLM_SKIP_WARMUP=true + + +python gemma3_offline_dyn.py --model google/gemma-3-4b-it --tensor-parallel-size 1 --batchconfig config1.json \ No newline at end of file From a29d5379606ba4e93b0b7d0d94798aceed8a995a Mon Sep 17 00:00:00 2001 From: Sayantan Sarkar Date: Mon, 9 Jun 2025 20:32:25 -0700 Subject: [PATCH 19/31] Do vision+combining before text mdoel fwd --- gemma3_offline_dyn.py | 4 ++-- vllm/model_executor/models/gemma3_mm.py | 1 + vllm/worker/hpu_model_runner.py | 28 +++++++++++++++++++++++++ 3 files changed, 31 insertions(+), 2 deletions(-) diff --git a/gemma3_offline_dyn.py b/gemma3_offline_dyn.py index 39b7cd0ea05..f5d13245b6e 100644 --- a/gemma3_offline_dyn.py +++ b/gemma3_offline_dyn.py @@ -14,8 +14,8 @@ "What is the content of each image? Once done, write a story that combines them all.", # 183 "You are an AI designed to generate extremely long, detailed worldbuilding content. Your goal is to write a fictional encyclopedia with at least 4000 words of content. Do not stop early. Start by describing a fictional planet in detail. Include: \n1. Geography and climate zones (with rich, varied description).\n2. The history of all civilizations, from ancient to modern times.\n3. Cultures, belief systems, and mythologies along with rich detail about where such beliefs came from.\n4. 
Political structures and conflicts along with their history.\n5. Technology and magic systems (if any) spanning the last 1000 years, highlighting significant discoveries and figures.\n6. Major historical events and characters along with their geneology.\n\n Be descriptive, verbose, and never summarize. Write in a factual tone like an academic encyclopedia. Begin your entry below:", -# 265 -'''Here is a short story: It contains some animals as its main characters. Rewrite this story by replacing the animals in this story with any of the animals shown in the images. Do not change the contents of the story, just the characters: One day a shepherd discovered a fat Pig in the meadow where his Sheep were pastured. He very quickly captured the porker, which squealed at the top of its voice the moment the Shepherd laid his hands on it. You would have thought, to hear the loud squealing, that the Pig was being cruelly hurt. But in spite of its squeals and struggles to escape, the Shepherd tucked his prize under his arm and started off to the butcher's in the market place. The Sheep in the pasture were much astonished and amused at the Pig's behavior, and followed the Shepherd and his charge to the pasture gate. "What makes you squeal like that?" asked one of the Sheep. "The Shepherd often catches and carries off one of us. But we should feel very much ashamed to make such a terrible fuss about it like you do." "That is all very well," replied the Pig, with a squeal and a frantic kick. "When he catches you he is only after your wool. But he wants my bacon! gree-ee-ee!"''' +# 273 +'''Here is a short story: It contains some animals as its main characters. Rewrite this story by replacing the animals in this story with any of the animals shown in the images. Do not change the contents of the story, just the characters. What animals are shown in the images?: One day a shepherd discovered a fat Pig in the meadow where his Sheep were pastured. He very quickly captured the porker, which squealed at the top of its voice the moment the Shepherd laid his hands on it. You would have thought, to hear the loud squealing, that the Pig was being cruelly hurt. But in spite of its squeals and struggles to escape, the Shepherd tucked his prize under his arm and started off to the butcher's in the market place. The Sheep in the pasture were much astonished and amused at the Pig's behavior, and followed the Shepherd and his charge to the pasture gate. "What makes you squeal like that?" asked one of the Sheep. "The Shepherd often catches and carries off one of us. But we should feel very much ashamed to make such a terrible fuss about it like you do." "That is all very well," replied the Pig, with a squeal and a frantic kick. "When he catches you he is only after your wool. But he wants my bacon! gree-ee-ee!"''' # 611 "Here is a list of creatures. 
Analyse the images and if there are creatures present in the images and the pick the closest creature from this list for each creature in the images: African Elephant, Bengal Tiger, Arctic Fox, Blue Whale, Brown Bear, Cheetah, Cougar, Dingo, Dolphin, Elk, Flying Fox, Giraffe, Gorilla, Grizzly Bear, Hedgehog, Hippopotamus, Hyena, Indian Elephant, Jaguar, Kangaroo, Koala, Lemur, Leopard, Lion, Lynx, Manatee, Mole, Moose, Mountain Goat, Narwhal, Okapi, Orangutan, Otter, Panda, Platypus, Polar Bear, Porcupine, Possum, Prairie Dog, Puma, Quokka, Rabbit, Raccoon, Red Panda, Reindeer, Rhinoceros, Sea Lion, Seal, Sheep, Skunk, Sloth, Squirrel, Tapir, Tasmanian Devil, Walrus, Weasel, Whale, Wild Boar, Wombat, Yak, Zebra, Albatross, American Robin, Bald Eagle, Barn Owl, Blue Jay, Budgerigar, Canary, Cardinal, Cassowary, Chickadee, Cockatoo, Cormorant, Crane, Crow, Cuckoo, Dove, Duck, Eagle, Egret, Falcon, Finch, Flamingo, Goldfinch, Goose, Great Horned Owl, Gull, Hawk, Heron, Hummingbird, Ibis, Jay, Kestrel, Kingfisher, Kiwi, Lark, Macaw, Magpie, Mockingbird, Nightingale, Nuthatch, Oriole, Ostrich, Owl, Parrot, Partridge, Peacock, Pelican, Penguin, Peregrine Falcon, Pigeon, Puffin, Quail, Raven, Roadrunner, Robin, Rooster, Sparrow, Starling, Stork, Swallow, Swan, Toucan, Turkey, Vulture, Warbler, Woodpecker, Wren, Angelfish, Anglerfish, Barracuda, Betta Fish, Blue Tang, Catfish, Clownfish, Cod, Eel, Flounder, Flying Fish, Goldfish, Grouper, Guppy, Haddock, Halibut, Hammerhead Shark, Herring, Jellyfish, Koi, Lionfish, Lobster, Mackerel, Manta Ray, Marlin, Moray Eel, Octopus, Orca, Piranha, Pufferfish, Rainbow Trout, Salmon, Sardine, Seahorse, Shark, Shrimp, Squid, Starfish, Stingray, Swordfish, Tilapia, Tuna, Walrus, Whale Shark, Zebra Fish, Alligator, Anole, Boa Constrictor, Box Turtle, Chameleon, Cobra, Crocodile, Frog, Gecko, Gila Monster, Green Iguana, Komodo Dragon, Lizard, Monitor Lizard, Newt, Python, Rattlesnake, Salamander, Sea Turtle, Skink, Snake, Toad, Tortoise, Tree Frog, Viper, Ant, Bee, Beetle, Butterfly, Centipede, Cicada, Cricket, Dragonfly, Earthworm, Firefly, Grasshopper, Ladybug, Leech, Millipede, Moth, Praying Mantis, Scorpion, Snail, Spider, Termite, Tick, Wasp" ] diff --git a/vllm/model_executor/models/gemma3_mm.py b/vllm/model_executor/models/gemma3_mm.py index 6ec5053e3b6..df0fc2f4715 100644 --- a/vllm/model_executor/models/gemma3_mm.py +++ b/vllm/model_executor/models/gemma3_mm.py @@ -611,6 +611,7 @@ def forward(self, # NOTE: In v1, inputs_embeds is always generated at model runner, this # condition is for v0 compatibility. 
elif inputs_embeds is None: + assert False, "hpu_model_runner should be computing inputs_embeds" vision_embeddings = self.get_multimodal_embeddings(**kwargs) inputs_embeds = self.get_input_embeddings(input_ids, diff --git a/vllm/worker/hpu_model_runner.py b/vllm/worker/hpu_model_runner.py index f1b8bb24fdf..d9e8452ed65 100755 --- a/vllm/worker/hpu_model_runner.py +++ b/vllm/worker/hpu_model_runner.py @@ -297,6 +297,22 @@ def __init__(self, model, vllm_config, layer_names, is_causal, sampler): self.is_causal = is_causal self.use_merged_prefill = VLLM_MERGED_PREFILL + # copying from PR 1163 + # needs cleanup/unified approach later + def compute_input_embeddings_for_gemma(self, **kwargs): + + # todo may or may not be needed for gemma3, check + compile_only_mode_context_false = functools.partial( + bc.env_setting, "PT_COMPILE_ONLY_MODE", False) + + input_ids = kwargs['input_ids'] + #with compile_only_mode_context_false(): + vision_embeddings = self.model.get_multimodal_embeddings(**kwargs) + inputs_embeds = self.model.get_input_embeddings(input_ids, + vision_embeddings) + + return inputs_embeds + def _set_attn_bias(self, attn_metadata, batch_size, seq_len, device, dtype): if (attn_metadata is None @@ -2968,6 +2984,18 @@ def try_revert_dummy_output_tokens(): 'real_batch_size': real_batch_size } + if 'Gemma3ForConditionalGeneration' in str(type(self.model.model)): + inputs_embeds = \ + self.model.compute_input_embeddings_for_gemma( + **execute_model_kwargs + ) + execute_model_kwargs.update({ + 'inputs_embeds': inputs_embeds, + }) + # done compute the visual tokens + execute_model_kwargs.pop('pixel_values', None) + + with self.profiler.record_event('internal', model_event_name, args=profiler_args): From c9c575728a8996366264fc9174e5afdbc460dc3f Mon Sep 17 00:00:00 2001 From: Sayantan Sarkar Date: Mon, 9 Jun 2025 21:47:09 -0700 Subject: [PATCH 20/31] wrap vision and projector in hpu graphs --- vllm/model_executor/models/utils.py | 2 ++ vllm/worker/hpu_model_runner.py | 14 +++++++++++++- 2 files changed, 15 insertions(+), 1 deletion(-) diff --git a/vllm/model_executor/models/utils.py b/vllm/model_executor/models/utils.py index b022832effc..8f5dfa87175 100644 --- a/vllm/model_executor/models/utils.py +++ b/vllm/model_executor/models/utils.py @@ -392,6 +392,8 @@ def _merge_multimodal_embeddings( # skip check for HPU, the number of tokens is a cpu fallback during HPU lazy if current_platform.is_hpu(): flattened = _flatten_embeddings(multimodal_embeddings) + #TODO dynamic.. maybe torch.where? however multimodal_embeddings is a list of varying length + # still.. torch.where migth be faster than boolean indexing? 
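+        # e.g. (untested sketch) masked_scatter_ keeps the same semantics while
+        # avoiding the boolean advanced-indexing assignment below; whether it
+        # actually avoids the cpu fallback on HPU still needs to be checked:
+        #   inputs_embeds.masked_scatter_(is_multimodal.unsqueeze(-1), flattened)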
inputs_embeds[is_multimodal] = flattened return inputs_embeds diff --git a/vllm/worker/hpu_model_runner.py b/vllm/worker/hpu_model_runner.py index d9e8452ed65..821d0208544 100755 --- a/vllm/worker/hpu_model_runner.py +++ b/vllm/worker/hpu_model_runner.py @@ -90,6 +90,10 @@ DUMMY_TOKEN_ID = -1 + +def is_gemma3(model): + return 'Gemma3ForConditionalGeneration' in str(type(model)) + class Singleton(type): _instances: Dict[type, object] = {} @@ -297,6 +301,14 @@ def __init__(self, model, vllm_config, layer_names, is_causal, sampler): self.is_causal = is_causal self.use_merged_prefill = VLLM_MERGED_PREFILL + # TODO : right now just enabling it keeping gemma3 in mind + if htorch.utils.internal.is_lazy() and is_gemma3(self.model): + logger.info("[Multimodal] Wrapping Visual Model") + self.model.vision_tower = htorch.hpu.wrap_in_hpu_graph( + self.model.vision_tower, disable_tensor_cache=True) + self.model.multi_modal_projector = htorch.hpu.wrap_in_hpu_graph( + self.model.multi_modal_projector, disable_tensor_cache=True) + # copying from PR 1163 # needs cleanup/unified approach later def compute_input_embeddings_for_gemma(self, **kwargs): @@ -2984,7 +2996,7 @@ def try_revert_dummy_output_tokens(): 'real_batch_size': real_batch_size } - if 'Gemma3ForConditionalGeneration' in str(type(self.model.model)): + if is_gemma3(self.model.model): inputs_embeds = \ self.model.compute_input_embeddings_for_gemma( **execute_model_kwargs From 5af687089b75acc6a4878bebab3edbd07bd521de Mon Sep 17 00:00:00 2001 From: maktukmak Date: Tue, 10 Jun 2025 08:07:57 +0000 Subject: [PATCH 21/31] vectorized mask generation --- vllm/model_executor/models/gemma3.py | 4 +- vllm/model_executor/models/gemma3_mm.py | 104 ++++++++++-------------- 2 files changed, 43 insertions(+), 65 deletions(-) diff --git a/vllm/model_executor/models/gemma3.py b/vllm/model_executor/models/gemma3.py index 6544f5976a4..a930ca35931 100644 --- a/vllm/model_executor/models/gemma3.py +++ b/vllm/model_executor/models/gemma3.py @@ -252,13 +252,11 @@ def naive_attn_with_masks( key = key.transpose(1, 2) value = value.transpose(1, 2) - attn_mask = torch.vstack(attn_masks) - output = F.scaled_dot_product_attention( query, key, value, - attn_mask, + attn_masks, self.scaling, ) diff --git a/vllm/model_executor/models/gemma3_mm.py b/vllm/model_executor/models/gemma3_mm.py index df0fc2f4715..dd2a28cf7c1 100644 --- a/vllm/model_executor/models/gemma3_mm.py +++ b/vllm/model_executor/models/gemma3_mm.py @@ -641,68 +641,48 @@ def prepare_attn_masks( **kwargs, ): kwargs["has_images"] = True - - input_ids = input_ids.flatten() - positions = positions.flatten() - - # NOTE(woosuk): Here, we distinguish the sequences by the position id 0. - # This is a HACK. Fix this. - start_idices = (positions == 0).cpu().nonzero() - num_seqs = len(start_idices) - seq_lens = [] - for i in range(num_seqs): - start_idx = start_idices[i].item() - if i < num_seqs - 1: - end_idx = start_idices[i + 1].item() - else: - end_idx = len(input_ids) - seq_lens.append(end_idx - start_idx) - kwargs["seq_lens"] = seq_lens - - global_attn_masks = [] - local_attn_masks = [] - start_idx = 0 - for seq_len in seq_lens: - end_idx = start_idx + seq_len - input_token_ids = input_ids[start_idx:end_idx] - start_idx = end_idx - # Create a global causal mask. - global_attn_mask = torch.empty( - 1, - 1, - seq_len, - seq_len, - dtype=mask_dtype, - device=input_ids.device, - ) - global_attn_mask.fill_(float("-inf")) - # Fill the lower triangle with 0. 
- global_attn_mask = global_attn_mask.triu(diagonal=1) - - # Consider the bidirectional attention between image tokens. - img_mask = torch.zeros_like(global_attn_mask) - img_pos = (input_token_ids == self.config.image_token_index) # this is doing bidirectional attn between 2 images.. why - img_tokens_cumsum = torch.cumsum(img_pos, 0) - num_imgs = img_tokens_cumsum[-1] // 256 - img_start_pos = torch.arange(0, img_tokens_cumsum[-1], 256)+1 - for i in img_start_pos: - img_mask[:,:,i:i+256, i:i+256] = 2 - #img_mask[:, :, :, img_pos] += 1 - #img_mask[:, :, img_pos, :] += 1 - global_attn_mask = torch.where(img_mask == 2, 0, global_attn_mask) - #breakpoint() - global_attn_masks.append(global_attn_mask) - - if self.sliding_window is not None: - # Create a local causal mask with sliding window (1024). - local_attn_mask = torch.ones_like(global_attn_mask) - local_attn_mask = torch.tril(local_attn_mask, - diagonal=-self.sliding_window) - local_attn_mask = torch.where(local_attn_mask == 0, - global_attn_mask, float("-inf")) - local_attn_masks.append(local_attn_mask) - kwargs["global_attn_masks"] = global_attn_masks - kwargs["local_attn_masks"] = local_attn_masks + IMG_TOKENS = 256 + seq_len = input_ids.shape[1] + bs = input_ids.shape[0] + kwargs["seq_lens"] = [seq_len] * bs + + global_attn_mask = torch.empty( + bs, + 1, + seq_len, + seq_len, + dtype=mask_dtype, + device=input_ids.device, + ) + global_attn_mask.fill_(float("-inf")) + global_attn_mask = global_attn_mask.triu(diagonal=1) + + img_mask = torch.zeros_like(global_attn_mask) + img_pos = (input_ids == self.config.image_token_index) + + img_mask[img_pos.unsqueeze(1)] += 1 + img_mask = img_mask.permute(0,1,3,2) + img_mask[img_pos.unsqueeze(1)] += 1 + img_mask = img_mask.permute(0,1,3,2) + + img_pos_cum = torch.cumsum(img_pos, 1) + img_causal = torch.arange(seq_len, device = input_ids.device).unsqueeze(0) - img_pos_cum + (img_pos_cum//IMG_TOKENS + 1) * IMG_TOKENS + 1 + img_causal = torch.cat((img_causal[:,0:1]-1, img_causal[:,:-1]), dim=1) + img_causal = img_causal.clamp_(min=0, max=seq_len-1).unsqueeze(1).unsqueeze(3) + ind = torch.arange(seq_len, device=input_ids.device).unsqueeze(0).unsqueeze(1).unsqueeze(2) + img_mask[ind < img_causal] += 1 + global_attn_mask = torch.where(img_mask == 3, 0, global_attn_mask) + + if self.sliding_window is not None: + # Create a local causal mask with sliding window (1024). + local_attn_mask = torch.ones_like(global_attn_mask) + local_attn_mask = torch.tril(local_attn_mask, + diagonal=-self.sliding_window) + local_attn_mask = torch.where(local_attn_mask == 0, + global_attn_mask, float("-inf")) + + kwargs["global_attn_masks"] = global_attn_mask + kwargs["local_attn_masks"] = local_attn_mask return kwargs def compute_logits( From 000b4e0e77895d266cae5c83661ae07b12215909 Mon Sep 17 00:00:00 2001 From: Sayantan Sarkar Date: Wed, 11 Jun 2025 09:26:35 -0700 Subject: [PATCH 22/31] Revert "wrap vision and projector in hpu graphs" This reverts commit c9c575728a8996366264fc9174e5afdbc460dc3f. 
--- vllm/model_executor/models/utils.py | 2 -- vllm/worker/hpu_model_runner.py | 14 +------------- 2 files changed, 1 insertion(+), 15 deletions(-) diff --git a/vllm/model_executor/models/utils.py b/vllm/model_executor/models/utils.py index 8f5dfa87175..b022832effc 100644 --- a/vllm/model_executor/models/utils.py +++ b/vllm/model_executor/models/utils.py @@ -392,8 +392,6 @@ def _merge_multimodal_embeddings( # skip check for HPU, the number of tokens is a cpu fallback during HPU lazy if current_platform.is_hpu(): flattened = _flatten_embeddings(multimodal_embeddings) - #TODO dynamic.. maybe torch.where? however multimodal_embeddings is a list of varying length - # still.. torch.where migth be faster than boolean indexing? inputs_embeds[is_multimodal] = flattened return inputs_embeds diff --git a/vllm/worker/hpu_model_runner.py b/vllm/worker/hpu_model_runner.py index 821d0208544..d9e8452ed65 100755 --- a/vllm/worker/hpu_model_runner.py +++ b/vllm/worker/hpu_model_runner.py @@ -90,10 +90,6 @@ DUMMY_TOKEN_ID = -1 - -def is_gemma3(model): - return 'Gemma3ForConditionalGeneration' in str(type(model)) - class Singleton(type): _instances: Dict[type, object] = {} @@ -301,14 +297,6 @@ def __init__(self, model, vllm_config, layer_names, is_causal, sampler): self.is_causal = is_causal self.use_merged_prefill = VLLM_MERGED_PREFILL - # TODO : right now just enabling it keeping gemma3 in mind - if htorch.utils.internal.is_lazy() and is_gemma3(self.model): - logger.info("[Multimodal] Wrapping Visual Model") - self.model.vision_tower = htorch.hpu.wrap_in_hpu_graph( - self.model.vision_tower, disable_tensor_cache=True) - self.model.multi_modal_projector = htorch.hpu.wrap_in_hpu_graph( - self.model.multi_modal_projector, disable_tensor_cache=True) - # copying from PR 1163 # needs cleanup/unified approach later def compute_input_embeddings_for_gemma(self, **kwargs): @@ -2996,7 +2984,7 @@ def try_revert_dummy_output_tokens(): 'real_batch_size': real_batch_size } - if is_gemma3(self.model.model): + if 'Gemma3ForConditionalGeneration' in str(type(self.model.model)): inputs_embeds = \ self.model.compute_input_embeddings_for_gemma( **execute_model_kwargs From 658442d9721a6f0dc2e739af9ba4179a6df57b69 Mon Sep 17 00:00:00 2001 From: Sayantan Sarkar Date: Wed, 11 Jun 2025 09:26:53 -0700 Subject: [PATCH 23/31] Revert "Do vision+combining before text mdoel fwd" This reverts commit a29d5379606ba4e93b0b7d0d94798aceed8a995a. --- gemma3_offline_dyn.py | 4 ++-- vllm/model_executor/models/gemma3_mm.py | 1 - vllm/worker/hpu_model_runner.py | 28 ------------------------- 3 files changed, 2 insertions(+), 31 deletions(-) diff --git a/gemma3_offline_dyn.py b/gemma3_offline_dyn.py index f5d13245b6e..39b7cd0ea05 100644 --- a/gemma3_offline_dyn.py +++ b/gemma3_offline_dyn.py @@ -14,8 +14,8 @@ "What is the content of each image? Once done, write a story that combines them all.", # 183 "You are an AI designed to generate extremely long, detailed worldbuilding content. Your goal is to write a fictional encyclopedia with at least 4000 words of content. Do not stop early. Start by describing a fictional planet in detail. Include: \n1. Geography and climate zones (with rich, varied description).\n2. The history of all civilizations, from ancient to modern times.\n3. Cultures, belief systems, and mythologies along with rich detail about where such beliefs came from.\n4. Political structures and conflicts along with their history.\n5. 
Technology and magic systems (if any) spanning the last 1000 years, highlighting significant discoveries and figures.\n6. Major historical events and characters along with their geneology.\n\n Be descriptive, verbose, and never summarize. Write in a factual tone like an academic encyclopedia. Begin your entry below:", -# 273 -'''Here is a short story: It contains some animals as its main characters. Rewrite this story by replacing the animals in this story with any of the animals shown in the images. Do not change the contents of the story, just the characters. What animals are shown in the images?: One day a shepherd discovered a fat Pig in the meadow where his Sheep were pastured. He very quickly captured the porker, which squealed at the top of its voice the moment the Shepherd laid his hands on it. You would have thought, to hear the loud squealing, that the Pig was being cruelly hurt. But in spite of its squeals and struggles to escape, the Shepherd tucked his prize under his arm and started off to the butcher's in the market place. The Sheep in the pasture were much astonished and amused at the Pig's behavior, and followed the Shepherd and his charge to the pasture gate. "What makes you squeal like that?" asked one of the Sheep. "The Shepherd often catches and carries off one of us. But we should feel very much ashamed to make such a terrible fuss about it like you do." "That is all very well," replied the Pig, with a squeal and a frantic kick. "When he catches you he is only after your wool. But he wants my bacon! gree-ee-ee!"''' +# 265 +'''Here is a short story: It contains some animals as its main characters. Rewrite this story by replacing the animals in this story with any of the animals shown in the images. Do not change the contents of the story, just the characters: One day a shepherd discovered a fat Pig in the meadow where his Sheep were pastured. He very quickly captured the porker, which squealed at the top of its voice the moment the Shepherd laid his hands on it. You would have thought, to hear the loud squealing, that the Pig was being cruelly hurt. But in spite of its squeals and struggles to escape, the Shepherd tucked his prize under his arm and started off to the butcher's in the market place. The Sheep in the pasture were much astonished and amused at the Pig's behavior, and followed the Shepherd and his charge to the pasture gate. "What makes you squeal like that?" asked one of the Sheep. "The Shepherd often catches and carries off one of us. But we should feel very much ashamed to make such a terrible fuss about it like you do." "That is all very well," replied the Pig, with a squeal and a frantic kick. "When he catches you he is only after your wool. But he wants my bacon! gree-ee-ee!"''' # 611 "Here is a list of creatures. 
Analyse the images and if there are creatures present in the images and the pick the closest creature from this list for each creature in the images: African Elephant, Bengal Tiger, Arctic Fox, Blue Whale, Brown Bear, Cheetah, Cougar, Dingo, Dolphin, Elk, Flying Fox, Giraffe, Gorilla, Grizzly Bear, Hedgehog, Hippopotamus, Hyena, Indian Elephant, Jaguar, Kangaroo, Koala, Lemur, Leopard, Lion, Lynx, Manatee, Mole, Moose, Mountain Goat, Narwhal, Okapi, Orangutan, Otter, Panda, Platypus, Polar Bear, Porcupine, Possum, Prairie Dog, Puma, Quokka, Rabbit, Raccoon, Red Panda, Reindeer, Rhinoceros, Sea Lion, Seal, Sheep, Skunk, Sloth, Squirrel, Tapir, Tasmanian Devil, Walrus, Weasel, Whale, Wild Boar, Wombat, Yak, Zebra, Albatross, American Robin, Bald Eagle, Barn Owl, Blue Jay, Budgerigar, Canary, Cardinal, Cassowary, Chickadee, Cockatoo, Cormorant, Crane, Crow, Cuckoo, Dove, Duck, Eagle, Egret, Falcon, Finch, Flamingo, Goldfinch, Goose, Great Horned Owl, Gull, Hawk, Heron, Hummingbird, Ibis, Jay, Kestrel, Kingfisher, Kiwi, Lark, Macaw, Magpie, Mockingbird, Nightingale, Nuthatch, Oriole, Ostrich, Owl, Parrot, Partridge, Peacock, Pelican, Penguin, Peregrine Falcon, Pigeon, Puffin, Quail, Raven, Roadrunner, Robin, Rooster, Sparrow, Starling, Stork, Swallow, Swan, Toucan, Turkey, Vulture, Warbler, Woodpecker, Wren, Angelfish, Anglerfish, Barracuda, Betta Fish, Blue Tang, Catfish, Clownfish, Cod, Eel, Flounder, Flying Fish, Goldfish, Grouper, Guppy, Haddock, Halibut, Hammerhead Shark, Herring, Jellyfish, Koi, Lionfish, Lobster, Mackerel, Manta Ray, Marlin, Moray Eel, Octopus, Orca, Piranha, Pufferfish, Rainbow Trout, Salmon, Sardine, Seahorse, Shark, Shrimp, Squid, Starfish, Stingray, Swordfish, Tilapia, Tuna, Walrus, Whale Shark, Zebra Fish, Alligator, Anole, Boa Constrictor, Box Turtle, Chameleon, Cobra, Crocodile, Frog, Gecko, Gila Monster, Green Iguana, Komodo Dragon, Lizard, Monitor Lizard, Newt, Python, Rattlesnake, Salamander, Sea Turtle, Skink, Snake, Toad, Tortoise, Tree Frog, Viper, Ant, Bee, Beetle, Butterfly, Centipede, Cicada, Cricket, Dragonfly, Earthworm, Firefly, Grasshopper, Ladybug, Leech, Millipede, Moth, Praying Mantis, Scorpion, Snail, Spider, Termite, Tick, Wasp" ] diff --git a/vllm/model_executor/models/gemma3_mm.py b/vllm/model_executor/models/gemma3_mm.py index dd2a28cf7c1..5fad293b9ef 100644 --- a/vllm/model_executor/models/gemma3_mm.py +++ b/vllm/model_executor/models/gemma3_mm.py @@ -611,7 +611,6 @@ def forward(self, # NOTE: In v1, inputs_embeds is always generated at model runner, this # condition is for v0 compatibility. 
elif inputs_embeds is None: - assert False, "hpu_model_runner should be computing inputs_embeds" vision_embeddings = self.get_multimodal_embeddings(**kwargs) inputs_embeds = self.get_input_embeddings(input_ids, diff --git a/vllm/worker/hpu_model_runner.py b/vllm/worker/hpu_model_runner.py index d9e8452ed65..f1b8bb24fdf 100755 --- a/vllm/worker/hpu_model_runner.py +++ b/vllm/worker/hpu_model_runner.py @@ -297,22 +297,6 @@ def __init__(self, model, vllm_config, layer_names, is_causal, sampler): self.is_causal = is_causal self.use_merged_prefill = VLLM_MERGED_PREFILL - # copying from PR 1163 - # needs cleanup/unified approach later - def compute_input_embeddings_for_gemma(self, **kwargs): - - # todo may or may not be needed for gemma3, check - compile_only_mode_context_false = functools.partial( - bc.env_setting, "PT_COMPILE_ONLY_MODE", False) - - input_ids = kwargs['input_ids'] - #with compile_only_mode_context_false(): - vision_embeddings = self.model.get_multimodal_embeddings(**kwargs) - inputs_embeds = self.model.get_input_embeddings(input_ids, - vision_embeddings) - - return inputs_embeds - def _set_attn_bias(self, attn_metadata, batch_size, seq_len, device, dtype): if (attn_metadata is None @@ -2984,18 +2968,6 @@ def try_revert_dummy_output_tokens(): 'real_batch_size': real_batch_size } - if 'Gemma3ForConditionalGeneration' in str(type(self.model.model)): - inputs_embeds = \ - self.model.compute_input_embeddings_for_gemma( - **execute_model_kwargs - ) - execute_model_kwargs.update({ - 'inputs_embeds': inputs_embeds, - }) - # done compute the visual tokens - execute_model_kwargs.pop('pixel_values', None) - - with self.profiler.record_event('internal', model_event_name, args=profiler_args): From 61a3e2f9af0da3f491cdf34bc78765d3177c18a8 Mon Sep 17 00:00:00 2001 From: Sayantan Sarkar Date: Wed, 11 Jun 2025 12:48:30 -0700 Subject: [PATCH 24/31] Fixing the earlier commit which was reverted --- vllm/model_executor/models/gemma3_mm.py | 1 + vllm/worker/hpu_model_runner.py | 45 +++++++++++++++++++++++++ 2 files changed, 46 insertions(+) diff --git a/vllm/model_executor/models/gemma3_mm.py b/vllm/model_executor/models/gemma3_mm.py index 5fad293b9ef..dd2a28cf7c1 100644 --- a/vllm/model_executor/models/gemma3_mm.py +++ b/vllm/model_executor/models/gemma3_mm.py @@ -611,6 +611,7 @@ def forward(self, # NOTE: In v1, inputs_embeds is always generated at model runner, this # condition is for v0 compatibility. 
elif inputs_embeds is None: + assert False, "hpu_model_runner should be computing inputs_embeds" vision_embeddings = self.get_multimodal_embeddings(**kwargs) inputs_embeds = self.get_input_embeddings(input_ids, diff --git a/vllm/worker/hpu_model_runner.py b/vllm/worker/hpu_model_runner.py index f1b8bb24fdf..8de4dfcc591 100755 --- a/vllm/worker/hpu_model_runner.py +++ b/vllm/worker/hpu_model_runner.py @@ -297,6 +297,45 @@ def __init__(self, model, vllm_config, layer_names, is_causal, sampler): self.is_causal = is_causal self.use_merged_prefill = VLLM_MERGED_PREFILL + + # copying from PR 1163 + # needs cleanup/unified approach later + def compute_input_embeddings_for_gemma(self, **kwargs): + + if 'inputs_embeds' in kwargs: + print('do nothing') + return kwargs + + # todo may or may not be needed for gemma3, check + compile_only_mode_context_false = functools.partial( + bc.env_setting, "PT_COMPILE_ONLY_MODE", False) + + + input_ids = kwargs['input_ids'] + # + #with compile_only_mode_context_false(): + vision_embeddings = self.model.get_multimodal_embeddings(**kwargs) + inputs_embeds = self.model.get_input_embeddings(input_ids, + vision_embeddings) + + if vision_embeddings is not None: + print('vision_embeddings is not None') + #breakpoint() + input_ids = kwargs['input_ids'] + positions = kwargs['positions'] + kwargs = self.model.prepare_attn_masks( + mask_dtype=self.dtype, + **kwargs, + ) + kwargs['input_ids'] = input_ids + kwargs['positions'] = positions + #input_ids = None + + kwargs.update({'inputs_embeds': inputs_embeds}) + # done compute the visual tokens + kwargs.pop('pixel_values', None) + return kwargs + def _set_attn_bias(self, attn_metadata, batch_size, seq_len, device, dtype): if (attn_metadata is None @@ -462,6 +501,7 @@ def forward(self, *args, **kwargs): virtual_engine = 0 if 'virtual_engine' in kwargs: virtual_engine = kwargs.pop('virtual_engine') + input_ids = kwargs['input_ids'] kwargs['attn_metadata'] = self._update_metadata( kwargs['attn_metadata'], input_ids.size(0), input_ids.size(1), @@ -2968,6 +3008,11 @@ def try_revert_dummy_output_tokens(): 'real_batch_size': real_batch_size } + if 'Gemma3ForConditionalGeneration' in str(type(self.model.model)): + execute_model_kwargs = \ + self.model.compute_input_embeddings_for_gemma( + **execute_model_kwargs + ) with self.profiler.record_event('internal', model_event_name, args=profiler_args): From 39e0f520118d4bf6fe13b608e51ebddf6b1939f4 Mon Sep 17 00:00:00 2001 From: Sayantan Sarkar Date: Wed, 11 Jun 2025 12:55:52 -0700 Subject: [PATCH 25/31] bring back reverted commit --- vllm/model_executor/models/utils.py | 2 ++ vllm/worker/hpu_model_runner.py | 12 +++++++++++- 2 files changed, 13 insertions(+), 1 deletion(-) diff --git a/vllm/model_executor/models/utils.py b/vllm/model_executor/models/utils.py index b022832effc..8f5dfa87175 100644 --- a/vllm/model_executor/models/utils.py +++ b/vllm/model_executor/models/utils.py @@ -392,6 +392,8 @@ def _merge_multimodal_embeddings( # skip check for HPU, the number of tokens is a cpu fallback during HPU lazy if current_platform.is_hpu(): flattened = _flatten_embeddings(multimodal_embeddings) + #TODO dynamic.. maybe torch.where? however multimodal_embeddings is a list of varying length + # still.. torch.where migth be faster than boolean indexing? 
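        # (Illustrative note, assuming the shapes used above: a dense in-place
        #  alternative to the boolean-indexing assignment below would be
        #      inputs_embeds.masked_scatter_(is_multimodal.unsqueeze(-1), flattened)
        #  torch.where is not a drop-in replacement here, since it needs a source
        #  tensor already laid out in the full inputs_embeds shape.)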
inputs_embeds[is_multimodal] = flattened return inputs_embeds diff --git a/vllm/worker/hpu_model_runner.py b/vllm/worker/hpu_model_runner.py index 8de4dfcc591..09cf3727f36 100755 --- a/vllm/worker/hpu_model_runner.py +++ b/vllm/worker/hpu_model_runner.py @@ -98,6 +98,8 @@ def __call__(cls, *args, **kwargs): cls._instances[cls] = super().__call__(*args, **kwargs) return cls._instances[cls] +def is_gemma3(model): + return 'Gemma3ForConditionalGeneration' in str(type(model)) class PhaseType(Enum): PREFILL = 'prefill' @@ -297,6 +299,14 @@ def __init__(self, model, vllm_config, layer_names, is_causal, sampler): self.is_causal = is_causal self.use_merged_prefill = VLLM_MERGED_PREFILL + # TODO : right now just enabling it keeping gemma3 in mind + if htorch.utils.internal.is_lazy() and is_gemma3(self.model): + logger.info("[Multimodal] Wrapping Visual Model") + self.model.vision_tower = htorch.hpu.wrap_in_hpu_graph( + self.model.vision_tower, disable_tensor_cache=True) + self.model.multi_modal_projector = htorch.hpu.wrap_in_hpu_graph( + self.model.multi_modal_projector, disable_tensor_cache=True) + # copying from PR 1163 # needs cleanup/unified approach later @@ -3008,7 +3018,7 @@ def try_revert_dummy_output_tokens(): 'real_batch_size': real_batch_size } - if 'Gemma3ForConditionalGeneration' in str(type(self.model.model)): + if is_gemma3(self.model.model): execute_model_kwargs = \ self.model.compute_input_embeddings_for_gemma( **execute_model_kwargs From 661f59ac0a9d3c0c322219e946f2c2a176a7113b Mon Sep 17 00:00:00 2001 From: Jimin Ha Date: Fri, 20 Jun 2025 20:19:22 -0700 Subject: [PATCH 26/31] Fix accuracy issue with repeat words for long prompts --- vllm/attention/backends/hpu_attn.py | 13 ------------- 1 file changed, 13 deletions(-) diff --git a/vllm/attention/backends/hpu_attn.py b/vllm/attention/backends/hpu_attn.py index 56f71e755e5..b25ee46d8b4 100644 --- a/vllm/attention/backends/hpu_attn.py +++ b/vllm/attention/backends/hpu_attn.py @@ -541,19 +541,6 @@ def forward( block_mapping = attn_metadata.block_mapping if not self.sliding_window else attn_metadata.window_block_mapping attn_bias = attn_metadata.attn_bias if not self.sliding_window else attn_metadata.window_attn_bias - if self.sliding_window: - block_size = len(attn_metadata.block_groups) - window_block = (self.sliding_window // block_size) - valid_block = (attn_metadata.block_groups == 0).sum() - - # Create a mask to retain elements within the sliding window and exclude others. 
- rng = torch.arange(block_size, device='hpu') - mask = torch.logical_and(rng > 0, rng < valid_block-window_block+1) - - block_groups= torch.where(mask, torch.tensor(-1), attn_metadata.block_groups) - block_mapping= torch.where(mask.unsqueeze(1), torch.tensor(0.0), attn_metadata.block_mapping) - block_list= torch.where(mask, torch.tensor(0), attn_metadata.block_list) - output = HPUPagedAttention.forward_decode( query=query, block_mapping=block_mapping, From affc7a73262dd4f0408c024eaa4c21d5cb50d66f Mon Sep 17 00:00:00 2001 From: Jimin Ha Date: Mon, 23 Jun 2025 13:33:54 -0700 Subject: [PATCH 27/31] Change parameter check for intereleaved sliding_window --- vllm/worker/hpu_model_runner.py | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) diff --git a/vllm/worker/hpu_model_runner.py b/vllm/worker/hpu_model_runner.py index 09cf3727f36..2b88e6fdc8e 100755 --- a/vllm/worker/hpu_model_runner.py +++ b/vllm/worker/hpu_model_runner.py @@ -307,7 +307,6 @@ def __init__(self, model, vllm_config, layer_names, is_causal, sampler): self.model.multi_modal_projector = htorch.hpu.wrap_in_hpu_graph( self.model.multi_modal_projector, disable_tensor_cache=True) - # copying from PR 1163 # needs cleanup/unified approach later def compute_input_embeddings_for_gemma(self, **kwargs): @@ -760,6 +759,10 @@ def __init__( self.sliding_window = (self.model_config.get_sliding_window() if self.model_config is not None else None) + + self.interleaved_sliding_window = getattr(self.model_config.hf_text_config, + "interleaved_sliding_window", None) + self.device_config = (self.device_config if self.device_config is not None else DeviceConfig()) if is_fake_hpu(): @@ -1278,7 +1281,6 @@ def _prepare_prompt( seq_lens.append(seq_len) # NOTE: This only works for oooooooxxx style attention. - #import pdb;pdb.set_trace() if computed_block_nums is not None and len( computed_block_nums) > 0 and self.sliding_window is None: # Prefix is not supported with sliding_window @@ -1618,7 +1620,6 @@ def _prepare_decode( lora_index_mapping.append(lora_id) lora_prompt_mapping.append(lora_id) - #logger.info(f"Decode: sliding_window:{self.sliding_window}, blocksize:{self.block_size}, block_table:{block_table}") if self.sliding_window is not None: sliding_window_blocks = (self.sliding_window // self.block_size) @@ -1627,8 +1628,9 @@ def _prepare_decode( #TODO: There are many places which checks this config parameter, however this is #very specific config to gemma3, we should first check if this parameter even exist before check. 
- if self.model_config.hf_text_config.interleaved_sliding_window is not None: - sliding_window_blocks = (self.model_config.hf_text_config.interleaved_sliding_window // + #This is for the models which use interleaved sliding window such as gemma3 + if self.interleaved_sliding_window is not None: + sliding_window_blocks = (self.interleaved_sliding_window // self.block_size) window_block_table = block_table[-sliding_window_blocks:] window_block_tables.append(window_block_table) @@ -1663,7 +1665,7 @@ def _prepare_decode( assert len(block_list) == len(block_groups) assert len(block_list) == len(block_usage) - if self.model_config.hf_text_config.interleaved_sliding_window is not None: + if self.interleaved_sliding_window is not None: window_block_groups = [[i] * len(bt) for i, bt in enumerate(window_block_tables)] window_block_usage = [[self.block_size] * (len(bt) - 1) + [lbu] for bt, lbu in zip(block_tables, last_block_usage) @@ -1715,7 +1717,7 @@ def _prepare_decode( indices[bid] = i padding_fn = lambda tensor, pad_value: gather_list( tensor, indices, pad_value) - if self.model_config.hf_text_config.interleaved_sliding_window is not None: + if self.interleaved_sliding_window is not None: window_indices: List[Any] window_indices = [None] * block_bucket_size for i, bid in enumerate(window_block_list): @@ -1733,7 +1735,7 @@ def _prepare_decode( block_groups = padding_fn(block_groups, -1) block_usage = padding_fn(block_usage, 1) - if self.model_config.hf_text_config.interleaved_sliding_window is not None: + if self.interleaved_sliding_window is not None: window_block_list = window_padding_fn(window_block_list, _PAD_BLOCK_ID) window_block_groups = window_padding_fn(window_block_groups, -1) #window_block_usage = window_padding_fn(window_block_usage, 1) @@ -1816,7 +1818,7 @@ def _prepare_decode( encoder_seq_lens_tensor = encoder_seq_lens_tensor.to( # type: ignore self.device, non_blocking=True) - if self.model_config.hf_text_config.interleaved_sliding_window is not None: + if self.interleaved_sliding_window is not None: window_block_list = torch.tensor(window_block_list, dtype=torch.int, device='cpu') window_block_groups = torch.tensor(window_block_groups, dtype=torch.int, From 994de89fa7d32065df8a52b2d4ae7166331e47b0 Mon Sep 17 00:00:00 2001 From: Jimin Ha Date: Mon, 23 Jun 2025 13:34:28 -0700 Subject: [PATCH 28/31] Remove all test files --- config0.json | 3 - config1.json | 5 - gemma3_offline.py | 211 ------------------------------------------ gemma3_offline.sh | 18 ---- gemma3_offline_dyn.py | 152 ------------------------------ gemma3_offline_dyn.sh | 22 ----- 6 files changed, 411 deletions(-) delete mode 100644 config0.json delete mode 100644 config1.json delete mode 100644 gemma3_offline.py delete mode 100755 gemma3_offline.sh delete mode 100644 gemma3_offline_dyn.py delete mode 100755 gemma3_offline_dyn.sh diff --git a/config0.json b/config0.json deleted file mode 100644 index 5306999163f..00000000000 --- a/config0.json +++ /dev/null @@ -1,3 +0,0 @@ -[ -[{"prompt": 0, "images" : [0,1,2]}, {"prompt": 0, "images": [3,4,5]}] -] diff --git a/config1.json b/config1.json deleted file mode 100644 index cc7590d1278..00000000000 --- a/config1.json +++ /dev/null @@ -1,5 +0,0 @@ -[ -[{"prompt": 1}, {"prompt": 0, "images": [0,1,2]}], -[{"prompt": 0, "images" : [0,2]}], -[{"prompt": 2, "images" : [1]}] -] diff --git a/gemma3_offline.py b/gemma3_offline.py deleted file mode 100644 index ba582090672..00000000000 --- a/gemma3_offline.py +++ /dev/null @@ -1,211 +0,0 @@ -# Example adapted from: 
https://docs.vllm.ai/en/latest/getting_started/examples/vision_language_multi_image.html -#eg PT_HPUGRAPH_DISABLE_TENSOR_CACHE=false VLLM_PROMPT_BS_BUCKET_MIN=1 VLLM_PROMPT_BS_BUCKET_STEP=1 VLLM_PROMPT_BS_BUCKET_MAX=1 VLLM_PROMPT_SEQ_BUCKET_MIN=384 VLLM_PROMPT_SEQ_BUCKET_MAX=384 VLLM_DECODE_BS_BUCKET_MIN=1 VLLM_DECODE_BS_BUCKET_MAX=1 VLLM_DECODE_BLOCK_BUCKET_MIN=512 VLLM_DECODE_BLOCK_BUCKET_MAX=512 python multi_image_example.py --model google/gemma-3-27b-it --tensor-parallel-size 1 --num-images 1 -# SPDX-License-Identifier: Apache-2.0 -""" -This example shows how to use vLLM for running offline inference with -multi-image input on vision language models for text generation, -using the chat template defined by the model. -""" -import os -from argparse import Namespace -from dataclasses import asdict -from typing import NamedTuple, Optional - -from huggingface_hub import snapshot_download -from PIL import Image -from transformers import AutoProcessor, AutoTokenizer - -from vllm import LLM, EngineArgs, SamplingParams -from vllm.lora.request import LoRARequest -from vllm.multimodal.utils import fetch_image -from vllm.utils import FlexibleArgumentParser -import numpy as np -"""Images are: -1. Medical form -2. Duck -3. Lion -4. Blue Bird -5. Whale -6. Starfish -7. Snail -8. Bee on Purple Flower -9. 2 Dogs -10. Orange Cat -11. Gerbil -12. Rabbit -13. Horse and foal -""" -IMAGE_URLS = [ - "https://upload.wikimedia.org/wikipedia/commons/d/da/2015_Kaczka_krzy%C5%BCowka_w_wodzie_%28samiec%29.jpg", - "https://upload.wikimedia.org/wikipedia/commons/7/77/002_The_lion_king_Snyggve_in_the_Serengeti_National_Park_Photo_by_Giles_Laurent.jpg", - "https://upload.wikimedia.org/wikipedia/commons/2/26/Ultramarine_Flycatcher_%28Ficedula_superciliaris%29_Naggar%2C_Himachal_Pradesh%2C_2013_%28cropped%29.JPG", - "https://upload.wikimedia.org/wikipedia/commons/thumb/e/e5/Anim1754_-_Flickr_-_NOAA_Photo_Library_%281%29.jpg/2560px-Anim1754_-_Flickr_-_NOAA_Photo_Library_%281%29.jpg", - "https://upload.wikimedia.org/wikipedia/commons/d/d4/Starfish%2C_Caswell_Bay_-_geograph.org.uk_-_409413.jpg", - "https://upload.wikimedia.org/wikipedia/commons/6/69/Grapevinesnail_01.jpg", - "https://upload.wikimedia.org/wikipedia/commons/thumb/0/0b/Texas_invasive_Musk_Thistle_1.jpg/1920px-Texas_invasive_Musk_Thistle_1.jpg", - "https://upload.wikimedia.org/wikipedia/commons/thumb/7/7a/Huskiesatrest.jpg/2880px-Huskiesatrest.jpg", - "https://upload.wikimedia.org/wikipedia/commons/thumb/6/68/Orange_tabby_cat_sitting_on_fallen_leaves-Hisashi-01A.jpg/1920px-Orange_tabby_cat_sitting_on_fallen_leaves-Hisashi-01A.jpg", - "https://upload.wikimedia.org/wikipedia/commons/3/30/George_the_amazing_guinea_pig.jpg", - "https://upload.wikimedia.org/wikipedia/commons/thumb/1/1f/Oryctolagus_cuniculus_Rcdo.jpg/1920px-Oryctolagus_cuniculus_Rcdo.jpg", - "https://upload.wikimedia.org/wikipedia/commons/9/98/Horse-and-pony.jpg", -] - - -class ModelRequestData(NamedTuple): - engine_args: EngineArgs - prompt: str - image_data: list[Image.Image] - stop_token_ids: Optional[list[int]] = None - chat_template: Optional[str] = None - lora_requests: Optional[list[LoRARequest]] = None - - -# NOTE: The default `max_num_seqs` and `max_model_len` may result in OOM on -# lower-end GPUs. -# Unless specified, these settings have been tested to work on a single L4. 
- -def load_model(model_name: str, tp_size: int, max_model_len:int, question: str, batch_size:int, image_urls: list[str]) -> ModelRequestData: - assert len(image_urls) % batch_size == 0 - engine_args = EngineArgs( - model=model_name, - max_model_len=max_model_len, - max_num_batched_tokens=max_model_len, - max_num_seqs=len(image_urls), - tensor_parallel_size=tp_size, - #gpu_memory_utilization=0.9, - enforce_eager=False, - limit_mm_per_prompt={"image": int(len(image_urls)/batch_size)}, - ) - - processor = AutoProcessor.from_pretrained(model_name) - if batch_size==1: - placeholders = [{"type": "image", "image": (url)} for url in image_urls] - messages = [{ - "role": - "user", - "content": [ - *placeholders, - { - "type": "text", - "text": question - }, - ], - }] - prompt = processor.apply_chat_template(messages,tokenize=False,add_generation_prompt=True) - #breakpoint() - # [Image.open('jr.png').convert("RGB")] - requests = {"prompt":prompt,"multi_modal_data":{"image":[fetch_image(url) if type(url)==str else url for url in image_urls]}} - - else: - chunks = np.array_split(image_urls, batch_size) - requests = [] - for chunk in chunks: - print("chunk....", chunk) - placeholders = [{"type": "image", "image": (url)} for url in chunk] - messages = [{ - "role": - "user", - "content": [ - *placeholders, - { - "type": "text", - "text": question - }, - ], - }] - prompt = processor.apply_chat_template(messages,tokenize=False,add_generation_prompt=True) - import numpy - - requests.append({"prompt":prompt,"multi_modal_data":{"image":[fetch_image(url) if (type(url)==str) or (type(url)==numpy.str_) else url for url in chunk]}}) - #breakpoint() - #print() - return engine_args,requests - -def run_generate(model_name: str, tp_size: int, max_model_len: int, question: str, batch_size: int, image_urls: list[str]): - engine_args,requests = load_model(model_name, tp_size, max_model_len, question, batch_size, image_urls) - - engine_args = asdict(engine_args) - #breakpoint() - llm = LLM(**engine_args) - sampling_params = SamplingParams(temperature=0.0, - max_tokens=8192) - - #breakpoint() - outputs = llm.generate(requests, - sampling_params=sampling_params - ) - print("-" * 50) - for o in outputs: - generated_text = o.outputs[0].text - print(len(o.outputs[0].token_ids)) - print(generated_text) - print("-*." 
* 50) - -def parse_args(): - parser = FlexibleArgumentParser( - description='Demo on using vLLM for offline inference with ' - 'vision language models that support multi-image input for text ' - 'generation') - parser.add_argument('--model-name', - '-m', - type=str, - default="google/gemma-3-4b-it", - choices=['google/gemma-3-4b-it','google/gemma-3-27b-it'], - help='Huggingface "model_type".') - parser.add_argument('--tensor-parallel-size', - '-tp', - type=int, - default=1, - help='tensor parallel size.') - parser.add_argument( - "--num-images", - "-n", - type=int, - choices=list(range(0, - len(IMAGE_URLS)+1)), # the max number of images - default=2, - help="Number of images to use for the demo.") - parser.add_argument( - "--batch-size", - "-b", - type=int, - default=1, - help="Batches in which the images will be sent.") - parser.add_argument('--max-model-len', - '-ml', - type=int, - default=8192, - help='Max-Model-Len.') - return parser.parse_args() - - -def main(args: Namespace): - model = args.model_name - tp_size = args.tensor_parallel_size - max_model_len = args.max_model_len - - - image_urls = [IMAGE_URLS[idx % len(IMAGE_URLS)] for idx, i in enumerate(range(args.num_images * args.batch_size))] - - ''' - if args.num_images==1: - QUESTION = "Extract all information from the provided image and provide it in a json format:" - #batch_size=1 - elif args.num_images==0: - QUESTION = "You are an AI designed to generate extremely long, detailed worldbuilding content. Your goal is to write a fictional encyclopedia with at least 4000 words of content. Do not stop early. Start by describing a fictional planet in detail. Include: \n1. Geography and climate zones (with rich, varied description).\n2. The history of all civilizations, from ancient to modern times.\n3. Cultures, belief systems, and mythologies along with rich detail about where such beliefs came from.\n4. Political structures and conflicts along with their history.\n5. Technology and magic systems (if any) spanning the last 1000 years, highlighting significant discoveries and figures.\n6. Major historical events and characters along with their geneology.\n\n Be descriptive, verbose, and never summarize. Write in a factual tone like an academic encyclopedia. Begin your entry below:" - #batch_size=1 - else: - QUESTION = "What is the content of each image? Once done, write a story that combines them all." - ''' - if args.num_images==0: - QUESTION = "You are an AI designed to generate extremely long, detailed worldbuilding content. Your goal is to write a fictional encyclopedia with at least 4000 words of content. Do not stop early. Start by describing a fictional planet in detail. Include: \n1. Geography and climate zones (with rich, varied description).\n2. The history of all civilizations, from ancient to modern times.\n3. Cultures, belief systems, and mythologies along with rich detail about where such beliefs came from.\n4. Political structures and conflicts along with their history.\n5. Technology and magic systems (if any) spanning the last 1000 years, highlighting significant discoveries and figures.\n6. Major historical events and characters along with their geneology.\n\n Be descriptive, verbose, and never summarize. Write in a factual tone like an academic encyclopedia. Begin your entry below:" - else: - QUESTION = "What is the content of each image? Once done, write a story that combines them all." 
- batch_size = args.batch_size - - run_generate(model, tp_size, max_model_len, QUESTION, batch_size, image_urls) - - -if __name__ == "__main__": - args = parse_args() - main(args) diff --git a/gemma3_offline.sh b/gemma3_offline.sh deleted file mode 100755 index d060835c980..00000000000 --- a/gemma3_offline.sh +++ /dev/null @@ -1,18 +0,0 @@ -export PT_HPUGRAPH_DISABLE_TENSOR_CACHE=false -export VLLM_PROMPT_BS_BUCKET_MIN=1 -export VLLM_PROMPT_BS_BUCKET_STEP=1 -export VLLM_PROMPT_BS_BUCKET_MA=1 -export VLLM_PROMPT_SEQ_BUCKET_MIN=384 -export VLLM_PROMPT_SEQ_BUCKET_MAX=384 -export VLLM_DECODE_BS_BUCKET_MIN=1 -export VLLM_DECODE_BS_BUCKET_MAX=1 -export VLLM_DECODE_BLOCK_BUCKET_MIN=512 -export VLLM_DECODE_BLOCK_BUCKET_MAX=512 -export VLLM_USE_V1=0 - - -export PT_HPU_LAZY_MODE=1 -#export VLLM_FP32_SOFTMAX=1 -#export VLLM_PROMPT_USE_FUSEDSDPA=False - -python gemma3_offline.py --model google/gemma-3-4b-it --tensor-parallel-size 1 --num-images 12 --batch-size 1 diff --git a/gemma3_offline_dyn.py b/gemma3_offline_dyn.py deleted file mode 100644 index 39b7cd0ea05..00000000000 --- a/gemma3_offline_dyn.py +++ /dev/null @@ -1,152 +0,0 @@ -import torch -from vllm.multimodal.utils import fetch_image -import json -from argparse import Namespace -from vllm.utils import FlexibleArgumentParser -from vllm import LLM, EngineArgs, SamplingParams -import argparse -from dataclasses import asdict -from transformers import AutoProcessor, AutoTokenizer - - -PROMPTS = [ -# 19 -"What is the content of each image? Once done, write a story that combines them all.", -# 183 -"You are an AI designed to generate extremely long, detailed worldbuilding content. Your goal is to write a fictional encyclopedia with at least 4000 words of content. Do not stop early. Start by describing a fictional planet in detail. Include: \n1. Geography and climate zones (with rich, varied description).\n2. The history of all civilizations, from ancient to modern times.\n3. Cultures, belief systems, and mythologies along with rich detail about where such beliefs came from.\n4. Political structures and conflicts along with their history.\n5. Technology and magic systems (if any) spanning the last 1000 years, highlighting significant discoveries and figures.\n6. Major historical events and characters along with their geneology.\n\n Be descriptive, verbose, and never summarize. Write in a factual tone like an academic encyclopedia. Begin your entry below:", -# 265 -'''Here is a short story: It contains some animals as its main characters. Rewrite this story by replacing the animals in this story with any of the animals shown in the images. Do not change the contents of the story, just the characters: One day a shepherd discovered a fat Pig in the meadow where his Sheep were pastured. He very quickly captured the porker, which squealed at the top of its voice the moment the Shepherd laid his hands on it. You would have thought, to hear the loud squealing, that the Pig was being cruelly hurt. But in spite of its squeals and struggles to escape, the Shepherd tucked his prize under his arm and started off to the butcher's in the market place. The Sheep in the pasture were much astonished and amused at the Pig's behavior, and followed the Shepherd and his charge to the pasture gate. "What makes you squeal like that?" asked one of the Sheep. "The Shepherd often catches and carries off one of us. But we should feel very much ashamed to make such a terrible fuss about it like you do." 
"That is all very well," replied the Pig, with a squeal and a frantic kick. "When he catches you he is only after your wool. But he wants my bacon! gree-ee-ee!"''' -# 611 -"Here is a list of creatures. Analyse the images and if there are creatures present in the images and the pick the closest creature from this list for each creature in the images: African Elephant, Bengal Tiger, Arctic Fox, Blue Whale, Brown Bear, Cheetah, Cougar, Dingo, Dolphin, Elk, Flying Fox, Giraffe, Gorilla, Grizzly Bear, Hedgehog, Hippopotamus, Hyena, Indian Elephant, Jaguar, Kangaroo, Koala, Lemur, Leopard, Lion, Lynx, Manatee, Mole, Moose, Mountain Goat, Narwhal, Okapi, Orangutan, Otter, Panda, Platypus, Polar Bear, Porcupine, Possum, Prairie Dog, Puma, Quokka, Rabbit, Raccoon, Red Panda, Reindeer, Rhinoceros, Sea Lion, Seal, Sheep, Skunk, Sloth, Squirrel, Tapir, Tasmanian Devil, Walrus, Weasel, Whale, Wild Boar, Wombat, Yak, Zebra, Albatross, American Robin, Bald Eagle, Barn Owl, Blue Jay, Budgerigar, Canary, Cardinal, Cassowary, Chickadee, Cockatoo, Cormorant, Crane, Crow, Cuckoo, Dove, Duck, Eagle, Egret, Falcon, Finch, Flamingo, Goldfinch, Goose, Great Horned Owl, Gull, Hawk, Heron, Hummingbird, Ibis, Jay, Kestrel, Kingfisher, Kiwi, Lark, Macaw, Magpie, Mockingbird, Nightingale, Nuthatch, Oriole, Ostrich, Owl, Parrot, Partridge, Peacock, Pelican, Penguin, Peregrine Falcon, Pigeon, Puffin, Quail, Raven, Roadrunner, Robin, Rooster, Sparrow, Starling, Stork, Swallow, Swan, Toucan, Turkey, Vulture, Warbler, Woodpecker, Wren, Angelfish, Anglerfish, Barracuda, Betta Fish, Blue Tang, Catfish, Clownfish, Cod, Eel, Flounder, Flying Fish, Goldfish, Grouper, Guppy, Haddock, Halibut, Hammerhead Shark, Herring, Jellyfish, Koi, Lionfish, Lobster, Mackerel, Manta Ray, Marlin, Moray Eel, Octopus, Orca, Piranha, Pufferfish, Rainbow Trout, Salmon, Sardine, Seahorse, Shark, Shrimp, Squid, Starfish, Stingray, Swordfish, Tilapia, Tuna, Walrus, Whale Shark, Zebra Fish, Alligator, Anole, Boa Constrictor, Box Turtle, Chameleon, Cobra, Crocodile, Frog, Gecko, Gila Monster, Green Iguana, Komodo Dragon, Lizard, Monitor Lizard, Newt, Python, Rattlesnake, Salamander, Sea Turtle, Skink, Snake, Toad, Tortoise, Tree Frog, Viper, Ant, Bee, Beetle, Butterfly, Centipede, Cicada, Cricket, Dragonfly, Earthworm, Firefly, Grasshopper, Ladybug, Leech, Millipede, Moth, Praying Mantis, Scorpion, Snail, Spider, Termite, Tick, Wasp" -] - - -IMAGE_URLS = [ - "https://upload.wikimedia.org/wikipedia/commons/d/da/2015_Kaczka_krzy%C5%BCowka_w_wodzie_%28samiec%29.jpg", - "https://upload.wikimedia.org/wikipedia/commons/7/77/002_The_lion_king_Snyggve_in_the_Serengeti_National_Park_Photo_by_Giles_Laurent.jpg", - "https://upload.wikimedia.org/wikipedia/commons/2/26/Ultramarine_Flycatcher_%28Ficedula_superciliaris%29_Naggar%2C_Himachal_Pradesh%2C_2013_%28cropped%29.JPG", - "https://upload.wikimedia.org/wikipedia/commons/thumb/e/e5/Anim1754_-_Flickr_-_NOAA_Photo_Library_%281%29.jpg/2560px-Anim1754_-_Flickr_-_NOAA_Photo_Library_%281%29.jpg", - "https://upload.wikimedia.org/wikipedia/commons/d/d4/Starfish%2C_Caswell_Bay_-_geograph.org.uk_-_409413.jpg", - "https://upload.wikimedia.org/wikipedia/commons/6/69/Grapevinesnail_01.jpg", - "https://upload.wikimedia.org/wikipedia/commons/thumb/0/0b/Texas_invasive_Musk_Thistle_1.jpg/1920px-Texas_invasive_Musk_Thistle_1.jpg", - "https://upload.wikimedia.org/wikipedia/commons/thumb/7/7a/Huskiesatrest.jpg/2880px-Huskiesatrest.jpg", - 
"https://upload.wikimedia.org/wikipedia/commons/thumb/6/68/Orange_tabby_cat_sitting_on_fallen_leaves-Hisashi-01A.jpg/1920px-Orange_tabby_cat_sitting_on_fallen_leaves-Hisashi-01A.jpg", - "https://upload.wikimedia.org/wikipedia/commons/3/30/George_the_amazing_guinea_pig.jpg", - "https://upload.wikimedia.org/wikipedia/commons/thumb/1/1f/Oryctolagus_cuniculus_Rcdo.jpg/1920px-Oryctolagus_cuniculus_Rcdo.jpg", - "https://upload.wikimedia.org/wikipedia/commons/9/98/Horse-and-pony.jpg", -] - - - - -def parse_args(): - parser = argparse.ArgumentParser(description='Demo on using vLLM for offline inference with ' - 'vision language models that support multi-image input for text ' - 'generation') - parser.add_argument('--model-name', - '-m', - type=str, - default="google/gemma-3-4b-it", - choices=['google/gemma-3-4b-it','google/gemma-3-27b-it'], - help='Huggingface "model_type".') - parser.add_argument('--tensor-parallel-size', - '-tp', - type=int, - default=1, - help='tensor parallel size.') - parser.add_argument( - "--batchconfig", - "-b", - type=str, - required=True, - help=''' - #Sample json input - batch0_bs2 = [{"prompt": 1}, {"prompt": 0, "images": [0,1,2]}] - batch0_bs3 = [{"prompt": 2, "images" : [6,7,8]}, {"prompt": 0, "images": [3,2,1,0]}, {"prompt": 3, "images": [4]}] - inputs = [batch0_bs2, batch0_bs3] - - so inp json should be: - [ - [{"prompt": 1}, {"prompt": 0, "images": [0,1,2]}], - [{"prompt": 2, "images" : [6,7,8]}, {"prompt": 0, "images": [3,2,1,0]}, {"prompt": 3, "images": [4]}] - ] - ''') - parser.add_argument('--max-model-len', - '-ml', - type=int, - default=8192, - help='Max-Model-Len.') - return parser.parse_args() - - -def make_model(model_name, max_model_len, tp_size, max_num_seqs, limit_mm_per_prompt): - engine_args = EngineArgs( - model=model_name, - max_model_len=max_model_len, - max_num_batched_tokens=max_model_len, - max_num_seqs=max_num_seqs, - tensor_parallel_size=tp_size, - #gpu_memory_utilization=0.9, - enforce_eager=False, - limit_mm_per_prompt={"image": limit_mm_per_prompt}, - ) - engine_args = asdict(engine_args) - llm = LLM(**engine_args) - processor = AutoProcessor.from_pretrained(model_name) - return llm, processor - - -def create_inp_from_batchconfig(processor, batch): - requests = [] - for prompt in batch: - placeholders = [{"type": "image", "image": (url)} for url in prompt.get('images', [])] - messages = [{ - "role": - "user", - "content": [ - *placeholders, - { - "type": "text", - "text": PROMPTS[prompt["prompt"]] - }, - ], - }] - final_prompt = processor.apply_chat_template(messages,tokenize=False,add_generation_prompt=True) - requests.append({"prompt":final_prompt,"multi_modal_data":{"image":[fetch_image(IMAGE_URLS[urlid]) for urlid in prompt.get('images', [])]}}) - return requests - - -def run_generate(llm, processor, batch): - requests = create_inp_from_batchconfig(processor, batch) - sampling_params = SamplingParams(temperature=0.0, - max_tokens=8192) - - outputs = llm.generate(requests, - sampling_params=sampling_params - ) - print("-" * 50) - for o in outputs: - generated_text = o.outputs[0].text - print(len(o.outputs[0].token_ids)) - print(generated_text) - print("-*." * 50) - - -def main(args: Namespace): - with open(args.batchconfig) as f: - config = json.load(f) - - limit_mm_per_prompt = max(max([len(prompt.get("images", [])) for prompt in batch]) for batch in config) - max_num_seqs = max(sum([len(prompt.get("images", [])) for prompt in batch]) for batch in config) # TODO: this one not sure if this is what it means? 
- - llm, processor = make_model(args.model_name, args.max_model_len, args.tensor_parallel_size, max_num_seqs=max_num_seqs, limit_mm_per_prompt=limit_mm_per_prompt) - for batchidx, batch in enumerate(config): - run_generate(llm, processor, batch) - print(f'Done batch {batchidx}, of bs={len(batch)}. config: {batch}') - - -if __name__ == "__main__": - args = parse_args() - main(args) - - diff --git a/gemma3_offline_dyn.sh b/gemma3_offline_dyn.sh deleted file mode 100755 index 5f31a0cd31f..00000000000 --- a/gemma3_offline_dyn.sh +++ /dev/null @@ -1,22 +0,0 @@ -export PT_HPUGRAPH_DISABLE_TENSOR_CACHE=false -export VLLM_PROMPT_BS_BUCKET_MIN=1 -export VLLM_PROMPT_BS_BUCKET_STEP=1 -export VLLM_PROMPT_BS_BUCKET_MA=1 -export VLLM_PROMPT_SEQ_BUCKET_MIN=384 -export VLLM_PROMPT_SEQ_BUCKET_MAX=384 -export VLLM_DECODE_BS_BUCKET_MIN=1 -export VLLM_DECODE_BS_BUCKET_MAX=1 -export VLLM_DECODE_BLOCK_BUCKET_MIN=512 -export VLLM_DECODE_BLOCK_BUCKET_MAX=512 -export VLLM_USE_V1=0 - - -export PT_HPU_LAZY_MODE=1 -#export VLLM_FP32_SOFTMAX=1 -#export VLLM_PROMPT_USE_FUSEDSDPA=False - - -export VLLM_SKIP_WARMUP=true - - -python gemma3_offline_dyn.py --model google/gemma-3-4b-it --tensor-parallel-size 1 --batchconfig config1.json \ No newline at end of file From 805df557543af097075280c3e04b503ae90a7c8f Mon Sep 17 00:00:00 2001 From: Jimin Ha Date: Mon, 23 Jun 2025 14:53:54 -0700 Subject: [PATCH 29/31] Fix error from merge --- vllm/attention/backends/hpu_attn.py | 3 ++- vllm/worker/hpu_model_runner.py | 3 --- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/vllm/attention/backends/hpu_attn.py b/vllm/attention/backends/hpu_attn.py index 2dc917c47b7..7de4d3936c1 100644 --- a/vllm/attention/backends/hpu_attn.py +++ b/vllm/attention/backends/hpu_attn.py @@ -745,7 +745,7 @@ def _make_prompt_alibi_bias( seq_len, # Directly use seq_len instead of padded_len device=alibi_slopes.device, dtype=dtype, - )[:, :, :, :seq_len].copy_(bias) + ) # Copy the bias matrix into each head per_head_bias[:, :] = bias @@ -755,6 +755,7 @@ def _make_prompt_alibi_bias( return per_head_bias + def _make_decode_alibi_bias( alibi_blocks: torch.Tensor, alibi_slopes: torch.Tensor, diff --git a/vllm/worker/hpu_model_runner.py b/vllm/worker/hpu_model_runner.py index 8d4404aa858..52a42a6683e 100755 --- a/vllm/worker/hpu_model_runner.py +++ b/vllm/worker/hpu_model_runner.py @@ -1840,9 +1840,6 @@ def _prepare_decode( block_table = block_table[-sliding_window_blocks:] block_tables.append(block_table) - #TODO: There are many places which checks this config parameter, however this is - #very specific config to gemma3, we should first check if this parameter even exist before check. 
- #This is for the models which use interleaved sliding window such as gemma3 if self.interleaved_sliding_window is not None: sliding_window_blocks = (self.interleaved_sliding_window // self.block_size) From d531412fb73d4ce90423368ef21de0384197e1e6 Mon Sep 17 00:00:00 2001 From: Jimin Ha Date: Mon, 23 Jun 2025 15:51:07 -0700 Subject: [PATCH 30/31] Fix pre-commit errors --- vllm/attention/backends/hpu_attn.py | 23 +++++---- vllm/model_executor/models/gemma3.py | 15 +++--- vllm/model_executor/models/gemma3_mm.py | 21 +++++--- vllm/worker/hpu_model_runner.py | 68 +++++++++++++++---------- 4 files changed, 73 insertions(+), 54 deletions(-) diff --git a/vllm/attention/backends/hpu_attn.py b/vllm/attention/backends/hpu_attn.py index 7de4d3936c1..60b203df87a 100644 --- a/vllm/attention/backends/hpu_attn.py +++ b/vllm/attention/backends/hpu_attn.py @@ -8,7 +8,6 @@ from dataclasses import dataclass from typing import Any, Dict, List, Optional, Tuple, Type -import math import torch import vllm_hpu_extension.kernels as kernels import vllm_hpu_extension.ops as ops @@ -550,14 +549,15 @@ def forward( and attn_metadata.block_list is not None else None common_args = self.common_attention_args(block_list, key_cache, - value_cache, attn_metadata.block_size) + value_cache, + attn_metadata.block_size) #TODO: Ideally we want to create this sliding_window_bias mask only #once in the model_runner or gemma model file then only retrieve here. if self.sliding_window: - attn_bias = _make_sliding_window_bias(batch_size, seq_len, - attn_metadata.seq_lens_tensor, - self.sliding_window, query.dtype) + attn_bias = _make_sliding_window_bias( + batch_size, seq_len, attn_metadata.seq_lens_tensor, + self.sliding_window, query.dtype) common_args['pad'] = 'left' out = ops.prompt_attention( @@ -598,8 +598,8 @@ def forward( block_bias=attn_bias, block_groups=block_groups, position_bias=self.position_bias, - **self.common_attention_args(block_list, - key_cache, value_cache, + **self.common_attention_args(block_list, key_cache, + value_cache, attn_metadata.block_size)) # Reshape the output tensor. return output.view(batch_size, seq_len, hidden_size) @@ -791,11 +791,12 @@ def _make_decode_alibi_bias( return per_head_bias + def _make_sliding_window_bias( batch_size: int, seq_len: int, query_lens_t: torch.tensor, - window_size:int, + window_size: int, dtype: torch.dtype, ) -> torch.Tensor: @@ -806,11 +807,13 @@ def _make_sliding_window_bias( # once FusedSDPA kernel with sliding causal mask support is available. # causal + sliding window (LEFT PADDING) - tensor = torch.full((batch_size, 1, seq_len, seq_len), device=device,dtype=dtype, fill_value=1) + tensor = torch.full((batch_size, 1, seq_len, seq_len), + device=device, + dtype=dtype, + fill_value=1) mask = torch.tril(tensor, diagonal=shift) mask = torch.triu(mask, diagonal=shift - window_size + 1) attn_bias = torch.log(mask) - ''' # TODO Accuracy issue need to be debugged. 
# causal + sliding window + query_len (LEFT PADDING : Need kernel supports) diff --git a/vllm/model_executor/models/gemma3.py b/vllm/model_executor/models/gemma3.py index a930ca35931..255d200556b 100644 --- a/vllm/model_executor/models/gemma3.py +++ b/vllm/model_executor/models/gemma3.py @@ -234,7 +234,6 @@ def naive_attn_with_masks( **kwargs, ) -> torch.Tensor: - s = q.shape[1] num_queries_per_kv = self.num_heads // self.num_kv_heads query = q.view(-1, s, self.num_heads, self.head_dim) @@ -253,13 +252,13 @@ def naive_attn_with_masks( value = value.transpose(1, 2) output = F.scaled_dot_product_attention( - query, - key, - value, - attn_masks, - self.scaling, - ) - + query, + key, + value, + attn_masks, + self.scaling, + ) + out = output.transpose(1, 2).flatten(-2, -1) return out diff --git a/vllm/model_executor/models/gemma3_mm.py b/vllm/model_executor/models/gemma3_mm.py index dd2a28cf7c1..fc502eea528 100644 --- a/vllm/model_executor/models/gemma3_mm.py +++ b/vllm/model_executor/models/gemma3_mm.py @@ -661,15 +661,20 @@ def prepare_attn_masks( img_pos = (input_ids == self.config.image_token_index) img_mask[img_pos.unsqueeze(1)] += 1 - img_mask = img_mask.permute(0,1,3,2) + img_mask = img_mask.permute(0, 1, 3, 2) img_mask[img_pos.unsqueeze(1)] += 1 - img_mask = img_mask.permute(0,1,3,2) + img_mask = img_mask.permute(0, 1, 3, 2) img_pos_cum = torch.cumsum(img_pos, 1) - img_causal = torch.arange(seq_len, device = input_ids.device).unsqueeze(0) - img_pos_cum + (img_pos_cum//IMG_TOKENS + 1) * IMG_TOKENS + 1 - img_causal = torch.cat((img_causal[:,0:1]-1, img_causal[:,:-1]), dim=1) - img_causal = img_causal.clamp_(min=0, max=seq_len-1).unsqueeze(1).unsqueeze(3) - ind = torch.arange(seq_len, device=input_ids.device).unsqueeze(0).unsqueeze(1).unsqueeze(2) + img_causal = torch.arange(seq_len, device=input_ids.device).unsqueeze( + 0) - img_pos_cum + (img_pos_cum // IMG_TOKENS + 1) * IMG_TOKENS + 1 + img_causal = torch.cat((img_causal[:, 0:1] - 1, img_causal[:, :-1]), + dim=1) + img_causal = img_causal.clamp_(min=0, max=seq_len - + 1).unsqueeze(1).unsqueeze(3) + ind = torch.arange( + seq_len, + device=input_ids.device).unsqueeze(0).unsqueeze(1).unsqueeze(2) img_mask[ind < img_causal] += 1 global_attn_mask = torch.where(img_mask == 3, 0, global_attn_mask) @@ -677,9 +682,9 @@ def prepare_attn_masks( # Create a local causal mask with sliding window (1024). 
local_attn_mask = torch.ones_like(global_attn_mask) local_attn_mask = torch.tril(local_attn_mask, - diagonal=-self.sliding_window) + diagonal=-self.sliding_window) local_attn_mask = torch.where(local_attn_mask == 0, - global_attn_mask, float("-inf")) + global_attn_mask, float("-inf")) kwargs["global_attn_masks"] = global_attn_mask kwargs["local_attn_masks"] = local_attn_mask diff --git a/vllm/worker/hpu_model_runner.py b/vllm/worker/hpu_model_runner.py index 52a42a6683e..2d8d02ce9d9 100755 --- a/vllm/worker/hpu_model_runner.py +++ b/vllm/worker/hpu_model_runner.py @@ -128,9 +128,11 @@ def __call__(cls, *args, **kwargs): cls._instances[cls] = super().__call__(*args, **kwargs) return cls._instances[cls] + def is_gemma3(model): return 'Gemma3ForConditionalGeneration' in str(type(model)) + def pad_flat_tensor(tensor, desired_size): assert tensor.dim() == 1, 'Only flat tensors are supported' padding_needed = desired_size - tensor.size(0) @@ -376,13 +378,12 @@ def compute_input_embeddings_for_gemma(self, **kwargs): compile_only_mode_context_false = functools.partial( bc.env_setting, "PT_COMPILE_ONLY_MODE", False) - input_ids = kwargs['input_ids'] # #with compile_only_mode_context_false(): vision_embeddings = self.model.get_multimodal_embeddings(**kwargs) - inputs_embeds = self.model.get_input_embeddings(input_ids, - vision_embeddings) + inputs_embeds = self.model.get_input_embeddings( + input_ids, vision_embeddings) if vision_embeddings is not None: print('vision_embeddings is not None') @@ -462,7 +463,8 @@ def _set_attn_bias(self, attn_metadata, batch_size, seq_len, device, attn_bias=attn_bias) return attn_metadata - def _set_block_mapping(self, metadata, batch_size, device, dtype, is_window_block): + def _set_block_mapping(self, metadata, batch_size, device, dtype, + is_window_block): block_usage = metadata.block_usage if not is_window_block else metadata.window_block_usage block_groups = metadata.block_groups if not is_window_block else metadata.window_block_groups @@ -494,9 +496,10 @@ def _set_block_mapping(self, metadata, batch_size, device, dtype, is_window_bloc "TrimmedAttentionMetadata", block_groups=block_groups) else: - metadata = custom_tuple_replace(metadata, - "TrimmedAttentionMetadata", - window_block_groups=block_groups) + metadata = custom_tuple_replace( + metadata, + "TrimmedAttentionMetadata", + window_block_groups=block_groups) block_mapping = block_mapping.to(dtype) if not is_window_block: @@ -506,9 +509,9 @@ def _set_block_mapping(self, metadata, batch_size, device, dtype, is_window_bloc attn_bias=attn_bias) else: metadata = custom_tuple_replace(metadata, - "TrimmedAttentionMetadata", - window_block_mapping=block_mapping, - window_attn_bias=attn_bias) + "TrimmedAttentionMetadata", + window_block_mapping=block_mapping, + window_attn_bias=attn_bias) return metadata def forward_update_meta_only(self, *args, **kwargs): @@ -533,9 +536,8 @@ def _update_metadata(self, attn_metadata, batch_size, seq_len, device, attn_metadata = self._set_block_mapping(attn_metadata, batch_size, device, dtype, False) if attn_metadata.window_block_list is not None: - attn_metadata = self._set_block_mapping(attn_metadata, batch_size, - device, dtype, - True) + attn_metadata = self._set_block_mapping(attn_metadata, batch_size, + device, dtype, True) return attn_metadata def _prepare_cos_sin(self, positions): @@ -868,8 +870,9 @@ def __init__( self.sliding_window = (self.model_config.get_sliding_window() if self.model_config is not None else None) - self.interleaved_sliding_window = 
getattr(self.model_config.hf_text_config, - "interleaved_sliding_window", None) + self.interleaved_sliding_window = getattr( + self.model_config.hf_text_config, "interleaved_sliding_window", + None) self.device_config = (self.device_config if self.device_config is not None else DeviceConfig()) @@ -1842,7 +1845,7 @@ def _prepare_decode( if self.interleaved_sliding_window is not None: sliding_window_blocks = (self.interleaved_sliding_window // - self.block_size) + self.block_size) window_block_table = block_table[-sliding_window_blocks:] window_block_tables.append(window_block_table) @@ -1877,10 +1880,12 @@ def _prepare_decode( assert len(block_list) == len(block_usage) if self.interleaved_sliding_window is not None: - window_block_groups = [[i] * len(bt) for i, bt in enumerate(window_block_tables)] - window_block_usage = [[self.block_size] * (len(bt) - 1) + [lbu] - for bt, lbu in zip(block_tables, last_block_usage) - if bt] + window_block_groups = [[i] * len(bt) + for i, bt in enumerate(window_block_tables)] + window_block_usage = [ + [self.block_size] * (len(bt) - 1) + [lbu] + for bt, lbu in zip(block_tables, last_block_usage) if bt + ] window_block_list = flatten(window_block_tables) window_block_groups = flatten(window_block_groups) @@ -1961,10 +1966,15 @@ def _prepare_decode( block_usage = padding_fn(block_usage, 1) if self.interleaved_sliding_window is not None: - window_block_list = window_padding_fn(window_block_list, _PAD_BLOCK_ID) + window_block_list = window_padding_fn(window_block_list, + _PAD_BLOCK_ID) window_block_groups = window_padding_fn(window_block_groups, -1) #window_block_usage = window_padding_fn(window_block_usage, 1) - window_block_usage = [1 if i == 0 else block_usage[idx] for idx, (i, j) in enumerate(zip(window_block_list, block_usage))] + window_block_usage = [ + 1 if i == 0 else block_usage[idx] + for idx, (i, + j) in enumerate(zip(window_block_list, block_usage)) + ] if is_enc_dec_model: if self.use_contiguous_pa: @@ -2058,16 +2068,18 @@ def _prepare_decode( self.device, non_blocking=True) if self.interleaved_sliding_window is not None: - window_block_list = torch.tensor(window_block_list, dtype=torch.int, device='cpu') + window_block_list = torch.tensor(window_block_list, + dtype=torch.int, + device='cpu') window_block_groups = torch.tensor(window_block_groups, - dtype=torch.int, - device='cpu') + dtype=torch.int, + device='cpu') window_block_usage = torch.tensor(window_block_usage, - dtype=self.model_config.dtype, - device='cpu') + dtype=self.model_config.dtype, + device='cpu') window_block_list = window_block_list.to( # type: ignore - self.device, non_blocking=True) + self.device, non_blocking=True) window_block_groups = window_block_groups.to( # type: ignore self.device, non_blocking=True) window_block_usage = window_block_usage.to( # type: ignore From f99d76a286c876e577f08fdfab4ab4309498e95f Mon Sep 17 00:00:00 2001 From: Jimin Ha Date: Mon, 23 Jun 2025 17:21:29 -0700 Subject: [PATCH 31/31] Pre-commit fix for the list warning --- vllm/worker/hpu_model_runner.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/worker/hpu_model_runner.py b/vllm/worker/hpu_model_runner.py index 2d8d02ce9d9..731934bff2f 100755 --- a/vllm/worker/hpu_model_runner.py +++ b/vllm/worker/hpu_model_runner.py @@ -1971,7 +1971,7 @@ def _prepare_decode( window_block_groups = window_padding_fn(window_block_groups, -1) #window_block_usage = window_padding_fn(window_block_usage, 1) window_block_usage = [ - 1 if i == 0 else block_usage[idx] + [1] if i == 0 else 
[block_usage[idx]] for idx, (i, j) in enumerate(zip(window_block_list, block_usage)) ]
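_make_sliding_window_bias above builds the sliding-window bias as a band of ones (tril followed by triu) and takes its log, so positions inside the window map to 0 and everything outside maps to -inf. A minimal standalone sketch of that construction, assuming the purely causal case (shift = 0) and an invented helper name:

import torch

def sliding_window_bias(seq_len: int, window_size: int) -> torch.Tensor:
    ones = torch.ones(seq_len, seq_len)
    # Keep only the band i - window_size + 1 <= j <= i ...
    band = torch.tril(ones, diagonal=0)
    band = torch.triu(band, diagonal=-(window_size - 1))
    # ... then log() turns it into an additive bias: log(1) = 0, log(0) = -inf.
    return torch.log(band)

print(sliding_window_bias(seq_len=6, window_size=3))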