Commit f0dca9a

Bit of cleanup
1 parent f2c53ef commit f0dca9a

File tree

14 files changed: +316 -257 lines


exllamav2/compat.py

Lines changed: 9 additions & 5 deletions
@@ -18,8 +18,10 @@ def pairwise(iterable):
 
 tested_peer_copy = None
 
-def test_gpu_peer_copy(device_a: torch.Device,
-                       device_b: torch.Device):
+def test_gpu_peer_copy(
+    device_a: torch.Device,
+    device_b: torch.Device
+):
     global tested_peer_copy
 
     if tested_peer_copy is None:
@@ -47,9 +49,11 @@ def test_gpu_peer_copy(device_a: torch.Device,
         return False
 
 
-def safe_move_tensor(tensor: torch.Tensor | tuple[torch.Tensor],
-                     device: torch.Device | str | int,
-                     non_blocking = False):
+def safe_move_tensor(
+    tensor: torch.Tensor | tuple[torch.Tensor],
+    device: torch.Device | str | int,
+    non_blocking = False
+):
 
     # Accept tensor or tuple of tensors

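For reference, a minimal usage sketch of the two reformatted helpers. The module path exllamav2.compat follows the file name above; that test_gpu_peer_copy returns a bool and that safe_move_tensor falls back to an indirect copy when peer access fails are assumptions inferred from the names and the cached tested_peer_copy flag, not confirmed by these hunks:

    import torch
    from exllamav2.compat import safe_move_tensor, test_gpu_peer_copy

    # Probe once whether direct GPU-to-GPU copies work between two devices;
    # the hunk shows the result cached in the module-level tested_peer_copy.
    can_peer = test_gpu_peer_copy(torch.device("cuda:0"), torch.device("cuda:1"))

    # Move a tensor, or a tuple of tensors, to a device given as a torch
    # device, string or int, per the signature above.
    x = torch.randn(16, 16, device = "cuda:0")
    y = safe_move_tensor(x, "cuda:1")
    a, b = safe_move_tensor((x, y), 0, non_blocking = True)
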
exllamav2/device.py

Lines changed: 6 additions & 5 deletions
@@ -40,11 +40,12 @@ class ExLlamaV2DeviceContext:
 
     stream: torch.cuda.Stream
 
-    def __init__(self,
-                 model: ExLlamaV2,
-                 device_idx: int,
-                 scratch_bytes: int):
-
+    def __init__(
+        self,
+        model: ExLlamaV2,
+        device_idx: int,
+        scratch_bytes: int
+    ):
         self.model = model
         self.device_idx = device_idx
         self.ready = False

exllamav2/embedding.py

Lines changed: 15 additions & 11 deletions
@@ -20,9 +20,11 @@ class ExLlamaV2Embedding(ExLlamaV2Module):
 
     is_tp: bool
 
-    def __init__(self,
-                 model: ExLlamaV2,
-                 key: str):
+    def __init__(
+        self,
+        model: ExLlamaV2,
+        key: str
+    ):
         super().__init__(model, key)
 
         self.is_tp = False
@@ -93,14 +95,16 @@ def scratch_space(self) -> int:
         return 0
 
 
-    def forward(self,
-                hidden_states: torch.Tensor,
-                cache = None,
-                attn_params: ExLlamaV2Attention.Params = None,
-                past_len = None,
-                intermediates: bool = False,
-                loras = None,
-                **kwargs) -> torch.Tensor | dict[str: torch.Tensor]:
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        cache = None,
+        attn_params: ExLlamaV2Attention.Params = None,
+        past_len = None,
+        intermediates: bool = False,
+        loras = None,
+        **kwargs
+    ) -> torch.Tensor | dict[str: torch.Tensor]:
 
         cfg = self.model.config

exllamav2/ext.py

Lines changed: 45 additions & 41 deletions
@@ -339,19 +339,21 @@ def make_q_matrix(w: dict,
         if "q_group_map" not in w:
             w["q_group_map"] = make_group_map(w["q_groups"], w["q_weight"].shape[0])
 
-        return ext_c.make_q_matrix(w["q_weight"],
-                                   w.get("q_perm", none_tensor),
-                                   w.get("q_invperm", none_tensor),
-                                   w["q_scale"],
-                                   w["q_scale_max"],
-                                   w["q_groups"],
-                                   w["q_group_map"],
-                                   none_tensor,
-                                   none_tensor,
-                                   none_tensor,
-                                   w.get("bias", none_tensor),
-                                   temp_dq,
-                                   max_dq_rows)
+        return ext_c.make_q_matrix(
+            w["q_weight"],
+            w.get("q_perm", none_tensor),
+            w.get("q_invperm", none_tensor),
+            w["q_scale"],
+            w["q_scale_max"],
+            w["q_groups"],
+            w["q_group_map"],
+            none_tensor,
+            none_tensor,
+            none_tensor,
+            w.get("bias", none_tensor),
+            temp_dq,
+            max_dq_rows
+        )
 
     # GPTQ
 
@@ -370,36 +372,38 @@ def make_q_matrix(w: dict,
             w["q_perm"] = torch.empty((w["qweight"].shape[0] * 8,), dtype = torch.short, device = w["qweight"].device)
             w["q_invperm"] = torch.empty_like(w["q_perm"])
 
-            return ext_c.make_q_matrix(w["qweight"],
-                                       w["q_perm"],
-                                       w["q_invperm"],
-                                       none_tensor,
-                                       none_tensor,
-                                       none_tensor,
-                                       none_tensor,
-                                       w["qzeros"],
-                                       w["scales"],
-                                       w["g_idx"].cpu(),
-                                       w.get("bias", none_tensor),
-                                       temp_dq,
-                                       max_dq_rows)
+            return ext_c.make_q_matrix(
+                w["qweight"],
+                w["q_perm"],
+                w["q_invperm"],
+                none_tensor,
+                none_tensor,
+                none_tensor,
+                none_tensor,
+                w["qzeros"],
+                w["scales"],
+                w["g_idx"].cpu(),
+                w.get("bias", none_tensor),
+                temp_dq,
+                max_dq_rows
+            )
 
         # GPTQ without g_idx
 
         else:
 
-            return ext_c.make_q_matrix(w["qweight"],
-                                       none_tensor,
-                                       none_tensor,
-                                       none_tensor,
-                                       none_tensor,
-                                       none_tensor,
-                                       none_tensor,
-                                       w["qzeros"],
-                                       w["scales"],
-                                       none_tensor,
-                                       w.get("bias", none_tensor),
-                                       temp_dq,
-                                       max_dq_rows)
-
-
+            return ext_c.make_q_matrix(
+                w["qweight"],
+                none_tensor,
+                none_tensor,
+                none_tensor,
+                none_tensor,
+                none_tensor,
+                none_tensor,
+                w["qzeros"],
+                w["scales"],
+                none_tensor,
+                w.get("bias", none_tensor),
+                temp_dq,
+                max_dq_rows
+            )

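Read side by side, the three calls differ only in which slots are populated. A sketch of the shared argument layout follows; the slot descriptions are inferred from the dict keys used above, and the extension's own parameter names are not shown in this diff:

    # ext_c.make_q_matrix argument slots, in order:
    #  1  q_weight     packed weights: w["q_weight"] (EXL2) or w["qweight"] (GPTQ)
    #  2  q_perm       row permutation, or none_tensor
    #  3  q_invperm    inverse row permutation, or none_tensor
    #  4  q_scale      EXL2 scale data, none_tensor for GPTQ
    #  5  q_scale_max  EXL2 only, none_tensor for GPTQ
    #  6  q_groups     EXL2 only, none_tensor for GPTQ
    #  7  q_group_map  EXL2 only, none_tensor for GPTQ
    #  8  qzeros       GPTQ zero points, none_tensor for EXL2
    #  9  scales       GPTQ scales, none_tensor for EXL2
    # 10  g_idx        GPTQ act-order index, moved to CPU, none_tensor otherwise
    # 11  bias         w.get("bias", none_tensor) in all three calls
    # 12  temp_dq      dequantization scratch buffer
    # 13  max_dq_rows  row limit, passed through from the caller
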
exllamav2/fasttensors.py

Lines changed: 19 additions & 14 deletions
@@ -52,11 +52,12 @@ class STFile:
     st_context = None
     tensor_remap: dict | None
 
-    def __init__(self,
-                 filename: str,
-                 fast: bool = True,
-                 keymap: list[tuple[str, str]] = None):
-
+    def __init__(
+        self,
+        filename: str,
+        fast: bool = True,
+        keymap: list[tuple[str, str]] = None
+    ):
        global global_stfiles
 
        self.metadata = None
@@ -101,9 +102,11 @@ def __init__(self,
 
 
     @staticmethod
-    def open(filename,
-             fast = True,
-             keymap: list[tuple[str, str]] = None) -> STFile:
+    def open(
+        filename,
+        fast = True,
+        keymap: list[tuple[str, str]] = None
+    ) -> STFile:
         """
         Open safetensors file, scan header and retain handle.
 
@@ -181,12 +184,14 @@ def get_cm(self, device):
         return f
 
 
-    def get_tensor(self,
-                   key: str,
-                   device,
-                   not_fast: bool = False,
-                   cached: bool = False,
-                   out_dtype = None) -> torch.Tensor:
+    def get_tensor(
+        self,
+        key: str,
+        device,
+        not_fast: bool = False,
+        cached: bool = False,
+        out_dtype = None
+    ) -> torch.Tensor:
         global global_tensorcache
 
         torch.cuda.synchronize()

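A short usage sketch of the reformatted API. The file path and tensor key are placeholders; the open behavior is quoted from the docstring above:

    from exllamav2.fasttensors import STFile

    # "Open safetensors file, scan header and retain handle."
    st = STFile.open("/path/to/model.safetensors")

    # Fetch a single tensor onto the target device; not_fast, cached and
    # out_dtype keep the defaults from the signature above.
    w = st.get_tensor("some.tensor.key", device = "cuda:0")
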
exllamav2/headnorm.py

Lines changed: 27 additions & 21 deletions
@@ -21,11 +21,13 @@ class ExLlamaV2HeadNorm(ExLlamaV2Module):
     num_heads: int
 
 
-    def __init__(self,
-                 model: ExLlamaV2,
-                 key: str,
-                 num_heads: int,
-                 head_dim: int):
+    def __init__(
+        self,
+        model: ExLlamaV2,
+        key: str,
+        num_heads: int,
+        head_dim: int
+    ):
         super().__init__(model, key)
 
         self.layernorm = None
@@ -101,14 +103,16 @@ def scratch_space(self) -> int:
         return 0
 
 
-    def forward(self,
-                hidden_states: torch.Tensor,
-                cache = None,
-                attn_params = None,
-                past_len = None,
-                intermediates: bool = False,
-                loras = None,
-                **kwargs) -> torch.Tensor | dict[str: torch.Tensor]:
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        cache = None,
+        attn_params = None,
+        past_len = None,
+        intermediates: bool = False,
+        loras = None,
+        **kwargs
+    ) -> torch.Tensor | dict[str: torch.Tensor]:
 
         norm = torch.empty_like(hidden_states)
         ext_c.head_norm(hidden_states,
@@ -122,14 +126,16 @@ def forward(self,
         else:
             return hidden_states
 
-    def forward_torch(self,
-                      hidden_states: torch.Tensor,
-                      cache = None,
-                      attn_params = None,
-                      past_len = None,
-                      intermediates: bool = False,
-                      loras = None,
-                      **kwargs) -> torch.Tensor | dict[str: torch.Tensor]:
+    def forward_torch(
+        self,
+        hidden_states: torch.Tensor,
+        cache = None,
+        attn_params = None,
+        past_len = None,
+        intermediates: bool = False,
+        loras = None,
+        **kwargs
+    ) -> torch.Tensor | dict[str: torch.Tensor]:
 
         input_dtype = hidden_states.dtype
         hidden_states = hidden_states.to(torch.float32)

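The norm modules touched by this commit share one dispatch pattern: forward hands the tensors to a fused extension kernel (ext_c.head_norm here), while forward_torch is a plain PyTorch reference path that upcasts to float32. A schematic sketch of that split; the kernel call is elided, and the cast back to the input dtype is an assumption suggested by input_dtype being saved, not shown in the hunk:

    import torch

    class NormSketch:
        # Fast path: preallocate the output and let the compiled kernel fill it.
        def forward(self, hidden_states: torch.Tensor, **kwargs) -> torch.Tensor:
            norm = torch.empty_like(hidden_states)
            # ext_c.head_norm(hidden_states, ..., norm)   # fused kernel
            return norm

        # Reference path: same math in eager PyTorch, computed in fp32.
        def forward_torch(self, hidden_states: torch.Tensor, **kwargs) -> torch.Tensor:
            input_dtype = hidden_states.dtype
            hidden_states = hidden_states.to(torch.float32)
            # ... normalization in fp32 ...
            return hidden_states.to(input_dtype)
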
exllamav2/layernorm.py

Lines changed: 27 additions & 21 deletions
@@ -18,9 +18,11 @@ class ExLlamaV2LayerNorm(ExLlamaV2Module):
     variance_epsilon: float
 
 
-    def __init__(self,
-                 model: ExLlamaV2,
-                 key: str):
+    def __init__(
+        self,
+        model: ExLlamaV2,
+        key: str
+    ):
         super().__init__(model, key)
 
         self.layernorm = None
@@ -93,15 +95,17 @@ def scratch_space(self) -> int:
         return 0
 
 
-    def forward(self,
-                hidden_states: torch.Tensor,
-                cache = None,
-                attn_params = None,
-                past_len = None,
-                intermediates: bool = False,
-                loras = None,
-                output_fp32 = False, # TODO:
-                **kwargs) -> torch.Tensor | dict[str: torch.Tensor]:
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        cache = None,
+        attn_params = None,
+        past_len = None,
+        intermediates: bool = False,
+        loras = None,
+        output_fp32 = False, # TODO:
+        **kwargs
+    ) -> torch.Tensor | dict[str: torch.Tensor]:
 
         output_shape = hidden_states.shape
         hidden_states = hidden_states.view(-1, hidden_states.shape[-1])
@@ -120,15 +124,17 @@ def forward(self,
             return hidden_states
 
 
-    def forward_torch(self,
-                      hidden_states: torch.Tensor,
-                      cache = None,
-                      attn_params = None,
-                      past_len = None,
-                      intermediates: bool = False,
-                      loras = None,
-                      output_fp32 = False, # TODO:
-                      **kwargs) -> torch.Tensor | dict[str: torch.Tensor]:
+    def forward_torch(
+        self,
+        hidden_states: torch.Tensor,
+        cache = None,
+        attn_params = None,
+        past_len = None,
+        intermediates: bool = False,
+        loras = None,
+        output_fp32 = False, # TODO:
+        **kwargs
+    ) -> torch.Tensor | dict[str: torch.Tensor]:
 
         hidden_states = self.layernorm(hidden_states)
