
Commit eb49ded

fixing CI
Summary: Test Plan: Reviewers: Subscribers: Tasks: Tags:
1 parent bd9f7d9 commit eb49ded

6 files changed (+33, -17 lines)

torchao/_models/mixtral-moe/README.md

Lines changed: 6 additions & 1 deletion
@@ -1,3 +1,8 @@
-This is the benchmarking setup primarily used for testing quantized moe. You can reproduce the above numbers by running
+## Mixtral-MoE
+
+This folder contains code and scripts for benchmarking the Mixtral-MoE model.
+
+Running
 
 `sh scripts/prepare.sh`
+
+should download the model and `sh run.sh` will run the benchmarks.

torchao/_models/mixtral-moe/generate.py

Lines changed: 9 additions & 5 deletions
@@ -208,7 +208,6 @@ def main(
     assert checkpoint_path.is_file(), checkpoint_path
     tokenizer_path = checkpoint_path.parent / "tokenizer.model"
     assert tokenizer_path.is_file(), str(tokenizer_path)
-    print(f"Memory used: {torch.cuda.max_memory_reserved() / 1e9:.02f} GB")
     print(f"Using device={device}")
     precision = torch.bfloat16
     is_chat = "chat" in str(checkpoint_path)
@@ -220,10 +219,10 @@ def main(
 
     print("Loading model ...")
     t0 = time.time()
-    model = _load_model(checkpoint_path, device, precision)
+    model = _load_model(checkpoint_path, "cpu", precision)
 
-    device_sync(device=device) # MKG
     print(f"Time to load model: {time.time() - t0:.02f} seconds")
+    t0 = time.time()
 
     tokenizer = SentencePieceProcessor(model_file=str(tokenizer_path))
     encoded = encode_tokens(tokenizer, prompt, bos=True, device=device)
@@ -299,7 +298,12 @@ def main(
 
     if config is not None:
         quantize_(model, config, filter_fn=cond_ffn_filter)
-        torch.cuda.reset_peak_memory_stats()
+        print(f"Time to apply quantization to model: {time.time() - t0:.02f} seconds")
+
+    model.to(device=device)
+    device_sync(device=device)
+
+    print(f"C: {torch.cuda.max_memory_reserved() / 1e9:.02f} GB")
 
     if compile:
         # moe quant + compile causes repeated warnings
@@ -382,7 +386,7 @@ def callback(x):
 
         if not interactive:
            pass
-            print(tokenizer.decode(y[0].tolist()))
+            # print(tokenizer.decode(y[0].tolist()))
         else:
            print()
         tokens_generated = y.size(-1) - prompt_length
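In the change above, the checkpoint is loaded on CPU (`_load_model(checkpoint_path, "cpu", precision)`), quantized in place with `quantize_(model, config, filter_fn=cond_ffn_filter)`, and only then moved to the target device. Below is a minimal, self-contained sketch of that load-on-CPU, quantize, then move flow; the toy two-layer model, the `isinstance` filter, and `Int8WeightOnlyConfig` are assumptions standing in for the script's `_load_model`, `cond_ffn_filter`, and the selected `--moe_quant` config.

```python
import torch
import torch.nn as nn
from torchao.quantization import Int8WeightOnlyConfig, quantize_

device = "cuda" if torch.cuda.is_available() else "cpu"

# Stand-in for _load_model(checkpoint_path, "cpu", precision): build/load the model
# on CPU so the full-precision weights never have to be resident on the GPU.
model = nn.Sequential(nn.Linear(1024, 4096), nn.Linear(4096, 1024))

# Quantize while still on CPU. The real script passes cond_ffn_filter so only the
# expert feed-forward modules are quantized; a plain isinstance check stands in here.
quantize_(model, Int8WeightOnlyConfig(), filter_fn=lambda m, fqn: isinstance(m, nn.Linear))

# Only now move the (already quantized) model to the target device and sync,
# mirroring model.to(device=device) / device_sync(device=device) in the diff.
model = model.to(device=device)
if device == "cuda":
    torch.cuda.synchronize()
    print(f"Peak GPU memory reserved: {torch.cuda.max_memory_reserved() / 1e9:.02f} GB")
```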

torchao/_models/mixtral-moe/model.py

Lines changed: 11 additions & 3 deletions
@@ -395,9 +395,17 @@ def forward(
             .to(torch.int64)
         ) # [T]
 
-        num_tokens_per_expert = torch.histc(
-            expert_indices, bins=self.num_experts + 1, min=-1, max=self.num_experts
-        ) # [E+1] (added leading 0 so can be used for indexing)
+        if not expert_indices.is_cuda: # histc doesn't work on cpu for integers
+            num_tokens_per_expert = torch.bincount(
+                expert_indices.view(-1) + 1, minlength=self.num_experts + 1
+            )
+        else:
+            num_tokens_per_expert = torch.histc(
+                expert_indices,
+                bins=self.num_experts + 1,
+                min=-1,
+                max=self.num_experts,
+            ) # [E+1] (added leading 0 so can be used for indexing)
         cum_tokens_per_expert = num_tokens_per_expert.cumsum(0).to(
             torch.int64
         ) # [E+1]
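The fallback above exists because `torch.histc` rejects integer inputs on CPU; `torch.bincount` over the indices shifted by +1 produces the same `[E+1]` counts, with bin 0 kept as a leading zero so the cumulative sum can be used for indexing. A small sketch of that equivalence follows; the expert count and sample indices are made-up values, and the `.float()` cast on the CUDA path is only there to keep the sketch portable (the real code passes the integer tensor directly).

```python
import torch

num_experts = 8
# One expert id per (token, top-k slot); values lie in [0, num_experts).
expert_indices = torch.tensor([0, 2, 2, 5, 7, 0, 3])

# CPU path: shift ids by +1 so bin 0 stays an intentionally empty leading bin.
cpu_counts = torch.bincount(expert_indices.view(-1) + 1, minlength=num_experts + 1)

if torch.cuda.is_available():
    # CUDA path (the original histc formulation): num_experts + 1 bins over
    # [-1, num_experts] also maps expert id e to bin e + 1.
    cuda_counts = torch.histc(
        expert_indices.cuda().float(),
        bins=num_experts + 1,
        min=-1,
        max=num_experts,
    )
    assert torch.equal(cpu_counts.cuda(), cuda_counts.to(torch.int64))

# The cumulative sum gives per-expert offsets for grouping tokens by expert.
cum_tokens_per_expert = cpu_counts.cumsum(0).to(torch.int64)  # [E+1]
print(cpu_counts, cum_tokens_per_expert)
```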

torchao/_models/mixtral-moe/run.sh

Lines changed: 6 additions & 6 deletions
@@ -1,5 +1,5 @@
 export MODEL_REPO=mistralai/Mixtral-8x7B-Instruct-v0.1
-export CHECKPOINT_PATH=~/checkpoints/
+export CHECKPOINT_PATH=checkpoints/
 
 python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --batch_size 1 --compile
 python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --batch_size 8 --compile
@@ -16,11 +16,11 @@ python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --ba
 python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --batch_size 1 --moe_quant int4wo-base --compile
 python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --batch_size 8 --moe_quant int4wo-base --compile
 
-# EXPERT CHOICE
-# python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --batch_size 1 --moe_quant int8dq --compile
-# # # python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --batch_size 8 --moe_quant int8dq --compile
-# python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --batch_size 1 --moe_quant int8dq-base --compile
-# # # python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --batch_size 8 --moe_quant int8dq-base --compile
+# # EXPERT CHOICE
+# # python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --batch_size 1 --moe_quant int8dq --compile
+# # # # python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --batch_size 8 --moe_quant int8dq --compile
+# # python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --batch_size 1 --moe_quant int8dq-base --compile
+# # # # python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --batch_size 8 --moe_quant int8dq-base --compile
 
 python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --batch_size 1 --moe_quant fp8wo --compile
 python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --batch_size 8 --moe_quant fp8wo --compile
torchao/_models/mixtral-moe/scripts/prepare.sh

Lines changed: 1 addition & 1 deletion
@@ -1,2 +1,2 @@
 python scripts/download.py --repo_id mistralai/Mixtral-8x7B-Instruct-v0.1
-python scripts/convert_hf_checkpoint.py --checkpoint_dir checkpoints/mistralai/Mixtral-8x7B-v0.1
+python scripts/convert_hf_checkpoint.py --checkpoint_dir checkpoints/mistralai/Mixtral-8x7B-Instruct-v0.1

torchao/quantization/prototype/moe_quant/quantizable_moe_modules.py

Lines changed: 0 additions & 1 deletion
@@ -131,7 +131,6 @@ def forward(
             min=-1,
             max=self.num_experts,
         ) # [E+1] (added leading 0 so can be used for indexing)
-        # num_tokens_per_expert = torch.bincount(expert_indices.view(-1)+1, minlength=self.num_experts+1)
         cum_tokens_per_expert = num_tokens_per_expert.cumsum(0).to(
             torch.int64
         ) # [E+1]
