Commit f2c908b

Refactor superblock code + add final benchmark / eval scripts (#691)
I tried to pull out as much shared code as possible into utils.py; now both benchmark.py and evaluate.py are single-function files. I also added a block_sparse_weight function to the BlockSparse subclass. We should probably make this a public API before PTC; I might try to turn this into a good-first-task kind of thing. Additionally, fixed a bug so FakeSparsity parametrizations now return a state_dict, so the masks are present in the dumped file.
1 parent 4082008 commit f2c908b

File tree

9 files changed: +237 / -452 lines changed


test/sparsity/test_parametrization.py

Lines changed: 3 additions & 6 deletions
@@ -130,18 +130,15 @@ def test_state_dict_preserved(self):
             model_load.seq[1].parametrizations["weight"].original,
         )

-        # Check the masks are not preserved in the state_dict
-        # We store the state_dicts in the sparsifier, not in the model itself.
-        # TODO: Need to find a clean way of exporting the parametrized model
-        self.assertNotEqual(
+        self.assertEqual(
             model_save.linear.parametrizations["weight"][0].mask,
             model_load.linear.parametrizations["weight"][0].mask,
         )
-        self.assertNotEqual(
+        self.assertEqual(
             model_save.seq[0].parametrizations["weight"][0].mask,
             model_load.seq[0].parametrizations["weight"][0].mask,
         )
-        self.assertNotEqual(
+        self.assertEqual(
             model_save.seq[1].parametrizations["weight"][0].mask,
             model_load.seq[1].parametrizations["weight"][0].mask,
         )

torchao/sparsity/prototype/sparsifier/utils.py

Lines changed: 0 additions & 6 deletions
@@ -128,9 +128,3 @@ def __init__(self, mask):
     def forward(self, x):
         assert self.mask.shape == x.shape
         return self.mask * x
-
-    def state_dict(self, *args, **kwargs):
-        # We don't want to let the parametrizations to save the mask.
-        # That way we make sure that the linear module doesn't store the masks
-        # alongside their parametrizations.
-        return {}
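
With the empty `state_dict()` override removed, the mask of a `FakeSparsity` parametrization travels with the model's own state dict, which is what the updated test above now asserts. A minimal sketch of the new behaviour, assuming (as in the upstream `torch.ao.pruning` version) that `FakeSparsity.__init__` registers `mask` as a buffer:

```
# Sketch only: shows why removing the empty state_dict() override makes
# the masks appear in the saved checkpoint.
import torch
from torch.nn.utils import parametrize
from torchao.sparsity.prototype.sparsifier.utils import FakeSparsity

linear = torch.nn.Linear(4, 4)
mask = (torch.rand(4, 4) > 0.5).float()
parametrize.register_parametrization(linear, "weight", FakeSparsity(mask))

# Previously this dict contained no mask entry; now the buffer is included.
keys = [k for k in linear.state_dict() if "mask" in k]
print(keys)  # expected to contain something like 'parametrizations.weight.0.mask'
```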

torchao/sparsity/prototype/superblock/README.md

Lines changed: 7 additions & 7 deletions
@@ -1,12 +1,12 @@
 # SuperBlock

-SuperBlock combines two techniques for efficient neural network training and inference: Supermask and Block Compressed Sparse Row (BSR).
+SuperBlock combines two techniques for efficient neural network training and inference: Supermask and Block Compressed Sparse Row (BSR).
 The techniques are described in this [blog post](https://pytorch.org/blog/speeding-up-vits/).

 ### Supermask
 [Supermask](https://arxiv.org/abs/2207.00670) is a technique for applying structured sparsity to neural networks using a learned mask. It works by learning a continuous mask (scores) that is applied element-wise to the weights of a neural network layer. The mask scores are learned separately from the weights and are thresholded based on a target sparsity level to obtain a binary mask. The mask determines which weigths are kept and which are pruned, and is learned during training.

-During inference, the binary mask is applied element-wise to the weights, pruning the weights that correspond to a 0 in the mask, resulting in a sparse network that can be efficiently computed.
+During inference, the binary mask is applied element-wise to the weights, pruning the weights that correspond to a 0 in the mask, resulting in a sparse network that can be efficiently computed.

 ### Block compressed Sparse Row Format (BSR)
 [The BSR format](https://pytorch.org/docs/main/sparse.html#sparse-bsr-tensor) is a sparse matrix representation that stores dense sub-blocks of non-zero elements instead of individual non-zero elements. The matrix is divided into equal-sized blocks, and only the non-zero blocks are stored.
@@ -105,7 +105,7 @@ torchrun --nproc_per_node=8 train.py\
     --model vit_b_16 --epochs 300 --batch-size 512 --opt adamw --lr 0.003 --wd 0.3\
     --lr-scheduler cosineannealinglr --lr-warmup-method linear --lr-warmup-epochs 30\
     --lr-warmup-decay 0.033 --amp --label-smoothing 0.11 --mixup-alpha 0.2 --auto-augment ra\
-    --clip-grad-norm 1 --ra-sampler --cutmix-alpha 1.0 --model-ema\
+    --clip-grad-norm 1 --ra-sampler --cutmix-alpha 1.0 --model-ema\
     --sparsity-linear 0.9 --sp-linear-tile-size 32
 ```
 Through this command, we are training a `vit_b_16` with 90% sparsity to linear layers using 32x32 tiles.
@@ -124,7 +124,7 @@ NGPUS=1 # put number of available GPUS here

 * Offline sparsification with BSR:
 ```
-torchrun --nproc_per_node=${NGPUS} evaluate.py --model vit_b_16 --batch-size 256 --sparsity-linear 0.9 --sp-linear-tile-size 32 --weights-path ${MODEL_PATH} --data-path ${IMAGENET_PATH} --sparsify-weights --bsr 32
+python evaluate.py --model vit_b_16 --batch-size 256 --sparsity-linear 0.9 --sp-linear-tile-size 32 --weights-path ${MODEL_PATH} --data-path ${IMAGENET_PATH} --sparsity bsr --bsr 64
 ```
 This command applies 90% sparsity to linear layers using 32x32 tiles, loads the model weights from ${MODEL_PATH}, loads the ImageNet validation set located at the specified path, applies offline sparsification to the weights, and converts the sparse weights to BSR format with a block size of 32. It is recommended to set `--bsr` the same as tile size.
@@ -184,7 +184,7 @@ python benchmark.py --model vit_b_16 \
     --batch-size 256 \
     --sparsity-linear ${SPARSITY} \
     --sp-linear-tile-size ${BLOCK_SIZE} \
-    --sparsify-weights \
+    --sparsity bsr \
     --bsr ${BLOCK_SIZE} \
     --weights-path ./checkpoints/sp${SPARSITY}-ts${BLOCK_SIZE}.pth \
     > /dev/null
@@ -197,7 +197,7 @@ Result:
 ### Evaluate:
 8 x A100 GPUs:
 ```
-torchrun --nproc_per_node=8 evaluate.py --model vit_b_16 --batch-size 256 --sparsity-linear ${SPARSITY} --sp-linear-tile-size ${BLOCK_SIZE} --bsr ${BLOCK_SIZE} --sparsify-weights --weights-path checkpoints/sp${SPARSITY}-ts${BLOCK_SIZE}.pth --data-path ${IMAGENET_PATH}
+torchrun --nproc_per_node=8 evaluate.py --model vit_b_16 --batch-size 256 --sparsity-linear ${SPARSITY} --sp-linear-tile-size ${BLOCK_SIZE} --bsr ${BLOCK_SIZE} --sparsity bsr --weights-path checkpoints/sp${SPARSITY}-ts${BLOCK_SIZE}.pth --data-path ${IMAGENET_PATH}
 ```
 Result:
 ```
@@ -207,7 +207,7 @@ Test: Acc@1 77.644 Acc@5 93.554

 1 x A100 GPUs:
 ```
-torchrun --nproc_per_node=1 evaluate.py --model vit_b_16 --batch-size 256 --sparsity-linear ${SPARSITY} --sp-linear-tile-size ${BLOCK_SIZE} --bsr ${BLOCK_SIZE} --sparsify-weights --weights-path checkpoints/sp${SPARSITY}-ts${BLOCK_SIZE}.pth --data-path ${IMAGENET_PATH}
+torchrun --nproc_per_node=1 evaluate.py --model vit_b_16 --batch-size 256 --sparsity-linear ${SPARSITY} --sp-linear-tile-size ${BLOCK_SIZE} --bsr ${BLOCK_SIZE} --sparsity bsr --weights-path checkpoints/sp${SPARSITY}-ts${BLOCK_SIZE}.pth --data-path ${IMAGENET_PATH}
 ```
 Result:
 ```
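
The README changes above describe BSR only in prose. As a quick illustration (not part of this commit), PyTorch's built-in `to_sparse_bsr` shows what the block layout looks like for a toy weight:

```
# Sketch: dense -> BSR conversion with plain PyTorch, mirroring the README's
# description of storing non-zero blocks rather than individual non-zero elements.
import torch

dense = torch.randn(8, 8)
dense[:, 4:] = 0                        # zero the right half -> one zero block column
bsr = dense.to_sparse_bsr(blocksize=(4, 4))
print(bsr.values().shape)               # (num_nonzero_blocks, 4, 4)
print(bsr.crow_indices(), bsr.col_indices())
```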

torchao/sparsity/prototype/superblock/benchmark.py

Lines changed: 34 additions & 79 deletions
@@ -13,36 +13,10 @@
 import utils
 from torch import nn
 from torch.sparse._triton_ops_meta import optimize_bsr_dense_addmm
+from torchao.sparsity.prototype.superblock.utils import accelerate_with_sparsity, simulate_sparsity
+from torchao.utils import benchmark_model, profiler_runner

-sys.path.append(os.path.join(os.path.dirname(__file__), ".."))
-from supermask import apply_supermask, SupermaskLinear
-from blocksparse import BlockSparseTensor
-from utils import benchmark_inference
-
-
-def apply_sparsity(model):
-    for name, module in model.named_modules():
-        if isinstance(module, SupermaskLinear) and "mlp" in name:
-            module.sparsify_offline()
-
-
-def apply_bsr(model, blocksize):
-    for name, module in model.named_modules():
-        if isinstance(module, torch.nn.Linear) and "mlp" in name:
-            try:
-                module.weight = torch.nn.Parameter(BlockSparseTensor.from_dense(module.weight.data, blocksize))
-                print(f"Converted {name} to bsr format.")
-            except ValueError as e:
-                print(f"Unable to convert weight of {name} to bsr format: {e}")
-
-
-def verify_sparsity(model):
-    for name, module in model.named_modules():
-        if isinstance(module, nn.Linear):
-            total_weights = module.weight.numel()
-            sparse_weights = (module.weight == 0).sum().item()
-            sparsity_percentage = (sparse_weights / total_weights) * 100
-            print(f"Sparsity verified in layer {name}: {sparsity_percentage:.2f}%")
+torch.sparse.SparseSemiStructuredTensor._FORCE_CUTLASS = False

 @torch.inference_mode
 def main(args):
@@ -54,36 +28,26 @@ def main(args):
     torch.backends.cudnn.deterministic = True
     num_classes = 1000

-    dtype = None
-    if args.bfloat16:
-        print("Using bfloat16")
-        dtype = torch.bfloat16
-    elif args.float16:
-        print("Using float16")
-        dtype = torch.float16
+    dtype = getattr(torch, args.dtype)
+    print(f"Using dtype: {dtype}")

+    # BSR kernel tuning
     if args.bsr and args.tune_kernel_params:
         print("Tuning kernel params")
-        assert args.model == "vit_b_16", "--tune-kernel-params only supported for vit-b-16!"
-        optimize_bsr_dense_addmm(3072, 768, 50432, args.bsr, args.bsr, dtype=dtype, sparsity=args.sparsity_linear, verbose=True)
-        optimize_bsr_dense_addmm(768, 3072, 50432, args.bsr, args.bsr, dtype=dtype, sparsity=args.sparsity_linear, verbose=True)
+        if args.model == "vit_b_16":
+            optimize_bsr_dense_addmm(3072, 768, 50432, args.bsr, args.bsr, dtype=dtype, sparsity=args.sparsity_linear, verbose=True)
+            optimize_bsr_dense_addmm(768, 3072, 50432, args.bsr, args.bsr, dtype=dtype, sparsity=args.sparsity_linear, verbose=True)
+        elif args.model == "vit_h_14":
+            optimize_bsr_dense_addmm(5120, 1280, 65792, args.bsr, args.bsr, dtype=dtype, sparsity=args.sparsity_linear, verbose=True)
+            optimize_bsr_dense_addmm(1280, 5120, 65792, args.bsr, args.bsr, dtype=dtype, sparsity=args.sparsity_linear, verbose=True)
+        else:
+            raise NotImplementedError("Tuning kernel params for this model is not supported yet.")

     print("Creating model")
     model = torchvision.models.get_model(args.model, weights=args.weights, num_classes=num_classes)

-    apply_supermask(
-        model,
-        linear_sparsity=args.sparsity_linear,
-        linear_sp_tilesize=args.sp_linear_tile_size,
-        conv1x1_sparsity=args.sparsity_conv1x1,
-        conv1x1_sp_tilesize=args.sp_conv1x1_tile_size,
-        conv_sparsity=args.sparsity_conv,
-        conv_sp_tilesize=args.sp_conv_tile_size,
-        skip_last_layer_sparsity=args.skip_last_layer_sparsity,
-        skip_first_transformer_sparsity=args.skip_first_transformer_sparsity,
-        device=device,
-        verbose=False,
-    )
+    # Fake sparsity necessary for BSR
+    simulate_sparsity(model, args)

     if args.weights_path:
         try:
@@ -93,33 +57,24 @@ def main(args):
         except FileNotFoundError:
             raise FileNotFoundError(f"No checkpoint found at {args.weights_path}.")

-    model.to(device)
-
-    if args.sparsify_weights:
-        apply_sparsity(model)
-        verify_sparsity(model)
-
-        # verify correctness
-        # output1 = model(input)
-        # assert torch.allclose(output0, output1), "Output of model before and after weight sparsification should be equal"
+    model.to(device).to(dtype)

-    if dtype:
-        model = model.to(dtype)
+    # Fake sparsity necessary for BSR
+    accelerate_with_sparsity(model, args)

-    if args.bsr:
-        if not args.sparsify_weights:
-            raise ValueError("--bsr can only be used when --sparsify_weights is also specified.")
-        apply_bsr(model, blocksize=args.bsr)
+    # compile
+    model = torch.compile(model, mode='max-autotune', fullgraph=True)

-    # verify correctness
-    # output2 = model(input)
-    # assert torch.allclose(output2, output1), "Output of model before and after changing format to BSR should be equal"
+    # define image
+    image = torch.randn(args.batch_size, 3, args.val_crop_size, args.val_crop_size, dtype=dtype, device=device)

-    model = torch.compile(model, mode='max-autotune')
+    # warmup
+    benchmark_model(model, 10, args=(image,))
+    if args.profile:
+        return profiler_runner("test.json.gz", benchmark_model, model, 10, (image,))
+    else:
+        return benchmark_model(model, 100, args=(image,))

-    image = torch.empty(args.batch_size, 3, args.val_crop_size, args.val_crop_size, dtype=dtype, device=device)
-
-    return benchmark_inference(10, 100, model, image)

@@ -131,15 +86,13 @@ def get_args_parser(add_help=True):
     parser.add_argument(
         "-b", "--batch-size", default=32, type=int, help="images per gpu, the total batch size is $NGPU x batch_size"
     )
-
-    # Mixed precision training parameters
     parser.add_argument(
         "--val-crop-size", default=224, type=int, help="the central crop size used for validation (default: 224)"
    )
     parser.add_argument("--weights", default=None, type=str, help="the weights enum name to load")
     parser.add_argument("--weights-path", type=str, help="path of pretrained weights to load")
-
     # NOTE: sparsity args
+    parser.add_argument("--sparsity", choices=["bsr", "semi_structured"], default=None, help='weight sparsification to apply')
     parser.add_argument("--sparsity-linear", type=float, default=0.0)
     parser.add_argument("--sp-linear-tile-size", type=int, default=1)
     parser.add_argument("--sparsity-conv1x1", type=float, default=0.0)
@@ -148,11 +101,12 @@ def get_args_parser(add_help=True):
     parser.add_argument("--sp-conv-tile-size", type=int, default=1)
     parser.add_argument("--skip-last-layer-sparsity", action="store_true", help="Skip applying sparsity to the last linear layer (for vit only)")
     parser.add_argument("--skip-first-transformer-sparsity", action="store_true", help="Skip applying sparsity to the first transformer layer (for vit only)")
-    parser.add_argument('--sparsify-weights', action='store_true', help='Apply weight sparsification in evaluation mode')
     parser.add_argument('--bsr', type=int, nargs='?', const=256, default=None, help='Convert sparsified weights to BSR format with optional block size (default: 256)')
-    parser.add_argument("--bfloat16", action="store_true", help="Use bfloat16")
+    parser.add_argument("--dtype", choices=["float32", "bfloat16", "float16"], help="data type", default="bfloat16")
     parser.add_argument("--float16", action="store_true", help="Use float16")
     parser.add_argument("--tune-kernel-params", action="store_true", help="Tune kernel params")
+    parser.add_argument("--profile", action="store_true", help="Profile the run and dump Prefetto trace")
+    parser.add_argument("--quantization", action="store_true", help="Profile the run and dump Prefetto trace")

     return parser

@@ -161,3 +115,4 @@ def get_args_parser(add_help=True):
     args = get_args_parser().parse_args()
     result = main(args)
     print(f"{result:.3f} ms", file=sys.stderr)
+    print(f"{1000/result:.3f} img/s")

torchao/sparsity/prototype/superblock/blocksparse.py

Lines changed: 7 additions & 0 deletions
@@ -1,3 +1,5 @@
+from functools import partial
+
 import torch
 from typing import Optional, Tuple, List, Dict, Any, Callable
 from torch.utils._python_dispatch import return_and_correct_aliasing
@@ -6,6 +8,8 @@
     _dispatch__torch_function__,
     _dispatch__torch_dispatch__,
 )
+from torchao.quantization.quant_api import _get_linear_subclass_inserter
+
 aten = torch.ops.aten

 # bsr wrapper custom op
@@ -136,3 +140,6 @@ def block_sparse_linear(func, types, args, kwargs):
         w.col_indices(),
         w.values(),
         w.shape[0], w.shape[1], bias)
+
+def block_sparse_weight(blocksize=64):
+    return _get_linear_subclass_inserter(partial(BlockSparseTensor.from_dense, blocksize=blocksize))
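
The commit message floats making `block_sparse_weight` public before PTC. A minimal sketch of how the new helper could be applied to a single linear layer; the call pattern is an assumption based on `_get_linear_subclass_inserter`, which is expected to return a callable that swaps a module's weight for the subclass tensor:

```
# Hedged sketch, not from the commit: wrap one nn.Linear's weight in
# BlockSparseTensor via the new block_sparse_weight helper.
import torch
from torchao.sparsity.prototype.superblock.blocksparse import block_sparse_weight

linear = torch.nn.Linear(128, 128)
linear.weight.data[:, 64:] = 0    # hand-make a block-sparse pattern
linear = block_sparse_weight(blocksize=64)(linear)
# linear.weight should now be backed by a BlockSparseTensor; the accelerated
# matmul path (Triton BSR kernels) is exercised by the benchmark/eval scripts above.
```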
