Merge branch 'main' into remove-conda

svekars · web-flow · commit 19319ae66910 · 2025-05-02T11:05:18.000-05:00
diff --git a/.ci/docker/requirements.txt b/.ci/docker/requirements.txt
@@ -14,7 +14,7 @@ tqdm==4.66.1
 numpy==1.24.4
 matplotlib
 librosa
-torch==2.6
+torch==2.7
 torchvision
 torchdata
 networkx
@@ -67,7 +67,7 @@ iopath
 pygame==2.6.0
 pycocotools
 semilearn==0.3.2
-torchao==0.5.0
+torchao==0.10.0
 segment_anything==1.0
 torchrec==1.1.0; platform_system == "Linux"
-fbgemm-gpu==1.1.0; platform_system == "Linux"
+fbgemm-gpu==1.2.0; platform_system == "Linux"
diff --git a/.jenkins/build.sh b/.jenkins/build.sh
@@ -22,13 +22,10 @@ sudo apt-get install -y pandoc
 #Install PyTorch Nightly for test.
 # Nightly - pip install --pre torch torchvision torchaudio -f https://download.pytorch.org/whl/nightly/cu102/torch_nightly.html
 # Install 2.5 to merge all 2.4 PRs - uncomment to install nightly binaries (update the version as needed).
-# sudo pip uninstall -y torch torchvision torchaudio torchtext torchdata
-# sudo pip3 install torch==2.6.0 torchvision --no-cache-dir --index-url https://download.pytorch.org/whl/test/cu124
 # sudo pip uninstall -y fbgemm-gpu torchrec
+# sudo pip uninstall -y torch torchvision torchaudio torchtext torchdata torchrl tensordict
 # sudo pip3 install fbgemm-gpu==1.1.0 torchrec==1.0.0 --no-cache-dir --index-url https://download.pytorch.org/whl/test/cu124
-sudo pip uninstall -y torch torchvision torchaudio torchtext torchdata torchrl tensordict
-pip3 install torch==2.7.0 torchvision torchaudio --no-cache-dir --index-url https://download.pytorch.org/whl/test/cu126
-#sudo pip uninstall -y fbgemm-gpu
+# pip3 install torch==2.7.0 torchvision torchaudio --no-cache-dir --index-url https://download.pytorch.org/whl/test/cu126
 # Install two language tokenizers for Translation with TorchText tutorial
 python -m spacy download en_core_web_sm
 python -m spacy download de_core_news_sm
diff --git a/.jenkins/validate_tutorials_built.py b/.jenkins/validate_tutorials_built.py
@@ -31,6 +31,7 @@
     "prototype_source/vmap_recipe",
     "prototype_source/torchscript_freezing",
     "prototype_source/nestedtensor",
+    "prototype_source/gpu_direct_storage", # requires specific filesystem + GPUDirect Storage to be set up
     "recipes_source/recipes/saving_and_loading_models_for_inference",
     "recipes_source/recipes/saving_multiple_models_in_one_file",
     "recipes_source/recipes/tensorboard_with_pytorch",
@@ -51,14 +52,8 @@
     "intermediate_source/text_to_speech_with_torchaudio",
     "intermediate_source/tensorboard_profiler_tutorial", # reenable after 2.0 release.
     "advanced_source/semi_structured_sparse", # reenable after 3303 is fixed.
-    "intermediate_source/mario_rl_tutorial", # reenable after 3302 is fixed
-    "intermediate_source/reinforcement_ppo", # reenable after 3302 is fixed
-    "intermediate_source/pinmem_nonblock", # reenable after 3302 is fixed
-    "intermediate_source/dqn_with_rnn_tutorial", # reenable after 3302 is fixed
-    "advanced_source/pendulum", # reenable after 3302 is fixed
-    "advanced_source/coding_ddpg", # reenable after 3302 is fixed
-    "intermediate_source/torchrec_intro_tutorial", # reenable after 3302 is fixed
-    "recipes_source/recipes/reasoning_about_shapes" # reenable after 3326 is fixed
+    "intermediate_source/torchrec_intro_tutorial", # reenable after 3302 is fixed
+    "intermediate_source/memory_format_tutorial", # causes other tutorials like torch_logs to fail. "state" issue, resetting dynamo didn't help
 ]
 
 def tutorial_source_dirs() -> List[Path]:
diff --git a/beginner_source/basics/optimization_tutorial.py b/beginner_source/basics/optimization_tutorial.py
@@ -76,7 +76,7 @@ def forward(self, x):
 # (`read more <https://pytorch.org/tutorials/beginner/hyperparameter_tuning_tutorial.html>`__ about hyperparameter tuning)
 #
 # We define the following hyperparameters for training:
-#  - **Number of Epochs** - the number times to iterate over the dataset
+#  - **Number of Epochs** - the number of times to iterate over the dataset
 #  - **Batch Size** - the number of data samples propagated through the network before the parameters are updated
 #  - **Learning Rate** - how much to update models parameters at each batch/epoch. Smaller values yield slow learning speed, while large values may result in unpredictable behavior during training.
 #
diff --git a/beginner_source/colab.rst b/beginner_source/colab.rst
@@ -11,7 +11,7 @@ PyTorch Version in Google Colab
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
 Wen you are running a tutorial that requires a version of PyTorch that has
-jst been released, that version might not be yet available in Google Colab.
+just been released, that version might not be yet available in Google Colab.
 To check that you have the required ``torch`` and compatible domain libraries
 installed, run ``!pip list``.
 
@@ -27,7 +27,7 @@ Using Tutorial Data from Google Drive in Colab
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
 We've added a new feature to tutorials that allows users to open the
-ntebook associated with a tutorial in Google Colab. You may need to
+notebook associated with a tutorial in Google Colab. You may need to
 copy data to your Google drive account to get the more complex tutorials
 to work.
 
diff --git a/conf.py b/conf.py
@@ -99,10 +99,16 @@
 
 def reset_seeds(gallery_conf, fname):
     torch.cuda.empty_cache()
+    torch.backends.cudnn.deterministic = True
+    torch.backends.cudnn.benchmark = False
+    torch._dynamo.reset()
+    torch._inductor.config.force_disable_caches = True
     torch.manual_seed(42)
     torch.set_default_device(None)
     random.seed(10)
     numpy.random.seed(10)
+    torch.set_grad_enabled(True)
+
     gc.collect()
 
 sphinx_gallery_conf = {
diff --git a/en-wordlist.txt b/en-wordlist.txt
@@ -698,3 +698,14 @@ TorchServe
 Inductor’s
 onwards
 recompilations
+BiasCorrection
+ELU
+GELU
+NNCF
+OpenVINO
+OpenVINOQuantizer
+PReLU
+Quantizer
+SmoothQuant
+quantizer
+quantizers
diff --git a/intermediate_source/torch_compile_tutorial.py b/intermediate_source/torch_compile_tutorial.py
@@ -101,8 +101,11 @@ def forward(self, x):
         return torch.nn.functional.relu(self.lin(x))
 
 mod = MyModule()
-opt_mod = torch.compile(mod)
-print(opt_mod(t))
+mod.compile()
+print(mod(t))
+## or:
+# opt_mod = torch.compile(mod)
+# print(opt_mod(t))
 
 ######################################################################
 # torch.compile and Nested Calls
@@ -135,8 +138,8 @@ def forward(self, x):
         return torch.nn.functional.relu(self.outer_lin(x))
 
 outer_mod = OuterModule()
-opt_outer_mod = torch.compile(outer_mod)
-print(opt_outer_mod(t))
+outer_mod.compile()
+print(outer_mod(t))
 
 ######################################################################
 # We can also disable some functions from being compiled by using
@@ -197,6 +200,12 @@ def outer_function():
 # 4. **Compile Leaf Functions First:** In complex models with multiple nested
 # functions and modules, start by compiling the leaf functions or modules first.
 # For more information see `TorchDynamo APIs for fine-grained tracing <https://pytorch.org/docs/stable/torch.compiler_fine_grain_apis.html>`__.
+#
+# 5. **Prefer ``mod.compile()`` over ``torch.compile(mod)``:** Avoids ``_orig_`` prefix issues in ``state_dict``.
+#
+# 6. **Use ``fullgraph=True`` to catch graph breaks:** Helps ensure end-to-end compilation, maximizing speedup
+# and compatibility with ``torch.export``.
+
 
 ######################################################################
 # Demonstrating Speedups
diff --git a/prototype_source/gpu_direct_storage.py b/prototype_source/gpu_direct_storage.py
@@ -0,0 +1,132 @@
+"""
+(prototype) Accelerating ``torch.save`` and ``torch.load`` with GPUDirect Storage
+=================================================================================
+
+GPUDirect Storage enables a direct data path for direct memory access transfers
+between GPU memory and storage, avoiding a bounce buffer through the CPU.
+
+In version **2.7**, we introduced new prototype APIs to ``torch.cuda.gds`` that serve as thin wrappers around
+the `cuFile APIs <https://docs.nvidia.com/gpudirect-storage/api-reference-guide/index.html#cufile-io-api>`_
+that can be used with ``torch.Tensor`` to achieve improved I/O performance.
+
+In this tutorial, we will demonstrate how to use the ``torch.cuda.gds`` APIs in conjunction with
+checkpoints generated by ``torch.save`` and ``torch.load`` on a local filesystem.
+
+.. grid:: 2
+
+    .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn
+       :class-card: card-prerequisites
+
+       * Understand how to use the ``torch.cuda.gds`` APIs in conjunction with
+         checkpoints generated by ``torch.save`` and ``torch.load`` on a local filesystem
+    
+    .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites
+       :class-card: card-prerequisites
+
+       * PyTorch v.2.7.0 or later
+       * GPUDirect Storage must be installed per
+         `the documentation <https://docs.nvidia.com/gpudirect-storage/troubleshooting-guide/contents.html>`_
+       * Ensure that the filesystem that you are saving/loading to supports GPUDirect Storage.
+"""
+
+################################################################################
+# Using GPUDirect Storage with ``torch.save`` and ``torch.load``
+# ------------------------------------------------------------------------------------
+# GPUDirect Storage requires a storage alignment of 4KB. You can toggle this by using
+# ``torch.utils.serialization.config.save.storage_alignment``:
+
+import torch
+from torch.utils.serialization import config as serialization_config
+
+serialization_config.save.storage_alignment = 4096
+
+################################################################################
+# The steps involved in the process are as follows:
+#    * Write the checkpoint file without any actual data. This reserves the space on disk.
+#    * Read the offsets for the storage associated with each tensor in the checkpoint using ``FakeTensor``.
+#    * Use ``GDSFile`` to write the appropriate data at these offsets.
+# 
+# Given a state dictionary of tensors that are on the GPU, one can use the ``torch.serialization.skip_data`` context
+# manager to save a checkpoint that contains all relevant metadata except the storage bytes. For each ``torch.Storage``
+# in the state dictionary, space will be reserved within the checkpoint for the storage bytes.
+
+import torch.nn as nn
+
+m = nn.Linear(5, 10, device='cuda')
+sd = m.state_dict()
+
+with torch.serialization.skip_data():
+    torch.save(sd, "checkpoint.pt")
+
+################################################################################
+# We can get the offsets that each storage should be written to within the checkpoint by loading under
+# a ``FakeTensorMode``. A FakeTensor is a tensor that has metadata (such as sizes, strides, dtype, device)
+# information about the tensor but does not have any storage bytes. The following snippet will not materialize
+# any data but will tag each ``FakeTensor`` with the offset within the checkpoint that
+# corresponds to the tensor.
+# 
+# If you are continuously saving the same state dictionary during training, you
+# would only need to obtain the offsets once and the same offsets can be re-used. Similarly, if a tensor is going to
+# be saved or loaded repeatedly, you can use ``torch.cuda.gds.gds_register_buffer``, which wraps
+# ``cuFileBufRegister`` to register the storages as GDS buffers.
+#
+# Note that ``torch.cuda.gds.GdsFile.save_storage`` binds to the synchronous ``cuFileWrite`` API,
+# so no synchronization is needed afterwards.
+
+
+import os
+from torch._subclasses.fake_tensor import FakeTensorMode
+
+with FakeTensorMode() as mode:
+    fake_sd = torch.load("checkpoint.pt")
+
+for k, v in fake_sd.items():
+    print(f"key={k}, offset={v.untyped_storage()._checkpoint_offset}")
+
+f = torch.cuda.gds.GdsFile("checkpoint.pt", os.O_RDWR)
+
+for k, v in sd.items():
+    offset = fake_sd[k].untyped_storage()._checkpoint_offset
+    # save_storage is a wrapper around `cuFileWrite`
+    f.save_storage(v.untyped_storage(), offset)
+
+
+################################################################################
+# We verify correctness of the saved checkpoint by ``torch.load`` and comparing.
+
+sd_loaded = torch.load("checkpoint.pt")
+for k, v in sd_loaded.items():
+    assert torch.equal(v, sd[k])
+
+################################################################################
+# The loading flow is the inverse: you can use ``torch.load`` with the ``torch.serialization.skip_data`` context
+# manager to load everything except the storage bytes. This means that any tensors in the checkpoint will be
+# created but their storages will be empty (as if the tensors were created via ``torch.empty``).
+
+with torch.serialization.skip_data():
+    sd_loaded = torch.load("checkpoint.pt")
+
+################################################################################
+# We once again use the ``FakeTensorMode`` to get the checkpoint offsets and
+# ascertain that the loaded checkpoint is the same as the saved checkpoint.
+#
+# Similar to ``torch.cuda.gds.GdsFile.save_storage``, ``torch.cuda.gds.GdsFile.load_storage``
+# binds to the synchronous ``cuFileRead`` API, so no synchronization is needed afterwards.
+
+for k, v in sd_loaded.items():
+    assert not torch.equal(v, sd[k])
+    offset = fake_sd[k].untyped_storage()._checkpoint_offset
+    # load_storage is a wrapper around `cuFileRead`
+    f.load_storage(v.untyped_storage(), offset)
+
+for k, v in sd_loaded.items():
+    assert torch.equal(v, sd[k])
+
+del f
+##########################################################
+# Conclusion
+# ==========
+#
+# In this tutorial we have demonstrated how to use the prototype ``torch.cuda.gds`` APIs
+# in conjunction with ``torch.save`` and ``torch.load`` on a local filesystem. Please
+# file an issue in the PyTorch GitHub repo if you have any feedback.
diff --git a/prototype_source/inductor_windows.rst b/prototype_source/inductor_windows.rst
@@ -22,10 +22,9 @@ Install a Compiler
 
 C++ compiler is required for TorchInductor optimization, let's take Microsoft Visual C++ (MSVC) as an example.
 
-1. Download and install `MSVC <https://visualstudio.microsoft.com/downloads/>`_.
+#. Download and install `MSVC <https://visualstudio.microsoft.com/downloads/>`_.
 
-1. During Installation, select **Workloads** and then **Desktop & Mobile**.
-1. Select a checkmark on **Desktop Development with C++** and install.
+#. During Installation, select **Workloads** and then **Desktop & Mobile**. Select a checkmark on **Desktop Development with C++** and install.
 
 .. image:: ../_static/img/install_msvc.png
 
diff --git a/prototype_source/openvino_quantizer.rst b/prototype_source/openvino_quantizer.rst
diff --git a/prototype_source/prototype_index.rst b/prototype_source/prototype_index.rst
diff --git a/recipes_source/recipes/zeroing_out_gradients.py b/recipes_source/recipes/zeroing_out_gradients.py