Skip to content

Commit 19319ae

Browse files
authored
Merge branch 'main' into remove-conda
2 parents 78f9670 + fd981a5 commit 19319ae

13 files changed

+441
-27
lines changed

.ci/docker/requirements.txt

+3-3
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@ tqdm==4.66.1
1414
numpy==1.24.4
1515
matplotlib
1616
librosa
17-
torch==2.6
17+
torch==2.7
1818
torchvision
1919
torchdata
2020
networkx
@@ -67,7 +67,7 @@ iopath
6767
pygame==2.6.0
6868
pycocotools
6969
semilearn==0.3.2
70-
torchao==0.5.0
70+
torchao==0.10.0
7171
segment_anything==1.0
7272
torchrec==1.1.0; platform_system == "Linux"
73-
fbgemm-gpu==1.1.0; platform_system == "Linux"
73+
fbgemm-gpu==1.2.0; platform_system == "Linux"

.jenkins/build.sh

+2-5
Original file line numberDiff line numberDiff line change
@@ -22,13 +22,10 @@ sudo apt-get install -y pandoc
2222
#Install PyTorch Nightly for test.
2323
# Nightly - pip install --pre torch torchvision torchaudio -f https://download.pytorch.org/whl/nightly/cu102/torch_nightly.html
2424
# Install 2.5 to merge all 2.4 PRs - uncomment to install nightly binaries (update the version as needed).
25-
# sudo pip uninstall -y torch torchvision torchaudio torchtext torchdata
26-
# sudo pip3 install torch==2.6.0 torchvision --no-cache-dir --index-url https://download.pytorch.org/whl/test/cu124
2725
# sudo pip uninstall -y fbgemm-gpu torchrec
26+
# sudo pip uninstall -y torch torchvision torchaudio torchtext torchdata torchrl tensordict
2827
# sudo pip3 install fbgemm-gpu==1.1.0 torchrec==1.0.0 --no-cache-dir --index-url https://download.pytorch.org/whl/test/cu124
29-
sudo pip uninstall -y torch torchvision torchaudio torchtext torchdata torchrl tensordict
30-
pip3 install torch==2.7.0 torchvision torchaudio --no-cache-dir --index-url https://download.pytorch.org/whl/test/cu126
31-
#sudo pip uninstall -y fbgemm-gpu
28+
# pip3 install torch==2.7.0 torchvision torchaudio --no-cache-dir --index-url https://download.pytorch.org/whl/test/cu126
3229
# Install two language tokenizers for Translation with TorchText tutorial
3330
python -m spacy download en_core_web_sm
3431
python -m spacy download de_core_news_sm

.jenkins/validate_tutorials_built.py

+3-8
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,7 @@
3131
"prototype_source/vmap_recipe",
3232
"prototype_source/torchscript_freezing",
3333
"prototype_source/nestedtensor",
34+
"prototype_source/gpu_direct_storage", # requires specific filesystem + GPUDirect Storage to be set up
3435
"recipes_source/recipes/saving_and_loading_models_for_inference",
3536
"recipes_source/recipes/saving_multiple_models_in_one_file",
3637
"recipes_source/recipes/tensorboard_with_pytorch",
@@ -51,14 +52,8 @@
5152
"intermediate_source/text_to_speech_with_torchaudio",
5253
"intermediate_source/tensorboard_profiler_tutorial", # reenable after 2.0 release.
5354
"advanced_source/semi_structured_sparse", # reenable after 3303 is fixed.
54-
"intermediate_source/mario_rl_tutorial", # reenable after 3302 is fixed
55-
"intermediate_source/reinforcement_ppo", # reenable after 3302 is fixed
56-
"intermediate_source/pinmem_nonblock", # reenable after 3302 is fixed
57-
"intermediate_source/dqn_with_rnn_tutorial", # reenable after 3302 is fixed
58-
"advanced_source/pendulum", # reenable after 3302 is fixed
59-
"advanced_source/coding_ddpg", # reenable after 3302 is fixed
60-
"intermediate_source/torchrec_intro_tutorial", # reenable after 3302 is fixed
61-
"recipes_source/recipes/reasoning_about_shapes" # reenable after 3326 is fixed
55+
"intermediate_source/torchrec_intro_tutorial", # reenable after 3302 is fixed
56+
"intermediate_source/memory_format_tutorial", # causes other tutorials like torch_logs to fail. "state" issue, resetting dynamo didn't help
6257
]
6358

6459
def tutorial_source_dirs() -> List[Path]:

beginner_source/basics/optimization_tutorial.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -76,7 +76,7 @@ def forward(self, x):
7676
# (`read more <https://pytorch.org/tutorials/beginner/hyperparameter_tuning_tutorial.html>`__ about hyperparameter tuning)
7777
#
7878
# We define the following hyperparameters for training:
79-
# - **Number of Epochs** - the number times to iterate over the dataset
79+
# - **Number of Epochs** - the number of times to iterate over the dataset
8080
# - **Batch Size** - the number of data samples propagated through the network before the parameters are updated
8181
# - **Learning Rate** - how much to update model parameters at each batch/epoch. Smaller values yield slow learning speed, while large values may result in unpredictable behavior during training.
8282
#

beginner_source/colab.rst

+2-2
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@ PyTorch Version in Google Colab
1111
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
1212

1313
When you are running a tutorial that requires a version of PyTorch that has
14-
jst been released, that version might not be yet available in Google Colab.
14+
just been released, that version might not yet be available in Google Colab.
1515
To check that you have the required ``torch`` and compatible domain libraries
1616
installed, run ``!pip list``.
1717

@@ -27,7 +27,7 @@ Using Tutorial Data from Google Drive in Colab
2727
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
2828

2929
We've added a new feature to tutorials that allows users to open the
30-
ntebook associated with a tutorial in Google Colab. You may need to
30+
notebook associated with a tutorial in Google Colab. You may need to
3131
copy data to your Google drive account to get the more complex tutorials
3232
to work.
3333

conf.py

+6
Original file line numberDiff line numberDiff line change
@@ -99,10 +99,16 @@
9999

100100
def reset_seeds(gallery_conf, fname):
101101
torch.cuda.empty_cache()
102+
torch.backends.cudnn.deterministic = True
103+
torch.backends.cudnn.benchmark = False
104+
torch._dynamo.reset()
105+
torch._inductor.config.force_disable_caches = True
102106
torch.manual_seed(42)
103107
torch.set_default_device(None)
104108
random.seed(10)
105109
numpy.random.seed(10)
110+
torch.set_grad_enabled(True)
111+
106112
gc.collect()
107113

108114
sphinx_gallery_conf = {

en-wordlist.txt

+11
Original file line numberDiff line numberDiff line change
@@ -698,3 +698,14 @@ TorchServe
698698
Inductor’s
699699
onwards
700700
recompilations
701+
BiasCorrection
702+
ELU
703+
GELU
704+
NNCF
705+
OpenVINO
706+
OpenVINOQuantizer
707+
PReLU
708+
Quantizer
709+
SmoothQuant
710+
quantizer
711+
quantizers

intermediate_source/torch_compile_tutorial.py

+13-4
Original file line numberDiff line numberDiff line change
@@ -101,8 +101,11 @@ def forward(self, x):
101101
return torch.nn.functional.relu(self.lin(x))
102102

103103
mod = MyModule()
104-
opt_mod = torch.compile(mod)
105-
print(opt_mod(t))
104+
mod.compile()
105+
print(mod(t))
106+
## or:
107+
# opt_mod = torch.compile(mod)
108+
# print(opt_mod(t))
106109

107110
######################################################################
108111
# torch.compile and Nested Calls
@@ -135,8 +138,8 @@ def forward(self, x):
135138
return torch.nn.functional.relu(self.outer_lin(x))
136139

137140
outer_mod = OuterModule()
138-
opt_outer_mod = torch.compile(outer_mod)
139-
print(opt_outer_mod(t))
141+
outer_mod.compile()
142+
print(outer_mod(t))
140143

141144
######################################################################
142145
# We can also disable some functions from being compiled by using
@@ -197,6 +200,12 @@ def outer_function():
197200
# 4. **Compile Leaf Functions First:** In complex models with multiple nested
198201
# functions and modules, start by compiling the leaf functions or modules first.
199202
# For more information see `TorchDynamo APIs for fine-grained tracing <https://pytorch.org/docs/stable/torch.compiler_fine_grain_apis.html>`__.
203+
#
204+
# 5. **Prefer ``mod.compile()`` over ``torch.compile(mod)``:** Avoids ``_orig_`` prefix issues in ``state_dict``.
205+
#
206+
# 6. **Use ``fullgraph=True`` to catch graph breaks:** Helps ensure end-to-end compilation, maximizing speedup
207+
# and compatibility with ``torch.export``.
208+
200209

201210
######################################################################
202211
# Demonstrating Speedups
+132
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,132 @@
1+
"""
2+
(prototype) Accelerating ``torch.save`` and ``torch.load`` with GPUDirect Storage
3+
=================================================================================
4+
5+
GPUDirect Storage enables a direct data path for direct memory access transfers
6+
between GPU memory and storage, avoiding a bounce buffer through the CPU.
7+
8+
In version **2.7**, we introduced new prototype APIs to ``torch.cuda.gds`` that serve as thin wrappers around
9+
the `cuFile APIs <https://docs.nvidia.com/gpudirect-storage/api-reference-guide/index.html#cufile-io-api>`_
10+
that can be used with ``torch.Tensor`` to achieve improved I/O performance.
11+
12+
In this tutorial, we will demonstrate how to use the ``torch.cuda.gds`` APIs in conjunction with
13+
checkpoints generated by ``torch.save`` and ``torch.load`` on a local filesystem.
14+
15+
.. grid:: 2
16+
17+
.. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn
18+
:class-card: card-prerequisites
19+
20+
* Understand how to use the ``torch.cuda.gds`` APIs in conjunction with
21+
checkpoints generated by ``torch.save`` and ``torch.load`` on local filesystem
22+
23+
.. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites
24+
:class-card: card-prerequisites
25+
26+
* PyTorch v.2.7.0 or later
27+
* GPUDirect Storage must be installed per
28+
`the documentation <https://docs.nvidia.com/gpudirect-storage/troubleshooting-guide/contents.html>`_
29+
* Ensure that the filesystem that you are saving/loading to supports GPUDirect Storage.
30+
"""
31+
32+
################################################################################
33+
# Using GPUDirect Storage with ``torch.save`` and ``torch.load``
34+
# ------------------------------------------------------------------------------------
35+
# GPUDirect Storage requires a storage alignment of 4KB. You can toggle this by using
36+
# ``torch.utils.serialization.config.save.storage_alignment``:
37+
38+
import torch
39+
from torch.utils.serialization import config as serialization_config
40+
41+
serialization_config.save.storage_alignment = 4096
42+
43+
################################################################################
44+
# The steps involved in the process are as follows:
45+
# * Write the checkpoint file without any actual data. This reserves the space on disk.
46+
# * Read the offsets for the storage associated with each tensor in the checkpoint using ``FakeTensor``.
47+
# * Use ``GdsFile`` to write the appropriate data at these offsets.
48+
#
49+
# Given a state dictionary of tensors that are on the GPU, one can use the ``torch.serialization.skip_data`` context
50+
# manager to save a checkpoint that contains all relevant metadata except the storage bytes. For each ``torch.Storage``
51+
# in the state dictionary, space will be reserved within the checkpoint for the storage bytes.
52+
53+
import torch.nn as nn
54+
55+
m = nn.Linear(5, 10, device='cuda')
56+
sd = m.state_dict()
57+
58+
with torch.serialization.skip_data():
59+
torch.save(sd, "checkpoint.pt")
60+
61+
################################################################################
62+
# We can get the offsets that each storage should be written to within the checkpoint by loading under
63+
# a ``FakeTensorMode``. A FakeTensor is a tensor that has metadata (such as sizes, strides, dtype, device)
64+
# information about the tensor but does not have any storage bytes. The following snippet will not materialize
65+
# any data but will tag each ``FakeTensor`` with the offset within the checkpoint that
66+
# corresponds to the tensor.
67+
#
68+
# If you are continuously saving the same state dictionary during training, you
69+
# would only need to obtain the offsets once and the same offsets can be re-used. Similarly, if a tensor is going to
70+
# be saved to or loaded from repeatedly, you can use ``torch.cuda.gds.gds_register_buffer``, which wraps
71+
# ``cuFileBufRegister`` to register the storages as GDS buffers.
72+
#
73+
# Note that ``torch.cuda.gds.GdsFile.save_storage`` binds to the synchronous ``cuFileWrite`` API,
74+
# so no synchronization is needed afterwards.
75+
76+
77+
import os
78+
from torch._subclasses.fake_tensor import FakeTensorMode
79+
80+
with FakeTensorMode() as mode:
81+
fake_sd = torch.load("checkpoint.pt")
82+
83+
for k, v in fake_sd.items():
84+
print(f"key={k}, offset={v.untyped_storage()._checkpoint_offset}")
85+
86+
f = torch.cuda.gds.GdsFile("checkpoint.pt", os.O_RDWR)
87+
88+
for k, v in sd.items():
89+
offset = fake_sd[k].untyped_storage()._checkpoint_offset
90+
# save_storage is a wrapper around `cuFileWrite`
91+
f.save_storage(v.untyped_storage(), offset)
92+
93+
94+
################################################################################
95+
# We verify correctness of the saved checkpoint by ``torch.load`` and comparing.
96+
97+
sd_loaded = torch.load("checkpoint.pt")
98+
for k, v in sd_loaded.items():
99+
assert torch.equal(v, sd[k])
100+
101+
################################################################################
102+
# The loading flow is the inverse: you can use ``torch.load`` with the ``torch.serialization.skip_data`` context
103+
# manager to load everything except the storage bytes. This means that any tensors in the checkpoint will be
104+
# created but their storages will be empty (as if the tensors were created via ``torch.empty``).
105+
106+
with torch.serialization.skip_data():
107+
sd_loaded = torch.load("checkpoint.pt")
108+
109+
################################################################################
110+
# We once again use the ``FakeTensorMode`` to get the checkpoint offsets and
111+
# ascertain that the loaded checkpoint is the same as the saved checkpoint.
112+
#
113+
# Similar to ``torch.cuda.gds.GdsFile.save_storage``, ``torch.cuda.gds.GdsFile.load_storage``
114+
# binds to the synchronous ``cuFileRead`` API, so no synchronization is needed afterwards.
115+
116+
for k, v in sd_loaded.items():
117+
assert not torch.equal(v, sd[k])
118+
offset = fake_sd[k].untyped_storage()._checkpoint_offset
119+
# load_storage is a wrapper around `cuFileRead`
120+
f.load_storage(v.untyped_storage(), offset)
121+
122+
for k, v in sd_loaded.items():
123+
assert torch.equal(v, sd[k])
124+
125+
del f
126+
##########################################################
127+
# Conclusion
128+
# ==========
129+
#
130+
# In this tutorial we have demonstrated how to use the prototype ``torch.cuda.gds`` APIs
131+
# in conjunction with ``torch.save`` and ``torch.load`` on local filesystem. Please
132+
# file an issue in the PyTorch GitHub repo if you have any feedback.

prototype_source/inductor_windows.rst

+2-3
Original file line numberDiff line numberDiff line change
@@ -22,10 +22,9 @@ Install a Compiler
2222

2323
A C++ compiler is required for TorchInductor optimization. Let's take Microsoft Visual C++ (MSVC) as an example.
2424

25-
1. Download and install `MSVC <https://visualstudio.microsoft.com/downloads/>`_.
25+
#. Download and install `MSVC <https://visualstudio.microsoft.com/downloads/>`_.
2626

27-
1. During Installation, select **Workloads** and then **Desktop & Mobile**.
28-
1. Select a checkmark on **Desktop Development with C++** and install.
27+
#. During Installation, select **Workloads** and then **Desktop & Mobile**. Select a checkmark on **Desktop Development with C++** and install.
2928

3029
.. image:: ../_static/img/install_msvc.png
3130

0 commit comments

Comments
 (0)