
Commit 0a39b97

Author: pytorchbot
Message: 2025-07-24 nightly release (145441b)
Parent: 3270b93

File tree: 4 files changed (+121 −42 lines)


torchrec/distributed/planner/planners.py

Lines changed: 121 additions & 34 deletions
@@ -8,7 +8,6 @@
 # pyre-strict
 
 import copy
-import hashlib
 import logging
 import time
 from functools import reduce
@@ -143,33 +142,24 @@ def _merge_plans(best_plans: List[ShardingPlan]) -> ShardingPlan:
     return merged_plan
 
 
-class EmbeddingShardingPlanner(ShardingPlanner):
+class EmbeddingPlannerBase(ShardingPlanner):
     """
-    Provides an optimized sharding plan for a given module with shardable parameters
-    according to the provided sharders, topology, and constraints.
+    Base class for embedding sharding planners that provides common initialization
+    and shared functionality.
 
     Args:
         topology (Optional[Topology]): the topology of the current process group.
         batch_size (Optional[int]): the batch size of the model.
         enumerator (Optional[Enumerator]): the enumerator to use
         storage_reservation (Optional[StorageReservation]): the storage reservation to use
-        proposer (Optional[Union[Proposer, List[Proposer]]]): the proposer(s) to use
-        partitioner (Optional[Partitioner]): the partitioner to use
-        performance_model (Optional[PerfModel]): the performance model to use
         stats (Optional[Union[Stats, List[Stats]]]): the stats to use
         constraints (Optional[Dict[str, ParameterConstraints]]): per table constraints
            for sharding.
         debug (bool): whether to print debug information.
-
-    Example::
-
-        ebc = EmbeddingBagCollection(tables=eb_configs, device=torch.device("meta"))
-        planner = EmbeddingShardingPlanner()
-        plan = planner.plan(
-            module=ebc,
-            sharders=[EmbeddingBagCollectionSharder()],
-        )
-
+        callbacks (Optional[List[Callable[[List[ShardingOption]], List[ShardingOption]]]):
+            callback functions to apply to plans.
+        timeout_seconds (Optional[int]): timeout for planning in seconds.
+        heuristical_storage_reservation_percentage (float): percentage of storage to reserve for sparse archs.
     """
 
     def __init__(
@@ -178,16 +168,14 @@ def __init__(
         batch_size: Optional[int] = None,
         enumerator: Optional[Enumerator] = None,
         storage_reservation: Optional[StorageReservation] = None,
-        proposer: Optional[Union[Proposer, List[Proposer]]] = None,
-        partitioner: Optional[Partitioner] = None,
-        performance_model: Optional[PerfModel] = None,
         stats: Optional[Union[Stats, List[Stats]]] = None,
         constraints: Optional[Dict[str, ParameterConstraints]] = None,
         debug: bool = True,
         callbacks: Optional[
             List[Callable[[List[ShardingOption]], List[ShardingOption]]]
         ] = None,
         timeout_seconds: Optional[int] = None,
+        heuristical_storage_reservation_percentage: float = 0.15,
     ) -> None:
         if topology is None:
             topology = Topology(
@@ -210,7 +198,116 @@ def __init__(
         self._storage_reservation: StorageReservation = (
             storage_reservation
             if storage_reservation
-            else HeuristicalStorageReservation(percentage=0.15)
+            else HeuristicalStorageReservation(
+                percentage=heuristical_storage_reservation_percentage
+            )
+        )
+
+        if stats is not None:
+            self._stats: List[Stats] = [stats] if not isinstance(stats, list) else stats
+        else:
+            self._stats = [EmbeddingStats()]
+
+        self._debug = debug
+        self._callbacks: List[
+            Callable[[List[ShardingOption]], List[ShardingOption]]
+        ] = ([] if callbacks is None else callbacks)
+        if timeout_seconds is not None:
+            assert timeout_seconds > 0, "Timeout must be positive"
+        self._timeout_seconds = timeout_seconds
+
+    def collective_plan(
+        self,
+        module: nn.Module,
+        sharders: Optional[List[ModuleSharder[nn.Module]]] = None,
+        pg: Optional[dist.ProcessGroup] = None,
+    ) -> ShardingPlan:
+        """
+        Call self.plan(...) on rank 0 and broadcast
+
+        Args:
+            module (nn.Module): the module to shard.
+            sharders (Optional[List[ModuleSharder[nn.Module]]]): the sharders to use for sharding
+            pg (Optional[dist.ProcessGroup]): the process group to use for collective operations
+
+        Returns:
+            ShardingPlan: the sharding plan for the module.
+        """
+        if pg is None:
+            assert dist.is_initialized(), (
+                "The default process group is not yet initialized. "
+                "Please call torch.distributed.init_process_group() first before invoking this. "
+                "If you are not within a distributed environment, use the single rank version plan() instead."
+            )
+            pg = none_throws(dist.GroupMember.WORLD)
+
+        if sharders is None:
+            sharders = get_default_sharders()
+        return invoke_on_rank_and_broadcast_result(
+            pg,
+            0,
+            self.plan,
+            module,
+            sharders,
+        )
+
+
+class EmbeddingShardingPlanner(EmbeddingPlannerBase):
+    """
+    Provides an optimized sharding plan for a given module with shardable parameters
+    according to the provided sharders, topology, and constraints.
+
+    Args:
+        topology (Optional[Topology]): the topology of the current process group.
+        batch_size (Optional[int]): the batch size of the model.
+        enumerator (Optional[Enumerator]): the enumerator to use
+        storage_reservation (Optional[StorageReservation]): the storage reservation to use
+        proposer (Optional[Union[Proposer, List[Proposer]]]): the proposer(s) to use
+        partitioner (Optional[Partitioner]): the partitioner to use
+        performance_model (Optional[PerfModel]): the performance model to use
+        stats (Optional[Union[Stats, List[Stats]]]): the stats to use
+        constraints (Optional[Dict[str, ParameterConstraints]]): per table constraints
+            for sharding.
+        debug (bool): whether to print debug information.
+
+    Example::
+
+        ebc = EmbeddingBagCollection(tables=eb_configs, device=torch.device("meta"))
+        planner = EmbeddingShardingPlanner()
+        plan = planner.plan(
+            module=ebc,
+            sharders=[EmbeddingBagCollectionSharder()],
+        )
+
+    """
+
+    def __init__(
+        self,
+        topology: Optional[Topology] = None,
+        batch_size: Optional[int] = None,
+        enumerator: Optional[Enumerator] = None,
+        storage_reservation: Optional[StorageReservation] = None,
+        proposer: Optional[Union[Proposer, List[Proposer]]] = None,
+        partitioner: Optional[Partitioner] = None,
+        performance_model: Optional[PerfModel] = None,
+        stats: Optional[Union[Stats, List[Stats]]] = None,
+        constraints: Optional[Dict[str, ParameterConstraints]] = None,
+        debug: bool = True,
+        callbacks: Optional[
+            List[Callable[[List[ShardingOption]], List[ShardingOption]]]
+        ] = None,
+        timeout_seconds: Optional[int] = None,
+    ) -> None:
+        super().__init__(
+            topology=topology,
+            batch_size=batch_size,
+            enumerator=enumerator,
+            storage_reservation=storage_reservation,
+            stats=stats,
+            constraints=constraints,
+            debug=debug,
+            callbacks=callbacks,
+            timeout_seconds=timeout_seconds,
         )
         self._partitioner: Partitioner = (
             partitioner if partitioner else GreedyPerfPartitioner()
@@ -227,24 +324,14 @@ def __init__(
             UniformProposer(),
         ]
         self._perf_model: PerfModel = (
-            performance_model if performance_model else NoopPerfModel(topology=topology)
+            performance_model
+            if performance_model
+            else NoopPerfModel(topology=self._topology)
         )
 
-        if stats is not None:
-            self._stats: List[Stats] = [stats] if not isinstance(stats, list) else stats
-        else:
-            self._stats = [EmbeddingStats()]
-
-        self._debug = debug
         self._num_proposals: int = 0
         self._num_plans: int = 0
         self._best_plan: Optional[List[ShardingOption]] = None
-        self._callbacks: List[
-            Callable[[List[ShardingOption]], List[ShardingOption]]
-        ] = ([] if callbacks is None else callbacks)
-        if timeout_seconds is not None:
-            assert timeout_seconds > 0, "Timeout must be positive"
-        self._timeout_seconds = timeout_seconds
 
     def collective_plan(
         self,
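Taken together, the planners.py hunks above factor the shared constructor logic and collective_plan() out of EmbeddingShardingPlanner into the new EmbeddingPlannerBase, expose the previously hard-coded HeuristicalStorageReservation percentage as a heuristical_storage_reservation_percentage argument on the base class, and switch NoopPerfModel to use self._topology. A minimal usage sketch follows; it is not taken from this commit, assumes a working torchrec install with the import paths shown, and eb_configs is a hypothetical list of table configs (as in the docstring example).

# Minimal sketch (not from this commit): driving the planner through
# collective_plan(), which now lives on EmbeddingPlannerBase and is shared
# by all planner subclasses. Assumes a distributed environment; eb_configs
# is a hypothetical list of EmbeddingBagConfig objects.
import torch
import torch.distributed as dist

from torchrec.distributed.embeddingbag import EmbeddingBagCollectionSharder
from torchrec.distributed.planner import EmbeddingShardingPlanner
from torchrec.modules.embedding_modules import EmbeddingBagCollection

# collective_plan() asserts that the default process group is initialized and
# falls back to dist.GroupMember.WORLD when no pg argument is given.
dist.init_process_group(backend="nccl")

ebc = EmbeddingBagCollection(tables=eb_configs, device=torch.device("meta"))
planner = EmbeddingShardingPlanner()

# plan() runs on rank 0 only; the resulting ShardingPlan is broadcast to all
# ranks via invoke_on_rank_and_broadcast_result.
plan = planner.collective_plan(
    module=ebc,
    sharders=[EmbeddingBagCollectionSharder()],
)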
torchrec/inference/inference_legacy/src/BatchingQueue.cpp

Lines changed: 0 additions & 2 deletions
@@ -18,11 +18,9 @@
 
 #include <ATen/Functions.h> // @manual
 #include <ATen/core/Dict.h>
-#include <ATen/core/interned_strings.h>
 #include <ATen/record_function.h> // @manual
 #include <c10/core/Device.h>
 #include <c10/core/DeviceType.h>
-#include <c10/cuda/CUDAFunctions.h>
 #include <c10/cuda/CUDAGuard.h>
 #include <c10/cuda/CUDAStream.h>
 #include <fmt/format.h>

torchrec/inference/inference_legacy/src/GPUExecutor.cpp

Lines changed: 0 additions & 4 deletions
@@ -16,11 +16,8 @@
 #include <c10/cuda/CUDAGuard.h>
 #include <fmt/format.h>
 #include <folly/MPMCQueue.h>
-#include <folly/ScopeGuard.h>
-#include <folly/Synchronized.h>
 #include <folly/executors/CPUThreadPoolExecutor.h>
 #include <folly/futures/Future.h>
-#include <folly/io/IOBuf.h>
 #include <folly/io/async/Request.h>
 #include <folly/stop_watch.h>
 #include <gflags/gflags.h>
@@ -35,7 +32,6 @@
 #endif
 
 #include "ATen/cuda/CUDAEvent.h"
-#include "torchrec/inference/BatchingQueue.h"
 #include "torchrec/inference/ExceptionHandler.h"
 #include "torchrec/inference/Observer.h"
 #include "torchrec/inference/Types.h"

torchrec/inference/inference_legacy/src/ResultSplit.cpp

Lines changed: 0 additions & 2 deletions
@@ -8,12 +8,10 @@
 
 #include "torchrec/inference/ResultSplit.h"
 
-#include <c10/core/ScalarType.h>
 #include <folly/Range.h>
 #include <folly/container/Enumerate.h>
 #include <folly/io/Cursor.h>
 
-#include "ATen/Functions.h"
 #include "torchrec/inference/Types.h"
 
 namespace torchrec {
