Skip to content

Commit cba04eb

Browse files
Jianbo Liu authored and facebook-github-bot committed
Support get/set the whole row of metaheader+weight+optimizer from backend for checkpoint saving/loading (#3153)
Summary: X-link: facebookresearch/FBGEMM#1500 Pull Request resolved: #3153 X-link: pytorch/FBGEMM#4435 # Context In our current KVZCH cp loading flow, we will keep hold of the weight_id, weight, and optimizer tensors throughout the checkpoint loading lifecycle, and at the end, when all these tensors are downloaded and in hand, we will explicitly call "apply_state_dict" to actually write them by chunk to the backend to ensure id->weight and id->opt are mapped correctly. The problem is that when we have a large number of weights, we will be short of memory, since we need to hold all 3 tensors (double memory issue). To solve this challenge, we are going to save the whole row of (metaheader + weight + opt) as the same "weight" tensor during checkpoint saving, and when downloading the checkpoint, we will be able to extract the id from the header and directly write the weight+opt part to the backend by id. When loading the checkpoint for the optimizer, we added a no-op KVTensor, so it won't need to write the optimizer states to the backend again. # This diff only contains frontend changes * added `backend_return_whole_row` flag in KVZCH params, with validation to make sure it's only True when opt_offloading is used * added `read_only_` flag in KVTensorWrapper to be used for checkpoint calls. When read_only=True, all write operations to this KVT will be no-ops * added metadata recalculation for the optimizer state dict, because we are now returning a read-only KVT for the opt state dict, and the model store will need to correct the global metadata before creating the save plan for KVZCH opt tensors * by default, opt offloading and return-whole-row are False on trunk, so this should not break existing KVZCH runs Reviewed By: emlin Differential Revision: D77666892 Privacy Context Container: L1138451 fbshipit-source-id: b0ca5f0f880ede1a803f77d0d520abb3356a0c8d
1 parent d599766 commit cba04eb

File tree

2 files changed

+30
-9
lines changed

2 files changed

+30
-9
lines changed

torchrec/distributed/batched_embedding_kernel.py

Lines changed: 19 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -209,6 +209,7 @@ def _populate_zero_collision_tbe_params(
209209
tbe_params: Dict[str, Any],
210210
sharded_local_buckets: List[Tuple[int, int, int]],
211211
config: GroupedEmbeddingConfig,
212+
backend_type: BackendType,
212213
) -> None:
213214
"""
214215
Construct Zero Collision TBE params from config and fused params dict.
@@ -220,11 +221,15 @@ def _populate_zero_collision_tbe_params(
220221
bucket_sizes: List[int] = [size for _, _, size in sharded_local_buckets]
221222

222223
enabled = False
223-
for table in config.embedding_tables:
224-
if table.virtual_table_eviction_policy is not None and not isinstance(
225-
table.virtual_table_eviction_policy, NoEvictionPolicy
226-
):
227-
enabled = True
224+
meta_header_lens = [0] * len(config.embedding_tables)
225+
for i, table in enumerate(config.embedding_tables):
226+
# virtual_table_eviction_policy won't be None in reality: https://fburl.com/code/864a0w0f
227+
if table.virtual_table_eviction_policy is not None:
228+
meta_header_lens[i] = (
229+
table.virtual_table_eviction_policy.get_meta_header_len()
230+
)
231+
if not isinstance(table.virtual_table_eviction_policy, NoEvictionPolicy):
232+
enabled = True
228233
if enabled:
229234
counter_thresholds = [0] * len(config.embedding_tables)
230235
ttls_in_mins = [0] * len(config.embedding_tables)
@@ -283,14 +288,16 @@ def _populate_zero_collision_tbe_params(
283288
ttls_in_mins=ttls_in_mins,
284289
counter_decay_rates=counter_decay_rates,
285290
l2_weight_thresholds=l2_weight_thresholds,
291+
meta_header_lens=meta_header_lens,
286292
)
287293
else:
288-
eviction_policy = None
294+
eviction_policy = EvictionPolicy(meta_header_lens=meta_header_lens)
289295

290296
tbe_params["kv_zch_params"] = KVZCHParams(
291297
bucket_offsets=bucket_offsets,
292298
bucket_sizes=bucket_sizes,
293299
enable_optimizer_offloading=True,
300+
backend_return_whole_row=(backend_type == BackendType.DRAM),
294301
eviction_policy=eviction_policy,
295302
)
296303

@@ -1395,7 +1402,9 @@ def __init__(
13951402
self._config.embedding_tables, self._pg
13961403
)
13971404
)
1398-
_populate_zero_collision_tbe_params(ssd_tbe_params, self._bucket_spec, config)
1405+
_populate_zero_collision_tbe_params(
1406+
ssd_tbe_params, self._bucket_spec, config, backend_type
1407+
)
13991408
compute_kernel = config.embedding_tables[0].compute_kernel
14001409
embedding_location = compute_kernel_to_embedding_location(compute_kernel)
14011410

@@ -2201,7 +2210,9 @@ def __init__(
22012210
self._config.embedding_tables, self._pg
22022211
)
22032212
)
2204-
_populate_zero_collision_tbe_params(ssd_tbe_params, self._bucket_spec, config)
2213+
_populate_zero_collision_tbe_params(
2214+
ssd_tbe_params, self._bucket_spec, config, backend_type
2215+
)
22052216
compute_kernel = config.embedding_tables[0].compute_kernel
22062217
embedding_location = compute_kernel_to_embedding_location(compute_kernel)
22072218

torchrec/distributed/embedding.py

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -956,7 +956,17 @@ def _initialize_torch_state(self) -> None: # noqa
956956
(
957957
[
958958
# assuming virtual table only supports rw sharding for now
959-
0 if dim == 0 else dim_size
959+
# When backend return whole row, need to respect dim(1)
960+
# otherwise will see shard dim exceeded tensor dim error
961+
(
962+
0
963+
if dim == 0
964+
else (
965+
local_shards[0].metadata.shard_sizes[1]
966+
if dim == 1
967+
else dim_size
968+
)
969+
)
960970
for dim, dim_size in enumerate(
961971
self._name_to_table_size[table_name]
962972
)

0 commit comments

Comments
 (0)