pytorch
diff --git a/fbgemm_gpu/fbgemm_gpu/split_table_batched_embeddings_ops_common.py b/fbgemm_gpu/fbgemm_gpu/split_table_batched_embeddings_ops_common.py
Lines changed: 39 additions & 1 deletion
diff --git a/fbgemm_gpu/fbgemm_gpu/tbe/ssd/training.py b/fbgemm_gpu/fbgemm_gpu/tbe/ssd/training.py
Lines changed: 48 additions & 16 deletions
diff --git a/fbgemm_gpu/src/dram_kv_embedding_cache/dram_kv_embedding_base.h b/fbgemm_gpu/src/dram_kv_embedding_cache/dram_kv_embedding_base.h
Lines changed: 0 additions & 51 deletions
@@ -11,7 +11,7 @@
 
 import enum
 from dataclasses import dataclass
-from typing import List, NamedTuple, Tuple
+from typing import List, NamedTuple, Optional, Tuple
 
 import torch
 from torch import Tensor
@@ -60,6 +60,43 @@ def from_str(cls, key: str):
             raise ValueError(f"Cannot parse value into EmbeddingLocation: {key}")
 
 
+class EvictionPolicy(NamedTuple):  # eviction knobs for KVZCHParams.eviction_policy
+    eviction_trigger_mode: int = (
+        0  # default 0 (disabled). 0: disabled, 1: iteration, 2: mem_util, 3: manual
+    )
+    eviction_strategy: int = (
+        0  # default 0 (timestamp). 0: timestamp, 1: counter (feature score), 2: counter (feature score) + timestamp, 3: feature l2 norm
+    )
+    eviction_step_intervals: Optional[int] = (
+        None  # trigger step interval, used if trigger mode is 1 (iteration)
+    )
+    eviction_mem_threshold_gb: Optional[int] = (
+        None  # memory threshold in GB that triggers eviction, used if trigger mode is 2 (mem_util)
+    )
+    counter_thresholds: Optional[List[int]] = (
+        None  # per-table counter thresholds, used if eviction strategy is feature score
+    )
+    ttls_in_mins: Optional[List[int]] = (
+        None  # per-table TTLs in minutes, used if eviction strategy is timestamp
+    )
+    counter_decay_rates: Optional[List[float]] = (
+        None  # per-table counter decay rates, used if eviction strategy is feature score
+    )
+    l2_weight_thresholds: Optional[List[float]] = (
+        None  # per-table l2 weight thresholds, used if eviction strategy is feature l2 norm
+    )
+    interval_for_insufficient_eviction_s: int = (
+        # minimum seconds to wait before triggering the next round of eviction when the last finished
+        # eviction was insufficient, i.e. it did not evict enough rows; the longer wait is meant to
+        # avoid another insufficient eviction
+        600
+    )
+    interval_for_sufficient_eviction_s: int = (
+        # minimum seconds to wait before triggering the next round of eviction when the last finished eviction was sufficient
+        60
+    )
+
+
 class KVZCHParams(NamedTuple):
     # global bucket id start and global bucket id end offsets for each logical table,
     # where start offset is inclusive and end offset is exclusive
@@ -69,6 +106,7 @@ class KVZCHParams(NamedTuple):
     bucket_sizes: List[int] = []
     # enable optimizer offloading or not
     enable_optimizer_offloading: bool = False
+    eviction_policy: Optional[EvictionPolicy] = None
 
     def validate(self) -> None:
         assert len(self.bucket_offsets) == len(self.bucket_sizes), (
 
@@ -248,6 +248,12 @@ def __init__(
             self.total_hash_size_bits: int = 0
         else:
             self.total_hash_size_bits: int = int(log2(float(hash_size_cumsum[-1])) + 1)
+        self.register_buffer(
+            "table_hash_size_cumsum",
+            torch.tensor(
+                hash_size_cumsum, device=self.current_device, dtype=torch.int64
+            ),
+        )
         # The last element is to easily access # of rows of each table by
         self.total_hash_size_bits = int(log2(float(hash_size_cumsum[-1])) + 1)
         self.total_hash_size: int = hash_size_cumsum[-1]
@@ -288,6 +294,10 @@ def __init__(
             "feature_dims",
             torch.tensor(feature_dims, device="cpu", dtype=torch.int64),
         )
+        self.register_buffer(
+            "table_dims",
+            torch.tensor(dims, device="cpu", dtype=torch.int64),
+        )
 
         (info_B_num_bits_, info_B_mask_) = torch.ops.fbgemm.get_infos_metadata(
             self.D_offsets,  # unused tensor
@@ -518,6 +528,7 @@ def __init__(
                 logging.warning("dist is not initialized, treating as single gpu cases")
                 tbe_unique_id = SSDTableBatchedEmbeddingBags._local_instance_index
         self.tbe_unique_id = tbe_unique_id
+        self.l2_cache_size = l2_cache_size
         logging.info(f"tbe_unique_id: {tbe_unique_id}")
         if self.backend_type == BackendType.SSD:
             logging.info(
@@ -564,12 +575,12 @@ def __init__(
                 self.res_params.table_offsets,
                 self.res_params.table_sizes,
                 (
-                    tensor_pad4(self.feature_dims.cpu())
+                    tensor_pad4(self.table_dims)
                     if self.enable_optimizer_offloading
                     else None
                 ),
                 (
-                    self.hash_size_cumsum.cpu()
+                    self.table_hash_size_cumsum.cpu()
                     if self.enable_optimizer_offloading
                     else None
                 ),
@@ -609,28 +620,42 @@ def __init__(
                 f"feature_dims={self.feature_dims},"
                 f"hash_size_cumsum={self.hash_size_cumsum}"
             )
+            table_dims = (
+                tensor_pad4(self.table_dims)
+                if self.enable_optimizer_offloading
+                else None
+            )  # table_dims
+            eviction_config = None
+            if self.kv_zch_params and self.kv_zch_params.eviction_policy:
+                eviction_mem_threshold_gb = (
+                    self.kv_zch_params.eviction_policy.eviction_mem_threshold_gb
+                    if self.kv_zch_params.eviction_policy.eviction_mem_threshold_gb
+                    else self.l2_cache_size
+                )
+                eviction_config = torch.classes.fbgemm.FeatureEvictConfig(
+                    self.kv_zch_params.eviction_policy.eviction_trigger_mode,  # eviction is disabled, 0: disabled, 1: iteration, 2: mem_util, 3: manual
+                    self.kv_zch_params.eviction_policy.eviction_strategy,  # evict_trigger_strategy: 0: timestamp, 1: counter (feature score), 2: counter (feature score) + timestamp, 3: feature l2 norm
+                    self.kv_zch_params.eviction_policy.eviction_step_intervals,  # trigger_step_interval if trigger mode is iteration
+                    eviction_mem_threshold_gb,  # mem_util_threshold_in_GB if trigger mode is mem_util
+                    self.kv_zch_params.eviction_policy.ttls_in_mins,  # ttls_in_mins for each table if eviction strategy is timestamp
+                    self.kv_zch_params.eviction_policy.counter_thresholds,  # counter_thresholds for each table if eviction strategy is feature score
+                    self.kv_zch_params.eviction_policy.counter_decay_rates,  # counter_decay_rates for each table if eviction strategy is feature score
+                    self.kv_zch_params.eviction_policy.l2_weight_thresholds,  # l2_weight_thresholds for each table if eviction strategy is feature l2 norm
+                    table_dims.tolist() if table_dims is not None else None,
+                    self.kv_zch_params.eviction_policy.interval_for_insufficient_eviction_s,
+                    self.kv_zch_params.eviction_policy.interval_for_sufficient_eviction_s,
+                )
             self._ssd_db = torch.classes.fbgemm.DramKVEmbeddingCacheWrapper(
                 self.cache_row_dim,
                 ssd_uniform_init_lower,
                 ssd_uniform_init_upper,
-                0,  # eviction is disabled, 0: disabled, 1: iteration, 2: mem_util, 3: manual
-                0,  # trigger_step_interval if trigger mode is iteration
-                0,  # mem_util_threshold_in_GB if trigger mode is mem_util
-                0,  # evict_trigger_strategy: 0: timestamp, 1: counter (feature score), 2: counter (feature score) + timestamp, 3: feature l2 norm
-                None,  # count_thresholds for each table if eviction strategy is feature score
-                None,  # ttls_in_mins for each table if eviction strategy is timestamp
-                None,  # count_decay_rates for each table if eviction strategy is feature score
-                None,  # l2_weight_thresholds for each table if eviction strategy is feature l2 norm
+                eviction_config,
                 ssd_rocksdb_shards,  # num_shards
                 ssd_rocksdb_shards,  # num_threads
                 weights_precision.bit_rate(),  # row_storage_bitwidth
+                table_dims,
                 (
-                    tensor_pad4(self.feature_dims.cpu())
-                    if self.enable_optimizer_offloading
-                    else None
-                ),  # table_dims
-                (
-                    self.hash_size_cumsum.cpu()
+                    self.table_hash_size_cumsum.cpu()
                     if self.enable_optimizer_offloading
                     else None
                 ),  # hash_size_cumsum
@@ -2434,6 +2459,13 @@ def _may_create_snapshot_for_state_dict(
                     f"created snapshot for weight states: {snapshot_handle}, latency: {(time.time() - start_time) * 1000} ms"
                 )
         elif self.backend_type == BackendType.DRAM:
+            # if there is any ongoing eviction, lets wait until eviction is finished before state_dict
+            # so that we can reach consistent model state before/after state_dict
+            evict_wait_start_time = time.time()
+            self.ssd_db.wait_until_eviction_done()
+            logging.info(
+                f"state_dict wait for ongoing eviction: {time.time() - evict_wait_start_time} s"
+            )
             self.flush(force=should_flush)
         return snapshot_handle, checkpoint_handle