pyg-team · weihua916 · Dec 27, 2024 · Dec 20, 2024 · Dec 22, 2024 · Dec 22, 2024
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -7,6 +7,7 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
 
 ### Added
 
+- Added support for materializing datasets for train and test dataframes separately ([#470](https://github.com/pyg-team/pytorch-frame/issues/470))
 - Added support for PyTorch 2.5 ([#464](https://github.com/pyg-team/pytorch-frame/pull/464))
 - Added a benchmark script to compare PyTorch Frame with PyTorch Tabular ([#398](https://github.com/pyg-team/pytorch-frame/pull/398), [#444](https://github.com/pyg-team/pytorch-frame/pull/444))
 - Added `is_floating_point` method to `MultiNestedTensor` and `MultiEmbeddingTensor` ([#445](https://github.com/pyg-team/pytorch-frame/pull/445))

diff --git a/torch_frame/data/dataset.py b/torch_frame/data/dataset.py
@@ -554,6 +554,7 @@ def materialize(
         self,
         device: torch.device | None = None,
         path: str | None = None,
+        col_stats: dict[str, dict[StatType, Any]] | None = None,
     ) -> Dataset:
         r"""Materializes the dataset into a tensor representation. From this
         point onwards, the dataset should be treated as read-only.
@@ -570,6 +571,10 @@ def materialize(
                 :obj:`path`. If :obj:`path` is :obj:`None`, this will
                 materialize the dataset without caching.
                 (default: :obj:`None`)
+            col_stats (Dict[str, Dict[StatType, Any]], optional): Column
+                statistics provided by the user. If not provided, the
+                statistics are computed from the dataframe itself.
+                (default: :obj:`None`)
         """
         if self.is_materialized:
             # Materialized without specifying path at first and materialize
@@ -589,23 +594,27 @@ def materialize(
             return self
 
         # 1. Fill column statistics:
-        for col, stype in self.col_to_stype.items():
-            ser = self.df[col]
-            self._col_stats[col] = compute_col_stats(
-                ser,
-                stype,
-                sep=self.col_to_sep.get(col, None),
-                time_format=self.col_to_time_format.get(col, None),
-            )
-            # For a target column, sort categories lexicographically such that
-            # we do not accidentally swap labels in binary classification
-            # tasks.
-            if col == self.target_col and stype == torch_frame.categorical:
-                index, value = self._col_stats[col][StatType.COUNT]
-                if len(index) == 2:
-                    ser = pd.Series(index=index, data=value).sort_index()
-                    index, value = ser.index.tolist(), ser.values.tolist()
-                    self._col_stats[col][StatType.COUNT] = (index, value)
+        if col_stats is None:
+            # calculate from data if col_stats is not provided
+            for col, stype in self.col_to_stype.items():
+                ser = self.df[col]
+                self._col_stats[col] = compute_col_stats(
+                    ser,
+                    stype,
+                    sep=self.col_to_sep.get(col, None),
+                    time_format=self.col_to_time_format.get(col, None),
+                )
+                # For a target column, sort categories lexicographically
+                # such that we do not accidentally swap labels in binary
+                # classification tasks.
+                if col == self.target_col and stype == torch_frame.categorical:
+                    index, value = self._col_stats[col][StatType.COUNT]
+                    if len(index) == 2:
+                        ser = pd.Series(index=index, data=value).sort_index()
+                        index, value = ser.index.tolist(), ser.values.tolist()
+                        self._col_stats[col][StatType.COUNT] = (index, value)
+        else:
+            self._col_stats = col_stats
 
         # 2. Create the `TensorFrame`:
         self._to_tensor_frame_converter = self._get_tensorframe_converter()