fix(dataframe_serialize): truncate content of dataframe columns (#1691)

ArslanSaleem · ellipsis-dev[bot] · web-flow · commit 30dd94f3fdde · 2025-03-20T20:55:50.000+01:00
* fix(dataframe_serialize): truncate content of dataframe columns

* fix(dataframe): clean dataframe serialization code

* Update pandasai/helpers/dataframe_serializer.py

Co-authored-by: ellipsis-dev[bot] <65095814+ellipsis-dev[bot]@users.noreply.github.com>

---------

Co-authored-by: ellipsis-dev[bot] <65095814+ellipsis-dev[bot]@users.noreply.github.com>
diff --git a/pandasai/helpers/dataframe_serializer.py b/pandasai/helpers/dataframe_serializer.py
@@ -1,23 +1,27 @@
+import json
 import typing
 
 if typing.TYPE_CHECKING:
     from ..dataframe.base import DataFrame
 
 
 class DataframeSerializer:
-    def __init__(self) -> None:
-        pass
+    MAX_COLUMN_TEXT_LENGTH = 200
 
-    @staticmethod
-    def serialize(df: "DataFrame", dialect: str = "postgres") -> str:
+    @classmethod
+    def serialize(cls, df: "DataFrame", dialect: str = "postgres") -> str:
         """
-        Convert df to csv like format where csv is wrapped inside <dataframe></dataframe>
+        Convert df to a CSV-like format wrapped inside <table> tags, truncating long text values, and serializing only a subset of rows using df.head().
+
         Args:
-            df (pd.DataFrame): PandaAI dataframe or dataframe
+            df (pd.DataFrame): Pandas DataFrame
+            dialect (str): Database dialect (default is "postgres")
 
         Returns:
-            str: dataframe stringify
+            str: Serialized DataFrame string
         """
+
+        # Start building the table metadata
         dataframe_info = f'<table dialect="{dialect}" table_name="{df.schema.name}"'
 
         # Add description attribute if available
@@ -26,10 +30,27 @@ def serialize(df: "DataFrame", dialect: str = "postgres") -> str:
 
         dataframe_info += f' dimensions="{df.rows_count}x{df.columns_count}">'
 
-        # Add dataframe details
-        dataframe_info += f"\n{df.head().to_csv(index=False)}"
+        # Truncate long values
+        df_truncated = cls._truncate_dataframe(df.head())
 
-        # Close the dataframe tag
+        # Convert to CSV format
+        dataframe_info += f"\n{df_truncated.to_csv(index=False)}"
+
+        # Close the table tag
         dataframe_info += "</table>\n"
 
         return dataframe_info
+
+    @classmethod
+    def _truncate_dataframe(cls, df: "DataFrame") -> "DataFrame":
+        """Truncates string values exceeding MAX_COLUMN_TEXT_LENGTH, and converts JSON-like values to truncated strings."""
+
+        def truncate_value(value):
+            if isinstance(value, (dict, list)):  # Convert JSON-like objects to strings
+                value = json.dumps(value, ensure_ascii=False)
+
+            if isinstance(value, str) and len(value) > cls.MAX_COLUMN_TEXT_LENGTH:
+                return f"{value[: cls.MAX_COLUMN_TEXT_LENGTH]}…"
+            return value
+
+        return df.applymap(truncate_value)
diff --git a/tests/unit_tests/helpers/test_dataframe_serializer.py b/tests/unit_tests/helpers/test_dataframe_serializer.py
@@ -1,3 +1,5 @@
+import pandas as pd
+
 from pandasai.helpers.dataframe_serializer import DataframeSerializer
 
 
@@ -27,3 +29,28 @@ def test_serialize_with_name_and_description_with_dialect(self, sample_df):
 </table>
 """
         assert result.replace("\r\n", "\n") == expected.replace("\r\n", "\n")
+
+    def test_serialize_with_dataframe_long_strings(self, sample_df):
+        """Test serialization with long strings to ensure truncation."""
+
+        # Generate a DataFrame with a long string in column 'A'
+        long_text = "A" * 300
+        sample_df.loc[0, "A"] = long_text
+
+        # Serialize the DataFrame
+        result = DataframeSerializer.serialize(sample_df, dialect="mysql")
+
+        # Expected truncated value (200 characters + ellipsis)
+        truncated_text = long_text[: DataframeSerializer.MAX_COLUMN_TEXT_LENGTH] + "…"
+
+        # Expected output
+        expected = f"""<table dialect="mysql" table_name="table_6c30b42101939c7bdf95f4c1052d615c" dimensions="3x2">
+A,B
+{truncated_text},4
+2,5
+3,6
+</table>
+"""
+
+        # Normalize line endings before asserting
+        assert result.replace("\r\n", "\n") == expected.replace("\r\n", "\n")