Skip to content

Commit 30dd94f

Browse files
fix(dataframe_serialize): truncate content of dataframe columns (#1691)
* fix(dataframe_serialize): truncate content of dataframe columns
* fix(dataframe): clean dataframe serialization code
* Update pandasai/helpers/dataframe_serializer.py

Co-authored-by: ellipsis-dev[bot] <65095814+ellipsis-dev[bot]@users.noreply.github.com>

---------

Co-authored-by: ellipsis-dev[bot] <65095814+ellipsis-dev[bot]@users.noreply.github.com>
1 parent 890fbba commit 30dd94f

File tree

2 files changed

+58
-10
lines changed

2 files changed

+58
-10
lines changed
Lines changed: 31 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -1,23 +1,27 @@
1+
import json
12
import typing
23

34
if typing.TYPE_CHECKING:
45
from ..dataframe.base import DataFrame
56

67

78
class DataframeSerializer:
8-
def __init__(self) -> None:
9-
pass
9+
MAX_COLUMN_TEXT_LENGTH = 200
1010

11-
@staticmethod
12-
def serialize(df: "DataFrame", dialect: str = "postgres") -> str:
11+
@classmethod
12+
def serialize(cls, df: "DataFrame", dialect: str = "postgres") -> str:
1313
"""
14-
Convert df to csv like format where csv is wrapped inside <dataframe></dataframe>
14+
Convert df to a CSV-like format wrapped inside <table> tags, truncating long text values, and serializing only a subset of rows using df.head().
15+
1516
Args:
16-
df (pd.DataFrame): PandaAI dataframe or dataframe
17+
df (pd.DataFrame): Pandas DataFrame
18+
dialect (str): Database dialect (default is "postgres")
1719
1820
Returns:
19-
str: dataframe stringify
21+
str: Serialized DataFrame string
2022
"""
23+
24+
# Start building the table metadata
2125
dataframe_info = f'<table dialect="{dialect}" table_name="{df.schema.name}"'
2226

2327
# Add description attribute if available
@@ -26,10 +30,27 @@ def serialize(df: "DataFrame", dialect: str = "postgres") -> str:
2630

2731
dataframe_info += f' dimensions="{df.rows_count}x{df.columns_count}">'
2832

29-
# Add dataframe details
30-
dataframe_info += f"\n{df.head().to_csv(index=False)}"
33+
# Truncate long values
34+
df_truncated = cls._truncate_dataframe(df.head())
3135

32-
# Close the dataframe tag
36+
# Convert to CSV format
37+
dataframe_info += f"\n{df_truncated.to_csv(index=False)}"
38+
39+
# Close the table tag
3340
dataframe_info += "</table>\n"
3441

3542
return dataframe_info
43+
44+
@classmethod
def _truncate_dataframe(cls, df: "DataFrame") -> "DataFrame":
    """Return a new dataframe whose over-long cell values are shortened.

    Each cell is processed independently:

    * ``dict`` and ``list`` values are converted to a JSON string
      (non-ASCII characters are kept as-is).
    * String values (including the JSON strings produced above) longer
      than ``MAX_COLUMN_TEXT_LENGTH`` are cut to that length and suffixed
      with an ellipsis character.
    * Every other value is returned unchanged.

    Args:
        df: Dataframe whose cells should be truncated.

    Returns:
        The result of applying the truncation element-wise to ``df``.
    """
    limit = cls.MAX_COLUMN_TEXT_LENGTH

    def truncate_value(value):
        if isinstance(value, (dict, list)):
            try:
                value = json.dumps(value, ensure_ascii=False)
            except (TypeError, ValueError):
                # The container holds something JSON cannot encode (or is
                # circular); fall back to its plain string form instead of
                # aborting the whole serialization.
                value = str(value)

        if isinstance(value, str) and len(value) > limit:
            return f"{value[:limit]}…"
        return value

    # DataFrame.applymap is deprecated since pandas 2.1 in favour of
    # DataFrame.map. The check is done on the class so that a column that
    # happens to be named "map" cannot be mistaken for the method.
    if hasattr(type(df), "map"):
        return df.map(truncate_value)
    return df.applymap(truncate_value)

tests/unit_tests/helpers/test_dataframe_serializer.py

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,5 @@
1+
import pandas as pd
2+
13
from pandasai.helpers.dataframe_serializer import DataframeSerializer
24

35

@@ -27,3 +29,28 @@ def test_serialize_with_name_and_description_with_dialect(self, sample_df):
2729
</table>
2830
"""
2931
assert result.replace("\r\n", "\n") == expected.replace("\r\n", "\n")
32+
33+
def test_serialize_with_dataframe_long_strings(self, sample_df):
    """Verify that an over-long cell is cut to the serializer's limit."""

    def unix_newlines(text):
        # Line endings produced by to_csv may differ per platform.
        return text.replace("\r\n", "\n")

    # Put a 300-character string into the first row of column 'A'.
    oversized = "A" * 300
    sample_df.loc[0, "A"] = oversized

    serialized = DataframeSerializer.serialize(sample_df, dialect="mysql")

    # The cell is expected to keep MAX_COLUMN_TEXT_LENGTH characters
    # followed by an ellipsis.
    limit = DataframeSerializer.MAX_COLUMN_TEXT_LENGTH
    truncated_text = oversized[:limit] + "…"

    expected = f"""<table dialect="mysql" table_name="table_6c30b42101939c7bdf95f4c1052d615c" dimensions="3x2">
A,B
{truncated_text},4
2,5
3,6
</table>
"""

    assert unix_newlines(serialized) == unix_newlines(expected)

0 commit comments

Comments
 (0)