Skip to content

fix: allow context.span_id as column name in pandas dataframe #7368

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
May 13, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -625,20 +625,31 @@ def _validate_dataframe(
raise ValueError(f"{name_column} values must be strings")

# Check for span_id in either columns or index
if "span_id" not in dataframe.columns and not all(isinstance(x, str) for x in dataframe.index): # pyright: ignore[reportUnknownVariableType,reportUnknownMemberType]
raise ValueError("DataFrame must have either a 'span_id' column or a string-based index")
has_span_id = "span_id" in dataframe.columns
has_context_span_id = "context.span_id" in dataframe.columns
if has_span_id and has_context_span_id:
raise ValueError("DataFrame cannot have both 'span_id' and 'context.span_id' columns")
if (
not has_span_id
and not has_context_span_id
and not all(isinstance(x, str) for x in dataframe.index) # pyright: ignore[reportUnknownVariableType,reportUnknownMemberType]
):
Comment on lines +635 to +636
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

pyright ignore? Do we need pyright here?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

pyright is used here for type checking TypedDict (so they would work somewhat like js). Unfortunately it means we need to ignore it every time we use pandas because pandas types are very lax.

raise ValueError(
"DataFrame must have either a 'span_id' or 'context.span_id' column, or a string-based index" # noqa: E501
)

# Validate span_id values if using column
if "span_id" in dataframe.columns:
span_id_column = "context.span_id" if has_context_span_id else "span_id"
if span_id_column in dataframe.columns:
# Check for None values
if dataframe["span_id"].isna().any(): # pyright: ignore[reportUnknownMemberType]
raise ValueError("span_id values cannot be None")
if dataframe[span_id_column].isna().any(): # pyright: ignore[reportUnknownMemberType]
raise ValueError(f"{span_id_column} values cannot be None")
# Check for empty or whitespace-only strings
if (dataframe["span_id"].str.strip() == "").any(): # pyright: ignore[reportUnknownMemberType]
raise ValueError("span_id values must be non-empty strings")
if (dataframe[span_id_column].str.strip() == "").any(): # pyright: ignore[reportUnknownMemberType]
raise ValueError(f"{span_id_column} values must be non-empty strings")
# Check for non-string values
if not all(isinstance(x, str) for x in dataframe["span_id"]): # pyright: ignore[reportUnknownVariableType,reportUnknownMemberType]
raise ValueError("span_id values must be strings")
if not all(isinstance(x, str) for x in dataframe[span_id_column]): # pyright: ignore[reportUnknownVariableType,reportUnknownMemberType]
raise ValueError(f"{span_id_column} values must be strings")
# Validate index values if using index as span_id
else:
# Check for empty or whitespace-only strings
Expand Down Expand Up @@ -723,6 +734,8 @@ def _chunk_dataframe(
span_id = (
str(row["span_id"]) # pyright: ignore[reportUnknownArgumentType]
if "span_id" in dataframe.columns and bool(row["span_id"]) # pyright: ignore[reportUnknownArgumentType]
else str(row["context.span_id"]) # pyright: ignore[reportUnknownArgumentType]
if "context.span_id" in dataframe.columns and bool(row["context.span_id"]) # pyright: ignore[reportUnknownArgumentType]
else str(idx)
)

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -94,10 +94,72 @@ def test_missing_span_id(self) -> None:
df = pd.DataFrame({"name": ["sentiment"], "annotator_kind": ["HUMAN"]})
with pytest.raises(
ValueError,
match="DataFrame must have either a 'span_id' column or a string-based index",
match="DataFrame must have either a 'span_id' or 'context.span_id' column, or a string-based index", # noqa: E501
):
_validate_dataframe(dataframe=df)

def test_both_span_id_columns(self) -> None:
"""Test validation when both span_id and context.span_id columns are present."""
df = pd.DataFrame(
{
"name": ["sentiment"],
"annotator_kind": ["HUMAN"],
"span_id": ["span1"],
"context.span_id": ["span1"],
}
)
with pytest.raises(
ValueError, match="DataFrame cannot have both 'span_id' and 'context.span_id' columns"
):
_validate_dataframe(dataframe=df)

def test_valid_with_context_span_id(self) -> None:
"""Test validation with valid DataFrame using context.span_id column."""
df = pd.DataFrame(
{
"name": ["sentiment"],
"annotator_kind": ["HUMAN"],
"context.span_id": ["span1"],
}
)
_validate_dataframe(dataframe=df) # Should not raise

def test_invalid_context_span_id_values(self) -> None:
"""Test validation with invalid context.span_id values."""
df = pd.DataFrame(
{
"name": ["sentiment", "sentiment"],
"annotator_kind": ["HUMAN", "HUMAN"],
"context.span_id": ["", " "], # Empty strings
}
)
with pytest.raises(ValueError, match="context.span_id values must be non-empty strings"):
_validate_dataframe(dataframe=df)

def test_none_context_span_id_values(self) -> None:
"""Test validation with None values in context.span_id column."""
df = pd.DataFrame(
{
"name": ["sentiment", "sentiment"],
"annotator_kind": ["HUMAN", "HUMAN"],
"context.span_id": [None, "valid_id"], # None value
}
)
with pytest.raises(ValueError, match="context.span_id values cannot be None"):
_validate_dataframe(dataframe=df)

def test_non_string_context_span_id(self) -> None:
"""Test validation with non-string values in context.span_id column."""
df = pd.DataFrame(
{
"name": ["sentiment", "sentiment"],
"annotator_kind": ["HUMAN", "HUMAN"],
"context.span_id": [123, "valid_id"], # Non-string value
}
)
with pytest.raises(ValueError, match="context.span_id values must be strings"):
_validate_dataframe(dataframe=df)

def test_valid_with_index(self) -> None:
"""Test validation with valid DataFrame using index as span_id."""
df = pd.DataFrame({"name": ["sentiment"], "annotator_kind": ["HUMAN"]}, index=["span1"])
Expand Down Expand Up @@ -153,18 +215,6 @@ def test_invalid_index_values(self) -> None:
):
_validate_dataframe(dataframe=df)

def test_none_span_id_values(self) -> None:
"""Test validation with None values in span_id column."""
df = pd.DataFrame(
{
"name": ["sentiment", "sentiment"],
"annotator_kind": ["HUMAN", "HUMAN"],
"span_id": [None, "valid_id"], # None value
}
)
with pytest.raises(ValueError, match="span_id values cannot be None"):
_validate_dataframe(dataframe=df)

def test_mixed_valid_invalid_values(self) -> None:
"""Test validation with mixed valid and invalid values in columns."""
df = pd.DataFrame(
Expand Down Expand Up @@ -383,3 +433,34 @@ def test_invalid_score_type(self) -> None:
match="Error processing row 0: Score value 'not_a_number' cannot be converted to float",
):
list(_chunk_dataframe(dataframe=df))

def test_chunk_with_context_span_id(self) -> None:
"""Test chunking with context.span_id column."""
df = pd.DataFrame(
{
"name": ["test1", "test2"],
"annotator_kind": ["HUMAN", "HUMAN"],
"context.span_id": ["id1", "id2"],
"label": ["label1", "label2"],
}
)
chunks = list(_chunk_dataframe(dataframe=df))
assert len(chunks) == 1
assert chunks[0][0]["span_id"] == "id1"
assert chunks[0][1]["span_id"] == "id2"

def test_chunk_with_both_span_id_columns(self) -> None:
"""Test chunking with both span_id and context.span_id columns."""
df = pd.DataFrame(
{
"name": ["test1", "test2"],
"annotator_kind": ["HUMAN", "HUMAN"],
"span_id": ["id1", "id2"],
"context.span_id": ["id1", "id2"],
"label": ["label1", "label2"],
}
)
with pytest.raises(
ValueError, match="DataFrame cannot have both 'span_id' and 'context.span_id' columns"
):
list(_chunk_dataframe(dataframe=df))