From 95674c65cb9c319f3f14dd2780edea94ee6ee80e Mon Sep 17 00:00:00 2001
From: Roger Yang
Date: Fri, 9 May 2025 16:11:39 -0700
Subject: [PATCH] fix: allow context.span_id as column name

---
 .../client/resources/annotations/__init__.py  |  31 +++--
 .../resources/annotations/test_annotations.py | 107 +++++++++++++++---
 2 files changed, 116 insertions(+), 22 deletions(-)

diff --git a/packages/phoenix-client/src/phoenix/client/resources/annotations/__init__.py b/packages/phoenix-client/src/phoenix/client/resources/annotations/__init__.py
index 988546c4c6..f256e07546 100644
--- a/packages/phoenix-client/src/phoenix/client/resources/annotations/__init__.py
+++ b/packages/phoenix-client/src/phoenix/client/resources/annotations/__init__.py
@@ -625,20 +625,31 @@ def _validate_dataframe(
             raise ValueError(f"{name_column} values must be strings")
 
     # Check for span_id in either columns or index
-    if "span_id" not in dataframe.columns and not all(isinstance(x, str) for x in dataframe.index):  # pyright: ignore[reportUnknownVariableType,reportUnknownMemberType]
-        raise ValueError("DataFrame must have either a 'span_id' column or a string-based index")
+    has_span_id = "span_id" in dataframe.columns
+    has_context_span_id = "context.span_id" in dataframe.columns
+    if has_span_id and has_context_span_id:
+        raise ValueError("DataFrame cannot have both 'span_id' and 'context.span_id' columns")
+    if (
+        not has_span_id
+        and not has_context_span_id
+        and not all(isinstance(x, str) for x in dataframe.index)  # pyright: ignore[reportUnknownVariableType,reportUnknownMemberType]
+    ):
+        raise ValueError(
+            "DataFrame must have either a 'span_id' or 'context.span_id' column, or a string-based index"  # noqa: E501
+        )
 
     # Validate span_id values if using column
-    if "span_id" in dataframe.columns:
+    span_id_column = "context.span_id" if has_context_span_id else "span_id"
+    if span_id_column in dataframe.columns:
         # Check for None values
-        if dataframe["span_id"].isna().any():  # pyright: ignore[reportUnknownMemberType]
-            raise ValueError("span_id values cannot be None")
+        if dataframe[span_id_column].isna().any():  # pyright: ignore[reportUnknownMemberType]
+            raise ValueError(f"{span_id_column} values cannot be None")
         # Check for empty or whitespace-only strings
-        if (dataframe["span_id"].str.strip() == "").any():  # pyright: ignore[reportUnknownMemberType]
-            raise ValueError("span_id values must be non-empty strings")
+        if (dataframe[span_id_column].str.strip() == "").any():  # pyright: ignore[reportUnknownMemberType]
+            raise ValueError(f"{span_id_column} values must be non-empty strings")
         # Check for non-string values
-        if not all(isinstance(x, str) for x in dataframe["span_id"]):  # pyright: ignore[reportUnknownVariableType,reportUnknownMemberType]
-            raise ValueError("span_id values must be strings")
+        if not all(isinstance(x, str) for x in dataframe[span_id_column]):  # pyright: ignore[reportUnknownVariableType,reportUnknownMemberType]
+            raise ValueError(f"{span_id_column} values must be strings")
     # Validate index values if using index as span_id
     else:
         # Check for empty or whitespace-only strings
@@ -723,6 +734,8 @@ def _chunk_dataframe(
            span_id = (
                str(row["span_id"])  # pyright: ignore[reportUnknownArgumentType]
                if "span_id" in dataframe.columns and bool(row["span_id"])  # pyright: ignore[reportUnknownArgumentType]
+               else str(row["context.span_id"])  # pyright: ignore[reportUnknownArgumentType]
+               if "context.span_id" in dataframe.columns and bool(row["context.span_id"])  # pyright: ignore[reportUnknownArgumentType]
                else str(idx)
            )
diff --git a/packages/phoenix-client/tests/client/resources/annotations/test_annotations.py b/packages/phoenix-client/tests/client/resources/annotations/test_annotations.py
index bb1fe6a076..9f18b33da1 100644
--- a/packages/phoenix-client/tests/client/resources/annotations/test_annotations.py
+++ b/packages/phoenix-client/tests/client/resources/annotations/test_annotations.py
@@ -94,10 +94,72 @@ def test_missing_span_id(self) -> None:
         df = pd.DataFrame({"name": ["sentiment"], "annotator_kind": ["HUMAN"]})
         with pytest.raises(
             ValueError,
-            match="DataFrame must have either a 'span_id' column or a string-based index",
+            match="DataFrame must have either a 'span_id' or 'context.span_id' column, or a string-based index",  # noqa: E501
         ):
             _validate_dataframe(dataframe=df)
 
+    def test_both_span_id_columns(self) -> None:
+        """Test validation when both span_id and context.span_id columns are present."""
+        df = pd.DataFrame(
+            {
+                "name": ["sentiment"],
+                "annotator_kind": ["HUMAN"],
+                "span_id": ["span1"],
+                "context.span_id": ["span1"],
+            }
+        )
+        with pytest.raises(
+            ValueError, match="DataFrame cannot have both 'span_id' and 'context.span_id' columns"
+        ):
+            _validate_dataframe(dataframe=df)
+
+    def test_valid_with_context_span_id(self) -> None:
+        """Test validation with valid DataFrame using context.span_id column."""
+        df = pd.DataFrame(
+            {
+                "name": ["sentiment"],
+                "annotator_kind": ["HUMAN"],
+                "context.span_id": ["span1"],
+            }
+        )
+        _validate_dataframe(dataframe=df)  # Should not raise
+
+    def test_invalid_context_span_id_values(self) -> None:
+        """Test validation with invalid context.span_id values."""
+        df = pd.DataFrame(
+            {
+                "name": ["sentiment", "sentiment"],
+                "annotator_kind": ["HUMAN", "HUMAN"],
+                "context.span_id": ["", " "],  # Empty strings
+            }
+        )
+        with pytest.raises(ValueError, match="context.span_id values must be non-empty strings"):
+            _validate_dataframe(dataframe=df)
+
+    def test_none_context_span_id_values(self) -> None:
+        """Test validation with None values in context.span_id column."""
+        df = pd.DataFrame(
+            {
+                "name": ["sentiment", "sentiment"],
+                "annotator_kind": ["HUMAN", "HUMAN"],
+                "context.span_id": [None, "valid_id"],  # None value
+            }
+        )
+        with pytest.raises(ValueError, match="context.span_id values cannot be None"):
+            _validate_dataframe(dataframe=df)
+
+    def test_non_string_context_span_id(self) -> None:
+        """Test validation with non-string values in context.span_id column."""
+        df = pd.DataFrame(
+            {
+                "name": ["sentiment", "sentiment"],
+                "annotator_kind": ["HUMAN", "HUMAN"],
+                "context.span_id": [123, "valid_id"],  # Non-string value
+            }
+        )
+        with pytest.raises(ValueError, match="context.span_id values must be strings"):
+            _validate_dataframe(dataframe=df)
+
     def test_valid_with_index(self) -> None:
         """Test validation with valid DataFrame using index as span_id."""
         df = pd.DataFrame({"name": ["sentiment"], "annotator_kind": ["HUMAN"]}, index=["span1"])
@@ -153,18 +215,6 @@ def test_invalid_index_values(self) -> None:
         ):
             _validate_dataframe(dataframe=df)
 
-    def test_none_span_id_values(self) -> None:
-        """Test validation with None values in span_id column."""
-        df = pd.DataFrame(
-            {
-                "name": ["sentiment", "sentiment"],
-                "annotator_kind": ["HUMAN", "HUMAN"],
-                "span_id": [None, "valid_id"],  # None value
-            }
-        )
-        with pytest.raises(ValueError, match="span_id values cannot be None"):
-            _validate_dataframe(dataframe=df)
-
     def test_mixed_valid_invalid_values(self) -> None:
         """Test validation with mixed valid and invalid values in columns."""
         df = pd.DataFrame(
@@ -383,3 +433,34 @@ def test_invalid_score_type(self) -> None:
             match="Error processing row 0: Score value 'not_a_number' cannot be converted to float",
         ):
             list(_chunk_dataframe(dataframe=df))
+
+    def test_chunk_with_context_span_id(self) -> None:
+        """Test chunking with context.span_id column."""
+        df = pd.DataFrame(
+            {
+                "name": ["test1", "test2"],
+                "annotator_kind": ["HUMAN", "HUMAN"],
+                "context.span_id": ["id1", "id2"],
+                "label": ["label1", "label2"],
+            }
+        )
+        chunks = list(_chunk_dataframe(dataframe=df))
+        assert len(chunks) == 1
+        assert chunks[0][0]["span_id"] == "id1"
+        assert chunks[0][1]["span_id"] == "id2"
+
+    def test_chunk_with_both_span_id_columns(self) -> None:
+        """Test chunking with both span_id and context.span_id columns."""
+        df = pd.DataFrame(
+            {
+                "name": ["test1", "test2"],
+                "annotator_kind": ["HUMAN", "HUMAN"],
+                "span_id": ["id1", "id2"],
+                "context.span_id": ["id1", "id2"],
+                "label": ["label1", "label2"],
+            }
+        )
+        with pytest.raises(
+            ValueError, match="DataFrame cannot have both 'span_id' and 'context.span_id' columns"
+        ):
+            list(_chunk_dataframe(dataframe=df))
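
A minimal usage sketch of the behavior this patch enables, mirroring the new tests above. The import path is assumed from the location of the modified source file; _validate_dataframe and _chunk_dataframe are private helpers rather than public client API.

import pandas as pd

# Assumed import path, based on the source file modified in this patch; these
# are private helpers, not part of the public phoenix-client API.
from phoenix.client.resources.annotations import _chunk_dataframe, _validate_dataframe

# DataFrames exported from Phoenix typically carry the span identifier in a
# "context.span_id" column; with this patch it is accepted without renaming.
df = pd.DataFrame(
    {
        "name": ["sentiment", "sentiment"],
        "annotator_kind": ["HUMAN", "HUMAN"],
        "context.span_id": ["id1", "id2"],
        "label": ["positive", "negative"],
    }
)

_validate_dataframe(dataframe=df)  # raised ValueError before this patch; now passes
chunks = list(_chunk_dataframe(dataframe=df))
assert chunks[0][0]["span_id"] == "id1"  # "context.span_id" is mapped to "span_id"
assert chunks[0][1]["span_id"] == "id2"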