diff --git a/.semversioner/next-release/major-20250909205146252760.json b/.semversioner/next-release/major-20250909205146252760.json new file mode 100644 index 0000000000..b4e99c5358 --- /dev/null +++ b/.semversioner/next-release/major-20250909205146252760.json @@ -0,0 +1,4 @@ +{ + "type": "major", + "description": "Remove text unit group-by ability." +} diff --git a/docs/config/yaml.md b/docs/config/yaml.md index fbc93bfed3..1a4ce326e8 100644 --- a/docs/config/yaml.md +++ b/docs/config/yaml.md @@ -99,7 +99,6 @@ These settings configure how we parse documents into text chunks. This is necess - `size` **int** - The max chunk size in tokens. - `overlap` **int** - The chunk overlap in tokens. -- `group_by_columns` **list[str]** - Group documents by these fields before chunking. - `strategy` **str**[tokens|sentences] - How to chunk the text. - `encoding_model` **str** - The text encoding model to use for splitting on token boundaries. - `prepend_metadata` **bool** - Determines if metadata values should be added at the beginning of each chunk. Default=`False`. diff --git a/docs/index/default_dataflow.md b/docs/index/default_dataflow.md index 1133963e89..19f59dfe8a 100644 --- a/docs/index/default_dataflow.md +++ b/docs/index/default_dataflow.md @@ -59,9 +59,7 @@ flowchart TB The first phase of the default-configuration workflow is to transform input documents into _TextUnits_. A _TextUnit_ is a chunk of text that is used for our graph extraction techniques. They are also used as source-references by extracted knowledge items in order to empower breadcrumbs and provenance by concepts back to their original source text. -The chunk size (counted in tokens), is user-configurable. By default this is set to 300 tokens, although we've had positive experience with 1200-token chunks using a single "glean" step. (A "glean" step is a follow-on extraction). 
Larger chunks result in lower-fidelity output and less meaningful reference texts; however, using larger chunks can result in much faster processing time. - -The group-by configuration is also user-configurable. By default, we align our chunks to document boundaries, meaning that there is a strict 1-to-many relationship between Documents and TextUnits. In rare cases, this can be turned into a many-to-many relationship. This is useful when the documents are very short and we need several of them to compose a meaningful analysis unit (e.g. Tweets or a chat log) +The chunk size (counted in tokens) is user-configurable. By default this is set to 1200 tokens. Larger chunks result in lower-fidelity output and less meaningful reference texts; however, using larger chunks can result in much faster processing time. ```mermaid --- diff --git a/docs/index/outputs.md b/docs/index/outputs.md index 0cc43b419b..89a48e8526 100644 --- a/docs/index/outputs.md +++ b/docs/index/outputs.md @@ -102,7 +102,7 @@ List of all text chunks parsed from the input documents. | ----------------- | ----- | ----------- | | text | str | Raw full text of the chunk. | | n_tokens | int | Number of tokens in the chunk. This should normally match the `chunk_size` config parameter, except for the last chunk which is often shorter. | -| document_ids | str[] | List of document IDs the chunk came from. This is normally only 1 due to our default groupby, but for very short text documents (e.g., microblogs) it can be configured so text units span multiple documents. | +| document_id | str | ID of the document the chunk came from. | | entity_ids | str[] | List of entities found in the text unit. | | relationships_ids | str[] | List of relationships found in the text unit. | | covariate_ids | str[] | Optional list of covariates found in the text unit. 
| \ No newline at end of file diff --git a/graphrag/config/defaults.py b/graphrag/config/defaults.py index cbb22c8574..fb9c5578a1 100644 --- a/graphrag/config/defaults.py +++ b/graphrag/config/defaults.py @@ -27,15 +27,15 @@ DEFAULT_OUTPUT_BASE_DIR = "output" DEFAULT_CHAT_MODEL_ID = "default_chat_model" DEFAULT_CHAT_MODEL_TYPE = ModelType.OpenAIChat -DEFAULT_CHAT_MODEL = "gpt-4-turbo-preview" +DEFAULT_CHAT_MODEL = "gpt-4o" DEFAULT_CHAT_MODEL_AUTH_TYPE = AuthType.APIKey DEFAULT_EMBEDDING_MODEL_ID = "default_embedding_model" DEFAULT_EMBEDDING_MODEL_TYPE = ModelType.OpenAIEmbedding -DEFAULT_EMBEDDING_MODEL = "text-embedding-3-small" +DEFAULT_EMBEDDING_MODEL = "text-embedding-ada-002" DEFAULT_EMBEDDING_MODEL_AUTH_TYPE = AuthType.APIKey DEFAULT_VECTOR_STORE_ID = "default_vector_store" -ENCODING_MODEL = "cl100k_base" +ENCODING_MODEL = "o200k_base" COGNITIVE_SERVICES_AUDIENCE = "https://cognitiveservices.azure.com/.default" @@ -68,9 +68,8 @@ class ChunksDefaults: size: int = 1200 overlap: int = 100 - group_by_columns: list[str] = field(default_factory=lambda: ["id"]) strategy: ClassVar[ChunkStrategyType] = ChunkStrategyType.tokens - encoding_model: str = "cl100k_base" + encoding_model: str = ENCODING_MODEL prepend_metadata: bool = False chunk_size_includes_metadata: bool = False diff --git a/graphrag/config/init_content.py b/graphrag/config/init_content.py index 98dbd27c59..137b3873b5 100644 --- a/graphrag/config/init_content.py +++ b/graphrag/config/init_content.py @@ -67,7 +67,6 @@ chunks: size: {graphrag_config_defaults.chunks.size} overlap: {graphrag_config_defaults.chunks.overlap} - group_by_columns: [{",".join(graphrag_config_defaults.chunks.group_by_columns)}] ### Output/storage settings ### ## If blob storage is specified in the following four sections, diff --git a/graphrag/config/models/chunking_config.py b/graphrag/config/models/chunking_config.py index 902bffcbb9..4d1214241c 100644 --- a/graphrag/config/models/chunking_config.py +++ 
b/graphrag/config/models/chunking_config.py @@ -20,10 +20,6 @@ class ChunkingConfig(BaseModel): description="The chunk overlap to use.", default=graphrag_config_defaults.chunks.overlap, ) - group_by_columns: list[str] = Field( - description="The chunk by columns to use.", - default=graphrag_config_defaults.chunks.group_by_columns, - ) strategy: ChunkStrategyType = Field( description="The chunking strategy to use.", default=graphrag_config_defaults.chunks.strategy, diff --git a/graphrag/data_model/schemas.py b/graphrag/data_model/schemas.py index 059605bfbf..93ede46191 100644 --- a/graphrag/data_model/schemas.py +++ b/graphrag/data_model/schemas.py @@ -52,7 +52,7 @@ RELATIONSHIP_IDS = "relationship_ids" TEXT_UNIT_IDS = "text_unit_ids" COVARIATE_IDS = "covariate_ids" -DOCUMENT_IDS = "document_ids" +DOCUMENT_ID = "document_id" PERIOD = "period" SIZE = "size" @@ -142,7 +142,7 @@ SHORT_ID, TEXT, N_TOKENS, - DOCUMENT_IDS, + DOCUMENT_ID, ENTITY_IDS, RELATIONSHIP_IDS, COVARIATE_IDS, diff --git a/graphrag/data_model/text_unit.py b/graphrag/data_model/text_unit.py index 07b1b9ae9c..55006ab15b 100644 --- a/graphrag/data_model/text_unit.py +++ b/graphrag/data_model/text_unit.py @@ -28,8 +28,8 @@ class TextUnit(Identified): n_tokens: int | None = None """The number of tokens in the text (optional).""" - document_ids: list[str] | None = None - """List of document IDs in which the text unit appears (optional).""" + document_id: str | None = None + """ID of the document in which the text unit appears (optional).""" attributes: dict[str, Any] | None = None """A dictionary of additional attributes associated with the text unit (optional).""" @@ -45,7 +45,7 @@ def from_dict( relationships_key: str = "relationship_ids", covariates_key: str = "covariate_ids", n_tokens_key: str = "n_tokens", - document_ids_key: str = "document_ids", + document_id_key: str = "document_id", attributes_key: str = "attributes", ) -> "TextUnit": """Create a new text unit from the dict data.""" @@ -57,6 +57,6 
@@ def from_dict( relationship_ids=d.get(relationships_key), covariate_ids=d.get(covariates_key), n_tokens=d.get(n_tokens_key), - document_ids=d.get(document_ids_key), + document_id=d.get(document_id_key), attributes=d.get(attributes_key), ) diff --git a/graphrag/index/workflows/create_base_text_units.py b/graphrag/index/workflows/create_base_text_units.py index d94ee5951f..feeba7a065 100644 --- a/graphrag/index/workflows/create_base_text_units.py +++ b/graphrag/index/workflows/create_base_text_units.py @@ -35,7 +35,6 @@ async def run_workflow( output = create_base_text_units( documents, context.callbacks, - chunks.group_by_columns, chunks.size, chunks.overlap, chunks.encoding_model, @@ -53,7 +52,6 @@ async def run_workflow( def create_base_text_units( documents: pd.DataFrame, callbacks: WorkflowCallbacks, - group_by_columns: list[str], size: int, overlap: int, encoding_model: str, @@ -62,26 +60,9 @@ def create_base_text_units( chunk_size_includes_metadata: bool = False, ) -> pd.DataFrame: """All the steps to transform base text_units.""" - sort = documents.sort_values(by=["id"], ascending=[True]) + documents.sort_values(by=["id"], ascending=[True], inplace=True) - sort["text_with_ids"] = list( - zip(*[sort[col] for col in ["id", "text"]], strict=True) - ) - - agg_dict = {"text_with_ids": list} - if "metadata" in documents: - agg_dict["metadata"] = "first" # type: ignore - - aggregated = ( - ( - sort.groupby(group_by_columns, sort=False) - if len(group_by_columns) > 0 - else sort.groupby(lambda _x: True) - ) - .agg(agg_dict) - .reset_index() - ) - aggregated.rename(columns={"text_with_ids": "texts"}, inplace=True) + encode, _ = get_encoding_fn(encoding_model) def chunker(row: pd.Series) -> Any: line_delimiter = ".\n" @@ -99,7 +80,6 @@ def chunker(row: pd.Series) -> Any: ) if chunk_size_includes_metadata: - encode, _ = get_encoding_fn(encoding_model) metadata_tokens = len(encode(metadata_str)) if metadata_tokens >= size: message = "Metadata tokens exceeds the 
maximum tokens per chunk. Please increase the tokens per chunk." @@ -107,7 +87,7 @@ def chunker(row: pd.Series) -> Any: chunked = chunk_text( pd.DataFrame([row]).reset_index(drop=True), - column="texts", + column="text", size=size - metadata_tokens, overlap=overlap, encoding_model=encoding_model, @@ -128,7 +108,7 @@ def chunker(row: pd.Series) -> Any: return row # Track progress of row-wise apply operation - total_rows = len(aggregated) + total_rows = len(documents) logger.info("Starting chunking process for %d documents", total_rows) def chunker_with_logging(row: pd.Series, row_index: int) -> Any: @@ -137,27 +117,26 @@ def chunker_with_logging(row: pd.Series, row_index: int) -> Any: logger.info("chunker progress: %d/%d", row_index + 1, total_rows) return result - aggregated = aggregated.apply( + text_units = documents.apply( lambda row: chunker_with_logging(row, row.name), axis=1 ) - aggregated = cast("pd.DataFrame", aggregated[[*group_by_columns, "chunks"]]) - aggregated = aggregated.explode("chunks") - aggregated.rename( + text_units = cast("pd.DataFrame", text_units[["id", "chunks"]]) + text_units = text_units.explode("chunks") + text_units.rename( columns={ - "chunks": "chunk", + "id": "document_id", + "chunks": "text", }, inplace=True, ) - aggregated["id"] = aggregated.apply( - lambda row: gen_sha512_hash(row, ["chunk"]), axis=1 - ) - aggregated[["document_ids", "chunk", "n_tokens"]] = pd.DataFrame( - aggregated["chunk"].tolist(), index=aggregated.index + + text_units["id"] = text_units.apply( + lambda row: gen_sha512_hash(row, ["text"]), axis=1 ) - # rename for downstream consumption - aggregated.rename(columns={"chunk": "text"}, inplace=True) + # get a final token measurement + text_units["n_tokens"] = text_units["text"].apply(lambda x: len(encode(x))) return cast( - "pd.DataFrame", aggregated[aggregated["text"].notna()].reset_index(drop=True) + "pd.DataFrame", text_units[text_units["text"].notna()].reset_index(drop=True) ) diff --git 
a/graphrag/index/workflows/create_final_documents.py b/graphrag/index/workflows/create_final_documents.py index af81e8dfa8..f12560bab6 100644 --- a/graphrag/index/workflows/create_final_documents.py +++ b/graphrag/index/workflows/create_final_documents.py @@ -37,19 +37,15 @@ def create_final_documents( documents: pd.DataFrame, text_units: pd.DataFrame ) -> pd.DataFrame: """All the steps to transform final documents.""" - exploded = ( - text_units.explode("document_ids") - .loc[:, ["id", "document_ids", "text"]] - .rename( - columns={ - "document_ids": "chunk_doc_id", - "id": "chunk_id", - "text": "chunk_text", - } - ) + renamed = text_units.loc[:, ["id", "document_id", "text"]].rename( + columns={ + "document_id": "chunk_doc_id", + "id": "chunk_id", + "text": "chunk_text", + } ) - joined = exploded.merge( + joined = renamed.merge( documents, left_on="chunk_doc_id", right_on="id", diff --git a/graphrag/index/workflows/create_final_text_units.py b/graphrag/index/workflows/create_final_text_units.py index 373b9b4cb4..fbcf0af890 100644 --- a/graphrag/index/workflows/create_final_text_units.py +++ b/graphrag/index/workflows/create_final_text_units.py @@ -59,7 +59,7 @@ def create_final_text_units( final_covariates: pd.DataFrame | None, ) -> pd.DataFrame: """All the steps to transform the text units.""" - selected = text_units.loc[:, ["id", "text", "document_ids", "n_tokens"]] + selected = text_units.loc[:, ["id", "text", "document_id", "n_tokens"]] selected["human_readable_id"] = selected.index entity_join = _entities(final_entities) diff --git a/graphrag/prompt_tune/loader/input.py b/graphrag/prompt_tune/loader/input.py index 29010b05a8..36c8fca003 100644 --- a/graphrag/prompt_tune/loader/input.py +++ b/graphrag/prompt_tune/loader/input.py @@ -58,7 +58,6 @@ async def load_docs_in_chunks( chunks_df = create_base_text_units( documents=dataset, callbacks=NoopWorkflowCallbacks(), - group_by_columns=chunk_config.group_by_columns, size=chunk_size, overlap=overlap, 
encoding_model=chunk_config.encoding_model, diff --git a/graphrag/query/input/loaders/dfs.py b/graphrag/query/input/loaders/dfs.py index 7182090cd2..17aeb604a2 100644 --- a/graphrag/query/input/loaders/dfs.py +++ b/graphrag/query/input/loaders/dfs.py @@ -234,7 +234,7 @@ def read_text_units( relationships_col: str | None = "relationship_ids", covariates_col: str | None = "covariate_ids", tokens_col: str | None = "n_tokens", - document_ids_col: str | None = "document_ids", + document_id_col: str | None = "document_id", attributes_cols: list[str] | None = None, ) -> list[TextUnit]: """Read text units from a dataframe using pre-converted records.""" @@ -250,7 +250,7 @@ def read_text_units( row, covariates_col, key_type=str, value_type=str ), n_tokens=to_optional_int(row, tokens_col), - document_ids=to_optional_list(row, document_ids_col, item_type=str), + document_id=to_optional_str(row, document_id_col), attributes=( {col: row.get(col) for col in attributes_cols} if attributes_cols diff --git a/tests/unit/config/fixtures/minimal_config/settings.yaml b/tests/unit/config/fixtures/minimal_config/settings.yaml index 2fec50a3b9..2848968c06 100644 --- a/tests/unit/config/fixtures/minimal_config/settings.yaml +++ b/tests/unit/config/fixtures/minimal_config/settings.yaml @@ -2,8 +2,8 @@ models: default_chat_model: api_key: ${CUSTOM_API_KEY} type: openai_chat - model: gpt-4-turbo-preview + model: gpt-4o default_embedding_model: api_key: ${CUSTOM_API_KEY} type: openai_embedding - model: text-embedding-3-small \ No newline at end of file + model: text-embedding-ada-002 \ No newline at end of file diff --git a/tests/unit/config/fixtures/minimal_config_missing_env_var/settings.yaml b/tests/unit/config/fixtures/minimal_config_missing_env_var/settings.yaml index 9e1c45715a..812917da08 100644 --- a/tests/unit/config/fixtures/minimal_config_missing_env_var/settings.yaml +++ b/tests/unit/config/fixtures/minimal_config_missing_env_var/settings.yaml @@ -2,8 +2,8 @@ models: 
default_chat_model: api_key: ${SOME_NON_EXISTENT_ENV_VAR} type: openai_chat - model: gpt-4-turbo-preview + model: gpt-4o default_embedding_model: api_key: ${SOME_NON_EXISTENT_ENV_VAR} type: openai_embedding - model: text-embedding-3-small \ No newline at end of file + model: text-embedding-ada-002 \ No newline at end of file diff --git a/tests/unit/config/utils.py b/tests/unit/config/utils.py index 8ca5e45a8c..dc903db9f4 100644 --- a/tests/unit/config/utils.py +++ b/tests/unit/config/utils.py @@ -192,7 +192,6 @@ def assert_text_embedding_configs( def assert_chunking_configs(actual: ChunkingConfig, expected: ChunkingConfig) -> None: assert actual.size == expected.size assert actual.overlap == expected.overlap - assert actual.group_by_columns == expected.group_by_columns assert actual.strategy == expected.strategy assert actual.encoding_model == expected.encoding_model assert actual.prepend_metadata == expected.prepend_metadata diff --git a/tests/unit/indexing/graph/extractors/community_reports/test_sort_context.py b/tests/unit/indexing/graph/extractors/community_reports/test_sort_context.py index c5911344b6..68ad54ea32 100644 --- a/tests/unit/indexing/graph/extractors/community_reports/test_sort_context.py +++ b/tests/unit/indexing/graph/extractors/community_reports/test_sort_context.py @@ -207,8 +207,8 @@ def test_sort_context(): ctx = sort_context(context) assert ctx is not None, "Context is none" num = num_tokens(ctx) - assert num == 828 if platform.system() == "Windows" else 826, ( - f"num_tokens is not matched for platform (win = 827, else 826): {num}" + assert num == 825 if platform.system() == "Windows" else 826, ( + f"num_tokens is not matched for platform (win = 825, else 826): {num}" ) diff --git a/tests/unit/indexing/text_splitting/test_text_splitting.py b/tests/unit/indexing/text_splitting/test_text_splitting.py index 10a5a06344..e30bb5e721 100644 --- a/tests/unit/indexing/text_splitting/test_text_splitting.py +++ 
b/tests/unit/indexing/text_splitting/test_text_splitting.py @@ -83,8 +83,7 @@ def test_token_text_splitter(mock_tokenizer, mock_split_text): def test_encode_basic(): splitter = TokenTextSplitter() result = splitter.encode("abc def") - - assert result == [13997, 711], "Encoding failed to return expected tokens" + assert result == [26682, 1056], "Encoding failed to return expected tokens" def test_num_tokens_empty_input(): diff --git a/tests/verbs/data/communities.parquet b/tests/verbs/data/communities.parquet index d8a5c82dec..d85403d6b6 100644 Binary files a/tests/verbs/data/communities.parquet and b/tests/verbs/data/communities.parquet differ diff --git a/tests/verbs/data/community_reports.parquet b/tests/verbs/data/community_reports.parquet index 7600bb66c9..26b9daba09 100644 Binary files a/tests/verbs/data/community_reports.parquet and b/tests/verbs/data/community_reports.parquet differ diff --git a/tests/verbs/data/covariates.parquet b/tests/verbs/data/covariates.parquet index 92c2a1a753..844c414f73 100644 Binary files a/tests/verbs/data/covariates.parquet and b/tests/verbs/data/covariates.parquet differ diff --git a/tests/verbs/data/documents.parquet b/tests/verbs/data/documents.parquet index 654c5c7f5e..a846fecfce 100644 Binary files a/tests/verbs/data/documents.parquet and b/tests/verbs/data/documents.parquet differ diff --git a/tests/verbs/data/entities.parquet b/tests/verbs/data/entities.parquet index 8de7e6da15..0807395a7f 100644 Binary files a/tests/verbs/data/entities.parquet and b/tests/verbs/data/entities.parquet differ diff --git a/tests/verbs/data/relationships.parquet b/tests/verbs/data/relationships.parquet index f926d44091..e88d218270 100644 Binary files a/tests/verbs/data/relationships.parquet and b/tests/verbs/data/relationships.parquet differ diff --git a/tests/verbs/data/text_units.parquet b/tests/verbs/data/text_units.parquet index b47de34e9b..cdaa9b2b07 100644 Binary files a/tests/verbs/data/text_units.parquet and 
b/tests/verbs/data/text_units.parquet differ diff --git a/tests/verbs/data/text_units_metadata.parquet b/tests/verbs/data/text_units_metadata.parquet index 71ec620e2a..9b6b44342a 100644 Binary files a/tests/verbs/data/text_units_metadata.parquet and b/tests/verbs/data/text_units_metadata.parquet differ diff --git a/tests/verbs/data/text_units_metadata_included_chunk.parquet b/tests/verbs/data/text_units_metadata_included_chunk.parquet index ccf4b3903e..ca2495975e 100644 Binary files a/tests/verbs/data/text_units_metadata_included_chunk.parquet and b/tests/verbs/data/text_units_metadata_included_chunk.parquet differ diff --git a/tests/verbs/test_create_base_text_units.py b/tests/verbs/test_create_base_text_units.py index ea34ae8b9c..87148a981d 100644 --- a/tests/verbs/test_create_base_text_units.py +++ b/tests/verbs/test_create_base_text_units.py @@ -25,7 +25,7 @@ async def test_create_base_text_units(): actual = await load_table_from_storage("text_units", context.output_storage) - compare_outputs(actual, expected, columns=["text", "document_ids", "n_tokens"]) + compare_outputs(actual, expected, columns=["text", "document_id", "n_tokens"]) async def test_create_base_text_units_metadata(): @@ -34,8 +34,6 @@ async def test_create_base_text_units_metadata(): context = await create_test_context() config = create_graphrag_config({"models": DEFAULT_MODEL_CONFIG}) - # test data was created with 4o, so we need to match the encoding for chunks to be identical - config.chunks.encoding_model = "o200k_base" config.input.metadata = ["title"] config.chunks.prepend_metadata = True @@ -44,7 +42,7 @@ async def test_create_base_text_units_metadata(): await run_workflow(config, context) actual = await load_table_from_storage("text_units", context.output_storage) - compare_outputs(actual, expected) + compare_outputs(actual, expected, ["text", "document_id", "n_tokens"]) async def test_create_base_text_units_metadata_included_in_chunk(): @@ -53,8 +51,6 @@ async def 
test_create_base_text_units_metadata_included_in_chunk(): context = await create_test_context() config = create_graphrag_config({"models": DEFAULT_MODEL_CONFIG}) - # test data was created with 4o, so we need to match the encoding for chunks to be identical - config.chunks.encoding_model = "o200k_base" config.input.metadata = ["title"] config.chunks.prepend_metadata = True config.chunks.chunk_size_includes_metadata = True @@ -65,4 +61,4 @@ async def test_create_base_text_units_metadata_included_in_chunk(): actual = await load_table_from_storage("text_units", context.output_storage) # only check the columns from the base workflow - our expected table is the final and will have more - compare_outputs(actual, expected, columns=["text", "document_ids", "n_tokens"]) + compare_outputs(actual, expected, columns=["text", "document_id", "n_tokens"]) diff --git a/tests/verbs/test_extract_graph_nlp.py b/tests/verbs/test_extract_graph_nlp.py index 8c9367ad36..92da89288c 100644 --- a/tests/verbs/test_extract_graph_nlp.py +++ b/tests/verbs/test_extract_graph_nlp.py @@ -29,7 +29,7 @@ async def test_extract_graph_nlp(): # this will be the raw count of entities and edges with no pruning # with NLP it is deterministic, so we can assert exact row counts - assert len(nodes_actual) == 1148 + assert len(nodes_actual) == 1147 assert len(nodes_actual.columns) == 5 - assert len(edges_actual) == 29445 + assert len(edges_actual) == 29442 assert len(edges_actual.columns) == 5 diff --git a/tests/verbs/test_prune_graph.py b/tests/verbs/test_prune_graph.py index 34609d9cb5..6ed0001973 100644 --- a/tests/verbs/test_prune_graph.py +++ b/tests/verbs/test_prune_graph.py @@ -28,4 +28,4 @@ async def test_prune_graph(): nodes_actual = await load_table_from_storage("entities", context.output_storage) - assert len(nodes_actual) == 20 + assert len(nodes_actual) == 21 diff --git a/tests/verbs/util.py b/tests/verbs/util.py index 8d342b4756..f28adae4f4 100644 --- a/tests/verbs/util.py +++ 
b/tests/verbs/util.py @@ -66,7 +66,10 @@ def compare_outputs( ) for column in cols: - assert column in actual.columns + try: + assert column in actual.columns + except AssertionError: + print(f"Column '{column}' not found in actual output.") try: # dtypes can differ since the test data is read from parquet and our workflow runs in memory if column != "id": # don't check uuids @@ -77,6 +80,7 @@ def compare_outputs( check_index=False, ) except AssertionError: + print(f"Column '{column}' does not match.") print("Expected:") print(expected[column]) print("Actual:")