Merged
Changes from 5 commits
4 changes: 4 additions & 0 deletions .semversioner/next-release/major-20250909205146252760.json
@@ -0,0 +1,4 @@
{
"type": "major",
"description": "Remove text unit group-by ability."
}
1 change: 0 additions & 1 deletion docs/config/yaml.md
@@ -100,7 +100,6 @@ These settings configure how we parse documents into text chunks. This is necess

- `size` **int** - The max chunk size in tokens.
- `overlap` **int** - The chunk overlap in tokens.
- `group_by_columns` **list[str]** - Group documents by these fields before chunking.
- `strategy` **str**[tokens|sentences] - How to chunk the text.
- `encoding_model` **str** - The text encoding model to use for splitting on token boundaries.
- `prepend_metadata` **bool** - Determines if metadata values should be added at the beginning of each chunk. Default=`False`.
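With `group_by_columns` removed from the settings list above, the chunking surface is just size, overlap, strategy, encoding, and the metadata flags. A minimal sketch, assuming pydantic v2 and the `ChunkingConfig` model shown later in this diff (normally these values come from `settings.yaml` rather than Python):

```python
# Hedged sketch of the post-PR chunk settings; group_by_columns is gone, and the
# field names mirror graphrag/config/models/chunking_config.py in this diff.
from graphrag.config.models.chunking_config import ChunkingConfig

chunks = ChunkingConfig(
    size=1200,                    # max chunk size in tokens
    overlap=100,                  # chunk overlap in tokens
    encoding_model="o200k_base",  # token encoding used for splitting
    prepend_metadata=False,       # metadata values are not prepended to chunks
)
print(chunks.model_dump())        # strategy stays at its default (tokens)
```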
4 changes: 1 addition & 3 deletions docs/index/default_dataflow.md
@@ -60,9 +60,7 @@ flowchart TB

The first phase of the default-configuration workflow is to transform input documents into _TextUnits_. A _TextUnit_ is a chunk of text used for our graph extraction techniques. TextUnits are also used as source references by extracted knowledge items, providing breadcrumbs and provenance from concepts back to their original source text.

The chunk size (counted in tokens), is user-configurable. By default this is set to 300 tokens, although we've had positive experience with 1200-token chunks using a single "glean" step. (A "glean" step is a follow-on extraction). Larger chunks result in lower-fidelity output and less meaningful reference texts; however, using larger chunks can result in much faster processing time.

The group-by configuration is also user-configurable. By default, we align our chunks to document boundaries, meaning that there is a strict 1-to-many relationship between Documents and TextUnits. In rare cases, this can be turned into a many-to-many relationship. This is useful when the documents are very short and we need several of them to compose a meaningful analysis unit (e.g. Tweets or a chat log)
The chunk size (counted in tokens) is user-configurable. By default this is set to 1200 tokens. Larger chunks result in lower-fidelity output and less meaningful reference texts; however, using larger chunks can result in much faster processing time.
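To make the "counted in tokens" point concrete, here is a hedged illustration. It assumes `tiktoken` (which the token-based chunking strategy relies on) and a hypothetical local input file, and uses the 1200/100 size/overlap defaults from this PR; it is not the actual pipeline code.

```python
# Rough token math for the default 1200-token chunks with 100-token overlap.
import math
import tiktoken

enc = tiktoken.get_encoding("o200k_base")  # default encoding after this PR
text = open("my_document.txt", encoding="utf-8").read()  # hypothetical file
tokens = enc.encode(text)

size, overlap = 1200, 100
n_chunks = max(1, math.ceil(max(len(tokens) - overlap, 1) / (size - overlap)))
print(f"{len(tokens)} tokens -> roughly {n_chunks} chunks of up to {size} tokens")
```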

```mermaid
---
2 changes: 1 addition & 1 deletion docs/index/outputs.md
@@ -104,7 +104,7 @@ List of all text chunks parsed from the input documents.
| ----------------- | ----- | ----------- |
| text | str | Raw full text of the chunk. |
| n_tokens | int | Number of tokens in the chunk. This should normally match the `chunk_size` config parameter, except for the last chunk which is often shorter. |
| document_ids | str[] | List of document IDs the chunk came from. This is normally only 1 due to our default groupby, but for very short text documents (e.g., microblogs) it can be configured so text units span multiple documents. |
| document_id | str | ID of the document the chunk came from. |
| entity_ids | str[] | List of entities found in the text unit. |
| relationships_ids | str[] | List of relationships found in the text unit. |
| covariate_ids | str[] | Optional list of covariates found in the text unit. |
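A quick way to sanity-check the new schema is to load the emitted table. This sketch assumes the default parquet output layout (`output/text_units.parquet`); adjust the path to your configured output directory.

```python
# Hedged schema check for the text_units output after this PR:
# document_id is a single string per chunk instead of a document_ids list.
import pandas as pd

text_units = pd.read_parquet("output/text_units.parquet")  # assumed default path
print(text_units[["id", "text", "n_tokens", "document_id"]].head())

assert text_units["document_id"].map(lambda v: isinstance(v, str)).all()
```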
9 changes: 4 additions & 5 deletions graphrag/config/defaults.py
@@ -27,15 +27,15 @@
DEFAULT_OUTPUT_BASE_DIR = "output"
DEFAULT_CHAT_MODEL_ID = "default_chat_model"
DEFAULT_CHAT_MODEL_TYPE = ModelType.OpenAIChat
DEFAULT_CHAT_MODEL = "gpt-4-turbo-preview"
DEFAULT_CHAT_MODEL = "gpt-4o"
DEFAULT_CHAT_MODEL_AUTH_TYPE = AuthType.APIKey
DEFAULT_EMBEDDING_MODEL_ID = "default_embedding_model"
DEFAULT_EMBEDDING_MODEL_TYPE = ModelType.OpenAIEmbedding
DEFAULT_EMBEDDING_MODEL = "text-embedding-3-small"
DEFAULT_EMBEDDING_MODEL = "text-embedding-ada-002"
DEFAULT_EMBEDDING_MODEL_AUTH_TYPE = AuthType.APIKey
DEFAULT_VECTOR_STORE_ID = "default_vector_store"

ENCODING_MODEL = "cl100k_base"
ENCODING_MODEL = "o200k_base"
COGNITIVE_SERVICES_AUDIENCE = "https://cognitiveservices.azure.com/.default"


@@ -68,9 +68,8 @@ class ChunksDefaults:

size: int = 1200
overlap: int = 100
group_by_columns: list[str] = field(default_factory=lambda: ["id"])
strategy: ClassVar[ChunkStrategyType] = ChunkStrategyType.tokens
encoding_model: str = "cl100k_base"
encoding_model: str = ENCODING_MODEL
prepend_metadata: bool = False
chunk_size_includes_metadata: bool = False

1 change: 0 additions & 1 deletion graphrag/config/init_content.py
@@ -67,7 +67,6 @@
chunks:
size: {graphrag_config_defaults.chunks.size}
overlap: {graphrag_config_defaults.chunks.overlap}
group_by_columns: [{",".join(graphrag_config_defaults.chunks.group_by_columns)}]

### Output/storage settings ###
## If blob storage is specified in the following four sections,
4 changes: 0 additions & 4 deletions graphrag/config/models/chunking_config.py
@@ -20,10 +20,6 @@ class ChunkingConfig(BaseModel):
description="The chunk overlap to use.",
default=graphrag_config_defaults.chunks.overlap,
)
group_by_columns: list[str] = Field(
description="The chunk by columns to use.",
default=graphrag_config_defaults.chunks.group_by_columns,
)
strategy: ChunkStrategyType = Field(
description="The chunking strategy to use.",
default=graphrag_config_defaults.chunks.strategy,
4 changes: 2 additions & 2 deletions graphrag/data_model/schemas.py
@@ -54,7 +54,7 @@
RELATIONSHIP_IDS = "relationship_ids"
TEXT_UNIT_IDS = "text_unit_ids"
COVARIATE_IDS = "covariate_ids"
DOCUMENT_IDS = "document_ids"
DOCUMENT_ID = "document_id"

PERIOD = "period"
SIZE = "size"
@@ -146,7 +146,7 @@
SHORT_ID,
TEXT,
N_TOKENS,
DOCUMENT_IDS,
DOCUMENT_ID,
ENTITY_IDS,
RELATIONSHIP_IDS,
COVARIATE_IDS,
8 changes: 4 additions & 4 deletions graphrag/data_model/text_unit.py
@@ -28,8 +28,8 @@ class TextUnit(Identified):
n_tokens: int | None = None
"""The number of tokens in the text (optional)."""

document_ids: list[str] | None = None
"""List of document IDs in which the text unit appears (optional)."""
document_id: str | None = None
"""ID of the document in which the text unit appears (optional)."""

attributes: dict[str, Any] | None = None
"""A dictionary of additional attributes associated with the text unit (optional)."""
@@ -45,7 +45,7 @@ def from_dict(
relationships_key: str = "relationship_ids",
covariates_key: str = "covariate_ids",
n_tokens_key: str = "n_tokens",
document_ids_key: str = "document_ids",
document_id_key: str = "document_id",
attributes_key: str = "attributes",
) -> "TextUnit":
"""Create a new text unit from the dict data."""
@@ -57,6 +57,6 @@ def from_dict(
relationship_ids=d.get(relationships_key),
covariate_ids=d.get(covariates_key),
n_tokens=d.get(n_tokens_key),
document_ids=d.get(document_ids_key),
document_id=d.get(document_id_key),
attributes=d.get(attributes_key),
)
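With the model change above, a text unit carries a single `document_id`. A hedged construction sketch follows; only `document_id` and `n_tokens` are taken from this diff, while `id`, `short_id`, and `text` are assumed from the rest of the data model.

```python
# Hedged sketch of the updated TextUnit model; field names other than
# document_id / n_tokens (shown in this diff) are assumptions.
from graphrag.data_model.text_unit import TextUnit

unit = TextUnit(
    id="f3a1...",        # normally a sha512 hash of the chunk text
    short_id="1",
    text="GraphRAG splits each document into TextUnits before extraction.",
    n_tokens=11,
    document_id="doc-1",  # was document_ids: list[str] before this PR
)
print(unit.document_id)
```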
53 changes: 16 additions & 37 deletions graphrag/index/workflows/create_base_text_units.py
@@ -35,7 +35,6 @@ async def run_workflow(
output = create_base_text_units(
documents,
context.callbacks,
chunks.group_by_columns,
chunks.size,
chunks.overlap,
chunks.encoding_model,
@@ -53,7 +52,6 @@
def create_base_text_units(
documents: pd.DataFrame,
callbacks: WorkflowCallbacks,
group_by_columns: list[str],
size: int,
overlap: int,
encoding_model: str,
@@ -62,26 +60,9 @@ def create_base_text_units(
chunk_size_includes_metadata: bool = False,
) -> pd.DataFrame:
"""All the steps to transform base text_units."""
sort = documents.sort_values(by=["id"], ascending=[True])
documents = documents.sort_values(by=["id"], ascending=[True]).reset_index(drop=True)

sort["text_with_ids"] = list(
zip(*[sort[col] for col in ["id", "text"]], strict=True)
)

agg_dict = {"text_with_ids": list}
if "metadata" in documents:
agg_dict["metadata"] = "first" # type: ignore

aggregated = (
(
sort.groupby(group_by_columns, sort=False)
if len(group_by_columns) > 0
else sort.groupby(lambda _x: True)
)
.agg(agg_dict)
.reset_index()
)
aggregated.rename(columns={"text_with_ids": "texts"}, inplace=True)
encode, _ = get_encoding_fn(encoding_model)

def chunker(row: pd.Series) -> Any:
line_delimiter = ".\n"
@@ -99,15 +80,14 @@ def chunker(row: pd.Series) -> Any:
)

if chunk_size_includes_metadata:
encode, _ = get_encoding_fn(encoding_model)
metadata_tokens = len(encode(metadata_str))
if metadata_tokens >= size:
message = "Metadata tokens exceeds the maximum tokens per chunk. Please increase the tokens per chunk."
raise ValueError(message)

chunked = chunk_text(
pd.DataFrame([row]).reset_index(drop=True),
column="texts",
column="text",
size=size - metadata_tokens,
overlap=overlap,
encoding_model=encoding_model,
Expand All @@ -128,7 +108,7 @@ def chunker(row: pd.Series) -> Any:
return row

# Track progress of row-wise apply operation
total_rows = len(aggregated)
total_rows = len(documents)
logger.info("Starting chunking process for %d documents", total_rows)

def chunker_with_logging(row: pd.Series, row_index: int) -> Any:
@@ -137,27 +117,26 @@ def chunker_with_logging(row: pd.Series, row_index: int) -> Any:
logger.info("chunker progress: %d/%d", row_index + 1, total_rows)
return result

aggregated = aggregated.apply(
text_units = documents.apply(
lambda row: chunker_with_logging(row, row.name), axis=1
)

aggregated = cast("pd.DataFrame", aggregated[[*group_by_columns, "chunks"]])
aggregated = aggregated.explode("chunks")
aggregated.rename(
text_units = cast("pd.DataFrame", text_units[["id", "chunks"]])
text_units = text_units.explode("chunks")
text_units.rename(
columns={
"chunks": "chunk",
"id": "document_id",
"chunks": "text",
},
inplace=True,
)
aggregated["id"] = aggregated.apply(
lambda row: gen_sha512_hash(row, ["chunk"]), axis=1
)
aggregated[["document_ids", "chunk", "n_tokens"]] = pd.DataFrame(
aggregated["chunk"].tolist(), index=aggregated.index

text_units["id"] = text_units.apply(
lambda row: gen_sha512_hash(row, ["text"]), axis=1
)
# rename for downstream consumption
aggregated.rename(columns={"chunk": "text"}, inplace=True)
# get a final token measurement
text_units["n_tokens"] = text_units["text"].apply(lambda x: len(encode(x)))

return cast(
"pd.DataFrame", aggregated[aggregated["text"].notna()].reset_index(drop=True)
"pd.DataFrame", text_units[text_units["text"].notna()].reset_index(drop=True)
)
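The reworked workflow above now chunks each document row directly, explodes the chunks, hashes each chunk into an `id`, and re-counts tokens. The standalone pandas sketch below mirrors that shape; the whitespace splitter and toy data are stand-ins for graphrag's token-based `chunk_text`, not the real implementation.

```python
# Standalone illustration of the new per-document flow (no group-by step).
import hashlib
import pandas as pd

def naive_chunks(text: str, size: int = 50, overlap: int = 10) -> list[str]:
    # Crude word-window splitter standing in for the token-based chunker.
    words = text.split()
    step = size - overlap
    return [" ".join(words[i : i + size]) for i in range(0, max(len(words), 1), step)]

documents = pd.DataFrame({
    "id": ["doc-1", "doc-2"],
    "text": ["alpha " * 120, "beta " * 30],
})

text_units = documents.assign(chunks=documents["text"].apply(naive_chunks))[["id", "chunks"]]
text_units = text_units.explode("chunks").rename(columns={"id": "document_id", "chunks": "text"})
text_units["id"] = text_units["text"].map(lambda t: hashlib.sha512(t.encode()).hexdigest())
text_units["n_tokens"] = text_units["text"].map(lambda t: len(t.split()))

print(text_units[["id", "document_id", "text", "n_tokens"]].reset_index(drop=True))
```

Each document yields one or more text units, and each text unit points back to exactly one `document_id`, matching the updated outputs schema.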
18 changes: 7 additions & 11 deletions graphrag/index/workflows/create_final_documents.py
@@ -37,19 +37,15 @@ def create_final_documents(
documents: pd.DataFrame, text_units: pd.DataFrame
) -> pd.DataFrame:
"""All the steps to transform final documents."""
exploded = (
text_units.explode("document_ids")
.loc[:, ["id", "document_ids", "text"]]
.rename(
columns={
"document_ids": "chunk_doc_id",
"id": "chunk_id",
"text": "chunk_text",
}
)
renamed = text_units.loc[:, ["id", "document_id", "text"]].rename(
columns={
"document_id": "chunk_doc_id",
"id": "chunk_id",
"text": "chunk_text",
}
)

joined = exploded.merge(
joined = renamed.merge(
documents,
left_on="chunk_doc_id",
right_on="id",
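Because each text unit now names exactly one document, the join above no longer needs an `explode`. A small hedged sketch of the same rename-and-merge with toy frames; the aggregation at the end is illustrative only, since the remainder of the real workflow is outside this hunk.

```python
# Toy version of the simplified join: rename chunk columns, merge onto documents.
import pandas as pd

documents = pd.DataFrame({"id": ["doc-1", "doc-2"], "title": ["A", "B"]})
text_units = pd.DataFrame({
    "id": ["tu-1", "tu-2", "tu-3"],
    "document_id": ["doc-1", "doc-1", "doc-2"],
    "text": ["chunk one", "chunk two", "chunk three"],
})

renamed = text_units.rename(columns={
    "id": "chunk_id",
    "document_id": "chunk_doc_id",
    "text": "chunk_text",
})
joined = renamed.merge(documents, left_on="chunk_doc_id", right_on="id", how="left")

# Illustrative only: collect chunk ids per document.
print(joined.groupby("id")["chunk_id"].apply(list))
```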
2 changes: 1 addition & 1 deletion graphrag/index/workflows/create_final_text_units.py
@@ -59,7 +59,7 @@ def create_final_text_units(
final_covariates: pd.DataFrame | None,
) -> pd.DataFrame:
"""All the steps to transform the text units."""
selected = text_units.loc[:, ["id", "text", "document_ids", "n_tokens"]]
selected = text_units.loc[:, ["id", "text", "document_id", "n_tokens"]]
selected["human_readable_id"] = selected.index

entity_join = _entities(final_entities)
1 change: 0 additions & 1 deletion graphrag/prompt_tune/loader/input.py
@@ -58,7 +58,6 @@ async def load_docs_in_chunks(
chunks_df = create_base_text_units(
documents=dataset,
callbacks=NoopWorkflowCallbacks(),
group_by_columns=chunk_config.group_by_columns,
size=chunk_size,
overlap=overlap,
encoding_model=chunk_config.encoding_model,
4 changes: 2 additions & 2 deletions graphrag/query/input/loaders/dfs.py
@@ -234,7 +234,7 @@ def read_text_units(
relationships_col: str | None = "relationship_ids",
covariates_col: str | None = "covariate_ids",
tokens_col: str | None = "n_tokens",
document_ids_col: str | None = "document_ids",
document_id_col: str | None = "document_id",
attributes_cols: list[str] | None = None,
) -> list[TextUnit]:
"""Read text units from a dataframe using pre-converted records."""
@@ -250,7 +250,7 @@
row, covariates_col, key_type=str, value_type=str
),
n_tokens=to_optional_int(row, tokens_col),
document_ids=to_optional_list(row, document_ids_col, item_type=str),
document_id=to_optional_str(row, document_id_col),
attributes=(
{col: row.get(col) for col in attributes_cols}
if attributes_cols
4 changes: 2 additions & 2 deletions tests/unit/config/fixtures/minimal_config/settings.yaml
@@ -2,8 +2,8 @@ models:
default_chat_model:
api_key: ${CUSTOM_API_KEY}
type: openai_chat
model: gpt-4-turbo-preview
model: gpt-4o
default_embedding_model:
api_key: ${CUSTOM_API_KEY}
type: openai_embedding
model: text-embedding-3-small
model: text-embedding-ada-002
@@ -2,8 +2,8 @@ models:
default_chat_model:
api_key: ${SOME_NON_EXISTENT_ENV_VAR}
type: openai_chat
model: gpt-4-turbo-preview
model: gpt-4o
default_embedding_model:
api_key: ${SOME_NON_EXISTENT_ENV_VAR}
type: openai_embedding
model: text-embedding-3-small
model: text-embedding-ada-002
1 change: 0 additions & 1 deletion tests/unit/config/utils.py
@@ -208,7 +208,6 @@ def assert_text_embedding_configs(
def assert_chunking_configs(actual: ChunkingConfig, expected: ChunkingConfig) -> None:
assert actual.size == expected.size
assert actual.overlap == expected.overlap
assert actual.group_by_columns == expected.group_by_columns
assert actual.strategy == expected.strategy
assert actual.encoding_model == expected.encoding_model
assert actual.prepend_metadata == expected.prepend_metadata
@@ -207,8 +207,8 @@ def test_sort_context():
ctx = sort_context(context)
assert ctx is not None, "Context is none"
num = num_tokens(ctx)
assert num == 828 if platform.system() == "Windows" else 826, (
f"num_tokens is not matched for platform (win = 827, else 826): {num}"
assert num == (825 if platform.system() == "Windows" else 826), (
    f"num_tokens is not matched for platform (win = 825, else 826): {num}"
)


3 changes: 1 addition & 2 deletions tests/unit/indexing/text_splitting/test_text_splitting.py
@@ -83,8 +83,7 @@ def test_token_text_splitter(mock_tokenizer, mock_split_text):
def test_encode_basic():
splitter = TokenTextSplitter()
result = splitter.encode("abc def")

assert result == [13997, 711], "Encoding failed to return expected tokens"
assert result == [26682, 1056], "Encoding failed to return expected tokens"


def test_num_tokens_empty_input():
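The expected token ids above change because the default encoding moves from `cl100k_base` to `o200k_base` in this PR; presumably `TokenTextSplitter` now picks up the new default. A hedged way to see the difference with `tiktoken` (the values are the ones asserted in the old and new tests):

```python
# Compare the two encodings on the test string; ids are those cited in the diff.
import tiktoken

print(tiktoken.get_encoding("cl100k_base").encode("abc def"))  # previously asserted: [13997, 711]
print(tiktoken.get_encoding("o200k_base").encode("abc def"))   # now asserted: [26682, 1056]
```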
Binary file modified tests/verbs/data/communities.parquet
Binary file modified tests/verbs/data/community_reports.parquet
Binary file modified tests/verbs/data/covariates.parquet
Binary file modified tests/verbs/data/documents.parquet
Binary file modified tests/verbs/data/entities.parquet
Binary file modified tests/verbs/data/relationships.parquet
Binary file modified tests/verbs/data/text_units.parquet
Binary file modified tests/verbs/data/text_units_metadata.parquet
Binary file modified tests/verbs/data/text_units_metadata_included_chunk.parquet
10 changes: 3 additions & 7 deletions tests/verbs/test_create_base_text_units.py
@@ -25,7 +25,7 @@ async def test_create_base_text_units():

actual = await load_table_from_storage("text_units", context.output_storage)

compare_outputs(actual, expected, columns=["text", "document_ids", "n_tokens"])
compare_outputs(actual, expected, columns=["text", "document_id", "n_tokens"])


async def test_create_base_text_units_metadata():
@@ -34,8 +34,6 @@ async def test_create_base_text_units_metadata():
context = await create_test_context()

config = create_graphrag_config({"models": DEFAULT_MODEL_CONFIG})
# test data was created with 4o, so we need to match the encoding for chunks to be identical
config.chunks.encoding_model = "o200k_base"
config.input.metadata = ["title"]
config.chunks.prepend_metadata = True

@@ -44,7 +42,7 @@
await run_workflow(config, context)

actual = await load_table_from_storage("text_units", context.output_storage)
compare_outputs(actual, expected)
compare_outputs(actual, expected, ["text", "document_id", "n_tokens"])


async def test_create_base_text_units_metadata_included_in_chunk():
@@ -53,8 +51,6 @@ async def test_create_base_text_units_metadata_included_in_chunk():
context = await create_test_context()

config = create_graphrag_config({"models": DEFAULT_MODEL_CONFIG})
# test data was created with 4o, so we need to match the encoding for chunks to be identical
config.chunks.encoding_model = "o200k_base"
config.input.metadata = ["title"]
config.chunks.prepend_metadata = True
config.chunks.chunk_size_includes_metadata = True
@@ -65,4 +61,4 @@

actual = await load_table_from_storage("text_units", context.output_storage)
# only check the columns from the base workflow - our expected table is the final and will have more
compare_outputs(actual, expected, columns=["text", "document_ids", "n_tokens"])
compare_outputs(actual, expected, columns=["text", "document_id", "n_tokens"])