Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions .semversioner/next-release/major-20250909205146252760.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
{
"type": "major",
"description": "Remove text unit group-by ability."
}
1 change: 0 additions & 1 deletion docs/config/yaml.md
Original file line number Diff line number Diff line change
Expand Up @@ -99,7 +99,6 @@ These settings configure how we parse documents into text chunks. This is necess

- `size` **int** - The max chunk size in tokens.
- `overlap` **int** - The chunk overlap in tokens.
- `group_by_columns` **list[str]** - Group documents by these fields before chunking.
- `strategy` **str**[tokens|sentences] - How to chunk the text.
- `encoding_model` **str** - The text encoding model to use for splitting on token boundaries.
- `prepend_metadata` **bool** - Determines if metadata values should be added at the beginning of each chunk. Default=`False`.
Expand Down
4 changes: 1 addition & 3 deletions docs/index/default_dataflow.md
Original file line number Diff line number Diff line change
Expand Up @@ -59,9 +59,7 @@ flowchart TB

The first phase of the default-configuration workflow is to transform input documents into _TextUnits_. A _TextUnit_ is a chunk of text that is used for our graph extraction techniques. TextUnits are also used as source references by extracted knowledge items, enabling breadcrumbs and provenance that trace concepts back to their original source text.

The chunk size (counted in tokens), is user-configurable. By default this is set to 300 tokens, although we've had positive experience with 1200-token chunks using a single "glean" step. (A "glean" step is a follow-on extraction). Larger chunks result in lower-fidelity output and less meaningful reference texts; however, using larger chunks can result in much faster processing time.

The group-by configuration is also user-configurable. By default, we align our chunks to document boundaries, meaning that there is a strict 1-to-many relationship between Documents and TextUnits. In rare cases, this can be turned into a many-to-many relationship. This is useful when the documents are very short and we need several of them to compose a meaningful analysis unit (e.g. Tweets or a chat log)
The chunk size (counted in tokens) is user-configurable. By default this is set to 1200 tokens. Larger chunks result in lower-fidelity output and less meaningful reference texts; however, using larger chunks can result in much faster processing time.

```mermaid
---
Expand Down
2 changes: 1 addition & 1 deletion docs/index/outputs.md
Original file line number Diff line number Diff line change
Expand Up @@ -102,7 +102,7 @@ List of all text chunks parsed from the input documents.
| ----------------- | ----- | ----------- |
| text | str | Raw full text of the chunk. |
| n_tokens | int | Number of tokens in the chunk. This should normally match the `chunk_size` config parameter, except for the last chunk, which is often shorter. |
| document_ids | str[] | List of document IDs the chunk came from. This is normally only 1 due to our default groupby, but for very short text documents (e.g., microblogs) it can be configured so text units span multiple documents. |
| document_id | str | ID of the document the chunk came from. |
| entity_ids | str[] | List of entities found in the text unit. |
| relationships_ids | str[] | List of relationships found in the text unit. |
| covariate_ids | str[] | Optional list of covariates found in the text unit. |
9 changes: 4 additions & 5 deletions graphrag/config/defaults.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,15 +27,15 @@
DEFAULT_OUTPUT_BASE_DIR = "output"
DEFAULT_CHAT_MODEL_ID = "default_chat_model"
DEFAULT_CHAT_MODEL_TYPE = ModelType.OpenAIChat
DEFAULT_CHAT_MODEL = "gpt-4-turbo-preview"
DEFAULT_CHAT_MODEL = "gpt-4o"
DEFAULT_CHAT_MODEL_AUTH_TYPE = AuthType.APIKey
DEFAULT_EMBEDDING_MODEL_ID = "default_embedding_model"
DEFAULT_EMBEDDING_MODEL_TYPE = ModelType.OpenAIEmbedding
DEFAULT_EMBEDDING_MODEL = "text-embedding-3-small"
DEFAULT_EMBEDDING_MODEL = "text-embedding-ada-002"
DEFAULT_EMBEDDING_MODEL_AUTH_TYPE = AuthType.APIKey
DEFAULT_VECTOR_STORE_ID = "default_vector_store"

ENCODING_MODEL = "cl100k_base"
ENCODING_MODEL = "o200k_base"
COGNITIVE_SERVICES_AUDIENCE = "https://cognitiveservices.azure.com/.default"


Expand Down Expand Up @@ -68,9 +68,8 @@ class ChunksDefaults:

size: int = 1200
overlap: int = 100
group_by_columns: list[str] = field(default_factory=lambda: ["id"])
strategy: ClassVar[ChunkStrategyType] = ChunkStrategyType.tokens
encoding_model: str = "cl100k_base"
encoding_model: str = ENCODING_MODEL
prepend_metadata: bool = False
chunk_size_includes_metadata: bool = False

Expand Down
1 change: 0 additions & 1 deletion graphrag/config/init_content.py
Original file line number Diff line number Diff line change
Expand Up @@ -67,7 +67,6 @@
chunks:
size: {graphrag_config_defaults.chunks.size}
overlap: {graphrag_config_defaults.chunks.overlap}
group_by_columns: [{",".join(graphrag_config_defaults.chunks.group_by_columns)}]

### Output/storage settings ###
## If blob storage is specified in the following four sections,
Expand Down
4 changes: 0 additions & 4 deletions graphrag/config/models/chunking_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,10 +20,6 @@ class ChunkingConfig(BaseModel):
description="The chunk overlap to use.",
default=graphrag_config_defaults.chunks.overlap,
)
group_by_columns: list[str] = Field(
description="The chunk by columns to use.",
default=graphrag_config_defaults.chunks.group_by_columns,
)
strategy: ChunkStrategyType = Field(
description="The chunking strategy to use.",
default=graphrag_config_defaults.chunks.strategy,
Expand Down
4 changes: 2 additions & 2 deletions graphrag/data_model/schemas.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,7 @@
RELATIONSHIP_IDS = "relationship_ids"
TEXT_UNIT_IDS = "text_unit_ids"
COVARIATE_IDS = "covariate_ids"
DOCUMENT_IDS = "document_ids"
DOCUMENT_ID = "document_id"

PERIOD = "period"
SIZE = "size"
Expand Down Expand Up @@ -142,7 +142,7 @@
SHORT_ID,
TEXT,
N_TOKENS,
DOCUMENT_IDS,
DOCUMENT_ID,
ENTITY_IDS,
RELATIONSHIP_IDS,
COVARIATE_IDS,
Expand Down
8 changes: 4 additions & 4 deletions graphrag/data_model/text_unit.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,8 +28,8 @@ class TextUnit(Identified):
n_tokens: int | None = None
"""The number of tokens in the text (optional)."""

document_ids: list[str] | None = None
"""List of document IDs in which the text unit appears (optional)."""
document_id: str | None = None
"""ID of the document in which the text unit appears (optional)."""

attributes: dict[str, Any] | None = None
"""A dictionary of additional attributes associated with the text unit (optional)."""
Expand All @@ -45,7 +45,7 @@ def from_dict(
relationships_key: str = "relationship_ids",
covariates_key: str = "covariate_ids",
n_tokens_key: str = "n_tokens",
document_ids_key: str = "document_ids",
document_id_key: str = "document_id",
attributes_key: str = "attributes",
) -> "TextUnit":
"""Create a new text unit from the dict data."""
Expand All @@ -57,6 +57,6 @@ def from_dict(
relationship_ids=d.get(relationships_key),
covariate_ids=d.get(covariates_key),
n_tokens=d.get(n_tokens_key),
document_ids=d.get(document_ids_key),
document_id=d.get(document_id_key),
attributes=d.get(attributes_key),
)
53 changes: 16 additions & 37 deletions graphrag/index/workflows/create_base_text_units.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,6 @@ async def run_workflow(
output = create_base_text_units(
documents,
context.callbacks,
chunks.group_by_columns,
chunks.size,
chunks.overlap,
chunks.encoding_model,
Expand All @@ -53,7 +52,6 @@ async def run_workflow(
def create_base_text_units(
documents: pd.DataFrame,
callbacks: WorkflowCallbacks,
group_by_columns: list[str],
size: int,
overlap: int,
encoding_model: str,
Expand All @@ -62,26 +60,9 @@ def create_base_text_units(
chunk_size_includes_metadata: bool = False,
) -> pd.DataFrame:
"""All the steps to transform base text_units."""
sort = documents.sort_values(by=["id"], ascending=[True])
documents.sort_values(by=["id"], ascending=[True], inplace=True)

sort["text_with_ids"] = list(
zip(*[sort[col] for col in ["id", "text"]], strict=True)
)

agg_dict = {"text_with_ids": list}
if "metadata" in documents:
agg_dict["metadata"] = "first" # type: ignore

aggregated = (
(
sort.groupby(group_by_columns, sort=False)
if len(group_by_columns) > 0
else sort.groupby(lambda _x: True)
)
.agg(agg_dict)
.reset_index()
)
aggregated.rename(columns={"text_with_ids": "texts"}, inplace=True)
encode, _ = get_encoding_fn(encoding_model)

def chunker(row: pd.Series) -> Any:
line_delimiter = ".\n"
Expand All @@ -99,15 +80,14 @@ def chunker(row: pd.Series) -> Any:
)

if chunk_size_includes_metadata:
encode, _ = get_encoding_fn(encoding_model)
metadata_tokens = len(encode(metadata_str))
if metadata_tokens >= size:
message = "Metadata tokens exceeds the maximum tokens per chunk. Please increase the tokens per chunk."
raise ValueError(message)

chunked = chunk_text(
pd.DataFrame([row]).reset_index(drop=True),
column="texts",
column="text",
size=size - metadata_tokens,
overlap=overlap,
encoding_model=encoding_model,
Expand All @@ -128,7 +108,7 @@ def chunker(row: pd.Series) -> Any:
return row

# Track progress of row-wise apply operation
total_rows = len(aggregated)
total_rows = len(documents)
logger.info("Starting chunking process for %d documents", total_rows)

def chunker_with_logging(row: pd.Series, row_index: int) -> Any:
Expand All @@ -137,27 +117,26 @@ def chunker_with_logging(row: pd.Series, row_index: int) -> Any:
logger.info("chunker progress: %d/%d", row_index + 1, total_rows)
return result

aggregated = aggregated.apply(
text_units = documents.apply(
lambda row: chunker_with_logging(row, row.name), axis=1
)

aggregated = cast("pd.DataFrame", aggregated[[*group_by_columns, "chunks"]])
aggregated = aggregated.explode("chunks")
aggregated.rename(
text_units = cast("pd.DataFrame", text_units[["id", "chunks"]])
text_units = text_units.explode("chunks")
text_units.rename(
columns={
"chunks": "chunk",
"id": "document_id",
"chunks": "text",
},
inplace=True,
)
aggregated["id"] = aggregated.apply(
lambda row: gen_sha512_hash(row, ["chunk"]), axis=1
)
aggregated[["document_ids", "chunk", "n_tokens"]] = pd.DataFrame(
aggregated["chunk"].tolist(), index=aggregated.index

text_units["id"] = text_units.apply(
lambda row: gen_sha512_hash(row, ["text"]), axis=1
)
# rename for downstream consumption
aggregated.rename(columns={"chunk": "text"}, inplace=True)
# get a final token measurement
text_units["n_tokens"] = text_units["text"].apply(lambda x: len(encode(x)))

return cast(
"pd.DataFrame", aggregated[aggregated["text"].notna()].reset_index(drop=True)
"pd.DataFrame", text_units[text_units["text"].notna()].reset_index(drop=True)
)
18 changes: 7 additions & 11 deletions graphrag/index/workflows/create_final_documents.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,19 +37,15 @@ def create_final_documents(
documents: pd.DataFrame, text_units: pd.DataFrame
) -> pd.DataFrame:
"""All the steps to transform final documents."""
exploded = (
text_units.explode("document_ids")
.loc[:, ["id", "document_ids", "text"]]
.rename(
columns={
"document_ids": "chunk_doc_id",
"id": "chunk_id",
"text": "chunk_text",
}
)
renamed = text_units.loc[:, ["id", "document_id", "text"]].rename(
columns={
"document_id": "chunk_doc_id",
"id": "chunk_id",
"text": "chunk_text",
}
)

joined = exploded.merge(
joined = renamed.merge(
documents,
left_on="chunk_doc_id",
right_on="id",
Expand Down
2 changes: 1 addition & 1 deletion graphrag/index/workflows/create_final_text_units.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,7 +59,7 @@ def create_final_text_units(
final_covariates: pd.DataFrame | None,
) -> pd.DataFrame:
"""All the steps to transform the text units."""
selected = text_units.loc[:, ["id", "text", "document_ids", "n_tokens"]]
selected = text_units.loc[:, ["id", "text", "document_id", "n_tokens"]]
selected["human_readable_id"] = selected.index

entity_join = _entities(final_entities)
Expand Down
1 change: 0 additions & 1 deletion graphrag/prompt_tune/loader/input.py
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,6 @@ async def load_docs_in_chunks(
chunks_df = create_base_text_units(
documents=dataset,
callbacks=NoopWorkflowCallbacks(),
group_by_columns=chunk_config.group_by_columns,
size=chunk_size,
overlap=overlap,
encoding_model=chunk_config.encoding_model,
Expand Down
4 changes: 2 additions & 2 deletions graphrag/query/input/loaders/dfs.py
Original file line number Diff line number Diff line change
Expand Up @@ -234,7 +234,7 @@ def read_text_units(
relationships_col: str | None = "relationship_ids",
covariates_col: str | None = "covariate_ids",
tokens_col: str | None = "n_tokens",
document_ids_col: str | None = "document_ids",
document_id_col: str | None = "document_id",
attributes_cols: list[str] | None = None,
) -> list[TextUnit]:
"""Read text units from a dataframe using pre-converted records."""
Expand All @@ -250,7 +250,7 @@ def read_text_units(
row, covariates_col, key_type=str, value_type=str
),
n_tokens=to_optional_int(row, tokens_col),
document_ids=to_optional_list(row, document_ids_col, item_type=str),
document_id=to_optional_str(row, document_id_col),
attributes=(
{col: row.get(col) for col in attributes_cols}
if attributes_cols
Expand Down
4 changes: 2 additions & 2 deletions tests/unit/config/fixtures/minimal_config/settings.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,8 @@ models:
default_chat_model:
api_key: ${CUSTOM_API_KEY}
type: openai_chat
model: gpt-4-turbo-preview
model: gpt-4o
default_embedding_model:
api_key: ${CUSTOM_API_KEY}
type: openai_embedding
model: text-embedding-3-small
model: text-embedding-ada-002
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,8 @@ models:
default_chat_model:
api_key: ${SOME_NON_EXISTENT_ENV_VAR}
type: openai_chat
model: gpt-4-turbo-preview
model: gpt-4o
default_embedding_model:
api_key: ${SOME_NON_EXISTENT_ENV_VAR}
type: openai_embedding
model: text-embedding-3-small
model: text-embedding-ada-002
1 change: 0 additions & 1 deletion tests/unit/config/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -192,7 +192,6 @@ def assert_text_embedding_configs(
def assert_chunking_configs(actual: ChunkingConfig, expected: ChunkingConfig) -> None:
assert actual.size == expected.size
assert actual.overlap == expected.overlap
assert actual.group_by_columns == expected.group_by_columns
assert actual.strategy == expected.strategy
assert actual.encoding_model == expected.encoding_model
assert actual.prepend_metadata == expected.prepend_metadata
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -207,8 +207,8 @@ def test_sort_context():
ctx = sort_context(context)
assert ctx is not None, "Context is none"
num = num_tokens(ctx)
assert num == 828 if platform.system() == "Windows" else 826, (
f"num_tokens is not matched for platform (win = 827, else 826): {num}"
assert num == 825 if platform.system() == "Windows" else 826, (
f"num_tokens is not matched for platform (win = 825, else 826): {num}"
)


Expand Down
3 changes: 1 addition & 2 deletions tests/unit/indexing/text_splitting/test_text_splitting.py
Original file line number Diff line number Diff line change
Expand Up @@ -83,8 +83,7 @@ def test_token_text_splitter(mock_tokenizer, mock_split_text):
def test_encode_basic():
splitter = TokenTextSplitter()
result = splitter.encode("abc def")

assert result == [13997, 711], "Encoding failed to return expected tokens"
assert result == [26682, 1056], "Encoding failed to return expected tokens"


def test_num_tokens_empty_input():
Expand Down
Binary file modified tests/verbs/data/communities.parquet
Binary file not shown.
Binary file modified tests/verbs/data/community_reports.parquet
Binary file not shown.
Binary file modified tests/verbs/data/covariates.parquet
Binary file not shown.
Binary file modified tests/verbs/data/documents.parquet
Binary file not shown.
Binary file modified tests/verbs/data/entities.parquet
Binary file not shown.
Binary file modified tests/verbs/data/relationships.parquet
Binary file not shown.
Binary file modified tests/verbs/data/text_units.parquet
Binary file not shown.
Binary file modified tests/verbs/data/text_units_metadata.parquet
Binary file not shown.
Binary file modified tests/verbs/data/text_units_metadata_included_chunk.parquet
Binary file not shown.
10 changes: 3 additions & 7 deletions tests/verbs/test_create_base_text_units.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ async def test_create_base_text_units():

actual = await load_table_from_storage("text_units", context.output_storage)

compare_outputs(actual, expected, columns=["text", "document_ids", "n_tokens"])
compare_outputs(actual, expected, columns=["text", "document_id", "n_tokens"])


async def test_create_base_text_units_metadata():
Expand All @@ -34,8 +34,6 @@ async def test_create_base_text_units_metadata():
context = await create_test_context()

config = create_graphrag_config({"models": DEFAULT_MODEL_CONFIG})
# test data was created with 4o, so we need to match the encoding for chunks to be identical
config.chunks.encoding_model = "o200k_base"
config.input.metadata = ["title"]
config.chunks.prepend_metadata = True

Expand All @@ -44,7 +42,7 @@ async def test_create_base_text_units_metadata():
await run_workflow(config, context)

actual = await load_table_from_storage("text_units", context.output_storage)
compare_outputs(actual, expected)
compare_outputs(actual, expected, ["text", "document_id", "n_tokens"])


async def test_create_base_text_units_metadata_included_in_chunk():
Expand All @@ -53,8 +51,6 @@ async def test_create_base_text_units_metadata_included_in_chunk():
context = await create_test_context()

config = create_graphrag_config({"models": DEFAULT_MODEL_CONFIG})
# test data was created with 4o, so we need to match the encoding for chunks to be identical
config.chunks.encoding_model = "o200k_base"
config.input.metadata = ["title"]
config.chunks.prepend_metadata = True
config.chunks.chunk_size_includes_metadata = True
Expand All @@ -65,4 +61,4 @@ async def test_create_base_text_units_metadata_included_in_chunk():

actual = await load_table_from_storage("text_units", context.output_storage)
# only check the columns from the base workflow - our expected table is the final and will have more
compare_outputs(actual, expected, columns=["text", "document_ids", "n_tokens"])
compare_outputs(actual, expected, columns=["text", "document_id", "n_tokens"])
Loading
Loading