Merged
Changes from 5 commits
4 changes: 4 additions & 0 deletions .semversioner/next-release/major-20250909205146252760.json
@@ -0,0 +1,4 @@
{
"type": "major",
"description": "Remove text unit group-by ability."
}
1 change: 0 additions & 1 deletion docs/config/yaml.md
@@ -100,7 +100,6 @@ These settings configure how we parse documents into text chunks. This is necess

- `size` **int** - The max chunk size in tokens.
- `overlap` **int** - The chunk overlap in tokens.
- `group_by_columns` **list[str]** - Group documents by these fields before chunking.
- `strategy` **str**[tokens|sentences] - How to chunk the text.
- `encoding_model` **str** - The text encoding model to use for splitting on token boundaries.
- `prepend_metadata` **bool** - Determines if metadata values should be added at the beginning of each chunk. Default=`False`.
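With `group_by_columns` removed from the settings list above, the chunking surface is just size, overlap, strategy, encoding, and the metadata flags. A minimal sketch, assuming pydantic v2 and the `ChunkingConfig` model shown later in this diff (normally these values come from `settings.yaml` rather than Python):

```python
# Hedged sketch of the post-PR chunk settings; group_by_columns is gone, and the
# field names mirror graphrag/config/models/chunking_config.py in this diff.
from graphrag.config.models.chunking_config import ChunkingConfig

chunks = ChunkingConfig(
    size=1200,                    # max chunk size in tokens
    overlap=100,                  # chunk overlap in tokens
    encoding_model="o200k_base",  # token encoding used for splitting
    prepend_metadata=False,       # metadata values are not prepended to chunks
)
print(chunks.model_dump())        # strategy stays at its default (tokens)
```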
4 changes: 1 addition & 3 deletions docs/index/default_dataflow.md
@@ -60,9 +60,7 @@ flowchart TB

The first phase of the default-configuration workflow is to transform input documents into _TextUnits_. A _TextUnit_ is a chunk of text used for our graph extraction techniques. TextUnits are also used as source references by extracted knowledge items, providing breadcrumbs and provenance from concepts back to their original source text.

The chunk size (counted in tokens), is user-configurable. By default this is set to 300 tokens, although we've had positive experience with 1200-token chunks using a single "glean" step. (A "glean" step is a follow-on extraction). Larger chunks result in lower-fidelity output and less meaningful reference texts; however, using larger chunks can result in much faster processing time.

The group-by configuration is also user-configurable. By default, we align our chunks to document boundaries, meaning that there is a strict 1-to-many relationship between Documents and TextUnits. In rare cases, this can be turned into a many-to-many relationship. This is useful when the documents are very short and we need several of them to compose a meaningful analysis unit (e.g. Tweets or a chat log)
The chunk size (counted in tokens) is user-configurable. By default this is set to 1200 tokens. Larger chunks result in lower-fidelity output and less meaningful reference texts; however, using larger chunks can result in much faster processing time.
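To make the "counted in tokens" point concrete, here is a hedged illustration. It assumes `tiktoken` (which the token-based chunking strategy relies on) and a hypothetical local input file, and uses the 1200/100 size/overlap defaults from this PR; it is not the actual pipeline code.

```python
# Rough token math for the default 1200-token chunks with 100-token overlap.
import math
import tiktoken

enc = tiktoken.get_encoding("o200k_base")  # default encoding after this PR
text = open("my_document.txt", encoding="utf-8").read()  # hypothetical file
tokens = enc.encode(text)

size, overlap = 1200, 100
n_chunks = max(1, math.ceil(max(len(tokens) - overlap, 1) / (size - overlap)))
print(f"{len(tokens)} tokens -> roughly {n_chunks} chunks of up to {size} tokens")
```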

```mermaid
---
2 changes: 1 addition & 1 deletion docs/index/outputs.md
@@ -104,7 +104,7 @@ List of all text chunks parsed from the input documents.
| ----------------- | ----- | ----------- |
| text | str | Raw full text of the chunk. |
| n_tokens | int | Number of tokens in the chunk. This should normally match the `chunk_size` config parameter, except for the last chunk which is often shorter. |
| document_ids | str[] | List of document IDs the chunk came from. This is normally only 1 due to our default groupby, but for very short text documents (e.g., microblogs) it can be configured so text units span multiple documents. |
| document_id | str | ID of the document the chunk came from. |
| entity_ids | str[] | List of entities found in the text unit. |
| relationships_ids | str[] | List of relationships found in the text unit. |
| covariate_ids | str[] | Optional list of covariates found in the text unit. |
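A quick way to sanity-check the new schema is to load the emitted table. This sketch assumes the default parquet output layout (`output/text_units.parquet`); adjust the path to your configured output directory.

```python
# Hedged schema check for the text_units output after this PR:
# document_id is a single string per chunk instead of a document_ids list.
import pandas as pd

text_units = pd.read_parquet("output/text_units.parquet")  # assumed default path
print(text_units[["id", "text", "n_tokens", "document_id"]].head())

assert text_units["document_id"].map(lambda v: isinstance(v, str)).all()
```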
9 changes: 4 additions & 5 deletions graphrag/config/defaults.py
@@ -27,15 +27,15 @@
DEFAULT_OUTPUT_BASE_DIR = "output"
DEFAULT_CHAT_MODEL_ID = "default_chat_model"
DEFAULT_CHAT_MODEL_TYPE = ModelType.OpenAIChat
DEFAULT_CHAT_MODEL = "gpt-4-turbo-preview"
DEFAULT_CHAT_MODEL = "gpt-4o"
DEFAULT_CHAT_MODEL_AUTH_TYPE = AuthType.APIKey
DEFAULT_EMBEDDING_MODEL_ID = "default_embedding_model"
DEFAULT_EMBEDDING_MODEL_TYPE = ModelType.OpenAIEmbedding
DEFAULT_EMBEDDING_MODEL = "text-embedding-3-small"
DEFAULT_EMBEDDING_MODEL = "text-embedding-ada-002"
DEFAULT_EMBEDDING_MODEL_AUTH_TYPE = AuthType.APIKey
DEFAULT_VECTOR_STORE_ID = "default_vector_store"

ENCODING_MODEL = "cl100k_base"
ENCODING_MODEL = "o200k_base"
COGNITIVE_SERVICES_AUDIENCE = "https://cognitiveservices.azure.com/.default"


@@ -68,9 +68,8 @@ class ChunksDefaults:

size: int = 1200
overlap: int = 100
group_by_columns: list[str] = field(default_factory=lambda: ["id"])
strategy: ClassVar[ChunkStrategyType] = ChunkStrategyType.tokens
encoding_model: str = "cl100k_base"
encoding_model: str = ENCODING_MODEL
prepend_metadata: bool = False
chunk_size_includes_metadata: bool = False

1 change: 0 additions & 1 deletion graphrag/config/init_content.py
@@ -67,7 +67,6 @@
chunks:
size: {graphrag_config_defaults.chunks.size}
overlap: {graphrag_config_defaults.chunks.overlap}
group_by_columns: [{",".join(graphrag_config_defaults.chunks.group_by_columns)}]

### Output/storage settings ###
## If blob storage is specified in the following four sections,
4 changes: 0 additions & 4 deletions graphrag/config/models/chunking_config.py
@@ -20,10 +20,6 @@ class ChunkingConfig(BaseModel):
description="The chunk overlap to use.",
default=graphrag_config_defaults.chunks.overlap,
)
group_by_columns: list[str] = Field(
description="The chunk by columns to use.",
default=graphrag_config_defaults.chunks.group_by_columns,
)
strategy: ChunkStrategyType = Field(
description="The chunking strategy to use.",
default=graphrag_config_defaults.chunks.strategy,
4 changes: 2 additions & 2 deletions graphrag/data_model/schemas.py
@@ -54,7 +54,7 @@
RELATIONSHIP_IDS = "relationship_ids"
TEXT_UNIT_IDS = "text_unit_ids"
COVARIATE_IDS = "covariate_ids"
DOCUMENT_IDS = "document_ids"
DOCUMENT_ID = "document_id"

PERIOD = "period"
SIZE = "size"
@@ -146,7 +146,7 @@
SHORT_ID,
TEXT,
N_TOKENS,
DOCUMENT_IDS,
DOCUMENT_ID,
ENTITY_IDS,
RELATIONSHIP_IDS,
COVARIATE_IDS,
8 changes: 4 additions & 4 deletions graphrag/data_model/text_unit.py
@@ -28,8 +28,8 @@ class TextUnit(Identified):
n_tokens: int | None = None
"""The number of tokens in the text (optional)."""

document_ids: list[str] | None = None
"""List of document IDs in which the text unit appears (optional)."""
document_id: str | None = None
"""ID of the document in which the text unit appears (optional)."""

attributes: dict[str, Any] | None = None
"""A dictionary of additional attributes associated with the text unit (optional)."""
@@ -45,7 +45,7 @@ def from_dict(
relationships_key: str = "relationship_ids",
covariates_key: str = "covariate_ids",
n_tokens_key: str = "n_tokens",
document_ids_key: str = "document_ids",
document_id_key: str = "document_id",
attributes_key: str = "attributes",
) -> "TextUnit":
"""Create a new text unit from the dict data."""
@@ -57,6 +57,6 @@ def from_dict(
relationship_ids=d.get(relationships_key),
covariate_ids=d.get(covariates_key),
n_tokens=d.get(n_tokens_key),
document_ids=d.get(document_ids_key),
document_id=d.get(document_id_key),
attributes=d.get(attributes_key),
)
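With the model change above, a text unit carries a single `document_id`. A hedged construction sketch follows; only `document_id` and `n_tokens` are taken from this diff, while `id`, `short_id`, and `text` are assumed from the rest of the data model.

```python
# Hedged sketch of the updated TextUnit model; field names other than
# document_id / n_tokens (shown in this diff) are assumptions.
from graphrag.data_model.text_unit import TextUnit

unit = TextUnit(
    id="f3a1...",        # normally a sha512 hash of the chunk text
    short_id="1",
    text="GraphRAG splits each document into TextUnits before extraction.",
    n_tokens=11,
    document_id="doc-1",  # was document_ids: list[str] before this PR
)
print(unit.document_id)
```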
53 changes: 16 additions & 37 deletions graphrag/index/workflows/create_base_text_units.py
@@ -35,7 +35,6 @@ async def run_workflow(
output = create_base_text_units(
documents,
context.callbacks,
chunks.group_by_columns,
chunks.size,
chunks.overlap,
chunks.encoding_model,
@@ -53,7 +52,6 @@
def create_base_text_units(
documents: pd.DataFrame,
callbacks: WorkflowCallbacks,
group_by_columns: list[str],
size: int,
overlap: int,
encoding_model: str,
@@ -62,26 +60,9 @@ def create_base_text_units(
chunk_size_includes_metadata: bool = False,
) -> pd.DataFrame:
"""All the steps to transform base text_units."""
sort = documents.sort_values(by=["id"], ascending=[True])
documents = documents.sort_values(by=["id"], ascending=[True]).reset_index(drop=True)

sort["text_with_ids"] = list(
zip(*[sort[col] for col in ["id", "text"]], strict=True)
)

agg_dict = {"text_with_ids": list}
if "metadata" in documents:
agg_dict["metadata"] = "first" # type: ignore

aggregated = (
(
sort.groupby(group_by_columns, sort=False)
if len(group_by_columns) > 0
else sort.groupby(lambda _x: True)
)
.agg(agg_dict)
.reset_index()
)
aggregated.rename(columns={"text_with_ids": "texts"}, inplace=True)
encode, _ = get_encoding_fn(encoding_model)

def chunker(row: pd.Series) -> Any:
line_delimiter = ".\n"
@@ -99,15 +80,14 @@ def chunker(row: pd.Series) -> Any:
)

if chunk_size_includes_metadata:
encode, _ = get_encoding_fn(encoding_model)
metadata_tokens = len(encode(metadata_str))
if metadata_tokens >= size:
message = "Metadata tokens exceeds the maximum tokens per chunk. Please increase the tokens per chunk."
raise ValueError(message)

chunked = chunk_text(
pd.DataFrame([row]).reset_index(drop=True),
column="texts",
column="text",
size=size - metadata_tokens,
overlap=overlap,
encoding_model=encoding_model,
Expand All @@ -128,7 +108,7 @@ def chunker(row: pd.Series) -> Any:
return row

# Track progress of row-wise apply operation
total_rows = len(aggregated)
total_rows = len(documents)
logger.info("Starting chunking process for %d documents", total_rows)

def chunker_with_logging(row: pd.Series, row_index: int) -> Any:
@@ -137,27 +117,26 @@ def chunker_with_logging(row: pd.Series, row_index: int) -> Any:
logger.info("chunker progress: %d/%d", row_index + 1, total_rows)
return result

aggregated = aggregated.apply(
text_units = documents.apply(
lambda row: chunker_with_logging(row, row.name), axis=1
)

aggregated = cast("pd.DataFrame", aggregated[[*group_by_columns, "chunks"]])
aggregated = aggregated.explode("chunks")
aggregated.rename(
text_units = cast("pd.DataFrame", text_units[["id", "chunks"]])
text_units = text_units.explode("chunks")
text_units.rename(
columns={
"chunks": "chunk",
"id": "document_id",
"chunks": "text",
},
inplace=True,
)
aggregated["id"] = aggregated.apply(
lambda row: gen_sha512_hash(row, ["chunk"]), axis=1
)
aggregated[["document_ids", "chunk", "n_tokens"]] = pd.DataFrame(
aggregated["chunk"].tolist(), index=aggregated.index

text_units["id"] = text_units.apply(
lambda row: gen_sha512_hash(row, ["text"]), axis=1
)
# rename for downstream consumption
aggregated.rename(columns={"chunk": "text"}, inplace=True)
# get a final token measurement
text_units["n_tokens"] = text_units["text"].apply(lambda x: len(encode(x)))

return cast(
"pd.DataFrame", aggregated[aggregated["text"].notna()].reset_index(drop=True)
"pd.DataFrame", text_units[text_units["text"].notna()].reset_index(drop=True)
)
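The reworked workflow above now chunks each document row directly, explodes the chunks, hashes each chunk into an `id`, and re-counts tokens. The standalone pandas sketch below mirrors that shape; the whitespace splitter and toy data are stand-ins for graphrag's token-based `chunk_text`, not the real implementation.

```python
# Standalone illustration of the new per-document flow (no group-by step).
import hashlib
import pandas as pd

def naive_chunks(text: str, size: int = 50, overlap: int = 10) -> list[str]:
    # Crude word-window splitter standing in for the token-based chunker.
    words = text.split()
    step = size - overlap
    return [" ".join(words[i : i + size]) for i in range(0, max(len(words), 1), step)]

documents = pd.DataFrame({
    "id": ["doc-1", "doc-2"],
    "text": ["alpha " * 120, "beta " * 30],
})

text_units = documents.assign(chunks=documents["text"].apply(naive_chunks))[["id", "chunks"]]
text_units = text_units.explode("chunks").rename(columns={"id": "document_id", "chunks": "text"})
text_units["id"] = text_units["text"].map(lambda t: hashlib.sha512(t.encode()).hexdigest())
text_units["n_tokens"] = text_units["text"].map(lambda t: len(t.split()))

print(text_units[["id", "document_id", "text", "n_tokens"]].reset_index(drop=True))
```

Each document yields one or more text units, and each text unit points back to exactly one `document_id`, matching the updated outputs schema.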
18 changes: 7 additions & 11 deletions graphrag/index/workflows/create_final_documents.py
@@ -37,19 +37,15 @@ def create_final_documents(
documents: pd.DataFrame, text_units: pd.DataFrame
) -> pd.DataFrame:
"""All the steps to transform final documents."""
exploded = (
text_units.explode("document_ids")
.loc[:, ["id", "document_ids", "text"]]
.rename(
columns={
"document_ids": "chunk_doc_id",
"id": "chunk_id",
"text": "chunk_text",
}
)
renamed = text_units.loc[:, ["id", "document_id", "text"]].rename(
columns={
"document_id": "chunk_doc_id",
"id": "chunk_id",
"text": "chunk_text",
}
)

joined = exploded.merge(
joined = renamed.merge(
documents,
left_on="chunk_doc_id",
right_on="id",
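Because each text unit now names exactly one document, the join above no longer needs an `explode`. A small hedged sketch of the same rename-and-merge with toy frames; the aggregation at the end is illustrative only, since the remainder of the real workflow is outside this hunk.

```python
# Toy version of the simplified join: rename chunk columns, merge onto documents.
import pandas as pd

documents = pd.DataFrame({"id": ["doc-1", "doc-2"], "title": ["A", "B"]})
text_units = pd.DataFrame({
    "id": ["tu-1", "tu-2", "tu-3"],
    "document_id": ["doc-1", "doc-1", "doc-2"],
    "text": ["chunk one", "chunk two", "chunk three"],
})

renamed = text_units.rename(columns={
    "id": "chunk_id",
    "document_id": "chunk_doc_id",
    "text": "chunk_text",
})
joined = renamed.merge(documents, left_on="chunk_doc_id", right_on="id", how="left")

# Illustrative only: collect chunk ids per document.
print(joined.groupby("id")["chunk_id"].apply(list))
```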
2 changes: 1 addition & 1 deletion graphrag/index/workflows/create_final_text_units.py
@@ -59,7 +59,7 @@ def create_final_text_units(
final_covariates: pd.DataFrame | None,
) -> pd.DataFrame:
"""All the steps to transform the text units."""
selected = text_units.loc[:, ["id", "text", "document_ids", "n_tokens"]]
selected = text_units.loc[:, ["id", "text", "document_id", "n_tokens"]]
selected["human_readable_id"] = selected.index

entity_join = _entities(final_entities)
1 change: 0 additions & 1 deletion graphrag/prompt_tune/loader/input.py
@@ -58,7 +58,6 @@ async def load_docs_in_chunks(
chunks_df = create_base_text_units(
documents=dataset,
callbacks=NoopWorkflowCallbacks(),
group_by_columns=chunk_config.group_by_columns,
size=chunk_size,
overlap=overlap,
encoding_model=chunk_config.encoding_model,
4 changes: 2 additions & 2 deletions graphrag/query/input/loaders/dfs.py
@@ -234,7 +234,7 @@ def read_text_units(
relationships_col: str | None = "relationship_ids",
covariates_col: str | None = "covariate_ids",
tokens_col: str | None = "n_tokens",
document_ids_col: str | None = "document_ids",
document_id_col: str | None = "document_id",
attributes_cols: list[str] | None = None,
) -> list[TextUnit]:
"""Read text units from a dataframe using pre-converted records."""
@@ -250,7 +250,7 @@
row, covariates_col, key_type=str, value_type=str
),
n_tokens=to_optional_int(row, tokens_col),
document_ids=to_optional_list(row, document_ids_col, item_type=str),
document_id=to_optional_str(row, document_id_col),
attributes=(
{col: row.get(col) for col in attributes_cols}
if attributes_cols
4 changes: 2 additions & 2 deletions tests/unit/config/fixtures/minimal_config/settings.yaml
@@ -2,8 +2,8 @@ models:
default_chat_model:
api_key: ${CUSTOM_API_KEY}
type: openai_chat
model: gpt-4-turbo-preview
model: gpt-4o
default_embedding_model:
api_key: ${CUSTOM_API_KEY}
type: openai_embedding
model: text-embedding-3-small
model: text-embedding-ada-002
@@ -2,8 +2,8 @@ models:
default_chat_model:
api_key: ${SOME_NON_EXISTENT_ENV_VAR}
type: openai_chat
model: gpt-4-turbo-preview
model: gpt-4o
default_embedding_model:
api_key: ${SOME_NON_EXISTENT_ENV_VAR}
type: openai_embedding
model: text-embedding-3-small
model: text-embedding-ada-002
1 change: 0 additions & 1 deletion tests/unit/config/utils.py
@@ -208,7 +208,6 @@ def assert_text_embedding_configs(
def assert_chunking_configs(actual: ChunkingConfig, expected: ChunkingConfig) -> None:
assert actual.size == expected.size
assert actual.overlap == expected.overlap
assert actual.group_by_columns == expected.group_by_columns
assert actual.strategy == expected.strategy
assert actual.encoding_model == expected.encoding_model
assert actual.prepend_metadata == expected.prepend_metadata
@@ -207,8 +207,8 @@ def test_sort_context():
ctx = sort_context(context)
assert ctx is not None, "Context is none"
num = num_tokens(ctx)
assert num == 828 if platform.system() == "Windows" else 826, (
f"num_tokens is not matched for platform (win = 827, else 826): {num}"
assert num == (825 if platform.system() == "Windows" else 826), (
    f"num_tokens is not matched for platform (win = 825, else 826): {num}"
)


3 changes: 1 addition & 2 deletions tests/unit/indexing/text_splitting/test_text_splitting.py
@@ -83,8 +83,7 @@ def test_token_text_splitter(mock_tokenizer, mock_split_text):
def test_encode_basic():
splitter = TokenTextSplitter()
result = splitter.encode("abc def")

assert result == [13997, 711], "Encoding failed to return expected tokens"
assert result == [26682, 1056], "Encoding failed to return expected tokens"


def test_num_tokens_empty_input():
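The expected token ids above change because the default encoding moves from `cl100k_base` to `o200k_base` in this PR; presumably `TokenTextSplitter` now picks up the new default. A hedged way to see the difference with `tiktoken` (the values are the ones asserted in the old and new tests):

```python
# Compare the two encodings on the test string; ids are those cited in the diff.
import tiktoken

print(tiktoken.get_encoding("cl100k_base").encode("abc def"))  # previously asserted: [13997, 711]
print(tiktoken.get_encoding("o200k_base").encode("abc def"))   # now asserted: [26682, 1056]
```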
Binary file modified tests/verbs/data/communities.parquet
Binary file modified tests/verbs/data/community_reports.parquet
Binary file modified tests/verbs/data/covariates.parquet
Binary file modified tests/verbs/data/documents.parquet
Binary file modified tests/verbs/data/entities.parquet
Binary file modified tests/verbs/data/relationships.parquet
Binary file modified tests/verbs/data/text_units.parquet
Binary file modified tests/verbs/data/text_units_metadata.parquet
Binary file modified tests/verbs/data/text_units_metadata_included_chunk.parquet
10 changes: 3 additions & 7 deletions tests/verbs/test_create_base_text_units.py
@@ -25,7 +25,7 @@ async def test_create_base_text_units():

actual = await load_table_from_storage("text_units", context.output_storage)

compare_outputs(actual, expected, columns=["text", "document_ids", "n_tokens"])
compare_outputs(actual, expected, columns=["text", "document_id", "n_tokens"])


async def test_create_base_text_units_metadata():
@@ -34,8 +34,6 @@ async def test_create_base_text_units_metadata():
context = await create_test_context()

config = create_graphrag_config({"models": DEFAULT_MODEL_CONFIG})
# test data was created with 4o, so we need to match the encoding for chunks to be identical
config.chunks.encoding_model = "o200k_base"
config.input.metadata = ["title"]
config.chunks.prepend_metadata = True

@@ -44,7 +42,7 @@
await run_workflow(config, context)

actual = await load_table_from_storage("text_units", context.output_storage)
compare_outputs(actual, expected)
compare_outputs(actual, expected, ["text", "document_id", "n_tokens"])


async def test_create_base_text_units_metadata_included_in_chunk():
@@ -53,8 +51,6 @@ async def test_create_base_text_units_metadata_included_in_chunk():
context = await create_test_context()

config = create_graphrag_config({"models": DEFAULT_MODEL_CONFIG})
# test data was created with 4o, so we need to match the encoding for chunks to be identical
config.chunks.encoding_model = "o200k_base"
config.input.metadata = ["title"]
config.chunks.prepend_metadata = True
config.chunks.chunk_size_includes_metadata = True
@@ -65,4 +61,4 @@

actual = await load_table_from_storage("text_units", context.output_storage)
# only check the columns from the base workflow - our expected table is the final and will have more
compare_outputs(actual, expected, columns=["text", "document_ids", "n_tokens"])
compare_outputs(actual, expected, columns=["text", "document_id", "n_tokens"])