From 50d8efd3e4fd966d866b883efd81d4becd4af7f1 Mon Sep 17 00:00:00 2001 From: Stuart Page <38261603+stuartjohnpage@users.noreply.github.com> Date: Thu, 4 Apr 2024 13:04:04 -0500 Subject: [PATCH] adds metadata option --- README.md | 22 +++-- lib/text_chunker.ex | 6 +- lib/text_chunker/chunk.ex | 6 +- .../recursive_chunk/recursive_chunk.ex | 7 +- test/recursive_chunk_test.exs | 89 +++++++++++++++++++ 5 files changed, 117 insertions(+), 13 deletions(-) diff --git a/README.md b/README.md index f78b08e..eb032a5 100644 --- a/README.md +++ b/README.md @@ -17,7 +17,7 @@ Fill the gap in the Elixir ecosystem for a good semantic text chunker, and give - Semantic Chunking: Prioritizes chunking text into meaningful blocks based on separators relevant to the specified format (e.g., headings, paragraphs in Markdown). - Configurable Chunking: Fine-tune the chunking process with options for, text chunk size, overlap and format. -- Metadata Tracking: Automatically generates Chunk structs containing byte range information for accurately reassembling the original text if needed. +- Metadata Tracking: Automatically generates `Chunk` structs containing byte range information for accurately reassembling the original text if needed. Additional custom metadata can also be added to all chunks. - Extensibility: Designed to accommodate additional chunking strategies in the future. ## Installation @@ -48,7 +48,7 @@ text = "Your text to be split..." chunks = TextChunker.split(text) ``` -This will chunk up your text using the default parameters - a chunk size of `1000`, chunk overlap of `200`, format of `:plaintext` and using the `RecursiveChunk` strategy. +This will chunk up your text using the default parameters - a chunk size of `1000`, chunk overlap of `200`, format of `:plaintext` and using the `RecursiveChunk` strategy. It will add no additional `:metadata`. The split method returns `Chunks` of your text. These chunks include the start and end bytes of each chunk. 
@@ -57,6 +57,7 @@ The split method returns `Chunks` of your text. These chunks include the start a start_byte: 0, end_byte: 44, text: "This is a sample text. It will be split into", + metadata: %{} } ``` @@ -67,6 +68,7 @@ If you wish to adjust these parameters, configuration can optionally be passed v - `chunk_size` - The approximate target chunk size, as measured per code points. This means that both `a` and `👻` count as one. Chunks will not exceed this maximum, but may sometimes be smaller. **Important note** This means that graphemes *may* be split. For example, `👩‍🚒` may be split into `👩,🚒` or not depending on the split boundary. - `chunk_overlap` - The contextual overlap between chunks, as measured per code point. Overlap is *not* guaranteed; again this should be treated as a maximum. The size of an individual overlap will depend on the semantics of the text being split. - `format` - What informs separator selection. Because we are trying to preserve meaning between the chunks, the format of the text we are splitting is important. It's important to split newlines in plain text; it's important to split `###` headings in markdown. + - `metadata` - Any additional fields to be added to each chunk. This can be useful for adding the name or title of the document the chunk comes from. ```elixir text = """ @@ -74,7 +76,7 @@ text = """ Let's split your text up properly! """ -opts = [chunk_size: 10, chunk_overlap: 5, format: :markdown] +opts = [chunk_size: 10, chunk_overlap: 5, format: :markdown, metadata: %{title: "A split document title", chapter: 1}] chunks = TextChunker.split(text, opts) ``` @@ -97,12 +99,13 @@ iex(10)> TextChunker.split(text) %TextChunker.Chunk{ start_byte: 0, end_byte: 97, - text: "This is a sample text. It will be split into properly-sized chunks using the TextChunker library." + text: "This is a sample text. 
It will be split into properly-sized chunks using the TextChunker library.", + metadata: %{} } ] text = "This is a sample text. It will be split into properly-sized chunks using the TextChunker library." -opts = [chunk_size: 50, chunk_overlap: 5, format: :plaintext, strategy: TextChunker.Strategies.RecursiveChunk] +opts = [chunk_size: 50, chunk_overlap: 5, format: :plaintext, strategy: TextChunker.Strategies.RecursiveChunk, metadata: %{title: "Sample Text"}] iex(10)> TextChunker.split(text, opts) @@ -110,17 +113,20 @@ iex(10)> TextChunker.split(text, opts) %TextChunker.Chunk{ start_byte: 0, end_byte: 44, - text: "This is a sample text. It will be split into" + text: "This is a sample text. It will be split into", + metadata: %{title: "Sample Text"} }, %TextChunker.Chunk{ start_byte: 39, end_byte: 88, - text: " into properly-sized chunks using the TextChunker" + text: " into properly-sized chunks using the TextChunker", + metadata: %{title: "Sample Text"} }, %TextChunker.Chunk{ start_byte: 88, end_byte: 97, - text: " library." + text: " library.", + metadata: %{title: "Sample Text"} } ] ``` diff --git a/lib/text_chunker.ex b/lib/text_chunker.ex index 6d0f8bd..46c952f 100644 --- a/lib/text_chunker.ex +++ b/lib/text_chunker.ex @@ -6,7 +6,7 @@ defmodule TextChunker do * **Customizable Splitting:** Allows the splitting strategy to be customized via the `:strategy` option. * **Size and Overlap Control:** Provides options for `:chunk_size` and `:chunk_overlap`. - * **Metadata Tracking:** Generates `Chunk` structs containing byte range information. 
+ * **Metadata Tracking:** Generates `Chunk` structs containing byte range information, and provides an option to attach custom metadata. """ alias TextChunker.Strategies.RecursiveChunk @@ -14,7 +14,8 @@ defmodule TextChunker do chunk_size: 2000, chunk_overlap: 200, strategy: RecursiveChunk, - format: :plaintext + format: :plaintext, + metadata: %{} ] @doc """ @@ -26,6 +27,7 @@ defmodule TextChunker do * `:chunk_overlap` (integer, default: 200) - Number of overlapping code points between consecutive chunks to preserve context. * `:strategy` (function, default: `&RecursiveChunk.split/2`) - A function taking two arguments (text and options) and returning a list of `%Chunk{}` structs. Currently only `&RecursiveChunk.split/2` is fully supported. * `:format` (atom, default: `:plaintext`) - The format of the input text. Used to determine where to split the text in some strategies. + * `:metadata` (map, default: `%{}`) - Any optional additional metadata to be added to each chunk. ## Examples diff --git a/lib/text_chunker/chunk.ex b/lib/text_chunker/chunk.ex index 0763afd..256a054 100644 --- a/lib/text_chunker/chunk.ex +++ b/lib/text_chunker/chunk.ex @@ -9,8 +9,10 @@ defmodule TextChunker.Chunk do # Byte offset marking the end of the chunk end_byte: integer(), # The textual content of this chunk - text: String.t() + text: String.t(), + # Any additional metadata + metadata: map() } - defstruct [:start_byte, :end_byte, :text] + defstruct [:start_byte, :end_byte, :text, metadata: %{}] end diff --git a/lib/text_chunker/strategies/recursive_chunk/recursive_chunk.ex b/lib/text_chunker/strategies/recursive_chunk/recursive_chunk.ex index 33192bb..8ba74af 100644 --- a/lib/text_chunker/strategies/recursive_chunk/recursive_chunk.ex +++ b/lib/text_chunker/strategies/recursive_chunk/recursive_chunk.ex @@ -90,7 +90,12 @@ defmodule TextChunker.Strategies.RecursiveChunk do text: chunk } - chunks ++ [chunk] + if Enum.empty?(opts[:metadata]) do + chunks ++ [chunk] + else + metadata = 
opts[:metadata] + chunks ++ [%{chunk | metadata: metadata}] + end end end) diff --git a/test/recursive_chunk_test.exs b/test/recursive_chunk_test.exs index 9942592..af679df 100644 --- a/test/recursive_chunk_test.exs +++ b/test/recursive_chunk_test.exs @@ -355,4 +355,93 @@ defmodule TextChunkerTest do assert result == expected_result end end + + describe "chunker returns metadata chunks correctly" do + test "returns an empty map as metadata when none is given" do + opts = [ + chunk_size: 50, + chunk_overlap: 10, + format: :plaintext + ] + + text = + "This is quite a short sentence. But what a headache does the darn thing create! Especially when splitting is involved. Do not look for meaning." + + result = + TextChunker.split(text, opts) + + expected_result = [ + %TextChunker.Chunk{ + start_byte: 0, + end_byte: 42, + text: "This is quite a short sentence. But what a", + metadata: %{} + }, + %TextChunker.Chunk{ + start_byte: 35, + end_byte: 79, + text: " what a headache does the darn thing create!", + metadata: %{} + }, + %TextChunker.Chunk{ + start_byte: 71, + end_byte: 121, + text: " create! Especially when splitting is involved. Do", + metadata: %{} + }, + %TextChunker.Chunk{ + start_byte: 118, + end_byte: 143, + text: " Do not look for meaning.", + metadata: %{} + } + ] + + assert result == expected_result + end + + test "returns chunks with metadata" do + opts = [ + chunk_size: 50, + chunk_overlap: 10, + format: :plaintext, + metadata: %{title: "Short Sentence: A Side-Splitting Endeavour"} + ] + + text = + "This is quite a short sentence. But what a headache does the darn thing create! Especially when splitting is involved. Do not look for meaning." + + result = + TextChunker.split(text, opts) + + expected_result = [ + %TextChunker.Chunk{ + start_byte: 0, + end_byte: 42, + text: "This is quite a short sentence. 
But what a", + metadata: %{title: "Short Sentence: A Side-Splitting Endeavour"} + }, + %TextChunker.Chunk{ + start_byte: 35, + end_byte: 79, + text: " what a headache does the darn thing create!", + metadata: %{title: "Short Sentence: A Side-Splitting Endeavour"} + }, + %TextChunker.Chunk{ + start_byte: 71, + end_byte: 121, + text: " create! Especially when splitting is involved. Do", + metadata: %{title: "Short Sentence: A Side-Splitting Endeavour"} + }, + %TextChunker.Chunk{ + start_byte: 118, + end_byte: 143, + text: " Do not look for meaning.", + metadata: %{title: "Short Sentence: A Side-Splitting Endeavour"} + } + ] + + assert result == expected_result + end + end end