From 85b17176fdc64739eaf5d81758151a038b011057 Mon Sep 17 00:00:00 2001 From: AshAnand34 Date: Mon, 26 May 2025 00:21:27 -0700 Subject: [PATCH 1/2] Add DuckDB integration with example usage and tests --- docs/components/model_catalog.md | 15 +++++++++ examples/Models/duckdb_rag_example.py | 28 ++++++++++++++++ llmware/duckdb_integration.py | 44 +++++++++++++++++++++++++ llmware/requirements.txt | 1 + tests/models/test_duckdb_integration.py | 29 ++++++++++++++++ 5 files changed, 117 insertions(+) create mode 100644 examples/Models/duckdb_rag_example.py create mode 100644 llmware/duckdb_integration.py create mode 100644 tests/models/test_duckdb_integration.py diff --git a/docs/components/model_catalog.md b/docs/components/model_catalog.md index 0982ae77..1ddd46e1 100644 --- a/docs/components/model_catalog.md +++ b/docs/components/model_catalog.md @@ -195,6 +195,21 @@ ModelCatalog().register_open_chat_model("my_open_chat_model2", model_type="chat") ``` +## DuckDB Integration + +DuckDB is an in-process SQL OLAP database management system designed for analytical workloads. It also supports vector similarity search (fixed-size `ARRAY` columns plus the optional `vss` extension), making it a good fit for Retrieval-Augmented Generation (RAG) workflows. + +### Features +- Lightweight and efficient, even on local machines. +- Supports vectorized execution for analytical queries. +- Supports similarity search for RAG workflows at the DuckDB level; the `DuckDBIntegration` wrapper itself only exposes table creation, inserts, and raw SQL queries. + +### Example Usage +Refer to the example script `examples/Models/duckdb_rag_example.py` for a demonstration of how to use DuckDB with `llmware`. + +### Testing +Tests for DuckDB integration can be found in `tests/models/test_duckdb_integration.py`. + Need help or have questions? 
============================ diff --git a/examples/Models/duckdb_rag_example.py b/examples/Models/duckdb_rag_example.py new file mode 100644 index 00000000..d9228a65 --- /dev/null +++ b/examples/Models/duckdb_rag_example.py @@ -0,0 +1,28 @@ +from llmware.duckdb_integration import DuckDBIntegration + +def main(): + # Initialize DuckDB + db = DuckDBIntegration(db_path='example.duckdb') + + # Create a table for documents + db.create_table('documents', 'id INTEGER, content TEXT') + + # Insert example data + documents = [ + (1, 'DuckDB is an in-process SQL OLAP database management system.'), + (2, 'It is designed for analytical workloads and supports vectorized execution.'), + (3, 'DuckDB recently added support for similarity search using vectors.') + ] + db.insert_data('documents', documents) + + # Query the table + results = db.query('SELECT * FROM documents;') + print('Documents in the database:') + for row in results: + print(row) + + # Close the connection + db.close() + +if __name__ == '__main__': + main() diff --git a/llmware/duckdb_integration.py b/llmware/duckdb_integration.py new file mode 100644 index 00000000..b27211b0 --- /dev/null +++ b/llmware/duckdb_integration.py @@ -0,0 +1,44 @@ +import duckdb + +class DuckDBIntegration: + def __init__(self, db_path: str = ':memory:'): + """Initialize the DuckDB connection. + + Args: + db_path (str): Path to the DuckDB database file. Defaults to in-memory. + """ + self.connection = duckdb.connect(database=db_path, read_only=False) + + def create_table(self, table_name: str, schema: str): + """Create a table in the DuckDB database. + + Args: + table_name (str): Name of the table to create. + schema (str): Schema definition for the table. + """ + self.connection.execute(f"CREATE TABLE IF NOT EXISTS {table_name} ({schema});") + + def insert_data(self, table_name: str, data: list): + """Insert data into a DuckDB table. + + Args: + table_name (str): Name of the table to insert data into. 
+ data (list): List of tuples representing rows to insert. + """ + placeholders = ', '.join(['?'] * len(data[0])) + self.connection.executemany(f"INSERT INTO {table_name} VALUES ({placeholders});", data) + + def query(self, sql: str): + """Execute a query on the DuckDB database. + + Args: + sql (str): SQL query to execute. + + Returns: + list: Query results. + """ + return self.connection.execute(sql).fetchall() + + def close(self): + """Close the DuckDB connection.""" + self.connection.close() diff --git a/llmware/requirements.txt b/llmware/requirements.txt index bd79605c..72dda02f 100644 --- a/llmware/requirements.txt +++ b/llmware/requirements.txt @@ -7,5 +7,6 @@ huggingface-hub>=0.19.4 tokenizers>=0.15.0 boto3>=1.24.53 colorama==0.4.6 +duckdb>=0.7.1 diff --git a/tests/models/test_duckdb_integration.py b/tests/models/test_duckdb_integration.py new file mode 100644 index 00000000..7a4ee207 --- /dev/null +++ b/tests/models/test_duckdb_integration.py @@ -0,0 +1,29 @@ +import pytest +from llmware.duckdb_integration import DuckDBIntegration + +def test_duckdb_integration(): + # Initialize DuckDB in memory + db = DuckDBIntegration() + + # Create a table + db.create_table('test_table', 'id INTEGER, name TEXT') + + # Insert data + data = [ + (1, 'Alice'), + (2, 'Bob'), + (3, 'Charlie') + ] + db.insert_data('test_table', data) + + # Query the data + results = db.query('SELECT * FROM test_table;') + + # Validate the results + assert len(results) == 3 + assert results[0] == (1, 'Alice') + assert results[1] == (2, 'Bob') + assert results[2] == (3, 'Charlie') + + # Close the connection + db.close() From 670e6ed6f23e0febb3156677de4d20c463f36dc5 Mon Sep 17 00:00:00 2001 From: AshAnand34 Date: Mon, 26 May 2025 00:25:14 -0700 Subject: [PATCH 2/2] Minor documentation fix --- tests/models/test_duckdb_integration.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/models/test_duckdb_integration.py b/tests/models/test_duckdb_integration.py index 
7a4ee207..94013fa0 100644 --- a/tests/models/test_duckdb_integration.py +++ b/tests/models/test_duckdb_integration.py @@ -1,4 +1,4 @@ -import pytest +"""Test for DuckDB integration in llmware.""" from llmware.duckdb_integration import DuckDBIntegration def test_duckdb_integration():