From fbdb4d52cca3100dd447618005681a1cc5aa366c Mon Sep 17 00:00:00 2001 From: Ann Zhang Date: Thu, 8 May 2025 01:34:59 -0700 Subject: [PATCH 01/27] add default for doc_uri and primary_key Signed-off-by: Ann Zhang --- src/databricks_ai_bridge/utils/vector_search.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/databricks_ai_bridge/utils/vector_search.py b/src/databricks_ai_bridge/utils/vector_search.py index 9fba15c1..5f6ff5b5 100644 --- a/src/databricks_ai_bridge/utils/vector_search.py +++ b/src/databricks_ai_bridge/utils/vector_search.py @@ -161,8 +161,8 @@ def validate_and_get_return_columns( columns: List[str], text_column: str, index_details: IndexDetails, - doc_uri: str, - primary_key: str, + doc_uri: str = None, + primary_key: str = None, ) -> List[str]: """ Get a list of columns to retrieve from the index. From 55d5d4686ccb0259d66d25dcb439dd8c09c043f6 Mon Sep 17 00:00:00 2001 From: Ann Zhang Date: Thu, 8 May 2025 01:38:18 -0700 Subject: [PATCH 02/27] update Signed-off-by: Ann Zhang --- .github/workflows/main.yml | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index 521c55b0..17f1b327 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -55,6 +55,7 @@ jobs: strategy: matrix: python-version: ["3.10"] + databricks-langchain-version: ["0.1.0", "0.2.0", "0.3.0", "0.4.0", "dev"] # Include whatever past versions you need timeout-minutes: 20 steps: - name: Checkout code @@ -63,11 +64,17 @@ jobs: uses: actions/setup-python@v5 with: python-version: ${{ matrix.python-version }} - - name: Install dependencies + - name: Install core package run: | pip install . - pip install integrations/langchain[dev] - - name: Run tests + - name: Install databricks-langchain version ${{ matrix.databricks-langchain-version }} + run: | + if [ "${{ matrix.databricks-langchain-version }}" = "dev" ]; then + pip install -e integrations/langchain[dev] + else + pip install "databricks-langchain==${{ matrix.databricks-langchain-version }}" + fi + - name: Run langchain integration tests run: | pytest integrations/langchain/tests/unit_tests From 43372bbd28487ab9e5ecbcfacaa372641ae63c9a Mon Sep 17 00:00:00 2001 From: Ann Zhang Date: Thu, 8 May 2025 01:44:40 -0700 Subject: [PATCH 03/27] test Signed-off-by: Ann Zhang --- .github/workflows/main.yml | 4 ++-- src/databricks_ai_bridge/utils/vector_search.py | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index 17f1b327..ee9eae9f 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -66,11 +66,11 @@ jobs: python-version: ${{ matrix.python-version }} - name: Install core package run: | - pip install . + pip install .[dev] - name: Install databricks-langchain version ${{ matrix.databricks-langchain-version }} run: | if [ "${{ matrix.databricks-langchain-version }}" = "dev" ]; then - pip install -e integrations/langchain[dev] + pip install integrations/langchain[dev] else pip install "databricks-langchain==${{ matrix.databricks-langchain-version }}" fi diff --git a/src/databricks_ai_bridge/utils/vector_search.py b/src/databricks_ai_bridge/utils/vector_search.py index 5f6ff5b5..9fba15c1 100644 --- a/src/databricks_ai_bridge/utils/vector_search.py +++ b/src/databricks_ai_bridge/utils/vector_search.py @@ -161,8 +161,8 @@ def validate_and_get_return_columns( columns: List[str], text_column: str, index_details: IndexDetails, - doc_uri: str = None, - primary_key: str = None, + doc_uri: str, + primary_key: str, ) -> List[str]: """ Get a list of columns to retrieve from the index. From 3a1a233f950eecda0af8fc04e42ea61f08fdd161 Mon Sep 17 00:00:00 2001 From: Ann Zhang Date: Thu, 8 May 2025 01:56:59 -0700 Subject: [PATCH 04/27] test Signed-off-by: Ann Zhang --- integrations/langchain/tests/unit_tests/test_imports.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/integrations/langchain/tests/unit_tests/test_imports.py b/integrations/langchain/tests/unit_tests/test_imports.py index f3624a07..24a745a1 100644 --- a/integrations/langchain/tests/unit_tests/test_imports.py +++ b/integrations/langchain/tests/unit_tests/test_imports.py @@ -1,3 +1,10 @@ +import pytest +from packaging import version +import databricks_langchain + +if version.parse(databricks_langchain.__version__) < version.parse("0.4.0"): + pytest.skip("Test requires databricks-langchain >= 0.4.0", allow_module_level=True) + from databricks_langchain import ( ChatDatabricks, DatabricksEmbeddings, From 55eb076028ea677715834e837f2e6447ec163053 Mon Sep 17 00:00:00 2001 From: Ann Zhang Date: Thu, 8 May 2025 02:01:56 -0700 Subject: [PATCH 05/27] test Signed-off-by: Ann Zhang --- integrations/langchain/tests/unit_tests/test_imports.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/integrations/langchain/tests/unit_tests/test_imports.py b/integrations/langchain/tests/unit_tests/test_imports.py index 24a745a1..a25595cf 100644 --- a/integrations/langchain/tests/unit_tests/test_imports.py +++ b/integrations/langchain/tests/unit_tests/test_imports.py @@ -1,9 +1,10 @@ +from importlib.metadata import version as get_version + import pytest -from packaging import version -import databricks_langchain +from packaging import version as pkg_version -if version.parse(databricks_langchain.__version__) < version.parse("0.4.0"): - pytest.skip("Test requires databricks-langchain >= 0.4.0", allow_module_level=True) +if pkg_version.parse(get_version("databricks-langchain")) < pkg_version.parse("0.4.0"): + pytest.skip("Requires databricks-langchain >= 0.4.0", allow_module_level=True) from databricks_langchain import ( ChatDatabricks, From a8490876804d99c0173ee0107c803c383781455a Mon Sep 17 00:00:00 2001 From: Ann Zhang Date: Thu, 8 May 2025 11:02:05 -0700 Subject: [PATCH 06/27] none Signed-off-by: Ann Zhang --- src/databricks_ai_bridge/utils/vector_search.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/databricks_ai_bridge/utils/vector_search.py b/src/databricks_ai_bridge/utils/vector_search.py index 9fba15c1..5f6ff5b5 100644 --- a/src/databricks_ai_bridge/utils/vector_search.py +++ b/src/databricks_ai_bridge/utils/vector_search.py @@ -161,8 +161,8 @@ def validate_and_get_return_columns( columns: List[str], text_column: str, index_details: IndexDetails, - doc_uri: str, - primary_key: str, + doc_uri: str = None, + primary_key: str = None, ) -> List[str]: """ Get a list of columns to retrieve from the index. From 2ab32440909e4bdbd36d16a84ac6e53eb25974f9 Mon Sep 17 00:00:00 2001 From: Ann Zhang Date: Thu, 8 May 2025 16:27:39 -0700 Subject: [PATCH 07/27] try Signed-off-by: Ann Zhang --- .github/workflows/main.yml | 36 +++++++++++++++++++++++++++++------- 1 file changed, 29 insertions(+), 7 deletions(-) diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index ee9eae9f..b062d12b 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -67,16 +67,38 @@ jobs: - name: Install core package run: | pip install .[dev] - - name: Install databricks-langchain version ${{ matrix.databricks-langchain-version }} + # Cache the downloaded and extracted langchain package + - name: Cache langchain package + id: cache-langchain + uses: actions/cache@v3 + with: + path: /tmp/langchain-tests/databricks-langchain-${{ matrix.databricks-langchain-version }} + key: ${{ runner.os }}-langchain-${{ matrix.databricks-langchain-version }}-v1 + - name: Download specific langchain version tests + run: | + # Create temp directory for tests + mkdir -p /tmp/langchain-tests + cd /tmp/langchain-tests + + # Download the package + pip download databricks-langchain==${{ matrix.databricks-langchain-version }} --no-deps --no-binary=:all: + + # Extract it + tar -xzf databricks-langchain-${{ matrix.databricks-langchain-version }}.tar.gz + - name: Install test dependencies run: | - if [ "${{ matrix.databricks-langchain-version }}" = "dev" ]; then - pip install integrations/langchain[dev] - else - pip install "databricks-langchain==${{ matrix.databricks-langchain-version }}" + cd /tmp/langchain-tests/databricks-langchain-${{ matrix.databricks-langchain-version }} + pip install pytest pytest-cov + + # Install remaining dependencies (excluding ai-bridge which we have from our PR) + if [ -f requirements.txt ]; then + grep -v "databricks-ai-bridge" requirements.txt > filtered_requirements.txt + pip install -r filtered_requirements.txt fi - - name: Run langchain integration tests + - name: Run databricks-langchain tests run: | - pytest integrations/langchain/tests/unit_tests + cd /tmp/langchain-tests/databricks-langchain-${{ matrix.databricks-langchain-version }} + python -m pytest tests -v openai_test: runs-on: ubuntu-latest From 5aca188dcdf67a99c66739fb30b69a25ce1af0f5 Mon Sep 17 00:00:00 2001 From: Ann Zhang Date: Thu, 8 May 2025 16:36:16 -0700 Subject: [PATCH 08/27] try Signed-off-by: Ann Zhang --- .github/workflows/main.yml | 60 ++++++++++++++++++++++++-------------- 1 file changed, 38 insertions(+), 22 deletions(-) diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index b062d12b..daee7933 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -50,12 +50,13 @@ jobs: run: | pytest tests/ - langchain_test: + langchain_cross_version_test: runs-on: ubuntu-latest strategy: matrix: python-version: ["3.10"] - databricks-langchain-version: ["0.1.0", "0.2.0", "0.3.0", "0.4.0", "dev"] # Include whatever past versions you need + databricks-langchain-version: ["0.1.0", "0.2.0", "0.3.0", "0.4.0"] + fail-fast: false timeout-minutes: 20 steps: - name: Checkout code @@ -64,41 +65,56 @@ jobs: uses: actions/setup-python@v5 with: python-version: ${{ matrix.python-version }} + cache: 'pip' - name: Install core package run: | pip install .[dev] - # Cache the downloaded and extracted langchain package - - name: Cache langchain package + - name: Cache langchain tests id: cache-langchain uses: actions/cache@v3 with: - path: /tmp/langchain-tests/databricks-langchain-${{ matrix.databricks-langchain-version }} - key: ${{ runner.os }}-langchain-${{ matrix.databricks-langchain-version }}-v1 - - name: Download specific langchain version tests + path: /tmp/langchain-tests/version-${{ matrix.databricks-langchain-version }} + key: ${{ runner.os }}-langchain-tests-${{ matrix.databricks-langchain-version }} + - name: Setup langchain tests + if: steps.cache-langchain.outputs.cache-hit != 'true' run: | - # Create temp directory for tests - mkdir -p /tmp/langchain-tests - cd /tmp/langchain-tests + mkdir -p /tmp/langchain-tests/version-${{ matrix.databricks-langchain-version }} + cd /tmp/langchain-tests/version-${{ matrix.databricks-langchain-version }} - # Download the package - pip download databricks-langchain==${{ matrix.databricks-langchain-version }} --no-deps --no-binary=:all: + pip install databricks-langchain==${{ matrix.databricks-langchain-version }} --no-deps --target=./package_files + + # Copy tests directory and setup files + if [ -d "./package_files/databricks_langchain/tests" ]; then + cp -r ./package_files/databricks_langchain/tests ./tests + elif [ -d "./package_files/tests" ]; then + cp -r ./package_files/tests ./tests + else + echo "Error: Tests directory not found in the installed package" + find ./package_files -type d -name "test*" | grep -v "__pycache__" + # Try to find tests directory + TEST_DIR=$(find ./package_files -type d -name "test*" | grep -v "__pycache__" | head -1) + if [ -n "$TEST_DIR" ]; then + cp -r $TEST_DIR ./tests + fi + fi - # Extract it - tar -xzf databricks-langchain-${{ matrix.databricks-langchain-version }}.tar.gz + # Copy requirements and setup files if they exist + find ./package_files -name "requirements*.txt" -exec cp {} . \; + find ./package_files -name "setup.py" -exec cp {} . \; - name: Install test dependencies run: | - cd /tmp/langchain-tests/databricks-langchain-${{ matrix.databricks-langchain-version }} - pip install pytest pytest-cov + cd /tmp/langchain-tests/version-${{ matrix.databricks-langchain-version }} - # Install remaining dependencies (excluding ai-bridge which we have from our PR) if [ -f requirements.txt ]; then - grep -v "databricks-ai-bridge" requirements.txt > filtered_requirements.txt - pip install -r filtered_requirements.txt + grep -v "databricks-ai-bridge" requirements.txt > filtered_requirements.txt || echo "No databricks-ai-bridge in requirements" + if [ -s filtered_requirements.txt ]; then + pip install -r filtered_requirements.txt + fi fi - - name: Run databricks-langchain tests + - name: Run tests run: | - cd /tmp/langchain-tests/databricks-langchain-${{ matrix.databricks-langchain-version }} - python -m pytest tests -v + cd /tmp/langchain-tests/version-${{ matrix.databricks-langchain-version }} + pytest tests/unit_tests openai_test: runs-on: ubuntu-latest From 9a2429b2a46d80d26c513ed566ba695f237efc9b Mon Sep 17 00:00:00 2001 From: Ann Zhang Date: Thu, 8 May 2025 17:00:38 -0700 Subject: [PATCH 09/27] update Signed-off-by: Ann Zhang --- .github/workflows/main.yml | 55 ++----------------- .../utils/vector_search.py | 7 ++- .../vector_search_retriever_tool.py | 2 +- 3 files changed, 12 insertions(+), 52 deletions(-) diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index daee7933..521c55b0 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -50,13 +50,11 @@ jobs: run: | pytest tests/ - langchain_cross_version_test: + langchain_test: runs-on: ubuntu-latest strategy: matrix: python-version: ["3.10"] - databricks-langchain-version: ["0.1.0", "0.2.0", "0.3.0", "0.4.0"] - fail-fast: false timeout-minutes: 20 steps: - name: Checkout code @@ -65,56 +63,13 @@ jobs: uses: actions/setup-python@v5 with: python-version: ${{ matrix.python-version }} - cache: 'pip' - - name: Install core package - run: | - pip install .[dev] - - name: Cache langchain tests - id: cache-langchain - uses: actions/cache@v3 - with: - path: /tmp/langchain-tests/version-${{ matrix.databricks-langchain-version }} - key: ${{ runner.os }}-langchain-tests-${{ matrix.databricks-langchain-version }} - - name: Setup langchain tests - if: steps.cache-langchain.outputs.cache-hit != 'true' - run: | - mkdir -p /tmp/langchain-tests/version-${{ matrix.databricks-langchain-version }} - cd /tmp/langchain-tests/version-${{ matrix.databricks-langchain-version }} - - pip install databricks-langchain==${{ matrix.databricks-langchain-version }} --no-deps --target=./package_files - - # Copy tests directory and setup files - if [ -d "./package_files/databricks_langchain/tests" ]; then - cp -r ./package_files/databricks_langchain/tests ./tests - elif [ -d "./package_files/tests" ]; then - cp -r ./package_files/tests ./tests - else - echo "Error: Tests directory not found in the installed package" - find ./package_files -type d -name "test*" | grep -v "__pycache__" - # Try to find tests directory - TEST_DIR=$(find ./package_files -type d -name "test*" | grep -v "__pycache__" | head -1) - if [ -n "$TEST_DIR" ]; then - cp -r $TEST_DIR ./tests - fi - fi - - # Copy requirements and setup files if they exist - find ./package_files -name "requirements*.txt" -exec cp {} . \; - find ./package_files -name "setup.py" -exec cp {} . \; - - name: Install test dependencies + - name: Install dependencies run: | - cd /tmp/langchain-tests/version-${{ matrix.databricks-langchain-version }} - - if [ -f requirements.txt ]; then - grep -v "databricks-ai-bridge" requirements.txt > filtered_requirements.txt || echo "No databricks-ai-bridge in requirements" - if [ -s filtered_requirements.txt ]; then - pip install -r filtered_requirements.txt - fi - fi + pip install . + pip install integrations/langchain[dev] - name: Run tests run: | - cd /tmp/langchain-tests/version-${{ matrix.databricks-langchain-version }} - pytest tests/unit_tests + pytest integrations/langchain/tests/unit_tests openai_test: runs-on: ubuntu-latest diff --git a/src/databricks_ai_bridge/utils/vector_search.py b/src/databricks_ai_bridge/utils/vector_search.py index 5f6ff5b5..a5e38578 100644 --- a/src/databricks_ai_bridge/utils/vector_search.py +++ b/src/databricks_ai_bridge/utils/vector_search.py @@ -112,7 +112,9 @@ def get_metadata(columns: List[str], result: List[Any], retriever_schema, ignore def parse_vector_search_response( search_resp: Dict, - retriever_schema: RetrieverSchema, + index_details: IndexDetails, # deprecated + text_column: str, # deprecated + retriever_schema: RetrieverSchema = None, # required as of 0.5.0 ignore_cols: Optional[List[str]] = None, document_class: Any = dict, ) -> List[Tuple[Dict, float]]: @@ -120,6 +122,9 @@ def parse_vector_search_response( Parse the search response into a list of Documents with score. The document_class parameter is used to specify the class of the document to be created. """ + if retriever_schema is None: + raise ValueError("retriever_schema is required as of v0.5.0") + if ignore_cols is None: ignore_cols = [] diff --git a/src/databricks_ai_bridge/vector_search_retriever_tool.py b/src/databricks_ai_bridge/vector_search_retriever_tool.py index a289ef6a..a31b296b 100644 --- a/src/databricks_ai_bridge/vector_search_retriever_tool.py +++ b/src/databricks_ai_bridge/vector_search_retriever_tool.py @@ -143,7 +143,7 @@ def _get_default_tool_description(self, index_details: IndexDetails) -> str: return description def _get_resources( - self, index_name: str, embedding_endpoint: str, index_details: IndexDetails + self, index_name: str, embedding_endpoint: str, index_details: IndexDetails = None ) -> List[Resource]: resources = [] if index_name: From b151d0448e539d52b091306ba3f4ce4ef5e6878d Mon Sep 17 00:00:00 2001 From: Ann Zhang Date: Thu, 8 May 2025 17:06:12 -0700 Subject: [PATCH 10/27] update Signed-off-by: Ann Zhang --- .../src/databricks_langchain/vectorstores.py | 6 +- .../vector_search_retriever_tool.py | 2 +- .../utils/vector_search.py | 5 +- .../contracts/test_signatures.py | 100 +++++++++++ .../contracts/test_vector_search_contract.py | 160 ++++++++++++++++++ .../utils/test_vector_search.py | 2 +- 6 files changed, 268 insertions(+), 7 deletions(-) create mode 100644 tests/databricks_ai_bridge/contracts/test_signatures.py create mode 100644 tests/databricks_ai_bridge/contracts/test_vector_search_contract.py diff --git a/integrations/langchain/src/databricks_langchain/vectorstores.py b/integrations/langchain/src/databricks_langchain/vectorstores.py index c0d35081..e4d64617 100644 --- a/integrations/langchain/src/databricks_langchain/vectorstores.py +++ b/integrations/langchain/src/databricks_langchain/vectorstores.py @@ -473,7 +473,7 @@ def similarity_search_with_score( ) search_resp = self.index.similarity_search(**kwargs) return parse_vector_search_response( - search_resp, self._retriever_schema, document_class=Document + search_resp, retriever_schema=self._retriever_schema, document_class=Document ) def _select_relevance_score_fn(self) -> Callable[[float], float]: @@ -586,7 +586,7 @@ def similarity_search_by_vector_with_score( **kwargs, ) return parse_vector_search_response( - search_resp, self._retriever_schema, document_class=Document + search_resp, retriever_schema=self._retriever_schema, document_class=Document ) def max_marginal_relevance_search( @@ -723,7 +723,7 @@ def max_marginal_relevance_search_by_vector( ignore_cols: List = [embedding_column] if embedding_column not in self._columns else [] candidates = parse_vector_search_response( search_resp, - self._retriever_schema, + retriever_schema=self._retriever_schema, ignore_cols=ignore_cols, document_class=Document, ) diff --git a/integrations/llamaindex/src/databricks_llamaindex/vector_search_retriever_tool.py b/integrations/llamaindex/src/databricks_llamaindex/vector_search_retriever_tool.py index c7ae3243..f9b86769 100644 --- a/integrations/llamaindex/src/databricks_llamaindex/vector_search_retriever_tool.py +++ b/integrations/llamaindex/src/databricks_llamaindex/vector_search_retriever_tool.py @@ -127,7 +127,7 @@ def get_query_text_vector(query: str) -> Tuple[Optional[str], Optional[List[floa ) search_resp = self._index.similarity_search(**kwargs) return parse_vector_search_response( - search_resp, self._retriever_schema, document_class=dict + search_resp, retriever_schema=self._retriever_schema, document_class=dict ) # Create tool metadata diff --git a/src/databricks_ai_bridge/utils/vector_search.py b/src/databricks_ai_bridge/utils/vector_search.py index a5e38578..3721d0d1 100644 --- a/src/databricks_ai_bridge/utils/vector_search.py +++ b/src/databricks_ai_bridge/utils/vector_search.py @@ -112,8 +112,9 @@ def get_metadata(columns: List[str], result: List[Any], retriever_schema, ignore def parse_vector_search_response( search_resp: Dict, - index_details: IndexDetails, # deprecated - text_column: str, # deprecated + index_details: IndexDetails = None, # deprecated + text_column: str = None, # deprecated + *, retriever_schema: RetrieverSchema = None, # required as of 0.5.0 ignore_cols: Optional[List[str]] = None, document_class: Any = dict, diff --git a/tests/databricks_ai_bridge/contracts/test_signatures.py b/tests/databricks_ai_bridge/contracts/test_signatures.py new file mode 100644 index 00000000..08fe3439 --- /dev/null +++ b/tests/databricks_ai_bridge/contracts/test_signatures.py @@ -0,0 +1,100 @@ +import inspect +import pytest +from unittest.mock import MagicMock + +# Import the modules that databricks-langchain depends on +from databricks_ai_bridge import ( + IndexDetails, + IndexType, + RetrieverSchema, + get_metadata, + parse_vector_search_response, + validate_and_get_text_column, + validate_and_get_return_columns +) + +from databricks_ai_bridge.test_utils.vector_search import ( # noqa: F401 + DELTA_SYNC_INDEX, + INDEX_DETAILS, + EXAMPLE_SEARCH_RESPONSE, +) + +class TestApiSignatures: + """Verify that the public API signatures remain compatible with databricks-langchain.""" + + def test_index_details_signatures(self): + """Test IndexDetails class signature.""" + # Verify constructor signature + init_sig = inspect.signature(IndexDetails.__init__) + assert list(init_sig.parameters.keys())[1] == "index", "Constructor must accept 'index' parameter" + + properties_to_check = [ + "name", "schema", "primary_key", "index_spec", + "embedding_vector_column", "embedding_source_column" + ] + + mock_index = MagicMock(spec=VectorSearchIndex) + mock_index.describe.return_value = INDEX_DETAILS[DELTA_SYNC_INDEX] + mock_index.similarity_search.return_value = EXAMPLE_SEARCH_RESPONSE + index_details = IndexDetails(mock_index) + + for prop_name in properties_to_check: + assert hasattr(index_details, prop_name), f"Missing property: {prop_name}" + + def test_function_signatures(self): + """ + Test function signatures for backward compatibility. + Ensures: + 1. Required parameters haven't been added (all new params must have defaults) + 2. Parameter order remains the same for the base parameters + 3. No parameters have been removed + """ + # Define expected minimal signatures with required parameter counts + functions_to_check = { + "get_metadata": { + "required_params": ["columns", "result", "retriever_schema", "ignore_cols"], + "optional_params": [] # Parameters with default values + }, + "parse_vector_search_response": { + "required_params": ["search_resp", "retriever_schema"], + "optional_params": ["ignore_cols", "document_class"] + }, + "validate_and_get_text_column": { + "required_params": ["text_column", "index_details"], + "optional_params": [] + }, + "validate_and_get_return_columns": { + "required_params": ["columns", "text_column", "index_details"], + "optional_params": ["doc_uri", "primary_key"] + } + } + + for func_name, expected in functions_to_check.items(): + func = eval(func_name) # Get the function object + sig = inspect.signature(func) + + # Get required parameters (those without default values) + required_params = [ + name for name, param in sig.parameters.items() + if param.default is inspect.Parameter.empty + ] + + # Check if we've added any new required parameters + required_count = len(expected["required_params"]) + assert len(required_params) <= required_count, \ + f"Function {func_name} has {len(required_params)} required parameters, but should have at most {required_count}. " \ + f"New parameters must have default values." + + # Check that original required parameters are still in the same order + for i, param_name in enumerate(expected["required_params"]): + if i < len(required_params): # In case the function now has fewer required params + assert required_params[i] == param_name, \ + f"Function {func_name}: parameter {i+1} should be '{param_name}', got '{required_params[i]}'" + + # Make sure all original parameters (required and optional) still exist + all_original_params = expected["required_params"] + expected["optional_params"] + current_params = list(sig.parameters.keys()) + + for param in all_original_params: + assert param in current_params, \ + f"Function {func_name}: parameter '{param}' has been removed" diff --git a/tests/databricks_ai_bridge/contracts/test_vector_search_contract.py b/tests/databricks_ai_bridge/contracts/test_vector_search_contract.py new file mode 100644 index 00000000..4fe951af --- /dev/null +++ b/tests/databricks_ai_bridge/contracts/test_vector_search_contract.py @@ -0,0 +1,160 @@ +import pytest +import json +from unittest.mock import MagicMock, Mock, patch + +from databricks.vector_search.client import VectorSearchIndex +from databricks_ai_bridge.test_utils.vector_search import ( # noqa: F401 + ALL_INDEX_NAMES, + DELTA_SYNC_INDEX, + DIRECT_ACCESS_INDEX, + ENDPOINT_NAME, + INDEX_DETAILS, + INPUT_TEXTS, + EXAMPLE_SEARCH_RESPONSE, + mock_vs_client, # noqa: F401 +) + +# Import the modules you want to test +from databricks_ai_bridge import ( + IndexDetails, + IndexType, + RetrieverSchema, + get_metadata, + parse_vector_search_response, + validate_and_get_text_column, + validate_and_get_return_columns +) + +class TestIndexDetailsContract: + """Contract tests for IndexDetails class.""" + + def setup_method(self): + """Create a mock index with consistent test data""" + mock_index = MagicMock(spec=VectorSearchIndex) + mock_index.describe.return_value = INDEX_DETAILS[DELTA_SYNC_INDEX] + mock_index.similarity_search.return_value = EXAMPLE_SEARCH_RESPONSE + self.index_details = IndexDetails(mock_index) + + def test_basic_properties(self): + """Test that basic properties are accessible and return expected values.""" + assert self.index_details.name == DELTA_SYNC_INDEX + assert self.index_details.primary_key == "id" + assert self.index_details.is_direct_access_index() is False + assert self.index_details.is_delta_sync_index() is True + + def test_embedding_columns(self): + """Test that embedding column information is accessible.""" + vector_column = self.index_details.embedding_vector_column + source_column = self.index_details.embedding_source_column + + assert vector_column.get("name") == "embedding" + assert vector_column.get("dimension") == 768 + assert source_column.get("name") == "text" + + def test_databricks_managed_embeddings_detection(self): + """Test detection of databricks managed embeddings.""" + assert self.index_details.is_databricks_managed_embeddings() is False + assert self.delta_index_details.is_databricks_managed_embeddings() is True + + +class TestVectorSearchContract: + """Contract tests for vector search functionality.""" + + def setup_method(self): + # Test data for vector search + self.columns = ["id", "text", "doc_uri", "score"] + self.search_response = EXAMPLE_SEARCH_RESPONSE + self.retriever_schema = RetrieverSchema( + text_column="text", + doc_uri="doc_uri", + primary_key="id" + ) + + # Setup mock index details + mock_index = MagicMock(spec=VectorSearchIndex) + mock_index.describe.return_value = INDEX_DETAILS[DELTA_SYNC_INDEX] + mock_index.similarity_search.return_value = EXAMPLE_SEARCH_RESPONSE + self.index_details = IndexDetails(mock_index) + + def test_get_metadata(self): + """Test that metadata is correctly extracted from search results.""" + columns = ["id", "text", "doc_uri", "score"] + result = ["doc1", "This is a test document", "uri1", 0.95] + ignore_cols = ["text"] + + metadata = get_metadata(columns, result, self.retriever_schema, ignore_cols) + + assert "chunk_id" in metadata + assert metadata["chunk_id"] == "doc1" + assert "doc_uri" in metadata + assert metadata["doc_uri"] == "uri1" + assert "text" not in metadata # Should be ignored + + def test_parse_vector_search_response(self): + """Test that search responses are correctly parsed.""" + docs_with_score = parse_vector_search_response( + self.search_response, + self.retriever_schema + ) + + assert len(docs_with_score) == 2 + doc1, score1 = docs_with_score[0] + + assert doc1["page_content"] == "This is a test document" + assert doc1["metadata"]["chunk_id"] == "doc1" + assert doc1["metadata"]["doc_uri"] == "uri1" + assert score1 == 0.95 + + def test_validate_text_column(self): + """Test text column validation.""" + # For regular direct access index + text_col = validate_and_get_text_column("text", self.index_details) + assert text_col == "text" + + # Should raise error when text_column is None for non-managed embeddings + with pytest.raises(ValueError): + validate_and_get_text_column(None, self.index_details) + + # Create a mock for managed embeddings + managed_mock = Mock() + managed_mock.describe.return_value = { + "name": "managed_index", + "primary_key": "id", + "index_type": IndexType.DELTA_SYNC.value, + "delta_sync_index_spec": { + "embedding_source_columns": [{"name": "text"}], + "embedding_vector_columns": [{"name": "embedding"}] + } + } + managed_index = IndexDetails(managed_mock) + + # For managed embeddings, should return source column + text_col = validate_and_get_text_column(None, managed_index) + assert text_col == "text" + + # Should raise error if text_column is provided but doesn't match + with pytest.raises(ValueError): + validate_and_get_text_column("wrong_column", managed_index) + + def test_validate_return_columns(self): + """Test column validation for return columns.""" + columns = ["text"] + validated = validate_and_get_return_columns( + columns, + "text", + self.index_details, + doc_uri="doc_uri" + ) + + # Should add required columns + assert "id" in validated # primary key + assert "text" in validated # text column + assert "doc_uri" in validated # doc_uri + + # Should raise error for non-existent columns + with pytest.raises(ValueError): + validate_and_get_return_columns( + ["non_existent_column"], + "text", + self.index_details + ) \ No newline at end of file diff --git a/tests/databricks_ai_bridge/utils/test_vector_search.py b/tests/databricks_ai_bridge/utils/test_vector_search.py index e22b79da..0f1b3d4d 100644 --- a/tests/databricks_ai_bridge/utils/test_vector_search.py +++ b/tests/databricks_ai_bridge/utils/test_vector_search.py @@ -111,5 +111,5 @@ def make_document(row_index: int, score: float): ) def test_parse_vector_search_response(retriever_schema, ignore_cols, docs_with_score): assert ( - parse_vector_search_response(search_resp, retriever_schema, ignore_cols) == docs_with_score + parse_vector_search_response(search_resp, retriever_schema=retriever_schema, ignore_cols=ignore_cols) == docs_with_score ) From fa2c242d5a6f6d4bea5fefc46f19ff4a821b66cc Mon Sep 17 00:00:00 2001 From: Ann Zhang Date: Thu, 8 May 2025 17:20:52 -0700 Subject: [PATCH 11/27] update Signed-off-by: Ann Zhang --- .../utils/vector_search.py | 55 +++--- .../contracts/test_signatures.py | 100 ----------- .../contracts/test_vector_search_contract.py | 160 ------------------ .../utils/test_vector_search.py | 5 +- 4 files changed, 33 insertions(+), 287 deletions(-) delete mode 100644 tests/databricks_ai_bridge/contracts/test_signatures.py delete mode 100644 tests/databricks_ai_bridge/contracts/test_vector_search_contract.py diff --git a/src/databricks_ai_bridge/utils/vector_search.py b/src/databricks_ai_bridge/utils/vector_search.py index 3721d0d1..9137682e 100644 --- a/src/databricks_ai_bridge/utils/vector_search.py +++ b/src/databricks_ai_bridge/utils/vector_search.py @@ -87,35 +87,40 @@ def get_metadata(columns: List[str], result: List[Any], retriever_schema, ignore """ metadata = {} - # Skipping the last column, which is always the score - for col, value in zip(columns[:-1], result[:-1]): - if col == retriever_schema.doc_uri: - metadata["doc_uri"] = value - elif col == retriever_schema.primary_key: - metadata["chunk_id"] = value - elif col == "doc_uri" and retriever_schema.doc_uri: - # Prioritize retriever_schema.doc_uri, don't override with the actual "doc_uri" column - continue - elif col == "chunk_id" and retriever_schema.primary_key: - # Prioritize retriever_schema.primary_key, don't override with the actual "chunk_id" column - continue - elif col in ignore_cols: - # ignore_cols has precedence over other_columns - continue - elif retriever_schema.other_columns is not None: - if col in retriever_schema.other_columns: + if retriever_schema: + # Skipping the last column, which is always the score + for col, value in zip(columns[:-1], result[:-1]): + if col == retriever_schema.doc_uri: + metadata["doc_uri"] = value + elif col == retriever_schema.primary_key: + metadata["chunk_id"] = value + elif col == "doc_uri" and retriever_schema.doc_uri: + # Prioritize retriever_schema.doc_uri, don't override with the actual "doc_uri" column + continue + elif col == "chunk_id" and retriever_schema.primary_key: + # Prioritize retriever_schema.primary_key, don't override with the actual "chunk_id" column + continue + elif col in ignore_cols: + # ignore_cols has precedence over other_columns + continue + elif retriever_schema.other_columns is not None: + if col in retriever_schema.other_columns: + metadata[col] = value + else: + metadata[col] = value + else: + for col, value in zip(columns[:-1], result[:-1]): + if col not in ignore_cols: metadata[col] = value - else: - metadata[col] = value return metadata def parse_vector_search_response( search_resp: Dict, - index_details: IndexDetails = None, # deprecated - text_column: str = None, # deprecated + index_details: IndexDetails = None, # deprecated + text_column: str = None, # deprecated *, - retriever_schema: RetrieverSchema = None, # required as of 0.5.0 + retriever_schema: RetrieverSchema = None, ignore_cols: Optional[List[str]] = None, document_class: Any = dict, ) -> List[Tuple[Dict, float]]: @@ -123,13 +128,11 @@ def parse_vector_search_response( Parse the search response into a list of Documents with score. The document_class parameter is used to specify the class of the document to be created. """ - if retriever_schema is None: - raise ValueError("retriever_schema is required as of v0.5.0") - if ignore_cols is None: ignore_cols = [] - text_column = retriever_schema.text_column + if retriever_schema: + text_column = retriever_schema.text_column ignore_cols.extend([text_column]) columns = [col["name"] for col in search_resp.get("manifest", dict()).get("columns", [])] diff --git a/tests/databricks_ai_bridge/contracts/test_signatures.py b/tests/databricks_ai_bridge/contracts/test_signatures.py deleted file mode 100644 index 08fe3439..00000000 --- a/tests/databricks_ai_bridge/contracts/test_signatures.py +++ /dev/null @@ -1,100 +0,0 @@ -import inspect -import pytest -from unittest.mock import MagicMock - -# Import the modules that databricks-langchain depends on -from databricks_ai_bridge import ( - IndexDetails, - IndexType, - RetrieverSchema, - get_metadata, - parse_vector_search_response, - validate_and_get_text_column, - validate_and_get_return_columns -) - -from databricks_ai_bridge.test_utils.vector_search import ( # noqa: F401 - DELTA_SYNC_INDEX, - INDEX_DETAILS, - EXAMPLE_SEARCH_RESPONSE, -) - -class TestApiSignatures: - """Verify that the public API signatures remain compatible with databricks-langchain.""" - - def test_index_details_signatures(self): - """Test IndexDetails class signature.""" - # Verify constructor signature - init_sig = inspect.signature(IndexDetails.__init__) - assert list(init_sig.parameters.keys())[1] == "index", "Constructor must accept 'index' parameter" - - properties_to_check = [ - "name", "schema", "primary_key", "index_spec", - "embedding_vector_column", "embedding_source_column" - ] - - mock_index = MagicMock(spec=VectorSearchIndex) - mock_index.describe.return_value = INDEX_DETAILS[DELTA_SYNC_INDEX] - mock_index.similarity_search.return_value = EXAMPLE_SEARCH_RESPONSE - index_details = IndexDetails(mock_index) - - for prop_name in properties_to_check: - assert hasattr(index_details, prop_name), f"Missing property: {prop_name}" - - def test_function_signatures(self): - """ - Test function signatures for backward compatibility. - Ensures: - 1. Required parameters haven't been added (all new params must have defaults) - 2. Parameter order remains the same for the base parameters - 3. No parameters have been removed - """ - # Define expected minimal signatures with required parameter counts - functions_to_check = { - "get_metadata": { - "required_params": ["columns", "result", "retriever_schema", "ignore_cols"], - "optional_params": [] # Parameters with default values - }, - "parse_vector_search_response": { - "required_params": ["search_resp", "retriever_schema"], - "optional_params": ["ignore_cols", "document_class"] - }, - "validate_and_get_text_column": { - "required_params": ["text_column", "index_details"], - "optional_params": [] - }, - "validate_and_get_return_columns": { - "required_params": ["columns", "text_column", "index_details"], - "optional_params": ["doc_uri", "primary_key"] - } - } - - for func_name, expected in functions_to_check.items(): - func = eval(func_name) # Get the function object - sig = inspect.signature(func) - - # Get required parameters (those without default values) - required_params = [ - name for name, param in sig.parameters.items() - if param.default is inspect.Parameter.empty - ] - - # Check if we've added any new required parameters - required_count = len(expected["required_params"]) - assert len(required_params) <= required_count, \ - f"Function {func_name} has {len(required_params)} required parameters, but should have at most {required_count}. " \ - f"New parameters must have default values." - - # Check that original required parameters are still in the same order - for i, param_name in enumerate(expected["required_params"]): - if i < len(required_params): # In case the function now has fewer required params - assert required_params[i] == param_name, \ - f"Function {func_name}: parameter {i+1} should be '{param_name}', got '{required_params[i]}'" - - # Make sure all original parameters (required and optional) still exist - all_original_params = expected["required_params"] + expected["optional_params"] - current_params = list(sig.parameters.keys()) - - for param in all_original_params: - assert param in current_params, \ - f"Function {func_name}: parameter '{param}' has been removed" diff --git a/tests/databricks_ai_bridge/contracts/test_vector_search_contract.py b/tests/databricks_ai_bridge/contracts/test_vector_search_contract.py deleted file mode 100644 index 4fe951af..00000000 --- a/tests/databricks_ai_bridge/contracts/test_vector_search_contract.py +++ /dev/null @@ -1,160 +0,0 @@ -import pytest -import json -from unittest.mock import MagicMock, Mock, patch - -from databricks.vector_search.client import VectorSearchIndex -from databricks_ai_bridge.test_utils.vector_search import ( # noqa: F401 - ALL_INDEX_NAMES, - DELTA_SYNC_INDEX, - DIRECT_ACCESS_INDEX, - ENDPOINT_NAME, - INDEX_DETAILS, - INPUT_TEXTS, - EXAMPLE_SEARCH_RESPONSE, - mock_vs_client, # noqa: F401 -) - -# Import the modules you want to test -from databricks_ai_bridge import ( - IndexDetails, - IndexType, - RetrieverSchema, - get_metadata, - parse_vector_search_response, - validate_and_get_text_column, - validate_and_get_return_columns -) - -class TestIndexDetailsContract: - """Contract tests for IndexDetails class.""" - - def setup_method(self): - """Create a mock index with consistent test data""" - mock_index = MagicMock(spec=VectorSearchIndex) - mock_index.describe.return_value = INDEX_DETAILS[DELTA_SYNC_INDEX] - mock_index.similarity_search.return_value = EXAMPLE_SEARCH_RESPONSE - self.index_details = IndexDetails(mock_index) - - def test_basic_properties(self): - """Test that basic properties are accessible and return expected values.""" - assert self.index_details.name == DELTA_SYNC_INDEX - assert self.index_details.primary_key == "id" - assert self.index_details.is_direct_access_index() is False - assert self.index_details.is_delta_sync_index() is True - - def test_embedding_columns(self): - """Test that embedding column information is accessible.""" - vector_column = self.index_details.embedding_vector_column - source_column = self.index_details.embedding_source_column - - assert vector_column.get("name") == "embedding" - assert vector_column.get("dimension") == 768 - assert source_column.get("name") == "text" - - def test_databricks_managed_embeddings_detection(self): - """Test detection of databricks managed embeddings.""" - assert self.index_details.is_databricks_managed_embeddings() is False - assert self.delta_index_details.is_databricks_managed_embeddings() is True - - -class TestVectorSearchContract: - """Contract tests for vector search functionality.""" - - def setup_method(self): - # Test data for vector search - self.columns = ["id", "text", "doc_uri", "score"] - self.search_response = EXAMPLE_SEARCH_RESPONSE - self.retriever_schema = RetrieverSchema( - text_column="text", - doc_uri="doc_uri", - primary_key="id" - ) - - # Setup mock index details - mock_index = MagicMock(spec=VectorSearchIndex) - mock_index.describe.return_value = INDEX_DETAILS[DELTA_SYNC_INDEX] - mock_index.similarity_search.return_value = EXAMPLE_SEARCH_RESPONSE - self.index_details = IndexDetails(mock_index) - - def test_get_metadata(self): - """Test that metadata is correctly extracted from search results.""" - columns = ["id", "text", "doc_uri", "score"] - result = ["doc1", "This is a test document", "uri1", 0.95] - ignore_cols = ["text"] - - metadata = get_metadata(columns, result, self.retriever_schema, ignore_cols) - - assert "chunk_id" in metadata - assert metadata["chunk_id"] == "doc1" - assert "doc_uri" in metadata - assert metadata["doc_uri"] == "uri1" - assert "text" not in metadata # Should be ignored - - def test_parse_vector_search_response(self): - """Test that search responses are correctly parsed.""" - docs_with_score = parse_vector_search_response( - self.search_response, - self.retriever_schema - ) - - assert len(docs_with_score) == 2 - doc1, score1 = docs_with_score[0] - - assert doc1["page_content"] == "This is a test document" - assert doc1["metadata"]["chunk_id"] == "doc1" - assert doc1["metadata"]["doc_uri"] == "uri1" - assert score1 == 0.95 - - def test_validate_text_column(self): - """Test text column validation.""" - # For regular direct access index - text_col = validate_and_get_text_column("text", self.index_details) - assert text_col == "text" - - # Should raise error when text_column is None for non-managed embeddings - with pytest.raises(ValueError): - validate_and_get_text_column(None, self.index_details) - - # Create a mock for managed embeddings - managed_mock = Mock() - managed_mock.describe.return_value = { - "name": "managed_index", - "primary_key": "id", - "index_type": IndexType.DELTA_SYNC.value, - "delta_sync_index_spec": { - "embedding_source_columns": [{"name": "text"}], - "embedding_vector_columns": [{"name": "embedding"}] - } - } - managed_index = IndexDetails(managed_mock) - - # For managed embeddings, should return source column - text_col = validate_and_get_text_column(None, managed_index) - assert text_col == "text" - - # Should raise error if text_column is provided but doesn't match - with pytest.raises(ValueError): - validate_and_get_text_column("wrong_column", managed_index) - - def test_validate_return_columns(self): - """Test column validation for return columns.""" - columns = ["text"] - validated = validate_and_get_return_columns( - columns, - "text", - self.index_details, - doc_uri="doc_uri" - ) - - # Should add required columns - assert "id" in validated # primary key - assert "text" in validated # text column - assert "doc_uri" in validated # doc_uri - - # Should raise error for non-existent columns - with pytest.raises(ValueError): - validate_and_get_return_columns( - ["non_existent_column"], - "text", - self.index_details - ) \ No newline at end of file diff --git a/tests/databricks_ai_bridge/utils/test_vector_search.py b/tests/databricks_ai_bridge/utils/test_vector_search.py index 0f1b3d4d..d12d9e9c 100644 --- a/tests/databricks_ai_bridge/utils/test_vector_search.py +++ b/tests/databricks_ai_bridge/utils/test_vector_search.py @@ -111,5 +111,8 @@ def make_document(row_index: int, score: float): ) def test_parse_vector_search_response(retriever_schema, ignore_cols, docs_with_score): assert ( - parse_vector_search_response(search_resp, retriever_schema=retriever_schema, ignore_cols=ignore_cols) == docs_with_score + parse_vector_search_response( + search_resp, retriever_schema=retriever_schema, ignore_cols=ignore_cols + ) + == docs_with_score ) From 3ef5ade780b49d6258052d99d6d6e3fc97c84d0b Mon Sep 17 00:00:00 2001 From: Ann Zhang Date: Mon, 12 May 2025 09:38:38 -0700 Subject: [PATCH 12/27] test Signed-off-by: Ann Zhang --- .github/workflows/main.yml | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index 521c55b0..56ba81be 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -55,10 +55,15 @@ jobs: strategy: matrix: python-version: ["3.10"] + version: + - {ref: "HEAD", name: "current"} + - {ref: "databricks-ai-bridge-0.3.0", name: "v0.3.0"} timeout-minutes: 20 steps: - name: Checkout code uses: actions/checkout@v4 + with: + ref: ${{ matrix.version.ref }} - name: Set up Python uses: actions/setup-python@v5 with: From 335a34e24347f2d09d84937c594ba25f5314539a Mon Sep 17 00:00:00 2001 From: Ann Zhang Date: Mon, 12 May 2025 09:42:51 -0700 Subject: [PATCH 13/27] update Signed-off-by: Ann Zhang --- .github/workflows/main.yml | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index 56ba81be..7512787c 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -57,7 +57,10 @@ jobs: python-version: ["3.10"] version: - {ref: "HEAD", name: "current"} - - {ref: "databricks-ai-bridge-0.3.0", name: "v0.3.0"} + - {ref: "databricks-ai-v0.3.0", name: "v0.3.0"} + - {ref: "databricks-ai-v0.2.0", name: "v0.2.0"} + - {ref: "databricks-ai-v0.1.0", name: "v0.1.0"} + - {ref: "databricks-ai-v0.0", name: "v0.0"} timeout-minutes: 20 steps: - name: Checkout code From dcdd809a731f3db9616284c61cff598dbf72a2b7 Mon Sep 17 00:00:00 2001 From: Ann Zhang Date: Mon, 12 May 2025 09:47:01 -0700 Subject: [PATCH 14/27] update Signed-off-by: Ann Zhang --- .github/workflows/main.yml | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index 7512787c..0ec9f97e 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -56,7 +56,7 @@ jobs: matrix: python-version: ["3.10"] version: - - {ref: "HEAD", name: "current"} + - {ref: "", name: "current"} - {ref: "databricks-ai-v0.3.0", name: "v0.3.0"} - {ref: "databricks-ai-v0.2.0", name: "v0.2.0"} - {ref: "databricks-ai-v0.1.0", name: "v0.1.0"} @@ -66,7 +66,8 @@ jobs: - name: Checkout code uses: actions/checkout@v4 with: - ref: ${{ matrix.version.ref }} + ref: ${{ matrix.version.ref == '' && github.sha || matrix.version.ref }} + fetch-depth: 1 - name: Set up Python uses: actions/setup-python@v5 with: From b486c0ee09260ba80d15481c38b849266c05d648 Mon Sep 17 00:00:00 2001 From: Ann Zhang Date: Mon, 12 May 2025 09:50:27 -0700 Subject: [PATCH 15/27] update Signed-off-by: Ann Zhang --- .github/workflows/main.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index 0ec9f97e..6e40cfe1 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -53,6 +53,7 @@ jobs: langchain_test: runs-on: ubuntu-latest strategy: + fail-fast: false matrix: python-version: ["3.10"] version: @@ -60,7 +61,6 @@ jobs: - {ref: "databricks-ai-v0.3.0", name: "v0.3.0"} - {ref: "databricks-ai-v0.2.0", name: "v0.2.0"} - {ref: "databricks-ai-v0.1.0", name: "v0.1.0"} - - {ref: "databricks-ai-v0.0", name: "v0.0"} timeout-minutes: 20 steps: - name: Checkout code From 7cc0e1eeb8fefe92a42caa2e0bdbf696b0239d3e Mon Sep 17 00:00:00 2001 From: Ann Zhang Date: Mon, 12 May 2025 09:56:24 -0700 Subject: [PATCH 16/27] update Signed-off-by: Ann Zhang --- .github/workflows/main.yml | 13 +++++++++++++ .../langchain/tests/unit_tests/test_imports.py | 8 -------- .../vector_search_retriever_tool.py | 2 +- 3 files changed, 14 insertions(+), 9 deletions(-) diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index 6e40cfe1..71956d78 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -52,12 +52,14 @@ jobs: langchain_test: runs-on: ubuntu-latest + name: langchain_test (${{ matrix.python-version }}, ${{ matrix.version.name }}) strategy: fail-fast: false matrix: python-version: ["3.10"] version: - {ref: "", name: "current"} + - {ref: "databricks-ai-v0.4.0", name: "v0.4.0"} - {ref: "databricks-ai-v0.3.0", name: "v0.3.0"} - {ref: "databricks-ai-v0.2.0", name: "v0.2.0"} - {ref: "databricks-ai-v0.1.0", name: "v0.1.0"} @@ -82,13 +84,24 @@ jobs: openai_test: runs-on: ubuntu-latest + name: openai_test (${{ matrix.python-version }}, ${{ matrix.version.name }}) strategy: + fail-fast: false matrix: python-version: ["3.10"] + version: + - {ref: "", name: "current"} + - {ref: "databricks-ai-v0.4.0", name: "v0.4.0"} + - {ref: "databricks-ai-v0.3.0", name: "v0.3.0"} + - {ref: "databricks-ai-v0.2.0", name: "v0.2.0"} + - {ref: "databricks-ai-v0.1.0", name: "v0.1.0"} timeout-minutes: 20 steps: - name: Checkout code uses: actions/checkout@v4 + with: + ref: ${{ matrix.version.ref == '' && github.sha || matrix.version.ref }} + fetch-depth: 1 - name: Set up Python uses: actions/setup-python@v5 with: diff --git a/integrations/langchain/tests/unit_tests/test_imports.py b/integrations/langchain/tests/unit_tests/test_imports.py index a25595cf..f3624a07 100644 --- a/integrations/langchain/tests/unit_tests/test_imports.py +++ b/integrations/langchain/tests/unit_tests/test_imports.py @@ -1,11 +1,3 @@ -from importlib.metadata import version as get_version - -import pytest -from packaging import version as pkg_version - -if pkg_version.parse(get_version("databricks-langchain")) < pkg_version.parse("0.4.0"): - pytest.skip("Requires databricks-langchain >= 0.4.0", allow_module_level=True) - from databricks_langchain import ( ChatDatabricks, DatabricksEmbeddings, diff --git a/src/databricks_ai_bridge/vector_search_retriever_tool.py b/src/databricks_ai_bridge/vector_search_retriever_tool.py index a31b296b..a289ef6a 100644 --- a/src/databricks_ai_bridge/vector_search_retriever_tool.py +++ b/src/databricks_ai_bridge/vector_search_retriever_tool.py @@ -143,7 +143,7 @@ def _get_default_tool_description(self, index_details: IndexDetails) -> str: return description def _get_resources( - self, index_name: str, embedding_endpoint: str, index_details: IndexDetails = None + self, index_name: str, embedding_endpoint: str, index_details: IndexDetails ) -> List[Resource]: resources = [] if index_name: From 22c2266da2b84c8e8599950577268567297df879 Mon Sep 17 00:00:00 2001 From: Ann Zhang Date: Mon, 12 May 2025 10:06:54 -0700 Subject: [PATCH 17/27] update Signed-off-by: Ann Zhang --- .github/workflows/main.yml | 11 ++++++++--- src/databricks_ai_bridge/utils/vector_search.py | 4 ++-- .../vector_search_retriever_tool.py | 2 +- 3 files changed, 11 insertions(+), 6 deletions(-) diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index 71956d78..c06b45d6 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -67,9 +67,6 @@ jobs: steps: - name: Checkout code uses: actions/checkout@v4 - with: - ref: ${{ matrix.version.ref == '' && github.sha || matrix.version.ref }} - fetch-depth: 1 - name: Set up Python uses: actions/setup-python@v5 with: @@ -77,6 +74,14 @@ jobs: - name: Install dependencies run: | pip install . + - name: Checkout langchain version + uses: actions/checkout@v4 + with: + ref: ${{ matrix.version.ref == '' && github.sha || matrix.version.ref }} + fetch-depth: 1 + path: old-version + - name: Replace current langchain with older version + run: | pip install integrations/langchain[dev] - name: Run tests run: | diff --git a/src/databricks_ai_bridge/utils/vector_search.py b/src/databricks_ai_bridge/utils/vector_search.py index 9137682e..0e293ee3 100644 --- a/src/databricks_ai_bridge/utils/vector_search.py +++ b/src/databricks_ai_bridge/utils/vector_search.py @@ -170,8 +170,8 @@ def validate_and_get_return_columns( columns: List[str], text_column: str, index_details: IndexDetails, - doc_uri: str = None, - primary_key: str = None, + doc_uri: str, #= None, + primary_key: str, # = None, ) -> List[str]: """ Get a list of columns to retrieve from the index. diff --git a/src/databricks_ai_bridge/vector_search_retriever_tool.py b/src/databricks_ai_bridge/vector_search_retriever_tool.py index a289ef6a..a31b296b 100644 --- a/src/databricks_ai_bridge/vector_search_retriever_tool.py +++ b/src/databricks_ai_bridge/vector_search_retriever_tool.py @@ -143,7 +143,7 @@ def _get_default_tool_description(self, index_details: IndexDetails) -> str: return description def _get_resources( - self, index_name: str, embedding_endpoint: str, index_details: IndexDetails + self, index_name: str, embedding_endpoint: str, index_details: IndexDetails = None ) -> List[Resource]: resources = [] if index_name: From 27f36d1f63f860c9b2d609224154e48f2dfabe61 Mon Sep 17 00:00:00 2001 From: Ann Zhang Date: Mon, 12 May 2025 10:12:23 -0700 Subject: [PATCH 18/27] update Signed-off-by: Ann Zhang --- .github/workflows/main.yml | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index c06b45d6..4d3b52b2 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -79,10 +79,14 @@ jobs: with: ref: ${{ matrix.version.ref == '' && github.sha || matrix.version.ref }} fetch-depth: 1 - path: old-version - - name: Replace current langchain with older version + path: older-version + - name: Replace langchain with older version run: | - pip install integrations/langchain[dev] + # Remove current langchain if it exists to avoid conflicts + rm -rf integrations/langchain + + # Copy older version of langchain to the main repo + cp -r older-version/integrations/langchain integrations/ - name: Run tests run: | pytest integrations/langchain/tests/unit_tests From f82f1d54309ac3c1102fcb10d8e98b56c71bc3ed Mon Sep 17 00:00:00 2001 From: Ann Zhang Date: Mon, 12 May 2025 10:21:53 -0700 Subject: [PATCH 19/27] update Signed-off-by: Ann Zhang --- .github/workflows/main.yml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index 4d3b52b2..f95d91e0 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -87,6 +87,9 @@ jobs: # Copy older version of langchain to the main repo cp -r older-version/integrations/langchain integrations/ + - name: Install langchain dependency + run: | + pip install integrations/langchain[dev] - name: Run tests run: | pytest integrations/langchain/tests/unit_tests From 38d15f5d9c766298e47106959ada05fc6253e27e Mon Sep 17 00:00:00 2001 From: Ann Zhang Date: Mon, 12 May 2025 13:27:06 -0700 Subject: [PATCH 20/27] update Signed-off-by: Ann Zhang --- .github/workflows/main.yml | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index f95d91e0..65df8be8 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -111,9 +111,6 @@ jobs: steps: - name: Checkout code uses: actions/checkout@v4 - with: - ref: ${{ matrix.version.ref == '' && github.sha || matrix.version.ref }} - fetch-depth: 1 - name: Set up Python uses: actions/setup-python@v5 with: @@ -121,6 +118,21 @@ jobs: - name: Install dependencies run: | pip install . + - name: Checkout openai version + uses: actions/checkout@v4 + with: + ref: ${{ matrix.version.ref == '' && github.sha || matrix.version.ref }} + fetch-depth: 1 + path: older-version + - name: Replace openai with older version + run: | + # Remove current openai if it exists to avoid conflicts + rm -rf integrations/openai + + # Copy older version of openai to the main repo + cp -r older-version/integrations/openai integrations/ + - name: Install openai dependency + run: | pip install integrations/openai[dev] - name: Run tests run: | From 6898b04ca57b8f0a70a16c585246b573f4d71058 Mon Sep 17 00:00:00 2001 From: Ann Zhang Date: Mon, 12 May 2025 13:29:44 -0700 Subject: [PATCH 21/27] default None Signed-off-by: Ann Zhang --- src/databricks_ai_bridge/utils/vector_search.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/databricks_ai_bridge/utils/vector_search.py b/src/databricks_ai_bridge/utils/vector_search.py index 0e293ee3..9137682e 100644 --- a/src/databricks_ai_bridge/utils/vector_search.py +++ b/src/databricks_ai_bridge/utils/vector_search.py @@ -170,8 +170,8 @@ def validate_and_get_return_columns( columns: List[str], text_column: str, index_details: IndexDetails, - doc_uri: str, #= None, - primary_key: str, # = None, + doc_uri: str = None, + primary_key: str = None, ) -> List[str]: """ Get a list of columns to retrieve from the index. From 6e91b9c2b836f5d83bb39523385b207781d2fc2d Mon Sep 17 00:00:00 2001 From: Ann Zhang Date: Mon, 12 May 2025 13:48:07 -0700 Subject: [PATCH 22/27] limit tests Signed-off-by: Ann Zhang --- .github/workflows/main.yml | 55 +++++++++++++++++++++++++++++++++----- 1 file changed, 48 insertions(+), 7 deletions(-) diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index 65df8be8..567abf60 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -51,14 +51,34 @@ jobs: pytest tests/ langchain_test: + runs-on: ubuntu-latest + strategy: + matrix: + python-version: ["3.10"] + timeout-minutes: 20 + steps: + - name: Checkout code + uses: actions/checkout@v4 + - name: Set up Python + uses: actions/setup-python@v5 + with: + python-version: ${{ matrix.python-version }} + - name: Install dependencies + run: | + pip install . + pip install integrations/langchain[dev] + - name: Run tests + run: | + pytest integrations/langchain/tests/unit_tests + + langchain_cross_version_test: runs-on: ubuntu-latest name: langchain_test (${{ matrix.python-version }}, ${{ matrix.version.name }}) strategy: fail-fast: false matrix: python-version: ["3.10"] - version: - - {ref: "", name: "current"} + version: - {ref: "databricks-ai-v0.4.0", name: "v0.4.0"} - {ref: "databricks-ai-v0.3.0", name: "v0.3.0"} - {ref: "databricks-ai-v0.2.0", name: "v0.2.0"} @@ -77,7 +97,7 @@ jobs: - name: Checkout langchain version uses: actions/checkout@v4 with: - ref: ${{ matrix.version.ref == '' && github.sha || matrix.version.ref }} + ref: ${{ matrix.version.ref }} fetch-depth: 1 path: older-version - name: Replace langchain with older version @@ -92,9 +112,30 @@ jobs: pip install integrations/langchain[dev] - name: Run tests run: | - pytest integrations/langchain/tests/unit_tests + pytest integrations/langchain/tests/unit_tests/test_vector_search_retriever_tool.py::test_init openai_test: + runs-on: ubuntu-latest + strategy: + matrix: + python-version: ["3.10"] + timeout-minutes: 20 + steps: + - name: Checkout code + uses: actions/checkout@v4 + - name: Set up Python + uses: actions/setup-python@v5 + with: + python-version: ${{ matrix.python-version }} + - name: Install dependencies + run: | + pip install . + pip install integrations/openai[dev] + - name: Run tests + run: | + pytest integrations/openai/tests/unit_tests + + openai_cross_version_test: runs-on: ubuntu-latest name: openai_test (${{ matrix.python-version }}, ${{ matrix.version.name }}) strategy: @@ -102,7 +143,6 @@ jobs: matrix: python-version: ["3.10"] version: - - {ref: "", name: "current"} - {ref: "databricks-ai-v0.4.0", name: "v0.4.0"} - {ref: "databricks-ai-v0.3.0", name: "v0.3.0"} - {ref: "databricks-ai-v0.2.0", name: "v0.2.0"} @@ -121,7 +161,7 @@ jobs: - name: Checkout openai version uses: actions/checkout@v4 with: - ref: ${{ matrix.version.ref == '' && github.sha || matrix.version.ref }} + ref: ${{ matrix.version.ref }} fetch-depth: 1 path: older-version - name: Replace openai with older version @@ -136,7 +176,8 @@ jobs: pip install integrations/openai[dev] - name: Run tests run: | - pytest integrations/openai/tests/unit_tests + pytest integrations/openai/tests/unit_tests/test_vector_search_retriever_tool.py::test_vector_search_retriever_tool_init + llamaindex_test: runs-on: ubuntu-latest From 579dad6d84c34ca2ad0960a91ae97c3ff6a48dc5 Mon Sep 17 00:00:00 2001 From: Ann Zhang Date: Mon, 12 May 2025 13:51:29 -0700 Subject: [PATCH 23/27] none check Signed-off-by: Ann Zhang --- src/databricks_ai_bridge/vector_search_retriever_tool.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/databricks_ai_bridge/vector_search_retriever_tool.py b/src/databricks_ai_bridge/vector_search_retriever_tool.py index a31b296b..812fd6dd 100644 --- a/src/databricks_ai_bridge/vector_search_retriever_tool.py +++ b/src/databricks_ai_bridge/vector_search_retriever_tool.py @@ -150,7 +150,7 @@ def _get_resources( resources.append(DatabricksVectorSearchIndex(index_name=index_name)) if embedding_endpoint: resources.append(DatabricksServingEndpoint(endpoint_name=embedding_endpoint)) - if index_details.is_databricks_managed_embeddings and ( + if index_details and index_details.is_databricks_managed_embeddings and ( managed_embedding := index_details.embedding_source_column.get( "embedding_model_endpoint_name", None ) From 8e882b3afa9fdc72ed63c6cddc346e7b79936922 Mon Sep 17 00:00:00 2001 From: Ann Zhang Date: Mon, 12 May 2025 14:00:54 -0700 Subject: [PATCH 24/27] update Signed-off-by: Ann Zhang --- .github/workflows/main.yml | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index 567abf60..02addfa4 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -113,6 +113,9 @@ jobs: - name: Run tests run: | pytest integrations/langchain/tests/unit_tests/test_vector_search_retriever_tool.py::test_init + pytest integrations/langchain/tests/unit_tests/test_genie.py + pytest integrations/langchain/tests/unit_tests/test_embeddings.py + pytest integrations/langchain/tests/unit_tests/test_chat_models.py openai_test: runs-on: ubuntu-latest @@ -145,8 +148,6 @@ jobs: version: - {ref: "databricks-ai-v0.4.0", name: "v0.4.0"} - {ref: "databricks-ai-v0.3.0", name: "v0.3.0"} - - {ref: "databricks-ai-v0.2.0", name: "v0.2.0"} - - {ref: "databricks-ai-v0.1.0", name: "v0.1.0"} timeout-minutes: 20 steps: - name: Checkout code From 64e20e921851c070dbae06eea65233ff8416b6f0 Mon Sep 17 00:00:00 2001 From: Ann Zhang Date: Mon, 12 May 2025 14:02:51 -0700 Subject: [PATCH 25/27] format Signed-off-by: Ann Zhang --- .github/workflows/main.yml | 1 - .../vector_search_retriever_tool.py | 10 +++++++--- 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index 02addfa4..9a5a2c72 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -82,7 +82,6 @@ jobs: - {ref: "databricks-ai-v0.4.0", name: "v0.4.0"} - {ref: "databricks-ai-v0.3.0", name: "v0.3.0"} - {ref: "databricks-ai-v0.2.0", name: "v0.2.0"} - - {ref: "databricks-ai-v0.1.0", name: "v0.1.0"} timeout-minutes: 20 steps: - name: Checkout code diff --git a/src/databricks_ai_bridge/vector_search_retriever_tool.py b/src/databricks_ai_bridge/vector_search_retriever_tool.py index 812fd6dd..8fab74cd 100644 --- a/src/databricks_ai_bridge/vector_search_retriever_tool.py +++ b/src/databricks_ai_bridge/vector_search_retriever_tool.py @@ -150,9 +150,13 @@ def _get_resources( resources.append(DatabricksVectorSearchIndex(index_name=index_name)) if embedding_endpoint: resources.append(DatabricksServingEndpoint(endpoint_name=embedding_endpoint)) - if index_details and index_details.is_databricks_managed_embeddings and ( - managed_embedding := index_details.embedding_source_column.get( - "embedding_model_endpoint_name", None + if ( + index_details + and index_details.is_databricks_managed_embeddings + and ( + managed_embedding := index_details.embedding_source_column.get( + "embedding_model_endpoint_name", None + ) ) ): if managed_embedding != embedding_endpoint: From ac35b1e7fb56e235500599be80c718e489846ef3 Mon Sep 17 00:00:00 2001 From: Ann Zhang Date: Mon, 12 May 2025 14:23:04 -0700 Subject: [PATCH 26/27] add comment Signed-off-by: Ann Zhang --- .github/workflows/main.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index 9a5a2c72..ce17cf1c 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -111,6 +111,7 @@ jobs: pip install integrations/langchain[dev] - name: Run tests run: | + # Only testing initialization since functionality can change pytest integrations/langchain/tests/unit_tests/test_vector_search_retriever_tool.py::test_init pytest integrations/langchain/tests/unit_tests/test_genie.py pytest integrations/langchain/tests/unit_tests/test_embeddings.py @@ -176,6 +177,7 @@ jobs: pip install integrations/openai[dev] - name: Run tests run: | + # Only testing initialization since functionality can change pytest integrations/openai/tests/unit_tests/test_vector_search_retriever_tool.py::test_vector_search_retriever_tool_init From a988886186f0f67608422aa40ea61dc654034f48 Mon Sep 17 00:00:00 2001 From: Ann Zhang Date: Mon, 12 May 2025 16:37:21 -0700 Subject: [PATCH 27/27] add test Signed-off-by: Ann Zhang --- .../databricks_ai_bridge/utils/test_vector_search.py | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/tests/databricks_ai_bridge/utils/test_vector_search.py b/tests/databricks_ai_bridge/utils/test_vector_search.py index d12d9e9c..399eef64 100644 --- a/tests/databricks_ai_bridge/utils/test_vector_search.py +++ b/tests/databricks_ai_bridge/utils/test_vector_search.py @@ -116,3 +116,14 @@ def test_parse_vector_search_response(retriever_schema, ignore_cols, docs_with_s ) == docs_with_score ) + + +def test_parse_vector_search_response_without_retriever_schema(): + assert ( + parse_vector_search_response(search_resp, text_column="column_1", ignore_cols=["column_2"]) + == construct_docs_with_score( + page_content_column="column_2", + column_3="column_3", + column_4="column_4", + ), + )