
Commit 80791e8

Merge pull request #137 from anaregdesign/feature/similarity

feature: similarity

2 parents 2dda416 + 16c19fc

4 files changed, +94 -1 lines changed


src/openaivec/pandas_ext.py

Lines changed: 12 additions & 0 deletions

@@ -37,6 +37,7 @@
 import logging
 from typing import Awaitable, Callable, Type, TypeVar
 
+import numpy as np
 import pandas as pd
 from openai import AsyncAzureOpenAI, AsyncOpenAI, AzureOpenAI, OpenAI
 from pydantic import BaseModel
@@ -67,6 +68,11 @@
 _TIKTOKEN_ENCODING = tiktoken.encoding_for_model(_RESPONSES_MODEL_NAME)
 
 
+# Internal helper for accessing the .ai accessor in Spark UDFs.
+def _wakeup() -> None:
+    pass
+
+
 def use(client: OpenAI) -> None:
     """Register a custom OpenAI‑compatible client.
 
@@ -460,6 +466,12 @@ def responses(
             )
         )
 
+    def similarity(self, col1: str, col2: str) -> pd.Series:
+        return self._obj.apply(
+            lambda row: np.dot(row[col1], row[col2]) / (np.linalg.norm(row[col1]) * np.linalg.norm(row[col2])),
+            axis=1,
+        ).rename("similarity")
+
 
 @pd.api.extensions.register_series_accessor("aio")
 class AsyncOpenAIVecSeriesAccessor:
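
For context, a minimal usage sketch of the new DataFrame-level similarity method. It assumes the embeddings are already materialized as NumPy arrays in two columns, as in the unit tests further down; the import used to register the .ai accessor is an assumption.

    import numpy as np
    import pandas as pd

    from openaivec import pandas_ext  # noqa: F401  (assumed import path; registers the .ai accessor)

    df = pd.DataFrame(
        {
            "left": [np.array([1.0, 0.0]), np.array([0.0, 1.0])],
            "right": [np.array([1.0, 0.0]), np.array([1.0, 1.0])],
        }
    )

    # Row-wise cosine similarity between the two vector columns,
    # returned as a Series named "similarity".
    scores = df.ai.similarity("left", "right")
    print(scores.tolist())  # [1.0, 0.7071...]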

src/openaivec/spark.py

Lines changed: 18 additions & 0 deletions

@@ -458,3 +458,21 @@ def fn(col: Iterator[pd.Series]) -> Iterator[pd.Series]:
         yield part.map(lambda x: len(_TIKTOKEN_ENC.encode(x)) if isinstance(x, str) else 0)
 
     return fn
+
+
+def similarity_udf() -> UserDefinedFunction:
+    @pandas_udf(FloatType())
+    def fn(a: pd.Series, b: pd.Series) -> pd.Series:
+        """Compute cosine similarity between two vectors.
+
+        Args:
+            a: First vector.
+            b: Second vector.
+
+        Returns:
+            Cosine similarity between the two vectors.
+        """
+        pandas_ext._wakeup()
+        return pd.DataFrame({"a": a, "b": b}).ai.similarity("a", "b")
+
+    return fn
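
The Spark UDF delegates to the pandas accessor above. A minimal sketch of registering and calling it from Spark SQL, mirroring the test further down; the column names and local SparkSession setup are illustrative:

    from pyspark.sql import SparkSession

    from openaivec.spark import similarity_udf

    spark = SparkSession.builder.getOrCreate()
    spark.udf.register("similarity", similarity_udf())

    df = spark.createDataFrame(
        [(1, [0.1, 0.2, 0.3], [0.3, 0.2, 0.1])],
        ["id", "a", "b"],
    )
    df.createOrReplaceTempView("pairs")

    # Row-wise cosine similarity of the two array columns.
    spark.sql("SELECT id, similarity(a, b) AS score FROM pairs").show()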

tests/test_pandas_ext.py

Lines changed: 30 additions & 0 deletions

@@ -203,3 +203,33 @@ def test_count_tokens(self):
 
         # assert all values are elements of int
         self.assertTrue(all(isinstance(num_token, int) for num_token in num_tokens))
+
+    def test_similarity(self):
+        sample_df = pd.DataFrame(
+            {
+                "vector1": [np.array([1, 0]), np.array([0, 1]), np.array([1, 1])],
+                "vector2": [np.array([1, 0]), np.array([0, 1]), np.array([1, -1])],
+            }
+        )
+        similarity_scores = sample_df.ai.similarity("vector1", "vector2")
+
+        # Expected cosine similarity values
+        expected_scores = [
+            1.0,  # Cosine similarity between [1, 0] and [1, 0]
+            1.0,  # Cosine similarity between [0, 1] and [0, 1]
+            0.0,  # Cosine similarity between [1, 1] and [1, -1]
+        ]
+
+        # Assert similarity scores match expected values
+        self.assertTrue(np.allclose(similarity_scores, expected_scores))
+
+    def test_similarity_with_invalid_vectors(self):
+        sample_df = pd.DataFrame(
+            {
+                "vector1": [np.array([1, 0]), "invalid", np.array([1, 1])],
+                "vector2": [np.array([1, 0]), np.array([0, 1]), np.array([1, -1])],
+            }
+        )
+
+        with self.assertRaises(TypeError):
+            sample_df.ai.similarity("vector1", "vector2")

tests/test_spark.py

Lines changed: 34 additions & 1 deletion

@@ -6,7 +6,13 @@
 from pyspark.sql.session import SparkSession
 from pyspark.sql.types import ArrayType, FloatType, IntegerType, StringType, StructField, StructType
 
-from openaivec.spark import EmbeddingsUDFBuilder, ResponsesUDFBuilder, _pydantic_to_spark_schema, count_tokens_udf
+from openaivec.spark import (
+    EmbeddingsUDFBuilder,
+    ResponsesUDFBuilder,
+    _pydantic_to_spark_schema,
+    count_tokens_udf,
+    similarity_udf,
+)
 
 
 class TestUDFBuilder(TestCase):
@@ -143,3 +149,30 @@ def test_count_token(self):
             SELECT sentence, count_tokens(sentence) as token_count from sentences
             """
         ).show(truncate=False)
+
+
+class TestSimilarityUDF(TestCase):
+    def setUp(self):
+        self.spark: SparkSession = SparkSession.builder.getOrCreate()
+        self.spark.sparkContext.setLogLevel("INFO")
+        self.spark.udf.register("similarity", similarity_udf())
+
+    def test_similarity(self):
+        df = self.spark.createDataFrame(
+            [
+                (1, [0.1, 0.2, 0.3]),
+                (2, [0.4, 0.5, 0.6]),
+                (3, [0.7, 0.8, 0.9]),
+            ],
+            ["id", "vector"],
+        )
+        df.createOrReplaceTempView("vectors")
+        result_df = self.spark.sql(
+            """
+            SELECT id, similarity(vector, vector) as similarity_score
+            FROM vectors
+            """
+        )
+        result_df.show(truncate=False)
+        df_pandas = result_df.toPandas()
+        assert df_pandas.shape == (3, 2)
