Commit f25eeb5

Add Fuzzy match resolver and refactor code
1 parent f041b59 commit f25eeb5

File tree: 1 file changed

src/neo4j_graphrag/experimental/components/resolver.py

Lines changed: 113 additions & 48 deletions
@@ -15,15 +15,17 @@
 import abc
 import logging
 from itertools import combinations
-from typing import Any, Optional, List
+from typing import Any, List, Optional

-import neo4j
 import numpy as np
-from numpy.typing import NDArray
+import rapidfuzz.fuzz
 import spacy
-from spacy.language import Language
+from numpy.typing import NDArray
+from rapidfuzz import utils
 from spacy.cli.download import download as spacy_download
+from spacy.language import Language

+import neo4j
 from neo4j_graphrag.experimental.components.types import ResolutionStats
 from neo4j_graphrag.experimental.pipeline import Component
 from neo4j_graphrag.utils import driver_config
@@ -148,34 +150,28 @@ async def run(self) -> ResolutionStats:
         )


-class SpaCySemanticMatchResolver(EntityResolver):
+class BasePropertySimilarityResolver(EntityResolver, abc.ABC):
     """
+    Base class for similarity-based matching of properties for entity resolution.
     Resolve entities with same label and similar set of textual properties (default is
-    ["name"]) based on spaCy's static embeddings and cosine similarities.
+    ["name"]):
+    - Group entities by label
+    - Concatenate the specified textual properties
+    - Compute similarity between each pair
+    - Consolidate overlapping sets
+    - Merge similar nodes via APOC (See apoc.refactor.mergeNodes documentation for more
+      details).
+
+    Subclasses implement `compute_similarity` based on different strategies, and return
+    a similarity score between 0 and 1.

     Args:
         driver (neo4j.Driver): The Neo4j driver to connect to the database.
         filter_query (Optional[str]): Optional Cypher WHERE clause to reduce the resolution scope.
         resolve_properties (Optional[List[str]]): The list of properties to consider for embeddings Defaults to ["name"].
         similarity_threshold (float): The similarity threshold above which nodes are merged. Defaults to 0.8.
-        spacy_model (str): The name of the spaCy model to load. Defaults to "en_core_web_lg".
         neo4j_database (Optional[str]): The name of the Neo4j database. If not provided, this defaults to the server's default database ("neo4j" by default) (`see reference to documentation <https://neo4j.com/docs/operations-manual/current/database-administration/#manage-databases-default>`_).

-    Example:
-
-    .. code-block:: python
-
-        from neo4j import GraphDatabase
-        from neo4j_graphrag.experimental.components.resolver import SinglePropertyExactMatchResolver
-
-        URI = "neo4j://localhost:7687"
-        AUTH = ("neo4j", "password")
-        DATABASE = "neo4j"
-
-        driver = GraphDatabase.driver(URI, auth=AUTH)
-        resolver = SinglePropertyExactMatchResolver(driver=driver, neo4j_database=DATABASE)
-        await resolver.run()  # no expected parameters
-
     """

     def __init__(
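To illustrate the extension point introduced by this hunk (an editorial sketch, not part of the commit), a subclass of BasePropertySimilarityResolver only needs to implement compute_similarity and return a score between 0 and 1; the base class's run() handles grouping, pairwise comparison, consolidation, and the APOC merge. The class name JaccardMatchResolver below is hypothetical:

    # Illustrative sketch only (not part of this commit): a hypothetical subclass
    # showing how the new BasePropertySimilarityResolver contract can be used.
    from neo4j_graphrag.experimental.components.resolver import (
        BasePropertySimilarityResolver,
    )


    class JaccardMatchResolver(BasePropertySimilarityResolver):
        def compute_similarity(self, text_a: str, text_b: str) -> float:
            # Jaccard overlap of lowercased tokens: |A ∩ B| / |A ∪ B|, in [0, 1]
            tokens_a = set(text_a.lower().split())
            tokens_b = set(text_b.lower().split())
            if not tokens_a or not tokens_b:
                return 0.0
            return len(tokens_a & tokens_b) / len(tokens_a | tokens_b)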
@@ -184,22 +180,21 @@ def __init__(
         filter_query: Optional[str] = None,
         resolve_properties: Optional[List[str]] = None,
         similarity_threshold: float = 0.8,
-        spacy_model: str = "en_core_web_lg",
         neo4j_database: Optional[str] = None,
     ) -> None:
         super().__init__(driver, filter_query)
         self.resolve_properties = resolve_properties or ["name"]
         self.similarity_threshold = similarity_threshold
         self.neo4j_database = neo4j_database
-        self.nlp = self._load_or_download_spacy_model(spacy_model)

-    async def run(self) -> ResolutionStats:
-        """Resolve entities based on the following rules:
-        For each entity label, entities with similar 'resolve_properties'
-        (cosine similarity on embedding vectors) are merged into a single node.
-
-        See apoc.refactor.mergeNodes documentation for more details.
+    @abc.abstractmethod
+    def compute_similarity(self, text_a: str, text_b: str) -> float:
         """
+        Compute similarity between two textual strings.
+        """
+        pass
+
+    async def run(self) -> ResolutionStats:
         match_query = "MATCH (entity:__Entity__)"
         if self.filter_query:
             match_query += f" {self.filter_query}"
@@ -212,7 +207,7 @@ async def run(self) -> ResolutionStats:
         # matches extracted entities
         # filters entities if filter_query is provided
         # unwinds labels to skip reserved ones
-        # collects all properties needed for embeddings
+        # collects all properties needed for the calculation of similarity
         query = f"""
         {match_query}
         UNWIND labels(entity) AS lab
@@ -224,41 +219,37 @@ async def run(self) -> ResolutionStats:

         records, _, _ = self.driver.execute_query(query, database_=self.neo4j_database)

-        total_entities_embedded = 0
+        total_entities = 0
         total_merged_nodes = 0

         # for each row, 'lab' is the label, 'labelCluster' is a list of dicts (id + textual properties)
         for row in records:
             entities = row["labelCluster"]

-            # build node embeddings
-            node_embeddings = {}
+            node_texts = {}
             for ent in entities:
                 # concatenate all textual properties (if non-null) into a single string
                 texts = [
                     str(ent[p]) for p in self.resolve_properties if p in ent and ent[p]
                 ]
                 combined_text = " ".join(texts).strip()
                 if combined_text:
-                    node_embeddings[ent["id"]] = self.nlp(combined_text).vector
-            total_entities_embedded += len(node_embeddings)
+                    node_texts[ent["id"]] = combined_text
+            total_entities += len(node_texts)

-            # identify pairs to merge
+            # compute pairwise similarity and mark those above the threshold
             pairs_to_merge = []
-            for (id1, emb1), (id2, emb2) in combinations(node_embeddings.items(), 2):
-                sim = self._cosine_similarity(
-                    np.asarray(emb1, dtype=np.float64),
-                    np.asarray(emb2, dtype=np.float64),
-                )
+            for (id1, text1), (id2, text2) in combinations(node_texts.items(), 2):
+                sim = self.compute_similarity(text1, text2)
                 if sim >= self.similarity_threshold:
                     pairs_to_merge.append({id1, id2})

-            # consolidate overlapping sets of node IDs
-            resolved_sets = self._consolidate_sets(pairs_to_merge)
+            # consolidate overlapping pairs into unique merge sets.
+            merged_sets = self._consolidate_sets(pairs_to_merge)

-            # perform merges in the db using APOC
+            # perform merges in the db using APOC.
             merged_count = 0
-            for node_id_set in resolved_sets:
+            for node_id_set in merged_sets:
                 if len(node_id_set) > 1:
                     merge_query = (
                         "MATCH (n) WHERE id(n) IN $ids "
@@ -272,11 +263,10 @@ async def run(self) -> ResolutionStats:
                         database_=self.neo4j_database,
                     )
                     merged_count += len(result)
-
             total_merged_nodes += merged_count

         return ResolutionStats(
-            number_of_nodes_to_resolve=total_entities_embedded,
+            number_of_nodes_to_resolve=total_entities,
             number_of_created_nodes=total_merged_nodes,
         )

@@ -296,6 +286,63 @@ def _consolidate_sets(pairs: List[set[str]]) -> List[set[str]]:
                 consolidated.append(set(pair))
         return consolidated

+
+class SpaCySemanticMatchResolver(BasePropertySimilarityResolver):
+    """
+    Resolve entities with same label and similar set of textual properties (default is
+    ["name"]) based on spaCy's static embeddings and cosine similarities.
+
+    Args:
+        driver (neo4j.Driver): The Neo4j driver to connect to the database.
+        filter_query (Optional[str]): Optional Cypher WHERE clause to reduce the resolution scope.
+        resolve_properties (Optional[List[str]]): The list of properties to consider for embeddings Defaults to ["name"].
+        similarity_threshold (float): The similarity threshold above which nodes are merged. Defaults to 0.8.
+        spacy_model (str): The name of the spaCy model to load. Defaults to "en_core_web_lg".
+        neo4j_database (Optional[str]): The name of the Neo4j database. If not provided, this defaults to the server's default database ("neo4j" by default) (`see reference to documentation <https://neo4j.com/docs/operations-manual/current/database-administration/#manage-databases-default>`_).
+
+    Example:
+
+    .. code-block:: python
+
+        from neo4j import GraphDatabase
+        from neo4j_graphrag.experimental.components.resolver import SpaCySemanticMatchResolver
+
+        URI = "neo4j://localhost:7687"
+        AUTH = ("neo4j", "password")
+        DATABASE = "neo4j"
+
+        driver = GraphDatabase.driver(URI, auth=AUTH)
+        resolver = SpaCySemanticMatchResolver(driver=driver, neo4j_database=DATABASE)
+        await resolver.run()  # no expected parameters
+
+    """
+
+    def __init__(
+        self,
+        driver: neo4j.Driver,
+        filter_query: Optional[str] = None,
+        resolve_properties: Optional[List[str]] = None,
+        similarity_threshold: float = 0.8,
+        spacy_model: str = "en_core_web_lg",
+        neo4j_database: Optional[str] = None,
+    ) -> None:
+        super().__init__(
+            driver,
+            filter_query,
+            resolve_properties,
+            similarity_threshold,
+            neo4j_database,
+        )
+        self.nlp = self._load_or_download_spacy_model(spacy_model)
+
+    def compute_similarity(self, text_a: str, text_b: str) -> float:
+        emb1 = self.nlp(text_a).vector
+        emb2 = self.nlp(text_b).vector
+        sim = self._cosine_similarity(
+            np.asarray(emb1, dtype=np.float64), np.asarray(emb2, dtype=np.float64)
+        )
+        return sim
+
     @staticmethod
     def _cosine_similarity(
         vec1: NDArray[np.float64], vec2: NDArray[np.float64]
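compute_similarity above delegates to the existing _cosine_similarity static method, whose body lies outside this diff. A typical implementation of that formula, sim = (v1 · v2) / (||v1|| ||v2||), looks like the following sketch (an assumption for illustration, not necessarily the code in the file):

    import numpy as np
    from numpy.typing import NDArray


    def cosine_similarity(vec1: NDArray[np.float64], vec2: NDArray[np.float64]) -> float:
        # Cosine similarity: dot product divided by the product of the norms.
        # Returning 0.0 when either vector is all zeros avoids division by zero
        # (a defensive choice made for this sketch).
        norm_product = np.linalg.norm(vec1) * np.linalg.norm(vec2)
        if norm_product == 0:
            return 0.0
        return float(np.dot(vec1, vec2) / norm_product)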
@@ -324,3 +371,21 @@ def _load_or_download_spacy_model(model_name: str) -> Language:
             return spacy.load(model_name)
         else:
             raise e
+
+
+class FuzzyMatchResolver(BasePropertySimilarityResolver):
+    """
+    Resolve entities with the same label and similar set of textual properties using
+    RapidFuzz for fuzzy matching. Similarity scores are normalized to a value between 0
+    and 1.
+    """
+
+    def compute_similarity(self, text_a: str, text_b: str) -> float:
+        # RapidFuzz's fuzz.WRatio returns a score from 0 to 100
+        # normalize the input strings before the comparison is done (processor=utils.default_process)
+        # e.g., lowercase the text, strip whitespace, and remove punctuation
+        # normalize the score to the 0..1 range
+        return (
+            rapidfuzz.fuzz.WRatio(text_a, text_b, processor=utils.default_process)
+            / 100.0
+        )
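The new FuzzyMatchResolver is used the same way as the SpaCySemanticMatchResolver example in the docstring above; a usage sketch follows (the connection details are placeholders mirroring that example, not values from this commit):

    from neo4j import GraphDatabase
    from neo4j_graphrag.experimental.components.resolver import FuzzyMatchResolver

    # Placeholder connection details, mirroring the docstring example above.
    URI = "neo4j://localhost:7687"
    AUTH = ("neo4j", "password")
    DATABASE = "neo4j"

    driver = GraphDatabase.driver(URI, auth=AUTH)
    resolver = FuzzyMatchResolver(driver=driver, neo4j_database=DATABASE)
    await resolver.run()  # no expected parameters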

0 commit comments