Skip to content

Commit 72db53b

Browse files
committed
Refactored get_enhanced_schema_cypher
1 parent c002a08 commit 72db53b

File tree

1 file changed

+187
-130
lines changed

1 file changed

+187
-130
lines changed

src/neo4j_graphrag/schema.py

Lines changed: 187 additions & 130 deletions
Original file line number | Diff line number | Diff line change
@@ -14,7 +14,7 @@
1414
# limitations under the License.
1515
from __future__ import annotations
1616

17-
from typing import Any, Dict, List, Optional
17+
from typing import Any, Dict, List, Optional, Tuple
1818

1919
import neo4j
2020
from neo4j import Query
@@ -454,6 +454,150 @@ def format_schema(schema: Dict[str, Any], is_enhanced: bool) -> str:
454454
)
455455

456456

457+
def build_str_clauses(
458+
prop_name: str,
459+
driver: neo4j.Driver,
460+
label_or_type: str,
461+
exhaustive: bool,
462+
prop_index: Optional[List[Any]] = None,
463+
) -> Tuple[List[str], List[str]]:
464+
"""
465+
Build Cypher clauses for string property statistics.
466+
467+
Constructs and returns the parts of a Cypher query (`WITH` and `RETURN` clauses)
468+
required to gather statistical information about a string property. Depending on
469+
property index metadata and whether the query is exhaustive, this function may
470+
retrieve a distinct set of values directly from an index or a truncated list of
471+
distinct values from the actual nodes or relationships.
472+
473+
Args:
474+
prop_name (str): The name of the string property.
475+
driver (neo4j.Driver): Neo4j Python driver instance.
476+
label_or_type (str): The node label or relationship type to query.
477+
exhaustive (bool): Whether to perform an exhaustive search or a
478+
sampled query approach.
479+
prop_index (Optional[List[Any]]): Optional metadata about the property's
480+
index. If provided, certain optimizations are applied based on
481+
distinct value limits and index availability.
482+
483+
Returns:
484+
Tuple[List[str], List[str]]:
485+
A tuple of two lists. The first list contains the `WITH` clauses, and
486+
the second list contains the corresponding `RETURN` clauses for the
487+
string property.
488+
"""
489+
with_clauses = []
490+
return_clauses = []
491+
if (
492+
not exhaustive
493+
and prop_index
494+
and prop_index[0].get("size") > 0
495+
and prop_index[0].get("distinctValues") <= DISTINCT_VALUE_LIMIT
496+
):
497+
distinct_values = query_database(
498+
driver,
499+
f"CALL apoc.schema.properties.distinct("
500+
f"'{label_or_type}', '{prop_name}') YIELD value",
501+
)[0]["value"]
502+
return_clauses.append(
503+
(f"values: {distinct_values}," f" distinct_count: {len(distinct_values)}")
504+
)
505+
else:
506+
with_clauses.append(
507+
(
508+
f"collect(distinct substring(toString(n.`{prop_name}`)"
509+
f", 0, 50)) AS `{prop_name}_values`"
510+
)
511+
)
512+
if not exhaustive:
513+
return_clauses.append(f"values: `{prop_name}_values`")
514+
else:
515+
return_clauses.append(
516+
(
517+
f"values:`{prop_name}_values`[..{DISTINCT_VALUE_LIMIT}],"
518+
f" distinct_count: size(`{prop_name}_values`)"
519+
)
520+
)
521+
return with_clauses, return_clauses
522+
523+
524+
def build_list_clauses(prop_name: str) -> Tuple[str, str]:
525+
"""
526+
Build Cypher clauses for list property size statistics.
527+
528+
Constructs and returns the parts of a Cypher query (`WITH` and `RETURN` clauses)
529+
that gather minimum and maximum size information for properties that are lists.
530+
These clauses compute the smallest and largest list lengths across the matched
531+
entities.
532+
533+
Args:
534+
prop_name (str): The name of the list property.
535+
536+
Returns:
537+
Tuple[str, str]:
538+
A tuple consisting of a single `WITH` clause (calculating min and max
539+
sizes) and a corresponding `RETURN` clause that references these values.
540+
"""
541+
with_clause = (
542+
f"min(size(n.`{prop_name}`)) AS `{prop_name}_size_min`, "
543+
f"max(size(n.`{prop_name}`)) AS `{prop_name}_size_max`"
544+
)
545+
546+
return_clause = (
547+
f"min_size: `{prop_name}_size_min`, " f"max_size: `{prop_name}_size_max`"
548+
)
549+
return with_clause, return_clause
550+
551+
552+
def build_num_date_clauses(
553+
prop_name: str, exhaustive: bool, prop_index: Optional[List[Any]] = None
554+
) -> Tuple[List[str], List[str]]:
555+
"""
556+
Build Cypher clauses for numeric and date/datetime property statistics.
557+
558+
Constructs and returns the parts of a Cypher query (`WITH` and `RETURN` clauses)
559+
needed to gather statistical information about numeric or date/datetime
560+
properties. Depending on whether there is an available index or an exhaustive
561+
approach is required, this may collect a distinct set of values or compute
562+
minimum, maximum, and distinct counts.
563+
564+
Args:
565+
prop_name (str): The name of the numeric or date/datetime property.
566+
exhaustive (bool): Whether to perform an exhaustive search or a
567+
sampled query approach.
568+
prop_index (Optional[List[Any]]): Optional metadata about the property's
569+
index. If provided and the search is not exhaustive, it can be used
570+
to optimize the retrieval of distinct values.
571+
572+
Returns:
573+
Tuple[List[str], List[str]]:
574+
A tuple of two lists. The first list contains the `WITH` clauses, and
575+
the second list contains the corresponding `RETURN` clauses for the
576+
numeric or date/datetime property.
577+
"""
578+
with_clauses = []
579+
return_clauses = []
580+
if not prop_index and not exhaustive:
581+
with_clauses.append(
582+
f"collect(distinct toString(n.`{prop_name}`)) " f"AS `{prop_name}_values`"
583+
)
584+
return_clauses.append(f"values: `{prop_name}_values`")
585+
else:
586+
with_clauses.append(f"min(n.`{prop_name}`) AS `{prop_name}_min`")
587+
with_clauses.append(f"max(n.`{prop_name}`) AS `{prop_name}_max`")
588+
with_clauses.append(
589+
f"count(distinct n.`{prop_name}`) AS `{prop_name}_distinct`"
590+
)
591+
return_clauses.append(
592+
(
593+
f"min: toString(`{prop_name}_min`), "
594+
f"max: toString(`{prop_name}_max`), "
595+
f"distinct_count: `{prop_name}_distinct`"
596+
)
597+
)
598+
return with_clauses, return_clauses
599+
600+
457601
def get_enhanced_schema_cypher(
458602
driver: neo4j.Driver,
459603
structured_schema: Dict[str, Any],
@@ -494,148 +638,61 @@ def get_enhanced_schema_cypher(
494638
with_clauses = []
495639
return_clauses = []
496640
output_dict = {}
497-
if exhaustive:
498-
for prop in properties:
499-
prop_name = prop["property"]
500-
prop_type = prop["type"]
501-
if prop_type == "STRING":
502-
with_clauses.append(
503-
(
504-
f"collect(distinct substring(toString(n.`{prop_name}`)"
505-
f", 0, 50)) AS `{prop_name}_values`"
506-
)
507-
)
508-
return_clauses.append(
509-
(
510-
f"values:`{prop_name}_values`[..{DISTINCT_VALUE_LIMIT}],"
511-
f" distinct_count: size(`{prop_name}_values`)"
512-
)
513-
)
514-
elif prop_type in [
515-
"INTEGER",
516-
"FLOAT",
517-
"DATE",
518-
"DATE_TIME",
519-
"LOCAL_DATE_TIME",
520-
]:
521-
with_clauses.append(f"min(n.`{prop_name}`) AS `{prop_name}_min`")
522-
with_clauses.append(f"max(n.`{prop_name}`) AS `{prop_name}_max`")
523-
with_clauses.append(
524-
f"count(distinct n.`{prop_name}`) AS `{prop_name}_distinct`"
525-
)
526-
return_clauses.append(
527-
(
528-
f"min: toString(`{prop_name}_min`), "
529-
f"max: toString(`{prop_name}_max`), "
530-
f"distinct_count: `{prop_name}_distinct`"
531-
)
532-
)
533-
elif prop_type == "LIST":
534-
with_clauses.append(
535-
(
536-
f"min(size(n.`{prop_name}`)) AS `{prop_name}_size_min`, "
537-
f"max(size(n.`{prop_name}`)) AS `{prop_name}_size_max`"
538-
)
539-
)
540-
return_clauses.append(
541-
f"min_size: `{prop_name}_size_min`, "
542-
f"max_size: `{prop_name}_size_max`"
543-
)
544-
elif prop_type in ["BOOLEAN", "POINT", "DURATION"]:
545-
continue
546-
output_dict[prop_name] = "{" + return_clauses.pop() + "}"
547-
else:
548-
# Just sample 5 random nodes
641+
if not exhaustive:
642+
# Sample 5 random nodes if not exhaustive
549643
match_clause += " WITH n LIMIT 5"
550-
for prop in properties:
551-
prop_name = prop["property"]
552-
prop_type = prop["type"]
553-
554-
# Check if indexed property, we can still do exhaustive
555-
prop_index = [
644+
# Build the with and return clauses
645+
for prop in properties:
646+
prop_name = prop["property"]
647+
prop_type = prop["type"]
648+
# Check if indexed property, we can still do exhaustive
649+
prop_index = (
650+
[
556651
el
557652
for el in structured_schema["metadata"]["index"]
558653
if el["label"] == label_or_type
559654
and el["properties"] == [prop_name]
560655
and el["type"] == "RANGE"
561656
]
562-
if prop_type == "STRING":
563-
if (
564-
prop_index
565-
and prop_index[0].get("size") > 0
566-
and prop_index[0].get("distinctValues") <= DISTINCT_VALUE_LIMIT
567-
):
568-
distinct_values = query_database(
569-
driver,
570-
f"CALL apoc.schema.properties.distinct("
571-
f"'{label_or_type}', '{prop_name}') YIELD value",
572-
)[0]["value"]
573-
return_clauses.append(
574-
(
575-
f"values: {distinct_values},"
576-
f" distinct_count: {len(distinct_values)}"
577-
)
578-
)
579-
else:
580-
with_clauses.append(
581-
(
582-
f"collect(distinct substring(toString(n.`{prop_name}`)"
583-
f", 0, 50)) AS `{prop_name}_values`"
584-
)
585-
)
586-
return_clauses.append(f"values: `{prop_name}_values`")
587-
elif prop_type in [
588-
"INTEGER",
589-
"FLOAT",
590-
"DATE",
591-
"DATE_TIME",
592-
"LOCAL_DATE_TIME",
593-
]:
594-
if not prop_index:
595-
with_clauses.append(
596-
f"collect(distinct toString(n.`{prop_name}`)) "
597-
f"AS `{prop_name}_values`"
598-
)
599-
return_clauses.append(f"values: `{prop_name}_values`")
600-
else:
601-
with_clauses.append(f"min(n.`{prop_name}`) AS `{prop_name}_min`")
602-
with_clauses.append(f"max(n.`{prop_name}`) AS `{prop_name}_max`")
603-
with_clauses.append(
604-
f"count(distinct n.`{prop_name}`) AS `{prop_name}_distinct`"
605-
)
606-
return_clauses.append(
607-
(
608-
f"min: toString(`{prop_name}_min`), "
609-
f"max: toString(`{prop_name}_max`), "
610-
f"distinct_count: `{prop_name}_distinct`"
611-
)
612-
)
613-
614-
elif prop_type == "LIST":
615-
with_clauses.append(
616-
(
617-
f"min(size(n.`{prop_name}`)) AS `{prop_name}_size_min`, "
618-
f"max(size(n.`{prop_name}`)) AS `{prop_name}_size_max`"
619-
)
620-
)
621-
return_clauses.append(
622-
(
623-
f"min_size: `{prop_name}_size_min`, "
624-
f"max_size: `{prop_name}_size_max`"
625-
)
626-
)
627-
elif prop_type in ["BOOLEAN", "POINT", "DURATION"]:
628-
continue
629-
630-
output_dict[prop_name] = "{" + return_clauses.pop() + "}"
631-
657+
if not exhaustive
658+
else None
659+
)
660+
if prop_type == "STRING":
661+
str_w_clauses, str_r_clauses = build_str_clauses(
662+
prop_name=prop_name,
663+
driver=driver,
664+
label_or_type=label_or_type,
665+
exhaustive=exhaustive,
666+
prop_index=prop_index,
667+
)
668+
with_clauses += str_w_clauses
669+
return_clauses += str_r_clauses
670+
elif prop_type in [
671+
"INTEGER",
672+
"FLOAT",
673+
"DATE",
674+
"DATE_TIME",
675+
"LOCAL_DATE_TIME",
676+
]:
677+
num_date_w_clauses, num_date_r_clauses = build_num_date_clauses(
678+
prop_name=prop_name, exhaustive=exhaustive, prop_index=prop_index
679+
)
680+
with_clauses += num_date_w_clauses
681+
return_clauses += num_date_r_clauses
682+
elif prop_type == "LIST":
683+
list_w_clause, list_r_clause = build_list_clauses(prop_name=prop_name)
684+
with_clauses.append(list_w_clause)
685+
return_clauses.append(list_r_clause)
686+
elif prop_type in ["BOOLEAN", "POINT", "DURATION"]:
687+
continue
688+
output_dict[prop_name] = "{" + return_clauses.pop() + "}"
689+
# Combine with and return clauses
632690
with_clause = "WITH " + ",\n ".join(with_clauses)
633691
return_clause = (
634692
"RETURN {"
635693
+ ", ".join(f"`{k}`: {v}" for k, v in output_dict.items())
636694
+ "} AS output"
637695
)
638-
639696
# Combine all parts of the Cypher query
640697
cypher_query = "\n".join([match_clause, with_clause, return_clause])
641698
return cypher_query

0 commit comments

Comments
 (0)