|
14 | 14 | # limitations under the License.
|
15 | 15 | from __future__ import annotations
|
16 | 16 |
|
17 |
| -from typing import Any, Dict, List, Optional |
| 17 | +from typing import Any, Dict, List, Optional, Tuple |
18 | 18 |
|
19 | 19 | import neo4j
|
20 | 20 | from neo4j import Query
|
@@ -454,6 +454,150 @@ def format_schema(schema: Dict[str, Any], is_enhanced: bool) -> str:
|
454 | 454 | )
|
455 | 455 |
|
456 | 456 |
|
def build_str_clauses(
    prop_name: str,
    driver: neo4j.Driver,
    label_or_type: str,
    exhaustive: bool,
    prop_index: Optional[List[Any]] = None,
) -> Tuple[List[str], List[str]]:
    """
    Build Cypher clauses for string property statistics.

    Two strategies are used:

    * Index fast path: taken only when ``exhaustive`` is false, ``prop_index``
      is non-empty, and its first entry reports a positive ``size`` and a
      ``distinctValues`` no greater than ``DISTINCT_VALUE_LIMIT``. The distinct
      values are fetched immediately through ``query_database`` using
      ``apoc.schema.properties.distinct`` and embedded as a literal in the
      `RETURN` clause, so no `WITH` clause is produced.
    * Collect path: in every other case a `WITH` clause collects the distinct
      values of the property, each converted to a string and cut to its first
      50 characters. When ``exhaustive`` is true the `RETURN` clause slices
      that list to ``DISTINCT_VALUE_LIMIT`` entries and adds a
      ``distinct_count``; otherwise it returns the collected list as is.

    If the first index entry lacks ``size`` or ``distinctValues`` (``.get``
    yields ``None``), the fast path is skipped and the collect path is used
    instead of raising ``TypeError`` on a ``None`` comparison.

    Args:
        prop_name (str): The name of the string property.
        driver (neo4j.Driver): Neo4j Python driver instance, only used on the
            index fast path.
        label_or_type (str): The node label or relationship type to query.
        exhaustive (bool): Whether to build the exhaustive form of the clauses.
        prop_index (Optional[List[Any]]): Optional index metadata for the
            property; only the first entry is inspected and it must support
            ``.get`` (presumably a dict; verify against caller).

    Returns:
        Tuple[List[str], List[str]]:
            A tuple of two lists. The first list contains the `WITH` clauses
            (empty on the index fast path), and the second list contains the
            single `RETURN` clause fragment for the string property.
    """
    with_clauses: List[str] = []
    return_clauses: List[str] = []

    # Decide whether the index fast path applies. Missing metadata keys are
    # treated as "index not usable" rather than compared against None.
    use_index = False
    if not exhaustive and prop_index:
        index_meta = prop_index[0]
        index_size = index_meta.get("size")
        distinct_total = index_meta.get("distinctValues")
        use_index = (
            index_size is not None
            and index_size > 0
            and distinct_total is not None
            and distinct_total <= DISTINCT_VALUE_LIMIT
        )

    if use_index:
        # NOTE(review): label_or_type and prop_name are interpolated into the
        # query text unescaped; confirm they always come from trusted schema
        # metadata.
        distinct_values = query_database(
            driver,
            f"CALL apoc.schema.properties.distinct("
            f"'{label_or_type}', '{prop_name}') YIELD value",
        )[0]["value"]
        # The fetched values are embedded via their Python formatting.
        return_clauses.append(
            f"values: {distinct_values}, distinct_count: {len(distinct_values)}"
        )
        return with_clauses, return_clauses

    with_clauses.append(
        f"collect(distinct substring(toString(n.`{prop_name}`)"
        f", 0, 50)) AS `{prop_name}_values`"
    )
    if not exhaustive:
        return_clauses.append(f"values: `{prop_name}_values`")
    else:
        return_clauses.append(
            f"values:`{prop_name}_values`[..{DISTINCT_VALUE_LIMIT}],"
            f" distinct_count: size(`{prop_name}_values`)"
        )
    return with_clauses, return_clauses
| 522 | + |
| 523 | + |
def build_list_clauses(prop_name: str) -> Tuple[str, str]:
    """
    Build Cypher clauses for list property size statistics.

    Produces one `WITH` fragment that aggregates the minimum and maximum of
    ``size(n.<prop_name>)`` under the aliases ``<prop_name>_size_min`` and
    ``<prop_name>_size_max``, and one `RETURN` fragment that exposes those
    aliases as ``min_size`` and ``max_size``.

    Args:
        prop_name (str): The name of the list property.

    Returns:
        Tuple[str, str]:
            The `WITH` fragment followed by the matching `RETURN` fragment.
            Both are plain strings, not lists.
    """
    # Shared pieces of the two fragments, built once.
    size_expr = f"size(n.`{prop_name}`)"
    min_alias = f"`{prop_name}_size_min`"
    max_alias = f"`{prop_name}_size_max`"

    aggregations = [
        f"min({size_expr}) AS {min_alias}",
        f"max({size_expr}) AS {max_alias}",
    ]
    projections = [
        f"min_size: {min_alias}",
        f"max_size: {max_alias}",
    ]
    return ", ".join(aggregations), ", ".join(projections)
| 550 | + |
| 551 | + |
def build_num_date_clauses(
    prop_name: str, exhaustive: bool, prop_index: Optional[List[Any]] = None
) -> Tuple[List[str], List[str]]:
    """
    Build Cypher clauses for numeric and date/datetime property statistics.

    When ``exhaustive`` is true or ``prop_index`` is non-empty, the clauses
    aggregate the minimum, maximum and distinct count of the property; the
    minimum and maximum are converted with ``toString`` in the `RETURN`
    fragment. Otherwise the clauses simply collect the distinct values of the
    property, each converted with ``toString``.

    Args:
        prop_name (str): The name of the numeric or date/datetime property.
        exhaustive (bool): Whether to build the exhaustive form of the clauses.
        prop_index (Optional[List[Any]]): Optional index metadata for the
            property. Only its truthiness is checked here.

    Returns:
        Tuple[List[str], List[str]]:
            A tuple of two lists. The first list contains the `WITH` clauses
            (three for the aggregate form, one for the collect form), and the
            second list contains the single `RETURN` clause fragment.
    """
    prop_ref = f"n.`{prop_name}`"

    if exhaustive or prop_index:
        # Aggregate form: min / max / distinct count over the property.
        aggregate_with = [
            f"min({prop_ref}) AS `{prop_name}_min`",
            f"max({prop_ref}) AS `{prop_name}_max`",
            f"count(distinct {prop_ref}) AS `{prop_name}_distinct`",
        ]
        aggregate_return = ", ".join(
            [
                f"min: toString(`{prop_name}_min`)",
                f"max: toString(`{prop_name}_max`)",
                f"distinct_count: `{prop_name}_distinct`",
            ]
        )
        return aggregate_with, [aggregate_return]

    # Collect form: no index and not exhaustive.
    values_alias = f"`{prop_name}_values`"
    collect_with = [f"collect(distinct toString({prop_ref})) AS {values_alias}"]
    collect_return = [f"values: {values_alias}"]
    return collect_with, collect_return
| 599 | + |
| 600 | + |
457 | 601 | def get_enhanced_schema_cypher(
|
458 | 602 | driver: neo4j.Driver,
|
459 | 603 | structured_schema: Dict[str, Any],
|
@@ -494,148 +638,61 @@ def get_enhanced_schema_cypher(
|
494 | 638 | with_clauses = []
|
495 | 639 | return_clauses = []
|
496 | 640 | output_dict = {}
|
497 |
| - if exhaustive: |
498 |
| - for prop in properties: |
499 |
| - prop_name = prop["property"] |
500 |
| - prop_type = prop["type"] |
501 |
| - if prop_type == "STRING": |
502 |
| - with_clauses.append( |
503 |
| - ( |
504 |
| - f"collect(distinct substring(toString(n.`{prop_name}`)" |
505 |
| - f", 0, 50)) AS `{prop_name}_values`" |
506 |
| - ) |
507 |
| - ) |
508 |
| - return_clauses.append( |
509 |
| - ( |
510 |
| - f"values:`{prop_name}_values`[..{DISTINCT_VALUE_LIMIT}]," |
511 |
| - f" distinct_count: size(`{prop_name}_values`)" |
512 |
| - ) |
513 |
| - ) |
514 |
| - elif prop_type in [ |
515 |
| - "INTEGER", |
516 |
| - "FLOAT", |
517 |
| - "DATE", |
518 |
| - "DATE_TIME", |
519 |
| - "LOCAL_DATE_TIME", |
520 |
| - ]: |
521 |
| - with_clauses.append(f"min(n.`{prop_name}`) AS `{prop_name}_min`") |
522 |
| - with_clauses.append(f"max(n.`{prop_name}`) AS `{prop_name}_max`") |
523 |
| - with_clauses.append( |
524 |
| - f"count(distinct n.`{prop_name}`) AS `{prop_name}_distinct`" |
525 |
| - ) |
526 |
| - return_clauses.append( |
527 |
| - ( |
528 |
| - f"min: toString(`{prop_name}_min`), " |
529 |
| - f"max: toString(`{prop_name}_max`), " |
530 |
| - f"distinct_count: `{prop_name}_distinct`" |
531 |
| - ) |
532 |
| - ) |
533 |
| - elif prop_type == "LIST": |
534 |
| - with_clauses.append( |
535 |
| - ( |
536 |
| - f"min(size(n.`{prop_name}`)) AS `{prop_name}_size_min`, " |
537 |
| - f"max(size(n.`{prop_name}`)) AS `{prop_name}_size_max`" |
538 |
| - ) |
539 |
| - ) |
540 |
| - return_clauses.append( |
541 |
| - f"min_size: `{prop_name}_size_min`, " |
542 |
| - f"max_size: `{prop_name}_size_max`" |
543 |
| - ) |
544 |
| - elif prop_type in ["BOOLEAN", "POINT", "DURATION"]: |
545 |
| - continue |
546 |
| - output_dict[prop_name] = "{" + return_clauses.pop() + "}" |
547 |
| - else: |
548 |
| - # Just sample 5 random nodes |
| 641 | + if not exhaustive: |
| 642 | + # Sample 5 random nodes if not exhaustive |
549 | 643 | match_clause += " WITH n LIMIT 5"
|
550 |
| - for prop in properties: |
551 |
| - prop_name = prop["property"] |
552 |
| - prop_type = prop["type"] |
553 |
| - |
554 |
| - # Check if indexed property, we can still do exhaustive |
555 |
| - prop_index = [ |
| 644 | + # Build the with and return clauses |
| 645 | + for prop in properties: |
| 646 | + prop_name = prop["property"] |
| 647 | + prop_type = prop["type"] |
| 648 | + # Check if indexed property, we can still do exhaustive |
| 649 | + prop_index = ( |
| 650 | + [ |
556 | 651 | el
|
557 | 652 | for el in structured_schema["metadata"]["index"]
|
558 | 653 | if el["label"] == label_or_type
|
559 | 654 | and el["properties"] == [prop_name]
|
560 | 655 | and el["type"] == "RANGE"
|
561 | 656 | ]
|
562 |
| - if prop_type == "STRING": |
563 |
| - if ( |
564 |
| - prop_index |
565 |
| - and prop_index[0].get("size") > 0 |
566 |
| - and prop_index[0].get("distinctValues") <= DISTINCT_VALUE_LIMIT |
567 |
| - ): |
568 |
| - distinct_values = query_database( |
569 |
| - driver, |
570 |
| - f"CALL apoc.schema.properties.distinct(" |
571 |
| - f"'{label_or_type}', '{prop_name}') YIELD value", |
572 |
| - )[0]["value"] |
573 |
| - return_clauses.append( |
574 |
| - ( |
575 |
| - f"values: {distinct_values}," |
576 |
| - f" distinct_count: {len(distinct_values)}" |
577 |
| - ) |
578 |
| - ) |
579 |
| - else: |
580 |
| - with_clauses.append( |
581 |
| - ( |
582 |
| - f"collect(distinct substring(toString(n.`{prop_name}`)" |
583 |
| - f", 0, 50)) AS `{prop_name}_values`" |
584 |
| - ) |
585 |
| - ) |
586 |
| - return_clauses.append(f"values: `{prop_name}_values`") |
587 |
| - elif prop_type in [ |
588 |
| - "INTEGER", |
589 |
| - "FLOAT", |
590 |
| - "DATE", |
591 |
| - "DATE_TIME", |
592 |
| - "LOCAL_DATE_TIME", |
593 |
| - ]: |
594 |
| - if not prop_index: |
595 |
| - with_clauses.append( |
596 |
| - f"collect(distinct toString(n.`{prop_name}`)) " |
597 |
| - f"AS `{prop_name}_values`" |
598 |
| - ) |
599 |
| - return_clauses.append(f"values: `{prop_name}_values`") |
600 |
| - else: |
601 |
| - with_clauses.append(f"min(n.`{prop_name}`) AS `{prop_name}_min`") |
602 |
| - with_clauses.append(f"max(n.`{prop_name}`) AS `{prop_name}_max`") |
603 |
| - with_clauses.append( |
604 |
| - f"count(distinct n.`{prop_name}`) AS `{prop_name}_distinct`" |
605 |
| - ) |
606 |
| - return_clauses.append( |
607 |
| - ( |
608 |
| - f"min: toString(`{prop_name}_min`), " |
609 |
| - f"max: toString(`{prop_name}_max`), " |
610 |
| - f"distinct_count: `{prop_name}_distinct`" |
611 |
| - ) |
612 |
| - ) |
613 |
| - |
614 |
| - elif prop_type == "LIST": |
615 |
| - with_clauses.append( |
616 |
| - ( |
617 |
| - f"min(size(n.`{prop_name}`)) AS `{prop_name}_size_min`, " |
618 |
| - f"max(size(n.`{prop_name}`)) AS `{prop_name}_size_max`" |
619 |
| - ) |
620 |
| - ) |
621 |
| - return_clauses.append( |
622 |
| - ( |
623 |
| - f"min_size: `{prop_name}_size_min`, " |
624 |
| - f"max_size: `{prop_name}_size_max`" |
625 |
| - ) |
626 |
| - ) |
627 |
| - elif prop_type in ["BOOLEAN", "POINT", "DURATION"]: |
628 |
| - continue |
629 |
| - |
630 |
| - output_dict[prop_name] = "{" + return_clauses.pop() + "}" |
631 |
| - |
| 657 | + if not exhaustive |
| 658 | + else None |
| 659 | + ) |
| 660 | + if prop_type == "STRING": |
| 661 | + str_w_clauses, str_r_clauses = build_str_clauses( |
| 662 | + prop_name=prop_name, |
| 663 | + driver=driver, |
| 664 | + label_or_type=label_or_type, |
| 665 | + exhaustive=exhaustive, |
| 666 | + prop_index=prop_index, |
| 667 | + ) |
| 668 | + with_clauses += str_w_clauses |
| 669 | + return_clauses += str_r_clauses |
| 670 | + elif prop_type in [ |
| 671 | + "INTEGER", |
| 672 | + "FLOAT", |
| 673 | + "DATE", |
| 674 | + "DATE_TIME", |
| 675 | + "LOCAL_DATE_TIME", |
| 676 | + ]: |
| 677 | + num_date_w_clauses, num_date_r_clauses = build_num_date_clauses( |
| 678 | + prop_name=prop_name, exhaustive=exhaustive, prop_index=prop_index |
| 679 | + ) |
| 680 | + with_clauses += num_date_w_clauses |
| 681 | + return_clauses += num_date_r_clauses |
| 682 | + elif prop_type == "LIST": |
| 683 | + list_w_clause, list_r_clause = build_list_clauses(prop_name=prop_name) |
| 684 | + with_clauses.append(list_w_clause) |
| 685 | + return_clauses.append(list_r_clause) |
| 686 | + elif prop_type in ["BOOLEAN", "POINT", "DURATION"]: |
| 687 | + continue |
| 688 | + output_dict[prop_name] = "{" + return_clauses.pop() + "}" |
| 689 | + # Combine with and return clauses |
632 | 690 | with_clause = "WITH " + ",\n ".join(with_clauses)
|
633 | 691 | return_clause = (
|
634 | 692 | "RETURN {"
|
635 | 693 | + ", ".join(f"`{k}`: {v}" for k, v in output_dict.items())
|
636 | 694 | + "} AS output"
|
637 | 695 | )
|
638 |
| - |
639 | 696 | # Combine all parts of the Cypher query
|
640 | 697 | cypher_query = "\n".join([match_clause, with_clause, return_clause])
|
641 | 698 | return cypher_query
|
|
0 commit comments