|
11 | 11 | import sys
|
12 | 12 | import os
|
13 | 13 | import logging
|
| 14 | +import hashlib |
| 15 | +import uuid |
14 | 16 | from enum import Enum
|
| 17 | +from collections import Counter |
15 | 18 |
|
16 | 19 | import attr
|
17 | 20 | from license_expression import combine_expressions
|
|
23 | 26 | from licensedcode.match import LicenseMatch
|
24 | 27 | from licensedcode.match import set_matched_lines
|
25 | 28 | from licensedcode.models import Rule
|
| 29 | +from licensedcode.models import BasicRule |
26 | 30 | from licensedcode.models import compute_relevance
|
27 | 31 | from licensedcode.spans import Span
|
28 | 32 | from licensedcode.tokenize import query_tokenizer
|
@@ -263,24 +267,39 @@ def identifier(self):
|
263 | 267 | """
|
264 | 268 | data = []
|
265 | 269 | for match in self.matches:
|
266 |
| - tokenized_matched_text = tuple(query_tokenizer(match['matched_text'])) |
267 |
| - identifier = ( |
268 |
| - match['rule_identifier'], |
269 |
| - match['match_coverage'], |
270 |
| - tokenized_matched_text, |
271 |
| - ) |
| 270 | + if isinstance(match, dict): |
| 271 | + tokenized_matched_text = tuple(query_tokenizer(match['matched_text'])) |
| 272 | + identifier = ( |
| 273 | + match['rule_identifier'], |
| 274 | + match['score'], |
| 275 | + tokenized_matched_text, |
| 276 | + ) |
| 277 | + else: |
| 278 | + tokenized_matched_text = tuple(query_tokenizer(match.matched_text)) |
| 279 | + identifier = ( |
| 280 | + match.identifier, |
| 281 | + match.score(), |
| 282 | + tokenized_matched_text, |
| 283 | + ) |
272 | 284 | data.append(identifier)
|
273 | 285 |
|
274 |
| - # Return a positive hash value for the tuple |
275 |
| - return tuple(data).__hash__() % ((sys.maxsize + 1) * 2) |
276 |
| - |
| 286 | + # Return a uuid generated from the contents of the matches |
| 287 | + identifier_string = repr(tuple(data)) |
| 288 | + md_hash = hashlib.md5() |
| 289 | + md_hash.update(identifier_string.encode('utf-8')) |
| 290 | + return str(uuid.UUID(md_hash.hexdigest())) |
| 291 | + |
def get_start_end_line(self):
    """
    Return a (start_line, end_line) tuple spanning all the license
    match(es) of this detection: the smallest start line and the largest
    end line found in `self.matches`.

    The matches are read as mappings when the first match is a dict, and
    as objects with `start_line`/`end_line` attributes otherwise.
    """
    matches = self.matches

    # The type of the first match decides how every match is read.
    if isinstance(matches[0], dict):
        def read(match, field):
            return match[field]
    else:
        read = getattr

    first_line = min([read(match, 'start_line') for match in matches])
    last_line = max([read(match, 'end_line') for match in matches])
    return first_line, last_line
|
285 | 304 |
|
286 | 305 | def rules_length(self):
|
@@ -432,6 +451,222 @@ def dict_fields(attr, value):
|
432 | 451 | return detection
|
433 | 452 |
|
434 | 453 |
|
| 454 | + |
@attr.s
class LicenseDetectionFromResult(LicenseDetection):
    """
    A LicenseDetection rebuilt from a LicenseDetection mapping, i.e. from
    scan results data rather than from a live detection. Its `matches` are
    LicenseMatchFromResult objects, which are themselves created from data
    mappings and do not carry the input text/spans.
    """

    @classmethod
    def from_license_detection_mapping(cls, license_detection_mapping, file_path):
        """
        Return a detection built from the `license_detection_mapping`
        results mapping. The `file_region` is set after construction by
        calling `get_file_region()` with `file_path`.
        """
        mapping = license_detection_mapping

        # Convert the match mappings first: the detection needs match objects.
        matches = matches_from_license_match_mappings(
            license_match_mappings=mapping["matches"],
        )

        detection = cls(
            license_expression=mapping["license_expression"],
            detection_log=mapping["detection_log"],
            matches=matches,
            file_region=None,
        )
        detection.file_region = detection.get_file_region(path=file_path)
        return detection
| 480 | + |
| 481 | + |
def detections_from_license_detection_mappings(license_detection_mappings, file_path):
    """
    Return a list of LicenseDetectionFromResult objects, one for each
    mapping of the `license_detection_mappings` iterable and in the same
    order. Every detection is created with the same `file_path`.
    """
    return [
        LicenseDetectionFromResult.from_license_detection_mapping(
            license_detection_mapping=mapping,
            file_path=file_path,
        )
        for mapping in license_detection_mappings
    ]
| 495 | + |
| 496 | + |
@attr.s
class LicenseMatchFromResult(LicenseMatch):
    """
    A LicenseMatch rebuilt from a license match mapping (results data).

    Such a match is created without query/index spans (`qspan` and `ispan`
    are None), so the score, length, coverage and matched text are stored
    as plain attributes and returned as-is by the accessors below.
    """

    match_score = attr.ib(
        default=None,
        metadata={'help': 'License Detection Score'},
    )

    matched_length = attr.ib(
        default=None,
        metadata={'help': 'License match length'},
    )

    match_coverage = attr.ib(
        default=None,
        metadata={'help': 'License match coverage'},
    )

    text = attr.ib(
        default=None,
        metadata={'help': 'Text which was matched'},
    )

    def score(self):
        """Return the stored match score."""
        return self.match_score

    def len(self):
        """Return the stored matched length."""
        return self.matched_length

    def coverage(self):
        """Return the stored match coverage."""
        return self.match_coverage

    @property
    def matched_text(self):
        """Return the stored matched text, or None if it was not provided."""
        return self.text

    @property
    def identifier(self):
        """Return the identifier of the rule of this match."""
        return self.rule.identifier

    @classmethod
    def from_license_match_mapping(cls, license_match_mapping):
        """
        Return a match built from the `license_match_mapping` results
        mapping. The "matched_text" key is optional; every other key read
        here must be present.
        """
        mapping = license_match_mapping

        rule = RuleFromResult.from_license_match_mapping(
            license_match_mapping=mapping,
        )

        return cls(
            start_line=mapping["start_line"],
            end_line=mapping["end_line"],
            match_score=mapping["score"],
            matched_length=mapping["matched_length"],
            match_coverage=mapping["match_coverage"],
            matcher=mapping["matcher"],
            # Absent matched text is stored as None.
            text=mapping.get("matched_text"),
            rule=rule,
            qspan=None,
            ispan=None,
        )
| 565 | + |
| 566 | + |
@attr.s
class RuleFromResult(BasicRule):
    """
    A BasicRule rebuilt from the rule-related fields of a license match
    mapping (results data).
    """

    @classmethod
    def from_license_match_mapping(cls, license_match_mapping):
        """
        Return a rule built from the rule fields of the
        `license_match_mapping` results mapping. All the keys read here
        must be present in the mapping.
        """
        mapping = license_match_mapping

        rule_data = {
            "license_expression": mapping["license_expression"],
            "identifier": mapping["rule_identifier"],
            "referenced_filenames": mapping["referenced_filenames"],
        }

        # These flags use the same name in the mapping and on the rule.
        flag_names = (
            "is_license_text",
            "is_license_notice",
            "is_license_reference",
            "is_license_tag",
            "is_license_intro",
        )
        for flag_name in flag_names:
            rule_data[flag_name] = mapping[flag_name]

        rule_data["length"] = mapping["rule_length"]
        rule_data["relevance"] = mapping["rule_relevance"]
        return cls(**rule_data)
| 584 | + |
def matches_from_license_match_mappings(license_match_mappings):
    """
    Return a list of LicenseMatchFromResult objects, one for each mapping
    of the `license_match_mappings` iterable and in the same order.
    """
    return [
        LicenseMatchFromResult.from_license_match_mapping(
            license_match_mapping=mapping,
        )
        for mapping in license_match_mappings
    ]
| 597 | + |
| 598 | + |
@attr.s
class UniqueDetection:
    """
    A unique license detection: the data of one representative detection
    together with the file regions of every detection sharing the same
    identifier.
    """
    identifier = attr.ib(default=None)
    license_expression = attr.ib(default=None)
    # NOTE: the "occurance" spelling is part of the attribute name (and of
    # the keys returned by to_dict()); it is kept as-is for compatibility.
    occurance_count = attr.ib(default=None)
    detection_log = attr.ib(default=attr.Factory(list))
    matches = attr.ib(default=attr.Factory(list))
    files = attr.ib(factory=list)

    @classmethod
    def get_unique_detections(cls, license_detections):
        """
        Return a list of UniqueDetection built from the `license_detections`
        iterable of LicenseDetection objects, one per distinct detection
        identifier, in order of first occurrence.

        For each identifier, the license expression, detection log and
        matches come from the `to_dict()` of the first detection with that
        identifier; `files` collects the `file_region` of every such
        detection and `occurance_count` is the number of these.
        """
        # Group in a single pass: each detection identifier is computed only
        # once, and one-shot iterables (e.g. generators) are supported.
        detections_by_identifier = {}
        for detection in license_detections:
            detections_by_identifier.setdefault(
                detection.identifier, []
            ).append(detection)

        unique_license_detections = []
        for identifier, detections in detections_by_identifier.items():
            detection_mapping = detections[0].to_dict()
            files = [detection.file_region for detection in detections]
            unique_license_detections.append(
                cls(
                    identifier=identifier,
                    license_expression=detection_mapping["license_expression"],
                    detection_log=detection_mapping["detection_log"],
                    matches=detection_mapping["matches"],
                    occurance_count=len(files),
                    files=files,
                )
            )

        return unique_license_detections

    def to_dict(self):
        """
        Return a mapping of the attributes of this unique detection,
        excluding `files`.
        """
        # The parameter is not named `attr` to avoid shadowing the module.
        def dict_fields(attribute, value):
            return attribute.name != 'files'

        return attr.asdict(self, filter=dict_fields, dict_factory=dict)
| 657 | + |
| 658 | + |
def get_identifiers(license_detections):
    """
    Return a generator yielding the `identifier` of each detection of the
    `license_detections` iterable, in order. The result is lazy and can
    only be iterated once.
    """
    return (item.identifier for item in license_detections)
| 668 | + |
| 669 | + |
435 | 670 | def get_detections_from_mappings(detection_mappings):
|
436 | 671 | """
|
437 | 672 | Return a list of LicenseDetection objects from a list of
|
|
0 commit comments