Skip to content

Commit 2d576ab

Browse files
Add codebase level unique license detections
- Add a new codebase level attribute `licenses` - Add a new resource level attribute `for_licenses` - Add unique license detections from files and packages in the top level attribute `licenses`, which has the usual `license_expression`, `detection_log` and `matches`, and additionally an `occurance_count` and an `identifier` which is a UUID generated from the content of the matches in the detection. Signed-off-by: Ayan Sinha Mahapatra <ayansmahapatra@gmail.com>
1 parent a410f6e commit 2d576ab

File tree

3 files changed

+365
-68
lines changed

3 files changed

+365
-68
lines changed

src/licensedcode/detection.py

Lines changed: 246 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,10 @@
1111
import sys
1212
import os
1313
import logging
14+
import hashlib
15+
import uuid
1416
from enum import Enum
17+
from collections import Counter
1518

1619
import attr
1720
from license_expression import combine_expressions
@@ -23,6 +26,7 @@
2326
from licensedcode.match import LicenseMatch
2427
from licensedcode.match import set_matched_lines
2528
from licensedcode.models import Rule
29+
from licensedcode.models import BasicRule
2630
from licensedcode.models import compute_relevance
2731
from licensedcode.spans import Span
2832
from licensedcode.tokenize import query_tokenizer
@@ -263,24 +267,39 @@ def identifier(self):
263267
"""
264268
data = []
265269
for match in self.matches:
266-
tokenized_matched_text = tuple(query_tokenizer(match['matched_text']))
267-
identifier = (
268-
match['rule_identifier'],
269-
match['match_coverage'],
270-
tokenized_matched_text,
271-
)
270+
if isinstance(match, dict):
271+
tokenized_matched_text = tuple(query_tokenizer(match['matched_text']))
272+
identifier = (
273+
match['rule_identifier'],
274+
match['score'],
275+
tokenized_matched_text,
276+
)
277+
else:
278+
tokenized_matched_text = tuple(query_tokenizer(match.matched_text))
279+
identifier = (
280+
match.identifier,
281+
match.score(),
282+
tokenized_matched_text,
283+
)
272284
data.append(identifier)
273285

274-
# Return a positive hash value for the tuple
275-
return tuple(data).__hash__() % ((sys.maxsize + 1) * 2)
276-
286+
# Return a uuid generated from the contents of the matches
287+
identifier_string = repr(tuple(data))
288+
md_hash = hashlib.md5()
289+
md_hash.update(identifier_string.encode('utf-8'))
290+
return str(uuid.UUID(md_hash.hexdigest()))
291+
277292
def get_start_end_line(self):
    """
    Return a (start_line, end_line) tuple for this license detection:
    the smallest `start_line` and the largest `end_line` found across
    all of its license match(es).
    """
    license_matches = self.matches

    # The type of the first match decides how every match is read:
    # plain mappings are accessed by key, anything else by attribute.
    if not isinstance(license_matches[0], dict):
        first_line = min([each.start_line for each in license_matches])
        last_line = max([each.end_line for each in license_matches])
        return first_line, last_line

    first_line = min([each['start_line'] for each in license_matches])
    last_line = max([each['end_line'] for each in license_matches])
    return first_line, last_line
285304

286305
def rules_length(self):
@@ -432,6 +451,222 @@ def dict_fields(attr, value):
432451
return detection
433452

434453

454+
455+
@attr.s
class LicenseDetectionFromResult(LicenseDetection):
    """
    A LicenseDetection rebuilt from a LicenseDetection mapping, i.e. from
    results data rather than from a live scan. Its `matches` are
    LicenseMatchFromResult objects: they are created from data mappings
    and do not have the input text/spans available.
    """

    @classmethod
    def from_license_detection_mapping(cls, license_detection_mapping, file_path):
        """
        Return a new detection built from the `license_detection_mapping`
        mapping, using its "matches", "license_expression" and
        "detection_log" entries. The `file_region` is computed afterwards
        from the new detection itself with `file_path` as its path.
        """
        rebuilt_matches = matches_from_license_match_mappings(
            license_match_mappings=license_detection_mapping["matches"],
        )
        expression = license_detection_mapping["license_expression"]
        log_entries = license_detection_mapping["detection_log"]

        # The file region is derived from the detection, so the detection is
        # first created without one and the region is attached afterwards.
        new_detection = cls(
            license_expression=expression,
            detection_log=log_entries,
            matches=rebuilt_matches,
            file_region=None,
        )
        new_detection.file_region = new_detection.get_file_region(path=file_path)
        return new_detection
480+
481+
482+
def detections_from_license_detection_mappings(license_detection_mappings, file_path):
    """
    Return a list of LicenseDetectionFromResult objects, one for each
    mapping in `license_detection_mappings` and in the same order. Every
    detection is created with the same `file_path`.
    """
    build_detection = LicenseDetectionFromResult.from_license_detection_mapping
    return [
        build_detection(
            license_detection_mapping=detection_mapping,
            file_path=file_path,
        )
        for detection_mapping in license_detection_mappings
    ]
495+
496+
497+
@attr.s
class LicenseMatchFromResult(LicenseMatch):
    """
    A LicenseMatch rebuilt from a license match mapping. The score, length,
    coverage and matched text are stored as plain attributes taken from the
    mapping and are returned as-is by the corresponding accessors.
    """

    match_score = attr.ib(
        default=None,
        metadata=dict(help='License Detection Score'),
    )

    matched_length = attr.ib(
        default=None,
        metadata=dict(help='License match length'),
    )

    match_coverage = attr.ib(
        default=None,
        metadata=dict(help='License match coverage'),
    )

    text = attr.ib(
        default=None,
        metadata=dict(help='Text which was matched'),
    )

    def score(self):
        """
        Return the stored `match_score`.
        """
        return self.match_score

    def len(self):
        """
        Return the stored `matched_length`.
        """
        return self.matched_length

    def coverage(self):
        """
        Return the stored `match_coverage`.
        """
        return self.match_coverage

    @property
    def matched_text(self):
        """
        Return the stored matched `text`, which is None when the source
        mapping had no "matched_text" entry.
        """
        return self.text

    @property
    def identifier(self):
        """
        Return the identifier of this match's rule.
        """
        return self.rule.identifier

    @classmethod
    def from_license_match_mapping(cls, license_match_mapping):
        """
        Return a new match built from the `license_match_mapping` mapping.
        The rule is a RuleFromResult created from the same mapping, and
        `qspan` and `ispan` are set to None.
        """
        mapping = license_match_mapping

        rule_from_result = RuleFromResult.from_license_match_mapping(
            license_match_mapping=mapping,
        )

        # "matched_text" is the only optional entry of the mapping.
        text = mapping["matched_text"] if "matched_text" in mapping else None

        return cls(
            start_line=mapping["start_line"],
            end_line=mapping["end_line"],
            match_score=mapping["score"],
            matched_length=mapping["matched_length"],
            match_coverage=mapping["match_coverage"],
            matcher=mapping["matcher"],
            text=text,
            rule=rule_from_result,
            qspan=None,
            ispan=None,
        )
565+
566+
567+
@attr.s
class RuleFromResult(BasicRule):
    """
    A BasicRule rebuilt from the rule-related entries of a license match
    mapping.
    """

    @classmethod
    def from_license_match_mapping(cls, license_match_mapping):
        """
        Return a new rule built from the `license_match_mapping` mapping.
        Every listed entry is required: a missing key raises a KeyError.
        """
        # Pairs of (rule constructor argument, key in the match mapping),
        # read in this order.
        argument_and_key_pairs = (
            ("license_expression", "license_expression"),
            ("identifier", "rule_identifier"),
            ("referenced_filenames", "referenced_filenames"),
            ("is_license_text", "is_license_text"),
            ("is_license_notice", "is_license_notice"),
            ("is_license_reference", "is_license_reference"),
            ("is_license_tag", "is_license_tag"),
            ("is_license_intro", "is_license_intro"),
            ("length", "rule_length"),
            ("relevance", "rule_relevance"),
        )
        rule_arguments = {
            argument: license_match_mapping[key]
            for argument, key in argument_and_key_pairs
        }
        return cls(**rule_arguments)
584+
585+
def matches_from_license_match_mappings(license_match_mappings):
    """
    Return a list of LicenseMatchFromResult objects, one for each mapping
    in `license_match_mappings` and in the same order.
    """
    build_match = LicenseMatchFromResult.from_license_match_mapping
    return [
        build_match(license_match_mapping=match_mapping)
        for match_mapping in license_match_mappings
    ]
597+
598+
599+
@attr.s
class UniqueDetection:
    """
    A unique license detection, with the number of times it occurs and the
    file regions where it occurs.
    """
    identifier = attr.ib(default=None)
    license_expression = attr.ib(default=None)
    # NOTE: the spelling of this attribute name is kept as-is since it is
    # part of the data returned by to_dict().
    occurance_count = attr.ib(default=None)
    detection_log = attr.ib(default=attr.Factory(list))
    matches = attr.ib(default=attr.Factory(list))
    files = attr.ib(factory=list)

    @classmethod
    def get_unique_detections(cls, license_detections):
        """
        Return a list of UniqueDetection built from the `license_detections`
        iterable of LicenseDetection objects.

        Detections are grouped by their `identifier`. There is one
        UniqueDetection per identifier, in first-seen order. Its license
        expression, detection log and matches come from the `to_dict()`
        mapping of the first detection of the group, its `files` are the
        `file_region` of every detection of the group, and its
        `occurance_count` is the number of these file regions.
        """
        # Group in a single pass: each detection's identifier is computed
        # only once, and the input is traversed only once, so any iterable
        # is accepted. A dict keeps the identifiers in first-seen order.
        detections_by_identifier = {}
        for detection in license_detections:
            detections_by_identifier.setdefault(
                detection.identifier, []
            ).append(detection)

        unique_license_detections = []
        for detection_identifier, detections in detections_by_identifier.items():
            # The first detection of a group stands for the whole group.
            detection_mapping = detections[0].to_dict()
            files = [detection.file_region for detection in detections]
            unique_license_detections.append(
                cls(
                    identifier=detection_identifier,
                    license_expression=detection_mapping["license_expression"],
                    detection_log=detection_mapping["detection_log"],
                    matches=detection_mapping["matches"],
                    occurance_count=len(files),
                    files=files,
                )
            )

        return unique_license_detections

    def to_dict(self):
        """
        Return a mapping of all the attributes of this object except for
        `files`.
        """
        def dict_fields(attribute, value):
            # The parameter is not named `attr` so that it does not shadow
            # the `attr` module used below.
            return attribute.name != 'files'

        return attr.asdict(self, filter=dict_fields, dict_factory=dict)
657+
658+
659+
def get_identifiers(license_detections):
    """
    Return a generator over the `identifier` of each detection in
    `license_detections`, in the same order. It can be consumed only once.
    """
    return (
        license_detection.identifier
        for license_detection in license_detections
    )
668+
669+
435670
def get_detections_from_mappings(detection_mappings):
436671
"""
437672
Return a list of LicenseDetection objects from a list of

0 commit comments

Comments
 (0)