Skip to content

Commit c581828

Browse files
committed
Prefer exact license match over SPDX
Add a new matcher_order attribute to LicenseMatch and use it for sorting matches rather than the matcher string. This way we can ensure that there is a proper precedence between matchers when two matches are matching exactly the same text. The new sort order for matchers is as follows: - 0: 1-hash - 1: 2-aho - 2: 1-spdx-id - 3: 3-seq - 4: 5-undetected - 5: 5-aho-frag - 6: 6-unknown The outcome is that a hash or aho match for the same text at the same position will take precedence over the SPDX id match, allowing to curate and correct some incorrect license expressions if needed. Reference: #3912 Reported-by: Ayan Sinha Mahapatra <ayansmahapatra@gmail.com> Signed-off-by: Philippe Ombredanne <pombredanne@nexb.com>
1 parent 9a7df2c commit c581828

File tree

9 files changed

+98
-25
lines changed

9 files changed

+98
-25
lines changed

src/licensedcode/detection.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -70,6 +70,7 @@ def logger_debug(*args):
7070
return logger.debug(' '.join(isinstance(a, str) and a or repr(a) for a in args))
7171

7272
MATCHER_UNDETECTED = '5-undetected'
73+
MATCHER_UNDETECTED_ORDER = 4
7374

7475
# All values of match_coverage less than this value then they are not considered
7576
# as perfect detections
@@ -1627,6 +1628,7 @@ def get_undetected_matches(query_string):
16271628
hispan=hispan,
16281629
query_run_start=match_start,
16291630
matcher=MATCHER_UNDETECTED,
1631+
matcher_order=MATCHER_UNDETECTED_ORDER,
16301632
query=query_run.query,
16311633
)
16321634

src/licensedcode/match.py

Lines changed: 24 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -42,8 +42,8 @@
4242
The filter functions are executed in a specific sequence over the list of matches.
4343
"""
4444

45-
TRACE = False
46-
TRACE_MERGE = False
45+
# Debug tracing flag: keep disabled in committed code, enable only locally when debugging.
TRACE = False
46+
# Debug tracing flag used as the default "trace" argument of merge_matches():
# keep disabled in committed code, enable only locally when debugging.
TRACE_MERGE = False
4747
TRACE_REFINE = False
4848
TRACE_FILTER_FALSE_POSITIVE = False
4949
TRACE_FILTER_CONTAINED = False
@@ -213,6 +213,15 @@ class LicenseMatch(object):
213213
)
214214
)
215215

216+
matcher_order = attr.ib(
217+
default=0,
218+
metadata=dict(
219+
help='An integer indicating the precedence of a matcher when compared to other matchers '
220+
'where the lowest value has the highest precedence. Used to select which of two '
221+
'equal matches to keep.'
222+
)
223+
)
224+
216225
start_line = attr.ib(
217226
default=0,
218227
metadata=dict(help='match start line, 1-based')
@@ -624,8 +633,10 @@ def combine(self, other):
624633

625634
if other.matcher not in self.matcher:
626635
newmatcher = ' '.join([self.matcher, other.matcher])
636+
newmatcher_order = max([self.matcher_order, other.matcher_order])
627637
else:
628638
newmatcher = self.matcher
639+
newmatcher_order = self.matcher_order
629640

630641
if (
631642
self.discard_reason == DiscardReason.NOT_DISCARDED
@@ -655,6 +666,7 @@ def combine(self, other):
655666
hispan=Span(self.hispan | other.hispan),
656667
query_run_start=min(self.query_run_start, other.query_run_start),
657668
matcher=newmatcher,
669+
matcher_order=newmatcher_order,
658670
query=self.query,
659671
discard_reason=discard_reason,
660672
)
@@ -671,6 +683,7 @@ def update(self, other):
671683
self.matcher = combined.matcher
672684
self.query_run_start = min(self.query_run_start, other.query_run_start)
673685
self.matcher = combined.matcher
686+
self.matcher_order = combined.matcher_order
674687
self.discard_reason = combined.discard_reason
675688
return self
676689

@@ -852,7 +865,7 @@ def merge_matches(matches, max_dist=None, trace=TRACE_MERGE):
852865

853866
# only merge matches with the same rule: sort then group by rule for the
854867
# same rule, sort on start, longer high, longer match, matcher type
855-
sorter = lambda m: (m.rule.identifier, m.qspan.start, -m.hilen(), -m.len(), m.matcher)
868+
sorter = lambda m: (m.rule.identifier, m.qspan.start, -m.hilen(), -m.len(), m.matcher_order)
856869
matches.sort(key=sorter)
857870
matches_by_rule = [
858871
(rid, list(rule_matches))
@@ -1069,7 +1082,7 @@ def filter_contained_matches(
10691082

10701083
# NOTE: we do not filter matches in place: sorted creates a copy
10711084
# sort on start, longer high, longer match, matcher type
1072-
sorter = lambda m: (m.qspan.start, -m.hilen(), -m.len(), m.matcher)
1085+
sorter = lambda m: (m.qspan.start, -m.hilen(), -m.len(), m.matcher_order)
10731086
matches = sorted(matches, key=sorter)
10741087
matches_pop = matches.pop
10751088

@@ -1190,7 +1203,7 @@ def filter_overlapping_matches(
11901203

11911204
# NOTE: we do not filter matches in place: sorted creates a copy
11921205
# sort on start, longer high, longer match, matcher type
1193-
sorter = lambda m: (m.qspan.start, -m.hilen(), -m.len(), m.matcher)
1206+
sorter = lambda m: (m.qspan.start, -m.hilen(), -m.len(), m.matcher_order)
11941207
matches = sorted(matches, key=sorter)
11951208
matches_pop = matches.pop
11961209

@@ -2734,6 +2747,12 @@ def _log(_matches, _discarded, msg):
27342747
matches, discarded_contained = filter_contained_matches(matches)
27352748
_log(matches, discarded_contained, 'NON CONTAINED')
27362749

2750+
if trace_basic:
2751+
logger_debug(' #####refine_matches: after FILTER matches#', len(matches))
2752+
if trace:
2753+
for m in matches:
2754+
logger_debug(m)
2755+
27372756
matches, discarded_overlapping = filter_overlapping_matches(matches)
27382757
_log(matches, discarded_overlapping, 'NON OVERLAPPING')
27392758

src/licensedcode/match_aho.py

Lines changed: 29 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -76,10 +76,19 @@ def add_sequence(automaton, tids, rid, start=0, with_duplicates=False):
7676

7777

7878
MATCH_AHO_EXACT = '2-aho'
79+
# Precedence of exact Aho-Corasick matches when sorting otherwise-equal matches
# (lowest value wins). This must be strictly lower than the SPDX id matcher order
# (2) so that an exact match takes precedence over an SPDX id match for the same
# text at the same position.
MATCH_AHO_EXACT_ORDER = 1
7980
MATCH_AHO_FRAG = '5-aho-frag'
81+
MATCH_AHO_FRAG_ORDER = 5
8082

8183

82-
def exact_match(idx, query_run, automaton, matcher=MATCH_AHO_EXACT, **kwargs):
84+
def exact_match(
85+
idx,
86+
query_run,
87+
automaton,
88+
matcher=MATCH_AHO_EXACT,
89+
matcher_order=MATCH_AHO_EXACT_ORDER,
90+
**kwargs,
91+
):
8392
"""
8493
Return a list of exact LicenseMatch by matching the `query_run` against
8594
the `automaton` and `idx` index.
@@ -111,7 +120,15 @@ def exact_match(idx, query_run, automaton, matcher=MATCH_AHO_EXACT, **kwargs):
111120

112121
rule = rules_by_rid[rid]
113122
match = LicenseMatch(
114-
rule, qspan, ispan, hispan, qbegin, matcher=matcher, query=query)
123+
rule=rule,
124+
qspan=qspan,
125+
ispan=ispan,
126+
hispan=hispan,
127+
query_run_start=qbegin,
128+
matcher=matcher,
129+
matcher_order=matcher_order,
130+
query=query,
131+
)
115132
matches_append(match)
116133
if TRACE and matches:
117134
logger_debug(' ##exact_AHO: matches found#')
@@ -234,8 +251,16 @@ def match_fragments(idx, query_run):
234251
qspan = Span(range(qpos, qpos + mlen))
235252
ispan = Span(range(ipos, ipos + mlen))
236253
hispan = Span(p for p in ispan if itokens[p] < len_legalese)
237-
match = LicenseMatch(rule, qspan, ispan, hispan, qbegin,
238-
matcher=MATCH_AHO_FRAG, query=query)
254+
match = LicenseMatch(
255+
rule=rule,
256+
qspan=qspan,
257+
ispan=ispan,
258+
hispan=hispan,
259+
# The other LicenseMatch(...) calls pass the query run start as "query_run_start";
# "qbegin" is only the local variable name, not a LicenseMatch keyword.
query_run_start=qbegin,
260+
matcher=MATCH_AHO_FRAG,
261+
matcher_order=MATCH_AHO_FRAG_ORDER,
262+
query=query,
263+
)
239264
frag_matches.append(match)
240265

241266
# Merge matches as usual

src/licensedcode/match_hash.py

Lines changed: 11 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,6 @@
1010
from array import array
1111
from hashlib import sha1
1212

13-
1413
from licensedcode.match import LicenseMatch
1514
from licensedcode.spans import Span
1615

@@ -39,6 +38,7 @@ def logger_debug(*args):
3938
pass
4039

4140
MATCH_HASH = '1-hash'
41+
MATCH_HASH_ORDER = 0
4242

4343

4444
def tokens_hash(tokens):
@@ -73,6 +73,15 @@ def hash_match(idx, query_run, **kwargs):
7373
qspan = Span(range(query_run.start, query_run.end + 1))
7474
ispan = Span(range(0, rule.length))
7575
hispan = Span(p for p in ispan if itokens[p] < len_legalese)
76-
match = LicenseMatch(rule, qspan, ispan, hispan, query_run.start, matcher=MATCH_HASH, query=query_run.query)
76+
match = LicenseMatch(
77+
rule=rule,
78+
qspan=qspan,
79+
ispan=ispan,
80+
hispan=hispan,
81+
query_run_start=query_run.start,
82+
matcher=MATCH_HASH,
83+
matcher_order=MATCH_HASH_ORDER,
84+
query=query_run.query,
85+
)
7786
matches.append(match)
7887
return matches

src/licensedcode/match_seq.py

Lines changed: 21 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -10,11 +10,9 @@
1010
from time import time
1111
import sys
1212

13-
1413
from licensedcode.match import LicenseMatch
1514
from licensedcode.spans import Span
1615

17-
1816
TRACE = False
1917
TRACE2 = False
2018
TRACE3 = False
@@ -38,17 +36,24 @@ def logger_debug(*args): pass
3836
def logger_debug(*args):
3937
return prn(' '.join(isinstance(a, str) and a or repr(a) for a in args))
4038

41-
4239
"""
4340
Matching strategy using pair-wise multiple local sequences alignment and diff-
4441
like approaches.
4542
"""
4643

4744
MATCH_SEQ = '3-seq'
48-
49-
50-
def match_sequence(idx, rule, query_run, high_postings, start_offset=0,
51-
match_blocks=None, deadline=sys.maxsize):
45+
MATCH_SEQ_ORDER = 3
46+
47+
48+
def match_sequence(
49+
idx,
50+
rule,
51+
query_run,
52+
high_postings,
53+
start_offset=0,
54+
match_blocks=None,
55+
deadline=sys.maxsize,
56+
):
5257
"""
5358
Return a list of LicenseMatch by matching the `query_run` tokens sequence
5459
starting at `start_offset` against the `idx` index for the candidate `rule`.
@@ -107,8 +112,15 @@ def match_sequence(idx, rule, query_run, high_postings, start_offset=0,
107112
ispan = Span(range(ipos, ipos + mlen))
108113
hispan = Span(p for p in ispan if itokens[p] < len_legalese)
109114
match = LicenseMatch(
110-
rule, qspan, ispan, hispan, qbegin,
111-
matcher=MATCH_SEQ, query=query)
115+
rule=rule,
116+
qspan=qspan,
117+
ispan=ispan,
118+
hispan=hispan,
119+
query_run_start=qbegin,
120+
matcher=MATCH_SEQ,
121+
matcher_order=MATCH_SEQ_ORDER,
122+
query=query,
123+
)
112124
matches.append(match)
113125

114126
if TRACE2:

src/licensedcode/match_spdx_lid.py

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -59,6 +59,7 @@ def logger_debug(*args):
5959
return logger.debug(' '.join(isinstance(a, str) and a or repr(a) for a in args))
6060

6161
MATCH_SPDX_ID = '1-spdx-id'
62+
MATCH_SPDX_ID_ORDER = 2
6263

6364

6465
def spdx_id_match(idx, query_run, text, expression_symbols=None):
@@ -112,6 +113,7 @@ def spdx_id_match(idx, query_run, text, expression_symbols=None):
112113
hispan=hispan,
113114
query_run_start=match_start,
114115
matcher=MATCH_SPDX_ID,
116+
matcher_order=MATCH_SPDX_ID_ORDER,
115117
query=query_run.query,
116118
)
117119
return match
@@ -136,7 +138,7 @@ def get_spdx_expression(text, expression_symbols=None):
136138
expression_symbols = get_spdx_symbols()
137139

138140
unknown_symbol = get_unknown_spdx_symbol()
139-
#_prefix, exp_text = prepare_text(text)
141+
# _prefix, exp_text = prepare_text(text)
140142

141143
expression = get_expression(
142144
text=text,
@@ -361,7 +363,7 @@ def clean_text(text):
361363
if is_markup_text(text):
362364
text = demarkup_text(text)
363365

364-
dangling_markup = ['</a>','</p>','</div>', '</licenseUrl>']
366+
dangling_markup = ['</a>', '</p>', '</div>', '</licenseUrl>']
365367
for markup in dangling_markup:
366368
if markup in text:
367369
text = text.replace(markup, '')
@@ -384,7 +386,7 @@ def clean_text(text):
384386
if '">' in text:
385387
text_fragments = text.split('">')
386388
if text_fragments[1] in text_fragments[0]:
387-
text = text_fragments[0]
389+
text = text_fragments[0]
388390

389391
return ' '.join(text.split())
390392

@@ -393,7 +395,6 @@ def clean_text(text):
393395
'(spdx(?:\\-|\\s)+licen(?:s|c)e(?:\\-|\\s)+identifier\\s*:?\\s*)',
394396
re.IGNORECASE).split
395397

396-
397398
_nuget_split_spdx_lid = re.compile(
398399
'(licenses(?:\\.|\\s)+nuget(?:\\.|\\s)+org\\s*:?\\s*)',
399400
re.IGNORECASE).split

src/licensedcode/match_unknown.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -44,6 +44,7 @@ def logger_debug(*args):
4444
pass
4545

4646
MATCH_UNKNOWN = '6-unknown'
47+
MATCH_UNKNOWN_ORDER = 6
4748

4849
UNKNOWN_NGRAM_LENGTH = 6
4950

@@ -176,7 +177,7 @@ def get_tokens(_toks):
176177
match_len = len(qspan)
177178

178179
if TRACE:
179-
#print('match_unknowns: matched_span:', get_tokens(matched_tokens))
180+
# print('match_unknowns: matched_span:', get_tokens(matched_tokens))
180181
print('match_unknowns: qspan, match_len, matched_span:', qspan, match_len, matched_tokens)
181182

182183
# we use the query side to build the ispans
@@ -227,6 +228,7 @@ def get_tokens(_toks):
227228
hispan=hispan,
228229
query_run_start=query_run.start,
229230
matcher=MATCH_UNKNOWN,
231+
matcher_order=MATCH_UNKNOWN_ORDER,
230232
query=query,
231233
)
232234

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
* SPDX-License-Identifier: (GPL-2.0+ OR BSD)
Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
license_expressions:
2+
- gpl-2.0-plus OR bsd-new

0 commit comments

Comments
 (0)