Skip to content

Commit ca1c70f

Browse files
committed
Filter key phrases for continuity correctly
The test for the uninterrupted continuity of a key phrase must be done in all cases, whether a matched rule is "continuous" or a full "required phrase". Fix test and add new test. Signed-off-by: Philippe Ombredanne <pombredanne@nexb.com>
1 parent 4211d02 commit ca1c70f

File tree

5 files changed

+79
-51
lines changed

5 files changed

+79
-51
lines changed

src/licensedcode/data/rules/lgpl-2.1-plus_6.RULE

Lines changed: 14 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -4,17 +4,17 @@ is_license_notice: yes
44
notes: lgpl with markup
55
---
66

7-
// This library is free software; you can redistribute it and/or modify
8-
// it under the terms of the {{GNU Lesser General Public License as
9-
// published by the Free Software Foundation; either version 2.1 of the
10-
// License, or (at your option) any later version}}.
11-
//
12-
// This library is distributed in the hope that it will be useful, but
13-
// WITHOUT ANY WARRANTY; without even the implied warranty of
14-
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the {{GNU
15-
// Lesser General Public License}} for more details.
16-
//
17-
// You should have received a copy of the {{GNU Lesser General Public
18-
// License}} along with this library; if not, write to the Free Software
19-
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
20-
// USA.
7+
This library is free software; you can redistribute it and/or modify
8+
it under the terms of the {{GNU Lesser General Public License}} as
9+
published by the Free Software Foundation; {{either version 2.1}} of the
10+
License, {{or (at your option) any later version}}.
11+
12+
This library is distributed in the hope that it will be useful, but
13+
WITHOUT ANY WARRANTY; without even the implied warranty of
14+
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15+
Lesser General Public License for more details.
16+
17+
You should have received a copy of the GNU Lesser General Public
18+
License along with this library; if not, write to the Free Software
19+
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
20+
USA.

src/licensedcode/match.py

Lines changed: 31 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -756,7 +756,7 @@ def matched_text(
756756
side effects as the caching depends on which index instance is being
757757
used and this index can change during testing.
758758
"""
759-
if TRACE_MATCHED_TEXT:
759+
if TRACE_MATCHED_TEXT and not TRACE_REPR_ALL_MATCHED_TEXTS:
760760
logger_debug(f'LicenseMatch.matched_text: self.query: {self.query}')
761761

762762
query = self.query
@@ -2205,7 +2205,7 @@ def filter_matches_missing_required_phrases(
22052205

22062206
# keep matches as candidate if they contain all required phrase positions in the ispan
22072207
if trace:
2208-
print(' CANDIDATE TO KEEP: all ikey_span in match.ispan:', ikey_spans, ispan)
2208+
print(' CANDIDATE TO KEEP: all ikey_span in match.ispan: ikey_spans:', ikey_spans, 'ispan:', ispan)
22092209

22102210
# discard matches that contain required phrases, but interrupted by
22112211
# unknown or stop words.
@@ -2219,7 +2219,7 @@ def filter_matches_missing_required_phrases(
22192219
istopwords_by_pos_get = istopwords_by_pos.get
22202220

22212221
# iterate on each required phrase span to ensure that they are continuous
2222-
# and contain no unknown words on the query side
2222+
# and contain no unknown words or stop words on the query side
22232223

22242224
is_valid = True
22252225

@@ -2239,18 +2239,15 @@ def filter_matches_missing_required_phrases(
22392239

22402240
qkey_span = Span(qkey_poss)
22412241
if len(qkey_span) != qkey_span.magnitude():
2242-
2243-
logger_debug(
2244-
' ==> DISCARDING, REQUIRED PHRASES PRESENT, BUT NOT CONTINUOUS:',
2245-
'qkey_span:', qkey_span, 'qpan:', qspan
2246-
)
2247-
2242+
if trace:
2243+
logger_debug(
2244+
' ==> DISCARDING, REQUIRED PHRASES PRESENT, BUT NOT CONTINUOUS:',
2245+
'qkey_span:', qkey_span, 'qspan:', qspan
2246+
)
22482247
is_valid = False
22492248
break
22502249

2251-
# check that required phrase spans does not contain stop words and does
2252-
# not contain unknown words
2253-
2250+
# Check that required phrase spans do not contain unknown words.
22542251
# NOTE: we do not check the last qkey_span position of a required phrase
22552252
# since unknown is a number of words after a given span position:
22562253
# these are pinned to the last position and we would not care for
@@ -2265,34 +2262,36 @@ def filter_matches_missing_required_phrases(
22652262
if contains_unknown:
22662263
logger_debug(
22672264
' ==> DISCARDING, REQUIRED PHRASES PRESENT, BUT UNKNOWNS:',
2268-
'qkey_span:', qkey_span, 'qpan:', qspan,
2265+
'qkey_span:', qkey_span, 'qspan:', qspan,
22692266
'unknown_by_pos:', unknown_by_pos
22702267
)
22712268

22722269
is_valid = False
22732270
break
22742271

2275-
if is_continuous:
2276-
has_same_stopwords_pos = True
2277-
for qpos, ipos in zip(qspan, ispan):
2278-
if qpos not in qkey_span or qpos == qkey_span_end:
2279-
continue
2280-
2281-
if istopwords_by_pos_get(ipos) != qstopwords_by_pos_get(qpos):
2282-
has_same_stopwords_pos = False
2283-
break
2284-
2285-
if not has_same_stopwords_pos:
2286-
logger_debug(
2287-
' ==> DISCARDING, REQUIRED PHRASES PRESENT, BUT STOPWORDS NOT SAME:',
2288-
'qkey_span:', qkey_span, 'qpan:', qspan,
2289-
'istopwords_by_pos:', istopwords_by_pos,
2290-
'qstopwords_by_pos:', qstopwords_by_pos
2291-
)
2292-
2293-
is_valid = False
2272+
# Check that required phrase spans do not contain stop words. This must hold whether
2273+
# or not the rule is continuous: as long as we have a key span, it cannot be interrupted.
2274+
2275+
has_same_stopwords_pos = True
2276+
for qpos, ipos in zip(qspan, ispan):
2277+
if qpos not in qkey_span or qpos == qkey_span_end:
2278+
continue
2279+
2280+
if istopwords_by_pos_get(ipos) != qstopwords_by_pos_get(qpos):
2281+
has_same_stopwords_pos = False
22942282
break
22952283

2284+
if not has_same_stopwords_pos:
2285+
logger_debug(
2286+
' ==> DISCARDING, REQUIRED PHRASES PRESENT, BUT STOPWORDS NOT SAME:',
2287+
'qkey_span:', qkey_span, 'qspan:', qspan,
2288+
'istopwords_by_pos:', istopwords_by_pos,
2289+
'qstopwords_by_pos:', qstopwords_by_pos
2290+
)
2291+
2292+
is_valid = False
2293+
break
2294+
22962295
if is_valid:
22972296
logger_debug(' ==> KEEPING, REQUIRED PHRASES PRESENT, CONTINUOUS AND NO UNKNOWNS')
22982297
kept_append(match)

tests/licensedcode/data/datadriven/external/glc/Apache-2.0-Header.t2.yml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
license_expressions:
22
- apache-2.0
3+
- warranty-disclaimer
34
notes: |
45
License test derived from a file of the BSD-licensed repository at:
56
https://raw.githubusercontent.com/google/licensecheck/v0.3.1/testdata/Apache-2.0-Header.t2

0 commit comments

Comments
 (0)