Skip to content

Commit f3f2c78

Browse files
committed
Correctly filter copyrights in licenses #3797
Reference: #3797
Reported-by: Jörg Arndt @Joerki
Signed-off-by: Philippe Ombredanne <pombredanne@nexb.com>
1 parent 850edc1 commit f3f2c78

File tree

9 files changed

+810
-121
lines changed

9 files changed

+810
-121
lines changed

src/cluecode/plugin_filter_clues.py

Lines changed: 65 additions & 37 deletions
Original file line number | Diff line number | Diff line change
@@ -7,6 +7,12 @@
77
# See https://aboutcode.org for more information about nexB OSS projects.
88
#
99

10+
"""
11+
Filter out or ignore, as in "remove" redundant or irrelevant detected clues such as copyrights,
12+
authors, emails, and urls that are already contained in a matched license text or license rule and
13+
treated as ignorable.
14+
"""
15+
1016
from itertools import chain
1117

1218
import attr
@@ -63,22 +69,24 @@ def process_codebase(self, codebase, **kwargs):
6369
if TRACE: logger_debug('RedundantFilter:process_codebase')
6470

6571
from licensedcode.cache import get_index
72+
rules_by_id = get_index().rules_by_id
6673

6774
for resource in codebase.walk():
68-
filtered = filter_ignorable_resource_clues(resource, get_index().rules_by_id)
75+
filtered = filter_ignorable_resource_clues(resource=resource, rules_by_id=rules_by_id)
6976
if filtered:
7077
filtered.save(codebase)
7178

7279

7380
def filter_ignorable_resource_clues(resource, rules_by_id):
7481
"""
75-
Filter ignorable clues from the `resource` Resource objects using all the
76-
scan details attached to that `resource` and the `rules_by_id` mapping of
77-
{identifier: license Rule object}. Return the `resource` object modified in-
78-
place if it was modified.
82+
Filter ignorable clues from the ``resource`` Resource object using all the
83+
scan details attached to that ``resource`` and the ``rules_by_id`` mapping of
84+
{identifier: license Rule object}. Return the ``resource`` object modified in-
85+
place if it was modified, or None otherwise.
7986
"""
8087
detections = Detections.from_resource(resource)
81-
filtered = filter_ignorable_clues(detections, rules_by_id)
88+
filtered = filter_ignorable_clues(detections=detections, rules_by_id=rules_by_id)
89+
logger_debug(f'filter_ignorable_resource_clues: {filtered}')
8290
if filtered:
8391
if hasattr(resource, 'emails'):
8492
resource.emails = filtered.emails
@@ -97,8 +105,7 @@ def filter_ignorable_resource_clues(resource, rules_by_id):
97105
class Ignorable(object):
98106
# a frozenset of matched line numbers
99107
lines_range = attr.ib()
100-
# either a string or a frozenset of strings, such that we can test for `x in
101-
# value`
108+
# either a string or a frozenset of strings, such that we can test for `x in value`
102109
value = attr.ib()
103110

104111

@@ -119,20 +126,22 @@ class Detections(object):
119126
urls = attr.ib(default=attr.Factory(list))
120127
emails = attr.ib(default=attr.Factory(list))
121128

122-
licenses = attr.ib(default=attr.Factory(list))
129+
license_matches = attr.ib(default=attr.Factory(list))
123130

124131
# this is the same as author and copyrights, but restructured to be in the
125132
# same format as ignorables and is used to filter emails and urls in authors
126133
# and copyright
127-
copyrights_as_ignorable = attr.ib(default=attr.Factory(list), repr=False)
128-
holders_as_ignorable = attr.ib(default=attr.Factory(list), repr=False)
129-
authors_as_ignorable = attr.ib(default=attr.Factory(list), repr=False)
134+
copyrights_as_ignorable = attr.ib(default=attr.Factory(list))
135+
holders_as_ignorable = attr.ib(default=attr.Factory(list))
136+
authors_as_ignorable = attr.ib(default=attr.Factory(list))
130137

131138
@staticmethod
132139
def from_scan_data(data):
133140
detected_copyrights = data.get('copyrights', [])
134141
detected_authors = data.get('authors', [])
135142
detected_holders = data.get('holders', [])
143+
detected_emails = data.get('emails', [])
144+
detected_urls = data.get('urls', [])
136145

137146
copyrights_as_ignorable = frozenset(
138147
Ignorable(
@@ -155,19 +164,23 @@ def from_scan_data(data):
155164
for a in detected_authors
156165
)
157166

158-
return Detections(
167+
license_matches = list(chain.from_iterable(d['matches'] for d in data['license_detections']))
168+
169+
detections = Detections(
159170
copyrights=detected_copyrights,
160-
emails=data.get('emails', []),
161-
urls=data.get('urls', []),
171+
emails=detected_emails,
172+
urls=detected_urls,
162173
holders=detected_holders,
163174
authors=detected_authors,
164175

165-
authors_as_ignorable=authors_as_ignorable,
166176
copyrights_as_ignorable=copyrights_as_ignorable,
167177
holders_as_ignorable=holders_as_ignorable,
178+
authors_as_ignorable=authors_as_ignorable,
168179

169-
licenses=data.get('licenses', []),
180+
license_matches=license_matches,
170181
)
182+
detections.debug()
183+
return detections
171184

172185
@staticmethod
173186
def from_resource(resource):
@@ -185,11 +198,21 @@ def as_iterable(self):
185198
(('url', c) for c in self.urls),
186199
)
187200

201+
def debug(self):
202+
if TRACE:
203+
logger_debug('Detections')
204+
for nv in self.as_iterable():
205+
logger_debug(' ', nv),
206+
207+
logger_debug(' copyrights_as_ignorable:', self.copyrights_as_ignorable)
208+
logger_debug(' holders_as_ignorable: ', self.holders_as_ignorable)
209+
logger_debug(' authors_as_ignorable: ', self.authors_as_ignorable)
210+
logger_debug(' license_matches: ', self.license_matches)
211+
188212

189213
def is_empty(clues):
190214
if clues:
191-
return not any([
192-
clues.copyrights, clues.holders, clues.authors, clues.urls, clues.emails])
215+
return not any([clues.copyrights, clues.holders, clues.authors, clues.urls, clues.emails])
193216
else:
194217
# The logic is reversed, so a false or None "clues" object returns None, which
195218
# is interpreted as False (i.e., the object is *not* empty).
@@ -204,18 +227,22 @@ def filter_ignorable_clues(detections, rules_by_id):
204227
"""
205228
if is_empty(detections):
206229
return
230+
if TRACE:
231+
logger_debug('filter_ignorable_clues: detections')
232+
detections.debug()
207233

208234
no_detected_ignorables = not detections.copyrights and not detections.authors
209235

210-
ignorables = collect_ignorables(detections.licenses, rules_by_id)
211-
212-
no_ignorables = not detections.licenses or is_empty(ignorables)
236+
ignorables = collect_ignorables(license_matches=detections.license_matches, rules_by_id=rules_by_id)
237+
no_ignorables = not detections.license_matches or is_empty(ignorables)
213238

214239
if TRACE:
215240
logger_debug('ignorables', ignorables)
216241
# logger_debug('detections', detections)
217242

218243
if no_ignorables and no_detected_ignorables:
244+
if TRACE:
245+
logger_debug('filter_ignorable_clues: NO IGNORABLES')
219246
return
220247

221248
# discard redundant emails if ignorable or in a detections copyright or author
@@ -307,9 +334,9 @@ def filter_values(attributes, ignorables, value_key='copyright', strip=''):
307334

308335
def collect_ignorables(license_matches, rules_by_id):
309336
"""
310-
Collect and return an Ignorables object built from ``license_matches``
311-
matched licenses list of "licenses" objects returned in ScanCode JSON
312-
results and the ``rules_by_id`` mapping of Rule objects by identifier.
337+
Collect and return an Ignorables object built from ``license_matches`` list of license matches
338+
as returned in ScanCode results license_detection and the ``rules_by_id`` mapping of Rule
339+
objects by rule identifier.
313340
314341
The value of each ignorable list of clues is a set of (set of lines number,
315342
set of ignorable values).
@@ -321,38 +348,39 @@ def collect_ignorables(license_matches, rules_by_id):
321348
copyrights = set()
322349

323350
if not license_matches:
351+
if TRACE:
352+
logger_debug('collect_ignorables: No ignorables!!!!')
324353
return Ignorables(
325354
copyrights=frozenset(copyrights),
326355
holders=frozenset(holders),
327356
authors=frozenset(authors),
328357
urls=frozenset(urls),
329358
emails=frozenset(emails),
330359
)
331-
# build tuple of (set of lines number, set of ignorbale values)
332-
for lic in license_matches:
360+
361+
# build tuple of (set of lines number, set of ignorable values)
362+
for licmat in license_matches:
333363

334364
if TRACE:
335-
logger_debug('collect_ignorables: license:', lic['key'], lic['score'])
365+
logger_debug('collect_ignorables: license_match:', licmat['license_expression'], licmat['score'])
336366

337-
matched_rule = lic.get('matched_rule', {})
338-
rid = matched_rule.get('identifier')
339-
match_coverage = matched_rule.get('match_coverage', 0)
367+
rid = licmat['rule_identifier']
368+
if not rid:
369+
# we are missing the license match details, we can only skip
370+
if TRACE: logger_debug(' collect_ignorables: skipping, no RID')
371+
continue
340372

341373
# ignore poor partial matches
342374
# TODO: there must be a better way using coverage
375+
match_coverage = float(licmat['match_coverage'])
343376
if match_coverage < 90:
344377
if TRACE:
345378
logger_debug(' collect_ignorables: skipping, match_coverage under 90%')
346379
continue
347380

348-
if not rid:
349-
# we are missing the license match details, we can only skip
350-
if TRACE: logger_debug(' collect_ignorables: skipping, no RID')
351-
continue
352-
353381
rule = rules_by_id[rid]
354382

355-
lines_range = frozenset(range(lic['start_line'], lic['end_line'] + 1))
383+
lines_range = frozenset(range(licmat['start_line'], licmat['end_line'] + 1))
356384

357385
ign_copyrights = frozenset(rule.ignorable_copyrights or [])
358386
if ign_copyrights:

src/licensedcode/data/licenses/ricebsd.LICENSE

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -11,7 +11,7 @@ other_urls:
1111
- https://github.com/search?q="Also%2C+we+ask+that+use+of+ARPACK+is+properly"&type=code
1212
have this
1313
ignorable_copyrights:
14-
- (c) 2001, Rice University
14+
- Copyright (c) 2001, Rice University
1515
ignorable_holders:
1616
- Rice University
1717
ignorable_authors:

tests/cluecode/data/copyrights/complex_4_line_statement_in_text-9.txt.yml

Lines changed: 3 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -4,13 +4,13 @@ what:
44
- holders_summary
55
copyrights:
66
- Copyright 2002 Jonas Borgstrom <jonas@codefactory.se> 2002 Daniel Lundin <daniel@codefactory.se>
7-
2002 CodeFactory AB
7+
2002 CodeFactory AB.
88
- Copyright (c) 1994 The Regents of the University of California
99
holders:
10-
- Jonas Borgstrom Daniel Lundin CodeFactory AB
10+
- Jonas Borgstrom Daniel Lundin CodeFactory AB.
1111
- The Regents of the University of California
1212
holders_summary:
13-
- value: Jonas Borgstrom Daniel Lundin CodeFactory AB
13+
- value: Jonas Borgstrom Daniel Lundin CodeFactory AB.
1414
count: 1
1515
- value: The Regents of the University of California
1616
count: 1

tests/cluecode/data/copyrights/copytest/with_lead_copy_sign_and_debian_s_tags.txt.yml

Lines changed: 3 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -4,12 +4,12 @@ what:
44
- holders_summary
55
copyrights:
66
- Copyright (c) 2008-2009 Nokia Corporation and/or its subsidiary(-ies)
7-
- (c) 1994-2008 Trolltech ASA
7+
- (c) 1994-2008 Trolltech ASA.
88
holders:
99
- Nokia Corporation and/or its subsidiary(-ies)
10-
- Trolltech ASA
10+
- Trolltech ASA.
1111
holders_summary:
1212
- value: Nokia Corporation and/or its subsidiary(-ies)
1313
count: 1
14-
- value: Trolltech ASA
14+
- value: Trolltech ASA.
1515
count: 1

0 commit comments

Comments
 (0)