7
7
# See https://aboutcode.org for more information about nexB OSS projects.
8
8
#
9
9
10
+ """
11
+ Filter out or ignore (that is, remove) redundant or irrelevant detected clues such as copyrights,
12
+ authors, emails, and urls that are already contained in a matched license text or license rule and
13
+ treated as ignorable.
14
+ """
15
+
10
16
from itertools import chain
11
17
12
18
import attr
@@ -63,22 +69,24 @@ def process_codebase(self, codebase, **kwargs):
63
69
if TRACE : logger_debug ('RedundantFilter:process_codebase' )
64
70
65
71
from licensedcode .cache import get_index
72
+ rules_by_id = get_index ().rules_by_id
66
73
67
74
for resource in codebase .walk ():
68
- filtered = filter_ignorable_resource_clues (resource , get_index (). rules_by_id )
75
+ filtered = filter_ignorable_resource_clues (resource = resource , rules_by_id = rules_by_id )
69
76
if filtered :
70
77
filtered .save (codebase )
71
78
72
79
73
80
def filter_ignorable_resource_clues (resource , rules_by_id ):
74
81
"""
75
- Filter ignorable clues from the `resource` Resource objects using all the
76
- scan details attached to that `resource` and the `rules_by_id` mapping of
77
- {identifier: license Rule object}. Return the `resource` object modified in-
78
- place if it was modified.
82
+ Filter ignorable clues from the ``resource`` Resource object using all the
83
+ scan details attached to that ``resource`` and the ``rules_by_id`` mapping of
84
+ {identifier: license Rule object}. Return the ``resource`` object modified in-
85
+ place if it was modified, or None otherwise.
79
86
"""
80
87
detections = Detections .from_resource (resource )
81
- filtered = filter_ignorable_clues (detections , rules_by_id )
88
+ filtered = filter_ignorable_clues (detections = detections , rules_by_id = rules_by_id )
89
+ logger_debug (f'filter_ignorable_resource_clues: { filtered } ' )
82
90
if filtered :
83
91
if hasattr (resource , 'emails' ):
84
92
resource .emails = filtered .emails
@@ -97,8 +105,7 @@ def filter_ignorable_resource_clues(resource, rules_by_id):
97
105
class Ignorable (object ):
98
106
# a frozenset of matched line numbers
99
107
lines_range = attr .ib ()
100
- # either a string or a frozenset of strings, such that we can test for `x in
101
- # value`
108
+ # either a string or a frozenset of strings, such that we can test for `x in value`
102
109
value = attr .ib ()
103
110
104
111
@@ -119,20 +126,22 @@ class Detections(object):
119
126
urls = attr .ib (default = attr .Factory (list ))
120
127
emails = attr .ib (default = attr .Factory (list ))
121
128
122
- licenses = attr .ib (default = attr .Factory (list ))
129
+ license_matches = attr .ib (default = attr .Factory (list ))
123
130
124
131
# this is the same as author and copyrights, but restructured to be in the
125
132
# same format as ignorables and is used to filter emails and urls in authors
126
133
# and copyright
127
- copyrights_as_ignorable = attr .ib (default = attr .Factory (list ), repr = False )
128
- holders_as_ignorable = attr .ib (default = attr .Factory (list ), repr = False )
129
- authors_as_ignorable = attr .ib (default = attr .Factory (list ), repr = False )
134
+ copyrights_as_ignorable = attr .ib (default = attr .Factory (list ))
135
+ holders_as_ignorable = attr .ib (default = attr .Factory (list ))
136
+ authors_as_ignorable = attr .ib (default = attr .Factory (list ))
130
137
131
138
@staticmethod
132
139
def from_scan_data (data ):
133
140
detected_copyrights = data .get ('copyrights' , [])
134
141
detected_authors = data .get ('authors' , [])
135
142
detected_holders = data .get ('holders' , [])
143
+ detected_emails = data .get ('emails' , [])
144
+ detected_urls = data .get ('urls' , [])
136
145
137
146
copyrights_as_ignorable = frozenset (
138
147
Ignorable (
@@ -155,19 +164,23 @@ def from_scan_data(data):
155
164
for a in detected_authors
156
165
)
157
166
158
- return Detections (
167
+ license_matches = list (chain .from_iterable (d ['matches' ] for d in data ['license_detections' ]))
168
+
169
+ detections = Detections (
159
170
copyrights = detected_copyrights ,
160
- emails = data . get ( 'emails' , []) ,
161
- urls = data . get ( 'urls' , []) ,
171
+ emails = detected_emails ,
172
+ urls = detected_urls ,
162
173
holders = detected_holders ,
163
174
authors = detected_authors ,
164
175
165
- authors_as_ignorable = authors_as_ignorable ,
166
176
copyrights_as_ignorable = copyrights_as_ignorable ,
167
177
holders_as_ignorable = holders_as_ignorable ,
178
+ authors_as_ignorable = authors_as_ignorable ,
168
179
169
- licenses = data . get ( 'licenses' , []) ,
180
+ license_matches = license_matches ,
170
181
)
182
+ detections .debug ()
183
+ return detections
171
184
172
185
@staticmethod
173
186
def from_resource (resource ):
@@ -185,11 +198,21 @@ def as_iterable(self):
185
198
(('url' , c ) for c in self .urls ),
186
199
)
187
200
201
+ def debug (self ):
202
+ if TRACE :
203
+ logger_debug ('Detections' )
204
+ for nv in self .as_iterable ():
205
+ logger_debug (' ' , nv ),
206
+
207
+ logger_debug (' copyrights_as_ignorable:' , self .copyrights_as_ignorable )
208
+ logger_debug (' holders_as_ignorable: ' , self .holders_as_ignorable )
209
+ logger_debug (' authors_as_ignorable: ' , self .authors_as_ignorable )
210
+ logger_debug (' license_matches: ' , self .license_matches )
211
+
188
212
189
213
def is_empty (clues ):
190
214
if clues :
191
- return not any ([
192
- clues .copyrights , clues .holders , clues .authors , clues .urls , clues .emails ])
215
+ return not any ([clues .copyrights , clues .holders , clues .authors , clues .urls , clues .emails ])
193
216
else :
194
217
# The logic is reversed, so a false or None "clues" object returns None, which
195
218
# is interpreted as False (i.e., the object is *not* empty).
@@ -204,18 +227,22 @@ def filter_ignorable_clues(detections, rules_by_id):
204
227
"""
205
228
if is_empty (detections ):
206
229
return
230
+ if TRACE :
231
+ logger_debug ('filter_ignorable_clues: detections' )
232
+ detections .debug ()
207
233
208
234
no_detected_ignorables = not detections .copyrights and not detections .authors
209
235
210
- ignorables = collect_ignorables (detections .licenses , rules_by_id )
211
-
212
- no_ignorables = not detections .licenses or is_empty (ignorables )
236
+ ignorables = collect_ignorables (license_matches = detections .license_matches , rules_by_id = rules_by_id )
237
+ no_ignorables = not detections .license_matches or is_empty (ignorables )
213
238
214
239
if TRACE :
215
240
logger_debug ('ignorables' , ignorables )
216
241
# logger_debug('detections', detections)
217
242
218
243
if no_ignorables and no_detected_ignorables :
244
+ if TRACE :
245
+ logger_debug ('filter_ignorable_clues: NO IGNORABLES' )
219
246
return
220
247
221
248
# discard redundant emails if ignorable or in a detections copyright or author
@@ -307,9 +334,9 @@ def filter_values(attributes, ignorables, value_key='copyright', strip=''):
307
334
308
335
def collect_ignorables (license_matches , rules_by_id ):
309
336
"""
310
- Collect and return an Ignorables object built from ``license_matches``
311
- matched licenses list of "licenses" objects returned in ScanCode JSON
312
- results and the ``rules_by_id`` mapping of Rule objects by identifier.
337
+ Collect and return an Ignorables object built from the ``license_matches`` list of license matches
338
+ as returned in the ScanCode results ``license_detections`` and the ``rules_by_id`` mapping of Rule
339
+ objects by rule identifier.
313
340
314
341
The value of each ignorable list of clues is a set of (set of line numbers,
315
342
set of ignorable values).
@@ -321,38 +348,39 @@ def collect_ignorables(license_matches, rules_by_id):
321
348
copyrights = set ()
322
349
323
350
if not license_matches :
351
+ if TRACE :
352
+ logger_debug ('collect_ignorables: No ignorables!!!!' )
324
353
return Ignorables (
325
354
copyrights = frozenset (copyrights ),
326
355
holders = frozenset (holders ),
327
356
authors = frozenset (authors ),
328
357
urls = frozenset (urls ),
329
358
emails = frozenset (emails ),
330
359
)
331
- # build tuple of (set of lines number, set of ignorbale values)
332
- for lic in license_matches :
360
+
361
+ # build tuple of (set of line numbers, set of ignorable values)
362
+ for licmat in license_matches :
333
363
334
364
if TRACE :
335
- logger_debug ('collect_ignorables: license :' , lic [ 'key ' ], lic ['score' ])
365
+ logger_debug ('collect_ignorables: license_match :' , licmat [ 'license_expression ' ], licmat ['score' ])
336
366
337
- matched_rule = lic .get ('matched_rule' , {})
338
- rid = matched_rule .get ('identifier' )
339
- match_coverage = matched_rule .get ('match_coverage' , 0 )
367
+ rid = licmat ['rule_identifier' ]
368
+ if not rid :
369
+ # we are missing the license match details, we can only skip
370
+ if TRACE : logger_debug (' collect_ignorables: skipping, no RID' )
371
+ continue
340
372
341
373
# ignore poor partial matches
342
374
# TODO: there must be a better way using coverage
375
+ match_coverage = float (licmat ['match_coverage' ])
343
376
if match_coverage < 90 :
344
377
if TRACE :
345
378
logger_debug (' collect_ignorables: skipping, match_coverage under 90%' )
346
379
continue
347
380
348
- if not rid :
349
- # we are missing the license match details, we can only skip
350
- if TRACE : logger_debug (' collect_ignorables: skipping, no RID' )
351
- continue
352
-
353
381
rule = rules_by_id [rid ]
354
382
355
- lines_range = frozenset (range (lic ['start_line' ], lic ['end_line' ] + 1 ))
383
+ lines_range = frozenset (range (licmat ['start_line' ], licmat ['end_line' ] + 1 ))
356
384
357
385
ign_copyrights = frozenset (rule .ignorable_copyrights or [])
358
386
if ign_copyrights :
0 commit comments