Skip to content

Commit 64bf18c

Browse files
Detect community files to ignore license detections
Signed-off-by: Ayan Sinha Mahapatra <ayansmahapatra@gmail.com>
1 parent 3178267 commit 64bf18c

33 files changed

+1535
-474
lines changed

src/licensedcode/detection.py

Lines changed: 29 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,8 @@
3737
from licensedcode.spans import Span
3838
from licensedcode.tokenize import query_tokenizer
3939

40+
from summarycode.classify import check_is_path_community_file
41+
4042
"""
4143
LicenseDetection data structure and processing.
4244
@@ -1667,6 +1669,25 @@ def get_license_keys_from_detections(license_detections, licensing=Licensing()):
16671669
return list(license_keys)
16681670

16691671

1672+
def can_ignore_ambiguous_detection(license_detection):
    """
    Return True if the ``license_detection`` is not an ambiguous detection
    which needs to be reviewed, and can therefore be ignored. The cases are:

    1. All the locations of the license detection are community files.

    Return False when the detection has no file regions at all: without any
    known location there is nothing to justify skipping the review.
    """
    file_paths = [
        file_region.path
        for file_region in license_detection.file_regions
    ]
    # all() is vacuously True for an empty iterable; guard so a detection
    # without any file region is not silently dropped from review.
    if not file_paths:
        return False

    return all(
        check_is_path_community_file(file_path)
        for file_path in file_paths
    )
1689+
1690+
16701691
def get_ambiguous_license_detections_by_type(unique_license_detections):
16711692
"""
16721693
Return a list of ambiguous unique license detections which needs review
@@ -1677,13 +1698,20 @@ def get_ambiguous_license_detections_by_type(unique_license_detections):
16771698
ambi_license_detections = {}
16781699

16791700
for detection in unique_license_detections:
1701+
16801702
if not detection.license_expression:
16811703
ambi_license_detections[DetectionCategory.LOW_QUALITY_MATCH_FRAGMENTS.value] = detection
16821704

1705+
elif can_ignore_ambiguous_detection(detection):
1706+
continue
1707+
16831708
elif is_undetected_license_matches(license_matches=detection.matches):
16841709
ambi_license_detections[DetectionCategory.UNDETECTED_LICENSE.value] = detection
16851710

1686-
elif has_correct_license_clue_matches(license_matches=detection.matches):
1711+
elif (
1712+
has_correct_license_clue_matches(license_matches=detection.matches) and
1713+
has_unknown_matches(license_matches=detection.matches)
1714+
):
16871715
ambi_license_detections[DetectionCategory.LICENSE_CLUES.value] = detection
16881716

16891717
elif "unknown" in detection.license_expression:

src/summarycode/classify.py

Lines changed: 47 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,9 @@
88
#
99

1010

11+
from commoncode.fileutils import file_name
12+
from commoncode.fileutils import file_base_name
13+
1114
def get_relative_path(root_path, path):
1215
"""
1316
Return a relative path from the posix 'path' relative to a
@@ -91,6 +94,49 @@ def get_relative_path(root_path, path):
9194
'readme',
9295
)
9396

97+
# Base names (without extension) of files that document how a project's
# community works rather than how the code is licensed.
COMMUNITY_FILES = (
    'CHANGELOG',
    'ROADMAP',
    'CONTRIBUTING',
    'CODE_OF_CONDUCT',
    'AUTHORS',
    'SECURITY',
    'FUNDING',
)


def clean_underscore_dash(filename):
    """
    Return the ``filename`` string with all underscores and dashes removed.
    """
    return filename.replace('_', '').replace('-', '')


# Normalized (lowercased, underscore- and dash-free) community file names,
# computed once at import time rather than on every call.
_COMMUNITY_FILES_CLEANED = tuple(
    clean_underscore_dash(community_name.lower())
    for community_name in COMMUNITY_FILES
)


def check_is_community_file(filename):
    """
    Return True if the resource is a known community filename,
    return False otherwise.

    The ``filename`` is lowercased and stripped of underscores and dashes,
    then matches if it starts or ends with any normalized COMMUNITY_FILES
    name.
    """
    name = clean_underscore_dash(filename.lower())
    # str.startswith() and str.endswith() accept a tuple of candidates.
    return (
        name.startswith(_COMMUNITY_FILES_CLEANED)
        or name.endswith(_COMMUNITY_FILES_CLEANED)
    )
129+
130+
131+
def check_is_resource_community_file(resource):
    """
    Return True if either the ``resource`` name or its base name is a known
    community filename, return False otherwise.
    """
    if check_is_community_file(resource.name):
        return True
    return check_is_community_file(resource.base_name)
133+
134+
135+
def check_is_path_community_file(path):
    """
    Return True if the file name or the file base name of the posix ``path``
    is a known community filename, return False otherwise.
    """
    candidates = (
        file_name(path, force_posix=True),
        file_base_name(path, force_posix=True),
    )
    return any(check_is_community_file(candidate) for candidate in candidates)
139+
94140

95141
def check_resource_name_start_and_end(resource, STARTS_ENDS):
96142
"""
@@ -119,6 +165,7 @@ def set_classification_flags(resource,
119165

120166
resource.is_legal = is_legal = check_resource_name_start_and_end(resource, _LEGAL)
121167
resource.is_readme = is_readme = check_resource_name_start_and_end(resource, _README)
168+
resource.is_community = check_is_resource_community_file(resource)
122169
# FIXME: this will never be picked up as this is NOT available in a pre-scan plugin
123170
has_package_data = bool(getattr(resource, 'package_data', False))
124171
resource.is_manifest = is_manifest = path.endswith(_MANIF) or has_package_data

src/summarycode/classify_plugin.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -77,6 +77,10 @@ class FileClassifier(PostScanPlugin):
7777
Boolean(help='True if this file is "top-level" file and either a '
7878
'legal, readme or manifest file.')),
7979

80+
('is_community',
81+
Boolean(help='True if this file is a community file generally used for'
82+
' maintenance or participation in the FOSS project community.')),
83+
8084
# ('is_doc',
8185
# Boolean(help='True if this file is likely a documentation file.')),
8286
#

tests/summarycode/data/classify/cli.expected.json

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@
2424
"is_readme": false,
2525
"is_top_level": true,
2626
"is_key_file": false,
27+
"is_community": false,
2728
"files_count": 8,
2829
"dirs_count": 1,
2930
"size_count": 0,
@@ -53,6 +54,7 @@
5354
"is_readme": true,
5455
"is_top_level": true,
5556
"is_key_file": true,
57+
"is_community": false,
5658
"files_count": 0,
5759
"dirs_count": 0,
5860
"size_count": 0,
@@ -82,6 +84,7 @@
8284
"is_readme": true,
8385
"is_top_level": true,
8486
"is_key_file": true,
87+
"is_community": false,
8588
"files_count": 0,
8689
"dirs_count": 0,
8790
"size_count": 0,
@@ -111,6 +114,7 @@
111114
"is_readme": false,
112115
"is_top_level": true,
113116
"is_key_file": true,
117+
"is_community": false,
114118
"files_count": 0,
115119
"dirs_count": 0,
116120
"size_count": 0,
@@ -140,6 +144,7 @@
140144
"is_readme": false,
141145
"is_top_level": true,
142146
"is_key_file": true,
147+
"is_community": false,
143148
"files_count": 0,
144149
"dirs_count": 0,
145150
"size_count": 0,
@@ -169,6 +174,7 @@
169174
"is_readme": false,
170175
"is_top_level": true,
171176
"is_key_file": false,
177+
"is_community": false,
172178
"files_count": 2,
173179
"dirs_count": 0,
174180
"size_count": 0,
@@ -198,6 +204,7 @@
198204
"is_readme": false,
199205
"is_top_level": false,
200206
"is_key_file": false,
207+
"is_community": false,
201208
"files_count": 0,
202209
"dirs_count": 0,
203210
"size_count": 0,
@@ -227,6 +234,7 @@
227234
"is_readme": false,
228235
"is_top_level": false,
229236
"is_key_file": false,
237+
"is_community": false,
230238
"files_count": 0,
231239
"dirs_count": 0,
232240
"size_count": 0,
@@ -256,6 +264,7 @@
256264
"is_readme": false,
257265
"is_top_level": true,
258266
"is_key_file": false,
267+
"is_community": false,
259268
"files_count": 0,
260269
"dirs_count": 0,
261270
"size_count": 0,
@@ -285,6 +294,7 @@
285294
"is_readme": false,
286295
"is_top_level": true,
287296
"is_key_file": false,
297+
"is_community": false,
288298
"files_count": 0,
289299
"dirs_count": 0,
290300
"size_count": 0,

0 commit comments

Comments
 (0)