Skip to content

Commit b10a399

Browse files
Update license rules reference data
* Add rule text to reference data
* Add rule URL to reference data
* Make rule references unique
* Reorder rule references data
* Regenerate test expectations

Signed-off-by: Ayan Sinha Mahapatra <ayansmahapatra@gmail.com>
1 parent 972df60 commit b10a399

File tree

76 files changed

+2119
-6478
lines changed

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

76 files changed

+2119
-6478
lines changed

src/licensedcode/detection.py

Lines changed: 5 additions & 49 deletions
Original file line numberDiff line numberDiff line change
@@ -26,16 +26,17 @@
2626
from licensedcode.cache import get_cache
2727
from licensedcode.match import LicenseMatch
2828
from licensedcode.match import set_matched_lines
29-
from licensedcode.models import Rule
29+
from licensedcode.models import UnDetectedRule
3030
from licensedcode.models import BasicRule
31-
from licensedcode.models import SpdxRule
3231
from licensedcode.models import compute_relevance
32+
from licensedcode.models import get_rule_object_from_match
3333
from licensedcode.spans import Span
3434
from licensedcode.tokenize import query_tokenizer
3535
from licensedcode.query import Query
3636
from licensedcode.query import LINES_THRESHOLD
3737
from licensedcode.licenses_reference import extract_license_rules_reference_data
3838

39+
3940
"""
4041
LicenseDetection data structure and processing.
4142
@@ -612,24 +613,8 @@ def matches_from_license_match_mappings(license_match_mappings):
612613
license_matches = []
613614

614615
for license_match_mapping in license_match_mappings:
615-
matcher = license_match_mapping["matcher"]
616-
rule_identifier = license_match_mapping["rule_identifier"]
617-
if matcher == "1-spdx-id":
618-
rule = SpdxRule(
619-
license_expression=license_match_mapping["license_expression"],
620-
text=license_match_mapping.get("matched_text", None),
621-
length=license_match_mapping["matched_length"],
622-
)
623-
elif rule_identifier == 'package-manifest-unknown':
624-
rule = UnDetectedRule(
625-
license_expression=license_match_mapping["license_expression"],
626-
text=license_match_mapping.get("matched_text", None),
627-
length=license_match_mapping["matched_length"],
628-
)
629-
else:
630-
rule = get_index().rules_by_id[rule_identifier]
631-
632-
license_rule_reference = rule.get_reference_data(matcher=matcher)
616+
rule = get_rule_object_from_match(license_match_mapping)
617+
license_rule_reference = rule.get_reference_data()
633618
license_matches.append(
634619
LicenseMatchFromResult.from_license_match_mapping(
635620
license_match_mapping=license_match_mapping,
@@ -1182,35 +1167,6 @@ def get_undetected_matches(query_string):
11821167
return matches
11831168

11841169

1185-
@attr.s(slots=True, repr=False)
1186-
class UnDetectedRule(Rule):
1187-
"""
1188-
A specialized rule object that is used for the special case of extracted
1189-
license statements without any valid license detection.
1190-
1191-
Since there is a license where there is a non empty extracted license
1192-
statement (typically found in a package manifest), if there is no license
1193-
detected by scancode, it would be incorrect to not point out that there
1194-
is a license (though undetected).
1195-
"""
1196-
1197-
def __attrs_post_init__(self, *args, **kwargs):
1198-
self.identifier = 'package-manifest-' + self.license_expression
1199-
expression = self.licensing.parse(self.license_expression)
1200-
self.license_expression = expression.render()
1201-
self.license_expression_object = expression
1202-
self.is_license_tag = True
1203-
self.is_small = False
1204-
self.relevance = 100
1205-
self.has_stored_relevance = True
1206-
1207-
def load(self):
1208-
raise NotImplementedError
1209-
1210-
def dump(self):
1211-
raise NotImplementedError
1212-
1213-
12141170
def get_matches_from_detections(license_detections):
12151171
"""
12161172
Return a `license_matches` list of LicenseMatch objects from a

src/licensedcode/licenses_reference.py

Lines changed: 13 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,8 @@
1111
import logging
1212
from license_expression import Licensing
1313

14+
from licensedcode.models import get_rule_object_from_match
15+
1416

1517
TRACE_REFERENCE = os.environ.get('SCANCODE_DEBUG_LICENSE_REFERENCE', False)
1618
TRACE_EXTRACT = os.environ.get('SCANCODE_DEBUG_LICENSE_REFERENCE_EXTRACT', False)
def get_unique_rule_references(rules_data):
    """
    Return a list of unique Rule data mappings from a ``rules_data`` iterable
    of Rule data mappings.

    Uniqueness is keyed on the "rule_identifier" value of each mapping. When
    several mappings share the same identifier, the result keeps the position
    of the first one and the data of the last one.
    """
    rules_references_by_identifier = {}

    for rule_data in rules_data:
        rule_identifier = rule_data['rule_identifier']
        rules_references_by_identifier[rule_identifier] = rule_data

    # Materialize a real list: a dict values view is neither indexable nor
    # JSON-serializable, and the documented return value is a list.
    return list(rules_references_by_identifier.values())
222220

223221

224222
def extract_license_rules_reference_data(license_detections=None, license_matches=None):
@@ -273,17 +271,21 @@ def extract_license_rules_reference_data(license_detections=None, license_matche
273271

274272
def get_reference_data(match):
275273

274+
rule = get_rule_object_from_match(license_match_mapping=match)
275+
276276
ref_data = {}
277-
ref_data['license_expression'] = match['license_expression']
278277
ref_data['rule_identifier'] = match['rule_identifier']
279-
ref_data['referenced_filenames'] = match.pop('referenced_filenames')
278+
ref_data['license_expression'] = match['license_expression']
279+
ref_data['rule_url'] = rule.rule_url
280+
ref_data['rule_relevance'] = match.pop('rule_relevance')
281+
ref_data['rule_length'] = match.pop('rule_length')
280282
ref_data['is_license_text'] = match.pop('is_license_text')
281283
ref_data['is_license_notice'] = match.pop('is_license_notice')
282284
ref_data['is_license_reference'] = match.pop('is_license_reference')
283285
ref_data['is_license_tag'] = match.pop('is_license_tag')
284286
ref_data['is_license_intro'] = match.pop('is_license_intro')
285-
ref_data['rule_length'] = match.pop('rule_length')
286-
ref_data['rule_relevance'] = match.pop('rule_relevance')
287+
ref_data['referenced_filenames'] = match.pop('referenced_filenames')
288+
ref_data['rule_text'] = rule.text
287289

288290
_ = match.pop('licenses')
289291

src/licensedcode/models.py

Lines changed: 62 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,7 @@
3333
from licensedcode import MIN_MATCH_HIGH_LENGTH
3434
from licensedcode import MIN_MATCH_LENGTH
3535
from licensedcode import SMALL_RULE
36+
from licensedcode.cache import get_index
3637
from licensedcode.frontmatter import SaneYAMLHandler
3738
from licensedcode.frontmatter import FrontmatterPost
3839
from licensedcode.frontmatter import dumps_frontmatter
@@ -1577,6 +1578,18 @@ class BasicRule:
15771578
'position is using the magic -1 key.')
15781579
)
15791580

1581+
@property
def rule_url(self):
    """
    Return the URL string for this rule, or None when it has no URL.

    There is no URL when the identifier contains 'spdx-license-identifier'
    or is exactly 'package-manifest-unknown'. Otherwise the identifier is
    formatted into SCANCODE_LICENSE_RULE_URL when ``is_from_license`` is
    true, and into SCANCODE_RULE_URL when it is not.
    """
    identifier = self.identifier

    has_no_url = (
        'spdx-license-identifier' in identifier
        or identifier == 'package-manifest-unknown'
    )
    if has_no_url:
        return None

    if self.is_from_license:
        url_template = SCANCODE_LICENSE_RULE_URL
    else:
        url_template = SCANCODE_RULE_URL

    return url_template.format(identifier)
1591+
1592+
15801593
def rule_file(
15811594
self,
15821595
rules_data_dir=rules_data_dir,
@@ -1772,20 +1785,13 @@ def get_min_high_matched_length(self, unique=False):
17721785
return (self.min_high_matched_length_unique if unique
17731786
else self.min_high_matched_length)
17741787

1775-
def get_reference_data(self, matcher=None):
1788+
def get_reference_data(self):
17761789

17771790
data = {}
17781791

17791792
data['license_expression'] = self.license_expression
17801793
data['rule_identifier'] = self.identifier
1781-
if matcher:
1782-
if matcher == "1-spdx-id":
1783-
data['rule_url'] = None
1784-
elif self.is_from_license:
1785-
data['rule_url'] = SCANCODE_LICENSE_RULE_URL.format(self.identifier)
1786-
else:
1787-
data['rule_url'] = SCANCODE_RULE_URL.format(self.identifier)
1788-
1794+
data['rule_url'] = self.rule_url
17891795
data['referenced_filenames'] = self.referenced_filenames
17901796
data['is_license_text'] = self.is_license_text
17911797
data['is_license_notice'] = self.is_license_notice
@@ -2228,6 +2234,24 @@ def set_relevance(self):
22282234
self.relevance = computed_relevance
22292235

22302236

2237+
def get_rule_object_from_match(license_match_mapping):
2238+
rule_identifier = license_match_mapping["rule_identifier"]
2239+
if 'spdx-license-identifier' in rule_identifier:
2240+
return SpdxRule(
2241+
license_expression=license_match_mapping["license_expression"],
2242+
text=license_match_mapping.get("matched_text", None),
2243+
length=license_match_mapping["matched_length"],
2244+
)
2245+
elif rule_identifier == 'package-manifest-unknown':
2246+
return UnDetectedRule(
2247+
license_expression=license_match_mapping["license_expression"],
2248+
text=license_match_mapping.get("matched_text", None),
2249+
length=license_match_mapping["matched_length"],
2250+
)
2251+
else:
2252+
return get_index().rules_by_id[rule_identifier]
2253+
2254+
22312255
def compute_relevance(length):
22322256
"""
22332257
Return a computed ``relevance`` given a ``length`` and a threshold.
@@ -2445,6 +2469,35 @@ def compute_unique_id(self):
24452469
return hashlib.md5(self.text.encode('utf-8')).hexdigest()
24462470

24472471

2472+
@attr.s(slots=True, repr=False)
class UnDetectedRule(Rule):
    """
    A specialized rule used for an extracted license statement for which
    there is no valid license detection.

    A non-empty extracted license statement (typically found in a package
    manifest) means that there is a license. When scancode detects none, it
    would be incorrect not to report that a license exists, even though it
    is undetected.
    """

    def __attrs_post_init__(self, *args, **kwargs):
        # Build the identifier from the expression as provided, before the
        # expression is replaced below by its parsed and rendered form.
        self.identifier = 'package-manifest-' + self.license_expression

        parsed_expression = self.licensing.parse(self.license_expression)
        self.license_expression_object = parsed_expression
        self.license_expression = parsed_expression.render()

        # Fixed flags and relevance for this kind of rule.
        self.is_license_tag = True
        self.is_small = False
        self.has_stored_relevance = True
        self.relevance = 100

    def load(self):
        # Loading is not implemented for this rule.
        raise NotImplementedError

    def dump(self):
        # Dumping is not implemented for this rule.
        raise NotImplementedError
2499+
2500+
24482501
def _print_rule_stats():
24492502
"""
24502503
Print rules statistics.

tests/cluecode/data/plugin_filter_clues/filtered-expected.json

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -47,16 +47,18 @@
4747
],
4848
"license_rule_references": [
4949
{
50-
"license_expression": "apache-1.1",
5150
"rule_identifier": "apache-1.1_63.RULE",
52-
"referenced_filenames": [],
51+
"license_expression": "apache-1.1",
52+
"rule_url": "https://github.com/nexB/scancode-toolkit/tree/develop/src/licensedcode/data/rules/apache-1.1_63.RULE",
53+
"rule_relevance": 100,
54+
"rule_length": 367,
5355
"is_license_text": true,
5456
"is_license_notice": false,
5557
"is_license_reference": false,
5658
"is_license_tag": false,
5759
"is_license_intro": false,
58-
"rule_length": 367,
59-
"rule_relevance": 100
60+
"referenced_filenames": [],
61+
"rule_text": "is licensed under the\nApache Software License, Version 1.1, which is reproduced below.\n\n/*\n* The Apache Software License, Version 1.1\n*\n*\n* Copyright (c) The Apache Software Foundation. All rights\n* reserved.\n*\n* Redistribution and use in source and binary forms, with or without\n* modification, are permitted provided that the following conditions\n* are met:\n*\n* 1. Redistributions of source code must retain the above copyright\n* notice, this list of conditions and the following disclaimer.\n*\n* 2. Redistributions in binary form must reproduce the above copyright\n* notice, this list of conditions and the following disclaimer in\n* the documentation and/or other materials provided with the\n* distribution.\n*\n* 3. The end-user documentation included with the redistribution,\n* if any, must include the following acknowledgment:\n* \"This product includes software developed by the\n* Apache Software Foundation (http://www.apache.org/).\"\n* Alternately, this acknowledgment may appear in the software itself,\n* if and wherever such third-party acknowledgments normally appear.\n*\n* 4. The names \"Xerces\" and \"Apache Software Foundation\" must\n* not be used to endorse or promote products derived from this\n* software without prior written permission. For written\n* permission, please contact apache@apache.org.\n*\n* 5. Products derived from this software may not be called \"Apache\",\n* nor may \"Apache\" appear in their name, without prior written\n* permission of the Apache Software Foundation.\n*\n* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED\n* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES\n* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE\n* DISCLAIMED. 
IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR\n* ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,\n* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT\n* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF\n* USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND\n* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,\n* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT\n* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF\n* SUCH DAMAGE.\n* ====================================================================\n*\n* This software consists of voluntary contributions made by many\n* individuals on behalf of the Apache Software Foundation and was\n* originally based on software copyright (c) 1999, International\n* Business Machines, Inc., http://www.ibm.com. For more\n* information on the Apache Software Foundation, please see\n* <http://www.apache.org/>."
6062
}
6163
],
6264
"files": [

tests/cluecode/data/plugin_filter_clues/filtered-expected2.json

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -39,16 +39,18 @@
3939
],
4040
"license_rule_references": [
4141
{
42-
"license_expression": "pygres-2.2",
4342
"rule_identifier": "pygres-2.2_2.RULE",
44-
"referenced_filenames": [],
43+
"license_expression": "pygres-2.2",
44+
"rule_url": "https://github.com/nexB/scancode-toolkit/tree/develop/src/licensedcode/data/rules/pygres-2.2_2.RULE",
45+
"rule_relevance": 100,
46+
"rule_length": 145,
4547
"is_license_text": true,
4648
"is_license_notice": false,
4749
"is_license_reference": false,
4850
"is_license_tag": false,
4951
"is_license_intro": false,
50-
"rule_length": 145,
51-
"rule_relevance": 100
52+
"referenced_filenames": [],
53+
"rule_text": "Permission to use, copy, modify, and distribute this software and its\ndocumentation for any purpose, without fee, and without a written\nagreement is hereby granted, provided that the above copyright notice and\nthis paragraph and the following two paragraphs appear in all copies or in\nany new file that contains a substantial portion of this file.\n\nIN NO EVENT SHALL THE AUTHOR BE LIABLE TO ANY PARTY FOR DIRECT, INDIRECT,\nSPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, INCLUDING LOST PROFITS,\nARISING OUT OF THE USE OF THIS SOFTWARE AND ITS DOCUMENTATION, EVEN IF THE\nAUTHOR HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.\n\nTHE AUTHOR SPECIFICALLY DISCLAIMS ANY WARRANTIES, INCLUDING, BUT NOT LIMITED\nTO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR\nPURPOSE. THE SOFTWARE PROVIDED HEREUNDER IS ON AN \"AS IS\" BASIS, AND THE\nAUTHOR HAS NO OBLIGATIONS TO PROVIDE MAINTENANCE, SUPPORT, UPDATES,\nENHANCEMENTS, OR MODIFICATIONS."
5254
}
5355
],
5456
"files": [

tests/cluecode/data/plugin_filter_clues/filtered-expected3.json

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -40,16 +40,18 @@
4040
],
4141
"license_rule_references": [
4242
{
43-
"license_expression": "pcre",
4443
"rule_identifier": "pcre.LICENSE",
45-
"referenced_filenames": [],
44+
"license_expression": "pcre",
45+
"rule_url": "https://github.com/nexB/scancode-toolkit/tree/develop/src/licensedcode/data/licenses/pcre.LICENSE",
46+
"rule_relevance": 100,
47+
"rule_length": 303,
4648
"is_license_text": true,
4749
"is_license_notice": false,
4850
"is_license_reference": false,
4951
"is_license_tag": false,
5052
"is_license_intro": false,
51-
"rule_length": 303,
52-
"rule_relevance": 100
53+
"referenced_filenames": [],
54+
"rule_text": "PCRE LICENCE\n------------\n\nPCRE is a library of functions to support regular expressions whose\nsyntax and semantics are as close as possible to those of the Perl 5\nlanguage.\n\nWritten by: Philip Hazel <ph10@cam.ac.uk>\nUniversity of Cambridge Computing Service, Cambridge, England.\nPhone: +44 1223 334714.\nCopyright (c) 1997-2001 University of Cambridge\n\nPermission is granted to anyone to use this software for any purpose on\nany computer system, and to redistribute it freely, subject to the\nfollowing restrictions:\n\n1. This software is distributed in the hope that it will be useful, but\nWITHOUT ANY WARRANTY; without even the implied warranty of\nMERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.\n\n2. The origin of this software must not be misrepresented, either by\nexplicit claim or by omission. In practice, this means that if you use\nPCRE in software which you distribute to others, commercially or\notherwise, you must put a sentence like this\n\"Regular expression support is provided by the PCRE library package,\nwhich is open source software, written by Philip Hazel, and copyright by\nthe University of Cambridge, England\"\n\nsomewhere reasonably visible in your documentation and in any relevant\nfiles or online help data or similar.\n\nA reference to the ftp site for the source, that is, to\nftp://ftp.csx.cam.ac.uk/pub/software/programming/pcre/\nshould also be given in the documentation.\n\n3. Altered versions must be plainly marked as such, and must not be\nmisrepresented as being the original software.\n\n4. If PCRE is embedded in any software that is released under the GNU\nGeneral Purpose Licence (GPL), or Lesser General Purpose Licence (LGPL),\nthen the terms of that licence shall supersede any condition above with\nwhich it is incompatible.\n\nThe documentation for PCRE, supplied in the \"doc\" directory, is\ndistributed under the same terms as the software itself.\n\nEnd PCRE LICENCE"
5355
}
5456
],
5557
"files": [

0 commit comments

Comments
 (0)