Skip to content

Commit b4309dd

Browse files
committed
Improve copyright detection more
Handle more edge cases with new preparation.

Signed-off-by: Philippe Ombredanne <pombredanne@nexb.com>
1 parent e1c12ca commit b4309dd

30 files changed

+150
-45
lines changed

src/cluecode/copyrights.py

Lines changed: 25 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2177,6 +2177,10 @@ def build_detection_from_node(
21772177
# exceptions to all CAPS words
21782178
(r'^[A-Z]{3,4}[0-9]{4},?$', 'NN'),
21792179

2180+
# exceptions to CAPS used in obfuscated emails like in joe AT foo DOT com
2181+
(r'^AT$', 'AT'),
2182+
(r'^DOT$', 'DOT'),
2183+
21802184
# all CAPS word, at least 1 char long such as MIT, including an optional trailing comma or dot
21812185
(r'^[A-Z0-9]+,?$', 'CAPS'),
21822186

@@ -2245,6 +2249,12 @@ def build_detection_from_node(
22452249
(r'Keio_University\)?,?$', 'NAME'),
22462250
(r'__MyCompanyName__[\.,]?$', 'NAME'),
22472251

2252+
# email in brackets <brett_AT_jdom_DOT_org>
2253+
#(karl AT indy.rr.com)
2254+
#<fdlibm-comments AT sun.com>
2255+
(r'(?i:^[<\(][\w\.\-\+]+at[\w\.\-\+]+(dot)?[\w\.\-\+]+[/)>]$)', 'EMAIL'),
2256+
2257+
22482258
# Code variable names including snake case
22492259
(r'^.*(_.*)+$', 'JUNK'),
22502260

@@ -2322,6 +2332,9 @@ def build_detection_from_node(
23222332
23232333
EMAIL: {<EMAIL_START> <CC> <NN>* <EMAIL_END>} # composite_email
23242334
2335+
# created by Jason Hunter <jhunter AT jdom DOT org>
2336+
EMAIL: {<EMAIL_START> <AT> <NN|NNP> <DOT> <NN|NNP> } # email_start
2337+
23252338
EMAIL: { <NN> <CC> <NN> <DOT> <NN> } # foo at bat dot com
23262339
23272340
# foo@bar.com or baz@bar.com
@@ -2470,6 +2483,9 @@ def build_detection_from_node(
24702483
# Copyright (C) 1995-06 ICP vortex, Achim Leubner
24712484
COPYRIGHT: {<COPY> <COPY> <YR-RANGE> <CAPS> <NN> <NNP> <NNP> } #350.2
24722485
2486+
# Jason Hunter <jhunter AT jdom DOT org>
2487+
EMAIL: {<NAME|NNP|NN> <AT> <NN|NNP> <DOT> <NN|NNP>} #350.3
2488+
24732489
# Academy of Motion Picture Arts
24742490
NAME: {<NNP|PN>+ <NNP>+} #351
24752491
@@ -2563,7 +2579,8 @@ def build_detection_from_node(
25632579
URL: {<PARENS> <URL> <PARENS>} #5700
25642580
25652581
#also accept trailing email and URLs
2566-
NAME-YEAR: {<NAME-YEAR> <EMAIL>?<URL>?} #5701
2582+
# and "VAN" e.g. Du: Copyright (c) 2008 Alek Du <alek.du@intel.com>
2583+
NAME-YEAR: {<NAME-YEAR> <VAN>? <EMAIL>?<URL>?} #5701
25672584
NAME-YEAR: {<NAME-YEAR>+} #5702
25682585
25692586
NAME: {<NNP> <OF> <NNP>} #580
@@ -4322,7 +4339,7 @@ def remove_code_comment_markers(s):
43224339
"""
43234340
Return ``s`` removing code comments such as C and C++ style comment markers and assimilated
43244341
4325-
>>> remove_code_comment_markers("\\*#%; /\\/*a*/b/*c\\d#e%f \\*#%; /")
4342+
>>> remove_code_comment_markers(r"\\*#%; /\\/*a*/b/*c\\d#e%f \\*#%; /")
43264343
'a b c\\d e f'
43274344
"""
43284345
return (s
@@ -4445,6 +4462,12 @@ def prepare_text_line(line):
44454462
.replace('§', " ")
44464463
# keep http
44474464
.replace('<http', " http")
4465+
# placeholders
4466+
.replace('<insert ', " ")
4467+
.replace('year>', " ")
4468+
.replace('<year>', " ")
4469+
.replace('<name>', " ")
4470+
44484471
)
44494472

44504473
if TRACE_TOK:

src/licensedcode/data/licenses/jdom.LICENSE

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -9,11 +9,11 @@ notes: this based on an Apache 1.1 license
99
spdx_license_key: LicenseRef-scancode-jdom
1010
faq_url: http://www.jdom.org/docs/faq.html#a0030
1111
ignorable_authors:
12-
- Jason Hunter and Brett McLaughlin
13-
- the JDOM Project (http://www.jdom.org/)
12+
- Jason Hunter <jhunter_AT_jdom_DOT_org> and Brett McLaughlin <brett_AT_jdom_DOT_org>
13+
- the JDOM Project (http://www.jdom.org/)
1414
ignorable_urls:
15-
- http://www.jdom.org/
16-
- http://www.jdom.org/images/logos
15+
- http://www.jdom.org/
16+
- http://www.jdom.org/images/logos
1717
---
1818

1919
Redistribution and use in source and binary forms, with or without
@@ -61,4 +61,4 @@ This software consists of voluntary contributions made by many
6161
individuals on behalf of the JDOM Project and was originally
6262
created by Jason Hunter <jhunter_AT_jdom_DOT_org> and
6363
Brett McLaughlin <brett_AT_jdom_DOT_org>. For more information
64-
on the JDOM Project, please see <http://www.jdom.org/>.
64+
on the JDOM Project, please see <http://www.jdom.org/>.

src/licensedcode/data/licenses/lucent-pl-1.0.LICENSE

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,10 @@ osi_url: http://www.opensource.org/licenses/plan9.php
1313
other_urls:
1414
- http://opensource.org/licenses/LPL-1.0
1515
- https://opensource.org/licenses/LPL-1.0
16+
ignorable_copyrights:
17+
- Copyright (c), <ORGANIZATION> and others
18+
ignorable_holders:
19+
- others
1620
---
1721

1822
Lucent Public License Version 1.0
@@ -231,4 +235,4 @@ This Agreement is governed by the laws of the State of <STATE> and the
231235
intellectual property laws of the United States of America. No party to this
232236
Agreement will bring a legal action under this Agreement more than one year
233237
after the cause of action arose. Each party waives its rights to a jury trial in
234-
any resulting litigation.
238+
any resulting litigation.

src/licensedcode/data/rules/apache-2.0_1297.RULE

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,10 @@ license_expression: apache-2.0
33
is_license_notice: yes
44
ignorable_urls:
55
- http://www.apache.org/licenses/LICENSE-2.0
6+
ignorable_copyrights:
7+
- Copyright (c), Oracle and/or its affiliates
8+
ignorable_holders:
9+
- Oracle and/or its affiliates
610
---
711

812
The Apache Software License, Version 2.0
@@ -16,4 +20,4 @@ on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
1620
express or implied. See the License for the specific language governing
1721
permissions and limitations under the License.
1822

19-
Apache License Version 2.0, January 2004
23+
Apache License Version 2.0, January 2004

src/licensedcode/data/rules/gpl-1.0-plus_118.RULE

Lines changed: 1 addition & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -2,12 +2,8 @@
22
license_expression: gpl-1.0-plus
33
is_license_tag: yes
44
relevance: 100
5-
ignorable_copyrights:
6-
- copyright GNU GENERAL PUBLIC LICENSE a href http://www.gnu.org/copyleft/gpl
7-
ignorable_holders:
8-
- GNU GENERAL PUBLIC LICENSE
95
ignorable_urls:
106
- http://www.gnu.org/copyleft/gpl
117
---
128

13-
copyright: GNU GENERAL PUBLIC LICENSE | | <a href="http://www.gnu.org/copyleft/gpl.
9+
copyright: GNU GENERAL PUBLIC LICENSE | | <a href="http://www.gnu.org/copyleft/gpl.

src/licensedcode/data/rules/gpl-1.0-plus_119.RULE

Lines changed: 1 addition & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -2,12 +2,8 @@
22
license_expression: gpl-1.0-plus
33
is_license_tag: yes
44
relevance: 100
5-
ignorable_copyrights:
6-
- copyright GNU GENERAL PUBLIC LICENCE a href http://www.gnu.org/copyleft/gpl
7-
ignorable_holders:
8-
- GNU GENERAL PUBLIC LICENCE
95
ignorable_urls:
106
- http://www.gnu.org/copyleft/gpl
117
---
128

13-
copyright: GNU GENERAL PUBLIC LICENCE | | <a href="http://www.gnu.org/copyleft/gpl.
9+
copyright: GNU GENERAL PUBLIC LICENCE | | <a href="http://www.gnu.org/copyleft/gpl.

src/licensedcode/data/rules/gpl-1.0-plus_487.RULE

Lines changed: 1 addition & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -2,12 +2,8 @@
22
license_expression: gpl-1.0-plus
33
is_license_tag: yes
44
relevance: 100
5-
ignorable_copyrights:
6-
- copyright GNU GENERAL PUBLIC LICENSE a href https://www.gnu.org/copyleft/gpl
7-
ignorable_holders:
8-
- GNU GENERAL PUBLIC LICENSE
95
ignorable_urls:
106
- https://www.gnu.org/copyleft/gpl
117
---
128

13-
copyright: GNU GENERAL PUBLIC LICENSE | | <a href="https://www.gnu.org/copyleft/gpl.
9+
copyright: GNU GENERAL PUBLIC LICENSE | | <a href="https://www.gnu.org/copyleft/gpl.

src/licensedcode/data/rules/gpl-1.0-plus_524.RULE

Lines changed: 1 addition & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -2,12 +2,8 @@
22
license_expression: gpl-1.0-plus
33
is_license_tag: yes
44
relevance: 100
5-
ignorable_copyrights:
6-
- copyright GNU GENERAL PUBLIC LICENCE a href https://www.gnu.org/copyleft/gpl
7-
ignorable_holders:
8-
- GNU GENERAL PUBLIC LICENCE
95
ignorable_urls:
106
- https://www.gnu.org/copyleft/gpl
117
---
128

13-
copyright: GNU GENERAL PUBLIC LICENCE | | <a href="https://www.gnu.org/copyleft/gpl.
9+
copyright: GNU GENERAL PUBLIC LICENCE | | <a href="https://www.gnu.org/copyleft/gpl.

src/licensedcode/data/rules/jdom_3.RULE

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@ ignorable_copyrights:
77
ignorable_holders:
88
- Jason Hunter & Brett McLaughlin
99
ignorable_authors:
10-
- Jason Hunter and Brett McLaughlin
10+
- Jason Hunter jhunter AT jdom DOT org and Brett McLaughlin brett AT jdom DOT org
1111
- the JDOM Project (http://www.jdom.org/)
1212
ignorable_urls:
1313
- http://www.jdom.org/
@@ -64,4 +64,4 @@ JdomLicense 1.1.txt
6464
individuals on behalf of the JDOM Project and was originally
6565
created by Jason Hunter <jhunter AT jdom DOT org> and
6666
Brett McLaughlin <brett AT jdom DOT org>. For more information on
67-
the JDOM Project, please see <http://www.jdom.org/>.
67+
the JDOM Project, please see <http://www.jdom.org/>.

src/licensedcode/data/rules/mpeg-iso_2.RULE

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,10 @@ relevance: 95
55
minimum_coverage: 95
66
notes: Seen in https://sourceforge.net/p/opencore-amr/code/ci/master/tree/opencore/NOTICE The
77
site for this code at https://portal.3gpp.org/desktopmodules/Specifications/SpecificationDetails.aspx?specificationId=1421
8+
ignorable_copyrights:
9+
- (c), 3GPP Organizational Partners
10+
ignorable_holders:
11+
- 3GPP Organizational Partners
812
ignorable_urls:
913
- http://www.3gpp.org/
1014
---
@@ -17,4 +21,4 @@ Portions of this file are derived from the following 3GPP standard:
1721

1822
(C) , 3GPP Organizational Partners (ARIB, ATIS, CCSA, ETSI, TTA, TTC)
1923
Permission to distribute, modify and use this file under the standard license
20-
terms listed above has been obtained from the copyright holder.
24+
terms listed above has been obtained from the copyright holder.

0 commit comments

Comments
 (0)