Skip to content

Commit 498467c

Browse files
authored
Merge pull request #3917 from aboutcode-org/misc-copyrights2
Improve copyrights detection more
2 parents 6e756c4 + 645ac27 commit 498467c

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

46 files changed

+533
-38
lines changed

azure-pipelines.yml

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -75,6 +75,33 @@ jobs:
7575
tests/licensedcode/test_detection_validate.py \
7676
-k TestValidateLicenseExtended5
7777
78+
79+
license_validate_ignorables_1: |
80+
venv/bin/pytest -n 3 -vvs --test-suite=validate \
81+
tests/licensedcode/test_detection_validate.py \
82+
-k TestValidateLicenseIgnorableClues1
83+
84+
license_validate_ignorables_2: |
85+
venv/bin/pytest -n 3 -vvs --test-suite=validate \
86+
tests/licensedcode/test_detection_validate.py \
87+
-k TestValidateLicenseIgnorableClues2
88+
89+
license_validate_ignorables_3: |
90+
venv/bin/pytest -n 3 -vvs --test-suite=validate \
91+
tests/licensedcode/test_detection_validate.py \
92+
-k TestValidateLicenseIgnorableClues3
93+
94+
license_validate_ignorables_4: |
95+
venv/bin/pytest -n 3 -vvs --test-suite=validate \
96+
tests/licensedcode/test_detection_validate.py \
97+
-k TestValidateLicenseIgnorableClues4
98+
99+
license_validate_ignorables_5: |
100+
venv/bin/pytest -n 3 -vvs --test-suite=validate \
101+
tests/licensedcode/test_detection_validate.py \
102+
-k TestValidateLicenseIgnorableClues5
103+
104+
78105
license_cache: |
79106
venv/bin/pytest -n 3 -vvs --test-suite=all \
80107
tests/licensedcode/test_zzzz_cache.py --reruns 2

src/cluecode/copyrights.py

Lines changed: 72 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -806,6 +806,12 @@ def build_detection_from_node(
806806
# verbatim star
807807
(r'^\*$', 'JUNK'),
808808

809+
# misc company names exception to next rule
810+
(r'^TinCanTools$', 'NNP'),
811+
(r'^SoftwareBitMaker$', 'NNP'),
812+
(r'^NetCommWireless$', 'NNP'),
813+
814+
# Repeated CamelCasedWords
809815
(r'^([A-Z][a-z]+){3,}$', 'JUNK'),
810816

811817
############################################################################
@@ -1079,7 +1085,7 @@ def build_detection_from_node(
10791085
(r'^whom$', 'JUNK'),
10801086
(r'^However,?$', 'JUNK'),
10811087
(r'^[Cc]ollectively$', 'JUNK'),
1082-
(r'^following$', 'JUNK'),
1088+
(r'^following$', 'FOLLOWING'),
10831089
(r'^[Cc]onfig$', 'JUNK'),
10841090
(r'^file\.$', 'JUNK'),
10851091

@@ -1184,7 +1190,7 @@ def build_detection_from_node(
11841190
(r'^[a-z]{3,10}[A-Z][a-z]{3,10}$', 'JUNK'),
11851191

11861192
(r'^\$?Guid$', 'JUNK'),
1187-
(r'^Small$', 'NN'),
1193+
#(r'^Small$', 'NN'),
11881194
(r'^implementing$', 'JUNK'),
11891195
(r'^Unlike$', 'JUNK'),
11901196
(r'^using$', 'JUNK'),
@@ -1206,6 +1212,11 @@ def build_detection_from_node(
12061212
# single period
12071213
(r"^\.$", 'JUNK'),
12081214

1215+
# exception to the next rule
1216+
1217+
# by PaX Team
1218+
(r"PaX$", 'NN'),
1219+
12091220
# short mixed caps with trailing cap: ZoY
12101221
(r"[A-Z][a-z][A-Z]$", 'JUNK'),
12111222

@@ -1405,6 +1416,7 @@ def build_detection_from_node(
14051416
(r'^STA$', 'NN'),
14061417
(r'^Page$', 'NN'),
14071418
(r'^Todo/Under$', 'JUNK'),
1419+
(r'^Under$', 'NN'),
14081420

14091421
(r'^Interrupt$', 'NN'),
14101422
(r'^cleanups?$', 'JUNK'),
@@ -1668,6 +1680,8 @@ def build_detection_from_node(
16681680
(r'^([Mm]onday|[Tt]uesday|[Ww]ednesday|[Tt]hursday|[Ff]riday|[Ss]aturday|[Ss]unday),?$', 'DAY'),
16691681
(r'^(Mon|Tue|Wed|Thu|Fri|Sat|Sun|May),?$', 'NN'),
16701682

1683+
(r'^[Dd]ebugging$', 'JUNK'),
1684+
16711685
# misc words that are not NNs
16721686
# lowercase verbs ending in "ing"
16731687
(r'^[a-z]+ing$', 'NN'),
@@ -1700,6 +1714,9 @@ def build_detection_from_node(
17001714
(r'^Moved$', 'NN'),
17011715
(r'^Phone$', 'NN'),
17021716

1717+
(r'^Inputs?$', 'NN'),
1718+
1719+
17031720
# dual caps that are not NNP
17041721
(r'^Make[A-Z]', 'JUNK'),
17051722
(r'^Create[A-Z]', 'JUNK'),
@@ -2069,6 +2086,7 @@ def build_detection_from_node(
20692086
# and Spanish/French Da Silva and De Gaulle
20702087
(r'^(([Vv][ao]n)|[Dd][aeu])$', 'VAN'),
20712088

2089+
(r'^aan$', 'OF'),
20722090
(r'^van$', 'VAN'),
20732091
(r'^Van$', 'VAN'),
20742092
(r'^von$', 'VAN'),
@@ -2134,7 +2152,10 @@ def build_detection_from_node(
21342152
(r'^\$?date-of-software$', 'YR'),
21352153
(r'^\$?date-of-document$', 'YR'),
21362154

2137-
# cardinal numbers
2155+
# small cardinal numbers, one or two digits under 40, with an optional trailing dot or comma
2156+
(r'^[0-3]?[0-9]?[\.,]?$', 'CDS'),
2157+
2158+
# all other cardinal numbers
21382159
(r'^-?[0-9]+(.[0-9]+)?[\.,]?$', 'CD'),
21392160

21402161
############################################################################
@@ -2179,6 +2200,7 @@ def build_detection_from_node(
21792200

21802201
# exceptions to CAPS used in obfuscated emails like in joe AT foo DOT com
21812202
(r'^AT$', 'AT'),
2203+
(r'^AT$', '<at>'),
21822204
(r'^DOT$', 'DOT'),
21832205

21842206
# all CAPS word, at least 1 char long such as MIT, including an optional trailing comma or dot
@@ -2288,6 +2310,9 @@ def build_detection_from_node(
22882310
# some punctuation combos
22892311
(r'^(?:=>|->|<-|<=)$', 'JUNK'),
22902312

2313+
(r'^semiconductors?[\.,]?$', 'NNP'),
2314+
2315+
22912316
############################################################################
22922317
# catch all other as Nouns
22932318
############################################################################
@@ -2308,17 +2333,21 @@ def build_detection_from_node(
23082333
23092334
YR-RANGE: {<YR>+ <CC>+ <YR>} #20
23102335
YR-RANGE: {<YR> <DASH|TO>* <YR|BARE-YR>+} #30
2311-
YR-RANGE: {<CD|BARE-YR>? <YR> <BARE-YR>?} #40
2336+
YR-RANGE: {<CD|CDS|BARE-YR>? <YR> <BARE-YR>?} #40
23122337
YR-RANGE: {<YR>+ <BARE-YR>? } #50
23132338
YR-AND: {<CC>? <YR>+ <CC>+ <YR>} #60
23142339
YR-RANGE: {<YR-AND>+} #70
23152340
YR-RANGE: {<YR-RANGE>+ <DASH|TO> <YR-RANGE>+} #71
23162341
YR-RANGE: {<YR-RANGE>+ <DASH>?} #72
23172342
# Copyright (c) 1999, 2000, 01, 03, 06 Ralf Baechle
2318-
YR-RANGE: {<YR-RANGE> <CD>+} #72.2
2343+
YR-RANGE: {<YR-RANGE> <CD|CDS>+} #72.2
23192344
23202345
CD: {<BARE-YR>} #bareyear
23212346
2347+
# 5 Jan 2003
2348+
YR-RANGE: {<CDS> <NNP> <YR-RANGE>} #72.3
2349+
2350+
23222351
#######################################
23232352
# All/No/Some Rights Reserved
23242353
#######################################
@@ -2343,6 +2372,9 @@ def build_detection_from_node(
23432372
# foo@bar.com or baz@bar.com
23442373
EMAIL: {<EMAIL> <NN> <EMAIL>} # email or email
23452374
2375+
# <srinivasa.deevi at conexant dot com>
2376+
EMAIL: {<EMAIL_START> <CC> <NN> <DOT> <NN> } #email with brackets
2377+
23462378
#######################################
23472379
# NAMES and COMPANIES
23482380
#######################################
@@ -2408,8 +2440,9 @@ def build_detection_from_node(
24082440
# AT&T Laboratories, Cambridge
24092441
COMPANY: {<COMP> <COMP> <NNP>} #145
24102442
2443+
COMPANY: {<COMP> <CD|CDS> <COMP>} #170
2444+
24112445
# rare "Software in the public interest, Inc."
2412-
COMPANY: {<COMP> <CD> <COMP>} #170
24132446
COMPANY: {<NNP> <IN><NN> <NNP> <NNP>+<COMP>?} #180
24142447
24152448
# Commonwealth Scientific and Industrial Research Organisation (CSIRO)
@@ -2558,18 +2591,21 @@ def build_detection_from_node(
25582591
NAME: {<NAME|NAME-EMAIL>+ <OF> <NNP> <OF> <NN>? <COMPANY>} #550
25592592
NAME: {<NAME|NAME-EMAIL>+ <CC|OF>? <NAME|NAME-EMAIL|COMPANY>} #560
25602593
2561-
NAME: {<NNP><NNP>} #5611
2594+
NAME: {<NNP><NNP>} #561
25622595
25632596
# strip Software from Copyright (c) Ian Darwin 1995. Software
2564-
NAME-YEAR: {<NAME>+ <YR-RANGE>} #5611
2597+
NAME-YEAR: {<NAME>+ <YR-RANGE>} #561.1
25652598
25662599
# Copyright 2018, OpenCensus Authors
2567-
COPYRIGHT: {<COPY>+ <YR-RANGE> <NNP> <AUTHS>} #1579991
2600+
COPYRIGHT: {<COPY>+ <YR-RANGE> <NNP> <AUTHS>} #561.2
2601+
2602+
# Tom aan de Wiel
2603+
NAME: {<NNP> <OF> <VAN> <NNP> } # 561.3
25682604
2569-
NAME-YEAR: {<YR-RANGE> <NNP>+ <CAPS>? <LINUX>?} #5612
2605+
NAME-YEAR: {<YR-RANGE> <NNP>+ <CAPS>? <LINUX>?} #562
25702606
25712607
#Academy of Motion Picture Arts and Sciences
2572-
NAME: {<NAME> <CC> <NNP>} #561
2608+
NAME: {<NAME> <CC> <NNP>} #563
25732609
25742610
# Adam Weinberger and the GNOME Foundation
25752611
ANDCO: {<CC> <NN> <COMPANY>} #565
@@ -2581,6 +2617,8 @@ def build_detection_from_node(
25812617
25822618
URL: {<PARENS> <URL> <PARENS>} #5700
25832619
2620+
NAME-YEAR: {<NAME-YEAR> <CDS> <NNP>} #5700.1
2621+
25842622
#also accept trailing email and URLs
25852623
# and "VAN" e.g. Du: Copyright (c) 2008 Alek Du <alek.du@intel.com>
25862624
NAME-YEAR: {<NAME-YEAR> <VAN>? <EMAIL>?<URL>?} #5701
@@ -2591,7 +2629,7 @@ def build_detection_from_node(
25912629
NAME: {<NN|NNP|CAPS>+ <CC> <OTH>} #600
25922630
NAME: {<NNP> <CAPS>} #610
25932631
NAME: {<CAPS> <DASH>? <NNP|NAME>} #620
2594-
NAME: {<NNP> <CD> <NNP>} #630
2632+
NAME: {<NNP> <CD|CDS> <NNP>} #630
25952633
NAME: {<COMP> <NAME>+} #640
25962634
25972635
# Copyright 2018-2019 @paritytech/substrate-light-ui authors & contributors
@@ -2983,7 +3021,11 @@ def build_detection_from_node(
29833021
29843022
# Russ Dill <Russ.Dill@asu.edu> 2001-2003
29853023
# Rewrited by Vladimir Oleynik <dzo@simtreas.ru> (C) 2003
2986-
COPYRIGHT: {<NAME-EMAIL> <YR-RANGE> <AUTH2> <BY> <NAME-EMAIL> <COPY> <YR-RANGE>} #22793.5
3024+
COPYRIGHT: {<NAME-EMAIL> <YR-RANGE> <AUTH2> <BY> <NAME-EMAIL> <COPY> <YR-RANGE>} #2280-2
3025+
3026+
# Copyright (C) 2018
3027+
# Author: Jeff LaBundy <jeff@labundy.com>
3028+
COPYRIGHT: {<COPY> <COPY> <YR-RANGE> <AUTH> <NAME-EMAIL>} #2280-3
29873029
29883030
COPYRIGHT2: {<COPY>+ <NN|CAPS>? <YR-RANGE>+ <PN>*} #2280
29893031
@@ -3106,7 +3148,7 @@ def build_detection_from_node(
31063148
COPYRIGHT: {<COPYRIGHT2> <CAPS|COMPANY> <NN|LINUX> <COMPANY>} #2008
31073149
31083150
# Copyright (c) 2016-2018 JSR 371 expert group and contributors
3109-
COPYRIGHT: {<COPYRIGHT2> <CAPS> <CD> <COMPANY> <NAME>} #2009.1
3151+
COPYRIGHT: {<COPYRIGHT2> <CAPS> <CD|CDS> <COMPANY> <NAME>} #2009.1
31103152
31113153
# COPYRIGHT (c) 2006 - 2009 DIONYSOS
31123154
COPYRIGHT: {<COPYRIGHT2> <CAPS>} #2009
@@ -3235,7 +3277,7 @@ def build_detection_from_node(
32353277
COPYRIGHT: {<COPY> <NNP> <NAME-YEAR> <COMPANY>?} #15720
32363278
32373279
# Copyright (c) 2008-1010 Intel Corporation
3238-
COPYRIGHT: {<COPY> <COPY> <CD> <COMPANY>} #rare-cd-not-year
3280+
COPYRIGHT: {<COPY> <COPY> <CD|CDS> <COMPANY>} #rare-cd-not-year
32393281
32403282
# Copyright (C) 2005-2006 dann frazier <dannf@dannf.org>
32413283
COPYRIGHT: {<COPYRIGHT2> <NN> <NN> <EMAIL>} #999991
@@ -3258,6 +3300,9 @@ def build_detection_from_node(
32583300
# copyrighted by the Open Source Vulnerability Database (http://osvdb.org)
32593301
COPYRIGHT: {<COPY> <BY> <NN|NNP>{3} <NAME>} #83002.1
32603302
3303+
# (C) by the respective authors,
3304+
<COPYRIGHT>: { <COPY> <BY> <NN> <NN> <AUTHDOT>} #83002.2
3305+
32613306
# weird //opylefted by <-Harvie 2oo7
32623307
COPYRIGHT: {<COPY> <BY> <NN> <NN> <MAINT>?} #83003
32633308
@@ -3301,6 +3346,14 @@ def build_detection_from_node(
33013346
# Gracenote Software, copyright © 2000-2008 Gracenote.
33023347
COPYRIGHT: {<COMPANY> <COPY>{1,2} <NAME-YEAR>} #157999.12
33033348
3349+
#Copyright (C) 2012-2016 by the following authors:
3350+
#- Wladimir J. van der Laan <laanwj@gmail.com>
3351+
3352+
NAME-EMAIL: {<NNP> <NAME-EMAIL> } #157999.13
3353+
NAME-EMAIL: {<DASH> <NAME-EMAIL> <NN>?} #157999.14
3354+
COPYRIGHT: {<COPYRIGHT2> <FOLLOWING> <AUTHS> <NAME-EMAIL>+ } #157999.14
3355+
3356+
33043357
#######################################
33053358
# Copyright is held by ....
33063359
#######################################
@@ -3412,11 +3465,11 @@ def build_detection_from_node(
34123465
34133466
COPYRIGHT: {<COMPANY><COPY>+<ALLRIGHTRESERVED>} #99900
34143467
3415-
COPYRIGHT: {<COPYRIGHT|COPYRIGHT2|COPY|NAME-COPY> <COPY|NNP|AUTHDOT|CAPS|CD|YR-RANGE|NAME|NAME-EMAIL|NAME-YEAR|NAME-COPY|NAME-CAPS|AUTHORANDCO|COMPANY|YEAR|PN|COMP|UNI|CC|OF|IN|BY|OTH|VAN|URL|EMAIL|URL2|MIXEDCAP|NN>+ <ALLRIGHTRESERVED>} #99999
3468+
COPYRIGHT: {<COPYRIGHT|COPYRIGHT2|COPY|NAME-COPY> <COPY|NNP|AUTHDOT|CAPS|CD|CDS|YR-RANGE|NAME|NAME-EMAIL|NAME-YEAR|NAME-COPY|NAME-CAPS|AUTHORANDCO|COMPANY|YEAR|PN|COMP|UNI|CC|OF|IN|BY|OTH|VAN|URL|EMAIL|URL2|MIXEDCAP|NN>+ <ALLRIGHTRESERVED>} #99999
34163469
34173470
# * Copyright (C) 2004 Red Hat, Inc.
34183471
# * Copyright (C) 200 Matthias Clasen <mclasen@redhat.com>
3419-
COPYRIGHT: {<COPY> <COPY> <CD> <NAME-EMAIL>} #9999970
3472+
COPYRIGHT: {<COPY> <COPY> <CD|CDS> <NAME-EMAIL>} #9999970
34203473
34213474
# <p class="copyright"><a href="http://www.w3.org/Consortium/Legal/ipr-notice-20000612#Copyright">Copyright</a>
34223475
COPYRIGHT: {<COPYRIGHT> <COPY>} #9999980
@@ -3803,6 +3856,8 @@ def is_junk_copyright(s, patterns=COPYRIGHTS_JUNK_PATTERN_MATCHERS):
38033856
'$',
38043857
'current.year',
38053858
"©",
3859+
'author',
3860+
'authors',
38063861
])
38073862
))
38083863

src/licensedcode/data/licenses/array-input-method-pl.LICENSE

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -11,8 +11,6 @@ ignorable_copyrights:
1111
- copyright holder of Array Input Method
1212
ignorable_holders:
1313
- Array Input Method
14-
ignorable_authors:
15-
- Array Input
1614
---
1715

1816
Array Input Method Public License
@@ -80,4 +78,4 @@ or other liability obligations and/or rights consistent with this License. Howev
8078
obligations, licensee may act only on his own behalf and on his sole responsibility, not on behalf of
8179
anyone else, and only if the licensee agrees toindemnify, defend, and hold everyone else harmless
8280
for any liability incurred by, or claims asserted against, such everyone else by reason of licensee's
83-
accepting any such warranty or additional liability.
81+
accepting any such warranty or additional liability.

src/licensedcode/data/licenses/wxwidgets.LICENSE

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -10,10 +10,10 @@ text_urls:
1010
- http://www.wxwidgets.org/about/licence.htm
1111
ignorable_copyrights:
1212
- Copyright (c) 1997 Julian Smart, Markus Holzem
13-
- copyrighted by the wxWidgets
13+
- copyrighted by the wxWidgets authors
1414
ignorable_holders:
1515
- Julian Smart, Markus Holzem
16-
- the wxWidgets
16+
- the wxWidgets authors
1717
ignorable_emails:
1818
- julian@wxwidgets.org
1919
---
@@ -238,4 +238,4 @@ library for tweaking knobs) written by James Random Hacker.
238238

239239
<signature of Ty Coon>, 1 April 1990
240240

241-
Ty Coon, President of Vice
241+
Ty Coon, President of Vice

src/licensedcode/data/rules/apache-1.1_114.RULE

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,6 @@ ignorable_holders:
1010
- Leo Galambos
1111
ignorable_authors:
1212
- the Egothor Project
13-
- the Egothor Project. Under
1413
ignorable_urls:
1514
- http://egothor.sf.net/
1615
ignorable_emails:
@@ -71,4 +70,4 @@ NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
7170
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
7271
OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
7372
OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
74-
OF THE POSSIBILITY OF SUCH DAMAGE.
73+
OF THE POSSIBILITY OF SUCH DAMAGE.

tests/cluecode/data/copyrights/misco2/mmiv.txt.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@ what:
44
- holders_summary
55
- authors
66
copyrights:
7-
- Copyright (c) MMIV-MMV Anselm R. Garbe
7+
- Copyright (c) MMIV-MMV Anselm R. Garbe garbeam at gmail dot com
88
holders:
99
- MMIV-MMV Anselm R. Garbe
1010
holders_summary:
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
Copyright 2016 Tom aan de Wiel
Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
1+
what:
2+
- copyrights
3+
- holders
4+
- authors
5+
copyrights:
6+
- Copyright 2016 Tom aan de Wiel
7+
holders:
8+
- Tom aan de Wiel
Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
* Copyright (C) 2016-2018
2+
* Author: Matt Ranostay <matt.ranostay@konsulko.com>
3+
Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
1+
what:
2+
- copyrights
3+
- holders
4+
- authors
5+
copyrights:
6+
- Copyright (c) 2016-2018 Author Matt Ranostay <matt.ranostay@konsulko.com>
7+
holders:
8+
- Matt Ranostay

0 commit comments

Comments
 (0)