Skip to content

Commit 850edc1

Browse files
committed
Improve handling of parens in copyright
Also improve NOTICEs, and other misc. variants. Do not detect "The Initial Developer". Signed-off-by: Philippe Ombredanne <pombredanne@nexb.com>
1 parent 7ff3f8e commit 850edc1

File tree

4 files changed

+70
-31
lines changed

4 files changed

+70
-31
lines changed

src/cluecode/copyrights.py

Lines changed: 67 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -687,8 +687,7 @@ def build_detection_from_node(
687687
(r'^all$', 'NN'),
688688
(r'^ALL$', 'NN'),
689689
(r'^NO$', 'NN'),
690-
(r'^No$', 'NN'),
691-
(r'^no$', 'NN'),
690+
692691
(r'^Some$', 'NN'),
693692
(r'^[Rr]ights?$', 'RIGHT'),
694693
(r'^RIGHTS?$', 'RIGHT'),
@@ -709,13 +708,34 @@ def build_detection_from_node(
709708
(r'^[Rr]éservés[\.,]*$', 'RESERVED'),
710709
(r'^[Rr]eserves[\.,]*$', 'RESERVED'),
711710

711+
# in Spanish Reservados todos los derechos
712+
(r'^[Rr]eservados[\.,]*$', 'RESERVED'),
713+
(r'^[Tt]odos$', 'NN'),
714+
(r'^[Ll]os$', 'NN'),
715+
(r'^[Dr]erechos$', 'RIGHT'),
716+
717+
# in Dutch Alle rechten voorbehouden.
718+
(r'^[Aa]lle$', 'NN'),
719+
(r'^[Rr]echten$', 'RIGHT'),
720+
(r'^[Vv]oorbehouden[\.,]*$', 'RESERVED'),
721+
722+
# in German Alle Rechte vorbehalten
723+
(r'^[Aa]lle$', 'NN'),
724+
(r'^[Rr]echte$', 'RIGHT'),
725+
(r'^[Vv]orbehalten[\.,]*$', 'RESERVED'),
726+
712727
# used to detect "copyright is held by..."
713728
(r'^is$', 'IS'),
714729
(r'^are$', 'IS'),
715730
(r'^held$', 'HELD'),
716731

717-
# TODO: in Dutch Alle rechten voorbehouden.
718-
# TODO: in Spanish Reservados todos los derechos
732+
# NOTICEs are a thing in some copyright statements, but not all
733+
(r'^NOTICE$', 'NOTICE'),
734+
(r'^NOTICES?[\.,]$', 'JUNK'),
735+
736+
(r'^[Nn]otice$', 'NOTICE'),
737+
(r'^[Nn]otices?[\.,]$', 'JUNK'),
738+
(r'^[Nn]otices?$', 'JUNK'),
719739

720740
############################################################################
721741
# JUNK are things to ignore
@@ -734,17 +754,23 @@ def build_detection_from_node(
734754
# short two chars as B3
735755
(r"^[A-Z][0-9]$", 'NN'),
736756

737-
# Short words skipping some leading caps
757+
# 2-letters short words, skipping some leading caps
738758
(r'^[BEFHJMNPQRTUVW][a-z]$', 'NN'),
739759

740-
# misc exceptions
760+
# Misc exceptions
741761
(r'^dead_horse$', 'NN'),
742762
(r'^A11yance', 'NNP'),
743763
(r'^Fu$', 'NNP'),
744764
(r'^W3C\(r\)$', 'COMP'),
745765

746-
# three or more AsCamelCase GetQueueReference, with some exceptions
747-
(r'^OpenStreetMap.?$', 'NAME'),
766+
# Three or more AsCamelCase GetQueueReference, with some exceptions
767+
(r'^(?:OpenStreetMap|AliasDotCom|AllThingsTalk).?$', 'NAME'),
768+
769+
(r'^Re-Creating$', 'JUNK'),
770+
(r'^[Nn]o$', 'JUNK'),
771+
(r'^Earth$', 'NN'),
772+
(r'^Maps/Google$', 'NN'),
773+
748774
(r'^([A-Z][a-z]+){3,}$', 'JUNK'),
749775

750776
############################################################################
@@ -788,7 +814,7 @@ def build_detection_from_node(
788814
(r'^[Oo]riginally?$', 'JUNK'),
789815
(r'^[Rr]epresentations?\.?$', 'JUNK'),
790816
(r'^works,$', 'JUNK'),
791-
817+
(r'^grant$', 'JUNK'),
792818
(r'^Refer$', 'JUNK'),
793819
(r'^Apt$', 'JUNK'),
794820
(r'^Agreement$', 'JUNK'),
@@ -829,7 +855,7 @@ def build_detection_from_node(
829855
(r'^[Cc]opyrighting$', 'JUNK'),
830856
(r'^[Aa]uthori.*$', 'JUNK'),
831857
(r'^such$', 'JUNK'),
832-
(r'^[Aa]ssignments?[.,]?$', 'JUNK'),
858+
(r'^[Aa]ssignments?[\.,]?$', 'JUNK'),
833859
(r'^[Bb]uild$', 'JUNK'),
834860
(r'^[Ss]tring$', 'JUNK'),
835861
(r'^Implementation-Vendor$', 'JUNK'),
@@ -912,6 +938,7 @@ def build_detection_from_node(
912938
(r'^Should$', 'JUNK'),
913939
(r'^[Ll]icensing\@?$', 'JUNK'),
914940
(r'^Disclaimer$', 'JUNK'),
941+
(r'^Directive.?$', 'JUNK'),
915942
(r'^LAWS\,?$', 'JUNK'),
916943
(r'^[Ll]aws?,?$', 'JUNK'),
917944
(r'^me$', 'JUNK'),
@@ -1087,7 +1114,7 @@ def build_detection_from_node(
10871114
(r'^GA$', 'JUNK'),
10881115
(r'^unzip$', 'JUNK'),
10891116
(r'^EULA', 'JUNK'),
1090-
(r'^Terms?[.,]?$', 'JUNK'),
1117+
(r'^Terms?[\.,]?$', 'JUNK'),
10911118
(r'^Non-Assertion$', 'JUNK'),
10921119

10931120
# this is not Copr.
@@ -1277,9 +1304,9 @@ def build_detection_from_node(
12771304
(r'^Education$', 'NN'),
12781305
(r'^Extended', 'NN'),
12791306
(r'^Every$', 'NN'),
1307+
(r'^Exhibit$', 'NN'),
12801308
(r'^Digitized', 'NN'),
12811309
(r'^[Ds]istributed?.?$', 'NN'),
1282-
12831310
(r'^Multiply$', 'NN'),
12841311
(r'^Convert$', 'NN'),
12851312
(r'^Compute$', 'NN'),
@@ -1295,7 +1322,7 @@ def build_detection_from_node(
12951322
(r'^Lexers?.?', 'NN'),
12961323
(r'^Symbols?.?', 'NN'),
12971324
(r'^Tokens?.?', 'NN'),
1298-
1325+
(r'^Initial', 'NN'),
12991326
(r'^END$', 'NN'),
13001327
(r'^Entity$', 'NN'),
13011328
(r'^Example', 'NN'),
@@ -1393,9 +1420,8 @@ def build_detection_from_node(
13931420
(r'^Neither$', 'NN'),
13941421
(r'^Norwegian$', 'NN'),
13951422
(r'^Notes?$', 'NN'),
1396-
(r'^NOTICE[\.\,]?$', 'NN'),
1397-
(r'^[Nn]otices?[\.,]?$', 'NN'),
13981423
(r'^NOT$', 'NN'),
1424+
(r'^Nessus$', 'NN'),
13991425
(r'^NULL$', 'NN'),
14001426
(r'^Objects?$', 'NN'),
14011427
(r'^Open$', 'NN'),
@@ -1488,7 +1514,7 @@ def build_detection_from_node(
14881514
(r'^Vendor', 'NN'),
14891515
(r'^VIEW$', 'NN'),
14901516
(r'^Visit', 'NN'),
1491-
(r'^Website', 'NN'),
1517+
# (r'^Website', 'NN'),
14921518
(r'^Wheel$', 'NN'),
14931519
(r'^Win32$', 'NN'),
14941520
(r'^Work', 'NN'),
@@ -1593,6 +1619,8 @@ def build_detection_from_node(
15931619
(r'^Message[A-Z]', 'JUNK'),
15941620
(r'^Short[a-z]*[A-Z]+[a-z]*', 'JUNK'),
15951621

1622+
(r'^[Ww]ebsites?[\.,]?', 'JUNK'),
1623+
15961624
# files
15971625
(r'^.*\.java$', 'NN'),
15981626

@@ -1705,7 +1733,7 @@ def build_detection_from_node(
17051733
(r'^[A-Z][a-z]+[\.,]+(LTD|LTd|LtD|Ltd|ltd|lTD|lTd|ltD).?,?$', 'COMP'),
17061734

17071735
# company suffix
1708-
(r'^[Ii]nc[.]?[,\.]?\)?$', 'COMP'),
1736+
(r'^[Ii]nc[\.]?[,\.]?\)?$', 'COMP'),
17091737
(r'^Incorporated[,\.]?\)?$', 'COMP'),
17101738

17111739
# ,Inc. suffix without spaces is directly a company name
@@ -1779,7 +1807,7 @@ def build_detection_from_node(
17791807
# (dutch and belgian) company suffix
17801808
(r'^[Bb]\.?[Vv]\.?|BVBA$', 'COMP'),
17811809
# university
1782-
(r'^\(?[Uu]niv(?:[.]|ersit(?:y|e|at?|ad?))[\.,\)]*$', 'UNI'),
1810+
(r'^\(?[Uu]niv(?:[\.]|ersit(?:y|e|at?|ad?))[\.,\)]*$', 'UNI'),
17831811
(r'^UNIVERSITY$', 'UNI'),
17841812
(r'^College$', 'UNI'),
17851813
# Academia/ie
@@ -2117,6 +2145,12 @@ def build_detection_from_node(
21172145
# moment/moment is an odd name
21182146
(r'moment/moment$', 'NAME'),
21192147

2148+
# single parens are special
2149+
(r'^[\(\)]$', 'PARENS'),
2150+
2151+
# some punctuation combos
2152+
(r'^(?:=>|->|<-|<=)$', 'JUNK'),
2153+
21202154
############################################################################
21212155
# catch all other as Nouns
21222156
############################################################################
@@ -2366,6 +2400,7 @@ def build_detection_from_node(
23662400
# Copyright 2018, OpenCensus Authors
23672401
COPYRIGHT: {<COPY>+ <YR-RANGE> <NNP> <AUTHS>} #1579991
23682402
2403+
23692404
NAME-YEAR: {<YR-RANGE> <NNP>+ <CAPS>?} #5612
23702405
23712406
#Academy of Motion Picture Arts and Sciences
@@ -2455,7 +2490,8 @@ def build_detection_from_node(
24552490
# <s>Timothy Terriberry</s>, <s>CSIRO</s>, and other contributors
24562491
ANDCO: {<CC> <CAPS|COMPANY|NAME|NAME-EMAIL|NAME-YEAR>+} #960
24572492
2458-
COMPANY: {<COMPANY|NAME|NAME-EMAIL|NAME-YEAR> <ANDCO>+} #970
2493+
# Copyright © 1998-2009 Bill Spitzak (spitzak@users.sourceforge.net ) and others,
2494+
COMPANY: {<COMPANY|NAME|NAME-EMAIL|NAME-YEAR> <PARENS>? <ANDCO>+} #970
24592495
24602496
# de Nemours and Company
24612497
NAME: {<VAN>? <NNP> <ANDCO>+} #980
@@ -2565,6 +2601,9 @@ def build_detection_from_node(
25652601
# The Rand Project Developers
25662602
COMPANY: {<COMPANY> <MAINT>} #19603
25672603
2604+
# Copyright (C) 1998-2001 VideoLAN ( Johan Bilien <jobi@via.ecp.fr> and Gildas Bazin <gbazin@netcourrier.com> )
2605+
NAME: {<PARENS> <NAME> <PARENS>} #19653
2606+
25682607
25692608
################################# #COPYRIGHT: {<COPY> <COPY> <MIT>} #1802
25702609
######
@@ -2755,7 +2794,7 @@ def build_detection_from_node(
27552794
27562795
COPYRIGHT2: {<COPY>+ <NN|CAPS>? <YR-RANGE>+ <PN>*} #2280
27572796
2758-
# using #2280 above: Copyright 2018 Developers of the Rand project
2797+
# using #2280 above: Copyright 2018 Developers of the Rand project
27592798
COPYRIGHT: {<COPYRIGHT2> <MAINT> <OF> <COMPANY>} #2280.123
27602799
27612800
COPYRIGHT2: {<COPY>+ <NN|CAPS>? <YR-RANGE>+ <NN|CAPS>* <COMPANY>?} #2300
@@ -2803,7 +2842,7 @@ def build_detection_from_node(
28032842
COPYRIGHT: {<PORTIONS> <COPYRIGHT|COPYRIGHT2>} #2610
28042843
28052844
#copyright notice (3dfx Interactive, Inc. 1999),
2806-
COPYRIGHT: {<COPY> <NN> <COMPANY> <YR-RANGE>} #2620
2845+
COPYRIGHT: {<COPY> <NOTICE> <COMPANY> <YR-RANGE>} #2620
28072846
28082847
# Copyright (C) <2013>, GENIVI Alliance, Inc.
28092848
COPYRIGHT: {<COPYRIGHT2> <ANDCO>} #2625
@@ -3032,6 +3071,9 @@ def build_detection_from_node(
30323071
# (C) Distributed Management Task Force (Distributed is an NN)
30333072
COPYRIGHT: {<COPY> <NN> <NAME>} #83010
30343073
3074+
# Copyright (c) 2014 The Rust Project Developers
3075+
COPYRIGHT: {<COPYRIGHT> <MAINT> } #83020
3076+
30353077
30363078
#######################################
30373079
# Copyright is held by ....
@@ -3214,11 +3256,10 @@ def refine_holder(h):
32143256
h = h.strip('/ ~')
32153257
h = strip_solo_quotes(h)
32163258
h = h.replace('( ', ' ').replace(' )', ' ')
3217-
h = h.strip()
3218-
h = h.strip('+-')
3259+
h = h.strip('+- ')
32193260
h = strip_trailing_period(h)
3220-
h = h.strip()
3221-
h = h.strip('+-')
3261+
h = h.strip('+- ')
3262+
h = ' '.join(h.split())
32223263
if h and h.lower() not in HOLDERS_JUNK:
32233264
return h
32243265

@@ -3325,7 +3366,7 @@ def refine_names(s, prefixes):
33253366
r'^copyright \(c\)$',
33263367
r'^\(c\) by$',
33273368

3328-
r"\(c\) [a-z][a-z] \(c\)",
3369+
r"\(c\) [a-zA-Z][a-z] \(c\)",
33293370
r"^copyright holder or simply",
33303371
r"^copyright notice\.",
33313372
r"^copyright of uc berkeley's berkeley software distribution",

tests/cluecode/data/copyrights/libcdio10-libcdio.label.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,7 @@ copyrights:
2020
- Copyright (c) 1985, 1987, 1988, 1989, 1990, 1991, 1992, 1993, 1994, 1996, 1997, 1998, 1999,
2121
2000 Free Software Foundation, Inc.
2222
- Copyright (c) 2003 Matthias Drochner
23-
- Copyright (c) 1998-2001 VideoLAN Johan Bilien <jobi@via.ecp.fr> and Gildas Bazin <gbazin@netcourrier.com>
23+
- Copyright (c) 1998-2001 VideoLAN ( Johan Bilien <jobi@via.ecp.fr> and Gildas Bazin <gbazin@netcourrier.com> )
2424
- Copyright (c) 1992, 1993 Eric Youngdale
2525
- Copyright (c) 2003, 2004, 2005, 2006, 2007, 2008 Rocky Bernstein and Herbert Valerio Riedel
2626
holders:

tests/cluecode/data/copyrights/misco2/initial-dev.txt.yml

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3,5 +3,4 @@ what:
33
- holders
44
- holders_summary
55
- authors
6-
authors:
7-
- the Initial Developer
6+

tests/cluecode/data/copyrights/misco2/regexhq/regexhq-025.txt.yml

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -4,5 +4,4 @@ what:
44
- authors
55
copyrights:
66
- Copyright (c) 2007
7-
authors:
8-
- the Initial Developer
7+

0 commit comments

Comments
 (0)