@@ -687,8 +687,7 @@ def build_detection_from_node(
687
687
(r'^all$' , 'NN' ),
688
688
(r'^ALL$' , 'NN' ),
689
689
(r'^NO$' , 'NN' ),
690
- (r'^No$' , 'NN' ),
691
- (r'^no$' , 'NN' ),
690
+
692
691
(r'^Some$' , 'NN' ),
693
692
(r'^[Rr]ights?$' , 'RIGHT' ),
694
693
(r'^RIGHTS?$' , 'RIGHT' ),
@@ -709,13 +708,34 @@ def build_detection_from_node(
709
708
(r'^[Rr]éservés[\.,]*$' , 'RESERVED' ),
710
709
(r'^[Rr]eserves[\.,]*$' , 'RESERVED' ),
711
710
711
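+ # in Spanish Reservados todos los derechos
+ # NOTE(review): the "[Dr]erechos" pattern in this group looks like a typo for "[Dd]erechos" (it accepts "rerechos" but not "derechos") -- TODO confirm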
712
+ (r'^[Rr]eservados[\.,]*$' , 'RESERVED' ),
713
+ (r'^[Tt]odos$' , 'NN' ),
714
+ (r'^[Ll]os$' , 'NN' ),
715
+ (r'^[Dr]erechos$' , 'RIGHT' ),
716
+
717
+ # in Dutch Alle rechten voorbehouden.
718
+ (r'^[Aa]lle$' , 'NN' ),
719
+ (r'^[Rr]echten$' , 'RIGHT' ),
720
+ (r'^[Vv]oorbehouden[\.,]*$' , 'RESERVED' ),
721
+
722
+ # in German Alle Rechte vorbehalten
723
+ (r'^[Aa]lle$' , 'NN' ),
724
+ (r'^[Rr]echte$' , 'RIGHT' ),
725
+ (r'^[Vv]orbehalten[\.,]*$' , 'RESERVED' ),
726
+
712
727
# used to detect "copyright is held by..."
713
728
(r'^is$' , 'IS' ),
714
729
(r'^are$' , 'IS' ),
715
730
(r'^held$' , 'HELD' ),
716
731
717
- # TODO: in Dutch Alle rechten voorbehouden.
718
- # TODO: in Spanish Reservados todos los derechos
732
+ # The word NOTICE appears in some copyright statements, but not all
733
+ (r'^NOTICE$' , 'NOTICE' ),
734
+ (r'^NOTICES?[\.,]$' , 'JUNK' ),
735
+
736
+ (r'^[Nn]otice$' , 'NOTICE' ),
737
+ (r'^[Nn]otices?[\.,]$' , 'JUNK' ),
738
+ (r'^[Nn]otices?$' , 'JUNK' ),
719
739
720
740
############################################################################
721
741
# JUNK are things to ignore
@@ -734,17 +754,23 @@ def build_detection_from_node(
734
754
# short two-character tokens (one capital letter and one digit) such as B3
735
755
(r"^[A-Z][0-9]$" , 'NN' ),
736
756
737
- # Short words skipping some leading caps
757
+ # 2-letter short words, skipping some leading caps
738
758
(r'^[BEFHJMNPQRTUVW][a-z]$' , 'NN' ),
739
759
740
- # misc exceptions
760
+ # Misc exceptions
741
761
(r'^dead_horse$' , 'NN' ),
742
762
(r'^A11yance' , 'NNP' ),
743
763
(r'^Fu$' , 'NNP' ),
744
764
(r'^W3C\(r\)$' , 'COMP' ),
745
765
746
- # three or more AsCamelCase GetQueueReference, with some exceptions
747
- (r'^OpenStreetMap.?$' , 'NAME' ),
766
+ # Three or more CamelCase words joined together (such as GetQueueReference) are JUNK, with some exceptions
767
+ (r'^(?:OpenStreetMap|AliasDotCom|AllThingsTalk).?$' , 'NAME' ),
768
+
769
+ (r'^Re-Creating$' , 'JUNK' ),
770
+ (r'^[Nn]o$' , 'JUNK' ),
771
+ (r'^Earth$' , 'NN' ),
772
+ (r'^Maps/Google$' , 'NN' ),
773
+
748
774
(r'^([A-Z][a-z]+){3,}$' , 'JUNK' ),
749
775
750
776
############################################################################
@@ -788,7 +814,7 @@ def build_detection_from_node(
788
814
(r'^[Oo]riginally?$' , 'JUNK' ),
789
815
(r'^[Rr]epresentations?\.?$' , 'JUNK' ),
790
816
(r'^works,$' , 'JUNK' ),
791
-
817
+ ( r'^grant$' , 'JUNK' ),
792
818
(r'^Refer$' , 'JUNK' ),
793
819
(r'^Apt$' , 'JUNK' ),
794
820
(r'^Agreement$' , 'JUNK' ),
@@ -829,7 +855,7 @@ def build_detection_from_node(
829
855
(r'^[Cc]opyrighting$' , 'JUNK' ),
830
856
(r'^[Aa]uthori.*$' , 'JUNK' ),
831
857
(r'^such$' , 'JUNK' ),
832
- (r'^[Aa]ssignments?[.,]?$' , 'JUNK' ),
858
+ (r'^[Aa]ssignments?[\ .,]?$' , 'JUNK' ),
833
859
(r'^[Bb]uild$' , 'JUNK' ),
834
860
(r'^[Ss]tring$' , 'JUNK' ),
835
861
(r'^Implementation-Vendor$' , 'JUNK' ),
@@ -912,6 +938,7 @@ def build_detection_from_node(
912
938
(r'^Should$' , 'JUNK' ),
913
939
(r'^[Ll]icensing\@?$' , 'JUNK' ),
914
940
(r'^Disclaimer$' , 'JUNK' ),
941
+ (r'^Directive.?$' , 'JUNK' ),
915
942
(r'^LAWS\,?$' , 'JUNK' ),
916
943
(r'^[Ll]aws?,?$' , 'JUNK' ),
917
944
(r'^me$' , 'JUNK' ),
@@ -1087,7 +1114,7 @@ def build_detection_from_node(
1087
1114
(r'^GA$' , 'JUNK' ),
1088
1115
(r'^unzip$' , 'JUNK' ),
1089
1116
(r'^EULA' , 'JUNK' ),
1090
- (r'^Terms?[.,]?$' , 'JUNK' ),
1117
+ (r'^Terms?[\ .,]?$' , 'JUNK' ),
1091
1118
(r'^Non-Assertion$' , 'JUNK' ),
1092
1119
1093
1120
# this is not Copr.
@@ -1277,9 +1304,9 @@ def build_detection_from_node(
1277
1304
(r'^Education$' , 'NN' ),
1278
1305
(r'^Extended' , 'NN' ),
1279
1306
(r'^Every$' , 'NN' ),
1307
+ (r'^Exhibit$' , 'NN' ),
1280
1308
(r'^Digitized' , 'NN' ),
1281
1309
(r'^[Ds]istributed?.?$' , 'NN' ),
1282
-
1283
1310
(r'^Multiply$' , 'NN' ),
1284
1311
(r'^Convert$' , 'NN' ),
1285
1312
(r'^Compute$' , 'NN' ),
@@ -1295,7 +1322,7 @@ def build_detection_from_node(
1295
1322
(r'^Lexers?.?' , 'NN' ),
1296
1323
(r'^Symbols?.?' , 'NN' ),
1297
1324
(r'^Tokens?.?' , 'NN' ),
1298
-
1325
+ ( r'^Initial' , 'NN' ),
1299
1326
(r'^END$' , 'NN' ),
1300
1327
(r'^Entity$' , 'NN' ),
1301
1328
(r'^Example' , 'NN' ),
@@ -1393,9 +1420,8 @@ def build_detection_from_node(
1393
1420
(r'^Neither$' , 'NN' ),
1394
1421
(r'^Norwegian$' , 'NN' ),
1395
1422
(r'^Notes?$' , 'NN' ),
1396
- (r'^NOTICE[\.\,]?$' , 'NN' ),
1397
- (r'^[Nn]otices?[\.,]?$' , 'NN' ),
1398
1423
(r'^NOT$' , 'NN' ),
1424
+ (r'^Nessus$' , 'NN' ),
1399
1425
(r'^NULL$' , 'NN' ),
1400
1426
(r'^Objects?$' , 'NN' ),
1401
1427
(r'^Open$' , 'NN' ),
@@ -1488,7 +1514,7 @@ def build_detection_from_node(
1488
1514
(r'^Vendor' , 'NN' ),
1489
1515
(r'^VIEW$' , 'NN' ),
1490
1516
(r'^Visit' , 'NN' ),
1491
- (r'^Website' , 'NN' ),
1517
+ # (r'^Website', 'NN'),
1492
1518
(r'^Wheel$' , 'NN' ),
1493
1519
(r'^Win32$' , 'NN' ),
1494
1520
(r'^Work' , 'NN' ),
@@ -1593,6 +1619,8 @@ def build_detection_from_node(
1593
1619
(r'^Message[A-Z]' , 'JUNK' ),
1594
1620
(r'^Short[a-z]*[A-Z]+[a-z]*' , 'JUNK' ),
1595
1621
1622
+ (r'^[Ww]ebsites?[\.,]?' , 'JUNK' ),
1623
+
1596
1624
# files
1597
1625
(r'^.*\.java$' , 'NN' ),
1598
1626
@@ -1705,7 +1733,7 @@ def build_detection_from_node(
1705
1733
(r'^[A-Z][a-z]+[\.,]+(LTD|LTd|LtD|Ltd|ltd|lTD|lTd|ltD).?,?$' , 'COMP' ),
1706
1734
1707
1735
# company suffix
1708
- (r'^[Ii]nc[.]?[,\.]?\)?$' , 'COMP' ),
1736
+ (r'^[Ii]nc[\ .]?[,\.]?\)?$' , 'COMP' ),
1709
1737
(r'^Incorporated[,\.]?\)?$' , 'COMP' ),
1710
1738
1711
1739
# ,Inc. suffix without spaces is directly a company name
@@ -1779,7 +1807,7 @@ def build_detection_from_node(
1779
1807
# (dutch and belgian) company suffix
1780
1808
(r'^[Bb]\.?[Vv]\.?|BVBA$' , 'COMP' ),
1781
1809
# university
1782
- (r'^\(?[Uu]niv(?:[.]|ersit(?:y|e|at?|ad?))[\.,\)]*$' , 'UNI' ),
1810
+ (r'^\(?[Uu]niv(?:[\ .]|ersit(?:y|e|at?|ad?))[\.,\)]*$' , 'UNI' ),
1783
1811
(r'^UNIVERSITY$' , 'UNI' ),
1784
1812
(r'^College$' , 'UNI' ),
1785
1813
# Academia/ie
@@ -2117,6 +2145,12 @@ def build_detection_from_node(
2117
2145
# moment/moment is an odd name
2118
2146
(r'moment/moment$' , 'NAME' ),
2119
2147
2148
+ # single parens are special
2149
+ (r'^[\(\)]$' , 'PARENS' ),
2150
+
2151
+ # some punctuation combos
2152
+ (r'^(?:=>|->|<-|<=)$' , 'JUNK' ),
2153
+
2120
2154
############################################################################
2121
2155
# catch all other as Nouns
2122
2156
############################################################################
@@ -2366,6 +2400,7 @@ def build_detection_from_node(
2366
2400
# Copyright 2018, OpenCensus Authors
2367
2401
COPYRIGHT: {<COPY>+ <YR-RANGE> <NNP> <AUTHS>} #1579991
2368
2402
2403
+
2369
2404
NAME-YEAR: {<YR-RANGE> <NNP>+ <CAPS>?} #5612
2370
2405
2371
2406
#Academy of Motion Picture Arts and Sciences
@@ -2455,7 +2490,8 @@ def build_detection_from_node(
2455
2490
# <s>Timothy Terriberry</s>, <s>CSIRO</s>, and other contributors
2456
2491
ANDCO: {<CC> <CAPS|COMPANY|NAME|NAME-EMAIL|NAME-YEAR>+} #960
2457
2492
2458
- COMPANY: {<COMPANY|NAME|NAME-EMAIL|NAME-YEAR> <ANDCO>+} #970
2493
+ # Copyright © 1998-2009 Bill Spitzak (spitzak@users.sourceforge.net ) and others,
2494
+ COMPANY: {<COMPANY|NAME|NAME-EMAIL|NAME-YEAR> <PARENS>? <ANDCO>+} #970
2459
2495
2460
2496
# de Nemours and Company
2461
2497
NAME: {<VAN>? <NNP> <ANDCO>+} #980
@@ -2565,6 +2601,9 @@ def build_detection_from_node(
2565
2601
# The Rand Project Developers
2566
2602
COMPANY: {<COMPANY> <MAINT>} #19603
2567
2603
2604
+ # Copyright (C) 1998-2001 VideoLAN ( Johan Bilien <jobi@via.ecp.fr> and Gildas Bazin <gbazin@netcourrier.com> )
2605
+ NAME: {<PARENS> <NAME> <PARENS>} #19653
2606
+
2568
2607
2569
2608
################################# #COPYRIGHT: {<COPY> <COPY> <MIT>} #1802
2570
2609
######
@@ -2755,7 +2794,7 @@ def build_detection_from_node(
2755
2794
2756
2795
COPYRIGHT2: {<COPY>+ <NN|CAPS>? <YR-RANGE>+ <PN>*} #2280
2757
2796
2758
- # using #2280 above: Copyright 2018 Developers of the Rand project
2797
+ # using #2280 above: Copyright 2018 Developers of the Rand project
2759
2798
COPYRIGHT: {<COPYRIGHT2> <MAINT> <OF> <COMPANY>} #2280.123
2760
2799
2761
2800
COPYRIGHT2: {<COPY>+ <NN|CAPS>? <YR-RANGE>+ <NN|CAPS>* <COMPANY>?} #2300
@@ -2803,7 +2842,7 @@ def build_detection_from_node(
2803
2842
COPYRIGHT: {<PORTIONS> <COPYRIGHT|COPYRIGHT2>} #2610
2804
2843
2805
2844
#copyright notice (3dfx Interactive, Inc. 1999),
2806
- COPYRIGHT: {<COPY> <NN > <COMPANY> <YR-RANGE>} #2620
2845
+ COPYRIGHT: {<COPY> <NOTICE > <COMPANY> <YR-RANGE>} #2620
2807
2846
2808
2847
# Copyright (C) <2013>, GENIVI Alliance, Inc.
2809
2848
COPYRIGHT: {<COPYRIGHT2> <ANDCO>} #2625
@@ -3032,6 +3071,9 @@ def build_detection_from_node(
3032
3071
# (C) Distributed Management Task Force (Distributed is an NN)
3033
3072
COPYRIGHT: {<COPY> <NN> <NAME>} #83010
3034
3073
3074
+ # Copyright (c) 2014 The Rust Project Developers
3075
+ COPYRIGHT: {<COPYRIGHT> <MAINT> } #83020
3076
+
3035
3077
3036
3078
#######################################
3037
3079
# Copyright is held by ....
@@ -3214,11 +3256,10 @@ def refine_holder(h):
3214
3256
h = h .strip ('/ ~' )
3215
3257
h = strip_solo_quotes (h )
3216
3258
h = h .replace ('( ' , ' ' ).replace (' )' , ' ' )
3217
- h = h .strip ()
3218
- h = h .strip ('+-' )
3259
+ h = h .strip ('+- ' )
3219
3260
h = strip_trailing_period (h )
3220
- h = h .strip ()
3221
- h = h . strip ( '+-' )
3261
+ h = h .strip ('+- ' )
3262
+ h = ' ' . join ( h . split () )
3222
3263
if h and h .lower () not in HOLDERS_JUNK :
3223
3264
return h
3224
3265
@@ -3325,7 +3366,7 @@ def refine_names(s, prefixes):
3325
3366
r'^copyright \(c\)$' ,
3326
3367
r'^\(c\) by$' ,
3327
3368
3328
- r"\(c\) [a-z ][a-z] \(c\)" ,
3369
+ r"\(c\) [a-zA-Z ][a-z] \(c\)" ,
3329
3370
r"^copyright holder or simply" ,
3330
3371
r"^copyright notice\." ,
3331
3372
r"^copyright of uc berkeley's berkeley software distribution" ,
0 commit comments