Skip to content

Commit 461fd65

Browse files
committed
Improve copyright detection
Handle corner cases with markup. Detect new copyright forms. Signed-off-by: Philippe Ombredanne <pombredanne@nexb.com>
1 parent f3f2c78 commit 461fd65

13 files changed

+142
-118
lines changed

src/cluecode/copyrights.py

Lines changed: 36 additions & 15 deletions
Original file line number | Diff line number | Diff line change
@@ -2237,7 +2237,8 @@ def build_detection_from_node(
22372237
# BY GEORGE J. CARRETTE
22382238
NAME: {<BY> <CAPS> <PN> <CAPS>} #85
22392239
2240-
DASHCAPS: {<DASH> <CAPS>}
2240+
DASHCAPS: {<DASH> <CAPS>} #899999
2241+
22412242
# INRIA - CIRAD - INRA
22422243
COMPANY: {<COMP> <DASHCAPS>+} #1280
22432244
@@ -2266,7 +2267,7 @@ def build_detection_from_node(
22662267
COMPANY: {<NNP> <IN><NN> <NNP> <NNP>+<COMP>?} #180
22672268
22682269
# Commonwealth Scientific and Industrial Research Organisation (CSIRO)
2269-
COMPANY: {<NNP> <NNP> <CC> <NNP> <COMP> <NNP> <CAPS>}
2270+
COMPANY: {<NNP> <NNP> <CC> <NNP> <COMP> <NNP> <CAPS>} #190
22702271
22712272
COMPANY: {<NNP> <CC> <NNP> <COMP> <NNP>*} #200
22722273
@@ -2334,6 +2335,9 @@ def build_detection_from_node(
23342335
# Academy of Motion Picture Arts
23352336
NAME: {<NNP|PN>+ <NNP>+} #351
23362337
2338+
# Distributed Management Task Force
2339+
# NAME: {<NN> <NNP>{3}} #881111
2340+
23372341
# @author <a href="mailto:stephane@hillion.org">Stephane Hillion</a>
23382342
NAME: { <NN>? <NN>? <EMAIL> <NAME> } #351.1
23392343
@@ -2452,7 +2456,7 @@ def build_detection_from_node(
24522456
COMPANY: {<COMP|COMPANY|NNP> <NN> <COMPANY|COMPANY> <NNP>+} #800
24532457
24542458
# by the Institute of Electrical and Electronics Engineers, Inc.
2455-
COMPANY: {<BY> <NN> <COMPANY> <OF> <NNP> <CC> <COMPANY>}
2459+
COMPANY: {<BY> <NN> <COMPANY> <OF> <NNP> <CC> <COMPANY>} #805
24562460
COMPANY: {<COMPANY> <CC> <AUTH|CONTRIBUTORS|AUTHS>} #810
24572461
24582462
# A community of developers
@@ -2461,9 +2465,12 @@ def build_detection_from_node(
24612465
# Copyright (c) 2002-2010 The ANGLE Project Authors
24622466
COMPANY: {<NN> <COMP|COMPANY>+ <AUTHS>?} #820
24632467
2468+
ANDCO: {<CC> <NNP>? <NN> <URL>} #825
2469+
24642470
# this is catching a wide net by treating any bare URL as a company
24652471
COMPANY: {<NNP>? <URL|URL2>} #830
24662472
2473+
24672474
COMPANY: {<COMPANY> <COMP|COMPANY>} #840
24682475
24692476
# the Software and Component Technologies group of Trimble Navigation, Ltd.
@@ -2543,10 +2550,10 @@ def build_detection_from_node(
25432550
COMPANY: {<BY> <NN>+ <COMP|COMPANY>} #1420
25442551
25452552
# the Regents of the University of California, Sun Microsystems, Inc., Scriptics Corporation
2546-
COMPANY: {<NN> <NNP> <OF> <NN> <UNI> <OF> <COMPANY>+}
2553+
COMPANY: {<NN> <NNP> <OF> <NN> <UNI> <OF> <COMPANY>+} #1422
25472554
2548-
# Copyright (c) 1998-2000 University College London
2549-
COMPANY: {<UNI> <UNI> <NNP>}
2555+
# Copyright (c) 1998-2000 University College London #1423
2556+
COMPANY: {<UNI> <UNI> <NNP>} #1427
25502557
25512558
# "And" some name
25522559
ANDCO: {<CC>+ <NN> <NNP>+<UNI|COMP>?} #1430
@@ -2589,8 +2596,8 @@ def build_detection_from_node(
25892596
# Copyright 2015 The Error Prone Authors.
25902597
NAME: {<NN> <NAME> <CONTRIBUTORS|AUTHS>} #196023
25912598
2592-
# Copyright (C) <s>Suresh P <suresh@ippimail.com></s> #19601
2593-
NAME: {<NNP> <PN> <EMAIL>}
2599+
# Copyright (C) <s>Suresh P <suresh@ippimail.com></s>
2600+
NAME: {<NNP> <PN> <EMAIL>} #19601.1
25942601
25952602
# Copyright or Copr. Mines Paristech, France - Mark NOBLE, Alexandrine GESRET
25962603
NAME: {<NAME> <DASH> <NAME> <CAPS>} #19601
@@ -2739,13 +2746,13 @@ def build_detection_from_node(
27392746
COPYRIGHT: {<NAME-COPY> <NNP>} #2274
27402747
27412748
# Copyright 1994-2007 (c) RealNetworks, Inc.
2742-
COPYRIGHT: {<COPY>+ <YR-RANGE> <COPYRIGHT>} #2274
2749+
COPYRIGHT: {<COPY>+ <YR-RANGE> <COPYRIGHT>} #2275
27432750
27442751
# Copyright (c) 2017 Contributors et.al.
27452752
COPYRIGHT: {<COPY> <COPY> <YR-RANGE> <CONTRIBUTORS> <OTH> } #2276
27462753
27472754
#Copyright (c) 2020 Contributors as noted in the AUTHORS file
2748-
COPYRIGHT: {<COPY> <COPY> <YR-RANGE> <CONTRIBUTORS> <NN>* <IN>? <NN>* <CAPS|AUTHS|ATH> <JUNK> }
2755+
COPYRIGHT: {<COPY> <COPY> <YR-RANGE> <CONTRIBUTORS> <NN>* <IN>? <NN>* <CAPS|AUTHS|ATH> <JUNK> } #2277.1
27492756
27502757
# copyrighted by Object Computing, Inc., St. Louis Missouri, Copyright (C) 2002, all rights reserved.
27512758
COPYRIGHT: {<COPYRIGHT> <COPY>+ <YR-RANGE> <ALLRIGHTRESERVED>} #2278
@@ -2922,9 +2929,9 @@ def build_detection_from_node(
29222929
COPYRIGHT2: {<COPYRIGHT2> <JUNK> <COMPANY>} # 2010
29232930
29242931
# copyright C 1988 by the Institute of Electrical and Electronics Engineers, Inc.
2925-
COPYRIGHT: {<COPY> <PN> <YR-RANGE> <COMPANY>}
2932+
COPYRIGHT: {<COPY> <PN> <YR-RANGE> <COMPANY>} #2274.1
29262933
2927-
COPYRIGHT2: {<NAME-COPY> <COPYRIGHT2>} #2274
2934+
COPYRIGHT2: {<NAME-COPY> <COPYRIGHT2>} #2274.2
29282935
29292936
# (C) COPYRIGHT 2004 UNIVERSITY OF CHICAGO
29302937
COPYRIGHT: {<COPYRIGHT2> <UNI> <OF> <CAPS>} #2276
@@ -3069,7 +3076,7 @@ def build_detection_from_node(
30693076
COPYRIGHT: {<COPY> <NN>?<NNP>+ <AUTHS>} #83004
30703077
30713078
# (C) Distributed Management Task Force (Distributed is an NN)
3072-
COPYRIGHT: {<COPY> <NN> <NAME>} #83010
3079+
# COPYRIGHT: {<COPY> <NN> <NAME>} #83010
30733080
30743081
# Copyright (c) 2014 The Rust Project Developers
30753082
COPYRIGHT: {<COPYRIGHT> <MAINT> } #83020
@@ -4030,7 +4037,16 @@ def candidate_lines(numbered_lines):
40304037

40314038
previous_chars = chars_only
40324039
if TRACE:
4033-
logger_debug(' candidate_lines: line is <s></s>candidate')
4040+
logger_debug(' candidate_lines: line is <s></s> candidate')
4041+
4042+
elif 'http' in line:
4043+
# this is for copyright listing many URLs
4044+
in_copyright = 2
4045+
candidates_append(numbered_line)
4046+
4047+
previous_chars = chars_only
4048+
if TRACE:
4049+
logger_debug(' candidate_lines: line is HTTP candidate')
40344050

40354051
elif in_copyright > 0:
40364052
# these are a sign that the copyrights continue after
@@ -4045,6 +4061,7 @@ def candidate_lines(numbered_lines):
40454061
'copyrights',
40464062
'and',
40474063
'by',
4064+
',',
40484065
))
40494066
)
40504067
and not has_trailing_year(previous_chars)
@@ -4177,8 +4194,12 @@ def prepare_text_line(line, dedeb=True, to_ascii=True):
41774194
.replace('( C)', ' (c) ')
41784195
.replace('(C)', ' (c) ')
41794196
.replace('(c)', ' (c) ')
4180-
# the case of \251 is tested by 'weirdencoding.h'
4197+
.replace('( © )', ' (c) ')
4198+
.replace('(©)', ' (c) ')
4199+
.replace('(© )', ' (c) ')
4200+
.replace('( ©)', ' (c) ')
41814201
.replace('©', ' (c) ')
4202+
# the case of \251 is tested by 'weirdencoding.h'
41824203
.replace('\251', ' (c) ')
41834204
.replace('&copy;', ' (c) ')
41844205
.replace('&copy', ' (c) ')

src/textcode/markup.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -136,8 +136,8 @@ def demarkup_text(text):
136136
cleaned_append = cleaned.append
137137
for token in tags_and_ents:
138138
tlow = token.lower()
139-
if tlow.startswith(('<', '&', 'href',)) and not any(k in tlow for k in kept_tags):
140-
continue
139+
if tlow.startswith(('<', '/>', '&', 'href',)) and not any(k in tlow for k in kept_tags):
140+
cleaned_append(' ')
141141
else:
142142
cleaned_append(token)
143143
return ''.join(cleaned)

tests/cluecode/data/copyrights/debian_lib_1-libmono_cairo_cil.label.yml

Lines changed: 15 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -7,42 +7,41 @@ copyrights:
77
- Copyright (c) 2001-2005 Novell
88
- Copyright (c) Microsoft Corporation
99
- Copyright (c) 2007 James Newton-King
10-
- Copyright (c) 2002-2004 James W. Newkirk , Michael C. Two , Alexei A. Vorontsov , Charlie
11-
Poole
10+
- Copyright (c) 2002-2004 James W. Newkirk, Michael C. Two, Alexei A. Vorontsov, Charlie Poole
1211
- Copyright (c) 2000-2004 Philip A. Craig
13-
- Portions Copyright (c) 2002-2004 James W. Newkirk , Michael C. Two , Alexei A. Vorontsov
14-
, Charlie Poole
12+
- Portions Copyright (c) 2002-2004 James W. Newkirk, Michael C. Two, Alexei A. Vorontsov,
13+
Charlie Poole
1514
- Copyright (c) 2000-2004 Philip A. Craig
1615
- Copyright (c) 2007, 2008 LShift Ltd.
1716
- Copyright (c) 2007, 2008 Cohesive Financial Technologies LLC.
1817
- Copyright (c) 2007, 2008 Rabbit Technologies Ltd.
19-
- Copyright (c) 2007, 2008 LShift Ltd. , Cohesive Financial Technologies LLC., and Rabbit
20-
Technologies Ltd.
21-
- Copyright (c) 2007, 2008 LShift Ltd. , Cohesive Financial Technologies LLC. , and Rabbit
22-
Technologies Ltd.
23-
- Copyright (c) 2007 LShift Ltd. , Cohesive Financial Technologies LLC. , and Rabbit Technologies
18+
- Copyright (c) 2007, 2008 LShift Ltd., Cohesive Financial Technologies LLC., and Rabbit Technologies
19+
Ltd.
20+
- Copyright (c) 2007, 2008 LShift Ltd., Cohesive Financial Technologies LLC., and Rabbit Technologies
21+
Ltd.
22+
- Copyright (c) 2007 LShift Ltd., Cohesive Financial Technologies LLC., and Rabbit Technologies
2423
Ltd.
2524
- Copyright (c) ???? Simon Mourier <simonm@microsoft.com>
2625
holders:
2726
- The Apache Software Foundation
2827
- Novell
2928
- Microsoft Corporation
3029
- James Newton-King
31-
- James W. Newkirk , Michael C. Two , Alexei A. Vorontsov , Charlie Poole
30+
- James W. Newkirk, Michael C. Two, Alexei A. Vorontsov, Charlie Poole
3231
- Philip A. Craig
33-
- James W. Newkirk , Michael C. Two , Alexei A. Vorontsov , Charlie Poole
32+
- James W. Newkirk, Michael C. Two, Alexei A. Vorontsov, Charlie Poole
3433
- Philip A. Craig
3534
- LShift Ltd.
3635
- Cohesive Financial Technologies LLC.
3736
- Rabbit Technologies Ltd.
38-
- LShift Ltd. , Cohesive Financial Technologies LLC., and Rabbit Technologies Ltd.
39-
- LShift Ltd. , Cohesive Financial Technologies LLC. , and Rabbit Technologies Ltd.
40-
- LShift Ltd. , Cohesive Financial Technologies LLC. , and Rabbit Technologies Ltd.
37+
- LShift Ltd., Cohesive Financial Technologies LLC., and Rabbit Technologies Ltd.
38+
- LShift Ltd., Cohesive Financial Technologies LLC., and Rabbit Technologies Ltd.
39+
- LShift Ltd., Cohesive Financial Technologies LLC., and Rabbit Technologies Ltd.
4140
- Simon Mourier
4241
holders_summary:
43-
- value: LShift Ltd. , Cohesive Financial Technologies LLC., and Rabbit Technologies Ltd.
42+
- value: LShift Ltd., Cohesive Financial Technologies LLC., and Rabbit Technologies Ltd.
4443
count: 3
45-
- value: James W. Newkirk , Michael C. Two , Alexei A. Vorontsov , Charlie Poole
44+
- value: James W. Newkirk, Michael C. Two, Alexei A. Vorontsov, Charlie Poole
4645
count: 2
4746
- value: Philip A. Craig
4847
count: 2

tests/cluecode/data/copyrights/debian_lib_2-libmono_cairo_cil.copyright.yml

Lines changed: 15 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -7,42 +7,41 @@ copyrights:
77
- Copyright (c) 2001-2005 Novell
88
- Copyright (c) Microsoft Corporation
99
- Copyright (c) 2007 James Newton-King
10-
- Copyright (c) 2002-2004 James W. Newkirk , Michael C. Two , Alexei A. Vorontsov , Charlie
11-
Poole
10+
- Copyright (c) 2002-2004 James W. Newkirk, Michael C. Two, Alexei A. Vorontsov, Charlie Poole
1211
- Copyright (c) 2000-2004 Philip A. Craig
13-
- Portions Copyright (c) 2002-2004 James W. Newkirk , Michael C. Two , Alexei A. Vorontsov
14-
, Charlie Poole
12+
- Portions Copyright (c) 2002-2004 James W. Newkirk, Michael C. Two, Alexei A. Vorontsov,
13+
Charlie Poole
1514
- Copyright (c) 2000-2004 Philip A. Craig
1615
- Copyright (c) 2007, 2008 LShift Ltd.
1716
- Copyright (c) 2007, 2008 Cohesive Financial Technologies LLC.
1817
- Copyright (c) 2007, 2008 Rabbit Technologies Ltd.
19-
- Copyright (c) 2007, 2008 LShift Ltd., Cohesive Financial Technologies LLC. , and Rabbit
20-
Technologies Ltd.
21-
- Copyright (c) 2007, 2008 LShift Ltd. , Cohesive Financial Technologies LLC. , and Rabbit
22-
Technologies Ltd.
23-
- Copyright (c) 2007 LShift Ltd. , Cohesive Financial Technologies LLC. , and Rabbit Technologies
18+
- Copyright (c) 2007, 2008 LShift Ltd., Cohesive Financial Technologies LLC., and Rabbit Technologies
19+
Ltd.
20+
- Copyright (c) 2007, 2008 LShift Ltd., Cohesive Financial Technologies LLC., and Rabbit Technologies
21+
Ltd.
22+
- Copyright (c) 2007 LShift Ltd., Cohesive Financial Technologies LLC., and Rabbit Technologies
2423
Ltd.
2524
- Copyright (c) ???? Simon Mourier <simonm@microsoft.com>
2625
holders:
2726
- The Apache Software Foundation
2827
- Novell
2928
- Microsoft Corporation
3029
- James Newton-King
31-
- James W. Newkirk , Michael C. Two , Alexei A. Vorontsov , Charlie Poole
30+
- James W. Newkirk, Michael C. Two, Alexei A. Vorontsov, Charlie Poole
3231
- Philip A. Craig
33-
- James W. Newkirk , Michael C. Two , Alexei A. Vorontsov , Charlie Poole
32+
- James W. Newkirk, Michael C. Two, Alexei A. Vorontsov, Charlie Poole
3433
- Philip A. Craig
3534
- LShift Ltd.
3635
- Cohesive Financial Technologies LLC.
3736
- Rabbit Technologies Ltd.
38-
- LShift Ltd., Cohesive Financial Technologies LLC. , and Rabbit Technologies Ltd.
39-
- LShift Ltd. , Cohesive Financial Technologies LLC. , and Rabbit Technologies Ltd.
40-
- LShift Ltd. , Cohesive Financial Technologies LLC. , and Rabbit Technologies Ltd.
37+
- LShift Ltd., Cohesive Financial Technologies LLC., and Rabbit Technologies Ltd.
38+
- LShift Ltd., Cohesive Financial Technologies LLC., and Rabbit Technologies Ltd.
39+
- LShift Ltd., Cohesive Financial Technologies LLC., and Rabbit Technologies Ltd.
4140
- Simon Mourier
4241
holders_summary:
43-
- value: LShift Ltd., Cohesive Financial Technologies LLC. , and Rabbit Technologies Ltd.
42+
- value: LShift Ltd., Cohesive Financial Technologies LLC., and Rabbit Technologies Ltd.
4443
count: 3
45-
- value: James W. Newkirk , Michael C. Two , Alexei A. Vorontsov , Charlie Poole
44+
- value: James W. Newkirk, Michael C. Two, Alexei A. Vorontsov, Charlie Poole
4645
count: 2
4746
- value: Philip A. Craig
4847
count: 2

tests/cluecode/data/copyrights/debian_lib_3-libmono_security_cil.copyright.yml

Lines changed: 13 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -7,19 +7,18 @@ copyrights:
77
- Copyright (c) 2001-2005 Novell
88
- Copyright (c) Microsoft Corporation
99
- Copyright (c) 2007 James Newton-King
10-
- Copyright (c) 2002-2004 James W. Newkirk , Michael C. Two , Alexei A. Vorontsov , Charlie
11-
Poole
10+
- Copyright (c) 2002-2004 James W. Newkirk, Michael C. Two, Alexei A. Vorontsov, Charlie Poole
1211
- Copyright (c) 2000-2004 Philip A. Craig
13-
- Portions Copyright (c) 2002-2004 James W. Newkirk , Michael C. Two , Alexei A. Vorontsov
14-
, Charlie Poole
12+
- Portions Copyright (c) 2002-2004 James W. Newkirk, Michael C. Two, Alexei A. Vorontsov,
13+
Charlie Poole
1514
- Copyright (c) 2000-2004 Philip A. Craig
1615
- Copyright (c) 2007, 2008 LShift Ltd.
1716
- Copyright (c) 2007, 2008 Cohesive Financial Technologies LLC.
1817
- Copyright (c) 2007, 2008 Rabbit Technologies Ltd.
19-
- Copyright (c) 2007, 2008 LShift Ltd. , Cohesive Financial Technologies LLC. , and Rabbit
20-
Technologies Ltd.
21-
- Copyright (c) 2007, 2008 LShift Ltd. , Cohesive Financial Technologies LLC. , and Rabbit
22-
Technologies Ltd.
18+
- Copyright (c) 2007, 2008 LShift Ltd., Cohesive Financial Technologies LLC., and Rabbit Technologies
19+
Ltd.
20+
- Copyright (c) 2007, 2008 LShift Ltd., Cohesive Financial Technologies LLC., and Rabbit Technologies
21+
Ltd.
2322
- Copyright (c) 2007 LShift Ltd., Cohesive Financial Technologies LLC., and Rabbit Technologies
2423
Ltd.
2524
- Copyright (c) ???? Simon Mourier <simonm@microsoft.com>
@@ -28,21 +27,21 @@ holders:
2827
- Novell
2928
- Microsoft Corporation
3029
- James Newton-King
31-
- James W. Newkirk , Michael C. Two , Alexei A. Vorontsov , Charlie Poole
30+
- James W. Newkirk, Michael C. Two, Alexei A. Vorontsov, Charlie Poole
3231
- Philip A. Craig
33-
- James W. Newkirk , Michael C. Two , Alexei A. Vorontsov , Charlie Poole
32+
- James W. Newkirk, Michael C. Two, Alexei A. Vorontsov, Charlie Poole
3433
- Philip A. Craig
3534
- LShift Ltd.
3635
- Cohesive Financial Technologies LLC.
3736
- Rabbit Technologies Ltd.
38-
- LShift Ltd. , Cohesive Financial Technologies LLC. , and Rabbit Technologies Ltd.
39-
- LShift Ltd. , Cohesive Financial Technologies LLC. , and Rabbit Technologies Ltd.
37+
- LShift Ltd., Cohesive Financial Technologies LLC., and Rabbit Technologies Ltd.
38+
- LShift Ltd., Cohesive Financial Technologies LLC., and Rabbit Technologies Ltd.
4039
- LShift Ltd., Cohesive Financial Technologies LLC., and Rabbit Technologies Ltd.
4140
- Simon Mourier
4241
holders_summary:
43-
- value: LShift Ltd. , Cohesive Financial Technologies LLC. , and Rabbit Technologies Ltd.
42+
- value: LShift Ltd., Cohesive Financial Technologies LLC., and Rabbit Technologies Ltd.
4443
count: 3
45-
- value: James W. Newkirk , Michael C. Two , Alexei A. Vorontsov , Charlie Poole
44+
- value: James W. Newkirk, Michael C. Two, Alexei A. Vorontsov, Charlie Poole
4645
count: 2
4746
- value: Philip A. Craig
4847
count: 2

0 commit comments

Comments
 (0)