Skip to content

Commit 1f94c9d

Browse files
committed
Improve copyright detection
* Handle various parens, markup and quotes better

Signed-off-by: Philippe Ombredanne <pombredanne@nexb.com>
1 parent 6438377 commit 1f94c9d

File tree

57 files changed

+647
-37
lines changed

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

57 files changed

+647
-37
lines changed

src/cluecode/copyrights.py

Lines changed: 15 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -2389,7 +2389,7 @@ def build_detection_from_node(
23892389
NAME-EMAIL: {<NAME> <EMAIL>} #530
23902390
23912391
# Project Mayo.
2392-
NAME-YEAR: {<YR-RANGE> <NAME-EMAIL|COMPANY>+ <NNP>?} #535
2392+
NAME-YEAR: {<PARENS>? <YR-RANGE> <NAME-EMAIL|COMPANY>+ <NNP>? <PARENS>?} #535
23932393
23942394
NAME-YEAR: {<YR-RANGE> <NAME-EMAIL|COMPANY>+ <CC> <YR-RANGE>} #540
23952395
@@ -2404,7 +2404,6 @@ def build_detection_from_node(
24042404
# Copyright 2018, OpenCensus Authors
24052405
COPYRIGHT: {<COPY>+ <YR-RANGE> <NNP> <AUTHS>} #1579991
24062406
2407-
24082407
NAME-YEAR: {<YR-RANGE> <NNP>+ <CAPS>?} #5612
24092408
24102409
#Academy of Motion Picture Arts and Sciences
@@ -2418,6 +2417,8 @@ def build_detection_from_node(
24182417
24192418
NAME-YEAR: {<YR-RANGE> <NAME>+ <CONTRIBUTORS>?} #570
24202419
2420+
URL: {<PARENS> <URL> <PARENS>} #5700
2421+
24212422
#also accept trailing email and URLs
24222423
NAME-YEAR: {<NAME-YEAR> <EMAIL>?<URL>?} #5701
24232424
NAME-YEAR: {<NAME-YEAR>+} #5702
@@ -2470,7 +2471,6 @@ def build_detection_from_node(
24702471
# this is catching a wide net by treating any bare URL as a company
24712472
COMPANY: {<NNP>? <URL|URL2>} #830
24722473
2473-
24742474
COMPANY: {<COMPANY> <COMP|COMPANY>} #840
24752475
24762476
# the Software and Component Technologies group of Trimble Navigation, Ltd.
@@ -2649,6 +2649,9 @@ def build_detection_from_node(
26492649
# Copyright (c) Ian F. Darwin 1986, 1987, 1989, 1990, 1991, 1992, 1994, 1995.
26502650
COPYRIGHT: {<COPY>+ <NAME|NAME-EMAIL|NAME-YEAR>+ <YR-RANGE>*} #157999
26512651
2652+
# portions copyright The Internet Society, Tom Tromey and Red Hat, Inc.
2653+
COPYRIGHT: {<PORTIONS> <COPY> <NN> <NAME>} #157998
2654+
26522655
COPYRIGHT: {<COPY>+ <CAPS|NNP>+ <CC> <NN> <COPY> <YR-RANGE>?} #1590
26532656
26542657
# // (c) (C) → ©
@@ -2737,6 +2740,9 @@ def build_detection_from_node(
27372740
# (c) Copyright 1985-1999 SOME TECHNOLOGY SYSTEMS
27382741
COPYRIGHT2: {<COPY> <COPY> <YR-RANGE> <CAPS> <CAPS> <CAPS>? <CAPS>?} #2271
27392742
2743+
# Minpack Copyright Notice (1999) University of Chicago
2744+
COPYRIGHT: {<COPY> <NOTICE> <NAME-YEAR>} #2273.1
2745+
27402746
# NAME-COPY is a name with a trailing copyright
27412747
# Daisy (c) 1998
27422748
NAME-COPY: {<NNP> <COPY>} #2272
@@ -3081,6 +3087,12 @@ def build_detection_from_node(
30813087
# Copyright (c) 2014 The Rust Project Developers
30823088
COPYRIGHT: {<COPYRIGHT> <MAINT> } #83020
30833089
3090+
# copyright its authors
3091+
COPYRIGHT: {<COPY> <NN> <AUTHS>} #83030
3092+
3093+
# Copyright: 2004-2007 by Internet Systems Consortium, Inc. ("ISC")
3094+
# 1995-2003 by Internet Software Consortium
3095+
COPYRIGHT: {<YR-RANGE> <BY> <COMPANY> } #1615
30843096
30853097
#######################################
30863098
# Copyright is held by ....

src/textcode/markup.py

Lines changed: 17 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -108,7 +108,22 @@ def demarkup(location):
108108
yield demarkup_text(line)
109109

110110

111-
get_tags_and_entities = re.compile(r'(</?[^\s></]+(?:>|\s)?|&[^\s&]+;|href|[\'"]?\/\>)', re.IGNORECASE).split
111+
get_tags_and_entities = re.compile(
112+
r'('
113+
r'</?[^\s></]+(?:>'
114+
r'|'
115+
r'\s)?'
116+
r'|'
117+
r'&[^\s&]+;'
118+
r'|'
119+
r'href'
120+
r'|'
121+
'[\'"]?\/\>'
122+
r'|'
123+
r'/>'
124+
r')',
125+
re.IGNORECASE,
126+
).split
112127

113128

114129
def demarkup_text(text):
@@ -136,7 +151,7 @@ def demarkup_text(text):
136151
cleaned_append = cleaned.append
137152
for token in tags_and_ents:
138153
tlow = token.lower()
139-
if tlow.startswith(('<', '/>', '&', 'href',)) and not any(k in tlow for k in kept_tags):
154+
if tlow.startswith(('<', '/>', '"/>', "'/>", '&', 'href',)) and not any(k in tlow for k in kept_tags):
140155
cleaned_append(' ')
141156
else:
142157
cleaned_append(token)

tests/cluecode/data/copyright_fossology/testdata87_raw

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@ It was downloaded from http://ftp.isc.org/isc/dhcp/
66
Upstream <s>Author: Internet Systems Consortium (ISC) <dhcp-users@isc.org></s>
77

88
<s>Copyright 2004-2007 by Internet Systems Consortium, Inc.</s> ("ISC")
9-
1995-2003 by Internet Software Consortium
9+
<s>1995-2003 by Internet Software Consortium</s>
1010

1111
License:
1212

tests/cluecode/data/copyright_fossology/testdata93_raw

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -33,10 +33,10 @@ Copyright:
3333
CTOCWidget.js:
3434
<s>Copyright (c) 2003 The Netscape Corporation</s>.
3535
xbCollapsibleLists.js:
36-
<s>Copyright (c) 1997 Michael Bostock</s> (Netscape Communications).
37-
<s>Copyright (c) 2001 Bob Clary</s> (Netscape Communications).
38-
<s>Copyright (c) 2001 Seth Dillingham</s> (Macrobyte Resources).
39-
<s>Copyright (c) 2002 Mark Filanowicz</s> (Amdahl IT Services).
36+
<s>Copyright (c) 1997 Michael Bostock (Netscape Communications)</s>.
37+
<s>Copyright (c) 2001 Bob Clary (Netscape Communications)</s>.
38+
<s>Copyright (c) 2001 Seth Dillingham (Macrobyte Resources)</s>.
39+
<s>Copyright (c) 2002 Mark Filanowicz (Amdahl IT Services)</s>.
4040

4141
Upstream <s>Author:
4242

tests/cluecode/data/copyrights/libcdio10-libcdio.label.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,7 @@ copyrights:
2020
- Copyright (c) 1985, 1987, 1988, 1989, 1990, 1991, 1992, 1993, 1994, 1996, 1997, 1998, 1999,
2121
2000 Free Software Foundation, Inc.
2222
- Copyright (c) 2003 Matthias Drochner
23-
- Copyright (c) 1998-2001 VideoLAN ( Johan Bilien <jobi@via.ecp.fr> and Gildas Bazin <gbazin@netcourrier.com> )
23+
- Copyright (c) 1998-2001 VideoLAN Johan Bilien <jobi@via.ecp.fr> and Gildas Bazin <gbazin@netcourrier.com>
2424
- Copyright (c) 1992, 1993 Eric Youngdale
2525
- Copyright (c) 2003, 2004, 2005, 2006, 2007, 2008 Rocky Bernstein and Herbert Valerio Riedel
2626
holders:

tests/cluecode/data/copyrights/misco2/distributed_3.txt

Lines changed: 0 additions & 1 deletion
This file was deleted.

tests/cluecode/data/copyrights/misco2/distributed_3.txt.yml

Lines changed: 0 additions & 11 deletions
This file was deleted.

tests/cluecode/data/copyrights/misco2/its-authors.txt.yml

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,3 +3,11 @@ what:
33
- holders
44
- holders_summary
55
- authors
6+
copyrights:
7+
- copyright its authors
8+
holders:
9+
- its authors
10+
holders_summary:
11+
- value: its authors
12+
count: 1
13+
Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
Copyright © 1998-2009 Bill Spitzak
2+
(spitzak@users.sourceforge.net ) and others, including:
3+
4+
Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,12 @@
1+
what:
2+
- copyrights
3+
- holders
4+
- holders_summary
5+
- authors
6+
copyrights:
7+
- Copyright (c) 1998-2009 Bill Spitzak (spitzak@users.sourceforge.net ) and others
8+
holders:
9+
- Bill Spitzak and others
10+
holders_summary:
11+
- value: Bill Spitzak and others
12+
count: 1

0 commit comments

Comments
 (0)