Skip to content

Commit a6c3846

Browse files
committed
Detect CREDITS authors
And also improve other copyright detections Signed-off-by: Philippe Ombredanne <pombredanne@nexb.com>
1 parent 0003af2 commit a6c3846

File tree

5 files changed

+440
-0
lines changed

5 files changed

+440
-0
lines changed

src/cluecode/linux_credits.py

Lines changed: 154 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,154 @@
1+
# -*- coding: utf-8 -*-
2+
#
3+
# Copyright (c) nexB Inc. and others. All rights reserved.
4+
# ScanCode is a trademark of nexB Inc.
5+
# SPDX-License-Identifier: Apache-2.0
6+
# See http://www.apache.org/licenses/LICENSE-2.0 for the license text.
7+
# See https://github.com/aboutcode-org/scancode-toolkit for support or download.
8+
# See https://aboutcode.org for more information about nexB OSS projects.
9+
#
10+
11+
import os
12+
import sys
13+
14+
from collections import deque
15+
16+
from commoncode.fileutils import file_name
17+
18+
"""
19+
Detect and collect authors from a Linux-formatted CREDITS file.
20+
This is used by Linux, but also Raku, Phasar, u-boot, LLVM, Botan and other projects.
21+
An entry looks like this:
22+
N: Jack Lloyd
23+
E: lloyd@randombit.net
24+
W: http://www.randombit.net/
25+
P: 3F69 2E64 6D92 3BBE E7AE 9258 5C0F 96E8 4EC1 6D6B
26+
B: 1DwxWb2J4vuX4vjsbzaCXW696rZfeamahz
27+
28+
We only consider the entries: N: name, E: email and W: web URL
29+
"""
30+
# Tracing flags
31+
# Tracing is off unless the SCANCODE_DEBUG_CREDITS environment variable is set.
# NOTE: the raw environment string is kept as-is, so any non-empty value
# (even "0" or "false") is truthy and enables tracing.
TRACE = False or os.environ.get('SCANCODE_DEBUG_CREDITS', False)
32+
33+
34+
# Default no-op logger, redefined below when TRACE is enabled
35+
def logger_debug(*args):
    """
    Accept any positional ``args`` and do nothing with them.
    This is the placeholder logger in effect unless it is redefined when TRACE is set.
    """
    return None
37+
38+
39+
if TRACE:
    # Tracing is enabled: send DEBUG messages to stdout and replace the no-op
    # logger_debug with a real implementation.
    import logging

    logger = logging.getLogger(__name__)
    logging.basicConfig(stream=sys.stdout)
    logger.setLevel(logging.DEBUG)

    def logger_debug(*args):
        """
        Log ``args`` at the DEBUG level as a single space-separated message.
        Strings are used as-is and any other object is rendered with repr().
        """
        # A conditional expression is used rather than "and/or" chaining so that an
        # empty string argument stays an empty string instead of becoming "''".
        return logger.debug(' '.join(a if isinstance(a, str) else repr(a) for a in args))
48+
49+
50+
def is_credits_file(location):
    """
    Return True if the file name of ``location`` is a known credits or authors
    file name, ignoring case.
    """
    known_names = {
        "credit",
        "credits",
        "credits.rst",
        "credits.txt",
        "credits.md",
        "author",
        "authors",
        "authors.rst",
        "authors.txt",
        "authors.md",
    }
    return file_name(location).lower() in known_names
69+
70+
71+
def detect_credits_authors(location):
    """
    Yield an AuthorDetection object for each author detected in the CREDITS file
    at ``location``. Yield nothing if ``location`` is not named like a credits file.
    """
    if is_credits_file(location):
        # imported lazily, only once we know there is a credits file to read
        from textcode.analysis import numbered_text_lines

        # materialized as a list so the lines can be traversed more than once downstream
        text_lines = list(numbered_text_lines(location, demarkup=False))
        yield from detect_credits_authors_from_lines(text_lines)
82+
83+
84+
def detect_credits_authors_from_lines(numbered_lines):
    """
    Yield AuthorDetection objects detected in the CREDITS file ``numbered_lines`` iterable of
    (line number, line text) tuples.

    Each group of contiguous credit lines yields at most one detection. Its author string is
    made of the group names (N:), then emails (E:), then web URLs (W:), all joined with spaces.
    Its start and end lines are the line numbers of the first and last credit lines kept in
    the group. Empty field values are ignored and a group with no usable value yields nothing.
    """

    if TRACE:
        # Materialize first: the tracing loop below walks every line and would otherwise
        # exhaust a one-shot iterator before the detection gets to see any line.
        numbered_lines = list(numbered_lines)
        logger_debug('detect_credits_authors_from_lines: numbered_lines')
        for nl in numbered_lines:
            logger_debug(' numbered_line:', repr(nl))

    # NOTE(review): function-level import, presumably to avoid a circular import -- confirm
    from cluecode.copyrights import AuthorDetection

    for lines in get_credit_lines_groups(numbered_lines):
        if TRACE:
            logger_debug('detect_credits_authors_from_lines: credit_lines group:', lines)

        start_line, _ = lines[0]
        end_line, _ = lines[-1]

        # values collected for each field type, keyed in the order they are reported
        values_by_type = {"N": [], "E": [], "W": []}
        for _, line in lines:
            ltype, _, value = line.partition(":")
            value = value.strip()
            values = values_by_type.get(ltype)
            # skip field types we do not track, and empty values that would otherwise
            # add stray spaces to the author string
            if values is not None and value:
                values.append(value)

        items = [" ".join(values) for values in values_by_type.values() if values]
        if TRACE:
            logger_debug('detect_credits_authors_from_lines: items:', items)

        author = " ".join(items)
        if author:
            yield AuthorDetection(author=author, start_line=start_line, end_line=end_line)
123+
124+
125+
def get_credit_lines_groups(numbered_lines):
    """
    Yield lists of (line number, stripped line text) tuples, one list for each group of
    credit lines. Groups are separated by one or more empty lines.

    Only the lines of interest are kept in a group: these are the lines starting with "N:",
    "E:" or "W:". Stop early if no such line is found by the time a line number over 50 is
    reached.
    """
    credit_prefixes = ("N:", "E:", "W:")
    current_group = []
    seen_credit = False

    for line_number, text in numbered_lines:
        stripped = text.strip()

        if stripped:
            if stripped.startswith(credit_prefixes):
                seen_credit = True
                current_group.append((line_number, stripped))

        elif current_group:
            # an empty line closes the group collected so far
            if TRACE:
                logger_debug('get_credit_lines_groups: lines_group:', current_group)

            yield current_group
            current_group = []

        # bail out if there are no structured credits in the first 50 lines
        if line_number > 50 and not seen_credit:
            return

    # the last group may not be followed by an empty line
    if current_group:
        yield current_group

tests/cluecode/data/credits/CREDITS

Lines changed: 115 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,115 @@
1+
This is at least a partial credits-file of people that have
2+
contributed to the Linux project. It is sorted by name and
3+
formatted to allow easy grepping and beautification by
4+
scripts. The fields are: name (N), email (E), web-address
5+
(W), PGP key ID and fingerprint (P), description (D), and
6+
snail-mail address (S).
7+
Thanks,
8+
9+
Linus
10+
----------
11+
12+
N: Matt Mackal
13+
E: mpm@selenic.com
14+
D: SLOB slab allocator
15+
16+
N: Werner Almesberger
17+
E: werner@almesberger.net
18+
W: http://www.almesberger.net/
19+
D: dosfs, LILO, some fd features, ATM, various other hacks here and there
20+
S: Buenos Aires
21+
S: Argentina
22+
23+
N: Tim Alpaerts
24+
E: tim_alpaerts@toyota-motor-europe.com
25+
D: 802.2 class II logical link control layer,
26+
D: the humble start of an opening towards the IBM SNA protocols
27+
S: Klaproosstraat 72 c 10
28+
S: B-2610 Wilrijk-Antwerpen
29+
S: Belgium
30+
31+
N: Anton Altaparmakov
32+
E: aia21@cantab.net
33+
W: http://www-stu.christs.cam.ac.uk/~aia21/
34+
D: Author of new NTFS driver, various other kernel hacks.
35+
S: Christ's College
36+
S: Cambridge CB2 3BU
37+
S: United Kingdom
38+
39+
N: C. Scott Ananian
40+
E: cananian@alumni.princeton.edu
41+
W: http://www.pdos.lcs.mit.edu/~cananian
42+
P: 1024/85AD9EED AD C0 49 08 91 67 DF D7 FA 04 1A EE 09 E8 44 B0
43+
D: Unix98 pty support.
44+
D: APM update to 1.2 spec.
45+
D: /devfs hacking.
46+
S: 7 Kiwi Loop
47+
S: Howell, NJ 07731
48+
S: USA
49+
50+
N: Erik Andersen
51+
E: andersen@codepoet.org
52+
W: http://www.codepoet.org/
53+
P: 1024D/30D39057 1BC4 2742 E885 E4DE 9301 0C82 5F9B 643E 30D3 9057
54+
D: Maintainer of ide-cd and Uniform CD-ROM driver,
55+
D: ATAPI CD-Changer support, Major 2.1.x CD-ROM update.
56+
S: 352 North 525 East
57+
S: Springville, Utah 84663
58+
S: USA
59+
60+
N: Michel Aubry
61+
E: giovanni <giovanni@sudfr.com>
62+
D: Aladdin 1533/1543(C) chipset IDE
63+
D: VIA MVP-3/TX Pro III chipset IDE
64+
65+
N: Ralf Baechle
66+
E: ralf@gnu.org
67+
P: 1024/AF7B30C1 CF 97 C2 CC 6D AE A7 FE C8 BA 9C FC 88 DE 32 C3
68+
D: Linux/MIPS port
69+
D: Linux/68k hacker
70+
S: Hauptstrasse 19
71+
S: 79837 St. Blasien
72+
S: Germany
73+
74+
N: Krishna Balasubramanian
75+
E: balasub@cis.ohio-state.edu
76+
D: Wrote SYS V IPC (part of standard kernel since 0.99.10)
77+
78+
N: Chris Ball
79+
E: chris@printf.net
80+
D: Former maintainer of the MMC/SD/SDIO subsystem.
81+
82+
N: Dario Ballabio
83+
E: ballabio_dario@emc.com
84+
E: dario.ballabio@tiscalinet.it
85+
E: dario.ballabio@inwind.it
86+
D: Author and maintainer of the Ultrastor 14F/34F SCSI driver
87+
D: Author and maintainer of the EATA ISA/EISA/PCI SCSI driver
88+
S: EMC Corporation
89+
S: Milano
90+
S: Italy
91+
92+
N: Paul Bame
93+
E: bame@debian.org
94+
E: bame@puffin.external.hp.com
95+
E: paul_bame@hp.com
96+
W: http://www.parisc-linux.org
97+
D: PA-RISC 32 and 64-bit early boot, firmware interface, interrupts, misc
98+
S: MS42
99+
S: Hewlett-Packard
100+
S: 3404 E Harmony Rd
101+
S: Fort Collins, CO 80525
102+
S: USA
103+
104+
N: Juan Jose Ciarlante
105+
W: http://juanjox.kernelnotes.org/
106+
E: jjciarla@raiz.uncu.edu.ar
107+
E: jjo@mendoza.gov.ar
108+
D: Network driver alias support
109+
D: IP masq hashing and app modules
110+
D: IP masq 2.1 features and bugs
111+
S: Las Cuevas 2385 - Bo Guemes
112+
S: Las Heras, Mendoza CP 5539
113+
S: Argentina
114+
W: http://juanjox.raiz.uncu.edu.ar/
115+
Lines changed: 67 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,67 @@
1+
[
2+
{
3+
"author": "Matt Mackal mpm@selenic.com",
4+
"start_line": 12,
5+
"end_line": 13
6+
},
7+
{
8+
"author": "Werner Almesberger werner@almesberger.net http://www.almesberger.net/",
9+
"start_line": 16,
10+
"end_line": 18
11+
},
12+
{
13+
"author": "Tim Alpaerts tim_alpaerts@toyota-motor-europe.com",
14+
"start_line": 23,
15+
"end_line": 24
16+
},
17+
{
18+
"author": "Anton Altaparmakov aia21@cantab.net http://www-stu.christs.cam.ac.uk/~aia21/",
19+
"start_line": 31,
20+
"end_line": 33
21+
},
22+
{
23+
"author": "C. Scott Ananian cananian@alumni.princeton.edu http://www.pdos.lcs.mit.edu/~cananian",
24+
"start_line": 39,
25+
"end_line": 41
26+
},
27+
{
28+
"author": "Erik Andersen andersen@codepoet.org http://www.codepoet.org/",
29+
"start_line": 50,
30+
"end_line": 52
31+
},
32+
{
33+
"author": "Michel Aubry giovanni <giovanni@sudfr.com>",
34+
"start_line": 60,
35+
"end_line": 61
36+
},
37+
{
38+
"author": "Ralf Baechle ralf@gnu.org",
39+
"start_line": 65,
40+
"end_line": 66
41+
},
42+
{
43+
"author": "Krishna Balasubramanian balasub@cis.ohio-state.edu",
44+
"start_line": 74,
45+
"end_line": 75
46+
},
47+
{
48+
"author": "Chris Ball chris@printf.net",
49+
"start_line": 78,
50+
"end_line": 79
51+
},
52+
{
53+
"author": "Dario Ballabio ballabio_dario@emc.com dario.ballabio@tiscalinet.it dario.ballabio@inwind.it",
54+
"start_line": 82,
55+
"end_line": 85
56+
},
57+
{
58+
"author": "Paul Bame bame@debian.org bame@puffin.external.hp.com paul_bame@hp.com http://www.parisc-linux.org",
59+
"start_line": 92,
60+
"end_line": 96
61+
},
62+
{
63+
"author": "Juan Jose Ciarlante jjciarla@raiz.uncu.edu.ar jjo@mendoza.gov.ar http://juanjox.kernelnotes.org/ http://juanjox.raiz.uncu.edu.ar/",
64+
"start_line": 104,
65+
"end_line": 114
66+
}
67+
]

0 commit comments

Comments
 (0)