Skip to content

Commit 1242518

Browse files
Merge pull request #3752 from nexB/misc-copyrights
* Detect odd name in copyright #3655 Reported-by: Anton Augsburg @vw-anton Reference: #3655 Signed-off-by: Philippe Ombredanne <pombredanne@nexb.com> * Do not detect trailing Distributed in copyright #3735 Reported-by: Dimitris Iliou @dimitris-iliou Reference: #3735 Signed-off-by: Philippe Ombredanne <pombredanne@nexb.com> * Improve misc. copyright detections Spotted in some common python libraries such as numpy and scipy Signed-off-by: Philippe Ombredanne <pombredanne@nexb.com> * Add new script to generate copyright tests Use an input file where each line is either: - a URL to fetch - a text to test Then generate a test data files pair accordingly Signed-off-by: Philippe Ombredanne <pombredanne@nexb.com> * Improve copyright detection - Start detecting "is held by" - Do not include some trailing junk Signed-off-by: Philippe Ombredanne <pombredanne@nexb.com> * Detect NN/EMAIL copyright combo #3764 Reference: #3764 Reported-by: Anton Augsburg @vw-anton Signed-off-by: Philippe Ombredanne <pombredanne@nexb.com> * Detect NN/EMAIL copyright combo #3764 Make detection of copyright with a single lowercase name more specific Reference: #3764 Reported-by: Anton Augsburg @vw-anton Signed-off-by: Philippe Ombredanne <pombredanne@nexb.com> * Align license with improved copyrights Signed-off-by: Philippe Ombredanne <pombredanne@nexb.com> * Improve copyright detection of "distributed" Signed-off-by: Philippe Ombredanne <pombredanne@nexb.com> * Do not detect some words as NNP This makes copyright detection more specific Signed-off-by: Philippe Ombredanne <pombredanne@nexb.com> * Improve copyright tests Signed-off-by: Philippe Ombredanne <pombredanne@nexb.com> * Detect OpenStreetMap correctly Signed-off-by: Philippe Ombredanne <pombredanne@nexb.com> * Add new copyright detection tests Signed-off-by: Philippe Ombredanne <pombredanne@nexb.com> * Improve copyright detection side-effects Signed-off-by: Philippe Ombredanne <pombredanne@nexb.com> * Enable generation of copyright test 
file Signed-off-by: Philippe Ombredanne <pombredanne@nexb.com> * Improve copyright debug tracing Signed-off-by: Philippe Ombredanne <pombredanne@nexb.com> * Detect new form of copyright Signed-off-by: Philippe Ombredanne <pombredanne@nexb.com> * Do not add arbitrary space around markup Signed-off-by: Philippe Ombredanne <pombredanne@nexb.com> * Improve handling of parens in copyright Also improve NOTICEs, and other misc. variants Do not detect "The Initial Developer" Signed-off-by: Philippe Ombredanne <pombredanne@nexb.com> * Correctly filter copyrights in licenses #3797 Reference: #3797 Reported-by: Jörg Arndt @Joerki Signed-off-by: Philippe Ombredanne <pombredanne@nexb.com> * Improve copyright detection Handle corner cases with markup Detect new copyright forms. Signed-off-by: Philippe Ombredanne <pombredanne@nexb.com> * Rename README file Signed-off-by: Philippe Ombredanne <pombredanne@nexb.com> * Improve copyright detection * Handle better various parens, markup and quotes Signed-off-by: Philippe Ombredanne <pombredanne@nexb.com> * Improve copyright detection Signed-off-by: Philippe Ombredanne <pombredanne@nexb.com> * Refine copyright detection Signed-off-by: Philippe Ombredanne <pombredanne@nexb.com> * Use latest commoncode Signed-off-by: Philippe Ombredanne <pombredanne@nexb.com> * Enable generation of copyright test data files Signed-off-by: Philippe Ombredanne <pombredanne@nexb.com> * Do not regen demarkup tests Signed-off-by: Philippe Ombredanne <pombredanne@nexb.com> Co-authored-by: Ayan Sinha Mahapatra <ayansmahapatra@gmail.com> --------- Signed-off-by: Philippe Ombredanne <pombredanne@nexb.com> Co-authored-by: Ayan Sinha Mahapatra <ayansmahapatra@gmail.com>
2 parents e4f6267 + f07eaee commit 1242518

File tree

234 files changed

+2672
-669
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

234 files changed

+2672
-669
lines changed

etc/scripts/gen_copyright_tests.py

Lines changed: 162 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,162 @@
1+
#!/usr/bin/env python
2+
# -*- coding: utf-8 -*-
3+
#
4+
# Copyright (c) nexB Inc. and others. All rights reserved.
5+
# ScanCode is a trademark of nexB Inc.
6+
# SPDX-License-Identifier: Apache-2.0
7+
# See http://www.apache.org/licenses/LICENSE-2.0 for the license text.
8+
# See https://github.com/nexB/skeleton for support or download.
9+
# See https://aboutcode.org for more information about nexB OSS projects.
10+
#
11+
12+
import time
13+
14+
from datetime import datetime
15+
16+
import click
17+
import requests
18+
19+
20+
def timestamp():
    """
    Return the current UTC date as an ISO 8601 "YYYY-MM-DD" string.
    """
    # datetime.utcnow() is deprecated since Python 3.12: build a timezone-aware
    # UTC datetime instead. Imported locally to keep this function standalone.
    from datetime import timezone

    return datetime.now(timezone.utc).date().isoformat()
22+
23+
24+
# Template for the expected-results YAML written as "<name>.yml" next to each
# generated test file: a "what" list naming copyrights and holders, followed
# by empty "copyrights" and "holders" entries.
# NOTE(review): the indentation of the list items under "what:" is reproduced
# as seen in this view -- confirm against the original file.
EMPTY_COPY_TEST = (
    "what:\n"
    "- copyrights\n"
    "- holders\n"
    "copyrights:\n"
    "holders:\n"
)
30+
31+
32+
@click.command()
@click.option(
    "-u",
    "--urls",
    "urls_file",
    type=click.Path(exists=True, readable=True, path_type=str, dir_okay=False),
    metavar="URLS-FILE",
    multiple=False,
    required=True,
    help="Path to URLs file, one per line.",
)
@click.help_option("-h", "--help")
def create_copyright_tests(
    urls_file,
):
    """
    Download the URLs listed in the URLS-FILE and create a copyright test for each in the current
    directory.

    If a line number is provided as a URL fragment #L2, uses only 5 lines before and after this
    line. For a line range fragment such as #L10-L20, the first line number is used.

    If the URL is a plain GitHub URL, convert the URL to a raw URL.
    If the URL does not start with http it is treated as a plain copyright text to test

    For each non-empty input line, write two files: a ".copyright" file with the content to test
    and a ".copyright.yml" file with an empty expected-results template.
    """
    # Number of lines to keep on each side of a "#L<n>" line fragment.
    context_lines = 5

    with open(urls_file, encoding="utf-8") as urls:
        for i, url in enumerate(urls):
            url = url.strip()
            if not url:
                continue

            is_remote = url.startswith("http")

            # Default name, also used for remote URLs that are not on GitHub so
            # that we never try to open an empty file name.
            name = f"copyright-test-{timestamp()}-{i}.copyright"

            if is_remote:
                print(f"Fetching URL: {url}")
                if url.startswith("https://github.com"):
                    url = url.replace("https://github.com", "https://raw.githubusercontent.com")
                    url = url.replace("/blob/", "/")

                if "github" in url:
                    # e.g. https://raw.githubusercontent.com/<org>/<repo>/...
                    segs = url.split("/")
                    if len(segs) > 4:
                        org = segs[3]
                        repo = segs[4]
                        name = f"copyright-test-{timestamp()}-{i}-{org}-{repo}.copyright"
            else:
                print(f"Processing test: {url}")

            start_line = 0
            end_line = 0
            # Only URLs carry a line fragment: a plain text that happens to
            # contain "#L" must not be parsed as a line number.
            if is_remote and "#L" in url:
                _, _, fragment = url.rpartition("#L")
                # Accept a "10-L20" range by keeping its first line number.
                line_text = fragment.split("-", 1)[0]
                if line_text.isdigit():
                    line = int(line_text)
                    # Fragment line numbers are 1-based while the slice below is
                    # 0-based and end-exclusive.
                    start_line = max(line - 1 - context_lines, 0)
                    end_line = line + context_lines

            if is_remote:
                _header, content = get_remote_file_content(url, as_text=True)
            else:
                content = url

            if end_line != 0:
                # Do not strip before splitting: removing leading blank lines
                # would shift the line numbers. Join with newlines to keep the
                # selected lines separate.
                content = "\n".join(content.splitlines()[start_line:end_line])

            with open(name, "w", encoding="utf-8") as out:
                out.write(content)

            yml = EMPTY_COPY_TEST
            if is_remote:
                yml = f"{yml}\nnotes: from {url}\n"

            with open(f"{name}.yml", "w", encoding="utf-8") as out:
                out.write(yml)

            if is_remote:
                # Be gentle with the remote server between two fetches.
                time.sleep(1)
110+
111+
112+
class RemoteNotFetchedException(Exception):
    """
    Raised when fetching a remote URL fails with a non-OK HTTP status.
    """
114+
115+
116+
def get_remote_file_content(
    url,
    as_text=True,
    headers_only=False,
    headers=None,
    _delay=0,
):
    """
    Fetch and return a tuple of (headers, content) at `url`. Return content as a
    text string if `as_text` is True. Otherwise return the content as bytes.

    If `headers_only` is True, return only (headers, None). Headers is a mapping
    of HTTP headers.
    `headers` is an optional mapping of HTTP request headers sent with each
    request, including retries.
    Retries multiple times to fetch if there is a HTTP 429 throttling response
    and this with an increasing delay, starting with a `_delay` in seconds.

    Raise a RemoteNotFetchedException for any other non-OK HTTP status, or when
    still throttled after the delay has reached 20 seconds or more.
    """
    headers = headers or {}
    delay = _delay

    while True:
        time.sleep(delay)
        # using a GET with stream=True ensure we get the final header from
        # several redirects and that we can ignore content there. A HEAD request
        # may not get us this last header
        print(f" DOWNLOADING: {url}")
        with requests.get(url, allow_redirects=True, stream=True, headers=headers) as response:
            status = response.status_code
            if status == requests.codes.ok:  # NOQA
                if headers_only:
                    return response.headers, None

                return response.headers, response.text if as_text else response.content

            if status != 429 or delay >= 20:
                raise RemoteNotFetchedException(f"Failed HTTP request from {url} with {status}")

        # too many requests: retry with an exponential delay. The throttled
        # response is closed at this point and the same request headers are
        # reused for the next attempt.
        delay = (delay * 2) or 1
159+
160+
161+
# Script entry point: run the click command only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    create_copyright_tests()

requirements.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@ chardet==5.0.0
1010
charset-normalizer==2.1.0
1111
click==8.1.7
1212
colorama==0.4.5
13-
commoncode==31.0.3
13+
commoncode==31.2.1
1414
construct==2.10.68
1515
container-inspector==31.1.0
1616
cryptography==42.0.5

0 commit comments

Comments
 (0)