Skip to content

Commit b56b961

Browse files
committed
Improve copyrights more
Signed-off-by: Philippe Ombredanne <pombredanne@nexb.com>
1 parent 59e4121 commit b56b961

File tree

33 files changed

+149
-72
lines changed

33 files changed

+149
-72
lines changed

src/cluecode/copyrights.py

Lines changed: 38 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -298,14 +298,13 @@ def detect(self,
298298
non_holder_labels_mini = frozenset([
299299
'COPY',
300300
'YR-RANGE', 'YR-AND', 'YR', 'YR-PLUS', 'BARE-YR',
301-
'HOLDER', 'AUTHOR',
302301
'IS', 'HELD',
303302
])
304303

305304
non_authors_labels = frozenset([
306305
'COPY',
307306
'YR-RANGE', 'YR-AND', 'YR', 'YR-PLUS', 'BARE-YR',
308-
'HOLDER', 'AUTHOR',
307+
'AUTH', 'AUTH2', 'HOLDER',
309308
'IS', 'HELD',
310309
])
311310

@@ -322,10 +321,9 @@ def detect(self,
322321
copyrght = build_detection_from_node(
323322
node=tree_node,
324323
cls=CopyrightDetection,
325-
ignores=non_copyright_labels,
324+
ignored_labels=non_copyright_labels,
326325
include_copyright_allrights=include_copyright_allrights,
327326
refiner=refine_copyright,
328-
junk=COPYRIGHTS_JUNK,
329327
)
330328

331329
if TRACE or TRACE_DEEP:
@@ -340,7 +338,7 @@ def detect(self,
340338
holder = build_detection_from_node(
341339
node=tree_node,
342340
cls=HolderDetection,
343-
ignores=non_holder_labels,
341+
ignored_labels=non_holder_labels,
344342
refiner=refine_holder,
345343
)
346344

@@ -351,7 +349,7 @@ def detect(self,
351349
holder = build_detection_from_node(
352350
node=tree_node,
353351
cls=HolderDetection,
354-
ignores=non_holder_labels_mini,
352+
ignored_labels=non_holder_labels_mini,
355353
refiner=refine_holder,
356354
)
357355

@@ -365,9 +363,8 @@ def detect(self,
365363
author = build_detection_from_node(
366364
node=tree_node,
367365
cls=AuthorDetection,
368-
ignores=non_authors_labels,
366+
ignored_labels=non_authors_labels,
369367
refiner=refine_author,
370-
junk=AUTHORS_JUNK,
371368
)
372369

373370
if author:
@@ -385,15 +382,21 @@ def get_tokens(numbered_lines, splitter=re.compile(r'[\t =;]+').split):
385382
We perform a simple tokenization on spaces, tabs and some punctuation: =;
386383
"""
387384
for start_line, line in numbered_lines:
385+
pos = 0
386+
388387
if TRACE_TOK:
389388
logger_debug(' get_tokens: bare line: ' + repr(line))
390389

390+
# if not line.strip():
391+
# yield Token(value="\n", label="EMPTY_LINE", start_line=start_line, pos=pos)
392+
# pos += 1
393+
# continue
394+
391395
line = prepare_text_line(line)
392396

393397
if TRACE_TOK:
394398
logger_debug(' get_tokens: preped line: ' + repr(line))
395399

396-
pos = 0
397400
for tok in splitter(line):
398401
# strip trailing quotes+comma
399402
if tok.endswith("',"):
@@ -406,7 +409,7 @@ def get_tokens(numbered_lines, splitter=re.compile(r'[\t =;]+').split):
406409
.strip()
407410
)
408411

409-
# the tokenizer allows a sinble colon or dot to be atoken and we discard these
412+
# the tokenizer allows a single colon or dot to be a token and we discard these
410413
if tok and tok not in ':.':
411414
yield Token(value=tok, start_line=start_line, pos=pos)
412415
pos += 1
@@ -475,42 +478,45 @@ class AuthorDetection(Detection):
475478
end_line = attr.ib()
476479

477480

481+
def filter_tokens(node, ignored_labels=frozenset()):
482+
"""
483+
Yield the tokens of the parse tree ``node``, skipping any token or subtree whose label is in the ``ignored_labels`` set.
484+
The order reflects the left-to-right order of the leaves in the tree's hierarchical structure, depth-first.
485+
"""
486+
for token in node:
487+
if token.label in ignored_labels:
488+
continue
489+
if isinstance(token, Tree):
490+
yield from filter_tokens(token, ignored_labels=ignored_labels)
491+
else:
492+
yield token
493+
494+
478495
def build_detection_from_node(
479496
node,
480497
cls,
481-
ignores=frozenset(),
498+
ignored_labels=frozenset(),
482499
include_copyright_allrights=False,
483500
refiner=None,
484-
junk=frozenset(),
485-
junk_patterns=frozenset(),
486501
):
487502
"""
488503
Return a ``cls`` Detection object from a pygmars.tree.Tree ``node`` with a
489504
space-normalized string value or None.
490505
491-
Filter ``node`` Tokens with a type found in the ``ignores`` set of ignorable
506+
Filter ``node`` Tokens with a type found in the ``ignored_labels`` set of ignorable
492507
token types.
493508
494509
For copyright detection, include trailing "All rights reserved" if
495510
``include_copyright_allrights`` is True.
496511
497512
Apply the ``refiner`` callable function to the detection string.
498-
499-
Return None if the value exists in the ``junk`` strings set or is matched by
500-
any of the regex in the ``junk_patterns`` set.
501513
"""
502514
include_copyright_allrights = (
503515
cls == CopyrightDetection
504516
and include_copyright_allrights
505517
)
506518

507-
if ignores:
508-
leaves = [
509-
token for token in node.leaves()
510-
if token.label not in ignores
511-
]
512-
else:
513-
leaves = node.leaves()
519+
leaves = list(filter_tokens(node, ignored_labels=ignored_labels))
514520

515521
if include_copyright_allrights:
516522
filtered = leaves
@@ -545,7 +551,7 @@ def build_detection_from_node(
545551
if refiner:
546552
node_string = refiner(node_string)
547553

548-
if node_string and not is_junk_copyryright(node_string):
554+
if node_string and not is_junk_copyright(node_string):
549555
start_line = filtered[0].start_line
550556
end_line = filtered[-1].start_line
551557

@@ -1370,6 +1376,8 @@ def build_detection_from_node(
13701376
(r'^Bugfixes?$', 'NN'),
13711377
(r'^Likes?$', 'NN'),
13721378
(r'^STA$', 'NN'),
1379+
(r'^Page$', 'NN'),
1380+
(r'^Todo/Under$', 'JUNK'),
13731381

13741382
(r'^Interrupt$', 'NN'),
13751383
(r'^cleanups?$', 'JUNK'),
@@ -2071,7 +2079,7 @@ def build_detection_from_node(
20712079
(r'^\$?date-of-document$', 'YR'),
20722080

20732081
# cardinal numbers
2074-
(r'^-?[0-9]+(.[0-9]+)?\.?$', 'CD'),
2082+
(r'^-?[0-9]+(.[0-9]+)?[\.,]?$', 'CD'),
20752083

20762084
############################################################################
20772085
# All caps and proper nouns
@@ -2239,6 +2247,8 @@ def build_detection_from_node(
22392247
YR-RANGE: {<YR-AND>+} #70
22402248
YR-RANGE: {<YR-RANGE>+ <DASH|TO> <YR-RANGE>+} #71
22412249
YR-RANGE: {<YR-RANGE>+ <DASH>?} #72
2250+
# Copyright (c) 1999, 2000, 01, 03, 06 Ralf Baechle
2251+
YR-RANGE: {<YR-RANGE> <CD>+} #72.2
22422252
22432253
CD: {<BARE-YR>} #bareyear
22442254
@@ -3178,7 +3188,7 @@ def build_detection_from_node(
31783188
# the Initial Developer. All Rights Reserved.
31793189
COPYRIGHT: {<PORTIONS> <AUTH2> <INITIALDEV> <IS> <COPY|COPYRIGHT2>+ <YR-RANGE>? <INITIALDEV>} #2609.1
31803190
3181-
# Portions created by the Initial Developer are Copyright (C)
3191+
# Portions created by the Initial Developer are Copyright (C)
31823192
# the Initial Developer. All Rights Reserved.
31833193
# and
31843194
# Portions created by the Initial Developer are Copyright (C) 2002
@@ -3576,7 +3586,7 @@ def refine_names(s, prefixes):
35763586
COPYRIGHTS_JUNK_PATTERN_MATCHERS = [re.compile(p, re.IGNORECASE).match for p in COPYRIGHTS_JUNK]
35773587

35783588

3579-
def is_junk_copyryright(s, patterns=COPYRIGHTS_JUNK_PATTERN_MATCHERS):
3589+
def is_junk_copyright(s, patterns=COPYRIGHTS_JUNK_PATTERN_MATCHERS):
35803590
"""
35813591
Return True if the string ``s`` matches any junk patterns.
35823592
"""

tests/cluecode/data/authors/author_russ_c-c.c.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,5 +8,5 @@ copyrights:
88
- Vladimir Oleynik <dzo@simtreas.ru> (c) 2003
99
holders:
1010
- Russ Dill
11-
- Vladimir Oleynik
11+
- Vladimir Oleynik <dzo@simtreas.ru>
1212
notes: these are detected as copyrights, not authors

tests/cluecode/data/copyright_fossology/testdata100.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@ copyrights:
1010
- Copyright (c) 1997 Doug Muth, Wescosville, Pennsylvania USA
1111
holders:
1212
- Eric S. Raymond
13-
- Carl E. Harris,
13+
- Carl E. Harris
1414
- Andrew Tridgell
1515
- George M. Sipe
1616
- Doug Muth, Wescosville, Pennsylvania USA

tests/cluecode/data/copyrights/ChangeLog.other.yml

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,4 +3,9 @@ what:
33
- holders
44
- holders_summary
55
copyrights:
6-
- copyright 2008-01-26
6+
- copyright 2008-01-26 11:46 vruppert
7+
holders:
8+
- vruppert
9+
holders_summary:
10+
- value: vruppert
11+
count: 1

tests/cluecode/data/copyrights/copytest/mixed_years_and_names.txt.yml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -6,9 +6,9 @@ what:
66
copyrights:
77
- Copyright (c) 2004 - 2007 Nigel McNie, 2007 - 2008 Benny Baumann
88
holders:
9-
- Nigel McNie, - Benny Baumann
9+
- Nigel McNie, Benny Baumann
1010
authors:
1111
- Nigel McNie, Benny Baumann
1212
holders_summary:
13-
- value: Nigel McNie, - Benny Baumann
13+
- value: Nigel McNie, Benny Baumann
1414
count: 1

tests/cluecode/data/copyrights/copytest/mixed_years_no_auth.txt.yml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@ what:
55
copyrights:
66
- Copyright (c) 2004 - 2007 Nigel McNie, 2007 - 2008 Benny Baumann
77
holders:
8-
- Nigel McNie, - Benny Baumann
8+
- Nigel McNie, Benny Baumann
99
holders_summary:
10-
- value: Nigel McNie, - Benny Baumann
10+
- value: Nigel McNie, Benny Baumann
1111
count: 1

tests/cluecode/data/copyrights/finnish.c.yml

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -3,9 +3,9 @@ what:
33
- holders
44
- holders_summary
55
copyrights:
6-
- (c) 2018 Nordea Bank Abp, Satamaradankatu 5, FI-00020 NORDEA, Helsinki
6+
- (c) 2018 Nordea Bank Abp, Satamaradankatu
77
holders:
8-
- Nordea Bank Abp, Satamaradankatu 5, FI-00020 NORDEA, Helsinki
8+
- Nordea Bank Abp, Satamaradankatu
99
holders_summary:
10-
- value: Nordea Bank Abp, Satamaradankatu 5, FI-00020 NORDEA, Helsinki
10+
- value: Nordea Bank Abp, Satamaradankatu
1111
count: 1

tests/cluecode/data/copyrights/math.c.yml

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -12,16 +12,16 @@ holders:
1212
- The Regents of the University of California
1313
- Herbert Xu
1414
- Aaron Lehmann
15-
- Paul Mundt
16-
- Vladimir Oleynik
15+
- Paul Mundt <lethal@linux-sh.org>
16+
- Vladimir Oleynik <dzo@simtreas.ru>
1717
holders_summary:
1818
- value: Aaron Lehmann
1919
count: 1
2020
- value: Herbert Xu
2121
count: 1
22-
- value: Paul Mundt
22+
- value: Paul Mundt <lethal@linux-sh.org>
2323
count: 1
2424
- value: The Regents of the University of California
2525
count: 1
26-
- value: Vladimir Oleynik
26+
- value: Vladimir Oleynik <dzo@simtreas.ru>
2727
count: 1

tests/cluecode/data/copyrights/misco2/jan-17.txt.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,4 +4,4 @@ what:
44
- holders_summary
55
- authors
66
authors:
7-
- Bela Ban Jan 17
7+
- Bela Ban Jan

tests/cluecode/data/copyrights/misco2/nick.txt.yml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@ what:
66
copyrights:
77
- Copyright (c) 2005, 2006 Nick Galbreath - nickg at modp dot com
88
holders:
9-
- Nick Galbreath - nickg at modp dot com
9+
- Nick Galbreath
1010
holders_summary:
11-
- value: Nick Galbreath - nickg at modp dot com
11+
- value: Nick Galbreath
1212
count: 1

0 commit comments

Comments
 (0)