@@ -298,14 +298,13 @@ def detect(self,
298
298
non_holder_labels_mini = frozenset ([
299
299
'COPY' ,
300
300
'YR-RANGE' , 'YR-AND' , 'YR' , 'YR-PLUS' , 'BARE-YR' ,
301
- 'HOLDER' , 'AUTHOR' ,
302
301
'IS' , 'HELD' ,
303
302
])
304
303
305
304
non_authors_labels = frozenset ([
306
305
'COPY' ,
307
306
'YR-RANGE' , 'YR-AND' , 'YR' , 'YR-PLUS' , 'BARE-YR' ,
308
- 'HOLDER' , 'AUTHOR' ,
307
+ 'AUTH' , 'AUTH2' , 'HOLDER' ,
309
308
'IS' , 'HELD' ,
310
309
])
311
310
@@ -322,10 +321,9 @@ def detect(self,
322
321
copyrght = build_detection_from_node (
323
322
node = tree_node ,
324
323
cls = CopyrightDetection ,
325
- ignores = non_copyright_labels ,
324
+ ignored_labels = non_copyright_labels ,
326
325
include_copyright_allrights = include_copyright_allrights ,
327
326
refiner = refine_copyright ,
328
- junk = COPYRIGHTS_JUNK ,
329
327
)
330
328
331
329
if TRACE or TRACE_DEEP :
@@ -340,7 +338,7 @@ def detect(self,
340
338
holder = build_detection_from_node (
341
339
node = tree_node ,
342
340
cls = HolderDetection ,
343
- ignores = non_holder_labels ,
341
+ ignored_labels = non_holder_labels ,
344
342
refiner = refine_holder ,
345
343
)
346
344
@@ -351,7 +349,7 @@ def detect(self,
351
349
holder = build_detection_from_node (
352
350
node = tree_node ,
353
351
cls = HolderDetection ,
354
- ignores = non_holder_labels_mini ,
352
+ ignored_labels = non_holder_labels_mini ,
355
353
refiner = refine_holder ,
356
354
)
357
355
@@ -365,9 +363,8 @@ def detect(self,
365
363
author = build_detection_from_node (
366
364
node = tree_node ,
367
365
cls = AuthorDetection ,
368
- ignores = non_authors_labels ,
366
+ ignored_labels = non_authors_labels ,
369
367
refiner = refine_author ,
370
- junk = AUTHORS_JUNK ,
371
368
)
372
369
373
370
if author :
@@ -385,15 +382,21 @@ def get_tokens(numbered_lines, splitter=re.compile(r'[\t =;]+').split):
385
382
We perform a simple tokenization on spaces, tabs and some punctuation: =;
386
383
"""
387
384
for start_line , line in numbered_lines :
385
+ pos = 0
386
+
388
387
if TRACE_TOK :
389
388
logger_debug (' get_tokens: bare line: ' + repr (line ))
390
389
390
+ # if not line.strip():
391
+ # yield Token(value="\n", label="EMPTY_LINE", start_line=start_line, pos=pos)
392
+ # pos += 1
393
+ # continue
394
+
391
395
line = prepare_text_line (line )
392
396
393
397
if TRACE_TOK :
394
398
logger_debug (' get_tokens: preped line: ' + repr (line ))
395
399
396
- pos = 0
397
400
for tok in splitter (line ):
398
401
# strip trailing quotes+comma
399
402
if tok .endswith ("'," ):
@@ -406,7 +409,7 @@ def get_tokens(numbered_lines, splitter=re.compile(r'[\t =;]+').split):
406
409
.strip ()
407
410
)
408
411
409
- # the tokenizer allows a sinble colon or dot to be atoken and we discard these
412
+ # the tokenizer allows a single colon or dot to be a token and we discard these
410
413
if tok and tok not in ':.' :
411
414
yield Token (value = tok , start_line = start_line , pos = pos )
412
415
pos += 1
@@ -475,42 +478,45 @@ class AuthorDetection(Detection):
475
478
end_line = attr .ib ()
476
479
477
480
481
+ def filter_tokens (node , ignored_labels = frozenset ()):
482
+ """
483
+ Yield tokens for this parse Tree, ignoring nodes with a label in the ``ignored_labels`` set.
484
+ The order reflects the order of the leaves in the tree's hierarchical structure, depth-first.
485
+ """
486
+ for token in node :
487
+ if token .label in ignored_labels :
488
+ continue
489
+ if isinstance (token , Tree ):
490
+ yield from filter_tokens (token , ignored_labels = ignored_labels )
491
+ else :
492
+ yield token
493
+
494
+
478
495
def build_detection_from_node (
479
496
node ,
480
497
cls ,
481
- ignores = frozenset (),
498
+ ignored_labels = frozenset (),
482
499
include_copyright_allrights = False ,
483
500
refiner = None ,
484
- junk = frozenset (),
485
- junk_patterns = frozenset (),
486
501
):
487
502
"""
488
503
Return a ``cls`` Detection object from a pygmars.tree.Tree ``node`` with a
489
504
space-normalized string value or None.
490
505
491
- Filter ``node`` Tokens with a type found in the ``ignores `` set of ignorable
506
+ Filter ``node`` Tokens with a type found in the ``ignored_labels `` set of ignorable
492
507
token types.
493
508
494
509
For copyright detection, include trailing "All rights reserved" if
495
510
``include_copyright_allrights`` is True.
496
511
497
512
Apply the ``refiner`` callable function to the detection string.
498
-
499
- Return None if the value exists in the ``junk`` strings set or is matched by
500
- any of the regex in the ``junk_patterns`` set.
501
513
"""
502
514
include_copyright_allrights = (
503
515
cls == CopyrightDetection
504
516
and include_copyright_allrights
505
517
)
506
518
507
- if ignores :
508
- leaves = [
509
- token for token in node .leaves ()
510
- if token .label not in ignores
511
- ]
512
- else :
513
- leaves = node .leaves ()
519
+ leaves = list (filter_tokens (node , ignored_labels = ignored_labels ))
514
520
515
521
if include_copyright_allrights :
516
522
filtered = leaves
@@ -545,7 +551,7 @@ def build_detection_from_node(
545
551
if refiner :
546
552
node_string = refiner (node_string )
547
553
548
- if node_string and not is_junk_copyryright (node_string ):
554
+ if node_string and not is_junk_copyright (node_string ):
549
555
start_line = filtered [0 ].start_line
550
556
end_line = filtered [- 1 ].start_line
551
557
@@ -1370,6 +1376,8 @@ def build_detection_from_node(
1370
1376
(r'^Bugfixes?$' , 'NN' ),
1371
1377
(r'^Likes?$' , 'NN' ),
1372
1378
(r'^STA$' , 'NN' ),
1379
+ (r'^Page$' , 'NN' ),
1380
+ (r'^Todo/Under$' , 'JUNK' ),
1373
1381
1374
1382
(r'^Interrupt$' , 'NN' ),
1375
1383
(r'^cleanups?$' , 'JUNK' ),
@@ -2071,7 +2079,7 @@ def build_detection_from_node(
2071
2079
(r'^\$?date-of-document$' , 'YR' ),
2072
2080
2073
2081
# cardinal numbers
2074
- (r'^-?[0-9]+(.[0-9]+)?\. ?$' , 'CD' ),
2082
+ (r'^-?[0-9]+(.[0-9]+)?[\.,] ?$' , 'CD' ),
2075
2083
2076
2084
############################################################################
2077
2085
# All caps and proper nouns
@@ -2239,6 +2247,8 @@ def build_detection_from_node(
2239
2247
YR-RANGE: {<YR-AND>+} #70
2240
2248
YR-RANGE: {<YR-RANGE>+ <DASH|TO> <YR-RANGE>+} #71
2241
2249
YR-RANGE: {<YR-RANGE>+ <DASH>?} #72
2250
+ # Copyright (c) 1999, 2000, 01, 03, 06 Ralf Baechle
2251
+ YR-RANGE: {<YR-RANGE> <CD>+} #72.2
2242
2252
2243
2253
CD: {<BARE-YR>} #bareyear
2244
2254
@@ -3178,7 +3188,7 @@ def build_detection_from_node(
3178
3188
# the Initial Developer. All Rights Reserved.
3179
3189
COPYRIGHT: {<PORTIONS> <AUTH2> <INITIALDEV> <IS> <COPY|COPYRIGHT2>+ <YR-RANGE>? <INITIALDEV>} #2609.1
3180
3190
3181
- # Portions created by the Initial Developer are Copyright (C)
3191
+ # Portions created by the Initial Developer are Copyright (C)
3182
3192
# the Initial Developer. All Rights Reserved.
3183
3193
# and
3184
3194
# Portions created by the Initial Developer are Copyright (C) 2002
@@ -3576,7 +3586,7 @@ def refine_names(s, prefixes):
3576
3586
COPYRIGHTS_JUNK_PATTERN_MATCHERS = [re .compile (p , re .IGNORECASE ).match for p in COPYRIGHTS_JUNK ]
3577
3587
3578
3588
3579
- def is_junk_copyryright (s , patterns = COPYRIGHTS_JUNK_PATTERN_MATCHERS ):
3589
+ def is_junk_copyright (s , patterns = COPYRIGHTS_JUNK_PATTERN_MATCHERS ):
3580
3590
"""
3581
3591
Return True if the string ``s`` matches any junk patterns.
3582
3592
"""
0 commit comments