14 | 14 | import unicodedata
15 | 15 |
16 | 16 | import chardet
   | 17 | +import typecode
17 | 18 |
18 | 19 | from textcode import pdf
19 | 20 | from textcode import markup
20 | 21 | from textcode import sfdb
21 | 22 | from textcode import strings
22 |    | -import typecode
23 | 23 |
24 | 24 | """
25 | 25 | Utilities to analyze text. Files are the input.
@@ -66,7 +66,7 @@ def numbered_text_lines(
66 | 66 |     markup and cleanup this markup.
67 | 67 |
68 | 68 |     If `plain_text` is True treat the file as a plain text file and do not
69 |    | -    attempt to detect its type and extract it's content with special procedures.
   | 69 | +    attempt to detect its type and extract its content with special procedures.
70 | 70 |     This is used mostly when loading license texts and rules.
71 | 71 |
72 | 72 |     Note: For testing or building from strings, location can be a list of
@@ -115,34 +115,42 @@ def numbered_text_lines(
115 | 115 |     # lightweight markup stripping support
116 | 116 |     if demarkup and markup.is_markup(location):
117 | 117 |         try:
118 |     | -            lines = list(enumerate(markup.demarkup(location), start_line))
    | 118 | +            numbered_lines = list(enumerate(markup.demarkup(location), start_line))
119 | 119 |             if TRACE:
120 | 120 |                 logger_debug('numbered_text_lines:', 'demarkup')
121 |     | -            return lines
    | 121 | +            return numbered_lines
122 | 122 |         except:
123 | 123 |             # try again later as plain text
124 | 124 |             pass
125 | 125 |
126 | 126 |     if T.is_js_map:
127 | 127 |         try:
128 |     | -            lines = list(enumerate(js_map_sources_lines(location), start_line))
    | 128 | +            numbered_lines = list(enumerate(js_map_sources_lines(location), start_line))
129 | 129 |             if TRACE:
130 | 130 |                 logger_debug('numbered_text_lines:', 'js_map')
131 |     | -            return lines
    | 131 | +            return numbered_lines
132 | 132 |         except:
133 | 133 |             # try again later as plain text otherwise
134 | 134 |             pass
135 | 135 |
136 | 136 |     if T.is_text:
137 |     | -        numbered_lines = enumerate(unicode_text_lines(location), start_line)
    | 137 | +        lines = unicode_text_lines(location=location, decrlf=is_source(location))
    | 138 | +        numbered_lines = enumerate(lines, start_line)
    | 139 | +
138 | 140 |         # text with very long lines such as minified JS, JS map files or large JSON
139 |     | -        if (not location.endswith('package.json')
140 |     | -            and (T.is_text_with_long_lines or T.is_compact_js
141 |     | -            or T.filetype_file == 'data' or 'locale' in location)):
    | 141 | +        if (
    | 142 | +            not location.endswith('package.json')
    | 143 | +            and (
    | 144 | +                T.is_text_with_long_lines or T.is_compact_js
    | 145 | +                or T.filetype_file == 'data' or 'locale' in location
    | 146 | +            )
    | 147 | +        ):
142 | 148 |
143 | 149 |             numbered_lines = break_numbered_unicode_text_lines(numbered_lines)
    | 150 | +
144 | 151 |             if TRACE:
145 | 152 |                 logger_debug('numbered_text_lines:', 'break_numbered_unicode_text_lines')
    | 153 | +
146 | 154 |         return numbered_lines
147 | 155 |
148 | 156 | # TODO: handle Office-like documents, RTF, etc
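
The `numbered_lines` built in the `T.is_text` branch above are just the lazy `(line number, text)` pairs produced by `enumerate(lines, start_line)`. A minimal, self-contained sketch of that pairing, using an invented list of lines in place of a real file (the docstring note earlier in this diff allows a list of strings for testing) and assuming `start_line=1`:

# Sketch only: the (line_number, text) pairs that enumerate(lines, start_line) yields.
lines = ['first line\n', 'second line\n', 'third line\n']
start_line = 1

numbered_lines = list(enumerate(lines, start_line))
print(numbered_lines)
# [(1, 'first line\n'), (2, 'second line\n'), (3, 'third line\n')]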
|
@@ -171,7 +179,7 @@ def unicode_text_lines_from_binary(location):
|
171 | 179 | T = typecode.get_type(location)
|
172 | 180 | if T.contains_text:
|
173 | 181 | for line in strings.strings_from_file(location):
|
174 |
| - yield line |
| 182 | + yield remove_verbatim_cr_lf_tab_chars(line) |
175 | 183 |
|
176 | 184 |
|
177 | 185 | def unicode_text_lines_from_pdf(location):
|
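
The hunk above now routes every string extracted from a binary through `remove_verbatim_cr_lf_tab_chars`. Its body (shown unchanged later in this diff) turns literal, escaped line endings and tabs into spaces. A standalone sketch of that effect, with a made-up input string:

# Sketch only: mirrors the replacement shown later in this diff; literal
# (escaped) \r, \n and \t two-character sequences become single spaces.
def remove_verbatim_cr_lf_tab_chars(s):
    return s.replace('\\r', ' ').replace('\\n', ' ').replace('\\t', ' ')

extracted = 'Copyright (c) 2021\\nAcme Inc.\\tAll rights reserved.'
print(remove_verbatim_cr_lf_tab_chars(extracted))
# Copyright (c) 2021 Acme Inc. All rights reserved.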
@@ -228,8 +236,9 @@ def js_map_sources_lines(location):
228 | 236 |         content = json.load(jsm)
229 | 237 |     sources = content.get('sourcesContent', [])
230 | 238 |     for entry in sources:
    | 239 | +        entry = replace_verbatim_cr_lf_chars(entry)
231 | 240 |         for line in entry.splitlines():
232 |     | -            yield line
    | 241 | +            yield remove_verbatim_cr_lf_tab_chars(line)
233 | 242 |
234 | 243 |
235 | 244 | def as_unicode(line):
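
For context, `js_map_sources_lines` reads the original sources embedded in a source map's `sourcesContent` array; the `replace_verbatim_cr_lf_chars` call added above (defined in the next hunk) converts escaped line endings carried inside those entries into real newlines before splitting. A self-contained sketch with an invented source map:

import json

# Sketch only, with an invented map: each 'sourcesContent' entry holds a whole
# original source file; splitlines() then yields its individual lines.
source_map = json.loads('''
{
  "version": 3,
  "sources": ["hello.js"],
  "sourcesContent": ["// demo source\\nconsole.log('hi');\\n"]
}
''')

for entry in source_map.get('sourcesContent', []):
    for line in entry.splitlines():
        print(line)
# // demo source
# console.log('hi');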
@@ -285,26 +296,124 @@ def remove_verbatim_cr_lf_tab_chars(s):
285 | 296 |     Return a string replacing by a space any verbatim but escaped line endings
286 | 297 |     and tabs (such as a literal \n or \r \t).
287 | 298 |     """
288 |     | -    if not s:
289 |     | -        return s
290 | 299 |     return s.replace('\\r', ' ').replace('\\n', ' ').replace('\\t', ' ')
291 | 300 |
292 | 301 |
293 |     | -def unicode_text_lines(location):
    | 302 | +def replace_verbatim_cr_lf_chars(s):
    | 303 | +    """
    | 304 | +    Return a string replacing any verbatim but escaped line endings (such as
    | 305 | +    a literal \n or \r) with an LF.
    | 306 | +    """
    | 307 | +    return (s
    | 308 | +        .replace('\\\\r\\\\n', '\n')
    | 309 | +        .replace('\\r\\n', '\n')
    | 310 | +        .replace('\\\\r', '\n')
    | 311 | +        .replace('\\\\n', '\n')
    | 312 | +        .replace('\\r', '\n')
    | 313 | +        .replace('\\n', '\n')
    | 314 | +    )
    | 315 | +
    | 316 | +
    | 317 | +def unicode_text_lines(location, decrlf=False):
294 | 318 |     """
295 |     | -    Return an iterable over unicode text lines from a file at `location` if it
296 |     | -    contains text. Open the file as binary with universal new lines then try to
297 |     | -    decode each line as Unicode.
    | 319 | +    Return an iterable of unicode text lines from a file at ``location`` if it
    | 320 | +    contains text.
    | 321 | +
    | 322 | +    Open the file as binary then try to decode each line as Unicode.
    | 323 | +    Remove verbatim escaped CR, LF and tab characters if ``decrlf`` is True.
298 | 324 |     """
    | 325 | +    lines = _unicode_text_lines(location)
    | 326 | +    if decrlf:
    | 327 | +        return map(remove_verbatim_cr_lf_tab_chars, lines)
    | 328 | +    else:
    | 329 | +        return lines
    | 330 | +
    | 331 | +
    | 332 | +def _unicode_text_lines(location):
299 | 333 |     with open(location, 'rb') as f:
300 | 334 |         for line in f.read().splitlines(True):
301 |     | -            yield remove_verbatim_cr_lf_tab_chars(as_unicode(line))
    | 335 | +            yield as_unicode(line)
302 | 336 |
303 | 337 |
304 |     | -def unicode_text(location):
    | 338 | +def unicode_text(location, decrlf=False):
305 | 339 |     """
306 | 340 |     Return a string guaranteed to be unicode from the content of the file at
307 | 341 |     location. The whole file content is returned at once, which may be a
308 | 342 |     problem for very large files.
309 | 343 |     """
310 |     | -    return u' '.join(unicode_text_lines(location))
    | 344 | +    return u' '.join(unicode_text_lines(location, decrlf=decrlf))
    | 345 | +
    | 346 | +
    | 347 | +def is_source(location):
    | 348 | +    """
    | 349 | +    Return True if the file at location is source code, based on its file
    | 350 | +    extension.
    | 351 | +    """
    | 352 | +    return location.endswith((
    | 353 | +        '.ada',
    | 354 | +        '.adb',
    | 355 | +        '.asm',
    | 356 | +        '.asp',
    | 357 | +        '.aj',
    | 358 | +        '.bas',
    | 359 | +        '.bat',
    | 360 | +        '.c',
    | 361 | +        '.c++',
    | 362 | +        '.cc',
    | 363 | +        '.clj',
    | 364 | +        '.cob',
    | 365 | +        '.cpp',
    | 366 | +        '.cs',
    | 367 | +        '.csh',
    | 368 | +        '.csx',
    | 369 | +        '.cxx',
    | 370 | +        '.d',
    | 371 | +        '.e',
    | 372 | +        '.el',
    | 373 | +        '.f',
    | 374 | +        '.fs',
    | 375 | +        '.f77',
    | 376 | +        '.f90',
    | 377 | +        '.for',
    | 378 | +        '.fth',
    | 379 | +        '.ftn',
    | 380 | +        '.go',
    | 381 | +        '.h',
    | 382 | +        '.hh',
    | 383 | +        '.hpp',
    | 384 | +        '.hs',
    | 385 | +        '.html',
    | 386 | +        '.htm',
    | 387 | +        '.hxx',
    | 388 | +        '.java',
    | 389 | +        '.js',
    | 390 | +        '.jsx',
    | 391 | +        '.jsp',
    | 392 | +        '.ksh',
    | 393 | +        '.kt',
    | 394 | +        '.lisp',
    | 395 | +        '.lua',
    | 396 | +        '.m',
    | 397 | +        '.m4',
    | 398 | +        '.nim',
    | 399 | +        '.pas',
    | 400 | +        '.php',
    | 401 | +        '.pl',
    | 402 | +        '.pp',
    | 403 | +        '.ps1',
    | 404 | +        '.py',
    | 405 | +        '.r',
    | 406 | +        '.rb',
    | 407 | +        '.ruby',
    | 408 | +        '.rs',
    | 409 | +        '.s',
    | 410 | +        '.scala',
    | 411 | +        '.sh',
    | 412 | +        '.swift',
    | 413 | +        '.ts',
    | 414 | +        '.vhdl',
    | 415 | +        '.verilog',
    | 416 | +        '.vb',
    | 417 | +        '.groovy',
    | 418 | +        '.po',
    | 419 | +    ))
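
A hedged usage sketch of how the new pieces fit together in `numbered_text_lines` above: `decrlf` is only enabled when `is_source()` says the path looks like source code, so escaped line endings are stripped from source files but left untouched in other text. The paths are hypothetical and `is_source` is abbreviated here to a few of the extensions listed above:

# Sketch only: abbreviated is_source and hypothetical paths showing when
# decrlf would be turned on by numbered_text_lines.
def is_source(location):
    return location.endswith(('.c', '.py', '.js', '.java', '.go', '.rs'))

for path in ('src/parser.py', 'docs/NOTICE.txt'):
    print(path, '-> decrlf =', is_source(path))
# src/parser.py -> decrlf = True
# docs/NOTICE.txt -> decrlf = False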