Skip to content

Commit ec50bd2

Browse files
committed
Improve rule text analysis #3067
Only convert verbatim CR/LF in code-like files. Also unicode_text_lines now accepts a new "decrlf" flag to remove these optionally. Signed-off-by: Philippe Ombredanne <pombredanne@nexb.com>
1 parent 2b4e561 commit ec50bd2

File tree

6 files changed

+264
-29
lines changed

6 files changed

+264
-29
lines changed

src/textcode/analysis.py

Lines changed: 130 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -14,12 +14,12 @@
1414
import unicodedata
1515

1616
import chardet
17+
import typecode
1718

1819
from textcode import pdf
1920
from textcode import markup
2021
from textcode import sfdb
2122
from textcode import strings
22-
import typecode
2323

2424
"""
2525
Utilities to analyze text. Files are the input.
@@ -66,7 +66,7 @@ def numbered_text_lines(
6666
markup and cleanup this markup.
6767
6868
If `plain_text` is True treat the file as a plain text file and do not
69-
attempt to detect its type and extract it's content with special procedures.
69+
attempt to detect its type and extract its content with special procedures.
7070
This is used mostly when loading license texts and rules.
7171
7272
Note: For testing or building from strings, location can be a list of
@@ -115,34 +115,42 @@ def numbered_text_lines(
115115
# lightweight markup stripping support
116116
if demarkup and markup.is_markup(location):
117117
try:
118-
lines = list(enumerate(markup.demarkup(location), start_line))
118+
numbered_lines = list(enumerate(markup.demarkup(location), start_line))
119119
if TRACE:
120120
logger_debug('numbered_text_lines:', 'demarkup')
121-
return lines
121+
return numbered_lines
122122
except:
123123
# try again later as plain text
124124
pass
125125

126126
if T.is_js_map:
127127
try:
128-
lines = list(enumerate(js_map_sources_lines(location), start_line))
128+
numbered_lines = list(enumerate(js_map_sources_lines(location), start_line))
129129
if TRACE:
130130
logger_debug('numbered_text_lines:', 'js_map')
131-
return lines
131+
return numbered_lines
132132
except:
133133
# try again later as plain text otherwise
134134
pass
135135

136136
if T.is_text:
137-
numbered_lines = enumerate(unicode_text_lines(location), start_line)
137+
lines = unicode_text_lines(location=location, decrlf=is_source(location))
138+
numbered_lines = enumerate(lines, start_line)
139+
138140
# text with very long lines such as minified JS, JS map files or large JSON
139-
if (not location.endswith('package.json')
140-
and (T.is_text_with_long_lines or T.is_compact_js
141-
or T.filetype_file == 'data' or 'locale' in location)):
141+
if (
142+
not location.endswith('package.json')
143+
and (
144+
T.is_text_with_long_lines or T.is_compact_js
145+
or T.filetype_file == 'data' or 'locale' in location
146+
)
147+
):
142148

143149
numbered_lines = break_numbered_unicode_text_lines(numbered_lines)
150+
144151
if TRACE:
145152
logger_debug('numbered_text_lines:', 'break_numbered_unicode_text_lines')
153+
146154
return numbered_lines
147155

148156
# TODO: handle Office-like documents, RTF, etc
@@ -171,7 +179,7 @@ def unicode_text_lines_from_binary(location):
171179
T = typecode.get_type(location)
172180
if T.contains_text:
173181
for line in strings.strings_from_file(location):
174-
yield line
182+
yield remove_verbatim_cr_lf_tab_chars(line)
175183

176184

177185
def unicode_text_lines_from_pdf(location):
@@ -228,8 +236,11 @@ def js_map_sources_lines(location):
228236
content = json.load(jsm)
229237
sources = content.get('sourcesContent', [])
230238
for entry in sources:
239+
entry = replace_verbatim_cr_lf_chars(entry)
231240
for line in entry.splitlines():
232-
yield line
241+
l = remove_verbatim_cr_lf_tab_chars(line)
242+
print(repr(l), l)
243+
yield l
233244

234245

235246
def as_unicode(line):
@@ -285,26 +296,124 @@ def remove_verbatim_cr_lf_tab_chars(s):
285296
Return a string replacing by a space any verbatim but escaped line endings
286297
and tabs (such as a literal \n, \r or \t).
287298
"""
288-
if not s:
289-
return s
290299
return s.replace('\\r', ' ').replace('\\n', ' ').replace('\\t', ' ')
291300

292301

293-
def unicode_text_lines(location):
302+
def replace_verbatim_cr_lf_chars(s):
    r"""
    Return the ``s`` string where any verbatim but escaped line ending (such
    as a literal \r\n, \r or \n, written with either a single or a doubled
    backslash) is replaced by a real LF character.

    Escaped tabs (a literal \t) and real CR or LF characters are left
    unchanged. Return ``s`` as-is if it is empty or None.
    """
    if not s:
        return s

    # Order matters: the doubled-backslash and the CR+LF forms are replaced
    # first so that the shorter patterns cannot match inside of them and leave
    # a stray backslash behind or emit two LF for a single CR+LF pair.
    return (s
        .replace('\\\\r\\\\n', '\n')
        .replace('\\r\\n', '\n')
        .replace('\\\\r', '\n')
        .replace('\\\\n', '\n')
        .replace('\\r', '\n')
        .replace('\\n', '\n')
    )
315+
316+
317+
def unicode_text_lines(location, decrlf=False):
    """
    Return an iterable of unicode text lines from the file at ``location``.

    The lines come from ``_unicode_text_lines()``. If ``decrlf`` is True, each
    line is also passed through ``remove_verbatim_cr_lf_tab_chars()`` to
    replace verbatim, escaped CR, LF and tabs by a space.
    """
    text_lines = _unicode_text_lines(location)
    if not decrlf:
        return text_lines
    return map(remove_verbatim_cr_lf_tab_chars, text_lines)
330+
331+
332+
def _unicode_text_lines(location):
    """
    Yield text lines from the file at ``location``, keeping line endings.

    The file is opened as binary and read at once. Each line is then passed
    through ``as_unicode()`` before being yielded.
    """
    # The whole content is read in one call: close the file right away rather
    # than keeping the handle open for as long as the caller holds on to this
    # generator (which may never be exhausted).
    with open(location, 'rb') as f:
        content = f.read()

    for line in content.splitlines(True):
        yield as_unicode(line)
302336

303337

304-
def unicode_text(location):
338+
def unicode_text(location, decrlf=False):
    """
    Return the content of the file at ``location`` as a single string, built
    by joining with a space the lines obtained from ``unicode_text_lines()``.
    ``decrlf`` is passed as-is to ``unicode_text_lines()``.

    The whole file content is returned at once, which may be a problem for
    very large files.
    """
    text_lines = unicode_text_lines(location, decrlf=decrlf)
    return u' '.join(text_lines)
345+
346+
347+
def is_source(location):
    """
    Return True if the file at the ``location`` path string is source code,
    based only on its file extension. The file content is not inspected.

    The extension is matched ignoring case, such that ``boot.S`` or ``FOO.C``
    are treated the same way as ``boot.s`` or ``foo.c``.
    """
    # Lowercase once so that the lowercase suffixes below also match names
    # using uppercase or mixed case extensions.
    return location.lower().endswith((
        '.ada',
        '.adb',
        '.asm',
        '.asp',
        '.aj',
        '.bas',
        '.bat',
        '.c',
        '.c++',
        '.cc',
        '.clj',
        '.cob',
        '.cpp',
        '.cs',
        '.csh',
        '.csx',
        '.cxx',
        '.d',
        '.e',
        '.el',
        '.f',
        '.fs',
        '.f77',
        '.f90',
        '.for',
        '.fth',
        '.ftn',
        '.go',
        '.h',
        '.hh',
        '.hpp',
        '.hs',
        '.html',
        '.htm',
        '.hxx',
        '.java',
        '.js',
        '.jsx',
        '.jsp',
        '.ksh',
        '.kt',
        '.lisp',
        '.lua',
        '.m',
        '.m4',
        '.nim',
        '.pas',
        '.php',
        '.pl',
        '.pp',
        '.ps1',
        '.py',
        '.r',
        '.rb',
        '.ruby',
        '.rs',
        '.s',
        '.scala',
        '.sh',
        '.swift',
        '.ts',
        '.vhdl',
        '.verilog',
        '.vb',
        '.groovy',
        '.po',
    ))
Lines changed: 94 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,94 @@
1+
The FreeRTOS source code is licensed by a modified GNU General Public License - the
2+
modification taking the form of an exception.
3+
4+
The exception permits the source code of applications that use FreeRTOS solely
5+
through the API published on this website to remain closed source, thus permitting
6+
the use of FreeRTOS in commercial applications without necessitating that the whole
7+
application be open sourced. The exception can only be used if you wish to combine
8+
FreeRTOS with a proprietary product and you comply with the terms stated in the
9+
exception itself.
10+
11+
The FreeRTOS download also includes demo application source code, some of which is
12+
provided by third parties AND IS LICENSED SEPARATELY FROM FREERTOS.
13+
14+
For the avoidance of any doubt refer to the comment included at the top of each
15+
source and header file for license and copyright information.
16+
17+
This is a list of files for which Real Time Engineers Ltd. is not the copyright owner
18+
and are NOT COVERED BY THE GPL.
19+
20+
1. Various header files provided by silicon manufacturers and tool vendors that
21+
define processor specific memory addresses and utility macros. Permission has been
22+
granted by the various copyright holders for these files to be included in the
23+
FreeRTOS download. Users must ensure license conditions are adhered to for any use
24+
other than compilation of the FreeRTOS demo applications.
25+
26+
2. The uIP TCP/IP stack the copyright of which is held by Adam Dunkels. Users must
27+
ensure the open source license conditions stated at the top of each uIP source file
28+
is understood and adhered to.
29+
30+
3. The lwIP TCP/IP stack the copyright of which is held by the Swedish Institute of
31+
Computer Science. Users must ensure the open source license conditions stated at the
32+
top of each lwIP source file is understood and adhered to.
33+
34+
4. Various peripheral driver source files and binaries provided by silicon
35+
manufacturers and tool vendors. Permission has been granted by the various copyright
36+
holders for these files to be included in the FreeRTOS download. Users must ensure
37+
license conditions are adhered to for any use other than compilation of the FreeRTOS
38+
demo applications.
39+
40+
5. The files contained within FreeRTOS\Demo\WizNET_DEMO_TERN_186\tern_code, which are
41+
slightly modified versions of code provided by and copyright to Tern Inc.
42+
43+
Errors and omissions should be reported to Richard Barry, contact details for whom
44+
can be obtained from the Contact page.
45+
46+
This library is free software; you can redistribute it and/or modify it under the
47+
terms of the GNU General Public License as published by the Free Software Foundation;
48+
either version 2, or (at your option) any later version.
49+
50+
This library is distributed in the hope that it will be useful, but WITHOUT ANY
51+
WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
52+
PARTICULAR PURPOSE. See the GNU General Public License for more details.
53+
54+
You should have received a copy of the GNU General Public License along with this
55+
library; see the file COPYING. If not, write to the Free Software Foundation, 51
56+
Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
57+
58+
GNU General Public License Exception
59+
60+
Any FreeRTOS source code, whether modified or in its original release form, or
61+
whether in whole or in part, can only be distributed by you under the terms of the
62+
GNU General Public License plus this exception. An independent module is a module
63+
which is not derived from or based on FreeRTOS.
64+
65+
EXCEPTION TEXT:
66+
67+
Clause 1
68+
69+
Linking FreeRTOS statically or dynamically with other modules is making a combined
70+
work based on FreeRTOS. Thus, the terms and conditions of the GNU General Public
71+
License cover the whole combination.
72+
73+
As a special exception, the copyright holder of FreeRTOS gives you permission to link
74+
FreeRTOS with independent modules that communicate with FreeRTOS solely through the
75+
FreeRTOS API interface, regardless of the license terms of these independent modules,
76+
and to copy and distribute the resulting combined work under terms of your choice,
77+
provided that
78+
79+
1. Every copy of the combined work is accompanied by a written statement that details
80+
to the recipient the version of FreeRTOS used and an offer by yourself to provide the
81+
FreeRTOS source code (including any modifications you may have made) should the
82+
recipient request it.
83+
84+
2. The combined work is not itself an RTOS, scheduler, kernel or related product.
85+
86+
3. The independent modules add significant and primary functionality to FreeRTOS and
87+
do not merely extend the existing functionality already present in FreeRTOS.
88+
89+
Clause 2
90+
91+
FreeRTOS may not be used for any competitive or comparative purpose, including the
92+
publication of any form of run time or compile time metric, without the express
93+
permission of Real Time Engineers Ltd. (this is the norm within the industry and is
94+
intended to ensure information accuracy).

tests/textcode/data/analysis/jsmap/broken.js.map.expected

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -6,12 +6,12 @@
66
"OAAO;SACvB;KACF;IACD,AADE;IAEF;QACE,CAAC,OAAO,EAAE,OAAO,CAAC,EAAE,CAAC,OAAO,EAAE,OAAO,CAAC,EAAE,CAAC,",
77
"OAAO,EAAE,OAAO,CAAC,EAAE,CAAC,OAAO,EAAE,OAAO,CAAC;QAC9E,CAAC,OAAO,EAAE,OAAO,CAAC,",
88
"EAAE,CAAC,OAAO,EAAE,OAAO,CAAC,EAAE,CAAC,OAAO,EAAE,OAAO,CAAC;KAC3D;CACF,CAAC\",\"",
9-
"sourcesContent\":[\"/** * @license * Copyright Google Inc. All Rights Reserved. * * Use of this source code is governed by an MIT-style license that can be * found in the LICENSE file at https://angular.io/license */ // THIS CODE IS GENERATED - DO NOT MODIFY // See angular/tools/gulp-tasks/cldr/extract.js export default [ [ [ '\u0641\u062c\u0631\u064b\u0627', '\u0635\u0628\u0627\u062d\u064b\u0627', '\u0638\u0647\u0631\u064b\u0627', '\u0628\u0639\u062f \u0627\u0644\u0638\u0647\u0631', '",
10-
"\u0645\u0633\u0627\u0621\u064b', '\u0645\u0646\u062a\u0635\u0641 \u0627\u0644\u0644\u064a\u0644', '\u0644\u064a\u0644\u0627\u064b' ], [ '\u0641\u062c\u0631\u064b\u0627', '\u0635', '",
11-
"\u0638\u0647\u0631\u064b\u0627', '\u0628\u0639\u062f \u0627\u0644\u0638\u0647\u0631', '\u0645\u0633\u0627\u0621\u064b', '\u0645\u0646\u062a\u0635\u0641 \u0627\u0644\u0644\u064a\u0644', '\u0644\u064a\u0644\u0627\u064b' ], [ '",
12-
"\u0641\u062c\u0631\u064b\u0627', '\u0635\u0628\u0627\u062d\u064b\u0627', '\u0638\u0647\u0631\u064b\u0627', '\u0628\u0639\u062f \u0627\u0644\u0638\u0647\u0631', '\u0645\u0633\u0627\u0621\u064b', '",
13-
"\u0645\u0646\u062a\u0635\u0641 \u0627\u0644\u0644\u064a\u0644', '\u0644\u064a\u0644\u0627\u064b' ] ], , [ ['03:00', '06:00'], ['06:00',",
9+
"sourcesContent\":[\"/**\\n * @license\\n * Copyright Google Inc. All Rights Reserved.\\n *\\n * Use of this source code is governed by an MIT-style license that can be\\n * found in the LICENSE file at https://angular.io/license\\n */\\n\\n// THIS CODE IS GENERATED - DO NOT MODIFY\\n// See angular/tools/gulp-tasks/cldr/extract.js\\n\\nexport default [\\n [\\n [\\n '\u0641\u062c\u0631\u064b\u0627', '\u0635\u0628\u0627\u062d\u064b\u0627', '\u0638\u0647\u0631\u064b\u0627', '\u0628\u0639\u062f \u0627\u0644\u0638\u0647\u0631', '",
10+
"\u0645\u0633\u0627\u0621\u064b',\\n '\u0645\u0646\u062a\u0635\u0641 \u0627\u0644\u0644\u064a\u0644', '\u0644\u064a\u0644\u0627\u064b'\\n ],\\n [\\n '\u0641\u062c\u0631\u064b\u0627', '\u0635', '",
11+
"\u0638\u0647\u0631\u064b\u0627', '\u0628\u0639\u062f \u0627\u0644\u0638\u0647\u0631', '\u0645\u0633\u0627\u0621\u064b', '\u0645\u0646\u062a\u0635\u0641 \u0627\u0644\u0644\u064a\u0644',\\n '\u0644\u064a\u0644\u0627\u064b'\\n ],\\n [\\n '",
12+
"\u0641\u062c\u0631\u064b\u0627', '\u0635\u0628\u0627\u062d\u064b\u0627', '\u0638\u0647\u0631\u064b\u0627', '\u0628\u0639\u062f \u0627\u0644\u0638\u0647\u0631', '\u0645\u0633\u0627\u0621\u064b',\\n '",
13+
"\u0645\u0646\u062a\u0635\u0641 \u0627\u0644\u0644\u064a\u0644', '\u0644\u064a\u0644\u0627\u064b'\\n ]\\n ],\\n ,\\n [\\n ['03:00', '06:00'], ['06:00',",
1414
" '12:00'], ['12:00', '13:00'], ['13:00', '18:00'],",
15-
" ['18:00', '24:00'], ['00:00', '01:00'], ['01:00',",
16-
" '03:00'] ] ]; \"]}"
15+
"\\n ['18:00', '24:00'], ['00:00', '01:00'], ['01:00',",
16+
" '03:00']\\n ]\\n];\\n\"]}"
1717
]

tests/textcode/data/analysis/jsmap/crlf.js.map

Lines changed: 9 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.
Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
[
2+
"line1",
3+
"line2",
4+
"line3 start ",
5+
" line3 end ",
6+
"line4"
7+
]

tests/textcode/test_analysis.py

Lines changed: 17 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -104,7 +104,14 @@ def test_numbered_text_lines_handles_broken_jsmap_as_plain_text(self):
104104
test_file = self.get_test_loc('analysis/jsmap/broken.js.map')
105105
result = list(l for _, l in numbered_text_lines(test_file))
106106
expected_file = test_file + '.expected'
107-
check_text_lines(result, expected_file)
107+
check_text_lines(result, expected_file, regen=REGEN_TEST_FIXTURES)
108+
109+
def test_numbered_text_lines_strips_verbatim_cr_lf_from_jsmap(self):
    # Compare the text of each numbered line (the line numbers are dropped)
    # with the content of the ".expected" fixture file.
    test_file = self.get_test_loc('analysis/jsmap/crlf.js.map')
    expected_file = test_file + '.expected'
    numbered = list(numbered_text_lines(test_file))
    result = [text for _line_number, text in numbered]
    check_text_lines(result, expected_file, regen=REGEN_TEST_FIXTURES)
108115

109116
def test_numbered_text_lines_return_correct_number_of_lines(self):
110117
test_file = self.get_test_loc('analysis/correct_lines')
@@ -147,3 +154,12 @@ def test_as_unicode_from_unicode_replaces_null_bytes_with_space(self):
147154
result = as_unicode(test)
148155
expected = ' is designed to give them, BEFORE the '
149156
assert result == expected
157+
158+
def test_numbered_text_lines_returns_same_text_from_file_and_from_strings(self):
    # The same rule text must yield the same numbered lines whether it is
    # passed as a file location or as a list of text lines.
    test_file = self.get_test_loc('analysis/gpl-2.0-freertos.RULE')
    from_file = list(numbered_text_lines(location=test_file))

    with io.open(test_file, encoding='utf-8') as rule_file:
        text_lines = rule_file.read().splitlines(True)

    from_string = list(numbered_text_lines(location=text_lines))
    assert from_string == from_file
165+

0 commit comments

Comments
 (0)