14 | 14 | import unicodedata
15 | 15 |
16 | 16 | import chardet
   | 17 | +import typecode
17 | 18 |
18 | 19 | from textcode import pdf
19 | 20 | from textcode import markup
20 | 21 | from textcode import sfdb
21 | 22 | from textcode import strings
22 |    | -import typecode
23 | 23 |
24 | 24 | """
25 | 25 | Utilities to analyze text. Files are the input.
@@ -66,7 +66,7 @@ def numbered_text_lines(
66 | 66 |     markup and cleanup this markup.
67 | 67 |
68 | 68 |     If `plain_text` is True treat the file as a plain text file and do not
69 |    | -    attempt to detect its type and extract it's content with special procedures.
   | 69 | +    attempt to detect its type and extract its content with special procedures.
70 | 70 |     This is used mostly when loading license texts and rules.
71 | 71 |
72 | 72 |     Note: For testing or building from strings, location can be a list of
@@ -115,34 +115,42 @@ def numbered_text_lines(
115 | 115 |     # lightweight markup stripping support
116 | 116 |     if demarkup and markup.is_markup(location):
117 | 117 |         try:
118 |     | -            lines = list(enumerate(markup.demarkup(location), start_line))
    | 118 | +            numbered_lines = list(enumerate(markup.demarkup(location), start_line))
119 | 119 |             if TRACE:
120 | 120 |                 logger_debug('numbered_text_lines:', 'demarkup')
121 |     | -            return lines
    | 121 | +            return numbered_lines
122 | 122 |         except:
123 | 123 |             # try again later as plain text
124 | 124 |             pass
125 | 125 |
126 | 126 |     if T.is_js_map:
127 | 127 |         try:
128 |     | -            lines = list(enumerate(js_map_sources_lines(location), start_line))
    | 128 | +            numbered_lines = list(enumerate(js_map_sources_lines(location), start_line))
129 | 129 |             if TRACE:
130 | 130 |                 logger_debug('numbered_text_lines:', 'js_map')
131 |     | -            return lines
    | 131 | +            return numbered_lines
132 | 132 |         except:
133 | 133 |             # try again later as plain text otherwise
134 | 134 |             pass
135 | 135 |
136 | 136 |     if T.is_text:
137 |     | -        numbered_lines = enumerate(unicode_text_lines(location), start_line)
    | 137 | +        lines = unicode_text_lines(location=location, decrlf=is_source(location))
    | 138 | +        numbered_lines = enumerate(lines, start_line)
    | 139 | +
138 | 140 |         # text with very long lines such as minified JS, JS map files or large JSON
139 |     | -        if (not location.endswith('package.json')
140 |     | -            and (T.is_text_with_long_lines or T.is_compact_js
141 |     | -            or T.filetype_file == 'data' or 'locale' in location)):
    | 141 | +        if (
    | 142 | +            not location.endswith('package.json')
    | 143 | +            and (
    | 144 | +                T.is_text_with_long_lines or T.is_compact_js
    | 145 | +                or T.filetype_file == 'data' or 'locale' in location
    | 146 | +            )
    | 147 | +        ):
142 | 148 |
143 | 149 |             numbered_lines = break_numbered_unicode_text_lines(numbered_lines)
    | 150 | +
144 | 151 |             if TRACE:
145 | 152 |                 logger_debug('numbered_text_lines:', 'break_numbered_unicode_text_lines')
    | 153 | +
146 | 154 |         return numbered_lines
147 | 155 |
148 | 156 | # TODO: handle Office-like documents, RTF, etc
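
The `numbered_lines` built in the `T.is_text` branch above are just the lazy `(line number, text)` pairs produced by `enumerate(lines, start_line)`. A minimal, self-contained sketch of that pairing, using an invented list of lines in place of a real file (the docstring note earlier in this diff allows a list of strings for testing) and assuming `start_line=1`:

# Sketch only: the (line_number, text) pairs that enumerate(lines, start_line) yields.
lines = ['first line\n', 'second line\n', 'third line\n']
start_line = 1

numbered_lines = list(enumerate(lines, start_line))
print(numbered_lines)
# [(1, 'first line\n'), (2, 'second line\n'), (3, 'third line\n')]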
|
@@ -171,7 +179,7 @@ def unicode_text_lines_from_binary(location):
|
171 | 179 | T = typecode.get_type(location)
|
172 | 180 | if T.contains_text:
|
173 | 181 | for line in strings.strings_from_file(location):
|
174 |
| - yield line |
| 182 | + yield remove_verbatim_cr_lf_tab_chars(line) |
175 | 183 |
|
176 | 184 |
|
177 | 185 | def unicode_text_lines_from_pdf(location):
|
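
The hunk above now routes every string extracted from a binary through `remove_verbatim_cr_lf_tab_chars`. Its body (shown unchanged later in this diff) turns literal, escaped line endings and tabs into spaces. A standalone sketch of that effect, with a made-up input string:

# Sketch only: mirrors the replacement shown later in this diff; literal
# (escaped) \r, \n and \t two-character sequences become single spaces.
def remove_verbatim_cr_lf_tab_chars(s):
    return s.replace('\\r', ' ').replace('\\n', ' ').replace('\\t', ' ')

extracted = 'Copyright (c) 2021\\nAcme Inc.\\tAll rights reserved.'
print(remove_verbatim_cr_lf_tab_chars(extracted))
# Copyright (c) 2021 Acme Inc. All rights reserved.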
@@ -228,8 +236,9 @@ def js_map_sources_lines(location):
228 | 236 |         content = json.load(jsm)
229 | 237 |     sources = content.get('sourcesContent', [])
230 | 238 |     for entry in sources:
    | 239 | +        entry = replace_verbatim_cr_lf_chars(entry)
231 | 240 |         for line in entry.splitlines():
232 |     | -            yield line
    | 241 | +            yield remove_verbatim_cr_lf_tab_chars(line)
233 | 242 |
234 | 243 |
235 | 244 | def as_unicode(line):
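
For context, `js_map_sources_lines` reads the original sources embedded in a source map's `sourcesContent` array; the `replace_verbatim_cr_lf_chars` call added above (defined in the next hunk) converts escaped line endings carried inside those entries into real newlines before splitting. A self-contained sketch with an invented source map:

import json

# Sketch only, with an invented map: each 'sourcesContent' entry holds a whole
# original source file; splitlines() then yields its individual lines.
source_map = json.loads('''
{
  "version": 3,
  "sources": ["hello.js"],
  "sourcesContent": ["// demo source\\nconsole.log('hi');\\n"]
}
''')

for entry in source_map.get('sourcesContent', []):
    for line in entry.splitlines():
        print(line)
# // demo source
# console.log('hi');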
@@ -285,26 +296,124 @@ def remove_verbatim_cr_lf_tab_chars(s):
285 | 296 |     Return a string replacing by a space any verbatim but escaped line endings
286 | 297 |     and tabs (such as a literal \n or \r \t).
287 | 298 |     """
288 |     | -    if not s:
289 |     | -        return s
290 | 299 |     return s.replace('\\r', ' ').replace('\\n', ' ').replace('\\t', ' ')
291 | 300 |
292 | 301 |
293 |     | -def unicode_text_lines(location):
    | 302 | +def replace_verbatim_cr_lf_chars(s):
    | 303 | +    """
    | 304 | +    Return a string replacing any verbatim but escaped line endings (such as
    | 305 | +    a literal \n or \r) with an LF.
    | 306 | +    """
    | 307 | +    return (s
    | 308 | +        .replace('\\\\r\\\\n', '\n')
    | 309 | +        .replace('\\r\\n', '\n')
    | 310 | +        .replace('\\\\r', '\n')
    | 311 | +        .replace('\\\\n', '\n')
    | 312 | +        .replace('\\r', '\n')
    | 313 | +        .replace('\\n', '\n')
    | 314 | +    )
    | 315 | +
    | 316 | +
    | 317 | +def unicode_text_lines(location, decrlf=False):
294 | 318 |     """
295 |     | -    Return an iterable over unicode text lines from a file at `location` if it
296 |     | -    contains text. Open the file as binary with universal new lines then try to
297 |     | -    decode each line as Unicode.
    | 319 | +    Return an iterable of unicode text lines from a file at ``location`` if it
    | 320 | +    contains text.
    | 321 | +
    | 322 | +    Open the file as binary then try to decode each line as Unicode.
    | 323 | +    Remove verbatim escaped CR, LF and tab characters if ``decrlf`` is True.
298 | 324 |     """
    | 325 | +    lines = _unicode_text_lines(location)
    | 326 | +    if decrlf:
    | 327 | +        return map(remove_verbatim_cr_lf_tab_chars, lines)
    | 328 | +    else:
    | 329 | +        return lines
    | 330 | +
    | 331 | +
    | 332 | +def _unicode_text_lines(location):
299 | 333 |     with open(location, 'rb') as f:
300 | 334 |         for line in f.read().splitlines(True):
301 |     | -            yield remove_verbatim_cr_lf_tab_chars(as_unicode(line))
    | 335 | +            yield as_unicode(line)
302 | 336 |
303 | 337 |
304 |     | -def unicode_text(location):
    | 338 | +def unicode_text(location, decrlf=False):
305 | 339 |     """
306 | 340 |     Return a string guaranteed to be unicode from the content of the file at
307 | 341 |     location. The whole file content is returned at once, which may be a
308 | 342 |     problem for very large files.
309 | 343 |     """
310 |     | -    return u' '.join(unicode_text_lines(location))
    | 344 | +    return u' '.join(unicode_text_lines(location, decrlf=decrlf))
    | 345 | +
    | 346 | +
    | 347 | +def is_source(location):
    | 348 | +    """
    | 349 | +    Return True if the file at location is source code, based on its file
    | 350 | +    extension.
    | 351 | +    """
    | 352 | +    return location.endswith((
    | 353 | +        '.ada',
    | 354 | +        '.adb',
    | 355 | +        '.asm',
    | 356 | +        '.asp',
    | 357 | +        '.aj',
    | 358 | +        '.bas',
    | 359 | +        '.bat',
    | 360 | +        '.c',
    | 361 | +        '.c++',
    | 362 | +        '.cc',
    | 363 | +        '.clj',
    | 364 | +        '.cob',
    | 365 | +        '.cpp',
    | 366 | +        '.cs',
    | 367 | +        '.csh',
    | 368 | +        '.csx',
    | 369 | +        '.cxx',
    | 370 | +        '.d',
    | 371 | +        '.e',
    | 372 | +        '.el',
    | 373 | +        '.f',
    | 374 | +        '.fs',
    | 375 | +        '.f77',
    | 376 | +        '.f90',
    | 377 | +        '.for',
    | 378 | +        '.fth',
    | 379 | +        '.ftn',
    | 380 | +        '.go',
    | 381 | +        '.h',
    | 382 | +        '.hh',
    | 383 | +        '.hpp',
    | 384 | +        '.hs',
    | 385 | +        '.html',
    | 386 | +        '.htm',
    | 387 | +        '.hxx',
    | 388 | +        '.java',
    | 389 | +        '.js',
    | 390 | +        '.jsx',
    | 391 | +        '.jsp',
    | 392 | +        '.ksh',
    | 393 | +        '.kt',
    | 394 | +        '.lisp',
    | 395 | +        '.lua',
    | 396 | +        '.m',
    | 397 | +        '.m4',
    | 398 | +        '.nim',
    | 399 | +        '.pas',
    | 400 | +        '.php',
    | 401 | +        '.pl',
    | 402 | +        '.pp',
    | 403 | +        '.ps1',
    | 404 | +        '.py',
    | 405 | +        '.r',
    | 406 | +        '.rb',
    | 407 | +        '.ruby',
    | 408 | +        '.rs',
    | 409 | +        '.s',
    | 410 | +        '.scala',
    | 411 | +        '.sh',
    | 412 | +        '.swift',
    | 413 | +        '.ts',
    | 414 | +        '.vhdl',
    | 415 | +        '.verilog',
    | 416 | +        '.vb',
    | 417 | +        '.groovy',
    | 418 | +        '.po',
    | 419 | +    ))
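
A hedged usage sketch of how the new pieces fit together in `numbered_text_lines` above: `decrlf` is only enabled when `is_source()` says the path looks like source code, so escaped line endings are stripped from source files but left untouched in other text. The paths are hypothetical and `is_source` is abbreviated here to a few of the extensions listed above:

# Sketch only: abbreviated is_source and hypothetical paths showing when
# decrlf would be turned on by numbered_text_lines.
def is_source(location):
    return location.endswith(('.c', '.py', '.js', '.java', '.go', '.rs'))

for path in ('src/parser.py', 'docs/NOTICE.txt'):
    print(path, '-> decrlf =', is_source(path))
# src/parser.py -> decrlf = True
# docs/NOTICE.txt -> decrlf = False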