1
1
mod extractor;
2
2
mod trap;
3
3
4
+ #[ macro_use]
5
+ extern crate lazy_static;
4
6
extern crate num_cpus;
5
7
6
8
use clap:: arg;
9
+ use encoding:: { self } ;
7
10
use rayon:: prelude:: * ;
11
+ use std:: borrow:: Cow ;
8
12
use std:: fs;
9
13
use std:: io:: BufRead ;
10
14
use std:: path:: { Path , PathBuf } ;
@@ -39,6 +43,21 @@ fn num_codeql_threads() -> usize {
39
43
}
40
44
}
41
45
46
+ lazy_static ! {
47
+ static ref CP_NUMBER : regex:: Regex = regex:: Regex :: new( "cp([0-9]+)" ) . unwrap( ) ;
48
+ }
49
+
50
+ fn encoding_from_name ( encoding_name : & str ) -> Option < & ( dyn encoding:: Encoding + Send + Sync ) > {
51
+ match encoding:: label:: encoding_from_whatwg_label ( encoding_name) {
52
+ s @ Some ( _) => s,
53
+ None => CP_NUMBER . captures ( encoding_name) . and_then ( |cap| {
54
+ encoding:: label:: encoding_from_windows_code_page (
55
+ str:: parse ( cap. get ( 1 ) . unwrap ( ) . as_str ( ) ) . unwrap ( ) ,
56
+ )
57
+ } ) ,
58
+ }
59
+ }
60
+
42
61
fn main ( ) -> std:: io:: Result < ( ) > {
43
62
tracing_subscriber:: fmt ( )
44
63
. with_target ( false )
@@ -104,6 +123,7 @@ fn main() -> std::io::Result<()> {
104
123
let path = PathBuf :: from ( line) . canonicalize ( ) ?;
105
124
let src_archive_file = path_for ( & src_archive_dir, & path, "" ) ;
106
125
let mut source = std:: fs:: read ( & path) ?;
126
+ let mut needs_conversion = false ;
107
127
let code_ranges;
108
128
let mut trap_writer = trap:: Writer :: new ( ) ;
109
129
if path. extension ( ) . map_or ( false , |x| x == "erb" ) {
@@ -132,6 +152,43 @@ fn main() -> std::io::Result<()> {
132
152
}
133
153
code_ranges = ranges;
134
154
} else {
155
+ if let Some ( encoding_name) = scan_coding_comment ( & source) {
156
+ // If the input is already UTF-8 then there is no need to recode the source
157
+ // If the declared encoding is 'binary' or 'ascii-8bit' then it is not clear how
158
+ // to interpret characters. In this case it is probably best to leave the input
159
+ // unchanged.
160
+ if !encoding_name. eq_ignore_ascii_case ( "utf-8" )
161
+ && !encoding_name. eq_ignore_ascii_case ( "ascii-8bit" )
162
+ && !encoding_name. eq_ignore_ascii_case ( "binary" )
163
+ {
164
+ if let Some ( encoding) = encoding_from_name ( & encoding_name) {
165
+ needs_conversion =
166
+ encoding. whatwg_name ( ) . unwrap_or_default ( ) != "utf-8" ;
167
+ if needs_conversion {
168
+ match encoding
169
+ . decode ( & source, encoding:: types:: DecoderTrap :: Replace )
170
+ {
171
+ Ok ( str) => source = str. as_bytes ( ) . to_owned ( ) ,
172
+ Err ( msg) => {
173
+ needs_conversion = false ;
174
+ tracing:: warn!(
175
+ "{}: character decoding failure: {} ({})" ,
176
+ & path. to_string_lossy( ) ,
177
+ msg,
178
+ & encoding_name
179
+ ) ;
180
+ }
181
+ }
182
+ }
183
+ } else {
184
+ tracing:: warn!(
185
+ "{}: unknown character encoding: '{}'" ,
186
+ & path. to_string_lossy( ) ,
187
+ & encoding_name
188
+ ) ;
189
+ }
190
+ }
191
+ }
135
192
code_ranges = vec ! [ ] ;
136
193
}
137
194
extractor:: extract (
@@ -144,7 +201,11 @@ fn main() -> std::io::Result<()> {
144
201
& code_ranges,
145
202
) ?;
146
203
std:: fs:: create_dir_all ( & src_archive_file. parent ( ) . unwrap ( ) ) ?;
147
- std:: fs:: copy ( & path, & src_archive_file) ?;
204
+ if needs_conversion {
205
+ std:: fs:: write ( & src_archive_file, & source) ?;
206
+ } else {
207
+ std:: fs:: copy ( & path, & src_archive_file) ?;
208
+ }
148
209
write_trap ( & trap_dir, path, & trap_writer, trap_compression)
149
210
} )
150
211
. expect ( "failed to extract files" ) ;
@@ -255,3 +316,143 @@ fn path_for(dir: &Path, path: &Path, ext: &str) -> PathBuf {
255
316
}
256
317
result
257
318
}
319
+
320
/// Returns the position of the first byte at or after `index` that is not
/// horizontal white space.
///
/// Space, tab, vertical tab, form feed and carriage return are skipped; a line
/// feed is never skipped. If `index` is at or beyond the end of `content`, it
/// is returned unchanged.
fn skip_space(content: &[u8], index: usize) -> usize {
    let blanks = content.get(index..).map_or(0, |tail| {
        tail.iter()
            .take_while(|&&byte| matches!(byte, b' ' | b'\t' | 0x0b | 0x0c | b'\r'))
            .count()
    });
    index + blanks
}
333
+
334
+ fn scan_coding_comment ( content : & [ u8 ] ) -> std:: option:: Option < Cow < str > > {
335
+ let mut index = 0 ;
336
+ // skip UTF-8 BOM marker if there is one
337
+ if content. len ( ) >= 3 && content[ 0 ] == 0xef && content[ 1 ] == 0xbb && content[ 2 ] == 0xbf {
338
+ index += 3 ;
339
+ }
340
+ // skip #! line if there is one
341
+ if index + 1 < content. len ( )
342
+ && content[ index] as char == '#'
343
+ && content[ index + 1 ] as char == '!'
344
+ {
345
+ index += 2 ;
346
+ while index < content. len ( ) && content[ index] as char != '\n' {
347
+ index += 1
348
+ }
349
+ index += 1
350
+ }
351
+ index = skip_space ( content, index) ;
352
+
353
+ if index >= content. len ( ) || content[ index] as char != '#' {
354
+ return None ;
355
+ }
356
+ index += 1 ;
357
+
358
+ const CODING : [ char ; 12 ] = [ 'C' , 'c' , 'O' , 'o' , 'D' , 'd' , 'I' , 'i' , 'N' , 'n' , 'G' , 'g' ] ;
359
+ let mut word_index = 0 ;
360
+ while index < content. len ( ) && word_index < CODING . len ( ) && content[ index] as char != '\n' {
361
+ if content[ index] as char == CODING [ word_index]
362
+ || content[ index] as char == CODING [ word_index + 1 ]
363
+ {
364
+ word_index += 2
365
+ } else {
366
+ word_index = 0 ;
367
+ }
368
+ index += 1 ;
369
+ }
370
+ if word_index < CODING . len ( ) {
371
+ return None ;
372
+ }
373
+ index = skip_space ( content, index) ;
374
+
375
+ if index < content. len ( ) && content[ index] as char != ':' && content[ index] as char != '=' {
376
+ return None ;
377
+ }
378
+ index += 1 ;
379
+ index = skip_space ( content, index) ;
380
+
381
+ let start = index;
382
+ while index < content. len ( ) {
383
+ let c = content[ index] as char ;
384
+ if c == '-' || c == '_' || c. is_ascii_alphanumeric ( ) {
385
+ index += 1 ;
386
+ } else {
387
+ break ;
388
+ }
389
+ }
390
+ if index > start {
391
+ return Some ( String :: from_utf8_lossy ( & content[ start..index] ) ) ;
392
+ }
393
+ None
394
+ }
395
+
396
/// Checks `scan_coding_comment` against a table of inputs and the encoding
/// name (if any) each one is expected to declare.
#[test]
fn test_scan_coding_comment() {
    let cases: &[(&str, Option<&str>)] = &[
        ("# encoding: utf-8", Some("utf-8")),
        ("#coding:utf-8", Some("utf-8")),
        // only the first comment line is inspected
        ("# foo\n # encoding: utf-8", None),
        // the first declaration on the line wins
        ("# encoding: latin1 encoding: utf-8", Some("latin1")),
        // the name is returned as written, whether or not it is a known encoding
        ("# encoding: nonsense", Some("nonsense")),
        ("# coding = utf-8", Some("utf-8")),
        ("# CODING = utf-8", Some("utf-8")),
        ("# CoDiNg = utf-8", Some("utf-8")),
        ("# blah blahblahcoding = utf-8", Some("utf-8")),
        // unicode BOM is ignored, with or without white space after it
        ("\u{FEFF}# encoding: utf-8", Some("utf-8")),
        ("\u{FEFF} # encoding: utf-8", Some("utf-8")),
        // a #! line is skipped, also when it directly follows a BOM
        ("#! /usr/bin/env ruby\n # encoding: utf-8", Some("utf-8")),
        ("\u{FEFF}#! /usr/bin/env ruby\n # encoding: utf-8", Some("utf-8")),
        // A #! must be the first thing on a line, otherwise it's a normal comment
        (" #! /usr/bin/env ruby encoding = utf-8", Some("utf-8")),
        (" #! /usr/bin/env ruby \n # encoding = utf-8", None),
    ];

    for (text, expected) in cases {
        let result = scan_coding_comment(text.as_bytes());
        assert_eq!(result.as_deref(), *expected, "input: {:?}", text);
    }
}
0 commit comments