1
1
mod extractor;
2
2
3
+ #[ macro_use]
4
+ extern crate lazy_static;
3
5
extern crate num_cpus;
4
6
5
7
use clap:: arg;
8
+ use encoding:: { self } ;
6
9
use flate2:: write:: GzEncoder ;
7
10
use rayon:: prelude:: * ;
11
+ use std:: borrow:: Cow ;
8
12
use std:: fs;
9
13
use std:: io:: { BufRead , BufWriter } ;
10
14
use std:: path:: { Path , PathBuf } ;
@@ -75,6 +79,25 @@ fn num_codeql_threads() -> usize {
75
79
}
76
80
}
77
81
82
+ lazy_static ! {
83
+ static ref CP_NUMBER : regex:: Regex = regex:: Regex :: new( "cp([0-9]+)" ) . unwrap( ) ;
84
+ }
85
+
86
+ fn encoding_from_name ( encoding_name : & str ) -> Option < & ( dyn encoding:: Encoding + Send + Sync ) > {
87
+ match encoding:: label:: encoding_from_whatwg_label ( & encoding_name) {
88
+ Some ( e) => return Some ( e) ,
89
+ None => {
90
+ if let Some ( cap) = CP_NUMBER . captures ( & encoding_name) {
91
+ return encoding:: label:: encoding_from_windows_code_page (
92
+ str:: parse ( cap. get ( 1 ) . unwrap ( ) . as_str ( ) ) . unwrap ( ) ,
93
+ ) ;
94
+ } else {
95
+ return None ;
96
+ }
97
+ }
98
+ }
99
+ }
100
+
78
101
fn main ( ) -> std:: io:: Result < ( ) > {
79
102
tracing_subscriber:: fmt ( )
80
103
. with_target ( false )
@@ -140,6 +163,7 @@ fn main() -> std::io::Result<()> {
140
163
let path = PathBuf :: from ( line) . canonicalize ( ) ?;
141
164
let src_archive_file = path_for ( & src_archive_dir, & path, "" ) ;
142
165
let mut source = std:: fs:: read ( & path) ?;
166
+ let mut needs_conversion = false ;
143
167
let code_ranges;
144
168
let mut trap_writer = extractor:: new_trap_writer ( ) ;
145
169
if path. extension ( ) . map_or ( false , |x| x == "erb" ) {
@@ -168,6 +192,43 @@ fn main() -> std::io::Result<()> {
168
192
}
169
193
code_ranges = ranges;
170
194
} else {
195
+ if let Some ( encoding_name) = scan_coding_comment ( & source) {
196
+ // If the input is already UTF-8 then there is no need to recode the source
197
+ // If the declared encoding is 'binary' or 'ascii-8bit' then it is not clear how
198
+ // to interpret characters. In this case it is probably best to leave the input
199
+ // unchanged.
200
+ if !encoding_name. eq_ignore_ascii_case ( "utf-8" )
201
+ && !encoding_name. eq_ignore_ascii_case ( "ascii-8bit" )
202
+ && !encoding_name. eq_ignore_ascii_case ( "binary" )
203
+ {
204
+ if let Some ( encoding) = encoding_from_name ( & encoding_name) {
205
+ needs_conversion =
206
+ encoding. whatwg_name ( ) . unwrap_or_default ( ) != "utf-8" ;
207
+ if needs_conversion {
208
+ match encoding
209
+ . decode ( & source, encoding:: types:: DecoderTrap :: Replace )
210
+ {
211
+ Ok ( str) => source = str. as_bytes ( ) . to_owned ( ) ,
212
+ Err ( msg) => {
213
+ needs_conversion = false ;
214
+ tracing:: warn!(
215
+ "{}: character decoding failure: {} ({})" ,
216
+ & path. to_string_lossy( ) ,
217
+ msg,
218
+ & encoding_name
219
+ ) ;
220
+ }
221
+ }
222
+ }
223
+ } else {
224
+ tracing:: warn!(
225
+ "{}: unknown character encoding: '{}'" ,
226
+ & path. to_string_lossy( ) ,
227
+ & encoding_name
228
+ ) ;
229
+ }
230
+ }
231
+ }
171
232
code_ranges = vec ! [ ] ;
172
233
}
173
234
extractor:: extract (
@@ -180,7 +241,11 @@ fn main() -> std::io::Result<()> {
180
241
& code_ranges,
181
242
) ?;
182
243
std:: fs:: create_dir_all ( & src_archive_file. parent ( ) . unwrap ( ) ) ?;
183
- std:: fs:: copy ( & path, & src_archive_file) ?;
244
+ if needs_conversion {
245
+ std:: fs:: write ( & src_archive_file, & source) ?;
246
+ } else {
247
+ std:: fs:: copy ( & path, & src_archive_file) ?;
248
+ }
184
249
write_trap ( & trap_dir, path, trap_writer, & trap_compression)
185
250
} )
186
251
. expect ( "failed to extract files" ) ;
@@ -299,3 +364,143 @@ fn path_for(dir: &Path, path: &Path, ext: &str) -> PathBuf {
299
364
}
300
365
result
301
366
}
367
+
368
/// Returns the position of the first byte at or after `index` that is not horizontal
/// white space.
///
/// Space, tab, vertical tab, form feed and carriage return are skipped; a line feed is
/// not. If `index` is at or beyond the end of `content` it is returned unchanged.
fn skip_space(content: &[u8], index: usize) -> usize {
    let remaining = match content.get(index..) {
        Some(remaining) => remaining,
        // `index` lies past the end of the buffer: nothing to skip.
        None => return index,
    };
    let blanks = remaining
        .iter()
        // white space except \n
        .take_while(|&&byte| matches!(byte, b' ' | b'\t' | 0x0b | 0x0c | b'\r'))
        .count();
    index + blanks
}
381
+
382
+ fn scan_coding_comment ( content : & [ u8 ] ) -> std:: option:: Option < Cow < str > > {
383
+ let mut index = 0 ;
384
+ // skip UTF-8 BOM marker if there is one
385
+ if content. len ( ) >= 3 && content[ 0 ] == 0xef && content[ 1 ] == 0xbb && content[ 2 ] == 0xbf {
386
+ index += 3 ;
387
+ }
388
+ // skip #! line if there is one
389
+ if index + 1 < content. len ( )
390
+ && content[ index] as char == '#'
391
+ && content[ index + 1 ] as char == '!'
392
+ {
393
+ index += 2 ;
394
+ while index < content. len ( ) && content[ index] as char != '\n' {
395
+ index += 1
396
+ }
397
+ index += 1
398
+ }
399
+ index = skip_space ( content, index) ;
400
+
401
+ if index >= content. len ( ) || content[ index] as char != '#' {
402
+ return None ;
403
+ }
404
+ index += 1 ;
405
+
406
+ const CODING : [ char ; 12 ] = [ 'C' , 'c' , 'O' , 'o' , 'D' , 'd' , 'I' , 'i' , 'N' , 'n' , 'G' , 'g' ] ;
407
+ let mut word_index = 0 ;
408
+ while index < content. len ( ) && word_index < CODING . len ( ) && content[ index] as char != '\n' {
409
+ if content[ index] as char == CODING [ word_index]
410
+ || content[ index] as char == CODING [ word_index + 1 ]
411
+ {
412
+ word_index += 2
413
+ } else {
414
+ word_index = 0 ;
415
+ }
416
+ index += 1 ;
417
+ }
418
+ if word_index < CODING . len ( ) {
419
+ return None ;
420
+ }
421
+ index = skip_space ( content, index) ;
422
+
423
+ if index < content. len ( ) && content[ index] as char != ':' && content[ index] as char != '=' {
424
+ return None ;
425
+ }
426
+ index += 1 ;
427
+ index = skip_space ( content, index) ;
428
+
429
+ let start = index;
430
+ while index < content. len ( ) {
431
+ let c = content[ index] as char ;
432
+ if c == '-' || c == '_' || c. is_ascii_alphanumeric ( ) {
433
+ index += 1 ;
434
+ } else {
435
+ break ;
436
+ }
437
+ }
438
+ if index > start {
439
+ return Some ( String :: from_utf8_lossy ( & content[ start..index] ) ) ;
440
+ }
441
+ None
442
+ }
443
+
444
/// Checks `scan_coding_comment` against a table of source texts and the encoding name each
/// one is expected to yield (`None` when no magic comment should be recognised).
#[test]
fn test_scan_coding_comment() {
    let cases: &[(&str, Option<&str>)] = &[
        ("# encoding: utf-8", Some("utf-8")),
        ("#coding:utf-8", Some("utf-8")),
        // only the first comment line is considered
        ("# foo\n # encoding: utf-8", None),
        // the first declaration on the line wins
        ("# encoding: latin1 encoding: utf-8", Some("latin1")),
        // the name is reported even if it is not a known encoding
        ("# encoding: nonsense", Some("nonsense")),
        ("# coding = utf-8", Some("utf-8")),
        ("# CODING = utf-8", Some("utf-8")),
        ("# CoDiNg = utf-8", Some("utf-8")),
        ("# blah blahblahcoding = utf-8", Some("utf-8")),
        // unicode BOM is ignored
        ("\u{FEFF}# encoding: utf-8", Some("utf-8")),
        ("\u{FEFF} # encoding: utf-8", Some("utf-8")),
        ("#! /usr/bin/env ruby\n # encoding: utf-8", Some("utf-8")),
        // the #! must directly follow the BOM to count as a shebang line
        ("\u{FEFF}#! /usr/bin/env ruby\n # encoding: utf-8", Some("utf-8")),
        // A #! must be the first thing on a line, otherwise it's a normal comment
        (" #! /usr/bin/env ruby encoding = utf-8", Some("utf-8")),
        (" #! /usr/bin/env ruby \n # encoding = utf-8", None),
    ];

    for (text, expected) in cases {
        let result = scan_coding_comment(text.as_bytes());
        assert_eq!(result.as_deref(), *expected, "input: {:?}", text);
    }
}
0 commit comments