Skip to content

Commit 7be106d

Browse files
committed
Ruby: handle magic coding: comments
1 parent ca81957 commit 7be106d

File tree

7 files changed

+297
-1
lines changed

7 files changed

+297
-1
lines changed

ruby/Cargo.lock

Lines changed: 66 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

ruby/extractor/Cargo.toml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,3 +18,5 @@ tracing-subscriber = { version = "0.3.3", features = ["env-filter"] }
1818
rayon = "1.5.0"
1919
num_cpus = "1.13.0"
2020
regex = "1.5.5"
21+
encoding = "0.2"
22+
lazy_static = "1.4.0"

ruby/extractor/src/main.rs

Lines changed: 206 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,14 @@
11
mod extractor;
22

3+
#[macro_use]
4+
extern crate lazy_static;
35
extern crate num_cpus;
46

57
use clap::arg;
8+
use encoding::{self};
69
use flate2::write::GzEncoder;
710
use rayon::prelude::*;
11+
use std::borrow::Cow;
812
use std::fs;
913
use std::io::{BufRead, BufWriter};
1014
use std::path::{Path, PathBuf};
@@ -75,6 +79,25 @@ fn num_codeql_threads() -> usize {
7579
}
7680
}
7781

82+
lazy_static! {
    /// Matches an encoding name that consists solely of `cp` followed by a decimal
    /// code-page number (for example `cp932` or `CP1252`); the digits are capture group 1.
    ///
    /// The pattern is anchored so that names merely containing `cp<digits>` somewhere
    /// are not mistaken for a code page, and it is case-insensitive because magic
    /// comments commonly spell these names in upper case.
    static ref CP_NUMBER: regex::Regex = regex::Regex::new("(?i)^cp([0-9]+)$").unwrap();
}
85+
86+
fn encoding_from_name(encoding_name: &str) -> Option<&(dyn encoding::Encoding + Send + Sync)> {
87+
match encoding::label::encoding_from_whatwg_label(&encoding_name) {
88+
Some(e) => return Some(e),
89+
None => {
90+
if let Some(cap) = CP_NUMBER.captures(&encoding_name) {
91+
return encoding::label::encoding_from_windows_code_page(
92+
str::parse(cap.get(1).unwrap().as_str()).unwrap(),
93+
);
94+
} else {
95+
return None;
96+
}
97+
}
98+
}
99+
}
100+
78101
fn main() -> std::io::Result<()> {
79102
tracing_subscriber::fmt()
80103
.with_target(false)
@@ -140,6 +163,7 @@ fn main() -> std::io::Result<()> {
140163
let path = PathBuf::from(line).canonicalize()?;
141164
let src_archive_file = path_for(&src_archive_dir, &path, "");
142165
let mut source = std::fs::read(&path)?;
166+
let mut needs_conversion = false;
143167
let code_ranges;
144168
let mut trap_writer = extractor::new_trap_writer();
145169
if path.extension().map_or(false, |x| x == "erb") {
@@ -168,6 +192,43 @@ fn main() -> std::io::Result<()> {
168192
}
169193
code_ranges = ranges;
170194
} else {
195+
if let Some(encoding_name) = scan_coding_comment(&source) {
196+
// If the input is already UTF-8 then there is no need to recode the source
197+
// If the declared encoding is 'binary' or 'ascii-8bit' then it is not clear how
198+
// to interpret characters. In this case it is probably best to leave the input
199+
// unchanged.
200+
if !encoding_name.eq_ignore_ascii_case("utf-8")
201+
&& !encoding_name.eq_ignore_ascii_case("ascii-8bit")
202+
&& !encoding_name.eq_ignore_ascii_case("binary")
203+
{
204+
if let Some(encoding) = encoding_from_name(&encoding_name) {
205+
needs_conversion =
206+
encoding.whatwg_name().unwrap_or_default() != "utf-8";
207+
if needs_conversion {
208+
match encoding
209+
.decode(&source, encoding::types::DecoderTrap::Replace)
210+
{
211+
Ok(str) => source = str.as_bytes().to_owned(),
212+
Err(msg) => {
213+
needs_conversion = false;
214+
tracing::warn!(
215+
"{}: character decoding failure: {} ({})",
216+
&path.to_string_lossy(),
217+
msg,
218+
&encoding_name
219+
);
220+
}
221+
}
222+
}
223+
} else {
224+
tracing::warn!(
225+
"{}: unknown character encoding: '{}'",
226+
&path.to_string_lossy(),
227+
&encoding_name
228+
);
229+
}
230+
}
231+
}
171232
code_ranges = vec![];
172233
}
173234
extractor::extract(
@@ -180,7 +241,11 @@ fn main() -> std::io::Result<()> {
180241
&code_ranges,
181242
)?;
182243
std::fs::create_dir_all(&src_archive_file.parent().unwrap())?;
183-
std::fs::copy(&path, &src_archive_file)?;
244+
if needs_conversion {
245+
std::fs::write(&src_archive_file, &source)?;
246+
} else {
247+
std::fs::copy(&path, &src_archive_file)?;
248+
}
184249
write_trap(&trap_dir, path, trap_writer, &trap_compression)
185250
})
186251
.expect("failed to extract files");
@@ -299,3 +364,143 @@ fn path_for(dir: &Path, path: &Path, ext: &str) -> PathBuf {
299364
}
300365
result
301366
}
367+
368+
/// Advances past white space that does not end the line.
///
/// Starting at `index`, skips spaces, tabs, vertical tabs, form feeds and carriage
/// returns, and returns the index of the first byte that is none of those (a `\n`
/// therefore stops the scan). If `index` is at or beyond the end of `content` it is
/// returned unchanged.
fn skip_space(content: &[u8], index: usize) -> usize {
    match content.get(index..) {
        Some(rest) => {
            let blanks = rest
                .iter()
                .take_while(|&&byte| matches!(byte, b' ' | b'\t' | 0x0b | 0x0c | b'\r'))
                .count();
            index + blanks
        }
        // `index` lies past the end of the buffer: nothing to skip.
        None => index,
    }
}
381+
382+
fn scan_coding_comment(content: &[u8]) -> std::option::Option<Cow<str>> {
383+
let mut index = 0;
384+
// skip UTF-8 BOM marker if there is one
385+
if content.len() >= 3 && content[0] == 0xef && content[1] == 0xbb && content[2] == 0xbf {
386+
index += 3;
387+
}
388+
// skip #! line if there is one
389+
if index + 1 < content.len()
390+
&& content[index] as char == '#'
391+
&& content[index + 1] as char == '!'
392+
{
393+
index += 2;
394+
while index < content.len() && content[index] as char != '\n' {
395+
index += 1
396+
}
397+
index += 1
398+
}
399+
index = skip_space(content, index);
400+
401+
if index >= content.len() || content[index] as char != '#' {
402+
return None;
403+
}
404+
index += 1;
405+
406+
const CODING: [char; 12] = ['C', 'c', 'O', 'o', 'D', 'd', 'I', 'i', 'N', 'n', 'G', 'g'];
407+
let mut word_index = 0;
408+
while index < content.len() && word_index < CODING.len() && content[index] as char != '\n' {
409+
if content[index] as char == CODING[word_index]
410+
|| content[index] as char == CODING[word_index + 1]
411+
{
412+
word_index += 2
413+
} else {
414+
word_index = 0;
415+
}
416+
index += 1;
417+
}
418+
if word_index < CODING.len() {
419+
return None;
420+
}
421+
index = skip_space(content, index);
422+
423+
if index < content.len() && content[index] as char != ':' && content[index] as char != '=' {
424+
return None;
425+
}
426+
index += 1;
427+
index = skip_space(content, index);
428+
429+
let start = index;
430+
while index < content.len() {
431+
let c = content[index] as char;
432+
if c == '-' || c == '_' || c.is_ascii_alphanumeric() {
433+
index += 1;
434+
} else {
435+
break;
436+
}
437+
}
438+
if index > start {
439+
return Some(String::from_utf8_lossy(&content[start..index]));
440+
}
441+
None
442+
}
443+
444+
#[test]
fn test_scan_coding_comment() {
    // Each entry pairs a source text with the encoding name expected from its magic comment.
    let cases: &[(&str, Option<&str>)] = &[
        ("# encoding: utf-8", Some("utf-8")),
        ("#coding:utf-8", Some("utf-8")),
        // only the first comment line is considered
        ("# foo\n# encoding: utf-8", None),
        // the first declaration on the line wins
        ("# encoding: latin1 encoding: utf-8", Some("latin1")),
        // the name is reported as written, whether or not it is a known encoding
        ("# encoding: nonsense", Some("nonsense")),
        ("# coding = utf-8", Some("utf-8")),
        ("# CODING = utf-8", Some("utf-8")),
        ("# CoDiNg = utf-8", Some("utf-8")),
        ("# blah blahblahcoding = utf-8", Some("utf-8")),
        // unicode BOM is ignored
        ("\u{FEFF}# encoding: utf-8", Some("utf-8")),
        ("\u{FEFF} # encoding: utf-8", Some("utf-8")),
        ("#! /usr/bin/env ruby\n # encoding: utf-8", Some("utf-8")),
        ("\u{FEFF}#! /usr/bin/env ruby\n # encoding: utf-8", Some("utf-8")),
        // A #! must be the first thing on a line, otherwise it's a normal comment
        (" #! /usr/bin/env ruby encoding = utf-8", Some("utf-8")),
        (" #! /usr/bin/env ruby \n # encoding = utf-8", None),
    ];

    for &(text, expected) in cases {
        let found = scan_coding_comment(text.as_bytes());
        assert_eq!(found.as_deref(), expected, "input: {:?}", text);
    }
}

ruby/ql/test/library-tests/ast/Ast.expected

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1762,6 +1762,12 @@ escape_sequences/escapes.rb:
17621762
# 93| getStmt: [SymbolLiteral] :"\C-?"
17631763
# 93| getComponent: [StringEscapeSequenceComponent] \C
17641764
# 93| getComponent: [StringTextComponent] -?
1765+
misc/iso-8859-15.rb:
1766+
# 1| [Toplevel] iso-8859-15.rb
1767+
# 4| getStmt: [MethodCall] call to print
1768+
# 4| getReceiver: [SelfVariableAccess] self
1769+
# 4| getArgument: [StringLiteral] "EUR = €"
1770+
# 4| getComponent: [StringTextComponent] EUR = €
17651771
literals/literals.rb:
17661772
# 1| [Toplevel] literals.rb
17671773
# 2| getStmt: [NilLiteral] nil

ruby/ql/test/library-tests/ast/TreeSitter.expected

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4604,6 +4604,17 @@ literals/literals.rb:
46044604
# 193| cat file.txt
46054605
# 193|
46064606
# 195| 1: [HeredocEnd] SCRIPT
4607+
misc/iso-8859-15.rb:
4608+
# 1| [Program] Program
4609+
# 4| 0: [Call] Call
4610+
# 4| 0: [Identifier] print
4611+
# 4| 1: [ArgumentList] ArgumentList
4612+
# 4| 0: [String] String
4613+
# 4| 0: [ReservedWord] "
4614+
# 4| 1: [StringContent] EUR = €
4615+
# 4| 2: [ReservedWord] "
4616+
# 1| [Comment] #! /usr/bin/ruby
4617+
# 2| [Comment] # coding: iso-8859-15
46074618
misc/misc.erb:
46084619
# 2| [Program] Program
46094620
# 2| 0: [Call] Call

ruby/ql/test/library-tests/ast/ValueText.expected

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -717,6 +717,7 @@ exprValue
717717
| literals/literals.rb:198:8:198:8 | 5 | 5 | int |
718718
| literals/literals.rb:199:2:199:2 | :y | :y | symbol |
719719
| literals/literals.rb:199:7:199:7 | :Z | :Z | symbol |
720+
| misc/iso-8859-15.rb:4:7:4:17 | "EUR = \u20ac" | EUR = \u20ac | string |
720721
| misc/misc.erb:2:15:2:37 | "main_include_admin.js" | main_include_admin.js | string |
721722
| misc/misc.rb:1:7:1:11 | "bar" | bar | string |
722723
| misc/misc.rb:3:7:3:9 | foo | foo | string |
@@ -1592,6 +1593,7 @@ exprCfgNodeValue
15921593
| literals/literals.rb:198:8:198:8 | 5 | 5 | int |
15931594
| literals/literals.rb:199:2:199:2 | :y | :y | symbol |
15941595
| literals/literals.rb:199:7:199:7 | :Z | :Z | symbol |
1596+
| misc/iso-8859-15.rb:4:7:4:17 | "EUR = \u20ac" | EUR = \u20ac | string |
15951597
| misc/misc.erb:2:15:2:37 | "main_include_admin.js" | main_include_admin.js | string |
15961598
| misc/misc.rb:1:7:1:11 | "bar" | bar | string |
15971599
| misc/misc.rb:3:7:3:9 | foo | foo | string |
ruby/ql/test/library-tests/ast/misc/iso-8859-15.rb

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
#! /usr/bin/ruby
2+
# coding: iso-8859-15
3+
4+
print "EUR = ¤"

0 commit comments

Comments
 (0)