Skip to content

Commit 43266b7

Browse files
authored
Merge pull request #9866 from aibaars/encoding
Ruby: handle magic coding: comments
2 parents 5f9a03f + d44bf32 commit 43266b7

File tree

7 files changed

+293
-1
lines changed

7 files changed

+293
-1
lines changed

ruby/Cargo.lock

Lines changed: 66 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

ruby/extractor/Cargo.toml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,3 +18,5 @@ tracing-subscriber = { version = "0.3.3", features = ["env-filter"] }
1818
rayon = "1.5.0"
1919
num_cpus = "1.13.0"
2020
regex = "1.5.5"
21+
encoding = "0.2"
22+
lazy_static = "1.4.0"

ruby/extractor/src/main.rs

Lines changed: 202 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,14 @@
11
mod extractor;
22
mod trap;
33

4+
#[macro_use]
5+
extern crate lazy_static;
46
extern crate num_cpus;
57

68
use clap::arg;
9+
use encoding::{self};
710
use rayon::prelude::*;
11+
use std::borrow::Cow;
812
use std::fs;
913
use std::io::BufRead;
1014
use std::path::{Path, PathBuf};
@@ -39,6 +43,21 @@ fn num_codeql_threads() -> usize {
3943
}
4044
}
4145

46+
lazy_static! {
47+
static ref CP_NUMBER: regex::Regex = regex::Regex::new("cp([0-9]+)").unwrap();
48+
}
49+
50+
fn encoding_from_name(encoding_name: &str) -> Option<&(dyn encoding::Encoding + Send + Sync)> {
51+
match encoding::label::encoding_from_whatwg_label(encoding_name) {
52+
s @ Some(_) => s,
53+
None => CP_NUMBER.captures(encoding_name).and_then(|cap| {
54+
encoding::label::encoding_from_windows_code_page(
55+
str::parse(cap.get(1).unwrap().as_str()).unwrap(),
56+
)
57+
}),
58+
}
59+
}
60+
4261
fn main() -> std::io::Result<()> {
4362
tracing_subscriber::fmt()
4463
.with_target(false)
@@ -104,6 +123,7 @@ fn main() -> std::io::Result<()> {
104123
let path = PathBuf::from(line).canonicalize()?;
105124
let src_archive_file = path_for(&src_archive_dir, &path, "");
106125
let mut source = std::fs::read(&path)?;
126+
let mut needs_conversion = false;
107127
let code_ranges;
108128
let mut trap_writer = trap::Writer::new();
109129
if path.extension().map_or(false, |x| x == "erb") {
@@ -132,6 +152,43 @@ fn main() -> std::io::Result<()> {
132152
}
133153
code_ranges = ranges;
134154
} else {
155+
if let Some(encoding_name) = scan_coding_comment(&source) {
156+
// If the input is already UTF-8 then there is no need to recode the source
157+
// If the declared encoding is 'binary' or 'ascii-8bit' then it is not clear how
158+
// to interpret characters. In this case it is probably best to leave the input
159+
// unchanged.
160+
if !encoding_name.eq_ignore_ascii_case("utf-8")
161+
&& !encoding_name.eq_ignore_ascii_case("ascii-8bit")
162+
&& !encoding_name.eq_ignore_ascii_case("binary")
163+
{
164+
if let Some(encoding) = encoding_from_name(&encoding_name) {
165+
needs_conversion =
166+
encoding.whatwg_name().unwrap_or_default() != "utf-8";
167+
if needs_conversion {
168+
match encoding
169+
.decode(&source, encoding::types::DecoderTrap::Replace)
170+
{
171+
Ok(str) => source = str.as_bytes().to_owned(),
172+
Err(msg) => {
173+
needs_conversion = false;
174+
tracing::warn!(
175+
"{}: character decoding failure: {} ({})",
176+
&path.to_string_lossy(),
177+
msg,
178+
&encoding_name
179+
);
180+
}
181+
}
182+
}
183+
} else {
184+
tracing::warn!(
185+
"{}: unknown character encoding: '{}'",
186+
&path.to_string_lossy(),
187+
&encoding_name
188+
);
189+
}
190+
}
191+
}
135192
code_ranges = vec![];
136193
}
137194
extractor::extract(
@@ -144,7 +201,11 @@ fn main() -> std::io::Result<()> {
144201
&code_ranges,
145202
)?;
146203
std::fs::create_dir_all(&src_archive_file.parent().unwrap())?;
147-
std::fs::copy(&path, &src_archive_file)?;
204+
if needs_conversion {
205+
std::fs::write(&src_archive_file, &source)?;
206+
} else {
207+
std::fs::copy(&path, &src_archive_file)?;
208+
}
148209
write_trap(&trap_dir, path, &trap_writer, trap_compression)
149210
})
150211
.expect("failed to extract files");
@@ -255,3 +316,143 @@ fn path_for(dir: &Path, path: &Path, ext: &str) -> PathBuf {
255316
}
256317
result
257318
}
319+
320+
/// Returns the position of the first byte at or after `index` that is not
/// horizontal white space.
///
/// Space, tab, vertical tab, form feed and carriage return are skipped; a
/// line feed is not, so the scan never crosses onto the next line. If `index`
/// is already past the end of `content` it is returned unchanged.
fn skip_space(content: &[u8], index: usize) -> usize {
    let skipped = content.get(index..).map_or(0, |rest| {
        rest.iter()
            // white space except \n
            .take_while(|&&byte| matches!(byte, b' ' | b'\t' | 0x0b | 0x0c | b'\r'))
            .count()
    });
    index + skipped
}
333+
334+
fn scan_coding_comment(content: &[u8]) -> std::option::Option<Cow<str>> {
335+
let mut index = 0;
336+
// skip UTF-8 BOM marker if there is one
337+
if content.len() >= 3 && content[0] == 0xef && content[1] == 0xbb && content[2] == 0xbf {
338+
index += 3;
339+
}
340+
// skip #! line if there is one
341+
if index + 1 < content.len()
342+
&& content[index] as char == '#'
343+
&& content[index + 1] as char == '!'
344+
{
345+
index += 2;
346+
while index < content.len() && content[index] as char != '\n' {
347+
index += 1
348+
}
349+
index += 1
350+
}
351+
index = skip_space(content, index);
352+
353+
if index >= content.len() || content[index] as char != '#' {
354+
return None;
355+
}
356+
index += 1;
357+
358+
const CODING: [char; 12] = ['C', 'c', 'O', 'o', 'D', 'd', 'I', 'i', 'N', 'n', 'G', 'g'];
359+
let mut word_index = 0;
360+
while index < content.len() && word_index < CODING.len() && content[index] as char != '\n' {
361+
if content[index] as char == CODING[word_index]
362+
|| content[index] as char == CODING[word_index + 1]
363+
{
364+
word_index += 2
365+
} else {
366+
word_index = 0;
367+
}
368+
index += 1;
369+
}
370+
if word_index < CODING.len() {
371+
return None;
372+
}
373+
index = skip_space(content, index);
374+
375+
if index < content.len() && content[index] as char != ':' && content[index] as char != '=' {
376+
return None;
377+
}
378+
index += 1;
379+
index = skip_space(content, index);
380+
381+
let start = index;
382+
while index < content.len() {
383+
let c = content[index] as char;
384+
if c == '-' || c == '_' || c.is_ascii_alphanumeric() {
385+
index += 1;
386+
} else {
387+
break;
388+
}
389+
}
390+
if index > start {
391+
return Some(String::from_utf8_lossy(&content[start..index]));
392+
}
393+
None
394+
}
395+
396+
/// Checks `scan_coding_comment` against a table of inputs and the encoding
/// name (or `None`) each one is expected to yield.
#[test]
fn test_scan_coding_comment() {
    let cases: &[(&str, Option<&str>)] = &[
        ("# encoding: utf-8", Some("utf-8")),
        ("#coding:utf-8", Some("utf-8")),
        ("# foo\n# encoding: utf-8", None),
        ("# encoding: latin1 encoding: utf-8", Some("latin1")),
        ("# encoding: nonsense", Some("nonsense")),
        ("# coding = utf-8", Some("utf-8")),
        ("# CODING = utf-8", Some("utf-8")),
        ("# CoDiNg = utf-8", Some("utf-8")),
        ("# blah blahblahcoding = utf-8", Some("utf-8")),
        // unicode BOM is ignored
        ("\u{FEFF}# encoding: utf-8", Some("utf-8")),
        ("\u{FEFF} # encoding: utf-8", Some("utf-8")),
        ("#! /usr/bin/env ruby\n # encoding: utf-8", Some("utf-8")),
        (
            "\u{FEFF}#! /usr/bin/env ruby\n # encoding: utf-8",
            Some("utf-8"),
        ),
        // A #! must be the first thing on a line, otherwise it's a normal comment
        (" #! /usr/bin/env ruby encoding = utf-8", Some("utf-8")),
        (" #! /usr/bin/env ruby \n # encoding = utf-8", None),
    ];

    for &(text, expected) in cases.iter() {
        let result = scan_coding_comment(text.as_bytes());
        assert_eq!(result, expected.map(Cow::from), "input: {:?}", text);
    }
}

ruby/ql/test/library-tests/ast/Ast.expected

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1762,6 +1762,12 @@ escape_sequences/escapes.rb:
17621762
# 93| getStmt: [SymbolLiteral] :"\C-?"
17631763
# 93| getComponent: [StringEscapeSequenceComponent] \C
17641764
# 93| getComponent: [StringTextComponent] -?
1765+
misc/iso-8859-15.rb:
1766+
# 1| [Toplevel] iso-8859-15.rb
1767+
# 4| getStmt: [MethodCall] call to print
1768+
# 4| getReceiver: [SelfVariableAccess] self
1769+
# 4| getArgument: [StringLiteral] "EUR = €"
1770+
# 4| getComponent: [StringTextComponent] EUR = €
17651771
literals/literals.rb:
17661772
# 1| [Toplevel] literals.rb
17671773
# 2| getStmt: [NilLiteral] nil

ruby/ql/test/library-tests/ast/TreeSitter.expected

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4604,6 +4604,17 @@ literals/literals.rb:
46044604
# 193| cat file.txt
46054605
# 193|
46064606
# 195| 1: [HeredocEnd] SCRIPT
4607+
misc/iso-8859-15.rb:
4608+
# 1| [Program] Program
4609+
# 4| 0: [Call] Call
4610+
# 4| 0: [Identifier] print
4611+
# 4| 1: [ArgumentList] ArgumentList
4612+
# 4| 0: [String] String
4613+
# 4| 0: [ReservedWord] "
4614+
# 4| 1: [StringContent] EUR = €
4615+
# 4| 2: [ReservedWord] "
4616+
# 1| [Comment] #! /usr/bin/ruby
4617+
# 2| [Comment] # coding: iso-8859-15
46074618
misc/misc.erb:
46084619
# 2| [Program] Program
46094620
# 2| 0: [Call] Call

ruby/ql/test/library-tests/ast/ValueText.expected

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -717,6 +717,7 @@ exprValue
717717
| literals/literals.rb:198:8:198:8 | 5 | 5 | int |
718718
| literals/literals.rb:199:2:199:2 | :y | :y | symbol |
719719
| literals/literals.rb:199:7:199:7 | :Z | :Z | symbol |
720+
| misc/iso-8859-15.rb:4:7:4:17 | "EUR = \u20ac" | EUR = \u20ac | string |
720721
| misc/misc.erb:2:15:2:37 | "main_include_admin.js" | main_include_admin.js | string |
721722
| misc/misc.rb:1:7:1:11 | "bar" | bar | string |
722723
| misc/misc.rb:3:7:3:9 | foo | foo | string |
@@ -1592,6 +1593,7 @@ exprCfgNodeValue
15921593
| literals/literals.rb:198:8:198:8 | 5 | 5 | int |
15931594
| literals/literals.rb:199:2:199:2 | :y | :y | symbol |
15941595
| literals/literals.rb:199:7:199:7 | :Z | :Z | symbol |
1596+
| misc/iso-8859-15.rb:4:7:4:17 | "EUR = \u20ac" | EUR = \u20ac | string |
15951597
| misc/misc.erb:2:15:2:37 | "main_include_admin.js" | main_include_admin.js | string |
15961598
| misc/misc.rb:1:7:1:11 | "bar" | bar | string |
15971599
| misc/misc.rb:3:7:3:9 | foo | foo | string |
Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
#! /usr/bin/ruby
2+
# coding: iso-8859-15
3+
4+
print "EUR = ¤"

0 commit comments

Comments
 (0)