Skip to content

Commit 3d79389

Browse files
--max-size, --reparse
1 parent c873f44 commit 3d79389

File tree

2 files changed

+78
-27
lines changed

2 files changed

+78
-27
lines changed

crates/tree-splicer/src/cli.rs

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -77,6 +77,10 @@ pub struct Args {
7777
#[arg(short, long, default_value_t = num_cpus::get())]
7878
pub jobs: usize,
7979

80+
/// Approximate maximum file size to produce (bytes); default = 1MiB
81+
#[arg(long, default_value_t = 1048576)]
82+
pub max_size: usize,
83+
8084
/// Number of mutations per test
8185
#[arg(short, long, default_value_t = 16)]
8286
pub mutations: usize,
@@ -85,6 +89,10 @@ pub struct Args {
8589
#[arg(short, long, default_value_os = "tree-splicer.out")]
8690
pub output: PathBuf,
8791

92+
/// Re-parse the file after this many mutations; higher is faster
93+
#[arg(short, long, default_value_t = 1)]
94+
pub reparse: usize,
95+
8896
/// Seed
8997
#[arg(short, long, default_value_t = 0)]
9098
pub seed: u64,
@@ -171,7 +179,9 @@ pub fn main(language: tree_sitter::Language, node_types_json_str: &'static str)
171179
language,
172180
// intra_splices: 10,
173181
inter_splices: args.mutations,
182+
max_size: args.max_size,
174183
node_types,
184+
reparse: args.reparse,
175185
seed: args.seed,
176186
tests: args.tests,
177187
};

crates/tree-splicer/src/splice.rs

Lines changed: 68 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -4,21 +4,21 @@ use std::collections::{HashMap, HashSet};
44
use rand::{prelude::StdRng, Rng, SeedableRng};
55
use tree_sitter::{Language, Node, Tree};
66

7-
use tree_sitter_edit::{Editor, NodeId};
7+
use tree_sitter_edit::Editor;
88

99
use crate::node_types::NodeTypes;
1010

1111
#[derive(Debug, Default)]
12-
pub struct Edits<'a>(HashMap<usize, &'a [u8]>);
12+
struct Edits(HashMap<usize, Vec<u8>>);
1313

14-
impl<'a> Editor for Edits<'a> {
14+
impl Editor for Edits {
1515
fn has_edit(&self, _tree: &Tree, node: &Node) -> bool {
1616
self.0.get(&node.id()).is_some()
1717
}
1818

1919
fn edit(&self, _source: &[u8], tree: &Tree, node: &Node) -> Vec<u8> {
2020
debug_assert!(self.has_edit(tree, node));
21-
Vec::from(*self.0.get(&node.id()).unwrap())
21+
self.0.get(&node.id()).unwrap().clone()
2222
}
2323
}
2424

@@ -71,15 +71,32 @@ fn parse(language: Language, code: &str) -> tree_sitter::Tree {
7171
parser.parse(code, None).expect("Failed to parse code")
7272
}
7373

74+
/// Splicing configuration
7475
#[derive(Debug)]
7576
pub struct Config {
77+
/// Percent chance to perform chaotic mutation
78+
///
79+
/// Chaotic mutations may result in invalid syntax.
7680
pub chaos: u8,
81+
/// Percent chance to perform a deletion.
82+
///
83+
/// By default, deletes optional nodes. Chaotic deletions delete any node.
7784
pub deletions: u8,
7885
pub language: Language,
7986
// pub intra_splices: usize,
87+
/// Perform at least one and fewer than this many inter-file splices per test.
8088
pub inter_splices: usize,
89+
/// Approximate maximum file size to produce (bytes)
90+
///
91+
/// At least one input test must be no larger than this size, or input selection never terminates.
92+
pub max_size: usize,
8193
pub node_types: NodeTypes,
94+
/// Re-parse the file after this many mutations.
95+
///
96+
/// When this is more than `inter_splices`, re-parse only after the first and last mutations.
97+
pub reparse: usize,
8298
pub seed: u64,
99+
/// How many tests to generate
83100
pub tests: usize,
84101
}
85102

@@ -91,13 +108,21 @@ struct Splicer<'a> {
91108
kinds: Vec<&'static str>,
92109
// intra_splices: usize,
93110
inter_splices: usize,
111+
max_size: usize,
94112
node_types: NodeTypes,
95113
trees: Vec<(&'a [u8], &'a Tree)>,
96114
remaining: usize,
115+
reparse: usize,
97116
rng: StdRng,
98117
}
99118

100119
impl<'a> Splicer<'a> {
120+
fn delta(node: Node<'_>, replace: &[u8]) -> isize {
121+
let range = node.byte_range();
122+
isize::try_from(replace.len()).unwrap_or_default()
123+
- isize::try_from(range.end - range.start).unwrap_or_default()
124+
}
125+
101126
fn pick_usize(&mut self, n: usize) -> usize {
102127
self.rng.gen_range(0..n)
103128
}
@@ -136,23 +161,25 @@ impl<'a> Splicer<'a> {
136161
*nodes.get(self.pick_idx(&nodes)).unwrap()
137162
}
138163

139-
fn delete_node(&mut self, _text: &[u8], tree: &Tree) -> (usize, Vec<u8>) {
164+
fn delete_node(&mut self, _text: &[u8], tree: &Tree) -> (usize, Vec<u8>, isize) {
140165
let chaotic = self.rng.gen_range(0..100) < self.chaos;
141166
if chaotic {
142-
return (self.pick_node(tree).id(), Vec::new());
167+
let node = self.pick_node(tree);
168+
return (node.id(), Vec::new(), Self::delta(node, &[]));
143169
}
144170
let nodes = self.all_nodes(tree);
145171
if nodes.iter().all(|n| !self.node_types.optional_node(n)) {
146-
return (self.pick_node(tree).id(), Vec::new());
172+
let node = self.pick_node(tree);
173+
return (node.id(), Vec::new(), Self::delta(node, &[]));
147174
}
148175
let mut node = nodes.get(self.pick_idx(&nodes)).unwrap();
149176
while !self.node_types.optional_node(node) {
150177
node = nodes.get(self.pick_idx(&nodes)).unwrap();
151178
}
152-
(node.id(), Vec::new())
179+
(node.id(), Vec::new(), Self::delta(*node, &[]))
153180
}
154181

155-
fn splice_node(&mut self, text: &[u8], tree: &Tree) -> (usize, Vec<u8>) {
182+
fn splice_node(&mut self, text: &[u8], tree: &Tree) -> (usize, Vec<u8>, isize) {
156183
let chaotic = self.rng.gen_range(0..100) < self.chaos;
157184

158185
let mut node = tree.root_node();
@@ -188,30 +215,38 @@ impl<'a> Splicer<'a> {
188215
// std::str::from_utf8(&text[node.byte_range()]).unwrap(),
189216
// std::str::from_utf8(candidate).unwrap(),
190217
// );
191-
(node.id(), Vec::from(*candidate))
218+
let replace = Vec::from(*candidate);
219+
let delta = Self::delta(node, replace.as_slice());
220+
(node.id(), replace, delta)
192221
}
193222

194223
fn splice_tree(&mut self, text0: &[u8], mut tree: Tree) -> Option<Vec<u8>> {
195-
let splices = self.rng.gen_range(0..self.inter_splices);
224+
let mut edits = Edits::default();
225+
if self.inter_splices == 0 {
226+
return None;
227+
}
228+
let splices = self.rng.gen_range(1..self.inter_splices);
196229
let mut text = Vec::from(text0);
197-
for _ in 0..splices {
198-
let (id, bytes) = if self.rng.gen_range(0..100) < self.deletions {
230+
let mut sz = isize::try_from(text.len()).unwrap_or_default();
231+
for i in 0..splices {
232+
let (id, bytes, delta) = if self.rng.gen_range(0..100) < self.deletions {
199233
self.delete_node(text.as_slice(), &tree)
200234
} else {
201235
self.splice_node(text.as_slice(), &tree)
202236
};
203-
let id = NodeId { id };
204-
let bytes = bytes.to_vec();
205-
let mut result = Vec::with_capacity(text.len() / 4); // low guesstimate
206-
tree_sitter_edit::render(
207-
&mut result,
208-
&tree,
209-
text.as_slice(),
210-
&tree_sitter_edit::Replace { id, bytes },
211-
)
212-
.ok()?;
213-
text = result.clone();
214-
tree = parse(self.language, &String::from_utf8_lossy(text.as_slice()));
237+
sz += delta;
238+
let sized_out = usize::try_from(sz).unwrap_or_default() >= self.max_size;
239+
edits.0.insert(id, bytes);
240+
if i % self.reparse == 0 || i + 1 == splices || sized_out {
241+
let mut result = Vec::with_capacity(usize::try_from(sz).unwrap_or_default());
242+
tree_sitter_edit::render(&mut result, &tree, text.as_slice(), &edits).ok()?;
243+
text = result.clone();
244+
tree = parse(self.language, &String::from_utf8_lossy(text.as_slice()));
245+
edits = Edits::default();
246+
}
247+
if sized_out {
248+
break;
249+
}
215250
}
216251
Some(text)
217252
}
@@ -226,8 +261,12 @@ impl<'a> Iterator for Splicer<'a> {
226261
}
227262
self.remaining -= 1;
228263

229-
let tree_idx: usize = self.pick_usize(self.trees.len());
230-
let (text, tree) = *self.trees.get(tree_idx).unwrap();
264+
let mut tree_idx: usize = self.pick_usize(self.trees.len());
265+
let (mut text, mut tree) = *self.trees.get(tree_idx).unwrap();
266+
while text.len() > self.max_size {
267+
tree_idx = self.pick_usize(self.trees.len());
268+
(text, tree) = *self.trees.get(tree_idx).unwrap();
269+
}
231270
self.splice_tree(text, tree.clone())
232271
}
233272
}
@@ -261,8 +300,10 @@ pub fn splice<'a>(
261300
kinds,
262301
// intra_splices: config.intra_splices,
263302
inter_splices: config.inter_splices,
303+
max_size: config.max_size,
264304
node_types: config.node_types,
265305
remaining: std::cmp::min(config.tests, possible),
306+
reparse: config.reparse,
266307
rng,
267308
trees,
268309
}

0 commit comments

Comments
 (0)