Skip to content

Commit b77b9ca

Browse files
committed
Handle surrogate-pairs in LFNs.
1 parent c8cc20f commit b77b9ca

File tree

1 file changed

+88
-9
lines changed

1 file changed

+88
-9
lines changed

src/filesystem/filename.rs

Lines changed: 88 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -221,6 +221,7 @@ impl core::fmt::Debug for ShortFileName {
221221
}
222222

223223
/// Used to store a Long File Name
224+
#[derive(Debug)]
224225
pub struct LfnBuffer<'a> {
225226
/// We fill this buffer in from the back
226227
inner: &'a mut [u8],
@@ -230,6 +231,8 @@ pub struct LfnBuffer<'a> {
230231
free: usize,
231232
/// Did we overflow?
232233
overflow: bool,
234+
/// If a surrogate-pair is split over two directory entries, remember half of it here.
235+
unpaired_surrogate: Option<u16>,
233236
}
234237

235238
impl<'a> LfnBuffer<'a> {
@@ -240,19 +243,34 @@ impl<'a> LfnBuffer<'a> {
240243
inner: storage,
241244
free: len,
242245
overflow: false,
246+
unpaired_surrogate: None,
243247
}
244248
}
245249

246250
/// Empty out this buffer
247251
pub fn clear(&mut self) {
248252
self.free = self.inner.len();
249253
self.overflow = false;
254+
self.unpaired_surrogate = None;
250255
}
251256

252-
/// Push the 13 UCS-2 characters into this string
257+
/// Push the 13 UTF-16 code units into this string.
253258
///
254259
/// We assume they are pushed last-chunk-first, as you would find
255260
/// them on disk.
261+
///
262+
/// If a chunk starts with one half of a surrogate pair, that half is saved and appended to the end of the chunk given in the next call.
263+
///
264+
/// ```text
265+
/// [de00, 002e, 0074, 0078, 0074, 0000, ffff, ffff, ffff, ffff, ffff, ffff, ffff]
266+
/// [0041, 0042, 0030, 0031, 0032, 0033, 0034, 0035, 0036, 0037, 0038, 0039, d83d]
267+
///
268+
/// Would map to
269+
///
270+
/// 0041 0042 0030 0031 0032 0033 0034 0035 0036 0037 0038 0039 1f600 002e 0074 0078 0074, or
271+
///
272+
/// "AB0123456789😀.txt"
273+
/// ```
256274
pub fn push(&mut self, buffer: &[u16; 13]) {
257275
// find the first null, if any
258276
let null_idx = buffer
@@ -261,25 +279,70 @@ impl<'a> LfnBuffer<'a> {
261279
.unwrap_or(buffer.len());
262280
// take all the wide chars, up to the null (or go to the end)
263281
let buffer = &buffer[0..null_idx];
264-
for ch in buffer.iter().rev() {
265-
let ch = char::from_u32(*ch as u32).unwrap_or('?');
282+
283+
// This next part will convert the 16-bit values into chars, noting that
284+
// chars outside the Basic Multilingual Plane will require two 16-bit
285+
// values to encode (see UTF-16 Surrogate Pairs).
286+
//
287+
// We cache the decoded chars into this array so we can iterate them
288+
// backwards. It's 60 bytes, but it'll have to do.
289+
let mut char_vec: heapless::Vec<char, 13> = heapless::Vec::new();
290+
// Now do the decode, including the unpaired surrogate (if any) from
291+
// last time (maybe it has a pair now!)
292+
let mut is_first = true;
293+
for ch in char::decode_utf16(
294+
buffer
295+
.iter()
296+
.cloned()
297+
.chain(self.unpaired_surrogate.take().iter().cloned()),
298+
) {
299+
match ch {
300+
Ok(ch) => {
301+
char_vec.push(ch).expect("Vec was full!?");
302+
}
303+
Err(e) => {
304+
// OK, so we found half a surrogate pair and nothing to go
305+
// with it. Was this the first code unit in the chunk?
306+
if is_first {
307+
// it was - the other half is probably in the next chunk
308+
// so save this for next time
309+
trace!("LFN saved {:?}", e.unpaired_surrogate());
310+
self.unpaired_surrogate = Some(e.unpaired_surrogate());
311+
} else {
312+
// it wasn't - we can't deal with these mid-sequence, so
313+
// replace it
314+
trace!("LFN replaced {:?}", e.unpaired_surrogate());
315+
char_vec.push('\u{fffd}').expect("Vec was full?!");
316+
}
317+
}
318+
}
319+
is_first = false;
320+
}
321+
322+
for ch in char_vec.iter().rev() {
266323
trace!("LFN push {:?}", ch);
267-
let mut ch_bytes = [0u8; 4];
268-
// a buffer of length 4 is always enough
269-
let ch_str = ch.encode_utf8(&mut ch_bytes);
270-
if self.free < ch_str.len() {
324+
// a buffer of length 4 is enough to encode any char
325+
let mut encoded_ch = [0u8; 4];
326+
let encoded_ch = ch.encode_utf8(&mut encoded_ch);
327+
if self.free < encoded_ch.len() {
328+
// the LFN buffer they gave us was not long enough. Note for
329+
// later, so we don't show them garbage.
271330
self.overflow = true;
272331
return;
273332
}
274-
// store the encoded character in the buffer, working backwards
275-
for b in ch_str.bytes().rev() {
333+
// Store the encoded char in the buffer, working backwards. We
334+
// already checked there was enough space.
335+
for b in encoded_ch.bytes().rev() {
276336
self.free -= 1;
277337
self.inner[self.free] = b;
278338
}
279339
}
280340
}
281341

282342
/// View this LFN buffer as a string-slice
343+
///
344+
/// If the buffer overflowed while parsing the LFN, or if this buffer is
345+
/// empty, you get an empty string.
283346
pub fn as_str(&self) -> &str {
284347
if self.overflow {
285348
""
@@ -418,6 +481,22 @@ mod test {
418481
]);
419482
assert_eq!(buf.as_str(), "ABCDEFGHIJKLM0123∂");
420483
}
484+
485+
#[test]
486+
fn two_piece_split_surrogate() {
487+
let mut storage = [0u8; 64];
488+
let mut buf: LfnBuffer = LfnBuffer::new(&mut storage);
489+
490+
buf.push(&[
491+
0xde00, 0x002e, 0x0074, 0x0078, 0x0074, 0x0000, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
492+
0xffff, 0xffff,
493+
]);
494+
buf.push(&[
495+
0xd83d, 0xde00, 0x0030, 0x0031, 0x0032, 0x0033, 0x0034, 0x0035, 0x0036, 0x0037, 0x0038,
496+
0x0039, 0xd83d,
497+
]);
498+
assert_eq!(buf.as_str(), "😀0123456789😀.txt");
499+
}
421500
}
422501

423502
// ****************************************************************************

0 commit comments

Comments
 (0)