Handle surrogate-pairs in LFNs.

thejpster · thejpster · commit b77b9caf50da · 2024-10-27T13:43:33.000Z
diff --git a/src/filesystem/filename.rs b/src/filesystem/filename.rs
@@ -221,6 +221,7 @@ impl core::fmt::Debug for ShortFileName {
 }
 
 /// Used to store a Long File Name
+#[derive(Debug)]
 pub struct LfnBuffer<'a> {
     /// We fill this buffer in from the back
     inner: &'a mut [u8],
@@ -230,6 +231,8 @@ pub struct LfnBuffer<'a> {
     free: usize,
     /// Did we overflow?
     overflow: bool,
+    /// If a surrogate-pair is split over two directory entries, remember half of it here.
+    unpaired_surrogate: Option<u16>,
 }
 
 impl<'a> LfnBuffer<'a> {
@@ -240,19 +243,34 @@ impl<'a> LfnBuffer<'a> {
             inner: storage,
             free: len,
             overflow: false,
+            unpaired_surrogate: None,
         }
     }
 
     /// Empty out this buffer
     pub fn clear(&mut self) {
         self.free = self.inner.len();
         self.overflow = false;
+        self.unpaired_surrogate = None;
     }
 
-    /// Push the 13 UCS-2 characters into this string
+    /// Push the 13 UTF-16 code units into this string.
     ///
     /// We assume they are pushed last-chunk-first, as you would find
     /// them on disk.
+    ///
+    /// If a chunk starts with half of a surrogate pair, that half is saved for the next call.
+    ///
+    /// ```text
+    /// [de00, 002e, 0074, 0078, 0074, 0000, ffff, ffff, ffff, ffff, ffff, ffff, ffff]
+    /// [0041, 0042, 0030, 0031, 0032, 0033, 0034, 0035, 0036, 0037, 0038, 0039, d83d]
+    ///
+    /// Would map to
+    ///
+    /// 0041 0042 0030 0031 0032 0033 0034 0035 0036 0037 0038 0039 1f600 002e 0074 0078 0074, or
+    ///
+    /// "AB0123456789😀.txt"
+    /// ```
     pub fn push(&mut self, buffer: &[u16; 13]) {
         // find the first null, if any
         let null_idx = buffer
@@ -261,25 +279,70 @@ impl<'a> LfnBuffer<'a> {
             .unwrap_or(buffer.len());
         // take all the wide chars, up to the null (or go to the end)
         let buffer = &buffer[0..null_idx];
-        for ch in buffer.iter().rev() {
-            let ch = char::from_u32(*ch as u32).unwrap_or('?');
+
+        // This next part will convert the 16-bit values into chars, noting that
+        // chars outside the Basic Multilingual Plane will require two 16-bit
+        // values to encode (see UTF-16 Surrogate Pairs).
+        //
+        // We cache the decoded chars into this array so we can iterate them
+        // backwards. It's 60 bytes, but it'll have to do.
+        let mut char_vec: heapless::Vec<char, 13> = heapless::Vec::new();
+        // Now do the decode, including the unpaired surrogate (if any) from
+        // last time (maybe it has a pair now!)
+        let mut is_first = true;
+        for ch in char::decode_utf16(
+            buffer
+                .iter()
+                .cloned()
+                .chain(self.unpaired_surrogate.take().iter().cloned()),
+        ) {
+            match ch {
+                Ok(ch) => {
+                    char_vec.push(ch).expect("Vec was full!?");
+                }
+                Err(e) => {
+                    // OK, so we found half a surrogate pair and nothing to go
+                    // with it. Was this the first codepoint in the chunk?
+                    if is_first {
+                        // it was - the other half is probably in the next chunk
+                        // so save this for next time
+                        trace!("LFN saved {:?}", e.unpaired_surrogate());
+                        self.unpaired_surrogate = Some(e.unpaired_surrogate());
+                    } else {
+                        // it wasn't - we can't deal with these mid-sequence, so
+                        // replace it with U+FFFD
+                        trace!("LFN replaced {:?}", e.unpaired_surrogate());
+                        char_vec.push('\u{fffd}').expect("Vec was full?!");
+                    }
+                }
+            }
+            is_first = false;
+        }
+
+        for ch in char_vec.iter().rev() {
             trace!("LFN push {:?}", ch);
-            let mut ch_bytes = [0u8; 4];
-            // a buffer of length 4 is always enough
-            let ch_str = ch.encode_utf8(&mut ch_bytes);
-            if self.free < ch_str.len() {
+            // a buffer of length 4 is enough to encode any char
+            let mut encoded_ch = [0u8; 4];
+            let encoded_ch = ch.encode_utf8(&mut encoded_ch);
+            if self.free < encoded_ch.len() {
+                // the LFN buffer they gave us was not long enough. Note for
+                // later, so we don't show them garbage.
                 self.overflow = true;
                 return;
             }
-            // store the encoded character in the buffer, working backwards
-            for b in ch_str.bytes().rev() {
+            // Store the encoded char in the buffer, working backwards. We
+            // already checked there was enough space.
+            for b in encoded_ch.bytes().rev() {
                 self.free -= 1;
                 self.inner[self.free] = b;
             }
         }
     }
 
     /// View this LFN buffer as a string-slice
+    ///
+    /// If the buffer overflowed while parsing the LFN, or if this buffer is
+    /// empty, you get an empty string.
     pub fn as_str(&self) -> &str {
         if self.overflow {
             ""
@@ -418,6 +481,22 @@ mod test {
         ]);
         assert_eq!(buf.as_str(), "ABCDEFGHIJKLM0123∂");
     }
+
+    #[test]
+    fn two_piece_split_surrogate() {
+        let mut storage = [0u8; 64];
+        let mut buf: LfnBuffer = LfnBuffer::new(&mut storage);
+
+        buf.push(&[
+            0xde00, 0x002e, 0x0074, 0x0078, 0x0074, 0x0000, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
+            0xffff, 0xffff,
+        ]);
+        buf.push(&[
+            0xd83d, 0xde00, 0x0030, 0x0031, 0x0032, 0x0033, 0x0034, 0x0035, 0x0036, 0x0037, 0x0038,
+            0x0039, 0xd83d,
+        ]);
+        assert_eq!(buf.as_str(), "😀0123456789😀.txt");
+    }
 }
 
 // ****************************************************************************