Skip to content

Commit 4ce907b

Browse files
authored
if the tokenizer produces an incomplete character do not consume (#261)
- fix #252
1 parent 96cde3f commit 4ce907b

File tree

1 file changed

+6
-0
lines changed

1 file changed

+6
-0
lines changed

Libraries/MLXLMCommon/Tokenizer.swift

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -136,6 +136,12 @@ public struct NaiveStreamingDetokenizer: StreamingDetokenizer {
136136
let newSegment = tokenizer.decode(tokens: segmentTokens)
137137
let new = newSegment.suffix(newSegment.count - segment.count)
138138

139+
// if the new segment ends with U+FFFD REPLACEMENT CHARACTER this means
140+
// that the token didn't produce a complete Unicode character
141+
if new.last == "\u{fffd}" {
142+
return nil
143+
}
144+
139145
if new.hasSuffix("\n") {
140146
startNewSegment()
141147
} else {

0 commit comments

Comments
 (0)