Skip to content

Commit 7c146d9

Browse files
authored
Turns out we introduced a regression because bad code. (#1060)
1 parent 7bfab48 commit 7c146d9

File tree

1 file changed: 30 additions and 4 deletions.

tokenizers/src/decoders/wordpiece.rs

Lines changed: 30 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -49,10 +49,12 @@ impl Decoder for WordPiece {
4949
.iter_mut()
5050
.enumerate()
5151
.map(|(i, token)| {
52-
if token.starts_with(&self.prefix) {
53-
*token = token.replacen(&self.prefix, "", 1);
54-
} else if i != 0 {
55-
*token = format!(" {}", token);
52+
if i != 0 {
53+
if token.starts_with(&self.prefix) {
54+
*token = token.replacen(&self.prefix, "", 1);
55+
} else {
56+
*token = format!(" {}", token);
57+
}
5658
}
5759
if self.cleanup {
5860
*token = cleanup(token);
@@ -62,3 +64,27 @@ impl Decoder for WordPiece {
6264
.collect::<Result<_>>()
6365
}
6466
}
67+
68+
#[cfg(test)]
mod tests {
    use super::*;

    /// Regression test for WordPiece decoding.
    ///
    /// The very first token carries the `##` prefix and must come out
    /// unchanged; later prefixed tokens are glued onto the preceding piece,
    /// and later unprefixed tokens are separated by a single space.
    #[test]
    fn wordpiece_decoder() {
        // Second argument is `false`, as in the original test
        // (presumably the cleanup flag -- confirm against `WordPiece::new`).
        let decoder = WordPiece::new(String::from("##"), false);

        let pieces: Vec<String> = ["##uelo", "Ara", "##új", "##o", "No", "##guera"]
            .iter()
            .map(|piece| piece.to_string())
            .collect();

        let decoded = decoder.decode(pieces).unwrap();

        assert_eq!(decoded, "##uelo Araújo Noguera");
    }
}

0 commit comments

Comments
 (0)