@@ -23,8 +23,8 @@ class UnigramTokenizer: PreTrainedTokenizerModel {
23
23
// Exposes the token text of `unknownPiece` as the tokenizer's unknown token.
public var unknownToken : String ? { unknownPiece. token }
24
24
25
25
// Baseline score; the initializer scores the unknown piece at `minScore - 10`.
let minScore : Float
26
- let tokensToIds : [ String : Int ]
27
-
26
// Token-to-id lookup table, built in the initializer by enumerating `vocab`.
// This change switches the key type from `String` to `NSString`, so every lookup
// must bridge its key with `as NSString`.
// NOTE(review): presumably done so that keys compare literally instead of by Swift
// `String` Unicode canonical equivalence (which can make distinct vocab entries
// collide in `Dictionary(uniqueKeysWithValues:)`) — confirm against the change description.
+ let tokensToIds : [ NSString : Int ]
27
+
28
28
// Hard-coded beginning-of-sequence token.
// NOTE(review): the literal shows as a single space here; it may have lost
// angle-bracket markup during extraction — verify against the real file.
let bosToken : String ? = " "
29
29
// Id of `bosToken`, looked up in `tokensToIds` by the initializer; may be nil.
let bosTokenId : Int ?
30
30
// End-of-sequence token, read from `tokenizerConfig.eosToken` in the initializer.
let eosToken : String ?
@@ -63,20 +63,20 @@ class UnigramTokenizer: PreTrainedTokenizerModel {
63
63
self . unknownTokenId = unknownTokenId
64
64
self . unknownPiece = SentencePieceToken ( token: vocab [ unknownTokenId] . token, score: minScore - 10 )
65
65
66
- tokensToIds = Dictionary ( uniqueKeysWithValues: vocab. map { $0. token } . enumerated ( ) . map { ( $1, $0) } )
67
- bosTokenId = tokensToIds [ bosToken!] // May be nil
68
-
66
+ tokensToIds = Dictionary ( uniqueKeysWithValues: vocab. map { $0. token as NSString } . enumerated ( ) . map { ( $1, $0) } )
67
+ bosTokenId = tokensToIds [ bosToken! as NSString ] // May be nil
68
+
69
69
eosToken = tokenizerConfig. eosToken? . stringValue
70
- eosTokenId = eosToken == nil ? nil : tokensToIds [ eosToken!]
71
-
70
+ eosTokenId = eosToken == nil ? nil : tokensToIds [ eosToken! as NSString ]
71
+
72
72
trie = Trie ( )
73
73
trie. append ( contentsOf: vocab. map { $0. token } )
74
74
75
75
// TODO: set fuse_unk to true
76
76
}
77
77
78
78
/// Returns the vocabulary id for `token`.
///
/// Looks the token up in `tokensToIds` and falls back to `unknownTokenId`
/// when the token is not present in the table.
///
/// - Parameter token: The token text to look up.
/// - Returns: The id stored for `token`, or `unknownTokenId` if there is none.
func convertTokenToId( _ token: String ) -> Int ? {
79
- return tokensToIds [ token] ?? self . unknownTokenId
79
// The key is bridged to `NSString` because `tokensToIds` is now keyed by `NSString`.
// NOTE(review): presumably this makes the lookup use literal string comparison
// rather than Swift `String` canonical equivalence — confirm.
+ return tokensToIds [ token as NSString ] ?? self . unknownTokenId
80
80
}
81
81
82
82
func convertIdToToken( _ id: Int ) -> String ? {
@@ -95,7 +95,7 @@ class UnigramTokenizer: PreTrainedTokenizerModel {
95
95
96
96
let beginIndex = sentence. index ( sentence. startIndex, offsetBy: beginPos)
97
97
for token in trie. commonPrefixSearchIterator ( sentence [ beginIndex... ] ) . map ( { String ( $0) } ) {
98
- guard let tokenId = tokensToIds [ token] else { fatalError ( " Token not in vocab: \( token) " ) }
98
+ guard let tokenId = tokensToIds [ token as NSString ] else { fatalError ( " Token not in vocab: \( token) " ) }
99
99
let tokenScore = vocab [ tokenId] . score
100
100
lattice. insert ( startOffset: beginPos, length: token. count, score: tokenScore, tokenId: tokenId)
101
101
if !hasSingleNode && token. count == mblen {
0 commit comments