1
1
//
2
2
// Normalizer.swift
3
- //
3
+ //
4
4
//
5
5
// Created by Pedro Cuenca on 17/7/23.
6
6
//
@@ -11,7 +11,7 @@ import Hub
11
11
public protocol Normalizer {
12
12
func normalize( text: String ) -> String
13
13
func callAsFunction( text: String ) -> String
14
-
14
+
15
15
init ( config: Config )
16
16
}
17
17
@@ -33,6 +33,7 @@ enum NormalizerType: String {
33
33
case Bert
34
34
case Precompiled
35
35
case StripAccents
36
+ case Strip
36
37
case Unknown = " "
37
38
}
38
39
@@ -43,29 +44,32 @@ struct NormalizerFactory {
43
44
let type = NormalizerType ( rawValue: typeName)
44
45
switch type {
45
46
case . Sequence: return NormalizerSequence ( config: config)
46
- case . Prepend : return PrependNormalizer ( config: config)
47
- case . Replace : return ReplaceNormalizer ( config: config)
48
- case . Lowercase : return LowercaseNormalizer ( config: config)
49
- case . NFD : return NFDNormalizer ( config: config)
50
- case . NFC : return NFCNormalizer ( config: config)
51
- case . NFKD : return NFKDNormalizer ( config: config)
52
- case . NFKC : return NFKCNormalizer ( config: config)
53
- case . Bert : return BertNormalizer ( config: config)
54
- case . Precompiled : return PrecompiledNormalizer ( config: config)
55
- case . StripAccents : return StripAccentsNormalizer ( config: config)
56
- default : fatalError ( " Unsupported Normalizer type: \( typeName) " )
47
+ case . Prepend: return PrependNormalizer ( config: config)
48
+ case . Replace: return ReplaceNormalizer ( config: config)
49
+ case . Lowercase: return LowercaseNormalizer ( config: config)
50
+ case . NFD: return NFDNormalizer ( config: config)
51
+ case . NFC: return NFCNormalizer ( config: config)
52
+ case . NFKD: return NFKDNormalizer ( config: config)
53
+ case . NFKC: return NFKCNormalizer ( config: config)
54
+ case . Bert: return BertNormalizer ( config: config)
55
+ case . Precompiled: return PrecompiledNormalizer ( config: config)
56
+ case . StripAccents: return StripAccentsNormalizer ( config: config)
57
+ case . Strip: return StripNormalizer ( config: config)
58
+ default : fatalError ( " Unsupported Normalizer type: \( typeName) " )
57
59
}
58
60
}
59
61
}
60
62
61
63
class NormalizerSequence : Normalizer {
62
64
let normalizers : [ Normalizer ]
63
-
65
+
64
66
required public init ( config: Config ) {
65
- guard let configs = config. normalizers? . arrayValue else { fatalError ( " No normalizers in Sequence " ) }
67
+ guard let configs = config. normalizers? . arrayValue else {
68
+ fatalError ( " No normalizers in Sequence " )
69
+ }
66
70
normalizers = configs. compactMap { NormalizerFactory . fromConfig ( config: $0) }
67
71
}
68
-
72
+
69
73
public func normalize( text: String ) -> String {
70
74
normalizers. reduce ( text) { current, normalizer in
71
75
normalizer ( text: current)
@@ -75,23 +79,23 @@ class NormalizerSequence: Normalizer {
75
79
76
80
class PrependNormalizer : Normalizer {
77
81
let prepend : String
78
-
82
+
79
83
required public init ( config: Config ) {
80
84
prepend = config. prepend? . stringValue ?? " "
81
85
}
82
-
86
+
83
87
public func normalize( text: String ) -> String {
84
88
return prepend + text
85
89
}
86
90
}
87
91
88
92
class ReplaceNormalizer : Normalizer {
89
93
let pattern : StringReplacePattern ?
90
-
94
+
91
95
required public init ( config: Config ) {
92
96
self . pattern = StringReplacePattern . from ( config: config)
93
97
}
94
-
98
+
95
99
public func normalize( text: String ) -> String {
96
100
guard let pattern = pattern else { return text }
97
101
return pattern. replace ( text)
@@ -106,7 +110,7 @@ class LowercaseNormalizer: Normalizer {
106
110
}
107
111
}
108
112
109
- class NFDNormalizer : Normalizer {
113
+ class NFDNormalizer : Normalizer {
110
114
required public init ( config: Config ) { }
111
115
112
116
public func normalize( text: String ) -> String {
@@ -122,7 +126,7 @@ class NFCNormalizer: Normalizer {
122
126
}
123
127
}
124
128
125
- class NFKDNormalizer : Normalizer {
129
+ class NFKDNormalizer : Normalizer {
126
130
required init ( config: Config ) { }
127
131
128
132
func normalize( text: String ) -> String {
@@ -172,15 +176,13 @@ class BertNormalizer: Normalizer {
172
176
private func cleanText( text: String ) -> String {
173
177
text. map { c in
174
178
guard let scalar = c. unicodeScalars. first,
175
- scalar. value != 0x0 ,
176
- scalar. value != 0xFFFD ,
177
- !isControl( scalar)
179
+ scalar. value != 0x0 ,
180
+ scalar. value != 0xFFFD ,
181
+ !isControl( scalar)
178
182
else { return " \( c) " }
179
183
180
184
// Replace whitespace: \t, \n, \r
181
- if scalar. value == 0x009 ||
182
- scalar. value == 0x00A ||
183
- scalar. value == 0x000D {
185
+ if scalar. value == 0x009 || scalar. value == 0x00A || scalar. value == 0x000D {
184
186
return " "
185
187
} else {
186
188
return " \( c) "
@@ -201,29 +203,27 @@ class BertNormalizer: Normalizer {
201
203
}
202
204
203
205
private func isOther( _ c: Unicode . GeneralCategory ) -> Bool {
204
- c == . control ||
205
- c == . format ||
206
- c == . surrogate ||
207
- c == . privateUse ||
208
- c == . unassigned
206
+ c == . control || c == . format || c == . surrogate || c == . privateUse || c == . unassigned
209
207
}
210
208
211
209
private func handleChineseChars( text: String ) -> String {
212
210
text. map { c in
213
211
if let scalar = c. unicodeScalars. first, Utils . isChineseChar ( scalar) {
214
212
" \( c) "
215
213
} else {
216
- " \( c) "
214
+ " \( c) "
217
215
}
218
216
}
219
217
. joined ( )
220
218
}
221
219
222
220
private func stripAccents( text: String ) -> String {
223
221
text. decomposedStringWithCanonicalMapping
224
- . filter { $0. unicodeScalars. allSatisfy { scalar in
225
- !( 0x0300 <= scalar. value && scalar. value <= 0x036F )
226
- } }
222
+ . filter {
223
+ $0. unicodeScalars. allSatisfy { scalar in
224
+ !( 0x0300 <= scalar. value && scalar. value <= 0x036F )
225
+ }
226
+ }
227
227
}
228
228
}
229
229
@@ -245,7 +245,8 @@ class PrecompiledNormalizer: Normalizer {
245
245
case 0x0001 ... 0x0008 , 0x000B , 0x000E ... 0x001F , 0x007F , 0x008F , 0x009F :
246
246
// Non-printing control characters
247
247
output. append ( " " )
248
- case 0x0009 , 0x000A , 0x000C , 0x000D , 0x1680 , 0x200B ... 0x200F , 0x2028 , 0x2029 , 0x2581 , 0xFEFF , 0xFFFD :
248
+ case 0x0009 , 0x000A , 0x000C , 0x000D , 0x1680 , 0x200B ... 0x200F , 0x2028 , 0x2029 , 0x2581 ,
249
+ 0xFEFF , 0xFFFD :
249
250
// Separators
250
251
output. append ( " " )
251
252
case 0xFF5E :
@@ -257,7 +258,8 @@ class PrecompiledNormalizer: Normalizer {
257
258
}
258
259
259
260
if hasFullwidthTilde {
260
- return output
261
+ return
262
+ output
261
263
. split ( by: " \u{FF5E} " )
262
264
. map ( { $0. precomposedStringWithCompatibilityMapping } )
263
265
. joined ( separator: " \u{FF5E} " )
@@ -275,6 +277,30 @@ class StripAccentsNormalizer: Normalizer {
275
277
}
276
278
}
277
279
280
+ class StripNormalizer : Normalizer {
281
+ let leftStrip : Bool
282
+ let rightStrip : Bool
283
+
284
+ required init ( config: Config ) {
285
+ self . leftStrip = config. stripLeft? . boolValue ?? true
286
+ self . rightStrip = config. stripRight? . boolValue ?? true
287
+ }
288
+
289
+ func normalize( text: String ) -> String {
290
+ var result = text
291
+
292
+ if leftStrip {
293
+ result = String ( result. drop ( while: { $0. isWhitespace } ) )
294
+ }
295
+
296
+ if rightStrip {
297
+ result = String ( result. reversed ( ) . drop ( while: { $0. isWhitespace } ) . reversed ( ) )
298
+ }
299
+
300
+ return result
301
+ }
302
+ }
303
+
278
304
enum StringReplacePattern {
279
305
case regexp( regexp: NSRegularExpression , replacement: String )
280
306
case string( pattern: String , replacement: String )
@@ -285,7 +311,8 @@ extension StringReplacePattern {
285
311
switch self {
286
312
case . regexp( let regexp, let replacement) :
287
313
let range = NSRange ( text. startIndex... , in: text)
288
- let replaced = regexp. stringByReplacingMatches ( in: text, options: [ ] , range: range, withTemplate: replacement)
314
+ let replaced = regexp. stringByReplacingMatches (
315
+ in: text, options: [ ] , range: range, withTemplate: replacement)
289
316
return replaced
290
317
case . string( let toReplace, let replacement) :
291
318
return text. replacingOccurrences ( of: toReplace, with: replacement)
0 commit comments