@@ -119,6 +119,25 @@ public static unsafe class Utf8Utility
119
119
// return string.Join(" ", binaryStrings);
120
120
// }
121
121
122
+
123
+
124
+ // par:
125
+ // | Method | FileName | Mean | Error | StdDev | Allocated |
126
+ // |----------------------------------- |----------------------- |-----------:|----------:|----------:|----------:|
127
+ // | CompetitionUtf8ValidationRealData | data/arabic.utf8.txt | 199.315 us | 0.2632 us | 0.2334 us | - |
128
+ // | CompetitionUtf8ValidationErrorData | data/arabic.utf8.txt | 132.782 us | 0.5135 us | 0.4552 us | - |
129
+ // | CompetitionUtf8ValidationRealData | data/chinese.utf8.txt | 29.674 us | 0.3246 us | 0.2710 us | - |
130
+ // | CompetitionUtf8ValidationErrorData | data/chinese.utf8.txt | 5.185 us | 0.0177 us | 0.0148 us | - |
131
+ // | CompetitionUtf8ValidationRealData | data/english.utf8.txt | 16.251 us | 0.2844 us | 0.2793 us | - |
132
+ // | CompetitionUtf8ValidationErrorData | data/english.utf8.txt | 11.119 us | 0.0405 us | 0.0379 us | - |
133
+ // | CompetitionUtf8ValidationRealData | data/french.utf8.txt | 70.772 us | 0.2132 us | 0.1890 us | - |
134
+ // | CompetitionUtf8ValidationErrorData | data/french.utf8.txt | 22.515 us | 0.1278 us | 0.1195 us | - |
135
+ // | CompetitionUtf8ValidationRealData | data/german.utf8.txt | 14.132 us | 0.0722 us | 0.0640 us | - |
136
+ // | CompetitionUtf8ValidationErrorData | data/german.utf8.txt | 6.889 us | 0.0231 us | 0.0205 us | - |
137
+ // | CompetitionUtf8ValidationRealData | data/japanese.utf8.txt | 25.023 us | 0.1017 us | 0.0952 us | - |
138
+ // | CompetitionUtf8ValidationErrorData | data/japanese.utf8.txt | 17.504 us | 0.0712 us | 0.0666 us | - |
139
+ // | CompetitionUtf8ValidationRealData | data/turkish.utf8.txt | 23.755 us | 0.3332 us | 0.3117 us | - |
140
+ // | CompetitionUtf8ValidationErrorData | data/turkish.utf8.txt | 21.983 us | 0.1308 us | 0.1223 us | - |
122
141
public static byte * GetPointerToFirstInvalidByte ( byte * pInputBuffer , int inputLength )
123
142
{
124
143
@@ -146,16 +165,59 @@ public static unsafe class Utf8Utility
146
165
147
166
}
148
167
168
+ // First fix benchmarks (static UTF-8 checker):
169
+ // | Method | FileName | Mean | Error | StdDev | Allocated |
170
+ // |---------------------------- |----------------------- |-----------:|----------:|-----------:|----------:|
171
+ // | SIMDUtf8ValidationRealData | data/arabic.utf8.txt | 478.655 us | 8.9312 us | 15.4059 us | - |
172
+ // | SIMDUtf8ValidationErrorData | data/arabic.utf8.txt | 283.895 us | 5.2810 us | 8.9675 us | - |
173
+ // | SIMDUtf8ValidationRealData | data/chinese.utf8.txt | 134.967 us | 2.6698 us | 5.1438 us | - |
174
+ // | SIMDUtf8ValidationErrorData | data/chinese.utf8.txt | 17.403 us | 0.3361 us | 0.4820 us | - |
175
+ // | SIMDUtf8ValidationRealData | data/english.utf8.txt | 11.186 us | 0.0707 us | 0.0626 us | - |
176
+ // | SIMDUtf8ValidationErrorData | data/english.utf8.txt | 11.167 us | 0.1118 us | 0.0991 us | - |
177
+ // | SIMDUtf8ValidationRealData | data/french.utf8.txt | 13.303 us | 0.2523 us | 0.2236 us | - |
178
+ // | SIMDUtf8ValidationErrorData | data/french.utf8.txt | 13.002 us | 0.1448 us | 0.1284 us | - |
179
+ // | SIMDUtf8ValidationRealData | data/german.utf8.txt | 5.965 us | 0.1016 us | 0.0901 us | - |
180
+ // | SIMDUtf8ValidationErrorData | data/german.utf8.txt | 5.981 us | 0.0683 us | 0.0639 us | - |
181
+ // | SIMDUtf8ValidationRealData | data/japanese.utf8.txt | 138.114 us | 2.6217 us | 3.0191 us | - |
182
+ // | SIMDUtf8ValidationErrorData | data/japanese.utf8.txt | 66.023 us | 1.2819 us | 1.1364 us | - |
183
+ // | SIMDUtf8ValidationRealData | data/turkish.utf8.txt | 168.166 us | 2.4131 us | 2.2572 us | - |
184
+ // | SIMDUtf8ValidationErrorData | data/turkish.utf8.txt | 112.761 us | 2.2175 us | 1.9657 us | - |
185
+
186
+
187
+
149
188
// Process the remaining bytes with the scalar function
189
+ // if (processedLength < inputLength)
190
+ // {
191
+ // byte* invalidBytePointer = SimdUnicode.UTF8.GetPointerToFirstInvalidByte(pInputBuffer + processedLength, inputLength - processedLength);
192
+ // if (invalidBytePointer != pInputBuffer + inputLength)
193
+ // {
194
+ // // An invalid byte was found by the scalar function
195
+ // return invalidBytePointer;
196
+ // }
197
+ // }
150
198
if ( processedLength < inputLength )
151
199
{
152
- byte * invalidBytePointer = SimdUnicode . UTF8 . GetPointerToFirstInvalidByte ( pInputBuffer + processedLength , inputLength - processedLength ) ;
153
- if ( invalidBytePointer != pInputBuffer + inputLength )
200
+
201
+ Span < byte > remainingBytes = stackalloc byte [ 32 ] ;
202
+ new Span < byte > ( pInputBuffer + processedLength , inputLength - processedLength ) . CopyTo ( remainingBytes ) ;
203
+
204
+ ReadOnlySpan < Byte > remainingBytesReadOnly = remainingBytes ;
205
+ Vector256 < byte > remainingBlock = Vector256 . Create ( remainingBytesReadOnly ) ;
206
+ Utf8Validation . utf8_checker . CheckNextInput ( remainingBlock ) ;
207
+
208
+ Utf8Validation . utf8_checker . CheckEof ( ) ;
209
+ if ( Utf8Validation . utf8_checker . Errors ( ) )
154
210
{
155
- // An invalid byte was found by the scalar function
156
- return invalidBytePointer ;
211
+ // return pInputBuffer + processedLength;
212
+ return SimdUnicode . UTF8 . GetPointerToFirstInvalidByte ( pInputBuffer + processedLength , inputLength - processedLength ) ;
157
213
}
214
+ processedLength += inputLength - processedLength ;
215
+
158
216
}
217
+
218
+
219
+
220
+
159
221
return pInputBuffer + inputLength ;
160
222
161
223
}
0 commit comments