4
4
using System . Linq ;
5
5
using System . Runtime . CompilerServices ;
6
6
7
-
8
7
// C# already has something that is *more or less* equivalent to our C++ simd class:
9
8
// Vector256 https://learn.microsoft.com/en-us/dotnet/api/system.runtime.intrinsics.vector256-1?view=net-7.0
10
9
// I extend it as needed
@@ -84,6 +83,25 @@ namespace SimdUnicode
84
83
public static unsafe class Utf8Utility
85
84
{
86
85
86
+ // Helper functions for debugging
87
+ // string VectorToString(Vector256<byte> vector)
88
+ // {
89
+ // Span<byte> span = stackalloc byte[Vector256<byte>.Count];
90
+ // vector.CopyTo(span);
91
+ // return BitConverter.ToString(span.ToArray());
92
+ // }
93
+
94
+ // string VectorToBinary(Vector256<byte> vector)
95
+ // {
96
+ // Span<byte> span = stackalloc byte[Vector256<byte>.Count];
97
+ // vector.CopyTo(span);
98
+
99
+ // var binaryStrings = span.ToArray().Select(b => Convert.ToString(b, 2).PadLeft(8, '0'));
100
+ // return string.Join(" ", binaryStrings);
101
+ // }
102
+
103
+
104
+
87
105
88
106
89
107
// Returns a pointer to the first invalid byte in the input buffer if it's invalid, or a pointer to the end if it's valid.
@@ -98,14 +116,23 @@ public static unsafe class Utf8Utility
98
116
var checker = new SimdUnicode . utf8_validation . utf8_checker ( ) ;
99
117
int processedLength = 0 ;
100
118
119
+ // Helpers.CheckForGCCollections("Before AVX2 procession");
101
120
while ( processedLength + 32 <= inputLength )
102
121
{
122
+ // Console.WriteLine("-------New AVX2 vector blocked processing!------------");
123
+
103
124
Vector256 < byte > currentBlock = Avx . LoadVector256 ( pInputBuffer + processedLength ) ;
125
+ // Helpers.CheckForGCCollections($"Before check_next_input:{processedLength}");
104
126
checker . check_next_input ( currentBlock ) ;
127
+ // Helpers.CheckForGCCollections($"After check_next_input:{processedLength}");
105
128
106
129
processedLength += 32 ;
130
+
107
131
}
108
132
133
+ // Helpers.CheckForGCCollections("After AVX2 procession");
134
+
135
+
109
136
if ( processedLength < inputLength )
110
137
{
111
138
Span < byte > remainingBytes = stackalloc byte [ 32 ] ;
@@ -121,6 +148,24 @@ public static unsafe class Utf8Utility
121
148
122
149
}
123
150
151
+ // CheckForGCCollections("After processed remaining bytes");
152
+
153
+
154
+ // if (processedLength < inputLength)
155
+ // {
156
+ // // Directly call the scalar function on the remaining part of the buffer
157
+ // byte* invalidBytePointer = GetPointerToFirstInvalidByte(pInputBuffer + processedLength, inputLength - processedLength -1);
158
+
159
+ // // You can then use `invalidBytePointer` as needed, for example:
160
+ // // if (invalidBytePointer != pInputBuffer + inputLength) {
161
+ // // // Handle the case where an invalid byte is found
162
+ // // }
163
+
164
+ // // Update processedLength to reflect the processing done by the scalar function
165
+ // processedLength += (int)(invalidBytePointer - pInputBuffer);
166
+ // }
167
+
168
+
124
169
checker . check_eof ( ) ;
125
170
if ( checker . errors ( ) )
126
171
{
@@ -131,15 +176,19 @@ public static unsafe class Utf8Utility
131
176
}
132
177
}
133
178
134
-
135
- public static class utf8_validation
179
+ // C# docs suggest that classes are allocated on the heap:
180
+ // it doesn't seem to do much in this case, but I thought the suggestion was sensible.
181
+ public struct utf8_validation
136
182
{
137
- public class utf8_checker
183
+ public struct utf8_checker
138
184
{
139
185
Vector256 < byte > error ;
140
186
Vector256 < byte > prev_input_block ;
141
187
Vector256 < byte > prev_incomplete ;
142
188
189
+
190
+
191
+
143
192
public utf8_checker ( )
144
193
{
145
194
error = Vector256 < byte > . Zero ;
@@ -156,11 +205,13 @@ public utf8_checker()
156
205
public void check_next_input ( Vector256 < byte > input )
157
206
{
158
207
// Check if the entire 256-bit vector is ASCII
208
+
159
209
Vector256 < sbyte > inputSBytes = input . AsSByte ( ) ; // Reinterpret the byte vector as sbyte
160
210
int mask = Avx2 . MoveMask ( inputSBytes . AsByte ( ) ) ;
161
211
if ( mask != 0 )
162
212
{
163
213
// Contains non-ASCII characters, process the vector
214
+
164
215
check_utf8_bytes ( input , prev_input_block ) ;
165
216
prev_incomplete = is_incomplete ( input ) ;
166
217
}
@@ -316,21 +367,31 @@ private Vector256<byte> must_be_2_3_continuation(Vector256<byte> prev2, Vector25
316
367
return comparisonResult . AsByte ( ) ;
317
368
}
318
369
370
+
371
+ private static readonly byte [ ] MaxArray = new byte [ 32 ]
372
+ {
373
+ 255 , 255 , 255 , 255 , 255 , 255 , 255 , 255 ,
374
+ 255 , 255 , 255 , 255 , 255 , 255 , 255 , 255 ,
375
+ 255 , 255 , 255 , 255 , 255 , 255 , 255 , 255 ,
376
+ 255 , 255 , 255 , 255 , 255 , 0b11110000 - 1 , 0b11100000 - 1 , 0b11000000 - 1
377
+ } ;
378
+ Vector256 < byte > maxValue = Vector256 . Create ( MaxArray ) ;
379
+
319
380
[ MethodImpl ( MethodImplOptions . AggressiveInlining ) ]
320
381
321
382
private Vector256 < byte > is_incomplete ( Vector256 < byte > input )
322
383
{
323
384
// Console.WriteLine("Input Vector is_incomplete: " + VectorToString(input));
324
- byte [ ] maxArray = new byte [ 32 ]
325
- {
326
- 255 , 255 , 255 , 255 , 255 , 255 , 255 , 255 ,
327
- 255 , 255 , 255 , 255 , 255 , 255 , 255 , 255 ,
328
- 255 , 255 , 255 , 255 , 255 , 255 , 255 , 255 ,
329
- 255 , 255 , 255 , 255 , 255 , 0b11110000 - 1 , 0b11100000 - 1 , 0b11000000 - 1
330
- } ;
331
- Vector256 < byte > max_value = Vector256 . Create ( maxArray ) ;
332
-
333
- Vector256 < byte > result = SaturatingSubtractUnsigned ( input , max_value ) ;
385
+ // byte[] maxArray = new byte[32]
386
+ // {
387
+ // 255, 255, 255, 255, 255, 255, 255, 255,
388
+ // 255, 255, 255, 255, 255, 255, 255, 255,
389
+ // 255, 255, 255, 255, 255, 255, 255, 255,
390
+ // 255, 255, 255, 255, 255, 0b11110000 - 1, 0b11100000 - 1, 0b11000000 - 1
391
+ // };
392
+ // Vector256<byte> max_value = Vector256.Create(maxArray);
393
+
394
+ Vector256 < byte > result = SaturatingSubtractUnsigned ( input , maxValue ) ;
334
395
// Console.WriteLine("Result Vector is_incomplete: " + VectorToString(result));
335
396
336
397
return result ;
@@ -352,25 +413,6 @@ private Vector256<byte> SaturatingSubtractUnsigned(Vector256<byte> left, Vector2
352
413
353
414
return subtractionResult . AsByte ( ) ;
354
415
}
355
-
356
-
357
- // Helper functions for debugging
358
- private string VectorToString ( Vector256 < byte > vector )
359
- {
360
- Span < byte > span = stackalloc byte [ Vector256 < byte > . Count ] ;
361
- vector . CopyTo ( span ) ;
362
- return BitConverter . ToString ( span . ToArray ( ) ) ;
363
- }
364
-
365
- private string VectorToBinary ( Vector256 < byte > vector )
366
- {
367
- Span < byte > span = stackalloc byte [ Vector256 < byte > . Count ] ;
368
- vector . CopyTo ( span ) ;
369
-
370
- var binaryStrings = span . ToArray ( ) . Select ( b => Convert . ToString ( b , 2 ) . PadLeft ( 8 , '0' ) ) ;
371
- return string . Join ( " " , binaryStrings ) ;
372
- }
373
-
374
416
}
375
417
}
376
418
}
0 commit comments