Skip to content

Commit 2e85602

Browse files
authored
more cleaning and adding some benchmark results. (#43)
* more cleaning and adding some benchmark results. * Update Benchmark.cs
1 parent 9b2a743 commit 2e85602

File tree

4 files changed

+47
-27
lines changed

4 files changed

+47
-27
lines changed

README.md

Lines changed: 27 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,9 @@ This is a fast C# library to validate UTF-8 strings.
77
## Motivation
88

99
We seek to speed up the `Utf8Utility.GetPointerToFirstInvalidByte` function from the C# runtime library.
10-
[The function is private in the Microsoft Runtime](https://github.com/dotnet/runtime/blob/4d709cd12269fcbb3d0fccfb2515541944475954/src/libraries/System.Private.CoreLib/src/System/Text/Unicode/Utf8Utility.Validation.cs), but we can expose it manually.
10+
[The function is private in the Microsoft Runtime](https://github.com/dotnet/runtime/blob/4d709cd12269fcbb3d0fccfb2515541944475954/src/libraries/System.Private.CoreLib/src/System/Text/Unicode/Utf8Utility.Validation.cs), but we can expose it manually. The C# runtime
11+
function is well optimized and it makes use of advanced CPU instructions. Nevertheless, we propose
12+
an alternative that can be several times faster.
1113

1214
Specifically, we provide the function `SimdUnicode.UTF8.GetPointerToFirstInvalidByte` which is a faster
1315
drop-in replacement:
@@ -35,7 +37,7 @@ We apply the algorithm used by Node.js, Bun, Oracle GraalVM, by the PHP interpre
3537

3638
## Requirements
3739

38-
We recommend you install .NET 8: https://dotnet.microsoft.com/en-us/download/dotnet/8.0
40+
We recommend you install .NET 8 or later: https://dotnet.microsoft.com/en-us/download/dotnet/8.0
3941

4042

4143
## Running tests
@@ -74,8 +76,6 @@ Or to target specific categories:
7476
dotnet test --filter "Category=scalar"
7577
```
7678

77-
78-
7979
## Running Benchmarks
8080

8181
To run the benchmarks, run the following command:
@@ -98,6 +98,28 @@ cd benchmark
9898
sudo dotnet run -c Release
9999
```
100100

101+
## Results (x64)
102+
103+
To be completed.
104+
105+
## Results (ARM)
106+
107+
On an Apple M2 system, our validation function is roughly 1.4 to 3.7 times
108+
faster than the standard library, depending on the data set.
109+
110+
| data set | SimdUnicode speed (GB/s) | .NET speed (GB/s) |
111+
|:----------------|:-----------|:--------------------------|
112+
| Arabic-Lipsum | 6.7 | 3.5 |
113+
| Chinese-Lipsum | 6.7 | 4.8 |
114+
| Emoji-Lipsum | 6.7 | 2.5 |
115+
| Hebrew-Lipsum | 6.7 | 3.5 |
116+
| Hindi-Lipsum | 6.8 | 3.0 |
117+
| Japanese-Lipsum | 6.8 | 4.6  |
118+
| Korean-Lipsum | 6.6 | 1.8 |
119+
| Latin-Lipsum | 87 | 38 |
120+
| Russian-Lipsum | 6.7 | 2.6 |
121+
122+
101123
## Building the library
102124

103125
```
@@ -139,7 +161,7 @@ You can print the content of a vector register like so:
139161

140162
## More reading
141163

142-
- https://github.com/dotnet/coreclr/pull/21948/files#diff-2a22774bd6bff8e217ecbb3a41afad033ce0ca0f33645e9d8f5bdf7c9e3ac248
164+
- [Add optimized UTF-8 validation and transcoding apis, hook them up to UTF8Encoding](https://github.com/dotnet/coreclr/pull/21948/files#diff-2a22774bd6bff8e217ecbb3a41afad033ce0ca0f33645e9d8f5bdf7c9e3ac248)
143165
- https://github.com/dotnet/runtime/issues/41699
144166
- https://learn.microsoft.com/en-us/dotnet/standard/design-guidelines/
145167
- https://learn.microsoft.com/en-us/dotnet/csharp/fundamentals/coding-style/coding-conventions

benchmark/Benchmark.cs

Lines changed: 17 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
using BenchmarkDotNet.Configs;
66
using BenchmarkDotNet.Reports;
77
using BenchmarkDotNet.Filters;
8+
using BenchmarkDotNet.Jobs;
89
using System.Text;
910
using System.Runtime;
1011
using System.Runtime.InteropServices;
@@ -272,10 +273,23 @@ public unsafe void SIMDUtf8ValidationRealDataSse()
272273
}
273274
public class Program
274275
{
275-
static void Main(string[] args) => BenchmarkSwitcher.FromAssembly(typeof(Program).Assembly).Run(args, DefaultConfig.Instance
276-
.WithSummaryStyle(SummaryStyle.Default.WithMaxParameterColumnWidth(100)));
277-
276+
static void Main(string[] args)
277+
{
278+
if (args.Length == 0)
279+
{
280+
args = new string[] { "--filter", "*" };
281+
}
282+
var job = Job.Default
283+
.WithWarmupCount(1)
284+
.WithMinIterationCount(2)
285+
.WithMaxIterationCount(10)
286+
.AsDefault();
278287

288+
BenchmarkSwitcher.FromAssembly(typeof(Program).Assembly).Run(args, DefaultConfig.Instance.AddJob(job).WithSummaryStyle(SummaryStyle.Default.WithMaxParameterColumnWidth(100)));
289+
}
279290
}
280291

292+
293+
// }
294+
281295
}

benchmark/UTF8_runtime.cs

Lines changed: 2 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -9,22 +9,8 @@
99
using System.Runtime.Intrinsics.Arm;
1010
using System.Runtime.Intrinsics.X86;
1111

12-
// Changes made from the Runtime (most of the stuff in the runtime is behind some private/internal class or some such. The path of least resistance was to copy paste.
13-
// Copy pasted from: https://source.dot.net/#System.Private.CoreLib/src/libraries/System.Private.CoreLib/src/System/Text/Unicode/Utf8Utility.Helpers.cs,ca18aaa87cbdbfe8
14-
// https://source.dot.net/#System.Text.Encodings.Web/src/libraries/System.Private.CoreLib/src/System/Text/UnicodeUtility.cs,7d9353a7dd29c82b
15-
// https://source.dot.net/#System.Text.Encodings.Web/src/libraries/System.Private.CoreLib/src/System/Text/UnicodeDebug.cs,ac87f1ec2c614dce
16-
// https://source.dot.net/#System.Private.CoreLib/src/libraries/System.Private.CoreLib/src/System/Text/Unicode/Utf8Utility.Helpers.cs,0ca89dfb3ec2a5da
17-
// https://source.dot.net/#System.Private.CoreLib/src/libraries/System.Private.CoreLib/src/System/Text/Unicode/Utf8Utility.Helpers.cs,87a9969a4e35fdde
18-
// https://source.dot.net/#System.Private.CoreLib/src/libraries/System.Private.CoreLib/src/System/Text/Unicode/Utf8Utility.Helpers.cs,ff5eb1221e0665eb
19-
// https://source.dot.net/#System.Private.CoreLib/src/libraries/System.Private.CoreLib/src/System/Text/Unicode/Utf8Utility.Helpers.cs,068a221b81a99840
20-
// https://source.dot.net/#System.Private.CoreLib/src/libraries/System.Private.CoreLib/src/System/Text/Unicode/Utf8Utility.Helpers.cs,74fc59ef51d5afa8
21-
// https://source.dot.net/#System.Private.CoreLib/src/libraries/System.Private.CoreLib/src/System/Text/Unicode/Utf8Utility.Helpers.cs,7fc454ccbfa2fc31
22-
// https://source.dot.net/#System.Private.CoreLib/src/libraries/System.Private.CoreLib/src/System/Text/Unicode/Utf8Utility.Helpers.cs,70ca16f03c3cc41b
23-
// https://source.dot.net/#System.Private.CoreLib/src/libraries/System.Private.CoreLib/src/System/Text/Unicode/Utf8Utility.Helpers.cs,5a3126ae65f71ef2
24-
// https://source.dot.net/#System.Private.CoreLib/src/libraries/System.Private.CoreLib/src/System/Text/Unicode/Utf8Utility.Helpers.cs,6ee61246657067fb
25-
// https://source.dot.net/#System.Private.CoreLib/src/libraries/System.Private.CoreLib/src/System/Text/Unicode/Utf8Utility.Helpers.cs,706384069881fe22
26-
//
27-
12+
// Copied from System/Text/Unicode/Utf8Utility.Helpers.cs and associated files in the .NET runtime.
13+
// Important: copyright belongs to the .NET Foundation.
2814
namespace DotnetRuntime
2915
{
3016

src/UTF8.cs

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,7 @@ public static class UTF8
2424
public unsafe static byte* GetPointerToFirstInvalidByte(byte* pInputBuffer, int inputLength, out int Utf16CodeUnitCountAdjustment, out int ScalarCodeUnitCountAdjustment)
2525
{
2626

27-
if (AdvSimd.Arm64.IsSupported)
27+
if (AdvSimd.Arm64.IsSupported && BitConverter.IsLittleEndian)
2828
{
2929
return GetPointerToFirstInvalidByteArm64(pInputBuffer, inputLength, out Utf16CodeUnitCountAdjustment, out ScalarCodeUnitCountAdjustment);
3030
}
@@ -707,7 +707,6 @@ private unsafe static (int utfadjust, int scalaradjust) calculateErrorPathadjust
707707
return GetPointerToFirstInvalidByteScalar(pInputBuffer + processedLength, inputLength - processedLength, out utf16CodeUnitCountAdjustment, out scalarCountAdjustment);
708708
}
709709

710-
//
711710
public unsafe static byte* GetPointerToFirstInvalidByteAvx2(byte* pInputBuffer, int inputLength, out int utf16CodeUnitCountAdjustment, out int scalarCountAdjustment)
712711
{
713712
int processedLength = 0;
@@ -851,7 +850,6 @@ private unsafe static (int utfadjust, int scalaradjust) calculateErrorPathadjust
851850
{
852851
// We have an ASCII block, no need to process it, but
853852
// we need to check if the previous block was incomplete.
854-
//
855853
if (!Avx2.TestZ(prevIncomplete, prevIncomplete))
856854
{
857855
int off = processedLength >= 3 ? processedLength - 3 : processedLength;

0 commit comments

Comments
 (0)