Skip to content

Commit cea9d90

Browse files
tarekgh and ericstj authored
Add Tiktoken Synchronous Creation Using Model Name (#7080)
* Add Tiktoken Synchronous Creation Using Model Name * Add RemoteExecutor to Tokenizers tests * Address the feedback * Add tests --------- Co-authored-by: Eric StJohn <ericstj@microsoft.com>
1 parent c69c4a0 commit cea9d90

File tree

8 files changed

+114
-30
lines changed

8 files changed

+114
-30
lines changed

eng/Version.Details.xml

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,11 @@
2323
<Uri>https://github.com/dotnet/arcade</Uri>
2424
<Sha>812d978c303174dc1aa305d7359e79053d7d4971</Sha>
2525
</Dependency>
26+
<!-- Stay on package 8.0 until we stop testing for net6.0
27+
<Dependency Name="Microsoft.DotNet.RemoteExecutor" Version="9.0.0-beta.24165.3">
28+
<Uri>https://github.com/dotnet/arcade</Uri>
29+
<Sha>812d978c303174dc1aa305d7359e79053d7d4971</Sha>
30+
</Dependency> -->
2631
<Dependency Name="Microsoft.DotNet.SwaggerGenerator.MSBuild" Version="9.0.0-beta.24165.3">
2732
<Uri>https://github.com/dotnet/arcade</Uri>
2833
<Sha>812d978c303174dc1aa305d7359e79053d7d4971</Sha>

eng/Versions.props

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -79,6 +79,7 @@
7979
<DotNetRuntime80Version>8.0.1</DotNetRuntime80Version>
8080
<FluentAssertionVersion>5.10.2</FluentAssertionVersion>
8181
<MicrosoftCodeAnalysisTestingVersion>1.1.2-beta1.23431.1</MicrosoftCodeAnalysisTestingVersion>
82+
<MicrosoftDotNetRemoteExecutorVersion>8.0.0-beta.24165.4</MicrosoftDotNetRemoteExecutorVersion>
8283
<MicrosoftDotNetXUnitExtensionsVersion>9.0.0-beta.24165.3</MicrosoftDotNetXUnitExtensionsVersion>
8384
<MicrosoftExtensionsDependencyModelVersion>2.1.0</MicrosoftExtensionsDependencyModelVersion>
8485
<MicrosoftExtensionsTestVersion>3.0.1</MicrosoftExtensionsTestVersion>

src/Microsoft.ML.Tokenizers/Model/Tiktoken.cs

Lines changed: 56 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -112,7 +112,7 @@ private Tiktoken(Stream vocabStream, IReadOnlyDictionary<string, int>? specialTo
112112
/// <param name="cacheSize">The size of the cache to use.</param>
113113
/// <param name="normalizer">To normalize the text before tokenization</param>
114114
/// <returns>The tokenizer</returns>
115-
public static Tokenizer CreateByModelName(
115+
public static Tokenizer CreateTokenizerForModel(
116116
string modelName,
117117
Stream vocabStream,
118118
IReadOnlyDictionary<string, int>? extraSpecialTokens = null,
@@ -124,7 +124,7 @@ public static Tokenizer CreateByModelName(
124124
throw new ArgumentNullException(nameof(modelName));
125125
}
126126

127-
(Dictionary<string, int> SpecialTokens, Regex Regex) tiktokenConfiguration = GetTiktokenConfigurations(modelName);
127+
(Dictionary<string, int> SpecialTokens, Regex Regex, string _) tiktokenConfiguration = GetTiktokenConfigurations(modelName);
128128

129129
if (extraSpecialTokens is not null)
130130
{
@@ -150,7 +150,7 @@ public static Tokenizer CreateByModelName(
150150
/// <param name="normalizer">To normalize the text before tokenization</param>
151151
/// <param name="cancellationToken"><see cref="CancellationToken"/> used to request cancellation of the operation.</param>
152152
/// <returns>The tokenizer</returns>
153-
public static async Task<Tokenizer> CreateByModelNameAsync(
153+
public static async Task<Tokenizer> CreateTokenizerForModelAsync(
154154
string modelName,
155155
Stream vocabStream,
156156
IReadOnlyDictionary<string, int>? extraSpecialTokens = null,
@@ -163,7 +163,7 @@ public static async Task<Tokenizer> CreateByModelNameAsync(
163163
throw new ArgumentNullException(nameof(modelName));
164164
}
165165

166-
(Dictionary<string, int> SpecialTokens, Regex Regex) tiktokenConfiguration = GetTiktokenConfigurations(modelName);
166+
(Dictionary<string, int> SpecialTokens, Regex Regex, string _) tiktokenConfiguration = GetTiktokenConfigurations(modelName);
167167

168168
if (extraSpecialTokens is not null)
169169
{
@@ -738,31 +738,30 @@ private static ModelEncoding GetModelEncoding(string modelName)
738738
return encoder;
739739
}
740740

741-
internal static (Dictionary<string, int> SpecialTokens, Regex Regex) GetTiktokenConfigurations(string modelName)
741+
internal static (Dictionary<string, int> SpecialTokens, Regex Regex, string Url) GetTiktokenConfigurations(string modelName)
742742
{
743743
ModelEncoding modelEncoding = GetModelEncoding(modelName);
744744

745745
switch (modelEncoding)
746746
{
747747
case ModelEncoding.Cl100kBase:
748748
return (new Dictionary<string, int>
749-
{ { EndOfText, 100257}, { FimPrefix, 100258}, { FimMiddle, 100259}, { FimSuffix, 100260}, { EndOfPrompt, 100276} }, Cl100kBaseRegex());
749+
{ { EndOfText, 100257}, { FimPrefix, 100258}, { FimMiddle, 100259}, { FimSuffix, 100260}, { EndOfPrompt, 100276} }, Cl100kBaseRegex(), Cl100kBaseVocabUrl);
750750

751751
case ModelEncoding.P50kBase:
752-
return (new Dictionary<string, int> { { EndOfText, 50256 } }, P50kBaseRegex());
752+
return (new Dictionary<string, int> { { EndOfText, 50256 } }, P50kBaseRegex(), P50RanksUrl);
753753

754754
case ModelEncoding.P50kEdit:
755755
return (new Dictionary<string, int>
756-
{ { EndOfText, 50256 }, { FimPrefix, 50281 }, { FimMiddle, 50282 }, { FimSuffix, 50283 } }, P50kBaseRegex());
756+
{ { EndOfText, 50256 }, { FimPrefix, 50281 }, { FimMiddle, 50282 }, { FimSuffix, 50283 } }, P50kBaseRegex(), P50RanksUrl);
757757

758758
case ModelEncoding.R50kBase:
759-
return (new Dictionary<string, int> { { EndOfText, 50256 } }, P50kBaseRegex());
759+
return (new Dictionary<string, int> { { EndOfText, 50256 } }, P50kBaseRegex(), R50RanksUrl);
760760

761761
case ModelEncoding.GPT2:
762-
return (new Dictionary<string, int> { { EndOfText, 50256 }, }, P50kBaseRegex());
762+
return (new Dictionary<string, int> { { EndOfText, 50256 }, }, P50kBaseRegex(), GPT2Url);
763763

764764
default:
765-
Debug.Assert(false, $"Unexpected encoder [{modelEncoding}]");
766765
throw new NotSupportedException($"The model '{modelName}' is not supported.");
767766
}
768767
}
@@ -775,22 +774,64 @@ internal static (Dictionary<string, int> SpecialTokens, Regex Regex) GetTiktoken
775774
/// <param name="normalizer">To normalize the text before tokenization</param>
776775
/// <param name="cancellationToken"><see cref="CancellationToken"/> used to request cancellation of the operation.</param>
777776
/// <returns>The tokenizer</returns>
778-
public static Task<Tokenizer> CreateByModelNameAsync(
777+
public static Task<Tokenizer> CreateTokenizerForModelAsync(
779778
string modelName,
780779
IReadOnlyDictionary<string, int>? extraSpecialTokens = null,
781780
Normalizer? normalizer = null,
782781
CancellationToken cancellationToken = default)
783782
{
784783
try
785784
{
786-
return CreateByEncoderNameAsync(modelName, GetModelEncoding(modelName), extraSpecialTokens, normalizer, cancellationToken);
785+
return CreateByEncoderNameAsync(GetModelEncoding(modelName), extraSpecialTokens, normalizer, cancellationToken);
787786
}
788787
catch (Exception ex)
789788
{
790789
return Task.FromException<Tokenizer>(ex);
791790
}
792791
}
793792

793+
/// <summary>
794+
/// Create tokenizer based on model name
795+
/// </summary>
796+
/// <param name="modelName">Model name</param>
797+
/// <param name="extraSpecialTokens">Extra special tokens other than the built-in ones for the model</param>
798+
/// <param name="normalizer">To normalize the text before tokenization</param>
799+
/// <returns>The tokenizer</returns>
800+
public static Tokenizer CreateTokenizerForModel(
801+
string modelName,
802+
IReadOnlyDictionary<string, int>? extraSpecialTokens = null,
803+
Normalizer? normalizer = null)
804+
{
805+
if (string.IsNullOrEmpty(modelName))
806+
{
807+
throw new ArgumentNullException(nameof(modelName));
808+
}
809+
810+
(Dictionary<string, int> SpecialTokens, Regex Regex, string Url) tiktokenConfiguration = GetTiktokenConfigurations(modelName);
811+
812+
if (extraSpecialTokens is not null)
813+
{
814+
foreach (var extraSpecialToken in extraSpecialTokens)
815+
{
816+
tiktokenConfiguration.SpecialTokens.Add(extraSpecialToken.Key, extraSpecialToken.Value);
817+
}
818+
}
819+
820+
if (!_tiktokenCache.TryGetValue(tiktokenConfiguration.Url,
821+
out (Dictionary<ReadOnlyMemory<byte>, int> encoder, Dictionary<StringSpanOrdinalKey, int> vocab, Dictionary<int, ReadOnlyMemory<byte>> decoder) cache))
822+
{
823+
using Stream stream = Helpers.GetStream(_httpClient, tiktokenConfiguration.Url);
824+
cache = LoadTikTokenBpeAsync(stream, useAsync: false).GetAwaiter().GetResult();
825+
826+
_tiktokenCache.TryAdd(tiktokenConfiguration.Url, cache);
827+
}
828+
829+
return new Tokenizer(
830+
new Tiktoken(cache.encoder, cache.decoder, cache.vocab, tiktokenConfiguration.SpecialTokens, LruCache<int[]>.DefaultCacheSize),
831+
new TikTokenPreTokenizer(tiktokenConfiguration.Regex, tiktokenConfiguration.SpecialTokens),
832+
normalizer);
833+
}
834+
794835
// Regex patterns based on https://github.com/openai/tiktoken/blob/main/tiktoken_ext/openai_public.py
795836

796837
private const string Cl100kBaseRegexPattern = /*lang=regex*/ @"'(?i:[sdmt]|re|ve|ll)|(?>[^\r\n\p{L}\p{N}]?)\p{L}+|\p{N}{1,3}| ?(?>[^\s\p{L}\p{N}]+)[\r\n]*|\s*[\r\n]|\s+(?!\S)|\s+";
@@ -818,15 +859,13 @@ public static Task<Tokenizer> CreateByModelNameAsync(
818859
/// <summary>
819860
/// Create tokenizer based on encoder name and extra special tokens
820861
/// </summary>
821-
/// <param name="modelName">Model name</param>
822862
/// <param name="modelEncoding">Encoder label</param>
823863
/// <param name="extraSpecialTokens">Extra special tokens other than the built-in ones for the encoder</param>
824864
/// <param name="normalizer">To normalize the text before tokenization</param>
825865
/// <param name="cancellationToken"><see cref="CancellationToken"/> used to request cancellation of the operation.</param>
826866
/// <returns>The tokenizer</returns>
827867
/// <exception cref="NotSupportedException">Throws if the model name is not supported</exception>
828868
private static Task<Tokenizer> CreateByEncoderNameAsync(
829-
string modelName,
830869
ModelEncoding modelEncoding,
831870
IReadOnlyDictionary<string, int>? extraSpecialTokens,
832871
Normalizer? normalizer,
@@ -857,8 +896,7 @@ private static Task<Tokenizer> CreateByEncoderNameAsync(
857896
return CreateTikTokenTokenizerAsync(P50kBaseRegex(), GPT2Url, specialTokens, extraSpecialTokens, normalizer, cancellationToken);
858897

859898
default:
860-
Debug.Assert(false, $"Unexpected encoder [{modelEncoding}]");
861-
throw new NotSupportedException($"The model '{modelName}' is not supported.");
899+
throw new NotSupportedException($"The encoder '{modelEncoding}' is not supported.");
862900
}
863901
}
864902

@@ -894,7 +932,7 @@ private static async Task<Tokenizer> CreateTikTokenTokenizerAsync(
894932
{
895933
using (Stream stream = await Helpers.GetStreamAsync(_httpClient, mergeableRanksFileUrl, cancellationToken).ConfigureAwait(false))
896934
{
897-
cache = await Tiktoken.LoadTikTokenBpeAsync(stream, useAsync: true, cancellationToken).ConfigureAwait(false);
935+
cache = await LoadTikTokenBpeAsync(stream, useAsync: true, cancellationToken).ConfigureAwait(false);
898936
}
899937

900938
_tiktokenCache.TryAdd(mergeableRanksFileUrl, cache);

src/Microsoft.ML.Tokenizers/Utils/Helpers.netcoreapp.cs

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -18,9 +18,16 @@ internal static class Helpers
1818
public static ValueTask<string?> ReadLineAsync(StreamReader reader, CancellationToken cancellationToken) =>
1919
reader.ReadLineAsync(cancellationToken);
2020

21-
public static Task<Stream> GetStreamAsync(HttpClient client, string url, CancellationToken cancellationToken) =>
21+
public static Task<Stream> GetStreamAsync(HttpClient client, string url, CancellationToken cancellationToken = default) =>
2222
client.GetStreamAsync(url, cancellationToken);
2323

24+
public static Stream GetStream(HttpClient client, string url)
25+
{
26+
HttpResponseMessage response = client.Send(new HttpRequestMessage(HttpMethod.Get, url), HttpCompletionOption.ResponseHeadersRead);
27+
response.EnsureSuccessStatusCode();
28+
return response.Content.ReadAsStream();
29+
}
30+
2431
public static byte[] FromBase64String(string base64String, int offset, int length)
2532
{
2633
if (!Base64.IsValid(base64String.AsSpan(offset, length), out int decodedLength))

src/Microsoft.ML.Tokenizers/Utils/Helpers.netstandard.cs

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -18,13 +18,15 @@ public static ValueTask<string> ReadLineAsync(StreamReader reader, CancellationT
1818
return new ValueTask<string>(reader.ReadLineAsync());
1919
}
2020

21-
public static async Task<Stream> GetStreamAsync(HttpClient client, string url, CancellationToken cancellationToken)
21+
public static async Task<Stream> GetStreamAsync(HttpClient client, string url, CancellationToken cancellationToken = default)
2222
{
2323
HttpResponseMessage response = await client.GetAsync(url, HttpCompletionOption.ResponseHeadersRead, cancellationToken).ConfigureAwait(false);
2424
response.EnsureSuccessStatusCode();
2525
return await response.Content.ReadAsStreamAsync().ConfigureAwait(false);
2626
}
2727

28+
public static Stream GetStream(HttpClient client, string url) => client.GetStreamAsync(url).GetAwaiter().GetResult();
29+
2830
public static byte[] FromBase64String(string base64String, int offset, int length) => Convert.FromBase64String(base64String.Substring(offset, length));
2931

3032
// Not support signed number

test/Microsoft.ML.TestFramework/Microsoft.ML.TestFramework.csproj

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -34,8 +34,7 @@
3434
</ItemGroup>
3535

3636
<ItemGroup Condition="'$(TargetFramework)' != 'net462'">
37-
<!-- This reference will be updated to use DARC in a subsequent PR so we can leave the version here as is -->
38-
<PackageReference Include="Microsoft.DotNet.RemoteExecutor" Version="7.0.0-beta.21456.1" />
37+
<PackageReference Include="Microsoft.DotNet.RemoteExecutor" Version="$(MicrosoftDotNetRemoteExecutorVersion)" />
3938
</ItemGroup>
4039

4140
</Project>

test/Microsoft.ML.Tokenizers.Tests/Microsoft.ML.Tokenizers.Tests.csproj

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,7 @@
3636
</ItemGroup>
3737

3838
<ItemGroup>
39+
<PackageReference Include="Microsoft.DotNet.RemoteExecutor" Version="$(MicrosoftDotNetRemoteExecutorVersion)" />
3940
<PackageReference Include="System.Text.Json" Version="$(SystemTextJsonVersion)" />
4041
</ItemGroup>
4142

test/Microsoft.ML.Tokenizers.Tests/TitokenTests.cs

Lines changed: 39 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
// The .NET Foundation licenses this file to you under the MIT license.
33
// See the LICENSE file in the project root for more information.
44

5+
using Microsoft.DotNet.RemoteExecutor;
56
using Microsoft.ML.Tokenizers;
67
using System;
78
using System.Collections.Generic;
@@ -25,11 +26,11 @@ public class TiktokenTests
2526
{ IMEnd, 100265},
2627
};
2728

28-
public static Tokenizer GPT4 { get; } = Tiktoken.CreateByModelNameAsync("gpt-4", _specialTokens).GetAwaiter().GetResult();
29-
public static Tokenizer GPT2 { get; } = Tiktoken.CreateByModelNameAsync("gpt2").GetAwaiter().GetResult();
30-
public static Tokenizer P50kBase { get; } = Tiktoken.CreateByModelNameAsync("text-davinci-003").GetAwaiter().GetResult();
31-
public static Tokenizer R50kBase { get; } = Tiktoken.CreateByModelNameAsync("ada").GetAwaiter().GetResult();
32-
public static Tokenizer P50kEdit { get; } = Tiktoken.CreateByModelNameAsync("text-davinci-edit-001").GetAwaiter().GetResult();
29+
public static Tokenizer GPT4 { get; } = Tiktoken.CreateTokenizerForModelAsync("gpt-4", _specialTokens).GetAwaiter().GetResult();
30+
public static Tokenizer GPT2 { get; } = Tiktoken.CreateTokenizerForModelAsync("gpt2").GetAwaiter().GetResult();
31+
public static Tokenizer P50kBase { get; } = Tiktoken.CreateTokenizerForModelAsync("text-davinci-003").GetAwaiter().GetResult();
32+
public static Tokenizer R50kBase { get; } = Tiktoken.CreateTokenizerForModelAsync("ada").GetAwaiter().GetResult();
33+
public static Tokenizer P50kEdit { get; } = Tiktoken.CreateTokenizerForModelAsync("text-davinci-edit-001").GetAwaiter().GetResult();
3334

3435
[Fact]
3536
public async void TestTokenizerCreation()
@@ -64,15 +65,18 @@ public async void TestTokenizerCreation()
6465

6566
using (Stream stream = File.OpenRead(tokenizerDataFileName))
6667
{
67-
tokenizer = Tiktoken.CreateByModelName("gpt-4", stream);
68+
tokenizer = Tiktoken.CreateTokenizerForModel("gpt-4", stream);
6869
}
6970
TestGPT4TokenizationEncoding(tokenizer);
7071

7172
using (Stream stream = File.OpenRead(tokenizerDataFileName))
7273
{
73-
tokenizer = await Tiktoken.CreateByModelNameAsync("gpt-3.5-turbo", stream);
74+
tokenizer = await Tiktoken.CreateTokenizerForModelAsync("gpt-3.5-turbo", stream);
7475
}
7576
TestGPT4TokenizationEncoding(tokenizer);
77+
78+
tokenizer = Tiktoken.CreateTokenizerForModel("gpt-4");
79+
TestGPT4TokenizationEncoding(tokenizer);
7680
}
7781
finally
7882
{
@@ -298,11 +302,38 @@ public void TestEncodeR50kBase()
298302
[InlineData("gpt2")]
299303
public async void TestAllSupportedModelNames(string modelName)
300304
{
301-
Tokenizer tokenizer = await Tiktoken.CreateByModelNameAsync(modelName);
305+
Tokenizer tokenizer = Tiktoken.CreateTokenizerForModel(modelName);
306+
Assert.NotNull(tokenizer.Model);
307+
Assert.NotNull(tokenizer.PreTokenizer);
308+
309+
tokenizer = await Tiktoken.CreateTokenizerForModelAsync(modelName);
302310
Assert.NotNull(tokenizer.Model);
303311
Assert.NotNull(tokenizer.PreTokenizer);
304312
}
305313

314+
[InlineData("gpt-4")]
315+
[InlineData("text-davinci-003")]
316+
[InlineData("text-curie-001")]
317+
[InlineData("text-davinci-edit-001")]
318+
[ConditionalTheory(typeof(RemoteExecutor), nameof(RemoteExecutor.IsSupported))]
319+
public void TestCreationUsingModel(string modelName)
320+
{
321+
// Execute remotely to ensure no caching is used.
322+
RemoteExecutor.Invoke(static async (name) =>
323+
{
324+
Tokenizer tokenizer = await Tiktoken.CreateTokenizerForModelAsync(name);
325+
Assert.NotNull(tokenizer.Model);
326+
Assert.NotNull(tokenizer.PreTokenizer);
327+
}, modelName).Dispose();
328+
329+
RemoteExecutor.Invoke(static (name) =>
330+
{
331+
Tokenizer tokenizer = Tiktoken.CreateTokenizerForModel(name);
332+
Assert.NotNull(tokenizer.Model);
333+
Assert.NotNull(tokenizer.PreTokenizer);
334+
}, modelName).Dispose();
335+
}
336+
306337
// Test running copy the test data files to the output folder but sometimes the file content is mutated replacing '\n' with '\r\n'.
307338
// This method reads the file and removes the extra inserted '\r' characters. Having '\r' in the file content will cause the tests to fail.
308339
private string ReadAndSanitizeFile(string path)

0 commit comments

Comments (0)