openai · ReNothingg · Jun 20, 2025
diff --git a/scripts/benchmark.py b/scripts/benchmark.py
@@ -1,39 +1,58 @@
-import base64
-import functools
-import gzip
-import json
-import os
-import random
+import gc
 import time
-from typing import Any, cast
-
-import blobfile
-
+import statistics
 import tiktoken
-
-
-def benchmark_batch(documents: list[str]) -> None:
-    num_threads = int(os.environ["RAYON_NUM_THREADS"])
-    num_bytes = sum(map(len, map(str.encode, documents)))
-    print(f"num_threads: {num_threads}, num_bytes: {num_bytes}")
-
+from transformers import GPT2TokenizerFast
+
+
+def measure(fn, docs, repeats=5):
+    """Time ``fn(docs)`` and return ``(mean, stdev)`` in nanoseconds.
+
+    ``fn(docs)`` runs once untimed as a warm-up, then ``repeats`` more
+    times, each timed with ``time.perf_counter_ns`` while the garbage
+    collector is paused. The stdev is 0.0 when fewer than two samples
+    exist, because a sample standard deviation needs at least two.
+
+    Raises:
+        statistics.StatisticsError: if ``repeats`` is less than 1.
+    """
+    fn(docs)  # warm-up run, excluded from the timings
+    gc_was_enabled = gc.isenabled()
+    times = []
+    for _ in range(repeats):
+        gc.disable()
+        try:
+            start = time.perf_counter_ns()
+            fn(docs)
+            end = time.perf_counter_ns()
+        finally:
+            # Restore the collector even if fn raises, but never switch
+            # it on for a caller that had it off.
+            if gc_was_enabled:
+                gc.enable()
+        times.append(end - start)
+    spread = statistics.stdev(times) if len(times) > 1 else 0.0
+    return statistics.mean(times), spread
+
+
+if __name__ == "__main__":
+    # NOTE(review): ``[...]`` is a placeholder (a list holding Ellipsis), so
+    # ``d.encode()`` below fails until real document strings are filled in.
+    docs = [...]
+    num_bytes = sum(len(d.encode()) for d in docs)
+
+    # tiktoken
     enc = tiktoken.get_encoding("gpt2")
-    enc.encode("warmup")
-
-    start = time.perf_counter_ns()
-    enc.encode_ordinary_batch(documents, num_threads=num_threads)
-    end = time.perf_counter_ns()
-    print(f"tiktoken \t{num_bytes / (end - start) * 1e9} bytes / s")
-
-    import transformers
-
-    hf_enc = cast(Any, transformers).GPT2TokenizerFast.from_pretrained("gpt2")
-    hf_enc.model_max_length = 1e30  # silence!
-    hf_enc.encode("warmup")
-
-    start = time.perf_counter_ns()
-    hf_enc(documents)
-    end = time.perf_counter_ns()
-    print(f"huggingface \t{num_bytes / (end - start) * 1e9} bytes / s")
+    t_mean, t_stdev = measure(lambda d: enc.encode_ordinary_batch(d, num_threads=8), docs)
 
+    # HF
+    hf = GPT2TokenizerFast.from_pretrained("gpt2")
+    # Huge limit, presumably to silence the max-length warning (the code this
+    # replaces tagged the same assignment "silence!").
+    hf.model_max_length = int(1e30)
+    hf_mean, hf_stdev = measure(lambda d: hf(d, return_tensors=None), docs)
 
+    # Throughput in bytes per second (times are in ns, hence the 1e9), then
+    # the stdev of the timings relative to their mean, in percent.
+    print(f"tiktoken: {num_bytes / (t_mean)*1e9:.2f} ± {t_stdev/ t_mean*100:.1f}% байт/с")
+    print(f"HuggingFace: {num_bytes / (hf_mean)*1e9:.2f} ± {hf_stdev/ hf_mean*100:.1f}% байт/с")