diff --git a/scripts/benchmark.py b/scripts/benchmark.py
index 4d679fac..94a3bb07 100644
--- a/scripts/benchmark.py
+++ b/scripts/benchmark.py
@@ -1,39 +1,33 @@
-import base64
-import functools
-import gzip
-import json
-import os
-import random
+import gc
 import time
-from typing import Any, cast
-
-import blobfile
-
+import statistics
 import tiktoken
-
-
-def benchmark_batch(documents: list[str]) -> None:
-    num_threads = int(os.environ["RAYON_NUM_THREADS"])
-    num_bytes = sum(map(len, map(str.encode, documents)))
-    print(f"num_threads: {num_threads}, num_bytes: {num_bytes}")
-
+from transformers import GPT2TokenizerFast
+
+def measure(fn, docs, repeats=5):
+    fn(docs)  # warm up before timing
+    times = []
+    for _ in range(repeats):
+        gc.disable()
+        start = time.perf_counter_ns()
+        fn(docs)
+        end = time.perf_counter_ns()
+        gc.enable()
+        times.append(end - start)
+    return statistics.mean(times), statistics.stdev(times)
+
+if __name__ == "__main__":
+    docs = [...]
+    num_bytes = sum(len(d.encode()) for d in docs)
+
+    # tiktoken
     enc = tiktoken.get_encoding("gpt2")
-    enc.encode("warmup")
-
-    start = time.perf_counter_ns()
-    enc.encode_ordinary_batch(documents, num_threads=num_threads)
-    end = time.perf_counter_ns()
-    print(f"tiktoken \t{num_bytes / (end - start) * 1e9} bytes / s")
-
-    import transformers
-
-    hf_enc = cast(Any, transformers).GPT2TokenizerFast.from_pretrained("gpt2")
-    hf_enc.model_max_length = 1e30  # silence!
-    hf_enc.encode("warmup")
-
-    start = time.perf_counter_ns()
-    hf_enc(documents)
-    end = time.perf_counter_ns()
-    print(f"huggingface \t{num_bytes / (end - start) * 1e9} bytes / s")
+    t_mean, t_stdev = measure(lambda d: enc.encode_ordinary_batch(d, num_threads=8), docs)
+    # HF
+    hf = GPT2TokenizerFast.from_pretrained("gpt2")
+    hf.model_max_length = int(1e30)  # silence the sequence-length warning
+    hf_mean, hf_stdev = measure(lambda d: hf(d, return_tensors=None), docs)
+    print(f"tiktoken: {num_bytes / t_mean * 1e9:.2f} ± {t_stdev / t_mean * 100:.1f}% bytes/s")
+    print(f"HuggingFace: {num_bytes / hf_mean * 1e9:.2f} ± {hf_stdev / hf_mean * 100:.1f}% bytes/s")
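
Note: `docs = [...]` is intentionally left as a placeholder. One minimal way to populate it for a local run is sketched below; the corpus path and the ~10 kB chunk size are illustrative assumptions, not part of this patch.

# Illustrative only: read any local text corpus and split it into document-sized chunks.
from pathlib import Path

text = Path("sample_corpus.txt").read_text(encoding="utf-8")  # hypothetical corpus file
docs = [text[i : i + 10_000] for i in range(0, len(text), 10_000)]  # ~10 kB per document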