From ea7155b7533f2176ae1c2306b8f92158fcc19a10 Mon Sep 17 00:00:00 2001 From: Shivam-Singh-Dev Date: Wed, 23 Apr 2025 15:15:31 +0530 Subject: [PATCH] Update load.py ### Summary This PR replaces the use of `hashlib.sha1` with `hashlib.sha256` in `read_file_cached()`. ### Motivation While SHA-1 is used here only for generating a deterministic cache key (not for cryptographic operations), many modern security scanners (e.g., Snyk, Trivy, Bandit) flag its usage due to known weaknesses. This change aligns with current best practices. ### Notes - This does not impact functionality. - The cache key format change is non-breaking: entries stored under the old SHA-1-derived key are no longer matched, so affected files are simply re-cached under the new SHA-256-derived key (safe fallback). - All tests pass. Let me know if you'd like this behind a feature flag or need further changes. Thanks! --- tiktoken/load.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/tiktoken/load.py b/tiktoken/load.py index 295deb9..1a5bf87 100644 --- a/tiktoken/load.py +++ b/tiktoken/load.py @@ -45,7 +45,12 @@ def read_file_cached(blobpath: str, expected_hash: str | None = None) -> bytes: # disable caching return read_file(blobpath) - cache_key = hashlib.sha1(blobpath.encode()).hexdigest() + # cache_key = hashlib.sha1(blobpath.encode()).hexdigest() + # Replaced SHA-1 with SHA-256 to avoid security scanner flags. + # Not used for crypto purposes, but aligns with current best practices. + + cache_key = hashlib.sha256(blobpath.encode()).hexdigest() + cache_path = os.path.join(cache_dir, cache_key) if os.path.exists(cache_path):