From 3aa46206de01e6b6ed9a251e66fa884d16d2aff3 Mon Sep 17 00:00:00 2001 From: Lucain Pouget Date: Mon, 21 Jul 2025 15:19:57 +0200 Subject: [PATCH 1/2] Fix snapshot_download when unreliable number of files --- src/huggingface_hub/_snapshot_download.py | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/src/huggingface_hub/_snapshot_download.py b/src/huggingface_hub/_snapshot_download.py index bb814c257f..0373740115 100644 --- a/src/huggingface_hub/_snapshot_download.py +++ b/src/huggingface_hub/_snapshot_download.py @@ -254,14 +254,19 @@ def snapshot_download( # At this stage, internet connection is up and running # => let's download the files! assert repo_info.sha is not None, "Repo info returned from server must have a revision sha." - assert repo_info.siblings is not None, "Repo info returned from server must have a siblings list." # Corner case: on very large repos, the siblings list in `repo_info` might not contain all files. # In that case, we need to use the `list_repo_tree` method to prevent caching issues. repo_files: Iterable[str] = [f.rfilename for f in repo_info.siblings] - has_many_files = len(repo_info.siblings) > VERY_LARGE_REPO_THRESHOLD - if has_many_files: - logger.info("The repo has more than 50,000 files. Using `list_repo_tree` to ensure all files are listed.") + unreliable_nb_files = ( + repo_info.siblings is None + or len(repo_info.siblings) == 0 + or len(repo_info.siblings) > VERY_LARGE_REPO_THRESHOLD + ) + if unreliable_nb_files: + logger.info( + "Number of files in the repo is unreliable. Using `list_repo_tree` to ensure all files are listed." + ) repo_files = ( f.rfilename for f in api.list_repo_tree(repo_id=repo_id, recursive=True, revision=revision, repo_type=repo_type) @@ -274,7 +279,7 @@ def snapshot_download( ignore_patterns=ignore_patterns, ) - if not has_many_files: + if not unreliable_nb_files: filtered_repo_files = list(filtered_repo_files) tqdm_desc = f"Fetching {len(filtered_repo_files)} files" else: From 4f22e7fed14cb5f4718b016a85c26fc5fea40efd Mon Sep 17 00:00:00 2001 From: Lucain Pouget Date: Mon, 21 Jul 2025 17:31:49 +0200 Subject: [PATCH 2/2] fix code quality --- src/huggingface_hub/_snapshot_download.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/huggingface_hub/_snapshot_download.py b/src/huggingface_hub/_snapshot_download.py index 0373740115..0db8a29f7e 100644 --- a/src/huggingface_hub/_snapshot_download.py +++ b/src/huggingface_hub/_snapshot_download.py @@ -257,7 +257,7 @@ def snapshot_download( # Corner case: on very large repos, the siblings list in `repo_info` might not contain all files. # In that case, we need to use the `list_repo_tree` method to prevent caching issues. - repo_files: Iterable[str] = [f.rfilename for f in repo_info.siblings] + repo_files: Iterable[str] = [f.rfilename for f in repo_info.siblings] if repo_info.siblings is not None else [] unreliable_nb_files = ( repo_info.siblings is None or len(repo_info.siblings) == 0