Skip to content

Commit 0f27bdf

Browse files
authored
Do not fetch /preupload if already done in upload-large-folder (#3100)
* Batch preupload calls in upload-large-folder
* Revert "Batch preupload calls in upload-large-folder" (this reverts commit 73754c4)
* upload mode was not fetched
* type hints
* make quality again
* also fixes #3016
1 parent aee73c4 commit 0f27bdf

File tree

3 files changed

+40
-50
lines changed

3 files changed

+40
-50
lines changed

src/huggingface_hub/_local_folder.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -149,6 +149,7 @@ class LocalUploadFileMetadata:
149149
should_ignore: Optional[bool] = None
150150
sha256: Optional[str] = None
151151
upload_mode: Optional[str] = None
152+
remote_oid: Optional[str] = None
152153
is_uploaded: bool = False
153154
is_committed: bool = False
154155

@@ -172,6 +173,9 @@ def save(self, paths: LocalUploadFilePaths) -> None:
172173

173174
if self.upload_mode is not None:
174175
f.write(self.upload_mode)
176+
177+
if self.remote_oid is not None:
178+
f.write(self.remote_oid)
175179
f.write("\n")
176180

177181
f.write(str(int(self.is_uploaded)) + "\n")
@@ -346,6 +350,9 @@ def read_upload_metadata(local_dir: Path, filename: str) -> LocalUploadFileMetad
346350
if upload_mode not in (None, "regular", "lfs"):
347351
raise ValueError(f"Invalid upload mode in metadata {paths.path_in_repo}: {upload_mode}")
348352

353+
_remote_oid = f.readline().strip()
354+
remote_oid = None if _remote_oid == "" else _remote_oid
355+
349356
is_uploaded = bool(int(f.readline().strip()))
350357
is_committed = bool(int(f.readline().strip()))
351358

@@ -355,6 +362,7 @@ def read_upload_metadata(local_dir: Path, filename: str) -> LocalUploadFileMetad
355362
should_ignore=should_ignore,
356363
sha256=sha256,
357364
upload_mode=upload_mode,
365+
remote_oid=remote_oid,
358366
is_uploaded=is_uploaded,
359367
is_committed=is_committed,
360368
)

src/huggingface_hub/_upload_large_folder.py

Lines changed: 15 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -42,8 +42,7 @@
4242
logger = logging.getLogger(__name__)
4343

4444
WAITING_TIME_IF_NO_TASKS = 10 # seconds
45-
MAX_NB_REGULAR_FILES_PER_COMMIT = 75
46-
MAX_NB_LFS_FILES_PER_COMMIT = 150
45+
MAX_NB_FILES_FETCH_UPLOAD_MODE = 100
4746
COMMIT_SIZE_SCALE: List[int] = [20, 50, 75, 100, 125, 200, 250, 400, 600, 1000]
4847

4948

@@ -404,19 +403,19 @@ def _determine_next_job(status: LargeUploadStatus) -> Optional[Tuple[WorkerJob,
404403
):
405404
status.nb_workers_commit += 1
406405
logger.debug("Job: commit (more than 5 minutes since last commit attempt)")
407-
return (WorkerJob.COMMIT, _get_items_to_commit(status.queue_commit))
406+
return (WorkerJob.COMMIT, _get_n(status.queue_commit, status.target_chunk()))
408407

409408
# 2. Commit if at least 100 files are ready to commit
410409
elif status.nb_workers_commit == 0 and status.queue_commit.qsize() >= 150:
411410
status.nb_workers_commit += 1
412411
logger.debug("Job: commit (>100 files ready)")
413-
return (WorkerJob.COMMIT, _get_items_to_commit(status.queue_commit))
412+
return (WorkerJob.COMMIT, _get_n(status.queue_commit, status.target_chunk()))
414413

415-
# 3. Get upload mode if at least 10 files
416-
elif status.queue_get_upload_mode.qsize() >= 10:
414+
# 3. Get upload mode if at least 100 files
415+
elif status.queue_get_upload_mode.qsize() >= MAX_NB_FILES_FETCH_UPLOAD_MODE:
417416
status.nb_workers_get_upload_mode += 1
418-
logger.debug("Job: get upload mode (>10 files ready)")
419-
return (WorkerJob.GET_UPLOAD_MODE, _get_n(status.queue_get_upload_mode, status.target_chunk()))
417+
logger.debug(f"Job: get upload mode (>{MAX_NB_FILES_FETCH_UPLOAD_MODE} files ready)")
418+
return (WorkerJob.GET_UPLOAD_MODE, _get_n(status.queue_get_upload_mode, MAX_NB_FILES_FETCH_UPLOAD_MODE))
420419

421420
# 4. Preupload LFS file if at least 1 file and no worker is preuploading LFS
422421
elif status.queue_preupload_lfs.qsize() > 0 and status.nb_workers_preupload_lfs == 0:
@@ -434,7 +433,7 @@ def _determine_next_job(status: LargeUploadStatus) -> Optional[Tuple[WorkerJob,
434433
elif status.queue_get_upload_mode.qsize() > 0 and status.nb_workers_get_upload_mode == 0:
435434
status.nb_workers_get_upload_mode += 1
436435
logger.debug("Job: get upload mode (no other worker getting upload mode)")
437-
return (WorkerJob.GET_UPLOAD_MODE, _get_n(status.queue_get_upload_mode, status.target_chunk()))
436+
return (WorkerJob.GET_UPLOAD_MODE, _get_n(status.queue_get_upload_mode, MAX_NB_FILES_FETCH_UPLOAD_MODE))
438437

439438
# 7. Preupload LFS file if at least 1 file
440439
# Skip if hf_transfer is enabled and there is already a worker preuploading LFS
@@ -455,7 +454,7 @@ def _determine_next_job(status: LargeUploadStatus) -> Optional[Tuple[WorkerJob,
455454
elif status.queue_get_upload_mode.qsize() > 0:
456455
status.nb_workers_get_upload_mode += 1
457456
logger.debug("Job: get upload mode")
458-
return (WorkerJob.GET_UPLOAD_MODE, _get_n(status.queue_get_upload_mode, status.target_chunk()))
457+
return (WorkerJob.GET_UPLOAD_MODE, _get_n(status.queue_get_upload_mode, MAX_NB_FILES_FETCH_UPLOAD_MODE))
459458

460459
# 10. Commit if at least 1 file and 1 min since last commit attempt
461460
elif (
@@ -466,7 +465,7 @@ def _determine_next_job(status: LargeUploadStatus) -> Optional[Tuple[WorkerJob,
466465
):
467466
status.nb_workers_commit += 1
468467
logger.debug("Job: commit (1 min since last commit attempt)")
469-
return (WorkerJob.COMMIT, _get_items_to_commit(status.queue_commit))
468+
return (WorkerJob.COMMIT, _get_n(status.queue_commit, status.target_chunk()))
470469

471470
# 11. Commit if at least 1 file all other queues are empty and all workers are waiting
472471
# e.g. when it's the last commit
@@ -482,7 +481,7 @@ def _determine_next_job(status: LargeUploadStatus) -> Optional[Tuple[WorkerJob,
482481
):
483482
status.nb_workers_commit += 1
484483
logger.debug("Job: commit")
485-
return (WorkerJob.COMMIT, _get_items_to_commit(status.queue_commit))
484+
return (WorkerJob.COMMIT, _get_n(status.queue_commit, status.target_chunk()))
486485

487486
# 12. If all queues are empty, exit
488487
elif all(metadata.is_committed or metadata.should_ignore for _, metadata in status.items):
@@ -528,6 +527,7 @@ def _get_upload_mode(items: List[JOB_ITEM_T], api: "HfApi", repo_id: str, repo_t
528527
paths, metadata = item
529528
metadata.upload_mode = addition._upload_mode
530529
metadata.should_ignore = addition._should_ignore
530+
metadata.remote_oid = addition._remote_oid
531531
metadata.save(paths)
532532

533533

@@ -580,6 +580,9 @@ def _build_hacky_operation(item: JOB_ITEM_T) -> HackyCommitOperationAdd:
580580
if metadata.sha256 is None:
581581
raise ValueError("sha256 must have been computed by now!")
582582
operation.upload_info = UploadInfo(sha256=bytes.fromhex(metadata.sha256), size=metadata.size, sample=sample)
583+
operation._upload_mode = metadata.upload_mode # type: ignore[assignment]
584+
operation._should_ignore = metadata.should_ignore
585+
operation._remote_oid = metadata.remote_oid
583586
return operation
584587

585588

@@ -596,30 +599,6 @@ def _get_n(queue: "queue.Queue[JOB_ITEM_T]", n: int) -> List[JOB_ITEM_T]:
596599
return [queue.get() for _ in range(min(queue.qsize(), n))]
597600

598601

599-
def _get_items_to_commit(queue: "queue.Queue[JOB_ITEM_T]") -> List[JOB_ITEM_T]:
600-
"""Special case for commit job: the number of items to commit depends on the type of files."""
601-
# Can take at most 50 regular files and/or 100 LFS files in a single commit
602-
items: List[JOB_ITEM_T] = []
603-
nb_lfs, nb_regular = 0, 0
604-
while True:
605-
# If empty queue => commit everything
606-
if queue.qsize() == 0:
607-
return items
608-
609-
# If we have enough items => commit them
610-
if nb_lfs >= MAX_NB_LFS_FILES_PER_COMMIT or nb_regular >= MAX_NB_REGULAR_FILES_PER_COMMIT:
611-
return items
612-
613-
# Else, get a new item and increase counter
614-
item = queue.get()
615-
items.append(item)
616-
_, metadata = item
617-
if metadata.upload_mode == "lfs":
618-
nb_lfs += 1
619-
else:
620-
nb_regular += 1
621-
622-
623602
def _print_overwrite(report: str) -> None:
624603
"""Print a report, overwriting the previous lines.
625604

src/huggingface_hub/hf_api.py

Lines changed: 17 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -4421,20 +4421,23 @@ def preupload_lfs_files(
44214421
new_additions = [addition for addition in additions if not addition._is_uploaded]
44224422

44234423
# Check which new files are LFS
4424-
try:
4425-
_fetch_upload_modes(
4426-
additions=new_additions,
4427-
repo_type=repo_type,
4428-
repo_id=repo_id,
4429-
headers=headers,
4430-
revision=revision,
4431-
endpoint=self.endpoint,
4432-
create_pr=create_pr or False,
4433-
gitignore_content=gitignore_content,
4434-
)
4435-
except RepositoryNotFoundError as e:
4436-
e.append_to_message(_CREATE_COMMIT_NO_REPO_ERROR_MESSAGE)
4437-
raise
4424+
# For some items, we might have already fetched the upload mode (in case of upload_large_folder)
4425+
additions_no_upload_mode = [addition for addition in new_additions if addition._upload_mode is None]
4426+
if len(additions_no_upload_mode) > 0:
4427+
try:
4428+
_fetch_upload_modes(
4429+
additions=additions_no_upload_mode,
4430+
repo_type=repo_type,
4431+
repo_id=repo_id,
4432+
headers=headers,
4433+
revision=revision,
4434+
endpoint=self.endpoint,
4435+
create_pr=create_pr or False,
4436+
gitignore_content=gitignore_content,
4437+
)
4438+
except RepositoryNotFoundError as e:
4439+
e.append_to_message(_CREATE_COMMIT_NO_REPO_ERROR_MESSAGE)
4440+
raise
44384441

44394442
# Filter out regular files
44404443
new_lfs_additions = [addition for addition in new_additions if addition._upload_mode == "lfs"]

0 commit comments

Comments (0)