Skip to content

Commit 798ea8a

Browse files
lhoestqWauplin
andauthored
Align HfFileSystem and HfApi for the expand argument when listing files in repos (#3195)
* align hffs and hfapi for expand * update tests * fix tests * again * Update src/huggingface_hub/hf_file_system.py Co-authored-by: Lucain <lucain@huggingface.co> --------- Co-authored-by: Lucain <lucain@huggingface.co>
1 parent 7929072 commit 798ea8a

File tree

2 files changed

+24
-25
lines changed

2 files changed

+24
-25
lines changed

src/huggingface_hub/hf_file_system.py

Lines changed: 8 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -367,7 +367,6 @@ def ls(
367367
"""
368368
resolved_path = self.resolve_path(path, revision=revision)
369369
path = resolved_path.unresolve()
370-
kwargs = {"expand_info": detail, **kwargs}
371370
try:
372371
out = self._ls_tree(path, refresh=refresh, revision=revision, **kwargs)
373372
except EntryNotFoundError:
@@ -386,7 +385,7 @@ def _ls_tree(
386385
recursive: bool = False,
387386
refresh: bool = False,
388387
revision: Optional[str] = None,
389-
expand_info: bool = True,
388+
expand_info: bool = False,
390389
):
391390
resolved_path = self.resolve_path(path, revision=revision)
392391
path = resolved_path.unresolve()
@@ -497,8 +496,6 @@ def walk(self, path: str, *args, **kwargs) -> Iterator[Tuple[str, List[str], Lis
497496
Returns:
498497
`Iterator[Tuple[str, List[str], List[str]]]`: An iterator of (path, list of directory names, list of file names) tuples.
499498
"""
500-
# Set expand_info=False by default to get a x10 speed boost
501-
kwargs = {"expand_info": kwargs.get("detail", False), **kwargs}
502499
path = self.resolve_path(path, revision=kwargs.get("revision")).unresolve()
503500
yield from super().walk(path, *args, **kwargs)
504501

@@ -515,8 +512,6 @@ def glob(self, path: str, **kwargs) -> List[str]:
515512
Returns:
516513
`List[str]`: List of paths matching the pattern.
517514
"""
518-
# Set expand_info=False by default to get a x10 speed boost
519-
kwargs = {"expand_info": kwargs.get("detail", False), **kwargs}
520515
path = self.resolve_path(path, revision=kwargs.get("revision")).unresolve()
521516
return super().glob(path, **kwargs)
522517

@@ -558,7 +553,6 @@ def find(
558553
)
559554
resolved_path = self.resolve_path(path, revision=revision)
560555
path = resolved_path.unresolve()
561-
kwargs = {"expand_info": detail, **kwargs}
562556
try:
563557
out = self._ls_tree(path, recursive=True, refresh=refresh, revision=resolved_path.revision, **kwargs)
564558
except EntryNotFoundError:
@@ -653,7 +647,7 @@ def modified(self, path: str, **kwargs) -> datetime:
653647
Returns:
654648
`datetime`: Last commit date of the file.
655649
"""
656-
info = self.info(path, **kwargs)
650+
info = self.info(path, **{**kwargs, "expand_info": True})
657651
return info["last_commit"]["date"]
658652

659653
def info(self, path: str, refresh: bool = False, revision: Optional[str] = None, **kwargs) -> Dict[str, Any]:
@@ -683,14 +677,15 @@ def info(self, path: str, refresh: bool = False, revision: Optional[str] = None,
683677
resolved_path = self.resolve_path(path, revision=revision)
684678
path = resolved_path.unresolve()
685679
expand_info = kwargs.get(
686-
"expand_info", True
680+
"expand_info", False
687681
) # don't expose it as a parameter in the public API to follow the spec
688682
if not resolved_path.path_in_repo:
689683
# Path is the root directory
690684
out = {
691685
"name": path,
692686
"size": 0,
693687
"type": "directory",
688+
"last_commit": None,
694689
}
695690
if expand_info:
696691
last_commit = self._api.list_repo_commits(
@@ -708,7 +703,7 @@ def info(self, path: str, refresh: bool = False, revision: Optional[str] = None,
708703
parent_path = self._parent(path)
709704
if not expand_info and parent_path not in self.dircache:
710705
# Fill the cache with cheap call
711-
self.ls(parent_path, expand_info=False)
706+
self.ls(parent_path)
712707
if parent_path in self.dircache:
713708
# Check if the path is in the cache
714709
out1 = [o for o in self.dircache[parent_path] if o["name"] == path]
@@ -779,7 +774,7 @@ def exists(self, path, **kwargs):
779774
if kwargs.get("refresh", False):
780775
self.invalidate_cache(path)
781776

782-
self.info(path, **{**kwargs, "expand_info": False})
777+
self.info(path, **kwargs)
783778
return True
784779
except: # noqa: E722
785780
return False
@@ -798,7 +793,7 @@ def isdir(self, path):
798793
`bool`: True if path is a directory, False otherwise.
799794
"""
800795
try:
801-
return self.info(path, expand_info=False)["type"] == "directory"
796+
return self.info(path)["type"] == "directory"
802797
except OSError:
803798
return False
804799

@@ -816,7 +811,7 @@ def isfile(self, path):
816811
`bool`: True if path is a file, False otherwise.
817812
"""
818813
try:
819-
return self.info(path, expand_info=False)["type"] == "file"
814+
return self.info(path)["type"] == "file"
820815
except: # noqa: E722
821816
return False
822817

@@ -942,9 +937,6 @@ def __init__(self, fs: HfFileSystem, path: str, revision: Optional[str] = None,
942937
f"{e}.\nMake sure the repository and revision exist before writing data."
943938
) from e
944939
raise
945-
# avoid an unnecessary .info() call with expensive expand_info=True to instantiate .details
946-
if kwargs.get("mode", "rb") == "rb":
947-
self.details = fs.info(self.resolved_path.unresolve(), expand_info=False)
948940
super().__init__(fs, self.resolved_path.unresolve(), **kwargs)
949941
self.fs: HfFileSystem
950942

tests/test_hf_file_system.py

Lines changed: 16 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -66,21 +66,21 @@ def test_info(self):
6666
self.assertEqual(root_dir["type"], "directory")
6767
self.assertEqual(root_dir["size"], 0)
6868
self.assertTrue(root_dir["name"].endswith(self.repo_id))
69-
self.assertIsNotNone(root_dir["last_commit"])
69+
self.assertIsNone(root_dir["last_commit"])
7070

7171
data_dir = self.hffs.info(self.hf_path + "/data")
7272
self.assertEqual(data_dir["type"], "directory")
7373
self.assertEqual(data_dir["size"], 0)
7474
self.assertTrue(data_dir["name"].endswith("/data"))
75-
self.assertIsNotNone(data_dir["last_commit"])
75+
self.assertIsNone(data_dir["last_commit"])
7676
self.assertIsNotNone(data_dir["tree_id"])
7777

7878
text_data_file = self.hffs.info(self.text_file)
7979
self.assertEqual(text_data_file["type"], "file")
8080
self.assertGreater(text_data_file["size"], 0) # not empty
8181
self.assertTrue(text_data_file["name"].endswith("/data/text_data.txt"))
8282
self.assertIsNone(text_data_file["lfs"])
83-
self.assertIsNotNone(text_data_file["last_commit"])
83+
self.assertIsNone(text_data_file["last_commit"])
8484
self.assertIsNotNone(text_data_file["blob_id"])
8585
self.assertIn("security", text_data_file) # the staging endpoint does not run security checks
8686

@@ -140,7 +140,7 @@ def test_glob(self):
140140
self.assertTrue(
141141
files[keys[0]]["name"].endswith("/.gitattributes") and files[keys[1]]["name"].endswith("/data")
142142
)
143-
self.assertIsNotNone(files[keys[0]]["last_commit"])
143+
self.assertIsNone(files[keys[0]]["last_commit"])
144144

145145
def test_url(self):
146146
self.assertEqual(
@@ -277,13 +277,13 @@ def test_list_root_directory_no_revision(self):
277277
self.assertEqual(files[0]["type"], "directory")
278278
self.assertEqual(files[0]["size"], 0)
279279
self.assertTrue(files[0]["name"].endswith("/data"))
280-
self.assertIsNotNone(files[0]["last_commit"])
280+
self.assertIsNone(files[0]["last_commit"])
281281
self.assertIsNotNone(files[0]["tree_id"])
282282

283283
self.assertEqual(files[1]["type"], "file")
284284
self.assertGreater(files[1]["size"], 0) # not empty
285285
self.assertTrue(files[1]["name"].endswith("/.gitattributes"))
286-
self.assertIsNotNone(files[1]["last_commit"])
286+
self.assertIsNone(files[1]["last_commit"])
287287
self.assertIsNotNone(files[1]["blob_id"])
288288
self.assertIn("security", files[1]) # the staging endpoint does not run security checks
289289

@@ -298,15 +298,15 @@ def test_list_data_directory_no_revision(self):
298298
self.assertIn("sha256", files[0]["lfs"])
299299
self.assertIn("size", files[0]["lfs"])
300300
self.assertIn("pointer_size", files[0]["lfs"])
301-
self.assertIsNotNone(files[0]["last_commit"])
301+
self.assertIsNone(files[0]["last_commit"])
302302
self.assertIsNotNone(files[0]["blob_id"])
303303
self.assertIn("security", files[0]) # the staging endpoint does not run security checks
304304

305305
self.assertEqual(files[1]["type"], "file")
306306
self.assertGreater(files[1]["size"], 0) # not empty
307307
self.assertTrue(files[1]["name"].endswith("/data/text_data.txt"))
308308
self.assertIsNone(files[1]["lfs"])
309-
self.assertIsNotNone(files[1]["last_commit"])
309+
self.assertIsNone(files[1]["last_commit"])
310310
self.assertIsNotNone(files[1]["blob_id"])
311311
self.assertIn("security", files[1]) # the staging endpoint does not run security checks
312312

@@ -318,7 +318,7 @@ def test_list_data_file_no_revision(self):
318318
self.assertGreater(files[0]["size"], 0) # not empty
319319
self.assertTrue(files[0]["name"].endswith("/data/text_data.txt"))
320320
self.assertIsNone(files[0]["lfs"])
321-
self.assertIsNotNone(files[0]["last_commit"])
321+
self.assertIsNone(files[0]["last_commit"])
322322
self.assertIsNotNone(files[0]["blob_id"])
323323
self.assertIn("security", files[0]) # the staging endpoint does not run security checks
324324

@@ -351,6 +351,13 @@ def test_list_root_directory_no_revision_no_detail_then_with_detail(self):
351351
files = self.hffs.ls(self.hf_path, detail=True)
352352
self.assertEqual(len(files), 2)
353353
self.assertTrue(files[0]["name"].endswith("/data") and files[1]["name"].endswith("/.gitattributes"))
354+
self.assertIsNone(
355+
self.hffs.dircache[self.hf_path][0]["last_commit"]
356+
) # no expand_info -> no last_commit in cache
357+
358+
files = self.hffs.ls(self.hf_path, detail=True, expand_info=True)
359+
self.assertEqual(len(files), 2)
360+
self.assertTrue(files[0]["name"].endswith("/data") and files[1]["name"].endswith("/.gitattributes"))
354361
self.assertIsNotNone(self.hffs.dircache[self.hf_path][0]["last_commit"])
355362

356363
def test_find_root_directory_no_revision(self):

0 commit comments

Comments
 (0)