Skip to content

Align HfFileSystem and HfApi for the expand argument when listing files in repos #3195

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 5 commits into from
Jul 4, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
24 changes: 8 additions & 16 deletions src/huggingface_hub/hf_file_system.py
Original file line number Diff line number Diff line change
Expand Up @@ -367,7 +367,6 @@ def ls(
"""
resolved_path = self.resolve_path(path, revision=revision)
path = resolved_path.unresolve()
kwargs = {"expand_info": detail, **kwargs}
try:
out = self._ls_tree(path, refresh=refresh, revision=revision, **kwargs)
except EntryNotFoundError:
Expand All @@ -386,7 +385,7 @@ def _ls_tree(
recursive: bool = False,
refresh: bool = False,
revision: Optional[str] = None,
expand_info: bool = True,
expand_info: bool = False,
):
resolved_path = self.resolve_path(path, revision=revision)
path = resolved_path.unresolve()
Expand Down Expand Up @@ -497,8 +496,6 @@ def walk(self, path: str, *args, **kwargs) -> Iterator[Tuple[str, List[str], Lis
Returns:
`Iterator[Tuple[str, List[str], List[str]]]`: An iterator of (path, list of directory names, list of file names) tuples.
"""
# Set expand_info=False by default to get a x10 speed boost
kwargs = {"expand_info": kwargs.get("detail", False), **kwargs}
path = self.resolve_path(path, revision=kwargs.get("revision")).unresolve()
yield from super().walk(path, *args, **kwargs)

Expand All @@ -515,8 +512,6 @@ def glob(self, path: str, **kwargs) -> List[str]:
Returns:
`List[str]`: List of paths matching the pattern.
"""
# Set expand_info=False by default to get a x10 speed boost
kwargs = {"expand_info": kwargs.get("detail", False), **kwargs}
path = self.resolve_path(path, revision=kwargs.get("revision")).unresolve()
return super().glob(path, **kwargs)

Expand Down Expand Up @@ -558,7 +553,6 @@ def find(
)
resolved_path = self.resolve_path(path, revision=revision)
path = resolved_path.unresolve()
kwargs = {"expand_info": detail, **kwargs}
try:
out = self._ls_tree(path, recursive=True, refresh=refresh, revision=resolved_path.revision, **kwargs)
except EntryNotFoundError:
Expand Down Expand Up @@ -653,7 +647,7 @@ def modified(self, path: str, **kwargs) -> datetime:
Returns:
`datetime`: Last commit date of the file.
"""
info = self.info(path, **kwargs)
info = self.info(path, **{**kwargs, "expand_info": True})
return info["last_commit"]["date"]

def info(self, path: str, refresh: bool = False, revision: Optional[str] = None, **kwargs) -> Dict[str, Any]:
Expand Down Expand Up @@ -683,14 +677,15 @@ def info(self, path: str, refresh: bool = False, revision: Optional[str] = None,
resolved_path = self.resolve_path(path, revision=revision)
path = resolved_path.unresolve()
expand_info = kwargs.get(
"expand_info", True
"expand_info", False
) # don't expose it as a parameter in the public API to follow the spec
if not resolved_path.path_in_repo:
# Path is the root directory
out = {
"name": path,
"size": 0,
"type": "directory",
"last_commit": None,
}
if expand_info:
last_commit = self._api.list_repo_commits(
Expand All @@ -708,7 +703,7 @@ def info(self, path: str, refresh: bool = False, revision: Optional[str] = None,
parent_path = self._parent(path)
if not expand_info and parent_path not in self.dircache:
# Fill the cache with cheap call
self.ls(parent_path, expand_info=False)
self.ls(parent_path)
if parent_path in self.dircache:
# Check if the path is in the cache
out1 = [o for o in self.dircache[parent_path] if o["name"] == path]
Expand Down Expand Up @@ -779,7 +774,7 @@ def exists(self, path, **kwargs):
if kwargs.get("refresh", False):
self.invalidate_cache(path)

self.info(path, **{**kwargs, "expand_info": False})
self.info(path, **kwargs)
return True
except: # noqa: E722
return False
Expand All @@ -798,7 +793,7 @@ def isdir(self, path):
`bool`: True if path is a directory, False otherwise.
"""
try:
return self.info(path, expand_info=False)["type"] == "directory"
return self.info(path)["type"] == "directory"
except OSError:
return False

Expand All @@ -816,7 +811,7 @@ def isfile(self, path):
`bool`: True if path is a file, False otherwise.
"""
try:
return self.info(path, expand_info=False)["type"] == "file"
return self.info(path)["type"] == "file"
except: # noqa: E722
return False

Expand Down Expand Up @@ -942,9 +937,6 @@ def __init__(self, fs: HfFileSystem, path: str, revision: Optional[str] = None,
f"{e}.\nMake sure the repository and revision exist before writing data."
) from e
raise
# avoid an unnecessary .info() call with expensive expand_info=True to instantiate .details
if kwargs.get("mode", "rb") == "rb":
self.details = fs.info(self.resolved_path.unresolve(), expand_info=False)
super().__init__(fs, self.resolved_path.unresolve(), **kwargs)
self.fs: HfFileSystem

Expand Down
25 changes: 16 additions & 9 deletions tests/test_hf_file_system.py
Original file line number Diff line number Diff line change
Expand Up @@ -66,21 +66,21 @@ def test_info(self):
self.assertEqual(root_dir["type"], "directory")
self.assertEqual(root_dir["size"], 0)
self.assertTrue(root_dir["name"].endswith(self.repo_id))
self.assertIsNotNone(root_dir["last_commit"])
self.assertIsNone(root_dir["last_commit"])

data_dir = self.hffs.info(self.hf_path + "/data")
self.assertEqual(data_dir["type"], "directory")
self.assertEqual(data_dir["size"], 0)
self.assertTrue(data_dir["name"].endswith("/data"))
self.assertIsNotNone(data_dir["last_commit"])
self.assertIsNone(data_dir["last_commit"])
self.assertIsNotNone(data_dir["tree_id"])

text_data_file = self.hffs.info(self.text_file)
self.assertEqual(text_data_file["type"], "file")
self.assertGreater(text_data_file["size"], 0) # not empty
self.assertTrue(text_data_file["name"].endswith("/data/text_data.txt"))
self.assertIsNone(text_data_file["lfs"])
self.assertIsNotNone(text_data_file["last_commit"])
self.assertIsNone(text_data_file["last_commit"])
self.assertIsNotNone(text_data_file["blob_id"])
self.assertIn("security", text_data_file) # the staging endpoint does not run security checks

Expand Down Expand Up @@ -140,7 +140,7 @@ def test_glob(self):
self.assertTrue(
files[keys[0]]["name"].endswith("/.gitattributes") and files[keys[1]]["name"].endswith("/data")
)
self.assertIsNotNone(files[keys[0]]["last_commit"])
self.assertIsNone(files[keys[0]]["last_commit"])

def test_url(self):
self.assertEqual(
Expand Down Expand Up @@ -277,13 +277,13 @@ def test_list_root_directory_no_revision(self):
self.assertEqual(files[0]["type"], "directory")
self.assertEqual(files[0]["size"], 0)
self.assertTrue(files[0]["name"].endswith("/data"))
self.assertIsNotNone(files[0]["last_commit"])
self.assertIsNone(files[0]["last_commit"])
self.assertIsNotNone(files[0]["tree_id"])

self.assertEqual(files[1]["type"], "file")
self.assertGreater(files[1]["size"], 0) # not empty
self.assertTrue(files[1]["name"].endswith("/.gitattributes"))
self.assertIsNotNone(files[1]["last_commit"])
self.assertIsNone(files[1]["last_commit"])
self.assertIsNotNone(files[1]["blob_id"])
self.assertIn("security", files[1]) # the staging endpoint does not run security checks

Expand All @@ -298,15 +298,15 @@ def test_list_data_directory_no_revision(self):
self.assertIn("sha256", files[0]["lfs"])
self.assertIn("size", files[0]["lfs"])
self.assertIn("pointer_size", files[0]["lfs"])
self.assertIsNotNone(files[0]["last_commit"])
self.assertIsNone(files[0]["last_commit"])
self.assertIsNotNone(files[0]["blob_id"])
self.assertIn("security", files[0]) # the staging endpoint does not run security checks

self.assertEqual(files[1]["type"], "file")
self.assertGreater(files[1]["size"], 0) # not empty
self.assertTrue(files[1]["name"].endswith("/data/text_data.txt"))
self.assertIsNone(files[1]["lfs"])
self.assertIsNotNone(files[1]["last_commit"])
self.assertIsNone(files[1]["last_commit"])
self.assertIsNotNone(files[1]["blob_id"])
self.assertIn("security", files[1]) # the staging endpoint does not run security checks

Expand All @@ -318,7 +318,7 @@ def test_list_data_file_no_revision(self):
self.assertGreater(files[0]["size"], 0) # not empty
self.assertTrue(files[0]["name"].endswith("/data/text_data.txt"))
self.assertIsNone(files[0]["lfs"])
self.assertIsNotNone(files[0]["last_commit"])
self.assertIsNone(files[0]["last_commit"])
self.assertIsNotNone(files[0]["blob_id"])
self.assertIn("security", files[0]) # the staging endpoint does not run security checks

Expand Down Expand Up @@ -351,6 +351,13 @@ def test_list_root_directory_no_revision_no_detail_then_with_detail(self):
files = self.hffs.ls(self.hf_path, detail=True)
self.assertEqual(len(files), 2)
self.assertTrue(files[0]["name"].endswith("/data") and files[1]["name"].endswith("/.gitattributes"))
self.assertIsNone(
self.hffs.dircache[self.hf_path][0]["last_commit"]
) # no expand_info -> no last_commit in cache

files = self.hffs.ls(self.hf_path, detail=True, expand_info=True)
self.assertEqual(len(files), 2)
self.assertTrue(files[0]["name"].endswith("/data") and files[1]["name"].endswith("/.gitattributes"))
self.assertIsNotNone(self.hffs.dircache[self.hf_path][0]["last_commit"])

def test_find_root_directory_no_revision(self):
Expand Down
Loading