@@ -1,13 +1,12 @@
-import sys
 import os
 import json
 import logging
 import warnings
-from typing import Literal, Dict, Optional
+from typing import Literal, Optional
 
 import pandas as pd
 import pyarrow.parquet as pq
-from typing import Any, Generator, Iterator, List, Dict, Optional, Tuple, NamedTuple
+from .tqdm import tqdm
 
 from .cfg import Schema
 from .dataset_metadata import DatasetMetadata
@@ -75,33 +74,41 @@ def _safe_read_from_path(
     read_path_str = os.path.join(dataset_path, data_type, "*.parquet")
     read_path = fs.glob(read_path_str)
     if DatasetFSReader._does_datatype_exist(fs, dataset_path, data_type):
-        dataset = pq.ParquetDataset(read_path, filesystem=fs)
-        dataset_schema_names = dataset.schema.names
+        # First, collect all the dataframes
+        dfs = []
+        for path in tqdm(read_path, desc=f"Loading {data_type} parquet files"):
+            piece = pq.read_pandas(path, filesystem=fs)
+            df_piece = piece.to_pandas()
+            dfs.append(df_piece)
+
+        if not dfs:
+            raise ValueError(f"No parquet files found in {read_path_str}")
+
+        # Combine all dataframes
+        df = pd.concat(dfs, ignore_index=True)
+
+        # Validate schema
+        dataset_schema_names = df.columns.tolist()
         columns_to_null = []
         columns_not_null = []
         for column_name, is_nullable, null_value in getattr(
             Schema.Names, data_type
         ):
             if column_name not in dataset_schema_names and not is_nullable:
                 raise ValueError(
-                    f"error, file is not matching Pinecone Datasets Schmea: {column_name} not found"
+                    f"error, file is not matching Pinecone Datasets Schema: {column_name} not found"
                 )
             elif column_name not in dataset_schema_names and is_nullable:
                 columns_to_null.append((column_name, null_value))
             else:
                 columns_not_null.append(column_name)
-        try:
-            # TODO: use of the columns_not_null and columns_to_null is only a workaround for proper schema validation and versioning
-            df = dataset.read_pandas(columns=columns_not_null).to_pandas()
-
-            for column_name, null_value in columns_to_null:
-                df[column_name] = null_value
-            return df
-
-        # TODO: add more specific error handling, explain what is wrong
-        except Exception as e:
-            print("error, no exception: {}".format(e), file=sys.stderr)
-            raise (e)
+
+        # Add null columns if needed
+        for column_name, null_value in columns_to_null:
+            df[column_name] = null_value
+
+        return df[columns_not_null + [col for col, _ in columns_to_null]]
+
     else:
         warnings.warn(
             "WARNING: No data found at: {}. Returning empty dataframe".format(
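For readers skimming the hunk: the per-file loop replaces the old single `pq.ParquetDataset` read so a progress bar can advance as each file loads. Below is a minimal, self-contained sketch of that pattern, assuming the standard `tqdm` package in place of the repo's local `.tqdm` wrapper, a local `fsspec` filesystem, and a hypothetical `my_dataset/documents` layout:

```python
import fsspec
import pandas as pd
import pyarrow.parquet as pq
from tqdm import tqdm  # assumption: stand-in for the repo's .tqdm wrapper

fs = fsspec.filesystem("file")  # assumption: local filesystem
paths = fs.glob("my_dataset/documents/*.parquet")  # hypothetical layout

dfs = []
for path in tqdm(paths, desc="Loading documents parquet files"):
    # Read one file at a time so the progress bar advances per file
    dfs.append(pq.read_pandas(path, filesystem=fs).to_pandas())

if not dfs:
    raise ValueError("No parquet files found")

df = pd.concat(dfs, ignore_index=True)
```

The trade-off versus reading the whole `ParquetDataset` at once is some per-file overhead in exchange for user-visible progress and an explicit empty-result check.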
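The validation loop treats each `Schema.Names.<data_type>` entry as a `(column_name, is_nullable, null_value)` triple: a missing non-nullable column raises, a missing nullable column is filled with a constant, and the final selection fixes column order. A hedged sketch of that fill-and-reorder step, with a made-up schema standing in for `Schema.Names`:

```python
import pandas as pd

# Made-up triples for illustration; the real ones come from Schema.Names
schema = [("id", False, None), ("values", False, None), ("metadata", True, None)]
df = pd.DataFrame({"id": ["a", "b"], "values": [[0.1], [0.2]]})

columns_to_null, columns_not_null = [], []
for column_name, is_nullable, null_value in schema:
    if column_name not in df.columns and not is_nullable:
        raise ValueError(f"{column_name} not found")
    elif column_name not in df.columns and is_nullable:
        columns_to_null.append((column_name, null_value))
    else:
        columns_not_null.append(column_name)

for column_name, null_value in columns_to_null:
    df[column_name] = null_value  # constant fill for absent nullable columns

# Reorder so required columns come first, filled nullable columns last
df = df[columns_not_null + [col for col, _ in columns_to_null]]
```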