Skip to content

Commit a4a9126

Browse files
authored
Show progress loading with tqdm (#51)
## Problem Some large datasets take multiple minutes to download. When nothing happens for such a long time, it feels as if the notebook has crashed. ## Solution Display download progress with tqdm ## Type of Change - [x] New feature (non-breaking change which adds functionality)
1 parent 0998b72 commit a4a9126

File tree

3 files changed

+61
-19
lines changed

3 files changed

+61
-19
lines changed

pinecone_datasets/dataset_fsreader.py

Lines changed: 25 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -1,13 +1,12 @@
1-
import sys
21
import os
32
import json
43
import logging
54
import warnings
6-
from typing import Literal, Dict, Optional
5+
from typing import Literal, Optional
76

87
import pandas as pd
98
import pyarrow.parquet as pq
10-
from typing import Any, Generator, Iterator, List, Dict, Optional, Tuple, NamedTuple
9+
from .tqdm import tqdm
1110

1211
from .cfg import Schema
1312
from .dataset_metadata import DatasetMetadata
@@ -75,33 +74,41 @@ def _safe_read_from_path(
7574
read_path_str = os.path.join(dataset_path, data_type, "*.parquet")
7675
read_path = fs.glob(read_path_str)
7776
if DatasetFSReader._does_datatype_exist(fs, dataset_path, data_type):
78-
dataset = pq.ParquetDataset(read_path, filesystem=fs)
79-
dataset_schema_names = dataset.schema.names
77+
# First, collect all the dataframes
78+
dfs = []
79+
for path in tqdm(read_path, desc=f"Loading {data_type} parquet files"):
80+
piece = pq.read_pandas(path, filesystem=fs)
81+
df_piece = piece.to_pandas()
82+
dfs.append(df_piece)
83+
84+
if not dfs:
85+
raise ValueError(f"No parquet files found in {read_path_str}")
86+
87+
# Combine all dataframes
88+
df = pd.concat(dfs, ignore_index=True)
89+
90+
# Validate schema
91+
dataset_schema_names = df.columns.tolist()
8092
columns_to_null = []
8193
columns_not_null = []
8294
for column_name, is_nullable, null_value in getattr(
8395
Schema.Names, data_type
8496
):
8597
if column_name not in dataset_schema_names and not is_nullable:
8698
raise ValueError(
87-
f"error, file is not matching Pinecone Datasets Schmea: {column_name} not found"
99+
f"error, file is not matching Pinecone Datasets Schema: {column_name} not found"
88100
)
89101
elif column_name not in dataset_schema_names and is_nullable:
90102
columns_to_null.append((column_name, null_value))
91103
else:
92104
columns_not_null.append(column_name)
93-
try:
94-
# TODO: use of the columns_not_null and columns_to_null is only a workaround for proper schema validation and versioning
95-
df = dataset.read_pandas(columns=columns_not_null).to_pandas()
96-
97-
for column_name, null_value in columns_to_null:
98-
df[column_name] = null_value
99-
return df
100-
101-
# TODO: add more specific error handling, explain what is wrong
102-
except Exception as e:
103-
print("error, no exception: {}".format(e), file=sys.stderr)
104-
raise (e)
105+
106+
# Add null columns if needed
107+
for column_name, null_value in columns_to_null:
108+
df[column_name] = null_value
109+
110+
return df[columns_not_null + [col for col, _ in columns_to_null]]
111+
105112
else:
106113
warnings.warn(
107114
"WARNING: No data found at: {}. Returning empty dataframe".format(

pinecone_datasets/tqdm.py

Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,35 @@
import warnings

__all__ = ["tqdm"]

try:
    # Suppress the specific tqdm warning about IProgress (emitted in notebook
    # environments that lack ipywidgets) while importing the real tqdm.
    with warnings.catch_warnings():
        warnings.filterwarnings("ignore", category=UserWarning, module="tqdm")
        warnings.filterwarnings("ignore", message="IProgress not found.*")
        from tqdm.auto import tqdm
except ImportError:
    # Fallback: a no-op stand-in exposing the subset of the tqdm interface
    # this package relies on (iteration, update, close, context manager),
    # so progress reporting degrades silently when tqdm is not installed.
    class tqdm:  # type: ignore
        def __init__(self, iterable=None, total=None, desc="", **kwargs):
            # Mirror tqdm's constructor; any extra keyword arguments are
            # accepted for compatibility and ignored.
            self.iterable = iterable
            self.total = total
            self.desc = desc

        def __iter__(self):
            # Iterate the wrapped iterable without reporting progress.
            # Falling back to an empty tuple keeps manual-mode bars
            # (constructed with iterable=None and total=...) from raising
            # TypeError if they are accidentally iterated.
            return iter(self.iterable if self.iterable is not None else ())

        def update(self, n=1):
            # No-op: this stub does not track progress.
            pass

        def close(self):
            # No-op: real tqdm tears down its display here; the stub has
            # nothing to release. Provided so the common `bar.close()`
            # idiom does not raise AttributeError with the fallback.
            pass

        def __enter__(self):
            # Support `with tqdm(...) as bar:` usage.
            return self

        def __exit__(self, exc_type, exc_value, traceback):
            # Delegate cleanup to close() for parity with real tqdm.
            # Returning None (implicitly) means exceptions propagate.
            self.close()

pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,6 @@ gcsfs = "^2025.2.0"
1818
s3fs = "^2025.2.0"
1919
pydantic = "^2.0.0"
2020
pandas = "^2.0.0"
21-
tqdm = "^4.65.0"
2221
pyarrow = "^18.0.0"
2322

2423

@@ -36,6 +35,7 @@ pdoc = "^13.0.0"
3635
toml = "^0.10.2"
3736
pytest-xdist = "^3.3.1"
3837
tuna = "^0.5.11"
38+
tqdm = "^4.67.1"
3939

4040

4141

0 commit comments

Comments (0)