Skip to content

Commit 1d1e7eb

Browse files
authored
Merge pull request #38 from pinecone-io/serverless-merge
[merging of client update] merge workaround
2 parents 01ae732 + 36c7399 commit 1d1e7eb

File tree

8 files changed

+112
-74
lines changed

8 files changed

+112
-74
lines changed

.github/workflows/PR.yml

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -32,10 +32,11 @@ jobs:
3232
AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
3333
PINECONE_API_KEY: ${{ secrets.PINECONE_API_KEY }}
3434
PINECONE_ENVIRONMENT: ${{ secrets.PINECONE_ENVIRONMENT }}
35-
run: poetry run pytest -v --html=report.html --cov pinecone_datasets tests/system
35+
run: |
36+
poetry run pytest -n 4 --html=report.html --cov pinecone_datasets tests/system
3637
- name: upload pytest report.html
3738
uses: actions/upload-artifact@v3
3839
if: always()
3940
with:
4041
name: dataset-pytest-report-py${{ matrix.python-version }}
41-
path: report.html
42+
path: report.html

.github/workflows/ci.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -32,7 +32,7 @@ jobs:
3232
PY_VERSION: ${{ matrix.python-version }}
3333
AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }}
3434
AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
35-
run: poetry run pytest -v --html=report.html --cov pinecone_datasets tests/unit
35+
run: poetry run pytest -n 4 --html=report.html --cov pinecone_datasets tests/unit
3636
- name: upload pytest report.html
3737
uses: actions/upload-artifact@v3
3838
if: always()

README.md

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -136,7 +136,7 @@ df = pd.read_parquet("my-dataset.parquet")
136136

137137
metadata = DatasetMetadata(**metadata_dict)
138138

139-
dataset = Dataset.from_pandas(documents = df, quries = None, metadata = metadata)
139+
dataset = Dataset.from_pandas(documents = df, queries = None, metadata = metadata)
140140
```
141141

142142
Please check the documentation for more information on the expected dataframe schema. There's also a column mapping variable that can be used to map the dataframe columns to the expected schema.
@@ -235,9 +235,10 @@ await ds.to_pinecone_index_async("index_name")
235235

236236
```
237237

238-
the `to_index` function also accepts additional parameters
238+
the `to_pinecone_index` function also accepts additional parameters:
239239

240240
* `batch_size` for controlling the upserting process
241+
* `api_key` for passing your API key; otherwise it will be read from the `PINECONE_API_KEY` environment variable
241242
* `kwargs` - for passing additional parameters to the index creation process
242243

243244

pinecone_datasets/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22
.. include:: ../README.md
33
"""
44

5-
__version__ = "0.6.2"
5+
__version__ = "0.6.3"
66

77

88
from .dataset import Dataset, DatasetInitializationError

pinecone_datasets/dataset.py

Lines changed: 47 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -1,29 +1,22 @@
1-
import glob
21
import sys
32
import os
4-
import itertools
5-
import time
63
import json
7-
import asyncio
4+
import time
85
import warnings
96
from urllib.parse import urlparse
107
from dataclasses import dataclass
11-
from importlib.metadata import version
128

13-
import gcsfs
14-
import s3fs
159
import pandas as pd
16-
from tqdm.auto import tqdm
1710
import pyarrow.parquet as pq
1811
from pydantic import ValidationError
19-
from typing import Any, Generator, Iterator, List, Union, Dict, Optional, Tuple
12+
from typing import Any, Generator, Iterator, List, Dict, Optional, Tuple, NamedTuple
2013

2114
from pinecone_datasets import cfg
2215
from pinecone_datasets.catalog import DatasetMetadata
23-
from pinecone_datasets.fs import get_cloud_fs, LocalFileSystem
16+
from pinecone_datasets.fs import get_cloud_fs
2417

2518
import pinecone as pc
26-
from pinecone import Index
19+
from pinecone import Index, ServerlessSpec, PodSpec
2720

2821

2922
class DatasetInitializationError(Exception):
@@ -446,7 +439,7 @@ def to_catalog(
446439
def _upsert_to_index(
447440
self, index_name: str, namespace: str, batch_size: int, show_progress: bool
448441
):
449-
pinecone_index = Index(index_name=index_name)
442+
pinecone_index = self._pinecone_client.Index(index_name)
450443

451444
res = pinecone_index.upsert_from_dataframe(
452445
self.documents[self._config.Schema.documents_select_columns].dropna(
@@ -461,41 +454,51 @@ def _upsert_to_index(
461454
def _set_pinecone_index(
462455
self,
463456
api_key: Optional[str] = None,
464-
environment: Optional[str] = None,
465457
**kwargs,
466458
) -> None:
467-
pc.init(api_key=api_key, environment=environment, **kwargs)
468-
self._pinecone_client = pc
459+
self._pinecone_client = pc.Pinecone(api_key=api_key, **kwargs)
460+
461+
def _get_index_list(self) -> List[str]:
462+
return self._pinecone_client.list_indexes().names()
469463

470464
def _create_index(
471465
self,
472466
index_name: str,
473467
api_key: Optional[str] = None,
474-
environment: Optional[str] = None,
468+
spec: Optional[NamedTuple] = None,
475469
**kwargs,
476470
) -> Index:
477-
self._set_pinecone_index(api_key=api_key, environment=environment)
478-
pinecone_index_list = self._pinecone_client.list_indexes()
471+
self._set_pinecone_index(api_key=api_key)
472+
pinecone_index_list = self._get_index_list()
479473

480474
if index_name in pinecone_index_list:
481475
raise ValueError(
482476
f"index {index_name} already exists; Pinecone Datasets can only be upserted to a new index"
483477
)
484478
else:
485479
# create index
486-
print("creating index")
487480
try:
488481
self._pinecone_client.create_index(
489482
name=index_name,
490483
dimension=self.metadata.dense_model.dimension,
484+
spec=spec,
491485
**kwargs,
492486
)
493-
print("index created")
487+
self._wait_for_index_creation(index_name)
494488
return True
495489
except Exception as e:
496490
print(f"error creating index: {e}")
497491
return False
498492

493+
def _wait_for_index_creation(self, index_name: str, timeout: int = 60):
494+
for _ in range(timeout):
495+
try:
496+
self._pinecone_client.Index(index_name).describe_index_stats()
497+
return
498+
except Exception as e:
499+
time.sleep(1)
500+
raise TimeoutError(f"Index creation timed out after {timeout} seconds")
501+
499502
def to_pinecone_index(
500503
self,
501504
index_name: str,
@@ -505,20 +508,31 @@ def to_pinecone_index(
505508
show_progress: bool = True,
506509
api_key: Optional[str] = None,
507510
environment: Optional[str] = None,
511+
region: Optional[str] = None,
512+
cloud: Optional[str] = None,
513+
serverless: Optional[bool] = None,
508514
**kwargs,
509515
):
510516
"""
511517
Saves the dataset to a Pinecone index.
512518
513-
this function will look for two environment variables:
519+
this function will look for four environment variables:
520+
- SERVERLESS
514521
- PINECONE_API_KEY
522+
- PINECONE_REGION
523+
- PINECONE_CLOUD
515524
- PINECONE_ENVIRONMENT
516525
517526
Then, it will init a Pinecone Client and will perform an upsert to the index.
518527
The upsert will be using async batches to increase performance.
519528
520529
Args:
521530
index_name (str): the name of the index to upsert to
531+
api_key (str, optional): the api key to use for the upsert. Defaults to None.
532+
region (str, optional): the region to use for the upsert for serverless. Defaults to None.
533+
cloud (str, optional): the cloud to use for the upsert for serverless. Defaults to None.
534+
environment (str, optional): the environment to use for the upsert for pod-based. Defaults to None.
535+
serverless (bool, optional): whether to use serverless or pod-based. Defaults to None.
522536
namespace (str, optional): the namespace to use for the upsert. Defaults to "".
523537
batch_size (int, optional): the batch size to use for the upsert. Defaults to 100.
524538
show_progress (bool, optional): whether to show a progress bar while upserting. Defaults to True.
@@ -536,13 +550,21 @@ def to_pinecone_index(
536550
result = dataset.to_pinecone_index(index_name="my_index")
537551
```
538552
"""
553+
serverless = serverless or os.environ.get("SERVERLESS", False)
554+
if serverless:
555+
spec = ServerlessSpec(
556+
cloud=cloud or os.getenv("PINECONE_CLOUD", "aws"),
557+
region=region or os.getenv("PINECONE_REGION", "us-west2"),
558+
)
559+
else:
560+
spec = PodSpec(
561+
environment=environment or os.environ["PINECONE_ENVIRONMENT"],
562+
)
539563
if should_create_index:
540-
if not self._create_index(
541-
index_name, api_key=api_key, environment=environment, **kwargs
542-
):
564+
if not self._create_index(index_name, api_key=api_key, spec=spec, **kwargs):
543565
raise RuntimeError("index creation failed")
544566
else:
545-
self._set_pinecone_index(api_key=api_key, environment=environment, **kwargs)
567+
self._set_pinecone_index(api_key=api_key, **kwargs)
546568

547569
return self._upsert_to_index(
548570
index_name=index_name,

pyproject.toml

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
[tool.poetry]
22
name = "pinecone-datasets"
3-
version = "0.6.2"
3+
version = "0.6.3"
44
description = "Pinecone Datasets lets you easily load datasets into your Pinecone index."
55
authors = ["Pinecone Relevance Team <relevance@pinecone.io>"]
66
maintainers = [
@@ -12,15 +12,15 @@ readme = "README.md"
1212

1313

1414
[tool.poetry.dependencies]
15-
python = "^3.8"
15+
python = ">=3.8,<3.13"
1616
pyarrow = "^11.0.0"
1717
fsspec = "^2023.1.0"
1818
gcsfs = "^2023.1.0"
1919
s3fs = "^2023.1.0"
2020
pydantic = "^1.10.5"
2121
pandas = "^2.0.0"
2222
tqdm = "^4.65.0"
23-
pinecone-client = "^2.2.2"
23+
pinecone-client = "^3.0.0"
2424

2525

2626
[tool.poetry.group.dev]
@@ -35,6 +35,8 @@ pytest = "^7.2.2"
3535
pytest-html = "^3.2.0"
3636
pdoc = "^13.0.0"
3737
toml = "^0.10.2"
38+
pytest-xdist = "^3.3.1"
39+
3840

3941

4042
[build-system]

0 commit comments

Comments
 (0)