Merge pull request #38 from pinecone-io/serverless-merge

jamescalam · web-flow · commit 1d1e7eb2851e · 2024-01-16T13:36:15.000Z
[merging of client update] merge workaround
diff --git a/.github/workflows/PR.yml b/.github/workflows/PR.yml
@@ -32,10 +32,11 @@ jobs:
         AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
         PINECONE_API_KEY: ${{ secrets.PINECONE_API_KEY }}
         PINECONE_ENVIRONMENT: ${{ secrets.PINECONE_ENVIRONMENT }}
-      run: poetry run pytest -v --html=report.html --cov pinecone_datasets tests/system
+      run: |
+        poetry run pytest -n 4 --html=report.html --cov pinecone_datasets tests/system
     - name: upload pytest report.html
       uses: actions/upload-artifact@v3
       if: always()
       with:
         name: dataset-pytest-report-py${{ matrix.python-version }}
-        path: report.html
+        path: report.html
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
@@ -32,7 +32,7 @@ jobs:
         PY_VERSION: ${{ matrix.python-version }}
         AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }}
         AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
-      run: poetry run pytest -v --html=report.html --cov pinecone_datasets tests/unit
+      run: poetry run pytest -n 4 --html=report.html --cov pinecone_datasets tests/unit
     - name: upload pytest report.html
       uses: actions/upload-artifact@v3
       if: always()
diff --git a/README.md b/README.md
@@ -136,7 +136,7 @@ df = pd.read_parquet("my-dataset.parquet")
 
 metadata = DatasetMetadata(**metadata_dict)
 
-dataset = Dataset.from_pandas(documents = df, quries = None, metadata = metadata)
+dataset = Dataset.from_pandas(documents = df, queries = None, metadata = metadata)
 ```
 
 Please check the documentation for more information on the expected dataframe schema. There's also a column mapping variable that can be used to map the dataframe columns to the expected schema.
@@ -235,9 +235,10 @@ await ds.to_pinecone_index_async("index_name")
 
 ```
 
-the `to_index` function also accepts additional parameters
+the `to_pinecone_index` function also accepts additional parameters:
 
 * `batch_size` for controlling the upserting process
+* `api_key` for passing your API key, otherwise you can set the `PINECONE_API_KEY` environment variable
 * `kwargs` - for passing additional parameters to the index creation process
 
 
diff --git a/pinecone_datasets/__init__.py b/pinecone_datasets/__init__.py
@@ -2,7 +2,7 @@
 .. include:: ../README.md
 """
 
-__version__ = "0.6.2"
+__version__ = "0.6.3"
 
 
 from .dataset import Dataset, DatasetInitializationError
diff --git a/pinecone_datasets/dataset.py b/pinecone_datasets/dataset.py
@@ -1,29 +1,22 @@
-import glob
 import sys
 import os
-import itertools
-import time
 import json
-import asyncio
+import time
 import warnings
 from urllib.parse import urlparse
 from dataclasses import dataclass
-from importlib.metadata import version
 
-import gcsfs
-import s3fs
 import pandas as pd
-from tqdm.auto import tqdm
 import pyarrow.parquet as pq
 from pydantic import ValidationError
-from typing import Any, Generator, Iterator, List, Union, Dict, Optional, Tuple
+from typing import Any, Generator, Iterator, List, Dict, Optional, Tuple, NamedTuple
 
 from pinecone_datasets import cfg
 from pinecone_datasets.catalog import DatasetMetadata
-from pinecone_datasets.fs import get_cloud_fs, LocalFileSystem
+from pinecone_datasets.fs import get_cloud_fs
 
 import pinecone as pc
-from pinecone import Index
+from pinecone import Index, ServerlessSpec, PodSpec
 
 
 class DatasetInitializationError(Exception):
@@ -446,7 +439,7 @@ def to_catalog(
     def _upsert_to_index(
         self, index_name: str, namespace: str, batch_size: int, show_progress: bool
     ):
-        pinecone_index = Index(index_name=index_name)
+        pinecone_index = self._pinecone_client.Index(index_name)
 
         res = pinecone_index.upsert_from_dataframe(
             self.documents[self._config.Schema.documents_select_columns].dropna(
@@ -461,41 +454,51 @@ def _upsert_to_index(
     def _set_pinecone_index(
         self,
         api_key: Optional[str] = None,
-        environment: Optional[str] = None,
         **kwargs,
     ) -> None:
-        pc.init(api_key=api_key, environment=environment, **kwargs)
-        self._pinecone_client = pc
+        self._pinecone_client = pc.Pinecone(api_key=api_key, **kwargs)
+
+    def _get_index_list(self) -> List[str]:
+        return self._pinecone_client.list_indexes().names()
 
     def _create_index(
         self,
         index_name: str,
         api_key: Optional[str] = None,
-        environment: Optional[str] = None,
+        spec: Optional[NamedTuple] = None,
         **kwargs,
     ) -> Index:
-        self._set_pinecone_index(api_key=api_key, environment=environment)
-        pinecone_index_list = self._pinecone_client.list_indexes()
+        self._set_pinecone_index(api_key=api_key)
+        pinecone_index_list = self._get_index_list()
 
         if index_name in pinecone_index_list:
             raise ValueError(
                 f"index {index_name} already exists, Pinecone Datasets can only be upserted to a new indexe"
             )
         else:
             # create index
-            print("creating index")
             try:
                 self._pinecone_client.create_index(
                     name=index_name,
                     dimension=self.metadata.dense_model.dimension,
+                    spec=spec,
                     **kwargs,
                 )
-                print("index created")
+                self._wait_for_index_creation(index_name)
                 return True
             except Exception as e:
                 print(f"error creating index: {e}")
                 return False
 
+    def _wait_for_index_creation(self, index_name: str, timeout: int = 60):
+        for _ in range(timeout):
+            try:
+                self._pinecone_client.Index(index_name).describe_index_stats()
+                return
+            except Exception as e:
+                time.sleep(1)
+        raise TimeoutError(f"Index creation timed out after {timeout} seconds")
+
     def to_pinecone_index(
         self,
         index_name: str,
@@ -505,20 +508,31 @@ def to_pinecone_index(
         show_progress: bool = True,
         api_key: Optional[str] = None,
         environment: Optional[str] = None,
+        region: Optional[str] = None,
+        cloud: Optional[str] = None,
+        serverless: Optional[bool] = None,
         **kwargs,
     ):
         """
         Saves the dataset to a Pinecone index.
 
-        this function will look for two environment variables:
+        this function will look for five environment variables:
+        - SERVERLESS
         - PINECONE_API_KEY
+        - PINECONE_REGION
+        - PINECONE_CLOUD
         - PINECONE_ENVIRONMENT
 
         Then, it will init a Pinecone Client and will perform an upsert to the index.
         The upsert will be using async batches to increase performance.
 
         Args:
             index_name (str): the name of the index to upsert to
+            api_key (str, optional): the api key to use for the upsert. Defaults to None.
+            region (str, optional): the region to use for the upsert for serverless. Defaults to None.
+            cloud (str, optional): the cloud to use for the upsert for serverless. Defaults to None.
+            environment (str, optional): the environment to use for the upsert for pod-based. Defaults to None.
+            serverless (bool, optional): whether to use serverless or pod-based. Defaults to None.
             namespace (str, optional): the namespace to use for the upsert. Defaults to "".
             batch_size (int, optional): the batch size to use for the upsert. Defaults to 100.
             show_progress (bool, optional): whether to show a progress bar while upserting. Defaults to True.
@@ -536,13 +550,21 @@ def to_pinecone_index(
             result = dataset.to_pinecone_index(index_name="my_index")
             ```
         """
+        serverless = serverless or os.environ.get("SERVERLESS", False)
+        if serverless:
+            spec = ServerlessSpec(
+                cloud=cloud or os.getenv("PINECONE_CLOUD", "aws"),
+                region=region or os.getenv("PINECONE_REGION", "us-west2"),
+            )
+        else:
+            spec = PodSpec(
+                environment=environment or os.environ["PINECONE_ENVIRONMENT"],
+            )
         if should_create_index:
-            if not self._create_index(
-                index_name, api_key=api_key, environment=environment, **kwargs
-            ):
+            if not self._create_index(index_name, api_key=api_key, spec=spec, **kwargs):
                 raise RuntimeError("index creation failed")
         else:
-            self._set_pinecone_index(api_key=api_key, environment=environment, **kwargs)
+            self._set_pinecone_index(api_key=api_key, **kwargs)
 
         return self._upsert_to_index(
             index_name=index_name,
diff --git a/pyproject.toml b/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "pinecone-datasets"
-version = "0.6.2"
+version = "0.6.3"
 description = "Pinecone Datasets lets you easily load datasets into your Pinecone index."
 authors = ["Pinecone Relevance Team <relevance@pinecone.io>"]
 maintainers = [
@@ -12,15 +12,15 @@ readme = "README.md"
 
 
 [tool.poetry.dependencies]
-python = "^3.8"
+python = ">=3.8,<3.13"
 pyarrow = "^11.0.0"
 fsspec = "^2023.1.0"
 gcsfs = "^2023.1.0"
 s3fs = "^2023.1.0"
 pydantic = "^1.10.5"
 pandas = "^2.0.0"
 tqdm = "^4.65.0"
-pinecone-client = "^2.2.2"
+pinecone-client = "^3.0.0"
 
 
 [tool.poetry.group.dev]
@@ -35,6 +35,8 @@ pytest = "^7.2.2"
 pytest-html = "^3.2.0"
 pdoc = "^13.0.0"
 toml = "^0.10.2"
+pytest-xdist = "^3.3.1"
+
 
 
 [build-system]
diff --git a/tests/system/test_io_pinecone.py b/tests/system/test_io_pinecone.py
diff --git a/tests/unit/test_basics.py b/tests/unit/test_basics.py