
Commit 88f7fa0

Merge pull request #1691 from basetenlabs/bump-version-0.9.98
Release 0.9.98
2 parents 6180bb7 + fd551ed commit 88f7fa0

File tree

27 files changed: +470 −160 lines changed

.github/workflows/main.yml

Lines changed: 23 additions & 0 deletions
@@ -30,6 +30,29 @@ jobs:
     with:
       run_only_integration: false

+  report_to_slack:
+    runs-on: ubuntu-22.04
+    if: always() && github.ref == 'refs/heads/main'
+    needs:
+      - all-tests
+    steps:
+      - name: get-branch
+        run: echo ${{ github.ref }}
+      - name: show-slack-status
+        uses: 8398a7/action-slack@v3
+        with:
+          status: custom
+          fields: author, job, commit, repo
+          custom_payload: |
+            {
+              attachments: [{
+                color: "${{ needs.all-tests.result == 'failure' && 'danger' || 'good' }}",
+                text: `Truss post-commit tests ${{ needs.all-tests.result }}: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}`,
+              }]
+            }
+        env:
+          SLACK_WEBHOOK_URL: ${{ secrets.SLACK_WEBHOOK_URL }}
+
   publish-rc-to-pypi:
     needs: [detect-version-changed]
     if: ${{ !failure() && !cancelled() && needs.detect-version-changed.outputs.release_version == 'true' && needs.detect-version-changed.outputs.is_prerelease_version == 'true' }}

baseten-inference-client/Cargo.lock

Lines changed: 1 addition & 1 deletion
Some generated files are not rendered by default.

baseten-inference-client/Cargo.toml

Lines changed: 1 addition & 1 deletion
@@ -1,6 +1,6 @@
 [package]
 name = "baseten_inference_client"
-version = "0.0.1-rc3"
+version = "0.0.1"
 edition = "2021"

 [dependencies]
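
To confirm which build ends up installed after this version bump, a minimal sketch (assuming the Python wheel's distribution name matches the Cargo package name above, which this diff does not show):

```python
# Hypothetical check, not part of this commit: report the installed client version.
from importlib.metadata import PackageNotFoundError, version

try:
    print(version("baseten_inference_client"))  # expected to report 0.0.1 after this release
except PackageNotFoundError:
    print("baseten_inference_client is not installed")
```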

baseten-inference-client/README.md

Lines changed: 54 additions & 40 deletions
Original file: README.md

@@ -21,8 +21,8 @@ base_url_embed = "https://model-yqv0rjjw.api.baseten.co/environments/production/
 # base_url_embed = "https://api.openai.com" or "https://api.mixedbread.com"
 client = InferenceClient(base_url=base_url_embed, api_key=api_key)
 ```
-
-### Synchronous Embedding
+### Embeddings
+#### Synchronous Embedding

 ```python
 texts = ["Hello world", "Example text", "Another sample"]
@@ -58,12 +58,12 @@ if numpy_array.shape[0] > 0:

 Note: The embed method is versatile and can be used with any embeddings service, e.g. OpenAI API embeddings, not just for Baseten deployments.

-### Asynchronous Embedding
+#### Asynchronous Embedding

 ```python
 async def async_embed():
     texts = ["Async hello", "Async example"]
-    response = await client.aembed(
+    response = await client.async_embed(
         input=texts,
         model="my_model",
         batch_size=2,
@@ -76,8 +76,22 @@ async def async_embed():
 # asyncio.run(async_embed())
 ```

-### Synchronous Batch POST
+#### Embedding Benchmarks
+Comparison against `pip install openai` for `/v1/embeddings`. Tested with `./scripts/compare_latency_openai.py`, a mini_batch_size of 128, and 4 server-side replicas. Results with OpenAI are similar; OpenAI allows a max mini_batch_size of 2048.
+
+| Number of inputs / embeddings | Number of Tasks | InferenceClient (s) | AsyncOpenAI (s) | Speedup |
+|-------------------------------:|---------------:|---------------------:|----------------:|--------:|
+| 128 | 1 | 0.12 | 0.13 | 1.08× |
+| 512 | 4 | 0.14 | 0.21 | 1.50× |
+| 8 192 | 64 | 0.83 | 1.95 | 2.35× |
+| 131 072 | 1 024 | 4.63 | 39.07 | 8.44× |
+| 2 097 152 | 16 384 | 70.92 | 903.68 | 12.74× |
+
+### General Batch POST

+The batch_post method is generic. It can be used to send POST requests to any URL, not limited to Baseten endpoints. The input and output can be any JSON item.
+
+#### Synchronous Batch POST
 ```python
 payload1 = {"model": "my_model", "input": ["Batch request sample 1"]}
 payload2 = {"model": "my_model", "input": ["Batch request sample 2"]}
@@ -90,15 +104,12 @@ response1, response2 = client.batch_post(
 print("Batch POST responses:", response1, response2)
 ```

-Note: The batch_post method is generic. It can be used to send POST requests to any URL,
-not limited to Baseten endpoints.
-
-### Asynchronous Batch POST
+#### Asynchronous Batch POST

 ```python
 async def async_batch_post():
     payload = {"model": "my_model", "input": ["Async batch sample"]}
-    responses = await client.abatch_post(
+    responses = await client.async_batch_post(
         url_path="/v1/embeddings",
         payloads=[payload, payload],
         max_concurrent_requests=4,
@@ -109,8 +120,10 @@ async def async_batch_post():
 # To run:
 # asyncio.run(async_batch_post())
 ```
+### Reranking
+Reranking compatible with BEI or text-embeddings-inference.

-### Synchronous Reranking
+#### Synchronous Reranking

 ```python
 query = "What is the best framework?"
@@ -127,13 +140,13 @@ for res in rerank_response.data:
     print(f"Index: {res.index} Score: {res.score}")
 ```

-### Asynchronous Reranking
+#### Asynchronous Reranking

 ```python
 async def async_rerank():
     query = "Async query sample"
     docs = ["Async doc1", "Async doc2"]
-    response = await client.arerank(
+    response = await client.async_rerank(
         query=query,
         texts=docs,
         return_text=True,
@@ -148,7 +161,9 @@ async def async_rerank():
 # asyncio.run(async_rerank())
 ```

-### Synchronous Classification
+### Classification
+Predict (classification endpoint) compatible with BEI or text-embeddings-inference.
+#### Synchronous Classification

 ```python
 texts_to_classify = [
@@ -167,12 +182,11 @@ for group in classify_response.data:
         print(f"Label: {result.label}, Score: {result.score}")
 ```

-### Asynchronous Classification
-
+#### Asynchronous Classification
 ```python
 async def async_classify():
     texts = ["Async positive", "Async negative"]
-    response = await client.aclassify(
+    response = await client.async_classify(
         inputs=texts,
         batch_size=1,
         max_concurrent_requests=8,
@@ -187,28 +201,7 @@ async def async_classify():
 ```


-## Development
-
-```bash
-# Install prerequisites
-sudo apt-get install patchelf
-# Install cargo if not already installed.
-
-# Set up a Python virtual environment
-python -m venv .venv
-source .venv/bin/activate
-
-# Install development dependencies
-pip install maturin[patchelf] pytest requests numpy
-
-# Build and install the Rust extension in development mode
-maturin develop
-cargo fmt
-# Run tests
-pytest tests
-```
-
-## Error Handling
+### Error Handling

 The client can raise several types of errors. Here's how to handle common ones:

@@ -243,7 +236,28 @@ except requests.exceptions.HTTPError as e:

 ```

-For asynchronous methods (`aembed`, `arerank`, `aclassify`, `abatch_post`), the same exceptions will be raised by the `await` call and can be caught using a `try...except` block within an `async def` function.
+For asynchronous methods (`async_embed`, `async_rerank`, `async_classify`, `async_batch_post`), the same exceptions will be raised by the `await` call and can be caught using a `try...except` block within an `async def` function.
+
+## Development
+
+```bash
+# Install prerequisites
+sudo apt-get install patchelf
+# Install cargo if not already installed.
+
+# Set up a Python virtual environment
+python -m venv .venv
+source .venv/bin/activate
+
+# Install development dependencies
+pip install maturin[patchelf] pytest requests numpy
+
+# Build and install the Rust extension in development mode
+maturin develop
+cargo fmt
+# Run tests
+pytest tests
+```

 ## Contributions
 Feel free to contribute to this repo, tag @michaelfeil for review.
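
Taken together, the README changes above amount to a rename of every async coroutine (`aembed` → `async_embed`, `arerank` → `async_rerank`, `aclassify` → `async_classify`, `abatch_post` → `async_batch_post`). A minimal sketch stitching those snippets into one script, with placeholder values for the base URL, API key, and model name (none of which come from this commit):

```python
# Minimal sketch of the renamed async API; URL, key, and model are placeholders.
import asyncio

import requests
from baseten_inference_client import InferenceClient

client = InferenceClient(
    base_url="https://example.api.baseten.co/sync",  # placeholder
    api_key="YOUR_API_KEY",                          # placeholder
)

async def main() -> None:
    try:
        embed_response = await client.async_embed(
            input=["Hello world", "Example text"],
            model="my_model",
            batch_size=2,
            max_concurrent_requests=4,
        )
        rerank_response = await client.async_rerank(
            query="What is the best framework?",
            texts=["doc one", "doc two"],
            return_text=True,
        )
        classify_response = await client.async_classify(inputs=["positive", "negative"])
        print(
            len(embed_response.data),
            len(rerank_response.data),
            len(classify_response.data),
        )
    except requests.exceptions.HTTPError as exc:
        # Per the Error Handling section, the same exceptions surface from the await call.
        print("Request failed:", exc)

asyncio.run(main())
```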

baseten-inference-client/baseten_inference_client.pyi

Lines changed: 8 additions & 8 deletions
Original file: baseten_inference_client.pyi

@@ -383,7 +383,7 @@ class InferenceClient:
         """
         ...

-    async def aembed(
+    async def async_embed(
         self,
         input: builtins.list[builtins.str],
         model: builtins.str,
@@ -415,12 +415,12 @@ class InferenceClient:
            requests.exceptions.HTTPError: If the request fails.

        Example:
-            >>> response = await client.aembed(["hello", "world"], model="model-id")
+            >>> response = await client.async_embed(["hello", "world"], model="model-id")
            >>> print(response.data[0].embedding)
        """
        ...

-    async def arerank(
+    async def async_rerank(
        self,
        query: builtins.str,
        texts: builtins.list[builtins.str],
@@ -454,13 +454,13 @@ class InferenceClient:
            requests.exceptions.HTTPError: If the request fails.

        Example:
-            >>> response = await client.arerank("find", ["doc1", "doc2"])
+            >>> response = await client.async_rerank("find", ["doc1", "doc2"])
            >>> for result in response.data:
            ...     print(result.index, result.score)
        """
        ...

-    async def aclassify(
+    async def async_classify(
        self,
        inputs: builtins.list[builtins.str],
        raw_scores: builtins.bool = False,
@@ -490,14 +490,14 @@ class InferenceClient:
            requests.exceptions.HTTPError: If the request fails.

        Example:
-            >>> response = await client.aclassify(["text1", "text2"])
+            >>> response = await client.async_classify(["text1", "text2"])
            >>> for group in response.data:
            ...     for result in group:
            ...         print(result.label, result.score)
        """
        ...

-    async def abatch_post(
+    async def async_batch_post(
        self,
        url_path: builtins.str,
        payloads: builtins.list[typing.Any],
@@ -521,7 +521,7 @@ class InferenceClient:
            requests.exceptions.HTTPError: If any underlying HTTP requests fail.

        Example:
-            >>> responses = await client.abatch_post("/v1/process_item", [{"data": "r1"}, {"data": "r2"}])
+            >>> responses = await client.async_batch_post("/v1/process_item", [{"data": "r1"}, {"data": "r2"}])
            >>> for resp in responses:
            ...     print(resp)
        """

baseten-inference-client/src/lib.rs

Lines changed: 10 additions & 10 deletions
Original file: src/lib.rs

@@ -411,8 +411,8 @@ impl InferenceClient {
         Python::with_gil(|py_gil| Ok(successful_response.into_py(py_gil)))
     }

-    #[pyo3(name = "aembed", signature = (input, model, encoding_format = None, dimensions = None, user = None, max_concurrent_requests = DEFAULT_CONCURRENCY, batch_size = DEFAULT_BATCH_SIZE, timeout_s = DEFAULT_REQUEST_TIMEOUT_S))]
-    fn aembed<'py>(
+    #[pyo3(name = "async_embed", signature = (input, model, encoding_format = None, dimensions = None, user = None, max_concurrent_requests = DEFAULT_CONCURRENCY, batch_size = DEFAULT_BATCH_SIZE, timeout_s = DEFAULT_REQUEST_TIMEOUT_S))]
+    fn async_embed<'py>(
         &self,
         py: Python<'py>,
         input: Vec<String>,
@@ -513,8 +513,8 @@ impl InferenceClient {
         Python::with_gil(|py| Ok(successful_response.into_py(py)))
     }

-    #[pyo3(name = "arerank", signature = (query, texts, raw_scores = false, return_text = false, truncate = false, truncation_direction = "Right", max_concurrent_requests = DEFAULT_CONCURRENCY, batch_size = DEFAULT_BATCH_SIZE, timeout_s = DEFAULT_REQUEST_TIMEOUT_S))]
-    fn arerank<'py>(
+    #[pyo3(name = "async_rerank", signature = (query, texts, raw_scores = false, return_text = false, truncate = false, truncation_direction = "Right", max_concurrent_requests = DEFAULT_CONCURRENCY, batch_size = DEFAULT_BATCH_SIZE, timeout_s = DEFAULT_REQUEST_TIMEOUT_S))]
+    fn async_rerank<'py>(
         &self,
         py: Python<'py>,
         query: String,
@@ -614,8 +614,8 @@ impl InferenceClient {
         Python::with_gil(|py| Ok(result_from_async_task?.into_py(py)))
     }

-    #[pyo3(name = "aclassify", signature = (inputs, raw_scores = false, truncate = false, truncation_direction = "Right", max_concurrent_requests = DEFAULT_CONCURRENCY, batch_size = DEFAULT_BATCH_SIZE, timeout_s = DEFAULT_REQUEST_TIMEOUT_S))]
-    fn aclassify<'py>(
+    #[pyo3(name = "async_classify", signature = (inputs, raw_scores = false, truncate = false, truncation_direction = "Right", max_concurrent_requests = DEFAULT_CONCURRENCY, batch_size = DEFAULT_BATCH_SIZE, timeout_s = DEFAULT_REQUEST_TIMEOUT_S))]
+    fn async_classify<'py>(
         &self,
         py: Python<'py>,
         inputs: Vec<String>,
@@ -668,7 +668,7 @@ impl InferenceClient {
         if payloads.is_empty() {
             return Err(PyValueError::new_err("Payloads list cannot be empty"));
         }
-        InferenceClient::validate_concurrency_parameters(max_concurrent_requests, 1)?; // Batch size is effectively 1
+        InferenceClient::validate_concurrency_parameters(max_concurrent_requests, 1000)?; // sent batch size to 1000 to allow higher batch
         let timeout_duration = InferenceClient::validate_and_get_timeout_duration(timeout_s)?;

         // Depythonize all payloads in the current thread (GIL is held)
@@ -737,8 +737,8 @@ impl InferenceClient {
         Ok(py_object_list.into())
     }

-    #[pyo3(name = "abatch_post", signature = (url_path, payloads, max_concurrent_requests = DEFAULT_CONCURRENCY, timeout_s = DEFAULT_REQUEST_TIMEOUT_S))]
-    fn abatch_post<'py>(
+    #[pyo3(name = "async_batch_post", signature = (url_path, payloads, max_concurrent_requests = DEFAULT_CONCURRENCY, timeout_s = DEFAULT_REQUEST_TIMEOUT_S))]
+    fn async_batch_post<'py>(
         &self,
         py: Python<'py>,
         url_path: String,
@@ -749,7 +749,7 @@ impl InferenceClient {
         if payloads.is_empty() {
             return Err(PyValueError::new_err("Payloads list cannot be empty"));
         }
-        InferenceClient::validate_concurrency_parameters(max_concurrent_requests, 1)?; // Batch size is effectively 1
+        InferenceClient::validate_concurrency_parameters(max_concurrent_requests, 1000)?; // sent batch size to 1000 to allow higher batch
         let timeout_duration = InferenceClient::validate_and_get_timeout_duration(timeout_s)?;

         // Depythonize all payloads in the current thread (GIL is held by `py` argument)
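
Beyond the renames, the only functional change in this file is the second argument to `validate_concurrency_parameters` on the two batch-POST paths (1 → 1000), loosening the internal check for large batches. A hedged Python sketch of the kind of call this affects; the endpoint path, payload shape, and concurrency value are placeholders, and the exact limits enforced by `validate_concurrency_parameters` are not visible in these hunks:

```python
# Illustrative only: a larger batch_post call of the kind the relaxed validation targets.
from baseten_inference_client import InferenceClient

client = InferenceClient(
    base_url="https://example.api.baseten.co/sync",  # placeholder
    api_key="YOUR_API_KEY",                          # placeholder
)

# Any JSON-serializable payloads are accepted; these mirror the README examples.
payloads = [{"model": "my_model", "input": [f"sample {i}"]} for i in range(256)]

responses = client.batch_post(
    url_path="/v1/embeddings",
    payloads=payloads,
    max_concurrent_requests=32,
)
print(len(responses), "responses received")
```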

baseten-inference-client/tests/test_bindings.py

Lines changed: 4 additions & 4 deletions
Original file: tests/test_bindings.py

@@ -2,10 +2,10 @@ def test_baseten_inference_client_bindings_basic_test():
     from baseten_inference_client import InferenceClient

     InferenceClient.embed
-    InferenceClient.aembed
+    InferenceClient.async_embed
     InferenceClient.rerank
-    InferenceClient.arerank
+    InferenceClient.async_rerank
     InferenceClient.classify
-    InferenceClient.aclassify
+    InferenceClient.async_classify
     InferenceClient.batch_post
-    InferenceClient.abatch_post
+    InferenceClient.async_batch_post

baseten-inference-client/tests/test_client_embed.py

Lines changed: 3 additions & 3 deletions
Original file: tests/test_client_embed.py

@@ -274,7 +274,7 @@ def embed_job(start_time):
 async def test_embed_async():
     client = InferenceClient(base_url=base_url_embed, api_key=api_key)

-    response = await client.aembed(
+    response = await client.async_embed(
         ["Hello world", "Hello world 2"],
         model="my_model",
         batch_size=1,
@@ -296,7 +296,7 @@ async def test_embed_async():
 async def test_classify_async():
     client = InferenceClient(base_url=base_url_rerank, api_key=api_key)

-    response = await client.aclassify(
+    response = await client.async_classify(
         inputs=["who, who?", "Paris france"], batch_size=2, max_concurrent_requests=2
     )
     assert response is not None
@@ -313,7 +313,7 @@ async def test_classify_async():
 async def test_rerank_async():
     client = InferenceClient(base_url=base_url_rerank, api_key=api_key)

-    response = await client.arerank(
+    response = await client.async_rerank(
         query="Who let the dogs out?",
         texts=["who, who?", "Paris france"],
         batch_size=2,
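
The tests above are bare `async def` functions, so they rely on whatever asyncio plugin or event-loop fixture the suite configures (not visible in this diff). A minimal standalone sketch that exercises the renamed method without any plugin, using placeholder credentials:

```python
# Minimal sketch, not part of this commit: drive async_embed with a plain event loop.
import asyncio

from baseten_inference_client import InferenceClient

def smoke_test_async_embed():
    client = InferenceClient(
        base_url="https://example.api.baseten.co/sync",  # placeholder
        api_key="YOUR_API_KEY",                          # placeholder
    )

    async def run():
        return await client.async_embed(
            ["Hello world", "Hello world 2"],
            model="my_model",
            batch_size=1,
        )

    response = asyncio.run(run())
    assert response is not None
```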
