Labelbox
diff --git a/‎examples/basics/custom_embeddings_sdk.ipynb
Lines changed: 269 additions & 0 deletions b/‎examples/basics/custom_embeddings_sdk.ipynb
Lines changed: 269 additions & 0 deletions
diff --git a/‎libs/labelbox/src/labelbox/adv_client.py
Lines changed: 115 additions & 0 deletions b/‎libs/labelbox/src/labelbox/adv_client.py
Lines changed: 115 additions & 0 deletions
@@ -0,0 +1,269 @@
+{
+ "nbformat": 4,
+ "nbformat_minor": 0,
+ "metadata": {},
+ "cells": [
+  {
+   "metadata": {},
+   "source": [
+    "<td>\n",
+    "   <a target=\"_blank\" href=\"https://labelbox.com\" ><img src=\"https://labelbox.com/blog/content/images/2021/02/logo-v4.svg\" width=256/></a>\n",
+    "</td>"
+   ],
+   "cell_type": "markdown"
+  },
+  {
+   "metadata": {},
+   "source": [
+    "<td>\n",
+    "<a href=\"https://colab.research.google.com/github/Labelbox/labelbox-python/blob/master/examples/basics/custom_embeddings.ipynb\" target=\"_blank\"><img\n",
+    "src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"></a>\n",
+    "</td>\n",
+    "\n",
+    "<td>\n",
+    "<a href=\"https://github.com/Labelbox/labelbox-python/tree/master/examples/basics/custom_embeddings.ipynb\" target=\"_blank\"><img\n",
+    "src=\"https://img.shields.io/badge/GitHub-100000?logo=github&logoColor=white\" alt=\"GitHub\"></a>\n",
+    "</td>"
+   ],
+   "cell_type": "markdown"
+  },
+  {
+   "cell_type": "markdown",
+   "source": [
+    "# Custom Embeddings\n",
+    "\n",
+    "You can improve your data exploration and similarity search experience by adding your own custom embeddings. Labelbox allows you to upload up to 100 different custom embeddings on any kind of data. You can experiment with different embeddings to power your data selection."
+   ],
+   "metadata": {
+    "collapsed": false
+   }
+  },
+  {
+   "metadata": {},
+   "source": [
+    "# Setup"
+   ],
+   "cell_type": "markdown"
+  },
+  {
+   "metadata": {},
+   "source": [
+    "!pip3 install -q \"labelbox\""
+   ],
+   "cell_type": "code",
+   "outputs": [],
+   "execution_count": null
+  },
+  {
+   "metadata": {},
+   "source": [
+    "import labelbox as lb\n",
+    "import numpy as np\n",
+    "import json"
+   ],
+   "cell_type": "code",
+   "outputs": [],
+   "execution_count": null
+  },
+  {
+   "metadata": {},
+   "source": [
+    "API_KEY = \"\"\n",
+    "client = lb.Client(API_KEY)"
+   ],
+   "cell_type": "code",
+   "outputs": [],
+   "execution_count": null
+  },
+  {
+   "metadata": {},
+   "source": [
+    "# Select data rows in Labelbox for custom embeddings"
+   ],
+   "cell_type": "markdown"
+  },
+  {
+   "metadata": {},
+   "source": [
+    "client.enable_experimental = True\n",
+    "\n",
+    "# get images from a Labelbox dataset\n",
+    "# Our systems start to process data after 1000 embeddings of each type, for this demo make sure your dataset is over 1000 data rows\n",
+    "dataset = client.get_dataset(\"<ADD YOUR DATASET ID>\")\n",
+    "\n",
+    "export_task = dataset.export()\n",
+    "export_task.wait_till_done()"
+   ],
+   "cell_type": "code",
+   "outputs": [],
+   "execution_count": null
+  },
+  {
+   "metadata": {},
+   "source": [
+    "data_rows = []\n",
+    "\n",
+    "def json_stream_handler(output: lb.JsonConverterOutput):\n",
+    "  data_row = json.loads(output.json_str)\n",
+    "  data_rows.append(data_row)\n",
+    "\n",
+    "if export_task.has_errors():\n",
+    "  export_task.get_stream(\n",
+    "  converter=lb.JsonConverter(),\n",
+    "  stream_type=lb.StreamType.ERRORS\n",
+    "  ).start(stream_handler=lambda error: print(error))\n",
+    "\n",
+    "if export_task.has_result():\n",
+    "  export_json = export_task.get_stream(\n",
+    "    converter=lb.JsonConverter(),\n",
+    "    stream_type=lb.StreamType.RESULT\n",
+    "  ).start(stream_handler=json_stream_handler)"
+   ],
+   "cell_type": "code",
+   "outputs": [],
+   "execution_count": null
+  },
+  {
+   "metadata": {},
+   "source": [
+    "data_row_ids = [dr[\"data_row\"][\"id\"] for dr in data_rows]\n",
+    "\n",
+    "data_row_ids = data_row_ids[:1000] # keep the first 1000 examples for the sake of this demo"
+   ],
+   "cell_type": "code",
+   "outputs": [],
+   "execution_count": null
+  },
+  {
+   "metadata": {},
+   "source": [
+    "# Create the payload for custom embeddings\n",
+    "-- It should be a .ndjson file.   \n",
+    "-- Every line is a json file that finishes with a \\n character.  \n",
+    "-- It does not have to be created through Python.  "
+   ],
+   "cell_type": "markdown"
+  },
+  {
+   "metadata": {},
+   "source": [
+    "nb_data_rows = len(data_row_ids)\n",
+    "print(\"Number of data rows: \", nb_data_rows)\n",
+    "# Generate random vectors, of dimension 2048 each\n",
+    "# Labelbox supports custom embedding vectors of dimension up to 2048\n",
+    "custom_embeddings = [list(np.random.random(2048)) for _ in range(nb_data_rows)]"
+   ],
+   "cell_type": "code",
+   "outputs": [],
+   "execution_count": null
+  },
+  {
+   "metadata": {},
+   "source": [
+    "# Create the payload for custom embeddings\n",
+    "payload = []\n",
+    "for data_row_id,custom_embedding in zip(data_row_ids,custom_embeddings):\n",
+    "  payload.append({\"id\": data_row_id, \"vector\": custom_embedding})\n",
+    "\n",
+    "print('payload', len(payload),payload[:1])"
+   ],
+   "cell_type": "code",
+   "outputs": [],
+   "execution_count": null
+  },
+  {
+   "metadata": {},
+   "source": [
+    "# Delete any pre-existing file\n",
+    "import os\n",
+    "if os.path.exists(\"payload.ndjson\"):\n",
+    "  os.remove(\"payload.ndjson\")\n",
+    "\n",
+    "# Convert the payload to a JSON file\n",
+    "with open('payload.ndjson', 'w') as f:\n",
+    "  for p in payload:\n",
+    "    f.write(json.dumps(p) + \"\\n\")\n",
+    "    # sanity_check_payload = json.dump(payload, f)"
+   ],
+   "cell_type": "code",
+   "outputs": [],
+   "execution_count": null
+  },
+  {
+   "metadata": {},
+   "source": [
+    "# Sanity check that you can read/load the file and the payload is correct\n",
+    "with open('payload.ndjson') as f:\n",
+    "    sanity_check_payload = [json.loads(l) for l in f.readlines()]\n",
+    "print(\"Nb of custom embedding vectors in sanity_check_payload: \", len(sanity_check_payload))"
+   ],
+   "cell_type": "code",
+   "outputs": [],
+   "execution_count": null
+  },
+  {
+   "metadata": {},
+   "source": [
+    "# See all custom embeddings available in your Labelbox workspace\n",
+    "embeddings = client.get_embeddings()"
+   ],
+   "cell_type": "code",
+   "outputs": [],
+   "execution_count": null
+  },
+  {
+   "metadata": {},
+   "source": [
+    "# Create a new custom embedding, unless you want to re-use one\n",
+    "embedding = client.create_embedding(\"my_custom_embedding_2048_dimensions\", 2048)"
+   ],
+   "cell_type": "code",
+   "outputs": [],
+   "execution_count": null
+  },
+  {
+   "metadata": {},
+   "source": [
+    "# Delete a custom embedding\n",
+    "embedding.delete()"
+   ],
+   "cell_type": "code",
+   "outputs": [],
+   "execution_count": null
+  },
+  {
+   "metadata": {},
+   "source": [
+    "# Upload the payload to Labelbox"
+   ],
+   "cell_type": "markdown"
+  },
+  {
+   "metadata": {},
+   "source": [
+    "# Replace the current id with the newly generated id from the previous step, or any existing custom embedding id\n",
+    "embedding.import_vectors_from_file(\"./payload.ndjson\")"
+   ],
+   "cell_type": "code",
+   "outputs": [],
+   "execution_count": null
+  },
+  {
+   "metadata": {},
+   "source": [
+    "# Get the count of imported vectors for a custom embedding"
+   ],
+   "cell_type": "markdown"
+  },
+  {
+   "metadata": {},
+   "source": [
+    "# Count how many data rows have a specific custom embedding (this can take a couple of minutes)\n",
+    "count = embedding.get_imported_vector_count()"
+   ],
+   "cell_type": "code",
+   "outputs": [],
+   "execution_count": null
+  }
+ ]
+}
@@ -0,0 +1,115 @@
+import io
+import json
+import logging
+from typing import Dict, Any, Optional, List, Callable
+from urllib.parse import urlparse
+
+import requests
+from requests import Session, Response
+
+logger = logging.getLogger(__name__)
+
+
+class AdvClient:
+
+    def __init__(self, endpoint: str, api_key: str):
+        self.endpoint = endpoint
+        self.api_key = api_key
+        self.session = self._create_session()
+
+    def create_embedding(self, name: str, dims: int) -> Dict[str, Any]:
+        data = {"name": name, "dims": dims}
+        return self._request("POST", "/adv/v1/embeddings", data)
+
+    def delete_embedding(self, id: str):
+        return self._request("DELETE", f"/adv/v1/embeddings/{id}")
+
+    def get_embedding(self, id: str) -> Dict[str, Any]:
+        return self._request("GET", f"/adv/v1/embeddings/{id}")
+
+    def get_embeddings(self) -> List[Dict[str, Any]]:
+        return self._request("GET", "/adv/v1/embeddings").get("results", [])
+
+    def import_vectors_from_file(self, id: str, file_path: str, callback=None):
+        self._send_ndjson(f"/adv/v1/embeddings/{id}/_import_ndjson", file_path,
+                          callback)
+
+    def get_imported_vector_count(self, id: str) -> int:
+        data = self._request("GET", f"/adv/v1/embeddings/{id}/vectors/_count")
+        return data.get("count", 0)
+
+    def _create_session(self) -> Session:
+        session = requests.session()
+        session.headers.update({
+            "Authorization": f"Bearer {self.api_key}",
+            "Content-Type": "application/json"
+        })
+        return session
+
+    def _request(self,
+                 method: str,
+                 path: str,
+                 data: Optional[Dict[str, Any]] = None,
+                 headers: Optional[Dict[str, Any]] = None) -> Dict[str, Any]:
+        url = f"{self.endpoint}{path}"
+        requests_data = None
+        if data:
+            requests_data = json.dumps(data)
+        response = self.session.request(method,
+                                        url,
+                                        data=requests_data,
+                                        headers=headers)
+        response.raise_for_status()
+        return response.json()
+
+    def _send_ndjson(self,
+                     path: str,
+                     file_path: str,
+                     callback: Optional[Callable[[Dict[str, Any]],
+                                                 None]] = None):
+        """
+        Sends an NDJson file in chunks.
+
+        Args:
+            path: The URL path
+            file_path: The path to the NDJSON file.
+            callback: A callback to run for each chunk uploaded.
+        """
+
+        def upload_chunk(_buffer, _count):
+            _buffer.write(b"\n")
+            _headers = {
+                "Content-Type": "application/x-ndjson",
+                "X-Content-Lines": str(_count),
+                "Content-Length": str(buffer.tell())
+            }
+            rsp = self._send_bytes(f"{self.endpoint}{path}", _buffer, _headers)
+            rsp.raise_for_status()
+            if callback:
+                callback(rsp.json())
+
+        buffer = io.BytesIO()
+        count = 0
+        with open(file_path, 'rb') as fp:
+            for line in fp:
+                buffer.write(line)
+                count += 1
+                if count >= 1000:
+                    upload_chunk(buffer, count)
+                    buffer = io.BytesIO()
+                    count = 0
+        if count:
+            upload_chunk(buffer, count)
+
+    def _send_bytes(self,
+                    url: str,
+                    buffer: io.BytesIO,
+                    headers: Optional[Dict[str, Any]] = None) -> Response:
+        buffer.seek(0)
+        return self.session.put(url, headers=headers, data=buffer)
+
+    @classmethod
+    def factory(cls, api_endpoint: str, api_key: str) -> "AdvClient":
+        parsed_url = urlparse(api_endpoint)
+        endpoint = f"{parsed_url.scheme}://{parsed_url.netloc}/adv"
+        return AdvClient(endpoint, api_key)