|
39 | 39 | {
|
40 | 40 | "metadata": {},
|
41 | 41 | "source": [
|
42 |
| - "# Setup" |
| 42 | + "# Set up " |
43 | 43 | ],
|
44 | 44 | "cell_type": "markdown"
|
45 | 45 | },
|
46 | 46 | {
|
47 | 47 | "metadata": {},
|
48 | 48 | "source": [
|
49 |
| - "!pip3 install -q \"labelbox\"" |
| 49 | + "!pip3 install -q \"labelbox[data]\"" |
50 | 50 | ],
|
51 | 51 | "cell_type": "code",
|
52 | 52 | "outputs": [],
|
|
57 | 57 | "source": [
|
58 | 58 | "import labelbox as lb\n",
|
59 | 59 | "import numpy as np\n",
|
60 |
| - "import json" |
| 60 | + "import json\n", |
| 61 | + "import uuid\n", |
| 62 | + "import random" |
61 | 63 | ],
|
62 | 64 | "cell_type": "code",
|
63 | 65 | "outputs": [],
|
64 | 66 | "execution_count": null
|
65 | 67 | },
|
| 68 | + { |
| 69 | + "metadata": {}, |
| 70 | + "source": [ |
| 71 | + "# Replace with your API key" |
| 72 | + ], |
| 73 | + "cell_type": "markdown" |
| 74 | + }, |
66 | 75 | {
|
67 | 76 | "metadata": {},
|
68 | 77 | "source": [
|
|
76 | 85 | {
|
77 | 86 | "metadata": {},
|
78 | 87 | "source": [
|
79 |
| - "# Select data rows in Labelbox for custom embeddings" |
| 88 | + "# Select data rows" |
80 | 89 | ],
|
81 | 90 | "cell_type": "markdown"
|
82 | 91 | },
|
83 | 92 | {
|
84 | 93 | "metadata": {},
|
85 | 94 | "source": [
|
86 |
| - "client.enable_experimental = True\n", |
87 |
| - "\n", |
88 |
| - "# get images from a Labelbox dataset\n", |
89 |
| - "# Our systems start to process data after 1000 embeddings of each type, for this demo make sure your dataset is over 1000 data rows\n", |
90 |
| - "dataset = client.get_dataset(\"<DATASET-ID>\")\n", |
91 |
| - "\n", |
| 95 | + "- Get images from a Labelbox dataset\n", |
| 96 | + "- To improve similarity search, you need to upload custom embeddings to at least 1,000 data rows.\n" |
| 97 | + ], |
| 98 | + "cell_type": "markdown" |
| 99 | + }, |
| 100 | + { |
| 101 | + "metadata": {}, |
| 102 | + "source": [ |
| 103 | + "DATASET_ID = \"\"" |
| 104 | + ], |
| 105 | + "cell_type": "code", |
| 106 | + "outputs": [], |
| 107 | + "execution_count": null |
| 108 | + }, |
| 109 | + { |
| 110 | + "metadata": {}, |
| 111 | + "source": [ |
| 112 | + "dataset = client.get_dataset(dataset_id=DATASET_ID)\n", |
92 | 113 | "export_task = dataset.export()\n",
|
93 | 114 | "export_task.wait_till_done()"
|
94 | 115 | ],
|
|
124 | 145 | {
|
125 | 146 | "metadata": {},
|
126 | 147 | "source": [
|
127 |
| - "data_row_ids = [dr[\"data_row\"][\"id\"] for dr in data_rows]\n", |
128 |
| - "data_row_ids = data_row_ids[:1000] # keep the first 1000 examples for the sake of this demo" |
| 148 | + "data_row_dict = [{\"data_row_id\": dr[\"data_row\"][\"id\"], \"row_data\": dr[\"data_row\"][\"row_data\"]} for dr in data_rows]\n", |
| 149 | + "data_row_dict = data_row_dict[:1000] # keep the first 1000 examples for the sake of this demo" |
129 | 150 | ],
|
130 | 151 | "cell_type": "code",
|
131 | 152 | "outputs": [],
|
|
134 | 155 | {
|
135 | 156 | "metadata": {},
|
136 | 157 | "source": [
|
137 |
| - "# Create the payload for custom embeddings\n", |
138 |
| - "-- It should be a .ndjson file. \n", |
139 |
| - "-- Every line is a json file that finishes with a \\n character. \n", |
140 |
| - "-- It does not have to be created through Python. " |
| 158 | + "# Create custom embedding payload " |
141 | 159 | ],
|
142 | 160 | "cell_type": "markdown"
|
143 | 161 | },
|
144 | 162 | {
|
145 | 163 | "metadata": {},
|
146 | 164 | "source": [
|
147 |
| - "nb_data_rows = len(data_row_ids)\n", |
| 165 | + "Generate random vectors for embeddings (maximum: 2048 dimensions)" |
| 166 | + ], |
| 167 | + "cell_type": "markdown" |
| 168 | + }, |
| 169 | + { |
| 170 | + "metadata": {}, |
| 171 | + "source": [ |
| 172 | + "nb_data_rows = len(data_row_dict)\n", |
148 | 173 | "print(\"Number of data rows: \", nb_data_rows)\n",
|
149 |
| - "# Generate random vectors, of dimension 2048 each\n", |
150 | 174 | "# Labelbox supports custom embedding vectors of dimension up to 2048\n",
|
151 | 175 | "custom_embeddings = [list(np.random.random(2048)) for _ in range(nb_data_rows)]"
|
152 | 176 | ],
|
|
157 | 181 | {
|
158 | 182 | "metadata": {},
|
159 | 183 | "source": [
|
160 |
| - "# Create the payload for custom embeddings\n", |
161 |
| - "payload = []\n", |
162 |
| - "for data_row_id,custom_embedding in zip(data_row_ids,custom_embeddings):\n", |
163 |
| - " payload.append({\"id\": data_row_id, \"vector\": custom_embedding})\n", |
164 |
| - "\n", |
165 |
| - "print('payload', len(payload),payload[:1])" |
| 184 | + "List all custom embeddings available in your Labelbox workspace" |
| 185 | + ], |
| 186 | + "cell_type": "markdown" |
| 187 | + }, |
| 188 | + { |
| 189 | + "metadata": {}, |
| 190 | + "source": [ |
| 191 | + "embeddings = client.get_embeddings()" |
166 | 192 | ],
|
167 | 193 | "cell_type": "code",
|
168 | 194 | "outputs": [],
|
|
171 | 197 | {
|
172 | 198 | "metadata": {},
|
173 | 199 | "source": [
|
174 |
| - "# Delete any pre-existing file\n", |
175 |
| - "import os\n", |
176 |
| - "if os.path.exists(\"payload.ndjson\"):\n", |
177 |
| - " os.remove(\"payload.ndjson\")\n", |
178 |
| - "\n", |
179 |
| - "# Convert the payload to a JSON file\n", |
180 |
| - "with open('payload.ndjson', 'w') as f:\n", |
181 |
| - " for p in payload:\n", |
182 |
| - " f.write(json.dumps(p) + \"\\n\")\n", |
183 |
| - " # sanity_check_payload = json.dump(payload, f)" |
| 200 | + "Choose an existing embedding type or create a new one" |
| 201 | + ], |
| 202 | + "cell_type": "markdown" |
| 203 | + }, |
| 204 | + { |
| 205 | + "metadata": {}, |
| 206 | + "source": [ |
| 207 | + "# Name of the custom embedding must be unique\n", |
| 208 | + "embedding = client.create_embedding(\"my_custom_embedding_2048_dimensions\", 2048)" |
184 | 209 | ],
|
185 | 210 | "cell_type": "code",
|
186 | 211 | "outputs": [],
|
|
189 | 214 | {
|
190 | 215 | "metadata": {},
|
191 | 216 | "source": [
|
192 |
| - "# Sanity check that you can read/load the file and the payload is correct\n", |
193 |
| - "with open('payload.ndjson') as f:\n", |
194 |
| - " sanity_check_payload = [json.loads(l) for l in f.readlines()]\n", |
195 |
| - "print(\"Nb of custom embedding vectors in sanity_check_payload: \", len(sanity_check_payload))" |
| 217 | + "Create payload" |
| 218 | + ], |
| 219 | + "cell_type": "markdown" |
| 220 | + }, |
| 221 | + { |
| 222 | + "metadata": {}, |
| 223 | + "source": [ |
| 224 | + "- The payload should include all the data you wish to retain, along with the new embedding vector data.\n", |
| 225 | + "- `row_data` and `key` are required when using `dataset.upsert_data_rows()`" |
| 226 | + ], |
| 227 | + "cell_type": "markdown" |
| 228 | + }, |
| 229 | + { |
| 230 | + "metadata": {}, |
| 231 | + "source": [ |
| 232 | + "payload = []\n", |
| 233 | + "for data_row_dict, custom_embedding in zip(data_row_dict,custom_embeddings):\n", |
| 234 | + " payload.append({\"key\": lb.UniqueId(data_row_dict['data_row_id']), \"row_data\": data_row_dict['row_data'], \"embeddings\": [{\"embedding_id\": embedding.id, \"vector\": custom_embedding}]})\n", |
| 235 | + "\n", |
| 236 | + "print('payload', len(payload),payload[:1])" |
196 | 237 | ],
|
197 | 238 | "cell_type": "code",
|
198 | 239 | "outputs": [],
|
|
201 | 242 | {
|
202 | 243 | "metadata": {},
|
203 | 244 | "source": [
|
204 |
| - "# See all custom embeddings available in your Labelbox workspace\n", |
205 |
| - "embeddings = client.get_embeddings()" |
| 245 | + "# Upload payload" |
| 246 | + ], |
| 247 | + "cell_type": "markdown" |
| 248 | + }, |
| 249 | + { |
| 250 | + "metadata": {}, |
| 251 | + "source": [ |
| 252 | + "Upsert data rows with custom embeddings" |
| 253 | + ], |
| 254 | + "cell_type": "markdown" |
| 255 | + }, |
| 256 | + { |
| 257 | + "metadata": {}, |
| 258 | + "source": [ |
| 259 | + "task = dataset.upsert_data_rows(payload)\n", |
| 260 | + "task.wait_till_done()\n", |
| 261 | + "print(task.errors)\n", |
| 262 | + "print(task.status)" |
206 | 263 | ],
|
207 | 264 | "cell_type": "code",
|
208 | 265 | "outputs": [],
|
|
211 | 268 | {
|
212 | 269 | "metadata": {},
|
213 | 270 | "source": [
|
214 |
| - "# Create a new custom embedding, unless you want to re-use one\n", |
215 |
| - "# Name of the custom embedding must be unique\n", |
216 |
| - "embedding = client.create_embedding(\"my_custom_embedding_2048_dimensions\", 2048)" |
| 271 | + "Get the count of imported vectors for a custom embedding" |
| 272 | + ], |
| 273 | + "cell_type": "markdown" |
| 274 | + }, |
| 275 | + { |
| 276 | + "metadata": {}, |
| 277 | + "source": [ |
| 278 | + "# Count how many data rows have a specific custom embedding (this can take a couple of minutes)\n", |
| 279 | + "count = embedding.get_imported_vector_count()\n", |
| 280 | + "print(count)" |
217 | 281 | ],
|
218 | 282 | "cell_type": "code",
|
219 | 283 | "outputs": [],
|
|
222 | 286 | {
|
223 | 287 | "metadata": {},
|
224 | 288 | "source": [
|
225 |
| - "# Delete a custom embedding\n", |
| 289 | + "Delete custom embedding type" |
| 290 | + ], |
| 291 | + "cell_type": "markdown" |
| 292 | + }, |
| 293 | + { |
| 294 | + "metadata": {}, |
| 295 | + "source": [ |
226 | 296 | "#embedding.delete()"
|
227 | 297 | ],
|
228 | 298 | "cell_type": "code",
|
|
232 | 302 | {
|
233 | 303 | "metadata": {},
|
234 | 304 | "source": [
|
235 |
| - "# Upload the payload to Labelbox" |
| 305 | + "# Upload custom embeddings during data row creation" |
| 306 | + ], |
| 307 | + "cell_type": "markdown" |
| 308 | + }, |
| 309 | + { |
| 310 | + "metadata": {}, |
| 311 | + "source": [ |
| 312 | + "Create a dataset" |
236 | 313 | ],
|
237 | 314 | "cell_type": "markdown"
|
238 | 315 | },
|
239 | 316 | {
|
240 | 317 | "metadata": {},
|
241 | 318 | "source": [
|
242 |
| - "# Replace the current id with the newly generated id from the previous step, or any existing custom embedding id\n", |
243 |
| - "embedding.import_vectors_from_file(\"./payload.ndjson\")" |
| 319 | + "# Create a dataset\n", |
| 320 | + "dataset_new = client.create_dataset(name=\"data_rows_with_embeddings\")" |
244 | 321 | ],
|
245 | 322 | "cell_type": "code",
|
246 | 323 | "outputs": [],
|
|
249 | 326 | {
|
250 | 327 | "metadata": {},
|
251 | 328 | "source": [
|
252 |
| - "# Get the count of imported vectors for a custom embedding" |
| 329 | + "Fetch an embedding (2048 dimensions)" |
253 | 330 | ],
|
254 | 331 | "cell_type": "markdown"
|
255 | 332 | },
|
256 | 333 | {
|
257 | 334 | "metadata": {},
|
258 | 335 | "source": [
|
259 |
| - "# Count how many data rows have a specific custom embedding (this can take a couple of minutes)\n", |
260 |
| - "count = embedding.get_imported_vector_count()" |
| 336 | + "embedding = client.get_embedding_by_name(\"my_custom_embedding_2048_dimensions\")\n", |
| 337 | + "vector = [random.uniform(1.0, 2.0) for _ in range(embedding.dims)]" |
| 338 | + ], |
| 339 | + "cell_type": "code", |
| 340 | + "outputs": [], |
| 341 | + "execution_count": null |
| 342 | + }, |
| 343 | + { |
| 344 | + "metadata": {}, |
| 345 | + "source": [ |
| 346 | + "Upload data rows with embeddings" |
| 347 | + ], |
| 348 | + "cell_type": "markdown" |
| 349 | + }, |
| 350 | + { |
| 351 | + "metadata": {}, |
| 352 | + "source": [ |
| 353 | + "\n", |
| 354 | + "uploads = []\n", |
| 355 | + "# Generate data rows\n", |
| 356 | + "for i in range(1,9):\n", |
| 357 | + " uploads.append({\n", |
| 358 | + " \"row_data\": f\"https://storage.googleapis.com/labelbox-datasets/People_Clothing_Segmentation/jpeg_images/IMAGES/img_000{i}.jpeg\",\n", |
| 359 | + " \"global_key\": \"TEST-ID-%id\" % uuid.uuid1(),\n", |
| 360 | + " \"embeddings\": [{\n", |
| 361 | + " \"embedding_id\": embedding.id,\n", |
| 362 | + " \"vector\": vector\n", |
| 363 | + " }]\n", |
| 364 | + " })\n", |
| 365 | + "\n", |
| 366 | + "task1 = dataset_new.create_data_rows(uploads)\n", |
| 367 | + "task1.wait_till_done()\n", |
| 368 | + "print(\"ERRORS: \" , task1.errors)\n", |
| 369 | + "print(\"RESULTS:\" , task1.result)" |
261 | 370 | ],
|
262 | 371 | "cell_type": "code",
|
263 | 372 | "outputs": [],
|
|
0 commit comments