docs: async iteration notebook (#155)

gadomski · web-flow · commit d866853f809d · 2025-06-17T14:00:22.000Z
diff --git a/docs/notebooks/async-search.ipynb b/docs/notebooks/async-search.ipynb
@@ -0,0 +1,266 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "d2efdd4c",
+   "metadata": {},
+   "source": [
+    "# Async iteration\n",
+    "\n",
+    "In [rustac v0.8.1](https://github.com/stac-utils/rustac-py/releases/tag/v0.8.1) we added the ability to iterate a search asynchronously.\n",
+    "Let's compare this new capability with the synchronous version via [pystac-client](https://github.com/stac-utils/pystac-client).\n",
+    "\n",
+    "The `copernicus-dem` collection at https://stac.eoapi.dev has 26450 items, which makes it a good single collection test case for iterating over a bunch of things."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "id": "eaf03f73",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "building \"rustac\"\n",
+      "rebuilt and loaded package \"rustac\" in 4.783s\n"
+     ]
+    }
+   ],
+   "source": [
+    "import time\n",
+    "from tqdm.notebook import tqdm\n",
+    "\n",
+    "import rustac\n",
+    "from pystac_client import Client\n",
+    "\n",
+    "url = \"https://stac.eoapi.dev\"\n",
+    "collection = \"copernicus-dem\"\n",
+    "total = 26450"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "3c4ecaf4",
+   "metadata": {},
+   "source": [
+    "First, let's try **pystac-client**.\n",
+    "In our testing, it takes almost six minutes to iterate over everything, so we're going to limit things to the first one thousand items."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "id": "fba8e0ca",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "db2cb3a4b8894a88a50358b8422c4f1a",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "  0%|          | 0/1000 [00:00<?, ?it/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Got 1000 items in 14.28 seconds\n"
+     ]
+    }
+   ],
+   "source": [
+    "client = Client.open(url)\n",
+    "items = []\n",
+    "progress = tqdm(total=1000)\n",
+    "\n",
+    "start = time.time()\n",
+    "item_search = client.search(collections=[collection])\n",
+    "for item in item_search.items():\n",
+    "    items.append(item)\n",
+    "    progress.update()\n",
+    "    if len(items) >= 1000:\n",
+    "        break\n",
+    "print(f\"Got {len(items)} items in {time.time() - start:.2f} seconds\")\n",
+    "progress.close()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "e63b830f",
+   "metadata": {},
+   "source": [
+    "**rustac** does some asynchronous page pre-fetching under the hood, so it might be faster?\n",
+    "Let's find out."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "id": "211b184a",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "d3bdd8312b004cd3a2b537e429be1e5e",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "  0%|          | 0/1000 [00:00<?, ?it/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Got 1000 items in 13.67 seconds\n"
+     ]
+    }
+   ],
+   "source": [
+    "progress = tqdm(total=1000)\n",
+    "items = []\n",
+    "\n",
+    "start = time.time()\n",
+    "search = await rustac.iter_search(url, collections=[collection])\n",
+    "async for item in search:\n",
+    "    items.append(item)\n",
+    "    progress.update()\n",
+    "    if len(items) >= 1000:\n",
+    "        break\n",
+    "print(f\"Got {len(items)} items in {time.time() - start:.2f} seconds\")\n",
+    "progress.close()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "a0f4fae8",
+   "metadata": {},
+   "source": [
+    "Okay, that's about the same, which suggests we're mostly being limited by server response time.\n",
+    "If we increase the page size, does that make our async iteration faster?"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "id": "8ca810fc",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "2d571afb0b684671b1e3316fbc9716db",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "  0%|          | 0/5000 [00:00<?, ?it/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Got 5000 items in 11.09 seconds\n"
+     ]
+    }
+   ],
+   "source": [
+    "client = Client.open(url)\n",
+    "items = []\n",
+    "progress = tqdm(total=5000)\n",
+    "\n",
+    "start = time.time()\n",
+    "item_search = client.search(collections=[collection], limit=500)\n",
+    "for item in item_search.items():\n",
+    "    items.append(item)\n",
+    "    progress.update()\n",
+    "    if len(items) >= 5000:\n",
+    "        break\n",
+    "print(f\"Got {len(items)} items in {time.time() - start:.2f} seconds\")\n",
+    "progress.close()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "id": "e6a00733",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "5c130f3626b64524a62a6daccde79694",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "  0%|          | 0/5000 [00:00<?, ?it/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Got 5000 items in 10.77 seconds\n"
+     ]
+    }
+   ],
+   "source": [
+    "progress = tqdm(total=5000)\n",
+    "items = []\n",
+    "\n",
+    "start = time.time()\n",
+    "search = await rustac.iter_search(url, collections=[collection], limit=500)\n",
+    "async for item in search:\n",
+    "    items.append(item)\n",
+    "    progress.update()\n",
+    "    if len(items) >= 5000:\n",
+    "        break\n",
+    "print(f\"Got {len(items)} items in {time.time() - start:.2f} seconds\")\n",
+    "progress.close()"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "rustac-py",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.13.2"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/mkdocs.yml b/mkdocs.yml
@@ -25,6 +25,7 @@ nav:
       - notebooks/stac-geoparquet.ipynb
       - notebooks/its-live.ipynb
       - notebooks/search.ipynb
+      - notebooks/async-search.ipynb
   - API:
       - api/index.md
       - arrow: api/arrow.md
diff --git a/pyproject.toml b/pyproject.toml
@@ -76,13 +76,15 @@ docs = [
     "griffe>=1.6.0",
     "humanize>=4.12.1",
     "ipykernel>=6.29.5",
+    "ipywidgets>=8.1.7",
     "jinja2>=3.1.4",
     "mike>=2.1.3",
     "mkdocs-jupyter>=0.25.1",
     "mkdocs-material[imaging]>=9.5.45",
     "mkdocstrings[python]>=0.27.0",
     "obstore>=0.6.0",
     "pystac-client>=0.8.5",
+    "tqdm>=4.67.1",
 ]
 
 [tool.uv]
diff --git a/uv.lock b/uv.lock