add standalone evaluate notebook

nina-xu · nina-xu · commit 00573208602c · 2025-05-22T20:00:56.000Z
diff --git a/docs/notebooks/safe-synthetics/runnning-standalone-evaluate.ipynb b/docs/notebooks/safe-synthetics/runnning-standalone-evaluate.ipynb
@@ -0,0 +1,237 @@
+{
+  "cells": [
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "# 📊 Running Standalone Evaluate\n",
+        "This notebook allows you to run the Evaluate step with your own training and synthetic data. This is helpful if\n",
+        "- You want to compare the output from Gretel Synthetics to other means of generating synthetic data\n",
+        "- You want to make sure that the train/test split is consistent across multiple Safe Synthetics runs so that the scores are comparable"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "## 💾 Install Gretel SDK"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "qxXA-UJVMRhI"
+      },
+      "outputs": [],
+      "source": [
+        "%%capture\n",
+        "%pip install -U gretel-client"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "## 🌐 Configure your Gretel Session"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "-MMNWeINRAZr"
+      },
+      "outputs": [],
+      "source": [
+        "# Set Gretel API key as an environment variable\n",
+        "import os\n",
+        "os.environ[\"GRETEL_API_KEY\"] = \"grtu....\""
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "sCL3uQSLMTrs"
+      },
+      "outputs": [],
+      "source": [
+        "from gretel_client import create_or_get_unique_project\n",
+        "from gretel_client.config import get_session_config\n",
+        "from gretel_client.navigator_client import Gretel\n",
+        "\n",
+        "gretel = Gretel()\n",
+        "project_name = \"test-project\"\n",
+        "session = get_session_config()\n",
+        "project = create_or_get_unique_project(name=project_name, session=session)\n",
+        "\n",
+        "project.get_console_url()"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "## 🔬 Load real and synthetic data"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "p0mN5rdCNbJ5"
+      },
+      "outputs": [],
+      "source": [
+        "import pandas as pd\n",
+        "\n",
+        "from sklearn.model_selection import train_test_split\n",
+        "\n",
+        "real_ds = \"https://gretel-datasets.s3.us-west-2.amazonaws.com/hipaa_patients.csv\"\n",
+        "synthetic_ds = \"https://gretel-datasets.s3.us-west-2.amazonaws.com/synthetic_hipaa_patients.csv\"\n",
+        "real_df = pd.read_csv(real_ds)\n",
+        "synthetic_df = pd.read_csv(synthetic_ds)\n",
+        "\n",
+        "train_df, holdout_df = train_test_split(df, test_size=0.05, random_state=42)\n",
+        "\n",
+        "print(f\"Number of rows - train: {len(train_df)}\")\n",
+        "print(f\"Number of rows - holdout: {len(holdout_df)}\")\n",
+        "print(f\"Number of rows - synthetic: {len(synthetic_df)}\")\n",
+        "train_df.head()"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "V5HFCFW2M8FT"
+      },
+      "outputs": [],
+      "source": [
+        "# Convert any Pandas Data Frames to Datasets\n",
+        "from gretel_client.files import FileClient\n",
+        "\n",
+        "file_client = FileClient()\n",
+        "\n",
+        "resp_train = file_client.upload(train_df, \"dataset\")\n",
+        "train_file_id = resp_train.id\n",
+        "resp_holdout = file_client.upload(holdout_df, \"dataset\")\n",
+        "holdout_file_id = resp_holdout.id\n",
+        "resp_synthetic = file_client.upload(synthetic_df, \"dataset\")\n",
+        "synthetic_file_id = resp_synthetic.id"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "## 🏃 Run Evaluate"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "au2V0khbM5CJ"
+      },
+      "outputs": [],
+      "source": [
+        "import requests\n",
+        "import yaml\n",
+        "\n",
+        "def run_workflow(config: str):\n",
+        "    \"\"\"Create a workflow, and workflow run from a given yaml config. Blocks and\n",
+        "    prints log lines until the workflow reaches a terminal state.\n",
+        "\n",
+        "    Args:\n",
+        "        config: The workflow config to run.\n",
+        "    \"\"\"\n",
+        "    config_dict = yaml.safe_load(config)\n",
+        "\n",
+        "    response = requests.post(\n",
+        "        f\"{session.endpoint}/v2/workflows/exec_batch\",\n",
+        "        json={\n",
+        "            \"workflow_config\": config_dict,\n",
+        "            \"project_id\": project.project_guid,\n",
+        "        },\n",
+        "        headers={\"Authorization\": session.api_key}\n",
+        "    )\n",
+        "    response_body = response.json()\n",
+        "\n",
+        "    print(response_body)\n",
+        "\n",
+        "    workflow_id = response_body[\"workflow_id\"]\n",
+        "    workflow_run_id = response_body[\"workflow_run_id\"]\n",
+        "\n",
+        "    workflow_run_url = (\n",
+        "        f\"{project.get_console_url().replace(project.project_guid, '')}workflows/\"\n",
+        "        f\"{workflow_id}/runs/{workflow_run_id}\"\n",
+        "    )\n",
+        "\n",
+        "    print(f\"workflow: {workflow_id}\")\n",
+        "    print(f\"workflow run id: {workflow_run_id}\")\n",
+        "    print(workflow_run_url)"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "rtjEVolUM-Yf"
+      },
+      "outputs": [],
+      "source": [
+        "eval_config = f\"\"\"\n",
+        "name: evaluate\n",
+        "version: \"2\"\n",
+        "\n",
+        "steps:\n",
+        "  - name: holdout\n",
+        "    task: holdout\n",
+        "    inputs: [{train_file_id}, {holdout_file_id}]\n",
+        "    config: {{}}\n",
+        "  - name: eval\n",
+        "    task: evaluate_safe_synthetics_dataset\n",
+        "    inputs: [{synthetic_file_id}, \"holdout\"]\n",
+        "    config: {{}}\n",
+        "\"\"\"\n",
+        "\n",
+        "run_workflow(eval_config)"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "rWmL8_iFRlv8"
+      },
+      "outputs": [],
+      "source": []
+    }
+  ],
+  "metadata": {
+    "colab": {
+      "provenance": []
+    },
+    "kernelspec": {
+      "display_name": ".venv",
+      "language": "python",
+      "name": "python3"
+    },
+    "language_info": {
+      "codemirror_mode": {
+        "name": "ipython",
+        "version": 3
+      },
+      "file_extension": ".py",
+      "mimetype": "text/x-python",
+      "name": "python",
+      "nbconvert_exporter": "python",
+      "pygments_lexer": "ipython3",
+      "version": "3.12.10"
+    }
+  },
+  "nbformat": 4,
+  "nbformat_minor": 0
+}