From 0716c281bbcba6a1a74e0663ea2531bae309b0a1 Mon Sep 17 00:00:00 2001 From: Joe Fernandez Date: Wed, 4 Dec 2024 21:28:46 -0800 Subject: [PATCH 1/2] Updating PaliGemma notebooks --- .../paligemma/fine-tuning-paligemma.ipynb | 92 ++++-- .../docs/paligemma/inference-with-keras.ipynb | 290 +++++++++--------- 2 files changed, 215 insertions(+), 167 deletions(-) diff --git a/site/en/gemma/docs/paligemma/fine-tuning-paligemma.ipynb b/site/en/gemma/docs/paligemma/fine-tuning-paligemma.ipynb index b2cb645f0..4efd63bb2 100644 --- a/site/en/gemma/docs/paligemma/fine-tuning-paligemma.ipynb +++ b/site/en/gemma/docs/paligemma/fine-tuning-paligemma.ipynb @@ -6,8 +6,8 @@ "id": "G3MMAcssHTML" }, "source": [ - "\n", - "" + "\n", + "\n" ] }, { @@ -59,15 +59,8 @@ "\n", "View source on GitHub\n", "\n", - "\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "wR53lePHuiP-" - }, - "source": [ + "\n", + "\n", "This notebook shows how to fine-tune [PaliGemma](https://ai.google.dev/gemma/docs/paligemma) on a vision-language task with [JAX](https://jax.readthedocs.io/en/latest/index.html). *Fine-tuning* is a process that can improve your model's performance on specific tasks or help the model adhere to specific output requirements when instructions aren't sufficient and you have a set of examples that demonstrate the outputs you want. Gemma-based models like PaliGemma require fine-tuning to produce expected results.\n", "\n", "### What's in this notebook\n", @@ -128,7 +121,8 @@ "\n", "To generate a Kaggle API key, open your [**Settings** page in Kaggle](https://www.kaggle.com/settings) and click **Create New Token**. This triggers the download of a `kaggle.json` file containing your API credentials.\n", "\n", - "Then, in Colab, select **Secrets** (🔑) in the left pane and add your Kaggle username and Kaggle API key. Store your username under the name `KAGGLE_USERNAME` and your API key under the name `KAGGLE_KEY`.\n" + "Then, in Colab, select **Secrets** (🔑) in the left pane and add your Kaggle username and Kaggle API key. Store your username under the name `KAGGLE_USERNAME` and your API key under the name `KAGGLE_KEY`.\n", + "\n" ] }, { @@ -172,7 +166,11 @@ "# vars as appropriate or make your credentials available in ~/.kaggle/kaggle.json\n", "\n", "os.environ[\"KAGGLE_USERNAME\"] = userdata.get('KAGGLE_USERNAME')\n", - "os.environ[\"KAGGLE_KEY\"] = userdata.get('KAGGLE_KEY')" + "os.environ[\"KAGGLE_KEY\"] = userdata.get('KAGGLE_KEY')\n", + "\n", + "# The T4 runtime is tight on memory to finetune this model. Preallocate\n", + "# all memory ahead of time to avoid out-of-memory due to fragmentation.\n", + "os.environ[\"XLA_PYTHON_CLIENT_MEM_FRACTION\"] = \"1.0\"" ] }, { @@ -265,7 +263,7 @@ "tf.config.set_visible_devices([], \"GPU\")\n", "tf.config.set_visible_devices([], \"TPU\")\n", "\n", - "backend = jax.lib.xla_bridge.get_backend()\n", + "backend = jax.extend.backend.get_backend()\n", "print(f\"JAX version: {jax.__version__}\")\n", "print(f\"JAX platform: {backend.platform}\")\n", "print(f\"JAX devices: {jax.device_count()}\")" @@ -292,7 +290,7 @@ "\n", "PaliGemma includes several model variations. For this tutorial, you'll use the base [JAX/FLAX PaliGemma 3B weight model](https://www.kaggle.com/models/google/paligemma/jax/paligemma-3b-pt-224).\n", "\n", - "Download the `float16` version of the model checkpoint from Kaggle by running the following code. This process takes several minutes to complete." + "Download the model checkpoint from Kaggle by running the following code. 
This process takes several minutes to complete." ] }, { @@ -306,12 +304,19 @@ "import os\n", "import kagglehub\n", "\n", - "MODEL_PATH = \"./pt_224_128.params.f16.npz\"\n", + "# Use these for PaliGemma-2 3B 224px²\n", + "LLM_VARIANT = \"gemma2_2b\"\n", + "MODEL_PATH = \"./paligemma2-3b-pt-224.b16.npz\"\n", + "KAGGLE_HANDLE = \"google/paligemma-2/jax/paligemma2-3b-pt-224\" # Path to fetch from Kaggle.\n", + "\n", + "# Use these for PaliGemma 1:\n", + "# LLM_VARIANT = \"gemma_2b\"\n", + "# MODEL_PATH = \"./paligemma-3b-pt-224.f16.npz\"\n", + "# KAGGLE_HANDLE = \"google/paligemma/jax/paligemma-3b-pt-224\"\n", + "\n", "if not os.path.exists(MODEL_PATH):\n", " print(\"Downloading the checkpoint from Kaggle, this could take a few minutes....\")\n", - " # Note: kaggle archive contains the same checkpoint in multiple formats.\n", - " # Download only the float16 model.\n", - " MODEL_PATH = kagglehub.model_download('google/paligemma/jax/paligemma-3b-pt-224', 'paligemma-3b-pt-224.f16.npz')\n", + " MODEL_PATH = kagglehub.model_download(KAGGLE_HANDLE, MODEL_PATH)\n", " print(f\"Model path: {MODEL_PATH}\")\n", "\n", "TOKENIZER_PATH = \"./paligemma_tokenizer.model\"\n", @@ -360,8 +365,11 @@ "outputs": [], "source": [ "# Define model\n", + "\n", + "# IMPORTANT: Gemma-2 has a \"final_logits_softcap\" property, we set it to 0.0\n", + "# for better transfer results.\n", "model_config = ml_collections.FrozenConfigDict({\n", - " \"llm\": {\"vocab_size\": 257_152},\n", + " \"llm\": {\"vocab_size\": 257_152, \"variant\": LLM_VARIANT, \"final_logits_softcap\": 0.0},\n", " \"img\": {\"variant\": \"So400m/14\", \"pool_type\": \"none\", \"scan\": True, \"dtype_mm\": \"float16\"}\n", "})\n", "model = paligemma.Model(**model_config)\n", @@ -420,7 +428,9 @@ "\n", "@functools.partial(jax.jit, donate_argnums=(0,), static_argnums=(1,))\n", "def maybe_cast_to_f32(params, trainable):\n", - " return jax.tree.map(lambda p, m: p.astype(jnp.float32) if m else p,\n", + " # Cast others to float16, since some GPUs don't support bf16.\n", + " return jax.tree.map(lambda p, m: p.astype(jnp.float32)\n", + " if m else p.astype(jnp.float16),\n", " params, trainable)\n", "\n", "# Loading all params in simultaneous - albeit much faster and more succinct -\n", @@ -492,7 +502,7 @@ "\n", " image = tf.constant(image)\n", " image = tf.image.resize(image, (size, size), method='bilinear', antialias=True)\n", - " return image.numpy() / 127.5 - 1.0 # [0, 255]->[-1,1]\n", + " return image.numpy() / 127.5 - 1.0 # [0, 255]-\u003e[-1,1]\n", "\n", "def preprocess_tokens(prefix, suffix=None, seqlen=None):\n", " # Model has been trained to handle tokenized text composed of a prefix with\n", @@ -632,12 +642,12 @@ " return f\"data:image/jpeg;base64,{image_b64}\"\n", "\n", "def render_example(image, caption):\n", - " image = ((image + 1)/2 * 255).astype(np.uint8) # [-1,1] -> [0, 255]\n", + " image = ((image + 1)/2 * 255).astype(np.uint8) # [-1,1] -\u003e [0, 255]\n", " return f\"\"\"\n", - "
\n", - " \n", - "

{html.escape(caption)}

\n", - "
\n", + " \u003cdiv style=\"display: inline-flex; align-items: center; justify-content: center;\"\u003e\n", + " \u003cimg style=\"width:128px; height:128px;\" src=\"{render_inline(image, resize=(64,64))}\" /\u003e\n", + " \u003cp style=\"width:256px; margin:10px; font-size:small;\"\u003e{html.escape(caption)}\u003c/p\u003e\n", + " \u003c/div\u003e\n", " \"\"\"\n", "\n", "html_out = \"\"\n", @@ -754,7 +764,7 @@ " # Append to html output.\n", " for example, response in zip(examples, responses):\n", " outputs.append((example[\"image\"], response))\n", - " if num_examples and len(outputs) >= num_examples:\n", + " if num_examples and len(outputs) \u003e= num_examples:\n", " return outputs" ] }, @@ -862,14 +872,36 @@ ], "metadata": { "colab": { - "name": "fine-tuning-paligemma.ipynb", + "gpuType": "T4", + "last_runtime": { + "build_target": "//learning/grp/tools/ml_python:ml_notebook", + "kind": "private" + }, + "private_outputs": true, + "provenance": [ + { + "file_id": "17AiK8gRY7oiquQGkBH0d08PFQo3Kyx1I", + "timestamp": 1715287187925 + }, + { + "file_id": "1qZlJfPyfKRrNcz2shxQ93HnnE5Ge1LLn", + "timestamp": 1715019972450 + }, + { + "file_id": "1JFnlD2kSiTNexdPw_NYRtuW6uuSTI0kD", + "timestamp": 1714585741026 + } + ], "toc_visible": true }, "kernelspec": { "display_name": "Python 3", "name": "python3" + }, + "language_info": { + "name": "python" } }, "nbformat": 4, "nbformat_minor": 0 -} +} \ No newline at end of file diff --git a/site/en/gemma/docs/paligemma/inference-with-keras.ipynb b/site/en/gemma/docs/paligemma/inference-with-keras.ipynb index 32581fb4b..3aca91cf2 100644 --- a/site/en/gemma/docs/paligemma/inference-with-keras.ipynb +++ b/site/en/gemma/docs/paligemma/inference-with-keras.ipynb @@ -44,31 +44,31 @@ { "cell_type": "markdown", "metadata": { - "id": "etcMXWCUJApZ" + "id": "Q5_nIe-8gdJV" }, "source": [ - "# Inference with Keras\n" + "# Generate PaliGemma output with Keras\n", + "\n", + "\u003ctable class=\"tfo-notebook-buttons\" align=\"left\"\u003e\n", + "\u003ctd\u003e\n", + "\u003ca target=\"_blank\" href=\"https://ai.google.dev/gemma/docs/paligemma/fine-tuning-paligemma\"\u003e\u003cimg src=\"https://ai.google.dev/static/site-assets/images/docs/notebook-site-button.png\" height=\"32\" width=\"32\" /\u003eView on ai.google.dev\u003c/a\u003e\n", + "\u003c/td\u003e\n", + "\u003ctd\u003e\n", + "\u003ca target=\"_blank\" href=\"https://colab.research.google.com/github/google/generative-ai-docs/blob/main/site/en/gemma/docs/paligemma/inference-with-keras.ipynb\"\u003e\u003cimg src=\"https://www.tensorflow.org/images/colab_logo_32px.png\" /\u003eRun in Google Colab\u003c/a\u003e\n", + "\u003c/td\u003e\n", + "\u003ctd\u003e\n", + "\u003ca target=\"_blank\" href=\"https://github.com/google/generative-ai-docs/blob/main/site/en/gemma/docs/paligemma/inference-with-keras.ipynb\"\u003e\u003cimg src=\"https://www.tensorflow.org/images/GitHub-Mark-32px.png\" /\u003eView source on GitHub\u003c/a\u003e\n", + "\u003c/td\u003e\n", + "\u003c/table\u003e" ] }, { "cell_type": "markdown", "metadata": { - "id": "Q5_nIe-8gdJV" + "id": "9hhIuS9sEKHx" }, "source": [ - "\n", - "\n", - "\n", - "\n", - "
\n", - "View on ai.google.dev\n", - "\n", - "Run in Google Colab\n", - "\n", - "View source on GitHub\n", - "
\n", - "\n", - "When your AI model produces a conclusion or a prediction, it goes through a process called *inference*. This tutorial goes over how to use PaliGemma with Keras to set up a simple model that can infer information about supplied images and answer questions about them." + "PaliGemma models have *multimodal* capabilities, allowing you to generate output using both text and image input data. You can use image data with these models to provide additional context for your requests, or use the model to analyze the content of images. This tutorial shows you how to use PaliGemma with Keras to can analyze images and answer questions about them." ] }, { @@ -223,7 +223,7 @@ "outputs": [], "source": [ "import keras\n", - "import keras_nlp\n", + "import keras_hub\n", "import numpy as np\n", "import PIL\n", "import requests\n", @@ -237,26 +237,17 @@ "keras.config.set_floatx(\"bfloat16\")" ] }, - { - "cell_type": "markdown", - "metadata": { - "id": "ftjt5DiueVkL" - }, - "source": [ - "## Create your model\n", - "\n", - "Now that you've set everything up, you can download the pre-trained model and create some utility methods to help your model generate its responses." - ] - }, { "cell_type": "markdown", "metadata": { "id": "X-LE2E1uiSpP" }, "source": [ - "### Download the model checkpoint\n", + "## Load the model\n", + "\n", + "Now that you've set everything up, you can download the pre-trained model and create some utility methods to help your model generate its responses.\n", + "In this step, you download a model using `PaliGemmaCausalLM` from Keras Hub. This class helps you manage and run the causal visual language model structure of PaliGemma. A *causal visual language model* predicts the next token based on previous tokens. Keras Hub provides implementations of many popular [model architectures](https://keras.io/keras_hub/api/models/).\n", "\n", - "KerasNLP provides implementations of many popular [model architectures](https://keras.io/api/keras_nlp/models/). In this notebook, you'll create a model using `PaliGemmaCausalLM`, an end-to-end PaliGemma model for *causal visual language modeling*. A causal visual language model predicts the next token based on previous tokens.\n", "\n", "Create the model using the `from_preset` method and print its summary. This process will take about a minute to complete." ] @@ -269,7 +260,7 @@ }, "outputs": [], "source": [ - "paligemma = keras_nlp.models.PaliGemmaCausalLM.from_preset(\"pali_gemma_3b_mix_224\")\n", + "paligemma = keras_hub.models.PaliGemmaCausalLM.from_preset(\"paligemma_3b_mix_224\")\n", "paligemma.summary()" ] }, @@ -279,7 +270,7 @@ "id": "FBsWvKEvoGMe" }, "source": [ - "### Create utility methods\n", + "## Create utility methods\n", "\n", "To help you generate responses from your model, create two utility methods:\n", "\n", @@ -287,7 +278,8 @@ "* **`read_img`:** Helper method for `read_img_from_url`. This method is what actually opens the image, resizes it so that it fits in the model's constraints, and puts it into an array that can be interpreted by the model.\n", "* **`read_img_from_url`:** Takes in an image via a valid URL. You need this method to pass the image to the model.\n", "\n", - "You'll use `read_img_from_url` in the next step of this notebook.\n" + "You'll use `read_img_from_url` in the next step of this notebook.\n", + "\n" ] }, { @@ -318,8 +310,8 @@ "\n", "def parse_bbox_and_labels(detokenized_output: str):\n", " matches = re.finditer(\n", - " '\\d\\d\\d\\d)>\\d\\d\\d\\d)>\\d\\d\\d\\d)>\\d\\d\\d\\d)>'\n", - " ' (?P