CodeCutTech
diff --git a/‎Chapter5/natural_language_processing.ipynb
Lines changed: 88 additions & 211 deletions b/‎Chapter5/natural_language_processing.ipynb
Lines changed: 88 additions & 211 deletions
@@ -24476,7 +24476,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 3,
+   "execution_count": null,
    "id": "57fa11d8-3152-48db-bb74-4575bbeca7bb",
    "metadata": {
     "colab": {
@@ -24627,216 +24627,7 @@
      "hide-output"
     ]
    },
-   "outputs": [
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "2024-07-30 19:52:24,499 - BERTopic - Embedding - Transforming documents to embeddings.\n"
-     ]
-    },
-    {
-     "data": {
-      "application/vnd.jupyter.widget-view+json": {
-       "model_id": "8141eb80bf784dcaa7a721459e6009ba",
-       "version_major": 2,
-       "version_minor": 0
-      },
-      "text/plain": [
-       "modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]"
-      ]
-     },
-     "metadata": {},
-     "output_type": "display_data"
-    },
-    {
-     "data": {
-      "application/vnd.jupyter.widget-view+json": {
-       "model_id": "b2937413668d44d5a950c578c6455884",
-       "version_major": 2,
-       "version_minor": 0
-      },
-      "text/plain": [
-       "config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]"
-      ]
-     },
-     "metadata": {},
-     "output_type": "display_data"
-    },
-    {
-     "data": {
-      "application/vnd.jupyter.widget-view+json": {
-       "model_id": "25aeb49e460a47d1a966221a278c1aa9",
-       "version_major": 2,
-       "version_minor": 0
-      },
-      "text/plain": [
-       "README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]"
-      ]
-     },
-     "metadata": {},
-     "output_type": "display_data"
-    },
-    {
-     "data": {
-      "application/vnd.jupyter.widget-view+json": {
-       "model_id": "e3e26066cd75463299b647b72d4bf613",
-       "version_major": 2,
-       "version_minor": 0
-      },
-      "text/plain": [
-       "sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]"
-      ]
-     },
-     "metadata": {},
-     "output_type": "display_data"
-    },
-    {
-     "data": {
-      "application/vnd.jupyter.widget-view+json": {
-       "model_id": "79495920a49d462981e54feba77ba67c",
-       "version_major": 2,
-       "version_minor": 0
-      },
-      "text/plain": [
-       "config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]"
-      ]
-     },
-     "metadata": {},
-     "output_type": "display_data"
-    },
-    {
-     "data": {
-      "application/vnd.jupyter.widget-view+json": {
-       "model_id": "c8b0666447f74c699f1af04653b1a761",
-       "version_major": 2,
-       "version_minor": 0
-      },
-      "text/plain": [
-       "model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]"
-      ]
-     },
-     "metadata": {},
-     "output_type": "display_data"
-    },
-    {
-     "data": {
-      "application/vnd.jupyter.widget-view+json": {
-       "model_id": "43b105c5bbd542e78c6d50a3cc669cf3",
-       "version_major": 2,
-       "version_minor": 0
-      },
-      "text/plain": [
-       "tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]"
-      ]
-     },
-     "metadata": {},
-     "output_type": "display_data"
-    },
-    {
-     "data": {
-      "application/vnd.jupyter.widget-view+json": {
-       "model_id": "787a8a9f95da415ba25e643addfa9db4",
-       "version_major": 2,
-       "version_minor": 0
-      },
-      "text/plain": [
-       "vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]"
-      ]
-     },
-     "metadata": {},
-     "output_type": "display_data"
-    },
-    {
-     "data": {
-      "application/vnd.jupyter.widget-view+json": {
-       "model_id": "fef2e3140b4c419183ad0e150fa55cf8",
-       "version_major": 2,
-       "version_minor": 0
-      },
-      "text/plain": [
-       "tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]"
-      ]
-     },
-     "metadata": {},
-     "output_type": "display_data"
-    },
-    {
-     "data": {
-      "application/vnd.jupyter.widget-view+json": {
-       "model_id": "93fd6d668f444b34a706af424dee5d77",
-       "version_major": 2,
-       "version_minor": 0
-      },
-      "text/plain": [
-       "special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]"
-      ]
-     },
-     "metadata": {},
-     "output_type": "display_data"
-    },
-    {
-     "data": {
-      "application/vnd.jupyter.widget-view+json": {
-       "model_id": "73d5d8704d69476ca2ad98bae7c55907",
-       "version_major": 2,
-       "version_minor": 0
-      },
-      "text/plain": [
-       "1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]"
-      ]
-     },
-     "metadata": {},
-     "output_type": "display_data"
-    },
-    {
-     "data": {
-      "application/vnd.jupyter.widget-view+json": {
-       "model_id": "dcd0c1bdd79941f296f76d961846ebe2",
-       "version_major": 2,
-       "version_minor": 0
-      },
-      "text/plain": [
-       "Batches:   0%|          | 0/589 [00:00<?, ?it/s]"
-      ]
-     },
-     "metadata": {},
-     "output_type": "display_data"
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "2024-07-30 19:55:21,993 - BERTopic - Embedding - Completed ✓\n",
-      "2024-07-30 19:55:21,996 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm\n",
-      "2024-07-30 19:55:35,484 - BERTopic - Dimensionality - Completed ✓\n",
-      "2024-07-30 19:55:35,485 - BERTopic - Cluster - Start clustering the reduced embeddings\n",
-      "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n",
-      "To disable this warning, you can either:\n",
-      "\t- Avoid using `tokenizers` before the fork if possible\n",
-      "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n",
-      "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n",
-      "To disable this warning, you can either:\n",
-      "\t- Avoid using `tokenizers` before the fork if possible\n",
-      "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n",
-      "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n",
-      "To disable this warning, you can either:\n",
-      "\t- Avoid using `tokenizers` before the fork if possible\n",
-      "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n",
-      "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n",
-      "To disable this warning, you can either:\n",
-      "\t- Avoid using `tokenizers` before the fork if possible\n",
-      "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n",
-      "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n",
-      "To disable this warning, you can either:\n",
-      "\t- Avoid using `tokenizers` before the fork if possible\n",
-      "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n",
-      "2024-07-30 19:55:37,980 - BERTopic - Cluster - Completed ✓\n",
-      "2024-07-30 19:55:37,988 - BERTopic - Representation - Extracting topics from clusters using representation models.\n",
-      "2024-07-30 19:55:39,801 - BERTopic - Representation - Completed ✓\n"
-     ]
-    }
-   ],
+   "outputs": [],
    "source": [
     "from bertopic import BERTopic\n",
     "\n",
@@ -80847,6 +80638,92 @@
    "source": [
     "[Link to BERTopic](https://bit.ly/4fjwU9T)."
    ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "064743d9",
+   "metadata": {},
+   "source": [
+    "### Beyond Keywords: Building a Semantic Recipe Search Engine"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "82694664",
+   "metadata": {},
+   "source": [
+    "Semantic search enables content discovery based on meaning rather than just keywords. This approach uses vector embeddings: numerical representations of text that capture its meaning.\n",
+    "\n",
+    "By converting text to vector embeddings, we can quantify the semantic similarity between different pieces of content in a high-dimensional vector space, for example with cosine similarity, which measures how closely two embedding vectors point in the same direction. This allows for comparison and search based on underlying meaning, going beyond simple keyword matching.\n",
+    "\n",
+    "Here's a Python implementation of semantic search for recipe recommendations using sentence-transformers:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "id": "84497e32",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Query: healthy dessert without sugar\n",
+      "Most similar recipes:\n",
+      "- No-Bake Berry Chia Seed Pudding (Similarity: 0.55)\n",
+      "- Banana and Date Sweetened Oatmeal Cookies (Similarity: 0.43)\n"
+     ]
+    }
+   ],
+   "source": [
+    "from sentence_transformers import SentenceTransformer\n",
+    "from sklearn.metrics.pairwise import cosine_similarity\n",
+    "\n",
+    "# Step 1: Prepare our data\n",
+    "recipes = [\n",
+    "    \"Banana and Date Sweetened Oatmeal Cookies\",\n",
+    "    \"No-Bake Berry Chia Seed Pudding\",\n",
+    "    \"Deep-Fried Oreo Sundae with Caramel Sauce\",\n",
+    "    \"Loaded Bacon Cheeseburger Pizza\",\n",
+    "]\n",
+    "\n",
+    "# Step 2: Load a pre-trained model for creating embeddings\n",
+    "model = SentenceTransformer('all-MiniLM-L6-v2')\n",
+    "\n",
+    "# Step 3: Create embeddings for our recipe descriptions\n",
+    "recipe_embeddings = model.encode(recipes)\n",
+    "\n",
+    "# Step 4: Function to find similar recipes \n",
+    "def find_similar_recipes(query, top_k=2):\n",
+    "    # Create embedding for the query\n",
+    "    query_embedding = model.encode([query])\n",
+    "    \n",
+    "    # Calculate similarity\n",
+    "    similarities = cosine_similarity(query_embedding, recipe_embeddings)[0]\n",
+    "    \n",
+    "    # Get top k similar recipes \n",
+    "    top_indices = similarities.argsort()[-top_k:][::-1]\n",
+    "    \n",
+    "    return [(recipes[i], similarities[i]) for i in top_indices]\n",
+    "\n",
+    "# Step 5: Test our semantic search\n",
+    "query = \"healthy dessert without sugar\"\n",
+    "results = find_similar_recipes(query)\n",
+    "\n",
+    "print(f\"Query: {query}\")\n",
+    "print(\"Most similar recipes:\")\n",
+    "for recipe, score in results:\n",
+    "    print(f\"- {recipe} (Similarity: {score:.2f})\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "fe3a8c67",
+   "metadata": {},
+   "source": [
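+    "This implementation ranks the healthier dessert options first, even though neither title contains the words \"healthy\" or \"sugar\". This suggests the embeddings capture that ingredients like berries, chia seeds, bananas, and dates are often used in healthy, sugar-free desserts, and what \"healthy\" means in the dessert context. Because `top_k=2`, only the two highest-scoring recipes are returned, so the clearly unhealthy options (the deep-fried Oreo sundae and the bacon cheeseburger pizza) are left out by ranking lower rather than by an explicit filter. The score difference (0.55 vs 0.43) indicates that the model considers the chia seed pudding a closer match to the concept of a healthy, sugar-free dessert than the oatmeal cookies."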
+   ]
   }
  ],
  "metadata": {