|
24476 | 24476 | },
|
24477 | 24477 | {
|
24478 | 24478 | "cell_type": "code",
|
24479 |
| - "execution_count": 3, |
| 24479 | + "execution_count": null, |
24480 | 24480 | "id": "57fa11d8-3152-48db-bb74-4575bbeca7bb",
|
24481 | 24481 | "metadata": {
|
24482 | 24482 | "colab": {
|
@@ -24627,216 +24627,7 @@
|
24627 | 24627 | "hide-output"
|
24628 | 24628 | ]
|
24629 | 24629 | },
|
24630 |
| - "outputs": [ |
24631 |
| - { |
24632 |
| - "name": "stderr", |
24633 |
| - "output_type": "stream", |
24634 |
| - "text": [ |
24635 |
| - "2024-07-30 19:52:24,499 - BERTopic - Embedding - Transforming documents to embeddings.\n" |
24636 |
| - ] |
24637 |
| - }, |
24638 |
| - { |
24639 |
| - "data": { |
24640 |
| - "application/vnd.jupyter.widget-view+json": { |
24641 |
| - "model_id": "8141eb80bf784dcaa7a721459e6009ba", |
24642 |
| - "version_major": 2, |
24643 |
| - "version_minor": 0 |
24644 |
| - }, |
24645 |
| - "text/plain": [ |
24646 |
| - "modules.json: 0%| | 0.00/349 [00:00<?, ?B/s]" |
24647 |
| - ] |
24648 |
| - }, |
24649 |
| - "metadata": {}, |
24650 |
| - "output_type": "display_data" |
24651 |
| - }, |
24652 |
| - { |
24653 |
| - "data": { |
24654 |
| - "application/vnd.jupyter.widget-view+json": { |
24655 |
| - "model_id": "b2937413668d44d5a950c578c6455884", |
24656 |
| - "version_major": 2, |
24657 |
| - "version_minor": 0 |
24658 |
| - }, |
24659 |
| - "text/plain": [ |
24660 |
| - "config_sentence_transformers.json: 0%| | 0.00/116 [00:00<?, ?B/s]" |
24661 |
| - ] |
24662 |
| - }, |
24663 |
| - "metadata": {}, |
24664 |
| - "output_type": "display_data" |
24665 |
| - }, |
24666 |
| - { |
24667 |
| - "data": { |
24668 |
| - "application/vnd.jupyter.widget-view+json": { |
24669 |
| - "model_id": "25aeb49e460a47d1a966221a278c1aa9", |
24670 |
| - "version_major": 2, |
24671 |
| - "version_minor": 0 |
24672 |
| - }, |
24673 |
| - "text/plain": [ |
24674 |
| - "README.md: 0%| | 0.00/10.7k [00:00<?, ?B/s]" |
24675 |
| - ] |
24676 |
| - }, |
24677 |
| - "metadata": {}, |
24678 |
| - "output_type": "display_data" |
24679 |
| - }, |
24680 |
| - { |
24681 |
| - "data": { |
24682 |
| - "application/vnd.jupyter.widget-view+json": { |
24683 |
| - "model_id": "e3e26066cd75463299b647b72d4bf613", |
24684 |
| - "version_major": 2, |
24685 |
| - "version_minor": 0 |
24686 |
| - }, |
24687 |
| - "text/plain": [ |
24688 |
| - "sentence_bert_config.json: 0%| | 0.00/53.0 [00:00<?, ?B/s]" |
24689 |
| - ] |
24690 |
| - }, |
24691 |
| - "metadata": {}, |
24692 |
| - "output_type": "display_data" |
24693 |
| - }, |
24694 |
| - { |
24695 |
| - "data": { |
24696 |
| - "application/vnd.jupyter.widget-view+json": { |
24697 |
| - "model_id": "79495920a49d462981e54feba77ba67c", |
24698 |
| - "version_major": 2, |
24699 |
| - "version_minor": 0 |
24700 |
| - }, |
24701 |
| - "text/plain": [ |
24702 |
| - "config.json: 0%| | 0.00/612 [00:00<?, ?B/s]" |
24703 |
| - ] |
24704 |
| - }, |
24705 |
| - "metadata": {}, |
24706 |
| - "output_type": "display_data" |
24707 |
| - }, |
24708 |
| - { |
24709 |
| - "data": { |
24710 |
| - "application/vnd.jupyter.widget-view+json": { |
24711 |
| - "model_id": "c8b0666447f74c699f1af04653b1a761", |
24712 |
| - "version_major": 2, |
24713 |
| - "version_minor": 0 |
24714 |
| - }, |
24715 |
| - "text/plain": [ |
24716 |
| - "model.safetensors: 0%| | 0.00/90.9M [00:00<?, ?B/s]" |
24717 |
| - ] |
24718 |
| - }, |
24719 |
| - "metadata": {}, |
24720 |
| - "output_type": "display_data" |
24721 |
| - }, |
24722 |
| - { |
24723 |
| - "data": { |
24724 |
| - "application/vnd.jupyter.widget-view+json": { |
24725 |
| - "model_id": "43b105c5bbd542e78c6d50a3cc669cf3", |
24726 |
| - "version_major": 2, |
24727 |
| - "version_minor": 0 |
24728 |
| - }, |
24729 |
| - "text/plain": [ |
24730 |
| - "tokenizer_config.json: 0%| | 0.00/350 [00:00<?, ?B/s]" |
24731 |
| - ] |
24732 |
| - }, |
24733 |
| - "metadata": {}, |
24734 |
| - "output_type": "display_data" |
24735 |
| - }, |
24736 |
| - { |
24737 |
| - "data": { |
24738 |
| - "application/vnd.jupyter.widget-view+json": { |
24739 |
| - "model_id": "787a8a9f95da415ba25e643addfa9db4", |
24740 |
| - "version_major": 2, |
24741 |
| - "version_minor": 0 |
24742 |
| - }, |
24743 |
| - "text/plain": [ |
24744 |
| - "vocab.txt: 0%| | 0.00/232k [00:00<?, ?B/s]" |
24745 |
| - ] |
24746 |
| - }, |
24747 |
| - "metadata": {}, |
24748 |
| - "output_type": "display_data" |
24749 |
| - }, |
24750 |
| - { |
24751 |
| - "data": { |
24752 |
| - "application/vnd.jupyter.widget-view+json": { |
24753 |
| - "model_id": "fef2e3140b4c419183ad0e150fa55cf8", |
24754 |
| - "version_major": 2, |
24755 |
| - "version_minor": 0 |
24756 |
| - }, |
24757 |
| - "text/plain": [ |
24758 |
| - "tokenizer.json: 0%| | 0.00/466k [00:00<?, ?B/s]" |
24759 |
| - ] |
24760 |
| - }, |
24761 |
| - "metadata": {}, |
24762 |
| - "output_type": "display_data" |
24763 |
| - }, |
24764 |
| - { |
24765 |
| - "data": { |
24766 |
| - "application/vnd.jupyter.widget-view+json": { |
24767 |
| - "model_id": "93fd6d668f444b34a706af424dee5d77", |
24768 |
| - "version_major": 2, |
24769 |
| - "version_minor": 0 |
24770 |
| - }, |
24771 |
| - "text/plain": [ |
24772 |
| - "special_tokens_map.json: 0%| | 0.00/112 [00:00<?, ?B/s]" |
24773 |
| - ] |
24774 |
| - }, |
24775 |
| - "metadata": {}, |
24776 |
| - "output_type": "display_data" |
24777 |
| - }, |
24778 |
| - { |
24779 |
| - "data": { |
24780 |
| - "application/vnd.jupyter.widget-view+json": { |
24781 |
| - "model_id": "73d5d8704d69476ca2ad98bae7c55907", |
24782 |
| - "version_major": 2, |
24783 |
| - "version_minor": 0 |
24784 |
| - }, |
24785 |
| - "text/plain": [ |
24786 |
| - "1_Pooling/config.json: 0%| | 0.00/190 [00:00<?, ?B/s]" |
24787 |
| - ] |
24788 |
| - }, |
24789 |
| - "metadata": {}, |
24790 |
| - "output_type": "display_data" |
24791 |
| - }, |
24792 |
| - { |
24793 |
| - "data": { |
24794 |
| - "application/vnd.jupyter.widget-view+json": { |
24795 |
| - "model_id": "dcd0c1bdd79941f296f76d961846ebe2", |
24796 |
| - "version_major": 2, |
24797 |
| - "version_minor": 0 |
24798 |
| - }, |
24799 |
| - "text/plain": [ |
24800 |
| - "Batches: 0%| | 0/589 [00:00<?, ?it/s]" |
24801 |
| - ] |
24802 |
| - }, |
24803 |
| - "metadata": {}, |
24804 |
| - "output_type": "display_data" |
24805 |
| - }, |
24806 |
| - { |
24807 |
| - "name": "stderr", |
24808 |
| - "output_type": "stream", |
24809 |
| - "text": [ |
24810 |
| - "2024-07-30 19:55:21,993 - BERTopic - Embedding - Completed ✓\n", |
24811 |
| - "2024-07-30 19:55:21,996 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm\n", |
24812 |
| - "2024-07-30 19:55:35,484 - BERTopic - Dimensionality - Completed ✓\n", |
24813 |
| - "2024-07-30 19:55:35,485 - BERTopic - Cluster - Start clustering the reduced embeddings\n", |
24814 |
| - "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", |
24815 |
| - "To disable this warning, you can either:\n", |
24816 |
| - "\t- Avoid using `tokenizers` before the fork if possible\n", |
24817 |
| - "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n", |
24818 |
| - "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", |
24819 |
| - "To disable this warning, you can either:\n", |
24820 |
| - "\t- Avoid using `tokenizers` before the fork if possible\n", |
24821 |
| - "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n", |
24822 |
| - "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", |
24823 |
| - "To disable this warning, you can either:\n", |
24824 |
| - "\t- Avoid using `tokenizers` before the fork if possible\n", |
24825 |
| - "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n", |
24826 |
| - "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", |
24827 |
| - "To disable this warning, you can either:\n", |
24828 |
| - "\t- Avoid using `tokenizers` before the fork if possible\n", |
24829 |
| - "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n", |
24830 |
| - "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", |
24831 |
| - "To disable this warning, you can either:\n", |
24832 |
| - "\t- Avoid using `tokenizers` before the fork if possible\n", |
24833 |
| - "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n", |
24834 |
| - "2024-07-30 19:55:37,980 - BERTopic - Cluster - Completed ✓\n", |
24835 |
| - "2024-07-30 19:55:37,988 - BERTopic - Representation - Extracting topics from clusters using representation models.\n", |
24836 |
| - "2024-07-30 19:55:39,801 - BERTopic - Representation - Completed ✓\n" |
24837 |
| - ] |
24838 |
| - } |
24839 |
| - ], |
| 24630 | + "outputs": [], |
24840 | 24631 | "source": [
|
24841 | 24632 | "from bertopic import BERTopic\n",
|
24842 | 24633 | "\n",
|
|
80847 | 80638 | "source": [
|
80848 | 80639 | "[Link to BERTopic](https://bit.ly/4fjwU9T)."
|
80849 | 80640 | ]
|
| 80641 | + }, |
| 80642 | + { |
| 80643 | + "cell_type": "markdown", |
| 80644 | + "id": "064743d9", |
| 80645 | + "metadata": {}, |
| 80646 | + "source": [ |
| 80647 | + "### Beyond Keywords: Building a Semantic Recipe Search Engine" |
| 80648 | + ] |
| 80649 | + }, |
| 80650 | + { |
| 80651 | + "cell_type": "markdown", |
| 80652 | + "id": "82694664", |
| 80653 | + "metadata": {}, |
| 80654 | + "source": [ |
| 80655 | + "Semantic search enables content discovery based on meaning rather than just keywords. This approach uses vector embeddings - numerical representations of text that capture semantic essence. \n", |
| 80656 | + "\n", |
| 80657 | + "By converting text to vector embeddings, we can quantify semantic similarity between different pieces of content in a high-dimensional vector space. This allows for comparison and search based on underlying meaning, surpassing simple keyword matching.\n", |
| 80658 | + "\n", |
| 80659 | + "Here's a Python implementation of semantic search for recipe recommendations using sentence-transformers:" |
| 80660 | + ] |
| 80661 | + }, |
| 80662 | + { |
| 80663 | + "cell_type": "code", |
| 80664 | + "execution_count": 9, |
| 80665 | + "id": "84497e32", |
| 80666 | + "metadata": {}, |
| 80667 | + "outputs": [ |
| 80668 | + { |
| 80669 | + "name": "stdout", |
| 80670 | + "output_type": "stream", |
| 80671 | + "text": [ |
| 80672 | + "Query: healthy dessert without sugar\n", |
| 80673 | + "Most similar recipes:\n", |
| 80674 | + "- No-Bake Berry Chia Seed Pudding (Similarity: 0.55)\n", |
| 80675 | + "- Banana and Date Sweetened Oatmeal Cookies (Similarity: 0.43)\n" |
| 80676 | + ] |
| 80677 | + } |
| 80678 | + ], |
| 80679 | + "source": [ |
| 80680 | + "from sentence_transformers import SentenceTransformer\n", |
| 80681 | + "from sklearn.metrics.pairwise import cosine_similarity\n", |
| 80682 | + "\n", |
| 80683 | + "# Step 1: Prepare our data\n", |
| 80684 | + "recipes = [\n", |
| 80685 | + " \"Banana and Date Sweetened Oatmeal Cookies\",\n", |
| 80686 | + " \"No-Bake Berry Chia Seed Pudding\",\n", |
| 80687 | + " \"Deep-Fried Oreo Sundae with Caramel Sauce\",\n", |
| 80688 | + " \"Loaded Bacon Cheeseburger Pizza\",\n", |
| 80689 | + "]\n", |
| 80690 | + "\n", |
| 80691 | + "# Step 2: Load a pre-trained model for creating embeddings\n", |
| 80692 | + "model = SentenceTransformer('all-MiniLM-L6-v2')\n", |
| 80693 | + "\n", |
| 80694 | + "# Step 3: Create embeddings for our recipe descriptions\n", |
| 80695 | + "recipe_embeddings = model.encode(recipes)\n", |
| 80696 | + "\n", |
| 80697 | + "# Step 4: Function to find similar recipes \n", |
| 80698 | + "def find_similar_recipes(query, top_k=2):\n", |
| 80699 | + " # Create embedding for the query\n", |
| 80700 | + " query_embedding = model.encode([query])\n", |
| 80701 | + " \n", |
| 80702 | + " # Calculate similarity\n", |
| 80703 | + " similarities = cosine_similarity(query_embedding, recipe_embeddings)[0]\n", |
| 80704 | + " \n", |
| 80705 | + " # Get top k similar recipes \n", |
| 80706 | + " top_indices = similarities.argsort()[-top_k:][::-1]\n", |
| 80707 | + " \n", |
| 80708 | + " return [(recipes[i], similarities[i]) for i in top_indices]\n", |
| 80709 | + "\n", |
| 80710 | + "# Step 5: Test our semantic search\n", |
| 80711 | + "query = \"healthy dessert without sugar\"\n", |
| 80712 | + "results = find_similar_recipes(query)\n", |
| 80713 | + "\n", |
| 80714 | + "print(f\"Query: {query}\")\n", |
| 80715 | + "print(\"Most similar recipes:\")\n", |
| 80716 | + "for recipe, score in results:\n", |
| 80717 | + " print(f\"- {recipe} (Similarity: {score:.2f})\")" |
| 80718 | + ] |
| 80719 | + }, |
| 80720 | + { |
| 80721 | + "cell_type": "markdown", |
| 80722 | + "id": "fe3a8c67", |
| 80723 | + "metadata": {}, |
| 80724 | + "source": [ |
| 80725 | + "This implementation ranks the healthier dessert options highest: the embedding model associates ingredients like berries, chia seeds, bananas, and dates with healthy, sugar-free desserts. The clearly unhealthy options (the deep-fried Oreo sundae and the bacon cheeseburger pizza) score lower and fall outside the top 2 results, which suggests the model captures what \"healthy\" means in the dessert context. The score difference (0.55 vs 0.43) indicates that the model considers the chia seed pudding a closer match to the concept of a healthy, sugar-free dessert than the oatmeal cookies." |
| 80726 | + ] |
80850 | 80727 | }
|
80851 | 80728 | ],
|
80852 | 80729 | "metadata": {
|
|
0 commit comments