Skip to content

Commit 22f519a

Browse files
add vector embedding
1 parent df7abba commit 22f519a

File tree

4 files changed

+239
-464
lines changed

4 files changed

+239
-464
lines changed

Chapter5/natural_language_processing.ipynb

Lines changed: 88 additions & 211 deletions
Original file line numberDiff line numberDiff line change
@@ -24476,7 +24476,7 @@
2447624476
},
2447724477
{
2447824478
"cell_type": "code",
24479-
"execution_count": 3,
24479+
"execution_count": null,
2448024480
"id": "57fa11d8-3152-48db-bb74-4575bbeca7bb",
2448124481
"metadata": {
2448224482
"colab": {
@@ -24627,216 +24627,7 @@
2462724627
"hide-output"
2462824628
]
2462924629
},
24630-
"outputs": [
24631-
{
24632-
"name": "stderr",
24633-
"output_type": "stream",
24634-
"text": [
24635-
"2024-07-30 19:52:24,499 - BERTopic - Embedding - Transforming documents to embeddings.\n"
24636-
]
24637-
},
24638-
{
24639-
"data": {
24640-
"application/vnd.jupyter.widget-view+json": {
24641-
"model_id": "8141eb80bf784dcaa7a721459e6009ba",
24642-
"version_major": 2,
24643-
"version_minor": 0
24644-
},
24645-
"text/plain": [
24646-
"modules.json: 0%| | 0.00/349 [00:00<?, ?B/s]"
24647-
]
24648-
},
24649-
"metadata": {},
24650-
"output_type": "display_data"
24651-
},
24652-
{
24653-
"data": {
24654-
"application/vnd.jupyter.widget-view+json": {
24655-
"model_id": "b2937413668d44d5a950c578c6455884",
24656-
"version_major": 2,
24657-
"version_minor": 0
24658-
},
24659-
"text/plain": [
24660-
"config_sentence_transformers.json: 0%| | 0.00/116 [00:00<?, ?B/s]"
24661-
]
24662-
},
24663-
"metadata": {},
24664-
"output_type": "display_data"
24665-
},
24666-
{
24667-
"data": {
24668-
"application/vnd.jupyter.widget-view+json": {
24669-
"model_id": "25aeb49e460a47d1a966221a278c1aa9",
24670-
"version_major": 2,
24671-
"version_minor": 0
24672-
},
24673-
"text/plain": [
24674-
"README.md: 0%| | 0.00/10.7k [00:00<?, ?B/s]"
24675-
]
24676-
},
24677-
"metadata": {},
24678-
"output_type": "display_data"
24679-
},
24680-
{
24681-
"data": {
24682-
"application/vnd.jupyter.widget-view+json": {
24683-
"model_id": "e3e26066cd75463299b647b72d4bf613",
24684-
"version_major": 2,
24685-
"version_minor": 0
24686-
},
24687-
"text/plain": [
24688-
"sentence_bert_config.json: 0%| | 0.00/53.0 [00:00<?, ?B/s]"
24689-
]
24690-
},
24691-
"metadata": {},
24692-
"output_type": "display_data"
24693-
},
24694-
{
24695-
"data": {
24696-
"application/vnd.jupyter.widget-view+json": {
24697-
"model_id": "79495920a49d462981e54feba77ba67c",
24698-
"version_major": 2,
24699-
"version_minor": 0
24700-
},
24701-
"text/plain": [
24702-
"config.json: 0%| | 0.00/612 [00:00<?, ?B/s]"
24703-
]
24704-
},
24705-
"metadata": {},
24706-
"output_type": "display_data"
24707-
},
24708-
{
24709-
"data": {
24710-
"application/vnd.jupyter.widget-view+json": {
24711-
"model_id": "c8b0666447f74c699f1af04653b1a761",
24712-
"version_major": 2,
24713-
"version_minor": 0
24714-
},
24715-
"text/plain": [
24716-
"model.safetensors: 0%| | 0.00/90.9M [00:00<?, ?B/s]"
24717-
]
24718-
},
24719-
"metadata": {},
24720-
"output_type": "display_data"
24721-
},
24722-
{
24723-
"data": {
24724-
"application/vnd.jupyter.widget-view+json": {
24725-
"model_id": "43b105c5bbd542e78c6d50a3cc669cf3",
24726-
"version_major": 2,
24727-
"version_minor": 0
24728-
},
24729-
"text/plain": [
24730-
"tokenizer_config.json: 0%| | 0.00/350 [00:00<?, ?B/s]"
24731-
]
24732-
},
24733-
"metadata": {},
24734-
"output_type": "display_data"
24735-
},
24736-
{
24737-
"data": {
24738-
"application/vnd.jupyter.widget-view+json": {
24739-
"model_id": "787a8a9f95da415ba25e643addfa9db4",
24740-
"version_major": 2,
24741-
"version_minor": 0
24742-
},
24743-
"text/plain": [
24744-
"vocab.txt: 0%| | 0.00/232k [00:00<?, ?B/s]"
24745-
]
24746-
},
24747-
"metadata": {},
24748-
"output_type": "display_data"
24749-
},
24750-
{
24751-
"data": {
24752-
"application/vnd.jupyter.widget-view+json": {
24753-
"model_id": "fef2e3140b4c419183ad0e150fa55cf8",
24754-
"version_major": 2,
24755-
"version_minor": 0
24756-
},
24757-
"text/plain": [
24758-
"tokenizer.json: 0%| | 0.00/466k [00:00<?, ?B/s]"
24759-
]
24760-
},
24761-
"metadata": {},
24762-
"output_type": "display_data"
24763-
},
24764-
{
24765-
"data": {
24766-
"application/vnd.jupyter.widget-view+json": {
24767-
"model_id": "93fd6d668f444b34a706af424dee5d77",
24768-
"version_major": 2,
24769-
"version_minor": 0
24770-
},
24771-
"text/plain": [
24772-
"special_tokens_map.json: 0%| | 0.00/112 [00:00<?, ?B/s]"
24773-
]
24774-
},
24775-
"metadata": {},
24776-
"output_type": "display_data"
24777-
},
24778-
{
24779-
"data": {
24780-
"application/vnd.jupyter.widget-view+json": {
24781-
"model_id": "73d5d8704d69476ca2ad98bae7c55907",
24782-
"version_major": 2,
24783-
"version_minor": 0
24784-
},
24785-
"text/plain": [
24786-
"1_Pooling/config.json: 0%| | 0.00/190 [00:00<?, ?B/s]"
24787-
]
24788-
},
24789-
"metadata": {},
24790-
"output_type": "display_data"
24791-
},
24792-
{
24793-
"data": {
24794-
"application/vnd.jupyter.widget-view+json": {
24795-
"model_id": "dcd0c1bdd79941f296f76d961846ebe2",
24796-
"version_major": 2,
24797-
"version_minor": 0
24798-
},
24799-
"text/plain": [
24800-
"Batches: 0%| | 0/589 [00:00<?, ?it/s]"
24801-
]
24802-
},
24803-
"metadata": {},
24804-
"output_type": "display_data"
24805-
},
24806-
{
24807-
"name": "stderr",
24808-
"output_type": "stream",
24809-
"text": [
24810-
"2024-07-30 19:55:21,993 - BERTopic - Embedding - Completed ✓\n",
24811-
"2024-07-30 19:55:21,996 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm\n",
24812-
"2024-07-30 19:55:35,484 - BERTopic - Dimensionality - Completed ✓\n",
24813-
"2024-07-30 19:55:35,485 - BERTopic - Cluster - Start clustering the reduced embeddings\n",
24814-
"huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n",
24815-
"To disable this warning, you can either:\n",
24816-
"\t- Avoid using `tokenizers` before the fork if possible\n",
24817-
"\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n",
24818-
"huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n",
24819-
"To disable this warning, you can either:\n",
24820-
"\t- Avoid using `tokenizers` before the fork if possible\n",
24821-
"\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n",
24822-
"huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n",
24823-
"To disable this warning, you can either:\n",
24824-
"\t- Avoid using `tokenizers` before the fork if possible\n",
24825-
"\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n",
24826-
"huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n",
24827-
"To disable this warning, you can either:\n",
24828-
"\t- Avoid using `tokenizers` before the fork if possible\n",
24829-
"\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n",
24830-
"huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n",
24831-
"To disable this warning, you can either:\n",
24832-
"\t- Avoid using `tokenizers` before the fork if possible\n",
24833-
"\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n",
24834-
"2024-07-30 19:55:37,980 - BERTopic - Cluster - Completed ✓\n",
24835-
"2024-07-30 19:55:37,988 - BERTopic - Representation - Extracting topics from clusters using representation models.\n",
24836-
"2024-07-30 19:55:39,801 - BERTopic - Representation - Completed ✓\n"
24837-
]
24838-
}
24839-
],
24630+
"outputs": [],
2484024631
"source": [
2484124632
"from bertopic import BERTopic\n",
2484224633
"\n",
@@ -80847,6 +80638,92 @@
8084780638
"source": [
8084880639
"[Link to BERTopic](https://bit.ly/4fjwU9T)."
8084980640
]
80641+
},
80642+
{
80643+
"cell_type": "markdown",
80644+
"id": "064743d9",
80645+
"metadata": {},
80646+
"source": [
80647+
"### Beyond Keywords: Building a Semantic Recipe Search Engine"
80648+
]
80649+
},
80650+
{
80651+
"cell_type": "markdown",
80652+
"id": "82694664",
80653+
"metadata": {},
80654+
"source": [
80655+
"Semantic search enables content discovery based on meaning rather than just keywords. This approach uses vector embeddings: numerical representations of text that capture its meaning.\n",
80656+
"\n",
80657+
"By converting text to vector embeddings, we can quantify semantic similarity between different pieces of content in a high-dimensional vector space. This allows for comparison and search based on underlying meaning, surpassing simple keyword matching.\n",
80658+
"\n",
80659+
"Here's a Python implementation of semantic search for recipe recommendations using sentence-transformers:"
80660+
]
80661+
},
80662+
{
80663+
"cell_type": "code",
80664+
"execution_count": 9,
80665+
"id": "84497e32",
80666+
"metadata": {},
80667+
"outputs": [
80668+
{
80669+
"name": "stdout",
80670+
"output_type": "stream",
80671+
"text": [
80672+
"Query: healthy dessert without sugar\n",
80673+
"Most similar recipes:\n",
80674+
"- No-Bake Berry Chia Seed Pudding (Similarity: 0.55)\n",
80675+
"- Banana and Date Sweetened Oatmeal Cookies (Similarity: 0.43)\n"
80676+
]
80677+
}
80678+
],
80679+
"source": [
80680+
"from sentence_transformers import SentenceTransformer\n",
80681+
"from sklearn.metrics.pairwise import cosine_similarity\n",
80682+
"\n",
80683+
"# Step 1: Prepare our data\n",
80684+
"recipes = [\n",
80685+
" \"Banana and Date Sweetened Oatmeal Cookies\",\n",
80686+
" \"No-Bake Berry Chia Seed Pudding\",\n",
80687+
" \"Deep-Fried Oreo Sundae with Caramel Sauce\",\n",
80688+
" \"Loaded Bacon Cheeseburger Pizza\",\n",
80689+
"]\n",
80690+
"\n",
80691+
"# Step 2: Load a pre-trained model for creating embeddings\n",
80692+
"model = SentenceTransformer('all-MiniLM-L6-v2')\n",
80693+
"\n",
80694+
"# Step 3: Create embeddings for our recipe descriptions\n",
80695+
"recipe_embeddings = model.encode(recipes)\n",
80696+
"\n",
80697+
"# Step 4: Function to find similar recipes \n",
80698+
"def find_similar_recipes(query, top_k=2):\n",
80699+
" # Create embedding for the query\n",
80700+
" query_embedding = model.encode([query])\n",
80701+
" \n",
80702+
" # Calculate similarity\n",
80703+
" similarities = cosine_similarity(query_embedding, recipe_embeddings)[0]\n",
80704+
" \n",
80705+
" # Get top k similar recipes \n",
80706+
" top_indices = similarities.argsort()[-top_k:][::-1]\n",
80707+
" \n",
80708+
" return [(recipes[i], similarities[i]) for i in top_indices]\n",
80709+
"\n",
80710+
"# Step 5: Test our semantic search\n",
80711+
"query = \"healthy dessert without sugar\"\n",
80712+
"results = find_similar_recipes(query)\n",
80713+
"\n",
80714+
"print(f\"Query: {query}\")\n",
80715+
"print(\"Most similar recipes:\")\n",
80716+
"for recipe, score in results:\n",
80717+
" print(f\"- {recipe} (Similarity: {score:.2f})\")"
80718+
]
80719+
},
80720+
{
80721+
"cell_type": "markdown",
80722+
"id": "fe3a8c67",
80723+
"metadata": {},
80724+
"source": [
80725+
"With `top_k=2`, the search returns the two healthier dessert options, even though the query shares no words with any recipe title. This is presumably because the embedding model associates ingredients like berries, chia seeds, bananas, and dates with healthy, sugar-free desserts. The clearly unhealthy options (the deep-fried Oreo sundae and the bacon cheeseburger pizza) score lower and fall outside the top two, which suggests the embeddings capture the notion of \"healthy\" in the dessert context. The score difference (0.55 vs 0.43) indicates that the model considers the chia seed pudding a closer match to the concept of a healthy, sugar-free dessert than the oatmeal cookies."
80726+
]
8085080727
}
8085180728
],
8085280729
"metadata": {

0 commit comments

Comments
 (0)