From 0c165d88dd166eb5364dcbcd37a7720679ae5f36 Mon Sep 17 00:00:00 2001
From: Devin Gaffney <itsme@devingaffney.com>
Date: Mon, 16 Sep 2024 14:59:44 -0700
Subject: [PATCH 1/8] updates for making similarity work

---
 app/main/lib/elastic_crud.py    | 2 +-
 app/main/lib/text_similarity.py | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/app/main/lib/elastic_crud.py b/app/main/lib/elastic_crud.py
index fef79a05..f5e311af 100644
--- a/app/main/lib/elastic_crud.py
+++ b/app/main/lib/elastic_crud.py
@@ -42,7 +42,7 @@ def get_presto_request_response(modality, callback_url, task):
 
 def requires_encoding(obj):
     for model_key in obj.get("models", []):
-        if not obj.get('model_'+model_key):
+        if model_key != "elasticsearch" and not obj.get('model_'+model_key):
             return True
     return False
 
diff --git a/app/main/lib/text_similarity.py b/app/main/lib/text_similarity.py
index eb3830b2..dad634b4 100644
--- a/app/main/lib/text_similarity.py
+++ b/app/main/lib/text_similarity.py
@@ -36,7 +36,7 @@ def async_search_text(task, modality):
     return elastic_crud.get_async_presto_response(task, "text", modality)
 
 def fill_in_openai_embeddings(document):
-    for model_key in document.pop("models", []):
+    for model_key in document.get("models", []):
         if model_key != "elasticsearch" and model_key[:len(PREFIX_OPENAI)] == PREFIX_OPENAI:
             document['vector_'+model_key] = retrieve_openai_embeddings(document['content'], model_key)
             document['model_'+model_key] = 1
@@ -76,7 +76,7 @@ def search_text(search_params, use_document_vectors=False):
     if model_key != "elasticsearch":
       search_params.pop("model", None)
       if use_document_vectors:
-          vector_for_search = search_params[model_key+"-tokens"]
+          vector_for_search = search_params["vector_"+model_key]
       else:
           vector_for_search = None
     result = search_text_by_model(dict(**search_params, **{'model': model_key}), vector_for_search)

From 80a1aef2c7b75e55be4a476a484c293ce81eba07 Mon Sep 17 00:00:00 2001
From: Devin Gaffney <itsme@devingaffney.com>
Date: Tue, 17 Sep 2024 11:06:06 -0700
Subject: [PATCH 2/8] more tweaking during testing

---
 app/main/lib/elasticsearch.py   | 31 +++++++++++++++++--------------
 app/main/lib/text_similarity.py | 16 +++++++++++-----
 2 files changed, 28 insertions(+), 19 deletions(-)

diff --git a/app/main/lib/elasticsearch.py b/app/main/lib/elasticsearch.py
index 70090916..370b1eec 100644
--- a/app/main/lib/elasticsearch.py
+++ b/app/main/lib/elasticsearch.py
@@ -48,16 +48,17 @@ def generate_matches(context):
     matches = []
     clause_count = 0
     for key in context:
-        if isinstance(context[key], list):
-            clause_count += len(context[key])
-            matches.append({
-                'query_string': { 'query': str.join(" OR ", [f"context.{key}: {v}" for v in context[key]])}
-            })
-        else:
-            clause_count += 1
-            matches.append({
-                'match': { 'context.' + key: context[key] }
-            })
+        if key not in ["project_media_id", "has_custom_id", "field"]:
+            if isinstance(context[key], list):
+                clause_count += len(context[key])
+                matches.append({
+                    'query_string': { 'query': str.join(" OR ", [f"context.{key}: {v}" for v in context[key]])}
+                })
+            else:
+                clause_count += 1
+                matches.append({
+                    'match': { 'context.' + key: context[key] }
+                })
     return matches, clause_count
 
 def truncate_query(query, clause_count):
@@ -112,12 +113,14 @@ def get_by_doc_id(doc_id):
     return response['_source']
 
 def store_document(body, doc_id, language=None):
-    for field in ["per_model_threshold", "threshold", "model", "confirmed", "limit", "requires_callback"]:
-        body.pop(field, None)
+    storable_doc = {}
+    for k,v in body.items():
+        if k not in ["per_model_threshold", "threshold", "model", "confirmed", "limit", "requires_callback"]:
+            storable_doc[k] = v
     indices = [app.config['ELASTICSEARCH_SIMILARITY']]
     # 'auto' indicates we should try to guess the appropriate language
     if language == 'auto':
-        text = body['content']
+        text = storable_doc['content']
         language = LangidProvider.langid(text)['result']['language']
         if language not in SUPPORTED_LANGUAGES:
             app.logger.warning('Detected language {} is not supported'.format(language))
@@ -129,7 +132,7 @@ def store_document(body, doc_id, language=None):
     
     results = []
     for index in indices:
-      index_result = update_or_create_document(body, doc_id, index)
+      index_result = update_or_create_document(storable_doc, doc_id, index)
       results.append(index_result)
       if index_result['result'] not in ['created', 'updated', 'noop']:
           app.logger.warning('Problem adding document to ES index for language {0}: {1}'.format(language, index_result))
diff --git a/app/main/lib/text_similarity.py b/app/main/lib/text_similarity.py
index dad634b4..99f72479 100644
--- a/app/main/lib/text_similarity.py
+++ b/app/main/lib/text_similarity.py
@@ -175,6 +175,9 @@ def insert_model_into_response(hits, model_key):
             hit["_source"]["model"] = model_key
     return hits
 
+def return_sources(results):
+    return [dict(**r["_source"], **{"score": r["_score"]}) for r in results]
+
 def strip_vectors(results):
     for result in results:
         vector_keys = [key for key in result["_source"].keys() if key[:7] == "vector_"]
@@ -197,6 +200,7 @@ def restrict_results(results, search_params, model_key):
     return results
 
 def search_text_by_model(search_params, vector_for_search):
+    import code;code.interact(local=dict(globals(), **locals())) 
     app.logger.info(
         f"[Alegre Similarity] search_text_by_model:search_params {search_params}")
     language = None
@@ -260,11 +264,13 @@ def search_text_by_model(search_params, vector_for_search):
         body=body,
         index=search_indices
     )
-    response = strip_vectors(
-        restrict_results(
-            insert_model_into_response(result['hits']['hits'], model_key),
-            search_params,
-            model_key
+    response = return_sources(
+        strip_vectors(
+            restrict_results(
+                insert_model_into_response(result['hits']['hits'], model_key),
+                search_params,
+                model_key
+            )
         )
     )
     return {

From a80df06eb619f1c9576f043a28f9322115659cc7 Mon Sep 17 00:00:00 2001
From: Devin Gaffney <itsme@devingaffney.com>
Date: Tue, 17 Sep 2024 13:17:03 -0700
Subject: [PATCH 3/8] remove debug

---
 app/main/lib/text_similarity.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/app/main/lib/text_similarity.py b/app/main/lib/text_similarity.py
index 99f72479..aa198795 100644
--- a/app/main/lib/text_similarity.py
+++ b/app/main/lib/text_similarity.py
@@ -200,7 +200,6 @@ def restrict_results(results, search_params, model_key):
     return results
 
 def search_text_by_model(search_params, vector_for_search):
-    import code;code.interact(local=dict(globals(), **locals())) 
     app.logger.info(
         f"[Alegre Similarity] search_text_by_model:search_params {search_params}")
     language = None

From 314512b51a00c8249ad36cace36e170e59dfd3ee Mon Sep 17 00:00:00 2001
From: Devin Gaffney <itsme@devingaffney.com>
Date: Tue, 17 Sep 2024 14:45:54 -0700
Subject: [PATCH 4/8] more tweaks

---
 app/main/lib/text_similarity.py            |  2 +-
 app/test/test_similarity.py                | 12 ++++++------
 app/test/test_similarity_lang_analyzers.py |  6 +++---
 3 files changed, 10 insertions(+), 10 deletions(-)

diff --git a/app/main/lib/text_similarity.py b/app/main/lib/text_similarity.py
index aa198795..b915c8dc 100644
--- a/app/main/lib/text_similarity.py
+++ b/app/main/lib/text_similarity.py
@@ -176,7 +176,7 @@ def insert_model_into_response(hits, model_key):
     return hits
 
 def return_sources(results):
-    return [dict(**r["_source"], **{"score": r["_score"]}) for r in results]
+    return [dict(**r["_source"], **{"index": r["_index"], "score": r["_score"]}) for r in results]
 
 def strip_vectors(results):
     for result in results:
diff --git a/app/test/test_similarity.py b/app/test/test_similarity.py
index 28d406cb..b8502f4e 100644
--- a/app/test/test_similarity.py
+++ b/app/test/test_similarity.py
@@ -306,10 +306,10 @@ def test_elasticsearch_performs_correct_fuzzy_search(self):
             post_response = self.client.post('/text/similarity/search/', data=json.dumps(lookup), content_type='application/json')
             lookup["fuzzy"] = True
             post_response_fuzzy = self.client.post('/text/similarity/search/', data=json.dumps(lookup), content_type='application/json')
-            self.assertGreater(json.loads(post_response_fuzzy.data.decode())["result"][0]["_score"], json.loads(post_response.data.decode())["result"][0]["_score"])
+            self.assertGreater(json.loads(post_response_fuzzy.data.decode())["result"][0]["score"], json.loads(post_response.data.decode())["result"][0]["score"])
             lookup["fuzzy"] = False
             post_response_fuzzy = self.client.post('/text/similarity/search/', data=json.dumps(lookup), content_type='application/json')
-            self.assertEqual(json.loads(post_response_fuzzy.data.decode())["result"][0]["_score"], json.loads(post_response.data.decode())["result"][0]["_score"])
+            self.assertEqual(json.loads(post_response_fuzzy.data.decode())["result"][0]["score"], json.loads(post_response.data.decode())["result"][0]["score"])
 
     def test_elasticsearch_update_text(self):
         with self.client:
@@ -455,7 +455,7 @@ def test_model_similarity(self):
         )
         result = json.loads(response.data.decode())
         self.assertEqual(1, len(result['result']))
-        similarity = result['result'][0]['_score']
+        similarity = result['result'][0]['score']
         self.assertGreater(similarity, 0.7)
 
         response = self.client.post(
@@ -487,7 +487,7 @@ def test_model_similarity(self):
         )
         result = json.loads(response.data.decode())
         self.assertEqual(1, len(result['result']))
-        similarity = result['result'][0]['_score']
+        similarity = result['result'][0]['score']
         self.assertGreater(similarity, 0.7)
 
         response = self.client.post(
@@ -501,7 +501,7 @@ def test_model_similarity(self):
         )
         result = json.loads(response.data.decode())
         self.assertEqual(1, len(result['result']))
-        similarity = result['result'][0]['_score']
+        similarity = result['result'][0]['score']
         self.assertGreater(similarity, 0.7)
 
     def test_wrong_model_key(self):
@@ -599,7 +599,7 @@ def test_min_es_search(self):
             result = json.loads(response.data.decode())
 
             self.assertEqual(1, len(result['result']))
-            data['min_es_score']=10+result['result'][0]['_score']
+            data['min_es_score']=10+result['result'][0]['score']
 
             response = self.client.post(
                 '/text/similarity/search/',
diff --git a/app/test/test_similarity_lang_analyzers.py b/app/test/test_similarity_lang_analyzers.py
index b6817750..4378b5c0 100644
--- a/app/test/test_similarity_lang_analyzers.py
+++ b/app/test/test_similarity_lang_analyzers.py
@@ -48,7 +48,7 @@ def test_all_analyzers(self):
                     content_type='application/json'
                 )
                 result = json.loads(response.data.decode())
-                self.assertTrue(app.config['ELASTICSEARCH_SIMILARITY']+"_"+example['language'] in [e['_index'] for e in result['result']])
+                self.assertTrue(app.config['ELASTICSEARCH_SIMILARITY']+"_"+example['language'] in [e['index'] for e in result['result']])
 
     def test_auto_language_id(self):
         # language examples as input to language classifier
@@ -86,7 +86,7 @@ def test_auto_language_id(self):
                 index_alias = app.config['ELASTICSEARCH_SIMILARITY']
                 if expected_lang is not None:
                     index_alias = app.config['ELASTICSEARCH_SIMILARITY']+"_"+expected_lang
-                self.assertTrue(index_alias in [e['_index'] for e in result['result']])
+                self.assertTrue(index_alias in [e['index'] for e in result['result']])
 
     def test_auto_language_query(self):
       # language examples as input to language classifier
@@ -124,7 +124,7 @@ def test_auto_language_query(self):
               index_alias = app.config['ELASTICSEARCH_SIMILARITY']
               if expected_lang is not None:
                   index_alias = app.config['ELASTICSEARCH_SIMILARITY']+"_"+expected_lang
-              self.assertTrue(index_alias in [e['_index'] for e in result['result']])
+              self.assertTrue(index_alias in [e['index'] for e in result['result']])
     
 
 if __name__ == '__main__':

From e03f11ed8293e0a0c5fa68893fe8c49defc86574 Mon Sep 17 00:00:00 2001
From: Devin Gaffney <itsme@devingaffney.com>
Date: Wed, 18 Sep 2024 07:07:14 -0700
Subject: [PATCH 5/8] fix syntax error

---
 app/main/lib/elasticsearch.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/app/main/lib/elasticsearch.py b/app/main/lib/elasticsearch.py
index 370b1eec..7edf966f 100644
--- a/app/main/lib/elasticsearch.py
+++ b/app/main/lib/elasticsearch.py
@@ -114,7 +114,7 @@ def get_by_doc_id(doc_id):
 
 def store_document(body, doc_id, language=None):
     storable_doc = {}
-    for k,v in body.items():
+    for k, v in body.items():
         if k not in ["per_model_threshold", "threshold", "model", "confirmed", "limit", "requires_callback"]:
             storable_doc[k] = v
     indices = [app.config['ELASTICSEARCH_SIMILARITY']]

From f18c122eb67f7cc4c05a936d50c9e789c5d01d05 Mon Sep 17 00:00:00 2001
From: Devin Gaffney <itsme@devingaffney.com>
Date: Thu, 19 Sep 2024 09:31:43 -0700
Subject: [PATCH 6/8] tweak elastic pickups

---
 app/main/lib/text_similarity.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/app/main/lib/text_similarity.py b/app/main/lib/text_similarity.py
index b915c8dc..8a29817b 100644
--- a/app/main/lib/text_similarity.py
+++ b/app/main/lib/text_similarity.py
@@ -44,7 +44,8 @@ def fill_in_openai_embeddings(document):
 
 def async_search_text_on_callback(task):
     app.logger.info(f"async_search_text_on_callback(task) is {task}")
-    document = elastic_crud.get_object_by_doc_id(task["id"])
+    doc_id = task.get("raw", {}).get("doc_id")
+    document = elastic_crud.get_object_by_doc_id(doc_id)
     fill_in_openai_embeddings(document)
     app.logger.info(f"async_search_text_on_callback(task) document is {document}")
     if not elastic_crud.requires_encoding(document):

From 85218acdd815ca0f0fcceb9fe523270981348545 Mon Sep 17 00:00:00 2001
From: Devin Gaffney <itsme@devingaffney.com>
Date: Mon, 23 Sep 2024 07:22:22 -0700
Subject: [PATCH 7/8] small tweak for bad logging line

---
 app/main/lib/elastic_crud.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/app/main/lib/elastic_crud.py b/app/main/lib/elastic_crud.py
index f5e311af..d8335328 100644
--- a/app/main/lib/elastic_crud.py
+++ b/app/main/lib/elastic_crud.py
@@ -66,7 +66,7 @@ def get_blocked_presto_response(task, model, modality):
         return obj, temporary, get_context_for_search(task), {"body": obj}
 
 def get_async_presto_response(task, model, modality):
-    app.logger.error(f"get_async_presto_response: {task} {model} {modality}")
+    app.logger.info(f"get_async_presto_response: {task} {model} {modality}")
     obj, _ = get_object(task, model)
     callback_url = Presto.add_item_callback_url(app.config['ALEGRE_HOST'], modality)
     if task.get("doc_id") is None:

From 67954f68c3d68a9bd8a86d86e738200790b81f82 Mon Sep 17 00:00:00 2001
From: Devin Gaffney <itsme@devingaffney.com>
Date: Wed, 9 Oct 2024 10:53:34 -0700
Subject: [PATCH 8/8] fixes for sync requests

---
 app/main/controller/similarity_sync_controller.py | 2 +-
 app/main/lib/elastic_crud.py                      | 4 ++--
 app/main/lib/presto.py                            | 1 +
 app/main/lib/similarity.py                        | 6 +++++-
 app/main/lib/text_similarity.py                   | 9 +++++++++
 5 files changed, 18 insertions(+), 4 deletions(-)

diff --git a/app/main/controller/similarity_sync_controller.py b/app/main/controller/similarity_sync_controller.py
index f691f76c..2ea5e73f 100644
--- a/app/main/controller/similarity_sync_controller.py
+++ b/app/main/controller/similarity_sync_controller.py
@@ -26,7 +26,7 @@ def post(self, similarity_type):
         app.logger.debug(f"Args are {args}")
         if similarity_type == "text":
             package = similarity.get_body_for_text_document(args, 'query')
-            return similarity.get_similar_items(package, similarity_type)
+            return similarity.blocking_get_similar_items(package, similarity_type)
         else:
             package = similarity.get_body_for_media_document(args, 'query')
             return similarity.blocking_get_similar_items(package, similarity_type)
diff --git a/app/main/lib/elastic_crud.py b/app/main/lib/elastic_crud.py
index d8335328..2c680afe 100644
--- a/app/main/lib/elastic_crud.py
+++ b/app/main/lib/elastic_crud.py
@@ -58,10 +58,10 @@ def get_blocked_presto_response(task, model, modality):
         for model_key in obj.pop("models", []):
             if model_key != "elasticsearch" and not obj.get('model_'+model_key):
                 response = get_presto_request_response(model_key, callback_url, obj)
-                blocked_results.append(Presto.blocked_response(response, modality))
+                blocked_results.append({"model": model_key, "response": Presto.blocked_response(response, modality)})
         # Warning: this is a blocking hold to wait until we get a response in
         # a redis key that we've received something from presto.
-        return obj, temporary, get_context_for_search(task), blocked_results[-1]
+        return obj, temporary, get_context_for_search(task), blocked_results
     else:
         return obj, temporary, get_context_for_search(task), {"body": obj}
 
diff --git a/app/main/lib/presto.py b/app/main/lib/presto.py
index cef3ba88..db26840b 100644
--- a/app/main/lib/presto.py
+++ b/app/main/lib/presto.py
@@ -15,6 +15,7 @@
     "xlm-r-bert-base-nli-stsb-mean-tokens": "mean_tokens__Model",
     "indian-sbert": "indian_sbert__Model",
     "paraphrase-filipino-mpnet-base-v2": "fptg__Model",
+    "paraphrase-multilingual-mpnet-base-v2": "paraphrase_multilingual__Model"
 }
 PRESTO_RESPONSE_TIMEOUT = os.getenv('PRESTO_RESPONSE_TIMEOUT', 120)
 
diff --git a/app/main/lib/similarity.py b/app/main/lib/similarity.py
index 473f9e1c..7c5405a1 100644
--- a/app/main/lib/similarity.py
+++ b/app/main/lib/similarity.py
@@ -6,7 +6,7 @@
 from app.main.lib.shared_models.video_model import VideoModel
 from app.main.lib.presto import Presto, PRESTO_MODEL_MAP
 from app.main.lib.image_similarity import add_image, callback_add_image, delete_image, blocking_search_image, async_search_image, async_search_image_on_callback
-from app.main.lib.text_similarity import add_text, async_search_text, async_search_text_on_callback, callback_add_text, delete_text, search_text
+from app.main.lib.text_similarity import add_text, async_search_text, async_search_text_on_callback, callback_add_text, delete_text, search_text, sync_search_text
 DEFAULT_SEARCH_LIMIT = 200
 logging.basicConfig(level=logging.INFO)
 def get_body_for_media_document(params, mode):
@@ -200,6 +200,10 @@ def blocking_get_similar_items(item, similarity_type):
     response = video_model().blocking_search(model_response_package(item, "search"), "video")
     app.logger.info(f"[Alegre Similarity] [Item {item}, Similarity type: {similarity_type}] response for search was {response}")
     return response
+  elif similarity_type == "text":
+    response = sync_search_text(item, "text")
+    app.logger.info(f"[Alegre Similarity] [Item {item}, Similarity type: {similarity_type}] response for search was {response}")
+    return response
   else:
       raise Exception(f"{similarity_type} modality not implemented for blocking requests!")
 
diff --git a/app/main/lib/text_similarity.py b/app/main/lib/text_similarity.py
index 8a29817b..643c8476 100644
--- a/app/main/lib/text_similarity.py
+++ b/app/main/lib/text_similarity.py
@@ -35,6 +35,15 @@ def get_document_body(body):
 def async_search_text(task, modality):
     return elastic_crud.get_async_presto_response(task, "text", modality)
 
+def sync_search_text(task, modality):
+    obj, temporary, context, presto_result = elastic_crud.get_blocked_presto_response(task, "text", modality)
+    if isinstance(presto_result, list):
+        for presto_vector_result in presto_result:
+            obj['vector_'+presto_vector_result["model"]] = presto_vector_result["response"]["body"]["result"]
+            obj['model_'+presto_vector_result["model"]] = 1
+    document, _ = elastic_crud.get_object(obj, "text")
+    return search_text(document, True)
+
 def fill_in_openai_embeddings(document):
     for model_key in document.get("models", []):
         if model_key != "elasticsearch" and model_key[:len(PREFIX_OPENAI)] == PREFIX_OPENAI: