From 0c165d88dd166eb5364dcbcd37a7720679ae5f36 Mon Sep 17 00:00:00 2001 From: Devin Gaffney Date: Mon, 16 Sep 2024 14:59:44 -0700 Subject: [PATCH 1/8] updates for making similarity work --- app/main/lib/elastic_crud.py | 2 +- app/main/lib/text_similarity.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/app/main/lib/elastic_crud.py b/app/main/lib/elastic_crud.py index fef79a05..f5e311af 100644 --- a/app/main/lib/elastic_crud.py +++ b/app/main/lib/elastic_crud.py @@ -42,7 +42,7 @@ def get_presto_request_response(modality, callback_url, task): def requires_encoding(obj): for model_key in obj.get("models", []): - if not obj.get('model_'+model_key): + if model_key != "elasticsearch" and not obj.get('model_'+model_key): return True return False diff --git a/app/main/lib/text_similarity.py b/app/main/lib/text_similarity.py index eb3830b2..dad634b4 100644 --- a/app/main/lib/text_similarity.py +++ b/app/main/lib/text_similarity.py @@ -36,7 +36,7 @@ def async_search_text(task, modality): return elastic_crud.get_async_presto_response(task, "text", modality) def fill_in_openai_embeddings(document): - for model_key in document.pop("models", []): + for model_key in document.get("models", []): if model_key != "elasticsearch" and model_key[:len(PREFIX_OPENAI)] == PREFIX_OPENAI: document['vector_'+model_key] = retrieve_openai_embeddings(document['content'], model_key) document['model_'+model_key] = 1 @@ -76,7 +76,7 @@ def search_text(search_params, use_document_vectors=False): if model_key != "elasticsearch": search_params.pop("model", None) if use_document_vectors: - vector_for_search = search_params[model_key+"-tokens"] + vector_for_search = search_params["vector_"+model_key] else: vector_for_search = None result = search_text_by_model(dict(**search_params, **{'model': model_key}), vector_for_search) From 80a1aef2c7b75e55be4a476a484c293ce81eba07 Mon Sep 17 00:00:00 2001 From: Devin Gaffney Date: Tue, 17 Sep 2024 11:06:06 -0700 Subject: [PATCH 2/8] more tweaking during testing --- app/main/lib/elasticsearch.py | 31 +++++++++++++++++-------------- app/main/lib/text_similarity.py | 16 +++++++++++----- 2 files changed, 28 insertions(+), 19 deletions(-) diff --git a/app/main/lib/elasticsearch.py b/app/main/lib/elasticsearch.py index 70090916..370b1eec 100644 --- a/app/main/lib/elasticsearch.py +++ b/app/main/lib/elasticsearch.py @@ -48,16 +48,17 @@ def generate_matches(context): matches = [] clause_count = 0 for key in context: - if isinstance(context[key], list): - clause_count += len(context[key]) - matches.append({ - 'query_string': { 'query': str.join(" OR ", [f"context.{key}: {v}" for v in context[key]])} - }) - else: - clause_count += 1 - matches.append({ - 'match': { 'context.' + key: context[key] } - }) + if key not in ["project_media_id", "has_custom_id", "field"]: + if isinstance(context[key], list): + clause_count += len(context[key]) + matches.append({ + 'query_string': { 'query': str.join(" OR ", [f"context.{key}: {v}" for v in context[key]])} + }) + else: + clause_count += 1 + matches.append({ + 'match': { 'context.' + key: context[key] } + }) return matches, clause_count def truncate_query(query, clause_count): @@ -112,12 +113,14 @@ def get_by_doc_id(doc_id): return response['_source'] def store_document(body, doc_id, language=None): - for field in ["per_model_threshold", "threshold", "model", "confirmed", "limit", "requires_callback"]: - body.pop(field, None) + storable_doc = {} + for k,v in body.items(): + if k not in ["per_model_threshold", "threshold", "model", "confirmed", "limit", "requires_callback"]: + storable_doc[k] = v indices = [app.config['ELASTICSEARCH_SIMILARITY']] # 'auto' indicates we should try to guess the appropriate language if language == 'auto': - text = body['content'] + text = storable_doc['content'] language = LangidProvider.langid(text)['result']['language'] if language not in SUPPORTED_LANGUAGES: app.logger.warning('Detected language {} is not supported'.format(language)) @@ -129,7 +132,7 @@ def store_document(body, doc_id, language=None): results = [] for index in indices: - index_result = update_or_create_document(body, doc_id, index) + index_result = update_or_create_document(storable_doc, doc_id, index) results.append(index_result) if index_result['result'] not in ['created', 'updated', 'noop']: app.logger.warning('Problem adding document to ES index for language {0}: {1}'.format(language, index_result)) diff --git a/app/main/lib/text_similarity.py b/app/main/lib/text_similarity.py index dad634b4..99f72479 100644 --- a/app/main/lib/text_similarity.py +++ b/app/main/lib/text_similarity.py @@ -175,6 +175,9 @@ def insert_model_into_response(hits, model_key): hit["_source"]["model"] = model_key return hits +def return_sources(results): + return [dict(**r["_source"], **{"score": r["_score"]}) for r in results] + def strip_vectors(results): for result in results: vector_keys = [key for key in result["_source"].keys() if key[:7] == "vector_"] @@ -197,6 +200,7 @@ def restrict_results(results, search_params, model_key): return results def search_text_by_model(search_params, vector_for_search): + import code;code.interact(local=dict(globals(), **locals())) app.logger.info( f"[Alegre Similarity] search_text_by_model:search_params {search_params}") language = None @@ -260,11 +264,13 @@ def search_text_by_model(search_params, vector_for_search): body=body, index=search_indices ) - response = strip_vectors( - restrict_results( - insert_model_into_response(result['hits']['hits'], model_key), - search_params, - model_key + response = return_sources( + strip_vectors( + restrict_results( + insert_model_into_response(result['hits']['hits'], model_key), + search_params, + model_key + ) ) ) return { From a80df06eb619f1c9576f043a28f9322115659cc7 Mon Sep 17 00:00:00 2001 From: Devin Gaffney Date: Tue, 17 Sep 2024 13:17:03 -0700 Subject: [PATCH 3/8] remove debug --- app/main/lib/text_similarity.py | 1 - 1 file changed, 1 deletion(-) diff --git a/app/main/lib/text_similarity.py b/app/main/lib/text_similarity.py index 99f72479..aa198795 100644 --- a/app/main/lib/text_similarity.py +++ b/app/main/lib/text_similarity.py @@ -200,7 +200,6 @@ def restrict_results(results, search_params, model_key): return results def search_text_by_model(search_params, vector_for_search): - import code;code.interact(local=dict(globals(), **locals())) app.logger.info( f"[Alegre Similarity] search_text_by_model:search_params {search_params}") language = None From 314512b51a00c8249ad36cace36e170e59dfd3ee Mon Sep 17 00:00:00 2001 From: Devin Gaffney Date: Tue, 17 Sep 2024 14:45:54 -0700 Subject: [PATCH 4/8] more tweaks --- app/main/lib/text_similarity.py | 2 +- app/test/test_similarity.py | 12 ++++++------ app/test/test_similarity_lang_analyzers.py | 6 +++--- 3 files changed, 10 insertions(+), 10 deletions(-) diff --git a/app/main/lib/text_similarity.py b/app/main/lib/text_similarity.py index aa198795..b915c8dc 100644 --- a/app/main/lib/text_similarity.py +++ b/app/main/lib/text_similarity.py @@ -176,7 +176,7 @@ def insert_model_into_response(hits, model_key): return hits def return_sources(results): - return [dict(**r["_source"], **{"score": r["_score"]}) for r in results] + return [dict(**r["_source"], **{"index": r["_index"], "score": r["_score"]}) for r in results] def strip_vectors(results): for result in results: diff --git a/app/test/test_similarity.py b/app/test/test_similarity.py index 28d406cb..b8502f4e 100644 --- a/app/test/test_similarity.py +++ b/app/test/test_similarity.py @@ -306,10 +306,10 @@ def test_elasticsearch_performs_correct_fuzzy_search(self): post_response = self.client.post('/text/similarity/search/', data=json.dumps(lookup), content_type='application/json') lookup["fuzzy"] = True post_response_fuzzy = self.client.post('/text/similarity/search/', data=json.dumps(lookup), content_type='application/json') - self.assertGreater(json.loads(post_response_fuzzy.data.decode())["result"][0]["_score"], json.loads(post_response.data.decode())["result"][0]["_score"]) + self.assertGreater(json.loads(post_response_fuzzy.data.decode())["result"][0]["score"], json.loads(post_response.data.decode())["result"][0]["score"]) lookup["fuzzy"] = False post_response_fuzzy = self.client.post('/text/similarity/search/', data=json.dumps(lookup), content_type='application/json') - self.assertEqual(json.loads(post_response_fuzzy.data.decode())["result"][0]["_score"], json.loads(post_response.data.decode())["result"][0]["_score"]) + self.assertEqual(json.loads(post_response_fuzzy.data.decode())["result"][0]["score"], json.loads(post_response.data.decode())["result"][0]["score"]) def test_elasticsearch_update_text(self): with self.client: @@ -455,7 +455,7 @@ def test_model_similarity(self): ) result = json.loads(response.data.decode()) self.assertEqual(1, len(result['result'])) - similarity = result['result'][0]['_score'] + similarity = result['result'][0]['score'] self.assertGreater(similarity, 0.7) response = self.client.post( @@ -487,7 +487,7 @@ def test_model_similarity(self): ) result = json.loads(response.data.decode()) self.assertEqual(1, len(result['result'])) - similarity = result['result'][0]['_score'] + similarity = result['result'][0]['score'] self.assertGreater(similarity, 0.7) response = self.client.post( @@ -501,7 +501,7 @@ def test_model_similarity(self): ) result = json.loads(response.data.decode()) self.assertEqual(1, len(result['result'])) - similarity = result['result'][0]['_score'] + similarity = result['result'][0]['score'] self.assertGreater(similarity, 0.7) def test_wrong_model_key(self): @@ -599,7 +599,7 @@ def test_min_es_search(self): result = json.loads(response.data.decode()) self.assertEqual(1, len(result['result'])) - data['min_es_score']=10+result['result'][0]['_score'] + data['min_es_score']=10+result['result'][0]['score'] response = self.client.post( '/text/similarity/search/', diff --git a/app/test/test_similarity_lang_analyzers.py b/app/test/test_similarity_lang_analyzers.py index b6817750..4378b5c0 100644 --- a/app/test/test_similarity_lang_analyzers.py +++ b/app/test/test_similarity_lang_analyzers.py @@ -48,7 +48,7 @@ def test_all_analyzers(self): content_type='application/json' ) result = json.loads(response.data.decode()) - self.assertTrue(app.config['ELASTICSEARCH_SIMILARITY']+"_"+example['language'] in [e['_index'] for e in result['result']]) + self.assertTrue(app.config['ELASTICSEARCH_SIMILARITY']+"_"+example['language'] in [e['index'] for e in result['result']]) def test_auto_language_id(self): # language examples as input to language classifier @@ -86,7 +86,7 @@ def test_auto_language_id(self): index_alias = app.config['ELASTICSEARCH_SIMILARITY'] if expected_lang is not None: index_alias = app.config['ELASTICSEARCH_SIMILARITY']+"_"+expected_lang - self.assertTrue(index_alias in [e['_index'] for e in result['result']]) + self.assertTrue(index_alias in [e['index'] for e in result['result']]) def test_auto_language_query(self): # language examples as input to language classifier @@ -124,7 +124,7 @@ def test_auto_language_query(self): index_alias = app.config['ELASTICSEARCH_SIMILARITY'] if expected_lang is not None: index_alias = app.config['ELASTICSEARCH_SIMILARITY']+"_"+expected_lang - self.assertTrue(index_alias in [e['_index'] for e in result['result']]) + self.assertTrue(index_alias in [e['index'] for e in result['result']]) if __name__ == '__main__': From e03f11ed8293e0a0c5fa68893fe8c49defc86574 Mon Sep 17 00:00:00 2001 From: Devin Gaffney Date: Wed, 18 Sep 2024 07:07:14 -0700 Subject: [PATCH 5/8] fix syntax error --- app/main/lib/elasticsearch.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/app/main/lib/elasticsearch.py b/app/main/lib/elasticsearch.py index 370b1eec..7edf966f 100644 --- a/app/main/lib/elasticsearch.py +++ b/app/main/lib/elasticsearch.py @@ -114,7 +114,7 @@ def get_by_doc_id(doc_id): def store_document(body, doc_id, language=None): storable_doc = {} - for k,v in body.items(): + for k, v in body.items(): if k not in ["per_model_threshold", "threshold", "model", "confirmed", "limit", "requires_callback"]: storable_doc[k] = v indices = [app.config['ELASTICSEARCH_SIMILARITY']] From f18c122eb67f7cc4c05a936d50c9e789c5d01d05 Mon Sep 17 00:00:00 2001 From: Devin Gaffney Date: Thu, 19 Sep 2024 09:31:43 -0700 Subject: [PATCH 6/8] tweak elastic pickups --- app/main/lib/text_similarity.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/app/main/lib/text_similarity.py b/app/main/lib/text_similarity.py index b915c8dc..8a29817b 100644 --- a/app/main/lib/text_similarity.py +++ b/app/main/lib/text_similarity.py @@ -44,7 +44,8 @@ def fill_in_openai_embeddings(document): def async_search_text_on_callback(task): app.logger.info(f"async_search_text_on_callback(task) is {task}") - document = elastic_crud.get_object_by_doc_id(task["id"]) + doc_id = task.get("raw", {}).get("doc_id") + document = elastic_crud.get_object_by_doc_id(doc_id) fill_in_openai_embeddings(document) app.logger.info(f"async_search_text_on_callback(task) document is {document}") if not elastic_crud.requires_encoding(document): From 85218acdd815ca0f0fcceb9fe523270981348545 Mon Sep 17 00:00:00 2001 From: Devin Gaffney Date: Mon, 23 Sep 2024 07:22:22 -0700 Subject: [PATCH 7/8] small tweak for bad logging line --- app/main/lib/elastic_crud.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/app/main/lib/elastic_crud.py b/app/main/lib/elastic_crud.py index f5e311af..d8335328 100644 --- a/app/main/lib/elastic_crud.py +++ b/app/main/lib/elastic_crud.py @@ -66,7 +66,7 @@ def get_blocked_presto_response(task, model, modality): return obj, temporary, get_context_for_search(task), {"body": obj} def get_async_presto_response(task, model, modality): - app.logger.error(f"get_async_presto_response: {task} {model} {modality}") + app.logger.info(f"get_async_presto_response: {task} {model} {modality}") obj, _ = get_object(task, model) callback_url = Presto.add_item_callback_url(app.config['ALEGRE_HOST'], modality) if task.get("doc_id") is None: From 67954f68c3d68a9bd8a86d86e738200790b81f82 Mon Sep 17 00:00:00 2001 From: Devin Gaffney Date: Wed, 9 Oct 2024 10:53:34 -0700 Subject: [PATCH 8/8] fixes for sync requests --- app/main/controller/similarity_sync_controller.py | 2 +- app/main/lib/elastic_crud.py | 4 ++-- app/main/lib/presto.py | 1 + app/main/lib/similarity.py | 6 +++++- app/main/lib/text_similarity.py | 9 +++++++++ 5 files changed, 18 insertions(+), 4 deletions(-) diff --git a/app/main/controller/similarity_sync_controller.py b/app/main/controller/similarity_sync_controller.py index f691f76c..2ea5e73f 100644 --- a/app/main/controller/similarity_sync_controller.py +++ b/app/main/controller/similarity_sync_controller.py @@ -26,7 +26,7 @@ def post(self, similarity_type): app.logger.debug(f"Args are {args}") if similarity_type == "text": package = similarity.get_body_for_text_document(args, 'query') - return similarity.get_similar_items(package, similarity_type) + return similarity.blocking_get_similar_items(package, similarity_type) else: package = similarity.get_body_for_media_document(args, 'query') return similarity.blocking_get_similar_items(package, similarity_type) diff --git a/app/main/lib/elastic_crud.py b/app/main/lib/elastic_crud.py index d8335328..2c680afe 100644 --- a/app/main/lib/elastic_crud.py +++ b/app/main/lib/elastic_crud.py @@ -58,10 +58,10 @@ def get_blocked_presto_response(task, model, modality): for model_key in obj.pop("models", []): if model_key != "elasticsearch" and not obj.get('model_'+model_key): response = get_presto_request_response(model_key, callback_url, obj) - blocked_results.append(Presto.blocked_response(response, modality)) + blocked_results.append({"model": model_key, "response": Presto.blocked_response(response, modality)}) # Warning: this is a blocking hold to wait until we get a response in # a redis key that we've received something from presto. - return obj, temporary, get_context_for_search(task), blocked_results[-1] + return obj, temporary, get_context_for_search(task), blocked_results else: return obj, temporary, get_context_for_search(task), {"body": obj} diff --git a/app/main/lib/presto.py b/app/main/lib/presto.py index cef3ba88..db26840b 100644 --- a/app/main/lib/presto.py +++ b/app/main/lib/presto.py @@ -15,6 +15,7 @@ "xlm-r-bert-base-nli-stsb-mean-tokens": "mean_tokens__Model", "indian-sbert": "indian_sbert__Model", "paraphrase-filipino-mpnet-base-v2": "fptg__Model", + "paraphrase-multilingual-mpnet-base-v2": "paraphrase_multilingual__Model" } PRESTO_RESPONSE_TIMEOUT = os.getenv('PRESTO_RESPONSE_TIMEOUT', 120) diff --git a/app/main/lib/similarity.py b/app/main/lib/similarity.py index 473f9e1c..7c5405a1 100644 --- a/app/main/lib/similarity.py +++ b/app/main/lib/similarity.py @@ -6,7 +6,7 @@ from app.main.lib.shared_models.video_model import VideoModel from app.main.lib.presto import Presto, PRESTO_MODEL_MAP from app.main.lib.image_similarity import add_image, callback_add_image, delete_image, blocking_search_image, async_search_image, async_search_image_on_callback -from app.main.lib.text_similarity import add_text, async_search_text, async_search_text_on_callback, callback_add_text, delete_text, search_text +from app.main.lib.text_similarity import add_text, async_search_text, async_search_text_on_callback, callback_add_text, delete_text, search_text, sync_search_text DEFAULT_SEARCH_LIMIT = 200 logging.basicConfig(level=logging.INFO) def get_body_for_media_document(params, mode): @@ -200,6 +200,10 @@ def blocking_get_similar_items(item, similarity_type): response = video_model().blocking_search(model_response_package(item, "search"), "video") app.logger.info(f"[Alegre Similarity] [Item {item}, Similarity type: {similarity_type}] response for search was {response}") return response + elif similarity_type == "text": + response = sync_search_text(item, "text") + app.logger.info(f"[Alegre Similarity] [Item {item}, Similarity type: {similarity_type}] response for search was {response}") + return response else: raise Exception(f"{similarity_type} modality not implemented for blocking requests!") diff --git a/app/main/lib/text_similarity.py b/app/main/lib/text_similarity.py index 8a29817b..643c8476 100644 --- a/app/main/lib/text_similarity.py +++ b/app/main/lib/text_similarity.py @@ -35,6 +35,15 @@ def get_document_body(body): def async_search_text(task, modality): return elastic_crud.get_async_presto_response(task, "text", modality) +def sync_search_text(task, modality): + obj, temporary, context, presto_result = elastic_crud.get_blocked_presto_response(task, "text", modality) + if isinstance(presto_result, list): + for presto_vector_result in presto_result: + obj['vector_'+presto_vector_result["model"]] = presto_vector_result["response"]["body"]["result"] + obj['model_'+presto_vector_result["model"]] = 1 + document, _ = elastic_crud.get_object(obj, "text") + return search_text(document, True) + def fill_in_openai_embeddings(document): for model_key in document.get("models", []): if model_key != "elasticsearch" and model_key[:len(PREFIX_OPENAI)] == PREFIX_OPENAI: