|
1 | 1 | from SPARQLWrapper import SPARQLWrapper, JSON
|
| 2 | +from utils import * |
| 3 | + |
2 | 4 | SPARQLPATH = "http://192.168.80.12:8890/sparql" # depend on your own internal address and port, shown in Freebase folder's readme.md
|
3 | 5 |
|
4 | 6 | # pre-defined sparqls
|
@@ -42,3 +44,216 @@ def id2entity_name_or_type(entity_id):
|
42 | 44 | return "UnName_Entity"
|
43 | 45 | else:
|
44 | 46 | return results["results"]["bindings"][0]['tailEntity']['value']
|
| 47 | + |
| 48 | +from freebase_func import * |
| 49 | +from prompt_list import * |
| 50 | +import json |
| 51 | +import time |
| 52 | +import openai |
| 53 | +import re |
| 54 | +from prompt_list import * |
| 55 | +from rank_bm25 import BM25Okapi |
| 56 | +from sentence_transformers import util |
| 57 | +from sentence_transformers import SentenceTransformer |
| 58 | + |
| 59 | + |
def clean_relations(string, entity_id, head_relations):
    """Parse '{relation (Score: x.y)}' entries out of an LLM response.

    Returns (True, list of relation dicts) on success, or (False, reason)
    when the output is malformed or contains no usable relations.
    """
    pattern = r"{\s*(?P<relation>[^()]+)\s+\(Score:\s+(?P<score>[0-9.]+)\)}"
    parsed = []
    for match in re.finditer(pattern, string):
        rel_name = match.group("relation").strip()
        # Entries with ';' jam several relations together — skip them.
        if ';' in rel_name:
            continue
        raw_score = match.group("score")
        if not rel_name or not raw_score:
            return False, "output uncompleted.."
        try:
            numeric_score = float(raw_score)
        except ValueError:
            return False, "Invalid score"
        parsed.append({
            "entity": entity_id,
            "relation": rel_name,
            "score": numeric_score,
            "head": rel_name in head_relations,
        })
    if not parsed:
        return False, "No relations found"
    return True, parsed
| 81 | + |
| 82 | + |
def if_all_zero(topn_scores):
    """Return True iff every score equals zero (vacuously True when empty)."""
    for value in topn_scores:
        if value != 0:
            return False
    return True
| 85 | + |
| 86 | + |
def clean_relations_bm25_sent(topn_relations, topn_scores, entity_id, head_relations):
    """Package BM25 / sentence-embedding retrieval output as relation dicts.

    When every retrieval score is zero, fall back to a uniform distribution
    so downstream pruning still has usable weights. Always returns
    (True, list).
    """
    if if_all_zero(topn_scores):
        topn_scores = [float(1 / len(topn_scores))] * len(topn_scores)
    cleaned = [
        {
            "entity": entity_id,
            "relation": rel,
            "score": rel_score,
            "head": rel in head_relations,
        }
        for rel, rel_score in zip(topn_relations, topn_scores)
    ]
    return True, cleaned
| 99 | + |
| 100 | + |
def construct_relation_prune_prompt(question, entity_name, total_relations, args):
    """Build the LLM prompt asking it to keep the top ``args.width`` relations."""
    header = extract_relation_prompt % (args.width, args.width)
    relation_list = '; '.join(total_relations)
    return header + question + '\nTopic Entity: ' + entity_name + '\nRelations: ' + relation_list + "\nA: "
| 103 | + |
| 104 | + |
def construct_entity_score_prompt(question, relation, entity_candidates):
    """Build the LLM prompt asking for scores over the candidate entities."""
    candidate_str = "; ".join(entity_candidates)
    return score_entity_candidates_prompt.format(question, relation) + candidate_str + '\nScore: '
| 107 | + |
| 108 | + |
def relation_search_prune(entity_id, entity_name, pre_relations, pre_head, question, args):
    """Fetch the head/tail relations of *entity_id* from the KG and prune
    them to the most promising ones for *question*.

    Pruning is delegated to an LLM, BM25, or a SentenceTransformer encoder
    depending on ``args.prune_tools``. Returns a list of scored relation
    dicts, or [] when the pruner output could not be parsed (format error
    or too small max_length).
    """
    head_relations = replace_relation_prefix(execurte_sparql(sparql_head_relations % (entity_id)))
    tail_relations = replace_relation_prefix(execurte_sparql(sparql_tail_relations % (entity_id)))

    if args.remove_unnecessary_rel:
        head_relations = [rel for rel in head_relations if not abandon_rels(rel)]
        tail_relations = [rel for rel in tail_relations if not abandon_rels(rel)]

    # Drop the relations already explored in the previous hop, on the side
    # we arrived from.
    if pre_head:
        tail_relations = list(set(tail_relations) - set(pre_relations))
    else:
        head_relations = list(set(head_relations) - set(pre_relations))

    head_relations = list(set(head_relations))
    tail_relations = list(set(tail_relations))
    # Sort so the relation order in the prompt is always deterministic.
    total_relations = sorted(head_relations + tail_relations)

    if args.prune_tools == "llm":
        prompt = construct_relation_prune_prompt(question, entity_name, total_relations, args)
        result = run_llm(prompt, args.temperature_exploration, args.max_length, args.opeani_api_keys, args.LLM_type)
        flag, retrieve_relations_with_scores = clean_relations(result, entity_id, head_relations)
    elif args.prune_tools == "bm25":
        topn_relations, topn_scores = compute_bm25_similarity(question, total_relations, args.width)
        flag, retrieve_relations_with_scores = clean_relations_bm25_sent(topn_relations, topn_scores, entity_id, head_relations)
    else:
        encoder = SentenceTransformer('sentence-transformers/msmarco-distilbert-base-tas-b')
        topn_relations, topn_scores = retrieve_top_docs(question, total_relations, encoder, args.width)
        flag, retrieve_relations_with_scores = clean_relations_bm25_sent(topn_relations, topn_scores, entity_id, head_relations)

    # On parse failure the caller gets an empty list rather than an error.
    return retrieve_relations_with_scores if flag else []
| 150 | + |
| 151 | + |
def entity_search(entity, relation, head=True):
    """Follow *relation* from *entity* and return the connected entity ids.

    head=True walks entity -> relation -> ?tail; head=False walks
    ?head -> relation -> entity. Only machine ids (the "m." prefix) are
    kept.
    """
    if head:
        query = sparql_tail_entities_extract % (entity, relation)
    else:
        query = sparql_head_entities_extract % (entity, relation)
    results = execurte_sparql(query)

    candidate_ids = replace_entities_prefix(results)
    return [eid for eid in candidate_ids if eid.startswith("m.")]
| 164 | + |
| 165 | + |
def entity_score(question, entity_candidates_id, score, relation, args):
    """Score each candidate entity reached via *relation* for *question*.

    Returns (scores, entity_candidates, entity_candidates_id); each
    per-candidate score is weighted by the incoming relation *score*.
    """
    # Resolve ids to surface names; id2entity_name_or_type yields
    # "UnName_Entity" when no label is found.
    entity_candidates = [id2entity_name_or_type(entity_id) for entity_id in entity_candidates_id]
    # If no candidate has a usable name, spread the relation score uniformly.
    if all_unknown_entity(entity_candidates):
        return [1/len(entity_candidates) * score] * len(entity_candidates), entity_candidates, entity_candidates_id
    # NOTE(review): presumably this drops "UnName_Entity" names, but the ids
    # list is not filtered in parallel, so names and ids can fall out of
    # positional alignment from here on — confirm del_unknown_entity's
    # semantics against its definition.
    entity_candidates = del_unknown_entity(entity_candidates)
    if len(entity_candidates) == 1:
        return [score], entity_candidates, entity_candidates_id
    if len(entity_candidates) == 0:
        return [0.0], entity_candidates, entity_candidates_id

    # make sure the id and entity are in the same order
    zipped_lists = sorted(zip(entity_candidates, entity_candidates_id))
    entity_candidates, entity_candidates_id = zip(*zipped_lists)
    entity_candidates = list(entity_candidates)
    entity_candidates_id = list(entity_candidates_id)
    if args.prune_tools == "llm":
        prompt = construct_entity_score_prompt(question, relation, entity_candidates)

        result = run_llm(prompt, args.temperature_exploration, args.max_length, args.opeani_api_keys, args.LLM_type)
        # The LLM returns one score per candidate; scale by the relation score.
        return [float(x) * score for x in clean_scores(result, entity_candidates)], entity_candidates, entity_candidates_id

    elif args.prune_tools == "bm25":
        topn_entities, topn_scores = compute_bm25_similarity(question, entity_candidates, args.width)
    else:
        model = SentenceTransformer('sentence-transformers/msmarco-distilbert-base-tas-b')
        topn_entities, topn_scores = retrieve_top_docs(question, entity_candidates, model, args.width)
    # All-zero retrieval scores would zero out every candidate; fall back to
    # a uniform distribution instead.
    if if_all_zero(topn_scores):
        topn_scores = [float(1/len(topn_scores))] * len(topn_scores)
    return [float(x) * score for x in topn_scores], topn_entities, entity_candidates_id
| 195 | + |
| 196 | + |
def update_history(entity_candidates, entity, scores, entity_candidates_id, total_candidates, total_scores, total_relations, total_entities_id, total_topic_entities, total_head):
    """Append this relation's candidate entities (plus bookkeeping columns)
    onto the running per-depth accumulator lists.

    The accumulators are mutated in place and also returned. An empty
    candidate set is recorded with the "[FINISH]" / "[FINISH_ID]" sentinel
    pair so the search still leaves a trace at this depth.
    """
    if not entity_candidates:
        entity_candidates.append("[FINISH]")
        entity_candidates_id = ["[FINISH_ID]"]
    n = len(entity_candidates)
    total_candidates.extend(entity_candidates)
    total_scores.extend(scores)
    total_relations.extend([entity['relation']] * n)
    total_entities_id.extend(entity_candidates_id)
    total_topic_entities.extend([entity['entity']] * n)
    total_head.extend([entity['head']] * n)
    return total_candidates, total_scores, total_relations, total_entities_id, total_topic_entities, total_head
| 211 | + |
| 212 | + |
def half_stop(question, cluster_chain_of_entities, depth, args):
    """Terminate exploration early: answer from the evidence gathered so far
    and persist the result to the dataset's jsonl file."""
    print("No new knowledge added during search depth %d, stop searching." % depth)
    final_answer = generate_answer(question, cluster_chain_of_entities, args)
    save_2_jsonl(question, final_answer, cluster_chain_of_entities, file_name=args.dataset)
| 217 | + |
| 218 | + |
def generate_answer(question, cluster_chain_of_entities, args):
    """Ask the LLM for a final answer given the question and all retrieved
    knowledge triplets (flattened one chain per line)."""
    triplet_lines = []
    for sublist in cluster_chain_of_entities:
        for chain in sublist:
            triplet_lines.append(', '.join(str(x) for x in chain))
    chain_prompt = '\n'.join(triplet_lines)
    prompt = answer_prompt + question + '\n' + "\nKnowledge Triplets: " + chain_prompt + 'A: '
    return run_llm(prompt, args.temperature_reasoning, args.max_length, args.opeani_api_keys, args.LLM_type)
| 225 | + |
| 226 | + |
def entity_prune(total_entities_id, total_relations, total_candidates, total_topic_entities, total_head, total_scores, args):
    """Keep the top-``args.width`` scoring candidate entities and assemble
    the (topic, relation, candidate) triplet chains for this depth.

    Returns (flag, cluster_chain_of_entities, entities_id, relations,
    heads); flag is False when every surviving candidate scored 0.
    """
    rows = sorted(
        zip(total_entities_id, total_relations, total_candidates, total_topic_entities, total_head, total_scores),
        key=lambda row: row[5],
        reverse=True,
    )[:args.width]
    # Zero-scored rows carry no evidence; drop them.
    kept = [row for row in rows if row[5] != 0]
    if not kept:
        return False, [], [], [], []
    entities_id, relations, candidates, tops, heads, scores = (list(col) for col in zip(*kept))

    # Resolve topic-entity ids to readable names for the reasoning prompt.
    tops = [id2entity_name_or_type(eid) for eid in tops]
    cluster_chain_of_entities = [[(tops[i], relations[i], candidates[i]) for i in range(len(candidates))]]
    return True, cluster_chain_of_entities, entities_id, relations, heads
| 242 | + |
| 243 | + |
def reasoning(question, cluster_chain_of_entities, args):
    """Ask the LLM whether the gathered triplets suffice to answer the
    question. Returns (sufficient?, raw LLM response)."""
    chain_prompt = '\n'.join(
        ', '.join(str(x) for x in chain)
        for sublist in cluster_chain_of_entities
        for chain in sublist
    )
    prompt = prompt_evaluate + question + "\nKnowledge Triplets: " + chain_prompt + 'A: '

    response = run_llm(prompt, args.temperature_reasoning, args.max_length, args.opeani_api_keys, args.LLM_type)

    verdict = extract_answer(response)
    return (True, response) if if_true(verdict) else (False, response)
| 256 | + |
| 257 | + |
| 258 | + |
| 259 | + |
0 commit comments