# https://github.com/maszhongming/UniEval/tree/main

+from dataclasses import dataclass, field
+from tqdm import tqdm
import torch
from torch import nn
-from dataclasses import dataclass, field
-import asyncio
-from tqdm.asyncio import tqdm as tqdm_async
+import torch.multiprocessing as mp

from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
-from models.evaluate.base_evaluator import BaseEvaluator
-from utils import create_event_loop
-from models.text.text_pair import TextPair
-
+from models import TextPair
+
+
+def _add_questions(dimension: str, question: str, answer: str):
+    if dimension == "naturalness":
+        cur_input = 'question: Is this a natural response in the dialogue? </s> response: ' + answer
+    elif dimension == "coherence":
+        cur_input = 'question: Is this a coherent response given the dialogue history? </s> response: ' \
+                    + answer + ' </s> dialogue history: ' + question
+    elif dimension == "understandability":
+        cur_input = 'question: Is this an understandable response in the dialogue? </s> response: ' + answer
+    else:
+        raise NotImplementedError(
+            'The input format for this dimension is still undefined. Please customize it first.')
+    return cur_input

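+# For reference, the "coherence" branch above renders to:
+#   question: Is this a coherent response given the dialogue history? </s> response: <answer> </s> dialogue history: <question>
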
@dataclass
-class UniEvaluator(BaseEvaluator):
+class UniEvaluator:
    model_name: str = "MingZhong/unieval-sum"
    dimensions: list = field(default_factory=lambda: ['naturalness', 'coherence', 'understandability'])
-    max_length: int = 1024
+    max_length: int = 2560

    def __post_init__(self):
-        self.model = AutoModelForSeq2SeqLM.from_pretrained(self.model_name)
-        self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)
-
-        self.model.eval()
-        self.model.to("cuda")
+        self.num_gpus = torch.cuda.device_count()

-        self.softmax = nn.Softmax(dim=1)
+    # Worker run in a child process: scores one chunk of pairs on one GPU.
+    @staticmethod
+    def process_chunk(rank, pairs, model_name, max_length, dimension, return_dict):
+        device = f'cuda:{rank}'
+        torch.cuda.set_device(rank)

-        self.pos_id = self.tokenizer("Yes")["input_ids"][0]
-        self.neg_id = self.tokenizer("No")["input_ids"][0]
+        rank_model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
+        tokenizer = AutoTokenizer.from_pretrained(model_name)
+        rank_model.to(device)
+        rank_model.eval()

-    def evaluate(self, pairs: list[TextPair], dimension: str) -> list[float]:
-        """
-        Evaluate the text and return a score.
-        """
-        return create_event_loop().run_until_complete(self.async_evaluate(pairs, dimension))
+        softmax = nn.Softmax(dim=1)

-    async def async_evaluate(self, pairs: list[TextPair], dimension: str) -> list[float]:
-        semaphore = asyncio.Semaphore(self.max_concurrent)
-
-        async def evaluate_with_semaphore(pair):
-            async with semaphore:
-                return await self.evaluate_single(pair, dimension)
+        pos_id = tokenizer("Yes")["input_ids"][0]
+        neg_id = tokenizer("No")["input_ids"][0]

        results = []
-        for result in tqdm_async(
-                asyncio.as_completed([evaluate_with_semaphore(pair) for pair in pairs]),
-                total=len(pairs),
-        ):
-            results.append(await result)
-        return results
-
-    async def evaluate_single(self, pair: TextPair, dimension: str) -> float:
-        text = self._add_questions(dimension, pair.question, pair.answer)
-        loop = create_event_loop()
-        return await loop.run_in_executor(None, self._score, text)
-
-    def get_average_score(self, pairs: list[TextPair], dimension: str) -> float:
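+        # final_score = P("Yes") / (P("Yes") + P("No")), read from the first
+        # decoded token (kept from the original UniEval scoring scheme).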
+        with torch.no_grad():
+            for pair in tqdm(pairs):
+                text = _add_questions(dimension, pair.question, pair.answer)
+
+                # T5's forward pass still requires decoder_input_ids, so we
+                # construct a dummy one-token target; its content has no
+                # effect on the final scores.
+                tgt = "No"
+
+                encoded_src = tokenizer(
+                    text,
+                    max_length=max_length,
+                    truncation=True,
+                    padding=True,
+                    return_tensors='pt'
+                )
+                encoded_tgt = tokenizer(
+                    tgt,
+                    max_length=max_length,
+                    truncation=True,
+                    padding=True,
+                    return_tensors='pt'
+                )
+
+                src_tokens = encoded_src['input_ids'].to(device)
+                src_mask = encoded_src['attention_mask'].to(device)
+
+                tgt_tokens = encoded_tgt['input_ids'].to(device)[:, 0].unsqueeze(-1)
+
+                output = rank_model(
+                    input_ids=src_tokens,
+                    attention_mask=src_mask,
+                    labels=tgt_tokens,
+                    use_cache=False
+                )
+
+                logits = output.logits.view(-1, rank_model.config.vocab_size)
+
+                pos_score = softmax(logits)[:, pos_id]  # P("Yes")
+                neg_score = softmax(logits)[:, neg_id]  # P("No")
+                score = pos_score / (pos_score + neg_score)
+
+                results.append(score.item())
+
+        return_dict[rank] = results
+
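+    # Orchestrator: splits the pairs across GPUs and collects per-rank
+    # results through a Manager dict, one pass per dimension.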
+    def evaluate(self, pairs: list[TextPair]) -> list[dict]:
+        final_results = []
+        for dimension in self.dimensions:
+            # Contiguous chunks, one per GPU; the last chunk takes the remainder.
+            chunk_size = len(pairs) // self.num_gpus
+            chunks = []
+            for i in range(self.num_gpus):
+                start = i * chunk_size
+                end = start + chunk_size
+                if i == self.num_gpus - 1:
+                    end = len(pairs)
+                chunks.append(pairs[start:end])
+
+            # One worker process per GPU.
+            manager = mp.Manager()
+            return_dict = manager.dict()
+            processes = []
+
+            for rank, chunk in enumerate(chunks):
+                p = mp.Process(
+                    target=self.process_chunk,
+                    args=(rank, chunk, self.model_name, self.max_length, dimension, return_dict)
+                )
+                p.start()
+                processes.append(p)
+
+            for p in processes:
+                p.join()
+
+            # Merge the per-rank results, preserving the original pair order.
+            results = []
+            for rank in range(len(chunks)):
+                results.extend(return_dict[rank])
+
+            for p in processes:
+                if p.is_alive():
+                    p.terminate()
+                    p.join()
+
+            final_results.append({dimension: results})
+        return final_results
+
+    def get_average_score(self, pairs: list[TextPair]) -> dict:
        """
        Get the average score per dimension for a batch of text pairs.
        """
-        return sum(self.evaluate(pairs, dimension)) / len(pairs)
-
-    def _score(self, text: str) -> float:
-        """
-            Get scores for the given samples.
-            final_score = postive_score / (postive_score + negative_score)
-        """
-
-        # The implementation of "forward" in T5 still requires decoder_input_ids.
-        # Therefore, we construct a random one-word target sequence.
-        # The content of the target has no effect on the final scores.
-
-        tgt = "No"
-
-        with torch.no_grad():
-            encoded_src = self.tokenizer(
-                text,
-                max_length=self.max_length,
-                truncation=True,
-                padding=True,
-                return_tensors='pt'
-            )
-            encoded_tgt = self.tokenizer(
-                tgt,
-                max_length=self.max_length,
-                truncation=True,
-                padding=True,
-                return_tensors='pt'
-            )
-
-            src_tokens = encoded_src['input_ids'].to("cuda")
-            src_mask = encoded_src['attention_mask'].to("cuda")
-
-            tgt_tokens = encoded_tgt['input_ids'].to("cuda")[:, 0].unsqueeze(-1)
-
-            output = self.model(
-                input_ids=src_tokens,
-                attention_mask=src_mask,
-                labels=tgt_tokens
-            )
-
-            logits = output.logits.view(-1, self.model.config.vocab_size)
-
-            pos_score = self.softmax(logits)[:, self.pos_id]  # Yes
-            neg_score = self.softmax(logits)[:, self.neg_id]
-
-            score = pos_score / (pos_score + neg_score)
-
-        return score.item()
-
-    def _add_questions(self, dimension: str, question: str, answer: str):
-        if dimension == "naturalness":
-            cur_input = 'question: Is this a natural response in the dialogue? </s> response: ' + answer
-        elif dimension == "coherence":
-            cur_input = 'question: Is this a coherent response given the dialogue history? </s> response: ' \
-                        + answer + ' </s> dialogue history: ' + question
-        elif dimension == "understandability":
-            cur_input = 'question: Is this an understandable response in the dialogue? </s> response: ' + answer
-        else:
-            raise NotImplementedError(
-                'The input format for this dimension is still undefined. Please customize it first.')
-        return cur_input
+        results = self.evaluate(pairs)
+        final_results = {}
+        for result in results:
+            for key, value in result.items():
+                final_results[key] = sum(value) / len(value)
+        return final_results
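
Example usage (a minimal sketch, not part of the commit: it assumes TextPair can be constructed with `question` and `answer` keyword arguments and that at least one CUDA device is available):

    if __name__ == "__main__":
        # CUDA state does not survive fork(); "spawn" is the safe start
        # method for the per-GPU worker processes launched in evaluate().
        mp.set_start_method("spawn", force=True)

        pairs = [
            TextPair(question="How was your weekend?",
                     answer="Great, I went hiking with some friends."),
        ]
        evaluator = UniEvaluator()
        print(evaluator.get_average_score(pairs))
        # -> {'naturalness': ..., 'coherence': ..., 'understandability': ...}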