#fix benchmark GLUE-QNLI: fix read_csv error and predict function, and modify README description (#1868)

xuhangscut · web-flow · commit 77e97d490f20 · 2024-12-16T22:21:19.000+08:00
diff --git a/benchmark/GLUE-QNLI/README.md b/benchmark/GLUE-QNLI/README.md
@@ -32,14 +32,18 @@ Please note that mindnlp is in the Ascend environment, while transformers is in
 Once the installation is complete, you can choose use differnet models to start inference. Here's how to run the inference:
    ```bash
    # Evaluate specific model using default dataset (dev.tsv)
-   python model_QNLI.py --model albert
+   python model_QNLI.py --model bart
 
    # Evaluate with custom dataset
-   python model_QNLI.py --model bert --data ./QNLI/dev.tsv
+   python model_QNLI.py --model bart --data ./QNLI/test.tsv
    ```
-   Supported model options: `albert`, `bert`, `roberta`, `xlm-roberta`, `distilbert`, `t5`, `gpt2`, `llama`, `opt`, `bart`
+   Supported model options: `bart`, `bert`, `roberta`, `xlm-roberta`, `gpt2`, `t5`, `distilbert`, `albert`, `llama`, `opt`
 
 ## Accuracy Comparsion
+The accuracy of our reproduced models on QNLI/dev.tsv is reported below.
+Experiments were run on Ascend 910* with MindSpore 2.3.1 in graph mode.
+All fine-tuned models are derived from open-source models provided by Hugging Face.
+
 |  Model Name | bart | bert | roberta | xlm-roberta | gpt2 | t5 | distilbert | albert | opt | llama |
 |---|---|---|---|---|---|---|---|---|---|---|
 |  Base Model  | facebook/bart-base  |  google-bert/bert-base-uncased | FacebookAI/roberta-large | FacebookAI/xlm-roberta-large |  openai-community/gpt2 |  google-t5/t5-small |  distilbert/distilbert-base-uncased | albert/albert-base-v2  | facebook/opt-125m  | JackFram/llama-160m  |
diff --git a/benchmark/GLUE-QNLI/model_QNLI.py b/benchmark/GLUE-QNLI/model_QNLI.py
@@ -7,9 +7,9 @@
 import argparse
 
 MODEL_CONFIGS = {
-    "albert": {
-        "model_name": "orafandina/albert-base-v2-finetuned-qnli",
-        "tokenizer_name": "albert/albert-base-v2"
+    "bart": {
+        "model_name": "ModelTC/bart-base-qnli",
+        "tokenizer_name": "facebook/bart-base"
     },
     "bert": {
         "model_name": "Li/bert-base-uncased-qnli",
@@ -23,30 +23,34 @@
         "model_name": "tmnam20/xlm-roberta-large-qnli-1",
         "tokenizer_name": "FacebookAI/xlm-roberta-large"
     },
-    "distilbert": {
-        "model_name": "anirudh21/distilbert-base-uncased-finetuned-qnli",
-        "tokenizer_name": "distilbert/distilbert-base-uncased"
+    "gpt2": {
+        "model_name": "tanganke/gpt2_qnli",
+        "tokenizer_name": "openai-community/gpt2"
     },
     "t5": {
         "model_name": "lightsout19/t5-small-qnli",
         "tokenizer_name": "google-t5/t5-small"
     },
-    "gpt2": {
-        "model_name": "tanganke/gpt2_qnli",
-        "tokenizer_name": "openai-community/gpt2"
+    "distilbert": {
+        "model_name": "anirudh21/distilbert-base-uncased-finetuned-qnli",
+        "tokenizer_name": "distilbert/distilbert-base-uncased"
     },
     "llama": {
         "model_name": "Cheng98/llama-160m-qnli",
         "tokenizer_name": "JackFram/llama-160m"
     },
+    "albert": {
+        "model_name": "orafandina/albert-base-v2-finetuned-qnli",
+        "tokenizer_name": "albert/albert-base-v2"
+    },
     "opt": {
-        "model_name": "facebook/opt-125m",
-        "tokenizer_name": "utahnlp/qnli_facebook_opt-125m_seed-1"
+        "model_name": "utahnlp/qnli_facebook_opt-125m_seed-1",
+        "tokenizer_name": "facebook/opt-125m"
+    },
+    "llama": {
+        "model_name": "Cheng98/llama-160m-qnli",
+        "tokenizer_name": "JackFram/llama-160m"
     },
-    "bart": {
-        "model_name": "facebook/bart-large-qnli",
-        "tokenizer_name": "ModelTC/bart-base-qnli"
-    }
 }
 
 def get_model_and_tokenizer(model_type):
@@ -63,31 +67,29 @@ def get_model_and_tokenizer(model_type):
 def predict_qnli(model, tokenizer, question, sentence):
     """预测QNLI任务"""
     inputs = tokenizer(question, sentence, return_tensors="ms", truncation=True, max_length=512)
-    outputs = model(**inputs)
+    input_ids = inputs["input_ids"]
+    attention_mask = inputs["attention_mask"]
+    outputs = model(input_ids=input_ids, attention_mask=attention_mask)
     logits = outputs.logits
     return logits.argmax(axis=1).asnumpy()[0]
 
 def evaluate_model(model_type, data_path):
     """评估模型在QNLI数据集上的表现"""
     print(f"正在评估模型: {model_type}")
     
-    # 加载模型和分词器
     model, tokenizer = get_model_and_tokenizer(model_type)
     print(f"模型类型: {model.config.model_type}")
     
-    # 加载数据
-    df = pd.read_csv(data_path, sep='\t', header=0, names=['idx', 'question', 'sentence', 'label'])
+    df = pd.read_csv(data_path, sep='\t', header=0, names=['idx', 'question', 'sentence', 'label'], on_bad_lines='skip')
     df = df.dropna(subset=['label'])
     
-    # 标签映射
     label_map = {'entailment': 0, 'not_entailment': 1}
     valid_data = df[df['label'].isin(label_map.keys())]
     
     questions = valid_data['question'].tolist()
     sentences = valid_data['sentence'].tolist()
     labels = [label_map[label] for label in valid_data['label']]
     
-    # 预测和评估
     predict_true = 0
     for question, sentence, true_label in tqdm(zip(questions, sentences, labels), 
                                              total=len(questions), 
@@ -96,7 +98,6 @@ def evaluate_model(model_type, data_path):
         if pred_label == true_label:
             predict_true += 1
     
-    # 输出结果
     accuracy = float(predict_true / len(questions) * 100)
     print(f"测试集总样本数: {len(questions)}")
     print(f"预测正确的数量: {predict_true}")