
Commit c2371b1

#benchmark: add GLUE-QNLI benchmark, including a 10-model inference accuracy comparison (#1865)
1 parent 1a66f9f commit c2371b1

3 files changed: 169 additions & 0 deletions

benchmark/GLUE-QNLI/README.md

Lines changed: 48 additions & 0 deletions
@@ -0,0 +1,48 @@
# GLUE-QNLI

A repository comparing the inference accuracy of MindNLP and Hugging Face Transformers on the GLUE QNLI dataset.

## Dataset

The QNLI (Question Natural Language Inference) dataset is part of the GLUE benchmark. It is converted from the Stanford Question Answering Dataset (SQuAD).

### Getting the Dataset

1. Visit [GLUE Benchmark Tasks](https://gluebenchmark.com/tasks/)
2. Register/Login to download the GLUE data
3. Download and extract the QNLI dataset
4. Place the following files in the `mindnlp/benchmark/GLUE-QNLI/` directory:
   - dev.tsv (Development set)
   - test.tsv (Test set)
   - train.tsv (Training set)

The QNLI task is a binary classification task derived from SQuAD, where the goal is to determine whether a given context sentence contains the answer to a given question.
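Each row of the TSV files carries an index, a question, a context sentence, and a label of either `entailment` or `not_entailment` (these are the columns `model_QNLI.py` reads). The snippet below is a minimal sketch for inspecting the development set, assuming it sits at the script's default path `./QNLI/dev.tsv`; the row shown in the comment is a hypothetical example.

```python
import pandas as pd

# Load the development set the same way model_QNLI.py does.
df = pd.read_csv("./QNLI/dev.tsv", sep="\t", header=0,
                 names=["idx", "question", "sentence", "label"])

# A row looks roughly like this (hypothetical example):
#   idx: 0
#   question: "What is the Grotto at Notre Dame?"
#   sentence: "Immediately behind the basilica is the Grotto, a place of prayer and reflection."
#   label: "entailment"
print(df.head())
print(df["label"].value_counts())
```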
## Quick Start

### Installation

To get started with this project, follow these steps:

1. **Create a conda environment (optional but recommended):**

   ```bash
   conda create -n mindnlp python==3.9
   conda activate mindnlp
   ```

2. **Install the dependencies:**

   Note that MindNLP runs in the Ascend (NPU) environment, while Transformers runs in the GPU environment; the dependencies for each are listed in the requirements file of its respective folder.

   ```bash
   pip install -r requirements.txt
   ```

3. **Usage**

   Once the installation is complete, you can choose from the supported models and run inference. Here's how:

   ```bash
   # Evaluate a specific model using the default dataset (dev.tsv)
   python model_QNLI.py --model albert

   # Evaluate with a custom dataset
   python model_QNLI.py --model bert --data ./QNLI/dev.tsv
   ```

   Supported model options: `albert`, `bert`, `roberta`, `xlm-roberta`, `distilbert`, `t5`, `gpt2`, `llama`, `opt`, `bart`. A programmatic alternative to the CLI is sketched below.
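The same evaluation can also be driven from Python. A minimal sketch, assuming `model_QNLI.py` is importable from the current directory and the dataset is at the script's default path:

```python
from model_QNLI import MODEL_CONFIGS, evaluate_model

# Run the QNLI evaluation for one model; returns accuracy as a percentage.
acc = evaluate_model("bert", "./QNLI/dev.tsv")
print(f"bert: {acc:.2f}%")

# Or sweep every supported model (slow: one full pass over dev.tsv per model).
# for name in MODEL_CONFIGS:
#     evaluate_model(name, "./QNLI/dev.tsv")
```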
## Accuracy Comparison

| Model Name | bart | bert | roberta | xlm-roberta | gpt2 | t5 | distilbert | albert | opt | llama |
|---|---|---|---|---|---|---|---|---|---|---|
| Base Model | facebook/bart-base | google-bert/bert-base-uncased | FacebookAI/roberta-large | FacebookAI/xlm-roberta-large | openai-community/gpt2 | google-t5/t5-small | distilbert/distilbert-base-uncased | albert/albert-base-v2 | facebook/opt-125m | JackFram/llama-160m |
| Fine-tuned Model (HF) | ModelTC/bart-base-qnli | Li/bert-base-uncased-qnli | howey/roberta-large-qnli | tmnam20/xlm-roberta-large-qnli-1 | tanganke/gpt2_qnli | lightsout19/t5-small-qnli | anirudh21/distilbert-base-uncased-finetuned-qnli | orafandina/albert-base-v2-finetuned-qnli | utahnlp/qnli_facebook_opt-125m_seed-1 | Cheng98/llama-160m-qnli |
| Transformers accuracy (GPU) | 92.29 | 67.43 | 94.50 | 92.50 | 88.15 | 89.71 | 59.21 | 55.14 | 86.10 | 50.97 |
| MindNLP accuracy (NPU) | 92.29 | 67.43 | 94.51 | 92.50 | 88.15 | 89.71 | 59.23 | 55.13 | 86.10 | 50.97 |

benchmark/GLUE-QNLI/model_QNLI.py

Lines changed: 116 additions & 0 deletions
@@ -0,0 +1,116 @@
import argparse

import pandas as pd
from tqdm import tqdm

from mindnlp.transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification
)

# Fine-tuned QNLI checkpoint and matching base tokenizer for each supported model type.
MODEL_CONFIGS = {
    "albert": {
        "model_name": "orafandina/albert-base-v2-finetuned-qnli",
        "tokenizer_name": "albert/albert-base-v2"
    },
    "bert": {
        "model_name": "Li/bert-base-uncased-qnli",
        "tokenizer_name": "google-bert/bert-base-uncased"
    },
    "roberta": {
        "model_name": "howey/roberta-large-qnli",
        "tokenizer_name": "FacebookAI/roberta-large"
    },
    "xlm-roberta": {
        "model_name": "tmnam20/xlm-roberta-large-qnli-1",
        "tokenizer_name": "FacebookAI/xlm-roberta-large"
    },
    "distilbert": {
        "model_name": "anirudh21/distilbert-base-uncased-finetuned-qnli",
        "tokenizer_name": "distilbert/distilbert-base-uncased"
    },
    "t5": {
        "model_name": "lightsout19/t5-small-qnli",
        "tokenizer_name": "google-t5/t5-small"
    },
    "gpt2": {
        "model_name": "tanganke/gpt2_qnli",
        "tokenizer_name": "openai-community/gpt2"
    },
    "llama": {
        "model_name": "Cheng98/llama-160m-qnli",
        "tokenizer_name": "JackFram/llama-160m"
    },
"opt": {
43+
"model_name": "facebook/opt-125m",
44+
"tokenizer_name": "utahnlp/qnli_facebook_opt-125m_seed-1"
45+
},
46+
"bart": {
47+
"model_name": "facebook/bart-large-qnli",
48+
"tokenizer_name": "ModelTC/bart-base-qnli"
49+
}
50+
}

def get_model_and_tokenizer(model_type):
    """Load the fine-tuned model and matching tokenizer for the given model type."""
    if model_type not in MODEL_CONFIGS:
        raise ValueError(f"Unsupported model type: {model_type}")

    config = MODEL_CONFIGS[model_type]
    tokenizer = AutoTokenizer.from_pretrained(config["tokenizer_name"])
    model = AutoModelForSequenceClassification.from_pretrained(config["model_name"], num_labels=2)

    return model, tokenizer

def predict_qnli(model, tokenizer, question, sentence):
    """Predict the QNLI label for one question/sentence pair (0 = entailment, 1 = not_entailment)."""
    inputs = tokenizer(question, sentence, return_tensors="ms", truncation=True, max_length=512)
    outputs = model(**inputs)
    logits = outputs.logits
    return logits.argmax(axis=1).asnumpy()[0]

def evaluate_model(model_type, data_path):
    """Evaluate a model on the QNLI dataset and return its accuracy (in percent)."""
    print(f"Evaluating model: {model_type}")

    # Load model and tokenizer
    model, tokenizer = get_model_and_tokenizer(model_type)
    print(f"Model type: {model.config.model_type}")

    # Load the data
    df = pd.read_csv(data_path, sep='\t', header=0, names=['idx', 'question', 'sentence', 'label'])
    df = df.dropna(subset=['label'])

    # Map string labels to integer ids
    label_map = {'entailment': 0, 'not_entailment': 1}
    valid_data = df[df['label'].isin(label_map.keys())]

    questions = valid_data['question'].tolist()
    sentences = valid_data['sentence'].tolist()
    labels = [label_map[label] for label in valid_data['label']]

    # Predict and count correct labels
    predict_true = 0
    for question, sentence, true_label in tqdm(zip(questions, sentences, labels),
                                               total=len(questions),
                                               desc="Predicting"):
        pred_label = predict_qnli(model, tokenizer, question, sentence)
        if pred_label == true_label:
            predict_true += 1

    # Report the results
    accuracy = float(predict_true / len(questions) * 100)
    print(f"Total evaluation samples: {len(questions)}")
    print(f"Correct predictions: {predict_true}")
    print(f"Accuracy: {accuracy:.2f}%")

    return accuracy

if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='QNLI task evaluation script')
    parser.add_argument('--model', type=str, required=True,
                        choices=list(MODEL_CONFIGS.keys()),
                        help='Model type to evaluate')
    parser.add_argument('--data', type=str, default='./QNLI/dev.tsv',
                        help='Path to the dataset')

    args = parser.parse_args()
    evaluate_model(args.model, args.data)

benchmark/GLUE-QNLI/requirements.txt

Lines changed: 5 additions & 0 deletions
@@ -0,0 +1,5 @@
mindspore==2.3.1
mindnlp==0.4.1
tqdm
pandas
numpy==1.26.4
