From fb80ccff7d827b8f5600e24d58fd4149821880db Mon Sep 17 00:00:00 2001
From: C0ss4ck <32336251+Cossack9989@users.noreply.github.com>
Date: Mon, 14 Aug 2023 18:56:56 +0800
Subject: [PATCH 1/2] Update: support clone-detection fine-tuning on POJ-104

---
 CodeT5+/tune_codet5p_clone_detection.py | 413 ++++++++++++++++++++++++
 1 file changed, 413 insertions(+)
 create mode 100644 CodeT5+/tune_codet5p_clone_detection.py

diff --git a/CodeT5+/tune_codet5p_clone_detection.py b/CodeT5+/tune_codet5p_clone_detection.py
new file mode 100644
index 0000000..8bc29d4
--- /dev/null
+++ b/CodeT5+/tune_codet5p_clone_detection.py
@@ -0,0 +1,413 @@
+# coding=utf-8
+# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
+# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import absolute_import, division, print_function
+
+import os
+import json
+import random
+import logging
+import argparse
+
+import torch
+import torch.nn as nn
+import numpy as np
+
+from torch.utils.data import DataLoader, Dataset, SequentialSampler, RandomSampler
+
+from transformers import (AdamW, get_linear_schedule_with_warmup,
+                          AutoTokenizer, AutoModel)
+
+logger = logging.getLogger(__name__)
+
+
+class InputFeatures(object):
+    """A single set of training/test features for one example."""
+
+    def __init__(self,
+                 input_tokens,
+                 input_ids,
+                 attention_mask,
+                 index,
+                 label,
+                 ):
+        self.input_tokens = input_tokens
+        self.input_ids = input_ids
+        self.attention_mask = attention_mask
+        self.index = index
+        self.label = label
+
+
+class Model(nn.Module):
+
+    def __init__(self, encoder, tokenizer):
+        super(Model, self).__init__()
+        self.encoder = encoder
+        self.tokenizer = tokenizer
+
+    def forward(self, input_ids, input_mask, p_input_ids, p_input_mask, n_input_ids, n_input_mask, labels=None):
+        # The codet5p-110m-embedding checkpoint already returns one pooled
+        # vector per input, so no masked mean-pooling over hidden states is
+        # needed here.
+        input_embed = self.encoder(input_ids, attention_mask=input_mask)
+        p_embed = self.encoder(p_input_ids, attention_mask=p_input_mask)
+        n_embed = self.encoder(n_input_ids, attention_mask=n_input_mask)
+
+        # L2-normalize so the dot products below are cosine similarities (the
+        # embedding checkpoint emits unit-norm vectors, so this is a no-op
+        # safeguard).
+        input_embed_n = torch.nn.functional.normalize(input_embed, p=2, dim=1)
+        p_embed_n = torch.nn.functional.normalize(p_embed, p=2, dim=1)
+        n_embed_n = torch.nn.functional.normalize(n_embed, p=2, dim=1)
+
+        # Similarity of each anchor to its positive and its sampled negative,
+        # scaled by 20 (i.e., a softmax temperature of 0.05).
+        prob_1 = (input_embed_n * p_embed_n).sum(-1) * 20
+        prob_2 = (input_embed_n * n_embed_n).sum(-1) * 20
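+
+        # Build an in-batch candidate pool: every anchor and positive vector in
+        # the batch also serves as an extra negative for the other anchors.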
+        temp = torch.cat((input_embed_n, p_embed_n), 0)
+        temp_labels = torch.cat((labels, labels), 0)
+        prob_3 = torch.mm(input_embed_n, temp.t()) * 20
+        # Mask out candidates that share the anchor's label so true clones are
+        # never penalized as negatives.
+        mask = labels[:, None] == temp_labels[None, :]
+        prob_3 = prob_3 * (1 - mask.float()) - 1e9 * mask.float()
+
+        # InfoNCE-style objective: the anchor-positive pair should dominate the
+        # softmax over all candidates.
+        prob = torch.softmax(torch.cat((prob_1[:, None], prob_2[:, None], prob_3), -1), -1)
+        loss = torch.log(prob[:, 0] + 1e-10)
+        loss = -loss.mean()
+
+        return loss, input_embed_n
+
+
+def convert_examples_to_features(js: dict, tokenizer, args):
+    """Convert a raw example to token ids and an InputFeatures object."""
+    text_input = tokenizer(js["code"], padding='max_length',
+                           truncation=True, max_length=args.block_size, return_tensors="pt")
+    source_ids = text_input.input_ids[0]
+    source_tokens = tokenizer.convert_ids_to_tokens(text_input.input_ids[0])
+    attention_mask = text_input.attention_mask[0]
+    return InputFeatures(source_tokens, source_ids, attention_mask, js['index'], int(js['label']))
+
+
+class TextDataset(Dataset):
+    def __init__(self, tokenizer, args, file_path=None):
+        self.examples = []
+        data = []
+        with open(file_path) as f:
+            for line in f:
+                line = line.strip()
+                js = json.loads(line)
+                data.append(js)
+        for js in data:
+            self.examples.append(convert_examples_to_features(js, tokenizer, args))
+        if 'train' in file_path:
+            for idx, example in enumerate(self.examples[:3]):
+                logger.info("*** Example ***")
+                logger.info("idx: {}".format(idx))
+                logger.info("label: {}".format(example.label))
+                logger.info("input_ids: {}".format(' '.join(map(str, example.input_ids))))
+        # Group examples by label so positives and negatives can be sampled
+        # quickly in __getitem__.
+        self.label_examples = {}
+        for e in self.examples:
+            if e.label not in self.label_examples:
+                self.label_examples[e.label] = []
+            self.label_examples[e.label].append(e)
+
+    def __len__(self):
+        return len(self.examples)
+
+    def __getitem__(self, i):
+        label = self.examples[i].label
+        index = self.examples[i].index
+        labels = list(self.label_examples)
+        labels.remove(label)
+        # Positive: a different example with the same label.
+        while True:
+            shuffle_example = random.sample(self.label_examples[label], 1)[0]
+            if shuffle_example.index != index:
+                p_example = shuffle_example
+                break
+        # Negative: a random example from a randomly chosen other label.
+        n_example = random.sample(self.label_examples[random.sample(labels, 1)[0]], 1)[0]
+
+        return (
+            self.examples[i].input_ids, self.examples[i].attention_mask,
+            p_example.input_ids, p_example.attention_mask,
+            n_example.input_ids, n_example.attention_mask,
+            torch.tensor(label)
+        )
+
+
+def set_seed(seed=42):
+    random.seed(seed)
+    os.environ['PYTHONHASHSEED'] = str(seed)
+    np.random.seed(seed)
+    torch.manual_seed(seed)
+    torch.cuda.manual_seed(seed)
+    torch.backends.cudnn.deterministic = True
+
+
+def train(args, train_dataset, model, tokenizer):
+    """Train the model."""
+    train_sampler = RandomSampler(train_dataset)
+    train_dataloader = DataLoader(train_dataset, sampler=train_sampler,
+                                  batch_size=args.train_batch_size, num_workers=4, pin_memory=True)
+
+    args.max_steps = args.num_train_epochs * len(train_dataloader)
+
+    # Prepare optimizer and schedule (linear warmup and decay); bias and
+    # LayerNorm parameters are exempt from weight decay.
+    no_decay = ['bias', 'LayerNorm.weight']
+    optimizer_grouped_parameters = [
+        {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
+         'weight_decay': args.weight_decay},
+        {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
+    ]
+    optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
+    scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=int(args.max_steps * 0.1),
+                                                num_training_steps=args.max_steps)
+
+    # Train!
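+    # Each batch from TextDataset is an (anchor, positive, negative) triplet;
+    # Model.forward turns it into the contrastive loss optimized below.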
+ logger.info("***** Running training *****") + logger.info(f" Num examples = {len(train_dataset)}", ) + logger.info(f" Num Epochs = {args.num_train_epochs}", ) + logger.info(f" Instantaneous batch size per GPU = {args.train_batch_size // args.n_gpu}") + logger.info(f" Total train batch size = {args.train_batch_size}") + logger.info(f" Total optimization steps = {args.max_steps}") + + losses, best_map = [], 0 + + model.zero_grad() + for idx in range(args.num_train_epochs): + for step, batch in enumerate(train_dataloader): + inputs_ids = batch[0].to(args.device) + inputs_mask = batch[1].to(args.device) + p_inputs_ids = batch[2].to(args.device) + p_inputs_mask = batch[3].to(args.device) + n_inputs_ids = batch[4].to(args.device) + n_inputs_mask = batch[5].to(args.device) + labels = batch[6].to(args.device) + model.train() + loss, vec = model(inputs_ids, inputs_mask, + p_inputs_ids, p_inputs_mask, + n_inputs_ids, n_inputs_mask, + labels) + + if args.n_gpu > 1: + loss = loss.mean() # mean() to average on multi-gpu parallel training + + loss.backward() + torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm) + + losses.append(loss.item()) + + if (step + 1) % 100 == 0: + logger.info("epoch {} step {} loss {}".format(idx, step + 1, round(np.mean(losses[-100:]), 4))) + + optimizer.step() + optimizer.zero_grad() + scheduler.step() + + results = evaluate(args, model, tokenizer, args.eval_data_file) + for key, value in results.items(): + logger.info(f" {key} = {round(value, 4)}") + + if results['eval_map'] > best_map: + best_map = results['eval_map'] + logger.info(" " + "*" * 20) + logger.info(f" Best map:{round(best_map, 4)}") + logger.info(" " + "*" * 20) + + checkpoint_prefix = 'checkpoint-best-map' + output_dir = os.path.join(args.output_dir, '{}'.format(checkpoint_prefix)) + if not os.path.exists(output_dir): + os.makedirs(output_dir) + output_dir = os.path.join(output_dir, '{}'.format('model.bin')) + model_to_save = model.module if hasattr(model, 'module') else model + torch.save(model_to_save.state_dict(), output_dir) + logger.info(f"Saving model checkpoint to {output_dir}") + + +def evaluate(args, model, tokenizer, data_file): + """ Evaluate the model """ + eval_dataset = TextDataset(tokenizer, args, data_file) + eval_sampler = SequentialSampler(eval_dataset) + eval_dataloader = DataLoader(eval_dataset, sampler=eval_sampler, batch_size=args.eval_batch_size, num_workers=4) + + eval_output_dir = args.output_dir + if not os.path.exists(eval_output_dir): + os.makedirs(eval_output_dir) + + # Eval! 
+ logger.info("***** Running evaluation *****") + logger.info(f" Num examples = {len(eval_dataset)}") + logger.info(f" Batch size = {args.eval_batch_size}") + eval_loss = 0.0 + nb_eval_steps = 0 + model.eval() + vecs = [] + labels = [] + for batch in eval_dataloader: + inputs_ids = batch[0].to(args.device) + inputs_mask = batch[1].to(args.device) + p_inputs_ids = batch[2].to(args.device) + p_inputs_mask = batch[3].to(args.device) + n_inputs_ids = batch[4].to(args.device) + n_inputs_mask = batch[5].to(args.device) + tmp_labels = batch[6].to(args.device) + with torch.no_grad(): + lm_loss, vec = model(inputs_ids, inputs_mask, + p_inputs_ids, p_inputs_mask, + n_inputs_ids, n_inputs_mask, + tmp_labels) + eval_loss += lm_loss.mean().item() + vecs.append(vec.cpu().numpy()) + labels.append(tmp_labels.cpu().numpy()) + nb_eval_steps += 1 + vecs = np.concatenate(vecs, 0) + labels = np.concatenate(labels, 0) + eval_loss = eval_loss / nb_eval_steps + perplexity = torch.tensor(eval_loss) + + scores = np.matmul(vecs, vecs.T) + dic = {} + for i in range(scores.shape[0]): + scores[i, i] = -1000000 + if int(labels[i]) not in dic: + dic[int(labels[i])] = -1 + dic[int(labels[i])] += 1 + sort_ids = np.argsort(scores, axis=-1, kind='quicksort', order=None)[:, ::-1] + MAP = [] + for i in range(scores.shape[0]): + cont = 0 + label = int(labels[i]) + Avep = [] + for j in range(dic[label]): + index = sort_ids[i, j] + if int(labels[index]) == label: + Avep.append((len(Avep) + 1) / (j + 1)) + MAP.append(sum(Avep) / dic[label]) + + result = { + "eval_loss": float(perplexity), + "eval_map": float(np.mean(MAP)) + } + + return result + + +def main(): + parser = argparse.ArgumentParser() + + ## Required parameters + parser.add_argument("--output_dir", default=None, type=str, required=True, + help="The output directory where the model predictions and checkpoints will be written.") + + ## Other parameters + parser.add_argument("--train_data_file", default=None, type=str, + help="The input training data file (a jsonl file).") + parser.add_argument("--eval_data_file", default=None, type=str, + help="An optional input evaluation data file to evaluate the perplexity on (a jsonl file).") + parser.add_argument("--test_data_file", default=None, type=str, + help="An optional input test data file to evaluate the perplexity on (a jsonl file).") + parser.add_argument('--cache', type=str, default='cache') + parser.add_argument("--block_size", default=-1, type=int, + help="Optional input sequence length after tokenization.") + parser.add_argument("--do_train", action='store_true', + help="Whether to run training.") + parser.add_argument("--do_eval", action='store_true', + help="Whether to run eval on the dev set.") + parser.add_argument("--do_test", action='store_true', + help="Whether to run eval on the dev set.") + parser.add_argument("--train_batch_size", default=4, type=int, + help="Batch size per GPU/CPU for training.") + parser.add_argument("--eval_batch_size", default=4, type=int, + help="Batch size per GPU/CPU for evaluation.") + parser.add_argument("--learning_rate", default=5e-5, type=float, + help="The initial learning rate for Adam.") + parser.add_argument("--weight_decay", default=0.0, type=float, + help="Weight deay if we apply some.") + parser.add_argument("--adam_epsilon", default=1e-8, type=float, + help="Epsilon for Adam optimizer.") + parser.add_argument("--max_grad_norm", default=1.0, type=float, + help="Max gradient norm.") + parser.add_argument("--num_train_epochs", default=1, type=int, + help="Total number of 
+    parser.add_argument('--seed', type=int, default=42,
+                        help="random seed for initialization")
+
+    # parse arguments
+    args = parser.parse_args()
+    # set log
+    logging.basicConfig(format='%(asctime)s - %(levelname)s - %(name)s - %(message)s',
+                        datefmt='%m/%d/%Y %H:%M:%S', level=logging.INFO)
+    # set device
+    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+    args.n_gpu = torch.cuda.device_count()
+    args.device = device
+    logger.info(f"device: {device}, n_gpu: {args.n_gpu}")
+
+    # Set seed
+    set_seed(args.seed)
+
+    # build model
+    tokenizer = AutoTokenizer.from_pretrained("Salesforce/codet5p-110m-embedding",
+                                              cache_dir=args.cache, trust_remote_code=True)
+    model = AutoModel.from_pretrained("Salesforce/codet5p-110m-embedding",
+                                      cache_dir=args.cache, trust_remote_code=True)
+
+    model = Model(model, tokenizer)
+    logger.info(f"Training/evaluation parameters {args}")
+
+    model.to(args.device)
+    if args.n_gpu > 1:
+        model = torch.nn.DataParallel(model)
+
+    # Training
+    if args.do_train:
+        train_dataset = TextDataset(tokenizer, args, args.train_data_file)
+        train(args, train_dataset, model, tokenizer)
+
+    # Evaluation on the dev set, using the best checkpoint saved during training
+    if args.do_eval:
+        checkpoint_prefix = 'checkpoint-best-map/model.bin'
+        output_dir = os.path.join(args.output_dir, '{}'.format(checkpoint_prefix))
+        model_to_load = model.module if hasattr(model, 'module') else model
+        model_to_load.load_state_dict(torch.load(output_dir))
+        result = evaluate(args, model, tokenizer, args.eval_data_file)
+        logger.info("***** Eval results *****")
+        for key in sorted(result.keys()):
+            logger.info(f"  {key} = {str(round(result[key] * 100 if 'map' in key else result[key], 2))}")
+
+    # Final evaluation on the test set
+    if args.do_test:
+        checkpoint_prefix = 'checkpoint-best-map/model.bin'
+        output_dir = os.path.join(args.output_dir, '{}'.format(checkpoint_prefix))
+        model_to_load = model.module if hasattr(model, 'module') else model
+        model_to_load.load_state_dict(torch.load(output_dir))
+        result = evaluate(args, model, tokenizer, args.test_data_file)
+        logger.info("***** Test results *****")
+        for key in sorted(result.keys()):
+            logger.info(f"  {key} = {str(round(result[key] * 100 if 'map' in key else result[key], 2))}")
+
+
+if __name__ == "__main__":
+    main()

From 40310585ea76754f1f83acbb7dca05302698b6fa Mon Sep 17 00:00:00 2001
From: C0ss4ck <32336251+Cossack9989@users.noreply.github.com>
Date: Tue, 15 Aug 2023 12:04:31 +0800
Subject: [PATCH 2/2] Update: add description of clone-detection finetuning
 script

---
 CodeT5+/README.md | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/CodeT5+/README.md b/CodeT5+/README.md
index d64662d..cb79f8e 100644
--- a/CodeT5+/README.md
+++ b/CodeT5+/README.md
@@ -149,6 +149,18 @@ To finetune on your own data, you just need to prepare your customized data in t
 Besides, you can specify `--load` to select the specific CodeT5+ model (e.g., `Salesforce/codet5p-220m`) to finetune from.
 To tune the hyperparameter setting that best suits your task, you can customize other finetuning arguments such as `--epochs`, `--lr`, `--lr-warmup-steps`, `--max-source-len`, `--max-target-len`, `--batch-size-per-replica`, `--grad-acc-steps`, etc.
 This script naturally supports both single-GPU and multi-GPU training.
 If you have limited GPU memory and want to improve training throughput, consider specifying `--fp16` to enable mixed-precision training and using [DeepSpeed](https://github.com/microsoft/DeepSpeed) for further optimization by passing a DeepSpeed config file to `--deepspeed` (see [here](https://huggingface.co/docs/transformers/main_classes/deepspeed#zero2-example) for an example config file).
+We also provide an example finetuning script [tune_codet5p_clone_detection.py](https://github.com/salesforce/CodeT5/blob/main/CodeT5%2B/tune_codet5p_clone_detection.py) for CodeT5+ models on the clone detection task, adapted from [UniXcoder](https://github.com/microsoft/CodeBERT/tree/master/UniXcoder/downstream-tasks/clone-detection/POJ-104).
+You can run it to finetune the codet5p-110m-embedding model on POJ-104, where each line of the train/valid jsonl files is a JSON object with `code`, `index`, and `label` fields (preprocessed as in the UniXcoder repository):
+```bash
+python3.10 tune_codet5p_clone_detection.py --output_dir saved_models \
+    --train_data_file /path/to/POJ-104/dataset/train.jsonl \
+    --eval_data_file /path/to/POJ-104/dataset/valid.jsonl \
+    --do_train --num_train_epochs 2 \
+    --block_size 1024 --train_batch_size 8 \
+    --eval_batch_size 8 --learning_rate 1e-5 \
+    --max_grad_norm 1.0 --seed 42
+```
+
 # Reproduce the Results
 ## HumanEval