
Commit 682986a

Authored by Kawrakow (co-authored by Iwan Kawrakow)
Add Winogrande evaluation (#5015)
* winogrande: simple implementation

  It doesn't look like it is working - why? For Mistral-7B it is barely better than random chance (score ~60% for 1267 tasks), while I see Mistral-7B scoring 78.4% on the HF leaderboard. The 1-sigma statistical uncertainty for 1267 tasks is ~1.4, so there is no way the difference is due to statistics.

* winogrande: somewhat better

  The score for Mistral-7B is now 68.9 on the validation set of winogrande_debiased. Still far from the reported 78.4, but better than what I had before.

* winogrande: improving

  The Mistral-7B score is now 73.56. Still not quite 78.4, but getting there. We are also getting a lower score on HellaSwag compared to the HF leaderboard, so I'm not expecting we will get up to 78.4 anyway.

  It looks like it is better to skip the choice word(s) when evaluating the average log-likelihood. This kind of makes sense because a more common word (in Winogrande this is often a name) will have a higher probability without knowing about the follow-up context, and this will skew the log-likelihood towards the more common word. We can only do this if the choice words are not last in the sentence. It also looks like it is better to skip the punctuation at the end of the sentence, provided the choice words are not last.

* winogrande: add dataset instructions

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
1 parent dcad445 commit 682986a
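
For context on the scoring rule the commit message converges on, here is a minimal, hypothetical sketch (not the patch itself; average_ending_logprob and its inputs are illustrative): each candidate sentence is scored by the average log-likelihood of the tokens that come after the choice word(s), so a choice word that is merely more common (often a name) gets no unfair head start.

    // Hypothetical sketch of the scoring rule described in the commit message.
    // log_probs[j] is assumed to already hold log P(token[j+1] | token[0..j]),
    // i.e. what the patch obtains via a softmax over the logits at position j.
    #include <cstddef>
    #include <vector>

    static double average_ending_logprob(const std::vector<double> & log_probs,
                                         size_t n_prefix) { // tokens up to and including the choice word(s); must be >= 1
        double sum = 0.0;
        size_t n   = 0;
        // Only the ending after the choice word(s) contributes to the score.
        for (size_t j = n_prefix - 1; j < log_probs.size(); ++j) {
            sum += log_probs[j];
            ++n;
        }
        return n ? sum / n : 0.0;
    }

The candidate sentence (choice 1 vs choice 2) with the higher average wins the task; as noted above, skipping the choice word(s) is only possible when they are not the last word of the sentence.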

File tree: 3 files changed, +251 -5 lines


common/common.cpp

Lines changed: 10 additions & 0 deletions
@@ -681,6 +681,14 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
                 break;
             }
             params.hellaswag_tasks = std::stoi(argv[i]);
+        } else if (arg == "--winogrande") {
+            params.winogrande = true;
+        } else if (arg == "--winogrande-tasks") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            params.winogrande_tasks = std::stoi(argv[i]);
         } else if (arg == "--ignore-eos") {
             params.ignore_eos = true;
         } else if (arg == "--no-penalize-nl") {
@@ -926,6 +934,8 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
     printf("  --logits-all          return logits for all tokens in the batch (default: disabled)\n");
     printf("  --hellaswag           compute HellaSwag score over random tasks from datafile supplied with -f\n");
     printf("  --hellaswag-tasks N   number of tasks to use when computing the HellaSwag score (default: %zu)\n", params.hellaswag_tasks);
+    printf("  --winogrande          compute Winogrande score over random tasks from datafile supplied with -f\n");
+    printf("  --winogrande-tasks N  number of tasks to use when computing the Winogrande score (default: %zu)\n", params.winogrande_tasks);
     printf("  --keep N              number of tokens to keep from the initial prompt (default: %d, -1 = all)\n", params.n_keep);
     printf("  --draft N             number of tokens to draft for speculative decoding (default: %d)\n", params.n_draft);
     printf("  --chunks N            max number of chunks to process (default: %d, -1 = all)\n", params.n_chunks);

common/common.h

Lines changed: 3 additions & 0 deletions
@@ -105,6 +105,9 @@ struct gpt_params {
     bool hellaswag          = false; // compute HellaSwag score over random tasks from datafile supplied in prompt
     size_t hellaswag_tasks  = 400;   // number of tasks to use when computing the HellaSwag score
 
+    bool winogrande         = false; // compute Winogrande score over random tasks from datafile supplied in prompt
+    size_t winogrande_tasks = 0;     // number of tasks to use when computing the Winogrande score. If 0, all tasks will be computed
+
     bool mul_mat_q          = true;  // if true, use mul_mat_q kernels instead of cuBLAS
     bool random_prompt      = false; // do not randomize prompt if none provided
     bool use_color          = false; // use color to distinguish generations and inputs

examples/perplexity/perplexity.cpp

Lines changed: 238 additions & 5 deletions
@@ -9,6 +9,9 @@
 #include <thread>
 #include <mutex>
 #include <vector>
+#include <array>
+#include <fstream>
+#include <sstream>
 
 #if defined(_MSC_VER)
 #pragma warning(disable: 4244 4267) // possible loss of data
@@ -419,9 +422,8 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par
     return {tokens, ppl, logit_history, prob_history};
 }
 
-static std::vector<float> hellaswag_evaluate_tokens(
-    llama_context * ctx, std::vector<int> & tokens, int n_past, int n_batch, int n_vocab
-) {
+static std::vector<float> evaluate_tokens(llama_context * ctx, std::vector<int> & tokens,
+    int n_past, int n_batch, int n_vocab) {
     std::vector<float> result;
     result.reserve(tokens.size() * n_vocab);
     size_t n_chunk = (tokens.size() + n_batch - 1)/n_batch;
@@ -573,7 +575,7 @@ static void hellaswag_score(llama_context * ctx, const gpt_params & params) {
         // clear the KV cache
         llama_kv_cache_clear(ctx);
 
-        auto logits = hellaswag_evaluate_tokens(ctx, query_embd, 0, params.n_batch, n_vocab);
+        auto logits = evaluate_tokens(ctx, query_embd, 0, params.n_batch, n_vocab);
         if (logits.empty()) {
             fprintf(stderr, "%s : failed to eval\n", __func__);
             return;
@@ -622,7 +624,7 @@ static void hellaswag_score(llama_context * ctx, const gpt_params & params) {
             //}
 
             // Evaluate the query
-            logits = hellaswag_evaluate_tokens(ctx, query_embd, context_size, params.n_batch, n_vocab);
+            logits = evaluate_tokens(ctx, query_embd, context_size, params.n_batch, n_vocab);
             if (logits.empty()) {
                 fprintf(stderr, "%s : failed to eval\n", __func__);
                 return;
@@ -676,6 +678,235 @@ static void hellaswag_score(llama_context * ctx, const gpt_params & params) {
     printf("\n");
 }
 
+struct winogrande_entry {
+    std::string first;
+    std::string second;
+    std::array<std::string, 2> choices;
+    int answer;
+};
+
+static std::vector<winogrande_entry> load_winogrande_from_csv(const std::string& prompt) {
+    std::vector<winogrande_entry> result;
+    std::istringstream in(prompt);
+    std::string line;
+    std::array<int, 4> comma_pos;
+    while (true) {
+        std::getline(in, line);
+        if (in.fail() || in.eof()) break;
+        int ipos = 0;
+        bool quote_open = false;
+        for (int i = 0; i < int(line.size()); ++i) {
+            if (!quote_open) {
+                if (line[i] == ',') {
+                    comma_pos[ipos++] = i;
+                    if (ipos == 4) break;
+                }
+                else if (line[i] == '"') {
+                    quote_open = true;
+                }
+            }
+            else {
+                if (line[i] == '"') {
+                    quote_open = false;
+                }
+            }
+        }
+        if (ipos != 4) {
+            printf("%s: failed to find comma separators in <%s>\n", __func__, line.c_str());
+            continue;
+        }
+        auto sentence = line[comma_pos[0]+1] == '"' ? line.substr(comma_pos[0]+2, comma_pos[1] - comma_pos[0] - 3)
+                                                    : line.substr(comma_pos[0]+1, comma_pos[1] - comma_pos[0] - 1);
+        auto choice1 = line.substr(comma_pos[1]+1, comma_pos[2] - comma_pos[1] - 1);
+        auto choice2 = line.substr(comma_pos[2]+1, comma_pos[3] - comma_pos[2] - 1);
+        auto answer  = line.substr(comma_pos[3]+1, line.size() - comma_pos[3] - 1);
+        auto index   = line.substr(0, comma_pos[0]); // task index (currently unused)
+        int where = 0;
+        for ( ; where < int(sentence.size()); ++where) {
+            if (sentence[where] == '_') break;
+        }
+        if (where == int(sentence.size())) {
+            printf("%s: no _ in <%s>\n", __func__, sentence.c_str());
+            continue;
+        }
+        std::istringstream stream(answer.c_str());
+        int i_answer; stream >> i_answer;
+        if (stream.fail() || i_answer < 1 || i_answer > 2) {
+            printf("%s: failed to parse answer <%s>\n", __func__, answer.c_str());
+            continue;
+        }
+        result.emplace_back();
+        auto& wg = result.back();
+        wg.first = sentence.substr(0, where);
+        wg.second = sentence.substr(where + 1, sentence.size() - where - 1);
+        wg.choices[0] = std::move(choice1);
+        wg.choices[1] = std::move(choice2);
+        wg.answer = i_answer;
+    }
+    return result;
+}
+
+/*
+ * Evaluates the Winogrande score.
+ * Uses a CSV containing task index, sentence, choice 1, choice 2, answer (1 or 2)
+ * You can get one such dataset from e.g. https://huggingface.co/datasets/ikawrakow/winogrande-eval-for-llama.cpp
+ * As an example, the 1st row in the above dataset is
+ *
+ *   0,Sarah was a much better surgeon than Maria so _ always got the easier cases.,Sarah,Maria,2
+ *
+ */
+static void winogrande_score(llama_context * ctx, const gpt_params & params) {
+
+    constexpr int k_min_trailing_ctx = 3;
+
+    auto data = load_winogrande_from_csv(params.prompt);
+    if (data.empty()) {
+        fprintf(stderr, "%s: no tasks\n", __func__);
+        return;
+    }
+
+    fprintf(stderr, "%s : loaded %zu tasks from prompt.\n", __func__, data.size());
+
+    if (params.winogrande_tasks > 0 && params.winogrande_tasks < data.size()) {
+        fprintf(stderr, "%s : selecting %zu random tasks\n", __func__, params.winogrande_tasks);
+        std::mt19937 rng(1);
+        std::vector<int> aux(data.size());
+        for (int i = 0; i < int(data.size()); ++i) {
+            aux[i] = i;
+        }
+        float scale = 1/(1.f + (float)rng.max());
+        std::vector<winogrande_entry> selected;
+        selected.resize(params.winogrande_tasks); // resize (not just reserve): selected[i] below must refer to real elements
+        for (int i = 0; i < int(params.winogrande_tasks); ++i) {
+            int j = int(scale*rng()*aux.size());
+            selected[i] = std::move(data[aux[j]]);
+            aux[j] = aux.back();
+            aux.pop_back();
+        }
+        data = std::move(selected);
+    }
+
+    // This is needed as usual for LLaMA models
+    const bool add_bos = llama_should_add_bos_token(llama_get_model(ctx));
+
+    fprintf(stderr, "%s : calculating winogrande score over selected tasks.\n", __func__);
+
+    const int n_vocab = llama_n_vocab(llama_get_model(ctx));
+    const int n_ctx   = llama_n_ctx(ctx);
+
+    std::vector<float> tok_logits(n_vocab);
+
+    int n_correct = 0;
+    int n_done    = 0;
+
+    for (size_t task_idx = 0; task_idx < data.size(); task_idx++) {
+        const auto& task = data[task_idx];
+
+        auto base_context = ::llama_tokenize(ctx, task.first, add_bos);
+        auto base_ctx_1st = ::llama_tokenize(ctx, task.first + task.choices[0], add_bos);
+        auto base_ctx_2nd = ::llama_tokenize(ctx, task.first + task.choices[1], add_bos);
+
+        auto sentence_1st = task.first + task.choices[0] + task.second;
+        auto sentence_2nd = task.first + task.choices[1] + task.second;
+        auto query_1st = ::llama_tokenize(ctx, sentence_1st, add_bos);
+        auto query_2nd = ::llama_tokenize(ctx, sentence_2nd, add_bos);
+
+        if (query_1st.size() > (size_t)n_ctx || query_2nd.size() > (size_t)n_ctx) {
+            fprintf(stderr, "%s : number of tokens in queries %zu, %zu > n_ctx\n", __func__, query_1st.size(), query_2nd.size());
+            return;
+        }
+
+        auto query_1st_size = query_1st.size();
+        auto query_2nd_size = query_2nd.size();
+
+        // Speedup small evaluations by evaluating at least 32 tokens
+        // For Winogrande this seems to slow it down rather than speed it up.
+        //if (query_1st.size() < 32) query_1st.resize(32);
+        //if (query_2nd.size() < 32) query_2nd.resize(32);
+
+        llama_kv_cache_clear(ctx);
+        auto logits_1st = evaluate_tokens(ctx, query_1st, 0, params.n_batch, n_vocab);
+
+        llama_kv_cache_clear(ctx);
+        auto logits_2nd = evaluate_tokens(ctx, query_2nd, 0, params.n_batch, n_vocab);
+
+        if (logits_1st.empty() || logits_2nd.empty()) {
+            fprintf(stderr, "%s : failed to eval\n", __func__);
+            return;
+        }
+
+        // Skipping the choice word(s) in the average log-likelihood works better (see the commit message),
+        // but is only done when enough trailing context remains after them.
+        bool skip_choice = query_1st_size - base_ctx_1st.size() > k_min_trailing_ctx &&
+                           query_2nd_size - base_ctx_2nd.size() > k_min_trailing_ctx;
+
+        float score_1st = 0;
+        bool is_nan_1st = false;
+        const auto& base_1 = skip_choice ? base_ctx_1st : base_context;
+        const int last_1st = query_1st_size - base_1.size() > 1 ? 1 : 0;
+        for (size_t j = base_1.size()-1; j < query_1st_size-1-last_1st; ++j) {
+            std::memcpy(tok_logits.data(), logits_1st.data() + j*n_vocab, n_vocab*sizeof(float));
+            const float prob = softmax(tok_logits)[query_1st[j+1]];
+            if (std::isnan(prob) || !prob) {
+                fprintf(stderr, "%s: %g probability for token %zu when evaluating <%s>. Base context has %zu tokens\n", __func__,
+                        prob, j, sentence_1st.c_str(), base_context.size());
+                is_nan_1st = true;
+                break;
+            }
+            score_1st += std::log(prob);
+        }
+        score_1st /= (query_1st_size - base_1.size() - last_1st);
+
+        float score_2nd = 0;
+        bool is_nan_2nd = false;
+        const auto& base_2 = skip_choice ? base_ctx_2nd : base_context;
+        const int last_2nd = query_2nd_size - base_2.size() > 1 ? 1 : 0;
+        for (size_t j = base_2.size()-1; j < query_2nd_size-1-last_2nd; ++j) {
+            std::memcpy(tok_logits.data(), logits_2nd.data() + j*n_vocab, n_vocab*sizeof(float));
+            const float prob = softmax(tok_logits)[query_2nd[j+1]];
+            if (std::isnan(prob) || !prob) {
+                fprintf(stderr, "%s: %g probability for token %zu when evaluating <%s>. Base context has %zu tokens\n", __func__,
+                        prob, j, sentence_2nd.c_str(), base_context.size());
+                is_nan_2nd = true;
+                break;
+            }
+            score_2nd += std::log(prob);
+        }
+        score_2nd /= (query_2nd_size - base_2.size() - last_2nd);
+
+        if (is_nan_1st || is_nan_2nd) {
+            continue;
+        }
+
+        if (std::isnan(score_1st) || std::isnan(score_2nd)) {
+            printf("================== NaN score (%g, %g) for:\n", score_1st, score_2nd);
+            printf("Q1: <%s> - %zu tokens\n", sentence_1st.c_str(), query_1st_size);
+            printf("Q2: <%s> - %zu tokens\n", sentence_2nd.c_str(), query_2nd_size);
+            printf("B : <%s> - %zu tokens\n", task.first.c_str(), base_context.size());
+            printf("base_1 has %zu tokens, base_2 has %zu tokens, skip_choice = %d\n", base_1.size(), base_2.size(), skip_choice);
+            continue;
+        }
+
+        int result = score_1st > score_2nd ? 1 : 2;
+
+        if (result == task.answer) {
+            ++n_correct;
+        }
+        ++n_done;
+
+        // Print the accumulated accuracy mean x 100
+        printf("%zu\t%.4lf\t%10.6f %10.6f %d %d\n", task_idx+1, 100.0 * n_correct/n_done, score_1st, score_2nd, result, task.answer);
+        fflush(stdout);
+    }
+
+    printf("\n");
+
+    if (n_done < 100) return;
+
+    const float p = 1.f*n_correct/n_done;
+    const float sigma = 100.f*sqrt(p*(1-p)/(n_done-1));
+    printf("Final Winogrande score(%d tasks): %.4lf +/- %.4lf\n", n_done, 100*p, sigma);
+}
+
+
 int main(int argc, char ** argv) {
     gpt_params params;
 
@@ -733,6 +964,8 @@ int main(int argc, char ** argv) {
     struct results_perplexity results;
     if (params.hellaswag) {
        hellaswag_score(ctx, params);
+    } else if (params.winogrande) {
+        winogrande_score(ctx, params);
     } else {
         results = perplexity(ctx, params);
     }
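
The final printout treats each task as an independent Bernoulli trial, so the reported uncertainty is sigma = 100 * sqrt(p*(1-p)/(n-1)). As a quick standalone check of the "~1.4 for 1267 tasks" figure from the commit message (a small sketch reusing the same formula as winogrande_score above; the numbers plugged in are the ones quoted there):

    // Standalone check of the 1-sigma uncertainty printed by winogrande_score().
    #include <cmath>
    #include <cstdio>

    int main() {
        const int   n_done = 1267;  // number of Winogrande validation tasks
        const float p      = 0.60f; // the "barely better than random chance" score
        const float sigma  = 100.f*std::sqrt(p*(1 - p)/(n_done - 1));
        std::printf("1-sigma uncertainty: %.2f\n", sigma); // prints ~1.38, consistent with the ~1.4 quoted above
        return 0;
    }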
