|
@@ -9,6 +9,9 @@
9 | 9 | #include <thread>
|
10 | 10 | #include <mutex>
|
11 | 11 | #include <vector>
|
| 12 | +#include <array> |
| 13 | +#include <fstream> |
| 14 | +#include <sstream> |
12 | 15 |
|
13 | 16 | #if defined(_MSC_VER)
|
14 | 17 | #pragma warning(disable: 4244 4267) // possible loss of data
|
@@ -419,9 +422,8 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par
|
419 | 422 | return {tokens, ppl, logit_history, prob_history};
|
420 | 423 | }
|
421 | 424 |
|
422 | | -static std::vector<float> hellaswag_evaluate_tokens(
423 | | -    llama_context * ctx, std::vector<int> & tokens, int n_past, int n_batch, int n_vocab
424 | | -) {
| 425 | +static std::vector<float> evaluate_tokens(llama_context * ctx, std::vector<int> & tokens,
| 426 | +        int n_past, int n_batch, int n_vocab) {
425 | 427 | std::vector<float> result;
|
426 | 428 | result.reserve(tokens.size() * n_vocab);
|
427 | 429 | size_t n_chunk = (tokens.size() + n_batch - 1)/n_batch;
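Note: the `n_chunk` line above is the usual ceiling-division idiom, so a final partial batch is still evaluated rather than dropped. A minimal standalone sketch of the same chunking pattern (toy data, not part of this PR):

```cpp
#include <algorithm>
#include <cstdio>
#include <vector>

int main() {
    std::vector<int> tokens(100);   // stand-in for tokenized input
    const size_t n_batch = 32;
    // same ceiling division as in evaluate_tokens: 100 tokens -> 4 chunks
    const size_t n_chunk = (tokens.size() + n_batch - 1) / n_batch;
    for (size_t i = 0; i < n_chunk; ++i) {
        const size_t begin    = i * n_batch;
        const size_t n_tokens = std::min(n_batch, tokens.size() - begin);
        printf("chunk %zu: tokens [%zu, %zu)\n", i, begin, begin + n_tokens);
    }
    return 0;
}
```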
|
@@ -573,7 +575,7 @@ static void hellaswag_score(llama_context * ctx, const gpt_params & params) {
|
573 | 575 | // clear the KV cache
|
574 | 576 | llama_kv_cache_clear(ctx);
|
575 | 577 |
|
576 | | - auto logits = hellaswag_evaluate_tokens(ctx, query_embd, 0, params.n_batch, n_vocab);
| 578 | + auto logits = evaluate_tokens(ctx, query_embd, 0, params.n_batch, n_vocab);
577 | 579 | if (logits.empty()) {
|
578 | 580 | fprintf(stderr, "%s : failed to eval\n", __func__);
|
579 | 581 | return;
|
@@ -622,7 +624,7 @@ static void hellaswag_score(llama_context * ctx, const gpt_params & params) {
|
622 | 624 | //}
|
623 | 625 |
|
624 | 626 | // Evaluate the query
|
625 | | - logits = hellaswag_evaluate_tokens(ctx, query_embd, context_size, params.n_batch, n_vocab);
| 627 | + logits = evaluate_tokens(ctx, query_embd, context_size, params.n_batch, n_vocab);
626 | 628 | if (logits.empty()) {
|
627 | 629 | fprintf(stderr, "%s : failed to eval\n", __func__);
|
628 | 630 | return;
|
@@ -676,6 +678,235 @@ static void hellaswag_score(llama_context * ctx, const gpt_params & params) {
|
676 | 678 | printf("\n");
|
677 | 679 | }
|
678 | 680 |
|
| 681 | +struct winogrande_entry { |
| 682 | + std::string first; |
| 683 | + std::string second; |
| 684 | + std::array<std::string, 2> choices; |
| 685 | + int answer; |
| 686 | +}; |
| 687 | + |
| 688 | +static std::vector<winogrande_entry> load_winogrande_from_csv(const std::string& prompt) { |
| 689 | + std::vector<winogrande_entry> result; |
| 690 | + std::istringstream in(prompt); |
| 691 | + std::string line; |
| 692 | + std::array<int, 4> comma_pos; |
| 693 | + while (true) { |
| 694 | + std::getline(in, line); |
| 695 | + if (in.fail() || in.eof()) break; |
| 696 | + int ipos = 0; |
| 697 | + bool quote_open = false; |
| 698 | + for (int i = 0; i < int(line.size()); ++i) { |
| 699 | + if (!quote_open) { |
| 700 | + if (line[i] == ',') { |
| 701 | + comma_pos[ipos++] = i; |
| 702 | + if (ipos == 4) break; |
| 703 | + } |
| 704 | + else if (line[i] == '"') { |
| 705 | + quote_open = true; |
| 706 | + } |
| 707 | + } |
| 708 | + else { |
| 709 | + if (line[i] == '"') { |
| 710 | + quote_open = false; |
| 711 | + } |
| 712 | + } |
| 713 | + } |
| 714 | + if (ipos != 4) { |
| 715 | + printf("%s: failed to find comma separators in <%s>\n", __func__, line.c_str()); |
| 716 | + continue; |
| 717 | + } |
| 718 | + auto sentence = line[comma_pos[0]+1] == '"' ? line.substr(comma_pos[0]+2, comma_pos[1] - comma_pos[0] - 3) |
| 719 | + : line.substr(comma_pos[0]+1, comma_pos[1] - comma_pos[0] - 1); |
| 720 | + auto choice1 = line.substr(comma_pos[1]+1, comma_pos[2] - comma_pos[1] - 1); |
| 721 | + auto choice2 = line.substr(comma_pos[2]+1, comma_pos[3] - comma_pos[2] - 1); |
| 722 | + auto answer = line.substr(comma_pos[3]+1, line.size() - comma_pos[3] - 1); |
| 723 | + auto index = line.substr(0, comma_pos[0]); |
| 724 | + int where = 0; |
| 725 | + for ( ; where < int(sentence.size()); ++where) { |
| 726 | + if (sentence[where] == '_') break; |
| 727 | + } |
| 728 | + if (where == int(sentence.size())) { |
| 729 | + printf("%s: no _ in <%s>\n", __func__, sentence.c_str()); |
| 730 | + continue; |
| 731 | + } |
| 732 | + std::istringstream stream(answer.c_str()); |
| 733 | + int i_answer; stream >> i_answer; |
| 734 | + if (stream.fail() || i_answer < 1 || i_answer > 2) { |
| 735 | + printf("%s: failed to parse answer <%s>\n", __func__, answer.c_str()); |
| 736 | + continue; |
| 737 | + } |
| 738 | + result.emplace_back(); |
| 739 | + auto& wg = result.back(); |
| 740 | + wg.first = sentence.substr(0, where); |
| 741 | + wg.second = sentence.substr(where + 1, sentence.size() - where - 1); |
| 742 | + wg.choices[0] = std::move(choice1); |
| 743 | + wg.choices[1] = std::move(choice2); |
| 744 | + wg.answer = i_answer; |
| 745 | + } |
| 746 | + return result; |
| 747 | +} |
| 748 | + |
| 749 | +/* |
| 750 | + * Evaluates the Winogrande score. |
| 751 | + * Uses a CSV containing the task index, sentence, choice 1, choice 2, and answer (1 or 2)
| 752 | + * You can get one such dataset from e.g. https://huggingface.co/datasets/ikawrakow/winogrande-eval-for-llama.cpp |
| 753 | + * As an example, the 1st row in the above dataset is |
| 754 | + * |
| 755 | + * 0,Sarah was a much better surgeon than Maria so _ always got the easier cases.,Sarah,Maria,2 |
| 756 | + * |
| 757 | + */ |
| 758 | +static void winogrande_score(llama_context * ctx, const gpt_params & params) { |
| 759 | + |
| 760 | + constexpr int k_min_trailing_ctx = 3; |
| 761 | + |
| 762 | + auto data = load_winogrande_from_csv(params.prompt); |
| 763 | + if (data.empty()) { |
| 764 | + fprintf(stderr, "%s: no tasks\n", __func__); |
| 765 | + return; |
| 766 | + } |
| 767 | + |
| 768 | + fprintf(stderr, "%s : loaded %zu tasks from prompt.\n", __func__, data.size()); |
| 769 | + |
| 770 | + if (params.winogrande_tasks > 0 && params.winogrande_tasks < data.size()) { |
| 771 | + fprintf(stderr, "%s : selecting %zu random tasks\n", __func__, params.winogrande_tasks); |
| 772 | + std::mt19937 rng(1); |
| 773 | + std::vector<int> aux(data.size()); |
| 774 | + for (int i = 0; i < int(data.size()); ++i) { |
| 775 | + aux[i] = i; |
| 776 | + } |
| 777 | + float scale = 1/(1.f + (float)rng.max()); |
| 778 | + std::vector<winogrande_entry> selected; |
| 779 | + selected.resize(params.winogrande_tasks);
| 780 | + for (int i = 0; i < int(params.winogrande_tasks); ++i) { |
| 781 | + int j = int(scale*rng()*aux.size()); |
| 782 | + selected[i] = std::move(data[aux[j]]); |
| 783 | + aux[j] = aux.back(); |
| 784 | + aux.pop_back(); |
| 785 | + } |
| 786 | + data = std::move(selected); |
| 787 | + } |
| 788 | + |
| 789 | + // Whether to prepend a BOS token, as is usual for LLaMA models
| 790 | + const bool add_bos = llama_should_add_bos_token(llama_get_model(ctx)); |
| 791 | + |
| 792 | + fprintf(stderr, "%s : calculating winogrande score over selected tasks.\n", __func__); |
| 793 | + |
| 794 | + const int n_vocab = llama_n_vocab(llama_get_model(ctx)); |
| 795 | + const int n_ctx = llama_n_ctx(ctx); |
| 796 | + |
| 797 | + std::vector<float> tok_logits(n_vocab); |
| 798 | + |
| 799 | + int n_correct = 0; |
| 800 | + int n_done = 0; |
| 801 | + |
| 802 | + for (size_t task_idx = 0; task_idx < data.size(); task_idx++) { |
| 803 | + const auto& task = data[task_idx]; |
| 804 | + |
| 805 | + auto base_context = ::llama_tokenize(ctx, task.first, add_bos); |
| 806 | + auto base_ctx_1st = ::llama_tokenize(ctx, task.first + task.choices[0], add_bos); |
| 807 | + auto base_ctx_2nd = ::llama_tokenize(ctx, task.first + task.choices[1], add_bos); |
| 808 | + |
| 809 | + auto sentence_1st = task.first + task.choices[0] + task.second; |
| 810 | + auto sentence_2nd = task.first + task.choices[1] + task.second; |
| 811 | + auto query_1st = ::llama_tokenize(ctx, sentence_1st, add_bos); |
| 812 | + auto query_2nd = ::llama_tokenize(ctx, sentence_2nd, add_bos); |
| 813 | + |
| 814 | + if (query_1st.size() > (size_t)n_ctx || query_2nd.size() > (size_t)n_ctx) { |
| 815 | + fprintf(stderr, "%s : number of tokens in queries %zu, %zu > n_ctx\n", __func__, query_1st.size(), query_2nd.size());
| 816 | + return; |
| 817 | + } |
| 818 | + |
| 819 | + auto query_1st_size = query_1st.size(); |
| 820 | + auto query_2nd_size = query_2nd.size(); |
| 821 | + |
| 822 | + // Speed up small evaluations by evaluating at least 32 tokens.
| 823 | + // For Winogrande this seems to slow it down rather than speed it up. |
| 824 | + //if (query_1st.size() < 32) query_1st.resize(32); |
| 825 | + //if (query_2nd.size() < 32) query_2nd.resize(32); |
| 826 | + |
| 827 | + llama_kv_cache_clear(ctx); |
| 828 | + auto logits_1st = evaluate_tokens(ctx, query_1st, 0, params.n_batch, n_vocab); |
| 829 | + |
| 830 | + llama_kv_cache_clear(ctx); |
| 831 | + auto logits_2nd = evaluate_tokens(ctx, query_2nd, 0, params.n_batch, n_vocab); |
| 832 | + |
| 833 | + if (logits_1st.empty() || logits_2nd.empty()) { |
| 834 | + fprintf(stderr, "%s : failed to eval\n", __func__); |
| 835 | + return; |
| 836 | + } |
| 837 | + |
| 838 | + bool skip_choice = query_1st_size - base_ctx_1st.size() > k_min_trailing_ctx && |
| 839 | + query_2nd_size - base_ctx_2nd.size() > k_min_trailing_ctx; |
| 840 | + |
| 841 | + float score_1st = 0; |
| 842 | + bool is_nan_1st = false; |
| 843 | + const auto& base_1 = skip_choice ? base_ctx_1st : base_context; |
| 844 | + const int last_1st = query_1st_size - base_1.size() > 1 ? 1 : 0; |
| 845 | + for (size_t j = base_1.size()-1; j < query_1st_size-1-last_1st; ++j) { |
| 846 | + std::memcpy(tok_logits.data(), logits_1st.data() + j*n_vocab, n_vocab*sizeof(float)); |
| 847 | + const float prob = softmax(tok_logits)[query_1st[j+1]]; |
| 848 | + if (std::isnan(prob) || !prob) { |
| 849 | + fprintf(stderr, "%s: %g probability for token %zu when evaluating <%s>. Base context has %zu tokens\n", __func__, |
| 850 | + prob, j, sentence_1st.c_str(), base_context.size()); |
| 851 | + is_nan_1st = true; |
| 852 | + break; |
| 853 | + } |
| 854 | + score_1st += std::log(prob); |
| 855 | + } |
| 856 | + score_1st /= (query_1st_size - base_1.size() - last_1st); |
| 857 | + |
| 858 | + float score_2nd = 0; |
| 859 | + bool is_nan_2nd = false; |
| 860 | + const auto& base_2 = skip_choice ? base_ctx_2nd : base_context; |
| 861 | + const int last_2nd = query_2nd_size - base_2.size() > 1 ? 1 : 0; |
| 862 | + for (size_t j = base_2.size()-1; j < query_2nd_size-1-last_2nd; ++j) { |
| 863 | + std::memcpy(tok_logits.data(), logits_2nd.data() + j*n_vocab, n_vocab*sizeof(float)); |
| 864 | + const float prob = softmax(tok_logits)[query_2nd[j+1]]; |
| 865 | + if (std::isnan(prob) || !prob) { |
| 866 | + fprintf(stderr, "%s: %g probability for token %zu when evaluating <%s>. Base context has %zu tokens\n", __func__, |
| 867 | + prob, j, sentence_2nd.c_str(), base_context.size()); |
| 868 | + is_nan_2nd = true; |
| 869 | + break; |
| 870 | + } |
| 871 | + score_2nd += std::log(prob); |
| 872 | + } |
| 873 | + score_2nd /= (query_2nd_size - base_2.size() - last_2nd); |
| 874 | + |
| 875 | + if (is_nan_1st || is_nan_2nd) { |
| 876 | + continue; |
| 877 | + } |
| 878 | + |
| 879 | + if (std::isnan(score_1st) || std::isnan(score_2nd)) { |
| 880 | + printf("================== NaN score %g, %g) for:\n", score_1st, score_2nd); |
| 881 | + printf("Q1: <%s> - %zu tokens\n", sentence_1st.c_str(), query_1st_size); |
| 882 | + printf("Q2: <%s> - %zu tokens\n", sentence_2nd.c_str(), query_2nd_size); |
| 883 | + printf("B : <%s> - %zu tokens\n", task.first.c_str(), base_context.size()); |
| 884 | + printf("base_1 has %zu tokens, base_2 has %zu tokens, skip_choice = %d\n", base_1.size(), base_2.size(), skip_choice); |
| 885 | + continue; |
| 886 | + } |
| 887 | + |
| 888 | + int result = score_1st > score_2nd ? 1 : 2; |
| 889 | + |
| 890 | + if (result == task.answer) { |
| 891 | + ++n_correct; |
| 892 | + } |
| 893 | + ++n_done; |
| 894 | + |
| 895 | + // Print the accumulated accuracy mean x 100 |
| 896 | + printf("%zu\t%.4lf\t%10.6f %10.6f %d %d\n",task_idx+1, 100.0 * n_correct/n_done,score_1st,score_2nd,result,task.answer); |
| 897 | + fflush(stdout); |
| 898 | + } |
| 899 | + |
| 900 | + printf("\n"); |
| 901 | + |
| 902 | + if (n_done < 100) return; |
| 903 | + |
| 904 | + const float p = 1.f*n_correct/n_done; |
| 905 | + const float sigma = 100.f*sqrt(p*(1-p)/(n_done-1)); |
| 906 | + printf("Final Winogrande score(%d tasks): %.4lf +/- %.4lf\n", n_done, 100*p, sigma); |
| 907 | +} |
| 908 | + |
| 909 | + |
679 | 910 | int main(int argc, char ** argv) {
|
680 | 911 | gpt_params params;
|
681 | 912 |
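As an aside, the quote-aware comma scan in `load_winogrande_from_csv` (commas inside double quotes do not separate fields) can be illustrated with a stripped-down splitter. This is a simplified sketch with a hypothetical `split_row` helper: it splits every unquoted comma and drops the quotes, while the function above only locates the first four separators. Run on the sample row quoted in the comment above:

```cpp
#include <cstdio>
#include <string>
#include <vector>

// Split a CSV row on commas that lie outside double quotes -- a simplified
// version of the scan in load_winogrande_from_csv.
static std::vector<std::string> split_row(const std::string & line) {
    std::vector<std::string> fields;
    std::string cur;
    bool quote_open = false;
    for (char c : line) {
        if (c == '"') {
            quote_open = !quote_open;          // toggle quoted state, drop the quote
        } else if (c == ',' && !quote_open) {
            fields.push_back(cur); cur.clear(); // field boundary
        } else {
            cur += c;
        }
    }
    fields.push_back(cur);
    return fields;
}

int main() {
    const std::string row = "0,Sarah was a much better surgeon than Maria so _ always got the easier cases.,Sarah,Maria,2";
    for (const auto & field : split_row(row)) {
        printf("<%s>\n", field.c_str()); // index, sentence, choice 1, choice 2, answer
    }
    return 0;
}
```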
|
@@ -733,6 +964,8 @@ int main(int argc, char ** argv) {
|
733 | 964 | struct results_perplexity results;
|
734 | 965 | if (params.hellaswag) {
|
735 | 966 | hellaswag_score(ctx, params);
|
| 967 | + } else if (params.winogrande) { |
| 968 | + winogrande_score(ctx, params); |
736 | 969 | } else {
|
737 | 970 | results = perplexity(ctx, params);
|
738 | 971 | }
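To summarize what the new mode computes: for each task, `winogrande_score` builds both full sentences (`first + choice + second`), evaluates each from a cleared KV cache, and sums log-probabilities over the tokens after the shared prefix. When both choices leave more than `k_min_trailing_ctx = 3` trailing tokens, scoring starts after `first + choice`; otherwise it starts after `first`, so the choice tokens themselves are scored too. The sum is normalized by the number of scored tokens, and the choice with the higher average wins. The final `+/-` figure is the binomial standard error, `100*sqrt(p*(1-p)/(n_done-1))`. (The `params.winogrande` and `params.winogrande_tasks` fields used here are presumably wired to command-line flags elsewhere, not shown in this diff.) A toy sketch of the decision rule, with made-up per-token probabilities standing in for the model's predictions:

```cpp
#include <cmath>
#include <cstdio>
#include <vector>

// Average log-probability of a continuation, as in the scoring loops above.
static double avg_logprob(const std::vector<double> & probs) {
    double sum = 0.0;
    for (double p : probs) sum += std::log(p);
    return sum / probs.size();
}

int main() {
    // Hypothetical probabilities of the scored tokens after substituting
    // choice 1 ("Sarah") and choice 2 ("Maria") into the sample sentence.
    const std::vector<double> trail_1st = {0.10, 0.40, 0.25};
    const std::vector<double> trail_2nd = {0.30, 0.55, 0.35};
    const double score_1st = avg_logprob(trail_1st);
    const double score_2nd = avg_logprob(trail_2nd);
    printf("score_1st = %.4f, score_2nd = %.4f -> predicted answer %d\n",
           score_1st, score_2nd, score_1st > score_2nd ? 1 : 2);
    return 0;
}
```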
|
|