How to implement Infinite Context and Attention? #1021
calebnwokocha asked this question in Q&A (unanswered, 0 replies)
Could anyone who understands the method in this paper (https://arxiv.org/abs/2404.07143) help me implement it in my code below for the GPT-2 774M model? I think I am close to a full implementation, but there are a few things I am still missing.
```cpp
#include "ggml.h"
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#if defined(_MSC_VER)
#pragma warning(disable: 4244 4267) // possible loss of data
#endif
// default hparams (GPT-2 774M)
struct gpt_hparams {
    int32_t n_vocab = 50257; // Vocabulary size remains the same
    int32_t n_embd  = 1024;  // Embedding dimensionality
    int32_t n_head  = 16;    // Number of attention heads
    int32_t n_layer = 24;    // Number of transformer layers
    int32_t ftype   = 1;     // Set to 1 for FP16 precision (optional)
    float   eps     = 1e-5f; // Small constant for numerical stability
};
struct gpt_vocab {
    using id    = int32_t;
    using token = std::string;

    // token <-> id maps (id_to_token is used later in gpt_sample_top_k_top_p)
    std::map<token, id> token_to_id;
    std::map<id, token> id_to_token;
};
struct gpt_layer {
    // normalization
    struct ggml_tensor * ln_1_g;
    struct ggml_tensor * ln_1_b;
};
struct gpt_model {
    gpt_hparams hparams;
};
// load the model's weights from a file
bool gpt_model_load(const std::string & fname, gpt_model & model, gpt_vocab & vocab) {
    printf("%s: loading model from '%s'\n", __func__, fname.c_str());
}
void gpt_split_words(std::string str, std::vector<std::string> & words) {
    const std::string pattern = R"('s|'t|'re|'ve|'m|'ll|'d| ?[[:alpha:]]+| ?[[:digit:]]+| ?[^\s[:alpha:][:digit:]]+|\s+(?!\S)|\s+)";
    const std::regex re(pattern);
    std::smatch m;
}
std::vector<gpt_vocab::id> gpt_tokenize(const gpt_vocab & vocab, const std::string & text) {
    std::vector<std::string> words;
}
static std::vector<gpt_vocab::id> parse_tokens_from_string(const std::string& input, char delimiter) {
    std::vector<gpt_vocab::id> output;
    std::stringstream ss(input);
    std::string token;
}
static std::map<std::string, std::vector<gpt_vocab::id>> extract_tests_from_file(const std::string & fpath_test) {
    if (fpath_test.empty()) {
        fprintf(stderr, "%s : No test file found.\n", __func__);
        return std::map<std::string, std::vector<gpt_vocab::id>>();
    }
}
void test_gpt_tokenizer(gpt_vocab & vocab, const std::string & fpath_test) {
    std::map<std::string, std::vector<gpt_vocab::id>> tests = extract_tests_from_file(fpath_test);
}
gpt_vocab::id gpt_sample_top_k_top_p(
        const gpt_vocab & vocab,
        const float * logits,
        int top_k,
        double top_p,
        double temp,
        std::mt19937 & rng) {
    int n_logits = vocab.id_to_token.size();
}
struct ggml_tensor * load_compressed_memory(
        struct ggml_context * ctx,
        int layer,
        const gpt_model & model,
        int n_past,
        int n_embd) {
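    // Sketch of what I think this stub needs to produce, following the
    // Infini-attention paper (arXiv:2404.07143) -- the names below are my
    // own, not existing ggml API:
    //   * for every head of `layer`, keep two tensors that persist across
    //     segments: a compressive memory M_s of shape [d_key, d_value] and
    //     a normalization vector z_s of shape [d_key];
    //   * return them (e.g. packed into one ggml tensor) so that gpt_eval
    //     can read memory as  A_mem = sigma(Q) M_s / (sigma(Q) z_s),
    //     where sigma(x) = ELU(x) + 1.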
}
// evaluate the transformer
//
// - model: the model
// - n_threads: number of threads to use
// - n_past: the context size so far
// - embd_inp: the embeddings of the tokens in the context
// - embd_w: the predicted logits for the next token
//
bool gpt_eval(
        const gpt_model & model,
        const int n_threads,
        const int n_past,
        const std::vector<gpt_vocab::id> & embd_inp,
        std::vector<float> & embd_w,
        size_t & mem_per_token) {
    const int N = embd_inp.size();
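    // TODO (Infini-attention, my reading of the paper -- not working ggml
    // code yet): inside each layer's attention block,
    //   1. compute the usual local softmax attention output A_local;
    //   2. read the compressive memory returned by load_compressed_memory()
    //      to get A_mem = sigma(Q) M / (sigma(Q) z);
    //   3. blend per head with a learned scalar gate beta:
    //        A = sigmoid(beta) * A_mem + (1 - sigmoid(beta)) * A_local;
    //   4. after the forward pass, update the memory with this segment:
    //        M <- M + sigma(K)^T V,   z <- z + sum_t sigma(K_t).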
}
void gpt_print_usage(int argc, char ** argv, const gpt_hparams & params) {
    fprintf(stderr, "usage: %s [options]\n", argv[0]);
    fprintf(stderr, "\n");
    fprintf(stderr, "options:\n");
    fprintf(stderr, " -h, --help show this help message and exit\n");
    fprintf(stderr, " -s SEED, --seed SEED RNG seed (default: -1)\n");
    fprintf(stderr, " -t N, --threads N number of threads to use during computation (default: %d)\n", params.n_threads);
    fprintf(stderr, " -p PROMPT, --prompt PROMPT\n");
    fprintf(stderr, " prompt to start generation with (default: random)\n");
    fprintf(stderr, " -f FNAME, --file FNAME\n");
    fprintf(stderr, " load prompt from a file\n");
    fprintf(stderr, " -tt TOKEN_TEST, --token_test TOKEN_TEST\n");
    fprintf(stderr, " test tokenization\n");
    fprintf(stderr, " -n N, --n_predict N number of tokens to predict (default: %d)\n", params.n_predict);
    fprintf(stderr, " --top_k N top-k sampling (default: %d)\n", params.top_k);
    fprintf(stderr, " --top_p N top-p sampling (default: %.1f)\n", params.top_p);
    fprintf(stderr, " --temp N temperature (default: %.1f)\n", params.temp);
    fprintf(stderr, " --repeat-last-n N last n tokens to consider for penalize (default: %d, 0 = disabled)\n", params.repeat_last_n);
    fprintf(stderr, " --repeat-penalty N penalize repeat sequence of tokens (default: %.2f, 1.0 = disabled)\n", (double)params.repeat_penalty);
    fprintf(stderr, " -b N, --batch_size N batch size for prompt processing (default: %d)\n", params.n_batch);
    fprintf(stderr, " -c N, --context N context / KV cache size (default: %d)\n", params.n_ctx);
    fprintf(stderr, " --ignore-eos ignore EOS token during generation\n");
    fprintf(stderr, " -ngl N, --gpu-layers N number of layers to offload to GPU on supported models (default: %d)\n", params.n_gpu_layers);
    fprintf(stderr, " -m FNAME, --model FNAME\n");
    fprintf(stderr, " model path (default: %s)\n", params.model.c_str());
    fprintf(stderr, "\n");
}
// Function to check if the next argument exists
static std::string get_next_arg(int& i, int argc, char** argv, const std::string& flag, gpt_hparams& params) {
    if (i + 1 < argc && argv[i + 1][0] != '-') {
        return argv[++i];
    } else {
        fprintf(stderr, "error: %s requires one argument.\n", flag.c_str());
        gpt_print_usage(argc, argv, params);
        exit(0);
    }
}
bool gpt_params_parse(int argc, char ** argv, gpt_hparams & params) {
    for (int i = 1; i < argc; i++) {
        std::string arg = argv[i];
    }
}
std::string gpt_random_prompt(std::mt19937 & rng) {
    const int r = rng() % 10;
    switch (r) {
        case 0: return "So";
        case 1: return "Once upon a time";
        case 2: return "When";
        case 3: return "The";
        case 4: return "After";
        case 5: return "If";
        case 6: return "import";
        case 7: return "He";
        case 8: return "She";
        case 9: return "They";
    }
}
//// Class for Infinite Context Handling
//std::vector<gpt_vocab::id> tokens; // Store tokens dynamically
//
//// Add new tokens to context
//void add_tokens(const std::vector<gpt_vocab::id> & new_tokens) {
//    tokens.insert(tokens.end(), new_tokens.begin(), new_tokens.end());
//}
//
//// Retrieve relevant context
//std::vector<gpt_vocab::id> get_context(int max_length) const {
//    if ((int) tokens.size() <= max_length) {
//        return tokens;
//    }
//    return std::vector<gpt_vocab::id>(tokens.end() - max_length, tokens.end());
//}
int main(int argc, char ** argv) {
    ggml_time_init();
}
```
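For reference, here is a minimal standalone sketch of the per-head memory retrieval and update as I currently understand them from Section 2 of the paper. All names (`InfiniMemory`, `sigma`, `blend`, the dimensions) are made up for illustration, it is plain C++ rather than ggml, and it only shows the linear memory update (not the delta rule), so please correct me if I have misread the method:

```cpp
#include <cmath>
#include <vector>

// sigma(x) = ELU(x) + 1, the positive feature map used for the linear-attention memory.
static float sigma(float x) { return x > 0.0f ? x + 1.0f : std::exp(x); }

// One attention head's compressive memory state, persistent across segments.
struct InfiniMemory {
    int d_key, d_value;
    std::vector<float> M; // [d_key * d_value] memory matrix, starts at zero
    std::vector<float> z; // [d_key] normalization vector, starts at zero

    InfiniMemory(int dk, int dv)
        : d_key(dk), d_value(dv), M(dk * dv, 0.0f), z(dk, 0.0f) {}

    // Retrieval for one query q[d_key]:  A_mem = sigma(q) M / (sigma(q) z)
    std::vector<float> retrieve(const std::vector<float> & q) const {
        std::vector<float> out(d_value, 0.0f);
        float denom = 1e-6f; // small epsilon so the first segment does not divide by zero
        for (int i = 0; i < d_key; ++i) denom += sigma(q[i]) * z[i];
        for (int i = 0; i < d_key; ++i) {
            const float s = sigma(q[i]);
            for (int j = 0; j < d_value; ++j) {
                out[j] += s * M[i * d_value + j] / denom;
            }
        }
        return out;
    }

    // Update with the current segment's keys K[t][d_key] and values V[t][d_value]:
    //   M <- M + sigma(K)^T V,   z <- z + sum_t sigma(K_t)
    void update(const std::vector<std::vector<float>> & K,
                const std::vector<std::vector<float>> & V) {
        for (size_t t = 0; t < K.size(); ++t) {
            for (int i = 0; i < d_key; ++i) {
                const float s = sigma(K[t][i]);
                z[i] += s;
                for (int j = 0; j < d_value; ++j) {
                    M[i * d_value + j] += s * V[t][j];
                }
            }
        }
    }
};

// Per-head output: a learned scalar gate beta mixes the memory read with
// the ordinary local (softmax) attention output.
static std::vector<float> blend(float beta,
                                const std::vector<float> & a_mem,
                                const std::vector<float> & a_local) {
    const float g = 1.0f / (1.0f + std::exp(-beta)); // sigmoid(beta)
    std::vector<float> out(a_mem.size());
    for (size_t j = 0; j < out.size(); ++j) {
        out[j] = g * a_mem[j] + (1.0f - g) * a_local[j];
    }
    return out;
}
```

If that reading is right, then in gpt_eval each layer/head would first retrieve from the memory built up over previous segments, blend that with the local attention output, and only afterwards fold the current segment's sigma(K) and V into M and z, with the gate beta stored as one extra learned scalar per head in the checkpoint.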