
Commit afbbfaa

server : add more env vars, improve gen-docs (ggml-org#9635)
* server : add more env vars, improve gen-docs
* update server docs
* LLAMA_ARG_NO_CONTEXT_SHIFT
1 parent: 3d6bf69
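With this change, most of these options can be set through `LLAMA_ARG_*` environment variables instead of command-line flags, which is convenient for containerized deployments. For example, something like `LLAMA_ARG_CACHE_TYPE_K=q8_0 LLAMA_ARG_NUMA=numactl ./llama-server -m model.gguf` should behave like passing `-ctk q8_0 --numa numactl` (the binary name and exact boolean-variable semantics are assumptions here, not shown in this diff). As with the pre-existing `LLAMA_ARG_*` variables, environment values are applied during parsing and explicit command-line arguments can still override them.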

4 files changed: +157 −107 lines (diffs for two of the four files are shown below)


common/arg.cpp

Lines changed: 28 additions & 28 deletions
```diff
@@ -691,7 +691,7 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex,
         [](gpt_params & params) {
             params.ctx_shift = false;
         }
-    ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER}));
+    ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_NO_CONTEXT_SHIFT"));
     add_opt(llama_arg(
         {"--chunks"}, "N",
         format("max number of chunks to process (default: %d, -1 = all)", params.n_chunks),
@@ -1102,7 +1102,7 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex,
             else if (value == "last") { params.pooling_type = LLAMA_POOLING_TYPE_LAST; }
             else { throw std::invalid_argument("invalid value"); }
         }
-    ).set_examples({LLAMA_EXAMPLE_EMBEDDING}));
+    ).set_examples({LLAMA_EXAMPLE_EMBEDDING, LLAMA_EXAMPLE_RETRIEVAL, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_POOLING"));
     add_opt(llama_arg(
         {"--attention"}, "{causal,non,causal}",
         "attention type for embeddings, use model default if unspecified",
@@ -1121,77 +1121,77 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex,
             else if (value == "yarn") { params.rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_YARN; }
             else { throw std::invalid_argument("invalid value"); }
         }
-    ));
+    ).set_env("LLAMA_ARG_ROPE_SCALING_TYPE"));
     add_opt(llama_arg(
         {"--rope-scale"}, "N",
         "RoPE context scaling factor, expands context by a factor of N",
         [](gpt_params & params, const std::string & value) {
             params.rope_freq_scale = 1.0f / std::stof(value);
         }
-    ));
+    ).set_env("LLAMA_ARG_ROPE_SCALE"));
     add_opt(llama_arg(
         {"--rope-freq-base"}, "N",
         "RoPE base frequency, used by NTK-aware scaling (default: loaded from model)",
         [](gpt_params & params, const std::string & value) {
             params.rope_freq_base = std::stof(value);
         }
-    ));
+    ).set_env("LLAMA_ARG_ROPE_FREQ_BASE"));
     add_opt(llama_arg(
         {"--rope-freq-scale"}, "N",
         "RoPE frequency scaling factor, expands context by a factor of 1/N",
         [](gpt_params & params, const std::string & value) {
             params.rope_freq_scale = std::stof(value);
         }
-    ));
+    ).set_env("LLAMA_ARG_ROPE_FREQ_SCALE"));
     add_opt(llama_arg(
         {"--yarn-orig-ctx"}, "N",
         format("YaRN: original context size of model (default: %d = model training context size)", params.yarn_orig_ctx),
         [](gpt_params & params, int value) {
             params.yarn_orig_ctx = value;
         }
-    ));
+    ).set_env("LLAMA_ARG_YARN_ORIG_CTX"));
     add_opt(llama_arg(
         {"--yarn-ext-factor"}, "N",
         format("YaRN: extrapolation mix factor (default: %.1f, 0.0 = full interpolation)", (double)params.yarn_ext_factor),
         [](gpt_params & params, const std::string & value) {
             params.yarn_ext_factor = std::stof(value);
         }
-    ));
+    ).set_env("LLAMA_ARG_YARN_EXT_FACTOR"));
     add_opt(llama_arg(
         {"--yarn-attn-factor"}, "N",
         format("YaRN: scale sqrt(t) or attention magnitude (default: %.1f)", (double)params.yarn_attn_factor),
         [](gpt_params & params, const std::string & value) {
             params.yarn_attn_factor = std::stof(value);
         }
-    ));
+    ).set_env("LLAMA_ARG_YARN_ATTN_FACTOR"));
     add_opt(llama_arg(
         {"--yarn-beta-slow"}, "N",
         format("YaRN: high correction dim or alpha (default: %.1f)", (double)params.yarn_beta_slow),
         [](gpt_params & params, const std::string & value) {
             params.yarn_beta_slow = std::stof(value);
         }
-    ));
+    ).set_env("LLAMA_ARG_YARN_BETA_SLOW"));
     add_opt(llama_arg(
         {"--yarn-beta-fast"}, "N",
         format("YaRN: low correction dim or beta (default: %.1f)", (double)params.yarn_beta_fast),
         [](gpt_params & params, const std::string & value) {
             params.yarn_beta_fast = std::stof(value);
         }
-    ));
+    ).set_env("LLAMA_ARG_YARN_BETA_FAST"));
     add_opt(llama_arg(
         {"-gan", "--grp-attn-n"}, "N",
         format("group-attention factor (default: %d)", params.grp_attn_n),
         [](gpt_params & params, int value) {
             params.grp_attn_n = value;
         }
-    ));
+    ).set_env("LLAMA_ARG_GRP_ATTN_N"));
     add_opt(llama_arg(
         {"-gaw", "--grp-attn-w"}, "N",
         format("group-attention width (default: %.1f)", (double)params.grp_attn_w),
         [](gpt_params & params, int value) {
             params.grp_attn_w = value;
         }
-    ));
+    ).set_env("LLAMA_ARG_GRP_ATTN_W"));
     add_opt(llama_arg(
         {"-dkvc", "--dump-kv-cache"},
         "verbose print of the KV cache",
@@ -1205,23 +1205,23 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex,
         [](gpt_params & params) {
             params.no_kv_offload = true;
         }
-    ));
+    ).set_env("LLAMA_ARG_NO_KV_OFFLOAD"));
     add_opt(llama_arg(
         {"-ctk", "--cache-type-k"}, "TYPE",
         format("KV cache data type for K (default: %s)", params.cache_type_k.c_str()),
         [](gpt_params & params, const std::string & value) {
             // TODO: get the type right here
             params.cache_type_k = value;
         }
-    ));
+    ).set_env("LLAMA_ARG_CACHE_TYPE_K"));
     add_opt(llama_arg(
         {"-ctv", "--cache-type-v"}, "TYPE",
         format("KV cache data type for V (default: %s)", params.cache_type_v.c_str()),
         [](gpt_params & params, const std::string & value) {
             // TODO: get the type right here
             params.cache_type_v = value;
         }
-    ));
+    ).set_env("LLAMA_ARG_CACHE_TYPE_V"));
     add_opt(llama_arg(
         {"--perplexity", "--all-logits"},
         format("return logits for all tokens in the batch (default: %s)", params.logits_all ? "true" : "false"),
@@ -1355,22 +1355,22 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex,
         [](gpt_params & params, const std::string & value) {
             params.rpc_servers = value;
         }
-    ));
+    ).set_env("LLAMA_ARG_RPC"));
 #endif
     add_opt(llama_arg(
         {"--mlock"},
         "force system to keep model in RAM rather than swapping or compressing",
         [](gpt_params & params) {
             params.use_mlock = true;
         }
-    ));
+    ).set_env("LLAMA_ARG_MLOCK"));
     add_opt(llama_arg(
         {"--no-mmap"},
         "do not memory-map model (slower load but may reduce pageouts if not using mlock)",
         [](gpt_params & params) {
             params.use_mmap = false;
         }
-    ));
+    ).set_env("LLAMA_ARG_NO_MMAP"));
     add_opt(llama_arg(
         {"--numa"}, "TYPE",
         "attempt optimizations that help on some NUMA systems\n"
@@ -1385,7 +1385,7 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex,
             else if (value == "numactl") { params.numa = GGML_NUMA_STRATEGY_NUMACTL; }
             else { throw std::invalid_argument("invalid value"); }
         }
-    ));
+    ).set_env("LLAMA_ARG_NUMA"));
     add_opt(llama_arg(
         {"-ngl", "--gpu-layers", "--n-gpu-layers"}, "N",
         "number of layers to store in VRAM",
@@ -1433,7 +1433,7 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex,
                 fprintf(stderr, "warning: llama.cpp was compiled without support for GPU offload. Setting the split mode has no effect.\n");
             }
         }
-    ));
+    ).set_env("LLAMA_ARG_SPLIT_MODE"));
     add_opt(llama_arg(
         {"-ts", "--tensor-split"}, "N0,N1,N2,...",
         "fraction of the model to offload to each GPU, comma-separated list of proportions, e.g. 3,1",
@@ -1460,7 +1460,7 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex,
                 fprintf(stderr, "warning: llama.cpp was compiled without support for GPU offload. Setting a tensor split has no effect.\n");
             }
         }
-    ));
+    ).set_env("LLAMA_ARG_TENSOR_SPLIT"));
     add_opt(llama_arg(
         {"-mg", "--main-gpu"}, "INDEX",
         format("the GPU to use for the model (with split-mode = none), or for intermediate results and KV (with split-mode = row) (default: %d)", params.main_gpu),
@@ -1470,7 +1470,7 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex,
                 fprintf(stderr, "warning: llama.cpp was compiled without support for GPU offload. Setting the main GPU has no effect.\n");
             }
         }
-    ));
+    ).set_env("LLAMA_ARG_MAIN_GPU"));
     add_opt(llama_arg(
         {"--check-tensors"},
         format("check model tensor data for invalid values (default: %s)", params.check_tensors ? "true" : "false"),
@@ -1533,7 +1533,7 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex,
         [](gpt_params & params, const std::string & value) {
             params.model_alias = value;
         }
-    ).set_examples({LLAMA_EXAMPLE_SERVER}));
+    ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_ALIAS"));
     add_opt(llama_arg(
         {"-m", "--model"}, "FNAME",
         ex == LLAMA_EXAMPLE_EXPORT_LORA
@@ -1741,7 +1741,7 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex,
         [](gpt_params & params, const std::string & value) {
             params.public_path = value;
         }
-    ).set_examples({LLAMA_EXAMPLE_SERVER}));
+    ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_STATIC_PATH"));
     add_opt(llama_arg(
         {"--embedding", "--embeddings"},
         format("restrict to only support embedding use case; use only with dedicated embedding models (default: %s)", params.embedding ? "enabled" : "disabled"),
@@ -1779,22 +1779,22 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex,
         [](gpt_params & params, const std::string & value) {
             params.ssl_file_key = value;
         }
-    ).set_examples({LLAMA_EXAMPLE_SERVER}));
+    ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_SSL_KEY_FILE"));
     add_opt(llama_arg(
         {"--ssl-cert-file"}, "FNAME",
         "path to file a PEM-encoded SSL certificate",
         [](gpt_params & params, const std::string & value) {
             params.ssl_file_cert = value;
         }
-    ).set_examples({LLAMA_EXAMPLE_SERVER}));
+    ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_SSL_CERT_FILE"));
     add_opt(llama_arg(
         {"-to", "--timeout"}, "N",
         format("server read/write timeout in seconds (default: %d)", params.timeout_read),
         [](gpt_params & params, int value) {
             params.timeout_read = value;
             params.timeout_write = value;
         }
-    ).set_examples({LLAMA_EXAMPLE_SERVER}));
+    ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_TIMEOUT"));
     add_opt(llama_arg(
         {"--threads-http"}, "N",
         format("number of threads used to process HTTP requests (default: %d)", params.n_threads_http),
```

examples/gen-docs/gen-docs.cpp

Lines changed: 58 additions & 27 deletions
```diff
@@ -6,42 +6,73 @@
 
 // Export usage message (-h) to markdown format
 
+static void write_table_header(std::ofstream & file) {
+    file << "| Argument | Explanation |\n";
+    file << "| -------- | ----------- |\n";
+}
+
+static void write_table_entry(std::ofstream & file, const llama_arg & opt) {
+    file << "| `";
+    // args
+    for (const auto & arg : opt.args) {
+        if (arg == opt.args.front()) {
+            file << arg;
+            if (opt.args.size() > 1) file << ", ";
+        } else {
+            file << arg << (arg != opt.args.back() ? ", " : "");
+        }
+    }
+    // value hint
+    if (opt.value_hint) {
+        std::string md_value_hint(opt.value_hint);
+        string_replace_all(md_value_hint, "|", "\\|");
+        file << " " << md_value_hint;
+    }
+    if (opt.value_hint_2) {
+        std::string md_value_hint_2(opt.value_hint_2);
+        string_replace_all(md_value_hint_2, "|", "\\|");
+        file << " " << md_value_hint_2;
+    }
+    // help text
+    std::string md_help(opt.help);
+    string_replace_all(md_help, "\n", "<br/>");
+    string_replace_all(md_help, "|", "\\|");
+    file << "` | " << md_help << " |\n";
+}
+
+static void write_table(std::ofstream & file, std::vector<llama_arg *> & opts) {
+    write_table_header(file);
+    for (const auto & opt : opts) {
+        write_table_entry(file, *opt);
+    }
+}
+
 static void export_md(std::string fname, llama_example ex) {
     std::ofstream file(fname, std::ofstream::out | std::ofstream::trunc);
 
     gpt_params params;
     auto ctx_arg = gpt_params_parser_init(params, ex);
 
-    file << "| Argument | Explanation |\n";
-    file << "| -------- | ----------- |\n";
+    std::vector<llama_arg *> common_options;
+    std::vector<llama_arg *> sparam_options;
+    std::vector<llama_arg *> specific_options;
     for (auto & opt : ctx_arg.options) {
-        file << "| `";
-        // args
-        for (const auto & arg : opt.args) {
-            if (arg == opt.args.front()) {
-                file << arg;
-                if (opt.args.size() > 1) file << ", ";
-            } else {
-                file << arg << (arg != opt.args.back() ? ", " : "");
-            }
-        }
-        // value hint
-        if (opt.value_hint) {
-            std::string md_value_hint(opt.value_hint);
-            string_replace_all(md_value_hint, "|", "\\|");
-            file << " " << md_value_hint;
+        // in case multiple LLAMA_EXAMPLE_* are set, we prioritize the LLAMA_EXAMPLE_* matching current example
+        if (opt.is_sparam) {
+            sparam_options.push_back(&opt);
+        } else if (opt.in_example(ctx_arg.ex)) {
+            specific_options.push_back(&opt);
+        } else {
+            common_options.push_back(&opt);
         }
-        if (opt.value_hint_2) {
-            std::string md_value_hint_2(opt.value_hint_2);
-            string_replace_all(md_value_hint_2, "|", "\\|");
-            file << " " << md_value_hint_2;
-        }
-        // help text
-        std::string md_help(opt.help);
-        string_replace_all(md_help, "\n", "<br/>");
-        string_replace_all(md_help, "|", "\\|");
-        file << "` | " << md_help << " |\n";
     }
+
+    file << "**Common params**\n\n";
+    write_table(file, common_options);
+    file << "\n\n**Sampling params**\n\n";
+    write_table(file, sparam_options);
+    file << "\n\n**Example-specific params**\n\n";
+    write_table(file, specific_options);
 }
 
 int main(int, char **) {
```
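With this restructuring, the generated argument documentation is split into three sections instead of one flat table. An abridged sketch of the expected output shape follows; the two filled rows are taken from options appearing in this commit, while the elided rows stand in for the full listing that the real tool emits:

```markdown
**Common params**

| Argument | Explanation |
| -------- | ----------- |
| `--mlock` | force system to keep model in RAM rather than swapping or compressing |
| `--no-mmap` | do not memory-map model (slower load but may reduce pageouts if not using mlock) |

**Sampling params**

| Argument | Explanation |
| -------- | ----------- |
| ... | ... |

**Example-specific params**

| Argument | Explanation |
| -------- | ----------- |
| ... | ... |
```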
