@@ -691,7 +691,7 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex,
         [](gpt_params & params) {
             params.ctx_shift = false;
         }
-    ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER}));
+    ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_NO_CONTEXT_SHIFT"));
     add_opt(llama_arg(
         {"--chunks"}, "N",
         format("max number of chunks to process (default: %d, -1 = all)", params.n_chunks),
@@ -1102,7 +1102,7 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex,
             else if (value == "last") { params.pooling_type = LLAMA_POOLING_TYPE_LAST; }
             else { throw std::invalid_argument("invalid value"); }
         }
-    ).set_examples({LLAMA_EXAMPLE_EMBEDDING}));
+    ).set_examples({LLAMA_EXAMPLE_EMBEDDING, LLAMA_EXAMPLE_RETRIEVAL, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_POOLING"));
     add_opt(llama_arg(
         {"--attention"}, "{causal,non-causal}",
         "attention type for embeddings, use model default if unspecified",
@@ -1121,77 +1121,77 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex,
             else if (value == "yarn") { params.rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_YARN; }
             else { throw std::invalid_argument("invalid value"); }
         }
-    ));
+    ).set_env("LLAMA_ARG_ROPE_SCALING_TYPE"));
     add_opt(llama_arg(
         {"--rope-scale"}, "N",
         "RoPE context scaling factor, expands context by a factor of N",
         [](gpt_params & params, const std::string & value) {
             params.rope_freq_scale = 1.0f / std::stof(value);
         }
-    ));
+    ).set_env("LLAMA_ARG_ROPE_SCALE"));
     add_opt(llama_arg(
         {"--rope-freq-base"}, "N",
         "RoPE base frequency, used by NTK-aware scaling (default: loaded from model)",
         [](gpt_params & params, const std::string & value) {
             params.rope_freq_base = std::stof(value);
         }
-    ));
+    ).set_env("LLAMA_ARG_ROPE_FREQ_BASE"));
     add_opt(llama_arg(
         {"--rope-freq-scale"}, "N",
         "RoPE frequency scaling factor, expands context by a factor of 1/N",
         [](gpt_params & params, const std::string & value) {
             params.rope_freq_scale = std::stof(value);
         }
-    ));
+    ).set_env("LLAMA_ARG_ROPE_FREQ_SCALE"));
     add_opt(llama_arg(
         {"--yarn-orig-ctx"}, "N",
         format("YaRN: original context size of model (default: %d = model training context size)", params.yarn_orig_ctx),
         [](gpt_params & params, int value) {
             params.yarn_orig_ctx = value;
         }
-    ));
+    ).set_env("LLAMA_ARG_YARN_ORIG_CTX"));
     add_opt(llama_arg(
         {"--yarn-ext-factor"}, "N",
         format("YaRN: extrapolation mix factor (default: %.1f, 0.0 = full interpolation)", (double)params.yarn_ext_factor),
         [](gpt_params & params, const std::string & value) {
             params.yarn_ext_factor = std::stof(value);
         }
-    ));
+    ).set_env("LLAMA_ARG_YARN_EXT_FACTOR"));
     add_opt(llama_arg(
         {"--yarn-attn-factor"}, "N",
         format("YaRN: scale sqrt(t) or attention magnitude (default: %.1f)", (double)params.yarn_attn_factor),
         [](gpt_params & params, const std::string & value) {
             params.yarn_attn_factor = std::stof(value);
         }
-    ));
+    ).set_env("LLAMA_ARG_YARN_ATTN_FACTOR"));
     add_opt(llama_arg(
         {"--yarn-beta-slow"}, "N",
         format("YaRN: high correction dim or alpha (default: %.1f)", (double)params.yarn_beta_slow),
         [](gpt_params & params, const std::string & value) {
             params.yarn_beta_slow = std::stof(value);
         }
-    ));
+    ).set_env("LLAMA_ARG_YARN_BETA_SLOW"));
     add_opt(llama_arg(
         {"--yarn-beta-fast"}, "N",
         format("YaRN: low correction dim or beta (default: %.1f)", (double)params.yarn_beta_fast),
         [](gpt_params & params, const std::string & value) {
             params.yarn_beta_fast = std::stof(value);
         }
-    ));
+    ).set_env("LLAMA_ARG_YARN_BETA_FAST"));
     add_opt(llama_arg(
         {"-gan", "--grp-attn-n"}, "N",
         format("group-attention factor (default: %d)", params.grp_attn_n),
         [](gpt_params & params, int value) {
             params.grp_attn_n = value;
         }
-    ));
+    ).set_env("LLAMA_ARG_GRP_ATTN_N"));
     add_opt(llama_arg(
         {"-gaw", "--grp-attn-w"}, "N",
         format("group-attention width (default: %.1f)", (double)params.grp_attn_w),
         [](gpt_params & params, int value) {
             params.grp_attn_w = value;
         }
-    ));
+    ).set_env("LLAMA_ARG_GRP_ATTN_W"));
     add_opt(llama_arg(
         {"-dkvc", "--dump-kv-cache"},
         "verbose print of the KV cache",
@@ -1205,23 +1205,23 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex,
         [](gpt_params & params) {
             params.no_kv_offload = true;
         }
-    ));
+    ).set_env("LLAMA_ARG_NO_KV_OFFLOAD"));
     add_opt(llama_arg(
         {"-ctk", "--cache-type-k"}, "TYPE",
         format("KV cache data type for K (default: %s)", params.cache_type_k.c_str()),
         [](gpt_params & params, const std::string & value) {
             // TODO: get the type right here
             params.cache_type_k = value;
         }
-    ));
+    ).set_env("LLAMA_ARG_CACHE_TYPE_K"));
     add_opt(llama_arg(
         {"-ctv", "--cache-type-v"}, "TYPE",
         format("KV cache data type for V (default: %s)", params.cache_type_v.c_str()),
         [](gpt_params & params, const std::string & value) {
             // TODO: get the type right here
             params.cache_type_v = value;
         }
-    ));
+    ).set_env("LLAMA_ARG_CACHE_TYPE_V"));
     add_opt(llama_arg(
         {"--perplexity", "--all-logits"},
         format("return logits for all tokens in the batch (default: %s)", params.logits_all ? "true" : "false"),
@@ -1355,22 +1355,22 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex,
         [](gpt_params & params, const std::string & value) {
             params.rpc_servers = value;
         }
-    ));
+    ).set_env("LLAMA_ARG_RPC"));
 #endif
     add_opt(llama_arg(
         {"--mlock"},
         "force system to keep model in RAM rather than swapping or compressing",
         [](gpt_params & params) {
             params.use_mlock = true;
         }
-    ));
+    ).set_env("LLAMA_ARG_MLOCK"));
     add_opt(llama_arg(
         {"--no-mmap"},
         "do not memory-map model (slower load but may reduce pageouts if not using mlock)",
         [](gpt_params & params) {
             params.use_mmap = false;
         }
-    ));
+    ).set_env("LLAMA_ARG_NO_MMAP"));
     add_opt(llama_arg(
         {"--numa"}, "TYPE",
         "attempt optimizations that help on some NUMA systems\n"
@@ -1385,7 +1385,7 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex,
             else if (value == "numactl") { params.numa = GGML_NUMA_STRATEGY_NUMACTL; }
             else { throw std::invalid_argument("invalid value"); }
         }
-    ));
+    ).set_env("LLAMA_ARG_NUMA"));
     add_opt(llama_arg(
         {"-ngl", "--gpu-layers", "--n-gpu-layers"}, "N",
         "number of layers to store in VRAM",
@@ -1433,7 +1433,7 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex,
                 fprintf(stderr, "warning: llama.cpp was compiled without support for GPU offload. Setting the split mode has no effect.\n");
             }
         }
-    ));
+    ).set_env("LLAMA_ARG_SPLIT_MODE"));
     add_opt(llama_arg(
         {"-ts", "--tensor-split"}, "N0,N1,N2,...",
         "fraction of the model to offload to each GPU, comma-separated list of proportions, e.g. 3,1",
@@ -1460,7 +1460,7 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex,
                 fprintf(stderr, "warning: llama.cpp was compiled without support for GPU offload. Setting a tensor split has no effect.\n");
             }
         }
-    ));
+    ).set_env("LLAMA_ARG_TENSOR_SPLIT"));
     add_opt(llama_arg(
         {"-mg", "--main-gpu"}, "INDEX",
         format("the GPU to use for the model (with split-mode = none), or for intermediate results and KV (with split-mode = row) (default: %d)", params.main_gpu),
@@ -1470,7 +1470,7 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex,
                 fprintf(stderr, "warning: llama.cpp was compiled without support for GPU offload. Setting the main GPU has no effect.\n");
             }
         }
-    ));
+    ).set_env("LLAMA_ARG_MAIN_GPU"));
     add_opt(llama_arg(
         {"--check-tensors"},
         format("check model tensor data for invalid values (default: %s)", params.check_tensors ? "true" : "false"),
@@ -1533,7 +1533,7 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex,
         [](gpt_params & params, const std::string & value) {
             params.model_alias = value;
         }
-    ).set_examples({LLAMA_EXAMPLE_SERVER}));
+    ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_ALIAS"));
     add_opt(llama_arg(
         {"-m", "--model"}, "FNAME",
         ex == LLAMA_EXAMPLE_EXPORT_LORA
@@ -1741,7 +1741,7 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex,
         [](gpt_params & params, const std::string & value) {
             params.public_path = value;
         }
-    ).set_examples({LLAMA_EXAMPLE_SERVER}));
+    ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_STATIC_PATH"));
     add_opt(llama_arg(
         {"--embedding", "--embeddings"},
         format("restrict to only support embedding use case; use only with dedicated embedding models (default: %s)", params.embedding ? "enabled" : "disabled"),
@@ -1779,22 +1779,22 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex,
         [](gpt_params & params, const std::string & value) {
             params.ssl_file_key = value;
         }
-    ).set_examples({LLAMA_EXAMPLE_SERVER}));
+    ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_SSL_KEY_FILE"));
     add_opt(llama_arg(
         {"--ssl-cert-file"}, "FNAME",
         "path to file a PEM-encoded SSL certificate",
         [](gpt_params & params, const std::string & value) {
             params.ssl_file_cert = value;
         }
-    ).set_examples({LLAMA_EXAMPLE_SERVER}));
+    ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_SSL_CERT_FILE"));
     add_opt(llama_arg(
         {"-to", "--timeout"}, "N",
         format("server read/write timeout in seconds (default: %d)", params.timeout_read),
         [](gpt_params & params, int value) {
             params.timeout_read = value;
             params.timeout_write = value;
         }
-    ).set_examples({LLAMA_EXAMPLE_SERVER}));
+    ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_TIMEOUT"));
     add_opt(llama_arg(
         {"--threads-http"}, "N",
         format("number of threads used to process HTTP requests (default: %d)", params.n_threads_http),