@@ -468,7 +468,10 @@ struct completion_token_output {
 
 struct server_task_result_cmpl_final : server_task_result {
     int index = 0;
-    std::string content;
+
+    std::string content;
+    llama_tokens tokens;
+
     bool stream;
     result_timings timings;
     std::string prompt;
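Note (not part of the diff): llama_tokens is llama.cpp's alias for std::vector<llama_token>, so the result struct now carries the raw token ids next to the detokenized text. The intended invariant, sketched below under the assumption that common_detokenize (the same helper this file already uses for the prompt) round-trips the generation, is that detokenizing tokens reproduces content; check_final_result is a hypothetical name:

    #include <cassert>
    #include <string>

    // Hypothetical consistency check, not part of the change itself:
    // the new tokens field should detokenize back to the content string.
    static void check_final_result(llama_context * ctx, const server_task_result_cmpl_final & res) {
        const std::string text = common_detokenize(ctx, res.tokens, /*special=*/true);
        assert(text == res.content);
    }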
@@ -510,6 +513,7 @@ struct server_task_result_cmpl_final : server_task_result {
         json res = json {
             {"index",   index},
             {"content", stream ? "" : content}, // in stream mode, content is already in last partial chunk
+            {"tokens",  stream ? llama_tokens {} : tokens},
             {"id_slot", id_slot},
             {"stop",    true},
             {"model",   oaicompat_model},
@@ -541,7 +545,8 @@ struct server_task_result_cmpl_final : server_task_result {
             {"index", 0},
             {"message", json{
                 {"content", content},
-                {"role", "assistant"}
+                {"tokens", tokens},
+                {"role", "assistant"}
             }
         }}});
 
@@ -605,7 +610,9 @@ struct server_task_result_cmpl_final : server_task_result {
 
 struct server_task_result_cmpl_partial : server_task_result {
     int index = 0;
-    std::string content;
+
+    std::string content;
+    llama_tokens tokens;
 
     int32_t n_decoded;
     int32_t n_prompt_tokens;
@@ -637,6 +644,7 @@ struct server_task_result_cmpl_partial : server_task_result {
         json res = json {
             {"index",            index},
             {"content",          content},
+            {"tokens",           tokens},
             {"stop",             false},
             {"id_slot",          id_slot},
             {"tokens_predicted", n_decoded},
@@ -679,7 +687,8 @@ struct server_task_result_cmpl_partial : server_task_result {
             {"choices", json::array({json{{"finish_reason", nullptr},
                 {"index", 0},
                 {"delta", json{
-                    {"content", content}}}
+                    {"content", content},
+                    {"tokens", tokens}}}
             }})},
             {"created", t},
             {"id", oaicompat_cmpl_id},
@@ -695,6 +704,7 @@ struct server_task_result_cmpl_partial : server_task_result {
                 {"delta",
                     json{
                         {"content", content},
+                        {"tokens", tokens}
                     }},
             }});
         }
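Both partial-result paths above now emit tokens next to content, so a streaming client can rebuild the full token sequence chunk by chunk. A sketch of that accumulation, again assuming nlohmann::json; on_delta is a hypothetical per-chunk callback that mirrors the lockstep appends the server itself does (see the server_slot hunks below):

    #include <cstdint>
    #include <string>
    #include <vector>
    #include <nlohmann/json.hpp>

    // Hypothetical handler for one streamed delta object: append the
    // text and the token ids in lockstep.
    static void on_delta(const nlohmann::json & delta, std::string & text, std::vector<int32_t> & tokens) {
        text += delta.value("content", "");
        for (const auto & t : delta.at("tokens")) {
            tokens.push_back(t.get<int32_t>());
        }
    }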
@@ -949,8 +959,11 @@ struct server_slot {
 
     size_t last_nl_pos = 0;
 
-    std::string generated_text;
+    std::string generated_text;
+    llama_tokens generated_tokens;
+
     llama_tokens cache_tokens;
+
     std::vector<completion_token_output> generated_token_probs;
 
     bool has_next_token = true;
@@ -985,6 +998,7 @@ struct server_slot {
         n_prompt_tokens = 0;
         last_nl_pos = 0;
         generated_text = "";
+        generated_tokens = {};
         has_new_line = false;
         truncated = false;
         stop = STOP_TYPE_NONE;
@@ -1736,6 +1750,7 @@ struct server_context {
 
         // search stop word and delete it
         slot.generated_text += token_str;
+        slot.generated_tokens.push_back(result.tok);
         slot.has_next_token = true;
 
         // check if there is incomplete UTF-8 character at the end
@@ -1912,6 +1927,7 @@ struct server_context {
         res->id = slot.id_task;
         res->index = slot.index;
         res->content = tkn.text_to_send;
+        res->tokens = { tkn.tok };
 
         res->n_decoded = slot.n_decoded;
         res->n_prompt_tokens = slot.n_prompt_tokens;
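Worth noting on the partial path above: each streamed result carries exactly one decoded token (res->tokens = { tkn.tok }), while text_to_send can lag behind it when a token ends mid-way through a multi-byte UTF-8 sequence (see the incomplete-UTF-8 check earlier in this file). The token ids are therefore the lossless record of the generation, which is presumably the point of exposing them.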
@@ -1952,6 +1968,7 @@ struct server_context {
 
         res->index = slot.index;
         res->content = slot.generated_text;
+        res->tokens = slot.generated_tokens;
         res->timings = slot.get_timings();
         res->prompt = common_detokenize(ctx, slot.prompt_tokens, true);
 