@@ -1009,39 +1009,83 @@ struct FluxCLIPEmbedder : public Conditioner {
1009
1009
// T5 text-encoder runner; the constructor leaves it null when no t5xxl
// tensors are found.
std::shared_ptr<T5Runner> t5;
// Number of T5 tokens handled per chunk when computing the conditioning.
size_t chunk_len = 256;

// Set by the constructor when a tensor name containing the clip_l prefix is
// present; every use of `clip_l` is guarded by this flag.
bool use_clip_l = false;
// Set by the constructor when a tensor name containing the t5xxl prefix is
// present; every use of `t5` is guarded by this flag.
bool use_t5 = false;
1014
+
1012
1015
// Builds the conditioner from whichever text encoders are available.
//
// The keys of `tensor_types` are scanned for the clip_l and t5xxl name
// prefixes, and a runner is created only for an encoder that has at least one
// tensor. When neither encoder is found a warning is logged and construction
// stops early, leaving both runners null and both use_* flags false.
//
//   backend       backend handle forwarded to each runner
//   tensor_types  tensor-name -> type map; also forwarded to each runner
//   clip_skip     forwarded to set_clip_skip()
FluxCLIPEmbedder(ggml_backend_t backend,
                 std::map<std::string, enum ggml_type>& tensor_types,
                 int clip_skip = -1) {
    // Bind by const reference so the key string is not copied for every entry.
    for (const auto& entry : tensor_types) {
        const std::string& tensor_name = entry.first;
        if (tensor_name.find("text_encoders.clip_l") != std::string::npos) {
            use_clip_l = true;
        } else if (tensor_name.find("text_encoders.t5xxl") != std::string::npos) {
            use_t5 = true;
        }
        if (use_clip_l && use_t5) {
            break;  // both encoders detected, nothing more to learn from the map
        }
    }

    if (!use_clip_l && !use_t5) {
        LOG_WARN("IMPORTANT NOTICE: No text encoders provided, cannot process prompts!");
        return;
    }

    if (use_clip_l) {
        clip_l = std::make_shared<CLIPTextModelRunner>(backend, tensor_types, "text_encoders.clip_l.transformer.text_model", OPENAI_CLIP_VIT_L_14, true);
    } else {
        LOG_WARN("clip_l text encoder not found! Prompt adherence might be degraded.");
    }
    if (use_t5) {
        t5 = std::make_shared<T5Runner>(backend, tensor_types, "text_encoders.t5xxl.transformer");
    } else {
        LOG_WARN("t5xxl text encoder not found! Prompt adherence might be degraded.");
    }
    set_clip_skip(clip_skip);
}
1019
1044
1020
1045
void set_clip_skip (int clip_skip) {
1021
1046
if (clip_skip <= 0 ) {
1022
1047
clip_skip = 2 ;
1023
1048
}
1024
- clip_l->set_clip_skip (clip_skip);
1049
+ if (use_clip_l) {
1050
+ clip_l->set_clip_skip (clip_skip);
1051
+ }
1025
1052
}
1026
1053
1027
1054
void get_param_tensors (std::map<std::string, struct ggml_tensor *>& tensors) {
1028
- clip_l->get_param_tensors (tensors, " text_encoders.clip_l.transformer.text_model" );
1029
- t5->get_param_tensors (tensors, " text_encoders.t5xxl.transformer" );
1055
+ if (use_clip_l) {
1056
+ clip_l->get_param_tensors (tensors, " text_encoders.clip_l.transformer.text_model" );
1057
+ }
1058
+ if (use_t5) {
1059
+ t5->get_param_tensors (tensors, " text_encoders.t5xxl.transformer" );
1060
+ }
1030
1061
}
1031
1062
1032
1063
void alloc_params_buffer () {
1033
- clip_l->alloc_params_buffer ();
1034
- t5->alloc_params_buffer ();
1064
+ if (use_clip_l) {
1065
+ clip_l->alloc_params_buffer ();
1066
+ }
1067
+ if (use_t5) {
1068
+ t5->alloc_params_buffer ();
1069
+ }
1035
1070
}
1036
1071
1037
1072
void free_params_buffer () {
1038
- clip_l->free_params_buffer ();
1039
- t5->free_params_buffer ();
1073
+ if (use_clip_l) {
1074
+ clip_l->free_params_buffer ();
1075
+ }
1076
+ if (use_t5) {
1077
+ t5->free_params_buffer ();
1078
+ }
1040
1079
}
1041
1080
1042
1081
size_t get_params_buffer_size () {
1043
- size_t buffer_size = clip_l->get_params_buffer_size ();
1044
- buffer_size += t5->get_params_buffer_size ();
1082
+ size_t buffer_size = 0 ;
1083
+ if (use_clip_l) {
1084
+ buffer_size += clip_l->get_params_buffer_size ();
1085
+ }
1086
+ if (use_t5) {
1087
+ buffer_size += t5->get_params_buffer_size ();
1088
+ }
1045
1089
return buffer_size;
1046
1090
}
1047
1091
@@ -1071,18 +1115,23 @@ struct FluxCLIPEmbedder : public Conditioner {
1071
1115
for (const auto & item : parsed_attention) {
1072
1116
const std::string& curr_text = item.first ;
1073
1117
float curr_weight = item.second ;
1074
-
1075
- std::vector<int > curr_tokens = clip_l_tokenizer.encode (curr_text, on_new_token_cb);
1076
- clip_l_tokens.insert (clip_l_tokens.end (), curr_tokens.begin (), curr_tokens.end ());
1077
- clip_l_weights.insert (clip_l_weights.end (), curr_tokens.size (), curr_weight);
1078
-
1079
- curr_tokens = t5_tokenizer.Encode (curr_text, true );
1080
- t5_tokens.insert (t5_tokens.end (), curr_tokens.begin (), curr_tokens.end ());
1081
- t5_weights.insert (t5_weights.end (), curr_tokens.size (), curr_weight);
1118
+ if (use_clip_l) {
1119
+ std::vector<int > curr_tokens = clip_l_tokenizer.encode (curr_text, on_new_token_cb);
1120
+ clip_l_tokens.insert (clip_l_tokens.end (), curr_tokens.begin (), curr_tokens.end ());
1121
+ clip_l_weights.insert (clip_l_weights.end (), curr_tokens.size (), curr_weight);
1122
+ }
1123
+ if (use_t5) {
1124
+ std::vector<int > curr_tokens = t5_tokenizer.Encode (curr_text, true );
1125
+ t5_tokens.insert (t5_tokens.end (), curr_tokens.begin (), curr_tokens.end ());
1126
+ t5_weights.insert (t5_weights.end (), curr_tokens.size (), curr_weight);
1127
+ }
1128
+ }
1129
+ if (use_clip_l) {
1130
+ clip_l_tokenizer.pad_tokens (clip_l_tokens, clip_l_weights, 77 , padding);
1131
+ }
1132
+ if (use_t5) {
1133
+ t5_tokenizer.pad_tokens (t5_tokens, t5_weights, NULL , max_length, padding);
1082
1134
}
1083
-
1084
- clip_l_tokenizer.pad_tokens (clip_l_tokens, clip_l_weights, 77 , padding);
1085
- t5_tokenizer.pad_tokens (t5_tokens, t5_weights, NULL , max_length, padding);
1086
1135
1087
1136
// for (int i = 0; i < clip_l_tokens.size(); i++) {
1088
1137
// std::cout << clip_l_tokens[i] << ":" << clip_l_weights[i] << ", ";
@@ -1114,34 +1163,36 @@ struct FluxCLIPEmbedder : public Conditioner {
1114
1163
struct ggml_tensor * pooled = NULL ; // [768,]
1115
1164
std::vector<float > hidden_states_vec;
1116
1165
1117
- size_t chunk_count = t5_tokens.size () / chunk_len;
1166
+ size_t chunk_count = std::max (clip_l_tokens. size () > 0 ? chunk_len : 0 , t5_tokens.size () ) / chunk_len;
1118
1167
for (int chunk_idx = 0 ; chunk_idx < chunk_count; chunk_idx++) {
1119
1168
// clip_l
1120
1169
if (chunk_idx == 0 ) {
1121
- size_t chunk_len_l = 77 ;
1122
- std::vector<int > chunk_tokens (clip_l_tokens.begin (),
1123
- clip_l_tokens.begin () + chunk_len_l);
1124
- std::vector<float > chunk_weights (clip_l_weights.begin (),
1125
- clip_l_weights.begin () + chunk_len_l);
1170
+ if (use_clip_l) {
1171
+ size_t chunk_len_l = 77 ;
1172
+ std::vector<int > chunk_tokens (clip_l_tokens.begin (),
1173
+ clip_l_tokens.begin () + chunk_len_l);
1174
+ std::vector<float > chunk_weights (clip_l_weights.begin (),
1175
+ clip_l_weights.begin () + chunk_len_l);
1126
1176
1127
- auto input_ids = vector_to_ggml_tensor_i32 (work_ctx, chunk_tokens);
1128
- size_t max_token_idx = 0 ;
1177
+ auto input_ids = vector_to_ggml_tensor_i32 (work_ctx, chunk_tokens);
1178
+ size_t max_token_idx = 0 ;
1129
1179
1130
- auto it = std::find (chunk_tokens.begin (), chunk_tokens.end (), clip_l_tokenizer.EOS_TOKEN_ID );
1131
- max_token_idx = std::min<size_t >(std::distance (chunk_tokens.begin (), it), chunk_tokens.size () - 1 );
1180
+ auto it = std::find (chunk_tokens.begin (), chunk_tokens.end (), clip_l_tokenizer.EOS_TOKEN_ID );
1181
+ max_token_idx = std::min<size_t >(std::distance (chunk_tokens.begin (), it), chunk_tokens.size () - 1 );
1132
1182
1133
- clip_l->compute (n_threads,
1134
- input_ids,
1135
- 0 ,
1136
- NULL ,
1137
- max_token_idx,
1138
- true ,
1139
- &pooled,
1140
- work_ctx);
1183
+ clip_l->compute (n_threads,
1184
+ input_ids,
1185
+ 0 ,
1186
+ NULL ,
1187
+ max_token_idx,
1188
+ true ,
1189
+ &pooled,
1190
+ work_ctx);
1191
+ }
1141
1192
}
1142
1193
1143
1194
// t5
1144
- {
1195
+ if (use_t5) {
1145
1196
std::vector<int > chunk_tokens (t5_tokens.begin () + chunk_idx * chunk_len,
1146
1197
t5_tokens.begin () + (chunk_idx + 1 ) * chunk_len);
1147
1198
std::vector<float > chunk_weights (t5_weights.begin () + chunk_idx * chunk_len,
@@ -1169,8 +1220,12 @@ struct FluxCLIPEmbedder : public Conditioner {
1169
1220
float new_mean = ggml_tensor_mean (tensor);
1170
1221
ggml_tensor_scale (tensor, (original_mean / new_mean));
1171
1222
}
1223
+ } else {
1224
+ chunk_hidden_states = ggml_new_tensor_2d (work_ctx, GGML_TYPE_F32, 4096 , chunk_len);
1225
+ ggml_set_f32 (chunk_hidden_states, 0 .f );
1172
1226
}
1173
1227
1228
+
1174
1229
int64_t t1 = ggml_time_ms ();
1175
1230
LOG_DEBUG (" computing condition graph completed, taking %" PRId64 " ms" , t1 - t0);
1176
1231
if (force_zero_embeddings) {
@@ -1179,17 +1234,26 @@ struct FluxCLIPEmbedder : public Conditioner {
1179
1234
vec[i] = 0 ;
1180
1235
}
1181
1236
}
1182
-
1237
+
1183
1238
hidden_states_vec.insert (hidden_states_vec.end (),
1184
- (float *)chunk_hidden_states->data ,
1185
- ((float *)chunk_hidden_states->data ) + ggml_nelements (chunk_hidden_states));
1239
+ (float *)chunk_hidden_states->data ,
1240
+ ((float *)chunk_hidden_states->data ) + ggml_nelements (chunk_hidden_states));
1241
+ }
1242
+
1243
+ if (hidden_states_vec.size () > 0 ) {
1244
+ hidden_states = vector_to_ggml_tensor (work_ctx, hidden_states_vec);
1245
+ hidden_states = ggml_reshape_2d (work_ctx,
1246
+ hidden_states,
1247
+ chunk_hidden_states->ne [0 ],
1248
+ ggml_nelements (hidden_states) / chunk_hidden_states->ne [0 ]);
1249
+ } else {
1250
+ hidden_states = ggml_new_tensor_2d (work_ctx, GGML_TYPE_F32, 4096 , 256 );
1251
+ ggml_set_f32 (hidden_states, 0 .f );
1252
+ }
1253
+ if (pooled == NULL ) {
1254
+ pooled = ggml_new_tensor_1d (work_ctx, GGML_TYPE_F32, 768 );
1255
+ ggml_set_f32 (pooled, 0 .f );
1186
1256
}
1187
-
1188
- hidden_states = vector_to_ggml_tensor (work_ctx, hidden_states_vec);
1189
- hidden_states = ggml_reshape_2d (work_ctx,
1190
- hidden_states,
1191
- chunk_hidden_states->ne [0 ],
1192
- ggml_nelements (hidden_states) / chunk_hidden_states->ne [0 ]);
1193
1257
return SDCondition (hidden_states, pooled, NULL );
1194
1258
}
1195
1259
0 commit comments