@@ -1015,7 +1015,6 @@ struct FluxCLIPEmbedder : public Conditioner {
1015
1015
FluxCLIPEmbedder (ggml_backend_t backend,
1016
1016
std::map<std::string, enum ggml_type>& tensor_types,
1017
1017
int clip_skip = -1 ) {
1018
-
1019
1018
for (auto pair : tensor_types) {
1020
1019
if (pair.first .find (" text_encoders.clip_l" ) != std::string::npos) {
1021
1020
use_clip_l = true ;
@@ -1225,7 +1224,6 @@ struct FluxCLIPEmbedder : public Conditioner {
1225
1224
ggml_set_f32 (chunk_hidden_states, 0 .f );
1226
1225
}
1227
1226
1228
-
1229
1227
int64_t t1 = ggml_time_ms ();
1230
1228
LOG_DEBUG (" computing condition graph completed, taking %" PRId64 " ms" , t1 - t0);
1231
1229
if (force_zero_embeddings) {
@@ -1234,12 +1232,12 @@ struct FluxCLIPEmbedder : public Conditioner {
1234
1232
vec[i] = 0 ;
1235
1233
}
1236
1234
}
1237
-
1235
+
1238
1236
hidden_states_vec.insert (hidden_states_vec.end (),
1239
- (float *)chunk_hidden_states->data ,
1240
- ((float *)chunk_hidden_states->data ) + ggml_nelements (chunk_hidden_states));
1237
+ (float *)chunk_hidden_states->data ,
1238
+ ((float *)chunk_hidden_states->data ) + ggml_nelements (chunk_hidden_states));
1241
1239
}
1242
-
1240
+
1243
1241
if (hidden_states_vec.size () > 0 ) {
1244
1242
hidden_states = vector_to_ggml_tensor (work_ctx, hidden_states_vec);
1245
1243
hidden_states = ggml_reshape_2d (work_ctx,
@@ -1294,35 +1292,54 @@ struct PixArtCLIPEmbedder : public Conditioner {
1294
1292
bool use_mask = false ;
1295
1293
int mask_pad = 1 ;
1296
1294
1295
+ bool use_t5 = false ;
1296
+
1297
1297
/**
 * Constructs the PixArt conditioner and creates the T5 runner only when
 * T5-XXL tensors are present in `tensor_types`.
 *
 * @param backend       ggml backend passed through to the T5Runner.
 * @param tensor_types  Map of tensor name to ggml type. It is scanned for any
 *                      key containing " text_encoders.t5xxl"; a hit sets
 *                      `use_t5`.
 * @param clip_skip     Not read by this constructor.
 * @param use_mask      Copied into the `use_mask` member.
 * @param mask_pad      Copied into the `mask_pad` member.
 *
 * When no matching tensor name is found, a warning is logged and `t5` is
 * left unset; the other methods consult `use_t5` before touching `t5`.
 *
 * NOTE(review): string literals are reproduced exactly as they appear in the
 * source text, including the space after the opening quote. Confirm that
 * space is intended and not an artifact of how the text was captured.
 */
PixArtCLIPEmbedder(ggml_backend_t backend,
                   std::map<std::string, enum ggml_type>& tensor_types,
                   int clip_skip = -1,
                   bool use_mask = false,
                   int mask_pad  = 1)
    : use_mask(use_mask), mask_pad(mask_pad) {
    // Iterate by const reference so no map entry (std::string key + type) is
    // copied, and stop at the first match since only presence matters.
    for (const auto& entry : tensor_types) {
        if (entry.first.find(" text_encoders.t5xxl") != std::string::npos) {
            use_t5 = true;
            break;
        }
    }

    if (!use_t5) {
        LOG_WARN(" IMPORTANT NOTICE: No text encoders provided, cannot process prompts!");
        return;
    }

    t5 = std::make_shared<T5Runner>(backend, tensor_types, " text_encoders.t5xxl.transformer");
}
1305
1316
1306
1317
/**
 * Intentionally a no-op: the body is empty and `clip_skip` is not used.
 *
 * @param clip_skip  Ignored.
 */
void set_clip_skip(int clip_skip) {
    // Nothing to do.
}
1308
1319
1309
1320
void get_param_tensors (std::map<std::string, struct ggml_tensor *>& tensors) {
1310
- t5->get_param_tensors (tensors, " text_encoders.t5xxl.transformer" );
1321
+ if (use_t5) {
1322
+ t5->get_param_tensors (tensors, " text_encoders.t5xxl.transformer" );
1323
+ }
1311
1324
}
1312
1325
1313
1326
void alloc_params_buffer () {
1314
- t5->alloc_params_buffer ();
1327
+ if (use_t5) {
1328
+ t5->alloc_params_buffer ();
1329
+ }
1315
1330
}
1316
1331
1317
1332
void free_params_buffer () {
1318
- t5->free_params_buffer ();
1333
+ if (use_t5) {
1334
+ t5->free_params_buffer ();
1335
+ }
1319
1336
}
1320
1337
1321
1338
size_t get_params_buffer_size () {
1322
1339
size_t buffer_size = 0 ;
1323
-
1324
- buffer_size += t5->get_params_buffer_size ();
1325
-
1340
+ if (use_t5) {
1341
+ buffer_size += t5->get_params_buffer_size ();
1342
+ }
1326
1343
return buffer_size;
1327
1344
}
1328
1345
@@ -1348,17 +1365,18 @@ struct PixArtCLIPEmbedder : public Conditioner {
1348
1365
std::vector<int > t5_tokens;
1349
1366
std::vector<float > t5_weights;
1350
1367
std::vector<float > t5_mask;
1351
- for (const auto & item : parsed_attention) {
1352
- const std::string& curr_text = item.first ;
1353
- float curr_weight = item.second ;
1354
-
1355
- std::vector<int > curr_tokens = t5_tokenizer.Encode (curr_text, true );
1356
- t5_tokens.insert (t5_tokens.end (), curr_tokens.begin (), curr_tokens.end ());
1357
- t5_weights.insert (t5_weights.end (), curr_tokens.size (), curr_weight);
1358
- }
1368
+ if (use_t5) {
1369
+ for (const auto & item : parsed_attention) {
1370
+ const std::string& curr_text = item.first ;
1371
+ float curr_weight = item.second ;
1359
1372
1360
- t5_tokenizer.pad_tokens (t5_tokens, t5_weights, &t5_mask, max_length, padding);
1373
+ std::vector<int > curr_tokens = t5_tokenizer.Encode (curr_text, true );
1374
+ t5_tokens.insert (t5_tokens.end (), curr_tokens.begin (), curr_tokens.end ());
1375
+ t5_weights.insert (t5_weights.end (), curr_tokens.size (), curr_weight);
1376
+ }
1361
1377
1378
+ t5_tokenizer.pad_tokens (t5_tokens, t5_weights, &t5_mask, max_length, padding);
1379
+ }
1362
1380
return {t5_tokens, t5_weights, t5_mask};
1363
1381
}
1364
1382
@@ -1395,38 +1413,44 @@ struct PixArtCLIPEmbedder : public Conditioner {
1395
1413
std::vector<float > hidden_states_vec;
1396
1414
1397
1415
size_t chunk_count = t5_tokens.size () / chunk_len;
1398
-
1399
1416
for (int chunk_idx = 0 ; chunk_idx < chunk_count; chunk_idx++) {
1400
1417
// t5
1401
- std::vector<int > chunk_tokens (t5_tokens.begin () + chunk_idx * chunk_len,
1402
- t5_tokens.begin () + (chunk_idx + 1 ) * chunk_len);
1403
- std::vector<float > chunk_weights (t5_weights.begin () + chunk_idx * chunk_len,
1404
- t5_weights.begin () + (chunk_idx + 1 ) * chunk_len);
1405
- std::vector<float > chunk_mask (t5_attn_mask_vec.begin () + chunk_idx * chunk_len,
1406
- t5_attn_mask_vec.begin () + (chunk_idx + 1 ) * chunk_len);
1407
-
1408
- auto input_ids = vector_to_ggml_tensor_i32 (work_ctx, chunk_tokens);
1409
- auto t5_attn_mask_chunk = use_mask ? vector_to_ggml_tensor (work_ctx, chunk_mask) : NULL ;
1410
-
1411
- t5->compute (n_threads,
1412
- input_ids,
1413
- t5_attn_mask_chunk,
1414
- &chunk_hidden_states,
1415
- work_ctx);
1416
- {
1417
- auto tensor = chunk_hidden_states;
1418
- float original_mean = ggml_tensor_mean (tensor);
1419
- for (int i2 = 0 ; i2 < tensor->ne [2 ]; i2++) {
1420
- for (int i1 = 0 ; i1 < tensor->ne [1 ]; i1++) {
1421
- for (int i0 = 0 ; i0 < tensor->ne [0 ]; i0++) {
1422
- float value = ggml_tensor_get_f32 (tensor, i0, i1, i2);
1423
- value *= chunk_weights[i1];
1424
- ggml_tensor_set_f32 (tensor, value, i0, i1, i2);
1418
+
1419
+ if (use_t5) {
1420
+ std::vector<int > chunk_tokens (t5_tokens.begin () + chunk_idx * chunk_len,
1421
+ t5_tokens.begin () + (chunk_idx + 1 ) * chunk_len);
1422
+ std::vector<float > chunk_weights (t5_weights.begin () + chunk_idx * chunk_len,
1423
+ t5_weights.begin () + (chunk_idx + 1 ) * chunk_len);
1424
+ std::vector<float > chunk_mask (t5_attn_mask_vec.begin () + chunk_idx * chunk_len,
1425
+ t5_attn_mask_vec.begin () + (chunk_idx + 1 ) * chunk_len);
1426
+
1427
+ auto input_ids = vector_to_ggml_tensor_i32 (work_ctx, chunk_tokens);
1428
+ auto t5_attn_mask_chunk = use_mask ? vector_to_ggml_tensor (work_ctx, chunk_mask) : NULL ;
1429
+ t5->compute (n_threads,
1430
+ input_ids,
1431
+ t5_attn_mask_chunk,
1432
+ &chunk_hidden_states,
1433
+ work_ctx);
1434
+ {
1435
+ auto tensor = chunk_hidden_states;
1436
+ float original_mean = ggml_tensor_mean (tensor);
1437
+ for (int i2 = 0 ; i2 < tensor->ne [2 ]; i2++) {
1438
+ for (int i1 = 0 ; i1 < tensor->ne [1 ]; i1++) {
1439
+ for (int i0 = 0 ; i0 < tensor->ne [0 ]; i0++) {
1440
+ float value = ggml_tensor_get_f32 (tensor, i0, i1, i2);
1441
+ value *= chunk_weights[i1];
1442
+ ggml_tensor_set_f32 (tensor, value, i0, i1, i2);
1443
+ }
1425
1444
}
1426
1445
}
1446
+ float new_mean = ggml_tensor_mean (tensor);
1447
+ ggml_tensor_scale (tensor, (original_mean / new_mean));
1427
1448
}
1428
- float new_mean = ggml_tensor_mean (tensor);
1429
- ggml_tensor_scale (tensor, (original_mean / new_mean));
1449
+ } else {
1450
+ chunk_hidden_states = ggml_new_tensor_2d (work_ctx, GGML_TYPE_F32, 4096 , chunk_len);
1451
+ ggml_set_f32 (chunk_hidden_states, 0 .f );
1452
+ t5_attn_mask = ggml_new_tensor_1d (work_ctx, GGML_TYPE_F32, chunk_len);
1453
+ ggml_set_f32 (t5_attn_mask, -HUGE_VALF);
1430
1454
}
1431
1455
1432
1456
int64_t t1 = ggml_time_ms ();
@@ -1450,8 +1474,10 @@ struct PixArtCLIPEmbedder : public Conditioner {
1450
1474
chunk_hidden_states->ne [0 ],
1451
1475
ggml_nelements (hidden_states) / chunk_hidden_states->ne [0 ]);
1452
1476
} else {
1453
- hidden_states = ggml_new_tensor_2d (work_ctx, GGML_TYPE_F32, 4096 , 256 );
1477
+ hidden_states = ggml_new_tensor_2d (work_ctx, GGML_TYPE_F32, 4096 , chunk_len );
1454
1478
ggml_set_f32 (hidden_states, 0 .f );
1479
+ t5_attn_mask = ggml_new_tensor_1d (work_ctx, GGML_TYPE_F32, chunk_len);
1480
+ ggml_set_f32 (t5_attn_mask, -HUGE_VALF);
1455
1481
}
1456
1482
1457
1483
modify_mask_to_attend_padding (t5_attn_mask, ggml_nelements (t5_attn_mask), mask_pad);
0 commit comments