
Commit 7e2971f

Revert "sched : support async weight copy"
This reverts commit ffd57f9.
Parent: 904a539

File tree: 5 files changed (+15, -168 lines)


examples/llama-bench/llama-bench.cpp

Lines changed: 0 additions & 1 deletion
@@ -1359,7 +1359,6 @@ int main(int argc, char ** argv) {
         }

         p->print_test(t);
-        fflush(p->fout);

         llama_print_timings(ctx);


ggml-backend-impl.h

Lines changed: 0 additions & 2 deletions
@@ -114,8 +114,6 @@ extern "C" {
         void (*GGML_CALL event_record)      (ggml_backend_event_t event);
         void (*GGML_CALL event_wait)        (ggml_backend_t backend, ggml_backend_event_t event);
         void (*GGML_CALL event_synchronize) (ggml_backend_event_t event);
-
-        ggml_backend_t (*GGML_CALL backend_dup)(ggml_backend_t backend);
     };

     struct ggml_backend {

ggml-backend.c

Lines changed: 12 additions & 154 deletions
@@ -180,13 +180,6 @@ void ggml_backend_free(ggml_backend_t backend) {
     backend->iface.free(backend);
 }

-ggml_backend_t ggml_backend_dup(ggml_backend_t backend) {
-    if (backend->iface.backend_dup) {
-        return backend->iface.backend_dup(backend);
-    }
-    return backend;
-}
-
 ggml_backend_buffer_type_t ggml_backend_get_default_buffer_type(ggml_backend_t backend) {
     return backend->iface.get_default_buffer_type(backend);
 }
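For context on what is being removed here: `ggml_backend_dup` was the generic entry point for obtaining a second execution stream, and its fallback path returns the original backend unchanged. A minimal usage sketch against the pre-revert API; the CPU backend is an arbitrary stand-in, and the free-only-if-distinct check is an inference from the fallback shown above:

```c
// Sketch against the pre-revert API: request a side stream for async copies.
ggml_backend_t backend = ggml_backend_cpu_init();
ggml_backend_t stream  = ggml_backend_dup(backend); // may return `backend` itself

// ... enqueue tensor copies on `stream` while `backend` computes ...

// only free the duplicate if the backend actually created one
if (stream != backend) {
    ggml_backend_free(stream);
}
ggml_backend_free(backend);
```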
@@ -862,7 +855,6 @@ static struct ggml_backend_i cpu_backend_i = {
     /* .event_record         = */ NULL,
     /* .event_wait           = */ NULL,
     /* .event_synchronize    = */ NULL,
-    /* .backend_dup          = */ NULL,
 };

 static ggml_guid_t ggml_backend_cpu_guid(void) {
@@ -1034,34 +1026,16 @@ static bool ggml_is_view_op(enum ggml_op op) {
 #define GGML_SCHED_MAX_COPIES 4
 #endif

-#ifndef GGML_SCHED_MAX_COPY_STREAMS
-#define GGML_SCHED_MAX_COPY_STREAMS 8
-#endif
-
 struct ggml_backend_sched_split {
     int backend_id;
     int i_start;
     int i_end;
-
-    // input tensors from other backends
     struct ggml_tensor * inputs[GGML_SCHED_MAX_SPLIT_INPUTS];
     int n_inputs;
-
-    // copy stream to use to copy the inputs that are weights (-1 = no copy stream)
-    int w_copy_stream_id;
-
     // graph view of this split
     struct ggml_cgraph graph;
 };

-struct ggml_backend_sched_copy_stream {
-    ggml_backend_t stream;
-    ggml_backend_buffer_t buffer;
-    ggml_backend_event_t event_copy;
-    ggml_backend_event_t event_use;
-    size_t max_size;
-};
-
 struct ggml_backend_sched {
     bool is_reset; // true if the scheduler has been reset since the last graph split
     bool is_alloc;
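The deleted `ggml_backend_sched_copy_stream` bundled all per-stream state: a duplicated backend acting as the side stream, a staging buffer, and a pair of events for the two synchronization directions. A sketch of how the fields were initialized, pieced together from the hunks below; `backend`, `buft`, and `weight` stand in for the scheduler's state:

```c
// Reconstructed initialization of one copy stream (pre-revert API):
struct ggml_backend_sched_copy_stream st = {0};

st.max_size   = ggml_backend_buft_get_alloc_size(buft, weight);   // largest weight routed to this stream
st.stream     = ggml_backend_dup(backend);                        // side stream on the same device
st.buffer     = ggml_backend_buft_alloc_buffer(buft, st.max_size); // staging buffer for weight copies
st.event_copy = ggml_backend_event_new(st.stream);                // signaled when a copy finishes
st.event_use  = ggml_backend_event_new(backend);                  // signaled when compute no longer needs it
```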
@@ -1072,9 +1046,6 @@ struct ggml_backend_sched {
     ggml_backend_buffer_type_t bufts[GGML_SCHED_MAX_BACKENDS];
     ggml_gallocr_t galloc;

-    struct ggml_backend_sched_copy_stream copy_streams[GGML_SCHED_MAX_BACKENDS][GGML_SCHED_MAX_COPY_STREAMS];
-    int cur_copy_stream[GGML_SCHED_MAX_BACKENDS];
-
     // hash keys of the nodes in the graph
     struct ggml_hash_set hash_set;
     // hash values
@@ -1257,14 +1228,6 @@ static void ggml_backend_sched_print_assignments(ggml_backend_sched_t sched, str
 //#define DEBUG_PASS3
 //#define DEBUG_PASS4

-static void init_split(ggml_backend_sched_t sched, struct ggml_backend_sched_split * split, int backend_id, int i_start) {
-    split->backend_id = backend_id;
-    split->i_start = i_start;
-    split->i_end = -1;
-    split->n_inputs = 0;
-    split->w_copy_stream_id = -1;
-}
-
 // assigns backends to ops and splits the graph into subgraphs that can be computed on the same backend
 static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * graph) {
     // reset splits
@@ -1443,17 +1406,19 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
     // pass 4: split graph, find tensors that need to be copied
     {
         int i_split = 0;
-        int cur_backend_id = 0;
         struct ggml_backend_sched_split * split = &sched->splits[0];
         // find the backend of the first split, skipping view ops
         for (int i = 0; i < graph->n_nodes; i++) {
             struct ggml_tensor * node = graph->nodes[i];
             if (!ggml_is_view_op(node->op)) {
-                cur_backend_id = tensor_backend_id(node);
+                split->backend_id = tensor_backend_id(node);
                 break;
             }
         }
-        init_split(sched, split, cur_backend_id, 0);
+        split->i_start = 0;
+        split->n_inputs = 0;
+        memset(split->inputs, 0, sizeof(split->inputs)); //HACK
+        int cur_backend_id = split->backend_id;
         for (int i = 0; i < graph->n_nodes; i++) {
             struct ggml_tensor * node = graph->nodes[i];

@@ -1468,11 +1433,6 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
             // check if we should start a new split based on the sources of the current node
             bool need_new_split = false;
             if (node_backend_id == cur_backend_id && split->n_inputs > 0) {
-                if (split->w_copy_stream_id != -1) {
-                    // the previous op used a weight copy stream, start a new split to allow the next copy to start immediately after the op
-                    need_new_split = true;
-                }
-
                 for (int j = 0; j < GGML_MAX_SRC; j++) {
                     struct ggml_tensor * src = node->src[j];
                     if (src == NULL) {
@@ -1492,6 +1452,7 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
                     const size_t id = hash_id(src);
                     int src_backend_id = sched->tensor_backend_id[id];
                     if (src_backend_id != cur_backend_id && sched->tensor_copies[hash_id(src)][cur_backend_id][0] == NULL) {
+                        //printf("starting new split because of too many inputs: node %s, input %s\n", node->name, src->name);
                         need_new_split = true;
                         break;
                     }
@@ -1509,8 +1470,10 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
                 }
                 GGML_ASSERT(i_split < GGML_SCHED_MAX_SPLITS);
                 split = &sched->splits[i_split];
+                split->backend_id = node_backend_id;
+                split->i_start = i;
+                split->n_inputs = 0;
                 cur_backend_id = node_backend_id;
-                init_split(sched, split, cur_backend_id, i);
             }

             // find inputs that are not on the same backend
@@ -1566,13 +1529,6 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
                        int n_inputs = split->n_inputs++;
                        GGML_ASSERT(n_inputs < GGML_SCHED_MAX_SPLIT_INPUTS);
                        split->inputs[n_inputs] = src;
-                        if (src->buffer != NULL && src->buffer->usage == GGML_BACKEND_BUFFER_USAGE_WEIGHTS && split->w_copy_stream_id == -1 && GGML_SCHED_MAX_COPY_STREAMS > 0) {
-                            split->w_copy_stream_id = sched->cur_copy_stream[cur_backend_id];
-                            sched->copy_streams[cur_backend_id][split->w_copy_stream_id].max_size = MAX(
-                                sched->copy_streams[cur_backend_id][split->w_copy_stream_id].max_size,
-                                ggml_backend_buft_get_alloc_size(sched->bufts[cur_backend_id], src));
-                            sched->cur_copy_stream[cur_backend_id] = (sched->cur_copy_stream[cur_backend_id] + 1) % GGML_SCHED_MAX_COPY_STREAMS;
-                        }
                    }
                    node->src[j] = sched->tensor_copies[id][cur_backend_id][sched->cur_copy];
                }
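The deleted assignment above spread weight copies across `GGML_SCHED_MAX_COPY_STREAMS` streams round-robin and kept each stream's `max_size` at the largest weight it would ever stage. The same bookkeeping in isolation, as a standalone toy; the weight sizes are made up:

```c
#include <stdio.h>

#define MAX_COPY_STREAMS 8

int main(void) {
    size_t max_size[MAX_COPY_STREAMS] = {0};
    int    cur_stream = 0;

    // made-up per-split weight sizes, standing in for ggml_backend_buft_get_alloc_size()
    size_t weight_size[] = {4096, 32768, 8192, 16384, 32768};
    int n_splits = (int) (sizeof(weight_size) / sizeof(weight_size[0]));

    for (int s = 0; s < n_splits; s++) {
        int id = cur_stream;                               // this split's stream
        if (weight_size[s] > max_size[id]) {
            max_size[id] = weight_size[s];                 // the MAX(old, new) from the diff
        }
        cur_stream = (cur_stream + 1) % MAX_COPY_STREAMS;  // advance the round-robin cursor
        printf("split %d -> stream %d, staging size %zu\n", s, id, max_size[id]);
    }
    return 0;
}
```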
@@ -1584,10 +1540,6 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
 #ifdef DEBUG_PASS4
     fprintf(stderr, "PASS 4 ASSIGNMENTS\n"); ggml_backend_sched_print_assignments(sched, graph);
 #endif
-    if (getenv("GGML_DEBUG_SCHED")) {
-        fprintf(stderr, "SPLIT GRAPH\n");
-        ggml_backend_sched_print_assignments(sched, graph);
-    }

     // create copies of the graph for each split
     // TODO: avoid this copy
@@ -1661,25 +1613,6 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
 }

 static bool ggml_backend_sched_alloc_splits(ggml_backend_sched_t sched) {
-    // allocate weights in the copy buffers
-    for (int s = 0; s < sched->n_splits; s++) {
-        struct ggml_backend_sched_split * split = &sched->splits[s];
-        if (split->w_copy_stream_id != -1) {
-            struct ggml_backend_sched_copy_stream * stream = &sched->copy_streams[split->backend_id][split->w_copy_stream_id];
-            ggml_backend_buffer_t buffer = stream->buffer;
-            if (buffer == NULL) {
-                continue;
-            }
-            for (int j = 0; j < split->n_inputs; j++) {
-                struct ggml_tensor * input = split->inputs[j];
-                if (input->buffer != NULL && input->buffer->usage == GGML_BACKEND_BUFFER_USAGE_WEIGHTS) {
-                    struct ggml_tensor * input_cpy = sched->tensor_copies[hash_id(input)][split->backend_id][sched->cur_copy];
-                    ggml_backend_tensor_alloc(buffer, input_cpy, ggml_backend_buffer_get_base(buffer));
-                }
-            }
-        }
-    }
-
     // allocate graph
     if (!ggml_gallocr_alloc_graph(sched->galloc, sched->graph)) {
         // the re-allocation may cause the split inputs to be moved to a different address
@@ -1704,21 +1637,14 @@ static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t s
         struct ggml_backend_sched_split * split = &splits[i];
         int split_backend_id = split->backend_id;
         ggml_backend_t split_backend = sched->backends[split_backend_id];
-        struct ggml_backend_sched_copy_stream * stream = NULL;
-
-        if (split->w_copy_stream_id != -1) {
-            stream = &sched->copy_streams[split_backend_id][split->w_copy_stream_id];
-        }

         // copy the input tensors to the split backend
         for (int j = 0; j < split->n_inputs; j++) {
             ggml_backend_t input_backend = ggml_backend_sched_get_tensor_backend(sched, split->inputs[j]);
             struct ggml_tensor * input = split->inputs[j];
             struct ggml_tensor * input_cpy = sched->tensor_copies[hash_id(input)][split_backend_id][sched->cur_copy];

-            if (input->buffer->usage == GGML_BACKEND_BUFFER_USAGE_WEIGHTS && stream && stream->stream) {
-                ggml_backend_tensor_copy_async(input_backend, stream->stream, input, input_cpy);
-            } else if (input->flags & GGML_TENSOR_FLAG_INPUT) {
+            if (input->flags & GGML_TENSOR_FLAG_INPUT) {
                 // inputs from the user must be copied immediately to prevent the user overwriting the data before the copy is done
                 if (sched->events[split_backend_id][sched->cur_copy] != NULL) {
                     ggml_backend_event_synchronize(sched->events[split_backend_id][sched->cur_copy]);
@@ -1737,11 +1663,6 @@ static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t s
             }
         }

-        if (stream && stream->stream) {
-            ggml_backend_event_record(stream->event_copy);
-            ggml_backend_event_wait(split_backend, stream->event_copy);
-        }
-
         if (!sched->callback_eval) {
             enum ggml_status ec = ggml_backend_graph_compute_async(split_backend, &split->graph);
             if (ec != GGML_STATUS_SUCCESS) {
@@ -1781,12 +1702,6 @@ static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t s
             }
         }

-        // record event of this copy stream
-        if (stream && stream->stream) {
-            ggml_backend_event_record(stream->event_use);
-            ggml_backend_event_wait(stream->stream, stream->event_use);
-        }
-
         // record the event of this copy
         if (split->n_inputs > 0) {
             if (sched->events[split_backend_id][sched->cur_copy] != NULL) {
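Taken together, the deleted fragments in `ggml_backend_sched_compute_splits` formed a two-event rendezvous: `event_copy` kept compute from racing ahead of the weight upload, and `event_use` kept the next upload from clobbering the staging buffer while compute still read it. A condensed sketch of that ordering, assuming the deleted struct and the pre-revert helpers are in scope:

```c
// Condensed from the deleted fragments (pre-revert API): one split whose
// weight input travels on the side stream while the split itself computes.
static enum ggml_status compute_split_overlapped(
        ggml_backend_t split_backend,                   // compute stream
        struct ggml_backend_sched_copy_stream * stream, // side stream + events
        ggml_backend_t input_backend,
        struct ggml_tensor * input, struct ggml_tensor * input_cpy,
        struct ggml_cgraph * graph) {
    // 1. copy the weight on the side stream instead of the compute stream
    ggml_backend_tensor_copy_async(input_backend, stream->stream, input, input_cpy);

    // 2. compute must not start before the copy lands
    ggml_backend_event_record(stream->event_copy);              // lives on the copy stream
    ggml_backend_event_wait(split_backend, stream->event_copy); // compute stream waits

    // 3. run the split asynchronously
    enum ggml_status ec = ggml_backend_graph_compute_async(split_backend, graph);

    // 4. the side stream must not reuse its staging buffer until compute is done
    ggml_backend_event_record(stream->event_use);               // lives on the compute stream
    ggml_backend_event_wait(stream->stream, stream->event_use); // copy stream waits

    return ec;
}
```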
@@ -1851,19 +1766,11 @@ void ggml_backend_sched_free(ggml_backend_sched_t sched) {
     if (sched == NULL) {
         return;
     }
-
     for (int b = 0; b < sched->n_backends; b++) {
         for (int c = 0; c < sched->n_copies; c++) {
             ggml_backend_event_free(sched->events[b][c]);
         }
-        for (int s = 0; s < GGML_SCHED_MAX_COPY_STREAMS; s++) {
-            ggml_backend_buffer_free(sched->copy_streams[b][s].buffer);
-            ggml_backend_event_free(sched->copy_streams[b][s].event_copy);
-            ggml_backend_event_free(sched->copy_streams[b][s].event_use);
-            ggml_backend_free(sched->copy_streams[b][s].stream);
-        }
     }
-
     ggml_gallocr_free(sched->galloc);
     ggml_free(sched->ctx);
     free(sched->splits);
@@ -1882,7 +1789,6 @@ void ggml_backend_sched_reset(ggml_backend_sched_t sched) {
     memset(sched->hash_set.keys, 0, sizeof(sched->hash_set.keys[0]) * hash_size); // NOLINT
     memset(sched->tensor_backend_id, -1, sizeof(sched->tensor_backend_id[0]) * hash_size);
     memset(sched->tensor_copies, 0, sizeof(sched->tensor_copies[0]) * hash_size);
-    memset(sched->cur_copy_stream, 0, sizeof(sched->cur_copy_stream[0]) * sched->n_backends);

     sched->is_reset = true;
 }
@@ -1894,46 +1800,7 @@ bool ggml_backend_sched_reserve(ggml_backend_sched_t sched, struct ggml_cgraph *

     ggml_backend_sched_split_graph(sched, measure_graph);

-    // allocate tensor copy streams
-    for (int b = 0; b < sched->n_backends; b++) {
-        for (int j = 0; j < GGML_SCHED_MAX_COPY_STREAMS; j++) {
-            struct ggml_backend_sched_copy_stream * stream = &sched->copy_streams[b][j];
-            if (stream->max_size > 0) {
-                // backend
-                if (!stream->stream) {
-                    stream->stream = ggml_backend_dup(sched->backends[b]);
-                }
-
-                if (!stream->stream) {
-                    continue;
-                }
-
-                // events
-                if (!stream->event_copy) {
-                    stream->event_copy = ggml_backend_event_new(stream->stream);
-                }
-
-                if (!stream->event_use) {
-                    stream->event_use = ggml_backend_event_new(sched->backends[b]);
-                }
-
-                if (!stream->event_copy || !stream->event_use) {
-                    continue;
-                }
-
-                // buffer
-                if (!stream->buffer || ggml_backend_buffer_get_size(stream->buffer) < stream->max_size) {
-                    ggml_backend_buffer_free(stream->buffer);
-                    stream->buffer = ggml_backend_buft_alloc_buffer(sched->bufts[b], stream->max_size);
-                    if (stream->buffer == NULL) {
-                        fprintf(stderr, "%s: failed to allocate buffer for copy stream\n", __func__);
-                        return false;
-                    }
-                }
-            }
-        }
-    }
-
+    // TODO: extract this to a separate function
     if (!ggml_gallocr_reserve_n(sched->galloc, sched->graph, sched->node_backend_ids, sched->leaf_backend_ids)) {
         return false;
     }
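The deleted reserve pass allocated stream resources lazily and grew each staging buffer to `max_size`; a single region of that size suffices because `ggml_backend_sched_alloc_splits` placed every weight copy at the buffer's base, with reuse serialized through `event_use`. The grow-only allocation policy in isolation, as a toy sketch with plain `malloc`:

```c
#include <stdlib.h>

// Toy version of the deleted grow-only staging allocation: keep the current
// buffer when it is already large enough, otherwise replace it.
static int grow_staging(void ** buf, size_t * cap, size_t needed) {
    if (*buf != NULL && *cap >= needed) {
        return 0;           // big enough already, keep it
    }
    free(*buf);             // free(NULL) is a no-op
    *buf = malloc(needed);
    *cap = (*buf != NULL) ? needed : 0;
    return (*buf != NULL) ? 0 : -1; // -1 mirrors the `return false` in the diff
}
```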
@@ -2001,16 +1868,7 @@ size_t ggml_backend_sched_get_buffer_size(ggml_backend_sched_t sched, ggml_backe
     int backend_index = ggml_backend_sched_backend_id(sched, backend);
     GGML_ASSERT(backend_index >= 0 && backend_index < sched->n_backends);

-    size_t size = ggml_gallocr_get_buffer_size(sched->galloc, backend_index);
-
-    for (int i = 0; i < GGML_SCHED_MAX_COPY_STREAMS; i++) {
-        if (sched->copy_streams[backend_index][i].buffer == NULL) {
-            continue;
-        }
-        size += ggml_backend_buffer_get_size(sched->copy_streams[backend_index][i].buffer);
-    }
-
-    return size;
+    return ggml_gallocr_get_buffer_size(sched->galloc, backend_index);
 }

 void ggml_backend_sched_set_tensor_backend(ggml_backend_sched_t sched, struct ggml_tensor * node, ggml_backend_t backend) {

ggml-backend.h

Lines changed: 3 additions & 4 deletions
@@ -50,10 +50,9 @@ extern "C" {
     // Backend
     //

-    GGML_API ggml_guid_t    ggml_backend_guid(ggml_backend_t backend);
-    GGML_API const char *   ggml_backend_name(ggml_backend_t backend);
-    GGML_API void           ggml_backend_free(ggml_backend_t backend);
-    GGML_API ggml_backend_t ggml_backend_dup(ggml_backend_t backend);
+    GGML_API ggml_guid_t  ggml_backend_guid(ggml_backend_t backend);
+    GGML_API const char * ggml_backend_name(ggml_backend_t backend);
+    GGML_API void         ggml_backend_free(ggml_backend_t backend);

     GGML_API ggml_backend_buffer_type_t ggml_backend_get_default_buffer_type(ggml_backend_t backend);
     GGML_API ggml_backend_buffer_t      ggml_backend_alloc_buffer(ggml_backend_t backend, size_t size);

ggml-cuda.cu

Lines changed: 0 additions & 7 deletions
@@ -2968,12 +2968,6 @@ static void ggml_backend_cuda_event_synchronize(ggml_backend_event_t event) {
     CUDA_CHECK(cudaEventSynchronize((cudaEvent_t)event->context));
 }

-static ggml_backend_t ggml_backend_cuda_dup(ggml_backend_t backend) {
-    ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
-
-    return ggml_backend_cuda_init(cuda_ctx->device);
-}
-
 static ggml_backend_i ggml_backend_cuda_interface = {
     /* .get_name                = */ ggml_backend_cuda_name,
     /* .free                    = */ ggml_backend_cuda_free,
@@ -2993,7 +2987,6 @@ static ggml_backend_i ggml_backend_cuda_interface = {
     /* .event_record            = */ ggml_backend_cuda_event_record,
     /* .event_wait              = */ ggml_backend_cuda_event_wait,
     /* .event_synchronize       = */ ggml_backend_cuda_event_synchronize,
-    /* .backend_dup             = */ ggml_backend_cuda_dup,
 };

 static ggml_guid_t ggml_backend_cuda_guid() {
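On the CUDA backend, duplicating meant calling `ggml_backend_cuda_init` again on the same device, i.e. opening a second stream, which is what made the copy/compute overlap real on the GPU. The same pattern with the raw CUDA runtime, as a standalone illustration rather than ggml code; the memset stands in for the split's kernels:

```c
#include <cuda_runtime.h>
#include <stdio.h>

int main(void) {
    const size_t n = (size_t)1 << 20;
    float *h_w, *d_w, *d_out;
    cudaMallocHost((void **)&h_w, n * sizeof(float)); // pinned memory, needed for truly async copies
    cudaMalloc((void **)&d_w,   n * sizeof(float));
    cudaMalloc((void **)&d_out, n * sizeof(float));

    cudaStream_t compute, copy;                       // two streams on one device,
    cudaStreamCreate(&compute);                       // like a backend and its duplicate
    cudaStreamCreate(&copy);

    cudaEvent_t copy_done, use_done;                  // event_copy / event_use analogues
    cudaEventCreateWithFlags(&copy_done, cudaEventDisableTiming);
    cudaEventCreateWithFlags(&use_done,  cudaEventDisableTiming);

    // upload the "weight" on the side stream
    cudaMemcpyAsync(d_w, h_w, n * sizeof(float), cudaMemcpyHostToDevice, copy);
    cudaEventRecord(copy_done, copy);
    cudaStreamWaitEvent(compute, copy_done, 0);       // compute waits for the upload

    cudaMemsetAsync(d_out, 0, n * sizeof(float), compute); // stand-in for the split's kernels

    cudaEventRecord(use_done, compute);
    cudaStreamWaitEvent(copy, use_done, 0);           // side stream may reuse its staging space

    cudaStreamSynchronize(compute);
    printf("copy/compute overlap done\n");

    cudaStreamDestroy(compute); cudaStreamDestroy(copy);
    cudaEventDestroy(copy_done); cudaEventDestroy(use_done);
    cudaFreeHost(h_w); cudaFree(d_w); cudaFree(d_out);
    return 0;
}
```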
