@@ -180,13 +180,6 @@ void ggml_backend_free(ggml_backend_t backend) {
     backend->iface.free(backend);
 }
 
-ggml_backend_t ggml_backend_dup(ggml_backend_t backend) {
-    if (backend->iface.backend_dup) {
-        return backend->iface.backend_dup(backend);
-    }
-    return backend;
-}
-
 ggml_backend_buffer_type_t ggml_backend_get_default_buffer_type(ggml_backend_t backend) {
     return backend->iface.get_default_buffer_type(backend);
 }
@@ -862,7 +855,6 @@ static struct ggml_backend_i cpu_backend_i = {
     /* .event_record         = */ NULL,
     /* .event_wait           = */ NULL,
     /* .event_synchronize    = */ NULL,
-    /* .backend_dup          = */ NULL,
 };
 
 static ggml_guid_t ggml_backend_cpu_guid(void) {
@@ -1034,34 +1026,16 @@ static bool ggml_is_view_op(enum ggml_op op) {
 #define GGML_SCHED_MAX_COPIES 4
 #endif
 
-#ifndef GGML_SCHED_MAX_COPY_STREAMS
-#define GGML_SCHED_MAX_COPY_STREAMS 8
-#endif
-
 struct ggml_backend_sched_split {
     int backend_id;
     int i_start;
     int i_end;
-
-    // input tensors from other backends
     struct ggml_tensor * inputs[GGML_SCHED_MAX_SPLIT_INPUTS];
     int n_inputs;
-
-    // copy stream to use to copy the inputs that are weights (-1 = no copy stream)
-    int w_copy_stream_id;
-
     // graph view of this split
     struct ggml_cgraph graph;
 };
 
-struct ggml_backend_sched_copy_stream {
-    ggml_backend_t stream;
-    ggml_backend_buffer_t buffer;
-    ggml_backend_event_t event_copy;
-    ggml_backend_event_t event_use;
-    size_t max_size;
-};
-
 struct ggml_backend_sched {
     bool is_reset; // true if the scheduler has been reset since the last graph split
     bool is_alloc;
@@ -1072,9 +1046,6 @@ struct ggml_backend_sched {
     ggml_backend_buffer_type_t bufts[GGML_SCHED_MAX_BACKENDS];
     ggml_gallocr_t galloc;
 
-    struct ggml_backend_sched_copy_stream copy_streams[GGML_SCHED_MAX_BACKENDS][GGML_SCHED_MAX_COPY_STREAMS];
-    int cur_copy_stream[GGML_SCHED_MAX_BACKENDS];
-
     // hash keys of the nodes in the graph
     struct ggml_hash_set hash_set;
     // hash values
@@ -1257,14 +1228,6 @@ static void ggml_backend_sched_print_assignments(ggml_backend_sched_t sched, str
 //#define DEBUG_PASS3
 //#define DEBUG_PASS4
 
-static void init_split(ggml_backend_sched_t sched, struct ggml_backend_sched_split * split, int backend_id, int i_start) {
-    split->backend_id = backend_id;
-    split->i_start = i_start;
-    split->i_end = -1;
-    split->n_inputs = 0;
-    split->w_copy_stream_id = -1;
-}
-
 // assigns backends to ops and splits the graph into subgraphs that can be computed on the same backend
 static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * graph) {
     // reset splits
@@ -1443,17 +1406,19 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
     // pass 4: split graph, find tensors that need to be copied
     {
         int i_split = 0;
-        int cur_backend_id = 0;
         struct ggml_backend_sched_split * split = &sched->splits[0];
         // find the backend of the first split, skipping view ops
         for (int i = 0; i < graph->n_nodes; i++) {
             struct ggml_tensor * node = graph->nodes[i];
             if (!ggml_is_view_op(node->op)) {
-                cur_backend_id = tensor_backend_id(node);
+                split->backend_id = tensor_backend_id(node);
                 break;
             }
         }
-        init_split(sched, split, cur_backend_id, 0);
+        split->i_start = 0;
+        split->n_inputs = 0;
+        memset(split->inputs, 0, sizeof(split->inputs)); //HACK
+        int cur_backend_id = split->backend_id;
         for (int i = 0; i < graph->n_nodes; i++) {
             struct ggml_tensor * node = graph->nodes[i];
 
@@ -1468,11 +1433,6 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
             // check if we should start a new split based on the sources of the current node
             bool need_new_split = false;
             if (node_backend_id == cur_backend_id && split->n_inputs > 0) {
-                if (split->w_copy_stream_id != -1) {
-                    // the previous op used a weight copy stream, start a new split to allow the next copy to start immediately after the op
-                    need_new_split = true;
-                }
-
                 for (int j = 0; j < GGML_MAX_SRC; j++) {
                     struct ggml_tensor * src = node->src[j];
                     if (src == NULL) {
@@ -1492,6 +1452,7 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
                     const size_t id = hash_id(src);
                     int src_backend_id = sched->tensor_backend_id[id];
                     if (src_backend_id != cur_backend_id && sched->tensor_copies[hash_id(src)][cur_backend_id][0] == NULL) {
+                        //printf("starting new split because of too many inputs: node %s, input %s\n", node->name, src->name);
                         need_new_split = true;
                         break;
                     }
@@ -1509,8 +1470,10 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
                 }
                 GGML_ASSERT(i_split < GGML_SCHED_MAX_SPLITS);
                 split = &sched->splits[i_split];
+                split->backend_id = node_backend_id;
+                split->i_start = i;
+                split->n_inputs = 0;
                 cur_backend_id = node_backend_id;
-                init_split(sched, split, cur_backend_id, i);
             }
 
             // find inputs that are not on the same backend
@@ -1566,13 +1529,6 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
                         int n_inputs = split->n_inputs++;
                         GGML_ASSERT(n_inputs < GGML_SCHED_MAX_SPLIT_INPUTS);
                         split->inputs[n_inputs] = src;
-                        if (src->buffer != NULL && src->buffer->usage == GGML_BACKEND_BUFFER_USAGE_WEIGHTS && split->w_copy_stream_id == -1 && GGML_SCHED_MAX_COPY_STREAMS > 0) {
-                            split->w_copy_stream_id = sched->cur_copy_stream[cur_backend_id];
-                            sched->copy_streams[cur_backend_id][split->w_copy_stream_id].max_size = MAX(
-                                sched->copy_streams[cur_backend_id][split->w_copy_stream_id].max_size,
-                                ggml_backend_buft_get_alloc_size(sched->bufts[cur_backend_id], src));
-                            sched->cur_copy_stream[cur_backend_id] = (sched->cur_copy_stream[cur_backend_id] + 1) % GGML_SCHED_MAX_COPY_STREAMS;
-                        }
                     }
                     node->src[j] = sched->tensor_copies[id][cur_backend_id][sched->cur_copy];
                 }
@@ -1584,10 +1540,6 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
 #ifdef DEBUG_PASS4
     fprintf(stderr, "PASS 4 ASSIGNMENTS\n"); ggml_backend_sched_print_assignments(sched, graph);
 #endif
-    if (getenv("GGML_DEBUG_SCHED")) {
-        fprintf(stderr, "SPLIT GRAPH\n");
-        ggml_backend_sched_print_assignments(sched, graph);
-    }
 
     // create copies of the graph for each split
     // TODO: avoid this copy
@@ -1661,25 +1613,6 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
 }
 
 static bool ggml_backend_sched_alloc_splits(ggml_backend_sched_t sched) {
-    // allocate weights in the copy buffers
-    for (int s = 0; s < sched->n_splits; s++) {
-        struct ggml_backend_sched_split * split = &sched->splits[s];
-        if (split->w_copy_stream_id != -1) {
-            struct ggml_backend_sched_copy_stream * stream = &sched->copy_streams[split->backend_id][split->w_copy_stream_id];
-            ggml_backend_buffer_t buffer = stream->buffer;
-            if (buffer == NULL) {
-                continue;
-            }
-            for (int j = 0; j < split->n_inputs; j++) {
-                struct ggml_tensor * input = split->inputs[j];
-                if (input->buffer != NULL && input->buffer->usage == GGML_BACKEND_BUFFER_USAGE_WEIGHTS) {
-                    struct ggml_tensor * input_cpy = sched->tensor_copies[hash_id(input)][split->backend_id][sched->cur_copy];
-                    ggml_backend_tensor_alloc(buffer, input_cpy, ggml_backend_buffer_get_base(buffer));
-                }
-            }
-        }
-    }
-
     // allocate graph
     if (!ggml_gallocr_alloc_graph(sched->galloc, sched->graph)) {
         // the re-allocation may cause the split inputs to be moved to a different address
@@ -1704,21 +1637,14 @@ static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t s
         struct ggml_backend_sched_split * split = &splits[i];
         int split_backend_id = split->backend_id;
         ggml_backend_t split_backend = sched->backends[split_backend_id];
-        struct ggml_backend_sched_copy_stream * stream = NULL;
-
-        if (split->w_copy_stream_id != -1) {
-            stream = &sched->copy_streams[split_backend_id][split->w_copy_stream_id];
-        }
 
         // copy the input tensors to the split backend
         for (int j = 0; j < split->n_inputs; j++) {
             ggml_backend_t input_backend = ggml_backend_sched_get_tensor_backend(sched, split->inputs[j]);
             struct ggml_tensor * input = split->inputs[j];
             struct ggml_tensor * input_cpy = sched->tensor_copies[hash_id(input)][split_backend_id][sched->cur_copy];
 
-            if (input->buffer->usage == GGML_BACKEND_BUFFER_USAGE_WEIGHTS && stream && stream->stream) {
-                ggml_backend_tensor_copy_async(input_backend, stream->stream, input, input_cpy);
-            } else if (input->flags & GGML_TENSOR_FLAG_INPUT) {
+            if (input->flags & GGML_TENSOR_FLAG_INPUT) {
                 // inputs from the user must be copied immediately to prevent the user overwriting the data before the copy is done
                 if (sched->events[split_backend_id][sched->cur_copy] != NULL) {
                     ggml_backend_event_synchronize(sched->events[split_backend_id][sched->cur_copy]);
@@ -1737,11 +1663,6 @@ static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t s
             }
         }
 
-        if (stream && stream->stream) {
-            ggml_backend_event_record(stream->event_copy);
-            ggml_backend_event_wait(split_backend, stream->event_copy);
-        }
-
         if (!sched->callback_eval) {
             enum ggml_status ec = ggml_backend_graph_compute_async(split_backend, &split->graph);
             if (ec != GGML_STATUS_SUCCESS) {
@@ -1781,12 +1702,6 @@ static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t s
             }
         }
 
-        // record event of this copy stream
-        if (stream && stream->stream) {
-            ggml_backend_event_record(stream->event_use);
-            ggml_backend_event_wait(stream->stream, stream->event_use);
-        }
-
         // record the event of this copy
         if (split->n_inputs > 0) {
             if (sched->events[split_backend_id][sched->cur_copy] != NULL) {
@@ -1851,19 +1766,11 @@ void ggml_backend_sched_free(ggml_backend_sched_t sched) {
     if (sched == NULL) {
        return;
     }
-
     for (int b = 0; b < sched->n_backends; b++) {
         for (int c = 0; c < sched->n_copies; c++) {
             ggml_backend_event_free(sched->events[b][c]);
         }
-        for (int s = 0; s < GGML_SCHED_MAX_COPY_STREAMS; s++) {
-            ggml_backend_buffer_free(sched->copy_streams[b][s].buffer);
-            ggml_backend_event_free(sched->copy_streams[b][s].event_copy);
-            ggml_backend_event_free(sched->copy_streams[b][s].event_use);
-            ggml_backend_free(sched->copy_streams[b][s].stream);
-        }
     }
-
     ggml_gallocr_free(sched->galloc);
     ggml_free(sched->ctx);
     free(sched->splits);
@@ -1882,7 +1789,6 @@ void ggml_backend_sched_reset(ggml_backend_sched_t sched) {
     memset(sched->hash_set.keys, 0, sizeof(sched->hash_set.keys[0]) * hash_size); // NOLINT
     memset(sched->tensor_backend_id, -1, sizeof(sched->tensor_backend_id[0]) * hash_size);
     memset(sched->tensor_copies, 0, sizeof(sched->tensor_copies[0]) * hash_size);
-    memset(sched->cur_copy_stream, 0, sizeof(sched->cur_copy_stream[0]) * sched->n_backends);
 
     sched->is_reset = true;
 }
@@ -1894,46 +1800,7 @@ bool ggml_backend_sched_reserve(ggml_backend_sched_t sched, struct ggml_cgraph *
 
     ggml_backend_sched_split_graph(sched, measure_graph);
 
-    // allocate tensor copy streams
-    for (int b = 0; b < sched->n_backends; b++) {
-        for (int j = 0; j < GGML_SCHED_MAX_COPY_STREAMS; j++) {
-            struct ggml_backend_sched_copy_stream * stream = &sched->copy_streams[b][j];
-            if (stream->max_size > 0) {
-                // backend
-                if (!stream->stream) {
-                    stream->stream = ggml_backend_dup(sched->backends[b]);
-                }
-
-                if (!stream->stream) {
-                    continue;
-                }
-
-                // events
-                if (!stream->event_copy) {
-                    stream->event_copy = ggml_backend_event_new(stream->stream);
-                }
-
-                if (!stream->event_use) {
-                    stream->event_use = ggml_backend_event_new(sched->backends[b]);
-                }
-
-                if (!stream->event_copy || !stream->event_use) {
-                    continue;
-                }
-
-                // buffer
-                if (!stream->buffer || ggml_backend_buffer_get_size(stream->buffer) < stream->max_size) {
-                    ggml_backend_buffer_free(stream->buffer);
-                    stream->buffer = ggml_backend_buft_alloc_buffer(sched->bufts[b], stream->max_size);
-                    if (stream->buffer == NULL) {
-                        fprintf(stderr, "%s: failed to allocate buffer for copy stream\n", __func__);
-                        return false;
-                    }
-                }
-            }
-        }
-    }
-
+    // TODO: extract this to a separate function
     if (!ggml_gallocr_reserve_n(sched->galloc, sched->graph, sched->node_backend_ids, sched->leaf_backend_ids)) {
         return false;
     }
@@ -2001,16 +1868,7 @@ size_t ggml_backend_sched_get_buffer_size(ggml_backend_sched_t sched, ggml_backe
     int backend_index = ggml_backend_sched_backend_id(sched, backend);
     GGML_ASSERT(backend_index >= 0 && backend_index < sched->n_backends);
 
-    size_t size = ggml_gallocr_get_buffer_size(sched->galloc, backend_index);
-
-    for (int i = 0; i < GGML_SCHED_MAX_COPY_STREAMS; i++) {
-        if (sched->copy_streams[backend_index][i].buffer == NULL) {
-            continue;
-        }
-        size += ggml_backend_buffer_get_size(sched->copy_streams[backend_index][i].buffer);
-    }
-
-    return size;
+    return ggml_gallocr_get_buffer_size(sched->galloc, backend_index);
 }
 
 void ggml_backend_sched_set_tensor_backend(ggml_backend_sched_t sched, struct ggml_tensor * node, ggml_backend_t backend) {