@@ -322,6 +322,7 @@ class Partitioner {
322
322
void saveTinyConstants (const std::string& func_name);
323
323
void saveScaleFactors (const std::string& func_name);
324
324
void saveRepeatedConstants (const std::string& func_name);
325
+ void saveTailDictConstants (const std::string& func_name);
325
326
void matchParameters (const std::string& func_name);
326
327
void matchResults (const std::string& func_name);
327
328
void createFunction (const std::string& func_name);
@@ -1430,6 +1431,42 @@ void Partitioner::saveRepeatedConstants(const std::string& func_name) {
1430
1431
}
1431
1432
}
1432
1433
1434
// Records, for a non-repeated function group, the constants that should stay
// Constants when NPUW_HOST_GATHER_QUANT is enabled. The collected constants
// are inserted into the group's consts_to_keep set.
//
// func_name: key used to look the function group up in all_functions.
//
// Does nothing when NPUW_HOST_GATHER_QUANT is off, or when the group has more
// than one entry in refs. Otherwise the PreserveConstDictMatMulCWu and
// PreserveConstDictMatMulCWf8 matchers are run over the first model in the
// group's mdls; both are handed a reference to the local to_keep vector.
// NOTE(review): the matchers presumably append the matched vocab constants to
// to_keep -- their definitions are not visible here, confirm.
// NOTE(review): mdls.front() assumes the group holds at least one model -- confirm.
+ void Partitioner::saveTailDictConstants (const std::string& func_name) {
1435
+ if (!cfg.get <::intel_npu::NPUW_HOST_GATHER_QUANT>()) {
1436
+ // NPUW_HOST_GATHER_QUANT is off: no need to preserve anything as constants
1437
+ return ;
1438
+ }
1439
+
1440
+ // Depending on the config we might want to save the vocab in the tail subgraph as a Constant.
1441
+ auto & func_group = all_functions.at (func_name);
1442
+ auto & subgr_group = func_group.refs ;
1443
+
1444
+ if (subgr_group.size () > 1 ) {
1445
+ // Skip the repeated block (the group has more than one entry in refs)
1446
+ return ;
1447
+ }
1448
+
1449
+ LOG_VERB (" Trying to preserve some (tail) constants for " << func_name << " in model " << model->get_friendly_name ()
1450
+ << " ..." );
1451
+ LOG_BLOCK ();
1452
+
1453
+ auto & model_group = func_group.mdls ;
1454
+
1455
+ using CPtr = std::shared_ptr<ov::op::v0::Constant>;
1456
+ std::vector<CPtr> to_keep;
1457
+
1458
+ ov::pass::GraphRewrite rewr;
1459
+ rewr.add_matcher <ov::npuw::patterns::opt::PreserveConstDictMatMulCWu>(std::ref (to_keep));
1460
+ rewr.add_matcher <ov::npuw::patterns::opt::PreserveConstDictMatMulCWf8>(std::ref (to_keep));
1461
+ rewr.run_on_model (model_group.front ());
1462
+
1463
+ for (auto && const_to_keep : to_keep) {
1464
+ LOG_DEBUG (" [KEEP] " << const_to_keep);
1465
+ func_group.consts_to_keep .insert (const_to_keep);
1466
+ }
1467
+ LOG_VERB (" Done" );
1468
+ }
1469
+
1433
1470
void Partitioner::matchParameters (const std::string& func_name) {
1434
1471
LOG_VERB (" Matching parameters for function " << func_name << " in model " << model->get_friendly_name () << " ..." );
1435
1472
LOG_BLOCK ();
@@ -1874,12 +1911,20 @@ void Partitioner::optimize(const std::string& func_name) {
1874
1911
ctx.is_spatial = f._spatial .has_value ();
1875
1912
ctx.pmm_dims = cfg.get <::intel_npu::NPUW_PMM>();
1876
1913
1914
+ if (cfg.get <::intel_npu::NPUW_HOST_GATHER_QUANT>() && cfg.get <::intel_npu::NPUW_HOST_GATHER>()) {
1915
+ NPUW_ASSERT (false && " Conflicting configuration: NPUW_HOST_GATHER and NPUW_HOST_GATHER_QUANT should not be "
1916
+ " enabled together!" );
1917
+ }
1918
+
1877
1919
// Run Head/Tail passes
1878
1920
ov::pass::GraphRewrite rewr;
1879
- rewr.add_matcher <ov::npuw::patterns::opt::DQUnpackDictGatheru>(std::ref (ctx));
1880
- rewr.add_matcher <ov::npuw::patterns::opt::DQUnpackDictGatherGQi>(std::ref (ctx));
1881
- rewr.add_matcher <ov::npuw::patterns::opt::DQUnpackDictMatMulCWu>(std::ref (ctx));
1882
- rewr.add_matcher <ov::npuw::patterns::opt::DQUnpackDictMatMulCWf8>(std::ref (ctx));
1921
+ if (!cfg.get <::intel_npu::NPUW_HOST_GATHER_QUANT>()) {
1922
+ rewr.add_matcher <ov::npuw::patterns::opt::DQUnpackDictGatheru>(std::ref (ctx));
1923
+ rewr.add_matcher <ov::npuw::patterns::opt::DQUnpackDictGatherGQi>(std::ref (ctx));
1924
+ rewr.add_matcher <ov::npuw::patterns::opt::DQUnpackDictMatMulCWu>(std::ref (ctx));
1925
+ rewr.add_matcher <ov::npuw::patterns::opt::DQUnpackDictMatMulCWf8>(std::ref (ctx));
1926
+ }
1927
+
1883
1928
// NB: This pass is disabled for reason! It doesn't make things better
1884
1929
// rewr.add_matcher<ov::npuw::patterns::opt::DQUnpackDictMatMulGQi>(std::ref(ctx));
1885
1930
rewr.add_matcher <ov::npuw::patterns::opt::CompressDictMatMulf32>(std::ref (ctx));
@@ -1888,6 +1933,29 @@ void Partitioner::optimize(const std::string& func_name) {
1888
1933
rewr.add_matcher <ov::npuw::patterns::opt::ConvToMatmul>(std::ref (ctx));
1889
1934
rewr.run_on_model (f._model );
1890
1935
1936
+ // Quantized Gather + Unpack on host in the runtime
1937
+ if (cfg.get <::intel_npu::NPUW_HOST_GATHER_QUANT>()) {
1938
+ // FIXME: since we are running it after lifted Gather,
1939
+ // we need to first try to match Asymm or Symm patterns.
1940
+ // Otherwise smaller HostGatherQuant might be matched first and break
1941
+ // the quantization logic.
1942
+ {
1943
+ ov::pass::GraphRewrite rewr2;
1944
+ rewr2.add_matcher <ov::npuw::patterns::opt::HostGatherQuantAsymm>(std::ref (ctx));
1945
+ rewr2.run_on_model (f._model );
1946
+ }
1947
+ {
1948
+ ov::pass::GraphRewrite rewr2;
1949
+ rewr2.add_matcher <ov::npuw::patterns::opt::HostGatherQuantSymm>(std::ref (ctx));
1950
+ rewr2.run_on_model (f._model );
1951
+ }
1952
+ {
1953
+ ov::pass::GraphRewrite rewr2;
1954
+ rewr2.add_matcher <ov::npuw::patterns::opt::HostGatherQuant>(std::ref (ctx));
1955
+ rewr2.run_on_model (f._model );
1956
+ }
1957
+ }
1958
+
1891
1959
// Move Gather to host, if required
1892
1960
if (cfg.get <::intel_npu::NPUW_HOST_GATHER>()) {
1893
1961
ov::pass::GraphRewrite rewr2;
@@ -1992,6 +2060,30 @@ void Partitioner::optimize(const std::string& func_name) {
1992
2060
}
1993
2061
}
1994
2062
2063
+ // Host-side quantized gather, pt 1. Add new parameters first
2064
+ if (ctx.params_to_quant_gather_unpack ) {
2065
+ auto & params_to_quant_gather_unpack = *ctx.params_to_quant_gather_unpack ;
2066
+ for (const auto & param_new_and_unpack : params_to_quant_gather_unpack.params_to_runtime_unpack_gather ) {
2067
+ // New input in the graph
2068
+ new_params.push_back (param_new_and_unpack.first );
2069
+ // Note: don't remove w, z and s params here to keep them shared with the quant vocab in tail
2070
+ for (auto && funcall : func_group.refs ) {
2071
+ auto new_elem_type = param_new_and_unpack.first ->get_element_type ();
2072
+ const auto & new_shape = param_new_and_unpack.first ->get_shape ();
2073
+ // Note: no allocation needed for this tensor - set to _closure and dummy in _lazy_closure
2074
+ // FIXME: It turns out this tensor will be completely unused.
2075
+ // It will just sit in the memory to do nothing.
2076
+ // Most likely it may stay empty since we need a 1:1 matching between
2077
+ // closure tensors and parameters (minus base).
2078
+ // Based on our logic (when tensors get transferred from lazy tensors via bank
2079
+ // to the closure), this tensor should be non-empty to avoid this process.
2080
+ funcall.get ()._closure .push_back (ov::Tensor (new_elem_type, new_shape));
2081
+ funcall.get ()._lazy_closure .push_back (LazyTensor ());
2082
+ funcall.get ()._is_lazy_unpack .push_back (false );
2083
+ }
2084
+ }
2085
+ }
2086
+
1995
2087
// Add all new parameters introduced by this change
1996
2088
f._model ->add_parameters (new_params);
1997
2089
@@ -2031,6 +2123,29 @@ void Partitioner::optimize(const std::string& func_name) {
2031
2123
}
2032
2124
}
2033
2125
2126
+ // Host-side quantized gather, pt. 2: Write the gather mappings to funcall
2127
+ if (ctx.params_to_quant_gather_unpack ) {
2128
+ auto & params_to_quant_gather_unpack = *ctx.params_to_quant_gather_unpack ;
2129
+ for (const auto & param_new_and_unpack_gather :
2130
+ params_to_quant_gather_unpack.params_to_runtime_unpack_gather ) {
2131
+ // New param in the graph
2132
+ auto gather_dst_id = f._model ->get_parameter_index (param_new_and_unpack_gather.first );
2133
+ // Orig params to gather from
2134
+ auto gather_w_id = f._model ->get_parameter_index (param_new_and_unpack_gather.second .w );
2135
+ auto gather_z_id = f._model ->get_parameter_index (param_new_and_unpack_gather.second .z );
2136
+ auto gather_s_id = f._model ->get_parameter_index (param_new_and_unpack_gather.second .s );
2137
+ // Original pids
2138
+ auto gather_idx_id = f._model ->get_parameter_index (params_to_quant_gather_unpack.pids );
2139
+ for (auto && funcall : func_group.refs ) {
2140
+ funcall.get ()._quant_unpack_gather = ov::npuw::Subgraph::QuantUnpackGather{gather_dst_id,
2141
+ gather_w_id,
2142
+ gather_z_id,
2143
+ gather_s_id,
2144
+ gather_idx_id};
2145
+ }
2146
+ }
2147
+ }
2148
+
2034
2149
// FIXME: workaround
2035
2150
// Set lazy unpack indexes not to be unpacked in DCOFF
2036
2151
for (auto && fref : func_group.refs ) {
@@ -2344,6 +2459,7 @@ ov::npuw::Partitioning ov::npuw::getPartitioning(const std::shared_ptr<ov::Model
2344
2459
p.propagateConvertsOut (func_group);
2345
2460
p.sanityCheck (func_group);
2346
2461
p.saveRepeatedConstants (func_group);
2462
+ p.saveTailDictConstants (func_group);
2347
2463
p.matchParameters (func_group);
2348
2464
p.matchResults (func_group);
2349
2465
p.matchRepeatedSubgraphs (func_group);
0 commit comments