Skip to content

Commit 18f1571

Browse files
[NPUW] Introduce quantized vocab handling (#30553)
1 parent 8c7b54e commit 18f1571

File tree

14 files changed

+662
-6
lines changed

14 files changed

+662
-6
lines changed

src/plugins/intel_npu/src/al/include/intel_npu/config/npuw.hpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -82,6 +82,7 @@ DEFINE_OPT(NPUW_DQ_FULL, bool, true, npuw::partitioning::dyn_quant_full, RunTime
8282
DEFINE_OPT(NPUW_PMM, std::string, "2", npuw::partitioning::par_matmul_merge_dims, RunTime);
8383
DEFINE_OPT(NPUW_SLICE_OUT, bool, false, npuw::partitioning::slice_out, RunTime);
8484
DEFINE_OPT(NPUW_HOST_GATHER, bool, true, npuw::partitioning::host_gather, RunTime);
85+
DEFINE_OPT(NPUW_HOST_GATHER_QUANT, bool, false, npuw::partitioning::gather_quant, RunTime);
8586
DEFINE_OPT(NPUW_SPATIAL, bool, false, npuw::partitioning::spatial, RunTime);
8687
DEFINE_OPT(NPUW_F16IC, bool, true, npuw::partitioning::f16_interconnect, RunTime);
8788
DEFINE_OPT(NPUW_SPATIAL_NWAY, std::size_t, 128, npuw::partitioning::spatial_nway, RunTime);

src/plugins/intel_npu/src/al/include/intel_npu/npuw_private_properties.hpp

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -250,6 +250,14 @@ static constexpr ov::Property<bool> f16_interconnect{"NPUW_F16IC"};
250250
*/
251251
static constexpr ov::Property<bool> host_gather{"NPUW_HOST_GATHER"};
252252

253+
/**
254+
* @brief
255+
* Type: boolean
256+
* When applicable, do embedding gather on host but leave it quantized.
257+
* Default value: false.
258+
*/
259+
static constexpr ov::Property<bool> gather_quant{"NPUW_HOST_GATHER_QUANT"};
260+
253261
/**
254262
* @brief
255263
* Type: std::string.

src/plugins/intel_npu/src/al/src/config/npuw.cpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,7 @@ void intel_npu::registerNPUWOptions(OptionsDesc& desc) {
3434
desc.add<NPUW_SPATIAL_NWAY>();
3535
desc.add<NPUW_SPATIAL_DYN>();
3636
desc.add<NPUW_HOST_GATHER>();
37+
desc.add<NPUW_HOST_GATHER_QUANT>();
3738
desc.add<NPUW_F16IC>();
3839
desc.add<NPUW_DCOFF_TYPE>();
3940
desc.add<NPUW_DCOFF_SCALE>();

src/plugins/intel_npu/src/plugin/npuw/base_sync_infer_request.cpp

Lines changed: 66 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -465,12 +465,78 @@ void ov::npuw::IBaseInferRequest::bind_global_params(std::size_t idx, RqPtr requ
465465
const auto& vocab = comp_model_desc.closure[comp_model_desc.host_gather.src_idx - comp_model_desc.param_base];
466466
const auto& lport = comp_model_desc.compiled_model->inputs()[comp_model_desc.host_gather.idx_idx];
467467
const auto lookup = request->get_tensor(lport);
468+
468469
ov::npuw::util::gather(ov::get_tensor_impl(vocab), lookup, gather);
469470
}
470471

472+
// Run host-side quantized gather, if required
473+
handle_quant_host_gather(idx, request);
474+
471475
LOG_DEBUG("Done");
472476
}
473477

478+
// Performs the host-side "quantized gather" step for subgraph `idx`, if that
// subgraph was configured with a quant_unpack_gather mapping: rows of the
// still-quantized vocab tensors are gathered on the host using the lookup
// indices, then unpacked (dequantized) directly into the destination input
// tensor of the compiled submodel. No-op when dst_idx == -1.
void ov::npuw::IBaseInferRequest::handle_quant_host_gather(std::size_t idx, RqPtr request) {
    auto& comp_model_desc = m_npuw_model->m_compiled_submodels[idx];

    // dst_idx == -1 means this subgraph carries no quantized-gather mapping
    if (comp_model_desc.quant_unpack_gather.dst_idx != -1) {
        // A valid mapping always has the lookup-ids input and the weight vocab input
        NPUW_ASSERT(comp_model_desc.quant_unpack_gather.idx_idx != -1 &&
                    comp_model_desc.quant_unpack_gather.src_w_idx != -1);

        // Lookup (token ids) input tensor
        const auto& lport = comp_model_desc.compiled_model->inputs()[comp_model_desc.quant_unpack_gather.idx_idx];
        const auto& lookup = request->get_tensor(lport);

        // Destination input tensor - receives the gathered, dequantized rows
        const auto& gport = comp_model_desc.compiled_model->inputs()[comp_model_desc.quant_unpack_gather.dst_idx];
        const auto& gather = request->get_tensor(gport);

        // Quantized weight vocab input tensor
        const auto& wport = comp_model_desc.compiled_model->inputs()[comp_model_desc.quant_unpack_gather.src_w_idx];
        const auto& vocabw = request->get_tensor(wport);

        // NOTE(review): assumes the ids tensor is at least 2D with the sequence
        // length at dim 1 (shape like [1, seq_len]) - confirm against the
        // HostGatherQuant* matchers that produce this mapping
        auto ids_shape = lookup->get_shape();

        // Shape of a gathered tensor: [1, n_ids, row_elems], where row_elems
        // flattens the trailing dims of a possibly group-quantized (3D) vocab
        auto get_gathered_shape = [&ids_shape](const ov::Shape& shape) {
            return ov::Shape{1, ids_shape[1], shape.size() == 3 ? shape[1] * shape[2] : shape[1]};
        };

        ov::Tensor gatherw(vocabw->get_element_type(), get_gathered_shape(vocabw->get_shape()));
        // Gather weight
        ov::npuw::util::gather(vocabw, lookup, ov::get_tensor_impl(gatherw));

        if (comp_model_desc.quant_unpack_gather.src_z_idx != -1 &&
            comp_model_desc.quant_unpack_gather.src_s_idx != -1) {
            // Asymmetric case: weights + zero points + scales
            const auto& zport = comp_model_desc.compiled_model->inputs()[comp_model_desc.quant_unpack_gather.src_z_idx];
            const auto& vocabz = request->get_tensor(zport);

            const auto& sport = comp_model_desc.compiled_model->inputs()[comp_model_desc.quant_unpack_gather.src_s_idx];
            const auto& vocabs = request->get_tensor(sport);

            ov::Tensor gatherz(vocabz->get_element_type(), get_gathered_shape(vocabz->get_shape()));
            ov::Tensor gathers(vocabs->get_element_type(), get_gathered_shape(vocabs->get_shape()));
            // Gather first
            ov::npuw::util::gather(vocabz, lookup, ov::get_tensor_impl(gatherz));
            ov::npuw::util::gather(vocabs, lookup, ov::get_tensor_impl(gathers));

            // Then unpack
            ov::npuw::util::unpack(ov::get_tensor_impl(gatherw),
                                   ov::get_tensor_impl(gatherz),
                                   ov::get_tensor_impl(gathers),
                                   gather);
        } else if (comp_model_desc.quant_unpack_gather.src_s_idx != -1) {
            // Symmetric case: weights + scales only
            const auto& sport = comp_model_desc.compiled_model->inputs()[comp_model_desc.quant_unpack_gather.src_s_idx];
            const auto& vocabs = request->get_tensor(sport);

            ov::Tensor gathers(vocabs->get_element_type(), get_gathered_shape(vocabs->get_shape()));
            // Gather first
            ov::npuw::util::gather(vocabs, lookup, ov::get_tensor_impl(gathers));

            // Then unpack
            ov::npuw::util::unpack(ov::get_tensor_impl(gatherw), ov::get_tensor_impl(gathers), gather);
        } else {
            // No zero points / scales in the mapping:
            // Already gathered above - just unpack
            ov::npuw::util::unpack(ov::get_tensor_impl(gatherw), gather);
        }
    }
}
539+
474540
void ov::npuw::IBaseInferRequest::bind_global_results(std::size_t idx, RqPtr request) {
475541
LOG_DEBUG("Binding results for Subgraph[" << idx << "]");
476542
LOG_BLOCK();

src/plugins/intel_npu/src/plugin/npuw/base_sync_infer_request.hpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -149,6 +149,7 @@ class IBaseInferRequest : public ov::ISyncInferRequest {
149149
void unpack_closure(std::size_t idx, RqPtr request);
150150
virtual void bind_global_params(std::size_t idx, RqPtr request);
151151
virtual void bind_global_results(std::size_t idx, RqPtr request);
152+
void handle_quant_host_gather(std::size_t idx, RqPtr request);
152153

153154
void dump_input_tensors(std::size_t idx);
154155
void dump_output_tensors(std::size_t idx);

src/plugins/intel_npu/src/plugin/npuw/compiled_model.cpp

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -367,6 +367,7 @@ ov::npuw::CompiledModel::CompiledModel(const std::shared_ptr<ov::Model>& model,
367367
LOG_INFO("Subgraph[" << id << "] is a function call to [" << compiled_fcn_iter->second << "]");
368368
}
369369
m_compiled_submodels[id].host_gather = subgraph._host_gather;
370+
m_compiled_submodels[id].quant_unpack_gather = subgraph._quant_unpack_gather;
370371
m_compiled_submodels[id].param_base = fcn_template._param_offset;
371372
m_compiled_submodels[id].closure = subgraph._closure;
372373
m_compiled_submodels[id].lazy_closure = subgraph._lazy_closure;
@@ -541,6 +542,12 @@ void ov::npuw::CompiledModel::CompiledModelDesc::serialize(std::ostream& stream,
541542
write(stream, host_gather.src_idx);
542543
write(stream, host_gather.idx_idx);
543544

545+
write(stream, quant_unpack_gather.dst_idx);
546+
write(stream, quant_unpack_gather.src_w_idx);
547+
write(stream, quant_unpack_gather.src_z_idx);
548+
write(stream, quant_unpack_gather.src_s_idx);
549+
write(stream, quant_unpack_gather.idx_idx);
550+
544551
write(stream, spatial);
545552

546553
write(stream, is_remote);
@@ -609,6 +616,12 @@ void ov::npuw::CompiledModel::CompiledModelDesc::deserialize(std::istream& strea
609616
read(stream, host_gather.src_idx);
610617
read(stream, host_gather.idx_idx);
611618

619+
read(stream, quant_unpack_gather.dst_idx);
620+
read(stream, quant_unpack_gather.src_w_idx);
621+
read(stream, quant_unpack_gather.src_z_idx);
622+
read(stream, quant_unpack_gather.src_s_idx);
623+
read(stream, quant_unpack_gather.idx_idx);
624+
612625
read(stream, spatial);
613626

614627
read(stream, is_remote);
@@ -1699,6 +1712,7 @@ void ov::npuw::CompiledModel::implement_properties() {
16991712
BIND(npuw::partitioning::spatial_nway, NPUW_SPATIAL_NWAY),
17001713
BIND(npuw::partitioning::spatial_dyn, NPUW_SPATIAL_DYN),
17011714
BIND(npuw::partitioning::host_gather, NPUW_HOST_GATHER),
1715+
BIND(npuw::partitioning::gather_quant, NPUW_HOST_GATHER_QUANT),
17021716
BIND(npuw::partitioning::funcall_for_all, NPUW_FUNCALL_FOR_ALL),
17031717
BIND(npuw::partitioning::f16_interconnect, NPUW_F16IC),
17041718
BIND(npuw::partitioning::dcoff_type, NPUW_DCOFF_TYPE),

src/plugins/intel_npu/src/plugin/npuw/compiled_model.hpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -152,6 +152,7 @@ class CompiledModel : public ov::npuw::ICompiledModel {
152152
std::optional<std::size_t> replaced_by;
153153

154154
Subgraph::Gather host_gather;
155+
Subgraph::QuantUnpackGather quant_unpack_gather;
155156
std::optional<ov::npuw::compiled::Spatial> spatial;
156157

157158
// FIXME: This is a 1:1 copy of the ov::npuw::Subgraph structure

src/plugins/intel_npu/src/plugin/npuw/partitioning/partitioning.cpp

Lines changed: 120 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -322,6 +322,7 @@ class Partitioner {
322322
void saveTinyConstants(const std::string& func_name);
323323
void saveScaleFactors(const std::string& func_name);
324324
void saveRepeatedConstants(const std::string& func_name);
325+
void saveTailDictConstants(const std::string& func_name);
325326
void matchParameters(const std::string& func_name);
326327
void matchResults(const std::string& func_name);
327328
void createFunction(const std::string& func_name);
@@ -1430,6 +1431,42 @@ void Partitioner::saveRepeatedConstants(const std::string& func_name) {
14301431
}
14311432
}
14321433

1434+
void Partitioner::saveTailDictConstants(const std::string& func_name) {
1435+
if (!cfg.get<::intel_npu::NPUW_HOST_GATHER_QUANT>()) {
1436+
// No need to preserve as constants
1437+
return;
1438+
}
1439+
1440+
// Depending on the config we might want to save vocab in the tail subgraph as a Constant.
1441+
auto& func_group = all_functions.at(func_name);
1442+
auto& subgr_group = func_group.refs;
1443+
1444+
if (subgr_group.size() > 1) {
1445+
// Skip the repeated block
1446+
return;
1447+
}
1448+
1449+
LOG_VERB("Trying to preserve some (tail) constants for " << func_name << " in model " << model->get_friendly_name()
1450+
<< "...");
1451+
LOG_BLOCK();
1452+
1453+
auto& model_group = func_group.mdls;
1454+
1455+
using CPtr = std::shared_ptr<ov::op::v0::Constant>;
1456+
std::vector<CPtr> to_keep;
1457+
1458+
ov::pass::GraphRewrite rewr;
1459+
rewr.add_matcher<ov::npuw::patterns::opt::PreserveConstDictMatMulCWu>(std::ref(to_keep));
1460+
rewr.add_matcher<ov::npuw::patterns::opt::PreserveConstDictMatMulCWf8>(std::ref(to_keep));
1461+
rewr.run_on_model(model_group.front());
1462+
1463+
for (auto&& const_to_keep : to_keep) {
1464+
LOG_DEBUG("[KEEP] " << const_to_keep);
1465+
func_group.consts_to_keep.insert(const_to_keep);
1466+
}
1467+
LOG_VERB("Done");
1468+
}
1469+
14331470
void Partitioner::matchParameters(const std::string& func_name) {
14341471
LOG_VERB("Matching parameters for function " << func_name << " in model " << model->get_friendly_name() << "...");
14351472
LOG_BLOCK();
@@ -1874,12 +1911,20 @@ void Partitioner::optimize(const std::string& func_name) {
18741911
ctx.is_spatial = f._spatial.has_value();
18751912
ctx.pmm_dims = cfg.get<::intel_npu::NPUW_PMM>();
18761913

1914+
if (cfg.get<::intel_npu::NPUW_HOST_GATHER_QUANT>() && cfg.get<::intel_npu::NPUW_HOST_GATHER>()) {
1915+
NPUW_ASSERT(false && "Conflicting configuration: NPUW_HOST_GATHER and NPUW_HOST_GATHER_QUANT should not be "
1916+
"enabled together!");
1917+
}
1918+
18771919
// Run Head/Tail passes
18781920
ov::pass::GraphRewrite rewr;
1879-
rewr.add_matcher<ov::npuw::patterns::opt::DQUnpackDictGatheru>(std::ref(ctx));
1880-
rewr.add_matcher<ov::npuw::patterns::opt::DQUnpackDictGatherGQi>(std::ref(ctx));
1881-
rewr.add_matcher<ov::npuw::patterns::opt::DQUnpackDictMatMulCWu>(std::ref(ctx));
1882-
rewr.add_matcher<ov::npuw::patterns::opt::DQUnpackDictMatMulCWf8>(std::ref(ctx));
1921+
if (!cfg.get<::intel_npu::NPUW_HOST_GATHER_QUANT>()) {
1922+
rewr.add_matcher<ov::npuw::patterns::opt::DQUnpackDictGatheru>(std::ref(ctx));
1923+
rewr.add_matcher<ov::npuw::patterns::opt::DQUnpackDictGatherGQi>(std::ref(ctx));
1924+
rewr.add_matcher<ov::npuw::patterns::opt::DQUnpackDictMatMulCWu>(std::ref(ctx));
1925+
rewr.add_matcher<ov::npuw::patterns::opt::DQUnpackDictMatMulCWf8>(std::ref(ctx));
1926+
}
1927+
18831928
// NB: This pass is disabled for reason! It doesn't make things better
18841929
// rewr.add_matcher<ov::npuw::patterns::opt::DQUnpackDictMatMulGQi>(std::ref(ctx));
18851930
rewr.add_matcher<ov::npuw::patterns::opt::CompressDictMatMulf32>(std::ref(ctx));
@@ -1888,6 +1933,29 @@ void Partitioner::optimize(const std::string& func_name) {
18881933
rewr.add_matcher<ov::npuw::patterns::opt::ConvToMatmul>(std::ref(ctx));
18891934
rewr.run_on_model(f._model);
18901935

1936+
// Quantized Gather + Unpack on host in the runtime
1937+
if (cfg.get<::intel_npu::NPUW_HOST_GATHER_QUANT>()) {
1938+
// FIXME: since we are running it after lifted Gather,
1939+
// we need to first try to match Asymm or Symm patterns.
1940+
// Otherwise smaller HostGatherQuant might be matched first and break
1941+
// the quantization logic.
1942+
{
1943+
ov::pass::GraphRewrite rewr2;
1944+
rewr2.add_matcher<ov::npuw::patterns::opt::HostGatherQuantAsymm>(std::ref(ctx));
1945+
rewr2.run_on_model(f._model);
1946+
}
1947+
{
1948+
ov::pass::GraphRewrite rewr2;
1949+
rewr2.add_matcher<ov::npuw::patterns::opt::HostGatherQuantSymm>(std::ref(ctx));
1950+
rewr2.run_on_model(f._model);
1951+
}
1952+
{
1953+
ov::pass::GraphRewrite rewr2;
1954+
rewr2.add_matcher<ov::npuw::patterns::opt::HostGatherQuant>(std::ref(ctx));
1955+
rewr2.run_on_model(f._model);
1956+
}
1957+
}
1958+
18911959
// Move Gather to host, if required
18921960
if (cfg.get<::intel_npu::NPUW_HOST_GATHER>()) {
18931961
ov::pass::GraphRewrite rewr2;
@@ -1992,6 +2060,30 @@ void Partitioner::optimize(const std::string& func_name) {
19922060
}
19932061
}
19942062

2063+
// Host-side quantized gather, pt 1. Add new parameters first
2064+
if (ctx.params_to_quant_gather_unpack) {
2065+
auto& params_to_quant_gather_unpack = *ctx.params_to_quant_gather_unpack;
2066+
for (const auto& param_new_and_unpack : params_to_quant_gather_unpack.params_to_runtime_unpack_gather) {
2067+
// New input in the graph
2068+
new_params.push_back(param_new_and_unpack.first);
2069+
// Note: don't remove w, z and s params here to keep them shared with the quant vocab in tail
2070+
for (auto&& funcall : func_group.refs) {
2071+
auto new_elem_type = param_new_and_unpack.first->get_element_type();
2072+
const auto& new_shape = param_new_and_unpack.first->get_shape();
2073+
// Note: no allocation needed for this tensor - set to _closure and dummy in _lazy_closure
2074+
// FIXME: It turns out this tensor will be completely unused.
2075+
// It will just sit in the memory to do nothing.
2076+
// Most likely it may stay empty since we need a 1:1 matching between
2077+
// closure tensors and parameters (minus base).
2078+
// Based on our logic (when tensors get transferred from lazy tensors via bank
2079+
// to the closure), this tensor should be non-empty to avoid this process.
2080+
funcall.get()._closure.push_back(ov::Tensor(new_elem_type, new_shape));
2081+
funcall.get()._lazy_closure.push_back(LazyTensor());
2082+
funcall.get()._is_lazy_unpack.push_back(false);
2083+
}
2084+
}
2085+
}
2086+
19952087
// Add all new parameters introduced by this change
19962088
f._model->add_parameters(new_params);
19972089

@@ -2031,6 +2123,29 @@ void Partitioner::optimize(const std::string& func_name) {
20312123
}
20322124
}
20332125

2126+
// Host-side quantized gather, pt. 2: Write the gather mappings to funcall
2127+
if (ctx.params_to_quant_gather_unpack) {
2128+
auto& params_to_quant_gather_unpack = *ctx.params_to_quant_gather_unpack;
2129+
for (const auto& param_new_and_unpack_gather :
2130+
params_to_quant_gather_unpack.params_to_runtime_unpack_gather) {
2131+
// New param in the graph
2132+
auto gather_dst_id = f._model->get_parameter_index(param_new_and_unpack_gather.first);
2133+
// Orig params to gather from
2134+
auto gather_w_id = f._model->get_parameter_index(param_new_and_unpack_gather.second.w);
2135+
auto gather_z_id = f._model->get_parameter_index(param_new_and_unpack_gather.second.z);
2136+
auto gather_s_id = f._model->get_parameter_index(param_new_and_unpack_gather.second.s);
2137+
// Original pids
2138+
auto gather_idx_id = f._model->get_parameter_index(params_to_quant_gather_unpack.pids);
2139+
for (auto&& funcall : func_group.refs) {
2140+
funcall.get()._quant_unpack_gather = ov::npuw::Subgraph::QuantUnpackGather{gather_dst_id,
2141+
gather_w_id,
2142+
gather_z_id,
2143+
gather_s_id,
2144+
gather_idx_id};
2145+
}
2146+
}
2147+
}
2148+
20342149
// FIXME: workaround
20352150
// Set lazy unpack indexes not to be unpacked in DCOFF
20362151
for (auto&& fref : func_group.refs) {
@@ -2344,6 +2459,7 @@ ov::npuw::Partitioning ov::npuw::getPartitioning(const std::shared_ptr<ov::Model
23442459
p.propagateConvertsOut(func_group);
23452460
p.sanityCheck(func_group);
23462461
p.saveRepeatedConstants(func_group);
2462+
p.saveTailDictConstants(func_group);
23472463
p.matchParameters(func_group);
23482464
p.matchResults(func_group);
23492465
p.matchRepeatedSubgraphs(func_group);

src/plugins/intel_npu/src/plugin/npuw/partitioning/partitioning.hpp

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -59,6 +59,17 @@ struct Subgraph {
5959
};
6060
Gather _host_gather;
6161

62+
// Describes a host-side "gather + unpack" of a quantized vocab: at run time,
// rows are gathered from the quantized weight (and, when present, zero-point
// and scale) inputs using the lookup ids, dequantized, and written into the
// destination input. Each field is an index into the compiled submodel's
// input list; -1 means "not present".
struct QuantUnpackGather {
    int64_t dst_idx = -1;  // destination input - receives the dequantized gathered rows

    int64_t src_w_idx = -1;  // quantized weight vocab input (required when dst_idx is set)
    int64_t src_z_idx = -1;  // zero-points input (asymmetric quantization only)
    int64_t src_s_idx = -1;  // scales input (absent for plain unpack)

    int64_t idx_idx = -1;  // lookup (token ids) input (required when dst_idx is set)
};
QuantUnpackGather _quant_unpack_gather;
72+
6273
using Ref = std::reference_wrapper<Subgraph>;
6374
};
6475

0 commit comments

Comments
 (0)