
Commit a16d2f0

feat: --gpu-segments (#204)
1 parent 68ade91 commit a16d2f0

6 files changed: +82 -29 lines changed


src/app.cpp

Lines changed: 32 additions & 15 deletions
@@ -43,6 +43,9 @@ AppCliArgs AppCliArgs::parse(int argc, char* *argv, bool requireMode) {
     args.maxSeqLen = 0;
     args.netTurbo = true;
     args.gpuIndex = -1;
+    args.gpuSegmentFrom = -1;
+    args.gpuSegmentTo = -1;
+
     int i = 1;
     if (requireMode && argc > 1) {
         args.mode = argv[1];
@@ -79,15 +82,15 @@ AppCliArgs AppCliArgs::parse(int argc, char* *argv, bool requireMode) {
 
             for (int s = 0; s < count; s++) {
                 char *v = argv[i + 1 + s];
-                char *sep = std::strstr(v, ":");
-                if (sep == NULL) {
+                char *separator = std::strstr(v, ":");
+                if (separator == NULL) {
                     throw std::runtime_error("Invalid worker address: " + std::string(v));
                 }
-                int hostLen = sep - v;
+                int hostLen = separator - v;
                 args.workerHosts[s] = new char[hostLen + 1];
                 std::memcpy(args.workerHosts[s], v, hostLen);
                 args.workerHosts[s][hostLen] = '\0';
-                args.workerPorts[s] = std::atoi(sep + 1);
+                args.workerPorts[s] = std::atoi(separator + 1);
             }
 
             i += count - 1;
@@ -109,6 +112,12 @@ AppCliArgs AppCliArgs::parse(int argc, char* *argv, bool requireMode) {
             args.maxSeqLen = (unsigned int)atoi(value);
         } else if (std::strcmp(name, "--gpu-index") == 0) {
             args.gpuIndex = atoi(value);
+        } else if (std::strcmp(name, "--gpu-segments") == 0) {
+            char *separator = std::strstr(value, ":");
+            if (separator == NULL)
+                throw std::runtime_error("GPU segments expected in the format <from>:<to>");
+            args.gpuSegmentFrom = atoi(value);
+            args.gpuSegmentTo = atoi(separator + 1);
         } else if (std::strcmp(name, "--net-turbo") == 0) {
             args.netTurbo = atoi(value) == 1;
         } else {
@@ -128,23 +137,32 @@ AppCliArgs::~AppCliArgs() {
         delete[] workerPorts;
 }
 
-static NnDevice *createDevice(AppCliArgs *args, NnNetConfig *netConfig, NnNodeConfig *nodeConfig, NnNetExecution *netExecution) {
+static std::vector<NnExecutorDevice> resolveDevices(AppCliArgs *args, NnNetConfig *netConfig, NnNodeConfig *nodeConfig, NnNetExecution *netExecution) {
+    std::vector<NnExecutorDevice> devices;
+
     if (args->gpuIndex >= 0) {
 #if defined(DLLAMA_VULKAN)
-        return new NnVulkanDevice(args->gpuIndex, netConfig, nodeConfig, netExecution);
+        devices.push_back(NnExecutorDevice(
+            new NnVulkanDevice(args->gpuIndex, netConfig, nodeConfig, netExecution),
+            args->gpuSegmentFrom,
+            args->gpuSegmentTo
+        ));
 #else
         throw std::runtime_error("This build does not support GPU");
 #endif
     }
-    return new NnCpuDevice(netConfig, nodeConfig, netExecution);
+
+    if (args->gpuIndex < 0 || (args->gpuSegmentFrom >= 0 && args->gpuSegmentTo >= 0)) {
+        devices.push_back(NnExecutorDevice(new NnCpuDevice(netConfig, nodeConfig, netExecution), -1, -1));
+    }
+    return devices;
 }
 
-RootLlmInference::RootLlmInference(LlmNet *net, NnDevice *device, NnNetExecution *execution, NnExecutor *executor, NnNetwork *network) {
+RootLlmInference::RootLlmInference(LlmNet *net, NnNetExecution *execution, NnExecutor *executor, NnNetwork *network) {
     this->header = net->header;
     this->tokenPipe = (float *)execution->pipes[net->tokenPipeIndex];
     this->positionPipe = (float *)execution->pipes[net->positionPipeIndex];
     this->logitsPipe = (float *)execution->pipes[net->logitsPipeIndex];
-    this->device = device;
     this->execution = execution;
     this->executor = executor;
     this->network = network; // May be nullptr!
@@ -245,13 +263,13 @@ void runInferenceApp(AppCliArgs *args, void (*handler)(AppInferenceContext *cont
         configWriter.writeToWorkers(&net.netConfig, net.nodeConfigs);
     }
 
-    std::unique_ptr<NnDevice> device(createDevice(args, &net.netConfig, rootNodeConfig, &execution));
-    NnExecutor executor(&net.netConfig, rootNodeConfig, device.get(), &execution, synchronizer.get(), args->benchmark);
+    std::vector<NnExecutorDevice> devices = resolveDevices(args, &net.netConfig, rootNodeConfig, &execution);
+    NnExecutor executor(&net.netConfig, rootNodeConfig, &devices, &execution, synchronizer.get(), args->benchmark);
 
     NnRootWeightLoader weightLoader(&executor, network, nNodes);
     loadLlmNetWeight(args->modelPath, &net, &weightLoader);
 
-    RootLlmInference inference(&net, device.get(), &execution, &executor, network);
+    RootLlmInference inference(&net, &execution, &executor, network);
 
     if (network != nullptr) {
         network->resetStats();
@@ -290,10 +308,9 @@ void runWorkerApp(AppCliArgs *args) {
 
     NnNetExecution execution(args->nThreads, &netConfig);
 
-    std::unique_ptr<NnDevice> device(createDevice(args, &netConfig, &nodeConfig, &execution));
-
+    std::vector<NnExecutorDevice> devices = resolveDevices(args, &netConfig, &nodeConfig, &execution);
     NnNetworkNodeSynchronizer synchronizer(network, &execution, &netConfig, &nodeConfig);
-    NnExecutor executor(&netConfig, &nodeConfig, device.get(), &execution, &synchronizer, false);
+    NnExecutor executor(&netConfig, &nodeConfig, &devices, &execution, &synchronizer, false);
 
     NnWorkerWeightReader weightReader(&executor, network);
     weightReader.read();
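
A side note on the new flag's value format: --gpu-segments takes a <from>:<to> pair, split on the first ':' exactly as the branch above does (atoi stops at the separator). Below is a minimal standalone sketch of that parsing; the helper name parseGpuSegments and the main() driver are hypothetical, not part of this commit.

#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <stdexcept>

// Hypothetical helper mirroring the --gpu-segments branch in AppCliArgs::parse:
// the value must contain ':'; the text before it is <from>, the text after is <to>.
static void parseGpuSegments(const char *value, int *from, int *to) {
    const char *separator = std::strstr(value, ":");
    if (separator == NULL)
        throw std::runtime_error("GPU segments expected in the format <from>:<to>");
    *from = std::atoi(value);        // atoi reads digits up to the ':'
    *to = std::atoi(separator + 1);  // and the remainder after it
}

int main() {
    int from, to;
    parseGpuSegments("0:15", &from, &to);
    std::printf("from=%d to=%d\n", from, to); // prints: from=0 to=15
    return 0;
}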

src/app.hpp

Lines changed: 3 additions & 2 deletions
@@ -31,6 +31,8 @@ class AppCliArgs {
     NnUint maxSeqLen;
     bool netTurbo;
     int gpuIndex;
+    int gpuSegmentFrom;
+    int gpuSegmentTo;
 
     // worker
     NnUint port;
@@ -51,13 +53,12 @@ class RootLlmInference {
     float *tokenPipe;
     float *positionPipe;
     LlmHeader *header;
-    NnDevice *device;
     NnNetExecution *execution;
     NnExecutor *executor;
     NnNetwork *network;
     LlmControlPacket controlPacket;
 public:
-    RootLlmInference(LlmNet *net, NnDevice *device, NnNetExecution *execution, NnExecutor *executor, NnNetwork *network);
+    RootLlmInference(LlmNet *net, NnNetExecution *execution, NnExecutor *executor, NnNetwork *network);
     void setBatchSize(NnUint batchSize);
     void setPosition(NnUint position);
     void setToken(NnUint batchIndex, NnUint token);

src/nn/nn-cpu-test.cpp

Lines changed: 6 additions & 3 deletions
@@ -64,10 +64,13 @@ int main() {
     for (NnUint i = 0; i < DIM; i++)
         rmsNormWeight[i] = 0.5 + i / (float)DIM;
 
-    NnCpuDevice device(&netConfig, &nodeConfig, &execution);
+    NnCpuDevice *device = new NnCpuDevice(&netConfig, &nodeConfig, &execution);
+    std::vector<NnExecutorDevice> devices;
+    devices.push_back(NnExecutorDevice(device, -1, -1));
+
     NnFakeNodeSynchronizer synchronizer;
-    float *rms = (float *)device.buffers[0];
-    NnExecutor executor(&netConfig, &nodeConfig, &device, &execution, &synchronizer, false);
+    float *rms = (float *)device->buffers[0];
+    NnExecutor executor(&netConfig, &nodeConfig, &devices, &execution, &synchronizer, false);
     executor.loadWeight("rms_norm", 0, sizeof(rmsNormWeight), (NnByte *)rmsNormWeight);
 
     execution.setBatchSize(2);

src/nn/nn-executor.cpp

Lines changed: 27 additions & 4 deletions
@@ -33,17 +33,41 @@ void NnNetExecution::setBatchSize(NnUint batchSize) {
     this->batchSize = batchSize;
 }
 
-NnExecutor::NnExecutor(NnNetConfig *netConfig, NnNodeConfig *nodeConfig, NnDevice *device, NnNetExecution *netExecution, NnNodeSynchronizer *synchronizer, bool benchmark)
+NnExecutorDevice::NnExecutorDevice(NnDevice *device, int segmentFrom, int segmentTo) {
+    this->device = std::unique_ptr<NnDevice>(device);
+    this->segmentFrom = segmentFrom;
+    this->segmentTo = segmentTo;
+}
+
+NnExecutor::NnExecutor(NnNetConfig *netConfig, NnNodeConfig *nodeConfig, std::vector<NnExecutorDevice> *devices, NnNetExecution *netExecution, NnNodeSynchronizer *synchronizer, bool benchmark)
     : segments(nodeConfig->nSegments), steps()
 {
-    NnUint maxNThreads = device->maxNThreads();
+    NnUint maxNThreads = 0;
+    for (NnExecutorDevice &d : *devices) {
+        if (d.device->maxNThreads() > maxNThreads)
+            maxNThreads = d.device->maxNThreads();
+    }
     if (netExecution->nThreads > maxNThreads)
-        throw std::invalid_argument("This device supports max " + std::to_string(maxNThreads) + " threads");
+        throw std::invalid_argument("This configuration supports max " + std::to_string(maxNThreads) + " threads");
+
     this->netExecution = netExecution;
     this->nodeConfig = nodeConfig;
 
     bool useSynchronizer = netConfig->nNodes > 1;
     for (NnUint segmentIndex = 0; segmentIndex < nodeConfig->nSegments; segmentIndex++) {
+        NnDevice *device = nullptr;
+        for (NnExecutorDevice &d : *devices) {
+            if (
+                (d.segmentFrom == -1 && d.segmentTo == -1) ||
+                (segmentIndex >= d.segmentFrom && segmentIndex <= d.segmentTo)
+            ) {
+                device = d.device.get();
+                break;
+            }
+        }
+        if (device == nullptr)
+            throw std::invalid_argument("Cannot locate device for segment " + std::to_string(segmentIndex));
+
         NnSegmentConfig *segmentConfig = &nodeConfig->segments[segmentIndex];
         if (segmentConfig->nOps > 0) {
             NnDeviceSegment *segment = device->createSegment(segmentIndex);
@@ -60,7 +84,6 @@ NnExecutor::NnExecutor(NnNetConfig *netConfig, NnNodeConfig *nodeConfig, NnDevic
 
     context.nThreads = netExecution->nThreads;
     context.synchronizer = synchronizer;
-    context.device = device;
     context.nSteps = (NnUint)steps.size();
     context.steps = steps.data();
     if (benchmark)
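
To make the routing rule above easier to see in isolation: a device registered with segmentFrom == -1 and segmentTo == -1 matches every segment, otherwise the check is an inclusive <from>..<to> range, and the first matching entry in the devices vector wins (so a GPU listed before the CPU fallback takes priority for its range). The standalone sketch below models just that loop with a simplified struct; the names SegmentRange and pickDevice are hypothetical, while the real code works on NnExecutorDevice.

#include <cstdio>
#include <vector>

// Simplified stand-in for NnExecutorDevice's routing fields (hypothetical).
struct SegmentRange {
    const char *name;
    int segmentFrom; // -1 together with segmentTo == -1 means "all segments"
    int segmentTo;
};

// Mirrors the selection loop in the NnExecutor constructor: the first device
// whose range covers segmentIndex is chosen.
static const char *pickDevice(const std::vector<SegmentRange> &devices, int segmentIndex) {
    for (const SegmentRange &d : devices) {
        if ((d.segmentFrom == -1 && d.segmentTo == -1) ||
            (segmentIndex >= d.segmentFrom && segmentIndex <= d.segmentTo))
            return d.name;
    }
    return nullptr; // the real constructor throws std::invalid_argument here
}

int main() {
    // GPU covers segments 0..1; the CPU entry (-1:-1) picks up the rest.
    std::vector<SegmentRange> devices = {{"gpu", 0, 1}, {"cpu", -1, -1}};
    for (int s = 0; s < 4; s++)
        std::printf("segment %d -> %s\n", s, pickDevice(devices, s)); // gpu, gpu, cpu, cpu
    return 0;
}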

src/nn/nn-executor.hpp

Lines changed: 9 additions & 2 deletions
@@ -51,6 +51,14 @@ enum NnExecutorStepType {
 
 #define N_STEP_TYPES STEP_SYNC_NODES + 1
 
+class NnExecutorDevice {
+public:
+    std::unique_ptr<NnDevice> device;
+    int segmentFrom;
+    int segmentTo;
+    NnExecutorDevice(NnDevice *device, int segmentFrom, int segmentTo);
+};
+
 typedef struct {
     NnExecutorStepType type;
     NnDeviceSegment *segment;
@@ -63,7 +71,6 @@ typedef struct {
     NnUint nSteps;
     NnExecutorStep *steps;
     NnNodeSynchronizer *synchronizer;
-    NnDevice *device;
     std::atomic_uint currentStepIndex;
     std::atomic_uint doneThreadCount;
     NnUint batchSize;
@@ -86,7 +93,7 @@ class NnExecutor {
     NnExecutorThread *threads;
     NnExecutorContext context;
 public:
-    NnExecutor(NnNetConfig *netConfig, NnNodeConfig *nodeConfig, NnDevice *device, NnNetExecution *netExecution, NnNodeSynchronizer *synchronizer, bool benchmark);
+    NnExecutor(NnNetConfig *netConfig, NnNodeConfig *nodeConfig, std::vector<NnExecutorDevice> *device, NnNetExecution *netExecution, NnNodeSynchronizer *synchronizer, bool benchmark);
     ~NnExecutor();
     void loadWeight(const char *name, NnUint index, NnSize nBytes, NnByte *weight);
     void forward();
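
One ownership detail worth calling out from the declaration above: NnExecutorDevice holds its NnDevice behind a std::unique_ptr, so each wrapper owns the raw pointer passed to its constructor and frees the device when the devices vector is destroyed. That is why the updated tests in this commit switch from stack-allocated devices to new-ed ones. The fragment below shows the resulting wiring pattern; it assumes netConfig, nodeConfig, execution, and synchronizer are already set up as in those tests, so it is a sketch rather than a complete program.

std::vector<NnExecutorDevice> devices;
// -1:-1 means the device handles all segments; the wrapper's unique_ptr owns it.
devices.push_back(NnExecutorDevice(new NnCpuDevice(&netConfig, &nodeConfig, &execution), -1, -1));
NnExecutor executor(&netConfig, &nodeConfig, &devices, &execution, &synchronizer, false);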

src/nn/nn-vulkan-test.cpp

Lines changed: 5 additions & 3 deletions
@@ -36,11 +36,13 @@ void execute(
     NnNetExecution execution(1, &netConfig);
 
     NnUint gpuIndex = 0;
-    NnVulkanDevice device(gpuIndex, &netConfig, &nodeConfig, &execution);
+    std::vector<NnExecutorDevice> devices;
+    NnVulkanDevice *device = new NnVulkanDevice(gpuIndex, &netConfig, &nodeConfig, &execution);
+    devices.push_back(NnExecutorDevice(device, -1, -1));
     NnFakeNodeSynchronizer synchronizer;
-    NnExecutor executor(&netConfig, &nodeConfig, &device, &execution, &synchronizer, false);
+    NnExecutor executor(&netConfig, &nodeConfig, &devices, &execution, &synchronizer, false);
 
-    execute(&executor, &execution, &device);
+    execute(&executor, &execution, device);
 }
 
 void testRmsNorm_F32_F32_F32() {
