Skip to content

Commit ba23a1c

Browse files
Merge branch 'merging_branch' into vitis_accelerator_dev
2 parents c7e1be2 + 574df99 commit ba23a1c

File tree

6 files changed

+166419
-92
lines changed

6 files changed

+166419
-92
lines changed

hls4ml/backends/vitis_accelerator/vitis_accelerator_backend.py

Lines changed: 27 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
import os
22
import sys
33
import subprocess
4+
import numpy as np
45

56
from hls4ml.backends import VitisBackend, VivadoBackend
67
from hls4ml.model.flow import get_flow, register_flow
@@ -69,8 +70,32 @@ def build(self, model, target="all"):
6970
else:
7071
raise Exception("Currently untested on non-Linux OS")
7172

72-
def predict(self, model, x):
73-
raise Exception("TODO: Needs to be implemented")
73+
def _numpy_to_dat(self, model, x):
74+
if len(model.get_input_variables()) != 1:
75+
raise Exception("Currently unsupported for multi-input/output projects")
76+
77+
# Verify numpy array of correct shape
78+
expected_shape = model.get_input_variables()[0].size()
79+
if expected_shape != x.shape[-1]:
80+
raise Exception(f'Input shape mismatch, got {x.shape}, expected (_, {expected_shape})')
81+
82+
# Write to tb_data/tb_input_features.dat
83+
input_dat = open(f'{model.config.get_output_dir()}/tb_data/tb_input_features.dat', 'w')
84+
for input in x:
85+
newline = " ".join(str(n) for n in input)
86+
input_dat.write(newline + '\n')
87+
input_dat.close()
88+
89+
def _dat_to_numpy(self, model):
90+
expected_shape = model.get_output_variables()[0].size()
91+
output_file = f'{model.config.get_output_dir()}/tb_data/hw_results.dat'
92+
y = np.loadtxt(output_file, dtype=float).reshape(-1, expected_shape)
93+
return y
94+
95+
def hardware_predict(self, model, x):
96+
self._numpy_to_dat(model, x)
97+
os.system("./host build/kernel_wrapper.xclbin")
98+
return self._dat_to_numpy(model)
7499

75100
def _register_flows(self):
76101
validation_passes = [

hls4ml/model/graph.py

Lines changed: 8 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -734,9 +734,6 @@ def _compute_n_samples(self, x):
734734
return int(n_sample)
735735

736736
def predict(self, x):
737-
if self.config.config.get('Backend', 'Vivado') == 'VitisAccelerator':
738-
return self.config.backend.predict(self, x)
739-
740737
top_function, ctype = self._get_top_function(x)
741738
n_samples = self._compute_n_samples(x)
742739
n_inputs = len(self.get_input_variables())
@@ -862,6 +859,14 @@ class TraceData(ctypes.Structure):
862859
else:
863860
return output, trace_output
864861

862+
def hardware_predict(self, x):
863+
"""Currently only supported for VitisAccelerator backend"""
864+
backend = self.config.config.get('Backend', 'Vivado')
865+
if backend != 'VitisAccelerator':
866+
raise Exception(f"Function unsupported for {backend} backend")
867+
868+
return self.config.backend.hardware_predict(self, x)
869+
865870
def build(self, **kwargs):
866871
"""Builds the generated project using HLS compiler.
867872

hls4ml/templates/vitis_accelerator/myproject_host_cl.cpp

Lines changed: 17 additions & 44 deletions
Original file line numberDiff line numberDiff line change
@@ -28,76 +28,52 @@ int main(int argc, char **argv) {
2828
std::cout << "Usage: " << argv[0] << " <XCLBIN Filename>" << std::endl;
2929
return EXIT_FAILURE;
3030
}
31-
3231
std::string xclbinFilename = argv[1];
3332

3433
/*FPGATYPE*/<in_buffer_t, out_buffer_t> fpga(INSTREAMSIZE, OUTSTREAMSIZE, NUM_CU, NUM_THREAD, 100);
3534

3635
std::vector<cl::Device> devices = xcl::get_xil_devices(); // Utility API that finds xilinx platforms and return a list of devices connected to Xilinx platforms
37-
3836
cl::Program::Binaries bins = xcl::import_binary_file(xclbinFilename); // Load xclbin
39-
4037
fpga.initializeOpenCL(devices, bins);
4138

4239
fpga.allocateHostMemory(NUM_CHANNEL);
4340

44-
std::cout << "Loading input data from tb_data/tb_input_features.dat"
45-
<< "and output predictions from tb_data/tb_output_features.dat" << std::endl;
46-
47-
std::cout << "Writing output predictions to tb_data/tb_output_predictions.dat" << std::endl;
48-
49-
std::ifstream fpr("tb_data/tb_output_predictions.dat");
41+
std::cout << "Loading input data from tb_data/tb_input_features.dat" << std::endl;
5042
std::ifstream fin("tb_data/tb_input_features.dat");
51-
5243
if (!fin.is_open()) {
5344
std::cerr << "Error: Could not open tb_input_features.dat" << std::endl;
5445
}
55-
56-
if (!fpr.is_open()) {
57-
std::cerr << "Error: Could not open tb_output_predictions.dat" << std::endl;
58-
}
59-
6046
std::vector<in_buffer_t> inputData;
61-
std::vector<out_buffer_t> outputPredictions;
62-
if (fin.is_open() && fpr.is_open()) {
63-
int e = 0;
47+
int num_inputs = 0;
48+
if (fin.is_open()) {
6449
std::string iline;
65-
std::string pline;
66-
while (std::getline(fin, iline) && std::getline(fpr, pline)) {
67-
if (e % 10 == 0) {
68-
std::cout << "Processing input/prediction " << e << std::endl;
50+
while (std::getline(fin, iline)) {
51+
if (num_inputs % 10 == 0) {
52+
std::cout << "Processing input " << num_inputs << std::endl;
6953
}
7054
std::stringstream in(iline);
71-
std::stringstream pred(pline);
7255
std::string token;
7356
while (in >> token) {
7457
in_buffer_t tmp = stof(token);
7558
inputData.push_back(tmp);
7659
}
77-
while (pred >> token) {
78-
out_buffer_t tmp = stof(token);
79-
outputPredictions.push_back(tmp);
80-
}
60+
num_inputs++;
8161
}
82-
e++;
8362
}
8463

8564
// Copying in testbench data
86-
int n = std::min((int) inputData.size(), INSTREAMSIZE * NUM_CU * NUM_THREAD);
87-
for (int i = 0; i < n; i++) {
88-
fpga.source_in[i] = inputData[i];
89-
}
65+
int num_samples = std::min(num_inputs, BATCHSIZE * NUM_CU * NUM_THREAD);
66+
memcpy(fpga.source_in.data(), inputData.data(), num_samples * DATA_SIZE_IN * sizeof(in_buffer_t));
9067

9168
// Padding rest of buffer with arbitrary values
92-
for (int i = n; i < INSTREAMSIZE * NUM_CU * NUM_THREAD; i++) {
93-
fpga.source_in[i] = (in_buffer_t)(1234.567);
69+
for (int i = num_samples * DATA_SIZE_IN; i < INSTREAMSIZE * NUM_CU * NUM_THREAD; i++) {
70+
fpga.source_in[i] = (in_buffer_t)(2.345678);
9471
}
9572

9673
std::vector<std::thread> hostAccelerationThreads;
9774
hostAccelerationThreads.reserve(NUM_THREAD);
9875

9976
std::cout << "Beginning FPGA run" << std::endl;
100-
10177
auto ts_start = SClock::now();
10278

10379
for (int i = 0; i < NUM_THREAD; i++) {
@@ -114,21 +90,18 @@ int main(int argc, char **argv) {
11490
float throughput = (float(NUM_CU * NUM_THREAD * 100 * BATCHSIZE) /
11591
float(std::chrono::duration_cast<std::chrono::nanoseconds>(ts_end - ts_start).count())) *
11692
1000000000.;
117-
118-
std::cout << "Throughput = "
119-
<< throughput
120-
<<" predictions/second\n" << std::endl;
93+
std::cout << "Throughput = " << throughput <<" predictions/second\n" << std::endl;
12194

122-
std::cout << "Writing hw resaults to file" << std::endl;
95+
std::cout << "Writing hw results to file" << std::endl;
12396
std::ofstream resultsFile;
12497
resultsFile.open("tb_data/hw_results.dat", std::ios::trunc);
12598
if (resultsFile.is_open()) {
126-
for (int i = 0; i < NUM_THREAD * NUM_CU * BATCHSIZE; i++) {
127-
std::stringstream line;
99+
for (int i = 0; i < num_samples; i++) {
100+
std::stringstream oline;
128101
for (int n = 0; n < DATA_SIZE_OUT; n++) {
129-
line << (float)fpga.source_hw_results[(i * DATA_SIZE_OUT) + n] << " ";
102+
oline << (float)fpga.source_hw_results[(i * DATA_SIZE_OUT) + n] << " ";
130103
}
131-
resultsFile << line.str() << "\n";
104+
resultsFile << oline.str() << "\n";
132105
}
133106
resultsFile.close();
134107
} else {

hls4ml/writer/vitis_accelerator_writer.py

Lines changed: 25 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
import os
2-
from shutil import copy, copytree
2+
from shutil import copy, copytree, rmtree
33

44
from hls4ml.writer.vitis_writer import VitisWriter
55

@@ -82,6 +82,10 @@ def write_kernel(self, model):
8282
fout_header = open(f'{model.config.get_output_dir()}/kernel_wrapper.h', 'w')
8383
model_inputs = model.get_input_variables()
8484
model_outputs = model.get_output_variables()
85+
if len(model_inputs) != 1 or len(model_outputs) != 1:
86+
raise Exception("Accelerator currently only supports projects with a single input and a single output variable")
87+
inp = model_inputs[0]
88+
out = model_outputs[0]
8589
for line in f_header.readlines():
8690
if '// hls-fpga-machine-learning accelerator parameters' in line:
8791
newline = ''
@@ -96,27 +100,25 @@ def write_kernel(self, model):
96100
elif '// hls-fpga-machine-learning accelerator io' in line:
97101
newline = ''
98102
if io_type == 'io_parallel':
99-
for inp in model_inputs:
100-
for out in model_outputs:
101-
newline += '#define DATA_SIZE_IN ' + format(inp.size_cpp()) + '\n'
102-
newline += '#define INSTREAMSIZE (BATCHSIZE * DATA_SIZE_IN)' + '\n\n'
103-
newline += '#define DATA_SIZE_OUT ' + format(out.size_cpp()) + '\n'
104-
newline += '#define OUTSTREAMSIZE (BATCHSIZE * DATA_SIZE_OUT)' + '\n\n'
105-
newline += 'typedef ' + format(inp.type.name) + ' in_buffer_t;\n'
106-
newline += 'typedef ' + format(out.type.name) + ' out_buffer_t;\n'
103+
newline += '#define DATA_SIZE_IN ' + format(inp.size_cpp()) + '\n'
104+
newline += '#define INSTREAMSIZE (BATCHSIZE * DATA_SIZE_IN)' + '\n\n'
105+
newline += '#define DATA_SIZE_OUT ' + format(out.size_cpp()) + '\n'
106+
newline += '#define OUTSTREAMSIZE (BATCHSIZE * DATA_SIZE_OUT)' + '\n\n'
107+
newline += 'typedef ' + format(inp.type.name) + ' in_buffer_t;\n'
108+
newline += 'typedef ' + format(out.type.name) + ' out_buffer_t;\n'
107109
elif io_type == 'io_stream':
108-
for inp in model_inputs:
109-
for out in model_outputs:
110-
(dims, _) = inp.get_shape()
111-
nnet_array_depth = dims.pop()
112-
newline += '#define DATA_SIZE_IN ' + ' * '.join(dims) + '\n'
113-
newline += '#define NNET_ARRAY_DEPTH ' + format(nnet_array_depth) + '\n'
114-
newline += '#define INSTREAMSIZE (BATCHSIZE * DATA_SIZE_IN * NNET_ARRAY_DEPTH)' + '\n\n'
115-
newline += '#define DATA_SIZE_OUT ' + format(out.size_cpp()) + '\n'
116-
newline += '#define OUTSTREAMSIZE (BATCHSIZE * DATA_SIZE_OUT)' + '\n\n'
117-
precision_str = model.config.backend.convert_precision_string(model.config.model_precision.get('default'))
118-
newline += 'typedef ' + precision_str + ' in_buffer_t;\n'
119-
newline += 'typedef ' + precision_str + ' out_buffer_t;\n'
110+
dims, _ = zip(*inp.get_shape())
111+
dims = list(dims)
112+
nnet_array_depth = dims.pop()
113+
dims.append("1")
114+
newline += '#define DATA_SIZE_IN ' + ' * '.join(dims) + '\n'
115+
newline += '#define NNET_ARRAY_DEPTH ' + format(nnet_array_depth) + '\n'
116+
newline += '#define INSTREAMSIZE (BATCHSIZE * DATA_SIZE_IN * NNET_ARRAY_DEPTH)' + '\n\n'
117+
newline += '#define DATA_SIZE_OUT ' + format(out.size_cpp()) + '\n'
118+
newline += '#define OUTSTREAMSIZE (BATCHSIZE * DATA_SIZE_OUT)' + '\n\n'
119+
precision_str = str(model.config.backend.convert_precision_string(model.config.model_precision.get('default')))
120+
newline += 'typedef ' + precision_str + ' in_buffer_t;\n'
121+
newline += 'typedef ' + precision_str + ' out_buffer_t;\n'
120122
else:
121123
newline = line
122124
fout_header.write(newline)
@@ -163,6 +165,8 @@ def write_host(self, model):
163165
# Write libraries
164166
src = os.path.join(filedir, '../templates/vitis_accelerator/libs')
165167
dst = f'{model.config.get_output_dir()}/libs'
168+
if os.path.exists(dst):
169+
rmtree(dst)
166170
copytree(src, dst, copy_function=copy)
167171

168172
def write_makefile(self, model):

0 commit comments

Comments
 (0)