
Commit 2c35c7b

Merge branch 'vitis_accelerator_dev' into merge
2 parents 9a68ddb + 55ff7e2 commit 2c35c7b

File tree: 11 files changed, +332 / -58 lines changed

hls4ml/backends/__init__.py

Lines changed: 3 additions & 5 deletions
@@ -2,20 +2,18 @@
 from hls4ml.backends.fpga.fpga_backend import FPGABackend  # noqa: F401
 from hls4ml.backends.quartus.quartus_backend import QuartusBackend
 from hls4ml.backends.symbolic.symbolic_backend import SymbolicExpressionBackend
+from hls4ml.backends.vitis_accelerator.vitis_accelerator_config import VitisAcceleratorConfig  # noqa: F401
 from hls4ml.backends.vivado.vivado_backend import VivadoBackend
 from hls4ml.backends.vivado_accelerator.vivado_accelerator_backend import VivadoAcceleratorBackend
 from hls4ml.backends.vivado_accelerator.vivado_accelerator_config import VivadoAcceleratorConfig  # noqa: F401

 from hls4ml.backends.vitis.vitis_backend import VitisBackend  # isort: skip
+from hls4ml.backends.vitis_accelerator.vitis_accelerator_backend import VitisAcceleratorBackend  # isort: skip

-#[K] start
-from hls4ml.backends.vitis_accelerator.vitis_accelerator_backend import VitisAcceleratorBackend
-from hls4ml.backends.vitis_accelerator.vitis_accelerator_config import VitisAcceleratorConfig
-#[K] end

 register_backend('Vivado', VivadoBackend)
 register_backend('VivadoAccelerator', VivadoAcceleratorBackend)
 register_backend('Vitis', VitisBackend)
-register_backend('VitisAccelerator', VitisAcceleratorBackend) #[K]
+register_backend('VitisAccelerator', VitisAcceleratorBackend)
 register_backend('Quartus', QuartusBackend)
 register_backend('SymbolicExpression', SymbolicExpressionBackend)
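
Once registered under the name 'VitisAccelerator', the backend can be selected through the standard hls4ml conversion API. A minimal sketch, not part of this commit, assuming an existing trained Keras model object named `model`:

import hls4ml

# Sketch only: `model` is assumed to be a trained Keras model.
config = hls4ml.utils.config_from_keras_model(model, granularity='model')

hls_model = hls4ml.converters.convert_from_keras_model(
    model,
    hls_config=config,
    backend='VitisAccelerator',  # the name passed to register_backend above
    output_dir='my_vitis_accel_prj',
)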

hls4ml/backends/vitis_accelerator/vitis_accelerator_backend.py

Lines changed: 50 additions & 6 deletions
@@ -1,10 +1,5 @@
-import os
-import sys
-
-from hls4ml.backends import VitisBackend
-from hls4ml.backends import VivadoBackend
+from hls4ml.backends import VitisBackend, VivadoBackend
 from hls4ml.model.flow import get_flow, register_flow
-from hls4ml.report import parse_vivado_report


 class VitisAcceleratorBackend(VitisBackend):
@@ -64,3 +59,52 @@ def _register_flows(self):
         ip_flow_requirements.insert(ip_flow_requirements.index('vivado:apply_templates'), template_flow)

         self._default_flow = register_flow('ip', None, requires=ip_flow_requirements, backend=self.name)
+
+    def create_initial_config(
+        self,
+        board='pynq-z2',
+        part=None,
+        clock_period=5,
+        io_type='io_parallel',
+        interface='axi_stream',
+        driver='python',
+        input_type='float',
+        output_type='float',
+        platform='xilinx_u250_xdma_201830_2',
+    ):
+        '''
+        Create initial accelerator config with default parameters.
+
+        Args:
+            board: one of the keys defined in supported_boards.json
+            clock_period: clock period passed to the HLS project
+            io_type: io_parallel or io_stream
+            interface: `axi_stream`: generate hardware designs and drivers which exploit AXI stream channels.
+                `axi_master`: generate hardware designs and drivers which exploit AXI master channels.
+                `axi_lite`: generate hardware designs and drivers which exploit AXI lite channels. (Do not use it
+                to exchange large amounts of data.)
+            driver: `python`: generates the Python driver to use the accelerator in the PYNQ stack.
+                `c`: generates the C driver to use the accelerator bare-metal.
+            input_type: the wrapper input precision. Can be `float` or an `ap_type`. Note: VivadoAcceleratorBackend
+                will round the number of bits used to the next power-of-2 value.
+            output_type: the wrapper output precision. Can be `float` or an `ap_type`. Note:
+                VivadoAcceleratorBackend will round the number of bits used to the next power-of-2 value.
+            platform: development target platform
+
+        Returns:
+            populated config
+        '''
+        board = board if board is not None else 'pynq-z2'
+        config = super().create_initial_config(part, clock_period, io_type)
+        config['AcceleratorConfig'] = {}
+        config['AcceleratorConfig']['Board'] = board
+        config['AcceleratorConfig']['Interface'] = interface  # axi_stream, axi_master, axi_lite
+        config['AcceleratorConfig']['Driver'] = driver
+        config['AcceleratorConfig']['Precision'] = {}
+        config['AcceleratorConfig']['Precision']['Input'] = {}
+        config['AcceleratorConfig']['Precision']['Output'] = {}
+        config['AcceleratorConfig']['Precision']['Input'] = input_type  # float, double or ap_fixed<a,b>
+        config['AcceleratorConfig']['Precision']['Output'] = output_type  # float, double or ap_fixed<a,b>
+        config['AcceleratorConfig']['Platform'] = platform
+
+        return config
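
The new create_initial_config() extends the base configuration with an AcceleratorConfig section. For orientation, a small sketch of inspecting those defaults (illustrative only; it assumes the backend registry exposed by hls4ml.backends):

from hls4ml.backends import get_backend

# Sketch only: fetch the registered backend and build its default config.
backend = get_backend('VitisAccelerator')
cfg = backend.create_initial_config()
print(cfg['AcceleratorConfig'])
# Expected keys, per the method above: 'Board', 'Interface', 'Driver',
# 'Precision' (with 'Input' and 'Output'), and 'Platform'.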

hls4ml/backends/vitis_accelerator/vitis_accelerator_config.py

Lines changed: 0 additions & 8 deletions
@@ -1,11 +1,3 @@
-import json
-import os
-
-import numpy as np
-
-from hls4ml.model.layers import FixedPrecisionType, IntegerPrecisionType
-
-
 class VitisAcceleratorConfig:
     def __init__(self, config):
         self.config = config.config
Lines changed: 230 additions & 0 deletions
@@ -0,0 +1,230 @@
+/**
+ * Copyright (C) 2019-2022 Xilinx, Inc
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License"). You may
+ * not use this file except in compliance with the License. A copy of the
+ * License is located at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+ * License for the specific language governing permissions and limitations
+ * under the License.
+ */
+
+#include <cstring>
+#include <iostream>
+
+// XRT includes
+#include "xrt/xrt_bo.h"
+#include "xrt/xrt_device.h"
+#include "xrt/xrt_kernel.h"
+
+////////////////// HLS4ML Includes start //////////////////
+
+#include <algorithm>
+#include <fstream>
+#include <iostream>
+#include <map>
+#include <math.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <vector>
+
+#include "firmware/myproject.h"
+#include "firmware/nnet_utils/nnet_helpers.h"
+
+// hls-fpga-machine-learning insert bram
+
+#define CHECKPOINT 5000
+
+namespace nnet {
+bool trace_enabled = true;
+std::map<std::string, void *> *trace_outputs = NULL;
+size_t trace_type_size = sizeof(double);
+} // namespace nnet
+
+////////////////// HLS4ML Includes end //////////////////
+
+#define DATA_SIZE 1
+
+int main(int argc, char **argv) {
+
+    // Read settings
+    std::string binaryFile = argv[1];
+    int device_index = 0;
+
+    if (argc != 2) {
+        std::cout << "Usage: " << argv[0] << " <XCLBIN File>" << std::endl;
+        return EXIT_FAILURE;
+    }
+
+    std::string target_device = "myplatform";
+    auto device_type = target_device.substr(0, target_device.size() - 17);
+    std::cout << "Device type: " << device_type << std::endl;
+    std::cout << "xrt::device size " << sizeof(xrt::device) << std::endl;
+    for (int i = 0; i < sizeof(xrt::device); i++){
+        std::cout << "device[" << i << "] name: " << xrt::device(i).get_info<xrt::info::device::name>() << "\n";
+        std::cout << "device[" << i << "] bdf: " << xrt::device(i).get_info<xrt::info::device::bdf>() << "\n\n";
+        size_t found = xrt::device(i).get_info<xrt::info::device::name>().find(device_type);
+        if (found != std::string::npos){
+            std::cout << "Device: " << xrt::device(i).get_info<xrt::info::device::name>() << " found." << std::endl;
+            device_index = i;
+            std::cout << "Device index in loop: " <<device_index << std::endl;
+            break;
+        }
+        else{
+            std::cout << "Device not found" << std::endl;
+        }
+    }
+    std::cout << "Open the device" << device_index << std::endl;
+    auto device = xrt::device(device_index);
+    std::cout << "Load the xclbin " << binaryFile << std::endl;
+    auto uuid = device.load_xclbin(binaryFile);
+
+    size_t vector_size_bytes_in = sizeof(input_t) * DATA_SIZE;
+    size_t vector_size_bytes_out = sizeof(result_t) * DATA_SIZE;
+
+    auto krnl = xrt::kernel(device, uuid, "myproject_kernel");
+
+    std::cout << "Allocate Buffer in Global Memory\n";
+    auto bo0 = xrt::bo(device, vector_size_bytes_in, krnl.group_id(0));
+    auto bo_out = xrt::bo(device, vector_size_bytes_out, krnl.group_id(1));
+
+    // Map the contents of the buffer object into host memory
+    auto bo0_map = bo0.map<input_t *>();
+    auto bo0_out_map = bo_out.map<result_t *>();
+    memset((char *)bo0_map, 0, vector_size_bytes_in);
+    memset((char *)bo0_out_map, 0, vector_size_bytes_out);
+
+    // Create the test data
+    /////////////////////////// From HLS4ML test start ///////////////////////////
+
+    // load input data from text file
+    std::ifstream fin("output_dir/tb_data/tb_input_features.dat");
+    // load predictions from text file
+    std::ifstream fpr("output_dir/tb_data/tb_output_predictions.dat");
+
+    std::string RESULTS_LOG = "output_dir/tb_data/hw_results.log";
+    std::ofstream fout(RESULTS_LOG);
+
+    std::string iline;
+    std::string pline;
+    int e = 0;
+
+    if (fin.is_open() && fpr.is_open()) {
+        while (std::getline(fin, iline) && std::getline(fpr, pline)) {
+            if (e % CHECKPOINT == 0)
+                std::cout << "Processing input " << e << std::endl;
+            char *cstr = const_cast<char *>(iline.c_str());
+            char *current;
+            std::vector<float> in;
+            current = strtok(cstr, " ");
+            while (current != NULL) {
+                in.push_back(atof(current));
+                current = strtok(NULL, " ");
+            }
+            cstr = const_cast<char *>(pline.c_str());
+            std::vector<float> pr;
+            current = strtok(cstr, " ");
+            while (current != NULL) {
+                pr.push_back(atof(current));
+                current = strtok(NULL, " ");
+            }
+            // Ensure the size of in is not greater than bo0_map size
+            size_t minSize = std::min(in.size(), static_cast<size_t>(input_t::size)); // Access size as a static member
+
+            for (size_t i = 0; i < minSize; ++i) {
+                // Perform type conversion and scale appropriately to fit within ap_fixed<16,6>
+                (*bo0_map)[i] = static_cast<ap_fixed<16, 6>>(in[i]); // Assuming in[i] fits within range of ap_fixed<16,6>
+            }
+
+            // hls-fpga-machine-learning insert top-level-function
+            //////////////////// Run on HW start ////////////////////
+            // Synchronize buffer content with device side
+            std::cout << "synchronize input buffer data to device global memory\n";
+
+            bo0.sync(XCL_BO_SYNC_BO_TO_DEVICE);
+            // bo1.sync(XCL_BO_SYNC_BO_TO_DEVICE);
+
+            std::cout << "Execution of the kernel\n";
+            // auto run = krnl(bo0, bo1, bo_out, DATA_SIZE);
+            auto run = krnl(bo0, bo_out, DATA_SIZE);
+            run.wait();
+
+            // Get the output;
+            std::cout << "Get the output data from the device" << std::endl;
+            bo_out.sync(XCL_BO_SYNC_BO_FROM_DEVICE);
+
+            // Print contents of bo0_map
+            std::cout << "Contents of bo0_map (Input):" << std::endl;
+            for (int i = 0; i < DATA_SIZE; ++i) {
+                for (size_t j = 0; j < myinput; j++) {
+                    std::cout << bo0_map[i][j] << " ";
+                }
+            }
+            std::cout << std::endl;
+
+            std::cout << "Contents of bo0_out_map (Output):" << std::endl;
+            for (int i = 0; i < DATA_SIZE; ++i) {
+                for (size_t j = 0; j < mylayer_out; j++) {
+                    std::cout << bo0_out_map[i][j] << " ";
+                }
+            }
+            std::cout << std::endl;
+            std::cout << "TEST END\n";
+            //////////////////// Run on HW end ////////////////////
+
+            if (e % CHECKPOINT == 0) {
+                std::cout << "Predictions" << std::endl;
+                // hls-fpga-machine-learning insert predictions
+                for (int i = 0; i < mylayer_out; i++) {
+                    std::cout << pr[i] << " ";
+                }
+                std::cout << std::endl;
+                std::cout << "Quantized predictions" << std::endl;
+            }
+            e++;
+        }
+
+        delete bo0_map; // Don't forget to release memory if dynamically allocated
+
+        fin.close();
+        fpr.close();
+    } else {
+        std::cout << "INFO: Unable to open input/predictions file, using default input." << std::endl;
+
+        //////////////////// Run on HW start ////////////////////
+        bo0_map = {0};
+
+        // Synchronize buffer content with device side
+        std::cout << "synchronize input buffer data to device global memory\n";
+
+        bo0.sync(XCL_BO_SYNC_BO_TO_DEVICE);
+
+        std::cout << "Execution of the kernel\n";
+        auto run = krnl(bo0, bo_out, DATA_SIZE);
+        run.wait();
+
+        // Get the output;
+        std::cout << "Get the output data from the device" << std::endl;
+        bo_out.sync(XCL_BO_SYNC_BO_FROM_DEVICE);
+
+        std::cout << "Contents of bo0_out_map (Output):" << std::endl;
+        for (int i = 0; i < DATA_SIZE; ++i) {
+            for (size_t j = 0; j < mylayer_out; j++) {
+                std::cout << bo0_out_map[i][j] << " ";
+            }
+        }
+        std::cout << std::endl;
+
+        std::cout << "TEST END\n";
+        //////////////////// Run on HW end ////////////////////
+    }
+    fout.close();
+    std::cout << "INFO: Saved inference results to file: " << RESULTS_LOG << std::endl;
+    /////////////////////////// From HLS4ML test end ///////////////////////////
+    return 0;
+}
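
The host program above reads one sample per line of whitespace-separated values from the two tb_data files. A hedged sketch of producing compatible files with NumPy (file paths follow the code above; array shapes are placeholders, not taken from the commit):

import numpy as np

# Sketch only: shapes are hypothetical; match them to your model's input/output.
X = np.random.rand(8, 16).astype(np.float32)       # candidate input features
y_ref = np.random.rand(8, 5).astype(np.float32)    # reference predictions

# np.savetxt writes one row per line, space-separated by default,
# which is what the strtok/atof parsing loop above expects.
np.savetxt('output_dir/tb_data/tb_input_features.dat', X, fmt='%f')
np.savetxt('output_dir/tb_data/tb_output_predictions.dat', y_ref, fmt='%f')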

hls4ml/templates/vitis_accelerator/myproject_host_cl.cpp

Lines changed: 1 addition & 1 deletion
@@ -22,7 +22,7 @@ void runFPGAHelper(FpgaObj<in_buffer_t, out_buffer_t> &fpga) {
     fpga.write_ss_safe(ss.str());
 }

-int main(int argc, char** argv) {
+int main(int argc, char **argv) {
     if (argc != 2) {
         std::cout << "Usage: " << argv[0] << " <XCLBIN Filename>" << std::endl;
         return EXIT_FAILURE;
Lines changed: 37 additions & 0 deletions
@@ -0,0 +1,37 @@
+#include <iostream>
+
+#include "firmware/myproject.cpp"
+#include "firmware/myproject.h"
+#include "firmware/parameters.h"
+
+constexpr int c_size = 1024;
+
+static void load_input(input_t *in, hls::stream<input_t> &inStream, int size) {
+mem_rd:
+    for (int i = 0; i < size; i++) {
+#pragma HLS LOOP_TRIPCOUNT min = c_size max = c_size
+        inStream << in[i];
+    }
+}
+// static void store_result(result_t* out, hls::stream<result_t>& out_stream, int size) {
+static void store_result(result_t *out, hls::stream<result_t> &out_stream, int size) {
+mem_wr:
+    for (int i = 0; i < size; i++) {
+#pragma HLS LOOP_TRIPCOUNT min = c_size max = c_size
+        result_t temp = out_stream.read();
+        out[i] = temp;
+    }
+}
+
+void myproject_kernel(
+    // hls-fpga-machine-learning insert header
+) {
+#pragma HLS INTERFACE m_axi port = project_input bundle = gmem0
+#pragma HLS INTERFACE m_axi port = project_output bundle = gmem1
+    static hls::stream<input_t> project_input_stream("project_input_stream");
+    static hls::stream<result_t> project_output_stream("project_output_stream");
+#pragma HLS dataflow
+    load_input(project_input, project_input_stream, size);
+    // hls-fpga-machine-learning insert project top
+    store_result(project_output, project_output_stream, size);
+}

hls4ml/templates/vitis_accelerator/nnet_utils/nnet_types.h

Lines changed: 2 additions & 2 deletions
@@ -19,8 +19,8 @@ template <typename T, unsigned N> struct array {
     const T &operator[](size_t pos) const { return data[pos]; }

     array &operator=(const array &other) {
-    // if (&other == this)
-    //     return *this;
+        // if (&other == this)
+        //     return *this;

        assert(N == other.size && "Array sizes must match.");

hls4ml/writer/__init__.py

Lines changed: 1 addition & 1 deletion
@@ -1,7 +1,7 @@
 from hls4ml.writer.quartus_writer import QuartusWriter
 from hls4ml.writer.symbolic_writer import SymbolicExpressionWriter
-from hls4ml.writer.vitis_writer import VitisWriter
 from hls4ml.writer.vitis_accelerator_writer import VitisAcceleratorWriter
+from hls4ml.writer.vitis_writer import VitisWriter
 from hls4ml.writer.vivado_accelerator_writer import VivadoAcceleratorWriter
 from hls4ml.writer.vivado_writer import VivadoWriter
 from hls4ml.writer.writers import Writer, get_writer, register_writer  # noqa: F401
