implementing hw quant option

alex-yang-upenn · alex-yang-upenn · commit 9779fcdeefb3 · 2024-06-27T18:06:21.000-07:00
diff --git a/hls4ml/backends/vitis_accelerator/vitis_accelerator_backend.py b/hls4ml/backends/vitis_accelerator/vitis_accelerator_backend.py
@@ -22,6 +22,7 @@ def create_initial_config(
         num_kernel=1,
         num_thread=1,
         batchsize=8192,
+        hw_quant=False,
         vivado_directives=[]
     ):
         '''
@@ -45,6 +46,7 @@ def create_initial_config(
         config['AcceleratorConfig']['Num_Kernel'] = num_kernel
         config['AcceleratorConfig']['Num_Thread'] = num_thread
         config['AcceleratorConfig']['Batchsize'] = batchsize
+        config['AcceleratorConfig']['HW_Quant'] = hw_quant
         config['AcceleratorConfig']['Vivado_Directives'] = vivado_directives
         return config
 
diff --git a/hls4ml/backends/vitis_accelerator/vitis_accelerator_config.py b/hls4ml/backends/vitis_accelerator/vitis_accelerator_config.py
@@ -28,11 +28,12 @@ def __init__(self, config):
                 )
                 self.config['Part'] = self.part
         
-        self.num_kernel = accel_config.get('Num_Kernel')
-        self.num_thread = accel_config.get('Num_Thread')
-        self.batchsize = accel_config.get('Batchsize')
+        self.num_kernel = accel_config.get('Num_Kernel', 1)
+        self.num_thread = accel_config.get('Num_Thread', 1)
+        self.batchsize = accel_config.get('Batchsize', 8192)
+        self.hw_quant = accel_config.get('HW_Quant', False)
 
-        self.vivado_directives = accel_config.get('Vivado_Directives')    
+        self.vivado_directives = accel_config.get('Vivado_Directives', [])    
 
     def get_board_type(self):
         return self.board_type
@@ -55,5 +56,8 @@ def get_memory_type(self):
     def get_memory_channel_count(self):
         return self.memory_channel_count
     
+    def get_hw_quant(self):
+        return self.hw_quant
+    
     def get_vivado_directives(self):
         return self.vivado_directives
diff --git a/hls4ml/templates/vitis_accelerator/kernel_wrapper_io_parallel.cpp b/hls4ml/templates/vitis_accelerator/kernel_wrapper_io_parallel.cpp
@@ -1,12 +1,12 @@
 #include "kernel_wrapper.h"
 #include "firmware/myproject.h"
 
-static void read_input(const in_buffer_t *in, in_buffer_t (&in_buf)[BATCHSIZE][DATA_SIZE_IN]) {
+static void read_input(const /*IN_INTERFACE_TYPE*/ *in, in_buffer_t (&in_buf)[BATCHSIZE][DATA_SIZE_IN]) {
   for (int i = 0; i < BATCHSIZE; i++) {
       #pragma HLS PIPELINE
       for(int j = 0; j < DATA_SIZE_IN; j++) { 
         #pragma HLS UNROLL
-        in_buf[i][j] = in[i * DATA_SIZE_IN + j];
+        in_buf[i][j] = /*IN_HW_QUANT*/in[i * DATA_SIZE_IN + j];
       }
     }
 }
@@ -16,12 +16,12 @@ static void run_inference(in_buffer_t (&in_buf)[BATCHSIZE][DATA_SIZE_IN], out_bu
       myproject(in_buf[i],out_buf[i]);
     }
 }
-static void write_result(out_buffer_t *out, out_buffer_t (&out_buf)[BATCHSIZE][DATA_SIZE_OUT]) {
+static void write_result(/*OUT_INTERFACE_TYPE*/ *out, out_buffer_t (&out_buf)[BATCHSIZE][DATA_SIZE_OUT]) {
   for (int i = 0; i < BATCHSIZE; i++) {
     #pragma HLS PIPELINE
     for (int j = 0; j < DATA_SIZE_OUT; j++) {
       #pragma HLS UNROLL
-      out[i * DATA_SIZE_OUT + j] = out_buf[i][j];
+      out[i * DATA_SIZE_OUT + j] = /*OUT_HW_QUANT*/out_buf[i][j];
     }
   }
 }
@@ -32,7 +32,7 @@ extern "C" {
     \param in Input Vector
     \param out Output Vector
 */
-  void kernel_wrapper(const in_buffer_t *in, out_buffer_t *out) {
+  void kernel_wrapper(const /*IN_INTERFACE_TYPE*/ *in, /*OUT_INTERFACE_TYPE*/ *out) {
     in_buffer_t in_buf[BATCHSIZE][DATA_SIZE_IN];
     out_buffer_t out_buf[BATCHSIZE][DATA_SIZE_OUT];
     #pragma HLS ARRAY_RESHAPE   variable=in_buf  complete dim=2
diff --git a/hls4ml/templates/vitis_accelerator/kernel_wrapper_io_stream.cpp b/hls4ml/templates/vitis_accelerator/kernel_wrapper_io_stream.cpp
@@ -1,23 +1,23 @@
 #include "kernel_wrapper.h"
 #include "firmware/myproject.h"
 
-static void read_input(const in_buffer_t *in, hls::stream<input_t> &input, int n) {
+static void read_input(const /*IN_INTERFACE_TYPE*/ *in, hls::stream<input_t> &input, int n) {
   for (int i = 0; i < DATA_SIZE_IN; i++) {
     #pragma HLS PIPELINE
     input_t tmp;
     for (int j = 0; j < NNET_ARRAY_DEPTH; j++) {
       #pragma HLS UNROLL
-      tmp[j] = in[(n * DATA_SIZE_IN * NNET_ARRAY_DEPTH) + (i * NNET_ARRAY_DEPTH) + j];
+      tmp[j] = /*IN_HW_QUANT*/in[(n * DATA_SIZE_IN * NNET_ARRAY_DEPTH) + (i * NNET_ARRAY_DEPTH) + j];
     }
     input << tmp;
   }
 }
 
-static void write_result(out_buffer_t *out, hls::stream<result_t> &output, int n) {
+static void write_result(/*OUT_INTERFACE_TYPE*/ *out, hls::stream<result_t> &output, int n) {
   result_t tmp = output.read();
   for (int i = 0; i < DATA_SIZE_OUT; i++) {
     #pragma HLS UNROLL
-    out[(n * DATA_SIZE_OUT) + i] = tmp[i];
+    out[(n * DATA_SIZE_OUT) + i] = /*OUT_HW_QUANT*/tmp[i];
   }
 }
 
@@ -27,7 +27,7 @@ extern "C" {
     \param in Input Vector
     \param out Output Vector
 */
-  void kernel_wrapper(const in_buffer_t *in, out_buffer_t *out) {
+  void kernel_wrapper(const /*IN_INTERFACE_TYPE*/ *in, /*OUT_INTERFACE_TYPE*/ *out) {
     hls::stream<input_t> input("input");
     hls::stream<result_t> output("output");
     #pragma HLS STREAM variable=input depth=DATA_SIZE_IN
diff --git a/hls4ml/templates/vitis_accelerator/myproject_host_cl.cpp b/hls4ml/templates/vitis_accelerator/myproject_host_cl.cpp
@@ -16,7 +16,7 @@
 #define EXPAND_STRING(var) STRINGIFY(var)
 
 
-void runFPGAHelper(FpgaObj<in_buffer_t, out_buffer_t> &fpga) {
+void runFPGAHelper(FpgaObj</*IN_INTERFACE_TYPE*/, /*OUT_INTERFACE_TYPE*/> &fpga) {
     fpga.runFPGA();
 }
 
@@ -27,7 +27,7 @@ int main(int argc, char **argv) {
     }
     std::string xclbinFilename = argv[1];
 
-    /*FPGATYPE*/<in_buffer_t, out_buffer_t> fpga(BATCHSIZE * INSTREAMSIZE, BATCHSIZE * OUTSTREAMSIZE, NUM_CU, NUM_THREAD, 10); 
+    // hls-fpga-machine-learning FPGA type 
 
     std::vector<cl::Device> devices = xcl::get_xil_devices();  // Utility API that finds xilinx platforms and return a list of devices connected to Xilinx platforms
     auto fileBuf = xcl::read_binary_file(xclbinFilename);  // Load xclbin
@@ -41,7 +41,7 @@ int main(int argc, char **argv) {
     if (!fin.is_open()) {
         std::cerr << "Error: Could not open tb_input_features.dat" << std::endl;
     }
-    std::vector<in_buffer_t> inputData;
+    std::vector</*IN_INTERFACE_TYPE*/> inputData;
     int num_inputs = 0;
     if (fin.is_open()) {
         std::string iline;
@@ -52,17 +52,16 @@ int main(int argc, char **argv) {
             std::stringstream in(iline); 
             std::string token;
             while (in >> token) {
-                in_buffer_t tmp = stof(token);
-                inputData.push_back(tmp);
+                inputData.push_back(/*IN_TYPE_CAST*/stof(token));
             }
             num_inputs++;
         }
     }
     fin.close();
 
-    // Copying in testbench data
+    // Copying input data into memory-mapped arrays
     int num_samples = std::min(num_inputs, BATCHSIZE * NUM_CU * NUM_THREAD);
-    memcpy(fpga.source_in.data(), inputData.data(), num_samples * INSTREAMSIZE * sizeof(in_buffer_t));
+    memcpy(fpga.source_in.data(), inputData.data(), num_samples * INSTREAMSIZE * sizeof(/*IN_INTERFACE_TYPE*/));
 
     std::vector<std::thread> hostAccelerationThreads;
     hostAccelerationThreads.reserve(NUM_THREAD);
@@ -93,7 +92,7 @@ int main(int argc, char **argv) {
         for (int i = 0; i < num_samples; i++) {
             std::stringstream oline;
             for (int n = 0; n < DATA_SIZE_OUT; n++) {
-                oline << (float)fpga.source_hw_results[(i * DATA_SIZE_OUT) + n] << " ";
+                oline << /*OUT_TYPE_CAST*/fpga.source_hw_results[(i * DATA_SIZE_OUT) + n] << " ";
             }
             resultsFile << oline.str() << "\n";
         }
diff --git a/hls4ml/writer/vitis_accelerator_writer.py b/hls4ml/writer/vitis_accelerator_writer.py
@@ -127,11 +127,22 @@ def write_kernel(self, model):
         # Writing source file
         f_source = open(os.path.join(filedir, '../templates/vitis_accelerator/kernel_wrapper_' + io_type +'.cpp'))
         fout_source = open(f'{model.config.get_output_dir()}/kernel_wrapper.cpp', 'w')
+        isHwQuant = self.vitis_accelerator_config.get_hw_quant()
         for line in f_source.readlines():
             if 'myproject' in line:
                 newline = line.replace('myproject', format(model.config.get_project_name()))
+            elif '/*IN_HW_QUANT*/' in line:
+                newline = line.replace('/*IN_HW_QUANT*/', '(in_buffer_t)' if isHwQuant else '')
+            elif '/*OUT_HW_QUANT*/' in line:
+                newline = line.replace('/*OUT_HW_QUANT*/', '(float)' if isHwQuant else '')
             else:
                 newline = line
+
+            if '/*IN_INTERFACE_TYPE*/' in newline:
+                newline = newline.replace('/*IN_INTERFACE_TYPE*/', ('float' if isHwQuant else 'in_buffer_t'))
+            if '/*OUT_INTERFACE_TYPE*/' in newline:
+                newline = newline.replace('/*OUT_INTERFACE_TYPE*/', ('float' if isHwQuant else 'out_buffer_t'))
+
             fout_source.write(newline)
         f_source.close()
         fout_source.close()
@@ -149,14 +160,24 @@ def write_host(self, model):
         filedir = os.path.dirname(os.path.abspath(__file__))
         f = open(os.path.join(filedir, '../templates/vitis_accelerator/myproject_host_cl.cpp'))
         fout = open(f'{model.config.get_output_dir()}/{model.config.get_project_name()}_host_cl.cpp', 'w')
+        memoryType = self.vitis_accelerator_config.get_memory_type()
+        isHwQuant = self.vitis_accelerator_config.get_hw_quant()
         for line in f.readlines():
-            if '/*FPGATYPE*/' in line:
-                if self.vitis_accelerator_config.get_memory_type() == 'hbm':
-                    newline = line.replace('/*FPGATYPE*/', 'HbmFpga')
-                elif self.vitis_accelerator_config.get_memory_type() == 'ddr':
-                    newline = line.replace('/*FPGATYPE*/', 'DdrFpga')
+            if '// hls-fpga-machine-learning FPGA type' in line:
+                fpgaType = 'HbmFpga' if memoryType == 'hbm' else ('DdrFpga' if memoryType == 'ddr' else 'DdrFpga')
+                dataType = '<float, float>' if isHwQuant else '<in_buffer_t, out_buffer_t>'
+                newline = fpgaType + dataType + ' fpga(BATCHSIZE * INSTREAMSIZE, BATCHSIZE * OUTSTREAMSIZE, NUM_CU, NUM_THREAD, 10);'
+            elif '/*IN_TYPE_CAST*/' in line:
+                newline = line.replace('/*IN_TYPE_CAST*/', '' if isHwQuant else '(in_buffer_t)')
+            elif '/*OUT_TYPE_CAST*/' in line:
+                newline = line.replace('/*OUT_TYPE_CAST*/', '' if isHwQuant else '(float)')
             else:
                 newline = line
+            
+            if '/*IN_INTERFACE_TYPE*/' in line:
+                newline = newline.replace('/*IN_INTERFACE_TYPE*/', 'float' if isHwQuant else 'in_buffer_t')
+            if '/*OUT_INTERFACE_TYPE*/' in line:
+                newline = newline.replace('/*OUT_INTERFACE_TYPE*/', 'float' if isHwQuant else 'out_buffer_t')
             fout.write(newline)
         f.close()
         fout.close()

Original file line number	Diff line number	Diff line change
`@@ -1,12 +1,12 @@`
`1`	`1`	`#include "kernel_wrapper.h"`
`2`	`2`	`#include "firmware/myproject.h"`
`3`	`3`
`4`		`-static void read_input(const in_buffer_t *in, in_buffer_t (&in_buf)[BATCHSIZE][DATA_SIZE_IN]) {`
	`4`	`+static void read_input(const /*IN_INTERFACE_TYPE*/ *in, in_buffer_t (&in_buf)[BATCHSIZE][DATA_SIZE_IN]) {`
`5`	`5`	`for (int i = 0; i < BATCHSIZE; i++) {`
`6`	`6`	`#pragma HLS PIPELINE`
`7`	`7`	`for(int j = 0; j < DATA_SIZE_IN; j++) {`
`8`	`8`	`#pragma HLS UNROLL`
`9`		`- in_buf[i][j] = in[i * DATA_SIZE_IN + j];`
	`9`	`+ in_buf[i][j] = /*IN_HW_QUANT*/in[i * DATA_SIZE_IN + j];`
`10`	`10`	`}`
`11`	`11`	`}`
`12`	`12`	`}`
`@@ -16,12 +16,12 @@ static void run_inference(in_buffer_t (&in_buf)[BATCHSIZE][DATA_SIZE_IN], out_bu`
`16`	`16`	`myproject(in_buf[i],out_buf[i]);`
`17`	`17`	`}`
`18`	`18`	`}`
`19`		`-static void write_result(out_buffer_t *out, out_buffer_t (&out_buf)[BATCHSIZE][DATA_SIZE_OUT]) {`
	`19`	`+static void write_result(/*OUT_INTERFACE_TYPE*/ *out, out_buffer_t (&out_buf)[BATCHSIZE][DATA_SIZE_OUT]) {`
`20`	`20`	`for (int i = 0; i < BATCHSIZE; i++) {`
`21`	`21`	`#pragma HLS PIPELINE`
`22`	`22`	`for (int j = 0; j < DATA_SIZE_OUT; j++) {`
`23`	`23`	`#pragma HLS UNROLL`
`24`		`- out[i * DATA_SIZE_OUT + j] = out_buf[i][j];`
	`24`	`+ out[i * DATA_SIZE_OUT + j] = /*OUT_HW_QUANT*/out_buf[i][j];`
`25`	`25`	`}`
`26`	`26`	`}`
`27`	`27`	`}`
`@@ -32,7 +32,7 @@ extern "C" {`
`32`	`32`	`\param in Input Vector`
`33`	`33`	`\param out Output Vector`
`34`	`34`	`*/`
`35`		`- void kernel_wrapper(const in_buffer_t *in, out_buffer_t *out) {`
	`35`	`+ void kernel_wrapper(const /*IN_INTERFACE_TYPE*/ *in, /*OUT_INTERFACE_TYPE*/ *out) {`
`36`	`36`	`in_buffer_t in_buf[BATCHSIZE][DATA_SIZE_IN];`
`37`	`37`	`out_buffer_t out_buf[BATCHSIZE][DATA_SIZE_OUT];`
`38`	`38`	`#pragma HLS ARRAY_RESHAPE variable=in_buf complete dim=2`