Cleaned-up host code + improved .dat generation

alex-yang-upenn · alex-yang-upenn · commit b91c3513fe12 · 2024-05-28T11:40:36.000-07:00
diff --git a/hls4ml/backends/vitis_accelerator/vitis_accelerator_backend.py b/hls4ml/backends/vitis_accelerator/vitis_accelerator_backend.py
@@ -70,32 +70,52 @@ def build(self, model, target="all"):
         else:
             raise Exception("Currently untested on non-Linux OS")
 
-    def _numpy_to_dat(self, model, x):
+    def numpy_to_dat(self, model, x):
+        """Write input samples to tb_data/tb_input_features.dat, one flattened sample per line.
+
+        Raises an Exception unless the model has exactly one input variable, or when the
+        per-sample element count of ``x`` does not match that input's size.
+        """
         if len(model.get_input_variables()) != 1:
             raise Exception("Currently unsupported for multi-input/output projects")
         
         # Verify numpy array of correct shape
         expected_shape = model.get_input_variables()[0].size()
-        if expected_shape != x.shape[-1]:
+        actual_shape = np.prod(x.shape[1:])
+        if expected_shape != actual_shape:
             raise Exception(f'Input shape mismatch, got {x.shape}, expected (_, {expected_shape})')
         
         # Write to tb_data/tb_input_features.dat
-        input_dat = open(f'{model.config.get_output_dir()}/tb_data/tb_input_features.dat', 'w')
-        for input in x:
-            newline = " ".join(str(n) for n in input)
-            input_dat.write(newline + '\n')
-        input_dat.close()
-    
-    def _dat_to_numpy(self, model):
+        samples = x.reshape(x.shape[0], -1)
+        input_dat = f'{model.config.get_output_dir()}/tb_data/tb_input_features.dat'
+        np.savetxt(input_dat, samples, fmt='%.4e')
+
+    def dat_to_numpy(self, model):
+        """Read tb_data/hw_results.dat and return it reshaped to (-1, size of the first output)."""
         expected_shape = model.get_output_variables()[0].size()
         output_file = f'{model.config.get_output_dir()}/tb_data/hw_results.dat'
         y = np.loadtxt(output_file, dtype=float).reshape(-1, expected_shape)
         return y
 
     def hardware_predict(self, model, x):
-        self._numpy_to_dat(model, x)
+        """Write ``x`` to the input .dat file, run the host program, and return the parsed results.
+
+        Raises an Exception if the host program exits with a non-zero status.
+        """
+        self.numpy_to_dat(model, x)
+
+        # The host binary and xclbin are addressed relative to the output directory;
+        # restore the caller's working directory even if the run is interrupted.
+        currdir = os.getcwd()
+        os.chdir(model.config.get_output_dir())
-        os.system("./host build/kernel_wrapper.xclbin")
-        return self._dat_to_numpy(model)
+        try:
+            status = os.system("./host build/kernel_wrapper.xclbin")
+        finally:
+            os.chdir(currdir)
+        if status != 0:
+            raise Exception(f'Host program exited with non-zero status {status}')
+
+        return self.dat_to_numpy(model)
 
     def _register_flows(self):
         validation_passes = [
diff --git a/hls4ml/templates/vitis_accelerator/libs/FpgaObj.hpp b/hls4ml/templates/vitis_accelerator/libs/FpgaObj.hpp
@@ -7,11 +7,9 @@
 #include <stdlib.h>
 #include <string>
 #include <stdio.h>
-#include <sstream>
 #include <thread>
 #include <vector>
 
-#include "timing.hpp"
 #include "xcl2.hpp"
 
 template <class T, class U>
@@ -20,7 +18,6 @@ class FpgaObj {
 	std::vector<T,aligned_allocator<T>> source_in;  // Vector containing inputs to all kernels
 	std::vector<U,aligned_allocator<U>> source_hw_results;  // Vector containing all outputs from all kernels
 	cl_int err;  // Stores potential error codes thrown by OpenCL functions
-	std::stringstream ss;  // Logs information from runFPGA(). Every thread logs to this stringstream
 
 	/**
 	 * \brief Constructor. Reserves and allocates buffers in host memory.
@@ -36,8 +33,7 @@ class FpgaObj {
 			_numCU(numCU),
 			_numThreads(numThreads),
 			_numEpochs(numEpochs),
-			ikern(0), 
-			ithr(0) {
+			ikern(0) {
 				source_in.reserve(_kernInputSize * _numCU * _numThreads);
 				source_hw_results.reserve(_kernOutputSize * _numCU * _numThreads);
 				isFirstRun.reserve(_numCU * _numThreads);
@@ -113,17 +109,6 @@ class FpgaObj {
 	*/
 	virtual void allocateHostMemory(int chan_per_port) = 0;
 
-	/**
-	 * \brief Logs information about thread completion
-	 * \param newss Additional thread-specific information to log
-	*/
-	void write_ss_safe(std::string newss) {
-		smtx.lock();
-		ss << "Thread " << ithr << "\n" << newss << "\n";
-		ithr++;
-		smtx.unlock();
-	}
-
 	/**
 	 * \brief Completes all enqueued operations
 	*/
@@ -135,22 +120,13 @@ class FpgaObj {
 
 	/**
 	 * \brief Migrates input to FPGA , executes kernels, and migrates output to host memory. Run this function in numThreads different threads
-	 * \return Stringstream containing logs of the run
 	*/
-	std::stringstream runFPGA() {
-		auto t_start = Clock::now();
-		auto t_end = Clock::now();
-		std::stringstream ss;
-
+	void runFPGA() {
 		for (int i = 0 ; i < _numCU * _numEpochs; i++){
-			t_start = Clock::now();
 			auto ikf = get_info_lock();
 			int ikb = ikf.first;
 			int ik = ikb % _numCU ;
 			bool firstRun = ikf.second;
-
-			auto ts1 = SClock::now();
-			print_nanoseconds("        start:  ",ts1, ik, ss);
 		
 			get_ilock(ikb);
 			// Copy input data to device global memory
@@ -179,13 +155,7 @@ class FpgaObj {
 		
 			OCL_CHECK(err, err = kern_event[ikb].wait());
 			OCL_CHECK(err, err = read_event[ikb].wait());
-			auto ts2 = SClock::now();
-			print_nanoseconds("       finish:  ",ts2, ik, ss);
-
-			t_end = Clock::now();
-			ss << "KERN"<<ik<<"   Total time: " << std::chrono::duration_cast<std::chrono::nanoseconds>(t_end - t_start).count() << " ns\n";
 		}
-		return ss;
 	}
 
 	protected:
@@ -200,9 +170,6 @@ class FpgaObj {
 	mutable std::mutex mtx;  // Mutex for ikern, isFirstRun, and get_info_lock()
 	mutable std::vector<std::mutex> mtxi;  // Mutexes for each virtual kernel and associate resources
 
-	int ithr;  // Counter tracking the threads that ran to completion (for logging purposes)
-	mutable std::mutex smtx; // Mutex for ithr and write_ss_safe()
-
 	cl::Program program;  // Object containing the Program (built from kernel_wrapper.cpp) that runs on each physical compute unit
 	cl::Context context;  // Object containing the Device Context
 	std::vector<cl::CommandQueue> q;  // Vector containing Command Queue objects controlling physical compute units
diff --git a/hls4ml/templates/vitis_accelerator/libs/timing.hpp b/hls4ml/templates/vitis_accelerator/libs/timing.hpp
diff --git a/hls4ml/templates/vitis_accelerator/myproject_host_cl.cpp b/hls4ml/templates/vitis_accelerator/myproject_host_cl.cpp
@@ -10,17 +10,14 @@
 #include "FpgaObj.hpp"
 #include "HbmFpga.hpp"
 #include "DdrFpga.hpp"
-#include "timing.hpp"
 #include "xcl2.hpp"
 
 #define STRINGIFY(var) #var
 #define EXPAND_STRING(var) STRINGIFY(var)
 
 
 void runFPGAHelper(FpgaObj<in_buffer_t, out_buffer_t> &fpga) {
-    std::stringstream ss;
-    ss << (fpga.runFPGA()).str();
-    fpga.write_ss_safe(ss.str());
+    fpga.runFPGA();  // runFPGA() returns void now; there is no per-thread log left to collect
 }
 
 int main(int argc, char **argv) {
@@ -30,7 +27,7 @@ int main(int argc, char **argv) {
     }
     std::string xclbinFilename = argv[1];
 
-    /*FPGATYPE*/<in_buffer_t, out_buffer_t> fpga(INSTREAMSIZE, OUTSTREAMSIZE, NUM_CU, NUM_THREAD, 100); 
+    /*FPGATYPE*/<in_buffer_t, out_buffer_t> fpga(BATCHSIZE * INSTREAMSIZE, BATCHSIZE * OUTSTREAMSIZE, NUM_CU, NUM_THREAD, 10); 
 
     std::vector<cl::Device> devices = xcl::get_xil_devices();  // Utility API that finds xilinx platforms and return a list of devices connected to Xilinx platforms
     cl::Program::Binaries bins = xcl::import_binary_file(xclbinFilename);  // Load xclbin
@@ -48,7 +45,7 @@ int main(int argc, char **argv) {
     if (fin.is_open()) {
         std::string iline;
         while (std::getline(fin, iline)) {
-            if (num_inputs % 10 == 0) {
+            if (num_inputs % 100 == 0) {
                 std::cout << "Processing input " << num_inputs << std::endl;
             }
             std::stringstream in(iline); 
@@ -60,21 +57,17 @@ int main(int argc, char **argv) {
             num_inputs++;
         }
     }
-    
+    fin.close();
+
     // Copying in testbench data
     int num_samples = std::min(num_inputs, BATCHSIZE * NUM_CU * NUM_THREAD);
-    memcpy(fpga.source_in.data(), inputData.data(), num_samples * DATA_SIZE_IN * sizeof(in_buffer_t));
-
-    // Padding rest of buffer with arbitrary values
-    for (int i = num_samples * DATA_SIZE_IN; i < INSTREAMSIZE * NUM_CU * NUM_THREAD; i++) {
-        fpga.source_in[i] = (in_buffer_t)(2.345678);
-    }
+    memcpy(fpga.source_in.data(), inputData.data(), num_samples * INSTREAMSIZE * sizeof(in_buffer_t));
 
     std::vector<std::thread> hostAccelerationThreads;
     hostAccelerationThreads.reserve(NUM_THREAD);
 
     std::cout << "Beginning FPGA run" << std::endl;
-    auto ts_start = SClock::now();
+    auto ts_start = std::chrono::system_clock::now();
 
     for (int i = 0; i < NUM_THREAD; i++) {
         hostAccelerationThreads.push_back(std::thread(runFPGAHelper, std::ref(fpga)));
@@ -86,8 +79,8 @@ int main(int argc, char **argv) {
 
     fpga.finishRun();
 
-    auto ts_end = SClock::now();
-    float throughput = (float(NUM_CU * NUM_THREAD * 100 * BATCHSIZE) /
+    auto ts_end = std::chrono::system_clock::now();
+    float throughput = (float(BATCHSIZE* NUM_CU * NUM_THREAD * 10 ) /
             float(std::chrono::duration_cast<std::chrono::nanoseconds>(ts_end - ts_start).count())) *
             1000000000.;
     std::cout << "Throughput = " << throughput <<" predictions/second\n" << std::endl;
@@ -107,15 +100,6 @@ int main(int argc, char **argv) {
     } else {
         std::cerr << "Error writing hw results to file" << std::endl;
     }
-
-    std::cout << "\nWriting run logs to file" << std::endl;
-    std::ofstream outFile("u55c_executable_logfile.log", std::ios::trunc);
-    if (outFile.is_open()) {
-        outFile << fpga.ss.rdbuf();
-        outFile.close();
-    } else {
-        std::cerr << "Error opening file for logging" << std::endl;
-    }
     
     return EXIT_SUCCESS;
-}
+}
diff --git a/hls4ml/writer/vitis_accelerator_writer.py b/hls4ml/writer/vitis_accelerator_writer.py
@@ -101,9 +101,9 @@ def write_kernel(self, model):
                 newline = ''
                 if io_type == 'io_parallel':
                     newline += '#define DATA_SIZE_IN ' + format(inp.size_cpp()) + '\n'
-                    newline += '#define INSTREAMSIZE (BATCHSIZE * DATA_SIZE_IN)' + '\n\n'
+                    newline += '#define INSTREAMSIZE DATA_SIZE_IN' + '\n\n'
                     newline += '#define DATA_SIZE_OUT ' + format(out.size_cpp()) + '\n'
-                    newline += '#define OUTSTREAMSIZE (BATCHSIZE * DATA_SIZE_OUT)' + '\n\n'
+                    newline += '#define OUTSTREAMSIZE DATA_SIZE_OUT' + '\n\n'
                     newline += 'typedef ' + format(inp.type.name) + ' in_buffer_t;\n'
                     newline += 'typedef ' + format(out.type.name) + ' out_buffer_t;\n'
                 elif io_type == 'io_stream':
@@ -113,12 +113,11 @@ def write_kernel(self, model):
                     dims.append("1")
                     newline += '#define DATA_SIZE_IN ' + ' * '.join(dims) + '\n'
                     newline += '#define NNET_ARRAY_DEPTH ' + format(nnet_array_depth) + '\n'
-                    newline += '#define INSTREAMSIZE (BATCHSIZE * DATA_SIZE_IN * NNET_ARRAY_DEPTH)' + '\n\n'
+                    newline += '#define INSTREAMSIZE (DATA_SIZE_IN * NNET_ARRAY_DEPTH)' + '\n\n'
                     newline += '#define DATA_SIZE_OUT ' + format(out.size_cpp()) + '\n'
-                    newline += '#define OUTSTREAMSIZE (BATCHSIZE * DATA_SIZE_OUT)' + '\n\n'
-                    precision_str = str(model.config.backend.convert_precision_string(model.config.model_precision.get('default')))
-                    newline += 'typedef ' + precision_str + ' in_buffer_t;\n'
-                    newline += 'typedef ' + precision_str + ' out_buffer_t;\n'
+                    newline += '#define OUTSTREAMSIZE DATA_SIZE_OUT' + '\n\n'
+                    newline += 'typedef ' +  inp.type.precision.definition_cpp() + ' in_buffer_t;\n'
+                    newline += 'typedef ' + out.type.precision.definition_cpp() + ' out_buffer_t;\n'
             else:
                 newline = line
             fout_header.write(newline)