Skip to content

Commit b91c351

Browse files
Cleaned-up host code + improved .dat generation
1 parent ba23a1c commit b91c351

File tree

5 files changed

+33
-154
lines changed

5 files changed

+33
-154
lines changed

hls4ml/backends/vitis_accelerator/vitis_accelerator_backend.py

Lines changed: 15 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -70,32 +70,36 @@ def build(self, model, target="all"):
7070
else:
7171
raise Exception("Currently untested on non-Linux OS")
7272

73-
def _numpy_to_dat(self, model, x):
73+
def numpy_to_dat(self, model, x):
7474
if len(model.get_input_variables()) != 1:
7575
raise Exception("Currently unsupported for multi-input/output projects")
7676

7777
# Verify numpy array of correct shape
7878
expected_shape = model.get_input_variables()[0].size()
79-
if expected_shape != x.shape[-1]:
79+
actual_shape = np.prod(x.shape[1:])
80+
if expected_shape != actual_shape:
8081
raise Exception(f'Input shape mismatch, got {x.shape}, expected (_, {expected_shape})')
8182

8283
# Write to tb_data/tb_input_features.dat
83-
input_dat = open(f'{model.config.get_output_dir()}/tb_data/tb_input_features.dat', 'w')
84-
for input in x:
85-
newline = " ".join(str(n) for n in input)
86-
input_dat.write(newline + '\n')
87-
input_dat.close()
88-
89-
def _dat_to_numpy(self, model):
84+
samples = x.reshape(x.shape[0], -1)
85+
input_dat = f'{model.config.get_output_dir()}/tb_data/tb_input_features.dat'
86+
np.savetxt(input_dat, samples, fmt='%.4e')
87+
88+
def dat_to_numpy(self, model):
9089
expected_shape = model.get_output_variables()[0].size()
9190
output_file = f'{model.config.get_output_dir()}/tb_data/hw_results.dat'
9291
y = np.loadtxt(output_file, dtype=float).reshape(-1, expected_shape)
9392
return y
9493

9594
def hardware_predict(self, model, x):
96-
self._numpy_to_dat(model, x)
95+
self.numpy_to_dat(model, x)
96+
97+
currdir = os.getcwd()
98+
os.chdir(model.config.get_output_dir())
9799
os.system("./host build/kernel_wrapper.xclbin")
98-
return self._dat_to_numpy(model)
100+
os.chdir(currdir)
101+
102+
return self.dat_to_numpy(model)
99103

100104
def _register_flows(self):
101105
validation_passes = [

hls4ml/templates/vitis_accelerator/libs/FpgaObj.hpp

Lines changed: 2 additions & 35 deletions
Original file line numberDiff line numberDiff line change
@@ -7,11 +7,9 @@
77
#include <stdlib.h>
88
#include <string>
99
#include <stdio.h>
10-
#include <sstream>
1110
#include <thread>
1211
#include <vector>
1312

14-
#include "timing.hpp"
1513
#include "xcl2.hpp"
1614

1715
template <class T, class U>
@@ -20,7 +18,6 @@ class FpgaObj {
2018
std::vector<T,aligned_allocator<T>> source_in; // Vector containing inputs to all kernels
2119
std::vector<U,aligned_allocator<U>> source_hw_results; // Vector containing all outputs from all kernels
2220
cl_int err; // Stores potential error codes thrown by OpenCL functions
23-
std::stringstream ss; // Logs information from runFPGA(). Every thread logs to this stringstream
2421

2522
/**
2623
* \brief Constructor. Reserves and allocates buffers in host memory.
@@ -36,8 +33,7 @@ class FpgaObj {
3633
_numCU(numCU),
3734
_numThreads(numThreads),
3835
_numEpochs(numEpochs),
39-
ikern(0),
40-
ithr(0) {
36+
ikern(0) {
4137
source_in.reserve(_kernInputSize * _numCU * _numThreads);
4238
source_hw_results.reserve(_kernOutputSize * _numCU * _numThreads);
4339
isFirstRun.reserve(_numCU * _numThreads);
@@ -113,17 +109,6 @@ class FpgaObj {
113109
*/
114110
virtual void allocateHostMemory(int chan_per_port) = 0;
115111

116-
/**
117-
* \brief Logs information about thread completion
118-
* \param newss Additional thread-specific information to log
119-
*/
120-
void write_ss_safe(std::string newss) {
121-
smtx.lock();
122-
ss << "Thread " << ithr << "\n" << newss << "\n";
123-
ithr++;
124-
smtx.unlock();
125-
}
126-
127112
/**
128113
* \brief Completes all enqueued operations
129114
*/
@@ -135,22 +120,13 @@ class FpgaObj {
135120

136121
/**
137122
* \brief Migrates input to FPGA, executes kernels, and migrates output to host memory. Run this function in numThreads different threads
138-
* \return Stringstream containing logs of the run
139123
*/
140-
std::stringstream runFPGA() {
141-
auto t_start = Clock::now();
142-
auto t_end = Clock::now();
143-
std::stringstream ss;
144-
124+
void runFPGA() {
145125
for (int i = 0 ; i < _numCU * _numEpochs; i++){
146-
t_start = Clock::now();
147126
auto ikf = get_info_lock();
148127
int ikb = ikf.first;
149128
int ik = ikb % _numCU ;
150129
bool firstRun = ikf.second;
151-
152-
auto ts1 = SClock::now();
153-
print_nanoseconds(" start: ",ts1, ik, ss);
154130

155131
get_ilock(ikb);
156132
// Copy input data to device global memory
@@ -179,13 +155,7 @@ class FpgaObj {
179155

180156
OCL_CHECK(err, err = kern_event[ikb].wait());
181157
OCL_CHECK(err, err = read_event[ikb].wait());
182-
auto ts2 = SClock::now();
183-
print_nanoseconds(" finish: ",ts2, ik, ss);
184-
185-
t_end = Clock::now();
186-
ss << "KERN"<<ik<<" Total time: " << std::chrono::duration_cast<std::chrono::nanoseconds>(t_end - t_start).count() << " ns\n";
187158
}
188-
return ss;
189159
}
190160

191161
protected:
@@ -200,9 +170,6 @@ class FpgaObj {
200170
mutable std::mutex mtx; // Mutex for ikern, isFirstRun, and get_info_lock()
201171
mutable std::vector<std::mutex> mtxi; // Mutexes for each virtual kernel and associate resources
202172

203-
int ithr; // Counter tracking the threads that ran to completion (for logging purposes)
204-
mutable std::mutex smtx; // Mutex for ithr and write_ss_safe()
205-
206173
cl::Program program; // Object containing the Program (built from kernel_wrapper.cpp) that runs on each physical compute unit
207174
cl::Context context; // Object containing the Device Context
208175
std::vector<cl::CommandQueue> q; // Vector containing Command Queue objects controlling physical compute units

hls4ml/templates/vitis_accelerator/libs/timing.hpp

Lines changed: 0 additions & 75 deletions
This file was deleted.

hls4ml/templates/vitis_accelerator/myproject_host_cl.cpp

Lines changed: 10 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -10,17 +10,14 @@
1010
#include "FpgaObj.hpp"
1111
#include "HbmFpga.hpp"
1212
#include "DdrFpga.hpp"
13-
#include "timing.hpp"
1413
#include "xcl2.hpp"
1514

1615
#define STRINGIFY(var) #var
1716
#define EXPAND_STRING(var) STRINGIFY(var)
1817

1918

2019
void runFPGAHelper(FpgaObj<in_buffer_t, out_buffer_t> &fpga) {
21-
std::stringstream ss;
22-
ss << (fpga.runFPGA()).str();
23-
fpga.write_ss_safe(ss.str());
20+
fpga.runFPGA();
2421
}
2522

2623
int main(int argc, char **argv) {
@@ -30,7 +27,7 @@ int main(int argc, char **argv) {
3027
}
3128
std::string xclbinFilename = argv[1];
3229

33-
/*FPGATYPE*/<in_buffer_t, out_buffer_t> fpga(INSTREAMSIZE, OUTSTREAMSIZE, NUM_CU, NUM_THREAD, 100);
30+
/*FPGATYPE*/<in_buffer_t, out_buffer_t> fpga(BATCHSIZE * INSTREAMSIZE, BATCHSIZE * OUTSTREAMSIZE, NUM_CU, NUM_THREAD, 10);
3431

3532
std::vector<cl::Device> devices = xcl::get_xil_devices(); // Utility API that finds xilinx platforms and return a list of devices connected to Xilinx platforms
3633
cl::Program::Binaries bins = xcl::import_binary_file(xclbinFilename); // Load xclbin
@@ -48,7 +45,7 @@ int main(int argc, char **argv) {
4845
if (fin.is_open()) {
4946
std::string iline;
5047
while (std::getline(fin, iline)) {
51-
if (num_inputs % 10 == 0) {
48+
if (num_inputs % 100 == 0) {
5249
std::cout << "Processing input " << num_inputs << std::endl;
5350
}
5451
std::stringstream in(iline);
@@ -60,21 +57,17 @@ int main(int argc, char **argv) {
6057
num_inputs++;
6158
}
6259
}
63-
60+
fin.close();
61+
6462
// Copying in testbench data
6563
int num_samples = std::min(num_inputs, BATCHSIZE * NUM_CU * NUM_THREAD);
66-
memcpy(fpga.source_in.data(), inputData.data(), num_samples * DATA_SIZE_IN * sizeof(in_buffer_t));
67-
68-
// Padding rest of buffer with arbitrary values
69-
for (int i = num_samples * DATA_SIZE_IN; i < INSTREAMSIZE * NUM_CU * NUM_THREAD; i++) {
70-
fpga.source_in[i] = (in_buffer_t)(2.345678);
71-
}
64+
memcpy(fpga.source_in.data(), inputData.data(), num_samples * INSTREAMSIZE * sizeof(in_buffer_t));
7265

7366
std::vector<std::thread> hostAccelerationThreads;
7467
hostAccelerationThreads.reserve(NUM_THREAD);
7568

7669
std::cout << "Beginning FPGA run" << std::endl;
77-
auto ts_start = SClock::now();
70+
auto ts_start = std::chrono::system_clock::now();
7871

7972
for (int i = 0; i < NUM_THREAD; i++) {
8073
hostAccelerationThreads.push_back(std::thread(runFPGAHelper, std::ref(fpga)));
@@ -86,8 +79,8 @@ int main(int argc, char **argv) {
8679

8780
fpga.finishRun();
8881

89-
auto ts_end = SClock::now();
90-
float throughput = (float(NUM_CU * NUM_THREAD * 100 * BATCHSIZE) /
82+
auto ts_end = std::chrono::system_clock::now();
83+
float throughput = (float(BATCHSIZE* NUM_CU * NUM_THREAD * 10 ) /
9184
float(std::chrono::duration_cast<std::chrono::nanoseconds>(ts_end - ts_start).count())) *
9285
1000000000.;
9386
std::cout << "Throughput = " << throughput <<" predictions/second\n" << std::endl;
@@ -107,15 +100,6 @@ int main(int argc, char **argv) {
107100
} else {
108101
std::cerr << "Error writing hw results to file" << std::endl;
109102
}
110-
111-
std::cout << "\nWriting run logs to file" << std::endl;
112-
std::ofstream outFile("u55c_executable_logfile.log", std::ios::trunc);
113-
if (outFile.is_open()) {
114-
outFile << fpga.ss.rdbuf();
115-
outFile.close();
116-
} else {
117-
std::cerr << "Error opening file for logging" << std::endl;
118-
}
119103

120104
return EXIT_SUCCESS;
121-
}
105+
}

hls4ml/writer/vitis_accelerator_writer.py

Lines changed: 6 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -101,9 +101,9 @@ def write_kernel(self, model):
101101
newline = ''
102102
if io_type == 'io_parallel':
103103
newline += '#define DATA_SIZE_IN ' + format(inp.size_cpp()) + '\n'
104-
newline += '#define INSTREAMSIZE (BATCHSIZE * DATA_SIZE_IN)' + '\n\n'
104+
newline += '#define INSTREAMSIZE DATA_SIZE_IN' + '\n\n'
105105
newline += '#define DATA_SIZE_OUT ' + format(out.size_cpp()) + '\n'
106-
newline += '#define OUTSTREAMSIZE (BATCHSIZE * DATA_SIZE_OUT)' + '\n\n'
106+
newline += '#define OUTSTREAMSIZE DATA_SIZE_OUT' + '\n\n'
107107
newline += 'typedef ' + format(inp.type.name) + ' in_buffer_t;\n'
108108
newline += 'typedef ' + format(out.type.name) + ' out_buffer_t;\n'
109109
elif io_type == 'io_stream':
@@ -113,12 +113,11 @@ def write_kernel(self, model):
113113
dims.append("1")
114114
newline += '#define DATA_SIZE_IN ' + ' * '.join(dims) + '\n'
115115
newline += '#define NNET_ARRAY_DEPTH ' + format(nnet_array_depth) + '\n'
116-
newline += '#define INSTREAMSIZE (BATCHSIZE * DATA_SIZE_IN * NNET_ARRAY_DEPTH)' + '\n\n'
116+
newline += '#define INSTREAMSIZE (DATA_SIZE_IN * NNET_ARRAY_DEPTH)' + '\n\n'
117117
newline += '#define DATA_SIZE_OUT ' + format(out.size_cpp()) + '\n'
118-
newline += '#define OUTSTREAMSIZE (BATCHSIZE * DATA_SIZE_OUT)' + '\n\n'
119-
precision_str = str(model.config.backend.convert_precision_string(model.config.model_precision.get('default')))
120-
newline += 'typedef ' + precision_str + ' in_buffer_t;\n'
121-
newline += 'typedef ' + precision_str + ' out_buffer_t;\n'
118+
newline += '#define OUTSTREAMSIZE DATA_SIZE_OUT' + '\n\n'
119+
newline += 'typedef ' + inp.type.precision.definition_cpp() + ' in_buffer_t;\n'
120+
newline += 'typedef ' + out.type.precision.definition_cpp() + ' out_buffer_t;\n'
122121
else:
123122
newline = line
124123
fout_header.write(newline)

0 commit comments

Comments
 (0)