MLIR-AIE
xrt_test_wrapper.h
Go to the documentation of this file.
1
#include <algorithm>
#include <chrono>
#include <cstdint>
#include <cstdlib>
#include <cstring>
#include <ctime>
#include <fstream>
#include <iostream>
#include <limits>
#include <sstream>
#include <string>
#include <vector>

#include "test_utils.h"
#include "xrt/xrt_bo.h"
8
9namespace po = boost::program_options;
10
/// Command-line settings shared by the XRT test wrappers in this header.
///
/// Every field is filled in by parse_args() from the parsed option map; the
/// default member initializers only guarantee that a default-constructed
/// object never holds indeterminate values.
struct args {
  /// Verbosity level; the wrappers print progress messages when it is >= 1.
  int verbosity = 0;
  /// Non-zero to run the user-supplied verify_results callback after each
  /// measured iteration (populated from the boolean "verify" option).
  int do_verify = 0;
  /// Number of measured kernel iterations.
  int n_iterations = 0;
  /// Number of leading iterations that are run but excluded from timing,
  /// verification and trace output.
  int n_warmup_iterations = 0;
  /// Trace buffer size in bytes; a value <= 0 disables trace handling.
  int trace_size = 0;
  /// Path handed to test_utils::load_instr_binary() as the instruction file.
  std::string instr;
  /// xclbin file name handed to test_utils::init_xrt_load_kernel().
  std::string xclbin;
  /// Kernel name inside the xclbin, handed to init_xrt_load_kernel().
  std::string kernel;
  /// Output path handed to test_utils::write_out_trace().
  std::string trace_file;
};
22
23struct args parse_args(int argc, const char *argv[]) {
24 // ------------------------------------------------------
25 // Parse program arguments
26 // ------------------------------------------------------
27 po::options_description desc("Allowed options");
28 po::variables_map vm;
30
31 struct args myargs;
32
33 test_utils::parse_options(argc, argv, desc, vm);
34 myargs.verbosity = vm["verbosity"].as<int>();
35 myargs.do_verify = vm["verify"].as<bool>();
36 myargs.n_iterations = vm["iters"].as<int>();
37 myargs.n_warmup_iterations = vm["warmup"].as<int>();
38 myargs.trace_size = vm["trace_sz"].as<int>();
39 myargs.instr = vm["instr"].as<std::string>();
40 myargs.xclbin = vm["xclbin"].as<std::string>();
41 myargs.kernel = vm["kernel"].as<std::string>();
42 myargs.trace_file = vm["trace_file"].as<std::string>();
43
44 return myargs;
45}
46
47/*
48 ******************************************************************************
49 * XRT based test wrapper for 2 inputs and 1 output
50 ******************************************************************************
51 */
52template <typename T1, typename T2, typename T3, void (*init_bufIn1)(T1 *, int),
53 void (*init_bufIn2)(T2 *, int), void (*init_bufOut)(T3 *, int),
54 int (*verify_results)(T1 *, T2 *, T3 *, int, int)>
55int setup_and_run_aie(int IN1_VOLUME, int IN2_VOLUME, int OUT_VOLUME,
56 struct args myargs) {
57
58 srand(time(NULL));
59
60 // Load instruction sequence
61 std::vector<uint32_t> instr_v = test_utils::load_instr_binary(myargs.instr);
62 if (myargs.verbosity >= 1)
63 std::cout << "Sequence instr count: " << instr_v.size() << "\n";
64
65 // Start the XRT context and load the kernel
66 xrt::device device;
67 xrt::kernel kernel;
68
70 myargs.xclbin, myargs.kernel);
71
72 // set up the buffer objects
73 auto bo_instr = xrt::bo(device, instr_v.size() * sizeof(int),
74 XCL_BO_FLAGS_CACHEABLE, kernel.group_id(1));
75 auto bo_in1 = xrt::bo(device, IN1_VOLUME * sizeof(T1), XRT_BO_FLAGS_HOST_ONLY,
76 kernel.group_id(3));
77 auto bo_in2 = xrt::bo(device, IN2_VOLUME * sizeof(T2), XRT_BO_FLAGS_HOST_ONLY,
78 kernel.group_id(4));
79 auto bo_out = xrt::bo(device, OUT_VOLUME * sizeof(T3), XRT_BO_FLAGS_HOST_ONLY,
80 kernel.group_id(5));
81
82 // Workaround so we declare a really small trace buffer when one is not used
83 int tmp_trace_size = (myargs.trace_size > 0) ? myargs.trace_size : 1;
84 auto bo_trace = xrt::bo(device, tmp_trace_size, XRT_BO_FLAGS_HOST_ONLY,
85 kernel.group_id(7));
86
87 if (myargs.verbosity >= 1)
88 std::cout << "Writing data into buffer objects.\n";
89
90 // Copy instruction stream to xrt buffer object
91 void *bufInstr = bo_instr.map<void *>();
92 memcpy(bufInstr, instr_v.data(), instr_v.size() * sizeof(int));
93
94 // Initialize buffer objects
95 T1 *bufIn1 = bo_in1.map<T1 *>();
96 T2 *bufIn2 = bo_in2.map<T2 *>();
97 T3 *bufOut = bo_out.map<T3 *>();
98 char *bufTrace = bo_trace.map<char *>();
99
100 init_bufIn1(bufIn1, IN1_VOLUME);
101 init_bufIn2(bufIn2, IN2_VOLUME);
102 init_bufOut(bufOut, OUT_VOLUME); // <<< what size do I pass it?
103
104 if (myargs.trace_size > 0)
105 memset(bufTrace, 0, myargs.trace_size);
106
107 // sync host to device memories
108 bo_instr.sync(XCL_BO_SYNC_BO_TO_DEVICE);
109 bo_in1.sync(XCL_BO_SYNC_BO_TO_DEVICE);
110 bo_in2.sync(XCL_BO_SYNC_BO_TO_DEVICE);
111 bo_out.sync(XCL_BO_SYNC_BO_TO_DEVICE);
112 if (myargs.trace_size > 0)
113 bo_trace.sync(XCL_BO_SYNC_BO_TO_DEVICE);
114
115 // ------------------------------------------------------
116 // Initialize run configs
117 // ------------------------------------------------------
118 unsigned num_iter = myargs.n_iterations + myargs.n_warmup_iterations;
119 float npu_time_total = 0;
120 float npu_time_min = 9999999;
121 float npu_time_max = 0;
122
123 int errors = 0;
124
125 // ------------------------------------------------------
126 // Main run loop
127 // ------------------------------------------------------
128 for (unsigned iter = 0; iter < num_iter; iter++) {
129
130 if (myargs.verbosity >= 1)
131 std::cout << "Running Kernel.\n";
132
133 // Run kernel
134 if (myargs.verbosity >= 1)
135 std::cout << "Running Kernel.\n";
136 auto start = std::chrono::high_resolution_clock::now();
137 unsigned int opcode = 3;
138 auto run = kernel(opcode, bo_instr, instr_v.size(), bo_in1, bo_in2, bo_out,
139 0, bo_trace);
140 run.wait();
141 auto stop = std::chrono::high_resolution_clock::now();
142 bo_out.sync(XCL_BO_SYNC_BO_FROM_DEVICE);
143 if (myargs.trace_size > 0)
144 bo_trace.sync(XCL_BO_SYNC_BO_FROM_DEVICE);
145
146 if (iter < myargs.n_warmup_iterations)
147 /* Warmup iterations do not count towards average runtime. */
148 continue;
149
150 // Copy output results and verify they are correct
151 if (myargs.do_verify) {
152 if (myargs.verbosity >= 1) {
153 std::cout << "Verifying results ..." << std::endl;
154 }
155 auto vstart = std::chrono::system_clock::now();
156
157 errors +=
158 verify_results(bufIn1, bufIn2, bufOut, IN1_VOLUME, myargs.verbosity);
159
160 auto vstop = std::chrono::system_clock::now();
161 float vtime =
162 std::chrono::duration_cast<std::chrono::seconds>(vstop - vstart)
163 .count();
164 if (myargs.verbosity >= 1)
165 std::cout << "Verify time: " << vtime << "secs." << std::endl;
166 } else {
167 if (myargs.verbosity >= 1)
168 std::cout << "WARNING: results not verified." << std::endl;
169 }
170
171 // Write trace values if trace_size > 0 and first iteration
172 if (myargs.trace_size > 0 && iter == myargs.n_warmup_iterations) {
173 test_utils::write_out_trace(((char *)bufTrace), myargs.trace_size,
174 myargs.trace_file);
175 }
176
177 // Accumulate run times
178 float npu_time =
179 std::chrono::duration_cast<std::chrono::microseconds>(stop - start)
180 .count();
181
182 npu_time_total += npu_time;
183 npu_time_min = (npu_time < npu_time_min) ? npu_time : npu_time_min;
184 npu_time_max = (npu_time > npu_time_max) ? npu_time : npu_time_max;
185 }
186
187 // ------------------------------------------------------
188 // Print verification and timing results
189 // ------------------------------------------------------
190
191 // TODO - Mac count to guide gflops
192 float macs = 0;
193
194 std::cout << std::endl
195 << "Avg NPU time: " << npu_time_total / myargs.n_iterations << "us."
196 << std::endl;
197 if (macs > 0)
198 std::cout << "Avg NPU gflops: "
199 << macs / (1000 * npu_time_total / myargs.n_iterations)
200 << std::endl;
201
202 std::cout << std::endl
203 << "Min NPU time: " << npu_time_min << "us." << std::endl;
204 if (macs > 0)
205 std::cout << "Max NPU gflops: " << macs / (1000 * npu_time_min)
206 << std::endl;
207
208 std::cout << std::endl
209 << "Max NPU time: " << npu_time_max << "us." << std::endl;
210 if (macs > 0)
211 std::cout << "Min NPU gflops: " << macs / (1000 * npu_time_max)
212 << std::endl;
213
214 if (!errors) {
215 std::cout << "\nPASS!\n\n";
216 return 0;
217 } else {
218 std::cout << "\nError count: " << errors << "\n\n";
219 std::cout << "\nFailed.\n\n";
220 return 1;
221 }
222}
223
224/*
225 ******************************************************************************
226 * XRT based test wrapper for 1 input and 1 output
227 ******************************************************************************
228 */
229template <typename T1, typename T3, void (*init_bufIn1)(T1 *, int),
230 void (*init_bufOut)(T3 *, int),
231 int (*verify_results)(T1 *, T3 *, int, int)>
232int setup_and_run_aie(int IN1_VOLUME, int OUT_VOLUME, struct args myargs) {
233
234 srand(time(NULL));
235
236 // Load instruction sequence
237 std::vector<uint32_t> instr_v = test_utils::load_instr_binary(myargs.instr);
238 if (myargs.verbosity >= 1)
239 std::cout << "Sequence instr count: " << instr_v.size() << "\n";
240
241 // Start the XRT context and load the kernel
242 xrt::device device;
243 xrt::kernel kernel;
244
246 myargs.xclbin, myargs.kernel);
247
248 // set up the buffer objects
249 auto bo_instr = xrt::bo(device, instr_v.size() * sizeof(int),
250 XCL_BO_FLAGS_CACHEABLE, kernel.group_id(1));
251 auto bo_in1 = xrt::bo(device, IN1_VOLUME * sizeof(T1), XRT_BO_FLAGS_HOST_ONLY,
252 kernel.group_id(3));
253 auto bo_out = xrt::bo(device, OUT_VOLUME * sizeof(T3), XRT_BO_FLAGS_HOST_ONLY,
254 kernel.group_id(4));
255 // Workaround so we declare a really small trace buffer when one is not used
256 int tmp_trace_size = (myargs.trace_size > 0) ? myargs.trace_size : 1;
257 auto bo_trace = xrt::bo(device, tmp_trace_size, XRT_BO_FLAGS_HOST_ONLY,
258 kernel.group_id(7));
259
260 if (myargs.verbosity >= 1)
261 std::cout << "Writing data into buffer objects.\n";
262
263 // Copy instruction stream to xrt buffer object
264 void *bufInstr = bo_instr.map<void *>();
265 memcpy(bufInstr, instr_v.data(), instr_v.size() * sizeof(int));
266
267 // Initialize buffer objects
268 T1 *bufIn1 = bo_in1.map<T1 *>();
269 T3 *bufOut = bo_out.map<T3 *>();
270 char *bufTrace = bo_trace.map<char *>();
271
272 init_bufIn1(bufIn1, IN1_VOLUME);
273 init_bufOut(bufOut,
274 OUT_VOLUME); // <<< what size do I pass it? reset with trace?
275 if (myargs.trace_size > 0)
276 memset(bufTrace, 0, myargs.trace_size);
277
278 // sync host to device memories
279 bo_instr.sync(XCL_BO_SYNC_BO_TO_DEVICE);
280 bo_in1.sync(XCL_BO_SYNC_BO_TO_DEVICE);
281 bo_out.sync(XCL_BO_SYNC_BO_TO_DEVICE);
282 if (myargs.trace_size > 0)
283 bo_trace.sync(XCL_BO_SYNC_BO_TO_DEVICE);
284
285 // ------------------------------------------------------
286 // Initialize run configs
287 // ------------------------------------------------------
288 unsigned num_iter = myargs.n_iterations + myargs.n_warmup_iterations;
289 float npu_time_total = 0;
290 float npu_time_min = 9999999;
291 float npu_time_max = 0;
292
293 int errors = 0;
294
295 // ------------------------------------------------------
296 // Main run loop
297 // ------------------------------------------------------
298 for (unsigned iter = 0; iter < num_iter; iter++) {
299
300 if (myargs.verbosity >= 1)
301 std::cout << "Running Kernel.\n";
302
303 // Run kernel
304 if (myargs.verbosity >= 1)
305 std::cout << "Running Kernel.\n";
306 auto start = std::chrono::high_resolution_clock::now();
307 unsigned int opcode = 3;
308 // auto run = kernel(opcode, bo_instr, instr_v.size(), bo_in1, bo_out);
309 auto run = kernel(opcode, bo_instr, instr_v.size(), bo_in1, bo_out, 0, 0,
310 bo_trace);
311 run.wait();
312 auto stop = std::chrono::high_resolution_clock::now();
313 bo_out.sync(XCL_BO_SYNC_BO_FROM_DEVICE);
314 if (myargs.trace_size > 0)
315 bo_trace.sync(XCL_BO_SYNC_BO_FROM_DEVICE);
316
317 if (iter < myargs.n_warmup_iterations)
318 /* Warmup iterations do not count towards average runtime. */
319 continue;
320
321 // Copy output results and verify they are correct
322 if (myargs.do_verify) {
323 if (myargs.verbosity >= 1) {
324 std::cout << "Verifying results ..." << std::endl;
325 }
326 auto vstart = std::chrono::system_clock::now();
327
328 errors += verify_results(bufIn1, bufOut, IN1_VOLUME, myargs.verbosity);
329
330 auto vstop = std::chrono::system_clock::now();
331 float vtime =
332 std::chrono::duration_cast<std::chrono::seconds>(vstop - vstart)
333 .count();
334 if (myargs.verbosity >= 1)
335 std::cout << "Verify time: " << vtime << "secs." << std::endl;
336 } else {
337 if (myargs.verbosity >= 1)
338 std::cout << "WARNING: results not verified." << std::endl;
339 }
340
341 // Write trace values if trace_size > 0 and first iteration
342 if (myargs.trace_size > 0 && iter == myargs.n_warmup_iterations) {
343 // std::cout << "Writing to offset " << OUT_VOLUME * sizeof(T3) <<
344 // std::endl;
345 std::cout << "Writing trace of size " << myargs.trace_size << std::endl;
346 // test_utils::write_out_trace(((char *)bufOut) + OUT_VOLUME * sizeof(T3),
347 test_utils::write_out_trace((char *)bufTrace, myargs.trace_size,
348 myargs.trace_file);
349 }
350
351 // Accumulate run times
352 float npu_time =
353 std::chrono::duration_cast<std::chrono::microseconds>(stop - start)
354 .count();
355
356 npu_time_total += npu_time;
357 npu_time_min = (npu_time < npu_time_min) ? npu_time : npu_time_min;
358 npu_time_max = (npu_time > npu_time_max) ? npu_time : npu_time_max;
359 }
360
361 // ------------------------------------------------------
362 // Print verification and timing results
363 // ------------------------------------------------------
364
365 // TODO - Mac count to guide gflops
366 float macs = 0;
367
368 std::cout << std::endl
369 << "Avg NPU time: " << npu_time_total / myargs.n_iterations << "us."
370 << std::endl;
371 if (macs > 0)
372 std::cout << "Avg NPU gflops: "
373 << macs / (1000 * npu_time_total / myargs.n_iterations)
374 << std::endl;
375
376 std::cout << std::endl
377 << "Min NPU time: " << npu_time_min << "us." << std::endl;
378 if (macs > 0)
379 std::cout << "Max NPU gflops: " << macs / (1000 * npu_time_min)
380 << std::endl;
381
382 std::cout << std::endl
383 << "Max NPU time: " << npu_time_max << "us." << std::endl;
384 if (macs > 0)
385 std::cout << "Min NPU gflops: " << macs / (1000 * npu_time_max)
386 << std::endl;
387
388 if (!errors) {
389 std::cout << "\nPASS!\n\n";
390 return 0;
391 } else {
392 std::cout << "\nError count: " << errors << "\n\n";
393 std::cout << "\nFailed.\n\n";
394 return 1;
395 }
396}
void add_default_options(po::options_description &desc)
std::vector< uint32_t > load_instr_binary(std::string instr_path)
void init_xrt_load_kernel(xrt::device &device, xrt::kernel &kernel, int verbosity, std::string xclbinFileName, std::string kernelNameInXclbin)
void parse_options(int argc, const char *argv[], po::options_description &desc, po::variables_map &vm)
void write_out_trace(char *traceOutPtr, size_t trace_size, std::string path)
int n_warmup_iterations
std::string trace_file
std::string xclbin
int do_verify
int trace_size
int verbosity
std::string kernel
int n_iterations
std::string instr
struct args parse_args(int argc, const char *argv[])
int setup_and_run_aie(int IN1_VOLUME, int IN2_VOLUME, int OUT_VOLUME, struct args myargs)