MLIR-AIE
xrt_test_wrapper.h
Go to the documentation of this file.
1#include "cxxopts.hpp"
2#include "test_utils.h"
3
4#include "xrt/xrt_bo.h"
5#include "xrt/xrt_device.h"
6#include "xrt/xrt_hw_context.h"
7#include "xrt/xrt_kernel.h"
8
9#include <fstream>
10#include <iostream>
11#include <sstream>
12
// Run-time configuration for the XRT test wrappers in this header.
// parse_args() fills every member from the parsed command-line options; the
// option name each member is read from is noted next to it.
struct args {
  int verbosity = 0;           // "verbosity": messages are printed when >= 1
  int do_verify = 0;           // "verify": non-zero runs the verify callback
  int n_iterations = 0;        // "iters": number of timed kernel runs
  int n_warmup_iterations = 0; // "warmup": untimed runs executed first
  int trace_size = 0;          // "trace_sz": trace bytes; 0 disables tracing
  std::string instr;           // "instr": path of the instruction binary
  std::string xclbin;          // "xclbin": path of the xclbin to load
  std::string kernel;          // "kernel": kernel name inside the xclbin
  std::string trace_file;      // "trace_file": path the trace is written to
};
24
25struct args parse_args(int argc, const char *argv[]) {
26 // ------------------------------------------------------
27 // Parse program arguments
28 // ------------------------------------------------------
29 cxxopts::Options options("XRT Test Wrapper");
32
33 struct args myargs;
34
35 test_utils::parse_options(argc, argv, options, vm);
36 myargs.verbosity = vm["verbosity"].as<int>();
37 myargs.do_verify = vm["verify"].as<bool>();
38 myargs.n_iterations = vm["iters"].as<int>();
39 myargs.n_warmup_iterations = vm["warmup"].as<int>();
40 myargs.trace_size = vm["trace_sz"].as<int>();
41 myargs.instr = vm["instr"].as<std::string>();
42 myargs.xclbin = vm["xclbin"].as<std::string>();
43 myargs.kernel = vm["kernel"].as<std::string>();
44 myargs.trace_file = vm["trace_file"].as<std::string>();
45
46 return myargs;
47}
48
// Returns the parity of the set bits in `n`: 0 when `n` contains an even
// number of 1 bits (including n == 0), 1 when it contains an odd number.
//
// Declared inline because this is a function definition in a header.
inline uint32_t getParity(uint32_t n) {
  uint32_t parity = 0;
  // XOR-fold every bit of n into the low bit of `parity`.
  for (; n != 0; n >>= 1)
    parity ^= (n & 1u);
  return parity;
}
59
60uint32_t create_ctrl_pkt(int operation, int beats, int addr,
61 int ctrl_pkt_read_id = 28) {
62 uint32_t ctrl_pkt = ((ctrl_pkt_read_id & 0xFF) << 24) |
63 ((operation & 0x3) << 22) | ((beats & 0x3) << 20) |
64 (addr & 0x7FFFF);
65 ctrl_pkt |= (0x1 ^ getParity(ctrl_pkt)) << 31;
66 return ctrl_pkt;
67}
68
69/*
70 ******************************************************************************
71 * XRT based test wrapper for 2 inputs and 1 output
72 ******************************************************************************
73 */
74template <typename T1, typename T2, typename T3, void (*init_bufIn1)(T1 *, int),
75 void (*init_bufIn2)(T2 *, int), void (*init_bufOut)(T3 *, int),
76 int (*verify_results)(T1 *, T2 *, T3 *, int, int)>
77int setup_and_run_aie(int IN1_VOLUME, int IN2_VOLUME, int OUT_VOLUME,
78 struct args myargs, bool enable_ctrl_pkts = false) {
79
80 srand(time(NULL));
81
82 // Load instruction sequence
83 std::vector<uint32_t> instr_v = test_utils::load_instr_binary(myargs.instr);
84 if (myargs.verbosity >= 1)
85 std::cout << "Sequence instr count: " << instr_v.size() << "\n";
86
87 // Start the XRT context and load the kernel
88 xrt::device device;
89 xrt::kernel kernel;
90
92 myargs.xclbin, myargs.kernel);
93
94 // set up the buffer objects
95 auto bo_instr = xrt::bo(device, instr_v.size() * sizeof(int),
96 XCL_BO_FLAGS_CACHEABLE, kernel.group_id(1));
97 auto bo_in1 = xrt::bo(device, IN1_VOLUME * sizeof(T1), XRT_BO_FLAGS_HOST_ONLY,
98 kernel.group_id(3));
99 auto bo_in2 = xrt::bo(device, IN2_VOLUME * sizeof(T2), XRT_BO_FLAGS_HOST_ONLY,
100 kernel.group_id(4));
101 auto bo_out = xrt::bo(device, OUT_VOLUME * sizeof(T3), XRT_BO_FLAGS_HOST_ONLY,
102 kernel.group_id(5));
103
104 // If we enable control packets, then this is the input xrt buffer for that.
105 // Otherwise, this is a dummy placedholder buffer.
106 auto bo_ctrlpkts =
107 xrt::bo(device, 8, XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(6));
108
109 // Workaround so we declare a really small trace buffer when one is not used
110 // Second workaround for driver issue. Allocate large trace buffer *4
111 // This includes the 8 bytes needed for control packet response.
112 int tmp_trace_size = (myargs.trace_size > 0) ? myargs.trace_size * 4 : 1;
113 auto bo_trace = xrt::bo(device, tmp_trace_size, XRT_BO_FLAGS_HOST_ONLY,
114 kernel.group_id(7));
115
116 if (myargs.verbosity >= 1)
117 std::cout << "Writing data into buffer objects.\n";
118
119 // Copy instruction stream to xrt buffer object
120 void *bufInstr = bo_instr.map<void *>();
121 memcpy(bufInstr, instr_v.data(), instr_v.size() * sizeof(int));
122
123 // Initialize buffer objects
124 T1 *bufIn1 = bo_in1.map<T1 *>();
125 T2 *bufIn2 = bo_in2.map<T2 *>();
126 T3 *bufOut = bo_out.map<T3 *>();
127 char *bufTrace = bo_trace.map<char *>();
128 uint32_t *bufCtrlPkts = bo_ctrlpkts.map<uint32_t *>();
129
130 init_bufIn1(bufIn1, IN1_VOLUME);
131 init_bufIn2(bufIn2, IN2_VOLUME);
132 init_bufOut(bufOut, OUT_VOLUME); // <<< what size do I pass it?
133
134 // char *bufTmp1 = bo_tmp1.map<char *>();
135 // memset(bufTmp1, 0, 4);
136
137 if (myargs.trace_size > 0)
138 memset(bufTrace, 0, myargs.trace_size);
139
140 // Set control packet values
141 if (myargs.trace_size > 0 && enable_ctrl_pkts) {
142 bufCtrlPkts[0] = create_ctrl_pkt(1, 0, 0x32004); // core status
143 bufCtrlPkts[1] = create_ctrl_pkt(1, 0, 0x320D8); // trace status
144 if (myargs.verbosity >= 1) {
145 std::cout << "bufCtrlPkts[0]:" << std::hex << bufCtrlPkts[0] << std::endl;
146 std::cout << "bufCtrlPkts[1]:" << std::hex << bufCtrlPkts[1] << std::endl;
147 }
148 }
149
150 // sync host to device memories
151 bo_instr.sync(XCL_BO_SYNC_BO_TO_DEVICE);
152 bo_in1.sync(XCL_BO_SYNC_BO_TO_DEVICE);
153 bo_in2.sync(XCL_BO_SYNC_BO_TO_DEVICE);
154 bo_out.sync(XCL_BO_SYNC_BO_TO_DEVICE);
155 // bo_tmp1.sync(XCL_BO_SYNC_BO_TO_DEVICE);
156 if (myargs.trace_size > 0) {
157 bo_trace.sync(XCL_BO_SYNC_BO_TO_DEVICE);
158 if (enable_ctrl_pkts)
159 bo_ctrlpkts.sync(XCL_BO_SYNC_BO_TO_DEVICE);
160 }
161
162 // ------------------------------------------------------
163 // Initialize run configs
164 // ------------------------------------------------------
165 unsigned num_iter = myargs.n_iterations + myargs.n_warmup_iterations;
166 float npu_time_total = 0;
167 float npu_time_min = 9999999;
168 float npu_time_max = 0;
169
170 int errors = 0;
171
172 // ------------------------------------------------------
173 // Main run loop
174 // ------------------------------------------------------
175 for (unsigned iter = 0; iter < num_iter; iter++) {
176
177 if (myargs.verbosity >= 1)
178 std::cout << "Running Kernel.\n";
179
180 // Run kernel
181 if (myargs.verbosity >= 1)
182 std::cout << "Running Kernel.\n";
183 auto start = std::chrono::high_resolution_clock::now();
184 unsigned int opcode = 3;
185 auto run = kernel(opcode, bo_instr, instr_v.size(), bo_in1, bo_in2, bo_out,
186 bo_ctrlpkts, bo_trace);
187 run.wait();
188 auto stop = std::chrono::high_resolution_clock::now();
189 bo_out.sync(XCL_BO_SYNC_BO_FROM_DEVICE);
190 if (myargs.trace_size > 0)
191 bo_trace.sync(XCL_BO_SYNC_BO_FROM_DEVICE);
192
193 if (iter < myargs.n_warmup_iterations)
194 /* Warmup iterations do not count towards average runtime. */
195 continue;
196
197 // Copy output results and verify they are correct
198 if (myargs.do_verify) {
199 if (myargs.verbosity >= 1) {
200 std::cout << "Verifying results ..." << std::endl;
201 }
202 auto vstart = std::chrono::system_clock::now();
203
204 errors +=
205 verify_results(bufIn1, bufIn2, bufOut, IN1_VOLUME, myargs.verbosity);
206
207 auto vstop = std::chrono::system_clock::now();
208 float vtime =
209 std::chrono::duration_cast<std::chrono::seconds>(vstop - vstart)
210 .count();
211 if (myargs.verbosity >= 1)
212 std::cout << "Verify time: " << vtime << "secs." << std::endl;
213 } else {
214 if (myargs.verbosity >= 1)
215 std::cout << "WARNING: results not verified." << std::endl;
216 }
217
218 // Write trace values if trace_size > 0 and first iteration
219 if (myargs.trace_size > 0 && iter == myargs.n_warmup_iterations) {
220 test_utils::write_out_trace(((char *)bufTrace), myargs.trace_size,
221 myargs.trace_file);
222 }
223
224 // Write out control packet outputs
225 if (enable_ctrl_pkts) {
226 uint32_t *ctrl_pkt_out =
227 (uint32_t *)(((char *)bufTrace) + myargs.trace_size);
228 if (myargs.verbosity >= 1) {
229 std::cout << "ctrl_pkt_out[0]:" << std::hex << ctrl_pkt_out[0]
230 << std::endl;
231 std::cout << "ctrl_pkt_out[1]:" << std::hex << ctrl_pkt_out[1]
232 << std::endl;
233 }
234 int col = (ctrl_pkt_out[0] >> 21) & 0x7F;
235 int row = (ctrl_pkt_out[0] >> 16) & 0x1F;
236 if ((ctrl_pkt_out[1] >> 8) == 3)
237 std::cout << "WARNING: Trace overflow detected in tile(" << row << ","
238 << col << ". Trace results may be invalid." << std::endl;
239 }
240
241 // Accumulate run times
242 float npu_time =
243 std::chrono::duration_cast<std::chrono::microseconds>(stop - start)
244 .count();
245
246 npu_time_total += npu_time;
247 npu_time_min = (npu_time < npu_time_min) ? npu_time : npu_time_min;
248 npu_time_max = (npu_time > npu_time_max) ? npu_time : npu_time_max;
249 }
250
251 // ------------------------------------------------------
252 // Print verification and timing results
253 // ------------------------------------------------------
254
255 // TODO - Mac count to guide gflops
256 float macs = 0;
257
258 std::cout << std::endl
259 << "Avg NPU time: " << npu_time_total / myargs.n_iterations << "us."
260 << std::endl;
261 if (macs > 0)
262 std::cout << "Avg NPU gflops: "
263 << macs / (1000 * npu_time_total / myargs.n_iterations)
264 << std::endl;
265
266 std::cout << std::endl
267 << "Min NPU time: " << npu_time_min << "us." << std::endl;
268 if (macs > 0)
269 std::cout << "Max NPU gflops: " << macs / (1000 * npu_time_min)
270 << std::endl;
271
272 std::cout << std::endl
273 << "Max NPU time: " << npu_time_max << "us." << std::endl;
274 if (macs > 0)
275 std::cout << "Min NPU gflops: " << macs / (1000 * npu_time_max)
276 << std::endl;
277
278 if (!errors) {
279 std::cout << "\nPASS!\n\n";
280 return 0;
281 } else {
282 std::cout << "\nError count: " << errors << "\n\n";
283 std::cout << "\nFailed.\n\n";
284 return 1;
285 }
286}
287
288/*
289 ******************************************************************************
290 * XRT based test wrapper for 1 input and 1 output
291 ******************************************************************************
292 */
293template <typename T1, typename T3, void (*init_bufIn1)(T1 *, int),
294 void (*init_bufOut)(T3 *, int),
295 int (*verify_results)(T1 *, T3 *, int, int)>
296int setup_and_run_aie(int IN1_VOLUME, int OUT_VOLUME, struct args myargs,
297 bool enable_ctrl_pkts = false) {
298 srand(time(NULL));
299
300 // Load instruction sequence
301 std::vector<uint32_t> instr_v = test_utils::load_instr_binary(myargs.instr);
302 if (myargs.verbosity >= 1)
303 std::cout << "Sequence instr count: " << instr_v.size() << "\n";
304
305 // Start the XRT context and load the kernel
306 xrt::device device;
307 xrt::kernel kernel;
308
310 myargs.xclbin, myargs.kernel);
311
312 // set up the buffer objects
313 auto bo_instr = xrt::bo(device, instr_v.size() * sizeof(int),
314 XCL_BO_FLAGS_CACHEABLE, kernel.group_id(1));
315 auto bo_in1 = xrt::bo(device, IN1_VOLUME * sizeof(T1), XRT_BO_FLAGS_HOST_ONLY,
316 kernel.group_id(3));
317 auto bo_out = xrt::bo(device, OUT_VOLUME * sizeof(T3), XRT_BO_FLAGS_HOST_ONLY,
318 kernel.group_id(4));
319
320 // Placeholder dummy buffer objects because 0 causes seg faults
321 auto bo_tmp1 = xrt::bo(device, 1, XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(5));
322
323 // If we enable control packets, then this is the input xrt buffer for that.
324 // Otherwise, this is a dummy placedholder buffer.
325 auto bo_ctrlpkts =
326 xrt::bo(device, 8, XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(6));
327
328 // Workaround so we declare a really small trace buffer when one is not used
329 // Second workaround for driver issue. Allocate large trace buffer *4
330 int tmp_trace_size = (myargs.trace_size > 0) ? myargs.trace_size * 4 : 1;
331 auto bo_trace = xrt::bo(device, tmp_trace_size, XRT_BO_FLAGS_HOST_ONLY,
332 kernel.group_id(7));
333
334 if (myargs.verbosity >= 1)
335 std::cout << "Writing data into buffer objects.\n";
336
337 // Copy instruction stream to xrt buffer object
338 void *bufInstr = bo_instr.map<void *>();
339 memcpy(bufInstr, instr_v.data(), instr_v.size() * sizeof(int));
340
341 // Initialize buffer objects
342 T1 *bufIn1 = bo_in1.map<T1 *>();
343 T3 *bufOut = bo_out.map<T3 *>();
344
345 char *bufTrace = bo_trace.map<char *>();
346 uint32_t *bufCtrlPkts = bo_ctrlpkts.map<uint32_t *>();
347
348 init_bufIn1(bufIn1, IN1_VOLUME);
349 init_bufOut(bufOut,
350 OUT_VOLUME); // <<< what size do I pass it? reset with trace?
351 if (myargs.trace_size > 0)
352 memset(bufTrace, 0, myargs.trace_size);
353
354 // Set control packet values
355 if (myargs.trace_size > 0 && enable_ctrl_pkts) {
356 bufCtrlPkts[0] = create_ctrl_pkt(1, 0, 0x32004); // core status
357 bufCtrlPkts[1] = create_ctrl_pkt(1, 0, 0x320D8); // trace status
358 if (myargs.verbosity >= 1) {
359 std::cout << "bufCtrlPkts[0]:" << std::hex << bufCtrlPkts[0] << std::endl;
360 std::cout << "bufCtrlPkts[1]:" << std::hex << bufCtrlPkts[1] << std::endl;
361 }
362 }
363
364 // sync host to device memories
365 bo_instr.sync(XCL_BO_SYNC_BO_TO_DEVICE);
366 bo_in1.sync(XCL_BO_SYNC_BO_TO_DEVICE);
367 bo_out.sync(XCL_BO_SYNC_BO_TO_DEVICE);
368 if (myargs.trace_size > 0) {
369 bo_trace.sync(XCL_BO_SYNC_BO_TO_DEVICE);
370 if (enable_ctrl_pkts)
371 bo_ctrlpkts.sync(XCL_BO_SYNC_BO_TO_DEVICE);
372 }
373
374 // ------------------------------------------------------
375 // Initialize run configs
376 // ------------------------------------------------------
377 unsigned num_iter = myargs.n_iterations + myargs.n_warmup_iterations;
378 float npu_time_total = 0;
379 float npu_time_min = 9999999;
380 float npu_time_max = 0;
381
382 int errors = 0;
383
384 // ------------------------------------------------------
385 // Main run loop
386 // ------------------------------------------------------
387 for (unsigned iter = 0; iter < num_iter; iter++) {
388
389 if (myargs.verbosity >= 1)
390 std::cout << "Running Kernel.\n";
391
392 // Run kernel
393 if (myargs.verbosity >= 1)
394 std::cout << "Running Kernel.\n";
395 auto start = std::chrono::high_resolution_clock::now();
396 unsigned int opcode = 3;
397 // auto run = kernel(opcode, bo_instr, instr_v.size(), bo_in1, bo_out);
398 // auto run = kernel(opcode, bo_instr, instr_v.size(), bo_in1, bo_out, 0, 0,
399 auto run = kernel(opcode, bo_instr, instr_v.size(), bo_in1, bo_out, bo_tmp1,
400 bo_ctrlpkts, bo_trace);
401 run.wait();
402 auto stop = std::chrono::high_resolution_clock::now();
403 bo_out.sync(XCL_BO_SYNC_BO_FROM_DEVICE);
404 if (myargs.trace_size > 0)
405 bo_trace.sync(XCL_BO_SYNC_BO_FROM_DEVICE);
406
407 if (iter < myargs.n_warmup_iterations)
408 /* Warmup iterations do not count towards average runtime. */
409 continue;
410
411 // Copy output results and verify they are correct
412 if (myargs.do_verify) {
413 if (myargs.verbosity >= 1) {
414 std::cout << "Verifying results ..." << std::endl;
415 }
416 auto vstart = std::chrono::system_clock::now();
417
418 errors += verify_results(bufIn1, bufOut, IN1_VOLUME, myargs.verbosity);
419
420 auto vstop = std::chrono::system_clock::now();
421 float vtime =
422 std::chrono::duration_cast<std::chrono::seconds>(vstop - vstart)
423 .count();
424 if (myargs.verbosity >= 1)
425 std::cout << "Verify time: " << vtime << "secs." << std::endl;
426 } else {
427 if (myargs.verbosity >= 1)
428 std::cout << "WARNING: results not verified." << std::endl;
429 }
430
431 // Write trace values if trace_size > 0 and first iteration
432 if (myargs.trace_size > 0 && iter == myargs.n_warmup_iterations) {
433 // std::cout << "Writing to offset " << OUT_VOLUME * sizeof(T3) <<
434 // std::endl;
435 std::cout << "Writing trace of size " << myargs.trace_size << std::endl;
436 // test_utils::write_out_trace(((char *)bufOut) + OUT_VOLUME * sizeof(T3),
437 test_utils::write_out_trace((char *)bufTrace, myargs.trace_size,
438 myargs.trace_file);
439 }
440
441 // Write out control packet outputs
442 if (enable_ctrl_pkts) {
443 uint32_t *ctrl_pkt_out =
444 (uint32_t *)(((char *)bufTrace) + myargs.trace_size);
445 if (myargs.verbosity >= 1) {
446 std::cout << "ctrl_pkt_out[0]:" << std::hex << ctrl_pkt_out[0]
447 << std::endl;
448 std::cout << "ctrl_pkt_out[1]:" << std::hex << ctrl_pkt_out[1]
449 << std::endl;
450 }
451 int col = (ctrl_pkt_out[0] >> 21) & 0x7F;
452 int row = (ctrl_pkt_out[0] >> 16) & 0x1F;
453 if ((ctrl_pkt_out[1] >> 8) == 3)
454 std::cout << "WARNING: Trace overflow detected in tile(" << row << ","
455 << col << ". Trace results may be invalid." << std::endl;
456 }
457
458 // Accumulate run times
459 float npu_time =
460 std::chrono::duration_cast<std::chrono::microseconds>(stop - start)
461 .count();
462
463 npu_time_total += npu_time;
464 npu_time_min = (npu_time < npu_time_min) ? npu_time : npu_time_min;
465 npu_time_max = (npu_time > npu_time_max) ? npu_time : npu_time_max;
466 }
467
468 // ------------------------------------------------------
469 // Print verification and timing results
470 // ------------------------------------------------------
471
472 // TODO - Mac count to guide gflops
473 float macs = 0;
474
475 std::cout << std::endl
476 << "Avg NPU time: " << npu_time_total / myargs.n_iterations << "us."
477 << std::endl;
478 if (macs > 0)
479 std::cout << "Avg NPU gflops: "
480 << macs / (1000 * npu_time_total / myargs.n_iterations)
481 << std::endl;
482
483 std::cout << std::endl
484 << "Min NPU time: " << npu_time_min << "us." << std::endl;
485 if (macs > 0)
486 std::cout << "Max NPU gflops: " << macs / (1000 * npu_time_min)
487 << std::endl;
488
489 std::cout << std::endl
490 << "Max NPU time: " << npu_time_max << "us." << std::endl;
491 if (macs > 0)
492 std::cout << "Min NPU gflops: " << macs / (1000 * npu_time_max)
493 << std::endl;
494
495 if (!errors) {
496 std::cout << "\nPASS!\n\n";
497 return 0;
498 } else {
499 std::cout << "\nError count: " << errors << "\n\n";
500 std::cout << "\nFailed.\n\n";
501 return 1;
502 }
503}
std::vector< uint32_t > load_instr_binary(std::string instr_path)
void init_xrt_load_kernel(xrt::device &device, xrt::kernel &kernel, int verbosity, std::string xclbinFileName, std::string kernelNameInXclbin)
void parse_options(int argc, const char *argv[], cxxopts::Options &options, cxxopts::ParseResult &result)
void write_out_trace(char *traceOutPtr, size_t trace_size, std::string path)
void add_default_options(cxxopts::Options &options)
int n_warmup_iterations
std::string trace_file
std::string xclbin
int do_verify
int trace_size
int verbosity
std::string kernel
int n_iterations
std::string instr
uint32_t create_ctrl_pkt(int operation, int beats, int addr, int ctrl_pkt_read_id=28)
int setup_and_run_aie(int IN1_VOLUME, int IN2_VOLUME, int OUT_VOLUME, struct args myargs, bool enable_ctrl_pkts=false)
struct args parse_args(int argc, const char *argv[])
uint32_t getParity(uint32_t n)