MLIR-AIE
xrt_test_wrapper.h
Go to the documentation of this file.
#include "cxxopts.hpp"
#include "test_utils.h"
#include "xrt/xrt_bo.h"
#include <algorithm>
#include <chrono>
#include <cstdint>
#include <cstdlib>
#include <cstring>
#include <ctime>
#include <fstream>
#include <iostream>
#include <limits>
#include <sstream>
#include <string>
#include <vector>
7
// Run configuration shared by the XRT test wrappers in this file.
// Filled in by parse_args() and read by setup_and_run_aie().
struct args {
  int verbosity = 0;           // messages are printed when this is >= 1
  int do_verify = 0;           // non-zero: call the verify callback
  int n_iterations = 0;        // number of timed kernel launches
  int n_warmup_iterations = 0; // launches executed before timing starts
  int trace_size = 0;          // trace bytes to write out; 0 disables trace
  std::string instr;           // path of the instruction sequence file
  std::string xclbin;          // path of the xclbin file
  std::string kernel;          // kernel name inside the xclbin
  std::string trace_file;      // path the trace is written to
};
19
20struct args parse_args(int argc, const char *argv[]) {
21 // ------------------------------------------------------
22 // Parse program arguments
23 // ------------------------------------------------------
24 cxxopts::Options options("XRT Test Wrapper");
27
28 struct args myargs;
29
30 test_utils::parse_options(argc, argv, options, vm);
31 myargs.verbosity = vm["verbosity"].as<int>();
32 myargs.do_verify = vm["verify"].as<bool>();
33 myargs.n_iterations = vm["iters"].as<int>();
34 myargs.n_warmup_iterations = vm["warmup"].as<int>();
35 myargs.trace_size = vm["trace_sz"].as<int>();
36 myargs.instr = vm["instr"].as<std::string>();
37 myargs.xclbin = vm["xclbin"].as<std::string>();
38 myargs.kernel = vm["kernel"].as<std::string>();
39 myargs.trace_file = vm["trace_file"].as<std::string>();
40
41 return myargs;
42}
43
44/*
45 ******************************************************************************
46 * XRT based test wrapper for 2 inputs and 1 output
47 ******************************************************************************
48 */
49template <typename T1, typename T2, typename T3, void (*init_bufIn1)(T1 *, int),
50 void (*init_bufIn2)(T2 *, int), void (*init_bufOut)(T3 *, int),
51 int (*verify_results)(T1 *, T2 *, T3 *, int, int)>
52int setup_and_run_aie(int IN1_VOLUME, int IN2_VOLUME, int OUT_VOLUME,
53 struct args myargs) {
54
55 srand(time(NULL));
56
57 // Load instruction sequence
58 std::vector<uint32_t> instr_v = test_utils::load_instr_binary(myargs.instr);
59 if (myargs.verbosity >= 1)
60 std::cout << "Sequence instr count: " << instr_v.size() << "\n";
61
62 // Start the XRT context and load the kernel
63 xrt::device device;
64 xrt::kernel kernel;
65
67 myargs.xclbin, myargs.kernel);
68
69 // set up the buffer objects
70 auto bo_instr = xrt::bo(device, instr_v.size() * sizeof(int),
71 XCL_BO_FLAGS_CACHEABLE, kernel.group_id(1));
72 auto bo_in1 = xrt::bo(device, IN1_VOLUME * sizeof(T1), XRT_BO_FLAGS_HOST_ONLY,
73 kernel.group_id(3));
74 auto bo_in2 = xrt::bo(device, IN2_VOLUME * sizeof(T2), XRT_BO_FLAGS_HOST_ONLY,
75 kernel.group_id(4));
76 auto bo_out = xrt::bo(device, OUT_VOLUME * sizeof(T3), XRT_BO_FLAGS_HOST_ONLY,
77 kernel.group_id(5));
78
79 // Placeholder dummy buffer objects because 0 causes seg faults
80 auto bo_tmp1 = xrt::bo(device, 4, XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(6));
81
82 // Workaround so we declare a really small trace buffer when one is not used
83 // Second workaround for driver issue. Allocate large trace buffer *4
84 int tmp_trace_size = (myargs.trace_size > 0) ? myargs.trace_size * 4 : 1;
85 auto bo_trace = xrt::bo(device, tmp_trace_size, XRT_BO_FLAGS_HOST_ONLY,
86 kernel.group_id(7));
87
88 if (myargs.verbosity >= 1)
89 std::cout << "Writing data into buffer objects.\n";
90
91 // Copy instruction stream to xrt buffer object
92 void *bufInstr = bo_instr.map<void *>();
93 memcpy(bufInstr, instr_v.data(), instr_v.size() * sizeof(int));
94
95 // Initialize buffer objects
96 T1 *bufIn1 = bo_in1.map<T1 *>();
97 T2 *bufIn2 = bo_in2.map<T2 *>();
98 T3 *bufOut = bo_out.map<T3 *>();
99 char *bufTrace = bo_trace.map<char *>();
100
101 init_bufIn1(bufIn1, IN1_VOLUME);
102 init_bufIn2(bufIn2, IN2_VOLUME);
103 init_bufOut(bufOut, OUT_VOLUME); // <<< what size do I pass it?
104
105 char *bufTmp1 = bo_tmp1.map<char *>();
106 memset(bufTmp1, 0, 4);
107
108 if (myargs.trace_size > 0)
109 memset(bufTrace, 0, myargs.trace_size);
110
111 // sync host to device memories
112 bo_instr.sync(XCL_BO_SYNC_BO_TO_DEVICE);
113 bo_in1.sync(XCL_BO_SYNC_BO_TO_DEVICE);
114 bo_in2.sync(XCL_BO_SYNC_BO_TO_DEVICE);
115 bo_out.sync(XCL_BO_SYNC_BO_TO_DEVICE);
116 bo_tmp1.sync(XCL_BO_SYNC_BO_TO_DEVICE);
117 if (myargs.trace_size > 0)
118 bo_trace.sync(XCL_BO_SYNC_BO_TO_DEVICE);
119
120 // ------------------------------------------------------
121 // Initialize run configs
122 // ------------------------------------------------------
123 unsigned num_iter = myargs.n_iterations + myargs.n_warmup_iterations;
124 float npu_time_total = 0;
125 float npu_time_min = 9999999;
126 float npu_time_max = 0;
127
128 int errors = 0;
129
130 // ------------------------------------------------------
131 // Main run loop
132 // ------------------------------------------------------
133 for (unsigned iter = 0; iter < num_iter; iter++) {
134
135 if (myargs.verbosity >= 1)
136 std::cout << "Running Kernel.\n";
137
138 // Run kernel
139 if (myargs.verbosity >= 1)
140 std::cout << "Running Kernel.\n";
141 auto start = std::chrono::high_resolution_clock::now();
142 unsigned int opcode = 3;
143 auto run = kernel(opcode, bo_instr, instr_v.size(), bo_in1, bo_in2, bo_out,
144 bo_tmp1, bo_trace);
145 run.wait();
146 auto stop = std::chrono::high_resolution_clock::now();
147 bo_out.sync(XCL_BO_SYNC_BO_FROM_DEVICE);
148 if (myargs.trace_size > 0)
149 bo_trace.sync(XCL_BO_SYNC_BO_FROM_DEVICE);
150
151 if (iter < myargs.n_warmup_iterations)
152 /* Warmup iterations do not count towards average runtime. */
153 continue;
154
155 // Copy output results and verify they are correct
156 if (myargs.do_verify) {
157 if (myargs.verbosity >= 1) {
158 std::cout << "Verifying results ..." << std::endl;
159 }
160 auto vstart = std::chrono::system_clock::now();
161
162 errors +=
163 verify_results(bufIn1, bufIn2, bufOut, IN1_VOLUME, myargs.verbosity);
164
165 auto vstop = std::chrono::system_clock::now();
166 float vtime =
167 std::chrono::duration_cast<std::chrono::seconds>(vstop - vstart)
168 .count();
169 if (myargs.verbosity >= 1)
170 std::cout << "Verify time: " << vtime << "secs." << std::endl;
171 } else {
172 if (myargs.verbosity >= 1)
173 std::cout << "WARNING: results not verified." << std::endl;
174 }
175
176 // Write trace values if trace_size > 0 and first iteration
177 if (myargs.trace_size > 0 && iter == myargs.n_warmup_iterations) {
178 test_utils::write_out_trace(((char *)bufTrace), myargs.trace_size,
179 myargs.trace_file);
180 }
181
182 // Accumulate run times
183 float npu_time =
184 std::chrono::duration_cast<std::chrono::microseconds>(stop - start)
185 .count();
186
187 npu_time_total += npu_time;
188 npu_time_min = (npu_time < npu_time_min) ? npu_time : npu_time_min;
189 npu_time_max = (npu_time > npu_time_max) ? npu_time : npu_time_max;
190 }
191
192 // ------------------------------------------------------
193 // Print verification and timing results
194 // ------------------------------------------------------
195
196 // TODO - Mac count to guide gflops
197 float macs = 0;
198
199 std::cout << std::endl
200 << "Avg NPU time: " << npu_time_total / myargs.n_iterations << "us."
201 << std::endl;
202 if (macs > 0)
203 std::cout << "Avg NPU gflops: "
204 << macs / (1000 * npu_time_total / myargs.n_iterations)
205 << std::endl;
206
207 std::cout << std::endl
208 << "Min NPU time: " << npu_time_min << "us." << std::endl;
209 if (macs > 0)
210 std::cout << "Max NPU gflops: " << macs / (1000 * npu_time_min)
211 << std::endl;
212
213 std::cout << std::endl
214 << "Max NPU time: " << npu_time_max << "us." << std::endl;
215 if (macs > 0)
216 std::cout << "Min NPU gflops: " << macs / (1000 * npu_time_max)
217 << std::endl;
218
219 if (!errors) {
220 std::cout << "\nPASS!\n\n";
221 return 0;
222 } else {
223 std::cout << "\nError count: " << errors << "\n\n";
224 std::cout << "\nFailed.\n\n";
225 return 1;
226 }
227}
228
229/*
230 ******************************************************************************
231 * XRT based test wrapper for 1 input and 1 output
232 ******************************************************************************
233 */
234template <typename T1, typename T3, void (*init_bufIn1)(T1 *, int),
235 void (*init_bufOut)(T3 *, int),
236 int (*verify_results)(T1 *, T3 *, int, int)>
237int setup_and_run_aie(int IN1_VOLUME, int OUT_VOLUME, struct args myargs) {
238
239 srand(time(NULL));
240
241 // Load instruction sequence
242 std::vector<uint32_t> instr_v = test_utils::load_instr_binary(myargs.instr);
243 if (myargs.verbosity >= 1)
244 std::cout << "Sequence instr count: " << instr_v.size() << "\n";
245
246 // Start the XRT context and load the kernel
247 xrt::device device;
248 xrt::kernel kernel;
249
251 myargs.xclbin, myargs.kernel);
252
253 // set up the buffer objects
254 auto bo_instr = xrt::bo(device, instr_v.size() * sizeof(int),
255 XCL_BO_FLAGS_CACHEABLE, kernel.group_id(1));
256 auto bo_in1 = xrt::bo(device, IN1_VOLUME * sizeof(T1), XRT_BO_FLAGS_HOST_ONLY,
257 kernel.group_id(3));
258 auto bo_out = xrt::bo(device, OUT_VOLUME * sizeof(T3), XRT_BO_FLAGS_HOST_ONLY,
259 kernel.group_id(4));
260
261 // Placeholder dummy buffer objects because 0 causes seg faults
262 auto bo_tmp1 = xrt::bo(device, 1, XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(5));
263 auto bo_tmp2 = xrt::bo(device, 1, XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(6));
264
265 // Workaround so we declare a really small trace buffer when one is not used
266 // Second workaround for driver issue. Allocate large trace buffer *4
267 int tmp_trace_size = (myargs.trace_size > 0) ? myargs.trace_size * 4 : 1;
268 auto bo_trace = xrt::bo(device, tmp_trace_size, XRT_BO_FLAGS_HOST_ONLY,
269 kernel.group_id(7));
270
271 if (myargs.verbosity >= 1)
272 std::cout << "Writing data into buffer objects.\n";
273
274 // Copy instruction stream to xrt buffer object
275 void *bufInstr = bo_instr.map<void *>();
276 memcpy(bufInstr, instr_v.data(), instr_v.size() * sizeof(int));
277
278 // Initialize buffer objects
279 T1 *bufIn1 = bo_in1.map<T1 *>();
280 T3 *bufOut = bo_out.map<T3 *>();
281
282 char *bufTrace = bo_trace.map<char *>();
283
284 init_bufIn1(bufIn1, IN1_VOLUME);
285 init_bufOut(bufOut,
286 OUT_VOLUME); // <<< what size do I pass it? reset with trace?
287 if (myargs.trace_size > 0)
288 memset(bufTrace, 0, myargs.trace_size);
289
290 // sync host to device memories
291 bo_instr.sync(XCL_BO_SYNC_BO_TO_DEVICE);
292 bo_in1.sync(XCL_BO_SYNC_BO_TO_DEVICE);
293 bo_out.sync(XCL_BO_SYNC_BO_TO_DEVICE);
294 if (myargs.trace_size > 0)
295 bo_trace.sync(XCL_BO_SYNC_BO_TO_DEVICE);
296
297 // ------------------------------------------------------
298 // Initialize run configs
299 // ------------------------------------------------------
300 unsigned num_iter = myargs.n_iterations + myargs.n_warmup_iterations;
301 float npu_time_total = 0;
302 float npu_time_min = 9999999;
303 float npu_time_max = 0;
304
305 int errors = 0;
306
307 // ------------------------------------------------------
308 // Main run loop
309 // ------------------------------------------------------
310 for (unsigned iter = 0; iter < num_iter; iter++) {
311
312 if (myargs.verbosity >= 1)
313 std::cout << "Running Kernel.\n";
314
315 // Run kernel
316 if (myargs.verbosity >= 1)
317 std::cout << "Running Kernel.\n";
318 auto start = std::chrono::high_resolution_clock::now();
319 unsigned int opcode = 3;
320 // auto run = kernel(opcode, bo_instr, instr_v.size(), bo_in1, bo_out);
321 // auto run = kernel(opcode, bo_instr, instr_v.size(), bo_in1, bo_out, 0, 0,
322 auto run = kernel(opcode, bo_instr, instr_v.size(), bo_in1, bo_out, bo_tmp1,
323 bo_tmp2, bo_trace);
324 run.wait();
325 auto stop = std::chrono::high_resolution_clock::now();
326 bo_out.sync(XCL_BO_SYNC_BO_FROM_DEVICE);
327 if (myargs.trace_size > 0)
328 bo_trace.sync(XCL_BO_SYNC_BO_FROM_DEVICE);
329
330 if (iter < myargs.n_warmup_iterations)
331 /* Warmup iterations do not count towards average runtime. */
332 continue;
333
334 // Copy output results and verify they are correct
335 if (myargs.do_verify) {
336 if (myargs.verbosity >= 1) {
337 std::cout << "Verifying results ..." << std::endl;
338 }
339 auto vstart = std::chrono::system_clock::now();
340
341 errors += verify_results(bufIn1, bufOut, IN1_VOLUME, myargs.verbosity);
342
343 auto vstop = std::chrono::system_clock::now();
344 float vtime =
345 std::chrono::duration_cast<std::chrono::seconds>(vstop - vstart)
346 .count();
347 if (myargs.verbosity >= 1)
348 std::cout << "Verify time: " << vtime << "secs." << std::endl;
349 } else {
350 if (myargs.verbosity >= 1)
351 std::cout << "WARNING: results not verified." << std::endl;
352 }
353
354 // Write trace values if trace_size > 0 and first iteration
355 if (myargs.trace_size > 0 && iter == myargs.n_warmup_iterations) {
356 // std::cout << "Writing to offset " << OUT_VOLUME * sizeof(T3) <<
357 // std::endl;
358 std::cout << "Writing trace of size " << myargs.trace_size << std::endl;
359 // test_utils::write_out_trace(((char *)bufOut) + OUT_VOLUME * sizeof(T3),
360 test_utils::write_out_trace((char *)bufTrace, myargs.trace_size,
361 myargs.trace_file);
362 }
363
364 // Accumulate run times
365 float npu_time =
366 std::chrono::duration_cast<std::chrono::microseconds>(stop - start)
367 .count();
368
369 npu_time_total += npu_time;
370 npu_time_min = (npu_time < npu_time_min) ? npu_time : npu_time_min;
371 npu_time_max = (npu_time > npu_time_max) ? npu_time : npu_time_max;
372 }
373
374 // ------------------------------------------------------
375 // Print verification and timing results
376 // ------------------------------------------------------
377
378 // TODO - Mac count to guide gflops
379 float macs = 0;
380
381 std::cout << std::endl
382 << "Avg NPU time: " << npu_time_total / myargs.n_iterations << "us."
383 << std::endl;
384 if (macs > 0)
385 std::cout << "Avg NPU gflops: "
386 << macs / (1000 * npu_time_total / myargs.n_iterations)
387 << std::endl;
388
389 std::cout << std::endl
390 << "Min NPU time: " << npu_time_min << "us." << std::endl;
391 if (macs > 0)
392 std::cout << "Max NPU gflops: " << macs / (1000 * npu_time_min)
393 << std::endl;
394
395 std::cout << std::endl
396 << "Max NPU time: " << npu_time_max << "us." << std::endl;
397 if (macs > 0)
398 std::cout << "Min NPU gflops: " << macs / (1000 * npu_time_max)
399 << std::endl;
400
401 if (!errors) {
402 std::cout << "\nPASS!\n\n";
403 return 0;
404 } else {
405 std::cout << "\nError count: " << errors << "\n\n";
406 std::cout << "\nFailed.\n\n";
407 return 1;
408 }
409}
std::vector< uint32_t > load_instr_binary(std::string instr_path)
void init_xrt_load_kernel(xrt::device &device, xrt::kernel &kernel, int verbosity, std::string xclbinFileName, std::string kernelNameInXclbin)
void parse_options(int argc, const char *argv[], cxxopts::Options &options, cxxopts::ParseResult &result)
void write_out_trace(char *traceOutPtr, size_t trace_size, std::string path)
void add_default_options(cxxopts::Options &options)
int n_warmup_iterations
std::string trace_file
std::string xclbin
int do_verify
int trace_size
int verbosity
std::string kernel
int n_iterations
std::string instr
struct args parse_args(int argc, const char *argv[])
int setup_and_run_aie(int IN1_VOLUME, int IN2_VOLUME, int OUT_VOLUME, struct args myargs)