78 struct args myargs,
bool enable_ctrl_pkts =
false) {
85 std::cout <<
"Sequence instr count: " << instr_v.size() <<
"\n";
95 auto bo_instr = xrt::bo(device, instr_v.size() *
sizeof(
int),
96 XCL_BO_FLAGS_CACHEABLE,
kernel.group_id(1));
97 auto bo_in1 = xrt::bo(device, IN1_VOLUME *
sizeof(T1), XRT_BO_FLAGS_HOST_ONLY,
99 auto bo_in2 = xrt::bo(device, IN2_VOLUME *
sizeof(T2), XRT_BO_FLAGS_HOST_ONLY,
101 auto bo_out = xrt::bo(device, OUT_VOLUME *
sizeof(T3), XRT_BO_FLAGS_HOST_ONLY,
107 xrt::bo(device, 8, XRT_BO_FLAGS_HOST_ONLY,
kernel.group_id(6));
113 auto bo_trace = xrt::bo(device, tmp_trace_size, XRT_BO_FLAGS_HOST_ONLY,
117 std::cout <<
"Writing data into buffer objects.\n";
120 void *bufInstr = bo_instr.map<
void *>();
121 memcpy(bufInstr, instr_v.data(), instr_v.size() *
sizeof(
int));
124 T1 *bufIn1 = bo_in1.map<T1 *>();
125 T2 *bufIn2 = bo_in2.map<T2 *>();
126 T3 *bufOut = bo_out.map<T3 *>();
127 char *bufTrace = bo_trace.map<
char *>();
128 uint32_t *bufCtrlPkts = bo_ctrlpkts.map<uint32_t *>();
130 init_bufIn1(bufIn1, IN1_VOLUME);
131 init_bufIn2(bufIn2, IN2_VOLUME);
132 init_bufOut(bufOut, OUT_VOLUME);
141 if (myargs.
trace_size > 0 && enable_ctrl_pkts) {
145 std::cout <<
"bufCtrlPkts[0]:" << std::hex << bufCtrlPkts[0] << std::endl;
146 std::cout <<
"bufCtrlPkts[1]:" << std::hex << bufCtrlPkts[1] << std::endl;
151 bo_instr.sync(XCL_BO_SYNC_BO_TO_DEVICE);
152 bo_in1.sync(XCL_BO_SYNC_BO_TO_DEVICE);
153 bo_in2.sync(XCL_BO_SYNC_BO_TO_DEVICE);
154 bo_out.sync(XCL_BO_SYNC_BO_TO_DEVICE);
157 bo_trace.sync(XCL_BO_SYNC_BO_TO_DEVICE);
158 if (enable_ctrl_pkts)
159 bo_ctrlpkts.sync(XCL_BO_SYNC_BO_TO_DEVICE);
166 float npu_time_total = 0;
167 float npu_time_min = 9999999;
168 float npu_time_max = 0;
175 for (
unsigned iter = 0; iter < num_iter; iter++) {
178 std::cout <<
"Running Kernel.\n";
182 std::cout <<
"Running Kernel.\n";
183 auto start = std::chrono::high_resolution_clock::now();
184 unsigned int opcode = 3;
185 auto run =
kernel(opcode, bo_instr, instr_v.size(), bo_in1, bo_in2, bo_out,
186 bo_ctrlpkts, bo_trace);
188 auto stop = std::chrono::high_resolution_clock::now();
189 bo_out.sync(XCL_BO_SYNC_BO_FROM_DEVICE);
191 bo_trace.sync(XCL_BO_SYNC_BO_FROM_DEVICE);
200 std::cout <<
"Verifying results ..." << std::endl;
202 auto vstart = std::chrono::system_clock::now();
205 verify_results(bufIn1, bufIn2, bufOut, IN1_VOLUME, myargs.
verbosity);
207 auto vstop = std::chrono::system_clock::now();
209 std::chrono::duration_cast<std::chrono::seconds>(vstop - vstart)
212 std::cout <<
"Verify time: " << vtime <<
"secs." << std::endl;
215 std::cout <<
"WARNING: results not verified." << std::endl;
225 if (enable_ctrl_pkts) {
226 uint32_t *ctrl_pkt_out =
227 (uint32_t *)(((
char *)bufTrace) + myargs.
trace_size);
229 std::cout <<
"ctrl_pkt_out[0]:" << std::hex << ctrl_pkt_out[0]
231 std::cout <<
"ctrl_pkt_out[1]:" << std::hex << ctrl_pkt_out[1]
234 int col = (ctrl_pkt_out[0] >> 21) & 0x7F;
235 int row = (ctrl_pkt_out[0] >> 16) & 0x1F;
236 if ((ctrl_pkt_out[1] >> 8) == 3)
237 std::cout <<
"WARNING: Trace overflow detected in tile(" << row <<
","
238 << col <<
". Trace results may be invalid." << std::endl;
243 std::chrono::duration_cast<std::chrono::microseconds>(stop - start)
246 npu_time_total += npu_time;
247 npu_time_min = (npu_time < npu_time_min) ? npu_time : npu_time_min;
248 npu_time_max = (npu_time > npu_time_max) ? npu_time : npu_time_max;
258 std::cout << std::endl
259 <<
"Avg NPU time: " << npu_time_total / myargs.
n_iterations <<
"us."
262 std::cout <<
"Avg NPU gflops: "
266 std::cout << std::endl
267 <<
"Min NPU time: " << npu_time_min <<
"us." << std::endl;
269 std::cout <<
"Max NPU gflops: " << macs / (1000 * npu_time_min)
272 std::cout << std::endl
273 <<
"Max NPU time: " << npu_time_max <<
"us." << std::endl;
275 std::cout <<
"Min NPU gflops: " << macs / (1000 * npu_time_max)
279 std::cout <<
"\nPASS!\n\n";
282 std::cout <<
"\nError count: " << errors <<
"\n\n";
283 std::cout <<
"\nFailed.\n\n";
297 bool enable_ctrl_pkts =
false) {
303 std::cout <<
"Sequence instr count: " << instr_v.size() <<
"\n";
313 auto bo_instr = xrt::bo(device, instr_v.size() *
sizeof(
int),
314 XCL_BO_FLAGS_CACHEABLE,
kernel.group_id(1));
315 auto bo_in1 = xrt::bo(device, IN1_VOLUME *
sizeof(T1), XRT_BO_FLAGS_HOST_ONLY,
317 auto bo_out = xrt::bo(device, OUT_VOLUME *
sizeof(T3), XRT_BO_FLAGS_HOST_ONLY,
321 auto bo_tmp1 = xrt::bo(device, 1, XRT_BO_FLAGS_HOST_ONLY,
kernel.group_id(5));
326 xrt::bo(device, 8, XRT_BO_FLAGS_HOST_ONLY,
kernel.group_id(6));
331 auto bo_trace = xrt::bo(device, tmp_trace_size, XRT_BO_FLAGS_HOST_ONLY,
335 std::cout <<
"Writing data into buffer objects.\n";
338 void *bufInstr = bo_instr.map<
void *>();
339 memcpy(bufInstr, instr_v.data(), instr_v.size() *
sizeof(
int));
342 T1 *bufIn1 = bo_in1.map<T1 *>();
343 T3 *bufOut = bo_out.map<T3 *>();
345 char *bufTrace = bo_trace.map<
char *>();
346 uint32_t *bufCtrlPkts = bo_ctrlpkts.map<uint32_t *>();
348 init_bufIn1(bufIn1, IN1_VOLUME);
355 if (myargs.
trace_size > 0 && enable_ctrl_pkts) {
359 std::cout <<
"bufCtrlPkts[0]:" << std::hex << bufCtrlPkts[0] << std::endl;
360 std::cout <<
"bufCtrlPkts[1]:" << std::hex << bufCtrlPkts[1] << std::endl;
365 bo_instr.sync(XCL_BO_SYNC_BO_TO_DEVICE);
366 bo_in1.sync(XCL_BO_SYNC_BO_TO_DEVICE);
367 bo_out.sync(XCL_BO_SYNC_BO_TO_DEVICE);
369 bo_trace.sync(XCL_BO_SYNC_BO_TO_DEVICE);
370 if (enable_ctrl_pkts)
371 bo_ctrlpkts.sync(XCL_BO_SYNC_BO_TO_DEVICE);
378 float npu_time_total = 0;
379 float npu_time_min = 9999999;
380 float npu_time_max = 0;
387 for (
unsigned iter = 0; iter < num_iter; iter++) {
390 std::cout <<
"Running Kernel.\n";
394 std::cout <<
"Running Kernel.\n";
395 auto start = std::chrono::high_resolution_clock::now();
396 unsigned int opcode = 3;
399 auto run =
kernel(opcode, bo_instr, instr_v.size(), bo_in1, bo_out, bo_tmp1,
400 bo_ctrlpkts, bo_trace);
402 auto stop = std::chrono::high_resolution_clock::now();
403 bo_out.sync(XCL_BO_SYNC_BO_FROM_DEVICE);
405 bo_trace.sync(XCL_BO_SYNC_BO_FROM_DEVICE);
414 std::cout <<
"Verifying results ..." << std::endl;
416 auto vstart = std::chrono::system_clock::now();
418 errors += verify_results(bufIn1, bufOut, IN1_VOLUME, myargs.
verbosity);
420 auto vstop = std::chrono::system_clock::now();
422 std::chrono::duration_cast<std::chrono::seconds>(vstop - vstart)
425 std::cout <<
"Verify time: " << vtime <<
"secs." << std::endl;
428 std::cout <<
"WARNING: results not verified." << std::endl;
435 std::cout <<
"Writing trace of size " << myargs.
trace_size << std::endl;
442 if (enable_ctrl_pkts) {
443 uint32_t *ctrl_pkt_out =
444 (uint32_t *)(((
char *)bufTrace) + myargs.
trace_size);
446 std::cout <<
"ctrl_pkt_out[0]:" << std::hex << ctrl_pkt_out[0]
448 std::cout <<
"ctrl_pkt_out[1]:" << std::hex << ctrl_pkt_out[1]
451 int col = (ctrl_pkt_out[0] >> 21) & 0x7F;
452 int row = (ctrl_pkt_out[0] >> 16) & 0x1F;
453 if ((ctrl_pkt_out[1] >> 8) == 3)
454 std::cout <<
"WARNING: Trace overflow detected in tile(" << row <<
","
455 << col <<
". Trace results may be invalid." << std::endl;
460 std::chrono::duration_cast<std::chrono::microseconds>(stop - start)
463 npu_time_total += npu_time;
464 npu_time_min = (npu_time < npu_time_min) ? npu_time : npu_time_min;
465 npu_time_max = (npu_time > npu_time_max) ? npu_time : npu_time_max;
475 std::cout << std::endl
476 <<
"Avg NPU time: " << npu_time_total / myargs.
n_iterations <<
"us."
479 std::cout <<
"Avg NPU gflops: "
483 std::cout << std::endl
484 <<
"Min NPU time: " << npu_time_min <<
"us." << std::endl;
486 std::cout <<
"Max NPU gflops: " << macs / (1000 * npu_time_min)
489 std::cout << std::endl
490 <<
"Max NPU time: " << npu_time_max <<
"us." << std::endl;
492 std::cout <<
"Min NPU gflops: " << macs / (1000 * npu_time_max)
496 std::cout <<
"\nPASS!\n\n";
499 std::cout <<
"\nError count: " << errors <<
"\n\n";
500 std::cout <<
"\nFailed.\n\n";