MLIR-AIE
AIEDMATasksToNPU.cpp
Go to the documentation of this file.
1//===- AIEDMATasksToNPU.cpp -------------------------------------*- C++ -*-===//
2//
3// This file is licensed under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7// (c) Copyright 2024 Advanced Micro Devices, Inc.
8//
9//===----------------------------------------------------------------------===//
10
11#include <algorithm>
12#include <iterator>
13
18
19#include "mlir/Dialect/MemRef/IR/MemRef.h"
20#include "mlir/Pass/Pass.h"
21#include "mlir/Transforms/DialectConversion.h"
22#include "llvm/ADT/TypeSwitch.h"
23
24namespace xilinx::AIEX {
25#define GEN_PASS_DEF_AIEDMATASKSTONPU
26#include "aie/Dialect/AIEX/Transforms/AIEXPasses.h.inc"
27} // namespace xilinx::AIEX
28
29using namespace mlir;
30using namespace xilinx;
31using namespace xilinx::AIEX;
32
34 using OpConversionPattern::OpConversionPattern;
35
36 LogicalResult
37 matchAndRewrite(DMAStartTaskOp op, OpAdaptor adaptor,
38 ConversionPatternRewriter &rewriter) const override {
39 DMAConfigureTaskOp task_op = op.getTaskOp();
40 if (!task_op) {
41 // Cannot rewrite this; probably points to a DMAStartTaskForOp,
42 // which we will lower once it has been rewritten into a DMAStartTaskOp.
43 return failure();
44 }
45 AIE::TileOp tile = task_op.getTileOp();
46 std::optional<uint32_t> first_bd_id = task_op.getFirstBdId();
47 if (!first_bd_id) {
48 auto err = op.emitOpError(
49 "First buffer descriptor in chain has not been assigned an ID");
50 err.attachNote() << "Run the `aie-assign-runtime-buffer-descriptor-ids` "
51 "pass first or manually assign an ID.";
52 return failure();
53 }
54 rewriter.replaceOpWithNewOp<NpuPushQueueOp>(
55 op, tile.getCol(), tile.getRow(), task_op.getDirection(),
56 task_op.getChannel(), task_op.getIssueToken(), task_op.getRepeatCount(),
57 *first_bd_id);
58 return success();
59 }
60};
61
63 using OpConversionPattern::OpConversionPattern;
64
65 LogicalResult
66 matchAndRewrite(DMAAwaitTaskOp op, OpAdaptor adaptor,
67 ConversionPatternRewriter &rewriter) const override {
68 DMAConfigureTaskOp task_op = op.getTaskOp();
69 if (!task_op) {
70 return failure();
71 }
72 if (!task_op.getIssueToken()) {
73 auto err = op.emitOpError(
74 "Cannot wait on a BD that is not configured to issue a token.");
75 err.attachNote(task_op.getLoc())
76 << "Consider adding attribute `issue_token=true` here.";
77 return err;
78 }
79 AIE::TileOp tile = task_op.getTileOp();
80 rewriter.replaceOpWithNewOp<NpuSyncOp>(op, tile.getCol(), tile.getRow(),
81 (uint32_t)task_op.getDirection(),
82 task_op.getChannel(), 1, 1);
83 return success();
84 }
85};
86
88 : xilinx::AIEX::impl::AIEDMATasksToNPUBase<AIEDMATasksToNPUPass> {
89
90 bool shouldSkipBlock(Block &block) {
91 // Allow blocks in the input IR that contain nothing but a next_bd operation
92 // as the entry block. We will skip these blocks and not lower them to
93 // anything.
94 auto it = block.without_terminator();
95 return block.isEntryBlock() && it.begin() == it.end();
96 }
97
98 LogicalResult verifyBdInBlock(Block &block) {
99 auto bd_ops = block.getOps<AIE::DMABDOp>();
100 // Exactly one BD op per block
101 int n_bd_ops = std::distance(bd_ops.begin(), bd_ops.end());
102 if (n_bd_ops < 1) {
103 auto error = block.getTerminator()->emitError(
104 "Block ending in this terminator does not contain a required "
105 "aie.dma_bd operation.");
106 error.attachNote(block.getParentOp()->getLoc())
107 << "Error encountered while lowering this BD configuration.";
108 return failure();
109 } else if (n_bd_ops > 1) {
110 auto error = block.getTerminator()->emitOpError(
111 "This block contains multiple aie.dma_bd operations. Exactly one is "
112 "required.");
113 auto it = bd_ops.begin();
114 ++it;
115 for (; it != bd_ops.end(); ++it) {
116 error.attachNote((*it)->getLoc()) << "Extra aie.dma_bd operation here.";
117 }
118 return failure();
119 }
120 AIE::DMABDOp bd_op = *bd_ops.begin();
121 if (!bd_op.getBdId().has_value()) {
122 auto error = bd_op.emitOpError(
123 "Cannot lower buffer descriptor without assigned ID.");
124 error.attachNote()
125 << "Run the `--aie-assign-runtime-sequence-bd-ids` pass first or "
126 "manually assign an ID to this buffer descriptor.";
127 error.attachNote(block.getParentOp()->getLoc())
128 << "Error encountered while lowering this BD configuration.";
129 return failure();
130 }
131 return success();
132 }
133
134 LogicalResult verifyOptionalLocksInBlock(Block &block) {
135 auto lock_ops = block.getOps<AIE::UseLockOp>();
136 int n_lock_ops = std::distance(lock_ops.begin(), lock_ops.end());
137 // Allow exactly 0 or 2 lock ops (acquire and release)
138 if (n_lock_ops != 0 && n_lock_ops != 2) {
139 AIE::UseLockOp lock_op = *lock_ops.begin();
140 lock_op.emitOpError(
141 "BD blocks must have either 0 or 2 lock operations (acquire and "
142 "release). Found ")
143 << n_lock_ops << " lock operations.";
144 return failure();
145 }
146 return success();
147 }
148
149 LogicalResult verifyNoUnsupportedOpsInBlock(Block &block) {
150 WalkResult unsupported_ops = block.walk([&](Operation *inner_op) {
151 return llvm::TypeSwitch<Operation *, WalkResult>(inner_op)
152 .Case<AIE::DMABDOp>(
153 [&](AIE::DMABDOp bd_op) { return WalkResult::advance(); })
154 .Case<AIE::UseLockOp>(
155 [&](AIE::UseLockOp lock_op) { return WalkResult::advance(); })
156 .Case<AIE::NextBDOp>(
157 [&](AIE::NextBDOp lock_op) { return WalkResult::advance(); })
158 .Case<AIE::EndOp>(
159 [&](AIE::EndOp lock_op) { return WalkResult::advance(); })
160 .Default([&](Operation *inner_op) {
161 auto error = block.getParentOp()->emitOpError(
162 "Unsupported operation within BD block.");
163 error.attachNote(inner_op->getLoc())
164 << "No lowering to NPU instructions available for this "
165 "operation.";
166 return WalkResult::interrupt();
167 });
168 });
169 if (unsupported_ops.wasInterrupted()) {
170 return failure();
171 }
172 return success();
173 }
174
175 AIE::DMABDOp getBdForBlock(Block &block) {
176 auto bd_ops = block.getOps<AIE::DMABDOp>();
177 AIE::DMABDOp bd_op = *bd_ops.begin(); // Dereference first (and only, after
178 // previous checks) bd op iterator
179 return bd_op;
180 }
181
182 // Returns pair of (acquire_lock_op, release_lock_op) if present
183 std::optional<std::pair<AIE::UseLockOp, AIE::UseLockOp>>
185 auto lock_ops = block.getOps<AIE::UseLockOp>();
186 int n_lock_ops = std::distance(lock_ops.begin(), lock_ops.end());
187 if (n_lock_ops != 2) {
188 return std::nullopt;
189 }
190
191 AIE::UseLockOp acquire_op = nullptr;
192 AIE::UseLockOp release_op = nullptr;
193
194 for (auto lock_op : lock_ops) {
195 if (lock_op.acquire() || lock_op.acquireGE()) {
196 acquire_op = lock_op;
197 } else if (lock_op.release()) {
198 release_op = lock_op;
199 }
200 }
201
202 if (acquire_op && release_op) {
203 return std::make_pair(acquire_op, release_op);
204 }
205 return std::nullopt;
206 }
207
  /// Emit the NPU instruction that patches the buffer base address into the
  /// already-written buffer descriptor `bd_op` on `tile`.
  ///
  /// Two cases are handled:
  ///  * The buffer traces back to a runtime-sequence block argument (DDR
  ///    memory): emit an NpuAddressPatchOp so the host patches the address
  ///    at runtime. Only legal on shim NOC tiles.
  ///  * The buffer is a statically allocated aie.buffer: emit a (masked)
  ///    write of the known address into the BD register.
  /// Fails with a diagnostic for anything else.
  LogicalResult setAddressForSingleBD(OpBuilder &builder, AIE::DMABDOp &bd_op,
                                      AIE::TileOp &tile) {
    uint32_t bd_id = bd_op.getBdId().value();
    const AIE::AIETargetModel &target_model = AIE::getTargetModel(bd_op);
    auto buf = bd_op.getBuffer();
    auto col = tile.getCol();
    auto row = tile.getRow();
    // Address of the base-address field within this BD's register block.
    uint64_t register_addr = target_model.getDmaBdAddress(col, row, bd_id) +
                             target_model.getDmaBdAddressOffset(col, row);

    // A buffer descriptor can refer to a statically allocated aie.buffer, or to
    // a DDR buffer which will be passed as a runtime argument (block
    // argument). Try to find the root block argument, either directly or
    // through subviews/casts.
    mlir::BlockArgument buf_arg = nullptr;
    int64_t offset = 0;

    if (auto directArg = llvm::dyn_cast<mlir::BlockArgument>(buf)) {
      buf_arg = directArg;
      offset = 0;
    } else if (auto traceResult = traceSubviewToBlockArgument(buf)) {
      // Subview/cast chain rooted at a block argument; accumulate its
      // constant byte offset.
      buf_arg = traceResult->rootArg;
      offset = traceResult->offsetInBytes;
    }

    if (buf_arg) {
      // Runtime (DDR) buffer: address is only known at runtime, so the host
      // must patch it in. Only shim NOC tiles can access DDR.
      if (!target_model.isShimNOCTile(tile.getCol(), tile.getRow())) {
        return bd_op->emitOpError("DDR memory (runtime input arguments) can "
                                  "only be referred to on shim tiles.");
      }

      unsigned arg_idx = buf_arg.getArgNumber();
      // Total offset = subview offset + the BD's own offset.
      offset += bd_op.getOffsetInBytes();
      NpuAddressPatchOp::create(builder, bd_op.getLoc(),
                                /*addr*/ register_addr,
                                /*arg_idx*/ arg_idx,
                                /*arg_plus*/ offset);
    } else if (AIE::BufferOp buffer =
                   llvm::dyn_cast<AIE::BufferOp>(buf.getDefiningOp())) {
      // Statically allocated aie.buffer: its address must already be fixed.
      uint64_t buf_addr;
      if (!buffer.getAddress().has_value()) {
        return bd_op->emitOpError(
            "Cannot lower buffer without associated address. Run pass "
            "--aie-assign-buffer-addresses first or manually assign an "
            "address.");
      }
      buf_addr = *buffer.getAddress();
      buf_addr += bd_op.getOffsetInBytes();
      if (target_model.isCoreTile(col, row)) {
        // NOTE(review): core-tile BDs appear to encode a word address in a
        // shifted bitfield; masked write touches only that field — the
        // shift/mask values come from the hardware register layout, confirm
        // against the architecture manual.
        NpuMaskWrite32Op::create(builder, bd_op.getLoc(), register_addr,
                                 (buf_addr / 4) << 14, 0x0fffc000, nullptr,
                                 nullptr, nullptr);
      } else if (target_model.isMemTile(col, row)) {
        // On AIE2p (NPU2), memtile DMAs use an offset-based address
        // space where the base depends on the relative position of the
        // buffer's tile (west=0, internal=getMemTileSize, east=2x).
        // On AIE2 (NPU1), memtile DMAs address local memory directly
        // starting at 0. Only add the offset for AIE2p.
        if (target_model.getTargetArch() == AIE::AIEArch::AIE2p) {
          auto addrOffset = target_model.getMemLocalBaseAddress(
              col, row, buffer.getTileOp().getCol(),
              buffer.getTileOp().getRow());
          if (addrOffset)
            buf_addr += addrOffset.value();
        }
        // Memtile BDs take a word address in the low 19 bits.
        NpuMaskWrite32Op::create(builder, bd_op.getLoc(), register_addr,
                                 buf_addr / 4, 0x0007FFFF, nullptr, nullptr,
                                 nullptr);
      } else {
        // Shim (and other) tiles: write the byte address directly.
        NpuWrite32Op::create(builder, bd_op.getLoc(), register_addr, buf_addr,
                             nullptr, nullptr, nullptr);
      }
    } else {
      return bd_op->emitOpError(
          "Buffer argument must be a constant aie.buffer, a runtime sequence "
          "input argument, or a (chain of) subview(s) or cast(s) of a block "
          "argument with constant offsets and strides equal to one.");
    }
    return success();
  }
288
  /// Lower the single aie.dma_bd held by `block` into an NpuWriteBdOp that
  /// encodes the full hardware buffer descriptor (lengths, strides/wraps,
  /// padding, packet header, chaining, locks), followed by the address patch
  /// emitted by setAddressForSingleBD.
  ///
  /// `channelDir` and `packet` come from the enclosing DMAConfigureTaskOp;
  /// a packet attribute on the BD itself takes precedence over `packet`.
  LogicalResult
  rewriteSingleBD(OpBuilder &builder, Block &block, AIE::TileOp &tile,
                  AIE::DMAChannelDir channelDir,
                  std::optional<xilinx::AIE::PacketInfoAttr> packet) {
    AIE::DMABDOp bd_op = getBdForBlock(block);
    const auto &target_model = AIE::getTargetModel(bd_op);
    auto buffer_type = llvm::cast<BaseMemRefType>(bd_op.getBuffer().getType());
    // Hardware addressing granularity in bits; lengths/offsets below are
    // converted from bytes into units of this granularity.
    uint32_t addr_granularity = target_model.getAddressGenGranularity();

    uint32_t bd_id = bd_op.getBdId().value();
    int64_t offset = bd_op.getOffsetInBytes();
    uint64_t len = bd_op.getLenInBytes();
    uint64_t len_addr_granularity = len * 8 / addr_granularity;

    // Offset must fall on a hardware-addressable boundary.
    if (offset * 8 % addr_granularity != 0) {
      return bd_op->emitOpError("Offset must be aligned to ")
             << (addr_granularity / 8) << " byte boundary.";
    }

    // Transfers smaller than one addressable unit cannot be expressed.
    if (len < addr_granularity / 8) {
      return bd_op->emitOpError("Transfer size of ")
             << len << " bytes falls below minimum hardware transfer unit of "
             << (addr_granularity / 8) << " bytes.";
    }
    // Process strides/wraps
    std::optional<llvm::ArrayRef<AIE::BDDimLayoutAttr>> dims =
        bd_op.getDimensions();
    llvm::SmallVector<int64_t, 4> sizes = llvm::SmallVector<int64_t, 4>(4, 0);
    llvm::SmallVector<int64_t, 4> strides = llvm::SmallVector<int64_t, 4>(4, 0);

    // Padding
    std::optional<llvm::ArrayRef<AIE::BDPadLayoutAttr>> padDims =
        bd_op.getPadDimensions();
    llvm::SmallVector<int64_t, 4> padBefore =
        llvm::SmallVector<int64_t, 4>(4, 0);
    llvm::SmallVector<int64_t, 4> padAfter =
        llvm::SmallVector<int64_t, 4>(4, 0);
    // NOTE(review): the fills below are redundant with the (4, 0)
    // constructors above; kept as-is.
    std::fill(padBefore.begin(), padBefore.end(), 0);
    std::fill(padAfter.begin(), padAfter.end(), 0);

    // Defaults for all NpuWriteBdOp fields; overwritten below as needed.
    auto enable_packet = 0;
    auto out_of_order_id = 0;
    auto packet_id = 0;
    auto packet_type = 0;
    auto d0size = 0;
    auto d0stride = 0;
    auto d1size = 0;
    auto d1stride = 0;
    auto d2size = 0;
    auto d2stride = 0;
    auto iteration_size = 0;
    auto iteration_stride = 0;

    if (dims && dims->size() > 0) {
      llvm::SmallVector<int64_t, 4> input_sizes =
          llvm::SmallVector<int64_t, 4>(4, 1);
      llvm::SmallVector<int64_t, 4> input_strides =
          llvm::SmallVector<int64_t, 4>(4, 0);
      if (dims->size() > 4) {
        return bd_op->emitOpError("At most four data layout transformation "
                                  "dimensions may be provided.");
      }

      for (size_t i = 0; i < dims->size(); i++) {
        // Pass down dimensions in reverse order; in the MLIR, this allows
        // us to specify step sizes/wraps in the same order as we would
        // access a multi-dim C array, with the highest dimension first.
        int j = dims->size() - i - 1;
        input_sizes[i] = (*dims)[j].getSize();
        input_strides[i] = (*dims)[j].getStride();
      }

      // d3 (repeat) is excluded; a repeated linear transfer is still linear.
      bool isLinearTransfer =
          AIEX::isLinearTransfer(input_sizes, input_strides);

      // d2 size is only supported on memtiles; elsewhere it stays 0.
      if (dims->size() > 2) {
        d2size = (target_model.isMemTile(tile.getCol(), tile.getRow()))
                     ? (*dims)[2].getSize()
                     : 0;
      }
      // Padding is a memtile-only, MM2S-only feature; validate and unpack
      // the per-dimension before/after zero counts (again reversed).
      if (padDims.has_value()) {
        if (!target_model.isMemTile(tile.getCol(), tile.getRow()))
          return bd_op->emitOpError()
                 << "Padding is only supported by memtile dma bds.";
        if (padDims->size() > dims->size())
          return bd_op->emitOpError()
                 << "Mismatch number of dimensions between padding(s)"
                 << " and wrap(s) and stride(s).";
        if (channelDir == AIE::DMAChannelDir::MM2S) {
          for (size_t i = 0; i < padDims->size(); i++) {
            int j = padDims->size() - i - 1;
            padBefore[i] = (*padDims)[j].getConstPadBefore();
            padAfter[i] = (*padDims)[j].getConstPadAfter();
          }
          for (size_t i = padDims->size(); i < dims->size(); i++) {
            padBefore[i] = 0;
            padAfter[i] = 0;
          }
        } else
          return bd_op->emitOpError()
                 << "supports padding only for MM2S direction on MemTiles.";
      }
      // Convert user-facing sizes/strides into hardware encodings.
      getHardwareStridesWraps(target_model, bd_op, buffer_type, input_sizes,
                              input_strides, sizes, strides);

      if (failed(verifyStridesWraps(bd_op, buffer_type, tile.getCol(),
                                    tile.getRow(), input_sizes, input_strides,
                                    sizes, strides, isLinearTransfer))) {
        return failure();
      }

      iteration_size = sizes[3];
      iteration_stride = strides[3];

      if (!isLinearTransfer) {
        // d0_size, d0_stride
        d0size = sizes[0];
        d0stride = strides[0];

        // d1_size, d1_stride
        d1size = sizes[1];
        d1stride = strides[1];

        // d2_stride
        d2stride = strides[2];
        // d2_size set elsewhere
      }
      if (input_sizes[3] > 1 && input_strides[3] == 0) {
        // We allow users to encode the repeat_count as a dimension 3 stride
        // of 0. This must lower to a iteration wrap of 0, so no stride is
        // ever added. We then repeat the BD using the repeat_count in
        // NpuPushQueueOp.
        iteration_size = 0;
        iteration_stride = 0;
      }

      // Ensure the total transfer length and the length expressed in the lowest
      // three dimensions of strides/wraps agree. (Fourth dimension is
      // iteration/repeat count and repeats the whole BD, so should not be
      // incorporated in length of a single BD invocation.)
      uint64_t len_dims_addr_granularity = 1;
      for (size_t i = 0; i < 3; i++) {
        len_dims_addr_granularity *= sizes[i];
      }
      if (len_dims_addr_granularity != len_addr_granularity) {
        auto err =
            bd_op->emitOpError(
                "Buffer descriptor length does not match length of transfer "
                "expressed by lowest three dimensions of data layout "
                "transformation strides/wraps. ")
            << "BD length is " << (len_addr_granularity * addr_granularity / 8)
            << " bytes. "
            << "Lowest three dimensions of data layout transformation would "
               "result in transfer of "
            << (len_dims_addr_granularity * addr_granularity / 8) << " bytes. ";
        err.attachNote() << "Do not include the highest dimension size in "
                            "transfer length, as this is the BD repeat count.";
        return failure();
      }
    } else {
      // No data layout transformation: padding cannot be expressed.
      if (padDims && target_model.isMemTile(tile.getCol(), tile.getRow()) &&
          channelDir == AIE::DMAChannelDir::MM2S) {
        return bd_op->emitOpError()
               << "Padding requires n-d data layouts expressed as "
               << "wrap(s) and stride(s).";
      } else if (padDims) {
        return bd_op->emitOpError() << "Padding is supported only on MemTiles.";
      }
    }
    // find next BD ID, if any
    uint32_t use_next_bd = 0;
    uint32_t next_bd_id = 0;
    if (bd_op.getNextBdId().has_value()) {
      next_bd_id = bd_op.getNextBdId().value();
      use_next_bd = 1;
    }

    // enable_packet
    // BD-local packet attribute wins over the task-level one.
    auto info = bd_op.getPacket().value_or(packet.value_or(nullptr));
    if (info) {
      enable_packet = 1;
      packet_type = info.getPktType();
      packet_id = info.getPktId();
    }

    // Extract lock information if present
    int32_t lock_rel_val = 0;
    int32_t lock_rel_id = 0;
    int32_t lock_acq_enable = 0;
    int32_t lock_acq_val = 0;
    int32_t lock_acq_id = 0;

    auto lock_ops = getOptionalLockOpsForBlock(block);
    if (lock_ops) {
      auto [acquire_op, release_op] = *lock_ops;

      // Get lock IDs from the lock operations
      AIE::LockOp acq_lock = acquire_op.getLockOp();
      AIE::LockOp rel_lock = release_op.getLockOp();

      if (acq_lock.getLockID().has_value()) {
        lock_acq_id = acq_lock.getLockID().value();
        lock_acq_val = acquire_op.getLockValue();
        // For AcquireGreaterEqual, negate the value to signal the hardware
        // to use >= comparison instead of == comparison.
        if (acquire_op.acquireGE())
          lock_acq_val = -lock_acq_val;
        lock_acq_enable = 1;
      }

      if (rel_lock.getLockID().has_value()) {
        lock_rel_id = rel_lock.getLockID().value();
        lock_rel_val = release_op.getLockValue();
      }

      // For memtile, add lock offset using getLockLocalBaseIndex.
      // This matches AIERT.cpp implementation.
      if (target_model.isMemTile(tile.getCol(), tile.getRow())) {
        auto lockOffset = target_model.getLockLocalBaseIndex(
            tile.getCol(), tile.getRow(), acq_lock.colIndex(),
            acq_lock.rowIndex());
        if (lockOffset && acq_lock.getLockID().has_value())
          lock_acq_id += lockOffset.value();
        if (lockOffset && rel_lock.getLockID().has_value())
          lock_rel_id += lockOffset.value();
      }
    }

    // Emit the fully-encoded buffer descriptor write.
    NpuWriteBdOp::create(
        builder, bd_op.getLoc(), tile.getCol(), bd_id, len_addr_granularity,
        offset,
        /*enable_packet=*/enable_packet,
        /*out_of_order_id=*/out_of_order_id,
        /*packet_id=*/packet_id,
        /*packet_type=*/packet_type,
        /*d0_size=*/d0size, /*d0_stride=*/d0stride,
        /*d1_size=*/d1size, /*d1_stride=*/d1stride,
        /*d2_size=*/d2size, /*d2_stride=*/d2stride,
        /*iteration_current=*/0, /*iteration_size=*/iteration_size,
        /*iteration_stride=*/iteration_stride,
        /*next_bd=*/next_bd_id,
        /*row=*/tile.getRow(),
        /*use_next_bd=*/use_next_bd,
        /*valid_bd=*/1,
        /*lock_rel_val=*/lock_rel_val, /*lock_rel_id=*/lock_rel_id,
        /*lock_acq_enable=*/lock_acq_enable,
        /*lock_acq_val=*/lock_acq_val, /*lock_acq_id=*/lock_acq_id,
        /*d0_zero_before=*/padBefore[0],
        /*d1_zero_before=*/padBefore[1], /*d2_zero_before=*/padBefore[2],
        /*d0_zero_after=*/padAfter[0], /*d1_zero_after=*/padAfter[1],
        /*d2_zero_after=*/padAfter[2],
        /*burst_length=*/bd_op.getBurstLength());
    // Finally patch/write the buffer base address for this BD.
    return setAddressForSingleBD(builder, bd_op, tile);
  }
545
  /// Fold aie.next_bd terminators into the next_bd_id attribute of each
  /// block's dma_bd op, replacing the terminator with aie.end. Fails if a
  /// BD already has a (potentially conflicting) next_bd_id attribute.
  /// Precondition: verifyBdInBlock has run, so every referenced BD has an ID.
  LogicalResult hoistNextBdOpsIntoAttrs(DMAConfigureTaskOp op) {
    Region &body = op.getBody();
    for (auto it = body.begin(); it != body.end(); ++it) {
      Block &block = *it;
      if (shouldSkipBlock(block)) {
        continue;
      }
      AIE::DMABDOp bd_op = getBdForBlock(block);
      if (AIE::NextBDOp next_bd_op =
              llvm::dyn_cast<AIE::NextBDOp>(block.getTerminator())) {
        // Both the attribute and the terminator would specify a successor;
        // refuse ambiguous input rather than silently picking one.
        if (bd_op.getNextBdId().has_value()) {
          auto error =
              bd_op.emitOpError("Cannot specify both next_bd_id attribute and "
                                "aie.next_bd operation.");
          error.attachNote(next_bd_op.getLoc())
              << "Potentially conflicting next buffer descriptor ID specified "
                 "here.";
          return failure();
        }
        Block &next_bd_block = *next_bd_op.getDest();
        AIE::DMABDOp next_dma_bd_op = getBdForBlock(next_bd_block);
        assert(next_dma_bd_op.getBdId()
                   .has_value()); // Next BD should have assigned ID, and this
                                  // should have been checked by earlier
                                  // verifyBdInBlock() call
        bd_op.setNextBdId(next_dma_bd_op.getBdId().value());
        // Replace the branch-like terminator with a plain aie.end.
        OpBuilder builder(next_bd_op);
        AIE::EndOp::create(builder, next_bd_op.getLoc());
        next_bd_op.erase();
      }
    }
    return success();
  }
579
580 LogicalResult rewriteSingleDMAConfigureTaskOp(DMAConfigureTaskOp op) {
581 OpBuilder builder(op);
582 AIE::TileOp tile = op.getTileOp();
583
584 if (!op.use_empty()) {
585 auto err = op.emitOpError("Cannot lower while op still has uses.");
586 mlir::Operation::use_range uses = op.getOperation()->getUses();
587 for (auto it = uses.begin(); it != uses.end(); ++it) {
588 err.attachNote(it->getOwner()->getLoc()) << "Used here.";
589 }
590 return failure();
591 }
592
593 Region &body = op.getBody();
594
595 // Verify each BD block first; subsequent functions rely on them being
596 // well-formed
597 for (auto it = body.begin(); it != body.end(); ++it) {
598 if (shouldSkipBlock(*it)) {
599 continue;
600 }
601 if (failed(verifyNoUnsupportedOpsInBlock(*it))) {
602 return failure();
603 }
604 if (failed(verifyBdInBlock(*it))) {
605 return failure();
606 }
607 if (failed(verifyOptionalLocksInBlock(*it))) {
608 return failure();
609 }
610 }
611
612 // Hoist next_bd operations into next_bd_id attribute of the dma_bd
613 if (failed(hoistNextBdOpsIntoAttrs(op))) {
614 return failure();
615 }
616
617 auto channelDir = op.getDirection();
618 auto packet = op.getPacket();
619
620 // Lower all BDs
621 for (auto it = body.begin(); it != body.end(); ++it) {
622 Block &block = *it;
623 if (shouldSkipBlock(block)) {
624 continue;
625 }
626 if (failed(rewriteSingleBD(builder, block, tile, channelDir, packet))) {
627 return failure();
628 }
629 }
630
631 op.erase();
632
633 return success();
634 }
635
636 LogicalResult rewriteDMAConfigureTaskOp(AIE::DeviceOp device) {
637 WalkResult result = device.walk([&](DMAConfigureTaskOp op) {
638 if (failed(rewriteSingleDMAConfigureTaskOp(op))) {
639 return WalkResult::interrupt();
640 }
641 return WalkResult::advance();
642 });
643 if (result.wasInterrupted()) {
644 return failure();
645 }
646 return success();
647 }
648
649 void runOnOperation() override {
650 AIE::DeviceOp device = getOperation();
651
652 // Convert DMAStartBD and DMAAwaitBD ops
653 ConversionTarget target(getContext());
654 target.addLegalDialect<AIEXDialect>();
655 target.addIllegalOp<DMAStartTaskOp>();
656 target.addIllegalOp<DMAAwaitTaskOp>();
657 RewritePatternSet patterns(&getContext());
658 patterns.insert<DMAStartTaskOpPattern>(&getContext());
659 patterns.insert<DMAAwaitTaskOpPattern>(&getContext());
660 if (failed(applyPartialConversion(device, target, std::move(patterns)))) {
661 signalPassFailure();
662 }
663
664 // Lower the configuration for the BDs
665 if (failed(rewriteDMAConfigureTaskOp(device))) {
666 signalPassFailure();
667 }
668 }
669};
670
671std::unique_ptr<OperationPass<AIE::DeviceOp>>
673 return std::make_unique<AIEDMATasksToNPUPass>();
674}
std::optional< uint32_t > getMemLocalBaseAddress(int localCol, int localRow, int memCol, int memRow) const
Return the memory base address (or offset) in the local tile when accessing a neighbor's memory or an...
bool isCoreTile(int col, int row) const
Return true if the given tile is a Core tile.
virtual AIEArch getTargetArch() const =0
Return the target architecture.
bool isMemTile(int col, int row) const
Return true if the given tile is a Mem tile.
virtual uint64_t getDmaBdAddress(int col, int row, uint32_t bd_id, int channel=-1, AIE::DMAChannelDir direction=AIE::DMAChannelDir::MM2S) const =0
Return the array address of the dma buffer descriptor for the given col, row, buffer descriptor id,...
virtual uint32_t getDmaBdAddressOffset(int col, int row) const =0
Return the offset of the base address field within the shim dma buffer descriptor.
bool isShimNOCTile(int col, int row) const
Return true if the given tile is a ShimNOC tile.
std::optional< SubviewTraceResult > traceSubviewToBlockArgument(Value value)
Definition AIEUtils.cpp:19
std::unique_ptr< mlir::OperationPass< AIE::DeviceOp > > createAIEDMATasksToNPUPass()
void getHardwareStridesWraps(const AIE::AIETargetModel &targetModel, mlir::Operation *op, mlir::BaseMemRefType referencedBufType, llvm::SmallVector< int64_t, 4 > inputSizes, llvm::SmallVector< int64_t, 4 > inputStrides, llvm::SmallVector< int64_t, 4 > &sizes, llvm::SmallVector< int64_t, 4 > &strides)
mlir::LogicalResult verifyStridesWraps(mlir::Operation *forOp, mlir::BaseMemRefType referencedBufType, int tileCol, int tileRow, llvm::SmallVector< int64_t, 4 > inputSizes, llvm::SmallVector< int64_t, 4 > inputStrides, llvm::SmallVector< int64_t, 4 > hardwareSizes, llvm::SmallVector< int64_t, 4 > hardwareStrides, bool skipTransformationChecks=false)
bool isLinearTransfer(llvm::ArrayRef< int64_t > sizes, llvm::ArrayRef< int64_t > strides)
const AIETargetModel & getTargetModel(mlir::Operation *op)
void runOnOperation() override
bool shouldSkipBlock(Block &block)
LogicalResult rewriteDMAConfigureTaskOp(AIE::DeviceOp device)
LogicalResult verifyNoUnsupportedOpsInBlock(Block &block)
std::optional< std::pair< AIE::UseLockOp, AIE::UseLockOp > > getOptionalLockOpsForBlock(Block &block)
LogicalResult rewriteSingleBD(OpBuilder &builder, Block &block, AIE::TileOp &tile, AIE::DMAChannelDir channelDir, std::optional< xilinx::AIE::PacketInfoAttr > packet)
AIE::DMABDOp getBdForBlock(Block &block)
LogicalResult hoistNextBdOpsIntoAttrs(DMAConfigureTaskOp op)
LogicalResult verifyBdInBlock(Block &block)
LogicalResult rewriteSingleDMAConfigureTaskOp(DMAConfigureTaskOp op)
LogicalResult setAddressForSingleBD(OpBuilder &builder, AIE::DMABDOp &bd_op, AIE::TileOp &tile)
LogicalResult verifyOptionalLocksInBlock(Block &block)
LogicalResult matchAndRewrite(DMAAwaitTaskOp op, OpAdaptor adaptor, ConversionPatternRewriter &rewriter) const override
LogicalResult matchAndRewrite(DMAStartTaskOp op, OpAdaptor adaptor, ConversionPatternRewriter &rewriter) const override