MLIR-AIE
AIEXDialect.cpp
//===- AIEXDialect.cpp ------------------------------------------*- C++ -*-===//
//
// This file is licensed under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
// (c) Copyright 2019 Xilinx Inc.
//
//===----------------------------------------------------------------------===//

#include "aie/Dialect/AIEX/IR/AIEXDialect.h"

#include "mlir/Dialect/Func/IR/FuncOps.h"
#include "mlir/IR/DialectImplementation.h"
#include "mlir/IR/Operation.h"
#include "mlir/IR/TypeUtilities.h"
#include "mlir/Interfaces/DataLayoutInterfaces.h"
#include "mlir/Interfaces/FoldInterfaces.h"
#include "mlir/Transforms/InliningUtils.h"

#include "llvm/ADT/TypeSwitch.h"
#include "llvm/Support/TypeSize.h"

#include <cstdint>
#include <numeric>

using namespace mlir;
using namespace xilinx;

#include "aie/Dialect/AIEX/IR/AIEXDialect.cpp.inc"

#define GET_TYPEDEF_CLASSES
#include "aie/Dialect/AIEX/IR/AIEXTypes.cpp.inc"

namespace xilinx::AIEX {

// FIXME: use Tablegen'd dialect class
void AIEXDialect::initialize() {
  addOperations<
#define GET_OP_LIST
#include "aie/Dialect/AIEX/IR/AIEX.cpp.inc"
      >();
  addTypes<
#define GET_TYPEDEF_LIST
#include "aie/Dialect/AIEX/IR/AIEXTypes.cpp.inc"
      >();
}

} // namespace xilinx::AIEX

#define GET_OP_CLASSES
#include "aie/Dialect/AIEX/IR/AIEX.cpp.inc"

/* Compute the correct values to write to the hardware registers to configure
   strides and wraps, given the user-facing strides and wraps from the IR.

   In the IR, we express strides in units of the element data type, but the
   hardware requires them in units of the address granularity, which is
   currently 4 bytes for all hardware.

   User-facing strides/wraps relate to the hardware values as follows:

   - By default, a stride of 0 and a size of 1 are assumed if unspecified.
   - If only N strides/wraps are given, they define the lowest N dimensions.

   iteration_stride == inputStride[3] * elemSizeFac - 1
   iteration_size   == inputSize[3] - 1
       The highest-dimension stride/wrap maps onto the hardware iteration
       stride/count.
   d2_stride == inputStride[2] * elemSizeFac - 1
       Note: d2_size is not specified in hardware, as it is implicit from the
       total buffer transfer length.
   d1_stride == inputStride[1] * elemSizeFac - 1
   d1_size   == inputSize[1]
   d0_stride == inputStride[0] * elemSizeFac - 1
   d0_size   == inputSize[0] * elemSizeFac

   where elemSizeFac        == bufferElementSize / addressGranularity,
         bufferElementSize  == size in bytes of an element in the buffer,
                               e.g. 4 for int32, and
         addressGranularity == transfer granularity of the hardware, which is
                               4 bytes for all current hardware.

   Note: hardware strides are offset by one from the user-facing strides,
   because the hardware does not support a stride of 0 (a repeat).
 */
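/* Worked example (illustrative values, assuming the 4-byte / 32-bit address
   granularity stated above): for an int32 buffer, elemWidth == 32 bits and
   elemSizeFac == 1, so the user-facing values

       inputSizes   == [16,  8,   4,   2]  (d0 innermost, iteration outermost)
       inputStrides == [ 1, 16, 128, 512]

   lower to the hardware values

       sizes   == [16,  8,   4,   1]  (iteration_size is stored as count - 1)
       strides == [ 0, 15, 127, 511]  (all strides are stored offset by one)

   as computed by getHardwareStridesWraps below. */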
void AIEX::getHardwareStridesWraps(const AIE::AIETargetModel &targetModel,
                                   mlir::Operation *op,
                                   mlir::BaseMemRefType referencedBufType,
                                   llvm::SmallVector<int64_t, 4> inputSizes,
                                   llvm::SmallVector<int64_t, 4> inputStrides,
                                   llvm::SmallVector<int64_t, 4> &sizes,
                                   llvm::SmallVector<int64_t, 4> &strides) {
  assert(inputSizes.size() == inputStrides.size());
  assert(sizes.size() == 4);
  assert(strides.size() == 4);

  DataLayout dataLayout = DataLayout::closest(op);
  auto elemWidth =
      dataLayout.getTypeSizeInBits(referencedBufType.getElementType());
  auto addressGranularity = targetModel.getAddressGenGranularity();

  // Output strides and sizes are default-initialized to 0.
  std::fill(sizes.begin(), sizes.end(), 0);
  std::fill(strides.begin(), strides.end(), 0);

  if (inputSizes[0] == 0) {
    // Illegal input; this won't transfer anything at all.
    // Leave it to the verification functions to complain to the user.
    return;
  }

  // d0_size, d0_stride
  sizes[0] = inputSizes[0] * elemWidth / addressGranularity;
  if (inputStrides[0] * elemWidth < addressGranularity ||
      (elemWidth > addressGranularity)) {
    // First check:
    // While the hardware cannot transfer less than addressGranularity bits at
    // a time, the user may express a contiguous transfer of multiple elements
    // with a stride smaller than addressGranularity. We can thus set the
    // stride to 1 (encoded in hardware as 0) here to allow such transfers.
    // The verification function should ensure that
    //     inputStrides[0] * elemWidth < addressGranularity
    // iff inputSize[0] * elemWidth > addressGranularity.
    // Second check:
    // If the element width is larger than addressGranularity, we need to make
    // sure that all bytes are properly copied, and therefore the stride must
    // be set to 1 (encoded in hardware as 0).
    // The verification function should ensure that
    //     inputStrides[0] * elemWidth % addressGranularity == 0
    //     && inputStrides[0] == 1 if elemWidth > addressGranularity
    // This makes it impossible to have a stride greater than 1 for elemWidths
    // bigger than addressGranularity, even if they are a multiple of it. Such
    // operations should make use of an additional dimension instead.
    strides[0] = 0;
  } else {
    strides[0] = inputStrides[0] * elemWidth / addressGranularity - 1;
  }

  // d1_size, d1_stride
  sizes[1] = inputSizes[1];
  if (inputSizes[1] > 1) {
    // Stride only matters if we have more than one iteration.
    strides[1] = inputStrides[1] * elemWidth / addressGranularity - 1;
  }

  // d2_size, d2_stride
  sizes[2] = inputSizes[2];
  if (inputSizes[2] > 1) {
    // Stride only matters if we have more than one iteration.
    strides[2] = inputStrides[2] * elemWidth / addressGranularity - 1;
  }

  // iteration_size, iteration_stride
  if (inputSizes[3] > 1) {
    // Stride only matters if we have more than one iteration.
    sizes[3] = inputSizes[3] - 1;
    // Note that the iteration_stride must be positive, just like the other
    // dimensions. However, one can encode a zero-stride "repeat" of the same
    // transfer by setting a positive repeat_count on the pushToQueue
    // instruction and setting the size here to 1. This causes the BD to
    // "wrap" on every single iteration, effectively never adding the
    // specified stride, which amounts to a repeat without a stride.
    if (inputStrides[3] > 0) {
      strides[3] = inputStrides[3] * elemWidth / addressGranularity - 1;
    }
  }
}
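
// The zero-stride "repeat" encoding described above, sketched with
// illustrative values: to repeat one and the same transfer four times, set
// inputSizes[3] == 1 and inputStrides[3] == 0, so that sizes[3] and
// strides[3] both remain 0 here, and request repeat_count == 4 on the
// corresponding push-to-queue operation instead.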

mlir::LogicalResult
AIEX::verifyStridesWraps(mlir::Operation *forOp,
                         mlir::BaseMemRefType referencedBufType, int tileCol,
                         int tileRow, llvm::SmallVector<int64_t, 4> inputSizes,
                         llvm::SmallVector<int64_t, 4> inputStrides,
                         llvm::SmallVector<int64_t, 4> hardwareSizes,
                         llvm::SmallVector<int64_t, 4> hardwareStrides,
                         bool skipTransformationChecks) {
  const auto &targetModel = AIE::getTargetModel(forOp);
  auto addressGranularity = targetModel.getAddressGenGranularity();
  DataLayout dataLayout = DataLayout::closest(forOp);
  auto elemWidth =
      dataLayout.getTypeSizeInBits(referencedBufType.getElementType());

  uint32_t wrap_bits = 0;
  uint32_t step_bits = 0;
  uint32_t iter_bits = 6;
  if (targetModel.isShimNOCTile(tileCol, tileRow)) {
    step_bits = 20; // XAIEMLGBL_NOC_MODULE_DMA_BD0_3_D0_STEPSIZE_WIDTH
    wrap_bits = 10; // XAIEMLGBL_NOC_MODULE_DMA_BD0_3_D0_WRAP_WIDTH
  } else if (targetModel.isMemTile(tileCol, tileRow)) {
    step_bits = 17; // XAIEMLGBL_MEM_TILE_MODULE_DMA_BD0_2_D0_STEPSIZE_WIDTH
    wrap_bits = 10; // XAIEMLGBL_MEM_TILE_MODULE_DMA_BD0_2_D0_WRAP_WIDTH
  } else if (targetModel.isCoreTile(tileCol, tileRow)) {
    step_bits = 13; // XAIEMLGBL_MEMORY_MODULE_DMA_BD0_2_D0_STEPSIZE_WIDTH
    wrap_bits = 8;  // XAIEMLGBL_MEMORY_MODULE_DMA_BD0_3_D0_WRAP_WIDTH
  } else {
    return forOp->emitOpError(
        "Unsupported tile type at (" + std::to_string(tileCol) + ", " +
        std::to_string(tileRow) + "). Must be ShimNOC, Mem, or Core.");
  }

  for (int i = 0; i < 4; i++) {
    if (inputSizes[i] <= 0) {
      return forOp->emitOpError("Size ") << i << " must be a positive integer.";
    }
  }

  if (inputSizes[0] * elemWidth % addressGranularity != 0) {
    std::stringstream msg;
    msg << "Transfer sizes must be multiples of " << (addressGranularity / 8)
        << " bytes. " << inputSizes[0] << " elements at " << (elemWidth / 8)
        << " bytes each equal " << (inputSizes[0] * elemWidth / 8)
        << " bytes, which is not divisible by " << (addressGranularity / 8)
        << ". ";
    return forOp->emitOpError(msg.str());
  }

  for (int i = 0; i < 3; i++) {
    if (inputSizes[i] > 1 && inputStrides[i] < 1) {
      // If inputSize[i] == 1, any stride value is allowable, since that
      // stride will never be applied. For any larger size, we must verify
      // that the stride is positive.
      return forOp->emitOpError("Stride ")
             << i << " must be a positive integer.";
    }
  }
  // A value of zero is allowable for the fourth-dimension stride
  // (this indicates an iteration stride of 0 for a repeat).
  if (inputSizes[3] > 1 && inputStrides[3] < 0) {
    return forOp->emitOpError("Stride 3 must be a non-negative integer.");
  }

  for (int i = 0; i < 4; i++) {
    // strides[0] == 1 is ok iff the transfer size is a multiple of
    // addressGranularity, which is checked above.
    if (i == 0 && inputStrides[i] == 1)
      continue;
    if (inputStrides[i] * elemWidth % addressGranularity != 0) {
      std::stringstream msg;
      msg << "Stride " << i << " is " << inputStrides[i] << " elements * "
          << (elemWidth / 8) << " bytes = " << (inputStrides[i] * elemWidth / 8)
          << " bytes, which is not divisible by " << (addressGranularity / 8)
          << ". ";
      return forOp->emitOpError(msg.str());
    }
  }

  if (!skipTransformationChecks && hardwareSizes[0] > (1 << wrap_bits) - 1)
    return forOp->emitOpError(
        "Size 0 exceeds the [0:" + std::to_string((1 << wrap_bits) - 1) +
        "] range.");
  if (hardwareSizes[1] > (1 << wrap_bits) - 1)
    return forOp->emitOpError(
        "Size 1 exceeds the [0:" + std::to_string((1 << wrap_bits) - 1) +
        "] range.");
  if (hardwareSizes[3] > (1 << iter_bits))
    return forOp->emitOpError(
        "Size 3 exceeds the [1:" + std::to_string(1 << iter_bits) + "] range.");
  if (hardwareStrides[0] > (1 << step_bits))
    return forOp->emitOpError("Stride 0 exceeds the [1:" +
                              std::to_string(1 << step_bits) + "] range.");
  if (hardwareStrides[1] > (1 << step_bits))
    return forOp->emitOpError("Stride 1 exceeds the [1:" +
                              std::to_string(1 << step_bits) + "] range.");
  if (hardwareStrides[2] > (1 << step_bits))
    return forOp->emitOpError("Stride 2 exceeds the [1:" +
                              std::to_string(1 << step_bits) + "] range.");
  // strides[3] exceeding the range is ok iff sizes[3] is one (encoded as
  // hardwareSizes[3] == 0), since the stride is then never applied; the
  // condition below accounts for this.
  if (hardwareStrides[3] > (1 << step_bits) && hardwareSizes[3] > 0)
    return forOp->emitOpError("Stride 3 exceeds the [1:" +
                              std::to_string(1 << step_bits) + "] range.");

  return success();
}
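
// An example of a transfer this verifier rejects, assuming the usual 4-byte
// address granularity: for an i16 buffer (elemWidth == 16 bits),
// inputSizes[0] == 3 moves 3 * 2 == 6 bytes, which is not a multiple of 4
// bytes, so the size check above fails with a message like "Transfer sizes
// must be multiples of 4 bytes. 3 elements at 2 bytes each equal 6 bytes,
// which is not divisible by 4."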

//===----------------------------------------------------------------------===//
// UseTokenOp
//===----------------------------------------------------------------------===//

LogicalResult AIEX::UseTokenOp::verify() {
  auto *parentOp = (*this)->getParentOp();
  if (isa<func::FuncOp>(parentOp) || isa<AIE::CoreOp>(parentOp) ||
      isa<AIE::MemOp>(parentOp) || isa<AIE::ShimDMAOp>(parentOp))
    return success();
  return failure();
}

//===----------------------------------------------------------------------===//
// MulticastOp
//===----------------------------------------------------------------------===//

LogicalResult AIEX::MulticastOp::verify() {
  Region &body = getPorts();
  assert(getOperation()->getNumRegions());
  assert(!body.empty());
  for (auto &ops : body.front())
    if (!isa<MultiDestOp, AIE::EndOp>(ops))
      return ops.emitOpError("cannot be contained in a Multicast op");

  return success();
}

//===----------------------------------------------------------------------===//
// BroadcastPacketOp
//===----------------------------------------------------------------------===//

LogicalResult AIEX::BroadcastPacketOp::verify() {
  Region &body = getPorts();
  assert(getOperation()->getNumRegions());
  assert(!body.empty());
  for (auto &ops : body.front())
    if (!isa<BPIDOp, AIE::EndOp>(ops))
      return ops.emitOpError("cannot be contained in a BroadcastPacket op");

  return success();
}

//===----------------------------------------------------------------------===//
// NpuDmaMemcpyNdOp
//===----------------------------------------------------------------------===//

/* Calculates the offset value, in bytes, at which the transfer described by
   this operation starts within the referenced buffer. */
int64_t AIEX::NpuDmaMemcpyNdOp::getOffsetInBytes() {
  llvm::SmallVector<int64_t, 4> offsets =
      llvm::map_to_vector(llvm::reverse(getMixedOffsets()), [](OpFoldResult s) {
        return getConstantIntValue(s).value();
      });
  llvm::SmallVector<int64_t, 4> strides =
      llvm::map_to_vector(llvm::reverse(getMixedStrides()), [](OpFoldResult s) {
        return getConstantIntValue(s).value();
      });
  size_t offset = 0;
  size_t R = offsets.size();
  size_t el_bit_width = getElementTypeBitwidth();
  assert(el_bit_width % 8 == 0 &&
         "Expected memref element bitwidth to be a multiple of 8.");
  size_t S = el_bit_width / 8;
  for (size_t i = 0; i < R; i++)
    offset += offsets[i] * strides[i] * S;
  return offset;
}
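
// Worked example (illustrative values): for an i32 memref (S == 4 bytes)
// with reversed, lowest-dimension-first offsets [4, 0, 0, 1] and strides
// [1, 0, 0, 2048], the result is 4 * 1 * 4 + 1 * 2048 * 4 == 8208 bytes.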

// dma_memcpy_nd transfers of the form [*, 1, 1, len][*, 0, 0, 1] do not
// specify any data layout transformation, but simply express a contiguous
// transfer of `len` elements. We exclude the 4th dimension from these checks,
// because a repeat count is possible even without a data layout
// transformation.
bool AIEX::NpuDmaMemcpyNdOp::isLinearTransferWithoutTransformation() {
  llvm::SmallVector<int64_t, 4> inputSizes =
      llvm::map_to_vector(llvm::reverse(getMixedSizes()), [](OpFoldResult s) {
        return getConstantIntValue(s).value();
      });
  llvm::SmallVector<int64_t, 4> inputStrides =
      llvm::map_to_vector(llvm::reverse(getMixedStrides()), [](OpFoldResult s) {
        return getConstantIntValue(s).value();
      });
  return (inputSizes[1] == 1 && inputSizes[2] == 1 && inputStrides[0] == 1 &&
          inputStrides[1] == 0 && inputStrides[2] == 0);
}
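
// For example (illustrative values), sizes [1024, 1, 1, 4] with strides
// [1, 0, 0, 1024], given lowest-dimension-first, express a contiguous
// transfer of 1024 elements repeated four times, and therefore count as a
// linear transfer without a data layout transformation.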

// Helper function to check whether a requested burst length is supported by
// the target model. Returns an error message if the burst length is not
// supported, or an empty optional otherwise.
static std::optional<std::string>
checkBurstLength(const xilinx::AIE::AIETargetModel &targetModel,
                 uint32_t requestedBurstLength) {
  if (requestedBurstLength != 0) {
    auto bel = targetModel.getShimBurstEncodingsAndLengths();
    auto pair = std::find_if(bel.begin(), bel.end(),
                             [=](const std::pair<uint32_t, uint32_t> &p) {
                               return p.second == requestedBurstLength;
                             });

    if (pair == bel.end()) {
      std::string errorMessage =
          "Requested burst length is not supported by the target. "
          "Supported burst lengths:";

      errorMessage =
          std::accumulate(bel.begin(), bel.end(), errorMessage,
                          [](const std::string &a, auto b) {
                            return a + " " + std::to_string(b.second);
                          });

      return errorMessage;
    }
  }

  return std::nullopt;
}

LogicalResult AIEX::NpuDmaMemcpyNdOp::verify() {
  BaseMemRefType buffer = getMemref().getType();
  const auto &targetModel = AIE::getTargetModel(*this);
  auto addressGranularity = targetModel.getAddressGenGranularity();

  if (getElementTypeBitwidth() > addressGranularity) {
    return emitOpError("Maximum element bit width allowed is ")
           << addressGranularity << " bits.";
  }
  if (buffer.hasStaticShape() &&
      (buffer.getNumElements() * getElementTypeBitwidth()) <
          addressGranularity) {
    return emitOpError("Minimum data transfer size required is ")
           << addressGranularity << " bits.";
  }
  if (!llvm::all_of(getMixedStrides(), [](OpFoldResult s) {
        return getConstantIntValue(s).has_value();
      }))
    return emitOpError("Only constant strides currently supported.");
  if (!llvm::all_of(getMixedSizes(), [](OpFoldResult s) {
        return getConstantIntValue(s).has_value();
      }))
    return emitOpError("Only constant sizes currently supported.");
  if (!llvm::all_of(getMixedOffsets(), [](OpFoldResult s) {
        return getConstantIntValue(s).has_value();
      }))
    return emitOpError("Only constant offsets currently supported.");

  llvm::SmallVector<int64_t, 4> inputSizes =
      llvm::map_to_vector(llvm::reverse(getMixedSizes()), [](OpFoldResult s) {
        return getConstantIntValue(s).value();
      });
  llvm::SmallVector<int64_t, 4> inputStrides =
      llvm::map_to_vector(llvm::reverse(getMixedStrides()), [](OpFoldResult s) {
        return getConstantIntValue(s).value();
      });
  llvm::SmallVector<int64_t, 4> hardwareSizes(4);
  llvm::SmallVector<int64_t, 4> hardwareStrides(4);
  getHardwareStridesWraps(targetModel, getOperation(), buffer, inputSizes,
                          inputStrides, hardwareSizes, hardwareStrides);
  int64_t offset = getOffsetInBytes();

  auto errorMessage = checkBurstLength(targetModel, getBurstLength());
  if (errorMessage.has_value()) {
    return emitOpError(errorMessage.value());
  }

  // The experimental HSA target uses this op on AIE1; skip all the
  // AIE2-specific checks.
  if (targetModel.getTargetArch() == AIE::AIEArch::AIE1)
    return success();

  if (offset % 4 != 0) {
    return emitOpError("Offset must be 4-byte-aligned.");
  }

  // dma_memcpy_nd transfers of the form [1, 1, 1, len][0, 0, 0, 1] do not
  // specify any data layout transformation, but simply express a contiguous
  // transfer of `len`. For backwards compatibility, we allow this to proceed
  // even if it exceeds the maximum stride/wrap size of any one dimension,
  // and simply do not lower any data layout transformations, since there is
  // no other way to express this at the dma_memcpy_nd interface otherwise.
  AIE::ShimDMAllocationGetter allocGetter;
  AIE::DeviceOp dev = getOperation()->getParentOfType<AIE::DeviceOp>();
  if (auto allocOp = allocGetter.get(dev, getMetadata())) {
    int col = allocOp->getCol();
    bool skipTransformationChecks = isLinearTransferWithoutTransformation();
    if (failed(verifyStridesWraps(*this, buffer, col, 0, inputSizes,
                                  inputStrides, hardwareSizes, hardwareStrides,
                                  skipTransformationChecks))) {
      return failure();
    }
  }

  // packet header
  if (auto packetInfo = getPacket()) {
    if (packetInfo->getPktType() > 7)
      return emitOpError("Packet type field can only hold 3 bits.");
    if (packetInfo->getPktId() > 31)
      return emitOpError("Packet ID field can only hold 5 bits.");
  }

  return success();
}

//===----------------------------------------------------------------------===//
// NpuDmaWaitOp
//===----------------------------------------------------------------------===//

LogicalResult AIEX::NpuDmaWaitOp::verify() {
  AIE::DeviceOp dev = (*this)->getParentOfType<AIE::DeviceOp>();
  // Some passes (e.g. aie-standard-lowering) use aiex ops outside a DeviceOp,
  // so we can't expect the device to always exist.
  if (dev && !dev.lookupSymbol(getSymbol()))
    return emitOpError("couldn't find symbol in parent device");
  return success();
}

//===----------------------------------------------------------------------===//
// NpuPushQueueOp
//===----------------------------------------------------------------------===//

LogicalResult AIEX::NpuPushQueueOp::verify() {
  const auto &targetModel = AIE::getTargetModel(*this);
  auto numBds = targetModel.getNumBDs(getColumn(), getRow());
  // BD IDs are zero-based, so the highest valid ID is numBds - 1.
  if (getBdId() >= numBds)
    return emitOpError("BD ID exceeds the maximum ID.");
  if (getRepeatCount() > 255)
    return emitOpError("Repeat count exceeds the [0:255] range.");
  return success();
}

//===----------------------------------------------------------------------===//
// NpuWriteBdOp
//===----------------------------------------------------------------------===//

LogicalResult AIEX::NpuWriteBdOp::verify() {
  const auto &targetModel = AIE::getTargetModel(*this);
  auto numBds = targetModel.getNumBDs(getColumn(), getRow());
  bool isLinearTransfer =
      (getD0Size() >= 1) && (getD1Size() == 1) && (getIterationSize() == 0);
  // BD IDs are zero-based, so the highest valid ID is numBds - 1.
  if (getBdId() >= numBds)
    return emitOpError("BD ID exceeds the maximum ID.");
  if (getPacketId() > 31)
    return emitOpError("Packet ID exceeds the maximum supported by 5 bits.");
  if (getPacketType() > 7)
    return emitOpError("Packet Type exceeds the maximum supported by 3 bits.");
  if (!isLinearTransfer && getD0Size() > 0x3FF)
    return emitOpError("D0 Size exceeds the [0:1023] range.");
  if (getD0Stride() > 0xFFFFF)
    return emitOpError("D0 Stride exceeds the [0:1M-1] range.");
  if (getD1Size() > 0x3FF)
    return emitOpError("D1 Size exceeds the [0:1023] range.");
  if (getD1Stride() > 0xFFFFF)
    return emitOpError("D1 Stride exceeds the [0:1M-1] range.");
  if (getD2Stride() > 0xFFFFF)
    return emitOpError("D2 Stride exceeds the [0:1M-1] range.");
  if (getIterationSize() > 0x3F)
    return emitOpError("Iteration Size exceeds the [0:63] range.");
  if (getIterationStride() > 0xFFFFF)
    return emitOpError("Iteration Stride exceeds the [0:1M-1] range.");
  if (targetModel.isShimNOCTile(getColumn(), getRow()) && getD2Size() != 0)
    return emitOpError("ShimTile only supports 3 dimensions of sizes.");
  if (targetModel.isShimNOCTile(getColumn(), getRow()) &&
      (getD0ZeroBefore() != 0 || getD0ZeroAfter() != 0 ||
       getD1ZeroBefore() != 0 || getD1ZeroAfter() != 0 ||
       getD2ZeroBefore() != 0 || getD2ZeroAfter() != 0))
    return emitOpError("ShimTile doesn't support zero padding.");
  if (!targetModel.isShimNOCTile(getColumn(), getRow()) &&
      getBurstLength() != 0)
    return emitOpError("Only ShimTiles support burst length.");
  auto errorMessage = checkBurstLength(targetModel, getBurstLength());
  if (errorMessage.has_value()) {
    return emitOpError(errorMessage.value());
  }

  return success();
}

//===----------------------------------------------------------------------===//
// RuntimeSequenceOp
//===----------------------------------------------------------------------===//

ParseResult AIEX::RuntimeSequenceOp::parse(OpAsmParser &parser,
                                           OperationState &result) {

  StringAttr nameAttr;
  (void)parser.parseOptionalSymbolName(
      nameAttr, mlir::SymbolTable::getSymbolAttrName(), result.attributes);

  SmallVector<OpAsmParser::Argument> entryArgs;

  // Entry arguments, e.g. (%addr: memref<1xi32>)
  ParseResult argParseResult = parser.parseCommaSeparatedList(
      OpAsmParser::Delimiter::Paren, [&]() -> ParseResult {
        OpAsmParser::Argument argument;
        if (parser.parseArgument(argument, /*allowType=*/true,
                                 /*allowAttrs=*/true)) {
          return failure();
        }
        entryArgs.push_back(argument);
        return success();
      });
  if (argParseResult) {
    return argParseResult;
  }

  // Body
  auto *body = result.addRegion();
  ParseResult bodyParseResult =
      parser.parseRegion(*body, entryArgs, /*enableNameShadowing=*/false);
  if (bodyParseResult) {
    return bodyParseResult;
  }

  return success();
}

void AIEX::RuntimeSequenceOp::print(OpAsmPrinter &printer) {
  Region &body = getRegion();

  auto nameAttr = (*this)->getAttrOfType<StringAttr>(
      mlir::SymbolTable::getSymbolAttrName());
  if (nameAttr) {
    printer << ' ';
    printer.printSymbolName(nameAttr);
  }

  printer << '(';
  for (unsigned i = 0, n = body.getNumArguments(); i < n; i++) {
    if (i > 0) {
      printer << ", ";
    }
    printer.printRegionArgument(body.getArgument(i));
  }
  printer << ')';

  printer << ' ';
  printer.printRegion(body, /*printEntryBlockArgs=*/false,
                      /*printBlockTerminators=*/true);
}
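
// The custom assembly handled by parse() and print() above looks roughly as
// follows (a sketch, assuming the op's usual `aiex.runtime_sequence`
// mnemonic):
//
//   aiex.runtime_sequence @seq(%arg0: memref<1xi32>) {
//     ...
//   }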

LogicalResult AIEX::RuntimeSequenceOp::verify() {
  AIE::DeviceOp device = (*this)->getParentOfType<AIE::DeviceOp>();
  if (!device) {
    // This check is redundant with the HasParent trait, but it can't hurt.
    (*this)->emitOpError() << "must be inside AIE device operation.";
    return failure();
  }
  return success();
}

//===----------------------------------------------------------------------===//
// DMAConfigureTaskOp
//===----------------------------------------------------------------------===//

std::optional<uint32_t> AIEX::DMAConfigureTaskOp::getFirstBdId() {
  Region &body = getBody();
  if (body.empty()) {
    return std::nullopt;
  }
  auto bd_ops = body.front().getOps<AIE::DMABDOp>();
  if (bd_ops.empty() && body.front().getNumSuccessors() == 1) {
    // Allow the first block to be empty and point to the entry point of the
    // chain. This allows cyclic BD chains (infinite loops) to be specified
    // within the constraints of MLIR syntax.
    Block &chain_entry = *body.front().getSuccessor(0);
    bd_ops = chain_entry.getOps<AIE::DMABDOp>();
  }
  if (bd_ops.empty()) {
    return std::nullopt;
  }
  AIE::DMABDOp bd = *bd_ops.begin();
  if (!bd.getBdId().has_value()) {
    return std::nullopt;
  }
  return bd.getBdId().value();
}
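
// A sketch of the cyclic-chain shape handled above (block structure only;
// the op spellings are illustrative): the empty entry block branches to the
// real start of the chain, which can then branch back to itself, since MLIR
// forbids branching back to a region's entry block directly:
//
//   ^entry:             // empty; its single successor is the chain entry
//       br ^chain
//   ^chain:
//       aie.dma_bd(...) // first BD; its bd_id is what getFirstBdId returns
//       ...
//       br ^chain       // loops back, forming a cyclic chain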

LogicalResult
AIEX::DMAConfigureTaskOp::canonicalize(AIEX::DMAConfigureTaskOp op,
                                       PatternRewriter &rewriter) {
  // Erase the terminators of blocks that contain nothing else, so the
  // now-empty blocks can subsequently be removed.
  Region &body = op.getBody();
  bool did_rewrite = false;
  for (auto it = body.begin(); it != body.end(); ++it) {
    Block &block = *it;
    if (block.empty()) {
      continue;
    }
    if (block.without_terminator().empty()) {
      rewriter.eraseOp(block.getTerminator());
      did_rewrite = true;
    }
  }
  if (did_rewrite) {
    return success();
  }
  return failure();
}

LogicalResult AIEX::DMAConfigureTaskOp::verify() {
  Region &body = getBody();
  for (auto it = body.begin(); it != body.end(); ++it) {
    Block &block = *it;
    if (block.empty()) {
      continue;
    }
    if (block.hasNoPredecessors() && !block.isEntryBlock()) {
      return block.getTerminator()->emitError(
          "Block ending in this terminator does not form a chain with "
          "entry block.");
    }

    const AIE::AIETargetModel &targetModel =
        AIE::getTargetModel(getOperation());

    // This is a layering violation on the DMABDOps, but they are never
    // verified otherwise, because DMAConfigureTaskOps are not yet merged into
    // the AIE dialect. The normal DMABDOp verifier skips over any BD inside a
    // DMAConfigureTaskOp.
    LogicalResult result = success();
    block.walk([&](AIE::DMABDOp bd) {
      if (bd.getBurstLength() != 0 &&
          !targetModel.isShimNOCTile(getTileID().col, getTileID().row)) {
        bd.emitOpError("Burst length is only supported in Shim NOC tiles that "
                       "are connected to the memory-mapped NOC.");
        result = failure();
      }
    });
    if (failed(result)) {
      return result;
    }
  }
  return success();
}

//===----------------------------------------------------------------------===//
// DMAStartBdChainOp
//===----------------------------------------------------------------------===//

AIE::BDChainOp AIEX::DMAStartBdChainOp::getBDChainOp() {
  AIE::DeviceOp device = (*this)->getParentOfType<AIE::DeviceOp>();
  AIE::BDChainOp chain = device.lookupSymbol<AIE::BDChainOp>(getSymbol());
  return chain;
}

LogicalResult AIEX::DMAStartBdChainOp::verify() {
  AIE::BDChainOp chain = getBDChainOp();
  if (!chain) {
    return emitOpError("symbol does not reference valid BD chain");
  }

  auto actualArgTypes = getArgs().getTypes();
  auto expectedArgTypes = chain.getRegion().getArgumentTypes();
  if (actualArgTypes.size() != expectedArgTypes.size()) {
    return emitOpError("Number of arguments does not match.");
  }
  for (unsigned i = 0, n = expectedArgTypes.size(); i < n; i++) {
    if (actualArgTypes[i] != expectedArgTypes[i]) {
      return emitOpError("Argument ") << (i + 1) << " type mismatch: "
                                      << "expected " << expectedArgTypes[i]
                                      << " but got " << actualArgTypes[i];
    }
  }
  return success();
}

//===----------------------------------------------------------------------===//
// NpuControlPacketOp
//===----------------------------------------------------------------------===//

uint32_t AIEX::NpuControlPacketOp::getRowFromAddr() {
  const auto &targetModel = AIE::getTargetModel(*this);
  uint32_t addr = getAddress();
  uint32_t rowInt = (addr >> targetModel.getRowShift()) & 0x1f;
  return rowInt;
}

uint32_t AIEX::NpuControlPacketOp::getColumnFromAddr() {
  const auto &targetModel = AIE::getTargetModel(*this);
  uint32_t addr = getAddress();
  uint32_t colInt = (addr >> targetModel.getColumnShift()) & 0x1f;
  return colInt;
}
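
// Worked example, assuming a column shift of 25 and a row shift of 20 (the
// actual shift amounts are target-specific): an address with
// (2 << 25) | (3 << 20) set in its upper bits decodes to column 2 and row 3
// via the two functions above.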

//===----------------------------------------------------------------------===//
// SetLockOp
//===----------------------------------------------------------------------===//

LogicalResult AIEX::SetLockOp::verify() {
  const auto &targetModel = AIE::getTargetModel(*this);

  if (targetModel.getTargetArch() == AIE::AIEArch::AIE1)
    return emitOpError("SetLockOp is not supported on AIE1.");

  if (getValue() > targetModel.getMaxLockValue())
    return emitOpError("Lock value exceeds the maximum value of " +
                       std::to_string(targetModel.getMaxLockValue()));

  auto lockOp = getLockOp();
  auto lockIDOpt = lockOp.getLockID();
  // Note that the lockID may not be assigned initially, so wait until it is
  // before verifying the lockID-dependent conditions.
  if (!lockIDOpt) {
    return success();
  }

  auto col = lockOp.colIndex();
  auto row = lockOp.rowIndex();
  uint32_t lockID = lockOp.getLockIDValue();

  if (lockID >= targetModel.getNumLocks(col, row)) {
    return emitOpError("Lock ID out of range for given tile. Max ID: " +
                       std::to_string(targetModel.getNumLocks(col, row) - 1));
  }

  if (!targetModel.getLocalLockAddress(lockID, lockOp.getTileID())) {
    return emitOpError("Invalid lock ID and tile combination when trying to "
                       "retrieve the local lock address.");
  }

  return success();
}

//===----------------------------------------------------------------------===//
// BlockFloatType
//===----------------------------------------------------------------------===//

uint64_t AIEX::BlockFloatType::getTotalSizeInBits() const {
  return getBlockSize() * getMantissaBits() + getExponentBits() +
         getSubtileShiftBits();
}

llvm::TypeSize AIEX::BlockFloatType::getTypeSizeInBits(
    const mlir::DataLayout &dataLayout,
    mlir::DataLayoutEntryListRef params) const {
  return llvm::TypeSize::getFixed(getTotalSizeInBits());
}

uint64_t AIEX::BlockFloatType::getABIAlignment(
    const mlir::DataLayout &dataLayout,
    mlir::DataLayoutEntryListRef params) const {
  // For the purposes of the data movement operations, we want all types to be
  // packed, i.e., the ABI alignment is 1.
  return 1;
}

std::optional<AIEX::BlockFloatType::BlockFormat>
AIEX::BlockFloatType::getBlockFormat(StringRef blockType) {
  static const llvm::StringMap<AIEX::BlockFloatType::BlockFormat>
      blockFormatsMap = {
          {"v8bfp16ebs8", {8, 8, 8, 0}},
          {"v16bfp16ebs16", {16, 8, 8, 0}},
      };

  auto it = blockFormatsMap.find(blockType);
  if (it != blockFormatsMap.end()) {
    return it->second;
  }

  return std::nullopt;
}
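
// Worked example, assuming the initializer order above is {blockSize,
// mantissaBits, exponentBits, subtileShiftBits}: "v8bfp16ebs8" describes a
// block of 8 8-bit mantissas sharing one 8-bit exponent, so
// getTotalSizeInBits() yields 8 * 8 + 8 + 0 == 72 bits (9 bytes) per block.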

LogicalResult
AIEX::BlockFloatType::verify(function_ref<InFlightDiagnostic()> emitError,
                             StringRef block_type) {
  if (!getBlockFormat(block_type))
    return emitError() << "Invalid block type: " << block_type
                       << ". Known types are: v8bfp16ebs8, v16bfp16ebs16.";

  return success();
}