MLIR-AIE
AIETargetNPU.cpp
Go to the documentation of this file.
1//===- AIETargetNPU.cpp -----------------------------------------*- C++ -*-===//
2//
3// This file is licensed under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7// (c) Copyright 2023-2025 Advanced Micro Devices, Inc.
8//
9//===----------------------------------------------------------------------===//
10
12
15
16#include "mlir/Dialect/Func/IR/FuncOps.h"
17#include "mlir/Interfaces/DataLayoutInterfaces.h"
18#include "mlir/Tools/mlir-translate/MlirTranslateMain.h"
19
20#include "llvm/ADT/ArrayRef.h"
21#include "llvm/ADT/TypeSwitch.h"
22#include "llvm/Support/Format.h"
23
24#include <vector>
25
26extern "C" {
27// #include "xaiengine/xaie_txn.h"
28// see aie-rt commit a6196eb, xaiengine/xaie_txn.h for source of this enum
56}
57
58using namespace mlir;
59using namespace xilinx;
60using namespace xilinx::AIE;
61using namespace xilinx::AIEX;
62
63namespace {
64
65// Example:
66// - instructions = {3,4,5}
67// - tailSize = 2
68// instructions becomes {3,4,5,0,0} and
69// a mutable reference to the tail {0,0} is returned.
70llvm::MutableArrayRef<uint32_t>
71reserveAndGetTail(std::vector<uint32_t> &instructions, uint64_t tailSize) {
72 auto oldSize = instructions.size();
73 auto newSize = oldSize + tailSize;
74 instructions.resize(newSize, 0);
75 return llvm::MutableArrayRef<uint32_t>(instructions.data() + oldSize,
76 tailSize);
77}
78
79void appendSync(std::vector<uint32_t> &instructions, NpuSyncOp op) {
80
81 auto words = reserveAndGetTail(instructions, 4);
82
83 // XAIE_IO_CUSTOM_OP_TCT
84 words[0] = XAIE_IO_CUSTOM_OP_TCT;
85
86 words[1] = words.size() * sizeof(uint32_t); // Operation Size
87
88 words[2] |= static_cast<uint32_t>(op.getDirection()) & 0xff;
89 words[2] |= (op.getRow() & 0xff) << 8;
90 words[2] |= (op.getColumn() & 0xff) << 16;
91
92 words[3] |= (op.getRowNum() & 0xff) << 8;
93 words[3] |= (op.getColumnNum() & 0xff) << 16;
94 words[3] |= (op.getChannel() & 0xff) << 24;
95}
96
97void appendWrite32(std::vector<uint32_t> &instructions, NpuWrite32Op op) {
98
99 auto words = reserveAndGetTail(instructions, 6);
100
101 if (op.getBuffer()) {
102 op.emitOpError("Cannot translate symbolic address");
103 return;
104 }
105
106 // XAIE_IO_WRITE
107 words[0] = XAIE_IO_WRITE;
108 words[2] = op.getAddress();
109 auto col = op.getColumn();
110 auto row = op.getRow();
111 if (col && row) {
112 const AIETargetModel &tm = op->getParentOfType<DeviceOp>().getTargetModel();
113 words[2] = ((*col & 0xff) << tm.getColumnShift()) |
114 ((*row & 0xff) << tm.getRowShift()) | (words[2] & 0xFFFFF);
115 }
116 words[3] = 0; // Extra bits for Reg Offset
117 words[4] = op.getValue(); // Value
118 words[5] = words.size() * sizeof(uint32_t); // Operation Size
119}
120
121void appendMaskWrite32(std::vector<uint32_t> &instructions,
122 NpuMaskWrite32Op op) {
123
124 auto words = reserveAndGetTail(instructions, 7);
125
126 if (op.getBuffer()) {
127 op.emitOpError("Cannot translate symbolic address");
128 return;
129 }
130
131 // XAIE_IO_MASKWRITE
132 words[0] = XAIE_IO_MASKWRITE;
133 words[2] = op.getAddress();
134 auto col = op.getColumn();
135 auto row = op.getRow();
136 if (col && row) {
137 const AIETargetModel &tm = op->getParentOfType<DeviceOp>().getTargetModel();
138 words[2] = ((*col & 0xff) << tm.getColumnShift()) |
139 ((*row & 0xff) << tm.getRowShift()) | (words[2] & 0xFFFFF);
140 }
141 words[3] = 0;
142 words[4] = op.getValue(); // Value
143 words[5] = op.getMask(); // Mask
144 words[6] = words.size() * sizeof(uint32_t); // Operation Size
145}
146
147void appendAddressPatch(std::vector<uint32_t> &instructions,
148 NpuAddressPatchOp op) {
149
150 auto words = reserveAndGetTail(instructions, 12);
151
152 // XAIE_IO_CUSTOM_OP_DDR_PATCH
154 words[1] = words.size() * sizeof(uint32_t); // Operation Size
155
156 words[5] = 0; // Action
157
158 words[6] = op.getAddr();
159
160 words[8] = op.getArgIdx();
161
162 words[10] = op.getArgPlus();
163}
164
165void appendBlockWrite(std::vector<uint32_t> &instructions, NpuBlockWriteOp op) {
166
167 Value memref = op.getData();
168 DataLayout dataLayout = DataLayout::closest(op);
169 int64_t width = dataLayout.getTypeSizeInBits(cast<MemRefType>(memref.getType()).getElementType());
170 if (width != 32) {
171 op.emitWarning("Only 32-bit data type is supported for now");
172 return;
173 }
174
175 memref::GetGlobalOp getGlobal = memref.getDefiningOp<memref::GetGlobalOp>();
176 if (!getGlobal) {
177 op.emitError("Only MemRefs from memref.get_global are supported");
178 return;
179 }
180
181 auto global = dyn_cast_if_present<memref::GlobalOp>(
182 op->getParentOfType<AIE::DeviceOp>().lookupSymbol(getGlobal.getName()));
183 if (!global) {
184 op.emitError("Global symbol not found");
185 return;
186 }
187
188 auto initVal = global.getInitialValue();
189 if (!initVal) {
190 op.emitError("Global symbol has no initial value");
191 return;
192 }
193
194 auto data = dyn_cast<DenseIntElementsAttr>(*initVal);
195 if (!data) {
196 op.emitError("Global symbol initial value is not a dense int array");
197 return;
198 }
199
200 unsigned payload_start = 4;
201 auto words = reserveAndGetTail(instructions, data.size() + payload_start);
202
203 // XAIE_IO_BLOCKWRITE
204 words[0] = XAIE_IO_BLOCKWRITE;
205 words[2] = op.getAddress();
206 auto col = op.getColumn();
207 auto row = op.getRow();
208 if (col && row) {
209 words[1] = (*col & 0xff) | ((*row & 0xff) << 8);
210 const AIETargetModel &tm = op->getParentOfType<DeviceOp>().getTargetModel();
211 words[2] = ((*col & 0xff) << tm.getColumnShift()) |
212 ((*row & 0xff) << tm.getRowShift()) | (words[2] & 0xFFFFF);
213 }
214 words[3] = words.size() * sizeof(uint32_t); // Operation Size
215
216 unsigned i = payload_start;
217 for (auto d : data)
218 words[i++] = d.getZExtValue();
219}
220
221void appendPreempt(std::vector<uint32_t> &instructions,
222 NpuPreemptOp op) {
223
224 auto words = reserveAndGetTail(instructions, 1);
225 words[0] = XAIE_IO_PREEMPT | (op.getLevel() << 8);
226}
227
228} // namespace
229
230LogicalResult
232 std::vector<uint32_t> &instructions,
233 StringRef sequenceName) {
234
235 auto words = reserveAndGetTail(instructions, 4);
236
237 DeviceOp deviceOp = *module.getOps<DeviceOp>().begin();
238 const AIETargetModel &tm = deviceOp.getTargetModel();
239
240 // setup txn header
241 uint8_t major = 0;
242 uint8_t minor = 1;
243 uint8_t devGen = 3; // NPU (PHX HWK)
244 if (llvm::isa<AIE::BaseNPU2TargetModel>(tm))
245 devGen = 4; // NPU2 (STX KRK)
246 uint8_t numRows = tm.rows();
247 uint8_t numCols = tm.columns();
248 uint8_t numMemTileRows = tm.getNumMemTileRows();
249 uint32_t count = 0;
250 words[0] = (numRows << 24) | (devGen << 16) | (minor << 8) | major;
251 words[1] = (numMemTileRows << 8) | numCols;
252
253 auto sequenceOps = deviceOp.getOps<AIEX::RuntimeSequenceOp>();
254 for (auto seq : sequenceOps) {
255 if (sequenceName.size() && sequenceName != seq.getSymName())
256 continue;
257 Block &entry = seq.getBody().front();
258 for (auto &o : entry) {
259 llvm::TypeSwitch<Operation *>(&o)
260 .Case<NpuSyncOp>([&](auto op) {
261 count++;
262 appendSync(instructions, op);
263 })
264 .Case<NpuWrite32Op>([&](auto op) {
265 count++;
266 appendWrite32(instructions, op);
267 })
268 .Case<NpuBlockWriteOp>([&](auto op) {
269 count++;
270 appendBlockWrite(instructions, op);
271 })
272 .Case<NpuMaskWrite32Op>([&](auto op) {
273 count++;
274 appendMaskWrite32(instructions, op);
275 })
276 .Case<NpuAddressPatchOp>([&](auto op) {
277 count++;
278 appendAddressPatch(instructions, op);
279 })
280 .Case<NpuPreemptOp>([&](auto op) {
281 count++;
282 appendPreempt(instructions, op);
283 });
284 }
285 }
286
287 // write size fields of the txn header
288 instructions[2] = count;
289 instructions[3] = instructions.size() * sizeof(uint32_t); // size of the txn
290 return success();
291}
292
294 ModuleOp module, std::vector<uint32_t> &instructions,
295 StringRef sequenceName) {
296 DeviceOp deviceOp = *module.getOps<DeviceOp>().begin();
297 OpBuilder builder = OpBuilder::atBlockBegin(deviceOp.getBody());
298
299 auto sequenceOps = deviceOp.getOps<AIEX::RuntimeSequenceOp>();
300 for (auto seq : sequenceOps) {
301 if (sequenceName.size() && sequenceName != seq.getSymName())
302 continue;
303 Block &entry = seq.getBody().front();
304 for (auto &o : entry) {
305 auto packetOp = dyn_cast<AIEX::NpuControlPacketOp>(o);
306 if (!packetOp)
307 continue;
308
309 uint32_t size = 0;
310 auto data = packetOp.getData();
311 if (data)
312 size = data->size();
313
314 auto words = reserveAndGetTail(instructions, 2 + size);
315
316 if (!data && packetOp.getLength())
317 size = *packetOp.getLength();
318
319 auto parity = [](uint32_t n) {
320 uint32_t p = 0;
321 while (n) {
322 p += n & 1;
323 n >>= 1;
324 }
325 return (p % 2) == 0;
326 };
327
328 // stream header is attached here instead of by shim dma
329 int col = packetOp.getColumnFromAddr();
330 int row = packetOp.getRowFromAddr();
331 auto destTile = TileOp::getOrCreate(builder, deviceOp, col, row);
332 auto info = destTile->getAttrOfType<AIE::PacketInfoAttr>("controller_id");
333 if (!info)
334 return destTile->emitError("Expected controller_id attribute");
335 uint32_t hdr = (info.getPktType() & 0x7) << 12 | (info.getPktId() & 0xff);
336 words[0] = hdr | (0x1 & parity(hdr)) << 31;
337
338 // control packet header
339 uint32_t addr = packetOp.getAddress() & 0xFFFFF;
340 uint32_t beats = size - 1;
341 uint32_t opc = packetOp.getOpcode();
342 uint32_t id = packetOp.getStreamId();
343 hdr = id << 24 | opc << 22 | beats << 20 | addr;
344 words[1] = hdr | (0x1 & parity(hdr)) << 31;
345
346 // configuration data
347 if (opc == 0x0 || opc == 0x2)
348 for (unsigned i = 0; i < size; i++)
349 words[i + 2] = data.value()[i];
350 }
351 }
352 return success();
353}
XAie_TxnOpcode
@ XAIE_IO_LOAD_PM_START
@ XAIE_IO_MASKPOLL
@ XAIE_IO_PREEMPT
@ XAIE_IO_CUSTOM_OP_NEXT
@ XAIE_IO_UPDATE_REG
@ XAIE_IO_CUSTOM_OP_RECORD_TIMER
@ XAIE_IO_CUSTOM_OP_TCT
@ XAIE_IO_MASKPOLL_BUSY
@ XAIE_IO_CUSTOM_OP_READ_REGS
@ XAIE_CONFIG_SHIMDMA_BD
@ XAIE_IO_UPDATE_STATE_TABLE
@ XAIE_IO_UPDATE_SCRATCH
@ XAIE_IO_CUSTOM_OP_DDR_PATCH
@ XAIE_IO_CUSTOM_OP_MERGE_SYNC
@ XAIE_IO_LOAD_PM_END_INTERNAL
@ XAIE_IO_BLOCKWRITE
@ XAIE_IO_BLOCKSET
@ XAIE_IO_CREATE_SCRATCHPAD
@ XAIE_IO_LOADPDI
@ XAIE_IO_MASKWRITE
@ XAIE_IO_CUSTOM_OP_BEGIN
@ XAIE_IO_CUSTOM_OP_MAX
@ XAIE_IO_NOOP
@ XAIE_CONFIG_SHIMDMA_DMABUF_BD
@ XAIE_IO_WRITE
virtual int rows() const =0
Return the number of rows in the device.
virtual uint32_t getColumnShift() const =0
virtual int columns() const =0
Return the number of columns in the device.
virtual uint32_t getNumMemTileRows() const =0
virtual uint32_t getRowShift() const =0
uint8_t major
Definition cxxopts.hpp:131
uint8_t minor
Definition cxxopts.hpp:131
Include the generated interface declarations.
mlir::LogicalResult AIETranslateNpuToBinary(mlir::ModuleOp, std::vector< uint32_t > &, llvm::StringRef sequenceName="")
const AIETargetModel & getTargetModel(mlir::Operation *op)
mlir::LogicalResult AIETranslateControlPacketsToUI32Vec(mlir::ModuleOp, std::vector< uint32_t > &, llvm::StringRef sequenceName="")