AIEVectorize.cpp
1//===-AIEVectorize.cpp - Vectorizer for AIE architecture --------*- C++ -*-===//
2//
3// This file is licensed under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7// (c) Copyright 2022 Xilinx Inc.
8//
9//===----------------------------------------------------------------------===//
10// This file implements the functionality to massage the output of the affine
11// supervectorizer into a set of operations and datatypes corresponding to the
12// AIE vector abstraction.
13//===----------------------------------------------------------------------===//
14
20
21#include "mlir/Conversion/AffineToStandard/AffineToStandard.h"
22#include "mlir/Dialect/Affine/Analysis/LoopAnalysis.h"
23#include "mlir/Dialect/Affine/IR/AffineOps.h"
24#include "mlir/Dialect/Func/IR/FuncOps.h"
25#include "mlir/Dialect/MemRef/IR/MemRef.h"
26#include "mlir/Dialect/SCF/IR/SCF.h"
27#include "mlir/Dialect/Vector/Transforms/VectorTransforms.h"
28#include "mlir/IR/TypeUtilities.h"
29#include "mlir/Pass/PassManager.h"
30#include "mlir/Transforms/Passes.h"
31
32#include "llvm/ADT/SmallSet.h"
33
34using namespace llvm;
35using namespace mlir;
36using namespace arith;
37using namespace vector;
38using namespace xilinx;
39using namespace xilinx::aievec;
40
41#define DEBUG_TYPE "aie-vect"
42
43static llvm::cl::opt<bool>
44 unalignedLoadsCheck("unaligned-loads-check",
45 llvm::cl::desc("Enable the unaligned loads check"),
46 llvm::cl::init(true));
47
48static llvm::cl::opt<bool> AIEML("aieml", llvm::cl::desc("AI Engine-ML"),
49 llvm::cl::init(false));
50
51namespace {
52// A struct to pack the global state required for vectorization at one place.
53// Local to this translation unit.
54struct VectState {
55 // A vector of all the reuse intervals created. Class IntervalReuse represents
56 // a cluster of data accesses (with reuse potential) along the vectorized
57 // dimension of each array. It clusters together reads that have a potential
58 // for vector-level data reuse. Therefore, array accesses A[i][j:j+8] and
59 // A[i+2][j:j+8] will map to different IntervalReuse objects.
60 SmallVector<IntervalReuse *, 16> reuseIntervals;
61 // Map from a transfer_read operation to the IntervalReuse object it belongs
62 // to.
63 mlir::DenseMap<Operation *, IntervalReuse *> opToIntervalMap;
64 // Map from a transfer_read operation to its linearized access expression.
65 // Linearized expression for access A[i][j], where A is of dimensionality MxN
66 // is (i*N+j). We assume that the innermost dimension is the vectorized
67 // dimension.
68 mlir::DenseMap<Operation *, AffineExpr> linearizedAccess;
69 // A map from an index (of array access) to an expr dim map (e.g., i->d0). We
70 // need this to create the correct linearized expressions for all the array
71 // accesses in the function.
72 mlir::DenseMap<Value, AffineExpr> indexToExprDimMap;
73 // For each transfer_read operation, a map from its container basic block to
74 // the enclosing for/while loops. This helps us identify two instructions
75 // that are nested together, even if they belong to different basic blocks.
76 mlir::DenseMap<Block *, SmallVector<Operation *, 8>> blockToEnclosingLoops;
77 // This is specific to 8x8 scheme. For an 8x8 scheme, every mul/fma is
78 // replaced by two mul/fmas in AIE dialect. So we keep track of the pair.
79 mlir::DenseMap<Operation *, Operation *> pairedOp;
80 // If we fuse a representative mul/fma op with another fma op to exploit the
81 // column topology of the AIE intrinsic, then cache, for the representative
82 // op, the compile-time constant access distance between their two operands.
83 // The first(second) offset of the pair represents the access distance
84 // between the first(second) operands of the representative op and the
85 // fused op(s). This access distance will be used to compute the xstep/zstep
86 // attribute.
87 mlir::DenseMap<Operation *, std::pair<int32_t, int32_t>> opToColOffsets;
88 // Map from the sext op to the def op of the sext operand.
89 mlir::DenseMap<Operation *, Operation *> sextTruncDefMap;
90 // A set of operations that are msc (fmsub) ops. We do not differentiate
91 // between mac and msc ops at the vector dialect level; the only op in the
92 // vector dialect is the FMA op.
93 llvm::SmallSet<Operation *, 8> mscOps;
94 // Used to build and insert all the new operations created.
95 OpBuilder builder;
96 // The shift val for the ups and srs intrinsics. This value should be between 0
97 // and 63.
98 int8_t shift;
99 // The zero offset, indicating the position of recurring 0 in the input
100 // filter. The counting starts at 1. For example, if the filter array is
101 // {1,2,3,0,4,5,6,0,7,8,9,0}, then zeroOffset=4.
102 int32_t zeroOffset;
103 // The duplicate count, indicating the number of times a value is duplicated
104 // in the filter. The filter values must be duplicated at least twice for the
105 // i8xi8 scheme. An example of filter for i8xi8 scheme is {0,0,1,1,2,2,3,3},
106 // with dupFactor=2.
107 int32_t dupFactor;
108
109 bool unalignedLoadsCheck, aieml;
110
111 // Constructors
112 VectState(MLIRContext *context, int8_t s, int32_t z, int32_t d,
113 bool unalignedLoadsCheck, bool aieml)
114 : builder(context), shift(s), zeroOffset(z), dupFactor(d),
115 unalignedLoadsCheck(unalignedLoadsCheck), aieml(aieml) {}
116
117 IntervalReuse *getIntervalForOperation(Operation *op);
118};
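// A minimal sketch of how a VectState is typically constructed by the pass
// driver (the parameter names below are illustrative placeholders, not defined
// in this excerpt); the maps and reuse intervals are then populated by the
// analysis steps before any AIE ops are generated:
//
//   VectState state(func.getContext(), /*shift=*/shiftParam,
//                   /*zeroOffset=*/zeroOffsetParam,
//                   /*dupFactor=*/dupFactorParam,
//                   unalignedLoadsCheck, AIEML);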
119
120// Get the IntervalReuse object for a given read operation
121IntervalReuse *VectState::getIntervalForOperation(Operation *op) {
122 assert(opToIntervalMap.count(op) &&
123 "could not find the IntervalReuse object for op");
124 return opToIntervalMap[op];
125}
126
127// A struct to store the attributes (start, lo/hi offset, step, square) for an
128// AIE fma, mul, or select operation.
129struct AIEOpAttributes {
130 std::string select;
131 SmallVector<std::string, 2> start;
132 SmallVector<std::string, 2> offset, offset_hi;
133 SmallVector<std::string, 2> step;
134 SmallVector<std::string, 2> square;
135};
136
137// A struct that stores some of the attributes for a vector type
138struct AIEVecAttributes {
139 // The number of lanes along the vectorized dimension for the vector type.
140 // For a multidimensional vector, it is the innermost dimension size.
141 unsigned lanes;
142 // For a 1D vector, capture its size in bits. For an nD vector, capture the
143 // size of the innermost dimension in bits.
144 int32_t vecSizeInBits;
145 // Underlying scalar element type
146 Type elementType;
147 // The element size in bits
148 int32_t elementSizeInBits;
149 // Does the vector load data from memory
150 bool loadFromMemory;
151 // Is the vector splat?
152 bool isSplat;
153 // Constructors
154 AIEVecAttributes(unsigned l, unsigned vs, Type et, int32_t es)
155 : lanes(l), vecSizeInBits(vs), elementType(et), elementSizeInBits(es),
156 loadFromMemory(false), isSplat(false) {}
157};
158
159// Structure to capture the lane/col topology, and the element type size of
160// xbuff and zbuff. Captures all the necessary information to map the incoming
161// mul/mac op to the vectorization scheme.
162struct Scheme {
163 // lanes and columns in the vector intrinsic
164 int32_t lanes, cols;
165 // size (in bits) of the underlying scalar element type of xbuff and zbuff
166 int32_t xbits, zbits;
167 // Constructor
168 Scheme(int32_t l, int32_t c, int32_t x, int32_t z)
169 : lanes(l), cols(c), xbits(x), zbits(z) {}
170};
171} // namespace
172
173//===----------------------------------------------------------------------===//
174// Helper Routines
175//===----------------------------------------------------------------------===//
176
177// Combine the results of the vector-related utilities into a single
178// AIEVecAttributes value.
178static AIEVecAttributes getVectorStats(VectorType type) {
179 return AIEVecAttributes(getVectorLaneSize(type), getVectorSizeInBits(type),
180 type.getElementType(), getElementSizeInBits(type));
181}
182
183// Get the vector stats for an operation's result.
184static AIEVecAttributes getResultVecStats(Operation *op, unsigned idx = 0) {
185 auto vtype = cast<VectorType>(op->getResult(idx).getType());
186 return getVectorStats(vtype);
187}
188
189static Operation *getOperandDefOp(VectState *state, Operation *op,
190 unsigned idx) {
191 return state->sextTruncDefMap.count(op->getOperand(idx).getDefiningOp())
192 ? state->sextTruncDefMap[op->getOperand(idx).getDefiningOp()]
193 : op->getOperand(idx).getDefiningOp();
194}
195
196// Get the vector stats for an operation's operand.
197static AIEVecAttributes getOperandVecStats(Operation *op, VectState *state,
198 unsigned idx = 0) {
199 assert(op->getNumOperands() > idx);
200 Operation *defOp = getOperandDefOp(state, op, idx);
201 auto vtype = cast<VectorType>(defOp->getResult(0).getType());
202 auto ret = getVectorStats(vtype);
203 // if the defining op is a transfer read, get the extent read from source
204 if (auto readOp = dyn_cast<TransferReadOp>(defOp)) {
205 IntervalReuse *iv = state->getIntervalForOperation(readOp);
206 ret.vecSizeInBits = iv->getIntervalWidth(readOp);
207 // Set load from memory to true
208 ret.loadFromMemory = true;
209 // Check if the load is splat
210 ret.isSplat = readOp.getPermutationMap().isConstant();
211 }
212 return ret;
213}
214
215// Get the number of rows and columns in the vector scheme.
216static std::pair<int32_t, int32_t> getNumRowsAndCols(Operation *op,
217 VectState *state) {
218 assert(op->getNumOperands() >= 2 && op->getNumResults() == 1);
219
220 Operation *left = getOperandDefOp(state, op, 0);
221 Operation *right = getOperandDefOp(state, op, 1);
222
223 // Get the number of lanes
224 auto vtype = cast<VectorType>(op->getResult(0).getType());
225 int32_t lanes = getVectorLaneSize(vtype);
226
227 // Get the data sizes for left and right operands
228 auto ltype = cast<VectorType>(left->getResult(0).getType());
229 auto rtype = cast<VectorType>(right->getResult(0).getType());
230 int32_t lsize = getElementSizeInBits(ltype);
231 int32_t rsize = getElementSizeInBits(rtype);
232
233 int32_t width = (lsize == 8 && rsize == 8) ? (state->aieml ? 256 : 128)
234 : (lsize == 16 && rsize == 8) ? 64
235 : 32;
236
237 if (state->aieml && getVectorSizeInBits(rtype) == 512) {
238 width *= 2;
239 }
240
241 // Now compute the number of columns
242 int32_t m = 1;
243 if (lsize == 32)
244 m *= 2;
245 if (rsize == 32)
246 m *= 2;
247 int32_t cols = width / (m * lanes);
248 return std::make_pair(lanes, cols);
249}
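// Worked examples of the computation above (derived directly from the widths
// used in this routine): for a 16-lane i16xi16 op on AIE1, width = 32 and
// m = 1, so (lanes, cols) = (16, 2); for a 16-lane i8xi8 op on AIE1,
// width = 128, so (lanes, cols) = (16, 8).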
250
251// Fuse the access extent of two mul/fma operations. This means that for the
252// corresponding lhs(rhs) operands of op1 and op2, check if they read from
253// memory, and if they do, extend their access extent to their union. For
254 // example, if the left operand of Op1 has access extent [0,256] and the left
255 // operand of Op2 has access extent [128,512], where these two accesses belong
256 // to the same IntervalReuse object, then the union is [0,512]. This union will be the
257// new access extent of the left operands of both Op1 and Op2.
258static void fuseAccessExtent(Operation *Op1, Operation *Op2, VectState *state) {
259 // Assert that the input operations are of expected type
260 assert([&] {
261 bool expectedTypes =
262 (isa<vector::FMAOp>(Op2) && isa<MulIOp, MulFOp, vector::FMAOp>(Op1));
263 if (!expectedTypes) {
264 printf("incorrect operation types\n");
265 return false;
266 }
267 return true;
268 }());
269
270 // Iterate over the even and odd operands for both the operations
271 for (int idx = 0; idx < 2; ++idx) {
272 Operation *op1 = getOperandDefOp(state, Op1, idx);
273 Operation *op2 = getOperandDefOp(state, Op2, idx);
274
275 // If both op1 and op2 are transfer read ops, then we need to create an
276 // interval that subsumes the extent read by both op1 and op2.
277 if (isa<TransferReadOp>(op1) && isa<TransferReadOp>(op2)) {
278 IntervalReuse *iv1 = state->getIntervalForOperation(op1);
279 IntervalReuse *iv2 = state->getIntervalForOperation(op2);
280 // Assert that both the ops belong to the same IntervalReuse object
281 assert(iv1 == iv2);
282 assert(iv1->getInterval(op1) == iv2->getInterval(op2));
283 auto op1Extent = iv1->getAccessExtent(op1);
284 auto op2Extent = iv2->getAccessExtent(op2);
285 // Create the new extent that is the union of op1Extent and op2Extent
286 auto newExtent =
287 std::make_pair(std::min(op1Extent.first, op2Extent.first),
288 std::max(op1Extent.second, op2Extent.second));
289 // And now update the read extents with the union
290 iv1->setAccessExtent(op1, newExtent);
291 iv2->setAccessExtent(op2, newExtent);
292 }
293 }
294}
295
296// To be a simple lane-wise multiplication, we check that
297// (1) both lhs and rhs operands come from vectors of the same size,
298// (2) no operand is splat, and
299// (3) no type is float if Op is mul/fma.
300static bool isSimpleVectIntrinsic(Operation *Op, VectState *state) {
301 // The incoming operator should be mul/fma/sub/add op
302 bool isMulOrFMAOp = isa<MulIOp, MulFOp, vector::FMAOp>(Op);
303 bool isSubOrAddOp = isa<SubIOp, SubFOp, AddIOp, AddFOp>(Op);
304 if (!isMulOrFMAOp && !isSubOrAddOp)
305 return true;
306
307 // Get the vec stats for result, left, and right operand
308 AIEVecAttributes vstat = getResultVecStats(Op);
309 AIEVecAttributes lstat = getOperandVecStats(Op, state, 0);
310 AIEVecAttributes rstat = getOperandVecStats(Op, state, 1);
311
312 bool sizeMatches = lstat.vecSizeInBits == rstat.vecSizeInBits &&
313 vstat.vecSizeInBits == rstat.vecSizeInBits &&
314 lstat.elementType == rstat.elementType &&
315 vstat.elementType == rstat.elementType;
316 bool noSplat = !lstat.isSplat && !rstat.isSplat;
317 bool noFloat = !isa<FloatType>(vstat.elementType) &&
318 !isa<FloatType>(lstat.elementType) &&
319 !isa<FloatType>(rstat.elementType);
320
321 return sizeMatches && noSplat && (isSubOrAddOp || noFloat);
322}
323
324// Return true if this is a vector dialect op meeting the following conditions:
325// (1) all the operands and results are vectorized;
326// (2) all the vector sizes are the same; and
327// (3) all the vectors have the same underlying scalar element type.
328static bool isWellFormedVectorOp(Operation *Op) {
329 // The op must have at least an operand or result
330 if (Op->getNumOperands() == 0 && Op->getNumResults() == 0)
331 return false;
332
333 SmallVector<Value, 8> operandsAndResults;
334 operandsAndResults.append(Op->operand_begin(), Op->operand_end());
335 operandsAndResults.append(Op->result_begin(), Op->result_end());
336
337 // Check 1. all the operands and results must be vector types
338 for (auto val : operandsAndResults) {
339 if (!isa<VectorType>(val.getType()))
340 return false;
341 }
342
343 auto refType = cast<VectorType>(operandsAndResults.back().getType());
344 Type scalarType = refType.getElementType();
345 unsigned refSize = getVectorLaneSize(refType);
346 for (auto val : operandsAndResults) {
347 auto vtype = cast<VectorType>(val.getType());
348 // Check 2. All the vector sizes must be the same
349 if (refSize != getVectorLaneSize(vtype))
350 return false;
351 // Check 3. The underlying scalar type of all the vectors must be the same
352 if (scalarType != vtype.getElementType())
353 return false;
354 }
355
356 return true;
357}
358
359// Given an AIE dialect op, determine whether it writes to an accumulator,
360// based on the operation type and operand types.
361static bool writesToAccumulator(Operation *op) {
362 // Integer muls and FMAs write to accumulator
363 if (!isAIEOp(op))
364 return false;
365 if (auto mulOp = dyn_cast<aievec::aie1::MulOp>(op))
366 return isa<IntegerType>(
367 cast<VectorType>(mulOp.getResult().getType()).getElementType());
368 if (auto fmaOp = dyn_cast<aievec::aie1::FMAOp>(op))
369 return isa<IntegerType>(
370 cast<VectorType>(fmaOp.getResult().getType()).getElementType());
371
372 return isa<aievec::FMAElemOp, aievec::MulElemOp, aievec::FMAConvOp,
373 aievec::MulConvOp, aievec::UPSOp>(op);
374}
375
376//===----------------------------------------------------------------------===//
377// Manipulate affine expressions
378//===----------------------------------------------------------------------===//
379
380// Make a flattened affine expression from the given exprs array. Functionally
381// identical to makeCanonicalStridedLayoutExpr except that the returned
382// AffineExpr is not simplified.
383static AffineExpr makeFlattenedStridedExpr(ArrayRef<int64_t> sizes,
384 ArrayRef<AffineExpr> exprs,
385 MLIRContext *context) {
386 assert(!sizes.empty() && !exprs.empty() &&
387 "expected non-empty sizes and exprs");
388
389 // Size 0 corner case is useful for canonicalizations.
390 if (llvm::is_contained(sizes, 0))
391 return getAffineConstantExpr(0, context);
392
393 auto maps = AffineMap::inferFromExprList(exprs, context);
394 assert(!maps.empty() && "Expected one non-empty map");
395 unsigned nSymbols = maps[0].getNumSymbols();
396
397 AffineExpr expr;
398 bool dynamicPoisonBit = false;
399 int64_t runningSize = 1;
400 for (auto en : llvm::zip(llvm::reverse(exprs), llvm::reverse(sizes))) {
401 int64_t size = std::get<1>(en);
402 // Degenerate case, no size => no stride
403 if (size == 0)
404 continue;
405 AffineExpr dimExpr = std::get<0>(en);
406 AffineExpr stride = dynamicPoisonBit
407 ? getAffineSymbolExpr(nSymbols++, context)
408 : getAffineConstantExpr(runningSize, context);
409 expr = expr ? expr + dimExpr * stride : dimExpr * stride;
410 if (size > 0) {
411 runningSize *= size;
412 assert(runningSize > 0 && "integer overflow in size computation");
413 } else {
414 dynamicPoisonBit = true;
415 }
416 }
417 return expr;
418}
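// For instance (a small worked example, not code from this file): with
// sizes = {64, 32} and exprs = {d0, d1}, the loop above produces the
// unsimplified expression d1 * 1 + d0 * 32, i.e. the row-major linearization
// d0 * 32 + d1.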
419
420// Construct a linearized affine expression for the transfer_read op.
421static AffineExpr constructLinearizedAffineExpr(TransferReadOp readOp,
422 VectState *state) {
423 // The global state stores a map from readOp to its linearized expression. If
424 // the linear expression is already computed for this readOp, return it.
425 if (state->linearizedAccess.count(readOp))
426 return state->linearizedAccess[readOp];
427
428 SmallVector<Value, 4> indices(readOp.getIndices().begin(),
429 readOp.getIndices().end());
430 auto memRefType = cast<MemRefType>(readOp.getSource().getType());
431 MLIRContext *context = memRefType.getContext();
432
433 SmallVector<AffineExpr, 8> exprVec;
434 // Iterate over all the indices. If the index has an affine apply op
435 // associated with it, we extract that map. Otherwise we use the index
436 // directly (as a constant or a unique dim expression).
437 for (auto idxAndValue : llvm::enumerate(indices)) {
438 auto value = idxAndValue.value();
439 // If the access is a map via affine apply op (e.g., A[i+2], where the map
440 // is d0 -> d0+2), push in the map after replacing all the dims with unique
441 // index identifiers (e.g., let the unique identifier for index i be k0).
442 if (auto apOf = value.getDefiningOp<affine::AffineApplyOp>()) {
443 AffineMap map = apOf.getAffineMap();
444 assert(map.getNumResults() == 1 &&
445 "Failed to create linearized affineExpr for complicated index");
446 SmallVector<AffineExpr, 4> indexExprs;
447 // Each operand of the map corresponds to a loop index. For each operand
448 // (i.e., loop index), we create a unique dim expr.
449 for (auto index : apOf.getMapOperands()) {
450 if (auto cIdx = index.getDefiningOp<arith::ConstantOp>()) {
451 auto idxVal = cast<IntegerAttr>(cIdx.getValue()).getValue();
452 unsigned idx = idxVal.getSExtValue();
453 indexExprs.push_back(getAffineConstantExpr(idx, context));
454 } else {
455 if (!state->indexToExprDimMap.count(index))
456 state->indexToExprDimMap[index] =
457 getAffineDimExpr(state->indexToExprDimMap.size(), context);
458 indexExprs.push_back(state->indexToExprDimMap[index]);
459 }
460 }
461 // Now create a correct map expression using the unique dim exprs
462 exprVec.push_back(map.getResult(0).replaceDims(indexExprs));
463 }
464 // If the index is an arith constant (e.g., A[3]), create an affine expr
465 // from the constant value.
466 else if (auto cOp = value.getDefiningOp<arith::ConstantOp>()) {
467 auto idxVal = cast<IntegerAttr>(cOp.getValue()).getValue();
468 unsigned idx = idxVal.getSExtValue();
469 exprVec.push_back(getAffineConstantExpr(idx, context));
470 }
471 // Default: the readOp index is simply the loop index (e.g., A[i]).
472 else {
473 if (!state->indexToExprDimMap.count(value))
474 state->indexToExprDimMap[value] =
475 getAffineDimExpr(state->indexToExprDimMap.size(), context);
476 exprVec.push_back(state->indexToExprDimMap[value]);
477 }
478 }
479
480 assert(!exprVec.empty() && "Could not construct linearized affineExpr");
481
482 // Linearize the exprVec as a strided access, but do not simplify
483 auto ret = makeFlattenedStridedExpr(memRefType.getShape(), exprVec,
484 memRefType.getContext());
485 // Cache this readOp and linearized expr into the global map
486 state->linearizedAccess[readOp] = ret;
487 return ret;
488}
489
490// From a linearized affine expression, compute the base and the constant
491 // offset. If the access is A[i][j+2] for an N*N array A, the linearized
492 // expression will be (i*N+j+2). The base in this case will be (i*N+j), and the
493// offset will be 2.
494static std::pair<AffineExpr, int32_t> getBaseAndOffset(AffineExpr expr) {
495 AffineExpr base = expr;
496 int32_t offset = 0;
497 // If expr is already a constant, the base is nullptr, and offset is expr
498 if (auto constExpr = llvm::dyn_cast<AffineConstantExpr>(expr)) {
499 base = nullptr;
500 offset += constExpr.getValue();
501 }
502 // If this is a binary '+' expression, compute the constant offset. Currently
503 // this only handles a top-level add with a constant operand; it must evolve
504 // as we explore more complex access patterns.
505 else if (auto binopExpr = llvm::dyn_cast<AffineBinaryOpExpr>(expr)) {
506 if (binopExpr.getKind() == AffineExprKind::Add) {
507 AffineExpr lhs = binopExpr.getLHS(), rhs = binopExpr.getRHS();
508 if (auto constExpr = llvm::dyn_cast<AffineConstantExpr>(lhs)) {
509 base = rhs;
510 offset += constExpr.getValue();
511 }
512 if (auto constExpr = llvm::dyn_cast<AffineConstantExpr>(rhs)) {
513 base = base == rhs ? nullptr : lhs;
514 offset += constExpr.getValue();
515 }
516 }
517 }
518 return std::make_pair(base, offset);
519}
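// A few illustrative decompositions implied by the logic above: an expression
// of the form (d0 * N + d1) + 2 yields base (d0 * N + d1) and offset 2; a pure
// constant such as 5 yields a null base and offset 5; an expression with no
// top-level constant addend, e.g. d0 * N + d1, is returned unchanged with
// offset 0.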
520
521//===----------------------------------------------------------------------===//
522// AIE vector op generation routines
523//===----------------------------------------------------------------------===//
524// Generate and return a Cast op.
525static aievec::CastOp generateCastOp(Value source, VectorType resType,
526 bool isResAcc, VectState *state,
527 Location loc) {
528 // Create the Cast op
529 auto castOp =
530 state->builder.create<aievec::CastOp>(loc, resType, source, isResAcc);
531
532 assert(castOp && "could not create cast op");
533 return castOp;
534}
535
536// Generate and return an SRS op. Incoming `source` is an accumulator. The
537// output should be a vector of element type `scalarType`.
538static aievec::SRSOp generateSRSOp(Value source, Type scalarType,
539 VectState *state, Location loc) {
540 // The source should write to accumulator
541 Type accType = source.getType();
542 assert(writesToAccumulator(source.getDefiningOp()) &&
543 "srs source should write to accumulator");
544
545 // Get the number of lanes
546 unsigned lanes = getVectorLaneSize(cast<VectorType>(accType));
547 // Now generate the new vector type for the SRS intrinsic
548 VectorType srsType = createVectorType(lanes, scalarType);
549
550 auto shiftParamOp = state->builder.create<arith::ConstantOp>(
551 loc, state->builder.getI32IntegerAttr(state->shift));
552 // Create the SRS op
553 auto srsOp = state->builder.create<aievec::SRSOp>(loc, srsType, source,
554 shiftParamOp.getResult());
555
556 assert(srsOp && "could not create srs op");
557 return srsOp;
558}
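// Illustrative use (a sketch; `accResult`, `ctx`, and `loc` are placeholders):
// moving a 16-lane accumulator value back into a v16int16 vector with the
// shift configured in the state:
//
//   Type i16Ty = IntegerType::get(ctx, 16);
//   aievec::SRSOp srs = generateSRSOp(accResult, i16Ty, state, loc);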
559
560// Generate and return a UPS op. Incoming `source` is a vector which needs
561// to be moved to an accumulator.
562static aievec::UPSOp generateUPSOp(Value source, VectState *state,
563 Location loc) {
564 Type sourceType = source.getType();
565 Type accType =
566 getVectorOpDestType(cast<VectorType>(sourceType), state->aieml);
567 assert(!writesToAccumulator(source.getDefiningOp()) &&
568 "ups source should not be accumulator");
569
570 // Create a new UPS instruction
571 auto upsOp =
572 state->builder.create<aievec::UPSOp>(loc, accType, source, state->shift);
573
574 assert(upsOp && "could not create ups op");
575 return upsOp;
576}
577
578// Generate and return a Broadcast op.
579static aievec::BroadcastOp generateBroadcastOp(Value source, int8_t idx,
580 VectState *state, Location loc) {
581 auto type = cast<VectorType>(source.getType());
582 // Create a new Broadcast instruction
583 auto broadcastOp =
584 state->builder.create<aievec::BroadcastOp>(loc, type, source, idx);
585
586 assert(broadcastOp && "could not create broadcast op");
587 return broadcastOp;
588}
589
590// Generate and return a Concat op.
591static aievec::ConcatOp generateConcatOp(SmallVector<Value> &sources,
592 VectState *state, Location loc,
593 VectorType concatType = nullptr) {
594 assert(sources.size() > 1 && "must concat at least two vectors");
595
596 auto vecType = cast<VectorType>(sources.back().getType());
597
598 assert([&] {
599 for (auto source : sources) {
600 auto type = cast<VectorType>(source.getType());
601 if (type != vecType) {
602 printf("sources of concat op not of same type\n");
603 return false;
604 }
605 }
606 return true;
607 }());
608
609 if (!concatType) {
610 // Get the number of lanes and scalar type to create the concat result type
611 unsigned lanes = sources.size() * getVectorLaneSize(vecType);
612 Type scalarType = vecType.getElementType();
613 concatType = createVectorType(lanes, scalarType);
614 }
615
616 // Create the concat op
617 auto concatOp =
618 state->builder.create<aievec::ConcatOp>(loc, concatType, sources);
619
620 assert(concatOp && "could not create concat op");
621 return concatOp;
622}
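// Illustrative use (a sketch with placeholder values): concatenating two
// v16int16 vectors without an explicit concatType yields a v32int16 result:
//
//   SmallVector<Value> srcs = {vecA, vecB};
//   aievec::ConcatOp cat = generateConcatOp(srcs, state, loc);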
623
624// Generate and return a select operation. The start, offset, etc. for lanes
625// are in opAttr.
626static aievec::aie1::SelectOp
627generateSelectOp(Value xbuff, AIEOpAttributes &opAttr, unsigned lanes,
628 VectState *state, Location loc, Value ybuff = nullptr) {
629 // Assert that we have computed the attributes (start, offset, etc.) for both
630 // lanes, and that select is non-empty.
631 assert(!opAttr.select.empty());
632 assert(opAttr.start.size() == opAttr.offset.size() &&
633 opAttr.start.size() == 2);
634
635 auto xtype = cast<VectorType>(xbuff.getType());
636 // Verify that lanes is <= xtype lanes
637 assert(lanes <= getVectorLaneSize(xtype));
638 // Create the result type
639 VectorType resultType = createVectorType(lanes, xtype.getElementType());
640
641 // Create AIE dialect select op
642 auto selectOp = state->builder.create<aievec::aie1::SelectOp>(
643 loc, resultType, xbuff, opAttr.select, opAttr.start[0], opAttr.offset[0],
644 opAttr.offset_hi[0], opAttr.square[0], opAttr.start[1], opAttr.offset[1],
645 opAttr.offset_hi[1], opAttr.square[1], ybuff);
646
647 assert(selectOp && "could not create select op");
648 return selectOp;
649}
650
651// Generate and return an Ext op. The lanes indicate the lanes in the vector
652// output, and idx defines which part of the source is extracted.
653static aievec::aie1::ExtOp generateExtOp(Value source, unsigned lanes,
654 int8_t idx, VectState *state,
655 Location loc) {
656 auto stype = cast<VectorType>(source.getType());
657 // Verify that lanes*idx is <= stype lanes
658 assert(lanes * (idx + 1) <= getVectorLaneSize(stype));
659 // Create the result type
660 VectorType resultType = createVectorType(lanes, stype.getElementType());
661
662 // Create AIE dialect ext op
663 auto extOp =
664 state->builder.create<aievec::aie1::ExtOp>(loc, resultType, source, idx);
665
666 assert(extOp && "could not create ext op");
667 return extOp;
668}
669
670// Generate and return a Pack op.
671static aievec::PackOp generatePackOp(Value source, VectState *state,
672 Location loc) {
673 // Create the result type
674 auto stype = cast<VectorType>(source.getType());
675 unsigned lanes = getVectorLaneSize(stype);
676 Type i8Type = IntegerType::get(source.getContext(), 8);
677 VectorType resultType = createVectorType(lanes, i8Type);
678
679 // Create AIE dialect pack op
680 auto packOp = state->builder.create<aievec::PackOp>(loc, resultType, source);
681
682 assert(packOp && "could not create pack op");
683 return packOp;
684}
685
686// Generate and return an Add op.
687static aievec::aie1::AddOp generateAddOp(Operation *Op, AIEOpAttributes &opAttr,
688 VectState *state) {
689 // Assert that we computed the attributes for both the operands
690 assert(opAttr.start.size() == opAttr.offset.size() &&
691 opAttr.start.size() == 2);
692
693 auto addOp = state->builder.create<aievec::aie1::AddOp>(
694 Op->getLoc(), Op->getResult(0).getType(), Op->getOperand(0),
695 Op->getOperand(1), opAttr.start[0], opAttr.offset[0], opAttr.offset_hi[0],
696 opAttr.square[0], opAttr.start[1], opAttr.offset[1], opAttr.offset_hi[1],
697 opAttr.square[1]);
698 return addOp;
699}
700
701// Generate and return a Sub op.
702static aievec::aie1::SubOp generateSubOp(Operation *Op, AIEOpAttributes &opAttr,
703 VectState *state) {
704 // Assert that we computed the attributes for both the operands
705 assert(opAttr.start.size() == opAttr.offset.size() &&
706 opAttr.start.size() == 2);
707
708 auto subOp = state->builder.create<aievec::aie1::SubOp>(
709 Op->getLoc(), Op->getResult(0).getType(), Op->getOperand(0),
710 Op->getOperand(1), opAttr.start[0], opAttr.offset[0], opAttr.offset_hi[0],
711 opAttr.square[0], opAttr.start[1], opAttr.offset[1], opAttr.offset_hi[1],
712 opAttr.square[1]);
713 return subOp;
714}
715
716static aievec::ShiftOp generateShiftOp(Value lhs, Value rhs, int32_t shiftBytes,
717 VectState *state, Location loc,
718 VectorType resType = nullptr) {
719 auto vecType = cast<VectorType>(rhs.getType());
720
721 assert([&] {
722 auto type = cast<VectorType>(lhs.getType());
723 if (type != vecType) {
724 printf("lhs and rhs do not have same type\n");
725 return false;
726 }
727 return true;
728 }());
729
730 if (!resType) {
731 unsigned lanes = getVectorLaneSize(vecType);
732 Type scalarType = vecType.getElementType();
733 resType = createVectorType(lanes, scalarType);
734 }
735
736 auto constOp = state->builder.create<arith::ConstantOp>(
737 loc, state->builder.getI32IntegerAttr(shiftBytes));
738 auto shiftOp = state->builder.create<aievec::ShiftOp>(loc, resType, lhs, rhs,
739 constOp.getResult());
740
741 return shiftOp;
742}
743
744static aievec::LegacyShuffleOp generateShuffleOp(Value source, VectState *state,
745 Location loc, unsigned mode,
746 VectorType resType = nullptr) {
747 auto vecType = cast<VectorType>(source.getType());
748
749 if (!resType) {
750 unsigned lanes = 512 / getElementSizeInBits(vecType);
751 Type scalarType = vecType.getElementType();
752 resType = createVectorType(lanes, scalarType);
753 }
754
755 auto shuffleOp = state->builder.create<aievec::LegacyShuffleOp>(loc, resType,
756 source, mode);
757
758 return shuffleOp;
759}
760
761// For AIE-ML, the i8xi8 scheme generates one MulConvOp or FMAConvOp for each
762// vector dialect mul/fma op, instead of the two AIE dialect mul/fma ops
763// generated for each vector dialect mul/fma on AIE1.
764static Operation *generateMulOrFMAConvOpForInt8(Operation *Op,
765 AIEOpAttributes &opAttr,
766 VectState *state) {
767 // Assert that we have computed the attributes (start, offset, etc.) for both
768 // left and right operands of the fma operation.
769 assert(opAttr.start.size() == opAttr.offset.size() &&
770 opAttr.start.size() == 2 && state->dupFactor == 2);
771
772 Value lhs = state->sextTruncDefMap.count(Op->getOperand(1).getDefiningOp())
773 ? Op->getOperand(1).getDefiningOp()->getOperand(0)
774 : Op->getOperand(1);
775 Value rhs = state->sextTruncDefMap.count(Op->getOperand(0).getDefiningOp())
776 ? Op->getOperand(0).getDefiningOp()->getOperand(0)
777 : Op->getOperand(0);
778 auto vType = cast<VectorType>(lhs.getType());
779 Type stype = vType.getElementType();
780 auto itype = cast<IntegerType>(stype);
781 unsigned width = itype.getWidth() <= 8 ? 32 : 64;
782 int32_t M = 32;
783 int32_t N = 8;
784
785 Type ctype = IntegerType::get(itype.getContext(), width);
786 Type opType = VectorType::get(vType.getShape(), ctype);
787 auto defOp = rhs.getDefiningOp();
788 state->builder.setInsertionPointAfter(defOp);
789 Location loc = defOp->getLoc();
790
791 // Since we do not need duplicated data as on AIE1, if a dup factor exists we
792 // extract the unique data with a shuffle op. We use mode 0 to extract the
793 // elements with even indices for i8 type data.
794 Operation *shuffleOp = generateShuffleOp(defOp->getResult(0), state, loc, 0);
795
796 int32_t shiftBytes = stoi(opAttr.start[0]) * getElementSizeInBits(vType) / 8 /
797 state->dupFactor;
798
799 // Generate a shift_bytes operation for rhs if xstart is not 0
800 if (shiftBytes) {
801 state->builder.setInsertionPointAfter(shuffleOp);
802 loc = shuffleOp->getLoc();
803 rhs = generateShiftOp(shuffleOp->getResult(0), shuffleOp->getResult(0),
804 shiftBytes, state, loc);
805 } else {
806 rhs = shuffleOp->getResult(0);
807 }
808
809 state->builder.setInsertionPoint(Op);
810 loc = Op->getLoc();
811
812 Operation *convOp = nullptr;
813
814 if (isa<MulIOp>(Op)) {
815 convOp =
816 state->builder.create<aievec::MulConvOp>(loc, opType, lhs, rhs, M, N);
817 }
818
819 if (isa<vector::FMAOp>(Op)) {
820 Value acc = Op->getOperand(2);
821 bool isSub = state->mscOps.count(Op);
822 convOp = state->builder.create<aievec::FMAConvOp>(loc, opType, lhs, rhs,
823 acc, M, N, isSub);
824 }
825
826 return convOp;
827}
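// Schematically, the rewrite above chains the helpers defined earlier in this
// file: a shuffle (mode 0) on the defining op of rhs to drop the dupFactor=2
// duplicates, an optional shift by `shiftBytes` when xstart is non-zero, and
// finally a MulConvOp or FMAConvOp with M=32 and N=8.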
828
829// Generate and return an FMA operation in AIE dialect. This operation will
830// have the start and offset fields for each operand. If the acc operand of
831// fmaOp is a transfer_read operation, then we need to add a UPS instruction
832// that will move the vector value into an accumulator.
833static Operation *generateFMAOp(vector::FMAOp fmaOp, AIEOpAttributes &opAttr,
834 VectState *state, bool i8xi8_pairedOp = false) {
835 // Assert that we have computed the attributes (start, offset, etc.) for both
836 // left and right operands of the fma operation.
837 assert(opAttr.start.size() == opAttr.offset.size() &&
838 opAttr.start.size() == 2);
839
840 Value lhs = state->sextTruncDefMap.count(fmaOp.getLhs().getDefiningOp())
841 ? fmaOp.getLhs().getDefiningOp()->getOperand(0)
842 : fmaOp.getLhs();
843 Value rhs = state->sextTruncDefMap.count(fmaOp.getRhs().getDefiningOp())
844 ? fmaOp.getRhs().getDefiningOp()->getOperand(0)
845 : fmaOp.getRhs();
846 Value acc = state->sextTruncDefMap.count(fmaOp.getAcc().getDefiningOp())
847 ? fmaOp.getAcc().getDefiningOp()->getOperand(0)
848 : fmaOp.getAcc();
849
850 // Check if this is an fmsub op, and if so, then we need to generate msc op
851 bool isSub = state->mscOps.count(fmaOp);
852
853 // We need to generate a UPS op for the integer and AIEML path if the
854 // accumulator is coming from a vector register.
855 bool isInt = isa<IntegerType>(
856 cast<VectorType>(fmaOp.getLhs().getType()).getElementType());
857
858 Operation *xfmaOp;
859 if (state->aieml &&
860 getVectorSizeInBits(cast<VectorType>(rhs.getType())) == 512) {
861 if (!writesToAccumulator(acc.getDefiningOp())) {
862 acc = generateUPSOp(acc, state, fmaOp->getLoc());
863 LLVM_DEBUG(llvm::dbgs()
864 << "\n\nCreated UPS op " << acc << " to move the output of "
865 << fmaOp << " into accumulator");
866 }
867
868 if (!isSimpleVectIntrinsic(fmaOp, state)) {
869 // If targeting AIE-ML intrinsics, use the broadcast operator for rhs.
870 // Check the legality of generating a broadcast op by checking whether
871 // the zbuffer is a splat.
872 AIEVecAttributes rstat = getOperandVecStats(fmaOp, state, 1);
873 if (rstat.isSplat) {
874 rhs = generateBroadcastOp(rhs, stoi(opAttr.start[1]), state,
875 fmaOp->getLoc());
876 }
877 }
878 // Create AIE-ML dialect fma_elem/msc_elem op
879 xfmaOp = state->builder.create<aievec::FMAElemOp>(fmaOp->getLoc(), lhs, rhs,
880 acc, isSub);
881 } else {
882 // If i8xi8_pairedOp is true, then we are trying to generate the paired FMA
883 // op for i8xi8 scheme. Find the paired accumulator.
884 if (i8xi8_pairedOp) {
885 Operation *defOp = acc.getDefiningOp();
886 if (state->pairedOp.count(defOp))
887 acc = state->pairedOp[defOp]->getResult(0);
888 }
889
890 if (isInt && !writesToAccumulator(acc.getDefiningOp())) {
891 acc = generateUPSOp(acc, state, fmaOp->getLoc());
892 LLVM_DEBUG(llvm::dbgs()
893 << "\n\nCreated UPS op " << acc << " to move the output of "
894 << fmaOp << " into accumulator");
895 }
896
897 // If the lhs operand vector is not at least twice the size of the rhs
898 // operand vector, use the concat operator to double it.
899 if (!isSimpleVectIntrinsic(fmaOp, state)) {
900 AIEVecAttributes lstat = getOperandVecStats(fmaOp, state, 0);
901 assert(lstat.vecSizeInBits % 256 == 0);
902
903 if (lstat.vecSizeInBits == 256) {
904 VectorType concatType =
905 createVectorType(512 / lstat.elementSizeInBits, lstat.elementType);
906 SmallVector<Value> sources = {lhs, lhs};
907 lhs = generateConcatOp(sources, state, fmaOp->getLoc(), concatType);
908 }
909 }
910 // Create AIE dialect fma/msc op
911 xfmaOp = state->builder.create<aievec::aie1::FMAOp>(
912 fmaOp->getLoc(), lhs, rhs, acc, opAttr.start[0], opAttr.offset[0],
913 opAttr.offset_hi[0], opAttr.step[0], opAttr.square[0], opAttr.start[1],
914 opAttr.offset[1], opAttr.offset_hi[1], opAttr.step[1], opAttr.square[1],
915 isSub);
916 }
917
918 assert(xfmaOp && "could not create fma op");
919 return xfmaOp;
920}
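// In summary, a vector FMA is lowered along one of two paths here: for AIE-ML
// with a 512-bit rhs, an optional aievec::UPSOp on the accumulator and an
// optional aievec::BroadcastOp on a splat rhs feed an aievec::FMAElemOp (an
// msc when the op is in mscOps); otherwise, the AIE1 path optionally looks up
// the paired accumulator (i8xi8), inserts a UPS and/or a concat of a 256-bit
// lhs, and emits an aievec::aie1::FMAOp carrying the start, offset, step, and
// square attributes from opAttr.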
921
922// Generate a MUL operation in AIE dialect. This operation will have the start
923// and offset fields for each operand.
924template <typename T>
925static Operation *generateMulOp(T mulOp, AIEOpAttributes &opAttr,
926 VectState *state) {
927 // Assert that we have computed the attributes (start, offset, etc.) for both
928 // left and right operands of the mul operation.
929 assert(opAttr.start.size() == opAttr.offset.size() &&
930 opAttr.start.size() == 2);
931
932 Type opType =
933 getVectorOpDestType(cast<VectorType>(mulOp.getType()), state->aieml);
934
935 // If the lhs operand vector is not at least twice the size of the rhs operand
936 // vector, use the concat operator to double it.
937 Value lhs = state->sextTruncDefMap.count(mulOp.getLhs().getDefiningOp())
938 ? mulOp.getLhs().getDefiningOp()->getOperand(0)
939 : mulOp.getLhs();
940 Value rhs = state->sextTruncDefMap.count(mulOp.getRhs().getDefiningOp())
941 ? mulOp.getRhs().getDefiningOp()->getOperand(0)
942 : mulOp.getRhs();
943 if (!isSimpleVectIntrinsic(mulOp, state)) {
944 AIEVecAttributes lstat = getOperandVecStats(mulOp, state, 0);
945 assert(lstat.vecSizeInBits % 256 == 0);
946 if (lstat.vecSizeInBits == 256) {
947 VectorType concatType =
948 createVectorType(512 / lstat.elementSizeInBits, lstat.elementType);
949 SmallVector<Value> sources = {lhs, lhs};
950 lhs = generateConcatOp(sources, state, mulOp->getLoc(), concatType);
951 }
952 }
953
954 // Create AIE dialect mul op
955 Operation *xmulOp = state->builder.create<aievec::aie1::MulOp>(
956 mulOp->getLoc(), lhs, rhs, opType, opAttr.start[0], opAttr.offset[0],
957 opAttr.offset_hi[0], opAttr.step[0], opAttr.square[0], opAttr.start[1],
958 opAttr.offset[1], opAttr.offset_hi[1], opAttr.step[1], opAttr.square[1]);
959
960 assert(xmulOp && "could not create mul op");
961 return xmulOp;
962}
963
964// For a transfer_read op, generate a corresponding UPD op. Multiple
965// transfer_read ops will have the same UPD op if their read access extent is
966// subsumed by the same interval. The updOps will have to be inserted at the
967// head of the region if the region has multiple blocks, or closer to the readOp
968// otherwise.
969static aievec::UPDOp
970generateUPDOp(TransferReadOp readOp,
971 mlir::DenseMap<std::tuple<IntervalReuse *, int32_t, int32_t>,
972 std::pair<aievec::UPDOp, int8_t>> &memToUpdMap,
973 Region &region, VectState *state) {
974 // Get the read access extent and interval of this read operation
975 IntervalReuse *iv = state->getIntervalForOperation(readOp);
976 auto extent = iv->getAccessExtent(readOp);
977 auto interval = iv->getInterval(readOp);
978
979 int32_t intervalWidth = interval.second - interval.first;
980 assert(intervalWidth >= 128 && "Interval computation incorrect");
981
982 // Create the upd vector type. To do so, we need the underlying element type.
983 // We can divide the interval size by that to get the number of lanes in the
984 // result vector of upd op.
985 auto vecType = cast<VectorType>(readOp.getVector().getType());
986 Type elementType = vecType.getElementType();
987 int32_t elementSizeInBits = getElementSizeInBits(vecType);
988 int intervalWidthInBytes = intervalWidth / elementSizeInBits;
989 Type updVecType = createVectorType(intervalWidthInBytes, elementType);
990
991 // Compute the mid value of the interval. This is useful because for
992 // intervalWidth > 256 (or > 512 for AIE-ML), we can split the load into two
993 // steps: the bits to the left/right of mid will be loaded using the upd
994 // idx=0/idx=1 operator.
995 int32_t mid = interval.first + intervalWidth / 2;
996 // Compute the (aligned) extent of interval that this read requires to be
997 // loaded.
998 int32_t lb =
999 intervalWidth <= (state->aieml && elementSizeInBits == 8 ? 512 : 256) ||
1000 extent.first < mid
1001 ? interval.first
1002 : mid;
1003 int32_t ub =
1004 intervalWidth <= (state->aieml && elementSizeInBits == 8 ? 512 : 256) ||
1005 extent.second > mid
1006 ? interval.second
1007 : mid;
1008
1009 // Find if we have already created upd op idx=0/idx=1 for this interval
1010 aievec::UPDOp updOp = nullptr;
1011 // The initial value 0 of updIndices means that neither upd op idx=0 nor
1012 // idx=1 has been created.
1013 int8_t updIndices = 0;
1014 auto key = std::make_tuple(iv, interval.first, interval.second);
1015 if (memToUpdMap.count(key)) {
1016 updOp = memToUpdMap[key].first;
1017 updIndices = memToUpdMap[key].second;
1018 }
1019
1020 // This readOp could be A[i][j+2], where A is a 32-bit array. Assume that its
1021 // read access extent is subsumed by interval [0,512]. This 512-bit interval
1022 // load should be broken into two 256-bit UPD ops. We need to find the right
1023 // offset from A[i][j+2] where the reads will start. The first offset (in
1024 // bits) will be A[i][j+2]-2*32, and the second offset will be
1025 // A[i][j+2]+256-2*32. Essentially, the offsets should make the load well
1026 // aligned. Below, we compute this (-2*32) offset to make the loads aligned.
1027 SmallVector<Value, 4> indices(readOp.getIndices().begin(),
1028 readOp.getIndices().end());
1029 // Get the linearized access expression for the read to compute the offset
1030 AffineExpr linearAccess = constructLinearizedAffineExpr(readOp, state);
1031 // Get the base and offset from linear access expr
1032 auto [base, offset] = getBaseAndOffset(linearAccess);
1033 offset *= elementSizeInBits; // get the offset in bits
1034
1035 // The insertion point depends on whether the region has a single block or
1036 // not. If it has a single block, that block will be the front block, so we
1037 // can insert the UPDOp closer to the readOp. However, if the region has
1038 // multiple blocks, we will insert all the UPDs to the front block of the
1039 // region so that the UPDs dominate the entire region.
1040 bool singleBlock = region.getBlocks().size() == 1;
1041 if (singleBlock)
1042 state->builder.setInsertionPoint(readOp);
1043 else
1044 state->builder.setInsertionPointToStart(&region.front());
1045
1046 // If the extent <= 256 bits, we can directly copy data from mem into vector
1047 // without using a upd. So we try to chunk the interval into sub-intervals of
1048 // width >= 256 bits. For AIEML, the size should be doubled.
1049 int width = state->aieml ? elementSizeInBits == 8
1050 ? 512
1051 : std::max(256, getVectorSizeInBits(vecType))
1052 : 256;
1053 int32_t incr = std::max(width, intervalWidth / 2);
1054 int8_t idx = 1;
1055 for (int32_t start = interval.first; start < interval.second;
1056 start += incr, ++idx) {
1057 // If idx=1, then this indicates a potential upd0 instruction. If idx=2, it
1058 // will be an upd1 instruction.
1059 assert(idx <= 2 && "The only allowed values for UPD index are 0 and 1");
1060 int32_t end = std::min(interval.second, start + incr);
1061 // We are at sub-interval [start,end] of the vector interval. Check if this
1062 // sub-interval is subsumed by [lb,ub], and that the upd op corresponding to
1063 // this sub-interval has not already been generated.
1064 if (lb <= start && ub >= end && (updIndices & idx) == 0) {
1065 // Generate the upd instruction, and link it with a previous upd op
1066 // corresponding to the same read.
1067 updOp = state->builder.create<aievec::UPDOp>(
1068 readOp.getLoc(), updVecType, readOp.getSource(), indices,
1069 start - offset, idx - 1,
1070 updOp ? updOp.getResult() : TypedValue<VectorType>(nullptr));
1071
1072 LLVM_DEBUG(llvm::dbgs() << "\n\nCreated UPD op " << updOp
1073 << " for read op " << readOp);
1074
1075 // If the transfer_read has some apply operations, then they also need to
1076 // be hoisted.
1077 for (auto &value : indices) {
1078 if (auto apOf = value.getDefiningOp<affine::AffineApplyOp>()) {
1079 // Skip hoisting if already above in lexicographical order
1080 if (apOf->getBlock() == readOp->getBlock() &&
1081 apOf->isBeforeInBlock(updOp))
1082 continue;
1083 apOf.getOperation()->moveBefore(updOp);
1084 }
1085 }
1086 // Set the (idx-1)'th bit in updIndices to indicate that we have already
1087 // created a upd op for index idx.
1088 updIndices |= idx;
1089 }
1090 }
1091
1092 // Link the generated updOp to possibly pre-existing UPD ops for the key
1093 memToUpdMap[key] = std::make_pair(updOp, updIndices);
1094 return updOp;
1095}
1096
1097//===----------------------------------------------------------------------===//
1098// AIE vectorization routines
1099//===----------------------------------------------------------------------===//
1100
1101// For this vectorized read operation, find the loop that corresponds to the
1102// vectorized dimension, and return its step size.
1103static int32_t computeVecorizedLoopStepSize(Operation *op, VectState *state) {
1104 auto readOp = dyn_cast<TransferReadOp>(op);
1105 // If this operation is not a read op, return the default step size of 1
1106 if (!readOp)
1107 return 1;
1108
1109 int32_t step = 0;
1110 auto vectorType = cast<VectorType>(readOp.getResult().getType());
1111 SmallVector<Value, 4> indices(readOp.getIndices().begin(),
1112 readOp.getIndices().end());
1113 assert(vectorType && !indices.empty());
1114
1115 // Verify that enclosing loops have been computed for the read operation
1116 auto block = readOp->getBlock();
1117 assert(state->blockToEnclosingLoops.count(block) &&
1118 "enclosing loops should have been computed for the read operation");
1119 auto enclosingLoops = state->blockToEnclosingLoops[block];
1120
1121 // The vectorized (i.e., last) index of the permutation must correspond to a
1122 // loop nest. If not, this is a splat read.
1123 AffineExpr expr = readOp.getPermutationMap().getResults().back();
1124 if (auto dimExpr = llvm::dyn_cast<AffineDimExpr>(expr)) {
1125 assert(dimExpr.getPosition() <= indices.size() &&
1126 "Failed to find the permutation index in index map");
1127 auto index = indices[dimExpr.getPosition()];
1128 // Iterate over all enclosing loops, and find the one that is variant in
1129 // index.
1130 [[maybe_unused]] bool found = false;
1131 for (auto loop : enclosingLoops) {
1132 auto iv = cast<affine::AffineForOp>(loop).getInductionVar();
1133 auto invariants = affine::getInvariantAccesses(iv, indices);
1134 if (!invariants.count(index)) {
1135 assert(
1136 !found &&
1137 "stepsize computation already has an entry along the variant dim");
1138 step = cast<affine::AffineForOp>(loop).getStepAsInt();
1139 found = true;
1140 }
1141 }
1142 }
1143 assert(isPowerOfTwo(step) &&
1144 "non-power-of-two vectorization factor not supported");
1145 // The step increment in vectorized code is scaled by factor of vector lanes;
1146 // account for that.
1147 unsigned lanes = getVectorLaneSize(vectorType);
1148 return step / lanes;
1149}
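// For example, if the vectorized loop is an affine.for with step 16 and the
// read yields a 16-lane vector, the scaled step size returned is 16/16 = 1;
// with a loop step of 32 and 16 lanes it would be 2.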
1150
1151// AIE vector loads are always aligned to a 128-bit boundary. So if the operation
1152// reads from an unaligned memory location, return the starting position of the
1153// read in the vector. Each element of the vector is 'elementSizeInBits' bits
1154// wide.
1155int32_t computeStartInAIEVec(Operation *op, VectState *state) {
1156 // In case the operation is not a transfer_read, return default start
1157 if (!isa<TransferReadOp>(op))
1158 return 0;
1159
1160 auto readOp = cast<TransferReadOp>(op);
1161
1162 // Get the scalar element type's size in bits
1163 auto vtype = cast<VectorType>(readOp.getVector().getType());
1164 int32_t scalarSizeInBits = getElementSizeInBits(vtype);
1165
1166 // Get the linearized access expr for this read
1167 AffineExpr linearAccess = constructLinearizedAffineExpr(readOp, state);
1168 // get the base and offset from linear access expr
1169 auto [base, offset] = getBaseAndOffset(linearAccess);
1170 offset *= scalarSizeInBits; // compute offset in bits
1171 // Now find the reuse interval to which this readOp belongs
1172 IntervalReuse *iv = state->getIntervalForOperation(op);
1173 std::pair<int32_t, int32_t> interval = iv->getInterval(op);
1174
1175 // The readOp reads from this interval, and the start of this interval is
1176 // aligned to 128 bits. The AIE vector corresponding to this read will hold
1177 // the value [interval.first, interval.second]. Return the position of the first
1178 // element that is read.
1179 assert(offset >= interval.first && "Failed to compute the start");
1180 return (offset - interval.first) / scalarSizeInBits;
1181}
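// For example (hypothetical numbers): with 32-bit elements, if the linearized
// access offset is 96 bits and its 128-bit-aligned reuse interval starts at
// bit 0, the returned start position is (96 - 0) / 32 = 3.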
1182
1183// For an i8xi8 scheme, we require two muls to compute the 16-lane output. Each
1184// mul has a replicated computation, where the output in lane i is replicated
1185// in lane i+2. Given that, we take the output of the two mul ops and merge them
1186// into a v32int16 vector. Then we shuffle (using select) to form two v16int16
1187// vectors that are replicas of each other. Finally, we can pick one of them
1188// (using ext), and then pack it into a v16int8 output.
1189static Operation *concatAndInterleave_i8xi8(Operation *source1,
1190 Operation *source2,
1191 VectState *state, Location loc) {
1192 // The source values are in accumulator. So generate SRS intrinsic to convert
1193 // the accumulator output to vector output. We want the output to be in
1194 // v16int16 vector, since select operation does not operate on v16int8
1195 // vector.
1196 Type i16Type =
1197 IntegerType::get(source1->getResult(0).getType().getContext(), 16);
1198 auto srsOp1 = generateSRSOp(source1->getResult(0), i16Type, state, loc);
1199 auto srsOp2 = generateSRSOp(source2->getResult(0), i16Type, state, loc);
1200
1201 // Now we concat the result of the two SRS ops to form a 32-lane vector
1202 SmallVector<Value> sources = {srsOp1->getResult(0), srsOp2->getResult(0)};
1203 auto concatOp = generateConcatOp(sources, state, loc);
1204
1205 // Select the right bits of output to again form the 16-lane vector. opAttr
1206 // will cache the step,offsets,square, etc. for both lanes.
1207 AIEOpAttributes opAttr;
1208 // 0xc is 1100, which indicates that pairs of values must alternately come
1209 // from xoffset and yoffset.
1210 opAttr.select = "0xcccccccc";
1211 // xstart is 0. Since there are only 2 unique values in the first 4 values
1212 // of the vector, ystart is 4.
1213 opAttr.start.push_back("0");
1214 opAttr.start.push_back("4");
1215 for (size_t idx = 0; idx < 2; ++idx) {
1216 // Consider only the even indices in offset (e.g., c, 8, 4, 0). The
1217 // absolute difference between even indices should be 4 (based on the
1218 // scheme, this will get multiplied by 2. So technically, xoffset picks the
1219 // values starting at indices 0, 8, 16, 24 from the v32int16 vector,
1220 // whereas yoffset picks values starting at indices 0+4, 8+4, 16+4, 24+4).
1221 opAttr.offset.push_back("0x0c080400");
1222 // We don't care for the lower 16 values in the v32int16 vector post
1223 // shuffle
1224 opAttr.offset_hi.push_back("0x0");
1225 // The first value must be permuted to offset 0, and the next to 1
1226 opAttr.square.push_back("0x1010");
1227 }
1228 // And now perform the selection
1229 auto selectOp =
1230 generateSelectOp(concatOp->getResult(0), opAttr, 32, state, loc);
1231 // The values in the first 16 lanes in the v32int16 vector are replicated in
1232 // the last 16 lanes. So select the first 16 lanes to form a v16int16 vector.
1233 auto extOp = generateExtOp(selectOp->getResult(0), 16, 0, state, loc);
1234 // Pack the int16 values to int8 values to form the v16int8 output vector
1235 auto packOp = generatePackOp(extOp->getResult(0), state, loc);
1236 return packOp;
1237}
1238
1239// Perform a series of checks to see if the rhs operand of the incoming
1240// add/sub operator is a mul operator, so that we can fuse them to form an
1241// FMA operator.
1242static bool canFuseMulAndAddOrSubIntoFMAOp(Operation *Op, VectState *state) {
1243 // Check 1. This should be an add or sub operation
1244 assert((isa<AddIOp>(Op) || isa<AddFOp>(Op) || isa<SubIOp>(Op) ||
1245 isa<SubFOp>(Op)) &&
1246 "operation must be an add or sub op");
1247
1248 // Check 2. Op must have two operands and one result
1249 assert(Op->getNumOperands() == 2 && Op->getNumResults() == 1);
1250
1251 // Check 3. The rhs operand of Op should be a mul op (if any operand of the
1252 // add op is a mul op, it is guaranteed to be the rhs operand by the explicit
1253 // reassociation done earlier).
1254 Operation *mulOp = getOperandDefOp(state, Op, 1);
1255 if (!isa<MulIOp, MulFOp>(mulOp))
1256 return false;
1257
1258 // Check 4. mulOp must also have two operands and one result
1259 assert(mulOp->getNumOperands() == 2 && mulOp->getNumResults() == 1);
1260
1261 // Determine the lhs, rhs, and accumulator values.
1262 Value lhs = state->sextTruncDefMap.count(mulOp->getOperand(0).getDefiningOp())
1263 ? mulOp->getOperand(0).getDefiningOp()->getOperand(0)
1264 : mulOp->getOperand(0);
1265 Value rhs = state->sextTruncDefMap.count(mulOp->getOperand(1).getDefiningOp())
1266 ? mulOp->getOperand(1).getDefiningOp()->getOperand(0)
1267 : mulOp->getOperand(1);
1268 Value acc = state->sextTruncDefMap.count(Op->getOperand(0).getDefiningOp())
1269 ? Op->getOperand(0).getDefiningOp()->getOperand(0)
1270 : Op->getOperand(0);
1271
1272 assert(lhs && rhs && acc &&
1273 "Failed to find the three operands of the FMA op");
1274
1275 // Check 5. All lhs, rhs, and acc must be vector types
1276 if (!isa<VectorType>(lhs.getType()) || !isa<VectorType>(rhs.getType()) ||
1277 !isa<VectorType>(acc.getType()))
1278 return false;
1279
1280 // Check 6. All the ops should belong to the same block, otherwise we might
1281 // not be able to fuse them safely.
1282 if (lhs.getParentBlock() != rhs.getParentBlock() ||
1283 rhs.getParentBlock() != acc.getParentBlock())
1284 return false;
1285
1286 // Check 7. All the vector sizes must be same
1287 auto lhsType = cast<VectorType>(lhs.getType());
1288 auto rhsType = cast<VectorType>(rhs.getType());
1289 VectorType accType = state->sextTruncDefMap.count(
1290 acc.getDefiningOp()->getOperand(0).getDefiningOp())
1291 ? cast<VectorType>(acc.getDefiningOp()
1292 ->getOperand(0)
1293 .getDefiningOp()
1294 ->getOperand(0)
1295 .getType())
1296 : cast<VectorType>(acc.getType());
1297
1298 unsigned lhsVecSize = getVectorLaneSize(lhsType);
1299 unsigned rhsVecSize = getVectorLaneSize(rhsType);
1300 unsigned accVecSize = getVectorLaneSize(accType);
1301
1302 if (lhsVecSize != rhsVecSize || rhsVecSize != accVecSize)
1303 return false;
1304
1305 // Check 8. The underlying scalar element type of all vectors must be the
1306 // same
1307 if (lhsType.getElementType() != rhsType.getElementType() ||
1308 rhsType.getElementType() != accType.getElementType())
1309 return false;
1310
1311 // And after all this, we can fuse mul and add into fma
1312 return true;
1313}
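// Schematically, once these checks succeed, the rewrite below replaces the
// pattern mul(%a, %b) followed by add(%acc, <mul result>) (or sub) with a
// single vector-dialect FMA of %a, %b, and %acc, and records sub ops in
// state->mscOps so that they later lower to msc (fmsub) ops.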
1314
1315// In the advanced FMA schemes, the two vector operands of a mul/fma op are not
1316// the same size. If the incoming operation involves multiplication,
1317// reassociate the operands involved in multiplication so that the left operand
1318// comes from the bigger vector. The exception to this rule is the 8x8 scheme,
1319// where the right operand must be the bigger vector.
1320static void reassociateMulOpBasedOnVecSize(Operation *Op, VectState *state) {
1321 // Get the stats for left and right operand vectors
1322 AIEVecAttributes lstat = getOperandVecStats(Op, state, 0);
1323 AIEVecAttributes rstat = getOperandVecStats(Op, state, 1);
1324
1325 // No need to do anything if both vectors are the same size
1326 if (lstat.vecSizeInBits == rstat.vecSizeInBits)
1327 return;
1328
1329 // Check if this is an 8x8 scheme
1330 bool is8x8 = lstat.elementSizeInBits == 8 && rstat.elementSizeInBits == 8;
1331
1332 // Flip the operands if necessary
1333 bool flip = is8x8 ? lstat.vecSizeInBits > rstat.vecSizeInBits
1334 : rstat.vecSizeInBits > lstat.vecSizeInBits;
1335 if (flip) {
1336 LLVM_DEBUG(llvm::dbgs()
1337 << "\n\nReassociating op " << *Op
1338 << " to correctly place operand coming from bigger vector");
1339 Value left = Op->getOperand(0);
1340 Value right = Op->getOperand(1);
1341 Op->setOperand(0, right);
1342 Op->setOperand(1, left);
1343 LLVM_DEBUG(llvm::dbgs() << "\n\tOp after reassociation: " << *Op);
1344 }
1345}
1346
1347// If Op involves multiplication, and any operand involved in the
1348// multiplication is splat, make it the second operand of mul, unless its the
1349// 8x8 scheme. In that case, make splat the first operand.
1350static void reassociateMulOpWithSplat(Operation *Op, VectState *state) {
1351 // Op must have two operands (mul) or three operands (fma), and one
1352 // result.
1353 assert(Op->getNumOperands() == 2 || Op->getNumOperands() == 3);
1354 assert(Op->getNumResults() == 1);
1355
1356 // Get the left and right operand vector properties
1357 AIEVecAttributes lstat = getOperandVecStats(Op, state, 0);
1358 AIEVecAttributes rstat = getOperandVecStats(Op, state, 1);
1359
1360 // No need to do anything if both operands are splat
1361 if (lstat.isSplat && rstat.isSplat)
1362 return;
1363
1364 // Check if this is an 8x8 scheme
1365 bool is8x8 = lstat.elementSizeInBits == 8 && rstat.elementSizeInBits == 8;
1366
1367 // Now flip operands if required and set the operands to the operands of the
1368 // sext operations
1369 bool flip = is8x8 ? rstat.isSplat : lstat.isSplat;
1370 Value left = state->sextTruncDefMap.count(Op->getOperand(0).getDefiningOp())
1371 ? Op->getOperand(0).getDefiningOp()->getOperand(0)
1372 : Op->getOperand(0);
1373 Value right = state->sextTruncDefMap.count(Op->getOperand(1).getDefiningOp())
1374 ? Op->getOperand(1).getDefiningOp()->getOperand(0)
1375 : Op->getOperand(1);
1376 if (flip) {
1377 LLVM_DEBUG(llvm::dbgs() << "\n\nReassociating op " << *Op
1378 << " to place splat as correct operand");
1379 Op->setOperand(0, right);
1380 Op->setOperand(1, left);
1381 LLVM_DEBUG(llvm::dbgs() << "\n\tOp after reassociation: " << *Op);
1382 } else {
1383 Op->setOperand(0, left);
1384 Op->setOperand(1, right);
1385 }
1386
1387 Op->getResult(0).setType(Op->getOperand(0).getType());
1388
1389 if (Op->hasOneUse() &&
1390 isa<AddIOp, AddFOp, SubIOp, SubFOp>(*Op->getUsers().begin())) {
1391 Operation *usrOp = *Op->getUsers().begin();
1392 usrOp->getResult(0).setType(usrOp->getOperand(0).getType());
1393 }
1394}
1395
1396// Rewrite a mul and add/sub op as a vector dialect FMA op
1397static void fuseMulAndAddOrSubIntoFMAOp(Operation *Op, VectState *state) {
1398 Value acc = state->sextTruncDefMap.count(Op->getOperand(0).getDefiningOp())
1399 ? Op->getOperand(0).getDefiningOp()->getOperand(0)
1400 : Op->getOperand(0);
1401 Operation *mulOp = getOperandDefOp(state, Op, 1);
1402 Value lhs = state->sextTruncDefMap.count(mulOp->getOperand(0).getDefiningOp())
1403 ? mulOp->getOperand(0).getDefiningOp()->getOperand(0)
1404 : mulOp->getOperand(0);
1405 Value rhs = state->sextTruncDefMap.count(mulOp->getOperand(1).getDefiningOp())
1406 ? mulOp->getOperand(1).getDefiningOp()->getOperand(0)
1407 : mulOp->getOperand(1);
1408
1409 // Create a new FMA op
1410 state->builder.setInsertionPointAfter(Op);
1411 Operation *fmaOp =
1412 state->builder.create<vector::FMAOp>(Op->getLoc(), lhs, rhs, acc);
1413
1414 // If Op is a sub op, we tag the generated fma op as an msc op
1415 bool isSub = isa<SubIOp, SubFOp>(Op);
1416 if (isSub)
1417 state->mscOps.insert(fmaOp);
1418
1419 LLVM_DEBUG(llvm::dbgs() << "\n\nFused " << (isSub ? "sub" : "add") << " op "
1420 << *Op << "\n\tand mul op " << *mulOp
1421 << "\n\tinto fma op " << *fmaOp);
1422
1423 // Replace all the uses of Op with the fmaOp, and remove Op
1424 Op->replaceAllUsesWith(fmaOp);
1425 Op->erase();
1426 // If Op was the only consumer of mulOp, then there are no more uses of
1427 // mulOp. Remove it.
1428 if (mulOp->use_empty())
1429 mulOp->erase();
1430}
1431
1432// Given the operation attributes (start, offset, step, square, etc.), generate
1433// an AIE mul/fma op for the incoming vector mul/fma Op. 'nextStart' is used
1434// for schemes that require two AIE dialect fma ops to be generated for one
1435// vector dialect fma op for AIE1; the only difference between the attributes of
1436// the two AIE dialect fma ops is the start field. For AIEML, i8xi8 scheme
1437// generates one MulConvOp or FMAConvOp for each vector dialect mul/fma op.
1438static void generateMulOrFMAOp(Operation *Op, Scheme &scheme,
1439 AIEOpAttributes &opAttr, VectState *state,
1440 const std::string &nextStart = "") {
1441 // Assert that we computed the attributes for both the operands
1442 assert(opAttr.start.size() == opAttr.offset.size() &&
1443 opAttr.start.size() == 2);
1444
1445 // Set insertion point of the AIE dialect mul/fma op
1446 state->builder.setInsertionPointAfter(Op);
1447
1448 // Predicate: returns true if the given op is not a mul/fma op
1449 auto notMulOrFMAOp = [&](Operation *op) {
1450 return !isa<MulIOp, MulFOp, vector::FMAOp>(op);
1451 };
1452
1453 // Generate an AIE dialect mul/fma op from a vector dialect mul/fma op
1454 auto genOp = [&](Operation *Op, AIEOpAttributes &opAttr, VectState *state,
1455 bool i8xi8_pairedOp = false) {
1456 Operation *repOp;
1457 // Create aievec::aie1::FMAOp corresponding to the vector::FMAOp
1458 if (auto fmaOp = dyn_cast<vector::FMAOp>(Op))
1459 repOp = generateFMAOp(fmaOp, opAttr, state, i8xi8_pairedOp);
1460 // Create aievec::aie1::MulOp corresponding to the arith::MulIOp
1461 else if (auto mulOp = dyn_cast<MulIOp>(Op))
1462 repOp = generateMulOp<MulIOp>(mulOp, opAttr, state);
1463 // Create aievec::aie1::MulOp corresponding to the arith::MulFOp
1464 else if (auto mulOp = dyn_cast<MulFOp>(Op))
1465 repOp = generateMulOp<MulFOp>(mulOp, opAttr, state);
1466 else
1467 llvm_unreachable("Operation not mul/fma op");
1468 return repOp;
1469 };
1470
1471 Operation *repOp = genOp(Op, opAttr, state);
1472 LLVM_DEBUG(llvm::dbgs() << "\n\nGenerated AIE dialect mul/fma op " << *repOp);
1473
1474 // For AIE1, i8xi8 scheme generates two AIE dialect mul/fma ops for each
1475 // vector dialect mul/fma op. Generate the paired mul/fma op if nextStart is
1476 // not empty. For AIEML, i8xi8 scheme generates one MulConvOp or FMAConvOp for
1477 // each vector dialect mul/fma op.
1478 if (!nextStart.empty()) {
1479 if (state->aieml && scheme.lanes == 32 && scheme.xbits == 8 &&
1480 scheme.zbits == 8) {
1481 repOp = generateMulOrFMAConvOpForInt8(Op, opAttr, state);
1482 if (any_of(repOp->getUsers(), notMulOrFMAOp)) {
1483 Type i8Type =
1484 IntegerType::get(repOp->getResult(0).getType().getContext(), 8);
1485 repOp =
1486 generateSRSOp(repOp->getResult(0), i8Type, state, repOp->getLoc());
1487 }
1488 } else {
1489 opAttr.start[1] = nextStart;
1490 Operation *pairedOp = genOp(Op, opAttr, state, true);
1491 LLVM_DEBUG(llvm::dbgs() << "\n\nGenerated the paired AIE dialect "
1492 << "mul/fma op for 8x8 scheme " << *pairedOp);
1493 // Link the two mul/fma ops
1494 assert(!state->pairedOp.count(repOp));
1495 state->pairedOp[repOp] = pairedOp;
1496 // If any of the uses of incoming op is not a mul/fma op, then we need to
1497 // concatenate the paired ops and generate a v16xi8 vector.
1498 if (any_of(Op->getUsers(), notMulOrFMAOp))
1499 repOp = concatAndInterleave_i8xi8(repOp, pairedOp, state, Op->getLoc());
1500 }
1501 }
1502
1503 // Replace all the uses of the vector mul/fma op with the AIE mul/fma op, and
1504 // remove vector op from the IR.
1505 Op->replaceAllUsesWith(repOp);
1506 Op->erase();
1507}
1508
1509// Compute the start and offset for xbuff/zbuff for 32x32 scheme.
1510static void computeBuffAttr_i32xi32(
1511 unsigned vecSize, // #lanes
1512 int32_t start, // start in AIE vec
1513 int32_t accIncr, // access change with each loop increment
1514 AIEOpAttributes &opAttr) {
1515 // Populate start
1516 std::string startStr = std::to_string(start);
1517 // Compute the offset resembling "0x76543210"
1518 std::string offsetStr = "0x";
1519 for (int i = vecSize - 1; i >= 0; --i)
1520 offsetStr.push_back(getHexValue(i * accIncr));
1521
1522 // And now we have everything to push into opAttr
1523 opAttr.start.push_back(startStr);
1524 opAttr.offset.push_back(offsetStr);
1525 opAttr.offset_hi.push_back("");
1526 opAttr.square.push_back("");
1527 opAttr.step.push_back("");
1528}
1529
1530// Compute the start, lo/hi offset, and square for xbuff for 16x16 scheme.
1531static void computeXbuffAttr_i16xi16(
1532 unsigned vecSize, // #lanes
1533 int32_t start, // computed start in AIE vec
1534 int32_t accIncr, // access change with each loop increment
1535 int32_t colOffset, // xbuff access distance between vector cols
1536 AIEOpAttributes &opAttr) {
1537 // The colOffset must be either <=1, or a multiple of 2
1538 assert(colOffset >= -1 && (colOffset <= 1 || colOffset % 2 == 0) &&
1539 "cannot compute offset and square for xbuff");
1540 // We can only generate the offsets and square if either accIncr or column
1541 // offset is <= 1.
1542 assert((accIncr <= 1 || colOffset <= 1) &&
1543 "cannot generate offset and square for xbuff");
1544
1545 // Arch restriction: xstart should be a multiple of 2.
1546 int32_t m2start = (start / 2) * 2;
1547 std::string startStr = std::to_string(m2start);
1548 // m2Offset accounts for the extra 1 if the start is not a multiple of 2
1549 int32_t m2Offset = start - m2start;
1550
1551 // Compute hi and lo offsets to something resembling "0x_7_6_5_4" and
1552 // "0x_3_2_1_0" respectively. The '_' digits are 0 if max(colOffset, accIncr) <= 1.
1553 std::string offsetStr = "0x";
1554 int32_t offset = std::max(colOffset, accIncr);
1555 for (int i = vecSize / 2 - 2; i >= 0; i -= 2) {
1556 offsetStr.push_back(offset <= 1 ? '0' : getHexValue((offset - 2) / 2));
1557 offsetStr.push_back(getHexValue((i * accIncr) / 2));
1558 }
1559 std::string offsetHiStr = "0x";
1560 for (int i = vecSize - 2, e = vecSize / 2; i >= e; i -= 2) {
1561 offsetHiStr.push_back(offset <= 1 ? '0' : getHexValue((offset - 2) / 2));
1562 offsetHiStr.push_back(getHexValue((i * accIncr) / 2));
1563 }
1564
1565 // Now compute the square for xbuff.
1566 int32_t cstep = std::min(2, std::abs(colOffset));
1567 int32_t astep = std::min(2, accIncr);
1568 assert(m2Offset == 0 || (astep <= 1 && cstep <= 1));
1569
1570 SmallVector<int32_t> sqPattern = {astep + cstep, astep, cstep, 0};
1571 std::string squareStr = "0x";
1572 for (auto sq : sqPattern)
1573 squareStr.push_back(getHexValue(sq + m2Offset));
1574
1575 // And now we have everything to push into opAttr
1576 opAttr.start.push_back(startStr);
1577 opAttr.offset.push_back(offsetStr);
1578 opAttr.offset_hi.push_back(offsetHiStr);
1579 opAttr.square.push_back(squareStr);
1580 opAttr.step.push_back("");
1581}
1582
1583// Compute the start, lo/hi offset, and step for zbuff for 16x16 scheme.
1584static void computeZbuffAttr_i16xi16(
1585 unsigned vecSize, // #lanes
1586 int32_t start, // computed start in AIE vec
1587 int32_t accIncr, // access change with each loop increment
1588 int32_t zeroOffset, // offset of 0 value in the filter
1589 int32_t colOffset, // zbuff access distance between vector cols
1590 bool aieml, AIEOpAttributes &opAttr) {
1591 std::string offsetStr, offsetHiStr;
1592 // zstart must fit in 4 bits (5 bits for AIE-ML).
1593 assert(start < (aieml ? 32 : 16) && "zstart must be 4b value");
1594 std::string startStr = std::to_string(start);
1595
1596 // If zbuff comes from splat, use default offsets
1597 if (accIncr == 0)
1598 offsetStr = offsetHiStr = "0";
1599 else {
1600 // Compute hi and lo offsets using general scheme
1601 offsetStr = "0x";
1602 for (int i = vecSize / 2 - 1; i >= 0; --i)
1603 offsetStr.push_back(getHexValue(i * accIncr));
1604 offsetHiStr = "0x";
1605 for (auto i = vecSize - 1, e = vecSize / 2; i >= e; --i)
1606 offsetHiStr.push_back(getHexValue(i * accIncr));
1607 }
1608
1609 // Compute step between columns
1610 int32_t step = colOffset == -1 ? zeroOffset - 1 - start : colOffset;
1611 assert(step >= 0 && "zstep cannot be negative");
1612 std::string stepStr = std::to_string(step);
1613
1614 // And now we have everything to push into opAttr
1615 opAttr.start.push_back(startStr);
1616 opAttr.offset.push_back(offsetStr);
1617 opAttr.offset_hi.push_back(offsetHiStr);
1618 opAttr.square.push_back("");
1619 opAttr.step.push_back(stepStr);
1620}
1621
1622// Compute the start, offset, square, and step for xbuff for 8x8 scheme. This
1623// is the data scheme, but since is is so restricted, we do a switcharoo, and
1624// use filter as xbuff. We assume that the filter elements are duplicated
1625// (duplication factor= 2). For example, the 2x2 filter should be
1626// {0,0,1,1,2,2,3,3}.
1627static void computeXbuffAttr_i8xi8(
1628 unsigned vecSize, // #lanes
1629 int32_t start, // computed start in AIE vec
1630 int32_t colOffset, // xbuff access distance between vector cols
1631 AIEOpAttributes &opAttr) {
1632 // Assert that colStep is a multiple of 4, where colStep is the difference
1633 // between idx[i][j] and idx[i][j+2].
1634 assert(
1635 colOffset >= 2 &&
1636 "each filter entry must be replicated at least twice for i8xi8 scheme");
1637 int32_t colStep = 2 * colOffset;
1638 assert(colStep % 4 == 0 && "xstep must be multiple of 4");
1639
1640 // Arch restriction: xstart must be a multiple of 4
1641 int32_t m4start = (start / 4) * 4;
1642 std::string startStr = std::to_string(m4start);
1643 // m4Offset accounts for the excess if start is not a multiple of 4
1644 int32_t m4Offset = start - m4start;
1645 // Because of duplication, m4Offset can only be 0 or 2
1646 assert(m4Offset == 0 || m4Offset == 2);
1647
1648 // Compute offsetStr to something resembling "0x_0_0_0_0", where _ is
1649 // (colStep-4)/4.
1650 std::string offsetStr = "0x";
1651 for (int i = vecSize / 4 - 1; i >= 0; --i) {
1652 offsetStr.push_back(getHexValue(colStep / 4 - 1));
1653 offsetStr += "0";
1654 }
1655 std::string stepStr = std::to_string(colStep);
1656
1657 // Now compute the square for xbuff. We want a {0,x,0,x} pattern.
1658 int32_t offsetWithoutDup = colOffset / 2;
1659 int32_t rstep = offsetWithoutDup >= 2 ? 2
1660 : colOffset == -1 ? 1
1661 : offsetWithoutDup;
1662 assert(m4Offset == 0 || rstep <= 1);
1663
1664 SmallVector<int32_t> sqPattern = {rstep, 0, rstep, 0};
1665 std::string squareStr = "0x";
1666 for (auto sq : sqPattern)
1667 squareStr.push_back(getHexValue(sq + m4Offset));
1668
1669 // And now we have everything to push into opAttr
1670 opAttr.start.push_back(startStr);
1671 opAttr.offset.push_back(offsetStr);
1672 opAttr.offset_hi.push_back("");
1673 opAttr.square.push_back(squareStr);
1674 opAttr.step.push_back(stepStr);
1675}
1676
1677// Compute the start, offset, square, and step for zbuff for 8x8 scheme. This
1678// is the coefficient scheme, but since the coefficient scheme is more relaxed,
1679// we use image as zbuff.
1680static void computeZbuffAttr_i8xi8(
1681 unsigned vecSize, // #lanes
1682 int32_t start, // computed start in AIE vec
1683 int32_t accIncr, // access change with each loop increment
1684 int32_t colOffset, // zbuff access distance between vector cols
1685 AIEOpAttributes &opAttr, std::string &nextStart) {
1686 // The colOffset must be either <=1, or a multiple of 2
1687 assert((colOffset <= 1 || colOffset % 2 == 0) && "zbuff value not supported");
1688
1689 // Arch restriction: zstart is a multiple of 2
1690 int32_t m2start = (start / 2) * 2;
1691 std::string startStr = std::to_string(m2start);
1692 // m2Offset accounts for the extra 1 if the start is not a multiple of 2
1693 int32_t m2Offset = start - m2start;
1694
1695 // Compute offsetStr to something resembling "0x43322110". The usual pattern
1696 // is "0x_3_2_1_0", and the purpose is to fill the "_".
1697 std::string offsetStr = "0x";
1698 for (int i = vecSize / 4 - 1; i >= 0; --i) {
1699 int32_t val = i * accIncr + (colOffset + 1) / 2;
1700 offsetStr.push_back(getHexValue(val));
1701 offsetStr.push_back(getHexValue(i * accIncr));
1702 }
1703 std::string stepStr = std::to_string(2 * std::abs(colOffset));
1704 nextStart = std::to_string(m2start + 2 * accIncr * (vecSize / 4));
1705
1706 // Now compute the square for zbuff. We want a {0,1+x,y,y+1+x} pattern, where
1707 // x is the square offset, and y is the accIncr.
1708 int32_t rstep = colOffset >= 2 ? 2 : std::abs(colOffset);
1709 assert(m2Offset == 0 || rstep <= 1);
1710
1711 SmallVector<int32_t> sqPattern = {accIncr + rstep, accIncr, rstep, 0};
1712 std::string squareStr = "0x";
1713 for (auto sq : sqPattern)
1714 squareStr.push_back(getHexValue(sq + m2Offset));
1715
1716 // And now we have everything to push into opAttr
1717 opAttr.start.push_back(startStr);
1718 opAttr.offset.push_back(offsetStr);
1719 opAttr.offset_hi.push_back("");
1720 opAttr.square.push_back(squareStr);
1721 opAttr.step.push_back(stepStr);
1722}
1723
1724// Find a length-k chain of FMA ops such that (1) the chain is linear; (2) the
1725// operand datawidth is 16 or 8 bits; (3) the access distance between lhs (rhs)
1726// operands of both FMAs is compile-time constant. These FMAs will be fused
1727// into a single FMA. Technically, k is equal to the number of columns in the
1728// FMA topology. If fused, cache the pair indicating the access difference
1729// between the operands for the two FMAs.
1730static void fuseFMAOps(Operation *refOp,
1731 llvm::SmallSet<Operation *, 8> &fusedOpSet, int32_t cols,
1732 VectState *state) {
1733 // The number of columns must be greater than 1. refOp must be mul/fma op,
1734 // and should not be covered by the simple vector scheme.
1735 if (cols <= 1 || !isa<MulIOp, MulFOp, vector::FMAOp>(refOp) ||
1736 isSimpleVectIntrinsic(refOp, state))
1737 return;
1738
1739 // Get the start offsets for left and right operands of the reference
1740 // operator, i.e., start of the fusion chain.
1741 Operation *lOp = getOperandDefOp(state, refOp, 0);
1742 Operation *rOp = getOperandDefOp(state, refOp, 1);
1743
1744 int32_t lstart = computeStartInAIEVec(lOp, state);
1745 int32_t rstart = computeStartInAIEVec(rOp, state);
1746
1747 // The xbuff and zbuff offsets between the fused FMA ops. The default value
1748 // is -1
1749 int xOffset = -1, zOffset = -1;
1750
1751 // We write a loop that tries to chase a linear chain of length cols-1
1752 // starting at the reference mul/fma op refOp. Let us consider a computational
1753 // chain of length 3: {c = A[i]*B[i]; c += A[i+1]*B[i+1]; c +=
1754 // A[i+2]*B[i+2]}. We represent the start for each instruction as a pair
1755 // (lhs-operand-start, rhs-operand-start). The starts for the chain will be
1756 // {(0,0), (1,1), (2,2)}. Since the consecutive starts are equidistant in
1757 // the chain, we consider this chain fusable, and cache the fused operations
1758 // in the fusedOps vector.
1759 Operation *curOp = refOp;
1760 SmallVector<Operation *, 8> fusedOps;
1761
1762 for (auto len = 0; len < cols - 1; ++len) {
1763 // If this operation has more than one use, break loop.
1764 if (!curOp->hasOneUse())
1765 break;
1766 // Get the consumer of the curOp FMA
1767 Operation *usrOp = *curOp->getUsers().begin();
1768 // The user/consumer operation must be an FMA, belonging to the same
1769 // basic block as curOp, and must not be covered by the simple scheme.
1770 if (!isa<vector::FMAOp>(usrOp) || curOp->getBlock() != usrOp->getBlock() ||
1771 isSimpleVectIntrinsic(usrOp, state))
1772 break;
1773 // Both curOp and usrOp must be either fma or fmsub(msc)
1774 if (isa<vector::FMAOp>(curOp) &&
1775 state->mscOps.count(curOp) != state->mscOps.count(usrOp))
1776 break;
1777 // Compute the start/access distance for each operand of curOp and usrOp
1778 SmallVector<int32_t, 2> offsets;
1779 for (size_t idx = 0; idx < 2; ++idx) {
1780 // Get the vector attributes for this operand of curOp and usrOp
1781 AIEVecAttributes cstat = getOperandVecStats(curOp, state, idx);
1782 AIEVecAttributes ustat = getOperandVecStats(usrOp, state, idx);
1783 // We need to ensure that the accesses to this operand of curOp and usrOp
1784 // come from the same vector. To guarantee this, we perform two checks:
1785 // Check 1. The accesses must be similar
1786 if (cstat.vecSizeInBits != ustat.vecSizeInBits ||
1787 cstat.elementSizeInBits != ustat.elementSizeInBits ||
1788 cstat.loadFromMemory != ustat.loadFromMemory ||
1789 cstat.isSplat != ustat.isSplat)
1790 break;
1791 // Check 2. The accesses must come from the same vector/upd op
1792 Operation *cdefOp = getOperandDefOp(state, curOp, idx);
1793 Operation *udefOp = getOperandDefOp(state, usrOp, idx);
1794
1795 bool related = cdefOp == udefOp;
1796 if (!related && cstat.loadFromMemory && ustat.loadFromMemory) {
1797 IntervalReuse *civ = state->getIntervalForOperation(cdefOp);
1798 IntervalReuse *uiv = state->getIntervalForOperation(udefOp);
1799 related =
1800 civ == uiv && civ->getInterval(cdefOp) == uiv->getInterval(udefOp);
1801 }
1802 if (!related)
1803 break;
1804
1805 // We know that the accesses to this operand for both curOp and usrOp
1806 // come from the same AIE vector. So we can get the start value for the
1807 // operands.
1808 int32_t start1 = computeStartInAIEVec(cdefOp, state);
1809 int32_t start2 = computeStartInAIEVec(udefOp, state);
1810 int32_t offset = start2 - start1;
1811 // perform a set of checks to make sure that the distance can be encoded
1812 // in AIE intrinsic.
1813 // Check 1: the offset should be positive
1814 if (offset < 0)
1815 break;
1816 // Check 2: If offset is greater than 1, it should be a multiple of 2
1817 if (offset > 1 && offset % 2 != 0)
1818 break;
1819 // Check 3: If offset is >=2, then the reference op must have start=0
1820 int32_t refStart = idx == 0 ? lstart : rstart;
1821 if (!ustat.isSplat && offset > 1 && refStart != 0)
1822 break;
1823 // From this operand's perspective, we can fuse this usrOp with curOp.
1824 // Cache the start offset.
1825 offsets.push_back(offset);
1826 }
1827 // Verify that we computed offset for both operands
1828 if (offsets.size() < 2)
1829 break;
1830 // Ensure that the difference between consecutive xOffsets and zOffsets is
1831 // consistent throughout the chain.
1832 if ((xOffset != -1 && xOffset != offsets[0]) ||
1833 (zOffset != -1 && zOffset != offsets[1]))
1834 break;
1835 // Now the user FMA op can be fused with refOp
1836 xOffset = offsets[0];
1837 zOffset = offsets[1];
1838 fusedOps.push_back(usrOp);
1839 // usrOp now becomes curOp, so that we can chase the linear chain starting
1840 // at it.
1841 curOp = usrOp;
1842 }
1843
1844 // If there are no ops fused, return
1845 if (fusedOps.empty())
1846 return;
1847
1848 LLVM_DEBUG(llvm::dbgs() << "\n\nFused following fma ops with op " << *refOp);
1849
1850 // If we reached here, we have decided to fuse a linear chain of FMAs, so we
1851 // need to remove the fused FMAs from the IR.
1852 for (auto &op : fusedOps) {
1853 LLVM_DEBUG(llvm::dbgs() << "\n\tfma op " << *op);
1854 fusedOpSet.insert(op);
1855 // Since we are fusing op with refOp, fuse their access extents too
1856 fuseAccessExtent(refOp, op, state);
1857 // Now replace the uses of op with reference
1858 op->replaceAllUsesWith(refOp);
1859 }
1860
1861 // Cache the column offsets for refOp
1862 assert(!state->opToColOffsets.count(refOp));
1863 state->opToColOffsets[refOp] = std::make_pair(xOffset, zOffset);
1864}
1865
1866// Compute all the attributes for xbuff, based on the scheme.
1867static void computeXbuffAttributes(
1868 Scheme &scheme, // vect scheme info
1869 int32_t start, // computed start in AIE vec
1870 int32_t colOffset, // xbuff access distance between vector cols
1871 int32_t accIncr, // xbuff access incr with each loop increment
1872 int32_t dupFactor, // duplication factor for i8xi8 filter
1873 bool aieml, AIEOpAttributes &opAttr) {
1874 // Branch to different schemes
1875 // Case 1: 32x32 real
1876 if ((scheme.lanes == 8 || (aieml && scheme.lanes == 16)) &&
1877 scheme.cols == 1 && scheme.xbits == 32 && scheme.zbits == 32)
1878 computeBuffAttr_i32xi32(scheme.lanes, start, accIncr, opAttr);
1879 // Case 2: 16x16 real
1880 else if ((scheme.lanes == 16 || (aieml && scheme.lanes == 32)) &&
1881 scheme.cols == 2 && scheme.xbits == 16 && scheme.zbits == 16) {
1882 // We only support a loop increment that is <= 1 or a multiple of 2
1883 assert((accIncr <= 1 || accIncr % 2 == 0) &&
1884 "loop step size value not supported");
1885 computeXbuffAttr_i16xi16(scheme.lanes, start, accIncr, colOffset, opAttr);
1886 }
1887 // Case 3: 8x8 real
1888 else if ((scheme.lanes == 16 || (aieml && scheme.lanes == 32)) &&
1889 scheme.cols == 8 && scheme.xbits == 8 && scheme.zbits == 8) {
1890 // We only support a loop increment of <= 1
1891 assert(accIncr <= 1 && "loop step size greater than 1 not supported");
1892 // If we were not able to fuse any of the macs to exploit the column
1893 // topology, then set colOffset to dupFactor.
1894 if (colOffset == -1)
1895 colOffset = dupFactor;
1896 computeXbuffAttr_i8xi8(scheme.lanes, start, colOffset, opAttr);
1897 } else
1898 llvm_unreachable("Unsupported vectorization scheme");
1899}
1900
1901// Compute all the attributes for zbuff, based on the scheme.
1902static void computeZbuffAttributes(
1903 Scheme &scheme, // vect scheme info
1904 int32_t start, // computed start in AIE vec
1905 int32_t colOffset, // zbuff access distance between vector cols
1906 int32_t accIncr, // zbuff access incr with each loop increment
1907 int32_t zeroOffset, // zero offset of filter for i16xi16 scheme
1908 bool aieml,
1909 std::string &nextStart, // start of mul/mac pair in i8xi8 scheme
1910 AIEOpAttributes &opAttr) {
1911 // Branch to different schemes
1912 // Case 1: 32x32 real
1913 if ((scheme.lanes == 8 || (aieml && scheme.lanes == 16)) &&
1914 scheme.cols == 1 && scheme.xbits == 32 && scheme.zbits == 32)
1915 computeBuffAttr_i32xi32(scheme.lanes, start, accIncr, opAttr);
1916 // Case 2: 16x16 real
1917 else if ((scheme.lanes == 16 || (aieml && scheme.lanes == 32)) &&
1918 scheme.cols == 2 && scheme.xbits == 16 && scheme.zbits == 16) {
1919 // We only support a loop increment of <= 1
1920 assert(accIncr <= 1 && "loop step size greater than 1 not supported");
1921 // Get the zero offset in filter if the user provided it in the command
1922 // line. The zero offset is cyclic, so compute an offset that is > start.
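// (e.g., with hypothetical values start = 5 and zeroOffset = 4, this yields
// 5 + 4 - (5 % 4) = 8, the next multiple of zeroOffset past start)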
1923 zeroOffset = zeroOffset == 0 ? scheme.lanes
1924 : start + zeroOffset - (start % zeroOffset);
1925 computeZbuffAttr_i16xi16(scheme.lanes, start, accIncr, zeroOffset,
1926 colOffset, aieml, opAttr);
1927 }
1928 // Case 3: 8x8 real
1929 else if ((scheme.lanes == 16 || (aieml && scheme.lanes == 32)) &&
1930 scheme.cols == 8 && scheme.xbits == 8 && scheme.zbits == 8) {
1931 // We only support a loop increment of <= 1
1932 assert(accIncr <= 1 && "loop step size greater than 1 not supported");
1933 computeZbuffAttr_i8xi8(scheme.lanes, start, accIncr, colOffset, opAttr,
1934 nextStart);
1935 } else
1936 llvm_unreachable("Unsupported vectorization scheme");
1937}
1938
1939// For this mul/FMA operator, generate AIE dialect mul/FMA op based on
1940// different vector schemes.
1941static void generateSchemeBasedMulOrFMAOp(Operation *Op, VectState *state) {
1942 int32_t lanes, cols;
1943 std::tie(lanes, cols) = getNumRowsAndCols(Op, state);
1944 // Get the data sizes for left and right operands of mul/fma
1945 Value lhs = state->sextTruncDefMap.count(Op->getOperand(0).getDefiningOp())
1946 ? Op->getOperand(0).getDefiningOp()->getOperand(0)
1947 : Op->getOperand(0);
1948 Value rhs = state->sextTruncDefMap.count(Op->getOperand(1).getDefiningOp())
1949 ? Op->getOperand(1).getDefiningOp()->getOperand(0)
1950 : Op->getOperand(1);
1951 int32_t xbits = getElementSizeInBits(cast<VectorType>(lhs.getType()));
1952 int32_t zbits = getElementSizeInBits(cast<VectorType>(rhs.getType()));
1953 Scheme scheme(lanes, cols, xbits, zbits);
1954
1955 // First check if this operation requires a simple vector operation rather
1956 // than an advanced scheme.
1957 if (isSimpleVectIntrinsic(Op, state)) {
1958 // opAttr will cache the attributes (start, step, offsets, square, etc.)
1959 // for both lhs and rhs operands.
1960 AIEOpAttributes opAttr;
1961 // For simple scheme, we do not need any attribute
1962 for (size_t idx = 0; idx < 2; ++idx) {
1963 opAttr.start.push_back("");
1964 opAttr.offset.push_back("");
1965 opAttr.offset_hi.push_back("");
1966 opAttr.square.push_back("");
1967 opAttr.step.push_back("");
1968 }
1969 generateMulOrFMAOp(Op, scheme, opAttr, state);
1970 return;
1971 }
1972
1973 // Otherwise generate mul or fma op based on advanced scheme. Get the rows,
1974 // cols, and datatype size for the vector scheme, and pack all that
1975 // information in the Scheme struct.
1976 // If the element size is < 32 bits, we can fuse multiple FMAs together to
1977 // exploit the column topology of the FMA intrinsic.
1978 auto colOffset = state->opToColOffsets.count(Op) ? state->opToColOffsets[Op]
1979 : std::make_pair(-1, -1);
1980
1981 // opAttr will cache the step,offsets,square, etc. for both lhs and rhs
1982 // operands.
1983 AIEOpAttributes opAttr;
1984 // For i8xi8 scheme, each vector dialect mul/fma op is converted to two AIE
1985 // dialect mul/fma op. The two AIE ops are identical, except for the start
1986 // field. nextStart indicates the start of the second op.
1987 std::string nextStart;
1988 // Compute relevant attributes (start, offsets, step, square, etc.) for each
1989 // operand, and store them in opAttr.
1990 for (size_t idx = 0; idx < 2; ++idx) {
1991 AIEVecAttributes stat = getOperandVecStats(Op, state, idx);
1992 Operation *op = getOperandDefOp(state, Op, idx);
1993
1994 int32_t start = 0, accIncr = 1;
1995 // If the operand comes from transfer_read, compute the step and start
1996 // values.
1997 if (stat.loadFromMemory) {
1998 auto readOp = cast<TransferReadOp>(op);
1999 // How does the access change with each iteration of the vectorized loop?
2000 accIncr = stat.isSplat ? 0 : computeVecorizedLoopStepSize(readOp, state);
2001 // start in the AIE vector
2002 start = computeStartInAIEVec(op, state);
2003 }
2004 // Compute the xbuff and zbuff attributes
2005 if (idx == 0)
2006 computeXbuffAttributes(scheme, start, colOffset.first, accIncr,
2007 state->dupFactor, state->aieml, opAttr);
2008 else
2009 computeZbuffAttributes(scheme, start, colOffset.second, accIncr,
2010 state->zeroOffset, state->aieml, nextStart,
2011 opAttr);
2012 }
2013 // And now generate the mul/fma op
2014 generateMulOrFMAOp(Op, scheme, opAttr, state, nextStart);
2015}
2016
2017// If the datatype allows it, fuse a mul or fma op with other fma ops to
2018// utilize the column topology of the AIE mul/fma intrinsic (e.g., 2 fmas can
2019// be fused for i16xi16 scheme, and 8 for i8xi8 scheme).
2020static void fuseFMAOpsForColumnTopology(func::FuncOp func, VectState *state) {
2021 // A set of FMA ops that were fused in the column topology
2022 llvm::SmallSet<Operation *, 8> fusedOpSet;
2023
2024 // Fuse FMA ops to exploit column topology
2025 func.walk([&](Operation *op) {
2026 if (isa<MulIOp, MulFOp, vector::FMAOp>(op)) {
2027 // Only process fma ops that are not already fused with another mul/fma
2028 if (!fusedOpSet.count(op)) {
2029 auto [lanes, cols] = getNumRowsAndCols(op, state);
2030 // Try fusing a linear chain of FMA ops (max length = cols) starting at
2031 // op.
2032 fuseFMAOps(op, fusedOpSet, cols, state);
2033 }
2034 }
2035 });
2036
2037 // Remove all the ops that were fused with other FMAs
2038 for (auto op : fusedOpSet)
2039 op->erase();
2040}
2041
2042template <typename T1, typename T2>
2043static bool matchAttributesAndDistanceForFusion(T1 curOp, T2 defOp) {
2044 return curOp.getOffset(0) == defOp.getOffset(0) &&
2045 curOp.getOffsetHi(0) == defOp.getOffsetHi(0) &&
2046 curOp.getSquare(0) == defOp.getSquare(0) &&
2047 curOp.getStep(0) == defOp.getStep(0) &&
2048 curOp.getOffset(1) == defOp.getOffset(1) &&
2049 curOp.getOffsetHi(1) == defOp.getOffsetHi(1) &&
2050 curOp.getSquare(1) == defOp.getSquare(1) &&
2051 curOp.getStep(1) == defOp.getStep(1) &&
2052 stoi(static_cast<std::string>(curOp.getStart(0))) -
2053 stoi(static_cast<std::string>(defOp.getStart(0))) ==
2054 2 &&
2055 stoi(static_cast<std::string>(curOp.getStart(1))) -
2056 stoi(static_cast<std::string>(defOp.getStart(1))) ==
2057 2;
2058}
2059
2060// We go through each fma operation and try to find the pattern like this-
2061// the acc of fma is a mul/fma operation which uses the same operands as fma.
2062// the def of two operands are upd operations.
2063// Transform -
2064// %5 = aievec_aie1.mul %4, %0 {xoffsets = "[[Xo:.*]]", xoffsets_hi =
2065// "[[Xh:.*]]", xsquare = "[[Sq:.*]]", xstart = "0", zoffsets = "[[Zo:.*]]",
2066// zoffsets_hi =
2067// "[[Zh:.*]]", zstart = "0", zstep = "[[Zs:.*]]"}
2068//
2069// %6 = aievec_aie1.mac %4, %0, %5 {xoffsets = "[[Xo:.*]]",
2070// xoffsets_hi = "[[Xh:.*]]", xsquare = "[[Sq:.*]]", xstart = "2", zoffsets =
2071// "[[Zo:.*]]", zoffsets_hi = "[[Zh:.*]]", zstart = "2", zstep = "[[Zs:.*]]"}
2072//
2073// to-
2074//
2075// %7 = aievec_aie1.mul_conv %6, %1 {M = 16 : si32, N = 4 : si32}
2076//
2077// or transform the pattern like this-
2078//
2079// %9 = aievec_aie1.mac %8, %0, %6 {xoffsets = "[[Xo:.*]]", xoffsets_hi =
2080// "[[Xh:.*]]", xsquare = "[[Sq:.*]]", xstart = "0", zoffsets = "[[Zo:.*]]",
2081// zoffsets_hi =
2082// "[[Zh:.*]]", zstart = "4", zstep = "[[Zs:.*]]"}
2083//
2084// %10 = aievec_aie1.mac %8, %0, %9 {xoffsets =
2085// "[[Xo:.*]]", xoffsets_hi = "[[Xh:.*]]", xsquare = "[[Sq:.*]]", xstart = "2",
2086// zoffsets = "[[Zo:.*]]", zoffsets_hi = "[[Zh:.*]]", zstart = "6", zstep =
2087// "[[Zs:.*]]"}
2088//
2089// to-
2090//
2091// %9 =
2092// aievec.fma_conv %8, %2, %7 {M = 16 : si32, N = 4 : si32}
2093// Currently, we only support mul_conv_16x4 and mac_conv_16x4 intrinsics for
2094// int16 type of AIE-ML architecture.
2095static bool canFuseMulFMAOpsForInt16(Operation *Op) {
2096 // Check 1. This should be an aievec fma operation
2097 assert(isa<aievec::aie1::FMAOp>(Op) && "operation must be an aievec fma op");
2098 auto curOp = cast<aievec::aie1::FMAOp>(Op);
2099
2100 // Check 2. Element type should be int16
2101 auto vType = cast<VectorType>(Op->getOperand(1).getType());
2102 Type stype = vType.getElementType();
2103 auto itype = llvm::dyn_cast<IntegerType>(stype);
2104
2105 if (!itype)
2106 return false;
2107
2108 if (unsigned width = itype.getWidth(); width != 16)
2109 return false;
2110
2111 // Check 3. acc operand of the Op should be a mul op or fma op
2112 Operation *mulOrFMAOp = Op->getOperand(2).getDefiningOp();
2113
2114 if (!isa<aievec::aie1::MulOp, aievec::aie1::FMAOp>(mulOrFMAOp))
2115 return false;
2116
2117 // Check 4. mulOrFMAOp must have one use
2118 if (!mulOrFMAOp->hasOneUse())
2119 return false;
2120
2121 // Check 5. mulOrFMAOp and Op must have the same lhs and rhs
2122 if (mulOrFMAOp->getOperand(0) != Op->getOperand(0) ||
2123 mulOrFMAOp->getOperand(1) != Op->getOperand(1))
2124 return false;
2125
2126 Value lhs = nullptr;
2127 Value rhs = nullptr;
2128 Value acc = nullptr;
2129 bool isMulOp = false;
2130
2131 // If the acc operand is a mul op, we will try to generate mul_conv operation
2132 // If the acc operand is a fma op, we will try to generate fma_conv operation
2133 if (auto mulOp = dyn_cast<aievec::aie1::MulOp>(mulOrFMAOp)) {
2134 isMulOp = true;
2135
2136 // Determine the lhs and rhs values for the mul_conv
2137 lhs = mulOp->getOperand(0);
2138 rhs = mulOp->getOperand(1);
2139 } else {
2140 auto fmaOp = cast<aievec::aie1::FMAOp>(mulOrFMAOp);
2141
2142 // Determine the lhs, rhs and acc values for the fma_conv
2143 lhs = fmaOp->getOperand(0);
2144 rhs = fmaOp->getOperand(1);
2145 acc = fmaOp->getOperand(2);
2146 }
2147
2148 // Check 6. The defs of the two operands must be upd operations
2149 auto lUpdOp = dyn_cast<aievec::UPDOp>(lhs.getDefiningOp());
2150 auto rUpdOp = dyn_cast<aievec::UPDOp>(rhs.getDefiningOp());
2151
2152 if (!lUpdOp || !rUpdOp) {
2153 return false;
2154 }
2155
2156 // Check 7. All the ops should belong to the same block, otherwise we might
2157 // not be able to fuse them safely
2158 if (lhs.getParentBlock() != rhs.getParentBlock())
2159 return false;
2160
2161 if (acc && rhs.getParentBlock() != acc.getParentBlock())
2162 return false;
2163
2164 // Check 8. The xstart and zstart distance between the two operations should
2165 // be 2, and their offsets, offsets_hi, square, and step should be the same.
2166 return (isMulOp && matchAttributesAndDistanceForFusion(
2167 curOp, cast<aievec::aie1::MulOp>(mulOrFMAOp))) ||
2168 matchAttributesAndDistanceForFusion(
2169 curOp, cast<aievec::aie1::FMAOp>(mulOrFMAOp));
2170}
2171
2172// Rewrite a mul/fma and fma op as a aievec MUL_conv or FMA_Conv op
2173static void fuseMulFMAOpsForInt16(Operation *Op, VectState *state) {
2174 auto curOp = cast<aievec::aie1::FMAOp>(Op);
2175
2176 Value lhs = curOp->getOperand(0);
2177
2178 // 1. Deal with the lhs:
2179 // lhs of current FMAOp should be an upd operation with 512-bit vector width.
2180 // For AIE-ML, we can directly load 512-bit vectors. Thus, we can delete the
2181 // upd operation with index 1.
2182 auto lUpdOp = dyn_cast<aievec::UPDOp>(lhs.getDefiningOp());
2183 if (lUpdOp.getIndex() == 1) {
2184 auto lUpdOp0 = dyn_cast<aievec::UPDOp>(lUpdOp.getVector().getDefiningOp());
2185 lUpdOp->replaceAllUsesWith(lUpdOp0);
2186 lUpdOp->erase();
2187 }
2188
2189 // 2. Deal with the rhs:
2190 // Since the vector size of the current FMAOp's rhs is 256 bits, we need to
2191 // generate a concat op to widen it to 512 bits.
2192 auto rUpdOp = dyn_cast<aievec::UPDOp>(curOp->getOperand(1).getDefiningOp());
2193 state->builder.setInsertionPointAfter(rUpdOp);
2194 AIEVecAttributes rstat = getOperandVecStats(curOp, state, 1);
2195 assert(rstat.vecSizeInBits % 256 == 0);
2196 Value concatRhs = nullptr;
2197
2198 if (rstat.vecSizeInBits == 256) {
2199 VectorType concatType =
2200 createVectorType(512 / rstat.elementSizeInBits, rstat.elementType);
2201 SmallVector<Value> sources = {rUpdOp->getResult(0), rUpdOp->getResult(0)};
2202 concatRhs = generateConcatOp(sources, state, rUpdOp->getLoc(), concatType);
2203 }
2204
2205 // Get the def op of acc. It is either a mul op or a fma op.
2206 Operation *convOp = nullptr;
2207 Operation *mulOrFMAOp = Op->getOperand(2).getDefiningOp();
2208 auto mulOp = dyn_cast<aievec::aie1::MulOp>(mulOrFMAOp);
2209 auto fmaOp = dyn_cast<aievec::aie1::FMAOp>(mulOrFMAOp);
2210 int32_t zStart;
2211
2212 if (mulOp) {
2213 aievec::aie1::MulOp defOp = mulOp;
2214 zStart = stoi(static_cast<std::string>(defOp.getStart(1)));
2215 } else {
2216 aievec::aie1::FMAOp defOp = fmaOp;
2217 zStart = stoi(static_cast<std::string>(defOp.getStart(1)));
2218 }
2219
2220 auto vType = cast<VectorType>(Op->getOperand(1).getType());
2221 int32_t shiftBytes = zStart * getElementSizeInBits(vType) / 8;
2222
2223 auto defOp = mulOp ? mulOp : fmaOp;
2224 state->builder.setInsertionPoint(defOp);
2225 Location loc = defOp->getLoc();
2226
2227 // Generate a shift_bytes operation for concatRhs if needed.
2228 if (shiftBytes)
2229 concatRhs = generateShiftOp(concatRhs, concatRhs, shiftBytes, state, loc);
2230
2231 Type stype = vType.getElementType();
2232 auto itype = cast<IntegerType>(stype);
2233 unsigned width = itype.getWidth() <= 8 ? 32 : 64;
2234 Type ctype = IntegerType::get(itype.getContext(), width);
2235 Type opType = VectorType::get(vType.getShape(), ctype);
2236 Value acc = nullptr;
2237 // Currently, we only support 16x4 convolution intrinsics for the int16 type
2238 // on AIE-ML.
2239 int32_t M = itype.getWidth();
2240 int32_t N = 4;
2241 // Update lhs value, since it has been changed after we deleted the upd
2242 // operation with index 1
2243 lhs = curOp->getOperand(0);
2244
2245 if (mulOp)
2246 convOp = state->builder.create<aievec::MulConvOp>(loc, opType, lhs,
2247 concatRhs, M, N);
2248 else {
2249 acc = defOp->getOperand(2);
2250 bool isSub = state->mscOps.count(defOp);
2251 convOp = state->builder.create<aievec::FMAConvOp>(
2252 loc, opType, lhs, concatRhs, acc, M, N, isSub);
2253 }
2254
2255 Op->replaceAllUsesWith(convOp);
2256 Op->erase();
2257 defOp->erase();
2258}
2259
2260static void fuseMulFMAOpsByMulFMAConv(func::FuncOp func, VectState *state) {
2261 func.walk([&](Operation *Op) {
2262 if (isa<aievec::aie1::FMAOp>(Op) && canFuseMulFMAOpsForInt16(Op))
2263 fuseMulFMAOpsForInt16(Op, state);
2264 });
2265}
2266
2267// Generate the AIE mul/fma op for each vector mul/fma op. This function is the
2268// crux of AIE vectorization. It accomplishes two main tasks: (1) For each
2269// mul/fma operation, compute the operand attributes. The attributes are start,
2270// offsets, square, step, etc. based on the scheme; and (2) Once all the
2271// attributes are computed, generate appropriate mul/fma operation in AIE
2272// dialect.
2273static void generateAIEMulOrFMAOpsInFunc(func::FuncOp func, VectState *state) {
2274 // For each mul/fma op, compute the scheme-dependent operand attributes, and
2275 // generate corresponding AIE dialect ops.
2276 func.walk([&](Operation *op) {
2277 if (isa<MulIOp, MulFOp, vector::FMAOp>(op))
2278 generateSchemeBasedMulOrFMAOp(op, state);
2279 });
2280}
2281
2282// Given the operation attributes (start, offset, square, etc.), generate an
2283// AIE add/sub op for the incoming vector add/sub Op.
2284static void generateAddOrSubOp(Operation *Op, AIEOpAttributes &opAttr,
2285 VectState *state) {
2286
2287 // Set insertion point of the AIE dialect add/sub op
2288 state->builder.setInsertionPointAfter(Op);
2289
2290 // Generate an AIE dialect add/sub op
2291 Operation *repOp = nullptr;
2292 if (isa<SubIOp, SubFOp>(Op)) {
2293 repOp = generateSubOp(Op, opAttr, state);
2294 LLVM_DEBUG(llvm::dbgs() << "\n\nGenerated AIE dialect sub op " << *repOp);
2295 } else {
2296 repOp = generateAddOp(Op, opAttr, state);
2297 LLVM_DEBUG(llvm::dbgs() << "\n\nGenerated AIE dialect add op " << *repOp);
2298 }
2299
2300 // Replace all the uses of the vector add/sub op with the AIE add/sub op, and
2301 // remove Op from the IR.
2302 Op->replaceAllUsesWith(repOp);
2303 Op->erase();
2304}
2305
2306// For this add/sub operator, generate AIE dialect add/sub op based on
2307// different vector schemes.
2308static void generateSchemeBasedAddOrSubOp(Operation *Op, VectState *state) {
2309 // opAttr will cache the attributes (start, offsets, square, etc.) for both
2310 // lhs and rhs operands.
2311 AIEOpAttributes opAttr;
2312
2313 // First check if this operation requires a simple vector operation rather
2314 // than an advanced scheme.
2315 if (isSimpleVectIntrinsic(Op, state)) {
2316 // For simple scheme, we do not need any attribute
2317 for (size_t idx = 0; idx < 2; ++idx) {
2318 opAttr.start.push_back("");
2319 opAttr.offset.push_back("");
2320 opAttr.offset_hi.push_back("");
2321 opAttr.square.push_back("");
2322 }
2323 generateAddOrSubOp(Op, opAttr, state);
2324 return;
2325 }
2326
2327 // Otherwise generate add/sub op based on advanced scheme.
2328 // Compute relevant attributes (start, offsets, square, etc.) for each
2329 // operand, and store them in opAttr.
2330 for (size_t idx = 0; idx < 2; ++idx) {
2331 AIEVecAttributes stat = getOperandVecStats(Op, state, idx);
2332 assert(stat.elementSizeInBits >= 16 &&
2333 "advanced scheme for add op on int8 data type not supported");
2334
2335 int32_t start = 0, accIncr = 1;
2336 std::string startStr;
2337 std::string offsetStr, offsetHiStr;
2338 std::string squareStr;
2339
2340 // If the operand comes from transfer_read, compute the loop step and start
2341 // values.
2342 if (stat.loadFromMemory) {
2343 Operation *op = Op->getOperand(idx).getDefiningOp();
2344 auto readOp = cast<TransferReadOp>(op);
2345 // How does the access change with each iteration of the vectorized loop?
2346 accIncr = stat.isSplat ? 0 : computeVecorizedLoopStepSize(readOp, state);
2347 // start in the AIE vector
2348 start = computeStartInAIEVec(op, state);
2349 }
2350 // Now the usual processing. For i32 datatype, use the regular lane
2351 // selection.
2352 if (stat.elementSizeInBits == 32) {
2353 startStr = std::to_string(start);
2354 offsetStr = "0x";
2355 for (int i = 7; i >= 0; --i)
2356 offsetStr.push_back(getHexValue(i * accIncr));
2357 // If there are >8 lanes, we need to compute offset_hi
2358 if (stat.lanes > 8) {
2359 assert(stat.lanes == 16 && "Cannot generate offset for add/sub op");
2360 // Cannot have loop stride > 1
2361 assert(accIncr <= 1 && "Cannot generate offset for given loop stride");
2362 offsetHiStr = "0x";
2363 for (int i = 15; i >= 8; --i)
2364 offsetHiStr.push_back(getHexValue(i * accIncr));
2365 }
2366 } else if (stat.elementSizeInBits == 16) {
2367 assert(accIncr <= 1 && "cannot generate offset for given loop stride");
2368 // start must be a multiple of 2 for i16 data type
2369 int32_t m2Offset = start % 2;
2370 startStr = std::to_string(start - m2Offset);
2371 // We must compute the offset and offset_hi only if the access is not
2372 // splat. For splat, we can use trivial offsets.
2373 if (accIncr == 0)
2374 offsetStr = offsetHiStr = "0";
2375 else {
2376 offsetStr = "0x";
2377 for (int i = 6; i >= 0; i -= 2) {
2378 offsetStr.push_back('0');
2379 offsetStr.push_back(getHexValue((i * accIncr) / 2));
2380 }
2381 offsetHiStr = "0x";
2382 for (int i = 14; i >= 8; i -= 2) {
2383 offsetHiStr.push_back('0');
2384 offsetHiStr.push_back(getHexValue((i * accIncr) / 2));
2385 }
2386 }
2387 // We use a simplistic square that covers only two cases: access is
2388 // splat, and access is regular with stride that's power of 2.
2389 if (m2Offset == 0 && accIncr == 0)
2390 squareStr = "0";
2391 else {
2392 assert(m2Offset == 0 || accIncr == 0);
2393 squareStr = "0x";
2394 int32_t astep = std::min(1, accIncr);
2395 SmallVector<int32_t> sqPattern = {3 * astep, 2 * astep, astep, 0};
2396 for (auto sq : sqPattern)
2397 squareStr.push_back(getHexValue(sq + m2Offset));
2398 }
2399 } else
2400 llvm_unreachable("Cannot generate advanced add op for given datatype");
2401
2402 // We have computed all the fields. Cache the attributes.
2403 opAttr.start.push_back(startStr);
2404 opAttr.offset.push_back(offsetStr);
2405 opAttr.offset_hi.push_back(offsetHiStr);
2406 opAttr.square.push_back(squareStr);
2407 }
2408 // And now generate the add/sub op
2409 generateAddOrSubOp(Op, opAttr, state);
2410}
2411
2412// The main focus of this function is to compute the right start/offset fields
2413// for the adds involving splat. If none of the operands of the add op is
2414// splat, we must generate simple scheme add op.
2415static void generateAIEAddOrSubOpsInFunc(func::FuncOp func, VectState *state) {
2416 func.walk([&](Operation *op) {
2417 if (isa<AddIOp, AddFOp, SubIOp, SubFOp>(op))
2418 generateSchemeBasedAddOrSubOp(op, state);
2419 });
2420}
2421
2422// Generate UPD ops to subsume all the transfer_read ops of affine dialect. To
2423// generate the UPD ops, we first visit the innermost for op, and for each
2424// transfer_read instruction nested inside that op, create a set of UPD ops,
2425// and then insert them in the front bb of that for op's region.
2426static void insertUPDOpsInLoop(affine::AffineForOp forOp, VectState *state) {
2427 // Recursively generate UPD ops in the nested for op's.
2428 for (affine::AffineForOp nestedOp :
2429 forOp.getRegion().getOps<affine::AffineForOp>())
2430 insertUPDOpsInLoop(nestedOp, state);
2431
2432 // A map from an interval to the UPD op. The key gives the interval that
2433 // should be loaded into the AIE vec, and the value indicates the UPD op
2434 // achieving that. The value also has an 8-bit field, whose first/second bit
2435 // is set if upd op idx=0/idx=1 is already created for this interval.
2436 mlir::DenseMap<std::tuple<IntervalReuse *, int32_t, int32_t>,
2437 std::pair<aievec::UPDOp, int8_t>>
2438 memToUpdMap;
2439 // A map from a read operation to its corresponding UPD operation. The idea
2440 // is that multiple read ops will derive from the same bigger vector
2441 // register.
2442 mlir::DenseMap<Operation *, aievec::UPDOp> readOpToUpdMap;
2443 // Iterate over all the transfer_read ops within this loop
2444 Region &region = forOp.getRegion();
2445 for (TransferReadOp readOp : region.getOps<TransferReadOp>()) {
2446 aievec::UPDOp updOp = generateUPDOp(readOp, memToUpdMap, region, state);
2447 readOpToUpdMap[readOp] = updOp;
2448 }
2449
2450 // Now replace all the uses of a transfer_read op with its UPD op
2451 for (auto &map : readOpToUpdMap) {
2452 Operation *op = map.first;
2453 op->replaceAllUsesWith(map.second);
2454 op->erase();
2455 }
2456}
2457
2458// Replace all the transfer_read ops with UPD ops in the function.
2459static void insertUPDOpsInFunc(func::FuncOp func, VectState *state) {
2460 for (affine::AffineForOp forOp : func.getOps<affine::AffineForOp>()) {
2461 insertUPDOpsInLoop(forOp, state);
2462 }
2463}
2464
2465// Incoming Op is an operation in AIE dialect whose result is an accumulator.
2466// Check all its uses, and if any user of Op is a non-AIE operation, insert an
2467// SRS instruction to move the value from accumulator to vector.
2468static void insertSRSOp(Operation *Op, VectState *state) {
2469 // This operation must have at least one use, and at least one result
2470 if (Op->use_empty() || Op->getNumResults() == 0)
2471 return;
2472
2473 // The operation must write to an accumulator
2474 assert(writesToAccumulator(Op));
2475
2476 // Check if any user of this operation is a non-AIE op. If any user is a
2477 // non-AIE op, then we need to generate an SRS op to move the value from the
2478 // accumulator to a vector
2479 auto isNonAIEOp = [&](Operation *op) { return !isAIEOp(op); };
2480 if (!any_of(Op->getUsers(), isNonAIEOp))
2481 return;
2482
2483 // Given an accumulator, one can use different srs intrinsic to generate
2484 // different output types. Create a map from SRS output type to the SRS op.
2485 mlir::DenseMap<Type, aievec::SRSOp> typeToSRSOpMap;
2486
2487 // Set the insertion point for the AIE dialect SRS op
2488 state->builder.setInsertionPointAfter(Op);
2489
2490 // Iterate over all the users of this operation that are not in AIE dialect,
2491 // and replace the use of Op in them with srsOp
2492 for (auto user : Op->getUsers()) {
2493 // Skip AIE ops
2494 if (isAIEOp(user))
2495 continue;
2496
2497 // Get the underlying scalar element type of user op. If the user is a
2498 // write op, it won't have a result. So get the element type from memref.
2499 Type scalarType;
2500 MemRefType memRefType = nullptr;
2501 if (auto writeOp = dyn_cast<TransferWriteOp>(user)) {
2502 // Get the element type from the memref output
2503 memRefType = cast<MemRefType>(writeOp.getSource().getType());
2504 scalarType = memRefType.getElementType();
2505 } else
2506 scalarType = getElementTypeOrSelf(*user->getResultTypes().begin());
2507 assert(scalarType && "failed to form SRS op");
2508 // Iterate over all the operands of this user, and find the ones that
2509 // correspond to the Op.
2510 for (auto operand : user->getOperands()) {
2511 if (operand.getDefiningOp() == Op) {
2512 // Generate an AIE-ML cast op for the case where the result vector width is
2513 // less than or equal to the source vector width
2514 if (state->aieml && memRefType &&
2515 cast<VectorType>(Op->getOperand(0).getType())
2516 .getElementType()
2517 .getIntOrFloatBitWidth() == 8 &&
2518 cast<VectorType>(Op->getResult(0).getType())
2519 .getElementType()
2520 .getIntOrFloatBitWidth() ==
2521 scalarType.getIntOrFloatBitWidth()) {
2522 unsigned lanes =
2523 getVectorLaneSize(cast<VectorType>(Op->getResult(0).getType()));
2524 VectorType castType = createVectorType(lanes, scalarType);
2525 aievec::CastOp castOp = generateCastOp(Op->getResult(0), castType,
2526 false, state, Op->getLoc());
2527 assert(castOp && "Failed to create Cast intrinsic");
2528 user->replaceUsesOfWith(operand, castOp);
2529 break;
2530 }
2531 aievec::SRSOp srsOp;
2532 if (!typeToSRSOpMap.count(scalarType)) {
2533 srsOp =
2534 generateSRSOp(Op->getResult(0), scalarType, state, Op->getLoc());
2535 LLVM_DEBUG(llvm::dbgs() << "\n\nCreated SRS op " << srsOp
2536 << " for the acc output of operation " << Op);
2537 typeToSRSOpMap[scalarType] = srsOp;
2538 } else
2539 srsOp = typeToSRSOpMap[scalarType];
2540 assert(srsOp && "Failed to create SRS intrinsic");
2541 // And now we replace the operand with srsOp
2542 user->replaceUsesOfWith(operand, srsOp);
2543 }
2544 }
2545 }
2546}
2547
2548// Generate SRS op whenever we move data from an accumulator AIE dialect to a
2549// vector.
2550static void insertSRSOpsInFunc(func::FuncOp func, VectState *state) {
2551 func.walk([&](Operation *op) {
2552 // Insert an SRS op if the op outputs to an accumulator
2553 if (writesToAccumulator(op))
2554 insertSRSOp(op, state);
2555 });
2556}
2557
2558// Set existing read/write op to in-bounds, indicating that it always reads
2559// from/writes to a full buffer. We make this assumption for our vectorization
2560// framework.
2561template <typename TransferOp>
2562static void setInBounds(TransferOp op) {
2563 if (op.getTransferRank() == 0)
2564 return;
2565 SmallVector<bool, 4> bools(op.getTransferRank(), true);
2566 OpBuilder b(op.getContext());
2567 op->setAttr(op.getInBoundsAttrName(), b.getBoolArrayAttr(bools));
2568}
2569
2570// Remove redundant vector load/stores (i.e., transfer ops) that could be
2571// generated post unolling. The redundant operations are removed in two steps:
2572// first, we do a store to load forwarding. This removes the loads that
2573// immediately succeed a store to the same location. Then it removes multiple
2574// stores to the same memory location without an interfering store to that
2575// memref. The only preserves the last write. These transformations are already
2576// implemented in 'transferOpflowOpt' function. But these transformations only
2577// work on reads/writes that are within bounds. We safely assume that for AIE
2578// vectorization, all the transfer reads/writes are within bounds.
2579static void redundantLoadStoreOptimization(ModuleOp module) {
2580 for (func::FuncOp func : module.getOps<func::FuncOp>()) {
2581 // Mark all the transfer ops that have empty in_bounds as inbound
2582 func.walk([&](Operation *Op) {
2583 if (auto readOp = dyn_cast<TransferReadOp>(Op)) {
2584 if (!readOp.getInBounds())
2585 setInBounds<TransferReadOp>(readOp);
2586 } else if (auto writeOp = dyn_cast<TransferWriteOp>(Op)) {
2587 if (!writeOp.getInBounds())
2588 setInBounds<TransferWriteOp>(writeOp);
2589 }
2590 });
2591 // Now that all the transfer ops are marked inbound, remove redundant
2592 // vector loads/stores
2593 IRRewriter rewriter(module.getContext());
2594 vector::transferOpflowOpt(rewriter, func);
2595 }
2596}
2597
2598// Run a pre pipeline of cleanup passes (canonicalizer). Remove redundant
2599// load/store operations in case the code was generated via unrolling
2600static void preCanonicalizeIR(ModuleOp module) {
2601 PassManager pm(module.getContext());
2602 pm.addPass(createCanonicalizerPass());
2603 [[maybe_unused]] bool success = pm.run(module).succeeded();
2604 assert(success);
2605 redundantLoadStoreOptimization(module);
2606}
2607
2608// Run a post pipeline of cleanup and optimization passes (canonicalizer, LICM,
2609// CSE, etc). At the end, lower the output from affine to scf, so that we can
2610// use EmitC functionality to generate the loops.
2611static void postCanonicalizeIR(ModuleOp module) {
2612 PassManager pm(module.getContext());
2613 pm.addPass(createCanonicalizerPass());
2614 pm.addPass(createCSEPass());
2615 pm.addPass(createLoopInvariantCodeMotionPass());
2616 pm.addPass(createLowerAffinePass());
2617 [[maybe_unused]] bool success = pm.run(module).succeeded();
2618 assert(success);
2619}
2620
2621// Iterate over the loop nestings to form loop nesting bands. Then for each
2622// block within those bands, the enclosingLoops is set to the loop band.
2623static void
2624computeEnclosingLoopsPerBlock(affine::AffineForOp forOp, VectState *state,
2625 SmallVector<Operation *, 8> &enclosingLoops) {
2626 // Form the loop band for nested for ops
2627 for (affine::AffineForOp nestedOp :
2628 forOp.getRegion().getOps<affine::AffineForOp>()) {
2629 enclosingLoops.push_back(nestedOp);
2630 computeEnclosingLoopsPerBlock(nestedOp, state, enclosingLoops);
2631 enclosingLoops.pop_back();
2632 }
2633
2634 // Iterate over all the transfer_read operations enclosed within the current
2635 // region, and store the for loop nesting for the read op.
2636 for (TransferReadOp readOp : forOp.getRegion().getOps<TransferReadOp>()) {
2637 // Find the block corresponding to this transfer_read
2638 Block *block = readOp->getBlock();
2639 state->blockToEnclosingLoops[block] = enclosingLoops;
2640 }
2641}
2642
2643// We reorder the operands involved in multiplication so that (1) the splat
2644// operand is always the second operand, and (2) the bigger vector is the first
2645// operand. This allows us to form the FMA intrinsic for AIE. The only exception
2646// to this rule is the 8x8 bit scheme, where the xbuff is a bit more
2647// restrictive, so we prefer the splat as the left operand of the multiplication.
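// As a hedged illustration (operand names assumed): if %s comes from a splat
// transfer_read and %v from a regular one, 'arith.muli %s, %v' is reordered to
// 'arith.muli %v, %s' so the splat becomes the second operand (the 8x8 scheme
// instead keeps the splat on the left).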
2648static void reassociateMulOpInFunc(func::FuncOp func, VectState *state) {
2649 func.walk([&](Operation *op) {
2650 // Only reassociate vector mul ops that are well formed. This also includes
2651 // the multiplication component in fma ops.
2652 if (isa<MulIOp, MulFOp, vector::FMAOp>(op) && isWellFormedVectorOp(op)) {
2653 // 1. Reassociate so that splat is in the correct place
2654 reassociateMulOpWithSplat(op, state);
2655
2656 // 2. Reassociate so that bigger vector is the first operand
2657 reassociateMulOpBasedOnVecSize(op, state);
2658 }
2659 });
2660}
2661
2662// This is a very simple function that looks for an add op of the form {a=b*c;
2663// d = a+e;} and reassociates it so that the operand that computes the mul is
2664// the right operand of the add op. This is a syntactic transformation that
2665// uses the commutativity of the add op, and is only applied so that we can
2666// leverage the same code functionality for generating mac and msc ops.
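// A hedged illustration (operand names assumed): given
//   %a = arith.mulf %b, %c : vector<8xf32>
//   %d = arith.addf %a, %e : vector<8xf32>
// the add is rewritten as 'arith.addf %e, %a' so that the mul result sits on
// the rhs of the add.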
2667static void reassociateAddOpInFunc(func::FuncOp func, VectState *state) {
2668 func.walk([&](Operation *op) {
2669 // Only reassociate vector add ops that are well formed.
2670 if (isa<AddIOp, AddFOp>(op) && isWellFormedVectorOp(op)) {
2671 // addOp must have two operands and one result
2672 assert(op->getNumOperands() == 2 && op->getNumResults() == 1);
2673
2674 // Determine which operand is the multiply
2675 Operation *rhsOp = getOperandDefOp(state, op, 1);
2676 Value left =
2677 state->sextTruncDefMap.count(op->getOperand(0).getDefiningOp())
2678 ? op->getOperand(0).getDefiningOp()->getOperand(0)
2679 : op->getOperand(0);
2680 Value right =
2681 state->sextTruncDefMap.count(op->getOperand(1).getDefiningOp())
2682 ? op->getOperand(1).getDefiningOp()->getOperand(0)
2683 : op->getOperand(1);
2684 // If rhs is mul operand, no need to proceed further
2685 if (!isa<MulIOp, MulFOp>(rhsOp)) {
2686 Operation *lhsOp = getOperandDefOp(state, op, 0);
2687 // If lhs is the mul operand, do the switcharoo
2688 if (isa<MulIOp, MulFOp>(lhsOp)) {
2689 LLVM_DEBUG(llvm::dbgs() << "\n\nReassociating addOp " << *op
2690 << " to place mul as rhs operand");
2691 op->setOperand(0, right);
2692 op->setOperand(1, left);
2693 LLVM_DEBUG(llvm::dbgs() << "\n\taddOp after reassociation: " << *op);
2694 }
2695 } else {
2696 op->setOperand(0, left);
2697 op->setOperand(1, right);
2698 }
2699 }
2700 });
2701}
2702
2703// For i8xi8 scheme, the lhs operand vector size could be <= 256 bits, but the
2704// intrinsic requires the lhs operand vector to be at least 512 bits.
2705// Therefore, we check each read op, and (1) if it only appears in the LHS of a
2706// mul/fma op, and (2) its interval width is <= 256 bits, we tag the vector
2707// corresponding to it. Then we can try to coalesce two consecutive tagged
2708// intervals (i.e., vectors) in each IntervalReuse object. This removes the
2709// need for an extra vector and two concat ops.
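// As a hedged, illustrative example: two consecutive 256-bit intervals that
// are both tagged as LHS-only operands can be coalesced into one 512-bit
// interval, so the mul/fma lhs is loaded as a single wide vector instead of
// being assembled from two halves with concat ops.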
2710static void coalesceLHSOpVectorsInFunc(func::FuncOp func, VectState *state) {
2711 // Iterate over all the transfer read ops in this function
2712 func.walk([&](TransferReadOp op) {
2713 // Iterate over all the users of this read operation. We want to identify
2714 // if this read op only appears as an LHS operand of a mul/fma op.
2715 bool onlyLHS = true;
2716 for (auto user : op->getUsers()) {
2717 if (!isa<MulIOp, MulFOp, vector::FMAOp>(user) ||
2718 user->getOperand(0).getDefiningOp() != op) {
2719 onlyLHS = false;
2720 break;
2721 }
2722 }
2723 // If this read op only appears as LHS operand of mul/fma op, we find the
2724 // IntervalReuse object this op belongs to, and tag the interval (i.e.,
2725 // vector) subsuming this read op's access extent.
2726 if (onlyLHS) {
2727 IntervalReuse *iv = state->getIntervalForOperation(op);
2728 iv->markLHSOperandVec(op);
2729 }
2730 });
2731
2732 // All the tagging is done. Now iterate over all the IntervalReuse objects.
2733 // If any of them has a tagged vector, try to coalesce the tagged vectors.
2734 for (auto interval : state->reuseIntervals) {
2735 interval->coalesceIntervals();
2736 }
2737}
2738
2739// Go through sext/trunc operations and record each operand's defining operation.
2740static void recordSextOps(func::FuncOp func, VectState *state) {
2741 func.walk([&](ExtSIOp op) {
2742 state->sextTruncDefMap[op] = op->getOperand(0).getDefiningOp();
2743 });
2744 func.walk([&](TruncIOp op) {
2745 state->sextTruncDefMap[op] = op->getOperand(0).getDefiningOp();
2746 });
2747}
2748
2749// For each read operation, compute the potential vector-level data reuse we
2750// can exploit for it.
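// A hedged, illustrative example (shapes assumed, not from this file): for a
// read of A[%i][%j + 4] from memref<64x64xi32>, the linearized access is
// 64*i + j + 4, which decomposes into base (64*i + j) and constant offset 4.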
2751static void computeReuse(TransferReadOp readOp, VectState *state) {
2752 // Construct a linearized access expression for the transfer_read
2753 AffineExpr linearAccess = constructLinearizedAffineExpr(readOp, state);
2754 // Decompose the linear access into a base and constant offset value
2755 auto [base, offset] = getBaseAndOffset(linearAccess);
2756
2757 // Get the step size of the vectorized loop that encloses this read operation
2758 int32_t step = computeVecorizedLoopStepSize(readOp, state);
2759
2760 // If the permutation map is a constant, the read operation is a splat
2761 bool isSplat = readOp.getPermutationMap().isConstant();
2762
2763 // Check if this readOp is the lhs or rhs operand of a mul/fma op. If it is,
2764 // then the vector size corresponding to its access extent should be at least
2765 // 256 bits. Otherwise, AIE vectors are at least 128 bits.
2766 unsigned minVecSize = 128;
2767 for (auto user : readOp->getUsers()) {
2768 if (isa<MulIOp, MulFOp, vector::FMAOp>(user)) {
2769 if (user->getOperand(0).getDefiningOp() == readOp ||
2770 user->getOperand(1).getDefiningOp() == readOp) {
2771 minVecSize = 256;
2772 break;
2773 }
2774 }
2775 if (isa<ExtSIOp>(user)) {
2776 auto extsiOp = cast<ExtSIOp>(user);
2777 for (auto consumer : extsiOp->getUsers()) {
2778 if (isa<MulIOp, MulFOp, vector::FMAOp>(consumer)) {
2779 if ((state->sextTruncDefMap.count(
2780 consumer->getOperand(0).getDefiningOp()) &&
2781 state->sextTruncDefMap[consumer->getOperand(0)
2782 .getDefiningOp()] == readOp) ||
2783 (state->sextTruncDefMap.count(
2784 consumer->getOperand(1).getDefiningOp()) &&
2785 state->sextTruncDefMap[consumer->getOperand(1)
2786 .getDefiningOp()] == readOp)) {
2787 minVecSize = 256;
2788 break;
2789 }
2790 }
2791 }
2792 }
2793 }
2794
2795 auto vecType = cast<VectorType>(readOp.getVector().getType());
2796 if (state->aieml && (getVectorSizeInBits(vecType) == 512 ||
2797 getElementSizeInBits(vecType) == 8)) {
2798 minVecSize *= 2;
2799 }
2800
2801 bool found = false;
2802 // Iterate over all the IntervalReuse objects created thus far. Each object
2803 // represents a group of reads that have a potential of vector-level data
2804 // reuse. If we find an interval that (1) accesses an array with the same
2805 // base, and (2) has other operations enclosed within the same set of loops
2806 // as this operation, then we have the cluster of read ops that this op must
2807 // be grouped with.
2808 for (auto interval : state->reuseIntervals) {
2809 // Check if reuse is discovered
2810 if (interval->potentialReuse(readOp, base, state->blockToEnclosingLoops)) {
2811 // If the reuse is found with other operations in interval, add this
2812 // operation to interval.
2813 interval->insertInterval(readOp, state->opToIntervalMap, offset, step,
2814 isSplat, minVecSize);
2815 found = true;
2816 break;
2817 }
2818 }
2819 // If no reuse is found, create a new IntervalReuse object with just this
2820 // operation's read access extent.
2821 if (!found) {
2822 auto iv = new IntervalReuse(readOp, base);
2823 iv->insertInterval(readOp, state->opToIntervalMap, offset, step, isSplat,
2824 minVecSize);
2825 state->reuseIntervals.push_back(iv);
2826 }
2827}
2828
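// Check whether 'readOp' may be an unaligned vector load. As a hedged,
// illustrative example (shapes assumed): an 8-lane read from memref<64x64xi32>
// inside a loop with step 8 passes the checks below, whereas a loop step of 4
// or an inner dimension of size 60 would be rejected, since neither is
// divisible by the 8 vector lanes.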
2829static LogicalResult isUnalignedLoad(TransferReadOp readOp, VectState *state) {
2830 auto vectorType = cast<VectorType>(readOp.getResult().getType());
2831 unsigned lanes = getVectorLaneSize(vectorType);
2832
2833 AffineExpr linearAccess = constructLinearizedAffineExpr(readOp, state);
2834 if (linearAccess.isSymbolicOrConstant()) {
2835 return success();
2836 }
2837
2838 auto memRefType = cast<MemRefType>(readOp.getSource().getType());
2839 MLIRContext *context = memRefType.getContext();
2840 ArrayRef<int64_t> sizes = memRefType.getShape();
2841 int numDims = sizes.size();
2842
2843 auto block = readOp->getBlock();
2844 assert(state->blockToEnclosingLoops.count(block) &&
2845 "enclosing loops should have been computed for the read operation\n");
2846 auto enclosingLoops = state->blockToEnclosingLoops[block];
2847
2848 SmallVector<Value, 4> indices(readOp.getIndices().begin(),
2849 readOp.getIndices().end());
2850
2851 // If the lowest dim is indexed by a loop induction variable, check whether
2852 // the corresponding loop step is divisible by the number of vector lanes.
2853 if (auto dimExpr =
2854 dyn_cast<AffineDimExpr>(getAffineDimExpr(numDims - 1, context))) {
2855 auto index = indices[dimExpr.getPosition()];
2856 // Iterate over all the enclosing loops, and find the one whose
2857 // induction variable the access index depends on.
2858 for (auto loop : enclosingLoops) {
2859 auto affineForOp = cast<affine::AffineForOp>(loop);
2860 auto iv = affineForOp.getInductionVar();
2861 auto invariants = affine::getInvariantAccesses(iv, indices);
2862
2863 if (!invariants.count(index)) {
2864 int step = affineForOp.getStepAsInt();
2865 if (step % lanes) {
2866 return readOp->emitError()
2867 << "Loop step of inner index of " << readOp->getName()
2868 << " is not divisible by number of vector lanes.";
2869 }
2870
2871 // To avoid generating code that produces wrong results due to a
2872 // misaligned loop upper bound, we also check whether the constant
2873 // offset in the upper bound's affine map is divisible by the number
2874 // of vector lanes.
2875 affine::AffineBound ub = affineForOp.getUpperBound();
2876 AffineMap origUbMap = ub.getMap();
2877 if (!origUbMap.isEmpty() && !origUbMap.isConstant()) {
2878 AffineExpr origUbMapResult = origUbMap.getResult(0);
2879 AffineExpr base;
2880 int32_t offset;
2881 std::tie(base, offset) = getBaseAndOffset(origUbMapResult);
2882 if (offset % lanes) {
2883 return readOp->emitError()
2884 << "Loop upper bound's affine map offset of inner index of "
2885 << readOp->getName()
2886 << " is not divisible by number of vector lanes.";
2887 }
2888 }
2889 }
2890 }
2891 }
2892
2893 // For the higher dimensions, check whether the lower dimensions' shape
2894 // sizes are divisible by the number of vector lanes.
2895 for (int i = 1; i < numDims; ++i) {
2896 // Skip checking the higher dimensions with dynamic size.
2897 if (sizes[i] == -1) {
2898 continue;
2899 }
2900
2901 if (sizes[i] % lanes) {
2902 return readOp->emitError()
2903 << readOp->getName() << "'s shape size of index " << i
2904 << " is not divisible by number of vector lanes.";
2905 }
2906 }
2907
2908 return success();
2909}
2910
2911static LogicalResult hasUnalignedLoads(func::FuncOp func, VectState *state) {
2912 WalkResult result = func.walk([&](TransferReadOp op) {
2913 if (failed(isUnalignedLoad(op, state))) {
2914 return WalkResult::interrupt();
2915 }
2916 return WalkResult::advance();
2917 });
2918
2919 if (result.wasInterrupted()) {
2920 return failure();
2921 }
2922
2923 return success();
2924}
2925
2926// Compute the reuse interval for all the transfer_read operations. The
2927// transfer_read operations capture the vector loads. Since AIE only allows
2928// aligned vector loads, we need to compose multiple transfer reads together to
2929// form intervals of a certain width (128, 256, 512, or 1024 bits), and create
2930// an AIE vector from each interval.
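// As a hedged, illustrative example: i32 reads covering A[i][j:j+8] and
// A[i][j+8:j+16] can be grouped into a single 512-bit interval, so one wide
// load can feed both transfer_read ops.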
2931static void computeReuseInFunc(func::FuncOp func, VectState *state) {
2932 // Now we can cluster all the transfer_read ops that have a potential of
2933 // vector-level data reuse.
2934 func.walk([&](TransferReadOp op) { computeReuse(op, state); });
2935}
2936
2937// Rewrite a sequence of mul and add/sub {a = b*c; d = a+e;} as an FMA op
2938// {d = b*c+e;}. This step only rewrites to an FMA op in the vector dialect.
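// A hedged illustration (operand names assumed):
//   %a = arith.mulf %b, %c : vector<8xf32>
//   %d = arith.addf %e, %a : vector<8xf32>
// becomes
//   %d = vector.fma %b, %c, %e : vector<8xf32>
// still in the vector dialect; lowering to AIE mac/msc happens later.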
2939static void rewriteFMAOpsInFunc(func::FuncOp func, VectState *state) {
2940 // Find a root add op that is well formed, and start from there
2941 func.walk([&](Operation *Op) {
2942 if (isa<AddIOp, AddFOp, SubIOp, SubFOp>(Op) && isWellFormedVectorOp(Op)) {
2943 // Perform a series of checks to see if we can find a mul and add/sub
2944 // that can be fused into a FMA. If found, fuse.
2945 if (canFuseMulAndAddOrSubIntoFMAOp(Op, state))
2946 fuseMulAndAddOrSubIntoFMAOp(Op, state);
2947 }
2948 });
2949}
2950
2951// Assuming commutativity and associativity of add and mul ops, reassociate ops
2952// so that code generation becomes feasible/easier.
2953static void reassociateOpsInFunc(func::FuncOp func, VectState *state) {
2954 // We assume that pointwise multiplication is commutative. So correct the
2955 // order of operands involved in multiplication so that we can form AIE
2956 // mul/fma intrinsic.
2957 reassociateMulOpInFunc(func, state);
2958 // We assume that pointwise addition is commutative. If any operand of the
2959 // add op is a mul op, then we reassociate it to be the right operand of add
2960 // op. This change ensures that in the next step, when we form FMA ops, we
2961 // reuse the functionality for mac/msc ops.
2962 reassociateAddOpInFunc(func, state);
2963}
2964
2965struct AIEVectorize : AIEVectorizeBase<AIEVectorize> {
2966 AIEVectorize() = default;
2967 void runOnOperation() override;
2968};
2969
2970/// Generate AIE vector intrinsics for the current module. Assumption: the
2971/// input to this function is the mlir output generated after vectorizing the
2972/// scalar mlir input with affine superVectorizer. The vectorization factor
2973/// should be appropriately set to a power of 2 (e.g., 8 for i32xi32 scheme, 16
2974/// for i16xi16 scheme and i8xi8 scheme).
2975void AIEVectorize::runOnOperation() {
2976 // Verify the bounds of the incoming arguments
2977 assert(shiftParam < 64 && "SRS shift parameter should be between 0 and 63");
2978 assert(zeroOffset < 128 &&
2979 "Zero offset in the filter should be between 0 and 127");
2980 assert(dupFactor < 128 &&
2981 "Duplicate offset in the filter should be between 0 and 127");
2982
2983 ModuleOp module = getOperation();
2984
2985 // Canonicalize the incoming IR, mostly to simplify affine/compose apply ops
2986 preCanonicalizeIR(module);
2987
2988 // Iterate over all the functions in this module, and vectorize them
2989 for (func::FuncOp func : module.getOps<func::FuncOp>()) {
2990 // Create a new global state
2991 bool aieml = ::AIEML;
2992 bool unallignedCheck = ::unalignedLoadsCheck;
2993 if (this->unalignedLoadsCheck.hasValue())
2994 unallignedCheck = this->unalignedLoadsCheck;
2995 if (this->aieml.hasValue())
2996 aieml = this->aieml;
2997 auto *state = new VectState(func.getContext(), shiftParam, zeroOffset,
2998 dupFactor, unallignedCheck, aieml);
2999
3000 // Record the sext/trunc ops and their operands' defining ops in sextTruncDefMap
3001 recordSextOps(func, state);
3002
3003 // First compute the loops surrounding each load/store operation. This is
3004 // necessary to identify loads/stores that are nested together.
3005 for (auto forOp : func.getOps<affine::AffineForOp>()) {
3006 SmallVector<Operation *, 8> enclosingLoops;
3007 enclosingLoops.push_back(forOp);
3008 computeEnclosingLoopsPerBlock(forOp, state, enclosingLoops);
3009 }
3010
3011 // Check whether there are any unaligned loads.
3012 if (state->unalignedLoadsCheck && failed(hasUnalignedLoads(func, state))) {
3013 func.emitError() << "Cannot apply aie-vectorize to " << func->getName()
3014 << " because alignment check has failed.\n";
3015 return;
3016 }
3017
3018 // Compute the reuse for all the transfer_read operations, and form the
3019 // initial vector sizes.
3020 computeReuseInFunc(func, state);
3021 // We leverage the assumption that pointwise addition and multiplication
3022 // are commutative and associative to reassociate the operands of some
3023 // operators. This IR massaging makes it feasible to generate aie dialect
3024 // fma/msc intrinsics.
3025 reassociateOpsInFunc(func, state);
3026 // Rewrite vector dialect add and mul operation chains as vector dialect
3027 // fma operation if feasible.
3028 rewriteFMAOpsInFunc(func, state);
3029 // Coalesce vectors that only appear as LHS operands of mul/fma op if their
3030 // size is <= 256 bits.
3031 coalesceLHSOpVectorsInFunc(func, state);
3032 // Check for opportunities of fusing FMA ops to exploit the column topology
3033 // of the AIE vector intrinsic.
3034 fuseFMAOpsForColumnTopology(func, state);
3035 // For each vector dialect mul/fma op, compute the start and offset values
3036 // of its operands. Finally, generate AIE dialect mul/FMA ops.
3037 generateAIEMulOrFMAOpsInFunc(func, state);
3038 // Insert SRS ops to move data from accumulator to vector when the producer
3039 // is an AIE dialect op that writes to an accumulator, and the consumer
3040 // isn't an AIE dialect op.
3041 insertSRSOpsInFunc(func, state);
3042 // For each vector dialect add/sub op, compute the start and offset values
3043 // of its operands. Finally, generate AIE dialect add/sub ops. This should
3044 // be done after srs ops are generated, so that the input to the add op is
3045 // always vectors.
3046 generateAIEAddOrSubOpsInFunc(func, state);
3047 // Generate UPD ops that subsume all the transfer_read ops in affine
3048 // dialect. This happens after generating aie dialect add/sub ops because
3049 // those ops need to query transfer reads to know if their operand is
3050 // splat.
3051 insertUPDOpsInFunc(func, state);
3052 // Check for opportunities to fuse Mul and FMA ops into Mul_Conv or
3053 // FMA_Conv ops.
3054 if (state->aieml)
3055 fuseMulFMAOpsByMulFMAConv(func, state);
3056 }
3057
3058 // Canonicalize the IR of all the functions in the module by running a set of
3059 // cleanup passes.
3060 postCanonicalizeIR(module);
3061}
3062
3063std::unique_ptr<Pass> aievec::createAIEVectorizePass() {
3064 return std::make_unique<AIEVectorize>();
3065}