MLIR-AIE
AIEVectorize.cpp
Go to the documentation of this file.
1//===-AIEVectorize.cpp - Vectorizer for AIE architecture --------*- C++ -*-===//
2//
3// This file is licensed under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7// (c) Copyright 2022 Xilinx Inc.
8//
9//===----------------------------------------------------------------------===//
10// This file implements the functionality to massage the output from affine
11// supervectorizer into a set of operations and datatypes corresponding to
12// AIE vector abstraction.
13//===----------------------------------------------------------------------===//
14
20
21#include "mlir/Conversion/AffineToStandard/AffineToStandard.h"
22#include "mlir/Dialect/Affine/Analysis/LoopAnalysis.h"
23#include "mlir/Dialect/Affine/IR/AffineOps.h"
24#include "mlir/Dialect/Func/IR/FuncOps.h"
25#include "mlir/Dialect/MemRef/IR/MemRef.h"
26#include "mlir/Dialect/SCF/IR/SCF.h"
27#include "mlir/Dialect/Vector/Transforms/VectorTransforms.h"
28#include "mlir/IR/TypeUtilities.h"
29#include "mlir/Pass/PassManager.h"
30#include "mlir/Transforms/Passes.h"
31
32#include "llvm/ADT/SmallSet.h"
33
34namespace xilinx::aievec {
35#define GEN_PASS_DEF_AIEVECTORIZE
36#include "aie/Dialect/AIEVec/Transforms/Passes.h.inc"
37} // namespace xilinx::aievec
38
39using namespace llvm;
40using namespace mlir;
41using namespace arith;
42using namespace vector;
43using namespace xilinx;
44using namespace xilinx::aievec;
45
46#define DEBUG_TYPE "aie-vect"
47
// Command-line flag: when enabled (the default), the pass checks for
// unaligned vector loads during vectorization.
static llvm::cl::opt<bool>
    unalignedLoadsCheck("unaligned-loads-check",
                        llvm::cl::desc("Enable the unaligned loads check"),
                        llvm::cl::init(true));

// Command-line flag selecting the AI Engine-ML target (off by default,
// i.e., target AIE1).
static llvm::cl::opt<bool> AIEML("aieml", llvm::cl::desc("AI Engine-ML"),
                                 llvm::cl::init(false));
55
56namespace {
57// A struct to pack the global state required for vectorization at one place.
58// Local to this translation unit.
// A struct to pack the global state required for vectorization at one place.
// Local to this translation unit.
struct VectState {
  // A vector of all the reuse intervals created. Class IntervalReuse represents
  // the cluster of data access (with reuse potential) along the vectorized
  // dimension of each array. It clusters together reads that have a potential
  // of vector-level data reuse. Therefore, array accesses A[i][j:j+8] and
  // A[i+2][j:j+8] will map to different IntervalReuse objects.
  SmallVector<IntervalReuse *, 16> reuseIntervals;
  // Map from a transfer_read operation to the IntervalReuse object it belongs
  // to.
  mlir::DenseMap<Operation *, IntervalReuse *> opToIntervalMap;
  // Map from a transfer_read operation to its linearized access expression.
  // Linearized expression for access A[i][j], where A is of dimensionality MxN
  // is (i*N+j). We assume that the innermost dimension is the vectorized
  // dimension.
  mlir::DenseMap<Operation *, AffineExpr> linearizedAccess;
  // A map from an index (of array access) to an expr dim map (e.g., i->d0). We
  // need this to create the correct linearized expressions for all the array
  // accesses in the function.
  mlir::DenseMap<Value, AffineExpr> indexToExprDimMap;
  // For each transfer_read operation, a map from its container basic block to
  // the enclosing for/while loops. This helps us identify two instructions
  // that are nested together, even if they belong to different basic blocks.
  mlir::DenseMap<Block *, SmallVector<Operation *, 8>> blockToEnclosingLoops;
  // This is specific to 8x8 scheme. For an 8x8 scheme, every mul/fma is
  // replaced by two mul/fmas in AIE dialect. So we keep track of the pair.
  mlir::DenseMap<Operation *, Operation *> pairedOp;
  // If we fuse a representative mul/fma op with another fma op to exploit the
  // column topology of the AIE intrinsic, then cache, for the representative
  // op, the compile-time constant access distance between their two operands.
  // The first(second) offset of the pair represents the access distance
  // between the first(second) operands of the representative op and the
  // fused op(s). This access distance will be used to compute the xstep/zstep
  // attribute.
  mlir::DenseMap<Operation *, std::pair<int32_t, int32_t>> opToColOffsets;
  // Map from the sext op to the def op of the sext operand.
  mlir::DenseMap<Operation *, Operation *> sextTruncDefMap;
  // A set of operations that are msc (fmsub) ops. We do not differentiate
  // between mac and msc ops at vector dialect level. The only op in vector
  // dialect is just FMA op.
  llvm::SmallSet<Operation *, 8> mscOps;
  // Used to build and insert all the new operations created.
  OpBuilder builder;
  // The shift val for ups and srs intinsics. This value should be between 0
  // and 63.
  int8_t shift;
  // The zero offset, indicating the position of recurring 0 in the input
  // filter. The counting starts at 1. For example, if the filter array is
  // {1,2,3,0,4,5,6,0,7,8,9,0}, then zeroOffset=4.
  int32_t zeroOffset;
  // The duplicate count, indicating the number of times a value is duplicated
  // in the filter. The filter values must be duplicated at least twice for the
  // i8xi8 scheme. An example of filter for i8xi8 scheme is {0,0,1,1,2,2,3,3},
  // with dupFactor=2.
  int32_t dupFactor;

  // Mirrors of the two command-line flags above, captured per-pass-instance.
  bool unalignedLoadsCheck, aieml;

  // Constructor: initializes the builder from the MLIR context and stores the
  // pass parameters; all maps/sets start empty.
  VectState(MLIRContext *context, int8_t s, int32_t z, int32_t d,
            bool unalignedLoadsCheck, bool aieml)
      : builder(context), shift(s), zeroOffset(z), dupFactor(d),
        unalignedLoadsCheck(unalignedLoadsCheck), aieml(aieml) {}

  IntervalReuse *getIntervalForOperation(Operation *op);
};
124
125// Get the IntervalReuse object for a given read operation
126IntervalReuse *VectState::getIntervalForOperation(Operation *op) {
127 assert(opToIntervalMap.count(op) &&
128 "could not find the IntervalReuse object for op");
129 return opToIntervalMap[op];
130}
131
132// A struct to store the attributes (start, lo/hi offset, step, square) for an
133// AIE fma, mul, or select operation.
// A struct to store the attributes (start, lo/hi offset, step, square) for an
// AIE fma, mul, or select operation.
struct AIEOpAttributes {
  // The select attribute string (only meaningful for select ops).
  std::string select;
  // Per-operand attribute strings; index 0 is the first (x) operand and
  // index 1 the second (z) operand of the AIE intrinsic.
  SmallVector<std::string, 2> start;
  // Low and high lane-offset attribute strings per operand.
  SmallVector<std::string, 2> offset, offset_hi;
  // Step attribute string per operand.
  SmallVector<std::string, 2> step;
  // Square attribute string per operand.
  SmallVector<std::string, 2> square;
};
141
142// A struct that stores some of the attributes for a vector type
// A struct that stores some of the attributes for a vector type
struct AIEVecAttributes {
  // The number of lanes along the vectorized dimension for the vector type.
  // For a multidimensional vector, it is the innermost dimension size.
  unsigned lanes;
  // For a 1D vector, capture its size in bits. For an nD vector, capture the
  // size of the innermost dimension in bits.
  int32_t vecSizeInBits;
  // Underlying scalar element type
  Type elementType;
  // The element size in bits
  int32_t elementSizeInBits;
  // Does the vector load data from memory
  bool loadFromMemory;
  // Is the vector splat?
  bool isSplat;
  // Constructor: loadFromMemory and isSplat default to false and are set
  // later by the caller (see getOperandVecStats).
  AIEVecAttributes(unsigned l, unsigned vs, Type et, int32_t es)
      : lanes(l), vecSizeInBits(vs), elementType(et), elementSizeInBits(es),
        loadFromMemory(false), isSplat(false) {}
};
163
164// Structure to capture the lane/col topology, and the element type size of
165// xbuff and ybuff. Captures all the necessary information to map the incoming
166// mul/mac op to the vectorization scheme.
// Structure to capture the lane/col topology, and the element type size of
// xbuff and ybuff. Captures all the necessary information to map the incoming
// mul/mac op to the vectorization scheme.
struct Scheme {
  // lanes and columns in the vector intrinsic
  int32_t lanes, cols;
  // size (in bits) of the underlying scalar element type of xbuff and zbuff
  int32_t xbits, zbits;
  // Constructor
  Scheme(int32_t l, int32_t c, int32_t x, int32_t z)
      : lanes(l), cols(c), xbits(x), zbits(z) {}
};
176} // namespace
177
178//===----------------------------------------------------------------------===//
179// Helper Routines
180//===----------------------------------------------------------------------===//
181
182// Combine the result of vector-related utilities into a single utility.
183static AIEVecAttributes getVectorStats(VectorType type) {
184 return AIEVecAttributes(getVectorLaneSize(type), getVectorSizeInBits(type),
185 type.getElementType(), getElementSizeInBits(type));
186}
187
188// Get the vector stats for an operation's result.
189static AIEVecAttributes getResultVecStats(Operation *op, unsigned idx = 0) {
190 auto vtype = cast<VectorType>(op->getResult(idx).getType());
191 return getVectorStats(vtype);
192}
193
194static Operation *getOperandDefOp(VectState *state, Operation *op,
195 unsigned idx) {
196 return state->sextTruncDefMap.count(op->getOperand(idx).getDefiningOp())
197 ? state->sextTruncDefMap[op->getOperand(idx).getDefiningOp()]
198 : op->getOperand(idx).getDefiningOp();
199}
200
201// Get the vector stats for an operation's operand.
202static AIEVecAttributes getOperandVecStats(Operation *op, VectState *state,
203 unsigned idx = 0) {
204 assert(op->getNumOperands() > idx);
205 Operation *defOp = getOperandDefOp(state, op, idx);
206 auto vtype = cast<VectorType>(defOp->getResult(0).getType());
207 auto ret = getVectorStats(vtype);
208 // if the defining op is a transfer read, get the extent read from source
209 if (auto readOp = dyn_cast<TransferReadOp>(defOp)) {
210 IntervalReuse *iv = state->getIntervalForOperation(readOp);
211 ret.vecSizeInBits = iv->getIntervalWidth(readOp);
212 // Set load from memory to true
213 ret.loadFromMemory = true;
214 // Check if the load is splat
215 ret.isSplat = readOp.getPermutationMap().isConstant();
216 }
217 return ret;
218}
219
220// Get the number of rows and columns in the vector scheme.
221static std::pair<int32_t, int32_t> getNumRowsAndCols(Operation *op,
222 VectState *state) {
223 assert(op->getNumOperands() >= 2 && op->getNumResults() == 1);
224
225 Operation *left = getOperandDefOp(state, op, 0);
226 Operation *right = getOperandDefOp(state, op, 1);
227
228 // Get the number of lanes
229 auto vtype = cast<VectorType>(op->getResult(0).getType());
230 int32_t lanes = getVectorLaneSize(vtype);
231
232 // Get the data sizes for left and right operands
233 auto ltype = cast<VectorType>(left->getResult(0).getType());
234 auto rtype = cast<VectorType>(right->getResult(0).getType());
235 int32_t lsize = getElementSizeInBits(ltype);
236 int32_t rsize = getElementSizeInBits(rtype);
237
238 int32_t width = (lsize == 8 && rsize == 8) ? (state->aieml ? 256 : 128)
239 : (lsize == 16 && rsize == 8) ? 64
240 : 32;
241
242 if (state->aieml && getVectorSizeInBits(rtype) == 512) {
243 width *= 2;
244 }
245
246 // Now the computation
247 int32_t m = 1;
248 if (lsize == 32)
249 m *= 2;
250 if (rsize == 32)
251 m *= 2;
252 int32_t cols = width / (m * lanes);
253 return std::make_pair(lanes, cols);
254}
255
256// Fuse the access extent of two mul/fma operations. This means that for the
257// corresponding lhs(rhs) operands of op1 and op2, check if they read from
258// memory, and if they do, extend their access extent to their union. For
259// example if the left operand of Op1 has access extent [0,256], and the left
260// operand of Op2 has access extent [128,512], where these two accesses belong
261// to the same ReuseInterval, then the union is [0,512]. This union will be the
262// new access extent of the left operands of both Op1 and Op2.
263static void fuseAccessExtent(Operation *Op1, Operation *Op2, VectState *state) {
264 // Assert that the input operations are of expected type
265 assert([&] {
266 bool expectedTypes =
267 (isa<vector::FMAOp>(Op2) && isa<MulIOp, MulFOp, vector::FMAOp>(Op1));
268 if (!expectedTypes) {
269 printf("incorrect operation types\n");
270 return false;
271 }
272 return true;
273 }());
274
275 // Iterate over the even and odd operands for both the operations
276 for (int idx = 0; idx < 2; ++idx) {
277 Operation *op1 = getOperandDefOp(state, Op1, idx);
278 Operation *op2 = getOperandDefOp(state, Op2, idx);
279
280 // If both op1 and op2 are transfer read ops, then we need to create an
281 // interval that subsumes the extent read by both op1 an op2.
282 if (isa<TransferReadOp>(op1) && isa<TransferReadOp>(op2)) {
283 IntervalReuse *iv1 = state->getIntervalForOperation(op1);
284 IntervalReuse *iv2 = state->getIntervalForOperation(op2);
285 // Assert that both the ops belong to the same IntervalReuse object
286 assert(iv1 == iv2);
287 assert(iv1->getInterval(op1) == iv2->getInterval(op2));
288 auto op1Extent = iv1->getAccessExtent(op1);
289 auto op2Extent = iv2->getAccessExtent(op2);
290 // Create the new extent that's a union of refExtent and opExtent
291 auto newExtent =
292 std::make_pair(std::min(op1Extent.first, op2Extent.first),
293 std::max(op1Extent.second, op2Extent.second));
294 // And now update the read extents with the union
295 iv1->setAccessExtent(op1, newExtent);
296 iv2->setAccessExtent(op2, newExtent);
297 }
298 }
299}
300
301// To be a simple lane-wise multiplication, we check that
302// (1) both lhs and rhs operands come from vector of same size,
303// (2) no operand is splat, and
304// (3) no type is float if Op is mul/fma.
305static bool isSimpleVectIntrinsic(Operation *Op, VectState *state) {
306 // The incoming operator should be mul/fma/sub/add op
307 bool isMulOrFMAOp = isa<MulIOp, MulFOp, vector::FMAOp>(Op);
308 bool isSubOrAddOp = isa<SubIOp, SubFOp, AddIOp, AddFOp>(Op);
309 if (!isMulOrFMAOp && !isSubOrAddOp)
310 return true;
311
312 // Get the vec stats for result, left, and right operand
313 AIEVecAttributes vstat = getResultVecStats(Op);
314 AIEVecAttributes lstat = getOperandVecStats(Op, state, 0);
315 AIEVecAttributes rstat = getOperandVecStats(Op, state, 1);
316
317 bool sizeMatches = lstat.vecSizeInBits == rstat.vecSizeInBits &&
318 vstat.vecSizeInBits == rstat.vecSizeInBits &&
319 lstat.elementType == rstat.elementType &&
320 vstat.elementType == rstat.elementType;
321 bool noSplat = !lstat.isSplat && !rstat.isSplat;
322 bool noFloat = !isa<FloatType>(vstat.elementType) &&
323 !isa<FloatType>(lstat.elementType) &&
324 !isa<FloatType>(rstat.elementType);
325
326 return sizeMatches && noSplat && (isSubOrAddOp || noFloat);
327}
328
329// Return true if this is a vector dialect op meeting the following conditions:
330// (1) all the operands and results are vectorized; and
331// (2) all the vector sizes are the same.
332// (3) all the vectors have the same underlying scalar element type.
333static bool isWellFormedVectorOp(Operation *Op) {
334 // The op must have at least an operand or result
335 if (Op->getNumOperands() == 0 && Op->getNumResults() == 0)
336 return false;
337
338 SmallVector<Value, 8> operandsAndResults;
339 operandsAndResults.append(Op->operand_begin(), Op->operand_end());
340 operandsAndResults.append(Op->result_begin(), Op->result_end());
341
342 // Check 1. all the operands and results must be vector types
343 for (auto val : operandsAndResults) {
344 if (!isa<VectorType>(val.getType()))
345 return false;
346 }
347
348 auto refType = cast<VectorType>(operandsAndResults.back().getType());
349 Type scalarType = refType.getElementType();
350 unsigned refSize = getVectorLaneSize(refType);
351 for (auto val : operandsAndResults) {
352 auto vtype = cast<VectorType>(val.getType());
353 // Check 2. All the vector sizes must be same
354 if (refSize != getVectorLaneSize(vtype))
355 return false;
356 // Check 3. The underlying scalar type of all the vectors must be the same
357 if (scalarType != vtype.getElementType())
358 return false;
359 }
360
361 return true;
362}
363
364// Given an AIEOp, determines if an operation writes to an accumulator
365// based on operation type and operand types
366static bool writesToAccumulator(Operation *op) {
367 // Integer muls and FMAs write to accumulator
368 if (!isAIEOp(op))
369 return false;
370 if (auto mulOp = dyn_cast<aievec::aie1::MulOp>(op))
371 return isa<IntegerType>(
372 cast<VectorType>(mulOp.getResult().getType()).getElementType());
373 if (auto fmaOp = dyn_cast<aievec::aie1::FMAOp>(op))
374 return isa<IntegerType>(
375 cast<VectorType>(fmaOp.getResult().getType()).getElementType());
376
377 return isa<aievec::FMAElemOp, aievec::MulElemOp, aievec::FMAConvOp,
378 aievec::MulConvOp, aievec::UPSOp>(op);
379}
380
381//===----------------------------------------------------------------------===//
382// Manipulate affine expressions
383//===----------------------------------------------------------------------===//
384
// Make a flattened affine expression from the given exprs array. Functionally
// identical to makeCanonicalStridedLayoutExpr except that the returned
// AffineExpr is not simplified.
static AffineExpr makeFlattenedStridedExpr(ArrayRef<int64_t> sizes,
                                           ArrayRef<AffineExpr> exprs,
                                           MLIRContext *context) {
  assert(!sizes.empty() && !exprs.empty() &&
         "expected non-empty sizes and exprs");

  // Size 0 corner case is useful for canonicalizations.
  if (llvm::is_contained(sizes, 0))
    return getAffineConstantExpr(0, context);

  auto maps = AffineMap::inferFromExprList(exprs, context);
  assert(!maps.empty() && "Expected one non-empty map");
  unsigned nSymbols = maps[0].getNumSymbols();

  AffineExpr expr;
  // Once a dynamic (negative) size is encountered, the strides of all more
  // outer dimensions are unknown; they are modeled with fresh symbols.
  bool dynamicPoisonBit = false;
  int64_t runningSize = 1;
  // Walk the dimensions innermost-first, accumulating
  // expr = sum_i (dim_i * stride_i), where stride_i is the product of all
  // inner static sizes (runningSize).
  for (auto en : llvm::zip(llvm::reverse(exprs), llvm::reverse(sizes))) {
    int64_t size = std::get<1>(en);
    // Degenerate case, no size -> no stride
    if (size == 0)
      continue;
    AffineExpr dimExpr = std::get<0>(en);
    AffineExpr stride = dynamicPoisonBit
                            ? getAffineSymbolExpr(nSymbols++, context)
                            : getAffineConstantExpr(runningSize, context);
    // First term initializes expr; subsequent terms are added.
    expr = expr ? expr + dimExpr * stride : dimExpr * stride;
    if (size > 0) {
      runningSize *= size;
      assert(runningSize > 0 && "integer overflow in size computation");
    } else {
      // Negative size denotes a dynamic dimension: poison later strides.
      dynamicPoisonBit = true;
    }
  }
  return expr;
}
424
// Construct a linearized affine expression for the transfer_read op.
// For an access A[i][j] into an MxN array, the linearized expression is
// i*N + j, where each distinct loop index is mapped to a unique dim expr.
static AffineExpr constructLinearizedAffineExpr(TransferReadOp readOp,
                                                VectState *state) {
  // The global state stores a map from readOp to its linearized expression. If
  // the linear expression is already computed for this readOp, return it.
  if (state->linearizedAccess.count(readOp))
    return state->linearizedAccess[readOp];

  SmallVector<Value, 4> indices(readOp.getIndices().begin(),
                                readOp.getIndices().end());
  auto memRefType = cast<MemRefType>(readOp.getBase().getType());
  MLIRContext *context = memRefType.getContext();

  SmallVector<AffineExpr, 8> exprVec;
  // Iterate over all the indices. If the index has an affine apply op
  // associated with it, we extract that. Otherwise we use the index from
  // default map.
  for (auto idxAndValue : llvm::enumerate(indices)) {
    auto value = idxAndValue.value();
    // If the access is a map via affine apply op (e.g., A[i+2], where the map
    // is d0 -> d0+2), push in the map after replacing all the dims with unique
    // index identifiers (e.g., let the unique identifier for index i be k0).
    if (auto apOf = value.getDefiningOp<affine::AffineApplyOp>()) {
      AffineMap map = apOf.getAffineMap();
      // Only single-result maps are supported for linearization.
      assert(map.getNumResults() == 1 &&
             "Failed to create linearized affineExpr for complicated index");
      SmallVector<AffineExpr, 4> indexExprs;
      // Each operand of the map corresponds to a loop index. For each operand
      // (i.e., loop index), we create a unique dim expr.
      for (auto index : apOf.getMapOperands()) {
        // A constant map operand folds to a constant expr directly.
        if (auto cIdx = index.getDefiningOp<arith::ConstantOp>()) {
          auto idxVal = cast<IntegerAttr>(cIdx.getValue()).getValue();
          unsigned idx = idxVal.getSExtValue();
          indexExprs.push_back(getAffineConstantExpr(idx, context));
        } else {
          // Allocate a fresh dim expr the first time a loop index is seen;
          // reuse the cached dim expr afterwards so the same loop index maps
          // to the same dim across all accesses.
          if (!state->indexToExprDimMap.count(index))
            state->indexToExprDimMap[index] =
                getAffineDimExpr(state->indexToExprDimMap.size(), context);
          indexExprs.push_back(state->indexToExprDimMap[index]);
        }
      }
      // Now create a correct map expression using the unique dim exprs
      exprVec.push_back(map.getResult(0).replaceDims(indexExprs));
    }
    // If the index is an arith constant (e.g., A[3]), create an affine expr
    // from the constant value.
    else if (auto cOp = value.getDefiningOp<arith::ConstantOp>()) {
      auto idxVal = cast<IntegerAttr>(cOp.getValue()).getValue();
      unsigned idx = idxVal.getSExtValue();
      exprVec.push_back(getAffineConstantExpr(idx, context));
    }
    // Default: the readop index is simply the loop index (e.g., A[i]).
    else {
      if (!state->indexToExprDimMap.count(value))
        state->indexToExprDimMap[value] =
            getAffineDimExpr(state->indexToExprDimMap.size(), context);
      exprVec.push_back(state->indexToExprDimMap[value]);
    }
  }

  assert(!exprVec.empty() && "Could not construct linearized affineExpr");

  // Linearize the exprVec as a strided access, but do not simplify
  auto ret = makeFlattenedStridedExpr(memRefType.getShape(), exprVec,
                                      memRefType.getContext());
  // Cache this readOp and linearized expr into the global map
  state->linearizedAccess[readOp] = ret;
  return ret;
}
494
// From a linearized affine expression, compute the base and the constant
// offset. If the access is A[i][j+2] for an N*N array A, the linearized
// expression will be A[i*N+j+2]. The base in this case will be (i*N+j), and the
// offset will be 2. A fully constant expression yields a nullptr base.
static std::pair<AffineExpr, int32_t> getBaseAndOffset(AffineExpr expr) {
  AffineExpr base = expr;
  int32_t offset = 0;
  // If expr is already a constant, the base is nullptr, and offset is expr
  if (auto constExpr = llvm::dyn_cast<AffineConstantExpr>(expr)) {
    base = nullptr;
    offset += constExpr.getValue();
  }
  // If this is a binary '+' expression, compute the constant offset. Currently
  // this is just a simple FSM. This must evolve as we explore more complex
  // access patterns.
  else if (auto binopExpr = llvm::dyn_cast<AffineBinaryOpExpr>(expr)) {
    if (binopExpr.getKind() == AffineExprKind::Add) {
      AffineExpr lhs = binopExpr.getLHS(), rhs = binopExpr.getRHS();
      // Constant on the lhs: the base is the remaining rhs subtree.
      if (auto constExpr = llvm::dyn_cast<AffineConstantExpr>(lhs)) {
        base = rhs;
        offset += constExpr.getValue();
      }
      // Constant on the rhs: if the lhs was also constant (base was set to
      // rhs above), the whole expr is constant and the base becomes nullptr;
      // otherwise the base is the lhs subtree.
      if (auto constExpr = llvm::dyn_cast<AffineConstantExpr>(rhs)) {
        base = base == rhs ? nullptr : lhs;
        offset += constExpr.getValue();
      }
    }
  }
  return std::make_pair(base, offset);
}
525
526//===----------------------------------------------------------------------===//
527// AIE vector op generation routines
528//===----------------------------------------------------------------------===//
529// Generate and return a Cast op.
530static aievec::CastOp generateCastOp(Value source, VectorType resType,
531 bool isResAcc, VectState *state,
532 Location loc) {
533 // Create the Cast op
534 auto castOp =
535 aievec::CastOp::create(state->builder, loc, resType, source, isResAcc);
536
537 assert(castOp && "could not create srs op");
538 return castOp;
539}
540
541// Generate and return an SRS op. Incoming `source` is an accumulator. The
542// output should be a vector of element type `scalarType`.
543static aievec::SRSOp generateSRSOp(Value source, Type scalarType,
544 VectState *state, Location loc) {
545 // The source should write to accumulator
546 Type accType = source.getType();
547 assert(writesToAccumulator(source.getDefiningOp()) &&
548 "srs source should write to accumulator");
549
550 // Get the number of lanes
551 unsigned lanes = getVectorLaneSize(cast<VectorType>(accType));
552 // Now generate the new vector type for the SRS intrinsic
553 VectorType srsType = createVectorType(lanes, scalarType);
554
555 auto shiftParamOp = arith::ConstantOp::create(
556 state->builder, loc, state->builder.getI32IntegerAttr(state->shift));
557 // Create the SRS op
558 auto srsOp = aievec::SRSOp::create(state->builder, loc, srsType, source,
559 shiftParamOp.getResult());
560
561 assert(srsOp && "could not create srs op");
562 return srsOp;
563}
564
565// Generate and return a UPS op. Incoming `source` is a vector which needs
566// to be moved to an accumulator.
567static aievec::UPSOp generateUPSOp(Value source, VectState *state,
568 Location loc) {
569 Type sourceType = source.getType();
570 Type accType =
571 getVectorOpDestType(cast<VectorType>(sourceType), state->aieml);
572 assert(!writesToAccumulator(source.getDefiningOp()) &&
573 "ups source should not be accumulator");
574
575 // Create a new UPS instruction
576 auto upsOp =
577 aievec::UPSOp::create(state->builder, loc, accType, source, state->shift);
578
579 assert(upsOp && "could not create ups op");
580 return upsOp;
581}
582
583// Generate and return a Broadcast op.
584static aievec::BroadcastOp generateBroadcastOp(Value source, int8_t idx,
585 VectState *state, Location loc) {
586 auto type = cast<VectorType>(source.getType());
587 // Create a new Broadcast instruction
588 auto broadcastOp =
589 aievec::BroadcastOp::create(state->builder, loc, type, source, idx);
590
591 assert(broadcastOp && "could not create broadcast op");
592 return broadcastOp;
593}
594
595// Generate and return a Concat op.
596static aievec::ConcatOp generateConcatOp(SmallVector<Value> &sources,
597 VectState *state, Location loc,
598 VectorType concatType = nullptr) {
599 assert(sources.size() > 1 && "must concat at least two vectors");
600
601 auto vecType = cast<VectorType>(sources.back().getType());
602
603 assert([&] {
604 for (auto source : sources) {
605 auto type = cast<VectorType>(source.getType());
606 if (type != vecType) {
607 printf("sources of concat op not of same type\n");
608 return false;
609 }
610 }
611 return true;
612 }());
613
614 if (!concatType) {
615 // Get the number of lanes and scalar type to create the concat result type
616 unsigned lanes = sources.size() * getVectorLaneSize(vecType);
617 Type scalarType = vecType.getElementType();
618 concatType = createVectorType(lanes, scalarType);
619 }
620
621 // Create the concat op
622 auto concatOp =
623 aievec::ConcatOp::create(state->builder, loc, concatType, sources);
624
625 assert(concatOp && "could not create concat op");
626 return concatOp;
627}
628
// Generate and return a select operation. The start, offset, etc. for lanes
// are in opAttr. `lanes` is the number of result lanes; `ybuff` is optional.
static aievec::aie1::SelectOp
generateSelectOp(Value xbuff, AIEOpAttributes &opAttr, unsigned lanes,
                 VectState *state, Location loc, Value ybuff = nullptr) {
  // Assert that we have computed the attributes (start, offset, etc.) for both
  // lanes, and that select is non-empty.
  assert(!opAttr.select.empty());
  assert(opAttr.start.size() == opAttr.offset.size() &&
         opAttr.start.size() == 2);

  auto xtype = cast<VectorType>(xbuff.getType());
  // Verify that lanes is <= xtype lanes
  assert(lanes <= getVectorLaneSize(xtype));
  // Create the result type: requested lane count, xbuff's element type.
  VectorType resultType = createVectorType(lanes, xtype.getElementType());

  // Create AIE dialect select op, wiring the select pattern and the
  // start/offset/offset_hi/square attribute strings for both operand slots.
  auto selectOp = aievec::aie1::SelectOp::create(
      state->builder, loc, resultType, xbuff, opAttr.select, opAttr.start[0],
      opAttr.offset[0], opAttr.offset_hi[0], opAttr.square[0], opAttr.start[1],
      opAttr.offset[1], opAttr.offset_hi[1], opAttr.square[1], ybuff);

  assert(selectOp && "could not create select op");
  return selectOp;
}
655
656// Generate and return an Ext op. The lanes indicate the lanes in vector
657// output, and idx defines which part of source is extracted.
658static aievec::aie1::ExtOp generateExtOp(Value source, unsigned lanes,
659 int8_t idx, VectState *state,
660 Location loc) {
661 auto stype = cast<VectorType>(source.getType());
662 // Verify that lanes*idx is <= stype lanes
663 assert(lanes * (idx + 1) <= getVectorLaneSize(stype));
664 // Create the result type
665 VectorType resultType = createVectorType(lanes, stype.getElementType());
666
667 // Create AIE dialect ext op
668 auto extOp =
669 aievec::aie1::ExtOp::create(state->builder, loc, resultType, source, idx);
670
671 assert(extOp && "could not create ext op");
672 return extOp;
673}
674
675// Generate and return an Pack op.
676static aievec::PackOp generatePackOp(Value source, VectState *state,
677 Location loc) {
678 // Create the result type
679 auto stype = cast<VectorType>(source.getType());
680 unsigned lanes = getVectorLaneSize(stype);
681 Type i8Type = IntegerType::get(source.getContext(), 8);
682 VectorType resultType = createVectorType(lanes, i8Type);
683
684 // Create AIE dialect pack op
685 auto packOp = aievec::PackOp::create(state->builder, loc, resultType, source);
686
687 assert(packOp && "could not create pack op");
688 return packOp;
689}
690
691// Generate and return an Add op.
692static aievec::aie1::AddOp generateAddOp(Operation *Op, AIEOpAttributes &opAttr,
693 VectState *state) {
694 // Assert that we computed the attributes for both the operands
695 assert(opAttr.start.size() == opAttr.offset.size() &&
696 opAttr.start.size() == 2);
697
698 auto addOp = aievec::aie1::AddOp::create(
699 state->builder, Op->getLoc(), Op->getResult(0).getType(),
700 Op->getOperand(0), Op->getOperand(1), opAttr.start[0], opAttr.offset[0],
701 opAttr.offset_hi[0], opAttr.square[0], opAttr.start[1], opAttr.offset[1],
702 opAttr.offset_hi[1], opAttr.square[1]);
703 return addOp;
704}
705
706// Generate and return a Sub op.
707static aievec::aie1::SubOp generateSubOp(Operation *Op, AIEOpAttributes &opAttr,
708 VectState *state) {
709 // Assert that we computed the attributes for both the operands
710 assert(opAttr.start.size() == opAttr.offset.size() &&
711 opAttr.start.size() == 2);
712
713 auto subOp = aievec::aie1::SubOp::create(
714 state->builder, Op->getLoc(), Op->getResult(0).getType(),
715 Op->getOperand(0), Op->getOperand(1), opAttr.start[0], opAttr.offset[0],
716 opAttr.offset_hi[0], opAttr.square[0], opAttr.start[1], opAttr.offset[1],
717 opAttr.offset_hi[1], opAttr.square[1]);
718 return subOp;
719}
720
721static aievec::ShiftOp generateShiftOp(Value lhs, Value rhs, int32_t shiftBytes,
722 VectState *state, Location loc,
723 VectorType resType = nullptr) {
724 auto vecType = cast<VectorType>(rhs.getType());
725
726 assert([&] {
727 auto type = cast<VectorType>(lhs.getType());
728 if (type != vecType) {
729 printf("lhs and rhs do not have same type\n");
730 return false;
731 }
732 return true;
733 }());
734
735 if (!resType) {
736 unsigned lanes = getVectorLaneSize(vecType);
737 Type scalarType = vecType.getElementType();
738 resType = createVectorType(lanes, scalarType);
739 }
740
741 auto constOp = arith::ConstantOp::create(
742 state->builder, loc, state->builder.getI32IntegerAttr(shiftBytes));
743 auto shiftOp = aievec::ShiftOp::create(state->builder, loc, resType, lhs, rhs,
744 constOp.getResult());
745
746 return shiftOp;
747}
748
749static aievec::LegacyShuffleOp generateShuffleOp(Value source, VectState *state,
750 Location loc, unsigned mode,
751 VectorType resType = nullptr) {
752 auto vecType = cast<VectorType>(source.getType());
753
754 if (!resType) {
755 unsigned lanes = 512 / getElementSizeInBits(vecType);
756 Type scalarType = vecType.getElementType();
757 resType = createVectorType(lanes, scalarType);
758 }
759
760 auto shuffleOp = aievec::LegacyShuffleOp::create(state->builder, loc, resType,
761 source, mode);
762
763 return shuffleOp;
764}
765
// For AIEML, i8xi8 scheme generates one MulConvOp or FMAConvOp for each vector
// dialect mul/fma op instead of generating two AIE dialect mul/fma ops for each
// vector dialect mul/fma in AIE1.
static Operation *generateMulOrFMAConvOpForInt8(Operation *Op,
                                                AIEOpAttributes &opAttr,
                                                VectState *state) {
  // Assert that we have computed the attributes (start, offset, etc.) for both
  // left and right operands of the fma operation. This path also requires the
  // filter dup-factor to be exactly 2.
  assert(opAttr.start.size() == opAttr.offset.size() &&
         opAttr.start.size() == 2 && state->dupFactor == 2);

  // Note the operand swap: operand 1 becomes lhs and operand 0 becomes rhs.
  // If an operand came through a sext/trunc recorded in sextTruncDefMap,
  // peel it off and use the original value.
  Value lhs = state->sextTruncDefMap.count(Op->getOperand(1).getDefiningOp())
                  ? Op->getOperand(1).getDefiningOp()->getOperand(0)
                  : Op->getOperand(1);
  Value rhs = state->sextTruncDefMap.count(Op->getOperand(0).getDefiningOp())
                  ? Op->getOperand(0).getDefiningOp()->getOperand(0)
                  : Op->getOperand(0);
  auto vType = cast<VectorType>(lhs.getType());
  Type stype = vType.getElementType();
  auto itype = cast<IntegerType>(stype);
  // Accumulator element width: 32 bits for <=8-bit inputs, 64 otherwise.
  unsigned width = itype.getWidth() <= 8 ? 32 : 64;
  // M and N are the conv-op shape parameters passed to Mul/FMAConvOp.
  // NOTE(review): hard-coded for the i8xi8 32x8 convolution scheme — confirm
  // against the aievec conv op definitions.
  int32_t M = 32;
  int32_t N = 8;

  Type ctype = IntegerType::get(itype.getContext(), width);
  Type opType = VectorType::get(vType.getShape(), ctype);
  // New ops are inserted right after the op defining rhs.
  auto defOp = rhs.getDefiningOp();
  state->builder.setInsertionPointAfter(defOp);
  Location loc = defOp->getLoc();

  // Since we do not need to use duplicated data like in AIE1, if a dup-factor
  // exists, we extract the identical data by shuffle op. We use mode 0 to
  // extract the elements with even indices for i8 type data.
  Operation *shuffleOp = generateShuffleOp(defOp->getResult(0), state, loc, 0);

  // Convert the start attribute (a decimal string) into a byte shift,
  // scaled down by the dup-factor since the shuffle removed the duplicates.
  int32_t shiftBytes = stoi(opAttr.start[0]) * getElementSizeInBits(vType) / 8 /
                       state->dupFactor;

  // Generate a shift_bytes operation for rhs if xstart is not 0
  if (shiftBytes) {
    state->builder.setInsertionPointAfter(shuffleOp);
    loc = shuffleOp->getLoc();
    rhs = generateShiftOp(shuffleOp->getResult(0), shuffleOp->getResult(0),
                          shiftBytes, state, loc);
  } else {
    rhs = shuffleOp->getResult(0);
  }

  // The conv op itself replaces Op, so insert at Op's position.
  state->builder.setInsertionPoint(Op);
  loc = Op->getLoc();

  Operation *convOp = nullptr;

  // vector.mul -> aievec.mul_conv
  if (isa<MulIOp>(Op)) {
    convOp =
        aievec::MulConvOp::create(state->builder, loc, opType, lhs, rhs, M, N);
  }

  // vector.fma -> aievec.fma_conv; mscOps membership selects fmsub semantics.
  if (isa<vector::FMAOp>(Op)) {
    Value acc = Op->getOperand(2);
    bool isSub = state->mscOps.count(Op);
    convOp = aievec::FMAConvOp::create(state->builder, loc, opType, lhs, rhs,
                                       acc, M, N, isSub);
  }

  return convOp;
}
833
// Generate and return an FMA operation in AIE dialect. This operation will
// have the start and offset fields for each operand. If the acc operand of
// fmaOp is a transfer_read operation, then we need to add an SRS instruction
// that will load the vector value into an accumulator.
//
// When i8xi8_pairedOp is true we are emitting the second (paired) AIE1 fma
// for the i8xi8 scheme, which must accumulate into the paired op's result.
static Operation *generateFMAOp(vector::FMAOp fmaOp, AIEOpAttributes &opAttr,
                                VectState *state, bool i8xi8_pairedOp = false) {
  // Assert that we have computed the attributes (start, offset, etc.) for both
  // left and right operands of the fma operation.
  assert(opAttr.start.size() == opAttr.offset.size() &&
         opAttr.start.size() == 2);

  // Fetch the three operands, looking through any sext/trunc ops recorded in
  // sextTruncDefMap so we work with the original (narrower) vectors.
  Value lhs = state->sextTruncDefMap.count(fmaOp.getLhs().getDefiningOp())
                  ? fmaOp.getLhs().getDefiningOp()->getOperand(0)
                  : fmaOp.getLhs();
  Value rhs = state->sextTruncDefMap.count(fmaOp.getRhs().getDefiningOp())
                  ? fmaOp.getRhs().getDefiningOp()->getOperand(0)
                  : fmaOp.getRhs();
  Value acc = state->sextTruncDefMap.count(fmaOp.getAcc().getDefiningOp())
                  ? fmaOp.getAcc().getDefiningOp()->getOperand(0)
                  : fmaOp.getAcc();

  // Check if this is an fmsub op, and if so, then we need to generate msc op
  bool isSub = state->mscOps.count(fmaOp);

  // We need to generate a UPS op for the integer and AIEML path if the
  // accumulator is coming from a vector register.
  bool isInt = isa<IntegerType>(
      cast<VectorType>(fmaOp.getLhs().getType()).getElementType());

  Operation *xfmaOp;
  // AIE-ML path: 512-bit rhs uses the element-wise fma_elem/msc_elem form.
  if (state->aieml &&
      getVectorSizeInBits(cast<VectorType>(rhs.getType())) == 512) {
    // Move the accumulator into an accumulator register via UPS if needed.
    if (!writesToAccumulator(acc.getDefiningOp())) {
      acc = generateUPSOp(acc, state, fmaOp->getLoc());
      LLVM_DEBUG(llvm::dbgs()
                 << "\n\nCreated UPS op " << acc << " to move the output of "
                 << fmaOp << " into accumulator");
    }

    if (!isSimpleVectIntrinsic(fmaOp, state)) {
      // If targeting for AIE-ML intrinsics, use broadcast operator for rhs.
      // Check the legality of generating a broadcast op by checking whether
      // zbuffer is a splat
      AIEVecAttributes rstat = getOperandVecStats(fmaOp, state, 1);
      if (rstat.isSplat) {
        rhs = generateBroadcastOp(rhs, stoi(opAttr.start[1]), state,
                                  fmaOp->getLoc());
      }
    }
    // Create AIEML dalect fma_elem/msc_elem op
    xfmaOp = aievec::FMAElemOp::create(state->builder, fmaOp->getLoc(), lhs,
                                       rhs, acc, isSub);
  } else {
    // If i8xi8_pairedOp is true, then we are trying to generated the paired FMA
    // op for i8xi8 scheme. Find the paired accumulator.
    if (i8xi8_pairedOp) {
      Operation *defOp = acc.getDefiningOp();
      if (state->pairedOp.count(defOp))
        acc = state->pairedOp[defOp]->getResult(0);
    }

    // Integer accumulators must live in an accumulator register; insert a UPS
    // op to move them there if they do not already.
    if (isInt && !writesToAccumulator(acc.getDefiningOp())) {
      acc = generateUPSOp(acc, state, fmaOp->getLoc());
      LLVM_DEBUG(llvm::dbgs()
                 << "\n\nCreated UPS op " << acc << " to move the output of "
                 << fmaOp << " into accumulator");
    }

    // If the lhs operand vector is not >= twice the rhs operand vector, then
    // use concat operator.
    if (!isSimpleVectIntrinsic(fmaOp, state)) {
      AIEVecAttributes lstat = getOperandVecStats(fmaOp, state, 0);
      assert(lstat.vecSizeInBits % 256 == 0);

      // A 256-bit lhs is doubled to 512 bits by concatenating it with itself.
      if (lstat.vecSizeInBits == 256) {
        VectorType concatType =
            createVectorType(512 / lstat.elementSizeInBits, lstat.elementType);
        SmallVector<Value> sources = {lhs, lhs};
        lhs = generateConcatOp(sources, state, fmaOp->getLoc(), concatType);
      }
    }
    // Create AIE dialect fma/msc op
    xfmaOp = aievec::aie1::FMAOp::create(
        state->builder, fmaOp->getLoc(), lhs, rhs, acc, opAttr.start[0],
        opAttr.offset[0], opAttr.offset_hi[0], opAttr.step[0], opAttr.square[0],
        opAttr.start[1], opAttr.offset[1], opAttr.offset_hi[1], opAttr.step[1],
        opAttr.square[1], isSub);
  }

  assert(xfmaOp && "could not create fma op");
  return xfmaOp;
}
926
// Generate a MUL operation in AIE dialect. This operation will have the start
// and offset fields for each operand. T is the vector dialect mul op kind
// (arith::MulIOp or arith::MulFOp).
template <typename T>
static Operation *generateMulOp(T mulOp, AIEOpAttributes &opAttr,
                                VectState *state) {
  // Assert that we have computed the attributes (start, offset, etc.) for both
  // left and right operands of the mul operation.
  assert(opAttr.start.size() == opAttr.offset.size() &&
         opAttr.start.size() == 2);

  // The mul result lands in an accumulator; compute its (wider) vector type.
  Type opType =
      getVectorOpDestType(cast<VectorType>(mulOp.getType()), state->aieml);

  // If the lhs operand vector is not >= twice the rhs operand vector, then use
  // concat operator.
  // Look through any sext/trunc ops recorded in sextTruncDefMap to get the
  // original (narrower) operand vectors.
  Value lhs = state->sextTruncDefMap.count(mulOp.getLhs().getDefiningOp())
                  ? mulOp.getLhs().getDefiningOp()->getOperand(0)
                  : mulOp.getLhs();
  Value rhs = state->sextTruncDefMap.count(mulOp.getRhs().getDefiningOp())
                  ? mulOp.getRhs().getDefiningOp()->getOperand(0)
                  : mulOp.getRhs();
  if (!isSimpleVectIntrinsic(mulOp, state)) {
    AIEVecAttributes lstat = getOperandVecStats(mulOp, state, 0);
    assert(lstat.vecSizeInBits % 256 == 0);
    // A 256-bit lhs is doubled to 512 bits by concatenating it with itself.
    if (lstat.vecSizeInBits == 256) {
      VectorType concatType =
          createVectorType(512 / lstat.elementSizeInBits, lstat.elementType);
      SmallVector<Value> sources = {lhs, lhs};
      lhs = generateConcatOp(sources, state, mulOp->getLoc(), concatType);
    }
  }

  // Create AIE dialect mul op
  Operation *xmulOp = aievec::aie1::MulOp::create(
      state->builder, mulOp->getLoc(), lhs, rhs, opType, opAttr.start[0],
      opAttr.offset[0], opAttr.offset_hi[0], opAttr.step[0], opAttr.square[0],
      opAttr.start[1], opAttr.offset[1], opAttr.offset_hi[1], opAttr.step[1],
      opAttr.square[1]);

  assert(xmulOp && "could not create mul op");
  return xmulOp;
}
969
// For a transfer_read op, generate a corresponding UPD op. Multiple
// transfer_read ops will have the same UPD op if their read access extent is
// subsumed by the same interval. The updOps will have to be inserted at the
// head of region if the region has multiple blocks, or closer to the readOp
// otherwise.
//
// memToUpdMap caches, per (reuse bucket, interval lb, interval ub) key, the
// UPD op chain already emitted for that interval together with a bitmask of
// which upd indices (bit 1 = idx 0, bit 2 = idx 1) were generated.
static aievec::UPDOp
generateUPDOp(TransferReadOp readOp,
              mlir::DenseMap<std::tuple<IntervalReuse *, int32_t, int32_t>,
                             std::pair<aievec::UPDOp, int8_t>> &memToUpdMap,
              Region &region, VectState *state) {
  // Get the read access extent and interval of this read operation
  IntervalReuse *iv = state->getIntervalForOperation(readOp);
  auto extent = iv->getAccessExtent(readOp);
  auto interval = iv->getInterval(readOp);

  int32_t intervalWidth = interval.second - interval.first;
  assert(intervalWidth >= 128 && "Interval computation incorrect");

  // Create the upd vector type. To do so, we need the underlying element type.
  // We can divide the interval size by that to get the number of lanes in the
  // result vector of upd op.
  auto vecType = cast<VectorType>(readOp.getVector().getType());
  Type elementType = vecType.getElementType();
  int32_t elementSizeInBits = getElementSizeInBits(vecType);
  // NOTE(review): despite the name, this is the interval width in *elements*
  // (both operands are in bits), i.e. the lane count of the upd vector.
  int intervalWidthInBytes = intervalWidth / elementSizeInBits;
  Type updVecType = createVectorType(intervalWidthInBytes, elementType);

  // Compute the mid value of the interval. This is useful because for
  // intervalWidth > 256 or 512 if it is AIEML, we can split the load into two
  // steps: the bits to the left/right of mid will be loaded using upd
  // idx=0/idx=1 operator.
  int32_t mid = interval.first + intervalWidth / 2;
  // Compute the (aligned) extent of interval that this read requires to be
  // loaded.
  int32_t lb =
      intervalWidth <= (state->aieml && elementSizeInBits == 8 ? 512 : 256) ||
              extent.first < mid
          ? interval.first
          : mid;
  int32_t ub =
      intervalWidth <= (state->aieml && elementSizeInBits == 8 ? 512 : 256) ||
              extent.second > mid
          ? interval.second
          : mid;

  // Find if we have already created upd op idx=0/idx=1 for this interval
  aievec::UPDOp updOp = nullptr;
  // initial value 0 of updIndices means neither upd op idx=0 nor idx=1 were
  // created.
  int8_t updIndices = 0;
  auto key = std::make_tuple(iv, interval.first, interval.second);
  if (memToUpdMap.count(key)) {
    updOp = memToUpdMap[key].first;
    updIndices = memToUpdMap[key].second;
  }

  // This readOp could be A[i][j+2], where A is a 32-bit array. Assume that its
  // read access extent is subsumed by interval [0,512]. This 512-bit interval
  // load should be broken into two 256-bit UPD ops. We need to find the right
  // offset from A[i][j+2] where the reads will start. The first offset (in
  // bits) will be A[i][j+2]-2*32, and the second offset will be
  // A[i][j+2]+256-2*32. Essentially, the offsets should make the load well
  // aligned. Below, we compute this (-2*32) offset to make the loads aligned.
  SmallVector<Value, 4> indices(readOp.getIndices().begin(),
                                readOp.getIndices().end());
  // Get the linearized access expression for the read to compute the offset
  AffineExpr linearAccess = constructLinearizedAffineExpr(readOp, state);
  // Get the base and offset from linear access expr
  auto [base, offset] = getBaseAndOffset(linearAccess);
  offset *= elementSizeInBits; // get the offset in bits

  // The insertion point depends on whether the region has a single block or
  // not. If it has a single block, that block will be the front block, so we
  // can insert the UPDOp closer to the readOp. However, if the region has
  // multiple blocks, we will insert all the UPDs to the front block of the
  // region so that the UPDs dominate the entire region.
  bool singleBlock = region.getBlocks().size() == 1;
  if (singleBlock)
    state->builder.setInsertionPoint(readOp);
  else
    state->builder.setInsertionPointToStart(&region.front());

  // If the extent <= 256 bits, we can directly copy data from mem into vector
  // without using a upd. So we try to chunk the interval into sub-intervals of
  // width >= 256 bits. For AIEML, the size should be doubled.
  int width = state->aieml ? elementSizeInBits == 8
                                 ? 512
                                 : std::max(256, getVectorSizeInBits(vecType))
                           : 256;
  int32_t incr = std::max(width, intervalWidth / 2);
  int8_t idx = 1;
  for (int32_t start = interval.first; start < interval.second;
       start += incr, ++idx) {
    // If idx=1, then this indicates a potential upd0 instruction. If idx=2, it
    // will be upd1 instruction.
    assert(idx <= 2 && "The only allowed values for UPD index are 0 and 1");
    int32_t end = std::min(interval.second, start + incr);
    // We are at sub-interval [start,end] of the vector interval. Check if this
    // sub-interval is subsumed by [lb,ub], and the upd op corresponding to this
    // sub-interval is not already generated.
    if (lb <= start && ub >= end && (updIndices & idx) == 0) {
      // Generate the upd instruction, and link it with a previous upd op
      // corresponding to the same read (a null vector means this is the first
      // upd in the chain).
      updOp = aievec::UPDOp::create(
          state->builder, readOp.getLoc(), updVecType, readOp.getBase(),
          indices, start - offset, idx - 1,
          updOp ? updOp.getResult() : TypedValue<VectorType>(nullptr));

      LLVM_DEBUG(llvm::dbgs() << "\n\nCreated UPD op " << updOp
                              << " for read op " << readOp);

      // If the transfer_read has some apply operations, then they also need to
      // be hoisted.
      for (auto &value : indices) {
        if (auto apOf = value.getDefiningOp<affine::AffineApplyOp>()) {
          // Skip hoisting if already above in lexicographical order
          if (apOf->getBlock() == readOp->getBlock() &&
              apOf->isBeforeInBlock(updOp))
            continue;
          apOf.getOperation()->moveBefore(updOp);
        }
      }
      // Set the (idx-1)'th bit in updIndices to indicate that we have already
      // created a upd op for index idx.
      updIndices |= idx;
    }
  }

  // Link the generated updOp to possibly pre-existing UPD ops for the key
  memToUpdMap[key] = std::make_pair(updOp, updIndices);
  return updOp;
}
1102
1103//===----------------------------------------------------------------------===//
1104// AIE vectorization routines
1105//===----------------------------------------------------------------------===//
1106
1107// For this vectorized read operation, find the loop that corresponds to the
1108// vectorized dimension, and return its step size.
1109static int32_t computeVecorizedLoopStepSize(Operation *op, VectState *state) {
1110 auto readOp = dyn_cast<TransferReadOp>(op);
1111 // If this operation is not a read op, return the default step size of 1
1112 if (!readOp)
1113 return 1;
1114
1115 int32_t step = 0;
1116 auto vectorType = cast<VectorType>(readOp.getResult().getType());
1117 SmallVector<Value, 4> indices(readOp.getIndices().begin(),
1118 readOp.getIndices().end());
1119 assert(vectorType && !indices.empty());
1120
1121 // Verify that enclosing loops have been computed for the read operation
1122 auto block = readOp->getBlock();
1123 assert(state->blockToEnclosingLoops.count(block) &&
1124 "enclosing loops should have been computed for the read operation");
1125 auto enclosingLoops = state->blockToEnclosingLoops[block];
1126
1127 // The vectorized (i.e., last) index of the permutation must correspond to a
1128 // loop nest. If not, this is a splat read.
1129 AffineExpr expr = readOp.getPermutationMap().getResults().back();
1130 if (auto dimExpr = llvm::dyn_cast<AffineDimExpr>(expr)) {
1131 assert(dimExpr.getPosition() <= indices.size() &&
1132 "Failed to find the permutation index in index map");
1133 auto index = indices[dimExpr.getPosition()];
1134 // Iterate over all enclosing loops, and find the one that is variant in
1135 // index.
1136 [[maybe_unused]] bool found = false;
1137 for (auto loop : enclosingLoops) {
1138 auto iv = cast<affine::AffineForOp>(loop).getInductionVar();
1139 auto invariants = affine::getInvariantAccesses(iv, indices);
1140 if (!invariants.count(index)) {
1141 assert(
1142 !found &&
1143 "stepsize computation already has an entry along the variant dim");
1144 step = cast<affine::AffineForOp>(loop).getStepAsInt();
1145 found = true;
1146 }
1147 }
1148 }
1149 assert(isPowerOfTwo(step) &&
1150 "non-power-of-two vectorization factor not supported");
1151 // The step increment in vectorized code is scaled by factor of vector lanes;
1152 // account for that.
1153 unsigned lanes = getVectorLaneSize(vectorType);
1154 return step / lanes;
1155}
1156
1157// AIE vector loads are always aligned to 128-bit boundary. So if the operation
1158// reads from an unaligned memory location, return the starting position of the
1159// read in the vector. Each element of the vector is 'elementSizeInBits' bits
1160// wide.
1161int32_t computeStartInAIEVec(Operation *op, VectState *state) {
1162 // In case the operation is not a transfer_read, return default start
1163 if (!isa<TransferReadOp>(op))
1164 return 0;
1165
1166 auto readOp = cast<TransferReadOp>(op);
1167
1168 // Get the scalar element type's size in bits
1169 auto vtype = cast<VectorType>(readOp.getVector().getType());
1170 int32_t scalarSizeInBits = getElementSizeInBits(vtype);
1171
1172 // Get the linearized access expr for this read
1173 AffineExpr linearAccess = constructLinearizedAffineExpr(readOp, state);
1174 // get the base and offset from linear access expr
1175 auto [base, offset] = getBaseAndOffset(linearAccess);
1176 offset *= scalarSizeInBits; // compute offset in bits
1177 // Now find the reuse interval to which this readOp belongs
1178 IntervalReuse *iv = state->getIntervalForOperation(op);
1179 std::pair<int32_t, int32_t> interval = iv->getInterval(op);
1180
1181 // The readOp reads from this interval, and the start of this interval is
1182 // aligned to 128 bits. The AIE vector corresponding to this read will hold
1183 // the value [inteval.first,interval.second]. Return the position of the first
1184 // element that is read.
1185 assert(offset >= interval.first && "Failed to compute the start");
1186 return (offset - interval.first) / scalarSizeInBits;
1187}
1188
// For an i8xi8 scheme, we require two muls to compute the 16-lane output. Each
// mul has a replicated computation, where the output in lane i is replicated
// in lane i+2. Given that, we take the output of two mul ops. and merge them
// into a v32int16 vector. Then we shuffle (using select) to form two v16int16
// vectors that are replicas of each other. Finally, we can pick one of them
// (using ext), and then pack it into a v16int8 output.
static Operation *concatAndInterleave_i8xi8(Operation *source1,
                                            Operation *source2,
                                            VectState *state, Location loc) {
  // The source values are in accumulator. So generate SRS intrinsic to convert
  // the accumulator output to vector output. We want the output to be in
  // v16int16 vector, since select operation does not operate on v16int8
  // vector.
  Type i16Type =
      IntegerType::get(source1->getResult(0).getType().getContext(), 16);
  auto srsOp1 = generateSRSOp(source1->getResult(0), i16Type, state, loc);
  auto srsOp2 = generateSRSOp(source2->getResult(0), i16Type, state, loc);

  // Now we concat the result of the two SRS ops to form a 32-lane vector
  SmallVector<Value> sources = {srsOp1->getResult(0), srsOp2->getResult(0)};
  auto concatOp = generateConcatOp(sources, state, loc);

  // Select the right bits of output to again form the 16-lane vector. opAttr
  // will cache the step,offsets,square, etc. for both lanes.
  AIEOpAttributes opAttr;
  // 0xc is 1100 in binary, which indicates that two values must alternately
  // come from xoffset and yoffset.
  opAttr.select = "0xcccccccc";
  // xstart is 0. Since there are only 2 unique values in the first 4 values
  // of the vector, ystart is 4.
  opAttr.start.push_back("0");
  opAttr.start.push_back("4");
  // The x and y lanes share identical offset/offset_hi/square attributes, so
  // push the same strings for idx 0 (x) and idx 1 (y).
  for (size_t idx = 0; idx < 2; ++idx) {
    // Consider only the even indices in offset (e.g., c, 8, 4, 0). The
    // absolute difference between even indices should be 4 (based on the
    // scheme, this will get multiplied by 2. So technically, xoffset picks the
    // values starting at indices 0, 8, 16, 24 from the v32int16 vector,
    // whereas yoffset picks values starting at indices 0+4, 8+4, 16+4, 24+4).
    opAttr.offset.push_back("0x0c080400");
    // We don't care for the lower 16 values in the v32int16 vector post
    // shuffle
    opAttr.offset_hi.push_back("0x0");
    // The first value must be permuted to offset 0, and the next to 1
    opAttr.square.push_back("0x1010");
  }
  // And now perform the selection
  auto selectOp =
      generateSelectOp(concatOp->getResult(0), opAttr, 32, state, loc);
  // The values in the first 16 lanes in the v32int16 vector are replicated in
  // the last 16 lanes. So select the first 16 lanes to form a v16int16 vector.
  auto extOp = generateExtOp(selectOp->getResult(0), 16, 0, state, loc);
  // Pack the int16 values to int8 values to form the v16int8 output vector
  auto packOp = generatePackOp(extOp->getResult(0), state, loc);
  return packOp;
}
1244
// Perform a multitude of checks to see if rhs operand of the incoming add/sub
// operator is a mul operator, so that we can fuse them to form an FMA
// operator. Returns true iff every check passes.
static bool canFuseMulAndAddOrSubIntoFMAOp(Operation *Op, VectState *state) {
  // Check 1. This should be an add or sub operation
  assert((isa<AddIOp>(Op) || isa<AddFOp>(Op) || isa<SubIOp>(Op) ||
          isa<SubFOp>(Op)) &&
         "operation must be an add or sub op");

  // Check 2. Op must have two operands and one result
  assert(Op->getNumOperands() == 2 && Op->getNumResults() == 1);

  // Check 3. rhs operand of the Op should be a mul op (If any operand of add
  // op is mul op, it is guaranteed to be rhs operand by explicit
  // reassociation done earlier).
  Operation *mulOp = getOperandDefOp(state, Op, 1);
  if (!isa<MulIOp, MulFOp>(mulOp))
    return false;

  // Check 4. mulOp must also have two operands and one result
  assert(mulOp->getNumOperands() == 2 && mulOp->getNumResults() == 1);

  // Determine the lhs, rhs, and accumulator values.
  // Each value is taken from before any sext/trunc recorded in
  // sextTruncDefMap, so the checks below see the original vectors.
  Value lhs = state->sextTruncDefMap.count(mulOp->getOperand(0).getDefiningOp())
                  ? mulOp->getOperand(0).getDefiningOp()->getOperand(0)
                  : mulOp->getOperand(0);
  Value rhs = state->sextTruncDefMap.count(mulOp->getOperand(1).getDefiningOp())
                  ? mulOp->getOperand(1).getDefiningOp()->getOperand(0)
                  : mulOp->getOperand(1);
  Value acc = state->sextTruncDefMap.count(Op->getOperand(0).getDefiningOp())
                  ? Op->getOperand(0).getDefiningOp()->getOperand(0)
                  : Op->getOperand(0);

  assert(lhs && rhs && acc &&
         "Failed to find the three operands of the FMA op");

  // Check 5. All lhs, rhs, and acc must be vector types
  if (!isa<VectorType>(lhs.getType()) || !isa<VectorType>(rhs.getType()) ||
      !isa<VectorType>(acc.getType()))
    return false;

  // Check 6. All the ops should belong to the same block, otherwise we might
  // not be able to fuse them safely.
  if (lhs.getParentBlock() != rhs.getParentBlock() ||
      rhs.getParentBlock() != acc.getParentBlock())
    return false;

  // Check 7. All the vector sizes must be same
  // NOTE(review): this dereferences acc.getDefiningOp() without a null check;
  // an acc that is a block argument would crash here -- presumably earlier
  // checks/reassociation guarantee a defining op. TODO confirm.
  auto lhsType = cast<VectorType>(lhs.getType());
  auto rhsType = cast<VectorType>(rhs.getType());
  VectorType accType = state->sextTruncDefMap.count(
                           acc.getDefiningOp()->getOperand(0).getDefiningOp())
                           ? cast<VectorType>(acc.getDefiningOp()
                                                  ->getOperand(0)
                                                  .getDefiningOp()
                                                  ->getOperand(0)
                                                  .getType())
                           : cast<VectorType>(acc.getType());

  unsigned lhsVecSize = getVectorLaneSize(lhsType);
  unsigned rhsVecSize = getVectorLaneSize(rhsType);
  unsigned accVecSize = getVectorLaneSize(accType);

  if (lhsVecSize != rhsVecSize || rhsVecSize != accVecSize)
    return false;

  // Check 8. The underlying scalar element type of all vectors must be the
  // same
  if (lhsType.getElementType() != rhsType.getElementType() ||
      rhsType.getElementType() != accType.getElementType())
    return false;

  // And after all this, we can fuse mul and add into fma
  return true;
}
1320
1321// In the advanced FMA schemes, the two vector operands of a mul/fma op are not
1322// the same size. If the incoming operation involves multiplication,
1323// reassociate the operands involved in multiplication so that the left operand
1324// comes from bigger vector. The exception to this rule is the 8x8 scheme,
1325// where the right operand must be the bigger vector.
1326static void reassociateMulOpBasedOnVecSize(Operation *Op, VectState *state) {
1327 // Get the stats for left and right operand vectors
1328 AIEVecAttributes lstat = getOperandVecStats(Op, state, 0);
1329 AIEVecAttributes rstat = getOperandVecStats(Op, state, 1);
1330
1331 // No need to do anything if both vectors are the same size
1332 if (lstat.vecSizeInBits == rstat.vecSizeInBits)
1333 return;
1334
1335 // Check if this is an 8x8 scheme
1336 bool is8x8 = lstat.elementSizeInBits == 8 && rstat.elementSizeInBits == 8;
1337
1338 // Flip the operands if necessary
1339 bool flip = is8x8 ? lstat.vecSizeInBits > rstat.vecSizeInBits
1340 : rstat.vecSizeInBits > lstat.vecSizeInBits;
1341 if (flip) {
1342 LLVM_DEBUG(llvm::dbgs()
1343 << "\n\nReassociating op " << *Op
1344 << " to correctly place operand coming from bigger vector");
1345 Value left = Op->getOperand(0);
1346 Value right = Op->getOperand(1);
1347 Op->setOperand(0, right);
1348 Op->setOperand(1, left);
1349 LLVM_DEBUG(llvm::dbgs() << "\n\tOp after reassociation: " << *Op);
1350 }
1351}
1352
// If Op involves multiplication, and any operand involved in the
// multiplication is splat, make it the second operand of mul, unless its the
// 8x8 scheme. In that case, make splat the first operand.
// Note that even when no flip happens, this routine still rewrites both
// operands to bypass any recorded sext/trunc ops, and updates the result
// type(s) accordingly.
static void reassociateMulOpWithSplat(Operation *Op, VectState *state) {
  // Op must have at least two operands (two for mul, three for fma), and one
  // result.
  assert(Op->getNumOperands() == 2 || Op->getNumOperands() == 3);
  assert(Op->getNumResults() == 1);

  // Get the left and right operand vector properties
  AIEVecAttributes lstat = getOperandVecStats(Op, state, 0);
  AIEVecAttributes rstat = getOperandVecStats(Op, state, 1);

  // No need to do anything if both operands are splat
  if (lstat.isSplat && rstat.isSplat)
    return;

  // Check if this is an 8x8 scheme
  bool is8x8 = lstat.elementSizeInBits == 8 && rstat.elementSizeInBits == 8;

  // Now flip operands if required and set the operands to the operands of the
  // sext operations
  bool flip = is8x8 ? rstat.isSplat : lstat.isSplat;
  Value left = state->sextTruncDefMap.count(Op->getOperand(0).getDefiningOp())
                   ? Op->getOperand(0).getDefiningOp()->getOperand(0)
                   : Op->getOperand(0);
  Value right = state->sextTruncDefMap.count(Op->getOperand(1).getDefiningOp())
                    ? Op->getOperand(1).getDefiningOp()->getOperand(0)
                    : Op->getOperand(1);
  if (flip) {
    LLVM_DEBUG(llvm::dbgs() << "\n\nReassociating op " << *Op
                            << " to place splat as correct operand");
    Op->setOperand(0, right);
    Op->setOperand(1, left);
    LLVM_DEBUG(llvm::dbgs() << "\n\tOp after reassociation: " << *Op);
  } else {
    // No flip, but still replace the operands with the pre-sext/trunc values.
    Op->setOperand(0, left);
    Op->setOperand(1, right);
  }

  // The (possibly narrower) operand type now dictates the result type.
  Op->getResult(0).setType(Op->getOperand(0).getType());

  // If the sole user is an add/sub, propagate the new type to its result too.
  if (Op->hasOneUse() &&
      isa<AddIOp, AddFOp, SubIOp, SubFOp>(*Op->getUsers().begin())) {
    Operation *usrOp = *Op->getUsers().begin();
    usrOp->getResult(0).setType(usrOp->getOperand(0).getType());
  }
}
1401
1402// Rewrite a mul and add/sub op as a vector dialect FMA op
1403static void fuseMulAndAddOrSubIntoFMAOp(Operation *Op, VectState *state) {
1404 Value acc = state->sextTruncDefMap.count(Op->getOperand(0).getDefiningOp())
1405 ? Op->getOperand(0).getDefiningOp()->getOperand(0)
1406 : Op->getOperand(0);
1407 Operation *mulOp = getOperandDefOp(state, Op, 1);
1408 Value lhs = state->sextTruncDefMap.count(mulOp->getOperand(0).getDefiningOp())
1409 ? mulOp->getOperand(0).getDefiningOp()->getOperand(0)
1410 : mulOp->getOperand(0);
1411 Value rhs = state->sextTruncDefMap.count(mulOp->getOperand(1).getDefiningOp())
1412 ? mulOp->getOperand(1).getDefiningOp()->getOperand(0)
1413 : mulOp->getOperand(1);
1414
1415 // Create a new FMA op
1416 state->builder.setInsertionPointAfter(Op);
1417 Operation *fmaOp =
1418 vector::FMAOp::create(state->builder, Op->getLoc(), lhs, rhs, acc);
1419
1420 // If Op is a sub op, we tag the generated fma op as msc op
1421 bool isSub = isa<SubIOp, SubFOp>(Op);
1422 if (isSub)
1423 state->mscOps.insert(fmaOp);
1424
1425 LLVM_DEBUG(llvm::dbgs() << "\n\nFused " << (isSub ? "sub" : "add") << " op "
1426 << *Op << "\n\tand mul op " << *mulOp
1427 << "\n\tinto fma op " << *fmaOp);
1428
1429 // Replace all the uses of Op with the fmaOp, and remove Op
1430 Op->replaceAllUsesWith(fmaOp);
1431 Op->erase();
1432 // If Op was the only consumer of mulOp, then there are no more uses of
1433 // mulOp. Remove it.
1434 if (mulOp->use_empty())
1435 mulOp->erase();
1436}
1437
// Given the operation attributes (start, offset, step, square, etc.), generate
// an AIE mul/fma op for the incoming vector mul/fma Op. 'nextStart' is used
// for schemes that require two AIE dialect fma ops to be generated for one
// vector dialect fma op for AIE1; the only difference between the attributes of
// the two AIE dialect fma ops is the start field. For AIEML, i8xi8 scheme
// generates one MulConvOp or FMAConvOp for each vector dialect mul/fma op.
static void generateMulOrFMAOp(Operation *Op, Scheme &scheme,
                               AIEOpAttributes &opAttr, VectState *state,
                               const std::string &nextStart = "") {
  // Assert that we computed the attributes for both the operands
  assert(opAttr.start.size() == opAttr.offset.size() &&
         opAttr.start.size() == 2);

  // Set insertion point of the AIE dialect mul/fma op
  state->builder.setInsertionPointAfter(Op);

  // Return true if any user of this op is not mul/fma op
  auto notMulOrFMAOp = [&](Operation *op) {
    return !isa<MulIOp, MulFOp, vector::FMAOp>(op);
  };

  // Generate an AIE dialect mul/fma op from a vector dialect mul/fma op
  auto genOp = [&](Operation *Op, AIEOpAttributes &opAttr, VectState *state,
                   bool i8xi8_pairedOp = false) {
    Operation *repOp;
    // Create aievec::aie1::FMAOp corresponding to the vector::FMAOp
    if (auto fmaOp = dyn_cast<vector::FMAOp>(Op))
      repOp = generateFMAOp(fmaOp, opAttr, state, i8xi8_pairedOp);
    // Create aievec::aie1::MulOp corresponding to the vector::MulIOp
    else if (auto mulOp = dyn_cast<MulIOp>(Op))
      repOp = generateMulOp<MulIOp>(mulOp, opAttr, state);
    // Create aievec::aie1::MulOp corresponding to the vector::MulFOp
    else if (auto mulOp = dyn_cast<MulFOp>(Op))
      repOp = generateMulOp<MulFOp>(mulOp, opAttr, state);
    else
      llvm_unreachable("Operation not mul/fma op");
    return repOp;
  };

  Operation *repOp = genOp(Op, opAttr, state);
  LLVM_DEBUG(llvm::dbgs() << "\n\nGenerated AIE dialect mul/fma op " << *repOp);

  // For AIE1, i8xi8 scheme generates two AIE dialect mul/fma ops for each
  // vector dialect mul/fma op. Generate the paired mul/fma op if nextStart is
  // not empty. For AIEML, i8xi8 scheme generates one MulConvOp or FMAConvOp for
  // each vector dialect mul/fma op.
  if (!nextStart.empty()) {
    if (state->aieml && scheme.lanes == 32 && scheme.xbits == 8 &&
        scheme.zbits == 8) {
      // AIE-ML: one conv op. If a non-mul/fma consumer needs the result as a
      // vector, add an SRS op to move it out of the accumulator as i8.
      repOp = generateMulOrFMAConvOpForInt8(Op, opAttr, state);
      if (any_of(repOp->getUsers(), notMulOrFMAOp)) {
        Type i8Type =
            IntegerType::get(repOp->getResult(0).getType().getContext(), 8);
        repOp =
            generateSRSOp(repOp->getResult(0), i8Type, state, repOp->getLoc());
      }
    } else {
      // AIE1: emit the second op of the pair, identical except for start[1].
      opAttr.start[1] = nextStart;
      Operation *pairedOp = genOp(Op, opAttr, state, true);
      LLVM_DEBUG(llvm::dbgs() << "\n\nGenerated the paired AIE dialect "
                              << "mul/fma op for 8x8 scheme " << *repOp);
      // Link the two mul/fma ops
      assert(!state->pairedOp.count(repOp));
      state->pairedOp[repOp] = pairedOp;
      // If any of the uses of incoming op is not a mul/fma op, then we need to
      // concatenate the paired ops and generate a v16xi8 vector.
      if (any_of(Op->getUsers(), notMulOrFMAOp))
        repOp = concatAndInterleave_i8xi8(repOp, pairedOp, state, Op->getLoc());
    }
  }

  // Replace all the uses of the vector mul/fma op with the AIE mul/fma op, and
  // remove vector op from the IR.
  Op->replaceAllUsesWith(repOp);
  Op->erase();
}
1514
1515// Compute the start and offset for xbuff/zbuff for 32x32 scheme.
1516static void computeBuffAttr_i32xi32(
1517 unsigned vecSize, // #lanes
1518 int32_t start, // start in AIE vec
1519 int32_t accIncr, // access change with each loop increment
1520 AIEOpAttributes &opAttr) {
1521 // Populate start
1522 std::string startStr = std::to_string(start);
1523 // Compute the offset resembling "0x76543210"
1524 std::string offsetStr = "0x";
1525 for (int i = vecSize - 1; i >= 0; --i)
1526 offsetStr.push_back(getHexValue(i * accIncr));
1527
1528 // And now we have everything to push into opAttr
1529 opAttr.start.push_back(startStr);
1530 opAttr.offset.push_back(offsetStr);
1531 opAttr.offset_hi.push_back("");
1532 opAttr.square.push_back("");
1533 opAttr.step.push_back("");
1534}
1535
1536// Compute the start, lo/hi offset, and square for xbuff for 16x16 scheme.
1537static void computeXbuffAttr_i16xi16(
1538 unsigned vecSize, // #lanes
1539 int32_t start, // computed start in AIE vec
1540 int32_t accIncr, // access change with each loop increment
1541 int32_t colOffset, // xbuff access distance between vector cols
1542 AIEOpAttributes &opAttr) {
1543 // The colOffset must be either <=1, or a multiple of 2
1544 assert(colOffset >= -1 && (colOffset <= 1 || colOffset % 2 == 0) &&
1545 "cannot compute offset and square for xbuff");
1546 // We can only generate the offsets and square if either accIncr or column
1547 // offset is <= 1.
1548 assert((accIncr <= 1 || colOffset <= 1) &&
1549 "cannot generate offset and square for xbuff");
1550
1551 // Arch restriction: xstart should be a multiple of 2.
1552 int32_t m2start = (start / 2) * 2;
1553 std::string startStr = std::to_string(m2start);
1554 // m2Offset accounts for the extra 1 if the start is not a multiple of 2
1555 int32_t m2Offset = start - m2start;
1556
1557 // Compute hi and lo offsets to something resembling "0x_7_6_5_4" and
1558 // "0x_3_2_1_0" respectively. The '_' are 0 if colOffset is 1.
1559 std::string offsetStr = "0x";
1560 int32_t offset = std::max(colOffset, accIncr);
1561 for (int i = vecSize / 2 - 2; i >= 0; i -= 2) {
1562 offsetStr.push_back(offset <= 1 ? '0' : getHexValue((offset - 2) / 2));
1563 offsetStr.push_back(getHexValue((i * accIncr) / 2));
1564 }
1565 std::string offsetHiStr = "0x";
1566 for (int i = vecSize - 2, e = vecSize / 2; i >= e; i -= 2) {
1567 offsetHiStr.push_back(offset <= 1 ? '0' : getHexValue((offset - 2) / 2));
1568 offsetHiStr.push_back(getHexValue((i * accIncr) / 2));
1569 }
1570
1571 // Now compute the square for xbuff.
1572 int32_t cstep = std::min(2, std::abs(colOffset));
1573 int32_t astep = std::min(2, accIncr);
1574 assert(m2Offset == 0 || (astep <= 1 && cstep <= 1));
1575
1576 SmallVector<int32_t> sqPattern = {astep + cstep, astep, cstep, 0};
1577 std::string squareStr = "0x";
1578 for (auto sq : sqPattern)
1579 squareStr.push_back(getHexValue(sq + m2Offset));
1580
1581 // And now we have everything to push into opAttr
1582 opAttr.start.push_back(startStr);
1583 opAttr.offset.push_back(offsetStr);
1584 opAttr.offset_hi.push_back(offsetHiStr);
1585 opAttr.square.push_back(squareStr);
1586 opAttr.step.push_back("");
1587}
1588
1589// Compute the start, lo/hi offset, and step for zbuff for 16x16 scheme.
1590static void computeZbuffAttr_i16xi16(
1591 unsigned vecSize, // #lanes
1592 int32_t start, // computed start in AIE vec
1593 int32_t accIncr, // access change with each loop increment
1594 int32_t zeroOffset, // offset of 0 value in the filter
1595 int32_t colOffset, // zbuff access distance between vector cols
1596 bool aieml, AIEOpAttributes &opAttr) {
1597 std::string offsetStr, offsetHiStr;
1598 // zstart must be 4b value.
1599 assert(start < (aieml ? 32 : 16) && "zstart must be 4b value");
1600 std::string startStr = std::to_string(start);
1601
1602 // If zbuff comes from splat, use default offsets
1603 if (accIncr == 0)
1604 offsetStr = offsetHiStr = "0";
1605 else {
1606 // Compute hi and lo offsets using general scheme
1607 offsetStr = "0x";
1608 for (int i = vecSize / 2 - 1; i >= 0; --i)
1609 offsetStr.push_back(getHexValue(i * accIncr));
1610 offsetHiStr = "0x";
1611 for (auto i = vecSize - 1, e = vecSize / 2; i >= e; --i)
1612 offsetStr.push_back(getHexValue(i * accIncr));
1613 }
1614
1615 // Compute step between columns
1616 int32_t step = colOffset == -1 ? zeroOffset - 1 - start : colOffset;
1617 assert(step >= 0 && "zstep cannot be negative");
1618 std::string stepStr = std::to_string(step);
1619
1620 // And now we have everything to push into opAttr
1621 opAttr.start.push_back(startStr);
1622 opAttr.offset.push_back(offsetStr);
1623 opAttr.offset_hi.push_back(offsetHiStr);
1624 opAttr.square.push_back("");
1625 opAttr.step.push_back(stepStr);
1626}
1627
1628// Compute the start, offset, square, and step for xbuff for 8x8 scheme. This
1629// is the data scheme, but since is is so restricted, we do a switcharoo, and
1630// use filter as xbuff. We assume that the filter elements are duplicated
1631// (duplication factor= 2). For example, the 2x2 filter should be
1632// {0,0,1,1,2,2,3,3}.
1633static void computeXbuffAttr_i8xi8(
1634 unsigned vecSize, // #lanes
1635 int32_t start, // computed start in AIE vec
1636 int32_t colOffset, // xbuff access distance between vector cols
1637 AIEOpAttributes &opAttr) {
1638 // Assert that colStep is a multiple of 4, where colStep is the difference
1639 // between idx[i][j] and idx[i][j+2].
1640 assert(
1641 colOffset >= 2 &&
1642 "each filter entry must be replicated at least twice for i8xi8 scheme");
1643 int32_t colStep = 2 * colOffset;
1644 assert(colStep % 4 == 0 && "xstep must be multiple of 4");
1645
1646 // Arch restriction: xstart must be a multiple of 4
1647 int32_t m4start = (start / 4) * 4;
1648 std::string startStr = std::to_string(m4start);
1649 // m4Offset accounts for the excess if start is not a multiple of 4
1650 int32_t m4Offset = start - m4start;
1651 // Because of duplication, m4Offset can only be 0 or 2
1652 assert(m4Offset == 0 || m4Offset == 2);
1653
1654 // Compute offsetStr to something resembling "0x_0_0_0_0", where _ is
1655 // (colStep-4)/4.
1656 std::string offsetStr = "0x";
1657 for (int i = vecSize / 4 - 1; i >= 0; --i) {
1658 offsetStr.push_back(getHexValue(colStep / 4 - 1));
1659 offsetStr += "0";
1660 }
1661 std::string stepStr = std::to_string(colStep);
1662
1663 // Now compute the square for zbuff. We want a {0,x,0,x} pattern.
1664 int32_t offsetWithoutDup = colOffset / 2;
1665 int32_t rstep = offsetWithoutDup >= 2 ? 2
1666 : colOffset == -1 ? 1
1667 : offsetWithoutDup;
1668 assert(m4Offset == 0 || rstep <= 1);
1669
1670 SmallVector<int32_t> sqPattern = {rstep, 0, rstep, 0};
1671 std::string squareStr = "0x";
1672 for (auto sq : sqPattern)
1673 squareStr.push_back(getHexValue(sq + m4Offset));
1674
1675 // And now we have everything to push into opAttr
1676 opAttr.start.push_back(startStr);
1677 opAttr.offset.push_back(offsetStr);
1678 opAttr.offset_hi.push_back("");
1679 opAttr.square.push_back(squareStr);
1680 opAttr.step.push_back(stepStr);
1681}
1682
1683// Compute the start, offset, square, and step for zbuff for 8x8 scheme. This
1684// is the coefficient scheme, but since the coefficient scheme is more relaxed,
1685// we use image as zbuff.
1686static void computeZbuffAttr_i8xi8(
1687 unsigned vecSize, // #lanes
1688 int32_t start, // computed start in AIE vec
1689 int32_t accIncr, // access change with each loop increment
1690 int32_t colOffset, // zbuff access distance between vector cols
1691 AIEOpAttributes &opAttr, std::string &nextStart) {
1692 // The colOffset must be either <=1, or a multiple of 2
1693 assert((colOffset <= 1 || colOffset % 2 == 0) && "zbuff value not supported");
1694
1695 // Arch restriction: zstart is a multiple of 2
1696 int32_t m2start = (start / 2) * 2;
1697 std::string startStr = std::to_string(m2start);
1698 // m2Offset accounts for the extra 1 if the start is not a multiple of 2
1699 int32_t m2Offset = start - m2start;
1700
1701 // Compute offsetStr to something resembling "0x43322110". The usual pattern
1702 // is "0x_3_2_1_0", and the purpose is to fill the "_".
1703 std::string offsetStr = "0x";
1704 for (int i = vecSize / 4 - 1; i >= 0; --i) {
1705 int32_t val = i * accIncr + (colOffset + 1) / 2;
1706 offsetStr.push_back(getHexValue(val));
1707 offsetStr.push_back(getHexValue(i * accIncr));
1708 }
1709 std::string stepStr = std::to_string(2 * std::abs(colOffset));
1710 nextStart = std::to_string(m2start + 2 * accIncr * (vecSize / 4));
1711
1712 // Now compute the square for zbuff. We want a {0,1+x,y,y+1+x} pattern, where
1713 // x is the square offset, and y is the accIncr.
1714 int32_t rstep = colOffset >= 2 ? 2 : std::abs(colOffset);
1715 assert(m2Offset == 0 || rstep <= 1);
1716
1717 SmallVector<int32_t> sqPattern = {accIncr + rstep, accIncr, rstep, 0};
1718 std::string squareStr = "0x";
1719 for (auto sq : sqPattern)
1720 squareStr.push_back(getHexValue(sq + m2Offset));
1721
1722 // And now we have everything to push into opAttr
1723 opAttr.start.push_back(startStr);
1724 opAttr.offset.push_back(offsetStr);
1725 opAttr.offset_hi.push_back("");
1726 opAttr.square.push_back(squareStr);
1727 opAttr.step.push_back(stepStr);
1728}
1729
// Find a length-k chain of FMA ops such that (1) the chain is linear; (2) the
// operand datawidth is 16 or 8 bits; (3) the access distance between lhs (rhs)
// operands of both FMAs is compile-time constant. These FMAs will be fused
// into a single FMA. Technically, k is equal to the number of columns in the
// FMA topology. If fused, cache the pair indicating the access difference
// between the operands for the two FMAs.
//
// Parameters:
//   refOp      - the mul/fma op anchoring (seeding) the fusion chain
//   fusedOpSet - out-param; each op fused into refOp is inserted here so the
//                caller can erase it and skip it as a future chain seed
//   cols       - maximum chain length (#columns of the FMA intrinsic)
//   state      - global vectorization state
static void fuseFMAOps(Operation *refOp,
                       llvm::SmallSet<Operation *, 8> &fusedOpSet, int32_t cols,
                       VectState *state) {
  // The number of columns must be greater than 1. refOp must be mul/fma op,
  // and should not be covered by the simple vector scheme.
  if (cols <= 1 || !isa<MulIOp, MulFOp, vector::FMAOp>(refOp) ||
      isSimpleVectIntrinsic(refOp, state))
    return;

  // Get the start offsets for left and right operands of the reference
  // operator, i.e., start of the fusion chain.
  Operation *lOp = getOperandDefOp(state, refOp, 0);
  Operation *rOp = getOperandDefOp(state, refOp, 1);

  int32_t lstart = computeStartInAIEVec(lOp, state);
  int32_t rstart = computeStartInAIEVec(rOp, state);

  // The xbuff and zbuff offsets between the fused FMA ops. The default value
  // is -1, meaning "not yet established by any fused pair".
  int xOffset = -1, zOffset = -1;

  // We write a loop that tries to chase a linear chain of length col-1
  // starting at reference mul/fma op refOp. Let us consider a computational
  // chain of length 3 : {c = A[i]*B[i]; c += A[i+1]*B[i+1]; c +=
  // A[i+2]*B[i+2]}; We represent the start for each instruction as a pair
  // (lhs-operand-start, rhs-operand-start). The starts for the chain will be
  // {(0,0), (1,1), (2,2)}. Since the consecutive starts are equidistant in
  // the chain, we consider this chain fusable, and cache the fused operations
  // in the fusedOps vector.
  Operation *curOp = refOp;
  SmallVector<Operation *, 8> fusedOps;

  for (auto len = 0; len < cols - 1; ++len) {
    // If this operation has more than one use, break loop. A multi-use
    // accumulator cannot be folded away.
    if (!curOp->hasOneUse())
      break;
    // Get the consumer of the curOp FMA
    Operation *usrOp = *curOp->getUsers().begin();
    // The user/consumer user operation must be a FMA, belonging to the same
    // basic block as curOp, and must not be covered by simple scheme.
    if (!isa<vector::FMAOp>(usrOp) || curOp->getBlock() != usrOp->getBlock() ||
        isSimpleVectIntrinsic(usrOp, state))
      break;
    // Both curOp and usrOp must be either fma or fmsub(msc); mixing add- and
    // subtract-accumulate in one intrinsic is not possible.
    if (isa<vector::FMAOp>(curOp) &&
        state->mscOps.count(curOp) != state->mscOps.count(usrOp))
      break;
    // Compute the start/access distance for each operand of curOp and usrOp
    SmallVector<int32_t, 2> offsets;
    for (size_t idx = 0; idx < 2; ++idx) {
      // Get the vector attributes for this operand of curOp and usrOp
      AIEVecAttributes cstat = getOperandVecStats(curOp, state, idx);
      AIEVecAttributes ustat = getOperandVecStats(usrOp, state, idx);
      // We need to ensure that the accesses to this operand of curOp and usrOp
      // come from the same vector. To guarantee this, we perform two checks:
      // Check 1. The accesses must be similar
      if (cstat.vecSizeInBits != ustat.vecSizeInBits ||
          cstat.elementSizeInBits != ustat.elementSizeInBits ||
          cstat.loadFromMemory != ustat.loadFromMemory ||
          cstat.isSplat != ustat.isSplat)
        break;
      // Check 2. The accesses must come from the same vector/upd op
      Operation *cdefOp = getOperandDefOp(state, curOp, idx);
      Operation *udefOp = getOperandDefOp(state, usrOp, idx);

      // Two loads are "related" if they map to the same reuse interval, and
      // will therefore be served by the same AIE vector register.
      bool related = cdefOp == udefOp;
      if (!related && cstat.loadFromMemory && ustat.loadFromMemory) {
        IntervalReuse *civ = state->getIntervalForOperation(cdefOp);
        IntervalReuse *uiv = state->getIntervalForOperation(udefOp);
        related =
            civ == uiv && civ->getInterval(cdefOp) == uiv->getInterval(udefOp);
      }
      if (!related)
        break;

      // We know that the accesses to this operand for both curOp and usrOp
      // come from the same AIE vector. So we can get the start value for the
      // operands.
      int32_t start1 = computeStartInAIEVec(cdefOp, state);
      int32_t start2 = computeStartInAIEVec(udefOp, state);
      int32_t offset = start2 - start1;
      // perform a set of checks to make sure that the distance can be encoded
      // in AIE intrinsic.
      // Check 1: the offset should be positive
      if (offset < 0)
        break;
      // Check 2: If offset is greater than 1, it should be a multiple of 2
      if (offset > 1 && offset % 2 != 0)
        break;
      // Check 3: If offset is >=2, then the reference op must have start=0
      int32_t refStart = idx == 0 ? lstart : rstart;
      if (!ustat.isSplat && offset > 1 && refStart != 0)
        break;
      // From this operand's perspective, we can fuse this usrOp with curOp.
      // Cache the start offset.
      offsets.push_back(offset);
    }
    // Verify that we computed offset for both operands (the inner loop may
    // have bailed out after 0 or 1 entries).
    if (offsets.size() < 2)
      break;
    // Ensure that the difference between consecutive xOffsets and zOffsets is
    // consistent throughout the chain.
    if ((xOffset != -1 && xOffset != offsets[0]) ||
        (zOffset != -1 && zOffset != offsets[1]))
      break;
    // Now the user FMA op can be fused with refOp
    xOffset = offsets[0];
    zOffset = offsets[1];
    fusedOps.push_back(usrOp);
    // usrOp now becomes curOp, so that we can chase the linear chain starting
    // at it.
    curOp = usrOp;
  }

  // If there are no ops fused, return
  if (fusedOps.empty())
    return;

  LLVM_DEBUG(llvm::dbgs() << "\n\nFused following fma ops with op " << *refOp);

  // If we reached here, we have decided to fuse a linear chain of FMAs, we
  // need to remove the fused FMAs from the IR. (The actual erase happens in
  // the caller; here we only record them and rewire their uses.)
  for (auto &op : fusedOps) {
    LLVM_DEBUG(llvm::dbgs() << "\n\tfma op " << *op);
    fusedOpSet.insert(op);
    // Since we are fusing op with refOp, fuse their access extents too
    fuseAccessExtent(refOp, op, state);
    // Now replace the uses of op with reference
    op->replaceAllUsesWith(refOp);
  }

  // Cache the column offsets for refOp
  assert(!state->opToColOffsets.count(refOp));
  state->opToColOffsets[refOp] = std::make_pair(xOffset, zOffset);
}
1871
1872// Compute all the attributes for xbuff, based on the scheme.
1873static void computeXbuffAttributes(
1874 Scheme &scheme, // vect scheme info
1875 int32_t start, // computed start in AIE vec
1876 int32_t colOffset, // xbuff access distance between vector cols
1877 int32_t accIncr, // xbuff access incr with each loop increment
1878 int32_t dupFactor, // duplication factor for i8xi8 filter
1879 bool aieml, AIEOpAttributes &opAttr) {
1880 // Branch to different schemes
1881 // Case 1: 32x32 real
1882 if ((scheme.lanes == 8 || (aieml && scheme.lanes == 16)) &&
1883 scheme.cols == 1 && scheme.xbits == 32 && scheme.zbits == 32)
1884 computeBuffAttr_i32xi32(scheme.lanes, start, accIncr, opAttr);
1885 // Case 2: 16x16 real
1886 else if ((scheme.lanes == 16 || (aieml && scheme.lanes == 32)) &&
1887 scheme.cols == 2 && scheme.xbits == 16 && scheme.zbits == 16) {
1888 // We only support a loop increment of <= 1
1889 assert((accIncr <= 1 || accIncr % 2 == 0) &&
1890 "loop step size value not supported");
1891 computeXbuffAttr_i16xi16(scheme.lanes, start, accIncr, colOffset, opAttr);
1892 }
1893 // Case 3: 8x8 real
1894 else if ((scheme.lanes == 16 || (aieml && scheme.lanes == 32)) &&
1895 scheme.cols == 8 && scheme.xbits == 8 && scheme.zbits == 8) {
1896 // We only support a loop increment of <= 1
1897 assert(accIncr <= 1 && "loop step size greater than 1 not supported");
1898 // If we were not able to fuse any of the macs to exploit column topology,
1899 // then colOffset must be equal to dupFactor.
1900 if (colOffset == -1)
1901 colOffset = dupFactor;
1902 computeXbuffAttr_i8xi8(scheme.lanes, start, colOffset, opAttr);
1903 } else
1904 llvm_unreachable("Unsupported vectorization scheme");
1905}
1906
1907// Compute all the attributes for zbuff, based on the scheme.
1908static void computeZbuffAttributes(
1909 Scheme &scheme, // vect scheme info
1910 int32_t start, // computed start in AIE vec
1911 int32_t colOffset, // zbuff access distance between vector cols
1912 int32_t accIncr, // zbuff access incr with each loop increment
1913 int32_t zeroOffset, // zero offset of filter for i16xi16 scheme
1914 bool aieml,
1915 std::string &nextStart, // start of mul/mac pair in i8xi8 scheme
1916 AIEOpAttributes &opAttr) {
1917 // Branch to different schemes
1918 // Case 1: 32x32 real
1919 if ((scheme.lanes == 8 || (aieml && scheme.lanes == 16)) &&
1920 scheme.cols == 1 && scheme.xbits == 32 && scheme.zbits == 32)
1921 computeBuffAttr_i32xi32(scheme.lanes, start, accIncr, opAttr);
1922 // Case 2: 16x16 real
1923 else if ((scheme.lanes == 16 || (aieml && scheme.lanes == 32)) &&
1924 scheme.cols == 2 && scheme.xbits == 16 && scheme.zbits == 16) {
1925 // We only support a loop increment of <= 1
1926 assert(accIncr <= 1 && "loop step size greater than 1 not supported");
1927 // Get the zero offset in filter if the user provided it in the command
1928 // line. The zero offset is cyclic, so compute an offset that is > start.
1929 zeroOffset = zeroOffset == 0 ? scheme.lanes
1930 : start + zeroOffset - (start % zeroOffset);
1931 computeZbuffAttr_i16xi16(scheme.lanes, start, accIncr, zeroOffset,
1932 colOffset, aieml, opAttr);
1933 }
1934 // Case 3: 8x8 real
1935 else if ((scheme.lanes == 16 || (aieml && scheme.lanes == 32)) &&
1936 scheme.cols == 8 && scheme.xbits == 8 && scheme.zbits == 8) {
1937 // We only support a loop increment of <= 1
1938 assert(accIncr <= 1 && "loop step size greater than 1 not supported");
1939 computeZbuffAttr_i8xi8(scheme.lanes, start, accIncr, colOffset, opAttr,
1940 nextStart);
1941 } else
1942 llvm_unreachable("Unsupported vectorization scheme");
1943}
1944
// For this mul/FMA operator, generate AIE dialect mul/FMA op based on
// different vector schemes. This function (1) computes per-operand attributes
// (start, offsets, square, step, etc.) according to the selected scheme, and
// (2) delegates the actual op creation to generateMulOrFMAOp.
static void generateSchemeBasedMulOrFMAOp(Operation *Op, VectState *state) {
  int32_t lanes, cols;
  std::tie(lanes, cols) = getNumRowsAndCols(Op, state);
  // Get the data sizes for left and right operands of mul/fma. If an operand
  // is produced by a sext/trunc that the pass tracks in sextTruncDefMap, look
  // through it to the original (pre-extension) value so the element width
  // reflects the true data size.
  Value lhs = state->sextTruncDefMap.count(Op->getOperand(0).getDefiningOp())
                  ? Op->getOperand(0).getDefiningOp()->getOperand(0)
                  : Op->getOperand(0);
  Value rhs = state->sextTruncDefMap.count(Op->getOperand(1).getDefiningOp())
                  ? Op->getOperand(1).getDefiningOp()->getOperand(0)
                  : Op->getOperand(1);
  int32_t xbits = getElementSizeInBits(cast<VectorType>(lhs.getType()));
  int32_t zbits = getElementSizeInBits(cast<VectorType>(rhs.getType()));
  Scheme scheme(lanes, cols, xbits, zbits);

  // First check if this operation requires simple vector operation, and not an
  // advanced scheme.
  if (isSimpleVectIntrinsic(Op, state)) {
    // opAttr will cache the attributes (start, step, offsets, square, etc.)
    // for both lhs and rhs operands.
    AIEOpAttributes opAttr;
    // For simple scheme, we do not need any attribute; push empty strings so
    // the attribute vectors still have one entry per operand.
    for (size_t idx = 0; idx < 2; ++idx) {
      opAttr.start.push_back("");
      opAttr.offset.push_back("");
      opAttr.offset_hi.push_back("");
      opAttr.square.push_back("");
      opAttr.step.push_back("");
    }
    generateMulOrFMAOp(Op, scheme, opAttr, state);
    return;
  }

  // Otherwise generate mul or fma op based on advanced scheme. Get the rows,
  // cols, and datatype size for the vector scheme, and pack all that
  // information in the Scheme struct.
  // If element size is < 32 bits, we can fuse multiple FMAs together to
  // exploit the column topology of FMA intrinsic. (-1,-1) means this op was
  // not part of any fused column chain.
  auto colOffset = state->opToColOffsets.count(Op) ? state->opToColOffsets[Op]
                                                   : std::make_pair(-1, -1);

  // opAttr will cache the step,offsets,square, etc. for both lhs and rhs
  // operands.
  AIEOpAttributes opAttr;
  // For i8xi8 scheme, each vector dialect mul/fma op is converted to two AIE
  // dialect mul/fma op. The two AIE ops are identical, except for the start
  // field. nextStart indicates the start of the second op.
  std::string nextStart;
  // Compute relevant attributes (start, offsets, step, square, etc.) for each
  // operand, and store them in opAttr. idx 0 is the lhs (xbuff), idx 1 the
  // rhs (zbuff).
  for (size_t idx = 0; idx < 2; ++idx) {
    AIEVecAttributes stat = getOperandVecStats(Op, state, idx);
    Operation *op = getOperandDefOp(state, Op, idx);

    int32_t start = 0, accIncr = 1;
    // If the operand comes from transfer_read, compute the step and start
    // values.
    if (stat.loadFromMemory) {
      auto readOp = cast<TransferReadOp>(op);
      // How does the access change with each iteration of the vectorized
      // loop? A splat never advances, so its increment is 0.
      accIncr = stat.isSplat ? 0 : computeVecorizedLoopStepSize(readOp, state);
      // start in the AIE vector
      start = computeStartInAIEVec(op, state);
    }
    // Compute the xbuff and zbuff attributes
    if (idx == 0)
      computeXbuffAttributes(scheme, start, colOffset.first, accIncr,
                             state->dupFactor, state->aieml, opAttr);
    else
      computeZbuffAttributes(scheme, start, colOffset.second, accIncr,
                             state->zeroOffset, state->aieml, nextStart,
                             opAttr);
  }
  // And now generate the mul/fma op
  generateMulOrFMAOp(Op, scheme, opAttr, state, nextStart);
}
2022
2023// If the datatype allows it, fuse a mul or fma op with other fma ops to
2024// utilize the column topology of the AIE mul/fma intrinsic (e.g., 2 fmas can
2025// be fused for i16xi16 scheme, and 8 for i8xi8 scheme).
2026static void fuseFMAOpsForColumnTopology(func::FuncOp func, VectState *state) {
2027 // A set of FMA ops that were fused in the column topology
2028 llvm::SmallSet<Operation *, 8> fusedOpSet;
2029
2030 // Fuse FMA ops to exploit column topology
2031 func.walk([&](Operation *op) {
2032 if (isa<MulIOp, MulFOp, vector::FMAOp>(op)) {
2033 // Only process fma ops that are not already fused with another mul/fma
2034 if (!fusedOpSet.count(op)) {
2035 auto [lanes, cols] = getNumRowsAndCols(op, state);
2036 // Try fusing a linear chain of FMA ops (max length = cols) starting at
2037 // op.
2038 fuseFMAOps(op, fusedOpSet, cols, state);
2039 }
2040 }
2041 });
2042
2043 // Remove all the ops that were fused with other FMAs
2044 for (auto op : fusedOpSet)
2045 op->erase();
2046}
2047
// Return true if the two AIE mul/fma ops carry identical access attributes
// (offsets, hi offsets, square, step) for both operands, and the start of
// curOp is exactly 2 past the start of defOp on both operands. This is the
// precondition for collapsing them into a single convolution op.
template <typename T1, typename T2>
static bool matchAttributesAndDistanceForFusion(T1 curOp, T2 defOp) {
  for (int opnd : {0, 1}) {
    // All per-operand access attributes must agree.
    if (curOp.getOffset(opnd) != defOp.getOffset(opnd) ||
        curOp.getOffsetHi(opnd) != defOp.getOffsetHi(opnd) ||
        curOp.getSquare(opnd) != defOp.getSquare(opnd) ||
        curOp.getStep(opnd) != defOp.getStep(opnd))
      return false;
    // The start attributes are decimal strings; their distance must be 2.
    int32_t dist = stoi(static_cast<std::string>(curOp.getStart(opnd))) -
                   stoi(static_cast<std::string>(defOp.getStart(opnd)));
    if (dist != 2)
      return false;
  }
  return true;
}
2065
2066// We go through each fma operation and try to find the pattern like this-
2067// the acc of fma is a mul/fma operation which uses the same operands as fma.
2068// the def of two operands are upd operations.
2069// Transform -
2070// %5 = aievec_aie1.mul %4, %0 {xoffsets = "[[Xo:.*]]", xoffsets_hi =
2071// "[[Xh:.*]]", xsquare = "[[Sq:.*]]", xstart = "0", zoffsets = "[[Zo:.*]]",
2072// zoffsets_hi =
2073// "[[Zh:.*]]", zstart = "0", zstep = "[[Zs:.*]]"}
2074//
2075// %6 = aievec_aie1.mac %4, %0, %5 {xoffsets = "[[Xo:.*]]",
2076// xoffsets_hi = "[[Xh:.*]]", xsquare = "[[Sq:.*]]", xstart = "2", zoffsets =
2077// "[[Zo:.*]]", zoffsets_hi = "[[Zh:.*]]", zstart = "2", zstep = "[[Zs:.*]]"}
2078//
2079// to-
2080//
2081// %7 = aievec_aie1.mul_conv %6, %1 {M = 16 : si32, N = 4 : si32}
2082//
2083// or transform the pattern like this-
2084//
2085// %9 = aievec_aie1.mac %8, %0, %6 {xoffsets = "[[Xo:.*]]", xoffsets_hi =
2086// "[[Xh:.*]]", xsquare = "[[Sq:.*]]", xstart = "0", zoffsets = "[[Zo:.*]]",
2087// zoffsets_hi =
2088// "[[Zh:.*]]", zstart = "4", zstep = "[[Zs:.*]]"}
2089//
2090// %10 = aievec_aie1.mac %8, %0, %9 {xoffsets =
2091// "[[Xo:.*]]", xoffsets_hi = "[[Xh:.*]]", xsquare = "[[Sq:.*]]", xstart = "2",
2092// zoffsets = "[[Zo:.*]]", zoffsets_hi = "[[Zh:.*]]", zstart = "6", zstep =
2093// "[[Zs:.*]]"}
2094//
2095// to-
2096//
2097// %9 =
2098// aievec.fma_conv %8, %2, %7 {M = 16 : si32, N = 4 : si32}
2099// Currently, we only support mul_conv_16x4 and mac_conv_16x4 intrinsics for
2100// int16 type of AIE-ML architecture.
2101static bool canFuseMulFMAOpsForInt16(Operation *Op) {
2102 // Check 1. This should be an aievec fma operation
2103 assert(isa<aievec::aie1::FMAOp>(Op) && "operation must be an aievec fma op");
2104 auto curOp = cast<aievec::aie1::FMAOp>(Op);
2105
2106 // Check 2. Element type should be int16
2107 auto vType = cast<VectorType>(Op->getOperand(1).getType());
2108 Type stype = vType.getElementType();
2109 auto itype = llvm::dyn_cast<IntegerType>(stype);
2110
2111 if (!itype)
2112 return false;
2113
2114 if (unsigned width = itype.getWidth(); width != 16)
2115 return false;
2116
2117 // Check 3. acc operand of the Op should be a mul op or fma op
2118 Operation *mulOrFMAOp = Op->getOperand(2).getDefiningOp();
2119
2120 if (!isa<aievec::aie1::MulOp, aievec::aie1::FMAOp>(mulOrFMAOp))
2121 return false;
2122
2123 // Check 4. mulOrFMAOp must have one use
2124 if (!mulOrFMAOp->hasOneUse())
2125 return false;
2126
2127 // Check 5. mulOrFMAOp and Op must have the same lhs and rhs
2128 if (mulOrFMAOp->getOperand(0) != Op->getOperand(0) ||
2129 mulOrFMAOp->getOperand(1) != Op->getOperand(1))
2130 return false;
2131
2132 Value lhs = nullptr;
2133 Value rhs = nullptr;
2134 Value acc = nullptr;
2135 bool isMulOp = false;
2136
2137 // If the acc operand is a mul op, we will try to generate mul_conv operation
2138 // If the acc operand is a fma op, we will try to generate fma_conv operation
2139 if (auto mulOp = dyn_cast<aievec::aie1::MulOp>(mulOrFMAOp)) {
2140 isMulOp = true;
2141
2142 // Determine the lhs and rhs values for the mul_conv
2143 lhs = mulOp->getOperand(0);
2144 rhs = mulOp->getOperand(1);
2145 } else {
2146 auto fmaOp = cast<aievec::aie1::FMAOp>(mulOrFMAOp);
2147
2148 // Determine the lhs, rhs and acc values for the fma_conv
2149 lhs = fmaOp->getOperand(0);
2150 rhs = fmaOp->getOperand(1);
2151 acc = fmaOp->getOperand(2);
2152 }
2153
2154 // Check 6. The def of two operands are upd operations
2155 auto lUpdOp = dyn_cast<aievec::UPDOp>(lhs.getDefiningOp());
2156 auto rUpdOp = dyn_cast<aievec::UPDOp>(rhs.getDefiningOp());
2157
2158 if (!lUpdOp || !rUpdOp) {
2159 return false;
2160 }
2161
2162 // Check 7. All the ops should belong to the same block, otherwise we might
2163 // not be able to fuse them safely
2164 if (lhs.getParentBlock() != rhs.getParentBlock())
2165 return false;
2166
2167 if (acc && rhs.getParentBlock() != acc.getParentBlock())
2168 return false;
2169
2170 // Check 8. xstart and zstart distance between two operations should be
2171 // 2. offsets, offsets_hi, square and step of two operations should be same.
2172 return (isMulOp && matchAttributesAndDistanceForFusion(
2173 curOp, cast<aievec::aie1::MulOp>(mulOrFMAOp))) ||
2174 matchAttributesAndDistanceForFusion(
2175 curOp, cast<aievec::aie1::FMAOp>(mulOrFMAOp));
2176}
2177
// Rewrite a mul/fma and fma op as a aievec MUL_conv or FMA_Conv op.
// Precondition: canFuseMulFMAOpsForInt16(Op) returned true, so Op is an
// aievec::aie1::FMAOp whose acc is a single-use mul/fma sharing Op's lhs/rhs,
// and both operands are defined by upd ops.
static void fuseMulFMAOpsForInt16(Operation *Op, VectState *state) {
  auto curOp = cast<aievec::aie1::FMAOp>(Op);

  Value lhs = curOp->getOperand(0);

  // 1. Deal with the lhs:
  // lhs of current FMAOp should be an upd operation with 512-bit vector width.
  // For AIE-ML, we can directly load 512 bits vectors. Thus, we can delete the
  // upd operation with index 1 and use its source (index-0) upd directly.
  auto lUpdOp = dyn_cast<aievec::UPDOp>(lhs.getDefiningOp());
  if (lUpdOp.getIndex() == 1) {
    auto lUpdOp0 = dyn_cast<aievec::UPDOp>(lUpdOp.getVector().getDefiningOp());
    lUpdOp->replaceAllUsesWith(lUpdOp0);
    lUpdOp->erase();
  }

  // 2. Deal with the rhs:
  // Since vector size of current FMAOp rhs is 256 bits, we need to generate a
  // concat op (duplicating the 256-bit value) to make the vector size 512
  // bits.
  auto rUpdOp = dyn_cast<aievec::UPDOp>(curOp->getOperand(1).getDefiningOp());
  state->builder.setInsertionPointAfter(rUpdOp);
  AIEVecAttributes rstat = getOperandVecStats(curOp, state, 1);
  assert(rstat.vecSizeInBits % 256 == 0);
  Value concatRhs = nullptr;

  if (rstat.vecSizeInBits == 256) {
    VectorType concatType =
        createVectorType(512 / rstat.elementSizeInBits, rstat.elementType);
    SmallVector<Value> sources = {rUpdOp->getResult(0), rUpdOp->getResult(0)};
    concatRhs = generateConcatOp(sources, state, rUpdOp->getLoc(), concatType);
  }

  // Get the def op of acc. It is either a mul op or a fma op (guaranteed by
  // canFuseMulFMAOpsForInt16).
  Operation *convOp = nullptr;
  Operation *mulOrFMAOp = Op->getOperand(2).getDefiningOp();
  auto mulOp = dyn_cast<aievec::aie1::MulOp>(mulOrFMAOp);
  auto fmaOp = dyn_cast<aievec::aie1::FMAOp>(mulOrFMAOp);
  int32_t zStart;

  // The acc op's zstart (a decimal string attribute) determines how far the
  // rhs must be shifted for the convolution form.
  if (mulOp) {
    aievec::aie1::MulOp defOp = mulOp;
    zStart = stoi(static_cast<std::string>(defOp.getStart(1)));
  } else {
    aievec::aie1::FMAOp defOp = fmaOp;
    zStart = stoi(static_cast<std::string>(defOp.getStart(1)));
  }

  auto vType = cast<VectorType>(Op->getOperand(1).getType());
  int32_t shiftBytes = zStart * getElementSizeInBits(vType) / 8;

  auto defOp = mulOp ? mulOp : fmaOp;
  state->builder.setInsertionPoint(defOp);
  Location loc = defOp->getLoc();

  // Generate a shift_bytes operation for concatRhs if needed.
  if (shiftBytes)
    concatRhs = generateShiftOp(concatRhs, concatRhs, shiftBytes, state, loc);

  // The conv op accumulates into a wider integer type: 32b for <=8b
  // elements, 64b otherwise.
  Type stype = vType.getElementType();
  auto itype = cast<IntegerType>(stype);
  unsigned width = itype.getWidth() <= 8 ? 32 : 64;
  Type ctype = IntegerType::get(itype.getContext(), width);
  Type opType = VectorType::get(vType.getShape(), ctype);
  Value acc = nullptr;
  // Currently, we only support 16x4 convolution intrinsics for int16 type
  // AIE-ML.
  int32_t M = itype.getWidth();
  int32_t N = 4;
  // Update lhs value, since it has been changed after we deleted the upd
  // operation with index 1
  lhs = curOp->getOperand(0);

  if (mulOp)
    convOp = aievec::MulConvOp::create(state->builder, loc, opType, lhs,
                                       concatRhs, M, N);
  else {
    // For the fma form, carry over the acc chain and the msc (subtract) flag.
    acc = defOp->getOperand(2);
    bool isSub = state->mscOps.count(defOp);
    convOp = aievec::FMAConvOp::create(state->builder, loc, opType, lhs,
                                       concatRhs, acc, M, N, isSub);
  }

  // Replace Op with the conv op and remove both fused ops from the IR.
  Op->replaceAllUsesWith(convOp);
  Op->erase();
  defOp->erase();
}
2265
2266static void fuseMulFMAOpsByMulFMAConv(func::FuncOp func, VectState *state) {
2267 func.walk([&](Operation *Op) {
2268 if (isa<aievec::aie1::FMAOp>(Op) && canFuseMulFMAOpsForInt16(Op))
2269 fuseMulFMAOpsForInt16(Op, state);
2270 });
2271}
2272
2273// Generate the AIE mul/fma op for each vector mul/fma op. This function is the
2274// crux of AIE vectorization. It accomplishes two main tasks: (1) For each
2275// mul/fma operation, compute the operand attributes. The attributes are start,
2276// offsets, square, step, etc. based on the scheme; and (2) Once all the
2277// attributes are computed, generate appropriate mul/fma operation in AIE
2278// dialect.
2279static void generateAIEMulOrFMAOpsInFunc(func::FuncOp func, VectState *state) {
2280 // For each mul/fma op, compute the scheme-dependent operand attributes, and
2281 // generate corresponding AIE dialect ops.
2282 func.walk([&](Operation *op) {
2283 if (isa<MulIOp, MulFOp, vector::FMAOp>(op))
2284 generateSchemeBasedMulOrFMAOp(op, state);
2285 });
2286}
2287
2288// Given the operation attributes (start, offset, square, etc.), generate an
2289// AIE add/sub op for the incoming vector add/sub Op.
2290static void generateAddOrSubOp(Operation *Op, AIEOpAttributes &opAttr,
2291 VectState *state) {
2292
2293 // Set insertion point of the AIE dialect mul/fma op
2294 state->builder.setInsertionPointAfter(Op);
2295
2296 // Generate an AIE dialect add/sub op
2297 Operation *repOp = nullptr;
2298 if (isa<SubIOp, SubFOp>(Op)) {
2299 repOp = generateSubOp(Op, opAttr, state);
2300 LLVM_DEBUG(llvm::dbgs() << "\n\nGenerated AIE dialect sub op " << *repOp);
2301 } else {
2302 repOp = generateAddOp(Op, opAttr, state);
2303 LLVM_DEBUG(llvm::dbgs() << "\n\nGenerated AIE dialect sub op " << *repOp);
2304 }
2305
2306 // Replace all the uses of the vector add/sub op with the AIE add/sub op, and
2307 // remove Op from the IR.
2308 Op->replaceAllUsesWith(repOp);
2309 Op->erase();
2310}
2311
2312// For this add/sub operator, generate AIE dialect add/sub op based on
2313// different vector schemes.
2314static void generateSchemeBasedAddOrSubOp(Operation *Op, VectState *state) {
2315 // opAttr will cache the attributes (start, offsets, square, etc.) for both
2316 // lhs and rhs operands.
2317 AIEOpAttributes opAttr;
2318
2319 // First check if this operation requires simple vector operation, and not an
2320 // advanced scheme.
2321 if (isSimpleVectIntrinsic(Op, state)) {
2322 // For simple scheme, we do not need any attribute
2323 for (size_t idx = 0; idx < 2; ++idx) {
2324 opAttr.start.push_back("");
2325 opAttr.offset.push_back("");
2326 opAttr.offset_hi.push_back("");
2327 opAttr.square.push_back("");
2328 }
2329 generateAddOrSubOp(Op, opAttr, state);
2330 return;
2331 }
2332
2333 // Otherwise generate add/sub op based on advanced scheme.
2334 // Compute relevant attributes (start, offsets, square, etc.) for each
2335 // operand, and store them in opAttr.
2336 for (size_t idx = 0; idx < 2; ++idx) {
2337 AIEVecAttributes stat = getOperandVecStats(Op, state, idx);
2338 assert(stat.elementSizeInBits >= 16 &&
2339 "advanced scheme for add op on int8 data type not supported");
2340
2341 int32_t start = 0, accIncr = 1;
2342 std::string startStr;
2343 std::string offsetStr, offsetHiStr;
2344 std::string squareStr;
2345
2346 // If the operand comes from transfer_read, compute the loop step and start
2347 // values.
2348 if (stat.loadFromMemory) {
2349 Operation *op = Op->getOperand(idx).getDefiningOp();
2350 auto readOp = cast<TransferReadOp>(op);
2351 // How does the access change with each iteration of the vectorized loop?
2352 accIncr = stat.isSplat ? 0 : computeVecorizedLoopStepSize(readOp, state);
2353 // start in the AIE vector
2354 start = computeStartInAIEVec(op, state);
2355 }
2356 // Now the usual processing. For i32 datatype, use the regular lane
2357 // selection.
2358 if (stat.elementSizeInBits == 32) {
2359 startStr = std::to_string(start);
2360 offsetStr = "0x";
2361 for (int i = 7; i >= 0; --i)
2362 offsetStr.push_back(getHexValue(i * accIncr));
2363 // If there are >8 lanes, we need to compute offset_hi
2364 if (stat.lanes > 8) {
2365 assert(stat.lanes == 16 && "Cannot generate offset for add/sub op");
2366 // Cannot have loop stride > 1
2367 assert(accIncr <= 1 && "Cannot generate offset for given loop stride");
2368 offsetHiStr = "0x";
2369 for (int i = 15; i >= 8; --i)
2370 offsetStr.push_back(getHexValue(i * accIncr));
2371 }
2372 } else if (stat.elementSizeInBits == 16) {
2373 assert(accIncr <= 1 && "cannot generate offset for given loop stride");
2374 // start must be a multiple of 2 for i16 data type
2375 int32_t m2Offset = start % 2;
2376 startStr = std::to_string(start - m2Offset);
2377 // We must compute the offset and offset_hi only if the access is not
2378 // splat. For splat, we can use trivial offsets.
2379 if (accIncr == 0)
2380 offsetStr = offsetHiStr = "0";
2381 else {
2382 offsetStr = "0x";
2383 for (int i = 6; i >= 0; i -= 2) {
2384 offsetStr.push_back('0');
2385 offsetStr.push_back(getHexValue((i * accIncr) / 2));
2386 }
2387 offsetHiStr = "0x";
2388 for (int i = 14; i >= 8; i -= 2) {
2389 offsetHiStr.push_back('0');
2390 offsetHiStr.push_back(getHexValue((i * accIncr) / 2));
2391 }
2392 }
2393 // We use a simplistic square that covers only two cases: access is
2394 // splat, and access is regular with stride that's power of 2.
2395 if (m2Offset == 0 && accIncr == 0)
2396 squareStr = "0";
2397 else {
2398 assert(m2Offset == 0 || accIncr == 0);
2399 squareStr = "0x";
2400 int32_t astep = std::min(1, accIncr);
2401 SmallVector<int32_t> sqPattern = {3 * astep, 2 * astep, astep, 0};
2402 for (auto sq : sqPattern)
2403 squareStr.push_back(getHexValue(sq + m2Offset));
2404 }
2405 } else
2406 llvm_unreachable("Cannot generate advanced add op for given datatype");
2407
2408 // We have computed all the fields. Cache the attributes.
2409 opAttr.start.push_back(startStr);
2410 opAttr.offset.push_back(offsetStr);
2411 opAttr.offset_hi.push_back(offsetHiStr);
2412 opAttr.square.push_back(squareStr);
2413 }
2414 // And now generate the add/sub op
2415 generateAddOrSubOp(Op, opAttr, state);
2416}
2417
2418// The main focus of this function is to compute the right start/offset fields
2419// for the adds involving splat. If none of the operands of the add op is
2420// splat, we must generate simple scheme add op.
2421static void generateAIEAddOrSubOpsInFunc(func::FuncOp func, VectState *state) {
2422 func.walk([&](Operation *op) {
2423 if (isa<AddIOp, AddFOp, SubIOp, SubFOp>(op))
2424 generateSchemeBasedAddOrSubOp(op, state);
2425 });
2426}
2427
2428// Generate UPD ops to subsume all the transfer_read ops of affine dialect. To
2429// generate the UPD ops, we first visit the innermost for op, and for each
2430// transfer_read instruction nested inside that op, create a set of UPD ops,
2431// and then insert them in the front bb of that for op's region.
2432static void insertUPDOpsInLoop(affine::AffineForOp forOp, VectState *state) {
2433 // Recursively generate UPD ops in the nested for op's.
2434 for (affine::AffineForOp nestedOp :
2435 forOp.getRegion().getOps<affine::AffineForOp>())
2436 insertUPDOpsInLoop(nestedOp, state);
2437
2438 // A map from an interval to the UPD op. The key gives the interval that
2439 // should be loaded into the AIE vec, and the value indicates the UPD op
2440 // achieving that. The value also has an 8-bit field, whose first/second bit
2441 // is set if upd op idx=0/idx=1 is already created for this interval.
2442 mlir::DenseMap<std::tuple<IntervalReuse *, int32_t, int32_t>,
2443 std::pair<aievec::UPDOp, int8_t>>
2444 memToUpdMap;
2445 // A map from a read operation to its corresponding UPD operation. The idea
2446 // is that multiple read ops will derive from the same bigger vector
2447 // register.
2448 mlir::DenseMap<Operation *, aievec::UPDOp> readOpToUpdMap;
2449 // Iterate over all the transfer_read ops within this loop
2450 Region &region = forOp.getRegion();
2451 for (TransferReadOp readOp : region.getOps<TransferReadOp>()) {
2452 aievec::UPDOp updOp = generateUPDOp(readOp, memToUpdMap, region, state);
2453 readOpToUpdMap[readOp] = updOp;
2454 }
2455
2456 // Now replace all the uses of a transfer_read op with its UPD op
2457 for (auto &map : readOpToUpdMap) {
2458 Operation *op = map.first;
2459 op->replaceAllUsesWith(map.second);
2460 op->erase();
2461 }
2462}
2463
2464// Replace all the transfer_read ops with UPD ops in the function.
2465static void insertUPDOpsInFunc(func::FuncOp func, VectState *state) {
2466 for (affine::AffineForOp forOp : func.getOps<affine::AffineForOp>()) {
2467 insertUPDOpsInLoop(forOp, state);
2468 }
2469}
2470
// Incoming Op is an operation in AIE dialect whose result is an accumulator.
// Check all its uses, and if any user of Op is a non-AIE operation, insert an
// SRS instruction to move the value from accumulator to vector. On the
// AIE-ML path, a cheaper cast op is emitted instead when the result width
// already matches the consumer's element width (see below).
static void insertSRSOp(Operation *Op, VectState *state) {
  // This operation must have at least one use, and at least one result
  if (Op->use_empty() || Op->getNumResults() == 0)
    return;

  // The operation must write to an accumulator
  assert(writesToAccumulator(Op));

  // Check if any user of this operation is a non-AIE op. If any user of this
  // operation is non-AIE op, then we need to generate SRS op to move value
  // from accumulator to vector
  auto isNonAIEOp = [&](Operation *op) { return !isAIEOp(op); };
  if (!any_of(Op->getUsers(), isNonAIEOp))
    return;

  // Given an accumulator, one can use different srs intrinsic to generate
  // different output types. Create a map from SRS output type to the SRS op,
  // so one SRS op is shared by all users needing the same scalar type.
  mlir::DenseMap<Type, aievec::SRSOp> typeToSRSOpMap;

  // Set the insertion point for the AIE dialect SRS op
  state->builder.setInsertionPointAfter(Op);

  // Iterate over all the users of this operation that are not in AIE dialect,
  // and replace the use of Op in them with srsOp
  for (auto user : Op->getUsers()) {
    // Skip AIE ops
    if (isAIEOp(user))
      continue;

    // Get the underlying scalar element type of user op. If the user is a
    // write op, it won't have a result. So get the element type from memref.
    Type scalarType;
    // memRefType stays null for non-write users; below it also gates the
    // AIE-ML cast path to write users only.
    MemRefType memRefType = nullptr;
    if (auto writeOp = dyn_cast<TransferWriteOp>(user)) {
      // Get the element type from the memref output
      memRefType = cast<MemRefType>(writeOp.getBase().getType());
      scalarType = memRefType.getElementType();
    } else
      scalarType = getElementTypeOrSelf(*user->getResultTypes().begin());
    assert(scalarType && "failed to form SRS op");
    // Iterate over all the operands of this user, and find the ones that
    // correspond to the Op.
    for (auto operand : user->getOperands()) {
      if (operand.getDefiningOp() == Op) {
        // Generate an AIE-ML cast op for the case that result vector width
        // less or equal that source vector width: 8-bit source elements whose
        // accumulator result element width already equals the consumer's
        // scalar width need no shift-round-saturate, only a cast.
        if (state->aieml && memRefType &&
            cast<VectorType>(Op->getOperand(0).getType())
                    .getElementType()
                    .getIntOrFloatBitWidth() == 8 &&
            cast<VectorType>(Op->getResult(0).getType())
                    .getElementType()
                    .getIntOrFloatBitWidth() ==
                scalarType.getIntOrFloatBitWidth()) {
          unsigned lanes =
              getVectorLaneSize(cast<VectorType>(Op->getResult(0).getType()));
          VectorType castType = createVectorType(lanes, scalarType);
          aievec::CastOp castOp = generateCastOp(Op->getResult(0), castType,
                                                 false, state, Op->getLoc());
          assert(castOp && "Failed to create Cast intrinsic");
          user->replaceUsesOfWith(operand, castOp);
          // Only one operand of user can be Op's result here; stop scanning.
          break;
        }
        aievec::SRSOp srsOp;
        // Reuse an existing SRS op for this scalar type, or create one.
        if (!typeToSRSOpMap.count(scalarType)) {
          srsOp =
              generateSRSOp(Op->getResult(0), scalarType, state, Op->getLoc());
          LLVM_DEBUG(llvm::dbgs() << "\n\nCreated SRS op " << srsOp
                                  << " for the acc output of operation " << Op);
          typeToSRSOpMap[scalarType] = srsOp;
        } else
          srsOp = typeToSRSOpMap[scalarType];
        assert(srsOp && "Failed to create SRS intrinsic");
        // And now we replace the operand with srsOp
        user->replaceUsesOfWith(operand, srsOp);
      }
    }
  }
}
2553
2554// Generate SRS op whenever we move data from an accumulator AIE dialect to a
2555// vector.
2556static void insertSRSOpsInFunc(func::FuncOp func, VectState *state) {
2557 func.walk([&](Operation *op) {
2558 // Insert an SRS op if the op outputs to an accumulator
2559 if (writesToAccumulator(op))
2560 insertSRSOp(op, state);
2561 });
2562}
2563
2564// Set existing read/write op to in-bounds, indicating that it always reads
2565// from/writes to a full buffer. We make this assumption for our vectorization
2566// framework.
2567template <typename TransferOp>
2568static void setInBounds(TransferOp op) {
2569 if (op.getTransferRank() == 0)
2570 return;
2571 SmallVector<bool, 4> bools(op.getTransferRank(), true);
2572 OpBuilder b(op.getContext());
2573 op->setAttr(op.getInBoundsAttrName(), b.getBoolArrayAttr(bools));
2574}
2575
2576// Remove redundant vector load/stores (i.e., transfer ops) that could be
2577// generated post unolling. The redundant operations are removed in two steps:
2578// first, we do a store to load forwarding. This removes the loads that
2579// immediately succeed a store to the same location. Then it removes multiple
2580// stores to the same memory location without an interfering store to that
2581// memref. The only preserves the last write. These transformations are already
2582// implemented in 'transferOpflowOpt' function. But these transformations only
2583// work on reads/writes that are within bounds. We safely assume that for AIE
2584// vectorization, all the transfer reads/writes are within bounds.
2585static void redundantLoadStoreOptimization(ModuleOp module) {
2586 for (func::FuncOp func : module.getOps<func::FuncOp>()) {
2587 // Mark all the transfer ops that have empty in_bounds as inbound
2588 func.walk([&](Operation *Op) {
2589 if (auto readOp = dyn_cast<TransferReadOp>(Op)) {
2590 if (!readOp.getInBounds())
2591 setInBounds<TransferReadOp>(readOp);
2592 } else if (auto writeOp = dyn_cast<TransferWriteOp>(Op)) {
2593 if (!writeOp.getInBounds())
2594 setInBounds<TransferWriteOp>(writeOp);
2595 }
2596 });
2597 // Now that all the transfer ops are marked inbound, remove redundant
2598 // vector loads/stores
2599 IRRewriter rewriter(module.getContext());
2600 vector::transferOpflowOpt(rewriter, func);
2601 }
2602}
2603
2604// Run a pre pipeline of cleanup passes (canonicalizer). Remove redundant
2605// load/store operations in case the code was generated via unrolling
2606static void preCanonicalizeIR(ModuleOp module) {
2607 PassManager pm(module.getContext());
2608 pm.addPass(createCanonicalizerPass());
2609 [[maybe_unused]] bool success = pm.run(module).succeeded();
2610 assert(success);
2611 redundantLoadStoreOptimization(module);
2612}
2613
2614// Run a post pipeline of cleanup and optimization passes (canonicalizer, LICM,
2615// CSE, etc). At the end, lower the output from affine to scf, so that we can
2616// use EmitC functionality to generate the loops.
2617static void postCanonicalizeIR(ModuleOp module) {
2618 PassManager pm(module.getContext());
2619 pm.addPass(createCanonicalizerPass());
2620 pm.addPass(createCSEPass());
2621 pm.addPass(createLoopInvariantCodeMotionPass());
2622 pm.addPass(createLowerAffinePass());
2623 [[maybe_unused]] bool success = pm.run(module).succeeded();
2624 assert(success);
2625}
2626
2627// Iterate over the loop nestings to form loop nesting bands. Then for each
2628// block within those bands, the enclosingLoops is set to the loop band.
2629static void
2630computeEnclosingLoopsPerBlock(affine::AffineForOp forOp, VectState *state,
2631 SmallVector<Operation *, 8> &enclosingLoops) {
2632 // Form the loop band for nested for ops
2633 for (affine::AffineForOp nestedOp :
2634 forOp.getRegion().getOps<affine::AffineForOp>()) {
2635 enclosingLoops.push_back(nestedOp);
2636 computeEnclosingLoopsPerBlock(nestedOp, state, enclosingLoops);
2637 enclosingLoops.pop_back();
2638 }
2639
2640 // Iterate over all the transfer_read operations enclosed within the current
2641 // region, and store the for loop nesting for the read op.
2642 for (TransferReadOp readOp : forOp.getRegion().getOps<TransferReadOp>()) {
2643 // Find the block corresponding to this transfer_read
2644 Block *block = readOp->getBlock();
2645 state->blockToEnclosingLoops[block] = enclosingLoops;
2646 }
2647}
2648
2649// We reorder the operands involved in multiplication so that (1) the splat
2650// operand is always the second operand, and (2) the bigger vector is the first
2651// operand. This allows us to form FMA intrinsic for AIE. The only exception to
2652// this rule is the 8x8 bit scheme, where the xbuff is a bit more restrictive,
2653// so we prefer splat as left operand of multiplication for 8x8 scheme.
2654static void reassociateMulOpInFunc(func::FuncOp func, VectState *state) {
2655 func.walk([&](Operation *op) {
2656 // Only reassociate vector mul ops that are well formed. This also includes
2657 // the multiplication component in fma ops.
2658 if (isa<MulIOp, MulFOp, vector::FMAOp>(op) && isWellFormedVectorOp(op)) {
2659 // 1. Reassociate so that splat is in the correct place
2660 reassociateMulOpWithSplat(op, state);
2661
2662 // 2. Reassociate so that bigger vector is the first operand
2663 reassociateMulOpBasedOnVecSize(op, state);
2664 }
2665 });
2666}
2667
// This is a very simple function that looks for add op of the form {a=b*c; d =
// a+e;}, and reassociates it so that the operand that computes a mult is the
// right operand of add op. This is a syntactic transformation that uses the
// commutativity of add op, and is only applied so that we can leverage the
// same code functionality for generating mac and msc ops.
static void reassociateAddOpInFunc(func::FuncOp func, VectState *state) {
  func.walk([&](Operation *op) {
    // Only reassociate vector add ops that are well formed.
    if (isa<AddIOp, AddFOp>(op) && isWellFormedVectorOp(op)) {
      // addOp must have two operands and one result
      assert(op->getNumOperands() == 2 && op->getNumResults() == 1);

      // Determine which operand is the multiply
      Operation *rhsOp = getOperandDefOp(state, op, 1);
      // left/right look through a recorded sext/trunc wrapper (see
      // recordSextOps): if the operand's defining op is in sextTruncDefMap,
      // use that op's own operand instead of the wrapper.
      Value left =
          state->sextTruncDefMap.count(op->getOperand(0).getDefiningOp())
              ? op->getOperand(0).getDefiningOp()->getOperand(0)
              : op->getOperand(0);
      Value right =
          state->sextTruncDefMap.count(op->getOperand(1).getDefiningOp())
              ? op->getOperand(1).getDefiningOp()->getOperand(0)
              : op->getOperand(1);
      // If rhs is mul operand, no need to proceed further
      if (!isa<MulIOp, MulFOp>(rhsOp)) {
        Operation *lhsOp = getOperandDefOp(state, op, 0);
        // If lhs is the mul operand, do the switcharoo
        if (isa<MulIOp, MulFOp>(lhsOp)) {
          LLVM_DEBUG(llvm::dbgs() << "\n\nReassociating addOp " << *op
                                  << " to place mul as rhs operand");
          // Swap: the (possibly unwrapped) mul side becomes operand 1.
          op->setOperand(0, right);
          op->setOperand(1, left);
          LLVM_DEBUG(llvm::dbgs() << "\n\taddOp after reassociation: " << *op);
        }
      } else {
        // rhs is already the mul; still rewrite both operands so any
        // sext/trunc wrappers are looked through.
        op->setOperand(0, left);
        op->setOperand(1, right);
      }
    }
  });
}
2708
2709// For i8xi8 scheme, the lhs operand vector size could be <= 256 bits, but the
2710// intrinsic requires the lhs operand vector to be at least 512 bits.
2711// Therefore, we check each read op, and (1) if it only appears in the LHS of a
2712// mul/fma op, and (2) its interval width is <= 256 bits, we tag the vector
2713// corresponding to it. Then we can try to coalesce two consecutive tagged
2714// intervals (i.e., vectors) in each ReuseInterval object. This removes the
2715// need of extra vector and two concat ops.
2716static void coalesceLHSOpVectorsInFunc(func::FuncOp func, VectState *state) {
2717 // Iterate over all the transfer read ops in this function
2718 func.walk([&](TransferReadOp op) {
2719 // Iterate over all the users of this read operation. We want to identify
2720 // if this read op only appears as an LHS operand of a mul/fma op.
2721 bool onlyLHS = true;
2722 for (auto user : op->getUsers()) {
2723 if (!isa<MulIOp, MulFOp, vector::FMAOp>(user) ||
2724 user->getOperand(0).getDefiningOp() != op) {
2725 onlyLHS = false;
2726 break;
2727 }
2728 }
2729 // If this read op only appears as LHS operand of mul/fma op, we find the
2730 // IntervalReuse object this op belongs to, and tag the interval (i.e.,
2731 // vector) subsuming this read op's access extent.
2732 if (onlyLHS) {
2733 IntervalReuse *iv = state->getIntervalForOperation(op);
2734 iv->markLHSOperandVec(op);
2735 }
2736 });
2737
2738 // All the tagging is done. Now iterate over all the IntervalReuse objects.
2739 // If any of those has tagged vector, try to coalesce the tagged vectors.
2740 for (auto interval : state->reuseIntervals) {
2741 interval->coalesceIntervals();
2742 }
2743}
2744
2745// Go through sext operations and record the operand's defining operation.
2746static void recordSextOps(func::FuncOp func, VectState *state) {
2747 func.walk([&](ExtSIOp op) {
2748 state->sextTruncDefMap[op] = op->getOperand(0).getDefiningOp();
2749 });
2750 func.walk([&](TruncIOp op) {
2751 state->sextTruncDefMap[op] = op->getOperand(0).getDefiningOp();
2752 });
2753}
2754
// For each read operation, compute the potential vector-level data reuse we
// can exploit for it, and place it into a matching IntervalReuse cluster
// (creating a new one if no existing cluster matches).
static void computeReuse(TransferReadOp readOp, VectState *state) {
  // Construct a linearized access expression for the transfer_read
  AffineExpr linearAccess = constructLinearizedAffineExpr(readOp, state);
  // Decompose the linear access into a base and constant offset value
  auto [base, offset] = getBaseAndOffset(linearAccess);

  // Get the step size of the vectorized loop that encloses this read operation
  int32_t step = computeVecorizedLoopStepSize(readOp, state);

  // If the permutation map is 0, the read operation is splat
  bool isSplat = readOp.getPermutationMap().isConstant();

  // Check if this readOp is the lhs or rhs operand of a mul/fma op. If it is,
  // then the vector size corresponding to its access extent should at least be
  // 256 bits. Otherwise, AIE vectors are at least 128 bits.
  unsigned minVecSize = 128;
  for (auto user : readOp->getUsers()) {
    // Direct mul/fma user consuming this read as operand 0 or 1.
    if (isa<MulIOp, MulFOp, vector::FMAOp>(user)) {
      if (user->getOperand(0).getDefiningOp() == readOp ||
          user->getOperand(1).getDefiningOp() == readOp) {
        minVecSize = 256;
        break;
      }
    }
    // The read may also feed a mul/fma through a sign-extension; follow the
    // sext's users and check via sextTruncDefMap whether this read is the
    // origin of the mul/fma's operand 0 or 1.
    if (isa<ExtSIOp>(user)) {
      auto extsiOp = cast<ExtSIOp>(user);
      for (auto consumer : extsiOp->getUsers()) {
        if (isa<MulIOp, MulFOp, vector::FMAOp>(consumer)) {
          if ((state->sextTruncDefMap.count(
                   consumer->getOperand(0).getDefiningOp()) &&
               state->sextTruncDefMap[consumer->getOperand(0)
                                          .getDefiningOp()] == readOp) ||
              (state->sextTruncDefMap.count(
                   consumer->getOperand(1).getDefiningOp()) &&
               state->sextTruncDefMap[consumer->getOperand(1)
                                          .getDefiningOp()] == readOp)) {
            minVecSize = 256;
            break;
          }
        }
      }
    }
  }

  // On AIE-ML, 512-bit vectors or 8-bit elements double the minimum vector
  // size requirement.
  auto vecType = cast<VectorType>(readOp.getVector().getType());
  if (state->aieml && (getVectorSizeInBits(vecType) == 512 ||
                       getElementSizeInBits(vecType) == 8)) {
    minVecSize *= 2;
  }

  bool found = false;
  // Iterate over all the IntervalReuse objects created thus far. Each object
  // represents a group of reads that have a potential of vector-level data
  // reuse. If we find an interval that (1) accesses an array with same base,
  // and (2) has other operations enclosed within the same same set of loops as
  // this operation, then we have the cluster of read ops that this op must be
  // grouped with.
  for (auto interval : state->reuseIntervals) {
    // Check if reuse is discovered
    if (interval->potentialReuse(readOp, base, state->blockToEnclosingLoops)) {
      // If the reuse is found with other operations in interval, add this
      // operation to interval.
      interval->insertInterval(readOp, state->opToIntervalMap, offset, step,
                               isSplat, minVecSize);
      found = true;
      break;
    }
  }
  // If no reuse is found, create a new IntervalReuse object with just this
  // operation's read access extent. Ownership note: the raw `new` here is
  // retained in state->reuseIntervals; presumably freed with the state —
  // TODO confirm lifetime management in VectState.
  if (!found) {
    auto iv = new IntervalReuse(readOp, base);
    iv->insertInterval(readOp, state->opToIntervalMap, offset, step, isSplat,
                       minVecSize);
    state->reuseIntervals.push_back(iv);
  }
}
2834
// Check whether a transfer_read is guaranteed to be aligned to the vector
// lane count. Returns success() when no misalignment can be proven; emits an
// error on the read op and returns failure otherwise. Three conditions are
// checked: the enclosing loop step, the loop upper bound's affine-map offset,
// and the shape sizes of all non-innermost dimensions.
static LogicalResult isUnalignedLoad(TransferReadOp readOp, VectState *state) {
  auto vectorType = cast<VectorType>(readOp.getResult().getType());
  unsigned lanes = getVectorLaneSize(vectorType);

  // A symbolic-or-constant linearized access has no loop-variant index, so it
  // cannot be misaligned by the loop structure.
  AffineExpr linearAccess = constructLinearizedAffineExpr(readOp, state);
  if (linearAccess.isSymbolicOrConstant()) {
    return success();
  }

  auto memRefType = cast<MemRefType>(readOp.getBase().getType());
  MLIRContext *context = memRefType.getContext();
  ArrayRef<int64_t> sizes = memRefType.getShape();
  int numDims = sizes.size();

  auto block = readOp->getBlock();
  assert(state->blockToEnclosingLoops.count(block) &&
         "enclosing loops should have been computed for the read operation\n");
  auto enclosingLoops = state->blockToEnclosingLoops[block];

  SmallVector<Value, 4> indices(readOp.getIndices().begin(),
                                readOp.getIndices().end());

  // If the lowest dim has iv, check whether its corresponding loop step is
  // divisible by the vector lanes.
  // NOTE(review): getAffineDimExpr always yields an AffineDimExpr, so this
  // dyn_cast looks like it always succeeds and the guard is effectively a
  // no-op — verify intent; dimExpr.getPosition() is then numDims - 1.
  if (auto dimExpr =
          dyn_cast<AffineDimExpr>(getAffineDimExpr(numDims - 1, context))) {
    auto index = indices[dimExpr.getPosition()];
    // Iterate over all enclosing loops, and find the one that is variant in
    // index.
    for (auto loop : enclosingLoops) {
      auto affineForOp = cast<affine::AffineForOp>(loop);
      auto iv = affineForOp.getInductionVar();
      auto invariants = affine::getInvariantAccesses(iv, indices);

      // `index` varies with this loop's induction variable.
      if (!invariants.count(index)) {
        int step = affineForOp.getStepAsInt();
        if (step % lanes) {
          return readOp->emitError()
                 << "Loop step of inner index of " << readOp->getName()
                 << " is not divisible by number of vector lanes.";
        }

        // To avoid generating the code with wrong results due to unaligned
        // upper bound's affine_map offset and loop step, we need to check
        // whether affine map's offset of loop upper bound is divisible by
        // the vector lanes.
        affine::AffineBound ub = affineForOp.getUpperBound();
        AffineMap origUbMap = ub.getMap();
        if (!origUbMap.isEmpty() && !origUbMap.isConstant()) {
          AffineExpr origUbMapResult = origUbMap.getResult(0);
          AffineExpr base;
          int32_t offset;
          std::tie(base, offset) = getBaseAndOffset(origUbMapResult);
          if (offset % lanes) {
            return readOp->emitError()
                   << "Loop upper bound's affine map offset of inner index of "
                   << readOp->getName()
                   << " is not divisible by number of vector lanes.";
          }
        }
      }
    }
  }

  // For the higher dimension, check whether the lower dimensions' shape sizes
  // is divisible by the vector lanes.
  for (int i = 1; i < numDims; ++i) {
    // Skip checking the higher dimensions with dynamic size.
    if (sizes[i] == -1) {
      continue;
    }

    if (sizes[i] % lanes) {
      return readOp->emitError()
             << readOp->getName() << "'s shape size of index " << i
             << " is not divisible by number of vector lanes.";
    }
  }

  return success();
}
2916
2917static LogicalResult hasUnalignedLoads(func::FuncOp func, VectState *state) {
2918 WalkResult result = func.walk([&](TransferReadOp op) {
2919 if (failed(isUnalignedLoad(op, state))) {
2920 return WalkResult::interrupt();
2921 }
2922 return WalkResult::advance();
2923 });
2924
2925 if (result.wasInterrupted()) {
2926 return failure();
2927 }
2928
2929 return success();
2930}
2931
2932// Compute the reuse interval for all the transfer_read operations. The
2933// transfer_read operations capture the vector load. Since AIE only allows for
2934// aligned vector loads, we need to compose multiple transfer reads together to
2935// form intervals of certain width (128, 256, 512, or 1024), and create an AIE
2936// vector from each interval.
2937static void computeReuseInFunc(func::FuncOp func, VectState *state) {
2938 // Now we can cluster all the transfer_read ops that have a potential of
2939 // vector-level data reuse.
2940 func.walk([&](TransferReadOp op) { computeReuse(op, state); });
2941}
2942
2943// Rewrite a sequence of mul and add/sub {a = b*c; d = a+e;} as an FMA op {d =
2944// b*c+e;}. This step only rewrites the FMA op in vector dialect.
2945static void rewriteFMAOpsInFunc(func::FuncOp func, VectState *state) {
2946 // Find a root add op that is well formed, and start from there
2947 func.walk([&](Operation *Op) {
2948 if (isa<AddIOp, AddFOp, SubIOp, SubFOp>(Op) && isWellFormedVectorOp(Op)) {
2949 // Perform a series of checks to see if we can find a mul and add/sub
2950 // that can be fused into a FMA. If found, fuse.
2951 if (canFuseMulAndAddOrSubIntoFMAOp(Op, state))
2952 fuseMulAndAddOrSubIntoFMAOp(Op, state);
2953 }
2954 });
2955}
2956
2957// Assuming commutativity and associativity of add and mul ops, reassociate ops
2958// so that code generation becomes feasible/easier.
2959static void reassociateOpsInFunc(func::FuncOp func, VectState *state) {
2960 // We assume that pointwise multiplication is commutative. So correct the
2961 // order of operands involved in multiplication so that we can form AIE
2962 // mul/fma intrinsic.
2963 reassociateMulOpInFunc(func, state);
2964 // We assume that pointwise addition is commutative. If any operand of the
2965 // add op is a mul op, then we reassociate it to be the right operand of add
2966 // op. This change ensures that in the next step, when we form FMA ops, we
2967 // reuse the functionality for mac/msc ops.
2968 reassociateAddOpInFunc(func, state);
2969}
2970
2972 AIEVectorize() = default;
2973 void runOnOperation() override;
2974};
2975
2976/// Generate AIE vector intrinsics for the current module. Assumption: the
2977/// input to this function is the mlir output generated after vectorizing the
2978/// scalar mlir input with affine superVectorizer. The vectorization factor
2979/// should be appropriately set to a power of 2 (e.g., 8 for i32xi32 scheme, 16
2980/// for i16xi16 scheme and i8xi8 scheme).
2982 // Verify the bounds of the incoming arguments
2983 assert(shiftParam < 64 && "SRS shift parameter should be between 0 and 63");
2984 assert(zeroOffset < 128 &&
2985 "Zero offset in the filter should be between 0 and 127");
2986 assert(dupFactor < 128 &&
2987 "Duplicate offset in the filter should be between 0 and 127");
2988
2989 ModuleOp module = getOperation();
2990
2991 // Canonicalize the incoming IR, mostly to simplify affine/compose apply ops
2992 preCanonicalizeIR(module);
2993
2994 // Iterate over all the functions in this module, and vectorize them
2995 for (func::FuncOp func : module.getOps<func::FuncOp>()) {
2996 // Create a new global state
2997 bool aieml = ::AIEML;
2998 bool unallignedCheck = ::unalignedLoadsCheck;
2999 if (this->unalignedLoadsCheck.hasValue())
3000 unallignedCheck = this->unalignedLoadsCheck;
3001 if (this->aieml.hasValue())
3002 aieml = this->aieml;
3003 auto *state = new VectState(func.getContext(), shiftParam, zeroOffset,
3004 dupFactor, unallignedCheck, aieml);
3005
3006 // record the sext op and its operand's def op to sextTruncDefMap
3007 recordSextOps(func, state);
3008
3009 // First compute the loops surrounding each load/store operation. This is
3010 // necessary to identify loads/stores that are nested together.
3011 for (auto forOp : func.getOps<affine::AffineForOp>()) {
3012 SmallVector<Operation *, 8> enclosingLoops;
3013 enclosingLoops.push_back(forOp);
3014 computeEnclosingLoopsPerBlock(forOp, state, enclosingLoops);
3015 }
3016
3017 // Check whether there is any unalignment loads.
3018 if (state->unalignedLoadsCheck && failed(hasUnalignedLoads(func, state))) {
3019 func.emitError() << "Cannot apply aie-vectorize to " << func->getName()
3020 << " because alignment check has failed.\n";
3021 return;
3022 }
3023
3024 // Compute the reuse for all the transfer_read operations, and form the
3025 // initial vector sizes.
3026 computeReuseInFunc(func, state);
3027 // We leverage the assumption that pointwise addition and multiplication
3028 // are commutative and associative to reassociate the operands of some
3029 // operators. This IR massaging makes it feasible to generate aie dialect
3030 // fma/msc intrinsics.
3031 reassociateOpsInFunc(func, state);
3032 // Rewrite vector dialect add and mul operation chains as vector dialect
3033 // fma operation if feasible.
3034 rewriteFMAOpsInFunc(func, state);
3035 // Coalesce vectors that only appear as LHS operands of mul/fma op if their
3036 // size is <= 256 bits.
3037 coalesceLHSOpVectorsInFunc(func, state);
3038 // Check for opportunities of fusing FMA ops to exploit the column topology
3039 // of the AIE vector intrinsic.
3040 fuseFMAOpsForColumnTopology(func, state);
3041 // For each vector dialect mul/fma op, compute the start and offset values
3042 // of its operands. Finally, generate AIE dialect mul/FMA ops.
3043 generateAIEMulOrFMAOpsInFunc(func, state);
3044 // Insert SRS ops to move data from accumulator to vector when the producer
3045 // is an AIE dialect op that writes to an accumulator, and the consumer
3046 // isn't an AIE dialect op.
3047 insertSRSOpsInFunc(func, state);
3048 // For each vector dialect add/sub op, compute the start and offset values
3049 // of its operands. Finally, generate AIE dialect add/sub ops. This should
3050 // be done after srs ops are generated, so that the input to the add op is
3051 // always vectors.
3052 generateAIEAddOrSubOpsInFunc(func, state);
3053 // Generate UPD ops that subsume all the transfer_read ops in affine
3054 // dialect. This happens after generating aie dialect add/sub ops because
3055 // those ops need to query transfer reads to know if their operand is
3056 // splat.
3057 insertUPDOpsInFunc(func, state);
3058 // Check for the opportunities of fusing Mul and FMA ops by Mul_Conv or
3059 // FMA_Conv.
3060 if (state->aieml)
3061 fuseMulFMAOpsByMulFMAConv(func, state);
3062 }
3063
3064 // Canonicalize the IR of all the functions in the module by running a set of
3065 // cleanup passes.
3066 postCanonicalizeIR(module);
3067}
3068
3069std::unique_ptr<Pass> aievec::createAIEVectorizePass() {
3070 return std::make_unique<AIEVectorize>();
3071}
int32_t computeStartInAIEVec(Operation *op, VectState *state)
void insertInterval(mlir::vector::TransferReadOp readOp, llvm::DenseMap< mlir::Operation *, IntervalReuse * > &dataAccessToIntervalMap, int32_t offset, int32_t forLoopStepSize, bool isSplat=false, unsigned minVecSize=128)
void setAccessExtent(mlir::Operation *op, std::pair< int32_t, int32_t > &extent)
int32_t getIntervalWidth(mlir::Operation *op)
std::pair< int32_t, int32_t > getAccessExtent(mlir::Operation *op)
std::pair< int32_t, int32_t > getInterval(mlir::Operation *op)
void markLHSOperandVec(mlir::Operation *op)
std::shared_ptr< Value > value()
Definition cxxopts.hpp:1026
bool isPowerOfTwo(int32_t n)
Definition AIEVecUtils.h:39
int32_t getVectorSizeInBits(mlir::VectorType type)
Definition AIEVecUtils.h:66
unsigned getVectorLaneSize(mlir::VectorType type)
Definition AIEVecUtils.h:55
char getHexValue(int val)
Definition AIEVecUtils.h:31
std::unique_ptr< mlir::Pass > createAIEVectorizePass()
mlir::VectorType createVectorType(unsigned lanes, mlir::Type elementType)
Definition AIEVecUtils.h:42
int32_t getElementSizeInBits(mlir::VectorType type)
Definition AIEVecUtils.h:49
mlir::VectorType getVectorOpDestType(mlir::VectorType type, bool AIE2)
Definition AIEVecUtils.h:80
bool isAIEOp(mlir::Operation *op)
Definition AIEVecUtils.h:73
AIEVectorize()=default
void runOnOperation() override
Generate AIE vector intrinsics for the current module.