AIR Asynchronous Concurrency

A CDFG abstraction is adopted in MLIR-AIR to represent asynchronous concurrency in an MLIR program. This abstraction contains compiler passes which progressively extracts, analyses and optimizes the program’s CDFG, in order to model asynchronous executions in AI Engine.

Compilation pipeline

Step one: extract CDFG from the MLIR-AIR program using -air-dependency

Below is a code snippet showing the memory allocation, data movement and computation of a tiled matrix multiplication program in MLIR-AIR, in synchronous execution order.

scf.for %arg11 = %c0 to %c128 step %c32 {
  %5 = memref.alloc() : memref<16x32xf32, 2>
  %6 = memref.alloc() : memref<32x64xf32, 2>
  %7 = memref.alloc() : memref<16x64xf32, 2>
  air.dma_memcpy_nd (%5[] [] [], %arg8[%3, %arg11] [%c16, %c32] [%c128, %c1_0]) {id = 1 : i32} : (memref<16x32xf32, 2>, memref<128x128xf32>)
  air.dma_memcpy_nd (%6[] [] [], %arg9[%arg11, %4] [%c32, %c64] [%c128, %c1_0]) {id = 2 : i32} : (memref<32x64xf32, 2>, memref<128x128xf32>)
  air.dma_memcpy_nd (%7[] [] [], %arg10[%3, %4] [%c16, %c64] [%c128, %c1_0]) {id = 3 : i32} : (memref<16x64xf32, 2>, memref<128x128xf32>)
  linalg.matmul ins(%5, %6 : memref<16x32xf32, 2>, memref<32x64xf32, 2>) outs(%7 : memref<16x64xf32, 2>)
  air.dma_memcpy_nd (%arg10[%3, %4] [%c16, %c64] [%c128, %c1_0], %7[] [] []) {id = 4 : i32} : (memref<128x128xf32>, memref<16x64xf32, 2>)
  memref.dealloc %5 : memref<16x32xf32, 2>
  memref.dealloc %6 : memref<32x64xf32, 2>
  memref.dealloc %7 : memref<16x64xf32, 2>
}

-air-dependency pass automatically analyzes the data dependency and loop-carried dependency in code, generates a CDFG object in the compiler backend, and updates the MLIR-AIR program with AIR Async operation interface. The post-analysis asynchronous MLIR-AIR program is shown below.

%2 = air.wait_all async [%async_token_8, %async_token_10]  {id = 2 : i32}
%3 = scf.for %arg11 = %c0 to %c128 step %c32 iter_args(%arg12 = %2) -> (!air.async.token) {
  %async_token_12, %results_13 = air.execute -> (memref<16x32xf32, 2>) {
    %alloc = memref.alloc() : memref<16x32xf32, 2>
    air.execute_terminator %alloc : memref<16x32xf32, 2>
  } {id = 8 : i32}
  %async_token_14, %results_15 = air.execute -> (memref<32x64xf32, 2>) {
    %alloc = memref.alloc() : memref<32x64xf32, 2>
    air.execute_terminator %alloc : memref<32x64xf32, 2>
  } {id = 9 : i32}
  %async_token_16, %results_17 = air.execute -> (memref<16x64xf32, 2>) {
    %alloc = memref.alloc() : memref<16x64xf32, 2>
    air.execute_terminator %alloc : memref<16x64xf32, 2>
  } {id = 10 : i32}
  %4 = air.dma_memcpy_nd async [%async_token_12, %arg12] (%results_13[] [] [], %arg8[%results_9, %arg11] [%c16, %c32] [%c128, %c1_7]) {id = 1 : i32} : (memref<16x32xf32, 2>, memref<128x128xf32>)
  %5 = air.dma_memcpy_nd async [%async_token_14, %arg12] (%results_15[] [] [], %arg9[%arg11, %results_11] [%c32, %c64] [%c128, %c1_7]) {id = 2 : i32} : (memref<32x64xf32, 2>, memref<128x128xf32>)
  %6 = air.dma_memcpy_nd async [%async_token_16, %arg12, %arg12] (%results_17[] [] [], %arg10[%results_9, %results_11] [%c16, %c64] [%c128, %c1_7]) {id = 3 : i32} : (memref<16x64xf32, 2>, memref<128x128xf32>)
  %async_token_18 = air.execute [%arg12, %5, %6, %4] {
    linalg.matmul ins(%results_13, %results_15 : memref<16x32xf32, 2>, memref<32x64xf32, 2>) outs(%results_17 : memref<16x64xf32, 2>)
  } {id = 11 : i32}
  %7 = air.dma_memcpy_nd async [%arg12, %async_token_18] (%arg10[%results_9, %results_11] [%c16, %c64] [%c128, %c1_7], %results_17[] [] []) {id = 4 : i32} : (memref<128x128xf32>, memref<16x64xf32, 2>)
  %async_token_19 = air.execute [%async_token_18] {
    memref.dealloc %results_13 : memref<16x32xf32, 2>
  } {id = 12 : i32}
  %async_token_20 = air.execute [%async_token_18] {
    memref.dealloc %results_15 : memref<32x64xf32, 2>
  } {id = 13 : i32}
  %async_token_21 = air.execute [%7] {
    memref.dealloc %results_17 : memref<16x64xf32, 2>
  } {id = 14 : i32}
  %8 = air.wait_all async [%arg12, %7]  {id = 1 : i32}
  scf.yield %8 : !air.async.token
}

Step two: CDFG canonicalization using -canonicalize and -air-dependency-canonicalize

MLIR-AIR provides direct canonicalization pass -canonicalize to transform the MLIR-AIR async operations to canoncal form.

Many CDFG edges, represented as async dependency tokens in an operation’s dependence list, are redundant as they do not represent the dominant producer-consumer relationships. MLIR-AIR provides a graph canonicalization pass -air-dependency-canonicalize which removes redundant edges in the CDFG using transitive reduction algorithm.

Optional: visualize CDFG using -air-dependency-parse-graph

The CDFG which represents the MLIR-AIR program’s asynchronous concurrency can be visualized as .dot files using the -air-dependency-parse-graph pass.

Dumping dependency graphs

Use the output-dir option to specify where DOT files are written:

air-opt input.mlir -air-dependency \
    -air-dependency-parse-graph='output-dir=/tmp/my_graphs'

This generates one DOT file per hierarchy level, plus a combined.dot that nests all levels using GraphViz subgraph clusters:

/tmp/my_graphs/
├── host.dot                                          # Top-level function graph
├── host_air_launch_1.dot                             # air.launch level
├── host_air_launch_1_air_segment_1.dot               # air.segment level
├── host_air_launch_1_air_segment_1_air_herd_1.dot    # air.herd level
└── combined.dot                                      # All levels nested

To visualize the post-canonicalization (transitive-reduced) graph instead, use dump-graph on -air-dependency-canonicalize:

air-opt input.mlir -air-dependency \
    -air-dependency-canonicalize='dump-graph=true output-dir=/tmp/canon_graphs'

To show per-core graphs within each herd (instead of a single herd-level graph), use the show-cores option:

air-opt input.mlir -air-dependency \
    -air-dependency-parse-graph='output-dir=/tmp/core_graphs show-cores=true'

Rendering DOT files

The generated DOT files can be rendered using Graphviz:

# Render to PNG
dot -Tpng /tmp/my_graphs/combined.dot -o combined.png

# Render to SVG
dot -Tsvg /tmp/my_graphs/host.dot -o host.svg

# Interactive viewer
xdot /tmp/my_graphs/combined.dot

Node color legend

Color Shape Meaning
Yellow Box Hierarchy ops (start, LaunchOp, SegmentOp, HerdOp, terminators)
Chartreuse Oval Compute ops (AllocOp, DeallocOp, LinalgOp, ExecuteTerminatorOp)
Cyan Oval Data movement ops (DmaMemcpyNdOp, ChannelPutOp, ChannelGetOp, CopyOp)
Crimson Box Synchronization ops (WaitAllOp, ScfForOp, ScfForYieldOp)

The rendered CDFG below is generated from the matrix multiplication example in the previous section.

Step three: CDFG optimization using -air-dependency-schedule-opt

Having extracted a CDFG from the MLIR-AIR code, the -air-dependency-schedule-opt pass performs scheduling optimization by detecting and transforming inefficient code patterns in CDFG. Data broadcasting opportunities across spatially mapped AIE cores are automatically detected and labelled. Below is the output code snippet lowered from the tiled matrix multiplication example above. Here, data broadcasting opportunities are labelled with keyword broadcast_pattern, followed by an affine set representing the address pattern of broadcast copy recepients.

#set = affine_set<(d0, d1)[s0] : (d0 - s0 == 0, d1 >= 0, -d1 + 1 >= 0, s0 >= 0, -s0 + 1 >= 0)>
#set1 = affine_set<(d0, d1)[s0] : (d0 >= 0, -d0 + 7 >= 0, d1 - s0 == 0, s0 >= 0, -s0 + 7 >= 0)>
%2 = air.wait_all async [%async_token_10, %async_token_8] 
%3 = scf.for %arg11 = %c0 to %c128 step %c32 iter_args(%arg12 = %2) -> (!air.async.token) {
  %async_token_12, %results_13 = air.execute -> (memref<16x32xf32, 2>) {
    %alloc = memref.alloc() : memref<16x32xf32, 2>
    air.execute_terminator %alloc : memref<16x32xf32, 2>
  }
  %async_token_14, %results_15 = air.execute -> (memref<32x64xf32, 2>) {
    %alloc = memref.alloc() : memref<32x64xf32, 2>
    air.execute_terminator %alloc : memref<32x64xf32, 2>
  }
  %async_token_16, %results_17 = air.execute -> (memref<16x64xf32, 2>) {
    %alloc = memref.alloc() : memref<16x64xf32, 2>
    air.execute_terminator %alloc : memref<16x64xf32, 2>
  }
  %4 = air.dma_memcpy_nd async [%async_token_12, %arg12] (%results_13[] [] [], %arg8[%results_9, %arg11] [%c16, %c32] [%c128, %c1_7]) {broadcast_pattern = #set, id = 1 : i32} : (memref<16x32xf32, 2>, memref<128x128xf32>)
  %5 = air.dma_memcpy_nd async [%async_token_14, %arg12] (%results_15[] [] [], %arg9[%arg11, %results_11] [%c32, %c64] [%c128, %c1_7]) {broadcast_pattern = #set1, id = 2 : i32} : (memref<32x64xf32, 2>, memref<128x128xf32>)
  %6 = air.dma_memcpy_nd async [%async_token_16, %arg12] (%results_17[] [] [], %arg10[%results_9, %results_11] [%c16, %c64] [%c128, %c1_7]) {id = 3 : i32} : (memref<16x64xf32, 2>, memref<128x128xf32>)
  %async_token_18 = air.execute [%6, %5, %4] {
    linalg.matmul ins(%results_13, %results_15 : memref<16x32xf32, 2>, memref<32x64xf32, 2>) outs(%results_17 : memref<16x64xf32, 2>)
  }
  %7 = air.dma_memcpy_nd async [%async_token_18] (%arg10[%results_9, %results_11] [%c16, %c64] [%c128, %c1_7], %results_17[] [] []) {id = 4 : i32} : (memref<128x128xf32>, memref<16x64xf32, 2>)
  %async_token_19 = air.execute [%async_token_18] {
    memref.dealloc %results_13 : memref<16x32xf32, 2>
  }
  %async_token_20 = air.execute [%async_token_18] {
    memref.dealloc %results_15 : memref<32x64xf32, 2>
  }
  %async_token_21 = air.execute [%7] {
    memref.dealloc %results_17 : memref<16x64xf32, 2>
  }
  scf.yield %7 : !air.async.token
}