GQE Kernel APIs

These APIs are implemented as OpenCL kernels:

gqeAggr

#include "xf_database/gqe_kernel_aggr_v2.hpp"
void gqeAggr (
    ap_uint <8*TPCH_INT_SZ*8> buf_in0 [],
    ap_uint <8*TPCH_INT_SZ*8> buf_in1 [],
    ap_uint <8*TPCH_INT_SZ*8> buf_in2 [],
    ap_uint <8*TPCH_INT_SZ*8> buf_in3 [],
    ap_uint <8*TPCH_INT_SZ*8> buf_in4 [],
    ap_uint <8*TPCH_INT_SZ*8> buf_in5 [],
    ap_uint <8*TPCH_INT_SZ*8> buf_in6 [],
    ap_uint <8*TPCH_INT_SZ*8> buf_in7 [],
    ap_uint <512> buf_metain [],
    ap_uint <512> buf_metaout [],
    ap_uint <8*TPCH_INT_SZ*VEC_LEN> buf_out0 [],
    ap_uint <8*TPCH_INT_SZ*VEC_LEN> buf_out1 [],
    ap_uint <8*TPCH_INT_SZ*VEC_LEN> buf_out2 [],
    ap_uint <8*TPCH_INT_SZ*VEC_LEN> buf_out3 [],
    ap_uint <8*TPCH_INT_SZ*VEC_LEN> buf_out4 [],
    ap_uint <8*TPCH_INT_SZ*VEC_LEN> buf_out5 [],
    ap_uint <8*TPCH_INT_SZ*VEC_LEN> buf_out6 [],
    ap_uint <8*TPCH_INT_SZ*VEC_LEN> buf_out7 [],
    ap_uint <8*TPCH_INT_SZ*VEC_LEN> buf_out8 [],
    ap_uint <8*TPCH_INT_SZ*VEC_LEN> buf_out9 [],
    ap_uint <8*TPCH_INT_SZ*VEC_LEN> buf_out10 [],
    ap_uint <8*TPCH_INT_SZ*VEC_LEN> buf_out11 [],
    ap_uint <8*TPCH_INT_SZ*VEC_LEN> buf_out12 [],
    ap_uint <8*TPCH_INT_SZ*VEC_LEN> buf_out13 [],
    ap_uint <8*TPCH_INT_SZ*VEC_LEN> buf_out14 [],
    ap_uint <8*TPCH_INT_SZ*VEC_LEN> buf_out15 [],
    ap_uint <8*TPCH_INT_SZ> buf_cfg [],
    ap_uint <8*TPCH_INT_SZ> buf_result_info [],
    ap_uint <8*TPCH_INT_SZ*VEC_LEN> ping_buf0 [],
    ap_uint <8*TPCH_INT_SZ*VEC_LEN> ping_buf1 [],
    ap_uint <8*TPCH_INT_SZ*VEC_LEN> ping_buf2 [],
    ap_uint <8*TPCH_INT_SZ*VEC_LEN> ping_buf3 [],
    ap_uint <8*TPCH_INT_SZ*VEC_LEN> pong_buf0 [],
    ap_uint <8*TPCH_INT_SZ*VEC_LEN> pong_buf1 [],
    ap_uint <8*TPCH_INT_SZ*VEC_LEN> pong_buf2 [],
    ap_uint <8*TPCH_INT_SZ*VEC_LEN> pong_buf3 []
    )

GQE aggregation kernel.

For detailed documentation, see GQE Kernel Design.

Parameters:

buf_in0 input table buffer.
buf_in1 input table buffer.
buf_in2 input table buffer.
buf_in3 input table buffer.
buf_in4 input table buffer.
buf_in5 input table buffer.
buf_in6 input table buffer.
buf_in7 input table buffer.
nrow input row number.
buf_out0 .. buf_out15 output table buffers.
buf_cfg input configuration buffer.
buf_result_info output information buffer.
ping_buf0 gqeAggr’s temporary buffer for storing overflow.
ping_buf1 gqeAggr’s temporary buffer for storing overflow.
ping_buf2 gqeAggr’s temporary buffer for storing overflow.
ping_buf3 gqeAggr’s temporary buffer for storing overflow.
pong_buf0 gqeAggr’s temporary buffer for storing overflow.
pong_buf1 gqeAggr’s temporary buffer for storing overflow.
pong_buf2 gqeAggr’s temporary buffer for storing overflow.
pong_buf3 gqeAggr’s temporary buffer for storing overflow.

gqeJoin

#include "xf_database/gqe_kernel_join_v2.hpp"
void gqeJoin (
    ap_uint <8*TPCH_INT_SZ*8> buf_A1 [8000],
    ap_uint <8*TPCH_INT_SZ*8> buf_A2 [8000],
    ap_uint <8*TPCH_INT_SZ*8> buf_A3 [8000],
    ap_uint <8*TPCH_INT_SZ*8> buf_A4 [8000],
    ap_uint <8*TPCH_INT_SZ*8> buf_A5 [8000],
    ap_uint <8*TPCH_INT_SZ*8> buf_A6 [8000],
    ap_uint <8*TPCH_INT_SZ*8> buf_A7 [8000],
    ap_uint <8*TPCH_INT_SZ*8> buf_A8 [8000],
    size_t _build_probe_flag,
    ap_uint <512> tin_meta [24],
    ap_uint <512> tout_meta [24],
    ap_uint <8*TPCH_INT_SZ*VEC_LEN> buf_C1 [8000],
    ap_uint <8*TPCH_INT_SZ*VEC_LEN> buf_C2 [8000],
    ap_uint <8*TPCH_INT_SZ*VEC_LEN> buf_C3 [8000],
    ap_uint <8*TPCH_INT_SZ*VEC_LEN> buf_C4 [8000],
    ap_uint <8*TPCH_INT_SZ*VEC_LEN> buf_C5 [8000],
    ap_uint <8*TPCH_INT_SZ*VEC_LEN> buf_C6 [8000],
    ap_uint <8*TPCH_INT_SZ*VEC_LEN> buf_C7 [8000],
    ap_uint <8*TPCH_INT_SZ*VEC_LEN> buf_C8 [8000],
    ap_uint <8*TPCH_INT_SZ*VEC_LEN> buf_D [8000],
    ap_uint <8*TPCH_INT_SZ*8> htb_buf0 [8000],
    ap_uint <8*TPCH_INT_SZ*8> htb_buf1 [8000],
    ap_uint <8*TPCH_INT_SZ*8> htb_buf2 [8000],
    ap_uint <8*TPCH_INT_SZ*8> htb_buf3 [8000],
    ap_uint <8*TPCH_INT_SZ*8> htb_buf4 [8000],
    ap_uint <8*TPCH_INT_SZ*8> htb_buf5 [8000],
    ap_uint <8*TPCH_INT_SZ*8> htb_buf6 [8000],
    ap_uint <8*TPCH_INT_SZ*8> htb_buf7 [8000],
    ap_uint <8*TPCH_INT_SZ*8> stb_buf0 [8000],
    ap_uint <8*TPCH_INT_SZ*8> stb_buf1 [8000],
    ap_uint <8*TPCH_INT_SZ*8> stb_buf2 [8000],
    ap_uint <8*TPCH_INT_SZ*8> stb_buf3 [8000],
    ap_uint <8*TPCH_INT_SZ*8> stb_buf4 [8000],
    ap_uint <8*TPCH_INT_SZ*8> stb_buf5 [8000],
    ap_uint <8*TPCH_INT_SZ*8> stb_buf6 [8000],
    ap_uint <8*TPCH_INT_SZ*8> stb_buf7 [8000]
    )

GQE join kernel.

Parameters:

_build_probe_flag kernel mode flag.
buf_A1 .. buf_A8 input table buffers.
buf_C1 .. buf_C8 output table buffers.
htb_buf0 .. htb_buf7 hash table buffers.
stb_buf0 .. stb_buf7 overflow region of the hash table.

gqePart

#include "xf_database/gqe_kernel_part_v2.hpp"
void gqePart (
    const int k_depth,
    const int col_index,
    const int bit_num,
    ap_uint <8*4*16> buf_A1 [],
    ap_uint <8*4*16> buf_A2 [],
    ap_uint <8*4*16> buf_A3 [],
    ap_uint <8*4*16> buf_A4 [],
    ap_uint <8*4*16> buf_A5 [],
    ap_uint <8*4*16> buf_A6 [],
    ap_uint <8*4*16> buf_A7 [],
    ap_uint <8*4*16> buf_A8 [],
    ap_uint <512> tin_meta [],
    ap_uint <512> tout_meta [],
    ap_uint <8*4*16> buf_B1 [],
    ap_uint <8*4*16> buf_B2 [],
    ap_uint <8*4*16> buf_B3 [],
    ap_uint <8*4*16> buf_B4 [],
    ap_uint <8*4*16> buf_B5 [],
    ap_uint <8*4*16> buf_B6 [],
    ap_uint <8*4*16> buf_B7 [],
    ap_uint <8*4*16> buf_B8 [],
    ap_uint <8*4*16> buf_D []
    )

GQE partition kernel.

Parameters:

k_depth depth of each hash bucket in URAM
col_index index of input column
bit_num number of partitions, expressed as log2(number of partitions).
buf_A1 .. buf_A8 input table buffers.
buf_B1 .. buf_B8 output table buffers.
buf_D configuration buffer.

Note

GQE has been tested on the Alveo U280 card and makes use of both HBM and DDR. Other cards, such as the U250 and U200, are not supported out of the box, but porting GQE to them and gaining acceleration is certainly possible with some tailoring and tuning.