GQE Kernel APIs

These APIs are implemented as OpenCL kernels:

gqeAggr

#include "xf_database/gqe_kernel_aggr_v2.hpp"
void gqeAggr (
    ap_uint <8*TPCH_INT_SZ*8> buf_in0 [],
    ap_uint <8*TPCH_INT_SZ*8> buf_in1 [],
    ap_uint <8*TPCH_INT_SZ*8> buf_in2 [],
    ap_uint <8*TPCH_INT_SZ*8> buf_in3 [],
    ap_uint <8*TPCH_INT_SZ*8> buf_in4 [],
    ap_uint <8*TPCH_INT_SZ*8> buf_in5 [],
    ap_uint <8*TPCH_INT_SZ*8> buf_in6 [],
    ap_uint <8*TPCH_INT_SZ*8> buf_in7 [],
    ap_uint <512> buf_metain [],
    ap_uint <512> buf_metaout [],
    ap_uint <8*TPCH_INT_SZ*VEC_LEN> buf_out0 [],
    ap_uint <8*TPCH_INT_SZ*VEC_LEN> buf_out1 [],
    ap_uint <8*TPCH_INT_SZ*VEC_LEN> buf_out2 [],
    ap_uint <8*TPCH_INT_SZ*VEC_LEN> buf_out3 [],
    ap_uint <8*TPCH_INT_SZ*VEC_LEN> buf_out4 [],
    ap_uint <8*TPCH_INT_SZ*VEC_LEN> buf_out5 [],
    ap_uint <8*TPCH_INT_SZ*VEC_LEN> buf_out6 [],
    ap_uint <8*TPCH_INT_SZ*VEC_LEN> buf_out7 [],
    ap_uint <8*TPCH_INT_SZ*VEC_LEN> buf_out8 [],
    ap_uint <8*TPCH_INT_SZ*VEC_LEN> buf_out9 [],
    ap_uint <8*TPCH_INT_SZ*VEC_LEN> buf_out10 [],
    ap_uint <8*TPCH_INT_SZ*VEC_LEN> buf_out11 [],
    ap_uint <8*TPCH_INT_SZ*VEC_LEN> buf_out12 [],
    ap_uint <8*TPCH_INT_SZ*VEC_LEN> buf_out13 [],
    ap_uint <8*TPCH_INT_SZ*VEC_LEN> buf_out14 [],
    ap_uint <8*TPCH_INT_SZ*VEC_LEN> buf_out15 [],
    ap_uint <8*TPCH_INT_SZ> buf_cfg [],
    ap_uint <8*TPCH_INT_SZ> buf_result_info [],
    ap_uint <8*TPCH_INT_SZ*VEC_LEN> ping_buf0 [],
    ap_uint <8*TPCH_INT_SZ*VEC_LEN> ping_buf1 [],
    ap_uint <8*TPCH_INT_SZ*VEC_LEN> ping_buf2 [],
    ap_uint <8*TPCH_INT_SZ*VEC_LEN> ping_buf3 [],
    ap_uint <8*TPCH_INT_SZ*VEC_LEN> pong_buf0 [],
    ap_uint <8*TPCH_INT_SZ*VEC_LEN> pong_buf1 [],
    ap_uint <8*TPCH_INT_SZ*VEC_LEN> pong_buf2 [],
    ap_uint <8*TPCH_INT_SZ*VEC_LEN> pong_buf3 []
    )

GQE Aggr Kernel.

For detailed documentation, see GQE Kernel Design.

Parameters:

buf_in0 input table buffer.
buf_in1 input table buffer.
buf_in2 input table buffer.
buf_in3 input table buffer.
buf_in4 input table buffer.
buf_in5 input table buffer.
buf_in6 input table buffer.
buf_in7 input table buffer.
buf_metain input meta info.
buf_metaout output meta info.
buf_out output table buffer.
buf_cfg input configuration buffer.
buf_result_info output information buffer.
ping_buf0 gqeAggr’s temporary buffer for storing overflow.
ping_buf1 gqeAggr’s temporary buffer for storing overflow.
ping_buf2 gqeAggr’s temporary buffer for storing overflow.
ping_buf3 gqeAggr’s temporary buffer for storing overflow.
pong_buf0 gqeAggr’s temporary buffer for storing overflow.
pong_buf1 gqeAggr’s temporary buffer for storing overflow.
pong_buf2 gqeAggr’s temporary buffer for storing overflow.
pong_buf3 gqeAggr’s temporary buffer for storing overflow.

gqeJoin

#include "xf_database/gqe_kernel_join_filter.hpp"
void gqeJoin (
    size_t _build_probe_flag,
    hls::burst_maxi <ap_uint <8*TPCH_INT_SZ*VEC_SCAN>> din_col0,
    hls::burst_maxi <ap_uint <8*TPCH_INT_SZ*VEC_SCAN>> din_col1,
    hls::burst_maxi <ap_uint <8*TPCH_INT_SZ*VEC_SCAN>> din_col2,
    hls::burst_maxi <ap_uint <64>> din_val,
    ap_uint <512> din_krn_cfg [14],
    ap_uint <512> din_meta [24],
    ap_uint <512> dout_meta [24],
    ap_uint <8*TPCH_INT_SZ*VEC_LEN>* dout_col0,
    ap_uint <8*TPCH_INT_SZ*VEC_LEN>* dout_col1,
    ap_uint <8*TPCH_INT_SZ*VEC_LEN>* dout_col2,
    ap_uint <8*TPCH_INT_SZ*VEC_LEN>* dout_col3,
    ap_uint <256>* htb_buf0,
    ap_uint <256>* htb_buf1,
    ap_uint <256>* htb_buf2,
    ap_uint <256>* htb_buf3,
    ap_uint <256>* htb_buf4,
    ap_uint <256>* htb_buf5,
    ap_uint <256>* htb_buf6,
    ap_uint <256>* htb_buf7,
    ap_uint <256>* stb_buf0,
    ap_uint <256>* stb_buf1,
    ap_uint <256>* stb_buf2,
    ap_uint <256>* stb_buf3,
    ap_uint <256>* stb_buf4,
    ap_uint <256>* stb_buf5,
    ap_uint <256>* stb_buf6,
    ap_uint <256>* stb_buf7
    )

GQE Join/Filter kernel (64-bit key version)

Parameters:

_build_probe_flag build/probe flag, 0 for build, 1 for probe
din_col input table columns
din_val validation bits column
din_krn_cfg input kernel configurations
din_meta input meta info
dout_meta output meta info
dout_col output table columns
htb_buf HBM buffers used to save build table key/payload
stb_buf HBM buffers used to save the overflowed build table key/payload in the Join flow, or to save the bloom-filter hash table in the Bloom-filter probe-only flow

gqePart

gqePart overload (1)

#include "xf_database/gqe_kernel_part_v2.hpp"
void gqePart (
    const int k_depth,
    const int col_index,
    const int bit_num,
    ap_uint <8*4*16> buf_A1 [],
    ap_uint <8*4*16> buf_A2 [],
    ap_uint <8*4*16> buf_A3 [],
    ap_uint <8*4*16> buf_A4 [],
    ap_uint <8*4*16> buf_A5 [],
    ap_uint <8*4*16> buf_A6 [],
    ap_uint <8*4*16> buf_A7 [],
    ap_uint <8*4*16> buf_A8 [],
    ap_uint <512> tin_meta [],
    ap_uint <512> tout_meta [],
    ap_uint <8*4*16> buf_B1 [],
    ap_uint <8*4*16> buf_B2 [],
    ap_uint <8*4*16> buf_B3 [],
    ap_uint <8*4*16> buf_B4 [],
    ap_uint <8*4*16> buf_B5 [],
    ap_uint <8*4*16> buf_B6 [],
    ap_uint <8*4*16> buf_B7 [],
    ap_uint <8*4*16> buf_B8 [],
    ap_uint <8*4*16> buf_D []
    )

GQE partition kernel.

Parameters:

k_depth depth of each hash bucket in URAM
col_index index of input column
bit_num log2 of the number of partitions, i.e. the number of partitions is 2^bit_num
tin_meta input meta info
tout_meta output meta info
buf_A input table buffer
buf_B output table buffer
buf_D configuration buffer

gqePart overload (2)

#include "xf_database/gqe_kernel_part_v3.hpp"
void gqePart (
    const int tab_index,
    hls::burst_maxi <ap_uint <8*TPCH_INT_SZ*VEC_SCAN>> din_col0,
    hls::burst_maxi <ap_uint <8*TPCH_INT_SZ*VEC_SCAN>> din_col1,
    hls::burst_maxi <ap_uint <8*TPCH_INT_SZ*VEC_SCAN>> din_col2,
    hls::burst_maxi <ap_uint <64>> din_val,
    ap_uint <512> din_krn_cfg [14],
    ap_uint <512> din_meta [24],
    ap_uint <512> dout_meta [24],
    ap_uint <8*TPCH_INT_SZ*VEC_LEN>* dout_col0,
    ap_uint <8*TPCH_INT_SZ*VEC_LEN>* dout_col1,
    ap_uint <8*TPCH_INT_SZ*VEC_LEN>* dout_col2
    )

GQE partition kernel (64-bit key version)

Parameters:

tab_index table index indicating build table or join table
din_col input table columns
din_val validation bits column
din_krn_cfg input kernel configurations
din_meta input meta info
dout_meta output meta info
dout_col output table columns

Note

GQE has been tested on the Alveo U280 card, and makes use of both HBM and DDR. Other cards such as the U250 and U200 are not supported out of the box, but porting to them and gaining acceleration is certainly possible with some tailoring and tuning.