GQE Kernel APIs

These APIs are implemented as OpenCL kernels:

gqeKernel

#include "xf_database/gqe_kernel_3_in_1.hpp"
void gqeKernel (
    hls::burst_maxi <ap_uint <8*TPCH_INT_SZ*VEC_SCAN>> din_col0,
    hls::burst_maxi <ap_uint <8*TPCH_INT_SZ*VEC_SCAN>> din_col1,
    hls::burst_maxi <ap_uint <8*TPCH_INT_SZ*VEC_SCAN>> din_col2,
    hls::burst_maxi <ap_uint <64>> din_val,
    ap_uint <64>* din_krn_cfg,
    ap_uint <64>* din_meta,
    ap_uint <256>* dout_meta,
    hls::burst_maxi <ap_uint <8*TPCH_INT_SZ*VEC_LEN>> dout_col0,
    hls::burst_maxi <ap_uint <8*TPCH_INT_SZ*VEC_LEN>> dout_col1,
    hls::burst_maxi <ap_uint <8*TPCH_INT_SZ*VEC_LEN>> dout_col2,
    hls::burst_maxi <ap_uint <8*TPCH_INT_SZ*VEC_LEN>> dout_col3,
    hls::burst_maxi <ap_uint <256>> htb_buf0,
    hls::burst_maxi <ap_uint <256>> htb_buf1,
    hls::burst_maxi <ap_uint <256>> htb_buf2,
    hls::burst_maxi <ap_uint <256>> htb_buf3,
    hls::burst_maxi <ap_uint <256>> htb_buf4,
    hls::burst_maxi <ap_uint <256>> htb_buf5,
    hls::burst_maxi <ap_uint <256>> htb_buf6,
    hls::burst_maxi <ap_uint <256>> htb_buf7,
    hls::burst_maxi <ap_uint <256>> stb_buf0,
    hls::burst_maxi <ap_uint <256>> stb_buf1,
    hls::burst_maxi <ap_uint <256>> stb_buf2,
    hls::burst_maxi <ap_uint <256>> stb_buf3,
    hls::burst_maxi <ap_uint <256>> stb_buf4,
    hls::burst_maxi <ap_uint <256>> stb_buf5,
    hls::burst_maxi <ap_uint <256>> stb_buf6,
    hls::burst_maxi <ap_uint <256>> stb_buf7
    )

3-in-1 GQE kernel (64-bit key version)

Parameters:

din_col input table columns
din_val validation bits column
din_krn_cfg input kernel configurations
din_meta input meta info
dout_meta output meta info
dout_col output table columns
htb_buf HBM buffers used to store the build table key/payload for the JOIN flow, and the lower part of the hash table for the BF flow
stb_buf HBM buffers used to store the overflowed build table key/payload for the JOIN flow, and the upper part of the hash table for the BF flow

gqeAggr

#include "xf_database/gqe_kernel_aggr_v2.hpp"
void gqeAggr (
    ap_uint <8*TPCH_INT_SZ*8> buf_in0 [],
    ap_uint <8*TPCH_INT_SZ*8> buf_in1 [],
    ap_uint <8*TPCH_INT_SZ*8> buf_in2 [],
    ap_uint <8*TPCH_INT_SZ*8> buf_in3 [],
    ap_uint <8*TPCH_INT_SZ*8> buf_in4 [],
    ap_uint <8*TPCH_INT_SZ*8> buf_in5 [],
    ap_uint <8*TPCH_INT_SZ*8> buf_in6 [],
    ap_uint <8*TPCH_INT_SZ*8> buf_in7 [],
    ap_uint <512> buf_metain [],
    ap_uint <512> buf_metaout [],
    ap_uint <8*TPCH_INT_SZ*VEC_LEN> buf_out0 [],
    ap_uint <8*TPCH_INT_SZ*VEC_LEN> buf_out1 [],
    ap_uint <8*TPCH_INT_SZ*VEC_LEN> buf_out2 [],
    ap_uint <8*TPCH_INT_SZ*VEC_LEN> buf_out3 [],
    ap_uint <8*TPCH_INT_SZ*VEC_LEN> buf_out4 [],
    ap_uint <8*TPCH_INT_SZ*VEC_LEN> buf_out5 [],
    ap_uint <8*TPCH_INT_SZ*VEC_LEN> buf_out6 [],
    ap_uint <8*TPCH_INT_SZ*VEC_LEN> buf_out7 [],
    ap_uint <8*TPCH_INT_SZ*VEC_LEN> buf_out8 [],
    ap_uint <8*TPCH_INT_SZ*VEC_LEN> buf_out9 [],
    ap_uint <8*TPCH_INT_SZ*VEC_LEN> buf_out10 [],
    ap_uint <8*TPCH_INT_SZ*VEC_LEN> buf_out11 [],
    ap_uint <8*TPCH_INT_SZ*VEC_LEN> buf_out12 [],
    ap_uint <8*TPCH_INT_SZ*VEC_LEN> buf_out13 [],
    ap_uint <8*TPCH_INT_SZ*VEC_LEN> buf_out14 [],
    ap_uint <8*TPCH_INT_SZ*VEC_LEN> buf_out15 [],
    ap_uint <8*TPCH_INT_SZ> buf_cfg [],
    ap_uint <8*TPCH_INT_SZ> buf_result_info [],
    ap_uint <8*TPCH_INT_SZ*VEC_LEN> ping_buf0 [],
    ap_uint <8*TPCH_INT_SZ*VEC_LEN> ping_buf1 [],
    ap_uint <8*TPCH_INT_SZ*VEC_LEN> ping_buf2 [],
    ap_uint <8*TPCH_INT_SZ*VEC_LEN> ping_buf3 [],
    ap_uint <8*TPCH_INT_SZ*VEC_LEN> pong_buf0 [],
    ap_uint <8*TPCH_INT_SZ*VEC_LEN> pong_buf1 [],
    ap_uint <8*TPCH_INT_SZ*VEC_LEN> pong_buf2 [],
    ap_uint <8*TPCH_INT_SZ*VEC_LEN> pong_buf3 []
    )

GQE Aggr Kernel.

For detailed documentation, see GQE Kernel Design.

Parameters:

buf_in0 input table buffer.
buf_in1 input table buffer.
buf_in2 input table buffer.
buf_in3 input table buffer.
buf_in4 input table buffer.
buf_in5 input table buffer.
buf_in6 input table buffer.
buf_in7 input table buffer.
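nrow input row number. (Note: the signature above has no nrow parameter; it lists buf_metain and buf_metaout, which are not described in this list.)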
buf_out output table buffer.
buf_cfg input configuration buffer.
buf_result_info output information buffer.
ping_buf0 gqeAggr’s temporary buffer for storing overflow.
ping_buf1 gqeAggr’s temporary buffer for storing overflow.
ping_buf2 gqeAggr’s temporary buffer for storing overflow.
ping_buf3 gqeAggr’s temporary buffer for storing overflow.
pong_buf0 gqeAggr’s temporary buffer for storing overflow.
pong_buf1 gqeAggr’s temporary buffer for storing overflow.
pong_buf2 gqeAggr’s temporary buffer for storing overflow.
pong_buf3 gqeAggr’s temporary buffer for storing overflow.

gqePart

#include "xf_database/gqe_kernel_part_v2.hpp"
void gqePart (
    const int k_depth,
    const int col_index,
    const int bit_num,
    ap_uint <8*4*16> buf_A1 [],
    ap_uint <8*4*16> buf_A2 [],
    ap_uint <8*4*16> buf_A3 [],
    ap_uint <8*4*16> buf_A4 [],
    ap_uint <8*4*16> buf_A5 [],
    ap_uint <8*4*16> buf_A6 [],
    ap_uint <8*4*16> buf_A7 [],
    ap_uint <8*4*16> buf_A8 [],
    ap_uint <512> tin_meta [],
    ap_uint <512> tout_meta [],
    ap_uint <8*4*16> buf_B1 [],
    ap_uint <8*4*16> buf_B2 [],
    ap_uint <8*4*16> buf_B3 [],
    ap_uint <8*4*16> buf_B4 [],
    ap_uint <8*4*16> buf_B5 [],
    ap_uint <8*4*16> buf_B6 [],
    ap_uint <8*4*16> buf_B7 [],
    ap_uint <8*4*16> buf_B8 [],
    ap_uint <8*4*16> buf_D []
    )

GQE partition kernel.

Parameters:

k_depth depth of each hash bucket in URAM
col_index index of input column
bit_num defines the number of partitions, given as log2(number of partitions)
tin_meta input meta info
tout_meta output meta info
buf_A input table buffer
buf_B output table buffer
buf_D configuration buffer

Note

The 3-in-1 GQE kernel has been tested on the Alveo U50 card and uses only HBM. gqeAggr is the only kernel that still targets the Alveo U280 card, where it uses both HBM and DDR. Other cards such as the U200 and U250 are not supported out of the box, but porting to them and gaining acceleration is certainly possible with some tailoring and tuning.