GQE Kernel APIs

These APIs are implemented as OpenCL kernels:

gqeKernel

#include "xf_database/gqe_kernel_3_in_1.hpp"

void gqeKernel (
    hls::burst_maxi <ap_uint <8*TPCH_INT_SZ*VEC_SCAN>> din_col0,
    hls::burst_maxi <ap_uint <8*TPCH_INT_SZ*VEC_SCAN>> din_col1,
    hls::burst_maxi <ap_uint <8*TPCH_INT_SZ*VEC_SCAN>> din_col2,
    hls::burst_maxi <ap_uint <64>> din_val,
    ap_uint <64>* din_krn_cfg,
    ap_uint <64>* din_meta,
    ap_uint <256>* dout_meta,
    hls::burst_maxi <ap_uint <8*TPCH_INT_SZ*VEC_LEN>> dout_col0,
    hls::burst_maxi <ap_uint <8*TPCH_INT_SZ*VEC_LEN>> dout_col1,
    hls::burst_maxi <ap_uint <8*TPCH_INT_SZ*VEC_LEN>> dout_col2,
    hls::burst_maxi <ap_uint <8*TPCH_INT_SZ*VEC_LEN>> dout_col3,
    hls::burst_maxi <ap_uint <256>> htb_buf0,
    hls::burst_maxi <ap_uint <256>> htb_buf1,
    hls::burst_maxi <ap_uint <256>> htb_buf2,
    hls::burst_maxi <ap_uint <256>> htb_buf3,
    hls::burst_maxi <ap_uint <256>> htb_buf4,
    hls::burst_maxi <ap_uint <256>> htb_buf5,
    hls::burst_maxi <ap_uint <256>> htb_buf6,
    hls::burst_maxi <ap_uint <256>> htb_buf7,
    hls::burst_maxi <ap_uint <256>> stb_buf0,
    hls::burst_maxi <ap_uint <256>> stb_buf1,
    hls::burst_maxi <ap_uint <256>> stb_buf2,
    hls::burst_maxi <ap_uint <256>> stb_buf3,
    hls::burst_maxi <ap_uint <256>> stb_buf4,
    hls::burst_maxi <ap_uint <256>> stb_buf5,
    hls::burst_maxi <ap_uint <256>> stb_buf6,
    hls::burst_maxi <ap_uint <256>> stb_buf7
    )

3-in-1 GQE kernel (64-bit key version)

Parameters:

din_col	input table columns
din_val	validation bits column
din_krn_cfg	input kernel configurations
din_meta	input meta info
dout_meta	output meta info
dout_col	output table columns
htb_buf	HBM buffers used to save the build table key/payload for the JOIN flow, and the lower space of the hash table for the BF flow
stb_buf	HBM buffers used to save the overflowed build table key/payload for the JOIN flow, and the higher space of the hash table for the BF flow

gqeAggr

#include "xf_database/gqe_kernel_aggr_v2.hpp"

void gqeAggr (
    ap_uint <8*TPCH_INT_SZ*8> buf_in0 [],
    ap_uint <8*TPCH_INT_SZ*8> buf_in1 [],
    ap_uint <8*TPCH_INT_SZ*8> buf_in2 [],
    ap_uint <8*TPCH_INT_SZ*8> buf_in3 [],
    ap_uint <8*TPCH_INT_SZ*8> buf_in4 [],
    ap_uint <8*TPCH_INT_SZ*8> buf_in5 [],
    ap_uint <8*TPCH_INT_SZ*8> buf_in6 [],
    ap_uint <8*TPCH_INT_SZ*8> buf_in7 [],
    ap_uint <512> buf_metain [],
    ap_uint <512> buf_metaout [],
    ap_uint <8*TPCH_INT_SZ*VEC_LEN> buf_out0 [],
    ap_uint <8*TPCH_INT_SZ*VEC_LEN> buf_out1 [],
    ap_uint <8*TPCH_INT_SZ*VEC_LEN> buf_out2 [],
    ap_uint <8*TPCH_INT_SZ*VEC_LEN> buf_out3 [],
    ap_uint <8*TPCH_INT_SZ*VEC_LEN> buf_out4 [],
    ap_uint <8*TPCH_INT_SZ*VEC_LEN> buf_out5 [],
    ap_uint <8*TPCH_INT_SZ*VEC_LEN> buf_out6 [],
    ap_uint <8*TPCH_INT_SZ*VEC_LEN> buf_out7 [],
    ap_uint <8*TPCH_INT_SZ*VEC_LEN> buf_out8 [],
    ap_uint <8*TPCH_INT_SZ*VEC_LEN> buf_out9 [],
    ap_uint <8*TPCH_INT_SZ*VEC_LEN> buf_out10 [],
    ap_uint <8*TPCH_INT_SZ*VEC_LEN> buf_out11 [],
    ap_uint <8*TPCH_INT_SZ*VEC_LEN> buf_out12 [],
    ap_uint <8*TPCH_INT_SZ*VEC_LEN> buf_out13 [],
    ap_uint <8*TPCH_INT_SZ*VEC_LEN> buf_out14 [],
    ap_uint <8*TPCH_INT_SZ*VEC_LEN> buf_out15 [],
    ap_uint <8*TPCH_INT_SZ> buf_cfg [],
    ap_uint <8*TPCH_INT_SZ> buf_result_info [],
    ap_uint <8*TPCH_INT_SZ*VEC_LEN> ping_buf0 [],
    ap_uint <8*TPCH_INT_SZ*VEC_LEN> ping_buf1 [],
    ap_uint <8*TPCH_INT_SZ*VEC_LEN> ping_buf2 [],
    ap_uint <8*TPCH_INT_SZ*VEC_LEN> ping_buf3 [],
    ap_uint <8*TPCH_INT_SZ*VEC_LEN> pong_buf0 [],
    ap_uint <8*TPCH_INT_SZ*VEC_LEN> pong_buf1 [],
    ap_uint <8*TPCH_INT_SZ*VEC_LEN> pong_buf2 [],
    ap_uint <8*TPCH_INT_SZ*VEC_LEN> pong_buf3 []
    )

GQE Aggr Kernel.

For detailed documentation, see GQE Kernel Design.

Parameters:

buf_in0	input table buffer.
buf_in1	input table buffer.
buf_in2	input table buffer.
buf_in3	input table buffer.
buf_in4	input table buffer.
buf_in5	input table buffer.
buf_in6	input table buffer.
buf_in7	input table buffer.
nrow	input row number.
buf_metain	input meta info buffer.
buf_metaout	output meta info buffer.
buf_out	output table buffer.
buf_cfg	input configuration buffer.
buf_result_info	output information buffer.
ping_buf0	gqeAggr’s temporary buffer for storing overflow.
ping_buf1	gqeAggr’s temporary buffer for storing overflow.
ping_buf2	gqeAggr’s temporary buffer for storing overflow.
ping_buf3	gqeAggr’s temporary buffer for storing overflow.
pong_buf0	gqeAggr’s temporary buffer for storing overflow.
pong_buf1	gqeAggr’s temporary buffer for storing overflow.
pong_buf2	gqeAggr’s temporary buffer for storing overflow.
pong_buf3	gqeAggr’s temporary buffer for storing overflow.

gqePart

#include "xf_database/gqe_kernel_part_v2.hpp"

void gqePart (
    const int k_depth,
    const int col_index,
    const int bit_num,
    ap_uint <8*4*16> buf_A1 [],
    ap_uint <8*4*16> buf_A2 [],
    ap_uint <8*4*16> buf_A3 [],
    ap_uint <8*4*16> buf_A4 [],
    ap_uint <8*4*16> buf_A5 [],
    ap_uint <8*4*16> buf_A6 [],
    ap_uint <8*4*16> buf_A7 [],
    ap_uint <8*4*16> buf_A8 [],
    ap_uint <512> tin_meta [],
    ap_uint <512> tout_meta [],
    ap_uint <8*4*16> buf_B1 [],
    ap_uint <8*4*16> buf_B2 [],
    ap_uint <8*4*16> buf_B3 [],
    ap_uint <8*4*16> buf_B4 [],
    ap_uint <8*4*16> buf_B5 [],
    ap_uint <8*4*16> buf_B6 [],
    ap_uint <8*4*16> buf_B7 [],
    ap_uint <8*4*16> buf_B8 [],
    ap_uint <8*4*16> buf_D []
    )

GQE partition kernel.

Parameters:

k_depth	depth of each hash bucket in URAM
col_index	index of input column
bit_num	number of bits used to define the partitions, i.e. log2(number of partitions)
tin_meta	input meta info
tout_meta	output meta info
buf_A	input table buffer
buf_B	output table buffer
buf_D	configuration buffer

Note

The 3-in-1 GQE kernel has been tested on the Alveo U50 card and uses only HBM. Only gqeAggr still targets the Alveo U280 card, where it uses both HBM and DDR. Other cards, such as the U200 and U250, are not supported out of the box, but porting to them and gaining acceleration is certainly possible with some tailoring and tuning.