GQE Kernel APIs¶

These APIs are implemented as OpenCL kernels:

gqeAggr¶

#include "xf_database/gqe_kernel_aggr_v2.hpp"

void gqeAggr (
    ap_uint <8*TPCH_INT_SZ*8> buf_in0 [],
    ap_uint <8*TPCH_INT_SZ*8> buf_in1 [],
    ap_uint <8*TPCH_INT_SZ*8> buf_in2 [],
    ap_uint <8*TPCH_INT_SZ*8> buf_in3 [],
    ap_uint <8*TPCH_INT_SZ*8> buf_in4 [],
    ap_uint <8*TPCH_INT_SZ*8> buf_in5 [],
    ap_uint <8*TPCH_INT_SZ*8> buf_in6 [],
    ap_uint <8*TPCH_INT_SZ*8> buf_in7 [],
    ap_uint <512> buf_metain [],
    ap_uint <512> buf_metaout [],
    ap_uint <8*TPCH_INT_SZ*VEC_LEN> buf_out0 [],
    ap_uint <8*TPCH_INT_SZ*VEC_LEN> buf_out1 [],
    ap_uint <8*TPCH_INT_SZ*VEC_LEN> buf_out2 [],
    ap_uint <8*TPCH_INT_SZ*VEC_LEN> buf_out3 [],
    ap_uint <8*TPCH_INT_SZ*VEC_LEN> buf_out4 [],
    ap_uint <8*TPCH_INT_SZ*VEC_LEN> buf_out5 [],
    ap_uint <8*TPCH_INT_SZ*VEC_LEN> buf_out6 [],
    ap_uint <8*TPCH_INT_SZ*VEC_LEN> buf_out7 [],
    ap_uint <8*TPCH_INT_SZ*VEC_LEN> buf_out8 [],
    ap_uint <8*TPCH_INT_SZ*VEC_LEN> buf_out9 [],
    ap_uint <8*TPCH_INT_SZ*VEC_LEN> buf_out10 [],
    ap_uint <8*TPCH_INT_SZ*VEC_LEN> buf_out11 [],
    ap_uint <8*TPCH_INT_SZ*VEC_LEN> buf_out12 [],
    ap_uint <8*TPCH_INT_SZ*VEC_LEN> buf_out13 [],
    ap_uint <8*TPCH_INT_SZ*VEC_LEN> buf_out14 [],
    ap_uint <8*TPCH_INT_SZ*VEC_LEN> buf_out15 [],
    ap_uint <8*TPCH_INT_SZ> buf_cfg [],
    ap_uint <8*TPCH_INT_SZ> buf_result_info [],
    ap_uint <8*TPCH_INT_SZ*VEC_LEN> ping_buf0 [],
    ap_uint <8*TPCH_INT_SZ*VEC_LEN> ping_buf1 [],
    ap_uint <8*TPCH_INT_SZ*VEC_LEN> ping_buf2 [],
    ap_uint <8*TPCH_INT_SZ*VEC_LEN> ping_buf3 [],
    ap_uint <8*TPCH_INT_SZ*VEC_LEN> pong_buf0 [],
    ap_uint <8*TPCH_INT_SZ*VEC_LEN> pong_buf1 [],
    ap_uint <8*TPCH_INT_SZ*VEC_LEN> pong_buf2 [],
    ap_uint <8*TPCH_INT_SZ*VEC_LEN> pong_buf3 []
    )

GQE Aggr Kernel.

For detailed documentation, see GQE Kernel Design.

Parameters:

buf_in0	input table buffer.
buf_in1	input table buffer.
buf_in2	input table buffer.
buf_in3	input table buffer.
buf_in4	input table buffer.
buf_in5	input table buffer.
buf_in6	input table buffer.
buf_in7	input table buffer.
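buf_metain	input meta info, e.g. the input row number (the kernel takes no separate nrow argument).
buf_metaout	output meta info.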
buf_out	output table buffers (buf_out0 to buf_out15).
buf_cfg	input configuration buffer.
buf_result_info	output information buffer.
ping_buf0	gqeAggr’s temporary buffer for storing overflow.
ping_buf1	gqeAggr’s temporary buffer for storing overflow.
ping_buf2	gqeAggr’s temporary buffer for storing overflow.
ping_buf3	gqeAggr’s temporary buffer for storing overflow.
pong_buf0	gqeAggr’s temporary buffer for storing overflow.
pong_buf1	gqeAggr’s temporary buffer for storing overflow.
pong_buf2	gqeAggr’s temporary buffer for storing overflow.
pong_buf3	gqeAggr’s temporary buffer for storing overflow.

gqeJoin¶

#include "xf_database/gqe_kernel_join_filter.hpp"

void gqeJoin (
    size_t _build_probe_flag,
    hls::burst_maxi <ap_uint <8*TPCH_INT_SZ*VEC_SCAN>> din_col0,
    hls::burst_maxi <ap_uint <8*TPCH_INT_SZ*VEC_SCAN>> din_col1,
    hls::burst_maxi <ap_uint <8*TPCH_INT_SZ*VEC_SCAN>> din_col2,
    hls::burst_maxi <ap_uint <64>> din_val,
    ap_uint <512> din_krn_cfg [14],
    ap_uint <512> din_meta [24],
    ap_uint <512> dout_meta [24],
    ap_uint <8*TPCH_INT_SZ*VEC_LEN>* dout_col0,
    ap_uint <8*TPCH_INT_SZ*VEC_LEN>* dout_col1,
    ap_uint <8*TPCH_INT_SZ*VEC_LEN>* dout_col2,
    ap_uint <8*TPCH_INT_SZ*VEC_LEN>* dout_col3,
    ap_uint <256>* htb_buf0,
    ap_uint <256>* htb_buf1,
    ap_uint <256>* htb_buf2,
    ap_uint <256>* htb_buf3,
    ap_uint <256>* htb_buf4,
    ap_uint <256>* htb_buf5,
    ap_uint <256>* htb_buf6,
    ap_uint <256>* htb_buf7,
    ap_uint <256>* stb_buf0,
    ap_uint <256>* stb_buf1,
    ap_uint <256>* stb_buf2,
    ap_uint <256>* stb_buf3,
    ap_uint <256>* stb_buf4,
    ap_uint <256>* stb_buf5,
    ap_uint <256>* stb_buf6,
    ap_uint <256>* stb_buf7
    )

GQE Join/Filter kernel (64-bit key version)

Parameters:

_build_probe_flag	build/probe flag, 0 for build, 1 for probe
din_col	input table columns
din_val	validation bits column
din_krn_cfg	input kernel configurations
din_meta	input meta info
dout_meta	output meta info
dout_col	output table columns
htb_buf	HBM buffers used to save build table key/payload
stb_buf	HBM buffers used to save the overflowed build table key/payload in the Join flow, or the bloom-filter hash table in the Bloom-filter probe-only flow

gqePart¶

gqePart overload (1)¶

#include "xf_database/gqe_kernel_part_v2.hpp"

void gqePart (
    const int k_depth,
    const int col_index,
    const int bit_num,
    ap_uint <8*4*16> buf_A1 [],
    ap_uint <8*4*16> buf_A2 [],
    ap_uint <8*4*16> buf_A3 [],
    ap_uint <8*4*16> buf_A4 [],
    ap_uint <8*4*16> buf_A5 [],
    ap_uint <8*4*16> buf_A6 [],
    ap_uint <8*4*16> buf_A7 [],
    ap_uint <8*4*16> buf_A8 [],
    ap_uint <512> tin_meta [],
    ap_uint <512> tout_meta [],
    ap_uint <8*4*16> buf_B1 [],
    ap_uint <8*4*16> buf_B2 [],
    ap_uint <8*4*16> buf_B3 [],
    ap_uint <8*4*16> buf_B4 [],
    ap_uint <8*4*16> buf_B5 [],
    ap_uint <8*4*16> buf_B6 [],
    ap_uint <8*4*16> buf_B7 [],
    ap_uint <8*4*16> buf_B8 [],
    ap_uint <8*4*16> buf_D []
    )

GQE partition kernel.

Parameters:

k_depth	depth of each hash bucket in URAM
col_index	index of input column
bit_num	number of bits that define the partitions, i.e. log2(number of partitions)
tin_meta	input meta info
tout_meta	output meta info
buf_A	input table buffer
buf_B	output table buffer
buf_D	configuration buffer

gqePart overload (2)¶

#include "xf_database/gqe_kernel_part_v3.hpp"

void gqePart (
    const int tab_index,
    hls::burst_maxi <ap_uint <8*TPCH_INT_SZ*VEC_SCAN>> din_col0,
    hls::burst_maxi <ap_uint <8*TPCH_INT_SZ*VEC_SCAN>> din_col1,
    hls::burst_maxi <ap_uint <8*TPCH_INT_SZ*VEC_SCAN>> din_col2,
    hls::burst_maxi <ap_uint <64>> din_val,
    ap_uint <512> din_krn_cfg [14],
    ap_uint <512> din_meta [24],
    ap_uint <512> dout_meta [24],
    ap_uint <8*TPCH_INT_SZ*VEC_LEN>* dout_col0,
    ap_uint <8*TPCH_INT_SZ*VEC_LEN>* dout_col1,
    ap_uint <8*TPCH_INT_SZ*VEC_LEN>* dout_col2
    )

GQE partition kernel (64-bit key version)

Parameters:

tab_index	table index indicating build table or join table
din_col	input table columns
din_val	validation bits column
din_krn_cfg	input kernel configurations
din_meta	input meta info
dout_meta	output meta info
dout_col	output table columns

Note

GQE has been tested on the Alveo U280 card, and makes use of both HBM and DDR. While other cards such as the U250 and U200 are not supported out of the box, porting to them and gaining acceleration is certainly possible with some tailoring and tuning.