GQE Kernel APIs¶
These APIs are implemented as OpenCL kernels:
gqeAggr¶
#include "xf_database/gqe_kernel_aggr_v2.hpp"
void gqeAggr ( ap_uint <8*TPCH_INT_SZ*8> buf_in0 [], ap_uint <8*TPCH_INT_SZ*8> buf_in1 [], ap_uint <8*TPCH_INT_SZ*8> buf_in2 [], ap_uint <8*TPCH_INT_SZ*8> buf_in3 [], ap_uint <8*TPCH_INT_SZ*8> buf_in4 [], ap_uint <8*TPCH_INT_SZ*8> buf_in5 [], ap_uint <8*TPCH_INT_SZ*8> buf_in6 [], ap_uint <8*TPCH_INT_SZ*8> buf_in7 [], ap_uint <512> buf_metain [], ap_uint <512> buf_metaout [], ap_uint <8*TPCH_INT_SZ*VEC_LEN> buf_out0 [], ap_uint <8*TPCH_INT_SZ*VEC_LEN> buf_out1 [], ap_uint <8*TPCH_INT_SZ*VEC_LEN> buf_out2 [], ap_uint <8*TPCH_INT_SZ*VEC_LEN> buf_out3 [], ap_uint <8*TPCH_INT_SZ*VEC_LEN> buf_out4 [], ap_uint <8*TPCH_INT_SZ*VEC_LEN> buf_out5 [], ap_uint <8*TPCH_INT_SZ*VEC_LEN> buf_out6 [], ap_uint <8*TPCH_INT_SZ*VEC_LEN> buf_out7 [], ap_uint <8*TPCH_INT_SZ*VEC_LEN> buf_out8 [], ap_uint <8*TPCH_INT_SZ*VEC_LEN> buf_out9 [], ap_uint <8*TPCH_INT_SZ*VEC_LEN> buf_out10 [], ap_uint <8*TPCH_INT_SZ*VEC_LEN> buf_out11 [], ap_uint <8*TPCH_INT_SZ*VEC_LEN> buf_out12 [], ap_uint <8*TPCH_INT_SZ*VEC_LEN> buf_out13 [], ap_uint <8*TPCH_INT_SZ*VEC_LEN> buf_out14 [], ap_uint <8*TPCH_INT_SZ*VEC_LEN> buf_out15 [], ap_uint <8*TPCH_INT_SZ> buf_cfg [], ap_uint <8*TPCH_INT_SZ> buf_result_info [], ap_uint <8*TPCH_INT_SZ*VEC_LEN> ping_buf0 [], ap_uint <8*TPCH_INT_SZ*VEC_LEN> ping_buf1 [], ap_uint <8*TPCH_INT_SZ*VEC_LEN> ping_buf2 [], ap_uint <8*TPCH_INT_SZ*VEC_LEN> ping_buf3 [], ap_uint <8*TPCH_INT_SZ*VEC_LEN> pong_buf0 [], ap_uint <8*TPCH_INT_SZ*VEC_LEN> pong_buf1 [], ap_uint <8*TPCH_INT_SZ*VEC_LEN> pong_buf2 [], ap_uint <8*TPCH_INT_SZ*VEC_LEN> pong_buf3 [] )
GQE Aggr Kernel.
For detailed documentation, see GQE Kernel Design.
Parameters:
buf_in0 | input table buffer. |
buf_in1 | input table buffer. |
buf_in2 | input table buffer. |
buf_in3 | input table buffer. |
buf_in4 | input table buffer. |
buf_in5 | input table buffer. |
buf_in6 | input table buffer. |
buf_in7 | input table buffer. |
nrow | input row number (stale entry: the gqeAggr signature above has no nrow parameter). |
buf_out0 … buf_out15 | output table buffers. |
buf_cfg | input configuration buffer. |
buf_result_info | output information buffer. |
ping_buf0 | gqeAggr’s temporary buffer for storing overflow. |
ping_buf1 | gqeAggr’s temporary buffer for storing overflow. |
ping_buf2 | gqeAggr’s temporary buffer for storing overflow. |
ping_buf3 | gqeAggr’s temporary buffer for storing overflow. |
pong_buf0 | gqeAggr’s temporary buffer for storing overflow. |
pong_buf1 | gqeAggr’s temporary buffer for storing overflow. |
pong_buf2 | gqeAggr’s temporary buffer for storing overflow. |
pong_buf3 | gqeAggr’s temporary buffer for storing overflow. |
gqeJoin¶
#include "xf_database/gqe_kernel_join_v2.hpp"
void gqeJoin ( ap_uint <8*TPCH_INT_SZ*8> buf_A1 [8000], ap_uint <8*TPCH_INT_SZ*8> buf_A2 [8000], ap_uint <8*TPCH_INT_SZ*8> buf_A3 [8000], ap_uint <8*TPCH_INT_SZ*8> buf_A4 [8000], ap_uint <8*TPCH_INT_SZ*8> buf_A5 [8000], ap_uint <8*TPCH_INT_SZ*8> buf_A6 [8000], ap_uint <8*TPCH_INT_SZ*8> buf_A7 [8000], ap_uint <8*TPCH_INT_SZ*8> buf_A8 [8000], size_t _build_probe_flag, ap_uint <512> tin_meta [24], ap_uint <512> tout_meta [24], ap_uint <8*TPCH_INT_SZ*VEC_LEN> buf_C1 [8000], ap_uint <8*TPCH_INT_SZ*VEC_LEN> buf_C2 [8000], ap_uint <8*TPCH_INT_SZ*VEC_LEN> buf_C3 [8000], ap_uint <8*TPCH_INT_SZ*VEC_LEN> buf_C4 [8000], ap_uint <8*TPCH_INT_SZ*VEC_LEN> buf_C5 [8000], ap_uint <8*TPCH_INT_SZ*VEC_LEN> buf_C6 [8000], ap_uint <8*TPCH_INT_SZ*VEC_LEN> buf_C7 [8000], ap_uint <8*TPCH_INT_SZ*VEC_LEN> buf_C8 [8000], ap_uint <8*TPCH_INT_SZ*VEC_LEN> buf_D [8000], ap_uint <8*TPCH_INT_SZ*8> htb_buf0 [8000], ap_uint <8*TPCH_INT_SZ*8> htb_buf1 [8000], ap_uint <8*TPCH_INT_SZ*8> htb_buf2 [8000], ap_uint <8*TPCH_INT_SZ*8> htb_buf3 [8000], ap_uint <8*TPCH_INT_SZ*8> htb_buf4 [8000], ap_uint <8*TPCH_INT_SZ*8> htb_buf5 [8000], ap_uint <8*TPCH_INT_SZ*8> htb_buf6 [8000], ap_uint <8*TPCH_INT_SZ*8> htb_buf7 [8000], ap_uint <8*TPCH_INT_SZ*8> stb_buf0 [8000], ap_uint <8*TPCH_INT_SZ*8> stb_buf1 [8000], ap_uint <8*TPCH_INT_SZ*8> stb_buf2 [8000], ap_uint <8*TPCH_INT_SZ*8> stb_buf3 [8000], ap_uint <8*TPCH_INT_SZ*8> stb_buf4 [8000], ap_uint <8*TPCH_INT_SZ*8> stb_buf5 [8000], ap_uint <8*TPCH_INT_SZ*8> stb_buf6 [8000], ap_uint <8*TPCH_INT_SZ*8> stb_buf7 [8000] )
GQE join kernel.
Parameters:
_build_probe_flag | kernel mode flag. |
buf_A | input table buffer |
buf_C | output table buffer |
htb_buf | hash table. |
stb_buf | overflow region of hash table. |
gqePart¶
#include "xf_database/gqe_kernel_part_v2.hpp"
void gqePart ( const int k_depth, const int col_index, const int bit_num, ap_uint <8*4*16> buf_A1 [], ap_uint <8*4*16> buf_A2 [], ap_uint <8*4*16> buf_A3 [], ap_uint <8*4*16> buf_A4 [], ap_uint <8*4*16> buf_A5 [], ap_uint <8*4*16> buf_A6 [], ap_uint <8*4*16> buf_A7 [], ap_uint <8*4*16> buf_A8 [], ap_uint <512> tin_meta [], ap_uint <512> tout_meta [], ap_uint <8*4*16> buf_B1 [], ap_uint <8*4*16> buf_B2 [], ap_uint <8*4*16> buf_B3 [], ap_uint <8*4*16> buf_B4 [], ap_uint <8*4*16> buf_B5 [], ap_uint <8*4*16> buf_B6 [], ap_uint <8*4*16> buf_B7 [], ap_uint <8*4*16> buf_B8 [], ap_uint <8*4*16> buf_D [] )
GQE partition kernel.
Parameters:
k_depth | depth of each hash bucket in URAM |
col_index | index of input column |
bit_num | number of bits that define the partitions, i.e. log2(number of partitions) |
buf_A | input table buffer |
buf_B | output table buffer |
buf_D | configuration buffer |
Note
GQE has been tested on the Alveo U280 card and makes use of both HBM and DDR. While other cards such as the U250 and U200 are not supported out of the box, porting to them and gaining acceleration are surely possible with some tailoring and tuning.