Double Precision SPMV Kernel APIs

Note

The double precision SPMV implementation in the current release uses 16 HBM channels to store NNZ values and indices, 1 HBM channel to store the input dense vector X, 2 HBM channels to store partition parameters, and 1 HBM channel to store the result vector Y.

assembleYkernel

#include "fp64/assembleYkernel.hpp"
void assembleYkernel (
    ParamStrTyp& p_paramStr,
    DatStrTyp& p_datStr,
    DatStrTyp& p_yStr
    )

assembleYkernel is used to assemble the accumulated results into Y

Parameters:

p_paramStr input axis stream of row block parameters
p_datStr input axis stream of accumulated results
p_yStr output axis stream of Y

loadNnzKernel

#include "fp64/loadNnzKernel.hpp"
void loadNnzKernel (
    HBM_InfTyp* p_nnzPtr0,
    HBM_InfTyp* p_nnzPtr1,
    HBM_InfTyp* p_nnzPtr2,
    HBM_InfTyp* p_nnzPtr3,
    HBM_InfTyp* p_nnzPtr4,
    HBM_InfTyp* p_nnzPtr5,
    HBM_InfTyp* p_nnzPtr6,
    HBM_InfTyp* p_nnzPtr7,
    HBM_InfTyp* p_nnzPtr8,
    HBM_InfTyp* p_nnzPtr9,
    HBM_InfTyp* p_nnzPtr10,
    HBM_InfTyp* p_nnzPtr11,
    HBM_InfTyp* p_nnzPtr12,
    HBM_InfTyp* p_nnzPtr13,
    HBM_InfTyp* p_nnzPtr14,
    HBM_InfTyp* p_nnzPtr15,
    HBM_StrTyp& p_nnzStr0,
    HBM_StrTyp& p_nnzStr1,
    HBM_StrTyp& p_nnzStr2,
    HBM_StrTyp& p_nnzStr3,
    HBM_StrTyp& p_nnzStr4,
    HBM_StrTyp& p_nnzStr5,
    HBM_StrTyp& p_nnzStr6,
    HBM_StrTyp& p_nnzStr7,
    HBM_StrTyp& p_nnzStr8,
    HBM_StrTyp& p_nnzStr9,
    HBM_StrTyp& p_nnzStr10,
    HBM_StrTyp& p_nnzStr11,
    HBM_StrTyp& p_nnzStr12,
    HBM_StrTyp& p_nnzStr13,
    HBM_StrTyp& p_nnzStr14,
    HBM_StrTyp& p_nnzStr15
    )

loadNnzKernel is used to read the values of NNZs out of the device memory

Parameters:

p_nnzPtr0..p_nnzPtr15 device memory pointers for reading the values of NNZs
p_nnzStr0..p_nnzStr15 output axis streams of NNZ values

loadParXkernel

#include "fp64/loadParXkernel.hpp"
void loadParXkernel (
    HBM_InfTyp* p_parParamPtr,
    HBM_InfTyp* p_xPtr,
    WideParamStrTyp& p_paramStr,
    HBM_StrTyp& p_outXstr
    )

loadParXkernel is used to read the input vector X and partition parameters out of device memory

Parameters:

p_parParamPtr device memory pointer for reading the partition parameters
p_xPtr device memory pointer for reading vector X
p_paramStr output axis stream of partition parameters
p_outXstr output axis stream of X entries

loadRbParamKernel

#include "fp64/loadRbParamKernel.hpp"
void loadRbParamKernel (
    HBM_InfTyp* p_rbParamPtr,
    WideParamStrTyp& p_chRbParamStr,
    ParamStrTyp& p_rbParamStr
    )

loadRbParamKernel is used to read the row block parameters out of the device memory

Parameters:

p_rbParamPtr device memory pointer for reading the row block parameters
p_chRbParamStr output axis stream of channel row block parameters
p_rbParamStr output axis stream of row block parameters

moveXkernel

#include "fp64/moveXkernel.hpp"
void moveXkernel (
    HBM_StrTyp& p_inStr,
    HBM_StrTyp& p_outStr0,
    HBM_StrTyp& p_outStr1,
    HBM_StrTyp& p_outStr2,
    HBM_StrTyp& p_outStr3,
    HBM_StrTyp& p_outStr4,
    HBM_StrTyp& p_outStr5,
    HBM_StrTyp& p_outStr6,
    HBM_StrTyp& p_outStr7,
    HBM_StrTyp& p_outStr8,
    HBM_StrTyp& p_outStr9,
    HBM_StrTyp& p_outStr10,
    HBM_StrTyp& p_outStr11,
    HBM_StrTyp& p_outStr12,
    HBM_StrTyp& p_outStr13,
    HBM_StrTyp& p_outStr14,
    HBM_StrTyp& p_outStr15
    )

moveXkernel is used to dispatch X entries to multiple computation paths

Parameters:

p_inStr input axis stream of X entries
p_outStr0..p_outStr15 output axis streams of X entries

rowAccKernel

#include "fp64/rowAccKernel.hpp"
void rowAccKernel (
    WideParamStrTyp& p_paramStr,
    DatStrTyp& p_inDatStr0,
    DatStrTyp& p_inDatStr1,
    DatStrTyp& p_inDatStr2,
    DatStrTyp& p_inDatStr3,
    DatStrTyp& p_inDatStr4,
    DatStrTyp& p_inDatStr5,
    DatStrTyp& p_inDatStr6,
    DatStrTyp& p_inDatStr7,
    DatStrTyp& p_inDatStr8,
    DatStrTyp& p_inDatStr9,
    DatStrTyp& p_inDatStr10,
    DatStrTyp& p_inDatStr11,
    DatStrTyp& p_inDatStr12,
    DatStrTyp& p_inDatStr13,
    DatStrTyp& p_inDatStr14,
    DatStrTyp& p_inDatStr15,
    IdxStrTyp& p_idxStr0,
    IdxStrTyp& p_idxStr1,
    IdxStrTyp& p_idxStr2,
    IdxStrTyp& p_idxStr3,
    IdxStrTyp& p_idxStr4,
    IdxStrTyp& p_idxStr5,
    IdxStrTyp& p_idxStr6,
    IdxStrTyp& p_idxStr7,
    IdxStrTyp& p_idxStr8,
    IdxStrTyp& p_idxStr9,
    IdxStrTyp& p_idxStr10,
    IdxStrTyp& p_idxStr11,
    IdxStrTyp& p_idxStr12,
    IdxStrTyp& p_idxStr13,
    IdxStrTyp& p_idxStr14,
    IdxStrTyp& p_idxStr15,
    DatStrTyp& p_outDatStr
    )

rowAccKernel is used to accumulate the data along row indices

Parameters:

p_paramStr input axis stream of row block parameters
p_inDatStr0..p_inDatStr15 input axis streams of multiplication and partially accumulated results
p_idxStr0..p_idxStr15 input axis streams of row indices
p_outDatStr output axis stream of accumulation results

selMultXkernel

#include "fp64/selMultXkernel.hpp"
void selMultXkernel (
    ParamStrTyp& p_paramStr,
    HBM_StrTyp& p_xStr,
    HBM_StrTyp& p_nnzStr,
    DatStrTyp& p_outDatStr,
    IdxStrTyp& p_idxStr
    )

selMultXkernel is used to select and multiply input vector X with NNZ values

Parameters:

p_paramStr input axis stream of partition parameters
p_xStr input axis stream of X entries
p_nnzStr input axis stream of NNZ values and indices
p_outDatStr output axis stream of multiplication results
p_idxStr output axis stream of row indices of the partially accumulated results

storeYkernel

#include "fp64/storeYkernel.hpp"
void storeYkernel (
    unsigned int p_rows,
    HBM_InfTyp* p_yPtr,
    DatStrTyp& p_yStr
    )

storeYkernel is used to write result Y vector into device memory

Parameters:

p_rows number of entries in the result Y vector
p_yStr input axis stream of Y vector entries
p_yPtr device memory pointer for writing Y vector