Double Precision SPMV Kernel APIs¶
Note
The double precision SPMV implementation in the current release uses 16 HBM channels to store the NNZ values and indices, 1 HBM channel to store the input dense vector X, 2 HBM channels to store the partition parameters, and 1 HBM channel to store the result vector Y.
assembleYkernel¶
#include "fp64/assembleYkernel.hpp"
void assembleYkernel ( ParamStrTyp& p_paramStr, DatStrTyp& p_datStr, DatStrTyp& p_yStr )
assembleYkernel is used to assemble the accumulated results into Y
Parameters:
p_paramStr | input axis stream of row block parameters |
p_datStr | input axis stream of accumulated results |
p_yStr | output axis stream of Y |
loadNnzKernel¶
#include "fp64/loadNnzKernel.hpp"
void loadNnzKernel ( HBM_InfTyp* p_nnzPtr0, HBM_InfTyp* p_nnzPtr1, HBM_InfTyp* p_nnzPtr2, HBM_InfTyp* p_nnzPtr3, HBM_InfTyp* p_nnzPtr4, HBM_InfTyp* p_nnzPtr5, HBM_InfTyp* p_nnzPtr6, HBM_InfTyp* p_nnzPtr7, HBM_InfTyp* p_nnzPtr8, HBM_InfTyp* p_nnzPtr9, HBM_InfTyp* p_nnzPtr10, HBM_InfTyp* p_nnzPtr11, HBM_InfTyp* p_nnzPtr12, HBM_InfTyp* p_nnzPtr13, HBM_InfTyp* p_nnzPtr14, HBM_InfTyp* p_nnzPtr15, HBM_StrTyp& p_nnzStr0, HBM_StrTyp& p_nnzStr1, HBM_StrTyp& p_nnzStr2, HBM_StrTyp& p_nnzStr3, HBM_StrTyp& p_nnzStr4, HBM_StrTyp& p_nnzStr5, HBM_StrTyp& p_nnzStr6, HBM_StrTyp& p_nnzStr7, HBM_StrTyp& p_nnzStr8, HBM_StrTyp& p_nnzStr9, HBM_StrTyp& p_nnzStr10, HBM_StrTyp& p_nnzStr11, HBM_StrTyp& p_nnzStr12, HBM_StrTyp& p_nnzStr13, HBM_StrTyp& p_nnzStr14, HBM_StrTyp& p_nnzStr15 )
loadNnzKernel is used to read the values of NNZs out of the device memory
Parameters:
p_nnzPtr0..p_nnzPtr15 | device memory pointers for reading the values of NNZs |
p_nnzStr0..p_nnzStr15 | output axis streams of NNZ values |
loadParXkernel¶
#include "fp64/loadParXkernel.hpp"
void loadParXkernel ( HBM_InfTyp* p_parParamPtr, HBM_InfTyp* p_xPtr, WideParamStrTyp& p_paramStr, HBM_StrTyp& p_outXstr )
loadParXkernel is used to read the input vector X and partition parameters out of device memory
Parameters:
p_parParamPtr | device memory pointer for reading the partition parameters |
p_xPtr | device memory pointer for reading vector X |
p_paramStr | output axis stream of partition parameters |
p_outXstr | output axis stream of X entries |
loadRbParamKernel¶
#include "fp64/loadRbParamKernel.hpp"
void loadRbParamKernel ( HBM_InfTyp* p_rbParamPtr, WideParamStrTyp& p_chRbParamStr, ParamStrTyp& p_rbParamStr )
loadRbParamKernel is used to read the row block parameters out of the device memory
Parameters:
p_rbParamPtr | device memory pointer for reading the row block parameters |
p_chRbParamStr | output axis streams of channel row block parameters |
p_rbParamStr | output axis stream of row block parameters |
moveXkernel¶
#include "fp64/moveXkernel.hpp"
void moveXkernel ( HBM_StrTyp& p_inStr, HBM_StrTyp& p_outStr0, HBM_StrTyp& p_outStr1, HBM_StrTyp& p_outStr2, HBM_StrTyp& p_outStr3, HBM_StrTyp& p_outStr4, HBM_StrTyp& p_outStr5, HBM_StrTyp& p_outStr6, HBM_StrTyp& p_outStr7, HBM_StrTyp& p_outStr8, HBM_StrTyp& p_outStr9, HBM_StrTyp& p_outStr10, HBM_StrTyp& p_outStr11, HBM_StrTyp& p_outStr12, HBM_StrTyp& p_outStr13, HBM_StrTyp& p_outStr14, HBM_StrTyp& p_outStr15 )
moveXkernel is used to dispatch X entries to multiple computation paths
Parameters:
p_inStr | input axis stream of X entries |
p_outStr0..p_outStr15 | output axis streams of X entries |
rowAccKernel¶
#include "fp64/rowAccKernel.hpp"
void rowAccKernel ( WideParamStrTyp& p_paramStr, DatStrTyp& p_inDatStr0, DatStrTyp& p_inDatStr1, DatStrTyp& p_inDatStr2, DatStrTyp& p_inDatStr3, DatStrTyp& p_inDatStr4, DatStrTyp& p_inDatStr5, DatStrTyp& p_inDatStr6, DatStrTyp& p_inDatStr7, DatStrTyp& p_inDatStr8, DatStrTyp& p_inDatStr9, DatStrTyp& p_inDatStr10, DatStrTyp& p_inDatStr11, DatStrTyp& p_inDatStr12, DatStrTyp& p_inDatStr13, DatStrTyp& p_inDatStr14, DatStrTyp& p_inDatStr15, IdxStrTyp& p_idxStr0, IdxStrTyp& p_idxStr1, IdxStrTyp& p_idxStr2, IdxStrTyp& p_idxStr3, IdxStrTyp& p_idxStr4, IdxStrTyp& p_idxStr5, IdxStrTyp& p_idxStr6, IdxStrTyp& p_idxStr7, IdxStrTyp& p_idxStr8, IdxStrTyp& p_idxStr9, IdxStrTyp& p_idxStr10, IdxStrTyp& p_idxStr11, IdxStrTyp& p_idxStr12, IdxStrTyp& p_idxStr13, IdxStrTyp& p_idxStr14, IdxStrTyp& p_idxStr15, DatStrTyp& p_outDatStr )
rowAccKernel is used to accumulate the data along row indices
Parameters:
p_paramStr | input axis stream of row block parameters |
p_inDatStr0..p_inDatStr15 | input axis streams of multiplication and partially accumulated results |
p_idxStr0..p_idxStr15 | input axis streams of row indices |
p_outDatStr | output axis stream of accumulation results |
selMultXkernel¶
#include "fp64/selMultXkernel.hpp"
void selMultXkernel ( ParamStrTyp& p_paramStr, HBM_StrTyp& p_xStr, HBM_StrTyp& p_nnzStr, DatStrTyp& p_outDatStr, IdxStrTyp& p_idxStr )
selMultXkernel is used to select and multiply input vector X with NNZ values
Parameters:
p_paramStr | input axis stream of partition parameters |
p_xStr | input axis stream of X entries |
p_nnzStr | input axis stream of NNZ values and indices |
p_outDatStr | output axis stream of multiplication results |
p_idxStr | output axis stream of row indices for the partially accumulated results |
storeYkernel¶
#include "fp64/storeYkernel.hpp"
void storeYkernel ( unsigned int p_rows, HBM_InfTyp* p_yPtr, DatStrTyp& p_yStr )
storeYkernel is used to write result Y vector into device memory
Parameters:
p_rows | number of entries in the result Y vector |
p_yStr | input axis stream of Y vector entries |
p_yPtr | device memory pointer for writing Y vector |