namespace blas¶
// enums enum OpType // classes class BLASArgs class BLASHost class BLASHostHandle class ConfigDict class GEMMHost class GEMVHost template < unsigned int t_KBufferDim, unsigned int t_ParEntriesM, unsigned int t_ParEntriesN > class Gemm <float, t_KBufferDim, t_ParEntriesM, t_ParEntriesN, float> template < typename t_DataType, unsigned int t_KBufferDim, unsigned int t_ParEntriesM, unsigned int t_ParEntriesN = t_ParEntriesM, typename t_MacDataType = t_DataType > class Gemm class GemmArgs class GemvArgs class XFpga class XFpgaHold class XHost
amax¶
#include "xf_blas/amax.hpp"
template < typename t_DataType, unsigned int t_LogParEntries, typename t_IndexType > void amax (unsigned int p_n)
amax function that returns the position of the vector element that has the maximum magnitude.
Parameters:
t_DataType | the data type of the vector entries |
t_LogParEntries | log2 of the number of parallelly processed entries in the input vector |
t_IndexType | the datatype of the index |
p_n | the number of entries in the input vector p_x, p_n % l_ParEntries == 0 |
p_x | the input stream of packed vector entries |
p_result | the resulting index, which is 0 if p_n <= 0 |
amin¶
#include "xf_blas/amin.hpp"
template < typename t_DataType, unsigned int t_LogParEntries, typename t_IndexType > void amin (unsigned int p_n)
amin function that returns the position of the vector element that has the minimum magnitude.
Parameters:
t_DataType | the data type of the vector entries |
t_LogParEntries | log2 of the number of parallelly processed entries in the input vector |
t_IndexType | the datatype of the index |
p_n | the number of entries in the input vector p_x, p_n % l_ParEntries == 0 |
p_x | the input stream of packed vector entries |
p_result | the resulting index, which is 0 if p_n <= 0 |
asum¶
#include "xf_blas/asum.hpp"
template < typename t_DataType, unsigned int t_LogParEntries, typename t_IndexType = unsigned int > void asum (unsigned int p_n)
asum function that returns the sum of the magnitude of vector elements.
Parameters:
t_DataType | the data type of the vector entries |
t_LogParEntries | log2 of the number of parallelly processed entries in the input vector |
t_IndexType | the datatype of the index |
p_n | the number of entries in the input vector p_x, p_n % l_ParEntries == 0 |
p_x | the input stream of packed vector entries |
p_sum | the sum, which is 0 if p_n <= 0 |
axpy¶
#include "xf_blas/axpy.hpp"
template < typename t_DataType, unsigned int t_ParEntries, typename t_IndexType = unsigned int > void axpy ( unsigned int p_n, const t_DataType p_alpha, hls::stream <typename WideType <t_DataType, t_ParEntries>::t_TypeInt>& p_x, hls::stream <typename WideType <t_DataType, t_ParEntries>::t_TypeInt>& p_y, hls::stream <typename WideType <t_DataType, t_ParEntries>::t_TypeInt>& p_r )
axpy function that computes Y = alpha*X + Y.
Parameters:
t_DataType | the data type of the vector entries |
t_ParEntries | number of parallelly processed entries in the input vector |
t_IndexType | the datatype of the index |
p_n | the number of entries in the input vector p_x, p_n % t_ParEntries == 0 |
p_x | the input stream of packed entries of vector X |
p_y | the input stream of packed entries of vector Y |
p_r | the output stream of packed entries of result vector Y |
copy¶
#include "xf_blas/copy.hpp"
template < typename t_DataType, unsigned int t_ParEntries, typename t_IndexType = unsigned int > void copy ( unsigned int p_n, hls::stream <typename WideType <t_DataType, t_ParEntries>::t_TypeInt>& p_x, hls::stream <typename WideType <t_DataType, t_ParEntries>::t_TypeInt>& p_y )
copy function that computes Y = X
Parameters:
t_DataType | the data type of the vector entries |
t_ParEntries | number of parallelly processed entries in the packed input vector stream |
t_IndexType | the datatype of the index |
p_n | the number of entries in vector X and Y |
p_x | the packed input vector stream |
p_y | the packed output vector stream |
dot¶
#include "xf_blas/dot.hpp"
template < typename t_DataType, unsigned int t_LogParEntries, typename t_IndexType = unsigned int > void dot (unsigned int p_n)
dot function that returns the dot product of vector x and y.
Parameters:
t_DataType | the data type of the vector entries |
t_LogParEntries | log2 of the number of parallelly processed entries in the input vector |
t_IndexType | the datatype of the index |
p_n | the number of entries in the input vector p_x, p_n % l_ParEntries == 0 |
p_x | the input stream of packed vector entries |
p_res | the dot product of x and y |
gbmv¶
#include "xf_blas/gbmv.hpp"
template < typename t_DataType, unsigned int t_ParEntries, unsigned int t_MaxRows, typename t_IndexType = unsigned int, typename t_MacType = t_DataType > void gbmv ( const unsigned int p_m, const unsigned int p_n, const unsigned int p_kl, const unsigned int p_ku, const t_DataType p_alpha, hls::stream <typename WideType <t_DataType, t_ParEntries>::t_TypeInt>& p_M, hls::stream <typename WideType <t_DataType, t_ParEntries>::t_TypeInt>& p_x, const t_DataType p_beta, hls::stream <typename WideType <t_DataType, t_ParEntries>::t_TypeInt>& p_y, hls::stream <typename WideType <t_DataType, t_ParEntries>::t_TypeInt>& p_yr )
gbmv function that performs general banded matrix-vector multiplication y = alpha * M * x + beta * y
Parameters:
t_DataType | the data type of the vector entries |
t_ParEntries | the number of parallelly processed entries in the input vector |
t_MaxRows | the maximum size of buffers for output vector |
t_IndexType | the datatype of the index |
t_MacType | the datatype of the output stream |
p_m | the number of rows of input matrix p_M |
p_alpha | scalar alpha |
p_M | the input stream of packed Matrix entries |
p_x | the input stream of packed vector entries |
p_beta | scalar beta |
p_y | the output vector |
gemv¶
#include "xf_blas/gemv.hpp"
template < typename t_DataType, unsigned int t_LogParEntries, typename t_IndexType = unsigned int > void gemv ( const unsigned int p_m, const unsigned int p_n, const t_DataType p_alpha )
gemv function that returns the result vector of the multiplication of a matrix and a vector y = alpha * M * x + beta * y
Parameters:
t_DataType | the data type of the vector entries |
t_LogParEntries | log2 of the number of parallelly processed entries in the input vector |
t_IndexType | the datatype of the index |
p_m | the number of rows of input matrix p_M |
p_n | the number of cols of input matrix p_M, as well as the number of entries in the input vector p_x, p_n % l_ParEntries == 0 |
p_alpha | scalar alpha |
p_M | the input stream of packed Matrix entries |
p_x | the input stream of packed vector entries |
p_beta | scalar beta |
p_y | the output vector |
nrm2¶
#include "xf_blas/nrm2.hpp"
template < typename t_DataType, unsigned int t_LogParEntries, typename t_IndexType = unsigned int > void nrm2 (unsigned int p_n)
nrm2 function that returns the Euclidean norm of the vector x.
Parameters:
t_DataType | the data type of the vector entries |
t_LogParEntries | log2 of the number of parallelly processed entries in the input vector |
t_IndexType | the datatype of the index |
p_n | the number of entries in the input vector p_x, p_n % (1<<l_LogParEntries) == 0 |
p_x | the input stream of packed vector entries |
p_res | the nrm2 of x |
scal¶
#include "xf_blas/scal.hpp"
template < typename t_DataType, unsigned int t_ParEntries, typename t_IndexType = unsigned int > void scal ( unsigned int p_n, t_DataType p_alpha, hls::stream <typename WideType <t_DataType, t_ParEntries>::t_TypeInt>& p_x, hls::stream <typename WideType <t_DataType, t_ParEntries>::t_TypeInt>& p_res )
scal function that computes X = alpha * X
Parameters:
t_DataType | the data type of the vector entries |
t_ParEntries | number of parallelly processed entries in the packed input vector stream |
t_IndexType | the datatype of the index |
p_n | the number of entries in vector X, p_n % t_ParEntries == 0 |
p_x | the packed input vector stream |
p_res | the packed output vector stream |
swap¶
#include "xf_blas/swap.hpp"
template < typename t_DataType, unsigned int t_ParEntries, typename t_IndexType = unsigned int > void swap ( unsigned int p_n, hls::stream <typename WideType <t_DataType, t_ParEntries>::t_TypeInt>& p_x, hls::stream <typename WideType <t_DataType, t_ParEntries>::t_TypeInt>& p_y, hls::stream <typename WideType <t_DataType, t_ParEntries>::t_TypeInt>& p_xRes, hls::stream <typename WideType <t_DataType, t_ParEntries>::t_TypeInt>& p_yRes )
swap function that swaps vectors x and y
Parameters:
t_DataType | the data type of the vector entries |
t_ParEntries | number of parallelly processed entries in the packed input vector stream |
t_IndexType | the datatype of the index |
p_n | the number of entries in vector X and Y, p_n % t_ParEntries == 0 |
p_x | the packed input vector stream |
p_y | the packed input vector stream |
p_xRes | the packed output stream |
p_yRes | the packed output stream |
symv¶
#include "xf_blas/symv.hpp"
template < typename t_DataType, unsigned int t_LogParEntries, typename t_IndexType = unsigned int > void symv ( const unsigned int p_n, const t_DataType p_alpha )
symv function that returns the result vector of the multiplication of a symmetric matrix and a vector y = alpha * M * x + beta * y
Parameters:
t_DataType | the data type of the vector entries |
t_LogParEntries | log2 of the number of parallelly processed entries in the input vector |
t_IndexType | the datatype of the index |
p_n | the dimension of input matrix p_M, as well as the number of entries in the input vector p_x, p_n % l_ParEntries == 0 |
p_alpha | scalar alpha |
p_M | the input stream of packed Matrix entries |
p_x | the input stream of packed vector entries |
p_beta | scalar beta |
p_y | the output vector |
trmv¶
#include "xf_blas/trmv.hpp"
template < typename t_DataType, unsigned int t_LogParEntries, typename t_IndexType = unsigned int > void trmv ( const bool uplo, const unsigned int p_n, const t_DataType p_alpha )
trmv function that returns the result vector of the multiplication of a triangular matrix and a vector y = alpha * M * x + beta * y
Parameters:
t_DataType | the data type of the vector entries |
t_LogParEntries | log2 of the number of parallelly processed entries in the input vector |
t_IndexType | the datatype of the index |
p_n | the number of cols of input matrix p_M, as well as the number of entries in the input vector p_x, p_n % l_ParEntries == 0 |
p_alpha | scalar alpha |
p_M | the input stream of packed Matrix entries |
p_x | the input stream of packed vector entries |
p_beta | scalar beta |
p_y | the output vector |
xfblasCreate¶
#include "xf_blas/wrapper.hpp"
xfblasStatus_t xfblasCreate ( const char* xclbin, string configFile, xfblasEngine_t engineName, unsigned int kernelNumber = 1, unsigned int deviceIndex = 0 )
This function initializes the XFBLAS library and creates a handle for the specific engine. It must be called prior to any other XFBLAS library calls.
Parameters:
xclbin | file path to FPGA bitstream |
configFile | file path to config_info.dat file |
engineName | XFBLAS engine to run |
kernelNumber | number of kernels that is being used, default is 1 |
deviceIndex | index of device that is being used, default is 0 |
xfblasStatus_t | 0 if the initialization succeeded |
xfblasStatus_t | 1 if the opencl runtime initialization failed |
xfblasStatus_t | 2 if the xclbin doesn’t contain the engine |
xfblasStatus_t | 4 if the engine is not supported for now |
xfblasMalloc¶
xfblasMalloc overload (1)¶
#include "xf_blas/wrapper.hpp"
xfblasStatus_t xfblasMalloc ( short** devPtr, int rows, int lda, int elemSize, unsigned int kernelIndex = 0, unsigned int deviceIndex = 0 )
This function allocates memory on the FPGA device.
Parameters:
devPtr | pointer to mapped memory |
rows | number of rows in the matrix |
lda | leading dimension of the matrix that indicates the total number of cols in the matrix |
elemSize | number of bytes required to store each element in the matrix |
kernelIndex | index of kernel that is being used, default is 0 |
deviceIndex | index of device that is being used, default is 0 |
xfblasStatus_t | 0 if the allocation completed successfully |
xfblasStatus_t | 1 if the library was not initialized |
xfblasStatus_t | 2 if parameters rows, cols, elemSize, lda <= 0 or cols > lda or data types are not matched |
xfblasStatus_t | 3 if there is memory already allocated to the same matrix |
xfblasStatus_t | 4 if the engine is not supported for now |
xfblasMallocRestricted¶
#include "xf_blas/wrapper.hpp"
xfblasStatus_t xfblasMallocRestricted ( int rows, int cols, int elemSize, void* A, int lda, unsigned int kernelIndex = 0, unsigned int deviceIndex = 0 )
This function allocates memory for host row-major format matrix on the FPGA device.
Parameters:
rows | number of rows in the matrix |
cols | number of cols in the matrix that is being used |
elemSize | number of bytes required to store each element in the matrix |
A | pointer to the matrix array in the host memory |
lda | leading dimension of the matrix that indicates the total number of cols in the matrix |
kernelIndex | index of kernel that is being used, default is 0 |
deviceIndex | index of device that is being used, default is 0 |
xfblasStatus_t | 0 if the allocation completed successfully |
xfblasStatus_t | 1 if the library was not initialized |
xfblasStatus_t | 2 if parameters rows, cols, elemSize, lda <= 0 or cols > lda or data types are not matched |
xfblasStatus_t | 3 if there is memory already allocated to the same matrix |
xfblasStatus_t | 4 if the engine is not supported for now |
xfblasStatus_t | 5 if rows, cols or lda is not padded correctly |
xfblasMallocManaged¶
xfblasMallocManaged overload (1)¶
#include "xf_blas/wrapper.hpp"
xfblasStatus_t xfblasMallocManaged ( short** devPtr, int* paddedLda, int rows, int lda, int elemSize, unsigned int kernelIndex = 0, unsigned int deviceIndex = 0 )
This function allocates memory on the FPGA device, rewrites the leading dimension size after padding.
Parameters:
devPtr | pointer to mapped memory |
paddedLda | leading dimension of the matrix after padding |
rows | number of rows in the matrix |
lda | leading dimension of the matrix that indicates the total number of cols in the matrix |
elemSize | number of bytes required to store each element in the matrix |
kernelIndex | index of kernel that is being used, default is 0 |
deviceIndex | index of device that is being used, default is 0 |
xfblasStatus_t | 0 if the allocation completed successfully |
xfblasStatus_t | 1 if the library was not initialized |
xfblasStatus_t | 2 if parameters rows, cols, elemSize, lda <= 0 or cols > lda or data types are not matched |
xfblasStatus_t | 3 if there is memory already allocated to the same matrix |
xfblasStatus_t | 4 if the engine is not supported for now |
xfblasSetMatrix¶
xfblasSetMatrix overload (1)¶
#include "xf_blas/wrapper.hpp"
xfblasStatus_t xfblasSetMatrix ( int rows, int cols, int elemSize, short* A, int lda, short* d_A, unsigned int kernelIndex = 0, unsigned int deviceIndex = 0 )
This function copies a matrix in host memory to FPGA device memory. xfblasMalloc() needs to be called prior to this function.
Parameters:
rows | number of rows in the matrix |
cols | number of cols in the matrix that is being used |
elemSize | number of bytes required to store each element in the matrix |
A | pointer to the matrix array in the host memory |
lda | leading dimension of the matrix that indicates the total number of cols in the matrix |
d_A | pointer to mapped memory |
kernelIndex | index of kernel that is being used, default is 0 |
deviceIndex | index of device that is being used, default is 0 |
xfblasStatus_t | 0 if the operation completed successfully |
xfblasStatus_t | 1 if the library was not initialized |
xfblasStatus_t | 2 if parameters rows, cols, elemSize, lda <= 0 or cols > lda or data types are not matched |
xfblasStatus_t | 3 if there is no FPGA device memory allocated for the matrix |
xfblasStatus_t | 4 if the engine is not supported for now |
xfblasSetMatrixRestricted¶
#include "xf_blas/wrapper.hpp"
xfblasStatus_t xfblasSetMatrixRestricted ( void* A, unsigned int kernelIndex = 0, unsigned int deviceIndex = 0 )
This function copies a matrix in host memory to FPGA device memory. xfblasMallocRestricted() needs to be called prior to this function.
Parameters:
A | pointer to the matrix array in the host memory |
kernelIndex | index of kernel that is being used, default is 0 |
deviceIndex | index of device that is being used, default is 0 |
xfblasStatus_t | 0 if the operation completed successfully |
xfblasStatus_t | 1 if the library was not initialized |
xfblasStatus_t | 3 if there is no FPGA device memory allocated for the matrix |
xfblasSetVectorRestricted¶
#include "xf_blas/wrapper.hpp"
xfblasStatus_t xfblasSetVectorRestricted ( void* x, unsigned int kernelIndex = 0, unsigned int deviceIndex = 0 )
This function copies a vector in host memory to FPGA device memory. xfblasMallocRestricted() needs to be called prior to this function.
Parameters:
x | pointer to the vector in the host memory |
kernelIndex | index of kernel that is being used, default is 0 |
deviceIndex | index of device that is being used, default is 0 |
xfblasStatus_t | 0 if the operation completed successfully |
xfblasStatus_t | 1 if the library was not initialized |
xfblasStatus_t | 3 if there is no FPGA device memory allocated for the vector |
xfblasDeviceSynchronize¶
#include "xf_blas/wrapper.hpp"
xfblasStatus_t xfblasDeviceSynchronize ( unsigned int kernelIndex = 0, unsigned int deviceIndex = 0 )
This function will synchronize all the device memory to host memory.
Parameters:
kernelIndex | index of kernel that is being used, default is 0 |
deviceIndex | index of device that is being used, default is 0 |
xfblasStatus_t | 0 if the operation completed successfully |
xfblasStatus_t | 1 if the library was not initialized |
xfblasStatus_t | 3 if there is no FPGA device memory allocated for some of the matrices in the host memory |
xfblasGetMatrix¶
xfblasGetMatrix overload (1)¶
#include "xf_blas/wrapper.hpp"
xfblasStatus_t xfblasGetMatrix ( int rows, int cols, int elemSize, short* d_A, short* A, int lda, unsigned int kernelIndex = 0, unsigned int deviceIndex = 0 )
This function copies a matrix in FPGA device memory to host memory.
Parameters:
rows | number of rows in the matrix |
cols | number of cols in the matrix that is being used |
elemSize | number of bytes required to store each element in the matrix |
d_A | pointer to mapped memory |
A | pointer to the matrix array in the host memory |
lda | leading dimension of the matrix that indicates the total number of cols in the matrix |
kernelIndex | index of kernel that is being used, default is 0 |
deviceIndex | index of device that is being used, default is 0 |
xfblasStatus_t | 0 if the operation completed successfully |
xfblasStatus_t | 1 if the library was not initialized |
xfblasStatus_t | 3 if there is no FPGA device memory allocated for the matrix |
xfblasGetMatrixRestricted¶
#include "xf_blas/wrapper.hpp"
xfblasStatus_t xfblasGetMatrixRestricted ( void* A, unsigned int kernelIndex = 0, unsigned int deviceIndex = 0 )
This function copies a matrix in FPGA device memory to host memory.
Parameters:
A | pointer to matrix A in the host memory |
kernelIndex | index of kernel that is being used, default is 0 |
deviceIndex | index of device that is being used, default is 0 |
xfblasStatus_t | 0 if the operation completed successfully |
xfblasStatus_t | 1 if the library was not initialized |
xfblasStatus_t | 3 if there is no FPGA device memory allocated for the matrix |
xfblasGetVectorRestricted¶
#include "xf_blas/wrapper.hpp"
xfblasStatus_t xfblasGetVectorRestricted ( void* x, unsigned int kernelIndex = 0, unsigned int deviceIndex = 0 )
This function copies a vector in FPGA device memory to host memory.
Parameters:
x | pointer to vector x in the host memory |
kernelIndex | index of kernel that is being used, default is 0 |
deviceIndex | index of device that is being used, default is 0 |
xfblasStatus_t | 0 if the operation completed successfully |
xfblasStatus_t | 1 if the library was not initialized |
xfblasStatus_t | 3 if there is no FPGA device memory allocated for the vector |
xfblasFree¶
#include "xf_blas/wrapper.hpp"
xfblasStatus_t xfblasFree ( void* A, unsigned int kernelIndex = 0, unsigned int deviceIndex = 0 )
This function frees memory in FPGA device.
Parameters:
A | pointer to matrix A in the host memory |
kernelIndex | index of kernel that is being used, default is 0 |
deviceIndex | index of device that is being used, default is 0 |
xfblasStatus_t | 0 if the operation completed successfully |
xfblasStatus_t | 1 if the library was not initialized |
xfblasStatus_t | 3 if there is no FPGA device memory allocated for the matrix |
xfblasFreeInstr¶
#include "xf_blas/wrapper.hpp"
xfblasStatus_t xfblasFreeInstr ( unsigned int kernelIndex = 0, unsigned int deviceIndex = 0 )
This function frees instruction.
Parameters:
kernelIndex | index of kernel that is being used, default is 0 |
deviceIndex | index of device that is being used, default is 0 |
xfblasStatus_t | 0 if the operation completed successfully |
xfblasStatus_t | 1 if the library was not initialized |
xfblasDestroy¶
#include "xf_blas/wrapper.hpp"
xfblasStatus_t xfblasDestroy ( unsigned int kernelNumber = 1, unsigned int deviceIndex = 0 )
This function releases handle used by the XFBLAS library.
Parameters:
kernelNumber | number of kernels that is being used, default is 1 |
deviceIndex | index of device that is being used, default is 0 |
xfblasStatus_t | 0 if the shut down succeeded |
xfblasStatus_t | 1 if the library was not initialized |
xfblasGemm¶
xfblasGemm overload (1)¶
#include "xf_blas/wrapper.hpp"
xfblasStatus_t xfblasGemm ( xfblasOperation_t transa, xfblasOperation_t transb, int m, int n, int k, int alpha, void* A, int lda, void* B, int ldb, int beta, void* C, int ldc, unsigned int kernelIndex = 0, unsigned int deviceIndex = 0 )
This function performs the matrix-matrix multiplication C = alpha*op(A)op(B) + beta*C.
Parameters:
transa | operation op(A) that is non- or (conj.) transpose |
transb | operation op(B) that is non- or (conj.) transpose |
m | number of rows in matrix A, matrix C |
n | number of cols in matrix B, matrix C |
k | number of cols in matrix A, number of rows in matrix B |
alpha | scalar used for multiplication |
A | pointer to matrix A in the host memory |
lda | leading dimension of matrix A |
B | pointer to matrix B in the host memory |
ldb | leading dimension of matrix B |
beta | scalar used for multiplication |
C | pointer to matrix C in the host memory |
ldc | leading dimension of matrix C |
kernelIndex | index of kernel that is being used, default is 0 |
deviceIndex | index of device that is being used, default is 0 |
xfblasStatus_t | 0 if the operation completed successfully |
xfblasStatus_t | 1 if the library was not initialized |
xfblasStatus_t | 3 if not all the matrices have FPGA device memory allocated |
xfblasStatus_t | 4 if the engine is not supported for now |
xfblasGetByPointer¶
#include "xf_blas/wrapper.hpp"
xfblasStatus_t xfblasGetByPointer ( void* A, unsigned int kernelIndex = 0, unsigned int deviceIndex = 0 )
This function copies a matrix in FPGA device memory to host memory by pointer.
Parameters:
A | pointer to matrix A in the host memory |
kernelIndex | index of kernel that is being used, default is 0 |
deviceIndex | index of device that is being used, default is 0 |
xfblasStatus_t | 0 if the operation completed successfully |
xfblasStatus_t | 1 if the library was not initialized |
xfblasStatus_t | 3 if there is no FPGA device memory allocated for the matrix |
xfblasGetByAddress¶
#include "xf_blas/wrapper.hpp"
xfblasStatus_t xfblasGetByAddress ( void* A, unsigned long long p_bufSize, unsigned int offset, unsigned int kernelIndex = 0, unsigned int deviceIndex = 0 )
This function copies a matrix in FPGA device memory to host memory by its address in device memory.
Parameters:
A | pointer to matrix A in the host memory |
p_bufSize | size of matrix A |
offset | A’s address in device memory |
kernelIndex | index of kernel that is being used, default is 0 |
deviceIndex | index of device that is being used, default is 0 |
xfblasStatus_t | 0 if the operation completed successfully |
xfblasStatus_t | 1 if the library was not initialized |
xfblasStatus_t | 3 if there is no FPGA device memory allocated for the matrix |
xfblasExecute¶
#include "xf_blas/wrapper.hpp"
xfblasStatus_t xfblasExecute ( unsigned int kernelIndex = 0, unsigned int deviceIndex = 0 )
This function starts the kernel and waits until it finishes.
Parameters:
kernelIndex | index of kernel that is being used, default is 0 |
deviceIndex | index of device that is being used, default is 0 |
xfblasStatus_t | 0 if the operation completed successfully |
xfblasStatus_t | 1 if the library was not initialized |
xfblasStatus_t | 3 if there is no FPGA device memory allocated for instruction |
xfblasExecuteAsync¶
#include "xf_blas/wrapper.hpp"
void xfblasExecuteAsync ( unsigned int numKernels = 1, unsigned int deviceIndex = 0 )
This asynchronous function starts all kernels and waits until they finish.
Parameters:
numKernels | number of kernels that is being used, default is 1 |
deviceIndex | index of device that is being used, default is 0 |