namespace blas¶
// enums enum OpType // classes class BLASArgs class BLASHost class BLASHostHandle class ConfigDict class GEMMHost class GEMVHost template < typename t_DataType, unsigned int t_KBufferDim, unsigned int t_ParEntriesM, unsigned int t_ParEntriesN = t_ParEntriesM, typename t_MacDataType = t_DataType > class Gemm template < unsigned int t_KBufferDim, unsigned int t_ParEntriesM, unsigned int t_ParEntriesN > class Gemm <float, t_KBufferDim, t_ParEntriesM, t_ParEntriesN, float> class GemmArgs template < typename t_FloatType, typename t_XDataType, unsigned int t_DdrWidth, unsigned int t_XDdrWidth, unsigned int t_aColMemWords = 1, unsigned int t_aRowMemWords = 1, unsigned int t_bColMemWords = 1 > class GemmKernel template < typename t_FloatType, unsigned int t_DdrWidth, unsigned int t_colMemWords, unsigned int t_rowMemWords, unsigned int t_kVectorBlocks, unsigned int t_mVectorBlocks > class Gemv class GemvArgs template < typename t_FloatType, unsigned int t_DdrWidth, unsigned int t_colMemWords, unsigned int t_rowMemWords > class Transp class XFpga class XFpgaHold class XHost
amax¶
#include "xf_blas/amax.hpp"
template < typename t_DataType, unsigned int t_LogParEntries, typename t_IndexType > void amax (unsigned int p_n)
amax function that returns the position of the vector element that has the maximum magnitude.
Parameters:
t_DataType |
the data type of the vector entries |
t_LogParEntries |
log2 of the number of parallelly processed entries in the input vector |
t_IndexType |
the datatype of the index |
p_n |
the number of entries in the input vector p_x, p_n % l_ParEntries == 0 |
p_x |
the input stream of packed vector entries |
p_result |
the resulting index, which is 0 if p_n <= 0 |
amin¶
#include "xf_blas/amin.hpp"
template < typename t_DataType, unsigned int t_LogParEntries, typename t_IndexType > void amin (unsigned int p_n)
amin function that returns the position of the vector element that has the minimum magnitude.
Parameters:
t_DataType |
the data type of the vector entries |
t_LogParEntries |
log2 of the number of parallelly processed entries in the input vector |
t_IndexType |
the datatype of the index |
p_n |
the number of entries in the input vector p_x, p_n % l_ParEntries == 0 |
p_x |
the input stream of packed vector entries |
p_result |
the resulting index, which is 0 if p_n <= 0 |
asum¶
#include "xf_blas/asum.hpp"
template < typename t_DataType, unsigned int t_LogParEntries, typename t_IndexType = unsigned int > void asum (unsigned int p_n)
asum function that returns the sum of the magnitude of vector elements.
Parameters:
t_DataType |
the data type of the vector entries |
t_LogParEntries |
log2 of the number of parallelly processed entries in the input vector |
t_IndexType |
the datatype of the index |
p_n |
the number of entries in the input vector p_x, p_n % l_ParEntries == 0 |
p_x |
the input stream of packed vector entries |
p_sum |
the sum, which is 0 if p_n <= 0 |
axpy¶
#include "xf_blas/axpy.hpp"
template < typename t_DataType, unsigned int t_ParEntries, typename t_IndexType = unsigned int > void axpy ( unsigned int p_n, const t_DataType p_alpha, hls::stream <typename WideType <t_DataType, t_ParEntries>::t_TypeInt>& p_x, hls::stream <typename WideType <t_DataType, t_ParEntries>::t_TypeInt>& p_y, hls::stream <typename WideType <t_DataType, t_ParEntries>::t_TypeInt>& p_r )
axpy function that computes Y = alpha*X + Y.
Parameters:
t_DataType |
the data type of the vector entries |
t_ParEntries |
number of parallelly processed entries in the input vector |
t_IndexType |
the datatype of the index |
p_n |
the number of entries in the input vector p_x, p_n % t_ParEntries == 0 |
p_x |
the input stream of packed entries of vector X |
p_y |
the input stream of packed entries of vector Y |
p_r |
the output stream of packed entries of result vector Y |
copy¶
#include "xf_blas/copy.hpp"
template < typename t_DataType, unsigned int t_ParEntries, typename t_IndexType = unsigned int > void copy ( unsigned int p_n, hls::stream <typename WideType <t_DataType, t_ParEntries>::t_TypeInt>& p_x, hls::stream <typename WideType <t_DataType, t_ParEntries>::t_TypeInt>& p_y )
copy function that computes Y = X
Parameters:
t_DataType |
the data type of the vector entries |
t_ParEntries |
number of parallelly processed entries in the packed input vector stream |
t_IndexType |
the datatype of the index |
p_n |
the number of entries in vector X and Y |
p_x |
the packed input vector stream |
p_y |
the packed output vector stream |
dot¶
#include "xf_blas/dot.hpp"
template < typename t_DataType, unsigned int t_LogParEntries, typename t_IndexType = unsigned int > void dot (unsigned int p_n)
dot function that returns the dot product of vector x and y.
Parameters:
t_DataType |
the data type of the vector entries |
t_LogParEntries |
log2 of the number of parallelly processed entries in the input vector |
t_IndexType |
the datatype of the index |
p_n |
the number of entries in the input vector p_x, p_n % l_ParEntries == 0 |
p_x |
the input stream of packed vector entries |
p_res |
the dot product of x and y |
gbmv¶
#include "xf_blas/gbmv.hpp"
template < typename t_DataType, unsigned int t_ParEntries, unsigned int t_MaxRows, typename t_IndexType = unsigned int, typename t_MacType = t_DataType > void gbmv ( const unsigned int p_m, const unsigned int p_n, const unsigned int p_kl, const unsigned int p_ku, const t_DataType p_alpha, hls::stream <typename WideType <t_DataType, t_ParEntries>::t_TypeInt>& p_M, hls::stream <typename WideType <t_DataType, t_ParEntries>::t_TypeInt>& p_x, const t_DataType p_beta, hls::stream <typename WideType <t_DataType, t_ParEntries>::t_TypeInt>& p_y, hls::stream <typename WideType <t_DataType, t_ParEntries>::t_TypeInt>& p_yr )
gbmv function that performs general banded matrix-vector multiplication y = alpha * M * x + beta * y
Parameters:
t_DataType |
the data type of the vector entries |
t_ParEntries |
the number of parallelly processed entries in the input vector |
t_MaxRows |
the maximum size of buffers for output vector |
t_IndexType |
the datatype of the index |
t_MacType |
the datatype of the output stream |
p_m |
the number of rows of input matrix p_M |
p_alpha |
scalar alpha |
p_M |
the input stream of packed Matrix entries |
p_x |
the input stream of packed vector entries |
p_beta |
scalar beta |
p_y |
the output vector |
gemv¶
#include "xf_blas/gemv.hpp"
template < typename t_DataType, unsigned int t_LogParEntries, typename t_IndexType = unsigned int > void gemv ( const unsigned int p_m, const unsigned int p_n, const t_DataType p_alpha )
gemv function that returns the result vector of the multiplication of a matrix and a vector y = alpha * M * x + beta * y
Parameters:
t_DataType |
the data type of the vector entries |
t_LogParEntries |
log2 of the number of parallelly processed entries in the input vector |
t_IndexType |
the datatype of the index |
p_m |
the number of rows of input matrix p_M |
p_n |
the number of cols of input matrix p_M, as well as the number of entries in the input vector p_x, p_n % l_ParEntries == 0 |
p_alpha |
scalar alpha |
p_M |
the input stream of packed Matrix entries |
p_x |
the input stream of packed vector entries |
p_beta |
scalar beta |
p_y |
the output vector |
nrm2¶
#include "xf_blas/nrm2.hpp"
template < typename t_DataType, unsigned int t_LogParEntries, typename t_IndexType = unsigned int > void nrm2 (unsigned int p_n)
nrm2 function that returns the Euclidean norm of the vector x.
Parameters:
t_DataType |
the data type of the vector entries |
t_LogParEntries |
log2 of the number of parallelly processed entries in the input vector |
t_IndexType |
the datatype of the index |
p_n |
the number of entries in the input vector p_x, p_n % (1<<l_LogParEntries) == 0 |
p_x |
the input stream of packed vector entries |
p_res |
the nrm2 of x |
scal¶
#include "xf_blas/scal.hpp"
template < typename t_DataType, unsigned int t_ParEntries, typename t_IndexType = unsigned int > void scal ( unsigned int p_n, t_DataType p_alpha, hls::stream <typename WideType <t_DataType, t_ParEntries>::t_TypeInt>& p_x, hls::stream <typename WideType <t_DataType, t_ParEntries>::t_TypeInt>& p_res )
scal function that computes X = alpha * X
Parameters:
t_DataType |
the data type of the vector entries |
t_ParEntries |
number of parallelly processed entries in the packed input vector stream |
t_IndexType |
the datatype of the index |
p_n |
the number of entries in vector X, p_n % t_ParEntries == 0 |
p_x |
the packed input vector stream |
p_res |
the packed output vector stream |
swap¶
#include "xf_blas/swap.hpp"
template < typename t_DataType, unsigned int t_ParEntries, typename t_IndexType = unsigned int > void swap ( unsigned int p_n, hls::stream <typename WideType <t_DataType, t_ParEntries>::t_TypeInt>& p_x, hls::stream <typename WideType <t_DataType, t_ParEntries>::t_TypeInt>& p_y, hls::stream <typename WideType <t_DataType, t_ParEntries>::t_TypeInt>& p_xRes, hls::stream <typename WideType <t_DataType, t_ParEntries>::t_TypeInt>& p_yRes )
swap function that swaps vectors x and y
Parameters:
t_DataType |
the data type of the vector entries |
t_ParEntries |
number of parallelly processed entries in the packed input vector stream |
t_IndexType |
the datatype of the index |
p_n |
the number of entries in vector X and Y, p_n % t_ParEntries == 0 |
p_x |
the packed input vector stream |
p_y |
the packed input vector stream |
p_xRes |
the packed output stream |
p_yRes |
the packed output stream |
symv¶
#include "xf_blas/symv.hpp"
template < typename t_DataType, unsigned int t_LogParEntries, typename t_IndexType = unsigned int > void symv ( const unsigned int p_n, const t_DataType p_alpha )
symv function that returns the result vector of the multiplication of a symmetric matrix and a vector y = alpha * M * x + beta * y
Parameters:
t_DataType |
the data type of the vector entries |
t_LogParEntries |
log2 of the number of parallelly processed entries in the input vector |
t_IndexType |
the datatype of the index |
p_n |
the dimension of input matrix p_M, as well as the number of entries in the input vector p_x, p_n % l_ParEntries == 0 |
p_alpha |
scalar alpha |
p_M |
the input stream of packed Matrix entries |
p_x |
the input stream of packed vector entries |
p_beta |
scalar beta |
p_y |
the output vector |
trmv¶
#include "xf_blas/trmv.hpp"
template < typename t_DataType, unsigned int t_LogParEntries, typename t_IndexType = unsigned int > void trmv ( const bool uplo, const unsigned int p_n, const t_DataType p_alpha )
trmv function that returns the result vector of the multiplication of a triangular matrix and a vector y = alpha * M * x + beta * y
Parameters:
t_DataType |
the data type of the vector entries |
t_LogParEntries |
log2 of the number of parallelly processed entries in the input vector |
t_IndexType |
the datatype of the index |
p_n |
the number of cols of input matrix p_M, as well as the number of entries in the input vector p_x, p_n % l_ParEntries == 0 |
p_alpha |
scalar alpha |
p_M |
the input stream of packed Matrix entries |
p_x |
the input stream of packed vector entries |
p_beta |
scalar beta |
p_y |
the output vector |
xfblasCreate¶
#include "xf_blas/wrapper.hpp"
xfblasStatus_t xfblasCreate ( const char* xclbin, string configFile, xfblasEngine_t engineName, unsigned int kernelNumber = 1, unsigned int deviceIndex = 0 )
This function initializes the XFBLAS library and creates a handle for the specific engine. It must be called prior to any other XFBLAS library calls.
Parameters:
xclbin |
file path to FPGA bitstream |
configFile |
file path to config_info.dat file |
engineName |
XFBLAS engine to run |
kernelNumber |
number of kernels that are being used, default is 1 |
deviceIndex |
index of device that is being used, default is 0 |
xfblasStatus_t |
0 if the initialization succeeded |
xfblasStatus_t |
1 if the opencl runtime initialization failed |
xfblasStatus_t |
2 if the xclbin doesn’t contain the engine |
xfblasStatus_t |
4 if the engine is not supported for now |
xfblasMalloc¶
xfblasMalloc overload (1)¶
#include "xf_blas/wrapper.hpp"
xfblasStatus_t xfblasMalloc ( short** devPtr, int rows, int lda, int elemSize, unsigned int kernelIndex = 0, unsigned int deviceIndex = 0 )
This function allocates memory on the FPGA device.
Parameters:
devPtr |
pointer to mapped memory |
rows |
number of rows in the matrix |
lda |
leading dimension of the matrix that indicates the total number of cols in the matrix |
elemSize |
number of bytes required to store each element in the matrix |
kernelIndex |
index of kernel that is being used, default is 0 |
deviceIndex |
index of device that is being used, default is 0 |
xfblasStatus_t |
0 if the allocation completed successfully |
xfblasStatus_t |
1 if the library was not initialized |
xfblasStatus_t |
2 if parameters rows, cols, elemSize, lda <= 0 or cols > lda or data types are not matched |
xfblasStatus_t |
3 if there is memory already allocated to the same matrix |
xfblasStatus_t |
4 if the engine is not supported for now |
xfblasMallocRestricted¶
#include "xf_blas/wrapper.hpp"
xfblasStatus_t xfblasMallocRestricted ( int rows, int cols, int elemSize, void* A, int lda, unsigned int kernelIndex = 0, unsigned int deviceIndex = 0 )
This function allocates memory for host row-major format matrix on the FPGA device.
Parameters:
rows |
number of rows in the matrix |
cols |
number of cols in the matrix that is being used |
elemSize |
number of bytes required to store each element in the matrix |
A |
pointer to the matrix array in the host memory |
lda |
leading dimension of the matrix that indicates the total number of cols in the matrix |
kernelIndex |
index of kernel that is being used, default is 0 |
deviceIndex |
index of device that is being used, default is 0 |
xfblasStatus_t |
0 if the allocation completed successfully |
xfblasStatus_t |
1 if the library was not initialized |
xfblasStatus_t |
2 if parameters rows, cols, elemSize, lda <= 0 or cols > lda or data types are not matched |
xfblasStatus_t |
3 if there is memory already allocated to the same matrix |
xfblasStatus_t |
4 if the engine is not supported for now |
xfblasStatus_t |
5 if rows, cols or lda is not padded correctly |
xfblasMallocManaged¶
xfblasMallocManaged overload (1)¶
#include "xf_blas/wrapper.hpp"
xfblasStatus_t xfblasMallocManaged ( short** devPtr, int* paddedLda, int rows, int lda, int elemSize, unsigned int kernelIndex = 0, unsigned int deviceIndex = 0 )
This function allocates memory on the FPGA device, rewrites the leading dimension size after padding.
Parameters:
devPtr |
pointer to mapped memory |
paddedLda |
leading dimension of the matrix after padding |
rows |
number of rows in the matrix |
lda |
leading dimension of the matrix that indicates the total number of cols in the matrix |
elemSize |
number of bytes required to store each element in the matrix |
kernelIndex |
index of kernel that is being used, default is 0 |
deviceIndex |
index of device that is being used, default is 0 |
xfblasStatus_t |
0 if the allocation completed successfully |
xfblasStatus_t |
1 if the library was not initialized |
xfblasStatus_t |
2 if parameters rows, cols, elemSize, lda <= 0 or cols > lda or data types are not matched |
xfblasStatus_t |
3 if there is memory already allocated to the same matrix |
xfblasStatus_t |
4 if the engine is not supported for now |
xfblasSetMatrix¶
xfblasSetMatrix overload (1)¶
#include "xf_blas/wrapper.hpp"
xfblasStatus_t xfblasSetMatrix ( int rows, int cols, int elemSize, short* A, int lda, short* d_A, unsigned int kernelIndex = 0, unsigned int deviceIndex = 0 )
This function copies a matrix in host memory to FPGA device memory. xfblasMalloc() needs to be called prior to this function.
Parameters:
rows |
number of rows in the matrix |
cols |
number of cols in the matrix that is being used |
elemSize |
number of bytes required to store each element in the matrix |
A |
pointer to the matrix array in the host memory |
lda |
leading dimension of the matrix that indicates the total number of cols in the matrix |
d_A |
pointer to mapped memory |
kernelIndex |
index of kernel that is being used, default is 0 |
deviceIndex |
index of device that is being used, default is 0 |
xfblasStatus_t |
0 if the operation completed successfully |
xfblasStatus_t |
1 if the library was not initialized |
xfblasStatus_t |
2 if parameters rows, cols, elemSize, lda <= 0 or cols > lda or data types are not matched |
xfblasStatus_t |
3 if there is no FPGA device memory allocated for the matrix |
xfblasStatus_t |
4 if the engine is not supported for now |
xfblasSetVector¶
xfblasSetVector overload (1)¶
#include "xf_blas/wrapper.hpp"
xfblasStatus_t xfblasSetVector ( int n, int elemSize, short* x, int incx, short* d_x, unsigned int kernelIndex = 0, unsigned int deviceIndex = 0 )
This function copies a vector in host memory to FPGA device memory. xfblasMalloc() needs to be called prior to this function.
Parameters:
n |
number of elements in vector |
elemSize |
number of bytes required to store each element in the vector |
x |
pointer to the vector in the host memory |
incx |
the storage spacing between consecutive elements of vector x |
d_x |
pointer to mapped memory |
kernelIndex |
index of kernel that is being used, default is 0 |
deviceIndex |
index of device that is being used, default is 0 |
xfblasStatus_t |
0 if the operation completed successfully |
xfblasStatus_t |
1 if the library was not initialized |
xfblasStatus_t |
2 if parameters rows, cols, elemSize, lda <= 0 or cols > lda or data types are not matched |
xfblasStatus_t |
3 if there is no FPGA device memory allocated for the vector |
xfblasStatus_t |
4 if the engine is not supported for now |
xfblasSetMatrixRestricted¶
#include "xf_blas/wrapper.hpp"
xfblasStatus_t xfblasSetMatrixRestricted ( void* A, unsigned int kernelIndex = 0, unsigned int deviceIndex = 0 )
This function copies a matrix in host memory to FPGA device memory. xfblasMallocRestricted() needs to be called prior to this function.
Parameters:
A |
pointer to the matrix array in the host memory |
kernelIndex |
index of kernel that is being used, default is 0 |
deviceIndex |
index of device that is being used, default is 0 |
xfblasStatus_t |
0 if the operation completed successfully |
xfblasStatus_t |
1 if the library was not initialized |
xfblasStatus_t |
3 if there is no FPGA device memory allocated for the matrix |
xfblasSetVectorRestricted¶
#include "xf_blas/wrapper.hpp"
xfblasStatus_t xfblasSetVectorRestricted ( void* x, unsigned int kernelIndex = 0, unsigned int deviceIndex = 0 )
This function copies a vector in host memory to FPGA device memory. xfblasMallocRestricted() needs to be called prior to this function.
Parameters:
x |
pointer to the vector in the host memory |
kernelIndex |
index of kernel that is being used, default is 0 |
deviceIndex |
index of device that is being used, default is 0 |
xfblasStatus_t |
0 if the operation completed successfully |
xfblasStatus_t |
1 if the library was not initialized |
xfblasStatus_t |
3 if there is no FPGA device memory allocated for the vector |
xfblasDeviceSynchronize¶
#include "xf_blas/wrapper.hpp"
xfblasStatus_t xfblasDeviceSynchronize ( unsigned int kernelIndex = 0, unsigned int deviceIndex = 0 )
This function will synchronize all the device memory to host memory.
Parameters:
kernelIndex |
index of kernel that is being used, default is 0 |
deviceIndex |
index of device that is being used, default is 0 |
xfblasStatus_t |
0 if the operation completed successfully |
xfblasStatus_t |
1 if the library was not initialized |
xfblasStatus_t |
3 if there is no FPGA device memory allocated for some of the matrices in the host memory |
xfblasGetMatrix¶
xfblasGetMatrix overload (1)¶
#include "xf_blas/wrapper.hpp"
xfblasStatus_t xfblasGetMatrix ( int rows, int cols, int elemSize, short* d_A, short* A, int lda, unsigned int kernelIndex = 0, unsigned int deviceIndex = 0 )
This function copies a matrix in FPGA device memory to host memory.
Parameters:
rows |
number of rows in the matrix |
cols |
number of cols in the matrix that is being used |
elemSize |
number of bytes required to store each element in the matrix |
d_A |
pointer to mapped memory |
A |
pointer to the matrix array in the host memory |
lda |
leading dimension of the matrix that indicates the total number of cols in the matrix |
kernelIndex |
index of kernel that is being used, default is 0 |
deviceIndex |
index of device that is being used, default is 0 |
xfblasStatus_t |
0 if the operation completed successfully |
xfblasStatus_t |
1 if the library was not initialized |
xfblasStatus_t |
3 if there is no FPGA device memory allocated for the matrix |
xfblasGetVector¶
xfblasGetVector overload (1)¶
#include "xf_blas/wrapper.hpp"
xfblasStatus_t xfblasGetVector ( int n, int elemSize, short* d_x, short* x, int incx, unsigned int kernelIndex = 0, unsigned int deviceIndex = 0 )
This function copies a vector in FPGA device memory to host memory.
Parameters:
n |
number of elements in vector |
elemSize |
number of bytes required to store each element in the vector |
d_x |
pointer to mapped memory |
x |
pointer to the vector in the host memory |
incx |
the storage spacing between consecutive elements of vector x |
kernelIndex |
index of kernel that is being used, default is 0 |
deviceIndex |
index of device that is being used, default is 0 |
xfblasStatus_t |
0 if the operation completed successfully |
xfblasStatus_t |
1 if the library was not initialized |
xfblasStatus_t |
3 if there is no FPGA device memory allocated for the vector |
xfblasGetMatrixRestricted¶
#include "xf_blas/wrapper.hpp"
xfblasStatus_t xfblasGetMatrixRestricted ( void* A, unsigned int kernelIndex = 0, unsigned int deviceIndex = 0 )
This function copies a matrix in FPGA device memory to host memory.
Parameters:
A |
pointer to matrix A in the host memory |
kernelIndex |
index of kernel that is being used, default is 0 |
deviceIndex |
index of device that is being used, default is 0 |
xfblasStatus_t |
0 if the operation completed successfully |
xfblasStatus_t |
1 if the library was not initialized |
xfblasStatus_t |
3 if there is no FPGA device memory allocated for the matrix |
xfblasGetVectorRestricted¶
#include "xf_blas/wrapper.hpp"
xfblasStatus_t xfblasGetVectorRestricted ( void* x, unsigned int kernelIndex = 0, unsigned int deviceIndex = 0 )
This function copies a vector in FPGA device memory to host memory.
Parameters:
x |
pointer to vector x in the host memory |
kernelIndex |
index of kernel that is being used, default is 0 |
deviceIndex |
index of device that is being used, default is 0 |
xfblasStatus_t |
0 if the operation completed successfully |
xfblasStatus_t |
1 if the library was not initialized |
xfblasStatus_t |
3 if there is no FPGA device memory allocated for the vector |
xfblasFree¶
#include "xf_blas/wrapper.hpp"
xfblasStatus_t xfblasFree ( void* A, unsigned int kernelIndex = 0, unsigned int deviceIndex = 0 )
This function frees memory in FPGA device.
Parameters:
A |
pointer to matrix A in the host memory |
kernelIndex |
index of kernel that is being used, default is 0 |
deviceIndex |
index of device that is being used, default is 0 |
xfblasStatus_t |
0 if the operation completed successfully |
xfblasStatus_t |
1 if the library was not initialized |
xfblasStatus_t |
3 if there is no FPGA device memory allocated for the matrix |
xfblasFreeInstr¶
#include "xf_blas/wrapper.hpp"
xfblasStatus_t xfblasFreeInstr ( unsigned int kernelIndex = 0, unsigned int deviceIndex = 0 )
This function frees the instruction memory.
Parameters:
kernelIndex |
index of kernel that is being used, default is 0 |
deviceIndex |
index of device that is being used, default is 0 |
xfblasStatus_t |
0 if the operation completed successfully |
xfblasStatus_t |
1 if the library was not initialized |
xfblasDestroy¶
#include "xf_blas/wrapper.hpp"
xfblasStatus_t xfblasDestroy ( unsigned int kernelNumber = 1, unsigned int deviceIndex = 0 )
This function releases handle used by the XFBLAS library.
Parameters:
kernelNumber |
number of kernels that are being used, default is 1 |
deviceIndex |
index of device that is being used, default is 0 |
xfblasStatus_t |
0 if the shut down succeeded |
xfblasStatus_t |
1 if the library was not initialized |
xfblasGemm¶
xfblasGemm overload (1)¶
#include "xf_blas/wrapper.hpp"
xfblasStatus_t xfblasGemm ( xfblasOperation_t transa, xfblasOperation_t transb, int m, int n, int k, int alpha, void* A, int lda, void* B, int ldb, int beta, void* C, int ldc, unsigned int kernelIndex = 0, unsigned int deviceIndex = 0 )
This function performs the matrix-matrix multiplication C = alpha*op(A)op(B) + beta*C.
Parameters:
transa |
operation op(A) that is non- or (conj.) transpose |
transb |
operation op(B) that is non- or (conj.) transpose |
m |
number of rows in matrix A, matrix C |
n |
number of cols in matrix B, matrix C |
k |
number of cols in matrix A, number of rows in matrix B |
alpha |
scalar used for multiplication |
A |
pointer to matrix A in the host memory |
lda |
leading dimension of matrix A |
B |
pointer to matrix B in the host memory |
ldb |
leading dimension of matrix B |
beta |
scalar used for multiplication |
C |
pointer to matrix C in the host memory |
ldc |
leading dimension of matrix C |
kernelIndex |
index of kernel that is being used, default is 0 |
deviceIndex |
index of device that is being used, default is 0 |
xfblasStatus_t |
0 if the operation completed successfully |
xfblasStatus_t |
1 if the library was not initialized |
xfblasStatus_t |
3 if not all the matrices have FPGA device memory allocated |
xfblasStatus_t |
4 if the engine is not supported for now |
xfblasGemv¶
#include "xf_blas/wrapper.hpp"
xfblasStatus_t xfblasGemv ( xfblasOperation_t trans, int m, int n, int alpha, void* A, int lda, void* x, int incx, int beta, void* y, int incy, unsigned int kernelIndex = 0, unsigned int deviceIndex = 0 )
This function performs the matrix-vector multiplication y = alpha*op(A)*x + beta*y.
Parameters:
trans |
operation op(A) that is non- or (conj.) transpose |
m |
number of rows in matrix A |
n |
number of cols in matrix A |
alpha |
scalar used for multiplication |
A |
pointer to matrix A in the host memory |
lda |
leading dimension of matrix A |
x |
pointer to vector x in the host memory |
incx |
stride between consecutive elements of x |
beta |
scalar used for multiplication |
y |
pointer to vector y in the host memory |
incy |
stride between consecutive elements of y |
kernelIndex |
index of kernel that is being used, default is 0 |
deviceIndex |
index of device that is being used, default is 0 |
xfblasStatus_t |
0 if the operation completed successfully |
xfblasStatus_t |
1 if the library was not initialized |
xfblasStatus_t |
3 if not all the matrices have FPGA device memory allocated |
xfblasStatus_t |
4 if the engine is not supported for now |
xfblasGetByPointer¶
#include "xf_blas/wrapper.hpp"
xfblasStatus_t xfblasGetByPointer ( void* A, unsigned int kernelIndex = 0, unsigned int deviceIndex = 0 )
This function copies a matrix in FPGA device memory to host memory by pointer.
Parameters:
A |
pointer to matrix A in the host memory |
kernelIndex |
index of kernel that is being used, default is 0 |
deviceIndex |
index of device that is being used, default is 0 |
xfblasStatus_t |
0 if the operation completed successfully |
xfblasStatus_t |
1 if the library was not initialized |
xfblasStatus_t |
3 if there is no FPGA device memory allocated for the matrix |
xfblasGetByAddress¶
#include "xf_blas/wrapper.hpp"
xfblasStatus_t xfblasGetByAddress ( void* A, unsigned long long p_bufSize, unsigned int offset, unsigned int kernelIndex = 0, unsigned int deviceIndex = 0 )
This function copies a matrix in FPGA device memory to host memory by its address in device memory.
Parameters:
A |
pointer to matrix A in the host memory |
p_bufSize |
size of matrix A |
offset |
A’s address in device memory |
kernelIndex |
index of kernel that is being used, default is 0 |
deviceIndex |
index of device that is being used, default is 0 |
xfblasStatus_t |
0 if the operation completed successfully |
xfblasStatus_t |
1 if the library was not initialized |
xfblasStatus_t |
3 if there is no FPGA device memory allocated for the matrix |
xfblasExecute¶
#include "xf_blas/wrapper.hpp"
xfblasStatus_t xfblasExecute ( unsigned int kernelIndex = 0, unsigned int deviceIndex = 0 )
This function starts the kernel and waits until it finishes.
Parameters:
kernelIndex |
index of kernel that is being used, default is 0 |
deviceIndex |
index of device that is being used, default is 0 |
xfblasStatus_t |
0 if the operation completed successfully |
xfblasStatus_t |
1 if the library was not initialized |
xfblasStatus_t |
3 if there is no FPGA device memory allocated for the instruction |
xfblasExecuteAsync¶
#include "xf_blas/wrapper.hpp"
void xfblasExecuteAsync ( unsigned int numKernels = 1, unsigned int deviceIndex = 0 )
This asynchronous function starts all kernels and waits until they finish.
Parameters:
numKernels |
number of kernels that are being used, default is 1 |
deviceIndex |
index of device that is being used, default is 0 |