namespace blas¶

// enums

enum OpType

// classes

class BLASArgs
class BLASHost
class BLASHostHandle
class ConfigDict
class GEMMHost
class GEMVHost

template <
    typename t_DataType,
    unsigned int t_KBufferDim,
    unsigned int t_ParEntriesM,
    unsigned int t_ParEntriesN = t_ParEntriesM,
    typename t_MacDataType = t_DataType
    >
class Gemm

template <
    unsigned int t_KBufferDim,
    unsigned int t_ParEntriesM,
    unsigned int t_ParEntriesN
    >
class Gemm <float, t_KBufferDim, t_ParEntriesM, t_ParEntriesN, float>

class GemmArgs

template <
    typename t_FloatType,
    typename t_XDataType,
    unsigned int t_DdrWidth,
    unsigned int t_XDdrWidth,
    unsigned int t_aColMemWords = 1,
    unsigned int t_aRowMemWords = 1,
    unsigned int t_bColMemWords = 1
    >
class GemmKernel

template <
    typename t_FloatType,
    unsigned int t_DdrWidth,
    unsigned int t_colMemWords,
    unsigned int t_rowMemWords,
    unsigned int t_kVectorBlocks,
    unsigned int t_mVectorBlocks
    >
class Gemv

class GemvArgs

template <
    typename t_FloatType,
    unsigned int t_DdrWidth,
    unsigned int t_colMemWords,
    unsigned int t_rowMemWords
    >
class Transp

class XFpga
class XFpgaHold
class XHost

amax¶

#include "xf_blas/amax.hpp"

template <
    typename t_DataType,
    unsigned int t_LogParEntries,
    typename t_IndexType
    >
void amax (unsigned int p_n)

amax function that returns the position of the vector element that has the maximum magnitude.

Parameters:

t_DataType	the data type of the vector entries
t_LogParEntries	log2 of the number of parallelly processed entries in the input vector
t_IndexType	the datatype of the index
p_n	the number of entries in the input vector p_x, p_n % l_ParEntries == 0
p_x	the input stream of packed vector entries
p_result	the resulting index, which is 0 if p_n <= 0

amin¶

#include "xf_blas/amin.hpp"

template <
    typename t_DataType,
    unsigned int t_LogParEntries,
    typename t_IndexType
    >
void amin (unsigned int p_n)

amin function that returns the position of the vector element that has the minimum magnitude.

Parameters:

t_DataType	the data type of the vector entries
t_LogParEntries	log2 of the number of parallelly processed entries in the input vector
t_IndexType	the datatype of the index
p_n	the number of entries in the input vector p_x, p_n % l_ParEntries == 0
p_x	the input stream of packed vector entries
p_result	the resulting index, which is 0 if p_n <= 0

asum¶

#include "xf_blas/asum.hpp"

template <
    typename t_DataType,
    unsigned int t_LogParEntries,
    typename t_IndexType = unsigned int
    >
void asum (unsigned int p_n)

asum function that returns the sum of the magnitude of vector elements.

Parameters:

t_DataType	the data type of the vector entries
t_LogParEntries	log2 of the number of parallelly processed entries in the input vector
t_IndexType	the datatype of the index
p_n	the number of entries in the input vector p_x, p_n % l_ParEntries == 0
p_x	the input stream of packed vector entries
p_sum	the sum, which is 0 if p_n <= 0

axpy¶

#include "xf_blas/axpy.hpp"

template <
    typename t_DataType,
    unsigned int t_ParEntries,
    typename t_IndexType = unsigned int
    >
void axpy (
    unsigned int p_n,
    const t_DataType p_alpha,
    hls::stream <typename WideType <t_DataType, t_ParEntries>::t_TypeInt>& p_x,
    hls::stream <typename WideType <t_DataType, t_ParEntries>::t_TypeInt>& p_y,
    hls::stream <typename WideType <t_DataType, t_ParEntries>::t_TypeInt>& p_r
    )

axpy function that computes Y = alpha*X + Y.

Parameters:

t_DataType	the data type of the vector entries
t_ParEntries	number of parallelly processed entries in the input vector
t_IndexType	the datatype of the index
p_n	the number of entries in the input vector p_x, p_n % t_ParEntries == 0
p_x	the input stream of packed entries of vector X
p_y	the input stream of packed entries of vector Y
p_r	the output stream of packed entries of result vector Y

copy¶

#include "xf_blas/copy.hpp"

template <
    typename t_DataType,
    unsigned int t_ParEntries,
    typename t_IndexType = unsigned int
    >
void copy (
    unsigned int p_n,
    hls::stream <typename WideType <t_DataType, t_ParEntries>::t_TypeInt>& p_x,
    hls::stream <typename WideType <t_DataType, t_ParEntries>::t_TypeInt>& p_y
    )

copy function that computes Y = X

Parameters:

t_DataType	the data type of the vector entries
t_ParEntries	number of parallelly processed entries in the packed input vector stream
t_IndexType	the datatype of the index
p_n	the number of entries in vector X and Y
p_x	the packed input vector stream
p_y	the packed output vector stream

dot¶

#include "xf_blas/dot.hpp"

template <
    typename t_DataType,
    unsigned int t_LogParEntries,
    typename t_IndexType = unsigned int
    >
void dot (unsigned int p_n)

dot function that returns the dot product of vector x and y.

Parameters:

t_DataType	the data type of the vector entries
t_LogParEntries	log2 of the number of parallelly processed entries in the input vector
t_IndexType	the datatype of the index
p_n	the number of entries in the input vector p_x, p_n % l_ParEntries == 0
p_x	the input stream of packed vector entries
p_res	the dot product of x and y

gbmv¶

#include "xf_blas/gbmv.hpp"

template <
    typename t_DataType,
    unsigned int t_ParEntries,
    unsigned int t_MaxRows,
    typename t_IndexType = unsigned int,
    typename t_MacType = t_DataType
    >
void gbmv (
    const unsigned int p_m,
    const unsigned int p_n,
    const unsigned int p_kl,
    const unsigned int p_ku,
    const t_DataType p_alpha,
    hls::stream <typename WideType <t_DataType, t_ParEntries>::t_TypeInt>& p_M,
    hls::stream <typename WideType <t_DataType, t_ParEntries>::t_TypeInt>& p_x,
    const t_DataType p_beta,
    hls::stream <typename WideType <t_DataType, t_ParEntries>::t_TypeInt>& p_y,
    hls::stream <typename WideType <t_DataType, t_ParEntries>::t_TypeInt>& p_yr
    )

gbmv function that performs the multiplication of a general banded matrix and a vector: y = alpha * M * x + beta * y

Parameters:

t_DataType	the data type of the vector entries
t_ParEntries	the number of parallelly processed entries in the input vector
t_MaxRows	the maximum size of buffers for output vector
t_IndexType	the datatype of the index
t_MacType	the datatype of the output stream
p_m	the number of rows of input matrix p_M
p_alpha	scalar alpha
p_M	the input stream of packed Matrix entries
p_x	the input stream of packed vector entries
p_beta	scalar beta
p_y	the output vector

gemv¶

#include "xf_blas/gemv.hpp"

template <
    typename t_DataType,
    unsigned int t_LogParEntries,
    typename t_IndexType = unsigned int
    >
void gemv (
    const unsigned int p_m,
    const unsigned int p_n,
    const t_DataType p_alpha
    )

gemv function that returns the result vector of the multiplication of a matrix and a vector y = alpha * M * x + beta * y

Parameters:

t_DataType	the data type of the vector entries
t_LogParEntries	log2 of the number of parallelly processed entries in the input vector
t_IndexType	the datatype of the index
p_m	the number of rows of input matrix p_M
p_n	the number of cols of input matrix p_M, as well as the number of entries in the input vector p_x, p_n % l_ParEntries == 0
p_alpha	scalar alpha
p_M	the input stream of packed Matrix entries
p_x	the input stream of packed vector entries
p_beta	scalar beta
p_y	the output vector

nrm2¶

#include "xf_blas/nrm2.hpp"

template <
    typename t_DataType,
    unsigned int t_LogParEntries,
    typename t_IndexType = unsigned int
    >
void nrm2 (unsigned int p_n)

nrm2 function that returns the Euclidean norm of the vector x.

Parameters:

t_DataType	the data type of the vector entries
t_LogParEntries	log2 of the number of parallelly processed entries in the input vector
t_IndexType	the datatype of the index
p_n	the number of entries in the input vector p_x, p_n % (1<<l_LogParEntries) == 0
p_x	the input stream of packed vector entries
p_res	the nrm2 of x

scal¶

#include "xf_blas/scal.hpp"

template <
    typename t_DataType,
    unsigned int t_ParEntries,
    typename t_IndexType = unsigned int
    >
void scal (
    unsigned int p_n,
    t_DataType p_alpha,
    hls::stream <typename WideType <t_DataType, t_ParEntries>::t_TypeInt>& p_x,
    hls::stream <typename WideType <t_DataType, t_ParEntries>::t_TypeInt>& p_res
    )

scal function that computes X = alpha * X

Parameters:

t_DataType	the data type of the vector entries
t_ParEntries	number of parallelly processed entries in the packed input vector stream
t_IndexType	the datatype of the index
p_n	the number of entries in vector X, p_n % t_ParEntries == 0
p_x	the packed input vector stream
p_res	the packed output vector stream

swap¶

#include "xf_blas/swap.hpp"

template <
    typename t_DataType,
    unsigned int t_ParEntries,
    typename t_IndexType = unsigned int
    >
void swap (
    unsigned int p_n,
    hls::stream <typename WideType <t_DataType, t_ParEntries>::t_TypeInt>& p_x,
    hls::stream <typename WideType <t_DataType, t_ParEntries>::t_TypeInt>& p_y,
    hls::stream <typename WideType <t_DataType, t_ParEntries>::t_TypeInt>& p_xRes,
    hls::stream <typename WideType <t_DataType, t_ParEntries>::t_TypeInt>& p_yRes
    )

swap function that swaps vector x and y

Parameters:

t_DataType	the data type of the vector entries
t_ParEntries	number of parallelly processed entries in the packed input vector stream
t_IndexType	the datatype of the index
p_n	the number of entries in vector X and Y, p_n % t_ParEntries == 0
p_x	the packed input vector stream
p_y	the packed input vector stream
p_xRes	the packed output stream
p_yRes	the packed output stream

symv¶

#include "xf_blas/symv.hpp"

template <
    typename t_DataType,
    unsigned int t_LogParEntries,
    typename t_IndexType = unsigned int
    >
void symv (
    const unsigned int p_n,
    const t_DataType p_alpha
    )

symv function that returns the result vector of the multiplication of a symmetric matrix and a vector y = alpha * M * x + beta * y

Parameters:

t_DataType	the data type of the vector entries
t_LogParEntries	log2 of the number of parallelly processed entries in the input vector
t_IndexType	the datatype of the index
p_n	the dimension of input matrix p_M, as well as the number of entries in the input vector p_x, p_n % l_ParEntries == 0
p_alpha	scalar alpha
p_M	the input stream of packed Matrix entries
p_x	the input stream of packed vector entries
p_beta	scalar beta
p_y	the output vector

trmv¶

#include "xf_blas/trmv.hpp"

template <
    typename t_DataType,
    unsigned int t_LogParEntries,
    typename t_IndexType = unsigned int
    >
void trmv (
    const bool uplo,
    const unsigned int p_n,
    const t_DataType p_alpha
    )

trmv function that returns the result vector of the multiplication of a triangular matrix and a vector y = alpha * M * x + beta * y

Parameters:

t_DataType	the data type of the vector entries
t_LogParEntries	log2 of the number of parallelly processed entries in the input vector
t_IndexType	the datatype of the index
p_n	the number of cols of input matrix p_M, as well as the number of entries in the input vector p_x, p_n % l_ParEntries == 0
p_alpha	scalar alpha
p_M	the input stream of packed Matrix entries
p_x	the input stream of packed vector entries
p_beta	scalar beta
p_y	the output vector

xfblasCreate¶

#include "xf_blas/wrapper.hpp"

xfblasStatus_t xfblasCreate (
    const char* xclbin,
    string configFile,
    xfblasEngine_t engineName,
    unsigned int kernelNumber = 1,
    unsigned int deviceIndex = 0
    )

This function initializes the XFBLAS library and creates a handle for the specific engine. It must be called prior to any other XFBLAS library calls.

Parameters:

xclbin	file path to FPGA bitstream
configFile	file path to config_info.dat file
engineName	XFBLAS engine to run
kernelNumber	number of kernels that is being used, default is 1
deviceIndex	index of device that is being used, default is 0
xfblasStatus_t	0 if the initialization succeeded
xfblasStatus_t	1 if the opencl runtime initialization failed
xfblasStatus_t	2 if the xclbin doesn’t contain the engine
xfblasStatus_t	4 if the engine is not supported for now

xfblasMalloc¶

xfblasMalloc overload (1)¶

#include "xf_blas/wrapper.hpp"

xfblasStatus_t xfblasMalloc (
    short** devPtr,
    int rows,
    int lda,
    int elemSize,
    unsigned int kernelIndex = 0,
    unsigned int deviceIndex = 0
    )

This function allocates memory on the FPGA device.

Parameters:

devPtr	pointer to mapped memory
rows	number of rows in the matrix
lda	leading dimension of the matrix that indicates the total number of cols in the matrix
elemSize	number of bytes required to store each element in the matrix
kernelIndex	index of kernel that is being used, default is 0
deviceIndex	index of device that is being used, default is 0
xfblasStatus_t	0 if the allocation completed successfully
xfblasStatus_t	1 if the library was not initialized
xfblasStatus_t	2 if parameters rows, cols, elemSize, lda <= 0 or cols > lda or data types are not matched
xfblasStatus_t	3 if there is memory already allocated to the same matrix
xfblasStatus_t	4 if the engine is not supported for now

xfblasMallocRestricted¶

#include "xf_blas/wrapper.hpp"

xfblasStatus_t xfblasMallocRestricted (
    int rows,
    int cols,
    int elemSize,
    void* A,
    int lda,
    unsigned int kernelIndex = 0,
    unsigned int deviceIndex = 0
    )

This function allocates memory for host row-major format matrix on the FPGA device.

Parameters:

rows	number of rows in the matrix
cols	number of cols in the matrix that is being used
elemSize	number of bytes required to store each element in the matrix
A	pointer to the matrix array in the host memory
lda	leading dimension of the matrix that indicates the total number of cols in the matrix
kernelIndex	index of kernel that is being used, default is 0
deviceIndex	index of device that is being used, default is 0
xfblasStatus_t	0 if the allocation completed successfully
xfblasStatus_t	1 if the library was not initialized
xfblasStatus_t	2 if parameters rows, cols, elemSize, lda <= 0 or cols > lda or data types are not matched
xfblasStatus_t	3 if there is memory already allocated to the same matrix
xfblasStatus_t	4 if the engine is not supported for now
xfblasStatus_t	5 if rows, cols or lda is not padded correctly

xfblasMallocManaged¶

xfblasMallocManaged overload (1)¶

#include "xf_blas/wrapper.hpp"

xfblasStatus_t xfblasMallocManaged (
    short** devPtr,
    int* paddedLda,
    int rows,
    int lda,
    int elemSize,
    unsigned int kernelIndex = 0,
    unsigned int deviceIndex = 0
    )

This function allocates memory on the FPGA device, rewrites the leading dimension size after padding.

Parameters:

devPtr	pointer to mapped memory
paddedLda	leading dimension of the matrix after padding
rows	number of rows in the matrix
lda	leading dimension of the matrix that indicates the total number of cols in the matrix
elemSize	number of bytes required to store each element in the matrix
kernelIndex	index of kernel that is being used, default is 0
deviceIndex	index of device that is being used, default is 0
xfblasStatus_t	0 if the allocation completed successfully
xfblasStatus_t	1 if the library was not initialized
xfblasStatus_t	2 if parameters rows, cols, elemSize, lda <= 0 or cols > lda or data types are not matched
xfblasStatus_t	3 if there is memory already allocated to the same matrix
xfblasStatus_t	4 if the engine is not supported for now

xfblasSetMatrix¶

xfblasSetMatrix overload (1)¶

#include "xf_blas/wrapper.hpp"

xfblasStatus_t xfblasSetMatrix (
    int rows,
    int cols,
    int elemSize,
    short* A,
    int lda,
    short* d_A,
    unsigned int kernelIndex = 0,
    unsigned int deviceIndex = 0
    )

This function copies a matrix in host memory to FPGA device memory. xfblasMalloc() needs to be called prior to this function.

Parameters:

rows	number of rows in the matrix
cols	number of cols in the matrix that is being used
elemSize	number of bytes required to store each element in the matrix
A	pointer to the matrix array in the host memory
lda	leading dimension of the matrix that indicates the total number of cols in the matrix
d_A	pointer to mapped memory
kernelIndex	index of kernel that is being used, default is 0
deviceIndex	index of device that is being used, default is 0
xfblasStatus_t	0 if the operation completed successfully
xfblasStatus_t	1 if the library was not initialized
xfblasStatus_t	2 if parameters rows, cols, elemSize, lda <= 0 or cols > lda or data types are not matched
xfblasStatus_t	3 if there is no FPGA device memory allocated for the matrix
xfblasStatus_t	4 if the engine is not supported for now

xfblasSetVector¶

xfblasSetVector overload (1)¶

#include "xf_blas/wrapper.hpp"

xfblasStatus_t xfblasSetVector (
    int n,
    int elemSize,
    short* x,
    int incx,
    short* d_x,
    unsigned int kernelIndex = 0,
    unsigned int deviceIndex = 0
    )

This function copies a vector in host memory to FPGA device memory. xfblasMalloc() needs to be called prior to this function.

Parameters:

n	number of elements in vector
elemSize	number of bytes required to store each element in the vector
x	pointer to the vector in the host memory
incx	the storage spacing between consecutive elements of vector x
d_x	pointer to mapped memory
kernelIndex	index of kernel that is being used, default is 0
deviceIndex	index of device that is being used, default is 0
xfblasStatus_t	0 if the operation completed successfully
xfblasStatus_t	1 if the library was not initialized
xfblasStatus_t	2 if parameters rows, cols, elemSize, lda <= 0 or cols > lda or data types are not matched
xfblasStatus_t	3 if there is no FPGA device memory allocated for the vector
xfblasStatus_t	4 if the engine is not supported for now

xfblasSetMatrixRestricted¶

#include "xf_blas/wrapper.hpp"

xfblasStatus_t xfblasSetMatrixRestricted (
    void* A,
    unsigned int kernelIndex = 0,
    unsigned int deviceIndex = 0
    )

This function copies a matrix in host memory to FPGA device memory. xfblasMallocRestricted() needs to be called prior to this function.

Parameters:

A	pointer to the matrix array in the host memory
kernelIndex	index of kernel that is being used, default is 0
deviceIndex	index of device that is being used, default is 0
xfblasStatus_t	0 if the operation completed successfully
xfblasStatus_t	1 if the library was not initialized
xfblasStatus_t	3 if there is no FPGA device memory allocated for the matrix

xfblasSetVectorRestricted¶

#include "xf_blas/wrapper.hpp"

xfblasStatus_t xfblasSetVectorRestricted (
    void* x,
    unsigned int kernelIndex = 0,
    unsigned int deviceIndex = 0
    )

This function copies a vector in host memory to FPGA device memory. xfblasMallocRestricted() needs to be called prior to this function.

Parameters:

x	pointer to the vector in the host memory
kernelIndex	index of kernel that is being used, default is 0
deviceIndex	index of device that is being used, default is 0
xfblasStatus_t	0 if the operation completed successfully
xfblasStatus_t	1 if the library was not initialized
xfblasStatus_t	3 if there is no FPGA device memory allocated for the vector

xfblasDeviceSynchronize¶

#include "xf_blas/wrapper.hpp"

xfblasStatus_t xfblasDeviceSynchronize (
    unsigned int kernelIndex = 0,
    unsigned int deviceIndex = 0
    )

This function will synchronize all the device memory to host memory.

Parameters:

kernelIndex	index of kernel that is being used, default is 0
deviceIndex	index of device that is being used, default is 0
xfblasStatus_t	0 if the operation completed successfully
xfblasStatus_t	1 if the library was not initialized
xfblasStatus_t	3 if there is no FPGA device memory allocated for some of the matrices in the host memory

xfblasGetMatrix¶

xfblasGetMatrix overload (1)¶

#include "xf_blas/wrapper.hpp"

xfblasStatus_t xfblasGetMatrix (
    int rows,
    int cols,
    int elemSize,
    short* d_A,
    short* A,
    int lda,
    unsigned int kernelIndex = 0,
    unsigned int deviceIndex = 0
    )

This function copies a matrix in FPGA device memory to host memory.

Parameters:

rows	number of rows in the matrix
cols	number of cols in the matrix that is being used
elemSize	number of bytes required to store each element in the matrix
d_A	pointer to mapped memory
A	pointer to the matrix array in the host memory
lda	leading dimension of the matrix that indicates the total number of cols in the matrix
kernelIndex	index of kernel that is being used, default is 0
deviceIndex	index of device that is being used, default is 0
xfblasStatus_t	0 if the operation completed successfully
xfblasStatus_t	1 if the library was not initialized
xfblasStatus_t	3 if there is no FPGA device memory allocated for the matrix

xfblasGetVector¶

xfblasGetVector overload (1)¶

#include "xf_blas/wrapper.hpp"

xfblasStatus_t xfblasGetVector (
    int n,
    int elemSize,
    short* d_x,
    short* x,
    int incx,
    unsigned int kernelIndex = 0,
    unsigned int deviceIndex = 0
    )

This function copies a vector in FPGA device memory to host memory.

Parameters:

n	number of elements in vector
elemSize	number of bytes required to store each element in the vector
d_x	pointer to mapped memory
x	pointer to the vector in the host memory
incx	the storage spacing between consecutive elements of vector x
kernelIndex	index of kernel that is being used, default is 0
deviceIndex	index of device that is being used, default is 0
xfblasStatus_t	0 if the operation completed successfully
xfblasStatus_t	1 if the library was not initialized
xfblasStatus_t	3 if there is no FPGA device memory allocated for the vector

xfblasGetMatrixRestricted¶

#include "xf_blas/wrapper.hpp"

xfblasStatus_t xfblasGetMatrixRestricted (
    void* A,
    unsigned int kernelIndex = 0,
    unsigned int deviceIndex = 0
    )

This function copies a matrix in FPGA device memory to host memory.

Parameters:

A	pointer to matrix A in the host memory
kernelIndex	index of kernel that is being used, default is 0
deviceIndex	index of device that is being used, default is 0
xfblasStatus_t	0 if the operation completed successfully
xfblasStatus_t	1 if the library was not initialized
xfblasStatus_t	3 if there is no FPGA device memory allocated for the matrix

xfblasGetVectorRestricted¶

#include "xf_blas/wrapper.hpp"

xfblasStatus_t xfblasGetVectorRestricted (
    void* x,
    unsigned int kernelIndex = 0,
    unsigned int deviceIndex = 0
    )

This function copies a vector in FPGA device memory to host memory.

Parameters:

x	pointer to vector x in the host memory
kernelIndex	index of kernel that is being used, default is 0
deviceIndex	index of device that is being used, default is 0
xfblasStatus_t	0 if the operation completed successfully
xfblasStatus_t	1 if the library was not initialized
xfblasStatus_t	3 if there is no FPGA device memory allocated for the vector

xfblasFree¶

#include "xf_blas/wrapper.hpp"

xfblasStatus_t xfblasFree (
    void* A,
    unsigned int kernelIndex = 0,
    unsigned int deviceIndex = 0
    )

This function frees memory in FPGA device.

Parameters:

A	pointer to matrix A in the host memory
kernelIndex	index of kernel that is being used, default is 0
deviceIndex	index of device that is being used, default is 0
xfblasStatus_t	0 if the operation completed successfully
xfblasStatus_t	1 if the library was not initialized
xfblasStatus_t	3 if there is no FPGA device memory allocated for the matrix

xfblasFreeInstr¶

#include "xf_blas/wrapper.hpp"

xfblasStatus_t xfblasFreeInstr (
    unsigned int kernelIndex = 0,
    unsigned int deviceIndex = 0
    )

This function frees the instruction.

Parameters:

kernelIndex	index of kernel that is being used, default is 0
deviceIndex	index of device that is being used, default is 0
xfblasStatus_t	0 if the operation completed successfully
xfblasStatus_t	1 if the library was not initialized

xfblasDestroy¶

#include "xf_blas/wrapper.hpp"

xfblasStatus_t xfblasDestroy (
    unsigned int kernelNumber = 1,
    unsigned int deviceIndex = 0
    )

This function releases handle used by the XFBLAS library.

Parameters:

kernelNumber	number of kernels that is being used, default is 1
deviceIndex	index of device that is being used, default is 0
xfblasStatus_t	0 if the shut down succeeded
xfblasStatus_t	1 if the library was not initialized

xfblasGemm¶

xfblasGemm overload (1)¶

#include "xf_blas/wrapper.hpp"

xfblasStatus_t xfblasGemm (
    xfblasOperation_t transa,
    xfblasOperation_t transb,
    int m,
    int n,
    int k,
    int alpha,
    void* A,
    int lda,
    void* B,
    int ldb,
    int beta,
    void* C,
    int ldc,
    unsigned int kernelIndex = 0,
    unsigned int deviceIndex = 0
    )

This function performs the matrix-matrix multiplication C = alpha*op(A)*op(B) + beta*C.

Parameters:

transa	operation op(A) that is non- or (conj.) transpose
transb	operation op(B) that is non- or (conj.) transpose
m	number of rows in matrix A, matrix C
n	number of cols in matrix B, matrix C
k	number of cols in matrix A, number of rows in matrix B
alpha	scalar used for multiplication
A	pointer to matrix A in the host memory
lda	leading dimension of matrix A
B	pointer to matrix B in the host memory
ldb	leading dimension of matrix B
beta	scalar used for multiplication
C	pointer to matrix C in the host memory
ldc	leading dimension of matrix C
kernelIndex	index of kernel that is being used, default is 0
deviceIndex	index of device that is being used, default is 0
xfblasStatus_t	0 if the operation completed successfully
xfblasStatus_t	1 if the library was not initialized
xfblasStatus_t	3 if not all the matrices have FPGA device memory allocated
xfblasStatus_t	4 if the engine is not supported for now

xfblasGemv¶

#include "xf_blas/wrapper.hpp"

xfblasStatus_t xfblasGemv (
    xfblasOperation_t trans,
    int m,
    int n,
    int alpha,
    void* A,
    int lda,
    void* x,
    int incx,
    int beta,
    void* y,
    int incy,
    unsigned int kernelIndex = 0,
    unsigned int deviceIndex = 0
    )

This function performs the matrix-vector multiplication y = alpha*op(A)*x + beta*y.

Parameters:

trans	operation op(A) that is non- or (conj.) transpose
m	number of rows in matrix A
n	number of cols in matrix A
alpha	scalar used for multiplication
A	pointer to matrix A in the host memory
lda	leading dimension of matrix A
x	pointer to vector x in the host memory
incx	stride between consecutive elements of x
beta	scalar used for multiplication
y	pointer to vector y in the host memory
incy	stride between consecutive elements of y
kernelIndex	index of kernel that is being used, default is 0
deviceIndex	index of device that is being used, default is 0
xfblasStatus_t	0 if the operation completed successfully
xfblasStatus_t	1 if the library was not initialized
xfblasStatus_t	3 if not all the matrices have FPGA device memory allocated
xfblasStatus_t	4 if the engine is not supported for now

xfblasGetByPointer¶

#include "xf_blas/wrapper.hpp"

xfblasStatus_t xfblasGetByPointer (
    void* A,
    unsigned int kernelIndex = 0,
    unsigned int deviceIndex = 0
    )

This function copies a matrix in FPGA device memory to host memory by pointer.

Parameters:

A	pointer to matrix A in the host memory
kernelIndex	index of kernel that is being used, default is 0
deviceIndex	index of device that is being used, default is 0
xfblasStatus_t	0 if the operation completed successfully
xfblasStatus_t	1 if the library was not initialized
xfblasStatus_t	3 if there is no FPGA device memory allocated for the matrix

xfblasGetByAddress¶

#include "xf_blas/wrapper.hpp"

xfblasStatus_t xfblasGetByAddress (
    void* A,
    unsigned long long p_bufSize,
    unsigned int offset,
    unsigned int kernelIndex = 0,
    unsigned int deviceIndex = 0
    )

This function copies a matrix in FPGA device memory to host memory by its address in device memory.

Parameters:

A	pointer to matrix A in the host memory
p_bufSize	size of matrix A
offset	A’s address in device memory
kernelIndex	index of kernel that is being used, default is 0
deviceIndex	index of device that is being used, default is 0
xfblasStatus_t	0 if the operation completed successfully
xfblasStatus_t	1 if the library was not initialized
xfblasStatus_t	3 if there is no FPGA device memory allocated for the matrix

xfblasExecute¶

#include "xf_blas/wrapper.hpp"

xfblasStatus_t xfblasExecute (
    unsigned int kernelIndex = 0,
    unsigned int deviceIndex = 0
    )

This function starts the kernel and waits until it finishes.

Parameters:

kernelIndex	index of kernel that is being used, default is 0
deviceIndex	index of device that is being used, default is 0
xfblasStatus_t	0 if the operation completed successfully
xfblasStatus_t	1 if the library was not initialized
xfblasStatus_t	3 if there is no FPGA device memory allocated for instruction

xfblasExecuteAsync¶

#include "xf_blas/wrapper.hpp"

void xfblasExecuteAsync (
    unsigned int numKernels = 1,
    unsigned int deviceIndex = 0
    )

This asynchronous function starts all kernels and waits until they finish.

Parameters:

numKernels	number of kernels that is being used, default is 1
deviceIndex	index of device that is being used, default is 0