namespace blas

// enums

enum OpType

// classes

class BLASArgs
class BLASHost
class BLASHostHandle
class ConfigDict
class GEMMHost
class GEMVHost

template <
    unsigned int t_KBufferDim,
    unsigned int t_ParEntriesM,
    unsigned int t_ParEntriesN
    >
class Gemm <float, t_KBufferDim, t_ParEntriesM, t_ParEntriesN, float>

template <
    typename t_DataType,
    unsigned int t_KBufferDim,
    unsigned int t_ParEntriesM,
    unsigned int t_ParEntriesN = t_ParEntriesM,
    typename t_MacDataType = t_DataType
    >
class Gemm

class GemmArgs
class GemvArgs
class XFpga
class XFpgaHold
class XHost

amax

#include "xf_blas/amax.hpp"
template <
    typename t_DataType,
    unsigned int t_LogParEntries,
    typename t_IndexType
    >
void amax (unsigned int p_n)

amax function that returns the position of the vector element that has the maximum magnitude.

Parameters:

t_DataType the data type of the vector entries
t_LogParEntries log2 of the number of parallelly processed entries in the input vector
t_IndexType the datatype of the index
p_n the number of entries in the input vector p_x, p_n % l_ParEntries == 0
p_x the input stream of packed vector entries
p_result the resulting index, which is 0 if p_n <= 0

amin

#include "xf_blas/amin.hpp"
template <
    typename t_DataType,
    unsigned int t_LogParEntries,
    typename t_IndexType
    >
void amin (unsigned int p_n)

amin function that returns the position of the vector element that has the minimum magnitude.

Parameters:

t_DataType the data type of the vector entries
t_LogParEntries log2 of the number of parallelly processed entries in the input vector
t_IndexType the datatype of the index
p_n the number of entries in the input vector p_x, p_n % l_ParEntries == 0
p_x the input stream of packed vector entries
p_result the resulting index, which is 0 if p_n <= 0

asum

#include "xf_blas/asum.hpp"
template <
    typename t_DataType,
    unsigned int t_LogParEntries,
    typename t_IndexType = unsigned int
    >
void asum (unsigned int p_n)

asum function that returns the sum of the magnitude of vector elements.

Parameters:

t_DataType the data type of the vector entries
t_LogParEntries log2 of the number of parallelly processed entries in the input vector
t_IndexType the datatype of the index
p_n the number of entries in the input vector p_x, p_n % l_ParEntries == 0
p_x the input stream of packed vector entries
p_sum the sum, which is 0 if p_n <= 0

axpy

#include "xf_blas/axpy.hpp"
template <
    typename t_DataType,
    unsigned int t_ParEntries,
    typename t_IndexType = unsigned int
    >
void axpy (
    unsigned int p_n,
    const t_DataType p_alpha,
    hls::stream <typename WideType <t_DataType, t_ParEntries>::t_TypeInt>& p_x,
    hls::stream <typename WideType <t_DataType, t_ParEntries>::t_TypeInt>& p_y,
    hls::stream <typename WideType <t_DataType, t_ParEntries>::t_TypeInt>& p_r
    )

axpy function that computes Y = alpha*X + Y.

Parameters:

t_DataType the data type of the vector entries
t_ParEntries the number of parallelly processed entries in the input vector
t_IndexType the datatype of the index
p_n the number of entries in the input vector p_x, p_n % t_ParEntries == 0
p_x the input stream of packed entries of vector X
p_y the input stream of packed entries of vector Y
p_r the output stream of packed entries of result vector Y

copy

#include "xf_blas/copy.hpp"
template <
    typename t_DataType,
    unsigned int t_ParEntries,
    typename t_IndexType = unsigned int
    >
void copy (
    unsigned int p_n,
    hls::stream <typename WideType <t_DataType, t_ParEntries>::t_TypeInt>& p_x,
    hls::stream <typename WideType <t_DataType, t_ParEntries>::t_TypeInt>& p_y
    )

copy function that computes Y = X

Parameters:

t_DataType the data type of the vector entries
t_ParEntries number of parallelly processed entries in the packed input vector stream
t_IndexType the datatype of the index
p_n the number of entries in vector X and Y
p_x the packed input vector stream
p_y the packed output vector stream

dot

#include "xf_blas/dot.hpp"
template <
    typename t_DataType,
    unsigned int t_LogParEntries,
    typename t_IndexType = unsigned int
    >
void dot (unsigned int p_n)

dot function that returns the dot product of vector x and y.

Parameters:

t_DataType the data type of the vector entries
t_LogParEntries log2 of the number of parallelly processed entries in the input vector
t_IndexType the datatype of the index
p_n the number of entries in the input vector p_x, p_n % l_ParEntries == 0
p_x the input stream of packed vector entries
p_res the dot product of x and y

gbmv

#include "xf_blas/gbmv.hpp"
template <
    typename t_DataType,
    unsigned int t_ParEntries,
    unsigned int t_MaxRows,
    typename t_IndexType = unsigned int,
    typename t_MacType = t_DataType
    >
void gbmv (
    const unsigned int p_m,
    const unsigned int p_n,
    const unsigned int p_kl,
    const unsigned int p_ku,
    const t_DataType p_alpha,
    hls::stream <typename WideType <t_DataType, t_ParEntries>::t_TypeInt>& p_M,
    hls::stream <typename WideType <t_DataType, t_ParEntries>::t_TypeInt>& p_x,
    const t_DataType p_beta,
    hls::stream <typename WideType <t_DataType, t_ParEntries>::t_TypeInt>& p_y,
    hls::stream <typename WideType <t_DataType, t_ParEntries>::t_TypeInt>& p_yr
    )

gbmv function that performs the general banded matrix-vector multiplication y = alpha * M * x + beta * y

Parameters:

t_DataType the data type of the vector entries
t_ParEntries the number of parallelly processed entries in the input vector
t_MaxRows the maximum size of buffers for output vector
t_IndexType the datatype of the index
t_MacType the datatype of the output stream
p_m the number of rows of input matrix p_M
p_alpha scalar alpha
p_M the input stream of packed Matrix entries
p_x the input stream of packed vector entries
p_beta scalar beta
p_y the output vector

gemm

gemm overload (1)

#include "xf_blas/gemm.hpp"
template <
    typename t_DataType,
    unsigned int t_KBufferDim,
    unsigned int t_ParEntries
    >
void gemm (
    const unsigned int p_m,
    const unsigned int p_n,
    const unsigned int p_k,
    hls::stream <typename WideType <t_DataType, t_ParEntries>::t_TypeInt>& p_A,
    hls::stream <typename WideType <t_DataType, t_ParEntries>::t_TypeInt>& p_B,
    hls::stream <typename WideType <t_DataType, t_ParEntries>::t_TypeInt>& p_C
    )

gemm function that returns the result matrix of the multiplication of two matrices

Parameters:

t_DataType the data type of the vector entries
t_KBufferDim description to be added
t_ParEntries the number of parallelly processed entries in the input vector
p_m the number of rows of input matrix p_M
p_n the number of cols of input matrix p_M, as well as the number of entries in the input vector p_x, p_n % l_ParEntries == 0
p_k description to be added
p_A the input stream of packed Matrix entries
p_B the input stream of packed Matrix entries
p_C the output matrix

gemv

#include "xf_blas/gemv.hpp"
template <
    typename t_DataType,
    unsigned int t_LogParEntries,
    typename t_IndexType = unsigned int
    >
void gemv (
    const unsigned int p_m,
    const unsigned int p_n,
    const t_DataType p_alpha
    )

gemv function that returns the result vector of the multiplication of a matrix and a vector y = alpha * M * x + beta * y

Parameters:

t_DataType the data type of the vector entries
t_LogParEntries log2 of the number of parallelly processed entries in the input vector
t_IndexType the datatype of the index
p_m the number of rows of input matrix p_M
p_n the number of cols of input matrix p_M, as well as the number of entries in the input vector p_x, p_n % l_ParEntries == 0
p_alpha scalar alpha
p_M the input stream of packed Matrix entries
p_x the input stream of packed vector entries
p_beta scalar beta
p_y the output vector

nrm2

#include "xf_blas/nrm2.hpp"
template <
    typename t_DataType,
    unsigned int t_LogParEntries,
    typename t_IndexType = unsigned int
    >
void nrm2 (unsigned int p_n)

nrm2 function that returns the Euclidean norm of the vector x.

Parameters:

t_DataType the data type of the vector entries
t_LogParEntries log2 of the number of parallelly processed entries in the input vector
t_IndexType the datatype of the index
p_n the number of entries in the input vector p_x, p_n % (1<<l_LogParEntries) == 0
p_x the input stream of packed vector entries
p_res the nrm2 of x

scal

#include "xf_blas/scal.hpp"
template <
    typename t_DataType,
    unsigned int t_ParEntries,
    typename t_IndexType = unsigned int
    >
void scal (
    unsigned int p_n,
    t_DataType p_alpha,
    hls::stream <typename WideType <t_DataType, t_ParEntries>::t_TypeInt>& p_x,
    hls::stream <typename WideType <t_DataType, t_ParEntries>::t_TypeInt>& p_res
    )

scal function that computes X = alpha * X

Parameters:

t_DataType the data type of the vector entries
t_ParEntries number of parallelly processed entries in the packed input vector stream
t_IndexType the datatype of the index
p_n the number of entries in vector X, p_n % t_ParEntries == 0
p_x the packed input vector stream
p_res the packed output vector stream

swap

#include "xf_blas/swap.hpp"
template <
    typename t_DataType,
    unsigned int t_ParEntries,
    typename t_IndexType = unsigned int
    >
void swap (
    unsigned int p_n,
    hls::stream <typename WideType <t_DataType, t_ParEntries>::t_TypeInt>& p_x,
    hls::stream <typename WideType <t_DataType, t_ParEntries>::t_TypeInt>& p_y,
    hls::stream <typename WideType <t_DataType, t_ParEntries>::t_TypeInt>& p_xRes,
    hls::stream <typename WideType <t_DataType, t_ParEntries>::t_TypeInt>& p_yRes
    )

swap function that swaps vectors x and y

Parameters:

t_DataType the data type of the vector entries
t_ParEntries number of parallelly processed entries in the packed input vector stream
t_IndexType the datatype of the index
p_n the number of entries in vector X and Y, p_n % t_ParEntries == 0
p_x the packed input vector stream
p_y the packed input vector stream
p_xRes the packed output stream
p_yRes the packed output stream

symv

#include "xf_blas/symv.hpp"
template <
    typename t_DataType,
    unsigned int t_LogParEntries,
    typename t_IndexType = unsigned int
    >
void symv (
    const unsigned int p_n,
    const t_DataType p_alpha
    )

symv function that returns the result vector of the multiplication of a symmetric matrix and a vector y = alpha * M * x + beta * y

Parameters:

t_DataType the data type of the vector entries
t_LogParEntries log2 of the number of parallelly processed entries in the input vector
t_IndexType the datatype of the index
p_n the dimension of input matrix p_M, as well as the number of entries in the input vector p_x, p_n % l_ParEntries == 0
p_alpha scalar alpha
p_M the input stream of packed Matrix entries
p_x the input stream of packed vector entries
p_beta scalar beta
p_y the output vector

trmv

#include "xf_blas/trmv.hpp"
template <
    typename t_DataType,
    unsigned int t_LogParEntries,
    typename t_IndexType = unsigned int
    >
void trmv (
    const bool uplo,
    const unsigned int p_n,
    const t_DataType p_alpha
    )

trmv function that returns the result vector of the multiplication of a triangular matrix and a vector y = alpha * M * x + beta * y

Parameters:

t_DataType the data type of the vector entries
t_LogParEntries log2 of the number of parallelly processed entries in the input vector
t_IndexType the datatype of the index
p_n the number of cols of input matrix p_M, as well as the number of entries in the input vector p_x, p_n % l_ParEntries == 0
p_alpha scalar alpha
p_M the input stream of packed Matrix entries
p_x the input stream of packed vector entries
p_beta scalar beta
p_y the output vector

xfblasCreate

#include "xf_blas/wrapper.hpp"
xfblasStatus_t xfblasCreate (
    const char* xclbin,
    string configFile,
    xfblasEngine_t engineName,
    unsigned int kernelNumber = 1,
    unsigned int deviceIndex = 0
    )

This function initializes the XFBLAS library and creates a handle for the specific engine. It must be called prior to any other XFBLAS library calls.

Parameters:

xclbin file path to FPGA bitstream
configFile file path to config_info.dat file
engineName XFBLAS engine to run
kernelNumber number of kernels that is being used, default is 1
deviceIndex index of device that is being used, default is 0
xfblasStatus_t 0 if the initialization succeeded
xfblasStatus_t 1 if the opencl runtime initialization failed
xfblasStatus_t 2 if the xclbin doesn’t contain the engine
xfblasStatus_t 4 if the engine is not supported for now

xfblasMalloc

xfblasMalloc overload (1)

#include "xf_blas/wrapper.hpp"
xfblasStatus_t xfblasMalloc (
    short** devPtr,
    int rows,
    int lda,
    int elemSize,
    unsigned int kernelIndex = 0,
    unsigned int deviceIndex = 0
    )

This function allocates memory on the FPGA device.

Parameters:

devPtr pointer to mapped memory
rows number of rows in the matrix
lda leading dimension of the matrix that indicates the total number of cols in the matrix
elemSize number of bytes required to store each element in the matrix
kernelIndex index of kernel that is being used, default is 0
deviceIndex index of device that is being used, default is 0
xfblasStatus_t 0 if the allocation completed successfully
xfblasStatus_t 1 if the library was not initialized
xfblasStatus_t 2 if parameters rows, cols, elemSize, lda <= 0 or cols > lda or data types are not matched
xfblasStatus_t 3 if there is memory already allocated to the same matrix
xfblasStatus_t 4 if the engine is not supported for now

xfblasMallocRestricted

#include "xf_blas/wrapper.hpp"
xfblasStatus_t xfblasMallocRestricted (
    int rows,
    int cols,
    int elemSize,
    void* A,
    int lda,
    unsigned int kernelIndex = 0,
    unsigned int deviceIndex = 0
    )

This function allocates memory for host row-major format matrix on the FPGA device.

Parameters:

rows number of rows in the matrix
cols number of cols in the matrix that is being used
elemSize number of bytes required to store each element in the matrix
A pointer to the matrix array in the host memory
lda leading dimension of the matrix that indicates the total number of cols in the matrix
kernelIndex index of kernel that is being used, default is 0
deviceIndex index of device that is being used, default is 0
xfblasStatus_t 0 if the allocation completed successfully
xfblasStatus_t 1 if the library was not initialized
xfblasStatus_t 2 if parameters rows, cols, elemSize, lda <= 0 or cols > lda or data types are not matched
xfblasStatus_t 3 if there is memory already allocated to the same matrix
xfblasStatus_t 4 if the engine is not supported for now
xfblasStatus_t 5 if rows, cols or lda is not padded correctly

xfblasMallocManaged

xfblasMallocManaged overload (1)

#include "xf_blas/wrapper.hpp"
xfblasStatus_t xfblasMallocManaged (
    short** devPtr,
    int* paddedLda,
    int rows,
    int lda,
    int elemSize,
    unsigned int kernelIndex = 0,
    unsigned int deviceIndex = 0
    )

This function allocates memory on the FPGA device, rewrites the leading dimension size after padding.

Parameters:

devPtr pointer to mapped memory
paddedLda leading dimension of the matrix after padding
rows number of rows in the matrix
lda leading dimension of the matrix that indicates the total number of cols in the matrix
elemSize number of bytes required to store each element in the matrix
kernelIndex index of kernel that is being used, default is 0
deviceIndex index of device that is being used, default is 0
xfblasStatus_t 0 if the allocation completed successfully
xfblasStatus_t 1 if the library was not initialized
xfblasStatus_t 2 if parameters rows, cols, elemSize, lda <= 0 or cols > lda or data types are not matched
xfblasStatus_t 3 if there is memory already allocated to the same matrix
xfblasStatus_t 4 if the engine is not supported for now

xfblasSetMatrix

xfblasSetMatrix overload (1)

#include "xf_blas/wrapper.hpp"
xfblasStatus_t xfblasSetMatrix (
    int rows,
    int cols,
    int elemSize,
    short* A,
    int lda,
    short* d_A,
    unsigned int kernelIndex = 0,
    unsigned int deviceIndex = 0
    )

This function copies a matrix in host memory to FPGA device memory. xfblasMalloc() needs to be called prior to this function.

Parameters:

rows number of rows in the matrix
cols number of cols in the matrix that is being used
elemSize number of bytes required to store each element in the matrix
A pointer to the matrix array in the host memory
lda leading dimension of the matrix that indicates the total number of cols in the matrix
d_A pointer to mapped memory
kernelIndex index of kernel that is being used, default is 0
deviceIndex index of device that is being used, default is 0
xfblasStatus_t 0 if the operation completed successfully
xfblasStatus_t 1 if the library was not initialized
xfblasStatus_t 2 if parameters rows, cols, elemSize, lda <= 0 or cols > lda or data types are not matched
xfblasStatus_t 3 if there is no FPGA device memory allocated for the matrix
xfblasStatus_t 4 if the engine is not supported for now

xfblasSetMatrixRestricted

#include "xf_blas/wrapper.hpp"
xfblasStatus_t xfblasSetMatrixRestricted (
    void* A,
    unsigned int kernelIndex = 0,
    unsigned int deviceIndex = 0
    )

This function copies a matrix in host memory to FPGA device memory. xfblasMallocRestricted() needs to be called prior to this function.

Parameters:

A pointer to the matrix array in the host memory
kernelIndex index of kernel that is being used, default is 0
deviceIndex index of device that is being used, default is 0
xfblasStatus_t 0 if the operation completed successfully
xfblasStatus_t 1 if the library was not initialized
xfblasStatus_t 3 if there is no FPGA device memory allocated for the matrix

xfblasSetVectorRestricted

#include "xf_blas/wrapper.hpp"
xfblasStatus_t xfblasSetVectorRestricted (
    void* x,
    unsigned int kernelIndex = 0,
    unsigned int deviceIndex = 0
    )

This function copies a vector in host memory to FPGA device memory. xfblasMallocRestricted() needs to be called prior to this function.

Parameters:

x pointer to the vector in the host memory
kernelIndex index of kernel that is being used, default is 0
deviceIndex index of device that is being used, default is 0
xfblasStatus_t 0 if the operation completed successfully
xfblasStatus_t 1 if the library was not initialized
xfblasStatus_t 3 if there is no FPGA device memory allocated for the vector

xfblasDeviceSynchronize

#include "xf_blas/wrapper.hpp"
xfblasStatus_t xfblasDeviceSynchronize (
    unsigned int kernelIndex = 0,
    unsigned int deviceIndex = 0
    )

This function will synchronize all the device memory to host memory.

Parameters:

kernelIndex index of kernel that is being used, default is 0
deviceIndex index of device that is being used, default is 0
xfblasStatus_t 0 if the operation completed successfully
xfblasStatus_t 1 if the library was not initialized
xfblasStatus_t 3 if there is no FPGA device memory allocated for some of the matrices in the host memory

xfblasGetMatrix

xfblasGetMatrix overload (1)

#include "xf_blas/wrapper.hpp"
xfblasStatus_t xfblasGetMatrix (
    int rows,
    int cols,
    int elemSize,
    short* d_A,
    short* A,
    int lda,
    unsigned int kernelIndex = 0,
    unsigned int deviceIndex = 0
    )

This function copies a matrix in FPGA device memory to host memory.

Parameters:

rows number of rows in the matrix
cols number of cols in the matrix that is being used
elemSize number of bytes required to store each element in the matrix
d_A pointer to mapped memory
A pointer to the matrix array in the host memory
lda leading dimension of the matrix that indicates the total number of cols in the matrix
kernelIndex index of kernel that is being used, default is 0
deviceIndex index of device that is being used, default is 0
xfblasStatus_t 0 if the operation completed successfully
xfblasStatus_t 1 if the library was not initialized
xfblasStatus_t 3 if there is no FPGA device memory allocated for the matrix

xfblasGetMatrixRestricted

#include "xf_blas/wrapper.hpp"
xfblasStatus_t xfblasGetMatrixRestricted (
    void* A,
    unsigned int kernelIndex = 0,
    unsigned int deviceIndex = 0
    )

This function copies a matrix in FPGA device memory to host memory.

Parameters:

A pointer to matrix A in the host memory
kernelIndex index of kernel that is being used, default is 0
deviceIndex index of device that is being used, default is 0
xfblasStatus_t 0 if the operation completed successfully
xfblasStatus_t 1 if the library was not initialized
xfblasStatus_t 3 if there is no FPGA device memory allocated for the matrix

xfblasGetVectorRestricted

#include "xf_blas/wrapper.hpp"
xfblasStatus_t xfblasGetVectorRestricted (
    void* x,
    unsigned int kernelIndex = 0,
    unsigned int deviceIndex = 0
    )

This function copies a vector in FPGA device memory to host memory.

Parameters:

x pointer to vector x in the host memory
kernelIndex index of kernel that is being used, default is 0
deviceIndex index of device that is being used, default is 0
xfblasStatus_t 0 if the operation completed successfully
xfblasStatus_t 1 if the library was not initialized
xfblasStatus_t 3 if there is no FPGA device memory allocated for the vector

xfblasFree

#include "xf_blas/wrapper.hpp"
xfblasStatus_t xfblasFree (
    void* A,
    unsigned int kernelIndex = 0,
    unsigned int deviceIndex = 0
    )

This function frees memory in FPGA device.

Parameters:

A pointer to matrix A in the host memory
kernelIndex index of kernel that is being used, default is 0
deviceIndex index of device that is being used, default is 0
xfblasStatus_t 0 if the operation completed successfully
xfblasStatus_t 1 if the library was not initialized
xfblasStatus_t 3 if there is no FPGA device memory allocated for the matrix

xfblasFreeInstr

#include "xf_blas/wrapper.hpp"
xfblasStatus_t xfblasFreeInstr (
    unsigned int kernelIndex = 0,
    unsigned int deviceIndex = 0
    )

This function frees the instruction memory.

Parameters:

kernelIndex index of kernel that is being used, default is 0
deviceIndex index of device that is being used, default is 0
xfblasStatus_t 0 if the operation completed successfully
xfblasStatus_t 1 if the library was not initialized

xfblasDestroy

#include "xf_blas/wrapper.hpp"
xfblasStatus_t xfblasDestroy (
    unsigned int kernelNumber = 1,
    unsigned int deviceIndex = 0
    )

This function releases handle used by the XFBLAS library.

Parameters:

kernelNumber number of kernels that is being used, default is 1
deviceIndex index of device that is being used, default is 0
xfblasStatus_t 0 if the shut down succeeded
xfblasStatus_t 1 if the library was not initialized

xfblasGemm

xfblasGemm overload (1)

#include "xf_blas/wrapper.hpp"
xfblasStatus_t xfblasGemm (
    xfblasOperation_t transa,
    xfblasOperation_t transb,
    int m,
    int n,
    int k,
    int alpha,
    void* A,
    int lda,
    void* B,
    int ldb,
    int beta,
    void* C,
    int ldc,
    unsigned int kernelIndex = 0,
    unsigned int deviceIndex = 0
    )

This function performs the matrix-matrix multiplication C = alpha*op(A)op(B) + beta*C.

Parameters:

transa operation op(A) that is non- or (conj.) transpose
transb operation op(B) that is non- or (conj.) transpose
m number of rows in matrix A, matrix C
n number of cols in matrix B, matrix C
k number of cols in matrix A, number of rows in matrix B
alpha scalar used for multiplication
A pointer to matrix A in the host memory
lda leading dimension of matrix A
B pointer to matrix B in the host memory
ldb leading dimension of matrix B
beta scalar used for multiplication
C pointer to matrix C in the host memory
ldc leading dimension of matrix C
kernelIndex index of kernel that is being used, default is 0
deviceIndex index of device that is being used, default is 0
xfblasStatus_t 0 if the operation completed successfully
xfblasStatus_t 1 if the library was not initialized
xfblasStatus_t 3 if not all the matrices have FPGA device memory allocated
xfblasStatus_t 4 if the engine is not supported for now

xfblasGetByPointer

#include "xf_blas/wrapper.hpp"
xfblasStatus_t xfblasGetByPointer (
    void* A,
    unsigned int kernelIndex = 0,
    unsigned int deviceIndex = 0
    )

This function copies a matrix in FPGA device memory to host memory by pointer.

Parameters:

A pointer to matrix A in the host memory
kernelIndex index of kernel that is being used, default is 0
deviceIndex index of device that is being used, default is 0
xfblasStatus_t 0 if the operation completed successfully
xfblasStatus_t 1 if the library was not initialized
xfblasStatus_t 3 if there is no FPGA device memory allocated for the matrix

xfblasGetByAddress

#include "xf_blas/wrapper.hpp"
xfblasStatus_t xfblasGetByAddress (
    void* A,
    unsigned long long p_bufSize,
    unsigned int offset,
    unsigned int kernelIndex = 0,
    unsigned int deviceIndex = 0
    )

This function copies a matrix in FPGA device memory to host memory by its address in device memory.

Parameters:

A pointer to matrix A in the host memory
p_bufSize size of matrix A
offset A’s address in device memory
kernelIndex index of kernel that is being used, default is 0
deviceIndex index of device that is being used, default is 0
xfblasStatus_t 0 if the operation completed successfully
xfblasStatus_t 1 if the library was not initialized
xfblasStatus_t 3 if there is no FPGA device memory allocated for the matrix

xfblasExecute

#include "xf_blas/wrapper.hpp"
xfblasStatus_t xfblasExecute (
    unsigned int kernelIndex = 0,
    unsigned int deviceIndex = 0
    )

This function starts the kernel and waits until it finishes.

Parameters:

kernelIndex index of kernel that is being used, default is 0
deviceIndex index of device that is being used, default is 0
xfblasStatus_t 0 if the operation completed successfully
xfblasStatus_t 1 if the library was not initialized
xfblasStatus_t 3 if there is no FPGA device memory allocated for the instruction

xfblasExecuteAsync

#include "xf_blas/wrapper.hpp"
void xfblasExecuteAsync (
    unsigned int numKernels = 1,
    unsigned int deviceIndex = 0
    )

This asynchronous function starts all kernels and waits until they finish.

Parameters:

numKernels number of kernels that is being used, default is 1
deviceIndex index of device that is being used, default is 0