namespace blas

// enums

enum OpType

// classes

class BLASArgs
class BLASHost
class BLASHostHandle
class ConfigDict
class GEMMHost
class GEMVHost

template <
    typename t_DataType,
    unsigned int t_KBufferDim,
    unsigned int t_ParEntriesM,
    unsigned int t_ParEntriesN = t_ParEntriesM,
    typename t_MacDataType = t_DataType
    >
class Gemm

template <
    unsigned int t_KBufferDim,
    unsigned int t_ParEntriesM,
    unsigned int t_ParEntriesN
    >
class Gemm <float, t_KBufferDim, t_ParEntriesM, t_ParEntriesN, float>

class GemmArgs

template <
    typename t_FloatType,
    typename t_XDataType,
    unsigned int t_DdrWidth,
    unsigned int t_XDdrWidth,
    unsigned int t_aColMemWords = 1,
    unsigned int t_aRowMemWords = 1,
    unsigned int t_bColMemWords = 1
    >
class GemmKernel

template <
    typename t_FloatType,
    unsigned int t_DdrWidth,
    unsigned int t_colMemWords,
    unsigned int t_rowMemWords,
    unsigned int t_kVectorBlocks,
    unsigned int t_mVectorBlocks
    >
class Gemv

class GemvArgs

template <
    typename t_FloatType,
    unsigned int t_DdrWidth,
    unsigned int t_colMemWords,
    unsigned int t_rowMemWords
    >
class Transp

class XFpga
class XFpgaHold
class XHost

amax

#include "xf_blas/amax.hpp"
template <
    typename t_DataType,
    unsigned int t_LogParEntries,
    typename t_IndexType
    >
void amax (unsigned int p_n)

amax function that returns the position of the vector element that has the maximum magnitude.

Parameters:

t_DataType

the data type of the vector entries

t_LogParEntries

log2 of the number of parallelly processed entries in the input vector

t_IndexType

the datatype of the index

p_n

the number of entries in the input vector p_x, p_n % l_ParEntries == 0

p_x

the input stream of packed vector entries

p_result

the resulting index, which is 0 if p_n <= 0

amin

#include "xf_blas/amin.hpp"
template <
    typename t_DataType,
    unsigned int t_LogParEntries,
    typename t_IndexType
    >
void amin (unsigned int p_n)

amin function that returns the position of the vector element that has the minimum magnitude.

Parameters:

t_DataType

the data type of the vector entries

t_LogParEntries

log2 of the number of parallelly processed entries in the input vector

t_IndexType

the datatype of the index

p_n

the number of entries in the input vector p_x, p_n % l_ParEntries == 0

p_x

the input stream of packed vector entries

p_result

the resulting index, which is 0 if p_n <= 0

asum

#include "xf_blas/asum.hpp"
template <
    typename t_DataType,
    unsigned int t_LogParEntries,
    typename t_IndexType = unsigned int
    >
void asum (unsigned int p_n)

asum function that returns the sum of the magnitude of vector elements.

Parameters:

t_DataType

the data type of the vector entries

t_LogParEntries

log2 of the number of parallelly processed entries in the input vector

t_IndexType

the datatype of the index

p_n

the number of entries in the input vector p_x, p_n % l_ParEntries == 0

p_x

the input stream of packed vector entries

p_sum

the sum, which is 0 if p_n <= 0

axpy

#include "xf_blas/axpy.hpp"
template <
    typename t_DataType,
    unsigned int t_ParEntries,
    typename t_IndexType = unsigned int
    >
void axpy (
    unsigned int p_n,
    const t_DataType p_alpha,
    hls::stream <typename WideType <t_DataType, t_ParEntries>::t_TypeInt>& p_x,
    hls::stream <typename WideType <t_DataType, t_ParEntries>::t_TypeInt>& p_y,
    hls::stream <typename WideType <t_DataType, t_ParEntries>::t_TypeInt>& p_r
    )

axpy function that computes Y = alpha*X + Y.

Parameters:

t_DataType

the data type of the vector entries

t_ParEntries

number of parallelly processed entries in the input vector

t_IndexType

the datatype of the index

p_n

the number of entries in the input vector p_x, p_n % t_ParEntries == 0

p_x

the input stream of packed entries of vector X

p_y

the input stream of packed entries of vector Y

p_r

the output stream of packed entries of result vector Y

copy

#include "xf_blas/copy.hpp"
template <
    typename t_DataType,
    unsigned int t_ParEntries,
    typename t_IndexType = unsigned int
    >
void copy (
    unsigned int p_n,
    hls::stream <typename WideType <t_DataType, t_ParEntries>::t_TypeInt>& p_x,
    hls::stream <typename WideType <t_DataType, t_ParEntries>::t_TypeInt>& p_y
    )

copy function that computes Y = X

Parameters:

t_DataType

the data type of the vector entries

t_ParEntries

number of parallelly processed entries in the packed input vector stream

t_IndexType

the datatype of the index

p_n

the number of entries in vector X and Y

p_x

the packed input vector stream

p_y

the packed output vector stream

dot

#include "xf_blas/dot.hpp"
template <
    typename t_DataType,
    unsigned int t_LogParEntries,
    typename t_IndexType = unsigned int
    >
void dot (unsigned int p_n)

dot function that returns the dot product of vector x and y.

Parameters:

t_DataType

the data type of the vector entries

t_LogParEntries

log2 of the number of parallelly processed entries in the input vector

t_IndexType

the datatype of the index

p_n

the number of entries in the input vector p_x, p_n % l_ParEntries == 0

p_x

the input stream of packed vector entries

p_res

the dot product of x and y

gbmv

#include "xf_blas/gbmv.hpp"
template <
    typename t_DataType,
    unsigned int t_ParEntries,
    unsigned int t_MaxRows,
    typename t_IndexType = unsigned int,
    typename t_MacType = t_DataType
    >
void gbmv (
    const unsigned int p_m,
    const unsigned int p_n,
    const unsigned int p_kl,
    const unsigned int p_ku,
    const t_DataType p_alpha,
    hls::stream <typename WideType <t_DataType, t_ParEntries>::t_TypeInt>& p_M,
    hls::stream <typename WideType <t_DataType, t_ParEntries>::t_TypeInt>& p_x,
    const t_DataType p_beta,
    hls::stream <typename WideType <t_DataType, t_ParEntries>::t_TypeInt>& p_y,
    hls::stream <typename WideType <t_DataType, t_ParEntries>::t_TypeInt>& p_yr
    )

gbmv function that performs general banded matrix-vector multiplication y = alpha * M * x + beta * y

Parameters:

t_DataType

the data type of the vector entries

t_ParEntries

the number of parallelly processed entries in the input vector

t_MaxRows

the maximum size of buffers for output vector

t_IndexType

the datatype of the index

t_MacType

the datatype of the output stream

p_m

the number of rows of input matrix p_M

p_alpha

scalar alpha

p_M

the input stream of packed Matrix entries

p_x

the input stream of packed vector entries

p_beta

scalar beta

p_y

the output vector

gemv

#include "xf_blas/gemv.hpp"
template <
    typename t_DataType,
    unsigned int t_LogParEntries,
    typename t_IndexType = unsigned int
    >
void gemv (
    const unsigned int p_m,
    const unsigned int p_n,
    const t_DataType p_alpha
    )

gemv function that returns the result vector of the multiplication of a matrix and a vector y = alpha * M * x + beta * y

Parameters:

t_DataType

the data type of the vector entries

t_LogParEntries

log2 of the number of parallelly processed entries in the input vector

t_IndexType

the datatype of the index

p_m

the number of rows of input matrix p_M

p_n

the number of cols of input matrix p_M, as well as the number of entries in the input vector p_x, p_n % l_ParEntries == 0

p_alpha

scalar alpha

p_M

the input stream of packed Matrix entries

p_x

the input stream of packed vector entries

p_beta

scalar beta

p_y

the output vector

nrm2

#include "xf_blas/nrm2.hpp"
template <
    typename t_DataType,
    unsigned int t_LogParEntries,
    typename t_IndexType = unsigned int
    >
void nrm2 (unsigned int p_n)

nrm2 function that returns the Euclidean norm of the vector x.

Parameters:

t_DataType

the data type of the vector entries

t_LogParEntries

log2 of the number of parallelly processed entries in the input vector

t_IndexType

the datatype of the index

p_n

the number of entries in the input vector p_x, p_n % (1 << t_LogParEntries) == 0

p_x

the input stream of packed vector entries

p_res

the nrm2 of x

scal

#include "xf_blas/scal.hpp"
template <
    typename t_DataType,
    unsigned int t_ParEntries,
    typename t_IndexType = unsigned int
    >
void scal (
    unsigned int p_n,
    t_DataType p_alpha,
    hls::stream <typename WideType <t_DataType, t_ParEntries>::t_TypeInt>& p_x,
    hls::stream <typename WideType <t_DataType, t_ParEntries>::t_TypeInt>& p_res
    )

scal function that computes X = alpha * X

Parameters:

t_DataType

the data type of the vector entries

t_ParEntries

number of parallelly processed entries in the packed input vector stream

t_IndexType

the datatype of the index

p_n

the number of entries in vector X, p_n % t_ParEntries == 0

p_x

the packed input vector stream

p_res

the packed output vector stream

swap

#include "xf_blas/swap.hpp"
template <
    typename t_DataType,
    unsigned int t_ParEntries,
    typename t_IndexType = unsigned int
    >
void swap (
    unsigned int p_n,
    hls::stream <typename WideType <t_DataType, t_ParEntries>::t_TypeInt>& p_x,
    hls::stream <typename WideType <t_DataType, t_ParEntries>::t_TypeInt>& p_y,
    hls::stream <typename WideType <t_DataType, t_ParEntries>::t_TypeInt>& p_xRes,
    hls::stream <typename WideType <t_DataType, t_ParEntries>::t_TypeInt>& p_yRes
    )

swap function that swaps vectors x and y

Parameters:

t_DataType

the data type of the vector entries

t_ParEntries

number of parallelly processed entries in the packed input vector stream

t_IndexType

the datatype of the index

p_n

the number of entries in vector X and Y, p_n % t_ParEntries == 0

p_x

the packed input vector stream

p_y

the packed input vector stream

p_xRes

the packed output stream

p_yRes

the packed output stream

symv

#include "xf_blas/symv.hpp"
template <
    typename t_DataType,
    unsigned int t_LogParEntries,
    typename t_IndexType = unsigned int
    >
void symv (
    const unsigned int p_n,
    const t_DataType p_alpha
    )

symv function that returns the result vector of the multiplication of a symmetric matrix and a vector y = alpha * M * x + beta * y

Parameters:

t_DataType

the data type of the vector entries

t_LogParEntries

log2 of the number of parallelly processed entries in the input vector

t_IndexType

the datatype of the index

p_n

the dimension of input matrix p_M, as well as the number of entries in the input vector p_x, p_n % l_ParEntries == 0

p_alpha

scalar alpha

p_M

the input stream of packed Matrix entries

p_x

the input stream of packed vector entries

p_beta

scalar beta

p_y

the output vector

trmv

#include "xf_blas/trmv.hpp"
template <
    typename t_DataType,
    unsigned int t_LogParEntries,
    typename t_IndexType = unsigned int
    >
void trmv (
    const bool uplo,
    const unsigned int p_n,
    const t_DataType p_alpha
    )

trmv function that returns the result vector of the multiplication of a triangular matrix and a vector y = alpha * M * x + beta * y

Parameters:

t_DataType

the data type of the vector entries

t_LogParEntries

log2 of the number of parallelly processed entries in the input vector

t_IndexType

the datatype of the index

p_n

the number of cols of input matrix p_M, as well as the number of entries in the input vector p_x, p_n % l_ParEntries == 0

p_alpha

scalar alpha

p_M

the input stream of packed Matrix entries

p_x

the input stream of packed vector entries

p_beta

scalar beta

p_y

the output vector

xfblasCreate

#include "xf_blas/wrapper.hpp"
xfblasStatus_t xfblasCreate (
    const char* xclbin,
    string configFile,
    xfblasEngine_t engineName,
    unsigned int kernelNumber = 1,
    unsigned int deviceIndex = 0
    )

This function initializes the XFBLAS library and creates a handle for the specific engine. It must be called prior to any other XFBLAS library calls.

Parameters:

xclbin

file path to FPGA bitstream

configFile

file path to config_info.dat file

engineName

XFBLAS engine to run

kernelNumber

number of kernels that is being used, default is 1

deviceIndex

index of device that is being used, default is 0

xfblasStatus_t

0 if the initialization succeeded

xfblasStatus_t

1 if the opencl runtime initialization failed

xfblasStatus_t

2 if the xclbin doesn’t contain the engine

xfblasStatus_t

4 if the engine is not supported for now

xfblasMalloc

xfblasMalloc overload (1)

#include "xf_blas/wrapper.hpp"
xfblasStatus_t xfblasMalloc (
    short** devPtr,
    int rows,
    int lda,
    int elemSize,
    unsigned int kernelIndex = 0,
    unsigned int deviceIndex = 0
    )

This function allocates memory on the FPGA device.

Parameters:

devPtr

pointer to mapped memory

rows

number of rows in the matrix

lda

leading dimension of the matrix that indicates the total number of cols in the matrix

elemSize

number of bytes required to store each element in the matrix

kernelIndex

index of kernel that is being used, default is 0

deviceIndex

index of device that is being used, default is 0

xfblasStatus_t

0 if the allocation completed successfully

xfblasStatus_t

1 if the library was not initialized

xfblasStatus_t

2 if parameters rows, cols, elemSize, lda <= 0 or cols > lda or data types are not matched

xfblasStatus_t

3 if there is memory already allocated to the same matrix

xfblasStatus_t

4 if the engine is not supported for now

xfblasMallocRestricted

#include "xf_blas/wrapper.hpp"
xfblasStatus_t xfblasMallocRestricted (
    int rows,
    int cols,
    int elemSize,
    void* A,
    int lda,
    unsigned int kernelIndex = 0,
    unsigned int deviceIndex = 0
    )

This function allocates memory for host row-major format matrix on the FPGA device.

Parameters:

rows

number of rows in the matrix

cols

number of cols in the matrix that is being used

elemSize

number of bytes required to store each element in the matrix

A

pointer to the matrix array in the host memory

lda

leading dimension of the matrix that indicates the total number of cols in the matrix

kernelIndex

index of kernel that is being used, default is 0

deviceIndex

index of device that is being used, default is 0

xfblasStatus_t

0 if the allocation completed successfully

xfblasStatus_t

1 if the library was not initialized

xfblasStatus_t

2 if parameters rows, cols, elemSize, lda <= 0 or cols > lda or data types are not matched

xfblasStatus_t

3 if there is memory already allocated to the same matrix

xfblasStatus_t

4 if the engine is not supported for now

xfblasStatus_t

5 if rows, cols or lda is not padded correctly

xfblasMallocManaged

xfblasMallocManaged overload (1)

#include "xf_blas/wrapper.hpp"
xfblasStatus_t xfblasMallocManaged (
    short** devPtr,
    int* paddedLda,
    int rows,
    int lda,
    int elemSize,
    unsigned int kernelIndex = 0,
    unsigned int deviceIndex = 0
    )

This function allocates memory on the FPGA device, rewrites the leading dimension size after padding.

Parameters:

devPtr

pointer to mapped memory

paddedLda

leading dimension of the matrix after padding

rows

number of rows in the matrix

lda

leading dimension of the matrix that indicates the total number of cols in the matrix

elemSize

number of bytes required to store each element in the matrix

kernelIndex

index of kernel that is being used, default is 0

deviceIndex

index of device that is being used, default is 0

xfblasStatus_t

0 if the allocation completed successfully

xfblasStatus_t

1 if the library was not initialized

xfblasStatus_t

2 if parameters rows, cols, elemSize, lda <= 0 or cols > lda or data types are not matched

xfblasStatus_t

3 if there is memory already allocated to the same matrix

xfblasStatus_t

4 if the engine is not supported for now

xfblasSetMatrix

xfblasSetMatrix overload (1)

#include "xf_blas/wrapper.hpp"
xfblasStatus_t xfblasSetMatrix (
    int rows,
    int cols,
    int elemSize,
    short* A,
    int lda,
    short* d_A,
    unsigned int kernelIndex = 0,
    unsigned int deviceIndex = 0
    )

This function copies a matrix in host memory to FPGA device memory. xfblasMalloc() needs to be called prior to this function.

Parameters:

rows

number of rows in the matrix

cols

number of cols in the matrix that is being used

elemSize

number of bytes required to store each element in the matrix

A

pointer to the matrix array in the host memory

lda

leading dimension of the matrix that indicates the total number of cols in the matrix

d_A

pointer to mapped memory

kernelIndex

index of kernel that is being used, default is 0

deviceIndex

index of device that is being used, default is 0

xfblasStatus_t

0 if the operation completed successfully

xfblasStatus_t

1 if the library was not initialized

xfblasStatus_t

2 if parameters rows, cols, elemSize, lda <= 0 or cols > lda or data types are not matched

xfblasStatus_t

3 if there is no FPGA device memory allocated for the matrix

xfblasStatus_t

4 if the engine is not supported for now

xfblasSetVector

xfblasSetVector overload (1)

#include "xf_blas/wrapper.hpp"
xfblasStatus_t xfblasSetVector (
    int n,
    int elemSize,
    short* x,
    int incx,
    short* d_x,
    unsigned int kernelIndex = 0,
    unsigned int deviceIndex = 0
    )

This function copies a vector in host memory to FPGA device memory. xfblasMalloc() needs to be called prior to this function.

Parameters:

n

number of elements in vector

elemSize

number of bytes required to store each element in the vector

x

pointer to the vector in the host memory

incx

the storage spacing between consecutive elements of vector x

d_x

pointer to mapped memory

kernelIndex

index of kernel that is being used, default is 0

deviceIndex

index of device that is being used, default is 0

xfblasStatus_t

0 if the operation completed successfully

xfblasStatus_t

1 if the library was not initialized

xfblasStatus_t

2 if parameters rows, cols, elemSize, lda <= 0 or cols > lda or data types are not matched

xfblasStatus_t

3 if there is no FPGA device memory allocated for the vector

xfblasStatus_t

4 if the engine is not supported for now

xfblasSetMatrixRestricted

#include "xf_blas/wrapper.hpp"
xfblasStatus_t xfblasSetMatrixRestricted (
    void* A,
    unsigned int kernelIndex = 0,
    unsigned int deviceIndex = 0
    )

This function copies a matrix in host memory to FPGA device memory. xfblasMallocRestricted() needs to be called prior to this function.

Parameters:

A

pointer to the matrix array in the host memory

kernelIndex

index of kernel that is being used, default is 0

deviceIndex

index of device that is being used, default is 0

xfblasStatus_t

0 if the operation completed successfully

xfblasStatus_t

1 if the library was not initialized

xfblasStatus_t

3 if there is no FPGA device memory allocated for the matrix

xfblasSetVectorRestricted

#include "xf_blas/wrapper.hpp"
xfblasStatus_t xfblasSetVectorRestricted (
    void* x,
    unsigned int kernelIndex = 0,
    unsigned int deviceIndex = 0
    )

This function copies a vector in host memory to FPGA device memory. xfblasMallocRestricted() needs to be called prior to this function.

Parameters:

x

pointer to the vector in the host memory

kernelIndex

index of kernel that is being used, default is 0

deviceIndex

index of device that is being used, default is 0

xfblasStatus_t

0 if the operation completed successfully

xfblasStatus_t

1 if the library was not initialized

xfblasStatus_t

3 if there is no FPGA device memory allocated for the vector

xfblasDeviceSynchronize

#include "xf_blas/wrapper.hpp"
xfblasStatus_t xfblasDeviceSynchronize (
    unsigned int kernelIndex = 0,
    unsigned int deviceIndex = 0
    )

This function will synchronize all the device memory to host memory.

Parameters:

kernelIndex

index of kernel that is being used, default is 0

deviceIndex

index of device that is being used, default is 0

xfblasStatus_t

0 if the operation completed successfully

xfblasStatus_t

1 if the library was not initialized

xfblasStatus_t

3 if there is no FPGA device memory allocated for some of the matrices in the host memory

xfblasGetMatrix

xfblasGetMatrix overload (1)

#include "xf_blas/wrapper.hpp"
xfblasStatus_t xfblasGetMatrix (
    int rows,
    int cols,
    int elemSize,
    short* d_A,
    short* A,
    int lda,
    unsigned int kernelIndex = 0,
    unsigned int deviceIndex = 0
    )

This function copies a matrix in FPGA device memory to host memory.

Parameters:

rows

number of rows in the matrix

cols

number of cols in the matrix that is being used

elemSize

number of bytes required to store each element in the matrix

d_A

pointer to mapped memory

A

pointer to the matrix array in the host memory

lda

leading dimension of the matrix that indicates the total number of cols in the matrix

kernelIndex

index of kernel that is being used, default is 0

deviceIndex

index of device that is being used, default is 0

xfblasStatus_t

0 if the operation completed successfully

xfblasStatus_t

1 if the library was not initialized

xfblasStatus_t

3 if there is no FPGA device memory allocated for the matrix

xfblasGetVector

xfblasGetVector overload (1)

#include "xf_blas/wrapper.hpp"
xfblasStatus_t xfblasGetVector (
    int n,
    int elemSize,
    short* d_x,
    short* x,
    int incx,
    unsigned int kernelIndex = 0,
    unsigned int deviceIndex = 0
    )

This function copies a vector in FPGA device memory to host memory.

Parameters:

n

number of elements in vector

elemSize

number of bytes required to store each element in the vector

d_x

pointer to mapped memory

x

pointer to the vector in the host memory

incx

the storage spacing between consecutive elements of vector x

kernelIndex

index of kernel that is being used, default is 0

deviceIndex

index of device that is being used, default is 0

xfblasStatus_t

0 if the operation completed successfully

xfblasStatus_t

1 if the library was not initialized

xfblasStatus_t

3 if there is no FPGA device memory allocated for the vector

xfblasGetMatrixRestricted

#include "xf_blas/wrapper.hpp"
xfblasStatus_t xfblasGetMatrixRestricted (
    void* A,
    unsigned int kernelIndex = 0,
    unsigned int deviceIndex = 0
    )

This function copies a matrix in FPGA device memory to host memory.

Parameters:

A

pointer to matrix A in the host memory

kernelIndex

index of kernel that is being used, default is 0

deviceIndex

index of device that is being used, default is 0

xfblasStatus_t

0 if the operation completed successfully

xfblasStatus_t

1 if the library was not initialized

xfblasStatus_t

3 if there is no FPGA device memory allocated for the matrix

xfblasGetVectorRestricted

#include "xf_blas/wrapper.hpp"
xfblasStatus_t xfblasGetVectorRestricted (
    void* x,
    unsigned int kernelIndex = 0,
    unsigned int deviceIndex = 0
    )

This function copies a vector in FPGA device memory to host memory.

Parameters:

x

pointer to vector x in the host memory

kernelIndex

index of kernel that is being used, default is 0

deviceIndex

index of device that is being used, default is 0

xfblasStatus_t

0 if the operation completed successfully

xfblasStatus_t

1 if the library was not initialized

xfblasStatus_t

3 if there is no FPGA device memory allocated for the vector

xfblasFree

#include "xf_blas/wrapper.hpp"
xfblasStatus_t xfblasFree (
    void* A,
    unsigned int kernelIndex = 0,
    unsigned int deviceIndex = 0
    )

This function frees memory in FPGA device.

Parameters:

A

pointer to matrix A in the host memory

kernelIndex

index of kernel that is being used, default is 0

deviceIndex

index of device that is being used, default is 0

xfblasStatus_t

0 if the operation completed successfully

xfblasStatus_t

1 if the library was not initialized

xfblasStatus_t

3 if there is no FPGA device memory allocated for the matrix

xfblasFreeInstr

#include "xf_blas/wrapper.hpp"
xfblasStatus_t xfblasFreeInstr (
    unsigned int kernelIndex = 0,
    unsigned int deviceIndex = 0
    )

This function frees the instruction memory.

Parameters:

kernelIndex

index of kernel that is being used, default is 0

deviceIndex

index of device that is being used, default is 0

xfblasStatus_t

0 if the operation completed successfully

xfblasStatus_t

1 if the library was not initialized

xfblasDestroy

#include "xf_blas/wrapper.hpp"
xfblasStatus_t xfblasDestroy (
    unsigned int kernelNumber = 1,
    unsigned int deviceIndex = 0
    )

This function releases handle used by the XFBLAS library.

Parameters:

kernelNumber

number of kernels that is being used, default is 1

deviceIndex

index of device that is being used, default is 0

xfblasStatus_t

0 if the shut down succeeded

xfblasStatus_t

1 if the library was not initialized

xfblasGemm

xfblasGemm overload (1)

#include "xf_blas/wrapper.hpp"
xfblasStatus_t xfblasGemm (
    xfblasOperation_t transa,
    xfblasOperation_t transb,
    int m,
    int n,
    int k,
    int alpha,
    void* A,
    int lda,
    void* B,
    int ldb,
    int beta,
    void* C,
    int ldc,
    unsigned int kernelIndex = 0,
    unsigned int deviceIndex = 0
    )

This function performs the matrix-matrix multiplication C = alpha*op(A)*op(B) + beta*C.

Parameters:

transa

operation op(A) that is non- or (conj.) transpose

transb

operation op(B) that is non- or (conj.) transpose

m

number of rows in matrix A, matrix C

n

number of cols in matrix B, matrix C

k

number of cols in matrix A, number of rows in matrix B

alpha

scalar used for multiplication

A

pointer to matrix A in the host memory

lda

leading dimension of matrix A

B

pointer to matrix B in the host memory

ldb

leading dimension of matrix B

beta

scalar used for multiplication

C

pointer to matrix C in the host memory

ldc

leading dimension of matrix C

kernelIndex

index of kernel that is being used, default is 0

deviceIndex

index of device that is being used, default is 0

xfblasStatus_t

0 if the operation completed successfully

xfblasStatus_t

1 if the library was not initialized

xfblasStatus_t

3 if not all the matrices have FPGA device memory allocated

xfblasStatus_t

4 if the engine is not supported for now

xfblasGemv

#include "xf_blas/wrapper.hpp"
xfblasStatus_t xfblasGemv (
    xfblasOperation_t trans,
    int m,
    int n,
    int alpha,
    void* A,
    int lda,
    void* x,
    int incx,
    int beta,
    void* y,
    int incy,
    unsigned int kernelIndex = 0,
    unsigned int deviceIndex = 0
    )

This function performs the matrix-vector multiplication y = alpha*op(A)*x + beta*y.

Parameters:

trans

operation op(A) that is non- or (conj.) transpose

m

number of rows in matrix A

n

number of cols in matrix A

alpha

scalar used for multiplication

A

pointer to matrix A in the host memory

lda

leading dimension of matrix A

x

pointer to vector x in the host memory

incx

stride between consecutive elements of x

beta

scalar used for multiplication

y

pointer to vector y in the host memory

incy

stride between consecutive elements of y

kernelIndex

index of kernel that is being used, default is 0

deviceIndex

index of device that is being used, default is 0

xfblasStatus_t

0 if the operation completed successfully

xfblasStatus_t

1 if the library was not initialized

xfblasStatus_t

3 if not all the matrices have FPGA device memory allocated

xfblasStatus_t

4 if the engine is not supported for now

xfblasGetByPointer

#include "xf_blas/wrapper.hpp"
xfblasStatus_t xfblasGetByPointer (
    void* A,
    unsigned int kernelIndex = 0,
    unsigned int deviceIndex = 0
    )

This function copies a matrix in FPGA device memory to host memory by pointer.

Parameters:

A

pointer to matrix A in the host memory

kernelIndex

index of kernel that is being used, default is 0

deviceIndex

index of device that is being used, default is 0

xfblasStatus_t

0 if the operation completed successfully

xfblasStatus_t

1 if the library was not initialized

xfblasStatus_t

3 if there is no FPGA device memory allocated for the matrix

xfblasGetByAddress

#include "xf_blas/wrapper.hpp"
xfblasStatus_t xfblasGetByAddress (
    void* A,
    unsigned long long p_bufSize,
    unsigned int offset,
    unsigned int kernelIndex = 0,
    unsigned int deviceIndex = 0
    )

This function copies a matrix in FPGA device memory to host memory by its address in device memory.

Parameters:

A

pointer to matrix A in the host memory

p_bufSize

size of matrix A

offset

A’s address in device memory

kernelIndex

index of kernel that is being used, default is 0

deviceIndex

index of device that is being used, default is 0

xfblasStatus_t

0 if the operation completed successfully

xfblasStatus_t

1 if the library was not initialized

xfblasStatus_t

3 if there is no FPGA device memory allocated for the matrix

xfblasExecute

#include "xf_blas/wrapper.hpp"
xfblasStatus_t xfblasExecute (
    unsigned int kernelIndex = 0,
    unsigned int deviceIndex = 0
    )

This function starts the kernel and waits until it finishes.

Parameters:

kernelIndex

index of kernel that is being used, default is 0

deviceIndex

index of device that is being used, default is 0

xfblasStatus_t

0 if the operation completed successfully

xfblasStatus_t

1 if the library was not initialized

xfblasStatus_t

3 if there is no FPGA device memory allocated for the instruction

xfblasExecuteAsync

#include "xf_blas/wrapper.hpp"
void xfblasExecuteAsync (
    unsigned int numKernels = 1,
    unsigned int deviceIndex = 0
    )

This asynchronous function starts all kernels and waits until they finish.

Parameters:

numKernels

number of kernels that is being used, default is 1

deviceIndex

index of device that is being used, default is 0