namespace graph¶

// namespaces

namespace xf::graph::internal
    namespace xf::graph::internal::bfs
    namespace xf::graph::internal::calc_degree
    namespace xf::graph::internal::connected_components
    namespace xf::graph::internal::convert_csr_csc
    namespace xf::graph::internal::hash_group_aggregate
    namespace xf::graph::internal::label_propagation
    namespace xf::graph::internal::mssp
    namespace xf::graph::internal::pagerank
    namespace xf::graph::internal::pagerankMultiChannel
    namespace xf::graph::internal::scc
    namespace xf::graph::internal::sssp
        namespace xf::graph::internal::sssp::nopred
        namespace xf::graph::internal::sssp::pred
    namespace xf::graph::internal::triangle_count

bfs¶

#include "bfs.hpp"

template <int MAXOUTDEGREE>
void bfs (
    const int srcID,
    const int numVertex,
    ap_uint <512>* offsetCSR,
    ap_uint <512>* indexCSR,
    ap_uint <512>* queue512,
    ap_uint <32>* queue32,
    ap_uint <512>* color512,
    ap_uint <32>* dtime,
    ap_uint <32>* ftime,
    ap_uint <32>* pred,
    ap_uint <32>* distance
    )

bfs Implement the directed graph traversal by breath-first search algorithm

Parameters:

MAXOUTDEGREE	the maximum outdegree of input graphs. Large value will result in more URAM usage.
srcID	the source vertex ID for this search, starting from 0
numVertex	vertex number of the input graph
indexCSR	column index of CSR format
offsetCSR	row offset of CSR format
color512	intermediate color map which should be shared with dtime, pred or distance in host.
queue32	intermediate queue32 used during the BFS
dtime	the result of discovery time stamp for each vertex
ftime	the result of finish time stamp for each vertex
pred	the result of parent index of each vertex
distance	the distance result from given source vertex for each vertex

calcuDegree¶

#include "calc_degree.hpp"

template <
    int MAXVERTEX,
    int MAXEDGE,
    int LOG2CACHEDEPTH,
    int LOG2DATAPERCACHELINE,
    int RAMTYPE
    >
void calcuDegree (
    int numVertex,
    int numEdge,
    ap_uint <512>* index,
    ap_uint <512>* degree
    )

calculate degree algorithm is implemented

Parameters:

MAXVERTEX	CSC/CSR data vertex(offset) array maxsize
MAXEDGE	CSC/CSR data edge(indice) array maxsize
LOG2CACHEDEPTH	cache depth in Binary, the cache onchip memory is 512 bit x uramRow
LOG2DATAPERCACHELINE	number of data in one 512bit in Binary, for double, it’s 3, for float, it’s 4
RAMTYPE	flag to tell use URAM LUTRAM or BRAM, 0 : LUTRAM, 1 : URAM, 2 : BRAM
numVertex	CSR/CSC data vertex number
numEdge	CSR/CSC data edge number
index	input CSR/CSC data index array
degree	output degree array

calcuWeightedDegree¶

calcuWeightedDegree overload (1)¶

#include "calc_degree.hpp"

template <
    int MAXVERTEX,
    int MAXEDGE,
    int LOG2CACHEDEPTH,
    int LOG2DATAPERCACHELINE,
    int RAMTYPE
    >
void calcuWeightedDegree (
    int numVertex,
    int numEdge,
    ap_uint <512> index [MAXEDGE],
    ap_uint <512> weight [MAXEDGE],
    ap_uint <512> degree [MAXVERTEX]
    )

calculate weighted degree algorithm is implemented

Parameters:

MAXVERTEX	CSC/CSR data vertex(offset) array maxsize
MAXEDGE	CSC/CSR data edge(indice) array maxsize
LOG2CACHEDEPTH	cache depth in Binary, the cache onchip memory is 512 bit x uramRow
LOG2DATAPERCACHELINE	number of data in one 512bit in Binary, for double, it’s 3, for float, it’s 4
RAMTYPE	flag to tell use URAM LUTRAM or BRAM, 0 : LUTRAM, 1 : URAM, 2 : BRAM
numVertex	CSR/CSC data vertex number
numEdge	CSR/CSC data edge number
index	input CSR/CSC data index array
weight	input CSR/CSC data weight array, default float type.
degree	output degree array, default float type.

calcuWeightedDegree overload (2)¶

#include "calc_degree.hpp"

template <
    int MAXVERTEX,
    int MAXEDGE,
    int LOG2CACHEDEPTH,
    int LOG2DATAPERCACHELINE,
    int RAMTYPE,
    int unrollbin,
    int CHANNELNUM
    >
void calcuWeightedDegree (
    int numVertex,
    int numEdge,
    int numEdgePerChannel [CHANNELNUM],
    ap_uint <512> index [MAXEDGE],
    ap_uint <512> weight [MAXEDGE],
    ap_uint <512> degree [MAXVERTEX]
    )

calculate weighted degree algorithm is implemented

Parameters:

MAXVERTEX	CSC/CSR data vertex(offset) array maxsize
MAXEDGE	CSC/CSR data edge(indice) array maxsize
LOG2CACHEDEPTH	cache depth in Binary, the cache onchip memory is 512 bit x uramRow
LOG2DATAPERCACHELINE	number of data in one 512bit in Binary, for double, it’s 3, for float, it’s 4
RAMTYPE	flag to tell use URAM LUTRAM or BRAM, 0 : LUTRAM, 1 : URAM, 2 : BRAM
numVertex	CSR/CSC data vertex number
numEdge	CSR/CSC data edge number
index	input CSR/CSC data index array
weight	input CSR/CSC data weight array, default float type.
degree	output degree array, default float type.

connectedComponents¶

#include "connected_components.hpp"

template <
    int MAXVERTEX,
    int MAXEDGE,
    int LOG2CACHEDEPTH,
    int LOG2DATAPERCACHELINE,
    int RAMTYPE,
    int MAXOUTDEGREE
    >
void connectedComponents (
    const int numEdge,
    const int numVertex,
    ap_uint <512>* offsetCSR,
    ap_uint <512>* indexCSR,
    ap_uint <512>* offsetCSC,
    ap_uint <512>* indexCSC512,
    ap_uint <32>* indexCSC32,
    ap_uint <512>* offsetCSCTmp1,
    ap_uint <512>* offsetCSCTmp2,
    ap_uint <512>* queue512,
    ap_uint <32>* queue32,
    ap_uint <512>* component512,
    ap_uint <32>* component32
    )

connectedComponents Compute the connected component membership of each vertex only for undirected graph

Parameters:

MAXVERTEX	CSC/CSR data vertex(offset) array maxsize
MAXEDGE	CSC/CSR data edge(indice) array maxsize
LOG2CACHEDEPTH	cache depth in Binary, the cache onchip memory is 512 bit x uramRow
LOG2DATAPERCACHELINE	The log2 of number of data in each cache line (512bit), for double, it’s 3, for float, it’s 4
RAMTYPE	flag to tell use URAM LUTRAM or BRAM, 0 : LUTRAM, 1 : URAM, 2 : BRAM
MAXOUTDEGREE	the max sum of indegree and outdegree of the input graph supported. Large value will result in more URAM usage
numEdge	edge number of the input graph
numVertex	vertex number of the input graph
indexCSR	column index of CSR format
offsetCSR	row offset of CSR format
indexCSC512	column index of CSC format in 512bit, which should be shared the same buffer with indexCSC32 in host
indexCSC32	column index of CSC format in 32bit, which should be shared the same buffer with indexCSC512 in host
offsetCSC	row offset of CSC format
offsetCSCTmp1	temp row offset for CSR2CSC convert
offsetCSCTmp2	temp row offset of CSR2CSC convert
queue32	intermediate queue32 used during the internal BFS
component512	Same as component32 but in 512bit, which shoude be shared the same buffer with component32 in host
component32	return result buffer with the vertex label containing the lowest vertex id in the connnected component containing that vertex

convertCsrCsc¶

#include "convert_csr_csc.hpp"

template <
    typename DT,
    int MAXVERTEX,
    int MAXEDGE,
    int LOG2CACHEDEPTH,
    int LOG2DATAPERCACHELINE,
    int RAMTYPE
    >
void convertCsrCsc (
    int numEdge,
    int numVertex,
    ap_uint <512>* offsetIn,
    ap_uint <512>* indexIn,
    ap_uint <512>* offsetOut,
    DT* indexOut,
    ap_uint <512>* offsetTmp0,
    ap_uint <512>* offsetTmp1
    )

convert Csr Csc algorithm is implemented

Parameters:

MAXVERTEX	CSC/CSR data vertex(offset) array maxsize
MAXEDGE	CSC/CSR data edge(indice) array maxsize
LOG2CACHEDEPTH	cache depth in Binary, the cache onchip memory is 512 bit x uramRow
LOG2DATAPERCACHELINE	number of data in one 512bit in Binary, for double, it’s 3, for float, it’s 4
RAMTYPE	flag to tell use URAM LUTRAM or BRAM, 0 : LUTRAM, 1 : URAM, 2 : BRAM
numEdge	CSR/CSC data indices number
numVertex	CSR/CSC data offsets number
indexIn	original CSR/CSC data indice array
offsetIn	original CSR/CSC data offset array
indexOut	output transfered CSC/CSR data indice array
offsetOut	output transfered CSC/CSR data offset array
offsetTmp0	internal temporary CSC/CSR data offset array
offsetTmp1	internal temporary CSC/CSR data offset array

labelPropagation¶

#include "label_propagation.hpp"

void labelPropagation (
    int numEdge,
    int numVertex,
    int numIter,
    ap_uint <512>* offsetCSR,
    ap_uint <512>* indexCSR,
    ap_uint <512>* offsetCSC,
    ap_uint <512>* indexCSC,
    ap_uint <512>* pingHashBuf,
    ap_uint <512>* pongHashBuf,
    ap_uint <512>* labelPing,
    ap_uint <512>* labelPong
    )

labelPropagation the label propagation algorithm is implemented

Parameters:

numEdge	edge number of the graph
numVertex	vertex number of the graph
numIter	iteration number
indexCSR	column index of CSR format
offsetCSR	row offset of CSR format
indexCSC	row index of CSC format
offsetCSC	column of CSC format
labelPing	label ping buffer
labelPong	label pong buffer

pageRankTop¶

#include "pagerank.hpp"

template <
    typename T,
    int MAXVERTEX,
    int MAXEDGE,
    int LOG2UNROLL,
    int WIDTHOR,
    int LOG2CACHEDEPTH,
    int LOG2DATAPERCACHELINECORE,
    int LOG2DATAPERCACHELINEDEGREE,
    int RAMTYPE
    >
void pageRankTop (
    int numVertex,
    int numEdge,
    ap_uint <512>* degreeCSR,
    ap_uint <512>* offsetCSC,
    ap_uint <512>* indexCSC,
    ap_uint <512>* weightCSC,
    ap_uint <512>* cntValFull,
    ap_uint <512>* buffPing,
    ap_uint <512>* buffPong,
    ap_uint <WIDTHOR>* orderUnroll,
    int* resultInfo,
    T randomProbability = 1.0,
    T alpha = 0.85,
    T tolerance = 1e-4,
    int numIter = 200
    )

pagerank algorithm is implemented

Parameters:

T	date type of pagerank, double or float
MAXVERTEX	CSC/CSR data vertex(offset) array maxsize
MAXEDGE	CSC/CSR data edge(indice) array maxsize
LOG2UNROLL	log2 of unroll number, due to DDR limit, best LOG2UNROLL is 3
WIDTHOR	order array bandwidth, it’s 256 in our case
LOG2CACHEDEPTH	log2(cache depth), the onchip memory for cache is 512 bit x CACHEDEPTH (512 bit x 2^LOG2CACHEDEPTH)
LOG2DATAPERCACHELINECORE	param for module pageRankCore, log2 of number of data in one 512bit (64 byte), for double, it’s log2(64/sizeof(double)) = 3, for float, it’s log2(64/sizeof(float)) = 4
LOG2DATAPERCACHELINEDEGREE	param for module calduDegree, log2 of number of data in one 512bit (64 byte), for double, it’s log2(64/sizeof(double)) = 3, for float, it’s log2(64/sizeof(float)) = 4
RAMTYPE	flag to tell use URAM LUTRAM or BRAM, 0 : LUTRAM, 1 : URAM, 2 : BRAM
numVertex	CSR/CSC data offsets number
numEdge	CSR/CSC data indices number
degreeCSR	temporary internal degree value
offsetCSC	CSR/CSC data offset array
indexCSC	CSR/CSC data indice array
weightCSC	CSR/CSC data weight array, support type float
cntValFull	temporary internal initialized mulplier values, length equals to numVertex
buffPing	ping array to keep temporary pagerank value
buffPong	pong array to keep temporary pagerank value
orderUnroll	temporary internal order array to keep initialized offset values
resultInfo	The output information. resultInfo[0] is isResultinPong, resultInfo[1] is iterations.
randomProbability	initial PR value, normally 1.0 or 1.0/numVertex
alpha	damping factor, normally 0.85
tolerance	converge tolerance
numIter	max iteration

pageRankTop overload (2)¶

#include "pagerank_multi_channels.hpp"

template <
    typename T,
    int MAXVERTEX,
    int MAXEDGE,
    int LOG2UNROLL,
    int WIDTHOR,
    int LOG2CACHEDEPTH,
    int LOG2DATAPERCACHELINECORE,
    int LOG2DATAPERCACHELINEDEGREE,
    int RAMTYPE
    >
void pageRankTop (
    int numVertex,
    int numEdge,
    int nsource,
    ap_uint <32>* sourceID,
    ap_uint <512>* degreeCSR,
    ap_uint <512>* offsetCSC,
    ap_uint <512>* indexCSC,
    ap_uint <512>* weightCSC,
    ap_uint <512>* cntValFull0,
    ap_uint <512>* buffPing0,
    ap_uint <512>* buffPong0,
    ap_uint <512>* cntValFull1,
    ap_uint <512>* buffPing1,
    ap_uint <512>* buffPong1,
    ap_uint <WIDTHOR>* orderUnroll,
    int* resultInfo,
    T randomProbability = 1.0,
    T alpha = 0.85,
    T tolerance = 1e-4,
    int numIter = 200
    )

pagerank algorithm is implemented support: 1. HBM based board

double / float for PR value calculate
weighted / unweighted graph / personalized graph
2 channel / 6 channel, 2 channel will take 14 persudo channels of HBM while 6 channel will take 26 persudo channels of HBM

Parameters:

T	date type of pagerank, double or float
MAXVERTEX	CSC/CSR data vertex(offset) array maxsize
MAXEDGE	CSC/CSR data edge(indice) array maxsize
LOG2UNROLL	log2 of unroll number for float adder
WIDTHOR	order array bandwidth, it’s 256 in our case
LOG2CACHEDEPTH	log2(cache depth), the onchip memory for cache is 512bits x CACHEDEPTH (512 bits x 2^LOG2CACHEDEPTH)
LOG2DATAPERCACHELINECORE	param for module pageRankCore, log2 of number of data in one 512bit (64 byte), for double, it’s log2(64/sizeof(double)) = 3, for float, it’s log2(64/sizeof(float)) = 4
LOG2DATAPERCACHELINEDEGREE	param for module calduDegree, log2 of number of data in one 512bit (64 byte), for double, it’s log2(64/sizeof(double)) = 3, for float, it’s log2(64/sizeof(float)) = 4
RAMTYPE	flag to tell use URAM LUTRAM or BRAM, 0 : LUTRAM, 1 : URAM, 2: BRAM
CHANNEL_NUM	pingpong channel number for the design
numVertex	CSR/CSC data offsets number
numEdge	CSR/CSC data indices number
nsource	source vertex ID number, 0 means not apply pagerank_personalized
sourceID	source vertex ID array for pagerank_personalized
degreeCSR	temporary internal degree value
offsetCSC	CSR/CSC data offset array
indexCSC	CSR/CSC data indice array
weightCSC	CSR/CSC data weight array, support type float
cntValFull0	temporary internal initialized mulplier values, length equals to numVertex
buffPing0	ping array to keep temporary pagerank value
buffPong0	pong array to keep temporary pagerank value
cntValFull1	temporary internal initialized mulplier values, length equals to numVertex
buffPing1	ping array to keep temporary pagerank value
buffPong1	pong array to keep temporary pagerank value
orderUnroll	temporary internal order array to keep initialized offset values
resultInfo	The output information. resultInfo[0] is isResultinPong, resultInfo[1] is iterations.
randomProbability	initial PR value, normally 1.0 or 1.0/numVertex
alpha	damping factor, normally 0.85
tolerance	converge tolerance
numIter	max iteration

singleSourceShortestPath¶

singleSourceShortestPath overload (1)¶

#include "shortest_path.hpp"

template <
    int WIDTH,
    int MAXOUTDEGREE
    >
void singleSourceShortestPath (
    ap_uint <32>* config,
    ap_uint <512>* offsetCSR,
    ap_uint <512>* indexCSR,
    ap_uint <512>* weightCSR,
    ap_uint <512>* queue512,
    ap_uint <32>* queue32,
    ap_uint <512>* distance512,
    ap_uint <WIDTH>* distance32,
    ap_uint <512>* pred512,
    ap_uint <32>* pred32,
    ap_uint <8>* info
    )

singleSourceShortestPath the single source shortest path algorithm is implemented, the input is the matrix in CSR format.

Parameters:

WIDTH	date width of the weight and the result distance
MAXOUTDEGREE	The max out put degree of the input graph supported. Large max out degree value
config	The config data. config[0] is the number of vertices in the graph. config[1] is the lower 32bit of the max distance value. config[2] is the higher 32bit of the max distance value. config[3] is the depth of the queue. config[4] is the option for the sssp: config[4].get_bit(0) is the weight/unweight switch, config[4].get_bit(1) is the path switch, config[4].get_bit(2) is the fixed/float switch. config[5] is sourceID.
offsetCSR	The offsetCSR buffer that stores the offsetCSR data in CSR format
indexCSR	The indexCSR buffer that stores the indexCSR dada in CSR format
weightCSR	The weight buffer that stores the weight data in CSR format
queue512	The shortest path requires a queue. The queue will be stored here in 512bit. Please allocate the same buffer with queue32 and budle to the same gmem port with queue32.
queue32	The shortest path requires a queue. The queue will be stored here in 32bit. Please allocate the same buffer with queue512 and budle to the same gmem port with queue512.
distance512	The distance32 data. The width is 512. When allocating buffers, distance512 and distance32 should point to the same buffer. And please bundle distance512 and distance32 to the same gmem port.
distance32	The distance32 data is stored here. When allocating buffers, distance512 and distance32 should point to the same buffer. And please bundle distance512 and distance32 to the same gmem port.
pred512	The predecessor data. The width is 512. When allocating buffers, pred512 and pred32 should point to the same buffer. And please bundle pred512 and pred32 to the same gmem port.
pred32	The predecessor data is stored here. When allocating buffers, pred512 and pred32 should point to the same buffer. And please bundle pred512 and pred32 to the same gmem port.
info	The debug information. info[0] shows if the queue overflow. info[1] shows if the precessed graph exceeds the supported maxoutdegree.

singleSourceShortestPath overload (2)¶

#include "shortest_path.hpp"

template <
    int WIDTH,
    int MAXOUTDEGREE
    >
void singleSourceShortestPath (
    ap_uint <32>* config,
    ap_uint <512>* offsetCSR,
    ap_uint <512>* indexCSR,
    ap_uint <512>* weightCSR,
    ap_uint <512>* queue512,
    ap_uint <32>* queue32,
    ap_uint <512>* distance512,
    ap_uint <WIDTH>* distance32,
    ap_uint <8>* info
    )

singleSourceShortestPath the single source shortest path algorithm is implemented, the input is the matrix in CSR format.

Parameters:

WIDTH	date width of the weight and the result distance
MAXOUTDEGREE	The max out put degree of the input graph supported. Large max out degree value
config	The config data. config[0] is the number of vertices in the graph. config[1] is the lower 32bit of the max distance value. config[2] is the higher 32bit of the max distance value. config[3] is the depth of the queue. config[4] is the option for the sssp: config[4].get_bit(0) is the weight/unweight switch, config[4].get_bit(2) is the fixed/float switch. config[5] is the sourceID.
offsetCSR	The offsetCSR buffer that stores the offsetCSR data in CSR format
indexCSR	The indexCSR buffer that stores the indexCSR dada in CSR format
weightCSR	The weight buffer that stores the weight data in CSR format
queue512	The shortest path requires a queue. The queue will be stored here in 512bit. Please allocate the same buffer with queue32 and budle to the same gmem port with queue32.
queue32	The shortest path requires a queue. The queue will be stored here in 32bit. Please allocate the same buffer with queue512 and budle to the same gmem port with queue512.
distance512	The distance32 data. The width is 512. When allocating buffers, distance512 and distance32 should point to the same buffer. And please bundle distance512 and distance32 to the same gmem port.
distance32	The distance32 data is stored here. When allocating buffers, distance512 and distance32 should point to the same buffer. And please bundle distance512 and distance32 to the same gmem port.
info	The debug information. info[0] shows if the queue overflow. info[1] shows if the precessed graph exceeds the supported maxoutdegree.

stronglyConnectedComponents¶

#include "strongly_connected_components.hpp"

template <
    int MAXVERTEX,
    int MAXEDGE,
    int LOG2CACHEDEPTH,
    int LOG2DATAPERCACHELINE,
    int RAMTYPE,
    int MAXOUTDEGREE
    >
void stronglyConnectedComponents (
    const int edgeNum,
    const int vertexNum,
    ap_uint <512>* offsetCSR0,
    ap_uint <512>* indexCSR0,
    ap_uint <512>* offsetCSC,
    ap_uint <512>* indxeCSC512,
    ap_uint <32>* indexCSC32,
    ap_uint <512>* offsetCSR1,
    ap_uint <512>* indexCSR1,
    ap_uint <512>* offsetCSCTmp1,
    ap_uint <512>* offsetCSCTmp2,
    ap_uint <512>* colorCSR0512,
    ap_uint <32>* colorCSR032,
    ap_uint <32>* queueCSR0,
    ap_uint <512>* colorCSC512,
    ap_uint <32>* colorCSC32,
    ap_uint <32>* queueCSC,
    ap_uint <32>* queueCSR1,
    ap_uint <32>* component
    )

stronglyConnectedComponents Compute the strongly connected component membership of each vertex only for directed graph, and label each vertex with one value containing the lowest vertex id in the SCC containing that vertex.

Parameters:

MAXVERTEX	CSC/CSR data vertex(offset) array maxsize
MAXEDGE	CSC/CSR data edge(indice) array maxsize
LOG2CACHEDEPTH	cache depth in Binary, the cache onchip memory is 512 bit x uramRow
LOG2DATAPERCACHELINE	number of data in one 512bit in Binary, for double, it’s 3, for float, it’s 4
RAMTYPE	flag to tell use URAM LUTRAM or BRAM, 0 : LUTRAM, 1 : URAM, 2 : BRAM
MAXOUTDEGREE	the max indegree or outdegree of the input graph supported. Large value will result in more URAM usage
edgeNum	edge number of the input graph
vertexNum	vertex number of the input graph
indexCSR0	column index of CSR format
offsetCSR0	row offset of CSR format
indxeCSC512	column index of CSC format in 512bit, which should be shared the same buffer with indexCSC32 in host
indexCSC32	column index of CSC format in 32bit, which should be shared the same buffer with indxeCSC512 in host
offsetCSC	row offset of CSC format
indexCSR1	column index of CSR format, which should be shared the same buffer with indexCSR0 in host
offsetCSR1	row offset of CSR format, which should be shared the same buffer with offsetCSR0 in host
offsetCSCTmp1	temp row offset for CSR2CSC convert
offsetCSCTmp2	temp row offset of CSR2CSC convert
colorCSR0512	intermediate color map in 512bit for forward-BFS
colorCSR032	intermediate color map in 32bit for forward-BFS, which should be shared the same buffer with colorCSR0512 in host
queueCSR0	intermediate queue used during the internal forward-BFS
colorCSC512	intermediate color map in 512bit for backward-BFS, which should be shared the same buffer with colorCSR0512 in host
colorCSC32	intermediate color map in 32bit for backward-BFS, which should be shared the same buffer with colorCSR0512 in host
queueCSC	intermediate queue used during the internal backward-BFS
queueCSR1	intermediate queue which should be shared the same buffer with queueCSR0
component	return component buffer with the vertex label containing the lowest vertex id in the strongly connnected component containing that vertex

triangleCount¶

#include "triangle_count.hpp"

template <
    int LEN,
    int ML
    >
void triangleCount (
    int numVertex,
    int numEdge,
    ap_uint <512>* offset0,
    ap_uint <512>* index0,
    ap_uint <512>* offset1,
    ap_uint <512>* index1,
    ap_uint <512>* offset2,
    ap_uint <512>* index2,
    uint64_t* triangles
    )

triangleCount the triangle counting algorithm is implemented, the input is the matrix in CSC format.

Parameters:

LEN	the depth of stream
ML	URAM depth in the design
numVertex	length of column offsets
numEdge	length of row indices
offset0	column offsets (begin+end) value
index0	row indices value
offset1	column offsets (begin+end) value
index1	row indices value
offset2	8 column offsets (begin+end) values
index2	16 row indices values
triangles	return triangle number