Namespace vart

namespace vart

Functions

std::vector<float> get_input_scale(std::vector<const xir::Tensor*> input_tensors)
std::vector<float> get_output_scale(std::vector<const xir::Tensor*> output_tensors)
float get_input_scale(const xir::Tensor *input_tensor)
float get_output_scale(const xir::Tensor *output_tensor)
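
These helpers return the fixed-point scale factors of quantized DPU tensors. A minimal sketch of typical usage, assuming a runner has already been created and float_image holds pre-processed float input data (both names are illustrative, not part of this API):

// assumed: auto runner = vart::Runner::create_runner(dpu_subgraph, "run");
auto input_scales = vart::get_input_scale(runner->get_input_tensors());
auto output_scales = vart::get_output_scale(runner->get_output_tensors());
// quantize float input to int8 using the scale of the first input tensor
std::vector<int8_t> quantized(float_image.size());
for (size_t i = 0; i < float_image.size(); ++i) {
    quantized[i] = static_cast<int8_t>(float_image[i] * input_scales[0]);
}
// after execution, raw int8 outputs are typically multiplied by the
// corresponding output scale to recover float values
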
template<typename InputType, typename OutputType = InputType>
class BaseRunner

Subclassed by Runner

Public Functions

virtual std::pair<std::uint32_t, int> execute_async(InputType input, OutputType output) = 0

execute_async

Parameters:
  • input – inputs with a customized type

  • output – outputs with a customized type

Returns:

pair<jobid, status>: status 0 for successful exit, other values for customized warnings or errors

virtual int wait(int jobid, int timeout = -1) = 0

wait

Four modes are supported: 1. Blocking wait for a specific ID. 2. Non-blocking wait for a specific ID. 3. Blocking wait for any ID. 4. Non-blocking wait for any ID.

Parameters:
  • jobid – job ID; negative for any ID, other values for a specific job ID

  • timeout – timeout; negative to block forever, 0 for non-blocking, positive to block with a limit in milliseconds.

Returns:

status: 0 for successful exit, other values for customized warnings or errors
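
A short sketch of the four wait modes, assuming runner is a concrete BaseRunner implementation and input/output are values of the customized types (illustrative names, not part of this API):

auto job = runner->execute_async(input, output);
// 1. blocking wait for this specific job id
auto status = runner->wait((int)job.first, -1);
// 2. non-blocking wait for this specific job id
status = runner->wait((int)job.first, 0);
// 3. blocking wait for any job id
status = runner->wait(-1, -1);
// 4. non-blocking wait for any job id
status = runner->wait(-1, 0);
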

struct DpuMeta : public Meta
struct Meta

Subclassed by DpuMeta

class Runner : public vart::BaseRunner<const std::vector<TensorBuffer*>&>
#include <runner.hpp>

Class of the Runner. It provides the API to use the runner.

The runner instance has a number of member functions to control the execution and get the input and output tensors of the runner.

Sample code:

// This example assumes that you have a DPU subgraph called dpu_subgraph.
// The way to create a DPU runner to run dpu_subgraph is shown below.

// create runner
auto runner = vart::Runner::create_runner(dpu_subgraph, "run");
// get input tensors
auto input_tensors = runner->get_input_tensors();
// get input tensor buffers
auto input_tensor_buffers_owned = std::vector<std::unique_ptr<vart::TensorBuffer>>();
auto input_tensor_buffers = std::vector<vart::TensorBuffer*>();
for (auto input : input_tensors) {
    // keep the unique_ptr owner alive so the raw pointer stays valid
    input_tensor_buffers_owned.emplace_back(vart::alloc_cpu_flat_tensor_buffer(input));
    input_tensor_buffers.emplace_back(input_tensor_buffers_owned.back().get());
}
// get output tensors
auto output_tensors = runner->get_output_tensors();
// get output tensor buffers
auto output_tensor_buffers_owned = std::vector<std::unique_ptr<vart::TensorBuffer>>();
auto output_tensor_buffers = std::vector<vart::TensorBuffer*>();
for (auto output : output_tensors) {
    output_tensor_buffers_owned.emplace_back(vart::alloc_cpu_flat_tensor_buffer(output));
    output_tensor_buffers.emplace_back(output_tensor_buffers_owned.back().get());
}
// sync input tensor buffers
for (auto& input : input_tensor_buffers) {
    input->sync_for_write(0, input->get_tensor()->get_data_size() /
            input->get_tensor()->get_shape()[0]);
}
// run runner
auto v = runner->execute_async(input_tensor_buffers, output_tensor_buffers);
auto status = runner->wait((int)v.first, 1000000000);
// sync output tensor buffers
for (auto& output : output_tensor_buffers) {
    output->sync_for_read(0, output->get_tensor()->get_data_size() /
    output->get_tensor()->get_shape()[0]);
}

Subclassed by RunnerExt

Public Functions

virtual std::pair<uint32_t, int> execute_async(const std::vector<TensorBuffer*> &input, const std::vector<TensorBuffer*> &output) = 0

Executes the runner.

This is a blocking function.

Parameters:
  • input – A vector of TensorBuffer pointers created from all input tensors of the runner.

  • output – A vector of TensorBuffer pointers created from all output tensors of the runner.

Returns:

pair<jobid, status>: status 0 for successful exit, other values for customized warnings or errors

virtual int wait(int jobid, int timeout) = 0

Waits for the end of DPU processing.

Four modes are supported: 1. Blocking wait for a specific ID. 2. Non-blocking wait for a specific ID. 3. Blocking wait for any ID. 4. Non-blocking wait for any ID.

Parameters:
  • jobid – job ID; negative for any ID, other values for a specific job ID

  • timeout – timeout; negative to block forever, 0 for non-blocking, positive to block with a limit in milliseconds.

Returns:

status: 0 for successful exit, other values for customized warnings or errors

virtual TensorFormat get_tensor_format()

Get the tensor format of runner.

Sample code:

auto format = runner->get_tensor_format();
switch (format) {
    case vart::Runner::TensorFormat::NCHW:
        // do something
        break;
    case vart::Runner::TensorFormat::NHWC:
        // do something
        break;
}
Returns:

TensorFormat: NHWC / NCHW

virtual std::vector<const xir::Tensor*> get_input_tensors() = 0

Get all input tensors of runner.

Sample code:

auto inputTensors = runner->get_input_tensors();
for (auto input : inputTensors) {
    input->get_name();
    input->get_shape();
    input->get_element_num();
}
Returns:

All input tensors: a vector of raw pointers to the input tensors.

virtual std::vector<const xir::Tensor*> get_output_tensors() = 0

Get all output tensors of runner.

Sample code:

auto outputTensors = runner->get_output_tensors();
for (auto output : outputTensors) {
    output->get_name();
    output->get_shape();
    output->get_element_num();
}

Returns:

All output tensors: a vector of raw pointers to the output tensors.

virtual std::pair<std::uint32_t, int> execute_async(InputType input, OutputType output) = 0

execute_async

Parameters:
  • input – inputs with a customized type

  • output – outputs with a customized type

Returns:

pair<jobid, status>: status 0 for successful exit, other values for customized warnings or errors

Public Static Functions

static std::unique_ptr<Runner> create_runner(const xir::Subgraph *subgraph, const std::string &mode = std::string(""))

Factory function to create an instance of DPU runner by subgraph.

Sample code:

// This API can be used like:
auto runner = vart::Runner::create_runner(subgraph, "run");
Parameters:
  • subgraph – XIR Subgraph

  • mode – one mode is supported: "run" - DPU runner.

Returns:

An instance of DPU runner.

static std::unique_ptr<Runner> create_runner_with_attrs(const xir::Subgraph *subgraph, xir::Attrs *attrs)

Factory function to create an instance of DPU runner by subgraph, and attrs.

Parameters:
  • subgraph – XIR Subgraph

  • attrs – XIR attrs object, this object is shared among all runners on the same graph.

  • attrs["mode"] – one mode is supported: "run" - DPU runner.

Returns:

An instance of DPU runner.
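
A minimal sketch of how this overload is typically called, assuming dpu_subgraph is a valid DPU subgraph (illustrative name). xir::Attrs::create() produces an attrs object that can be shared by several runners created on the same graph:

auto attrs = xir::Attrs::create();
auto runner = vart::Runner::create_runner_with_attrs(dpu_subgraph, attrs.get());
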

class RunnerExt : public Runner

Public Functions

virtual std::vector<vart::TensorBuffer*> get_inputs() = 0

Gets all input TensorBuffers of RunnerExt.

Sample code:

auto runner = vart::RunnerExt::create_runner(subgraph, attrs);
auto input_tensor_buffers = runner->get_inputs();
for (auto input : input_tensor_buffers) {
    auto shape = input->get_tensor()->get_shape();
}
Returns:

All input TensorBuffers: a vector of raw pointers to the input TensorBuffers.

virtual std::vector<vart::TensorBuffer*> get_outputs() = 0

Gets all output TensorBuffers of RunnerExt.

Sample code:

auto runner = vart::RunnerExt::create_runner(subgraph, attrs);
auto output_tensor_buffers = runner->get_outputs();
for (auto output : output_tensor_buffers) {
    auto shape = output->get_tensor()->get_shape();
}
Returns:

All output TensorBuffers: a vector of raw pointers to the output TensorBuffers.

virtual std::pair<uint32_t, int> execute_async(const std::vector<TensorBuffer*> &input, const std::vector<TensorBuffer*> &output) = 0

Executes the runner.

This is a blocking function.

Parameters:
  • input – A vector of TensorBuffer pointers created from all input tensors of the runner.

  • output – A vector of TensorBuffer pointers created from all output tensors of the runner.

Returns:

pair<jobid, status>: status 0 for successful exit, other values for customized warnings or errors

virtual std::pair<std::uint32_t, int> execute_async(InputType input, OutputType output) = 0

execute_async

Parameters:
  • input – inputs with a customized type

  • output – outputs with a customized type

Returns:

pair<jobid, status>: status 0 for successful exit, other values for customized warnings or errors

virtual int wait(int jobid, int timeout) = 0

Waits for the end of DPU processing.

Four modes are supported: 1. Blocking wait for a specific ID. 2. Non-blocking wait for a specific ID. 3. Blocking wait for any ID. 4. Non-blocking wait for any ID.

Parameters:
  • jobid – job ID; negative for any ID, other values for a specific job ID

  • timeout – timeout; negative to block forever, 0 for non-blocking, positive to block with a limit in milliseconds.

Returns:

status: 0 for successful exit, other values for customized warnings or errors

virtual TensorFormat get_tensor_format()

Get the tensor format of runner.

Sample code:

auto format = runner->get_tensor_format();
switch (format) {
    case vart::Runner::TensorFormat::NCHW:
        // do something
        break;
    case vart::Runner::TensorFormat::NHWC:
        // do something
        break;
}
Returns:

TensorFormat: NHWC / NCHW

virtual std::vector<const xir::Tensor*> get_input_tensors() = 0

Get all input tensors of runner.

Sample code:

auto inputTensors = runner->get_input_tensors();
for (auto input : inputTensors) {
    input->get_name();
    input->get_shape();
    input->get_element_num();
}
Returns:

All input tensors: a vector of raw pointers to the input tensors.

virtual std::vector<const xir::Tensor*> get_output_tensors() = 0

Get all output tensors of runner.

Sample code:

auto outputTensors = runner->get_output_tensors();
for (auto output : outputTensors) {
    output->get_name();
    output->get_shape();
    output->get_element_num();
}

Returns:

All output tensors: a vector of raw pointers to the output tensors.

Public Static Functions

static std::unique_ptr<RunnerExt> create_runner(const xir::Subgraph *subgraph, xir::Attrs *attrs)

Factory function to create an instance of runner by subgraph and attrs.

Parameters:
  • subgraph – XIR Subgraph

  • attrs – XIR attrs object, this object is shared among all runners on the same graph.

Returns:

An instance of runner.
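
A minimal sketch, assuming dpu_subgraph is a valid DPU subgraph (illustrative name). Because RunnerExt exposes pre-allocated TensorBuffers through get_inputs() and get_outputs(), no manual buffer allocation is needed:

auto attrs = xir::Attrs::create();
auto runner = vart::RunnerExt::create_runner(dpu_subgraph, attrs.get());
auto input_tensor_buffers = runner->get_inputs();
auto output_tensor_buffers = runner->get_outputs();
// fill input_tensor_buffers with quantized input data, then run
auto job = runner->execute_async(input_tensor_buffers, output_tensor_buffers);
runner->wait((int)job.first, -1);
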

static std::unique_ptr<Runner> create_runner(const xir::Subgraph *subgraph, const std::string &mode = std::string(""))

Factory function to create an instance of DPU runner by subgraph.

Sample code:

// This API can be used like:
auto runner = vart::Runner::create_runner(subgraph, "run");
Parameters:
  • subgraph – XIR Subgraph

  • mode – one mode is supported: "run" - DPU runner.

Returns:

An instance of DPU runner.

static std::unique_ptr<Runner> create_runner_with_attrs(const xir::Subgraph *subgraph, xir::Attrs *attrs)

Factory function to create an instance of DPU runner by subgraph, and attrs.

Parameters:
  • subgraph – XIR Subgraph

  • attrs – XIR attrs object, this object is shared among all runners on the same graph.

  • attrs["mode"] – one mode is supported: "run" - DPU runner.

Returns:

An instance of DPU runner.

class TensorBuffer
#include <tensor_buffer.hpp>

Class of TensorBuffer.

Subclassed by TensorBufferExt

Public Functions

virtual std::pair<std::uint64_t, std::size_t> data(const std::vector<std::int32_t> idx = {}) = 0

Get the data address of the index and the size of the data available for use.

Sample code:

vart::TensorBuffer* tb;
uint64_t data_addr;
size_t tensor_size;
std::tie(data_addr, tensor_size) = tb->data({0, 0, 0, 0});
Parameters:

idx – The index of the data to be accessed; its dimensions are the same as the tensor shape.

Returns:

A pair of the data address of the index and the size of the data available for use in byte unit.

inline virtual location_t get_location() const

Get where the tensor buffer is located.

Sample code:

vart::TensorBuffer* tb;
switch (tb->get_location()) {
    case vart::TensorBuffer::location_t::HOST_VIRT:
        // do nothing
        break;
    case vart::TensorBuffer::location_t::HOST_PHY:
        // do nothing
        break;
    default:
        // do nothing
        break;
}
Returns:

the tensor buffer location, a location_t enum type value: HOST_VIRT/HOST_PHY/DEVICE_*.

inline virtual std::pair<uint64_t, size_t> data_phy(const std::vector<std::int32_t> idx)

Get the data physical address of the index and the size of the data available for use.

Sample code:

vart::TensorBuffer* tb;
uint64_t phy_data;
size_t phy_size;
std::tie(phy_data, phy_size) = tb->data_phy({0, 0});
Parameters:

idx – The index of the data to be accessed; its dimensions are the same as the tensor shape.

Returns:

A pair of the data physical address of the index and the size of the data available for use in byte unit.

inline virtual void sync_for_read(uint64_t offset, size_t size)

Invalidate the cache for reading before a read. It is a no-op in case get_location() returns DEVICE_ONLY or HOST_VIRT.

Sample code:

for (auto& output : output_tensor_buffers) {
    output->sync_for_read(0, output->get_tensor()->get_data_size() /
                                output->get_tensor()->get_shape()[0]);
}
Parameters:
  • offset – The start offset address.

  • size – The data size.

Returns:

void

inline virtual void sync_for_write(uint64_t offset, size_t size)

Flush the cache for writing after a write. It is a no-op in case get_location() returns DEVICE_ONLY or HOST_VIRT.

Sample code:

for (auto& input : input_tensor_buffers) {
    input->sync_for_write(0, input->get_tensor()->get_data_size() /
                              input->get_tensor()->get_shape()[0]);
}
Parameters:
  • offset – The start offset address.

  • size – The data size.

Returns:

void

virtual void copy_from_host(size_t batch_idx, const void *buf, size_t size, size_t offset)

copy data from source buffer.

Parameters:
  • batch_idx – the batch index.

  • buf – source buffer start address.

  • size – data size to be copied.

  • offset – the start offset to be copied.

Returns:

void
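
Sample code (a hedged sketch mirroring the copy_to_host example below; tb and host_data are illustrative names):

vart::TensorBuffer* tb;
std::vector<int8_t> host_data;  // quantized input, one batch after another
auto batch_size = tb->get_tensor()->get_shape()[0];
auto per_batch = tb->get_tensor()->get_data_size() / batch_size;
for (auto batch = 0; batch < batch_size; ++batch) {
    tb->copy_from_host(batch, host_data.data() + batch * per_batch,
                       per_batch, 0u);
}
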

virtual void copy_to_host(size_t batch_idx, void *buf, size_t size, size_t offset)

copy data to destination buffer.

Sample code:

vart::TensorBuffer* tb_from;
vart::TensorBuffer* tb_to;
uint64_t data;
size_t tensor_size;
auto batch_size = tb_to->get_tensor()->get_shape()[0];
for (auto batch = 0; batch < batch_size; ++batch) {
    std::tie(data, tensor_size) = tb_to->data({batch, 0, 0, 0});
    tb_from->copy_to_host(batch, reinterpret_cast<void*>(data),
                          tensor_size, 0u);
}
Parameters:
  • batch_idx – the batch index.

  • buf – destination buffer start address.

  • size – data size to be copied.

  • offset – the start offset to be copied.

Returns:

void

const xir::Tensor *get_tensor() const

Get tensor of TensorBuffer.

Returns:

A pointer to the tensor.

virtual std::string to_string() const

for fancy log messages

Public Static Functions

static std::string to_string(location_t value)

for TensorBuffer location message

static void copy_tensor_buffer(vart::TensorBuffer *tb_from, vart::TensorBuffer *tb_to)

copy TensorBuffer from one to another.

Sample code:

vart::TensorBuffer* tb_from;
vart::TensorBuffer* tb_to;
vart::TensorBuffer::copy_tensor_buffer(tb_from, tb_to);
Parameters:
  • tb_from – source TensorBuffer.

  • tb_to – destination TensorBuffer.

Returns:

void

static std::unique_ptr<TensorBuffer> create_unowned_device_tensor_buffer(const xir::Tensor *tensor, uint64_t batch_addr[], size_t addr_arrsize)

create unowned device tensor buffer with device physical addresses for a tensor.

There are some limitations on the arguments:

  1. The addr_arrsize must NOT be greater than the tensor batch.

  2. The tensor must have attribute ddr_addr whose value must be 0.

Sample code:

auto runner = vart::RunnerExt::create_runner(subgraph, attrs);
auto input_tensors = runner->get_input_tensors();
auto output_tensors = runner->get_output_tensors();
std::vector<vart::TensorBuffer*> input_tensor_buffers;
std::vector<vart::TensorBuffer*> output_tensor_buffers;
uint64_t in_batch_addr[1];
uint64_t out_batch_addr[1];
in_batch_addr[0] = DEVICE_PHY_ADDRESS_IN;
out_batch_addr[0] = DEVICE_PHY_ADDRESS_OUT;
auto input_tb = vart::TensorBuffer::create_unowned_device_tensor_buffer(
      input_tensors[0], in_batch_addr, 1);
auto output_tb = vart::TensorBuffer::create_unowned_device_tensor_buffer(
      output_tensors[0], out_batch_addr, 1);
input_tensor_buffers.emplace_back(input_tb.get());
output_tensor_buffers.emplace_back(output_tb.get());
auto v = runner->execute_async(input_tensor_buffers, output_tensor_buffers);
Parameters:
  • tensor – XIR tensor pointer

  • batch_addr – Array which contains device physical address for each batch

  • addr_arrsize – The array size of batch_addr

Returns:

Unique pointer of created tensor buffer.

class TensorBufferExt : public TensorBuffer

Public Functions

virtual std::string to_string() const

for fancy log messages

virtual std::pair<std::uint64_t, std::size_t> data(const std::vector<std::int32_t> idx = {}) = 0

Get the data address of the index and the size of the data available for use.

Sample code:

vart::TensorBuffer* tb;
uint64_t data_addr;
size_t tensor_size;
std::tie(data_addr, tensor_size) = tb->data({0, 0, 0, 0});
Parameters:

idx – The index of the data to be accessed; its dimensions are the same as the tensor shape.

Returns:

A pair of the data address of the index and the size of the data available for use in byte unit.

inline virtual location_t get_location() const

Get where the tensor buffer is located.

Sample code:

vart::TensorBuffer* tb;
switch (tb->get_location()) {
    case vart::TensorBuffer::location_t::HOST_VIRT:
        // do nothing
        break;
    case vart::TensorBuffer::location_t::HOST_PHY:
        // do nothing
        break;
    default:
        // do nothing
        break;
}
Returns:

the tensor buffer location, a location_t enum type value: HOST_VIRT/HOST_PHY/DEVICE_*.

inline virtual std::pair<uint64_t, size_t> data_phy(const std::vector<std::int32_t> idx)

Get the data physical address of the index and the size of the data available for use.

Sample code:

vart::TensorBuffer* tb;
uint64_t phy_data;
size_t phy_size;
std::tie(phy_data, phy_size) = tb->data_phy({0, 0});
Parameters:

idx – The index of the data to be accessed; its dimensions are the same as the tensor shape.

Returns:

A pair of the data physical address of the index and the size of the data available for use in byte unit.

inline virtual void sync_for_read(uint64_t offset, size_t size)

Invalidate the cache for reading before a read. It is a no-op in case get_location() returns DEVICE_ONLY or HOST_VIRT.

Sample code:

for (auto& output : output_tensor_buffers) {
    output->sync_for_read(0, output->get_tensor()->get_data_size() /
                                output->get_tensor()->get_shape()[0]);
}
Parameters:
  • offset – The start offset address.

  • size – The data size.

Returns:

void

inline virtual void sync_for_write(uint64_t offset, size_t size)

Flush the cache for writing after a write. It is a no-op in case get_location() returns DEVICE_ONLY or HOST_VIRT.

Sample code:

for (auto& input : input_tensor_buffers) {
    input->sync_for_write(0, input->get_tensor()->get_data_size() /
                              input->get_tensor()->get_shape()[0]);
}
Parameters:
  • offset – The start offset address.

  • size – The data size.

Returns:

void

virtual void copy_from_host(size_t batch_idx, const void *buf, size_t size, size_t offset)

copy data from source buffer.

Parameters:
  • batch_idx – the batch index.

  • buf – source buffer start address.

  • size – data size to be copied.

  • offset – the start offset to be copied.

Returns:

void

virtual void copy_to_host(size_t batch_idx, void *buf, size_t size, size_t offset)

copy data to destination buffer.

Sample code:

vart::TensorBuffer* tb_from;
vart::TensorBuffer* tb_to;
uint64_t data;
size_t tensor_size;
auto batch_size = tb_to->get_tensor()->get_shape()[0];
for (auto batch = 0; batch < batch_size; ++batch) {
    std::tie(data, tensor_size) = tb_to->data({batch, 0, 0, 0});
    tb_from->copy_to_host(batch, reinterpret_cast<void*>(data),
                          tensor_size, 0u);
}
Parameters:
  • batch_idx – the batch index.

  • buf – destination buffer start address.

  • size – data size to be copied.

  • offset – the start offset to be copied.

Returns:

void

const xir::Tensor *get_tensor() const

Get tensor of TensorBuffer.

Returns:

A pointer to the tensor.

Public Static Functions

static std::string to_string(location_t value)

for TensorBuffer location message

static void copy_tensor_buffer(vart::TensorBuffer *tb_from, vart::TensorBuffer *tb_to)

copy TensorBuffer from one to another.

Sample code:

vart::TensorBuffer* tb_from;
vart::TensorBuffer* tb_to;
vart::TensorBuffer::copy_tensor_buffer(tb_from, tb_to);
Parameters:
  • tb_from – source TensorBuffer.

  • tb_to – destination TensorBuffer.

Returns:

void

static std::unique_ptr<TensorBuffer> create_unowned_device_tensor_buffer(const xir::Tensor *tensor, uint64_t batch_addr[], size_t addr_arrsize)

create unowned device tensor buffer with device physical addresses for a tensor.

There are some limitations on the arguments:

  1. The addr_arrsize must NOT be greater than the tensor batch.

  2. The tensor must have attribute ddr_addr whose value must be 0.

Sample code:

auto runner = vart::RunnerExt::create_runner(subgraph, attrs);
auto input_tensors = runner->get_input_tensors();
auto output_tensors = runner->get_output_tensors();
std::vector<vart::TensorBuffer*> input_tensor_buffers;
std::vector<vart::TensorBuffer*> output_tensor_buffers;
uint64_t in_batch_addr[1];
uint64_t out_batch_addr[1];
in_batch_addr[0] = DEVICE_PHY_ADDRESS_IN;
out_batch_addr[0] = DEVICE_PHY_ADDRESS_OUT;
auto input_tb = vart::TensorBuffer::create_unowned_device_tensor_buffer(
      input_tensors[0], in_batch_addr, 1);
auto output_tb = vart::TensorBuffer::create_unowned_device_tensor_buffer(
      output_tensors[0], out_batch_addr, 1);
input_tensor_buffers.emplace_back(input_tb.get());
output_tensor_buffers.emplace_back(output_tb.get());
auto v = runner->execute_async(input_tensor_buffers, output_tensor_buffers);
Parameters:
  • tensor – XIR tensor pointer

  • batch_addr – Array which contains device physical address for each batch

  • addr_arrsize – The array size of batch_addr

Returns:

Unique pointer of created tensor buffer.

struct XclBo