Program Listing for File stack.cuh

Return to documentation for file (src/stack.cuh)

// cuEVM: CUDA Ethereum Virtual Machine implementation
// Copyright 2023 Stefan-Dan Ciocirlan (SBIP - Singapore Blockchain Innovation Programme)
// Author: Stefan-Dan Ciocirlan
// Data: 2023-11-30
// SPDX-License-Identifier: MIT

#ifndef _STACK_H_
#define _STACK_H_

#include "utils.h"

// Forward declarations: get_cpu_instances_from_gpu_instances() launches
// kernel_stacks<params> from inside the class, so the kernel template must
// be declared before the class body that names it (its definition follows
// the class at the bottom of this file).
template <class params>
class stack_t;

template <class params>
__global__ void kernel_stacks(
    typename stack_t<params>::stack_data_t *dst,
    typename stack_t<params>::stack_data_t *src,
    uint32_t count);

/**
 * Fixed-capacity LIFO stack of EVM words (params::BITS-bit values) modelling
 * the EVM operand stack. Provides push/pop plus the PUSHx/DUPx/SWAPx opcode
 * helpers, and host-side utilities for moving stack state between CPU and
 * GPU instances.
 */
template <class params>
class stack_t
{
public:
  typedef arith_env_t<params> arith_t;          // CGBN arithmetic environment
  typedef typename arith_t::bn_t bn_t;          // CGBN register type
  typedef cgbn_mem_t<params::BITS> evm_word_t;  // memory-resident EVM word
  typedef typename arith_env_t<params>::bn_wide_t bn_wide_t;
  static const uint32_t STACK_SIZE = params::STACK_SIZE;  // maximum depth

  // Plain-data stack state, shareable between host and device code.
  typedef struct
  {
    evm_word_t *stack_base;  // backing array of EVM words
    uint32_t stack_offset;   // number of valid elements == index of next free slot
  } stack_data_t;

  stack_data_t *_content;  // stack storage (owned only when allocated by the second ctor)
  arith_t _arith;          // arithmetic environment used for all CGBN ops

  /**
   * Wrap pre-existing stack storage; performs no allocation and takes no
   * ownership semantics beyond what the destructor applies.
   * @param arith arithmetic environment
   * @param content previously allocated stack state
   */
  __host__ __device__ __forceinline__ stack_t(
      arith_t arith,
      stack_data_t *content) : _content(content),
                               _arith(arith)
  {
  }

  /**
   * Allocate a fresh, empty stack with capacity STACK_SIZE.
   * Allocation is done once per instance by a single thread
   * (ONE_THREAD_PER_INSTANCE) and the pointer shared (SHARED_MEMORY).
   * @param arith arithmetic environment
   */
  __host__ __device__ __forceinline__ stack_t(
      arith_t arith) : _arith(arith)
  {
    SHARED_MEMORY stack_data_t *content;
    ONE_THREAD_PER_INSTANCE(
        content = new stack_data_t;
        content->stack_base = new evm_word_t[STACK_SIZE];
        content->stack_offset = 0;)
    _content = content;
  }

  /**
   * Release the stack storage. _content is tested before it is
   * dereferenced, so a cleared (NULL) instance destructs safely.
   */
  __host__ __device__ __forceinline__ ~stack_t()
  {
    ONE_THREAD_PER_INSTANCE(
        if (_content != NULL) {
          if (_content->stack_base != NULL) {
            delete[] _content->stack_base;
            _content->stack_base = NULL;
          }
          _content->stack_offset = 0;
          delete _content;
        })
    _content = NULL;
  }

  // Number of elements currently on the stack.
  __host__ __device__ __forceinline__ uint32_t size()
  {
    return _content->stack_offset;
  }

  // Pointer to the NEXT FREE slot (one past the last pushed word):
  // push() stores through it and then increments the offset; pop()
  // decrements the offset first and then loads from it.
  __host__ __device__ __forceinline__ evm_word_t *top()
  {
    return _content->stack_base + _content->stack_offset;
  }

  /**
   * Push one word onto the stack.
   * @param value word to push
   * @param error_code set to ERR_STACK_OVERFLOW when the stack is full
   */
  __host__ __device__ __forceinline__ void push(const bn_t &value, uint32_t &error_code)
  {
    if (size() >= STACK_SIZE)
    {
      error_code = ERR_STACK_OVERFLOW;
      return;
    }
    cgbn_store(_arith._env, top(), value);
    _content->stack_offset++;
  }

  /**
   * Pop the top word from the stack.
   * @param y receives the popped word; set to 0 on underflow
   * @param error_code set to ERR_STACK_UNDERFLOW when the stack is empty
   */
  __host__ __device__ __forceinline__ void pop(bn_t &y, uint32_t &error_code)
  {
    if (size() == 0)
    {
      error_code = ERR_STACK_UNDERFLOW;
      cgbn_set_ui32(_arith._env, y, 0);
      return;
    }
    _content->stack_offset--;
    cgbn_load(_arith._env, y, top());
  }

  /**
   * PUSHx opcode helper: assemble up to x big-endian bytes into a word and
   * push it. src_byte_data[0] becomes the most significant of the supplied
   * bytes; when src_byte_size < x the missing low-order bytes stay zero
   * (EVM semantics for PUSH data truncated at end of code).
   * Precondition: src_byte_size <= x. If violated, the (x - src_byte_size)
   * subtraction wraps, the loop body never runs, and 0 is pushed.
   * @param x PUSHx operand width, 1..32
   * @param error_code set to ERROR_STACK_INVALID_PUSHX_X when x > 32,
   *        or propagated from push() on overflow
   * @param src_byte_data big-endian source bytes
   * @param src_byte_size number of available source bytes
   */
  __host__ __device__ __forceinline__ void pushx(
      uint8_t x,
      uint32_t &error_code,
      uint8_t *src_byte_data,
      uint8_t src_byte_size)
  {
    if (x > 32)
    {
      error_code = ERROR_STACK_INVALID_PUSHX_X;
      return;
    }
    bn_t r;
    cgbn_set_ui32(_arith._env, r, 0);
    for (uint8_t idx = (x - src_byte_size); idx < x; idx++)
    {
      cgbn_insert_bits_ui32(
          _arith._env,
          r,
          r,
          idx * 8,   // byte idx occupies bits [idx*8, idx*8+8)
          8,
          src_byte_data[x - 1 - idx]);
    }
    push(r, error_code);
  }

  /**
   * Pointer to the index-th element counted from the top (1 == top).
   * @param index 1-based depth; 0 is rejected (it would alias the first
   *        free slot past the top)
   * @param error_code set to ERR_STACK_UNDERFLOW when index is 0 or
   *        exceeds the current size
   * @return element pointer, or NULL on error
   */
  __host__ __device__ __forceinline__ evm_word_t *get_index(
    uint32_t index,
    uint32_t &error_code
  )
  {
    if ((index == 0) || (size() < index))
    {
      error_code = ERR_STACK_UNDERFLOW;
      return NULL;
    }
    return _content->stack_base + (size() - index);
  }

  /**
   * DUPx opcode helper: duplicate the x-th element from the top.
   * @param x 1..16 (DUP1..DUP16)
   * @param error_code ERROR_STACK_INVALID_DUPX_X for x out of range;
   *        otherwise propagated from get_index()/push()
   */
  __host__ __device__ __forceinline__ void dupx(
      uint8_t x,
      uint32_t &error_code)
  {
    if ((x < 1) || (x > 16))
    {
      error_code = ERROR_STACK_INVALID_DUPX_X;
      return;
    }
    bn_t r;
    evm_word_t *value = get_index(x, error_code);
    if (value == NULL)
    {
      return;
    }
    cgbn_load(_arith._env, r, value);
    push(r, error_code);
  }

  /**
   * SWAPx opcode helper: exchange the top element with the one index
   * positions below it (SWAP1..SWAP16 -> index 1..16, needs size >= index+1).
   * @param index 1..16
   * @param error_code ERR_STACK_INVALID_SIZE for index out of range
   *        (NOTE: a different constant than dupx uses); otherwise
   *        propagated from get_index()
   */
  __host__ __device__ __forceinline__ void swapx(
    uint32_t index,
    uint32_t &error_code)
  {
    if ((index < 1) || (index > 16))
    {
      error_code = ERR_STACK_INVALID_SIZE;
      return;
    }
    bn_t a, b;
    evm_word_t *value_a = get_index(1, error_code);
    evm_word_t *value_b = get_index(index + 1, error_code);
    if ((value_a == NULL) || (value_b == NULL))
    {
      return;
    }
    cgbn_load(_arith._env, a, value_b);
    cgbn_load(_arith._env, b, value_a);
    cgbn_store(_arith._env, value_a, a);
    cgbn_store(_arith._env, value_b, b);
  }

  /**
   * Deep-copy the live stack into dst, sized exactly to the element count
   * (dst.stack_base == NULL when empty). A previous allocation in dst is
   * released first; the offset>0 guard avoids deleting a garbage pointer
   * in an uninitialized dst.
   * @param dst destination stack state
   */
  __host__ __device__ __forceinline__ void to_stack_data_t(
      stack_data_t &dst)
  {
    ONE_THREAD_PER_INSTANCE(
        if (
            (dst.stack_offset > 0) &&
            (dst.stack_base != NULL)) {
          delete[] dst.stack_base;
          dst.stack_base = NULL;
        } dst.stack_offset = _content->stack_offset;
        if (dst.stack_offset == 0) {
          dst.stack_base = NULL;
        } else {
          dst.stack_base = new evm_word_t[dst.stack_offset];
          memcpy(
              dst.stack_base,
              _content->stack_base,
              sizeof(evm_word_t) * dst.stack_offset);
        })
  }

  /**
   * Print a stack snapshot (size plus each word, bottom to top).
   * @param arith arithmetic environment used for word formatting
   * @param stack_data stack state to print
   */
  __host__ __device__ __forceinline__ static void print_stack_data_t(
      arith_t &arith,
      stack_data_t &stack_data)
  {
    printf("Stack size: %d, data:\n", stack_data.stack_offset);
    for (uint32_t idx = 0; idx < stack_data.stack_offset; idx++)
    {
      arith.print_cgbn_memory(stack_data.stack_base[idx]);
    }
  }

  /**
   * Print this stack's contents.
   * @param full when true, dump the full STACK_SIZE capacity, including
   *        uninitialized slots above the top
   */
  __host__ __device__ void print(
      bool full = false)
  {
    printf("Stack size: %d, data:\n", size());
    uint32_t print_size = full ? STACK_SIZE : size();
    for (uint32_t idx = 0; idx < print_size; idx++)
    {
      _arith.print_cgbn_memory(_content->stack_base[idx]);
    }
  }

  /**
   * Build a JSON object {"data": [hex, ...]} from a stack snapshot.
   * @param arith arithmetic environment used for hex conversion
   * @param stack_data stack state to serialize
   * @return newly created cJSON object (caller owns it)
   */
  __host__ static cJSON *json_from_stack_data_t(
      arith_t &arith,
      stack_data_t &stack_data)
  {
    char *hex_string_ptr = new char[arith_t::BYTES * 2 + 3];  // "0x" + digits + NUL
    cJSON *stack_json = cJSON_CreateObject();

    cJSON *stack_data_json = cJSON_CreateArray();
    for (uint32_t idx = 0; idx < stack_data.stack_offset; idx++)
    {
      arith.hex_string_from_cgbn_memory(hex_string_ptr, stack_data.stack_base[idx]);
      cJSON_AddItemToArray(stack_data_json, cJSON_CreateString(hex_string_ptr));
    }
    cJSON_AddItemToObject(stack_json, "data", stack_data_json);
    delete[] hex_string_ptr;
    return stack_json;
  }

  /**
   * Build a JSON object {"data": [hex, ...]} from this stack.
   * @param full when true, serialize the full STACK_SIZE capacity,
   *        including uninitialized slots above the top
   * @return newly created cJSON object (caller owns it)
   */
  __host__ cJSON *json(bool full = false)
  {
    char *hex_string_ptr = new char[arith_t::BYTES * 2 + 3];  // "0x" + digits + NUL
    cJSON *stack_json = cJSON_CreateObject();

    cJSON *stack_data_json = cJSON_CreateArray();
    uint32_t print_size = full ? STACK_SIZE : size();
    for (uint32_t idx = 0; idx < print_size; idx++)
    {
      _arith.hex_string_from_cgbn_memory(hex_string_ptr, _content->stack_base[idx]);
      cJSON_AddItemToArray(stack_data_json, cJSON_CreateString(hex_string_ptr));
    }
    cJSON_AddItemToObject(stack_json, "data", stack_data_json);
    delete[] hex_string_ptr;
    return stack_json;
  }

  /**
   * Allocate count empty host-side stack states.
   * @param count number of instances
   * @return array owned by the caller (free with free_cpu_instances)
   */
  __host__ static stack_data_t *get_cpu_instances(
      uint32_t count)
  {
    stack_data_t *cpu_instances = new stack_data_t[count];
    for (uint32_t idx = 0; idx < count; idx++)
    {
      cpu_instances[idx].stack_base = NULL;
      cpu_instances[idx].stack_offset = 0;
    }
    return cpu_instances;
  }

  /**
   * Free host-side stack states and their element arrays.
   * @param cpu_instances array from get_cpu_instances (or equivalent)
   * @param count number of instances
   */
  __host__ static void free_cpu_instances(
      stack_data_t *cpu_instances,
      uint32_t count)
  {
    for (uint32_t index = 0; index < count; index++)
    {
      if (cpu_instances[index].stack_base != NULL)
      {
        delete[] cpu_instances[index].stack_base;
        cpu_instances[index].stack_base = NULL;
      }
      cpu_instances[index].stack_offset = 0;
    }
    delete[] cpu_instances;
  }

  /**
   * Deep-copy host stack states to the device: element arrays are
   * cudaMalloc'ed and filled per instance, then the descriptor array
   * itself is copied.
   * @param cpu_instances source host states (left untouched)
   * @param count number of instances
   * @return device pointer (free with free_gpu_instances)
   */
  __host__ static stack_data_t *get_gpu_instances_from_cpu_instances(
      stack_data_t *cpu_instances,
      uint32_t count)
  {
    stack_data_t *gpu_instances, *tmp_cpu_instances;
    tmp_cpu_instances = new stack_data_t[count];
    memcpy(
        tmp_cpu_instances,
        cpu_instances,
        sizeof(stack_data_t) * count);
    for (uint32_t idx = 0; idx < count; idx++)
    {
      if (cpu_instances[idx].stack_offset > 0)
      {
        CUDA_CHECK(cudaMalloc(
            (void **)&tmp_cpu_instances[idx].stack_base,
            sizeof(evm_word_t) * cpu_instances[idx].stack_offset));
        CUDA_CHECK(cudaMemcpy(
            tmp_cpu_instances[idx].stack_base,
            cpu_instances[idx].stack_base,
            sizeof(evm_word_t) * cpu_instances[idx].stack_offset,
            cudaMemcpyHostToDevice));
      }
      else
      {
        tmp_cpu_instances[idx].stack_base = NULL;
      }
    }
    CUDA_CHECK(cudaMalloc(
        (void **)&gpu_instances,
        sizeof(stack_data_t) * count));
    CUDA_CHECK(cudaMemcpy(
        gpu_instances,
        tmp_cpu_instances,
        sizeof(stack_data_t) * count,
        cudaMemcpyHostToDevice));
    delete[] tmp_cpu_instances;
    tmp_cpu_instances = NULL;
    return gpu_instances;
  }

  /**
   * Free device stack states created via cudaMalloc (descriptor array and
   * each instance's element array).
   * @param gpu_instances device pointer from get_gpu_instances_from_cpu_instances
   * @param count number of instances
   */
  __host__ static void free_gpu_instances(
      stack_data_t *gpu_instances,
      uint32_t count)
  {
    stack_data_t *tmp_cpu_instances;
    tmp_cpu_instances = new stack_data_t[count];
    CUDA_CHECK(cudaMemcpy(
        tmp_cpu_instances,
        gpu_instances,
        sizeof(stack_data_t) * count,
        cudaMemcpyDeviceToHost));
    for (uint32_t idx = 0; idx < count; idx++)
    {
      if (tmp_cpu_instances[idx].stack_base != NULL)
        CUDA_CHECK(cudaFree(tmp_cpu_instances[idx].stack_base));
    }
    delete[] tmp_cpu_instances;
    CUDA_CHECK(cudaFree(gpu_instances));
  }

  /**
   * Deep-copy device stack states back to the host. The element arrays on
   * the device were allocated with in-kernel new[] (device heap), which
   * cudaMemcpy cannot read directly; so this first cudaMallocs runtime-
   * visible buffers, launches kernel_stacks to move the data device-side
   * (the kernel also deletes the device-heap arrays), and only then copies
   * everything to the host. Consumes gpu_instances.
   * @param gpu_instances device pointer; freed by this call
   * @param count number of instances
   * @return newly allocated host array (free with free_cpu_instances)
   */
  __host__ static stack_data_t *get_cpu_instances_from_gpu_instances(
      stack_data_t *gpu_instances,
      uint32_t count)
  {
    stack_data_t *cpu_instances, *tmp_cpu_instances, *tmp_gpu_instances;
    cpu_instances = new stack_data_t[count];
    tmp_cpu_instances = new stack_data_t[count];
    CUDA_CHECK(cudaMemcpy(
        cpu_instances,
        gpu_instances,
        sizeof(stack_data_t) * count,
        cudaMemcpyDeviceToHost));
    memcpy(
        tmp_cpu_instances,
        cpu_instances,
        sizeof(stack_data_t) * count);
    // Stage 1: runtime-visible device buffers sized to each stack.
    for (uint32_t idx = 0; idx < count; idx++)
    {
      tmp_cpu_instances[idx].stack_offset = cpu_instances[idx].stack_offset;
      if (cpu_instances[idx].stack_offset > 0)
      {
        CUDA_CHECK(cudaMalloc(
            (void **)&tmp_cpu_instances[idx].stack_base,
            sizeof(evm_word_t) * cpu_instances[idx].stack_offset));
      }
      else
      {
        tmp_cpu_instances[idx].stack_base = NULL;
      }
    }
    CUDA_CHECK(cudaMalloc(
        (void **)&tmp_gpu_instances,
        sizeof(stack_data_t) * count));
    CUDA_CHECK(cudaMemcpy(
        tmp_gpu_instances,
        tmp_cpu_instances,
        sizeof(stack_data_t) * count,
        cudaMemcpyHostToDevice));
    delete[] tmp_cpu_instances;
    tmp_cpu_instances = NULL;
    // Stage 2: device-side move from heap allocations into the buffers.
    kernel_stacks<params><<<count, 1>>>(
        tmp_gpu_instances,
        gpu_instances,
        count);
    CUDA_CHECK(cudaDeviceSynchronize());
    CUDA_CHECK(cudaFree(gpu_instances));
    gpu_instances = tmp_gpu_instances;

    // Stage 3: copy the now-accessible data to the host.
    CUDA_CHECK(cudaMemcpy(
        cpu_instances,
        gpu_instances,
        sizeof(stack_data_t) * count,
        cudaMemcpyDeviceToHost));
    tmp_cpu_instances = new stack_data_t[count];
    memcpy(
        tmp_cpu_instances,
        cpu_instances,
        sizeof(stack_data_t) * count);

    for (uint32_t idx = 0; idx < count; idx++)
    {
      tmp_cpu_instances[idx].stack_offset = cpu_instances[idx].stack_offset;
      if (cpu_instances[idx].stack_offset > 0)
      {
        tmp_cpu_instances[idx].stack_base = new evm_word_t[cpu_instances[idx].stack_offset];
        CUDA_CHECK(cudaMemcpy(
            tmp_cpu_instances[idx].stack_base,
            cpu_instances[idx].stack_base,
            sizeof(evm_word_t) * cpu_instances[idx].stack_offset,
            cudaMemcpyDeviceToHost));
      }
      else
      {
        tmp_cpu_instances[idx].stack_base = NULL;
      }
    }

    memcpy(
        cpu_instances,
        tmp_cpu_instances,
        sizeof(stack_data_t) * count);
    delete[] tmp_cpu_instances;
    tmp_cpu_instances = NULL;
    free_gpu_instances(gpu_instances, count);
    return cpu_instances;
  }
};

/**
 * Device-side transfer kernel: for each instance, copy the stack contents
 * from src (whose element arrays live on the device heap, allocated with
 * in-kernel new[]) into dst (whose element arrays were cudaMalloc'ed by the
 * host and are therefore runtime-accessible), then release the src arrays.
 * Launched as <<<count, 1>>> by get_cpu_instances_from_gpu_instances();
 * the flat index below also tolerates multi-thread blocks.
 * Precondition: dst[i].stack_base holds at least src[i].stack_offset words
 * whenever src[i].stack_offset > 0.
 * @param dst destination stack states (cudaMalloc'ed element arrays)
 * @param src source stack states; their element arrays are freed here
 * @param count number of instances
 */
template <class params>
__global__ void kernel_stacks(
    typename stack_t<params>::stack_data_t *dst,
    typename stack_t<params>::stack_data_t *src,
    uint32_t count)
{
  typedef typename stack_t<params>::evm_word_t evm_word_t;
  uint32_t instance = blockIdx.x * blockDim.x + threadIdx.x;
  if (instance >= count)
  {
    return;
  }
  dst[instance].stack_offset = src[instance].stack_offset;
  if (dst[instance].stack_offset > 0)
  {
    memcpy(
        dst[instance].stack_base,
        src[instance].stack_base,
        sizeof(evm_word_t) * src[instance].stack_offset);
  }
  // Free the source allocation even when the stack is empty: a constructed
  // but unused stack has stack_offset == 0 with a full-capacity stack_base,
  // which the original offset-guarded delete leaked. delete[] NULL is a no-op.
  if (src[instance].stack_base != NULL)
  {
    delete[] src[instance].stack_base;
    src[instance].stack_base = NULL;
  }
  src[instance].stack_offset = 0;
}

#endif