Program Listing for File memory.cuh
↰ Return to documentation for file (src/memory.cuh)
// cuEVM: CUDA Ethereum Virtual Machine implementation
// Copyright 2023 Stefan-Dan Ciocirlan (SBIP - Singapore Blockchain Innovation Programme)
// Author: Stefan-Dan Ciocirlan
// Date: 2023-11-30
// SPDX-License-Identifier: MIT
#ifndef _MEMORY_H_
#define _MEMORY_H_
#include "utils.h"
template <class params>
class memory_t
{
public:
typedef arith_env_t<params> arith_t;
typedef typename arith_t::bn_t bn_t;
typedef cgbn_mem_t<params::BITS> evm_word_t;
static const size_t PAGE_SIZE = params::PAGE_SIZE;
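// State of one memory instance: the current logical size in bytes, the
// number of bytes actually allocated, the memory expansion cost already
// charged, and the backing byte buffer.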
typedef struct
{
size_t size;
size_t allocated_size;
evm_word_t memory_cost;
uint8_t *data;
} memory_data_t;
memory_data_t *_content;
arith_t _arith;
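// Host-side constructor that wraps an already existing memory_data_t
// without allocating anything.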
__host__ __forceinline__ memory_t(
arith_t arith,
memory_data_t *content
) : _arith(arith),
_content(content)
{
}
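// Constructor that allocates a fresh, empty memory_data_t (size 0, no data
// buffer) and zeroes the memory cost.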
__host__ __device__ __forceinline__ memory_t(
arith_t arith
) : _arith(arith)
{
SHARED_MEMORY memory_data_t *content;
ONE_THREAD_PER_INSTANCE(
content = new memory_data_t;
content->size = 0;
content->allocated_size = 0;
content->data = NULL;
)
_content = content;
_arith.cgbn_memory_from_size_t(_content->memory_cost, 0);
}
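// Destructor: frees the data buffer, if any, and the memory_data_t itself.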
__host__ __device__ __forceinline__ ~memory_t()
{
if ((_content->allocated_size > 0) && (_content->data != NULL))
{
ONE_THREAD_PER_INSTANCE(
delete[] _content->data;
)
_content->allocated_size = 0;
_content->size = 0;
_content->data = NULL;
}
_arith.cgbn_memory_from_size_t(_content->memory_cost, 0);
ONE_THREAD_PER_INSTANCE(
delete _content;
)
_content = NULL;
}
__host__ __device__ __forceinline__ size_t size()
{
return _content->size;
}
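// Grows the backing buffer so it can hold at least new_size bytes,
// allocating whole pages of PAGE_SIZE bytes, zero-filling the new buffer
// and copying over the previously allocated contents. Does nothing if
// enough space is already allocated; sets error_code if allocation fails.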
__host__ __device__ __forceinline__ void allocate_pages(
size_t new_size,
uint32_t &error_code
)
{
if (new_size <= _content->allocated_size)
{
return;
}
size_t no_pages = (new_size / PAGE_SIZE) + 1;
SHARED_MEMORY uint8_t *new_data;
ONE_THREAD_PER_INSTANCE(
new_data = new uint8_t[no_pages * PAGE_SIZE];
if (new_data == NULL) {
error_code = ERR_MEMORY_INVALID_ALLOCATION;
return;
}
// zero-initialize the newly allocated buffer
memset(new_data, 0, no_pages * PAGE_SIZE);
if (
(_content->allocated_size > 0) &&
(_content->data != NULL)
)
{
memcpy(new_data, _content->data, _content->allocated_size);
delete[] _content->data;
_content->data = NULL;
_content->allocated_size = 0;
}
_content->allocated_size = no_pages * PAGE_SIZE;
_content->data = new_data;
)
}
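// Computes the gas charged for expanding the memory to cover the access
// [index, index+length) and adds it to gas_used, using
// memory_cost = memory_size_word * memory_size_word / 512 + GAS_MEMORY * memory_size_word
// with memory_size_word = (index + length + 31) / 32. Only the difference
// with respect to the already charged memory_cost is added. For example,
// with GAS_MEMORY = 3, a first access ending at byte offset 64 gives
// memory_size_word = 2 and a charge of 2 * 2 / 512 + 3 * 2 = 6 gas.
// Sets error_code to ERR_MEMORY_INVALID_OFFSET on overflow.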
__host__ __device__ __forceinline__ void grow_cost(
bn_t &index,
bn_t &length,
bn_t &gas_used,
uint32_t &error_code
)
{
// only a non-zero-length access can expand the memory
if (cgbn_compare_ui32(_arith._env, length, 0) > 0)
{
bn_t offset;
int32_t overflow;
size_t last_offset;
// first overflow check
overflow = cgbn_add(_arith._env, offset, index, length);
overflow = overflow | _arith.size_t_from_cgbn(last_offset, offset);
bn_t old_memory_cost;
cgbn_load(_arith._env, old_memory_cost, &(_content->memory_cost));
// memory_size_word = (offset + 31) / 32
bn_t memory_size_word;
cgbn_add_ui32(_arith._env, memory_size_word, offset, 31);
cgbn_div_ui32(_arith._env, memory_size_word, memory_size_word, 32);
// memory_cost = (memory_size_word * memory_size_word) / 512 + 3 * memory_size_word
bn_t memory_cost;
cgbn_mul(_arith._env, memory_cost, memory_size_word, memory_size_word);
cgbn_div_ui32(_arith._env, memory_cost, memory_cost, 512);
bn_t tmp;
cgbn_mul_ui32(_arith._env, tmp, memory_size_word, GAS_MEMORY);
cgbn_add(_arith._env, memory_cost, memory_cost, tmp);
// gas_used = gas_used + memory_cost - old_memory_cost
bn_t memory_expansion_cost;
if (cgbn_compare(_arith._env, memory_cost, old_memory_cost) == 1)
{
cgbn_sub(_arith._env, memory_expansion_cost, memory_cost, old_memory_cost);
// set the new memory cost
cgbn_store(_arith._env, &(_content->memory_cost), memory_cost);
}
else
{
cgbn_set_ui32(_arith._env, memory_expansion_cost, 0);
}
cgbn_add(_arith._env, gas_used, gas_used, memory_expansion_cost);
// size is always a multiple of 32
cgbn_mul_ui32(_arith._env, offset, memory_size_word, 32);
// check that the rounded-up size fits in a size_t
overflow = overflow | _arith.size_t_from_cgbn(last_offset, offset);
if (overflow != 0)
{
error_code = ERR_MEMORY_INVALID_OFFSET;
}
}
}
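// Returns index + length rounded up to a multiple of 32 bytes, i.e. the
// memory size needed to cover the access; sets error_code if the value
// does not fit in a size_t.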
__host__ __device__ __forceinline__ size_t get_last_offset(
bn_t &index,
bn_t &length,
uint32_t &error_code
)
{
bn_t offset;
int32_t overflow;
size_t last_offset;
// first overflow check
overflow = cgbn_add(_arith._env, offset, index, length);
overflow = overflow | _arith.size_t_from_cgbn(last_offset, offset);
bn_t memory_size_word;
cgbn_add_ui32(_arith._env, memory_size_word, offset, 31);
cgbn_div_ui32(_arith._env, memory_size_word, memory_size_word, 32);
cgbn_mul_ui32(_arith._env, offset, memory_size_word, 32);
// get the new size
overflow = overflow | _arith.size_t_from_cgbn(last_offset, offset);
if (overflow != 0)
{
error_code = ERR_MEMORY_INVALID_OFFSET;
}
return last_offset;
}
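// Grows the logical size (and the allocation, if needed) so the memory
// covers the access [index, index+length) rounded up to a multiple of 32
// bytes.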
__host__ __device__ __forceinline__ void grow(
bn_t &index,
bn_t &length,
uint32_t &error_code
)
{
size_t offset = get_last_offset(index, length, error_code);
if (
(error_code == ERR_NONE) &&
(offset > _content->size)
)
{
if (offset > _content->allocated_size)
{
allocate_pages(offset, error_code);
}
_content->size = offset;
}
}
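// Returns a pointer to the memory at index after growing it to cover
// [index, index+length); returns NULL for a zero-length access or on error.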
__host__ __device__ __forceinline__ uint8_t *get(
bn_t &index,
bn_t &length,
uint32_t &error_code
)
{
if (cgbn_compare_ui32(_arith._env, length, 0) > 0)
{
grow(index, length, error_code);
size_t index_s;
if (error_code == ERR_NONE)
{
_arith.size_t_from_cgbn(index_s, index);
return _content->data + index_s;
}
}
return NULL;
}
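// Writes available_size bytes from data at the given index after growing
// the memory to cover [index, index+length).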
__host__ __device__ __forceinline__ void set(
uint8_t *data,
bn_t &index,
bn_t &length,
size_t &available_size,
uint32_t &error_code
)
{
if (cgbn_compare_ui32(_arith._env, length, 0) > 0)
{
size_t index_s;
grow(index, length, error_code);
_arith.size_t_from_cgbn(index_s, index);
if (
(data != NULL) &&
(available_size > 0) &&
(error_code == ERR_NONE)
)
{
ONE_THREAD_PER_INSTANCE(
memcpy(_content->data + index_s, data, available_size);
)
}
}
}
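// Deep-copies the current memory contents and memory cost into dst,
// releasing any buffer dst already owns.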
__host__ __device__ __forceinline__ void to_memory_data_t(memory_data_t &dst)
{
// free any buffer previously owned by dst
if (
(dst.allocated_size > 0) &&
(dst.data != NULL)
)
{
ONE_THREAD_PER_INSTANCE(
delete[] dst.data;
)
dst.data = NULL;
dst.allocated_size = 0;
dst.size = 0;
_arith.cgbn_memory_from_size_t(dst.memory_cost, 0);
}
dst.size = _content->size;
dst.allocated_size = _content->size;
bn_t memory_cost;
cgbn_load(_arith._env, memory_cost, &(_content->memory_cost));
cgbn_store(_arith._env, &(dst.memory_cost), memory_cost);
if (_content->size > 0)
{
ONE_THREAD_PER_INSTANCE(
dst.data = new uint8_t[_content->size];
memcpy(dst.data, _content->data, _content->size);
)
}
else
{
dst.data = NULL;
}
}
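// Allocates count zero-initialized memory_data_t instances on the host.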
__host__ static memory_data_t *get_cpu_instances(
uint32_t count
)
{
memory_data_t *instances;
instances = new memory_data_t[count];
memset(instances, 0, sizeof(memory_data_t) * count);
return instances;
}
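// Frees the host instances together with their data buffers.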
__host__ static void free_cpu_instances(
memory_data_t *instances,
uint32_t count
)
{
for (uint32_t idx = 0; idx < count; idx++)
{
if (
(instances[idx].data != NULL) &&
(instances[idx].allocated_size > 0)
)
{
delete[] instances[idx].data;
instances[idx].data = NULL;
instances[idx].allocated_size = 0;
}
}
delete[] instances;
}
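// Copies the host instances to the device: each data buffer is allocated
// with cudaMalloc and copied first, then the array of memory_data_t
// structures itself.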
__host__ static memory_data_t *get_gpu_instances_from_cpu_instances(
memory_data_t *cpu_instances,
uint32_t count
)
{
memory_data_t *gpu_instances;
memory_data_t *tmp_cpu_instances;
tmp_cpu_instances = new memory_data_t[count];
memcpy(tmp_cpu_instances, cpu_instances, sizeof(memory_data_t) * count);
for (size_t idx = 0; idx < count; idx++)
{
if (
(tmp_cpu_instances[idx].allocated_size > 0) &&
(tmp_cpu_instances[idx].data != NULL)
)
{
CUDA_CHECK(cudaMalloc(
(void **)&tmp_cpu_instances[idx].data,
sizeof(uint8_t) * tmp_cpu_instances[idx].allocated_size
));
CUDA_CHECK(cudaMemcpy(
tmp_cpu_instances[idx].data,
cpu_instances[idx].data,
sizeof(uint8_t) * tmp_cpu_instances[idx].allocated_size,
cudaMemcpyHostToDevice
));
}
else
{
tmp_cpu_instances[idx].data = NULL;
}
}
CUDA_CHECK(cudaMalloc(
(void **)&gpu_instances,
sizeof(memory_data_t) * count
));
CUDA_CHECK(cudaMemcpy(
gpu_instances,
tmp_cpu_instances,
sizeof(memory_data_t) * count, cudaMemcpyHostToDevice
));
delete[] tmp_cpu_instances;
tmp_cpu_instances = NULL;
return gpu_instances;
}
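// Frees the device instances and their device data buffers.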
__host__ static void free_gpu_instances(
memory_data_t *gpu_instances,
uint32_t count
)
{
memory_data_t *tmp_cpu_instances;
tmp_cpu_instances = new memory_data_t[count];
CUDA_CHECK(cudaMemcpy(
tmp_cpu_instances,
gpu_instances,
sizeof(memory_data_t) * count, cudaMemcpyDeviceToHost
));
for (size_t idx = 0; idx < count; idx++)
{
if (
(tmp_cpu_instances[idx].allocated_size > 0) &&
(tmp_cpu_instances[idx].data != NULL)
)
{
CUDA_CHECK(cudaFree(tmp_cpu_instances[idx].data));
}
}
CUDA_CHECK(cudaFree(gpu_instances));
delete[] tmp_cpu_instances;
tmp_cpu_instances = NULL;
}
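// Copies the device instances back to the host in three steps: fresh
// device buffers are allocated with cudaMalloc, a kernel copies each
// instance's data into them (the originals may live on the device heap),
// and the buffers are then copied into newly allocated host memory.
// The device instances are freed in the process.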
__host__ static memory_data_t *get_cpu_instances_from_gpu_instances(
memory_data_t *gpu_instances,
uint32_t count
)
{
memory_data_t *cpu_instances;
cpu_instances = new memory_data_t[count];
CUDA_CHECK(cudaMemcpy(
cpu_instances,
gpu_instances,
sizeof(memory_data_t) * count,
cudaMemcpyDeviceToHost
));
// 1. allocate new device buffers with cudaMalloc so their contents can later be copied to the CPU with cudaMemcpy
memory_data_t *tmp_cpu_instances, *tmp_gpu_instances;
tmp_cpu_instances = new memory_data_t[count];
memcpy(
tmp_cpu_instances,
cpu_instances,
sizeof(memory_data_t) * count
);
for (size_t idx = 0; idx < count; idx++)
{
if (tmp_cpu_instances[idx].size > 0)
{
CUDA_CHECK(cudaMalloc(
(void **)&tmp_cpu_instances[idx].data,
sizeof(uint8_t) * tmp_cpu_instances[idx].size
));
}
else
{
tmp_cpu_instances[idx].data = NULL;
}
tmp_cpu_instances[idx].allocated_size = tmp_cpu_instances[idx].size;
}
CUDA_CHECK(cudaMalloc(
(void **)&tmp_gpu_instances,
sizeof(memory_data_t) * count
));
CUDA_CHECK(cudaMemcpy(
tmp_gpu_instances,
tmp_cpu_instances,
sizeof(memory_data_t) * count,
cudaMemcpyHostToDevice
));
delete[] tmp_cpu_instances;
tmp_cpu_instances = NULL;
// 2. use a kernel to copy each instance's data from the old device buffers into the new ones
kernel_get_memory<params><<<count, 1>>>(tmp_gpu_instances, gpu_instances, count);
CUDA_CHECK(cudaDeviceSynchronize());
CUDA_CHECK(cudaFree(gpu_instances));
gpu_instances = tmp_gpu_instances;
// 3. copy the device buffers back into host memory
CUDA_CHECK(cudaMemcpy(
cpu_instances,
gpu_instances,
sizeof(memory_data_t) * count,
cudaMemcpyDeviceToHost
));
tmp_cpu_instances = new memory_data_t[count];
memcpy(
tmp_cpu_instances,
cpu_instances,
sizeof(memory_data_t) * count
);
for (size_t idx = 0; idx < count; idx++)
{
if (tmp_cpu_instances[idx].size > 0)
{
tmp_cpu_instances[idx].data = new uint8_t[tmp_cpu_instances[idx].size];
CUDA_CHECK(cudaMemcpy(
tmp_cpu_instances[idx].data,
cpu_instances[idx].data,
sizeof(uint8_t) * tmp_cpu_instances[idx].size,
cudaMemcpyDeviceToHost
));
}
else
{
tmp_cpu_instances[idx].data = NULL;
}
}
free_gpu_instances(gpu_instances, count);
memcpy(
cpu_instances,
tmp_cpu_instances,
sizeof(memory_data_t) * count
);
delete[] tmp_cpu_instances;
tmp_cpu_instances = NULL;
return cpu_instances;
}
__host__ __device__ __forceinline__ static void print_memory_data_t(
arith_t &arith,
memory_data_t &memory_data
)
{
printf("size=%lu\n", memory_data.size);
printf("allocated_size=%lu\n", memory_data.allocated_size);
printf("memory_cost=");
arith.print_cgbn_memory(memory_data.memory_cost);
if (memory_data.size > 0)
print_bytes(memory_data.data, memory_data.size);
}
__host__ __device__ void print()
{
print_memory_data_t(_arith, *_content);
}
__host__ static cJSON *json_from_memory_data_t(
arith_t &arith,
memory_data_t &memory_data
)
{
cJSON *memory_json = cJSON_CreateObject();
cJSON_AddNumberToObject(memory_json, "size", memory_data.size);
cJSON_AddNumberToObject(memory_json, "allocated_size", memory_data.allocated_size);
char *hex_string_ptr = new char[arith_t::BYTES * 2 + 3];
arith.hex_string_from_cgbn_memory(hex_string_ptr, memory_data.memory_cost);
cJSON_AddStringToObject(memory_json, "memory_cost", hex_string_ptr);
if (memory_data.size > 0)
{
char *bytes_string = hex_from_bytes(memory_data.data, memory_data.size);
cJSON_AddStringToObject(memory_json, "data", bytes_string);
delete[] bytes_string;
}
else
{
cJSON_AddStringToObject(memory_json, "data", "0x");
}
delete[] hex_string_ptr;
hex_string_ptr = NULL;
return memory_json;
}
__host__ cJSON *json()
{
return json_from_memory_data_t(_arith, *_content);
}
};
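// Kernel used by get_cpu_instances_from_gpu_instances: each thread copies
// one instance's data into the destination buffer and frees the source
// buffer.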
template <class params>
__global__ void kernel_get_memory(
typename memory_t<params>::memory_data_t *dst_instances,
typename memory_t<params>::memory_data_t *src_instances,
uint32_t instance_count
)
{
uint32_t instance = blockIdx.x * blockDim.x + threadIdx.x;
if (instance >= instance_count)
return;
if (src_instances[instance].size > 0)
{
memcpy(
dst_instances[instance].data,
src_instances[instance].data,
src_instances[instance].size * sizeof(uint8_t)
);
delete[] src_instances[instance].data;
src_instances[instance].data = NULL;
}
}
#endif