Skip to content

Commit

Permalink
Merge branch 'dev'
Browse files Browse the repository at this point in the history
  • Loading branch information
xmrig committed Apr 4, 2022
2 parents 7d34e20 + 75535a2 commit f04d875
Show file tree
Hide file tree
Showing 14 changed files with 685 additions and 11 deletions.
3 changes: 3 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,6 @@
# v6.17.0
- [#157](https://github.com/xmrig/xmrig-cuda/pull/157) Added Dero HE (`astrobwt/v2`) support.

# v6.15.1
- [#119](https://github.com/xmrig/xmrig-cuda/issues/119) Fixed compile error on Linux.
- [#124](https://github.com/xmrig/xmrig-cuda/pull/124) Fixed `"out of memory"` error on non-CryptoNight algorithms.
Expand Down
4 changes: 4 additions & 0 deletions cmake/CUDA.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -231,6 +231,10 @@ if (WITH_ASTROBWT)
src/AstroBWT/dero/BWT.h
src/AstroBWT/dero/salsa20.h
src/AstroBWT/dero/sha3.h
src/AstroBWT/dero_he/AstroBWT_v2.cu
src/AstroBWT/dero_he/BWT.h
src/AstroBWT/dero_he/salsa20.h
src/AstroBWT/dero_he/sha3.h
)
else()
set(CUDA_ASTROBWT_SOURCES "")
Expand Down
2 changes: 1 addition & 1 deletion src/AstroBWT/dero/AstroBWT.cu
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@
#include <cuda_runtime.h>
#include <cstdint>

__device__ void sync()
static __device__ __forceinline__ void sync()
{
# if (__CUDACC_VER_MAJOR__ >= 9)
__syncwarp();
Expand Down
166 changes: 166 additions & 0 deletions src/AstroBWT/dero_he/AstroBWT_v2.cu
Original file line number Diff line number Diff line change
@@ -0,0 +1,166 @@
/* XMRig
* Copyright 2010 Jeff Garzik <jgarzik@pobox.com>
* Copyright 2012-2014 pooler <pooler@litecoinpool.org>
* Copyright 2014 Lucas Jones <https://github.com/lucasjones>
* Copyright 2014-2016 Wolf9466 <https://github.com/OhGodAPet>
* Copyright 2016 Jay D Dee <jayddee246@gmail.com>
* Copyright 2017-2018 XMR-Stak <https://github.com/fireice-uk>, <https://github.com/psychocrypt>
* Copyright 2018-2022 SChernykh <https://github.com/SChernykh>
* Copyright 2016-2022 XMRig <https://github.com/xmrig>, <support@xmrig.com>
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/


#include "cryptonight.h"
#include "cuda_device.hpp"


#include <cuda.h>
#include <cuda_runtime.h>
#include <cstdint>

static __device__ __forceinline__ void sync()
{
# if (__CUDACC_VER_MAJOR__ >= 9)
__syncwarp();
# else
__syncthreads();
# endif
}

#include "sha3.h"
#include "salsa20.h"
#include "BWT.h"
namespace tmp {
#include "../dero/sha3.h"
}


#if (__CUDACC_VER_MAJOR__ >= 11)
#include <cub/device/device_segmented_radix_sort.cuh>
#else
#include "3rdparty/cub/device/device_segmented_radix_sort.cuh"
#endif


static constexpr uint32_t DATA_STRIDE = 10240;


void astrobwt_prepare_v2(nvid_ctx *ctx, uint32_t batch_size)
{
if (batch_size == ctx->astrobwt_intensity)
return;

ctx->astrobwt_intensity = batch_size;

const uint32_t BATCH_SIZE = batch_size;
const uint32_t ALLOCATION_SIZE = BATCH_SIZE * DATA_STRIDE;

CUDA_CHECK(ctx->device_id, cudaFree(ctx->astrobwt_shares));
CUDA_CHECK(ctx->device_id, cudaFree(ctx->astrobwt_salsa20_keys));
CUDA_CHECK(ctx->device_id, cudaFree(ctx->astrobwt_bwt_data));
CUDA_CHECK(ctx->device_id, cudaFree(ctx->astrobwt_indices));
CUDA_CHECK(ctx->device_id, cudaFree(ctx->astrobwt_tmp_indices));
CUDA_CHECK(ctx->device_id, cudaFree(ctx->astrobwt_offsets_begin));
CUDA_CHECK(ctx->device_id, cudaFree(ctx->astrobwt_offsets_end));

CUDA_CHECK(ctx->device_id, cudaMalloc(&ctx->astrobwt_shares, 16 * sizeof(uint32_t)));
CUDA_CHECK(ctx->device_id, cudaMalloc(&ctx->astrobwt_salsa20_keys, BATCH_SIZE * 32));
CUDA_CHECK(ctx->device_id, cudaMalloc(&ctx->astrobwt_bwt_data, ALLOCATION_SIZE));
CUDA_CHECK(ctx->device_id, cudaMalloc(&ctx->astrobwt_indices, ALLOCATION_SIZE * 8));
CUDA_CHECK(ctx->device_id, cudaMalloc(&ctx->astrobwt_tmp_indices, ALLOCATION_SIZE * sizeof(uint32_t) + 65536));
CUDA_CHECK(ctx->device_id, cudaMalloc(&ctx->astrobwt_offsets_begin, BATCH_SIZE * sizeof(uint32_t)));
CUDA_CHECK(ctx->device_id, cudaMalloc(&ctx->astrobwt_offsets_end, BATCH_SIZE * sizeof(uint32_t)));

std::vector<uint32_t> v(BATCH_SIZE);

for (uint32_t i = 0; i < BATCH_SIZE; ++i) {
v[i] = i * DATA_STRIDE;
}
cudaMemcpy(ctx->astrobwt_offsets_begin, v.data(), BATCH_SIZE * sizeof(uint32_t), cudaMemcpyHostToDevice);

for (uint32_t i = 0; i < BATCH_SIZE; ++i) {
v[i] = i * DATA_STRIDE + 9973;
}
cudaMemcpy(ctx->astrobwt_offsets_end, v.data(), BATCH_SIZE * sizeof(uint32_t), cudaMemcpyHostToDevice);

CUDA_CHECK(ctx->device_id, cudaDeviceSynchronize());
}

namespace AstroBWT_Dero_HE {

void hash(nvid_ctx *ctx, uint32_t nonce, uint64_t target, uint32_t *rescount, uint32_t *resnonce)
{
const uint32_t BATCH_SIZE = ctx->astrobwt_intensity;
const uint32_t ALLOCATION_SIZE = BATCH_SIZE * DATA_STRIDE;

const uint32_t zero = 0;
CUDA_CHECK(ctx->device_id, cudaMemcpy((uint8_t*)(ctx->astrobwt_shares), &zero, sizeof(zero), cudaMemcpyHostToDevice));

CUDA_CHECK_KERNEL(ctx->device_id, sha3_initial<<<BATCH_SIZE, 32>>>((uint8_t*)ctx->d_input, ctx->inputlen, nonce, (uint64_t*)ctx->astrobwt_salsa20_keys));
CUDA_CHECK_KERNEL(ctx->device_id, Salsa20_XORKeyStream<<<BATCH_SIZE, 32>>>((uint32_t*)ctx->astrobwt_salsa20_keys, (uint32_t*)ctx->astrobwt_bwt_data));

uint16_t* p = (uint16_t*)ctx->astrobwt_indices;
uint16_t* keys_in = p;
uint16_t* keys_out = p + ALLOCATION_SIZE;
uint16_t* values_in = p + ALLOCATION_SIZE * 2;
uint16_t* values_out = p + ALLOCATION_SIZE * 3;

CUDA_CHECK_KERNEL(ctx->device_id, BWT_preprocess<<<BATCH_SIZE, 1024>>>(
(uint8_t*)ctx->astrobwt_bwt_data,
keys_in,
values_in
));

size_t temp_storage_bytes = ALLOCATION_SIZE * 4 + 65536;
cub::DeviceSegmentedRadixSort::SortPairs(
ctx->astrobwt_tmp_indices,
temp_storage_bytes,
keys_in,
keys_out,
values_in,
values_out,
BATCH_SIZE * DATA_STRIDE,
BATCH_SIZE,
(uint32_t*)ctx->astrobwt_offsets_begin,
(uint32_t*)ctx->astrobwt_offsets_end
);

CUDA_CHECK_KERNEL(ctx->device_id, BWT_fix_order<<<BATCH_SIZE, 1024>>>(
(uint8_t*)ctx->astrobwt_bwt_data,
keys_out,
values_out
));

CUDA_CHECK_KERNEL(ctx->device_id, sha3<<<BATCH_SIZE, 32>>>((uint8_t*)values_out, (uint64_t*)ctx->astrobwt_salsa20_keys));
CUDA_CHECK_KERNEL(ctx->device_id, find_shares<<<BATCH_SIZE / 32, 32>>>((uint64_t*)ctx->astrobwt_salsa20_keys, target, (uint32_t*)ctx->astrobwt_shares));

CUDA_CHECK(ctx->device_id, cudaDeviceSynchronize());

uint32_t shares[11];
CUDA_CHECK(ctx->device_id, cudaMemcpy(shares, ctx->astrobwt_shares, sizeof(shares), cudaMemcpyDeviceToHost));

if (shares[0] > 10)
shares[0] = 10;

*rescount = shares[0];
for (uint32_t i = 0; i < shares[0]; ++i) {
resnonce[i] = nonce + shares[i + 1];
}

ctx->astrobwt_processed_hashes = BATCH_SIZE;
}

} // AstroBWT_Dero_HE
115 changes: 115 additions & 0 deletions src/AstroBWT/dero_he/BWT.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,115 @@
/* XMRig
* Copyright 2010 Jeff Garzik <jgarzik@pobox.com>
* Copyright 2012-2014 pooler <pooler@litecoinpool.org>
* Copyright 2014 Lucas Jones <https://github.com/lucasjones>
* Copyright 2014-2016 Wolf9466 <https://github.com/OhGodAPet>
* Copyright 2016 Jay D Dee <jayddee246@gmail.com>
* Copyright 2017-2018 XMR-Stak <https://github.com/fireice-uk>, <https://github.com/psychocrypt>
* Copyright 2018-2022 SChernykh <https://github.com/SChernykh>
* Copyright 2016-2022 XMRig <https://github.com/xmrig>, <support@xmrig.com>
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/


#pragma once

namespace AstroBWT_Dero_HE {

__global__ __launch_bounds__(1024)
void BWT_preprocess(const uint8_t* datas, uint16_t* keys_in, uint16_t* values_in)
{
const uint32_t data_offset = blockIdx.x * 10240;

datas += data_offset;
keys_in += data_offset;
values_in += data_offset;

for (uint32_t i = threadIdx.x; i < 9973; i += 1024)
{
keys_in[i] = (static_cast<uint16_t>(datas[i]) << 8) | datas[i + 1];
values_in[i] = static_cast<uint16_t>(i);
}
}

__device__ __forceinline__ void fix_order(const uint8_t* input, uint32_t a, uint32_t b, uint16_t* values_in)
{
const uint32_t value_in_a = values_in[a];
const uint32_t value_in_b = values_in[b];

const uint32_t value_a =
(static_cast<uint32_t>(input[value_in_a + 2]) << 24) |
(static_cast<uint32_t>(input[value_in_a + 3]) << 16) |
(static_cast<uint32_t>(input[value_in_a + 4]) << 8) |
static_cast<uint32_t>(input[value_in_a + 5]);

const uint32_t value_b =
(static_cast<uint32_t>(input[value_in_b + 2]) << 24) |
(static_cast<uint32_t>(input[value_in_b + 3]) << 16) |
(static_cast<uint32_t>(input[value_in_b + 4]) << 8) |
static_cast<uint32_t>(input[value_in_b + 5]);

if (value_a > value_b)
{
values_in[a] = value_in_b;
values_in[b] = value_in_a;
}
}

__global__ __launch_bounds__(1024)
void BWT_fix_order(const uint8_t* datas, uint16_t* keys_in, uint16_t* values_in)
{
const uint32_t tid = threadIdx.x;
const uint32_t gid = blockIdx.x;

const uint32_t data_offset = gid * 10240;
const uint8_t* input = datas + data_offset;

const uint32_t N = 9973;

keys_in += data_offset;
values_in += data_offset;

for (uint32_t i = tid, N1 = N - 1; i < N1; i += 1024)
{
const uint16_t value = keys_in[i];
if (value == (keys_in[i + 1]))
{
if (i && (value == keys_in[i - 1]))
continue;

uint32_t n = i + 2;
while ((n < N) && (value == keys_in[n]))
++n;

for (uint32_t j = i; j < n; ++j)
for (uint32_t k = j + 1; k < n; ++k)
fix_order(input, j, k, values_in);
}
}
}

__global__ void __launch_bounds__(32) find_shares(const uint64_t* hashes, uint64_t target, uint32_t* shares)
{
const uint32_t global_index = blockIdx.x * 32 + threadIdx.x;

if (hashes[global_index * 4 + 3] < target)
{
const int idx = atomicAdd((int*)(shares), 1) + 1;
if (idx < 16)
shares[idx] = global_index;
}
}

} // AstroBWT_Dero_HE
Loading

0 comments on commit f04d875

Please sign in to comment.