diff --git a/Operators/CUDAMathKernels.cuh b/Operators/CUDAMathKernels.cuh new file mode 100644 index 0000000000000000000000000000000000000000..4dc130d3409e507fa23411d5327c6cee57ba21e7 --- /dev/null +++ b/Operators/CUDAMathKernels.cuh @@ -0,0 +1,105 @@ +namespace Nektar::Operators::detail +{ + +template <typename TData> +__global__ void negKernel(const size_t nsize, const TData *x, TData *y) +{ + size_t i = blockDim.x * blockIdx.x + threadIdx.x; + + while (i < nsize) + { + y[i] = -x[i]; + i += blockDim.x * gridDim.x; + } +} + +template <typename TData> +__global__ void addKernel(const size_t nsize, const TData *x, const TData *y, + TData *z) +{ + size_t i = blockDim.x * blockIdx.x + threadIdx.x; + + while (i < nsize) + { + z[i] = x[i] + y[i]; + i += blockDim.x * gridDim.x; + } +} + +template <typename TData> +__global__ void subKernel(const size_t nsize, const TData *x, const TData *y, + TData *z) +{ + size_t i = blockDim.x * blockIdx.x + threadIdx.x; + + while (i < nsize) + { + z[i] = x[i] - y[i]; + i += blockDim.x * gridDim.x; + } +} + +template <typename TData> +__global__ void daxpyKernel(const size_t nsize, const TData alpha, + const TData *x, const TData *y, TData *z) +{ + size_t i = blockDim.x * blockIdx.x + threadIdx.x; + + while (i < nsize) + { + z[i] = alpha * x[i] + y[i]; + i += blockDim.x * gridDim.x; + } +} + +template <typename TData> +__global__ void vdivKernel(const size_t nsize, const TData *x, const TData *y, + TData *z) +{ + size_t i = blockDim.x * blockIdx.x + threadIdx.x; + + while (i < nsize) + { + z[i] = x[i] / y[i]; + i += blockDim.x * gridDim.x; + } +} + +template <typename TData> +__global__ void dotKernel(const size_t nsize, const TData *x, const TData *y, + TData *out) +{ + extern __shared__ TData s[]; + + size_t i = blockDim.x * blockIdx.x + threadIdx.x; + size_t sIndex = threadIdx.x; + + TData tmp = 0.0; + while (i < nsize) + { + tmp += x[i] * y[i]; + i += blockDim.x * gridDim.x; + } + + s[sIndex] = tmp; + + __syncthreads(); + + i = blockDim.x / 2; + while (i != 0) + { + if (sIndex < i) + { + s[sIndex] += s[sIndex + i]; + } + __syncthreads(); + i /= 2; + } + + if (threadIdx.x == 0) + { + atomicAdd(out, s[0]); + } +} + +} // namespace Nektar::Operators::detail diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 5c45a984598ad5eb08bc3c2382a8dc1c1c47521d..cae22f3c05a2a7df627e36aaba934a5156ffc188 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -4,6 +4,17 @@ find_package( COMPONENTS unit_test_framework ) +IF (NEKTAR_USE_CUDA) + add_executable(test_cudakernels test_cudakernels.cu) + target_link_libraries(test_cudakernels PRIVATE Operators) + target_link_libraries(test_cudakernels PRIVATE ${NEKTAR++_LIBRARIES}) + target_link_libraries(test_cudakernels PRIVATE Boost::unit_test_framework) + target_include_directories(test_cudakernels PRIVATE "${CMAKE_SOURCE_DIR}") + target_include_directories(test_cudakernels PRIVATE ${NEKTAR++_INCLUDE_DIRS}) + target_compile_definitions(test_cudakernels PRIVATE + -DTEST_PATH="${CMAKE_SOURCE_DIR}") +ENDIF() + add_executable(test_bwdtrans test_bwdtrans.cpp) target_link_libraries(test_bwdtrans PRIVATE Operators) target_link_libraries(test_bwdtrans PRIVATE ${NEKTAR++_LIBRARIES}) @@ -305,6 +316,14 @@ target_include_directories(test_helmsolve PRIVATE ${NEKTAR++_INCLUDE_DIRS}) target_compile_definitions(test_helmsolve PRIVATE -DTEST_PATH="${CMAKE_SOURCE_DIR}") +IF (NEKTAR_USE_CUDA) + add_test( + NAME CUDAKernels + COMMAND test_cudakernels + WORKING_DIRECTORY "${CMAKE_SOURCE_DIR}" + ) +ENDIF() + add_test( NAME BwdTrans COMMAND test_bwdtrans diff --git a/tests/init_cudakernels.hpp b/tests/init_cudakernels.hpp new file mode 100644 index 0000000000000000000000000000000000000000..46b14d5ccfd06efedc44fa0e367d3eae2b55f1b4 --- /dev/null +++ b/tests/init_cudakernels.hpp @@ -0,0 +1,51 @@ +#include "init_fields.hpp" + +using namespace std; +using namespace Nektar::Operators; +using namespace Nektar::LibUtilities; +using namespace Nektar; + +class CUDAKernelsField + : public InitFields<double, FieldState::Phys, FieldState::Phys, + MultiRegions::ContField> +{ +public: + CUDAKernelsField() + : InitFields<double, FieldState::Phys, FieldState::Phys, + MultiRegions::ContField>() + { + } + + void SetTestCase() + { + Array<OneD, NekDouble> x(fixt_explist->GetTotPoints()); + Array<OneD, NekDouble> y(fixt_explist->GetTotPoints()); + Array<OneD, NekDouble> z(fixt_explist->GetTotPoints()); + Array<OneD, NekDouble> fce(fixt_explist->GetTotPoints()); + fixt_explist->GetCoords(x, y, z); + auto func1 = fixt_explist->GetSession()->GetFunction("Forcing", 0); + func1->Evaluate(x, y, z, fce); + std::copy(fce.get(), fce.get() + fixt_explist->GetTotPoints(), + fixt_in->GetStorage().GetCPUPtr()); + std::copy( + fce.get(), fce.get() + fixt_explist->GetTotPoints(), + fixtcuda_in->template GetStorage<MemoryRegionCUDA>().GetCPUPtr()); + auto func2 = + fixt_explist->GetSession()->GetFunction("ExactSolution", 0); + func2->Evaluate(x, y, z, fce); + std::copy(fce.get(), fce.get() + fixt_explist->GetTotPoints(), + fixt_out->GetStorage().GetCPUPtr()); + std::copy( + fce.get(), fce.get() + fixt_explist->GetTotPoints(), + fixtcuda_out->template GetStorage<MemoryRegionCUDA>().GetCPUPtr()); + } +}; + +class CUDAKernels : public CUDAKernelsField +{ +public: + CUDAKernels() + { + meshName = "run/Helmholtz3D_Hex_AllBCs_P6.xml"; + } +}; diff --git a/tests/test_cudakernels.cu b/tests/test_cudakernels.cu new file mode 100644 index 0000000000000000000000000000000000000000..858b5b23007e35a1d7fa9b928730877dc91a6879 --- /dev/null +++ b/tests/test_cudakernels.cu @@ -0,0 +1,166 @@ +#define BOOST_TEST_DYN_LINK +#define BOOST_TEST_MODULE TestCUDAMathKernels +#include <boost/test/tools/output_test_stream.hpp> +#include <boost/test/unit_test.hpp> + +#include <iostream> +#include <memory> + +#include <LibUtilities/BasicUtils/Vmath.hpp> + +#include "MemoryRegionCUDA.hpp" +#include "Operators/CUDAMathKernels.cuh" +#include "init_cudakernels.hpp" + +using namespace Nektar::Operators::detail; + +BOOST_AUTO_TEST_SUITE(TestCUDAKernels) + +BOOST_FIXTURE_TEST_CASE(cuda_dotkernel, CUDAKernels) +{ + Configure(); + SetTestCase(); + size_t n = fixtcuda_in->template GetStorage<MemoryRegionCUDA>().size(); + size_t gridSize = 1024; + size_t blockSize = 32; + double out, h_out, *d_out, *x, *y; + + // Vmath results + x = fixt_in->GetStorage().GetCPUPtr(); + y = fixt_out->GetStorage().GetCPUPtr(); + out = Vmath::Dot(n, x, 1, y, 1); + + // CUDA results + cudaMalloc((void **)&d_out, sizeof(double)); + cudaMemset(d_out, 0, sizeof(double)); + x = fixtcuda_in->template GetStorage<MemoryRegionCUDA>().GetGPUPtr(); + y = fixtcuda_out->template GetStorage<MemoryRegionCUDA>().GetGPUPtr(); + dotKernel<<<gridSize, blockSize, sizeof(double) * blockSize>>>(n, x, y, + d_out); + cudaMemcpy(&h_out, d_out, sizeof(double), cudaMemcpyDeviceToHost); + + // Check results + BOOST_TEST(fabs(h_out - out) < 1.0E-12); + boost::test_tools::output_test_stream output; + { + std::cout << "CUDA = " << h_out << " Vmath = " << out << std::endl; + } +} + +BOOST_FIXTURE_TEST_CASE(cuda_addkernel, CUDAKernels) +{ + Configure(); + SetTestCase(); + size_t n = fixtcuda_in->template GetStorage<MemoryRegionCUDA>().size(); + size_t gridSize = 1024; + size_t blockSize = 32; + double *x, *y; + + // Vmath results + x = fixt_in->GetStorage().GetCPUPtr(); + y = fixt_out->GetStorage().GetCPUPtr(); + Vmath::Vadd(n, x, 1, y, 1, y, 1); + + // CUDA results + x = fixtcuda_in->template GetStorage<MemoryRegionCUDA>().GetGPUPtr(); + y = fixtcuda_out->template GetStorage<MemoryRegionCUDA>().GetGPUPtr(); + addKernel<<<gridSize, blockSize>>>(n, x, y, y); + + // Check results + BOOST_TEST(fixtcuda_out->compare(*fixt_out, 1.0E-15)); + boost::test_tools::output_test_stream output; + { + OutputIfNotMatch( + fixtcuda_out->template GetStorage<MemoryRegionCUDA>().GetCPUPtr(), + fixt_out->GetStorage().GetCPUPtr(), 1.0E-15); + } +} + +BOOST_FIXTURE_TEST_CASE(cuda_subkernel, CUDAKernels) +{ + Configure(); + SetTestCase(); + size_t n = fixtcuda_in->template GetStorage<MemoryRegionCUDA>().size(); + size_t gridSize = 1024; + size_t blockSize = 32; + double *x, *y; + + // Vmath results + x = fixt_in->GetStorage().GetCPUPtr(); + y = fixt_out->GetStorage().GetCPUPtr(); + Vmath::Vsub(n, x, 1, y, 1, y, 1); + + // CUDA results + x = fixtcuda_in->template GetStorage<MemoryRegionCUDA>().GetGPUPtr(); + y = fixtcuda_out->template GetStorage<MemoryRegionCUDA>().GetGPUPtr(); + subKernel<<<gridSize, blockSize>>>(n, x, y, y); + + // Check results + BOOST_TEST(fixtcuda_out->compare(*fixt_out, 1.0E-15)); + boost::test_tools::output_test_stream output; + { + OutputIfNotMatch( + fixtcuda_out->template GetStorage<MemoryRegionCUDA>().GetCPUPtr(), + fixt_out->GetStorage().GetCPUPtr(), 1.0E-15); + } +} + +BOOST_FIXTURE_TEST_CASE(cuda_daxpykernel, CUDAKernels) +{ + Configure(); + SetTestCase(); + size_t n = fixtcuda_in->template GetStorage<MemoryRegionCUDA>().size(); + size_t gridSize = 1024; + size_t blockSize = 32; + double *x, *y, alpha = 1.5; + + // Vmath results + x = fixt_in->GetStorage().GetCPUPtr(); + y = fixt_out->GetStorage().GetCPUPtr(); + Vmath::Svtvp(n, alpha, x, 1, y, 1, y, 1); + + // CUDA results + x = fixtcuda_in->template GetStorage<MemoryRegionCUDA>().GetGPUPtr(); + y = fixtcuda_out->template GetStorage<MemoryRegionCUDA>().GetGPUPtr(); + daxpyKernel<<<gridSize, blockSize>>>(n, alpha, x, y, y); + + // Check results + BOOST_TEST(fixtcuda_out->compare(*fixt_out, 1.0E-14)); + boost::test_tools::output_test_stream output; + { + OutputIfNotMatch( + fixtcuda_out->template GetStorage<MemoryRegionCUDA>().GetCPUPtr(), + fixt_out->GetStorage().GetCPUPtr(), 1.0E-14); + } +} + +BOOST_FIXTURE_TEST_CASE(cuda_vdivkernel, CUDAKernels) +{ + Configure(); + SetTestCase(); + size_t n = fixtcuda_in->template GetStorage<MemoryRegionCUDA>().size(); + size_t gridSize = 1024; + size_t blockSize = 32; + double *x, *y; + + // Vmath results + x = fixt_in->GetStorage().GetCPUPtr(); + y = fixt_out->GetStorage().GetCPUPtr(); + Vmath::Vdiv(n, x, 1, y, 1, y, 1); + + // CUDA results + x = fixtcuda_in->template GetStorage<MemoryRegionCUDA>().GetGPUPtr(); + y = fixtcuda_out->template GetStorage<MemoryRegionCUDA>().GetGPUPtr(); + vdivKernel<<<gridSize, blockSize>>>(n, x, y, y); + + // Check results + BOOST_TEST(fixtcuda_out->compare(*fixt_out, 1.0E-15)); + boost::test_tools::output_test_stream output; + { + OutputIfNotMatch( + fixtcuda_out->template GetStorage<MemoryRegionCUDA>().GetCPUPtr(), + fixt_out->GetStorage().GetCPUPtr(), 1.0E-15); + } +} + +BOOST_AUTO_TEST_SUITE_END()