diff --git a/CMakeLists.txt b/CMakeLists.txt index 70454f537b5af922fc1d83bf114731bdbce647c3..47c1b9fd2902c84217bbce132541db6419c0774c 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -26,7 +26,7 @@ message(STATUS "Found Nektar++: version ${NEKTAR++_VERSION}") set(CMAKE_INSTALL_RPATH "${NEKTAR++_LIBRARY_DIRS}") -set(SRC Field.cpp Operators/Operator.cpp Operators/OperatorBwdTrans.cpp Operators/BwdTrans/BwdTransImpl.cpp Operators/OperatorIProductWRTBase.cpp Operators/IProductWRTBase/IProductWRTBaseImpl.cpp Operators/OperatorPhysDeriv.cpp Operators/PhysDeriv/PhysDerivImpl.cpp) +set(SRC Field.cpp Operators/Operator.cpp Operators/OperatorBwdTrans.cpp Operators/BwdTrans/BwdTransImpl.cpp Operators/OperatorIProductWRTBase.cpp Operators/IProductWRTBase/IProductWRTBaseImpl.cpp Operators/OperatorIProductWRTDerivBase.cpp Operators/IProductWRTDerivBase/IProductWRTDerivBaseImpl.cpp Operators/OperatorPhysDeriv.cpp Operators/PhysDeriv/PhysDerivImpl.cpp) if (NEKTAR_USE_CUDA AND NEKTAR_USE_SIMD) MESSAGE(FATAL_ERROR "Cannot use both SIMD and CUDA") @@ -35,7 +35,7 @@ endif() if (NEKTAR_USE_CUDA) enable_language(CUDA) add_definitions(-DNEKTAR_USE_CUDA) - set(SRC ${SRC} MemoryRegionCUDA.cu Operators/BwdTrans/BwdTransCUDA.cu Operators/IProductWRTBase/IProductWRTBaseCUDA.cu Operators/PhysDeriv/PhysDerivCUDA.cu) + set(SRC ${SRC} MemoryRegionCUDA.cu Operators/BwdTrans/BwdTransCUDA.cu Operators/IProductWRTBase/IProductWRTBaseCUDA.cu Operators/IProductWRTDerivBase/IProductWRTDerivBaseCUDA.cu Operators/PhysDeriv/PhysDerivCUDA.cu) endif() if (NEKTAR_USE_SIMD) diff --git a/Operators/IProductWRTBase/IProductWRTBaseCUDA.hpp b/Operators/IProductWRTBase/IProductWRTBaseCUDA.hpp index 54b99215c02118699537b4ce17a528c0d4c36406..ec4d03beba0b190e9c274168946befab788e3ae3 100644 --- a/Operators/IProductWRTBase/IProductWRTBaseCUDA.hpp +++ b/Operators/IProductWRTBase/IProductWRTBaseCUDA.hpp @@ -1,3 +1,5 @@ +#pragma once + #include "MemoryRegionCUDA.hpp" #include "Operators/IProductWRTBase/IProductWRTBaseCUDAKernels.cuh" #include "Operators/OperatorHelper.cuh" @@ -6,14 +8,16 @@ namespace Nektar::Operators::detail { -template <typename TData, bool APPEND = false, bool DEFORMED = false> +template <typename TData, bool SCALE = false, bool APPEND = false, + bool DEFORMED = false> void IProductWRTBase1DKernel(const size_t gridSize, const size_t blockSize, const size_t nm0, const size_t nq0, const size_t nElmts, const TData *basis0, const TData *w0, const TData *jac, const TData *in, - TData *out); + TData *out, TData scale = 1.0); -template <typename TData, bool APPEND = false, bool DEFORMED = false> +template <typename TData, bool SCALE = false, bool APPEND = false, + bool DEFORMED = false> void IProductWRTBase2DKernel(const size_t gridSize, const size_t blockSize, LibUtilities::ShapeType shapetype, const size_t nm0, const size_t nm1, @@ -21,16 +25,20 @@ void IProductWRTBase2DKernel(const size_t gridSize, const size_t blockSize, const size_t nElmts, const bool correct, const TData *basis0, const TData *basis1, const TData *w0, const TData *w1, const TData *jac, - const TData *in, TData *out); + const TData *in, TData *out, TData scale = 1.0); -template <typename TData, bool APPEND = false, bool DEFORMED = false> -void IProductWRTBase3DKernel( - const size_t gridSize, const size_t blockSize, - LibUtilities::ShapeType shapetype, const size_t nm0, const size_t nm1, - const size_t nm2, const size_t nq0, const size_t nq1, const size_t nq2, - const size_t nElmts, const bool correct, const TData *basis0, - const TData *basis1, const TData *basis2, const TData *w0, const TData *w1, - const TData *w2, const TData *jac, const TData *in, TData *out); +template <typename TData, bool SCALE = false, bool APPEND = false, + bool DEFORMED = false> +void IProductWRTBase3DKernel(const size_t gridSize, const size_t blockSize, + LibUtilities::ShapeType shapetype, + const size_t nm0, const size_t nm1, + const size_t nm2, const size_t nq0, + const size_t nq1, const size_t nq2, + const size_t nElmts, const bool correct, + const TData *basis0, const TData *basis1, + const TData *basis2, const TData *w0, + const TData *w1, const TData *w2, const TData *jac, + const TData *in, TData *out, TData scale = 1.0); // IProductWRTBase implementation template <typename TData> @@ -49,7 +57,7 @@ public: cudaMemcpy(m_jac, jac.get(), sizeof(TData) * jacSize, cudaMemcpyHostToDevice); - // Initialize basiskey. + // Initialize basis. m_basis = GetBasisDataCUDA<TData>(expansionList); // Initialize weight. @@ -58,25 +66,14 @@ public: ~OperatorIProductWRTBaseImpl(void) { - for (auto &basis : m_basis) - { - for (size_t i = 0; i < basis.second.size(); i++) - { - cudaFree(basis.second[i]); - } - } - for (auto &weight : m_weight) - { - for (size_t i = 0; i < weight.second.size(); i++) - { - cudaFree(weight.second[i]); - } - } + DeallocateDataCUDA<TData>(m_basis); + DeallocateDataCUDA<TData>(m_weight); cudaFree(m_jac); } void apply(Field<TData, FieldState::Phys> &in, - Field<TData, FieldState::Coeff> &out) override + Field<TData, FieldState::Coeff> &out, + const TData lambda = 1.0) override { // Copy memory to GPU, if necessary and get raw pointers. auto const *inptr = @@ -126,15 +123,33 @@ public: auto nq0 = expPtr->GetNumPoints(0); if (deformed) { - IProductWRTBase1DKernel<TData, false, true>( - m_gridSize, m_blockSize, nm0, nq0, nElmts, basis0, w0, - jacptr, inptr, outptr); + if (lambda == 1.0) + { + IProductWRTBase1DKernel<TData, false, false, true>( + m_gridSize, m_blockSize, nm0, nq0, nElmts, basis0, + w0, jacptr, inptr, outptr); + } + else + { + IProductWRTBase1DKernel<TData, true, false, true>( + m_gridSize, m_blockSize, nm0, nq0, nElmts, basis0, + w0, jacptr, inptr, outptr, lambda); + } } else { - IProductWRTBase1DKernel<TData, false, false>( - m_gridSize, m_blockSize, nm0, nq0, nElmts, basis0, w0, - jacptr, inptr, outptr); + if (lambda == 1.0) + { + IProductWRTBase1DKernel<TData, false, false, false>( + m_gridSize, m_blockSize, nm0, nq0, nElmts, basis0, + w0, jacptr, inptr, outptr); + } + else + { + IProductWRTBase1DKernel<TData, true, false, false>( + m_gridSize, m_blockSize, nm0, nq0, nElmts, basis0, + w0, jacptr, inptr, outptr, lambda); + } } } else if (expPtr->GetShapeDimension() == 2) @@ -150,17 +165,37 @@ public: auto nq1 = expPtr->GetNumPoints(1); if (deformed) { - IProductWRTBase2DKernel<TData, false, true>( - m_gridSize, m_blockSize, shape, nm0, nm1, nq0, nq1, - nElmts, correct, basis0, basis1, w0, w1, jacptr, inptr, - outptr); + if (lambda == 1.0) + { + IProductWRTBase2DKernel<TData, false, false, true>( + m_gridSize, m_blockSize, shape, nm0, nm1, nq0, nq1, + nElmts, correct, basis0, basis1, w0, w1, jacptr, + inptr, outptr); + } + else + { + IProductWRTBase2DKernel<TData, true, false, true>( + m_gridSize, m_blockSize, shape, nm0, nm1, nq0, nq1, + nElmts, correct, basis0, basis1, w0, w1, jacptr, + inptr, outptr, lambda); + } } else { - IProductWRTBase2DKernel<TData, false, false>( - m_gridSize, m_blockSize, shape, nm0, nm1, nq0, nq1, - nElmts, correct, basis0, basis1, w0, w1, jacptr, inptr, - outptr); + if (lambda == 1.0) + { + IProductWRTBase2DKernel<TData, false, false, false>( + m_gridSize, m_blockSize, shape, nm0, nm1, nq0, nq1, + nElmts, correct, basis0, basis1, w0, w1, jacptr, + inptr, outptr); + } + else + { + IProductWRTBase2DKernel<TData, true, false, false>( + m_gridSize, m_blockSize, shape, nm0, nm1, nq0, nq1, + nElmts, correct, basis0, basis1, w0, w1, jacptr, + inptr, outptr, lambda); + } } } else if (expPtr->GetShapeDimension() == 3) @@ -180,17 +215,37 @@ public: auto nq2 = expPtr->GetNumPoints(2); if (deformed) { - IProductWRTBase3DKernel<TData, false, true>( - m_gridSize, m_blockSize, shape, nm0, nm1, nm2, nq0, nq1, - nq2, nElmts, correct, basis0, basis1, basis2, w0, w1, - w2, jacptr, inptr, outptr); + if (lambda == 1.0) + { + IProductWRTBase3DKernel<TData, false, false, true>( + m_gridSize, m_blockSize, shape, nm0, nm1, nm2, nq0, + nq1, nq2, nElmts, correct, basis0, basis1, basis2, + w0, w1, w2, jacptr, inptr, outptr); + } + else + { + IProductWRTBase3DKernel<TData, true, false, true>( + m_gridSize, m_blockSize, shape, nm0, nm1, nm2, nq0, + nq1, nq2, nElmts, correct, basis0, basis1, basis2, + w0, w1, w2, jacptr, inptr, outptr, lambda); + } } else { - IProductWRTBase3DKernel<TData, false, false>( - m_gridSize, m_blockSize, shape, nm0, nm1, nm2, nq0, nq1, - nq2, nElmts, correct, basis0, basis1, basis2, w0, w1, - w2, jacptr, inptr, outptr); + if (lambda == 1.0) + { + IProductWRTBase3DKernel<TData, false, false, false>( + m_gridSize, m_blockSize, shape, nm0, nm1, nm2, nq0, + nq1, nq2, nElmts, correct, basis0, basis1, basis2, + w0, w1, w2, jacptr, inptr, outptr); + } + else + { + IProductWRTBase3DKernel<TData, true, false, false>( + m_gridSize, m_blockSize, shape, nm0, nm1, nm2, nq0, + nq1, nq2, nElmts, correct, basis0, basis1, basis2, + w0, w1, w2, jacptr, inptr, outptr, lambda); + } } } @@ -220,18 +275,19 @@ private: size_t m_gridSize; }; -template <typename TData, bool APPEND, bool DEFORMED> +template <typename TData, bool SCALE, bool APPEND, bool DEFORMED> void IProductWRTBase1DKernel(const size_t gridSize, const size_t blockSize, const size_t nm0, const size_t nq0, const size_t nElmts, const TData *basis0, const TData *w0, const TData *jac, const TData *in, - TData *out) + TData *out, TData scale) { - IProductWRTBaseSegKernel<TData, false, APPEND, DEFORMED> - <<<gridSize, blockSize>>>(nm0, nq0, nElmts, basis0, w0, jac, in, out); + IProductWRTBaseSegKernel<TData, SCALE, APPEND, DEFORMED> + <<<gridSize, blockSize>>>(nm0, nq0, nElmts, basis0, w0, jac, in, out, + scale); } -template <typename TData, bool APPEND, bool DEFORMED> +template <typename TData, bool SCALE, bool APPEND, bool DEFORMED> void IProductWRTBase2DKernel(const size_t gridSize, const size_t blockSize, LibUtilities::ShapeType shapetype, const size_t nm0, const size_t nm1, @@ -239,67 +295,67 @@ void IProductWRTBase2DKernel(const size_t gridSize, const size_t blockSize, const size_t nElmts, const bool correct, const TData *basis0, const TData *basis1, const TData *w0, const TData *w1, const TData *jac, - const TData *in, TData *out) + const TData *in, TData *out, TData scale) { if (shapetype == LibUtilities::Quad) { - IProductWRTBaseQuadKernel<TData, false, APPEND, DEFORMED> + IProductWRTBaseQuadKernel<TData, SCALE, APPEND, DEFORMED> <<<gridSize, blockSize>>>(nm0, nm1, nq0, nq1, nElmts, basis0, - basis1, w0, w1, jac, in, out); + basis1, w0, w1, jac, in, out, scale); } else if (shapetype == LibUtilities::Tri) { size_t nmTot = LibUtilities::StdTriData::getNumberOfCoefficients(nm0, nm1); - IProductWRTBaseTriKernel<TData, false, APPEND, DEFORMED> + IProductWRTBaseTriKernel<TData, SCALE, APPEND, DEFORMED> <<<gridSize, blockSize>>>(nm0, nm1, nmTot, nq0, nq1, nElmts, correct, basis0, basis1, w0, w1, jac, in, - out); + out, scale); } } -template <typename TData, bool APPEND, bool DEFORMED> +template <typename TData, bool SCALE, bool APPEND, bool DEFORMED> void IProductWRTBase3DKernel( const size_t gridSize, const size_t blockSize, LibUtilities::ShapeType shapetype, const size_t nm0, const size_t nm1, const size_t nm2, const size_t nq0, const size_t nq1, const size_t nq2, const size_t nElmts, const bool correct, const TData *basis0, const TData *basis1, const TData *basis2, const TData *w0, const TData *w1, - const TData *w2, const TData *jac, const TData *in, TData *out) + const TData *w2, const TData *jac, const TData *in, TData *out, TData scale) { if (shapetype == LibUtilities::Hex) { - IProductWRTBaseHexKernel<TData, false, APPEND, DEFORMED> + IProductWRTBaseHexKernel<TData, SCALE, APPEND, DEFORMED> <<<gridSize, blockSize>>>(nm0, nm1, nm2, nq0, nq1, nq2, nElmts, basis0, basis1, basis2, w0, w1, w2, jac, - in, out); + in, out, scale); } else if (shapetype == LibUtilities::Tet) { size_t nmTot = LibUtilities::StdTetData::getNumberOfCoefficients(nm0, nm1, nm2); - IProductWRTBaseTetKernel<TData, false, APPEND, DEFORMED> + IProductWRTBaseTetKernel<TData, SCALE, APPEND, DEFORMED> <<<gridSize, blockSize>>>(nm0, nm1, nm2, nmTot, nq0, nq1, nq2, nElmts, correct, basis0, basis1, basis2, - w0, w1, w2, jac, in, out); + w0, w1, w2, jac, in, out, scale); } else if (shapetype == LibUtilities::Pyr) { size_t nmTot = LibUtilities::StdPyrData::getNumberOfCoefficients(nm0, nm1, nm2); - IProductWRTBasePyrKernel<TData, false, APPEND, DEFORMED> + IProductWRTBasePyrKernel<TData, SCALE, APPEND, DEFORMED> <<<gridSize, blockSize>>>(nm0, nm1, nm2, nmTot, nq0, nq1, nq2, nElmts, correct, basis0, basis1, basis2, - w0, w1, w2, jac, in, out); + w0, w1, w2, jac, in, out, scale); } else if (shapetype == LibUtilities::Prism) { size_t nmTot = LibUtilities::StdPrismData::getNumberOfCoefficients(nm0, nm1, nm2); - IProductWRTBasePrismKernel<TData, false, APPEND, DEFORMED> + IProductWRTBasePrismKernel<TData, SCALE, APPEND, DEFORMED> <<<gridSize, blockSize>>>(nm0, nm1, nm2, nmTot, nq0, nq1, nq2, nElmts, correct, basis0, basis1, basis2, - w0, w1, w2, jac, in, out); + w0, w1, w2, jac, in, out, scale); } } diff --git a/Operators/IProductWRTBase/IProductWRTBaseCUDAKernels.cuh b/Operators/IProductWRTBase/IProductWRTBaseCUDAKernels.cuh index 8ca73cd463e193e638b5b29ab0795faa12cadebb..e4fd4f06e7a2eb5d612ca6bf40acd413e64afeff 100644 --- a/Operators/IProductWRTBase/IProductWRTBaseCUDAKernels.cuh +++ b/Operators/IProductWRTBase/IProductWRTBaseCUDAKernels.cuh @@ -28,6 +28,10 @@ __global__ void IProductWRTBaseSegKernel(const size_t nm0, const size_t nq0, TData jac_val = DEFORMED ? jacptr[i] : jacptr[0]; sum += inptr[i] * basis0[p * nq0 + i] * jac_val * w0[i]; } + if (SCALE) + { + sum *= scale; + } outptr[p] = APPEND ? outptr[p] + sum : sum; } } @@ -76,9 +80,14 @@ __global__ void IProductWRTBaseQuadKernel( { sum += wsp[j] * basis1[q * nq1 + j] * w1[j]; } + if (SCALE) + { + sum *= scale; + } outptr[q * nm0 + p] = APPEND ? outptr[q * nm0 + p] + sum : sum; } } + delete wsp; } template <typename TData, bool SCALE, bool APPEND, bool DEFORMED> @@ -128,7 +137,11 @@ __global__ void IProductWRTBaseTriKernel( { sum += wsp[eta1] * basis1[mode * nq1 + eta1] * w1[eta1]; } - outptr[mode++] = APPEND ? outptr[mode++] + sum : sum; + if (SCALE) + { + sum *= scale; + } + outptr[mode++] = APPEND ? outptr[mode] + sum : sum; } } @@ -158,8 +171,9 @@ __global__ void IProductWRTBaseTriKernel( iprod_01 += prod * basis0[nq0 + eta0]; } } - outptr[1] += iprod_01; + outptr[1] += SCALE ? iprod_01 * scale : iprod_01; } + delete wsp; } template <typename TData, bool SCALE, bool APPEND, bool DEFORMED> @@ -227,11 +241,17 @@ __global__ void IProductWRTBaseHexKernel( { sum += wsp1[k] * basis2[r * nq2 + k] * w2[k]; } + if (SCALE) + { + sum *= scale; + } outptr[r * nm0 * nm1 + q * nm0 + p] = APPEND ? outptr[r * nm0 * nm1 + q * nm0 + p] + sum : sum; } } } + delete wsp0; + delete wsp1; } // NOTE: Not workign when nm2 > nm1 @@ -303,7 +323,11 @@ __global__ void IProductWRTBaseTetKernel( { tmp += wsp1[k] * basis2[mode2 * nq2 + k] * w2[k]; } - outptr[cnt_pqr++] = APPEND ? outptr[cnt_pqr++] + tmp : tmp; + if (SCALE) + { + tmp *= scale; + } + outptr[cnt_pqr++] = APPEND ? outptr[cnt_pqr] + tmp : tmp; } } } @@ -342,26 +366,28 @@ __global__ void IProductWRTBaseTetKernel( tmp *= inptr[cnt] * tmpQ; // add to existing entry - outptr[1] += tmp; + outptr[1] += SCALE ? tmp * scale : tmp; // bottom vertex // tmp = basis0[nq0 + i] * basis1[nq1 + j] * basis2[k] * inptr[cnt] * tmpQ; - outptr[nm2] += tmp; + outptr[nm2] += SCALE ? tmp * scale : tmp; // singular edge for (size_t r = 1; r < nm2 - 1; ++r) { tmp = basis2[(r + 1) * nq2 + k] * basis1[nq1 + j] * basis0[nq0 + i] * inptr[cnt] * tmpQ; - outptr[nm2 + r] += tmp; + outptr[nm2 + r] += SCALE ? tmp * scale : tmp; } cnt++; } } } } + delete wsp0; + delete wsp1; } template <typename TData, bool SCALE, bool APPEND, bool DEFORMED> @@ -431,8 +457,11 @@ __global__ void IProductWRTBasePrismKernel( { sum_k += basis2[(mode_pr + r) * nq2 + k] * w2[k] * wsp1[k]; } - outptr[mode_pqr++] = - APPEND ? outptr[mode_pqr++] + sum_k : sum_k; + if (SCALE) + { + sum_k *= scale; + } + outptr[mode_pqr++] = APPEND ? outptr[mode_pqr] + sum_k : sum_k; } } mode_pr += nm2 - p; @@ -477,9 +506,12 @@ __global__ void IProductWRTBasePrismKernel( for (size_t q = 0; q < nm1; ++q) { - outptr[nm2 * q + 1] += wsp2[q]; + outptr[nm2 * q + 1] += SCALE ? wsp2[q] * scale : wsp2[q]; } } + delete wsp0; + delete wsp1; + delete wsp2; } // NOTE: Not workign when nm2 > nm1 @@ -549,8 +581,11 @@ __global__ void IProductWRTBasePyrKernel( { sum_k += basis2[mode_pqr * nq2 + k] * w2[k] * wsp1[k]; } - outptr[mode_pqr++] = - APPEND ? outptr[mode_pqr++] + sum_k : sum_k; + if (SCALE) + { + sum_k *= scale; + } + outptr[mode_pqr++] = APPEND ? outptr[mode_pqr] + sum_k : sum_k; } } @@ -574,8 +609,11 @@ __global__ void IProductWRTBasePyrKernel( { sum_k += basis2[mode_pqr * nq2 + k] * w2[k] * wsp1[k]; } - outptr[mode_pqr++] = - APPEND ? outptr[mode_pqr++] + sum_k : sum_k; + if (SCALE) + { + sum_k *= scale; + } + outptr[mode_pqr++] = APPEND ? outptr[mode_pqr] + sum_k : sum_k; } } } @@ -612,11 +650,13 @@ __global__ void IProductWRTBasePyrKernel( tmp *= inptr[cnt++] * tmpQ; // add to existing entry - outptr[1] += tmp; + outptr[1] += SCALE ? tmp * scale : tmp; } } } } + delete wsp0; + delete wsp1; } } // namespace Nektar::Operators::detail diff --git a/Operators/IProductWRTBase/IProductWRTBaseStdMat.hpp b/Operators/IProductWRTBase/IProductWRTBaseStdMat.hpp index dc5edd48e0aa6be326ddfbb2fdd07b3d2db646e7..fb2aeeddc62b1678ee1c32cce364054209936dec 100644 --- a/Operators/IProductWRTBase/IProductWRTBaseStdMat.hpp +++ b/Operators/IProductWRTBase/IProductWRTBaseStdMat.hpp @@ -20,7 +20,8 @@ public: } void apply(Field<TData, FieldState::Phys> &in, - Field<TData, FieldState::Coeff> &out) override + Field<TData, FieldState::Coeff> &out, + const TData lambda = 1.0) override { auto const *inptr = in.GetStorage().GetCPUPtr(); auto *outptr = out.GetStorage().GetCPUPtr(); @@ -64,7 +65,7 @@ public: } Blas::Dgemm('N', 'N', matPtr->GetRows(), nElmts, - matPtr->GetColumns(), 1.0, matPtr->GetRawPtr(), + matPtr->GetColumns(), lambda, matPtr->GetRawPtr(), matPtr->GetRows(), wsp.get(), nqTot, 0.0, outptr, nmTot); diff --git a/Operators/IProductWRTDerivBase/IProductWRTDerivBaseCUDA.cu b/Operators/IProductWRTDerivBase/IProductWRTDerivBaseCUDA.cu new file mode 100644 index 0000000000000000000000000000000000000000..8153c1d9dc8425b8c99f0395ffef0b04742476a8 --- /dev/null +++ b/Operators/IProductWRTDerivBase/IProductWRTDerivBaseCUDA.cu @@ -0,0 +1,10 @@ +#include "IProductWRTDerivBaseCUDA.hpp" + +namespace Nektar::Operators::detail +{ +template <> +std::string OperatorIProductWRTDerivBaseImpl<double, ImplCUDA>::className = + GetOperatorFactory<double>().RegisterCreatorFunction( + "IProductWRTDerivBaseCUDA", + OperatorIProductWRTDerivBaseImpl<double, ImplCUDA>::instantiate, "..."); +} diff --git a/Operators/IProductWRTDerivBase/IProductWRTDerivBaseCUDA.hpp b/Operators/IProductWRTDerivBase/IProductWRTDerivBaseCUDA.hpp new file mode 100644 index 0000000000000000000000000000000000000000..497a60f3415a0648983ca1fd2edd3af0ae4b8aa4 --- /dev/null +++ b/Operators/IProductWRTDerivBase/IProductWRTDerivBaseCUDA.hpp @@ -0,0 +1,418 @@ +#include "MemoryRegionCUDA.hpp" +#include "Operators/IProductWRTBase/IProductWRTBaseCUDA.hpp" +#include "Operators/IProductWRTDerivBase/IProductWRTDerivBaseCUDAKernels.cuh" +#include "Operators/OperatorHelper.cuh" +#include "Operators/OperatorIProductWRTDerivBase.hpp" + +namespace Nektar::Operators::detail +{ + +template <typename TData, bool DEFORMED = false> +void IProductWRTDerivBase1DKernel(const size_t gridSize, const size_t blockSize, + const size_t nq0, const size_t nCoord, + const size_t nElmts, const size_t dfSize, + TData *df, TData *in0, TData *in1, TData *in2, + TData *out0); + +template <typename TData, bool DEFORMED = false> +void IProductWRTDerivBase2DKernel(const size_t gridSize, const size_t blockSize, + LibUtilities::ShapeType shapetype, + const size_t nq0, const size_t nq1, + const size_t nCoord, const size_t nElmts, + const TData *Z0, const TData *Z1, + const size_t dfSize, TData *df, TData *in0, + TData *in1, TData *in2, TData *out0, + TData *out1); + +template <typename TData, bool DEFORMED = false> +void IProductWRTDerivBase3DKernel( + const size_t gridSize, const size_t blockSize, + LibUtilities::ShapeType shapetype, const size_t nq0, const size_t nq1, + const size_t nq2, const size_t nCoord, const size_t nElmts, const TData *Z0, + const TData *Z1, const TData *Z2, const size_t dfSize, TData *df, + TData *in0, TData *in1, TData *in2, TData *out0, TData *out1, TData *out2); + +// IProductWRTDerivBase implementation +template <typename TData> +class OperatorIProductWRTDerivBaseImpl<TData, ImplCUDA> + : public OperatorIProductWRTDerivBase<TData> +{ +public: + OperatorIProductWRTDerivBaseImpl( + const MultiRegions::ExpListSharedPtr &expansionList) + : OperatorIProductWRTDerivBase<TData>(expansionList) + { + size_t nDim = this->m_expansionList->GetShapeDimension(); + size_t nCoord = this->m_expansionList->GetCoordim(0); + m_dfSize = Operator<TData>::GetGeometricFactorSize(); + + // Initialise jacobian. + auto jac = Operator<TData>::SetJacobian(m_dfSize); + cudaMalloc((void **)&m_jac, sizeof(TData) * m_dfSize); + cudaMemcpy(m_jac, jac.get(), sizeof(TData) * m_dfSize, + cudaMemcpyHostToDevice); + + // Initialise derivative factor. + auto derivFac = Operator<TData>::SetDerivativeFactor(m_dfSize); + cudaMalloc((void **)&m_derivFac, + sizeof(TData) * nDim * nCoord * m_dfSize); + for (size_t d = 0; d < nDim * nCoord; d++) + { + auto hostPtr = derivFac[d].get(); + auto devicePtr = m_derivFac + d * m_dfSize; + cudaMemcpy(devicePtr, hostPtr, sizeof(TData) * m_dfSize, + cudaMemcpyHostToDevice); + } + + // Initialize basis. + m_basis = GetBasisDataCUDA<TData>(expansionList); + + // Initialize basis derivative. + m_dbasis = GetDeriveBasisDataCUDA<TData>(expansionList); + + // Initialize weight. + m_weight = GetWeightDataCUDA<TData>(expansionList); + + // Initialize points. + m_Z = GetPointDataCUDA<TData>(expansionList); + + // Initialize derivative matrix. + m_D = GetDerivativeDataCUDA<TData>(expansionList); + + // Initialize workspace memory. + auto ndata = this->m_expansionList->GetTotPoints(); + cudaMalloc((void **)&m_wsp0, sizeof(TData) * ndata); + if (nCoord > 1) + { + cudaMalloc((void **)&m_wsp1, sizeof(TData) * ndata); + } + if (nCoord > 2) + { + cudaMalloc((void **)&m_wsp2, sizeof(TData) * ndata); + } + } + + ~OperatorIProductWRTDerivBaseImpl(void) + { + DeallocateDataCUDA<TData>(m_basis); + DeallocateDataCUDA<TData>(m_dbasis); + DeallocateDataCUDA<TData>(m_weight); + DeallocateDataCUDA<TData>(m_Z); + DeallocateDataCUDA<TData>(m_D); + cudaFree(m_jac); + cudaFree(m_derivFac); + } + + void apply(Field<TData, FieldState::Phys> &in0, + Field<TData, FieldState::Phys> &in1, + Field<TData, FieldState::Phys> &in2, + Field<TData, FieldState::Coeff> &out, + bool APPEND = false) override + { + if (!APPEND) // Zero output (temporary solution) + { + auto *tmpptr = + out.template GetStorage<MemoryRegionCUDA>().GetCPUPtr(); + for (size_t block_idx = 0; block_idx < out.GetBlocks().size(); + ++block_idx) + { + for (size_t i = 0; i < out.GetBlocks()[block_idx].block_size; + i++) + { + *(tmpptr++) = 0.0; + } + } + } + + // Copy memory to GPU, if necessary and get raw pointers. + std::vector<TData *> inptr{ + in0.template GetStorage<MemoryRegionCUDA>().GetGPUPtr(), + in1.template GetStorage<MemoryRegionCUDA>().GetGPUPtr(), + in2.template GetStorage<MemoryRegionCUDA>().GetGPUPtr()}; + auto *outptr = out.template GetStorage<MemoryRegionCUDA>().GetGPUPtr(); + + std::vector<TData *> wspptr{m_wsp0, m_wsp1, m_wsp2}; + auto jacptr = m_jac; + auto dfptr = m_derivFac; + + // Initialize index. + size_t expIdx = 0; + + // Initialize basiskey. + std::vector<LibUtilities::BasisKey> basisKeys( + 3, LibUtilities::NullBasisKey); + + // Loop over the blocks. + for (size_t block_idx = 0; block_idx < out.GetBlocks().size(); + ++block_idx) + { + auto const expPtr = this->m_expansionList->GetExp(expIdx); + auto nElmts = out.GetBlocks()[block_idx].num_elements; + auto nqTot = expPtr->GetTotPoints(); + auto nmTot = expPtr->GetNcoeffs(); + auto nDim = expPtr->GetShapeDimension(); + auto nCoord = expPtr->GetCoordim(); + auto shape = expPtr->DetShapeType(); + auto deformed = expPtr->GetMetricInfo()->GetGtype() == + SpatialDomains::eDeformed; + + // Deterime CUDA grid parameters. + m_gridSize = nElmts / m_blockSize; + m_gridSize += (nElmts % m_blockSize == 0) ? 0 : 1; + + // Flag for collapsed coordinate correction. + bool correct = expPtr->GetBasis(0)->GetBasisType() == + LibUtilities::eModified_A; + + // Fetch basis key for the current element type. + for (size_t d = 0; d < nDim; d++) + { + basisKeys[d] = expPtr->GetBasis(d)->GetBasisKey(); + } + + // Function call to kernel functions. + if (nDim == 1) + { + auto dbasis0 = m_dbasis[basisKeys][0]; + auto w0 = m_weight[basisKeys][0]; + auto D0 = m_D[basisKeys][0]; + auto nm0 = expPtr->GetBasisNumModes(0); + auto nq0 = expPtr->GetNumPoints(0); + if (deformed) + { + IProductWRTDerivBase1DKernel<TData, true>( + m_gridSize, m_blockSize, nq0, nCoord, nElmts, m_dfSize, + dfptr, inptr[0], inptr[1], inptr[2], wspptr[0]); + IProductWRTBase1DKernel<TData, false, true, true>( + m_gridSize, m_blockSize, nm0, nq0, nElmts, dbasis0, w0, + jacptr, wspptr[0], outptr); + } + else + { + IProductWRTDerivBase1DKernel<TData, false>( + m_gridSize, m_blockSize, nq0, nCoord, nElmts, m_dfSize, + dfptr, inptr[0], inptr[1], inptr[2], wspptr[0]); + IProductWRTBase1DKernel<TData, false, true, false>( + m_gridSize, m_blockSize, nm0, nq0, nElmts, dbasis0, w0, + jacptr, wspptr[0], outptr); + } + } + else if (nDim == 2) + { + auto basis0 = m_basis[basisKeys][0]; + auto basis1 = m_basis[basisKeys][1]; + auto dbasis0 = m_dbasis[basisKeys][0]; + auto dbasis1 = m_dbasis[basisKeys][1]; + auto w0 = m_weight[basisKeys][0]; + auto w1 = m_weight[basisKeys][1]; + auto D0 = m_D[basisKeys][0]; + auto D1 = m_D[basisKeys][1]; + auto Z0 = m_Z[basisKeys][0]; + auto Z1 = m_Z[basisKeys][1]; + auto nm0 = expPtr->GetBasisNumModes(0); + auto nm1 = expPtr->GetBasisNumModes(1); + auto nq0 = expPtr->GetNumPoints(0); + auto nq1 = expPtr->GetNumPoints(1); + if (deformed) + { + IProductWRTDerivBase2DKernel<TData, true>( + m_gridSize, m_blockSize, shape, nq0, nq1, nCoord, + nElmts, Z0, Z1, m_dfSize, dfptr, inptr[0], inptr[1], + inptr[2], wspptr[0], wspptr[1]); + IProductWRTBase2DKernel<TData, false, true, true>( + m_gridSize, m_blockSize, shape, nm0, nm1, nq0, nq1, + nElmts, correct, dbasis0, basis1, w0, w1, jacptr, + wspptr[0], outptr); + IProductWRTBase2DKernel<TData, false, true, true>( + m_gridSize, m_blockSize, shape, nm0, nm1, nq0, nq1, + nElmts, correct, basis0, dbasis1, w0, w1, jacptr, + wspptr[1], outptr); + } + else + { + IProductWRTDerivBase2DKernel<TData, false>( + m_gridSize, m_blockSize, shape, nq0, nq1, nCoord, + nElmts, Z0, Z1, m_dfSize, dfptr, inptr[0], inptr[1], + inptr[2], wspptr[0], wspptr[1]); + IProductWRTBase2DKernel<TData, false, true, false>( + m_gridSize, m_blockSize, shape, nm0, nm1, nq0, nq1, + nElmts, correct, dbasis0, basis1, w0, w1, jacptr, + wspptr[0], outptr); + IProductWRTBase2DKernel<TData, false, true, false>( + m_gridSize, m_blockSize, shape, nm0, nm1, nq0, nq1, + nElmts, correct, basis0, dbasis1, w0, w1, jacptr, + wspptr[1], outptr); + } + } + else if (nDim == 3) + { + auto basis0 = m_basis[basisKeys][0]; + auto basis1 = m_basis[basisKeys][1]; + auto basis2 = m_basis[basisKeys][2]; + auto dbasis0 = m_dbasis[basisKeys][0]; + auto dbasis1 = m_dbasis[basisKeys][1]; + auto dbasis2 = m_dbasis[basisKeys][2]; + auto w0 = m_weight[basisKeys][0]; + auto w1 = m_weight[basisKeys][1]; + auto w2 = m_weight[basisKeys][2]; + auto D0 = m_D[basisKeys][0]; + auto D1 = m_D[basisKeys][1]; + auto D2 = m_D[basisKeys][2]; + auto Z0 = m_Z[basisKeys][0]; + auto Z1 = m_Z[basisKeys][1]; + auto Z2 = m_Z[basisKeys][2]; + auto nm0 = expPtr->GetBasisNumModes(0); + auto nm1 = expPtr->GetBasisNumModes(1); + auto nm2 = expPtr->GetBasisNumModes(2); + auto nq0 = expPtr->GetNumPoints(0); + auto nq1 = expPtr->GetNumPoints(1); + auto nq2 = expPtr->GetNumPoints(2); + if (deformed) + { + IProductWRTDerivBase3DKernel<TData, true>( + m_gridSize, m_blockSize, shape, nq0, nq1, nq2, nCoord, + nElmts, Z0, Z1, Z2, m_dfSize, dfptr, inptr[0], inptr[1], + inptr[2], wspptr[0], wspptr[1], wspptr[2]); + IProductWRTBase3DKernel<TData, false, true, true>( + m_gridSize, m_blockSize, shape, nm0, nm1, nm2, nq0, nq1, + nq2, nElmts, correct, dbasis0, basis1, basis2, w0, w1, + w2, jacptr, wspptr[0], outptr); + IProductWRTBase3DKernel<TData, false, true, true>( + m_gridSize, m_blockSize, shape, nm0, nm1, nm2, nq0, nq1, + nq2, nElmts, correct, basis0, dbasis1, basis2, w0, w1, + w2, jacptr, wspptr[1], outptr); + IProductWRTBase3DKernel<TData, false, true, true>( + m_gridSize, m_blockSize, shape, nm0, nm1, nm2, nq0, nq1, + nq2, nElmts, correct, basis0, basis1, dbasis2, w0, w1, + w2, jacptr, wspptr[2], outptr); + } + else + { + IProductWRTDerivBase3DKernel<TData, false>( + m_gridSize, m_blockSize, shape, nq0, nq1, nq2, nCoord, + nElmts, Z0, Z1, Z2, m_dfSize, dfptr, inptr[0], inptr[1], + inptr[2], wspptr[0], wspptr[1], wspptr[2]); + IProductWRTBase3DKernel<TData, false, true, false>( + m_gridSize, m_blockSize, shape, nm0, nm1, nm2, nq0, nq1, + nq2, nElmts, correct, dbasis0, basis1, basis2, w0, w1, + w2, jacptr, wspptr[0], outptr); + IProductWRTBase3DKernel<TData, false, true, false>( + m_gridSize, m_blockSize, shape, nm0, nm1, nm2, nq0, nq1, + nq2, nElmts, correct, basis0, dbasis1, basis2, w0, w1, + w2, jacptr, wspptr[1], outptr); + IProductWRTBase3DKernel<TData, false, true, false>( + m_gridSize, m_blockSize, shape, nm0, nm1, nm2, nq0, nq1, + nq2, nElmts, correct, basis0, basis1, dbasis2, w0, w1, + w2, jacptr, wspptr[2], outptr); + } + } + + // Increment pointer and index for next element type. + jacptr += deformed ? nqTot * nElmts : nElmts; + dfptr += deformed ? nqTot * nElmts : nElmts; + for (size_t d = 0; d < nDim; d++) + { + inptr[d] += nqTot * nElmts; + wspptr[d] += nqTot * nElmts; + } + outptr += nmTot * nElmts; + expIdx += nElmts; + } + } + + static std::unique_ptr<Operator<TData>> instantiate( + MultiRegions::ExpListSharedPtr expansionList) + { + return std::make_unique< + OperatorIProductWRTDerivBaseImpl<TData, ImplCUDA>>(expansionList); + } + + static std::string className; + +private: + TData *m_wsp0, *m_wsp1, *m_wsp2; + TData *m_derivFac; + TData *m_jac; + std::map<std::vector<LibUtilities::BasisKey>, std::vector<TData *>> m_basis; + std::map<std::vector<LibUtilities::BasisKey>, std::vector<TData *>> + m_dbasis; + std::map<std::vector<LibUtilities::BasisKey>, std::vector<TData *>> + m_weight; + std::map<std::vector<LibUtilities::BasisKey>, std::vector<TData *>> m_D; + std::map<std::vector<LibUtilities::BasisKey>, std::vector<TData *>> m_Z; + size_t m_dfSize; + size_t m_blockSize = 32; + size_t m_gridSize; +}; + +template <typename TData, bool DEFORMED> +void IProductWRTDerivBase1DKernel(const size_t gridSize, const size_t blockSize, + const size_t nq0, const size_t nCoord, + const size_t nElmts, const size_t dfSize, + TData *df, TData *in0, TData *in1, TData *in2, + TData *out0) +{ + IProductWRTDerivBaseSegKernel<TData, DEFORMED><<<gridSize, blockSize>>>( + nq0, nCoord, nElmts, dfSize, df, in0, in1, in2, out0); +} + +template <typename TData, bool DEFORMED> +void IProductWRTDerivBase2DKernel(const size_t gridSize, const size_t blockSize, + LibUtilities::ShapeType shapetype, + const size_t nq0, const size_t nq1, + const size_t nCoord, const size_t nElmts, + const TData *Z0, const TData *Z1, + const size_t dfSize, TData *df, TData *in0, + TData *in1, TData *in2, TData *out0, + TData *out1) +{ + if (shapetype == LibUtilities::Quad) + { + IProductWRTDerivBaseQuadKernel<TData, DEFORMED> + <<<gridSize, blockSize>>>(nq0, nq1, nCoord, nElmts, dfSize, df, in0, + in1, in2, out0, out1); + } + else if (shapetype == LibUtilities::Tri) + { + IProductWRTDerivBaseTriKernel<TData, DEFORMED> + <<<gridSize, blockSize>>>(nq0, nq1, nCoord, nElmts, Z0, Z1, dfSize, + df, in0, in1, in2, out0, out1); + } +} + +template <typename TData, bool DEFORMED> +void IProductWRTDerivBase3DKernel( + const size_t gridSize, const size_t blockSize, + LibUtilities::ShapeType shapetype, const size_t nq0, const size_t nq1, + const size_t nq2, const size_t nCoord, const size_t nElmts, const TData *Z0, + const TData *Z1, const TData *Z2, const size_t dfSize, TData *df, + TData *in0, TData *in1, TData *in2, TData *out0, TData *out1, TData *out2) +{ + if (shapetype == LibUtilities::Hex) + { + IProductWRTDerivBaseHexKernel<TData, DEFORMED> + <<<gridSize, blockSize>>>(nq0, nq1, nq2, nCoord, nElmts, dfSize, df, + in0, in1, in2, out0, out1, out2); + } + else if (shapetype == LibUtilities::Tet) + { + IProductWRTDerivBaseTetKernel<TData, DEFORMED><<<gridSize, blockSize>>>( + nq0, nq1, nq2, nCoord, nElmts, Z0, Z1, Z2, dfSize, df, in0, in1, + in2, out0, out1, out2); + } + else if (shapetype == LibUtilities::Pyr) + { + IProductWRTDerivBasePyrKernel<TData, DEFORMED><<<gridSize, blockSize>>>( + nq0, nq1, nq2, nCoord, nElmts, Z0, Z1, Z2, dfSize, df, in0, in1, + in2, out0, out1, out2); + } + else if (shapetype == LibUtilities::Prism) + { + IProductWRTDerivBasePrismKernel<TData, DEFORMED> + <<<gridSize, blockSize>>>(nq0, nq1, nq2, nCoord, nElmts, Z0, Z2, + dfSize, df, in0, in1, in2, out0, out1, + out2); + } +} + +} // namespace Nektar::Operators::detail diff --git a/Operators/IProductWRTDerivBase/IProductWRTDerivBaseCUDAKernels.cuh b/Operators/IProductWRTDerivBase/IProductWRTDerivBaseCUDAKernels.cuh new file mode 100644 index 0000000000000000000000000000000000000000..433a294497a7d07a175dac8dc78f23ffd08de2ce --- /dev/null +++ b/Operators/IProductWRTDerivBase/IProductWRTDerivBaseCUDAKernels.cuh @@ -0,0 +1,419 @@ +namespace Nektar::Operators::detail +{ +template <typename TData, bool DEFORMED> +__global__ void IProductWRTDerivBaseSegKernel(const size_t nq0, + const size_t ncoord, + const size_t nelmt, + const size_t dfSize, TData *df, + TData *in0, TData *in1, + TData *in2, TData *out0) +{ + size_t e = blockDim.x * blockIdx.x + threadIdx.x; + + if (e >= nelmt) + { + return; + } + + size_t ndf = ncoord; + + // Assign pointers. + TData **inptr = new TData *[ncoord]; + inptr[0] = in0 + (nq0 * e); + if (ncoord > 1) + { + inptr[1] = in1 + (nq0 * e); + } + if (ncoord > 2) + { + inptr[2] = in2 + (nq0 * e); + } + + TData *outptr0 = out0 + (nq0 * e); + + TData **dfptr = new TData *[ndf]; + for (size_t d = 0; d < ndf; d++) + { + dfptr[d] = df + d * dfSize; + dfptr[d] += DEFORMED ? nq0 * e : e; + } + + // Calculate dxi/dx in[0] + dxi/dy in[1] + dxi/dz in[2] + for (size_t i = 0; i < nq0; ++i) + { + size_t dfindex = DEFORMED ? i : 0; + outptr0[i] = dfptr[0][dfindex] * inptr[0][i]; + for (size_t d = 1; d < ncoord; ++d) + { + outptr0[i] += (dfptr[d][dfindex] * inptr[d][i]); + } + } + delete[] inptr; + delete[] dfptr; +} + +template <typename TData, bool DEFORMED> +__global__ void IProductWRTDerivBaseQuadKernel( + const size_t nq0, const size_t nq1, const size_t ncoord, const size_t nelmt, + const size_t dfSize, TData *df, TData *in0, TData *in1, TData *in2, + TData *out0, TData *out1) +{ + size_t e = blockDim.x * blockIdx.x + threadIdx.x; + + if (e >= nelmt) + { + return; + } + + const auto ndf = 2 * ncoord; + + // Assign pointers. + TData **inptr = new TData *[ncoord]; + inptr[0] = in0 + (nq0 * nq1 * e); + inptr[1] = in1 + (nq0 * nq1 * e); + if (ncoord > 2) + { + inptr[2] = in2 + (nq0 * nq1 * e); + } + + TData *outptr0 = out0 + (nq0 * nq1 * e); + TData *outptr1 = out1 + (nq0 * nq1 * e); + + TData **dfptr = new TData *[ndf]; + for (size_t d = 0; d < ndf; d++) + { + dfptr[d] = df + d * dfSize; + dfptr[d] += DEFORMED ? nq0 * nq1 * e : e; + } + + // Calculate dxi/dx in[0] + dxi/dy in[1] + dxi/dz in[2] + for (size_t i = 0; i < nq0 * nq1; ++i) + { + size_t dfindex = DEFORMED ? i : 0; + outptr0[i] = dfptr[0][dfindex] * inptr[0][i]; + outptr1[i] = dfptr[1][dfindex] * inptr[0][i]; + for (size_t d = 1; d < ncoord; ++d) + { + outptr0[i] += (dfptr[2 * d][dfindex] * inptr[d][i]); + outptr1[i] += (dfptr[2 * d + 1][dfindex] * inptr[d][i]); + } + } + delete[] inptr; + delete[] dfptr; +} + +template <typename TData, bool DEFORMED> +__global__ void IProductWRTDerivBaseTriKernel( + const size_t nq0, const size_t nq1, const size_t ncoord, const size_t nelmt, + const TData *Z0, const TData *Z1, const size_t dfSize, TData *df, + TData *in0, TData *in1, TData *in2, TData *out0, TData *out1) +{ + size_t e = blockDim.x * blockIdx.x + threadIdx.x; + + if (e >= nelmt) + { + return; + } + + const auto ndf = 2 * ncoord; + + // Assign pointers. + TData **inptr = new TData *[ncoord]; + inptr[0] = in0 + (nq0 * nq1 * e); + inptr[1] = in1 + (nq0 * nq1 * e); + if (ncoord > 2) + { + inptr[2] = in2 + (nq0 * nq1 * e); + } + + TData *outptr0 = out0 + (nq0 * nq1 * e); + TData *outptr1 = out1 + (nq0 * nq1 * e); + + TData **dfptr = new TData *[ndf]; + for (size_t d = 0; d < ndf; d++) + { + dfptr[d] = df + d * dfSize; + dfptr[d] += DEFORMED ? nq0 * nq1 * e : e; + } + + // Calculate dxi/dx in[0] + dxi/dy in[1] + dxi/dz in[2] + for (size_t j = 0, cnt_ji = 0; j < nq1; ++j) + { + TData f0 = 2.0 / (1.0 - Z1[j]); + for (size_t i = 0; i < nq0; ++i, ++cnt_ji) + { + size_t dfindex = DEFORMED ? cnt_ji : 0; + outptr0[cnt_ji] = dfptr[0][dfindex] * inptr[0][cnt_ji]; + outptr1[cnt_ji] = dfptr[1][dfindex] * inptr[0][cnt_ji]; + for (size_t d = 1; d < ncoord; ++d) + { + outptr0[cnt_ji] += (dfptr[2 * d][dfindex] * inptr[d][cnt_ji]); + outptr1[cnt_ji] += + (dfptr[2 * d + 1][dfindex] * inptr[d][cnt_ji]); + } + + // Multiply by geometric factors + TData f1 = 0.5 * (1.0 + Z0[i]); + outptr0[cnt_ji] += outptr1[cnt_ji] * f1; + outptr0[cnt_ji] *= f0; + } + } + delete[] inptr; + delete[] dfptr; +} + +template <typename TData, bool DEFORMED> +__global__ void IProductWRTDerivBaseHexKernel( + const size_t nq0, const size_t nq1, const size_t nq2, const size_t ncoord, + const size_t nelmt, const size_t dfSize, TData *df, TData *in0, TData *in1, + TData *in2, TData *out0, TData *out1, TData *out2) +{ + size_t e = blockDim.x * blockIdx.x + threadIdx.x; + + if (e >= nelmt) + { + return; + } + + const auto ndf = 3 * ncoord; + + // Assign pointers. + TData **inptr = new TData *[ncoord]; + inptr[0] = in0 + (nq0 * nq1 * nq2 * e); + inptr[1] = in1 + (nq0 * nq1 * nq2 * e); + inptr[2] = in2 + (nq0 * nq1 * nq2 * e); + + TData *outptr0 = out0 + (nq0 * nq1 * nq2 * e); + TData *outptr1 = out1 + (nq0 * nq1 * nq2 * e); + TData *outptr2 = out2 + (nq0 * nq1 * nq2 * e); + + TData **dfptr = new TData *[ndf]; + for (size_t d = 0; d < ndf; d++) + { + dfptr[d] = df + d * dfSize; + dfptr[d] += DEFORMED ? nq0 * nq1 * nq2 * e : e; + } + + // Calculate dxi/dx in[0] + dxi/dy in[1] + dxi/dz in[2] + for (size_t i = 0; i < nq0 * nq1 * nq2; ++i) + { + size_t dfindex = DEFORMED ? i : 0; + outptr0[i] = dfptr[0][dfindex] * inptr[0][i]; + outptr1[i] = dfptr[1][dfindex] * inptr[0][i]; + outptr2[i] = dfptr[2][dfindex] * inptr[0][i]; + for (size_t d = 1; d < ncoord; ++d) + { + outptr0[i] += (dfptr[3 * d][dfindex] * inptr[d][i]); + outptr1[i] += (dfptr[3 * d + 1][dfindex] * inptr[d][i]); + outptr2[i] += (dfptr[3 * d + 2][dfindex] * inptr[d][i]); + } + } + delete[] inptr; + delete[] dfptr; +} + +template <typename TData, bool DEFORMED> +__global__ void IProductWRTDerivBaseTetKernel( + const size_t nq0, const size_t nq1, const size_t nq2, const size_t ncoord, + const size_t nelmt, const TData *Z0, const TData *Z1, const TData *Z2, + const size_t dfSize, TData *df, TData *in0, TData *in1, TData *in2, + TData *out0, TData *out1, TData *out2) +{ + size_t e = blockDim.x * blockIdx.x + threadIdx.x; + + if (e >= nelmt) + { + return; + } + + const auto ndf = 3 * ncoord; + + // Assign pointers. + TData **inptr = new TData *[ncoord]; + inptr[0] = in0 + (nq0 * nq1 * nq2 * e); + inptr[1] = in1 + (nq0 * nq1 * nq2 * e); + inptr[2] = in2 + (nq0 * nq1 * nq2 * e); + + TData *outptr0 = out0 + (nq0 * nq1 * nq2 * e); + TData *outptr1 = out1 + (nq0 * nq1 * nq2 * e); + TData *outptr2 = out2 + (nq0 * nq1 * nq2 * e); + + TData **dfptr = new TData *[ndf]; + for (size_t d = 0; d < ndf; d++) + { + dfptr[d] = df + d * dfSize; + dfptr[d] += DEFORMED ? nq0 * nq1 * nq2 * e : e; + } + + // Calculate dxi/dx in[0] + dxi/dy in[1] + dxi/dz in[2] + for (size_t k = 0, cnt_kji = 0; k < nq2; ++k) + { + TData f2 = 2.0 / (1.0 - Z2[k]); + for (size_t j = 0; j < nq1; ++j) + { + TData f3 = 0.5 * (1.0 + Z1[j]); + TData f0 = 2.0 * f2 / (1.0 - Z1[j]); + for (size_t i = 0; i < nq0; ++i, ++cnt_kji) + { + size_t dfindex = DEFORMED ? cnt_kji : 0; + outptr0[cnt_kji] = dfptr[0][dfindex] * inptr[0][cnt_kji]; + outptr1[cnt_kji] = dfptr[1][dfindex] * inptr[0][cnt_kji]; + outptr2[cnt_kji] = dfptr[2][dfindex] * inptr[0][cnt_kji]; + for (size_t d = 1; d < ncoord; ++d) + { + outptr0[cnt_kji] += + (dfptr[3 * d][dfindex] * inptr[d][cnt_kji]); + outptr1[cnt_kji] += + (dfptr[3 * d + 1][dfindex] * inptr[d][cnt_kji]); + outptr2[cnt_kji] += + (dfptr[3 * d + 2][dfindex] * inptr[d][cnt_kji]); + } + + // Multiply by geometric factors + TData f1 = 0.5 * (1.0 + Z0[i]); + outptr0[cnt_kji] += (outptr1[cnt_kji] + outptr2[cnt_kji]) * f1; + outptr0[cnt_kji] *= f0; + outptr1[cnt_kji] += outptr2[cnt_kji] * f3; + outptr1[cnt_kji] *= f2; + } + } + } + delete[] inptr; + delete[] dfptr; +} + +template <typename TData, bool DEFORMED> +__global__ void IProductWRTDerivBasePrismKernel( + const size_t nq0, const size_t nq1, const size_t nq2, const size_t ncoord, + const size_t nelmt, const TData *Z0, const TData *Z2, const size_t dfSize, + TData *df, TData *in0, TData *in1, TData *in2, TData *out0, TData *out1, + TData *out2) +{ + size_t e = blockDim.x * blockIdx.x + threadIdx.x; + + if (e >= nelmt) + { + return; + } + + const auto ndf = 3 * ncoord; + + // Assign pointers. + TData **inptr = new TData *[ncoord]; + inptr[0] = in0 + (nq0 * nq1 * nq2 * e); + inptr[1] = in1 + (nq0 * nq1 * nq2 * e); + inptr[2] = in2 + (nq0 * nq1 * nq2 * e); + + TData *outptr0 = out0 + (nq0 * nq1 * nq2 * e); + TData *outptr1 = out1 + (nq0 * nq1 * nq2 * e); + TData *outptr2 = out2 + (nq0 * nq1 * nq2 * e); + + TData **dfptr = new TData *[ndf]; + for (size_t d = 0; d < ndf; d++) + { + dfptr[d] = df + d * dfSize; + dfptr[d] += DEFORMED ? nq0 * nq1 * nq2 * e : e; + } + + // Calculate dxi/dx in[0] + dxi/dy in[1] + dxi/dz in[2] + for (size_t k = 0, cnt_kji = 0; k < nq2; ++k) + { + TData f0 = 2.0 / (1.0 - Z2[k]); + for (size_t j = 0; j < nq1; ++j) + { + for (size_t i = 0; i < nq0; ++i, ++cnt_kji) + { + size_t dfindex = DEFORMED ? cnt_kji : 0; + outptr0[cnt_kji] = dfptr[0][dfindex] * inptr[0][cnt_kji]; + outptr1[cnt_kji] = dfptr[1][dfindex] * inptr[0][cnt_kji]; + outptr2[cnt_kji] = dfptr[2][dfindex] * inptr[0][cnt_kji]; + for (size_t d = 1; d < ncoord; ++d) + { + outptr0[cnt_kji] += + (dfptr[3 * d][dfindex] * inptr[d][cnt_kji]); + outptr1[cnt_kji] += + (dfptr[3 * d + 1][dfindex] * inptr[d][cnt_kji]); + outptr2[cnt_kji] += + (dfptr[3 * d + 2][dfindex] * inptr[d][cnt_kji]); + } + + // Multiply by geometric factors + TData f1 = 0.5 * (1.0 + Z0[i]); + outptr0[cnt_kji] += outptr2[cnt_kji] * f1; + outptr0[cnt_kji] *= f0; + } + } + } + delete[] inptr; + delete[] dfptr; +} + +template <typename TData, bool DEFORMED> +__global__ void IProductWRTDerivBasePyrKernel( + const size_t nq0, const size_t nq1, const size_t nq2, const size_t ncoord, + const size_t nelmt, const TData *Z0, const TData *Z1, const TData *Z2, + const size_t dfSize, TData *df, TData *in0, TData *in1, TData *in2, + TData *out0, TData *out1, TData *out2) +{ + size_t e = blockDim.x * blockIdx.x + threadIdx.x; + + if (e >= nelmt) + { + return; + } + + const auto ndf = 3 * ncoord; + + // Assign pointers. + TData **inptr = new TData *[ncoord]; + inptr[0] = in0 + (nq0 * nq1 * nq2 * e); + inptr[1] = in1 + (nq0 * nq1 * nq2 * e); + inptr[2] = in2 + (nq0 * nq1 * nq2 * e); + + TData *outptr0 = out0 + (nq0 * nq1 * nq2 * e); + TData *outptr1 = out1 + (nq0 * nq1 * nq2 * e); + TData *outptr2 = out2 + (nq0 * nq1 * nq2 * e); + + TData **dfptr = new TData *[ndf]; + for (size_t d = 0; d < ndf; d++) + { + dfptr[d] = df + d * dfSize; + dfptr[d] += DEFORMED ? nq0 * nq1 * nq2 * e : e; + } + + // Calculate dxi/dx in[0] + dxi/dy in[1] + dxi/dz in[2] + for (size_t k = 0, cnt_kji = 0; k < nq2; ++k) + { + TData f0 = 2.0 / (1.0 - Z2[k]); + for (size_t j = 0; j < nq1; ++j) + { + TData f2 = 0.5 * (1.0 + Z1[j]); + for (size_t i = 0; i < nq0; ++i, ++cnt_kji) + { + size_t dfindex = DEFORMED ? cnt_kji : 0; + outptr0[cnt_kji] = dfptr[0][dfindex] * inptr[0][cnt_kji]; + outptr1[cnt_kji] = dfptr[1][dfindex] * inptr[0][cnt_kji]; + outptr2[cnt_kji] = dfptr[2][dfindex] * inptr[0][cnt_kji]; + for (size_t d = 1; d < ncoord; ++d) + { + outptr0[cnt_kji] += + (dfptr[3 * d][dfindex] * inptr[d][cnt_kji]); + outptr1[cnt_kji] += + (dfptr[3 * d + 1][dfindex] * inptr[d][cnt_kji]); + outptr2[cnt_kji] += + (dfptr[3 * d + 2][dfindex] * inptr[d][cnt_kji]); + } + + // Multiply by geometric factors + TData f1 = 0.5 * (1.0 + Z0[i]); + outptr0[cnt_kji] += outptr2[cnt_kji] * f1; + outptr0[cnt_kji] *= f0; + outptr1[cnt_kji] += outptr2[cnt_kji] * f2; + outptr1[cnt_kji] *= f0; + } + } + } + delete[] inptr; + delete[] dfptr; +} +} // namespace Nektar::Operators::detail diff --git a/Operators/IProductWRTDerivBase/IProductWRTDerivBaseImpl.cpp b/Operators/IProductWRTDerivBase/IProductWRTDerivBaseImpl.cpp new file mode 100644 index 0000000000000000000000000000000000000000..d7b4198758f8e643385a7ba18a1228a711aa28fa --- /dev/null +++ b/Operators/IProductWRTDerivBase/IProductWRTDerivBaseImpl.cpp @@ -0,0 +1,14 @@ +#include "IProductWRTDerivBaseStdMat.hpp" + +namespace Nektar::Operators::detail +{ + +// Add different IProductWRTDerivBase implementations to the factory. +template <> +std::string OperatorIProductWRTDerivBaseImpl<double, ImplStdMat>::className = + GetOperatorFactory<double>().RegisterCreatorFunction( + "IProductWRTDerivBaseStdMat", + OperatorIProductWRTDerivBaseImpl<double, ImplStdMat>::instantiate, + "..."); + +} // namespace Nektar::Operators::detail diff --git a/Operators/IProductWRTDerivBase/IProductWRTDerivBaseStdMat.hpp b/Operators/IProductWRTDerivBase/IProductWRTDerivBaseStdMat.hpp new file mode 100644 index 0000000000000000000000000000000000000000..e9ecbbd7c7d78ea19b9ae793039bbc9d63ed8e2c --- /dev/null +++ b/Operators/IProductWRTDerivBase/IProductWRTDerivBaseStdMat.hpp @@ -0,0 +1,213 @@ +#include "Operators/OperatorIProductWRTDerivBase.hpp" +#include <StdRegions/StdExpansion.h> + +namespace Nektar::Operators::detail +{ + +// standard matrix implementation +template <typename TData> +class OperatorIProductWRTDerivBaseImpl<TData, ImplStdMat> + : public OperatorIProductWRTDerivBase<TData> +{ +public: + OperatorIProductWRTDerivBaseImpl( + const MultiRegions::ExpListSharedPtr &expansionList) + : OperatorIProductWRTDerivBase<TData>(std::move(expansionList)) + { + size_t nTotElmts = this->m_expansionList->GetNumElmts(); + size_t nDim = this->m_expansionList->GetShapeDimension(); + + // Initialise jacobian. + size_t jacSize = Operator<TData>::GetGeometricFactorSize(); + m_jac = Operator<TData>::SetJacobian(jacSize); + m_derivFac = Operator<TData>::SetDerivativeFactor(jacSize); + + // Initialize basiskey. + std::vector<LibUtilities::BasisKey> basisKeys( + 3, LibUtilities::NullBasisKey); + + // Loop over the elements of expansionList. + for (size_t e = 0; e < nTotElmts; ++e) + { + auto const expPtr = this->m_expansionList->GetExp(e); + + // Fetch basiskeys of current element. + for (size_t d = 0; d < nDim; d++) + { + basisKeys[d] = expPtr->GetBasis(d)->GetBasisKey(); + } + + // Copy data to m_matPtr, if necessary. + if (m_matPtr.find(basisKeys) == m_matPtr.end()) + { + size_t nqTot = expPtr->GetTotPoints(); + size_t nmTot = expPtr->GetNcoeffs(); + auto &matPtr = m_matPtr[basisKeys]; + matPtr = Array<OneD, Array<OneD, TData>>(nDim); + Array<OneD, NekDouble> tmp(nqTot), t; + for (size_t d = 0; d < nDim; ++d) + { + // Get IProductWRTDerivBase matrix. + matPtr[d] = Array<OneD, TData>(nqTot * nmTot); + for (size_t i = 0; i < nqTot; ++i) + { + Vmath::Zero(nqTot, tmp, 1); + tmp[i] = 1.0; + expPtr->GetStdExp()->IProductWRTDerivBase( + d, tmp, t = matPtr[d] + i * nmTot); + } + } + } + } + + // Initialize workspace memory. + auto ndata = this->m_expansionList->GetTotPoints(); + m_wsp = Array<OneD, Array<OneD, NekDouble>>(3); + for (size_t i = 0; i < nDim; ++i) + { + m_wsp[i] = Array<OneD, NekDouble>(ndata); + } + } + + void apply(Field<TData, FieldState::Phys> &in0, + Field<TData, FieldState::Phys> &in1, + Field<TData, FieldState::Phys> &in2, + Field<TData, FieldState::Coeff> &out, + bool APPEND = false) override + { + // Copy memory to GPU, if necessary and get raw pointers. + std::vector<TData *> inptr{in0.GetStorage().GetCPUPtr(), + in1.GetStorage().GetCPUPtr(), + in2.GetStorage().GetCPUPtr()}; + auto *outptr = out.GetStorage().GetCPUPtr(); + std::vector<TData *> wspptr{m_wsp[0].get(), m_wsp[1].get(), + m_wsp[2].get()}; + + // Initialize index. + size_t expIdx = 0; + size_t jacIdx = 0; + size_t dfIdx = 0; + + // Initialize basiskey. + std::vector<LibUtilities::BasisKey> basisKeys( + 3, LibUtilities::NullBasisKey); + + // Loop over the blocks. + for (size_t block_idx = 0; block_idx < out.GetBlocks().size(); + ++block_idx) + { + // Determine shape and type of the element. + auto const expPtr = this->m_expansionList->GetExp(expIdx); + auto nElmts = out.GetBlocks()[block_idx].num_elements; + auto nDim = expPtr->GetShapeDimension(); + auto nCoord = expPtr->GetCoordim(); + auto nqTot = expPtr->GetTotPoints(); + auto nmTot = expPtr->GetNcoeffs(); + auto deformed = expPtr->GetMetricInfo()->GetGtype() == + SpatialDomains::eDeformed; + + // calculate dx/dxi in[0] + dy/dxi in[1] + dz/dxi in[2] + if (deformed) + { + for (size_t d = 0; d < nDim; ++d) + { + Vmath::Vmul(nqTot * nElmts, m_derivFac[d].get() + dfIdx, 1, + inptr[0], 1, wspptr[d], 1); + for (size_t i = 1; i < nCoord; ++i) + { + Vmath::Vvtvp(nqTot * nElmts, + m_derivFac[d + i * nDim].get() + dfIdx, 1, + inptr[i], 1, wspptr[d], 1, wspptr[d], 1); + } + } + } + else + { + for (size_t e = 0; e < nElmts; ++e) + { + for (size_t d = 0; d < nDim; ++d) + { + Vmath::Smul(nqTot, m_derivFac[d][dfIdx + e], + inptr[0] + e * nqTot, 1, + wspptr[d] + e * nqTot, 1); + for (size_t i = 1; i < nCoord; ++i) + { + Vmath::Svtvp( + nqTot, m_derivFac[d + i * nDim][dfIdx + e], + inptr[i] + e * nqTot, 1, wspptr[d] + e * nqTot, + 1, wspptr[d] + e * nqTot, 1); + } + } + } + } + + // Multiply by jacobian. + if (deformed) + { + for (size_t d = 0; d < nDim; ++d) + { + Vmath::Vmul(nqTot * nElmts, m_jac.get() + jacIdx, 1, + wspptr[d], 1, wspptr[d], 1); + } + } + else + { + for (size_t e = 0; e < nElmts; ++e) + { + for (size_t d = 0; d < nDim; ++d) + { + Vmath::Smul(nqTot, m_jac[jacIdx + e], + wspptr[d] + e * nqTot, 1, + wspptr[d] + e * nqTot, 1); + } + } + } + + // Fetch basis key for the current element type. + for (size_t d = 0; d < nDim; d++) + { + basisKeys[d] = expPtr->GetBasis(d)->GetBasisKey(); + } + + // Fetch matrix. + auto matPtr = m_matPtr[basisKeys]; + + // Matrix products. + for (size_t d = 0; d < nDim; d++) + { + TData alpha = (d == 0 && !APPEND) ? 0.0 : 1.0; + Blas::Dgemm('N', 'N', nmTot, nElmts, nqTot, 1.0, + matPtr[d].get(), nmTot, wspptr[d], nqTot, alpha, + outptr, nmTot); + + // Increment pointer and index for next element type. + inptr[d] += nqTot * nElmts; + wspptr[d] += nqTot * nElmts; + } + jacIdx += deformed ? nqTot * nElmts : nElmts; + dfIdx += deformed ? nqTot * nElmts : nElmts; + outptr += nmTot * nElmts; + expIdx += nElmts; + } + } + + static std::unique_ptr<Operator<TData>> instantiate( + const MultiRegions::ExpListSharedPtr &expansionList) + { + return std::make_unique< + OperatorIProductWRTDerivBaseImpl<TData, ImplStdMat>>( + std::move(expansionList)); + } + + static std::string className; + +private: + Array<OneD, TData> m_jac; + Array<OneD, Array<OneD, TData>> m_derivFac; + std::map<std::vector<LibUtilities::BasisKey>, + Array<OneD, Array<OneD, TData>>> + m_matPtr; + Array<OneD, Array<OneD, TData>> m_wsp; +}; + +} // namespace Nektar::Operators::detail diff --git a/Operators/OperatorHelper.cuh b/Operators/OperatorHelper.cuh index 1470e53cfc2fabb75bdeea4ddc90bee4f52bbf83..0d9f7b73c7bd29818e45fda89b6bc2ac54b36f71 100644 --- a/Operators/OperatorHelper.cuh +++ b/Operators/OperatorHelper.cuh @@ -1,3 +1,5 @@ +#pragma once + #ifdef NEKTAR_USE_CUDA #include "MemoryRegionCUDA.hpp" #endif @@ -52,11 +54,11 @@ DataMap<TData> GetBasisDataCUDA( } template <typename TData> -DataMap<TData> GetPointDataCUDA( +DataMap<TData> GetDeriveBasisDataCUDA( const MultiRegions::ExpListSharedPtr &expansionList) { // Initialize data map. - DataMap<TData> points; + DataMap<TData> dbasis; // Initialize basiskey. std::vector<LibUtilities::BasisKey> basisKeys(3, @@ -74,30 +76,30 @@ DataMap<TData> GetPointDataCUDA( basisKeys[d] = expPtr->GetBasis(d)->GetBasisKey(); } - // Copy data to points, if necessary. - if (points.find(basisKeys) == points.end()) + // Copy data to dbasis, if necessary. + if (dbasis.find(basisKeys) == dbasis.end()) { - points[basisKeys] = std::vector<TData *>(nDim, 0); + dbasis[basisKeys] = std::vector<TData *>(nDim, 0); for (size_t d = 0; d < nDim; d++) { - auto ndata = expPtr->GetBasis(d)->GetZ().size(); - auto hostPtr = expPtr->GetBasis(d)->GetZ().get(); - auto &devicePtr = points[basisKeys][d]; + auto ndata = expPtr->GetBasis(d)->GetDbdata().size(); + auto hostPtr = expPtr->GetBasis(d)->GetDbdata().get(); + auto &devicePtr = dbasis[basisKeys][d]; cudaMalloc((void **)&devicePtr, sizeof(TData) * ndata); cudaMemcpy(devicePtr, hostPtr, sizeof(TData) * ndata, cudaMemcpyHostToDevice); } } } - return points; + return dbasis; } template <typename TData> -DataMap<TData> GetDerivativeDataCUDA( +DataMap<TData> GetWeightDataCUDA( const MultiRegions::ExpListSharedPtr &expansionList) { // Initialize data map. - DataMap<TData> derivative; + DataMap<TData> weight; // Initialize basiskey. std::vector<LibUtilities::BasisKey> basisKeys(3, @@ -115,30 +117,48 @@ DataMap<TData> GetDerivativeDataCUDA( basisKeys[d] = expPtr->GetBasis(d)->GetBasisKey(); } - // Copy data to derivative, if necessary. - if (derivative.find(basisKeys) == derivative.end()) + // Copy data to weight, if necessary. + if (weight.find(basisKeys) == weight.end()) { - derivative[basisKeys] = std::vector<TData *>(nDim, 0); + weight[basisKeys] = std::vector<TData *>(nDim, 0); for (size_t d = 0; d < nDim; d++) { - auto ndata = expPtr->GetBasis(d)->GetD()->GetPtr().size(); - auto hostPtr = expPtr->GetBasis(d)->GetD()->GetPtr().get(); - auto &devicePtr = derivative[basisKeys][d]; + auto ndata = expPtr->GetBasis(d)->GetW().size(); + Array<OneD, TData> w(ndata); + if (expPtr->GetBasis(d)->GetPointsType() == + LibUtilities::eGaussRadauMAlpha1Beta0) + { + Vmath::Smul(ndata, 0.5, expPtr->GetBasis(d)->GetW().get(), + 1, w.get(), 1); + } + else if (expPtr->GetBasis(d)->GetPointsType() == + LibUtilities::eGaussRadauMAlpha2Beta0) + { + Vmath::Smul(ndata, 0.25, expPtr->GetBasis(d)->GetW().get(), + 1, w.get(), 1); + } + else + { + Vmath::Vcopy(ndata, expPtr->GetBasis(d)->GetW().get(), 1, + w.get(), 1); + } + auto hostPtr = w.get(); + auto &devicePtr = weight[basisKeys][d]; cudaMalloc((void **)&devicePtr, sizeof(TData) * ndata); cudaMemcpy(devicePtr, hostPtr, sizeof(TData) * ndata, cudaMemcpyHostToDevice); } } } - return derivative; + return weight; } template <typename TData> -DataMap<TData> GetWeightDataCUDA( +DataMap<TData> GetPointDataCUDA( const MultiRegions::ExpListSharedPtr &expansionList) { // Initialize data map. - DataMap<TData> weight; + DataMap<TData> points; // Initialize basiskey. std::vector<LibUtilities::BasisKey> basisKeys(3, @@ -156,40 +176,63 @@ DataMap<TData> GetWeightDataCUDA( basisKeys[d] = expPtr->GetBasis(d)->GetBasisKey(); } - // Copy data to weight, if necessary. - if (weight.find(basisKeys) == weight.end()) + // Copy data to points, if necessary. + if (points.find(basisKeys) == points.end()) { - weight[basisKeys] = std::vector<TData *>(nDim, 0); + points[basisKeys] = std::vector<TData *>(nDim, 0); for (size_t d = 0; d < nDim; d++) { - auto ndata = expPtr->GetBasis(d)->GetW().size(); - Array<OneD, TData> w(ndata); - if (expPtr->GetBasis(d)->GetPointsType() == - LibUtilities::eGaussRadauMAlpha1Beta0) - { - Vmath::Smul(ndata, 0.5, expPtr->GetBasis(d)->GetW().get(), - 1, w.get(), 1); - } - else if (expPtr->GetBasis(d)->GetPointsType() == - LibUtilities::eGaussRadauMAlpha2Beta0) - { - Vmath::Smul(ndata, 0.25, expPtr->GetBasis(d)->GetW().get(), - 1, w.get(), 1); - } - else - { - Vmath::Vcopy(ndata, expPtr->GetBasis(d)->GetW().get(), 1, - w.get(), 1); - } - auto hostPtr = w.get(); - auto &devicePtr = weight[basisKeys][d]; + auto ndata = expPtr->GetBasis(d)->GetZ().size(); + auto hostPtr = expPtr->GetBasis(d)->GetZ().get(); + auto &devicePtr = points[basisKeys][d]; cudaMalloc((void **)&devicePtr, sizeof(TData) * ndata); cudaMemcpy(devicePtr, hostPtr, sizeof(TData) * ndata, cudaMemcpyHostToDevice); } } } - return weight; + return points; +} + +template <typename TData> +DataMap<TData> GetDerivativeDataCUDA( + const MultiRegions::ExpListSharedPtr &expansionList) +{ + // Initialize data map. + DataMap<TData> derivative; + + // Initialize basiskey. + std::vector<LibUtilities::BasisKey> basisKeys(3, + LibUtilities::NullBasisKey); + + // Loop over the elements of expansionList. + size_t nDim = expansionList->GetShapeDimension(); + for (size_t i = 0; i < expansionList->GetNumElmts(); ++i) + { + auto const expPtr = expansionList->GetExp(i); + + // Fetch basiskeys of current element. + for (size_t d = 0; d < nDim; d++) + { + basisKeys[d] = expPtr->GetBasis(d)->GetBasisKey(); + } + + // Copy data to derivative, if necessary. + if (derivative.find(basisKeys) == derivative.end()) + { + derivative[basisKeys] = std::vector<TData *>(nDim, 0); + for (size_t d = 0; d < nDim; d++) + { + auto ndata = expPtr->GetBasis(d)->GetD()->GetPtr().size(); + auto hostPtr = expPtr->GetBasis(d)->GetD()->GetPtr().get(); + auto &devicePtr = derivative[basisKeys][d]; + cudaMalloc((void **)&devicePtr, sizeof(TData) * ndata); + cudaMemcpy(devicePtr, hostPtr, sizeof(TData) * ndata, + cudaMemcpyHostToDevice); + } + } + } + return derivative; } template <typename TData> diff --git a/Operators/OperatorIProductWRTBase.hpp b/Operators/OperatorIProductWRTBase.hpp index fe6adfdd22a9d4c76dec75b2e9b6a085b7ef7abe..05827d106f2ce6000bec29bfd6de8108db7928d1 100644 --- a/Operators/OperatorIProductWRTBase.hpp +++ b/Operators/OperatorIProductWRTBase.hpp @@ -19,7 +19,8 @@ public: } virtual void apply(Field<TData, FieldState::Phys> &in, - Field<TData, FieldState::Coeff> &out) = 0; + Field<TData, FieldState::Coeff> &out, + const TData lambda = 1.0) = 0; virtual void operator()(Field<TData, FieldState::Phys> &in, Field<TData, FieldState::Coeff> &out) { diff --git a/Operators/OperatorIProductWRTDerivBase.cpp b/Operators/OperatorIProductWRTDerivBase.cpp new file mode 100644 index 0000000000000000000000000000000000000000..7dcf177dd5a40207fe7c3c8d644e663345926047 --- /dev/null +++ b/Operators/OperatorIProductWRTDerivBase.cpp @@ -0,0 +1,10 @@ +#include "OperatorIProductWRTDerivBase.hpp" + +namespace Nektar::Operators +{ + +template <> +const std::string IProductWRTDerivBase<>::key = "IProductWRTDerivBase"; +template <> const std::string IProductWRTDerivBase<>::default_impl = "StdMat"; + +} // namespace Nektar::Operators diff --git a/Operators/OperatorIProductWRTDerivBase.hpp b/Operators/OperatorIProductWRTDerivBase.hpp new file mode 100644 index 0000000000000000000000000000000000000000..1e25c29dd7547429ea4ea761d6627e0715f7a352 --- /dev/null +++ b/Operators/OperatorIProductWRTDerivBase.hpp @@ -0,0 +1,62 @@ +#pragma once + +#include "Field.hpp" +#include "Operator.hpp" + +namespace Nektar::Operators +{ + +// IProductWRTDerivBase base class +// Defines the apply operator to enforce apply parameter types +template <typename TData> +class OperatorIProductWRTDerivBase : public Operator<TData> +{ +public: + virtual ~OperatorIProductWRTDerivBase() = default; + + OperatorIProductWRTDerivBase( + const MultiRegions::ExpListSharedPtr &expansionList) + : Operator<TData>(std::move(expansionList)) + { + } + + virtual void apply(Field<TData, FieldState::Phys> &in0, + Field<TData, FieldState::Phys> &in1, + Field<TData, FieldState::Phys> &in2, + Field<TData, FieldState::Coeff> &out, + bool APPEND = false) = 0; + virtual void operator()(Field<TData, FieldState::Phys> &in0, + Field<TData, FieldState::Phys> &in1, + Field<TData, FieldState::Phys> &in2, + Field<TData, FieldState::Coeff> &out, + bool APPEND = false) + { + apply(in0, in1, in2, out); + } +}; + +// Descriptor / traits class for IProductWRTDerivBase +template <typename TData = default_fp_type> struct IProductWRTDerivBase +{ + using class_name = OperatorIProductWRTDerivBase<TData>; + using FieldIn = Field<TData, FieldState::Phys>; + using FieldOut = Field<TData, FieldState::Coeff>; + static const std::string key; + static const std::string default_impl; + + static std::shared_ptr<class_name> create( + const MultiRegions::ExpListSharedPtr &expansionList, + std::string pKey = "") + { + return Operator<TData>::template create<IProductWRTDerivBase>( + std::move(expansionList), pKey); + } +}; + +namespace detail +{ +// Template for IProductWRTDerivBase implementations +template <typename TData, typename Op> class OperatorIProductWRTDerivBaseImpl; +} // namespace detail + +} // namespace Nektar::Operators diff --git a/main.cpp b/main.cpp index 0cca2d5a87f34e64b33b5fd375a585b2bc2eeea2..576060ccb53fdfdb94f92ed7e3390ba32a26faab 100644 --- a/main.cpp +++ b/main.cpp @@ -1,4 +1,4 @@ -/** +/*i* * @file main.cpp * @author Nektar++ Development Team * @brief Demonstrator program for the new Field class. @@ -15,6 +15,7 @@ #include "Field.hpp" #include "Operators/OperatorBwdTrans.hpp" #include "Operators/OperatorIProductWRTBase.hpp" +#include "Operators/OperatorIProductWRTDerivBase.hpp" #include "Operators/OperatorPhysDeriv.hpp" #include <LibUtilities/BasicUtils/SessionReader.h> @@ -433,6 +434,33 @@ int main(int argc, char *argv[]) std::cout << std::endl; } std::cout << std::endl; + + std::cout << "IProductWRTDerivBase (StdMat) test starts." << std::endl; + + // Create two Field objects with a MemoryRegionCPU backend by default + // for the inner product with respect to deriv base + auto outCoeff = Field<double, FieldState::Coeff>::create(blocks_coeff); + + // IProductWRTDerivBase + IProductWRTDerivBase<>::create(explist, "StdMat") + ->apply(outPhys0, outPhys1, outPhys2, outCoeff); + + // Check output values. + std::cout << "Out:" << std::endl; + auto *outptr = outCoeff.GetStorage().GetCPUPtr(); + for (auto const &block : outCoeff.GetBlocks()) + { + for (size_t el = 0; el < block.num_elements; ++el) + { + for (size_t coeff = 0; coeff < block.num_pts; ++coeff) + { + std::cout << *(outptr++) << ' '; + } + std::cout << std::endl; + } + std::cout << std::endl; + } + std::cout << std::endl; } // Test PhysDeriv (CUDA) Implementation #ifdef NEKTAR_USE_CUDA @@ -526,6 +554,36 @@ int main(int argc, char *argv[]) std::cout << std::endl; } std::cout << std::endl; + + std::cout << "IProductWRTDerivBase (CUDA) test starts." << std::endl; + + // Create two Field objects with a MemoryRegionCPU backend by default + // for the inner product with respect to deriv base + auto outCoeff = + Field<double, FieldState::Coeff>::create<MemoryRegionCUDA>( + blocks_coeff); + + // IProductWRTDerivBase + IProductWRTDerivBase<>::create(explist, "CUDA") + ->apply(outPhys0, outPhys1, outPhys2, outCoeff); + + // Check output values. + std::cout << "Out:" << std::endl; + auto *outptr = + outCoeff.template GetStorage<MemoryRegionCUDA>().GetCPUPtr(); + for (auto const &block : outCoeff.GetBlocks()) + { + for (size_t el = 0; el < block.num_elements; ++el) + { + for (size_t coeff = 0; coeff < block.num_pts; ++coeff) + { + std::cout << *(outptr++) << ' '; + } + std::cout << std::endl; + } + std::cout << std::endl; + } + std::cout << std::endl; } #endif