diff --git a/Operators/PhysDeriv/PhysDerivCUDA.cu b/Operators/PhysDeriv/PhysDerivCUDA.cu index 1db3d8d9b2c119d009250c9e4b54ebafb9f7a71d..e07e45e1d0efc458a666a135babc81e3e67c6839 100644 --- a/Operators/PhysDeriv/PhysDerivCUDA.cu +++ b/Operators/PhysDeriv/PhysDerivCUDA.cu @@ -2,9 +2,11 @@ namespace Nektar::Operators::detail { + template <> std::string OperatorPhysDerivImpl<double, ImplCUDA>::className = GetOperatorFactory<double>().RegisterCreatorFunction( "PhysDerivCUDA", OperatorPhysDerivImpl<double, ImplCUDA>::instantiate, "..."); -} + +} // namespace Nektar::Operators::detail diff --git a/Operators/PhysDeriv/PhysDerivCUDA.hpp b/Operators/PhysDeriv/PhysDerivCUDA.hpp index 197c1e06db4ad0b49ee6675bdc8c6d2152af4d81..1e14b29b640824c265c450e91712862a1419edf5 100644 --- a/Operators/PhysDeriv/PhysDerivCUDA.hpp +++ b/Operators/PhysDeriv/PhysDerivCUDA.hpp @@ -1,39 +1,46 @@ -# pragma once +#pragma once #include "MemoryRegionCUDA.hpp" #include "Operators/OperatorHelper.cuh" #include "Operators/OperatorPhysDeriv.hpp" #include "Operators/PhysDeriv/PhysDerivCUDAKernels.cuh" +#define FLAG_QP false + namespace Nektar::Operators::detail { template <typename TData, bool DEFORMED = false> -void PhysDeriv1DKernel(const size_t gridSize, const size_t blockSize, - const size_t nq0, const size_t nCoord, - const size_t nElmts, const TData *D0, - const size_t dfSize, TData *df, const TData *in, - TData *out0, TData *out1, TData *out2); +void PhysDeriv1DKernel(const unsigned int gridSize, + const unsigned int blockSize, const unsigned int nq0, + const unsigned int nCoord, const unsigned int nElmts, + const unsigned int nSize, const unsigned int dfSize, + const TData *D0, const TData *df, const TData *in, + TData *out); template <typename TData, bool DEFORMED = false> -void PhysDeriv2DKernel(const size_t gridSize, const size_t blockSize, - LibUtilities::ShapeType shapetype, const size_t nq0, - const size_t nq1, const size_t nCoord, - const size_t nElmts, const TData *D0, const TData *D1, - const TData *Z0, const TData *Z1, const size_t dfSize, - TData *df, const TData *in, TData *out0, TData *out1, - TData *out2); +void PhysDeriv2DKernel(const unsigned int gridSize, + const unsigned int blockSize, + LibUtilities::ShapeType shapetype, + const unsigned int nq0, const unsigned int nq1, + const unsigned int nCoord, const unsigned int nElmts, + const unsigned int nSize, const unsigned int dfSize, + const TData *D0, const TData *D1, const TData *Z0, + const TData *Z1, const TData *df, const TData *in, + TData *out); template <typename TData, bool DEFORMED = false> -void PhysDeriv3DKernel(const size_t gridSize, const size_t blockSize, - LibUtilities::ShapeType shapetype, const size_t nq0, - const size_t nq1, const size_t nq2, const size_t nCoord, - const size_t nElmts, const TData *D0, const TData *D1, - const TData *D2, const TData *Z0, const TData *Z1, - const TData *Z2, const size_t dfSize, TData *df, - const TData *in, TData *out0, TData *out1, TData *out2); +void PhysDeriv3DKernel(const unsigned int gridSize, + const unsigned int blockSize, + LibUtilities::ShapeType shapetype, + const unsigned int nq0, const unsigned int nq1, + const unsigned int nq2, const unsigned int nElmts, + const unsigned int nSize, const unsigned int dfSize, + const TData *D0, const TData *D1, const TData *D2, + const TData *Z0, const TData *Z1, const TData *Z2, + const TData *df, const TData *in, TData *out); -// Matrix-free implementation +// CUDA implementation template <typename TData> class OperatorPhysDerivImpl<TData, ImplCUDA> : public 
OperatorPhysDeriv<TData> { @@ -78,10 +85,9 @@ public: // Initialize pointers. auto const *inptr = in.template GetStorage<MemoryRegionCUDA>().GetGPUPtr(); - auto *outptr0 = out.template GetStorage<MemoryRegionCUDA>().GetGPUPtr(); - auto *outptr1 = outptr0 + out.GetFieldSize(); - auto *outptr2 = outptr1 + out.GetFieldSize(); - auto dfptr = m_derivFac; + auto *outptr = out.template GetStorage<MemoryRegionCUDA>().GetGPUPtr(); + auto dfptr = m_derivFac; + size_t nSize = out.GetFieldSize(); // Initialize index. size_t expIdx = 0; @@ -120,14 +126,14 @@ public: if (deformed) { PhysDeriv1DKernel<TData, true>( - m_gridSize, m_blockSize, nq0, nCoord, nElmts, D0, - m_dfSize, dfptr, inptr, outptr0, outptr1, outptr2); + m_gridSize, m_blockSize, nq0, nCoord, nElmts, nSize, + m_dfSize, D0, dfptr, inptr, outptr); } else { PhysDeriv1DKernel<TData, false>( - m_gridSize, m_blockSize, nq0, nCoord, nElmts, D0, - m_dfSize, dfptr, inptr, outptr0, outptr1, outptr2); + m_gridSize, m_blockSize, nq0, nCoord, nElmts, nSize, + m_dfSize, D0, dfptr, inptr, outptr); } } else if (expPtr->GetShapeDimension() == 2) @@ -142,15 +148,15 @@ public: { PhysDeriv2DKernel<TData, true>( m_gridSize, m_blockSize, shape, nq0, nq1, nCoord, - nElmts, D0, D1, Z0, Z1, m_dfSize, dfptr, inptr, outptr0, - outptr1, outptr2); + nElmts, nSize, m_dfSize, D0, D1, Z0, Z1, dfptr, inptr, + outptr); } else { PhysDeriv2DKernel<TData, false>( m_gridSize, m_blockSize, shape, nq0, nq1, nCoord, - nElmts, D0, D1, Z0, Z1, m_dfSize, dfptr, inptr, outptr0, - outptr1, outptr2); + nElmts, nSize, m_dfSize, D0, D1, Z0, Z1, dfptr, inptr, + outptr); } } else if (expPtr->GetShapeDimension() == 3) @@ -167,24 +173,22 @@ public: if (deformed) { PhysDeriv3DKernel<TData, true>( - m_gridSize, m_blockSize, shape, nq0, nq1, nq2, nCoord, - nElmts, D0, D1, D2, Z0, Z1, Z2, m_dfSize, dfptr, inptr, - outptr0, outptr1, outptr2); + m_gridSize, m_blockSize, shape, nq0, nq1, nq2, nElmts, + nSize, m_dfSize, D0, D1, D2, Z0, Z1, Z2, dfptr, inptr, + outptr); } else { PhysDeriv3DKernel<TData, false>( - m_gridSize, m_blockSize, shape, nq0, nq1, nq2, nCoord, - nElmts, D0, D1, D2, Z0, Z1, Z2, m_dfSize, dfptr, inptr, - outptr0, outptr1, outptr2); + m_gridSize, m_blockSize, shape, nq0, nq1, nq2, nElmts, + nSize, m_dfSize, D0, D1, D2, Z0, Z1, Z2, dfptr, inptr, + outptr); } } // Increment pointer and index for next element type. dfptr += deformed ? nqTot * nElmts : nElmts; - outptr0 += (nPadElmts + nElmts) * nqTot; - outptr1 += (nPadElmts + nElmts) * nqTot; - outptr2 += (nPadElmts + nElmts) * nqTot; + outptr += (nPadElmts + nElmts) * nqTot; inptr += (nPadElmts + nElmts) * nqTot; expIdx += nElmts; } @@ -204,88 +208,192 @@ private: std::map<std::vector<LibUtilities::BasisKey>, std::vector<TData *>> m_D; std::map<std::vector<LibUtilities::BasisKey>, std::vector<TData *>> m_Z; size_t m_dfSize; + size_t m_gridSize = 1024; size_t m_blockSize = 32; - size_t m_gridSize; }; template <typename TData, bool DEFORMED> -void PhysDeriv1DKernel(const size_t gridSize, const size_t blockSize, - const size_t nq0, const size_t nCoord, - const size_t nElmts, const TData *D0, - const size_t dfSize, TData *df, const TData *in, - TData *out0, TData *out1, TData *out2) +void PhysDeriv1DKernel(const unsigned int gridSize, + const unsigned int blockSize, const unsigned int nq0, + const unsigned int nCoord, const unsigned int nElmts, + const unsigned int nSize, const unsigned int dfSize, + const TData *D0, const TData *df, const TData *in, + TData *out) { // Compute tensorial derivative. 
- PhysDerivTensor1DKernel<TData> - <<<gridSize, blockSize>>>(nq0, nElmts, D0, in, out0); + if (!FLAG_QP) + { + unsigned int nshared = sizeof(TData) * (nq0 * nq0); + PhysDerivTensor1DKernel<TData> + <<<gridSize, blockSize, nshared>>>(nq0, nElmts, D0, in, out); + } + else + { + PhysDerivTensor1DKernel_QP<TData> + <<<gridSize, dim3(32)>>>(nq0, nElmts, D0, in, out); + } // Compute physical derivative. - PhysDerivSegKernel<TData, DEFORMED><<<gridSize, blockSize>>>( - nq0, nCoord, nElmts, dfSize, df, out0, out1, out2); + if (!FLAG_QP) + { + PhysDeriv1DKernel<TData, DEFORMED><<<gridSize, blockSize>>>( + nq0, nCoord, nElmts, nSize, dfSize, df, out); + } + else + { + PhysDeriv1DKernel_QP<TData, DEFORMED><<<gridSize, dim3(32)>>>( + nq0, nCoord, nElmts, nSize, dfSize, df, out); + } } template <typename TData, bool DEFORMED> -void PhysDeriv2DKernel(const size_t gridSize, const size_t blockSize, - LibUtilities::ShapeType shapetype, const size_t nq0, - const size_t nq1, const size_t nCoord, - const size_t nElmts, const TData *D0, const TData *D1, - const TData *Z0, const TData *Z1, const size_t dfSize, - TData *df, const TData *in, TData *out0, TData *out1, - TData *out2) +void PhysDeriv2DKernel(const unsigned int gridSize, + const unsigned int blockSize, + LibUtilities::ShapeType shapetype, + const unsigned int nq0, const unsigned int nq1, + const unsigned int nCoord, const unsigned int nElmts, + const unsigned int nSize, const unsigned int dfSize, + const TData *D0, const TData *D1, const TData *Z0, + const TData *Z1, const TData *df, const TData *in, + TData *out) { // Compute tensorial derivative. - PhysDerivTensor2DKernel<TData> - <<<gridSize, blockSize>>>(nq0, nq1, nElmts, D0, D1, in, out0, out1); + if (!FLAG_QP) + { + unsigned int nshared = sizeof(TData) * (nq0 * nq0 + nq1 * nq1); + PhysDerivTensor2DKernel<TData><<<gridSize, blockSize, nshared>>>( + nq0, nq1, nElmts, nSize, D0, D1, in, out); + } + else + { + PhysDerivTensor2DKernel_QP<TData><<<gridSize, dim3(8, 8)>>>( + nq0, nq1, nElmts, nSize, D0, D1, in, out); + } // Compute physical derivative. 
if (shapetype == LibUtilities::Quad) { - PhysDerivQuadKernel<TData, DEFORMED><<<gridSize, blockSize>>>( - nq0, nq1, nCoord, nElmts, dfSize, df, out0, out1, out2); + if (!FLAG_QP) + { + PhysDeriv2DKernel<TData, LibUtilities::Quad, DEFORMED> + <<<gridSize, blockSize>>>(nq0, nq1, nCoord, nElmts, nSize, + dfSize, nullptr, nullptr, df, out); + } + else + { + PhysDeriv2DKernel_QP<TData, LibUtilities::Quad, DEFORMED> + <<<gridSize, dim3(8, 8)>>>(nq0, nq1, nCoord, nElmts, nSize, + dfSize, nullptr, nullptr, df, out); + } } else if (shapetype == LibUtilities::Tri) { - PhysDerivTriKernel<TData, DEFORMED><<<gridSize, blockSize>>>( - nq0, nq1, nCoord, nElmts, Z0, Z1, dfSize, df, out0, out1, out2); + if (!FLAG_QP) + { + unsigned int nshared = sizeof(TData) * (nq0 + nq1); + PhysDeriv2DKernel<TData, LibUtilities::Tri, DEFORMED> + <<<gridSize, blockSize, nshared>>>( + nq0, nq1, nCoord, nElmts, nSize, dfSize, Z0, Z1, df, out); + } + else + { + PhysDeriv2DKernel_QP<TData, LibUtilities::Tri, DEFORMED> + <<<gridSize, dim3(8, 8)>>>(nq0, nq1, nCoord, nElmts, nSize, + dfSize, Z0, Z1, df, out); + } } } template <typename TData, bool DEFORMED> -void PhysDeriv3DKernel(const size_t gridSize, const size_t blockSize, - LibUtilities::ShapeType shapetype, const size_t nq0, - const size_t nq1, const size_t nq2, const size_t nCoord, - const size_t nElmts, const TData *D0, const TData *D1, - const TData *D2, const TData *Z0, const TData *Z1, - const TData *Z2, const size_t dfSize, TData *df, - const TData *in, TData *out0, TData *out1, TData *out2) +void PhysDeriv3DKernel(const unsigned int gridSize, + const unsigned int blockSize, + LibUtilities::ShapeType shapetype, + const unsigned int nq0, const unsigned int nq1, + const unsigned int nq2, const unsigned int nElmts, + const unsigned int nSize, const unsigned int dfSize, + const TData *D0, const TData *D1, const TData *D2, + const TData *Z0, const TData *Z1, const TData *Z2, + const TData *df, const TData *in, TData *out) { // Compute tensorial derivative. - PhysDerivTensor3DKernel<TData><<<gridSize, blockSize>>>( - nq0, nq1, nq2, nElmts, D0, D1, D2, in, out0, out1, out2); + if (!FLAG_QP) + { + unsigned int nshared = + sizeof(TData) * (nq0 * nq0 + nq1 * nq1 + nq2 * nq2); + PhysDerivTensor3DKernel<TData><<<gridSize, blockSize, nshared>>>( + nq0, nq1, nq2, nElmts, nSize, D0, D1, D2, in, out); + } + else + { + PhysDerivTensor3DKernel_QP<TData><<<gridSize, dim3(4, 4, 4)>>>( + nq0, nq1, nq2, nElmts, nSize, D0, D1, D2, in, out); + } // Compute physical derivative. 
if (shapetype == LibUtilities::Hex) { - PhysDerivHexKernel<TData, DEFORMED><<<gridSize, blockSize>>>( - nq0, nq1, nq2, nCoord, nElmts, dfSize, df, out0, out1, out2); + if (!FLAG_QP) + { + PhysDeriv3DKernel<TData, LibUtilities::Hex, DEFORMED> + <<<gridSize, blockSize>>>(nq0, nq1, nq2, nElmts, nSize, dfSize, + nullptr, nullptr, nullptr, df, out); + } + else + { + PhysDeriv3DKernel_QP<TData, LibUtilities::Hex, DEFORMED> + <<<gridSize, dim3(4, 4, 4)>>>(nq0, nq1, nq2, nElmts, nSize, + dfSize, nullptr, nullptr, nullptr, + df, out); + } } else if (shapetype == LibUtilities::Tet) { - PhysDerivTetKernel<TData, DEFORMED> - <<<gridSize, blockSize>>>(nq0, nq1, nq2, nCoord, nElmts, Z0, Z1, Z2, - dfSize, df, out0, out1, out2); + if (!FLAG_QP) + { + unsigned int nshared = sizeof(TData) * (nq0 + 2 * nq1 + nq2); + PhysDeriv3DKernel<TData, LibUtilities::Tet, DEFORMED> + <<<gridSize, blockSize, nshared>>>(nq0, nq1, nq2, nElmts, nSize, + dfSize, Z0, Z1, Z2, df, out); + } + else + { + PhysDeriv3DKernel_QP<TData, LibUtilities::Tet, DEFORMED> + <<<gridSize, dim3(4, 4, 4)>>>(nq0, nq1, nq2, nElmts, nSize, + dfSize, Z0, Z1, Z2, df, out); + } } else if (shapetype == LibUtilities::Prism) { - PhysDerivPrismKernel<TData, DEFORMED> - <<<gridSize, blockSize>>>(nq0, nq1, nq2, nCoord, nElmts, Z0, Z1, Z2, - dfSize, df, out0, out1, out2); + if (!FLAG_QP) + { + unsigned int nshared = sizeof(TData) * (nq0 + nq2); + PhysDeriv3DKernel<TData, LibUtilities::Prism, DEFORMED> + <<<gridSize, blockSize, nshared>>>(nq0, nq1, nq2, nElmts, nSize, + dfSize, Z0, nullptr, Z2, df, + out); + } + else + { + PhysDeriv3DKernel_QP<TData, LibUtilities::Prism, DEFORMED> + <<<gridSize, dim3(4, 4, 4)>>>(nq0, nq1, nq2, nElmts, nSize, + dfSize, Z0, nullptr, Z2, df, out); + } } else if (shapetype == LibUtilities::Pyr) { - PhysDerivPyrKernel<TData, DEFORMED> - <<<gridSize, blockSize>>>(nq0, nq1, nq2, nCoord, nElmts, Z0, Z1, Z2, - dfSize, df, out0, out1, out2); + if (!FLAG_QP) + { + unsigned int nshared = sizeof(TData) * (nq0 + nq1 + nq2); + PhysDeriv3DKernel<TData, LibUtilities::Pyr, DEFORMED> + <<<gridSize, blockSize, nshared>>>(nq0, nq1, nq2, nElmts, nSize, + dfSize, Z0, Z1, Z2, df, out); + } + else + { + PhysDeriv3DKernel_QP<TData, LibUtilities::Pyr, DEFORMED> + <<<gridSize, dim3(4, 4, 4)>>>(nq0, nq1, nq2, nElmts, nSize, + dfSize, Z0, Z1, Z2, df, out); + } } } diff --git a/Operators/PhysDeriv/PhysDerivCUDAKernels.cuh b/Operators/PhysDeriv/PhysDerivCUDAKernels.cuh index 28482cf005179ea68ff701cc12bd769cc3515090..15268c08d629d4a4cae4a97d9794c9fd9634b6fb 100644 --- a/Operators/PhysDeriv/PhysDerivCUDAKernels.cuh +++ b/Operators/PhysDeriv/PhysDerivCUDAKernels.cuh @@ -1,602 +1,782 @@ +#pragma once + namespace Nektar::Operators::detail { + template <typename TData, bool DEFORMED> -__global__ void PhysDerivSegKernel(const size_t nq0, const size_t ncoord, - const size_t nelmt, const size_t dfSize, - TData *df, TData *inout0, TData *inout1, - TData *inout2) +__global__ void PhysDeriv1DKernel( + const unsigned int nq0, const unsigned int ncoord, const unsigned int nelmt, + const unsigned int nSize, const unsigned int dfSize, + const TData *__restrict df, TData *__restrict inout) { - size_t e = blockDim.x * blockIdx.x + threadIdx.x; - - if (e >= nelmt) - { - return; - } - - size_t ndf = ncoord; - - // Assign pointers. 
- TData **inoutptr = new TData *[ncoord]; - inoutptr[0] = inout0 + (nq0 * e); - if (ncoord > 1) - { - inoutptr[1] = inout1 + (nq0 * e); - } - if (ncoord > 2) - { - inoutptr[2] = inout2 + (nq0 * e); - } + unsigned int e = blockDim.x * blockIdx.x + threadIdx.x; - TData **dfptr = new TData *[ndf]; - for (size_t d = 0; d < ndf; d++) + while (e < nelmt) { - dfptr[d] = df + d * dfSize; - dfptr[d] += DEFORMED ? nq0 * e : e; - } + unsigned int offset = nq0 * e; - // Compute derivative. - for (size_t j = 0; j < nq0; ++j) - { - size_t dfindex = DEFORMED ? j : 0; - for (size_t d = ndf; d > 0; d--) + for (unsigned int i = 0; i < nq0; ++i) { - inoutptr[d - 1][j] = inoutptr[0][j] * dfptr[d - 1][dfindex]; + unsigned int index = offset + i; + unsigned int dfindex = DEFORMED ? index : e; + + TData d0 = inout[index]; + for (unsigned int d = 0; d < ncoord; d++) + { + inout[index + d * nSize] = d0 * df[dfindex + d * dfSize]; + } } - } - delete[] inoutptr; - delete[] dfptr; + e += blockDim.x * gridDim.x; + } } template <typename TData, bool DEFORMED> -__global__ void PhysDerivQuadKernel(const size_t nq0, const size_t nq1, - const size_t ncoord, const size_t nelmt, - const size_t dfSize, TData *df, - TData *inout0, TData *inout1, TData *inout2) +__global__ void PhysDeriv1DKernel_QP( + const unsigned int nq0, const unsigned int ncoord, const unsigned int nelmt, + const unsigned int nSize, const unsigned int dfSize, + const TData *__restrict df, TData *__restrict inout) { - size_t e = blockDim.x * blockIdx.x + threadIdx.x; - - if (e >= nelmt) - { - return; - } - - auto ndf = 2 * ncoord; + unsigned int e = blockIdx.x; - // Assign pointers. - TData **inoutptr = new TData *[ncoord]; - inoutptr[0] = inout0 + (nq0 * nq1 * e); - inoutptr[1] = inout1 + (nq0 * nq1 * e); - if (ncoord > 2) + while (e < nelmt) { - inoutptr[2] = inout2 + (nq0 * nq1 * e); - } + unsigned int offset = nq0 * e; - TData **dfptr = new TData *[ndf]; - for (size_t d = 0; d < ndf; d++) - { - dfptr[d] = df + d * dfSize; - dfptr[d] += DEFORMED ? nq0 * nq1 * e : e; - } - - // Compute derivative. - for (size_t j = 0, cnt_ji = 0; j < nq1; ++j) - { - for (size_t i = 0; i < nq0; ++i, ++cnt_ji) + for (unsigned int i = threadIdx.x; i < nq0; i += blockDim.x) { - TData d0 = inoutptr[0][cnt_ji]; - TData d1 = inoutptr[1][cnt_ji]; + unsigned int index = offset + i; + unsigned int dfindex = DEFORMED ? index : e; - size_t dfindex = DEFORMED ? 
cnt_ji : 0; - for (size_t d = 0; d < ncoord; d++) + TData d0 = inout[index]; + for (unsigned int d = 0; d < ncoord; d++) { - inoutptr[d][cnt_ji] = - d0 * dfptr[2 * d][dfindex] + d1 * dfptr[2 * d + 1][dfindex]; + inout[index + d * nSize] = d0 * df[dfindex + d * dfSize]; } } - } - delete[] inoutptr; - delete[] dfptr; + e += gridDim.x; + } } -template <typename TData, bool DEFORMED> -__global__ void PhysDerivTriKernel(const size_t nq0, const size_t nq1, - const size_t ncoord, const size_t nelmt, - const TData *Z0, const TData *Z1, - const size_t dfSize, TData *df, - TData *inout0, TData *inout1, TData *inout2) +template <typename TData, LibUtilities::ShapeType SHAPETYPE, bool DEFORMED> +__global__ void PhysDeriv2DKernel( + const unsigned int nq0, const unsigned int nq1, const unsigned int ncoord, + const unsigned int nelmt, const unsigned int nSize, + const unsigned int dfSize, const TData *__restrict Z0, + const TData *__restrict Z1, const TData *__restrict df, + TData *__restrict inout) { - size_t e = blockDim.x * blockIdx.x + threadIdx.x; + extern __shared__ TData shared[]; + TData *s_xfrm0, *s_xfrm1; - if (e >= nelmt) + // Copy to shared memory. + if constexpr (SHAPETYPE == LibUtilities::Tri) { - return; - } + s_xfrm0 = shared; + s_xfrm1 = s_xfrm0 + nq1; - auto ndf = 2 * ncoord; + unsigned int sIndex = threadIdx.x; + while (sIndex < nq1) + { + s_xfrm0[sIndex] = 2.0 / (1.0 - Z1[sIndex]); + sIndex += blockDim.x; + } - // Assign pointers. - TData **inoutptr = new TData *[ncoord]; - inoutptr[0] = inout0 + (nq0 * nq1 * e); - inoutptr[1] = inout1 + (nq0 * nq1 * e); - if (ncoord > 2) - { - inoutptr[2] = inout2 + (nq0 * nq1 * e); - } + sIndex = threadIdx.x; + while (sIndex < nq0) + { + s_xfrm1[sIndex] = 0.5 * (1.0 + Z0[sIndex]); + sIndex += blockDim.x; + } - TData **dfptr = new TData *[ndf]; - for (size_t d = 0; d < ndf; d++) - { - dfptr[d] = df + d * dfSize; - dfptr[d] += DEFORMED ? nq0 * nq1 * e : e; + __syncthreads(); } - // Compute derivative. - for (int j = 0, cnt_ji = 0; j < nq1; ++j) + unsigned int e = blockDim.x * blockIdx.x + threadIdx.x; + + while (e < nelmt) { - TData xfrm0 = 2.0 / (1.0 - Z1[j]); - for (int i = 0; i < nq0; ++i, ++cnt_ji) + unsigned int offset = nq0 * nq1 * e; + + for (unsigned int j = 0, cnt_ji = 0; j < nq1; ++j) { - TData d0 = inoutptr[0][cnt_ji]; - TData d1 = inoutptr[1][cnt_ji]; + for (unsigned int i = 0; i < nq0; ++i, ++cnt_ji) + { + unsigned int index = offset + cnt_ji; + unsigned int dfindex = DEFORMED ? index : e; - // Moving from standard to collapsed coordinates. - TData xfrm1 = 0.5 * (1.0 + Z0[i]); - d0 *= xfrm0; - d1 += d0 * xfrm1; + TData d0 = inout[index]; + TData d1 = inout[index + nSize]; - // Multiply by derivative factors. - size_t dfindex = DEFORMED ? cnt_ji : 0; - for (size_t d = 0; d < ncoord; d++) - { - inoutptr[d][cnt_ji] = - d0 * dfptr[2 * d][dfindex] + d1 * dfptr[2 * d + 1][dfindex]; + // Moving from standard to collapsed coordinates. + if constexpr (SHAPETYPE == LibUtilities::Tri) + { + d0 *= s_xfrm0[j]; + d1 += d0 * s_xfrm1[i]; + } + + // Multiply by derivative factors. 
+ for (unsigned int d = 0; d < ncoord; d++) + { + inout[index + d * nSize] = + d0 * df[dfindex + (2 * d) * dfSize] + + d1 * df[dfindex + (2 * d + 1) * dfSize]; + } } } - } - delete[] inoutptr; - delete[] dfptr; + e += blockDim.x * gridDim.x; + } } -template <typename TData, bool DEFORMED> -__global__ void PhysDerivHexKernel(const size_t nq0, const size_t nq1, - const size_t nq2, const size_t ncoord, - const size_t nelmt, const size_t dfSize, - TData *df, TData *inout0, TData *inout1, - TData *inout2) +template <typename TData, LibUtilities::ShapeType SHAPETYPE, bool DEFORMED> +__global__ void PhysDeriv2DKernel_QP( + const unsigned int nq0, const unsigned int nq1, const unsigned int ncoord, + const unsigned int nelmt, const unsigned int nSize, + const unsigned int dfSize, const TData *__restrict Z0, + const TData *__restrict Z1, const TData *__restrict df, + TData *__restrict inout) { - size_t e = blockDim.x * blockIdx.x + threadIdx.x; - - if (e >= nelmt) - { - return; - } - - size_t ndf = 3 * ncoord; + TData xfrm0, xfrm1; - // Assign pointers. - TData **inoutptr = new TData *[ncoord]; - inoutptr[0] = inout0 + (nq0 * nq1 * nq2 * e); - inoutptr[1] = inout1 + (nq0 * nq1 * nq2 * e); - inoutptr[2] = inout2 + (nq0 * nq1 * nq2 * e); + unsigned int e = blockIdx.x; - TData **dfptr = new TData *[ndf]; - for (size_t d = 0; d < ndf; d++) + while (e < nelmt) { - dfptr[d] = df + d * dfSize; - dfptr[d] += DEFORMED ? nq0 * nq1 * nq2 * e : e; - } + unsigned int offset = nq0 * nq1 * e; - // Compute derivative. - for (size_t k = 0, cnt_ijk = 0; k < nq2; ++k) - { - for (size_t j = 0; j < nq1; ++j) + for (unsigned int j = threadIdx.y; j < nq1; j += blockDim.y) { - for (size_t i = 0; i < nq0; ++i, ++cnt_ijk) + if constexpr (SHAPETYPE == LibUtilities::Tri) { - TData tmp[] = {inoutptr[0][cnt_ijk], inoutptr[1][cnt_ijk], - inoutptr[2][cnt_ijk]}; + xfrm0 = 2.0 / (1.0 - Z1[j]); + } + + for (unsigned int i = threadIdx.x; i < nq0; i += blockDim.x) + { + unsigned int cnt_ji = nq0 * j + i; + unsigned int index = offset + cnt_ji; + unsigned int dfindex = DEFORMED ? index : e; + + TData d0 = inout[index]; + TData d1 = inout[index + nSize]; + + // Moving from standard to collapsed coordinates. + if constexpr (SHAPETYPE == LibUtilities::Tri) + { + xfrm1 = 0.5 * (1.0 + Z0[i]); + d0 *= xfrm0; + d1 += d0 * xfrm1; + } // Multiply by derivative factors. - size_t dfindex = DEFORMED ? 
cnt_ijk : 0; - for (size_t d = 0; d < ncoord; ++d) + for (unsigned int d = 0; d < ncoord; d++) { - TData sum = 0.0; - for (size_t n = 0; n < ncoord; ++n) - { - sum += tmp[n] * dfptr[d * ncoord + n][dfindex]; - } - inoutptr[d][cnt_ijk] = sum; + inout[index + d * nSize] = + d0 * df[dfindex + (2 * d) * dfSize] + + d1 * df[dfindex + (2 * d + 1) * dfSize]; } } } - } - delete[] inoutptr; - delete[] dfptr; + e += gridDim.x; + } } -template <typename TData, bool DEFORMED> -__global__ void PhysDerivTetKernel(const size_t nq0, const size_t nq1, - const size_t nq2, const size_t ncoord, - const size_t nelmt, const TData *Z0, - const TData *Z1, const TData *Z2, - const size_t dfSize, TData *df, - TData *inout0, TData *inout1, TData *inout2) +template <typename TData, LibUtilities::ShapeType SHAPETYPE, bool DEFORMED> +__global__ void PhysDeriv3DKernel( + const unsigned int nq0, const unsigned int nq1, const unsigned int nq2, + const unsigned int nelmt, const unsigned int nSize, + const unsigned int dfSize, const TData *__restrict Z0, + const TData *__restrict Z1, const TData *__restrict Z2, + const TData *__restrict df, TData *__restrict inout) { - size_t e = blockDim.x * blockIdx.x + threadIdx.x; + extern __shared__ TData shared[]; + TData *s_xfrm_eta0, *s_xfrm_eta1, *s_xfrm_eta1m, *s_xfrm_eta2; - if (e >= nelmt) + // Copy to shared memory. + if constexpr (SHAPETYPE == LibUtilities::Tet) { - return; - } + s_xfrm_eta0 = shared; + s_xfrm_eta1 = s_xfrm_eta0 + nq0; + s_xfrm_eta1m = s_xfrm_eta1 + nq1; + s_xfrm_eta2 = s_xfrm_eta1m + nq1; - size_t ndf = 3 * ncoord; + unsigned int sIndex = threadIdx.x; + while (sIndex < nq0) + { + s_xfrm_eta0[sIndex] = 0.5 * (1.0 + Z0[sIndex]); + sIndex += blockDim.x; + } - // Allocate workspace memory. - TData *wsp0 = new TData[nq0 * nq1 * nq2]; - TData *wsp1 = new TData[nq0 * nq1 * nq2]; + sIndex = threadIdx.x; + while (sIndex < nq1) + { + s_xfrm_eta1[sIndex] = 0.5 * (1.0 + Z1[sIndex]); + sIndex += blockDim.x; + } - // Assign pointers. - TData **inoutptr = new TData *[ncoord]; - inoutptr[0] = inout0 + (nq0 * nq1 * nq2 * e); - inoutptr[1] = inout1 + (nq0 * nq1 * nq2 * e); - inoutptr[2] = inout2 + (nq0 * nq1 * nq2 * e); + sIndex = threadIdx.x; + while (sIndex < nq1) + { + s_xfrm_eta1m[sIndex] = 2.0 / (1.0 - Z1[sIndex]); + sIndex += blockDim.x; + } - TData **dfptr = new TData *[ndf]; - for (size_t d = 0; d < ndf; d++) - { - dfptr[d] = df + d * dfSize; - dfptr[d] += DEFORMED ? nq0 * nq1 * nq2 * e : e; - } + sIndex = threadIdx.x; + while (sIndex < nq2) + { + s_xfrm_eta2[sIndex] = 2.0 / (1.0 - Z2[sIndex]); + sIndex += blockDim.x; + } - // Moving from standard to collapsed coordinates. 
- for (int k = 0, eta0 = 0; k < nq2; ++k) + __syncthreads(); + } + else if constexpr (SHAPETYPE == LibUtilities::Prism) { - TData xfrm_eta2 = 2.0 / (1.0 - Z2[k]); - for (int j = 0; j < nq1; ++j) + s_xfrm_eta0 = shared; + s_xfrm_eta2 = s_xfrm_eta0 + nq0; + + unsigned int sIndex = threadIdx.x; + while (sIndex < nq0) { - TData xfrm_eta1 = 2.0 / (1.0 - Z1[j]); - TData xfrm = xfrm_eta1 * xfrm_eta2; - for (int i = 0; i < nq0; ++i, ++eta0) - { - inoutptr[0][eta0] *= xfrm; - wsp0[eta0] = inoutptr[0][eta0]; - } + s_xfrm_eta0[sIndex] = 0.5 * (1.0 + Z0[sIndex]); + sIndex += blockDim.x; } - } - for (int k = 0, eta0 = 0; k < nq2; ++k) + sIndex = threadIdx.x; + while (sIndex < nq2) + { + s_xfrm_eta2[sIndex] = 2.0 / (1.0 - Z2[sIndex]); + sIndex += blockDim.x; + } + + __syncthreads(); + } + else if constexpr (SHAPETYPE == LibUtilities::Pyr) { - TData xfrm_eta2 = 2.0 / (1.0 - Z2[k]); - for (int j = 0; j < nq1; ++j) + s_xfrm_eta0 = shared; + s_xfrm_eta1 = s_xfrm_eta0 + nq0; + s_xfrm_eta2 = s_xfrm_eta1 + nq1; + + unsigned int sIndex = threadIdx.x; + while (sIndex < nq0) { - for (int i = 0; i < nq0; ++i, ++eta0) - { - TData xfrm_eta0 = 0.5 * (1.0 + Z0[i]); - wsp0[eta0] = xfrm_eta0 * wsp0[eta0]; - wsp1[eta0] = xfrm_eta2 * inoutptr[1][eta0]; - inoutptr[1][eta0] = wsp0[eta0] + wsp1[eta0]; - } + s_xfrm_eta0[sIndex] = 0.5 * (1.0 + Z0[sIndex]); + sIndex += blockDim.x; } + + sIndex = threadIdx.x; + while (sIndex < nq1) + { + s_xfrm_eta1[sIndex] = 0.5 * (1.0 + Z1[sIndex]); + sIndex += blockDim.x; + } + + sIndex = threadIdx.x; + while (sIndex < nq2) + { + s_xfrm_eta2[sIndex] = 2.0 / (1.0 - Z2[sIndex]); + sIndex += blockDim.x; + } + + __syncthreads(); } - for (int k = 0, eta0 = 0; k < nq2; ++k) + constexpr unsigned int ncoord = 3; + + unsigned int e = blockDim.x * blockIdx.x + threadIdx.x; + + while (e < nelmt) { - for (int j = 0; j < nq1; ++j) + unsigned int offset = nq0 * nq1 * nq2 * e; + + // Moving from standard to collapsed coordinates. + if constexpr (SHAPETYPE == LibUtilities::Tet) { - TData xfrm_eta1 = 0.5 * (1.0 + Z1[j]); - for (int i = 0; i < nq0; ++i, ++eta0) + for (unsigned int k = 0, cnt_kji = 0; k < nq2; ++k) { - inoutptr[2][eta0] += wsp0[eta0] + wsp1[eta0] * xfrm_eta1; + for (unsigned int j = 0; j < nq1; ++j) + { + TData xfrm = s_xfrm_eta1m[j] * s_xfrm_eta2[k]; + for (unsigned int i = 0; i < nq0; ++i, ++cnt_kji) + { + unsigned int index = offset + cnt_kji; + + TData tmp0 = xfrm * inout[index]; + TData tmp1 = s_xfrm_eta0[i] * tmp0; + TData tmp2 = s_xfrm_eta2[k] * inout[index + nSize]; + inout[index] = tmp0; + inout[index + nSize] = tmp1 + tmp2; + inout[index + 2 * nSize] += + tmp1 + s_xfrm_eta1[j] * tmp2; + } + } } } - } - // Compute derivative. - for (size_t k = 0, cnt_ijk = 0; k < nq2; ++k) - { - for (size_t j = 0; j < nq1; ++j) + for (unsigned int k = 0, cnt_kji = 0; k < nq2; ++k) { - for (size_t i = 0; i < nq0; ++i, ++cnt_ijk) + for (unsigned int j = 0; j < nq1; ++j) { - TData tmp[] = {inoutptr[0][cnt_ijk], inoutptr[1][cnt_ijk], - inoutptr[2][cnt_ijk]}; - - // Multiply by derivative factors. - size_t dfindex = DEFORMED ? cnt_ijk : 0; - for (size_t d = 0; d < ncoord; ++d) + for (unsigned int i = 0; i < nq0; ++i, ++cnt_kji) { - TData sum = 0.0; - for (size_t n = 0; n < ncoord; ++n) + unsigned int index = offset + cnt_kji; + unsigned int dfindex = DEFORMED ? index : e; + + TData d0 = inout[index]; + TData d1 = inout[index + nSize]; + TData d2 = inout[index + 2 * nSize]; + + // Chain-rule for eta_0 and eta_2. 
+ if constexpr (SHAPETYPE == LibUtilities::Prism) + { + d0 *= s_xfrm_eta2[k]; + d2 += s_xfrm_eta0[i] * d0; + } + else if constexpr (SHAPETYPE == LibUtilities::Pyr) { - sum += tmp[n] * dfptr[d * ncoord + n][dfindex]; + d0 *= s_xfrm_eta2[k]; + d1 *= s_xfrm_eta2[k]; + d2 += s_xfrm_eta0[i] * d0 + s_xfrm_eta1[j] * d1; + } + + // Multiply by derivative factors. + for (unsigned int d = 0; d < ncoord; d++) + { + inout[index + d * nSize] = + d0 * df[dfindex + (3 * d) * dfSize] + + d1 * df[dfindex + (3 * d + 1) * dfSize] + + d2 * df[dfindex + (3 * d + 2) * dfSize]; } - inoutptr[d][cnt_ijk] = sum; } } } - } - delete[] wsp0; - delete[] wsp1; - delete[] inoutptr; - delete[] dfptr; + e += blockDim.x * gridDim.x; + } } -template <typename TData, bool DEFORMED> -__global__ void PhysDerivPrismKernel( - const size_t nq0, const size_t nq1, const size_t nq2, const size_t ncoord, - const size_t nelmt, const TData *Z0, const TData *Z1, const TData *Z2, - const size_t dfSize, TData *df, TData *inout0, TData *inout1, TData *inout2) +template <typename TData, LibUtilities::ShapeType SHAPETYPE, bool DEFORMED> +__global__ void PhysDeriv3DKernel_QP( + const unsigned int nq0, const unsigned int nq1, const unsigned int nq2, + const unsigned int nelmt, const unsigned int nSize, + const unsigned int dfSize, const TData *__restrict Z0, + const TData *__restrict Z1, const TData *__restrict Z2, + const TData *__restrict df, TData *__restrict inout) { - size_t e = blockDim.x * blockIdx.x + threadIdx.x; + TData xfrm_eta0, xfrm_eta1, xfrm_eta1m, xfrm_eta2; - if (e >= nelmt) - { - return; - } - - size_t ndf = 3 * ncoord; + constexpr unsigned int ncoord = 3; - // Assign pointers. - TData **inoutptr = new TData *[ncoord]; - inoutptr[0] = inout0 + (nq0 * nq1 * nq2 * e); - inoutptr[1] = inout1 + (nq0 * nq1 * nq2 * e); - inoutptr[2] = inout2 + (nq0 * nq1 * nq2 * e); + unsigned int e = blockIdx.x; - TData **dfptr = new TData *[ndf]; - for (size_t d = 0; d < ndf; d++) + while (e < nelmt) { - dfptr[d] = df + d * dfSize; - dfptr[d] += DEFORMED ? nq0 * nq1 * nq2 * e : e; - } + unsigned int offset = nq0 * nq1 * nq2 * e; - // Compute derivative. - for (size_t k = 0, cnt_ijk = 0; k < nq2; ++k) - { - TData xfrm_eta2 = 2.0 / (1.0 - Z2[k]); - for (size_t j = 0; j < nq1; ++j) + // Moving from standard to collapsed coordinates. + if constexpr (SHAPETYPE == LibUtilities::Tet) { - for (size_t i = 0; i < nq0; ++i, ++cnt_ijk) + for (unsigned int k = threadIdx.z; k < nq2; k += blockDim.z) { - TData d0 = inoutptr[0][cnt_ijk]; - TData d1 = inoutptr[1][cnt_ijk]; - TData d2 = inoutptr[2][cnt_ijk]; + xfrm_eta2 = 2.0 / (1.0 - Z2[k]); + for (unsigned int j = threadIdx.y; j < nq1; j += blockDim.y) + { + xfrm_eta1m = 2.0 / (1.0 - Z1[j]); + xfrm_eta1 = 0.5 * (1.0 + Z1[j]); + TData xfrm = xfrm_eta1m * xfrm_eta2; + for (unsigned int i = threadIdx.x; i < nq0; i += blockDim.x) + { + unsigned int index = + offset + nq0 * nq1 * k + nq0 * j + i; + + xfrm_eta0 = 0.5 * (1.0 + Z0[i]); + TData tmp0 = xfrm * inout[index]; + TData tmp1 = xfrm_eta0 * tmp0; + TData tmp2 = xfrm_eta2 * inout[index + nSize]; + inout[index] = tmp0; + inout[index + nSize] = tmp1 + tmp2; + inout[index + 2 * nSize] += tmp1 + xfrm_eta1 * tmp2; + } + } + } - // Chain-rule for eta_0 and eta_2. - TData xfrm_eta0 = 0.5 * (1.0 + Z0[i]); - d0 *= xfrm_eta2; - d2 += xfrm_eta0 * d0; + __syncthreads(); + } - // Multiply by derivative factors. - size_t dfindex = DEFORMED ? 
cnt_ijk : 0; - inoutptr[0][cnt_ijk] = d0 * dfptr[0][dfindex] + - d1 * dfptr[1][dfindex] + - d2 * dfptr[2][dfindex]; - inoutptr[1][cnt_ijk] = d0 * dfptr[3][dfindex] + - d1 * dfptr[4][dfindex] + - d2 * dfptr[5][dfindex]; - inoutptr[2][cnt_ijk] = d0 * dfptr[6][dfindex] + - d1 * dfptr[7][dfindex] + - d2 * dfptr[8][dfindex]; + for (unsigned int k = threadIdx.z; k < nq2; k += blockDim.z) + { + if constexpr (SHAPETYPE == LibUtilities::Prism || + SHAPETYPE == LibUtilities::Pyr) + { + xfrm_eta2 = 2.0 / (1.0 - Z2[k]); + } + + for (unsigned int j = threadIdx.y; j < nq1; j += blockDim.y) + { + if constexpr (SHAPETYPE == LibUtilities::Pyr) + { + xfrm_eta1 = 0.5 * (1.0 + Z1[j]); + } + + for (unsigned int i = threadIdx.x; i < nq0; i += blockDim.x) + { + unsigned int cnt_kji = nq0 * nq1 * k + nq0 * j + i; + unsigned int index = offset + cnt_kji; + unsigned int dfindex = DEFORMED ? index : e; + + TData d0 = inout[index]; + TData d1 = inout[index + nSize]; + TData d2 = inout[index + 2 * nSize]; + + if constexpr (SHAPETYPE == LibUtilities::Prism || + SHAPETYPE == LibUtilities::Pyr) + { + xfrm_eta0 = 0.5 * (1.0 + Z0[i]); + } + + // Chain-rule for eta_0 and eta_2. + if constexpr (SHAPETYPE == LibUtilities::Prism) + { + d0 *= xfrm_eta2; + d2 += xfrm_eta0 * d0; + } + else if constexpr (SHAPETYPE == LibUtilities::Pyr) + { + d0 *= xfrm_eta2; + d1 *= xfrm_eta2; + d2 += xfrm_eta0 * d0 + xfrm_eta1 * d1; + } + + // Multiply by derivative factors. + for (unsigned int d = 0; d < ncoord; d++) + { + inout[index + d * nSize] = + d0 * df[dfindex + (3 * d) * dfSize] + + d1 * df[dfindex + (3 * d + 1) * dfSize] + + d2 * df[dfindex + (3 * d + 2) * dfSize]; + } + } } } - } - delete[] inoutptr; - delete[] dfptr; + e += gridDim.x; + } } -template <typename TData, bool DEFORMED> -__global__ void PhysDerivPyrKernel(const size_t nq0, const size_t nq1, - const size_t nq2, const size_t ncoord, - const size_t nelmt, const TData *Z0, - const TData *Z1, const TData *Z2, - const size_t dfSize, TData *df, - TData *inout0, TData *inout1, TData *inout2) +template <typename TData> +__global__ void PhysDerivTensor1DKernel(const unsigned int nq0, + const unsigned int nelmt, + const TData *__restrict D0, + const TData *__restrict in, + TData *__restrict out) { - size_t e = blockDim.x * blockIdx.x + threadIdx.x; + extern __shared__ TData shared[]; + TData *s_D0 = shared; - if (e >= nelmt) + // Copy to shared memory. + unsigned int sIndex = threadIdx.x; + while (sIndex < nq0 * nq0) { - return; + s_D0[sIndex] = D0[sIndex]; + sIndex += blockDim.x; } - size_t ndf = 3 * ncoord; + __syncthreads(); - // Assign pointers. - TData **inoutptr = new TData *[ncoord]; - inoutptr[0] = inout0 + (nq0 * nq1 * nq2 * e); - inoutptr[1] = inout1 + (nq0 * nq1 * nq2 * e); - inoutptr[2] = inout2 + (nq0 * nq1 * nq2 * e); + unsigned int e = blockDim.x * blockIdx.x + threadIdx.x; - TData **dfptr = new TData *[ndf]; - for (size_t d = 0; d < ndf; d++) + while (e < nelmt) { - dfptr[d] = df + d * dfSize; - dfptr[d] += DEFORMED ? nq0 * nq1 * nq2 * e : e; - } + unsigned int offset = nq0 * e; - // Compute derivative. - for (size_t k = 0, cnt_ijk = 0; k < nq2; ++k) - { - TData xfrm_eta2 = 2.0 / (1.0 - Z2[k]); - for (size_t j = 0; j < nq1; ++j) + for (unsigned int i = 0; i < nq0; ++i) { - TData xfrm_eta1 = 0.5 * (1.0 + Z1[j]); - for (size_t i = 0; i < nq0; ++i, ++cnt_ijk) + TData sum = 0.0; + for (unsigned int q = 0; q < nq0; ++q) { - TData d0 = inoutptr[0][cnt_ijk]; - TData d1 = inoutptr[1][cnt_ijk]; - TData d2 = inoutptr[2][cnt_ijk]; - - // Chain-rule for eta_0 and eta_2. 
- TData xfrm_eta0 = 0.5 * (1.0 + Z0[i]); - d0 *= xfrm_eta2; - d1 *= xfrm_eta2; - d2 += xfrm_eta0 * d0 + xfrm_eta1 * d1; - - // Multiply by derivative factors. - size_t dfindex = DEFORMED ? cnt_ijk : 0; - inoutptr[0][cnt_ijk] = d0 * dfptr[0][dfindex] + - d1 * dfptr[1][dfindex] + - d2 * dfptr[2][dfindex]; - inoutptr[1][cnt_ijk] = d0 * dfptr[3][dfindex] + - d1 * dfptr[4][dfindex] + - d2 * dfptr[5][dfindex]; - inoutptr[2][cnt_ijk] = d0 * dfptr[6][dfindex] + - d1 * dfptr[7][dfindex] + - d2 * dfptr[8][dfindex]; + sum += s_D0[q * nq0 + i] * in[offset + q]; } + out[offset + i] = sum; } - } - delete[] inoutptr; - delete[] dfptr; + e += blockDim.x * gridDim.x; + } } template <typename TData> -__global__ void PhysDerivTensor1DKernel(const size_t nq0, const size_t nelmt, - const TData *D0, const TData *in, - TData *out0) +__global__ void PhysDerivTensor1DKernel_QP(const unsigned int nq0, + const unsigned int nelmt, + const TData *__restrict D0, + const TData *__restrict in, + TData *__restrict out) { - size_t e = blockDim.x * blockIdx.x + threadIdx.x; + unsigned int e = blockIdx.x; - if (e >= nelmt) + while (e < nelmt) { - return; - } + unsigned int offset = nq0 * e; - // Assign pointers. - const TData *inptr = in + (nq0 * e); - TData *outptr0 = out0 + (nq0 * e); - - // Direction 1 - for (size_t i = 0; i < nq0; ++i) - { - TData sum = 0.0; - for (size_t k = 0; k < nq0; ++k) + for (unsigned int i = threadIdx.x; i < nq0; i += blockDim.x) { - sum += D0[k * nq0 + i] * inptr[k]; + TData sum = 0.0; + for (unsigned int q = 0; q < nq0; ++q) + { + sum += D0[q * nq0 + i] * in[offset + q]; + } + out[offset + i] = sum; } - outptr0[i] = sum; + + e += gridDim.x; } } template <typename TData> -__global__ void PhysDerivTensor2DKernel(const size_t nq0, const size_t nq1, - const size_t nelmt, const TData *D0, - const TData *D1, const TData *in, - TData *out0, TData *out1) +__global__ void PhysDerivTensor2DKernel( + const unsigned int nq0, const unsigned int nq1, const unsigned int nelmt, + const unsigned int nSize, const TData *__restrict D0, + const TData *__restrict D1, const TData *__restrict in, + TData *__restrict out) { - size_t e = blockDim.x * blockIdx.x + threadIdx.x; + extern __shared__ TData shared[]; + TData *s_D0 = shared; + TData *s_D1 = s_D0 + nq0 * nq0; - if (e >= nelmt) + // Copy to shared memory. + unsigned int sIndex = threadIdx.x; + while (sIndex < nq0 * nq0) { - return; + s_D0[sIndex] = D0[sIndex]; + sIndex += blockDim.x; } - // Assign pointers. 
- const TData *inptr = in + (nq0 * nq1 * e); - TData *outptr0 = out0 + (nq0 * nq1 * e); - TData *outptr1 = out1 + (nq0 * nq1 * e); + sIndex = threadIdx.x; + while (sIndex < nq1 * nq1) + { + s_D1[sIndex] = D1[sIndex]; + sIndex += blockDim.x; + } - // Direction 1 - for (size_t i = 0; i < nq0; ++i) + __syncthreads(); + + unsigned int e = blockDim.x * blockIdx.x + threadIdx.x; + + while (e < nelmt) { - for (size_t j = 0; j < nq1; ++j) + unsigned int offset = nq0 * nq1 * e; + + for (unsigned int j = 0; j < nq1; ++j) { - TData sum = 0.0; - for (size_t k = 0; k < nq0; ++k) + for (unsigned int i = 0; i < nq0; ++i) { - sum += D0[k * nq0 + i] * inptr[j * nq0 + k]; + unsigned int cnt_ji = nq0 * j + i; + + // Direction 1 + TData sum = 0.0; + for (unsigned int q = 0; q < nq0; ++q) + { + sum += s_D0[q * nq0 + i] * in[offset + nq0 * j + q]; + } + out[offset + cnt_ji] = sum; + + // Direction 2 + sum = 0.0; + for (unsigned int q = 0; q < nq1; ++q) + { + sum += s_D1[q * nq1 + j] * in[offset + nq0 * q + i]; + } + out[offset + cnt_ji + nSize] = sum; } - outptr0[j * nq0 + i] = sum; } + + e += blockDim.x * gridDim.x; } +} + +template <typename TData> +__global__ void PhysDerivTensor2DKernel_QP( + const unsigned int nq0, const unsigned int nq1, const unsigned int nelmt, + const unsigned int nSize, const TData *__restrict D0, + const TData *__restrict D1, const TData *__restrict in, + TData *__restrict out) +{ + unsigned int e = blockIdx.x; - // Direction 2 - for (size_t i = 0; i < nq0; ++i) + while (e < nelmt) { - for (size_t j = 0; j < nq1; ++j) + unsigned int offset = nq0 * nq1 * e; + + for (unsigned int j = threadIdx.y; j < nq1; j += blockDim.y) { - TData sum = 0.0; - for (size_t k = 0; k < nq1; ++k) + for (unsigned int i = threadIdx.x; i < nq0; i += blockDim.x) { - sum += D1[k * nq1 + j] * inptr[k * nq0 + i]; + unsigned int cnt_ji = nq0 * j + i; + + // Direction 1 + TData sum = 0.0; + for (unsigned int q = 0; q < nq0; ++q) + { + sum += D0[q * nq0 + i] * in[offset + nq0 * j + q]; + } + out[offset + cnt_ji] = sum; + + // Direction 2 + sum = 0.0; + for (unsigned int q = 0; q < nq1; ++q) + { + sum += D1[q * nq1 + j] * in[offset + nq0 * q + i]; + } + out[offset + cnt_ji + nSize] = sum; } - outptr1[j * nq0 + i] = sum; } + + e += gridDim.x; } } template <typename TData> -__global__ void PhysDerivTensor3DKernel(const size_t nq0, const size_t nq1, - const size_t nq2, const size_t nelmt, - const TData *D0, const TData *D1, - const TData *D2, const TData *in, - TData *out0, TData *out1, TData *out2) +__global__ void PhysDerivTensor3DKernel( + const unsigned int nq0, const unsigned int nq1, const unsigned int nq2, + const unsigned int nelmt, const unsigned int nSize, + const TData *__restrict D0, const TData *__restrict D1, + const TData *__restrict D2, const TData *__restrict in, + TData *__restrict out) { - size_t e = blockDim.x * blockIdx.x + threadIdx.x; + extern __shared__ TData shared[]; + TData *s_D0 = shared; + TData *s_D1 = s_D0 + nq0 * nq0; + TData *s_D2 = s_D1 + nq1 * nq1; - if (e >= nelmt) + // Copy to shared memory. + unsigned int sIndex = threadIdx.x; + while (sIndex < nq0 * nq0) { - return; + s_D0[sIndex] = D0[sIndex]; + sIndex += blockDim.x; } - // Assign pointers. 
- const TData *inptr = in + (nq0 * nq1 * nq2 * e); - TData *outptr0 = out0 + (nq0 * nq1 * nq2 * e); - TData *outptr1 = out1 + (nq0 * nq1 * nq2 * e); - TData *outptr2 = out2 + (nq0 * nq1 * nq2 * e); + sIndex = threadIdx.x; + while (sIndex < nq1 * nq1) + { + s_D1[sIndex] = D1[sIndex]; + sIndex += blockDim.x; + } - // Direction 1 - for (size_t i = 0; i < nq0; ++i) + sIndex = threadIdx.x; + while (sIndex < nq2 * nq2) { - for (size_t j = 0; j < nq1 * nq2; ++j) - { - TData sum = 0.0; - for (size_t k = 0; k < nq0; ++k) - { - sum += D0[k * nq0 + i] * inptr[j * nq0 + k]; - } - outptr0[j * nq0 + i] = sum; - } + s_D2[sIndex] = D2[sIndex]; + sIndex += blockDim.x; } - // Direction 2 - for (size_t block = 0; block < nq2; ++block) + __syncthreads(); + + unsigned int e = blockDim.x * blockIdx.x + threadIdx.x; + + while (e < nelmt) { - size_t start = block * nq0 * nq1; - for (size_t i = 0; i < nq0; ++i) + unsigned int offset = nq0 * nq1 * nq2 * e; + + for (unsigned int k = 0; k < nq2; k++) { - for (size_t j = 0; j < nq1; ++j) + unsigned int cnt_k = nq0 * nq1 * k; + for (unsigned int j = 0; j < nq1; j++) { - TData sum = 0.0; - for (size_t k = 0; k < nq1; ++k) + unsigned int cnt_kj = cnt_k + nq0 * j; + for (unsigned int i = 0; i < nq0; i++) { - sum += D1[k * nq1 + j] * inptr[start + k * nq0 + i]; + unsigned int cnt_ji = nq0 * j + i; + unsigned int cnt_kji = cnt_k + cnt_ji; + + // Direction 1 + TData sum = 0.0; + for (unsigned int q = 0; q < nq0; ++q) + { + sum += s_D0[q * nq0 + i] * in[offset + cnt_kj + q]; + } + out[offset + cnt_kji] = sum; + + // Direction 2 + sum = 0.0; + for (unsigned int q = 0; q < nq1; ++q) + { + sum += s_D1[q * nq1 + j] * + in[offset + cnt_k + nq0 * q + i]; + } + out[offset + cnt_kji + nSize] = sum; + + // Direction 3 + sum = 0.0; + for (unsigned int q = 0; q < nq2; ++q) + { + sum += s_D2[q * nq2 + k] * + in[offset + nq0 * nq1 * q + cnt_ji]; + } + out[offset + cnt_kji + 2 * nSize] = sum; } - outptr1[start + j * nq0 + i] = sum; } } + + e += blockDim.x * gridDim.x; } +} - // Direction 3 - for (size_t i = 0; i < nq0 * nq1; ++i) +template <typename TData> +__global__ void PhysDerivTensor3DKernel_QP( + const unsigned int nq0, const unsigned int nq1, const unsigned int nq2, + const unsigned int nelmt, const unsigned int nSize, + const TData *__restrict D0, const TData *__restrict D1, + const TData *__restrict D2, const TData *__restrict in, + TData *__restrict out) +{ + unsigned int e = blockIdx.x; + + while (e < nelmt) { - for (size_t j = 0; j < nq2; ++j) + unsigned int offset = nq0 * nq1 * nq2 * e; + + for (unsigned int k = threadIdx.z; k < nq2; k += blockDim.z) { - TData sum = 0.0; - for (size_t k = 0; k < nq2; ++k) + for (unsigned int j = threadIdx.y; j < nq1; j += blockDim.y) { - sum += D2[k * nq2 + j] * inptr[k * nq0 * nq1 + i]; + for (unsigned int i = threadIdx.x; i < nq0; i += blockDim.x) + { + unsigned int cnt_kji = nq0 * nq1 * k + nq0 * j + i; + + // Direction 1 + TData sum = 0.0; + for (unsigned int q = 0; q < nq0; ++q) + { + sum += D0[q * nq0 + i] * + in[offset + nq0 * nq1 * k + nq0 * j + q]; + } + out[offset + cnt_kji] = sum; + + // Direction 2 + sum = 0.0; + for (unsigned int q = 0; q < nq1; ++q) + { + sum += D1[q * nq1 + j] * + in[offset + nq0 * nq1 * k + nq0 * q + i]; + } + out[offset + cnt_kji + nSize] = sum; + + // Direction 3 + sum = 0.0; + for (unsigned int q = 0; q < nq2; ++q) + { + sum += D2[q * nq2 + k] * + in[offset + nq0 * nq1 * q + nq0 * j + i]; + } + out[offset + cnt_kji + 2 * nSize] = sum; + } } - outptr2[j * nq0 * nq1 + i] = sum; } + + e += gridDim.x; } } + } // 
namespace Nektar::Operators::detail diff --git a/Operators/PhysDeriv/PhysDerivMatFree.hpp b/Operators/PhysDeriv/PhysDerivMatFree.hpp index cd0532b74c90b1eb85b6bc46a7b6487a22dbefea..1a671a8cc4d8ac1891a6e824992d8c1e9e64aace 100644 --- a/Operators/PhysDeriv/PhysDerivMatFree.hpp +++ b/Operators/PhysDeriv/PhysDerivMatFree.hpp @@ -1,11 +1,12 @@ -#include <LibUtilities/Foundations/Basis.h> +#pragma once #include <LibUtilities/BasicUtils/ShapeType.hpp> #include <LibUtilities/BasicUtils/SharedArray.hpp> +#include <LibUtilities/Foundations/Basis.h> +#include <LibUtilities/SimdLib/tinysimd.hpp> #include "Operators/OperatorPhysDeriv.hpp" #include "PhysDerivMatFreeKernels.hpp" -#include <LibUtilities/SimdLib/tinysimd.hpp> namespace Nektar::Operators::detail { diff --git a/Operators/PhysDeriv/PhysDerivMatFreeKernels.hpp b/Operators/PhysDeriv/PhysDerivMatFreeKernels.hpp index 762f95de82967d368fdd75ba11def79e26fecf09..e4ef1184ddc4632146a963255f188dbf1a25985d 100644 --- a/Operators/PhysDeriv/PhysDerivMatFreeKernels.hpp +++ b/Operators/PhysDeriv/PhysDerivMatFreeKernels.hpp @@ -1,4 +1,5 @@ #pragma once + #include <LibUtilities/BasicUtils/NekInline.hpp> namespace Nektar::Operators::detail diff --git a/Operators/PhysDeriv/PhysDerivStdMat.hpp b/Operators/PhysDeriv/PhysDerivStdMat.hpp index 86baed306156a46ca161b78fe2b093e41f8f658c..a72856f956f8308a14bf1c9daec76ea9a51d8df6 100644 --- a/Operators/PhysDeriv/PhysDerivStdMat.hpp +++ b/Operators/PhysDeriv/PhysDerivStdMat.hpp @@ -1,6 +1,9 @@ -#include "Operators/OperatorPhysDeriv.hpp" +#pragma once + #include <StdRegions/StdExpansion.h> +#include "Operators/OperatorPhysDeriv.hpp" + namespace Nektar::Operators::detail { @@ -62,10 +65,8 @@ public: { // Initialize pointers. auto const *inptr = in.GetStorage().GetCPUPtr(); - auto *outptr0 = out.GetStorage().GetCPUPtr(); - auto *outptr1 = outptr0 + out.GetFieldSize(); - auto *outptr2 = outptr1 + out.GetFieldSize(); - std::vector<TData *> outptr{outptr0, outptr1, outptr2}; + auto *outptr = out.GetStorage().GetCPUPtr(); + auto nSize = out.GetFieldSize(); // Initialize index. size_t expIdx = 0; @@ -113,39 +114,38 @@ public: { Vmath::Vmul(nqTot * nElmts, m_derivFac[i * nDim].get() + dfindex, 1, - deriv[0].get(), 1, outptr[i], 1); + deriv[0].get(), 1, outptr + i * nSize, 1); for (size_t d = 1; d < nDim; d++) { Vmath::Vvtvp(nqTot * nElmts, m_derivFac[i * nDim + d].get() + dfindex, - 1, deriv[d].get(), 1, outptr[i], 1, - outptr[i], 1); + 1, deriv[d].get(), 1, outptr + i * nSize, + 1, outptr + i * nSize, 1); } - outptr[i] += (nPadElmts + nElmts) * nqTot; } + outptr += (nPadElmts + nElmts) * nqTot; dfindex += nqTot * nElmts; } else { - for (size_t i = 0; i < nCoord; i++) + for (size_t e = 0; e < nElmts; ++e) { - for (size_t e = 0; e < nElmts; ++e) + for (size_t i = 0; i < nCoord; i++) { Vmath::Smul(nqTot, m_derivFac[i * nDim][dfindex + e], - deriv[0].get() + e * nqTot, 1, outptr[i], - 1); + deriv[0].get() + e * nqTot, 1, + outptr + i * nSize, 1); for (size_t d = 1; d < nDim; d++) { - Vmath::Svtvp(nqTot, - m_derivFac[i * nDim + d][dfindex + e], - deriv[d].get() + e * nqTot, 1, - outptr[i], 1, outptr[i], 1); + Vmath::Svtvp( + nqTot, m_derivFac[i * nDim + d][dfindex + e], + deriv[d].get() + e * nqTot, 1, + outptr + i * nSize, 1, outptr + i * nSize, 1); } - outptr[i] += nqTot; } - // skip padding elements - outptr[i] += nPadElmts * nqTot; + outptr += nqTot; } + outptr += nPadElmts * nqTot; dfindex += nElmts; } inptr += (nPadElmts + nElmts) * nqTot;
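
The rewritten kernels above share three launch-side conventions: a grid-stride loop over elements (so gridSize no longer has to cover nelmt exactly), dynamic shared memory sized at launch time for the basis derivative matrices, and a single output buffer addressed with a per-direction stride of nSize instead of separate out0/out1/out2 pointers. The standalone sketch below illustrates those conventions in isolation; it is not part of the patch, and every name in it (demoDerivKernel and its parameters) is illustrative only.

// Minimal sketch of the launch pattern used by the patch: grid-stride element
// loop, dynamic shared memory for the nq0 x nq0 derivative matrix, and a
// strided single output buffer (out[... + d * nSize]). Illustrative only.
#include <cuda_runtime.h>
#include <cstdio>
#include <vector>

template <typename TData>
__global__ void demoDerivKernel(const unsigned int nq0,
                                const unsigned int nelmt,
                                const unsigned int nSize,
                                const TData *__restrict__ D0,
                                const TData *__restrict__ in,
                                TData *__restrict__ out)
{
    // Stage the derivative matrix in dynamic shared memory; the byte count is
    // supplied as the third launch parameter.
    extern __shared__ unsigned char smem[];
    TData *s_D0 = reinterpret_cast<TData *>(smem);

    for (unsigned int s = threadIdx.x; s < nq0 * nq0; s += blockDim.x)
    {
        s_D0[s] = D0[s];
    }
    __syncthreads();

    // Grid-stride loop: one thread per element, wrapping around when the grid
    // is smaller than the number of elements.
    for (unsigned int e = blockDim.x * blockIdx.x + threadIdx.x; e < nelmt;
         e += blockDim.x * gridDim.x)
    {
        unsigned int offset = nq0 * e;
        for (unsigned int i = 0; i < nq0; ++i)
        {
            TData sum = 0.0;
            for (unsigned int q = 0; q < nq0; ++q)
            {
                sum += s_D0[q * nq0 + i] * in[offset + q];
            }
            // Derivative direction d lives at out[... + d * nSize]; only the
            // first direction is written in this 1D sketch.
            out[offset + i + 0 * nSize] = sum;
        }
    }
}

int main()
{
    const unsigned int nq0 = 4, nelmt = 1000, nSize = nq0 * nelmt;
    std::vector<double> hD0(nq0 * nq0, 1.0), hIn(nSize, 2.0), hOut(nSize, 0.0);

    double *dD0, *dIn, *dOut;
    cudaMalloc(&dD0, hD0.size() * sizeof(double));
    cudaMalloc(&dIn, hIn.size() * sizeof(double));
    cudaMalloc(&dOut, hOut.size() * sizeof(double));
    cudaMemcpy(dD0, hD0.data(), hD0.size() * sizeof(double),
               cudaMemcpyHostToDevice);
    cudaMemcpy(dIn, hIn.data(), hIn.size() * sizeof(double),
               cudaMemcpyHostToDevice);

    // Same launch shape as the patch's defaults: gridSize 1024, blockSize 32,
    // shared-memory bytes passed as the third launch parameter.
    const unsigned int gridSize = 1024, blockSize = 32;
    const unsigned int nshared = sizeof(double) * nq0 * nq0;
    demoDerivKernel<double><<<gridSize, blockSize, nshared>>>(
        nq0, nelmt, nSize, dD0, dIn, dOut);
    cudaMemcpy(hOut.data(), dOut, hOut.size() * sizeof(double),
               cudaMemcpyDeviceToHost);
    std::printf("out[0] = %f\n", hOut[0]); // all-ones D0, all-twos in -> 8.0

    cudaFree(dD0);
    cudaFree(dIn);
    cudaFree(dOut);
    return 0;
}

Compiled with nvcc, the sketch prints out[0] = 8.0 for these uniform inputs; the patch's real kernels follow the same pattern with the Nektar derivative matrices, collapsed-coordinate factors, and geometric factors.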