diff --git a/Operators/IProductWRTDerivBase/IProductWRTDerivBaseCUDA.cu b/Operators/IProductWRTDerivBase/IProductWRTDerivBaseCUDA.cu index 8153c1d9dc8425b8c99f0395ffef0b04742476a8..620b627b4e61e398739bb9ebbdbbaa6502f6ea88 100644 --- a/Operators/IProductWRTDerivBase/IProductWRTDerivBaseCUDA.cu +++ b/Operators/IProductWRTDerivBase/IProductWRTDerivBaseCUDA.cu @@ -2,9 +2,12 @@ namespace Nektar::Operators::detail { + +// Add different IProductWRTDerivBase implementations to the factory. template <> std::string OperatorIProductWRTDerivBaseImpl<double, ImplCUDA>::className = GetOperatorFactory<double>().RegisterCreatorFunction( "IProductWRTDerivBaseCUDA", OperatorIProductWRTDerivBaseImpl<double, ImplCUDA>::instantiate, "..."); -} + +} // namespace Nektar::Operators::detail diff --git a/Operators/IProductWRTDerivBase/IProductWRTDerivBaseCUDA.hpp b/Operators/IProductWRTDerivBase/IProductWRTDerivBaseCUDA.hpp index bb475d9d9fd8f77dec7f42786c047feed4effafa..496da326b354afc0efaaae772da13394cecfd6ba 100644 --- a/Operators/IProductWRTDerivBase/IProductWRTDerivBaseCUDA.hpp +++ b/Operators/IProductWRTDerivBase/IProductWRTDerivBaseCUDA.hpp @@ -6,33 +6,35 @@ #include "Operators/OperatorHelper.cuh" #include "Operators/OperatorIProductWRTDerivBase.hpp" +#define FLAG_QP false + namespace Nektar::Operators::detail { template <typename TData, bool DEFORMED = false> -void IProductWRTDerivBase1DKernel(const size_t gridSize, const size_t blockSize, - const size_t nq0, const size_t nCoord, - const size_t nElmts, const size_t dfSize, - TData *df, TData *in0, TData *in1, TData *in2, - TData *out0); +void IProductWRTDerivBase1DKernel( + const unsigned int gridSize, const unsigned int blockSize, + const unsigned int nq0, const unsigned int nCoord, + const unsigned int nElmts, const unsigned int nSize, + const unsigned int dfSize, const TData *df, const TData *in, TData *out); template <typename TData, bool DEFORMED = false> -void IProductWRTDerivBase2DKernel(const size_t gridSize, const size_t blockSize, - LibUtilities::ShapeType shapetype, - const size_t nq0, const size_t nq1, - const size_t nCoord, const size_t nElmts, - const TData *Z0, const TData *Z1, - const size_t dfSize, TData *df, TData *in0, - TData *in1, TData *in2, TData *out0, - TData *out1); +void IProductWRTDerivBase2DKernel( + const unsigned int gridSize, const unsigned int blockSize, + LibUtilities::ShapeType shapetype, const unsigned int nq0, + const unsigned int nq1, const unsigned int nCoord, + const unsigned int nElmts, const unsigned int nSize, + const unsigned int dfSize, const TData *Z0, const TData *Z1, + const TData *df, const TData *in, TData *out); template <typename TData, bool DEFORMED = false> void IProductWRTDerivBase3DKernel( - const size_t gridSize, const size_t blockSize, - LibUtilities::ShapeType shapetype, const size_t nq0, const size_t nq1, - const size_t nq2, const size_t nCoord, const size_t nElmts, const TData *Z0, - const TData *Z1, const TData *Z2, const size_t dfSize, TData *df, - TData *in0, TData *in1, TData *in2, TData *out0, TData *out1, TData *out2); + const unsigned int gridSize, const unsigned int blockSize, + LibUtilities::ShapeType shapetype, const unsigned int nq0, + const unsigned int nq1, const unsigned int nq2, const unsigned int nCoord, + const unsigned int nElmts, const unsigned int nSize, + const unsigned int dfSize, const TData *Z0, const TData *Z1, + const TData *Z2, const TData *df, const TData *in, TData *out); // IProductWRTDerivBase implementation template <typename TData> @@ -82,22 +84,12 @@ public: m_D = 
GetDerivativeDataCUDA<TData>(expansionList); // Initialize workspace memory. - auto ndata = this->m_expansionList->GetTotPoints(); - cudaMalloc((void **)&m_wsp0, sizeof(TData) * ndata); - if (nCoord > 1) - { - cudaMalloc((void **)&m_wsp1, sizeof(TData) * ndata); - } - if (nCoord > 2) - { - cudaMalloc((void **)&m_wsp2, sizeof(TData) * ndata); - } + auto nStorage = this->m_expansionList->GetTotPoints(); + cudaMalloc((void **)&m_wsp, sizeof(TData) * nStorage * nCoord); } ~OperatorIProductWRTDerivBaseImpl(void) { - size_t nCoord = this->m_expansionList->GetCoordim(0); - DeallocateDataCUDA<TData>(m_basis); DeallocateDataCUDA<TData>(m_dbasis); DeallocateDataCUDA<TData>(m_weight); @@ -105,45 +97,20 @@ public: DeallocateDataCUDA<TData>(m_D); cudaFree(m_jac); cudaFree(m_derivFac); - cudaFree(m_wsp0); - if (nCoord > 1) - { - cudaFree(m_wsp1); - } - if (nCoord > 2) - { - cudaFree(m_wsp2); - } + cudaFree(m_wsp); } void apply(Field<TData, FieldState::Phys> &in, Field<TData, FieldState::Coeff> &out, bool APPEND = false) override { - if (!APPEND) // Zero output (temporary solution) - { - auto *tmpptr = - out.template GetStorage<MemoryRegionCUDA>().GetCPUPtr(); - for (size_t block_idx = 0; block_idx < out.GetBlocks().size(); - ++block_idx) - { - for (size_t i = 0; i < out.GetBlocks()[block_idx].block_size; - i++) - { - *(tmpptr++) = 0.0; - } - } - } - // Copy memory to GPU, if necessary and get raw pointers. - auto *inptr0 = in.template GetStorage<MemoryRegionCUDA>().GetGPUPtr(); - auto *inptr1 = inptr0 + in.GetFieldSize(); - auto *inptr2 = inptr1 + in.GetFieldSize(); - std::vector<TData *> inptr{inptr0, inptr1, inptr2}; + auto *inptr = in.template GetStorage<MemoryRegionCUDA>().GetGPUPtr(); auto *outptr = out.template GetStorage<MemoryRegionCUDA>().GetGPUPtr(); - std::vector<TData *> wspptr{m_wsp0, m_wsp1, m_wsp2}; - auto jacptr = m_jac; - auto dfptr = m_derivFac; + auto jacptr = m_jac; + auto dfptr = m_derivFac; + auto wspptr = m_wsp; + auto nSize = in.GetFieldSize(); // Initialize index. size_t expIdx = 0; @@ -152,6 +119,13 @@ public: std::vector<LibUtilities::BasisKey> basisKeys( 3, LibUtilities::NullBasisKey); + // Zero output. + if (!APPEND) + { + size_t ndata = out.template GetStorage<MemoryRegionCUDA>().size(); + cudaMemset(outptr, 0, sizeof(TData) * ndata); + } + // Loop over the blocks. for (size_t block_idx = 0; block_idx < out.GetBlocks().size(); ++block_idx) @@ -167,7 +141,7 @@ public: SpatialDomains::eDeformed; // Deterime CUDA grid size. - m_gridSize = GetCUDAGridSize(nElmts, m_blockSize); + m_gridSize = GetCUDAGridSize(nElmts, m_blockSize); // Flag for collapsed coordinate correction. 
bool correct = expPtr->GetBasis(0)->GetBasisType() == @@ -190,20 +164,20 @@ public: if (deformed) { IProductWRTDerivBase1DKernel<TData, true>( - m_gridSize, m_blockSize, nq0, nCoord, nElmts, m_dfSize, - dfptr, inptr[0], inptr[1], inptr[2], wspptr[0]); + m_gridSize, m_blockSize, nq0, nCoord, nElmts, nSize, + m_dfSize, dfptr, inptr, wspptr); IProductWRTBase1DKernel<TData, false, true, true>( m_gridSize, m_blockSize, nm0, nq0, nElmts, dbasis0, w0, - jacptr, wspptr[0], outptr); + jacptr, wspptr, outptr); } else { IProductWRTDerivBase1DKernel<TData, false>( - m_gridSize, m_blockSize, nq0, nCoord, nElmts, m_dfSize, - dfptr, inptr[0], inptr[1], inptr[2], wspptr[0]); + m_gridSize, m_blockSize, nq0, nCoord, nElmts, nSize, + m_dfSize, dfptr, inptr, wspptr); IProductWRTBase1DKernel<TData, false, true, false>( m_gridSize, m_blockSize, nm0, nq0, nElmts, dbasis0, w0, - jacptr, wspptr[0], outptr); + jacptr, wspptr, outptr); } } else if (nDim == 2) @@ -226,31 +200,29 @@ public: { IProductWRTDerivBase2DKernel<TData, true>( m_gridSize, m_blockSize, shape, nq0, nq1, nCoord, - nElmts, Z0, Z1, m_dfSize, dfptr, inptr[0], inptr[1], - inptr[2], wspptr[0], wspptr[1]); + nElmts, nSize, m_dfSize, Z0, Z1, dfptr, inptr, wspptr); IProductWRTBase2DKernel<TData, false, true, true>( m_gridSize, m_blockSize, shape, nm0, nm1, nq0, nq1, nElmts, correct, dbasis0, basis1, w0, w1, jacptr, - wspptr[0], outptr); + wspptr, outptr); IProductWRTBase2DKernel<TData, false, true, true>( m_gridSize, m_blockSize, shape, nm0, nm1, nq0, nq1, nElmts, correct, basis0, dbasis1, w0, w1, jacptr, - wspptr[1], outptr); + wspptr + nSize, outptr); } else { IProductWRTDerivBase2DKernel<TData, false>( m_gridSize, m_blockSize, shape, nq0, nq1, nCoord, - nElmts, Z0, Z1, m_dfSize, dfptr, inptr[0], inptr[1], - inptr[2], wspptr[0], wspptr[1]); + nElmts, nSize, m_dfSize, Z0, Z1, dfptr, inptr, wspptr); IProductWRTBase2DKernel<TData, false, true, false>( m_gridSize, m_blockSize, shape, nm0, nm1, nq0, nq1, nElmts, correct, dbasis0, basis1, w0, w1, jacptr, - wspptr[0], outptr); + wspptr, outptr); IProductWRTBase2DKernel<TData, false, true, false>( m_gridSize, m_blockSize, shape, nm0, nm1, nq0, nq1, nElmts, correct, basis0, dbasis1, w0, w1, jacptr, - wspptr[1], outptr); + wspptr + nSize, outptr); } } else if (nDim == 3) @@ -280,50 +252,47 @@ public: { IProductWRTDerivBase3DKernel<TData, true>( m_gridSize, m_blockSize, shape, nq0, nq1, nq2, nCoord, - nElmts, Z0, Z1, Z2, m_dfSize, dfptr, inptr[0], inptr[1], - inptr[2], wspptr[0], wspptr[1], wspptr[2]); + nElmts, nSize, m_dfSize, Z0, Z1, Z2, dfptr, inptr, + wspptr); IProductWRTBase3DKernel<TData, false, true, true>( m_gridSize, m_blockSize, shape, nm0, nm1, nm2, nq0, nq1, nq2, nElmts, correct, dbasis0, basis1, basis2, w0, w1, - w2, jacptr, wspptr[0], outptr); + w2, jacptr, wspptr, outptr); IProductWRTBase3DKernel<TData, false, true, true>( m_gridSize, m_blockSize, shape, nm0, nm1, nm2, nq0, nq1, nq2, nElmts, correct, basis0, dbasis1, basis2, w0, w1, - w2, jacptr, wspptr[1], outptr); + w2, jacptr, wspptr + nSize, outptr); IProductWRTBase3DKernel<TData, false, true, true>( m_gridSize, m_blockSize, shape, nm0, nm1, nm2, nq0, nq1, nq2, nElmts, correct, basis0, basis1, dbasis2, w0, w1, - w2, jacptr, wspptr[2], outptr); + w2, jacptr, wspptr + 2 * nSize, outptr); } else { IProductWRTDerivBase3DKernel<TData, false>( m_gridSize, m_blockSize, shape, nq0, nq1, nq2, nCoord, - nElmts, Z0, Z1, Z2, m_dfSize, dfptr, inptr[0], inptr[1], - inptr[2], wspptr[0], wspptr[1], wspptr[2]); + nElmts, nSize, m_dfSize, Z0, Z1, Z2, dfptr, 
inptr, + wspptr); IProductWRTBase3DKernel<TData, false, true, false>( m_gridSize, m_blockSize, shape, nm0, nm1, nm2, nq0, nq1, nq2, nElmts, correct, dbasis0, basis1, basis2, w0, w1, - w2, jacptr, wspptr[0], outptr); + w2, jacptr, wspptr, outptr); IProductWRTBase3DKernel<TData, false, true, false>( m_gridSize, m_blockSize, shape, nm0, nm1, nm2, nq0, nq1, nq2, nElmts, correct, basis0, dbasis1, basis2, w0, w1, - w2, jacptr, wspptr[1], outptr); + w2, jacptr, wspptr + nSize, outptr); IProductWRTBase3DKernel<TData, false, true, false>( m_gridSize, m_blockSize, shape, nm0, nm1, nm2, nq0, nq1, nq2, nElmts, correct, basis0, basis1, dbasis2, w0, w1, - w2, jacptr, wspptr[2], outptr); + w2, jacptr, wspptr + 2 * nSize, outptr); } } // Increment pointer and index for next element type. jacptr += deformed ? nqTot * nElmts : nElmts; dfptr += deformed ? nqTot * nElmts : nElmts; - for (size_t d = 0; d < nDim; d++) - { - inptr[d] += nqTot * nElmts; - wspptr[d] += nqTot * nElmts; - } + inptr += nqTot * nElmts; + wspptr += nqTot * nElmts; outptr += nmTot * nElmts; expIdx += nElmts; } @@ -339,7 +308,7 @@ public: static std::string className; private: - TData *m_wsp0, *m_wsp1, *m_wsp2; + TData *m_wsp; TData *m_derivFac; TData *m_jac; std::map<std::vector<LibUtilities::BasisKey>, std::vector<TData *>> m_basis; @@ -350,77 +319,155 @@ private: std::map<std::vector<LibUtilities::BasisKey>, std::vector<TData *>> m_D; std::map<std::vector<LibUtilities::BasisKey>, std::vector<TData *>> m_Z; size_t m_dfSize; + size_t m_gridSize = 1024; size_t m_blockSize = 32; - size_t m_gridSize; }; template <typename TData, bool DEFORMED> -void IProductWRTDerivBase1DKernel(const size_t gridSize, const size_t blockSize, - const size_t nq0, const size_t nCoord, - const size_t nElmts, const size_t dfSize, - TData *df, TData *in0, TData *in1, TData *in2, - TData *out0) +void IProductWRTDerivBase1DKernel( + const unsigned int gridSize, const unsigned int blockSize, + const unsigned int nq0, const unsigned int nCoord, + const unsigned int nElmts, const unsigned int nSize, + const unsigned int dfSize, const TData *df, const TData *in, TData *out) { - IProductWRTDerivBaseSegKernel<TData, DEFORMED><<<gridSize, blockSize>>>( - nq0, nCoord, nElmts, dfSize, df, in0, in1, in2, out0); + if (!FLAG_QP) + { + IProductWRTDerivBase1DKernel<TData, DEFORMED><<<gridSize, blockSize>>>( + nq0, nCoord, nElmts, nSize, dfSize, df, in, out); + } + else + { + IProductWRTDerivBase1DKernel_QP<TData, DEFORMED> + <<<gridSize, dim3(32)>>>(nq0, nCoord, nElmts, nSize, dfSize, df, in, + out); + } } template <typename TData, bool DEFORMED> -void IProductWRTDerivBase2DKernel(const size_t gridSize, const size_t blockSize, - LibUtilities::ShapeType shapetype, - const size_t nq0, const size_t nq1, - const size_t nCoord, const size_t nElmts, - const TData *Z0, const TData *Z1, - const size_t dfSize, TData *df, TData *in0, - TData *in1, TData *in2, TData *out0, - TData *out1) +void IProductWRTDerivBase2DKernel( + const unsigned int gridSize, const unsigned int blockSize, + LibUtilities::ShapeType shapetype, const unsigned int nq0, + const unsigned int nq1, const unsigned int nCoord, + const unsigned int nElmts, const unsigned int nSize, + const unsigned int dfSize, const TData *Z0, const TData *Z1, + const TData *df, const TData *in, TData *out) { if (shapetype == LibUtilities::Quad) { - IProductWRTDerivBaseQuadKernel<TData, DEFORMED> - <<<gridSize, blockSize>>>(nq0, nq1, nCoord, nElmts, dfSize, df, in0, - in1, in2, out0, out1); + if (!FLAG_QP) + { + 
IProductWRTDerivBase2DKernel<TData, LibUtilities::Quad, DEFORMED> + <<<gridSize, blockSize>>>(nq0, nq1, nCoord, nElmts, nSize, + dfSize, nullptr, nullptr, df, in, + out); + } + else + { + IProductWRTDerivBase2DKernel_QP<TData, LibUtilities::Quad, DEFORMED> + <<<gridSize, dim3(8, 8)>>>(nq0, nq1, nCoord, nElmts, nSize, + dfSize, nullptr, nullptr, df, in, + out); + } } else if (shapetype == LibUtilities::Tri) { - IProductWRTDerivBaseTriKernel<TData, DEFORMED> - <<<gridSize, blockSize>>>(nq0, nq1, nCoord, nElmts, Z0, Z1, dfSize, - df, in0, in1, in2, out0, out1); + if (!FLAG_QP) + { + unsigned int nshared = sizeof(TData) * (nq0 + nq1); + IProductWRTDerivBase2DKernel<TData, LibUtilities::Tri, DEFORMED> + <<<gridSize, blockSize, nshared>>>(nq0, nq1, nCoord, nElmts, + nSize, dfSize, Z0, Z1, df, + in, out); + } + else + { + IProductWRTDerivBase2DKernel_QP<TData, LibUtilities::Tri, DEFORMED> + <<<gridSize, dim3(8, 8)>>>(nq0, nq1, nCoord, nElmts, nSize, + dfSize, Z0, Z1, df, in, out); + } } } template <typename TData, bool DEFORMED> void IProductWRTDerivBase3DKernel( - const size_t gridSize, const size_t blockSize, - LibUtilities::ShapeType shapetype, const size_t nq0, const size_t nq1, - const size_t nq2, const size_t nCoord, const size_t nElmts, const TData *Z0, - const TData *Z1, const TData *Z2, const size_t dfSize, TData *df, - TData *in0, TData *in1, TData *in2, TData *out0, TData *out1, TData *out2) + const unsigned int gridSize, const unsigned int blockSize, + LibUtilities::ShapeType shapetype, const unsigned int nq0, + const unsigned int nq1, const unsigned int nq2, const unsigned int nCoord, + const unsigned int nElmts, const unsigned int nSize, + const unsigned int dfSize, const TData *Z0, const TData *Z1, + const TData *Z2, const TData *df, const TData *in, TData *out) { if (shapetype == LibUtilities::Hex) { - IProductWRTDerivBaseHexKernel<TData, DEFORMED> - <<<gridSize, blockSize>>>(nq0, nq1, nq2, nCoord, nElmts, dfSize, df, - in0, in1, in2, out0, out1, out2); + if (!FLAG_QP) + { + IProductWRTDerivBase3DKernel<TData, LibUtilities::Hex, DEFORMED> + <<<gridSize, blockSize>>>(nq0, nq1, nq2, nCoord, nElmts, nSize, + dfSize, nullptr, nullptr, nullptr, df, + in, out); + } + else + { + IProductWRTDerivBase3DKernel_QP<TData, LibUtilities::Hex, DEFORMED> + <<<gridSize, dim3(4, 4, 4)>>>(nq0, nq1, nq2, nCoord, nElmts, + nSize, dfSize, nullptr, nullptr, + nullptr, df, in, out); + } } else if (shapetype == LibUtilities::Tet) { - IProductWRTDerivBaseTetKernel<TData, DEFORMED><<<gridSize, blockSize>>>( - nq0, nq1, nq2, nCoord, nElmts, Z0, Z1, Z2, dfSize, df, in0, in1, - in2, out0, out1, out2); + if (!FLAG_QP) + { + unsigned int nshared = sizeof(TData) * (nq0 + 2 * nq1 + nq2); + IProductWRTDerivBase3DKernel<TData, LibUtilities::Tet, DEFORMED> + <<<gridSize, blockSize, nshared>>>(nq0, nq1, nq2, nCoord, + nElmts, nSize, dfSize, Z0, + Z1, Z2, df, in, out); + } + else + { + IProductWRTDerivBase3DKernel_QP<TData, LibUtilities::Tet, DEFORMED> + <<<gridSize, dim3(4, 4, 4)>>>(nq0, nq1, nq2, nCoord, nElmts, + nSize, dfSize, Z0, Z1, Z2, df, in, + out); + } } - else if (shapetype == LibUtilities::Pyr) + else if (shapetype == LibUtilities::Prism) { - IProductWRTDerivBasePyrKernel<TData, DEFORMED><<<gridSize, blockSize>>>( - nq0, nq1, nq2, nCoord, nElmts, Z0, Z1, Z2, dfSize, df, in0, in1, - in2, out0, out1, out2); + if (!FLAG_QP) + { + unsigned int nshared = sizeof(TData) * (nq0 + nq2); + IProductWRTDerivBase3DKernel<TData, LibUtilities::Prism, DEFORMED> + <<<gridSize, blockSize, nshared>>>(nq0, nq1, nq2, nCoord, + 
nElmts, nSize, dfSize, Z0, + nullptr, Z2, df, in, out); + } + else + { + IProductWRTDerivBase3DKernel_QP<TData, LibUtilities::Prism, + DEFORMED> + <<<gridSize, dim3(4, 4, 4)>>>(nq0, nq1, nq2, nCoord, nElmts, + nSize, dfSize, Z0, nullptr, Z2, + df, in, out); + } } - else if (shapetype == LibUtilities::Prism) + else if (shapetype == LibUtilities::Pyr) { - IProductWRTDerivBasePrismKernel<TData, DEFORMED> - <<<gridSize, blockSize>>>(nq0, nq1, nq2, nCoord, nElmts, Z0, Z2, - dfSize, df, in0, in1, in2, out0, out1, - out2); + if (!FLAG_QP) + { + unsigned int nshared = sizeof(TData) * (nq0 + nq1 + nq2); + IProductWRTDerivBase3DKernel<TData, LibUtilities::Pyr, DEFORMED> + <<<gridSize, blockSize, nshared>>>(nq0, nq1, nq2, nCoord, + nElmts, nSize, dfSize, Z0, + Z1, Z2, df, in, out); + } + else + { + IProductWRTDerivBase3DKernel_QP<TData, LibUtilities::Pyr, DEFORMED> + <<<gridSize, dim3(4, 4, 4)>>>(nq0, nq1, nq2, nCoord, nElmts, + nSize, dfSize, Z0, Z1, Z2, df, in, + out); + } } } diff --git a/Operators/IProductWRTDerivBase/IProductWRTDerivBaseCUDAKernels.cuh b/Operators/IProductWRTDerivBase/IProductWRTDerivBaseCUDAKernels.cuh index 433a294497a7d07a175dac8dc78f23ffd08de2ce..cb0c7f21697962216589d2d324bee4a8590bd72b 100644 --- a/Operators/IProductWRTDerivBase/IProductWRTDerivBaseCUDAKernels.cuh +++ b/Operators/IProductWRTDerivBase/IProductWRTDerivBaseCUDAKernels.cuh @@ -1,419 +1,453 @@ +#pragma once + namespace Nektar::Operators::detail { + template <typename TData, bool DEFORMED> -__global__ void IProductWRTDerivBaseSegKernel(const size_t nq0, - const size_t ncoord, - const size_t nelmt, - const size_t dfSize, TData *df, - TData *in0, TData *in1, - TData *in2, TData *out0) +__global__ void IProductWRTDerivBase1DKernel( + const unsigned int nq0, const unsigned int ncoord, const unsigned int nelmt, + const unsigned int nSize, const unsigned int dfSize, + const TData *__restrict df, const TData *__restrict in, + TData *__restrict out) { - size_t e = blockDim.x * blockIdx.x + threadIdx.x; - - if (e >= nelmt) - { - return; - } - - size_t ndf = ncoord; - - // Assign pointers. - TData **inptr = new TData *[ncoord]; - inptr[0] = in0 + (nq0 * e); - if (ncoord > 1) - { - inptr[1] = in1 + (nq0 * e); - } - if (ncoord > 2) - { - inptr[2] = in2 + (nq0 * e); - } - - TData *outptr0 = out0 + (nq0 * e); + unsigned int e = blockDim.x * blockIdx.x + threadIdx.x; - TData **dfptr = new TData *[ndf]; - for (size_t d = 0; d < ndf; d++) + while (e < nelmt) { - dfptr[d] = df + d * dfSize; - dfptr[d] += DEFORMED ? nq0 * e : e; - } + unsigned int offset = nq0 * e; - // Calculate dxi/dx in[0] + dxi/dy in[1] + dxi/dz in[2] - for (size_t i = 0; i < nq0; ++i) - { - size_t dfindex = DEFORMED ? i : 0; - outptr0[i] = dfptr[0][dfindex] * inptr[0][i]; - for (size_t d = 1; d < ncoord; ++d) + for (unsigned int i = 0; i < nq0; ++i) { - outptr0[i] += (dfptr[d][dfindex] * inptr[d][i]); + unsigned int index = offset + i; + unsigned int dfindex = DEFORMED ? 
index : e; + + TData sum = 0.0; + for (unsigned int d = 0; d < ncoord; ++d) + { + sum += df[dfindex + d * dfSize] * in[index + d * nSize]; + } + out[index] = sum; } + + e += blockDim.x * gridDim.x; } - delete[] inptr; - delete[] dfptr; } template <typename TData, bool DEFORMED> -__global__ void IProductWRTDerivBaseQuadKernel( - const size_t nq0, const size_t nq1, const size_t ncoord, const size_t nelmt, - const size_t dfSize, TData *df, TData *in0, TData *in1, TData *in2, - TData *out0, TData *out1) +__global__ void IProductWRTDerivBase1DKernel_QP( + const unsigned int nq0, const unsigned int ncoord, const unsigned int nelmt, + const unsigned int nSize, const unsigned int dfSize, + const TData *__restrict df, const TData *__restrict in, + TData *__restrict out) { - size_t e = blockDim.x * blockIdx.x + threadIdx.x; - - if (e >= nelmt) - { - return; - } - - const auto ndf = 2 * ncoord; - - // Assign pointers. - TData **inptr = new TData *[ncoord]; - inptr[0] = in0 + (nq0 * nq1 * e); - inptr[1] = in1 + (nq0 * nq1 * e); - if (ncoord > 2) - { - inptr[2] = in2 + (nq0 * nq1 * e); - } - - TData *outptr0 = out0 + (nq0 * nq1 * e); - TData *outptr1 = out1 + (nq0 * nq1 * e); + unsigned int e = blockIdx.x; - TData **dfptr = new TData *[ndf]; - for (size_t d = 0; d < ndf; d++) + while (e < nelmt) { - dfptr[d] = df + d * dfSize; - dfptr[d] += DEFORMED ? nq0 * nq1 * e : e; - } + unsigned int offset = nq0 * e; - // Calculate dxi/dx in[0] + dxi/dy in[1] + dxi/dz in[2] - for (size_t i = 0; i < nq0 * nq1; ++i) - { - size_t dfindex = DEFORMED ? i : 0; - outptr0[i] = dfptr[0][dfindex] * inptr[0][i]; - outptr1[i] = dfptr[1][dfindex] * inptr[0][i]; - for (size_t d = 1; d < ncoord; ++d) + for (unsigned int i = threadIdx.x; i < nq0; i += blockDim.x) { - outptr0[i] += (dfptr[2 * d][dfindex] * inptr[d][i]); - outptr1[i] += (dfptr[2 * d + 1][dfindex] * inptr[d][i]); + unsigned int index = offset + i; + unsigned int dfindex = DEFORMED ? index : e; + + TData sum = 0.0; + for (unsigned int d = 0; d < ncoord; ++d) + { + sum += df[dfindex + d * dfSize] * in[index + d * nSize]; + } + out[index] = sum; } + + e += gridDim.x; } - delete[] inptr; - delete[] dfptr; } -template <typename TData, bool DEFORMED> -__global__ void IProductWRTDerivBaseTriKernel( - const size_t nq0, const size_t nq1, const size_t ncoord, const size_t nelmt, - const TData *Z0, const TData *Z1, const size_t dfSize, TData *df, - TData *in0, TData *in1, TData *in2, TData *out0, TData *out1) +template <typename TData, LibUtilities::ShapeType SHAPETYPE, bool DEFORMED> +__global__ void IProductWRTDerivBase2DKernel( + const unsigned int nq0, const unsigned int nq1, const unsigned int ncoord, + const unsigned int nelmt, const unsigned int nSize, + const unsigned int dfSize, const TData *__restrict Z0, + const TData *__restrict Z1, const TData *__restrict df, + const TData *__restrict in, TData *__restrict out) { - size_t e = blockDim.x * blockIdx.x + threadIdx.x; + extern __shared__ TData shared[]; + TData *s_f0, *s_f1; - if (e >= nelmt) + // Copy to shared memory. + if constexpr (SHAPETYPE == LibUtilities::Tri) { - return; - } + s_f0 = shared; + s_f1 = s_f0 + nq1; + + unsigned int sIndex = threadIdx.x; + while (sIndex < nq1) + { + s_f0[sIndex] = 2.0 / (1.0 - Z1[sIndex]); + sIndex += blockDim.x; + } - const auto ndf = 2 * ncoord; + sIndex = threadIdx.x; + while (sIndex < nq0) + { + s_f1[sIndex] = 0.5 * (1.0 + Z0[sIndex]); + sIndex += blockDim.x; + } - // Assign pointers. 
- TData **inptr = new TData *[ncoord]; - inptr[0] = in0 + (nq0 * nq1 * e); - inptr[1] = in1 + (nq0 * nq1 * e); - if (ncoord > 2) - { - inptr[2] = in2 + (nq0 * nq1 * e); + __syncthreads(); } - TData *outptr0 = out0 + (nq0 * nq1 * e); - TData *outptr1 = out1 + (nq0 * nq1 * e); + unsigned int e = blockDim.x * blockIdx.x + threadIdx.x; - TData **dfptr = new TData *[ndf]; - for (size_t d = 0; d < ndf; d++) + while (e < nelmt) { - dfptr[d] = df + d * dfSize; - dfptr[d] += DEFORMED ? nq0 * nq1 * e : e; - } + unsigned int offset = nq0 * nq1 * e; - // Calculate dxi/dx in[0] + dxi/dy in[1] + dxi/dz in[2] - for (size_t j = 0, cnt_ji = 0; j < nq1; ++j) - { - TData f0 = 2.0 / (1.0 - Z1[j]); - for (size_t i = 0; i < nq0; ++i, ++cnt_ji) + for (unsigned int j = 0, cnt_ji = 0; j < nq1; ++j) { - size_t dfindex = DEFORMED ? cnt_ji : 0; - outptr0[cnt_ji] = dfptr[0][dfindex] * inptr[0][cnt_ji]; - outptr1[cnt_ji] = dfptr[1][dfindex] * inptr[0][cnt_ji]; - for (size_t d = 1; d < ncoord; ++d) + for (unsigned int i = 0; i < nq0; ++i, ++cnt_ji) { - outptr0[cnt_ji] += (dfptr[2 * d][dfindex] * inptr[d][cnt_ji]); - outptr1[cnt_ji] += - (dfptr[2 * d + 1][dfindex] * inptr[d][cnt_ji]); - } + unsigned int index = offset + cnt_ji; + unsigned int dfindex = DEFORMED ? index : e; + + TData sum1 = 0.0, sum2 = 0.0; + for (unsigned int d = 0; d < ncoord; ++d) + { + sum1 += (df[dfindex + (2 * d) * dfSize] * + in[index + d * nSize]); + sum2 += (df[dfindex + (2 * d + 1) * dfSize] * + in[index + d * nSize]); + } - // Multiply by geometric factors - TData f1 = 0.5 * (1.0 + Z0[i]); - outptr0[cnt_ji] += outptr1[cnt_ji] * f1; - outptr0[cnt_ji] *= f0; + if constexpr (SHAPETYPE == LibUtilities::Quad) + { + out[index] = sum1; + out[index + nSize] = sum2; + } + else if constexpr (SHAPETYPE == LibUtilities::Tri) + { + out[index] = (sum1 + sum2 * s_f1[i]) * s_f0[j]; + out[index + nSize] = sum2; + } + } } + + e += blockDim.x * gridDim.x; } - delete[] inptr; - delete[] dfptr; } -template <typename TData, bool DEFORMED> -__global__ void IProductWRTDerivBaseHexKernel( - const size_t nq0, const size_t nq1, const size_t nq2, const size_t ncoord, - const size_t nelmt, const size_t dfSize, TData *df, TData *in0, TData *in1, - TData *in2, TData *out0, TData *out1, TData *out2) +template <typename TData, LibUtilities::ShapeType SHAPETYPE, bool DEFORMED> +__global__ void IProductWRTDerivBase2DKernel_QP( + const unsigned int nq0, const unsigned int nq1, const unsigned int ncoord, + const unsigned int nelmt, const unsigned int nSize, + const unsigned int dfSize, const TData *__restrict Z0, + const TData *__restrict Z1, const TData *__restrict df, + const TData *__restrict in, TData *__restrict out) { - size_t e = blockDim.x * blockIdx.x + threadIdx.x; + TData f0, f1; + + unsigned int e = blockIdx.x; - if (e >= nelmt) + while (e < nelmt) { - return; - } + unsigned int offset = nq0 * nq1 * e; - const auto ndf = 3 * ncoord; + for (unsigned int j = threadIdx.y; j < nq1; j += blockDim.y) + { + if constexpr (SHAPETYPE == LibUtilities::Tri) + { + f0 = 2.0 / (1.0 - Z1[j]); + } - // Assign pointers. - TData **inptr = new TData *[ncoord]; - inptr[0] = in0 + (nq0 * nq1 * nq2 * e); - inptr[1] = in1 + (nq0 * nq1 * nq2 * e); - inptr[2] = in2 + (nq0 * nq1 * nq2 * e); + for (unsigned int i = threadIdx.x; i < nq0; i += blockDim.x) + { + unsigned int cnt_ji = nq0 * j + i; + unsigned int index = offset + cnt_ji; + unsigned int dfindex = DEFORMED ? 
index : e; - TData *outptr0 = out0 + (nq0 * nq1 * nq2 * e); - TData *outptr1 = out1 + (nq0 * nq1 * nq2 * e); - TData *outptr2 = out2 + (nq0 * nq1 * nq2 * e); + TData sum1 = 0.0, sum2 = 0.0; + for (unsigned int d = 0; d < ncoord; ++d) + { + sum1 += (df[dfindex + (2 * d) * dfSize] * + in[index + d * nSize]); + sum2 += (df[dfindex + (2 * d + 1) * dfSize] * + in[index + d * nSize]); + } - TData **dfptr = new TData *[ndf]; - for (size_t d = 0; d < ndf; d++) - { - dfptr[d] = df + d * dfSize; - dfptr[d] += DEFORMED ? nq0 * nq1 * nq2 * e : e; - } + // Moving from standard to collapsed coordinates. + if constexpr (SHAPETYPE == LibUtilities::Tri) + { + f1 = 0.5 * (1.0 + Z0[i]); + } - // Calculate dxi/dx in[0] + dxi/dy in[1] + dxi/dz in[2] - for (size_t i = 0; i < nq0 * nq1 * nq2; ++i) - { - size_t dfindex = DEFORMED ? i : 0; - outptr0[i] = dfptr[0][dfindex] * inptr[0][i]; - outptr1[i] = dfptr[1][dfindex] * inptr[0][i]; - outptr2[i] = dfptr[2][dfindex] * inptr[0][i]; - for (size_t d = 1; d < ncoord; ++d) - { - outptr0[i] += (dfptr[3 * d][dfindex] * inptr[d][i]); - outptr1[i] += (dfptr[3 * d + 1][dfindex] * inptr[d][i]); - outptr2[i] += (dfptr[3 * d + 2][dfindex] * inptr[d][i]); + if constexpr (SHAPETYPE == LibUtilities::Quad) + { + out[index] = sum1; + out[index + nSize] = sum2; + } + else if constexpr (SHAPETYPE == LibUtilities::Tri) + { + out[index] = (sum1 + sum2 * f1) * f0; + out[index + nSize] = sum2; + } + } } + + e += gridDim.x; } - delete[] inptr; - delete[] dfptr; } -template <typename TData, bool DEFORMED> -__global__ void IProductWRTDerivBaseTetKernel( - const size_t nq0, const size_t nq1, const size_t nq2, const size_t ncoord, - const size_t nelmt, const TData *Z0, const TData *Z1, const TData *Z2, - const size_t dfSize, TData *df, TData *in0, TData *in1, TData *in2, - TData *out0, TData *out1, TData *out2) +template <typename TData, LibUtilities::ShapeType SHAPETYPE, bool DEFORMED> +__global__ void IProductWRTDerivBase3DKernel( + const unsigned int nq0, const unsigned int nq1, const unsigned int nq2, + const unsigned int ncoord, const unsigned int nelmt, + const unsigned int nSize, const unsigned int dfSize, + const TData *__restrict Z0, const TData *__restrict Z1, + const TData *__restrict Z2, const TData *__restrict df, + const TData *__restrict in, TData *__restrict out) { - size_t e = blockDim.x * blockIdx.x + threadIdx.x; + extern __shared__ TData shared[]; + TData *s_f0, *s_f1, *s_f2, *s_f3; - if (e >= nelmt) + // Copy to shared memory. + if constexpr (SHAPETYPE == LibUtilities::Tet) { - return; - } + s_f0 = shared; + s_f1 = s_f0 + nq1; + s_f2 = s_f1 + nq0; + s_f3 = s_f2 + nq2; + + unsigned int sIndex = threadIdx.x; + while (sIndex < nq1) + { + s_f0[sIndex] = 2.0 / (1.0 - Z1[sIndex]); + sIndex += blockDim.x; + } - const auto ndf = 3 * ncoord; + sIndex = threadIdx.x; + while (sIndex < nq0) + { + s_f1[sIndex] = 0.5 * (1.0 + Z0[sIndex]); + sIndex += blockDim.x; + } - // Assign pointers. 
- TData **inptr = new TData *[ncoord]; - inptr[0] = in0 + (nq0 * nq1 * nq2 * e); - inptr[1] = in1 + (nq0 * nq1 * nq2 * e); - inptr[2] = in2 + (nq0 * nq1 * nq2 * e); + sIndex = threadIdx.x; + while (sIndex < nq2) + { + s_f2[sIndex] = 2.0 / (1.0 - Z2[sIndex]); + sIndex += blockDim.x; + } - TData *outptr0 = out0 + (nq0 * nq1 * nq2 * e); - TData *outptr1 = out1 + (nq0 * nq1 * nq2 * e); - TData *outptr2 = out2 + (nq0 * nq1 * nq2 * e); + sIndex = threadIdx.x; + while (sIndex < nq1) + { + s_f3[sIndex] = 0.5 * (1.0 + Z1[sIndex]); + sIndex += blockDim.x; + } - TData **dfptr = new TData *[ndf]; - for (size_t d = 0; d < ndf; d++) - { - dfptr[d] = df + d * dfSize; - dfptr[d] += DEFORMED ? nq0 * nq1 * nq2 * e : e; + __syncthreads(); } - - // Calculate dxi/dx in[0] + dxi/dy in[1] + dxi/dz in[2] - for (size_t k = 0, cnt_kji = 0; k < nq2; ++k) + else if constexpr (SHAPETYPE == LibUtilities::Prism) { - TData f2 = 2.0 / (1.0 - Z2[k]); - for (size_t j = 0; j < nq1; ++j) - { - TData f3 = 0.5 * (1.0 + Z1[j]); - TData f0 = 2.0 * f2 / (1.0 - Z1[j]); - for (size_t i = 0; i < nq0; ++i, ++cnt_kji) - { - size_t dfindex = DEFORMED ? cnt_kji : 0; - outptr0[cnt_kji] = dfptr[0][dfindex] * inptr[0][cnt_kji]; - outptr1[cnt_kji] = dfptr[1][dfindex] * inptr[0][cnt_kji]; - outptr2[cnt_kji] = dfptr[2][dfindex] * inptr[0][cnt_kji]; - for (size_t d = 1; d < ncoord; ++d) - { - outptr0[cnt_kji] += - (dfptr[3 * d][dfindex] * inptr[d][cnt_kji]); - outptr1[cnt_kji] += - (dfptr[3 * d + 1][dfindex] * inptr[d][cnt_kji]); - outptr2[cnt_kji] += - (dfptr[3 * d + 2][dfindex] * inptr[d][cnt_kji]); - } + s_f1 = shared; + s_f2 = s_f1 + nq0; - // Multiply by geometric factors - TData f1 = 0.5 * (1.0 + Z0[i]); - outptr0[cnt_kji] += (outptr1[cnt_kji] + outptr2[cnt_kji]) * f1; - outptr0[cnt_kji] *= f0; - outptr1[cnt_kji] += outptr2[cnt_kji] * f3; - outptr1[cnt_kji] *= f2; - } + unsigned int sIndex = threadIdx.x; + while (sIndex < nq0) + { + s_f1[sIndex] = 0.5 * (1.0 + Z0[sIndex]); + sIndex += blockDim.x; } - } - delete[] inptr; - delete[] dfptr; -} -template <typename TData, bool DEFORMED> -__global__ void IProductWRTDerivBasePrismKernel( - const size_t nq0, const size_t nq1, const size_t nq2, const size_t ncoord, - const size_t nelmt, const TData *Z0, const TData *Z2, const size_t dfSize, - TData *df, TData *in0, TData *in1, TData *in2, TData *out0, TData *out1, - TData *out2) -{ - size_t e = blockDim.x * blockIdx.x + threadIdx.x; + sIndex = threadIdx.x; + while (sIndex < nq2) + { + s_f2[sIndex] = 2.0 / (1.0 - Z2[sIndex]); + sIndex += blockDim.x; + } - if (e >= nelmt) - { - return; + __syncthreads(); } + else if constexpr (SHAPETYPE == LibUtilities::Pyr) + { + s_f1 = shared; + s_f2 = s_f1 + nq0; + s_f3 = s_f2 + nq2; - const auto ndf = 3 * ncoord; + unsigned int sIndex = threadIdx.x; + while (sIndex < nq0) + { + s_f1[sIndex] = 0.5 * (1.0 + Z0[sIndex]); + sIndex += blockDim.x; + } - // Assign pointers. 
- TData **inptr = new TData *[ncoord]; - inptr[0] = in0 + (nq0 * nq1 * nq2 * e); - inptr[1] = in1 + (nq0 * nq1 * nq2 * e); - inptr[2] = in2 + (nq0 * nq1 * nq2 * e); + sIndex = threadIdx.x; + while (sIndex < nq2) + { + s_f2[sIndex] = 2.0 / (1.0 - Z2[sIndex]); + sIndex += blockDim.x; + } - TData *outptr0 = out0 + (nq0 * nq1 * nq2 * e); - TData *outptr1 = out1 + (nq0 * nq1 * nq2 * e); - TData *outptr2 = out2 + (nq0 * nq1 * nq2 * e); + sIndex = threadIdx.x; + while (sIndex < nq1) + { + s_f3[sIndex] = 0.5 * (1.0 + Z1[sIndex]); + sIndex += blockDim.x; + } - TData **dfptr = new TData *[ndf]; - for (size_t d = 0; d < ndf; d++) - { - dfptr[d] = df + d * dfSize; - dfptr[d] += DEFORMED ? nq0 * nq1 * nq2 * e : e; + __syncthreads(); } - // Calculate dxi/dx in[0] + dxi/dy in[1] + dxi/dz in[2] - for (size_t k = 0, cnt_kji = 0; k < nq2; ++k) + unsigned int e = blockDim.x * blockIdx.x + threadIdx.x; + + while (e < nelmt) { - TData f0 = 2.0 / (1.0 - Z2[k]); - for (size_t j = 0; j < nq1; ++j) + unsigned int offset = nq0 * nq1 * nq2 * e; + + for (unsigned int k = 0, cnt_kji = 0; k < nq2; ++k) { - for (size_t i = 0; i < nq0; ++i, ++cnt_kji) + for (unsigned int j = 0; j < nq1; ++j) { - size_t dfindex = DEFORMED ? cnt_kji : 0; - outptr0[cnt_kji] = dfptr[0][dfindex] * inptr[0][cnt_kji]; - outptr1[cnt_kji] = dfptr[1][dfindex] * inptr[0][cnt_kji]; - outptr2[cnt_kji] = dfptr[2][dfindex] * inptr[0][cnt_kji]; - for (size_t d = 1; d < ncoord; ++d) + for (unsigned int i = 0; i < nq0; ++i, ++cnt_kji) { - outptr0[cnt_kji] += - (dfptr[3 * d][dfindex] * inptr[d][cnt_kji]); - outptr1[cnt_kji] += - (dfptr[3 * d + 1][dfindex] * inptr[d][cnt_kji]); - outptr2[cnt_kji] += - (dfptr[3 * d + 2][dfindex] * inptr[d][cnt_kji]); + unsigned int index = offset + cnt_kji; + unsigned int dfindex = DEFORMED ? 
index : e; + + TData sum1 = 0.0, sum2 = 0.0, sum3 = 0.0; + for (unsigned int d = 0; d < ncoord; ++d) + { + sum1 += (df[dfindex + (3 * d) * dfSize] * + in[index + d * nSize]); + sum2 += (df[dfindex + (3 * d + 1) * dfSize] * + in[index + d * nSize]); + sum3 += (df[dfindex + (3 * d + 2) * dfSize] * + in[index + d * nSize]); + } + + if constexpr (SHAPETYPE == LibUtilities::Hex) + { + out[index] = sum1; + out[index + nSize] = sum2; + out[index + 2 * nSize] = sum3; + } + else if constexpr (SHAPETYPE == LibUtilities::Tet) + { + out[index] = (sum1 + (sum2 + sum3) * s_f1[i]) * + s_f2[k] * s_f0[j]; + out[index + nSize] = (sum2 + sum3 * s_f3[j]) * s_f2[k]; + out[index + 2 * nSize] = sum3; + } + else if constexpr (SHAPETYPE == LibUtilities::Prism) + { + out[index] = (sum1 + sum3 * s_f1[i]) * s_f2[k]; + out[index + nSize] = sum2; + out[index + 2 * nSize] = sum3; + } + else if constexpr (SHAPETYPE == LibUtilities::Pyr) + { + out[index] = (sum1 + sum3 * s_f1[i]) * s_f2[k]; + out[index + nSize] = (sum2 + sum3 * s_f3[j]) * s_f2[k]; + out[index + 2 * nSize] = sum3; + } } - - // Multiply by geometric factors - TData f1 = 0.5 * (1.0 + Z0[i]); - outptr0[cnt_kji] += outptr2[cnt_kji] * f1; - outptr0[cnt_kji] *= f0; } } + + e += blockDim.x * gridDim.x; } - delete[] inptr; - delete[] dfptr; } -template <typename TData, bool DEFORMED> -__global__ void IProductWRTDerivBasePyrKernel( - const size_t nq0, const size_t nq1, const size_t nq2, const size_t ncoord, - const size_t nelmt, const TData *Z0, const TData *Z1, const TData *Z2, - const size_t dfSize, TData *df, TData *in0, TData *in1, TData *in2, - TData *out0, TData *out1, TData *out2) +template <typename TData, LibUtilities::ShapeType SHAPETYPE, bool DEFORMED> +__global__ void IProductWRTDerivBase3DKernel_QP( + const unsigned int nq0, const unsigned int nq1, const unsigned int nq2, + const unsigned int ncoord, const unsigned int nelmt, + const unsigned int nSize, const unsigned int dfSize, + const TData *__restrict Z0, const TData *__restrict Z1, + const TData *__restrict Z2, const TData *__restrict df, + const TData *__restrict in, TData *__restrict out) { - size_t e = blockDim.x * blockIdx.x + threadIdx.x; + TData f0, f1, f2, f3; - if (e >= nelmt) - { - return; - } - - const auto ndf = 3 * ncoord; - - // Assign pointers. - TData **inptr = new TData *[ncoord]; - inptr[0] = in0 + (nq0 * nq1 * nq2 * e); - inptr[1] = in1 + (nq0 * nq1 * nq2 * e); - inptr[2] = in2 + (nq0 * nq1 * nq2 * e); + unsigned int e = blockIdx.x; - TData *outptr0 = out0 + (nq0 * nq1 * nq2 * e); - TData *outptr1 = out1 + (nq0 * nq1 * nq2 * e); - TData *outptr2 = out2 + (nq0 * nq1 * nq2 * e); - - TData **dfptr = new TData *[ndf]; - for (size_t d = 0; d < ndf; d++) + while (e < nelmt) { - dfptr[d] = df + d * dfSize; - dfptr[d] += DEFORMED ? nq0 * nq1 * nq2 * e : e; - } + unsigned int offset = nq0 * nq1 * nq2 * e; - // Calculate dxi/dx in[0] + dxi/dy in[1] + dxi/dz in[2] - for (size_t k = 0, cnt_kji = 0; k < nq2; ++k) - { - TData f0 = 2.0 / (1.0 - Z2[k]); - for (size_t j = 0; j < nq1; ++j) + for (unsigned int k = threadIdx.z; k < nq2; k += blockDim.z) { - TData f2 = 0.5 * (1.0 + Z1[j]); - for (size_t i = 0; i < nq0; ++i, ++cnt_kji) + if constexpr (SHAPETYPE == LibUtilities::Tet || + SHAPETYPE == LibUtilities::Prism || + SHAPETYPE == LibUtilities::Pyr) + { + f2 = 2.0 / (1.0 - Z2[k]); + } + + for (unsigned int j = threadIdx.y; j < nq1; j += blockDim.y) { - size_t dfindex = DEFORMED ? 
cnt_kji : 0;
-                outptr0[cnt_kji] = dfptr[0][dfindex] * inptr[0][cnt_kji];
-                outptr1[cnt_kji] = dfptr[1][dfindex] * inptr[0][cnt_kji];
-                outptr2[cnt_kji] = dfptr[2][dfindex] * inptr[0][cnt_kji];
-                for (size_t d = 1; d < ncoord; ++d)
+                if constexpr (SHAPETYPE == LibUtilities::Tet ||
+                              SHAPETYPE == LibUtilities::Pyr)
+                {
+                    f3 = 0.5 * (1.0 + Z1[j]);
+                }
+                if constexpr (SHAPETYPE == LibUtilities::Tet)
                 {
-                    outptr0[cnt_kji] +=
-                        (dfptr[3 * d][dfindex] * inptr[d][cnt_kji]);
-                    outptr1[cnt_kji] +=
-                        (dfptr[3 * d + 1][dfindex] * inptr[d][cnt_kji]);
-                    outptr2[cnt_kji] +=
-                        (dfptr[3 * d + 2][dfindex] * inptr[d][cnt_kji]);
+                    f0 = 2.0 / (1.0 - Z1[j]);
                 }
 
-                // Multiply by geometric factors
-                TData f1 = 0.5 * (1.0 + Z0[i]);
-                outptr0[cnt_kji] += outptr2[cnt_kji] * f1;
-                outptr0[cnt_kji] *= f0;
-                outptr1[cnt_kji] += outptr2[cnt_kji] * f2;
-                outptr1[cnt_kji] *= f0;
+                for (unsigned int i = threadIdx.x; i < nq0; i += blockDim.x)
+                {
+                    unsigned int index = offset + nq0 * nq1 * k + nq0 * j + i;
+                    unsigned int dfindex = DEFORMED ? index : e;
+
+                    TData sum1 = 0.0, sum2 = 0.0, sum3 = 0.0;
+                    for (unsigned int d = 0; d < ncoord; ++d)
+                    {
+                        sum1 += (df[dfindex + (3 * d) * dfSize] *
+                                 in[index + d * nSize]);
+                        sum2 += (df[dfindex + (3 * d + 1) * dfSize] *
+                                 in[index + d * nSize]);
+                        sum3 += (df[dfindex + (3 * d + 2) * dfSize] *
+                                 in[index + d * nSize]);
+                    }
+
+                    if constexpr (SHAPETYPE == LibUtilities::Tet ||
+                                  SHAPETYPE == LibUtilities::Prism ||
+                                  SHAPETYPE == LibUtilities::Pyr)
+                    {
+                        f1 = 0.5 * (1.0 + Z0[i]);
+                    }
+
+                    if constexpr (SHAPETYPE == LibUtilities::Hex)
+                    {
+                        out[index] = sum1;
+                        out[index + nSize] = sum2;
+                        out[index + 2 * nSize] = sum3;
+                    }
+                    else if constexpr (SHAPETYPE == LibUtilities::Tet)
+                    {
+                        out[index] = (sum1 + (sum2 + sum3) * f1) * f2 * f0;
+                        out[index + nSize] = (sum2 + sum3 * f3) * f2;
+                        out[index + 2 * nSize] = sum3;
+                    }
+                    else if constexpr (SHAPETYPE == LibUtilities::Prism)
+                    {
+                        out[index] = (sum1 + sum3 * f1) * f2;
+                        out[index + nSize] = sum2;
+                        out[index + 2 * nSize] = sum3;
+                    }
+                    else if constexpr (SHAPETYPE == LibUtilities::Pyr)
+                    {
+                        out[index] = (sum1 + sum3 * f1) * f2;
+                        out[index + nSize] = (sum2 + sum3 * f3) * f2;
+                        out[index + 2 * nSize] = sum3;
+                    }
+                }
             }
         }
+
+        e += gridDim.x;
     }
-    delete[] inptr;
-    delete[] dfptr;
 }
+
 } // namespace Nektar::Operators::detail
diff --git a/Operators/IProductWRTDerivBase/IProductWRTDerivBaseStdMat.hpp b/Operators/IProductWRTDerivBase/IProductWRTDerivBaseStdMat.hpp
index 6a5d7fa301958e56ea8a9089679b2e30767fc0db..1e24a732b5ca307b45612cc496bba7e7fbab9c48 100644
--- a/Operators/IProductWRTDerivBase/IProductWRTDerivBaseStdMat.hpp
+++ b/Operators/IProductWRTDerivBase/IProductWRTDerivBaseStdMat.hpp
@@ -1,6 +1,9 @@
-#include "Operators/OperatorIProductWRTDerivBase.hpp"
+#pragma once
+
 #include <StdRegions/StdExpansion.h>
 
+#include "Operators/OperatorIProductWRTDerivBase.hpp"
+
 namespace Nektar::Operators::detail
 {
 
@@ -44,7 +47,7 @@ public:
         size_t nmTot = expPtr->GetNcoeffs();
         auto &matPtr = m_matPtr[basisKeys];
         matPtr = Array<OneD, Array<OneD, TData>>(nDim);
-        Array<OneD, NekDouble> tmp(nqTot), t;
+        Array<OneD, TData> tmp(nqTot), t;
         for (size_t d = 0; d < nDim; ++d)
         {
             // Get IProductWRTDerivBase matrix.
@@ -61,12 +64,8 @@ public:
         }
 
         // Initialize workspace memory.
- auto ndata = this->m_expansionList->GetTotPoints(); - m_wsp = Array<OneD, Array<OneD, NekDouble>>(3); - for (size_t i = 0; i < nDim; ++i) - { - m_wsp[i] = Array<OneD, NekDouble>(ndata); - } + auto nStorage = this->m_expansionList->GetTotPoints(); + m_wsp = Array<OneD, TData>(nStorage * nDim); } void apply(Field<TData, FieldState::Phys> &in, @@ -74,13 +73,11 @@ public: bool APPEND = false) override { // Copy memory to GPU, if necessary and get raw pointers. - auto *inptr0 = in.GetStorage().GetCPUPtr(); - auto *inptr1 = inptr0 + in.GetFieldSize(); - auto *inptr2 = inptr1 + in.GetFieldSize(); - std::vector<TData *> inptr{inptr0, inptr1, inptr2}; - auto *outptr = out.GetStorage().GetCPUPtr(); - std::vector<TData *> wspptr{m_wsp[0].get(), m_wsp[1].get(), - m_wsp[2].get()}; + auto *inptr = in.GetStorage().GetCPUPtr(); + auto *outptr = out.GetStorage().GetCPUPtr(); + auto *wspptr = m_wsp.get(); + auto nSize = in.GetFieldSize(); + auto nStorage = this->m_expansionList->GetTotPoints(); // Initialize index. size_t expIdx = 0; @@ -111,12 +108,14 @@ public: for (size_t d = 0; d < nDim; ++d) { Vmath::Vmul(nqTot * nElmts, m_derivFac[d].get() + dfIdx, 1, - inptr[0], 1, wspptr[d], 1); + inptr, 1, wspptr + d * nStorage, 1); for (size_t i = 1; i < nCoord; ++i) { Vmath::Vvtvp(nqTot * nElmts, m_derivFac[d + i * nDim].get() + dfIdx, 1, - inptr[i], 1, wspptr[d], 1, wspptr[d], 1); + inptr + i * nSize, 1, + wspptr + d * nStorage, 1, + wspptr + d * nStorage, 1); } } } @@ -127,14 +126,15 @@ public: for (size_t d = 0; d < nDim; ++d) { Vmath::Smul(nqTot, m_derivFac[d][dfIdx + e], - inptr[0] + e * nqTot, 1, - wspptr[d] + e * nqTot, 1); + inptr + e * nqTot, 1, + wspptr + d * nStorage + e * nqTot, 1); for (size_t i = 1; i < nCoord; ++i) { - Vmath::Svtvp( - nqTot, m_derivFac[d + i * nDim][dfIdx + e], - inptr[i] + e * nqTot, 1, wspptr[d] + e * nqTot, - 1, wspptr[d] + e * nqTot, 1); + Vmath::Svtvp(nqTot, + m_derivFac[d + i * nDim][dfIdx + e], + inptr + i * nSize + e * nqTot, 1, + wspptr + d * nStorage + e * nqTot, 1, + wspptr + d * nStorage + e * nqTot, 1); } } } @@ -146,7 +146,8 @@ public: for (size_t d = 0; d < nDim; ++d) { Vmath::Vmul(nqTot * nElmts, m_jac.get() + jacIdx, 1, - wspptr[d], 1, wspptr[d], 1); + wspptr + d * nStorage, 1, wspptr + d * nStorage, + 1); } } else @@ -156,8 +157,8 @@ public: for (size_t d = 0; d < nDim; ++d) { Vmath::Smul(nqTot, m_jac[jacIdx + e], - wspptr[d] + e * nqTot, 1, - wspptr[d] + e * nqTot, 1); + wspptr + d * nStorage + e * nqTot, 1, + wspptr + d * nStorage + e * nqTot, 1); } } } @@ -176,15 +177,13 @@ public: { TData alpha = (d == 0 && !APPEND) ? 0.0 : 1.0; Blas::Dgemm('N', 'N', nmTot, nElmts, nqTot, 1.0, - matPtr[d].get(), nmTot, wspptr[d], nqTot, alpha, - outptr, nmTot); - - // Increment pointer and index for next element type. - inptr[d] += in.GetBlocks()[block_idx].block_size; - wspptr[d] += nqTot * nElmts; + matPtr[d].get(), nmTot, wspptr + d * nStorage, + nqTot, alpha, outptr, nmTot); } jacIdx += deformed ? nqTot * nElmts : nElmts; dfIdx += deformed ? 
nqTot * nElmts : nElmts; + wspptr += nqTot * nElmts; + inptr += in.GetBlocks()[block_idx].block_size; outptr += out.GetBlocks()[block_idx].block_size; expIdx += nElmts; } @@ -205,7 +204,7 @@ private: std::map<std::vector<LibUtilities::BasisKey>, Array<OneD, Array<OneD, TData>>> m_matPtr; - Array<OneD, Array<OneD, TData>> m_wsp; + Array<OneD, TData> m_wsp; }; } // namespace Nektar::Operators::detail diff --git a/tests/test_ipwrtderivbasecuda.cpp b/tests/test_ipwrtderivbasecuda.cpp index efe3fa636dec457b56ba8b06ce57660672fdfa86..7a269389ff3017eff0695dadecbfafb5ccc65d43 100644 --- a/tests/test_ipwrtderivbasecuda.cpp +++ b/tests/test_ipwrtderivbasecuda.cpp @@ -14,14 +14,13 @@ BOOST_AUTO_TEST_SUITE(TestIProductWRTDerivBaseCUDA) BOOST_FIXTURE_TEST_CASE(ipwrtderivbasecuda_seg, Seg) { Configure(1, 1); - SetTestCase(fixt_in->GetBlocks(), fixt_in->GetStorage().GetCPUPtr()); SetTestCase( fixtcuda_in->GetBlocks(), fixtcuda_in->template GetStorage<MemoryRegionCUDA>().GetCPUPtr()); - IProductWRTDerivBase<>::create(fixt_explist, "StdMat") - ->apply(*fixt_in, *fixt_expected); IProductWRTDerivBase<>::create(fixt_explist, "CUDA") ->apply(*fixtcuda_in, *fixtcuda_out); + ExpectedSolution(fixt_expected->GetBlocks(), + fixt_expected->GetStorage().GetCPUPtr()); BOOST_TEST(fixtcuda_out->compare(*fixt_expected, 1.0E-12)); boost::test_tools::output_test_stream output; { @@ -34,14 +33,13 @@ BOOST_FIXTURE_TEST_CASE(ipwrtderivbasecuda_seg, Seg) BOOST_FIXTURE_TEST_CASE(ipwrtderivbasecuda_quad, Quad) { Configure(2, 1); - SetTestCase(fixt_in->GetBlocks(), fixt_in->GetStorage().GetCPUPtr()); SetTestCase( fixtcuda_in->GetBlocks(), fixtcuda_in->template GetStorage<MemoryRegionCUDA>().GetCPUPtr()); - IProductWRTDerivBase<>::create(fixt_explist, "StdMat") - ->apply(*fixt_in, *fixt_expected); IProductWRTDerivBase<>::create(fixt_explist, "CUDA") ->apply(*fixtcuda_in, *fixtcuda_out); + ExpectedSolution(fixt_expected->GetBlocks(), + fixt_expected->GetStorage().GetCPUPtr()); BOOST_TEST(fixtcuda_out->compare(*fixt_expected, 1.0E-12)); boost::test_tools::output_test_stream output; { @@ -54,14 +52,13 @@ BOOST_FIXTURE_TEST_CASE(ipwrtderivbasecuda_quad, Quad) BOOST_FIXTURE_TEST_CASE(ipwrtderivbasecuda_tri, Tri) { Configure(2, 1); - SetTestCase(fixt_in->GetBlocks(), fixt_in->GetStorage().GetCPUPtr()); SetTestCase( fixtcuda_in->GetBlocks(), fixtcuda_in->template GetStorage<MemoryRegionCUDA>().GetCPUPtr()); - IProductWRTDerivBase<>::create(fixt_explist, "StdMat") - ->apply(*fixt_in, *fixt_expected); IProductWRTDerivBase<>::create(fixt_explist, "CUDA") ->apply(*fixtcuda_in, *fixtcuda_out); + ExpectedSolution(fixt_expected->GetBlocks(), + fixt_expected->GetStorage().GetCPUPtr()); BOOST_TEST(fixtcuda_out->compare(*fixt_expected, 1.0E-12)); boost::test_tools::output_test_stream output; { @@ -75,14 +72,13 @@ BOOST_FIXTURE_TEST_CASE(ipwrtderivbasecuda_square_all_elements, SquareAllElements) { Configure(2, 1); - SetTestCase(fixt_in->GetBlocks(), fixt_in->GetStorage().GetCPUPtr()); SetTestCase( fixtcuda_in->GetBlocks(), fixtcuda_in->template GetStorage<MemoryRegionCUDA>().GetCPUPtr()); - IProductWRTDerivBase<>::create(fixt_explist, "StdMat") - ->apply(*fixt_in, *fixt_expected); IProductWRTDerivBase<>::create(fixt_explist, "CUDA") ->apply(*fixtcuda_in, *fixtcuda_out); + ExpectedSolution(fixt_expected->GetBlocks(), + fixt_expected->GetStorage().GetCPUPtr()); BOOST_TEST(fixtcuda_out->compare(*fixt_expected, 1.0E-12)); boost::test_tools::output_test_stream output; { @@ -95,14 +91,13 @@ BOOST_FIXTURE_TEST_CASE(ipwrtderivbasecuda_square_all_elements, 
BOOST_FIXTURE_TEST_CASE(ipwrtderivbasecuda_hex, Hex) { Configure(3, 1); - SetTestCase(fixt_in->GetBlocks(), fixt_in->GetStorage().GetCPUPtr()); SetTestCase( fixtcuda_in->GetBlocks(), fixtcuda_in->template GetStorage<MemoryRegionCUDA>().GetCPUPtr()); - IProductWRTDerivBase<>::create(fixt_explist, "StdMat") - ->apply(*fixt_in, *fixt_expected); IProductWRTDerivBase<>::create(fixt_explist, "CUDA") ->apply(*fixtcuda_in, *fixtcuda_out); + ExpectedSolution(fixt_expected->GetBlocks(), + fixt_expected->GetStorage().GetCPUPtr()); BOOST_TEST(fixtcuda_out->compare(*fixt_expected, 1.0E-12)); boost::test_tools::output_test_stream output; { @@ -115,14 +110,13 @@ BOOST_FIXTURE_TEST_CASE(ipwrtderivbasecuda_hex, Hex) BOOST_FIXTURE_TEST_CASE(ipwrtderivbasecuda_prism, Prism) { Configure(3, 1); - SetTestCase(fixt_in->GetBlocks(), fixt_in->GetStorage().GetCPUPtr()); SetTestCase( fixtcuda_in->GetBlocks(), fixtcuda_in->template GetStorage<MemoryRegionCUDA>().GetCPUPtr()); - IProductWRTDerivBase<>::create(fixt_explist, "StdMat") - ->apply(*fixt_in, *fixt_expected); IProductWRTDerivBase<>::create(fixt_explist, "CUDA") ->apply(*fixtcuda_in, *fixtcuda_out); + ExpectedSolution(fixt_expected->GetBlocks(), + fixt_expected->GetStorage().GetCPUPtr()); BOOST_TEST(fixtcuda_out->compare(*fixt_expected, 1.0E-12)); boost::test_tools::output_test_stream output; { @@ -135,14 +129,13 @@ BOOST_FIXTURE_TEST_CASE(ipwrtderivbasecuda_prism, Prism) BOOST_FIXTURE_TEST_CASE(ipwrtderivbasecuda_pyr, Pyr) { Configure(3, 1); - SetTestCase(fixt_in->GetBlocks(), fixt_in->GetStorage().GetCPUPtr()); SetTestCase( fixtcuda_in->GetBlocks(), fixtcuda_in->template GetStorage<MemoryRegionCUDA>().GetCPUPtr()); - IProductWRTDerivBase<>::create(fixt_explist, "StdMat") - ->apply(*fixt_in, *fixt_expected); IProductWRTDerivBase<>::create(fixt_explist, "CUDA") ->apply(*fixtcuda_in, *fixtcuda_out); + ExpectedSolution(fixt_expected->GetBlocks(), + fixt_expected->GetStorage().GetCPUPtr()); BOOST_TEST(fixtcuda_out->compare(*fixt_expected, 1.0E-12)); boost::test_tools::output_test_stream output; { @@ -155,14 +148,13 @@ BOOST_FIXTURE_TEST_CASE(ipwrtderivbasecuda_pyr, Pyr) BOOST_FIXTURE_TEST_CASE(ipwrtderivbasecuda_tet, Tet) { Configure(3, 1); - SetTestCase(fixt_in->GetBlocks(), fixt_in->GetStorage().GetCPUPtr()); SetTestCase( fixtcuda_in->GetBlocks(), fixtcuda_in->template GetStorage<MemoryRegionCUDA>().GetCPUPtr()); - IProductWRTDerivBase<>::create(fixt_explist, "StdMat") - ->apply(*fixt_in, *fixt_expected); IProductWRTDerivBase<>::create(fixt_explist, "CUDA") ->apply(*fixtcuda_in, *fixtcuda_out); + ExpectedSolution(fixt_expected->GetBlocks(), + fixt_expected->GetStorage().GetCPUPtr()); BOOST_TEST(fixtcuda_out->compare(*fixt_expected, 1.0E-12)); boost::test_tools::output_test_stream output; { @@ -175,14 +167,13 @@ BOOST_FIXTURE_TEST_CASE(ipwrtderivbasecuda_tet, Tet) BOOST_FIXTURE_TEST_CASE(ipwrtderivbasecuda_cube_prism_hex, CubePrismHex) { Configure(3, 1); - SetTestCase(fixt_in->GetBlocks(), fixt_in->GetStorage().GetCPUPtr()); SetTestCase( fixtcuda_in->GetBlocks(), fixtcuda_in->template GetStorage<MemoryRegionCUDA>().GetCPUPtr()); - IProductWRTDerivBase<>::create(fixt_explist, "StdMat") - ->apply(*fixt_in, *fixt_expected); IProductWRTDerivBase<>::create(fixt_explist, "CUDA") ->apply(*fixtcuda_in, *fixtcuda_out); + ExpectedSolution(fixt_expected->GetBlocks(), + fixt_expected->GetStorage().GetCPUPtr()); BOOST_TEST(fixtcuda_out->compare(*fixt_expected, 1.0E-12)); boost::test_tools::output_test_stream output; { @@ -195,14 +186,13 @@ 
BOOST_FIXTURE_TEST_CASE(ipwrtderivbasecuda_cube_prism_hex, CubePrismHex) BOOST_FIXTURE_TEST_CASE(ipwrtderivbasecuda_cube_all_elements, CubeAllElements) { Configure(3, 1); - SetTestCase(fixt_in->GetBlocks(), fixt_in->GetStorage().GetCPUPtr()); SetTestCase( fixtcuda_in->GetBlocks(), fixtcuda_in->template GetStorage<MemoryRegionCUDA>().GetCPUPtr()); - IProductWRTDerivBase<>::create(fixt_explist, "StdMat") - ->apply(*fixt_in, *fixt_expected); IProductWRTDerivBase<>::create(fixt_explist, "CUDA") ->apply(*fixtcuda_in, *fixtcuda_out); + ExpectedSolution(fixt_expected->GetBlocks(), + fixt_expected->GetStorage().GetCPUPtr()); BOOST_TEST(fixtcuda_out->compare(*fixt_expected, 1.0E-12)); boost::test_tools::output_test_stream output; {