diff --git a/Operators/BwdTrans/BwdTransCUDA.cu b/Operators/BwdTrans/BwdTransCUDA.cu
index 4e4e580a27936b0c35982c830ceaa77e0b8e66fb..2f9ac718a4d1940b8c61a2d08db26dad51093145 100644
--- a/Operators/BwdTrans/BwdTransCUDA.cu
+++ b/Operators/BwdTrans/BwdTransCUDA.cu
@@ -7,4 +7,4 @@ std::string OperatorBwdTransImpl<double, ImplCUDA>::className =
     GetOperatorFactory<double>().RegisterCreatorFunction(
         "BwdTransCUDA", OperatorBwdTransImpl<double, ImplCUDA>::instantiate,
         "...");
-}
+} // namespace Nektar::Operators::detail
diff --git a/Operators/DirBndCond/DirBndCondCUDAKernels.cuh b/Operators/DirBndCond/DirBndCondCUDAKernels.cuh
index f37b523ab77188974f3b331b4454caf056c69b46..879444cb9e5380a484c7047011faa6801d9563ad 100644
--- a/Operators/DirBndCond/DirBndCondCUDAKernels.cuh
+++ b/Operators/DirBndCond/DirBndCondCUDAKernels.cuh
@@ -21,8 +21,9 @@ __global__ void DirBndCondKernel(const size_t nsize, const int *offsetptr,
 
     if (bctypeptr[i] == eDirichlet)
     {
-        int offset = offsetptr[i];
-        for (size_t j = 0; j < ncoeffptr[i]; j++)
+        size_t offset = offsetptr[i];
+        size_t ncoeff = ncoeffptr[i];
+        for (size_t j = 0; j < ncoeff; j++)
         {
             outptr[mapptr[offset + j]] = inptr[offset + j];
         }
@@ -45,8 +46,9 @@ __global__ void DirBndCondKernel(const size_t nsize, const int *offsetptr,
 
     if (bctypeptr[i] == eDirichlet)
     {
-        int offset = offsetptr[i];
-        for (size_t j = 0; j < ncoeffptr[i]; j++)
+        size_t offset = offsetptr[i];
+        size_t ncoeff = ncoeffptr[i];
+        for (size_t j = 0; j < ncoeff; j++)
         {
             outptr[mapptr[offset + j]] =
                 signptr[offset + j] * inptr[offset + j];
diff --git a/Operators/Helmholtz/HelmholtzCUDA.hpp b/Operators/Helmholtz/HelmholtzCUDA.hpp
index 3dae3c4b153d719e46aa3df3568d12df541d19ae..17a293ec0a8f761b58e95efa3750bbd6eae11a2d 100644
--- a/Operators/Helmholtz/HelmholtzCUDA.hpp
+++ b/Operators/Helmholtz/HelmholtzCUDA.hpp
@@ -1,3 +1,5 @@
+#pragma once
+
 #include "Operators/BwdTrans/BwdTransCUDA.hpp"
 #include "Operators/Helmholtz/HelmholtzCUDAKernels.cuh"
 #include "Operators/IProductWRTBase/IProductWRTBaseCUDA.hpp"
@@ -42,6 +44,11 @@ public:
                    sizeof(TData) * nCoord * nCoord, cudaMemcpyHostToDevice);
     }
 
+    ~OperatorHelmholtzImpl(void)
+    {
+        cudaFree(m_diffCoeff);
+    }
+
     void apply(Field<TData, FieldState::Coeff> &in,
                Field<TData, FieldState::Coeff> &out) override
     {
@@ -68,7 +75,6 @@ public:
             deriv.template GetStorage<MemoryRegionCUDA>().GetGPUPtr();
         auto *derivptr1 = derivptr0 + deriv.GetFieldSize();
         auto *derivptr2 = derivptr1 + deriv.GetFieldSize();
-        std::vector<TData *> derivptr{derivptr0, derivptr1, derivptr2};
 
         // Initialize index.
         size_t expIdx = 0;
@@ -83,38 +89,33 @@ public:
             auto nqTot = expPtr->GetTotPoints();
 
             // Determine CUDA grid parameters.
-            m_gridSize = nElmts / m_blockSize;
-            m_gridSize += (nElmts % m_blockSize == 0) ? 0 : 1;
+            m_gridSize = (nElmts * nqTot) / m_blockSize;
+            m_gridSize += ((nElmts * nqTot) % m_blockSize == 0) ? 0 : 1;
 
             // Multiply by diffusion coefficient.
             if (nCoord == 1)
             {
-                auto nq0 = expPtr->GetNumPoints(0);
                 DiffusionCoeff1DKernel<<<m_gridSize, m_blockSize>>>(
-                    nq0, nElmts, m_diffCoeff, derivptr[0]);
+                    nqTot * nElmts, m_diffCoeff, derivptr0);
+                derivptr0 += nqTot * nElmts;
             }
             else if (nCoord == 2)
             {
-                auto nq0 = expPtr->GetNumPoints(0);
-                auto nq1 = expPtr->GetNumPoints(1);
                 DiffusionCoeff2DKernel<<<m_gridSize, m_blockSize>>>(
-                    nq0, nq1, nElmts, m_diffCoeff, derivptr[0], derivptr[1]);
+                    nqTot * nElmts, m_diffCoeff, derivptr0, derivptr1);
+                derivptr0 += nqTot * nElmts;
+                derivptr1 += nqTot * nElmts;
             }
             else
             {
-                auto nq0 = expPtr->GetNumPoints(0);
-                auto nq1 = expPtr->GetNumPoints(1);
-                auto nq2 = expPtr->GetNumPoints(2);
                 DiffusionCoeff3DKernel<<<m_gridSize, m_blockSize>>>(
-                    nq0, nq1, nq2, nElmts, m_diffCoeff, derivptr[0],
-                    derivptr[1], derivptr[2]);
+                    nqTot * nElmts, m_diffCoeff, derivptr0, derivptr1,
+                    derivptr2);
+                derivptr0 += nqTot * nElmts;
+                derivptr1 += nqTot * nElmts;
+                derivptr2 += nqTot * nElmts;
             }
 
-            // Increment pointer and index for next element type.
-            for (size_t d = 0; d < nCoord; d++)
-            {
-                derivptr[d] += nqTot * nElmts;
-            }
             expIdx += nElmts;
         }
     }
diff --git a/Operators/Helmholtz/HelmholtzCUDAKernels.cuh b/Operators/Helmholtz/HelmholtzCUDAKernels.cuh
index 61d4bb480899c66b505076ac28e844ed868c5d44..02f9154de0b855a79c302334ff04a2b9dd2f41a1 100644
--- a/Operators/Helmholtz/HelmholtzCUDAKernels.cuh
+++ b/Operators/Helmholtz/HelmholtzCUDAKernels.cuh
@@ -1,98 +1,78 @@
 namespace Nektar::Operators::detail
 {
+
 template <typename TData>
-__global__ void DiffusionCoeff1DKernel(const size_t nq0, const size_t nelmt,
+__global__ void DiffusionCoeff1DKernel(const size_t nsize,
                                        const TData *diffCoeff, TData *deriv0)
 {
-    size_t e = blockDim.x * blockIdx.x + threadIdx.x;
+    size_t i = blockDim.x * blockIdx.x + threadIdx.x;
 
-    if (e >= nelmt)
+    if (i >= nsize)
     {
         return;
     }
 
-    // Assign pointers.
-    TData *derivptr = deriv0 + nq0 * e;
-
-    for (size_t i = 0; i < nq0; i++)
-    {
-        derivptr[i] *= diffCoeff[0];
-    }
+    deriv0[i] *= diffCoeff[0];
 }
 
 template <typename TData>
-__global__ void DiffusionCoeff2DKernel(const size_t nq0, const size_t nq1,
-                                       const size_t nelmt,
+__global__ void DiffusionCoeff2DKernel(const size_t nsize,
                                        const TData *diffCoeff, TData *deriv0,
                                        TData *deriv1)
 {
-    size_t e = blockDim.x * blockIdx.x + threadIdx.x;
+    __shared__ TData s_diffCoeff[4];
 
-    if (e >= nelmt)
+    size_t ind = threadIdx.x;
+    if (ind < 4)
     {
-        return;
+        s_diffCoeff[ind] = diffCoeff[ind];
     }
 
-    constexpr size_t ncoord = 2;
+    __syncthreads();
 
-    // Assign pointers.
-    TData **derivptr = new TData *[ncoord];
-    derivptr[0] = deriv0 + nq0 * nq1 * e;
-    derivptr[1] = deriv1 + nq0 * nq1 * e;
+    size_t i = blockDim.x * blockIdx.x + threadIdx.x;
 
-    for (size_t j = 0, cnt = 0; j < nq1; j++)
+    if (i >= nsize)
     {
-        for (size_t i = 0; i < nq0; i++)
-        {
-            TData deriv[2] = {derivptr[0][cnt], derivptr[1][cnt]};
-            for (size_t d = 0; d < ncoord; d++)
-            {
-                derivptr[d][cnt] = diffCoeff[d * ncoord + 0] * deriv[0] +
-                                   diffCoeff[d * ncoord + 1] * deriv[1];
-            }
-            cnt++;
-        }
+        return;
     }
+
+    TData deriv[2] = {deriv0[i], deriv1[i]};
+
+    deriv0[i] = s_diffCoeff[0] * deriv[0] + s_diffCoeff[1] * deriv[1];
+    deriv1[i] = s_diffCoeff[2] * deriv[0] + s_diffCoeff[3] * deriv[1];
 }
 
 template <typename TData>
-__global__ void DiffusionCoeff3DKernel(const size_t nq0, const size_t nq1,
-                                       const size_t nq2, const size_t nelmt,
-                                       TData *diffCoeff, TData *deriv0,
-                                       TData *deriv1, TData *deriv2)
+__global__ void DiffusionCoeff3DKernel(const size_t nsize, TData *diffCoeff,
+                                       TData *deriv0, TData *deriv1,
+                                       TData *deriv2)
 {
-    size_t e = blockDim.x * blockIdx.x + threadIdx.x;
+    __shared__ TData s_diffCoeff[9];
 
-    if (e >= nelmt)
+    size_t ind = threadIdx.x;
+    if (ind < 9)
     {
-        return;
+        s_diffCoeff[ind] = diffCoeff[ind];
    }
 
-    constexpr size_t ncoord = 3;
+    __syncthreads();
 
-    // Assign pointers.
-    TData **derivptr = new TData *[ncoord];
-    derivptr[0] = deriv0 + nq0 * nq1 * nq2 * e;
-    derivptr[1] = deriv1 + nq0 * nq1 * nq2 * e;
-    derivptr[2] = deriv2 + nq0 * nq1 * nq2 * e;
+    size_t i = blockDim.x * blockIdx.x + threadIdx.x;
 
-    for (size_t k = 0, cnt = 0; k < nq2; k++)
+    if (i >= nsize)
     {
-        for (size_t j = 0; j < nq1; j++)
-        {
-            for (size_t i = 0; i < nq0; i++)
-            {
-                TData deriv[3] = {derivptr[0][cnt], derivptr[1][cnt],
-                                  derivptr[2][cnt]};
-                for (size_t d = 0; d < ncoord; d++)
-                {
-                    derivptr[d][cnt] = diffCoeff[d * ncoord + 0] * deriv[0] +
-                                       diffCoeff[d * ncoord + 1] * deriv[1] +
-                                       diffCoeff[d * ncoord + 2] * deriv[2];
-                }
-                cnt++;
-            }
-        }
+        return;
     }
+
+    TData deriv[3] = {deriv0[i], deriv1[i], deriv2[i]};
+
+    deriv0[i] = s_diffCoeff[0] * deriv[0] + s_diffCoeff[1] * deriv[1] +
+                s_diffCoeff[2] * deriv[2];
+    deriv1[i] = s_diffCoeff[3] * deriv[0] + s_diffCoeff[4] * deriv[1] +
+                s_diffCoeff[5] * deriv[2];
+    deriv2[i] = s_diffCoeff[6] * deriv[0] + s_diffCoeff[7] * deriv[1] +
+                s_diffCoeff[8] * deriv[2];
 }
+
 } // namespace Nektar::Operators::detail
diff --git a/Operators/Helmholtz/HelmholtzStdMat.hpp b/Operators/Helmholtz/HelmholtzStdMat.hpp
index 0ae8eaf500b4a87ee93ad618d325ab60ca5d8427..11b0961926967275c76ff536da432262cfa1dbe2 100644
--- a/Operators/Helmholtz/HelmholtzStdMat.hpp
+++ b/Operators/Helmholtz/HelmholtzStdMat.hpp
@@ -1,3 +1,5 @@
+#pragma once
+
 #include "Operators/BwdTrans/BwdTransStdMat.hpp"
 #include "Operators/IProductWRTBase/IProductWRTBaseStdMat.hpp"
 #include "Operators/IProductWRTDerivBase/IProductWRTDerivBaseStdMat.hpp"
diff --git a/Operators/IProductWRTDerivBase/IProductWRTDerivBaseCUDA.hpp b/Operators/IProductWRTDerivBase/IProductWRTDerivBaseCUDA.hpp
index a3b3b3ac1e4a3693d4018dd8159c1da74baf6572..7b34b6edd3a142c35561ccc0035d886e83589a54 100644
--- a/Operators/IProductWRTDerivBase/IProductWRTDerivBaseCUDA.hpp
+++ b/Operators/IProductWRTDerivBase/IProductWRTDerivBaseCUDA.hpp
@@ -94,6 +94,8 @@ public:
 
     ~OperatorIProductWRTDerivBaseImpl(void)
     {
+        size_t nCoord = this->m_expansionList->GetCoordim(0);
+
         DeallocateDataCUDA<TData>(m_basis);
         DeallocateDataCUDA<TData>(m_dbasis);
         DeallocateDataCUDA<TData>(m_weight);
@@ -101,6 +103,15 @@ public:
         DeallocateDataCUDA<TData>(m_D);
         cudaFree(m_jac);
         cudaFree(m_derivFac);
+        cudaFree(m_wsp0);
+        if (nCoord > 1)
+        {
+            cudaFree(m_wsp1);
+        }
+        if (nCoord > 2)
+        {
+            cudaFree(m_wsp2);
+        }
     }
 
     void apply(Field<TData, FieldState::Phys> &in,
diff --git a/Operators/Matrix/MatrixCUDA.hpp b/Operators/Matrix/MatrixCUDA.hpp
index 40910be81397e9940b1648fc4f59ffbda1eadeb0..3ab7e5987197a1212ad6525833433a3d27a3be84 100644
--- a/Operators/Matrix/MatrixCUDA.hpp
+++ b/Operators/Matrix/MatrixCUDA.hpp
@@ -50,7 +50,7 @@ public:
         std::vector<TData> matrix_print(m_size * m_size);
         // Copy device memory to host memory for printing
         cudaMemcpy(matrix_print.data(), m_matrix,
-                   m_size * m_size * sizeof(float), cudaMemcpyDeviceToHost);
+                   m_size * m_size * sizeof(TData), cudaMemcpyDeviceToHost);
         auto pMat = matrix_print.cbegin();
         std::string str;
 
diff --git a/Operators/PhysDeriv/PhysDerivCUDAKernels.cuh b/Operators/PhysDeriv/PhysDerivCUDAKernels.cuh
index ec110caf848217fb51880e428a4a7ea30f4d5694..28482cf005179ea68ff701cc12bd769cc3515090 100644
--- a/Operators/PhysDeriv/PhysDerivCUDAKernels.cuh
+++ b/Operators/PhysDeriv/PhysDerivCUDAKernels.cuh
@@ -43,6 +43,9 @@ __global__ void PhysDerivSegKernel(const size_t nq0, const size_t ncoord,
             inoutptr[d - 1][j] = inoutptr[0][j] * dfptr[d - 1][dfindex];
         }
     }
+
+    delete[] inoutptr;
+    delete[] dfptr;
 }
 
 template <typename TData, bool DEFORMED>
@@ -92,6 +95,9 @@ __global__ void PhysDerivQuadKernel(const size_t nq0, const size_t nq1,
             }
         }
     }
+
+    delete[] inoutptr;
+    delete[] dfptr;
 }
 
 template <typename TData, bool DEFORMED>
@@ -149,6 +155,9 @@ __global__ void PhysDerivTriKernel(const size_t nq0, const size_t nq1,
             }
         }
     }
+
+    delete[] inoutptr;
+    delete[] dfptr;
 }
 
 template <typename TData, bool DEFORMED>
@@ -204,6 +213,9 @@ __global__ void PhysDerivHexKernel(const size_t nq0, const size_t nq1,
             }
         }
     }
+
+    delete[] inoutptr;
+    delete[] dfptr;
 }
 
 template <typename TData, bool DEFORMED>
@@ -307,6 +319,11 @@ __global__ void PhysDerivTetKernel(const size_t nq0, const size_t nq1,
             }
         }
     }
+
+    delete[] wsp0;
+    delete[] wsp1;
+    delete[] inoutptr;
+    delete[] dfptr;
 }
 
 template <typename TData, bool DEFORMED>
@@ -368,6 +385,9 @@ __global__ void PhysDerivPrismKernel(
             }
         }
     }
+
+    delete[] inoutptr;
+    delete[] dfptr;
 }
 
 template <typename TData, bool DEFORMED>
@@ -433,6 +453,9 @@ __global__ void PhysDerivPyrKernel(const size_t nq0, const size_t nq1,
             }
         }
    }
+
+    delete[] inoutptr;
+    delete[] dfptr;
 }
 
 template <typename TData>
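Note on the diffusion-coefficient rework above: the kernels now use one thread per quadrature point across all elements of a given type, with the small coefficient matrix staged in shared memory, and apply() in HelmholtzCUDA.hpp sizes the grid over nqTot * nElmts points and advances derivptr0/1/2 by that amount after each launch. The sketch below shows that launch pattern in isolation. The 2D kernel body is copied from the patched HelmholtzCUDAKernels.cuh; the launchDiffusionCoeff2D helper, the block size of 256, and the sizes and values in main are illustrative assumptions, not code from this patch.

// Illustrative sketch only -- not part of the patch. The kernel body is the
// patched DiffusionCoeff2DKernel; the launcher and main() are assumptions.
#include <cstdio>
#include <vector>

#include <cuda_runtime.h>

template <typename TData>
__global__ void DiffusionCoeff2DKernel(const size_t nsize,
                                       const TData *diffCoeff, TData *deriv0,
                                       TData *deriv1)
{
    // Stage the 2x2 diffusion matrix in shared memory once per block.
    __shared__ TData s_diffCoeff[4];

    size_t ind = threadIdx.x;
    if (ind < 4)
    {
        s_diffCoeff[ind] = diffCoeff[ind];
    }

    __syncthreads();

    // One thread per quadrature point over all elements.
    size_t i = blockDim.x * blockIdx.x + threadIdx.x;

    if (i >= nsize)
    {
        return;
    }

    TData deriv[2] = {deriv0[i], deriv1[i]};

    deriv0[i] = s_diffCoeff[0] * deriv[0] + s_diffCoeff[1] * deriv[1];
    deriv1[i] = s_diffCoeff[2] * deriv[0] + s_diffCoeff[3] * deriv[1];
}

// Hypothetical launcher mirroring the grid-size computation in apply():
// cover nqTot * nElmts points, rounded up to whole blocks. blockSize must be
// at least 4 so the shared-memory load covers the coefficient matrix.
template <typename TData>
void launchDiffusionCoeff2D(size_t nqTot, size_t nElmts, const TData *diffCoeff,
                            TData *deriv0, TData *deriv1,
                            size_t blockSize = 256)
{
    size_t nsize    = nqTot * nElmts;
    size_t gridSize = nsize / blockSize + ((nsize % blockSize == 0) ? 0 : 1);
    DiffusionCoeff2DKernel<<<static_cast<unsigned int>(gridSize),
                             static_cast<unsigned int>(blockSize)>>>(
        nsize, diffCoeff, deriv0, deriv1);
}

int main()
{
    // Small assumed problem: 8 quadrature points per element, 3 elements.
    const size_t nqTot = 8, nElmts = 3, n = nqTot * nElmts;
    std::vector<double> h0(n, 1.0), h1(n, 2.0);
    const double hCoeff[4] = {2.0, 0.0, 0.0, 0.5}; // row-major 2x2 matrix

    double *d0, *d1, *dCoeff;
    cudaMalloc(&d0, n * sizeof(double));
    cudaMalloc(&d1, n * sizeof(double));
    cudaMalloc(&dCoeff, 4 * sizeof(double));
    cudaMemcpy(d0, h0.data(), n * sizeof(double), cudaMemcpyHostToDevice);
    cudaMemcpy(d1, h1.data(), n * sizeof(double), cudaMemcpyHostToDevice);
    cudaMemcpy(dCoeff, hCoeff, 4 * sizeof(double), cudaMemcpyHostToDevice);

    launchDiffusionCoeff2D(nqTot, nElmts, dCoeff, d0, d1);
    cudaDeviceSynchronize();

    cudaMemcpy(h0.data(), d0, n * sizeof(double), cudaMemcpyDeviceToHost);
    cudaMemcpy(h1.data(), d1, n * sizeof(double), cudaMemcpyDeviceToHost);
    std::printf("deriv0[0] = %f, deriv1[0] = %f\n", h0[0], h1[0]); // 2.0, 1.0

    cudaFree(d0);
    cudaFree(d1);
    cudaFree(dCoeff);
    return 0;
}

The 1D and 3D kernels in the patch follow the same pattern with 1 and 9 coefficient entries respectively.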