From 73afd8a2793ac680e89018726c4968046118e62d Mon Sep 17 00:00:00 2001 From: Jacques Xing <jacques.xing@kcl.ac.uk> Date: Thu, 18 Jan 2024 22:51:09 +0000 Subject: [PATCH] Fix various CUDA memory leaks --- Operators/BwdTrans/BwdTransCUDA.cu | 2 +- .../DirBndCond/DirBndCondCUDAKernels.cuh | 10 +- Operators/Helmholtz/HelmholtzCUDA.hpp | 37 +++---- Operators/Helmholtz/HelmholtzCUDAKernels.cuh | 100 +++++++----------- Operators/Helmholtz/HelmholtzStdMat.hpp | 2 + .../IProductWRTDerivBaseCUDA.hpp | 11 ++ Operators/Matrix/MatrixCUDA.hpp | 2 +- Operators/PhysDeriv/PhysDerivCUDAKernels.cuh | 23 ++++ 8 files changed, 103 insertions(+), 84 deletions(-) diff --git a/Operators/BwdTrans/BwdTransCUDA.cu b/Operators/BwdTrans/BwdTransCUDA.cu index 4e4e580a..2f9ac718 100644 --- a/Operators/BwdTrans/BwdTransCUDA.cu +++ b/Operators/BwdTrans/BwdTransCUDA.cu @@ -7,4 +7,4 @@ std::string OperatorBwdTransImpl<double, ImplCUDA>::className = GetOperatorFactory<double>().RegisterCreatorFunction( "BwdTransCUDA", OperatorBwdTransImpl<double, ImplCUDA>::instantiate, "..."); -} +} // namespace Nektar::Operators::detail diff --git a/Operators/DirBndCond/DirBndCondCUDAKernels.cuh b/Operators/DirBndCond/DirBndCondCUDAKernels.cuh index f37b523a..879444cb 100644 --- a/Operators/DirBndCond/DirBndCondCUDAKernels.cuh +++ b/Operators/DirBndCond/DirBndCondCUDAKernels.cuh @@ -21,8 +21,9 @@ __global__ void DirBndCondKernel(const size_t nsize, const int *offsetptr, if (bctypeptr[i] == eDirichlet) { - int offset = offsetptr[i]; - for (size_t j = 0; j < ncoeffptr[i]; j++) + size_t offset = offsetptr[i]; + size_t ncoeff = ncoeffptr[i]; + for (size_t j = 0; j < ncoeff; j++) { outptr[mapptr[offset + j]] = inptr[offset + j]; } @@ -45,8 +46,9 @@ __global__ void DirBndCondKernel(const size_t nsize, const int *offsetptr, if (bctypeptr[i] == eDirichlet) { - int offset = offsetptr[i]; - for (size_t j = 0; j < ncoeffptr[i]; j++) + size_t offset = offsetptr[i]; + size_t ncoeff = ncoeffptr[i]; + for (size_t j = 0; j < ncoeff; j++) { outptr[mapptr[offset + j]] = signptr[offset + j] * inptr[offset + j]; diff --git a/Operators/Helmholtz/HelmholtzCUDA.hpp b/Operators/Helmholtz/HelmholtzCUDA.hpp index 3dae3c4b..17a293ec 100644 --- a/Operators/Helmholtz/HelmholtzCUDA.hpp +++ b/Operators/Helmholtz/HelmholtzCUDA.hpp @@ -1,3 +1,5 @@ +#pragma once + #include "Operators/BwdTrans/BwdTransCUDA.hpp" #include "Operators/Helmholtz/HelmholtzCUDAKernels.cuh" #include "Operators/IProductWRTBase/IProductWRTBaseCUDA.hpp" @@ -42,6 +44,11 @@ public: sizeof(TData) * nCoord * nCoord, cudaMemcpyHostToDevice); } + ~OperatorHelmholtzImpl(void) + { + cudaFree(m_diffCoeff); + } + void apply(Field<TData, FieldState::Coeff> &in, Field<TData, FieldState::Coeff> &out) override { @@ -68,7 +75,6 @@ public: deriv.template GetStorage<MemoryRegionCUDA>().GetGPUPtr(); auto *derivptr1 = derivptr0 + deriv.GetFieldSize(); auto *derivptr2 = derivptr1 + deriv.GetFieldSize(); - std::vector<TData *> derivptr{derivptr0, derivptr1, derivptr2}; // Initialize index. size_t expIdx = 0; @@ -83,38 +89,33 @@ public: auto nqTot = expPtr->GetTotPoints(); // Determine CUDA grid parameters. - m_gridSize = nElmts / m_blockSize; - m_gridSize += (nElmts % m_blockSize == 0) ? 0 : 1; + m_gridSize = (nElmts * nqTot) / m_blockSize; + m_gridSize += ((nElmts * nqTot) % m_blockSize == 0) ? 0 : 1; // Multiply by diffusion coefficient. if (nCoord == 1) { - auto nq0 = expPtr->GetNumPoints(0); DiffusionCoeff1DKernel<<<m_gridSize, m_blockSize>>>( - nq0, nElmts, m_diffCoeff, derivptr[0]); + nqTot * nElmts, m_diffCoeff, derivptr0); + derivptr0 += nqTot * nElmts; } else if (nCoord == 2) { - auto nq0 = expPtr->GetNumPoints(0); - auto nq1 = expPtr->GetNumPoints(1); DiffusionCoeff2DKernel<<<m_gridSize, m_blockSize>>>( - nq0, nq1, nElmts, m_diffCoeff, derivptr[0], derivptr[1]); + nqTot * nElmts, m_diffCoeff, derivptr0, derivptr1); + derivptr0 += nqTot * nElmts; + derivptr1 += nqTot * nElmts; } else { - auto nq0 = expPtr->GetNumPoints(0); - auto nq1 = expPtr->GetNumPoints(1); - auto nq2 = expPtr->GetNumPoints(2); DiffusionCoeff3DKernel<<<m_gridSize, m_blockSize>>>( - nq0, nq1, nq2, nElmts, m_diffCoeff, derivptr[0], - derivptr[1], derivptr[2]); + nqTot * nElmts, m_diffCoeff, derivptr0, derivptr1, + derivptr2); + derivptr0 += nqTot * nElmts; + derivptr1 += nqTot * nElmts; + derivptr2 += nqTot * nElmts; } - // Increment pointer and index for next element type. - for (size_t d = 0; d < nCoord; d++) - { - derivptr[d] += nqTot * nElmts; - } expIdx += nElmts; } } diff --git a/Operators/Helmholtz/HelmholtzCUDAKernels.cuh b/Operators/Helmholtz/HelmholtzCUDAKernels.cuh index 61d4bb48..02f9154d 100644 --- a/Operators/Helmholtz/HelmholtzCUDAKernels.cuh +++ b/Operators/Helmholtz/HelmholtzCUDAKernels.cuh @@ -1,98 +1,78 @@ namespace Nektar::Operators::detail { + template <typename TData> -__global__ void DiffusionCoeff1DKernel(const size_t nq0, const size_t nelmt, +__global__ void DiffusionCoeff1DKernel(const size_t nsize, const TData *diffCoeff, TData *deriv0) { - size_t e = blockDim.x * blockIdx.x + threadIdx.x; + size_t i = blockDim.x * blockIdx.x + threadIdx.x; - if (e >= nelmt) + if (i >= nsize) { return; } - // Assign pointers. - TData *derivptr = deriv0 + nq0 * e; - - for (size_t i = 0; i < nq0; i++) - { - derivptr[i] *= diffCoeff[0]; - } + deriv0[i] *= diffCoeff[0]; } template <typename TData> -__global__ void DiffusionCoeff2DKernel(const size_t nq0, const size_t nq1, - const size_t nelmt, +__global__ void DiffusionCoeff2DKernel(const size_t nsize, const TData *diffCoeff, TData *deriv0, TData *deriv1) { - size_t e = blockDim.x * blockIdx.x + threadIdx.x; + __shared__ TData s_diffCoeff[4]; - if (e >= nelmt) + size_t ind = threadIdx.x; + if (ind < 4) { - return; + s_diffCoeff[ind] = diffCoeff[ind]; } - constexpr size_t ncoord = 2; + __syncthreads(); - // Assign pointers. - TData **derivptr = new TData *[ncoord]; - derivptr[0] = deriv0 + nq0 * nq1 * e; - derivptr[1] = deriv1 + nq0 * nq1 * e; + size_t i = blockDim.x * blockIdx.x + threadIdx.x; - for (size_t j = 0, cnt = 0; j < nq1; j++) + if (i >= nsize) { - for (size_t i = 0; i < nq0; i++) - { - TData deriv[2] = {derivptr[0][cnt], derivptr[1][cnt]}; - for (size_t d = 0; d < ncoord; d++) - { - derivptr[d][cnt] = diffCoeff[d * ncoord + 0] * deriv[0] + - diffCoeff[d * ncoord + 1] * deriv[1]; - } - cnt++; - } + return; } + + TData deriv[2] = {deriv0[i], deriv1[i]}; + + deriv0[i] = s_diffCoeff[0] * deriv[0] + s_diffCoeff[1] * deriv[1]; + deriv1[i] = s_diffCoeff[2] * deriv[0] + s_diffCoeff[3] * deriv[1]; } template <typename TData> -__global__ void DiffusionCoeff3DKernel(const size_t nq0, const size_t nq1, - const size_t nq2, const size_t nelmt, - TData *diffCoeff, TData *deriv0, - TData *deriv1, TData *deriv2) +__global__ void DiffusionCoeff3DKernel(const size_t nsize, TData *diffCoeff, + TData *deriv0, TData *deriv1, + TData *deriv2) { - size_t e = blockDim.x * blockIdx.x + threadIdx.x; + __shared__ TData s_diffCoeff[9]; - if (e >= nelmt) + size_t ind = threadIdx.x; + if (ind < 9) { - return; + s_diffCoeff[ind] = diffCoeff[ind]; } - constexpr size_t ncoord = 3; + __syncthreads(); - // Assign pointers. - TData **derivptr = new TData *[ncoord]; - derivptr[0] = deriv0 + nq0 * nq1 * nq2 * e; - derivptr[1] = deriv1 + nq0 * nq1 * nq2 * e; - derivptr[2] = deriv2 + nq0 * nq1 * nq2 * e; + size_t i = blockDim.x * blockIdx.x + threadIdx.x; - for (size_t k = 0, cnt = 0; k < nq2; k++) + if (i >= nsize) { - for (size_t j = 0; j < nq1; j++) - { - for (size_t i = 0; i < nq0; i++) - { - TData deriv[3] = {derivptr[0][cnt], derivptr[1][cnt], - derivptr[2][cnt]}; - for (size_t d = 0; d < ncoord; d++) - { - derivptr[d][cnt] = diffCoeff[d * ncoord + 0] * deriv[0] + - diffCoeff[d * ncoord + 1] * deriv[1] + - diffCoeff[d * ncoord + 2] * deriv[2]; - } - cnt++; - } - } + return; } + + TData deriv[3] = {deriv0[i], deriv1[i], deriv2[i]}; + + deriv0[i] = s_diffCoeff[0] * deriv[0] + s_diffCoeff[1] * deriv[1] + + s_diffCoeff[2] * deriv[2]; + deriv1[i] = s_diffCoeff[3] * deriv[0] + s_diffCoeff[4] * deriv[1] + + s_diffCoeff[5] * deriv[2]; + deriv2[i] = s_diffCoeff[6] * deriv[0] + s_diffCoeff[7] * deriv[1] + + s_diffCoeff[8] * deriv[2]; } + } // namespace Nektar::Operators::detail diff --git a/Operators/Helmholtz/HelmholtzStdMat.hpp b/Operators/Helmholtz/HelmholtzStdMat.hpp index 0ae8eaf5..11b09619 100644 --- a/Operators/Helmholtz/HelmholtzStdMat.hpp +++ b/Operators/Helmholtz/HelmholtzStdMat.hpp @@ -1,3 +1,5 @@ +#pragma once + #include "Operators/BwdTrans/BwdTransStdMat.hpp" #include "Operators/IProductWRTBase/IProductWRTBaseStdMat.hpp" #include "Operators/IProductWRTDerivBase/IProductWRTDerivBaseStdMat.hpp" diff --git a/Operators/IProductWRTDerivBase/IProductWRTDerivBaseCUDA.hpp b/Operators/IProductWRTDerivBase/IProductWRTDerivBaseCUDA.hpp index a3b3b3ac..7b34b6ed 100644 --- a/Operators/IProductWRTDerivBase/IProductWRTDerivBaseCUDA.hpp +++ b/Operators/IProductWRTDerivBase/IProductWRTDerivBaseCUDA.hpp @@ -94,6 +94,8 @@ public: ~OperatorIProductWRTDerivBaseImpl(void) { + size_t nCoord = this->m_expansionList->GetCoordim(0); + DeallocateDataCUDA<TData>(m_basis); DeallocateDataCUDA<TData>(m_dbasis); DeallocateDataCUDA<TData>(m_weight); @@ -101,6 +103,15 @@ public: DeallocateDataCUDA<TData>(m_D); cudaFree(m_jac); cudaFree(m_derivFac); + cudaFree(m_wsp0); + if (nCoord > 1) + { + cudaFree(m_wsp1); + } + if (nCoord > 2) + { + cudaFree(m_wsp2); + } } void apply(Field<TData, FieldState::Phys> &in, diff --git a/Operators/Matrix/MatrixCUDA.hpp b/Operators/Matrix/MatrixCUDA.hpp index 40910be8..3ab7e598 100644 --- a/Operators/Matrix/MatrixCUDA.hpp +++ b/Operators/Matrix/MatrixCUDA.hpp @@ -50,7 +50,7 @@ public: std::vector<TData> matrix_print(m_size * m_size); // Copy device memory to host memory for printing cudaMemcpy(matrix_print.data(), m_matrix, - m_size * m_size * sizeof(float), cudaMemcpyDeviceToHost); + m_size * m_size * sizeof(TData), cudaMemcpyDeviceToHost); auto pMat = matrix_print.cbegin(); std::string str; diff --git a/Operators/PhysDeriv/PhysDerivCUDAKernels.cuh b/Operators/PhysDeriv/PhysDerivCUDAKernels.cuh index ec110caf..28482cf0 100644 --- a/Operators/PhysDeriv/PhysDerivCUDAKernels.cuh +++ b/Operators/PhysDeriv/PhysDerivCUDAKernels.cuh @@ -43,6 +43,9 @@ __global__ void PhysDerivSegKernel(const size_t nq0, const size_t ncoord, inoutptr[d - 1][j] = inoutptr[0][j] * dfptr[d - 1][dfindex]; } } + + delete[] inoutptr; + delete[] dfptr; } template <typename TData, bool DEFORMED> @@ -92,6 +95,9 @@ __global__ void PhysDerivQuadKernel(const size_t nq0, const size_t nq1, } } } + + delete[] inoutptr; + delete[] dfptr; } template <typename TData, bool DEFORMED> @@ -149,6 +155,9 @@ __global__ void PhysDerivTriKernel(const size_t nq0, const size_t nq1, } } } + + delete[] inoutptr; + delete[] dfptr; } template <typename TData, bool DEFORMED> @@ -204,6 +213,9 @@ __global__ void PhysDerivHexKernel(const size_t nq0, const size_t nq1, } } } + + delete[] inoutptr; + delete[] dfptr; } template <typename TData, bool DEFORMED> @@ -307,6 +319,11 @@ __global__ void PhysDerivTetKernel(const size_t nq0, const size_t nq1, } } } + + delete[] wsp0; + delete[] wsp1; + delete[] inoutptr; + delete[] dfptr; } template <typename TData, bool DEFORMED> @@ -368,6 +385,9 @@ __global__ void PhysDerivPrismKernel( } } } + + delete[] inoutptr; + delete[] dfptr; } template <typename TData, bool DEFORMED> @@ -433,6 +453,9 @@ __global__ void PhysDerivPyrKernel(const size_t nq0, const size_t nq1, } } } + + delete[] inoutptr; + delete[] dfptr; } template <typename TData> -- GitLab