Commit 0d9525a6 authored by Chris Cantwell

Merge branch 'fix-cuda-memory-leak' into 'master'

Fix various CUDA memory leaks

See merge request !67
parents 5a7510d2 73afd8a2
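
The headline fixes in this commit add release calls for device memory that was allocated in operator constructors (via cudaMalloc) or inside kernels (via device-side new[]) but never freed. A minimal sketch of that pattern and its fix, with illustrative names only (DeviceArray and m_ptr are not taken from the diff):

#include <cuda_runtime.h>
#include <cstddef>

// Sketch only: an owning device buffer. The leak being fixed is the absence
// of the destructor below; class and member names are illustrative.
template <typename TData> class DeviceArray
{
public:
    explicit DeviceArray(std::size_t n)
    {
        cudaMalloc(reinterpret_cast<void **>(&m_ptr), n * sizeof(TData));
    }

    // Non-copyable so the buffer cannot be freed twice.
    DeviceArray(const DeviceArray &) = delete;
    DeviceArray &operator=(const DeviceArray &) = delete;

    // The fix applied in this commit's operator classes: release the
    // allocation when the owning object goes away.
    ~DeviceArray()
    {
        cudaFree(m_ptr);
    }

    TData *get() const
    {
        return m_ptr;
    }

private:
    TData *m_ptr = nullptr;
};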
@@ -7,4 +7,4 @@ std::string OperatorBwdTransImpl<double, ImplCUDA>::className =
     GetOperatorFactory<double>().RegisterCreatorFunction(
         "BwdTransCUDA", OperatorBwdTransImpl<double, ImplCUDA>::instantiate,
         "...");
-}
+} // namespace Nektar::Operators::detail

@@ -21,8 +21,9 @@ __global__ void DirBndCondKernel(const size_t nsize, const int *offsetptr,
     if (bctypeptr[i] == eDirichlet)
     {
-        int offset = offsetptr[i];
-        for (size_t j = 0; j < ncoeffptr[i]; j++)
+        size_t offset = offsetptr[i];
+        size_t ncoeff = ncoeffptr[i];
+        for (size_t j = 0; j < ncoeff; j++)
         {
             outptr[mapptr[offset + j]] = inptr[offset + j];
         }
@@ -45,8 +46,9 @@ __global__ void DirBndCondKernel(const size_t nsize, const int *offsetptr,
     if (bctypeptr[i] == eDirichlet)
     {
-        int offset = offsetptr[i];
-        for (size_t j = 0; j < ncoeffptr[i]; j++)
+        size_t offset = offsetptr[i];
+        size_t ncoeff = ncoeffptr[i];
+        for (size_t j = 0; j < ncoeff; j++)
         {
             outptr[mapptr[offset + j]] =
                 signptr[offset + j] * inptr[offset + j];
         }
...
+#pragma once
+
 #include "Operators/BwdTrans/BwdTransCUDA.hpp"
 #include "Operators/Helmholtz/HelmholtzCUDAKernels.cuh"
 #include "Operators/IProductWRTBase/IProductWRTBaseCUDA.hpp"
@@ -42,6 +44,11 @@ public:
                    sizeof(TData) * nCoord * nCoord, cudaMemcpyHostToDevice);
     }

+    ~OperatorHelmholtzImpl(void)
+    {
+        cudaFree(m_diffCoeff);
+    }
+
     void apply(Field<TData, FieldState::Coeff> &in,
                Field<TData, FieldState::Coeff> &out) override
     {
@@ -68,7 +75,6 @@ public:
             deriv.template GetStorage<MemoryRegionCUDA>().GetGPUPtr();
         auto *derivptr1 = derivptr0 + deriv.GetFieldSize();
         auto *derivptr2 = derivptr1 + deriv.GetFieldSize();
-        std::vector<TData *> derivptr{derivptr0, derivptr1, derivptr2};

         // Initialize index.
         size_t expIdx = 0;
@@ -83,38 +89,33 @@ public:
             auto nqTot = expPtr->GetTotPoints();

             // Determine CUDA grid parameters.
-            m_gridSize = nElmts / m_blockSize;
-            m_gridSize += (nElmts % m_blockSize == 0) ? 0 : 1;
+            m_gridSize = (nElmts * nqTot) / m_blockSize;
+            m_gridSize += ((nElmts * nqTot) % m_blockSize == 0) ? 0 : 1;

             // Multiply by diffusion coefficient.
             if (nCoord == 1)
             {
-                auto nq0 = expPtr->GetNumPoints(0);
                 DiffusionCoeff1DKernel<<<m_gridSize, m_blockSize>>>(
-                    nq0, nElmts, m_diffCoeff, derivptr[0]);
+                    nqTot * nElmts, m_diffCoeff, derivptr0);
+                derivptr0 += nqTot * nElmts;
             }
             else if (nCoord == 2)
             {
-                auto nq0 = expPtr->GetNumPoints(0);
-                auto nq1 = expPtr->GetNumPoints(1);
                 DiffusionCoeff2DKernel<<<m_gridSize, m_blockSize>>>(
-                    nq0, nq1, nElmts, m_diffCoeff, derivptr[0], derivptr[1]);
+                    nqTot * nElmts, m_diffCoeff, derivptr0, derivptr1);
+                derivptr0 += nqTot * nElmts;
+                derivptr1 += nqTot * nElmts;
             }
             else
             {
-                auto nq0 = expPtr->GetNumPoints(0);
-                auto nq1 = expPtr->GetNumPoints(1);
-                auto nq2 = expPtr->GetNumPoints(2);
                 DiffusionCoeff3DKernel<<<m_gridSize, m_blockSize>>>(
-                    nq0, nq1, nq2, nElmts, m_diffCoeff, derivptr[0],
-                    derivptr[1], derivptr[2]);
+                    nqTot * nElmts, m_diffCoeff, derivptr0, derivptr1,
+                    derivptr2);
+                derivptr0 += nqTot * nElmts;
+                derivptr1 += nqTot * nElmts;
+                derivptr2 += nqTot * nElmts;
             }

-            // Increment pointer and index for next element type.
-            for (size_t d = 0; d < nCoord; d++)
-            {
-                derivptr[d] += nqTot * nElmts;
-            }
             expIdx += nElmts;
         }
     }
...
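
The apply() hunk above switches the launch from one thread per element to one thread per quadrature point, so the work count becomes nqTot * nElmts and the grid size is the usual ceiling division. A short sketch of that calculation (the helper name is an assumption, not part of the code):

#include <cstddef>

// Sketch: ceiling-division launch configuration for a kernel with one thread
// per quadrature point. Helper name is illustrative.
inline std::size_t GridSizeFor(std::size_t nwork, std::size_t blockSize)
{
    // Equivalent to the two-step form used in apply():
    //   grid  = nwork / blockSize;
    //   grid += (nwork % blockSize == 0) ? 0 : 1;
    return (nwork + blockSize - 1) / blockSize;
}

// e.g. with nqTot quadrature points per element and nElmts elements:
//   m_gridSize = GridSizeFor(nqTot * nElmts, m_blockSize);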
 namespace Nektar::Operators::detail
 {

 template <typename TData>
-__global__ void DiffusionCoeff1DKernel(const size_t nq0, const size_t nelmt,
+__global__ void DiffusionCoeff1DKernel(const size_t nsize,
                                        const TData *diffCoeff, TData *deriv0)
 {
-    size_t e = blockDim.x * blockIdx.x + threadIdx.x;
-    if (e >= nelmt)
+    size_t i = blockDim.x * blockIdx.x + threadIdx.x;
+    if (i >= nsize)
     {
         return;
     }
-    // Assign pointers.
-    TData *derivptr = deriv0 + nq0 * e;
-    for (size_t i = 0; i < nq0; i++)
-    {
-        derivptr[i] *= diffCoeff[0];
-    }
+    deriv0[i] *= diffCoeff[0];
 }

 template <typename TData>
-__global__ void DiffusionCoeff2DKernel(const size_t nq0, const size_t nq1,
-                                       const size_t nelmt,
+__global__ void DiffusionCoeff2DKernel(const size_t nsize,
                                        const TData *diffCoeff, TData *deriv0,
                                        TData *deriv1)
 {
-    size_t e = blockDim.x * blockIdx.x + threadIdx.x;
-    if (e >= nelmt)
+    __shared__ TData s_diffCoeff[4];
+    size_t ind = threadIdx.x;
+    if (ind < 4)
     {
-        return;
+        s_diffCoeff[ind] = diffCoeff[ind];
     }
-    constexpr size_t ncoord = 2;
-    // Assign pointers.
-    TData **derivptr = new TData *[ncoord];
-    derivptr[0] = deriv0 + nq0 * nq1 * e;
-    derivptr[1] = deriv1 + nq0 * nq1 * e;
-    for (size_t j = 0, cnt = 0; j < nq1; j++)
+    __syncthreads();
+
+    size_t i = blockDim.x * blockIdx.x + threadIdx.x;
+    if (i >= nsize)
     {
-        for (size_t i = 0; i < nq0; i++)
-        {
-            TData deriv[2] = {derivptr[0][cnt], derivptr[1][cnt]};
-            for (size_t d = 0; d < ncoord; d++)
-            {
-                derivptr[d][cnt] = diffCoeff[d * ncoord + 0] * deriv[0] +
-                                   diffCoeff[d * ncoord + 1] * deriv[1];
-            }
-            cnt++;
-        }
+        return;
     }
+
+    TData deriv[2] = {deriv0[i], deriv1[i]};
+    deriv0[i] = s_diffCoeff[0] * deriv[0] + s_diffCoeff[1] * deriv[1];
+    deriv1[i] = s_diffCoeff[2] * deriv[0] + s_diffCoeff[3] * deriv[1];
 }

 template <typename TData>
-__global__ void DiffusionCoeff3DKernel(const size_t nq0, const size_t nq1,
-                                       const size_t nq2, const size_t nelmt,
-                                       TData *diffCoeff, TData *deriv0,
-                                       TData *deriv1, TData *deriv2)
+__global__ void DiffusionCoeff3DKernel(const size_t nsize, TData *diffCoeff,
+                                       TData *deriv0, TData *deriv1,
+                                       TData *deriv2)
 {
-    size_t e = blockDim.x * blockIdx.x + threadIdx.x;
-    if (e >= nelmt)
+    __shared__ TData s_diffCoeff[9];
+    size_t ind = threadIdx.x;
+    if (ind < 9)
     {
-        return;
+        s_diffCoeff[ind] = diffCoeff[ind];
     }
-    constexpr size_t ncoord = 3;
-    // Assign pointers.
-    TData **derivptr = new TData *[ncoord];
-    derivptr[0] = deriv0 + nq0 * nq1 * nq2 * e;
-    derivptr[1] = deriv1 + nq0 * nq1 * nq2 * e;
-    derivptr[2] = deriv2 + nq0 * nq1 * nq2 * e;
-    for (size_t k = 0, cnt = 0; k < nq2; k++)
+    __syncthreads();
+
+    size_t i = blockDim.x * blockIdx.x + threadIdx.x;
+    if (i >= nsize)
     {
-        for (size_t j = 0; j < nq1; j++)
-        {
-            for (size_t i = 0; i < nq0; i++)
-            {
-                TData deriv[3] = {derivptr[0][cnt], derivptr[1][cnt],
-                                  derivptr[2][cnt]};
-                for (size_t d = 0; d < ncoord; d++)
-                {
-                    derivptr[d][cnt] = diffCoeff[d * ncoord + 0] * deriv[0] +
-                                       diffCoeff[d * ncoord + 1] * deriv[1] +
-                                       diffCoeff[d * ncoord + 2] * deriv[2];
-                }
-                cnt++;
-            }
-        }
+        return;
     }
+
+    TData deriv[3] = {deriv0[i], deriv1[i], deriv2[i]};
+    deriv0[i] = s_diffCoeff[0] * deriv[0] + s_diffCoeff[1] * deriv[1] +
+                s_diffCoeff[2] * deriv[2];
+    deriv1[i] = s_diffCoeff[3] * deriv[0] + s_diffCoeff[4] * deriv[1] +
+                s_diffCoeff[5] * deriv[2];
+    deriv2[i] = s_diffCoeff[6] * deriv[0] + s_diffCoeff[7] * deriv[1] +
+                s_diffCoeff[8] * deriv[2];
 }

 } // namespace Nektar::Operators::detail
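
The rewritten kernels above load the small diffusion-coefficient matrix into shared memory once per block and then apply it point-wise, instead of looping over quadrature points per thread and allocating pointer arrays with device-side new. A self-contained sketch of the same staging pattern for a 2x2 matrix (kernel name is illustrative; like the kernels above, it assumes blockDim.x >= 4):

#include <cstddef>

// Sketch: stage a small coefficient matrix in shared memory, then apply it
// element-wise. Mirrors DiffusionCoeff2DKernel above; names are illustrative.
template <typename TData>
__global__ void ApplyMatrix2DKernel(const size_t nsize, const TData *coeff,
                                    TData *v0, TData *v1)
{
    __shared__ TData s_coeff[4];
    if (threadIdx.x < 4)
    {
        s_coeff[threadIdx.x] = coeff[threadIdx.x]; // one global read per block
    }
    __syncthreads();

    size_t i = blockDim.x * blockIdx.x + threadIdx.x;
    if (i >= nsize)
    {
        return;
    }

    TData a = v0[i];
    TData b = v1[i];
    v0[i] = s_coeff[0] * a + s_coeff[1] * b;
    v1[i] = s_coeff[2] * a + s_coeff[3] * b;
}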
+#pragma once
+
 #include "Operators/BwdTrans/BwdTransStdMat.hpp"
 #include "Operators/IProductWRTBase/IProductWRTBaseStdMat.hpp"
 #include "Operators/IProductWRTDerivBase/IProductWRTDerivBaseStdMat.hpp"
...
@@ -94,6 +94,8 @@ public:
     ~OperatorIProductWRTDerivBaseImpl(void)
     {
+        size_t nCoord = this->m_expansionList->GetCoordim(0);
+
         DeallocateDataCUDA<TData>(m_basis);
         DeallocateDataCUDA<TData>(m_dbasis);
         DeallocateDataCUDA<TData>(m_weight);
@@ -101,6 +103,15 @@ public:
         DeallocateDataCUDA<TData>(m_D);
         cudaFree(m_jac);
         cudaFree(m_derivFac);
+        cudaFree(m_wsp0);
+        if (nCoord > 1)
+        {
+            cudaFree(m_wsp1);
+        }
+        if (nCoord > 2)
+        {
+            cudaFree(m_wsp2);
+        }
     }

     void apply(Field<TData, FieldState::Phys> &in,
...
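
The destructor hunk above frees the workspace buffers by hand, guarded by the coordinate dimension. An alternative that removes this class of leak by construction is an owning pointer with a cudaFree deleter; this is a hedged design sketch, not what the merge request does:

#include <cuda_runtime.h>
#include <cstddef>
#include <memory>

// Deleter that returns a device allocation; cudaFree on a null pointer is a
// documented no-op, so unset members are safe.
struct CudaFreeDeleter
{
    void operator()(void *p) const
    {
        cudaFree(p);
    }
};

template <typename TData>
using DevicePtr = std::unique_ptr<TData, CudaFreeDeleter>;

// Illustrative allocation helper, not from the codebase.
template <typename TData> DevicePtr<TData> MakeDeviceArray(std::size_t n)
{
    TData *raw = nullptr;
    cudaMalloc(reinterpret_cast<void **>(&raw), n * sizeof(TData));
    return DevicePtr<TData>(raw);
}

// Members declared as DevicePtr<TData> m_wsp0, m_wsp1, m_wsp2; would then be
// released automatically, with no per-dimension bookkeeping in the destructor.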
@@ -50,7 +50,7 @@ public:
         std::vector<TData> matrix_print(m_size * m_size);
         // Copy device memory to host memory for printing
         cudaMemcpy(matrix_print.data(), m_matrix,
-                   m_size * m_size * sizeof(float), cudaMemcpyDeviceToHost);
+                   m_size * m_size * sizeof(TData), cudaMemcpyDeviceToHost);
         auto pMat = matrix_print.cbegin();
         std::string str;
...
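
The fix above sizes the copy from the template parameter instead of a hard-coded float, so a double-precision instantiation copies the full matrix rather than half of its bytes. A small sketch of the same idiom (the helper is hypothetical):

#include <cuda_runtime.h>
#include <cstddef>
#include <vector>

// Sketch: let the byte count track the element type. With a hard-coded
// sizeof(float), a TData = double instantiation would copy only half the
// matrix. Helper name is illustrative.
template <typename TData>
std::vector<TData> CopyMatrixToHost(const TData *devMatrix, std::size_t size)
{
    std::vector<TData> host(size * size);
    cudaMemcpy(host.data(), devMatrix, size * size * sizeof(TData),
               cudaMemcpyDeviceToHost);
    return host;
}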
@@ -43,6 +43,9 @@ __global__ void PhysDerivSegKernel(const size_t nq0, const size_t ncoord,
             inoutptr[d - 1][j] = inoutptr[0][j] * dfptr[d - 1][dfindex];
         }
     }
+
+    delete[] inoutptr;
+    delete[] dfptr;
 }

 template <typename TData, bool DEFORMED>
@@ -92,6 +95,9 @@ __global__ void PhysDerivQuadKernel(const size_t nq0, const size_t nq1,
             }
         }
     }
+
+    delete[] inoutptr;
+    delete[] dfptr;
 }

 template <typename TData, bool DEFORMED>
@@ -149,6 +155,9 @@ __global__ void PhysDerivTriKernel(const size_t nq0, const size_t nq1,
             }
         }
     }
+
+    delete[] inoutptr;
+    delete[] dfptr;
 }

 template <typename TData, bool DEFORMED>
@@ -204,6 +213,9 @@ __global__ void PhysDerivHexKernel(const size_t nq0, const size_t nq1,
             }
         }
     }
+
+    delete[] inoutptr;
+    delete[] dfptr;
 }

 template <typename TData, bool DEFORMED>
@@ -307,6 +319,11 @@ __global__ void PhysDerivTetKernel(const size_t nq0, const size_t nq1,
             }
         }
     }
+
+    delete[] wsp0;
+    delete[] wsp1;
+    delete[] inoutptr;
+    delete[] dfptr;
 }

 template <typename TData, bool DEFORMED>
@@ -368,6 +385,9 @@ __global__ void PhysDerivPrismKernel(
             }
         }
     }
+
+    delete[] inoutptr;
+    delete[] dfptr;
 }

 template <typename TData, bool DEFORMED>
@@ -433,6 +453,9 @@ __global__ void PhysDerivPyrKernel(const size_t nq0, const size_t nq1,
             }
         }
     }
+
+    delete[] inoutptr;
+    delete[] dfptr;
 }

 template <typename TData>
...
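
Each PhysDeriv kernel builds small per-thread pointer tables with device-side new[]; the hunks above add the matching delete[] calls before the kernels return, so per-launch allocations are returned to the device heap. A minimal sketch of the pairing (kernel and names are illustrative; a fixed-size local array would avoid the heap allocation entirely when the dimension is bounded at compile time):

#include <cstddef>

// Sketch: every device-side new[] needs a matching delete[] before the thread
// exits, otherwise the allocation stays on the device heap after the launch.
// Names are illustrative.
template <typename TData>
__global__ void ScaleRowsKernel(const size_t nq, const size_t ncoord,
                                const size_t nelmt, const TData *df,
                                TData *inout)
{
    size_t e = blockDim.x * blockIdx.x + threadIdx.x;
    if (e >= nelmt)
    {
        return;
    }

    // Per-thread pointer table on the device heap, as in the kernels above.
    TData **rowptr = new TData *[ncoord];
    for (size_t d = 0; d < ncoord; d++)
    {
        rowptr[d] = inout + (e * ncoord + d) * nq;
    }

    for (size_t d = 0; d < ncoord; d++)
    {
        for (size_t j = 0; j < nq; j++)
        {
            rowptr[d][j] *= df[d];
        }
    }

    delete[] rowptr; // matching release; without it the device heap leaks
}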