Skip to content
Snippets Groups Projects
Commit 29faa872 authored by Jacques Xing's avatar Jacques Xing
Browse files

Tidy-up AssmbScatr operators implementation

parent 2f360248
No related branches found
No related tags found
1 merge request: !66 "Tidy-up AssmbScatr operators implementation"
#pragma once
#include "MemoryRegionCUDA.hpp"
#include "Operators/AssmbScatr/AssmbScatrCUDAKernels.cuh"
#include "Operators/OperatorAssmbScatr.hpp"
#include "Operators/OperatorHelper.cuh"
#include <MultiRegions/AssemblyMap/AssemblyMapCG.h>
#include <MultiRegions/ContField.h>
......@@ -24,21 +25,20 @@ public:
std::dynamic_pointer_cast<ContField>(this->m_expansionList);
m_assmbMap = contfield->GetLocalToGlobalMap();
// Get the solution type
GlobalSysSolnType solnType = m_assmbMap->GetGlobalSysSolnType();
auto nloc = m_assmbMap->GetNumLocalCoeffs();
auto nglo = (solnType == eIterativeFull)
? m_assmbMap->GetNumGlobalCoeffs()
: m_assmbMap->GetNumGlobalBndCoeffs();
m_solnType = m_assmbMap->GetGlobalSysSolnType();
m_nloc = m_assmbMap->GetNumLocalCoeffs();
m_nglo = (m_solnType == eIterativeFull)
? m_assmbMap->GetNumGlobalCoeffs()
: m_assmbMap->GetNumGlobalBndCoeffs();
m_ndir = m_assmbMap->GetNumGlobalDirBndCoeffs();
// Memory allocation for tmp pointer
cudaMalloc((void **)&m_tmpptr, sizeof(TData) * nglo);
cudaMemset(m_tmpptr, 0.0, sizeof(TData) * nglo);
// Memory allocation for global space pointer
cudaMalloc((void **)&m_gloptr, sizeof(TData) * m_nglo);
// Memory allocation for assemble pointer
cudaMalloc((void **)&m_assmbptr, sizeof(int) * nloc);
cudaMalloc((void **)&m_assmbptr, sizeof(int) * m_nloc);
auto assmbptr = m_assmbMap->GetLocalToGlobalMap().get();
cudaMemcpy(m_assmbptr, assmbptr, sizeof(int) * nloc,
cudaMemcpy(m_assmbptr, assmbptr, sizeof(int) * m_nloc,
cudaMemcpyHostToDevice);
m_signChange = m_assmbMap->AssemblyMap::GetSignChange();
......@@ -46,23 +46,20 @@ public:
if (m_signChange)
{
// Memory allocation for sign pointer
cudaMalloc((void **)&m_signptr, sizeof(TData) * nloc);
cudaMalloc((void **)&m_signptr, sizeof(TData) * m_nloc);
auto signptr = m_assmbMap->GetLocalToGlobalSign().get();
cudaMemcpy(m_signptr, signptr, sizeof(TData) * nloc,
cudaMemcpy(m_signptr, signptr, sizeof(TData) * m_nloc,
cudaMemcpyHostToDevice);
}
}
~OperatorAssmbScatrImpl()
{
cudaFree(m_tmpptr);
cudaFree(m_gloptr);
cudaFree(m_assmbptr);
m_tmpptr = nullptr;
m_assmbptr = nullptr;
if (m_signChange)
{
cudaFree(m_signptr);
m_signptr = nullptr;
}
}
......@@ -70,16 +67,15 @@ public:
Field<TData, FieldState::Coeff> &out,
const bool &zeroDir = false)
{
Assemble(in, m_tmpptr);
Assemble(in, m_gloptr);
// Zeroing Dirichlet BC
if (zeroDir)
{
size_t nDir = m_assmbMap->GetNumGlobalDirBndCoeffs();
cudaMemset(m_tmpptr, 0.0, sizeof(TData) * nDir);
cudaMemset(m_gloptr, 0, sizeof(TData) * m_ndir);
}
GlobalToLocal(m_tmpptr, out);
GlobalToLocal(m_gloptr, out);
}
void Assemble(Field<TData, FieldState::Coeff> &in, TData *outptr)
......@@ -87,10 +83,10 @@ public:
auto const *inptr =
in.template GetStorage<MemoryRegionCUDA>().GetGPUPtr();
// Get the solution type
GlobalSysSolnType solnType = m_assmbMap->GetGlobalSysSolnType();
// Zero output
cudaMemset(outptr, 0, sizeof(TData) * m_nglo);
if (solnType == eIterativeFull)
if (m_solnType == eIterativeFull)
{
// Initialise index
size_t expIdx = 0;
......@@ -102,7 +98,7 @@ public:
// Determine shape and type of the element.
auto const expPtr = this->m_expansionList->GetExp(expIdx);
auto nElmts = in.GetBlocks()[block_idx].num_elements;
auto numPts = expPtr->GetNcoeffs();
auto ncoeff = expPtr->GetNcoeffs();
// Deterime CUDA grid parameters.
m_gridSize = nElmts / m_blockSize;
......@@ -111,17 +107,17 @@ public:
if (m_signChange)
{
AssembleKernel<<<m_gridSize, m_blockSize>>>(
numPts, nElmts, offset, m_assmbptr, m_signptr, inptr,
ncoeff, nElmts, offset, m_assmbptr, m_signptr, inptr,
outptr);
}
else
{
AssembleKernel<<<m_gridSize, m_blockSize>>>(
numPts, nElmts, offset, m_assmbptr, inptr, outptr);
ncoeff, nElmts, offset, m_assmbptr, inptr, outptr);
}
// Increment pointer and index for next element type.
offset += numPts * nElmts;
offset += ncoeff * nElmts;
expIdx += nElmts;
}
}
......@@ -131,10 +127,10 @@ public:
{
auto *outptr = out.template GetStorage<MemoryRegionCUDA>().GetGPUPtr();
// Get the solution type
GlobalSysSolnType solnType = m_assmbMap->GetGlobalSysSolnType();
// Zero output
cudaMemset(outptr, 0, sizeof(TData) * m_nloc);
if (solnType == eIterativeFull)
if (m_solnType == eIterativeFull)
{
// Initialise index
size_t expIdx = 0;
......@@ -146,7 +142,7 @@ public:
// Determine shape and type of the element.
auto const expPtr = this->m_expansionList->GetExp(expIdx);
auto nElmts = out.GetBlocks()[block_idx].num_elements;
auto numPts = expPtr->GetNcoeffs();
auto ncoeff = expPtr->GetNcoeffs();
// Deterime CUDA grid parameters.
m_gridSize = nElmts / m_blockSize;
......@@ -155,17 +151,17 @@ public:
if (m_signChange)
{
GlobalToLocalKernel<<<m_gridSize, m_blockSize>>>(
numPts, nElmts, offset, m_assmbptr, m_signptr, inptr,
ncoeff, nElmts, offset, m_assmbptr, m_signptr, inptr,
outptr);
}
else
{
GlobalToLocalKernel<<<m_gridSize, m_blockSize>>>(
numPts, nElmts, offset, m_assmbptr, inptr, outptr);
ncoeff, nElmts, offset, m_assmbptr, inptr, outptr);
}
// Increment pointer and index for next element type.
offset += numPts * nElmts;
offset += ncoeff * nElmts;
expIdx += nElmts;
}
}
......@@ -184,10 +180,14 @@ public:
protected:
AssemblyMapCGSharedPtr m_assmbMap;
TData *m_tmpptr;
GlobalSysSolnType m_solnType;
TData *m_gloptr;
bool m_signChange;
TData *m_signptr;
int *m_assmbptr;
size_t m_nloc;
size_t m_nglo;
size_t m_ndir;
size_t m_gridSize;
size_t m_blockSize = 32;
};
......
......@@ -2,7 +2,7 @@ namespace Nektar::Operators::detail
{
template <typename TData>
__global__ void AssembleKernel(const size_t numPts, const size_t nelmt,
__global__ void AssembleKernel(const size_t ncoeff, const size_t nelmt,
const size_t offset, const int *assmbptr,
const TData *signptr, const TData *inptr,
TData *outptr)
......@@ -14,9 +14,9 @@ __global__ void AssembleKernel(const size_t numPts, const size_t nelmt,
return;
}
size_t index = offset + e * numPts;
size_t index = offset + e * ncoeff;
for (size_t i = 0; i < numPts; i++)
for (size_t i = 0; i < ncoeff; i++)
{
atomicAdd(outptr + assmbptr[index + i],
signptr[index + i] * inptr[index + i]);
......@@ -24,7 +24,7 @@ __global__ void AssembleKernel(const size_t numPts, const size_t nelmt,
}
template <typename TData>
__global__ void AssembleKernel(const size_t numPts, const size_t nelmt,
__global__ void AssembleKernel(const size_t ncoeff, const size_t nelmt,
const size_t offset, const int *assmbptr,
const TData sign, const TData *inptr,
TData *outptr)
......@@ -36,16 +36,16 @@ __global__ void AssembleKernel(const size_t numPts, const size_t nelmt,
return;
}
size_t index = offset + e * numPts;
size_t index = offset + e * ncoeff;
for (size_t i = 0; i < numPts; i++)
for (size_t i = 0; i < ncoeff; i++)
{
atomicAdd(outptr + assmbptr[index + i], sign * inptr[index + i]);
}
}
template <typename TData>
__global__ void AssembleKernel(const size_t numPts, const size_t nelmt,
__global__ void AssembleKernel(const size_t ncoeff, const size_t nelmt,
const size_t offset, const int *assmbptr,
const TData *inptr, TData *outptr)
{
......@@ -56,16 +56,16 @@ __global__ void AssembleKernel(const size_t numPts, const size_t nelmt,
return;
}
size_t index = offset + e * numPts;
size_t index = offset + e * ncoeff;
for (size_t i = 0; i < numPts; i++)
for (size_t i = 0; i < ncoeff; i++)
{
atomicAdd(outptr + assmbptr[index + i], inptr[index + i]);
}
}
template <typename TData>
__global__ void GlobalToLocalKernel(const size_t numPts, const size_t nelmt,
__global__ void GlobalToLocalKernel(const size_t ncoeff, const size_t nelmt,
const size_t offset, const int *assmbptr,
const TData *signptr, const TData *inptr,
TData *outptr)
......@@ -77,16 +77,16 @@ __global__ void GlobalToLocalKernel(const size_t numPts, const size_t nelmt,
return;
}
size_t index = offset + e * numPts;
size_t index = offset + e * ncoeff;
for (size_t i = 0; i < numPts; i++)
for (size_t i = 0; i < ncoeff; i++)
{
outptr[index + i] = signptr[index + i] * inptr[assmbptr[index + i]];
}
}
template <typename TData>
__global__ void GlobalToLocalKernel(const size_t numPts, const size_t nelmt,
__global__ void GlobalToLocalKernel(const size_t ncoeff, const size_t nelmt,
const size_t offset, const int *assmbptr,
const TData sign, const TData *inptr,
TData *outptr)
......@@ -98,16 +98,16 @@ __global__ void GlobalToLocalKernel(const size_t numPts, const size_t nelmt,
return;
}
size_t index = offset + e * numPts;
size_t index = offset + e * ncoeff;
for (size_t i = 0; i < numPts; i++)
for (size_t i = 0; i < ncoeff; i++)
{
outptr[index + i] = sign * inptr[assmbptr[index + i]];
}
}
template <typename TData>
__global__ void GlobalToLocalKernel(const size_t numPts, const size_t nelmt,
__global__ void GlobalToLocalKernel(const size_t ncoeff, const size_t nelmt,
const size_t offset, const int *assmbptr,
const TData *inptr, TData *outptr)
{
......@@ -118,9 +118,9 @@ __global__ void GlobalToLocalKernel(const size_t numPts, const size_t nelmt,
return;
}
size_t index = offset + e * numPts;
size_t index = offset + e * ncoeff;
for (size_t i = 0; i < numPts; i++)
for (size_t i = 0; i < ncoeff; i++)
{
outptr[index + i] = inptr[assmbptr[index + i]];
}
......
......@@ -2,7 +2,6 @@
#include "Operators/OperatorAssmbScatr.hpp"
#include <LibUtilities/BasicUtils/Vmath.hpp>
#include <MultiRegions/AssemblyMap/AssemblyMapCG.h>
#include <MultiRegions/ContField.h>
......@@ -25,37 +24,35 @@ public:
std::dynamic_pointer_cast<ContField>(this->m_expansionList);
m_assmbMap = contfield->GetLocalToGlobalMap();
GlobalSysSolnType solnType = m_assmbMap->GetGlobalSysSolnType();
auto nglo = (solnType == eIterativeFull)
? m_assmbMap->GetNumGlobalCoeffs()
: m_assmbMap->GetNumGlobalBndCoeffs();
m_solnType = m_assmbMap->GetGlobalSysSolnType();
m_nloc = m_assmbMap->GetNumLocalCoeffs();
m_nglo = (m_solnType == eIterativeFull)
? m_assmbMap->GetNumGlobalCoeffs()
: m_assmbMap->GetNumGlobalBndCoeffs();
m_ndir = m_assmbMap->GetNumGlobalDirBndCoeffs();
m_tmp = Array<OneD, TData>(nglo);
m_glo = Array<OneD, TData>(m_nglo);
}
void apply(Field<TData, FieldState::Coeff> &in,
Field<TData, FieldState::Coeff> &out,
const bool &zeroDir = false)
{
Assemble(in, m_tmp);
Assemble(in, m_glo);
// Zeroing Dirichlet BC
if (zeroDir)
{
auto nDir = m_assmbMap->GetNumGlobalDirBndCoeffs();
Vmath::Zero(nDir, m_tmp.get(), 1);
Vmath::Zero(m_ndir, m_glo.get(), 1);
}
GlobalToLocal(m_tmp, out);
GlobalToLocal(m_glo, out);
}
void Assemble(Field<TData, FieldState::Coeff> &in,
Array<OneD, TData> outarray)
{
// Get the solution type
GlobalSysSolnType solnType = m_assmbMap->GetGlobalSysSolnType();
auto nloc = m_assmbMap->GetNumLocalCoeffs();
Array<OneD, TData> inarray(nloc);
Array<OneD, TData> inarray(m_nloc);
// Copy data from input field
auto *inarrptr = inarray.data();
......@@ -73,7 +70,7 @@ public:
inptr += nSize;
}
if (solnType == eIterativeFull)
if (m_solnType == eIterativeFull)
{
m_assmbMap->Assemble(inarray, outarray);
}
......@@ -86,13 +83,9 @@ public:
void GlobalToLocal(Array<OneD, TData> inarray,
Field<TData, FieldState::Coeff> &out)
{
// Get the solution type
GlobalSysSolnType solnType = m_assmbMap->GetGlobalSysSolnType();
auto nloc = m_assmbMap->GetNumLocalCoeffs();
Array<OneD, TData> outarray(nloc);
Array<OneD, TData> outarray(m_nloc);
if (solnType == eIterativeFull)
if (m_solnType == eIterativeFull)
{
m_assmbMap->GlobalToLocal(inarray, outarray);
}
......@@ -131,7 +124,11 @@ public:
protected:
AssemblyMapCGSharedPtr m_assmbMap;
Array<OneD, TData> m_tmp;
GlobalSysSolnType m_solnType;
Array<OneD, TData> m_glo;
size_t m_nloc;
size_t m_nglo;
size_t m_ndir;
};
} // namespace Nektar::Operators::detail
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment