Skip to content
Snippets Groups Projects
Commit a2912074 authored by Jacques Xing's avatar Jacques Xing Committed by Chris Cantwell
Browse files

Improve efficiency of DiagPrecon operators

parent 47f84d4e
No related branches found
No related tags found
1 merge request!76Improve efficiency of DiagPrecon operators
...@@ -3,7 +3,9 @@ ...@@ -3,7 +3,9 @@
#include "Field.hpp" #include "Field.hpp"
#include "Operators/AssmbScatr/AssmbScatrCUDA.hpp" #include "Operators/AssmbScatr/AssmbScatrCUDA.hpp"
#include "Operators/CUDAMathKernels.cuh" #include "Operators/CUDAMathKernels.cuh"
#include "Operators/DiagPrecon/DiagPreconCUDAKernel.cuh"
#include "Operators/OperatorDiagPrecon.hpp" #include "Operators/OperatorDiagPrecon.hpp"
#include "Operators/OperatorHelper.cuh"
#include "Operators/OperatorRobBndCond.hpp" #include "Operators/OperatorRobBndCond.hpp"
#include <MultiRegions/AssemblyMap/AssemblyMapCG.h> #include <MultiRegions/AssemblyMap/AssemblyMapCG.h>
...@@ -40,9 +42,6 @@ public: ...@@ -40,9 +42,6 @@ public:
m_assmbScatr = m_assmbScatr =
std::static_pointer_cast<OperatorAssmbScatrImpl<TData, ImplCUDA>>( std::static_pointer_cast<OperatorAssmbScatrImpl<TData, ImplCUDA>>(
AssmbScatr<TData>::create(this->m_expansionList, "CUDA")); AssmbScatr<TData>::create(this->m_expansionList, "CUDA"));
// Deterime CUDA grid size.
m_gridSize = GetCUDAGridSize(m_nGlobal - m_nDir, m_blockSize);
} }
~OperatorDiagPreconImpl(void) ~OperatorDiagPreconImpl(void)
...@@ -54,6 +53,9 @@ public: ...@@ -54,6 +53,9 @@ public:
void apply(Field<TData, FieldState::Coeff> &in, void apply(Field<TData, FieldState::Coeff> &in,
Field<TData, FieldState::Coeff> &out) override Field<TData, FieldState::Coeff> &out) override
{ {
// Determine CUDA grid size.
m_gridSize = GetCUDAGridSize(m_nGlobal - m_nDir, m_blockSize);
m_assmbScatr->Assemble(in, m_wk); m_assmbScatr->Assemble(in, m_wk);
vdivKernel<<<m_gridSize, m_blockSize>>>( vdivKernel<<<m_gridSize, m_blockSize>>>(
...@@ -71,7 +73,8 @@ public: ...@@ -71,7 +73,8 @@ public:
// auto robBCOp = RobBndCond<TData>::create(this->m_expansionList, // auto robBCOp = RobBndCond<TData>::create(this->m_expansionList,
// "CUDA"); // "CUDA");
Array<OneD, TData> diag(m_nLocal); TData *diag;
cudaMalloc((void **)&diag, sizeof(TData) * m_nLocal);
// create unit vector field to extract diagonal // create unit vector field to extract diagonal
Field<TData, FieldState::Coeff> unit_vec = Field<TData, FieldState::Coeff> unit_vec =
...@@ -87,36 +90,58 @@ public: ...@@ -87,36 +90,58 @@ public:
unit_vec.template GetStorage<MemoryRegionCUDA>().GetGPUPtr(); unit_vec.template GetStorage<MemoryRegionCUDA>().GetGPUPtr();
auto *actn_ptr = auto *actn_ptr =
action.template GetStorage<MemoryRegionCUDA>().GetGPUPtr(); action.template GetStorage<MemoryRegionCUDA>().GetGPUPtr();
auto diag_ptr = diag;
TData init = 1.0; size_t expIdx = 0;
for (size_t i = 0; i < m_nLocal; ++i) for (size_t block_idx = 0; block_idx < unit_vec.GetBlocks().size();
++block_idx)
{ {
// set ith term in unit vector to be 1 // Determine shape and type of the element.
cudaMemcpy(uvec_ptr + i, &init, sizeof(TData), auto const expPtr = this->m_expansionList->GetExp(expIdx);
cudaMemcpyHostToDevice); auto nElmts = unit_vec.GetBlocks()[block_idx].num_elements;
auto nmTot = expPtr->GetNcoeffs();
// apply operator to unit vector and store in action field
op->apply(unit_vec, action); // Deterime CUDA grid size.
// robBCOp->apply(unit_vec, action); m_gridSize = GetCUDAGridSize(nElmts, m_blockSize);
// copy ith row term from the action field to get ith diagonal for (size_t i = 0; i < nmTot; ++i)
cudaMemcpy(diag.get() + i, actn_ptr + i, sizeof(TData), {
cudaMemcpyDeviceToHost); // set ith term in unit vector to be 1
SetDiagonalKernel<<<m_gridSize, m_blockSize>>>(nmTot, nElmts, i,
// reset ith term in unit vector to be 0 1.0, uvec_ptr);
cudaMemset(uvec_ptr + i, 0, sizeof(TData));
// apply operator to unit vector and store in action field
op->apply(unit_vec, action);
// robBCOp->apply(unit_vec, action);
// copy ith row term from the action field to get ith diagonal
CopyDiagonalKernel<<<m_gridSize, m_blockSize>>>(
nmTot, nElmts, i, actn_ptr, diag_ptr);
// reset ith term in unit vector to be 0
SetDiagonalKernel<<<m_gridSize, m_blockSize>>>(nmTot, nElmts, i,
0.0, uvec_ptr);
}
uvec_ptr += unit_vec.GetBlocks()[block_idx].block_size;
diag_ptr += unit_vec.GetBlocks()[block_idx].block_size;
actn_ptr += unit_vec.GetBlocks()[block_idx].block_size;
expIdx += nElmts;
} }
// Assembly // Assembly
Array<OneD, TData> tmp(m_nGlobal, 0.0); Array<OneD, TData> locdiag(m_nLocal);
Array<OneD, TData> glodiag(m_nGlobal, 0.0);
cudaMemcpy(locdiag.get(), diag, sizeof(TData) * m_nLocal,
cudaMemcpyDeviceToHost);
for (size_t i = 0; i < m_nLocal; ++i) for (size_t i = 0; i < m_nLocal; ++i)
{ {
size_t gid1 = m_assmbMap->GetLocalToGlobalMap(i); size_t gid1 = m_assmbMap->GetLocalToGlobalMap(i);
tmp[gid1] += diag[i]; glodiag[gid1] += locdiag[i];
} }
m_assmbMap->UniversalAssemble(tmp); m_assmbMap->UniversalAssemble(glodiag);
cudaMemcpy(m_diag, tmp.get(), sizeof(TData) * m_nGlobal, cudaMemcpy(m_diag, glodiag.get(), sizeof(TData) * m_nGlobal,
cudaMemcpyHostToDevice); cudaMemcpyHostToDevice);
} }
......
namespace Nektar::Operators::detail
{

/// Set coefficient (mode) `mode` of every element to `val`.
///
/// Layout: `out` holds `nelmt` contiguous element blocks of `nm`
/// coefficients each, so the target entry of element `e` is
/// `out[e * nm + mode]`. One thread per element; the grid-stride loop
/// makes the kernel correct for any 1-D launch configuration.
///
/// \param nm    Number of coefficients (modes) per element.
/// \param nelmt Number of elements to process.
/// \param mode  Coefficient index to write (precondition: mode < nm).
/// \param val   Value stored into the selected coefficient.
/// \param out   Device pointer to nelmt * nm coefficients (written).
template <typename TData>
__global__ void SetDiagonalKernel(const size_t nm, const size_t nelmt,
                                  const size_t mode, const TData val,
                                  TData *out)
{
    // Widen BEFORE multiplying: blockDim.x * blockIdx.x is a 32-bit
    // unsigned product and can overflow on very large grids.
    const size_t stride = static_cast<size_t>(blockDim.x) * gridDim.x;
    for (size_t e = static_cast<size_t>(blockDim.x) * blockIdx.x + threadIdx.x;
         e < nelmt; e += stride)
    {
        out[e * nm + mode] = val;
    }
}

/// Copy coefficient (mode) `mode` of every element from `in` to `out`.
///
/// Same per-element layout as SetDiagonalKernel: entry `mode` of element
/// `e` lives at offset `e * nm + mode` in both arrays. `in` and `out`
/// must not alias (__restrict__), which also enables the read-only
/// data-cache path for `in`.
///
/// \param nm    Number of coefficients (modes) per element.
/// \param nelmt Number of elements to process.
/// \param mode  Coefficient index to copy (precondition: mode < nm).
/// \param in    Device pointer to nelmt * nm coefficients (read).
/// \param out   Device pointer to nelmt * nm coefficients (written).
template <typename TData>
__global__ void CopyDiagonalKernel(const size_t nm, const size_t nelmt,
                                   const size_t mode,
                                   const TData *__restrict__ in,
                                   TData *__restrict__ out)
{
    const size_t stride = static_cast<size_t>(blockDim.x) * gridDim.x;
    for (size_t e = static_cast<size_t>(blockDim.x) * blockIdx.x + threadIdx.x;
         e < nelmt; e += stride)
    {
        out[e * nm + mode] = in[e * nm + mode];
    }
}

} // namespace Nektar::Operators::detail
...@@ -78,20 +78,41 @@ public: ...@@ -78,20 +78,41 @@ public:
auto *actn_ptr = action.GetStorage().GetCPUPtr(); auto *actn_ptr = action.GetStorage().GetCPUPtr();
auto *diag_ptr = diag.get(); auto *diag_ptr = diag.get();
for (size_t i = 0; i < m_nLocal; ++i) size_t expIdx = 0;
for (size_t block_idx = 0; block_idx < unit_vec.GetBlocks().size();
++block_idx)
{ {
// set ith term in unit vector to be 1 // Determine shape and type of the element.
*uvec_ptr = 1.0; auto const expPtr = this->m_expansionList->GetExp(expIdx);
auto nElmts = unit_vec.GetBlocks()[block_idx].num_elements;
// apply operator to unit vector and store in action field auto nmTot = expPtr->GetNcoeffs();
op->apply(unit_vec, action);
robBCOp->apply(unit_vec, action); for (size_t i = 0; i < nmTot; ++i)
{
// copy ith row term from the action field to get ith diagonal for (size_t j = 0; j < nElmts; ++j)
*(diag_ptr++) = *(actn_ptr++); {
// set ith term in unit vector to be 1
// reset ith term in unit vector to be 0 uvec_ptr[j * nmTot + i] = 1.0;
*(uvec_ptr++) = 0.0; }
// apply operator to unit vector and store in action field
op->apply(unit_vec, action);
robBCOp->apply(unit_vec, action);
for (size_t j = 0; j < nElmts; ++j)
{
// copy ith row term from the action field to get ith
// diagonal
diag_ptr[j * nmTot + i] = actn_ptr[j * nmTot + i];
// reset ith term in unit vector to be 0
uvec_ptr[j * nmTot + i] = 0.0;
}
}
uvec_ptr += unit_vec.GetBlocks()[block_idx].block_size;
diag_ptr += unit_vec.GetBlocks()[block_idx].block_size;
actn_ptr += unit_vec.GetBlocks()[block_idx].block_size;
expIdx += nElmts;
} }
// Assembly // Assembly
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment