Compare revisions

Changes are shown as if the source revision was being merged into the target revision.

Target project: nektar/redesign-prototypes
Commits on Source (2)
Showing with 717 additions and 264 deletions
......@@ -70,9 +70,9 @@ if (NEKTAR_USE_CUDA)
Operators/DirBndCond/DirBndCondCUDA.cu
Operators/NeuBndCond/NeuBndCondCUDA.cu
#Operators/RobBndCond/RobBndCondCUDA.cu
#Operators/ConjGrad/ConjGradCUDA.cu
#Operators/FwdTrans/FwdTransCUDA.cu
#Operators/HelmSolve/HelmSolveCUDA.cu
Operators/ConjGrad/ConjGradCUDA.cu
Operators/FwdTrans/FwdTransCUDA.cu
Operators/HelmSolve/HelmSolveCUDA.cu
Operators/Matrix/MatrixCUDA.cu
)
endif()
......
......@@ -3,6 +3,7 @@
#include "MemoryRegionCUDA.hpp"
#include "Operators/AssmbScatr/AssmbScatrCUDAKernels.cuh"
#include "Operators/OperatorAssmbScatr.hpp"
#include "Operators/OperatorHelper.cuh"
#include <MultiRegions/AssemblyMap/AssemblyMapCG.h>
#include <MultiRegions/ContField.h>
......@@ -100,9 +101,8 @@ public:
auto nElmts = in.GetBlocks()[block_idx].num_elements;
auto ncoeff = expPtr->GetNcoeffs();
// Deterime CUDA grid parameters.
m_gridSize = nElmts / m_blockSize;
m_gridSize += (nElmts % m_blockSize == 0) ? 0 : 1;
// Determine CUDA grid size.
m_gridSize = GetCUDAGridSize(nElmts, m_blockSize);
if (m_signChange)
{
......@@ -144,9 +144,8 @@ public:
auto nElmts = out.GetBlocks()[block_idx].num_elements;
auto ncoeff = expPtr->GetNcoeffs();
// Deterime CUDA grid parameters.
m_gridSize = nElmts / m_blockSize;
m_gridSize += (nElmts % m_blockSize == 0) ? 0 : 1;
// Determine CUDA grid size.
m_gridSize = GetCUDAGridSize(nElmts, m_blockSize);
if (m_signChange)
{
......@@ -188,7 +187,7 @@ protected:
size_t m_nloc;
size_t m_nglo;
size_t m_ndir;
size_t m_gridSize;
size_t m_gridSize = 1024;
size_t m_blockSize = 32;
};
......
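The manual grid-size computation in this file (and in the operators below) is replaced by a call to GetCUDAGridSize from the newly included Operators/OperatorHelper.cuh. The helper itself is not part of this diff; a minimal sketch of what it is assumed to do (ceiling division, optionally capped, which is safe because the kernels now use grid-stride loops) is:

// Hypothetical sketch of the helper assumed to live in Operators/OperatorHelper.cuh;
// the committed implementation is not shown in this diff.
inline size_t GetCUDAGridSize(size_t nsize, size_t blockSize)
{
    // Ceiling division so the grid covers all nsize work items.
    size_t gridSize = (nsize + blockSize - 1) / blockSize;

    // A cap (value assumed here) is harmless: the grid-stride kernels simply
    // process more than one item per thread when the grid is smaller.
    return gridSize > 1024 ? 1024 : gridSize;
}

With grid-stride kernels, the new default member value m_gridSize = 1024 also remains a valid launch configuration if apply() runs before the size is recomputed.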
......@@ -9,17 +9,16 @@ __global__ void AssembleKernel(const size_t ncoeff, const size_t nelmt,
{
size_t e = blockDim.x * blockIdx.x + threadIdx.x;
if (e >= nelmt)
while (e < nelmt)
{
return;
}
size_t index = offset + e * ncoeff;
for (size_t i = 0; i < ncoeff; i++)
{
atomicAdd(outptr + assmbptr[index + i],
signptr[index + i] * inptr[index + i]);
size_t index = offset + e * ncoeff;
for (size_t i = 0; i < ncoeff; i++)
{
atomicAdd(outptr + assmbptr[index + i],
signptr[index + i] * inptr[index + i]);
}
e += blockDim.x * gridDim.x;
}
}
......@@ -31,16 +30,15 @@ __global__ void AssembleKernel(const size_t ncoeff, const size_t nelmt,
{
size_t e = blockDim.x * blockIdx.x + threadIdx.x;
if (e >= nelmt)
while (e < nelmt)
{
return;
}
size_t index = offset + e * ncoeff;
size_t index = offset + e * ncoeff;
for (size_t i = 0; i < ncoeff; i++)
{
atomicAdd(outptr + assmbptr[index + i], sign * inptr[index + i]);
for (size_t i = 0; i < ncoeff; i++)
{
atomicAdd(outptr + assmbptr[index + i], sign * inptr[index + i]);
}
e += blockDim.x * gridDim.x;
}
}
......@@ -51,16 +49,15 @@ __global__ void AssembleKernel(const size_t ncoeff, const size_t nelmt,
{
size_t e = blockDim.x * blockIdx.x + threadIdx.x;
if (e >= nelmt)
while (e < nelmt)
{
return;
}
size_t index = offset + e * ncoeff;
size_t index = offset + e * ncoeff;
for (size_t i = 0; i < ncoeff; i++)
{
atomicAdd(outptr + assmbptr[index + i], inptr[index + i]);
for (size_t i = 0; i < ncoeff; i++)
{
atomicAdd(outptr + assmbptr[index + i], inptr[index + i]);
}
e += blockDim.x * gridDim.x;
}
}
......@@ -72,16 +69,15 @@ __global__ void GlobalToLocalKernel(const size_t ncoeff, const size_t nelmt,
{
size_t e = blockDim.x * blockIdx.x + threadIdx.x;
if (e >= nelmt)
while (e < nelmt)
{
return;
}
size_t index = offset + e * ncoeff;
size_t index = offset + e * ncoeff;
for (size_t i = 0; i < ncoeff; i++)
{
outptr[index + i] = signptr[index + i] * inptr[assmbptr[index + i]];
for (size_t i = 0; i < ncoeff; i++)
{
outptr[index + i] = signptr[index + i] * inptr[assmbptr[index + i]];
}
e += blockDim.x * gridDim.x;
}
}
......@@ -93,16 +89,15 @@ __global__ void GlobalToLocalKernel(const size_t ncoeff, const size_t nelmt,
{
size_t e = blockDim.x * blockIdx.x + threadIdx.x;
if (e >= nelmt)
while (e < nelmt)
{
return;
}
size_t index = offset + e * ncoeff;
size_t index = offset + e * ncoeff;
for (size_t i = 0; i < ncoeff; i++)
{
outptr[index + i] = sign * inptr[assmbptr[index + i]];
for (size_t i = 0; i < ncoeff; i++)
{
outptr[index + i] = sign * inptr[assmbptr[index + i]];
}
e += blockDim.x * gridDim.x;
}
}
......@@ -113,16 +108,15 @@ __global__ void GlobalToLocalKernel(const size_t ncoeff, const size_t nelmt,
{
size_t e = blockDim.x * blockIdx.x + threadIdx.x;
if (e >= nelmt)
while (e < nelmt)
{
return;
}
size_t index = offset + e * ncoeff;
size_t index = offset + e * ncoeff;
for (size_t i = 0; i < ncoeff; i++)
{
outptr[index + i] = inptr[assmbptr[index + i]];
for (size_t i = 0; i < ncoeff; i++)
{
outptr[index + i] = inptr[assmbptr[index + i]];
}
e += blockDim.x * gridDim.x;
}
}
......
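All of the assembly and gather/scatter kernels above move from a one-thread-per-element early return to the grid-stride loop idiom, so correctness no longer depends on the launch covering every element. A generic sketch of the idiom (illustrative only, not a kernel from this change set):

// Generic grid-stride loop sketch; each thread strides over the index space
// in steps of the total number of launched threads.
template <typename TData>
__global__ void GridStrideExampleKernel(const size_t nsize, const TData alpha,
                                        const TData *inptr, TData *outptr)
{
    size_t i = blockDim.x * blockIdx.x + threadIdx.x;
    while (i < nsize)
    {
        outptr[i] += alpha * inptr[i];
        i += blockDim.x * gridDim.x;
    }
}

The same rewrite is applied to the Dirichlet boundary-condition and diffusion-coefficient kernels later in this diff.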
#pragma once
#include "MemoryRegionCUDA.hpp"
#include "Operators/BwdTrans/BwdTransCUDAKernels.cuh"
#include "Operators/OperatorBwdTrans.hpp"
......@@ -90,9 +92,8 @@ public:
auto nqTot = expPtr->GetTotPoints();
auto shape = expPtr->DetShapeType();
// Deterime CUDA grid parameters.
m_gridSize = nElmts / m_blockSize;
m_gridSize += (nElmts % m_blockSize == 0) ? 0 : 1;
// Determine CUDA grid size.
m_gridSize = GetCUDAGridSize(nElmts, m_blockSize);
// Flag for collapsed coordinate correction.
bool correct = expPtr->GetBasis(0)->GetBasisType() ==
......
#include "ConjGradCUDA.hpp"
namespace Nektar::Operators::detail
{
// Register implementation with Operator Factory
template <>
std::string OperatorConjGradImpl<double, ImplCUDA>::className =
GetOperatorFactory<double>().RegisterCreatorFunction(
"ConjGradCUDA", OperatorConjGradImpl<double, ImplCUDA>::instantiate,
"...");
} // namespace Nektar::Operators::detail
#pragma once
#include <algorithm>
#include <array>
#include <assert.h>
#include <cmath>
#include <memory>
#include <LibUtilities/BasicUtils/SessionReader.h>
#include <LibUtilities/BasicUtils/Vmath.hpp>
#include <MultiRegions/ContField.h>
#include "MemoryRegionCUDA.hpp"
#include "Operators/CUDAMathKernels.cuh"
#include "Operators/OperatorAssmbScatr.hpp"
#include "Operators/OperatorConjGrad.hpp"
#include "Operators/OperatorHelper.cuh"
#include "Operators/OperatorRobBndCond.hpp"
using namespace Nektar;
using namespace Nektar::MultiRegions;
namespace Nektar::Operators::detail
{
template <typename TData>
class OperatorConjGradImpl<TData, ImplCUDA> : public OperatorConjGrad<TData>
{
public:
OperatorConjGradImpl(const MultiRegions::ExpListSharedPtr &expansionList)
: OperatorConjGrad<TData>(expansionList),
m_w_A(Field<TData, FieldState::Coeff>::template create<
MemoryRegionCUDA>(
GetBlockAttributes(FieldState::Coeff, expansionList))),
m_s_A(Field<TData, FieldState::Coeff>::template create<
MemoryRegionCUDA>(
GetBlockAttributes(FieldState::Coeff, expansionList))),
m_r_A(Field<TData, FieldState::Coeff>::template create<
MemoryRegionCUDA>(
GetBlockAttributes(FieldState::Coeff, expansionList))),
m_wk(Field<TData, FieldState::Coeff>::template create<
MemoryRegionCUDA>(
GetBlockAttributes(FieldState::Coeff, expansionList)))
{
auto contfield =
std::dynamic_pointer_cast<ContField>(this->m_expansionList);
contfield->GetSession()->LoadParameter("NekLinSysMaxIterations",
m_maxIter, 5000);
contfield->GetSession()->LoadParameter("IterativeSolverTolerance",
m_tol, 1.0E-09);
m_nloc = contfield->GetLocalToGlobalMap()->GetNumLocalCoeffs();
m_assmbScatr = AssmbScatr<TData>::create(this->m_expansionList, "CUDA");
// m_robBndCond = RobBndCond<TData>::create(this->m_expansionList,
// "CUDA");
cudaMalloc((void **)&m_p_A, sizeof(TData) * m_nloc);
cudaMalloc((void **)&m_q_A, sizeof(TData) * m_nloc);
cudaMalloc((void **)&m_vExchange, sizeof(TData) * 4);
// Determine CUDA grid size.
m_gridSize = GetCUDAGridSize(m_nloc, m_blockSize);
}
~OperatorConjGradImpl(void)
{
cudaFree(m_p_A);
cudaFree(m_q_A);
cudaFree(m_vExchange);
}
void apply(Field<TData, FieldState::Coeff> &in,
Field<TData, FieldState::Coeff> &out) override
{
// Store pointers to temporary fields (device)
auto *p_in = in.template GetStorage<MemoryRegionCUDA>().GetGPUPtr();
auto *p_out = out.template GetStorage<MemoryRegionCUDA>().GetGPUPtr();
auto *p_w_A = m_w_A.template GetStorage<MemoryRegionCUDA>().GetGPUPtr();
auto *p_s_A = m_s_A.template GetStorage<MemoryRegionCUDA>().GetGPUPtr();
auto *p_r_A = m_r_A.template GetStorage<MemoryRegionCUDA>().GetGPUPtr();
auto *p_wk = m_wk.template GetStorage<MemoryRegionCUDA>().GetGPUPtr();
auto p_p_A = m_p_A;
auto p_q_A = m_q_A;
// Set the fields to zero (device)
cudaMemset(p_out, 0, sizeof(TData) * m_nloc);
cudaMemset(p_w_A, 0, sizeof(TData) * m_nloc);
cudaMemset(p_s_A, 0, sizeof(TData) * m_nloc);
cudaMemset(p_wk, 0, sizeof(TData) * m_nloc);
cudaMemset(p_p_A, 0, sizeof(TData) * m_nloc);
cudaMemset(p_q_A, 0, sizeof(TData) * m_nloc);
// Convergence parameters (host)
size_t totalIterations = 0;
TData rhsMagnitude;
TData alpha;
TData beta;
TData rho;
TData rho_new;
TData mu;
TData eps;
// Temporary (device)
cudaMemset(m_vExchange, 0, sizeof(TData) * 4);
// Copy RHS into initial residual
cudaMemcpy(p_r_A, p_in, sizeof(TData) * m_nloc,
cudaMemcpyDeviceToDevice);
// Assembly (communication)
m_assmbScatr->apply(m_r_A, m_wk, true);
dotKernel<<<m_gridSize, m_blockSize, sizeof(TData) * m_blockSize>>>(
m_nloc, p_wk, p_r_A, m_vExchange + 2);
cudaMemcpy(&eps, m_vExchange + 2, sizeof(TData),
cudaMemcpyDeviceToHost);
// Calculate rhs magnitude
m_assmbScatr->apply(m_r_A, m_wk);
dotKernel<<<m_gridSize, m_blockSize, sizeof(TData) * m_blockSize>>>(
m_nloc, p_in, p_wk, m_vExchange + 3);
cudaMemcpy(&rhsMagnitude, m_vExchange + 3, sizeof(TData),
cudaMemcpyDeviceToHost);
rhsMagnitude = (rhsMagnitude > 1.0e-6) ? rhsMagnitude : 1.0;
// If input residual is less than tolerance skip solve.
if (eps < m_tol * m_tol * rhsMagnitude)
{
return;
}
// Apply preconditioner
this->m_precon->apply(m_r_A, m_w_A);
// Perform the method-specific matrix-vector multiply operation.
this->m_LHS->apply(m_w_A, m_s_A);
// Apply Robin BCs
// m_robBndCond->apply(m_w_A, m_s_A);
cudaMemset(m_vExchange, 0, sizeof(TData) * 3);
dotKernel<<<m_gridSize, m_blockSize, sizeof(TData) * m_blockSize>>>(
m_nloc, p_r_A, p_w_A, m_vExchange + 0);
dotKernel<<<m_gridSize, m_blockSize, sizeof(TData) * m_blockSize>>>(
m_nloc, p_s_A, p_w_A, m_vExchange + 1);
cudaMemcpy(&rho, m_vExchange + 0, sizeof(TData),
cudaMemcpyDeviceToHost);
cudaMemcpy(&mu, m_vExchange + 1, sizeof(TData), cudaMemcpyDeviceToHost);
beta = 0.0;
alpha = rho / mu;
totalIterations = 1;
while (true)
{
if (totalIterations > m_maxIter)
{
std::cout << "Exceeded max iterations\n";
return;
}
// Compute new search direction p_k
daxpyKernel<<<m_gridSize, m_blockSize>>>(m_nloc, beta, p_p_A, p_w_A,
p_p_A);
// Compute new search direction q_k
daxpyKernel<<<m_gridSize, m_blockSize>>>(m_nloc, beta, p_q_A, p_s_A,
p_q_A);
// Update solution x_{k+1}
daxpyKernel<<<m_gridSize, m_blockSize>>>(m_nloc, alpha, p_p_A,
p_out, p_out);
// Update residual vector r_{k+1}
daxpyKernel<<<m_gridSize, m_blockSize>>>(m_nloc, -alpha, p_q_A,
p_r_A, p_r_A);
// Apply preconditioner
this->m_precon->apply(m_r_A, m_w_A);
// Perform the method-specific matrix-vector multiply operation.
this->m_LHS->apply(m_w_A, m_s_A);
// Apply Robin BCs
// m_robBndCond->apply(m_w_A, m_s_A);
cudaMemset(m_vExchange, 0, sizeof(TData) * 3);
// <r_{k+1}, w_{k+1}>
dotKernel<<<m_gridSize, m_blockSize, sizeof(TData) * m_blockSize>>>(
m_nloc, p_r_A, p_w_A, m_vExchange + 0);
// <s_{k+1}, w_{k+1}>
dotKernel<<<m_gridSize, m_blockSize, sizeof(TData) * m_blockSize>>>(
m_nloc, p_s_A, p_w_A, m_vExchange + 1);
// <r_{k+1}, r_{k+1}>
m_assmbScatr->apply(m_r_A, m_wk, true);
dotKernel<<<m_gridSize, m_blockSize, sizeof(TData) * m_blockSize>>>(
m_nloc, p_wk, p_r_A, m_vExchange + 2);
cudaMemcpy(&rho_new, m_vExchange + 0, sizeof(TData),
cudaMemcpyDeviceToHost);
cudaMemcpy(&mu, m_vExchange + 1, sizeof(TData),
cudaMemcpyDeviceToHost);
cudaMemcpy(&eps, m_vExchange + 2, sizeof(TData),
cudaMemcpyDeviceToHost);
std::cout << "Iteration " << totalIterations << " -- eps = " << eps
<< "\n";
totalIterations++;
// Test if norm is within tolerance
if (eps < m_tol * m_tol * rhsMagnitude)
{
break;
}
// Compute search direction and solution coefficients
beta = rho_new / rho;
alpha = rho_new / (mu - rho_new * beta / alpha);
rho = rho_new;
}
}
// instantiation function for CreatorFunction in Operator Factory
static std::unique_ptr<Operator<TData>> instantiate(
const MultiRegions::ExpListSharedPtr &expansionList)
{
return std::make_unique<OperatorConjGradImpl<TData, ImplCUDA>>(
expansionList);
}
// className - for OperatorFactory
static std::string className;
protected:
std::shared_ptr<OperatorAssmbScatr<TData>> m_assmbScatr;
std::shared_ptr<OperatorRobBndCond<TData>> m_robBndCond;
Field<TData, FieldState::Coeff> m_w_A;
Field<TData, FieldState::Coeff> m_s_A;
Field<TData, FieldState::Coeff> m_r_A;
Field<TData, FieldState::Coeff> m_wk;
TData *m_q_A;
TData *m_p_A;
TData *m_vExchange;
TData m_tol;
size_t m_nloc;
size_t m_maxIter;
size_t m_gridSize = 1024;
size_t m_blockSize = 32;
};
} // namespace Nektar::Operators::detail
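ConjGradCUDA leans on dotKernel from Operators/CUDAMathKernels.cuh, launched with sizeof(TData) * m_blockSize bytes of dynamic shared memory and a single device slot of m_vExchange as output, which the host then reads back with cudaMemcpy. The kernel itself is outside this diff; a sketch consistent with that calling convention (block-level shared-memory reduction plus one atomicAdd per block into a pre-zeroed slot) might look like:

// Hypothetical sketch of a dot-product kernel matching the launches above;
// the actual CUDAMathKernels.cuh implementation may differ. Assumes the
// block size is a power of two (it is 32 here) and that the output slot
// has been zeroed with cudaMemset before the launch.
template <typename TData>
__global__ void dotKernelSketch(const size_t nsize, const TData *xptr,
                                const TData *yptr, TData *outptr)
{
    extern __shared__ unsigned char smem[];
    TData *partial = reinterpret_cast<TData *>(smem);

    // Grid-stride accumulation into a per-thread partial sum.
    TData sum = 0;
    for (size_t i = blockDim.x * blockIdx.x + threadIdx.x; i < nsize;
         i += blockDim.x * gridDim.x)
    {
        sum += xptr[i] * yptr[i];
    }
    partial[threadIdx.x] = sum;
    __syncthreads();

    // Tree reduction within the block.
    for (unsigned int s = blockDim.x / 2; s > 0; s >>= 1)
    {
        if (threadIdx.x < s)
        {
            partial[threadIdx.x] += partial[threadIdx.x + s];
        }
        __syncthreads();
    }

    // One atomic per block accumulates the block result into the output slot.
    if (threadIdx.x == 0)
    {
        atomicAdd(outptr, partial[0]);
    }
}

As in the CPU implementation further down, the iteration stops once eps, which holds the assembled inner product <r, r>, falls below m_tol * m_tol * rhsMagnitude.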
#include "Operators/OperatorConjGrad.hpp"
#pragma once
#include <algorithm>
#include <array>
......@@ -11,6 +11,7 @@
#include <MultiRegions/ContField.h>
#include "Operators/OperatorAssmbScatr.hpp"
#include "Operators/OperatorConjGrad.hpp"
#include "Operators/OperatorRobBndCond.hpp"
using namespace Nektar;
......@@ -40,22 +41,19 @@ public:
m_maxIter, 5000);
contfield->GetSession()->LoadParameter("IterativeSolverTolerance",
m_tol, 1.0E-09);
m_assmbMap = contfield->GetLocalToGlobalMap();
m_nloc = contfield->GetLocalToGlobalMap()->GetNumLocalCoeffs();
m_assmbScatr = AssmbScatr<TData>::create(this->m_expansionList);
m_robBndCond = RobBndCond<TData>::create(this->m_expansionList);
m_p_A = std::make_unique<TData[]>(m_assmbMap->GetNumLocalCoeffs());
m_q_A = std::make_unique<TData[]>(m_assmbMap->GetNumLocalCoeffs());
m_p_A = std::make_unique<TData[]>(m_nloc);
m_q_A = std::make_unique<TData[]>(m_nloc);
}
void apply(Field<TData, FieldState::Coeff> &in,
Field<TData, FieldState::Coeff> &out) override
{
// get number of local coeffs (=size of in/out fields)
size_t nloc = m_assmbMap->GetNumLocalCoeffs();
// store pointers to temporary fields
Array<OneD, TData> inArr(nloc, 0.0);
Array<OneD, TData> outArr(nloc, 0.0);
// Store pointers to temporary fields
Array<OneD, TData> inArr(m_nloc, 0.0);
Array<OneD, TData> outArr(m_nloc, 0.0);
auto *p_in = inArr.get();
auto *p_out = outArr.get();
auto *p_w_A = m_w_A.GetStorage().GetCPUPtr();
......@@ -65,13 +63,14 @@ public:
auto *p_p_A = m_p_A.get();
auto *p_q_A = m_q_A.get();
// set the fields to zero
std::fill(p_w_A, p_w_A + nloc, 0.0);
std::fill(p_s_A, p_s_A + nloc, 0.0);
std::fill(p_wk, p_wk + nloc, 0.0);
std::fill(p_p_A, p_p_A + nloc, 0.0);
std::fill(p_q_A, p_q_A + nloc, 0.0);
// Set the fields to zero
std::fill(p_w_A, p_w_A + m_nloc, 0.0);
std::fill(p_s_A, p_s_A + m_nloc, 0.0);
std::fill(p_wk, p_wk + m_nloc, 0.0);
std::fill(p_p_A, p_p_A + m_nloc, 0.0);
std::fill(p_q_A, p_q_A + m_nloc, 0.0);
// Convergence parameters
size_t totalIterations = 0;
TData rhsMagnitude;
TData alpha;
......@@ -99,21 +98,21 @@ public:
inptr += nSize;
}
// copy RHS into initial residual
std::copy(p_in, p_in + nloc, p_r_A);
// Copy RHS into initial residual
std::copy(p_in, p_in + m_nloc, p_r_A);
// Assembly (communication)
m_assmbScatr->apply(m_r_A, m_wk, true);
vExchange[2] = std::inner_product(p_wk, p_wk + nloc, p_r_A, 0.0);
vExchange[2] = std::inner_product(p_wk, p_wk + m_nloc, p_r_A, 0.0);
// Perform inner-product exchanges
// m_rowComm->AllReduce(vExchange, Nektar::LibUtilities::ReduceSum);
eps = vExchange[2];
// calculate rhs magnitude
// Calculate rhs magnitude
m_assmbScatr->apply(m_r_A, m_wk);
rhsMagnitude = std::inner_product(p_in, p_in + nloc, p_wk, 0.0);
rhsMagnitude = std::inner_product(p_in, p_in + m_nloc, p_wk, 0.0);
// m_rowComm->AllReduce(rhsMagnitude, Nektar::LibUtilities::ReduceSum);
rhsMagnitude = (rhsMagnitude > 1.0e-6) ? rhsMagnitude : 1.0;
......@@ -124,16 +123,16 @@ public:
}
// Apply preconditioner
m_precon->apply(m_r_A, m_w_A);
this->m_precon->apply(m_r_A, m_w_A);
// Perform the method-specific matrix-vector multiply operation.
m_LHS->apply(m_w_A, m_s_A);
this->m_LHS->apply(m_w_A, m_s_A);
// Apply Robin BCs
m_robBndCond->apply(m_w_A, m_s_A);
vExchange[0] = std::inner_product(p_r_A, p_r_A + nloc, p_w_A, 0.0);
vExchange[1] = std::inner_product(p_s_A, p_s_A + nloc, p_w_A, 0.0);
vExchange[0] = std::inner_product(p_r_A, p_r_A + m_nloc, p_w_A, 0.0);
vExchange[1] = std::inner_product(p_s_A, p_s_A + m_nloc, p_w_A, 0.0);
// m_rowComm->AllReduce(vExchange, Nektar::LibUtilities::ReduceSum);
......@@ -142,7 +141,6 @@ public:
beta = 0.0;
alpha = rho / mu;
totalIterations = 1;
while (true)
{
if (totalIterations > m_maxIter)
......@@ -151,46 +149,46 @@ public:
return;
}
// Compute new search direction p_k, q_k
std::transform(p_p_A, p_p_A + nloc, p_w_A, p_p_A,
[&beta](const TData &pElem, const TData &wElem) {
return beta * pElem + wElem;
});
std::transform(p_q_A, p_q_A + nloc, p_s_A, p_q_A,
[&beta](const TData &qElem, const TData &sElem) {
return beta * qElem + sElem;
});
// Compute new search direction p_k
std::transform(p_p_A, p_p_A + m_nloc, p_w_A, p_p_A,
[&beta](const TData &pElem, const TData &wElem)
{ return beta * pElem + wElem; });
// Compute new search direction q_k
std::transform(p_q_A, p_q_A + m_nloc, p_s_A, p_q_A,
[&beta](const TData &qElem, const TData &sElem)
{ return beta * qElem + sElem; });
// Update solution x_{k+1}
std::transform(p_p_A, p_p_A + nloc, p_out, p_out,
[&alpha](const TData &pElem, const TData &xElem) {
return alpha * pElem + xElem;
});
std::transform(p_p_A, p_p_A + m_nloc, p_out, p_out,
[&alpha](const TData &pElem, const TData &xElem)
{ return alpha * pElem + xElem; });
// Update residual vector r_{k+1}
std::transform(p_q_A, p_q_A + nloc, p_r_A, p_r_A,
[&alpha](const TData &qElem, const TData &rElem) {
return -alpha * qElem + rElem;
});
std::transform(p_q_A, p_q_A + m_nloc, p_r_A, p_r_A,
[&alpha](const TData &qElem, const TData &rElem)
{ return -alpha * qElem + rElem; });
// Apply preconditioner
m_precon->apply(m_r_A, m_w_A);
this->m_precon->apply(m_r_A, m_w_A);
// Perform the method-specific matrix-vector multiply operation.
m_LHS->apply(m_w_A, m_s_A);
this->m_LHS->apply(m_w_A, m_s_A);
// Apply Robin BCs
m_robBndCond->apply(m_w_A, m_s_A);
// <r_{k+1}, w_{k+1}>
vExchange[0] = std::inner_product(p_r_A, p_r_A + nloc, p_w_A, 0.0);
vExchange[0] =
std::inner_product(p_r_A, p_r_A + m_nloc, p_w_A, 0.0);
// <s_{k+1}, w_{k+1}>
vExchange[1] = std::inner_product(p_s_A, p_s_A + nloc, p_w_A, 0.0);
vExchange[1] =
std::inner_product(p_s_A, p_s_A + m_nloc, p_w_A, 0.0);
// <r_{k+1}, r_{k+1}>
m_assmbScatr->apply(m_r_A, m_wk, true);
vExchange[2] = std::inner_product(p_wk, p_wk + nloc, p_r_A, 0.0);
vExchange[2] = std::inner_product(p_wk, p_wk + m_nloc, p_r_A, 0.0);
// Perform inner-product exchanges
// m_rowComm->AllReduce(vExchange, Nektar::LibUtilities::ReduceSum);
......@@ -234,19 +232,6 @@ public:
}
}
void setLHS(const std::shared_ptr<OperatorLinear<TData, FieldState::Coeff,
FieldState::Coeff>> &ptr)
{
m_LHS = ptr;
}
void setPrecon(
const std::shared_ptr<
OperatorLinear<TData, FieldState::Coeff, FieldState::Coeff>> &ptr)
{
m_precon = ptr;
}
// instantiation function for CreatorFunction in Operator Factory
static std::unique_ptr<Operator<TData>> instantiate(
const MultiRegions::ExpListSharedPtr &expansionList)
......@@ -261,22 +246,14 @@ public:
protected:
std::shared_ptr<OperatorAssmbScatr<TData>> m_assmbScatr;
std::shared_ptr<OperatorRobBndCond<TData>> m_robBndCond;
std::shared_ptr<OperatorLinear<TData, FieldState::Coeff, FieldState::Coeff>>
m_LHS;
std::shared_ptr<OperatorLinear<TData, FieldState::Coeff, FieldState::Coeff>>
m_precon;
AssemblyMapCGSharedPtr m_assmbMap;
Field<TData, FieldState::Coeff> m_w_A;
Field<TData, FieldState::Coeff> m_s_A;
Field<TData, FieldState::Coeff> m_r_A;
Field<TData, FieldState::Coeff> m_wk;
std::unique_ptr<TData[]> m_w_A_glo;
std::unique_ptr<TData[]> m_r_A_glo;
std::unique_ptr<TData[]> m_q_A;
std::unique_ptr<TData[]> m_p_A;
Array<OneD, TData> m_local;
Array<OneD, TData> m_global;
TData m_tol;
size_t m_nloc;
size_t m_maxIter;
};
......
......@@ -2,7 +2,7 @@
#include "Field.hpp"
#include "Operators/AssmbScatr/AssmbScatrCUDA.hpp"
#include "Operators/DiagPrecon/DiagPreconCUDAKernels.cuh"
#include "Operators/CUDAMathKernels.cuh"
#include "Operators/OperatorDiagPrecon.hpp"
#include "Operators/OperatorRobBndCond.hpp"
......@@ -41,9 +41,8 @@ public:
std::static_pointer_cast<OperatorAssmbScatrImpl<TData, ImplCUDA>>(
AssmbScatr<TData>::create(this->m_expansionList, "CUDA"));
// Determine CUDA grid parameters.
m_gridSize = m_nGlobal / m_blockSize;
m_gridSize += (m_nGlobal % m_blockSize == 0) ? 0 : 1;
// Determine CUDA grid size.
m_gridSize = GetCUDAGridSize(m_nGlobal - m_nDir, m_blockSize);
}
~OperatorDiagPreconImpl(void)
......@@ -57,8 +56,8 @@ public:
{
m_assmbScatr->Assemble(in, m_wk);
DiagPreconKernel<<<m_gridSize, m_blockSize>>>(m_nGlobal, m_nDir, m_diag,
m_wk);
vdivKernel<<<m_gridSize, m_blockSize>>>(
m_nGlobal - m_nDir, m_wk + m_nDir, m_diag + m_nDir, m_wk + m_nDir);
cudaMemset(m_wk, 0, sizeof(TData) * m_nDir);
......@@ -69,38 +68,43 @@ public:
const std::shared_ptr<
OperatorLinear<TData, FieldState::Coeff, FieldState::Coeff>> &op)
{
auto robBCOp = RobBndCond<TData>::create(this->m_expansionList);
// auto robBCOp = RobBndCond<TData>::create(this->m_expansionList,
// "CUDA");
Array<OneD, TData> diag(m_nLocal);
// create unit vector field to extract diagonal
Field<TData, FieldState::Coeff> unit_vec =
Field<TData, FieldState::Coeff>::create(
Field<TData, FieldState::Coeff>::template create<MemoryRegionCUDA>(
GetBlockAttributes(FieldState::Coeff, this->m_expansionList));
// create action field to receive column action from unit vector
Field<TData, FieldState::Coeff> action =
Field<TData, FieldState::Coeff>::create(
Field<TData, FieldState::Coeff>::template create<MemoryRegionCUDA>(
GetBlockAttributes(FieldState::Coeff, this->m_expansionList));
auto *uvec_ptr = unit_vec.GetStorage().GetCPUPtr();
auto *actn_ptr = action.GetStorage().GetCPUPtr();
auto *diag_ptr = diag.get();
auto *uvec_ptr =
unit_vec.template GetStorage<MemoryRegionCUDA>().GetGPUPtr();
auto *actn_ptr =
action.template GetStorage<MemoryRegionCUDA>().GetGPUPtr();
TData init = 1.0;
for (size_t i = 0; i < m_nLocal; ++i)
{
// set ith term in unit vector to be 1
*uvec_ptr = 1.0;
cudaMemcpy(uvec_ptr + i, &init, sizeof(TData),
cudaMemcpyHostToDevice);
// apply operator to unit vector and store in action field
op->apply(unit_vec, action);
robBCOp->apply(unit_vec, action);
// robBCOp->apply(unit_vec, action);
// copy ith row term from the action field to get ith diagonal
*(diag_ptr++) = *(actn_ptr++);
cudaMemcpy(diag.get() + i, actn_ptr + i, sizeof(TData),
cudaMemcpyDeviceToHost);
// reset ith term in unit vector to be 0
*(uvec_ptr++) = 0.0;
cudaMemset(uvec_ptr + i, 0, sizeof(TData));
}
// Assembly
......@@ -135,7 +139,7 @@ protected:
size_t m_nGlobal;
size_t m_nLocal;
size_t m_nDir;
size_t m_gridSize;
size_t m_gridSize = 1024;
size_t m_blockSize = 32;
};
......
namespace Nektar::Operators::detail
{
template <typename TData>
__global__ void DiagPreconKernel(const size_t nGlobal, const size_t nDir,
const TData *diagptr, TData *inoutptr)
{
size_t i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < nDir || i >= nGlobal)
{
return;
}
inoutptr[i] /= diagptr[i];
}
} // namespace Nektar::Operators::detail
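The bespoke DiagPreconKernel above is superseded by the generic vdivKernel from Operators/CUDAMathKernels.cuh, launched over the m_nGlobal - m_nDir non-Dirichlet entries with the pointers offset by m_nDir (the Dirichlet block is then zeroed with cudaMemset). vdivKernel is not shown in this diff; a minimal sketch matching the call vdivKernel<<<grid, block>>>(n, num, denom, out) would be:

// Hypothetical sketch of the element-wise division kernel assumed in
// CUDAMathKernels.cuh; the committed implementation may differ.
template <typename TData>
__global__ void vdivKernelSketch(const size_t nsize, const TData *numptr,
                                 const TData *denomptr, TData *outptr)
{
    size_t i = blockDim.x * blockIdx.x + threadIdx.x;
    while (i < nsize)
    {
        outptr[i] = numptr[i] / denomptr[i];
        i += blockDim.x * gridDim.x;
    }
}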
......@@ -3,6 +3,7 @@
#include "MemoryRegionCUDA.hpp"
#include "Operators/DirBndCond/DirBndCondCUDAKernels.cuh"
#include "Operators/OperatorDirBndCond.hpp"
#include "Operators/OperatorHelper.cuh"
#include <MultiRegions/AssemblyMap/AssemblyMapCG.h>
#include <MultiRegions/ContField.h>
......@@ -133,9 +134,8 @@ public:
// Copy memory to GPU, if necessary and get raw pointers.
auto *outptr = out.template GetStorage<MemoryRegionCUDA>().GetGPUPtr();
// Deterime CUDA grid parameters.
m_gridSize = m_bndExpSize / m_blockSize;
m_gridSize += (m_bndExpSize % m_blockSize == 0) ? 0 : 1;
// Determine CUDA grid size.
m_gridSize = GetCUDAGridSize(m_bndExpSize, m_blockSize);
if (m_signChange)
{
......@@ -168,9 +168,9 @@ public:
// outarr[it] *= -1;
// }
// Deterime CUDA grid parameters.
m_gridSize = m_localDirSize / m_blockSize;
m_gridSize += (m_localDirSize % m_blockSize == 0) ? 0 : 1;
// Determine CUDA grid size.
m_gridSize = GetCUDAGridSize(m_localDirSize, m_blockSize);
LocalDirBndCondKernel<TData><<<m_gridSize, m_blockSize>>>(
m_localDirSize, m_locid0, m_locid1, m_locsign, outptr);
}
......@@ -199,7 +199,7 @@ protected:
int *m_locid0;
int *m_locid1;
TData *m_locsign;
size_t m_gridSize;
size_t m_gridSize = 1024;
size_t m_blockSize = 32;
};
......
......@@ -14,19 +14,18 @@ __global__ void DirBndCondKernel(const size_t nsize, const int *offsetptr,
{
size_t i = blockDim.x * blockIdx.x + threadIdx.x;
if (i >= nsize)
while (i < nsize)
{
return;
}
if (bctypeptr[i] == eDirichlet)
{
size_t offset = offsetptr[i];
size_t ncoeff = ncoeffptr[i];
for (size_t j = 0; j < ncoeff; j++)
if (bctypeptr[i] == eDirichlet)
{
outptr[mapptr[offset + j]] = inptr[offset + j];
size_t offset = offsetptr[i];
size_t ncoeff = ncoeffptr[i];
for (size_t j = 0; j < ncoeff; j++)
{
outptr[mapptr[offset + j]] = inptr[offset + j];
}
}
i += blockDim.x * gridDim.x;
}
}
......@@ -39,20 +38,19 @@ __global__ void DirBndCondKernel(const size_t nsize, const int *offsetptr,
{
size_t i = blockDim.x * blockIdx.x + threadIdx.x;
if (i >= nsize)
while (i < nsize)
{
return;
}
if (bctypeptr[i] == eDirichlet)
{
size_t offset = offsetptr[i];
size_t ncoeff = ncoeffptr[i];
for (size_t j = 0; j < ncoeff; j++)
if (bctypeptr[i] == eDirichlet)
{
outptr[mapptr[offset + j]] =
signptr[offset + j] * inptr[offset + j];
size_t offset = offsetptr[i];
size_t ncoeff = ncoeffptr[i];
for (size_t j = 0; j < ncoeff; j++)
{
outptr[mapptr[offset + j]] =
signptr[offset + j] * inptr[offset + j];
}
}
i += blockDim.x * gridDim.x;
}
}
......@@ -63,12 +61,11 @@ __global__ void LocalDirBndCondKernel(const size_t nsize, const int *id0ptr,
{
size_t i = blockDim.x * blockIdx.x + threadIdx.x;
if (i >= nsize)
while (i < nsize)
{
return;
outptr[id0ptr[i]] = outptr[id1ptr[i]] * signptr[i];
i += blockDim.x * gridDim.x;
}
outptr[id0ptr[i]] = outptr[id1ptr[i]] * signptr[i];
}
} // namespace Nektar::Operators::detail
#include "FwdTransCUDA.hpp"
namespace Nektar::Operators::detail
{
// Register implementation with Operator Factory
template <>
std::string OperatorFwdTransImpl<double, ImplCUDA>::className =
GetOperatorFactory<double>().RegisterCreatorFunction(
"FwdTransCUDA", OperatorFwdTransImpl<double, ImplCUDA>::instantiate,
"...");
} // namespace Nektar::Operators::detail
#pragma once
#include "MemoryRegionCUDA.hpp"
#include "Operators/CUDAMathKernels.cuh"
#include "Operators/OperatorConjGrad.hpp"
#include "Operators/OperatorDirBndCond.hpp"
#include "Operators/OperatorFwdTrans.hpp"
#include "Operators/OperatorHelper.cuh"
#include "Operators/OperatorIProductWRTBase.hpp"
#include "Operators/OperatorMass.hpp"
#include "Operators/OperatorPrecon.hpp"
#include "Operators/OperatorRobBndCond.hpp"
using vec_t = tinysimd::simd<double>;
namespace Nektar::Operators::detail
{
template <typename TData>
class OperatorFwdTransImpl<TData, ImplCUDA> : public OperatorFwdTrans<TData>
{
public:
OperatorFwdTransImpl(const MultiRegions::ExpListSharedPtr &expansionList)
: OperatorFwdTrans<TData>(expansionList),
m_rhs(Field<TData, FieldState::Coeff>::template create<
MemoryRegionCUDA>(
GetBlockAttributes(FieldState::Coeff, expansionList))),
m_tmp(Field<TData, FieldState::Coeff>::template create<
MemoryRegionCUDA>(
GetBlockAttributes(FieldState::Coeff, expansionList)))
{
m_MassOp = Mass<TData>::create(this->m_expansionList, "CUDA");
m_DirBCOp = DirBndCond<TData>::create(this->m_expansionList, "CUDA");
// m_RobBCOp = RobBndCond<TData>::create(this->m_expansionList, "CUDA");
m_IProdOp =
IProductWRTBase<TData>::create(this->m_expansionList, "CUDA");
m_CGOp = ConjGrad<TData>::create(this->m_expansionList, "CUDA");
m_CGOp->setLHS(m_MassOp);
}
void apply(Field<TData, FieldState::Phys> &in,
Field<TData, FieldState::Coeff> &out) override
{
size_t nloc = out.template GetStorage<MemoryRegionCUDA>().size();
auto *outptr = out.template GetStorage<MemoryRegionCUDA>().GetGPUPtr();
auto *rhsptr =
m_rhs.template GetStorage<MemoryRegionCUDA>().GetGPUPtr();
auto *tmpptr =
m_tmp.template GetStorage<MemoryRegionCUDA>().GetGPUPtr();
// Determine CUDA grid size.
m_gridSize = GetCUDAGridSize(nloc, m_blockSize);
// IProductWRT of RHS
m_IProdOp->apply(in, m_rhs);
// Handle Dirichlet BCs
m_DirBCOp->apply(out);
m_MassOp->apply(out, m_tmp);
subKernel<<<m_gridSize, m_blockSize>>>(nloc, rhsptr, tmpptr, rhsptr);
// Handle Robin BCs
// m_RobBCOp->apply(out, m_rhs, true);
// Solve for u_hat using Conjugate Gradient
m_CGOp->apply(m_rhs, m_tmp);
// Add Dirichlet BCs
addKernel<<<m_gridSize, m_blockSize>>>(nloc, outptr, tmpptr, outptr);
}
void setPrecon(
const std::shared_ptr<OperatorPrecon<TData>> &precon) override
{
precon->configure(m_MassOp);
m_CGOp->setPrecon(precon);
}
// instantiation function for CreatorFunction in OperatorFactory
static std::unique_ptr<Operator<TData>> instantiate(
const MultiRegions::ExpListSharedPtr &expansionList)
{
return std::make_unique<OperatorFwdTransImpl<TData, ImplCUDA>>(
expansionList);
}
// className - for OperatorFactory
static std::string className;
protected:
std::shared_ptr<OperatorIProductWRTBase<TData>> m_IProdOp;
std::shared_ptr<OperatorDirBndCond<TData>> m_DirBCOp;
std::shared_ptr<OperatorRobBndCond<TData>> m_RobBCOp;
std::shared_ptr<OperatorMass<TData>> m_MassOp;
std::shared_ptr<OperatorConjGrad<TData>> m_CGOp;
Field<TData, FieldState::Coeff> m_rhs;
Field<TData, FieldState::Coeff> m_tmp;
size_t m_gridSize = 1024;
size_t m_blockSize = 32;
};
} // namespace Nektar::Operators::detail
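The CUDA forward transform mirrors the CPU path below: take the inner product of the RHS, impose Dirichlet values on the output, subtract their mass-matrix contribution from the RHS, solve with conjugate gradient, and add the Dirichlet lift back. A hedged host-side usage sketch, assuming a FwdTrans<double>::create helper analogous to the Mass/ConjGrad factories used inside this file and given an ExpListSharedPtr expansionList:

// Illustrative driver code only; field construction follows the pattern used
// in the constructors above, and the Phys-state block attributes are assumed.
auto in  = Field<double, FieldState::Phys>::create<MemoryRegionCUDA>(
    GetBlockAttributes(FieldState::Phys, expansionList));
auto out = Field<double, FieldState::Coeff>::create<MemoryRegionCUDA>(
    GetBlockAttributes(FieldState::Coeff, expansionList));

auto fwdTrans = FwdTrans<double>::create(expansionList, "CUDA");
fwdTrans->apply(in, out); // projects 'in' and solves the mass system on the GPU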
......@@ -7,7 +7,6 @@
#include "Operators/OperatorMass.hpp"
#include "Operators/OperatorPrecon.hpp"
#include "Operators/OperatorRobBndCond.hpp"
#include <StdRegions/StdExpansion.h>
using vec_t = tinysimd::simd<double>;
......@@ -24,7 +23,7 @@ public:
GetBlockAttributes(FieldState::Coeff, expansionList,
vec_t::width),
1, vec_t::alignment)),
m_dir(Field<TData, FieldState::Coeff>::create(
m_tmp(Field<TData, FieldState::Coeff>::create(
GetBlockAttributes(FieldState::Coeff, expansionList,
vec_t::width),
1, vec_t::alignment))
......@@ -43,36 +42,36 @@ public:
size_t nloc = out.GetStorage().size();
auto *outptr = out.GetStorage().GetCPUPtr();
auto *rhsptr = m_rhs.GetStorage().GetCPUPtr();
auto *dirptr = m_dir.GetStorage().GetCPUPtr();
auto *tmpptr = m_tmp.GetStorage().GetCPUPtr();
// IProductWRT of RHS
m_IProdOp->apply(in, m_rhs);
// Handle Dirichlet BCs
m_DirBCOp->apply(m_dir);
m_MassOp->apply(m_dir, out); // use out as temporary storage
std::transform(
rhsptr, rhsptr + nloc, outptr, rhsptr,
[](const TData &rhs, const TData &dir) { return rhs - dir; });
m_DirBCOp->apply(out);
m_MassOp->apply(out, m_tmp);
std::transform(rhsptr, rhsptr + nloc, tmpptr, rhsptr,
[](const TData &rhs, const TData &dir)
{ return rhs - dir; });
// Handle Robin BCs
m_RobBCOp->apply(m_dir, m_rhs, true);
m_RobBCOp->apply(out, m_rhs, true);
// Solve for u_hat using Conjugate Gradient
m_CGOp->apply(m_rhs, out);
m_CGOp->apply(m_rhs, m_tmp);
// Add Dirichlet BCs
std::transform(
outptr, outptr + nloc, dirptr, outptr,
[](const TData &x, const TData &dir) { return x + dir; });
std::transform(outptr, outptr + nloc, tmpptr, outptr,
[](const TData &x, const TData &diff)
{ return x + diff; });
}
void setPrecon(
const std::shared_ptr<OperatorPrecon<TData>> &precon) override
{
m_CGOp->setPrecon(precon);
precon->configure(m_MassOp);
m_CGOp->setPrecon(precon);
}
// instantiation function for CreatorFunction in OperatorFactory
......@@ -93,7 +92,7 @@ protected:
std::shared_ptr<OperatorMass<TData>> m_MassOp;
std::shared_ptr<OperatorConjGrad<TData>> m_CGOp;
Field<TData, FieldState::Coeff> m_rhs;
Field<TData, FieldState::Coeff> m_dir;
Field<TData, FieldState::Coeff> m_tmp;
};
} // namespace Nektar::Operators::detail
#include "HelmSolveCUDA.hpp"
namespace Nektar::Operators::detail
{
// Register implementation with Operator Factory
template <>
std::string OperatorHelmSolveImpl<double, ImplCUDA>::className =
GetOperatorFactory<double>().RegisterCreatorFunction(
"HelmSolveCUDA",
OperatorHelmSolveImpl<double, ImplCUDA>::instantiate, "...");
} // namespace Nektar::Operators::detail
#pragma once
#include "MemoryRegionCUDA.hpp"
#include "Operators/CUDAMathKernels.cuh"
#include "Operators/OperatorConjGrad.hpp"
#include "Operators/OperatorDirBndCond.hpp"
#include "Operators/OperatorHelmSolve.hpp"
#include "Operators/OperatorHelmholtz.hpp"
#include "Operators/OperatorHelper.cuh"
#include "Operators/OperatorIProductWRTBase.hpp"
#include "Operators/OperatorLinear.hpp"
#include "Operators/OperatorNeuBndCond.hpp"
#include "Operators/OperatorPrecon.hpp"
#include "Operators/OperatorRobBndCond.hpp"
using vec_t = tinysimd::simd<double>;
namespace Nektar::Operators::detail
{
template <typename TData>
class OperatorHelmSolveImpl<TData, ImplCUDA> : public OperatorHelmSolve<TData>
{
public:
OperatorHelmSolveImpl(const MultiRegions::ExpListSharedPtr &expansionList)
: OperatorHelmSolve<TData>(expansionList),
m_rhs(Field<TData, FieldState::Coeff>::template create<
MemoryRegionCUDA>(
GetBlockAttributes(FieldState::Coeff, expansionList))),
m_tmp(Field<TData, FieldState::Coeff>::template create<
MemoryRegionCUDA>(
GetBlockAttributes(FieldState::Coeff, expansionList)))
{
m_IProdOp =
IProductWRTBase<TData>::create(this->m_expansionList, "CUDA");
m_DirBCOp = DirBndCond<TData>::create(this->m_expansionList, "CUDA");
m_NeuBCOp = NeuBndCond<TData>::create(this->m_expansionList, "CUDA");
// m_RobBCOp = RobBndCond<TData>::create(this->m_expansionList, "CUDA");
m_HelmOp = Helmholtz<TData>::create(this->m_expansionList, "CUDA");
m_CGOp = ConjGrad<TData>::create(this->m_expansionList, "CUDA");
m_CGOp->setLHS(m_HelmOp);
}
void apply(Field<TData, FieldState::Phys> &in,
Field<TData, FieldState::Coeff> &out) override
{
size_t nloc = out.template GetStorage<MemoryRegionCUDA>().size();
auto *outptr = out.template GetStorage<MemoryRegionCUDA>().GetGPUPtr();
auto *rhsptr =
m_rhs.template GetStorage<MemoryRegionCUDA>().GetGPUPtr();
auto *tmpptr =
m_tmp.template GetStorage<MemoryRegionCUDA>().GetGPUPtr();
// Determine CUDA grid size.
m_gridSize = GetCUDAGridSize(nloc, m_blockSize);
// IProductWRT of RHS
m_IProdOp->apply(in, m_rhs);
negKernel<<<m_gridSize, m_blockSize>>>(nloc, rhsptr, rhsptr);
// Handle Neumann BCs on RHS
m_NeuBCOp->apply(m_rhs);
// Handle Dirichlet BCs
m_DirBCOp->apply(out);
m_HelmOp->apply(out, m_tmp);
subKernel<<<m_gridSize, m_blockSize>>>(nloc, rhsptr, tmpptr, rhsptr);
// Handle Robin BCs
// m_RobBCOp->apply(out, m_rhs, true);
// Solve for u_hat using Conjugate Gradient
m_CGOp->apply(m_rhs, m_tmp);
// Add Dirichlet BCs
addKernel<<<m_gridSize, m_blockSize>>>(nloc, outptr, tmpptr, outptr);
}
void setLambda(const TData &lambda)
{
m_HelmOp->setLambda(lambda);
}
void setPrecon(
const std::shared_ptr<OperatorPrecon<TData>> &precon) override
{
precon->configure(m_HelmOp);
m_CGOp->setPrecon(precon);
}
// instantiation function for CreatorFunction in OperatorFactory
static std::unique_ptr<Operator<TData>> instantiate(
const MultiRegions::ExpListSharedPtr &expansionList)
{
return std::make_unique<OperatorHelmSolveImpl<TData, ImplCUDA>>(
expansionList);
}
// className - for OperatorFactory
static std::string className;
protected:
std::shared_ptr<OperatorIProductWRTBase<TData>> m_IProdOp;
std::shared_ptr<OperatorDirBndCond<TData>> m_DirBCOp;
std::shared_ptr<OperatorNeuBndCond<TData>> m_NeuBCOp;
std::shared_ptr<OperatorRobBndCond<TData>> m_RobBCOp;
std::shared_ptr<OperatorHelmholtz<TData>> m_HelmOp;
std::shared_ptr<OperatorConjGrad<TData>> m_CGOp;
Field<TData, FieldState::Coeff> m_rhs;
Field<TData, FieldState::Coeff> m_tmp;
size_t m_gridSize = 1024;
size_t m_blockSize = 32;
};
} // namespace Nektar::Operators::detail
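HelmSolveCUDA builds its right-hand side with negKernel, subKernel and addKernel from Operators/CUDAMathKernels.cuh. Their implementations are not part of this diff; sketches consistent with the launches above (element-wise operations over nloc coefficients, addKernel being the obvious analogue of subKernel) could be:

// Hypothetical sketches of the element-wise helpers assumed in
// CUDAMathKernels.cuh; signatures are inferred from the launches above.
template <typename TData>
__global__ void subKernelSketch(const size_t nsize, const TData *xptr,
                                const TData *yptr, TData *outptr)
{
    size_t i = blockDim.x * blockIdx.x + threadIdx.x;
    while (i < nsize)
    {
        outptr[i] = xptr[i] - yptr[i]; // e.g. rhs = rhs - tmp
        i += blockDim.x * gridDim.x;
    }
}

template <typename TData>
__global__ void negKernelSketch(const size_t nsize, const TData *inptr,
                                TData *outptr)
{
    size_t i = blockDim.x * blockIdx.x + threadIdx.x;
    while (i < nsize)
    {
        outptr[i] = -inptr[i];
        i += blockDim.x * gridDim.x;
    }
}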
#pragma once
#include "Operators/OperatorHelmSolve.hpp"
#include "Operators/OperatorConjGrad.hpp"
#include "Operators/OperatorDirBndCond.hpp"
#include "Operators/OperatorHelmSolve.hpp"
#include "Operators/OperatorHelmholtz.hpp"
#include "Operators/OperatorIProductWRTBase.hpp"
#include "Operators/OperatorLinear.hpp"
......@@ -11,10 +10,6 @@
#include "Operators/OperatorPrecon.hpp"
#include "Operators/OperatorRobBndCond.hpp"
using namespace Nektar;
using namespace Nektar::Operators;
using namespace Nektar::MultiRegions;
using vec_t = tinysimd::simd<double>;
namespace Nektar::Operators::detail
......@@ -30,7 +25,7 @@ public:
GetBlockAttributes(FieldState::Coeff, expansionList,
vec_t::width),
1, vec_t::alignment)),
m_dir(Field<TData, FieldState::Coeff>::create(
m_tmp(Field<TData, FieldState::Coeff>::create(
GetBlockAttributes(FieldState::Coeff, expansionList,
vec_t::width),
1, vec_t::alignment))
......@@ -50,7 +45,7 @@ public:
size_t nloc = out.GetStorage().size();
auto *outptr = out.GetStorage().GetCPUPtr();
auto *rhsptr = m_rhs.GetStorage().GetCPUPtr();
auto *dirptr = m_dir.GetStorage().GetCPUPtr();
auto *tmpptr = m_tmp.GetStorage().GetCPUPtr();
// IProductWRT of RHS
m_IProdOp->apply(in, m_rhs);
......@@ -60,22 +55,22 @@ public:
m_NeuBCOp->apply(m_rhs);
// Handle Dirichlet BCs
m_DirBCOp->apply(m_dir);
m_HelmOp->apply(m_dir, out); // use out as temporary storage
std::transform(
rhsptr, rhsptr + nloc, outptr, rhsptr,
[](const TData &rhs, const TData &dir) { return rhs - dir; });
m_DirBCOp->apply(out);
m_HelmOp->apply(out, m_tmp);
std::transform(rhsptr, rhsptr + nloc, tmpptr, rhsptr,
[](const TData &rhs, const TData &dir)
{ return rhs - dir; });
// Handle Robin BCs
m_RobBCOp->apply(m_dir, m_rhs, true);
m_RobBCOp->apply(out, m_rhs, true);
// Solve for u_hat using Conjugate Gradient
m_CGOp->apply(m_rhs, out);
m_CGOp->apply(m_rhs, m_tmp);
// Add Dirichlet BCs
std::transform(
outptr, outptr + nloc, dirptr, outptr,
[](const TData &x, const TData &dir) { return x + dir; });
std::transform(outptr, outptr + nloc, tmpptr, outptr,
[](const TData &x, const TData &diff)
{ return x + diff; });
}
void setLambda(const TData &lambda)
......@@ -110,7 +105,7 @@ protected:
std::shared_ptr<OperatorHelmholtz<TData>> m_HelmOp;
std::shared_ptr<OperatorConjGrad<TData>> m_CGOp;
Field<TData, FieldState::Coeff> m_rhs;
Field<TData, FieldState::Coeff> m_dir;
Field<TData, FieldState::Coeff> m_tmp;
};
} // namespace Nektar::Operators::detail
......@@ -88,9 +88,8 @@ public:
auto nCoord = expPtr->GetCoordim();
auto nqTot = expPtr->GetTotPoints();
// Determine CUDA grid parameters.
m_gridSize = (nElmts * nqTot) / m_blockSize;
m_gridSize += ((nElmts * nqTot) % m_blockSize == 0) ? 0 : 1;
// Determine CUDA grid size.
m_gridSize = GetCUDAGridSize(nqTot * nElmts, m_blockSize);
// Multiply by diffusion coefficient.
if (nCoord == 1)
......@@ -138,8 +137,8 @@ private:
Field<TData, FieldState::Phys> m_bwd;
Field<TData, FieldState::Phys> m_deriv;
TData *m_diffCoeff;
size_t m_gridSize = 1024;
size_t m_blockSize = 32;
size_t m_gridSize;
};
} // namespace Nektar::Operators::detail
......@@ -7,12 +7,11 @@ __global__ void DiffusionCoeff1DKernel(const size_t nsize,
{
size_t i = blockDim.x * blockIdx.x + threadIdx.x;
if (i >= nsize)
while (i < nsize)
{
return;
deriv0[i] *= diffCoeff[0];
i += blockDim.x * gridDim.x;
}
deriv0[i] *= diffCoeff[0];
}
template <typename TData>
......@@ -32,15 +31,14 @@ __global__ void DiffusionCoeff2DKernel(const size_t nsize,
size_t i = blockDim.x * blockIdx.x + threadIdx.x;
if (i >= nsize)
while (i < nsize)
{
return;
}
TData deriv[2] = {deriv0[i], deriv1[i]};
TData deriv[2] = {deriv0[i], deriv1[i]};
deriv0[i] = s_diffCoeff[0] * deriv[0] + s_diffCoeff[1] * deriv[1];
deriv1[i] = s_diffCoeff[2] * deriv[0] + s_diffCoeff[3] * deriv[1];
deriv0[i] = s_diffCoeff[0] * deriv[0] + s_diffCoeff[1] * deriv[1];
deriv1[i] = s_diffCoeff[2] * deriv[0] + s_diffCoeff[3] * deriv[1];
i += blockDim.x * gridDim.x;
}
}
template <typename TData>
......@@ -60,19 +58,18 @@ __global__ void DiffusionCoeff3DKernel(const size_t nsize, TData *diffCoeff,
size_t i = blockDim.x * blockIdx.x + threadIdx.x;
if (i >= nsize)
while (i < nsize)
{
return;
TData deriv[3] = {deriv0[i], deriv1[i], deriv2[i]};
deriv0[i] = s_diffCoeff[0] * deriv[0] + s_diffCoeff[1] * deriv[1] +
s_diffCoeff[2] * deriv[2];
deriv1[i] = s_diffCoeff[3] * deriv[0] + s_diffCoeff[4] * deriv[1] +
s_diffCoeff[5] * deriv[2];
deriv2[i] = s_diffCoeff[6] * deriv[0] + s_diffCoeff[7] * deriv[1] +
s_diffCoeff[8] * deriv[2];
i += blockDim.x * gridDim.x;
}
TData deriv[3] = {deriv0[i], deriv1[i], deriv2[i]};
deriv0[i] = s_diffCoeff[0] * deriv[0] + s_diffCoeff[1] * deriv[1] +
s_diffCoeff[2] * deriv[2];
deriv1[i] = s_diffCoeff[3] * deriv[0] + s_diffCoeff[4] * deriv[1] +
s_diffCoeff[5] * deriv[2];
deriv2[i] = s_diffCoeff[6] * deriv[0] + s_diffCoeff[7] * deriv[1] +
s_diffCoeff[8] * deriv[2];
}
} // namespace Nektar::Operators::detail
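The nCoord branch in HelmholtzCUDA.hpp above presumably dispatches to these kernels with one thread per quadrature value; the launches themselves sit in the elided part of that hunk. A hedged sketch of the assumed dispatch (the derivative pointer names are illustrative):

// Illustrative only; the committed launch code is not shown in this diff.
if (nCoord == 1)
{
    DiffusionCoeff1DKernel<<<m_gridSize, m_blockSize>>>(nqTot * nElmts,
                                                        m_diffCoeff, derivptr0);
}
else if (nCoord == 2)
{
    DiffusionCoeff2DKernel<<<m_gridSize, m_blockSize>>>(
        nqTot * nElmts, m_diffCoeff, derivptr0, derivptr1);
}
else
{
    DiffusionCoeff3DKernel<<<m_gridSize, m_blockSize>>>(
        nqTot * nElmts, m_diffCoeff, derivptr0, derivptr1, derivptr2);
}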
......@@ -100,9 +100,8 @@ public:
auto deformed = expPtr->GetMetricInfo()->GetGtype() ==
SpatialDomains::eDeformed;
// Deterime CUDA grid parameters.
m_gridSize = nElmts / m_blockSize;
m_gridSize += (nElmts % m_blockSize == 0) ? 0 : 1;
// Determine CUDA grid size.
m_gridSize = GetCUDAGridSize(nElmts, m_blockSize);
// Flag for collapsed coordinate correction.
bool correct = expPtr->GetBasis(0)->GetBasisType() ==
......