Compare revisions

Changes are shown as if the source revision was being merged into the target revision.

Target project: nektar/redesign-prototypes
Commits on Source (12)
Showing 2749 additions and 85 deletions
@@ -26,12 +26,15 @@ message(STATUS "Found Nektar++: version ${NEKTAR++_VERSION}")
 set(CMAKE_INSTALL_RPATH "${NEKTAR++_LIBRARY_DIRS}")

-set(SRC Field.cpp Operators/Operator.cpp Operators/OperatorBwdTrans.cpp Operators/BwdTrans/BwdTransImpl.cpp Operators/OperatorIProductWRTBase.cpp Operators/IProductWRTBase/IProductWRTBaseImpl.cpp)
+set(SRC Field.cpp Operators/Operator.cpp Operators/OperatorBwdTrans.cpp Operators/BwdTrans/BwdTransImpl.cpp Operators/OperatorIProductWRTBase.cpp Operators/IProductWRTBase/IProductWRTBaseImpl.cpp Operators/OperatorIProductWRTDerivBase.cpp Operators/IProductWRTDerivBase/IProductWRTDerivBaseImpl.cpp Operators/OperatorPhysDeriv.cpp Operators/PhysDeriv/PhysDerivImpl.cpp Operators/OperatorHelmholtz.cpp Operators/Helmholtz/HelmholtzImpl.cpp)
+
+if (NEKTAR_USE_CUDA AND NEKTAR_USE_SIMD)
+    MESSAGE(FATAL_ERROR "Cannot use both SIMD and CUDA")
+endif()

 if (NEKTAR_USE_CUDA)
     enable_language(CUDA)
     add_definitions(-DNEKTAR_USE_CUDA)
-    set(SRC ${SRC} MemoryRegionCUDA.cu Operators/BwdTrans/BwdTransCUDA.cu)
+    set(SRC ${SRC} MemoryRegionCUDA.cu Operators/BwdTrans/BwdTransCUDA.cu Operators/IProductWRTBase/IProductWRTBaseCUDA.cu Operators/IProductWRTDerivBase/IProductWRTDerivBaseCUDA.cu Operators/PhysDeriv/PhysDerivCUDA.cu Operators/Helmholtz/HelmholtzCUDA.cu)
 endif()

 if (NEKTAR_USE_SIMD)
...
@@ -99,38 +99,45 @@ public:
        return *this;
    }

    /**
     * @brief Compare this field to another field, with absolute
     * tolerance tol.
     * @return bool
     */
    bool compare(Field<TType, TState> &rhs, double tol)
    {
        const std::vector<BlockAttributes> &rhs_blocks = rhs.GetBlocks();
        TType *store = GetStorage().GetCPUPtr();
        TType *rhs_store = rhs.GetStorage().GetCPUPtr();

        if (rhs_blocks.size() != block_attributes.size())
            return false;

        size_t i{0};
        for (size_t bl = 0; bl < block_attributes.size(); ++bl)
        {
            size_t num_elements = block_attributes[bl].num_elements;
            size_t num_pts = block_attributes[bl].num_pts;

            // Check that each block has the same structure
            if (num_elements != rhs_blocks[bl].num_elements)
                return false;
            if (num_pts != rhs_blocks[bl].num_pts)
                return false;

            // Compare elements in the blocks
            for (size_t el = 0; el < num_elements; ++el)
            {
                for (size_t coeff = 0; coeff < num_pts; ++coeff)
                {
                    i = coeff + el * num_pts + bl * num_pts * num_elements;
                    if (std::abs(store[i] - rhs_store[i]) > tol)
                        return false;
                }
            }
        }
        return true;
    }

    /**
     * @brief Static templated creation method.
...
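The compare() method above is the absolute-tolerance equality check between two fields. A minimal usage sketch, assuming coefficient-space fields built with the create()/GetBlockAttributes() helpers used elsewhere in this merge request (the field names are illustrative only):

// Hypothetical example: check a computed field against a reference field.
auto expected = Field<double, FieldState::Coeff>::create(
    GetBlockAttributes(FieldState::Coeff, expansionList));
auto computed = Field<double, FieldState::Coeff>::create(
    GetBlockAttributes(FieldState::Coeff, expansionList));
// ... fill both fields ...
bool match = computed.compare(expected, 1e-12); // absolute tolerance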
@@ -61,13 +61,7 @@ public:
     ~OperatorBwdTransImpl()
     {
-        for (auto &basis : m_basis)
-        {
-            for (size_t i = 0; i < basis.second.size(); i++)
-            {
-                cudaFree(basis.second[i]);
-            }
-        }
+        DeallocateDataCUDA<TData>(m_basis);
     }

     void apply(Field<TData, FieldState::Coeff> &in,
...
#include "HelmholtzCUDA.hpp"
namespace Nektar::Operators::detail
{
template <>
std::string OperatorHelmholtzImpl<double, ImplCUDA>::className =
GetOperatorFactory<double>().RegisterCreatorFunction(
"HelmholtzCUDA", OperatorHelmholtzImpl<double, ImplCUDA>::instantiate,
"...");
}
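This translation unit registers the CUDA Helmholtz implementation with the operator factory under the key "HelmholtzCUDA". A sketch of how a registered operator might then be obtained and applied, assuming a Helmholtz<>::create() helper analogous to the BwdTrans<>::create(expList, "CUDA") calls used inside the implementation below; the handle type and the SetLambda() access are assumptions, not taken from this merge request:

// Hypothetical usage of the factory-registered implementation.
auto helmholtz = Helmholtz<>::create(expansionList, "CUDA");
helmholtz->SetLambda(1.5); // Helmholtz constant, defaults to 1.0
helmholtz->apply(inCoeffs, outCoeffs);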
#include "Operators/BwdTrans/BwdTransCUDA.hpp"
#include "Operators/Helmholtz/HelmholtzCUDAKernels.cuh"
#include "Operators/IProductWRTBase/IProductWRTBaseCUDA.hpp"
#include "Operators/IProductWRTDerivBase/IProductWRTDerivBaseCUDA.hpp"
#include "Operators/OperatorHelmholtz.hpp"
#include "Operators/PhysDeriv/PhysDerivCUDA.hpp"
namespace Nektar::Operators::detail
{
// CUDA implementation
template <typename TData>
class OperatorHelmholtzImpl<TData, ImplCUDA> : public OperatorHelmholtz<TData>
{
public:
OperatorHelmholtzImpl(const MultiRegions::ExpListSharedPtr &expansionList)
: OperatorHelmholtz<TData>(expansionList),
m_bwd(
Field<TData, FieldState::Phys>::template create<MemoryRegionCUDA>(
GetBlockAttributes(FieldState::Phys, expansionList))),
m_deriv0(
Field<TData, FieldState::Phys>::template create<MemoryRegionCUDA>(
GetBlockAttributes(FieldState::Phys, expansionList))),
m_deriv1(
Field<TData, FieldState::Phys>::template create<MemoryRegionCUDA>(
GetBlockAttributes(FieldState::Phys, expansionList))),
m_deriv2(
Field<TData, FieldState::Phys>::template create<MemoryRegionCUDA>(
GetBlockAttributes(FieldState::Phys, expansionList)))
{
auto nCoord = this->m_expansionList->GetCoordim(0);
m_BwdTransOp = BwdTrans<>::create(this->m_expansionList, "CUDA");
m_PhysDerivOp = PhysDeriv<>::create(this->m_expansionList, "CUDA");
m_IProductWRTBaseOp =
IProductWRTBase<>::create(this->m_expansionList, "CUDA");
m_IProductWRTDerivBaseOp =
IProductWRTDerivBase<>::create(this->m_expansionList, "CUDA");
Array<OneD, TData> diffCoeff(nCoord * nCoord, 0.0);
for (size_t d = 0; d < nCoord; d++)
{
diffCoeff[d * nCoord + d] = 1.0; // Temporary solution
}
cudaMalloc((void **)&m_diffCoeff, sizeof(TData) * nCoord * nCoord);
cudaMemcpy(m_diffCoeff, diffCoeff.get(),
sizeof(TData) * nCoord * nCoord, cudaMemcpyHostToDevice);
}
void apply(Field<TData, FieldState::Coeff> &in,
Field<TData, FieldState::Coeff> &out) override
{
// Step 1: BwdTrans
m_BwdTransOp->apply(in, m_bwd);
// Step 2: PhysDeriv
m_PhysDerivOp->apply(m_bwd, m_deriv0, m_deriv1, m_deriv2);
// Step 3: Inner product for mass matrix operation
m_IProductWRTBaseOp->apply(m_bwd, out, m_lambda);
// Step 4: Multiply by diffusion coefficient
DiffusionCoeff(m_deriv0, m_deriv1, m_deriv2);
// Step 5: Inner product
m_IProductWRTDerivBaseOp->apply(m_deriv0, m_deriv1, m_deriv2, out,
true);
}
void DiffusionCoeff(Field<TData, FieldState::Phys> &deriv0,
Field<TData, FieldState::Phys> &deriv1,
Field<TData, FieldState::Phys> &deriv2)
{
// Initialize pointers.
std::vector<TData *> derivptr{deriv0.template GetStorage<MemoryRegionCUDA>().GetGPUPtr(),
deriv1.template GetStorage<MemoryRegionCUDA>().GetGPUPtr(),
deriv2.template GetStorage<MemoryRegionCUDA>().GetGPUPtr()};
// Initialize index.
size_t expIdx = 0;
for (size_t block_idx = 0; block_idx < deriv0.GetBlocks().size();
++block_idx)
{
// Determine shape and type of the element.
auto const expPtr = this->m_expansionList->GetExp(expIdx);
auto nElmts = deriv0.GetBlocks()[block_idx].num_elements;
auto nCoord = expPtr->GetCoordim();
auto nqTot = expPtr->GetTotPoints();
// Determine CUDA grid parameters.
m_gridSize = nElmts / m_blockSize;
m_gridSize += (nElmts % m_blockSize == 0) ? 0 : 1;
// Multiply by diffusion coefficient.
if (nCoord == 1)
{
auto nq0 = expPtr->GetNumPoints(0);
DiffusionCoeff1DKernel<<<m_gridSize, m_blockSize>>>(
nq0, nElmts, m_diffCoeff, derivptr[0]);
}
else if (nCoord == 2)
{
auto nq0 = expPtr->GetNumPoints(0);
auto nq1 = expPtr->GetNumPoints(1);
DiffusionCoeff2DKernel<<<m_gridSize, m_blockSize>>>(
nq0, nq1, nElmts, m_diffCoeff, derivptr[0], derivptr[1]);
}
else
{
auto nq0 = expPtr->GetNumPoints(0);
auto nq1 = expPtr->GetNumPoints(1);
auto nq2 = expPtr->GetNumPoints(2);
DiffusionCoeff3DKernel<<<m_gridSize, m_blockSize>>>(
nq0, nq1, nq2, nElmts, m_diffCoeff, derivptr[0],
derivptr[1], derivptr[2]);
}
// Increment pointer and index for next element type.
for (size_t d = 0; d < nCoord; d++)
{
derivptr[d] += nqTot * nElmts;
}
expIdx += nElmts;
}
}
static std::unique_ptr<Operator<TData>> instantiate(
const MultiRegions::ExpListSharedPtr &expansionList)
{
return std::make_unique<OperatorHelmholtzImpl<TData, ImplCUDA>>(
expansionList);
}
void SetLambda(TData lambda)
{
m_lambda = lambda;
}
static std::string className;
private:
std::shared_ptr<OperatorBwdTrans<TData>> m_BwdTransOp;
std::shared_ptr<OperatorPhysDeriv<TData>> m_PhysDerivOp;
std::shared_ptr<OperatorIProductWRTBase<TData>> m_IProductWRTBaseOp;
std::shared_ptr<OperatorIProductWRTDerivBase<TData>>
m_IProductWRTDerivBaseOp;
Field<TData, FieldState::Phys> m_bwd;
Field<TData, FieldState::Phys> m_deriv0;
Field<TData, FieldState::Phys> m_deriv1;
Field<TData, FieldState::Phys> m_deriv2;
TData m_lambda = 1.0;
TData *m_diffCoeff;
size_t m_blockSize = 32;
size_t m_gridSize;
};
} // namespace Nektar::Operators::detail
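For reference, the five steps in apply() above assemble the weak-form Helmholtz operator: BwdTrans and PhysDeriv evaluate u and its gradient at the quadrature points, the inner product with respect to the basis (scaled by m_lambda) gives the mass term, and the inner product with respect to the derivative basis (appended through the final true argument) gives the diffusion term. With the diffusion tensor currently hard-coded to the identity (the "temporary solution" in the constructor), the assembled coefficients are, schematically,

    % Sketch of the quantity evaluated by apply(), with D = I for now:
    (H \hat{u})_n = \lambda \int_\Omega \phi_n \, u \, d\mathbf{x}
                  + \int_\Omega \nabla\phi_n \cdot D \, \nabla u \, d\mathbf{x}.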
namespace Nektar::Operators::detail
{
template <typename TData>
__global__ void DiffusionCoeff1DKernel(const size_t nq0, const size_t nelmt,
const TData *diffCoeff, TData *deriv0)
{
size_t e = blockDim.x * blockIdx.x + threadIdx.x;
if (e >= nelmt)
{
return;
}
// Assign pointers.
TData *derivptr = deriv0 + nq0 * e;
for (size_t i = 0; i < nq0; i++)
{
derivptr[i] *= diffCoeff[0];
}
}
template <typename TData>
__global__ void DiffusionCoeff2DKernel(const size_t nq0, const size_t nq1,
const size_t nelmt,
const TData *diffCoeff, TData *deriv0,
TData *deriv1)
{
size_t e = blockDim.x * blockIdx.x + threadIdx.x;
if (e >= nelmt)
{
return;
}
constexpr size_t ncoord = 2;
// Assign pointers.
TData **derivptr = new TData *[ncoord];
derivptr[0] = deriv0 + nq0 * nq1 * e;
derivptr[1] = deriv1 + nq0 * nq1 * e;
for (size_t j = 0, cnt = 0; j < nq1; j++)
{
for (size_t i = 0; i < nq0; i++)
{
TData deriv[2] = {derivptr[0][cnt], derivptr[1][cnt]};
for (size_t d = 0; d < ncoord; d++)
{
derivptr[d][cnt] = diffCoeff[d * ncoord + 0] * deriv[0] +
diffCoeff[d * ncoord + 1] * deriv[1];
}
cnt++;
}
    }
    delete[] derivptr;
}
template <typename TData>
__global__ void DiffusionCoeff3DKernel(const size_t nq0, const size_t nq1,
const size_t nq2, const size_t nelmt,
const TData *diffCoeff, TData *deriv0,
TData *deriv1, TData *deriv2)
{
size_t e = blockDim.x * blockIdx.x + threadIdx.x;
if (e >= nelmt)
{
return;
}
constexpr size_t ncoord = 3;
// Assign pointers.
TData **derivptr = new TData *[ncoord];
derivptr[0] = deriv0 + nq0 * nq1 * nq2 * e;
derivptr[1] = deriv1 + nq0 * nq1 * nq2 * e;
derivptr[2] = deriv2 + nq0 * nq1 * nq2 * e;
for (size_t k = 0, cnt = 0; k < nq2; k++)
{
for (size_t j = 0; j < nq1; j++)
{
for (size_t i = 0; i < nq0; i++)
{
TData deriv[3] = {derivptr[0][cnt], derivptr[1][cnt],
derivptr[2][cnt]};
for (size_t d = 0; d < ncoord; d++)
{
derivptr[d][cnt] = diffCoeff[d * ncoord + 0] * deriv[0] +
diffCoeff[d * ncoord + 1] * deriv[1] +
diffCoeff[d * ncoord + 2] * deriv[2];
}
cnt++;
}
}
    }
    delete[] derivptr;
}
} // namespace Nektar::Operators::detail
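The kernels in this file, like the other CUDA operators in this merge request, are launched with one thread per element; the host code sizes the grid so that m_gridSize * m_blockSize covers all elements of the current block. A minimal sketch of the equivalent ceiling-division idiom (the helper name is illustrative only):

// Hypothetical helper: smallest grid size with gridSize * blockSize >= nElmts.
inline size_t LaunchGridSize(size_t nElmts, size_t blockSize = 32)
{
    return (nElmts + blockSize - 1) / blockSize;
}
// Equivalent to the pattern used in the operator classes:
//     m_gridSize  = nElmts / m_blockSize;
//     m_gridSize += (nElmts % m_blockSize == 0) ? 0 : 1;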
#include "HelmholtzStdMat.hpp"
namespace Nektar::Operators::detail
{
// Add different Helmholtz implementations to the factory.
template <>
std::string OperatorHelmholtzImpl<double, ImplStdMat>::className =
GetOperatorFactory<double>().RegisterCreatorFunction(
"HelmholtzStdMat",
OperatorHelmholtzImpl<double, ImplStdMat>::instantiate, "...");
} // namespace Nektar::Operators::detail
#include "Operators/BwdTrans/BwdTransStdMat.hpp"
#include "Operators/IProductWRTBase/IProductWRTBaseStdMat.hpp"
#include "Operators/IProductWRTDerivBase/IProductWRTDerivBaseStdMat.hpp"
#include "Operators/OperatorHelmholtz.hpp"
#include "Operators/PhysDeriv/PhysDerivStdMat.hpp"
namespace Nektar::Operators::detail
{
// standard matrix implementation
template <typename TData>
class OperatorHelmholtzImpl<TData, ImplStdMat> : public OperatorHelmholtz<TData>
{
public:
OperatorHelmholtzImpl(const MultiRegions::ExpListSharedPtr &expansionList)
: OperatorHelmholtz<TData>(expansionList),
m_bwd(Field<TData, FieldState::Phys>::create(
GetBlockAttributes(FieldState::Phys, expansionList))),
m_deriv0(Field<TData, FieldState::Phys>::create(
GetBlockAttributes(FieldState::Phys, expansionList))),
m_deriv1(Field<TData, FieldState::Phys>::create(
GetBlockAttributes(FieldState::Phys, expansionList))),
m_deriv2(Field<TData, FieldState::Phys>::create(
GetBlockAttributes(FieldState::Phys, expansionList))),
m_derivcoeff0(Field<TData, FieldState::Phys>::create(
GetBlockAttributes(FieldState::Phys, expansionList))),
m_derivcoeff1(Field<TData, FieldState::Phys>::create(
GetBlockAttributes(FieldState::Phys, expansionList))),
m_derivcoeff2(Field<TData, FieldState::Phys>::create(
GetBlockAttributes(FieldState::Phys, expansionList)))
{
auto nCoord = this->m_expansionList->GetCoordim(0);
m_BwdTransOp = BwdTrans<>::create(this->m_expansionList, "StdMat");
m_PhysDerivOp = PhysDeriv<>::create(this->m_expansionList, "StdMat");
m_IProductWRTBaseOp =
IProductWRTBase<>::create(this->m_expansionList, "StdMat");
m_IProductWRTDerivBaseOp =
IProductWRTDerivBase<>::create(this->m_expansionList, "StdMat");
m_diffCoeff = Array<OneD, TData>(nCoord * nCoord, 0.0);
for (size_t d = 0; d < nCoord; d++)
{
m_diffCoeff[d * nCoord + d] = 1.0; // temporary solution
}
}
void apply(Field<TData, FieldState::Coeff> &in,
Field<TData, FieldState::Coeff> &out) override
{
// Step 1: BwdTrans
m_BwdTransOp->apply(in, m_bwd);
// Step 2: PhysDeriv
m_PhysDerivOp->apply(m_bwd, m_deriv0, m_deriv1, m_deriv2);
// Step 3: Inner product for mass matrix operation
m_IProductWRTBaseOp->apply(m_bwd, out, m_lambda);
// Step 4: Multiply by diffusion coefficient
DiffusionCoeff(m_deriv0, m_deriv1, m_deriv2, m_derivcoeff0,
m_derivcoeff1, m_derivcoeff2);
// Step 5: Inner product
m_IProductWRTDerivBaseOp->apply(m_derivcoeff0, m_derivcoeff1,
m_derivcoeff2, out, true);
}
void DiffusionCoeff(Field<TData, FieldState::Phys> &deriv0,
Field<TData, FieldState::Phys> &deriv1,
Field<TData, FieldState::Phys> &deriv2,
Field<TData, FieldState::Phys> &derivcoeff0,
Field<TData, FieldState::Phys> &derivcoeff1,
Field<TData, FieldState::Phys> &derivcoeff2)
{
// Initialize pointers.
std::vector<TData *> derivptr{deriv0.GetStorage().GetCPUPtr(),
deriv1.GetStorage().GetCPUPtr(),
deriv2.GetStorage().GetCPUPtr()};
std::vector<TData *> derivcoeffptr{
derivcoeff0.GetStorage().GetCPUPtr(),
derivcoeff1.GetStorage().GetCPUPtr(),
derivcoeff2.GetStorage().GetCPUPtr()};
// Initialize index.
size_t expIdx = 0;
for (size_t block_idx = 0; block_idx < deriv0.GetBlocks().size();
++block_idx)
{
// Determine shape and type of the element.
auto const expPtr = this->m_expansionList->GetExp(expIdx);
auto nElmts = deriv0.GetBlocks()[block_idx].num_elements;
auto nCoord = expPtr->GetCoordim();
auto nqTot = expPtr->GetTotPoints();
// Multiply by diffusion coefficient.
for (size_t d = 0; d < nCoord; d++)
{
Vmath::Smul(nqTot * nElmts, m_diffCoeff[d * nCoord], derivptr[0], 1,
derivcoeffptr[d], 1);
for (size_t l = 1; l < nCoord; l++)
{
Vmath::Svtvp(nqTot * nElmts, m_diffCoeff[d * nCoord + l],
derivptr[l], 1, derivcoeffptr[d], 1,
derivcoeffptr[d], 1);
}
}
// Increment pointer and index for next element type.
for (size_t d = 0; d < nCoord; d++)
{
derivptr[d] += nqTot * nElmts;
derivcoeffptr[d] += nqTot * nElmts;
}
expIdx += nElmts;
}
}
static std::unique_ptr<Operator<TData>> instantiate(
const MultiRegions::ExpListSharedPtr &expansionList)
{
return std::make_unique<OperatorHelmholtzImpl<TData, ImplStdMat>>(
expansionList);
}
void SetLambda(TData lambda)
{
m_lambda = lambda;
}
static std::string className;
private:
std::shared_ptr<OperatorBwdTrans<TData>> m_BwdTransOp;
std::shared_ptr<OperatorPhysDeriv<TData>> m_PhysDerivOp;
std::shared_ptr<OperatorIProductWRTBase<TData>> m_IProductWRTBaseOp;
std::shared_ptr<OperatorIProductWRTDerivBase<TData>>
m_IProductWRTDerivBaseOp;
Field<TData, FieldState::Phys> m_bwd;
Field<TData, FieldState::Phys> m_deriv0;
Field<TData, FieldState::Phys> m_deriv1;
Field<TData, FieldState::Phys> m_deriv2;
Field<TData, FieldState::Phys> m_derivcoeff0;
Field<TData, FieldState::Phys> m_derivcoeff1;
Field<TData, FieldState::Phys> m_derivcoeff2;
TData m_lambda = 1.0;
Array<OneD, TData> m_diffCoeff;
};
} // namespace Nektar::Operators::detail
#include "IProductWRTBaseCUDA.hpp"
namespace Nektar::Operators::detail
{
template <>
std::string OperatorIProductWRTBaseImpl<double, ImplCUDA>::className =
GetOperatorFactory<double>().RegisterCreatorFunction(
"IProductWRTBaseCUDA",
OperatorIProductWRTBaseImpl<double, ImplCUDA>::instantiate, "...");
}
#pragma once
#include "MemoryRegionCUDA.hpp"
#include "Operators/IProductWRTBase/IProductWRTBaseCUDAKernels.cuh"
#include "Operators/OperatorHelper.cuh"
#include "Operators/OperatorIProductWRTBase.hpp"
namespace Nektar::Operators::detail
{
template <typename TData, bool SCALE = false, bool APPEND = false,
bool DEFORMED = false>
void IProductWRTBase1DKernel(const size_t gridSize, const size_t blockSize,
const size_t nm0, const size_t nq0,
const size_t nElmts, const TData *basis0,
const TData *w0, const TData *jac, const TData *in,
TData *out, TData scale = 1.0);
template <typename TData, bool SCALE = false, bool APPEND = false,
bool DEFORMED = false>
void IProductWRTBase2DKernel(const size_t gridSize, const size_t blockSize,
LibUtilities::ShapeType shapetype,
const size_t nm0, const size_t nm1,
const size_t nq0, const size_t nq1,
const size_t nElmts, const bool correct,
const TData *basis0, const TData *basis1,
const TData *w0, const TData *w1, const TData *jac,
const TData *in, TData *out, TData scale = 1.0);
template <typename TData, bool SCALE = false, bool APPEND = false,
bool DEFORMED = false>
void IProductWRTBase3DKernel(const size_t gridSize, const size_t blockSize,
LibUtilities::ShapeType shapetype,
const size_t nm0, const size_t nm1,
const size_t nm2, const size_t nq0,
const size_t nq1, const size_t nq2,
const size_t nElmts, const bool correct,
const TData *basis0, const TData *basis1,
const TData *basis2, const TData *w0,
const TData *w1, const TData *w2, const TData *jac,
const TData *in, TData *out, TData scale = 1.0);
// IProductWRTBase implementation
template <typename TData>
class OperatorIProductWRTBaseImpl<TData, ImplCUDA>
: public OperatorIProductWRTBase<TData>
{
public:
OperatorIProductWRTBaseImpl(
const MultiRegions::ExpListSharedPtr &expansionList)
: OperatorIProductWRTBase<TData>(expansionList)
{
// Initialise jacobian.
size_t jacSize = Operator<TData>::GetGeometricFactorSize();
auto jac = Operator<TData>::SetJacobian(jacSize);
cudaMalloc((void **)&m_jac, sizeof(TData) * jacSize);
cudaMemcpy(m_jac, jac.get(), sizeof(TData) * jacSize,
cudaMemcpyHostToDevice);
// Initialize basis.
m_basis = GetBasisDataCUDA<TData>(expansionList);
// Initialize weight.
m_weight = GetWeightDataCUDA<TData>(expansionList);
}
~OperatorIProductWRTBaseImpl(void)
{
DeallocateDataCUDA<TData>(m_basis);
DeallocateDataCUDA<TData>(m_weight);
cudaFree(m_jac);
}
void apply(Field<TData, FieldState::Phys> &in,
Field<TData, FieldState::Coeff> &out,
const TData lambda = 1.0) override
{
// Copy memory to GPU, if necessary, and get raw pointers.
auto const *inptr =
in.template GetStorage<MemoryRegionCUDA>().GetGPUPtr();
auto *outptr = out.template GetStorage<MemoryRegionCUDA>().GetGPUPtr();
// Initialize index.
size_t expIdx = 0;
auto jacptr = m_jac;
// Initialize basiskey.
std::vector<LibUtilities::BasisKey> basisKeys(
3, LibUtilities::NullBasisKey);
// Loop over the blocks.
for (size_t block_idx = 0; block_idx < in.GetBlocks().size();
++block_idx)
{
auto const expPtr = this->m_expansionList->GetExp(expIdx);
auto nElmts = in.GetBlocks()[block_idx].num_elements;
auto nqTot = expPtr->GetTotPoints();
auto nmTot = expPtr->GetNcoeffs();
auto ptsKeys = expPtr->GetPointsKeys();
auto deformed = expPtr->GetMetricInfo()->GetGtype() ==
SpatialDomains::eDeformed;
// Determine CUDA grid parameters.
m_gridSize = nElmts / m_blockSize;
m_gridSize += (nElmts % m_blockSize == 0) ? 0 : 1;
// Flag for collapsed coordinate correction.
bool correct = expPtr->GetBasis(0)->GetBasisType() ==
LibUtilities::eModified_A;
// Fetch basis key for the current element type.
for (size_t d = 0; d < expPtr->GetShapeDimension(); d++)
{
basisKeys[d] = expPtr->GetBasis(d)->GetBasisKey();
}
// Function call to kernel functions.
if (expPtr->GetShapeDimension() == 1)
{
auto basis0 = m_basis[basisKeys][0];
auto w0 = m_weight[basisKeys][0];
auto nm0 = expPtr->GetBasisNumModes(0);
auto nq0 = expPtr->GetNumPoints(0);
if (deformed)
{
if (lambda == 1.0)
{
IProductWRTBase1DKernel<TData, false, false, true>(
m_gridSize, m_blockSize, nm0, nq0, nElmts, basis0,
w0, jacptr, inptr, outptr);
}
else
{
IProductWRTBase1DKernel<TData, true, false, true>(
m_gridSize, m_blockSize, nm0, nq0, nElmts, basis0,
w0, jacptr, inptr, outptr, lambda);
}
}
else
{
if (lambda == 1.0)
{
IProductWRTBase1DKernel<TData, false, false, false>(
m_gridSize, m_blockSize, nm0, nq0, nElmts, basis0,
w0, jacptr, inptr, outptr);
}
else
{
IProductWRTBase1DKernel<TData, true, false, false>(
m_gridSize, m_blockSize, nm0, nq0, nElmts, basis0,
w0, jacptr, inptr, outptr, lambda);
}
}
}
else if (expPtr->GetShapeDimension() == 2)
{
auto basis0 = m_basis[basisKeys][0];
auto basis1 = m_basis[basisKeys][1];
auto w0 = m_weight[basisKeys][0];
auto w1 = m_weight[basisKeys][1];
auto shape = expPtr->DetShapeType();
auto nm0 = expPtr->GetBasisNumModes(0);
auto nm1 = expPtr->GetBasisNumModes(1);
auto nq0 = expPtr->GetNumPoints(0);
auto nq1 = expPtr->GetNumPoints(1);
if (deformed)
{
if (lambda == 1.0)
{
IProductWRTBase2DKernel<TData, false, false, true>(
m_gridSize, m_blockSize, shape, nm0, nm1, nq0, nq1,
nElmts, correct, basis0, basis1, w0, w1, jacptr,
inptr, outptr);
}
else
{
IProductWRTBase2DKernel<TData, true, false, true>(
m_gridSize, m_blockSize, shape, nm0, nm1, nq0, nq1,
nElmts, correct, basis0, basis1, w0, w1, jacptr,
inptr, outptr, lambda);
}
}
else
{
if (lambda == 1.0)
{
IProductWRTBase2DKernel<TData, false, false, false>(
m_gridSize, m_blockSize, shape, nm0, nm1, nq0, nq1,
nElmts, correct, basis0, basis1, w0, w1, jacptr,
inptr, outptr);
}
else
{
IProductWRTBase2DKernel<TData, true, false, false>(
m_gridSize, m_blockSize, shape, nm0, nm1, nq0, nq1,
nElmts, correct, basis0, basis1, w0, w1, jacptr,
inptr, outptr, lambda);
}
}
}
else if (expPtr->GetShapeDimension() == 3)
{
auto basis0 = m_basis[basisKeys][0];
auto basis1 = m_basis[basisKeys][1];
auto basis2 = m_basis[basisKeys][2];
auto w0 = m_weight[basisKeys][0];
auto w1 = m_weight[basisKeys][1];
auto w2 = m_weight[basisKeys][2];
auto shape = expPtr->DetShapeType();
auto nm0 = expPtr->GetBasisNumModes(0);
auto nm1 = expPtr->GetBasisNumModes(1);
auto nm2 = expPtr->GetBasisNumModes(2);
auto nq0 = expPtr->GetNumPoints(0);
auto nq1 = expPtr->GetNumPoints(1);
auto nq2 = expPtr->GetNumPoints(2);
if (deformed)
{
if (lambda == 1.0)
{
IProductWRTBase3DKernel<TData, false, false, true>(
m_gridSize, m_blockSize, shape, nm0, nm1, nm2, nq0,
nq1, nq2, nElmts, correct, basis0, basis1, basis2,
w0, w1, w2, jacptr, inptr, outptr);
}
else
{
IProductWRTBase3DKernel<TData, true, false, true>(
m_gridSize, m_blockSize, shape, nm0, nm1, nm2, nq0,
nq1, nq2, nElmts, correct, basis0, basis1, basis2,
w0, w1, w2, jacptr, inptr, outptr, lambda);
}
}
else
{
if (lambda == 1.0)
{
IProductWRTBase3DKernel<TData, false, false, false>(
m_gridSize, m_blockSize, shape, nm0, nm1, nm2, nq0,
nq1, nq2, nElmts, correct, basis0, basis1, basis2,
w0, w1, w2, jacptr, inptr, outptr);
}
else
{
IProductWRTBase3DKernel<TData, true, false, false>(
m_gridSize, m_blockSize, shape, nm0, nm1, nm2, nq0,
nq1, nq2, nElmts, correct, basis0, basis1, basis2,
w0, w1, w2, jacptr, inptr, outptr, lambda);
}
}
}
// Increment pointer and index for next element type.
jacptr += deformed ? nqTot * nElmts : nElmts;
inptr += nqTot * nElmts;
outptr += nmTot * nElmts;
expIdx += nElmts;
}
}
static std::unique_ptr<Operator<TData>> instantiate(
MultiRegions::ExpListSharedPtr expansionList)
{
return std::make_unique<OperatorIProductWRTBaseImpl<TData, ImplCUDA>>(
expansionList);
}
static std::string className;
private:
std::map<std::vector<LibUtilities::BasisKey>, std::vector<TData *>> m_basis;
std::map<std::vector<LibUtilities::BasisKey>, std::vector<TData *>>
m_weight;
TData *m_jac;
size_t m_blockSize = 32;
size_t m_gridSize;
};
template <typename TData, bool SCALE, bool APPEND, bool DEFORMED>
void IProductWRTBase1DKernel(const size_t gridSize, const size_t blockSize,
const size_t nm0, const size_t nq0,
const size_t nElmts, const TData *basis0,
const TData *w0, const TData *jac, const TData *in,
TData *out, TData scale)
{
IProductWRTBaseSegKernel<TData, SCALE, APPEND, DEFORMED>
<<<gridSize, blockSize>>>(nm0, nq0, nElmts, basis0, w0, jac, in, out,
scale);
}
template <typename TData, bool SCALE, bool APPEND, bool DEFORMED>
void IProductWRTBase2DKernel(const size_t gridSize, const size_t blockSize,
LibUtilities::ShapeType shapetype,
const size_t nm0, const size_t nm1,
const size_t nq0, const size_t nq1,
const size_t nElmts, const bool correct,
const TData *basis0, const TData *basis1,
const TData *w0, const TData *w1, const TData *jac,
const TData *in, TData *out, TData scale)
{
if (shapetype == LibUtilities::Quad)
{
IProductWRTBaseQuadKernel<TData, SCALE, APPEND, DEFORMED>
<<<gridSize, blockSize>>>(nm0, nm1, nq0, nq1, nElmts, basis0,
basis1, w0, w1, jac, in, out, scale);
}
else if (shapetype == LibUtilities::Tri)
{
size_t nmTot =
LibUtilities::StdTriData::getNumberOfCoefficients(nm0, nm1);
IProductWRTBaseTriKernel<TData, SCALE, APPEND, DEFORMED>
<<<gridSize, blockSize>>>(nm0, nm1, nmTot, nq0, nq1, nElmts,
correct, basis0, basis1, w0, w1, jac, in,
out, scale);
}
}
template <typename TData, bool SCALE, bool APPEND, bool DEFORMED>
void IProductWRTBase3DKernel(
const size_t gridSize, const size_t blockSize,
LibUtilities::ShapeType shapetype, const size_t nm0, const size_t nm1,
const size_t nm2, const size_t nq0, const size_t nq1, const size_t nq2,
const size_t nElmts, const bool correct, const TData *basis0,
const TData *basis1, const TData *basis2, const TData *w0, const TData *w1,
const TData *w2, const TData *jac, const TData *in, TData *out, TData scale)
{
if (shapetype == LibUtilities::Hex)
{
IProductWRTBaseHexKernel<TData, SCALE, APPEND, DEFORMED>
<<<gridSize, blockSize>>>(nm0, nm1, nm2, nq0, nq1, nq2, nElmts,
basis0, basis1, basis2, w0, w1, w2, jac,
in, out, scale);
}
else if (shapetype == LibUtilities::Tet)
{
size_t nmTot =
LibUtilities::StdTetData::getNumberOfCoefficients(nm0, nm1, nm2);
IProductWRTBaseTetKernel<TData, SCALE, APPEND, DEFORMED>
<<<gridSize, blockSize>>>(nm0, nm1, nm2, nmTot, nq0, nq1, nq2,
nElmts, correct, basis0, basis1, basis2,
w0, w1, w2, jac, in, out, scale);
}
else if (shapetype == LibUtilities::Pyr)
{
size_t nmTot =
LibUtilities::StdPyrData::getNumberOfCoefficients(nm0, nm1, nm2);
IProductWRTBasePyrKernel<TData, SCALE, APPEND, DEFORMED>
<<<gridSize, blockSize>>>(nm0, nm1, nm2, nmTot, nq0, nq1, nq2,
nElmts, correct, basis0, basis1, basis2,
w0, w1, w2, jac, in, out, scale);
}
else if (shapetype == LibUtilities::Prism)
{
size_t nmTot =
LibUtilities::StdPrismData::getNumberOfCoefficients(nm0, nm1, nm2);
IProductWRTBasePrismKernel<TData, SCALE, APPEND, DEFORMED>
<<<gridSize, blockSize>>>(nm0, nm1, nm2, nmTot, nq0, nq1, nq2,
nElmts, correct, basis0, basis1, basis2,
w0, w1, w2, jac, in, out, scale);
}
}
} // namespace Nektar::Operators::detail
[This diff is collapsed; contents not shown.]
@@ -14,52 +14,14 @@ public:
         const MultiRegions::ExpListSharedPtr &expansionList)
         : OperatorIProductWRTBase<TData>(std::move(expansionList))
     {
-        size_t jacSize = 0;
-        size_t nTotElmts = this->m_expansionList->GetNumElmts();
-
-        // Calculate the jacobian array size
-        for (size_t e = 0; e < nTotElmts; ++e)
-        {
-            // Determine shape and type of the element
-            auto const expPtr = this->m_expansionList->GetExp(e);
-            if (expPtr->GetMetricInfo()->GetGtype() ==
-                SpatialDomains::eDeformed)
-            {
-                jacSize += expPtr->GetTotPoints();
-            }
-            else
-            {
-                jacSize++;
-            }
-        }
-
-        // Allocate memory for the jacobian
-        m_jac = {jacSize, 0.0};
-
         // Initialise jacobian.
-        size_t index = 0;
-        for (size_t e = 0; e < nTotElmts; ++e)
-        {
-            auto expPtr = this->m_expansionList->GetExp(e);
-            auto &auxJac =
-                expPtr->GetMetricInfo()->GetJac(expPtr->GetPointsKeys());
-            if (expPtr->GetMetricInfo()->GetGtype() ==
-                SpatialDomains::eDeformed)
-            {
-                size_t nqe = expPtr->GetTotPoints();
-                for (size_t i = 0; i < nqe; ++i)
-                {
-                    m_jac[index++] = auxJac[i];
-                }
-            }
-            else
-            {
-                m_jac[index++] = auxJac[0];
-            }
-        }
+        size_t jacSize = Operator<TData>::GetGeometricFactorSize();
+        m_jac = Operator<TData>::SetJacobian(jacSize);
     }

     void apply(Field<TData, FieldState::Phys> &in,
-               Field<TData, FieldState::Coeff> &out) override
+               Field<TData, FieldState::Coeff> &out,
+               const TData lambda = 1.0) override
     {
         auto const *inptr = in.GetStorage().GetCPUPtr();
         auto *outptr = out.GetStorage().GetCPUPtr();

@@ -80,7 +42,7 @@ public:
             // This is the B^{T} matrix
             auto const matPtr = expPtr->GetStdMatrix(key);

-            Array<OneD, NekDouble> wsp(nqTot * nElmts, 0.0);
+            Array<OneD, TData> wsp(nqTot * nElmts, 0.0);

             if (expPtr->GetMetricInfo()->GetGtype() ==
                 SpatialDomains::eDeformed)
             {

@@ -103,7 +65,7 @@ public:
             }

             Blas::Dgemm('N', 'N', matPtr->GetRows(), nElmts,
-                        matPtr->GetColumns(), 1.0, matPtr->GetRawPtr(),
+                        matPtr->GetColumns(), lambda, matPtr->GetRawPtr(),
                         matPtr->GetRows(), wsp.get(), nqTot, 0.0, outptr,
                         nmTot);

@@ -123,7 +85,7 @@ public:
     static std::string className;

 private:
-    Array<OneD, NekDouble> m_jac;
+    Array<OneD, TData> m_jac;
 };

 } // namespace Nektar::Operators::detail
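The new lambda argument in the StdMat IProductWRTBase above is forwarded as the alpha factor of the Dgemm call, so the operator now returns the lambda-scaled inner product; the Helmholtz implementations rely on this for their mass-matrix term (Step 3). Schematically (a sketch, not taken from the source),

    \hat{u}_n = \lambda \int_\Omega \phi_n \, u \, d\mathbf{x}.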
#include "IProductWRTDerivBaseCUDA.hpp"
namespace Nektar::Operators::detail
{
template <>
std::string OperatorIProductWRTDerivBaseImpl<double, ImplCUDA>::className =
GetOperatorFactory<double>().RegisterCreatorFunction(
"IProductWRTDerivBaseCUDA",
OperatorIProductWRTDerivBaseImpl<double, ImplCUDA>::instantiate, "...");
}
#include "MemoryRegionCUDA.hpp"
#include "Operators/IProductWRTBase/IProductWRTBaseCUDA.hpp"
#include "Operators/IProductWRTDerivBase/IProductWRTDerivBaseCUDAKernels.cuh"
#include "Operators/OperatorHelper.cuh"
#include "Operators/OperatorIProductWRTDerivBase.hpp"
namespace Nektar::Operators::detail
{
template <typename TData, bool DEFORMED = false>
void IProductWRTDerivBase1DKernel(const size_t gridSize, const size_t blockSize,
const size_t nq0, const size_t nCoord,
const size_t nElmts, const size_t dfSize,
TData *df, TData *in0, TData *in1, TData *in2,
TData *out0);
template <typename TData, bool DEFORMED = false>
void IProductWRTDerivBase2DKernel(const size_t gridSize, const size_t blockSize,
LibUtilities::ShapeType shapetype,
const size_t nq0, const size_t nq1,
const size_t nCoord, const size_t nElmts,
const TData *Z0, const TData *Z1,
const size_t dfSize, TData *df, TData *in0,
TData *in1, TData *in2, TData *out0,
TData *out1);
template <typename TData, bool DEFORMED = false>
void IProductWRTDerivBase3DKernel(
const size_t gridSize, const size_t blockSize,
LibUtilities::ShapeType shapetype, const size_t nq0, const size_t nq1,
const size_t nq2, const size_t nCoord, const size_t nElmts, const TData *Z0,
const TData *Z1, const TData *Z2, const size_t dfSize, TData *df,
TData *in0, TData *in1, TData *in2, TData *out0, TData *out1, TData *out2);
// IProductWRTDerivBase implementation
template <typename TData>
class OperatorIProductWRTDerivBaseImpl<TData, ImplCUDA>
: public OperatorIProductWRTDerivBase<TData>
{
public:
OperatorIProductWRTDerivBaseImpl(
const MultiRegions::ExpListSharedPtr &expansionList)
: OperatorIProductWRTDerivBase<TData>(expansionList)
{
size_t nDim = this->m_expansionList->GetShapeDimension();
size_t nCoord = this->m_expansionList->GetCoordim(0);
m_dfSize = Operator<TData>::GetGeometricFactorSize();
// Initialise jacobian.
auto jac = Operator<TData>::SetJacobian(m_dfSize);
cudaMalloc((void **)&m_jac, sizeof(TData) * m_dfSize);
cudaMemcpy(m_jac, jac.get(), sizeof(TData) * m_dfSize,
cudaMemcpyHostToDevice);
// Initialise derivative factor.
auto derivFac = Operator<TData>::SetDerivativeFactor(m_dfSize);
cudaMalloc((void **)&m_derivFac,
sizeof(TData) * nDim * nCoord * m_dfSize);
for (size_t d = 0; d < nDim * nCoord; d++)
{
auto hostPtr = derivFac[d].get();
auto devicePtr = m_derivFac + d * m_dfSize;
cudaMemcpy(devicePtr, hostPtr, sizeof(TData) * m_dfSize,
cudaMemcpyHostToDevice);
}
// Initialize basis.
m_basis = GetBasisDataCUDA<TData>(expansionList);
// Initialize basis derivative.
m_dbasis = GetDeriveBasisDataCUDA<TData>(expansionList);
// Initialize weight.
m_weight = GetWeightDataCUDA<TData>(expansionList);
// Initialize points.
m_Z = GetPointDataCUDA<TData>(expansionList);
// Initialize derivative matrix.
m_D = GetDerivativeDataCUDA<TData>(expansionList);
// Initialize workspace memory.
auto ndata = this->m_expansionList->GetTotPoints();
cudaMalloc((void **)&m_wsp0, sizeof(TData) * ndata);
if (nCoord > 1)
{
cudaMalloc((void **)&m_wsp1, sizeof(TData) * ndata);
}
if (nCoord > 2)
{
cudaMalloc((void **)&m_wsp2, sizeof(TData) * ndata);
}
}
~OperatorIProductWRTDerivBaseImpl(void)
{
DeallocateDataCUDA<TData>(m_basis);
DeallocateDataCUDA<TData>(m_dbasis);
DeallocateDataCUDA<TData>(m_weight);
DeallocateDataCUDA<TData>(m_Z);
DeallocateDataCUDA<TData>(m_D);
cudaFree(m_jac);
cudaFree(m_derivFac);
}
void apply(Field<TData, FieldState::Phys> &in0,
Field<TData, FieldState::Phys> &in1,
Field<TData, FieldState::Phys> &in2,
Field<TData, FieldState::Coeff> &out,
bool APPEND = false) override
{
if (!APPEND) // Zero output (temporary solution)
{
auto *tmpptr =
out.template GetStorage<MemoryRegionCUDA>().GetCPUPtr();
for (size_t block_idx = 0; block_idx < out.GetBlocks().size();
++block_idx)
{
for (size_t i = 0; i < out.GetBlocks()[block_idx].block_size;
i++)
{
*(tmpptr++) = 0.0;
}
}
}
// Copy memory to GPU, if necessary, and get raw pointers.
std::vector<TData *> inptr{
in0.template GetStorage<MemoryRegionCUDA>().GetGPUPtr(),
in1.template GetStorage<MemoryRegionCUDA>().GetGPUPtr(),
in2.template GetStorage<MemoryRegionCUDA>().GetGPUPtr()};
auto *outptr = out.template GetStorage<MemoryRegionCUDA>().GetGPUPtr();
std::vector<TData *> wspptr{m_wsp0, m_wsp1, m_wsp2};
auto jacptr = m_jac;
auto dfptr = m_derivFac;
// Initialize index.
size_t expIdx = 0;
// Initialize basiskey.
std::vector<LibUtilities::BasisKey> basisKeys(
3, LibUtilities::NullBasisKey);
// Loop over the blocks.
for (size_t block_idx = 0; block_idx < out.GetBlocks().size();
++block_idx)
{
auto const expPtr = this->m_expansionList->GetExp(expIdx);
auto nElmts = out.GetBlocks()[block_idx].num_elements;
auto nqTot = expPtr->GetTotPoints();
auto nmTot = expPtr->GetNcoeffs();
auto nDim = expPtr->GetShapeDimension();
auto nCoord = expPtr->GetCoordim();
auto shape = expPtr->DetShapeType();
auto deformed = expPtr->GetMetricInfo()->GetGtype() ==
SpatialDomains::eDeformed;
// Determine CUDA grid parameters.
m_gridSize = nElmts / m_blockSize;
m_gridSize += (nElmts % m_blockSize == 0) ? 0 : 1;
// Flag for collapsed coordinate correction.
bool correct = expPtr->GetBasis(0)->GetBasisType() ==
LibUtilities::eModified_A;
// Fetch basis key for the current element type.
for (size_t d = 0; d < nDim; d++)
{
basisKeys[d] = expPtr->GetBasis(d)->GetBasisKey();
}
// Function call to kernel functions.
if (nDim == 1)
{
auto dbasis0 = m_dbasis[basisKeys][0];
auto w0 = m_weight[basisKeys][0];
auto D0 = m_D[basisKeys][0];
auto nm0 = expPtr->GetBasisNumModes(0);
auto nq0 = expPtr->GetNumPoints(0);
if (deformed)
{
IProductWRTDerivBase1DKernel<TData, true>(
m_gridSize, m_blockSize, nq0, nCoord, nElmts, m_dfSize,
dfptr, inptr[0], inptr[1], inptr[2], wspptr[0]);
IProductWRTBase1DKernel<TData, false, true, true>(
m_gridSize, m_blockSize, nm0, nq0, nElmts, dbasis0, w0,
jacptr, wspptr[0], outptr);
}
else
{
IProductWRTDerivBase1DKernel<TData, false>(
m_gridSize, m_blockSize, nq0, nCoord, nElmts, m_dfSize,
dfptr, inptr[0], inptr[1], inptr[2], wspptr[0]);
IProductWRTBase1DKernel<TData, false, true, false>(
m_gridSize, m_blockSize, nm0, nq0, nElmts, dbasis0, w0,
jacptr, wspptr[0], outptr);
}
}
else if (nDim == 2)
{
auto basis0 = m_basis[basisKeys][0];
auto basis1 = m_basis[basisKeys][1];
auto dbasis0 = m_dbasis[basisKeys][0];
auto dbasis1 = m_dbasis[basisKeys][1];
auto w0 = m_weight[basisKeys][0];
auto w1 = m_weight[basisKeys][1];
auto D0 = m_D[basisKeys][0];
auto D1 = m_D[basisKeys][1];
auto Z0 = m_Z[basisKeys][0];
auto Z1 = m_Z[basisKeys][1];
auto nm0 = expPtr->GetBasisNumModes(0);
auto nm1 = expPtr->GetBasisNumModes(1);
auto nq0 = expPtr->GetNumPoints(0);
auto nq1 = expPtr->GetNumPoints(1);
if (deformed)
{
IProductWRTDerivBase2DKernel<TData, true>(
m_gridSize, m_blockSize, shape, nq0, nq1, nCoord,
nElmts, Z0, Z1, m_dfSize, dfptr, inptr[0], inptr[1],
inptr[2], wspptr[0], wspptr[1]);
IProductWRTBase2DKernel<TData, false, true, true>(
m_gridSize, m_blockSize, shape, nm0, nm1, nq0, nq1,
nElmts, correct, dbasis0, basis1, w0, w1, jacptr,
wspptr[0], outptr);
IProductWRTBase2DKernel<TData, false, true, true>(
m_gridSize, m_blockSize, shape, nm0, nm1, nq0, nq1,
nElmts, correct, basis0, dbasis1, w0, w1, jacptr,
wspptr[1], outptr);
}
else
{
IProductWRTDerivBase2DKernel<TData, false>(
m_gridSize, m_blockSize, shape, nq0, nq1, nCoord,
nElmts, Z0, Z1, m_dfSize, dfptr, inptr[0], inptr[1],
inptr[2], wspptr[0], wspptr[1]);
IProductWRTBase2DKernel<TData, false, true, false>(
m_gridSize, m_blockSize, shape, nm0, nm1, nq0, nq1,
nElmts, correct, dbasis0, basis1, w0, w1, jacptr,
wspptr[0], outptr);
IProductWRTBase2DKernel<TData, false, true, false>(
m_gridSize, m_blockSize, shape, nm0, nm1, nq0, nq1,
nElmts, correct, basis0, dbasis1, w0, w1, jacptr,
wspptr[1], outptr);
}
}
else if (nDim == 3)
{
auto basis0 = m_basis[basisKeys][0];
auto basis1 = m_basis[basisKeys][1];
auto basis2 = m_basis[basisKeys][2];
auto dbasis0 = m_dbasis[basisKeys][0];
auto dbasis1 = m_dbasis[basisKeys][1];
auto dbasis2 = m_dbasis[basisKeys][2];
auto w0 = m_weight[basisKeys][0];
auto w1 = m_weight[basisKeys][1];
auto w2 = m_weight[basisKeys][2];
auto D0 = m_D[basisKeys][0];
auto D1 = m_D[basisKeys][1];
auto D2 = m_D[basisKeys][2];
auto Z0 = m_Z[basisKeys][0];
auto Z1 = m_Z[basisKeys][1];
auto Z2 = m_Z[basisKeys][2];
auto nm0 = expPtr->GetBasisNumModes(0);
auto nm1 = expPtr->GetBasisNumModes(1);
auto nm2 = expPtr->GetBasisNumModes(2);
auto nq0 = expPtr->GetNumPoints(0);
auto nq1 = expPtr->GetNumPoints(1);
auto nq2 = expPtr->GetNumPoints(2);
if (deformed)
{
IProductWRTDerivBase3DKernel<TData, true>(
m_gridSize, m_blockSize, shape, nq0, nq1, nq2, nCoord,
nElmts, Z0, Z1, Z2, m_dfSize, dfptr, inptr[0], inptr[1],
inptr[2], wspptr[0], wspptr[1], wspptr[2]);
IProductWRTBase3DKernel<TData, false, true, true>(
m_gridSize, m_blockSize, shape, nm0, nm1, nm2, nq0, nq1,
nq2, nElmts, correct, dbasis0, basis1, basis2, w0, w1,
w2, jacptr, wspptr[0], outptr);
IProductWRTBase3DKernel<TData, false, true, true>(
m_gridSize, m_blockSize, shape, nm0, nm1, nm2, nq0, nq1,
nq2, nElmts, correct, basis0, dbasis1, basis2, w0, w1,
w2, jacptr, wspptr[1], outptr);
IProductWRTBase3DKernel<TData, false, true, true>(
m_gridSize, m_blockSize, shape, nm0, nm1, nm2, nq0, nq1,
nq2, nElmts, correct, basis0, basis1, dbasis2, w0, w1,
w2, jacptr, wspptr[2], outptr);
}
else
{
IProductWRTDerivBase3DKernel<TData, false>(
m_gridSize, m_blockSize, shape, nq0, nq1, nq2, nCoord,
nElmts, Z0, Z1, Z2, m_dfSize, dfptr, inptr[0], inptr[1],
inptr[2], wspptr[0], wspptr[1], wspptr[2]);
IProductWRTBase3DKernel<TData, false, true, false>(
m_gridSize, m_blockSize, shape, nm0, nm1, nm2, nq0, nq1,
nq2, nElmts, correct, dbasis0, basis1, basis2, w0, w1,
w2, jacptr, wspptr[0], outptr);
IProductWRTBase3DKernel<TData, false, true, false>(
m_gridSize, m_blockSize, shape, nm0, nm1, nm2, nq0, nq1,
nq2, nElmts, correct, basis0, dbasis1, basis2, w0, w1,
w2, jacptr, wspptr[1], outptr);
IProductWRTBase3DKernel<TData, false, true, false>(
m_gridSize, m_blockSize, shape, nm0, nm1, nm2, nq0, nq1,
nq2, nElmts, correct, basis0, basis1, dbasis2, w0, w1,
w2, jacptr, wspptr[2], outptr);
}
}
// Increment pointer and index for next element type.
jacptr += deformed ? nqTot * nElmts : nElmts;
dfptr += deformed ? nqTot * nElmts : nElmts;
for (size_t d = 0; d < nDim; d++)
{
inptr[d] += nqTot * nElmts;
wspptr[d] += nqTot * nElmts;
}
outptr += nmTot * nElmts;
expIdx += nElmts;
}
}
static std::unique_ptr<Operator<TData>> instantiate(
MultiRegions::ExpListSharedPtr expansionList)
{
return std::make_unique<
OperatorIProductWRTDerivBaseImpl<TData, ImplCUDA>>(expansionList);
}
static std::string className;
private:
TData *m_wsp0, *m_wsp1, *m_wsp2;
TData *m_derivFac;
TData *m_jac;
std::map<std::vector<LibUtilities::BasisKey>, std::vector<TData *>> m_basis;
std::map<std::vector<LibUtilities::BasisKey>, std::vector<TData *>>
m_dbasis;
std::map<std::vector<LibUtilities::BasisKey>, std::vector<TData *>>
m_weight;
std::map<std::vector<LibUtilities::BasisKey>, std::vector<TData *>> m_D;
std::map<std::vector<LibUtilities::BasisKey>, std::vector<TData *>> m_Z;
size_t m_dfSize;
size_t m_blockSize = 32;
size_t m_gridSize;
};
template <typename TData, bool DEFORMED>
void IProductWRTDerivBase1DKernel(const size_t gridSize, const size_t blockSize,
const size_t nq0, const size_t nCoord,
const size_t nElmts, const size_t dfSize,
TData *df, TData *in0, TData *in1, TData *in2,
TData *out0)
{
IProductWRTDerivBaseSegKernel<TData, DEFORMED><<<gridSize, blockSize>>>(
nq0, nCoord, nElmts, dfSize, df, in0, in1, in2, out0);
}
template <typename TData, bool DEFORMED>
void IProductWRTDerivBase2DKernel(const size_t gridSize, const size_t blockSize,
LibUtilities::ShapeType shapetype,
const size_t nq0, const size_t nq1,
const size_t nCoord, const size_t nElmts,
const TData *Z0, const TData *Z1,
const size_t dfSize, TData *df, TData *in0,
TData *in1, TData *in2, TData *out0,
TData *out1)
{
if (shapetype == LibUtilities::Quad)
{
IProductWRTDerivBaseQuadKernel<TData, DEFORMED>
<<<gridSize, blockSize>>>(nq0, nq1, nCoord, nElmts, dfSize, df, in0,
in1, in2, out0, out1);
}
else if (shapetype == LibUtilities::Tri)
{
IProductWRTDerivBaseTriKernel<TData, DEFORMED>
<<<gridSize, blockSize>>>(nq0, nq1, nCoord, nElmts, Z0, Z1, dfSize,
df, in0, in1, in2, out0, out1);
}
}
template <typename TData, bool DEFORMED>
void IProductWRTDerivBase3DKernel(
const size_t gridSize, const size_t blockSize,
LibUtilities::ShapeType shapetype, const size_t nq0, const size_t nq1,
const size_t nq2, const size_t nCoord, const size_t nElmts, const TData *Z0,
const TData *Z1, const TData *Z2, const size_t dfSize, TData *df,
TData *in0, TData *in1, TData *in2, TData *out0, TData *out1, TData *out2)
{
if (shapetype == LibUtilities::Hex)
{
IProductWRTDerivBaseHexKernel<TData, DEFORMED>
<<<gridSize, blockSize>>>(nq0, nq1, nq2, nCoord, nElmts, dfSize, df,
in0, in1, in2, out0, out1, out2);
}
else if (shapetype == LibUtilities::Tet)
{
IProductWRTDerivBaseTetKernel<TData, DEFORMED><<<gridSize, blockSize>>>(
nq0, nq1, nq2, nCoord, nElmts, Z0, Z1, Z2, dfSize, df, in0, in1,
in2, out0, out1, out2);
}
else if (shapetype == LibUtilities::Pyr)
{
IProductWRTDerivBasePyrKernel<TData, DEFORMED><<<gridSize, blockSize>>>(
nq0, nq1, nq2, nCoord, nElmts, Z0, Z1, Z2, dfSize, df, in0, in1,
in2, out0, out1, out2);
}
else if (shapetype == LibUtilities::Prism)
{
IProductWRTDerivBasePrismKernel<TData, DEFORMED>
<<<gridSize, blockSize>>>(nq0, nq1, nq2, nCoord, nElmts, Z0, Z2,
dfSize, df, in0, in1, in2, out0, out1,
out2);
}
}
} // namespace Nektar::Operators::detail
namespace Nektar::Operators::detail
{
template <typename TData, bool DEFORMED>
__global__ void IProductWRTDerivBaseSegKernel(const size_t nq0,
const size_t ncoord,
const size_t nelmt,
const size_t dfSize, TData *df,
TData *in0, TData *in1,
TData *in2, TData *out0)
{
size_t e = blockDim.x * blockIdx.x + threadIdx.x;
if (e >= nelmt)
{
return;
}
size_t ndf = ncoord;
// Assign pointers.
TData **inptr = new TData *[ncoord];
inptr[0] = in0 + (nq0 * e);
if (ncoord > 1)
{
inptr[1] = in1 + (nq0 * e);
}
if (ncoord > 2)
{
inptr[2] = in2 + (nq0 * e);
}
TData *outptr0 = out0 + (nq0 * e);
TData **dfptr = new TData *[ndf];
for (size_t d = 0; d < ndf; d++)
{
dfptr[d] = df + d * dfSize;
dfptr[d] += DEFORMED ? nq0 * e : e;
}
// Calculate dxi/dx in[0] + dxi/dy in[1] + dxi/dz in[2]
for (size_t i = 0; i < nq0; ++i)
{
size_t dfindex = DEFORMED ? i : 0;
outptr0[i] = dfptr[0][dfindex] * inptr[0][i];
for (size_t d = 1; d < ncoord; ++d)
{
outptr0[i] += (dfptr[d][dfindex] * inptr[d][i]);
}
}
delete[] inptr;
delete[] dfptr;
}
template <typename TData, bool DEFORMED>
__global__ void IProductWRTDerivBaseQuadKernel(
const size_t nq0, const size_t nq1, const size_t ncoord, const size_t nelmt,
const size_t dfSize, TData *df, TData *in0, TData *in1, TData *in2,
TData *out0, TData *out1)
{
size_t e = blockDim.x * blockIdx.x + threadIdx.x;
if (e >= nelmt)
{
return;
}
const auto ndf = 2 * ncoord;
// Assign pointers.
TData **inptr = new TData *[ncoord];
inptr[0] = in0 + (nq0 * nq1 * e);
inptr[1] = in1 + (nq0 * nq1 * e);
if (ncoord > 2)
{
inptr[2] = in2 + (nq0 * nq1 * e);
}
TData *outptr0 = out0 + (nq0 * nq1 * e);
TData *outptr1 = out1 + (nq0 * nq1 * e);
TData **dfptr = new TData *[ndf];
for (size_t d = 0; d < ndf; d++)
{
dfptr[d] = df + d * dfSize;
dfptr[d] += DEFORMED ? nq0 * nq1 * e : e;
}
// Calculate dxi/dx in[0] + dxi/dy in[1] + dxi/dz in[2]
for (size_t i = 0; i < nq0 * nq1; ++i)
{
size_t dfindex = DEFORMED ? i : 0;
outptr0[i] = dfptr[0][dfindex] * inptr[0][i];
outptr1[i] = dfptr[1][dfindex] * inptr[0][i];
for (size_t d = 1; d < ncoord; ++d)
{
outptr0[i] += (dfptr[2 * d][dfindex] * inptr[d][i]);
outptr1[i] += (dfptr[2 * d + 1][dfindex] * inptr[d][i]);
}
}
delete[] inptr;
delete[] dfptr;
}
template <typename TData, bool DEFORMED>
__global__ void IProductWRTDerivBaseTriKernel(
const size_t nq0, const size_t nq1, const size_t ncoord, const size_t nelmt,
const TData *Z0, const TData *Z1, const size_t dfSize, TData *df,
TData *in0, TData *in1, TData *in2, TData *out0, TData *out1)
{
size_t e = blockDim.x * blockIdx.x + threadIdx.x;
if (e >= nelmt)
{
return;
}
const auto ndf = 2 * ncoord;
// Assign pointers.
TData **inptr = new TData *[ncoord];
inptr[0] = in0 + (nq0 * nq1 * e);
inptr[1] = in1 + (nq0 * nq1 * e);
if (ncoord > 2)
{
inptr[2] = in2 + (nq0 * nq1 * e);
}
TData *outptr0 = out0 + (nq0 * nq1 * e);
TData *outptr1 = out1 + (nq0 * nq1 * e);
TData **dfptr = new TData *[ndf];
for (size_t d = 0; d < ndf; d++)
{
dfptr[d] = df + d * dfSize;
dfptr[d] += DEFORMED ? nq0 * nq1 * e : e;
}
// Calculate dxi/dx in[0] + dxi/dy in[1] + dxi/dz in[2]
for (size_t j = 0, cnt_ji = 0; j < nq1; ++j)
{
TData f0 = 2.0 / (1.0 - Z1[j]);
for (size_t i = 0; i < nq0; ++i, ++cnt_ji)
{
size_t dfindex = DEFORMED ? cnt_ji : 0;
outptr0[cnt_ji] = dfptr[0][dfindex] * inptr[0][cnt_ji];
outptr1[cnt_ji] = dfptr[1][dfindex] * inptr[0][cnt_ji];
for (size_t d = 1; d < ncoord; ++d)
{
outptr0[cnt_ji] += (dfptr[2 * d][dfindex] * inptr[d][cnt_ji]);
outptr1[cnt_ji] +=
(dfptr[2 * d + 1][dfindex] * inptr[d][cnt_ji]);
}
// Multiply by geometric factors
TData f1 = 0.5 * (1.0 + Z0[i]);
outptr0[cnt_ji] += outptr1[cnt_ji] * f1;
outptr0[cnt_ji] *= f0;
}
}
delete[] inptr;
delete[] dfptr;
}
template <typename TData, bool DEFORMED>
__global__ void IProductWRTDerivBaseHexKernel(
const size_t nq0, const size_t nq1, const size_t nq2, const size_t ncoord,
const size_t nelmt, const size_t dfSize, TData *df, TData *in0, TData *in1,
TData *in2, TData *out0, TData *out1, TData *out2)
{
size_t e = blockDim.x * blockIdx.x + threadIdx.x;
if (e >= nelmt)
{
return;
}
const auto ndf = 3 * ncoord;
// Assign pointers.
TData **inptr = new TData *[ncoord];
inptr[0] = in0 + (nq0 * nq1 * nq2 * e);
inptr[1] = in1 + (nq0 * nq1 * nq2 * e);
inptr[2] = in2 + (nq0 * nq1 * nq2 * e);
TData *outptr0 = out0 + (nq0 * nq1 * nq2 * e);
TData *outptr1 = out1 + (nq0 * nq1 * nq2 * e);
TData *outptr2 = out2 + (nq0 * nq1 * nq2 * e);
TData **dfptr = new TData *[ndf];
for (size_t d = 0; d < ndf; d++)
{
dfptr[d] = df + d * dfSize;
dfptr[d] += DEFORMED ? nq0 * nq1 * nq2 * e : e;
}
// Calculate dxi/dx in[0] + dxi/dy in[1] + dxi/dz in[2]
for (size_t i = 0; i < nq0 * nq1 * nq2; ++i)
{
size_t dfindex = DEFORMED ? i : 0;
outptr0[i] = dfptr[0][dfindex] * inptr[0][i];
outptr1[i] = dfptr[1][dfindex] * inptr[0][i];
outptr2[i] = dfptr[2][dfindex] * inptr[0][i];
for (size_t d = 1; d < ncoord; ++d)
{
outptr0[i] += (dfptr[3 * d][dfindex] * inptr[d][i]);
outptr1[i] += (dfptr[3 * d + 1][dfindex] * inptr[d][i]);
outptr2[i] += (dfptr[3 * d + 2][dfindex] * inptr[d][i]);
}
}
delete[] inptr;
delete[] dfptr;
}
template <typename TData, bool DEFORMED>
__global__ void IProductWRTDerivBaseTetKernel(
const size_t nq0, const size_t nq1, const size_t nq2, const size_t ncoord,
const size_t nelmt, const TData *Z0, const TData *Z1, const TData *Z2,
const size_t dfSize, TData *df, TData *in0, TData *in1, TData *in2,
TData *out0, TData *out1, TData *out2)
{
size_t e = blockDim.x * blockIdx.x + threadIdx.x;
if (e >= nelmt)
{
return;
}
const auto ndf = 3 * ncoord;
// Assign pointers.
TData **inptr = new TData *[ncoord];
inptr[0] = in0 + (nq0 * nq1 * nq2 * e);
inptr[1] = in1 + (nq0 * nq1 * nq2 * e);
inptr[2] = in2 + (nq0 * nq1 * nq2 * e);
TData *outptr0 = out0 + (nq0 * nq1 * nq2 * e);
TData *outptr1 = out1 + (nq0 * nq1 * nq2 * e);
TData *outptr2 = out2 + (nq0 * nq1 * nq2 * e);
TData **dfptr = new TData *[ndf];
for (size_t d = 0; d < ndf; d++)
{
dfptr[d] = df + d * dfSize;
dfptr[d] += DEFORMED ? nq0 * nq1 * nq2 * e : e;
}
// Calculate dxi/dx in[0] + dxi/dy in[1] + dxi/dz in[2]
for (size_t k = 0, cnt_kji = 0; k < nq2; ++k)
{
TData f2 = 2.0 / (1.0 - Z2[k]);
for (size_t j = 0; j < nq1; ++j)
{
TData f3 = 0.5 * (1.0 + Z1[j]);
TData f0 = 2.0 * f2 / (1.0 - Z1[j]);
for (size_t i = 0; i < nq0; ++i, ++cnt_kji)
{
size_t dfindex = DEFORMED ? cnt_kji : 0;
outptr0[cnt_kji] = dfptr[0][dfindex] * inptr[0][cnt_kji];
outptr1[cnt_kji] = dfptr[1][dfindex] * inptr[0][cnt_kji];
outptr2[cnt_kji] = dfptr[2][dfindex] * inptr[0][cnt_kji];
for (size_t d = 1; d < ncoord; ++d)
{
outptr0[cnt_kji] +=
(dfptr[3 * d][dfindex] * inptr[d][cnt_kji]);
outptr1[cnt_kji] +=
(dfptr[3 * d + 1][dfindex] * inptr[d][cnt_kji]);
outptr2[cnt_kji] +=
(dfptr[3 * d + 2][dfindex] * inptr[d][cnt_kji]);
}
// Multiply by geometric factors
TData f1 = 0.5 * (1.0 + Z0[i]);
outptr0[cnt_kji] += (outptr1[cnt_kji] + outptr2[cnt_kji]) * f1;
outptr0[cnt_kji] *= f0;
outptr1[cnt_kji] += outptr2[cnt_kji] * f3;
outptr1[cnt_kji] *= f2;
}
}
}
delete[] inptr;
delete[] dfptr;
}
template <typename TData, bool DEFORMED>
__global__ void IProductWRTDerivBasePrismKernel(
const size_t nq0, const size_t nq1, const size_t nq2, const size_t ncoord,
const size_t nelmt, const TData *Z0, const TData *Z2, const size_t dfSize,
TData *df, TData *in0, TData *in1, TData *in2, TData *out0, TData *out1,
TData *out2)
{
size_t e = blockDim.x * blockIdx.x + threadIdx.x;
if (e >= nelmt)
{
return;
}
const auto ndf = 3 * ncoord;
// Assign pointers.
TData **inptr = new TData *[ncoord];
inptr[0] = in0 + (nq0 * nq1 * nq2 * e);
inptr[1] = in1 + (nq0 * nq1 * nq2 * e);
inptr[2] = in2 + (nq0 * nq1 * nq2 * e);
TData *outptr0 = out0 + (nq0 * nq1 * nq2 * e);
TData *outptr1 = out1 + (nq0 * nq1 * nq2 * e);
TData *outptr2 = out2 + (nq0 * nq1 * nq2 * e);
TData **dfptr = new TData *[ndf];
for (size_t d = 0; d < ndf; d++)
{
dfptr[d] = df + d * dfSize;
dfptr[d] += DEFORMED ? nq0 * nq1 * nq2 * e : e;
}
// Calculate dxi/dx in[0] + dxi/dy in[1] + dxi/dz in[2]
for (size_t k = 0, cnt_kji = 0; k < nq2; ++k)
{
TData f0 = 2.0 / (1.0 - Z2[k]);
for (size_t j = 0; j < nq1; ++j)
{
for (size_t i = 0; i < nq0; ++i, ++cnt_kji)
{
size_t dfindex = DEFORMED ? cnt_kji : 0;
outptr0[cnt_kji] = dfptr[0][dfindex] * inptr[0][cnt_kji];
outptr1[cnt_kji] = dfptr[1][dfindex] * inptr[0][cnt_kji];
outptr2[cnt_kji] = dfptr[2][dfindex] * inptr[0][cnt_kji];
for (size_t d = 1; d < ncoord; ++d)
{
outptr0[cnt_kji] +=
(dfptr[3 * d][dfindex] * inptr[d][cnt_kji]);
outptr1[cnt_kji] +=
(dfptr[3 * d + 1][dfindex] * inptr[d][cnt_kji]);
outptr2[cnt_kji] +=
(dfptr[3 * d + 2][dfindex] * inptr[d][cnt_kji]);
}
// Multiply by geometric factors
TData f1 = 0.5 * (1.0 + Z0[i]);
outptr0[cnt_kji] += outptr2[cnt_kji] * f1;
outptr0[cnt_kji] *= f0;
}
}
}
delete[] inptr;
delete[] dfptr;
}
template <typename TData, bool DEFORMED>
__global__ void IProductWRTDerivBasePyrKernel(
const size_t nq0, const size_t nq1, const size_t nq2, const size_t ncoord,
const size_t nelmt, const TData *Z0, const TData *Z1, const TData *Z2,
const size_t dfSize, TData *df, TData *in0, TData *in1, TData *in2,
TData *out0, TData *out1, TData *out2)
{
size_t e = blockDim.x * blockIdx.x + threadIdx.x;
if (e >= nelmt)
{
return;
}
const auto ndf = 3 * ncoord;
// Assign pointers.
TData **inptr = new TData *[ncoord];
inptr[0] = in0 + (nq0 * nq1 * nq2 * e);
inptr[1] = in1 + (nq0 * nq1 * nq2 * e);
inptr[2] = in2 + (nq0 * nq1 * nq2 * e);
TData *outptr0 = out0 + (nq0 * nq1 * nq2 * e);
TData *outptr1 = out1 + (nq0 * nq1 * nq2 * e);
TData *outptr2 = out2 + (nq0 * nq1 * nq2 * e);
TData **dfptr = new TData *[ndf];
for (size_t d = 0; d < ndf; d++)
{
dfptr[d] = df + d * dfSize;
dfptr[d] += DEFORMED ? nq0 * nq1 * nq2 * e : e;
}
// Calculate dxi/dx in[0] + dxi/dy in[1] + dxi/dz in[2]
for (size_t k = 0, cnt_kji = 0; k < nq2; ++k)
{
TData f0 = 2.0 / (1.0 - Z2[k]);
for (size_t j = 0; j < nq1; ++j)
{
TData f2 = 0.5 * (1.0 + Z1[j]);
for (size_t i = 0; i < nq0; ++i, ++cnt_kji)
{
size_t dfindex = DEFORMED ? cnt_kji : 0;
outptr0[cnt_kji] = dfptr[0][dfindex] * inptr[0][cnt_kji];
outptr1[cnt_kji] = dfptr[1][dfindex] * inptr[0][cnt_kji];
outptr2[cnt_kji] = dfptr[2][dfindex] * inptr[0][cnt_kji];
for (size_t d = 1; d < ncoord; ++d)
{
outptr0[cnt_kji] +=
(dfptr[3 * d][dfindex] * inptr[d][cnt_kji]);
outptr1[cnt_kji] +=
(dfptr[3 * d + 1][dfindex] * inptr[d][cnt_kji]);
outptr2[cnt_kji] +=
(dfptr[3 * d + 2][dfindex] * inptr[d][cnt_kji]);
}
// Multiply by geometric factors
TData f1 = 0.5 * (1.0 + Z0[i]);
outptr0[cnt_kji] += outptr2[cnt_kji] * f1;
outptr0[cnt_kji] *= f0;
outptr1[cnt_kji] += outptr2[cnt_kji] * f2;
outptr1[cnt_kji] *= f0;
}
}
}
delete[] inptr;
delete[] dfptr;
}
} // namespace Nektar::Operators::detail
#include "IProductWRTDerivBaseStdMat.hpp"
namespace Nektar::Operators::detail
{
// Add different IProductWRTDerivBase implementations to the factory.
template <>
std::string OperatorIProductWRTDerivBaseImpl<double, ImplStdMat>::className =
GetOperatorFactory<double>().RegisterCreatorFunction(
"IProductWRTDerivBaseStdMat",
OperatorIProductWRTDerivBaseImpl<double, ImplStdMat>::instantiate,
"...");
} // namespace Nektar::Operators::detail
#include "Operators/OperatorIProductWRTDerivBase.hpp"
#include <StdRegions/StdExpansion.h>
namespace Nektar::Operators::detail
{
// standard matrix implementation
template <typename TData>
class OperatorIProductWRTDerivBaseImpl<TData, ImplStdMat>
: public OperatorIProductWRTDerivBase<TData>
{
public:
OperatorIProductWRTDerivBaseImpl(
const MultiRegions::ExpListSharedPtr &expansionList)
: OperatorIProductWRTDerivBase<TData>(std::move(expansionList))
{
size_t nTotElmts = this->m_expansionList->GetNumElmts();
size_t nDim = this->m_expansionList->GetShapeDimension();
// Initialise jacobian.
size_t jacSize = Operator<TData>::GetGeometricFactorSize();
m_jac = Operator<TData>::SetJacobian(jacSize);
m_derivFac = Operator<TData>::SetDerivativeFactor(jacSize);
// Initialize basiskey.
std::vector<LibUtilities::BasisKey> basisKeys(
3, LibUtilities::NullBasisKey);
// Loop over the elements of expansionList.
for (size_t e = 0; e < nTotElmts; ++e)
{
auto const expPtr = this->m_expansionList->GetExp(e);
// Fetch basiskeys of current element.
for (size_t d = 0; d < nDim; d++)
{
basisKeys[d] = expPtr->GetBasis(d)->GetBasisKey();
}
// Copy data to m_matPtr, if necessary.
if (m_matPtr.find(basisKeys) == m_matPtr.end())
{
size_t nqTot = expPtr->GetTotPoints();
size_t nmTot = expPtr->GetNcoeffs();
auto &matPtr = m_matPtr[basisKeys];
matPtr = Array<OneD, Array<OneD, TData>>(nDim);
Array<OneD, NekDouble> tmp(nqTot), t;
for (size_t d = 0; d < nDim; ++d)
{
// Get IProductWRTDerivBase matrix.
matPtr[d] = Array<OneD, TData>(nqTot * nmTot);
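                    // Build the (nmTot x nqTot) standard-region matrix column
                    // by column: a unit impulse at quadrature point i yields
                    // column i of the IProductWRTDerivBase operator.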
for (size_t i = 0; i < nqTot; ++i)
{
Vmath::Zero(nqTot, tmp, 1);
tmp[i] = 1.0;
expPtr->GetStdExp()->IProductWRTDerivBase(
d, tmp, t = matPtr[d] + i * nmTot);
}
}
}
}
// Initialize workspace memory.
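        // The workspace holds, for each reference direction, the physical
        // values premultiplied by the derivative factors and the Jacobian
        // before the matrix products in apply().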
auto ndata = this->m_expansionList->GetTotPoints();
m_wsp = Array<OneD, Array<OneD, NekDouble>>(3);
for (size_t i = 0; i < nDim; ++i)
{
m_wsp[i] = Array<OneD, NekDouble>(ndata);
}
}
void apply(Field<TData, FieldState::Phys> &in0,
Field<TData, FieldState::Phys> &in1,
Field<TData, FieldState::Phys> &in2,
Field<TData, FieldState::Coeff> &out,
bool APPEND = false) override
{
        // Get raw CPU pointers to the data (synchronising from GPU memory if
        // necessary).
std::vector<TData *> inptr{in0.GetStorage().GetCPUPtr(),
in1.GetStorage().GetCPUPtr(),
in2.GetStorage().GetCPUPtr()};
auto *outptr = out.GetStorage().GetCPUPtr();
std::vector<TData *> wspptr{m_wsp[0].get(), m_wsp[1].get(),
m_wsp[2].get()};
// Initialize index.
size_t expIdx = 0;
size_t jacIdx = 0;
size_t dfIdx = 0;
// Initialize basiskey.
std::vector<LibUtilities::BasisKey> basisKeys(
3, LibUtilities::NullBasisKey);
// Loop over the blocks.
for (size_t block_idx = 0; block_idx < out.GetBlocks().size();
++block_idx)
{
// Determine shape and type of the element.
auto const expPtr = this->m_expansionList->GetExp(expIdx);
auto nElmts = out.GetBlocks()[block_idx].num_elements;
auto nDim = expPtr->GetShapeDimension();
auto nCoord = expPtr->GetCoordim();
auto nqTot = expPtr->GetTotPoints();
auto nmTot = expPtr->GetNcoeffs();
auto deformed = expPtr->GetMetricInfo()->GetGtype() ==
SpatialDomains::eDeformed;
            // Calculate dxi/dx in[0] + dxi/dy in[1] + dxi/dz in[2]
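            // For deformed elements the derivative factors vary per quadrature
            // point, while regular (affine) elements store a single factor per
            // element, hence the two branches below.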
if (deformed)
{
for (size_t d = 0; d < nDim; ++d)
{
Vmath::Vmul(nqTot * nElmts, m_derivFac[d].get() + dfIdx, 1,
inptr[0], 1, wspptr[d], 1);
for (size_t i = 1; i < nCoord; ++i)
{
Vmath::Vvtvp(nqTot * nElmts,
m_derivFac[d + i * nDim].get() + dfIdx, 1,
inptr[i], 1, wspptr[d], 1, wspptr[d], 1);
}
}
}
else
{
for (size_t e = 0; e < nElmts; ++e)
{
for (size_t d = 0; d < nDim; ++d)
{
Vmath::Smul(nqTot, m_derivFac[d][dfIdx + e],
inptr[0] + e * nqTot, 1,
wspptr[d] + e * nqTot, 1);
for (size_t i = 1; i < nCoord; ++i)
{
Vmath::Svtvp(
nqTot, m_derivFac[d + i * nDim][dfIdx + e],
inptr[i] + e * nqTot, 1, wspptr[d] + e * nqTot,
1, wspptr[d] + e * nqTot, 1);
}
}
}
}
// Multiply by jacobian.
if (deformed)
{
for (size_t d = 0; d < nDim; ++d)
{
Vmath::Vmul(nqTot * nElmts, m_jac.get() + jacIdx, 1,
wspptr[d], 1, wspptr[d], 1);
}
}
else
{
for (size_t e = 0; e < nElmts; ++e)
{
for (size_t d = 0; d < nDim; ++d)
{
Vmath::Smul(nqTot, m_jac[jacIdx + e],
wspptr[d] + e * nqTot, 1,
wspptr[d] + e * nqTot, 1);
}
}
}
// Fetch basis key for the current element type.
for (size_t d = 0; d < nDim; d++)
{
basisKeys[d] = expPtr->GetBasis(d)->GetBasisKey();
}
// Fetch matrix.
auto matPtr = m_matPtr[basisKeys];
// Matrix products.
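            // Each GEMM computes out += M_d * wsp_d, with M_d of size
            // nmTot x nqTot and wsp_d viewed as an nqTot x nElmts matrix;
            // beta selects between overwriting and accumulating into out.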
for (size_t d = 0; d < nDim; d++)
{
                TData beta = (d == 0 && !APPEND) ? 0.0 : 1.0;
                Blas::Dgemm('N', 'N', nmTot, nElmts, nqTot, 1.0,
                            matPtr[d].get(), nmTot, wspptr[d], nqTot, beta,
                            outptr, nmTot);
                // Advance the input and workspace pointers for this direction
                // to the next block.
                inptr[d] += nqTot * nElmts;
                wspptr[d] += nqTot * nElmts;
}
jacIdx += deformed ? nqTot * nElmts : nElmts;
dfIdx += deformed ? nqTot * nElmts : nElmts;
outptr += nmTot * nElmts;
expIdx += nElmts;
}
}
static std::unique_ptr<Operator<TData>> instantiate(
const MultiRegions::ExpListSharedPtr &expansionList)
{
return std::make_unique<
OperatorIProductWRTDerivBaseImpl<TData, ImplStdMat>>(
std::move(expansionList));
}
static std::string className;
private:
Array<OneD, TData> m_jac;
Array<OneD, Array<OneD, TData>> m_derivFac;
std::map<std::vector<LibUtilities::BasisKey>,
Array<OneD, Array<OneD, TData>>>
m_matPtr;
Array<OneD, Array<OneD, TData>> m_wsp;
};
} // namespace Nektar::Operators::detail
...@@ -60,6 +60,104 @@ public: ...@@ -60,6 +60,104 @@ public:
} }
protected: protected:
    size_t GetGeometricFactorSize()
{
size_t gfSize = 0;
size_t nTotElmts = this->m_expansionList->GetNumElmts();
// Calculate the jacobian array size
for (size_t e = 0; e < nTotElmts; ++e)
{
// Determine shape and type of the element
auto const expPtr = this->m_expansionList->GetExp(e);
if (expPtr->GetMetricInfo()->GetGtype() ==
SpatialDomains::eDeformed)
{
gfSize += expPtr->GetTotPoints();
}
else
{
gfSize++;
}
}
return gfSize;
}
Array<OneD, TData> SetJacobian(size_t jacSize)
{
// Allocate memory for the jacobian
Array<OneD, TData> jac(jacSize, 0.0);
// Initialise jacobian.
size_t index = 0;
size_t nTotElmts = this->m_expansionList->GetNumElmts();
for (size_t e = 0; e < nTotElmts; ++e)
{
auto expPtr = this->m_expansionList->GetExp(e);
auto &auxJac =
expPtr->GetMetricInfo()->GetJac(expPtr->GetPointsKeys());
if (expPtr->GetMetricInfo()->GetGtype() ==
SpatialDomains::eDeformed)
{
size_t nqe = expPtr->GetTotPoints();
for (size_t i = 0; i < nqe; ++i)
{
jac[index++] = auxJac[i];
}
}
else
{
jac[index++] = auxJac[0];
}
}
return jac;
}
Array<OneD, Array<OneD, TData>> SetDerivativeFactor(size_t dfSize)
{
// Allocate memory for the derivative factor
size_t nDim = this->m_expansionList->GetShapeDimension();
size_t nCoord = this->m_expansionList->GetCoordim(0);
Array<OneD, Array<OneD, TData>> derivFac(nDim * nCoord);
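        // derivFac[d + i * nDim] holds dxi_d/dx_i (reference direction d,
        // Cartesian coordinate i), matching the ordering of GetDerivFactors
        // and the indexing used by the operator implementations.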
for (size_t d = 0; d < nDim * nCoord; d++)
{
derivFac[d] = Array<OneD, TData>(dfSize);
}
// Initialise derivative factor.
size_t dfindex = 0;
size_t nTotElmts = this->m_expansionList->GetNumElmts();
for (size_t e = 0; e < nTotElmts; ++e)
{
auto expPtr = this->m_expansionList->GetExp(e);
auto &df = expPtr->GetMetricInfo()->GetDerivFactors(
expPtr->GetPointsKeys());
size_t nqTot = expPtr->GetTotPoints();
if (expPtr->GetMetricInfo()->GetGtype() ==
SpatialDomains::eDeformed)
{
for (size_t d = 0; d < nDim * nCoord; d++)
{
for (size_t i = 0; i < nqTot; ++i)
{
derivFac[d][dfindex + i] = df[d][i];
}
}
dfindex += nqTot;
}
else
{
for (size_t d = 0; d < nDim * nCoord; d++)
{
derivFac[d][dfindex] = df[d][0];
}
dfindex += 1;
}
}
return derivFac;
}
MultiRegions::ExpListSharedPtr m_expansionList; MultiRegions::ExpListSharedPtr m_expansionList;
}; };
......
#include "OperatorHelmholtz.hpp"
namespace Nektar::Operators
{
template <> const std::string Helmholtz<>::key = "Helmholtz";
template <> const std::string Helmholtz<>::default_impl = "MatFree";
} // namespace Nektar::Operators
#pragma once
#include "Field.hpp"
#include "Operator.hpp"
namespace Nektar::Operators
{
// Helmholtz base class
// Defines the apply operator to enforce apply parameter types
template <typename TData> class OperatorHelmholtz : public Operator<TData>
{
public:
virtual ~OperatorHelmholtz() = default;
OperatorHelmholtz(const MultiRegions::ExpListSharedPtr &expansionList)
: Operator<TData>(expansionList)
{
}
virtual void apply(Field<TData, FieldState::Coeff> &in,
Field<TData, FieldState::Coeff> &out) = 0;
virtual void operator()(Field<TData, FieldState::Coeff> &in,
Field<TData, FieldState::Coeff> &out)
{
apply(in, out);
}
};
// Descriptor / traits class for Helmholtz
template <typename TData = default_fp_type> struct Helmholtz
{
using class_name = OperatorHelmholtz<TData>;
using FieldIn = Field<TData, FieldState::Coeff>;
using FieldOut = Field<TData, FieldState::Coeff>;
static const std::string key;
static const std::string default_impl;
static std::shared_ptr<class_name> create(
const MultiRegions::ExpListSharedPtr &expansionList,
std::string pKey = "")
{
return Operator<TData>::template create<Helmholtz>(expansionList, pKey);
}
};
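// Minimal usage sketch (assuming an ExpListSharedPtr expList and two
// coefficient-space fields in/out have already been set up):
//   auto helm = Helmholtz<>::create(expList); // uses default_impl
//   (*helm)(in, out);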
namespace detail
{
// Template for Helmholtz implementations
template <typename TData, typename Op> class OperatorHelmholtzImpl;
} // namespace detail
} // namespace Nektar::Operators