Skip to content
Snippets Groups Projects

Compare revisions

Changes are shown as if the source revision was being merged into the target revision. Learn more about comparing revisions.

Source

Select target project
No results found

Target

Select target project
  • nektar/redesign-prototypes
1 result
Show changes
Commits on Source (6)
Showing
with 1509 additions and 139 deletions
......@@ -26,12 +26,16 @@ message(STATUS "Found Nektar++: version ${NEKTAR++_VERSION}")
set(CMAKE_INSTALL_RPATH "${NEKTAR++_LIBRARY_DIRS}")
set(SRC Field.cpp Operators/Operator.cpp Operators/OperatorBwdTrans.cpp Operators/BwdTrans/BwdTransImpl.cpp Operators/OperatorIProductWRTBase.cpp Operators/IProductWRTBase/IProductWRTBaseImpl.cpp Operators/OperatorPhysDeriv.cpp Operators/PhysDeriv/PhysDerivImpl.cpp)
set(SRC Field.cpp Operators/Operator.cpp Operators/OperatorBwdTrans.cpp Operators/BwdTrans/BwdTransImpl.cpp Operators/OperatorIProductWRTBase.cpp Operators/IProductWRTBase/IProductWRTBaseImpl.cpp Operators/OperatorIProductWRTDerivBase.cpp Operators/IProductWRTDerivBase/IProductWRTDerivBaseImpl.cpp Operators/OperatorPhysDeriv.cpp Operators/PhysDeriv/PhysDerivImpl.cpp)
if (NEKTAR_USE_CUDA AND NEKTAR_USE_SIMD)
MESSAGE(FATAL_ERROR "Cannot use both SIMD and CUDA")
endif()
if (NEKTAR_USE_CUDA)
enable_language(CUDA)
add_definitions(-DNEKTAR_USE_CUDA)
set(SRC ${SRC} MemoryRegionCUDA.cu Operators/BwdTrans/BwdTransCUDA.cu Operators/IProductWRTBase/IProductWRTBaseCUDA.cu Operators/PhysDeriv/PhysDerivCUDA.cu)
set(SRC ${SRC} MemoryRegionCUDA.cu Operators/BwdTrans/BwdTransCUDA.cu Operators/IProductWRTBase/IProductWRTBaseCUDA.cu Operators/IProductWRTDerivBase/IProductWRTDerivBaseCUDA.cu Operators/PhysDeriv/PhysDerivCUDA.cu)
endif()
if (NEKTAR_USE_SIMD)
......
#pragma once
#include "MemoryRegionCUDA.hpp"
#include "Operators/IProductWRTBase/IProductWRTBaseCUDAKernels.cuh"
#include "Operators/OperatorHelper.cuh"
......@@ -6,14 +8,16 @@
namespace Nektar::Operators::detail
{
template <typename TData, bool APPEND = false, bool DEFORMED = false>
template <typename TData, bool SCALE = false, bool APPEND = false,
bool DEFORMED = false>
void IProductWRTBase1DKernel(const size_t gridSize, const size_t blockSize,
const size_t nm0, const size_t nq0,
const size_t nElmts, const TData *basis0,
const TData *w0, const TData *jac, const TData *in,
TData *out);
TData *out, TData scale = 1.0);
template <typename TData, bool APPEND = false, bool DEFORMED = false>
template <typename TData, bool SCALE = false, bool APPEND = false,
bool DEFORMED = false>
void IProductWRTBase2DKernel(const size_t gridSize, const size_t blockSize,
LibUtilities::ShapeType shapetype,
const size_t nm0, const size_t nm1,
......@@ -21,16 +25,20 @@ void IProductWRTBase2DKernel(const size_t gridSize, const size_t blockSize,
const size_t nElmts, const bool correct,
const TData *basis0, const TData *basis1,
const TData *w0, const TData *w1, const TData *jac,
const TData *in, TData *out);
const TData *in, TData *out, TData scale = 1.0);
template <typename TData, bool APPEND = false, bool DEFORMED = false>
void IProductWRTBase3DKernel(
const size_t gridSize, const size_t blockSize,
LibUtilities::ShapeType shapetype, const size_t nm0, const size_t nm1,
const size_t nm2, const size_t nq0, const size_t nq1, const size_t nq2,
const size_t nElmts, const bool correct, const TData *basis0,
const TData *basis1, const TData *basis2, const TData *w0, const TData *w1,
const TData *w2, const TData *jac, const TData *in, TData *out);
template <typename TData, bool SCALE = false, bool APPEND = false,
bool DEFORMED = false>
void IProductWRTBase3DKernel(const size_t gridSize, const size_t blockSize,
LibUtilities::ShapeType shapetype,
const size_t nm0, const size_t nm1,
const size_t nm2, const size_t nq0,
const size_t nq1, const size_t nq2,
const size_t nElmts, const bool correct,
const TData *basis0, const TData *basis1,
const TData *basis2, const TData *w0,
const TData *w1, const TData *w2, const TData *jac,
const TData *in, TData *out, TData scale = 1.0);
// IProductWRTBase implementation
template <typename TData>
......@@ -49,7 +57,7 @@ public:
cudaMemcpy(m_jac, jac.get(), sizeof(TData) * jacSize,
cudaMemcpyHostToDevice);
// Initialize basiskey.
// Initialize basis.
m_basis = GetBasisDataCUDA<TData>(expansionList);
// Initialize weight.
......@@ -58,25 +66,14 @@ public:
~OperatorIProductWRTBaseImpl(void)
{
for (auto &basis : m_basis)
{
for (size_t i = 0; i < basis.second.size(); i++)
{
cudaFree(basis.second[i]);
}
}
for (auto &weight : m_weight)
{
for (size_t i = 0; i < weight.second.size(); i++)
{
cudaFree(weight.second[i]);
}
}
DeallocateDataCUDA<TData>(m_basis);
DeallocateDataCUDA<TData>(m_weight);
cudaFree(m_jac);
}
void apply(Field<TData, FieldState::Phys> &in,
Field<TData, FieldState::Coeff> &out) override
Field<TData, FieldState::Coeff> &out,
const TData lambda = 1.0) override
{
// Copy memory to GPU, if necessary and get raw pointers.
auto const *inptr =
......@@ -126,15 +123,33 @@ public:
auto nq0 = expPtr->GetNumPoints(0);
if (deformed)
{
IProductWRTBase1DKernel<TData, false, true>(
m_gridSize, m_blockSize, nm0, nq0, nElmts, basis0, w0,
jacptr, inptr, outptr);
if (lambda == 1.0)
{
IProductWRTBase1DKernel<TData, false, false, true>(
m_gridSize, m_blockSize, nm0, nq0, nElmts, basis0,
w0, jacptr, inptr, outptr);
}
else
{
IProductWRTBase1DKernel<TData, true, false, true>(
m_gridSize, m_blockSize, nm0, nq0, nElmts, basis0,
w0, jacptr, inptr, outptr, lambda);
}
}
else
{
IProductWRTBase1DKernel<TData, false, false>(
m_gridSize, m_blockSize, nm0, nq0, nElmts, basis0, w0,
jacptr, inptr, outptr);
if (lambda == 1.0)
{
IProductWRTBase1DKernel<TData, false, false, false>(
m_gridSize, m_blockSize, nm0, nq0, nElmts, basis0,
w0, jacptr, inptr, outptr);
}
else
{
IProductWRTBase1DKernel<TData, true, false, false>(
m_gridSize, m_blockSize, nm0, nq0, nElmts, basis0,
w0, jacptr, inptr, outptr, lambda);
}
}
}
else if (expPtr->GetShapeDimension() == 2)
......@@ -150,17 +165,37 @@ public:
auto nq1 = expPtr->GetNumPoints(1);
if (deformed)
{
IProductWRTBase2DKernel<TData, false, true>(
m_gridSize, m_blockSize, shape, nm0, nm1, nq0, nq1,
nElmts, correct, basis0, basis1, w0, w1, jacptr, inptr,
outptr);
if (lambda == 1.0)
{
IProductWRTBase2DKernel<TData, false, false, true>(
m_gridSize, m_blockSize, shape, nm0, nm1, nq0, nq1,
nElmts, correct, basis0, basis1, w0, w1, jacptr,
inptr, outptr);
}
else
{
IProductWRTBase2DKernel<TData, true, false, true>(
m_gridSize, m_blockSize, shape, nm0, nm1, nq0, nq1,
nElmts, correct, basis0, basis1, w0, w1, jacptr,
inptr, outptr, lambda);
}
}
else
{
IProductWRTBase2DKernel<TData, false, false>(
m_gridSize, m_blockSize, shape, nm0, nm1, nq0, nq1,
nElmts, correct, basis0, basis1, w0, w1, jacptr, inptr,
outptr);
if (lambda == 1.0)
{
IProductWRTBase2DKernel<TData, false, false, false>(
m_gridSize, m_blockSize, shape, nm0, nm1, nq0, nq1,
nElmts, correct, basis0, basis1, w0, w1, jacptr,
inptr, outptr);
}
else
{
IProductWRTBase2DKernel<TData, true, false, false>(
m_gridSize, m_blockSize, shape, nm0, nm1, nq0, nq1,
nElmts, correct, basis0, basis1, w0, w1, jacptr,
inptr, outptr, lambda);
}
}
}
else if (expPtr->GetShapeDimension() == 3)
......@@ -180,17 +215,37 @@ public:
auto nq2 = expPtr->GetNumPoints(2);
if (deformed)
{
IProductWRTBase3DKernel<TData, false, true>(
m_gridSize, m_blockSize, shape, nm0, nm1, nm2, nq0, nq1,
nq2, nElmts, correct, basis0, basis1, basis2, w0, w1,
w2, jacptr, inptr, outptr);
if (lambda == 1.0)
{
IProductWRTBase3DKernel<TData, false, false, true>(
m_gridSize, m_blockSize, shape, nm0, nm1, nm2, nq0,
nq1, nq2, nElmts, correct, basis0, basis1, basis2,
w0, w1, w2, jacptr, inptr, outptr);
}
else
{
IProductWRTBase3DKernel<TData, true, false, true>(
m_gridSize, m_blockSize, shape, nm0, nm1, nm2, nq0,
nq1, nq2, nElmts, correct, basis0, basis1, basis2,
w0, w1, w2, jacptr, inptr, outptr, lambda);
}
}
else
{
IProductWRTBase3DKernel<TData, false, false>(
m_gridSize, m_blockSize, shape, nm0, nm1, nm2, nq0, nq1,
nq2, nElmts, correct, basis0, basis1, basis2, w0, w1,
w2, jacptr, inptr, outptr);
if (lambda == 1.0)
{
IProductWRTBase3DKernel<TData, false, false, false>(
m_gridSize, m_blockSize, shape, nm0, nm1, nm2, nq0,
nq1, nq2, nElmts, correct, basis0, basis1, basis2,
w0, w1, w2, jacptr, inptr, outptr);
}
else
{
IProductWRTBase3DKernel<TData, true, false, false>(
m_gridSize, m_blockSize, shape, nm0, nm1, nm2, nq0,
nq1, nq2, nElmts, correct, basis0, basis1, basis2,
w0, w1, w2, jacptr, inptr, outptr, lambda);
}
}
}
......@@ -220,18 +275,19 @@ private:
size_t m_gridSize;
};
template <typename TData, bool APPEND, bool DEFORMED>
template <typename TData, bool SCALE, bool APPEND, bool DEFORMED>
void IProductWRTBase1DKernel(const size_t gridSize, const size_t blockSize,
const size_t nm0, const size_t nq0,
const size_t nElmts, const TData *basis0,
const TData *w0, const TData *jac, const TData *in,
TData *out)
TData *out, TData scale)
{
IProductWRTBaseSegKernel<TData, false, APPEND, DEFORMED>
<<<gridSize, blockSize>>>(nm0, nq0, nElmts, basis0, w0, jac, in, out);
IProductWRTBaseSegKernel<TData, SCALE, APPEND, DEFORMED>
<<<gridSize, blockSize>>>(nm0, nq0, nElmts, basis0, w0, jac, in, out,
scale);
}
template <typename TData, bool APPEND, bool DEFORMED>
template <typename TData, bool SCALE, bool APPEND, bool DEFORMED>
void IProductWRTBase2DKernel(const size_t gridSize, const size_t blockSize,
LibUtilities::ShapeType shapetype,
const size_t nm0, const size_t nm1,
......@@ -239,67 +295,67 @@ void IProductWRTBase2DKernel(const size_t gridSize, const size_t blockSize,
const size_t nElmts, const bool correct,
const TData *basis0, const TData *basis1,
const TData *w0, const TData *w1, const TData *jac,
const TData *in, TData *out)
const TData *in, TData *out, TData scale)
{
if (shapetype == LibUtilities::Quad)
{
IProductWRTBaseQuadKernel<TData, false, APPEND, DEFORMED>
IProductWRTBaseQuadKernel<TData, SCALE, APPEND, DEFORMED>
<<<gridSize, blockSize>>>(nm0, nm1, nq0, nq1, nElmts, basis0,
basis1, w0, w1, jac, in, out);
basis1, w0, w1, jac, in, out, scale);
}
else if (shapetype == LibUtilities::Tri)
{
size_t nmTot =
LibUtilities::StdTriData::getNumberOfCoefficients(nm0, nm1);
IProductWRTBaseTriKernel<TData, false, APPEND, DEFORMED>
IProductWRTBaseTriKernel<TData, SCALE, APPEND, DEFORMED>
<<<gridSize, blockSize>>>(nm0, nm1, nmTot, nq0, nq1, nElmts,
correct, basis0, basis1, w0, w1, jac, in,
out);
out, scale);
}
}
template <typename TData, bool APPEND, bool DEFORMED>
template <typename TData, bool SCALE, bool APPEND, bool DEFORMED>
void IProductWRTBase3DKernel(
const size_t gridSize, const size_t blockSize,
LibUtilities::ShapeType shapetype, const size_t nm0, const size_t nm1,
const size_t nm2, const size_t nq0, const size_t nq1, const size_t nq2,
const size_t nElmts, const bool correct, const TData *basis0,
const TData *basis1, const TData *basis2, const TData *w0, const TData *w1,
const TData *w2, const TData *jac, const TData *in, TData *out)
const TData *w2, const TData *jac, const TData *in, TData *out, TData scale)
{
if (shapetype == LibUtilities::Hex)
{
IProductWRTBaseHexKernel<TData, false, APPEND, DEFORMED>
IProductWRTBaseHexKernel<TData, SCALE, APPEND, DEFORMED>
<<<gridSize, blockSize>>>(nm0, nm1, nm2, nq0, nq1, nq2, nElmts,
basis0, basis1, basis2, w0, w1, w2, jac,
in, out);
in, out, scale);
}
else if (shapetype == LibUtilities::Tet)
{
size_t nmTot =
LibUtilities::StdTetData::getNumberOfCoefficients(nm0, nm1, nm2);
IProductWRTBaseTetKernel<TData, false, APPEND, DEFORMED>
IProductWRTBaseTetKernel<TData, SCALE, APPEND, DEFORMED>
<<<gridSize, blockSize>>>(nm0, nm1, nm2, nmTot, nq0, nq1, nq2,
nElmts, correct, basis0, basis1, basis2,
w0, w1, w2, jac, in, out);
w0, w1, w2, jac, in, out, scale);
}
else if (shapetype == LibUtilities::Pyr)
{
size_t nmTot =
LibUtilities::StdPyrData::getNumberOfCoefficients(nm0, nm1, nm2);
IProductWRTBasePyrKernel<TData, false, APPEND, DEFORMED>
IProductWRTBasePyrKernel<TData, SCALE, APPEND, DEFORMED>
<<<gridSize, blockSize>>>(nm0, nm1, nm2, nmTot, nq0, nq1, nq2,
nElmts, correct, basis0, basis1, basis2,
w0, w1, w2, jac, in, out);
w0, w1, w2, jac, in, out, scale);
}
else if (shapetype == LibUtilities::Prism)
{
size_t nmTot =
LibUtilities::StdPrismData::getNumberOfCoefficients(nm0, nm1, nm2);
IProductWRTBasePrismKernel<TData, false, APPEND, DEFORMED>
IProductWRTBasePrismKernel<TData, SCALE, APPEND, DEFORMED>
<<<gridSize, blockSize>>>(nm0, nm1, nm2, nmTot, nq0, nq1, nq2,
nElmts, correct, basis0, basis1, basis2,
w0, w1, w2, jac, in, out);
w0, w1, w2, jac, in, out, scale);
}
}
......
......@@ -28,6 +28,10 @@ __global__ void IProductWRTBaseSegKernel(const size_t nm0, const size_t nq0,
TData jac_val = DEFORMED ? jacptr[i] : jacptr[0];
sum += inptr[i] * basis0[p * nq0 + i] * jac_val * w0[i];
}
if (SCALE)
{
sum *= scale;
}
outptr[p] = APPEND ? outptr[p] + sum : sum;
}
}
......@@ -76,9 +80,14 @@ __global__ void IProductWRTBaseQuadKernel(
{
sum += wsp[j] * basis1[q * nq1 + j] * w1[j];
}
if (SCALE)
{
sum *= scale;
}
outptr[q * nm0 + p] = APPEND ? outptr[q * nm0 + p] + sum : sum;
}
}
delete wsp;
}
template <typename TData, bool SCALE, bool APPEND, bool DEFORMED>
......@@ -128,7 +137,11 @@ __global__ void IProductWRTBaseTriKernel(
{
sum += wsp[eta1] * basis1[mode * nq1 + eta1] * w1[eta1];
}
outptr[mode++] = APPEND ? outptr[mode++] + sum : sum;
if (SCALE)
{
sum *= scale;
}
outptr[mode++] = APPEND ? outptr[mode] + sum : sum;
}
}
......@@ -158,8 +171,9 @@ __global__ void IProductWRTBaseTriKernel(
iprod_01 += prod * basis0[nq0 + eta0];
}
}
outptr[1] += iprod_01;
outptr[1] += SCALE ? iprod_01 * scale : iprod_01;
}
delete wsp;
}
template <typename TData, bool SCALE, bool APPEND, bool DEFORMED>
......@@ -227,11 +241,17 @@ __global__ void IProductWRTBaseHexKernel(
{
sum += wsp1[k] * basis2[r * nq2 + k] * w2[k];
}
if (SCALE)
{
sum *= scale;
}
outptr[r * nm0 * nm1 + q * nm0 + p] =
APPEND ? outptr[r * nm0 * nm1 + q * nm0 + p] + sum : sum;
}
}
}
delete wsp0;
delete wsp1;
}
// NOTE: Not workign when nm2 > nm1
......@@ -303,7 +323,11 @@ __global__ void IProductWRTBaseTetKernel(
{
tmp += wsp1[k] * basis2[mode2 * nq2 + k] * w2[k];
}
outptr[cnt_pqr++] = APPEND ? outptr[cnt_pqr++] + tmp : tmp;
if (SCALE)
{
tmp *= scale;
}
outptr[cnt_pqr++] = APPEND ? outptr[cnt_pqr] + tmp : tmp;
}
}
}
......@@ -342,26 +366,28 @@ __global__ void IProductWRTBaseTetKernel(
tmp *= inptr[cnt] * tmpQ;
// add to existing entry
outptr[1] += tmp;
outptr[1] += SCALE ? tmp * scale : tmp;
// bottom vertex
//
tmp = basis0[nq0 + i] * basis1[nq1 + j] * basis2[k] *
inptr[cnt] * tmpQ;
outptr[nm2] += tmp;
outptr[nm2] += SCALE ? tmp * scale : tmp;
// singular edge
for (size_t r = 1; r < nm2 - 1; ++r)
{
tmp = basis2[(r + 1) * nq2 + k] * basis1[nq1 + j] *
basis0[nq0 + i] * inptr[cnt] * tmpQ;
outptr[nm2 + r] += tmp;
outptr[nm2 + r] += SCALE ? tmp * scale : tmp;
}
cnt++;
}
}
}
}
delete wsp0;
delete wsp1;
}
template <typename TData, bool SCALE, bool APPEND, bool DEFORMED>
......@@ -431,8 +457,11 @@ __global__ void IProductWRTBasePrismKernel(
{
sum_k += basis2[(mode_pr + r) * nq2 + k] * w2[k] * wsp1[k];
}
outptr[mode_pqr++] =
APPEND ? outptr[mode_pqr++] + sum_k : sum_k;
if (SCALE)
{
sum_k *= scale;
}
outptr[mode_pqr++] = APPEND ? outptr[mode_pqr] + sum_k : sum_k;
}
}
mode_pr += nm2 - p;
......@@ -477,9 +506,12 @@ __global__ void IProductWRTBasePrismKernel(
for (size_t q = 0; q < nm1; ++q)
{
outptr[nm2 * q + 1] += wsp2[q];
outptr[nm2 * q + 1] += SCALE ? wsp2[q] * scale : wsp2[q];
}
}
delete wsp0;
delete wsp1;
delete wsp2;
}
// NOTE: Not workign when nm2 > nm1
......@@ -549,8 +581,11 @@ __global__ void IProductWRTBasePyrKernel(
{
sum_k += basis2[mode_pqr * nq2 + k] * w2[k] * wsp1[k];
}
outptr[mode_pqr++] =
APPEND ? outptr[mode_pqr++] + sum_k : sum_k;
if (SCALE)
{
sum_k *= scale;
}
outptr[mode_pqr++] = APPEND ? outptr[mode_pqr] + sum_k : sum_k;
}
}
......@@ -574,8 +609,11 @@ __global__ void IProductWRTBasePyrKernel(
{
sum_k += basis2[mode_pqr * nq2 + k] * w2[k] * wsp1[k];
}
outptr[mode_pqr++] =
APPEND ? outptr[mode_pqr++] + sum_k : sum_k;
if (SCALE)
{
sum_k *= scale;
}
outptr[mode_pqr++] = APPEND ? outptr[mode_pqr] + sum_k : sum_k;
}
}
}
......@@ -612,11 +650,13 @@ __global__ void IProductWRTBasePyrKernel(
tmp *= inptr[cnt++] * tmpQ;
// add to existing entry
outptr[1] += tmp;
outptr[1] += SCALE ? tmp * scale : tmp;
}
}
}
}
delete wsp0;
delete wsp1;
}
} // namespace Nektar::Operators::detail
......@@ -20,7 +20,8 @@ public:
}
void apply(Field<TData, FieldState::Phys> &in,
Field<TData, FieldState::Coeff> &out) override
Field<TData, FieldState::Coeff> &out,
const TData lambda = 1.0) override
{
auto const *inptr = in.GetStorage().GetCPUPtr();
auto *outptr = out.GetStorage().GetCPUPtr();
......@@ -64,7 +65,7 @@ public:
}
Blas::Dgemm('N', 'N', matPtr->GetRows(), nElmts,
matPtr->GetColumns(), 1.0, matPtr->GetRawPtr(),
matPtr->GetColumns(), lambda, matPtr->GetRawPtr(),
matPtr->GetRows(), wsp.get(), nqTot, 0.0, outptr,
nmTot);
......
#include "IProductWRTDerivBaseCUDA.hpp"
namespace Nektar::Operators::detail
{
template <>
std::string OperatorIProductWRTDerivBaseImpl<double, ImplCUDA>::className =
GetOperatorFactory<double>().RegisterCreatorFunction(
"IProductWRTDerivBaseCUDA",
OperatorIProductWRTDerivBaseImpl<double, ImplCUDA>::instantiate, "...");
}
#include "MemoryRegionCUDA.hpp"
#include "Operators/IProductWRTBase/IProductWRTBaseCUDA.hpp"
#include "Operators/IProductWRTDerivBase/IProductWRTDerivBaseCUDAKernels.cuh"
#include "Operators/OperatorHelper.cuh"
#include "Operators/OperatorIProductWRTDerivBase.hpp"
namespace Nektar::Operators::detail
{
template <typename TData, bool DEFORMED = false>
void IProductWRTDerivBase1DKernel(const size_t gridSize, const size_t blockSize,
const size_t nq0, const size_t nCoord,
const size_t nElmts, const size_t dfSize,
TData *df, TData *in0, TData *in1, TData *in2,
TData *out0);
template <typename TData, bool DEFORMED = false>
void IProductWRTDerivBase2DKernel(const size_t gridSize, const size_t blockSize,
LibUtilities::ShapeType shapetype,
const size_t nq0, const size_t nq1,
const size_t nCoord, const size_t nElmts,
const TData *Z0, const TData *Z1,
const size_t dfSize, TData *df, TData *in0,
TData *in1, TData *in2, TData *out0,
TData *out1);
template <typename TData, bool DEFORMED = false>
void IProductWRTDerivBase3DKernel(
const size_t gridSize, const size_t blockSize,
LibUtilities::ShapeType shapetype, const size_t nq0, const size_t nq1,
const size_t nq2, const size_t nCoord, const size_t nElmts, const TData *Z0,
const TData *Z1, const TData *Z2, const size_t dfSize, TData *df,
TData *in0, TData *in1, TData *in2, TData *out0, TData *out1, TData *out2);
// IProductWRTDerivBase implementation
template <typename TData>
class OperatorIProductWRTDerivBaseImpl<TData, ImplCUDA>
: public OperatorIProductWRTDerivBase<TData>
{
public:
OperatorIProductWRTDerivBaseImpl(
const MultiRegions::ExpListSharedPtr &expansionList)
: OperatorIProductWRTDerivBase<TData>(expansionList)
{
size_t nDim = this->m_expansionList->GetShapeDimension();
size_t nCoord = this->m_expansionList->GetCoordim(0);
m_dfSize = Operator<TData>::GetGeometricFactorSize();
// Initialise jacobian.
auto jac = Operator<TData>::SetJacobian(m_dfSize);
cudaMalloc((void **)&m_jac, sizeof(TData) * m_dfSize);
cudaMemcpy(m_jac, jac.get(), sizeof(TData) * m_dfSize,
cudaMemcpyHostToDevice);
// Initialise derivative factor.
auto derivFac = Operator<TData>::SetDerivativeFactor(m_dfSize);
cudaMalloc((void **)&m_derivFac,
sizeof(TData) * nDim * nCoord * m_dfSize);
for (size_t d = 0; d < nDim * nCoord; d++)
{
auto hostPtr = derivFac[d].get();
auto devicePtr = m_derivFac + d * m_dfSize;
cudaMemcpy(devicePtr, hostPtr, sizeof(TData) * m_dfSize,
cudaMemcpyHostToDevice);
}
// Initialize basis.
m_basis = GetBasisDataCUDA<TData>(expansionList);
// Initialize basis derivative.
m_dbasis = GetDeriveBasisDataCUDA<TData>(expansionList);
// Initialize weight.
m_weight = GetWeightDataCUDA<TData>(expansionList);
// Initialize points.
m_Z = GetPointDataCUDA<TData>(expansionList);
// Initialize derivative matrix.
m_D = GetDerivativeDataCUDA<TData>(expansionList);
// Initialize workspace memory.
auto ndata = this->m_expansionList->GetTotPoints();
cudaMalloc((void **)&m_wsp0, sizeof(TData) * ndata);
if (nCoord > 1)
{
cudaMalloc((void **)&m_wsp1, sizeof(TData) * ndata);
}
if (nCoord > 2)
{
cudaMalloc((void **)&m_wsp2, sizeof(TData) * ndata);
}
}
~OperatorIProductWRTDerivBaseImpl(void)
{
DeallocateDataCUDA<TData>(m_basis);
DeallocateDataCUDA<TData>(m_dbasis);
DeallocateDataCUDA<TData>(m_weight);
DeallocateDataCUDA<TData>(m_Z);
DeallocateDataCUDA<TData>(m_D);
cudaFree(m_jac);
cudaFree(m_derivFac);
}
void apply(Field<TData, FieldState::Phys> &in0,
Field<TData, FieldState::Phys> &in1,
Field<TData, FieldState::Phys> &in2,
Field<TData, FieldState::Coeff> &out,
bool APPEND = false) override
{
if (!APPEND) // Zero output (temporary solution)
{
auto *tmpptr =
out.template GetStorage<MemoryRegionCUDA>().GetCPUPtr();
for (size_t block_idx = 0; block_idx < out.GetBlocks().size();
++block_idx)
{
for (size_t i = 0; i < out.GetBlocks()[block_idx].block_size;
i++)
{
*(tmpptr++) = 0.0;
}
}
}
// Copy memory to GPU, if necessary and get raw pointers.
std::vector<TData *> inptr{
in0.template GetStorage<MemoryRegionCUDA>().GetGPUPtr(),
in1.template GetStorage<MemoryRegionCUDA>().GetGPUPtr(),
in2.template GetStorage<MemoryRegionCUDA>().GetGPUPtr()};
auto *outptr = out.template GetStorage<MemoryRegionCUDA>().GetGPUPtr();
std::vector<TData *> wspptr{m_wsp0, m_wsp1, m_wsp2};
auto jacptr = m_jac;
auto dfptr = m_derivFac;
// Initialize index.
size_t expIdx = 0;
// Initialize basiskey.
std::vector<LibUtilities::BasisKey> basisKeys(
3, LibUtilities::NullBasisKey);
// Loop over the blocks.
for (size_t block_idx = 0; block_idx < out.GetBlocks().size();
++block_idx)
{
auto const expPtr = this->m_expansionList->GetExp(expIdx);
auto nElmts = out.GetBlocks()[block_idx].num_elements;
auto nqTot = expPtr->GetTotPoints();
auto nmTot = expPtr->GetNcoeffs();
auto nDim = expPtr->GetShapeDimension();
auto nCoord = expPtr->GetCoordim();
auto shape = expPtr->DetShapeType();
auto deformed = expPtr->GetMetricInfo()->GetGtype() ==
SpatialDomains::eDeformed;
// Deterime CUDA grid parameters.
m_gridSize = nElmts / m_blockSize;
m_gridSize += (nElmts % m_blockSize == 0) ? 0 : 1;
// Flag for collapsed coordinate correction.
bool correct = expPtr->GetBasis(0)->GetBasisType() ==
LibUtilities::eModified_A;
// Fetch basis key for the current element type.
for (size_t d = 0; d < nDim; d++)
{
basisKeys[d] = expPtr->GetBasis(d)->GetBasisKey();
}
// Function call to kernel functions.
if (nDim == 1)
{
auto dbasis0 = m_dbasis[basisKeys][0];
auto w0 = m_weight[basisKeys][0];
auto D0 = m_D[basisKeys][0];
auto nm0 = expPtr->GetBasisNumModes(0);
auto nq0 = expPtr->GetNumPoints(0);
if (deformed)
{
IProductWRTDerivBase1DKernel<TData, true>(
m_gridSize, m_blockSize, nq0, nCoord, nElmts, m_dfSize,
dfptr, inptr[0], inptr[1], inptr[2], wspptr[0]);
IProductWRTBase1DKernel<TData, false, true, true>(
m_gridSize, m_blockSize, nm0, nq0, nElmts, dbasis0, w0,
jacptr, wspptr[0], outptr);
}
else
{
IProductWRTDerivBase1DKernel<TData, false>(
m_gridSize, m_blockSize, nq0, nCoord, nElmts, m_dfSize,
dfptr, inptr[0], inptr[1], inptr[2], wspptr[0]);
IProductWRTBase1DKernel<TData, false, true, false>(
m_gridSize, m_blockSize, nm0, nq0, nElmts, dbasis0, w0,
jacptr, wspptr[0], outptr);
}
}
else if (nDim == 2)
{
auto basis0 = m_basis[basisKeys][0];
auto basis1 = m_basis[basisKeys][1];
auto dbasis0 = m_dbasis[basisKeys][0];
auto dbasis1 = m_dbasis[basisKeys][1];
auto w0 = m_weight[basisKeys][0];
auto w1 = m_weight[basisKeys][1];
auto D0 = m_D[basisKeys][0];
auto D1 = m_D[basisKeys][1];
auto Z0 = m_Z[basisKeys][0];
auto Z1 = m_Z[basisKeys][1];
auto nm0 = expPtr->GetBasisNumModes(0);
auto nm1 = expPtr->GetBasisNumModes(1);
auto nq0 = expPtr->GetNumPoints(0);
auto nq1 = expPtr->GetNumPoints(1);
if (deformed)
{
IProductWRTDerivBase2DKernel<TData, true>(
m_gridSize, m_blockSize, shape, nq0, nq1, nCoord,
nElmts, Z0, Z1, m_dfSize, dfptr, inptr[0], inptr[1],
inptr[2], wspptr[0], wspptr[1]);
IProductWRTBase2DKernel<TData, false, true, true>(
m_gridSize, m_blockSize, shape, nm0, nm1, nq0, nq1,
nElmts, correct, dbasis0, basis1, w0, w1, jacptr,
wspptr[0], outptr);
IProductWRTBase2DKernel<TData, false, true, true>(
m_gridSize, m_blockSize, shape, nm0, nm1, nq0, nq1,
nElmts, correct, basis0, dbasis1, w0, w1, jacptr,
wspptr[1], outptr);
}
else
{
IProductWRTDerivBase2DKernel<TData, false>(
m_gridSize, m_blockSize, shape, nq0, nq1, nCoord,
nElmts, Z0, Z1, m_dfSize, dfptr, inptr[0], inptr[1],
inptr[2], wspptr[0], wspptr[1]);
IProductWRTBase2DKernel<TData, false, true, false>(
m_gridSize, m_blockSize, shape, nm0, nm1, nq0, nq1,
nElmts, correct, dbasis0, basis1, w0, w1, jacptr,
wspptr[0], outptr);
IProductWRTBase2DKernel<TData, false, true, false>(
m_gridSize, m_blockSize, shape, nm0, nm1, nq0, nq1,
nElmts, correct, basis0, dbasis1, w0, w1, jacptr,
wspptr[1], outptr);
}
}
else if (nDim == 3)
{
auto basis0 = m_basis[basisKeys][0];
auto basis1 = m_basis[basisKeys][1];
auto basis2 = m_basis[basisKeys][2];
auto dbasis0 = m_dbasis[basisKeys][0];
auto dbasis1 = m_dbasis[basisKeys][1];
auto dbasis2 = m_dbasis[basisKeys][2];
auto w0 = m_weight[basisKeys][0];
auto w1 = m_weight[basisKeys][1];
auto w2 = m_weight[basisKeys][2];
auto D0 = m_D[basisKeys][0];
auto D1 = m_D[basisKeys][1];
auto D2 = m_D[basisKeys][2];
auto Z0 = m_Z[basisKeys][0];
auto Z1 = m_Z[basisKeys][1];
auto Z2 = m_Z[basisKeys][2];
auto nm0 = expPtr->GetBasisNumModes(0);
auto nm1 = expPtr->GetBasisNumModes(1);
auto nm2 = expPtr->GetBasisNumModes(2);
auto nq0 = expPtr->GetNumPoints(0);
auto nq1 = expPtr->GetNumPoints(1);
auto nq2 = expPtr->GetNumPoints(2);
if (deformed)
{
IProductWRTDerivBase3DKernel<TData, true>(
m_gridSize, m_blockSize, shape, nq0, nq1, nq2, nCoord,
nElmts, Z0, Z1, Z2, m_dfSize, dfptr, inptr[0], inptr[1],
inptr[2], wspptr[0], wspptr[1], wspptr[2]);
IProductWRTBase3DKernel<TData, false, true, true>(
m_gridSize, m_blockSize, shape, nm0, nm1, nm2, nq0, nq1,
nq2, nElmts, correct, dbasis0, basis1, basis2, w0, w1,
w2, jacptr, wspptr[0], outptr);
IProductWRTBase3DKernel<TData, false, true, true>(
m_gridSize, m_blockSize, shape, nm0, nm1, nm2, nq0, nq1,
nq2, nElmts, correct, basis0, dbasis1, basis2, w0, w1,
w2, jacptr, wspptr[1], outptr);
IProductWRTBase3DKernel<TData, false, true, true>(
m_gridSize, m_blockSize, shape, nm0, nm1, nm2, nq0, nq1,
nq2, nElmts, correct, basis0, basis1, dbasis2, w0, w1,
w2, jacptr, wspptr[2], outptr);
}
else
{
IProductWRTDerivBase3DKernel<TData, false>(
m_gridSize, m_blockSize, shape, nq0, nq1, nq2, nCoord,
nElmts, Z0, Z1, Z2, m_dfSize, dfptr, inptr[0], inptr[1],
inptr[2], wspptr[0], wspptr[1], wspptr[2]);
IProductWRTBase3DKernel<TData, false, true, false>(
m_gridSize, m_blockSize, shape, nm0, nm1, nm2, nq0, nq1,
nq2, nElmts, correct, dbasis0, basis1, basis2, w0, w1,
w2, jacptr, wspptr[0], outptr);
IProductWRTBase3DKernel<TData, false, true, false>(
m_gridSize, m_blockSize, shape, nm0, nm1, nm2, nq0, nq1,
nq2, nElmts, correct, basis0, dbasis1, basis2, w0, w1,
w2, jacptr, wspptr[1], outptr);
IProductWRTBase3DKernel<TData, false, true, false>(
m_gridSize, m_blockSize, shape, nm0, nm1, nm2, nq0, nq1,
nq2, nElmts, correct, basis0, basis1, dbasis2, w0, w1,
w2, jacptr, wspptr[2], outptr);
}
}
// Increment pointer and index for next element type.
jacptr += deformed ? nqTot * nElmts : nElmts;
dfptr += deformed ? nqTot * nElmts : nElmts;
for (size_t d = 0; d < nDim; d++)
{
inptr[d] += nqTot * nElmts;
wspptr[d] += nqTot * nElmts;
}
outptr += nmTot * nElmts;
expIdx += nElmts;
}
}
static std::unique_ptr<Operator<TData>> instantiate(
MultiRegions::ExpListSharedPtr expansionList)
{
return std::make_unique<
OperatorIProductWRTDerivBaseImpl<TData, ImplCUDA>>(expansionList);
}
static std::string className;
private:
TData *m_wsp0, *m_wsp1, *m_wsp2;
TData *m_derivFac;
TData *m_jac;
std::map<std::vector<LibUtilities::BasisKey>, std::vector<TData *>> m_basis;
std::map<std::vector<LibUtilities::BasisKey>, std::vector<TData *>>
m_dbasis;
std::map<std::vector<LibUtilities::BasisKey>, std::vector<TData *>>
m_weight;
std::map<std::vector<LibUtilities::BasisKey>, std::vector<TData *>> m_D;
std::map<std::vector<LibUtilities::BasisKey>, std::vector<TData *>> m_Z;
size_t m_dfSize;
size_t m_blockSize = 32;
size_t m_gridSize;
};
template <typename TData, bool DEFORMED>
void IProductWRTDerivBase1DKernel(const size_t gridSize, const size_t blockSize,
const size_t nq0, const size_t nCoord,
const size_t nElmts, const size_t dfSize,
TData *df, TData *in0, TData *in1, TData *in2,
TData *out0)
{
IProductWRTDerivBaseSegKernel<TData, DEFORMED><<<gridSize, blockSize>>>(
nq0, nCoord, nElmts, dfSize, df, in0, in1, in2, out0);
}
template <typename TData, bool DEFORMED>
void IProductWRTDerivBase2DKernel(const size_t gridSize, const size_t blockSize,
LibUtilities::ShapeType shapetype,
const size_t nq0, const size_t nq1,
const size_t nCoord, const size_t nElmts,
const TData *Z0, const TData *Z1,
const size_t dfSize, TData *df, TData *in0,
TData *in1, TData *in2, TData *out0,
TData *out1)
{
if (shapetype == LibUtilities::Quad)
{
IProductWRTDerivBaseQuadKernel<TData, DEFORMED>
<<<gridSize, blockSize>>>(nq0, nq1, nCoord, nElmts, dfSize, df, in0,
in1, in2, out0, out1);
}
else if (shapetype == LibUtilities::Tri)
{
IProductWRTDerivBaseTriKernel<TData, DEFORMED>
<<<gridSize, blockSize>>>(nq0, nq1, nCoord, nElmts, Z0, Z1, dfSize,
df, in0, in1, in2, out0, out1);
}
}
template <typename TData, bool DEFORMED>
void IProductWRTDerivBase3DKernel(
const size_t gridSize, const size_t blockSize,
LibUtilities::ShapeType shapetype, const size_t nq0, const size_t nq1,
const size_t nq2, const size_t nCoord, const size_t nElmts, const TData *Z0,
const TData *Z1, const TData *Z2, const size_t dfSize, TData *df,
TData *in0, TData *in1, TData *in2, TData *out0, TData *out1, TData *out2)
{
if (shapetype == LibUtilities::Hex)
{
IProductWRTDerivBaseHexKernel<TData, DEFORMED>
<<<gridSize, blockSize>>>(nq0, nq1, nq2, nCoord, nElmts, dfSize, df,
in0, in1, in2, out0, out1, out2);
}
else if (shapetype == LibUtilities::Tet)
{
IProductWRTDerivBaseTetKernel<TData, DEFORMED><<<gridSize, blockSize>>>(
nq0, nq1, nq2, nCoord, nElmts, Z0, Z1, Z2, dfSize, df, in0, in1,
in2, out0, out1, out2);
}
else if (shapetype == LibUtilities::Pyr)
{
IProductWRTDerivBasePyrKernel<TData, DEFORMED><<<gridSize, blockSize>>>(
nq0, nq1, nq2, nCoord, nElmts, Z0, Z1, Z2, dfSize, df, in0, in1,
in2, out0, out1, out2);
}
else if (shapetype == LibUtilities::Prism)
{
IProductWRTDerivBasePrismKernel<TData, DEFORMED>
<<<gridSize, blockSize>>>(nq0, nq1, nq2, nCoord, nElmts, Z0, Z2,
dfSize, df, in0, in1, in2, out0, out1,
out2);
}
}
} // namespace Nektar::Operators::detail
namespace Nektar::Operators::detail
{
template <typename TData, bool DEFORMED>
__global__ void IProductWRTDerivBaseSegKernel(const size_t nq0,
const size_t ncoord,
const size_t nelmt,
const size_t dfSize, TData *df,
TData *in0, TData *in1,
TData *in2, TData *out0)
{
size_t e = blockDim.x * blockIdx.x + threadIdx.x;
if (e >= nelmt)
{
return;
}
size_t ndf = ncoord;
// Assign pointers.
TData **inptr = new TData *[ncoord];
inptr[0] = in0 + (nq0 * e);
if (ncoord > 1)
{
inptr[1] = in1 + (nq0 * e);
}
if (ncoord > 2)
{
inptr[2] = in2 + (nq0 * e);
}
TData *outptr0 = out0 + (nq0 * e);
TData **dfptr = new TData *[ndf];
for (size_t d = 0; d < ndf; d++)
{
dfptr[d] = df + d * dfSize;
dfptr[d] += DEFORMED ? nq0 * e : e;
}
// Calculate dxi/dx in[0] + dxi/dy in[1] + dxi/dz in[2]
for (size_t i = 0; i < nq0; ++i)
{
size_t dfindex = DEFORMED ? i : 0;
outptr0[i] = dfptr[0][dfindex] * inptr[0][i];
for (size_t d = 1; d < ncoord; ++d)
{
outptr0[i] += (dfptr[d][dfindex] * inptr[d][i]);
}
}
delete[] inptr;
delete[] dfptr;
}
template <typename TData, bool DEFORMED>
__global__ void IProductWRTDerivBaseQuadKernel(
const size_t nq0, const size_t nq1, const size_t ncoord, const size_t nelmt,
const size_t dfSize, TData *df, TData *in0, TData *in1, TData *in2,
TData *out0, TData *out1)
{
size_t e = blockDim.x * blockIdx.x + threadIdx.x;
if (e >= nelmt)
{
return;
}
const auto ndf = 2 * ncoord;
// Assign pointers.
TData **inptr = new TData *[ncoord];
inptr[0] = in0 + (nq0 * nq1 * e);
inptr[1] = in1 + (nq0 * nq1 * e);
if (ncoord > 2)
{
inptr[2] = in2 + (nq0 * nq1 * e);
}
TData *outptr0 = out0 + (nq0 * nq1 * e);
TData *outptr1 = out1 + (nq0 * nq1 * e);
TData **dfptr = new TData *[ndf];
for (size_t d = 0; d < ndf; d++)
{
dfptr[d] = df + d * dfSize;
dfptr[d] += DEFORMED ? nq0 * nq1 * e : e;
}
// Calculate dxi/dx in[0] + dxi/dy in[1] + dxi/dz in[2]
for (size_t i = 0; i < nq0 * nq1; ++i)
{
size_t dfindex = DEFORMED ? i : 0;
outptr0[i] = dfptr[0][dfindex] * inptr[0][i];
outptr1[i] = dfptr[1][dfindex] * inptr[0][i];
for (size_t d = 1; d < ncoord; ++d)
{
outptr0[i] += (dfptr[2 * d][dfindex] * inptr[d][i]);
outptr1[i] += (dfptr[2 * d + 1][dfindex] * inptr[d][i]);
}
}
delete[] inptr;
delete[] dfptr;
}
template <typename TData, bool DEFORMED>
__global__ void IProductWRTDerivBaseTriKernel(
const size_t nq0, const size_t nq1, const size_t ncoord, const size_t nelmt,
const TData *Z0, const TData *Z1, const size_t dfSize, TData *df,
TData *in0, TData *in1, TData *in2, TData *out0, TData *out1)
{
size_t e = blockDim.x * blockIdx.x + threadIdx.x;
if (e >= nelmt)
{
return;
}
const auto ndf = 2 * ncoord;
// Assign pointers.
TData **inptr = new TData *[ncoord];
inptr[0] = in0 + (nq0 * nq1 * e);
inptr[1] = in1 + (nq0 * nq1 * e);
if (ncoord > 2)
{
inptr[2] = in2 + (nq0 * nq1 * e);
}
TData *outptr0 = out0 + (nq0 * nq1 * e);
TData *outptr1 = out1 + (nq0 * nq1 * e);
TData **dfptr = new TData *[ndf];
for (size_t d = 0; d < ndf; d++)
{
dfptr[d] = df + d * dfSize;
dfptr[d] += DEFORMED ? nq0 * nq1 * e : e;
}
// Calculate dxi/dx in[0] + dxi/dy in[1] + dxi/dz in[2]
for (size_t j = 0, cnt_ji = 0; j < nq1; ++j)
{
TData f0 = 2.0 / (1.0 - Z1[j]);
for (size_t i = 0; i < nq0; ++i, ++cnt_ji)
{
size_t dfindex = DEFORMED ? cnt_ji : 0;
outptr0[cnt_ji] = dfptr[0][dfindex] * inptr[0][cnt_ji];
outptr1[cnt_ji] = dfptr[1][dfindex] * inptr[0][cnt_ji];
for (size_t d = 1; d < ncoord; ++d)
{
outptr0[cnt_ji] += (dfptr[2 * d][dfindex] * inptr[d][cnt_ji]);
outptr1[cnt_ji] +=
(dfptr[2 * d + 1][dfindex] * inptr[d][cnt_ji]);
}
// Multiply by geometric factors
TData f1 = 0.5 * (1.0 + Z0[i]);
outptr0[cnt_ji] += outptr1[cnt_ji] * f1;
outptr0[cnt_ji] *= f0;
}
}
delete[] inptr;
delete[] dfptr;
}
template <typename TData, bool DEFORMED>
__global__ void IProductWRTDerivBaseHexKernel(
const size_t nq0, const size_t nq1, const size_t nq2, const size_t ncoord,
const size_t nelmt, const size_t dfSize, TData *df, TData *in0, TData *in1,
TData *in2, TData *out0, TData *out1, TData *out2)
{
size_t e = blockDim.x * blockIdx.x + threadIdx.x;
if (e >= nelmt)
{
return;
}
const auto ndf = 3 * ncoord;
// Assign pointers.
TData **inptr = new TData *[ncoord];
inptr[0] = in0 + (nq0 * nq1 * nq2 * e);
inptr[1] = in1 + (nq0 * nq1 * nq2 * e);
inptr[2] = in2 + (nq0 * nq1 * nq2 * e);
TData *outptr0 = out0 + (nq0 * nq1 * nq2 * e);
TData *outptr1 = out1 + (nq0 * nq1 * nq2 * e);
TData *outptr2 = out2 + (nq0 * nq1 * nq2 * e);
TData **dfptr = new TData *[ndf];
for (size_t d = 0; d < ndf; d++)
{
dfptr[d] = df + d * dfSize;
dfptr[d] += DEFORMED ? nq0 * nq1 * nq2 * e : e;
}
// Calculate dxi/dx in[0] + dxi/dy in[1] + dxi/dz in[2]
for (size_t i = 0; i < nq0 * nq1 * nq2; ++i)
{
size_t dfindex = DEFORMED ? i : 0;
outptr0[i] = dfptr[0][dfindex] * inptr[0][i];
outptr1[i] = dfptr[1][dfindex] * inptr[0][i];
outptr2[i] = dfptr[2][dfindex] * inptr[0][i];
for (size_t d = 1; d < ncoord; ++d)
{
outptr0[i] += (dfptr[3 * d][dfindex] * inptr[d][i]);
outptr1[i] += (dfptr[3 * d + 1][dfindex] * inptr[d][i]);
outptr2[i] += (dfptr[3 * d + 2][dfindex] * inptr[d][i]);
}
}
delete[] inptr;
delete[] dfptr;
}
template <typename TData, bool DEFORMED>
__global__ void IProductWRTDerivBaseTetKernel(
const size_t nq0, const size_t nq1, const size_t nq2, const size_t ncoord,
const size_t nelmt, const TData *Z0, const TData *Z1, const TData *Z2,
const size_t dfSize, TData *df, TData *in0, TData *in1, TData *in2,
TData *out0, TData *out1, TData *out2)
{
size_t e = blockDim.x * blockIdx.x + threadIdx.x;
if (e >= nelmt)
{
return;
}
const auto ndf = 3 * ncoord;
// Assign pointers.
TData **inptr = new TData *[ncoord];
inptr[0] = in0 + (nq0 * nq1 * nq2 * e);
inptr[1] = in1 + (nq0 * nq1 * nq2 * e);
inptr[2] = in2 + (nq0 * nq1 * nq2 * e);
TData *outptr0 = out0 + (nq0 * nq1 * nq2 * e);
TData *outptr1 = out1 + (nq0 * nq1 * nq2 * e);
TData *outptr2 = out2 + (nq0 * nq1 * nq2 * e);
TData **dfptr = new TData *[ndf];
for (size_t d = 0; d < ndf; d++)
{
dfptr[d] = df + d * dfSize;
dfptr[d] += DEFORMED ? nq0 * nq1 * nq2 * e : e;
}
// Calculate dxi/dx in[0] + dxi/dy in[1] + dxi/dz in[2]
for (size_t k = 0, cnt_kji = 0; k < nq2; ++k)
{
TData f2 = 2.0 / (1.0 - Z2[k]);
for (size_t j = 0; j < nq1; ++j)
{
TData f3 = 0.5 * (1.0 + Z1[j]);
TData f0 = 2.0 * f2 / (1.0 - Z1[j]);
for (size_t i = 0; i < nq0; ++i, ++cnt_kji)
{
size_t dfindex = DEFORMED ? cnt_kji : 0;
outptr0[cnt_kji] = dfptr[0][dfindex] * inptr[0][cnt_kji];
outptr1[cnt_kji] = dfptr[1][dfindex] * inptr[0][cnt_kji];
outptr2[cnt_kji] = dfptr[2][dfindex] * inptr[0][cnt_kji];
for (size_t d = 1; d < ncoord; ++d)
{
outptr0[cnt_kji] +=
(dfptr[3 * d][dfindex] * inptr[d][cnt_kji]);
outptr1[cnt_kji] +=
(dfptr[3 * d + 1][dfindex] * inptr[d][cnt_kji]);
outptr2[cnt_kji] +=
(dfptr[3 * d + 2][dfindex] * inptr[d][cnt_kji]);
}
// Multiply by geometric factors
TData f1 = 0.5 * (1.0 + Z0[i]);
outptr0[cnt_kji] += (outptr1[cnt_kji] + outptr2[cnt_kji]) * f1;
outptr0[cnt_kji] *= f0;
outptr1[cnt_kji] += outptr2[cnt_kji] * f3;
outptr1[cnt_kji] *= f2;
}
}
}
delete[] inptr;
delete[] dfptr;
}
template <typename TData, bool DEFORMED>
__global__ void IProductWRTDerivBasePrismKernel(
const size_t nq0, const size_t nq1, const size_t nq2, const size_t ncoord,
const size_t nelmt, const TData *Z0, const TData *Z2, const size_t dfSize,
TData *df, TData *in0, TData *in1, TData *in2, TData *out0, TData *out1,
TData *out2)
{
size_t e = blockDim.x * blockIdx.x + threadIdx.x;
if (e >= nelmt)
{
return;
}
const auto ndf = 3 * ncoord;
// Assign pointers.
TData **inptr = new TData *[ncoord];
inptr[0] = in0 + (nq0 * nq1 * nq2 * e);
inptr[1] = in1 + (nq0 * nq1 * nq2 * e);
inptr[2] = in2 + (nq0 * nq1 * nq2 * e);
TData *outptr0 = out0 + (nq0 * nq1 * nq2 * e);
TData *outptr1 = out1 + (nq0 * nq1 * nq2 * e);
TData *outptr2 = out2 + (nq0 * nq1 * nq2 * e);
TData **dfptr = new TData *[ndf];
for (size_t d = 0; d < ndf; d++)
{
dfptr[d] = df + d * dfSize;
dfptr[d] += DEFORMED ? nq0 * nq1 * nq2 * e : e;
}
// Calculate dxi/dx in[0] + dxi/dy in[1] + dxi/dz in[2]
for (size_t k = 0, cnt_kji = 0; k < nq2; ++k)
{
TData f0 = 2.0 / (1.0 - Z2[k]);
for (size_t j = 0; j < nq1; ++j)
{
for (size_t i = 0; i < nq0; ++i, ++cnt_kji)
{
size_t dfindex = DEFORMED ? cnt_kji : 0;
outptr0[cnt_kji] = dfptr[0][dfindex] * inptr[0][cnt_kji];
outptr1[cnt_kji] = dfptr[1][dfindex] * inptr[0][cnt_kji];
outptr2[cnt_kji] = dfptr[2][dfindex] * inptr[0][cnt_kji];
for (size_t d = 1; d < ncoord; ++d)
{
outptr0[cnt_kji] +=
(dfptr[3 * d][dfindex] * inptr[d][cnt_kji]);
outptr1[cnt_kji] +=
(dfptr[3 * d + 1][dfindex] * inptr[d][cnt_kji]);
outptr2[cnt_kji] +=
(dfptr[3 * d + 2][dfindex] * inptr[d][cnt_kji]);
}
// Multiply by geometric factors
TData f1 = 0.5 * (1.0 + Z0[i]);
outptr0[cnt_kji] += outptr2[cnt_kji] * f1;
outptr0[cnt_kji] *= f0;
}
}
}
delete[] inptr;
delete[] dfptr;
}
template <typename TData, bool DEFORMED>
__global__ void IProductWRTDerivBasePyrKernel(
const size_t nq0, const size_t nq1, const size_t nq2, const size_t ncoord,
const size_t nelmt, const TData *Z0, const TData *Z1, const TData *Z2,
const size_t dfSize, TData *df, TData *in0, TData *in1, TData *in2,
TData *out0, TData *out1, TData *out2)
{
size_t e = blockDim.x * blockIdx.x + threadIdx.x;
if (e >= nelmt)
{
return;
}
const auto ndf = 3 * ncoord;
// Assign pointers.
TData **inptr = new TData *[ncoord];
inptr[0] = in0 + (nq0 * nq1 * nq2 * e);
inptr[1] = in1 + (nq0 * nq1 * nq2 * e);
inptr[2] = in2 + (nq0 * nq1 * nq2 * e);
TData *outptr0 = out0 + (nq0 * nq1 * nq2 * e);
TData *outptr1 = out1 + (nq0 * nq1 * nq2 * e);
TData *outptr2 = out2 + (nq0 * nq1 * nq2 * e);
TData **dfptr = new TData *[ndf];
for (size_t d = 0; d < ndf; d++)
{
dfptr[d] = df + d * dfSize;
dfptr[d] += DEFORMED ? nq0 * nq1 * nq2 * e : e;
}
// Calculate dxi/dx in[0] + dxi/dy in[1] + dxi/dz in[2]
for (size_t k = 0, cnt_kji = 0; k < nq2; ++k)
{
TData f0 = 2.0 / (1.0 - Z2[k]);
for (size_t j = 0; j < nq1; ++j)
{
TData f2 = 0.5 * (1.0 + Z1[j]);
for (size_t i = 0; i < nq0; ++i, ++cnt_kji)
{
size_t dfindex = DEFORMED ? cnt_kji : 0;
outptr0[cnt_kji] = dfptr[0][dfindex] * inptr[0][cnt_kji];
outptr1[cnt_kji] = dfptr[1][dfindex] * inptr[0][cnt_kji];
outptr2[cnt_kji] = dfptr[2][dfindex] * inptr[0][cnt_kji];
for (size_t d = 1; d < ncoord; ++d)
{
outptr0[cnt_kji] +=
(dfptr[3 * d][dfindex] * inptr[d][cnt_kji]);
outptr1[cnt_kji] +=
(dfptr[3 * d + 1][dfindex] * inptr[d][cnt_kji]);
outptr2[cnt_kji] +=
(dfptr[3 * d + 2][dfindex] * inptr[d][cnt_kji]);
}
// Multiply by geometric factors
TData f1 = 0.5 * (1.0 + Z0[i]);
outptr0[cnt_kji] += outptr2[cnt_kji] * f1;
outptr0[cnt_kji] *= f0;
outptr1[cnt_kji] += outptr2[cnt_kji] * f2;
outptr1[cnt_kji] *= f0;
}
}
}
delete[] inptr;
delete[] dfptr;
}
} // namespace Nektar::Operators::detail
#include "IProductWRTDerivBaseStdMat.hpp"
namespace Nektar::Operators::detail
{
// Add different IProductWRTDerivBase implementations to the factory.
template <>
std::string OperatorIProductWRTDerivBaseImpl<double, ImplStdMat>::className =
GetOperatorFactory<double>().RegisterCreatorFunction(
"IProductWRTDerivBaseStdMat",
OperatorIProductWRTDerivBaseImpl<double, ImplStdMat>::instantiate,
"...");
} // namespace Nektar::Operators::detail
#include "Operators/OperatorIProductWRTDerivBase.hpp"
#include <StdRegions/StdExpansion.h>
namespace Nektar::Operators::detail
{
// standard matrix implementation
template <typename TData>
class OperatorIProductWRTDerivBaseImpl<TData, ImplStdMat>
: public OperatorIProductWRTDerivBase<TData>
{
public:
OperatorIProductWRTDerivBaseImpl(
const MultiRegions::ExpListSharedPtr &expansionList)
: OperatorIProductWRTDerivBase<TData>(std::move(expansionList))
{
size_t nTotElmts = this->m_expansionList->GetNumElmts();
size_t nDim = this->m_expansionList->GetShapeDimension();
// Initialise jacobian.
size_t jacSize = Operator<TData>::GetGeometricFactorSize();
m_jac = Operator<TData>::SetJacobian(jacSize);
m_derivFac = Operator<TData>::SetDerivativeFactor(jacSize);
// Initialize basiskey.
std::vector<LibUtilities::BasisKey> basisKeys(
3, LibUtilities::NullBasisKey);
// Loop over the elements of expansionList.
for (size_t e = 0; e < nTotElmts; ++e)
{
auto const expPtr = this->m_expansionList->GetExp(e);
// Fetch basiskeys of current element.
for (size_t d = 0; d < nDim; d++)
{
basisKeys[d] = expPtr->GetBasis(d)->GetBasisKey();
}
// Copy data to m_matPtr, if necessary.
if (m_matPtr.find(basisKeys) == m_matPtr.end())
{
size_t nqTot = expPtr->GetTotPoints();
size_t nmTot = expPtr->GetNcoeffs();
auto &matPtr = m_matPtr[basisKeys];
matPtr = Array<OneD, Array<OneD, TData>>(nDim);
Array<OneD, NekDouble> tmp(nqTot), t;
for (size_t d = 0; d < nDim; ++d)
{
// Get IProductWRTDerivBase matrix.
matPtr[d] = Array<OneD, TData>(nqTot * nmTot);
for (size_t i = 0; i < nqTot; ++i)
{
Vmath::Zero(nqTot, tmp, 1);
tmp[i] = 1.0;
expPtr->GetStdExp()->IProductWRTDerivBase(
d, tmp, t = matPtr[d] + i * nmTot);
}
}
}
}
// Initialize workspace memory.
auto ndata = this->m_expansionList->GetTotPoints();
m_wsp = Array<OneD, Array<OneD, NekDouble>>(3);
for (size_t i = 0; i < nDim; ++i)
{
m_wsp[i] = Array<OneD, NekDouble>(ndata);
}
}
void apply(Field<TData, FieldState::Phys> &in0,
Field<TData, FieldState::Phys> &in1,
Field<TData, FieldState::Phys> &in2,
Field<TData, FieldState::Coeff> &out,
bool APPEND = false) override
{
// Copy memory to GPU, if necessary and get raw pointers.
std::vector<TData *> inptr{in0.GetStorage().GetCPUPtr(),
in1.GetStorage().GetCPUPtr(),
in2.GetStorage().GetCPUPtr()};
auto *outptr = out.GetStorage().GetCPUPtr();
std::vector<TData *> wspptr{m_wsp[0].get(), m_wsp[1].get(),
m_wsp[2].get()};
// Initialize index.
size_t expIdx = 0;
size_t jacIdx = 0;
size_t dfIdx = 0;
// Initialize basiskey.
std::vector<LibUtilities::BasisKey> basisKeys(
3, LibUtilities::NullBasisKey);
// Loop over the blocks.
for (size_t block_idx = 0; block_idx < out.GetBlocks().size();
++block_idx)
{
// Determine shape and type of the element.
auto const expPtr = this->m_expansionList->GetExp(expIdx);
auto nElmts = out.GetBlocks()[block_idx].num_elements;
auto nDim = expPtr->GetShapeDimension();
auto nCoord = expPtr->GetCoordim();
auto nqTot = expPtr->GetTotPoints();
auto nmTot = expPtr->GetNcoeffs();
auto deformed = expPtr->GetMetricInfo()->GetGtype() ==
SpatialDomains::eDeformed;
// calculate dx/dxi in[0] + dy/dxi in[1] + dz/dxi in[2]
if (deformed)
{
for (size_t d = 0; d < nDim; ++d)
{
Vmath::Vmul(nqTot * nElmts, m_derivFac[d].get() + dfIdx, 1,
inptr[0], 1, wspptr[d], 1);
for (size_t i = 1; i < nCoord; ++i)
{
Vmath::Vvtvp(nqTot * nElmts,
m_derivFac[d + i * nDim].get() + dfIdx, 1,
inptr[i], 1, wspptr[d], 1, wspptr[d], 1);
}
}
}
else
{
for (size_t e = 0; e < nElmts; ++e)
{
for (size_t d = 0; d < nDim; ++d)
{
Vmath::Smul(nqTot, m_derivFac[d][dfIdx + e],
inptr[0] + e * nqTot, 1,
wspptr[d] + e * nqTot, 1);
for (size_t i = 1; i < nCoord; ++i)
{
Vmath::Svtvp(
nqTot, m_derivFac[d + i * nDim][dfIdx + e],
inptr[i] + e * nqTot, 1, wspptr[d] + e * nqTot,
1, wspptr[d] + e * nqTot, 1);
}
}
}
}
// Multiply by jacobian.
if (deformed)
{
for (size_t d = 0; d < nDim; ++d)
{
Vmath::Vmul(nqTot * nElmts, m_jac.get() + jacIdx, 1,
wspptr[d], 1, wspptr[d], 1);
}
}
else
{
for (size_t e = 0; e < nElmts; ++e)
{
for (size_t d = 0; d < nDim; ++d)
{
Vmath::Smul(nqTot, m_jac[jacIdx + e],
wspptr[d] + e * nqTot, 1,
wspptr[d] + e * nqTot, 1);
}
}
}
// Fetch basis key for the current element type.
for (size_t d = 0; d < nDim; d++)
{
basisKeys[d] = expPtr->GetBasis(d)->GetBasisKey();
}
// Fetch matrix.
auto matPtr = m_matPtr[basisKeys];
// Matrix products.
for (size_t d = 0; d < nDim; d++)
{
TData alpha = (d == 0 && !APPEND) ? 0.0 : 1.0;
Blas::Dgemm('N', 'N', nmTot, nElmts, nqTot, 1.0,
matPtr[d].get(), nmTot, wspptr[d], nqTot, alpha,
outptr, nmTot);
// Increment pointer and index for next element type.
inptr[d] += nqTot * nElmts;
wspptr[d] += nqTot * nElmts;
}
jacIdx += deformed ? nqTot * nElmts : nElmts;
dfIdx += deformed ? nqTot * nElmts : nElmts;
outptr += nmTot * nElmts;
expIdx += nElmts;
}
}
static std::unique_ptr<Operator<TData>> instantiate(
const MultiRegions::ExpListSharedPtr &expansionList)
{
return std::make_unique<
OperatorIProductWRTDerivBaseImpl<TData, ImplStdMat>>(
std::move(expansionList));
}
static std::string className;
private:
Array<OneD, TData> m_jac;
Array<OneD, Array<OneD, TData>> m_derivFac;
std::map<std::vector<LibUtilities::BasisKey>,
Array<OneD, Array<OneD, TData>>>
m_matPtr;
Array<OneD, Array<OneD, TData>> m_wsp;
};
} // namespace Nektar::Operators::detail
#pragma once
#ifdef NEKTAR_USE_CUDA
#include "MemoryRegionCUDA.hpp"
#endif
......@@ -52,11 +54,11 @@ DataMap<TData> GetBasisDataCUDA(
}
template <typename TData>
DataMap<TData> GetPointDataCUDA(
DataMap<TData> GetDeriveBasisDataCUDA(
const MultiRegions::ExpListSharedPtr &expansionList)
{
// Initialize data map.
DataMap<TData> points;
DataMap<TData> dbasis;
// Initialize basiskey.
std::vector<LibUtilities::BasisKey> basisKeys(3,
......@@ -74,30 +76,30 @@ DataMap<TData> GetPointDataCUDA(
basisKeys[d] = expPtr->GetBasis(d)->GetBasisKey();
}
// Copy data to points, if necessary.
if (points.find(basisKeys) == points.end())
// Copy data to dbasis, if necessary.
if (dbasis.find(basisKeys) == dbasis.end())
{
points[basisKeys] = std::vector<TData *>(nDim, 0);
dbasis[basisKeys] = std::vector<TData *>(nDim, 0);
for (size_t d = 0; d < nDim; d++)
{
auto ndata = expPtr->GetBasis(d)->GetZ().size();
auto hostPtr = expPtr->GetBasis(d)->GetZ().get();
auto &devicePtr = points[basisKeys][d];
auto ndata = expPtr->GetBasis(d)->GetDbdata().size();
auto hostPtr = expPtr->GetBasis(d)->GetDbdata().get();
auto &devicePtr = dbasis[basisKeys][d];
cudaMalloc((void **)&devicePtr, sizeof(TData) * ndata);
cudaMemcpy(devicePtr, hostPtr, sizeof(TData) * ndata,
cudaMemcpyHostToDevice);
}
}
}
return points;
return dbasis;
}
template <typename TData>
DataMap<TData> GetDerivativeDataCUDA(
DataMap<TData> GetWeightDataCUDA(
const MultiRegions::ExpListSharedPtr &expansionList)
{
// Initialize data map.
DataMap<TData> derivative;
DataMap<TData> weight;
// Initialize basiskey.
std::vector<LibUtilities::BasisKey> basisKeys(3,
......@@ -115,30 +117,48 @@ DataMap<TData> GetDerivativeDataCUDA(
basisKeys[d] = expPtr->GetBasis(d)->GetBasisKey();
}
// Copy data to derivative, if necessary.
if (derivative.find(basisKeys) == derivative.end())
// Copy data to weight, if necessary.
if (weight.find(basisKeys) == weight.end())
{
derivative[basisKeys] = std::vector<TData *>(nDim, 0);
weight[basisKeys] = std::vector<TData *>(nDim, 0);
for (size_t d = 0; d < nDim; d++)
{
auto ndata = expPtr->GetBasis(d)->GetD()->GetPtr().size();
auto hostPtr = expPtr->GetBasis(d)->GetD()->GetPtr().get();
auto &devicePtr = derivative[basisKeys][d];
auto ndata = expPtr->GetBasis(d)->GetW().size();
Array<OneD, TData> w(ndata);
if (expPtr->GetBasis(d)->GetPointsType() ==
LibUtilities::eGaussRadauMAlpha1Beta0)
{
Vmath::Smul(ndata, 0.5, expPtr->GetBasis(d)->GetW().get(),
1, w.get(), 1);
}
else if (expPtr->GetBasis(d)->GetPointsType() ==
LibUtilities::eGaussRadauMAlpha2Beta0)
{
Vmath::Smul(ndata, 0.25, expPtr->GetBasis(d)->GetW().get(),
1, w.get(), 1);
}
else
{
Vmath::Vcopy(ndata, expPtr->GetBasis(d)->GetW().get(), 1,
w.get(), 1);
}
auto hostPtr = w.get();
auto &devicePtr = weight[basisKeys][d];
cudaMalloc((void **)&devicePtr, sizeof(TData) * ndata);
cudaMemcpy(devicePtr, hostPtr, sizeof(TData) * ndata,
cudaMemcpyHostToDevice);
}
}
}
return derivative;
return weight;
}
template <typename TData>
DataMap<TData> GetWeightDataCUDA(
DataMap<TData> GetPointDataCUDA(
const MultiRegions::ExpListSharedPtr &expansionList)
{
// Initialize data map.
DataMap<TData> weight;
DataMap<TData> points;
// Initialize basiskey.
std::vector<LibUtilities::BasisKey> basisKeys(3,
......@@ -156,40 +176,63 @@ DataMap<TData> GetWeightDataCUDA(
basisKeys[d] = expPtr->GetBasis(d)->GetBasisKey();
}
// Copy data to weight, if necessary.
if (weight.find(basisKeys) == weight.end())
// Copy data to points, if necessary.
if (points.find(basisKeys) == points.end())
{
weight[basisKeys] = std::vector<TData *>(nDim, 0);
points[basisKeys] = std::vector<TData *>(nDim, 0);
for (size_t d = 0; d < nDim; d++)
{
auto ndata = expPtr->GetBasis(d)->GetW().size();
Array<OneD, TData> w(ndata);
if (expPtr->GetBasis(d)->GetPointsType() ==
LibUtilities::eGaussRadauMAlpha1Beta0)
{
Vmath::Smul(ndata, 0.5, expPtr->GetBasis(d)->GetW().get(),
1, w.get(), 1);
}
else if (expPtr->GetBasis(d)->GetPointsType() ==
LibUtilities::eGaussRadauMAlpha2Beta0)
{
Vmath::Smul(ndata, 0.25, expPtr->GetBasis(d)->GetW().get(),
1, w.get(), 1);
}
else
{
Vmath::Vcopy(ndata, expPtr->GetBasis(d)->GetW().get(), 1,
w.get(), 1);
}
auto hostPtr = w.get();
auto &devicePtr = weight[basisKeys][d];
auto ndata = expPtr->GetBasis(d)->GetZ().size();
auto hostPtr = expPtr->GetBasis(d)->GetZ().get();
auto &devicePtr = points[basisKeys][d];
cudaMalloc((void **)&devicePtr, sizeof(TData) * ndata);
cudaMemcpy(devicePtr, hostPtr, sizeof(TData) * ndata,
cudaMemcpyHostToDevice);
}
}
}
return weight;
return points;
}
template <typename TData>
DataMap<TData> GetDerivativeDataCUDA(
const MultiRegions::ExpListSharedPtr &expansionList)
{
// Initialize data map.
DataMap<TData> derivative;
// Initialize basiskey.
std::vector<LibUtilities::BasisKey> basisKeys(3,
LibUtilities::NullBasisKey);
// Loop over the elements of expansionList.
size_t nDim = expansionList->GetShapeDimension();
for (size_t i = 0; i < expansionList->GetNumElmts(); ++i)
{
auto const expPtr = expansionList->GetExp(i);
// Fetch basiskeys of current element.
for (size_t d = 0; d < nDim; d++)
{
basisKeys[d] = expPtr->GetBasis(d)->GetBasisKey();
}
// Copy data to derivative, if necessary.
if (derivative.find(basisKeys) == derivative.end())
{
derivative[basisKeys] = std::vector<TData *>(nDim, 0);
for (size_t d = 0; d < nDim; d++)
{
auto ndata = expPtr->GetBasis(d)->GetD()->GetPtr().size();
auto hostPtr = expPtr->GetBasis(d)->GetD()->GetPtr().get();
auto &devicePtr = derivative[basisKeys][d];
cudaMalloc((void **)&devicePtr, sizeof(TData) * ndata);
cudaMemcpy(devicePtr, hostPtr, sizeof(TData) * ndata,
cudaMemcpyHostToDevice);
}
}
}
return derivative;
}
template <typename TData>
......
......@@ -19,7 +19,8 @@ public:
}
virtual void apply(Field<TData, FieldState::Phys> &in,
Field<TData, FieldState::Coeff> &out) = 0;
Field<TData, FieldState::Coeff> &out,
const TData lambda = 1.0) = 0;
virtual void operator()(Field<TData, FieldState::Phys> &in,
Field<TData, FieldState::Coeff> &out)
{
......
#include "OperatorIProductWRTDerivBase.hpp"
namespace Nektar::Operators
{
template <>
const std::string IProductWRTDerivBase<>::key = "IProductWRTDerivBase";
template <> const std::string IProductWRTDerivBase<>::default_impl = "StdMat";
} // namespace Nektar::Operators
#pragma once
#include "Field.hpp"
#include "Operator.hpp"
namespace Nektar::Operators
{
// IProductWRTDerivBase base class
// Defines the apply operator to enforce apply parameter types
template <typename TData>
class OperatorIProductWRTDerivBase : public Operator<TData>
{
public:
virtual ~OperatorIProductWRTDerivBase() = default;
OperatorIProductWRTDerivBase(
const MultiRegions::ExpListSharedPtr &expansionList)
: Operator<TData>(std::move(expansionList))
{
}
virtual void apply(Field<TData, FieldState::Phys> &in0,
Field<TData, FieldState::Phys> &in1,
Field<TData, FieldState::Phys> &in2,
Field<TData, FieldState::Coeff> &out,
bool APPEND = false) = 0;
virtual void operator()(Field<TData, FieldState::Phys> &in0,
Field<TData, FieldState::Phys> &in1,
Field<TData, FieldState::Phys> &in2,
Field<TData, FieldState::Coeff> &out,
bool APPEND = false)
{
apply(in0, in1, in2, out);
}
};
// Descriptor / traits class for IProductWRTDerivBase
template <typename TData = default_fp_type> struct IProductWRTDerivBase
{
using class_name = OperatorIProductWRTDerivBase<TData>;
using FieldIn = Field<TData, FieldState::Phys>;
using FieldOut = Field<TData, FieldState::Coeff>;
static const std::string key;
static const std::string default_impl;
static std::shared_ptr<class_name> create(
const MultiRegions::ExpListSharedPtr &expansionList,
std::string pKey = "")
{
return Operator<TData>::template create<IProductWRTDerivBase>(
std::move(expansionList), pKey);
}
};
namespace detail
{
// Template for IProductWRTDerivBase implementations
template <typename TData, typename Op> class OperatorIProductWRTDerivBaseImpl;
} // namespace detail
} // namespace Nektar::Operators
/**
/*i*
* @file main.cpp
* @author Nektar++ Development Team
* @brief Demonstrator program for the new Field class.
......@@ -15,6 +15,7 @@
#include "Field.hpp"
#include "Operators/OperatorBwdTrans.hpp"
#include "Operators/OperatorIProductWRTBase.hpp"
#include "Operators/OperatorIProductWRTDerivBase.hpp"
#include "Operators/OperatorPhysDeriv.hpp"
#include <LibUtilities/BasicUtils/SessionReader.h>
......@@ -433,6 +434,33 @@ int main(int argc, char *argv[])
std::cout << std::endl;
}
std::cout << std::endl;
std::cout << "IProductWRTDerivBase (StdMat) test starts." << std::endl;
// Create two Field objects with a MemoryRegionCPU backend by default
// for the inner product with respect to deriv base
auto outCoeff = Field<double, FieldState::Coeff>::create(blocks_coeff);
// IProductWRTDerivBase
IProductWRTDerivBase<>::create(explist, "StdMat")
->apply(outPhys0, outPhys1, outPhys2, outCoeff);
// Check output values.
std::cout << "Out:" << std::endl;
auto *outptr = outCoeff.GetStorage().GetCPUPtr();
for (auto const &block : outCoeff.GetBlocks())
{
for (size_t el = 0; el < block.num_elements; ++el)
{
for (size_t coeff = 0; coeff < block.num_pts; ++coeff)
{
std::cout << *(outptr++) << ' ';
}
std::cout << std::endl;
}
std::cout << std::endl;
}
std::cout << std::endl;
}
// Test PhysDeriv (CUDA) Implementation
#ifdef NEKTAR_USE_CUDA
......@@ -526,6 +554,36 @@ int main(int argc, char *argv[])
std::cout << std::endl;
}
std::cout << std::endl;
std::cout << "IProductWRTDerivBase (CUDA) test starts." << std::endl;
// Create two Field objects with a MemoryRegionCPU backend by default
// for the inner product with respect to deriv base
auto outCoeff =
Field<double, FieldState::Coeff>::create<MemoryRegionCUDA>(
blocks_coeff);
// IProductWRTDerivBase
IProductWRTDerivBase<>::create(explist, "CUDA")
->apply(outPhys0, outPhys1, outPhys2, outCoeff);
// Check output values.
std::cout << "Out:" << std::endl;
auto *outptr =
outCoeff.template GetStorage<MemoryRegionCUDA>().GetCPUPtr();
for (auto const &block : outCoeff.GetBlocks())
{
for (size_t el = 0; el < block.num_elements; ++el)
{
for (size_t coeff = 0; coeff < block.num_pts; ++coeff)
{
std::cout << *(outptr++) << ' ';
}
std::cout << std::endl;
}
std::cout << std::endl;
}
std::cout << std::endl;
}
#endif
......
#define BOOST_TEST_MODULE example
#define BOOST_TEST_DYN_LINK
#define BOOST_TEST_MODULE TestBwdTrans
#include <boost/test/unit_test.hpp>
#include <iostream>
......@@ -13,11 +14,14 @@
#include "init_fields.hpp"
BOOST_AUTO_TEST_SUITE(TestBwdTrans)
using namespace std;
using namespace Nektar::Operators;
using namespace Nektar::LibUtilities;
using namespace Nektar;
class Line : public InitFields<double, FieldState::Coeff, FieldState::Phys>
{
public:
......@@ -90,3 +94,5 @@ BOOST_FIXTURE_TEST_CASE(bwdtrans, Line)
double TOL{0.01};
BOOST_TEST(fixt_out->compare(*fixt_expected, TOL));
}
BOOST_AUTO_TEST_SUITE_END()
#define BOOST_TEST_MODULE example
#define BOOST_TEST_DYN_LINK
#define BOOST_TEST_MODULE TestBwdTransCUDA
#include <boost/test/unit_test.hpp>
#include <iostream>
......@@ -13,6 +14,8 @@
#include "init_fields.hpp"
BOOST_AUTO_TEST_SUITE(TestBwdTransCUDA)
using namespace std;
using namespace Nektar::Operators;
using namespace Nektar::LibUtilities;
......@@ -68,3 +71,5 @@ BOOST_FIXTURE_TEST_CASE(bwdtranscuda, Line)
BOOST_CHECK_CLOSE(y[5], 4.082915537389534, TOL);
BOOST_CHECK_CLOSE(y[6], 2.000000000000000, TOL);
}
BOOST_AUTO_TEST_SUITE_END()
#define BOOST_TEST_MODULE example
#define BOOST_TEST_DYN_LINK
#define BOOST_TEST_MODULE TestIPWRTBase
#include <boost/test/unit_test.hpp>
#include <iostream>
......@@ -13,6 +14,8 @@
#include "init_fields.hpp"
BOOST_AUTO_TEST_SUITE(TestIPWRTBase)
using namespace std;
using namespace Nektar::Operators;
using namespace Nektar::LibUtilities;
......@@ -65,3 +68,5 @@ BOOST_FIXTURE_TEST_CASE(ipwrtbase, Line)
BOOST_TEST(std::abs(y[4] - 9.36750677027476e-17) < TOL);
BOOST_CHECK_CLOSE(y[5], 0.00746847423694819, TOL);
}
BOOST_AUTO_TEST_SUITE_END()
#define BOOST_TEST_MODULE example
#define BOOST_TEST_DYN_LINK
#define BOOST_TEST_MODULE TestIPWRTBaseCUDA
#include <boost/test/unit_test.hpp>
#include <iostream>
......@@ -13,6 +14,8 @@
#include "init_fields.hpp"
BOOST_AUTO_TEST_SUITE(TestIPWRTBaseCUDA)
using namespace std;
using namespace Nektar::Operators;
using namespace Nektar::LibUtilities;
......@@ -67,3 +70,5 @@ BOOST_FIXTURE_TEST_CASE(ipwrtbasecuda, Line)
BOOST_TEST(std::abs(y[4] - 1.387778780781446e-17) < TOL);
BOOST_CHECK_CLOSE(y[5], 0.0074684742369482, TOL);
}
BOOST_AUTO_TEST_SUITE_END()