diff --git a/.gitignore b/.gitignore index 9001fc99691800ef86c7376fc64932f1fee2c688..1fae8ef53b6af92f20e2e5026b1f64801a323501 100644 --- a/.gitignore +++ b/.gitignore @@ -5,6 +5,7 @@ builds a.out .ccls-cache/ .cache +*.opt .* !.gitignore !.gitlab-ci.yml diff --git a/CMakeLists.txt b/CMakeLists.txt index ada084f53d15d441b3deac7a722522f444e28d18..1ff69fff82d7406a9c5d8778c20d63db46fd4d9c 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -10,6 +10,10 @@ set(CMAKE_CXX_STANDARD 17) set(CMAKE_CXX_STANDARD_REQUIRED ON) set(CMAKE_EXPORT_COMPILE_COMMANDS ON) +if(NOT DEFINED CMAKE_CUDA_ARCHITECTURES) + set(CMAKE_CUDA_ARCHITECTURES 86) +endif() + # Default install location: build/dist IF (CMAKE_INSTALL_PREFIX_INITIALIZED_TO_DEFAULT) SET(CMAKE_INSTALL_PREFIX ${CMAKE_BINARY_DIR}/dist CACHE PATH "" FORCE) diff --git a/MemoryRegionCUDA.hpp b/MemoryRegionCUDA.hpp index ff61995b92af25660b9e3533555dccf0c0fc3d76..a849cb0d0d95569d962c3247a346fab60308e582 100644 --- a/MemoryRegionCUDA.hpp +++ b/MemoryRegionCUDA.hpp @@ -76,7 +76,7 @@ public: void HostToDevice(); void DeviceToHost(); - bool GetOnDevice() const + bool IsOnDevice() const { return m_ondevice; } diff --git a/Operators/BwdTrans/BwdTransCUDA.hpp b/Operators/BwdTrans/BwdTransCUDA.hpp index 76c2616ab58d443ce06dc044ab2ec76167cd1edf..024251a075c5635bd5fb14e880494dba2ff5a41a 100644 --- a/Operators/BwdTrans/BwdTransCUDA.hpp +++ b/Operators/BwdTrans/BwdTransCUDA.hpp @@ -1,10 +1,54 @@ #include "MemoryRegionCUDA.hpp" +#include "Operators/BwdTrans/BwdTransCUDAKernels.cuh" #include "Operators/OperatorBwdTrans.hpp" +#include "Operators/OperatorHelper.cuh" namespace Nektar::Operators::detail { -// Matrix-free implementation +template <typename TData> +void BwdTrans1DKernel(const size_t gridSize, const size_t blockSize, + const size_t nm0, const size_t nq0, const size_t nElmts, + const TData *basis0, const TData *in, TData *out); + +template <typename TData> +void BwdTrans1DKernel_QP(const size_t nm0, const size_t nq0, + const size_t nElmts, const TData *basis0, + const TData *in, TData *out); + +template <typename TData> +void BwdTrans2DKernel(const size_t gridSize, const size_t blockSize, + LibUtilities::ShapeType shapetype, const size_t nm0, + const size_t nm1, const size_t nq0, const size_t nq1, + const size_t nElmts, const bool correct, + const TData *basis0, const TData *basis1, const TData *in, + TData *out); + +template <typename TData> +void BwdTrans2DKernel_QP(LibUtilities::ShapeType shapetype, const size_t nm0, + const size_t nm1, const size_t nq0, const size_t nq1, + const size_t nElmts, const bool correct, + const TData *basis0, const TData *basis1, + const TData *in, TData *out); + +template <typename TData> +void BwdTrans3DKernel(const size_t gridSize, const size_t blockSize, + LibUtilities::ShapeType shapetype, const size_t nm0, + const size_t nm1, const size_t nm2, const size_t nq0, + const size_t nq1, const size_t nq2, const size_t nElmts, + const bool correct, const TData *basis0, + const TData *basis1, const TData *basis2, const TData *in, + TData *out); + +template <typename TData> +void BwdTrans3DKernel_QP(LibUtilities::ShapeType shapetype, const size_t nm0, + const size_t nm1, const size_t nm2, const size_t nq0, + const size_t nq1, const size_t nq2, + const size_t nElmts, const bool correct, + const TData *basis0, const TData *basis1, + const TData *basis2, const TData *in, TData *out); + +// BwdTrans implementation template <typename TData> class OperatorBwdTransImpl<TData, ImplCUDA> : public OperatorBwdTrans<TData> { @@ -12,14 +56,102 @@ public: OperatorBwdTransImpl(const MultiRegions::ExpListSharedPtr &expansionList) : OperatorBwdTrans<TData>(expansionList) { + m_basis = GetBasisDataCUDA<TData>(expansionList); + } + + ~OperatorBwdTransImpl() + { + for (auto &basis : m_basis) + { + for (size_t i = 0; i < basis.second.size(); i++) + { + cudaFree(basis.second[i]); + } + } } void apply(Field<TData, FieldState::Coeff> &in, Field<TData, FieldState::Phys> &out) override { - TData *x = in.template GetStorage<MemoryRegionCUDA>().GetGPUPtr(); - TData *y = out.template GetStorage<MemoryRegionCUDA>().GetGPUPtr(); - std::cout << "Op bwd trans CUDA\n"; + // Copy memory to GPU, if necessary and get raw pointers. + auto const *inptr = + in.template GetStorage<MemoryRegionCUDA>().GetGPUPtr(); + auto *outptr = out.template GetStorage<MemoryRegionCUDA>().GetGPUPtr(); + + // Initialize index. + size_t expIdx = 0; + + // Initialize basiskey. + std::vector<LibUtilities::BasisKey> basisKeys( + 3, LibUtilities::NullBasisKey); + + // Loop over the blocks. + for (size_t block_idx = 0; block_idx < in.GetBlocks().size(); + ++block_idx) + { + // Determine shape and type of the element. + auto const expPtr = this->m_expansionList->GetExp(expIdx); + auto nElmts = in.GetBlocks()[block_idx].num_elements; + auto nmTot = expPtr->GetNcoeffs(); + auto nqTot = expPtr->GetTotPoints(); + auto shape = expPtr->DetShapeType(); + + // Deterime CUDA grid parameters. + m_gridSize = nElmts / m_blockSize; + m_gridSize += (nElmts % m_blockSize == 0) ? 0 : 1; + + // Flag for collapsed coordinate correction. + bool correct = expPtr->GetBasis(0)->GetBasisType() == + LibUtilities::eModified_A; + + // Fetch basis key for the current element type. + for (size_t d = 0; d < expPtr->GetShapeDimension(); d++) + { + basisKeys[d] = expPtr->GetBasis(d)->GetBasisKey(); + } + + // Function call to kernel functions. + if (expPtr->GetShapeDimension() == 1) + { + auto basis0 = m_basis[basisKeys][0]; + auto nm0 = expPtr->GetBasisNumModes(0); + auto nq0 = expPtr->GetNumPoints(0); + BwdTrans1DKernel(m_gridSize, m_blockSize, nm0, nq0, nElmts, + basis0, inptr, outptr); + } + else if (expPtr->GetShapeDimension() == 2) + { + auto basis0 = m_basis[basisKeys][0]; + auto basis1 = m_basis[basisKeys][1]; + auto nm0 = expPtr->GetBasisNumModes(0); + auto nm1 = expPtr->GetBasisNumModes(1); + auto nq0 = expPtr->GetNumPoints(0); + auto nq1 = expPtr->GetNumPoints(1); + BwdTrans2DKernel(m_gridSize, m_blockSize, shape, nm0, nm1, nq0, + nq1, nElmts, correct, basis0, basis1, inptr, + outptr); + } + else if (expPtr->GetShapeDimension() == 3) + { + auto basis0 = m_basis[basisKeys][0]; + auto basis1 = m_basis[basisKeys][1]; + auto basis2 = m_basis[basisKeys][2]; + auto nm0 = expPtr->GetBasisNumModes(0); + auto nm1 = expPtr->GetBasisNumModes(1); + auto nm2 = expPtr->GetBasisNumModes(2); + auto nq0 = expPtr->GetNumPoints(0); + auto nq1 = expPtr->GetNumPoints(1); + auto nq2 = expPtr->GetNumPoints(2); + BwdTrans3DKernel(m_gridSize, m_blockSize, shape, nm0, nm1, nm2, + nq0, nq1, nq2, nElmts, correct, basis0, basis1, + basis2, inptr, outptr); + } + + // Increment pointer and index for next element type. + inptr += nmTot * nElmts; + outptr += nqTot * nElmts; + expIdx += nElmts; + } } static std::unique_ptr<Operator<TData>> instantiate( @@ -30,6 +162,184 @@ public: } static std::string className; + +private: + std::map<std::vector<LibUtilities::BasisKey>, std::vector<TData *>> m_basis; + size_t m_blockSize = 32; + size_t m_gridSize; }; +template <typename TData> +void BwdTrans1DKernel(const size_t gridSize, const size_t blockSize, + const size_t nm0, const size_t nq0, const size_t nElmts, + const TData *basis0, const TData *in, TData *out) +{ + BwdTransSegKernel<<<gridSize, blockSize>>>(nm0, nq0, nElmts, basis0, in, + out); +} + +template <typename TData> +void BwdTrans1DKernel_QP(const size_t nm0, const size_t nq0, + const size_t nElmts, const TData *basis0, + const TData *in, TData *out) +{ + BwdTransSegKernel_QP<<<nElmts, nq0>>>(nm0, nq0, basis0, in, out); +} + +template <typename TData> +void BwdTrans2DKernel(const size_t gridSize, const size_t blockSize, + LibUtilities::ShapeType shapetype, const size_t nm0, + const size_t nm1, const size_t nq0, const size_t nq1, + const size_t nElmts, const bool correct, + const TData *basis0, const TData *basis1, const TData *in, + TData *out) +{ + if (shapetype == LibUtilities::Quad) + { + BwdTransQuadKernel<<<gridSize, blockSize>>>(nm0, nm1, nq0, nq1, nElmts, + basis0, basis1, in, out); + } + else if (shapetype == LibUtilities::Tri) + { + size_t nmTot = + LibUtilities::StdTriData::getNumberOfCoefficients(nm0, nm1); + BwdTransTriKernel<<<gridSize, blockSize>>>(nm0, nm1, nmTot, nq0, nq1, + nElmts, correct, basis0, + basis1, in, out); + } +} + +template <typename TData> +void BwdTrans2DKernel_QP(LibUtilities::ShapeType shapetype, const size_t nm0, + const size_t nm1, const size_t nq0, const size_t nq1, + const size_t nElmts, const bool correct, + const TData *basis0, const TData *basis1, + const TData *in, TData *out) +{ + if (shapetype == LibUtilities::Quad) + { + TData *wsp = 0; + cudaMalloc((void **)&wsp, sizeof(TData) * nq0 * nm1); + BwdTransQuadKernel_QP<<<nElmts, dim3(nq0, nq1)>>>( + nm0, nm1, nq0, nq1, basis0, basis1, in, wsp, out); + cudaFree(wsp); + } + else if (shapetype == LibUtilities::Tri) + { + size_t nmTot = + LibUtilities::StdTriData::getNumberOfCoefficients(nm0, nm1); + TData *wsp = 0; + cudaMalloc((void **)&wsp, sizeof(TData) * nm0 * nq1); + BwdTransTriKernel_QP<<<nElmts, dim3(nq0, nq1)>>>( + nm0, nm1, nmTot, nq0, nq1, correct, basis0, basis1, in, wsp, out); + cudaFree(wsp); + } +} + +template <typename TData> +void BwdTrans3DKernel(const size_t gridSize, const size_t blockSize, + LibUtilities::ShapeType shapetype, const size_t nm0, + const size_t nm1, const size_t nm2, const size_t nq0, + const size_t nq1, const size_t nq2, const size_t nElmts, + const bool correct, const TData *basis0, + const TData *basis1, const TData *basis2, const TData *in, + TData *out) +{ + if (shapetype == LibUtilities::Hex) + { + BwdTransHexKernel<<<gridSize, blockSize>>>(nm0, nm1, nm2, nq0, nq1, nq2, + nElmts, basis0, basis1, + basis2, in, out); + } + else if (shapetype == LibUtilities::Tet) + { + size_t nmTot = + LibUtilities::StdTetData::getNumberOfCoefficients(nm0, nm1, nm2); + BwdTransTetKernel<<<gridSize, blockSize>>>( + nm0, nm1, nm2, nmTot, nq0, nq1, nq2, nElmts, correct, basis0, + basis1, basis2, in, out); + } + else if (shapetype == LibUtilities::Pyr) + { + size_t nmTot = + LibUtilities::StdPyrData::getNumberOfCoefficients(nm0, nm1, nm2); + BwdTransPyrKernel<<<gridSize, blockSize>>>( + nm0, nm1, nm2, nmTot, nq0, nq1, nq2, nElmts, correct, basis0, + basis1, basis2, in, out); + } + else if (shapetype == LibUtilities::Prism) + { + size_t nmTot = + LibUtilities::StdPrismData::getNumberOfCoefficients(nm0, nm1, nm2); + BwdTransPrismKernel<<<gridSize, blockSize>>>( + nm0, nm1, nm2, nmTot, nq0, nq1, nq2, nElmts, correct, basis0, + basis1, basis2, in, out); + } +} + +template <typename TData> +void BwdTrans3DKernel_QP(LibUtilities::ShapeType shapetype, const size_t nm0, + const size_t nm1, const size_t nm2, const size_t nq0, + const size_t nq1, const size_t nq2, + const size_t nElmts, const bool correct, + const TData *basis0, const TData *basis1, + const TData *basis2, const TData *in, TData *out) +{ + if (shapetype == LibUtilities::Hex) + { + TData *wsp0 = 0; + TData *wsp1 = 0; + cudaMalloc((void **)&wsp0, sizeof(TData) * nq0 * nm1 * nm2); + cudaMalloc((void **)&wsp1, sizeof(TData) * nm0 * nq1 * nq2); + BwdTransHexKernel_QP<<<nElmts, dim3(nq0, nq1, nq2)>>>( + nm0, nm1, nm2, nq0, nq1, nq2, basis0, basis1, basis2, in, wsp0, + wsp1, out); + cudaFree(wsp0); + cudaFree(wsp1); + } + if (shapetype == LibUtilities::Tet) + { + size_t nmTot = + LibUtilities::StdTetData::getNumberOfCoefficients(nm0, nm1, nm2); + TData *fpq = 0; + TData *fp = 0; + cudaMalloc((void **)&fpq, + sizeof(TData) * (2 * nm1 - nm0 + 1) * nm0 / 2 * nq2); + cudaMalloc((void **)&fp, sizeof(TData) * nm0 * nq1 * nq2); + BwdTransTetKernel_QP<<<nElmts, dim3(nq0, nq1, nq2)>>>( + nm0, nm1, nm2, nmTot, nq0, nq1, nq2, correct, basis0, basis1, + basis2, in, fpq, fp, out); + cudaFree(fpq); + cudaFree(fp); + } + if (shapetype == LibUtilities::Pyr) + { + size_t nmTot = + LibUtilities::StdPyrData::getNumberOfCoefficients(nm0, nm1, nm2); + TData *fpq = 0; + TData *fp = 0; + cudaMalloc((void **)&fpq, sizeof(TData) * nm0 * nm1 * nq2); + cudaMalloc((void **)&fp, sizeof(TData) * nm0 * nq1 * nq2); + BwdTransPyrKernel_QP<<<nElmts, dim3(nq0, nq1, nq2)>>>( + nm0, nm1, nm2, nmTot, nq0, nq1, nq2, correct, basis0, basis1, + basis2, in, fpq, fp, out); + cudaFree(fpq); + cudaFree(fp); + } + if (shapetype == LibUtilities::Prism) + { + size_t nmTot = + LibUtilities::StdPrismData::getNumberOfCoefficients(nm0, nm1, nm2); + TData *fpq = 0; + TData *fp = 0; + cudaMalloc((void **)&fpq, sizeof(TData) * nm0 * nm1 * nq2); + cudaMalloc((void **)&fp, sizeof(TData) * nm0 * nq1 * nq2); + BwdTransPrismKernel_QP<<<nElmts, dim3(nq0, nq1, nq2)>>>( + nm0, nm1, nm2, nmTot, nq0, nq1, nq2, correct, basis0, basis1, + basis2, in, fpq, fp, out); + cudaFree(fpq); + cudaFree(fp); + } +} + } // namespace Nektar::Operators::detail diff --git a/Operators/BwdTrans/BwdTransCUDAKernels.cuh b/Operators/BwdTrans/BwdTransCUDAKernels.cuh new file mode 100644 index 0000000000000000000000000000000000000000..ca3ae323e69084dc22d46c27d0c22b5d31e57ebd --- /dev/null +++ b/Operators/BwdTrans/BwdTransCUDAKernels.cuh @@ -0,0 +1,943 @@ +namespace Nektar::Operators::detail +{ +template <typename TData> +__global__ void BwdTransSegKernel(const size_t nm0, const size_t nq0, + const size_t nelmt, const TData *basis0, + const TData *in, TData *out) +{ + size_t e = blockDim.x * blockIdx.x + threadIdx.x; + + if (e >= nelmt) + { + return; + } + + const TData *inptr = in + (nm0 * e); + TData *outptr = out + (nq0 * e); + + for (size_t i = 0; i < nq0; ++i) + { + TData tmp = inptr[0] * basis0[i]; + for (size_t p = 1; p < nm0; ++p) + { + tmp += inptr[p] * basis0[p * nq0 + i]; + } + outptr[i] = tmp; + } +} + +template <typename TData> +__global__ void BwdTransQuadKernel(const size_t nm0, const size_t nm1, + const size_t nq0, const size_t nq1, + const size_t nelmt, const TData *basis0, + const TData *basis1, const TData *in, + TData *out) +{ + size_t e = blockDim.x * blockIdx.x + threadIdx.x; + + if (e >= nelmt) + { + return; + } + + TData *wsp = new TData[nq0 * nm1]; + + const TData *inptr = in + (nm0 * nm1 * e); + TData *outptr = out + (nq0 * nq1 * e); + + size_t cnt_iq = 0; + for (size_t i = 0; i < nq0; ++i) + { + size_t cnt_pq = 0; + for (size_t q = 0; q < nm1; ++q) + { + TData tmp = inptr[cnt_pq++] * basis0[i]; + for (size_t p = 1; p < nm0; ++p) + { + tmp += inptr[cnt_pq++] * basis0[p * nq0 + i]; + } + wsp[cnt_iq++] = tmp; + } + } + + size_t cnt_ij = 0; + for (size_t j = 0; j < nq1; ++j) + { + size_t cnt_iq = 0; + for (size_t i = 0; i < nq0; ++i) + { + TData tmp = wsp[cnt_iq++] * basis1[j]; + for (size_t q = 1; q < nm1; ++q) + { + tmp += wsp[cnt_iq++] * basis1[q * nq1 + j]; + } + outptr[cnt_ij++] = tmp; + } + } + + delete wsp; +} + +template <typename TData> +__global__ void BwdTransTriKernel(const size_t nm0, const size_t nm1, + const size_t nmTot, const size_t nq0, + const size_t nq1, const size_t nelmt, + const bool correct, const TData *basis0, + const TData *basis1, const TData *in, + TData *out) +{ + size_t e = blockDim.x * blockIdx.x + threadIdx.x; + + if (e >= nelmt) + { + return; + } + + TData *wsp = new TData[nm0]; + + const TData *inptr = in + (nmTot * e); + TData *outptr = out + (nq0 * nq1 * e); + + size_t cnt_ij = 0; + for (size_t j = 0; j < nq1; ++j) + { + size_t mode = 0; + for (size_t p = 0; p < nm0; ++p) + { + TData tmp = 0.0; + for (size_t q = 0; q < (nm1 - p); ++q) + { + tmp += basis1[mode * nq1 + j] * inptr[mode]; + mode++; + } + wsp[p] = tmp; + } + + for (size_t i = 0; i < nq0; ++i) + { + TData tmp = wsp[0] * basis0[i]; + for (size_t p = 1; p < nm0; ++p) + { + tmp += wsp[p] * basis0[p * nq0 + i]; + } + + if (correct) + { + tmp += inptr[1] * basis0[nq0 + i] * basis1[nq1 + j]; + } + outptr[cnt_ij++] = tmp; + } + } + + delete wsp; +} + +template <typename TData> +__global__ void BwdTransHexKernel(const size_t nm0, const size_t nm1, + const size_t nm2, const size_t nq0, + const size_t nq1, const size_t nq2, + const size_t nelmt, const TData *basis0, + const TData *basis1, const TData *basis2, + const TData *in, TData *out) +{ + size_t e = blockDim.x * blockIdx.x + threadIdx.x; + + if (e >= nelmt) + { + return; + } + + TData *wsp0 = new TData[nq0 * nm1 * nm2]; + TData *wsp1 = new TData[nq0 * nq1 * nm2]; + + const TData *inptr = in + (nm0 * nm1 * nm2 * e); + TData *outptr = out + (nq0 * nq1 * nq2 * e); + + size_t cnt_irq = 0; + for (size_t i = 0; i < nq0; ++i) + { + size_t cnt_rqp = 0; + for (size_t r = 0; r < nm2; ++r) + { + for (size_t q = 0; q < nm1; ++q) + { + TData tmp = inptr[cnt_rqp++] * basis0[i]; + for (size_t p = 1; p < nm0; ++p) + { + tmp += inptr[cnt_rqp++] * basis0[p * nq0 + i]; + } + wsp0[cnt_irq++] = tmp; + } + } + } + + size_t cnt_jir = 0; + for (size_t j = 0; j < nq1; ++j) + { + size_t cnt_irq = 0; + for (size_t i = 0; i < nq0; ++i) + { + for (size_t r = 0; r < nm2; ++r) + { + TData tmp = wsp0[cnt_irq++] * basis1[j]; + for (size_t q = 1; q < nm1; ++q) + { + tmp += wsp0[cnt_irq++] * basis1[q * nq1 + j]; + } + wsp1[cnt_jir++] = tmp; + } + } + } + + size_t cnt_kji = 0; + for (size_t k = 0; k < nq2; ++k) + { + size_t cnt_jir = 0; + for (size_t j = 0; j < nq1; ++j) + { + for (size_t i = 0; i < nq0; ++i) + { + TData tmp = wsp1[cnt_jir++] * basis2[k]; + for (size_t r = 1; r < nm2; ++r) + { + tmp += wsp1[cnt_jir++] * basis2[r * nq2 + k]; + } + outptr[cnt_kji++] = tmp; + } + } + } + + delete wsp0; + delete wsp1; +} + +template <typename TData> // not working for nm2 > nm1 +__global__ void BwdTransTetKernel(const size_t nm0, const size_t nm1, + const size_t nm2, const size_t nmTot, + const size_t nq0, const size_t nq1, + const size_t nq2, const size_t nelmt, + const bool correct, const TData *basis0, + const TData *basis1, const TData *basis2, + const TData *in, TData *out) +{ + size_t e = blockDim.x * blockIdx.x + threadIdx.x; + + if (e >= nelmt) + { + return; + } + + size_t nm01 = (2 * nm1 - nm0 + 1) * nm0 / 2; + TData *fpq = new TData[nm01]; + TData *fp = new TData[nm0]; + + const TData *inptr = in + (nmTot * e); + TData *outptr = out + (nq0 * nq1 * nq2 * e); + + size_t cnt_kji = 0; + for (size_t k = 0; k < nq2; ++k) + { + size_t cnt_pq = 0; + size_t mode = 0; + for (size_t p = 0; p < nm0; ++p) + { + for (size_t q = 0; q < nm1 - p; ++q) + { + TData tmp = basis2[k + nq2 * mode] * inptr[mode++]; + for (size_t r = 1; r < nm2 - p - q; ++r) + { + tmp += basis2[k + nq2 * mode] * inptr[mode++]; + } + fpq[cnt_pq++] = tmp; + } + + // increment mode in case order1!=order2 + for (size_t q = nm1 - p; q < nm2 - p; ++q) + { + mode += nm2 - p - q; + } + } + + for (size_t j = 0; j < nq1; ++j) + { + mode = 0; + cnt_pq = 0; + for (size_t p = 0; p < nm0; ++p) + { + TData tmp = fpq[cnt_pq++] * basis1[mode * nq1 + j]; + for (size_t q = 1; q < nm1 - p; ++q) + { + tmp += fpq[cnt_pq++] * basis1[(mode + q) * nq1 + j]; + } + + fp[p] = tmp; + mode += nm1 - p; + } + + for (size_t i = 0; i < nq0; ++i) + { + TData tmp = basis0[i] * fp[0]; + for (size_t p = 1; p < nm0; ++p) + { + tmp += basis0[p * nq0 + i] * fp[p]; + } + + if (correct) + { + // top vertex + TData tmp1 = basis0[i] * basis1[nq1 + j]; + tmp1 += basis0[nq0 + i] * basis1[j]; + tmp1 += basis0[nq0 + i] * basis1[nq1 + j]; + tmp1 *= basis2[nq2 + k]; + tmp += tmp1 * inptr[1]; + + // bottom vertex + tmp1 = basis0[nq0 + i] * basis1[nq1 + j]; + tmp1 *= basis2[k]; + tmp += tmp1 * inptr[nm2]; + + // singular edge + for (size_t r = 1; r < nm2 - 1; ++r) + { + tmp1 = basis1[nq1 + j] * basis0[nq0 + i]; + tmp1 *= basis2[(r + 1) * nq2 + k]; + tmp += tmp1 * inptr[nm2 + r]; + } + } + outptr[cnt_kji++] = tmp; + } + } + } + + delete fpq; + delete fp; +} + +template <typename TData> +__global__ void BwdTransPrismKernel(const size_t nm0, const size_t nm1, + const size_t nm2, const size_t nmTot, + const size_t nq0, const size_t nq1, + const size_t nq2, const size_t nelmt, + const bool correct, const TData *basis0, + const TData *basis1, const TData *basis2, + const TData *in, TData *out) +{ + size_t e = blockDim.x * blockIdx.x + threadIdx.x; + + if (e >= nelmt) + { + return; + } + + TData *fpq = new TData[nm0 * nm1]; + TData *fp = new TData[nm0]; + + const TData *inptr = in + (nmTot * e); + TData *outptr = out + (nq0 * nq1 * nq2 * e); + + size_t cnt_kji = 0; + for (size_t k = 0; k < nq2; ++k) + { + size_t mode_pqr = 0; + size_t mode_pr = 0; + size_t mode_pq = 0; + for (size_t p = 0; p < nm0; ++p) + { + for (size_t q = 0; q < nm1; ++q) + { + TData tmp = 0.0; + for (size_t r = 0; r < nm2 - p; ++r) + { + tmp += inptr[mode_pqr++] * basis2[(mode_pr + r) * nq2 + k]; + } + fpq[mode_pq++] = tmp; + } + mode_pr += nm2 - p; + } + + for (size_t j = 0; j < nq1; ++j) + { + size_t mode_pq = 0; + for (size_t p = 0; p < nm0; ++p) + { + TData tmp = fpq[mode_pq++] * basis1[j]; + for (size_t q = 1; q < nm1; ++q) + { + tmp += fpq[mode_pq++] * basis1[q * nq1 + j]; + } + fp[p] = tmp; + } + + for (size_t i = 0; i < nq0; ++i) + { + TData tmp = fp[0] * basis0[i]; + for (size_t p = 1; p < nm0; ++p) + { + tmp += fp[p] * basis0[p * nq0 + i]; + } + + if (correct) + { + for (size_t q = 0; q < nm1; ++q) + { + tmp += basis2[nq2 + k] * basis1[q * nq1 + j] * + basis0[nq0 + i] * inptr[q * nm2 + 1]; + } + } + outptr[cnt_kji++] = tmp; + } + } + } + + delete fpq; + delete fp; +} + +template <typename TData> // not working for nm2 > nm1 +__global__ void BwdTransPyrKernel(const size_t nm0, const size_t nm1, + const size_t nm2, const size_t nmTot, + const size_t nq0, const size_t nq1, + const size_t nq2, const size_t nelmt, + const bool correct, const TData *basis0, + const TData *basis1, const TData *basis2, + const TData *in, TData *out) +{ + size_t e = blockDim.x * blockIdx.x + threadIdx.x; + + if (e >= nelmt) + { + return; + } + + TData *fpq = new TData[nm0 * nm1]; + TData *fp = new TData[nm0]; + + const TData *inptr = in + (nmTot * e); + TData *outptr = out + (nq0 * nq1 * nq2 * e); + + size_t cnt_kji = 0; + for (int k = 0; k < nq2; ++k) + { + size_t mode_pqr = 0; + size_t mode_pq = 0; + for (size_t p = 0; p < nm0; ++p) + { + for (size_t q = 0; q < p; ++q) + { + TData tmp = 0.0; + for (size_t r = 0; r < nm2 - p; ++r) + { + tmp += basis2[mode_pqr * nq2 + k] * inptr[mode_pqr++]; + } + fpq[mode_pq++] = tmp; + } + + for (size_t q = p; q < nm1; ++q) + { + TData tmp = 0.0; + for (size_t r = 0; r < nm2 - q; ++r) + { + tmp += basis2[mode_pqr * nq2 + k] * inptr[mode_pqr++]; + } + fpq[mode_pq++] = tmp; + } + + // increment mode in case nm2>nm1 + for (size_t q = nm1; q < nm2 - p; ++q) + { + mode_pqr += nm2 - q; + } + } + + for (size_t j = 0; j < nq1; ++j) + { + size_t mode_pq = 0; + for (size_t p = 0; p < nm0; ++p) + { + TData tmp = fpq[mode_pq++] * basis1[j]; + for (int q = 1; q < nm1; ++q) + { + tmp += fpq[mode_pq++] * basis1[q * nq1 + j]; + } + fp[p] = tmp; + } + + for (size_t i = 0; i < nq0; ++i) + { + TData tmp = fp[0] * basis0[i]; + for (size_t p = 1; p < nm0; ++p) + { + tmp += fp[p] * basis0[p * nq0 + i]; + } + + if (correct) + { + // top vertex + TData tmp1 = basis0[i] * basis1[nq1 + j]; + tmp1 += basis0[nq0 + i] * basis1[j]; + tmp1 += basis0[nq0 + i] * basis1[nq1 + j]; + tmp1 *= basis2[nq2 + k]; + tmp += tmp1 * inptr[1]; + } + outptr[cnt_kji++] = tmp; + } + } + } + + delete fpq; + delete fp; +} + +template <typename TData> +__global__ void BwdTransSegKernel_QP(const size_t nm0, const size_t nq0, + const TData *basis0, const TData *in, + TData *out) +{ + size_t e = blockIdx.x; + + const TData *inptr = in + (nm0 * e); + TData *outptr = out + (nq0 * e); + + size_t i = threadIdx.x; + if (i < nq0) + { + TData tmp = inptr[0] * basis0[i]; + for (size_t p = 1; p < nm0; p++) + { + tmp += inptr[p] * basis0[p * nq0 + i]; + } + outptr[i] = tmp; + } +} + +template <typename TData> +__global__ void BwdTransQuadKernel_QP(const size_t nm0, const size_t nm1, + const size_t nq0, const size_t nq1, + const TData *basis0, const TData *basis1, + const TData *in, TData *wsp, TData *out) +{ + size_t e = blockIdx.x; + + const TData *inptr = in + (nm0 * nm1 * e); + TData *outptr = out + (nq0 * nq1 * e); + + size_t i, j, q; + i = threadIdx.x; + q = threadIdx.y; + if (i < nq0 && q < nm1) + { + size_t cnt_iq = nm1 * i + q; + size_t cnt_pq = nm0 * q; + TData tmp = inptr[cnt_pq++] * basis0[i]; + for (size_t p = 1; p < nm0; ++p) + { + tmp += inptr[cnt_pq++] * basis0[p * nq0 + i]; + } + wsp[cnt_iq] = tmp; + } + + __syncthreads(); + + i = threadIdx.x; + j = threadIdx.y; + if (i < nq0 && j < nq1) + { + size_t cnt_iq = nm1 * i; + size_t cnt_ij = nq0 * j + i; + TData tmp = wsp[cnt_iq++] * basis1[j]; + for (size_t q = 1; q < nm1; ++q) + { + tmp += wsp[cnt_iq++] * basis1[q * nq1 + j]; + } + outptr[cnt_ij] = tmp; + } +} + +template <typename TData> +__global__ void BwdTransTriKernel_QP(const size_t nm0, const size_t nm1, + const size_t nmTot, const size_t nq0, + const size_t nq1, const bool correct, + const TData *basis0, const TData *basis1, + const TData *in, TData *wsp, TData *out) +{ + size_t e = blockIdx.x; + + const TData *inptr = in + (nmTot * e); + TData *outptr = out + (nq0 * nq1 * e); + + size_t i, j, p; + p = threadIdx.x; + j = threadIdx.y; + if (p < nm0 && j < nq1) + { + size_t mode = (2 * nm1 - p + 1) * p / 2; + TData tmp = 0.0; + for (size_t q = 0; q < (nm1 - p); ++q) + { + tmp += basis1[mode * nq1 + j] * inptr[mode]; + mode++; + } + wsp[nm0 * j + p] = tmp; + } + + __syncthreads(); + + i = threadIdx.x; + j = threadIdx.y; + if (i < nq0 && j < nq1) + { + size_t cnt_ij = nq0 * j + i; + TData tmp = wsp[nm0 * j] * basis0[i]; + for (size_t p = 1; p < nm0; ++p) + { + tmp += wsp[nm0 * j + p] * basis0[p * nq0 + i]; + } + + if (correct) + { + tmp += inptr[1] * basis0[nq0 + i] * basis1[nq1 + j]; + } + + outptr[cnt_ij] = tmp; + } +} + +template <typename TData> +__global__ void BwdTransHexKernel_QP(const size_t nm0, const size_t nm1, + const size_t nm2, const size_t nq0, + const size_t nq1, const size_t nq2, + const TData *basis0, const TData *basis1, + const TData *basis2, const TData *in, + TData *wsp0, TData *wsp1, TData *out) +{ + size_t e = blockIdx.x; + + const TData *inptr = in + (nm0 * nm1 * nm2 * e); + TData *outptr = out + (nq0 * nq1 * nq2 * e); + + size_t i, j, k, q, r; + i = threadIdx.x; + q = threadIdx.y; + r = threadIdx.z; + if (i < nq0 && q < nm1 && r < nm2) + { + size_t cnt_rqp = nm1 * nm0 * r + nm0 * q; + size_t cnt_irq = nm1 * nm2 * i + nm1 * r + q; + TData tmp = inptr[cnt_rqp++] * basis0[i]; + for (size_t p = 1; p < nm0; ++p) + { + tmp += inptr[cnt_rqp++] * basis0[p * nq0 + i]; + } + wsp0[cnt_irq] = tmp; + } + + __syncthreads(); + + i = threadIdx.x; + j = threadIdx.y; + r = threadIdx.z; + if (i < nq0 && j < nq1 && r < nm2) + { + size_t cnt_irq = nm1 * nm2 * i + nm1 * r; + size_t cnt_jir = nq0 * nm2 * j + nm2 * i + r; + TData tmp = wsp0[cnt_irq++] * basis1[j]; + for (size_t q = 1; q < nm1; ++q) + { + tmp += wsp0[cnt_irq++] * basis1[q * nq1 + j]; + } + wsp1[cnt_jir] = tmp; + } + + __syncthreads(); + + i = threadIdx.x; + j = threadIdx.y; + k = threadIdx.z; + if (i < nq0 && j < nq1 && k < nq2) + { + size_t cnt_jir = nq0 * nm2 * j + nm2 * i; + size_t cnt_kji = nq0 * nq1 * k + nq0 * j + i; + TData tmp = wsp1[cnt_jir++] * basis2[k]; + for (size_t r = 1; r < nm2; ++r) + { + tmp += wsp1[cnt_jir++] * basis2[r * nq2 + k]; + } + outptr[cnt_kji] = tmp; + } +} + +template <typename TData> // not working for nm2 > nm1 +__global__ void BwdTransTetKernel_QP(const size_t nm0, const size_t nm1, + const size_t nm2, const size_t nmTot, + const size_t nq0, const size_t nq1, + const size_t nq2, const bool correct, + const TData *basis0, const TData *basis1, + const TData *basis2, const TData *in, + TData *fpq, TData *fp, TData *out) +{ + size_t nm01 = (2 * nm1 - nm0 + 1) * nm0 / 2; + size_t e = blockIdx.x; + + const TData *inptr = in + (nmTot * e); + TData *outptr = out + (nq0 * nq1 * nq2 * e); + + size_t i, j, k, p, q; + p = threadIdx.x; + q = threadIdx.y; + k = threadIdx.z; + if (p < nm0 && q < nm1 && k < nq2) + { + size_t cnt_pq = nm01 * k + (2 * nm1 - p + 1) * p / 2 + q; + size_t mode = (2 * (nm2 - p) - q + 1) * q / 2; + for (size_t n = 0; n < p; ++n) + { + mode += (nm2 - n + 1) * (nm2 - n) / 2; + } + + if (q < nm1 - p) + { + TData tmp = basis2[k + nq2 * mode] * inptr[mode++]; + for (size_t r = 1; r < nm2 - p - q; ++r) + { + tmp += basis2[k + nq2 * mode] * inptr[mode++]; + } + fpq[cnt_pq] = tmp; + } + } + + __syncthreads(); + + p = threadIdx.x; + j = threadIdx.y; + k = threadIdx.z; + if (p < nm0 && j < nq1 && k < nq2) + { + size_t cnt_pq = nm01 * k + (2 * nm1 - p + 1) * p / 2; + size_t mode = (2 * nm1 - p + 1) * p / 2; + size_t mode_p = nm0 * nq1 * k + nm0 * j + p; + TData tmp = fpq[cnt_pq++] * basis1[mode * nq1 + j]; + for (size_t q = 1; q < nm1 - p; ++q) + { + tmp += fpq[cnt_pq++] * basis1[(mode + q) * nq1 + j]; + } + fp[mode_p] = tmp; + } + + __syncthreads(); + + i = threadIdx.x; + j = threadIdx.y; + k = threadIdx.z; + if (i < nq0 && j < nq1 && k < nq2) + { + size_t cnt_kji = nq0 * nq1 * k + nq0 * j + i; + size_t mode_p = nm0 * nq1 * k + nm0 * j; + TData tmp = basis0[i] * fp[mode_p++]; + for (size_t p = 1; p < nm0; ++p) + { + tmp += basis0[p * nq0 + i] * fp[mode_p++]; + } + + if (correct) + { + // top vertex + TData tmp1 = basis0[i] * basis1[nq1 + j]; + tmp1 += basis0[nq0 + i] * basis1[j]; + tmp1 += basis0[nq0 + i] * basis1[nq1 + j]; + tmp1 *= basis2[nq2 + k]; + tmp += tmp1 * inptr[1]; + + // bottom vertex + tmp1 = basis0[nq0 + i] * basis1[nq1 + j]; + tmp1 *= basis2[k]; + tmp += tmp1 * inptr[nm2]; + + // singular edge + for (size_t r = 1; r < nm2 - 1; ++r) + { + tmp1 = basis1[nq1 + j] * basis0[nq0 + i]; + tmp1 *= basis2[(r + 1) * nq2 + k]; + tmp += tmp1 * inptr[nm2 + r]; + } + } + + outptr[cnt_kji] = tmp; + } +} + +template <typename TData> +__global__ void BwdTransPrismKernel_QP(const size_t nm0, const size_t nm1, + const size_t nm2, const size_t nmTot, + const size_t nq0, const size_t nq1, + const size_t nq2, const bool correct, + const TData *basis0, const TData *basis1, + const TData *basis2, const TData *in, + TData *fpq, TData *fp, TData *out) +{ + size_t e = blockIdx.x; + + const TData *inptr = in + (nmTot * e); + TData *outptr = out + (nq0 * nq1 * nq2 * e); + + size_t i, j, k, p, q; + p = threadIdx.x; + q = threadIdx.y; + k = threadIdx.z; + if (p < nm0 && q < nm1 && k < nq2) + { + size_t mode_pr = (2 * nm2 - p + 1) * p / 2; + size_t mode_pqr = mode_pr * nm1 + (nm2 - p) * q; + size_t mode_pq = nm0 * nm1 * k + nm1 * p + q; + TData tmp = 0.0; + for (size_t r = 0; r < nm2 - p; ++r) + { + tmp += inptr[mode_pqr++] * basis2[(mode_pr + r) * nq2 + k]; + } + fpq[mode_pq] = tmp; + } + + __syncthreads(); + + p = threadIdx.x; + j = threadIdx.y; + k = threadIdx.z; + if (p < nm0 && j < nq1 && k < nq2) + { + size_t mode_pq = nm0 * nm1 * k + nm1 * p; + size_t mode_p = nm0 * nq1 * k + nm0 * j + p; + TData tmp = fpq[mode_pq++] * basis1[j]; + for (int q = 1; q < nm1; ++q) + { + tmp += fpq[mode_pq++] * basis1[q * nq1 + j]; + } + fp[mode_p] = tmp; + } + + __syncthreads(); + + i = threadIdx.x; + j = threadIdx.y; + k = threadIdx.z; + if (i < nq0 && j < nq1 && k < nq2) + { + size_t cnt_kji = nq0 * nq1 * k + nq0 * j + i; + size_t mode_p = nm0 * nq1 * k + nm0 * j; + TData tmp = fp[mode_p++] * basis0[i]; + for (int p = 1; p < nm0; ++p) + { + tmp += fp[mode_p++] * basis0[p * nq0 + i]; + } + + if (correct) + { + for (int q = 0; q < nm1; ++q) + { + tmp += basis2[nq2 + k] * basis1[q * nq1 + j] * basis0[nq0 + i] * + inptr[q * nm2 + 1]; + } + } + outptr[cnt_kji] = tmp; + } +} + +template <typename TData> // not working for nm2 > nm1 +__global__ void BwdTransPyrKernel_QP(const size_t nm0, const size_t nm1, + const size_t nm2, const size_t nmTot, + const size_t nq0, const size_t nq1, + const size_t nq2, const bool correct, + const TData *basis0, const TData *basis1, + const TData *basis2, const TData *in, + TData *fpq, TData *fp, TData *out) +{ + size_t e = blockIdx.x; + + const TData *inptr = in + (nmTot * e); + TData *outptr = out + (nq0 * nq1 * nq2 * e); + + size_t i, j, k, p, q; + p = threadIdx.x; + q = threadIdx.y; + k = threadIdx.z; + if (p < nm0 && q < nm1 && k < nq2) + { + size_t mode_pq = nm0 * nm1 * k + nm1 * p + q; + size_t mode_tmp = 0; + for (size_t n = 0; n < p; ++n) + { + mode_tmp += n * (nm2 - n); + mode_tmp += ((2 * nm2 - nm1 - n + 1) * (nm1 - n)) / 2; + if (nm2 > nm1 && nm2 - nm1 > n) + { + mode_tmp += (((nm2 - nm1) + (n + 1)) * (nm2 - nm1 - n)) / 2; + } + } + + if (q < p) + { + size_t mode_pqr = mode_tmp + q * (nm2 - p); + TData tmp = 0.0; + for (size_t r = 0; r < nm2 - p; ++r) + { + tmp += basis2[mode_pqr * nq2 + k] * inptr[mode_pqr++]; + } + fpq[mode_pq] = tmp; + } + else if (q < nm1) + { + size_t mode_pqr = mode_tmp + p * (nm2 - p); + mode_pqr += ((2 * (nm2 - p) - (q - p) + 1) * (q - p)) / 2; + TData tmp = 0.0; + for (size_t r = 0; r < nm2 - q; ++r) + { + tmp += basis2[mode_pqr * nq2 + k] * inptr[mode_pqr++]; + } + fpq[mode_pq] = tmp; + } + } + + __syncthreads(); + + p = threadIdx.x; + j = threadIdx.y; + k = threadIdx.z; + if (p < nm0 && j < nq1 && k < nq2) + { + size_t mode_p = nm0 * nq1 * k + nm0 * j + p; + size_t mode_pq = nm0 * nm1 * k + nm1 * p; + TData tmp = fpq[mode_pq++] * basis1[j]; + for (size_t q = 1; q < nm1; ++q) + { + tmp += fpq[mode_pq++] * basis1[q * nq1 + j]; + } + fp[mode_p] = tmp; + } + + __syncthreads(); + + i = threadIdx.x; + j = threadIdx.y; + k = threadIdx.z; + if (i < nq0 && j < nq1 && k < nq2) + { + size_t cnt_kji = nq0 * nq1 * k + nq0 * j + i; + size_t mode_p = nm0 * nq1 * k + nm0 * j; + TData tmp = fp[mode_p++] * basis0[i]; + for (size_t p = 1; p < nm0; ++p) + { + tmp += fp[mode_p++] * basis0[p * nq0 + i]; + } + + if (correct) + { + // top vertex + TData tmp1 = basis0[i] * basis1[nq1 + j]; + tmp1 += basis0[nq0 + i] * basis1[j]; + tmp1 += basis0[nq0 + i] * basis1[nq1 + j]; + tmp1 *= basis2[nq2 + k]; + tmp += tmp1 * inptr[1]; + } + outptr[cnt_kji] = tmp; + } +} + +} // namespace Nektar::Operators::detail diff --git a/Operators/BwdTrans/BwdTransStdMat.hpp b/Operators/BwdTrans/BwdTransStdMat.hpp index b0a71f1439d48f488c128125c48f0a49d9e2e91e..d41205c55a679c45a3c492b75fbe5ceeb16fe58e 100644 --- a/Operators/BwdTrans/BwdTransStdMat.hpp +++ b/Operators/BwdTrans/BwdTransStdMat.hpp @@ -4,7 +4,7 @@ namespace Nektar::Operators::detail { -// sum-factorisation implementation +// standard matrix implementation template <typename TData> class OperatorBwdTransImpl<TData, ImplStdMat> : public OperatorBwdTrans<TData> { @@ -17,29 +17,34 @@ public: void apply(Field<TData, FieldState::Coeff> &in, Field<TData, FieldState::Phys> &out) override { + // Initialize pointers. auto const *inptr = in.GetStorage().GetCPUPtr(); auto *outptr = out.GetStorage().GetCPUPtr(); - size_t exp_idx = 0; + size_t expIdx = 0; for (size_t block_idx = 0; block_idx < in.GetBlocks().size(); ++block_idx) { - auto const expPtr = this->m_expansionList->GetExp(exp_idx); + // Determine shape and type of the element. + auto const expPtr = this->m_expansionList->GetExp(expIdx); + auto nElmts = in.GetBlocks()[block_idx].num_elements; + auto nmTot = expPtr->GetNcoeffs(); + auto nqTot = expPtr->GetTotPoints(); + // Get BwdTrans matrix. Nektar::StdRegions::StdMatrixKey key( StdRegions::eBwdTrans, expPtr->DetShapeType(), *expPtr); auto const matPtr = expPtr->GetStdMatrix(key); - auto const &block = in.GetBlocks()[block_idx]; + // Perform matrix-matrix multiply. + Blas::Dgemm('N', 'N', nqTot, nElmts, nmTot, 1.0, + matPtr->GetRawPtr(), nqTot, inptr, nmTot, 0.0, outptr, + nqTot); - Blas::Dgemm('N', 'N', matPtr->GetRows(), block.num_elements, - matPtr->GetColumns(), 1.0, matPtr->GetRawPtr(), - matPtr->GetRows(), inptr, block.num_pts, 0.0, outptr, - expPtr->GetTotPoints()); - - inptr += block.block_size; - outptr += expPtr->GetTotPoints() * block.num_elements; - exp_idx += block.num_elements; + // Increment pointer and index for next element type. + inptr += nmTot * nElmts; + outptr += nqTot * nElmts; + expIdx += nElmts; } } diff --git a/Operators/Operator.hpp b/Operators/Operator.hpp index 58c70e732d23ccb6d8fa7a2da85afd006f3f72ab..6e887c3a6667fa75a4650976b40c0c3295eb0b61 100644 --- a/Operators/Operator.hpp +++ b/Operators/Operator.hpp @@ -1,12 +1,9 @@ #pragma once -#include "Field.hpp" - #include <string> #include <LibUtilities/BasicUtils/NekFactory.hpp> #include <MultiRegions/ExpList.h> -// #include <StdRegions/StdExpansion.h> namespace Nektar::Operators { diff --git a/Operators/OperatorHelper.cuh b/Operators/OperatorHelper.cuh new file mode 100644 index 0000000000000000000000000000000000000000..39cc7539c5dcdcbeda97c7c9f670f7f09682bfae --- /dev/null +++ b/Operators/OperatorHelper.cuh @@ -0,0 +1,54 @@ +#ifdef NEKTAR_USE_CUDA +#include "MemoryRegionCUDA.hpp" +#endif +#include <MultiRegions/ExpList.h> + +namespace Nektar::Operators +{ + +template <typename TData> +using BasisMap = + std::map<std::vector<LibUtilities::BasisKey>, std::vector<TData *>>; + +template <typename TData> +BasisMap<TData> GetBasisDataCUDA( + const MultiRegions::ExpListSharedPtr &expansionList) +{ + // Initialize data map. + BasisMap<TData> basis; + + // Initialize basiskey. + std::vector<LibUtilities::BasisKey> basisKeys(3, + LibUtilities::NullBasisKey); + + // Loop over the elements of expansionList. + size_t nDim = expansionList->GetShapeDimension(); + for (size_t i = 0; i < expansionList->GetNumElmts(); ++i) + { + auto const expPtr = expansionList->GetExp(i); + + // Fetch basiskeys of current element. + for (size_t d = 0; d < nDim; d++) + { + basisKeys[d] = expPtr->GetBasis(d)->GetBasisKey(); + } + + // Copy data to basis, if necessary. + if (basis.find(basisKeys) == basis.end()) + { + basis[basisKeys] = std::vector<TData *>(nDim, 0); + for (size_t d = 0; d < expPtr->GetShapeDimension(); d++) + { + auto ndata = expPtr->GetBasis(d)->GetBdata().size(); + auto hostPtr = expPtr->GetBasis(d)->GetBdata().get(); + auto &devicePtr = basis[basisKeys][d]; + cudaMalloc((void **)&devicePtr, sizeof(TData) * ndata); + cudaMemcpy(devicePtr, hostPtr, sizeof(TData) * ndata, + cudaMemcpyHostToDevice); + } + } + } + return basis; +} + +} // namespace Nektar::Operators diff --git a/cube_all_elements.xml b/cube_all_elements.xml new file mode 100644 index 0000000000000000000000000000000000000000..fe77d2d679cb104fd864ce15ced3fea790615829 --- /dev/null +++ b/cube_all_elements.xml @@ -0,0 +1,37 @@ +<?xml version="1.0" encoding="utf-8" ?> +<NEKTAR> + <EXPANSIONS> + <E COMPOSITE="C[0]" NUMMODES="4,5,5" FIELDS="u" BASISTYPE="Modified_A,Modified_B,Modified_C" NUMPOINTS="5,6,6" POINTSTYPE="GaussLobattoLegendre,GaussRadauMAlpha1Beta0,GaussRadauMAlpha2Beta0"/> + <E COMPOSITE="C[7]" NUMMODES="4,5,6" FIELDS="u" BASISTYPE="Modified_A,Modified_A,Modified_A" NUMPOINTS="5,6,7" POINTSTYPE="GaussLobattoLegendre,GaussLobattoLegendre,GaussLobattoLegendre"/> + <E COMPOSITE="C[8]" NUMMODES="4,5,6" FIELDS="u" BASISTYPE="Modified_A,Modified_A,Modified_B" NUMPOINTS="5,6,7" POINTSTYPE="GaussLobattoLegendre,GaussLobattoLegendre,GaussRadauMAlpha1Beta0"/> + <E COMPOSITE="C[9]" NUMMODES="4,5,5" FIELDS="u" BASISTYPE="Modified_A,Modified_A,ModifiedPyr_C" NUMPOINTS="5,6,6" POINTSTYPE="GaussLobattoLegendre,GaussLobattoLegendre,GaussRadauMAlpha2Beta0"/> + </EXPANSIONS> + + <GEOMETRY DIM="3" SPACE="3"> + <VERTEX COMPRESSED="B64Z-LittleEndian" BITSIZE="64">eJx9lTlTFFEUhdt9A9cRcQM3BMUVwV2vOyq4gDSiAu6oiEtiomaWEQGRf2ICMquMrfIfQELiaNVLMCExMLN0+syrOfPOe8mr11+fc8+93dOTJMVV8/lPT5pOWnZM6j/+6vpSN2mzkvACny304+0/3ubGv9uc7Dzxqao3Tb8Z9qH/a8rmCn/o59F11EH9+YJDvyDan7OFIt+73D+Hn7Yo2r+zxdF8zpbQdeZVpH/T8CFNU1fqr5ryFU+F0vyWCj36Xyb06G95tL6zFZSf+crgdGZK+VeRnnlO6JF/NeUfnXqfz+enS/3XCH/o10TzOasV/pjP2mh+Z+tobqiA/tYLjnwbynIVKn4fG6P+zuqEHvnrSc/v36agv38+m8mf579F6NHfVqFHvm3R+s4aiPP7t73Mf6Zi/o2CI19TdkYunv8O0vP3Z6fg8G+O5nO2S9THfHZH6zvbQ5y/H3sFR759VL948t+X/VF/Zy1Cj/wHgv3796tVcORrI39+/w5G/Z0dEnrkO1ymL1TkP5LE19Fs5/8XrGPE+b7jdD/zE0KP+Z0kzt8JDsT8lNDD/3Qwl3+PzwSn4vlZURfrHHHu/7zgyHeBONdpFxz6i8F8Pv8l0RfWZeKcv4M45+sM+vs6V7Kdnz9+H1fFfMCvif7Br5M/8y7qF/eNff3d1D82bd08kGyB3xD5wXuEP+aQCn/wXuEPfjPo6/P1CQ79LaqI+UB/m/TM7wg9/PuFHnxAzAX+g2Iu4HeFHvs9ocd+n3Lz83sgOPQPg337+T8SHPrHxHm+Q4JD/yTo7+fzVHDszygP9z8sOM7Pg3P1/Y8IjvMLOnP/LwXH+VXQ3/f/WnDsfwFYCm9Q</VERTEX> + <EDGE COMPRESSED="B64Z-LittleEndian" BITSIZE="64">eJx1mmWQFkcURRk0sNjiIcgii0MguC9OcHcL7iS4u7sEjbu7u7u7u7u7/nlnq+ZUwZ+ue+b2nf56emem35AnT/pfcpw2r9o80vnE0fmVAy9wnP4F5ad/IfnhJ0iTU1g5nK+INP0yxMkpKs74iqk/vLg0OSXk5/eUjDaveKb8/M5S8sFLKwdeRn7mpazy4eWkySkfbUEdryCO/0T5mPeK8sNPkianknK4TpWjLSReRZqcquKcJytari/Xu5o0460uP+ughvzwmuoPz5afdVNLfnhtaXLqiPO76oqz/urJB6+v88IbyM96bah8eCNpck5WDuu7cbQZ4k3EyTlFPnhTcc7fLFr+bvg7aS7N30kL+eEtxfn7aaUceGv1h7eRn7+3tvLD20mT0z5a1g/j6iDN9ekoPzk58sE7iTOuztLcF7ponPCu0twvuikH3l058B7K4f7SU374qdLk9NJ5WSe9pfH3iba4eF/5GVc/cc7fP9pM8QHi5A+Mlvsl/wZJc98cLD98iDj306HKgQ9Tf/hwcZ4zI6TxjxTnfj1KOfDR8nMfHyMffKxy4OPk5/kwXhr/BHGeE6cpBz5RmpxJ4ty3JkvjnxIt94GsaKdKs66nyc/zabr88BnqD58pP8+zWfLDZ0uTM0ec9TxXnOfiPPngp+u88DPk5zk6X/nwBdLkLIw2U8cXRZstvlicfkukyV8qP3xZtDzP+ftbLs1zfYX88JXiPO9XKQe+Wv3ha8T5u1krjX9dtLxHcP710rxPbJAfvlGc/E3ywTeLM94t0ryvbNX44dukeY/Zrhz4DuXAdyqH+8Euafy7lc970p5o64nvlSZnn/zw/fIzrgMaD8fPlOb+dFCa/EPR8h7HdTsszfvcEfnhR8V5zzumHPhZ6g8/W5z5OEca/7nivEeepxz4+fLzfnmBfPALlQO/SH7m+2Jp/JdEy/zT71Jx/JeJc7+5XP3hV0iTc2W0vC9zfa6S5r35avnh14jzPn2tcuDXqT/8enHm9QZp/DdGm6Xz3yTN8+Fm+cm5RT74reKM6zZp9gm3a5zwO6TZP9ypHPhdyoHfrRyu8z3S+O9VPs+1+6Tx3x9tdfEH5Of8D0abLf6QODkPR5uh449Ey3pmH/Wo/PDH1B/+eLSZ4k9Eyz6MfdeT0uy7npJm3/W0NHnPSFNnelb5/HtO+ezfnlc+dYUX1B//i9L4X1I++8CXpdkHviLN+V+VPyfa1+RnH/m6xoP/DWn8byqffeZb0uwz35bm/O/Iz3vxu/KzD35P48H/frRcL/axH0izj/1Qmvn+SH7eyz6Wn33vJ/LBP5VmXj9TDvvkz6WZ1y/E2T9/qXz4V9I50X6tHPbb30hzHb4Vz4r2O+XDv4+WeWPf/oM0+/YfpZmPn+Tn/fRn+Xnv+EWa+fhVfuoCv0kzH7+L89z7Q5rf/af81Bf+kmb+/hbn+fZPtKwP6hH/RsvvoR7xn8ZBPYIbV05I6hFJks4lJ2+S1uTkS9KanPxJWpNXIDj1DOodBcWp+xZK0r7cOnFw6h8cL6wcjheRn+dMRnDqItRNioqTVyxJ+zhv8eDUUTheQjkcLyk/48rU+Km/lErSmrpFaXFyyui81GvKJmlNv3LijKu85p/6ToUkrXPrXsFHyV9RObn1IPl5nlfSPFMnqpykNflVgo+Rv6pyOJ4lP+evpnUCr67rRb8awalLUYeqmaQ1dcJs+alz1NJ8cry2OHl1glPfmhBt3SSt8deTn/z6mk+ONxAnr6GuFzmNNJ/knBycuhrz2ljrluNN5M/dDwWnDsd1aKp1y/Fm8pPXXOelftciSWv20y3FyWml81Lva52kNf3aiDOutsGpCzKv7bQOOd5eft6LOwSnjsh16Kh1y/Ec+cnrpPmHd9Z80q9LcOqUrIeu4uwnumn9cLx7cOqarJ8e4vh7ar1x/FTND/5e+r34e+v6Uj/tk6Q1+/i+4sxTP11f6q39k7Sm3wBx5nWgxkN9dlCS1tR/BouTP0TjoZ47NElr+g0TZ1zDtR6o/45I0pr96Ejx3HWt9UO9eHSS1vQbI851G6v5xz9O80m/8cGpQ7MeJoiznz1N64fjE4NTt2b9TBLHP1nrjeNTND/4p+r34p+m9zG+k0wX5/8bzAjeXnxmcL7z0m9W8Azx2crne/occb5Hz9V54fP0Hgg/Xe+f8DOC892NfvN1Xr7/LgieKf9CnRf/InG+ey7WeOBLgvOdgn5LlcP3u2V6T6auvFyc72UrgncTX6nfC1+leYavDs7zkvOs0Xn5DrVW+fB1wXmfrxbt+uA9xDdo3uAbNT/wTcGpO3GezcGzxbdoPHxn2ap8+Db9Xuq728X57rBD8w/fKT918l3yU3/eLT98j8ZPnW+vOP59mmf4fuVT1z2gHPiZyqHOeVB++CFx6niHdV7qbEe0fuBHlQM/pusF/x9lNlD9</EDGE> + <FACE> + <T COMPRESSED="B64Z-LittleEndian" BITSIZE="64">eJyFmVm4jlUYhrcQoqJIEUqGRMKuZChzbaESGZvIkLYUTUKZsiOKohBKSlEhQoMGjdIg6WqejuqgDjrqoIOGk+c+2Pd1fdfeJ8/1vu/zPOtd61/ft/9/rZKSyn/Vgkcp5q9GsHoBr2YVvJrKwzs6WKvAj3rtKnQ1xSNfJ3hMgR/1uvK3rrZ4xwjrBY8Vj/xxBfljVa+t/PHB+sFa4lM/IdigQHeiePaj3kDYUH6N5NNQ8Unyg99A9YaqM9/GBfmTC3waq+78KYqbiE8dXVPlzW+qPPM4NdhMPPLNNY751E9Qnj5aFPi1UNxU+dOCpwfrKQ+eEWwpPtiqgFdP9ZbitQ62KfCj3rYKXSvxyJ8ZbKJ8W9XPCrYr0LUv4LVRvZ14HeR3tvgdxO+oPPx2qpPnczwn2Ek86vh3Fh/sIl4nIfXOQvosLfArVdxR+XOD5wXPVx68INhVfLCbeOcLqXcVou8e7CEf8j2V7yZ+T+XRXRi8KMi+7aE6/fSSH7re4qFrrTp6dMy/j/K9lEffTXFf9dtPOuoDgv0LdBeL109Ivb/wEvHK5HOJ+APlV6aYOrpLgzxPzKtM9UEaFx90g8VD10f1AeINEb+v+EPE76/8ZcHLg6XKg/Q7SHzwCvGISxWjHxpkHRmHvq4Uj3iYkPpVweFCdCPEA4epPlwx3y9Hym+k6qPkO0q8EcqPFo4Rf4zqVwfHCuFdIx44UvWxiq/VOPWVB/G7Tgh/rPLXS088Tkh+vPTjxSOmjxuCzRTjN0F1YvwnSjdevEniEfNenqz6jcoTo5uiPHF3xehvEpbLr1z1qfKfKt4U5W8Wlok/TfVblCdGd6vyxOWK0U8XMs4MIXV8bxPCm6b87cI7gncqpo7urgLezCp4M5Unpn/G4X15t2J0zHOWeMSzhdTvCc4RortXPHC26nMUo+c9OVdIfp5854lHjO984YLgQsXU6eu+At6iKniLlCdm/hXyqxDCp//7hcxzsZD6A8ElQnRLxQMXq75E8TLxxigPontQCH+p8g8JlwcrFFNfob5XiPew8sQVitE/ImQc1nOl6vgyz1XBZQX8R5VHz/wfU53/Q/iuVh4/9GvEp844azUOfugeV50Y3Trp1om3Xn7U0W8ILhQfvydUh4//k9KtE2+jfMx/SnVidJuk2yTe0/Kjjv4Z8RiH/bhZdXzRPSu+fZ+Tns8Z3y3K44d+q/jU6eN51RkP3xeUxw+fF8Wnjv82jYMfuu2qE6PbId0O8V6SH3X0O6Vnn6LbpTx8/F8Wnzrj7JYPfL4v7VGdvtDtlW6PeK8U+KF/Ncj87fea8ujp/3WNwzzx3ac6fHzfkG6feG8W+KF/Sz7weB7fVh0dvvvF36f6OxqH73Gsx7uqMx79vyfdfvHelx919B8U9MF6fqg6vszzQJD585zR90fK0y/zPyg+dfw/1vjw+D76iWL09PlpkN83rAfvi89UZ76syyHpDoj3ufyooz8c5PyP78n4faE664X/EekOiFc9Fzk1g3WCtYO1gnWDxwePU75B8ET5wGsYPEk+6BoHmwRPCTYKnhpsEWwePFn9ni6fJuqjpXyaBc8Itg22kd+ZwQ7B9sFW4tP32cq3V98d5cu6dQqeIx3rVhrsIt55wa7ygXdBsLt8ztU8ewZ7yI916xW8SH700Uc+PdVHX/mA/bRuA8S/WOtWFuwvPn0PlK5MfV8q3qDgEM17cPCy4BWa9+XBocFh8rsyODw4Qn5XBUcGR2sdRwXHBK/WuowNXiOfIcpfKx/mcZ2Q9RgXHC9k3SYEJwrHaR6Tg5OkYx5Tgv+UVI5vDN4U5D1XXbxyjcP9Mvqbg0ek4353msZBx73tdI0zlec49Rkan/tUeLfLnzz3lHdo/tPU953qv6V4d8mf+zf0M4Nc2N8avCV4d/Bb+cKbFayhvtDPCX5XUlnH/eA9iadLx73fXK0v68m91jzNj/s6eAuCt2m9uedaqP5niLdI/ZHnnq9C68fnwed2v+bPfQ68xfLvXVJZ/0DwF+m431qqzxkd91IPav583twTPKTxuc+At0L+5LmneSTxv4lna1+sDP6a+nDxVmmfcX6P/lHtf/YL++qx4G/yhbc6WE19oV+j9wc67hfW6vlFx/3C+sT3BtmvnHNv0Py4D4D3pPY3+5nz8Y3qf654T6k/8twTbEr8X2Keh/nBp4N/pM7zAO8Z7W/q9wU3B6srj/9zwT81PufnW/ScoeP8fKv2P+f29PG89ifn7/SxLchzyvPIufF29b9YvB16fqhzjrxT/aPjnHyX+uN8Hp/den54nnnu9wT/Yl7i7Q0uoY+SyvpX9Pyh4/z4Ve1PdPNS36f9vYznMvU3ND7ni/DelD95zkHf0vx4X3H+9bbWhzrvrf0anzznY+9q/z+s99p7wb9TXyve+8HlQc7f0H8Q/F06+vwweJTGpc+P1D/v043xO6j1p8750yfa37xvOYf6VP1zfgfvkPpbqXE+1/rwvuZ86LDWlzrv7S80PnnOkY7o/cC5Dj5f6vni/Aafr+TP/wPOQb7W+lDnd/w38t9fUpn3rdaf/yf8/v4uMe/nNeJ9HyxXnd/fP6g//t/wO/xHfT6rxftJ76fH2VfR/6z321rxfgnWUJ4+/weSxzHH</T> + <Q COMPRESSED="B64Z-LittleEndian" BITSIZE="64">eJx1mGtsnnMYxt+nHTNaihWdjTqsbDbbqNHNMEwddjQzNptVDTEJc8wiIY1DIlmyOGQRhA/ikPFhIZEQXyYSMea0GRvDaHVrp9gwzOmD6yd5ftH3y5X7vq/7vq7/4e37PF1f+ffzdnBt8J3gu8EN4r1XKX/WBT/WnPeDH6pvo3TgfRT8IPiJfJAfJN1P5Qc95sHfJN31QdaH/82q4xcd/H6mfnjo4P9zzSffGGRdW+QbvU3ifyFd/OIHvS9V36I8fr9SP7xm6W3VfPKTgqzra/lGr1n8b6TPfPq5R53i0TcryL3q0pxO5en7VjqdynO+3dLpEo97tk3zupXnHm1XnbhHPoi3idcb3KgYPz3Kc392aB7xd8HNiqn3Bb+Xjz7xiRcHuT/09SqGx734QfhjcGdwq/LwdgWXBTnfneL/FOxQ3y7x4f0c7FQenQ7xuhQz75fg7mC38uCvwZXS2y2E91twu/LorxSvRzHzfg/uCe5QHvwjuEp6e4TwXgr2KY/+KvG4F38Guc9/BYuiXCfPuv4OVhXlOvlexdUFg8v43+9I4gHSxQeIXrX45PcK7h1syJyB4lMfmvq+iQdpDn1HireP5uwXrFF/rfg10qkLHqC8++DtX5R9HBg8KDi4UvZVo/owreNg6Q0O1mt+jXzWqx9/hyg+VPs4KvqHyRf1MakfnniIdOg7WbwG+R+qdeFjmHzWSqeR8+6nr0G8I5RnX8fK31FFOa5TnnlHy/8xwWODrZk7XPrUp6XOPh2n9dA3Xbwm+Tg+OEI+RsrnCOmwT6P66WsS7wT5AmfEH/s1Wjojlcf3ifLF/VqUeeM4H83h3NrFG6P11Ku/mfsoP3Xqh3eSfJ4SHB9cWCn7Hqt6m/ydWpTj04Itml8vny3qx98E9bEvb0Z3UuLT5Ytzul28ifJVp/6zgmfKV6P64Z2hPOt+TrqTi3LcrDzzztbcc3QPl2buueojvjV11j9FOufpnjG/tZ94itZ/vvLo3qb1XqA5rcrj+0LNvSg4Nfhw5o4Xn/ojqbPv0zRnOt9f5VvEn6GY85mpObOCFweXy99M1R+qlP3O1pxLgnOUbxF/jmL8Xqo5c4OXBZ+J/jjxqT+fOud2uebMC85Xvln8+Yo53ys0Z4H0n47+VPEXau4CrZv6lbqHazJvUfH/+IbW2ya8SveV+e39YJvWe7VwsfRfjz7rv0bYrj7Wf12Q51DeO6uC16Z+Q7Amef6PQN/1qd9YlPsGpl4r3k3BJeLXhcfz2y3JD1E/fTyv3pl4eGL+P8Dz9tLgXUGej5nXVCnzOoqy/gDp8txzb+LR6qeP57cHEk9IzPsp/u7ge6+YeZNDnxhcwd+3SpmPLs8VDybmvYX3ydnBZak/VpTjualzLryXPK59hD8v/Lbgk9qvQn08fzzLOSTmfY33qftSf6Eox9XapyXhv5j4fvHYF363Vye+WTr08bvzsvaP9+B7gpzrq/Qnf3eQ/YH3mvaB/Irw+d1Zkzz7wPvto0HO/63gE/LDeyK8tVof+afSx9/rdfJXpT6ePzYkXi0/7OMryf8DbeJGigAA</Q> + </FACE> + <ELEMENT> + <A COMPRESSED="B64Z-LittleEndian" BITSIZE="64">eJxd1edfj1EcxvGMZO89M1JRVpQtMysS2RlZGZGdRNmbUPbIXmUnUvaO0F/kyed64Po9eb9e97l+55z73Od8T0DA/79qWB1r2POaGIi1LC+DsDbWsf+p37rWXg/r23gNsCE2wsbWbxNsavlmNt/m9rwFtrT5t8LW1t4G9X5tbdx22B71fh2sv46W13p0wmDsjF1Q69QVu2GI5bWO3TEUwzActb49sKflI1DrGYm9sDf2QX2fvtZfP8vr+0VhfxyA0ajvGmPtA3EQav0H4xAcisNQ3324jTPC8tonsTgSR+Fo1P4Zg2OtfRxq/8TheJyAE1H7bhJOtrzmoX0Uj1NwKiag9us0m5/y6l/7OBGn4wxMQu3XmTgLZ1te+38OzsV5OB91LpJxgeUXos7LIsstxhTU+VmCS3GZ5YNxOa7AVFyJOk+rcDWusbzOZxquxXWYjjq363GD5TeizvUm3IxbcCvq3GdYf8pvQ9WBTNyOWbgDde53Wns25qDqyC7cjXtwL6rO7LNx9lte9ecAHsRDeBhVT47gUWs/hqpbx/EE5uJJVD07hactr3mo3uXZuPn2P9W3M9Z+Fs+h6uV5vIAX8RKqjl7GK3jV8qqvBXgNr+MNVL29ibcsfxtVj+9Y7i7eQ9Xj+/b8ARai6mwRPsRH+BhV559Y+1N8hqr/z7EYX2AJ6n54aeO8srzujVJ8jWVYjrpX3uBba3+Hukfe4wf8iJ9Q99Jn/GJ5zUP31Vcb9xt+R91jP6w/tVdgLP7EX1iJv1H3zR/8i1WW/wfhWnRd</A> + <P COMPRESSED="B64Z-LittleEndian" BITSIZE="64">eJxdlUlsjVEUx++LRCQkbCwkIiIkJCIiFkREYiFhQyJCQiIRYip9aKsofU8HqsNr6Tyjg+o8oRSlRWmrWlPRkthY2lhYmBb9/SX3vM0v/3Pu/5zz3ffd+wXd5O9AYJIH4SE4hfxh9DF0FHo6+ojxH4XHyY9Bwv/ryzeN+AkYHfD7TiU+w/n1Y9BBzae+cCb5uTBWfcjPQccY/zzicWaeWc6vL99s4ieh9icWxsEFWoeOR2tj4uEp4z+tOMvPoBeiE4z/rHzkJ/Q8pr5881UXnjN9Ne8i59dPQGs/E2EILia/TD4YJn8eJhn/cs0Bk4kvRacY/wriiWb+kKkv3xLWhaD2MxVegGvJX0SH0WnoVehLxp+u+ch/dX5drV9JfA1MghmmjvqvI78aJovkM2EW3Eh+C0yBEfIb0NnGv5l4qplHdbV+PflNej6YQ/4yvAK3kc/Vc6F/wzzi+cZfANNYp/tqO7ow4PuLtM/kv8Gtzp9HfeRPJ18c8Ofc4fy6mieDeAm6FJbBneT3wExYTr4CVhr/PtZlQd1vu9FXjX8v8YiZf5fz51Ef+bPJ/4XXiF+HVbAa5rCuRvczujbg+2/o/yOfZurV6XtB/qbeE/R358+h+vud789F15u+mld1o1iXBxuIN8Im2Kz7jnX5sIW4vk+txq/vRAGMmHptMJq87vFCM7/mUP2g8/1F6HZ0B+yEug9uoYvROo+6H28b/x1YQv4H1PdA95j6yF9KvEv3Hzrk/Lqap4z4XfQ92G3eR+1LOdT50vm5b/za9wozj+4LvbfqI38lce2n+j1AP4Q9OjfkH6Efw16zH306D+g/zq/3xPif6tyx7qeZQ/9jr/FXEX+m+8D586qu5qkmr/2vQfejn+scOT//gvgAHDTP1wBr4ZCp99L4653fR/P3m30YNP464tr/NvQw+hUc0fnXc5j8qN5j8q/NXL9M3SZ0p/P9eu436Hbn91fdLuKNUPvZg36Lfqd9cH7fFpN/r/eO/ABsNvOobjfxPuf7W9D6Ho3BD/AjHGJdK9T/+wmOG/8w6/T/6L6dgJ+Nf5R12r+w6f/F+EdY1wEDZu5x00f+f9N28hAA</P> + <R COMPRESSED="B64Z-LittleEndian" BITSIZE="64">eJxdlEdOQ0EQBT8552xyjiYHk3M2JmMwycbGZNiw4CCIKyBxESTEgiOxcL3FbzalUvdjPNMz3+8k/pKTEkyBqbCWehq+hafjGTDT5LNggP5sPAfmQo/jXm8bz8PzYYHJa70d+gvxIlgMB6iX4Lt4KV4Gy02+Au7Rr/1Xwiroddzr7ePavwdWm7zWO6C/RucN68zvrccP8Qa8ETaZfDM8MvkWqPPQPFvxoOPOt5nzUL4dHtOveXbAThil3oWf4JpnN+wxeS8M0d+L98F+GHbc653iA/ggHDJ5rXdGv+Y5DEfgH/VR/BzXPMegz+TH4QX92v8EnIS/jnu9S1z7n4LTJu8z+5/BZ6Hm+UZ9Do/g8/iCmafyi/DK5JegzuOV+rKZt/Ir5jyUX4Ux+tfwdbgBP6hr3tf4Ju7XPTZ5zTtOf0DfF3034LtZ/wbf1fvXuzZ5rX9L/4Hep94d/KKued/hQb0fvQuT17zv6Q/pfuu7AT+p67we8DPdV71rk9d5PdJ/ofsHNc8f6mH8CY/ofph5Kh+Fz/THND/dW/hNXef1gsc1D/P/rs15/QM7bzcM</R> + <H COMPRESSED="B64Z-LittleEndian" BITSIZE="64">eJxdz8diAQAQRVF+TRefr0ZE7wRREp2FYzOzOZu3uJNKvK7MCqussc4G0/wMuya/2OI3M2yHXYdd9thnlgMOOeI4dE+Y4zTsZpyH7gXzXIbdD1ehe80CN/zllrvQvWeRh7D743/oPvKDp7A78xK6ryzx/e+Ndz6YSIJPE3E3WQAA</H> + </ELEMENT> + <COMPOSITE> + <C ID="0"> A[0-65] </C> + <C ID="1"> F[211,217,222,227,232,236,240,245,249] </C> + <C ID="2"> F[4,0,17,13,86,82,93,89,126,122,139,135,212,228,241,266,269,302,305,334,337] </C> + <C ID="3"> F[133,129,146,142,157,153,164,160,177,173,190,186,250,246,242,336,339,346,348,356,358] </C> + <C ID="4"> F[184,180,197,193,106,102,113,109,55,51,68,64,251,238,224,350,353,318,321,284,288] </C> + <C ID="5"> F[62,58,75,71,35,31,42,38,10,7,23,20,225,220,215,286,290,272,276,258,262] </C> + <C ID="6"> F[263,270,277,282,289,294,299,306,311,316,322,326,332,338,344,349,354,359] </C> + <C ID="7"> H[66-75] </C> + <C ID="8"> R[76-111] </C> + <C ID="9"> P[112-180] </C> + </COMPOSITE> + <DOMAIN> C[0,7,8,9] </DOMAIN> + </GEOMETRY> +</NEKTAR> diff --git a/main.cpp b/main.cpp index 94521fba49a392368c2ca4c15eb1a9225626fb11..f68675d91a0b27a54ed26e680548599f54660b07 100644 --- a/main.cpp +++ b/main.cpp @@ -92,46 +92,52 @@ int main(int argc, char *argv[]) std::cout << "Initial shape:\n" << in << std::endl << std::endl; + // Test SIMD Implementation #ifdef NEKTAR_ENABLE_SIMD_AVX2 - // Test out field reshaping - in.ReshapeStorage<4>(); - std::cout << "Reshaped to 4:\n" << in << std::endl << std::endl; + { + // Test out field reshaping + in.ReshapeStorage<4>(); + std::cout << "Reshaped to 4:\n" << in << std::endl << std::endl; - // Test out SIMD instructions - using vec_t = tinysimd::simd<double>; + // Test out SIMD instructions + using vec_t = tinysimd::simd<double>; - // Reshape into vec_t::width, with the right memory alignment requirements - in.ReshapeStorage<vec_t::width, vec_t::alignment>(); - std::cout << "SIMD In:\n" << in << std::endl << std::endl; + // Reshape into vec_t::width, with the right memory alignment + // requirements + in.ReshapeStorage<vec_t::width, vec_t::alignment>(); + std::cout << "SIMD In:\n" << in << std::endl << std::endl; - in2.ReshapeStorage<vec_t::width, vec_t::alignment>(); - std::cout << "SIMD Out (before):\n" << out << std::endl << std::endl; + in2.ReshapeStorage<vec_t::width, vec_t::alignment>(); + std::cout << "SIMD Out (before):\n" << out << std::endl << std::endl; - // Reinterpret casting from double to vector type allows for efficient - // conversion to SIMD intrinsics - vec_t::vectorType *inptr = - reinterpret_cast<vec_t::vectorType *>(in.GetStorage().GetCPUPtr()); - vec_t::scalarType *in2ptr = in2.GetStorage().GetCPUPtr(); + // Reinterpret casting from double to vector type allows for efficient + // conversion to SIMD intrinsics + vec_t::vectorType *inptr = + reinterpret_cast<vec_t::vectorType *>(in.GetStorage().GetCPUPtr()); - // Loop over each block in the field and square each element - for (auto const &block : blocks_coeff) - { - const size_t numMetaBlocks = block.num_elements / vec_t::width; - for (size_t metaBlock = 0; metaBlock < numMetaBlocks * block.num_pts; - ++metaBlock) + vec_t::scalarType *in2ptr = in2.GetStorage().GetCPUPtr(); + + // Loop over each block in the field and square each element + for (auto const &block : inCoeff.GetBlocks()) { - (vec_t(inptr[metaBlock]) * vec_t(inptr[metaBlock])).store(in2ptr); - in2ptr += vec_t::width; - } + const size_t numMetaBlocks = block.num_elements / vec_t::width; + for (size_t metaBlock = 0; + metaBlock < numMetaBlocks * block.num_pts; ++metaBlock) + { + (vec_t(inptr[metaBlock]) * vec_t(inptr[metaBlock])) + .store(in2ptr); + in2ptr += vec_t::width; + } - inptr += numMetaBlocks * block.num_pts; - } + inptr += numMetaBlocks * block.num_pts; + } - // Back to non-interleaved for non-SIMD Operators - in.ReshapeStorage<1>(); - in2.ReshapeStorage<1>(); + // Back to non-interleaved for non-SIMD Operators + in.ReshapeStorage<1>(); + in2.ReshapeStorage<1>(); - std::cout << "Out:\n" << in2 << std::endl; + std::cout << "Out:\n" << in2 << std::endl; + } #endif // Operators are instantiated using a Factory pattern. First lets check @@ -153,93 +159,146 @@ int main(int argc, char *argv[]) // Let's display the result std::cout << out << std::endl; - in = Field<double, FieldState::Coeff>::create(blocks_coeff); - auto &inStorage = in.GetStorage(); - std::fill(inStorage.GetCPUPtr(), inStorage.GetCPUPtr() + inStorage.size(), - 0); - out = Field<double, FieldState::Phys>::create(blocks_phys); + // Test BwdTrans Implementation + { + std::cout << "BwdTrans (StdMat) test starts." << std::endl; - auto &outStorage = out.GetStorage(); - std::fill(outStorage.GetCPUPtr(), - outStorage.GetCPUPtr() + outStorage.size(), 0); + // Create two Fields with memory on the CPU. + auto inCoeff = Field<double, FieldState::Coeff>::create(blocks_coeff); + auto outPhys = Field<double, FieldState::Phys>::create(blocks_phys); - for (auto const &block : in.GetBlocks()) - { - for (size_t el = 0; el < block.num_elements; ++el) + // Assign input values from the CPU. + auto *inptr = inCoeff.GetStorage().GetCPUPtr(); + for (auto const &block : inCoeff.GetBlocks()) { - in.GetStorage().GetCPUPtr()[el * block.num_pts] = 1; + for (size_t el = 0; el < block.num_elements; ++el) + { + for (size_t coeff = 0; coeff < block.num_pts; ++coeff) + { + *(inptr++) = coeff + 1; + } + } } - } - - std::cout << in << std::endl; - BwdTrans<>::create(explist, "StdMat")->apply(in, out); + std::cout << "Initial shape:\n" << inCoeff << std::endl; - std::cout << out << std::endl; + // Perform the BwdTrans on the fields. + BwdTrans<>::create(explist, "StdMat")->apply(inCoeff, outPhys); + // Check output values. + std::cout << "Out:" << std::endl; + auto outptr = outPhys.GetStorage().GetCPUPtr(); + for (auto const &block : out.GetBlocks()) + { + for (size_t el = 0; el < block.num_elements; ++el) + { + for (size_t phys = 0; phys < block.num_pts; ++phys) + { + std::cout << *(outptr++) << ' '; + } + } + std::cout << std::endl; + } + std::cout << std::endl; + } + // Test BwdTrans (CUDA) Implementation #ifdef NEKTAR_USE_CUDA - - // Test CUDA MemoryRegion - - // Create two Fields with memory on the GPU - in = Field<double, FieldState::Coeff>::create<MemoryRegionCUDA>( - blocks_coeff); - out = - Field<double, FieldState::Phys>::create<MemoryRegionCUDA>(blocks_phys); - - // Perform the BwdTrans on the fields using the CUDA implementation - // Since this is a CUDA operator, acting on CUDA fields, everything happens - // on the GPU. - BwdTrans<>::create(explist, "CUDA")->apply(in, out); - - // Test the GPU-backed fields with a CPU operator - // This should show a debug warning due to the implicit conversion. The - // purpose of this is to allow us to transition the code to the new - // infrastructure and add CUDA operators, without the need to add ALL CUDA - // operators before we can test anything. - BwdTrans<>::create(explist, "MatFree")->apply(in, out); - - // Create two CPU backed fields and use them with a GPU operator - // This call implicitly converts the fields to a GPU backend and warns the - // user about that fact - in = Field<double, FieldState::Coeff>::create(blocks); - out = Field<double, FieldState::Phys>::create(blocks); - BwdTrans<>::create(explist, "CUDA")->apply(in, out); -#endif - - std::cout << "Inner Product of a function WRT the Basis" << std::endl; - - // Create two Field objects with a MemoryRegionCPU backend by default - // for the inner product with respect to base - auto inPhys = Field<double, FieldState::Phys>::create(blocks_phys); - auto outCoeff = Field<double, FieldState::Coeff>::create(blocks_coeff); - - double *y = inPhys.GetStorage().GetCPUPtr(); - for (auto const &block : blocks_phys) { - for (size_t el = 0; el < block.num_elements; ++el) + std::cout << "BwdTrans (CUDA) test starts." << std::endl; + + // Create two Fields with memory on the GPU. + auto inCoeff = + Field<double, FieldState::Coeff>::create<MemoryRegionCUDA>( + blocks_coeff); + auto outPhys = + Field<double, FieldState::Phys>::create<MemoryRegionCUDA>( + blocks_phys); + + // Assign input values from the CPU. + std::cout << "Initial shape: " << std::endl; + auto *inptr = + inCoeff.template GetStorage<MemoryRegionCUDA>().GetCPUPtr(); + for (auto const &block : inCoeff.GetBlocks()) { - for (size_t phys = 0; phys < block.num_pts; ++phys) + for (size_t el = 0; el < block.num_elements; ++el) { - // Each element is the index of the quadrature points - // this is useful for testing reshapes - *(y++) = phys; + for (size_t coeff = 0; coeff < block.num_pts; ++coeff) + { + *(inptr++) = coeff + 1; + std::cout << coeff + 1 << " "; + } + } + } + std::cout << std::endl << std::endl; + + // Perform the BwdTrans on the fields using the CUDA implementation + // Since this is a CUDA operator, acting on CUDA fields, everything + // happens on the GPU. + BwdTrans<>::create(explist, "CUDA")->apply(inCoeff, outPhys); + + // Check output values. + std::cout << "Out:" << std::endl; + auto *outptr = + outPhys.template GetStorage<MemoryRegionCUDA>().GetCPUPtr(); + for (auto const &block : outPhys.GetBlocks()) + { + for (size_t el = 0; el < block.num_elements; ++el) + { + for (size_t phys = 0; phys < block.num_pts; ++phys) + { + std::cout << *(outptr++) << ' '; + } } + std::cout << std::endl; } + std::cout << std::endl; } +#endif + // Test IProductWRTBase Implementation + { + std::cout << "IProductWRTBase (StdMat) test starts." << std::endl; - memset(outCoeff.GetStorage().GetCPUPtr(), 0, - outCoeff.GetStorage().size() * sizeof(double)); + // Create two Field objects with a MemoryRegionCPU backend by default + // for the inner product with respect to base + auto inPhys = Field<double, FieldState::Phys>::create(blocks_phys); + auto outCoeff = Field<double, FieldState::Coeff>::create(blocks_coeff); - std::cout << "Initial shape:\n" << inPhys << std::endl << std::endl; + // Assign input values + auto *inptr = inPhys.GetStorage().GetCPUPtr(); + for (auto const &block : inPhys.GetBlocks()) + { + for (size_t el = 0; el < block.num_elements; ++el) + { + for (size_t phys = 0; phys < block.num_pts; ++phys) + { + // Each element is the index of the quadrature points + // this is useful for testing reshapes + *(inptr++) = phys; + } + } + } - // IProductWRTBase - auto ipwrtb = IProductWRTBase<>::create(explist, "StdMat"); - // ... and then apply it - ipwrtb->apply(inPhys, outCoeff); // out and in is inverted for the IP + std::cout << "Initial shape:\n" << inPhys << std::endl; - // Let's display the result - std::cout << "Out:\n" << outCoeff << std::endl; + // IProductWRTBase + IProductWRTBase<>::create(explist, "StdMat")->apply(inPhys, outCoeff); + + // Check output values. + std::cout << "Out:" << std::endl; + auto *outptr = outCoeff.GetStorage().GetCPUPtr(); + for (auto const &block : outCoeff.GetBlocks()) + { + for (size_t el = 0; el < block.num_elements; ++el) + { + for (size_t coeff = 0; coeff < block.num_pts; ++coeff) + { + std::cout << *(outptr++) << ' '; + } + } + std::cout << std::endl; + } + std::cout << std::endl; + } std::cout << "END" << std::endl; return 0; diff --git a/segment.xml b/segment.xml new file mode 100644 index 0000000000000000000000000000000000000000..48ae6b113810743a75093477a74c88fb4d4e0688 --- /dev/null +++ b/segment.xml @@ -0,0 +1,44 @@ +<?xml version="1.0" encoding="utf-8" ?> +<NEKTAR> + <GEOMETRY DIM="1" SPACE="1"> + <VERTEX> + <V ID="0"> -1.0 0.0 0.0</V> + <V ID="1"> -0.8 0.0 0.0</V> + <V ID="2"> -0.6 0.0 0.0</V> + <V ID="3"> -0.4 0.0 0.0</V> + <V ID="4"> -0.2 0.0 0.0</V> + <V ID="5"> 0.0 0.0 0.0</V> + <V ID="6"> 0.2 0.0 0.0</V> + <V ID="7"> 0.4 0.0 0.0</V> + <V ID="8"> 0.6 0.0 0.0</V> + <V ID="9"> 0.8 0.0 0.0</V> + <V ID="10"> 1.0 0.0 0.0</V> + </VERTEX> + + <ELEMENT> + <S ID="0"> 0 1 </S> + <S ID="1"> 1 2 </S> + <S ID="2"> 2 3 </S> + <S ID="3"> 3 4 </S> + <S ID="4"> 4 5 </S> + <S ID="5"> 5 6 </S> + <S ID="6"> 6 7 </S> + <S ID="7"> 7 8 </S> + <S ID="8"> 8 9 </S> + <S ID="9"> 9 10 </S> + </ELEMENT> + + <COMPOSITE> + <C ID="0"> S[0-9] </C> + <C ID="1"> V[0] </C> + <C ID="2"> V[10] </C> + </COMPOSITE> + + <DOMAIN> C[0] </DOMAIN> + </GEOMETRY> + + <EXPANSIONS> + <E COMPOSITE="C[0]" FIELDS="u" TYPE="MODIFIED" NUMMODES="4"/> + </EXPANSIONS> + +</NEKTAR> diff --git a/square2.xml b/square2.xml deleted file mode 100644 index e3961c52f714de67ee79c2b684737657c1b21904..0000000000000000000000000000000000000000 --- a/square2.xml +++ /dev/null @@ -1,41 +0,0 @@ -<?xml version="1.0" encoding="utf-8" ?> -<NEKTAR> - <GEOMETRY DIM="2" SPACE="2"> - <VERTEX COMPRESSED="B64Z-LittleEndian" BITSIZE="64">eJx1j8sVgjAABIOIgIrK324o3RYoIaV4yi7guBfey2SySwjbxM/+G0K24+sPPyFfxXN83/yMvlOg71y4f0knJfeLV+hH8Rp98+tx0GHnDbn/847cadhX/4P7xZ/oed8Lufe1f3pTOvZ1r+d+8QF3e9+I3Psm5M7Mvu69uV/8C2/eNsUA</VERTEX> - <EDGE COMPRESSED="B64Z-LittleEndian" BITSIZE="64">eJx1kEkSglAMBQEVZxxBHEARh/vf0I29oKvyN6l+FUI6STJ8aVCzoI6CyhurH54EnIuZN1U/fTMxfXMx8xbB90sx36+Ceet/5T7MLcTM3Yi5x1Y5/9uJ8d4rZ4+DGL+jcvYrxXhX2p+9T2L2rgOPs3J8LmI8rsrxvInxaJTj34rxu2t/7vIQc5dOjMdTOffqxXi8lHPHtxiPj3Lu+xXj9wOocAa/</EDGE> - <ELEMENT> - <Q COMPRESSED="B64Z-LittleEndian" BITSIZE="64">eJx10EcOwjAABdFQ0kkPvYTO/W/IZrxgJLx50kSyvxJFv2eGc1z86UuMMdH30FPMMNe9oRdY4krvVFhjo12xeoud7knUexy0N1Ufca29Yf8Gt7jT3lx9jwftLdSPeNLeUv2MF+0N/3HCK960t1K/40N7a/UnvrS3UX/jR3u/iOQFQAAA</Q> - </ELEMENT> - <COMPOSITE> - <C ID="0"> Q[0-11] </C> - <C ID="1"> E[0,13,22,31,11,21,30,39,3,6,9,12,32,34,36,38] </C> - <C ID="2"> Q[12-15] </C> - </COMPOSITE> - <DOMAIN> C[0,2] </DOMAIN> - </GEOMETRY> - <Metadata> - <Provenance> - <GitBranch></GitBranch> - <GitSHA1>88b1cb3be29cc9090d8a14af19c26decd88345db</GitSHA1> - <Hostname>kingscross</Hostname> - <NektarVersion>5.1.0</NektarVersion> - <Timestamp>27-Apr-2023 15:32:08</Timestamp> - </Provenance> - <NekMeshCommandLine>square.msh square.xml </NekMeshCommandLine> - </Metadata> - <EXPANSIONS> - <E COMPOSITE="C[0]" NUMMODES="4" TYPE="MODIFIED" FIELDS="u" /> - <E COMPOSITE="C[2]" NUMMODES="3" TYPE="MODIFIED" FIELDS="u" /> - </EXPANSIONS> - <CONDITIONS> - <SOLVERINFO> - <I PROPERTY="EQTYPE" VALUE="Helmholtz" /> - </SOLVERINFO> - <VARIABLES> - <V ID="0">u</V> - </VARIABLES> - <FUNCTION NAME="Forcing"> - <E VAR="u" VALUE="0" /> - </FUNCTION> - </CONDITIONS> -</NEKTAR> diff --git a/square_all_elements.xml b/square_all_elements.xml new file mode 100644 index 0000000000000000000000000000000000000000..3519e31cd62bf0ccfb15e507e20dffc628c408db --- /dev/null +++ b/square_all_elements.xml @@ -0,0 +1,63 @@ +<?xml version="1.0" encoding="utf-8"?> + +<NEKTAR xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" + xsi:noNamespaceSchemaLocation="http://www.nektar.info/schema/nektar.xsd"> + + <GEOMETRY DIM="2" SPACE="2"> + + <VERTEX> + <V ID="0"> -1.0 -1.0 0.0 </V> + <V ID="1"> 0.0 -1.0 0.0 </V> + <V ID="2"> 1.0 -1.0 0.0 </V> + <V ID="3"> -1.0 0.0 0.0 </V> + <V ID="4"> 0.0 0.0 0.0 </V> + <V ID="5"> 1.0 0.0 0.0 </V> + <V ID="6"> -1.0 1.0 0.0 </V> + <V ID="7"> 0.0 1.0 0.0 </V> + <V ID="8"> 1.0 1.0 0.0 </V> + </VERTEX> + + <EDGE> + <E ID="0"> 0 1 </E> + <E ID="1"> 1 2 </E> + <E ID="2"> 0 3 </E> + <E ID="3"> 1 4 </E> + <E ID="4"> 2 5 </E> + <E ID="5"> 3 4 </E> + <E ID="6"> 4 5 </E> + <E ID="7"> 6 3 </E> + <E ID="8"> 4 7 </E> + <E ID="9"> 5 8 </E> + <E ID="10"> 6 7 </E> + <E ID="11"> 7 8 </E> + <E ID="12"> 0 4 </E> + <E ID="13"> 1 5 </E> + </EDGE> + + <ELEMENT> + <T ID="0"> 0 3 12 </T> + <T ID="1"> 2 12 5 </T> + <T ID="2"> 1 4 13 </T> + <T ID="3"> 3 13 6 </T> + <Q ID="4"> 5 8 10 7 </Q> + <Q ID="5"> 8 6 9 11 </Q> + </ELEMENT> + + <COMPOSITE> + <C ID="0"> T[0-3] </C> + <C ID="1"> Q[4-5] </C> + <C ID="2"> E[2,7,4,9] </C> + <C ID="3"> E[0,1,10,11] </C> + </COMPOSITE> + + <DOMAIN> C[0-1] </DOMAIN> + + </GEOMETRY> + + <EXPANSIONS> + <E COMPOSITE="C[0]" NUMMODES="6,7" FIELDS="u" BASISTYPE="Modified_A,Modified_B" NUMPOINTS="7,8" POINTSTYPE="GaussLobattoLegendre,GaussRadauMAlpha1Beta0"/> + <E COMPOSITE="C[1]" NUMMODES="6,7" FIELDS="u" BASISTYPE="Modified_A,Modified_A" NUMPOINTS="7,8" POINTSTYPE="GaussLobattoLegendre,GaussLobattoLegendre"/> + </EXPANSIONS> + +</NEKTAR> + diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 1a4f15028a80f74ef5c49515f64334c88b5d3599..8ffb0015f0679548ca828c34ae4ed4e67b9f4b6a 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -13,6 +13,15 @@ target_include_directories(test_bwdtrans PRIVATE ${NEKTAR++_INCLUDE_DIRS}) target_compile_definitions(test_bwdtrans PRIVATE -DTEST_PATH="${CMAKE_SOURCE_DIR}") +add_executable(test_bwdtranscuda test_bwdtranscuda.cpp) +target_link_libraries(test_bwdtranscuda PRIVATE Operators) +target_link_libraries(test_bwdtranscuda PRIVATE ${NEKTAR++_LIBRARIES}) +target_link_libraries(test_bwdtranscuda PRIVATE Boost::unit_test_framework) +target_include_directories(test_bwdtranscuda PRIVATE "${CMAKE_SOURCE_DIR}") +target_include_directories(test_bwdtranscuda PRIVATE ${NEKTAR++_INCLUDE_DIRS}) +target_compile_definitions(test_bwdtranscuda PRIVATE + -DTEST_PATH="${CMAKE_SOURCE_DIR}") + add_executable(test_ipwrtbase test_ipwrtbase.cpp) target_link_libraries(test_ipwrtbase PRIVATE Operators) target_link_libraries(test_ipwrtbase PRIVATE ${NEKTAR++_LIBRARIES}) @@ -25,8 +34,12 @@ target_compile_definitions(test_ipwrtbase PRIVATE add_test( NAME BwdTrans COMMAND test_bwdtrans - # Test executable is looking for input session file in the working - # directory + WORKING_DIRECTORY "${CMAKE_SOURCE_DIR}" +) + +add_test( + NAME BwdTransCUDA + COMMAND test_bwdtranscuda WORKING_DIRECTORY "${CMAKE_SOURCE_DIR}" ) diff --git a/tests/init_fields.hpp b/tests/init_fields.hpp index d623d935d7584e4f50323e4cc454c497358d0748..594b38e9a661ffe3e7cc6635c74abfaf536b2ccc 100644 --- a/tests/init_fields.hpp +++ b/tests/init_fields.hpp @@ -8,6 +8,10 @@ #include <Operators/OperatorBwdTrans.hpp> #include <Operators/OperatorIProductWRTBase.hpp> +#ifdef NEKTAR_USE_CUDA +#include "MemoryRegionCUDA.hpp" +#endif + using namespace Nektar::Operators; using namespace Nektar::LibUtilities; using namespace Nektar; @@ -25,23 +29,26 @@ using namespace Nektar; * https://www.boost.org/doc/libs/1_82_0/libs/test/doc/html/boost_test/tests_organization/fixtures/case.html */ -template <FieldState stateIn = FieldState::Coeff, +template <typename TData, FieldState stateIn = FieldState::Coeff, FieldState stateOut = FieldState::Phys> class InitFields { public: - Field<double, stateIn> *fixt_in; - Field<double, stateOut> *fixt_out; - Field<double, stateOut> *fixt_expected; + Field<TData, stateIn> *fixt_in; + Field<TData, stateOut> *fixt_out; + Field<TData, stateOut> *fixt_expected; +#ifdef NEKTAR_USE_CUDA + Field<TData, stateIn> *fixtcuda_in; + Field<TData, stateOut> *fixtcuda_out; +#endif MultiRegions::ExpListSharedPtr fixt_explist{nullptr}; + ~InitFields() { BOOST_TEST_MESSAGE("teardown fixture"); } - InitFields() - { - } + InitFields() = default; void Configure() { @@ -66,12 +73,21 @@ public: auto blocks_out = GetBlockAttributes(stateOut, fixt_explist); // Create two Field objects with a MemoryRegionCPU backend by default - auto f_in = Field<double, stateIn>::create(blocks_in); - auto f_out = Field<double, stateOut>::create(blocks_out); - auto f_expected = Field<double, stateOut>::create(blocks_out); - fixt_in = new Field<double, stateIn>(std::move(f_in)); - fixt_out = new Field<double, stateOut>(std::move(f_out)); - fixt_expected = new Field<double, stateOut>(std::move(f_expected)); + auto f_in = Field<TData, stateIn>::create(blocks_in); + auto f_out = Field<TData, stateOut>::create(blocks_out); + auto f_expected = Field<TData, stateOut>::create(blocks_out); + fixt_in = new Field<TData, stateIn>(std::move(f_in)); + fixt_out = new Field<TData, stateOut>(std::move(f_out)); + fixt_expected = new Field<TData, stateOut>(std::move(f_expected)); +#ifdef NEKTAR_USE_CUDA + auto fcuda_in = + Field<TData, stateIn>::template create<MemoryRegionCUDA>(blocks_in); + auto fcuda_out = + Field<TData, stateOut>::template create<MemoryRegionCUDA>( + blocks_out); + fixtcuda_in = new Field<TData, stateIn>(std::move(fcuda_in)); + fixtcuda_out = new Field<TData, stateOut>(std::move(fcuda_out)); +#endif } protected: diff --git a/tests/test_bwdtrans.cpp b/tests/test_bwdtrans.cpp index 70c2a51f03456facac226ff46a19b869e4820c8e..c87d47622e022910231cfde6a7fd999f55754d7c 100644 --- a/tests/test_bwdtrans.cpp +++ b/tests/test_bwdtrans.cpp @@ -18,7 +18,7 @@ using namespace Nektar::Operators; using namespace Nektar::LibUtilities; using namespace Nektar; -class Line : public InitFields<FieldState::Coeff, FieldState::Phys> +class Line : public InitFields<double, FieldState::Coeff, FieldState::Phys> { protected: virtual std::string GetMeshName() override @@ -27,7 +27,7 @@ protected: } }; -class Square : public InitFields<FieldState::Coeff, FieldState::Phys> +class Square : public InitFields<double, FieldState::Coeff, FieldState::Phys> { protected: virtual std::string GetMeshName() override @@ -36,9 +36,9 @@ protected: } }; -using init = InitFields<FieldState::Coeff, FieldState::Phys>; BOOST_FIXTURE_TEST_CASE(bwdtrans, Line) { + Configure(); double *x = fixt_in->GetStorage().GetCPUPtr(); @@ -74,10 +74,12 @@ BOOST_FIXTURE_TEST_CASE(bwdtrans, Line) { for (size_t coeff = 0; coeff < block.num_pts; ++coeff) { - if (coeff == 0) { + if (coeff == 0) + { *(x++) = 1.0; } - else { + else + { *(x++) = 0.0; } } @@ -85,6 +87,6 @@ BOOST_FIXTURE_TEST_CASE(bwdtrans, Line) } BwdTrans<>::create(fixt_explist, "StdMat")->apply(*fixt_in, *fixt_out); - double TOL {0.01}; - BOOST_TEST( fixt_out->compare(*fixt_expected, TOL)); + double TOL{0.01}; + BOOST_TEST(fixt_out->compare(*fixt_expected, TOL)); } diff --git a/tests/test_bwdtranscuda.cpp b/tests/test_bwdtranscuda.cpp new file mode 100644 index 0000000000000000000000000000000000000000..a219516c5d19773c7f6dca8c164ab4c25c144230 --- /dev/null +++ b/tests/test_bwdtranscuda.cpp @@ -0,0 +1,70 @@ +#define BOOST_TEST_MODULE example +#include <boost/test/unit_test.hpp> + +#include <iostream> +#include <memory> + +#include "Field.hpp" +#include "Operators/OperatorBwdTrans.hpp" + +#include <LibUtilities/BasicUtils/SessionReader.h> +#include <MultiRegions/ExpList.h> +#include <SpatialDomains/MeshGraph.h> + +#include "init_fields.hpp" + +using namespace std; +using namespace Nektar::Operators; +using namespace Nektar::LibUtilities; +using namespace Nektar; + +class Line : public InitFields<double, FieldState::Coeff, FieldState::Phys> +{ +protected: + virtual std::string GetMeshName() override + { + return "line.xml"; + } +}; + +class Square : public InitFields<double, FieldState::Coeff, FieldState::Phys> +{ +protected: + virtual std::string GetMeshName() override + { + return "square.xml"; + } +}; + +BOOST_FIXTURE_TEST_CASE(bwdtranscuda, Line) +{ + Configure(); + + static double *x = + fixtcuda_in->template GetStorage<MemoryRegionCUDA>().GetCPUPtr(); + + for (auto const &block : fixtcuda_in->GetBlocks()) + { + for (size_t el = 0; el < block.num_elements; ++el) + { + for (size_t coeff = 0; coeff < block.num_pts; ++coeff) + { + *(x++) = coeff + 1; + } + } + } + + BwdTrans<>::create(fixt_explist, "CUDA") + ->apply(*fixtcuda_in, *fixtcuda_out); + + static double *y = + fixtcuda_out->template GetStorage<MemoryRegionCUDA>().GetCPUPtr(); + double TOL = 1e-12; + BOOST_CHECK_CLOSE(y[0], 1.000000000000000, TOL); + BOOST_CHECK_CLOSE(y[1], 0.808463389187877, TOL); + BOOST_CHECK_CLOSE(y[2], 1.993385866728399, TOL); + BOOST_CHECK_CLOSE(y[3], 1.312500000000000, TOL); + BOOST_CHECK_CLOSE(y[4], 2.321846776942122, TOL); + BOOST_CHECK_CLOSE(y[5], 4.082915537389534, TOL); + BOOST_CHECK_CLOSE(y[6], 2.000000000000000, TOL); +} diff --git a/tests/test_ipwrtbase.cpp b/tests/test_ipwrtbase.cpp index 459f0e24a1a651ba4c016bf3d496b17725048967..5e2f5f371736d33c1db8288a9405b79ea7deba23 100644 --- a/tests/test_ipwrtbase.cpp +++ b/tests/test_ipwrtbase.cpp @@ -18,7 +18,7 @@ using namespace Nektar::Operators; using namespace Nektar::LibUtilities; using namespace Nektar; -class Line : public InitFields<FieldState::Phys, FieldState::Coeff> +class Line : public InitFields<double, FieldState::Phys, FieldState::Coeff> { protected: virtual std::string GetMeshName() override @@ -27,7 +27,7 @@ protected: } }; -class Square : public InitFields<FieldState::Phys, FieldState::Coeff> +class Square : public InitFields<double, FieldState::Phys, FieldState::Coeff> { protected: virtual std::string GetMeshName() override