diff --git a/Operators/CUDAMathKernels.cuh b/Operators/CUDAMathKernels.cuh
new file mode 100644
index 0000000000000000000000000000000000000000..4dc130d3409e507fa23411d5327c6cee57ba21e7
--- /dev/null
+++ b/Operators/CUDAMathKernels.cuh
@@ -0,0 +1,105 @@
+namespace Nektar::Operators::detail
+{
+
+template <typename TData>
+__global__ void negKernel(const size_t nsize, const TData *x, TData *y)
+{
+    size_t i = blockDim.x * blockIdx.x + threadIdx.x;
+
+    while (i < nsize)
+    {
+        y[i] = -x[i];
+        i += blockDim.x * gridDim.x;
+    }
+}
+
+template <typename TData>
+__global__ void addKernel(const size_t nsize, const TData *x, const TData *y,
+                          TData *z)
+{
+    size_t i = blockDim.x * blockIdx.x + threadIdx.x;
+
+    while (i < nsize)
+    {
+        z[i] = x[i] + y[i];
+        i += blockDim.x * gridDim.x;
+    }
+}
+
+template <typename TData>
+__global__ void subKernel(const size_t nsize, const TData *x, const TData *y,
+                          TData *z)
+{
+    size_t i = blockDim.x * blockIdx.x + threadIdx.x;
+
+    while (i < nsize)
+    {
+        z[i] = x[i] - y[i];
+        i += blockDim.x * gridDim.x;
+    }
+}
+
+template <typename TData>
+__global__ void daxpyKernel(const size_t nsize, const TData alpha,
+                            const TData *x, const TData *y, TData *z)
+{
+    size_t i = blockDim.x * blockIdx.x + threadIdx.x;
+
+    while (i < nsize)
+    {
+        z[i] = alpha * x[i] + y[i];
+        i += blockDim.x * gridDim.x;
+    }
+}
+
+template <typename TData>
+__global__ void vdivKernel(const size_t nsize, const TData *x, const TData *y,
+                           TData *z)
+{
+    size_t i = blockDim.x * blockIdx.x + threadIdx.x;
+
+    while (i < nsize)
+    {
+        z[i] = x[i] / y[i];
+        i += blockDim.x * gridDim.x;
+    }
+}
+
+template <typename TData>
+__global__ void dotKernel(const size_t nsize, const TData *x, const TData *y,
+                          TData *out)
+{
+    extern __shared__ TData s[];
+
+    size_t i      = blockDim.x * blockIdx.x + threadIdx.x;
+    size_t sIndex = threadIdx.x;
+
+    TData tmp = 0.0;
+    while (i < nsize)
+    {
+        tmp += x[i] * y[i];
+        i += blockDim.x * gridDim.x;
+    }
+
+    s[sIndex] = tmp;
+
+    __syncthreads();
+
+    i = blockDim.x / 2;
+    while (i != 0)
+    {
+        if (sIndex < i)
+        {
+            s[sIndex] += s[sIndex + i];
+        }
+        __syncthreads();
+        i /= 2;
+    }
+
+    if (threadIdx.x == 0)
+    {
+        atomicAdd(out, s[0]);
+    }
+}
+
+} // namespace Nektar::Operators::detail
diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt
index 5c45a984598ad5eb08bc3c2382a8dc1c1c47521d..cae22f3c05a2a7df627e36aaba934a5156ffc188 100644
--- a/tests/CMakeLists.txt
+++ b/tests/CMakeLists.txt
@@ -4,6 +4,17 @@ find_package(
   COMPONENTS unit_test_framework
 )
 
+IF (NEKTAR_USE_CUDA)
+    add_executable(test_cudakernels test_cudakernels.cu)
+    target_link_libraries(test_cudakernels PRIVATE Operators)
+    target_link_libraries(test_cudakernels PRIVATE ${NEKTAR++_LIBRARIES})
+    target_link_libraries(test_cudakernels PRIVATE Boost::unit_test_framework)
+    target_include_directories(test_cudakernels PRIVATE "${CMAKE_SOURCE_DIR}")
+    target_include_directories(test_cudakernels PRIVATE ${NEKTAR++_INCLUDE_DIRS})
+    target_compile_definitions(test_cudakernels PRIVATE
+        -DTEST_PATH="${CMAKE_SOURCE_DIR}")
+ENDIF()
+
 add_executable(test_bwdtrans test_bwdtrans.cpp)
 target_link_libraries(test_bwdtrans PRIVATE Operators)
 target_link_libraries(test_bwdtrans PRIVATE ${NEKTAR++_LIBRARIES})
@@ -305,6 +316,14 @@ target_include_directories(test_helmsolve PRIVATE ${NEKTAR++_INCLUDE_DIRS})
 target_compile_definitions(test_helmsolve PRIVATE
     -DTEST_PATH="${CMAKE_SOURCE_DIR}")
 
+IF (NEKTAR_USE_CUDA)
+    add_test(
+      NAME CUDAKernels
+      COMMAND test_cudakernels
+      WORKING_DIRECTORY "${CMAKE_SOURCE_DIR}"
+    )
+ENDIF()
+
 add_test(
   NAME BwdTrans
   COMMAND test_bwdtrans
diff --git a/tests/init_cudakernels.hpp b/tests/init_cudakernels.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..46b14d5ccfd06efedc44fa0e367d3eae2b55f1b4
--- /dev/null
+++ b/tests/init_cudakernels.hpp
@@ -0,0 +1,51 @@
+#include "init_fields.hpp"
+
+using namespace std;
+using namespace Nektar::Operators;
+using namespace Nektar::LibUtilities;
+using namespace Nektar;
+
+class CUDAKernelsField
+    : public InitFields<double, FieldState::Phys, FieldState::Phys,
+                        MultiRegions::ContField>
+{
+public:
+    CUDAKernelsField()
+        : InitFields<double, FieldState::Phys, FieldState::Phys,
+                     MultiRegions::ContField>()
+    {
+    }
+
+    void SetTestCase()
+    {
+        Array<OneD, NekDouble> x(fixt_explist->GetTotPoints());
+        Array<OneD, NekDouble> y(fixt_explist->GetTotPoints());
+        Array<OneD, NekDouble> z(fixt_explist->GetTotPoints());
+        Array<OneD, NekDouble> fce(fixt_explist->GetTotPoints());
+        fixt_explist->GetCoords(x, y, z);
+        auto func1 = fixt_explist->GetSession()->GetFunction("Forcing", 0);
+        func1->Evaluate(x, y, z, fce);
+        std::copy(fce.get(), fce.get() + fixt_explist->GetTotPoints(),
+                  fixt_in->GetStorage().GetCPUPtr());
+        std::copy(
+            fce.get(), fce.get() + fixt_explist->GetTotPoints(),
+            fixtcuda_in->template GetStorage<MemoryRegionCUDA>().GetCPUPtr());
+        auto func2 =
+            fixt_explist->GetSession()->GetFunction("ExactSolution", 0);
+        func2->Evaluate(x, y, z, fce);
+        std::copy(fce.get(), fce.get() + fixt_explist->GetTotPoints(),
+                  fixt_out->GetStorage().GetCPUPtr());
+        std::copy(
+            fce.get(), fce.get() + fixt_explist->GetTotPoints(),
+            fixtcuda_out->template GetStorage<MemoryRegionCUDA>().GetCPUPtr());
+    }
+};
+
+class CUDAKernels : public CUDAKernelsField
+{
+public:
+    CUDAKernels()
+    {
+        meshName = "run/Helmholtz3D_Hex_AllBCs_P6.xml";
+    }
+};
diff --git a/tests/test_cudakernels.cu b/tests/test_cudakernels.cu
new file mode 100644
index 0000000000000000000000000000000000000000..858b5b23007e35a1d7fa9b928730877dc91a6879
--- /dev/null
+++ b/tests/test_cudakernels.cu
@@ -0,0 +1,166 @@
+#define BOOST_TEST_DYN_LINK
+#define BOOST_TEST_MODULE TestCUDAMathKernels
+#include <boost/test/tools/output_test_stream.hpp>
+#include <boost/test/unit_test.hpp>
+
+#include <iostream>
+#include <memory>
+
+#include <LibUtilities/BasicUtils/Vmath.hpp>
+
+#include "MemoryRegionCUDA.hpp"
+#include "Operators/CUDAMathKernels.cuh"
+#include "init_cudakernels.hpp"
+
+using namespace Nektar::Operators::detail;
+
+BOOST_AUTO_TEST_SUITE(TestCUDAKernels)
+
+BOOST_FIXTURE_TEST_CASE(cuda_dotkernel, CUDAKernels)
+{
+    Configure();
+    SetTestCase();
+    size_t n = fixtcuda_in->template GetStorage<MemoryRegionCUDA>().size();
+    size_t gridSize  = 1024;
+    size_t blockSize = 32;
+    double out, h_out, *d_out, *x, *y;
+
+    // Vmath results
+    x   = fixt_in->GetStorage().GetCPUPtr();
+    y   = fixt_out->GetStorage().GetCPUPtr();
+    out = Vmath::Dot(n, x, 1, y, 1);
+
+    // CUDA results
+    cudaMalloc((void **)&d_out, sizeof(double));
+    cudaMemset(d_out, 0, sizeof(double));
+    x = fixtcuda_in->template GetStorage<MemoryRegionCUDA>().GetGPUPtr();
+    y = fixtcuda_out->template GetStorage<MemoryRegionCUDA>().GetGPUPtr();
+    dotKernel<<<gridSize, blockSize, sizeof(double) * blockSize>>>(n, x, y,
+                                                                   d_out);
+    cudaMemcpy(&h_out, d_out, sizeof(double), cudaMemcpyDeviceToHost);
+
+    // Check results
+    BOOST_TEST(fabs(h_out - out) < 1.0E-12);
+    boost::test_tools::output_test_stream output;
+    {
+        std::cout << "CUDA = " << h_out << " Vmath = " << out << std::endl;
+    }
+}
+
+BOOST_FIXTURE_TEST_CASE(cuda_addkernel, CUDAKernels)
+{
+    Configure();
+    SetTestCase();
+    size_t n = fixtcuda_in->template GetStorage<MemoryRegionCUDA>().size();
+    size_t gridSize  = 1024;
+    size_t blockSize = 32;
+    double *x, *y;
+
+    // Vmath results
+    x = fixt_in->GetStorage().GetCPUPtr();
+    y = fixt_out->GetStorage().GetCPUPtr();
+    Vmath::Vadd(n, x, 1, y, 1, y, 1);
+
+    // CUDA results
+    x = fixtcuda_in->template GetStorage<MemoryRegionCUDA>().GetGPUPtr();
+    y = fixtcuda_out->template GetStorage<MemoryRegionCUDA>().GetGPUPtr();
+    addKernel<<<gridSize, blockSize>>>(n, x, y, y);
+
+    // Check results
+    BOOST_TEST(fixtcuda_out->compare(*fixt_out, 1.0E-15));
+    boost::test_tools::output_test_stream output;
+    {
+        OutputIfNotMatch(
+            fixtcuda_out->template GetStorage<MemoryRegionCUDA>().GetCPUPtr(),
+            fixt_out->GetStorage().GetCPUPtr(), 1.0E-15);
+    }
+}
+
+BOOST_FIXTURE_TEST_CASE(cuda_subkernel, CUDAKernels)
+{
+    Configure();
+    SetTestCase();
+    size_t n = fixtcuda_in->template GetStorage<MemoryRegionCUDA>().size();
+    size_t gridSize  = 1024;
+    size_t blockSize = 32;
+    double *x, *y;
+
+    // Vmath results
+    x = fixt_in->GetStorage().GetCPUPtr();
+    y = fixt_out->GetStorage().GetCPUPtr();
+    Vmath::Vsub(n, x, 1, y, 1, y, 1);
+
+    // CUDA results
+    x = fixtcuda_in->template GetStorage<MemoryRegionCUDA>().GetGPUPtr();
+    y = fixtcuda_out->template GetStorage<MemoryRegionCUDA>().GetGPUPtr();
+    subKernel<<<gridSize, blockSize>>>(n, x, y, y);
+
+    // Check results
+    BOOST_TEST(fixtcuda_out->compare(*fixt_out, 1.0E-15));
+    boost::test_tools::output_test_stream output;
+    {
+        OutputIfNotMatch(
+            fixtcuda_out->template GetStorage<MemoryRegionCUDA>().GetCPUPtr(),
+            fixt_out->GetStorage().GetCPUPtr(), 1.0E-15);
+    }
+}
+
+BOOST_FIXTURE_TEST_CASE(cuda_daxpykernel, CUDAKernels)
+{
+    Configure();
+    SetTestCase();
+    size_t n = fixtcuda_in->template GetStorage<MemoryRegionCUDA>().size();
+    size_t gridSize  = 1024;
+    size_t blockSize = 32;
+    double *x, *y, alpha = 1.5;
+
+    // Vmath results
+    x = fixt_in->GetStorage().GetCPUPtr();
+    y = fixt_out->GetStorage().GetCPUPtr();
+    Vmath::Svtvp(n, alpha, x, 1, y, 1, y, 1);
+
+    // CUDA results
+    x = fixtcuda_in->template GetStorage<MemoryRegionCUDA>().GetGPUPtr();
+    y = fixtcuda_out->template GetStorage<MemoryRegionCUDA>().GetGPUPtr();
+    daxpyKernel<<<gridSize, blockSize>>>(n, alpha, x, y, y);
+
+    // Check results
+    BOOST_TEST(fixtcuda_out->compare(*fixt_out, 1.0E-14));
+    boost::test_tools::output_test_stream output;
+    {
+        OutputIfNotMatch(
+            fixtcuda_out->template GetStorage<MemoryRegionCUDA>().GetCPUPtr(),
+            fixt_out->GetStorage().GetCPUPtr(), 1.0E-14);
+    }
+}
+
+BOOST_FIXTURE_TEST_CASE(cuda_vdivkernel, CUDAKernels)
+{
+    Configure();
+    SetTestCase();
+    size_t n = fixtcuda_in->template GetStorage<MemoryRegionCUDA>().size();
+    size_t gridSize  = 1024;
+    size_t blockSize = 32;
+    double *x, *y;
+
+    // Vmath results
+    x = fixt_in->GetStorage().GetCPUPtr();
+    y = fixt_out->GetStorage().GetCPUPtr();
+    Vmath::Vdiv(n, x, 1, y, 1, y, 1);
+
+    // CUDA results
+    x = fixtcuda_in->template GetStorage<MemoryRegionCUDA>().GetGPUPtr();
+    y = fixtcuda_out->template GetStorage<MemoryRegionCUDA>().GetGPUPtr();
+    vdivKernel<<<gridSize, blockSize>>>(n, x, y, y);
+
+    // Check results
+    BOOST_TEST(fixtcuda_out->compare(*fixt_out, 1.0E-15));
+    boost::test_tools::output_test_stream output;
+    {
+        OutputIfNotMatch(
+            fixtcuda_out->template GetStorage<MemoryRegionCUDA>().GetCPUPtr(),
+            fixt_out->GetStorage().GetCPUPtr(), 1.0E-15);
+    }
+}
+
+BOOST_AUTO_TEST_SUITE_END()