40#include <cuda_runtime.h>
43#include <neko/device/device_config.h>
44#include <neko/device/cuda/check.h>
45#include <neko/math/bcknd/device/device_mpi_op.h>
46#include <neko/math/bcknd/device/device_mpi_reduce.h>
49#include "math_ext_kernel.h"
56void cuda_copy_mask(
void* a,
void* b,
int* size,
int* mask,
int* mask_size) {
58 const dim3 nthrds(1024, 1, 1);
59 const dim3 nblcks(((*mask_size) + 1024 - 1) / 1024, 1, 1);
61 if (*mask_size == 0)
return;
62 copy_mask_kernel<real><<<nblcks, nthrds, 0, (cudaStream_t)glb_cmd_queue>>>(
63 (real*)a, (real*)b, *size, mask, *mask_size);
64 CUDA_CHECK(cudaGetLastError());
70void cuda_cadd_mask(
void* a, real* c,
int* size,
int* mask,
int* mask_size) {
72 const dim3 nthrds(1024, 1, 1);
73 const dim3 nblcks(((*mask_size) + 1024 - 1) / 1024, 1, 1);
75 if (*mask_size == 0)
return;
76 cadd_mask_kernel<real><<<nblcks, nthrds, 0, (cudaStream_t)glb_cmd_queue>>>(
77 (real*)a, *c, *size, mask, *mask_size);
78 CUDA_CHECK(cudaGetLastError());
84void cuda_invcol1_mask(
void* a,
int* size,
int* mask,
int* mask_size) {
86 const dim3 nthrds(1024, 1, 1);
87 const dim3 nblcks(((*mask_size) + 1024 - 1) / 1024, 1, 1);
89 if (*mask_size == 0)
return;
90 invcol1_mask_kernel<real>
91 <<<nblcks, nthrds, 0, (cudaStream_t)glb_cmd_queue>>>(
92 (real*)a, *size, mask, *mask_size);
93 CUDA_CHECK(cudaGetLastError());
99void cuda_col2_mask(
void* a,
void* b,
int* size,
int* mask,
int* mask_size) {
101 const dim3 nthrds(1024, 1, 1);
102 const dim3 nblcks(((*mask_size) + 1024 - 1) / 1024, 1, 1);
104 if (*mask_size == 0)
return;
105 col2_mask_kernel<real><<<nblcks, nthrds, 0, (cudaStream_t)glb_cmd_queue>>>(
106 (real*)a, (real*)b, *size, mask, *mask_size);
107 CUDA_CHECK(cudaGetLastError());
114 void* a,
void* b,
void* c,
int* size,
int* mask,
int* mask_size) {
116 const dim3 nthrds(1024, 1, 1);
117 const dim3 nblcks(((*mask_size) + 1024 - 1) / 1024, 1, 1);
119 if (*mask_size == 0)
return;
120 col3_mask_kernel<real><<<nblcks, nthrds, 0, (cudaStream_t)glb_cmd_queue>>>(
121 (real*)a, (real*)b, (real*)c, *size, mask, *mask_size);
122 CUDA_CHECK(cudaGetLastError());
129 void* a,
void* b,
void* c,
int* size,
int* mask,
int* mask_size) {
131 const dim3 nthrds(1024, 1, 1);
132 const dim3 nblcks(((*mask_size) + 1024 - 1) / 1024, 1, 1);
134 if (*mask_size == 0)
return;
135 sub3_mask_kernel<real><<<nblcks, nthrds, 0, (cudaStream_t)glb_cmd_queue>>>(
136 (real*)a, (real*)b, (real*)c, *size, mask, *mask_size);
137 CUDA_CHECK(cudaGetLastError());