35#include "math_ext_kernel.h"
36#include <device/cuda/check.h>
37#include <device/device_config.h>
43#include <math/bcknd/device/device_mpi_op.h>
44#include <math/bcknd/device/device_mpi_reduce.h>
/**
 * Host-side launcher for copy_mask_kernel<real>.
 *
 * All arguments arrive by pointer. `size` and `mask_size` are dereferenced
 * here on the host; `a`, `b` and `mask` are forwarded to the kernel
 * unchanged (apart from the cast of `a` and `b` to real*).
 *
 * @param a          first array, cast to real* (presumably the destination
 *                   of the masked copy -- confirm against copy_mask_kernel)
 * @param b          second array, cast to real* (presumably the source)
 * @param size       host-readable pointer to the value passed as the
 *                   kernel's size argument
 * @param mask       index array forwarded as-is to the kernel
 * @param mask_size  host-readable pointer to the number of mask entries;
 *                   when it is 0 no kernel is launched
 */
void cuda_copy_mask(void* a, void* b, int* size, int* mask, int* mask_size) {
  const int n_mask = *mask_size;

  // Empty mask: return before any launch geometry is built, so a grid with
  // zero blocks is never submitted.
  if (n_mask == 0) {
    return;
  }

  // One thread per mask entry, rounded up to whole blocks.
  const int threads_per_block = 1024;
  const dim3 block_dim(threads_per_block, 1, 1);
  const dim3 grid_dim((n_mask + threads_per_block - 1) / threads_per_block,
                      1, 1);

  copy_mask_kernel<real>
      <<<grid_dim, block_dim, 0, (cudaStream_t)glb_cmd_queue>>>(
          (real*)a, (real*)b, *size, mask, n_mask);

  // Surface launch-configuration errors right away.
  CUDA_CHECK(cudaGetLastError());
}
/**
 * Host-side launcher for cadd_mask_kernel<real>.
 *
 * All arguments arrive by pointer. `c`, `size` and `mask_size` are
 * dereferenced here on the host and handed to the kernel by value; `a` and
 * `mask` are forwarded as pointers.
 *
 * @param a          array cast to real* (presumably updated in place at the
 *                   masked positions -- confirm against cadd_mask_kernel)
 * @param c          host-readable pointer to the scalar passed to the kernel
 * @param size       host-readable pointer to the value passed as the
 *                   kernel's size argument
 * @param mask       index array forwarded as-is to the kernel
 * @param mask_size  host-readable pointer to the number of mask entries;
 *                   when it is 0 no kernel is launched
 */
void cuda_cadd_mask(void* a, real* c, int* size, int* mask, int* mask_size) {
  const int n_mask = *mask_size;

  // Empty mask: return before any launch geometry is built, so a grid with
  // zero blocks is never submitted.
  if (n_mask == 0) {
    return;
  }

  // One thread per mask entry, rounded up to whole blocks.
  const int threads_per_block = 1024;
  const dim3 block_dim(threads_per_block, 1, 1);
  const dim3 grid_dim((n_mask + threads_per_block - 1) / threads_per_block,
                      1, 1);

  cadd_mask_kernel<real>
      <<<grid_dim, block_dim, 0, (cudaStream_t)glb_cmd_queue>>>(
          (real*)a, *c, *size, mask, n_mask);

  // Surface launch-configuration errors right away.
  CUDA_CHECK(cudaGetLastError());
}
/**
 * Host-side launcher for invcol1_mask_kernel<real>.
 *
 * All arguments arrive by pointer. `size` and `mask_size` are dereferenced
 * here on the host; `a` and `mask` are forwarded to the kernel as pointers.
 *
 * @param a          array cast to real* (presumably inverted in place at the
 *                   masked positions -- confirm against invcol1_mask_kernel)
 * @param size       host-readable pointer to the value passed as the
 *                   kernel's size argument
 * @param mask       index array forwarded as-is to the kernel
 * @param mask_size  host-readable pointer to the number of mask entries;
 *                   when it is 0 no kernel is launched
 */
void cuda_invcol1_mask(void* a, int* size, int* mask, int* mask_size) {
  const int n_mask = *mask_size;

  // Empty mask: return before any launch geometry is built, so a grid with
  // zero blocks is never submitted.
  if (n_mask == 0) {
    return;
  }

  // One thread per mask entry, rounded up to whole blocks.
  const int threads_per_block = 1024;
  const dim3 block_dim(threads_per_block, 1, 1);
  const dim3 grid_dim((n_mask + threads_per_block - 1) / threads_per_block,
                      1, 1);

  invcol1_mask_kernel<real>
      <<<grid_dim, block_dim, 0, (cudaStream_t)glb_cmd_queue>>>(
          (real*)a, *size, mask, n_mask);

  // Surface launch-configuration errors right away.
  CUDA_CHECK(cudaGetLastError());
}
/**
 * Host-side launcher for col2_mask_kernel<real>.
 *
 * All arguments arrive by pointer. `size` and `mask_size` are dereferenced
 * here on the host; `a`, `b` and `mask` are forwarded to the kernel
 * unchanged (apart from the cast of `a` and `b` to real*).
 *
 * @param a          first array, cast to real* (presumably updated in place
 *                   at the masked positions -- confirm against
 *                   col2_mask_kernel)
 * @param b          second array, cast to real*
 * @param size       host-readable pointer to the value passed as the
 *                   kernel's size argument
 * @param mask       index array forwarded as-is to the kernel
 * @param mask_size  host-readable pointer to the number of mask entries;
 *                   when it is 0 no kernel is launched
 */
void cuda_col2_mask(void* a, void* b, int* size, int* mask, int* mask_size) {
  const int n_mask = *mask_size;

  // Empty mask: return before any launch geometry is built, so a grid with
  // zero blocks is never submitted.
  if (n_mask == 0) {
    return;
  }

  // One thread per mask entry, rounded up to whole blocks.
  const int threads_per_block = 1024;
  const dim3 block_dim(threads_per_block, 1, 1);
  const dim3 grid_dim((n_mask + threads_per_block - 1) / threads_per_block,
                      1, 1);

  col2_mask_kernel<real>
      <<<grid_dim, block_dim, 0, (cudaStream_t)glb_cmd_queue>>>(
          (real*)a, (real*)b, *size, mask, n_mask);

  // Surface launch-configuration errors right away.
  CUDA_CHECK(cudaGetLastError());
}
// NOTE(review): the line that opens this definition (return type, function
// name and opening parenthesis) is not present above this parameter list.
// By analogy with the other wrappers in this file it is presumably
// `void cuda_col3_mask(` -- confirm against the original source.
// NOTE(review): the leading integers on several lines below (103, 105, ...)
// look like residual line numbers fused into the text; they are not valid
// C++ and should be stripped once confirmed.
//
// Launcher for col3_mask_kernel<real>: a, b and c are cast to real* and
// forwarded to the kernel, mask is forwarded as-is, and size / mask_size
// are dereferenced here on the host and passed by value.
103 void* a,
void* b,
void* c,
int* size,
int* mask,
int* mask_size) {
// Launch geometry: 1024 threads per block and enough blocks to cover
// *mask_size entries (integer ceiling division).
105 const dim3 nthrds(1024, 1, 1);
106 const dim3 nblcks(((*mask_size) + 1024 - 1) / 1024, 1, 1);
// An empty mask means no kernel is launched at all.
108 if (*mask_size == 0)
return;
// Launch on glb_cmd_queue (cast to cudaStream_t) with no dynamic shared
// memory (third launch argument is 0).
109 col3_mask_kernel<real><<<nblcks, nthrds, 0, (cudaStream_t)glb_cmd_queue>>>(
110 (real*)a, (real*)b, (real*)c, *size, mask, *mask_size);
// Pass the most recent CUDA error state to CUDA_CHECK.
// NOTE(review): no closing brace for this body is visible after this line.
111 CUDA_CHECK(cudaGetLastError());
// NOTE(review): the line that opens this definition (return type, function
// name and opening parenthesis) is not present above this parameter list.
// By analogy with the other wrappers in this file it is presumably
// `void cuda_sub3_mask(` -- confirm against the original source.
// NOTE(review): the leading integers on several lines below (117, 119, ...)
// look like residual line numbers fused into the text; they are not valid
// C++ and should be stripped once confirmed.
//
// Launcher for sub3_mask_kernel<real>: a, b and c are cast to real* and
// forwarded to the kernel, mask is forwarded as-is, and size / mask_size
// are dereferenced here on the host and passed by value.
117 void* a,
void* b,
void* c,
int* size,
int* mask,
int* mask_size) {
// Launch geometry: 1024 threads per block and enough blocks to cover
// *mask_size entries (integer ceiling division).
119 const dim3 nthrds(1024, 1, 1);
120 const dim3 nblcks(((*mask_size) + 1024 - 1) / 1024, 1, 1);
// An empty mask means no kernel is launched at all.
122 if (*mask_size == 0)
return;
// Launch on glb_cmd_queue (cast to cudaStream_t) with no dynamic shared
// memory (third launch argument is 0).
123 sub3_mask_kernel<real><<<nblcks, nthrds, 0, (cudaStream_t)glb_cmd_queue>>>(
124 (real*)a, (real*)b, (real*)c, *size, mask, *mask_size);
// Pass the most recent CUDA error state to CUDA_CHECK.
// NOTE(review): no closing brace for this body is visible after this line.
125 CUDA_CHECK(cudaGetLastError());