35#ifndef __NEKO_CUDA_MATH_EXT_KERNELS__
36#define __NEKO_CUDA_MATH_EXT_KERNELS__
/**
 * Device kernel for a masked copy: a[mask[i]-1] = b[mask[i]-1].
 *
 * Only the x-dimension of the grid and block is used, and every thread
 * walks the mask with a grid-stride loop, so any 1D launch configuration
 * covers all mask entries.
 *
 * @param a         destination vector (written at the masked positions only)
 * @param b         source vector (read at the masked positions only)
 * @param size      not referenced by the kernel body
 * @param mask      index list; entries are used as `mask[i] - 1`, i.e. they
 *                  are treated as one-based indices into a and b
 * @param mask_size number of mask entries to process
 *
 * a, b and mask are declared __restrict__, so they must not overlap.
 */
template <typename T>
__global__ void copy_mask_kernel(T* __restrict__ a,
                                 T* __restrict__ b,
                                 const int size,
                                 int* __restrict__ mask,
                                 const int mask_size) {

  const int first = blockIdx.x * blockDim.x + threadIdx.x;
  const int stride = blockDim.x * gridDim.x;

  int i = first;
  while (i < mask_size) {
    /* Shift the one-based mask entry to a zero-based offset. */
    const int k = mask[i] - 1;
    a[k] = b[k];
    i += stride;
  }
}
/**
 * Device kernel adding a scalar at masked positions:
 * a[mask[i]-1] = a[mask[i]-1] + c.
 *
 * Only the x-dimension of the grid and block is used; the mask is
 * traversed with a grid-stride loop.
 *
 * @param a         vector updated in place at the masked positions
 * @param c         scalar added to each masked entry
 * @param size      not referenced by the kernel body
 * @param mask      index list; entries are used as `mask[i] - 1`, i.e. they
 *                  are treated as one-based indices into a
 * @param mask_size number of mask entries to process
 *
 * NOTE(review): each mask entry is handled by one thread with a plain
 * read-modify-write, so this presumably relies on the mask entries being
 * unique -- confirm against the callers.
 */
template <typename T>
__global__ void cadd_mask_kernel(T* __restrict__ a,
                                 const T c,
                                 const int size,
                                 int* __restrict__ mask,
                                 const int mask_size) {

  const int first = blockIdx.x * blockDim.x + threadIdx.x;
  const int stride = blockDim.x * gridDim.x;

  int i = first;
  while (i < mask_size) {
    /* Shift the one-based mask entry to a zero-based offset. */
    const int k = mask[i] - 1;
    const T current = a[k];
    a[k] = current + c;
    i += stride;
  }
}
/**
 * Device kernel inverting masked entries in place:
 * a[mask[i]-1] = 1 / a[mask[i]-1].
 *
 * Only the x-dimension of the grid and block is used; the mask is
 * traversed with a grid-stride loop.
 *
 * @param a         vector updated in place at the masked positions
 * @param size      not referenced by the kernel body
 * @param mask      index list; entries are used as `mask[i] - 1`, i.e. they
 *                  are treated as one-based indices into a
 * @param mask_size number of mask entries to process
 *
 * No check for zero entries is made; the division follows the usual
 * floating-point rules for T.
 *
 * NOTE(review): each mask entry is handled by one thread with a plain
 * read-modify-write, so this presumably relies on the mask entries being
 * unique -- confirm against the callers.
 */
template <typename T>
__global__ void invcol1_mask_kernel(T* __restrict__ a,
                                    const int size,
                                    int* __restrict__ mask,
                                    const int mask_size) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  /* Build the numerator in T: a bare `1.0` is a double literal and would
     promote the whole division to double precision when T is float. */
  const T one = static_cast<T>(1);

  for (int i = idx; i < mask_size; i += str) {
    /* Shift the one-based mask entry to a zero-based offset. */
    const int k = mask[i] - 1;
    a[k] = one / a[k];
  }
}
/**
 * Device kernel for a masked element-wise product in place:
 * a[mask[i]-1] = a[mask[i]-1] * b[mask[i]-1].
 *
 * Only the x-dimension of the grid and block is used; the mask is
 * traversed with a grid-stride loop.
 *
 * @param a         vector updated in place at the masked positions
 * @param b         multiplier vector (read at the masked positions only)
 * @param size      not referenced by the kernel body
 * @param mask      index list; entries are used as `mask[i] - 1`, i.e. they
 *                  are treated as one-based indices into a and b
 * @param mask_size number of mask entries to process
 *
 * a, b and mask are declared __restrict__, so they must not overlap.
 *
 * NOTE(review): each mask entry is handled by one thread with a plain
 * read-modify-write, so this presumably relies on the mask entries being
 * unique -- confirm against the callers.
 */
template <typename T>
__global__ void col2_mask_kernel(T* __restrict__ a,
                                 T* __restrict__ b,
                                 const int size,
                                 int* __restrict__ mask,
                                 const int mask_size) {

  const int first = blockIdx.x * blockDim.x + threadIdx.x;
  const int stride = blockDim.x * gridDim.x;

  int i = first;
  while (i < mask_size) {
    /* Shift the one-based mask entry to a zero-based offset. */
    const int k = mask[i] - 1;
    const T lhs = a[k];
    const T rhs = b[k];
    a[k] = lhs * rhs;
    i += stride;
  }
}
/**
 * Device kernel for a masked element-wise product:
 * a[mask[i]-1] = b[mask[i]-1] * c[mask[i]-1].
 *
 * Only the x-dimension of the grid and block is used; the mask is
 * traversed with a grid-stride loop.
 *
 * @param a         result vector (written at the masked positions only)
 * @param b         first factor (read at the masked positions only)
 * @param c         second factor (read at the masked positions only)
 * @param size      not referenced by the kernel body
 * @param mask      index list; entries are used as `mask[i] - 1`, i.e. they
 *                  are treated as one-based indices into a, b and c
 * @param mask_size number of mask entries to process
 *
 * a, b, c and mask are declared __restrict__, so they must not overlap.
 */
template <typename T>
__global__ void col3_mask_kernel(T* __restrict__ a,
                                 T* __restrict__ b,
                                 T* __restrict__ c,
                                 const int size,
                                 int* __restrict__ mask,
                                 const int mask_size) {

  const int first = blockIdx.x * blockDim.x + threadIdx.x;
  const int stride = blockDim.x * gridDim.x;

  int i = first;
  while (i < mask_size) {
    /* Shift the one-based mask entry to a zero-based offset. */
    const int k = mask[i] - 1;
    const T lhs = b[k];
    const T rhs = c[k];
    a[k] = lhs * rhs;
    i += stride;
  }
}
/**
 * Device kernel for a masked element-wise difference:
 * a[mask[i]-1] = b[mask[i]-1] - c[mask[i]-1].
 *
 * Only the x-dimension of the grid and block is used; the mask is
 * traversed with a grid-stride loop.
 *
 * @param a         result vector (written at the masked positions only)
 * @param b         minuend (read at the masked positions only)
 * @param c         subtrahend (read at the masked positions only)
 * @param size      not referenced by the kernel body
 * @param mask      index list; entries are used as `mask[i] - 1`, i.e. they
 *                  are treated as one-based indices into a, b and c
 * @param mask_size number of mask entries to process
 *
 * a, b, c and mask are declared __restrict__, so they must not overlap.
 */
template <typename T>
__global__ void sub3_mask_kernel(T* __restrict__ a,
                                 T* __restrict__ b,
                                 T* __restrict__ c,
                                 const int size,
                                 int* __restrict__ mask,
                                 const int mask_size) {

  const int first = blockIdx.x * blockDim.x + threadIdx.x;
  const int stride = blockDim.x * gridDim.x;

  int i = first;
  while (i < mask_size) {
    /* Shift the one-based mask entry to a zero-based offset. */
    const int k = mask[i] - 1;
    const T minuend = b[k];
    const T subtrahend = c[k];
    a[k] = minuend - subtrahend;
    i += stride;
  }
}