#ifndef MMA_CUDA_KERNEL_H
#define MMA_CUDA_KERNEL_H
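/**
 * CUDA kernels for the GPU backend of the Method of Moving Asymptotes (MMA):
 * asymptote and move-limit updates, the convex approximation coefficients,
 * residuals of the interior-point subsolve, and the warp/block reductions
 * used to assemble the dual systems.
 */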
template <typename T>
__global__ void mma_Ljjxinv_kernel(T* __restrict__ Ljjxinv,
                                   const T* __restrict__ pjlambda,
                                   const T* __restrict__ qjlambda,
                                   const T* __restrict__ x,
                                   const T* __restrict__ low,
                                   const T* __restrict__ upp,
                                   const T* __restrict__ alpha,
                                   const T* __restrict__ beta,
                                   const int n) {
  int tj = blockIdx.x * blockDim.x + threadIdx.x;
  if (tj < n) {
    const T xt = x[tj];
    T val = -1.0 / (2.0 * pjlambda[tj] / pow(upp[tj] - xt, 3) +
                    2.0 * qjlambda[tj] / pow(xt - low[tj], 3));
    /* variables sitting on an active move limit get a zero inverse entry */
    bool is_alpha = (xt == alpha[tj]);
    bool is_beta = (xt == beta[tj]);
    Ljjxinv[tj] = (is_alpha || is_beta) ? T(0.0) : val;
  }
}

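/* Closed-form primal minimizer of the MMA subproblem for fixed dual
 * variables: x_j = (sqrt(p_j) * low_j + sqrt(q_j) * upp_j) /
 * (sqrt(p_j) + sqrt(q_j)), clamped to the box [alpha_j, beta_j]. */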
template <typename T>
__global__ void mma_dipsolvesub1_kernel(T* __restrict__ x,
                                        const T* __restrict__ pjlambda,
                                        const T* __restrict__ qjlambda,
                                        const T* __restrict__ low,
                                        const T* __restrict__ upp,
                                        const T* __restrict__ alpha,
                                        const T* __restrict__ beta,
                                        const int n) {
  int tj = blockIdx.x * blockDim.x + threadIdx.x;
  if (tj < n) {
    T pj = sqrt(pjlambda[tj]);
    T qj = sqrt(qjlambda[tj]);
    T denom = pj + qj;
    T val = (pj * low[tj] + qj * upp[tj]) / denom;
    /* project the unconstrained minimizer onto [alpha, beta] */
    x[tj] = fmax(fmin(val, beta[tj]), alpha[tj]);
  }
}

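/* Transposed matrix-vector product: output_j = sum_i pij(i, j) * lambda_i,
 * with pij stored as pij[i + j * m]. */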
template <typename T>
__global__ void mattrans_v_mul_kernel(T* __restrict__ output,
                                      const T* __restrict__ pij,
                                      const T* __restrict__ lambda,
                                      const int m,
                                      const int n) {
  int tj = blockIdx.x * blockDim.x + threadIdx.x;
  if (tj < n) {
    output[tj] = 0.0;
    for (int i = 0; i < m; i++) {
      output[tj] += pij[i + tj * m] * lambda[i];
    }
  }
}

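/* Initial asymptotes: low/upp are placed symmetrically around x at a
 * distance asyinit * (xmax - xmin). */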
template <typename T>
__global__ void mma_sub1_kernel(T* __restrict__ xlow, T* __restrict__ xupp,
                                const T* __restrict__ x,
                                const T* __restrict__ xmin,
                                const T* __restrict__ xmax,
                                const T asyinit,
                                const int n) {
  int tj = blockIdx.x * blockDim.x + threadIdx.x;
  if (tj < n) {
    T xgap = xmax[tj] - xmin[tj];
    xlow[tj] = x[tj] - asyinit * xgap;
    xupp[tj] = x[tj] + asyinit * xgap;
  }
}

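/* Asymptote update for later iterations: contract (asydecr) when the design
 * oscillates between iterations, expand (asyincr) when it moves
 * monotonically, then clip to the band [0.01, 10] * xdiff around x. */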
template <typename T>
__global__ void mma_sub2_kernel(T* __restrict__ low, T* __restrict__ upp,
                                const T* __restrict__ x,
                                const T* __restrict__ xold1,
                                const T* __restrict__ xold2,
                                const T* __restrict__ xdiff,
                                const T asydecr,
                                const T asyincr,
                                const int n) {
  int tj = blockIdx.x * blockDim.x + threadIdx.x;
  if (tj < n) {
    const T xval = x[tj];
    const T xold1val = xold1[tj];
    const T xold2val = xold2[tj];
    const T lowval = low[tj];
    const T uppval = upp[tj];
    const T xdiffval = xdiff[tj];

    /* sign of the product decides whether the design oscillates */
    const T prod = (xval - xold1val) * (xold1val - xold2val);
    T asy_factor = (prod < T(0)) ? asydecr :
                   (prod > T(0)) ? asyincr : T(1);

    T new_low = fma(-asy_factor, (xold1val - lowval), xval);
    T new_upp = fma(asy_factor, (uppval - xold1val), xval);

    /* keep the asymptotes within the admissible band around x */
    new_low = max(new_low, xval - T(10.0) * xdiffval);
    new_low = min(new_low, xval - T(0.01) * xdiffval);
    new_upp = min(new_upp, xval + T(10.0) * xdiffval);
    new_upp = max(new_upp, xval + T(0.01) * xdiffval);

    low[tj] = new_low;
    upp[tj] = new_upp;
  }
}

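/* Move limits alpha/beta and the approximation coefficients p0j/q0j
 * (objective) and pij/qij (constraints), using the usual MMA weighting of
 * the positive and negative parts of the gradients. */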
template <typename T>
__global__ void mma_sub3_kernel(const T* __restrict__ x,
                                const T* __restrict__ df0dx,
                                const T* __restrict__ dfdx,
                                T* __restrict__ low, T* __restrict__ upp,
                                const T* __restrict__ xmin,
                                const T* __restrict__ xmax,
                                T* __restrict__ alpha, T* __restrict__ beta,
                                T* __restrict__ p0j, T* __restrict__ q0j,
                                T* __restrict__ pij, T* __restrict__ qij,
                                const int n,
                                const int m) {
  int tj = blockIdx.x * blockDim.x + threadIdx.x;
  if (tj < n) {
    T xgap = xmax[tj] - xmin[tj];
    alpha[tj] = max(max(xmin[tj], low[tj] +
                    0.1 * (x[tj] - low[tj])), x[tj] - 0.5 * xgap);
    beta[tj] = min(min(xmax[tj], upp[tj] -
                   0.1 * (upp[tj] - x[tj])), x[tj] + 0.5 * xgap);

    p0j[tj] = pow(upp[tj] - x[tj], 2) * (1.001 * max(df0dx[tj], 0.0) +
              0.001 * max(-df0dx[tj], 0.0) + 0.00001 / max(0.00001, xgap));
    q0j[tj] = pow(x[tj] - low[tj], 2) * (0.001 * max(df0dx[tj], 0.0) +
              1.001 * max(-df0dx[tj], 0.0) + 0.00001 / max(0.00001, xgap));

    for (int i = 0; i < m; i++) {
      pij[i + tj * m] = pow(upp[tj] - x[tj], 2) *
          (1.001 * max(dfdx[i + tj * m], 0.0) + 0.001 *
           max(-dfdx[i + tj * m], 0.0) + 0.00001 / max(0.00001, xgap));
      qij[i + tj * m] = pow(x[tj] - low[tj], 2) *
          (0.001 * max(dfdx[i + tj * m], 0.0) + 1.001 *
           max(-dfdx[i + tj * m], 0.0) + 0.00001 / max(0.00001, xgap));
    }
  }
}

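/* Per-constraint terms p_ij / (upp_j - x_j) + q_ij / (x_j - low_j), written
 * to temp with one entry per (i, j) pair. */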
template <typename T>
__global__ void mma_sub4_kernel(const T* __restrict__ x,
                                T* __restrict__ low, T* __restrict__ upp,
                                T* __restrict__ pij, T* __restrict__ qij,
                                T* __restrict__ temp,
                                const int n,
                                const int m) {
  int tj = blockIdx.x * blockDim.x + threadIdx.x;
  if (tj < n) {
    for (int i = 0; i < m; i++) {
      temp[i + tj * m] = pij[i + tj * m] / (upp[tj] - x[tj]) +
                         qij[i + tj * m] / (x[tj] - low[tj]);
    }
  }
}

template <typename T>
__global__ void mma_max2_kernel(T* __restrict__ xsi,
                                const T* __restrict__ x,
                                T* __restrict__ alpha,
                                const int n) {
  int tj = blockIdx.x * blockDim.x + threadIdx.x;
  if (tj < n) {
    xsi[tj] = max(1.0, 1.0 / (x[tj] - alpha[tj]));
  }
}

template <typename T>
__global__ void relambda_kernel(T* __restrict__ temp,
                                const T* __restrict__ x,
                                const T* __restrict__ xupp,
                                const T* __restrict__ xlow,
                                const T* __restrict__ pij,
                                const T* __restrict__ qij,
                                const int n,
                                const int m) {
  int tj = blockIdx.x * blockDim.x + threadIdx.x;
  if (tj < n) {
    for (int i = 0; i < m; i++) {
      temp[i + tj * m] = pij[i + tj * m] / (xupp[tj] - x[tj]) +
                         qij[i + tj * m] / (x[tj] - xlow[tj]);
    }
  }
}

template <typename T>
__global__ void sub2cons2_kernel(T* __restrict__ a,
                                 const T* __restrict__ b,
                                 const T* __restrict__ c,
                                 const T* __restrict__ d,
                                 const T e,
                                 const int n) {
  int tj = blockIdx.x * blockDim.x + threadIdx.x;
  if (tj < n) {
    a[tj] = b[tj] * (c[tj] - d[tj]) - e;
  }
}

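/* Warp-level max reduction using shuffle-down; after the call, lane 0 of
 * each warp holds the maximum over the whole warp. */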
template <typename T>
__inline__ __device__ T max_reduce_warp(T val) {
  val = max(val, __shfl_down_sync(0xffffffff, val, 16));
  val = max(val, __shfl_down_sync(0xffffffff, val, 8));
  val = max(val, __shfl_down_sync(0xffffffff, val, 4));
  val = max(val, __shfl_down_sync(0xffffffff, val, 2));
  val = max(val, __shfl_down_sync(0xffffffff, val, 1));
  return val;
}

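/* Block-level max of |a_i|: each thread strides over the array, warps reduce
 * with max_reduce_warp, warp results meet in shared memory, and thread 0
 * writes one partial maximum per block. */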
template <typename T>
__global__ void maxval_kernel(const T* __restrict__ a, T* temp,
                              const int n) {
  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  const unsigned int lane = threadIdx.x % warpSize;
  const unsigned int wid = threadIdx.x / warpSize;

  __shared__ T shared[32];
  T maxval = 0.0;
  for (int i = idx; i < n; i += str) {
    maxval = max(maxval, abs(a[i]));
  }

  maxval = max_reduce_warp<T>(maxval);
  if (lane == 0)
    shared[wid] = maxval;
  __syncthreads();

  maxval = (threadIdx.x < blockDim.x / warpSize) ? shared[lane] : 0.0;
  if (wid == 0)
    maxval = max_reduce_warp<T>(maxval);

  if (threadIdx.x == 0)
    temp[blockIdx.x] = maxval;
}

template <typename T>
__global__ void max_reduce_kernel(T* __restrict__ bufred,
                                  const int n) {
  T maxval = 0.0;
  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;
  for (int i = idx; i < n; i += str) {
    maxval = max(maxval, bufred[i]);
  }

  __shared__ T shared[32];
  unsigned int lane = threadIdx.x % warpSize;
  unsigned int wid = threadIdx.x / warpSize;

  maxval = max_reduce_warp<T>(maxval);
  if (lane == 0)
    shared[wid] = maxval;
  __syncthreads();

  maxval = (threadIdx.x < blockDim.x / warpSize) ? shared[lane] : 0.0;
  if (wid == 0)
    maxval = max_reduce_warp<T>(maxval);

  if (threadIdx.x == 0)
    bufred[blockIdx.x] = maxval;
}

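/*
 * Launch sketch (illustrative only, not part of this header): a device-wide
 * maximum is obtained in two passes, first one partial result per block,
 * then a single-block pass over the partials. The names real, a_d, bufred_d
 * and nblocks are placeholders.
 *
 *   maxval_kernel<real><<<nblocks, 1024>>>(a_d, bufred_d, n);
 *   max_reduce_kernel<real><<<1, 1024>>>(bufred_d, nblocks);
 */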
template <typename T>
__global__ void delx_kernel(T* __restrict__ delx,
                            const T* __restrict__ x,
                            const T* __restrict__ xlow,
                            const T* __restrict__ xupp,
                            const T* __restrict__ pij,
                            const T* __restrict__ qij,
                            const T* __restrict__ p0j,
                            const T* __restrict__ q0j,
                            const T* __restrict__ alpha,
                            const T* __restrict__ beta,
                            const T* __restrict__ lambda,
                            const T epsi,
                            const int n,
                            const int m) {
  int tj = blockIdx.x * blockDim.x + threadIdx.x;
  if (tj < n) {
    delx[tj] = 0.0;
    for (int i = 0; i < m; i++) {
      delx[tj] += pij[i + tj * m] * lambda[i] / pow(xupp[tj] - x[tj], 2) -
                  qij[i + tj * m] * lambda[i] / pow(x[tj] - xlow[tj], 2);
    }
    delx[tj] += p0j[tj] / pow(xupp[tj] - x[tj], 2) -
                q0j[tj] / pow(x[tj] - xlow[tj], 2) -
                epsi / (x[tj] - alpha[tj]) + epsi / (beta[tj] - x[tj]);
  }
}

template <typename T>
__global__ void GG_kernel(T* __restrict__ GG,
                          const T* __restrict__ x,
                          const T* __restrict__ xlow,
                          const T* __restrict__ xupp,
                          const T* __restrict__ pij,
                          const T* __restrict__ qij,
                          const int n,
                          const int m) {
  int tj = blockIdx.x * blockDim.x + threadIdx.x;
  if (tj < n) {
    for (int ggdumiter = 0; ggdumiter < m; ggdumiter++) {
      GG[ggdumiter + m * tj] =
          pij[ggdumiter + m * tj] / pow(xupp[tj] - x[tj], 2) -
          qij[ggdumiter + m * tj] / pow(x[tj] - xlow[tj], 2);
    }
  }
}

template <typename T>
__global__ void diagx_kernel(T* __restrict__ diagx,
                             const T* __restrict__ x,
                             const T* __restrict__ xsi,
                             const T* __restrict__ xlow,
                             const T* __restrict__ xupp,
                             const T* __restrict__ p0j,
                             const T* __restrict__ q0j,
                             const T* __restrict__ pij,
                             const T* __restrict__ qij,
                             const T* alpha,
                             const T* beta,
                             const T* eta,
                             const T* lambda,
                             const int n,
                             const int m) {
  int tj = blockIdx.x * blockDim.x + threadIdx.x;
  if (tj < n) {
    T sum = 0;
    T sum1 = 0;
    for (int i = 0; i < m; i++) {
      sum += pij[tj * m + i] * lambda[i];
      sum1 += qij[tj * m + i] * lambda[i];
    }
    diagx[tj] = (p0j[tj] + sum) / pow(xupp[tj] - x[tj], 3) +
                (q0j[tj] + sum1) / pow(x[tj] - xlow[tj], 3);
    diagx[tj] = 2.0 * diagx[tj] + xsi[tj] / (x[tj] - alpha[tj]) +
                eta[tj] / (beta[tj] - x[tj]);
  }
}

template <typename T>
__inline__ __device__ T reduce_warp(T val) {
  val += __shfl_down_sync(0xffffffff, val, 16);
  val += __shfl_down_sync(0xffffffff, val, 8);
  val += __shfl_down_sync(0xffffffff, val, 4);
  val += __shfl_down_sync(0xffffffff, val, 2);
  val += __shfl_down_sync(0xffffffff, val, 1);
  return val;
}

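/* Warp-level sum reduction; the mmareduce/mmasum/mmasumbb/mmasumHess/
 * mmasumAA kernels below combine it with shared memory to emit one partial
 * sum per block, each summing a different integrand over the variables. */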
template <typename T>
__global__ void mmareduce_kernel(T* __restrict__ bufred,
                                 const int n) {
  T sum = 0;
  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;
  for (int i = idx; i < n; i += str) {
    sum += bufred[i];
  }

  __shared__ T shared[32];
  unsigned int lane = threadIdx.x % warpSize;
  unsigned int wid = threadIdx.x / warpSize;

  sum = reduce_warp<T>(sum);
  if (lane == 0)
    shared[wid] = sum;
  __syncthreads();

  sum = (threadIdx.x < blockDim.x / warpSize) ? shared[lane] : 0;
  if (wid == 0)
    sum = reduce_warp<T>(sum);

  if (threadIdx.x == 0)
    bufred[blockIdx.x] = sum;
}

template <typename T>
__global__ void mmasum_kernel(const T* __restrict__ a, T* __restrict__ buf_h,
                              const int n,
                              const int m,
                              const int k) {
  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  const unsigned int lane = threadIdx.x % warpSize;
  const unsigned int wid = threadIdx.x / warpSize;

  __shared__ T shared[32];
  T sum = 0;
  for (int i = idx; i < n; i += str) {
    sum += a[m * i + k];
  }

  sum = reduce_warp<T>(sum);
  if (lane == 0)
    shared[wid] = sum;
  __syncthreads();

  sum = (threadIdx.x < blockDim.x / warpSize) ? shared[lane] : 0;
  if (wid == 0)
    sum = reduce_warp<T>(sum);

  if (threadIdx.x == 0)
    buf_h[blockIdx.x] = sum;
}

template <typename T>
__global__ void mmasumbb_kernel(const T* __restrict__ GG,
                                const T* __restrict__ delx,
                                const T* __restrict__ diagx,
                                T* __restrict__ buf_h,
                                const int n,
                                const int m,
                                const int k) {
  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  const unsigned int lane = threadIdx.x % warpSize;
  const unsigned int wid = threadIdx.x / warpSize;

  __shared__ T shared[32];
  T sum = 0;
  for (int i = idx; i < n; i += str) {
    sum += GG[k + i * m] * delx[i] / diagx[i];
  }

  sum = reduce_warp<T>(sum);
  if (lane == 0)
    shared[wid] = sum;
  __syncthreads();

  sum = (threadIdx.x < blockDim.x / warpSize) ? shared[lane] : 0;
  if (wid == 0)
    sum = reduce_warp<T>(sum);

  if (threadIdx.x == 0)
    buf_h[blockIdx.x] = sum;
}

template <typename T>
__global__ void mmasumHess_kernel(const T* __restrict__ hijx,
                                  const T* __restrict__ Ljjxinv,
                                  T* __restrict__ buf_h,
                                  const int n,
                                  const int m,
                                  const int k0,
                                  const int k1) {
  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  const unsigned int lane = threadIdx.x % warpSize;
  const unsigned int wid = threadIdx.x / warpSize;

  __shared__ T shared[32];
  T sum = 0;
  for (int i = idx; i < n; i += str) {
    sum += hijx[k0 + i * m] * Ljjxinv[i] * hijx[k1 + i * m];
  }

  sum = reduce_warp<T>(sum);
  if (lane == 0)
    shared[wid] = sum;
  __syncthreads();

  sum = (threadIdx.x < blockDim.x / warpSize) ? shared[lane] : 0;
  if (wid == 0)
    sum = reduce_warp<T>(sum);

  if (threadIdx.x == 0)
    buf_h[blockIdx.x] = sum;
}

template <typename T>
__global__ void mmasumAA_kernel(const T* __restrict__ GG,
                                const T* __restrict__ diagx,
                                T* __restrict__ buf_h,
                                const int n,
                                const int m,
                                const int k0,
                                const int k1) {
  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  const unsigned int lane = threadIdx.x % warpSize;
  const unsigned int wid = threadIdx.x / warpSize;

  __shared__ T shared[32];
  T sum = 0;
  for (int i = idx; i < n; i += str) {
    sum += GG[k0 + i * m] / diagx[i] * GG[k1 + i * m];
  }

  sum = reduce_warp<T>(sum);
  if (lane == 0)
    shared[wid] = sum;
  __syncthreads();

  sum = (threadIdx.x < blockDim.x / warpSize) ? shared[lane] : 0;
  if (wid == 0)
    sum = reduce_warp<T>(sum);

  if (threadIdx.x == 0)
    buf_h[blockIdx.x] = sum;
}

template <typename T>
__global__ void mma_copy_kernel(T* __restrict__ a,
                                const T* __restrict__ b,
                                const int n,
                                const int m) {
  int tj = blockIdx.x * blockDim.x + threadIdx.x;
  if (tj < n) {
    /* copy b into a starting at offset m (assumed copy semantics) */
    a[tj + m] = b[tj];
  }
}

template <typename T>
__global__ void AA_kernel(T* __restrict__ temp,
                          const T* __restrict__ GG,
                          const T* __restrict__ diagx,
                          const int n,
                          const int m) {
  int tj = blockIdx.x * blockDim.x + threadIdx.x;
  if (tj < n) {
    for (int i0 = 0; i0 < m; i0++) {
      for (int i1 = 0; i1 < m; i1++) {
        temp[tj + i0 * (n + 1) + i1 * (m + 1) * (n + 1)] =
            GG[i0 * n + tj] * (1.0 / diagx[tj]) * GG[i1 * n + tj];
      }
    }
  }
}

template <typename T>
__global__ void dx_kernel(T* __restrict__ dx,
                          const T* __restrict__ delx,
                          const T* __restrict__ diagx,
                          const T* __restrict__ GG,
                          const T* __restrict__ dlambda,
                          const int n,
                          const int m) {
  int tj = blockIdx.x * blockDim.x + threadIdx.x;
  if (tj < n) {
    dx[tj] = -delx[tj] / diagx[tj];
    for (int i = 0; i < m; i++) {
      dx[tj] -= GG[tj * m + i] * dlambda[i] / diagx[tj];
    }
  }
}

template <typename T>
__global__ void dxsi_kernel(T* __restrict__ dxsi,
                            const T* __restrict__ xsi,
                            const T* __restrict__ dx,
                            const T* __restrict__ x,
                            const T* __restrict__ alpha,
                            const T epsi,
                            const int n) {
  int tj = blockIdx.x * blockDim.x + threadIdx.x;
  if (tj < n) {
    dxsi[tj] = -xsi[tj] + (epsi - dx[tj] * xsi[tj]) / (x[tj] - alpha[tj]);
  }
}

template <typename T>
__global__ void deta_kernel(T* __restrict__ deta,
                            const T* __restrict__ eta,
                            const T* __restrict__ dx,
                            const T* __restrict__ x,
                            const T* __restrict__ beta,
                            const T epsi,
                            const int n) {
  int tj = blockIdx.x * blockDim.x + threadIdx.x;
  if (tj < n) {
    deta[tj] = -eta[tj] + (epsi + dx[tj] * eta[tj]) / (beta[tj] - x[tj]);
  }
}

template <typename T>
__global__ void RexCalculation_kernel(T* __restrict__ rex,
                                      const T* __restrict__ x,
                                      const T* __restrict__ xlow,
                                      const T* __restrict__ xupp,
                                      const T* __restrict__ pij,
                                      const T* __restrict__ p0j,
                                      const T* __restrict__ qij,
                                      const T* __restrict__ q0j,
                                      const T* __restrict__ lambda,
                                      const T* __restrict__ xsi,
                                      const T* __restrict__ eta,
                                      const int n,
                                      const int m) {
  int tj = blockIdx.x * blockDim.x + threadIdx.x;
  if (tj < n) {
    rex[tj] = 0.0;
    for (int i = 0; i < m; i++) {
      rex[tj] += pij[i + tj * m] * lambda[i] / pow(xupp[tj] - x[tj], 2) -
                 qij[i + tj * m] * lambda[i] / pow(x[tj] - xlow[tj], 2);
    }
    rex[tj] += p0j[tj] / pow(xupp[tj] - x[tj], 2) -
               q0j[tj] / pow(x[tj] - xlow[tj], 2) - xsi[tj] + eta[tj];
  }
}

template <typename T>
__global__ void rey_calculation_kernel(T* __restrict__ rey,
                                       const T* __restrict__ c,
                                       const T* __restrict__ d,
                                       const T* __restrict__ y,
                                       const T* __restrict__ lambda,
                                       const T* __restrict__ mu,
                                       const int n) {
  int tj = blockIdx.x * blockDim.x + threadIdx.x;
  if (tj < n) {
    rey[tj] = c[tj] + d[tj] * y[tj] - lambda[tj] - mu[tj];
  }
}

template <typename T>
__global__ void norm_kernel(const T* __restrict__ a, T* __restrict__ buf_h,
                            const int n) {
  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  const unsigned int lane = threadIdx.x % warpSize;
  const unsigned int wid = threadIdx.x / warpSize;

  __shared__ T shared[32];
  T sum = 0;
  for (int i = idx; i < n; i += str) {
    /* squared 2-norm contribution (assumed from the kernel name) */
    sum += a[i] * a[i];
  }

  sum = reduce_warp<T>(sum);
  if (lane == 0)
    shared[wid] = sum;
  __syncthreads();

  sum = (threadIdx.x < blockDim.x / warpSize) ? shared[lane] : 0;
  if (wid == 0)
    sum = reduce_warp<T>(sum);

  if (threadIdx.x == 0)
    buf_h[blockIdx.x] = sum;
}

template <typename T>
__global__ void sub2cons_kernel(T* __restrict__ a,
                                const T* __restrict__ b,
                                const T* __restrict__ c,
                                const T d,
                                const int n) {
  int tj = blockIdx.x * blockDim.x + threadIdx.x;
  if (tj < n) {
    a[tj] = b[tj] * c[tj] - d;
  }
}

template <typename T>
__global__ void dely_kernel(T* __restrict__ dely,
                            const T* __restrict__ c,
                            const T* __restrict__ d,
                            const T* __restrict__ y,
                            const T* __restrict__ lambda,
                            const T epsi,
                            const int n) {
  int tj = blockIdx.x * blockDim.x + threadIdx.x;
  if (tj < n) {
    dely[tj] = c[tj] + d[tj] * y[tj] - lambda[tj] - epsi / y[tj];
  }
}

template <typename T>
__global__ void maxval2_kernel(const T* __restrict__ a,
                               const T* __restrict__ b,
                               T* __restrict__ temp,
                               const T cons,
                               const int n) {
  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  const unsigned int lane = threadIdx.x % warpSize;
  const unsigned int wid = threadIdx.x / warpSize;

  __shared__ T shared[32];
  T maxval = cons * a[0] / b[0];
  for (int i = idx; i < n; i += str) {
    maxval = max(maxval, cons * a[i] / b[i]);
  }

  maxval = max_reduce_warp<T>(maxval);
  if (lane == 0)
    shared[wid] = maxval;
  __syncthreads();

  maxval = (threadIdx.x < blockDim.x / warpSize) ? shared[lane] : 0.0;
  if (wid == 0)
    maxval = max_reduce_warp<T>(maxval);

  if (threadIdx.x == 0)
    temp[blockIdx.x] = maxval;
}

template <typename T>
__global__ void maxval3_kernel(const T* __restrict__ a,
                               const T* __restrict__ b,
                               const T* __restrict__ c,
                               T* __restrict__ temp,
                               const T cons,
                               const int n) {
  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  const unsigned int lane = threadIdx.x % warpSize;
  const unsigned int wid = threadIdx.x / warpSize;

  __shared__ T shared[32];
  T maxval = cons * a[0] / b[0];
  for (int i = idx; i < n; i += str) {
    maxval = max(maxval, cons * a[i] / (b[i] - c[i]));
  }

  maxval = max_reduce_warp<T>(maxval);
  if (lane == 0)
    shared[wid] = maxval;
  __syncthreads();

  maxval = (threadIdx.x < blockDim.x / warpSize) ? shared[lane] : 0;
  if (wid == 0)
    maxval = max_reduce_warp<T>(maxval);

  if (threadIdx.x == 0)
    temp[blockIdx.x] = maxval;
}

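/* KKT stationarity residual in x:
 * rex_j = df0dx_j + sum_i dfdx(i, j) * lambda_i - xsi_j + eta_j. */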
template <typename T>
__global__ void kkt_rex_kernel(T* __restrict__ rex,
                               const T* __restrict__ df0dx,
                               const T* __restrict__ dfdx,
                               const T* __restrict__ xsi,
                               const T* __restrict__ eta,
                               const T* __restrict__ lambda,
                               const int n,
                               const int m) {
  int tj = blockIdx.x * blockDim.x + threadIdx.x;
  if (tj < n) {
    rex[tj] = 0.0;
    for (int i = 0; i < m; i++) {
      rex[tj] += dfdx[i + tj * m] * lambda[i];
    }
    rex[tj] += df0dx[tj] - xsi[tj] + eta[tj];
  }
}

template <typename T>
__global__ void maxcons_kernel(T* __restrict__ a,
                               const T b,
                               const T c,
                               const T* __restrict__ d,
                               const int n) {
  int tj = blockIdx.x * blockDim.x + threadIdx.x;
  if (tj < n) {
    a[tj] = max(b, c * d[tj]);
  }
}

template <typename T>
__global__ void glsum_kernel(const T* a, T* buf_h,
                             const int n) {
  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  const unsigned int lane = threadIdx.x % warpSize;
  const unsigned int wid = threadIdx.x / warpSize;

  __shared__ T shared[32];
  T sum = 0;
  for (int i = idx; i < n; i += str) {
    sum += a[i];
  }

  sum = reduce_warp<T>(sum);
  if (lane == 0)
    shared[wid] = sum;
  __syncthreads();

  sum = (threadIdx.x < blockDim.x / warpSize) ? shared[lane] : 0;
  if (wid == 0)
    sum = reduce_warp<T>(sum);

  if (threadIdx.x == 0)
    buf_h[blockIdx.x] = sum;
}

template <typename T>
__global__ void glsc2_kernel(const T* a,
                             const T* b,
                             T* buf_h,
                             const int n) {
  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  const unsigned int lane = threadIdx.x % warpSize;
  const unsigned int wid = threadIdx.x / warpSize;

  __shared__ T shared[32];
  T sum = 0;
  for (int i = idx; i < n; i += str) {
    /* scalar product contribution a_i * b_i */
    sum += a[i] * b[i];
  }

  sum = reduce_warp<T>(sum);
  if (lane == 0)
    shared[wid] = sum;
  __syncthreads();

  sum = (threadIdx.x < blockDim.x / warpSize) ? shared[lane] : 0;
  if (wid == 0)
    sum = reduce_warp<T>(sum);

  if (threadIdx.x == 0)
    buf_h[blockIdx.x] = sum;
}

template <typename T>
__global__ void add2inv2_kernel(T* __restrict__ a,
                                const T* __restrict__ b,
                                const T c,
                                const int n) {
  int tj = blockIdx.x * blockDim.x + threadIdx.x;
  if (tj < n) {
    a[tj] = a[tj] + c / b[tj];
  }
}

template <typename T>
__global__ void max2_kernel(T* __restrict__ a,
                            const T b,
                            const T* __restrict__ c,
                            const T d,
                            const int n) {
  int tj = blockIdx.x * blockDim.x + threadIdx.x;
  if (tj < n) {
    a[tj] = max(b, d * c[tj]);
  }
}

template <typename T>
__global__ void updatebb_kernel(T* __restrict__ bb,
                                const T* __restrict__ dellambda,
                                const T* __restrict__ dely,
                                const T* __restrict__ d,
                                const T* __restrict__ mu,
                                const T* __restrict__ y,
                                const T delz,
                                const int m) {
  int tj = blockIdx.x * blockDim.x + threadIdx.x;
  if (tj < m) {
    bb[tj] = dellambda[tj] + dely[tj] / (d[tj] + mu[tj] / y[tj]) - bb[tj];
  }
  else if (tj == m) {
    /* last entry of the bordered right-hand side (assumed to hold delz) */
    bb[tj] = delz;
  }
}

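/* Assembles the (m+1) x (m+1) bordered system of the dual Newton step:
 * constraint rows get globaltmp_mm plus the diagonal term s/lambda +
 * 1/(d + mu/y), the last column/row holds a, and the corner entry is
 * -zeta/z. */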
template <typename T>
__global__ void updateAA_kernel(T* __restrict__ AA,
                                const T* __restrict__ globaltmp_mm,
                                const T* __restrict__ s,
                                const T* __restrict__ lambda,
                                const T* __restrict__ d,
                                const T* __restrict__ mu,
                                const T* __restrict__ y,
                                const T* __restrict__ a,
                                const T zeta,
                                const T z,
                                const int m) {
  int tj = blockIdx.x * blockDim.x + threadIdx.x;
  if (tj < m) {
    AA[tj + tj * (m + 1)] = globaltmp_mm[tj + tj * m] +
        (s[tj] / lambda[tj] + 1.0 / (d[tj] + mu[tj] / y[tj]));
    AA[tj + m * (m + 1)] = a[tj];
    AA[m + tj * (m + 1)] = a[tj];
  }
  else if (tj == m) {
    AA[tj + tj * (m + 1)] = -zeta / z;
  }
}

template <typename T>
__global__ void dy_kernel(T* __restrict__ dy,
                          const T* __restrict__ dely,
                          const T* __restrict__ dlambda,
                          const T* __restrict__ d,
                          const T* __restrict__ mu,
                          const T* __restrict__ y,
                          const int n) {
  int tj = blockIdx.x * blockDim.x + threadIdx.x;
  if (tj < n) {
    dy[tj] = (-dely[tj] + dlambda[tj]) / (d[tj] + mu[tj] / y[tj]);
  }
}