#ifndef MMA_HIP_KERNEL_H
#define MMA_HIP_KERNEL_H

template <typename T>
__global__ void delta_1dbeam_kernel(T* __restrict__ Delta,
                                    const T L_total,
                                    const T Le,
                                    const int offset,
                                    const int n) {
  int k = blockIdx.x * blockDim.x + threadIdx.x;
  if (k < n) {
    const T x1 = L_total - Le * static_cast<T>(offset + k - 1);
    const T term1 = x1 * x1 * x1;

    const T x2 = L_total - Le * static_cast<T>(offset + k);
    const T term2 = x2 * x2 * x2;

    /* (x1^3 - x2^3) / 3 = integral of x^2 over the k-th element. */
    Delta[k] = (term1 - term2) / static_cast<T>(3.0);
  }
}

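/*
 * Launch sketch (illustration only): the element-wise kernels in this header
 * have no grid-stride loop, so they are launched with one thread per entry.
 * The block size of 1024 and the names stream, Delta_d and "real" are
 * assumptions made for this example, not defined in this header.
 *
 *   const int nthrds = 1024;
 *   const int nblcks = (n + nthrds - 1) / nthrds;
 *   hipLaunchKernelGGL(HIP_KERNEL_NAME(delta_1dbeam_kernel<real>),
 *                      dim3(nblcks), dim3(nthrds), 0, stream,
 *                      Delta_d, L_total, Le, offset, n);
 */
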
template <typename T>
__global__ void mma_Ljjxinv_kernel(T* __restrict__ Ljjxinv,
                                   const T* __restrict__ pjlambda,
                                   const T* __restrict__ qjlambda,
                                   const T* __restrict__ x,
                                   const T* __restrict__ low,
                                   const T* __restrict__ upp,
                                   const T* __restrict__ alpha,
                                   const T* __restrict__ beta,
                                   const int n) {
  int tj = blockIdx.x * blockDim.x + threadIdx.x;
  if (tj < n) {
    const T xt = x[tj];
    const T low_j = low[tj];
    const T upp_j = upp[tj];
    const T pj = pjlambda[tj];
    const T qj = qjlambda[tj];

    const T diff_u = upp_j - xt;
    const T diff_l = xt - low_j;

    const T diff_u3 = diff_u * diff_u * diff_u;
    const T diff_l3 = diff_l * diff_l * diff_l;

    /* Diagonal curvature of the MMA approximation at x. */
    const T denom = T(2.0) * pj / diff_u3 + T(2.0) * qj / diff_l3;
    /* Assumed: Ljjxinv stores the negative reciprocal of that curvature. */
    const T val = -T(1.0) / denom;

    /* Variables sitting exactly on a move limit do not contribute. */
    const bool active = (fabs(xt - alpha[tj]) <= T(1e-16)) ||
                        (fabs(xt - beta[tj]) <= T(1e-16));

    Ljjxinv[tj] = active ? T(0.0) : val;
  }
}

template <typename T>
__global__ void mma_dipsolvesub1_kernel(T* __restrict__ x,
                                        const T* __restrict__ pjlambda,
                                        const T* __restrict__ qjlambda,
                                        const T* __restrict__ low,
                                        const T* __restrict__ upp,
                                        const T* __restrict__ alpha,
                                        const T* __restrict__ beta,
                                        const int n) {
  int tj = blockIdx.x * blockDim.x + threadIdx.x;
  if (tj < n) {
    const T pj_sqrt = sqrt(pjlambda[tj]);
    const T qj_sqrt = sqrt(qjlambda[tj]);
    const T denom = pj_sqrt + qj_sqrt;

    const T low_j = low[tj];
    const T upp_j = upp[tj];
    const T alpha_j = alpha[tj];
    const T beta_j = beta[tj];

    /* Unconstrained minimiser of the dual subproblem ... */
    const T val = (pj_sqrt * low_j + qj_sqrt * upp_j) / denom;

    /* ... projected onto the box [alpha, beta]. */
    x[tj] = fmin(fmax(val, alpha_j), beta_j);
  }
}

template <typename T>
__global__ void mattrans_v_mul_kernel(T* __restrict__ output,
                                      const T* __restrict__ pij,
                                      const T* __restrict__ lambda,
                                      const int m,
                                      const int n) {
  int tj = blockIdx.x * blockDim.x + threadIdx.x;
  if (tj < n) {
    /* output = pij^T * lambda, with pij stored m x n (column j contiguous). */
    T acc = T(0);
    for (int i = 0; i < m; ++i) {
      acc += pij[i + tj * m] * lambda[i];
    }
    output[tj] = acc;
  }
}

template <typename T>
__global__ void mma_sub1_kernel(T* __restrict__ xlow,
                                T* __restrict__ xupp,
                                const T* __restrict__ x,
                                const T* __restrict__ xmin,
                                const T* __restrict__ xmax,
                                const T asyinit,
                                const int n) {
  int tj = blockIdx.x * blockDim.x + threadIdx.x;
  if (tj < n) {
    const T xt = x[tj];
    const T xmin_j = xmin[tj];
    const T xmax_j = xmax[tj];

    /* Initial asymptotes placed symmetrically around x. */
    const T xgap = xmax_j - xmin_j;
    const T offset = asyinit * xgap;

    xlow[tj] = xt - offset;
    xupp[tj] = xt + offset;
  }
}

template <typename T>
__global__ void mma_sub2_kernel(T* __restrict__ low, T* __restrict__ upp,
                                const T* __restrict__ x,
                                const T* __restrict__ xold1,
                                const T* __restrict__ xold2,
                                const T* __restrict__ xdiff,
                                const T asydecr,
                                const T asyincr,
                                const int n) {
  int tj = blockIdx.x * blockDim.x + threadIdx.x;
  if (tj < n) {
    const T xval = x[tj];
    const T xold1val = xold1[tj];
    const T xold2val = xold2[tj];
    const T lowval = low[tj];
    const T uppval = upp[tj];
    const T xdiffval = xdiff[tj];

    /* Oscillation indicator: sign of the product of the last two steps. */
    const T prod = (xval - xold1val) * (xold1val - xold2val);

    /* Shrink the asymptote interval on oscillation, grow it otherwise. */
    T asy_factor = (prod < T(0)) ? asydecr :
                   (prod > T(0)) ? asyincr : T(1);

    T new_low = fma(-asy_factor, (xold1val - lowval), xval);
    T new_upp = fma(asy_factor, (uppval - xold1val), xval);

    /* Keep the asymptotes within the usual MMA safeguard bounds. */
    new_low = max(new_low, xval - T(10.0) * xdiffval);
    new_low = min(new_low, xval - T(0.01) * xdiffval);

    new_upp = min(new_upp, xval + T(10.0) * xdiffval);
    new_upp = max(new_upp, xval + T(0.01) * xdiffval);

    low[tj] = new_low;
    upp[tj] = new_upp;
  }
}

template <typename T>
__global__ void mma_sub3_kernel(const T* __restrict__ x,
                                const T* __restrict__ df0dx,
                                const T* __restrict__ dfdx,
                                T* __restrict__ low, T* __restrict__ upp,
                                const T* __restrict__ xmin,
                                const T* __restrict__ xmax,
                                T* __restrict__ alpha, T* __restrict__ beta,
                                T* __restrict__ p0j, T* __restrict__ q0j,
                                T* __restrict__ pij, T* __restrict__ qij,
                                const int n,
                                const int m) {
  int tj = blockIdx.x * blockDim.x + threadIdx.x;
  if (tj < n) {
    const T xt = x[tj];
    const T xmin_j = xmin[tj];
    const T xmax_j = xmax[tj];
    const T low_j = low[tj];
    const T upp_j = upp[tj];
    const T df0 = df0dx[tj];
    const T xgap = xmax_j - xmin_j;

    const T half_xgap = T(0.5) * xgap;
    const T tenth_low_diff = T(0.1) * (xt - low_j);
    const T tenth_upp_diff = T(0.1) * (upp_j - xt);

    /* Move limits alpha <= x <= beta for the subproblem. */
    const T alpha_val = max(max(xmin_j, low_j + tenth_low_diff), xt - half_xgap);
    const T beta_val = min(min(xmax_j, upp_j - tenth_upp_diff), xt + half_xgap);

    alpha[tj] = alpha_val;
    beta[tj] = beta_val;

    const T upp_minus_x = upp_j - xt;
    const T x_minus_low = xt - low_j;
    const T upp_minus_x_sq = upp_minus_x * upp_minus_x;
    const T x_minus_low_sq = x_minus_low * x_minus_low;

    const T eps = T(1e-5);
    const T inv_xgap = T(1.0) / max(eps, xgap);

    /* Split the objective gradient into positive and negative parts. */
    const T max_df0_pos = max(df0, T(0));
    const T max_df0_neg = max(-df0, T(0));

    p0j[tj] = upp_minus_x_sq * (T(1.001) * max_df0_pos +
                                T(0.001) * max_df0_neg + eps * inv_xgap);
    q0j[tj] = x_minus_low_sq * (T(0.001) * max_df0_pos +
                                T(1.001) * max_df0_neg + eps * inv_xgap);

    /* Same construction for each of the m constraint gradients. */
    for (int i = 0; i < m; ++i) {
      const int idx = i + tj * m;

      const T dfdx_val = dfdx[idx];
      const T max_pos = max(dfdx_val, T(0));
      const T max_neg = max(-dfdx_val, T(0));

      pij[idx] = upp_minus_x_sq * (T(1.001) * max_pos +
                                   T(0.001) * max_neg + eps * inv_xgap);
      qij[idx] = x_minus_low_sq * (T(0.001) * max_pos +
                                   T(1.001) * max_neg + eps * inv_xgap);
    }
  }
}

template <typename T>
__global__ void mma_sub4_kernel(const T* __restrict__ x,
                                const T* __restrict__ low,
                                const T* __restrict__ upp,
                                const T* __restrict__ pij,
                                const T* __restrict__ qij,
                                T* __restrict__ temp,
                                const int n,
                                const int m) {
  int tj = blockIdx.x * blockDim.x + threadIdx.x;
  if (tj < n) {
    const T xt = x[tj];
    const T low_j = low[tj];
    const T upp_j = upp[tj];

    const T denom_upp = upp_j - xt;
    const T denom_low = xt - low_j;

    const T eps = T(1e-12);
    const T inv_denom_upp = T(1) / max(denom_upp, eps);
    const T inv_denom_low = T(1) / max(denom_low, eps);

    const int base_idx = tj * m;

    for (int i = 0; i < m; ++i) {
      const int idx = base_idx + i;
      temp[idx] = pij[idx] * inv_denom_upp + qij[idx] * inv_denom_low;
    }
  }
}

template <typename T>
__global__ void mma_max2_kernel(T* __restrict__ xsi,
                                const T* __restrict__ x,
                                const T* __restrict__ alpha,
                                const int n) {
  int tj = blockIdx.x * blockDim.x + threadIdx.x;
  if (tj < n) {
    const T eps = T(1e-12);
    const T denom = max(x[tj] - alpha[tj], eps);
    xsi[tj] = max(T(1), T(1) / denom);
  }
}

template <typename T>
__global__ void relambda_kernel(T* __restrict__ temp,
                                const T* __restrict__ x,
                                const T* __restrict__ xupp,
                                const T* __restrict__ xlow,
                                const T* __restrict__ pij,
                                const T* __restrict__ qij,
                                const int n,
                                const int m) {
  int tj = blockIdx.x * blockDim.x + threadIdx.x;
  if (tj < n) {
    const T xt = x[tj];
    const T xup = xupp[tj];
    const T xlo = xlow[tj];

    const T eps = T(1e-12);
    const T inv_denom_upp = T(1) / max(xup - xt, eps);
    const T inv_denom_low = T(1) / max(xt - xlo, eps);

    const int base_idx = tj * m;
    for (int i = 0; i < m; ++i) {
      const int idx = base_idx + i;
      temp[idx] = pij[idx] * inv_denom_upp + qij[idx] * inv_denom_low;
    }
  }
}

template <typename T>
__global__ void sub2cons2_kernel(T* __restrict__ a,
                                 const T* __restrict__ b,
                                 const T* __restrict__ c,
                                 const T* __restrict__ d,
                                 const T e,
                                 const int n) {
  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  if (idx >= n)
    return;
  a[idx] = b[idx] * (c[idx] - d[idx]) - e;
}

template <typename T>
__inline__ __device__ T max_reduce_warp(T val) {
  for (int offset = warpSize / 2; offset > 0; offset /= 2)
    val = max(val, __shfl_down(val, offset, warpSize));
  return val;
}

template <typename T>
__global__ void maxval_kernel(const T* __restrict__ a, T* temp, const int n) {
  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  const unsigned int lane = threadIdx.x % warpSize;
  const unsigned int wid = threadIdx.x / warpSize;

  __shared__ T shared[16];
  T maxval = T(0);
  for (int i = idx; i < n; i += str) {
    maxval = max(maxval, abs(a[i]));
  }

  /* Reduce within each warp, then across warps via shared memory. */
  maxval = max_reduce_warp<T>(maxval);
  if (lane == 0)
    shared[wid] = maxval;
  __syncthreads();

  maxval = (threadIdx.x < blockDim.x / warpSize) ? shared[lane] : T(0);
  maxval = max_reduce_warp<T>(maxval);

  if (threadIdx.x == 0)
    temp[blockIdx.x] = maxval;
}

template <typename T>
__global__ void max_reduce_kernel(T* __restrict__ bufred, const int n) {
  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int stride = blockDim.x * gridDim.x;

  T maxval = T(0);
  for (int i = idx; i < n; i += stride) {
    maxval = max(maxval, bufred[i]);
  }

  __shared__ T shared[16];
  unsigned int lane = threadIdx.x % warpSize;
  unsigned int wid = threadIdx.x / warpSize;

  maxval = max_reduce_warp<T>(maxval);
  if (lane == 0)
    shared[wid] = maxval;
  __syncthreads();

  maxval = (threadIdx.x < blockDim.x / warpSize) ? shared[lane] : T(0);
  maxval = max_reduce_warp<T>(maxval);

  if (threadIdx.x == 0) {
    bufred[blockIdx.x] = maxval;
  }
}

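/*
 * Usage sketch (illustration only, not part of this header): the max-abs value
 * of a length-n device array is reduced in two passes.  maxval_kernel leaves
 * one partial maximum per block in a scratch buffer, which max_reduce_kernel
 * then collapses in place, so the result ends up in scratch_d[0].  The names
 * scratch_d, nblcks, stream and the type alias "real" are assumptions.
 *
 *   hipLaunchKernelGGL(HIP_KERNEL_NAME(maxval_kernel<real>),
 *                      dim3(nblcks), dim3(1024), 0, stream, a_d, scratch_d, n);
 *   hipLaunchKernelGGL(HIP_KERNEL_NAME(max_reduce_kernel<real>),
 *                      dim3(1), dim3(1024), 0, stream, scratch_d, nblcks);
 */
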
template <typename T>
__global__ void delx_kernel(T* __restrict__ delx,
                            const T* __restrict__ x,
                            const T* __restrict__ xlow,
                            const T* __restrict__ xupp,
                            const T* __restrict__ pij,
                            const T* __restrict__ qij,
                            const T* __restrict__ p0j,
                            const T* __restrict__ q0j,
                            const T* __restrict__ alpha,
                            const T* __restrict__ beta,
                            const T* __restrict__ lambda,
                            const T epsi,
                            const int n,
                            const int m) {
  int tj = blockIdx.x * blockDim.x + threadIdx.x;
  if (tj < n) {
    const T xt = x[tj];
    const T xlow_j = xlow[tj];
    const T xupp_j = xupp[tj];
    const T alpha_j = alpha[tj];
    const T beta_j = beta[tj];

    T denom_low = xt - xlow_j;
    T denom_upp = xupp_j - xt;
    T denom_alpha = xt - alpha_j;
    T denom_beta = beta_j - xt;

    const T small_eps = T(1e-12);
    denom_low = (abs(denom_low) < small_eps) ? small_eps : denom_low;
    denom_upp = (abs(denom_upp) < small_eps) ? small_eps : denom_upp;
    denom_alpha = (abs(denom_alpha) < small_eps) ? small_eps : denom_alpha;
    denom_beta = (abs(denom_beta) < small_eps) ? small_eps : denom_beta;

    T sum = T(0);
    for (int i = 0; i < m; i++) {
      const T lambda_i = lambda[i];
      sum += pij[i + tj * m] * lambda_i / (denom_upp * denom_upp)
           - qij[i + tj * m] * lambda_i / (denom_low * denom_low);
    }
    sum += p0j[tj] / (denom_upp * denom_upp)
         - q0j[tj] / (denom_low * denom_low);

    /* Barrier terms for the box alpha <= x <= beta; epsi is assumed to be the
       interior-point perturbation parameter supplied by the caller. */
    delx[tj] = sum - epsi / denom_alpha + epsi / denom_beta;
  }
}

template <typename T>
__global__ void GG_kernel(T* __restrict__ GG,
                          const T* __restrict__ x,
                          const T* __restrict__ xlow,
                          const T* __restrict__ xupp,
                          const T* __restrict__ pij,
                          const T* __restrict__ qij,
                          const int n,
                          const int m) {
  int tj = blockIdx.x * blockDim.x + threadIdx.x;
  if (tj < n) {
    const T xt = x[tj];
    const T diff_upper = xupp[tj] - xt;
    const T diff_lower = xt - xlow[tj];

    const T diff_upper2 = diff_upper * diff_upper;
    const T diff_lower2 = diff_lower * diff_lower;

    /* Gradient of the constraint approximations with respect to x. */
    for (int i = 0; i < m; i++) {
      const int idx = i + tj * m;
      GG[idx] = pij[idx] / diff_upper2 - qij[idx] / diff_lower2;
    }
  }
}

template <typename T>
__global__ void diagx_kernel(T* __restrict__ diagx,
                             const T* __restrict__ x,
                             const T* __restrict__ xsi,
                             const T* __restrict__ xlow,
                             const T* __restrict__ xupp,
                             const T* __restrict__ p0j,
                             const T* __restrict__ q0j,
                             const T* __restrict__ pij,
                             const T* __restrict__ qij,
                             const T* alpha,
                             const T* beta,
                             const T* eta,
                             const T* lambda,
                             const int n,
                             const int m) {
  int tj = blockIdx.x * blockDim.x + threadIdx.x;
  if (tj < n) {
    T sum = T(0);
    T sum1 = T(0);
    for (int i = 0; i < m; i++) {
      sum = sum + pij[tj * m + i] * lambda[i];
      sum1 = sum1 + qij[tj * m + i] * lambda[i];
    }
    diagx[tj] = (p0j[tj] + sum) / pow(xupp[tj] - x[tj], 3) +
                (q0j[tj] + sum1) / pow(x[tj] - xlow[tj], 3);
    diagx[tj] = T(2.0) * diagx[tj] + xsi[tj] / (x[tj] - alpha[tj]) +
                eta[tj] / (beta[tj] - x[tj]);
  }
}

template <typename T>
__inline__ __device__ T reduce_warp(T val) {
  for (int offset = warpSize / 2; offset > 0; offset /= 2)
    val += __shfl_down(val, offset, warpSize);
  return val;
}

template <typename T>
__global__ void mmareduce_kernel(T* __restrict__ bufred, const int n) {
  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  T sum = T(0);
  for (int i = idx; i < n; i += str)
    sum += bufred[i];

  __shared__ T shared[16];
  unsigned int lane = threadIdx.x % warpSize;
  unsigned int wid = threadIdx.x / warpSize;

  sum = reduce_warp<T>(sum);
  if (lane == 0)
    shared[wid] = sum;
  __syncthreads();

  sum = (threadIdx.x < blockDim.x / warpSize) ? shared[lane] : T(0);
  sum = reduce_warp<T>(sum);

  if (threadIdx.x == 0)
    bufred[blockIdx.x] = sum;
}

template <typename T>
__global__ void mmasum_kernel(const T* __restrict__ a, T* __restrict__ buf_h,
                              const int n,
                              const int m,
                              const int k) {
  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  const unsigned int lane = threadIdx.x % warpSize;
  const unsigned int wid = threadIdx.x / warpSize;

  __shared__ T shared[16];
  T sum = T(0);
  /* Sum row k of the m x n array a (entry (k, i) is stored at a[m*i + k]). */
  for (int i = idx; i < n; i += str)
    sum += a[m * i + k];

  sum = reduce_warp<T>(sum);
  if (lane == 0)
    shared[wid] = sum;
  __syncthreads();

  sum = (threadIdx.x < blockDim.x / warpSize) ? shared[lane] : T(0);
  sum = reduce_warp<T>(sum);

  if (threadIdx.x == 0)
    buf_h[blockIdx.x] = sum;
}

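/*
 * Usage sketch (illustration only): the partial-sum kernels built on
 * reduce_warp follow the same two-pass pattern as the max reduction above.
 * For example, summing row k of the m x n array a_d:
 *
 *   hipLaunchKernelGGL(HIP_KERNEL_NAME(mmasum_kernel<real>),
 *                      dim3(nblcks), dim3(1024), 0, stream, a_d, scratch_d, n, m, k);
 *   hipLaunchKernelGGL(HIP_KERNEL_NAME(mmareduce_kernel<real>),
 *                      dim3(1), dim3(1024), 0, stream, scratch_d, nblcks);
 *
 * scratch_d, nblcks, stream and "real" are assumed names, not defined here;
 * the final sum ends up in scratch_d[0].
 */
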
template <typename T>
__global__ void mmasumbb_kernel(const T* __restrict__ GG,
                                const T* __restrict__ delx,
                                const T* __restrict__ diagx,
                                T* __restrict__ buf_h,
                                const int n,
                                const int m,
                                const int k) {
  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  const unsigned int lane = threadIdx.x % warpSize;
  const unsigned int wid = threadIdx.x / warpSize;

  __shared__ T shared[16];
  T sum = T(0);
  for (int i = idx; i < n; i += str)
    sum += GG[k + i * m] * delx[i] / diagx[i];

  sum = reduce_warp<T>(sum);
  if (lane == 0)
    shared[wid] = sum;
  __syncthreads();

  sum = (threadIdx.x < blockDim.x / warpSize) ? shared[lane] : T(0);
  sum = reduce_warp<T>(sum);

  if (threadIdx.x == 0)
    buf_h[blockIdx.x] = sum;
}

template <typename T>
__global__ void mmasumHess_kernel(const T* __restrict__ hijx,
                                  const T* __restrict__ Ljjxinv,
                                  T* __restrict__ buf_h,
                                  const int n,
                                  const int m,
                                  const int k0,
                                  const int k1) {
  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int stride = blockDim.x * gridDim.x;

  const unsigned int lane = threadIdx.x % warpSize;
  const unsigned int wid = threadIdx.x / warpSize;

  __shared__ T shared[16];
  T sum = T(0);
  /* Entry (k0, k1) of hijx * diag(Ljjxinv) * hijx^T, accumulated over i. */
  for (int i = idx; i < n; i += stride) {
    const T val0 = hijx[k0 + i * m];
    const T val1 = hijx[k1 + i * m];
    sum += val0 * Ljjxinv[i] * val1;
  }

  sum = reduce_warp<T>(sum);
  if (lane == 0)
    shared[wid] = sum;
  __syncthreads();

  sum = (threadIdx.x < blockDim.x / warpSize) ? shared[lane] : T(0);
  sum = reduce_warp<T>(sum);

  if (threadIdx.x == 0) {
    buf_h[blockIdx.x] = sum;
  }
}

template <typename T>
__global__ void mmasumAA_kernel(const T* __restrict__ GG,
                                const T* __restrict__ diagx,
                                T* __restrict__ buf_h,
                                const int n,
                                const int m,
                                const int k0,
                                const int k1) {
  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  const unsigned int lane = threadIdx.x % warpSize;
  const unsigned int wid = threadIdx.x / warpSize;

  __shared__ T shared[16];
  T sum = T(0);
  for (int i = idx; i < n; i += str)
    sum += GG[k0 + i * m] / diagx[i] * GG[k1 + i * m];

  sum = reduce_warp<T>(sum);
  if (lane == 0)
    shared[wid] = sum;
  __syncthreads();

  sum = (threadIdx.x < blockDim.x / warpSize) ? shared[lane] : T(0);
  sum = reduce_warp<T>(sum);

  if (threadIdx.x == 0)
    buf_h[blockIdx.x] = sum;
}

template <typename T>
__global__ void mma_copy_kernel(T* __restrict__ a,
                                const T* __restrict__ b,
                                const int n,
                                const int m) {
  int tj = blockIdx.x * blockDim.x + threadIdx.x;
  /* Copy n entries of b into a starting at offset m (assumed layout). */
  if (tj < n)
    a[tj + m] = b[tj];
}

template <typename T>
__global__ void AA_kernel(T* __restrict__ temp,
                          const T* __restrict__ GG,
                          const T* __restrict__ diagx,
                          const int n,
                          const int m) {
  int tj = blockIdx.x * blockDim.x + threadIdx.x;
  if (tj < n) {
    for (int i0 = 0; i0 < m; i0++) {
      for (int i1 = 0; i1 < m; i1++) {
        temp[tj + i0 * (n + 1) + i1 * (m + 1) * (n + 1)] =
            GG[i0 * n + tj] * (T(1.0) / diagx[tj]) * GG[i1 * n + tj];
      }
    }
  }
}

template <typename T>
__global__ void dx_kernel(T* __restrict__ dx,
                          const T* __restrict__ delx,
                          const T* __restrict__ diagx,
                          const T* __restrict__ GG,
                          const T* __restrict__ dlambda,
                          const int n,
                          const int m) {
  int tj = blockIdx.x * blockDim.x + threadIdx.x;
  if (tj < n) {
    dx[tj] = -delx[tj] / diagx[tj];
    for (int i = 0; i < m; i++) {
      dx[tj] = dx[tj] - GG[tj * m + i] * dlambda[i] / diagx[tj];
    }
  }
}

template <typename T>
__global__ void dxsi_kernel(T* __restrict__ dxsi,
                            const T* __restrict__ xsi,
                            const T* __restrict__ dx,
                            const T* __restrict__ x,
                            const T* __restrict__ alpha,
                            const T epsi,
                            const int n) {
  int tj = blockIdx.x * blockDim.x + threadIdx.x;
  if (tj < n)
    dxsi[tj] = -xsi[tj] + (epsi - dx[tj] * xsi[tj]) / (x[tj] - alpha[tj]);
}

template <typename T>
__global__ void deta_kernel(T* __restrict__ deta,
                            const T* __restrict__ eta,
                            const T* __restrict__ dx,
                            const T* __restrict__ x,
                            const T* __restrict__ beta,
                            const T epsi,
                            const int n) {
  int tj = blockIdx.x * blockDim.x + threadIdx.x;
  if (tj < n)
    deta[tj] = -eta[tj] + (epsi + dx[tj] * eta[tj]) / (beta[tj] - x[tj]);
}

template <typename T>
__global__ void RexCalculation_kernel(T* __restrict__ rex,
                                      const T* __restrict__ x,
                                      const T* __restrict__ xlow,
                                      const T* __restrict__ xupp,
                                      const T* __restrict__ pij,
                                      const T* __restrict__ p0j,
                                      const T* __restrict__ qij,
                                      const T* __restrict__ q0j,
                                      const T* __restrict__ lambda,
                                      const T* __restrict__ xsi,
                                      const T* __restrict__ eta,
                                      const int n,
                                      const int m) {
  int tj = blockIdx.x * blockDim.x + threadIdx.x;
  if (tj < n) {
    const T upp_diff = xupp[tj] - x[tj];
    const T low_diff = x[tj] - xlow[tj];
    const T upp_diff_sq = upp_diff * upp_diff;
    const T low_diff_sq = low_diff * low_diff;

    T sum = T(0);
    for (int i = 0; i < m; ++i) {
      sum += pij[i + tj * m] * lambda[i] / upp_diff_sq
           - qij[i + tj * m] * lambda[i] / low_diff_sq;
    }

    /* Residual in x: gradient of the approximation minus the bound multipliers. */
    rex[tj] = sum
            + p0j[tj] / upp_diff_sq
            - q0j[tj] / low_diff_sq
            - xsi[tj] + eta[tj];
  }
}

template <typename T>
__global__ void rey_calculation_kernel(T* __restrict__ rey,
                                       const T* __restrict__ c,
                                       const T* __restrict__ d,
                                       const T* __restrict__ y,
                                       const T* __restrict__ lambda,
                                       const T* __restrict__ mu,
                                       const int n) {
  int tj = blockIdx.x * blockDim.x + threadIdx.x;
  if (tj < n)
    rey[tj] = c[tj] + d[tj] * y[tj] - lambda[tj] - mu[tj];
}

template <typename T>
__global__ void norm_kernel(const T* __restrict__ a, T* __restrict__ buf_h,
                            const int n) {
  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  const unsigned int lane = threadIdx.x % warpSize;
  const unsigned int wid = threadIdx.x / warpSize;

  __shared__ T shared[16];
  T sum = T(0);
  /* Per-block partial sums of the squared 2-norm of a. */
  for (int i = idx; i < n; i += str)
    sum += a[i] * a[i];

  sum = reduce_warp<T>(sum);
  if (lane == 0)
    shared[wid] = sum;
  __syncthreads();

  sum = (threadIdx.x < blockDim.x / warpSize) ? shared[lane] : T(0);
  sum = reduce_warp<T>(sum);

  if (threadIdx.x == 0)
    buf_h[blockIdx.x] = sum;
}

template <typename T>
__global__ void sub2cons_kernel(T* __restrict__ a,
                                const T* __restrict__ b,
                                const T* __restrict__ c,
                                const T d,
                                const int n) {
  int tj = blockIdx.x * blockDim.x + threadIdx.x;
  if (tj < n)
    a[tj] = b[tj] * c[tj] - d;
}

template <typename T>
__global__ void dely_kernel(T* __restrict__ dely,
                            const T* __restrict__ c,
                            const T* __restrict__ d,
                            const T* __restrict__ y,
                            const T* __restrict__ lambda,
                            const T epsi,
                            const int n) {
  int tj = blockIdx.x * blockDim.x + threadIdx.x;
  if (tj < n)
    dely[tj] = c[tj] + d[tj] * y[tj] - lambda[tj] - epsi / y[tj];
}

template <typename T>
__global__ void maxval2_kernel(const T* __restrict__ a,
                               const T* __restrict__ b,
                               T* __restrict__ temp,
                               const T cons,
                               const int n) {
  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  const unsigned int lane = threadIdx.x % warpSize;
  const unsigned int wid = threadIdx.x / warpSize;

  __shared__ T shared[16];
  T maxval = cons * a[0] / b[0];
  for (int i = idx; i < n; i += str)
    maxval = max(maxval, cons * a[i] / b[i]);

  maxval = max_reduce_warp<T>(maxval);
  if (lane == 0)
    shared[wid] = maxval;
  __syncthreads();

  maxval = (threadIdx.x < blockDim.x / warpSize) ? shared[lane] : T(0);
  maxval = max_reduce_warp<T>(maxval);

  if (threadIdx.x == 0)
    temp[blockIdx.x] = maxval;
}

template <typename T>
__global__ void maxval3_kernel(const T* __restrict__ a,
                               const T* __restrict__ b,
                               const T* __restrict__ c,
                               T* __restrict__ temp,
                               const T cons,
                               const int n) {
  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  const unsigned int lane = threadIdx.x % warpSize;
  const unsigned int wid = threadIdx.x / warpSize;

  __shared__ T shared[16];
  /* Seed with the first candidate value so the result is always attainable. */
  T maxval = cons * a[0] / (b[0] - c[0]);
  for (int i = idx; i < n; i += str)
    maxval = max(maxval, cons * a[i] / (b[i] - c[i]));

  maxval = max_reduce_warp<T>(maxval);
  if (lane == 0)
    shared[wid] = maxval;
  __syncthreads();

  maxval = (threadIdx.x < blockDim.x / warpSize) ? shared[lane] : T(0);
  maxval = max_reduce_warp<T>(maxval);

  if (threadIdx.x == 0)
    temp[blockIdx.x] = maxval;
}

template <typename T>
__global__ void kkt_rex_kernel(T* __restrict__ rex,
                               const T* __restrict__ df0dx,
                               const T* __restrict__ dfdx,
                               const T* __restrict__ xsi,
                               const T* __restrict__ eta,
                               const T* __restrict__ lambda,
                               const int n,
                               const int m) {
  int tj = blockIdx.x * blockDim.x + threadIdx.x;
  if (tj < n) {
    rex[tj] = T(0); /* assumes the residual is rebuilt from scratch here */
    for (int i = 0; i < m; i++) {
      rex[tj] = rex[tj] + dfdx[i + tj * m] * lambda[i];
    }
    rex[tj] += df0dx[tj] - xsi[tj] + eta[tj];
  }
}

template <typename T>
__global__ void maxcons_kernel(T* __restrict__ a,
                               const T b,
                               const T c,
                               const T* __restrict__ d,
                               const int n) {
  int tj = blockIdx.x * blockDim.x + threadIdx.x;
  if (tj < n)
    a[tj] = max(b, c * d[tj]);
}

template <typename T>
__global__ void glsum_kernel(const T* a, T* buf_h, const int n) {
  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  const unsigned int lane = threadIdx.x % warpSize;
  const unsigned int wid = threadIdx.x / warpSize;

  __shared__ T shared[16];
  T sum = T(0);
  for (int i = idx; i < n; i += str)
    sum += a[i];

  sum = reduce_warp<T>(sum);
  if (lane == 0)
    shared[wid] = sum;
  __syncthreads();

  sum = (threadIdx.x < blockDim.x / warpSize) ? shared[lane] : T(0);
  sum = reduce_warp<T>(sum);

  if (threadIdx.x == 0)
    buf_h[blockIdx.x] = sum;
}

template <typename T>
__global__ void glsc2_kernel(const T* a,
                             const T* b,
                             T* buf_h,
                             const int n) {
  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  const unsigned int lane = threadIdx.x % warpSize;
  const unsigned int wid = threadIdx.x / warpSize;

  __shared__ T shared[16];
  T sum = T(0);
  /* Per-block partial sums of the dot product of a and b. */
  for (int i = idx; i < n; i += str) {
    sum += a[i] * b[i];
  }

  sum = reduce_warp<T>(sum);
  if (lane == 0)
    shared[wid] = sum;
  __syncthreads();

  sum = (threadIdx.x < blockDim.x / warpSize) ? shared[lane] : T(0);
  sum = reduce_warp<T>(sum);

  if (threadIdx.x == 0)
    buf_h[blockIdx.x] = sum;
}

template <typename T>
__global__ void add2inv2_kernel(T* __restrict__ a,
                                const T* __restrict__ b,
                                const T c,
                                const int n) {
  int tj = blockIdx.x * blockDim.x + threadIdx.x;
  if (tj < n)
    a[tj] = a[tj] + c / b[tj];
}

template <typename T>
__global__ void max2_kernel(T* __restrict__ a,
                            const T b,
                            const T* __restrict__ c,
                            const T d,
                            const int n) {
  int tj = blockIdx.x * blockDim.x + threadIdx.x;
  if (tj < n)
    a[tj] = max(b, d * c[tj]);
}

template <typename T>
__global__ void updatebb_kernel(T* __restrict__ bb,
                                const T* __restrict__ dellambda,
                                const T* __restrict__ dely,
                                const T* __restrict__ d,
                                const T* __restrict__ mu,
                                const T* __restrict__ y,
                                const T delz,
                                const int m) {
  int tj = blockIdx.x * blockDim.x + threadIdx.x;
  if (tj < m)
    bb[tj] = dellambda[tj] + dely[tj] / (d[tj] + mu[tj] / y[tj]) - bb[tj];
  else if (tj == m)
    bb[tj] = delz; /* assumes bb has length m + 1, with delz as its last entry */
}

template <typename T>
__global__ void updateAA_kernel(T* __restrict__ AA,
                                const T* __restrict__ globaltmp_mm,
                                const T* __restrict__ s,
                                const T* __restrict__ lambda,
                                const T* __restrict__ d,
                                const T* __restrict__ mu,
                                const T* __restrict__ y,
                                const T* __restrict__ a,
                                const T zeta,
                                const T z,
                                const int m) {
  int tj = blockIdx.x * blockDim.x + threadIdx.x;
  if (tj < m) {
    /* Diagonal block and the last row/column of the (m+1) x (m+1) system. */
    AA[tj + tj * (m + 1)] = globaltmp_mm[tj + tj * m] + (s[tj] / lambda[tj] +
        T(1.0) / (d[tj] + mu[tj] / y[tj]));
    AA[tj + m * (m + 1)] = a[tj];
    AA[m + tj * (m + 1)] = a[tj];
  }
  else if (tj == m) {
    /* Bottom-right corner entry. */
    AA[tj + tj * (m + 1)] = -zeta / z;
  }
}

template <typename T>
__global__ void dy_kernel(T* __restrict__ dy,
                          const T* __restrict__ dely,
                          const T* __restrict__ dlambda,
                          const T* __restrict__ d,
                          const T* __restrict__ mu,
                          const T* __restrict__ y,
                          const int n) {
  int tj = blockIdx.x * blockDim.x + threadIdx.x;
  if (tj < n)
    dy[tj] = (-dely[tj] + dlambda[tj]) / (d[tj] + mu[tj] / y[tj]);
}