#ifndef MMA_CUDA_KERNEL_H
#define MMA_CUDA_KERNEL_H
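/**
 * Device kernels for the MMA (Method of Moving Asymptotes) optimizer:
 * asymptote updates, subproblem coefficient assembly, interior-point
 * residuals and search directions, and warp/block reductions.
 */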
template< typename T >
__global__ void delta_1dbeam_kernel(T* __restrict__ Delta,
                                    const T L_total,
                                    const T Le,
                                    const int offset,
                                    const int n) {
  int k = blockIdx.x * blockDim.x + threadIdx.x;
  if (k < n) {
    // Difference of cubes of the element end coordinates, divided by 3.
    const T x1 = L_total - Le * static_cast<T>(offset + k - 1);
    const T term1 = x1 * x1 * x1;
    const T x2 = L_total - Le * static_cast<T>(offset + k);
    const T term2 = x2 * x2 * x2;
    Delta[k] = (term1 - term2) / static_cast<T>(3.0);
  }
}
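/**
 * Diagonal of the inverse of the approximate Hessian of the Lagrangian,
 * set to zero for variables sitting on the alpha/beta bounds.
 */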
template< typename T >
__global__ void mma_Ljjxinv_kernel(T* __restrict__ Ljjxinv,
                                   const T* __restrict__ pjlambda,
                                   const T* __restrict__ qjlambda,
                                   const T* __restrict__ x,
                                   const T* __restrict__ low,
                                   const T* __restrict__ upp,
                                   const T* __restrict__ alpha,
                                   const T* __restrict__ beta,
                                   const int n) {
  int tj = blockIdx.x * blockDim.x + threadIdx.x;
  if (tj < n) {
    const T xt = x[tj];
    const T low_j = low[tj];
    const T upp_j = upp[tj];
    const T pj = pjlambda[tj];
    const T qj = qjlambda[tj];

    const T diff_u = upp_j - xt;
    const T diff_l = xt - low_j;
    const T diff_u3 = diff_u * diff_u * diff_u;
    const T diff_l3 = diff_l * diff_l * diff_l;

    const T denom = 2.0 * pj / diff_u3 + 2.0 * qj / diff_l3;
    const T val = T(-1.0) / denom;

    // Variables on (or numerically at) the move limits are treated as active
    // and excluded from the Hessian approximation.
    const bool active = (fabs(xt - alpha[tj]) <= T(1e-16)) ||
                        (fabs(xt - beta[tj]) <= T(1e-16));

    Ljjxinv[tj] = active ? T(0.0) : val;
  }
}
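/**
 * Closed-form minimizer of the separable dual subproblem for each variable,
 * clamped to the box [alpha, beta].
 */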
template< typename T >
__global__ void mma_dipsolvesub1_kernel(T* __restrict__ x,
                                        const T* __restrict__ pjlambda,
                                        const T* __restrict__ qjlambda,
                                        const T* __restrict__ low,
                                        const T* __restrict__ upp,
                                        const T* __restrict__ alpha,
                                        const T* __restrict__ beta,
                                        const int n) {
  int tj = blockIdx.x * blockDim.x + threadIdx.x;
  if (tj < n) {
    const T pj_sqrt = sqrt(pjlambda[tj]);
    const T qj_sqrt = sqrt(qjlambda[tj]);
    const T denom = pj_sqrt + qj_sqrt;

    const T low_j = low[tj];
    const T upp_j = upp[tj];
    const T alpha_j = alpha[tj];
    const T beta_j = beta[tj];

    // Stationary point of p/(upp - x) + q/(x - low), projected onto [alpha, beta].
    const T val = (pj_sqrt * low_j + qj_sqrt * upp_j) / denom;
    x[tj] = fmin(fmax(val, alpha_j), beta_j);
  }
}
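/** Matrix-transpose-vector product: output[j] = sum_i pij[i + j*m] * lambda[i]. */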
template< typename T >
__global__ void mattrans_v_mul_kernel(T* __restrict__ output,
                                      const T* __restrict__ pij,
                                      const T* __restrict__ lambda,
                                      const int m,
                                      const int n) {
  int tj = blockIdx.x * blockDim.x + threadIdx.x;
  if (tj < n) {
    T acc = T(0);
    for (int i = 0; i < m; ++i) {
      acc += pij[i + tj * m] * lambda[i];
    }
    output[tj] = acc;
  }
}
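/** Initial asymptotes: low/upp = x -/+ asyinit * (xmax - xmin). */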
template< typename T >
__global__ void mma_sub1_kernel(T* __restrict__ xlow,
                                T* __restrict__ xupp,
                                const T* __restrict__ x,
                                const T* __restrict__ xmin,
                                const T* __restrict__ xmax,
                                const T asyinit,
                                const int n) {
  int tj = blockIdx.x * blockDim.x + threadIdx.x;
  if (tj < n) {
    const T xt = x[tj];
    const T xmin_j = xmin[tj];
    const T xmax_j = xmax[tj];

    const T xgap = xmax_j - xmin_j;
    const T offset = asyinit * xgap;

    xlow[tj] = xt - offset;
    xupp[tj] = xt + offset;
  }
}
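/**
 * Asymptote update: contract (asydecr) when the iterates oscillate, expand
 * (asyincr) when they move monotonically, then clamp relative to xdiff.
 */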
template< typename T >
__global__ void mma_sub2_kernel(T* __restrict__ low, T* __restrict__ upp,
                                const T* __restrict__ x,
                                const T* __restrict__ xold1,
                                const T* __restrict__ xold2,
                                const T* __restrict__ xdiff,
                                const T asydecr,
                                const T asyincr,
                                const int n) {
  int tj = blockIdx.x * blockDim.x + threadIdx.x;
  if (tj < n) {
    const T xval = x[tj];
    const T xold1val = xold1[tj];
    const T xold2val = xold2[tj];
    const T lowval = low[tj];
    const T uppval = upp[tj];
    const T xdiffval = xdiff[tj];

    // The sign of the product of the last two steps decides whether the
    // asymptotes are contracted or expanded.
    const T prod = (xval - xold1val) * (xold1val - xold2val);
    const T asy_factor = (prod < T(0)) ? asydecr :
                         (prod > T(0)) ? asyincr : T(1);

    T new_low = fma(-asy_factor, (xold1val - lowval), xval);
    T new_upp = fma(asy_factor, (uppval - xold1val), xval);

    // Keep the asymptotes within [0.01, 10] * xdiff of the current point.
    new_low = max(new_low, xval - T(10.0) * xdiffval);
    new_low = min(new_low, xval - T(0.01) * xdiffval);
    new_upp = min(new_upp, xval + T(10.0) * xdiffval);
    new_upp = max(new_upp, xval + T(0.01) * xdiffval);

    low[tj] = new_low;
    upp[tj] = new_upp;
  }
}
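/**
 * Computes the variable bounds alpha/beta and the MMA approximation
 * coefficients p0j, q0j (objective) and pij, qij (constraints).
 */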
template< typename T >
__global__ void mma_sub3_kernel(const T* __restrict__ x,
                                const T* __restrict__ df0dx,
                                const T* __restrict__ dfdx,
                                T* __restrict__ low, T* __restrict__ upp,
                                const T* __restrict__ xmin,
                                const T* __restrict__ xmax,
                                T* __restrict__ alpha, T* __restrict__ beta,
                                T* __restrict__ p0j, T* __restrict__ q0j,
                                T* __restrict__ pij, T* __restrict__ qij,
                                const int n,
                                const int m) {
  int tj = blockIdx.x * blockDim.x + threadIdx.x;
  if (tj < n) {
    const T xt = x[tj];
    const T xmin_j = xmin[tj];
    const T xmax_j = xmax[tj];
    const T low_j = low[tj];
    const T upp_j = upp[tj];
    const T df0 = df0dx[tj];
    const T xgap = xmax_j - xmin_j;

    const T half_xgap = 0.5 * xgap;
    const T tenth_low_diff = T(0.1) * (xt - low_j);
    const T tenth_upp_diff = T(0.1) * (upp_j - xt);

    // Move limits alpha/beta.
    const T alpha_val = max(max(xmin_j, low_j + tenth_low_diff), xt - half_xgap);
    const T beta_val = min(min(xmax_j, upp_j - tenth_upp_diff), xt + half_xgap);
    alpha[tj] = alpha_val;
    beta[tj] = beta_val;

    const T upp_minus_x = upp_j - xt;
    const T x_minus_low = xt - low_j;
    const T upp_minus_x_sq = upp_minus_x * upp_minus_x;
    const T x_minus_low_sq = x_minus_low * x_minus_low;

    const T eps = T(1e-5);
    const T inv_xgap = T(1.0) / max(eps, xgap);

    // Split the objective gradient into positive and negative parts.
    const T max_df0_pos = max(df0, T(0));
    const T max_df0_neg = max(-df0, T(0));

    p0j[tj] = upp_minus_x_sq * (T(1.001) * max_df0_pos +
                                T(0.001) * max_df0_neg + eps * inv_xgap);
    q0j[tj] = x_minus_low_sq * (T(0.001) * max_df0_pos +
                                T(1.001) * max_df0_neg + eps * inv_xgap);

    for (int i = 0; i < m; ++i) {
      const int idx = i + tj * m;
      const T dfdx_val = dfdx[idx];
      const T max_pos = max(dfdx_val, T(0));
      const T max_neg = max(-dfdx_val, T(0));

      pij[idx] = upp_minus_x_sq * (T(1.001) * max_pos +
                                   T(0.001) * max_neg + eps * inv_xgap);
      qij[idx] = x_minus_low_sq * (T(0.001) * max_pos +
                                   T(1.001) * max_neg + eps * inv_xgap);
    }
  }
}
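/**
 * Per-variable constraint terms pij/(upp - x) + qij/(x - low), stored in
 * temp for a subsequent sum over the variables.
 */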
template< typename T >
__global__ void mma_sub4_kernel(const T* __restrict__ x,
                                const T* __restrict__ low,
                                const T* __restrict__ upp,
                                const T* __restrict__ pij,
                                const T* __restrict__ qij,
                                T* __restrict__ temp,
                                const int n,
                                const int m) {
  int tj = blockIdx.x * blockDim.x + threadIdx.x;
  if (tj < n) {
    const T xt = x[tj];
    const T low_j = low[tj];
    const T upp_j = upp[tj];

    const T denom_upp = upp_j - xt;
    const T denom_low = xt - low_j;

    const T eps = T(1e-12);
    const T inv_denom_upp = T(1) / max(denom_upp, eps);
    const T inv_denom_low = T(1) / max(denom_low, eps);

    const int base_idx = tj * m;
    for (int i = 0; i < m; ++i) {
      const int idx = base_idx + i;
      temp[idx] = pij[idx] * inv_denom_upp + qij[idx] * inv_denom_low;
    }
  }
}
template< typename T >
__global__ void mma_max2_kernel(T* __restrict__ xsi,
                                const T* __restrict__ x,
                                const T* __restrict__ alpha,
                                const int n) {
  int tj = blockIdx.x * blockDim.x + threadIdx.x;
  if (tj < n) {
    const T eps = T(1e-12);
    const T denom = max(x[tj] - alpha[tj], eps);
    xsi[tj] = max(T(1), T(1) / denom);
  }
}
template< typename T >
__global__ void relambda_kernel(T* __restrict__ temp,
                                const T* __restrict__ x,
                                const T* __restrict__ xupp,
                                const T* __restrict__ xlow,
                                const T* __restrict__ pij,
                                const T* __restrict__ qij,
                                const int n,
                                const int m) {
  int tj = blockIdx.x * blockDim.x + threadIdx.x;
  if (tj < n) {
    const T xt = x[tj];
    const T xup = xupp[tj];
    const T xlo = xlow[tj];

    const T eps = T(1e-12);
    const T inv_denom_upp = T(1) / max(xup - xt, eps);
    const T inv_denom_low = T(1) / max(xt - xlo, eps);

    const int base_idx = tj * m;
    for (int i = 0; i < m; ++i) {
      const int idx = base_idx + i;
      temp[idx] = pij[idx] * inv_denom_upp + qij[idx] * inv_denom_low;
    }
  }
}
template< typename T >
__global__ void sub2cons2_kernel(T* __restrict__ a,
                                 const T* __restrict__ b,
                                 const T* __restrict__ c,
                                 const T* __restrict__ d,
                                 const T e,
                                 const int n) {
  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  if (idx >= n)
    return;

  const T bt = b[idx];
  const T ct = c[idx];
  const T dt = d[idx];
  a[idx] = bt * (ct - dt) - e;
}
template< typename T >
__inline__ __device__ T max_reduce_warp(T val) {
  val = max(val, __shfl_down_sync(0xffffffff, val, 16));
  val = max(val, __shfl_down_sync(0xffffffff, val, 8));
  val = max(val, __shfl_down_sync(0xffffffff, val, 4));
  val = max(val, __shfl_down_sync(0xffffffff, val, 2));
  val = max(val, __shfl_down_sync(0xffffffff, val, 1));
  return val;
}
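/** Grid-strided max of |a[i]|; one partial maximum per block is written to temp. */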
template< typename T >
__global__ void maxval_kernel(const T* __restrict__ a, T* temp,
                              const int n) {
  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int stride = blockDim.x * gridDim.x;

  const unsigned int lane = threadIdx.x % warpSize;
  const unsigned int warp_id = threadIdx.x / warpSize;

  __shared__ T shared[32];

  T local_max = T(0);
  for (int i = idx; i < n; i += stride) {
    local_max = max(local_max, abs(a[i]));
  }

  // Reduce within each warp, then across warps via shared memory.
  local_max = max_reduce_warp<T>(local_max);
  if (lane == 0)
    shared[warp_id] = local_max;
  __syncthreads();

  local_max = (threadIdx.x < blockDim.x / warpSize) ? shared[lane] : T(0);
  if (warp_id == 0)
    local_max = max_reduce_warp<T>(local_max);

  if (threadIdx.x == 0) {
    temp[blockIdx.x] = local_max;
  }
}
template< typename T >
__global__ void max_reduce_kernel(T* __restrict__ bufred,
                                  const int n) {
  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int stride = blockDim.x * gridDim.x;

  T maxval = T(0);
  for (int i = idx; i < n; i += stride) {
    maxval = max(maxval, bufred[i]);
  }

  __shared__ T shared[32];
  const unsigned int lane = threadIdx.x % warpSize;
  const unsigned int wid = threadIdx.x / warpSize;

  maxval = max_reduce_warp<T>(maxval);
  if (lane == 0)
    shared[wid] = maxval;
  __syncthreads();

  maxval = (threadIdx.x < blockDim.x / warpSize) ? shared[lane] : T(0);
  if (wid == 0)
    maxval = max_reduce_warp<T>(maxval);

  if (threadIdx.x == 0) {
    bufred[blockIdx.x] = maxval;
  }
}
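/**
 * Right-hand side for the Newton system in x: gradient of the MMA
 * approximation with barrier contributions from the alpha/beta bounds.
 */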
template< typename T >
__global__ void delx_kernel(T* __restrict__ delx,
                            const T* __restrict__ x,
                            const T* __restrict__ xlow,
                            const T* __restrict__ xupp,
                            const T* __restrict__ pij,
                            const T* __restrict__ qij,
                            const T* __restrict__ p0j,
                            const T* __restrict__ q0j,
                            const T* __restrict__ alpha,
                            const T* __restrict__ beta,
                            const T* __restrict__ lambda,
                            const T epsi,
                            const int n,
                            const int m) {
  int tj = blockIdx.x * blockDim.x + threadIdx.x;
  if (tj < n) {
    const T xt = x[tj];
    const T xlow_j = xlow[tj];
    const T xupp_j = xupp[tj];
    const T alpha_j = alpha[tj];
    const T beta_j = beta[tj];

    T denom_low = xt - xlow_j;
    T denom_upp = xupp_j - xt;
    T denom_alpha = xt - alpha_j;
    T denom_beta = beta_j - xt;

    // Guard against division by zero near the asymptotes and bounds.
    const T small_eps = T(1e-12);
    denom_low = (abs(denom_low) < small_eps) ? small_eps : denom_low;
    denom_upp = (abs(denom_upp) < small_eps) ? small_eps : denom_upp;
    denom_alpha = (abs(denom_alpha) < small_eps) ? small_eps : denom_alpha;
    denom_beta = (abs(denom_beta) < small_eps) ? small_eps : denom_beta;

    T sum = T(0);
    for (int i = 0; i < m; i++) {
      const T lambda_i = lambda[i];
      sum += pij[i + tj * m] * lambda_i / (denom_upp * denom_upp)
           - qij[i + tj * m] * lambda_i / (denom_low * denom_low);
    }
    sum += p0j[tj] / (denom_upp * denom_upp)
         - q0j[tj] / (denom_low * denom_low)
         - epsi / denom_alpha + epsi / denom_beta;

    delx[tj] = sum;
  }
}
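/** Jacobian of the approximated constraints: GG = pij/(upp - x)^2 - qij/(x - low)^2. */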
template< typename T >
__global__ void GG_kernel(T* __restrict__ GG,
                          const T* __restrict__ x,
                          const T* __restrict__ xlow,
                          const T* __restrict__ xupp,
                          const T* __restrict__ pij,
                          const T* __restrict__ qij,
                          const int n,
                          const int m) {
  int tj = blockIdx.x * blockDim.x + threadIdx.x;
  if (tj < n) {
    const T xt = x[tj];
    const T xlow_j = xlow[tj];
    const T xupp_j = xupp[tj];

    const T diff_upper = xupp_j - xt;
    const T diff_lower = xt - xlow_j;

    const T diff_upper2 = diff_upper * diff_upper;
    const T diff_lower2 = diff_lower * diff_lower;

    for (int i = 0; i < m; i++) {
      const int idx = i + tj * m;
      GG[idx] = pij[idx] / diff_upper2 - qij[idx] / diff_lower2;
    }
  }
}
template< typename T >
__global__ void diagx_kernel(T* __restrict__ diagx,
                             const T* __restrict__ x,
                             const T* __restrict__ xsi,
                             const T* __restrict__ xlow,
                             const T* __restrict__ xupp,
                             const T* __restrict__ p0j,
                             const T* __restrict__ q0j,
                             const T* __restrict__ pij,
                             const T* __restrict__ qij,
                             const T* alpha,
                             const T* beta,
                             const T* eta,
                             const T* lambda,
                             const int n,
                             const int m) {
  int tj = blockIdx.x * blockDim.x + threadIdx.x;
  if (tj < n) {
    T sum = T(0);
    T sum1 = T(0);
    for (int i = 0; i < m; i++) {
      sum = sum + pij[tj * m + i] * lambda[i];
      sum1 = sum1 + qij[tj * m + i] * lambda[i];
    }
    diagx[tj] = (p0j[tj] + sum) / pow(xupp[tj] - x[tj], 3) +
                (q0j[tj] + sum1) / pow(x[tj] - xlow[tj], 3);
    diagx[tj] = 2.0 * diagx[tj] + xsi[tj] / (x[tj] - alpha[tj]) +
                eta[tj] / (beta[tj] - x[tj]);
  }
}
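/** Warp-level sum reduction via shuffle-down; used by the sum kernels below. */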
template< typename T >
__inline__ __device__ T reduce_warp(T val) {
  val += __shfl_down_sync(0xffffffff, val, 16);
  val += __shfl_down_sync(0xffffffff, val, 8);
  val += __shfl_down_sync(0xffffffff, val, 4);
  val += __shfl_down_sync(0xffffffff, val, 2);
  val += __shfl_down_sync(0xffffffff, val, 1);
  return val;
}
template< typename T >
__global__ void mmareduce_kernel(T* __restrict__ bufred,
                                 const int n) {
  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  T sum = T(0);
  for (int i = idx; i < n; i += str)
    sum += bufred[i];

  __shared__ T shared[32];
  const unsigned int lane = threadIdx.x % warpSize;
  const unsigned int wid = threadIdx.x / warpSize;

  sum = reduce_warp<T>(sum);
  if (lane == 0)
    shared[wid] = sum;
  __syncthreads();

  sum = (threadIdx.x < blockDim.x / warpSize) ? shared[lane] : 0;
  if (wid == 0)
    sum = reduce_warp<T>(sum);

  if (threadIdx.x == 0)
    bufred[blockIdx.x] = sum;
}
template< typename T >
__global__ void mmasum_kernel(const T* __restrict__ a, T* __restrict__ buf_h,
                              const int n,
                              const int m,
                              const int k) {
  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  const unsigned int lane = threadIdx.x % warpSize;
  const unsigned int wid = threadIdx.x / warpSize;

  __shared__ T shared[32];

  // Sum column k of the n-by-m row-major array a.
  T sum = T(0);
  for (int i = idx; i < n; i += str)
    sum += a[m * i + k];

  sum = reduce_warp<T>(sum);
  if (lane == 0)
    shared[wid] = sum;
  __syncthreads();

  sum = (threadIdx.x < blockDim.x / warpSize) ? shared[lane] : 0;
  if (wid == 0)
    sum = reduce_warp<T>(sum);

  if (threadIdx.x == 0)
    buf_h[blockIdx.x] = sum;
}
template< typename T >
__global__ void mmasumbb_kernel(const T* __restrict__ GG,
                                const T* __restrict__ delx,
                                const T* __restrict__ diagx,
                                T* __restrict__ buf_h,
                                const int n,
                                const int m,
                                const int k) {
  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  const unsigned int lane = threadIdx.x % warpSize;
  const unsigned int wid = threadIdx.x / warpSize;

  __shared__ T shared[32];

  T sum = T(0);
  for (int i = idx; i < n; i += str)
    sum += GG[k + i * m] * delx[i] / diagx[i];

  sum = reduce_warp<T>(sum);
  if (lane == 0)
    shared[wid] = sum;
  __syncthreads();

  sum = (threadIdx.x < blockDim.x / warpSize) ? shared[lane] : 0;
  if (wid == 0)
    sum = reduce_warp<T>(sum);

  if (threadIdx.x == 0)
    buf_h[blockIdx.x] = sum;
}
template< typename T >
__global__ void mmasumHess_kernel(const T* __restrict__ hijx,
                                  const T* __restrict__ Ljjxinv,
                                  T* __restrict__ buf_h,
                                  const int n,
                                  const int m,
                                  const int k0,
                                  const int k1) {
  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int stride = blockDim.x * gridDim.x;

  const unsigned int lane = threadIdx.x % warpSize;
  const unsigned int wid = threadIdx.x / warpSize;

  __shared__ T shared[32];

  // Entry (k0, k1) of hijx * diag(Ljjxinv) * hijx^T.
  T sum = T(0);
  for (int i = idx; i < n; i += stride) {
    const T val0 = hijx[k0 + i * m];
    const T val1 = hijx[k1 + i * m];
    sum += val0 * Ljjxinv[i] * val1;
  }

  sum = reduce_warp<T>(sum);
  if (lane == 0)
    shared[wid] = sum;
  __syncthreads();

  sum = (threadIdx.x < blockDim.x / warpSize) ? shared[lane] : T(0);
  if (wid == 0)
    sum = reduce_warp<T>(sum);

  if (threadIdx.x == 0) {
    buf_h[blockIdx.x] = sum;
  }
}
template< typename T >
__global__ void mmasumAA_kernel(const T* __restrict__ GG,
                                const T* __restrict__ diagx, T* __restrict__ buf_h,
                                const int n,
                                const int m,
                                const int k0,
                                const int k1) {
  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  const unsigned int lane = threadIdx.x % warpSize;
  const unsigned int wid = threadIdx.x / warpSize;

  __shared__ T shared[32];

  T sum = T(0);
  for (int i = idx; i < n; i += str)
    sum += GG[k0 + i * m] / diagx[i] * GG[k1 + i * m];

  sum = reduce_warp<T>(sum);
  if (lane == 0)
    shared[wid] = sum;
  __syncthreads();

  sum = (threadIdx.x < blockDim.x / warpSize) ? shared[lane] : 0;
  if (wid == 0)
    sum = reduce_warp<T>(sum);

  if (threadIdx.x == 0)
    buf_h[blockIdx.x] = sum;
}
template< typename T >
__global__ void mma_copy_kernel(T* __restrict__ a,
                                const T* __restrict__ b,
                                const int n,
                                const int m) {
  int tj = blockIdx.x * blockDim.x + threadIdx.x;
  // Copy the n entries of b into a, starting at offset m.
  if (tj < n)
    a[tj + m] = b[tj];
}
template< typename T >
__global__ void AA_kernel(T* __restrict__ temp,
                          const T* __restrict__ GG,
                          const T* __restrict__ diagx,
                          const int n,
                          const int m) {
  int tj = blockIdx.x * blockDim.x + threadIdx.x;
  if (tj < n) {
    // Per-variable contributions to GG * diag(1/diagx) * GG^T, stored with
    // leading dimension n + 1.
    for (int i0 = 0; i0 < m; i0++) {
      for (int i1 = 0; i1 < m; i1++) {
        temp[tj + i0 * (n + 1) + i1 * (m + 1) * (n + 1)] = GG[i0 * n + tj] *
            (1.0 / diagx[tj]) * GG[i1 * n + tj];
      }
    }
  }
}
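/** Newton step in x: dx = -(delx + GG^T * dlambda) / diagx. */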
template< typename T >
__global__ void dx_kernel(T* __restrict__ dx,
                          const T* __restrict__ delx,
                          const T* __restrict__ diagx,
                          const T* __restrict__ GG,
                          const T* __restrict__ dlambda,
                          const int n,
                          const int m) {
  int tj = blockIdx.x * blockDim.x + threadIdx.x;
  if (tj < n) {
    dx[tj] = -delx[tj] / diagx[tj];
    for (int i = 0; i < m; i++) {
      dx[tj] = dx[tj] - GG[tj * m + i] * dlambda[i] / diagx[tj];
    }
  }
}
template< typename T >
__global__ void dxsi_kernel(T* __restrict__ dxsi,
                            const T* __restrict__ xsi,
                            const T* __restrict__ dx,
                            const T* __restrict__ x,
                            const T* __restrict__ alpha,
                            const T epsi,
                            const int n) {
  int tj = blockIdx.x * blockDim.x + threadIdx.x;
  if (tj < n) {
    dxsi[tj] = -xsi[tj] + (epsi - dx[tj] * xsi[tj]) / (x[tj] - alpha[tj]);
  }
}
template< typename T >
__global__ void deta_kernel(T* __restrict__ deta,
                            const T* __restrict__ eta,
                            const T* __restrict__ dx,
                            const T* __restrict__ x,
                            const T* __restrict__ beta,
                            const T epsi,
                            const int n) {
  int tj = blockIdx.x * blockDim.x + threadIdx.x;
  if (tj < n) {
    deta[tj] = -eta[tj] + (epsi + dx[tj] * eta[tj]) / (beta[tj] - x[tj]);
  }
}
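/**
 * KKT residual in x for the subproblem: gradient of the MMA approximation
 * minus xsi plus eta.
 */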
template< typename T >
__global__ void RexCalculation_kernel(T* __restrict__ rex,
                                      const T* __restrict__ x,
                                      const T* __restrict__ xlow,
                                      const T* __restrict__ xupp,
                                      const T* __restrict__ pij,
                                      const T* __restrict__ p0j,
                                      const T* __restrict__ qij,
                                      const T* __restrict__ q0j,
                                      const T* __restrict__ lambda,
                                      const T* __restrict__ xsi,
                                      const T* __restrict__ eta,
                                      const int n,
                                      const int m) {
  int tj = blockIdx.x * blockDim.x + threadIdx.x;
  if (tj < n) {
    const T upp_diff = xupp[tj] - x[tj];
    const T low_diff = x[tj] - xlow[tj];
    const T upp_diff_sq = upp_diff * upp_diff;
    const T low_diff_sq = low_diff * low_diff;

    T sum = T(0);
    for (int i = 0; i < m; ++i) {
      sum += pij[i + tj * m] * lambda[i] / upp_diff_sq
           - qij[i + tj * m] * lambda[i] / low_diff_sq;
    }
    rex[tj] = sum
            + p0j[tj] / upp_diff_sq
            - q0j[tj] / low_diff_sq
            - xsi[tj] + eta[tj];
  }
}
template< typename T >
__global__ void rey_calculation_kernel(T* __restrict__ rey,
                                       const T* __restrict__ c,
                                       const T* __restrict__ d,
                                       const T* __restrict__ y,
                                       const T* __restrict__ lambda,
                                       const T* __restrict__ mu,
                                       const int n) {
  int tj = blockIdx.x * blockDim.x + threadIdx.x;
  if (tj < n) {
    rey[tj] = c[tj] + d[tj] * y[tj] - lambda[tj] - mu[tj];
  }
}
template< typename T >
__global__ void norm_kernel(const T* __restrict__ a,
                            T* __restrict__ buf_h,
                            const int n) {
  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  const unsigned int lane = threadIdx.x % warpSize;
  const unsigned int wid = threadIdx.x / warpSize;

  __shared__ T shared[32];

  // Partial sum of squares for the squared Euclidean norm.
  T sum = T(0);
  for (int i = idx; i < n; i += str) {
    sum += a[i] * a[i];
  }

  sum = reduce_warp<T>(sum);
  if (lane == 0)
    shared[wid] = sum;
  __syncthreads();

  sum = (threadIdx.x < blockDim.x / warpSize) ? shared[lane] : T(0);
  if (wid == 0)
    sum = reduce_warp<T>(sum);

  if (threadIdx.x == 0)
    buf_h[blockIdx.x] = sum;
}
template< typename T >
__global__ void sub2cons_kernel(T* __restrict__ a,
                                const T* __restrict__ b,
                                const T* __restrict__ c,
                                const T d,
                                const int n) {
  int tj = blockIdx.x * blockDim.x + threadIdx.x;
  if (tj < n) {
    a[tj] = b[tj] * c[tj] - d;
  }
}
template< typename T >
__global__ void dely_kernel(T* __restrict__ dely,
                            const T* __restrict__ c,
                            const T* __restrict__ d,
                            const T* __restrict__ y,
                            const T* __restrict__ lambda,
                            const T epsi,
                            const int n) {
  int tj = blockIdx.x * blockDim.x + threadIdx.x;
  if (tj < n) {
    dely[tj] = c[tj] + d[tj] * y[tj] - lambda[tj] - epsi / y[tj];
  }
}
template< typename T >
__global__ void maxval2_kernel(const T* __restrict__ a,
                               const T* __restrict__ b,
                               T* __restrict__ temp,
                               const T cons,
                               const int n) {
  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  const unsigned int lane = threadIdx.x % warpSize;
  const unsigned int wid = threadIdx.x / warpSize;

  __shared__ T shared[32];

  // Block-wise maximum of cons * a[i] / b[i].
  T maxval = cons * a[0] / b[0];
  for (int i = idx; i < n; i += str)
    maxval = max(maxval, cons * a[i] / b[i]);

  maxval = max_reduce_warp<T>(maxval);
  if (lane == 0)
    shared[wid] = maxval;
  __syncthreads();

  maxval = (threadIdx.x < blockDim.x / warpSize) ? shared[lane] : T(0);
  if (wid == 0)
    maxval = max_reduce_warp<T>(maxval);

  if (threadIdx.x == 0)
    temp[blockIdx.x] = maxval;
}
template< typename T >
__global__ void maxval3_kernel(const T* __restrict__ a,
                               const T* __restrict__ b,
                               const T* __restrict__ c, T* __restrict__ temp,
                               const T cons,
                               const int n) {
  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  const unsigned int lane = threadIdx.x % warpSize;
  const unsigned int wid = threadIdx.x / warpSize;

  __shared__ T shared[32];

  // Block-wise maximum of cons * a[i] / (b[i] - c[i]).
  T maxval = cons * a[0] / b[0];
  for (int i = idx; i < n; i += str)
    maxval = max(maxval, cons * a[i] / (b[i] - c[i]));

  maxval = max_reduce_warp<T>(maxval);
  if (lane == 0)
    shared[wid] = maxval;
  __syncthreads();

  maxval = (threadIdx.x < blockDim.x / warpSize) ? shared[lane] : T(0);
  if (wid == 0)
    maxval = max_reduce_warp<T>(maxval);

  if (threadIdx.x == 0)
    temp[blockIdx.x] = maxval;
}
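/** KKT residual in x for the outer problem: df0dx + dfdx^T * lambda - xsi + eta. */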
template< typename T >
__global__ void kkt_rex_kernel(T* __restrict__ rex,
                               const T* __restrict__ df0dx,
                               const T* __restrict__ dfdx,
                               const T* __restrict__ xsi,
                               const T* __restrict__ eta,
                               const T* __restrict__ lambda,
                               const int n,
                               const int m) {
  int tj = blockIdx.x * blockDim.x + threadIdx.x;
  if (tj < n) {
    rex[tj] = T(0);
    for (int i = 0; i < m; i++) {
      rex[tj] = rex[tj] + dfdx[i + tj * m] * lambda[i];
    }
    rex[tj] += df0dx[tj] - xsi[tj] + eta[tj];
  }
}
template <typename T>
__global__ void maxcons_kernel(T* __restrict__ a,
                               const T b,
                               const T c,
                               const T* __restrict__ d,
                               const int n) {
  int tj = blockIdx.x * blockDim.x + threadIdx.x;
  if (tj < n) {
    a[tj] = max(b, c * d[tj]);
  }
}
template< typename T >
__global__ void glsum_kernel(const T* a, T* buf_h,
                             const int n) {
  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  const unsigned int lane = threadIdx.x % warpSize;
  const unsigned int wid = threadIdx.x / warpSize;

  __shared__ T shared[32];

  // Partial global sum of a.
  T sum = T(0);
  for (int i = idx; i < n; i += str)
    sum += a[i];

  sum = reduce_warp<T>(sum);
  if (lane == 0)
    shared[wid] = sum;
  __syncthreads();

  sum = (threadIdx.x < blockDim.x / warpSize) ? shared[lane] : 0;
  if (wid == 0)
    sum = reduce_warp<T>(sum);

  if (threadIdx.x == 0)
    buf_h[blockIdx.x] = sum;
}
template< typename T >
__global__ void glsc2_kernel(const T* a,
                             const T* b,
                             T* buf_h,
                             const int n) {
  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  const unsigned int lane = threadIdx.x % warpSize;
  const unsigned int wid = threadIdx.x / warpSize;

  __shared__ T shared[32];

  // Partial dot product of a and b.
  T sum = T(0);
  for (int i = idx; i < n; i += str) {
    sum += a[i] * b[i];
  }

  sum = reduce_warp<T>(sum);
  if (lane == 0)
    shared[wid] = sum;
  __syncthreads();

  sum = (threadIdx.x < blockDim.x / warpSize) ? shared[lane] : 0;
  if (wid == 0)
    sum = reduce_warp<T>(sum);

  if (threadIdx.x == 0)
    buf_h[blockIdx.x] = sum;
}
template <typename T>
__global__ void add2inv2_kernel(T* __restrict__ a,
                                const T* __restrict__ b,
                                const T c,
                                const int n) {
  int tj = blockIdx.x * blockDim.x + threadIdx.x;
  if (tj < n) {
    a[tj] = a[tj] + c / b[tj];
  }
}
template <typename T>
__global__ void max2_kernel(T* __restrict__ a,
                            const T b,
                            const T* __restrict__ c,
                            const T d,
                            const int n) {
  int tj = blockIdx.x * blockDim.x + threadIdx.x;
  if (tj < n) {
    a[tj] = max(b, d * c[tj]);
  }
}
template <typename T>
__global__ void updatebb_kernel(T* __restrict__ bb,
                                const T* __restrict__ dellambda,
                                const T* __restrict__ dely,
                                const T* __restrict__ d,
                                const T* __restrict__ mu,
                                const T* __restrict__ y,
                                const T delz,
                                const int m) {
  int tj = blockIdx.x * blockDim.x + threadIdx.x;
  if (tj < m) {
    bb[tj] = dellambda[tj] + dely[tj] / (d[tj] + mu[tj] / y[tj]) - bb[tj];
  } else if (tj == m) {
    // The last entry of the right-hand side holds the z-residual.
    bb[tj] = delz;
  }
}
template <typename T>
__global__ void updateAA_kernel(T* __restrict__ AA,
                                const T* __restrict__ globaltmp_mm,
                                const T* __restrict__ s,
                                const T* __restrict__ lambda,
                                const T* __restrict__ d,
                                const T* __restrict__ mu,
                                const T* __restrict__ y,
                                const T* __restrict__ a,
                                const T zeta,
                                const T z,
                                const int m) {
  int tj = blockIdx.x * blockDim.x + threadIdx.x;
  if (tj < m) {
    // Diagonal block and last row/column of the (m+1) x (m+1) system.
    AA[tj + tj * (m + 1)] = globaltmp_mm[tj + tj * m] + (s[tj] / lambda[tj] +
        1.0 / (d[tj] + mu[tj] / y[tj]));
    AA[tj + m * (m + 1)] = a[tj];
    AA[m + tj * (m + 1)] = a[tj];
  } else if (tj == m) {
    // Bottom-right corner.
    AA[tj + tj * (m + 1)] = -zeta / z;
  }
}
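/** Newton step for the slack variables y from dely and dlambda. */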
template <typename T>
__global__ void dy_kernel(T* __restrict__ dy,
                          const T* __restrict__ dely,
                          const T* __restrict__ dlambda,
                          const T* __restrict__ d,
                          const T* __restrict__ mu,
                          const T* __restrict__ y,
                          const int n) {
  int tj = blockIdx.x * blockDim.x + threadIdx.x;
  if (tj < n) {
    dy[tj] = (-dely[tj] + dlambda[tj]) / (d[tj] + mu[tj] / y[tj]);
  }
}