#ifndef MMA_CUDA_KERNEL_H
#define MMA_CUDA_KERNEL_H

template <typename T>
__global__
void mma_prepare_aa_matrix_kernel(T* __restrict__ AA,
                                  const T* __restrict__ s,
                                  const T* __restrict__ lambda,
                                  const T* __restrict__ d,
                                  const T* __restrict__ mu,
                                  const T* __restrict__ y,
                                  const T* __restrict__ a,
                                  const T zeta,
                                  const T z,
                                  const int m) {
  const int tj = blockIdx.x * blockDim.x + threadIdx.x;
  const int matrix_size = m + 1;
  if (tj < m) {
    // Diagonal entry: slack and bound-multiplier contributions.
    AA[tj * matrix_size + tj] += s[tj] / lambda[tj] +
        (T)1.0 / (d[tj] + mu[tj] / y[tj]);
    // Last column and last row hold the linear coefficients a.
    AA[tj * matrix_size + m] = a[tj];
    AA[m * matrix_size + tj] = a[tj];
  } else if (tj == m) {
    // Bottom-right corner couples zeta and z.
    AA[m * matrix_size + m] = -zeta / z;
  }
}
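/* Note (added for clarity): AA is the dense (m + 1) x (m + 1) row-major matrix
 * of the Newton system for the dual step (dlambda, dz); rows 0..m-1 belong to
 * the constraint multipliers and the last row/column to z. The kernel
 * therefore needs m + 1 threads in total, e.g. (assuming device pointers and
 * a small m):
 *
 *   mma_prepare_aa_matrix_kernel<T><<<1, m + 1>>>(AA, s, lambda, d, mu, y,
 *                                                 a, zeta, z, m);
 */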
template <typename T>
__global__
void mma_update_hessian_diagonal_kernel(T* __restrict__ Hess,
                                        const T* __restrict__ y,
                                        const T* __restrict__ d,
                                        const T* __restrict__ mu,
                                        const T* __restrict__ lambda,
                                        const int m) {
  int tj = blockIdx.x * blockDim.x + threadIdx.x;
  if (tj < m) {
    T diag = Hess[tj * m + tj];
    // Skip the 1/d term when d is numerically zero.
    if (fabs(d[tj]) >= (T)1.0e-15) {
      diag -= (T)1.0 / d[tj];
    }
    diag -= mu[tj] / lambda[tj];
    Hess[tj * m + tj] = diag;
  }
}
template <typename T>
__global__
void mma_stabilize_hessian_single_kernel(T* __restrict__ Hess,
                                         const int m) {
  const int tid = threadIdx.x;
  if (tid == 0) {
    // Levenberg-Marquardt style regularization based on the trace.
    T trace = (T)0.0;
    for (int j = 0; j < m; j++) {
      trace += Hess[j * m + j];
    }
    T lm_factor = max((T)(-1.0e-4) * trace / m, (T)1.0e-7);
    for (int j = 0; j < m; j++) {
      Hess[j * m + j] -= lm_factor;
    }
  }
}
template <typename T>
__global__
void mma_stabilize_hessian_multi_kernel(T* __restrict__ Hess,
                                        const T lm_factor,
                                        const int m) {
  const int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < m) {
    Hess[i * m + i] -= lm_factor;
  }
}
template <typename T>
__global__
void mma_small_lu_kernel(T* __restrict__ A, T* __restrict__ b,
                         const int n) {
  const int tid = threadIdx.x;

  // Trivial 1x1 system (reconstructed special case).
  if (n == 1) {
    if (tid == 0 && abs(A[0]) > (T)1e-12) b[0] /= A[0];
    return;
  }

  // In-place LU factorization with partial pivoting, one block per system.
  for (int k = 0; k < n; k++) {
    // Thread 0 selects the pivot row and swaps it into place.
    if (tid == 0) {
      int max_row = k;
      T max_val = abs(A[k * n + k]);
      for (int i = k + 1; i < n; i++) {
        T val = abs(A[i * n + k]);
        if (val > max_val) { max_val = val; max_row = i; }
      }
      if (max_val > (T)1e-12 && max_row != k) {
        for (int j = k; j < n; j++) {
          T temp = A[k * n + j];
          A[k * n + j] = A[max_row * n + j];
          A[max_row * n + j] = temp;
        }
        // Permute the right-hand side consistently with the row swap.
        T temp = b[k];
        b[k] = b[max_row];
        b[max_row] = temp;
      }
    }
    __syncthreads();
    // Eliminate entries below the pivot, rows distributed over the block.
    T diag = A[k * n + k];
    if (abs(diag) > (T)1e-12) {
      for (int i = tid + k + 1; i < n; i += blockDim.x) {
        T factor = A[i * n + k] / diag;
        A[i * n + k] = factor;
        for (int j = k + 1; j < n; j++) {
          A[i * n + j] -= factor * A[k * n + j];
        }
      }
    }
    __syncthreads();
  }

  // Forward substitution: L y = b with unit lower-triangular L.
  for (int i = tid; i < n; i += blockDim.x) {
    T sum = b[i];
    for (int j = 0; j < i; j++) {
      sum -= A[i * n + j] * b[j];
    }
    b[i] = sum;
  }
  __syncthreads();

  // Backward substitution: U x = y.
  for (int i = n - 1 - tid; i >= 0; i -= blockDim.x) {
    T sum = b[i];
    for (int j = i + 1; j < n; j++) {
      sum -= A[i * n + j] * b[j];
    }
    if (abs(A[i * n + i]) > (T)1e-12) {
      b[i] = sum / A[i * n + i];
    }
  }
}
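/* Hedged usage sketch (not part of the original interface): the factorization
 * and both substitution loops run inside a single thread block, and the
 * substitution loops carry a sequential dependence in i, so a one-thread
 * launch is the conservative choice for the small (m + 1) x (m + 1) dual
 * system, e.g.
 *
 *   mma_small_lu_kernel<T><<<1, 1>>>(AA, bb, m + 1);
 *
 * where AA is row-major and bb is overwritten with the solution. */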
template <typename T>
__global__
void delta_1dbeam_kernel(T* __restrict__ Delta,
                         const T L_total,
                         const T Le,
                         const int offset,
                         const int n) {
  int k = blockIdx.x * blockDim.x + threadIdx.x;
  if (k < n) {
    // One-based element index within this chunk of the beam (reconstructed).
    const int idx = k + 1;

    // Integrate (L_total - x)^2 over element idx: (x1^3 - x2^3) / 3.
    T x1 = L_total - Le * static_cast<T>(offset + idx - 1);
    T term1 = x1 * x1 * x1;

    T x2 = L_total - Le * static_cast<T>(offset + idx);
    T term2 = x2 * x2 * x2;

    Delta[k] = (term1 - term2) / static_cast<T>(3.0);
  }
}
template <typename T>
__global__
void mma_Ljjxinv_kernel(T* __restrict__ Ljjxinv,
                        const T* __restrict__ pjlambda,
                        const T* __restrict__ qjlambda,
                        const T* __restrict__ x,
                        const T* __restrict__ low,
                        const T* __restrict__ upp,
                        const T* __restrict__ alpha,
                        const T* __restrict__ beta,
                        const int n) {
  int tj = blockIdx.x * blockDim.x + threadIdx.x;
  if (tj < n) {
    const T xt = x[tj];
    const T low_j = low[tj];
    const T upp_j = upp[tj];
    const T pj = pjlambda[tj];
    const T qj = qjlambda[tj];

    const T diff_u = upp_j - xt;
    const T diff_l = xt - low_j;

    const T diff_u3 = diff_u * diff_u * diff_u;
    const T diff_l3 = diff_l * diff_l * diff_l;

    // Inverse of the diagonal curvature of the dual subproblem in x.
    T denom = 2.0 * pj / diff_u3 + 2.0 * qj / diff_l3;
    T val = -1.0 / denom;

    // Zero the entry when x sits on a move limit (active bound).
    bool active = (fabs(xt - alpha[tj]) <= T(1e-16)) ||
                  (fabs(xt - beta[tj]) <= T(1e-16));
    Ljjxinv[tj] = active ? T(0.0) : val;
  }
}
template <typename T>
__global__
void mma_dipsolvesub1_kernel(T* __restrict__ x,
                             const T* __restrict__ pjlambda,
                             const T* __restrict__ qjlambda,
                             const T* __restrict__ low,
                             const T* __restrict__ upp,
                             const T* __restrict__ alpha,
                             const T* __restrict__ beta,
                             const int n) {
  int tj = blockIdx.x * blockDim.x + threadIdx.x;
  if (tj < n) {
    const T pj_sqrt = sqrt(pjlambda[tj]);
    const T qj_sqrt = sqrt(qjlambda[tj]);
    const T denom = pj_sqrt + qj_sqrt;

    const T low_j = low[tj];
    const T upp_j = upp[tj];
    const T alpha_j = alpha[tj];
    const T beta_j = beta[tj];

    // Unconstrained minimizer of the separable dual subproblem ...
    const T val = (pj_sqrt * low_j + qj_sqrt * upp_j) / denom;

    // ... projected onto the move limits [alpha, beta].
    x[tj] = fmin(fmax(val, alpha_j), beta_j);
  }
}
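/* Note (added for clarity, standard MMA dual solve): for fixed lambda the
 * separable subproblem in x_j is
 *   minimize  p_j / (upp_j - x_j) + q_j / (x_j - low_j),
 * whose stationary point is
 *   x_j = (sqrt(p_j) * low_j + sqrt(q_j) * upp_j) / (sqrt(p_j) + sqrt(q_j)),
 * which the kernel above clips to the move limits [alpha_j, beta_j]. */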
template <typename T>
__global__
void mattrans_v_mul_kernel(T* __restrict__ output,
                           const T* __restrict__ pij,
                           const T* __restrict__ lambda,
                           const int m,
                           const int n) {
  int tj = blockIdx.x * blockDim.x + threadIdx.x;
  if (tj < n) {
    // output = pij^T * lambda, one output entry per thread.
    T acc = T(0);
    for (int i = 0; i < m; ++i) {
      acc += pij[i + tj * m] * lambda[i];
    }
    output[tj] = acc;
  }
}
template <typename T>
__global__
void mma_sub1_kernel(T* __restrict__ xlow,
                     T* __restrict__ xupp,
                     const T* __restrict__ x,
                     const T* __restrict__ xmin,
                     const T* __restrict__ xmax,
                     const T asyinit,
                     const int n) {
  int tj = blockIdx.x * blockDim.x + threadIdx.x;
  if (tj < n) {
    const T xt = x[tj];
    const T xmin_j = xmin[tj];
    const T xmax_j = xmax[tj];

    // Initial asymptotes placed symmetrically around x.
    const T xgap = xmax_j - xmin_j;
    const T offset = asyinit * xgap;

    xlow[tj] = xt - offset;
    xupp[tj] = xt + offset;
  }
}
template <typename T>
__global__
void mma_sub2_kernel(T* __restrict__ low, T* __restrict__ upp,
                     const T* __restrict__ x,
                     const T* __restrict__ xold1,
                     const T* __restrict__ xold2,
                     const T* __restrict__ xdiff,
                     const T asydecr,
                     const T asyincr,
                     const int n) {
  int tj = blockIdx.x * blockDim.x + threadIdx.x;
  if (tj < n) {
    const T xval = x[tj];
    const T xold1val = xold1[tj];
    const T xold2val = xold2[tj];
    const T lowval = low[tj];
    const T uppval = upp[tj];
    const T xdiffval = xdiff[tj];

    // Oscillation indicator: sign of the product of the last two steps.
    const T prod = (xval - xold1val) * (xold1val - xold2val);

    // Shrink the asymptotes on oscillation, expand on monotone progress.
    T asy_factor = (prod < T(0)) ? asydecr :
                   (prod > T(0)) ? asyincr : T(1);

    T new_low = fma(-asy_factor, (xold1val - lowval), xval);
    T new_upp = fma(asy_factor, (uppval - xold1val), xval);

    // Keep the asymptotes within the standard MMA bounds around x.
    new_low = max(new_low, xval - T(10.0) * xdiffval);
    new_low = min(new_low, xval - T(0.01) * xdiffval);

    new_upp = min(new_upp, xval + T(10.0) * xdiffval);
    new_upp = max(new_upp, xval + T(0.01) * xdiffval);

    low[tj] = new_low;
    upp[tj] = new_upp;
  }
}
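/* Note (added for clarity): the kernel above implements the standard MMA
 * asymptote update. With prod = (x_k - x_{k-1}) * (x_{k-1} - x_{k-2}),
 *   gamma = asydecr if prod < 0 (oscillation),
 *   gamma = asyincr if prod > 0 (monotone progress),
 *   gamma = 1       otherwise,
 *   low_k = x_k - gamma * (x_{k-1} - low_{k-1}),
 *   upp_k = x_k + gamma * (upp_{k-1} - x_{k-1}),
 * and both asymptotes are then clamped to lie between 0.01 * xdiff and
 * 10 * xdiff away from x_k. */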
template <typename T>
__global__
void mma_sub3_kernel(const T* __restrict__ x,
                     const T* __restrict__ df0dx,
                     const T* __restrict__ dfdx,
                     T* __restrict__ low, T* __restrict__ upp,
                     const T* __restrict__ xmin,
                     const T* __restrict__ xmax, T* __restrict__ alpha,
                     T* __restrict__ beta,
                     T* __restrict__ p0j, T* __restrict__ q0j,
                     T* __restrict__ pij, T* __restrict__ qij,
                     const int n,
                     const int m) {
  int tj = blockIdx.x * blockDim.x + threadIdx.x;
  if (tj < n) {
    const T xt = x[tj];
    const T xmin_j = xmin[tj];
    const T xmax_j = xmax[tj];
    const T low_j = low[tj];
    const T upp_j = upp[tj];
    const T df0 = df0dx[tj];
    const T xgap = xmax_j - xmin_j;

    // Move limits alpha and beta bound the subproblem around x.
    const T half_xgap = 0.5 * xgap;
    const T tenth_low_diff = T(0.1) * (xt - low_j);
    const T tenth_upp_diff = T(0.1) * (upp_j - xt);

    T alpha_val = max(max(xmin_j, low_j + tenth_low_diff), xt - half_xgap);
    T beta_val = min(min(xmax_j, upp_j - tenth_upp_diff), xt + half_xgap);

    alpha[tj] = alpha_val;
    beta[tj] = beta_val;

    // MMA approximation coefficients for the objective and constraints.
    const T upp_minus_x = upp_j - xt;
    const T x_minus_low = xt - low_j;
    const T upp_minus_x_sq = upp_minus_x * upp_minus_x;
    const T x_minus_low_sq = x_minus_low * x_minus_low;

    const T eps = T(1e-5);
    const T inv_xgap = T(1.0) / max(eps, xgap);

    const T max_df0_pos = max(df0, T(0));
    const T max_df0_neg = max(-df0, T(0));

    p0j[tj] = upp_minus_x_sq * (T(1.001) * max_df0_pos +
              T(0.001) * max_df0_neg + eps * inv_xgap);
    q0j[tj] = x_minus_low_sq * (T(0.001) * max_df0_pos +
              T(1.001) * max_df0_neg + eps * inv_xgap);

    for (int i = 0; i < m; ++i) {
      int idx = i + tj * m;

      T dfdx_val = dfdx[idx];
      T max_pos = max(dfdx_val, T(0));
      T max_neg = max(-dfdx_val, T(0));

      pij[idx] = upp_minus_x_sq * (T(1.001) * max_pos +
                 T(0.001) * max_neg + eps * inv_xgap);
      qij[idx] = x_minus_low_sq * (T(0.001) * max_pos +
                 T(1.001) * max_neg + eps * inv_xgap);
    }
  }
}
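/* Note (added for clarity): p0j/q0j and pij/qij follow the usual MMA choice,
 * e.g. for the objective gradient df0dx:
 *   p0j = (upp - x)^2 * (1.001 * max(df0dx, 0) + 0.001 * max(-df0dx, 0)
 *                        + eps / max(eps, xmax - xmin)),
 *   q0j = (x - low)^2 * (0.001 * max(df0dx, 0) + 1.001 * max(-df0dx, 0)
 *                        + eps / max(eps, xmax - xmin)),
 * with eps = 1e-5, and analogously for each constraint gradient dfdx. */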
template <typename T>
__global__
void mma_sub4_kernel(const T* __restrict__ x,
                     const T* __restrict__ low,
                     const T* __restrict__ upp,
                     const T* __restrict__ pij,
                     const T* __restrict__ qij,
                     T* __restrict__ temp,
                     const int n,
                     const int m) {
  int tj = blockIdx.x * blockDim.x + threadIdx.x;
  if (tj < n) {
    const T xt = x[tj];
    const T low_j = low[tj];
    const T upp_j = upp[tj];

    const T denom_upp = upp_j - xt;
    const T denom_low = xt - low_j;

    // Guard against division by (near-)zero asymptote gaps.
    const T eps = T(1e-12);
    const T inv_denom_upp = T(1) / max(denom_upp, eps);
    const T inv_denom_low = T(1) / max(denom_low, eps);

    const int base_idx = tj * m;
    for (int i = 0; i < m; ++i) {
      int idx = base_idx + i;
      temp[idx] = pij[idx] * inv_denom_upp + qij[idx] * inv_denom_low;
    }
  }
}
template <typename T>
__global__
void mma_max2_kernel(const T* __restrict__ x,
                     const T* __restrict__ alpha,
                     T* __restrict__ xsi,
                     const int n) {
  int tj = blockIdx.x * blockDim.x + threadIdx.x;
  if (tj < n) {
    // Initialize the bound multiplier xsi = max(1, 1 / (x - alpha)).
    const T eps = T(1e-12);
    T denom = max(x[tj] - alpha[tj], eps);
    xsi[tj] = max(T(1), T(1) / denom);
  }
}
template <typename T>
__global__
void relambda_kernel(T* __restrict__ temp,
                     const T* __restrict__ x,
                     const T* __restrict__ xupp,
                     const T* __restrict__ xlow,
                     const T* __restrict__ pij,
                     const T* __restrict__ qij,
                     const int n,
                     const int m) {
  int tj = blockIdx.x * blockDim.x + threadIdx.x;
  if (tj < n) {
    const T xt = x[tj];
    const T xup = xupp[tj];
    const T xlo = xlow[tj];

    const T eps = T(1e-12);
    const T inv_denom_upp = T(1) / max(xup - xt, eps);
    const T inv_denom_low = T(1) / max(xt - xlo, eps);

    int base_idx = tj * m;
    for (int i = 0; i < m; ++i) {
      int idx = base_idx + i;
      temp[idx] = pij[idx] * inv_denom_upp + qij[idx] * inv_denom_low;
    }
  }
}
template <typename T>
__global__
void sub2cons2_kernel(T* __restrict__ a,
                      const T* __restrict__ b,
                      const T* __restrict__ c,
                      const T* __restrict__ d,
                      const T e,
                      const int n) {
  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  if (idx >= n)
    return;
  const T bt = b[idx];
  const T ct = c[idx];
  const T dt = d[idx];
  a[idx] = bt * (ct - dt) - e;
}
template <typename T>
__inline__ __device__ T max_reduce_warp(T val) {
  val = max(val, __shfl_down_sync(0xffffffff, val, 16));
  val = max(val, __shfl_down_sync(0xffffffff, val, 8));
  val = max(val, __shfl_down_sync(0xffffffff, val, 4));
  val = max(val, __shfl_down_sync(0xffffffff, val, 2));
  val = max(val, __shfl_down_sync(0xffffffff, val, 1));
  return val;
}
template <typename T>
__global__
void maxval_kernel(const T* __restrict__ a, T* temp,
                   const int n) {
  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int stride = blockDim.x * gridDim.x;

  const unsigned int lane = threadIdx.x % warpSize;
  const unsigned int warp_id = threadIdx.x / warpSize;

  __shared__ T shared[32];

  // Grid-stride pass: per-thread maximum of |a[i]|.
  T local_max = T(0);
  for (int i = idx; i < n; i += stride) {
    local_max = max(local_max, abs(a[i]));
  }

  // Reduce within each warp, then across warps via shared memory.
  local_max = max_reduce_warp<T>(local_max);
  if (lane == 0)
    shared[warp_id] = local_max;
  __syncthreads();

  local_max = (threadIdx.x < blockDim.x / warpSize) ? shared[lane] : T(0);
  if (warp_id == 0)
    local_max = max_reduce_warp<T>(local_max);

  if (threadIdx.x == 0) {
    temp[blockIdx.x] = local_max;
  }
}
template <typename T>
__global__
void max_reduce_kernel(T* __restrict__ bufred,
                       const int n) {
  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int stride = blockDim.x * gridDim.x;

  // Per-thread maximum over the partial results already in bufred.
  T maxval = T(0);
  for (int i = idx; i < n; i += stride) {
    maxval = max(maxval, bufred[i]);
  }

  __shared__ T shared[32];
  unsigned int lane = threadIdx.x % warpSize;
  unsigned int wid = threadIdx.x / warpSize;

  maxval = max_reduce_warp<T>(maxval);
  if (lane == 0)
    shared[wid] = maxval;
  __syncthreads();

  maxval = (threadIdx.x < blockDim.x / warpSize) ? shared[lane] : T(0);
  if (wid == 0)
    maxval = max_reduce_warp<T>(maxval);

  if (threadIdx.x == 0) {
    bufred[blockIdx.x] = maxval;
  }
}
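/* Hedged usage sketch (hypothetical helper, not part of the original
 * interface): maxval_kernel leaves one partial maximum per block in a scratch
 * buffer, and a single-block max_reduce_kernel launch folds those partials
 * into element 0. All pointers are assumed to be device pointers and bufred
 * must hold at least one element per block of the first launch. */
template <typename T>
inline T device_abs_max(const T* a, T* bufred, const int n,
                        cudaStream_t stream = 0) {
  const int threads = 1024;
  const int blocks = (n + threads - 1) / threads;
  maxval_kernel<T><<<blocks, threads, 0, stream>>>(a, bufred, n);
  // Second pass with one block, so no inter-block synchronization is needed.
  max_reduce_kernel<T><<<1, threads, 0, stream>>>(bufred, blocks);
  T result;
  cudaMemcpyAsync(&result, bufred, sizeof(T), cudaMemcpyDeviceToHost, stream);
  cudaStreamSynchronize(stream);
  return result;
}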
template <typename T>
__global__
void delx_kernel(T* __restrict__ delx,
                 const T* __restrict__ x,
                 const T* __restrict__ xlow,
                 const T* __restrict__ xupp,
                 const T* __restrict__ pij,
                 const T* __restrict__ qij,
                 const T* __restrict__ p0j,
                 const T* __restrict__ q0j,
                 const T* __restrict__ alpha,
                 const T* __restrict__ beta,
                 const T* __restrict__ lambda,
                 const T epsi,
                 const int n,
                 const int m) {
  int tj = blockIdx.x * blockDim.x + threadIdx.x;
  if (tj < n) {
    const T xt = x[tj];
    const T xlow_j = xlow[tj];
    const T xupp_j = xupp[tj];
    T alpha_j = alpha[tj];
    T beta_j = beta[tj];

    T denom_low = xt - xlow_j;
    T denom_upp = xupp_j - xt;
    T denom_alpha = xt - alpha_j;
    T denom_beta = beta_j - xt;

    // Clamp all denominators away from zero.
    const T small_eps = T(1e-12);
    denom_low = (abs(denom_low) < small_eps) ? small_eps : denom_low;
    denom_upp = (abs(denom_upp) < small_eps) ? small_eps : denom_upp;
    denom_alpha = (abs(denom_alpha) < small_eps) ? small_eps : denom_alpha;
    denom_beta = (abs(denom_beta) < small_eps) ? small_eps : denom_beta;

    // Gradient of the barrier-augmented subproblem with respect to x.
    T sum = T(0);
    for (int i = 0; i < m; i++) {
      T lambda_i = lambda[i];
      sum += pij[i + tj * m] * lambda_i / (denom_upp * denom_upp)
           - qij[i + tj * m] * lambda_i / (denom_low * denom_low);
    }
    sum += p0j[tj] / (denom_upp * denom_upp)
         - q0j[tj] / (denom_low * denom_low)
         - epsi / denom_alpha + epsi / denom_beta;

    delx[tj] = sum;
  }
}
template <typename T>
__global__
void GG_kernel(T* __restrict__ GG,
               const T* __restrict__ x,
               const T* __restrict__ xlow,
               const T* __restrict__ xupp,
               const T* __restrict__ pij,
               const T* __restrict__ qij,
               const int n,
               const int m) {
  int tj = blockIdx.x * blockDim.x + threadIdx.x;
  if (tj < n) {
    const T xt = x[tj];
    const T xlow_j = xlow[tj];
    const T xupp_j = xupp[tj];

    T diff_upper = xupp_j - xt;
    T diff_lower = xt - xlow_j;

    T diff_upper2 = diff_upper * diff_upper;
    T diff_lower2 = diff_lower * diff_lower;

    // Jacobian of the constraint residuals with respect to x.
    for (int i = 0; i < m; i++) {
      int idx = i + tj * m;
      GG[idx] = pij[idx] / diff_upper2 - qij[idx] / diff_lower2;
    }
  }
}
template <typename T>
__global__
void diagx_kernel(T* __restrict__ diagx,
                  const T* __restrict__ x,
                  const T* __restrict__ xsi,
                  const T* __restrict__ xlow,
                  const T* __restrict__ xupp,
                  const T* __restrict__ p0j,
                  const T* __restrict__ q0j,
                  const T* __restrict__ pij,
                  const T* __restrict__ qij,
                  const T* alpha,
                  const T* beta,
                  const T* eta,
                  const T* lambda,
                  const int n,
                  const int m) {
  int tj = blockIdx.x * blockDim.x + threadIdx.x;
  if (tj < n) {
    T sum = T(0);
    T sum1 = T(0);
    for (int i = 0; i < m; i++) {
      sum = sum + pij[tj * m + i] * lambda[i];
      sum1 = sum1 + qij[tj * m + i] * lambda[i];
    }
    // Diagonal of the primal Hessian plus the barrier contributions.
    diagx[tj] = (p0j[tj] + sum) / pow(xupp[tj] - x[tj], 3) +
                (q0j[tj] + sum1) / pow(x[tj] - xlow[tj], 3);
    diagx[tj] = 2.0 * diagx[tj] + xsi[tj] / (x[tj] - alpha[tj]) +
                eta[tj] / (beta[tj] - x[tj]);
  }
}
template <typename T>
__inline__ __device__ T reduce_warp(T val) {
  val += __shfl_down_sync(0xffffffff, val, 16);
  val += __shfl_down_sync(0xffffffff, val, 8);
  val += __shfl_down_sync(0xffffffff, val, 4);
  val += __shfl_down_sync(0xffffffff, val, 2);
  val += __shfl_down_sync(0xffffffff, val, 1);
  return val;
}
template <typename T>
__global__
void mmareduce_kernel(T* __restrict__ bufred,
                      const int n) {
  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  // Per-thread partial sum over the block results already in bufred.
  T sum = T(0);
  for (int i = idx; i < n; i += str) {
    sum += bufred[i];
  }

  __shared__ T shared[32];
  unsigned int lane = threadIdx.x % warpSize;
  unsigned int wid = threadIdx.x / warpSize;

  sum = reduce_warp<T>(sum);
  if (lane == 0)
    shared[wid] = sum;
  __syncthreads();

  sum = (threadIdx.x < blockDim.x / warpSize) ? shared[lane] : 0;
  if (wid == 0)
    sum = reduce_warp<T>(sum);

  if (threadIdx.x == 0)
    bufred[blockIdx.x] = sum;
}
template <typename T>
__global__
void mmasum_kernel(const T* __restrict__ a, T* __restrict__ buf_h,
                   const int n,
                   const int m,
                   const int k) {
  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  const unsigned int lane = threadIdx.x % warpSize;
  const unsigned int wid = threadIdx.x / warpSize;

  __shared__ T shared[32];

  // Per-block partial sum of column k of the n x m array a.
  T sum = 0;
  for (int i = idx; i < n; i += str) {
    sum += a[m * i + k];
  }

  sum = reduce_warp<T>(sum);
  if (lane == 0)
    shared[wid] = sum;
  __syncthreads();

  sum = (threadIdx.x < blockDim.x / warpSize) ? shared[lane] : 0;
  if (wid == 0)
    sum = reduce_warp<T>(sum);

  if (threadIdx.x == 0)
    buf_h[blockIdx.x] = sum;
}
template <typename T>
__global__
void mmasumbb_kernel(const T* __restrict__ GG,
                     const T* __restrict__ delx,
                     const T* __restrict__ diagx,
                     T* __restrict__ buf_h,
                     const int n,
                     const int m,
                     const int k) {
  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  const unsigned int lane = threadIdx.x % warpSize;
  const unsigned int wid = threadIdx.x / warpSize;

  __shared__ T shared[32];

  // Per-block partial sum of GG(k, :) * delx / diagx.
  T sum = 0;
  for (int i = idx; i < n; i += str) {
    sum += GG[k + i * m] * delx[i] / diagx[i];
  }

  sum = reduce_warp<T>(sum);
  if (lane == 0)
    shared[wid] = sum;
  __syncthreads();

  sum = (threadIdx.x < blockDim.x / warpSize) ? shared[lane] : 0;
  if (wid == 0)
    sum = reduce_warp<T>(sum);

  if (threadIdx.x == 0)
    buf_h[blockIdx.x] = sum;
}
template <typename T>
__global__
void mmasumHess_kernel(const T* __restrict__ hijx,
                       const T* __restrict__ Ljjxinv,
                       T* __restrict__ buf_h,
                       const int n,
                       const int m,
                       const int k0,
                       const int k1) {
  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int stride = blockDim.x * gridDim.x;

  const unsigned int lane = threadIdx.x % warpSize;
  const unsigned int wid = threadIdx.x / warpSize;

  __shared__ T shared[32];

  // Per-block partial sum of hijx(k0, :) * Ljjxinv * hijx(k1, :)^T.
  T sum = T(0);
  for (int i = idx; i < n; i += stride) {
    T val0 = hijx[k0 + i * m];
    T val1 = hijx[k1 + i * m];
    sum += val0 * Ljjxinv[i] * val1;
  }

  sum = reduce_warp<T>(sum);
  if (lane == 0)
    shared[wid] = sum;
  __syncthreads();

  sum = (threadIdx.x < blockDim.x / warpSize) ? shared[lane] : T(0);
  if (wid == 0)
    sum = reduce_warp<T>(sum);

  if (threadIdx.x == 0) {
    buf_h[blockIdx.x] = sum;
  }
}
template <typename T>
__global__
void mmasumAA_kernel(const T* __restrict__ GG,
                     const T* __restrict__ diagx,
                     T* __restrict__ buf_h,
                     const int n,
                     const int m,
                     const int k0,
                     const int k1) {
  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  const unsigned int lane = threadIdx.x % warpSize;
  const unsigned int wid = threadIdx.x / warpSize;

  __shared__ T shared[32];

  // Per-block partial sum of GG(k0, :) * diag(1/diagx) * GG(k1, :)^T.
  T sum = 0;
  for (int i = idx; i < n; i += str) {
    sum += GG[k0 + i * m] / diagx[i] * GG[k1 + i * m];
  }

  sum = reduce_warp<T>(sum);
  if (lane == 0)
    shared[wid] = sum;
  __syncthreads();

  sum = (threadIdx.x < blockDim.x / warpSize) ? shared[lane] : 0;
  if (wid == 0)
    sum = reduce_warp<T>(sum);

  if (threadIdx.x == 0)
    buf_h[blockIdx.x] = sum;
}
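/* Note (added for clarity): the mmasum*, mmasumbb and mmasumAA kernels only
 * leave one partial sum per block in buf_h; a follow-up single-block
 * mmareduce_kernel launch is expected to fold buf_h into buf_h[0]. For
 * example, entry (k0, k1) of GG * diag(1/diagx) * GG^T would be accumulated
 * as (assuming device pointers and nblocks partial results):
 *
 *   mmasumAA_kernel<T><<<nblocks, 1024>>>(GG, diagx, buf_h, n, m, k0, k1);
 *   mmareduce_kernel<T><<<1, 1024>>>(buf_h, nblocks);
 */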
template <typename T>
__global__
void mma_copy_kernel(T* __restrict__ a,
                     const T* __restrict__ b,
                     const int n,
                     const int m) {
  int tj = blockIdx.x * blockDim.x + threadIdx.x;
  // Copy the n entries of b into a starting at offset m (reconstructed).
  if (tj < n)
    a[tj + m] = b[tj];
}
template <typename T>
__global__
void AA_kernel(T* __restrict__ temp,
               const T* __restrict__ GG,
               const T* __restrict__ diagx,
               const int n,
               const int m) {
  int tj = blockIdx.x * blockDim.x + threadIdx.x;
  if (tj < n) {
    // Per-variable contributions to GG * diag(1/diagx) * GG^T, laid out
    // for a subsequent reduction over the design variables.
    for (int i0 = 0; i0 < m; i0++) {
      for (int i1 = 0; i1 < m; i1++) {
        temp[tj + i0 * (n + 1) + i1 * (m + 1) * (n + 1)] = GG[i0 * n + tj] *
            (1.0 / diagx[tj]) * GG[i1 * n + tj];
      }
    }
  }
}
template <typename T>
__global__
void dx_kernel(T* __restrict__ dx,
               const T* __restrict__ delx,
               const T* __restrict__ diagx,
               const T* __restrict__ GG,
               const T* __restrict__ dlambda,
               const int n,
               const int m) {
  int tj = blockIdx.x * blockDim.x + threadIdx.x;
  if (tj < n) {
    // Recover the primal step from the reduced dual step.
    dx[tj] = -delx[tj] / diagx[tj];
    for (int i = 0; i < m; i++) {
      dx[tj] = dx[tj] - GG[tj * m + i] * dlambda[i] / diagx[tj];
    }
  }
}
template <typename T>
__global__
void dxsi_kernel(T* __restrict__ dxsi,
                 const T* __restrict__ xsi,
                 const T* __restrict__ dx,
                 const T* __restrict__ x,
                 const T* __restrict__ alpha,
                 const T epsi,
                 const int n) {
  int tj = blockIdx.x * blockDim.x + threadIdx.x;
  if (tj < n) {
    dxsi[tj] = -xsi[tj] + (epsi - dx[tj] * xsi[tj]) / (x[tj] - alpha[tj]);
  }
}

template <typename T>
__global__
void deta_kernel(T* __restrict__ deta,
                 const T* __restrict__ eta,
                 const T* __restrict__ dx,
                 const T* __restrict__ x,
                 const T* __restrict__ beta,
                 const T epsi,
                 const int n) {
  int tj = blockIdx.x * blockDim.x + threadIdx.x;
  if (tj < n) {
    deta[tj] = -eta[tj] + (epsi + dx[tj] * eta[tj]) / (beta[tj] - x[tj]);
  }
}
template <typename T>
__global__
void RexCalculation_kernel(T* __restrict__ rex,
                           const T* __restrict__ x,
                           const T* __restrict__ xlow,
                           const T* __restrict__ xupp,
                           const T* __restrict__ pij,
                           const T* __restrict__ p0j,
                           const T* __restrict__ qij,
                           const T* __restrict__ q0j,
                           const T* __restrict__ lambda,
                           const T* __restrict__ xsi,
                           const T* __restrict__ eta,
                           const int n,
                           const int m) {
  int tj = blockIdx.x * blockDim.x + threadIdx.x;
  if (tj < n) {
    T upp_diff = xupp[tj] - x[tj];
    T low_diff = x[tj] - xlow[tj];
    T upp_diff_sq = upp_diff * upp_diff;
    T low_diff_sq = low_diff * low_diff;

    // Residual of the stationarity condition with respect to x.
    T sum = T(0);
    for (int i = 0; i < m; ++i) {
      sum += pij[i + tj * m] * lambda[i] / upp_diff_sq
           - qij[i + tj * m] * lambda[i] / low_diff_sq;
    }
    rex[tj] = sum
            + p0j[tj] / upp_diff_sq
            - q0j[tj] / low_diff_sq
            - xsi[tj] + eta[tj];
  }
}
template <typename T>
__global__
void rey_calculation_kernel(T* __restrict__ rey,
                            const T* __restrict__ c,
                            const T* __restrict__ d,
                            const T* __restrict__ y,
                            const T* __restrict__ lambda,
                            const T* __restrict__ mu,
                            const int n) {
  int tj = blockIdx.x * blockDim.x + threadIdx.x;
  if (tj < n) {
    rey[tj] = c[tj] + d[tj] * y[tj] - lambda[tj] - mu[tj];
  }
}
template <typename T>
__global__
void norm_kernel(const T* __restrict__ a,
                 T* __restrict__ buf_h,
                 const int n) {
  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  const unsigned int lane = threadIdx.x % warpSize;
  const unsigned int wid = threadIdx.x / warpSize;

  __shared__ T shared[32];

  // Per-block partial sum of squares (squared 2-norm).
  T sum = T(0);
  for (int i = idx; i < n; i += str) {
    sum += a[i] * a[i];
  }

  sum = reduce_warp<T>(sum);
  if (lane == 0)
    shared[wid] = sum;
  __syncthreads();

  sum = (threadIdx.x < blockDim.x / warpSize) ? shared[lane] : T(0);
  if (wid == 0)
    sum = reduce_warp<T>(sum);

  if (threadIdx.x == 0)
    buf_h[blockIdx.x] = sum;
}
template <typename T>
__global__
void sub2cons_kernel(T* __restrict__ a,
                     const T* __restrict__ b,
                     const T* __restrict__ c,
                     const T d,
                     const int n) {
  int tj = blockIdx.x * blockDim.x + threadIdx.x;
  if (tj < n) {
    a[tj] = b[tj] * c[tj] - d;
  }
}

template <typename T>
__global__
void dely_kernel(T* __restrict__ dely,
                 const T* __restrict__ c,
                 const T* __restrict__ d,
                 const T* __restrict__ y,
                 const T* __restrict__ lambda,
                 const T epsi,
                 const int n) {
  int tj = blockIdx.x * blockDim.x + threadIdx.x;
  if (tj < n) {
    dely[tj] = c[tj] + d[tj] * y[tj] - lambda[tj] - epsi / y[tj];
  }
}
template <typename T>
__global__
void maxval2_kernel(const T* __restrict__ a,
                    const T* __restrict__ b,
                    T* __restrict__ temp,
                    const T cons,
                    const int n) {
  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  const unsigned int lane = threadIdx.x % warpSize;
  const unsigned int wid = threadIdx.x / warpSize;

  __shared__ T shared[32];

  // Per-block maximum of cons * a[i] / b[i], seeded with the first element.
  T maxval = cons * a[0] / b[0];
  for (int i = idx; i < n; i += str) {
    maxval = max(maxval, cons * a[i] / b[i]);
  }

  maxval = max_reduce_warp<T>(maxval);
  if (lane == 0)
    shared[wid] = maxval;
  __syncthreads();

  maxval = (threadIdx.x < blockDim.x / warpSize) ? shared[lane] : 0.0;
  if (wid == 0)
    maxval = max_reduce_warp<T>(maxval);

  if (threadIdx.x == 0)
    temp[blockIdx.x] = maxval;
}
template <typename T>
__global__
void maxval3_kernel(const T* __restrict__ a,
                    const T* __restrict__ b,
                    const T* __restrict__ c,
                    T* __restrict__ temp,
                    const T cons,
                    const int n) {
  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  const unsigned int lane = threadIdx.x % warpSize;
  const unsigned int wid = threadIdx.x / warpSize;

  __shared__ T shared[32];

  // Per-block maximum of cons * a[i] / (b[i] - c[i]), seeded with the first
  // element so the running maximum is well defined.
  T maxval = cons * a[0] / (b[0] - c[0]);
  for (int i = idx; i < n; i += str) {
    maxval = max(maxval, cons * a[i] / (b[i] - c[i]));
  }

  maxval = max_reduce_warp<T>(maxval);
  if (lane == 0)
    shared[wid] = maxval;
  __syncthreads();

  maxval = (threadIdx.x < blockDim.x / warpSize) ? shared[lane] : 0;
  if (wid == 0)
    maxval = max_reduce_warp<T>(maxval);

  if (threadIdx.x == 0)
    temp[blockIdx.x] = maxval;
}
template <typename T>
__global__
void kkt_rex_kernel(T* __restrict__ rex,
                    const T* __restrict__ df0dx,
                    const T* __restrict__ dfdx,
                    const T* __restrict__ xsi,
                    const T* __restrict__ eta,
                    const T* __restrict__ lambda,
                    const int n,
                    const int m) {
  int tj = blockIdx.x * blockDim.x + threadIdx.x;
  if (tj < n) {
    // Stationarity residual for the KKT check.
    rex[tj] = 0.0;
    for (int i = 0; i < m; i++) {
      rex[tj] = rex[tj] + dfdx[i + tj * m] * lambda[i];
    }
    rex[tj] += df0dx[tj] - xsi[tj] + eta[tj];
  }
}
template <typename T>
__global__
void maxcons_kernel(T* __restrict__ a,
                    const T b,
                    const T c,
                    const T* __restrict__ d,
                    const int n) {
  int tj = blockIdx.x * blockDim.x + threadIdx.x;
  if (tj < n) {
    a[tj] = max(b, c * d[tj]);
  }
}
template <typename T>
__global__
void glsum_kernel(const T* a, T* buf_h,
                  const int n) {
  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  const unsigned int lane = threadIdx.x % warpSize;
  const unsigned int wid = threadIdx.x / warpSize;

  __shared__ T shared[32];

  // Per-block partial sum of a.
  T sum = 0;
  for (int i = idx; i < n; i += str) {
    sum += a[i];
  }

  sum = reduce_warp<T>(sum);
  if (lane == 0)
    shared[wid] = sum;
  __syncthreads();

  sum = (threadIdx.x < blockDim.x / warpSize) ? shared[lane] : 0;
  if (wid == 0)
    sum = reduce_warp<T>(sum);

  if (threadIdx.x == 0)
    buf_h[blockIdx.x] = sum;
}
template <typename T>
__global__
void glsc2_kernel(const T* a,
                  const T* b,
                  T* buf_h,
                  const int n) {
  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  const unsigned int lane = threadIdx.x % warpSize;
  const unsigned int wid = threadIdx.x / warpSize;

  __shared__ T shared[32];

  // Per-block partial dot product (second operand reconstructed).
  T sum = 0;
  for (int i = idx; i < n; i += str) {
    sum += a[i] * b[i];
  }

  sum = reduce_warp<T>(sum);
  if (lane == 0)
    shared[wid] = sum;
  __syncthreads();

  sum = (threadIdx.x < blockDim.x / warpSize) ? shared[lane] : 0;
  if (wid == 0)
    sum = reduce_warp<T>(sum);

  if (threadIdx.x == 0)
    buf_h[blockIdx.x] = sum;
}
template <typename T>
__global__
void add2inv2_kernel(T* __restrict__ a,
                     const T* __restrict__ b,
                     const T c,
                     const int n) {
  int tj = blockIdx.x * blockDim.x + threadIdx.x;
  if (tj < n) {
    a[tj] = a[tj] + c / b[tj];
  }
}

template <typename T>
__global__
void max2_kernel(T* __restrict__ a,
                 const T b,
                 const T* __restrict__ c,
                 const T d,
                 const int n) {
  int tj = blockIdx.x * blockDim.x + threadIdx.x;
  if (tj < n) {
    a[tj] = max(b, d * c[tj]);
  }
}
template <typename T>
__global__
void updatebb_kernel(T* __restrict__ bb,
                     const T* __restrict__ dellambda,
                     const T* __restrict__ dely,
                     const T* __restrict__ d,
                     const T* __restrict__ mu,
                     const T* __restrict__ y,
                     const T delz,
                     const int m) {
  int tj = blockIdx.x * blockDim.x + threadIdx.x;
  if (tj < m) {
    bb[tj] = dellambda[tj] + dely[tj] / (d[tj] + mu[tj] / y[tj]) - bb[tj];
  } else if (tj == m) {
    // Last entry of the (m+1)-vector right-hand side holds the z residual.
    bb[tj] = delz;
  }
}
template <typename T>
__global__
void updateAA_kernel(T* __restrict__ AA,
                     const T* __restrict__ globaltmp_mm,
                     const T* __restrict__ s,
                     const T* __restrict__ lambda,
                     const T* __restrict__ d,
                     const T* __restrict__ mu,
                     const T* __restrict__ y,
                     const T* __restrict__ a,
                     const T zeta,
                     const T z,
                     const int m) {
  int tj = blockIdx.x * blockDim.x + threadIdx.x;
  if (tj < m) {
    AA[tj + tj * (m + 1)] = globaltmp_mm[tj + tj * m] + (s[tj] / lambda[tj] +
        1.0 / (d[tj] + mu[tj] / y[tj]));
    AA[tj + m * (m + 1)] = a[tj];
    AA[m + tj * (m + 1)] = a[tj];
  } else if (tj == m) {
    // Bottom-right corner of the (m+1) x (m+1) system.
    AA[tj + tj * (m + 1)] = -zeta / z;
  }
}
template <typename T>
__global__
void dy_kernel(T* __restrict__ dy,
               const T* __restrict__ dely,
               const T* __restrict__ dlambda,
               const T* __restrict__ d,
               const T* __restrict__ mu,
               const T* __restrict__ y,
               const int n) {
  int tj = blockIdx.x * blockDim.x + threadIdx.x;
  if (tj < n) {
    dy[tj] = (-dely[tj] + dlambda[tj]) / (d[tj] + mu[tj] / y[tj]);
  }
}