#ifndef MMA_HIP_KERNEL_H
#define MMA_HIP_KERNEL_H

template <typename T>
__global__ void mma_prepare_aa_matrix_kernel(T* __restrict__ AA,
    const T* __restrict__ s, const T* __restrict__ lambda,
    const T* __restrict__ d, const T* __restrict__ mu,
    const T* __restrict__ y, const T* __restrict__ a,
    const T zeta, const T z, const int m) {
  const int tj = blockIdx.x * blockDim.x + threadIdx.x;
  const int matrix_size = m + 1;
  if (tj < m) {
    /* Diagonal block plus the constraint column/row a. */
    AA[tj * matrix_size + tj] += s[tj] / lambda[tj] +
        (T)1.0 / (d[tj] + mu[tj] / y[tj]);
    AA[tj * matrix_size + m] = a[tj];
    AA[m * matrix_size + tj] = a[tj];
  }
  if (tj == m) {
    /* Lower-right corner of the extended system. */
    AA[m * matrix_size + m] = -zeta / z;
  }
}

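/*
 * Illustrative launch for the (m+1) x (m+1) assembly above; the block size,
 * stream and the *_d pointer names are assumptions, not part of this header:
 *
 *   const int threads = 256;
 *   const int blocks = (m + 1 + threads - 1) / threads;
 *   hipLaunchKernelGGL(HIP_KERNEL_NAME(mma_prepare_aa_matrix_kernel<double>),
 *                      dim3(blocks), dim3(threads), 0, stream,
 *                      AA_d, s_d, lambda_d, d_d, mu_d, y_d, a_d, zeta, z, m);
 */
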
template <typename T>
__global__ void mma_update_hessian_diagonal_kernel(T* __restrict__ Hess,
    const T* __restrict__ y, const T* __restrict__ d,
    const T* __restrict__ mu, const T* __restrict__ lambda,
    const int m) {
  int tj = blockIdx.x * blockDim.x + threadIdx.x;
  if (tj < m) {
    T diag = Hess[tj * m + tj];

    /* Skip the 1/d correction when d is (near-)zero. */
    if (fabs(d[tj]) >= (T)1.0e-15) {
      diag -= (T)1.0 / d[tj];
    }

    diag -= mu[tj] / lambda[tj];
    Hess[tj * m + tj] = diag;
  }
}

template <typename T>
__global__ void mma_stabilize_hessian_single_kernel(T* __restrict__ Hess,
    const int m) {
  const int tid = threadIdx.x;
  if (tid == 0) {
    /* Trace-based Levenberg-Marquardt style damping of the diagonal. */
    T trace = (T)0.0;
    for (int j = 0; j < m; j++) {
      trace += Hess[j * m + j];
    }
    T lm_factor = max((T)(-1.0e-4) * trace / m, (T)1.0e-7);
    for (int j = 0; j < m; j++) {
      Hess[j * m + j] -= lm_factor;
    }
  }
}

template <typename T>
__global__ void mma_stabilize_hessian_multi_kernel(T* __restrict__ Hess,
    const T lm_factor, const int m) {
  const int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < m) {
    Hess[i * m + i] -= lm_factor;
  }
}

template <typename T>
__global__ void mma_small_lu_kernel(T* __restrict__ A, T* __restrict__ b,
    const int n) {
  const int tid = threadIdx.x;

  /* Trivial 1x1 system: solve directly. */
  if (n == 1) {
    if (tid == 0 && abs(A[0]) > (T)1e-12) b[0] /= A[0];
    return;
  }

  for (int k = 0; k < n; k++) {
    /* Partial pivoting: one thread picks the pivot row and swaps. */
    if (tid == 0) {
      int max_row = k;
      T max_val = abs(A[k * n + k]);
      for (int i = k + 1; i < n; i++) {
        T val = abs(A[i * n + k]);
        if (val > max_val) {
          max_val = val;
          max_row = i;
        }
      }
      if (max_val > (T)1e-12 && max_row != k) {
        for (int j = k; j < n; j++) {
          T temp = A[k * n + j];
          A[k * n + j] = A[max_row * n + j];
          A[max_row * n + j] = temp;
        }
        T temp_b = b[k];
        b[k] = b[max_row];
        b[max_row] = temp_b;
      }
    }
    __syncthreads();

    /* Eliminate the rows below the pivot in parallel. */
    T diag = A[k * n + k];
    if (abs(diag) > (T)1e-12) {
      for (int i = tid + k + 1; i < n; i += blockDim.x) {
        T factor = A[i * n + k] / diag;
        A[i * n + k] = factor;
        for (int j = k + 1; j < n; j++) {
          A[i * n + j] -= factor * A[k * n + j];
        }
      }
    }
    __syncthreads();
  }

  /* Forward substitution with the unit-lower factor stored in A. */
  for (int i = tid; i < n; i += blockDim.x) {
    T sum = b[i];
    for (int j = 0; j < i; j++) {
      sum -= A[i * n + j] * b[j];
    }
    b[i] = sum;
  }
  __syncthreads();

  /* Backward substitution with the upper factor. */
  for (int i = n - 1 - tid; i >= 0; i -= blockDim.x) {
    T sum = b[i];
    for (int j = i + 1; j < n; j++) {
      sum -= A[i * n + j] * b[j];
    }
    if (abs(A[i * n + i]) > (T)1e-12) {
      b[i] = sum / A[i * n + i];
    }
  }
}

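/*
 * The factorisation above synchronises the whole block between pivoting,
 * elimination and the triangular solves, so it is meant to be launched as a
 * single cooperating block on one small system. A sketch of such a launch
 * (the block size and the *_d names are assumptions):
 *
 *   hipLaunchKernelGGL(HIP_KERNEL_NAME(mma_small_lu_kernel<double>),
 *                      dim3(1), dim3(64), 0, stream, AA_d, bb_d, m + 1);
 */
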
template <typename T>
__global__ void delta_1dbeam_kernel(T* __restrict__ Delta,
    const T L_total, const T Le, const int offset, const int n) {
  int k = blockIdx.x * blockDim.x + threadIdx.x;
  if (k < n) {
    /* Local element index (assumed 1-based); the global element is offset + idx. */
    const int idx = k + 1;

    /* Integral of x^2 over the element: (x1^3 - x2^3) / 3. */
    T x1 = L_total - Le * static_cast<T>(offset + idx - 1);
    T term1 = x1 * x1 * x1;

    T x2 = L_total - Le * static_cast<T>(offset + idx);
    T term2 = x2 * x2 * x2;

    Delta[k] = (term1 - term2) / static_cast<T>(3.0);
  }
}

template <typename T>
__global__ void mma_Ljjxinv_kernel(T* __restrict__ Ljjxinv,
    const T* __restrict__ pjlambda, const T* __restrict__ qjlambda,
    const T* __restrict__ x, const T* __restrict__ low,
    const T* __restrict__ upp, const T* __restrict__ alpha,
    const T* __restrict__ beta, const int n) {
  int tj = blockIdx.x * blockDim.x + threadIdx.x;
  if (tj < n) {
    const T xt = x[tj];
    const T low_j = low[tj];
    const T upp_j = upp[tj];
    const T pj = pjlambda[tj];
    const T qj = qjlambda[tj];

    const T diff_u = upp_j - xt;
    const T diff_l = xt - low_j;

    const T diff_u3 = diff_u * diff_u * diff_u;
    const T diff_l3 = diff_l * diff_l * diff_l;

    T denom = 2.0 * pj / diff_u3 + 2.0 * qj / diff_l3;
    T val = -1.0 / denom;

    /* Zero the entry when the variable sits on an active move limit. */
    bool active = (fabs(xt - alpha[tj]) <= T(1e-16)) ||
        (fabs(xt - beta[tj]) <= T(1e-16));

    Ljjxinv[tj] = active ? T(0.0) : val;
  }
}

template <typename T>
__global__ void mma_dipsolvesub1_kernel(T* __restrict__ x,
    const T* __restrict__ pjlambda, const T* __restrict__ qjlambda,
    const T* __restrict__ low, const T* __restrict__ upp,
    const T* __restrict__ alpha, const T* __restrict__ beta,
    const int n) {
  int tj = blockIdx.x * blockDim.x + threadIdx.x;
  if (tj < n) {
    const T pj_sqrt = sqrt(pjlambda[tj]);
    const T qj_sqrt = sqrt(qjlambda[tj]);
    const T denom = pj_sqrt + qj_sqrt;

    const T low_j = low[tj];
    const T upp_j = upp[tj];
    const T alpha_j = alpha[tj];
    const T beta_j = beta[tj];

    /* Unconstrained minimiser of the separable subproblem in x_j ... */
    const T val = (pj_sqrt * low_j + qj_sqrt * upp_j) / denom;

    /* ... clamped to the move limits [alpha_j, beta_j]. */
    x[tj] = fmin(fmax(val, alpha_j), beta_j);
  }
}

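/*
 * The clamped update above is the closed-form minimiser of the separable term
 * p_j / (upp_j - x_j) + q_j / (x_j - low_j): setting the derivative
 * p_j / (upp_j - x_j)^2 - q_j / (x_j - low_j)^2 to zero gives
 * x_j = (sqrt(p_j) * low_j + sqrt(q_j) * upp_j) / (sqrt(p_j) + sqrt(q_j)),
 * which is then projected onto [alpha_j, beta_j].
 */
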
/* output = pij^T * lambda: each thread forms one entry of the n-vector. */
template <typename T>
__global__ void mattrans_v_mul_kernel(T* __restrict__ output,
    const T* __restrict__ pij, const T* __restrict__ lambda,
    const int m, const int n) {
  int tj = blockIdx.x * blockDim.x + threadIdx.x;
  if (tj < n) {
    T acc = T(0);
    for (int i = 0; i < m; ++i) {
      acc += pij[i + tj * m] * lambda[i];
    }
    output[tj] = acc;
  }
}

template <typename T>
__global__ void mma_sub1_kernel(T* __restrict__ xlow, T* __restrict__ xupp,
    const T* __restrict__ x, const T* __restrict__ xmin,
    const T* __restrict__ xmax, const T asyinit, const int n) {
  int tj = blockIdx.x * blockDim.x + threadIdx.x;
  if (tj < n) {
    const T xt = x[tj];
    const T xmin_j = xmin[tj];
    const T xmax_j = xmax[tj];

    /* Initial asymptotes placed symmetrically around x. */
    const T xgap = xmax_j - xmin_j;
    const T offset = asyinit * xgap;

    xlow[tj] = xt - offset;
    xupp[tj] = xt + offset;
  }
}

template <typename T>
__global__ void mma_sub2_kernel(T* __restrict__ low, T* __restrict__ upp,
    const T* __restrict__ x, const T* __restrict__ xold1,
    const T* __restrict__ xold2, const T* __restrict__ xdiff,
    const T asydecr, const T asyincr, const int n) {
  int tj = blockIdx.x * blockDim.x + threadIdx.x;
  if (tj < n) {
    const T xval = x[tj];
    const T xold1val = xold1[tj];
    const T xold2val = xold2[tj];
    const T lowval = low[tj];
    const T uppval = upp[tj];
    const T xdiffval = xdiff[tj];

    /* Shrink or expand the asymptotes depending on the oscillation of x. */
    const T prod = (xval - xold1val) * (xold1val - xold2val);

    T asy_factor = (prod < T(0)) ? asydecr :
        (prod > T(0)) ? asyincr : T(1);

    T new_low = fma(-asy_factor, (xold1val - lowval), xval);
    T new_upp = fma(asy_factor, (uppval - xold1val), xval);

    /* Keep the asymptotes within the usual bounds around x. */
    new_low = max(new_low, xval - T(10.0) * xdiffval);
    new_low = min(new_low, xval - T(0.01) * xdiffval);

    new_upp = min(new_upp, xval + T(10.0) * xdiffval);
    new_upp = max(new_upp, xval + T(0.01) * xdiffval);

    low[tj] = new_low;
    upp[tj] = new_upp;
  }
}

template <typename T>
__global__ void mma_sub3_kernel(const T* __restrict__ x,
    const T* __restrict__ df0dx, const T* __restrict__ dfdx,
    T* __restrict__ low, T* __restrict__ upp,
    const T* __restrict__ xmin, const T* __restrict__ xmax,
    T* __restrict__ alpha, T* __restrict__ beta,
    T* __restrict__ p0j, T* __restrict__ q0j,
    T* __restrict__ pij, T* __restrict__ qij,
    const int n, const int m) {
  int tj = blockIdx.x * blockDim.x + threadIdx.x;
  if (tj < n) {
    const T xt = x[tj];
    const T xmin_j = xmin[tj];
    const T xmax_j = xmax[tj];
    const T low_j = low[tj];
    const T upp_j = upp[tj];
    const T df0 = df0dx[tj];
    const T xgap = xmax_j - xmin_j;

    /* Move limits alpha and beta. */
    const T half_xgap = 0.5 * xgap;
    const T tenth_low_diff = T(0.1) * (xt - low_j);
    const T tenth_upp_diff = T(0.1) * (upp_j - xt);

    T alpha_val = max(max(xmin_j, low_j + tenth_low_diff), xt - half_xgap);
    T beta_val = min(min(xmax_j, upp_j - tenth_upp_diff), xt + half_xgap);

    alpha[tj] = alpha_val;
    beta[tj] = beta_val;

    const T upp_minus_x = upp_j - xt;
    const T x_minus_low = xt - low_j;
    const T upp_minus_x_sq = upp_minus_x * upp_minus_x;
    const T x_minus_low_sq = x_minus_low * x_minus_low;

    const T eps = T(1e-5);
    const T inv_xgap = T(1.0) / max(eps, xgap);

    /* Approximation coefficients for the objective ... */
    const T max_df0_pos = max(df0, T(0));
    const T max_df0_neg = max(-df0, T(0));

    p0j[tj] = upp_minus_x_sq * (T(1.001) * max_df0_pos +
        T(0.001) * max_df0_neg + eps * inv_xgap);
    q0j[tj] = x_minus_low_sq * (T(0.001) * max_df0_pos +
        T(1.001) * max_df0_neg + eps * inv_xgap);

    /* ... and for each of the m constraints. */
    for (int i = 0; i < m; ++i) {
      int idx = i + tj * m;

      T dfdx_val = dfdx[idx];
      T max_pos = max(dfdx_val, T(0));
      T max_neg = max(-dfdx_val, T(0));

      pij[idx] = upp_minus_x_sq * (T(1.001) * max_pos +
          T(0.001) * max_neg + eps * inv_xgap);
      qij[idx] = x_minus_low_sq * (T(0.001) * max_pos +
          T(1.001) * max_neg + eps * inv_xgap);
    }
  }
}

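/*
 * Note: p0j/q0j and pij/qij assembled above are the numerators of the usual
 * separable MMA approximation p / (upp - x) + q / (x - low) evaluated by the
 * subproblem kernels further down (see relambda_kernel and delx_kernel); the
 * eps/xgap term keeps both numerators strictly positive even where the
 * corresponding gradient vanishes.
 */
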
template <typename T>
__global__ void mma_sub4_kernel(const T* __restrict__ x,
    const T* __restrict__ low, const T* __restrict__ upp,
    const T* __restrict__ pij, const T* __restrict__ qij,
    T* __restrict__ temp, const int n, const int m) {
  int tj = blockIdx.x * blockDim.x + threadIdx.x;
  if (tj < n) {
    const T xt = x[tj];
    const T low_j = low[tj];
    const T upp_j = upp[tj];

    const T denom_upp = upp_j - xt;
    const T denom_low = xt - low_j;

    /* Guard against a vanishing asymptote gap. */
    const T eps = T(1e-12);
    const T inv_denom_upp = T(1) / max(denom_upp, eps);
    const T inv_denom_low = T(1) / max(denom_low, eps);

    const int base_idx = tj * m;

    for (int i = 0; i < m; ++i) {
      int idx = base_idx + i;
      temp[idx] = pij[idx] * inv_denom_upp + qij[idx] * inv_denom_low;
    }
  }
}

template <typename T>
__global__ void mma_max2_kernel(T* __restrict__ xsi,
    const T* __restrict__ x, const T* __restrict__ alpha,
    const int n) {
  int tj = blockIdx.x * blockDim.x + threadIdx.x;
  if (tj < n) {
    const T eps = T(1e-12);
    T denom = max(x[tj] - alpha[tj], eps);
    xsi[tj] = max(T(1), T(1) / denom);
  }
}

template <typename T>
__global__ void relambda_kernel(T* __restrict__ temp,
    const T* __restrict__ x, const T* __restrict__ xupp,
    const T* __restrict__ xlow, const T* __restrict__ pij,
    const T* __restrict__ qij, const int n, const int m) {
  int tj = blockIdx.x * blockDim.x + threadIdx.x;
  if (tj < n) {
    const T xt = x[tj];
    const T xup = xupp[tj];
    const T xlo = xlow[tj];

    const T eps = T(1e-12);
    const T inv_denom_upp = T(1) / max(xup - xt, eps);
    const T inv_denom_low = T(1) / max(xt - xlo, eps);

    int base_idx = tj * m;
    for (int i = 0; i < m; ++i) {
      int idx = base_idx + i;
      temp[idx] = pij[idx] * inv_denom_upp + qij[idx] * inv_denom_low;
    }
  }
}

template <typename T>
__global__ void sub2cons2_kernel(T* __restrict__ a,
    const T* __restrict__ b, const T* __restrict__ c,
    const T* __restrict__ d, const T e, const int n) {
  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  if (idx >= n)
    return;

  const T bt = b[idx];
  const T ct = c[idx];
  const T dt = d[idx];

  a[idx] = bt * (ct - dt) - e;
}

template <typename T>
__inline__ __device__ T max_reduce_warp(T val) {
  const int w = warpSize;
  for (int offset = w / 2; offset > 0; offset /= 2)
    val = max(val, __shfl_down(val, offset, w));
  return val;
}

template <typename T>
__global__ void maxval_kernel(const T* __restrict__ a, T* temp, const int n) {
  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  const unsigned int lane = threadIdx.x % warpSize;
  const unsigned int wid = threadIdx.x / warpSize;

  __shared__ T shared[16];
  T maxval = T(0);
  for (int i = idx; i < n; i += str) {
    maxval = max(maxval, abs(a[i]));
  }

  maxval = max_reduce_warp<T>(maxval);
  if (lane == 0)
    shared[wid] = maxval;
  __syncthreads();

  maxval = (threadIdx.x < blockDim.x / warpSize) ? shared[lane] : 0.0;
  if (wid == 0)
    maxval = max_reduce_warp<T>(maxval);

  if (threadIdx.x == 0)
    temp[blockIdx.x] = maxval;
}

template <typename T>
__global__ void max_reduce_kernel(T* __restrict__ bufred, const int n) {
  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int stride = blockDim.x * gridDim.x;

  T maxval = T(0);
  for (int i = idx; i < n; i += stride) {
    maxval = max(maxval, bufred[i]);
  }

  __shared__ T shared[16];
  unsigned int lane = threadIdx.x % warpSize;
  unsigned int wid = threadIdx.x / warpSize;

  maxval = max_reduce_warp<T>(maxval);
  if (lane == 0)
    shared[wid] = maxval;
  __syncthreads();

  maxval = (threadIdx.x < blockDim.x / warpSize) ? shared[lane] : T(0);
  if (wid == 0)
    maxval = max_reduce_warp<T>(maxval);

  if (threadIdx.x == 0) {
    bufred[blockIdx.x] = maxval;
  }
}

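/*
 * maxval_kernel and max_reduce_kernel form a two-pass reduction: the first
 * pass leaves one partial maximum per block in a scratch buffer, the second
 * pass, launched with a single block, reduces that buffer in place so the
 * result ends up in element 0. Host-side sketch (buffer names, grid and block
 * sizes are assumptions):
 *
 *   hipLaunchKernelGGL(HIP_KERNEL_NAME(maxval_kernel<double>),
 *                      dim3(nblocks), dim3(nthreads), 0, stream,
 *                      a_d, buf_d, n);
 *   hipLaunchKernelGGL(HIP_KERNEL_NAME(max_reduce_kernel<double>),
 *                      dim3(1), dim3(nthreads), 0, stream, buf_d, nblocks);
 */
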
template <typename T>
__global__ void delx_kernel(T* __restrict__ delx,
    const T* __restrict__ x, const T* __restrict__ xlow,
    const T* __restrict__ xupp, const T* __restrict__ pij,
    const T* __restrict__ qij, const T* __restrict__ p0j,
    const T* __restrict__ q0j, const T* __restrict__ alpha,
    const T* __restrict__ beta, const T* __restrict__ lambda,
    const T epsi, const int n, const int m) {
  int tj = blockIdx.x * blockDim.x + threadIdx.x;
  if (tj < n) {
    const T xt = x[tj];
    const T xlow_j = xlow[tj];
    const T xupp_j = xupp[tj];
    T alpha_j = alpha[tj];
    T beta_j = beta[tj];

    T denom_low = xt - xlow_j;
    T denom_upp = xupp_j - xt;
    T denom_alpha = xt - alpha_j;
    T denom_beta = beta_j - xt;

    /* Avoid division by zero when x touches an asymptote or a move limit. */
    const T small_eps = T(1e-12);
    denom_low = (abs(denom_low) < small_eps) ? small_eps : denom_low;
    denom_upp = (abs(denom_upp) < small_eps) ? small_eps : denom_upp;
    denom_alpha = (abs(denom_alpha) < small_eps) ? small_eps : denom_alpha;
    denom_beta = (abs(denom_beta) < small_eps) ? small_eps : denom_beta;

    T sum = T(0);
    for (int i = 0; i < m; i++) {
      T lambda_i = lambda[i];
      sum += pij[i + tj * m] * lambda_i / (denom_upp * denom_upp)
          - qij[i + tj * m] * lambda_i / (denom_low * denom_low);
    }
    sum += p0j[tj] / (denom_upp * denom_upp)
        - q0j[tj] / (denom_low * denom_low)
        - epsi / denom_alpha + epsi / denom_beta;

    delx[tj] = sum;
  }
}

template <typename T>
__global__ void GG_kernel(T* __restrict__ GG,
    const T* __restrict__ x, const T* __restrict__ xlow,
    const T* __restrict__ xupp, const T* __restrict__ pij,
    const T* __restrict__ qij, const int n, const int m) {
  int tj = blockIdx.x * blockDim.x + threadIdx.x;
  if (tj < n) {
    const T xt = x[tj];
    const T xlow_j = xlow[tj];
    const T xupp_j = xupp[tj];

    T diff_upper = xupp_j - xt;
    T diff_lower = xt - xlow_j;

    T diff_upper2 = diff_upper * diff_upper;
    T diff_lower2 = diff_lower * diff_lower;

    for (int i = 0; i < m; i++) {
      int idx = i + tj * m;
      GG[idx] = pij[idx] / diff_upper2 - qij[idx] / diff_lower2;
    }
  }
}

template <typename T>
__global__ void diagx_kernel(T* __restrict__ diagx,
    const T* __restrict__ x, const T* __restrict__ xsi,
    const T* __restrict__ xlow, const T* __restrict__ xupp,
    const T* __restrict__ p0j, const T* __restrict__ q0j,
    const T* __restrict__ pij, const T* __restrict__ qij,
    const T* alpha, const T* beta, const T* eta,
    const T* lambda, const int n, const int m) {
  int tj = blockIdx.x * blockDim.x + threadIdx.x;
  if (tj < n) {
    T sum = 0;
    T sum1 = 0;
    for (int i = 0; i < m; i++) {
      sum = sum + pij[tj * m + i] * lambda[i];
      sum1 = sum1 + qij[tj * m + i] * lambda[i];
    }
    diagx[tj] = (p0j[tj] + sum) / pow(xupp[tj] - x[tj], 3) +
        (q0j[tj] + sum1) / pow(x[tj] - xlow[tj], 3);
    diagx[tj] = 2.0 * diagx[tj] + xsi[tj] / (x[tj] - alpha[tj]) +
        eta[tj] / (beta[tj] - x[tj]);
  }
}

template <typename T>
__inline__ __device__ T reduce_warp(T val) {
  const int w = warpSize;
  for (int offset = w / 2; offset > 0; offset /= 2)
    val += __shfl_down(val, offset, w);
  return val;
}

template <typename T>
__global__ void mmareduce_kernel(T* __restrict__ bufred, const int n) {
  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  T sum = T(0);
  for (int i = idx; i < n; i += str)
    sum += bufred[i];

  __shared__ T shared[16];
  unsigned int lane = threadIdx.x % warpSize;
  unsigned int wid = threadIdx.x / warpSize;

  sum = reduce_warp<T>(sum);
  if (lane == 0)
    shared[wid] = sum;
  __syncthreads();

  sum = (threadIdx.x < blockDim.x / warpSize) ? shared[lane] : 0;
  if (wid == 0)
    sum = reduce_warp<T>(sum);

  if (threadIdx.x == 0)
    bufred[blockIdx.x] = sum;
}

template <typename T>
__global__ void mmasum_kernel(const T* __restrict__ a, T* __restrict__ buf_h,
    const int n, const int m, const int k) {
  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  const unsigned int lane = threadIdx.x % warpSize;
  const unsigned int wid = threadIdx.x / warpSize;

  __shared__ T shared[16];
  T sum = T(0);
  for (int i = idx; i < n; i += str)
    sum += a[m * i + k];

  sum = reduce_warp<T>(sum);
  if (lane == 0)
    shared[wid] = sum;
  __syncthreads();

  sum = (threadIdx.x < blockDim.x / warpSize) ? shared[lane] : 0;
  if (wid == 0)
    sum = reduce_warp<T>(sum);

  if (threadIdx.x == 0)
    buf_h[blockIdx.x] = sum;
}

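/*
 * mmasum_kernel sums column k of an n-by-m array stored with m contiguous
 * entries per row (a[m * i + k]); each block writes one partial sum, which
 * mmareduce_kernel then collapses to a single value. Sketch (names and launch
 * sizes are assumptions):
 *
 *   hipLaunchKernelGGL(HIP_KERNEL_NAME(mmasum_kernel<double>),
 *                      dim3(nblocks), dim3(nthreads), 0, stream,
 *                      pij_d, buf_d, n, m, k);
 *   hipLaunchKernelGGL(HIP_KERNEL_NAME(mmareduce_kernel<double>),
 *                      dim3(1), dim3(nthreads), 0, stream, buf_d, nblocks);
 */
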
template <typename T>
__global__ void mmasumbb_kernel(const T* __restrict__ GG,
    const T* __restrict__ delx, const T* __restrict__ diagx,
    T* __restrict__ buf_h, const int n, const int m, const int k) {
  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  const unsigned int lane = threadIdx.x % warpSize;
  const unsigned int wid = threadIdx.x / warpSize;

  __shared__ T shared[16];
  T sum = T(0);
  for (int i = idx; i < n; i += str)
    sum += GG[k + i * m] * delx[i] / diagx[i];

  sum = reduce_warp<T>(sum);
  if (lane == 0)
    shared[wid] = sum;
  __syncthreads();

  sum = (threadIdx.x < blockDim.x / warpSize) ? shared[lane] : 0;
  if (wid == 0)
    sum = reduce_warp<T>(sum);

  if (threadIdx.x == 0)
    buf_h[blockIdx.x] = sum;
}

template <typename T>
__global__ void mmasumHess_kernel(const T* __restrict__ hijx,
    const T* __restrict__ Ljjxinv, T* __restrict__ buf_h,
    const int n, const int m, const int k0, const int k1) {
  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int stride = blockDim.x * gridDim.x;

  const unsigned int lane = threadIdx.x % warpSize;
  const unsigned int wid = threadIdx.x / warpSize;

  __shared__ T shared[16];
  T sum = T(0);

  /* Partial sum of hijx(k0,:) * diag(Ljjxinv) * hijx(k1,:). */
  for (int i = idx; i < n; i += stride) {
    T val0 = hijx[k0 + i * m];
    T val1 = hijx[k1 + i * m];
    sum += val0 * Ljjxinv[i] * val1;
  }

  sum = reduce_warp<T>(sum);
  if (lane == 0)
    shared[wid] = sum;
  __syncthreads();

  sum = (threadIdx.x < blockDim.x / warpSize) ? shared[lane] : T(0);
  if (wid == 0)
    sum = reduce_warp<T>(sum);

  if (threadIdx.x == 0) {
    buf_h[blockIdx.x] = sum;
  }
}

template <typename T>
__global__ void mmasumAA_kernel(const T* __restrict__ GG,
    const T* __restrict__ diagx, T* __restrict__ buf_h,
    const int n, const int m, const int k0, const int k1) {
  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  const unsigned int lane = threadIdx.x % warpSize;
  const unsigned int wid = threadIdx.x / warpSize;

  __shared__ T shared[16];
  T sum = T(0);
  for (int i = idx; i < n; i += str)
    sum += GG[k0 + i * m] / diagx[i] * GG[k1 + i * m];

  sum = reduce_warp<T>(sum);
  if (lane == 0)
    shared[wid] = sum;
  __syncthreads();

  sum = (threadIdx.x < blockDim.x / warpSize) ? shared[lane] : 0;
  if (wid == 0)
    sum = reduce_warp<T>(sum);

  if (threadIdx.x == 0)
    buf_h[blockIdx.x] = sum;
}

template <typename T>
__global__ void mma_copy_kernel(T* __restrict__ a, const T* __restrict__ b,
    const int n, const int m) {
  int tj = blockIdx.x * blockDim.x + threadIdx.x;
  if (tj < n)
    a[tj] = b[tj];
}

template <typename T>
__global__ void AA_kernel(T* __restrict__ temp, const T* __restrict__ GG,
    const T* __restrict__ diagx, const int n, const int m) {
  int tj = blockIdx.x * blockDim.x + threadIdx.x;
  if (tj < n) {
    for (int i0 = 0; i0 < m; i0++) {
      for (int i1 = 0; i1 < m; i1++) {
        temp[tj + i0 * (n + 1) + i1 * (m + 1) * (n + 1)] = GG[i0 * n + tj] *
            (1.0 / diagx[tj]) * GG[i1 * n + tj];
      }
    }
  }
}

template <typename T>
__global__ void dx_kernel(T* __restrict__ dx, const T* __restrict__ delx,
    const T* __restrict__ diagx, const T* __restrict__ GG,
    const T* __restrict__ dlambda, const int n, const int m) {
  int tj = blockIdx.x * blockDim.x + threadIdx.x;
  if (tj < n) {
    dx[tj] = -delx[tj] / diagx[tj];
    for (int i = 0; i < m; i++) {
      dx[tj] = dx[tj] - GG[tj * m + i] * dlambda[i] / diagx[tj];
    }
  }
}

template <typename T>
__global__ void dxsi_kernel(T* __restrict__ dxsi, const T* __restrict__ xsi,
    const T* __restrict__ dx, const T* __restrict__ x,
    const T* __restrict__ alpha, const T epsi, const int n) {
  int tj = blockIdx.x * blockDim.x + threadIdx.x;
  if (tj < n) {
    dxsi[tj] = -xsi[tj] + (epsi - dx[tj] * xsi[tj]) / (x[tj] - alpha[tj]);
  }
}

template <typename T>
__global__ void deta_kernel(T* __restrict__ deta, const T* __restrict__ eta,
    const T* __restrict__ dx, const T* __restrict__ x,
    const T* __restrict__ beta, const T epsi, const int n) {
  int tj = blockIdx.x * blockDim.x + threadIdx.x;
  if (tj < n) {
    deta[tj] = -eta[tj] + (epsi + dx[tj] * eta[tj]) / (beta[tj] - x[tj]);
  }
}

template <typename T>
__global__ void RexCalculation_kernel(T* __restrict__ rex,
    const T* __restrict__ x, const T* __restrict__ xlow,
    const T* __restrict__ xupp, const T* __restrict__ pij,
    const T* __restrict__ p0j, const T* __restrict__ qij,
    const T* __restrict__ q0j, const T* __restrict__ lambda,
    const T* __restrict__ xsi, const T* __restrict__ eta,
    const int n, const int m) {
  int tj = blockIdx.x * blockDim.x + threadIdx.x;
  if (tj < n) {
    T upp_diff = xupp[tj] - x[tj];
    T low_diff = x[tj] - xlow[tj];
    T upp_diff_sq = upp_diff * upp_diff;
    T low_diff_sq = low_diff * low_diff;

    T sum = T(0);
    for (int i = 0; i < m; ++i) {
      sum += pij[i + tj * m] * lambda[i] / upp_diff_sq
          - qij[i + tj * m] * lambda[i] / low_diff_sq;
    }

    rex[tj] = sum
        + p0j[tj] / upp_diff_sq
        - q0j[tj] / low_diff_sq
        - xsi[tj] + eta[tj];
  }
}

template <typename T>
__global__ void rey_calculation_kernel(T* __restrict__ rey,
    const T* __restrict__ c, const T* __restrict__ d,
    const T* __restrict__ y, const T* __restrict__ lambda,
    const T* __restrict__ mu, const int n) {
  int tj = blockIdx.x * blockDim.x + threadIdx.x;
  if (tj < n) {
    rey[tj] = c[tj] + d[tj] * y[tj] - lambda[tj] - mu[tj];
  }
}

template <typename T>
__global__ void norm_kernel(const T* __restrict__ a, T* __restrict__ buf_h,
    const int n) {
  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  const unsigned int lane = threadIdx.x % warpSize;
  const unsigned int wid = threadIdx.x / warpSize;

  __shared__ T shared[16];
  T sum = T(0);
  for (int i = idx; i < n; i += str)
    sum += pow(a[i], 2);

  sum = reduce_warp<T>(sum);
  if (lane == 0)
    shared[wid] = sum;
  __syncthreads();

  sum = (threadIdx.x < blockDim.x / warpSize) ? shared[lane] : 0;
  if (wid == 0)
    sum = reduce_warp<T>(sum);

  if (threadIdx.x == 0)
    buf_h[blockIdx.x] = sum;
}

template <typename T>
__global__ void sub2cons_kernel(T* __restrict__ a, const T* __restrict__ b,
    const T* __restrict__ c, const T d, const int n) {
  int tj = blockIdx.x * blockDim.x + threadIdx.x;
  if (tj < n) {
    a[tj] = b[tj] * c[tj] - d;
  }
}

template <typename T>
__global__ void dely_kernel(T* __restrict__ dely, const T* __restrict__ c,
    const T* __restrict__ d, const T* __restrict__ y,
    const T* __restrict__ lambda, const T epsi, const int n) {
  int tj = blockIdx.x * blockDim.x + threadIdx.x;
  if (tj < n) {
    dely[tj] = c[tj] + d[tj] * y[tj] - lambda[tj] - epsi / y[tj];
  }
}

template <typename T>
__global__ void maxval2_kernel(const T* __restrict__ a,
    const T* __restrict__ b, T* __restrict__ temp,
    const T cons, const int n) {
  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  const unsigned int lane = threadIdx.x % warpSize;
  const unsigned int wid = threadIdx.x / warpSize;

  __shared__ T shared[16];
  T maxval = cons * a[0] / b[0];
  for (int i = idx; i < n; i += str) {
    maxval = max(maxval, cons * a[i] / b[i]);
  }

  maxval = max_reduce_warp<T>(maxval);
  if (lane == 0)
    shared[wid] = maxval;
  __syncthreads();

  maxval = (threadIdx.x < blockDim.x / warpSize) ? shared[lane] : 0.0;
  if (wid == 0)
    maxval = max_reduce_warp<T>(maxval);

  if (threadIdx.x == 0)
    temp[blockIdx.x] = maxval;
}

template <typename T>
__global__ void maxval3_kernel(const T* __restrict__ a,
    const T* __restrict__ b, const T* __restrict__ c,
    T* __restrict__ temp, const T cons, const int n) {
  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  const unsigned int lane = threadIdx.x % warpSize;
  const unsigned int wid = threadIdx.x / warpSize;

  __shared__ T shared[16];
  T maxval = cons * a[0] / b[0];
  for (int i = idx; i < n; i += str) {
    maxval = max(maxval, cons * a[i] / (b[i] - c[i]));
  }

  maxval = max_reduce_warp<T>(maxval);
  if (lane == 0)
    shared[wid] = maxval;
  __syncthreads();

  maxval = (threadIdx.x < blockDim.x / warpSize) ? shared[lane] : 0;
  if (wid == 0)
    maxval = max_reduce_warp<T>(maxval);

  if (threadIdx.x == 0)
    temp[blockIdx.x] = maxval;
}

template <typename T>
__global__ void kkt_rex_kernel(T* __restrict__ rex,
    const T* __restrict__ df0dx, const T* __restrict__ dfdx,
    const T* __restrict__ xsi, const T* __restrict__ eta,
    const T* __restrict__ lambda, const int n, const int m) {
  int tj = blockIdx.x * blockDim.x + threadIdx.x;
  if (tj < n) {
    rex[tj] = 0.0;
    for (int i = 0; i < m; i++) {
      rex[tj] = rex[tj] + dfdx[i + tj * m] * lambda[i];
    }
    rex[tj] += df0dx[tj] - xsi[tj] + eta[tj];
  }
}

template <typename T>
__global__ void maxcons_kernel(T* __restrict__ a, const T b,
    const T c, const T* __restrict__ d, const int n) {
  int tj = blockIdx.x * blockDim.x + threadIdx.x;
  if (tj < n) {
    a[tj] = max(b, c * d[tj]);
  }
}

template <typename T>
__global__ void glsum_kernel(const T* a, T* buf_h, const int n) {
  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  const unsigned int lane = threadIdx.x % warpSize;
  const unsigned int wid = threadIdx.x / warpSize;

  __shared__ T shared[16];
  T sum = T(0);
  for (int i = idx; i < n; i += str)
    sum += a[i];

  sum = reduce_warp<T>(sum);
  if (lane == 0)
    shared[wid] = sum;
  __syncthreads();

  sum = (threadIdx.x < blockDim.x / warpSize) ? shared[lane] : 0;
  if (wid == 0)
    sum = reduce_warp<T>(sum);

  if (threadIdx.x == 0)
    buf_h[blockIdx.x] = sum;
}

template <typename T>
__global__ void glsc2_kernel(const T* a, const T* b, T* buf_h, const int n) {
  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  const unsigned int lane = threadIdx.x % warpSize;
  const unsigned int wid = threadIdx.x / warpSize;

  __shared__ T shared[16];
  T sum = T(0);
  for (int i = idx; i < n; i += str) {
    sum += a[i] * b[i];
  }

  sum = reduce_warp<T>(sum);
  if (lane == 0)
    shared[wid] = sum;
  __syncthreads();

  sum = (threadIdx.x < blockDim.x / warpSize) ? shared[lane] : 0;
  if (wid == 0)
    sum = reduce_warp<T>(sum);

  if (threadIdx.x == 0)
    buf_h[blockIdx.x] = sum;
}

template <typename T>
__global__ void add2inv2_kernel(T* __restrict__ a, const T* __restrict__ b,
    const T c, const int n) {
  int tj = blockIdx.x * blockDim.x + threadIdx.x;
  if (tj < n) {
    a[tj] = a[tj] + c / b[tj];
  }
}

template <typename T>
__global__ void max2_kernel(T* __restrict__ a, const T b,
    const T* __restrict__ c, const T d, const int n) {
  int tj = blockIdx.x * blockDim.x + threadIdx.x;
  if (tj < n) {
    a[tj] = max(b, d * c[tj]);
  }
}

template <typename T>
__global__ void updatebb_kernel(T* __restrict__ bb,
    const T* __restrict__ dellambda, const T* __restrict__ dely,
    const T* __restrict__ d, const T* __restrict__ mu,
    const T* __restrict__ y, const T delz, const int m) {
  int tj = blockIdx.x * blockDim.x + threadIdx.x;
  if (tj < m) {
    bb[tj] = dellambda[tj] + dely[tj] / (d[tj] + mu[tj] / y[tj]) - bb[tj];
  }
  if (tj == m) {
    bb[tj] = delz;
  }
}

template <typename T>
__global__ void updateAA_kernel(T* __restrict__ AA,
    const T* __restrict__ globaltmp_mm, const T* __restrict__ s,
    const T* __restrict__ lambda, const T* __restrict__ d,
    const T* __restrict__ mu, const T* __restrict__ y,
    const T* __restrict__ a, const T zeta, const T z, const int m) {
  int tj = blockIdx.x * blockDim.x + threadIdx.x;
  if (tj < m) {
    AA[tj + tj * (m + 1)] = globaltmp_mm[tj + tj * m] + (s[tj] / lambda[tj] +
        1.0 / (d[tj] + mu[tj] / y[tj]));
    AA[tj + m * (m + 1)] = a[tj];
    AA[m + tj * (m + 1)] = a[tj];
  }
  if (tj == m) {
    AA[tj + tj * (m + 1)] = -zeta / z;
  }
}

template <typename T>
__global__ void dy_kernel(T* __restrict__ dy, const T* __restrict__ dely,
    const T* __restrict__ dlambda, const T* __restrict__ d,
    const T* __restrict__ mu, const T* __restrict__ y, const int n) {
  int tj = blockIdx.x * blockDim.x + threadIdx.x;
  if (tj < n) {
    dy[tj] = (-dely[tj] + dlambda[tj]) / (d[tj] + mu[tj] / y[tj]);
  }
}