Neko-TOP
A portable framework for high-order spectral element flow topology optimization.
Loading...
Searching...
No Matches
mma_kernel.h
Go to the documentation of this file.
1
37#ifndef MMA_HIP_KERNEL_H
38#define MMA_HIP_KERNEL_H
39
/**
 * Rank-one downdate of the (m x m) Hessian: Hess -= a * a^T.
 * One thread per matrix entry; launch with at least m*m threads.
 * Fix: restores the Hess output parameter that was dropped from the
 * signature (the body writes Hess, which was otherwise undeclared).
 */
template <typename T>
__global__ void mma_update_hessian_z_kernel(
    T* __restrict__ Hess,
    const T* __restrict__ a,
    const int m)
{
    int tid = blockIdx.x * blockDim.x + threadIdx.x;
    int total = m * m;

    if (tid >= total) return;

    int i = tid % m;  // row index
    int j = tid / m;  // column index

    Hess[tid] -= a[i] * a[j];
}
56
// Assemble the (m+1) x (m+1) KKT system matrix AA for the MMA subsolve:
// the diagonal is augmented with s/lambda + 1/(d + mu/y), the last
// row/column hold the linear constraint coefficients a, and the
// bottom-right corner is set to -zeta/z by a single thread.
template<typename T>
__global__ void mma_prepare_aa_matrix_kernel(T* __restrict__ AA,
    const T* __restrict__ s, const T* __restrict__ lambda,
    const T* __restrict__ d, const T* __restrict__ mu,
    const T* __restrict__ y, const T* __restrict__ a,
    const T zeta, const T z, const int m) {
    const int row = blockIdx.x * blockDim.x + threadIdx.x;
    const int ld = m + 1;  // leading dimension of AA

    if (row >= m) return;

    // Diagonal augmentation from the slack/dual variables.
    AA[row * ld + row] += s[row] / lambda[row] +
        (T)1.0 / (d[row] + mu[row] / y[row]);

    const T a_row = a[row];
    AA[row * ld + m] = a_row;  // last column
    AA[m * ld + row] = a_row;  // last row

    // Only one thread writes the corner element.
    if (row == 0)
        AA[m * ld + m] = -zeta / z;
}
76
77
78//Update Hessian diagonal elements (y contributions for dip subsolve)
// Apply y- and lambda-contributions to the Hessian diagonal for the
// dual interior-point subsolve: subtract 1 for each inactive constraint
// (y > 0), then subtract mu/lambda (eq 10). One thread per diagonal entry.
template<typename T>
__global__ void mma_update_hessian_diagonal_kernel(T* __restrict__ Hess,
    const T* __restrict__ y, const T* __restrict__ mu,
    const T* __restrict__ lambda, const int m) {
    const int j = blockIdx.x * blockDim.x + threadIdx.x;
    if (j >= m) return;

    const int dpos = j * m + j;  // flat index of the diagonal entry
    T value = Hess[dpos];

    // y contribution applies only when the constraint is inactive.
    if (y[j] > (T)0.0)
        value -= (T)1.0;

    // -mu/lambda contribution (eq 10).
    value -= mu[j] / lambda[j];
    Hess[dpos] = value;
}
96
97// Levenberg-Marquardt algorithm (heuristically)
98// Single-block version for m <= 1024
// Heuristic Levenberg-Marquardt stabilization of the Hessian diagonal.
// Single-block variant intended for m <= 1024: thread 0 performs the
// whole trace/update serially, so no synchronization is needed.
template<typename T>
__global__ void mma_stabilize_hessian_single_kernel(T* __restrict__ Hess, const int m) {
    if (threadIdx.x != 0) return;

    // Accumulate the trace of Hess.
    T trace = (T)0.0;
    for (int j = 0; j < m; j++)
        trace += Hess[j * m + j];

    // Damping factor proportional to the mean diagonal, floored at 1e-7.
    T lm_factor = max((T)(-1.0e-4) * trace / m, (T)1.0e-7);

    // Shift every diagonal entry by -lm_factor.
    for (int j = 0; j < m; j++)
        Hess[j * m + j] -= lm_factor;
}
117
118// Levenberg-Marquardt algorithm (heuristically)
119// Multi-block version for m > 1024
// Heuristic Levenberg-Marquardt stabilization, multi-block variant for
// m > 1024: each thread damps one diagonal entry by the host-computed factor.
template<typename T>
__global__ void mma_stabilize_hessian_multi_kernel(T* __restrict__ Hess, const T lm_factor, const int m) {
    const int j = blockIdx.x * blockDim.x + threadIdx.x;
    if (j >= m) return;
    Hess[j * m + j] -= lm_factor;
}
127
128// Small linear solver efficient for (n <= 100)
// In-place LU solve with partial pivoting of the n x n system A x = b,
// overwriting b with the solution. Intended for a single thread block and
// small systems (n <= 100). Near-singular pivots (|pivot| <= 1e-12) are
// skipped, leaving the corresponding entries of b untouched.
// Fix: restores the __syncthreads() barriers that were dropped from the
// listing — without them the pivot swap, elimination, and substitution
// phases race on A and b.
template<typename T>
__global__ void mma_small_lu_kernel(T* __restrict__ A, T* __restrict__ b, const int n) {
    const int tid = threadIdx.x;

    // Trivial 1x1 system.
    if (n == 1) {
        if (tid == 0 && abs(A[0]) > (T)1e-12) b[0] /= A[0];
        return;
    }

    // LU decomposition with partial pivoting
    for (int k = 0; k < n; k++) {
        // Pivot search and row/rhs swap - single thread
        if (tid == 0) {
            int max_row = k;
            T max_val = abs(A[k * n + k]);
            for (int i = k + 1; i < n; i++) {
                T val = abs(A[i * n + k]);
                if (val > max_val) {
                    max_val = val;
                    max_row = i;
                }
            }
            if (max_val > (T)1e-12 && max_row != k) {
                // Swap rows
                for (int j = k; j < n; j++) {
                    T temp = A[k * n + j];
                    A[k * n + j] = A[max_row * n + j];
                    A[max_row * n + j] = temp;
                }
                // Swap rhs
                T temp_b = b[k];
                b[k] = b[max_row];
                b[max_row] = temp_b;
            }
        }
        // All threads must observe the pivoted rows before eliminating.
        __syncthreads();

        // Parallel elimination of column k below the pivot
        T diag = A[k * n + k];
        if (abs(diag) > (T)1e-12) {
            for (int i = tid + k + 1; i < n; i += blockDim.x) {
                T factor = A[i * n + k] / diag;
                A[i * n + k] = factor;
                for (int j = k + 1; j < n; j++) {
                    A[i * n + j] -= factor * A[k * n + j];
                }
            }
        }
        // Complete this column before the next pivot step.
        __syncthreads();
    }

    // Forward substitution (L y = b), result stored back into b.
    // NOTE(review): threads read b[j] (j < i) that other threads write in
    // the same pass; ordering appears to rely on the strided work split —
    // confirm for block sizes > warpSize.
    for (int i = tid; i < n; i += blockDim.x) {
        T sum = b[i];
        for (int j = 0; j < i; j++) {
            sum -= A[i * n + j] * b[j];
        }
        b[i] = sum;
    }
    __syncthreads();

    // Backward substitution (U x = y)
    for (int i = n - 1 - tid; i >= 0; i -= blockDim.x) {
        if (i >= 0) {
            T sum = b[i];
            for (int j = i + 1; j < n; j++) {
                sum -= A[i * n + j] * b[j];
            }
            if (abs(A[i * n + i]) > (T)1e-12) {
                b[i] = sum / A[i * n + i];
            }
        }
    }
    __syncthreads();
}
205
206
// Per-element integral for a 1D beam discretization:
// Delta[k] = ((L - Le*(offset+k))^3 - (L - Le*(offset+k+1))^3) / 3,
// using 1-based element indexing internally.
template <typename T>
__global__ void delta_1dbeam_kernel(T* __restrict__ Delta,
    const T L_total, const T Le, const int offset, const int n) {
    const int k = blockIdx.x * blockDim.x + threadIdx.x;
    if (k >= n) return;

    // 1-based element index for the closed-form expression.
    const int idx = k + 1;

    // Edge coordinates measured from the free end.
    const T x1 = L_total - Le * static_cast<T>(offset + idx - 1);
    const T x2 = L_total - Le * static_cast<T>(offset + idx);

    // Cubes computed by multiplication (no pow).
    const T cube1 = x1 * x1 * x1;
    const T cube2 = x2 * x2 * x2;

    Delta[k] = (cube1 - cube2) / static_cast<T>(3.0);
}
227
// Inverse diagonal of the Lagrangian Hessian w.r.t. x:
// Ljjxinv[j] = -1 / (2*p/(upp-x)^3 + 2*q/(x-low)^3),
// zeroed where a primal bound constraint (x == alpha or x == beta) is active.
// Fix: restores the pjlambda/qjlambda parameters dropped from the signature
// (the body reads both), and uses T-typed literals so float instantiations
// do not silently promote to double arithmetic.
template <typename T>
__global__ void mma_Ljjxinv_kernel(T* __restrict__ Ljjxinv,
    const T* __restrict__ pjlambda, const T* __restrict__ qjlambda,
    const T* __restrict__ x, const T* __restrict__ low, const T* __restrict__ upp,
    const T* __restrict__ alpha, const T* __restrict__ beta,
    const int n) {
    int tj = blockIdx.x * blockDim.x + threadIdx.x;
    if (tj >= n) return;

    // Load inputs into registers
    const T xt = x[tj];
    const T low_j = low[tj];
    const T upp_j = upp[tj];
    const T pj = pjlambda[tj];
    const T qj = qjlambda[tj];

    // Precompute reused differences
    const T diff_u = upp_j - xt;
    const T diff_l = xt - low_j;

    // Cube once (avoiding pow for speed)
    const T diff_u3 = diff_u * diff_u * diff_u;
    const T diff_l3 = diff_l * diff_l * diff_l;

    // Compute inverse value safely (T-typed literals avoid double promotion)
    const T denom = T(2.0) * pj / diff_u3 + T(2.0) * qj / diff_l3;
    const T val = T(-1.0) / denom;

    // Mask out active primal constraints
    const bool active = (fabs(xt - alpha[tj]) <= T(1e-16)) ||
                        (fabs(xt - beta[tj]) <= T(1e-16));

    Ljjxinv[tj] = active ? T(0.0) : val;
}
262
// Closed-form primal update for the dual interior-point subsolve:
// x = (sqrt(p)*low + sqrt(q)*upp) / (sqrt(p) + sqrt(q)),
// clamped to the move limits [alpha, beta].
// Fix: restores the pjlambda/qjlambda parameters dropped from the
// signature (the body reads both).
template <typename T>
__global__ void mma_dipsolvesub1_kernel(T* __restrict__ x,
    const T* __restrict__ pjlambda, const T* __restrict__ qjlambda,
    const T* __restrict__ low, const T* __restrict__ upp,
    const T* __restrict__ alpha, const T* __restrict__ beta,
    const int n) {

    int tj = blockIdx.x * blockDim.x + threadIdx.x;
    if (tj >= n) return;

    // Load inputs into registers
    const T pj_sqrt = sqrt(pjlambda[tj]);
    const T qj_sqrt = sqrt(qjlambda[tj]);
    const T denom = pj_sqrt + qj_sqrt;

    const T low_j = low[tj];
    const T upp_j = upp[tj];
    const T alpha_j = alpha[tj];
    const T beta_j = beta[tj];

    // Weighted average of the asymptotes
    const T val = (pj_sqrt * low_j + qj_sqrt * upp_j) / denom;

    // Clamp x between alpha and beta using branchless min/max
    x[tj] = fmin(fmax(val, alpha_j), beta_j);
}
289
// output = P^T * lambda, where pij stores column tj contiguously
// (pij[i + tj*m] is constraint row i of column tj). One thread per output.
template <typename T>
__global__ void mattrans_v_mul_kernel(T* __restrict__ output,
    const T* __restrict__ pij, const T* __restrict__ lambda,
    const int m, const int n) {

    const int col = blockIdx.x * blockDim.x + threadIdx.x;
    if (col >= n) return;

    const int base = col * m;

    // Accumulate the dot product in a register.
    T dot = 0.0;
    for (int row = 0; row < m; ++row)
        dot += pij[base + row] * lambda[row];

    output[col] = dot;
}
307
308
// Initial asymptote placement (first MMA iterations):
// xlow = x - asyinit*(xmax - xmin), xupp = x + asyinit*(xmax - xmin).
// Fix: restores the xlow/xupp output parameters dropped from the
// signature (the body writes both).
template <typename T>
__global__ void mma_sub1_kernel(
    T* __restrict__ xlow,
    T* __restrict__ xupp,
    const T* __restrict__ x,
    const T* __restrict__ xmin,
    const T* __restrict__ xmax,
    const T asyinit,
    const int n) {
    int tj = blockIdx.x * blockDim.x + threadIdx.x;
    if (tj >= n) return;

    // Load values once into registers (global memory is slow)
    const T xt = x[tj];
    const T xmin_j = xmin[tj];
    const T xmax_j = xmax[tj];

    // Reuse xgap calculation
    const T xgap = xmax_j - xmin_j;
    const T offset = asyinit * xgap;

    xlow[tj] = xt - offset;
    xupp[tj] = xt + offset;
}
333
// Adaptive asymptote update (later MMA iterations). The oscillation
// indicator (x - xold1)*(xold1 - xold2) selects shrink (asydecr), grow
// (asyincr) or keep (1); the new asymptotes are then clamped so that
// 0.01*xdiff <= |x - asymptote| <= 10*xdiff.
// Fix: restores the new_low/new_upp update lines dropped from the listing,
// following the standard MMA rule low = x - factor*(xold1 - low),
// upp = x + factor*(upp - xold1) (fma keeps one rounding per update).
template< typename T >
__global__ void mma_sub2_kernel(T* __restrict__ low, T* __restrict__ upp,
    const T* __restrict__ x, const T* __restrict__ xold1,
    const T* __restrict__ xold2, const T* __restrict__ xdiff,
    const T asydecr, const T asyincr, const int n) {
    int tj = blockIdx.x * blockDim.x + threadIdx.x;
    if (tj >= n) return;

    // Load data into registers (global memory is slow when accessed repeatedly)
    const T xval = x[tj];
    const T xold1val = xold1[tj];
    const T xold2val = xold2[tj];
    const T lowval = low[tj];
    const T uppval = upp[tj];
    const T xdiffval = xdiff[tj];

    // Oscillation indicator
    const T prod = (xval - xold1val) * (xold1val - xold2val);

    // Compute asy_factor without branching
    T asy_factor = (prod < T(0)) ? asydecr :
                   (prod > T(0)) ? asyincr : T(1);

    // Update low and upp using fma (fused multiply-add) for numerical stability
    // NOTE(review): reconstructed from the standard MMA asymptote rule —
    // verify against the host-side reference implementation.
    T new_low = fma(-asy_factor, xold1val - lowval, xval);
    T new_upp = fma(asy_factor, uppval - xold1val, xval);

    // Apply bounds
    new_low = max(new_low, xval - T(10.0) * xdiffval);
    new_low = min(new_low, xval - T(0.01) * xdiffval);

    new_upp = min(new_upp, xval + T(10.0) * xdiffval);
    new_upp = max(new_upp, xval + T(0.01) * xdiffval);

    // Write results back
    low[tj] = new_low;
    upp[tj] = new_upp;
}
373
// Builds the MMA subproblem data: move limits alpha/beta and the
// approximation coefficients p0j/q0j (objective) and pij/qij (constraints).
// Fix: restores the alpha_val/beta_val move-limit lines and the squared
// asymptote-distance locals dropped from the listing; T(0.5) avoids double
// promotion for float instantiations.
template <typename T>
__global__ void mma_sub3_kernel( const T* __restrict__ x,
    const T* __restrict__ df0dx, const T* __restrict__ dfdx,
    T* __restrict__ low, T* __restrict__ upp, const T* __restrict__ xmin,
    const T* __restrict__ xmax, T* __restrict__ alpha, T* __restrict__ beta,
    T* __restrict__ p0j, T* __restrict__ q0j, T* __restrict__ pij,
    T* __restrict__ qij, const int n, const int m) {
    int tj = blockIdx.x * blockDim.x + threadIdx.x;
    if (tj >= n) return;

    // Load into registers once
    const T xt = x[tj];
    const T xmin_j = xmin[tj];
    const T xmax_j = xmax[tj];
    const T low_j = low[tj];
    const T upp_j = upp[tj];
    const T df0 = df0dx[tj];
    const T xgap = xmax_j - xmin_j;

    // Clamp helpers
    const T half_xgap = T(0.5) * xgap;
    const T tenth_low_diff = T(0.1) * (xt - low_j);
    const T tenth_upp_diff = T(0.1) * (upp_j - xt);

    // Move limits (standard MMA):
    // alpha = max(xmin, low + 0.1*(x-low), x - 0.5*(xmax-xmin))
    // beta  = min(xmax, upp - 0.1*(upp-x), x + 0.5*(xmax-xmin))
    // NOTE(review): reconstructed — verify against the host reference.
    const T alpha_val = max(max(xmin_j, low_j + tenth_low_diff), xt - half_xgap);
    const T beta_val  = min(min(xmax_j, upp_j - tenth_upp_diff), xt + half_xgap);

    alpha[tj] = alpha_val;
    beta[tj] = beta_val;

    // Avoid multiple pow calls, compute once
    const T upp_minus_x = upp_j - xt;
    const T x_minus_low = xt - low_j;
    const T upp_minus_x_sq = upp_minus_x * upp_minus_x;
    const T x_minus_low_sq = x_minus_low * x_minus_low;

    // Small epsilon for numerical stability
    const T eps = T(1e-5);
    const T inv_xgap = T(1.0) / max(eps, xgap);

    // Compute terms reused multiple times
    const T max_df0_pos = max(df0, T(0));
    const T max_df0_neg = max(-df0, T(0));

    p0j[tj] = upp_minus_x_sq * (T(1.001) * max_df0_pos +
              T(0.001) * max_df0_neg + eps * inv_xgap);
    q0j[tj] = x_minus_low_sq * (T(0.001) * max_df0_pos +
              T(1.001) * max_df0_neg + eps * inv_xgap);

    // Loop over m for pij and qij
    for (int i = 0; i < m; ++i) {
        int idx = i + tj * m;

        T dfdx_val = dfdx[idx];
        T max_pos = max(dfdx_val, T(0));
        T max_neg = max(-dfdx_val, T(0));

        pij[idx] = upp_minus_x_sq * (T(1.001) * max_pos +
                   T(0.001) * max_neg + eps * inv_xgap);
        qij[idx] = x_minus_low_sq * (T(0.001) * max_pos +
                   T(1.001) * max_neg + eps * inv_xgap);
    }
}
438
439template <typename T>
440__global__ void mma_sub4_kernel(
441 const T* __restrict__ x,
442 const T* __restrict__ low,
443 const T* __restrict__ upp,
444 const T* __restrict__ pij,
445 const T* __restrict__ qij,
446 T* __restrict__ temp,
447 const int n,
448 const int m) {
449
450 int tj = blockIdx.x * blockDim.x + threadIdx.x;
451 if (tj >= n) return;
452
453 // Register caching
454 const T xt = x[tj];
455 const T low_j = low[tj];
456 const T upp_j = upp[tj];
457
458 const T denom_upp = upp_j - xt;
459 const T denom_low = xt - low_j;
460
461 const T eps = T(1e-12); // Precision-dependent epsilon
462 const T inv_denom_upp = T(1) / max(denom_upp, eps);
463 const T inv_denom_low = T(1) / max(denom_low, eps);
464
465 const int base_idx = tj * m;
466
467 for (int i = 0; i < m; ++i) {
468 int idx = base_idx + i;
469 temp[idx] = pij[idx] * inv_denom_upp + qij[idx] * inv_denom_low;
470 }
471}
472
473
474template <typename T>
475__global__ void mma_max2_kernel(
476 T* __restrict__ xsi,
477 const T* __restrict__ x,
478 const T* __restrict__ alpha,
479 const int n) {
480
481 int tj = blockIdx.x * blockDim.x + threadIdx.x;
482 if (tj >= n) return;
483
484 const T eps = T(1e-12);
485 T denom = max(x[tj] - alpha[tj], eps);
486 xsi[tj] = max(T(1), T(1) / denom);
487}
488
489template <typename T>
490__global__ void relambda_kernel(
491 T* __restrict__ temp,
492 const T* __restrict__ x,
493 const T* __restrict__ xupp,
494 const T* __restrict__ xlow,
495 const T* __restrict__ pij,
496 const T* __restrict__ qij,
497 const int n,
498 const int m) {
499
500 int tj = blockIdx.x * blockDim.x + threadIdx.x;
501 if (tj >= n) return;
502
503 const T xt = x[tj];
504 const T xup = xupp[tj];
505 const T xlo = xlow[tj];
506
507 // Prevent divide-by-zero using small epsilon
508 const T eps = T(1e-12);
509 const T inv_denom_upp = T(1) / max(xup - xt, eps);
510 const T inv_denom_low = T(1) / max(xt - xlo, eps);
511
512 int base_idx = tj * m;
513 for (int i = 0; i < m; ++i) {
514 int idx = base_idx + i;
515 temp[idx] = pij[idx] * inv_denom_upp + qij[idx] * inv_denom_low;
516 }
517}
518
519
520template <typename T>
521__global__ void sub2cons2_kernel(
522 T* __restrict__ a,
523 const T* __restrict__ b,
524 const T* __restrict__ c,
525 const T* __restrict__ d,
526 const T e,
527 const int n) {
528 const int idx = blockIdx.x * blockDim.x + threadIdx.x;
529 if (idx >= n) return;
530
531 const T bt = b[idx];
532 const T ct = c[idx];
533 const T dt = d[idx];
534
535 a[idx] = bt * (ct - dt) - e;
536}
537
// Warp-level max reduction via shuffle-down over the full warp width;
// after the loop, lane 0 holds the maximum of all lanes' inputs.
// (HIP __shfl_down — the _sync variants are CUDA-only.)
template <typename T>
__inline__ __device__ T max_reduce_warp(T val) {
    for (int offset = warpSize >> 1; offset > 0; offset >>= 1)
        val = max(val, __shfl_down(val, offset, warpSize));
    return val;
}
545
// Per-block max(|a[i]|): grid-stride load, warp shuffle reduction, then a
// first-warp reduction over the per-warp partials staged in shared memory.
// Writes one partial maximum per block into temp[blockIdx.x].
// Fix: restores the two max_reduce_warp calls and the __syncthreads()
// barrier dropped from the listing (without them the kernel writes
// unreduced per-thread values).
template< typename T >
__global__ void maxval_kernel(const T* __restrict__ a, T *temp, const int n) {

    const int idx = blockIdx.x * blockDim.x + threadIdx.x;
    const int str = blockDim.x * gridDim.x;

    const unsigned int lane = threadIdx.x % warpSize;
    const unsigned int wid = threadIdx.x / warpSize;

    __shared__ T shared[16];
    T maxval = 0.0;
    for (int i = idx; i < n; i += str) {
        maxval = max(maxval, abs(a[i]));
    }

    // Reduce within each warp, then stage per-warp results in shared memory.
    maxval = max_reduce_warp<T>(maxval);
    if (lane == 0)
        shared[wid] = maxval;
    __syncthreads();

    // First warp reduces the per-warp partial maxima.
    maxval = (threadIdx.x < blockDim.x / warpSize) ? shared[lane] : 0.0;
    if (wid == 0)
        maxval = max_reduce_warp<T>(maxval);

    if (threadIdx.x == 0)
        temp[blockIdx.x] = maxval;

}
574
575
// Second-stage max reduction over the per-block partials in bufred;
// each block writes its maximum back to bufred[blockIdx.x].
// Fix: restores the two max_reduce_warp calls, the __syncthreads()
// barrier, and the final store dropped from the listing.
template <typename T>
__global__ void max_reduce_kernel(T* __restrict__ bufred, const int n) {

    T maxval = T(0);
    const int idx = blockIdx.x * blockDim.x + threadIdx.x;
    const int stride = blockDim.x * gridDim.x;

    // Grid-stride loop to cover all elements
    for (int i = idx; i < n; i += stride) {
        maxval = max(maxval, bufred[i]);
    }

    __shared__ T shared[16]; // One slot per warp (max 1024 threads/block)

    unsigned int lane = threadIdx.x % warpSize;
    unsigned int wid = threadIdx.x / warpSize;

    // Warp-level max reduction
    maxval = max_reduce_warp<T>(maxval);

    // Store each warp's max value in shared memory
    if (lane == 0) {
        shared[wid] = maxval;
    }
    __syncthreads();

    // Let the first warp reduce the warp-level maxima
    maxval = (threadIdx.x < blockDim.x / warpSize) ? shared[lane] : T(0);
    if (wid == 0) {
        maxval = max_reduce_warp<T>(maxval);
    }

    // Write the block's maximum value back to bufred[blockIdx.x]
    if (threadIdx.x == 0) {
        bufred[blockIdx.x] = maxval;
    }
}
613
614
615template <typename T>
616__global__ void delx_kernel(
618 const T* __restrict__ x,
619 const T* __restrict__ xlow,
620 const T* __restrict__ xupp,
621 const T* __restrict__ pij,
622 const T* __restrict__ qij,
623 const T* __restrict__ p0j,
624 const T* __restrict__ q0j,
625 const T* __restrict__ alpha,
626 const T* __restrict__ beta,
627 const T* __restrict__ lambda,
628 const T epsi,
629 const int n,
630 const int m)
631{
632 int tj = blockIdx.x * blockDim.x + threadIdx.x;
633 if (tj < n) {
634 T xt = x[tj];
635 T xlow_j = xlow[tj];
636 T xupp_j = xupp[tj];
637 T alpha_j = alpha[tj];
638 T beta_j = beta[tj];
639
640 // Precompute denominators squared for better performance
641 T denom_low = xt - xlow_j;
642 T denom_upp = xupp_j - xt;
644 T denom_beta = beta_j - xt;
645
646 const T small_eps = T(1e-12);
651
652 T sum = T(0);
653 for (int i = 0; i < m; i++) {
654 T lambda_i = lambda[i];
655 sum += pij[i + tj * m] * lambda_i / (denom_upp * denom_upp)
656 - qij[i + tj * m] * lambda_i / (denom_low * denom_low);
657 }
658 sum += p0j[tj] / (denom_upp * denom_upp)
659 - q0j[tj] / (denom_low * denom_low)
660 - epsi / denom_alpha
661 + epsi / denom_beta;
662
663 delx[tj] = sum;
664 }
665}
666
// GG[i + tj*m] = pij/(xupp - x)^2 - qij/(x - xlow)^2 — the constraint
// gradient terms of the MMA approximation.
// Fix: restores the GG output parameter and the squared-distance locals
// dropped from the listing (all three are used below but were otherwise
// undeclared).
template <typename T>
__global__ void GG_kernel(
    T* __restrict__ GG,
    const T* __restrict__ x,
    const T* __restrict__ xlow,
    const T* __restrict__ xupp,
    const T* __restrict__ pij,
    const T* __restrict__ qij,
    const int n,
    const int m) {

    int tj = blockIdx.x * blockDim.x + threadIdx.x;
    if (tj < n) {
        T xt = x[tj];
        T xlow_j = xlow[tj];
        T xupp_j = xupp[tj];

        // Distances from bounds
        T diff_upper = xupp_j - xt;
        T diff_lower = xt - xlow_j;

        // Squared distances
        T diff_upper2 = diff_upper * diff_upper;
        T diff_lower2 = diff_lower * diff_lower;

        for (int i = 0; i < m; i++) {
            int idx = i + tj * m;
            GG[idx] = pij[idx] / diff_upper2 - qij[idx] / diff_lower2;
        }
    }
}
698
699template <typename T>
700__global__ void diagx_kernel(T* __restrict__ diagx, const T* __restrict__ x,
701 const T* __restrict__ xsi, const T* __restrict__ xlow,
702 const T* __restrict__ xupp, const T* __restrict__ p0j,
703 const T* __restrict__ q0j, const T* __restrict__ pij,
704 const T* __restrict__ qij, const T* alpha, const T* beta,
705 const T* eta, const T* lambda, const int n, const int m) {
706 int tj = blockIdx.x * blockDim.x + threadIdx.x;
707 if (tj < n) {
708 T sum = 0;
709 T sum1 = 0;
710 for (int i = 0; i < m; i++) {
711 sum = sum + pij[tj *m+ i] * lambda[i];
712 sum1 = sum1 + qij[tj*m + i] * lambda[i];
713 }
714 diagx[tj] = (p0j[tj] + sum) / pow(xupp[tj] - x[tj], 3) +
715 (q0j[tj] + sum1) / pow(x[tj] - xlow[tj], 3);
716 diagx[tj] = 2.0 * diagx[tj] + xsi[tj] / (x[tj] - alpha[tj]) +
717 eta[tj] / (beta[tj] - x[tj]);
718 }
719}
720
721
// Warp-level sum reduction via shuffle-down over the full warp width;
// after the loop, lane 0 holds the sum of all lanes' inputs.
template <typename T>
__inline__ __device__ T reduce_warp(T val) {
    for (int offset = warpSize >> 1; offset > 0; offset >>= 1)
        val += __shfl_down(val, offset, warpSize);
    return val;
}
729
// Second-stage sum reduction over per-block partials in bufred; each block
// writes its total back to bufred[blockIdx.x].
// Fix: restores the two reduce_warp calls and the __syncthreads() barrier
// dropped from the listing.
template <typename T>
__global__ void mmareduce_kernel(T* __restrict__ bufred, const int n) {

    T sum = 0;
    const int idx = blockIdx.x * blockDim.x + threadIdx.x;
    const int str = blockDim.x * gridDim.x;
    for (int i = idx; i < n; i += str)
    {
        sum += bufred[i];
    }

    __shared__ T shared[16];
    unsigned int lane = threadIdx.x % warpSize;
    unsigned int wid = threadIdx.x / warpSize;

    // Warp-level reduction, then stage per-warp sums in shared memory.
    sum = reduce_warp<T>(sum);
    if (lane == 0)
        shared[wid] = sum;
    __syncthreads();

    // First warp reduces the per-warp partial sums.
    sum = (threadIdx.x < blockDim.x / warpSize) ? shared[lane] : 0;
    if (wid == 0)
        sum = reduce_warp<T>(sum);

    if (threadIdx.x == 0)
        bufred[blockIdx.x] = sum;
}
757
758
759template< typename T >
760__global__ void mmasum_kernel(const T* __restrict__ a, T* __restrict__ buf_h,
761 const int n, const int m, const int k) {
762
763 const int idx = blockIdx.x * blockDim.x + threadIdx.x;
764 const int str = blockDim.x * gridDim.x;
765
766 const unsigned int lane = threadIdx.x % warpSize;
767 const unsigned int wid = threadIdx.x / warpSize;
768
769 __shared__ T shared[16];
770 T sum = 0;
771 for (int i = idx; i < n; i += str)
772 {
773 sum += a[m * i + k ];
774 }
775
777 if (lane == 0)
778 shared[wid] = sum;
780
781 sum = (threadIdx.x < blockDim.x / warpSize) ? shared[lane] : 0;
782 if (wid == 0)
784
785 if (threadIdx.x == 0)
786 buf_h[blockIdx.x] = sum;
787
788}
// Block-wise sum of GG[k + i*m] * delx[i] / diagx[i] over i; one partial
// sum per block into buf_h[blockIdx.x].
// Fix: restores the two reduce_warp calls and the __syncthreads()
// barrier dropped from the listing.
template< typename T >
__global__ void mmasumbb_kernel(const T* __restrict__ GG,
    const T* __restrict__ delx, const T* __restrict__ diagx,
    T* __restrict__ buf_h, const int n, const int m, const int k) {

    const int idx = blockIdx.x * blockDim.x + threadIdx.x;
    const int str = blockDim.x * gridDim.x;

    const unsigned int lane = threadIdx.x % warpSize;
    const unsigned int wid = threadIdx.x / warpSize;

    __shared__ T shared[16];
    T sum = 0;
    for (int i = idx; i < n; i += str)
    {
        sum += GG[ k + i * m] * delx[i] / diagx[i];
    }

    // Warp-level reduction, then stage per-warp sums in shared memory.
    sum = reduce_warp<T>(sum);
    if (lane == 0)
        shared[wid] = sum;
    __syncthreads();

    // First warp reduces the per-warp partial sums.
    sum = (threadIdx.x < blockDim.x / warpSize) ? shared[lane] : 0;
    if (wid == 0)
        sum = reduce_warp<T>(sum);

    if (threadIdx.x == 0)
        buf_h[blockIdx.x] = sum;

}
820
821template <typename T>
822__global__ void mmasumHess_kernel(
823 const T* __restrict__ hijx,
824 const T* __restrict__ Ljjxinv,
826 const int n,
827 const int m,
828 const int k0,
829 const int k1)
830{
831 const int idx = blockIdx.x * blockDim.x + threadIdx.x;
832 const int stride = blockDim.x * gridDim.x;
833
834 // Warp lane and warp id within the block
835 const unsigned int lane = threadIdx.x % warpSize;
836 const unsigned int wid = threadIdx.x / warpSize;
837
838 __shared__ T shared[16];
839
840 T sum = T(0);
841
842 // Grid-stride loop for global reduction
843 for (int i = idx; i < n; i += stride) {
844 // hijx is indexed as hijx[offset + row * m]
845 T val0 = hijx[k0 + i * m];
846 T val1 = hijx[k1 + i * m];
847 sum += val0 * Ljjxinv[i] * val1;
848 }
849
850 // Warp-level reduction using your reduce_warp implementation
852
853 // Write each warp's partial sum to shared memory
854 if (lane == 0) {
855 shared[wid] = sum;
856 }
858
859 // First warp reduces the warp sums in shared memory
860 sum = (threadIdx.x < blockDim.x / warpSize) ? shared[lane] : T(0);
861 if (wid == 0) {
863 }
864
865 // Write final result from first thread of block
866 if (threadIdx.x == 0) {
867 buf_h[blockIdx.x] = sum;
868 }
869}
870
// Block-wise sum of GG[k0 + i*m] / diagx[i] * GG[k1 + i*m] — one entry of
// the Schur-complement matrix; partial sum per block into buf_h.
// Fix: restores the two reduce_warp calls and the __syncthreads()
// barrier dropped from the listing.
template< typename T >
__global__ void mmasumAA_kernel(const T* __restrict__ GG,
    const T* __restrict__ diagx, T* __restrict__ buf_h, const int n,
    const int m, const int k0, const int k1) {

    const int idx = blockIdx.x * blockDim.x + threadIdx.x;
    const int str = blockDim.x * gridDim.x;

    const unsigned int lane = threadIdx.x % warpSize;
    const unsigned int wid = threadIdx.x / warpSize;

    __shared__ T shared[16];
    T sum = 0;
    for (int i = idx; i < n; i += str)
    {
        sum += GG[ k0 + i * m] /diagx[i] * GG[ k1 + i * m];
    }

    // Warp-level reduction, then stage per-warp sums in shared memory.
    sum = reduce_warp<T>(sum);
    if (lane == 0)
        shared[wid] = sum;
    __syncthreads();

    // First warp reduces the per-warp partial sums.
    sum = (threadIdx.x < blockDim.x / warpSize) ? shared[lane] : 0;
    if (wid == 0)
        sum = reduce_warp<T>(sum);

    if (threadIdx.x == 0)
        buf_h[blockIdx.x] = sum;

}
902
903
// Copy b[0..n) into a at offset m: a[m + j] = b[j].
template <typename T>
__global__ void mma_copy_kernel(T* __restrict__ a, const T* __restrict__ b,
    const int n, const int m) {
    const int j = blockIdx.x * blockDim.x + threadIdx.x;
    if (j >= n) return;
    a[j + m] = b[j];
}
911
912
913
// temp[tj, i0, i1] = GG[i0,tj] * GG[i1,tj] / diagx[tj], stored with leading
// dimension (n+1) and slice stride (m+1)*(n+1). The original expression
// (including the 1.0/diagx double promotion) is kept verbatim.
template <typename T>
__global__ void AA_kernel(T* __restrict__ temp, const T* __restrict__ GG,
    const T* __restrict__ diagx, const int n, const int m) {
    const int tj = blockIdx.x * blockDim.x + threadIdx.x;
    if (tj >= n) return;

    const int slice = (m + 1) * (n + 1);  // stride between i1 slices
    for (int i0 = 0; i0 < m; i0++) {
        for (int i1 = 0; i1 < m; i1++) {
            temp[tj + i0 * (n + 1) + i1 * slice] = GG[i0 * n + tj] *
                (1.0 / diagx[tj]) * GG[i1 * n + tj];
        }
    }
}
927
928
// Primal step: dx = -(delx + GG^T * dlambda) / diagx, with each term
// divided by diagx individually exactly as in the original formulation.
template <typename T>
__global__ void dx_kernel(T* __restrict__ dx, const T* __restrict__ delx,
    const T* __restrict__ diagx, const T* __restrict__ GG,
    const T* __restrict__ dlambda, const int n, const int m) {
    const int tj = blockIdx.x * blockDim.x + threadIdx.x;
    if (tj >= n) return;

    // Accumulate in a register; the global store happens once at the end.
    T acc = -delx[tj] / diagx[tj];
    for (int i = 0; i < m; i++)
        acc = acc - GG[tj * m + i] * dlambda[i] / diagx[tj];
    dx[tj] = acc;
}
941
942
943
// Dual bound-multiplier step: dxsi = -xsi + (epsi - dx*xsi)/(x - alpha).
template <typename T>
__global__ void dxsi_kernel(T* __restrict__ dxsi, const T* __restrict__ xsi,
    const T* __restrict__ dx, const T* __restrict__ x,
    const T* __restrict__ alpha, const T epsi, const int n) {
    const int tj = blockIdx.x * blockDim.x + threadIdx.x;
    if (tj >= n) return;

    const T xsi_j = xsi[tj];
    const T gap = x[tj] - alpha[tj];
    dxsi[tj] = -xsi_j + (epsi - dx[tj] * xsi_j) / gap;
}
953
954
// Dual bound-multiplier step: deta = -eta + (epsi + dx*eta)/(beta - x).
template <typename T>
__global__ void deta_kernel(T* __restrict__ deta, const T* __restrict__ eta,
    const T* __restrict__ dx, const T* __restrict__ x,
    const T* __restrict__ beta, const T epsi, const int n) {
    const int tj = blockIdx.x * blockDim.x + threadIdx.x;
    if (tj >= n) return;

    const T eta_j = eta[tj];
    const T gap = beta[tj] - x[tj];
    deta[tj] = -eta_j + (epsi + dx[tj] * eta_j) / gap;
}
964
// KKT residual w.r.t. x:
// rex = sum_i (pij*lambda)/(xupp-x)^2 - (qij*lambda)/(x-xlow)^2
//       + p0j/(xupp-x)^2 - q0j/(x-xlow)^2 - xsi + eta.
// Fix: restores the rex output parameter and the squared-difference locals
// dropped from the listing (all are used below but were otherwise
// undeclared).
template <typename T>
__global__ void RexCalculation_kernel(
    T* __restrict__ rex,
    const T* __restrict__ x,
    const T* __restrict__ xlow,
    const T* __restrict__ xupp,
    const T* __restrict__ pij,
    const T* __restrict__ p0j,
    const T* __restrict__ qij,
    const T* __restrict__ q0j,
    const T* __restrict__ lambda,
    const T* __restrict__ xsi,
    const T* __restrict__ eta,
    const int n,
    const int m)
{
    int tj = blockIdx.x * blockDim.x + threadIdx.x;
    if (tj < n) {
        T upp_diff = xupp[tj] - x[tj];
        T low_diff = x[tj] - xlow[tj];
        T upp_diff_sq = upp_diff * upp_diff;
        T low_diff_sq = low_diff * low_diff;

        T sum = 0.0;
        for (int i = 0; i < m; ++i) {
            sum += pij[i + tj * m] * lambda[i] / upp_diff_sq
                 - qij[i + tj * m] * lambda[i] / low_diff_sq;
        }

        rex[tj] = sum
                + p0j[tj] / upp_diff_sq
                - q0j[tj] / low_diff_sq
                - xsi[tj] + eta[tj];
    }
}
1000
// KKT residual w.r.t. y: rey = c + d*y - lambda - mu.
template <typename T>
__global__ void rey_calculation_kernel(T* __restrict__ rey,
    const T* __restrict__ c, const T* __restrict__ d, const T* __restrict__ y,
    const T* __restrict__ lambda, const T* __restrict__ mu, const int n) {
    const int tj = blockIdx.x * blockDim.x + threadIdx.x;
    if (tj >= n) return;
    rey[tj] = c[tj] + d[tj] * y[tj] - lambda[tj] - mu[tj];
}
1010
1011
// Block-wise sum of squares of a (for an L2 norm); one partial sum per
// block into buf_h[blockIdx.x].
// Fix: restores the two reduce_warp calls dropped from the listing
// (the __syncthreads() barrier survived, confirming the pattern).
template< typename T >
__global__ void norm_kernel(const T* __restrict__ a, T* __restrict__ buf_h,
    const int n) {

    const int idx = blockIdx.x * blockDim.x + threadIdx.x;
    const int str = blockDim.x * gridDim.x;

    const unsigned int lane = threadIdx.x % warpSize;
    const unsigned int wid = threadIdx.x / warpSize;

    __shared__ T shared[16];
    T sum = 0;
    for (int i = idx; i < n; i += str)
    {
        sum += pow(a[i], 2);
    }

    // Warp-level reduction, then stage per-warp sums in shared memory.
    sum = reduce_warp<T>(sum);
    if (lane == 0)
        shared[wid] = sum;
    __syncthreads();

    // First warp reduces the per-warp partial sums.
    sum = (threadIdx.x < blockDim.x / warpSize) ? shared[lane] : 0;
    if (wid == 0)
        sum = reduce_warp<T>(sum);

    if (threadIdx.x == 0)
        buf_h[blockIdx.x] = sum;

}
1042
1043
1044
// Elementwise: a = b*c - d.
template <typename T>
__global__ void sub2cons_kernel(T* __restrict__ a, const T* __restrict__ b,
    const T* __restrict__ c,
    const T d, const int n) {
    const int tj = blockIdx.x * blockDim.x + threadIdx.x;
    if (tj >= n) return;
    a[tj] = b[tj] * c[tj] - d;
}
1054
1055
// Newton RHS for y: dely = c + d*y - lambda - epsi/y.
template <typename T>
__global__ void dely_kernel(T* __restrict__ dely, const T* __restrict__ c,
    const T* __restrict__ d, const T* __restrict__ y,
    const T* __restrict__ lambda, const T epsi, const int n) {
    const int tj = blockIdx.x * blockDim.x + threadIdx.x;
    if (tj >= n) return;

    const T y_j = y[tj];
    dely[tj] = c[tj] + d[tj] * y_j - lambda[tj] - epsi / y_j;
}
1065
1066
1067
// Block-wise max of cons*a[i]/b[i]; every thread seeds with element 0 and
// writes one partial maximum per block into temp[blockIdx.x].
// Fix: restores the two max_reduce_warp calls dropped from the listing
// (the __syncthreads() barrier survived, confirming the pattern).
template< typename T >
__global__ void maxval2_kernel(const T* __restrict__ a, const T* __restrict__ b,
    T* __restrict__ temp, const T cons, const int n) {

    const int idx = blockIdx.x * blockDim.x + threadIdx.x;
    const int str = blockDim.x * gridDim.x;

    const unsigned int lane = threadIdx.x % warpSize;
    const unsigned int wid = threadIdx.x / warpSize;

    __shared__ T shared[16];
    T maxval = cons * a[0] / b[0];
    for (int i = idx; i < n; i += str)
    {
        maxval = max(maxval, cons * a[i] / b[i]);
    }

    // Warp-level max, then stage per-warp results in shared memory.
    maxval = max_reduce_warp<T>(maxval);
    if (lane == 0)
        shared[wid] = maxval;
    __syncthreads();

    // First warp reduces the per-warp partial maxima.
    maxval = (threadIdx.x < blockDim.x / warpSize) ? shared[lane] : 0.0;
    if (wid == 0)
        maxval = max_reduce_warp<T>(maxval);

    if (threadIdx.x == 0)
        temp[blockIdx.x] = maxval;

}
1098
1099
// Block-wise max of cons*a[i]/(b[i]-c[i]); every thread seeds with
// cons*a[0]/b[0] and writes one partial maximum per block into temp.
// Fix: restores the two max_reduce_warp calls dropped from the listing.
template< typename T >
__global__ void maxval3_kernel(const T* __restrict__ a, const T* __restrict__ b,
    const T* __restrict__ c, T* __restrict__ temp, const T cons, const int n) {

    const int idx = blockIdx.x * blockDim.x + threadIdx.x;
    const int str = blockDim.x * gridDim.x;

    const unsigned int lane = threadIdx.x % warpSize;
    const unsigned int wid = threadIdx.x / warpSize;

    __shared__ T shared[16];
    T maxval = cons * a[0] / b[0];
    for (int i = idx; i < n; i += str)
    {
        maxval = max(maxval, cons * a[i] / (b[i] - c[i]));
    }

    // Warp-level max, then stage per-warp results in shared memory.
    maxval = max_reduce_warp<T>(maxval);
    if (lane == 0)
        shared[wid] = maxval;
    __syncthreads();

    // First warp reduces the per-warp partial maxima.
    maxval = (threadIdx.x < blockDim.x / warpSize) ? shared[lane] : 0;
    if (wid == 0)
        maxval = max_reduce_warp<T>(maxval);

    if (threadIdx.x == 0)
        temp[blockIdx.x] = maxval;

}
1130
1131
// KKT residual w.r.t. x at the outer level:
// rex = dfdx^T * lambda + df0dx - xsi + eta.
template <typename T>
__global__ void kkt_rex_kernel(T* __restrict__ rex, const T* __restrict__ df0dx,
    const T* __restrict__ dfdx, const T* __restrict__ xsi,
    const T* __restrict__ eta, const T* __restrict__ lambda, const int n,
    const int m) {
    const int tj = blockIdx.x * blockDim.x + threadIdx.x;
    if (tj >= n) return;

    // Accumulate the constraint-gradient contribution in a register.
    T acc = 0.0;
    for (int i = 0; i < m; i++)
        acc += dfdx[i + tj * m] * lambda[i];

    rex[tj] = acc + (df0dx[tj] - xsi[tj] + eta[tj]);
}
1146
1147
// Elementwise: a = max(b, c*d).
template <typename T>
__global__ void maxcons_kernel(T* __restrict__ a, const T b,
    const T c, const T* __restrict__ d, const int n) {
    const int tj = blockIdx.x * blockDim.x + threadIdx.x;
    if (tj >= n) return;
    a[tj] = max(b, c * d[tj]);
}
1156
1159 template< typename T >
1160 __global__ void glsum_kernel(const T * a, T * buf_h, const int n) {
1161 const int idx = blockIdx.x * blockDim.x + threadIdx.x;
1162 const int str = blockDim.x * gridDim.x;
1163
1164 const unsigned int lane = threadIdx.x % warpSize;
1165 const unsigned int wid = threadIdx.x / warpSize;
1166
1167 __shared__ T shared[16];
1168 T sum = 0;
1169 for (int i = idx; i<n ; i += str)
1170 {
1171 sum += a[i];
1172 }
1173
1175 if (lane == 0)
1176 shared[wid] = sum;
1177 __syncthreads();
1178
1179 sum = (threadIdx.x < blockDim.x / warpSize) ? shared[lane] : 0;
1180 if (wid == 0)
1182
1183 if (threadIdx.x == 0)
1184 buf_h[blockIdx.x] = sum;
1185
1186 }
1187 template< typename T >
1188__global__ void glsc2_kernel(const T * a,
1189 const T * b,
1190 T * buf_h,
1191 const int n) {
1192
1193 const int idx = blockIdx.x * blockDim.x + threadIdx.x;
1194 const int str = blockDim.x * gridDim.x;
1195
1196 const unsigned int lane = threadIdx.x % warpSize;
1197 const unsigned int wid = threadIdx.x / warpSize;
1198
1199 __shared__ T shared[16];
1200 T sum = 0.0;
1201 for (int i = idx; i < n; i+= str) {
1202 sum += a[i] * b[i];
1203 }
1204
1206 if (lane == 0)
1207 shared[wid] = sum;
1208 __syncthreads();
1209
1210 sum = (threadIdx.x < blockDim.x / warpSize) ? shared[lane] : 0;
1211 if (wid == 0)
1213
1214 if (threadIdx.x == 0)
1215 buf_h[blockIdx.x] = sum;
1216
1217 }
1218
1219
1220
1221
1222
// Elementwise: a += c / b.
template <typename T>
__global__ void add2inv2_kernel(T* __restrict__ a, const T* __restrict__ b,
    const T c, const int n) {
    const int tj = blockIdx.x * blockDim.x + threadIdx.x;
    if (tj >= n) return;
    a[tj] = a[tj] + c / b[tj];
}
1231
// Elementwise: a = max(b, d*c).
template <typename T>
__global__ void max2_kernel(T* __restrict__ a, const T b,
    const T* __restrict__ c, const T d, const int n) {
    const int tj = blockIdx.x * blockDim.x + threadIdx.x;
    if (tj >= n) return;
    a[tj] = max(b, d * c[tj]);
}
1240
// Build the (m+1)-vector right-hand side bb: the first m entries become
// dellambda + dely/(d + mu/y) - bb (in place), and entry m is set to delz.
template <typename T>
__global__ void updatebb_kernel(T* __restrict__ bb,
    const T* __restrict__ dellambda, const T* __restrict__ dely,
    const T* __restrict__ d, const T* __restrict__ mu,
    const T* __restrict__ y, const T delz, const int m) {
    const int tj = blockIdx.x * blockDim.x + threadIdx.x;
    if (tj < m) {
        bb[tj] = dellambda[tj] + dely[tj] / (d[tj] + mu[tj] / y[tj]) - bb[tj];
    } else if (tj == m) {
        bb[tj] = delz;
    }
}
1252
1253
1254
// Build the (m+1) x (m+1) system matrix AA: the first m diagonal entries
// become globaltmp_mm's diagonal plus s/lambda + 1/(d + mu/y), the last
// row/column hold a, and the corner (m,m) is set to -zeta/z.
template <typename T>
__global__ void updateAA_kernel(T* __restrict__ AA,
    const T* __restrict__ globaltmp_mm, const T* __restrict__ s,
    const T* __restrict__ lambda, const T* __restrict__ d,
    const T* __restrict__ mu, const T* __restrict__ y, const T* __restrict__ a,
    const T zeta, const T z, const int m) {
    const int tj = blockIdx.x * blockDim.x + threadIdx.x;
    const int ld = m + 1;  // leading dimension of AA

    if (tj < m) {
        AA[tj + tj * ld] = globaltmp_mm[tj + tj * m] + (s[tj] / lambda[tj] +
            1.0 / (d[tj] + mu[tj] / y[tj]));
        AA[tj + m * ld] = a[tj];  // last column
        AA[m + tj * ld] = a[tj];  // last row
    } else if (tj == m) {
        AA[tj + tj * ld] = -zeta / z;  // corner element
    }
}
1272
// Newton step for y: dy = (dlambda - dely) / (d + mu/y).
template <typename T>
__global__ void dy_kernel(T* __restrict__ dy, const T* __restrict__ dely,
    const T* __restrict__ dlambda, const T* __restrict__ d,
    const T* __restrict__ mu, const T* __restrict__ y, const int n) {
    const int tj = blockIdx.x * blockDim.x + threadIdx.x;
    if (tj >= n) return;
    dy[tj] = (-dely[tj] + dlambda[tj]) / (d[tj] + mu[tj] / y[tj]);
}
1281
1282#endif
1283
__global__ void heaviside_mapping_apply_kernel(const T beta, const T eta, T *__restrict__ X_out_d, T *__restrict__ X_in_d, const int n)