Neko-TOP
A portable framework for high-order spectral element flow topology optimization.
mma_kernel.h
1
37#ifndef MMA_HIP_KERNEL_H
38#define MMA_HIP_KERNEL_H
39
40
41template<typename T>
42__global__ void mma_prepare_aa_matrix_kernel(T* __restrict__ AA,
43 const T* __restrict__ s, const T* __restrict__ lambda,
44 const T* __restrict__ d, const T* __restrict__ mu,
45 const T* __restrict__ y, const T* __restrict__ a,
46 const T zeta, const T z, const int m) {
47 const int tj = blockIdx.x * blockDim.x + threadIdx.x;
48 const int matrix_size = m + 1;
49
50 if (tj >= m) return;
51 AA[tj * matrix_size + tj] += s[tj] / lambda[tj] +
52 (T)1.0 / (d[tj] + mu[tj] / y[tj]);
53 AA[tj * matrix_size + m] = a[tj]; // column m+1
54 AA[m * matrix_size + tj] = a[tj];// row m+1
55
56 // Only first thread updates the bottom-right corner element.
57 if (tj == 0)
58 AA[m * matrix_size + m] = -zeta / z;
59}
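// --- Illustrative usage sketch (not from the Neko-TOP sources): how this kernel might be
// launched. One thread per constraint row is enough, since each thread updates row/column
// tj of the (m+1)x(m+1) matrix AA. The helper name, the 256-thread block size and the
// "_d" device-pointer suffix are assumptions for this example only; the HIP/CUDA runtime
// header is assumed to be included by the translation unit.
template <typename T>
inline void example_launch_prepare_aa(T* AA_d, const T* s_d, const T* lambda_d,
                                      const T* d_d, const T* mu_d, const T* y_d,
                                      const T* a_d, const T zeta, const T z, const int m) {
    const int nthreads = 256;
    const int nblocks = (m + nthreads - 1) / nthreads;
    // AA_d is assumed to be pre-filled by the caller; the kernel adds to the diagonal and
    // writes the border row/column and the bottom-right corner.
    mma_prepare_aa_matrix_kernel<T><<<nblocks, nthreads>>>(AA_d, s_d, lambda_d, d_d,
                                                           mu_d, y_d, a_d, zeta, z, m);
}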
60
61
62// Update Hessian diagonal elements (y contributions for the dip subsolve)
63template<typename T>
64__global__ void mma_update_hessian_diagonal_kernel(T* __restrict__ Hess,
65 const T* __restrict__ y, const T* __restrict__ d,
66 const T* __restrict__ mu, const T* __restrict__ lambda, const int m) {
67 int tj = blockIdx.x * blockDim.x + threadIdx.x;
68 if (tj >= m) return;
69
70 T diag = Hess[tj * m + tj];
71 // Contribution from y terms (inactive constraints)
72 if (y[tj] > (T)0.0) {
73 if (fabs(d[tj]) >= (T)1.0e-15) {
74 diag -= (T)1.0 / d[tj];
75 }
76             // else: |d| is essentially zero; skip the 1/d contribution (rather than subtracting 1.0/1.0e-8)
77 }
78
79 // Contribution from -mu/lambda (eq 10)
80 diag -= mu[tj] / lambda[tj];
81 Hess[tj * m + tj] = diag;
82}
83
84// Heuristic Levenberg-Marquardt-style diagonal stabilization of the Hessian
85// Single-block version for m <= 1024
86template<typename T>
87__global__ void mma_stabilize_hessian_single_kernel(T* __restrict__ Hess, const int m) {
88 const int tid = threadIdx.x;
89
90 // Single thread computes trace and LM factor
91 if (tid == 0) {
92 T trace = (T)0.0;
93 for (int j = 0; j < m; j++) {
94 trace += Hess[j * m + j];
95 }
96 T lm_factor = max((T)(-1.0e-4) * trace / m, (T)1.0e-7);
97
98 // Apply to all diagonal elements
99 for (int j = 0; j < m; j++) {
100 Hess[j * m + j] -= lm_factor;
101 }
102 }
103}
104
105// Heuristic Levenberg-Marquardt-style diagonal stabilization of the Hessian
106// Multi-block version for m > 1024
107template<typename T>
108__global__ void mma_stabilize_hessian_multi_kernel(T* __restrict__ Hess, const T lm_factor, const int m) {
109 const int i = blockIdx.x * blockDim.x + threadIdx.x;
110 if (i < m) {
111 Hess[i * m + i] -= lm_factor;
112 }
113}
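// --- Illustrative dispatch sketch (not from the Neko-TOP sources): choosing between the
// single-block and multi-block stabilization kernels. The single-block kernel computes the
// trace-based Levenberg-Marquardt factor itself, while the multi-block kernel expects the
// factor to be supplied. The helper name, the host-side trace argument, the m <= 1024
// switch and the block size are assumptions for this example only.
template <typename T>
inline void example_stabilize_hessian(T* Hess_d, const T trace_h, const int m) {
    if (m <= 1024) {
        // Only thread 0 does work inside the single-block kernel, so one thread suffices.
        mma_stabilize_hessian_single_kernel<T><<<1, 1>>>(Hess_d, m);
    } else {
        // Same heuristic factor as the single-block kernel, evaluated on the host.
        T lm_factor = T(-1.0e-4) * trace_h / m;
        if (lm_factor < T(1.0e-7)) lm_factor = T(1.0e-7);
        const int nthreads = 256;
        const int nblocks = (m + nthreads - 1) / nthreads;
        mma_stabilize_hessian_multi_kernel<T><<<nblocks, nthreads>>>(Hess_d, lm_factor, m);
    }
}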
114
115// Small dense LU solver with partial pivoting, efficient for n <= 100
116template<typename T>
117__global__ void mma_small_lu_kernel(T* __restrict__ A, T* __restrict__ b, const int n) {
118 const int tid = threadIdx.x;
119
120 // Handle 1x1 case
121 if (n == 1) {
122 if (tid == 0 && abs(A[0]) > (T)1e-12) b[0] /= A[0];
123 return;
124 }
125
126 // LU decomposition with partial pivoting
127 for (int k = 0; k < n; k++) {
128 // Pivoting - single thread
129 if (tid == 0) {
130 int max_row = k;
131 T max_val = abs(A[k * n + k]);
132 for (int i = k + 1; i < n; i++) {
133 T val = abs(A[i * n + k]);
134 if (val > max_val) {
135 max_val = val;
136 max_row = i;
137 }
138 }
139 if (max_val > (T)1e-12 && max_row != k) {
140 // Swap rows
141 for (int j = k; j < n; j++) {
142 T temp = A[k * n + j];
143 A[k * n + j] = A[max_row * n + j];
144 A[max_row * n + j] = temp;
145 }
146 // Swap rhs
147 T temp_b = b[k];
148 b[k] = b[max_row];
149 b[max_row] = temp_b;
150 }
151 }
152       __syncthreads();
153
154 // Parallel elimination
155 T diag = A[k * n + k];
156 if (abs(diag) > (T)1e-12) {
157 for (int i = tid + k + 1; i < n; i += blockDim.x) {
158 T factor = A[i * n + k] / diag;
159 A[i * n + k] = factor;
160 for (int j = k + 1; j < n; j++) {
161 A[i * n + j] -= factor * A[k * n + j];
162 }
163 }
164 }
165       __syncthreads();
166 }
167
168 // Parallel forward substitution
169 for (int i = tid; i < n; i += blockDim.x) {
170 T sum = b[i];
171 for (int j = 0; j < i; j++) {
172 sum -= A[i * n + j] * b[j];
173 }
174 b[i] = sum;
175 }
176   __syncthreads();
177
178 // Parallel backward substitution
179 for (int i = n - 1 - tid; i >= 0; i -= blockDim.x) {
180 if (i >= 0) {
181 T sum = b[i];
182 for (int j = i + 1; j < n; j++) {
183 sum -= A[i * n + j] * b[j];
184 }
185 if (abs(A[i * n + i]) > (T)1e-12) {
186 b[i] = sum / A[i * n + i];
187 }
188 }
189 }
191}
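// --- Illustrative usage sketch (not from the Neko-TOP sources): mma_small_lu_kernel
// cooperates through __syncthreads(), so it must be launched as a single block; it
// factorizes A in place and overwrites b with the solution. The helper name and the
// 64-thread block size are assumptions for this example only.
template <typename T>
inline void example_solve_dense(T* A_d, T* b_d, const int n) {
    // One block only: pivoting, elimination and the substitutions all assume that every
    // participating thread belongs to the same block.
    mma_small_lu_kernel<T><<<1, 64>>>(A_d, b_d, n);
}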
192
193
194template <typename T>
195__global__ void delta_1dbeam_kernel(T* __restrict__ Delta,
196 const T L_total, const T Le, const int offset, const int n) {
197 int k = blockIdx.x * blockDim.x + threadIdx.x;
198 if (k >= n) return;
199
200 // Convert to 1-based indexing for the calculation
201 int idx = k + 1;
202
203 // Calculate first term: (L_total - Le*(offset+idx-1))^3
204 T x1 = L_total - Le * static_cast<T>(offset + idx - 1);
205 T term1 = x1 * x1 * x1;
206
207 // Calculate second term: (L_total - Le*(offset+idx))^3
208 T x2 = L_total - Le * static_cast<T>(offset + idx);
209 T term2 = x2 * x2 * x2;
210
211 // Final result
212 Delta[k] = (term1 - term2) / static_cast<T>(3.0);
213}
214
215template <typename T>
216__global__ void mma_Ljjxinv_kernel(T* __restrict__ Ljjxinv,
217     const T* __restrict__ pjlambda, const T* __restrict__ qjlambda,
218 const T* __restrict__ x, const T* __restrict__ low, const T* __restrict__ upp,
219 const T* __restrict__ alpha, const T* __restrict__ beta,
220 const int n) {
221 int tj = blockIdx.x * blockDim.x + threadIdx.x;
222 if (tj >= n) return;
223
224 // Load inputs into registers
225 const T xt = x[tj];
226 const T low_j = low[tj];
227 const T upp_j = upp[tj];
228 const T pj = pjlambda[tj];
229 const T qj = qjlambda[tj];
230
231 // Precompute reused differences
232 const T diff_u = upp_j - xt;
233 const T diff_l = xt - low_j;
234
235 // Cube once (avoiding pow for speed)
236 const T diff_u3 = diff_u * diff_u * diff_u;
237 const T diff_l3 = diff_l * diff_l * diff_l;
238
239 // Compute inverse value safely
240 T denom = 2.0 * pj / diff_u3 + 2.0 * qj / diff_l3;
241 T val = -1.0 / denom;
242
243 // Mask out active primal constraints
244 bool active = (fabs(xt - alpha[tj]) <= T(1e-16)) ||
245 (fabs(xt - beta[tj]) <= T(1e-16));
246
247 Ljjxinv[tj] = active ? T(0.0) : val;
248}
249
250template <typename T>
251__global__ void mma_dipsolvesub1_kernel(T* __restrict__ x,
252     const T* __restrict__ pjlambda, const T* __restrict__ qjlambda,
253 const T* __restrict__ low, const T* __restrict__ upp,
254 const T* __restrict__ alpha, const T* __restrict__ beta,
255 const int n) {
256
257 int tj = blockIdx.x * blockDim.x + threadIdx.x;
258 if (tj >= n) return;
259
260 // Load inputs into registers
261 const T pj_sqrt = sqrt(pjlambda[tj]);
262 const T qj_sqrt = sqrt(qjlambda[tj]);
263 const T denom = pj_sqrt + qj_sqrt;
264
265 const T low_j = low[tj];
266 const T upp_j = upp[tj];
267 const T alpha_j = alpha[tj];
268 const T beta_j = beta[tj];
269
270 // Weighted average
271 const T val = (pj_sqrt * low_j + qj_sqrt * upp_j) / denom;
272
273 // Clamp x between alpha and beta using branchless min/max
274 x[tj] = fmin(fmax(val, alpha_j), beta_j);
275}
276
277template <typename T>
278__global__ void mattrans_v_mul_kernel(T* __restrict__ output,
279 const T* __restrict__ pij, const T* __restrict__ lambda,
280 const int m, const int n) {
281
282 int tj = blockIdx.x * blockDim.x + threadIdx.x;
283 if (tj >= n) return;
284
285 T acc = 0.0;
286
287 // Use register accumulation
288 for (int i = 0; i < m; ++i) {
289 acc += pij[i + tj * m] * lambda[i];
290 }
291
292 output[tj] = acc;
293}
294
295
296template <typename T>
297__global__ void mma_sub1_kernel(
298     T* __restrict__ xlow,
299     T* __restrict__ xupp,
300 const T* __restrict__ x,
301 const T* __restrict__ xmin,
302 const T* __restrict__ xmax,
303 const T asyinit,
304 const int n) {
305 int tj = blockIdx.x * blockDim.x + threadIdx.x;
306 if (tj >= n) return;
307
308 // Load values once into registers (global memory is slow)
309 const T xt = x[tj];
310 const T xmin_j = xmin[tj];
311 const T xmax_j = xmax[tj];
312
313 // Reuse xgap calculation
314 const T xgap = xmax_j - xmin_j;
315 const T offset = asyinit * xgap;
316
317 xlow[tj] = xt - offset;
318 xupp[tj] = xt + offset;
319}
320
321template< typename T >
322__global__ void mma_sub2_kernel(T* __restrict__ low, T* __restrict__ upp,
323 const T* __restrict__ x, const T* __restrict__ xold1,
324 const T* __restrict__ xold2, const T* __restrict__ xdiff,
325 const T asydecr, const T asyincr, const int n) {
326 int tj = blockIdx.x * blockDim.x + threadIdx.x;
327 if (tj >= n) return;
328
329     // Load data into registers for faster access compared to global memory
330     // when these values are read repeatedly
331 const T xval = x[tj];
332 const T xold1val = xold1[tj];
333 const T xold2val = xold2[tj];
334 const T lowval = low[tj];
335 const T uppval = upp[tj];
336 const T xdiffval = xdiff[tj];
337
338 // Compute the product
339 const T prod = (xval - xold1val) * (xold1val - xold2val);
340
341 // Compute asy_factor without branching
342 T asy_factor = (prod < T(0)) ? asydecr :
343 (prod > T(0)) ? asyincr : T(1);
344
345 // Update low and upp using fma (fused multiply-add) for numerical stability
346     T new_low = fma(-asy_factor, xold1val - lowval, xval);
347     T new_upp = fma(asy_factor, uppval - xold1val, xval);
348
349 // Apply bounds
350 new_low = max(new_low, xval - T(10.0) * xdiffval);
351 new_low = min(new_low, xval - T(0.01) * xdiffval);
352
353 new_upp = min(new_upp, xval + T(10.0) * xdiffval);
354 new_upp = max(new_upp, xval + T(0.01) * xdiffval);
355
356 // Write results back
357 low[tj] = new_low;
358 upp[tj] = new_upp;
359}
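// --- Illustrative usage sketch (not from the Neko-TOP sources): in the standard MMA
// scheme the asymptotes are initialized from asyinit (mma_sub1_kernel) during the first
// iterations and afterwards expanded or contracted depending on the sign of
// (x - xold1)*(xold1 - xold2) (mma_sub2_kernel). The helper name, the iteration threshold
// of 3 and the block size are assumptions for this example only.
template <typename T>
inline void example_update_asymptotes(const int iter, T* low_d, T* upp_d, const T* x_d,
                                      const T* xold1_d, const T* xold2_d,
                                      const T* xmin_d, const T* xmax_d, const T* xdiff_d,
                                      const T asyinit, const T asydecr, const T asyincr,
                                      const int n) {
    const int nthreads = 256;
    const int nblocks = (n + nthreads - 1) / nthreads;
    if (iter < 3) {
        mma_sub1_kernel<T><<<nblocks, nthreads>>>(low_d, upp_d, x_d, xmin_d, xmax_d,
                                                  asyinit, n);
    } else {
        mma_sub2_kernel<T><<<nblocks, nthreads>>>(low_d, upp_d, x_d, xold1_d, xold2_d,
                                                  xdiff_d, asydecr, asyincr, n);
    }
}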
360
361template <typename T>
362__global__ void mma_sub3_kernel( const T* __restrict__ x,
363 const T* __restrict__ df0dx, const T* __restrict__ dfdx,
364 T* __restrict__ low, T* __restrict__ upp, const T* __restrict__ xmin,
365 const T* __restrict__ xmax, T* __restrict__ alpha, T* __restrict__ beta,
366 T* __restrict__ p0j, T* __restrict__ q0j, T* __restrict__ pij,
367 T* __restrict__ qij, const int n, const int m) {
368 int tj = blockIdx.x * blockDim.x + threadIdx.x;
369 if (tj >= n) return;
370
371 // Load into registers once
372 const T xt = x[tj];
373 const T xmin_j = xmin[tj];
374 const T xmax_j = xmax[tj];
375 const T low_j = low[tj];
376 const T upp_j = upp[tj];
377 const T df0 = df0dx[tj];
378 const T xgap = xmax_j - xmin_j;
379
380 // Clamp helpers
381 const T half_xgap = 0.5 * xgap;
382 const T tenth_low_diff = T(0.1) * (xt - low_j);
383 const T tenth_upp_diff = T(0.1) * (upp_j - xt);
384
385 // Compute alpha and beta with fused max/min and fewer calls
386     const T alpha_val = max(xmin_j, max(low_j + tenth_low_diff, xt - half_xgap));
387     const T beta_val = min(xmax_j, min(upp_j - tenth_upp_diff, xt + half_xgap));
388
389 alpha[tj] = alpha_val;
390 beta[tj] = beta_val;
391
392 // Avoid multiple pow calls, compute once
393 const T upp_minus_x = upp_j - xt;
394 const T x_minus_low = xt - low_j;
395     const T upp_minus_x_sq = upp_minus_x * upp_minus_x;
396     const T x_minus_low_sq = x_minus_low * x_minus_low;
397
398 // Small epsilon for numerical stability
399 const T eps = T(1e-5);
400 const T inv_xgap = T(1.0) / max(eps, xgap);
401
402 // Compute terms reused multiple times
403 const T max_df0_pos = max(df0, T(0));
404 const T max_df0_neg = max(-df0, T(0));
405
406 p0j[tj] = upp_minus_x_sq * (T(1.001) * max_df0_pos +
407 T(0.001) * max_df0_neg + eps * inv_xgap);
408 q0j[tj] = x_minus_low_sq * (T(0.001) * max_df0_pos +
409 T(1.001) * max_df0_neg + eps * inv_xgap);
410
411 // Loop over m for pij and qij
412 for (int i = 0; i < m; ++i) {
413 int idx = i + tj * m;
414
415 T dfdx_val = dfdx[idx];
416 T max_pos = max(dfdx_val, T(0));
417 T max_neg = max(-dfdx_val, T(0));
418
419 pij[idx] = upp_minus_x_sq * (T(1.001) * max_pos +
420 T(0.001) * max_neg + eps * inv_xgap);
421 qij[idx] = x_minus_low_sq * (T(0.001) * max_pos +
422 T(1.001) * max_neg + eps * inv_xgap);
423 }
424}
425
426template <typename T>
427__global__ void mma_sub4_kernel(
428 const T* __restrict__ x,
429 const T* __restrict__ low,
430 const T* __restrict__ upp,
431 const T* __restrict__ pij,
432 const T* __restrict__ qij,
433 T* __restrict__ temp,
434 const int n,
435 const int m) {
436
437 int tj = blockIdx.x * blockDim.x + threadIdx.x;
438 if (tj >= n) return;
439
440 // Register caching
441 const T xt = x[tj];
442 const T low_j = low[tj];
443 const T upp_j = upp[tj];
444
445 const T denom_upp = upp_j - xt;
446 const T denom_low = xt - low_j;
447
448 const T eps = T(1e-12); // Precision-dependent epsilon
449 const T inv_denom_upp = T(1) / max(denom_upp, eps);
450 const T inv_denom_low = T(1) / max(denom_low, eps);
451
452 const int base_idx = tj * m;
453
454 for (int i = 0; i < m; ++i) {
455 int idx = base_idx + i;
456 temp[idx] = pij[idx] * inv_denom_upp + qij[idx] * inv_denom_low;
457 }
458}
459
460
461template <typename T>
462__global__ void mma_max2_kernel(
463 T* __restrict__ xsi,
464 const T* __restrict__ x,
465 const T* __restrict__ alpha,
466 const int n) {
467
468 int tj = blockIdx.x * blockDim.x + threadIdx.x;
469 if (tj >= n) return;
470
471 const T eps = T(1e-12);
472 T denom = max(x[tj] - alpha[tj], eps);
473 xsi[tj] = max(T(1), T(1) / denom);
474}
475
476template <typename T>
477__global__ void relambda_kernel(
478 T* __restrict__ temp,
479 const T* __restrict__ x,
480 const T* __restrict__ xupp,
481 const T* __restrict__ xlow,
482 const T* __restrict__ pij,
483 const T* __restrict__ qij,
484 const int n,
485 const int m) {
486
487 int tj = blockIdx.x * blockDim.x + threadIdx.x;
488 if (tj >= n) return;
489
490 const T xt = x[tj];
491 const T xup = xupp[tj];
492 const T xlo = xlow[tj];
493
494 // Prevent divide-by-zero using small epsilon
495 const T eps = T(1e-12);
496 const T inv_denom_upp = T(1) / max(xup - xt, eps);
497 const T inv_denom_low = T(1) / max(xt - xlo, eps);
498
499 int base_idx = tj * m;
500 for (int i = 0; i < m; ++i) {
501 int idx = base_idx + i;
502 temp[idx] = pij[idx] * inv_denom_upp + qij[idx] * inv_denom_low;
503 }
504}
505
506
507template <typename T>
508__global__ void sub2cons2_kernel(
509 T* __restrict__ a,
510 const T* __restrict__ b,
511 const T* __restrict__ c,
512 const T* __restrict__ d,
513 const T e,
514 const int n) {
515 const int idx = blockIdx.x * blockDim.x + threadIdx.x;
516 if (idx >= n) return;
517
518 const T bt = b[idx];
519 const T ct = c[idx];
520 const T dt = d[idx];
521
522 a[idx] = bt * (ct - dt) - e;
523}
524
525template <typename T>
526__inline__ __device__ T max_reduce_warp(T val) {
527 int w = warpSize;
528 for (int offset = w / 2; offset > 0; offset /= 2)
529 val = max(val, __shfl_down(val, offset, w));
530 return val;
531}
532
533template< typename T >
534__global__ void maxval_kernel(const T* __restrict__ a, T *temp, const int n) {
535
536 const int idx = blockIdx.x * blockDim.x + threadIdx.x;
537 const int str = blockDim.x * gridDim.x;
538
539 const unsigned int lane = threadIdx.x % warpSize;
540 const unsigned int wid = threadIdx.x / warpSize;
541
542 __shared__ T shared[16];
543 T maxval = 0.0;
544 for (int i = idx; i < n; i += str) {
545 maxval = max(maxval, abs(a[i]));
546 }
547
548   maxval = max_reduce_warp<T>(maxval);
549 if (lane == 0)
550 shared[wid] = maxval;
551   __syncthreads();
552
553 maxval = (threadIdx.x < blockDim.x / warpSize) ? shared[lane] : 0.0;
554 if (wid == 0)
555   maxval = max_reduce_warp<T>(maxval);
556
557 if (threadIdx.x == 0)
558 temp[blockIdx.x] = maxval;
559
560}
561
562
563template <typename T>
564__global__ void max_reduce_kernel(T* __restrict__ bufred, const int n) {
565
566 T maxval = T(0);
567 const int idx = blockIdx.x * blockDim.x + threadIdx.x;
568 const int stride = blockDim.x * gridDim.x;
569
570 // Grid-stride loop to cover all elements
571 for (int i = idx; i < n; i += stride) {
572 maxval = max(maxval, bufred[i]);
573 }
574
575 __shared__ T shared[16]; // One slot per warp (max 1024 threads/block)
576
577 unsigned int lane = threadIdx.x % warpSize;
578 unsigned int wid = threadIdx.x / warpSize;
579
580 // Warp-level max reduction
581   maxval = max_reduce_warp<T>(maxval);
582
583 // Store each warp's max value in shared memory
584 if (lane == 0) {
585 shared[wid] = maxval;
586 }
587   __syncthreads();
588
589 // Let the first warp reduce the warp-level maxima
590 maxval = (threadIdx.x < blockDim.x / warpSize) ? shared[lane] : T(0);
591 if (wid == 0) {
592     maxval = max_reduce_warp<T>(maxval);
593 }
594
595 // Write the block's maximum value back to bufred[blockIdx.x]
596 if (threadIdx.x == 0) {
597     bufred[blockIdx.x] = maxval;
598 }
599}
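// --- Illustrative usage sketch (not from the Neko-TOP sources): the grid-wide maximum of
// |a| is obtained in two passes, first one partial maximum per block via maxval_kernel,
// then a single block of max_reduce_kernel over those partials. The helper name, the
// block/grid sizes, the use of hipMemcpy (assuming <hip/hip_runtime.h> is available) and
// the requirement that scratch_d holds at least one value per block are assumptions for
// this example only.
template <typename T>
inline T example_device_abs_max(const T* a_d, T* scratch_d, const int n) {
    const int nthreads = 256;
    int nblocks = (n + nthreads - 1) / nthreads;
    if (nblocks > 256) nblocks = 256;  // the grid-stride loop covers the remainder
    maxval_kernel<T><<<nblocks, nthreads>>>(a_d, scratch_d, n);
    max_reduce_kernel<T><<<1, nthreads>>>(scratch_d, nblocks);
    T result;
    hipMemcpy(&result, scratch_d, sizeof(T), hipMemcpyDeviceToHost);
    return result;
}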
600
601
602template <typename T>
603__global__ void delx_kernel(
604     T* __restrict__ delx,
605 const T* __restrict__ x,
606 const T* __restrict__ xlow,
607 const T* __restrict__ xupp,
608 const T* __restrict__ pij,
609 const T* __restrict__ qij,
610 const T* __restrict__ p0j,
611 const T* __restrict__ q0j,
612 const T* __restrict__ alpha,
613 const T* __restrict__ beta,
614 const T* __restrict__ lambda,
615 const T epsi,
616 const int n,
617 const int m)
618{
619 int tj = blockIdx.x * blockDim.x + threadIdx.x;
620 if (tj < n) {
621 T xt = x[tj];
622 T xlow_j = xlow[tj];
623 T xupp_j = xupp[tj];
624 T alpha_j = alpha[tj];
625 T beta_j = beta[tj];
626
627 // Precompute denominators squared for better performance
628 T denom_low = xt - xlow_j;
629 T denom_upp = xupp_j - xt;
630     T denom_alpha = xt - alpha_j;
631 T denom_beta = beta_j - xt;
632
633 const T small_eps = T(1e-12);
634     denom_low = max(denom_low, small_eps);
635     denom_upp = max(denom_upp, small_eps);
636     denom_alpha = max(denom_alpha, small_eps);
637     denom_beta = max(denom_beta, small_eps);
638
638
639 T sum = T(0);
640 for (int i = 0; i < m; i++) {
641 T lambda_i = lambda[i];
642 sum += pij[i + tj * m] * lambda_i / (denom_upp * denom_upp)
643 - qij[i + tj * m] * lambda_i / (denom_low * denom_low);
644 }
645 sum += p0j[tj] / (denom_upp * denom_upp)
646 - q0j[tj] / (denom_low * denom_low)
647 - epsi / denom_alpha
648 + epsi / denom_beta;
649
650 delx[tj] = sum;
651 }
652}
653
654template <typename T>
655__global__ void GG_kernel(
656     T* __restrict__ GG,
657 const T* __restrict__ x,
658 const T* __restrict__ xlow,
659 const T* __restrict__ xupp,
660 const T* __restrict__ pij,
661 const T* __restrict__ qij,
662 const int n,
663 const int m) {
664
665 int tj = blockIdx.x * blockDim.x + threadIdx.x;
666 if (tj < n) {
667 T xt = x[tj];
668 T xlow_j = xlow[tj];
669 T xupp_j = xupp[tj];
670
671 // Distances from bounds
672 T diff_upper = xupp_j - xt;
673 T diff_lower = xt - xlow_j;
674
675 // Squared distances
676     T diff_upper2 = diff_upper * diff_upper;
677     T diff_lower2 = diff_lower * diff_lower;
678
679 for (int i = 0; i < m; i++) {
680 int idx = i + tj * m;
681 GG[idx] = pij[idx] / diff_upper2 - qij[idx] / diff_lower2;
682 }
683 }
684}
685
686template <typename T>
687__global__ void diagx_kernel(T* __restrict__ diagx, const T* __restrict__ x,
688 const T* __restrict__ xsi, const T* __restrict__ xlow,
689 const T* __restrict__ xupp, const T* __restrict__ p0j,
690 const T* __restrict__ q0j, const T* __restrict__ pij,
691 const T* __restrict__ qij, const T* alpha, const T* beta,
692 const T* eta, const T* lambda, const int n, const int m) {
693 int tj = blockIdx.x * blockDim.x + threadIdx.x;
694 if (tj < n) {
695 T sum = 0;
696 T sum1 = 0;
697 for (int i = 0; i < m; i++) {
698             sum = sum + pij[tj * m + i] * lambda[i];
699             sum1 = sum1 + qij[tj * m + i] * lambda[i];
700 }
701 diagx[tj] = (p0j[tj] + sum) / pow(xupp[tj] - x[tj], 3) +
702 (q0j[tj] + sum1) / pow(x[tj] - xlow[tj], 3);
703 diagx[tj] = 2.0 * diagx[tj] + xsi[tj] / (x[tj] - alpha[tj]) +
704 eta[tj] / (beta[tj] - x[tj]);
705 }
706}
707
708
709template <typename T>
710__inline__ __device__ T reduce_warp(T val) {
711 int w = warpSize;
712 for (int offset = w / 2; offset > 0; offset /= 2)
713 val += __shfl_down(val, offset, w);
714 return val;
715}
716
717template <typename T>
718__global__ void mmareduce_kernel(T* __restrict__ bufred, const int n) {
719
720 T sum = 0;
721 const int idx = blockIdx.x * blockDim.x + threadIdx.x;
722 const int str = blockDim.x * gridDim.x;
723 for (int i = idx; i < n; i += str)
724 {
725 sum += bufred[i];
726 }
727
728 __shared__ T shared[16];
729 unsigned int lane = threadIdx.x % warpSize;
730 unsigned int wid = threadIdx.x / warpSize;
731
732   sum = reduce_warp<T>(sum);
733 if (lane == 0)
734 shared[wid] = sum;
735   __syncthreads();
736
737 sum = (threadIdx.x < blockDim.x / warpSize) ? shared[lane] : 0;
738 if (wid == 0)
739   sum = reduce_warp<T>(sum);
740
741 if (threadIdx.x == 0)
742 bufred[blockIdx.x] = sum;
743}
744
745
746template< typename T >
747__global__ void mmasum_kernel(const T* __restrict__ a, T* __restrict__ buf_h,
748 const int n, const int m, const int k) {
749
750 const int idx = blockIdx.x * blockDim.x + threadIdx.x;
751 const int str = blockDim.x * gridDim.x;
752
753 const unsigned int lane = threadIdx.x % warpSize;
754 const unsigned int wid = threadIdx.x / warpSize;
755
756 __shared__ T shared[16];
757 T sum = 0;
758 for (int i = idx; i < n; i += str)
759 {
760 sum += a[m * i + k ];
761 }
762
763   sum = reduce_warp<T>(sum);
764 if (lane == 0)
765 shared[wid] = sum;
766   __syncthreads();
767
768 sum = (threadIdx.x < blockDim.x / warpSize) ? shared[lane] : 0;
769 if (wid == 0)
770   sum = reduce_warp<T>(sum);
771
772 if (threadIdx.x == 0)
773 buf_h[blockIdx.x] = sum;
774
775}
776template< typename T >
777__global__ void mmasumbb_kernel(const T* __restrict__ GG,
778 const T* __restrict__ delx, const T* __restrict__ diagx,
779 T* __restrict__ buf_h, const int n, const int m, const int k) {
780
781 const int idx = blockIdx.x * blockDim.x + threadIdx.x;
782 const int str = blockDim.x * gridDim.x;
783
784 const unsigned int lane = threadIdx.x % warpSize;
785 const unsigned int wid = threadIdx.x / warpSize;
786
787 __shared__ T shared[16];
788 T sum = 0;
789 for (int i = idx; i < n; i += str)
790 {
791 sum += GG[ k + i * m] * delx[i] / diagx[i];
792 }
793
794   sum = reduce_warp<T>(sum);
795 if (lane == 0)
796 shared[wid] = sum;
797   __syncthreads();
798
799 sum = (threadIdx.x < blockDim.x / warpSize) ? shared[lane] : 0;
800 if (wid == 0)
801   sum = reduce_warp<T>(sum);
802
803 if (threadIdx.x == 0)
804 buf_h[blockIdx.x] = sum;
805
806}
807
808template <typename T>
809__global__ void mmasumHess_kernel(
810 const T* __restrict__ hijx,
811 const T* __restrict__ Ljjxinv,
812     T* __restrict__ buf_h,
813 const int n,
814 const int m,
815 const int k0,
816 const int k1)
817{
818 const int idx = blockIdx.x * blockDim.x + threadIdx.x;
819 const int stride = blockDim.x * gridDim.x;
820
821 // Warp lane and warp id within the block
822 const unsigned int lane = threadIdx.x % warpSize;
823 const unsigned int wid = threadIdx.x / warpSize;
824
825 __shared__ T shared[16];
826
827 T sum = T(0);
828
829 // Grid-stride loop for global reduction
830 for (int i = idx; i < n; i += stride) {
831 // hijx is indexed as hijx[offset + row * m]
832 T val0 = hijx[k0 + i * m];
833 T val1 = hijx[k1 + i * m];
834 sum += val0 * Ljjxinv[i] * val1;
835 }
836
837     // Warp-level reduction using the reduce_warp helper defined above
838     sum = reduce_warp<T>(sum);
839
840 // Write each warp's partial sum to shared memory
841 if (lane == 0) {
842 shared[wid] = sum;
843 }
844     __syncthreads();
845
846 // First warp reduces the warp sums in shared memory
847 sum = (threadIdx.x < blockDim.x / warpSize) ? shared[lane] : T(0);
848 if (wid == 0) {
849         sum = reduce_warp<T>(sum);
850 }
851
852 // Write final result from first thread of block
853 if (threadIdx.x == 0) {
854 buf_h[blockIdx.x] = sum;
855 }
856}
857
858template< typename T >
859__global__ void mmasumAA_kernel(const T* __restrict__ GG,
860 const T* __restrict__ diagx, T* __restrict__ buf_h, const int n,
861 const int m, const int k0, const int k1) {
862
863 const int idx = blockIdx.x * blockDim.x + threadIdx.x;
864 const int str = blockDim.x * gridDim.x;
865
866 const unsigned int lane = threadIdx.x % warpSize;
867 const unsigned int wid = threadIdx.x / warpSize;
868
869 __shared__ T shared[16];
870 T sum = 0;
871 for (int i = idx; i < n; i += str)
872 {
873 sum += GG[ k0 + i * m] /diagx[i] * GG[ k1 + i * m];
874 }
875
876   sum = reduce_warp<T>(sum);
877 if (lane == 0)
878 shared[wid] = sum;
879   __syncthreads();
880
881 sum = (threadIdx.x < blockDim.x / warpSize) ? shared[lane] : 0;
882 if (wid == 0)
883   sum = reduce_warp<T>(sum);
884
885 if (threadIdx.x == 0)
886 buf_h[blockIdx.x] = sum;
887
888}
889
890
891template <typename T>
892__global__ void mma_copy_kernel(T* __restrict__ a, const T* __restrict__ b,
893 const int n, const int m) {
894 int tj = blockIdx.x * blockDim.x + threadIdx.x;
895 if(tj<n)
896 a[tj+m]=b[tj];
897}
898
899
900
901template <typename T>
902__global__ void AA_kernel(T* __restrict__ temp, const T* __restrict__ GG,
903 const T* __restrict__ diagx, const int n, const int m) {
904 int tj = blockIdx.x * blockDim.x + threadIdx.x;
905 if (tj < n) {
906 for (int i0 = 0; i0 < m; i0++) {
907 for (int i1 = 0; i1 < m; i1++) {
908 temp[tj + i0 * (n + 1) + i1 * (m + 1) * (n + 1)] = GG[i0 * n + tj] *
909 (1.0 / diagx[tj]) * GG[i1 * n + tj];
910 }
911 }
912 }
913}
914
915
916template <typename T>
917__global__ void dx_kernel(T* __restrict__ dx, const T* __restrict__ delx,
918 const T* __restrict__ diagx, const T* __restrict__ GG,
919 const T* __restrict__ dlambda, const int n, const int m) {
920 int tj = blockIdx.x * blockDim.x + threadIdx.x;
921 if (tj < n) {
922 dx[tj] = -delx[tj]/diagx[tj];
923 for(int i=0;i<m;i++){
924 dx[tj] =dx[tj] - GG[tj*m+i]*dlambda[i]/diagx[tj];
925 }
926 }
927}
928
929
930
931template <typename T>
932__global__ void dxsi_kernel(T* __restrict__ dxsi, const T* __restrict__ xsi,
933 const T* __restrict__ dx, const T* __restrict__ x,
934 const T* __restrict__ alpha, const T epsi, const int n) {
935 int tj = blockIdx.x * blockDim.x + threadIdx.x;
936 if (tj < n) {
937 dxsi[tj]= -xsi[tj] + (epsi-dx[tj]*xsi[tj])/(x[tj] - alpha[tj]);
938 }
939}
940
941
942template <typename T>
943__global__ void deta_kernel(T* __restrict__ deta, const T* __restrict__ eta,
944 const T* __restrict__ dx, const T* __restrict__ x,
945 const T* __restrict__ beta, const T epsi, const int n) {
946 int tj = blockIdx.x * blockDim.x + threadIdx.x;
947 if (tj < n) {
948 deta[tj] = -eta[tj] + (epsi + dx[tj] * eta[tj]) / (beta[tj] - x[tj]);
949 }
950}
951
952template <typename T>
953__global__ void RexCalculation_kernel(
954     T* __restrict__ rex,
955 const T* __restrict__ x,
956 const T* __restrict__ xlow,
957 const T* __restrict__ xupp,
958 const T* __restrict__ pij,
959 const T* __restrict__ p0j,
960 const T* __restrict__ qij,
961 const T* __restrict__ q0j,
962 const T* __restrict__ lambda,
963 const T* __restrict__ xsi,
964 const T* __restrict__ eta,
965 const int n,
966 const int m)
967{
968 int tj = blockIdx.x * blockDim.x + threadIdx.x;
969 if (tj < n) {
970 T upp_diff = xupp[tj] - x[tj];
971 T low_diff = x[tj] - xlow[tj];
972     T upp_diff_sq = upp_diff * upp_diff;
973     T low_diff_sq = low_diff * low_diff;
974
975 T sum = 0.0;
976 for (int i = 0; i < m; ++i) {
977 sum += pij[i + tj * m] * lambda[i] / upp_diff_sq
978 - qij[i + tj * m] * lambda[i] / low_diff_sq;
979 }
980
981 rex[tj] = sum
982 + p0j[tj] / upp_diff_sq
983 - q0j[tj] / low_diff_sq
984 - xsi[tj] + eta[tj];
985 }
986}
987
988template <typename T>
989__global__ void rey_calculation_kernel(T* __restrict__ rey,
990 const T* __restrict__ c, const T* __restrict__ d, const T* __restrict__ y,
991 const T* __restrict__ lambda, const T* __restrict__ mu, const int n) {
992 int tj = blockIdx.x * blockDim.x + threadIdx.x;
993 if (tj < n) {
994 rey[tj] = c[tj] + d[tj] * y[tj] - lambda[tj] - mu[tj];
995 }
996}
997
998
999template< typename T >
1000__global__ void norm_kernel(const T* __restrict__ a, T* __restrict__ buf_h,
1001 const int n) {
1002
1003 const int idx = blockIdx.x * blockDim.x + threadIdx.x;
1004 const int str = blockDim.x * gridDim.x;
1005
1006 const unsigned int lane = threadIdx.x % warpSize;
1007 const unsigned int wid = threadIdx.x / warpSize;
1008
1009 __shared__ T shared[16];
1010 T sum = 0;
1011 for (int i = idx; i < n; i += str)
1012 {
1013 sum += pow(a[i], 2);
1014 }
1015
1016   sum = reduce_warp<T>(sum);
1017 if (lane == 0)
1018 shared[wid] = sum;
1019 __syncthreads();
1020
1021 sum = (threadIdx.x < blockDim.x / warpSize) ? shared[lane] : 0;
1022 if (wid == 0)
1023   sum = reduce_warp<T>(sum);
1024
1025 if (threadIdx.x == 0)
1026 buf_h[blockIdx.x] = sum;
1027
1028}
1029
1030
1031
1032template <typename T>
1033__global__ void sub2cons_kernel(T* __restrict__ a, const T* __restrict__ b,
1034 const T* __restrict__ c,
1035 const T d, const int n) {
1036 int tj = blockIdx.x * blockDim.x + threadIdx.x;
1037 if (tj < n) {
1038 a[tj] = b[tj]*c[tj]-d;
1039 }
1040}
1041
1042
1043template <typename T>
1044__global__ void dely_kernel(T* __restrict__ dely, const T* __restrict__ c,
1045 const T* __restrict__ d, const T* __restrict__ y,
1046 const T* __restrict__ lambda, const T epsi, const int n) {
1047 int tj = blockIdx.x * blockDim.x + threadIdx.x;
1048 if (tj < n) {
1049 dely[tj] = c[tj] + d[tj]*y[tj] - lambda[tj] - epsi/y[tj];
1050 }
1051}
1052
1053
1054
1055template< typename T >
1056__global__ void maxval2_kernel(const T* __restrict__ a, const T* __restrict__ b,
1057 T* __restrict__ temp, const T cons, const int n) {
1058
1059 const int idx = blockIdx.x * blockDim.x + threadIdx.x;
1060 const int str = blockDim.x * gridDim.x;
1061
1062 const unsigned int lane = threadIdx.x % warpSize;
1063 const unsigned int wid = threadIdx.x / warpSize;
1064
1065 __shared__ T shared[16];
1066 T maxval = cons * a[0] / b[0];
1067 for (int i = idx; i < n; i += str)
1068 {
1069 maxval = max(maxval, cons * a[i] / b[i]);
1070 }
1071
1072   maxval = max_reduce_warp<T>(maxval);
1073 if (lane == 0)
1074 shared[wid] = maxval;
1075 __syncthreads();
1076
1077 maxval = (threadIdx.x < blockDim.x / warpSize) ? shared[lane] : 0.0;
1078 if (wid == 0)
1079   maxval = max_reduce_warp<T>(maxval);
1080
1081 if (threadIdx.x == 0)
1082 temp[blockIdx.x] = maxval;
1083
1084}
1085
1086
1087template< typename T >
1088__global__ void maxval3_kernel(const T* __restrict__ a, const T* __restrict__ b,
1089 const T* __restrict__ c, T* __restrict__ temp, const T cons, const int n) {
1090
1091 const int idx = blockIdx.x * blockDim.x + threadIdx.x;
1092 const int str = blockDim.x * gridDim.x;
1093
1094 const unsigned int lane = threadIdx.x % warpSize;
1095 const unsigned int wid = threadIdx.x / warpSize;
1096
1097 __shared__ T shared[16];
1098   T maxval = cons * a[0] / (b[0] - c[0]);
1099 for (int i = idx; i < n; i += str)
1100 {
1101 maxval = max(maxval, cons * a[i] / (b[i] - c[i]));
1102 }
1103
1104   maxval = max_reduce_warp<T>(maxval);
1105 if (lane == 0)
1106 shared[wid] = maxval;
1107 __syncthreads();
1108
1109 maxval = (threadIdx.x < blockDim.x / warpSize) ? shared[lane] : 0;
1110 if (wid == 0)
1111   maxval = max_reduce_warp<T>(maxval);
1112
1113 if (threadIdx.x == 0)
1114 temp[blockIdx.x] = maxval;
1115
1116}
1117
1118
1119template <typename T>
1120__global__ void kkt_rex_kernel(T* __restrict__ rex, const T* __restrict__ df0dx,
1121 const T* __restrict__ dfdx, const T* __restrict__ xsi,
1122 const T* __restrict__ eta, const T* __restrict__ lambda, const int n,
1123 const int m) {
1124 int tj = blockIdx.x * blockDim.x + threadIdx.x;
1125 if (tj < n) {
1126 rex[tj] = 0.0;
1127 for (int i = 0; i < m; i++) {
1128 rex[tj] = rex[tj] + dfdx[i + tj*m] * lambda[i];
1129 }
1130 rex[tj] += df0dx[tj] - xsi[tj] + eta[tj];
1131 }
1132}
1133
1134
1135template <typename T>
1136__global__ void maxcons_kernel(T* __restrict__ a, const T b,
1137 const T c, const T* __restrict__ d, const int n) {
1138 int tj = blockIdx.x * blockDim.x + threadIdx.x;
1139 if (tj < n) {
1140 a[tj] = max(b, c * d[tj]);
1141 }
1142}
1143
1146 template< typename T >
1147 __global__ void glsum_kernel(const T * a, T * buf_h, const int n) {
1148 const int idx = blockIdx.x * blockDim.x + threadIdx.x;
1149 const int str = blockDim.x * gridDim.x;
1150
1151 const unsigned int lane = threadIdx.x % warpSize;
1152 const unsigned int wid = threadIdx.x / warpSize;
1153
1154 __shared__ T shared[16];
1155 T sum = 0;
1156 for (int i = idx; i<n ; i += str)
1157 {
1158 sum += a[i];
1159 }
1160
1161   sum = reduce_warp<T>(sum);
1162 if (lane == 0)
1163 shared[wid] = sum;
1164 __syncthreads();
1165
1166 sum = (threadIdx.x < blockDim.x / warpSize) ? shared[lane] : 0;
1167 if (wid == 0)
1168   sum = reduce_warp<T>(sum);
1169
1170 if (threadIdx.x == 0)
1171 buf_h[blockIdx.x] = sum;
1172
1173 }
1174 template< typename T >
1175__global__ void glsc2_kernel(const T * a,
1176 const T * b,
1177 T * buf_h,
1178 const int n) {
1179
1180 const int idx = blockIdx.x * blockDim.x + threadIdx.x;
1181 const int str = blockDim.x * gridDim.x;
1182
1183 const unsigned int lane = threadIdx.x % warpSize;
1184 const unsigned int wid = threadIdx.x / warpSize;
1185
1186 __shared__ T shared[16];
1187 T sum = 0.0;
1188 for (int i = idx; i < n; i+= str) {
1189 sum += a[i] * b[i];
1190 }
1191
1192   sum = reduce_warp<T>(sum);
1193 if (lane == 0)
1194 shared[wid] = sum;
1195 __syncthreads();
1196
1197 sum = (threadIdx.x < blockDim.x / warpSize) ? shared[lane] : 0;
1198 if (wid == 0)
1199   sum = reduce_warp<T>(sum);
1200
1201 if (threadIdx.x == 0)
1202 buf_h[blockIdx.x] = sum;
1203
1204 }
1205
1206
1207
1208
1209
1210template <typename T>
1211__global__ void add2inv2_kernel(T* __restrict__ a, const T* __restrict__ b,
1212 const T c, const int n) {
1213 int tj = blockIdx.x * blockDim.x + threadIdx.x;
1214 if (tj < n) {
1215 a[tj] = a[tj]+c/b[tj];
1216 }
1217}
1218
1219template <typename T>
1220__global__ void max2_kernel(T* __restrict__ a, const T b,
1221 const T* __restrict__ c, const T d, const int n) {
1222 int tj = blockIdx.x * blockDim.x + threadIdx.x;
1223 if (tj < n) {
1224 a[tj]=max(b, d*c[tj]);
1225 }
1226}
1227
1228template <typename T>
1229__global__ void updatebb_kernel(T* __restrict__ bb,
1230 const T* __restrict__ dellambda, const T* __restrict__ dely,
1231 const T* __restrict__ d, const T* __restrict__ mu,
1232 const T* __restrict__ y, const T delz, const int m) {
1233 int tj = blockIdx.x * blockDim.x + threadIdx.x;
1234 if(tj<m)
1235 bb[tj]=dellambda[tj] + dely[tj]/(d[tj] + mu[tj]/y[tj]) - bb[tj];
1236 else if(tj<m+1)
1237 bb[tj]=delz;
1238}
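// --- Illustrative usage sketch (not from the Neko-TOP sources): updatebb_kernel fills m
// constraint entries of the right-hand side plus the final delz entry, so the launch has
// to cover m + 1 threads. The helper name and the block size are assumptions for this
// example only.
template <typename T>
inline void example_launch_updatebb(T* bb_d, const T* dellambda_d, const T* dely_d,
                                    const T* d_d, const T* mu_d, const T* y_d,
                                    const T delz, const int m) {
    const int nthreads = 256;
    const int nblocks = (m + 1 + nthreads - 1) / nthreads;
    updatebb_kernel<T><<<nblocks, nthreads>>>(bb_d, dellambda_d, dely_d, d_d, mu_d, y_d,
                                              delz, m);
}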
1239
1240
1241
1242template <typename T>
1243__global__ void updateAA_kernel(T* __restrict__ AA,
1244 const T* __restrict__ globaltmp_mm, const T* __restrict__ s,
1245 const T* __restrict__ lambda, const T* __restrict__ d,
1246 const T* __restrict__ mu, const T* __restrict__ y, const T* __restrict__ a,
1247 const T zeta, const T z, const int m) {
1248 int tj = blockIdx.x * blockDim.x + threadIdx.x;
1249 if(tj<m)
1250 {
1251 AA[tj+tj*(m+1)]=globaltmp_mm[tj+tj*m] + (s[tj] / lambda[tj] +
1252 1.0/ (d[tj] + mu[tj] / y[tj]));
1253 AA[tj+m*(m+1)]=a[tj];
1254 AA[m+tj*(m+1)]=a[tj];
1255 }
1256 else if(tj<m+1)
1257 AA[tj+tj*(m+1)]= -zeta/z;
1258}
1259
1260template <typename T>
1261__global__ void dy_kernel(T* __restrict__ dy, const T* __restrict__ dely,
1262 const T* __restrict__ dlambda, const T* __restrict__ d,
1263 const T* __restrict__ mu, const T* __restrict__ y, const int n) {
1264 int tj = blockIdx.x * blockDim.x + threadIdx.x;
1265 if(tj<n)
1266 dy[tj] = (-dely[tj]+dlambda[tj])/(d[tj] + mu[tj]/y[tj]);
1267}
1268
1269#endif
1270