Neko-TOP
A portable framework for high-order spectral element flow topology optimization.
mma_kernel.h
/*
 Copyright (c) 2021-2025, The Neko Authors
 All rights reserved.

 Redistribution and use in source and binary forms, with or without
 modification, are permitted provided that the following conditions
 are met:

   * Redistributions of source code must retain the above copyright
     notice, this list of conditions and the following disclaimer.

   * Redistributions in binary form must reproduce the above
     copyright notice, this list of conditions and the following
     disclaimer in the documentation and/or other materials provided
     with the distribution.

   * Neither the name of the authors nor the names of its
     contributors may be used to endorse or promote products derived
     from this software without specific prior written permission.

 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
 COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
 ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 POSSIBILITY OF SUCH DAMAGE.
*/

#ifndef MMA_HIP_KERNEL_H
#define MMA_HIP_KERNEL_H


template<typename T>
__global__ void mma_prepare_aa_matrix_kernel(T* __restrict__ AA,
    const T* __restrict__ s, const T* __restrict__ lambda,
    const T* __restrict__ d, const T* __restrict__ mu,
    const T* __restrict__ y, const T* __restrict__ a,
    const T zeta, const T z, const int m) {
  const int tj = blockIdx.x * blockDim.x + threadIdx.x;
  const int matrix_size = m + 1;

  if (tj >= m) return;
  AA[tj * matrix_size + tj] += s[tj] / lambda[tj] +
      (T)1.0 / (d[tj] + mu[tj] / y[tj]);
  AA[tj * matrix_size + m] = a[tj]; // last column (index m)
  AA[m * matrix_size + tj] = a[tj]; // last row (index m)

  // Only the first thread updates the bottom-right corner element.
  if (tj == 0)
    AA[m * matrix_size + m] = -zeta / z;
}
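/*
 * Illustrative usage sketch (not part of the original header). The kernel
 * above assembles the bordered (m+1) x (m+1) matrix AA used in the MMA
 * subsolve: the first m diagonal entries receive s/lambda + 1/(d + mu/y),
 * the last row and column hold the constraint weights a, and the corner
 * holds -zeta/z. A launch covering the m constraint rows could look like the
 * following; the device pointer names, the block size, and the use of double
 * precision are assumptions made for this example only.
 *
 *   const int threads = 256;
 *   const int blocks = (m + threads - 1) / threads;
 *   mma_prepare_aa_matrix_kernel<double><<<blocks, threads>>>(
 *       AA_d, s_d, lambda_d, d_d, mu_d, y_d, a_d, zeta, z, m);
 */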

// Update Hessian diagonal elements (y contributions for the dip subsolve)
template<typename T>
__global__ void mma_update_hessian_diagonal_kernel(T* __restrict__ Hess,
    const T* __restrict__ y, const T* __restrict__ d,
    const T* __restrict__ mu, const T* __restrict__ lambda, const int m) {
  int tj = blockIdx.x * blockDim.x + threadIdx.x;
  if (tj >= m) return;

  T diag = Hess[tj * m + tj];
  // Contribution from y terms (inactive constraints)
  if (y[tj] > (T)0.0) {
    if (fabs(d[tj]) >= (T)1.0e-15) {
      diag -= (T)1.0 / d[tj];
    }
    // else: d is numerically zero, so the 1/d contribution is skipped
  }

  // Contribution from -mu/lambda (eq 10)
  diag -= mu[tj] / lambda[tj];
  Hess[tj * m + tj] = diag;
}

// Heuristic Levenberg-Marquardt-style regularization of the Hessian diagonal.
// Single-block version for m <= 1024.
template<typename T>
__global__ void mma_stabilize_hessian_single_kernel(T* __restrict__ Hess, const int m) {
  const int tid = threadIdx.x;

  // A single thread computes the trace and the LM factor
  if (tid == 0) {
    T trace = (T)0.0;
    for (int j = 0; j < m; j++) {
      trace += Hess[j * m + j];
    }
    T lm_factor = max((T)(-1.0e-4) * trace / m, (T)1.0e-7);

    // Apply to all diagonal elements
    for (int j = 0; j < m; j++) {
      Hess[j * m + j] -= lm_factor;
    }
  }
}

// Heuristic Levenberg-Marquardt-style regularization of the Hessian diagonal.
// Multi-block version for m > 1024; lm_factor is precomputed by the caller.
template<typename T>
__global__ void mma_stabilize_hessian_multi_kernel(T* __restrict__ Hess, const T lm_factor, const int m) {
  const int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < m) {
    Hess[i * m + i] -= lm_factor;
  }
}

// Small dense LU solver with partial pivoting, intended for n <= 100
template<typename T>
__global__ void mma_small_lu_kernel(T* __restrict__ A, T* __restrict__ b, const int n) {
  const int tid = threadIdx.x;

  // Handle the 1x1 case
  if (n == 1) {
    if (tid == 0 && abs(A[0]) > (T)1e-12) b[0] /= A[0];
    return;
  }

  // LU decomposition with partial pivoting
  for (int k = 0; k < n; k++) {
    // Pivoting - single thread
    if (tid == 0) {
      int max_row = k;
      T max_val = abs(A[k * n + k]);
      for (int i = k + 1; i < n; i++) {
        T val = abs(A[i * n + k]);
        if (val > max_val) {
          max_val = val;
          max_row = i;
        }
      }
      if (max_val > (T)1e-12 && max_row != k) {
        // Swap rows
        for (int j = k; j < n; j++) {
          T temp = A[k * n + j];
          A[k * n + j] = A[max_row * n + j];
          A[max_row * n + j] = temp;
        }
        // Swap rhs
        T temp_b = b[k];
        b[k] = b[max_row];
        b[max_row] = temp_b;
      }
    }
    __syncthreads();

    // Parallel elimination
    T diag = A[k * n + k];
    if (abs(diag) > (T)1e-12) {
      for (int i = tid + k + 1; i < n; i += blockDim.x) {
        T factor = A[i * n + k] / diag;
        A[i * n + k] = factor;
        for (int j = k + 1; j < n; j++) {
          A[i * n + j] -= factor * A[k * n + j];
        }
      }
    }
    __syncthreads();
  }

  // Forward and backward substitution are done by a single thread: row i of
  // the forward sweep depends on all rows j < i, and row i of the backward
  // sweep depends on all rows j > i, so distributing rows over threads
  // without synchronization would race on b.
  if (tid == 0) {
    // Forward substitution with the unit-lower-triangular factor
    for (int i = 0; i < n; i++) {
      T sum = b[i];
      for (int j = 0; j < i; j++) {
        sum -= A[i * n + j] * b[j];
      }
      b[i] = sum;
    }

    // Backward substitution with the upper-triangular factor
    for (int i = n - 1; i >= 0; i--) {
      T sum = b[i];
      for (int j = i + 1; j < n; j++) {
        sum -= A[i * n + j] * b[j];
      }
      if (abs(A[i * n + i]) > (T)1e-12) {
        b[i] = sum / A[i * n + i];
      }
    }
  }
  __syncthreads();
}
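/*
 * Illustrative sketch (not part of the original header). Because this solver
 * synchronizes with __syncthreads() and indexes threads only through
 * threadIdx.x, it is meant to be launched as a single block. For the small
 * (m+1) x (m+1) systems arising in the MMA subsolve, a call could look like
 * the following; AA_d, bb_d, the block size, and double precision are
 * assumptions for this example only.
 *
 *   const int nsys = m + 1;                       // system size
 *   mma_small_lu_kernel<double><<<1, 64>>>(AA_d, bb_d, nsys);
 *   // On return, bb_d holds the solution of the original system AA_d * x = bb_d.
 */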


template <typename T>
__global__ void delta_1dbeam_kernel(T* __restrict__ Delta,
    const T L_total, const T Le, const int offset, const int n) {
  int k = blockIdx.x * blockDim.x + threadIdx.x;
  if (k >= n) return;

  // Convert to 1-based indexing for the calculation
  int idx = k + 1;

  // Calculate first term: (L_total - Le*(offset+idx-1))^3
  T x1 = L_total - Le * static_cast<T>(offset + idx - 1);
  T term1 = x1 * x1 * x1;

  // Calculate second term: (L_total - Le*(offset+idx))^3
  T x2 = L_total - Le * static_cast<T>(offset + idx);
  T term2 = x2 * x2 * x2;

  // Final result
  Delta[k] = (term1 - term2) / static_cast<T>(3.0);
}
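// Note (added commentary, not in the original header): with
//   a = L_total - Le * (offset + idx - 1) and b = L_total - Le * (offset + idx),
// the value stored above is Delta[k] = (a^3 - b^3) / 3, which is the exact
// integral of x^2 over the interval [b, a], i.e. over one element of length Le.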

template <typename T>
__global__ void mma_Ljjxinv_kernel(T* __restrict__ Ljjxinv,
    const T* __restrict__ pjlambda, const T* __restrict__ qjlambda,
    const T* __restrict__ x, const T* __restrict__ low, const T* __restrict__ upp,
    const T* __restrict__ alpha, const T* __restrict__ beta,
    const int n) {
  int tj = blockIdx.x * blockDim.x + threadIdx.x;
  if (tj >= n) return;

  // Load inputs into registers
  const T xt = x[tj];
  const T low_j = low[tj];
  const T upp_j = upp[tj];
  const T pj = pjlambda[tj];
  const T qj = qjlambda[tj];

  // Precompute reused differences
  const T diff_u = upp_j - xt;
  const T diff_l = xt - low_j;

  // Cube once (avoiding pow for speed)
  const T diff_u3 = diff_u * diff_u * diff_u;
  const T diff_l3 = diff_l * diff_l * diff_l;

  // Compute inverse value safely
  T denom = 2.0 * pj / diff_u3 + 2.0 * qj / diff_l3;
  T val = -1.0 / denom;

  // Mask out active primal constraints
  bool active = (fabs(xt - alpha[tj]) <= T(1e-16)) ||
                (fabs(xt - beta[tj]) <= T(1e-16));

  Ljjxinv[tj] = active ? T(0.0) : val;
}

template <typename T>
__global__ void mma_dipsolvesub1_kernel(T* __restrict__ x,
    const T* __restrict__ pjlambda, const T* __restrict__ qjlambda,
    const T* __restrict__ low, const T* __restrict__ upp,
    const T* __restrict__ alpha, const T* __restrict__ beta,
    const int n) {

  int tj = blockIdx.x * blockDim.x + threadIdx.x;
  if (tj >= n) return;

  // Load inputs into registers
  const T pj_sqrt = sqrt(pjlambda[tj]);
  const T qj_sqrt = sqrt(qjlambda[tj]);
  const T denom = pj_sqrt + qj_sqrt;

  const T low_j = low[tj];
  const T upp_j = upp[tj];
  const T alpha_j = alpha[tj];
  const T beta_j = beta[tj];

  // Weighted average
  const T val = (pj_sqrt * low_j + qj_sqrt * upp_j) / denom;

  // Clamp x between alpha and beta using branchless min/max
  x[tj] = fmin(fmax(val, alpha_j), beta_j);
}
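// Note (added commentary, not in the original header): the weighted average
// above is the unconstrained minimizer of the separable approximation
//   phi_j(x) = pjlambda_j / (upp_j - x) + qjlambda_j / (x - low_j),
// since phi_j'(x) = pjlambda_j / (upp_j - x)^2 - qjlambda_j / (x - low_j)^2 = 0
// gives x = (sqrt(pjlambda_j) * low_j + sqrt(qjlambda_j) * upp_j)
//           / (sqrt(pjlambda_j) + sqrt(qjlambda_j)).
// The result is then clamped to the move limits [alpha_j, beta_j].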

template <typename T>
__global__ void mattrans_v_mul_kernel(T* __restrict__ output,
    const T* __restrict__ pij, const T* __restrict__ lambda,
    const int m, const int n) {

  int tj = blockIdx.x * blockDim.x + threadIdx.x;
  if (tj >= n) return;

  T acc = 0.0;

  // Use register accumulation
  for (int i = 0; i < m; ++i) {
    acc += pij[i + tj * m] * lambda[i];
  }

  output[tj] = acc;
}


template <typename T>
__global__ void mma_sub1_kernel(
    T* __restrict__ xlow,
    T* __restrict__ xupp,
    const T* __restrict__ x,
    const T* __restrict__ xmin,
    const T* __restrict__ xmax,
    const T asyinit,
    const int n) {
  int tj = blockIdx.x * blockDim.x + threadIdx.x;
  if (tj >= n) return;

  // Load values once into registers (global memory is slow)
  const T xt = x[tj];
  const T xmin_j = xmin[tj];
  const T xmax_j = xmax[tj];

  // Reuse xgap calculation
  const T xgap = xmax_j - xmin_j;
  const T offset = asyinit * xgap;

  xlow[tj] = xt - offset;
  xupp[tj] = xt + offset;
}
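/*
 * Illustrative sketch (not part of the original header): the element-wise
 * kernels in this file use one thread per design variable with an early
 * return for tj >= n, so a 1-D launch sized from n is sufficient. The device
 * pointer names and block size are assumptions for this example only.
 *
 *   const int threads = 256;
 *   const int blocks = (n + threads - 1) / threads;
 *   mma_sub1_kernel<double><<<blocks, threads>>>(
 *       xlow_d, xupp_d, x_d, xmin_d, xmax_d, asyinit, n);
 */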

template< typename T >
__global__ void mma_sub2_kernel(T* __restrict__ low, T* __restrict__ upp,
    const T* __restrict__ x, const T* __restrict__ xold1,
    const T* __restrict__ xold2, const T* __restrict__ xdiff,
    const T asydecr, const T asyincr, const int n) {
  int tj = blockIdx.x * blockDim.x + threadIdx.x;
  if (tj >= n) return;

  // Load data into registers: repeated accesses are much cheaper here than
  // in global memory
  const T xval = x[tj];
  const T xold1val = xold1[tj];
  const T xold2val = xold2[tj];
  const T lowval = low[tj];
  const T uppval = upp[tj];
  const T xdiffval = xdiff[tj];

  // The sign of this product indicates whether the iterate is oscillating
  const T prod = (xval - xold1val) * (xold1val - xold2val);

  // Select the asymptote factor without divergent branching
  T asy_factor = (prod < T(0)) ? asydecr :
                 (prod > T(0)) ? asyincr : T(1);

  // Update low and upp using fma (fused multiply-add) for accuracy
  T new_low = fma(-asy_factor, (xold1val - lowval), xval);
  T new_upp = fma(asy_factor, (uppval - xold1val), xval);

  // Apply bounds
  new_low = max(new_low, xval - T(10.0) * xdiffval);
  new_low = min(new_low, xval - T(0.01) * xdiffval);

  new_upp = min(new_upp, xval + T(10.0) * xdiffval);
  new_upp = max(new_upp, xval + T(0.01) * xdiffval);

  // Write results back
  low[tj] = new_low;
  upp[tj] = new_upp;
}
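// Note (added commentary, not in the original header): this is the usual MMA
// asymptote update. With gamma_j = asydecr, asyincr or 1 depending on the sign
// of (x_j - x_j^(k-1)) * (x_j^(k-1) - x_j^(k-2)),
//   low_j = x_j - gamma_j * (x_j^(k-1) - low_j^old),
//   upp_j = x_j + gamma_j * (upp_j^old - x_j^(k-1)),
// followed by clipping low_j to [x_j - 10*xdiff_j, x_j - 0.01*xdiff_j] and
// upp_j to [x_j + 0.01*xdiff_j, x_j + 10*xdiff_j].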

template <typename T>
__global__ void mma_sub3_kernel( const T* __restrict__ x,
    const T* __restrict__ df0dx, const T* __restrict__ dfdx,
    T* __restrict__ low, T* __restrict__ upp, const T* __restrict__ xmin,
    const T* __restrict__ xmax, T* __restrict__ alpha, T* __restrict__ beta,
    T* __restrict__ p0j, T* __restrict__ q0j, T* __restrict__ pij,
    T* __restrict__ qij, const int n, const int m) {
  int tj = blockIdx.x * blockDim.x + threadIdx.x;
  if (tj >= n) return;

  // Load into registers once
  const T xt = x[tj];
  const T xmin_j = xmin[tj];
  const T xmax_j = xmax[tj];
  const T low_j = low[tj];
  const T upp_j = upp[tj];
  const T df0 = df0dx[tj];
  const T xgap = xmax_j - xmin_j;

  // Clamp helpers
  const T half_xgap = 0.5 * xgap;
  const T tenth_low_diff = T(0.1) * (xt - low_j);
  const T tenth_upp_diff = T(0.1) * (upp_j - xt);

  // Compute alpha and beta with fused max/min and fewer calls
  T alpha_val = max(max(xmin_j, low_j + tenth_low_diff), xt - half_xgap);
  T beta_val = min(min(xmax_j, upp_j - tenth_upp_diff), xt + half_xgap);

  alpha[tj] = alpha_val;
  beta[tj] = beta_val;

  // Avoid multiple pow calls, compute once
  const T upp_minus_x = upp_j - xt;
  const T x_minus_low = xt - low_j;
  const T upp_minus_x_sq = upp_minus_x * upp_minus_x;
  const T x_minus_low_sq = x_minus_low * x_minus_low;

  // Small epsilon for numerical stability
  const T eps = T(1e-5);
  const T inv_xgap = T(1.0) / max(eps, xgap);

  // Compute terms reused multiple times
  const T max_df0_pos = max(df0, T(0));
  const T max_df0_neg = max(-df0, T(0));

  p0j[tj] = upp_minus_x_sq * (T(1.001) * max_df0_pos +
            T(0.001) * max_df0_neg + eps * inv_xgap);
  q0j[tj] = x_minus_low_sq * (T(0.001) * max_df0_pos +
            T(1.001) * max_df0_neg + eps * inv_xgap);

  // Loop over m for pij and qij
  for (int i = 0; i < m; ++i) {
    int idx = i + tj * m;

    T dfdx_val = dfdx[idx];
    T max_pos = max(dfdx_val, T(0));
    T max_neg = max(-dfdx_val, T(0));

    pij[idx] = upp_minus_x_sq * (T(1.001) * max_pos +
               T(0.001) * max_neg + eps * inv_xgap);
    qij[idx] = x_minus_low_sq * (T(0.001) * max_pos +
               T(1.001) * max_neg + eps * inv_xgap);
  }
}
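// Note (added commentary, not in the original header): the coefficients above
// define the separable MMA approximation of the objective (p0j, q0j) and of
// each constraint i (pij, qij) around the current iterate,
//   f_i(x) ~ r_i + sum_j ( p_ij / (upp_j - x_j) + q_ij / (x_j - low_j) ),
// where the positive part of the gradient is weighted by 1.001, the negative
// part by 0.001, and the eps/(xmax_j - xmin_j) term keeps both coefficients
// strictly positive.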

template <typename T>
__global__ void mma_sub4_kernel(
    const T* __restrict__ x,
    const T* __restrict__ low,
    const T* __restrict__ upp,
    const T* __restrict__ pij,
    const T* __restrict__ qij,
    T* __restrict__ temp,
    const int n,
    const int m) {

  int tj = blockIdx.x * blockDim.x + threadIdx.x;
  if (tj >= n) return;

  // Register caching
  const T xt = x[tj];
  const T low_j = low[tj];
  const T upp_j = upp[tj];

  const T denom_upp = upp_j - xt;
  const T denom_low = xt - low_j;

  const T eps = T(1e-12); // Precision-dependent epsilon
  const T inv_denom_upp = T(1) / max(denom_upp, eps);
  const T inv_denom_low = T(1) / max(denom_low, eps);

  const int base_idx = tj * m;

  for (int i = 0; i < m; ++i) {
    int idx = base_idx + i;
    temp[idx] = pij[idx] * inv_denom_upp + qij[idx] * inv_denom_low;
  }
}


template <typename T>
__global__ void mma_max2_kernel(
    T* __restrict__ xsi,
    const T* __restrict__ x,
    const T* __restrict__ alpha,
    const int n) {

  int tj = blockIdx.x * blockDim.x + threadIdx.x;
  if (tj >= n) return;

  const T eps = T(1e-12);
  T denom = max(x[tj] - alpha[tj], eps);
  xsi[tj] = max(T(1), T(1) / denom);
}

template <typename T>
__global__ void relambda_kernel(
    T* __restrict__ temp,
    const T* __restrict__ x,
    const T* __restrict__ xupp,
    const T* __restrict__ xlow,
    const T* __restrict__ pij,
    const T* __restrict__ qij,
    const int n,
    const int m) {

  int tj = blockIdx.x * blockDim.x + threadIdx.x;
  if (tj >= n) return;

  const T xt = x[tj];
  const T xup = xupp[tj];
  const T xlo = xlow[tj];

  // Prevent divide-by-zero using small epsilon
  const T eps = T(1e-12);
  const T inv_denom_upp = T(1) / max(xup - xt, eps);
  const T inv_denom_low = T(1) / max(xt - xlo, eps);

  int base_idx = tj * m;
  for (int i = 0; i < m; ++i) {
    int idx = base_idx + i;
    temp[idx] = pij[idx] * inv_denom_upp + qij[idx] * inv_denom_low;
  }
}


template <typename T>
__global__ void sub2cons2_kernel(
    T* __restrict__ a,
    const T* __restrict__ b,
    const T* __restrict__ c,
    const T* __restrict__ d,
    const T e,
    const int n) {
  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  if (idx >= n) return;

  const T bt = b[idx];
  const T ct = c[idx];
  const T dt = d[idx];

  a[idx] = bt * (ct - dt) - e;
}

// Warp-level max reduction using shuffle-down
template <typename T>
__inline__ __device__ T max_reduce_warp(T val) {
  int w = warpSize;
  for (int offset = w / 2; offset > 0; offset /= 2)
    val = max(val, __shfl_down(val, offset, w));
  return val;
}

// Block-level reduction of max(|a[i]|); each block writes its partial result
// to temp[blockIdx.x].
template< typename T >
__global__ void maxval_kernel(const T* __restrict__ a, T *temp, const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  const unsigned int lane = threadIdx.x % warpSize;
  const unsigned int wid = threadIdx.x / warpSize;

  __shared__ T shared[16];
  T maxval = 0.0;
  for (int i = idx; i < n; i += str) {
    maxval = max(maxval, abs(a[i]));
  }

  maxval = max_reduce_warp<T>(maxval);
  if (lane == 0)
    shared[wid] = maxval;
  __syncthreads();

  maxval = (threadIdx.x < blockDim.x / warpSize) ? shared[lane] : 0.0;
  if (wid == 0)
    maxval = max_reduce_warp<T>(maxval);

  if (threadIdx.x == 0)
    temp[blockIdx.x] = maxval;

}


template <typename T>
__global__ void max_reduce_kernel(T* __restrict__ bufred, const int n) {

  T maxval = T(0);
  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int stride = blockDim.x * gridDim.x;

  // Grid-stride loop to cover all elements
  for (int i = idx; i < n; i += stride) {
    maxval = max(maxval, bufred[i]);
  }

  __shared__ T shared[16]; // One slot per warp (max 1024 threads/block)

  unsigned int lane = threadIdx.x % warpSize;
  unsigned int wid = threadIdx.x / warpSize;

  // Warp-level max reduction
  maxval = max_reduce_warp<T>(maxval);

  // Store each warp's max value in shared memory
  if (lane == 0) {
    shared[wid] = maxval;
  }
  __syncthreads();

  // Let the first warp reduce the warp-level maxima
  maxval = (threadIdx.x < blockDim.x / warpSize) ? shared[lane] : T(0);
  if (wid == 0) {
    maxval = max_reduce_warp<T>(maxval);
  }

  // Write the block's maximum value back to bufred[blockIdx.x]
  if (threadIdx.x == 0) {
    bufred[blockIdx.x] = maxval;
  }
}
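/*
 * Illustrative sketch (not part of the original header): the reduction kernels
 * in this file follow a two-pass pattern in which a first launch writes one
 * partial result per block and a second, single-block launch folds those
 * partials into element 0. The buffer name and launch sizes below are
 * assumptions for this example only.
 *
 *   maxval_kernel<double><<<nblocks, threads>>>(a_d, bufred_d, n);
 *   max_reduce_kernel<double><<<1, threads>>>(bufred_d, nblocks);
 *   // bufred_d[0] now holds max_i |a_d[i]|.
 */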


template <typename T>
__global__ void delx_kernel(
    T* __restrict__ delx,
    const T* __restrict__ x,
    const T* __restrict__ xlow,
    const T* __restrict__ xupp,
    const T* __restrict__ pij,
    const T* __restrict__ qij,
    const T* __restrict__ p0j,
    const T* __restrict__ q0j,
    const T* __restrict__ alpha,
    const T* __restrict__ beta,
    const T* __restrict__ lambda,
    const T epsi,
    const int n,
    const int m)
{
  int tj = blockIdx.x * blockDim.x + threadIdx.x;
  if (tj < n) {
    T xt = x[tj];
    T xlow_j = xlow[tj];
    T xupp_j = xupp[tj];
    T alpha_j = alpha[tj];
    T beta_j = beta[tj];

    // Precompute the denominators (they are squared where needed below)
    T denom_low = xt - xlow_j;
    T denom_upp = xupp_j - xt;
    T denom_alpha = xt - alpha_j;
    T denom_beta = beta_j - xt;

    // Guard against division by (numerically) zero denominators
    const T small_eps = T(1e-12);
    denom_low = (abs(denom_low) < small_eps) ? small_eps : denom_low;
    denom_upp = (abs(denom_upp) < small_eps) ? small_eps : denom_upp;
    denom_alpha = (abs(denom_alpha) < small_eps) ? small_eps : denom_alpha;
    denom_beta = (abs(denom_beta) < small_eps) ? small_eps : denom_beta;

    T sum = T(0);
    for (int i = 0; i < m; i++) {
      T lambda_i = lambda[i];
      sum += pij[i + tj * m] * lambda_i / (denom_upp * denom_upp)
           - qij[i + tj * m] * lambda_i / (denom_low * denom_low);
    }
    sum += p0j[tj] / (denom_upp * denom_upp)
         - q0j[tj] / (denom_low * denom_low)
         - epsi / denom_alpha
         + epsi / denom_beta;

    delx[tj] = sum;
  }
}

template <typename T>
__global__ void GG_kernel(
    T* __restrict__ GG,
    const T* __restrict__ x,
    const T* __restrict__ xlow,
    const T* __restrict__ xupp,
    const T* __restrict__ pij,
    const T* __restrict__ qij,
    const int n,
    const int m) {

  int tj = blockIdx.x * blockDim.x + threadIdx.x;
  if (tj < n) {
    T xt = x[tj];
    T xlow_j = xlow[tj];
    T xupp_j = xupp[tj];

    // Distances from bounds
    T diff_upper = xupp_j - xt;
    T diff_lower = xt - xlow_j;

    // Squared distances
    T diff_upper2 = diff_upper * diff_upper;
    T diff_lower2 = diff_lower * diff_lower;

    for (int i = 0; i < m; i++) {
      int idx = i + tj * m;
      GG[idx] = pij[idx] / diff_upper2 - qij[idx] / diff_lower2;
    }
  }
}

template <typename T>
__global__ void diagx_kernel(T* __restrict__ diagx, const T* __restrict__ x,
    const T* __restrict__ xsi, const T* __restrict__ xlow,
    const T* __restrict__ xupp, const T* __restrict__ p0j,
    const T* __restrict__ q0j, const T* __restrict__ pij,
    const T* __restrict__ qij, const T* alpha, const T* beta,
    const T* eta, const T* lambda, const int n, const int m) {
  int tj = blockIdx.x * blockDim.x + threadIdx.x;
  if (tj < n) {
    T sum = 0;
    T sum1 = 0;
    for (int i = 0; i < m; i++) {
      sum = sum + pij[tj * m + i] * lambda[i];
      sum1 = sum1 + qij[tj * m + i] * lambda[i];
    }
    diagx[tj] = (p0j[tj] + sum) / pow(xupp[tj] - x[tj], 3) +
                (q0j[tj] + sum1) / pow(x[tj] - xlow[tj], 3);
    diagx[tj] = 2.0 * diagx[tj] + xsi[tj] / (x[tj] - alpha[tj]) +
                eta[tj] / (beta[tj] - x[tj]);
  }
}


// Warp-level sum reduction using shuffle-down
template <typename T>
__inline__ __device__ T reduce_warp(T val) {
  int w = warpSize;
  for (int offset = w / 2; offset > 0; offset /= 2)
    val += __shfl_down(val, offset, w);
  return val;
}

// Block-level sum reduction; each block writes its partial sum back to
// bufred[blockIdx.x].
template <typename T>
__global__ void mmareduce_kernel(T* __restrict__ bufred, const int n) {

  T sum = 0;
  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;
  for (int i = idx; i < n; i += str) {
    sum += bufred[i];
  }

  __shared__ T shared[16];
  unsigned int lane = threadIdx.x % warpSize;
  unsigned int wid = threadIdx.x / warpSize;

  sum = reduce_warp<T>(sum);
  if (lane == 0)
    shared[wid] = sum;
  __syncthreads();

  sum = (threadIdx.x < blockDim.x / warpSize) ? shared[lane] : 0;
  if (wid == 0)
    sum = reduce_warp<T>(sum);

  if (threadIdx.x == 0)
    bufred[blockIdx.x] = sum;
}


// Partial sums (one value per block) of column k of the row-major n-by-m
// array a (entries a[i*m + k]).
template< typename T >
__global__ void mmasum_kernel(const T* __restrict__ a, T* __restrict__ buf_h,
    const int n, const int m, const int k) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  const unsigned int lane = threadIdx.x % warpSize;
  const unsigned int wid = threadIdx.x / warpSize;

  __shared__ T shared[16];
  T sum = 0;
  for (int i = idx; i < n; i += str) {
    sum += a[m * i + k];
  }

  sum = reduce_warp<T>(sum);
  if (lane == 0)
    shared[wid] = sum;
  __syncthreads();

  sum = (threadIdx.x < blockDim.x / warpSize) ? shared[lane] : 0;
  if (wid == 0)
    sum = reduce_warp<T>(sum);

  if (threadIdx.x == 0)
    buf_h[blockIdx.x] = sum;

}

// Partial sums (one value per block) of sum_i GG[i*m + k] * delx[i] / diagx[i].
template< typename T >
__global__ void mmasumbb_kernel(const T* __restrict__ GG,
    const T* __restrict__ delx, const T* __restrict__ diagx,
    T* __restrict__ buf_h, const int n, const int m, const int k) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  const unsigned int lane = threadIdx.x % warpSize;
  const unsigned int wid = threadIdx.x / warpSize;

  __shared__ T shared[16];
  T sum = 0;
  for (int i = idx; i < n; i += str) {
    sum += GG[k + i * m] * delx[i] / diagx[i];
  }

  sum = reduce_warp<T>(sum);
  if (lane == 0)
    shared[wid] = sum;
  __syncthreads();

  sum = (threadIdx.x < blockDim.x / warpSize) ? shared[lane] : 0;
  if (wid == 0)
    sum = reduce_warp<T>(sum);

  if (threadIdx.x == 0)
    buf_h[blockIdx.x] = sum;

}

template <typename T>
__global__ void mmasumHess_kernel(
    const T* __restrict__ hijx,
    const T* __restrict__ Ljjxinv,
    T* __restrict__ buf_h,
    const int n,
    const int m,
    const int k0,
    const int k1)
{
  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int stride = blockDim.x * gridDim.x;

  // Warp lane and warp id within the block
  const unsigned int lane = threadIdx.x % warpSize;
  const unsigned int wid = threadIdx.x / warpSize;

  __shared__ T shared[16];

  T sum = T(0);

  // Grid-stride loop for the global reduction
  for (int i = idx; i < n; i += stride) {
    // hijx is indexed as hijx[offset + row * m]
    T val0 = hijx[k0 + i * m];
    T val1 = hijx[k1 + i * m];
    sum += val0 * Ljjxinv[i] * val1;
  }

  // Warp-level reduction using reduce_warp
  sum = reduce_warp<T>(sum);

  // Write each warp's partial sum to shared memory
  if (lane == 0) {
    shared[wid] = sum;
  }
  __syncthreads();

  // The first warp reduces the warp sums in shared memory
  sum = (threadIdx.x < blockDim.x / warpSize) ? shared[lane] : T(0);
  if (wid == 0) {
    sum = reduce_warp<T>(sum);
  }

  // Write the final result from the first thread of the block
  if (threadIdx.x == 0) {
    buf_h[blockIdx.x] = sum;
  }
}

// Partial sums (one value per block) of sum_i GG[i*m + k0] * GG[i*m + k1] / diagx[i].
template< typename T >
__global__ void mmasumAA_kernel(const T* __restrict__ GG,
    const T* __restrict__ diagx, T* __restrict__ buf_h, const int n,
    const int m, const int k0, const int k1) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  const unsigned int lane = threadIdx.x % warpSize;
  const unsigned int wid = threadIdx.x / warpSize;

  __shared__ T shared[16];
  T sum = 0;
  for (int i = idx; i < n; i += str) {
    sum += GG[k0 + i * m] / diagx[i] * GG[k1 + i * m];
  }

  sum = reduce_warp<T>(sum);
  if (lane == 0)
    shared[wid] = sum;
  __syncthreads();

  sum = (threadIdx.x < blockDim.x / warpSize) ? shared[lane] : 0;
  if (wid == 0)
    sum = reduce_warp<T>(sum);

  if (threadIdx.x == 0)
    buf_h[blockIdx.x] = sum;

}


// Copy b[0:n) into a starting at offset m.
template <typename T>
__global__ void mma_copy_kernel(T* __restrict__ a, const T* __restrict__ b,
    const int n, const int m) {
  int tj = blockIdx.x * blockDim.x + threadIdx.x;
  if (tj < n)
    a[tj + m] = b[tj];
}


template <typename T>
__global__ void AA_kernel(T* __restrict__ temp, const T* __restrict__ GG,
    const T* __restrict__ diagx, const int n, const int m) {
  int tj = blockIdx.x * blockDim.x + threadIdx.x;
  if (tj < n) {
    for (int i0 = 0; i0 < m; i0++) {
      for (int i1 = 0; i1 < m; i1++) {
        temp[tj + i0 * (n + 1) + i1 * (m + 1) * (n + 1)] = GG[i0 * n + tj] *
            (1.0 / diagx[tj]) * GG[i1 * n + tj];
      }
    }
  }
}


// dx = -(delx + GG * dlambda) / diagx, element-wise over the n design variables.
template <typename T>
__global__ void dx_kernel(T* __restrict__ dx, const T* __restrict__ delx,
    const T* __restrict__ diagx, const T* __restrict__ GG,
    const T* __restrict__ dlambda, const int n, const int m) {
  int tj = blockIdx.x * blockDim.x + threadIdx.x;
  if (tj < n) {
    dx[tj] = -delx[tj] / diagx[tj];
    for (int i = 0; i < m; i++) {
      dx[tj] = dx[tj] - GG[tj * m + i] * dlambda[i] / diagx[tj];
    }
  }
}


template <typename T>
__global__ void dxsi_kernel(T* __restrict__ dxsi, const T* __restrict__ xsi,
    const T* __restrict__ dx, const T* __restrict__ x,
    const T* __restrict__ alpha, const T epsi, const int n) {
  int tj = blockIdx.x * blockDim.x + threadIdx.x;
  if (tj < n) {
    dxsi[tj] = -xsi[tj] + (epsi - dx[tj] * xsi[tj]) / (x[tj] - alpha[tj]);
  }
}


template <typename T>
__global__ void deta_kernel(T* __restrict__ deta, const T* __restrict__ eta,
    const T* __restrict__ dx, const T* __restrict__ x,
    const T* __restrict__ beta, const T epsi, const int n) {
  int tj = blockIdx.x * blockDim.x + threadIdx.x;
  if (tj < n) {
    deta[tj] = -eta[tj] + (epsi + dx[tj] * eta[tj]) / (beta[tj] - x[tj]);
  }
}

template <typename T>
__global__ void RexCalculation_kernel(
    T* __restrict__ rex,
    const T* __restrict__ x,
    const T* __restrict__ xlow,
    const T* __restrict__ xupp,
    const T* __restrict__ pij,
    const T* __restrict__ p0j,
    const T* __restrict__ qij,
    const T* __restrict__ q0j,
    const T* __restrict__ lambda,
    const T* __restrict__ xsi,
    const T* __restrict__ eta,
    const int n,
    const int m)
{
  int tj = blockIdx.x * blockDim.x + threadIdx.x;
  if (tj < n) {
    T upp_diff = xupp[tj] - x[tj];
    T low_diff = x[tj] - xlow[tj];
    T upp_diff_sq = upp_diff * upp_diff;
    T low_diff_sq = low_diff * low_diff;

    T sum = 0.0;
    for (int i = 0; i < m; ++i) {
      sum += pij[i + tj * m] * lambda[i] / upp_diff_sq
           - qij[i + tj * m] * lambda[i] / low_diff_sq;
    }

    rex[tj] = sum
            + p0j[tj] / upp_diff_sq
            - q0j[tj] / low_diff_sq
            - xsi[tj] + eta[tj];
  }
}
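// Note (added commentary, not in the original header): rex appears to be the
// residual of the x-stationarity condition of the interior-point subsolve,
// i.e. the derivative of the p/q approximation with respect to x_j minus the
// bound multipliers, rex_j = dpsi/dx_j - xsi_j + eta_j.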

template <typename T>
__global__ void rey_calculation_kernel(T* __restrict__ rey,
    const T* __restrict__ c, const T* __restrict__ d, const T* __restrict__ y,
    const T* __restrict__ lambda, const T* __restrict__ mu, const int n) {
  int tj = blockIdx.x * blockDim.x + threadIdx.x;
  if (tj < n) {
    rey[tj] = c[tj] + d[tj] * y[tj] - lambda[tj] - mu[tj];
  }
}


// Partial sums (one value per block) of the squared Euclidean norm of a.
template< typename T >
__global__ void norm_kernel(const T* __restrict__ a, T* __restrict__ buf_h,
    const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  const unsigned int lane = threadIdx.x % warpSize;
  const unsigned int wid = threadIdx.x / warpSize;

  __shared__ T shared[16];
  T sum = 0;
  for (int i = idx; i < n; i += str) {
    sum += pow(a[i], 2);
  }

  sum = reduce_warp<T>(sum);
  if (lane == 0)
    shared[wid] = sum;
  __syncthreads();

  sum = (threadIdx.x < blockDim.x / warpSize) ? shared[lane] : 0;
  if (wid == 0)
    sum = reduce_warp<T>(sum);

  if (threadIdx.x == 0)
    buf_h[blockIdx.x] = sum;

}


template <typename T>
__global__ void sub2cons_kernel(T* __restrict__ a, const T* __restrict__ b,
    const T* __restrict__ c,
    const T d, const int n) {
  int tj = blockIdx.x * blockDim.x + threadIdx.x;
  if (tj < n) {
    a[tj] = b[tj] * c[tj] - d;
  }
}


template <typename T>
__global__ void dely_kernel(T* __restrict__ dely, const T* __restrict__ c,
    const T* __restrict__ d, const T* __restrict__ y,
    const T* __restrict__ lambda, const T epsi, const int n) {
  int tj = blockIdx.x * blockDim.x + threadIdx.x;
  if (tj < n) {
    dely[tj] = c[tj] + d[tj] * y[tj] - lambda[tj] - epsi / y[tj];
  }
}


// Partial max (one value per block) of cons * a[i] / b[i].
template< typename T >
__global__ void maxval2_kernel(const T* __restrict__ a, const T* __restrict__ b,
    T* __restrict__ temp, const T cons, const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  const unsigned int lane = threadIdx.x % warpSize;
  const unsigned int wid = threadIdx.x / warpSize;

  __shared__ T shared[16];
  T maxval = cons * a[0] / b[0];
  for (int i = idx; i < n; i += str) {
    maxval = max(maxval, cons * a[i] / b[i]);
  }

  maxval = max_reduce_warp<T>(maxval);
  if (lane == 0)
    shared[wid] = maxval;
  __syncthreads();

  maxval = (threadIdx.x < blockDim.x / warpSize) ? shared[lane] : 0.0;
  if (wid == 0)
    maxval = max_reduce_warp<T>(maxval);

  if (threadIdx.x == 0)
    temp[blockIdx.x] = maxval;

}


// Partial max (one value per block) of cons * a[i] / (b[i] - c[i]).
template< typename T >
__global__ void maxval3_kernel(const T* __restrict__ a, const T* __restrict__ b,
    const T* __restrict__ c, T* __restrict__ temp, const T cons, const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  const unsigned int lane = threadIdx.x % warpSize;
  const unsigned int wid = threadIdx.x / warpSize;

  __shared__ T shared[16];
  T maxval = cons * a[0] / b[0];
  for (int i = idx; i < n; i += str) {
    maxval = max(maxval, cons * a[i] / (b[i] - c[i]));
  }

  maxval = max_reduce_warp<T>(maxval);
  if (lane == 0)
    shared[wid] = maxval;
  __syncthreads();

  maxval = (threadIdx.x < blockDim.x / warpSize) ? shared[lane] : 0;
  if (wid == 0)
    maxval = max_reduce_warp<T>(maxval);

  if (threadIdx.x == 0)
    temp[blockIdx.x] = maxval;

}


// KKT stationarity residual in x: rex = df0dx + dfdx * lambda - xsi + eta.
template <typename T>
__global__ void kkt_rex_kernel(T* __restrict__ rex, const T* __restrict__ df0dx,
    const T* __restrict__ dfdx, const T* __restrict__ xsi,
    const T* __restrict__ eta, const T* __restrict__ lambda, const int n,
    const int m) {
  int tj = blockIdx.x * blockDim.x + threadIdx.x;
  if (tj < n) {
    rex[tj] = 0.0;
    for (int i = 0; i < m; i++) {
      rex[tj] = rex[tj] + dfdx[i + tj * m] * lambda[i];
    }
    rex[tj] += df0dx[tj] - xsi[tj] + eta[tj];
  }
}


template <typename T>
__global__ void maxcons_kernel(T* __restrict__ a, const T b,
    const T c, const T* __restrict__ d, const int n) {
  int tj = blockIdx.x * blockDim.x + threadIdx.x;
  if (tj < n) {
    a[tj] = max(b, c * d[tj]);
  }
}

// Partial sums (one value per block) of the entries of a.
template< typename T >
__global__ void glsum_kernel(const T * a, T * buf_h, const int n) {
  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  const unsigned int lane = threadIdx.x % warpSize;
  const unsigned int wid = threadIdx.x / warpSize;

  __shared__ T shared[16];
  T sum = 0;
  for (int i = idx; i < n; i += str) {
    sum += a[i];
  }

  sum = reduce_warp<T>(sum);
  if (lane == 0)
    shared[wid] = sum;
  __syncthreads();

  sum = (threadIdx.x < blockDim.x / warpSize) ? shared[lane] : 0;
  if (wid == 0)
    sum = reduce_warp<T>(sum);

  if (threadIdx.x == 0)
    buf_h[blockIdx.x] = sum;

}

// Partial sums (one value per block) of the dot product of a and b.
template< typename T >
__global__ void glsc2_kernel(const T * a,
    const T * b,
    T * buf_h,
    const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  const unsigned int lane = threadIdx.x % warpSize;
  const unsigned int wid = threadIdx.x / warpSize;

  __shared__ T shared[16];
  T sum = 0.0;
  for (int i = idx; i < n; i += str) {
    sum += a[i] * b[i];
  }

  sum = reduce_warp<T>(sum);
  if (lane == 0)
    shared[wid] = sum;
  __syncthreads();

  sum = (threadIdx.x < blockDim.x / warpSize) ? shared[lane] : 0;
  if (wid == 0)
    sum = reduce_warp<T>(sum);

  if (threadIdx.x == 0)
    buf_h[blockIdx.x] = sum;

}


template <typename T>
__global__ void add2inv2_kernel(T* __restrict__ a, const T* __restrict__ b,
    const T c, const int n) {
  int tj = blockIdx.x * blockDim.x + threadIdx.x;
  if (tj < n) {
    a[tj] = a[tj] + c / b[tj];
  }
}

template <typename T>
__global__ void max2_kernel(T* __restrict__ a, const T b,
    const T* __restrict__ c, const T d, const int n) {
  int tj = blockIdx.x * blockDim.x + threadIdx.x;
  if (tj < n) {
    a[tj] = max(b, d * c[tj]);
  }
}

template <typename T>
__global__ void updatebb_kernel(T* __restrict__ bb,
    const T* __restrict__ dellambda, const T* __restrict__ dely,
    const T* __restrict__ d, const T* __restrict__ mu,
    const T* __restrict__ y, const T delz, const int m) {
  int tj = blockIdx.x * blockDim.x + threadIdx.x;
  if (tj < m)
    bb[tj] = dellambda[tj] + dely[tj] / (d[tj] + mu[tj] / y[tj]) - bb[tj];
  else if (tj < m + 1)
    bb[tj] = delz;
}


template <typename T>
__global__ void updateAA_kernel(T* __restrict__ AA,
    const T* __restrict__ globaltmp_mm, const T* __restrict__ s,
    const T* __restrict__ lambda, const T* __restrict__ d,
    const T* __restrict__ mu, const T* __restrict__ y, const T* __restrict__ a,
    const T zeta, const T z, const int m) {
  int tj = blockIdx.x * blockDim.x + threadIdx.x;
  if (tj < m) {
    AA[tj + tj * (m + 1)] = globaltmp_mm[tj + tj * m] + (s[tj] / lambda[tj] +
        1.0 / (d[tj] + mu[tj] / y[tj]));
    AA[tj + m * (m + 1)] = a[tj];
    AA[m + tj * (m + 1)] = a[tj];
  }
  else if (tj < m + 1)
    AA[tj + tj * (m + 1)] = -zeta / z;
}
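/*
 * Note (added commentary, not in the original header): updatebb_kernel and
 * updateAA_kernel assemble a bordered (m+1) x (m+1) system of the same shape
 * as the one built by mma_prepare_aa_matrix_kernel: the first m diagonal
 * entries hold globaltmp_mm plus s/lambda + 1/(d + mu/y), the last row and
 * column hold the constraint weights a, the corner holds -zeta/z, and bb is
 * the corresponding right-hand side with bb[m] = delz. Presumably this system
 * is then solved for the updates (dlambda, dz), e.g. with mma_small_lu_kernel
 * above, but that is an assumption of this note.
 */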

template <typename T>
__global__ void dy_kernel(T* __restrict__ dy, const T* __restrict__ dely,
    const T* __restrict__ dlambda, const T* __restrict__ d,
    const T* __restrict__ mu, const T* __restrict__ y, const int n) {
  int tj = blockIdx.x * blockDim.x + threadIdx.x;
  if (tj < n)
    dy[tj] = (-dely[tj] + dlambda[tj]) / (d[tj] + mu[tj] / y[tj]);
}

#endif