Neko-TOP
A portable framework for high-order spectral element flow topology optimization.
mma_kernel.h
/*
 Copyright (c) 2021-2025, The Neko Authors
 All rights reserved.

 Redistribution and use in source and binary forms, with or without
 modification, are permitted provided that the following conditions
 are met:

   * Redistributions of source code must retain the above copyright
     notice, this list of conditions and the following disclaimer.

   * Redistributions in binary form must reproduce the above
     copyright notice, this list of conditions and the following
     disclaimer in the documentation and/or other materials provided
     with the distribution.

   * Neither the name of the authors nor the names of its
     contributors may be used to endorse or promote products derived
     from this software without specific prior written permission.

 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
 COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
 ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 POSSIBILITY OF SUCH DAMAGE.
*/

#ifndef MMA_CUDA_KERNEL_H
#define MMA_CUDA_KERNEL_H

template <typename T>
__global__ void mma_prepare_aa_matrix_kernel(T* __restrict__ AA,
    const T* __restrict__ s, const T* __restrict__ lambda,
    const T* __restrict__ d, const T* __restrict__ mu,
    const T* __restrict__ y, const T* __restrict__ a,
    const T zeta, const T z, const int m) {
  const int tj = blockIdx.x * blockDim.x + threadIdx.x;
  const int matrix_size = m + 1;

  if (tj >= m) return;
  AA[tj * matrix_size + tj] += s[tj] / lambda[tj] +
      (T)1.0 / (d[tj] + mu[tj] / y[tj]);
  AA[tj * matrix_size + m] = a[tj]; // column m+1
  AA[m * matrix_size + tj] = a[tj]; // row m+1

  // Only first thread updates the bottom-right corner element.
  if (tj == 0)
    AA[m * matrix_size + m] = -zeta / z;
}
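
// ---------------------------------------------------------------------------
// Illustrative usage (not part of the original header): a minimal host-side
// launch sketch for mma_prepare_aa_matrix_kernel. It assumes AA is a device
// array holding the dense (m+1) x (m+1) system matrix, that the remaining
// pointers are device arrays of length m, and that a 256-thread block is a
// reasonable default; the helper name, block size and stream argument are
// assumptions, not the project's actual driver code.
// ---------------------------------------------------------------------------
template <typename T>
void example_launch_prepare_aa(T* AA_d, const T* s_d, const T* lambda_d,
    const T* d_d, const T* mu_d, const T* y_d, const T* a_d,
    const T zeta, const T z, const int m, cudaStream_t stream) {
  const int threads = 256;
  const int blocks = (m + threads - 1) / threads;
  mma_prepare_aa_matrix_kernel<T><<<blocks, threads, 0, stream>>>
      (AA_d, s_d, lambda_d, d_d, mu_d, y_d, a_d, zeta, z, m);
}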


// Update Hessian diagonal elements (y contributions for dip subsolve)
template <typename T>
__global__ void mma_update_hessian_diagonal_kernel(T* __restrict__ Hess,
    const T* __restrict__ y, const T* __restrict__ d,
    const T* __restrict__ mu, const T* __restrict__ lambda, const int m) {
  int tj = blockIdx.x * blockDim.x + threadIdx.x;
  if (tj >= m) return;

  T diag = Hess[tj * m + tj];
  // Contribution from y terms (inactive constraints)
  if (y[tj] > (T)0.0) {
    if (fabs(d[tj]) >= (T)1.0e-15) {
      diag -= (T)1.0 / d[tj];
    }
    // else: skip - equivalent to subtracting 1.0/1.0e-8
  }

  // Contribution from -mu/lambda (eq 10)
  diag -= mu[tj] / lambda[tj];
  Hess[tj * m + tj] = diag;
}

// Heuristic Levenberg-Marquardt-style stabilization of the Hessian diagonal.
// Single-block version for m <= 1024.
template <typename T>
__global__ void mma_stabilize_hessian_single_kernel(T* __restrict__ Hess,
    const int m) {
  const int tid = threadIdx.x;

  // A single thread computes the trace and the LM factor
  if (tid == 0) {
    T trace = (T)0.0;
    for (int j = 0; j < m; j++) {
      trace += Hess[j * m + j];
    }
    T lm_factor = max((T)(-1.0e-4) * trace / m, (T)1.0e-7);

    // Apply the factor to all diagonal elements
    for (int j = 0; j < m; j++) {
      Hess[j * m + j] -= lm_factor;
    }
  }
}

// Heuristic Levenberg-Marquardt-style stabilization of the Hessian diagonal.
// Multi-block version for m > 1024; lm_factor is computed by the caller.
template <typename T>
__global__ void mma_stabilize_hessian_multi_kernel(T* __restrict__ Hess,
    const T lm_factor, const int m) {
  const int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < m) {
    Hess[i * m + i] -= lm_factor;
  }
}
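
// ---------------------------------------------------------------------------
// Illustrative dispatch (an assumption, not part of the original header):
// how the two stabilization variants above might be combined. For m <= 1024
// the single-block kernel computes the trace itself; for larger m the caller
// supplies the Levenberg-Marquardt factor, here derived from a previously
// reduced trace with the same heuristic used in the single-block kernel.
// The helper name, block size and stream argument are hypothetical.
// ---------------------------------------------------------------------------
template <typename T>
void example_stabilize_hessian(T* Hess_d, const T trace, const int m,
    cudaStream_t stream) {
  if (m <= 1024) {
    mma_stabilize_hessian_single_kernel<T><<<1, 1, 0, stream>>>(Hess_d, m);
  } else {
    const T candidate = (T)(-1.0e-4) * trace / m;
    const T lm_factor = (candidate > (T)1.0e-7) ? candidate : (T)1.0e-7;
    const int threads = 256;
    const int blocks = (m + threads - 1) / threads;
    mma_stabilize_hessian_multi_kernel<T>
        <<<blocks, threads, 0, stream>>>(Hess_d, lm_factor, m);
  }
}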

// Small linear solver for MMA (n <= 50)
template <typename T>
__global__ void mma_small_lu_kernel(T* __restrict__ A, T* __restrict__ b,
    const int n) {
  const int tid = threadIdx.x;

  // Handle 1x1 case
  if (n == 1) {
    if (tid == 0 && abs(A[0]) > (T)1e-12) b[0] /= A[0];
    return;
  }

  // LU decomposition with partial pivoting
  for (int k = 0; k < n; k++) {
    // Pivoting - single thread
    if (tid == 0) {
      int max_row = k;
      T max_val = abs(A[k * n + k]);
      for (int i = k + 1; i < n; i++) {
        T val = abs(A[i * n + k]);
        if (val > max_val) {
          max_val = val;
          max_row = i;
        }
      }
      if (max_val > (T)1e-12 && max_row != k) {
        // Swap rows
        for (int j = k; j < n; j++) {
          T temp = A[k * n + j];
          A[k * n + j] = A[max_row * n + j];
          A[max_row * n + j] = temp;
        }
        // Swap rhs
        T temp_b = b[k];
        b[k] = b[max_row];
        b[max_row] = temp_b;
      }
    }
    __syncthreads();

    // Parallel elimination
    T diag = A[k * n + k];
    if (abs(diag) > (T)1e-12) {
      for (int i = tid + k + 1; i < n; i += blockDim.x) {
        T factor = A[i * n + k] / diag;
        A[i * n + k] = factor;
        for (int j = k + 1; j < n; j++) {
          A[i * n + j] -= factor * A[k * n + j];
        }
      }
    }
    __syncthreads();
  }

  // Parallel forward substitution
  for (int i = tid; i < n; i += blockDim.x) {
    T sum = b[i];
    for (int j = 0; j < i; j++) {
      sum -= A[i * n + j] * b[j];
    }
    b[i] = sum;
  }
  __syncthreads();

  // Parallel backward substitution
  for (int i = n - 1 - tid; i >= 0; i -= blockDim.x) {
    T sum = b[i];
    for (int j = i + 1; j < n; j++) {
      sum -= A[i * n + j] * b[j];
    }
    if (abs(A[i * n + i]) > (T)1e-12) {
      b[i] = sum / A[i * n + i];
    }
  }
  __syncthreads();
}
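
// ---------------------------------------------------------------------------
// Illustrative usage (an assumption, not part of the original header): the LU
// kernel above is a single-block, in-place solver, so it would typically be
// launched with one block whose thread count covers the small system size
// (n <= 50). It is intended to leave the solution of A x = b in b. The
// helper name, the 64-thread block and the stream argument are hypothetical.
// ---------------------------------------------------------------------------
template <typename T>
void example_solve_small_system(T* A_d, T* b_d, const int n,
    cudaStream_t stream) {
  // One block only: the kernel relies on __syncthreads() for ordering,
  // which cannot synchronize across blocks.
  mma_small_lu_kernel<T><<<1, 64, 0, stream>>>(A_d, b_d, n);
}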

template <typename T>
__global__ void delta_1dbeam_kernel(T* __restrict__ Delta,
    const T L_total, const T Le, const int offset, const int n) {
  int k = blockIdx.x * blockDim.x + threadIdx.x;
  if (k >= n) return;

  // Convert to 1-based indexing for the calculation
  int idx = k + 1;

  // Calculate first term: (L_total - Le*(offset+idx-1))^3
  T x1 = L_total - Le * static_cast<T>(offset + idx - 1);
  T term1 = x1 * x1 * x1;

  // Calculate second term: (L_total - Le*(offset+idx))^3
  T x2 = L_total - Le * static_cast<T>(offset + idx);
  T term2 = x2 * x2 * x2;

  // Final result
  Delta[k] = (term1 - term2) / static_cast<T>(3.0);
}

template <typename T>
__global__ void mma_Ljjxinv_kernel(T* __restrict__ Ljjxinv,
    const T* __restrict__ pjlambda, const T* __restrict__ qjlambda,
    const T* __restrict__ x, const T* __restrict__ low,
    const T* __restrict__ upp, const T* __restrict__ alpha,
    const T* __restrict__ beta, const int n) {
  int tj = blockIdx.x * blockDim.x + threadIdx.x;
  if (tj >= n) return;

  // Load inputs into registers
  const T xt = x[tj];
  const T low_j = low[tj];
  const T upp_j = upp[tj];
  const T pj = pjlambda[tj];
  const T qj = qjlambda[tj];

  // Precompute reused differences
  const T diff_u = upp_j - xt;
  const T diff_l = xt - low_j;

  // Cube once (avoiding pow for speed)
  const T diff_u3 = diff_u * diff_u * diff_u;
  const T diff_l3 = diff_l * diff_l * diff_l;

  // Compute inverse value safely
  T denom = 2.0 * pj / diff_u3 + 2.0 * qj / diff_l3;
  T val = -1.0 / denom;

  // Mask out active primal constraints
  bool active = (fabs(xt - alpha[tj]) <= T(1e-16)) ||
      (fabs(xt - beta[tj]) <= T(1e-16));

  Ljjxinv[tj] = active ? T(0.0) : val;
}

template <typename T>
__global__ void mma_dipsolvesub1_kernel(T* __restrict__ x,
    const T* __restrict__ pjlambda, const T* __restrict__ qjlambda,
    const T* __restrict__ low, const T* __restrict__ upp,
    const T* __restrict__ alpha, const T* __restrict__ beta,
    const int n) {

  int tj = blockIdx.x * blockDim.x + threadIdx.x;
  if (tj >= n) return;

  // Load inputs into registers
  const T pj_sqrt = sqrt(pjlambda[tj]);
  const T qj_sqrt = sqrt(qjlambda[tj]);
  const T denom = pj_sqrt + qj_sqrt;

  const T low_j = low[tj];
  const T upp_j = upp[tj];
  const T alpha_j = alpha[tj];
  const T beta_j = beta[tj];

  // Weighted average
  const T val = (pj_sqrt * low_j + qj_sqrt * upp_j) / denom;

  // Clamp x between alpha and beta using branchless min/max
  x[tj] = fmin(fmax(val, alpha_j), beta_j);
}

template <typename T>
__global__ void mattrans_v_mul_kernel(T* __restrict__ output,
    const T* __restrict__ pij, const T* __restrict__ lambda,
    const int m, const int n) {

  int tj = blockIdx.x * blockDim.x + threadIdx.x;
  if (tj >= n) return;

  T acc = 0.0;

  // Use register accumulation
  for (int i = 0; i < m; ++i) {
    acc += pij[i + tj * m] * lambda[i];
  }

  output[tj] = acc;
}

template <typename T>
__global__ void mma_sub1_kernel(
    T* __restrict__ xlow,
    T* __restrict__ xupp,
    const T* __restrict__ x,
    const T* __restrict__ xmin,
    const T* __restrict__ xmax,
    const T asyinit,
    const int n) {
  int tj = blockIdx.x * blockDim.x + threadIdx.x;
  if (tj >= n) return;

  // Load values once into registers (global memory is slow)
  const T xt = x[tj];
  const T xmin_j = xmin[tj];
  const T xmax_j = xmax[tj];

  // Reuse xgap calculation
  const T xgap = xmax_j - xmin_j;
  const T offset = asyinit * xgap;

  xlow[tj] = xt - offset;
  xupp[tj] = xt + offset;
}


template <typename T>
__global__ void mma_sub2_kernel(T* __restrict__ low, T* __restrict__ upp,
    const T* __restrict__ x, const T* __restrict__ xold1,
    const T* __restrict__ xold2, const T* __restrict__ xdiff,
    const T asydecr, const T asyincr, const int n) {
  int tj = blockIdx.x * blockDim.x + threadIdx.x;
  if (tj >= n) return;

  // Load data into registers: repeated accesses are much cheaper here
  // than in global memory.
  const T xval = x[tj];
  const T xold1val = xold1[tj];
  const T xold2val = xold2[tj];
  const T lowval = low[tj];
  const T uppval = upp[tj];
  const T xdiffval = xdiff[tj];

  // Compute the product
  const T prod = (xval - xold1val) * (xold1val - xold2val);

  // Compute asy_factor without divergent branching
  T asy_factor = (prod < T(0)) ? asydecr :
                 (prod > T(0)) ? asyincr : T(1);

  // Update low and upp using fma (fused multiply-add) for numerical stability
  T new_low = fma(-asy_factor, (xold1val - lowval), xval);
  T new_upp = fma(asy_factor, (uppval - xold1val), xval);

  // Apply bounds
  new_low = max(new_low, xval - T(10.0) * xdiffval);
  new_low = min(new_low, xval - T(0.01) * xdiffval);

  new_upp = min(new_upp, xval + T(10.0) * xdiffval);
  new_upp = max(new_upp, xval + T(0.01) * xdiffval);

  // Write results back
  low[tj] = new_low;
  upp[tj] = new_upp;
}

template <typename T>
__global__ void mma_sub3_kernel(const T* __restrict__ x,
    const T* __restrict__ df0dx, const T* __restrict__ dfdx,
    T* __restrict__ low, T* __restrict__ upp, const T* __restrict__ xmin,
    const T* __restrict__ xmax, T* __restrict__ alpha, T* __restrict__ beta,
    T* __restrict__ p0j, T* __restrict__ q0j, T* __restrict__ pij,
    T* __restrict__ qij, const int n, const int m) {
  int tj = blockIdx.x * blockDim.x + threadIdx.x;
  if (tj >= n) return;

  // Load into registers once
  const T xt = x[tj];
  const T xmin_j = xmin[tj];
  const T xmax_j = xmax[tj];
  const T low_j = low[tj];
  const T upp_j = upp[tj];
  const T df0 = df0dx[tj];
  const T xgap = xmax_j - xmin_j;

  // Clamp helpers
  const T half_xgap = 0.5 * xgap;
  const T tenth_low_diff = T(0.1) * (xt - low_j);
  const T tenth_upp_diff = T(0.1) * (upp_j - xt);

  // Compute alpha and beta with fused max/min and fewer calls
  T alpha_val = max(max(xmin_j, low_j + tenth_low_diff), xt - half_xgap);
  T beta_val = min(min(xmax_j, upp_j - tenth_upp_diff), xt + half_xgap);

  alpha[tj] = alpha_val;
  beta[tj] = beta_val;

  // Avoid multiple pow calls, compute once
  const T upp_minus_x = upp_j - xt;
  const T x_minus_low = xt - low_j;
  const T upp_minus_x_sq = upp_minus_x * upp_minus_x;
  const T x_minus_low_sq = x_minus_low * x_minus_low;

  // Small epsilon for numerical stability
  const T eps = T(1e-5);
  const T inv_xgap = T(1.0) / max(eps, xgap);

  // Compute terms reused multiple times
  const T max_df0_pos = max(df0, T(0));
  const T max_df0_neg = max(-df0, T(0));

  p0j[tj] = upp_minus_x_sq * (T(1.001) * max_df0_pos +
      T(0.001) * max_df0_neg + eps * inv_xgap);
  q0j[tj] = x_minus_low_sq * (T(0.001) * max_df0_pos +
      T(1.001) * max_df0_neg + eps * inv_xgap);

  // Loop over m for pij and qij
  for (int i = 0; i < m; ++i) {
    int idx = i + tj * m;

    T dfdx_val = dfdx[idx];
    T max_pos = max(dfdx_val, T(0));
    T max_neg = max(-dfdx_val, T(0));

    pij[idx] = upp_minus_x_sq * (T(1.001) * max_pos +
        T(0.001) * max_neg + eps * inv_xgap);
    qij[idx] = x_minus_low_sq * (T(0.001) * max_pos +
        T(1.001) * max_neg + eps * inv_xgap);
  }
}

template <typename T>
__global__ void mma_sub4_kernel(
    const T* __restrict__ x,
    const T* __restrict__ low,
    const T* __restrict__ upp,
    const T* __restrict__ pij,
    const T* __restrict__ qij,
    T* __restrict__ temp,
    const int n,
    const int m) {

  int tj = blockIdx.x * blockDim.x + threadIdx.x;
  if (tj >= n) return;

  // Register caching
  const T xt = x[tj];
  const T low_j = low[tj];
  const T upp_j = upp[tj];

  const T denom_upp = upp_j - xt;
  const T denom_low = xt - low_j;

  const T eps = T(1e-12); // Precision-dependent epsilon
  const T inv_denom_upp = T(1) / max(denom_upp, eps);
  const T inv_denom_low = T(1) / max(denom_low, eps);

  const int base_idx = tj * m;

  for (int i = 0; i < m; ++i) {
    int idx = base_idx + i;
    temp[idx] = pij[idx] * inv_denom_upp + qij[idx] * inv_denom_low;
  }
}


template <typename T>
__global__ void mma_max2_kernel(
    T* __restrict__ xsi,
    const T* __restrict__ x,
    const T* __restrict__ alpha,
    const int n) {

  int tj = blockIdx.x * blockDim.x + threadIdx.x;
  if (tj >= n) return;

  const T eps = T(1e-12);
  T denom = max(x[tj] - alpha[tj], eps);
  xsi[tj] = max(T(1), T(1) / denom);
}


template <typename T>
__global__ void relambda_kernel(
    T* __restrict__ temp,
    const T* __restrict__ x,
    const T* __restrict__ xupp,
    const T* __restrict__ xlow,
    const T* __restrict__ pij,
    const T* __restrict__ qij,
    const int n,
    const int m) {

  int tj = blockIdx.x * blockDim.x + threadIdx.x;
  if (tj >= n) return;

  const T xt = x[tj];
  const T xup = xupp[tj];
  const T xlo = xlow[tj];

  // Prevent divide-by-zero using small epsilon
  const T eps = T(1e-12);
  const T inv_denom_upp = T(1) / max(xup - xt, eps);
  const T inv_denom_low = T(1) / max(xt - xlo, eps);

  int base_idx = tj * m;
  for (int i = 0; i < m; ++i) {
    int idx = base_idx + i;
    temp[idx] = pij[idx] * inv_denom_upp + qij[idx] * inv_denom_low;
  }
}


template <typename T>
__global__ void sub2cons2_kernel(
    T* __restrict__ a,
    const T* __restrict__ b,
    const T* __restrict__ c,
    const T* __restrict__ d,
    const T e,
    const int n) {
  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  if (idx >= n) return;

  const T bt = b[idx];
  const T ct = c[idx];
  const T dt = d[idx];

  a[idx] = bt * (ct - dt) - e;
}


template <typename T>
__inline__ __device__ T max_reduce_warp(T val) {
  val = max(val, __shfl_down_sync(0xffffffff, val, 16));
  val = max(val, __shfl_down_sync(0xffffffff, val, 8));
  val = max(val, __shfl_down_sync(0xffffffff, val, 4));
  val = max(val, __shfl_down_sync(0xffffffff, val, 2));
  val = max(val, __shfl_down_sync(0xffffffff, val, 1));
  return val;
}


template <typename T>
__global__ void maxval_kernel(const T* __restrict__ a, T* temp, const int n) {
  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int stride = blockDim.x * gridDim.x;

  const unsigned int lane = threadIdx.x % warpSize;
  const unsigned int warp_id = threadIdx.x / warpSize;

  // Shared memory to store warp-level maxima
  __shared__ T shared[32]; // Assumes max 1024 threads per block

  T local_max = T(0);
  for (int i = idx; i < n; i += stride) {
    local_max = max(local_max, abs(a[i]));
  }

  // Warp-level reduction
  local_max = max_reduce_warp<T>(local_max);

  // Write warp-level results to shared memory
  if (lane == 0) {
    shared[warp_id] = local_max;
  }
  __syncthreads();

  // Let the first warp reduce shared values
  local_max = (threadIdx.x < blockDim.x / warpSize) ? shared[lane] : T(0);

  if (warp_id == 0) {
    local_max = max_reduce_warp<T>(local_max);
  }

  if (threadIdx.x == 0) {
    temp[blockIdx.x] = local_max;
  }
}


template <typename T>
__global__ void max_reduce_kernel(T* __restrict__ bufred, const int n) {

  T maxval = T(0);
  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int stride = blockDim.x * gridDim.x;

  // Grid-stride loop to cover all elements
  for (int i = idx; i < n; i += stride) {
    maxval = max(maxval, bufred[i]);
  }

  __shared__ T shared[32]; // One slot per warp (max 1024 threads/block)

  unsigned int lane = threadIdx.x % warpSize;
  unsigned int wid = threadIdx.x / warpSize;

  // Warp-level max reduction
  maxval = max_reduce_warp<T>(maxval);

  // Store each warp's max value in shared memory
  if (lane == 0) {
    shared[wid] = maxval;
  }
  __syncthreads();

  // Let the first warp reduce the warp-level maxima
  maxval = (threadIdx.x < blockDim.x / warpSize) ? shared[lane] : T(0);
  if (wid == 0) {
    maxval = max_reduce_warp<T>(maxval);
  }

  // Write the block's maximum value back to bufred[blockIdx.x]
  if (threadIdx.x == 0) {
    bufred[blockIdx.x] = maxval;
  }
}
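
// ---------------------------------------------------------------------------
// Illustrative composition (an assumption, not part of the original header):
// maxval_kernel produces one partial absolute maximum per block, and
// max_reduce_kernel, launched with a single block, folds those partials into
// temp_d[0]. temp_d must hold at least one entry per block of the first pass.
// The helper name, block size and stream argument are hypothetical; copying
// the result back to the host is omitted.
// ---------------------------------------------------------------------------
template <typename T>
void example_device_absmax(const T* a_d, T* temp_d, const int n,
    cudaStream_t stream) {
  const int threads = 1024;
  const int nb = (n + threads - 1) / threads;
  maxval_kernel<T><<<nb, threads, 0, stream>>>(a_d, temp_d, n);
  // Second pass: one block reduces the nb per-block partial maxima.
  max_reduce_kernel<T><<<1, threads, 0, stream>>>(temp_d, nb);
}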

template <typename T>
__global__ void delx_kernel(
    T* __restrict__ delx,
    const T* __restrict__ x,
    const T* __restrict__ xlow,
    const T* __restrict__ xupp,
    const T* __restrict__ pij,
    const T* __restrict__ qij,
    const T* __restrict__ p0j,
    const T* __restrict__ q0j,
    const T* __restrict__ alpha,
    const T* __restrict__ beta,
    const T* __restrict__ lambda,
    const T epsi,
    const int n,
    const int m) {
  int tj = blockIdx.x * blockDim.x + threadIdx.x;
  if (tj < n) {
    T xt = x[tj];
    T xlow_j = xlow[tj];
    T xupp_j = xupp[tj];
    T alpha_j = alpha[tj];
    T beta_j = beta[tj];

    // Precompute denominators squared for better performance
    T denom_low = xt - xlow_j;
    T denom_upp = xupp_j - xt;
    T denom_alpha = xt - alpha_j;
    T denom_beta = beta_j - xt;

    const T small_eps = T(1e-12);
    denom_low = (abs(denom_low) < small_eps) ? small_eps : denom_low;
    denom_upp = (abs(denom_upp) < small_eps) ? small_eps : denom_upp;
    denom_alpha = (abs(denom_alpha) < small_eps) ? small_eps : denom_alpha;
    denom_beta = (abs(denom_beta) < small_eps) ? small_eps : denom_beta;

    T sum = T(0);
    for (int i = 0; i < m; i++) {
      T lambda_i = lambda[i];
      sum += pij[i + tj * m] * lambda_i / (denom_upp * denom_upp)
          - qij[i + tj * m] * lambda_i / (denom_low * denom_low);
    }
    sum += p0j[tj] / (denom_upp * denom_upp)
        - q0j[tj] / (denom_low * denom_low)
        - epsi / denom_alpha
        + epsi / denom_beta;

    delx[tj] = sum;
  }
}

template <typename T>
__global__ void GG_kernel(
    T* __restrict__ GG,
    const T* __restrict__ x,
    const T* __restrict__ xlow,
    const T* __restrict__ xupp,
    const T* __restrict__ pij,
    const T* __restrict__ qij,
    const int n,
    const int m) {

  int tj = blockIdx.x * blockDim.x + threadIdx.x;
  if (tj < n) {
    T xt = x[tj];
    T xlow_j = xlow[tj];
    T xupp_j = xupp[tj];

    // Distances from bounds
    T diff_upper = xupp_j - xt;
    T diff_lower = xt - xlow_j;

    // Squared distances
    T diff_upper2 = diff_upper * diff_upper;
    T diff_lower2 = diff_lower * diff_lower;

    for (int i = 0; i < m; i++) {
      int idx = i + tj * m;
      GG[idx] = pij[idx] / diff_upper2 - qij[idx] / diff_lower2;
    }
  }
}


template <typename T>
__global__ void diagx_kernel(T* __restrict__ diagx, const T* __restrict__ x,
    const T* __restrict__ xsi, const T* __restrict__ xlow,
    const T* __restrict__ xupp, const T* __restrict__ p0j,
    const T* __restrict__ q0j, const T* __restrict__ pij,
    const T* __restrict__ qij, const T* alpha, const T* beta,
    const T* eta, const T* lambda, const int n, const int m) {
  int tj = blockIdx.x * blockDim.x + threadIdx.x;
  if (tj < n) {
    T sum = 0;
    T sum1 = 0;
    for (int i = 0; i < m; i++) {
      sum = sum + pij[tj * m + i] * lambda[i];
      sum1 = sum1 + qij[tj * m + i] * lambda[i];
    }
    diagx[tj] = (p0j[tj] + sum) / pow(xupp[tj] - x[tj], 3) +
        (q0j[tj] + sum1) / pow(x[tj] - xlow[tj], 3);
    diagx[tj] = 2.0 * diagx[tj] + xsi[tj] / (x[tj] - alpha[tj]) +
        eta[tj] / (beta[tj] - x[tj]);
  }
}


template <typename T>
__inline__ __device__ T reduce_warp(T val) {
  val += __shfl_down_sync(0xffffffff, val, 16);
  val += __shfl_down_sync(0xffffffff, val, 8);
  val += __shfl_down_sync(0xffffffff, val, 4);
  val += __shfl_down_sync(0xffffffff, val, 2);
  val += __shfl_down_sync(0xffffffff, val, 1);
  return val;
}


template <typename T>
__global__ void mmareduce_kernel(T* __restrict__ bufred, const int n) {

  T sum = 0;
  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;
  for (int i = idx; i < n; i += str) {
    sum += bufred[i];
  }

  __shared__ T shared[32];
  unsigned int lane = threadIdx.x % warpSize;
  unsigned int wid = threadIdx.x / warpSize;

  sum = reduce_warp<T>(sum);
  if (lane == 0)
    shared[wid] = sum;
  __syncthreads();

  sum = (threadIdx.x < blockDim.x / warpSize) ? shared[lane] : 0;
  if (wid == 0)
    sum = reduce_warp<T>(sum);

  if (threadIdx.x == 0)
    bufred[blockIdx.x] = sum;
}


template <typename T>
__global__ void mmasum_kernel(const T* __restrict__ a, T* __restrict__ buf_h,
    const int n, const int m, const int k) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  const unsigned int lane = threadIdx.x % warpSize;
  const unsigned int wid = threadIdx.x / warpSize;

  __shared__ T shared[32];
  T sum = 0;
  for (int i = idx; i < n; i += str) {
    sum += a[m * i + k];
  }

  sum = reduce_warp<T>(sum);
  if (lane == 0)
    shared[wid] = sum;
  __syncthreads();

  sum = (threadIdx.x < blockDim.x / warpSize) ? shared[lane] : 0;
  if (wid == 0)
    sum = reduce_warp<T>(sum);

  if (threadIdx.x == 0)
    buf_h[blockIdx.x] = sum;
}
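
// ---------------------------------------------------------------------------
// Illustrative composition (an assumption, not part of the original header):
// mmasum_kernel reduces column k of an n x m row-major array into one partial
// sum per block, and mmareduce_kernel, launched with a single block, folds
// those partials into buf_d[0]. The helper name, block size and stream
// argument are hypothetical.
// ---------------------------------------------------------------------------
template <typename T>
void example_column_sum(const T* a_d, T* buf_d, const int n, const int m,
    const int k, cudaStream_t stream) {
  const int threads = 1024;
  const int nb = (n + threads - 1) / threads;
  mmasum_kernel<T><<<nb, threads, 0, stream>>>(a_d, buf_d, n, m, k);
  mmareduce_kernel<T><<<1, threads, 0, stream>>>(buf_d, nb);
}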

template <typename T>
__global__ void mmasumbb_kernel(const T* __restrict__ GG,
    const T* __restrict__ delx, const T* __restrict__ diagx,
    T* __restrict__ buf_h, const int n, const int m, const int k) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  const unsigned int lane = threadIdx.x % warpSize;
  const unsigned int wid = threadIdx.x / warpSize;

  __shared__ T shared[32];
  T sum = 0;
  for (int i = idx; i < n; i += str) {
    sum += GG[k + i * m] * delx[i] / diagx[i];
  }

  sum = reduce_warp<T>(sum);
  if (lane == 0)
    shared[wid] = sum;
  __syncthreads();

  sum = (threadIdx.x < blockDim.x / warpSize) ? shared[lane] : 0;
  if (wid == 0)
    sum = reduce_warp<T>(sum);

  if (threadIdx.x == 0)
    buf_h[blockIdx.x] = sum;
}

template <typename T>
__global__ void mmasumHess_kernel(
    const T* __restrict__ hijx,
    const T* __restrict__ Ljjxinv,
    T* __restrict__ buf_h,
    const int n,
    const int m,
    const int k0,
    const int k1) {
  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int stride = blockDim.x * gridDim.x;

  // Warp lane and warp id within the block
  const unsigned int lane = threadIdx.x % warpSize;
  const unsigned int wid = threadIdx.x / warpSize;

  __shared__ T shared[32];

  T sum = T(0);

  // Grid-stride loop for the global reduction
  for (int i = idx; i < n; i += stride) {
    // hijx is indexed as hijx[offset + row * m]
    T val0 = hijx[k0 + i * m];
    T val1 = hijx[k1 + i * m];
    sum += val0 * Ljjxinv[i] * val1;
  }

  // Warp-level reduction
  sum = reduce_warp<T>(sum);

  // Write each warp's partial sum to shared memory
  if (lane == 0) {
    shared[wid] = sum;
  }
  __syncthreads();

  // The first warp reduces the warp sums in shared memory
  sum = (threadIdx.x < blockDim.x / warpSize) ? shared[lane] : T(0);
  if (wid == 0) {
    sum = reduce_warp<T>(sum);
  }

  // Write the final result from the first thread of the block
  if (threadIdx.x == 0) {
    buf_h[blockIdx.x] = sum;
  }
}


template <typename T>
__global__ void mmasumAA_kernel(const T* __restrict__ GG,
    const T* __restrict__ diagx, T* __restrict__ buf_h, const int n,
    const int m, const int k0, const int k1) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  const unsigned int lane = threadIdx.x % warpSize;
  const unsigned int wid = threadIdx.x / warpSize;

  __shared__ T shared[32];
  T sum = 0;
  for (int i = idx; i < n; i += str) {
    sum += GG[k0 + i * m] / diagx[i] * GG[k1 + i * m];
  }

  sum = reduce_warp<T>(sum);
  if (lane == 0)
    shared[wid] = sum;
  __syncthreads();

  sum = (threadIdx.x < blockDim.x / warpSize) ? shared[lane] : 0;
  if (wid == 0)
    sum = reduce_warp<T>(sum);

  if (threadIdx.x == 0)
    buf_h[blockIdx.x] = sum;
}


template <typename T>
__global__ void mma_copy_kernel(T* __restrict__ a, const T* __restrict__ b,
    const int n, const int m) {
  int tj = blockIdx.x * blockDim.x + threadIdx.x;
  if (tj < n)
    a[tj + m] = b[tj];
}


template <typename T>
__global__ void AA_kernel(T* __restrict__ temp, const T* __restrict__ GG,
    const T* __restrict__ diagx, const int n, const int m) {
  int tj = blockIdx.x * blockDim.x + threadIdx.x;
  if (tj < n) {
    for (int i0 = 0; i0 < m; i0++) {
      for (int i1 = 0; i1 < m; i1++) {
        temp[tj + i0 * (n + 1) + i1 * (m + 1) * (n + 1)] = GG[i0 * n + tj] *
            (1.0 / diagx[tj]) * GG[i1 * n + tj];
      }
    }
  }
}


template <typename T>
__global__ void dx_kernel(T* __restrict__ dx, const T* __restrict__ delx,
    const T* __restrict__ diagx, const T* __restrict__ GG,
    const T* __restrict__ dlambda, const int n, const int m) {
  int tj = blockIdx.x * blockDim.x + threadIdx.x;
  if (tj < n) {
    dx[tj] = -delx[tj] / diagx[tj];
    for (int i = 0; i < m; i++) {
      dx[tj] = dx[tj] - GG[tj * m + i] * dlambda[i] / diagx[tj];
    }
  }
}


template <typename T>
__global__ void dxsi_kernel(T* __restrict__ dxsi, const T* __restrict__ xsi,
    const T* __restrict__ dx, const T* __restrict__ x,
    const T* __restrict__ alpha, const T epsi, const int n) {
  int tj = blockIdx.x * blockDim.x + threadIdx.x;
  if (tj < n) {
    dxsi[tj] = -xsi[tj] + (epsi - dx[tj] * xsi[tj]) / (x[tj] - alpha[tj]);
  }
}


template <typename T>
__global__ void deta_kernel(T* __restrict__ deta, const T* __restrict__ eta,
    const T* __restrict__ dx, const T* __restrict__ x,
    const T* __restrict__ beta, const T epsi, const int n) {
  int tj = blockIdx.x * blockDim.x + threadIdx.x;
  if (tj < n) {
    deta[tj] = -eta[tj] + (epsi + dx[tj] * eta[tj]) / (beta[tj] - x[tj]);
  }
}


template <typename T>
__global__ void RexCalculation_kernel(
    T* __restrict__ rex,
    const T* __restrict__ x,
    const T* __restrict__ xlow,
    const T* __restrict__ xupp,
    const T* __restrict__ pij,
    const T* __restrict__ p0j,
    const T* __restrict__ qij,
    const T* __restrict__ q0j,
    const T* __restrict__ lambda,
    const T* __restrict__ xsi,
    const T* __restrict__ eta,
    const int n,
    const int m) {
  int tj = blockIdx.x * blockDim.x + threadIdx.x;
  if (tj < n) {
    T upp_diff = xupp[tj] - x[tj];
    T low_diff = x[tj] - xlow[tj];
    T upp_diff_sq = upp_diff * upp_diff;
    T low_diff_sq = low_diff * low_diff;

    T sum = 0.0;
    for (int i = 0; i < m; ++i) {
      sum += pij[i + tj * m] * lambda[i] / upp_diff_sq
          - qij[i + tj * m] * lambda[i] / low_diff_sq;
    }

    rex[tj] = sum
        + p0j[tj] / upp_diff_sq
        - q0j[tj] / low_diff_sq
        - xsi[tj] + eta[tj];
  }
}


template <typename T>
__global__ void rey_calculation_kernel(T* __restrict__ rey,
    const T* __restrict__ c, const T* __restrict__ d, const T* __restrict__ y,
    const T* __restrict__ lambda, const T* __restrict__ mu, const int n) {
  int tj = blockIdx.x * blockDim.x + threadIdx.x;
  if (tj < n) {
    rey[tj] = c[tj] + d[tj] * y[tj] - lambda[tj] - mu[tj];
  }
}


template <typename T>
__global__ void norm_kernel(const T* __restrict__ a,
    T* __restrict__ buf_h, const int n) {
  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  const unsigned int lane = threadIdx.x % warpSize;
  const unsigned int wid = threadIdx.x / warpSize;

  __shared__ T shared[32];
  T sum = T(0);

  for (int i = idx; i < n; i += str) {
    T val = a[i];
    sum += val * val; // faster than pow(val, 2)
  }

  sum = reduce_warp<T>(sum); // warp-level sum reduction
  if (lane == 0)
    shared[wid] = sum;
  __syncthreads();

  sum = (threadIdx.x < blockDim.x / warpSize) ? shared[lane] : T(0);
  if (wid == 0)
    sum = reduce_warp<T>(sum);

  if (threadIdx.x == 0)
    buf_h[blockIdx.x] = sum;
}


template <typename T>
__global__ void sub2cons_kernel(T* __restrict__ a, const T* __restrict__ b,
    const T* __restrict__ c, const T d, const int n) {
  int tj = blockIdx.x * blockDim.x + threadIdx.x;
  if (tj < n) {
    a[tj] = b[tj] * c[tj] - d;
  }
}


template <typename T>
__global__ void dely_kernel(T* __restrict__ dely, const T* __restrict__ c,
    const T* __restrict__ d, const T* __restrict__ y,
    const T* __restrict__ lambda, const T epsi, const int n) {
  int tj = blockIdx.x * blockDim.x + threadIdx.x;
  if (tj < n) {
    dely[tj] = c[tj] + d[tj] * y[tj] - lambda[tj] - epsi / y[tj];
  }
}


template <typename T>
__global__ void maxval2_kernel(const T* __restrict__ a, const T* __restrict__ b,
    T* __restrict__ temp, const T cons, const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  const unsigned int lane = threadIdx.x % warpSize;
  const unsigned int wid = threadIdx.x / warpSize;

  __shared__ T shared[32];
  T maxval = cons * a[0] / b[0];
  for (int i = idx; i < n; i += str) {
    maxval = max(maxval, cons * a[i] / b[i]);
  }

  maxval = max_reduce_warp<T>(maxval);
  if (lane == 0)
    shared[wid] = maxval;
  __syncthreads();

  maxval = (threadIdx.x < blockDim.x / warpSize) ? shared[lane] : 0.0;
  if (wid == 0)
    maxval = max_reduce_warp<T>(maxval);

  if (threadIdx.x == 0)
    temp[blockIdx.x] = maxval;
}


template <typename T>
__global__ void maxval3_kernel(const T* __restrict__ a, const T* __restrict__ b,
    const T* __restrict__ c, T* __restrict__ temp, const T cons, const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  const unsigned int lane = threadIdx.x % warpSize;
  const unsigned int wid = threadIdx.x / warpSize;

  __shared__ T shared[32];
  T maxval = cons * a[0] / b[0];
  for (int i = idx; i < n; i += str) {
    maxval = max(maxval, cons * a[i] / (b[i] - c[i]));
  }

  maxval = max_reduce_warp<T>(maxval);
  if (lane == 0)
    shared[wid] = maxval;
  __syncthreads();

  maxval = (threadIdx.x < blockDim.x / warpSize) ? shared[lane] : 0;
  if (wid == 0)
    maxval = max_reduce_warp<T>(maxval);

  if (threadIdx.x == 0)
    temp[blockIdx.x] = maxval;
}


template <typename T>
__global__ void kkt_rex_kernel(T* __restrict__ rex, const T* __restrict__ df0dx,
    const T* __restrict__ dfdx, const T* __restrict__ xsi,
    const T* __restrict__ eta, const T* __restrict__ lambda, const int n,
    const int m) {
  int tj = blockIdx.x * blockDim.x + threadIdx.x;
  if (tj < n) {
    rex[tj] = 0.0;
    for (int i = 0; i < m; i++) {
      rex[tj] = rex[tj] + dfdx[i + tj * m] * lambda[i];
    }
    rex[tj] += df0dx[tj] - xsi[tj] + eta[tj];
  }
}


template <typename T>
__global__ void maxcons_kernel(T* __restrict__ a, const T b,
    const T c, const T* __restrict__ d, const int n) {
  int tj = blockIdx.x * blockDim.x + threadIdx.x;
  if (tj < n) {
    a[tj] = max(b, c * d[tj]);
  }
}


template <typename T>
__global__ void glsum_kernel(const T* a, T* buf_h, const int n) {
  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  const unsigned int lane = threadIdx.x % warpSize;
  const unsigned int wid = threadIdx.x / warpSize;

  __shared__ T shared[32];
  T sum = 0;
  for (int i = idx; i < n; i += str) {
    sum += a[i];
  }

  sum = reduce_warp<T>(sum);
  if (lane == 0)
    shared[wid] = sum;
  __syncthreads();

  sum = (threadIdx.x < blockDim.x / warpSize) ? shared[lane] : 0;
  if (wid == 0)
    sum = reduce_warp<T>(sum);

  if (threadIdx.x == 0)
    buf_h[blockIdx.x] = sum;
}


template <typename T>
__global__ void glsc2_kernel(const T* a,
    const T* b,
    T* buf_h,
    const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  const unsigned int lane = threadIdx.x % warpSize;
  const unsigned int wid = threadIdx.x / warpSize;

  __shared__ T shared[32];
  T sum = 0.0;
  for (int i = idx; i < n; i += str) {
    sum += a[i] * b[i];
  }

  sum = reduce_warp<T>(sum);
  if (lane == 0)
    shared[wid] = sum;
  __syncthreads();

  sum = (threadIdx.x < blockDim.x / warpSize) ? shared[lane] : 0;
  if (wid == 0)
    sum = reduce_warp<T>(sum);

  if (threadIdx.x == 0)
    buf_h[blockIdx.x] = sum;
}


template <typename T>
__global__ void add2inv2_kernel(T* __restrict__ a, const T* __restrict__ b,
    const T c, const int n) {
  int tj = blockIdx.x * blockDim.x + threadIdx.x;
  if (tj < n) {
    a[tj] = a[tj] + c / b[tj];
  }
}

template <typename T>
__global__ void max2_kernel(T* __restrict__ a, const T b,
    const T* __restrict__ c, const T d, const int n) {
  int tj = blockIdx.x * blockDim.x + threadIdx.x;
  if (tj < n) {
    a[tj] = max(b, d * c[tj]);
  }
}

template <typename T>
__global__ void updatebb_kernel(T* __restrict__ bb,
    const T* __restrict__ dellambda, const T* __restrict__ dely,
    const T* __restrict__ d, const T* __restrict__ mu,
    const T* __restrict__ y, const T delz, const int m) {
  int tj = blockIdx.x * blockDim.x + threadIdx.x;
  if (tj < m)
    bb[tj] = dellambda[tj] + dely[tj] / (d[tj] + mu[tj] / y[tj]) - bb[tj];
  else if (tj < m + 1)
    bb[tj] = delz;
}


template <typename T>
__global__ void updateAA_kernel(T* __restrict__ AA,
    const T* __restrict__ globaltmp_mm, const T* __restrict__ s,
    const T* __restrict__ lambda, const T* __restrict__ d,
    const T* __restrict__ mu, const T* __restrict__ y, const T* __restrict__ a,
    const T zeta, const T z, const int m) {
  int tj = blockIdx.x * blockDim.x + threadIdx.x;
  if (tj < m) {
    AA[tj + tj * (m + 1)] = globaltmp_mm[tj + tj * m] + (s[tj] / lambda[tj] +
        1.0 / (d[tj] + mu[tj] / y[tj]));
    AA[tj + m * (m + 1)] = a[tj];
    AA[m + tj * (m + 1)] = a[tj];
  }
  else if (tj < m + 1)
    AA[tj + tj * (m + 1)] = -zeta / z;
}

template <typename T>
__global__ void dy_kernel(T* __restrict__ dy, const T* __restrict__ dely,
    const T* __restrict__ dlambda, const T* __restrict__ d,
    const T* __restrict__ mu, const T* __restrict__ y, const int n) {
  int tj = blockIdx.x * blockDim.x + threadIdx.x;
  if (tj < n)
    dy[tj] = (-dely[tj] + dlambda[tj]) / (d[tj] + mu[tj] / y[tj]);
}

#endif // MMA_CUDA_KERNEL_H