Neko-TOP
A portable framework for high-order spectral element flow topology optimization.
mma_kernel.h
/*
 Copyright (c) 2021-2025, The Neko Authors
 All rights reserved.

 Redistribution and use in source and binary forms, with or without
 modification, are permitted provided that the following conditions
 are met:

   * Redistributions of source code must retain the above copyright
     notice, this list of conditions and the following disclaimer.

   * Redistributions in binary form must reproduce the above
     copyright notice, this list of conditions and the following
     disclaimer in the documentation and/or other materials provided
     with the distribution.

   * Neither the name of the authors nor the names of its
     contributors may be used to endorse or promote products derived
     from this software without specific prior written permission.

 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
 COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
 ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 POSSIBILITY OF SUCH DAMAGE.
*/

#ifndef MMA_CUDA_KERNEL_H
#define MMA_CUDA_KERNEL_H

template <typename T>
__global__ void delta_1dbeam_kernel(T* __restrict__ Delta,
    const T L_total, const T Le, const int offset, const int n) {
  int k = blockIdx.x * blockDim.x + threadIdx.x;
  if (k >= n) return;

  // Convert to 1-based indexing for the calculation
  int idx = k + 1;

  // Calculate first term: (L_total - Le*(offset+idx-1))^3
  T x1 = L_total - Le * static_cast<T>(offset + idx - 1);
  T term1 = x1 * x1 * x1;

  // Calculate second term: (L_total - Le*(offset+idx))^3
  T x2 = L_total - Le * static_cast<T>(offset + idx);
  T term2 = x2 * x2 * x2;

  // Final result
  Delta[k] = (term1 - term2) / static_cast<T>(3.0);
}

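/*
 * Added note (interpretation, not taken from the original source): with the
 * global element index e = offset + k, the expression above is the closed
 * form of
 *
 *   Delta[k] = ((L_total - Le*e)^3 - (L_total - Le*(e+1))^3) / 3
 *            = integral over [Le*e, Le*(e+1)] of (L_total - s)^2 ds,
 *
 * i.e. the exact integral of (L_total - s)^2 over one element of length Le,
 * assuming the beam occupies [0, L_total].
 */
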
template <typename T>
__global__ void mma_Ljjxinv_kernel(T* __restrict__ Ljjxinv,
    const T* __restrict__ pjlambda, const T* __restrict__ qjlambda,
    const T* __restrict__ x, const T* __restrict__ low, const T* __restrict__ upp,
    const T* __restrict__ alpha, const T* __restrict__ beta,
    const int n) {
  int tj = blockIdx.x * blockDim.x + threadIdx.x;
  if (tj >= n) return;

  // Load inputs into registers
  const T xt = x[tj];
  const T low_j = low[tj];
  const T upp_j = upp[tj];
  const T pj = pjlambda[tj];
  const T qj = qjlambda[tj];

  // Precompute reused differences
  const T diff_u = upp_j - xt;
  const T diff_l = xt - low_j;

  // Cube once (avoiding pow for speed)
  const T diff_u3 = diff_u * diff_u * diff_u;
  const T diff_l3 = diff_l * diff_l * diff_l;

  // Compute inverse value safely
  T denom = 2.0 * pj / diff_u3 + 2.0 * qj / diff_l3;
  T val = -1.0 / denom;

  // Mask out active primal constraints
  bool active = (fabs(xt - alpha[tj]) <= T(1e-16)) ||
                (fabs(xt - beta[tj]) <= T(1e-16));

  Ljjxinv[tj] = active ? T(0.0) : val;
}

template <typename T>
__global__ void mma_dipsolvesub1_kernel(T* __restrict__ x,
    const T* __restrict__ pjlambda, const T* __restrict__ qjlambda,
    const T* __restrict__ low, const T* __restrict__ upp,
    const T* __restrict__ alpha, const T* __restrict__ beta,
    const int n) {

  int tj = blockIdx.x * blockDim.x + threadIdx.x;
  if (tj >= n) return;

  // Load inputs into registers
  const T pj_sqrt = sqrt(pjlambda[tj]);
  const T qj_sqrt = sqrt(qjlambda[tj]);
  const T denom = pj_sqrt + qj_sqrt;

  const T low_j = low[tj];
  const T upp_j = upp[tj];
  const T alpha_j = alpha[tj];
  const T beta_j = beta[tj];

  // Weighted average
  const T val = (pj_sqrt * low_j + qj_sqrt * upp_j) / denom;

  // Clamp x between alpha and beta using branchless min/max
  x[tj] = fmin(fmax(val, alpha_j), beta_j);
}

template <typename T>
__global__ void mattrans_v_mul_kernel(T* __restrict__ output,
    const T* __restrict__ pij, const T* __restrict__ lambda,
    const int m, const int n) {

  int tj = blockIdx.x * blockDim.x + threadIdx.x;
  if (tj >= n) return;

  T acc = 0.0;

  // Use register accumulation
  for (int i = 0; i < m; ++i) {
    acc += pij[i + tj * m] * lambda[i];
  }

  output[tj] = acc;
}

template <typename T>
__global__ void mma_sub1_kernel(
    T* __restrict__ xlow,
    T* __restrict__ xupp,
    const T* __restrict__ x,
    const T* __restrict__ xmin,
    const T* __restrict__ xmax,
    const T asyinit,
    const int n) {
  int tj = blockIdx.x * blockDim.x + threadIdx.x;
  if (tj >= n) return;

  // Load values once into registers (global memory is slow)
  const T xt = x[tj];
  const T xmin_j = xmin[tj];
  const T xmax_j = xmax[tj];

  // Reuse xgap calculation
  const T xgap = xmax_j - xmin_j;
  const T offset = asyinit * xgap;

  xlow[tj] = xt - offset;
  xupp[tj] = xt + offset;
}

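/*
 * Added illustrative sketch (not part of the original header): the pointwise
 * kernels in this file are launched over the n design variables with a 1-D
 * grid; the `if (tj >= n) return;` guard makes a rounded-up grid size safe.
 * The wrapper below is a hypothetical example using mma_sub1_kernel; the
 * device pointers are assumed to have been allocated and filled elsewhere.
 */
template <typename T>
inline void mma_sub1_launch_example(T* xlow_d, T* xupp_d, const T* x_d,
    const T* xmin_d, const T* xmax_d, const T asyinit, const int n) {
  const int nthreads = 1024;                          // threads per block
  const int nblocks = (n + nthreads - 1) / nthreads;  // ceil(n / nthreads)
  mma_sub1_kernel<T><<<nblocks, nthreads>>>(
      xlow_d, xupp_d, x_d, xmin_d, xmax_d, asyinit, n);
}
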
template< typename T >
__global__ void mma_sub2_kernel(T* __restrict__ low, T* __restrict__ upp,
    const T* __restrict__ x, const T* __restrict__ xold1,
    const T* __restrict__ xold2, const T* __restrict__ xdiff,
    const T asydecr, const T asyincr, const int n) {
  int tj = blockIdx.x * blockDim.x + threadIdx.x;
  if (tj >= n) return;

  // Load data into registers to avoid repeated global memory accesses
  const T xval = x[tj];
  const T xold1val = xold1[tj];
  const T xold2val = xold2[tj];
  const T lowval = low[tj];
  const T uppval = upp[tj];
  const T xdiffval = xdiff[tj];

  // Product of the last two design updates
  const T prod = (xval - xold1val) * (xold1val - xold2val);

  // Select the asymptote factor from the sign of the product
  T asy_factor = (prod < T(0)) ? asydecr :
                 (prod > T(0)) ? asyincr : T(1);

  // Update low and upp using fma (fused multiply-add)
  T new_low = fma(-asy_factor, (xold1val - lowval), xval);
  T new_upp = fma(asy_factor, (uppval - xold1val), xval);

  // Apply bounds
  new_low = max(new_low, xval - T(10.0) * xdiffval);
  new_low = min(new_low, xval - T(0.01) * xdiffval);

  new_upp = min(new_upp, xval + T(10.0) * xdiffval);
  new_upp = max(new_upp, xval + T(0.01) * xdiffval);

  // Write results back
  low[tj] = new_low;
  upp[tj] = new_upp;
}

template <typename T>
__global__ void mma_sub3_kernel(const T* __restrict__ x,
    const T* __restrict__ df0dx, const T* __restrict__ dfdx,
    T* __restrict__ low, T* __restrict__ upp, const T* __restrict__ xmin,
    const T* __restrict__ xmax, T* __restrict__ alpha, T* __restrict__ beta,
    T* __restrict__ p0j, T* __restrict__ q0j, T* __restrict__ pij,
    T* __restrict__ qij, const int n, const int m) {
  int tj = blockIdx.x * blockDim.x + threadIdx.x;
  if (tj >= n) return;

  // Load into registers once
  const T xt = x[tj];
  const T xmin_j = xmin[tj];
  const T xmax_j = xmax[tj];
  const T low_j = low[tj];
  const T upp_j = upp[tj];
  const T df0 = df0dx[tj];
  const T xgap = xmax_j - xmin_j;

  // Clamp helpers
  const T half_xgap = 0.5 * xgap;
  const T tenth_low_diff = T(0.1) * (xt - low_j);
  const T tenth_upp_diff = T(0.1) * (upp_j - xt);

  // Compute alpha and beta with fused max/min and fewer calls
  T alpha_val = max(max(xmin_j, low_j + tenth_low_diff), xt - half_xgap);
  T beta_val = min(min(xmax_j, upp_j - tenth_upp_diff), xt + half_xgap);

  alpha[tj] = alpha_val;
  beta[tj] = beta_val;

  // Avoid multiple pow calls, compute once
  const T upp_minus_x = upp_j - xt;
  const T x_minus_low = xt - low_j;
  const T upp_minus_x_sq = upp_minus_x * upp_minus_x;
  const T x_minus_low_sq = x_minus_low * x_minus_low;

  // Small epsilon for numerical stability
  const T eps = T(1e-5);
  const T inv_xgap = T(1.0) / max(eps, xgap);

  // Compute terms reused multiple times
  const T max_df0_pos = max(df0, T(0));
  const T max_df0_neg = max(-df0, T(0));

  p0j[tj] = upp_minus_x_sq * (T(1.001) * max_df0_pos +
            T(0.001) * max_df0_neg + eps * inv_xgap);
  q0j[tj] = x_minus_low_sq * (T(0.001) * max_df0_pos +
            T(1.001) * max_df0_neg + eps * inv_xgap);

  // Loop over m for pij and qij
  for (int i = 0; i < m; ++i) {
    int idx = i + tj * m;

    T dfdx_val = dfdx[idx];
    T max_pos = max(dfdx_val, T(0));
    T max_neg = max(-dfdx_val, T(0));

    pij[idx] = upp_minus_x_sq * (T(1.001) * max_pos +
               T(0.001) * max_neg + eps * inv_xgap);
    qij[idx] = x_minus_low_sq * (T(0.001) * max_pos +
               T(1.001) * max_neg + eps * inv_xgap);
  }
}

template <typename T>
__global__ void mma_sub4_kernel(
    const T* __restrict__ x,
    const T* __restrict__ low,
    const T* __restrict__ upp,
    const T* __restrict__ pij,
    const T* __restrict__ qij,
    T* __restrict__ temp,
    const int n,
    const int m) {

  int tj = blockIdx.x * blockDim.x + threadIdx.x;
  if (tj >= n) return;

  // Register caching
  const T xt = x[tj];
  const T low_j = low[tj];
  const T upp_j = upp[tj];

  const T denom_upp = upp_j - xt;
  const T denom_low = xt - low_j;

  const T eps = T(1e-12); // Precision-dependent epsilon
  const T inv_denom_upp = T(1) / max(denom_upp, eps);
  const T inv_denom_low = T(1) / max(denom_low, eps);

  const int base_idx = tj * m;

  for (int i = 0; i < m; ++i) {
    int idx = base_idx + i;
    temp[idx] = pij[idx] * inv_denom_upp + qij[idx] * inv_denom_low;
  }
}


template <typename T>
__global__ void mma_max2_kernel(
    T* __restrict__ xsi,
    const T* __restrict__ x,
    const T* __restrict__ alpha,
    const int n) {

  int tj = blockIdx.x * blockDim.x + threadIdx.x;
  if (tj >= n) return;

  const T eps = T(1e-12);
  T denom = max(x[tj] - alpha[tj], eps);
  xsi[tj] = max(T(1), T(1) / denom);
}

template <typename T>
__global__ void relambda_kernel(
    T* __restrict__ temp,
    const T* __restrict__ x,
    const T* __restrict__ xupp,
    const T* __restrict__ xlow,
    const T* __restrict__ pij,
    const T* __restrict__ qij,
    const int n,
    const int m) {

  int tj = blockIdx.x * blockDim.x + threadIdx.x;
  if (tj >= n) return;

  const T xt = x[tj];
  const T xup = xupp[tj];
  const T xlo = xlow[tj];

  // Prevent divide-by-zero using small epsilon
  const T eps = T(1e-12);
  const T inv_denom_upp = T(1) / max(xup - xt, eps);
  const T inv_denom_low = T(1) / max(xt - xlo, eps);

  int base_idx = tj * m;
  for (int i = 0; i < m; ++i) {
    int idx = base_idx + i;
    temp[idx] = pij[idx] * inv_denom_upp + qij[idx] * inv_denom_low;
  }
}


template <typename T>
__global__ void sub2cons2_kernel(
    T* __restrict__ a,
    const T* __restrict__ b,
    const T* __restrict__ c,
    const T* __restrict__ d,
    const T e,
    const int n) {
  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  if (idx >= n) return;

  const T bt = b[idx];
  const T ct = c[idx];
  const T dt = d[idx];

  a[idx] = bt * (ct - dt) - e;
}


template< typename T>
__inline__ __device__ T max_reduce_warp(T val) {
  val = max(val, __shfl_down_sync(0xffffffff, val, 16));
  val = max(val, __shfl_down_sync(0xffffffff, val, 8));
  val = max(val, __shfl_down_sync(0xffffffff, val, 4));
  val = max(val, __shfl_down_sync(0xffffffff, val, 2));
  val = max(val, __shfl_down_sync(0xffffffff, val, 1));
  return val;
}

template <typename T>
__global__ void maxval_kernel(const T* __restrict__ a, T* temp, const int n) {
  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int stride = blockDim.x * gridDim.x;

  const unsigned int lane = threadIdx.x % warpSize;
  const unsigned int warp_id = threadIdx.x / warpSize;

  // Shared memory to store warp-level maxima
  __shared__ T shared[32]; // Assumes max 1024 threads per block

  T local_max = T(0);
  for (int i = idx; i < n; i += stride) {
    local_max = max(local_max, abs(a[i]));
  }

  // Warp-level reduction
  local_max = max_reduce_warp<T>(local_max);

  // Write warp-level results to shared memory
  if (lane == 0) {
    shared[warp_id] = local_max;
  }
  __syncthreads();

  // Let the first warp reduce shared values
  local_max = (threadIdx.x < blockDim.x / warpSize) ? shared[lane] : T(0);

  if (warp_id == 0) {
    local_max = max_reduce_warp<T>(local_max);
  }

  if (threadIdx.x == 0) {
    temp[blockIdx.x] = local_max;
  }
}


template <typename T>
__global__ void max_reduce_kernel(T* __restrict__ bufred, const int n) {

  T maxval = T(0);
  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int stride = blockDim.x * gridDim.x;

  // Grid-stride loop to cover all elements
  for (int i = idx; i < n; i += stride) {
    maxval = max(maxval, bufred[i]);
  }

  __shared__ T shared[32]; // One slot per warp (max 1024 threads/block)

  unsigned int lane = threadIdx.x % warpSize;
  unsigned int wid = threadIdx.x / warpSize;

  // Warp-level max reduction
  maxval = max_reduce_warp<T>(maxval);

  // Store each warp's max value in shared memory
  if (lane == 0) {
    shared[wid] = maxval;
  }
  __syncthreads();

  // Let the first warp reduce the warp-level maxima
  maxval = (threadIdx.x < blockDim.x / warpSize) ? shared[lane] : T(0);
  if (wid == 0) {
    maxval = max_reduce_warp<T>(maxval);
  }

  // Write the block's maximum value back to bufred[blockIdx.x]
  if (threadIdx.x == 0) {
    bufred[blockIdx.x] = maxval;
  }
}

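/*
 * Added illustrative sketch (not part of the original header): maxval_kernel
 * and max_reduce_kernel form a two-pass reduction. The first pass writes one
 * partial maximum of |a| per block into a scratch buffer, and the second pass
 * folds that buffer until a single value remains in element 0. The wrapper
 * below is a hypothetical host-side driver under that assumption; bufred_d
 * must hold at least one element per block, and allocation, error checking
 * and the device-to-host copy of the final scalar are left out.
 */
template <typename T>
inline void device_maxabs_example(const T* a_d, T* bufred_d, const int n) {
  const int nthreads = 1024;
  const int nblocks = (n + nthreads - 1) / nthreads;

  // Pass 1: one partial max of |a| per block
  maxval_kernel<T><<<nblocks, nthreads>>>(a_d, bufred_d, n);

  // Pass 2: fold the partial results until a single block remains
  int remaining = nblocks;
  while (remaining > 1) {
    int nb = (remaining + nthreads - 1) / nthreads;
    max_reduce_kernel<T><<<nb, nthreads>>>(bufred_d, remaining);
    remaining = nb;
  }
  // bufred_d[0] now holds max_i |a[i]|
}
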
template <typename T>
__global__ void delx_kernel(
    T* __restrict__ delx,
    const T* __restrict__ x,
    const T* __restrict__ xlow,
    const T* __restrict__ xupp,
    const T* __restrict__ pij,
    const T* __restrict__ qij,
    const T* __restrict__ p0j,
    const T* __restrict__ q0j,
    const T* __restrict__ alpha,
    const T* __restrict__ beta,
    const T* __restrict__ lambda,
    const T epsi,
    const int n,
    const int m)
{
  int tj = blockIdx.x * blockDim.x + threadIdx.x;
  if (tj < n) {
    T xt = x[tj];
    T xlow_j = xlow[tj];
    T xupp_j = xupp[tj];
    T alpha_j = alpha[tj];
    T beta_j = beta[tj];

    // Precompute denominators squared for better performance
    T denom_low = xt - xlow_j;
    T denom_upp = xupp_j - xt;
    T denom_alpha = xt - alpha_j;
    T denom_beta = beta_j - xt;

    const T small_eps = T(1e-12);
    denom_low = (abs(denom_low) < small_eps) ? small_eps : denom_low;
    denom_upp = (abs(denom_upp) < small_eps) ? small_eps : denom_upp;
    denom_alpha = (abs(denom_alpha) < small_eps) ? small_eps : denom_alpha;
    denom_beta = (abs(denom_beta) < small_eps) ? small_eps : denom_beta;

    T sum = T(0);
    for (int i = 0; i < m; i++) {
      T lambda_i = lambda[i];
      sum += pij[i + tj * m] * lambda_i / (denom_upp * denom_upp)
           - qij[i + tj * m] * lambda_i / (denom_low * denom_low);
    }
    sum += p0j[tj] / (denom_upp * denom_upp)
         - q0j[tj] / (denom_low * denom_low)
         - epsi / denom_alpha
         + epsi / denom_beta;

    delx[tj] = sum;
  }
}

template <typename T>
__global__ void GG_kernel(
    T* __restrict__ GG,
    const T* __restrict__ x,
    const T* __restrict__ xlow,
    const T* __restrict__ xupp,
    const T* __restrict__ pij,
    const T* __restrict__ qij,
    const int n,
    const int m) {

  int tj = blockIdx.x * blockDim.x + threadIdx.x;
  if (tj < n) {
    T xt = x[tj];
    T xlow_j = xlow[tj];
    T xupp_j = xupp[tj];

    // Distances from bounds
    T diff_upper = xupp_j - xt;
    T diff_lower = xt - xlow_j;

    // Squared distances
    T diff_upper2 = diff_upper * diff_upper;
    T diff_lower2 = diff_lower * diff_lower;

    for (int i = 0; i < m; i++) {
      int idx = i + tj * m;
      GG[idx] = pij[idx] / diff_upper2 - qij[idx] / diff_lower2;
    }
  }
}

template <typename T>
__global__ void diagx_kernel(T* __restrict__ diagx, const T* __restrict__ x,
    const T* __restrict__ xsi, const T* __restrict__ xlow,
    const T* __restrict__ xupp, const T* __restrict__ p0j,
    const T* __restrict__ q0j, const T* __restrict__ pij,
    const T* __restrict__ qij, const T* alpha, const T* beta,
    const T* eta, const T* lambda, const int n, const int m) {
  int tj = blockIdx.x * blockDim.x + threadIdx.x;
  if (tj < n) {
    T sum = 0;
    T sum1 = 0;
    for (int i = 0; i < m; i++) {
      sum = sum + pij[tj * m + i] * lambda[i];
      sum1 = sum1 + qij[tj * m + i] * lambda[i];
    }
    diagx[tj] = (p0j[tj] + sum) / pow(xupp[tj] - x[tj], 3) +
                (q0j[tj] + sum1) / pow(x[tj] - xlow[tj], 3);
    diagx[tj] = 2.0 * diagx[tj] + xsi[tj] / (x[tj] - alpha[tj]) +
                eta[tj] / (beta[tj] - x[tj]);
  }
}


template <typename T>
__inline__ __device__ T reduce_warp(T val) {
  volatile T v = val;
  v += __shfl_down_sync(0xffffffff, v, 16);
  v += __shfl_down_sync(0xffffffff, v, 8);
  v += __shfl_down_sync(0xffffffff, v, 4);
  v += __shfl_down_sync(0xffffffff, v, 2);
  v += __shfl_down_sync(0xffffffff, v, 1);
  return v;
}


template <typename T>
__global__ void mmareduce_kernel(T* __restrict__ bufred, const int n) {

  T sum = 0;
  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;
  for (int i = idx; i < n; i += str) {
    sum += bufred[i];
  }

  __shared__ T shared[32];
  unsigned int lane = threadIdx.x % warpSize;
  unsigned int wid = threadIdx.x / warpSize;

  sum = reduce_warp<T>(sum);
  if (lane == 0)
    shared[wid] = sum;
  __syncthreads();

  sum = (threadIdx.x < blockDim.x / warpSize) ? shared[lane] : 0;
  if (wid == 0)
    sum = reduce_warp<T>(sum);

  if (threadIdx.x == 0)
    bufred[blockIdx.x] = sum;
}

template< typename T >
__global__ void mmasum_kernel(const T* __restrict__ a, T* __restrict__ buf_h,
    const int n, const int m, const int k) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  const unsigned int lane = threadIdx.x % warpSize;
  const unsigned int wid = threadIdx.x / warpSize;

  __shared__ T shared[32];
  T sum = 0;
  for (int i = idx; i < n; i += str) {
    sum += a[m * i + k];
  }

  sum = reduce_warp<T>(sum);
  if (lane == 0)
    shared[wid] = sum;
  __syncthreads();

  sum = (threadIdx.x < blockDim.x / warpSize) ? shared[lane] : 0;
  if (wid == 0)
    sum = reduce_warp<T>(sum);

  if (threadIdx.x == 0)
    buf_h[blockIdx.x] = sum;
}

template< typename T >
__global__ void mmasumbb_kernel(const T* __restrict__ GG,
    const T* __restrict__ delx, const T* __restrict__ diagx,
    T* __restrict__ buf_h, const int n, const int m, const int k) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  const unsigned int lane = threadIdx.x % warpSize;
  const unsigned int wid = threadIdx.x / warpSize;

  __shared__ T shared[32];
  T sum = 0;
  for (int i = idx; i < n; i += str) {
    sum += GG[k + i * m] * delx[i] / diagx[i];
  }

  sum = reduce_warp<T>(sum);
  if (lane == 0)
    shared[wid] = sum;
  __syncthreads();

  sum = (threadIdx.x < blockDim.x / warpSize) ? shared[lane] : 0;
  if (wid == 0)
    sum = reduce_warp<T>(sum);

  if (threadIdx.x == 0)
    buf_h[blockIdx.x] = sum;
}

template <typename T>
__global__ void mmasumHess_kernel(
    const T* __restrict__ hijx,
    const T* __restrict__ Ljjxinv,
    T* __restrict__ buf_h,
    const int n,
    const int m,
    const int k0,
    const int k1)
{
  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int stride = blockDim.x * gridDim.x;

  // Warp lane and warp id within the block
  const unsigned int lane = threadIdx.x % warpSize;
  const unsigned int wid = threadIdx.x / warpSize;

  __shared__ T shared[32];

  T sum = T(0);

  // Grid-stride loop for global reduction
  for (int i = idx; i < n; i += stride) {
    // hijx is indexed as hijx[offset + row * m]
    T val0 = hijx[k0 + i * m];
    T val1 = hijx[k1 + i * m];
    sum += val0 * Ljjxinv[i] * val1;
  }

  // Warp-level reduction using the reduce_warp helper
  sum = reduce_warp<T>(sum);

  // Write each warp's partial sum to shared memory
  if (lane == 0) {
    shared[wid] = sum;
  }
  __syncthreads();

  // First warp reduces the warp sums in shared memory
  sum = (threadIdx.x < blockDim.x / warpSize) ? shared[lane] : T(0);
  if (wid == 0) {
    sum = reduce_warp<T>(sum);
  }

  // Write final result from first thread of block
  if (threadIdx.x == 0) {
    buf_h[blockIdx.x] = sum;
  }
}

template< typename T >
__global__ void mmasumAA_kernel(const T* __restrict__ GG,
    const T* __restrict__ diagx, T* __restrict__ buf_h, const int n,
    const int m, const int k0, const int k1) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  const unsigned int lane = threadIdx.x % warpSize;
  const unsigned int wid = threadIdx.x / warpSize;

  __shared__ T shared[32];
  T sum = 0;
  for (int i = idx; i < n; i += str) {
    sum += GG[k0 + i * m] / diagx[i] * GG[k1 + i * m];
  }

  sum = reduce_warp<T>(sum);
  if (lane == 0)
    shared[wid] = sum;
  __syncthreads();

  sum = (threadIdx.x < blockDim.x / warpSize) ? shared[lane] : 0;
  if (wid == 0)
    sum = reduce_warp<T>(sum);

  if (threadIdx.x == 0)
    buf_h[blockIdx.x] = sum;
}


template <typename T>
__global__ void mma_copy_kernel(T* __restrict__ a, const T* __restrict__ b,
    const int n, const int m) {
  int tj = blockIdx.x * blockDim.x + threadIdx.x;
  if (tj < n)
    a[tj + m] = b[tj];
}

template <typename T>
__global__ void AA_kernel(T* __restrict__ temp, const T* __restrict__ GG,
    const T* __restrict__ diagx, const int n, const int m) {
  int tj = blockIdx.x * blockDim.x + threadIdx.x;
  if (tj < n) {
    for (int i0 = 0; i0 < m; i0++) {
      for (int i1 = 0; i1 < m; i1++) {
        temp[tj + i0 * (n + 1) + i1 * (m + 1) * (n + 1)] = GG[i0 * n + tj] *
            (1.0 / diagx[tj]) * GG[i1 * n + tj];
      }
    }
  }
}


template <typename T>
__global__ void dx_kernel(T* __restrict__ dx, const T* __restrict__ delx,
    const T* __restrict__ diagx, const T* __restrict__ GG,
    const T* __restrict__ dlambda, const int n, const int m) {
  int tj = blockIdx.x * blockDim.x + threadIdx.x;
  if (tj < n) {
    dx[tj] = -delx[tj] / diagx[tj];
    for (int i = 0; i < m; i++) {
      dx[tj] = dx[tj] - GG[tj * m + i] * dlambda[i] / diagx[tj];
    }
  }
}


template <typename T>
__global__ void dxsi_kernel(T* __restrict__ dxsi, const T* __restrict__ xsi,
    const T* __restrict__ dx, const T* __restrict__ x,
    const T* __restrict__ alpha, const T epsi, const int n) {
  int tj = blockIdx.x * blockDim.x + threadIdx.x;
  if (tj < n) {
    dxsi[tj] = -xsi[tj] + (epsi - dx[tj] * xsi[tj]) / (x[tj] - alpha[tj]);
  }
}


template <typename T>
__global__ void deta_kernel(T* __restrict__ deta, const T* __restrict__ eta,
    const T* __restrict__ dx, const T* __restrict__ x,
    const T* __restrict__ beta, const T epsi, const int n) {
  int tj = blockIdx.x * blockDim.x + threadIdx.x;
  if (tj < n) {
    deta[tj] = -eta[tj] + (epsi + dx[tj] * eta[tj]) / (beta[tj] - x[tj]);
  }
}

template <typename T>
__global__ void RexCalculation_kernel(
    T* __restrict__ rex,
    const T* __restrict__ x,
    const T* __restrict__ xlow,
    const T* __restrict__ xupp,
    const T* __restrict__ pij,
    const T* __restrict__ p0j,
    const T* __restrict__ qij,
    const T* __restrict__ q0j,
    const T* __restrict__ lambda,
    const T* __restrict__ xsi,
    const T* __restrict__ eta,
    const int n,
    const int m)
{
  int tj = blockIdx.x * blockDim.x + threadIdx.x;
  if (tj < n) {
    T upp_diff = xupp[tj] - x[tj];
    T low_diff = x[tj] - xlow[tj];
    T upp_diff_sq = upp_diff * upp_diff;
    T low_diff_sq = low_diff * low_diff;

    T sum = 0.0;
    for (int i = 0; i < m; ++i) {
      sum += pij[i + tj * m] * lambda[i] / upp_diff_sq
           - qij[i + tj * m] * lambda[i] / low_diff_sq;
    }

    rex[tj] = sum
            + p0j[tj] / upp_diff_sq
            - q0j[tj] / low_diff_sq
            - xsi[tj] + eta[tj];
  }
}


template <typename T>
__global__ void rey_calculation_kernel(T* __restrict__ rey,
    const T* __restrict__ c, const T* __restrict__ d, const T* __restrict__ y,
    const T* __restrict__ lambda, const T* __restrict__ mu, const int n) {
  int tj = blockIdx.x * blockDim.x + threadIdx.x;
  if (tj < n) {
    rey[tj] = c[tj] + d[tj] * y[tj] - lambda[tj] - mu[tj];
  }
}

template <typename T>
__global__ void norm_kernel(const T* __restrict__ a,
    T* __restrict__ buf_h, const int n) {
  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  const unsigned int lane = threadIdx.x % warpSize;
  const unsigned int wid = threadIdx.x / warpSize;

  __shared__ T shared[32];
  T sum = T(0);

  for (int i = idx; i < n; i += str) {
    T val = a[i];
    sum += val * val; // faster than pow(val, 2)
  }

  sum = reduce_warp<T>(sum); // warp-level sum reduction
  if (lane == 0)
    shared[wid] = sum;
  __syncthreads();

  sum = (threadIdx.x < blockDim.x / warpSize) ? shared[lane] : T(0);
  if (wid == 0)
    sum = reduce_warp<T>(sum);

  if (threadIdx.x == 0)
    buf_h[blockIdx.x] = sum;
}

template <typename T>
__global__ void sub2cons_kernel(T* __restrict__ a, const T* __restrict__ b,
    const T* __restrict__ c,
    const T d, const int n) {
  int tj = blockIdx.x * blockDim.x + threadIdx.x;
  if (tj < n) {
    a[tj] = b[tj] * c[tj] - d;
  }
}


template <typename T>
__global__ void dely_kernel(T* __restrict__ dely, const T* __restrict__ c,
    const T* __restrict__ d, const T* __restrict__ y,
    const T* __restrict__ lambda, const T epsi, const int n) {
  int tj = blockIdx.x * blockDim.x + threadIdx.x;
  if (tj < n) {
    dely[tj] = c[tj] + d[tj] * y[tj] - lambda[tj] - epsi / y[tj];
  }
}

template< typename T >
__global__ void maxval2_kernel(const T* __restrict__ a, const T* __restrict__ b,
    T* __restrict__ temp, const T cons, const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  const unsigned int lane = threadIdx.x % warpSize;
  const unsigned int wid = threadIdx.x / warpSize;

  __shared__ T shared[32];
  T maxval = cons * a[0] / b[0];
  for (int i = idx; i < n; i += str) {
    maxval = max(maxval, cons * a[i] / b[i]);
  }

  maxval = max_reduce_warp<T>(maxval);
  if (lane == 0)
    shared[wid] = maxval;
  __syncthreads();

  maxval = (threadIdx.x < blockDim.x / warpSize) ? shared[lane] : 0.0;
  if (wid == 0)
    maxval = max_reduce_warp<T>(maxval);

  if (threadIdx.x == 0)
    temp[blockIdx.x] = maxval;
}


template< typename T >
__global__ void maxval3_kernel(const T* __restrict__ a, const T* __restrict__ b,
    const T* __restrict__ c, T* __restrict__ temp, const T cons, const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  const unsigned int lane = threadIdx.x % warpSize;
  const unsigned int wid = threadIdx.x / warpSize;

  __shared__ T shared[32];
  T maxval = cons * a[0] / b[0];
  for (int i = idx; i < n; i += str) {
    maxval = max(maxval, cons * a[i] / (b[i] - c[i]));
  }

  maxval = max_reduce_warp<T>(maxval);
  if (lane == 0)
    shared[wid] = maxval;
  __syncthreads();

  maxval = (threadIdx.x < blockDim.x / warpSize) ? shared[lane] : 0;
  if (wid == 0)
    maxval = max_reduce_warp<T>(maxval);

  if (threadIdx.x == 0)
    temp[blockIdx.x] = maxval;
}

template <typename T>
__global__ void kkt_rex_kernel(T* __restrict__ rex, const T* __restrict__ df0dx,
    const T* __restrict__ dfdx, const T* __restrict__ xsi,
    const T* __restrict__ eta, const T* __restrict__ lambda, const int n,
    const int m) {
  int tj = blockIdx.x * blockDim.x + threadIdx.x;
  if (tj < n) {
    rex[tj] = 0.0;
    for (int i = 0; i < m; i++) {
      rex[tj] = rex[tj] + dfdx[i + tj * m] * lambda[i];
    }
    rex[tj] += df0dx[tj] - xsi[tj] + eta[tj];
  }
}


template <typename T>
__global__ void maxcons_kernel(T* __restrict__ a, const T b,
    const T c, const T* __restrict__ d, const int n) {
  int tj = blockIdx.x * blockDim.x + threadIdx.x;
  if (tj < n) {
    a[tj] = max(b, c * d[tj]);
  }
}

template< typename T >
__global__ void glsum_kernel(const T* a, T* buf_h, const int n) {
  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  const unsigned int lane = threadIdx.x % warpSize;
  const unsigned int wid = threadIdx.x / warpSize;

  __shared__ T shared[32];
  T sum = 0;
  for (int i = idx; i < n; i += str) {
    sum += a[i];
  }

  sum = reduce_warp<T>(sum);
  if (lane == 0)
    shared[wid] = sum;
  __syncthreads();

  sum = (threadIdx.x < blockDim.x / warpSize) ? shared[lane] : 0;
  if (wid == 0)
    sum = reduce_warp<T>(sum);

  if (threadIdx.x == 0)
    buf_h[blockIdx.x] = sum;
}

template< typename T >
__global__ void glsc2_kernel(const T* a,
    const T* b,
    T* buf_h,
    const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  const unsigned int lane = threadIdx.x % warpSize;
  const unsigned int wid = threadIdx.x / warpSize;

  __shared__ T shared[32];
  T sum = 0.0;
  for (int i = idx; i < n; i += str) {
    sum += a[i] * b[i];
  }

  sum = reduce_warp<T>(sum);
  if (lane == 0)
    shared[wid] = sum;
  __syncthreads();

  sum = (threadIdx.x < blockDim.x / warpSize) ? shared[lane] : 0;
  if (wid == 0)
    sum = reduce_warp<T>(sum);

  if (threadIdx.x == 0)
    buf_h[blockIdx.x] = sum;
}

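/*
 * Added illustrative sketch (not part of the original header): glsum_kernel
 * and glsc2_kernel follow the same two-stage pattern as the max reduction
 * above, leaving one partial sum per block in buf_h, which can then be
 * collapsed with mmareduce_kernel. The hypothetical driver below computes
 * the dot product sum_i a[i]*b[i]; allocation of the scratch buffer and the
 * final device-to-host copy are assumed to happen elsewhere.
 */
template <typename T>
inline void device_glsc2_example(const T* a_d, const T* b_d, T* buf_d,
    const int n) {
  const int nthreads = 1024;
  const int nblocks = (n + nthreads - 1) / nthreads;

  // Stage 1: per-block partial dot products
  glsc2_kernel<T><<<nblocks, nthreads>>>(a_d, b_d, buf_d, n);

  // Stage 2: reduce the nblocks partial sums with a single block
  mmareduce_kernel<T><<<1, nthreads>>>(buf_d, nblocks);
  // buf_d[0] now holds the dot product
}
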
template <typename T>
__global__ void add2inv2_kernel(T* __restrict__ a, const T* __restrict__ b,
    const T c, const int n) {
  int tj = blockIdx.x * blockDim.x + threadIdx.x;
  if (tj < n) {
    a[tj] = a[tj] + c / b[tj];
  }
}

template <typename T>
__global__ void max2_kernel(T* __restrict__ a, const T b,
    const T* __restrict__ c, const T d, const int n) {
  int tj = blockIdx.x * blockDim.x + threadIdx.x;
  if (tj < n) {
    a[tj] = max(b, d * c[tj]);
  }
}

template <typename T>
__global__ void updatebb_kernel(T* __restrict__ bb,
    const T* __restrict__ dellambda, const T* __restrict__ dely,
    const T* __restrict__ d, const T* __restrict__ mu,
    const T* __restrict__ y, const T delz, const int m) {
  int tj = blockIdx.x * blockDim.x + threadIdx.x;
  if (tj < m)
    bb[tj] = dellambda[tj] + dely[tj] / (d[tj] + mu[tj] / y[tj]) - bb[tj];
  else if (tj < m + 1)
    bb[tj] = delz;
}

1106
1107
1108template <typename T>
1109__global__ void updateAA_kernel(T* __restrict__ AA,
1110 const T* __restrict__ globaltmp_mm, const T* __restrict__ s,
1111 const T* __restrict__ lambda, const T* __restrict__ d,
1112 const T* __restrict__ mu, const T* __restrict__ y, const T* __restrict__ a,
1113 const T zeta, const T z, const int m) {
1114 int tj = blockIdx.x * blockDim.x + threadIdx.x;
1115 if(tj<m)
1116 {
1117 AA[tj+tj*(m+1)]=globaltmp_mm[tj+tj*m] + (s[tj] / lambda[tj] +
1118 1.0/ (d[tj] + mu[tj] / y[tj]));
1119 AA[tj+m*(m+1)]=a[tj];
1120 AA[m+tj*(m+1)]=a[tj];
1121 }
1122 else if(tj<m+1)
1123 AA[tj+tj*(m+1)]= -zeta/z;
1124}
1125
1126template <typename T>
1127__global__ void dy_kernel(T* __restrict__ dy, const T* __restrict__ dely,
1128 const T* __restrict__ dlambda, const T* __restrict__ d,
1129 const T* __restrict__ mu, const T* __restrict__ y, const int n) {
1130 int tj = blockIdx.x * blockDim.x + threadIdx.x;
1131 if(tj<n)
1132 dy[tj] = (-dely[tj]+dlambda[tj])/(d[tj] + mu[tj]/y[tj]);
1133}
1134
1135#endif
1136