Neko-TOP
A portable framework for high-order spectral element flow topology optimization.
mma_kernel.h
/*
 Copyright (c) 2021-2025, The Neko Authors
 All rights reserved.

 Redistribution and use in source and binary forms, with or without
 modification, are permitted provided that the following conditions
 are met:

   * Redistributions of source code must retain the above copyright
     notice, this list of conditions and the following disclaimer.

   * Redistributions in binary form must reproduce the above
     copyright notice, this list of conditions and the following
     disclaimer in the documentation and/or other materials provided
     with the distribution.

   * Neither the name of the authors nor the names of its
     contributors may be used to endorse or promote products derived
     from this software without specific prior written permission.

 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
 COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
 ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 POSSIBILITY OF SUCH DAMAGE.
*/

#ifndef MMA_HIP_KERNEL_H
#define MMA_HIP_KERNEL_H

template <typename T>
__global__ void delta_1dbeam_kernel(T* __restrict__ Delta,
    const T L_total, const T Le, const int offset, const int n) {
  int k = blockIdx.x * blockDim.x + threadIdx.x;
  if (k >= n) return;

  // Convert to 1-based indexing for the calculation
  int idx = k + 1;

  // Calculate first term: (L_total - Le*(offset+idx-1))^3
  T x1 = L_total - Le * static_cast<T>(offset + idx - 1);
  T term1 = x1 * x1 * x1;

  // Calculate second term: (L_total - Le*(offset+idx))^3
  T x2 = L_total - Le * static_cast<T>(offset + idx);
  T term2 = x2 * x2 * x2;

  // Final result
  Delta[k] = (term1 - term2) / static_cast<T>(3.0);
}
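
/*
 Usage sketch (illustrative only, not code from Neko-TOP): a hypothetical
 host-side launch with one thread per element and a 256-thread block. The
 device pointer Delta_d, the stream, and the block size are assumptions,
 not names taken from the library.

   const int nthreads = 256;
   const int nblocks = (n + nthreads - 1) / nthreads;
   delta_1dbeam_kernel<double><<<nblocks, nthreads, 0, stream>>>(
       Delta_d, L_total, Le, offset, n);
   // or, using the HIP macro form:
   // hipLaunchKernelGGL(delta_1dbeam_kernel<double>, dim3(nblocks),
   //                    dim3(nthreads), 0, stream,
   //                    Delta_d, L_total, Le, offset, n);
*/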

template <typename T>
__global__ void mma_Ljjxinv_kernel(T* __restrict__ Ljjxinv,
    const T* __restrict__ pjlambda, const T* __restrict__ qjlambda,
    const T* __restrict__ x, const T* __restrict__ low, const T* __restrict__ upp,
    const T* __restrict__ alpha, const T* __restrict__ beta,
    const int n) {
  int tj = blockIdx.x * blockDim.x + threadIdx.x;
  if (tj >= n) return;

  // Load inputs into registers
  const T xt = x[tj];
  const T low_j = low[tj];
  const T upp_j = upp[tj];
  const T pj = pjlambda[tj];
  const T qj = qjlambda[tj];

  // Precompute reused differences
  const T diff_u = upp_j - xt;
  const T diff_l = xt - low_j;

  // Cube once (avoiding pow for speed)
  const T diff_u3 = diff_u * diff_u * diff_u;
  const T diff_l3 = diff_l * diff_l * diff_l;

  // Compute the inverse of the diagonal entry
  T denom = 2.0 * pj / diff_u3 + 2.0 * qj / diff_l3;
  T val = -1.0 / denom;

  // Mask out variables sitting on an active bound (alpha or beta)
  bool active = (fabs(xt - alpha[tj]) <= T(1e-16)) ||
                (fabs(xt - beta[tj]) <= T(1e-16));

  Ljjxinv[tj] = active ? T(0.0) : val;
}
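
/*
 For reference, the value written above is

     Ljjxinv_j = -1 / ( 2*pjlambda_j/(upp_j - x_j)^3
                        + 2*qjlambda_j/(x_j - low_j)^3 ),

 zeroed for variables whose bound (alpha_j or beta_j) is active. This
 restates what the kernel computes; it is an editorial note, not
 documentation taken from the Neko-TOP sources.
*/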

template <typename T>
__global__ void mma_dipsolvesub1_kernel(T* __restrict__ x,
    const T* __restrict__ pjlambda, const T* __restrict__ qjlambda,
    const T* __restrict__ low, const T* __restrict__ upp,
    const T* __restrict__ alpha, const T* __restrict__ beta,
    const int n) {

  int tj = blockIdx.x * blockDim.x + threadIdx.x;
  if (tj >= n) return;

  // Load inputs into registers
  const T pj_sqrt = sqrt(pjlambda[tj]);
  const T qj_sqrt = sqrt(qjlambda[tj]);
  const T denom = pj_sqrt + qj_sqrt;

  const T low_j = low[tj];
  const T upp_j = upp[tj];
  const T alpha_j = alpha[tj];
  const T beta_j = beta[tj];

  // Weighted average
  const T val = (pj_sqrt * low_j + qj_sqrt * upp_j) / denom;

  // Clamp x between alpha and beta using branchless min/max
  x[tj] = fmin(fmax(val, alpha_j), beta_j);
}
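
/*
 In formula form, the update above is

     x_j = clamp( ( sqrt(pjlambda_j)*low_j + sqrt(qjlambda_j)*upp_j )
                  / ( sqrt(pjlambda_j) + sqrt(qjlambda_j) ),
                  alpha_j, beta_j ),

 i.e. the closed-form minimiser of the separable MMA dual subproblem term
 pjlambda_j/(upp_j - x_j) + qjlambda_j/(x_j - low_j), projected onto the
 move limits [alpha_j, beta_j]. This is a restatement of the code above in
 the usual MMA notation, added editorially.
*/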

template <typename T>
__global__ void mattrans_v_mul_kernel(T* __restrict__ output,
    const T* __restrict__ pij, const T* __restrict__ lambda,
    const int m, const int n) {

  int tj = blockIdx.x * blockDim.x + threadIdx.x;
  if (tj >= n) return;

  T acc = 0.0;

  // Use register accumulation
  for (int i = 0; i < m; ++i) {
    acc += pij[i + tj * m] * lambda[i];
  }

  output[tj] = acc;
}


template <typename T>
__global__ void mma_sub1_kernel(
    T* __restrict__ xlow,
    T* __restrict__ xupp,
    const T* __restrict__ x,
    const T* __restrict__ xmin,
    const T* __restrict__ xmax,
    const T asyinit,
    const int n) {
  int tj = blockIdx.x * blockDim.x + threadIdx.x;
  if (tj >= n) return;

  // Load values once into registers (global memory is slow)
  const T xt = x[tj];
  const T xmin_j = xmin[tj];
  const T xmax_j = xmax[tj];

  // Reuse xgap calculation
  const T xgap = xmax_j - xmin_j;
  const T offset = asyinit * xgap;

  xlow[tj] = xt - offset;
  xupp[tj] = xt + offset;
}

template< typename T >
__global__ void mma_sub2_kernel(T* __restrict__ low, T* __restrict__ upp,
    const T* __restrict__ x, const T* __restrict__ xold1,
    const T* __restrict__ xold2, const T* __restrict__ xdiff,
    const T asydecr, const T asyincr, const int n) {
  int tj = blockIdx.x * blockDim.x + threadIdx.x;
  if (tj >= n) return;

  // Load data into registers for faster access than repeated global-memory reads
  const T xval = x[tj];
  const T xold1val = xold1[tj];
  const T xold2val = xold2[tj];
  const T lowval = low[tj];
  const T uppval = upp[tj];
  const T xdiffval = xdiff[tj];

  // Product of the last two design updates; a negative sign indicates oscillation
  const T prod = (xval - xold1val) * (xold1val - xold2val);

  // Select the asymptote adaptation factor from the sign of the product
  T asy_factor = (prod < T(0)) ? asydecr :
                 (prod > T(0)) ? asyincr : T(1);

  // Update low and upp using fma (fused multiply-add) for accuracy
  T new_low = fma(-asy_factor, (xold1val - lowval), xval);
  T new_upp = fma(asy_factor, (uppval - xold1val), xval);

  // Apply bounds
  new_low = max(new_low, xval - T(10.0) * xdiffval);
  new_low = min(new_low, xval - T(0.01) * xdiffval);

  new_upp = min(new_upp, xval + T(10.0) * xdiffval);
  new_upp = max(new_upp, xval + T(0.01) * xdiffval);

  // Write results back
  low[tj] = new_low;
  upp[tj] = new_upp;
}
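
/*
 Editorial summary of the kernel above (not quoted from Neko-TOP
 documentation): this is the usual MMA asymptote adaptation. When successive
 design updates change sign (oscillation), the asymptotes are pulled toward
 x by asydecr; when they keep the same sign, they are moved outward by
 asyincr (typically asydecr < 1 < asyincr); the final clamps keep each
 asymptote between 0.01*xdiff and 10*xdiff away from x.
*/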

template <typename T>
__global__ void mma_sub3_kernel(const T* __restrict__ x,
    const T* __restrict__ df0dx, const T* __restrict__ dfdx,
    T* __restrict__ low, T* __restrict__ upp, const T* __restrict__ xmin,
    const T* __restrict__ xmax, T* __restrict__ alpha, T* __restrict__ beta,
    T* __restrict__ p0j, T* __restrict__ q0j, T* __restrict__ pij,
    T* __restrict__ qij, const int n, const int m) {
  int tj = blockIdx.x * blockDim.x + threadIdx.x;
  if (tj >= n) return;

  // Load into registers once
  const T xt = x[tj];
  const T xmin_j = xmin[tj];
  const T xmax_j = xmax[tj];
  const T low_j = low[tj];
  const T upp_j = upp[tj];
  const T df0 = df0dx[tj];
  const T xgap = xmax_j - xmin_j;

  // Clamp helpers
  const T half_xgap = 0.5 * xgap;
  const T tenth_low_diff = T(0.1) * (xt - low_j);
  const T tenth_upp_diff = T(0.1) * (upp_j - xt);

  // Compute alpha and beta with fused max/min and fewer calls
  T alpha_val = max(max(xmin_j, low_j + tenth_low_diff), xt - half_xgap);
  T beta_val = min(min(xmax_j, upp_j - tenth_upp_diff), xt + half_xgap);

  alpha[tj] = alpha_val;
  beta[tj] = beta_val;

  // Avoid multiple pow calls, compute once
  const T upp_minus_x = upp_j - xt;
  const T x_minus_low = xt - low_j;
  const T upp_minus_x_sq = upp_minus_x * upp_minus_x;
  const T x_minus_low_sq = x_minus_low * x_minus_low;

  // Small epsilon for numerical stability
  const T eps = T(1e-5);
  const T inv_xgap = T(1.0) / max(eps, xgap);

  // Compute terms reused multiple times
  const T max_df0_pos = max(df0, T(0));
  const T max_df0_neg = max(-df0, T(0));

  p0j[tj] = upp_minus_x_sq * (T(1.001) * max_df0_pos +
            T(0.001) * max_df0_neg + eps * inv_xgap);
  q0j[tj] = x_minus_low_sq * (T(0.001) * max_df0_pos +
            T(1.001) * max_df0_neg + eps * inv_xgap);

  // Loop over m for pij and qij
  for (int i = 0; i < m; ++i) {
    int idx = i + tj * m;

    T dfdx_val = dfdx[idx];
    T max_pos = max(dfdx_val, T(0));
    T max_neg = max(-dfdx_val, T(0));

    pij[idx] = upp_minus_x_sq * (T(1.001) * max_pos +
               T(0.001) * max_neg + eps * inv_xgap);
    qij[idx] = x_minus_low_sq * (T(0.001) * max_pos +
               T(1.001) * max_neg + eps * inv_xgap);
  }
}
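
/*
 For reference, the coefficients computed above follow the standard MMA
 approximation (cf. Svanberg's MMA/GCMMA formulas):

     p0j = (upp - x)^2 * ( 1.001*max(df0dx, 0) + 0.001*max(-df0dx, 0)
                           + eps / max(eps, xmax - xmin) )
     q0j = (x - low)^2 * ( 0.001*max(df0dx, 0) + 1.001*max(-df0dx, 0)
                           + eps / max(eps, xmax - xmin) )

 with the same pattern applied per constraint for pij/qij using dfdx. The
 attribution to Svanberg's formulation is an editorial note, not taken from
 the Neko-TOP sources.
*/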

template <typename T>
__global__ void mma_sub4_kernel(
    const T* __restrict__ x,
    const T* __restrict__ low,
    const T* __restrict__ upp,
    const T* __restrict__ pij,
    const T* __restrict__ qij,
    T* __restrict__ temp,
    const int n,
    const int m) {

  int tj = blockIdx.x * blockDim.x + threadIdx.x;
  if (tj >= n) return;

  // Register caching
  const T xt = x[tj];
  const T low_j = low[tj];
  const T upp_j = upp[tj];

  const T denom_upp = upp_j - xt;
  const T denom_low = xt - low_j;

  const T eps = T(1e-12); // Precision-dependent epsilon
  const T inv_denom_upp = T(1) / max(denom_upp, eps);
  const T inv_denom_low = T(1) / max(denom_low, eps);

  const int base_idx = tj * m;

  for (int i = 0; i < m; ++i) {
    int idx = base_idx + i;
    temp[idx] = pij[idx] * inv_denom_upp + qij[idx] * inv_denom_low;
  }
}


template <typename T>
__global__ void mma_max2_kernel(
    T* __restrict__ xsi,
    const T* __restrict__ x,
    const T* __restrict__ alpha,
    const int n) {

  int tj = blockIdx.x * blockDim.x + threadIdx.x;
  if (tj >= n) return;

  const T eps = T(1e-12);
  T denom = max(x[tj] - alpha[tj], eps);
  xsi[tj] = max(T(1), T(1) / denom);
}

template <typename T>
__global__ void relambda_kernel(
    T* __restrict__ temp,
    const T* __restrict__ x,
    const T* __restrict__ xupp,
    const T* __restrict__ xlow,
    const T* __restrict__ pij,
    const T* __restrict__ qij,
    const int n,
    const int m) {

  int tj = blockIdx.x * blockDim.x + threadIdx.x;
  if (tj >= n) return;

  const T xt = x[tj];
  const T xup = xupp[tj];
  const T xlo = xlow[tj];

  // Prevent divide-by-zero using small epsilon
  const T eps = T(1e-12);
  const T inv_denom_upp = T(1) / max(xup - xt, eps);
  const T inv_denom_low = T(1) / max(xt - xlo, eps);

  int base_idx = tj * m;
  for (int i = 0; i < m; ++i) {
    int idx = base_idx + i;
    temp[idx] = pij[idx] * inv_denom_upp + qij[idx] * inv_denom_low;
  }
}


template <typename T>
__global__ void sub2cons2_kernel(
    T* __restrict__ a,
    const T* __restrict__ b,
    const T* __restrict__ c,
    const T* __restrict__ d,
    const T e,
    const int n) {
  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  if (idx >= n) return;

  const T bt = b[idx];
  const T ct = c[idx];
  const T dt = d[idx];

  a[idx] = bt * (ct - dt) - e;
}

template <typename T>
__inline__ __device__ T max_reduce_warp(T val) {
  int w = warpSize;
  for (int offset = w / 2; offset > 0; offset /= 2)
    val = max(val, __shfl_down(val, offset, w));
  return val;
}
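
/*
 Editorial note on the pattern above (not taken from the Neko-TOP sources):
 __shfl_down(val, offset, w) is the HIP warp-shuffle intrinsic and reads val
 from the lane `offset` positions higher within the warp. Halving the offset
 each iteration folds the warp's values pairwise, so with a 64-wide AMD
 wavefront the offsets run 32, 16, 8, 4, 2, 1 and after six steps lane 0
 holds the maximum over all lanes; only lane 0's result is meaningful
 afterwards.
*/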

template< typename T >
__global__ void maxval_kernel(const T* __restrict__ a, T *temp, const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  const unsigned int lane = threadIdx.x % warpSize;
  const unsigned int wid = threadIdx.x / warpSize;

  __shared__ T shared[16];
  T maxval = 0.0;
  for (int i = idx; i < n; i += str) {
    maxval = max(maxval, abs(a[i]));
  }

  maxval = max_reduce_warp<T>(maxval);
  if (lane == 0)
    shared[wid] = maxval;
  __syncthreads();

  maxval = (threadIdx.x < blockDim.x / warpSize) ? shared[lane] : 0.0;
  if (wid == 0)
    maxval = max_reduce_warp<T>(maxval);

  if (threadIdx.x == 0)
    temp[blockIdx.x] = maxval;

}


template <typename T>
__global__ void max_reduce_kernel(T* __restrict__ bufred, const int n) {

  T maxval = T(0);
  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int stride = blockDim.x * gridDim.x;

  // Grid-stride loop to cover all elements
  for (int i = idx; i < n; i += stride) {
    maxval = max(maxval, bufred[i]);
  }

  __shared__ T shared[16]; // One slot per warp (max 1024 threads/block)

  unsigned int lane = threadIdx.x % warpSize;
  unsigned int wid = threadIdx.x / warpSize;

  // Warp-level max reduction
  maxval = max_reduce_warp<T>(maxval);

  // Store each warp's max value in shared memory
  if (lane == 0) {
    shared[wid] = maxval;
  }
  __syncthreads();

  // Let the first warp reduce the warp-level maxima
  maxval = (threadIdx.x < blockDim.x / warpSize) ? shared[lane] : T(0);
  if (wid == 0) {
    maxval = max_reduce_warp<T>(maxval);
  }

  // Write the block's maximum value back to bufred[blockIdx.x]
  if (threadIdx.x == 0) {
    bufred[blockIdx.x] = maxval;
  }
}
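
/*
 Usage sketch (an assumption about typical use, not code from Neko-TOP): a
 device-wide maximum is obtained in two passes. maxval_kernel writes one
 partial maximum per block into a scratch buffer, and max_reduce_kernel is
 then run over those partials (here with a single block) so that element 0
 holds the final value.

   maxval_kernel<double><<<nblocks, nthreads>>>(a_d, bufred_d, n);
   max_reduce_kernel<double><<<1, nthreads>>>(bufred_d, nblocks);
   // bufred_d[0] now holds max_i |a_d[i]|; the buffer names are hypothetical.
*/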


template <typename T>
__global__ void delx_kernel(
    T* __restrict__ delx,
    const T* __restrict__ x,
    const T* __restrict__ xlow,
    const T* __restrict__ xupp,
    const T* __restrict__ pij,
    const T* __restrict__ qij,
    const T* __restrict__ p0j,
    const T* __restrict__ q0j,
    const T* __restrict__ alpha,
    const T* __restrict__ beta,
    const T* __restrict__ lambda,
    const T epsi,
    const int n,
    const int m)
{
  int tj = blockIdx.x * blockDim.x + threadIdx.x;
  if (tj < n) {
    T xt = x[tj];
    T xlow_j = xlow[tj];
    T xupp_j = xupp[tj];
    T alpha_j = alpha[tj];
    T beta_j = beta[tj];

    // Precompute the denominators once; their squares are formed below
    T denom_low = xt - xlow_j;
    T denom_upp = xupp_j - xt;
    T denom_alpha = xt - alpha_j;
    T denom_beta = beta_j - xt;

    const T small_eps = T(1e-12);
    denom_low = (abs(denom_low) < small_eps) ? small_eps : denom_low;
    denom_upp = (abs(denom_upp) < small_eps) ? small_eps : denom_upp;
    denom_alpha = (abs(denom_alpha) < small_eps) ? small_eps : denom_alpha;
    denom_beta = (abs(denom_beta) < small_eps) ? small_eps : denom_beta;

    T sum = T(0);
    for (int i = 0; i < m; i++) {
      T lambda_i = lambda[i];
      sum += pij[i + tj * m] * lambda_i / (denom_upp * denom_upp)
           - qij[i + tj * m] * lambda_i / (denom_low * denom_low);
    }
    sum += p0j[tj] / (denom_upp * denom_upp)
         - q0j[tj] / (denom_low * denom_low)
         - epsi / denom_alpha
         + epsi / denom_beta;

    delx[tj] = sum;
  }
}
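
/*
 Editorial note (restating the kernel above, not Neko-TOP documentation):

     delx_j = sum_i lambda_i * ( pij/(xupp_j - x_j)^2 - qij/(x_j - xlow_j)^2 )
              + p0j/(xupp_j - x_j)^2 - q0j/(x_j - xlow_j)^2
              - epsi/(x_j - alpha_j) + epsi/(beta_j - x_j),

 i.e. the x_j-derivative of the lambda-weighted MMA approximating functions
 plus the epsi barrier terms of the interior-point subsolver.
*/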

template <typename T>
__global__ void GG_kernel(
    T* __restrict__ GG,
    const T* __restrict__ x,
    const T* __restrict__ xlow,
    const T* __restrict__ xupp,
    const T* __restrict__ pij,
    const T* __restrict__ qij,
    const int n,
    const int m) {

  int tj = blockIdx.x * blockDim.x + threadIdx.x;
  if (tj < n) {
    T xt = x[tj];
    T xlow_j = xlow[tj];
    T xupp_j = xupp[tj];

    // Distances from bounds
    T diff_upper = xupp_j - xt;
    T diff_lower = xt - xlow_j;

    // Squared distances
    T diff_upper2 = diff_upper * diff_upper;
    T diff_lower2 = diff_lower * diff_lower;

    for (int i = 0; i < m; i++) {
      int idx = i + tj * m;
      GG[idx] = pij[idx] / diff_upper2 - qij[idx] / diff_lower2;
    }
  }
}

template <typename T>
__global__ void diagx_kernel(T* __restrict__ diagx, const T* __restrict__ x,
    const T* __restrict__ xsi, const T* __restrict__ xlow,
    const T* __restrict__ xupp, const T* __restrict__ p0j,
    const T* __restrict__ q0j, const T* __restrict__ pij,
    const T* __restrict__ qij, const T* alpha, const T* beta,
    const T* eta, const T* lambda, const int n, const int m) {
  int tj = blockIdx.x * blockDim.x + threadIdx.x;
  if (tj < n) {
    T sum = 0;
    T sum1 = 0;
    for (int i = 0; i < m; i++) {
      sum = sum + pij[tj * m + i] * lambda[i];
      sum1 = sum1 + qij[tj * m + i] * lambda[i];
    }
    diagx[tj] = (p0j[tj] + sum) / pow(xupp[tj] - x[tj], 3) +
                (q0j[tj] + sum1) / pow(x[tj] - xlow[tj], 3);
    diagx[tj] = 2.0 * diagx[tj] + xsi[tj] / (x[tj] - alpha[tj]) +
                eta[tj] / (beta[tj] - x[tj]);
  }
}


template <typename T>
__inline__ __device__ T reduce_warp(T val) {
  int w = warpSize;
  for (int offset = w / 2; offset > 0; offset /= 2)
    val += __shfl_down(val, offset, w);
  return val;
}

template <typename T>
__global__ void mmareduce_kernel(T* __restrict__ bufred, const int n) {

  T sum = 0;
  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;
  for (int i = idx; i < n; i += str)
  {
    sum += bufred[i];
  }

  __shared__ T shared[16];
  unsigned int lane = threadIdx.x % warpSize;
  unsigned int wid = threadIdx.x / warpSize;

  sum = reduce_warp<T>(sum);
  if (lane == 0)
    shared[wid] = sum;
  __syncthreads();

  sum = (threadIdx.x < blockDim.x / warpSize) ? shared[lane] : 0;
  if (wid == 0)
    sum = reduce_warp<T>(sum);

  if (threadIdx.x == 0)
    bufred[blockIdx.x] = sum;
}


template< typename T >
__global__ void mmasum_kernel(const T* __restrict__ a, T* __restrict__ buf_h,
    const int n, const int m, const int k) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  const unsigned int lane = threadIdx.x % warpSize;
  const unsigned int wid = threadIdx.x / warpSize;

  __shared__ T shared[16];
  T sum = 0;
  for (int i = idx; i < n; i += str)
  {
    sum += a[m * i + k];
  }

  sum = reduce_warp<T>(sum);
  if (lane == 0)
    shared[wid] = sum;
  __syncthreads();

  sum = (threadIdx.x < blockDim.x / warpSize) ? shared[lane] : 0;
  if (wid == 0)
    sum = reduce_warp<T>(sum);

  if (threadIdx.x == 0)
    buf_h[blockIdx.x] = sum;

}
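
/*
 Editorial note (inferred from the indexing above, not from Neko-TOP
 documentation): mmasum_kernel sums column k of an n-by-m array stored
 row-major as a[m*i + k], writing one partial sum per block into buf_h.
 The partials are expected to be finished by a second pass, e.g. the
 mmareduce_kernel above or a host-side sum over the block results.
*/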
template< typename T >
__global__ void mmasumbb_kernel(const T* __restrict__ GG,
    const T* __restrict__ delx, const T* __restrict__ diagx,
    T* __restrict__ buf_h, const int n, const int m, const int k) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  const unsigned int lane = threadIdx.x % warpSize;
  const unsigned int wid = threadIdx.x / warpSize;

  __shared__ T shared[16];
  T sum = 0;
  for (int i = idx; i < n; i += str)
  {
    sum += GG[k + i * m] * delx[i] / diagx[i];
  }

  sum = reduce_warp<T>(sum);
  if (lane == 0)
    shared[wid] = sum;
  __syncthreads();

  sum = (threadIdx.x < blockDim.x / warpSize) ? shared[lane] : 0;
  if (wid == 0)
    sum = reduce_warp<T>(sum);

  if (threadIdx.x == 0)
    buf_h[blockIdx.x] = sum;

}

template <typename T>
__global__ void mmasumHess_kernel(
    const T* __restrict__ hijx,
    const T* __restrict__ Ljjxinv,
    T* __restrict__ buf_h,
    const int n,
    const int m,
    const int k0,
    const int k1)
{
  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int stride = blockDim.x * gridDim.x;

  // Warp lane and warp id within the block
  const unsigned int lane = threadIdx.x % warpSize;
  const unsigned int wid = threadIdx.x / warpSize;

  __shared__ T shared[16];

  T sum = T(0);

  // Grid-stride loop for the global reduction
  for (int i = idx; i < n; i += stride) {
    // hijx is indexed as hijx[offset + row * m]
    T val0 = hijx[k0 + i * m];
    T val1 = hijx[k1 + i * m];
    sum += val0 * Ljjxinv[i] * val1;
  }

  // Warp-level reduction using reduce_warp
  sum = reduce_warp<T>(sum);

  // Write each warp's partial sum to shared memory
  if (lane == 0) {
    shared[wid] = sum;
  }
  __syncthreads();

  // First warp reduces the warp sums in shared memory
  sum = (threadIdx.x < blockDim.x / warpSize) ? shared[lane] : T(0);
  if (wid == 0) {
    sum = reduce_warp<T>(sum);
  }

  // Write the final result from the first thread of the block
  if (threadIdx.x == 0) {
    buf_h[blockIdx.x] = sum;
  }
}

template< typename T >
__global__ void mmasumAA_kernel(const T* __restrict__ GG,
    const T* __restrict__ diagx, T* __restrict__ buf_h, const int n,
    const int m, const int k0, const int k1) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  const unsigned int lane = threadIdx.x % warpSize;
  const unsigned int wid = threadIdx.x / warpSize;

  __shared__ T shared[16];
  T sum = 0;
  for (int i = idx; i < n; i += str)
  {
    sum += GG[k0 + i * m] / diagx[i] * GG[k1 + i * m];
  }

  sum = reduce_warp<T>(sum);
  if (lane == 0)
    shared[wid] = sum;
  __syncthreads();

  sum = (threadIdx.x < blockDim.x / warpSize) ? shared[lane] : 0;
  if (wid == 0)
    sum = reduce_warp<T>(sum);

  if (threadIdx.x == 0)
    buf_h[blockIdx.x] = sum;

}


template <typename T>
__global__ void mma_copy_kernel(T* __restrict__ a, const T* __restrict__ b,
    const int n, const int m) {
  int tj = blockIdx.x * blockDim.x + threadIdx.x;
  if (tj < n)
    a[tj + m] = b[tj];
}


template <typename T>
__global__ void AA_kernel(T* __restrict__ temp, const T* __restrict__ GG,
    const T* __restrict__ diagx, const int n, const int m) {
  int tj = blockIdx.x * blockDim.x + threadIdx.x;
  if (tj < n) {
    for (int i0 = 0; i0 < m; i0++) {
      for (int i1 = 0; i1 < m; i1++) {
        temp[tj + i0 * (n + 1) + i1 * (m + 1) * (n + 1)] = GG[i0 * n + tj] *
            (1.0 / diagx[tj]) * GG[i1 * n + tj];
      }
    }
  }
}


template <typename T>
__global__ void dx_kernel(T* __restrict__ dx, const T* __restrict__ delx,
    const T* __restrict__ diagx, const T* __restrict__ GG,
    const T* __restrict__ dlambda, const int n, const int m) {
  int tj = blockIdx.x * blockDim.x + threadIdx.x;
  if (tj < n) {
    dx[tj] = -delx[tj] / diagx[tj];
    for (int i = 0; i < m; i++) {
      dx[tj] = dx[tj] - GG[tj * m + i] * dlambda[i] / diagx[tj];
    }
  }
}


template <typename T>
__global__ void dxsi_kernel(T* __restrict__ dxsi, const T* __restrict__ xsi,
    const T* __restrict__ dx, const T* __restrict__ x,
    const T* __restrict__ alpha, const T epsi, const int n) {
  int tj = blockIdx.x * blockDim.x + threadIdx.x;
  if (tj < n) {
    dxsi[tj] = -xsi[tj] + (epsi - dx[tj] * xsi[tj]) / (x[tj] - alpha[tj]);
  }
}


template <typename T>
__global__ void deta_kernel(T* __restrict__ deta, const T* __restrict__ eta,
    const T* __restrict__ dx, const T* __restrict__ x,
    const T* __restrict__ beta, const T epsi, const int n) {
  int tj = blockIdx.x * blockDim.x + threadIdx.x;
  if (tj < n) {
    deta[tj] = -eta[tj] + (epsi + dx[tj] * eta[tj]) / (beta[tj] - x[tj]);
  }
}

template <typename T>
__global__ void RexCalculation_kernel(
    T* __restrict__ rex,
    const T* __restrict__ x,
    const T* __restrict__ xlow,
    const T* __restrict__ xupp,
    const T* __restrict__ pij,
    const T* __restrict__ p0j,
    const T* __restrict__ qij,
    const T* __restrict__ q0j,
    const T* __restrict__ lambda,
    const T* __restrict__ xsi,
    const T* __restrict__ eta,
    const int n,
    const int m)
{
  int tj = blockIdx.x * blockDim.x + threadIdx.x;
  if (tj < n) {
    T upp_diff = xupp[tj] - x[tj];
    T low_diff = x[tj] - xlow[tj];
    T upp_diff_sq = upp_diff * upp_diff;
    T low_diff_sq = low_diff * low_diff;

    T sum = 0.0;
    for (int i = 0; i < m; ++i) {
      sum += pij[i + tj * m] * lambda[i] / upp_diff_sq
           - qij[i + tj * m] * lambda[i] / low_diff_sq;
    }

    rex[tj] = sum
            + p0j[tj] / upp_diff_sq
            - q0j[tj] / low_diff_sq
            - xsi[tj] + eta[tj];
  }
}

template <typename T>
__global__ void rey_calculation_kernel(T* __restrict__ rey,
    const T* __restrict__ c, const T* __restrict__ d, const T* __restrict__ y,
    const T* __restrict__ lambda, const T* __restrict__ mu, const int n) {
  int tj = blockIdx.x * blockDim.x + threadIdx.x;
  if (tj < n) {
    rey[tj] = c[tj] + d[tj] * y[tj] - lambda[tj] - mu[tj];
  }
}


template< typename T >
__global__ void norm_kernel(const T* __restrict__ a, T* __restrict__ buf_h,
    const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  const unsigned int lane = threadIdx.x % warpSize;
  const unsigned int wid = threadIdx.x / warpSize;

  __shared__ T shared[16];
  T sum = 0;
  for (int i = idx; i < n; i += str)
  {
    sum += pow(a[i], 2);
  }

  sum = reduce_warp<T>(sum);
  if (lane == 0)
    shared[wid] = sum;
  __syncthreads();

  sum = (threadIdx.x < blockDim.x / warpSize) ? shared[lane] : 0;
  if (wid == 0)
    sum = reduce_warp<T>(sum);

  if (threadIdx.x == 0)
    buf_h[blockIdx.x] = sum;

}



template <typename T>
__global__ void sub2cons_kernel(T* __restrict__ a, const T* __restrict__ b,
    const T* __restrict__ c,
    const T d, const int n) {
  int tj = blockIdx.x * blockDim.x + threadIdx.x;
  if (tj < n) {
    a[tj] = b[tj] * c[tj] - d;
  }
}


template <typename T>
__global__ void dely_kernel(T* __restrict__ dely, const T* __restrict__ c,
    const T* __restrict__ d, const T* __restrict__ y,
    const T* __restrict__ lambda, const T epsi, const int n) {
  int tj = blockIdx.x * blockDim.x + threadIdx.x;
  if (tj < n) {
    dely[tj] = c[tj] + d[tj] * y[tj] - lambda[tj] - epsi / y[tj];
  }
}



template< typename T >
__global__ void maxval2_kernel(const T* __restrict__ a, const T* __restrict__ b,
    T* __restrict__ temp, const T cons, const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  const unsigned int lane = threadIdx.x % warpSize;
  const unsigned int wid = threadIdx.x / warpSize;

  __shared__ T shared[16];
  T maxval = cons * a[0] / b[0];
  for (int i = idx; i < n; i += str)
  {
    maxval = max(maxval, cons * a[i] / b[i]);
  }

  maxval = max_reduce_warp<T>(maxval);
  if (lane == 0)
    shared[wid] = maxval;
  __syncthreads();

  maxval = (threadIdx.x < blockDim.x / warpSize) ? shared[lane] : 0.0;
  if (wid == 0)
    maxval = max_reduce_warp<T>(maxval);

  if (threadIdx.x == 0)
    temp[blockIdx.x] = maxval;

}


template< typename T >
__global__ void maxval3_kernel(const T* __restrict__ a, const T* __restrict__ b,
    const T* __restrict__ c, T* __restrict__ temp, const T cons, const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  const unsigned int lane = threadIdx.x % warpSize;
  const unsigned int wid = threadIdx.x / warpSize;

  __shared__ T shared[16];
  // Seed with the first element's ratio, consistent with the loop body below
  T maxval = cons * a[0] / (b[0] - c[0]);
  for (int i = idx; i < n; i += str)
  {
    maxval = max(maxval, cons * a[i] / (b[i] - c[i]));
  }

  maxval = max_reduce_warp<T>(maxval);
  if (lane == 0)
    shared[wid] = maxval;
  __syncthreads();

  maxval = (threadIdx.x < blockDim.x / warpSize) ? shared[lane] : 0;
  if (wid == 0)
    maxval = max_reduce_warp<T>(maxval);

  if (threadIdx.x == 0)
    temp[blockIdx.x] = maxval;

}


template <typename T>
__global__ void kkt_rex_kernel(T* __restrict__ rex, const T* __restrict__ df0dx,
    const T* __restrict__ dfdx, const T* __restrict__ xsi,
    const T* __restrict__ eta, const T* __restrict__ lambda, const int n,
    const int m) {
  int tj = blockIdx.x * blockDim.x + threadIdx.x;
  if (tj < n) {
    rex[tj] = 0.0;
    for (int i = 0; i < m; i++) {
      rex[tj] = rex[tj] + dfdx[i + tj * m] * lambda[i];
    }
    rex[tj] += df0dx[tj] - xsi[tj] + eta[tj];
  }
}


template <typename T>
__global__ void maxcons_kernel(T* __restrict__ a, const T b,
    const T c, const T* __restrict__ d, const int n) {
  int tj = blockIdx.x * blockDim.x + threadIdx.x;
  if (tj < n) {
    a[tj] = max(b, c * d[tj]);
  }
}

template< typename T >
__global__ void glsum_kernel(const T * a, T * buf_h, const int n) {
  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  const unsigned int lane = threadIdx.x % warpSize;
  const unsigned int wid = threadIdx.x / warpSize;

  __shared__ T shared[16];
  T sum = 0;
  for (int i = idx; i < n; i += str)
  {
    sum += a[i];
  }

  sum = reduce_warp<T>(sum);
  if (lane == 0)
    shared[wid] = sum;
  __syncthreads();

  sum = (threadIdx.x < blockDim.x / warpSize) ? shared[lane] : 0;
  if (wid == 0)
    sum = reduce_warp<T>(sum);

  if (threadIdx.x == 0)
    buf_h[blockIdx.x] = sum;

}

template< typename T >
__global__ void glsc2_kernel(const T * a,
    const T * b,
    T * buf_h,
    const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  const unsigned int lane = threadIdx.x % warpSize;
  const unsigned int wid = threadIdx.x / warpSize;

  __shared__ T shared[16];
  T sum = 0.0;
  for (int i = idx; i < n; i += str) {
    sum += a[i] * b[i];
  }

  sum = reduce_warp<T>(sum);
  if (lane == 0)
    shared[wid] = sum;
  __syncthreads();

  sum = (threadIdx.x < blockDim.x / warpSize) ? shared[lane] : 0;
  if (wid == 0)
    sum = reduce_warp<T>(sum);

  if (threadIdx.x == 0)
    buf_h[blockIdx.x] = sum;

}
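
/*
 Editorial note (not from the Neko-TOP sources): glsum_kernel and
 glsc2_kernel follow the same two-pass pattern as the other reductions in
 this file. Each block writes one partial result (a partial sum or partial
 dot product) to buf_h[blockIdx.x]; a second pass, e.g. mmareduce_kernel or
 a host-side sum over the block partials, produces the final scalar.
*/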




template <typename T>
__global__ void add2inv2_kernel(T* __restrict__ a, const T* __restrict__ b,
    const T c, const int n) {
  int tj = blockIdx.x * blockDim.x + threadIdx.x;
  if (tj < n) {
    a[tj] = a[tj] + c / b[tj];
  }
}

template <typename T>
__global__ void max2_kernel(T* __restrict__ a, const T b,
    const T* __restrict__ c, const T d, const int n) {
  int tj = blockIdx.x * blockDim.x + threadIdx.x;
  if (tj < n) {
    a[tj] = max(b, d * c[tj]);
  }
}

template <typename T>
__global__ void updatebb_kernel(T* __restrict__ bb,
    const T* __restrict__ dellambda, const T* __restrict__ dely,
    const T* __restrict__ d, const T* __restrict__ mu,
    const T* __restrict__ y, const T delz, const int m) {
  int tj = blockIdx.x * blockDim.x + threadIdx.x;
  if (tj < m)
    bb[tj] = dellambda[tj] + dely[tj] / (d[tj] + mu[tj] / y[tj]) - bb[tj];
  else if (tj < m + 1)
    bb[tj] = delz;
}



template <typename T>
__global__ void updateAA_kernel(T* __restrict__ AA,
    const T* __restrict__ globaltmp_mm, const T* __restrict__ s,
    const T* __restrict__ lambda, const T* __restrict__ d,
    const T* __restrict__ mu, const T* __restrict__ y, const T* __restrict__ a,
    const T zeta, const T z, const int m) {
  int tj = blockIdx.x * blockDim.x + threadIdx.x;
  if (tj < m)
  {
    AA[tj + tj * (m + 1)] = globaltmp_mm[tj + tj * m] + (s[tj] / lambda[tj] +
        1.0 / (d[tj] + mu[tj] / y[tj]));
    AA[tj + m * (m + 1)] = a[tj];
    AA[m + tj * (m + 1)] = a[tj];
  }
  else if (tj < m + 1)
    AA[tj + tj * (m + 1)] = -zeta / z;
}

template <typename T>
__global__ void dy_kernel(T* __restrict__ dy, const T* __restrict__ dely,
    const T* __restrict__ dlambda, const T* __restrict__ d,
    const T* __restrict__ mu, const T* __restrict__ y, const int n) {
  int tj = blockIdx.x * blockDim.x + threadIdx.x;
  if (tj < n)
    dy[tj] = (-dely[tj] + dlambda[tj]) / (d[tj] + mu[tj] / y[tj]);
}

#endif