Neko-TOP
A portable framework for high-order spectral element flow topology optimization.
Loading...
Searching...
No Matches
mma_kernel.h
Go to the documentation of this file.
1
37#ifndef MMA_CUDA_KERNEL_H
38#define MMA_CUDA_KERNEL_H
39
/**
 * Rank-one downdate of the (m x m) Hessian: Hess -= a * a^T.
 * One thread per matrix entry; launch with at least m*m threads.
 *
 * Fix (review): the Hess output parameter was missing from the signature
 * although the body writes it (dropped in doc extraction); restored.
 */
template <typename T>
__global__ void mma_update_hessian_z_kernel(
    T* __restrict__ Hess,
    const T* __restrict__ a,
    const int m)
{
    int tid = blockIdx.x * blockDim.x + threadIdx.x;
    int total = m * m;

    if (tid >= total) return;

    // Row/column of this entry in the m x m matrix.
    int i = tid % m;
    int j = tid / m;

    Hess[tid] -= a[i] * a[j];
}
56
// Assemble the bordered (m+1)x(m+1) dual system matrix AA: add the
// diagonal terms s/lambda + 1/(d + mu/y), fill the border row/column
// from a, and set the bottom-right corner to -zeta/z.
// Launch with at least m threads (one per constraint).
template<typename T>
__global__ void mma_prepare_aa_matrix_kernel(T* __restrict__ AA,
    const T* __restrict__ s, const T* __restrict__ lambda,
    const T* __restrict__ d, const T* __restrict__ mu,
    const T* __restrict__ y, const T* __restrict__ a,
    const T zeta, const T z, const int m) {
    const int row = blockIdx.x * blockDim.x + threadIdx.x;
    if (row >= m) return;

    const int ld = m + 1;  // leading dimension of the bordered matrix

    // Diagonal contribution for constraint `row`.
    AA[row * ld + row] += s[row] / lambda[row] +
        (T)1.0 / (d[row] + mu[row] / y[row]);

    // Border: last column and last row carry the linear coefficients a.
    AA[row * ld + m] = a[row];
    AA[m * ld + row] = a[row];

    // A single thread writes the corner entry.
    if (row == 0)
        AA[m * ld + m] = -zeta / z;
}
76
77
// Update Hessian diagonal elements (y contributions for the dip subsolve):
// subtract 1 where y > 0 (inactive constraint) and subtract mu/lambda.
// Hess is m x m row-major; one thread per constraint.
template<typename T>
__global__ void mma_update_hessian_diagonal_kernel(T* __restrict__ Hess,
    const T* __restrict__ y, const T* __restrict__ mu,
    const T* __restrict__ lambda, const int m) {
    const int j = blockIdx.x * blockDim.x + threadIdx.x;
    if (j >= m) return;

    const int dpos = j * m + j;  // diagonal entry of row j
    T value = Hess[dpos];

    // y-term contribution for inactive constraints.
    if (y[j] > (T)0.0) value -= (T)1.0;

    // -mu/lambda term (eq 10).
    value -= mu[j] / lambda[j];

    Hess[dpos] = value;
}
96
// Heuristic Levenberg-Marquardt stabilization of the Hessian.
// Single-block variant for m <= 1024: thread 0 alone computes a
// trace-proportional damping factor and subtracts it from every
// diagonal entry (other threads exit immediately).
template<typename T>
__global__ void mma_stabilize_hessian_single_kernel(T* __restrict__ Hess, const int m) {
    if (threadIdx.x != 0) return;

    // Trace of the m x m matrix.
    T trace = (T)0.0;
    for (int j = 0; j < m; j++)
        trace += Hess[j * m + j];

    // Damping proportional to -trace/m, floored at 1e-7.
    const T lm_factor = max((T)(-1.0e-4) * trace / m, (T)1.0e-7);

    // Damp every diagonal entry.
    for (int j = 0; j < m; j++)
        Hess[j * m + j] -= lm_factor;
}
117
// Heuristic Levenberg-Marquardt stabilization, multi-block variant for
// m > 1024: the damping factor is supplied by the caller and each thread
// updates one diagonal entry of the m x m Hessian.
template<typename T>
__global__ void mma_stabilize_hessian_multi_kernel(T* __restrict__ Hess, const T lm_factor, const int m) {
    const int j = blockIdx.x * blockDim.x + threadIdx.x;
    if (j >= m) return;
    Hess[j * m + j] -= lm_factor;
}
127
// Small in-place dense solver for MMA (n <= 50): LU factorization with
// partial pivoting followed by forward/backward substitution; b is
// overwritten with the solution. Intended to run as a single block.
//
// Fix (review): the original interleaved single-thread pivoting with
// multi-thread elimination and substitution without any __syncthreads(),
// which races whenever blockDim.x > 1; barriers are added after each
// factorization phase. The triangular solves carry loop-to-loop
// dependencies on b and are therefore done sequentially by thread 0
// (n <= 50, so the cost is negligible).
template<typename T>
__global__ void mma_small_lu_kernel(T* __restrict__ A, T* __restrict__ b, const int n) {
    const int tid = threadIdx.x;

    // Trivial 1x1 system.
    if (n == 1) {
        if (tid == 0 && abs(A[0]) > (T)1e-12) b[0] /= A[0];
        return;
    }

    // LU decomposition with partial pivoting.
    for (int k = 0; k < n; k++) {
        // Pivot search and row swap - single thread.
        if (tid == 0) {
            int max_row = k;
            T max_val = abs(A[k * n + k]);
            for (int i = k + 1; i < n; i++) {
                T val = abs(A[i * n + k]);
                if (val > max_val) {
                    max_val = val;
                    max_row = i;
                }
            }
            if (max_val > (T)1e-12 && max_row != k) {
                // Swap rows of A.
                for (int j = k; j < n; j++) {
                    T temp = A[k * n + j];
                    A[k * n + j] = A[max_row * n + j];
                    A[max_row * n + j] = temp;
                }
                // Swap rhs entries.
                T temp_b = b[k];
                b[k] = b[max_row];
                b[max_row] = temp_b;
            }
        }
        __syncthreads();  // pivoted row must be visible before elimination

        // Parallel elimination of column k below the diagonal.
        T diag = A[k * n + k];
        if (abs(diag) > (T)1e-12) {
            for (int i = tid + k + 1; i < n; i += blockDim.x) {
                T factor = A[i * n + k] / diag;
                A[i * n + k] = factor;  // store multiplier in L part
                for (int j = k + 1; j < n; j++) {
                    A[i * n + j] -= factor * A[k * n + j];
                }
            }
        }
        __syncthreads();  // finish elimination before the next pivot step
    }

    // Triangular solves: inherently sequential in i, run on thread 0.
    if (tid == 0) {
        // Forward substitution L y = b (unit lower triangular).
        for (int i = 0; i < n; i++) {
            T sum = b[i];
            for (int j = 0; j < i; j++)
                sum -= A[i * n + j] * b[j];
            b[i] = sum;
        }
        // Backward substitution U x = y.
        for (int i = n - 1; i >= 0; i--) {
            T sum = b[i];
            for (int j = i + 1; j < n; j++)
                sum -= A[i * n + j] * b[j];
            if (abs(A[i * n + i]) > (T)1e-12)
                b[i] = sum / A[i * n + i];
        }
    }
}
205
// Per-element coefficient for a 1D beam discretization:
// Delta[k] = ((L - Le*(offset+k))^3 - (L - Le*(offset+k+1))^3) / 3,
// using 1-based element indexing internally. One thread per element.
template <typename T>
__global__ void delta_1dbeam_kernel(T* __restrict__ Delta,
    const T L_total, const T Le, const int offset, const int n) {
    const int k = blockIdx.x * blockDim.x + threadIdx.x;
    if (k >= n) return;

    const int e = k + 1;  // 1-based element index

    // Left-edge distance cubed: (L_total - Le*(offset+e-1))^3.
    const T xl = L_total - Le * static_cast<T>(offset + e - 1);
    const T xl3 = xl * xl * xl;

    // Right-edge distance cubed: (L_total - Le*(offset+e))^3.
    const T xr = L_total - Le * static_cast<T>(offset + e);
    const T xr3 = xr * xr * xr;

    Delta[k] = (xl3 - xr3) / static_cast<T>(3.0);
}
226
// Inverse diagonal of the Lagrangian Hessian w.r.t. x:
// Ljjxinv = -1 / (2*p/(upp-x)^3 + 2*q/(x-low)^3), zeroed where x sits on
// an active move limit (alpha or beta). One thread per design variable.
//
// Fix (review): the pjlambda/qjlambda pointer parameters were missing from
// the signature although the body reads them (dropped in doc extraction);
// restored. Double literals replaced by T(...) so float instantiations
// avoid silent promotion to double arithmetic.
template <typename T>
__global__ void mma_Ljjxinv_kernel(T* __restrict__ Ljjxinv,
    const T* __restrict__ pjlambda, const T* __restrict__ qjlambda,
    const T* __restrict__ x, const T* __restrict__ low, const T* __restrict__ upp,
    const T* __restrict__ alpha, const T* __restrict__ beta,
    const int n) {
    int tj = blockIdx.x * blockDim.x + threadIdx.x;
    if (tj >= n) return;

    // Load inputs into registers.
    const T xt = x[tj];
    const T low_j = low[tj];
    const T upp_j = upp[tj];
    const T pj = pjlambda[tj];
    const T qj = qjlambda[tj];

    // Distances to the moving asymptotes.
    const T diff_u = upp_j - xt;
    const T diff_l = xt - low_j;

    // Cube by multiplication (avoids pow).
    const T diff_u3 = diff_u * diff_u * diff_u;
    const T diff_l3 = diff_l * diff_l * diff_l;

    // Curvature term and its negated inverse.
    const T denom = T(2.0) * pj / diff_u3 + T(2.0) * qj / diff_l3;
    const T val = T(-1.0) / denom;

    // Mask out entries whose primal variable is on an active bound.
    const bool active = (fabs(xt - alpha[tj]) <= T(1e-16)) ||
                        (fabs(xt - beta[tj]) <= T(1e-16));

    Ljjxinv[tj] = active ? T(0.0) : val;
}
261
// Closed-form primal update for the dual-interior-point subsolve:
// x = (sqrt(p)*low + sqrt(q)*upp) / (sqrt(p) + sqrt(q)), clamped to the
// box [alpha, beta]. One thread per design variable.
//
// Fix (review): the pjlambda/qjlambda pointer parameters were missing from
// the signature although the body reads them (dropped in doc extraction);
// restored.
template <typename T>
__global__ void mma_dipsolvesub1_kernel(T* __restrict__ x,
    const T* __restrict__ pjlambda, const T* __restrict__ qjlambda,
    const T* __restrict__ low, const T* __restrict__ upp,
    const T* __restrict__ alpha, const T* __restrict__ beta,
    const int n) {

    int tj = blockIdx.x * blockDim.x + threadIdx.x;
    if (tj >= n) return;

    // Load inputs into registers.
    const T pj_sqrt = sqrt(pjlambda[tj]);
    const T qj_sqrt = sqrt(qjlambda[tj]);
    const T denom = pj_sqrt + qj_sqrt;

    const T low_j = low[tj];
    const T upp_j = upp[tj];
    const T alpha_j = alpha[tj];
    const T beta_j = beta[tj];

    // Weighted average of the asymptotes.
    const T val = (pj_sqrt * low_j + qj_sqrt * upp_j) / denom;

    // Branchless clamp into [alpha, beta].
    x[tj] = fmin(fmax(val, alpha_j), beta_j);
}
288
// output = P^T * lambda with pij stored per design variable (entry (i, j)
// at pij[i + j*m]); one thread computes one output component.
template <typename T>
__global__ void mattrans_v_mul_kernel(T* __restrict__ output,
    const T* __restrict__ pij, const T* __restrict__ lambda,
    const int m, const int n) {

    const int col = blockIdx.x * blockDim.x + threadIdx.x;
    if (col >= n) return;

    // Accumulate the dot product in a register.
    T dot = 0.0;
    const int base = col * m;
    for (int i = 0; i < m; ++i)
        dot += pij[base + i] * lambda[i];

    output[col] = dot;
}
306
// Initial asymptote placement (MMA "sub1"):
// xlow = x - asyinit*(xmax - xmin), xupp = x + asyinit*(xmax - xmin).
//
// Fix (review): the xlow/xupp output pointer parameters were missing from
// the signature although the body writes them (dropped in doc extraction);
// restored.
template <typename T>
__global__ void mma_sub1_kernel(
    T* __restrict__ xlow,
    T* __restrict__ xupp,
    const T* __restrict__ x,
    const T* __restrict__ xmin,
    const T* __restrict__ xmax,
    const T asyinit,
    const int n) {
    int tj = blockIdx.x * blockDim.x + threadIdx.x;
    if (tj >= n) return;

    // Load values once into registers (global memory is slow).
    const T xt = x[tj];
    const T xmin_j = xmin[tj];
    const T xmax_j = xmax[tj];

    // Symmetric offset proportional to the variable range.
    const T xgap = xmax_j - xmin_j;
    const T offset = asyinit * xgap;

    xlow[tj] = xt - offset;
    xupp[tj] = xt + offset;
}
331
332
333
// Asymptote update (MMA "sub2"): shrink the asymptote interval when x
// oscillates ((x - xold1)*(xold1 - xold2) < 0), grow it on monotone
// progress, then clamp each asymptote to within [0.01, 10]*xdiff of x.
//
// Fix (review): the lines defining new_low/new_upp were missing (dropped
// in doc extraction); restored with the standard MMA update
//   low = x - factor*(xold1 - low_old), upp = x + factor*(upp_old - xold1).
template< typename T >
__global__ void mma_sub2_kernel(T* __restrict__ low, T* __restrict__ upp,
    const T* __restrict__ x, const T* __restrict__ xold1,
    const T* __restrict__ xold2, const T* __restrict__ xdiff,
    const T asydecr, const T asyincr, const int n) {
    int tj = blockIdx.x * blockDim.x + threadIdx.x;
    if (tj >= n) return;

    // Registers for repeatedly used global values.
    const T xval = x[tj];
    const T xold1val = xold1[tj];
    const T xold2val = xold2[tj];
    const T lowval = low[tj];
    const T uppval = upp[tj];
    const T xdiffval = xdiff[tj];

    // Negative product means the variable oscillated over the last steps.
    const T prod = (xval - xold1val) * (xold1val - xold2val);

    // Shrink on oscillation, grow on monotone movement, keep otherwise.
    const T asy_factor = (prod < T(0)) ? asydecr :
                         (prod > T(0)) ? asyincr : T(1);

    // Move the asymptotes relative to the previous interval (restored).
    T new_low = xval - asy_factor * (xold1val - lowval);
    T new_upp = xval + asy_factor * (uppval - xold1val);

    // Keep the asymptotes within [0.01, 10]*xdiff of the current point.
    new_low = max(new_low, xval - T(10.0) * xdiffval);
    new_low = min(new_low, xval - T(0.01) * xdiffval);

    new_upp = min(new_upp, xval + T(10.0) * xdiffval);
    new_upp = max(new_upp, xval + T(0.01) * xdiffval);

    // Write results back.
    low[tj] = new_low;
    upp[tj] = new_upp;
}
373
// MMA "sub3": compute the move limits alpha/beta and the approximation
// coefficients p0j/q0j (objective) and pij/qij (constraints) from the
// current point, bounds and asymptotes. One thread per design variable.
//
// Fix (review): the lines defining alpha_val/beta_val and the squared
// asymptote distances were missing (dropped in doc extraction); restored
// with the standard MMA move limits
//   alpha = max(xmin, low + 0.1*(x-low), x - 0.5*(xmax-xmin))
//   beta  = min(xmax, upp - 0.1*(upp-x), x + 0.5*(xmax-xmin)).
template <typename T>
__global__ void mma_sub3_kernel( const T* __restrict__ x,
    const T* __restrict__ df0dx, const T* __restrict__ dfdx,
    T* __restrict__ low, T* __restrict__ upp, const T* __restrict__ xmin,
    const T* __restrict__ xmax, T* __restrict__ alpha, T* __restrict__ beta,
    T* __restrict__ p0j, T* __restrict__ q0j, T* __restrict__ pij,
    T* __restrict__ qij, const int n, const int m) {
    int tj = blockIdx.x * blockDim.x + threadIdx.x;
    if (tj >= n) return;

    // Load into registers once.
    const T xt = x[tj];
    const T xmin_j = xmin[tj];
    const T xmax_j = xmax[tj];
    const T low_j = low[tj];
    const T upp_j = upp[tj];
    const T df0 = df0dx[tj];
    const T xgap = xmax_j - xmin_j;

    // Clamp helpers.
    const T half_xgap = 0.5 * xgap;
    const T tenth_low_diff = T(0.1) * (xt - low_j);
    const T tenth_upp_diff = T(0.1) * (upp_j - xt);

    // Move limits (restored; see header note).
    const T alpha_val = max(xmin_j, max(low_j + tenth_low_diff,
                                        xt - half_xgap));
    const T beta_val = min(xmax_j, min(upp_j - tenth_upp_diff,
                                       xt + half_xgap));

    alpha[tj] = alpha_val;
    beta[tj] = beta_val;

    // Squared asymptote distances, computed once (restored).
    const T upp_minus_x = upp_j - xt;
    const T x_minus_low = xt - low_j;
    const T upp_minus_x_sq = upp_minus_x * upp_minus_x;
    const T x_minus_low_sq = x_minus_low * x_minus_low;

    // Small epsilon for numerical stability.
    const T eps = T(1e-5);
    const T inv_xgap = T(1.0) / max(eps, xgap);

    // Positive/negative parts of the objective gradient, reused below.
    const T max_df0_pos = max(df0, T(0));
    const T max_df0_neg = max(-df0, T(0));

    p0j[tj] = upp_minus_x_sq * (T(1.001) * max_df0_pos +
        T(0.001) * max_df0_neg + eps * inv_xgap);
    q0j[tj] = x_minus_low_sq * (T(0.001) * max_df0_pos +
        T(1.001) * max_df0_neg + eps * inv_xgap);

    // Same construction for each constraint gradient.
    for (int i = 0; i < m; ++i) {
        int idx = i + tj * m;

        T dfdx_val = dfdx[idx];
        T max_pos = max(dfdx_val, T(0));
        T max_neg = max(-dfdx_val, T(0));

        pij[idx] = upp_minus_x_sq * (T(1.001) * max_pos +
            T(0.001) * max_neg + eps * inv_xgap);
        qij[idx] = x_minus_low_sq * (T(0.001) * max_pos +
            T(1.001) * max_neg + eps * inv_xgap);
    }
}
438
// MMA "sub4": for each (variable, constraint) pair evaluate
// temp = pij/(upp - x) + qij/(x - low), with both denominators floored at
// a tiny epsilon to avoid division by zero. One thread per variable.
template <typename T>
__global__ void mma_sub4_kernel(
    const T* __restrict__ x,
    const T* __restrict__ low,
    const T* __restrict__ upp,
    const T* __restrict__ pij,
    const T* __restrict__ qij,
    T* __restrict__ temp,
    const int n,
    const int m) {

    const int j = blockIdx.x * blockDim.x + threadIdx.x;
    if (j >= n) return;

    // Cache per-variable data in registers.
    const T xj = x[j];
    const T dist_upp = upp[j] - xj;
    const T dist_low = xj - low[j];

    // Guarded reciprocals of the asymptote distances.
    const T eps = T(1e-12);
    const T r_upp = T(1) / max(dist_upp, eps);
    const T r_low = T(1) / max(dist_low, eps);

    const int base = j * m;
    for (int i = 0; i < m; ++i)
        temp[base + i] = pij[base + i] * r_upp + qij[base + i] * r_low;
}
472
473
// xsi = max(1, 1/(x - alpha)), with the gap floored at 1e-12.
template <typename T>
__global__ void mma_max2_kernel(
    T* __restrict__ xsi,
    const T* __restrict__ x,
    const T* __restrict__ alpha,
    const int n) {

    const int j = blockIdx.x * blockDim.x + threadIdx.x;
    if (j >= n) return;

    // Guard the gap away from zero before inverting.
    const T eps = T(1e-12);
    const T gap = max(x[j] - alpha[j], eps);
    xsi[j] = max(T(1), T(1) / gap);
}
488
489
490
491
// Per (variable, constraint) residual term for the lambda equations:
// temp = pij/(xupp - x) + qij/(x - xlow), denominators floored at 1e-12.
template <typename T>
__global__ void relambda_kernel(
    T* __restrict__ temp,
    const T* __restrict__ x,
    const T* __restrict__ xupp,
    const T* __restrict__ xlow,
    const T* __restrict__ pij,
    const T* __restrict__ qij,
    const int n,
    const int m) {

    const int j = blockIdx.x * blockDim.x + threadIdx.x;
    if (j >= n) return;

    const T xj = x[j];

    // Guard against division by zero near the asymptotes.
    const T eps = T(1e-12);
    const T r_upp = T(1) / max(xupp[j] - xj, eps);
    const T r_low = T(1) / max(xj - xlow[j], eps);

    const int base = j * m;
    for (int i = 0; i < m; ++i)
        temp[base + i] = pij[base + i] * r_upp + qij[base + i] * r_low;
}
521
522
// Elementwise: a = b*(c - d) - e.
template <typename T>
__global__ void sub2cons2_kernel(
    T* __restrict__ a,
    const T* __restrict__ b,
    const T* __restrict__ c,
    const T* __restrict__ d,
    const T e,
    const int n) {
    const int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i >= n) return;

    a[i] = b[i] * (c[i] - d[i]) - e;
}
540
541
// Warp-wide max reduction via shuffles; after the call, lane 0 holds the
// maximum of the values contributed by all 32 lanes.
template< typename T>
__inline__ __device__ T max_reduce_warp(T val) {
    for (int offset = 16; offset > 0; offset >>= 1)
        val = max(val, __shfl_down_sync(0xffffffff, val, offset));
    return val;
}
551
552
553
// Block-wise maximum of |a[i]| over a grid-stride loop; each block writes
// its partial maximum to temp[blockIdx.x] (finish with max_reduce_kernel).
//
// Fix (review): the warp reductions, the shared-memory store and the
// __syncthreads() between them were missing (dropped in doc extraction);
// restored following the reduction pattern used elsewhere in this file.
template <typename T>
__global__ void maxval_kernel(const T* __restrict__ a, T* temp, const int n) {
    const int idx = blockIdx.x * blockDim.x + threadIdx.x;
    const int stride = blockDim.x * gridDim.x;

    const unsigned int lane = threadIdx.x % warpSize;
    const unsigned int warp_id = threadIdx.x / warpSize;

    // One slot per warp (assumes at most 1024 threads per block).
    __shared__ T shared[32];

    T local_max = T(0);
    for (int i = idx; i < n; i += stride) {
        local_max = max(local_max, abs(a[i]));
    }

    // Warp-level reduction.
    local_max = max_reduce_warp<T>(local_max);

    // Write warp-level results to shared memory.
    if (lane == 0) {
        shared[warp_id] = local_max;
    }
    __syncthreads();

    // Let the first warp reduce the per-warp maxima.
    local_max = (threadIdx.x < blockDim.x / warpSize) ? shared[lane] : T(0);

    if (warp_id == 0) {
        local_max = max_reduce_warp<T>(local_max);
    }

    if (threadIdx.x == 0) {
        temp[blockIdx.x] = local_max;
    }
}
590
591
// Second-stage max reduction: each block reduces its grid-stride slice of
// bufred and writes the block maximum back to bufred[blockIdx.x].
//
// Fix (review): the warp reductions, the __syncthreads() and the final
// store were missing (dropped in doc extraction); restored following the
// reduction pattern used elsewhere in this file.
template <typename T>
__global__ void max_reduce_kernel(T* __restrict__ bufred, const int n) {

    T maxval = T(0);
    const int idx = blockIdx.x * blockDim.x + threadIdx.x;
    const int stride = blockDim.x * gridDim.x;

    // Grid-stride loop to cover all elements.
    for (int i = idx; i < n; i += stride) {
        maxval = max(maxval, bufred[i]);
    }

    __shared__ T shared[32]; // One slot per warp (max 1024 threads/block)

    unsigned int lane = threadIdx.x % warpSize;
    unsigned int wid = threadIdx.x / warpSize;

    // Warp-level max reduction.
    maxval = max_reduce_warp<T>(maxval);

    // Store each warp's max value in shared memory.
    if (lane == 0) {
        shared[wid] = maxval;
    }
    __syncthreads();

    // Let the first warp reduce the warp-level maxima.
    maxval = (threadIdx.x < blockDim.x / warpSize) ? shared[lane] : T(0);
    if (wid == 0) {
        maxval = max_reduce_warp<T>(maxval);
    }

    // Write the block's maximum value back to bufred[blockIdx.x].
    if (threadIdx.x == 0) {
        bufred[blockIdx.x] = maxval;
    }
}
629
// Newton RHS component for x ("delx"):
// delx = sum_i lambda_i*(pij/(upp-x)^2 - qij/(x-low)^2)
//        + p0j/(upp-x)^2 - q0j/(x-low)^2 - epsi/(x-alpha) + epsi/(beta-x).
//
// Fix (review): the delx output parameter, the (x - alpha) denominator and
// the epsilon-guard lines were missing (dropped in doc extraction);
// restored, flooring every denominator at 1e-12 before use — TODO confirm
// the exact guard form against the original source.
template <typename T>
__global__ void delx_kernel(
    T* __restrict__ delx,
    const T* __restrict__ x,
    const T* __restrict__ xlow,
    const T* __restrict__ xupp,
    const T* __restrict__ pij,
    const T* __restrict__ qij,
    const T* __restrict__ p0j,
    const T* __restrict__ q0j,
    const T* __restrict__ alpha,
    const T* __restrict__ beta,
    const T* __restrict__ lambda,
    const T epsi,
    const int n,
    const int m)
{
    int tj = blockIdx.x * blockDim.x + threadIdx.x;
    if (tj < n) {
        T xt = x[tj];
        T xlow_j = xlow[tj];
        T xupp_j = xupp[tj];
        T alpha_j = alpha[tj];
        T beta_j = beta[tj];

        // Distances to the asymptotes and move limits.
        T denom_low = xt - xlow_j;
        T denom_upp = xupp_j - xt;
        T denom_alpha = xt - alpha_j;
        T denom_beta = beta_j - xt;

        // Guard all denominators away from zero (restored).
        const T small_eps = T(1e-12);
        denom_low = max(denom_low, small_eps);
        denom_upp = max(denom_upp, small_eps);
        denom_alpha = max(denom_alpha, small_eps);
        denom_beta = max(denom_beta, small_eps);

        T sum = T(0);
        for (int i = 0; i < m; i++) {
            T lambda_i = lambda[i];
            sum += pij[i + tj * m] * lambda_i / (denom_upp * denom_upp)
                 - qij[i + tj * m] * lambda_i / (denom_low * denom_low);
        }
        sum += p0j[tj] / (denom_upp * denom_upp)
             - q0j[tj] / (denom_low * denom_low)
             - epsi / denom_alpha
             + epsi / denom_beta;

        delx[tj] = sum;
    }
}
681
// Constraint gradient of the MMA approximation:
// GG[i + tj*m] = pij/(xupp - x)^2 - qij/(x - xlow)^2.
//
// Fix (review): the GG output parameter and the squared-distance
// definitions were missing (dropped in doc extraction); restored.
template <typename T>
__global__ void GG_kernel(
    T* __restrict__ GG,
    const T* __restrict__ x,
    const T* __restrict__ xlow,
    const T* __restrict__ xupp,
    const T* __restrict__ pij,
    const T* __restrict__ qij,
    const int n,
    const int m) {

    int tj = blockIdx.x * blockDim.x + threadIdx.x;
    if (tj < n) {
        T xt = x[tj];
        T xlow_j = xlow[tj];
        T xupp_j = xupp[tj];

        // Distances from bounds.
        T diff_upper = xupp_j - xt;
        T diff_lower = xt - xlow_j;

        // Squared distances (restored).
        T diff_upper2 = diff_upper * diff_upper;
        T diff_lower2 = diff_lower * diff_lower;

        for (int i = 0; i < m; i++) {
            int idx = i + tj * m;
            GG[idx] = pij[idx] / diff_upper2 - qij[idx] / diff_lower2;
        }
    }
}
713
714
// Diagonal of the primal Hessian approximation for the interior-point step:
//   diagx = 2*((p0j + P^T lambda)/(xupp - x)^3 + (q0j + Q^T lambda)/(x - xlow)^3)
//           + xsi/(x - alpha) + eta/(beta - x).
// One thread per design variable; pij/qij are indexed as [tj*m + i]
// (equivalent to the [i + tj*m] form used elsewhere in this file).
// NOTE(review): no epsilon guards here — assumes x lies strictly inside
// (xlow, xupp) and (alpha, beta); confirm the caller guarantees this.
template <typename T>
__global__ void diagx_kernel(T* __restrict__ diagx, const T* __restrict__ x,
    const T* __restrict__ xsi, const T* __restrict__ xlow,
    const T* __restrict__ xupp, const T* __restrict__ p0j,
    const T* __restrict__ q0j, const T* __restrict__ pij,
    const T* __restrict__ qij, const T* alpha, const T* beta,
    const T* eta, const T* lambda, const int n, const int m) {
    int tj = blockIdx.x * blockDim.x + threadIdx.x;
    if (tj < n) {
        // Accumulate P^T lambda and Q^T lambda for this variable.
        T sum = 0;
        T sum1 = 0;
        for (int i = 0; i < m; i++) {
            sum = sum + pij[tj *m+ i] * lambda[i];
            sum1 = sum1 + qij[tj*m + i] * lambda[i];
        }
        // Curvature terms from the asymptote approximation ...
        diagx[tj] = (p0j[tj] + sum) / pow(xupp[tj] - x[tj], 3) +
            (q0j[tj] + sum1) / pow(x[tj] - xlow[tj], 3);
        // ... doubled, plus the barrier contributions from the bound
        // multipliers xsi and eta.
        diagx[tj] = 2.0 * diagx[tj] + xsi[tj] / (x[tj] - alpha[tj]) +
            eta[tj] / (beta[tj] - x[tj]);
    }
}
736
737
// Warp-wide sum reduction via shuffles; after the call, lane 0 holds the
// sum of the values contributed by all 32 lanes.
//
// Fix (review): dropped the `volatile` qualifier on the accumulator — it
// has no synchronization meaning for a register value passed through
// __shfl_down_sync and only inhibits compiler optimization.
template <typename T>
__inline__ __device__ T reduce_warp(T val) {
    val += __shfl_down_sync(0xffffffff, val, 16);
    val += __shfl_down_sync(0xffffffff, val, 8);
    val += __shfl_down_sync(0xffffffff, val, 4);
    val += __shfl_down_sync(0xffffffff, val, 2);
    val += __shfl_down_sync(0xffffffff, val, 1);
    return val;
}
748
749
// Second-stage sum reduction: each block sums its grid-stride slice of
// bufred and writes the partial sum back to bufred[blockIdx.x].
//
// Fix (review): the warp reductions and the __syncthreads() were missing
// (dropped in doc extraction); restored following norm_kernel's pattern.
template <typename T>
__global__ void mmareduce_kernel(T* __restrict__ bufred, const int n) {

    T sum = 0;
    const int idx = blockIdx.x * blockDim.x + threadIdx.x;
    const int str = blockDim.x * gridDim.x;
    for (int i = idx; i < n; i += str)
    {
        sum += bufred[i];
    }

    __shared__ T shared[32];
    unsigned int lane = threadIdx.x % warpSize;
    unsigned int wid = threadIdx.x / warpSize;

    // Warp-level partial sums, one slot per warp.
    sum = reduce_warp<T>(sum);
    if (lane == 0)
        shared[wid] = sum;
    __syncthreads();

    // First warp reduces the per-warp sums.
    sum = (threadIdx.x < blockDim.x / warpSize) ? shared[lane] : 0;
    if (wid == 0)
        sum = reduce_warp<T>(sum);

    if (threadIdx.x == 0)
        bufred[blockIdx.x] = sum;
}
777
778
// Block partial sums of column k of an n x m row-major array:
// sum over i of a[m*i + k], written to buf_h[blockIdx.x].
//
// Fix (review): the warp reductions and the __syncthreads() were missing
// (dropped in doc extraction); restored following norm_kernel's pattern.
template< typename T >
__global__ void mmasum_kernel(const T* __restrict__ a, T* __restrict__ buf_h,
    const int n, const int m, const int k) {

    const int idx = blockIdx.x * blockDim.x + threadIdx.x;
    const int str = blockDim.x * gridDim.x;

    const unsigned int lane = threadIdx.x % warpSize;
    const unsigned int wid = threadIdx.x / warpSize;

    __shared__ T shared[32];
    T sum = 0;
    for (int i = idx; i < n; i += str)
    {
        sum += a[m * i + k ];
    }

    // Warp-level partial sums, then per-warp results in shared memory.
    sum = reduce_warp<T>(sum);
    if (lane == 0)
        shared[wid] = sum;
    __syncthreads();

    // First warp reduces the per-warp sums.
    sum = (threadIdx.x < blockDim.x / warpSize) ? shared[lane] : 0;
    if (wid == 0)
        sum = reduce_warp<T>(sum);

    if (threadIdx.x == 0)
        buf_h[blockIdx.x] = sum;

}
// Block partial sums of GG[k + i*m] * delx[i] / diagx[i] over a
// grid-stride range, written to buf_h[blockIdx.x] (row k of the bb RHS).
//
// Fix (review): the warp reductions and the __syncthreads() were missing
// (dropped in doc extraction); restored following norm_kernel's pattern.
template< typename T >
__global__ void mmasumbb_kernel(const T* __restrict__ GG,
    const T* __restrict__ delx, const T* __restrict__ diagx,
    T* __restrict__ buf_h, const int n, const int m, const int k) {

    const int idx = blockIdx.x * blockDim.x + threadIdx.x;
    const int str = blockDim.x * gridDim.x;

    const unsigned int lane = threadIdx.x % warpSize;
    const unsigned int wid = threadIdx.x / warpSize;

    __shared__ T shared[32];
    T sum = 0;
    for (int i = idx; i < n; i += str)
    {
        sum += GG[ k + i * m] * delx[i] / diagx[i];
    }

    // Warp-level partial sums, then per-warp results in shared memory.
    sum = reduce_warp<T>(sum);
    if (lane == 0)
        shared[wid] = sum;
    __syncthreads();

    // First warp reduces the per-warp sums.
    sum = (threadIdx.x < blockDim.x / warpSize) ? shared[lane] : 0;
    if (wid == 0)
        sum = reduce_warp<T>(sum);

    if (threadIdx.x == 0)
        buf_h[blockIdx.x] = sum;

}
840
// Block partial sums of the weighted inner product
// sum over i of hijx[k0 + i*m] * Ljjxinv[i] * hijx[k1 + i*m],
// written to buf_h[blockIdx.x] (entry (k0, k1) of the dual Hessian).
//
// Fix (review): the buf_h output parameter, the warp reductions and the
// __syncthreads() were missing (dropped in doc extraction); restored
// following norm_kernel's pattern.
template <typename T>
__global__ void mmasumHess_kernel(
    const T* __restrict__ hijx,
    const T* __restrict__ Ljjxinv,
    T* __restrict__ buf_h,
    const int n,
    const int m,
    const int k0,
    const int k1)
{
    const int idx = blockIdx.x * blockDim.x + threadIdx.x;
    const int stride = blockDim.x * gridDim.x;

    // Warp lane and warp id within the block.
    const unsigned int lane = threadIdx.x % warpSize;
    const unsigned int wid = threadIdx.x / warpSize;

    __shared__ T shared[32];

    T sum = T(0);

    // Grid-stride loop; hijx rows are stored with stride m.
    for (int i = idx; i < n; i += stride) {
        T val0 = hijx[k0 + i * m];
        T val1 = hijx[k1 + i * m];
        sum += val0 * Ljjxinv[i] * val1;
    }

    // Warp-level reduction.
    sum = reduce_warp<T>(sum);

    // Write each warp's partial sum to shared memory.
    if (lane == 0) {
        shared[wid] = sum;
    }
    __syncthreads();

    // First warp reduces the warp sums in shared memory.
    sum = (threadIdx.x < blockDim.x / warpSize) ? shared[lane] : T(0);
    if (wid == 0) {
        sum = reduce_warp<T>(sum);
    }

    // Write final result from the first thread of the block.
    if (threadIdx.x == 0) {
        buf_h[blockIdx.x] = sum;
    }
}
890
891
// Block partial sums of GG[k0 + i*m] / diagx[i] * GG[k1 + i*m] over a
// grid-stride range, written to buf_h[blockIdx.x] (entry (k0, k1) of AA).
//
// Fix (review): the warp reductions and the __syncthreads() were missing
// (dropped in doc extraction); restored following norm_kernel's pattern.
template< typename T >
__global__ void mmasumAA_kernel(const T* __restrict__ GG,
    const T* __restrict__ diagx, T* __restrict__ buf_h, const int n,
    const int m, const int k0, const int k1) {

    const int idx = blockIdx.x * blockDim.x + threadIdx.x;
    const int str = blockDim.x * gridDim.x;

    const unsigned int lane = threadIdx.x % warpSize;
    const unsigned int wid = threadIdx.x / warpSize;

    __shared__ T shared[32];
    T sum = 0;
    for (int i = idx; i < n; i += str)
    {
        sum += GG[ k0 + i * m] /diagx[i] * GG[ k1 + i * m];
    }

    // Warp-level partial sums, then per-warp results in shared memory.
    sum = reduce_warp<T>(sum);
    if (lane == 0)
        shared[wid] = sum;
    __syncthreads();

    // First warp reduces the per-warp sums.
    sum = (threadIdx.x < blockDim.x / warpSize) ? shared[lane] : 0;
    if (wid == 0)
        sum = reduce_warp<T>(sum);

    if (threadIdx.x == 0)
        buf_h[blockIdx.x] = sum;

}
923
924
// Copy b[0..n) into a at offset m: a[m + j] = b[j].
template <typename T>
__global__ void mma_copy_kernel(T* __restrict__ a, const T* __restrict__ b,
    const int n, const int m) {
    const int j = blockIdx.x * blockDim.x + threadIdx.x;
    if (j >= n) return;
    a[j + m] = b[j];
}
932
933
934
// Per-variable contributions to the dual system matrix: for every
// constraint pair (i0, i1),
//   temp[tj + i0*(n+1) + i1*(m+1)*(n+1)] = GG[i0*n + tj] * GG[i1*n + tj] / diagx[tj].
// The (n+1) stride leaves one extra slot per (i0, i1) slab — presumably
// summed over tj by a follow-up reduction; confirm the layout against the
// caller.
// NOTE(review): GG is indexed here as GG[i*n + tj] (constraint-major),
// unlike dx_kernel/mmasumbb_kernel which use GG[tj*m + i] / GG[k + i*m];
// verify that the intended storage layout matches.
template <typename T>
__global__ void AA_kernel(T* __restrict__ temp, const T* __restrict__ GG,
    const T* __restrict__ diagx, const int n, const int m) {
    int tj = blockIdx.x * blockDim.x + threadIdx.x;
    if (tj < n) {
        for (int i0 = 0; i0 < m; i0++) {
            for (int i1 = 0; i1 < m; i1++) {
                temp[tj + i0 * (n + 1) + i1 * (m + 1) * (n + 1)] = GG[i0 * n + tj] *
                    (1.0 / diagx[tj]) * GG[i1 * n + tj];
            }
        }
    }
}
948
949
// Primal step: dx = -(delx + G^T dlambda) / diagx, with GG stored per
// variable (entry (i, tj) at GG[tj*m + i]).
template <typename T>
__global__ void dx_kernel(T* __restrict__ dx, const T* __restrict__ delx,
    const T* __restrict__ diagx, const T* __restrict__ GG,
    const T* __restrict__ dlambda, const int n, const int m) {
    const int tj = blockIdx.x * blockDim.x + threadIdx.x;
    if (tj >= n) return;

    const T dj = diagx[tj];
    // Start from the gradient term, then fold in each constraint term.
    T val = -delx[tj] / dj;
    for (int i = 0; i < m; i++)
        val = val - GG[tj * m + i] * dlambda[i] / dj;
    dx[tj] = val;
}
962
963
964
// dxsi = -xsi + (epsi - dx*xsi)/(x - alpha), per variable.
template <typename T>
__global__ void dxsi_kernel(T* __restrict__ dxsi, const T* __restrict__ xsi,
    const T* __restrict__ dx, const T* __restrict__ x,
    const T* __restrict__ alpha, const T epsi, const int n) {
    const int j = blockIdx.x * blockDim.x + threadIdx.x;
    if (j >= n) return;
    const T xsi_j = xsi[j];
    dxsi[j] = -xsi_j + (epsi - dx[j] * xsi_j) / (x[j] - alpha[j]);
}
974
975
// deta = -eta + (epsi + dx*eta)/(beta - x), per variable.
template <typename T>
__global__ void deta_kernel(T* __restrict__ deta, const T* __restrict__ eta,
    const T* __restrict__ dx, const T* __restrict__ x,
    const T* __restrict__ beta, const T epsi, const int n) {
    const int j = blockIdx.x * blockDim.x + threadIdx.x;
    if (j >= n) return;
    const T eta_j = eta[j];
    deta[j] = -eta_j + (epsi + dx[j] * eta_j) / (beta[j] - x[j]);
}
985
986
// Residual of the x-stationarity condition ("rex"):
// rex = sum_i lambda_i*(pij/(upp-x)^2 - qij/(x-low)^2)
//       + p0j/(upp-x)^2 - q0j/(x-low)^2 - xsi + eta.
//
// Fix (review): the rex output parameter and the squared-distance
// definitions were missing (dropped in doc extraction); restored.
template <typename T>
__global__ void RexCalculation_kernel(
    T* __restrict__ rex,
    const T* __restrict__ x,
    const T* __restrict__ xlow,
    const T* __restrict__ xupp,
    const T* __restrict__ pij,
    const T* __restrict__ p0j,
    const T* __restrict__ qij,
    const T* __restrict__ q0j,
    const T* __restrict__ lambda,
    const T* __restrict__ xsi,
    const T* __restrict__ eta,
    const int n,
    const int m)
{
    int tj = blockIdx.x * blockDim.x + threadIdx.x;
    if (tj < n) {
        T upp_diff = xupp[tj] - x[tj];
        T low_diff = x[tj] - xlow[tj];

        // Squared asymptote distances (restored).
        T upp_diff_sq = upp_diff * upp_diff;
        T low_diff_sq = low_diff * low_diff;

        T sum = 0.0;
        for (int i = 0; i < m; ++i) {
            sum += pij[i + tj * m] * lambda[i] / upp_diff_sq
                 - qij[i + tj * m] * lambda[i] / low_diff_sq;
        }

        rex[tj] = sum
            + p0j[tj] / upp_diff_sq
            - q0j[tj] / low_diff_sq
            - xsi[tj] + eta[tj];
    }
}
1022
1023
// KKT residual for y: rey = c + d*y - lambda - mu, per constraint.
template <typename T>
__global__ void rey_calculation_kernel(T* __restrict__ rey,
    const T* __restrict__ c, const T* __restrict__ d, const T* __restrict__ y,
    const T* __restrict__ lambda, const T* __restrict__ mu, const int n) {
    const int j = blockIdx.x * blockDim.x + threadIdx.x;
    if (j >= n) return;
    rey[j] = c[j] + d[j] * y[j] - lambda[j] - mu[j];
}
1033
1034
// Squared l2-norm partial sums: each block writes the sum of a[i]^2 over
// its grid-stride slice to buf_h[blockIdx.x].
//
// Fix (review): the final warp reduction inside the first warp was missing
// (dropped in doc extraction); restored.
template <typename T>
__global__ void norm_kernel(const T* __restrict__ a,
    T* __restrict__ buf_h, const int n) {
    const int idx = blockIdx.x * blockDim.x + threadIdx.x;
    const int str = blockDim.x * gridDim.x;

    const unsigned int lane = threadIdx.x % warpSize;
    const unsigned int wid = threadIdx.x / warpSize;

    __shared__ T shared[32];
    T sum = T(0);

    for (int i = idx; i < n; i += str) {
        T val = a[i];
        sum += val * val; // faster than pow(val, 2)
    }

    // Warp-level partial sums, then per-warp results in shared memory.
    sum = reduce_warp<T>(sum);
    if (lane == 0)
        shared[wid] = sum;
    __syncthreads();

    // First warp reduces the per-warp sums.
    sum = (threadIdx.x < blockDim.x / warpSize) ? shared[lane] : T(0);
    if (wid == 0)
        sum = reduce_warp<T>(sum);

    if (threadIdx.x == 0)
        buf_h[blockIdx.x] = sum;
}
1064
1065
// Elementwise: a = b*c - d.
template <typename T>
__global__ void sub2cons_kernel(T* __restrict__ a, const T* __restrict__ b,
    const T* __restrict__ c,
    const T d, const int n) {
    const int j = blockIdx.x * blockDim.x + threadIdx.x;
    if (j >= n) return;
    a[j] = b[j] * c[j] - d;
}
1075
1076
// dely = c + d*y - lambda - epsi/y, per constraint.
template <typename T>
__global__ void dely_kernel(T* __restrict__ dely, const T* __restrict__ c,
    const T* __restrict__ d, const T* __restrict__ y,
    const T* __restrict__ lambda, const T epsi, const int n) {
    const int j = blockIdx.x * blockDim.x + threadIdx.x;
    if (j >= n) return;
    const T yj = y[j];
    dely[j] = c[j] + d[j] * yj - lambda[j] - epsi / yj;
}
1086
1087
1088
// Block partial maxima of cons*a[i]/b[i] over a grid-stride range, written
// to temp[blockIdx.x]. Note every thread seeds its running maximum with
// element 0, matching the original semantics.
//
// Fix (review): the warp reductions were missing (dropped in doc
// extraction); restored following the pattern of the other reductions.
template< typename T >
__global__ void maxval2_kernel(const T* __restrict__ a, const T* __restrict__ b,
    T* __restrict__ temp, const T cons, const int n) {

    const int idx = blockIdx.x * blockDim.x + threadIdx.x;
    const int str = blockDim.x * gridDim.x;

    const unsigned int lane = threadIdx.x % warpSize;
    const unsigned int wid = threadIdx.x / warpSize;

    __shared__ T shared[32];
    T maxval = cons * a[0] / b[0];
    for (int i = idx; i < n; i += str)
    {
        maxval = max(maxval, cons * a[i] / b[i]);
    }

    // Warp-level maxima, then per-warp results in shared memory.
    maxval = max_reduce_warp<T>(maxval);
    if (lane == 0)
        shared[wid] = maxval;
    __syncthreads();

    // First warp reduces the per-warp maxima.
    maxval = (threadIdx.x < blockDim.x / warpSize) ? shared[lane] : 0.0;
    if (wid == 0)
        maxval = max_reduce_warp<T>(maxval);

    if (threadIdx.x == 0)
        temp[blockIdx.x] = maxval;

}
1119
1120
// Block partial maxima of cons*a[i]/(b[i]-c[i]) over a grid-stride range,
// written to temp[blockIdx.x].
// NOTE(review): the seed uses a[0]/b[0] (not b[0]-c[0]) — kept as in the
// original, but verify this asymmetry is intentional.
//
// Fix (review): the warp reductions were missing (dropped in doc
// extraction); restored following the pattern of the other reductions.
template< typename T >
__global__ void maxval3_kernel(const T* __restrict__ a, const T* __restrict__ b,
    const T* __restrict__ c, T* __restrict__ temp, const T cons, const int n) {

    const int idx = blockIdx.x * blockDim.x + threadIdx.x;
    const int str = blockDim.x * gridDim.x;

    const unsigned int lane = threadIdx.x % warpSize;
    const unsigned int wid = threadIdx.x / warpSize;

    __shared__ T shared[32];
    T maxval = cons * a[0] / b[0];
    for (int i = idx; i < n; i += str)
    {
        maxval = max(maxval, cons * a[i] / (b[i] - c[i]));
    }

    // Warp-level maxima, then per-warp results in shared memory.
    maxval = max_reduce_warp<T>(maxval);
    if (lane == 0)
        shared[wid] = maxval;
    __syncthreads();

    // First warp reduces the per-warp maxima.
    maxval = (threadIdx.x < blockDim.x / warpSize) ? shared[lane] : 0;
    if (wid == 0)
        maxval = max_reduce_warp<T>(maxval);

    if (threadIdx.x == 0)
        temp[blockIdx.x] = maxval;

}
1151
1152
// KKT residual for x: rex = df0dx + dfdx^T lambda - xsi + eta, with dfdx
// stored per variable (entry (i, tj) at dfdx[i + tj*m]).
template <typename T>
__global__ void kkt_rex_kernel(T* __restrict__ rex, const T* __restrict__ df0dx,
    const T* __restrict__ dfdx, const T* __restrict__ xsi,
    const T* __restrict__ eta, const T* __restrict__ lambda, const int n,
    const int m) {
    const int tj = blockIdx.x * blockDim.x + threadIdx.x;
    if (tj >= n) return;

    // Accumulate the constraint-gradient term in a register.
    T acc = 0.0;
    for (int i = 0; i < m; i++)
        acc = acc + dfdx[i + tj * m] * lambda[i];

    rex[tj] = acc + (df0dx[tj] - xsi[tj] + eta[tj]);
}
1167
1168
// a[j] = max(b, c * d[j]) elementwise.
template <typename T>
__global__ void maxcons_kernel(T* __restrict__ a, const T b,
    const T c, const T* __restrict__ d, const int n) {
    const int j = blockIdx.x * blockDim.x + threadIdx.x;
    if (j >= n) return;
    a[j] = max(b, c * d[j]);
}
1177
// Block partial sums of a[i] over a grid-stride range, written to
// buf_h[blockIdx.x].
//
// Fix (review): the warp reductions were missing (dropped in doc
// extraction); restored following norm_kernel's pattern.
template< typename T >
__global__ void glsum_kernel(const T * a, T * buf_h, const int n) {
    const int idx = blockIdx.x * blockDim.x + threadIdx.x;
    const int str = blockDim.x * gridDim.x;

    const unsigned int lane = threadIdx.x % warpSize;
    const unsigned int wid = threadIdx.x / warpSize;

    __shared__ T shared[32];
    T sum = 0;
    for (int i = idx; i<n ; i += str)
    {
        sum += a[i];
    }

    // Warp-level partial sums, then per-warp results in shared memory.
    sum = reduce_warp<T>(sum);
    if (lane == 0)
        shared[wid] = sum;
    __syncthreads();

    // First warp reduces the per-warp sums.
    sum = (threadIdx.x < blockDim.x / warpSize) ? shared[lane] : 0;
    if (wid == 0)
        sum = reduce_warp<T>(sum);

    if (threadIdx.x == 0)
        buf_h[blockIdx.x] = sum;

}
// Block partial dot products sum(a[i]*b[i]) over a grid-stride range,
// written to buf_h[blockIdx.x].
//
// Fix (review): the warp reductions were missing (dropped in doc
// extraction); restored following norm_kernel's pattern.
template< typename T >
__global__ void glsc2_kernel(const T * a,
    const T * b,
    T * buf_h,
    const int n) {

    const int idx = blockIdx.x * blockDim.x + threadIdx.x;
    const int str = blockDim.x * gridDim.x;

    const unsigned int lane = threadIdx.x % warpSize;
    const unsigned int wid = threadIdx.x / warpSize;

    __shared__ T shared[32];
    T sum = 0.0;
    for (int i = idx; i < n; i+= str) {
        sum += a[i] * b[i];
    }

    // Warp-level partial sums, then per-warp results in shared memory.
    sum = reduce_warp<T>(sum);
    if (lane == 0)
        shared[wid] = sum;
    __syncthreads();

    // First warp reduces the per-warp sums.
    sum = (threadIdx.x < blockDim.x / warpSize) ? shared[lane] : 0;
    if (wid == 0)
        sum = reduce_warp<T>(sum);

    if (threadIdx.x == 0)
        buf_h[blockIdx.x] = sum;

}
1239
1240
1241
1242
1243
// a[j] += c / b[j] elementwise.
template <typename T>
__global__ void add2inv2_kernel(T* __restrict__ a, const T* __restrict__ b,
    const T c, const int n) {
    const int j = blockIdx.x * blockDim.x + threadIdx.x;
    if (j >= n) return;
    a[j] = a[j] + c / b[j];
}
1252
// a[j] = max(b, d * c[j]) elementwise.
template <typename T>
__global__ void max2_kernel(T* __restrict__ a, const T b,
    const T* __restrict__ c, const T d, const int n) {
    const int j = blockIdx.x * blockDim.x + threadIdx.x;
    if (j >= n) return;
    a[j] = max(b, d * c[j]);
}
1261
// Right-hand side of the bordered dual system:
// bb[j] = dellambda[j] + dely[j]/(d[j] + mu[j]/y[j]) - bb[j] for j < m
// (note bb is read-modified), and bb[m] = delz.
template <typename T>
__global__ void updatebb_kernel(T* __restrict__ bb,
    const T* __restrict__ dellambda, const T* __restrict__ dely,
    const T* __restrict__ d, const T* __restrict__ mu,
    const T* __restrict__ y, const T delz, const int m) {
    const int j = blockIdx.x * blockDim.x + threadIdx.x;
    if (j < m) {
        bb[j] = dellambda[j] + dely[j] / (d[j] + mu[j] / y[j]) - bb[j];
    } else if (j == m) {
        bb[j] = delz;
    }
}
1273
1274
1275
// Assemble the bordered (m+1)x(m+1) matrix AA from the m x m block
// globaltmp_mm: diagonal gets + s/lambda + 1/(d + mu/y), the border
// row/column get a, and the bottom-right corner gets -zeta/z.
template <typename T>
__global__ void updateAA_kernel(T* __restrict__ AA,
    const T* __restrict__ globaltmp_mm, const T* __restrict__ s,
    const T* __restrict__ lambda, const T* __restrict__ d,
    const T* __restrict__ mu, const T* __restrict__ y, const T* __restrict__ a,
    const T zeta, const T z, const int m) {
    const int j = blockIdx.x * blockDim.x + threadIdx.x;
    const int ld = m + 1;  // leading dimension of the bordered matrix
    if (j < m) {
        // Diagonal plus border entries for constraint j.
        AA[j + j * ld] = globaltmp_mm[j + j * m] + (s[j] / lambda[j] +
            1.0 / (d[j] + mu[j] / y[j]));
        AA[j + m * ld] = a[j];
        AA[m + j * ld] = a[j];
    } else if (j == m) {
        // Bottom-right corner.
        AA[j + j * ld] = -zeta / z;
    }
}
1293
// dy = (dlambda - dely) / (d + mu/y), per constraint.
template <typename T>
__global__ void dy_kernel(T* __restrict__ dy, const T* __restrict__ dely,
    const T* __restrict__ dlambda, const T* __restrict__ d,
    const T* __restrict__ mu, const T* __restrict__ y, const int n) {
    const int j = blockIdx.x * blockDim.x + threadIdx.x;
    if (j >= n) return;
    dy[j] = (-dely[j] + dlambda[j]) / (d[j] + mu[j] / y[j]);
}
1302
1303#endif
1304
__global__ void heaviside_mapping_apply_kernel(const T beta, const T eta, T *__restrict__ X_out_d, T *__restrict__ X_in_d, const int n)