Neko-TOP
A portable framework for high-order spectral element flow topology optimization.
mma_kernel.h
#ifndef MMA_CUDA_KERNEL_H
#define MMA_CUDA_KERNEL_H
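
// CUDA kernels for the GPU backend of the MMA (Method of Moving Asymptotes)
// optimizer in Neko-TOP: asymptote and bound updates, the interior-point
// subsolves, and the block-level reductions they rely on.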

template <typename T>
__global__ void mma_prepare_aa_matrix_kernel(T* __restrict__ AA,
    const T* __restrict__ s, const T* __restrict__ lambda,
    const T* __restrict__ d, const T* __restrict__ mu,
    const T* __restrict__ y, const T* __restrict__ a,
    const T zeta, const T z, const int m) {
    const int tj = blockIdx.x * blockDim.x + threadIdx.x;
    const int matrix_size = m + 1;

    if (tj >= m) return;
    AA[tj * matrix_size + tj] += s[tj] / lambda[tj] +
        (T)1.0 / (d[tj] + mu[tj] / y[tj]);
    AA[tj * matrix_size + m] = a[tj]; // column m+1
    AA[m * matrix_size + tj] = a[tj]; // row m+1

    // Only the first thread updates the bottom-right corner element.
    if (tj == 0)
        AA[m * matrix_size + m] = -zeta / z;
}
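
// Illustrative usage (a sketch, not part of the original header): one thread
// per constraint row. The wrapper name, 256-thread block size, stream argument
// and d_* pointer names are assumptions.
template <typename T>
inline void launch_mma_prepare_aa_matrix_example(T* d_AA, const T* d_s,
    const T* d_lambda, const T* d_d, const T* d_mu, const T* d_y,
    const T* d_a, const T zeta, const T z, const int m,
    cudaStream_t stream = 0) {
    const int threads = 256;
    const int blocks = (m + threads - 1) / threads;
    mma_prepare_aa_matrix_kernel<T><<<blocks, threads, 0, stream>>>(
        d_AA, d_s, d_lambda, d_d, d_mu, d_y, d_a, zeta, z, m);
}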

// Update Hessian diagonal elements (y contributions for dip subsolve)
template <typename T>
__global__ void mma_update_hessian_diagonal_kernel(T* __restrict__ Hess,
    const T* __restrict__ y, const T* __restrict__ d,
    const T* __restrict__ mu, const T* __restrict__ lambda, const int m) {
    int tj = blockIdx.x * blockDim.x + threadIdx.x;
    if (tj >= m) return;

    T diag = Hess[tj * m + tj];
    // Contribution from y terms (inactive constraints)
    if (y[tj] > (T)0.0) {
        if (fabs(d[tj]) >= (T)1.0e-15) {
            diag -= (T)1.0 / d[tj];
        }
        // else: skip - equivalent to subtracting 1.0/1.0e-8
    }

    // Contribution from -mu/lambda (eq 10)
    diag -= mu[tj] / lambda[tj];
    Hess[tj * m + tj] = diag;
}

// Heuristic Levenberg-Marquardt style regularization of the Hessian diagonal.
// Single-block version for m <= 1024.
template <typename T>
__global__ void mma_stabilize_hessian_single_kernel(T* __restrict__ Hess, const int m) {
    const int tid = threadIdx.x;

    // Single thread computes trace and LM factor
    if (tid == 0) {
        T trace = (T)0.0;
        for (int j = 0; j < m; j++) {
            trace += Hess[j * m + j];
        }
        T lm_factor = max((T)(-1.0e-4) * trace / m, (T)1.0e-7);

        // Apply to all diagonal elements
        for (int j = 0; j < m; j++) {
            Hess[j * m + j] -= lm_factor;
        }
    }
}
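
// Note: only thread 0 does any work here, so a minimal single-block launch
// such as mma_stabilize_hessian_single_kernel<T><<<1, 1>>>(d_Hess, m) is
// sufficient (the d_Hess name is an assumption).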

// Heuristic Levenberg-Marquardt style regularization of the Hessian diagonal.
// Multi-block version for m > 1024; the precomputed lm_factor is passed in.
template <typename T>
__global__ void mma_stabilize_hessian_multi_kernel(T* __restrict__ Hess, const T lm_factor, const int m) {
    const int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < m) {
        Hess[i * m + i] -= lm_factor;
    }
}

// Small linear solver for MMA (n <= 50): in-place LU factorization with
// partial pivoting followed by forward and backward substitution. Must be
// launched as a single block, since it relies on __syncthreads().
template <typename T>
__global__ void mma_small_lu_kernel(T* __restrict__ A, T* __restrict__ b, const int n) {
    const int tid = threadIdx.x;

    // Handle 1x1 case
    if (n == 1) {
        if (tid == 0 && abs(A[0]) > (T)1e-12) b[0] /= A[0];
        return;
    }

    // LU decomposition with partial pivoting
    for (int k = 0; k < n; k++) {
        // Pivoting - single thread
        if (tid == 0) {
            int max_row = k;
            T max_val = abs(A[k * n + k]);
            for (int i = k + 1; i < n; i++) {
                T val = abs(A[i * n + k]);
                if (val > max_val) {
                    max_val = val;
                    max_row = i;
                }
            }
            if (max_val > (T)1e-12 && max_row != k) {
                // Swap rows
                for (int j = k; j < n; j++) {
                    T temp = A[k * n + j];
                    A[k * n + j] = A[max_row * n + j];
                    A[max_row * n + j] = temp;
                }
                // Swap rhs
                T temp_b = b[k];
                b[k] = b[max_row];
                b[max_row] = temp_b;
            }
        }
        __syncthreads();

        // Parallel elimination
        T diag = A[k * n + k];
        if (abs(diag) > (T)1e-12) {
            for (int i = tid + k + 1; i < n; i += blockDim.x) {
                T factor = A[i * n + k] / diag;
                A[i * n + k] = factor;
                for (int j = k + 1; j < n; j++) {
                    A[i * n + j] -= factor * A[k * n + j];
                }
            }
        }
        __syncthreads();
    }

    // Parallel forward substitution
    for (int i = tid; i < n; i += blockDim.x) {
        T sum = b[i];
        for (int j = 0; j < i; j++) {
            sum -= A[i * n + j] * b[j];
        }
        b[i] = sum;
    }
    __syncthreads();

    // Parallel backward substitution
    for (int i = n - 1 - tid; i >= 0; i -= blockDim.x) {
        T sum = b[i];
        for (int j = i + 1; j < n; j++) {
            sum -= A[i * n + j] * b[j];
        }
        if (abs(A[i * n + i]) > (T)1e-12) {
            b[i] = sum / A[i * n + i];
        }
    }
}
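
// Illustrative usage (a sketch, not part of the original header): the solver
// targets one small dense system, so it is launched as a single block; the
// 64-thread block size and the d_* pointer names are assumptions.
//   mma_small_lu_kernel<double><<<1, 64>>>(d_A, d_b, n);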

template <typename T>
__global__ void delta_1dbeam_kernel(T* __restrict__ Delta,
    const T L_total, const T Le, const int offset, const int n) {
    int k = blockIdx.x * blockDim.x + threadIdx.x;
    if (k >= n) return;

    // Convert to 1-based indexing for the calculation
    int idx = k + 1;

    // Calculate first term: (L_total - Le*(offset+idx-1))^3
    T x1 = L_total - Le * static_cast<T>(offset + idx - 1);
    T term1 = x1 * x1 * x1;

    // Calculate second term: (L_total - Le*(offset+idx))^3
    T x2 = L_total - Le * static_cast<T>(offset + idx);
    T term2 = x2 * x2 * x2;

    // Final result
    Delta[k] = (term1 - term2) / static_cast<T>(3.0);
}
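
// Note (an interpretation, not stated in the original comments): with
// a = Le*(offset+idx-1) and b = Le*(offset+idx), Delta[k] equals the exact
// integral of (L_total - x)^2 over [a, b], since
//   integral_a^b (L_total - x)^2 dx = ((L_total - a)^3 - (L_total - b)^3) / 3,
// which is exactly the (term1 - term2)/3 expression above.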

template <typename T>
__global__ void mma_Ljjxinv_kernel(T* __restrict__ Ljjxinv,
    const T* __restrict__ pjlambda, const T* __restrict__ qjlambda,
    const T* __restrict__ x, const T* __restrict__ low, const T* __restrict__ upp,
    const T* __restrict__ alpha, const T* __restrict__ beta,
    const int n) {
    int tj = blockIdx.x * blockDim.x + threadIdx.x;
    if (tj >= n) return;

    // Load inputs into registers
    const T xt = x[tj];
    const T low_j = low[tj];
    const T upp_j = upp[tj];
    const T pj = pjlambda[tj];
    const T qj = qjlambda[tj];

    // Precompute reused differences
    const T diff_u = upp_j - xt;
    const T diff_l = xt - low_j;

    // Cube once (avoiding pow for speed)
    const T diff_u3 = diff_u * diff_u * diff_u;
    const T diff_l3 = diff_l * diff_l * diff_l;

    // Compute inverse value safely
    const T denom = T(2.0) * pj / diff_u3 + T(2.0) * qj / diff_l3;
    const T val = T(-1.0) / denom;

    // Mask out active primal constraints
    const bool active = (fabs(xt - alpha[tj]) <= T(1e-16)) ||
                        (fabs(xt - beta[tj]) <= T(1e-16));

    Ljjxinv[tj] = active ? T(0.0) : val;
}

template <typename T>
__global__ void mma_dipsolvesub1_kernel(T* __restrict__ x,
    const T* __restrict__ pjlambda, const T* __restrict__ qjlambda,
    const T* __restrict__ low, const T* __restrict__ upp,
    const T* __restrict__ alpha, const T* __restrict__ beta,
    const int n) {

    int tj = blockIdx.x * blockDim.x + threadIdx.x;
    if (tj >= n) return;

    // Load inputs into registers
    const T pj_sqrt = sqrt(pjlambda[tj]);
    const T qj_sqrt = sqrt(qjlambda[tj]);
    const T denom = pj_sqrt + qj_sqrt;

    const T low_j = low[tj];
    const T upp_j = upp[tj];
    const T alpha_j = alpha[tj];
    const T beta_j = beta[tj];

    // Weighted average
    const T val = (pj_sqrt * low_j + qj_sqrt * upp_j) / denom;

    // Clamp x between alpha and beta using branchless min/max
    x[tj] = fmin(fmax(val, alpha_j), beta_j);
}
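
// Background note (a derivation, not in the original comments): the unclamped
// value above is the stationary point of pjlambda/(upp - x) + qjlambda/(x - low).
// Setting the derivative to zero gives sqrt(p)*(x - low) = sqrt(q)*(upp - x),
// hence x = (sqrt(p)*low + sqrt(q)*upp) / (sqrt(p) + sqrt(q)).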

template <typename T>
__global__ void mattrans_v_mul_kernel(T* __restrict__ output,
    const T* __restrict__ pij, const T* __restrict__ lambda,
    const int m, const int n) {

    int tj = blockIdx.x * blockDim.x + threadIdx.x;
    if (tj >= n) return;

    T acc = T(0);

    // Use register accumulation
    for (int i = 0; i < m; ++i) {
        acc += pij[i + tj * m] * lambda[i];
    }

    output[tj] = acc;
}

template <typename T>
__global__ void mma_sub1_kernel(
    T* __restrict__ xlow,
    T* __restrict__ xupp,
    const T* __restrict__ x,
    const T* __restrict__ xmin,
    const T* __restrict__ xmax,
    const T asyinit,
    const int n) {
    int tj = blockIdx.x * blockDim.x + threadIdx.x;
    if (tj >= n) return;

    // Load values once into registers (global memory is slow)
    const T xt = x[tj];
    const T xmin_j = xmin[tj];
    const T xmax_j = xmax[tj];

    // Reuse xgap calculation
    const T xgap = xmax_j - xmin_j;
    const T offset = asyinit * xgap;

    xlow[tj] = xt - offset;
    xupp[tj] = xt + offset;
}
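
// Launch note (a sketch, not part of the original header): the per-variable
// kernels in this file all follow the same one-thread-per-entry pattern, e.g.
// with an assumed 256-thread block and hypothetical d_* device pointers:
//   const int threads = 256;
//   mma_sub1_kernel<double><<<(n + threads - 1) / threads, threads>>>(
//       d_xlow, d_xupp, d_x, d_xmin, d_xmax, asyinit, n);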

template <typename T>
__global__ void mma_sub2_kernel(T* __restrict__ low, T* __restrict__ upp,
    const T* __restrict__ x, const T* __restrict__ xold1,
    const T* __restrict__ xold2, const T* __restrict__ xdiff,
    const T asydecr, const T asyincr, const int n) {
    int tj = blockIdx.x * blockDim.x + threadIdx.x;
    if (tj >= n) return;

    // Load data into registers (faster than repeated global memory accesses)
    const T xval = x[tj];
    const T xold1val = xold1[tj];
    const T xold2val = xold2[tj];
    const T lowval = low[tj];
    const T uppval = upp[tj];
    const T xdiffval = xdiff[tj];

    // Compute the product
    const T prod = (xval - xold1val) * (xold1val - xold2val);

    // Select asy_factor without branching
    const T asy_factor = (prod < T(0)) ? asydecr :
                         (prod > T(0)) ? asyincr : T(1);

    // Update low and upp (standard MMA asymptote update)
    T new_low = xval - asy_factor * (xold1val - lowval);
    T new_upp = xval + asy_factor * (uppval - xold1val);

    // Apply bounds
    new_low = max(new_low, xval - T(10.0) * xdiffval);
    new_low = min(new_low, xval - T(0.01) * xdiffval);

    new_upp = min(new_upp, xval + T(10.0) * xdiffval);
    new_upp = max(new_upp, xval + T(0.01) * xdiffval);

    // Write results back
    low[tj] = new_low;
    upp[tj] = new_upp;
}

template <typename T>
__global__ void mma_sub3_kernel(const T* __restrict__ x,
    const T* __restrict__ df0dx, const T* __restrict__ dfdx,
    T* __restrict__ low, T* __restrict__ upp, const T* __restrict__ xmin,
    const T* __restrict__ xmax, T* __restrict__ alpha, T* __restrict__ beta,
    T* __restrict__ p0j, T* __restrict__ q0j, T* __restrict__ pij,
    T* __restrict__ qij, const int n, const int m) {
    int tj = blockIdx.x * blockDim.x + threadIdx.x;
    if (tj >= n) return;

    // Load into registers once
    const T xt = x[tj];
    const T xmin_j = xmin[tj];
    const T xmax_j = xmax[tj];
    const T low_j = low[tj];
    const T upp_j = upp[tj];
    const T df0 = df0dx[tj];
    const T xgap = xmax_j - xmin_j;

    // Clamp helpers
    const T half_xgap = T(0.5) * xgap;
    const T tenth_low_diff = T(0.1) * (xt - low_j);
    const T tenth_upp_diff = T(0.1) * (upp_j - xt);

    // Compute alpha and beta (standard MMA move limits)
    const T alpha_val = max(max(xmin_j, low_j + tenth_low_diff), xt - half_xgap);
    const T beta_val = min(min(xmax_j, upp_j - tenth_upp_diff), xt + half_xgap);

    alpha[tj] = alpha_val;
    beta[tj] = beta_val;

    // Avoid multiple pow calls, compute once
    const T upp_minus_x = upp_j - xt;
    const T x_minus_low = xt - low_j;
    const T upp_minus_x_sq = upp_minus_x * upp_minus_x;
    const T x_minus_low_sq = x_minus_low * x_minus_low;

    // Small epsilon for numerical stability
    const T eps = T(1e-5);
    const T inv_xgap = T(1.0) / max(eps, xgap);

    // Compute terms reused multiple times
    const T max_df0_pos = max(df0, T(0));
    const T max_df0_neg = max(-df0, T(0));

    p0j[tj] = upp_minus_x_sq * (T(1.001) * max_df0_pos +
        T(0.001) * max_df0_neg + eps * inv_xgap);
    q0j[tj] = x_minus_low_sq * (T(0.001) * max_df0_pos +
        T(1.001) * max_df0_neg + eps * inv_xgap);

    // Loop over m for pij and qij
    for (int i = 0; i < m; ++i) {
        int idx = i + tj * m;

        T dfdx_val = dfdx[idx];
        T max_pos = max(dfdx_val, T(0));
        T max_neg = max(-dfdx_val, T(0));

        pij[idx] = upp_minus_x_sq * (T(1.001) * max_pos +
            T(0.001) * max_neg + eps * inv_xgap);
        qij[idx] = x_minus_low_sq * (T(0.001) * max_pos +
            T(1.001) * max_neg + eps * inv_xgap);
    }
}

template <typename T>
__global__ void mma_sub4_kernel(
    const T* __restrict__ x,
    const T* __restrict__ low,
    const T* __restrict__ upp,
    const T* __restrict__ pij,
    const T* __restrict__ qij,
    T* __restrict__ temp,
    const int n,
    const int m) {

    int tj = blockIdx.x * blockDim.x + threadIdx.x;
    if (tj >= n) return;

    // Register caching
    const T xt = x[tj];
    const T low_j = low[tj];
    const T upp_j = upp[tj];

    const T denom_upp = upp_j - xt;
    const T denom_low = xt - low_j;

    const T eps = T(1e-12); // Precision-dependent epsilon
    const T inv_denom_upp = T(1) / max(denom_upp, eps);
    const T inv_denom_low = T(1) / max(denom_low, eps);

    const int base_idx = tj * m;

    for (int i = 0; i < m; ++i) {
        int idx = base_idx + i;
        temp[idx] = pij[idx] * inv_denom_upp + qij[idx] * inv_denom_low;
    }
}

template <typename T>
__global__ void mma_max2_kernel(
    T* __restrict__ xsi,
    const T* __restrict__ x,
    const T* __restrict__ alpha,
    const int n) {

    int tj = blockIdx.x * blockDim.x + threadIdx.x;
    if (tj >= n) return;

    const T eps = T(1e-12);
    T denom = max(x[tj] - alpha[tj], eps);
    xsi[tj] = max(T(1), T(1) / denom);
}

template <typename T>
__global__ void relambda_kernel(
    T* __restrict__ temp,
    const T* __restrict__ x,
    const T* __restrict__ xupp,
    const T* __restrict__ xlow,
    const T* __restrict__ pij,
    const T* __restrict__ qij,
    const int n,
    const int m) {

    int tj = blockIdx.x * blockDim.x + threadIdx.x;
    if (tj >= n) return;

    const T xt = x[tj];
    const T xup = xupp[tj];
    const T xlo = xlow[tj];

    // Prevent divide-by-zero using small epsilon
    const T eps = T(1e-12);
    const T inv_denom_upp = T(1) / max(xup - xt, eps);
    const T inv_denom_low = T(1) / max(xt - xlo, eps);

    int base_idx = tj * m;
    for (int i = 0; i < m; ++i) {
        int idx = base_idx + i;
        temp[idx] = pij[idx] * inv_denom_upp + qij[idx] * inv_denom_low;
    }
}

template <typename T>
__global__ void sub2cons2_kernel(
    T* __restrict__ a,
    const T* __restrict__ b,
    const T* __restrict__ c,
    const T* __restrict__ d,
    const T e,
    const int n) {
    const int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx >= n) return;

    const T bt = b[idx];
    const T ct = c[idx];
    const T dt = d[idx];

    a[idx] = bt * (ct - dt) - e;
}

template <typename T>
__inline__ __device__ T max_reduce_warp(T val) {
    val = max(val, __shfl_down_sync(0xffffffff, val, 16));
    val = max(val, __shfl_down_sync(0xffffffff, val, 8));
    val = max(val, __shfl_down_sync(0xffffffff, val, 4));
    val = max(val, __shfl_down_sync(0xffffffff, val, 2));
    val = max(val, __shfl_down_sync(0xffffffff, val, 1));
    return val;
}
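
// Note: max_reduce_warp folds a full 32-lane warp in five __shfl_down_sync
// steps; after the call, lane 0 of the warp holds the warp-wide maximum.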

template <typename T>
__global__ void maxval_kernel(const T* __restrict__ a, T* temp, const int n) {
    const int idx = blockIdx.x * blockDim.x + threadIdx.x;
    const int stride = blockDim.x * gridDim.x;

    const unsigned int lane = threadIdx.x % warpSize;
    const unsigned int warp_id = threadIdx.x / warpSize;

    // Shared memory to store warp-level maxima
    __shared__ T shared[32]; // Assumes max 1024 threads per block

    T local_max = T(0);
    for (int i = idx; i < n; i += stride) {
        local_max = max(local_max, abs(a[i]));
    }

    // Warp-level reduction
    local_max = max_reduce_warp<T>(local_max);

    // Write warp-level results to shared memory
    if (lane == 0) {
        shared[warp_id] = local_max;
    }
    __syncthreads();

    // Let the first warp reduce shared values
    local_max = (threadIdx.x < blockDim.x / warpSize) ? shared[lane] : T(0);

    if (warp_id == 0) {
        local_max = max_reduce_warp<T>(local_max);
    }

    if (threadIdx.x == 0) {
        temp[blockIdx.x] = local_max;
    }
}

template <typename T>
__global__ void max_reduce_kernel(T* __restrict__ bufred, const int n) {

    T maxval = T(0);
    const int idx = blockIdx.x * blockDim.x + threadIdx.x;
    const int stride = blockDim.x * gridDim.x;

    // Grid-stride loop to cover all elements
    for (int i = idx; i < n; i += stride) {
        maxval = max(maxval, bufred[i]);
    }

    __shared__ T shared[32]; // One slot per warp (max 1024 threads/block)

    unsigned int lane = threadIdx.x % warpSize;
    unsigned int wid = threadIdx.x / warpSize;

    // Warp-level max reduction
    maxval = max_reduce_warp<T>(maxval);

    // Store each warp's max value in shared memory
    if (lane == 0) {
        shared[wid] = maxval;
    }
    __syncthreads();

    // Let the first warp reduce the warp-level maxima
    maxval = (threadIdx.x < blockDim.x / warpSize) ? shared[lane] : T(0);
    if (wid == 0) {
        maxval = max_reduce_warp<T>(maxval);
    }

    // Write the block's maximum value back to bufred[blockIdx.x]
    if (threadIdx.x == 0) {
        bufred[blockIdx.x] = maxval;
    }
}
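
// Illustrative two-pass reduction (a sketch, not part of the original header):
// maxval_kernel leaves one partial maximum per block in a work buffer, and a
// single-block max_reduce_kernel launch collapses those partials. The wrapper
// name, block size, block-count cap and d_* buffer names are assumptions.
template <typename T>
inline T device_abs_max_example(const T* d_a, T* d_work, const int n) {
    const int threads = 256;
    int blocks = (n + threads - 1) / threads;
    if (blocks > 1024) blocks = 1024;
    maxval_kernel<T><<<blocks, threads>>>(d_a, d_work, n);
    max_reduce_kernel<T><<<1, threads>>>(d_work, blocks);
    T result;
    cudaMemcpy(&result, d_work, sizeof(T), cudaMemcpyDeviceToHost);
    return result;
}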

template <typename T>
__global__ void delx_kernel(
    T* __restrict__ delx,
    const T* __restrict__ x,
    const T* __restrict__ xlow,
    const T* __restrict__ xupp,
    const T* __restrict__ pij,
    const T* __restrict__ qij,
    const T* __restrict__ p0j,
    const T* __restrict__ q0j,
    const T* __restrict__ alpha,
    const T* __restrict__ beta,
    const T* __restrict__ lambda,
    const T epsi,
    const int n,
    const int m)
{
    int tj = blockIdx.x * blockDim.x + threadIdx.x;
    if (tj < n) {
        T xt = x[tj];
        T xlow_j = xlow[tj];
        T xupp_j = xupp[tj];
        T alpha_j = alpha[tj];
        T beta_j = beta[tj];

        // Precompute denominators squared for better performance
        T denom_low = xt - xlow_j;
        T denom_upp = xupp_j - xt;
        T denom_alpha = xt - alpha_j;
        T denom_beta = beta_j - xt;

        // Guard the denominators against division by zero
        const T small_eps = T(1e-12);
        denom_low = max(denom_low, small_eps);
        denom_upp = max(denom_upp, small_eps);
        denom_alpha = max(denom_alpha, small_eps);
        denom_beta = max(denom_beta, small_eps);

        T sum = T(0);
        for (int i = 0; i < m; i++) {
            T lambda_i = lambda[i];
            sum += pij[i + tj * m] * lambda_i / (denom_upp * denom_upp)
                 - qij[i + tj * m] * lambda_i / (denom_low * denom_low);
        }
        sum += p0j[tj] / (denom_upp * denom_upp)
             - q0j[tj] / (denom_low * denom_low)
             - epsi / denom_alpha
             + epsi / denom_beta;

        delx[tj] = sum;
    }
}

template <typename T>
__global__ void GG_kernel(
    T* __restrict__ GG,
    const T* __restrict__ x,
    const T* __restrict__ xlow,
    const T* __restrict__ xupp,
    const T* __restrict__ pij,
    const T* __restrict__ qij,
    const int n,
    const int m) {

    int tj = blockIdx.x * blockDim.x + threadIdx.x;
    if (tj < n) {
        T xt = x[tj];
        T xlow_j = xlow[tj];
        T xupp_j = xupp[tj];

        // Distances from bounds
        T diff_upper = xupp_j - xt;
        T diff_lower = xt - xlow_j;

        // Squared distances
        T diff_upper2 = diff_upper * diff_upper;
        T diff_lower2 = diff_lower * diff_lower;

        for (int i = 0; i < m; i++) {
            int idx = i + tj * m;
            GG[idx] = pij[idx] / diff_upper2 - qij[idx] / diff_lower2;
        }
    }
}

template <typename T>
__global__ void diagx_kernel(T* __restrict__ diagx, const T* __restrict__ x,
    const T* __restrict__ xsi, const T* __restrict__ xlow,
    const T* __restrict__ xupp, const T* __restrict__ p0j,
    const T* __restrict__ q0j, const T* __restrict__ pij,
    const T* __restrict__ qij, const T* alpha, const T* beta,
    const T* eta, const T* lambda, const int n, const int m) {
    int tj = blockIdx.x * blockDim.x + threadIdx.x;
    if (tj < n) {
        T sum = 0;
        T sum1 = 0;
        for (int i = 0; i < m; i++) {
            sum = sum + pij[tj * m + i] * lambda[i];
            sum1 = sum1 + qij[tj * m + i] * lambda[i];
        }
        diagx[tj] = (p0j[tj] + sum) / pow(xupp[tj] - x[tj], 3) +
            (q0j[tj] + sum1) / pow(x[tj] - xlow[tj], 3);
        diagx[tj] = 2.0 * diagx[tj] + xsi[tj] / (x[tj] - alpha[tj]) +
            eta[tj] / (beta[tj] - x[tj]);
    }
}

template <typename T>
__inline__ __device__ T reduce_warp(T val) {
    val += __shfl_down_sync(0xffffffff, val, 16);
    val += __shfl_down_sync(0xffffffff, val, 8);
    val += __shfl_down_sync(0xffffffff, val, 4);
    val += __shfl_down_sync(0xffffffff, val, 2);
    val += __shfl_down_sync(0xffffffff, val, 1);
    return val;
}

template <typename T>
__global__ void mmareduce_kernel(T* __restrict__ bufred, const int n) {

    T sum = 0;
    const int idx = blockIdx.x * blockDim.x + threadIdx.x;
    const int str = blockDim.x * gridDim.x;
    for (int i = idx; i < n; i += str) {
        sum += bufred[i];
    }

    __shared__ T shared[32];
    unsigned int lane = threadIdx.x % warpSize;
    unsigned int wid = threadIdx.x / warpSize;

    sum = reduce_warp<T>(sum);
    if (lane == 0)
        shared[wid] = sum;
    __syncthreads();

    sum = (threadIdx.x < blockDim.x / warpSize) ? shared[lane] : 0;
    if (wid == 0)
        sum = reduce_warp<T>(sum);

    if (threadIdx.x == 0)
        bufred[blockIdx.x] = sum;
}

template <typename T>
__global__ void mmasum_kernel(const T* __restrict__ a, T* __restrict__ buf_h,
    const int n, const int m, const int k) {

    const int idx = blockIdx.x * blockDim.x + threadIdx.x;
    const int str = blockDim.x * gridDim.x;

    const unsigned int lane = threadIdx.x % warpSize;
    const unsigned int wid = threadIdx.x / warpSize;

    __shared__ T shared[32];
    T sum = 0;
    for (int i = idx; i < n; i += str) {
        sum += a[m * i + k];
    }

    sum = reduce_warp<T>(sum);
    if (lane == 0)
        shared[wid] = sum;
    __syncthreads();

    sum = (threadIdx.x < blockDim.x / warpSize) ? shared[lane] : 0;
    if (wid == 0)
        sum = reduce_warp<T>(sum);

    if (threadIdx.x == 0)
        buf_h[blockIdx.x] = sum;
}
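
// Note: mmasum_kernel, mmasumbb_kernel, mmasumHess_kernel and mmasumAA_kernel
// below each leave one partial sum per block in buf_h; those partials can then
// be collapsed with a single-block mmareduce_kernel launch (or summed on the
// host), mirroring the max-reduction pair above.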

template <typename T>
__global__ void mmasumbb_kernel(const T* __restrict__ GG,
    const T* __restrict__ delx, const T* __restrict__ diagx,
    T* __restrict__ buf_h, const int n, const int m, const int k) {

    const int idx = blockIdx.x * blockDim.x + threadIdx.x;
    const int str = blockDim.x * gridDim.x;

    const unsigned int lane = threadIdx.x % warpSize;
    const unsigned int wid = threadIdx.x / warpSize;

    __shared__ T shared[32];
    T sum = 0;
    for (int i = idx; i < n; i += str) {
        sum += GG[k + i * m] * delx[i] / diagx[i];
    }

    sum = reduce_warp<T>(sum);
    if (lane == 0)
        shared[wid] = sum;
    __syncthreads();

    sum = (threadIdx.x < blockDim.x / warpSize) ? shared[lane] : 0;
    if (wid == 0)
        sum = reduce_warp<T>(sum);

    if (threadIdx.x == 0)
        buf_h[blockIdx.x] = sum;
}

template <typename T>
__global__ void mmasumHess_kernel(
    const T* __restrict__ hijx,
    const T* __restrict__ Ljjxinv,
    T* __restrict__ buf_h,
    const int n,
    const int m,
    const int k0,
    const int k1)
{
    const int idx = blockIdx.x * blockDim.x + threadIdx.x;
    const int stride = blockDim.x * gridDim.x;

    // Warp lane and warp id within the block
    const unsigned int lane = threadIdx.x % warpSize;
    const unsigned int wid = threadIdx.x / warpSize;

    __shared__ T shared[32];

    T sum = T(0);

    // Grid-stride loop for global reduction
    for (int i = idx; i < n; i += stride) {
        // hijx is indexed as hijx[offset + row * m]
        T val0 = hijx[k0 + i * m];
        T val1 = hijx[k1 + i * m];
        sum += val0 * Ljjxinv[i] * val1;
    }

    // Warp-level reduction
    sum = reduce_warp<T>(sum);

    // Write each warp's partial sum to shared memory
    if (lane == 0) {
        shared[wid] = sum;
    }
    __syncthreads();

    // First warp reduces the warp sums in shared memory
    sum = (threadIdx.x < blockDim.x / warpSize) ? shared[lane] : T(0);
    if (wid == 0) {
        sum = reduce_warp<T>(sum);
    }

    // Write final result from first thread of block
    if (threadIdx.x == 0) {
        buf_h[blockIdx.x] = sum;
    }
}

template <typename T>
__global__ void mmasumAA_kernel(const T* __restrict__ GG,
    const T* __restrict__ diagx, T* __restrict__ buf_h, const int n,
    const int m, const int k0, const int k1) {

    const int idx = blockIdx.x * blockDim.x + threadIdx.x;
    const int str = blockDim.x * gridDim.x;

    const unsigned int lane = threadIdx.x % warpSize;
    const unsigned int wid = threadIdx.x / warpSize;

    __shared__ T shared[32];
    T sum = 0;
    for (int i = idx; i < n; i += str) {
        sum += GG[k0 + i * m] / diagx[i] * GG[k1 + i * m];
    }

    sum = reduce_warp<T>(sum);
    if (lane == 0)
        shared[wid] = sum;
    __syncthreads();

    sum = (threadIdx.x < blockDim.x / warpSize) ? shared[lane] : 0;
    if (wid == 0)
        sum = reduce_warp<T>(sum);

    if (threadIdx.x == 0)
        buf_h[blockIdx.x] = sum;
}

template <typename T>
__global__ void mma_copy_kernel(T* __restrict__ a, const T* __restrict__ b,
    const int n, const int m) {
    int tj = blockIdx.x * blockDim.x + threadIdx.x;
    if (tj < n)
        a[tj + m] = b[tj];
}

template <typename T>
__global__ void AA_kernel(T* __restrict__ temp, const T* __restrict__ GG,
    const T* __restrict__ diagx, const int n, const int m) {
    int tj = blockIdx.x * blockDim.x + threadIdx.x;
    if (tj < n) {
        for (int i0 = 0; i0 < m; i0++) {
            for (int i1 = 0; i1 < m; i1++) {
                temp[tj + i0 * (n + 1) + i1 * (m + 1) * (n + 1)] = GG[i0 * n + tj] *
                    (1.0 / diagx[tj]) * GG[i1 * n + tj];
            }
        }
    }
}

template <typename T>
__global__ void dx_kernel(T* __restrict__ dx, const T* __restrict__ delx,
    const T* __restrict__ diagx, const T* __restrict__ GG,
    const T* __restrict__ dlambda, const int n, const int m) {
    int tj = blockIdx.x * blockDim.x + threadIdx.x;
    if (tj < n) {
        dx[tj] = -delx[tj] / diagx[tj];
        for (int i = 0; i < m; i++) {
            dx[tj] = dx[tj] - GG[tj * m + i] * dlambda[i] / diagx[tj];
        }
    }
}

template <typename T>
__global__ void dxsi_kernel(T* __restrict__ dxsi, const T* __restrict__ xsi,
    const T* __restrict__ dx, const T* __restrict__ x,
    const T* __restrict__ alpha, const T epsi, const int n) {
    int tj = blockIdx.x * blockDim.x + threadIdx.x;
    if (tj < n) {
        dxsi[tj] = -xsi[tj] + (epsi - dx[tj] * xsi[tj]) / (x[tj] - alpha[tj]);
    }
}

template <typename T>
__global__ void deta_kernel(T* __restrict__ deta, const T* __restrict__ eta,
    const T* __restrict__ dx, const T* __restrict__ x,
    const T* __restrict__ beta, const T epsi, const int n) {
    int tj = blockIdx.x * blockDim.x + threadIdx.x;
    if (tj < n) {
        deta[tj] = -eta[tj] + (epsi + dx[tj] * eta[tj]) / (beta[tj] - x[tj]);
    }
}

template <typename T>
__global__ void RexCalculation_kernel(
    T* __restrict__ rex,
    const T* __restrict__ x,
    const T* __restrict__ xlow,
    const T* __restrict__ xupp,
    const T* __restrict__ pij,
    const T* __restrict__ p0j,
    const T* __restrict__ qij,
    const T* __restrict__ q0j,
    const T* __restrict__ lambda,
    const T* __restrict__ xsi,
    const T* __restrict__ eta,
    const int n,
    const int m)
{
    int tj = blockIdx.x * blockDim.x + threadIdx.x;
    if (tj < n) {
        T upp_diff = xupp[tj] - x[tj];
        T low_diff = x[tj] - xlow[tj];
        T upp_diff_sq = upp_diff * upp_diff;
        T low_diff_sq = low_diff * low_diff;

        T sum = T(0);
        for (int i = 0; i < m; ++i) {
            sum += pij[i + tj * m] * lambda[i] / upp_diff_sq
                 - qij[i + tj * m] * lambda[i] / low_diff_sq;
        }

        rex[tj] = sum
            + p0j[tj] / upp_diff_sq
            - q0j[tj] / low_diff_sq
            - xsi[tj] + eta[tj];
    }
}

template <typename T>
__global__ void rey_calculation_kernel(T* __restrict__ rey,
    const T* __restrict__ c, const T* __restrict__ d, const T* __restrict__ y,
    const T* __restrict__ lambda, const T* __restrict__ mu, const int n) {
    int tj = blockIdx.x * blockDim.x + threadIdx.x;
    if (tj < n) {
        rey[tj] = c[tj] + d[tj] * y[tj] - lambda[tj] - mu[tj];
    }
}

template <typename T>
__global__ void norm_kernel(const T* __restrict__ a,
    T* __restrict__ buf_h, const int n) {
    const int idx = blockIdx.x * blockDim.x + threadIdx.x;
    const int str = blockDim.x * gridDim.x;

    const unsigned int lane = threadIdx.x % warpSize;
    const unsigned int wid = threadIdx.x / warpSize;

    __shared__ T shared[32];
    T sum = T(0);

    for (int i = idx; i < n; i += str) {
        T val = a[i];
        sum += val * val; // faster than pow(val, 2)
    }

    sum = reduce_warp<T>(sum); // warp-level sum reduction
    if (lane == 0)
        shared[wid] = sum;
    __syncthreads();

    sum = (threadIdx.x < blockDim.x / warpSize) ? shared[lane] : T(0);
    if (wid == 0)
        sum = reduce_warp<T>(sum);

    if (threadIdx.x == 0)
        buf_h[blockIdx.x] = sum;
}

template <typename T>
__global__ void sub2cons_kernel(T* __restrict__ a, const T* __restrict__ b,
    const T* __restrict__ c,
    const T d, const int n) {
    int tj = blockIdx.x * blockDim.x + threadIdx.x;
    if (tj < n) {
        a[tj] = b[tj] * c[tj] - d;
    }
}

template <typename T>
__global__ void dely_kernel(T* __restrict__ dely, const T* __restrict__ c,
    const T* __restrict__ d, const T* __restrict__ y,
    const T* __restrict__ lambda, const T epsi, const int n) {
    int tj = blockIdx.x * blockDim.x + threadIdx.x;
    if (tj < n) {
        dely[tj] = c[tj] + d[tj] * y[tj] - lambda[tj] - epsi / y[tj];
    }
}

template <typename T>
__global__ void maxval2_kernel(const T* __restrict__ a, const T* __restrict__ b,
    T* __restrict__ temp, const T cons, const int n) {

    const int idx = blockIdx.x * blockDim.x + threadIdx.x;
    const int str = blockDim.x * gridDim.x;

    const unsigned int lane = threadIdx.x % warpSize;
    const unsigned int wid = threadIdx.x / warpSize;

    __shared__ T shared[32];
    T maxval = cons * a[0] / b[0];
    for (int i = idx; i < n; i += str) {
        maxval = max(maxval, cons * a[i] / b[i]);
    }

    maxval = max_reduce_warp<T>(maxval);
    if (lane == 0)
        shared[wid] = maxval;
    __syncthreads();

    maxval = (threadIdx.x < blockDim.x / warpSize) ? shared[lane] : T(0);
    if (wid == 0)
        maxval = max_reduce_warp<T>(maxval);

    if (threadIdx.x == 0)
        temp[blockIdx.x] = maxval;
}

template <typename T>
__global__ void maxval3_kernel(const T* __restrict__ a, const T* __restrict__ b,
    const T* __restrict__ c, T* __restrict__ temp, const T cons, const int n) {

    const int idx = blockIdx.x * blockDim.x + threadIdx.x;
    const int str = blockDim.x * gridDim.x;

    const unsigned int lane = threadIdx.x % warpSize;
    const unsigned int wid = threadIdx.x / warpSize;

    __shared__ T shared[32];
    T maxval = cons * a[0] / b[0];
    for (int i = idx; i < n; i += str) {
        maxval = max(maxval, cons * a[i] / (b[i] - c[i]));
    }

    maxval = max_reduce_warp<T>(maxval);
    if (lane == 0)
        shared[wid] = maxval;
    __syncthreads();

    maxval = (threadIdx.x < blockDim.x / warpSize) ? shared[lane] : T(0);
    if (wid == 0)
        maxval = max_reduce_warp<T>(maxval);

    if (threadIdx.x == 0)
        temp[blockIdx.x] = maxval;
}

template <typename T>
__global__ void kkt_rex_kernel(T* __restrict__ rex, const T* __restrict__ df0dx,
    const T* __restrict__ dfdx, const T* __restrict__ xsi,
    const T* __restrict__ eta, const T* __restrict__ lambda, const int n,
    const int m) {
    int tj = blockIdx.x * blockDim.x + threadIdx.x;
    if (tj < n) {
        rex[tj] = 0.0;
        for (int i = 0; i < m; i++) {
            rex[tj] = rex[tj] + dfdx[i + tj * m] * lambda[i];
        }
        rex[tj] += df0dx[tj] - xsi[tj] + eta[tj];
    }
}

template <typename T>
__global__ void maxcons_kernel(T* __restrict__ a, const T b,
    const T c, const T* __restrict__ d, const int n) {
    int tj = blockIdx.x * blockDim.x + threadIdx.x;
    if (tj < n) {
        a[tj] = max(b, c * d[tj]);
    }
}

template <typename T>
__global__ void glsum_kernel(const T* a, T* buf_h, const int n) {
    const int idx = blockIdx.x * blockDim.x + threadIdx.x;
    const int str = blockDim.x * gridDim.x;

    const unsigned int lane = threadIdx.x % warpSize;
    const unsigned int wid = threadIdx.x / warpSize;

    __shared__ T shared[32];
    T sum = 0;
    for (int i = idx; i < n; i += str) {
        sum += a[i];
    }

    sum = reduce_warp<T>(sum);
    if (lane == 0)
        shared[wid] = sum;
    __syncthreads();

    sum = (threadIdx.x < blockDim.x / warpSize) ? shared[lane] : 0;
    if (wid == 0)
        sum = reduce_warp<T>(sum);

    if (threadIdx.x == 0)
        buf_h[blockIdx.x] = sum;
}

template <typename T>
__global__ void glsc2_kernel(const T* a,
    const T* b,
    T* buf_h,
    const int n) {

    const int idx = blockIdx.x * blockDim.x + threadIdx.x;
    const int str = blockDim.x * gridDim.x;

    const unsigned int lane = threadIdx.x % warpSize;
    const unsigned int wid = threadIdx.x / warpSize;

    __shared__ T shared[32];
    T sum = 0.0;
    for (int i = idx; i < n; i += str) {
        sum += a[i] * b[i];
    }

    sum = reduce_warp<T>(sum);
    if (lane == 0)
        shared[wid] = sum;
    __syncthreads();

    sum = (threadIdx.x < blockDim.x / warpSize) ? shared[lane] : 0;
    if (wid == 0)
        sum = reduce_warp<T>(sum);

    if (threadIdx.x == 0)
        buf_h[blockIdx.x] = sum;
}
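
// Illustrative host-side finish for the per-block partial dot products written
// by glsc2_kernel (a sketch, not part of the original header; the wrapper name,
// buffer names and 256-thread block size are assumptions).
template <typename T>
inline T glsc2_host_finish_example(const T* d_a, const T* d_b, T* d_buf,
                                   T* h_buf, const int n, const int blocks) {
    const int threads = 256;
    glsc2_kernel<T><<<blocks, threads>>>(d_a, d_b, d_buf, n);
    cudaMemcpy(h_buf, d_buf, blocks * sizeof(T), cudaMemcpyDeviceToHost);
    T sum = T(0);
    for (int i = 0; i < blocks; ++i) sum += h_buf[i];
    return sum;
}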

template <typename T>
__global__ void add2inv2_kernel(T* __restrict__ a, const T* __restrict__ b,
    const T c, const int n) {
    int tj = blockIdx.x * blockDim.x + threadIdx.x;
    if (tj < n) {
        a[tj] = a[tj] + c / b[tj];
    }
}

template <typename T>
__global__ void max2_kernel(T* __restrict__ a, const T b,
    const T* __restrict__ c, const T d, const int n) {
    int tj = blockIdx.x * blockDim.x + threadIdx.x;
    if (tj < n) {
        a[tj] = max(b, d * c[tj]);
    }
}

template <typename T>
__global__ void updatebb_kernel(T* __restrict__ bb,
    const T* __restrict__ dellambda, const T* __restrict__ dely,
    const T* __restrict__ d, const T* __restrict__ mu,
    const T* __restrict__ y, const T delz, const int m) {
    int tj = blockIdx.x * blockDim.x + threadIdx.x;
    if (tj < m)
        bb[tj] = dellambda[tj] + dely[tj] / (d[tj] + mu[tj] / y[tj]) - bb[tj];
    else if (tj < m + 1)
        bb[tj] = delz;
}

template <typename T>
__global__ void updateAA_kernel(T* __restrict__ AA,
    const T* __restrict__ globaltmp_mm, const T* __restrict__ s,
    const T* __restrict__ lambda, const T* __restrict__ d,
    const T* __restrict__ mu, const T* __restrict__ y, const T* __restrict__ a,
    const T zeta, const T z, const int m) {
    int tj = blockIdx.x * blockDim.x + threadIdx.x;
    if (tj < m) {
        AA[tj + tj * (m + 1)] = globaltmp_mm[tj + tj * m] + (s[tj] / lambda[tj] +
            1.0 / (d[tj] + mu[tj] / y[tj]));
        AA[tj + m * (m + 1)] = a[tj];
        AA[m + tj * (m + 1)] = a[tj];
    }
    else if (tj < m + 1)
        AA[tj + tj * (m + 1)] = -zeta / z;
}

template <typename T>
__global__ void dy_kernel(T* __restrict__ dy, const T* __restrict__ dely,
    const T* __restrict__ dlambda, const T* __restrict__ d,
    const T* __restrict__ mu, const T* __restrict__ y, const int n) {
    int tj = blockIdx.x * blockDim.x + threadIdx.x;
    if (tj < n)
        dy[tj] = (-dely[tj] + dlambda[tj]) / (d[tj] + mu[tj] / y[tj]);
}

#endif