Neko-TOP
A portable framework for high-order spectral element flow topology optimization.
mma_kernel.h
/*
 Copyright (c) 2021-2025, The Neko Authors
 All rights reserved.

 Redistribution and use in source and binary forms, with or without
 modification, are permitted provided that the following conditions
 are met:

   * Redistributions of source code must retain the above copyright
     notice, this list of conditions and the following disclaimer.

   * Redistributions in binary form must reproduce the above
     copyright notice, this list of conditions and the following
     disclaimer in the documentation and/or other materials provided
     with the distribution.

   * Neither the name of the authors nor the names of its
     contributors may be used to endorse or promote products derived
     from this software without specific prior written permission.

 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
 COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
 ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 POSSIBILITY OF SUCH DAMAGE.
*/

#ifndef MMA_HIP_KERNEL_H
#define MMA_HIP_KERNEL_H

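/*
 * Device kernels for the MMA optimizer (presumably Svanberg's Method of
 * Moving Asymptotes) acting on n design variables and m constraints.
 * All kernels use a 1-D thread grid. A typical launch is sketched below;
 * the block size, stream handling and T = double are illustrative
 * assumptions, not taken from this file:
 *
 *   const dim3 nthrds(1024, 1, 1);
 *   const dim3 nblcks((n + 1024 - 1) / 1024, 1, 1);
 *   mma_sub1_kernel<double><<<nblcks, nthrds, 0, stream>>>
 *     (xlow_d, xupp_d, x_d, xmin_d, xmax_d, asyinit, n);
 */

/* Negative inverse of the diagonal second derivative of the MMA
   approximation with respect to x; entries for variables sitting exactly
   on their move limits (x == alpha or x == beta) are zeroed out. */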
template <typename T>
__global__ void mma_Ljjxinv_kernel(T* __restrict__ Ljjxinv,
    const T* __restrict__ pjlambda, const T* __restrict__ qjlambda,
    const T* __restrict__ x, const T* __restrict__ low, const T* __restrict__ upp,
    const T* __restrict__ alpha, const T* __restrict__ beta,
    const int n) {
  int tj = blockIdx.x * blockDim.x + threadIdx.x;
  if (tj < n) {
    const T xt = x[tj];
    T val = -1.0 / (2.0 * pjlambda[tj] / pow(upp[tj] - xt, 3) +
        2.0 * qjlambda[tj] / pow(xt - low[tj], 3));
    // Remove the sensitivity for the active primal constraints
    bool is_alpha = xt == alpha[tj];
    bool is_beta = xt == beta[tj];
    Ljjxinv[tj] = (is_alpha || is_beta) ? T(0.0) : val;
  }
}


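/* Closed-form primal update of the dual subproblem: for each design variable,
   x_j = (sqrt(pjlambda_j) * low_j + sqrt(qjlambda_j) * upp_j) /
         (sqrt(pjlambda_j) + sqrt(qjlambda_j)),
   clamped to the box [alpha_j, beta_j]. */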
template <typename T>
__global__ void mma_dipsolvesub1_kernel(T* __restrict__ x,
    const T* __restrict__ pjlambda, const T* __restrict__ qjlambda,
    const T* __restrict__ low, const T* __restrict__ upp,
    const T* __restrict__ alpha, const T* __restrict__ beta,
    const int n) {
  int tj = blockIdx.x * blockDim.x + threadIdx.x;
  if (tj < n) {
    T pj = sqrt(pjlambda[tj]);
    T qj = sqrt(qjlambda[tj]);
    T denom = pj + qj;
    T val = (pj * low[tj] + qj * upp[tj]) / denom;

    // Clamp x between alpha and beta using branchless min/max
    x[tj] = fmax(fmin(val, beta[tj]), alpha[tj]);
  }
}

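/* Transposed matrix-vector product: output_j = sum_i pij(i,j) * lambda_i,
   with pij stored as an m x n array whose leading (fastest) dimension is
   the constraint index i. */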
template <typename T>
__global__ void mattrans_v_mul_kernel(T* __restrict__ output,
    const T* __restrict__ pij, const T* __restrict__ lambda,
    const int m, const int n) {
  int tj = blockIdx.x * blockDim.x + threadIdx.x;
  if (tj < n) {
    output[tj] = 0.0;
    for (int i = 0; i < m; i++) {
      // output[tj] = output[tj] + pij[tj + i * n] * lambda[i];
      output[tj] = output[tj] + pij[i + tj * m] * lambda[i];
    }
  }
}

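/* Initial asymptotes: low_j and upp_j are placed symmetrically around x_j at
   a distance asyinit * (xmax_j - xmin_j). */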
template <typename T>
__global__ void mma_sub1_kernel(T* __restrict__ xlow, T* __restrict__ xupp,
    const T* __restrict__ x, const T* __restrict__ xmin,
    const T* __restrict__ xmax, const T asyinit, const int n) {
  int tj = blockIdx.x * blockDim.x + threadIdx.x;
  if (tj < n) {
    T xgap = xmax[tj] - xmin[tj];
    xlow[tj] = x[tj] - asyinit * xgap;
    xupp[tj] = x[tj] + asyinit * xgap;
  }
}


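/* Asymptote update for subsequent iterations: the sign of
   (x - xold1) * (xold1 - xold2) detects oscillation. The previous asymptote
   distances are scaled by asydecr when the variable oscillates and by
   asyincr when it moves monotonically, and the new asymptotes are then
   clamped to lie between 0.01 * xdiff and 10 * xdiff away from x
   (xdiff is presumably xmax - xmin). */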
template< typename T >
__global__ void mma_sub2_kernel(T* __restrict__ low, T* __restrict__ upp,
    const T* __restrict__ x, const T* __restrict__ xold1,
    const T* __restrict__ xold2, const T* __restrict__ xdiff,
    const T asydecr, const T asyincr, const int n) {
  int tj = blockIdx.x * blockDim.x + threadIdx.x;
  if (tj >= n) return;

  // Load repeatedly used values into registers to avoid redundant
  // global-memory accesses
  const T xval = x[tj];
  const T xold1val = xold1[tj];
  const T xold2val = xold2[tj];
  const T lowval = low[tj];
  const T uppval = upp[tj];
  const T xdiffval = xdiff[tj];

  // Product of the last two design updates; its sign indicates oscillation
  const T prod = (xval - xold1val) * (xold1val - xold2val);

  // Select the asymptote factor without branching
  T asy_factor = (prod < T(0)) ? asydecr :
                 (prod > T(0)) ? asyincr : T(1);

  // Update low and upp using fused multiply-add (fma)
  T new_low = fma(-asy_factor, (xold1val - lowval), xval);
  T new_upp = fma(asy_factor, (uppval - xold1val), xval);

  // Apply bounds
  new_low = max(new_low, xval - T(10.0) * xdiffval);
  new_low = min(new_low, xval - T(0.01) * xdiffval);

  new_upp = min(new_upp, xval + T(10.0) * xdiffval);
  new_upp = max(new_upp, xval + T(0.01) * xdiffval);

  // Write results back
  low[tj] = new_low;
  upp[tj] = new_upp;
}

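/* Subproblem setup: computes the move limits alpha_j, beta_j and the MMA
   approximation coefficients p0j, q0j (objective) and pij, qij (constraints)
   from the gradients df0dx and dfdx, weighting the positive and negative
   gradient parts by 1.001 and 0.001 and adding a small regularisation term
   1e-5 / max(1e-5, xmax - xmin). */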
template< typename T >
__global__ void mma_sub3_kernel(const T* __restrict__ x,
    const T* __restrict__ df0dx, const T* __restrict__ dfdx,
    T* __restrict__ low, T* __restrict__ upp, const T* __restrict__ xmin,
    const T* __restrict__ xmax, T* __restrict__ alpha, T* __restrict__ beta,
    T* __restrict__ p0j, T* __restrict__ q0j, T* __restrict__ pij,
    T* __restrict__ qij, const int n, const int m) {
  int tj = blockIdx.x * blockDim.x + threadIdx.x;
  if (tj < n) {
    T xgap = xmax[tj] - xmin[tj];
    alpha[tj] = max(max(xmin[tj], low[tj] +
        0.1 * (x[tj] - low[tj])), x[tj] - 0.5 * xgap);
    beta[tj] = min(min(xmax[tj], upp[tj] - 0.1 * (upp[tj] - x[tj])), x[tj] +
        0.5 * xgap);

    p0j[tj] = pow(upp[tj] - x[tj], 2) * (1.001 * max(df0dx[tj], 0.0) +
        0.001 * max(-df0dx[tj], 0.0) + 0.00001 / max(0.00001, xgap));
    q0j[tj] = pow(x[tj] - low[tj], 2) * (0.001 * max(df0dx[tj], 0.0) +
        1.001 * max(-df0dx[tj], 0.0) + 0.00001 / max(0.00001, xgap));

    for (int i = 0; i < m; i++) {
      pij[i + tj*m] = pow(upp[tj] - x[tj], 2) *
          (1.001 * max(dfdx[i + tj*m], 0.0) + 0.001 *
          max(-dfdx[i + tj*m], 0.0) + 0.00001 / max(0.00001, xgap));
      qij[i + tj*m] = pow(x[tj] - low[tj], 2) *
          (0.001 * max(dfdx[i + tj*m], 0.0) + 1.001 *
          max(-dfdx[i + tj*m], 0.0) + 0.00001 / max(0.00001, xgap));
    }
  }
}


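/* Per-variable terms pij/(upp - x) + qij/(x - low) stored in temp; these are
   presumably reduced over the design variables afterwards (e.g. with
   mmasum_kernel) to evaluate the approximated constraint functions. */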
template< typename T >
__global__ void mma_sub4_kernel(const T* __restrict__ x, T* __restrict__ low,
    T* __restrict__ upp, T* __restrict__ pij, T* __restrict__ qij,
    T* __restrict__ temp, const int n, const int m) {
  int tj = blockIdx.x * blockDim.x + threadIdx.x;
  if (tj < n) {
    for (int i = 0; i < m; i++) {
      temp[i + tj*m] = pij[i + tj*m] / (upp[tj] - x[tj]) +
          qij[i + tj*m] / (x[tj] - low[tj]);
    }
  }
}


template <typename T>
__global__ void mma_max2_kernel(T* __restrict__ xsi, const T* __restrict__ x,
    T* __restrict__ alpha, const int n) {
  int tj = blockIdx.x * blockDim.x + threadIdx.x;
  if (tj < n) {
    xsi[tj] = max(1.0, 1.0 / (x[tj] - alpha[tj]));
  }
}


template <typename T>
__global__ void relambda_kernel(T* __restrict__ temp, const T* __restrict__ x,
    const T* __restrict__ xupp, const T* __restrict__ xlow,
    const T* __restrict__ pij, const T* __restrict__ qij, const int n,
    const int m) {
  int tj = blockIdx.x * blockDim.x + threadIdx.x;
  if (tj < n) {
    for (int i = 0; i < m; i++) {
      temp[i + tj*m] = pij[i + tj*m] / (xupp[tj] - x[tj]) +
          qij[i + tj*m] / (x[tj] - xlow[tj]);
    }
  }
}


template <typename T>
__global__ void sub2cons2_kernel(T* __restrict__ a, const T* __restrict__ b,
    const T* __restrict__ c, const T* __restrict__ d,
    const T e, const int n) {
  int tj = blockIdx.x * blockDim.x + threadIdx.x;
  if (tj < n) {
    a[tj] = b[tj] * (c[tj] - d[tj]) - e;
  }
}

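/* Warp-level max reduction using __shfl_down with a fixed width of 32; after
   the five shuffle steps, lane 0 of the warp holds the warp-wide maximum. */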
template< typename T>
__inline__ __device__ T max_reduce_warp(T val) {
  val = max(val, __shfl_down(val, 16, 32));
  val = max(val, __shfl_down(val, 8, 32));
  val = max(val, __shfl_down(val, 4, 32));
  val = max(val, __shfl_down(val, 2, 32));
  val = max(val, __shfl_down(val, 1, 32));
  return val;
}


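/* Block-wise maximum of |a[i]| using the usual two-stage pattern: a warp
   reduction into shared memory followed by a reduction of the per-warp
   results by the first warp. Each block writes its partial maximum to
   temp[blockIdx.x]; the grid-level result is presumably obtained by
   follow-up passes of max_reduce_kernel. */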
template< typename T >
__global__ void maxval_kernel(const T* __restrict__ a, T *temp, const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  const unsigned int lane = threadIdx.x % warpSize;
  const unsigned int wid = threadIdx.x / warpSize;

  __shared__ T shared[32];
  T maxval = 0.0;
  for (int i = idx; i < n; i += str) {
    maxval = max(maxval, abs(a[i]));
  }

  maxval = max_reduce_warp<T>(maxval);
  if (lane == 0)
    shared[wid] = maxval;
  __syncthreads();

  maxval = (threadIdx.x < blockDim.x / warpSize) ? shared[lane] : 0.0;
  if (wid == 0)
    maxval = max_reduce_warp<T>(maxval);

  if (threadIdx.x == 0)
    temp[blockIdx.x] = maxval;

}

template <typename T>
__global__ void max_reduce_kernel(T* __restrict__ bufred, const int n) {

  T maxval = 0.0;
  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;
  for (int i = idx; i < n; i += str) {
    maxval = max(maxval, bufred[i]);
  }

  __shared__ T shared[32];
  unsigned int lane = threadIdx.x % warpSize;
  unsigned int wid = threadIdx.x / warpSize;

  maxval = max_reduce_warp<T>(maxval);
  if (lane == 0)
    shared[wid] = maxval;
  __syncthreads();

  maxval = (threadIdx.x < blockDim.x / warpSize) ? shared[lane] : 0.0;
  if (wid == 0)
    maxval = max_reduce_warp<T>(maxval);

  if (threadIdx.x == 0)
    bufred[blockIdx.x] = maxval;
}

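/* Residual/right-hand-side term in x for the interior-point subproblem:
   the lambda-weighted constraint terms plus the objective terms
   p0j/(xupp - x)^2 - q0j/(x - xlow)^2 and the barrier contributions
   -epsi/(x - alpha) + epsi/(beta - x). */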
template <typename T>
__global__ void delx_kernel(T* __restrict__ delx, const T* __restrict__ x,
    const T* __restrict__ xlow, const T* __restrict__ xupp,
    const T* __restrict__ pij, const T* __restrict__ qij,
    const T* __restrict__ p0j, const T* __restrict__ q0j,
    const T* __restrict__ alpha, const T* __restrict__ beta,
    const T* __restrict__ lambda, const T epsi, const int n, const int m) {
  int tj = blockIdx.x * blockDim.x + threadIdx.x;
  if (tj < n) {
    delx[tj] = 0;
    for (int i = 0; i < m; i++) {
      delx[tj] = delx[tj] + pij[i + tj * m] *
          lambda[i] / pow(xupp[tj] - x[tj], 2) -
          qij[i + tj * m] * lambda[i] / pow(x[tj] - xlow[tj], 2);
    }
    delx[tj] = delx[tj] + p0j[tj] / pow(xupp[tj] - x[tj], 2) -
        q0j[tj] / pow(x[tj] - xlow[tj], 2) - epsi / (x[tj] - alpha[tj])
        + epsi / (beta[tj] - x[tj]);
  }
}


template <typename T>
__global__ void GG_kernel(T* __restrict__ GG, const T* __restrict__ x,
    const T* __restrict__ xlow, const T* __restrict__ xupp,
    const T* __restrict__ pij, const T* __restrict__ qij, const int n,
    const int m) {
  int tj = blockIdx.x * blockDim.x + threadIdx.x;
  if (tj < n) {
    for (int ggdumiter = 0; ggdumiter < m; ggdumiter++) {
      GG[ggdumiter + m*tj] = pij[ggdumiter + m*tj] / pow(xupp[tj] - x[tj], 2) -
          qij[ggdumiter + m*tj] / pow(x[tj] - xlow[tj], 2);
    }
  }
}

template <typename T>
__global__ void diagx_kernel(T* __restrict__ diagx, const T* __restrict__ x,
    const T* __restrict__ xsi, const T* __restrict__ xlow,
    const T* __restrict__ xupp, const T* __restrict__ p0j,
    const T* __restrict__ q0j, const T* __restrict__ pij,
    const T* __restrict__ qij, const T* alpha, const T* beta,
    const T* eta, const T* lambda, const int n, const int m) {
  int tj = blockIdx.x * blockDim.x + threadIdx.x;
  if (tj < n) {
    T sum = 0;
    T sum1 = 0;
    for (int i = 0; i < m; i++) {
      sum = sum + pij[tj*m + i] * lambda[i];
      sum1 = sum1 + qij[tj*m + i] * lambda[i];
    }
    diagx[tj] = (p0j[tj] + sum) / pow(xupp[tj] - x[tj], 3) +
        (q0j[tj] + sum1) / pow(x[tj] - xlow[tj], 3);
    diagx[tj] = 2.0 * diagx[tj] + xsi[tj] / (x[tj] - alpha[tj]) +
        eta[tj] / (beta[tj] - x[tj]);
  }
}


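/* Warp-level sum reduction using __shfl_down with a fixed width of 32; after
   the five shuffle steps, lane 0 of the warp holds the warp-wide sum. */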
template< typename T>
__inline__ __device__ T reduce_warp(T val) {
  val += __shfl_down(val, 16, 32);
  val += __shfl_down(val, 8, 32);
  val += __shfl_down(val, 4, 32);
  val += __shfl_down(val, 2, 32);
  val += __shfl_down(val, 1, 32);
  return val;
}

template <typename T>
__global__ void mmareduce_kernel(T* __restrict__ bufred, const int n) {

  T sum = 0;
  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;
  for (int i = idx; i < n; i += str) {
    sum += bufred[i];
  }

  __shared__ T shared[32];
  unsigned int lane = threadIdx.x % warpSize;
  unsigned int wid = threadIdx.x / warpSize;

  sum = reduce_warp<T>(sum);
  if (lane == 0)
    shared[wid] = sum;
  __syncthreads();

  sum = (threadIdx.x < blockDim.x / warpSize) ? shared[lane] : 0;
  if (wid == 0)
    sum = reduce_warp<T>(sum);

  if (threadIdx.x == 0)
    bufred[blockIdx.x] = sum;
}


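/* Block-wise sum over i of the strided entries a[m*i + k] for a fixed index
   k; each block writes its partial sum to buf_h[blockIdx.x], presumably to
   be reduced further by mmareduce_kernel. */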
template< typename T >
__global__ void mmasum_kernel(const T* __restrict__ a, T* __restrict__ buf_h,
    const int n, const int m, const int k) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  const unsigned int lane = threadIdx.x % warpSize;
  const unsigned int wid = threadIdx.x / warpSize;

  __shared__ T shared[32];
  T sum = 0;
  for (int i = idx; i < n; i += str) {
    sum += a[m * i + k];
  }

  sum = reduce_warp<T>(sum);
  if (lane == 0)
    shared[wid] = sum;
  __syncthreads();

  sum = (threadIdx.x < blockDim.x / warpSize) ? shared[lane] : 0;
  if (wid == 0)
    sum = reduce_warp<T>(sum);

  if (threadIdx.x == 0)
    buf_h[blockIdx.x] = sum;

}

template< typename T >
__global__ void mmasumbb_kernel(const T* __restrict__ GG,
    const T* __restrict__ delx, const T* __restrict__ diagx,
    T* __restrict__ buf_h, const int n, const int m, const int k) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  const unsigned int lane = threadIdx.x % warpSize;
  const unsigned int wid = threadIdx.x / warpSize;

  __shared__ T shared[32];
  T sum = 0;
  for (int i = idx; i < n; i += str) {
    sum += GG[k + i * m] * delx[i] / diagx[i];
  }

  sum = reduce_warp<T>(sum);
  if (lane == 0)
    shared[wid] = sum;
  __syncthreads();

  sum = (threadIdx.x < blockDim.x / warpSize) ? shared[lane] : 0;
  if (wid == 0)
    sum = reduce_warp<T>(sum);

  if (threadIdx.x == 0)
    buf_h[blockIdx.x] = sum;

}

template< typename T >
__global__ void mmasumHess_kernel(const T* __restrict__ hijx,
    const T* __restrict__ Ljjxinv, T* __restrict__ buf_h, const int n,
    const int m, const int k0, const int k1) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  const unsigned int lane = threadIdx.x % warpSize;
  const unsigned int wid = threadIdx.x / warpSize;
  // this is similar to mmasumAA_kernel but with Ljjxinv_d = 1/diagx
  __shared__ T shared[32];
  T sum = 0;
  for (int i = idx; i < n; i += str) {
    sum += hijx[k0 + i * m] * Ljjxinv[i] * hijx[k1 + i * m];
  }

  sum = reduce_warp<T>(sum);
  if (lane == 0)
    shared[wid] = sum;
  __syncthreads();

  sum = (threadIdx.x < blockDim.x / warpSize) ? shared[lane] : 0;
  if (wid == 0)
    sum = reduce_warp<T>(sum);

  if (threadIdx.x == 0)
    buf_h[blockIdx.x] = sum;

}

template< typename T >
__global__ void mmasumAA_kernel(const T* __restrict__ GG,
    const T* __restrict__ diagx, T* __restrict__ buf_h, const int n,
    const int m, const int k0, const int k1) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  const unsigned int lane = threadIdx.x % warpSize;
  const unsigned int wid = threadIdx.x / warpSize;

  __shared__ T shared[32];
  T sum = 0;
  for (int i = idx; i < n; i += str) {
    sum += GG[k0 + i * m] / diagx[i] * GG[k1 + i * m];
  }

  sum = reduce_warp<T>(sum);
  if (lane == 0)
    shared[wid] = sum;
  __syncthreads();

  sum = (threadIdx.x < blockDim.x / warpSize) ? shared[lane] : 0;
  if (wid == 0)
    sum = reduce_warp<T>(sum);

  if (threadIdx.x == 0)
    buf_h[blockIdx.x] = sum;

}


template <typename T>
__global__ void mma_copy_kernel(T* __restrict__ a, const T* __restrict__ b,
    const int n, const int m) {
  int tj = blockIdx.x * blockDim.x + threadIdx.x;
  if (tj < n)
    a[tj + m] = b[tj];
}


template <typename T>
__global__ void AA_kernel(T* __restrict__ temp, const T* __restrict__ GG,
    const T* __restrict__ diagx, const int n, const int m) {
  int tj = blockIdx.x * blockDim.x + threadIdx.x;
  if (tj < n) {
    for (int i0 = 0; i0 < m; i0++) {
      for (int i1 = 0; i1 < m; i1++) {
        temp[tj + i0 * (n + 1) + i1 * (m + 1) * (n + 1)] = GG[i0 * n + tj] *
            (1.0 / diagx[tj]) * GG[i1 * n + tj];
      }
    }
  }
}


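/* Back-substitution for the x-step of the Newton system:
   dx_j = -(delx_j + sum_i GG(i,j) * dlambda_i) / diagx_j,
   with GG stored with the constraint index as the leading dimension. */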
template <typename T>
__global__ void dx_kernel(T* __restrict__ dx, const T* __restrict__ delx,
    const T* __restrict__ diagx, const T* __restrict__ GG,
    const T* __restrict__ dlambda, const int n, const int m) {
  int tj = blockIdx.x * blockDim.x + threadIdx.x;
  if (tj < n) {
    dx[tj] = -delx[tj] / diagx[tj];
    for (int i = 0; i < m; i++) {
      dx[tj] = dx[tj] - GG[tj*m + i] * dlambda[i] / diagx[tj];
    }
  }
}


template <typename T>
__global__ void dxsi_kernel(T* __restrict__ dxsi, const T* __restrict__ xsi,
    const T* __restrict__ dx, const T* __restrict__ x,
    const T* __restrict__ alpha, const T epsi, const int n) {
  int tj = blockIdx.x * blockDim.x + threadIdx.x;
  if (tj < n) {
    dxsi[tj] = -xsi[tj] + (epsi - dx[tj] * xsi[tj]) / (x[tj] - alpha[tj]);
  }
}


template <typename T>
__global__ void deta_kernel(T* __restrict__ deta, const T* __restrict__ eta,
    const T* __restrict__ dx, const T* __restrict__ x,
    const T* __restrict__ beta, const T epsi, const int n) {
  int tj = blockIdx.x * blockDim.x + threadIdx.x;
  if (tj < n) {
    deta[tj] = -eta[tj] + (epsi + dx[tj] * eta[tj]) / (beta[tj] - x[tj]);
  }
}


template <typename T>
__global__ void RexCalculation_kernel(T* __restrict__ rex,
    const T* __restrict__ x, const T* __restrict__ xlow,
    const T* __restrict__ xupp, const T* __restrict__ pij,
    const T* __restrict__ p0j, const T* __restrict__ qij,
    const T* __restrict__ q0j, const T* __restrict__ lambda,
    const T* __restrict__ xsi, const T* __restrict__ eta, const int n,
    const int m) {
  int tj = blockIdx.x * blockDim.x + threadIdx.x;
  if (tj < n) {
    rex[tj] = 0.0;
    for (int i = 0; i < m; i++) {
      rex[tj] = rex[tj] + pij[i + tj*m] * lambda[i] / pow(xupp[tj] - x[tj], 2) -
          qij[i + tj*m] * lambda[i] / pow(x[tj] - xlow[tj], 2);
    }
    rex[tj] = rex[tj] + p0j[tj] / pow(xupp[tj] - x[tj], 2) -
        q0j[tj] / pow(x[tj] - xlow[tj], 2) - xsi[tj] + eta[tj];
  }
}

template <typename T>
__global__ void rey_calculation_kernel(T* __restrict__ rey,
    const T* __restrict__ c, const T* __restrict__ d, const T* __restrict__ y,
    const T* __restrict__ lambda, const T* __restrict__ mu, const int n) {
  int tj = blockIdx.x * blockDim.x + threadIdx.x;
  if (tj < n) {
    rey[tj] = c[tj] + d[tj] * y[tj] - lambda[tj] - mu[tj];
  }
}


template< typename T >
__global__ void norm_kernel(const T* __restrict__ a, T* __restrict__ buf_h,
    const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  const unsigned int lane = threadIdx.x % warpSize;
  const unsigned int wid = threadIdx.x / warpSize;

  __shared__ T shared[32];
  T sum = 0;
  for (int i = idx; i < n; i += str) {
    sum += pow(a[i], 2);
  }

  sum = reduce_warp<T>(sum);
  if (lane == 0)
    shared[wid] = sum;
  __syncthreads();

  sum = (threadIdx.x < blockDim.x / warpSize) ? shared[lane] : 0;
  if (wid == 0)
    sum = reduce_warp<T>(sum);

  if (threadIdx.x == 0)
    buf_h[blockIdx.x] = sum;

}


template <typename T>
__global__ void sub2cons_kernel(T* __restrict__ a, const T* __restrict__ b,
    const T* __restrict__ c, const T d, const int n) {
  int tj = blockIdx.x * blockDim.x + threadIdx.x;
  if (tj < n) {
    a[tj] = b[tj] * c[tj] - d;
  }
}


template <typename T>
__global__ void dely_kernel(T* __restrict__ dely, const T* __restrict__ c,
    const T* __restrict__ d, const T* __restrict__ y,
    const T* __restrict__ lambda, const T epsi, const int n) {
  int tj = blockIdx.x * blockDim.x + threadIdx.x;
  if (tj < n) {
    dely[tj] = c[tj] + d[tj] * y[tj] - lambda[tj] - epsi / y[tj];
  }
}


template< typename T >
__global__ void maxval2_kernel(const T* __restrict__ a, const T* __restrict__ b,
    T* __restrict__ temp, const T cons, const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  const unsigned int lane = threadIdx.x % warpSize;
  const unsigned int wid = threadIdx.x / warpSize;

  __shared__ T shared[32];
  T maxval = cons * a[0] / b[0];
  for (int i = idx; i < n; i += str) {
    maxval = max(maxval, cons * a[i] / b[i]);
  }

  maxval = max_reduce_warp<T>(maxval);
  if (lane == 0)
    shared[wid] = maxval;
  __syncthreads();

  maxval = (threadIdx.x < blockDim.x / warpSize) ? shared[lane] : 0.0;
  if (wid == 0)
    maxval = max_reduce_warp<T>(maxval);

  if (threadIdx.x == 0)
    temp[blockIdx.x] = maxval;

}


template< typename T >
__global__ void maxval3_kernel(const T* __restrict__ a, const T* __restrict__ b,
    const T* __restrict__ c, T* __restrict__ temp, const T cons, const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  const unsigned int lane = threadIdx.x % warpSize;
  const unsigned int wid = threadIdx.x / warpSize;

  __shared__ T shared[32];
  // Seed with the i = 0 candidate so the initial value belongs to the
  // reduced set cons * a[i] / (b[i] - c[i])
  T maxval = cons * a[0] / (b[0] - c[0]);
  for (int i = idx; i < n; i += str) {
    maxval = max(maxval, cons * a[i] / (b[i] - c[i]));
  }

  maxval = max_reduce_warp<T>(maxval);
  if (lane == 0)
    shared[wid] = maxval;
  __syncthreads();

  maxval = (threadIdx.x < blockDim.x / warpSize) ? shared[lane] : 0;
  if (wid == 0)
    maxval = max_reduce_warp<T>(maxval);

  if (threadIdx.x == 0)
    temp[blockIdx.x] = maxval;

}


template <typename T>
__global__ void kkt_rex_kernel(T* __restrict__ rex, const T* __restrict__ df0dx,
    const T* __restrict__ dfdx, const T* __restrict__ xsi,
    const T* __restrict__ eta, const T* __restrict__ lambda, const int n,
    const int m) {
  int tj = blockIdx.x * blockDim.x + threadIdx.x;
  if (tj < n) {
    rex[tj] = 0.0;
    for (int i = 0; i < m; i++) {
      rex[tj] = rex[tj] + dfdx[i + tj*m] * lambda[i];
    }
    rex[tj] += df0dx[tj] - xsi[tj] + eta[tj];
  }
}


template <typename T>
__global__ void maxcons_kernel(T* __restrict__ a, const T b,
    const T c, const T* __restrict__ d, const int n) {
  int tj = blockIdx.x * blockDim.x + threadIdx.x;
  if (tj < n) {
    a[tj] = max(b, c * d[tj]);
  }
}

template< typename T >
__global__ void glsum_kernel(const T * a, T * buf_h, const int n) {
  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  const unsigned int lane = threadIdx.x % warpSize;
  const unsigned int wid = threadIdx.x / warpSize;

  __shared__ T shared[32];
  T sum = 0;
  for (int i = idx; i < n; i += str) {
    sum += a[i];
  }

  sum = reduce_warp<T>(sum);
  if (lane == 0)
    shared[wid] = sum;
  __syncthreads();

  sum = (threadIdx.x < blockDim.x / warpSize) ? shared[lane] : 0;
  if (wid == 0)
    sum = reduce_warp<T>(sum);

  if (threadIdx.x == 0)
    buf_h[blockIdx.x] = sum;

}

template< typename T >
__global__ void glsc2_kernel(const T * a,
    const T * b,
    T * buf_h,
    const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  const unsigned int lane = threadIdx.x % warpSize;
  const unsigned int wid = threadIdx.x / warpSize;

  __shared__ T shared[32];
  T sum = 0.0;
  for (int i = idx; i < n; i += str) {
    sum += a[i] * b[i];
  }

  sum = reduce_warp<T>(sum);
  if (lane == 0)
    shared[wid] = sum;
  __syncthreads();

  sum = (threadIdx.x < blockDim.x / warpSize) ? shared[lane] : 0;
  if (wid == 0)
    sum = reduce_warp<T>(sum);

  if (threadIdx.x == 0)
    buf_h[blockIdx.x] = sum;

}


template <typename T>
__global__ void add2inv2_kernel(T* __restrict__ a, const T* __restrict__ b,
    const T c, const int n) {
  int tj = blockIdx.x * blockDim.x + threadIdx.x;
  if (tj < n) {
    a[tj] = a[tj] + c / b[tj];
  }
}

template <typename T>
__global__ void max2_kernel(T* __restrict__ a, const T b,
    const T* __restrict__ c, const T d, const int n) {
  int tj = blockIdx.x * blockDim.x + threadIdx.x;
  if (tj < n) {
    a[tj] = max(b, d * c[tj]);
  }
}

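/* Right-hand side of the reduced (m+1)-dimensional dual system: for j < m,
   bb_j = dellambda_j + dely_j / (d_j + mu_j / y_j) - bb_j, where bb is
   assumed to already hold the GG * delx / diagx contribution; the last
   entry is set to delz. */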
template <typename T>
__global__ void updatebb_kernel(T* __restrict__ bb,
    const T* __restrict__ dellambda, const T* __restrict__ dely,
    const T* __restrict__ d, const T* __restrict__ mu,
    const T* __restrict__ y, const T delz, const int m) {
  int tj = blockIdx.x * blockDim.x + threadIdx.x;
  if (tj < m)
    bb[tj] = dellambda[tj] + dely[tj] / (d[tj] + mu[tj] / y[tj]) - bb[tj];
  else if (tj < m + 1)
    bb[tj] = delz;
}


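/* Assembles the (m+1) x (m+1) matrix AA of the reduced dual system: the first
   m diagonal entries get globaltmp_mm(j,j) + s_j/lambda_j + 1/(d_j + mu_j/y_j),
   the last row and column hold the entries of a, and the bottom-right corner
   is set to -zeta/z. */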
template <typename T>
__global__ void updateAA_kernel(T* __restrict__ AA,
    const T* __restrict__ globaltmp_mm, const T* __restrict__ s,
    const T* __restrict__ lambda, const T* __restrict__ d,
    const T* __restrict__ mu, const T* __restrict__ y, const T* __restrict__ a,
    const T zeta, const T z, const int m) {
  int tj = blockIdx.x * blockDim.x + threadIdx.x;
  if (tj < m) {
    AA[tj + tj*(m+1)] = globaltmp_mm[tj + tj*m] + (s[tj] / lambda[tj] +
        1.0 / (d[tj] + mu[tj] / y[tj]));
    AA[tj + m*(m+1)] = a[tj];
    AA[m + tj*(m+1)] = a[tj];
  }
  else if (tj < m + 1)
    AA[tj + tj*(m+1)] = -zeta / z;
}

template <typename T>
__global__ void dy_kernel(T* __restrict__ dy, const T* __restrict__ dely,
    const T* __restrict__ dlambda, const T* __restrict__ d,
    const T* __restrict__ mu, const T* __restrict__ y, const int n) {
  int tj = blockIdx.x * blockDim.x + threadIdx.x;
  if (tj < n)
    dy[tj] = (-dely[tj] + dlambda[tj]) / (d[tj] + mu[tj] / y[tj]);
}

#endif