Neko-TOP
A portable framework for high-order spectral element flow topology optimization.
mma_kernel.h
Go to the documentation of this file.
#ifndef MMA_KERNEL_H
#define MMA_KERNEL_H

/* Initialize the MMA asymptotes around the current design x:
   xlow = x - asyinit*(xmax - xmin), xupp = x + asyinit*(xmax - xmin). */
template <typename T>
__global__ void mma_sub1_kernel(T* __restrict__ xlow, T* __restrict__ xupp,
    const T* __restrict__ x, const T* __restrict__ xmin,
    const T* __restrict__ xmax, const T asyinit, const int n) {
  int tj = blockIdx.x * blockDim.x + threadIdx.x;
  if (tj < n) {
    T xgap = xmax[tj] - xmin[tj];
    xlow[tj] = x[tj] - asyinit * xgap;
    xupp[tj] = x[tj] + asyinit * xgap;
  }
}
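
/*
 * Example (editor's sketch, not part of the original header): the
 * elementwise kernels in this file are launched with one thread per
 * design variable. Illustrative host code, assuming caller-allocated
 * device pointers (x_d, ...) and a CUDA stream `stream`:
 *
 *   const int nthreads = 1024;
 *   const int nblocks = (n + nthreads - 1) / nthreads;
 *   mma_sub1_kernel<double><<<nblocks, nthreads, 0, stream>>>(
 *       xlow_d, xupp_d, x_d, xmin_d, xmax_d, asyinit, n);
 */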

/* Update the asymptotes from the iteration history: contract by asydecr
   where x oscillates, expand by asyincr where it moves monotonically,
   then keep low and upp within fixed distances of x. */
template <typename T>
__global__ void mma_sub2_kernel(T* __restrict__ low, T* __restrict__ upp,
    const T* __restrict__ x, const T* __restrict__ xold1,
    const T* __restrict__ xold2, const T* __restrict__ xmin,
    const T* __restrict__ xmax, const T asydecr, const T asyincr,
    const int n) {
  int tj = blockIdx.x * blockDim.x + threadIdx.x;
  if (tj < n) {
    T xgap = xmax[tj] - xmin[tj];
    T xdiff = (x[tj] - xold1[tj]) * (xold1[tj] - xold2[tj]);
    if (xdiff < 0) {
      low[tj] = x[tj] - asydecr * (xold1[tj] - low[tj]);
      upp[tj] = x[tj] + asydecr * (upp[tj] - xold1[tj]);
    }
    else if (xdiff > 0) {
      low[tj] = x[tj] - asyincr * (xold1[tj] - low[tj]);
      upp[tj] = x[tj] + asyincr * (upp[tj] - xold1[tj]);
    }
    else {
      low[tj] = x[tj] - (xold1[tj] - low[tj]);
      upp[tj] = x[tj] + (upp[tj] - xold1[tj]);
    }
    low[tj] = max(low[tj], x[tj] - 10 * xgap);
    low[tj] = min(low[tj], x[tj] - 0.01 * xgap);
    upp[tj] = min(upp[tj], x[tj] + 10 * xgap);
    /* keep upp at least 0.01*xgap above x, mirroring the low clamp */
    upp[tj] = max(upp[tj], x[tj] + 0.01 * xgap);
  }
}

/* Compute the move limits alpha, beta and the approximation coefficients
   p0j, q0j (objective) and pij, qij (constraints), using the usual
   1.001/0.001 gradient splitting plus a small regularization term. */
template <typename T>
__global__ void mma_sub3_kernel(const T* __restrict__ x,
    const T* __restrict__ df0dx, const T* __restrict__ dfdx,
    T* __restrict__ low, T* __restrict__ upp, const T* __restrict__ xmin,
    const T* __restrict__ xmax, T* __restrict__ alpha, T* __restrict__ beta,
    T* __restrict__ p0j, T* __restrict__ q0j, T* __restrict__ pij,
    T* __restrict__ qij, const int n, const int m) {
  int tj = blockIdx.x * blockDim.x + threadIdx.x;
  if (tj < n) {
    T xgap = xmax[tj] - xmin[tj];
    alpha[tj] = max(max(xmin[tj], low[tj] +
        0.1 * (x[tj] - low[tj])), x[tj] - 0.5 * xgap);
    beta[tj] = min(min(xmax[tj], upp[tj] - 0.1 * (upp[tj] - x[tj])), x[tj] +
        0.5 * xgap);
    p0j[tj] = pow(upp[tj] - x[tj], 2) * (1.001 * max(df0dx[tj], 0.0) +
        0.001 * max(-df0dx[tj], 0.0) + 0.00001 / max(0.00001, xgap));
    q0j[tj] = pow(x[tj] - low[tj], 2) * (0.001 * max(df0dx[tj], 0.0) +
        1.001 * max(-df0dx[tj], 0.0) + 0.00001 / max(0.00001, xgap));
    for (int i = 0; i < m; i++) {
      pij[i + tj * m] = pow(upp[tj] - x[tj], 2) *
          (1.001 * max(dfdx[i + tj * m], 0.0) + 0.001 *
          max(-dfdx[i + tj * m], 0.0) + 0.00001 / max(0.00001, xgap));
      qij[i + tj * m] = pow(x[tj] - low[tj], 2) *
          (0.001 * max(dfdx[i + tj * m], 0.0) + 1.001 *
          max(-dfdx[i + tj * m], 0.0) + 0.00001 / max(0.00001, xgap));
    }
  }
}

/* Per-element terms pij/(upp - x) + qij/(x - low); the caller reduces
   temp over j to evaluate the constraint approximations. */
template <typename T>
__global__ void mma_sub4_kernel(const T* __restrict__ x, T* __restrict__ low,
    T* __restrict__ upp, T* __restrict__ pij, T* __restrict__ qij,
    T* __restrict__ temp, const int n, const int m) {
  int tj = blockIdx.x * blockDim.x + threadIdx.x;
  if (tj < n) {
    for (int i = 0; i < m; i++) {
      temp[i + tj * m] = pij[i + tj * m] / (upp[tj] - x[tj]) +
          qij[i + tj * m] / (x[tj] - low[tj]);
    }
  }
}

/* Initialize the bound multiplier xsi = max(1, 1/(x - alpha)). */
template <typename T>
__global__ void mma_max2_kernel(T* __restrict__ xsi, const T* __restrict__ x,
    T* __restrict__ alpha, const int n) {
  int tj = blockIdx.x * blockDim.x + threadIdx.x;
  if (tj < n) {
    xsi[tj] = max(1.0, 1.0 / (x[tj] - alpha[tj]));
  }
}

/* Per-element terms of the lambda residual; reduced over j by the caller. */
template <typename T>
__global__ void relambda_kernel(T* __restrict__ temp, const T* __restrict__ x,
    const T* __restrict__ xupp, const T* __restrict__ xlow,
    const T* __restrict__ pij, const T* __restrict__ qij, const int n,
    const int m) {
  int tj = blockIdx.x * blockDim.x + threadIdx.x;
  if (tj < n) {
    for (int i = 0; i < m; i++) {
      temp[i + tj * m] = pij[i + tj * m] / (xupp[tj] - x[tj]) +
          qij[i + tj * m] / (x[tj] - xlow[tj]);
    }
  }
}

/* a = b*(c - d) - e, elementwise with scalar e. */
template <typename T>
__global__ void sub2cons2_kernel(T* __restrict__ a, const T* __restrict__ b,
    const T* __restrict__ c, const T* __restrict__ d,
    const T e, const int n) {
  int tj = blockIdx.x * blockDim.x + threadIdx.x;
  if (tj < n) {
    a[tj] = b[tj] * (c[tj] - d[tj]) - e;
  }
}

/* Warp-level max reduction via shuffle-down; lane 0 ends up holding the
   maximum over the warp. */
template <typename T>
__inline__ __device__ T max_reduce_warp(T val) {
  val = max(val, __shfl_down_sync(0xffffffff, val, 16));
  val = max(val, __shfl_down_sync(0xffffffff, val, 8));
  val = max(val, __shfl_down_sync(0xffffffff, val, 4));
  val = max(val, __shfl_down_sync(0xffffffff, val, 2));
  val = max(val, __shfl_down_sync(0xffffffff, val, 1));
  return val;
}

/* Blockwise max of |a[i]|: each block writes its partial maximum to
   temp[blockIdx.x]; finish the reduction with max_reduce_kernel. */
template <typename T>
__global__ void maxval_kernel(const T* __restrict__ a, T* temp, const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  const unsigned int lane = threadIdx.x % warpSize;
  const unsigned int wid = threadIdx.x / warpSize;

  __shared__ T shared[32];
  T maxval = 0.0;
  for (int i = idx; i < n; i += str) {
    maxval = max(maxval, abs(a[i]));
  }

  maxval = max_reduce_warp<T>(maxval);
  if (lane == 0)
    shared[wid] = maxval;
  __syncthreads();

  maxval = (threadIdx.x < blockDim.x / warpSize) ? shared[lane] : 0.0;
  if (wid == 0)
    maxval = max_reduce_warp<T>(maxval);

  if (threadIdx.x == 0)
    temp[blockIdx.x] = maxval;
}

/* Fold per-block maxima in place: launch repeatedly until one block
   remains, leaving the global maximum in bufred[0]. */
template <typename T>
__global__ void max_reduce_kernel(T* __restrict__ bufred, const int n) {

  T maxval = 0.0;
  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;
  for (int i = idx; i < n; i += str) {
    maxval = max(maxval, bufred[i]);
  }

  __shared__ T shared[32];
  unsigned int lane = threadIdx.x % warpSize;
  unsigned int wid = threadIdx.x / warpSize;

  maxval = max_reduce_warp<T>(maxval);
  if (lane == 0)
    shared[wid] = maxval;
  __syncthreads();

  maxval = (threadIdx.x < blockDim.x / warpSize) ? shared[lane] : 0.0;
  if (wid == 0)
    maxval = max_reduce_warp<T>(maxval);

  if (threadIdx.x == 0)
    bufred[blockIdx.x] = maxval;
}
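
/*
 * Example (editor's sketch): a device-wide maximum is obtained in two
 * passes -- maxval_kernel leaves one partial per block, max_reduce_kernel
 * then folds the partials. Names are illustrative; temp_d must hold at
 * least nblocks values.
 *
 *   maxval_kernel<double><<<nblocks, nthreads>>>(a_d, temp_d, n);
 *   max_reduce_kernel<double><<<1, nthreads>>>(temp_d, nblocks);
 *   // the second launch uses a single block, so the result is temp_d[0]
 */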

/* Gradient of the subproblem Lagrangian w.r.t. x (the "delx" right-hand
   side of the Newton system in the interior-point solve). */
template <typename T>
__global__ void delx_kernel(T* __restrict__ delx, const T* __restrict__ x,
    const T* __restrict__ xlow, const T* __restrict__ xupp,
    const T* __restrict__ pij, const T* __restrict__ qij,
    const T* __restrict__ p0j, const T* __restrict__ q0j,
    const T* __restrict__ alpha, const T* __restrict__ beta,
    const T* __restrict__ lambda, const T epsi, const int n, const int m) {
  int tj = blockIdx.x * blockDim.x + threadIdx.x;
  if (tj < n) {
    delx[tj] = 0;
    for (int i = 0; i < m; i++) {
      delx[tj] = delx[tj] + pij[i + tj * m] *
          lambda[i] / pow(xupp[tj] - x[tj], 2) -
          qij[i + tj * m] * lambda[i] / pow(x[tj] - xlow[tj], 2);
    }
    delx[tj] = delx[tj] + p0j[tj] / pow(xupp[tj] - x[tj], 2) -
        q0j[tj] / pow(x[tj] - xlow[tj], 2) - epsi / (x[tj] - alpha[tj])
        + epsi / (beta[tj] - x[tj]);
  }
}

/* Constraint Jacobian of the MMA approximation:
   GG[i,j] = pij/(xupp - x)^2 - qij/(x - xlow)^2. */
template <typename T>
__global__ void GG_kernel(T* __restrict__ GG, const T* __restrict__ x,
    const T* __restrict__ xlow, const T* __restrict__ xupp,
    const T* __restrict__ pij, const T* __restrict__ qij, const int n,
    const int m) {
  int tj = blockIdx.x * blockDim.x + threadIdx.x;
  if (tj < n) {
    for (int i = 0; i < m; i++) {
      GG[i + m * tj] = pij[i + m * tj] / pow(xupp[tj] - x[tj], 2) -
          qij[i + m * tj] / pow(x[tj] - xlow[tj], 2);
    }
  }
}

/* Diagonal of the (approximate) Hessian used in the Newton step. */
template <typename T>
__global__ void diagx_kernel(T* __restrict__ diagx, const T* __restrict__ x,
    const T* __restrict__ xsi, const T* __restrict__ xlow,
    const T* __restrict__ xupp, const T* __restrict__ p0j,
    const T* __restrict__ q0j, const T* __restrict__ pij,
    const T* __restrict__ qij, const T* alpha, const T* beta,
    const T* eta, const T* lambda, const int n, const int m) {
  int tj = blockIdx.x * blockDim.x + threadIdx.x;
  if (tj < n) {
    T sum = 0;
    T sum1 = 0;
    for (int i = 0; i < m; i++) {
      sum = sum + pij[tj * m + i] * lambda[i];
      sum1 = sum1 + qij[tj * m + i] * lambda[i];
    }
    diagx[tj] = (p0j[tj] + sum) / pow(xupp[tj] - x[tj], 3) +
        (q0j[tj] + sum1) / pow(x[tj] - xlow[tj], 3);
    diagx[tj] = 2.0 * diagx[tj] + xsi[tj] / (x[tj] - alpha[tj]) +
        eta[tj] / (beta[tj] - x[tj]);
  }
}

/* Warp-level sum reduction via shuffle-down; lane 0 ends up holding the
   sum over the warp. */
template <typename T>
__inline__ __device__ T reduce_warp(T val) {
  val += __shfl_down_sync(0xffffffff, val, 16);
  val += __shfl_down_sync(0xffffffff, val, 8);
  val += __shfl_down_sync(0xffffffff, val, 4);
  val += __shfl_down_sync(0xffffffff, val, 2);
  val += __shfl_down_sync(0xffffffff, val, 1);
  return val;
}

/* Fold per-block partial sums in place: launch repeatedly until one block
   remains, leaving the total in bufred[0]. */
template <typename T>
__global__ void mmareduce_kernel(T* __restrict__ bufred, const int n) {

  T sum = 0;
  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;
  for (int i = idx; i < n; i += str) {
    sum += bufred[i];
  }

  __shared__ T shared[32];
  unsigned int lane = threadIdx.x % warpSize;
  unsigned int wid = threadIdx.x / warpSize;

  sum = reduce_warp<T>(sum);
  if (lane == 0)
    shared[wid] = sum;
  __syncthreads();

  sum = (threadIdx.x < blockDim.x / warpSize) ? shared[lane] : 0;
  if (wid == 0)
    sum = reduce_warp<T>(sum);

  if (threadIdx.x == 0)
    bufred[blockIdx.x] = sum;
}

/* Partial sums of column k of the m-strided array a:
   buf_h[blockIdx.x] = sum over this block's slice of a[m*i + k]. */
template <typename T>
__global__ void mmasum_kernel(const T* __restrict__ a, T* __restrict__ buf_h,
    const int n, const int m, const int k) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  const unsigned int lane = threadIdx.x % warpSize;
  const unsigned int wid = threadIdx.x / warpSize;

  __shared__ T shared[32];
  T sum = 0;
  for (int i = idx; i < n; i += str) {
    sum += a[m * i + k];
  }

  sum = reduce_warp<T>(sum);
  if (lane == 0)
    shared[wid] = sum;
  __syncthreads();

  sum = (threadIdx.x < blockDim.x / warpSize) ? shared[lane] : 0;
  if (wid == 0)
    sum = reduce_warp<T>(sum);

  if (threadIdx.x == 0)
    buf_h[blockIdx.x] = sum;
}
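
/*
 * Example (editor's sketch): summing column k of the n x m strided array
 * a with the two kernels above; buf_d is an illustrative scratch buffer
 * of at least nblocks entries.
 *
 *   mmasum_kernel<double><<<nblocks, nthreads>>>(a_d, buf_d, n, m, k);
 *   mmareduce_kernel<double><<<1, nthreads>>>(buf_d, nblocks);
 *   // buf_d[0] now holds sum_i a[m*i + k]
 */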

/* Partial sums for the Newton right-hand side:
   buf_h[blockIdx.x] = sum over the slice of GG[k + i*m] * delx[i] / diagx[i]. */
template <typename T>
__global__ void mmasumbb_kernel(const T* __restrict__ GG,
    const T* __restrict__ delx, const T* __restrict__ diagx,
    T* __restrict__ buf_h, const int n, const int m, const int k) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  const unsigned int lane = threadIdx.x % warpSize;
  const unsigned int wid = threadIdx.x / warpSize;

  __shared__ T shared[32];
  T sum = 0;
  for (int i = idx; i < n; i += str) {
    sum += GG[k + i * m] * delx[i] / diagx[i];
  }

  sum = reduce_warp<T>(sum);
  if (lane == 0)
    shared[wid] = sum;
  __syncthreads();

  sum = (threadIdx.x < blockDim.x / warpSize) ? shared[lane] : 0;
  if (wid == 0)
    sum = reduce_warp<T>(sum);

  if (threadIdx.x == 0)
    buf_h[blockIdx.x] = sum;
}

/* Partial sums for one entry of the reduced dual system:
   buf_h[blockIdx.x] = sum over the slice of GG[k0 + i*m] * GG[k1 + i*m] / diagx[i]. */
template <typename T>
__global__ void mmasumAA_kernel(const T* __restrict__ GG,
    const T* __restrict__ diagx, T* __restrict__ buf_h, const int n,
    const int m, const int k0, const int k1) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  const unsigned int lane = threadIdx.x % warpSize;
  const unsigned int wid = threadIdx.x / warpSize;

  __shared__ T shared[32];
  T sum = 0;
  for (int i = idx; i < n; i += str) {
    sum += GG[k0 + i * m] / diagx[i] * GG[k1 + i * m];
  }

  sum = reduce_warp<T>(sum);
  if (lane == 0)
    shared[wid] = sum;
  __syncthreads();

  sum = (threadIdx.x < blockDim.x / warpSize) ? shared[lane] : 0;
  if (wid == 0)
    sum = reduce_warp<T>(sum);

  if (threadIdx.x == 0)
    buf_h[blockIdx.x] = sum;
}

/* Copy b into a at offset m: a[m + tj] = b[tj] for tj < n. */
template <typename T>
__global__ void mma_copy_kernel(T* __restrict__ a, const T* __restrict__ b,
    const int n, const int m) {
  int tj = blockIdx.x * blockDim.x + threadIdx.x;
  if (tj < n)
    a[tj + m] = b[tj];
}

/* Elementwise products temp[tj + i0*n + i1*m*n] = GG[i0*n + tj] *
   GG[i1*n + tj] / diagx[tj]; note GG is addressed constraint-major here,
   unlike the [i + m*tj] layout used by GG_kernel above. */
template <typename T>
__global__ void AA_kernel(T* __restrict__ temp, const T* __restrict__ GG,
    const T* __restrict__ diagx, const int n, const int m) {
  int tj = blockIdx.x * blockDim.x + threadIdx.x;
  if (tj < n) {
    for (int i0 = 0; i0 < m; i0++) {
      for (int i1 = 0; i1 < m; i1++) {
        temp[tj + i0 * n + i1 * m * n] = GG[i0 * n + tj] *
            (1.0 / diagx[tj]) * GG[i1 * n + tj];
      }
    }
  }
}
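
/*
 * Editor's note (interpretation from the kernels, following the standard
 * MMA interior-point algebra): reduced over j, these products assemble
 * the m x m block and right-hand side of the dual Newton system,
 *
 *   AA[k0][k1] = sum_j GG[k0][j] * GG[k1][j] / diagx[j]
 *   bb[k]      = dellambda[k] + dely[k] / (d[k] + mu[k]/y[k])
 *                - sum_j GG[k][j] * delx[j] / diagx[j]
 *
 * computed blockwise by mmasumAA_kernel / mmasumbb_kernel and finalized
 * by updateAA_kernel / updatebb_kernel further down.
 */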

/* Back-substitute for the primal step: dx = -(delx + GG^T dlambda) / diagx. */
template <typename T>
__global__ void dx_kernel(T* __restrict__ dx, const T* __restrict__ delx,
    const T* __restrict__ diagx, const T* __restrict__ GG,
    const T* __restrict__ dlambda, const int n, const int m) {
  int tj = blockIdx.x * blockDim.x + threadIdx.x;
  if (tj < n) {
    dx[tj] = -delx[tj] / diagx[tj];
    for (int i = 0; i < m; i++) {
      dx[tj] = dx[tj] - GG[tj * m + i] * dlambda[i] / diagx[tj];
    }
  }
}

/* Step in the lower-bound multiplier:
   dxsi = -xsi + (epsi - dx*xsi) / (x - alpha). */
template <typename T>
__global__ void dxsi_kernel(T* __restrict__ dxsi, const T* __restrict__ xsi,
    const T* __restrict__ dx, const T* __restrict__ x,
    const T* __restrict__ alpha, const T epsi, const int n) {
  int tj = blockIdx.x * blockDim.x + threadIdx.x;
  if (tj < n) {
    dxsi[tj] = -xsi[tj] + (epsi - dx[tj] * xsi[tj]) / (x[tj] - alpha[tj]);
  }
}

/* Step in the upper-bound multiplier:
   deta = -eta + (epsi + dx*eta) / (beta - x). */
template <typename T>
__global__ void deta_kernel(T* __restrict__ deta, const T* __restrict__ eta,
    const T* __restrict__ dx, const T* __restrict__ x,
    const T* __restrict__ beta, const T epsi, const int n) {
  int tj = blockIdx.x * blockDim.x + threadIdx.x;
  if (tj < n) {
    deta[tj] = -eta[tj] + (epsi + dx[tj] * eta[tj]) / (beta[tj] - x[tj]);
  }
}

/* KKT residual in x for the MMA subproblem. */
template <typename T>
__global__ void RexCalculation_kernel(T* __restrict__ rex,
    const T* __restrict__ x, const T* __restrict__ xlow,
    const T* __restrict__ xupp, const T* __restrict__ pij,
    const T* __restrict__ p0j, const T* __restrict__ qij,
    const T* __restrict__ q0j, const T* __restrict__ lambda,
    const T* __restrict__ xsi, const T* __restrict__ eta, const int n,
    const int m) {
  int tj = blockIdx.x * blockDim.x + threadIdx.x;
  if (tj < n) {
    rex[tj] = 0.0;
    for (int i = 0; i < m; i++) {
      rex[tj] = rex[tj] + pij[i + tj * m] * lambda[i] / pow(xupp[tj] - x[tj], 2) -
          qij[i + tj * m] * lambda[i] / pow(x[tj] - xlow[tj], 2);
    }
    rex[tj] = rex[tj] + p0j[tj] / pow(xupp[tj] - x[tj], 2) -
        q0j[tj] / pow(x[tj] - xlow[tj], 2) - xsi[tj] + eta[tj];
  }
}

/* KKT residual in y: rey = c + d*y - lambda - mu. */
template <typename T>
__global__ void rey_calculation_kernel(T* __restrict__ rey,
    const T* __restrict__ c, const T* __restrict__ d, const T* __restrict__ y,
    const T* __restrict__ lambda, const T* __restrict__ mu, const int n) {
  int tj = blockIdx.x * blockDim.x + threadIdx.x;
  if (tj < n) {
    rey[tj] = c[tj] + d[tj] * y[tj] - lambda[tj] - mu[tj];
  }
}

/* Partial sums of squares: buf_h[blockIdx.x] = sum over the slice of a[i]^2. */
template <typename T>
__global__ void norm_kernel(const T* __restrict__ a, T* __restrict__ buf_h,
    const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  const unsigned int lane = threadIdx.x % warpSize;
  const unsigned int wid = threadIdx.x / warpSize;

  __shared__ T shared[32];
  T sum = 0;
  for (int i = idx; i < n; i += str) {
    sum += pow(a[i], 2);
  }

  sum = reduce_warp<T>(sum);
  if (lane == 0)
    shared[wid] = sum;
  __syncthreads();

  sum = (threadIdx.x < blockDim.x / warpSize) ? shared[lane] : 0;
  if (wid == 0)
    sum = reduce_warp<T>(sum);

  if (threadIdx.x == 0)
    buf_h[blockIdx.x] = sum;
}

/* a = b*c - d, elementwise with scalar d. */
template <typename T>
__global__ void sub2cons_kernel(T* __restrict__ a, const T* __restrict__ b,
    const T* __restrict__ c, const T d, const int n) {
  int tj = blockIdx.x * blockDim.x + threadIdx.x;
  if (tj < n) {
    a[tj] = b[tj] * c[tj] - d;
  }
}

/* dely = c + d*y - lambda - epsi/y, elementwise. */
template <typename T>
__global__ void dely_kernel(T* __restrict__ dely, const T* __restrict__ c,
    const T* __restrict__ d, const T* __restrict__ y,
    const T* __restrict__ lambda, const T epsi, const int n) {
  int tj = blockIdx.x * blockDim.x + threadIdx.x;
  if (tj < n) {
    dely[tj] = c[tj] + d[tj] * y[tj] - lambda[tj] - epsi / y[tj];
  }
}

/* Blockwise max of cons * a[i] / b[i]; one partial per block in
   temp[blockIdx.x], finished with max_reduce_kernel. */
template <typename T>
__global__ void maxval2_kernel(const T* __restrict__ a, const T* __restrict__ b,
    T* __restrict__ temp, const T cons, const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  const unsigned int lane = threadIdx.x % warpSize;
  const unsigned int wid = threadIdx.x / warpSize;

  __shared__ T shared[32];
  T maxval = cons * a[0] / b[0];
  for (int i = idx; i < n; i += str) {
    maxval = max(maxval, cons * a[i] / b[i]);
  }

  maxval = max_reduce_warp<T>(maxval);
  if (lane == 0)
    shared[wid] = maxval;
  __syncthreads();

  maxval = (threadIdx.x < blockDim.x / warpSize) ? shared[lane] : 0.0;
  if (wid == 0)
    maxval = max_reduce_warp<T>(maxval);

  if (threadIdx.x == 0)
    temp[blockIdx.x] = maxval;
}

/* Blockwise max of cons * a[i] / (b[i] - c[i]); one partial per block in
   temp[blockIdx.x], finished with max_reduce_kernel. */
template <typename T>
__global__ void maxval3_kernel(const T* __restrict__ a, const T* __restrict__ b,
    const T* __restrict__ c, T* __restrict__ temp, const T cons, const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  const unsigned int lane = threadIdx.x % warpSize;
  const unsigned int wid = threadIdx.x / warpSize;

  __shared__ T shared[32];
  /* seed with element 0 of the same expression used in the loop */
  T maxval = cons * a[0] / (b[0] - c[0]);
  for (int i = idx; i < n; i += str) {
    maxval = max(maxval, cons * a[i] / (b[i] - c[i]));
  }

  maxval = max_reduce_warp<T>(maxval);
  if (lane == 0)
    shared[wid] = maxval;
  __syncthreads();

  maxval = (threadIdx.x < blockDim.x / warpSize) ? shared[lane] : 0;
  if (wid == 0)
    maxval = max_reduce_warp<T>(maxval);

  if (threadIdx.x == 0)
    temp[blockIdx.x] = maxval;
}

/* KKT residual in x for the original problem:
   rex = df0dx + dfdx^T lambda - xsi + eta. */
template <typename T>
__global__ void kkt_rex_kernel(T* __restrict__ rex, const T* __restrict__ df0dx,
    const T* __restrict__ dfdx, const T* __restrict__ xsi,
    const T* __restrict__ eta, const T* __restrict__ lambda, const int n,
    const int m) {
  int tj = blockIdx.x * blockDim.x + threadIdx.x;
  if (tj < n) {
    rex[tj] = 0.0;
    for (int i = 0; i < m; i++) {
      rex[tj] = rex[tj] + dfdx[i + tj * m] * lambda[i];
    }
    rex[tj] += df0dx[tj] - xsi[tj] + eta[tj];
  }
}

/* a = max(b, c*d), with scalars b, c and vector d. */
template <typename T>
__global__ void maxcons_kernel(T* __restrict__ a, const T b,
    const T c, const T* __restrict__ d, const int n) {
  int tj = blockIdx.x * blockDim.x + threadIdx.x;
  if (tj < n) {
    a[tj] = max(b, c * d[tj]);
  }
}

/* Partial sums of a: buf_h[blockIdx.x] = sum over this block's slice. */
template <typename T>
__global__ void glsum_kernel(const T* a, T* buf_h, const int n) {
  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  const unsigned int lane = threadIdx.x % warpSize;
  const unsigned int wid = threadIdx.x / warpSize;

  __shared__ T shared[32];
  T sum = 0;
  for (int i = idx; i < n; i += str) {
    sum += a[i];
  }

  sum = reduce_warp<T>(sum);
  if (lane == 0)
    shared[wid] = sum;
  __syncthreads();

  sum = (threadIdx.x < blockDim.x / warpSize) ? shared[lane] : 0;
  if (wid == 0)
    sum = reduce_warp<T>(sum);

  if (threadIdx.x == 0)
    buf_h[blockIdx.x] = sum;
}

/* Partial dot products: buf_h[blockIdx.x] = sum over the slice of a[i]*b[i]. */
template <typename T>
__global__ void glsc2_kernel(const T* a, const T* b, T* buf_h, const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  const unsigned int lane = threadIdx.x % warpSize;
  const unsigned int wid = threadIdx.x / warpSize;

  __shared__ T shared[32];
  T sum = 0.0;
  for (int i = idx; i < n; i += str) {
    sum += a[i] * b[i];
  }

  sum = reduce_warp<T>(sum);
  if (lane == 0)
    shared[wid] = sum;
  __syncthreads();

  sum = (threadIdx.x < blockDim.x / warpSize) ? shared[lane] : 0;
  if (wid == 0)
    sum = reduce_warp<T>(sum);

  if (threadIdx.x == 0)
    buf_h[blockIdx.x] = sum;
}

/* a += c / b, elementwise with scalar c. */
template <typename T>
__global__ void add2inv2_kernel(T* __restrict__ a, const T* __restrict__ b,
    const T c, const int n) {
  int tj = blockIdx.x * blockDim.x + threadIdx.x;
  if (tj < n) {
    a[tj] = a[tj] + c / b[tj];
  }
}

/* a = max(b, d*c), with scalars b, d and vector c. */
template <typename T>
__global__ void max2_kernel(T* __restrict__ a, const T b,
    const T* __restrict__ c, const T d, const int n) {
  int tj = blockIdx.x * blockDim.x + threadIdx.x;
  if (tj < n) {
    a[tj] = max(b, d * c[tj]);
  }
}

/* Finalize the dual right-hand side: for tj < m, bb holds the reduced
   GG*(delx/diagx) sum on entry and is replaced by
   dellambda + dely/(d + mu/y) - bb; entry m is set to delz. */
template <typename T>
__global__ void updatebb_kernel(T* __restrict__ bb,
    const T* __restrict__ dellambda, const T* __restrict__ dely,
    const T* __restrict__ d, const T* __restrict__ mu,
    const T* __restrict__ y, const T delz, const int m) {
  int tj = blockIdx.x * blockDim.x + threadIdx.x;
  if (tj < m)
    bb[tj] = dellambda[tj] + dely[tj] / (d[tj] + mu[tj] / y[tj]) - bb[tj];
  else if (tj < m + 1)
    bb[tj] = delz;
}

/* Finalize the (m+1) x (m+1) dual system matrix: add the diagonal terms
   s/lambda + 1/(d + mu/y) to the reduced GG*diag(1/diagx)*GG^T block,
   border it with the vector a, and set the last diagonal entry to -zeta/z. */
template <typename T>
__global__ void updateAA_kernel(T* __restrict__ AA,
    const T* __restrict__ globaltmp_mm, const T* __restrict__ s,
    const T* __restrict__ lambda, const T* __restrict__ d,
    const T* __restrict__ mu, const T* __restrict__ y, const T* __restrict__ a,
    const T zeta, const T z, const int m) {
  int tj = blockIdx.x * blockDim.x + threadIdx.x;
  if (tj < m) {
    AA[tj + tj * (m + 1)] = globaltmp_mm[tj + tj * m] + (s[tj] / lambda[tj] +
        1.0 / (d[tj] + mu[tj] / y[tj]));
    AA[tj + m * (m + 1)] = a[tj];
    AA[m + tj * (m + 1)] = a[tj];
  }
  else if (tj < m + 1)
    AA[tj + tj * (m + 1)] = -zeta / z;
}

/* Step in the slack variables: dy = (dlambda - dely) / (d + mu/y). */
template <typename T>
__global__ void dy_kernel(T* __restrict__ dy, const T* __restrict__ dely,
    const T* __restrict__ dlambda, const T* __restrict__ d,
    const T* __restrict__ mu, const T* __restrict__ y, const int n) {
  int tj = blockIdx.x * blockDim.x + threadIdx.x;
  if (tj < n)
    dy[tj] = (-dely[tj] + dlambda[tj]) / (d[tj] + mu[tj] / y[tj]);
}

#endif // MMA_KERNEL_H
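For orientation (editor's reading of the kernels above; the actual host driver lives in Neko-TOP's MMA source and may sequence things differently), one MMA subproblem solve strings these kernels together roughly as follows:

  mma_sub1_kernel / mma_sub2_kernel        initialize or adapt the asymptotes low, upp
  mma_sub3_kernel                          move limits alpha, beta and coefficients p, q
  delx_kernel, diagx_kernel, GG_kernel     Newton right-hand side, diagonal, and Jacobian
  mmasumbb / mmasumAA (+ mmareduce)        blockwise reductions for the dual system
  updatebb / updateAA, dx, dy, dxsi, deta  assemble the (m+1) x (m+1) system and back-substitute
  maxval2 / maxval3, norm, glsum, glsc2    step-length limits and residual norms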