Neko-TOP
A portable framework for high-order spectral element flow topology optimization.
mma.cu
// System includes
#include <stdio.h>
#include <stdlib.h>

// Device includes
#include <cuda_runtime.h>
#include <cusolverDn.h>

// Neko includes
#include <neko/device/device_config.h>
#include <neko/device/cuda/check.h>
#include <neko/math/bcknd/device/device_mpi_reduce.h>
#include <neko/math/bcknd/device/device_mpi_op.h>

// Local includes
#include "mma_kernel.h"


extern "C" {

  int mma_red_s = 0;
  real* mma_bufred = NULL;
  real* mma_bufred_d = NULL;

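  /**
   * Prepare the AA matrix of the MMA subsolve from the current dual
   * quantities (s, lambda, d, mu, y, a, zeta, z). All pointer arguments
   * are device pointers; scalars are passed by reference from Fortran.
   */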
  void mma_prepare_aa_matrix_cuda(void* AA, void* s, void* lambda,
                                  void* d, void* mu, void* y,
                                  void* a, real* zeta, real* z, int* m) {
    const dim3 nthrds(1024, 1, 1);
    const dim3 nblcks(((*m) + 1024 - 1) / 1024, 1, 1);

    // Launch kernel to prepare AA matrix
    prepare_aa_matrix_kernel<real><<<nblcks, nthrds, 0,
        (cudaStream_t) glb_cmd_queue>>>(  /* kernel name assumed */
      (real*)AA, (real*)s, (real*)lambda, (real*)d,
      (real*)mu, (real*)y, (real*)a,
      *zeta, *z, *m);

    CUDA_CHECK(cudaGetLastError());
  }

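  /**
   * Update the diagonal of the m x m Hessian approximation and apply a
   * Levenberg-Marquardt style stabilization. A single-block kernel is used
   * for m <= 1024; for larger m the trace is accumulated on the host.
   */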
  void mma_prepare_hessian_cuda(void* Hess, void* y, void* d,
                                void* mu, void* lambda, int* m) {
    const int M = *m;
    const dim3 nthrds(1024, 1, 1);
    const dim3 nblcks((M + 1024 - 1) / 1024, 1, 1);
    const cudaStream_t stream = (cudaStream_t) glb_cmd_queue; /* assumed */

    // Update diagonal elements
    prepare_hessian_kernel<real><<<nblcks, nthrds, 0, stream>>>(  /* kernel name assumed */
      (real*)Hess, (real*)y, (real*)d, (real*)mu, (real*)lambda, M);
    CUDA_CHECK(cudaGetLastError());

    // Synchronize to ensure diagonal updates are complete
    CUDA_CHECK(cudaStreamSynchronize(stream));  /* sync call assumed */

    // Choose kernel based on problem size
    if (M <= 1024) {
      // Single-block version (fast for small m)
      const dim3 stab_nblcks(1, 1, 1);
      hessian_stabilize_kernel<real><<<stab_nblcks, nthrds, 0, stream>>>(  /* kernel name assumed */
        (real*)Hess, M);
      CUDA_CHECK(cudaGetLastError());
    } else {
      // Multi-block version (for large m)
      // Compute trace on host (simple and reliable)
      real* h_Hess = (real*)malloc(M * sizeof(real));

      // Extract diagonal elements
      for (int i = 0; i < M; i++) {
        CUDA_CHECK(cudaMemcpy(&h_Hess[i],
                              (real*)Hess + i * M + i,
                              sizeof(real),
                              cudaMemcpyDeviceToHost));
      }

      // Compute trace and LM factor
      real trace = 0.0;
      for (int i = 0; i < M; i++) {
        trace += h_Hess[i];
      }
      real lm_factor = fmax(-1.0e-4 * trace / M, 1.0e-7);

      // Apply stabilization in parallel
      hessian_add_lm_kernel<real><<<nblcks, nthrds, 0, stream>>>(  /* kernel name assumed */
        (real*)Hess, lm_factor, M);
      CUDA_CHECK(cudaGetLastError());

      free(h_Hess);
    }
  }

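  /**
   * Solve the dense n x n system A x = b on the device with a single-block
   * kernel. info returns 0 on success and -1 when the caller should fall
   * back to the CPU solver.
   */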
  void cuda_custom_solver(void* A, void* b, int n, int* info) {
    const cudaStream_t stream = (cudaStream_t) glb_cmd_queue; /* assumed */

    if (n <= 0) {
      *info = -1;  // Use CPU fallback
      return;
    }
    const dim3 nthrds(1024, 1, 1);
    const dim3 nblcks(1, 1, 1);

    custom_solver_kernel<real><<<nblcks, nthrds, 0, stream>>>(  /* kernel name assumed */
      (real*)A, (real*)b, n);

    cudaError_t err = cudaStreamSynchronize(stream);  /* error check assumed */
    if (err == cudaSuccess) {
      *info = 0;   // GPU solver succeeded
    } else {
      *info = -1;  // GPU failed
    }
  }

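  /**
   * Solve A x = b in double precision with cuSOLVER: LU factorization
   * (getrf) followed by a triangular solve (getrs). The factorization
   * status is returned through jj (0 on success).
   */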
  void cuSOLVER_wrapper(void* A, void* b, int n, int* jj) {
    cusolverDnHandle_t handle;
    cusolverStatus_t status;
    cusolverDnCreate(&handle);  /* handle setup assumed */

    int lwork;
    double* workspace;
    int* ipiv;
    int* info;           // Device pointer for cuSOLVER info
    int host_info = 0;   // Host variable to store the info

    // Workspace query
    status = cusolverDnDgetrf_bufferSize(handle, n, n, (double*)A, n, &lwork);
    if (status != CUSOLVER_STATUS_SUCCESS) {  /* error handling assumed */
      *jj = -1;
      cusolverDnDestroy(handle);
      return;
    }
    cudaMalloc(&workspace, lwork * sizeof(double));
    cudaMalloc(&ipiv, n * sizeof(int));
    cudaMalloc(&info, sizeof(int));

    // LU factorization and solve
    cusolverDnDgetrf(handle, n, n, (double*)A, n, workspace, ipiv, info);

    // Copy info from device to host to check if factorization succeeded
    cudaMemcpy(&host_info, info, sizeof(int), cudaMemcpyDeviceToHost);

    if (host_info == 0) {
      // Only solve if factorization was successful
      cusolverDnDgetrs(handle, CUBLAS_OP_N, n, 1, (double*)A, n, ipiv, (double*)b, n, info);
      // Copy the final info value
      cudaMemcpy(&host_info, info, sizeof(int), cudaMemcpyDeviceToHost);
    }

    // Return the actual info value through jj
    *jj = host_info;

    // Cleanup
    cudaFree(workspace);
    cudaFree(ipiv);
    cudaFree(info);
    cusolverDnDestroy(handle);
  }

  void custom_solve_linear_system(void* A, void* b, int n, int* info) {
    const cudaStream_t stream = (cudaStream_t) glb_cmd_queue; /* assumed */

    if (n <= 0) {
      *info = -1;  // Use CPU fallback
      return;
    }
    const dim3 nthrds(1024, 1, 1);
    const dim3 nblcks(1, 1, 1);

    solve_linear_system_kernel<real><<<nblcks, nthrds, 0, stream>>>(  /* kernel name assumed */
      (real*)A, (real*)b, n);

    cudaError_t err = cudaStreamSynchronize(stream);  /* error check assumed */
    if (err == cudaSuccess) {
      *info = 0;   // GPU solver succeeded
    } else {
      *info = -1;  // GPU failed, use CPU fallback
    }
  }

  void delta_1dbeam_cuda(void* Delta, real* L_total, real* Le,
                         int* offset, int* n) {
    const dim3 nthrds(1024, 1, 1);
    const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
    delta_1dbeam_kernel<real><<<nblcks, nthrds, 0, (cudaStream_t) glb_cmd_queue>>>  /* kernel name assumed */
      ((real*)Delta, *L_total, *Le, *offset, *n);
    CUDA_CHECK(cudaGetLastError());
  }

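  /**
   * For every pair (i, j) of the m constraints, reduce a per-variable
   * contribution built from hijx and Ljjxinv over the n design variables
   * and store the result in entry (i, j) of the m x m matrix Hess.
   */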
  void cuda_Hess(void* Hess, void* hijx, void* Ljjxinv, int* n, int* m) {
    const dim3 nthrds(1024, 1, 1);
    const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
    const int nb = ((*n) + 1024 - 1) / 1024;
    const cudaStream_t stream = (cudaStream_t) glb_cmd_queue; /* assumed */

    if (nb > mma_red_s) {
      mma_red_s = nb;
      if (mma_bufred != NULL) {
        CUDA_CHECK(cudaFreeHost(mma_bufred));
        CUDA_CHECK(cudaFree(mma_bufred_d));
      }
      CUDA_CHECK(cudaMallocHost(&mma_bufred, nb * sizeof(real)));
      CUDA_CHECK(cudaMalloc(&mma_bufred_d, nb * sizeof(real)));
    }
    for (int i = 0; i < (*m); i++) {
      for (int j = 0; j < (*m); j++) {
        hess_kernel<real><<<nblcks, nthrds, 0, stream>>>  /* kernel name assumed */
          ((real*)hijx, (real*)Ljjxinv, mma_bufred_d, (*n), (*m), i, j);
        CUDA_CHECK(cudaGetLastError());
        mmareduce_kernel<real><<<1, 1024, 0, stream>>>(mma_bufred_d, nb);
        CUDA_CHECK(cudaGetLastError());
        mma_copy_kernel<<<1, 1, 0, stream>>>((real*)Hess, mma_bufred_d, 1,
                                             i + j * (*m));
        CUDA_CHECK(cudaGetLastError());
      }
    }
  }

  void mma_Ljjxinv_cuda(void* Ljjxinv, void* pjlambda, void* qjlambda, void* x,
                        void* low, void* upp, void* alpha, void* beta, int* n) {
    const dim3 nthrds(1024, 1, 1);
    const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
    mma_Ljjxinv_kernel<real><<<nblcks, nthrds, 0, (cudaStream_t) glb_cmd_queue>>>  /* kernel name assumed */
      ((real*)Ljjxinv, (real*)pjlambda, (real*)qjlambda, (real*)x, (real*)low,
       (real*)upp, (real*)alpha, (real*)beta, *n);
    CUDA_CHECK(cudaGetLastError());
  }

  void mma_dipsolvesub1_cuda(void* x, void* pjlambda, void* qjlambda,
                             void* low, void* upp, void* alpha, void* beta, int* n) {
    const dim3 nthrds(1024, 1, 1);
    const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
    mma_dipsolvesub1_kernel<real><<<nblcks, nthrds, 0, (cudaStream_t) glb_cmd_queue>>>  /* kernel name assumed */
      ((real*)x, (real*)pjlambda, (real*)qjlambda, (real*)low, (real*)upp,
       (real*)alpha, (real*)beta, *n);
    CUDA_CHECK(cudaGetLastError());
  }

  void mattrans_v_mul_cuda(void* output, void* pij, void* lambda,
                           int* m, int* n) {
    const dim3 nthrds(1024, 1, 1);
    const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
    mattrans_v_mul_kernel<real><<<nblcks, nthrds, 0, (cudaStream_t) glb_cmd_queue>>>  /* kernel name assumed */
      ((real*)output, (real*)pij, (real*)lambda, *m, *n);
    CUDA_CHECK(cudaGetLastError());
  }

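  /**
   * Evaluate the terms bi of the MMA subproblem: an intermediate m x n
   * array is formed from x, the asymptotes (low, upp) and the coefficients
   * (pij, qij), then reduced over the n design variables for each of the
   * m constraints.
   */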
  void mma_gensub4_cuda(const void* x, const void* low, const void* upp,
                        const void* pij, const void* qij,
                        const int* n, const int* m, void* bi) {

    const int N = *n;
    const int M = *m;

    const dim3 nthrds(1024, 1, 1);
    const dim3 nblcks((N + 1023) / 1024, 1, 1);
    const int nb = (N + 1023) / 1024;
    const cudaStream_t stream = (cudaStream_t) glb_cmd_queue; /* assumed */

    if (nb > mma_red_s) {
      mma_red_s = nb;

      if (mma_bufred != NULL) {
        CUDA_CHECK(cudaFreeHost(mma_bufred));
        CUDA_CHECK(cudaFree(mma_bufred_d));
      }

      CUDA_CHECK(cudaMallocHost(&mma_bufred, nb * sizeof(real)));
      CUDA_CHECK(cudaMalloc(&mma_bufred_d, nb * sizeof(real)));
    }

    real* temp;
    real* bi_d = static_cast<real*>(bi);
    CUDA_CHECK(cudaMalloc(&temp, M * N * sizeof(real)));

    mma_gensub4_kernel<real><<<nblcks, nthrds, 0, stream>>>(  /* kernel name assumed */
      static_cast<const real*>(x),
      static_cast<const real*>(low),
      static_cast<const real*>(upp),
      static_cast<const real*>(pij),
      static_cast<const real*>(qij),
      temp, N, M);

    for (int i = 0; i < M; ++i) {
      mmasum_kernel<real><<<nblcks, nthrds, 0, stream>>>(  /* kernel name assumed */
        temp, mma_bufred_d, N, M, i);
      CUDA_CHECK(cudaGetLastError());

      mmareduce_kernel<real><<<1, 1024, 0, stream>>>(mma_bufred_d, nb);
      CUDA_CHECK(cudaGetLastError());

      CUDA_CHECK(cudaMemcpyAsync(
        bi_d + i, mma_bufred_d, sizeof(real),
        cudaMemcpyDeviceToDevice, stream));
    }

    CUDA_CHECK(cudaFree(temp));
  }

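  /**
   * Build the MMA approximation coefficients p0j, q0j, pij, qij and the
   * move limits alpha, beta at the current design point from the gradients
   * df0dx and dfdx, the asymptotes low/upp and the bounds xmin/xmax.
   */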
  void mma_gensub3_cuda(void* x, void* df0dx, void* dfdx, void* low,
      void* upp, void* xmin, void* xmax, void* alpha, void* beta,
      void* p0j, void* q0j, void* pij, void* qij, int* n, int* m) {
    const dim3 nthrds(1024, 1, 1);
    const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);

    mma_gensub3_kernel<real><<<nblcks, nthrds, 0,
        (cudaStream_t) glb_cmd_queue>>>(  /* kernel name assumed */
      (real*)x, (real*)df0dx, (real*)dfdx, (real*)low,
      (real*)upp, (real*)xmin, (real*)xmax, (real*)alpha,
      (real*)beta, (real*)p0j, (real*)q0j, (real*)pij,
      (real*)qij, *n, *m);

    CUDA_CHECK(cudaGetLastError());
  }

  void mma_gensub2_cuda(void* low, void* upp, void* x, void* xold1,
      void* xold2, void* xdiff, real* asydecr, real* asyincr, int* n) {
    const dim3 nthrds(1024, 1, 1);
    const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);

    mma_gensub2_kernel<real><<<nblcks, nthrds, 0,
        (cudaStream_t) glb_cmd_queue>>>(  /* kernel name assumed */
      (real*)low, (real*)upp, (real*)x, (real*)xold1,
      (real*)xold2, (real*)xdiff, *asydecr, *asyincr, *n);

    CUDA_CHECK(cudaGetLastError());
  }

  void mma_gensub1_cuda(void* low, void* upp, void* x, void* xmin, void* xmax,
      real* asyinit, int* n) {
    const dim3 nthrds(1024, 1, 1);
    const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
    mma_gensub1_kernel<real><<<nblcks, nthrds, 0, (cudaStream_t) glb_cmd_queue>>>  /* kernel name assumed */
      ((real*)low, (real*)upp, (real*)x, (real*)xmin, (real*)xmax,
       *asyinit, *n);
    CUDA_CHECK(cudaGetLastError());
  }

  void cuda_mma_max(void* xsi, void* x, void* alpha, int* n) {
    const dim3 nthrds(1024, 1, 1);
    const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);

    mma_max_kernel<real><<<nblcks, nthrds, 0, (cudaStream_t) glb_cmd_queue>>>  /* kernel name assumed */
      ((real*)xsi, (real*)x, (real*)alpha, *n);
    CUDA_CHECK(cudaGetLastError());
  }

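  /**
   * Recompute the relaxed constraint residuals: a per-variable contribution
   * is evaluated from x, the asymptotes (xlow, xupp) and the coefficients
   * (pij, qij), then summed over the n design variables for each of the
   * m constraints into relambda.
   */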
  void cuda_relambda(void* relambda, void* x, void* xupp, void* xlow,
      void* pij, void* qij, int* n, int* m) {
    const dim3 nthrds(1024, 1, 1);
    const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
    const int nb = ((*n) + 1024 - 1) / 1024;
    const cudaStream_t stream = (cudaStream_t) glb_cmd_queue; /* assumed */

    if (nb > mma_red_s) {
      mma_red_s = nb;
      if (mma_bufred != NULL) {
        CUDA_CHECK(cudaFreeHost(mma_bufred));
        CUDA_CHECK(cudaFree(mma_bufred_d));
      }
      CUDA_CHECK(cudaMallocHost(&mma_bufred, nb * sizeof(real)));
      CUDA_CHECK(cudaMalloc(&mma_bufred_d, nb * sizeof(real)));
    }
    real* temp;
    cudaMalloc(&temp, (*n) * (*m) * sizeof(real));
    relambda_kernel<real><<<nblcks, nthrds, 0, stream>>>(temp, (real*)x,
      (real*)xupp, (real*)xlow, (real*)pij, (real*)qij, *n, *m);
    for (int i = 0; i < (*m); i++) {
      mmasum_kernel<real><<<nblcks, nthrds, 0, stream>>>  /* kernel name assumed */
        (temp, mma_bufred_d, (*n), (*m), i);
      CUDA_CHECK(cudaGetLastError());
      mmareduce_kernel<real><<<1, 1024, 0, stream>>>(mma_bufred_d, nb);
      CUDA_CHECK(cudaGetLastError());
      mma_copy_kernel<<<1, 1, 0, stream>>>((real*)relambda, mma_bufred_d, 1, i);
      CUDA_CHECK(cudaGetLastError());
    }
    cudaFree(temp);
  }

  void cuda_sub2cons2(void* a, void* b, void* c, void* d, real* e, int* n) {

    const dim3 nthrds(1024, 1, 1);
    const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);

    sub2cons2_kernel<real><<<nblcks, nthrds, 0, (cudaStream_t) glb_cmd_queue>>>(  /* kernel name assumed */
      (real*)a, (real*)b, (real*)c, (real*)d, *e, *n);
    CUDA_CHECK(cudaGetLastError());
  }

  // Max absolute value of the input
  real cuda_maxval(void* a, int* n) {

    const dim3 nthrds(1024, 1, 1);
    const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
    const int nb = ((*n) + 1024 - 1) / 1024;
    const cudaStream_t stream = (cudaStream_t) glb_cmd_queue; /* assumed */

    if (nb > mma_red_s) {
      mma_red_s = nb;
      if (mma_bufred != NULL) {
        CUDA_CHECK(cudaFreeHost(mma_bufred));
        CUDA_CHECK(cudaFree(mma_bufred_d));
      }
      CUDA_CHECK(cudaMallocHost(&mma_bufred, nb * sizeof(real)));
      CUDA_CHECK(cudaMalloc(&mma_bufred_d, nb * sizeof(real)));
    }

    maxval_kernel<real><<<nblcks, nthrds, 0, stream>>>(  /* kernel name assumed */
      (real*)a, mma_bufred_d, (*n));
    CUDA_CHECK(cudaGetLastError());

    max_reduce_kernel<real><<<1, 1024, 0, stream>>>(
      mma_bufred_d, nb);
    CUDA_CHECK(cudaGetLastError());

    CUDA_CHECK(cudaMemcpyAsync(mma_bufred, mma_bufred_d, sizeof(real),
                               cudaMemcpyDeviceToHost, stream));
    cudaStreamSynchronize(stream);

    return mma_bufred[0];
  }

  void cuda_delx(void* delx, void* x, void* xlow, void* xupp, void* pij,
      void* qij, void* p0j, void* q0j, void* alpha, void* beta, void* lambda,
      real* epsi, int* n, int* m) {

    const dim3 nthrds(1024, 1, 1);
    const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);

    delx_kernel<real><<<nblcks, nthrds, 0, (cudaStream_t) glb_cmd_queue>>>(  /* kernel name assumed */
      (real*)delx, (real*)x, (real*)xlow, (real*)xupp, (real*)pij,
      (real*)qij, (real*)p0j, (real*)q0j, (real*)alpha, (real*)beta,
      (real*)lambda, *epsi, *n, *m);
    CUDA_CHECK(cudaGetLastError());
  }

  void cuda_GG(void* GG, void* x, void* xlow, void* xupp,
      void* pij, void* qij, int* n, int* m) {
    const dim3 nthrds(1024, 1, 1);
    const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
    GG_kernel<real><<<nblcks, nthrds, 0, (cudaStream_t) glb_cmd_queue>>>  /* kernel name assumed */
      ((real*)GG, (real*)x, (real*)xlow, (real*)xupp,
       (real*)pij, (real*)qij, *n, *m);
    CUDA_CHECK(cudaGetLastError());
  }

  void cuda_diagx(void* diagx, void* x, void* xsi, void* xlow, void* xupp,
      void* p0j, void* q0j, void* pij, void* qij, void* alpha, void* beta,
      void* eta, void* lambda, int* n, int* m) {
    const dim3 nthrds(1024, 1, 1);
    const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
    diagx_kernel<real><<<nblcks, nthrds, 0, (cudaStream_t) glb_cmd_queue>>>  /* kernel name assumed */
      ((real*)diagx, (real*)x, (real*)xsi, (real*)xlow,
       (real*)xupp, (real*)p0j, (real*)q0j, (real*)pij, (real*)qij,
       (real*)alpha, (real*)beta, (real*)eta, (real*)lambda, *n, *m);
    CUDA_CHECK(cudaGetLastError());
  }

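  /**
   * For each of the m constraints, reduce a per-variable contribution
   * built from GG, delx and diagx over the n design variables and store
   * the result in bb(i).
   */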
  void cuda_bb(void* bb, void* GG, void* delx, void* diagx, int* n, int* m) {
    const dim3 nthrds(1024, 1, 1);
    const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
    const int nb = ((*n) + 1024 - 1) / 1024;
    const cudaStream_t stream = (cudaStream_t) glb_cmd_queue; /* assumed */
    if (nb > mma_red_s) {
      mma_red_s = nb;
      if (mma_bufred != NULL) {
        CUDA_CHECK(cudaFreeHost(mma_bufred));
        CUDA_CHECK(cudaFree(mma_bufred_d));
      }
      CUDA_CHECK(cudaMallocHost(&mma_bufred, nb * sizeof(real)));
      CUDA_CHECK(cudaMalloc(&mma_bufred_d, nb * sizeof(real)));
    }
    for (int i = 0; i < (*m); i++) {
      bb_kernel<real><<<nblcks, nthrds, 0, stream>>>  /* kernel name assumed */
        ((real*)GG, (real*)delx, (real*)diagx, mma_bufred_d, (*n), (*m), i);
      CUDA_CHECK(cudaGetLastError());
      mmareduce_kernel<real><<<1, 1024, 0, stream>>>(mma_bufred_d, nb);
      CUDA_CHECK(cudaGetLastError());
      mma_copy_kernel<<<1, 1, 0, stream>>>((real*)bb, mma_bufred_d, 1, i);
      CUDA_CHECK(cudaGetLastError());
    }
  }

  void cuda_AA(void* AA, void* GG, void* diagx, int* n, int* m) {
    const dim3 nthrds(1024, 1, 1);
    const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
    const int nb = ((*n) + 1024 - 1) / 1024;
    const cudaStream_t stream = (cudaStream_t) glb_cmd_queue; /* assumed */
    if (nb > mma_red_s) {
      mma_red_s = nb;
      if (mma_bufred != NULL) {
        CUDA_CHECK(cudaFreeHost(mma_bufred));
        CUDA_CHECK(cudaFree(mma_bufred_d));
      }
      CUDA_CHECK(cudaMallocHost(&mma_bufred, nb * sizeof(real)));
      CUDA_CHECK(cudaMalloc(&mma_bufred_d, nb * sizeof(real)));
    }
    for (int i = 0; i < (*m); i++) {
      for (int j = 0; j < (*m); j++) {
        AA_kernel<real><<<nblcks, nthrds, 0, stream>>>  /* kernel name assumed */
          ((real*)GG, (real*)diagx, mma_bufred_d, (*n), (*m), i, j);
        CUDA_CHECK(cudaGetLastError());
        mmareduce_kernel<real><<<1, 1024, 0, stream>>>(mma_bufred_d, nb);
        CUDA_CHECK(cudaGetLastError());
        mma_copy_kernel<<<1, 1, 0, stream>>>((real*)AA, mma_bufred_d, 1,
                                             i + j * (*m + 1));
        CUDA_CHECK(cudaGetLastError());
      }
    }
  }

  void cuda_dx(void* dx, void* delx, void* diagx, void* GG, void* dlambda,
      int* n, int* m) {
    const dim3 nthrds(1024, 1, 1);
    const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
    dx_kernel<real><<<nblcks, nthrds, 0, (cudaStream_t) glb_cmd_queue>>>  /* kernel name assumed */
      ((real*)dx, (real*)delx, (real*)diagx, (real*)GG, (real*)dlambda,
       *n, *m);
    CUDA_CHECK(cudaGetLastError());
  }

  void cuda_dxsi(void* dxsi, void* xsi, void* dx, void* x,
      void* alpha, real* epsi, int* n) {
    const dim3 nthrds(1024, 1, 1);
    const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
    dxsi_kernel<real><<<nblcks, nthrds, 0, (cudaStream_t) glb_cmd_queue>>>  /* kernel name assumed */
      ((real*)dxsi, (real*)xsi, (real*)dx, (real*)x,
       (real*)alpha, *epsi, *n);
    CUDA_CHECK(cudaGetLastError());
  }

  void cuda_deta(void* deta, void* eta, void* dx, void* x,
      void* beta, real* epsi, int* n) {
    const dim3 nthrds(1024, 1, 1);
    const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
    deta_kernel<real><<<nblcks, nthrds, 0, (cudaStream_t) glb_cmd_queue>>>  /* kernel name assumed */
      ((real*)deta, (real*)eta, (real*)dx, (real*)x,
       (real*)beta, *epsi, *n);
    CUDA_CHECK(cudaGetLastError());
  }

  void cuda_rex(void* rex, void* x, void* xlow, void* xupp, void* pij,
      void* p0j, void* qij, void* q0j, void* lambda, void* xsi, void* eta,
      int* n, int* m) {
    const dim3 nthrds(1024, 1, 1);
    const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
    rex_kernel<real><<<nblcks, nthrds, 0, (cudaStream_t) glb_cmd_queue>>>(  /* kernel name assumed */
      (real*)rex, (real*)x, (real*)xlow,
      (real*)xupp, (real*)pij, (real*)p0j, (real*)qij, (real*)q0j,
      (real*)lambda, (real*)xsi, (real*)eta, *n, *m);
    CUDA_CHECK(cudaGetLastError());
  }

  void cuda_rey(void* rey, void* c, void* d, void* y, void* lambda, void* mu,
      int* n) {
    const dim3 nthrds(1024, 1, 1);
    const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
    rey_kernel<real><<<nblcks, nthrds, 0, (cudaStream_t) glb_cmd_queue>>>(  /* kernel name assumed */
      (real*)rey, (real*)c,
      (real*)d, (real*)y, (real*)lambda, (real*)mu, *n);
    CUDA_CHECK(cudaGetLastError());
  }

  void cuda_sub2cons(void* a, void* b, void* c, real* d, int* n) {
    const dim3 nthrds(1024, 1, 1);
    const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
    sub2cons_kernel<real><<<nblcks, nthrds, 0, (cudaStream_t) glb_cmd_queue>>>  /* kernel name assumed */
      ((real*)a, (real*)b, (real*)c, *d, *n);
    CUDA_CHECK(cudaGetLastError());
  }

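  /**
   * Norm-type reduction of a device vector of length n: per-block partial
   * results are combined on the device and the scalar is returned through
   * the pinned host buffer mma_bufred.
   */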
  real cuda_norm(void* a, int* n) {
    const dim3 nthrds(1024, 1, 1);
    const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
    const int nb = ((*n) + 1024 - 1) / 1024;
    const cudaStream_t stream = (cudaStream_t) glb_cmd_queue; /* assumed */
    if (nb > mma_red_s) {
      mma_red_s = nb;
      if (mma_bufred != NULL) {
        CUDA_CHECK(cudaFreeHost(mma_bufred));
        CUDA_CHECK(cudaFree(mma_bufred_d));
      }
      CUDA_CHECK(cudaMallocHost(&mma_bufred, nb * sizeof(real)));
      CUDA_CHECK(cudaMalloc(&mma_bufred_d, nb * sizeof(real)));
    }
    norm_kernel<real><<<nblcks, nthrds, 0, stream>>>  /* kernel name assumed */
      ((real*)a, mma_bufred_d, (*n));
    CUDA_CHECK(cudaGetLastError());
    mmareduce_kernel<real><<<1, 1024, 0, stream>>>(mma_bufred_d, nb);
    CUDA_CHECK(cudaGetLastError());
    CUDA_CHECK(cudaMemcpyAsync(mma_bufred, mma_bufred_d, sizeof(real),
                               cudaMemcpyDeviceToHost, stream));
    cudaStreamSynchronize(stream);
    return mma_bufred[0];
  }

  void cuda_dely(void* dely, void* c, void* d, void* y, void* lambda,
      real* epsi, int* n) {
    const dim3 nthrds(1024, 1, 1);
    const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
    dely_kernel<real><<<nblcks, nthrds, 0, (cudaStream_t) glb_cmd_queue>>>  /* kernel name assumed */
      ((real*)dely, (real*)c, (real*)d, (real*)y, (real*)lambda, *epsi, *n);
    CUDA_CHECK(cudaGetLastError());
  }

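  /**
   * Device reduction returning the maximum of a per-element expression of
   * the vectors a and b and the scalar cons; cuda_maxval3 below is the
   * three-vector variant.
   */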
  real cuda_maxval2(void* a, void* b, real* cons, int* n) {
    const dim3 nthrds(1024, 1, 1);
    const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
    const int nb = ((*n) + 1024 - 1) / 1024;
    const cudaStream_t stream = (cudaStream_t) glb_cmd_queue; /* assumed */
    if (nb > mma_red_s) {
      mma_red_s = nb;
      if (mma_bufred != NULL) {
        CUDA_CHECK(cudaFreeHost(mma_bufred));
        CUDA_CHECK(cudaFree(mma_bufred_d));
      }
      CUDA_CHECK(cudaMallocHost(&mma_bufred, nb * sizeof(real)));
      CUDA_CHECK(cudaMalloc(&mma_bufred_d, nb * sizeof(real)));
    }
    maxval2_kernel<real><<<nblcks, nthrds, 0, stream>>>  /* kernel name assumed */
      ((real*)a, (real*)b, mma_bufred_d, *cons, *n);
    CUDA_CHECK(cudaGetLastError());
    max_reduce_kernel<real><<<1, 1024, 0, stream>>>(mma_bufred_d, nb);
    CUDA_CHECK(cudaGetLastError());
    CUDA_CHECK(cudaMemcpyAsync(mma_bufred, mma_bufred_d, sizeof(real),
                               cudaMemcpyDeviceToHost, stream));
    cudaStreamSynchronize(stream);
    return mma_bufred[0];
  }

  real cuda_maxval3(void* a, void* b, void* c, real* cons, int* n) {
    const dim3 nthrds(1024, 1, 1);
    const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
    const int nb = ((*n) + 1024 - 1) / 1024;
    const cudaStream_t stream = (cudaStream_t) glb_cmd_queue; /* assumed */
    if (nb > mma_red_s) {
      mma_red_s = nb;
      if (mma_bufred != NULL) {
        CUDA_CHECK(cudaFreeHost(mma_bufred));
        CUDA_CHECK(cudaFree(mma_bufred_d));
      }
      CUDA_CHECK(cudaMallocHost(&mma_bufred, nb * sizeof(real)));
      CUDA_CHECK(cudaMalloc(&mma_bufred_d, nb * sizeof(real)));
    }
    maxval3_kernel<real><<<nblcks, nthrds, 0, stream>>>  /* kernel name assumed */
      ((real*)a, (real*)b, (real*)c, mma_bufred_d, *cons, *n);
    CUDA_CHECK(cudaGetLastError());
    max_reduce_kernel<real><<<1, 1024, 0, stream>>>(mma_bufred_d, nb);
    CUDA_CHECK(cudaGetLastError());
    CUDA_CHECK(cudaMemcpyAsync(mma_bufred, mma_bufred_d, sizeof(real),
                               cudaMemcpyDeviceToHost, stream));
    cudaStreamSynchronize(stream);
    return mma_bufred[0];
  }

  void cuda_kkt_rex(void* rex, void* df0dx, void* dfdx, void* xsi,
      void* eta, void* lambda, int* n, int* m) {
    const dim3 nthrds(1024, 1, 1);
    const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
    kkt_rex_kernel<real><<<nblcks, nthrds, 0, (cudaStream_t) glb_cmd_queue>>>  /* kernel name assumed */
      ((real*)rex, (real*)df0dx, (real*)dfdx, (real*)xsi,
       (real*)eta, (real*)lambda, *n, *m);
    CUDA_CHECK(cudaGetLastError());
  }

  void cuda_maxcons(void* a, real* b, real* c, void* d, int* n) {
    const dim3 nthrds(1024, 1, 1);
    const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
    maxcons_kernel<real><<<nblcks, nthrds, 0, (cudaStream_t) glb_cmd_queue>>>  /* kernel name assumed */
      ((real*)a, *b, *c, (real*)d, *n);
    CUDA_CHECK(cudaGetLastError());
  }

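  /**
   * Scalar reduction over two device vectors a and b of length n (a local
   * dot-product style operation); the per-block partial sums are combined
   * on the device and returned through the pinned host buffer mma_bufred.
   */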
  real cuda_lcsc2(void* a, void* b, int* n) {
    const dim3 nthrds(1024, 1, 1);
    const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
    const int nb = ((*n) + 1024 - 1) / 1024;
    const cudaStream_t stream = (cudaStream_t) glb_cmd_queue; /* assumed */
    if (nb > mma_red_s) {
      mma_red_s = nb;
      if (mma_bufred != NULL) {
        CUDA_CHECK(cudaFreeHost(mma_bufred));
        CUDA_CHECK(cudaFree(mma_bufred_d));
      }
      CUDA_CHECK(cudaMallocHost(&mma_bufred, nb * sizeof(real)));
      CUDA_CHECK(cudaMalloc(&mma_bufred_d, nb * sizeof(real)));
    }
    lcsc2_kernel<real><<<nblcks, nthrds, 0, stream>>>  /* kernel name assumed */
      ((real*)a, (real*)b, mma_bufred_d, (*n));
    CUDA_CHECK(cudaGetLastError());
    mmareduce_kernel<real><<<1, 1024, 0, stream>>>(mma_bufred_d, nb);
    CUDA_CHECK(cudaGetLastError());
    CUDA_CHECK(cudaMemcpyAsync(mma_bufred, mma_bufred_d, sizeof(real),
                               cudaMemcpyDeviceToHost, stream));
    cudaStreamSynchronize(stream);
    return mma_bufred[0];
  }

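  /**
   * In-place MPI sum of a device array of length n across ranks; compiled
   * to a no-op when device-aware MPI (HAVE_DEVICE_MPI) is not available.
   */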
  void cuda_mpisum(void* a, int* n) {
#ifdef HAVE_DEVICE_MPI
    real* temp = (real*)a;
    cudaStreamSynchronize((cudaStream_t) glb_cmd_queue);
    /* device-aware MPI reduction; call signature assumed */
    device_mpi_allreduce_inplace(temp, *n, sizeof(real), DEVICE_MPI_SUM);
#endif
  }

  void cuda_add2inv2(void* a, void* b, real* c, int* n) {
    const dim3 nthrds(1024, 1, 1);
    const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
    add2inv2_kernel<real><<<nblcks, nthrds, 0, (cudaStream_t) glb_cmd_queue>>>  /* kernel name assumed */
      ((real*)a, (real*)b, *c, *n);
    CUDA_CHECK(cudaGetLastError());
  }

  void cuda_max2(void* a, real* b, void* c, real* d, int* n) {
    const dim3 nthrds(1024, 1, 1);
    const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
    max2_kernel<real><<<nblcks, nthrds, 0, (cudaStream_t) glb_cmd_queue>>>  /* kernel name assumed */
      ((real*)a, *b, (real*)c, *d, *n);
    CUDA_CHECK(cudaGetLastError());
  }

  void cuda_updatebb(void* bb, void* dellambda, void* dely, void* d,
      void* mu, void* y, real* delz, int* m) {
    const dim3 nthrds(1024, 1, 1);
    const dim3 nblcks(((*m + 1) + 1024 - 1) / 1024, 1, 1);
    updatebb_kernel<real><<<nblcks, nthrds, 0, (cudaStream_t) glb_cmd_queue>>>  /* kernel name assumed */
      ((real*)bb, (real*)dellambda, (real*)dely, (real*)d,
       (real*)mu, (real*)y, *delz, *m);
    CUDA_CHECK(cudaGetLastError());
  }

  void cuda_updateAA(void* AA, void* globaltmp_mm, void* s, void* lambda,
      void* d, void* mu, void* y, void* a, real* zeta, real* z, int* m) {
    const dim3 nthrds(1024, 1, 1);
    const dim3 nblcks(((*m + 1) + 1024 - 1) / 1024, 1, 1);
    updateAA_kernel<real><<<nblcks, nthrds, 0, (cudaStream_t) glb_cmd_queue>>>  /* kernel name assumed */
      ((real*)AA, (real*)globaltmp_mm, (real*)s, (real*)lambda, (real*)d,
       (real*)mu, (real*)y, (real*)a, *zeta, *z, *m);
    CUDA_CHECK(cudaGetLastError());
  }

  void cuda_dy(void* dy, void* dely, void* dlambda, void* d, void* mu,
      void* y, int* n) {
    const dim3 nthrds(1024, 1, 1);
    const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
    dy_kernel<real><<<nblcks, nthrds, 0, (cudaStream_t) glb_cmd_queue>>>  /* kernel name assumed */
      ((real*)dy, (real*)dely, (real*)dlambda, (real*)d,
       (real*)mu, (real*)y, *n);
    CUDA_CHECK(cudaGetLastError());
  }

}/* extern "C" */