Neko-TOP
A portable framework for high-order spectral element flow topology optimization.
Loading...
Searching...
No Matches
mma.cu
Go to the documentation of this file.
1
37// System includes
38#include <stdio.h>
39#include <stdlib.h>
40
41// Device includes
42#include <cuda_runtime.h>
43#include <cusolverDn.h>
44
45// Neko includes
46#include <neko/device/device_config.h>
47#include <neko/device/cuda/check.h>
48#include <neko/math/bcknd/device/device_mpi_reduce.h>
49#include <neko/math/bcknd/device/device_mpi_op.h>
50
51// Local includes
52#include "mma_kernel.h"
53
54
55extern "C" {
56
// Shared state for the block-level reduction wrappers in this file:
// mma_red_s  - number of partial-result slots currently allocated,
// mma_bufred - pinned host buffer (cudaMallocHost) for reading back results,
// mma_bufred_d - matching device buffer for per-block partial reductions.
// Buffers are grown on demand and reused across calls; never freed at exit.
 57 int mma_red_s = 0;
 58 real* mma_bufred = NULL;
 59 real* mma_bufred_d = NULL;
60
// Update the M x M Hessian for the z-variable contribution, one thread per
// matrix entry (total = M*M elements, 1024 threads/block).
// NOTE(review): the kernel-launch statement itself (between the grid setup
// and the closing brace) is missing from this listing — verify against the
// original mma.cu.
 61 void mma_update_hessian_z_cuda(void* Hess, void* a, int* m) {
 62 const int M = *m;
 63 // This function is called ONLY if dot(lambda,a) > 0
 64
 65 const int total = M * M;
 66 const dim3 nthrds(1024, 1, 1);
 67 const dim3 nblcks((total + 1024 - 1) / 1024, 1, 1);
 68
 70 (real*)Hess, (real*)a, M);
 71
 73 }
74
// Assemble the AA matrix used by the MMA dual/interior-point solve from the
// slack (s), multiplier (lambda, mu), and scaling (d, y, a, zeta, z) data.
// One thread per constraint row (*m rows).
// NOTE(review): the kernel name on the launch line is missing from this
// listing — verify against the original mma.cu.
 75 void mma_prepare_aa_matrix_cuda(void* AA, void* s, void* lambda,
 76 void* d, void* mu, void* y,
 77 void* a, real* zeta, real* z, int* m) {
 78 const dim3 nthrds(1024, 1, 1);
 79 const dim3 nblcks(((*m) + 1024 - 1) / 1024, 1, 1);
 80
 81 // Launch kernel to prepare AA matrix
 83 (real*)AA, (real*)s, (real*)lambda, (real*)d,
 84 (real*)mu, (real*)y, (real*)a,
 85 *zeta, *z, *m);
 86
 88 }
89
// Prepare/stabilize the M x M Hessian: first update its diagonal from
// (y, mu, lambda), then apply a Levenberg-Marquardt-style stabilization.
// Small problems (M <= 1024) use a single-block device path; larger ones
// compute the trace on the host and apply lm_factor = max(-1e-4*trace/M, 1e-7).
// NOTE(review): several statements (kernel launches, the synchronize call,
// and the per-element device->host cudaMemcpy arguments) are missing from
// this listing — verify against the original mma.cu.
// NOTE(review): malloc() return is unchecked, and the diagonal is fetched
// with one small copy per element — acceptable only for modest M.
 90 void mma_prepare_hessian_cuda(void* Hess, void* y,
 91 void* mu, void* lambda, int* m) {
 92 const int M = *m;
 93 const dim3 nthrds(1024, 1, 1);
 94 const dim3 nblcks((M + 1024 - 1) / 1024, 1, 1);
 96
 97 // Update diagonal elements
 99 (real*)Hess, (real*)y, (real*)mu, (real*)lambda, M);
 101
 102 // Synchronize to ensure diagonal updates are complete
 104
 105 // Choose kernel based on problem size
 106 if (M <= 1024) {
 107 // Single-block version (fast for small m)
 108 const dim3 stab_nblcks(1, 1, 1);
 110 (real*)Hess, M);
 112 } else {
 113 // Multi-block version (for large m)
 114 // Compute trace on host (simple and reliable)
 115 real* h_Hess = (real*)malloc(M * sizeof(real));
 116
 117 // Extract diagonal elements
 118 for (int i = 0; i < M; i++) {
 120 (real*)Hess + i * M + i,
 121 sizeof(real),
 123 }
 125
 126 // Compute trace and LM factor
 127 real trace = 0.0;
 128 for (int i = 0; i < M; i++) {
 129 trace += h_Hess[i];
 130 }
 131 real lm_factor = fmax(-1.0e-4 * trace / M, 1.0e-7);
 132
 133 // Apply stabilization in parallel
 135 (real*)Hess, lm_factor, M);
 137
 138 free(h_Hess);
 139 }
 140 }
141
// Solve the n x n dense system A*x = b with a single-block custom GPU kernel,
// reporting success through *info (0 = GPU solved it, -1 = caller should use
// a CPU fallback; n <= 0 also yields -1 without launching anything).
// NOTE(review): the kernel-launch line and the declaration of `err`
// (presumably from a synchronizing call) are missing from this listing —
// verify against the original mma.cu.
 142 void cuda_custom_solver(void* A, void* b, int n, int* info) {
 144
 145 if (n <= 0) {
 146 *info = -1; // Use CPU fallback
 147 return;
 148 }
 149 const dim3 nthrds(1024, 1, 1);
 150 const dim3 nblcks(1, 1, 1);
 151
 153 (real*)A, (real*)b, n);
 154
 156 if (err == cudaSuccess) {
 157 *info = 0; // GPU solver succeeded
 158 } else {
 159 *info = -1; // GPU failed
 160 }
 161 }
162
// LU-factorize and solve the n x n system A*x = b (double precision) with
// cuSOLVER: getrf buffer query, getrf factorization, then getrs solve only if
// factorization succeeded. The factorization/solve status is returned in *jj
// (0 = success). A is overwritten by its LU factors; b by the solution.
// NOTE(review): the handle/status declarations, the error-handling branch
// after the buffer-size query, the info device->host copies, and the
// workspace/handle cleanup lines are missing from this listing — verify
// against the original mma.cu. In particular confirm `workspace` is freed
// and the cusolver handle destroyed; only ipiv/info frees are visible here.
// NOTE(review): cudaMalloc and cuSOLVER return codes are unchecked on the
// visible lines.
 163 void cuSOLVER_wrapper(void* A, void* b, int n, int* jj) {
 167
 168 int lwork;
 169 double* workspace;
 170 int* ipiv;
 171 int* info; // Device pointer for cuSOLVER info
 172 int host_info = 0; // Host variable to store the info
 173
 174 // Workspace query
 175 status = cusolverDnDgetrf_bufferSize(handle, n, n, (double*)A, n, &lwork);
 178 }
 179 cudaMalloc(&workspace, lwork * sizeof(double));
 180 cudaMalloc(&ipiv, n * sizeof(int));
 181 cudaMalloc(&info, sizeof(int));
 182
 183 // LU factorization and solve
 184 cusolverDnDgetrf(handle, n, n, (double*)A, n, workspace, ipiv, info);
 185
 186 // Copy info from device to host to check if factorization succeeded
 188
 189 if (host_info == 0) {
 190 // Only solve if factorization was successful
 191 cusolverDnDgetrs(handle, CUBLAS_OP_N, n, 1, (double*)A, n, ipiv, (double*)b, n, info);
 192 // Copy the final info value
 194 }
 195
 196
 197 // Return the actual info value through jj
 198 *jj = host_info;
 199
 200 // Cleanup
 202 cudaFree(ipiv);
 203 cudaFree(info);
 205 }
206
// Same contract as cuda_custom_solver: single-block custom GPU solve of
// A*x = b with *info = 0 on success, -1 to request the CPU fallback
// (n <= 0 short-circuits to -1). Kept as a separate entry point; the visible
// body is identical in structure to cuda_custom_solver.
// NOTE(review): the kernel-launch line and the declaration of `err` are
// missing from this listing — verify against the original mma.cu.
 207 void custom_solve_linear_system(void* A, void* b, int n, int* info) {
 209
 210 if (n <= 0) {
 211 *info = -1; // Use CPU fallback
 212 return;
 213 }
 214 const dim3 nthrds(1024, 1, 1);
 215 const dim3 nblcks(1, 1, 1);
 216
 218 (real*)A, (real*)b, n);
 219
 221 if (err == cudaSuccess) {
 222 *info = 0; // GPU solver succeeded
 223 } else {
 224 *info = -1; // GPU failed, use CPU fallback
 225 }
 226 }
227
// Fill Delta for the 1D beam test case from the total length, element length,
// and index offset; one thread per entry of the length-*n array.
// NOTE(review): the kernel name on the launch line is missing from this
// listing — verify against the original mma.cu.
 228 void delta_1dbeam_cuda(void* Delta, real* L_total, real* Le,
 229 int* offset, int* n) {
 230 const dim3 nthrds(1024, 1, 1);
 231 const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
 233 ((real*)Delta, *L_total, *Le, *offset, *n);
 235 }
236
// Assemble the m x m Hessian: for each (i, j) pair, launch a length-n
// partial reduction over hijx/Ljjxinv into mma_bufred_d, finish it with
// mmareduce_kernel, and copy the scalar into Hess at offset i + j*(*m).
// Grows the shared reduction buffers (mma_bufred / mma_bufred_d) on demand.
// NOTE(review): the name of the per-pair reduction kernel is missing from
// this listing — verify against the original mma.cu.
// NOTE(review): m*m sequential kernel triplets serialize on `stream`; fine
// for small m, costly otherwise.
 237 void cuda_Hess(void* Hess, void* hijx, void* Ljjxinv, int *n, int *m) {
 238 const dim3 nthrds(1024, 1, 1);
 239 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
 240 const int nb = ((*n) + 1024 - 1)/ 1024;
 243 if(nb > mma_red_s){
 244 mma_red_s = nb;
 245 if(mma_bufred != NULL){
 246 CUDA_CHECK(cudaFreeHost(mma_bufred));
 247 CUDA_CHECK(cudaFree(mma_bufred_d));
 248 }
 249 CUDA_CHECK(cudaMallocHost(&mma_bufred,nb*sizeof(real)));
 250 CUDA_CHECK(cudaMalloc(&mma_bufred_d, nb*sizeof(real)));
 251 }
 252 for (int i = 0; i < (*m); i++){
 253 for (int j=0; j<(*m);j++){
 255 ((real*)hijx,(real*)Ljjxinv, mma_bufred_d, (*n),(*m), i, j);
 257 mmareduce_kernel<real> <<<1, 1024, 0, stream>>> (mma_bufred_d, nb);
 259 mma_copy_kernel<<<1, 1, 0, stream>>>((real*)Hess, mma_bufred_d, 1,
 260 i+j*(*m));
 263 }
 264 }
 265 }
266
// Compute Ljjxinv (inverse diagonal of the L_jj term) element-wise from the
// p/q lambda sums, the design variables x, and the asymptote/bound arrays;
// one thread per design variable.
// NOTE(review): the kernel name on the launch line is missing from this
// listing — verify against the original mma.cu.
 267 void mma_Ljjxinv_cuda(void* Ljjxinv, void* pjlambda, void* qjlambda, void* x,
 268 void* low, void* upp, void* alpha, void* beta, int* n) {
 269 const dim3 nthrds(1024, 1, 1);
 270 const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
 272 ((real*)Ljjxinv, (real*)pjlambda, (real*)qjlambda, (real*)x, (real*)low,
 273 (real*)upp, (real*)alpha, (real*)beta, *n);
 275 }
276
// Dual interior-point subsolve step 1: update x element-wise from the p/q
// lambda sums, asymptotes (low/upp) and move limits (alpha/beta); one thread
// per design variable.
// NOTE(review): the kernel name on the launch line is missing from this
// listing — verify against the original mma.cu.
 277 void mma_dipsolvesub1_cuda(void* x, void* pjlambda, void* qjlambda,
 278 void* low, void* upp, void* alpha, void* beta, int* n) {
 279 const dim3 nthrds(1024, 1, 1);
 280 const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
 282 ((real*)x, (real*)pjlambda, (real*)qjlambda, (real*)low, (real*)upp,
 283 (real*)alpha, (real*)beta, *n);
 285 }
286
// Matrix-transpose times vector: output = pij^T * lambda, parallelized over
// the *n output entries (pij is m x n as indicated by the argument order).
// NOTE(review): the kernel name on the launch line is missing from this
// listing — verify against the original mma.cu.
 287 void mattrans_v_mul_cuda(void* output, void* pij, void* lambda,
 288 int* m, int* n) {
 289 const dim3 nthrds(1024, 1, 1);
 290 const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
 292 ((real*)output, (real*)pij, (real*)lambda, *m, *n);
 294 }
295
// MMA subproblem generation, step 4: evaluate the per-constraint terms into a
// temporary M x N matrix, then reduce each of the M rows to a scalar bi[i]
// (partial per-block reduction -> mmareduce_kernel -> async copy into bi).
// Grows the shared reduction buffers (mma_bufred / mma_bufred_d) on demand.
// NOTE(review): the names of the evaluation and per-row reduction kernels,
// and the cudaMemcpyAsync kind/stream arguments, are missing from this
// listing — verify against the original mma.cu.
// NOTE(review): `temp` is cudaMalloc'd and freed on every call; consider
// caching it like the reduction buffers.
 296 void mma_gensub4_cuda(const void* x, const void* low, const void* upp,
 297 const void* pij, const void* qij,
 298 const int* n, const int* m, void* bi) {
 299
 300 const int N = *n;
 301 const int M = *m;
 302
 303 const dim3 nthrds(1024, 1, 1);
 304 const dim3 nblcks((N + 1023) / 1024, 1, 1);
 305 const int nb = (N + 1023) / 1024;
 307
 308 if (nb > mma_red_s) {
 309 mma_red_s = nb;
 310
 311 if (mma_bufred != NULL) {
 312 CUDA_CHECK(cudaFreeHost(mma_bufred));
 313 CUDA_CHECK(cudaFree(mma_bufred_d));
 314 }
 315
 316 CUDA_CHECK(cudaMallocHost(&mma_bufred, nb * sizeof(real)));
 317 CUDA_CHECK(cudaMalloc(&mma_bufred_d, nb * sizeof(real)));
 318 }
 319
 320 real* temp;
 321 real* bi_d = static_cast<real*>(bi);
 322 CUDA_CHECK(cudaMalloc(&temp, M * N * sizeof(real)));
 323
 325 static_cast<const real*>(x),
 326 static_cast<const real*>(low),
 327 static_cast<const real*>(upp),
 328 static_cast<const real*>(pij),
 329 static_cast<const real*>(qij),
 330 temp, N, M);
 331
 332 for (int i = 0; i < M; ++i) {
 334 temp, mma_bufred_d, N, M, i);
 336
 337 mmareduce_kernel<real><<<1, 1024, 0, stream>>>(mma_bufred_d, nb);
 339
 341 bi_d + i, mma_bufred_d, sizeof(real),
 343
 345 }
 346
 347 CUDA_CHECK(cudaFree(temp));
 348 }
349
// MMA subproblem generation, step 3: build the move limits (alpha/beta) and
// the approximation coefficients (p0j, q0j, pij, qij) from x, the objective
// and constraint gradients, the asymptotes and the box bounds; one thread
// per design variable.
// NOTE(review): the kernel-launch line is missing from this listing — verify
// against the original mma.cu.
 350 void mma_gensub3_cuda(void* x, void* df0dx, void* dfdx, void* low,
 351 void* upp, void* xmin, void* xmax, void* alpha, void* beta,
 352 void* p0j, void* q0j, void* pij, void* qij, int* n, int* m) {
 353 const dim3 nthrds(1024, 1, 1);
 354 const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
 355
 358 (real*)x, (real*)df0dx, (real*)dfdx, (real*)low,
 359 (real*)upp, (real*)xmin, (real*)xmax, (real*)alpha,
 360 (real*)beta, (real*)p0j, (real*)q0j, (real*)pij,
 361 (real*)qij, *n, *m);
 362
 364 }
365
// MMA subproblem generation, step 2: adapt the moving asymptotes (low/upp)
// from the last two iterates (xold1/xold2) using the decrease/increase
// factors; one thread per design variable.
// NOTE(review): the kernel-launch line is missing from this listing — verify
// against the original mma.cu.
 366 void mma_gensub2_cuda(void* low, void* upp, void* x, void* xold1,
 367 void* xold2, void* xdiff, real* asydecr, real* asyincr, int* n) {
 368 const dim3 nthrds(1024, 1, 1);
 369 const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
 370
 373 (real*)low, (real*)upp, (real*)x, (real*)xold1,
 374 (real*)xold2, (real*)xdiff, *asydecr, *asyincr, *n);
 375
 377 }
378
// MMA subproblem generation, step 1: initialize the asymptotes (low/upp)
// around x using the asyinit factor and the box bounds; one thread per
// design variable.
// NOTE(review): the kernel name on the launch line is missing from this
// listing — verify against the original mma.cu.
 379 void mma_gensub1_cuda(void* low, void* upp, void* x, void* xmin, void* xmax,
 380 real* asyinit, int* n) {
 381 const dim3 nthrds(1024, 1, 1);
 382 const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
 384 ((real*)low, (real*)upp, (real*)x, (real*)xmin, (real*)xmax,
 385 *asyinit, *n);
 387 }
388
// Element-wise max-style update of xsi from x and alpha (one thread per
// element of the length-*n arrays).
// NOTE(review): the kernel name on the launch line is missing from this
// listing — verify the exact operation against the original mma.cu.
 389 void cuda_mma_max(void* xsi, void* x, void* alpha, int* n) {
 390 const dim3 nthrds(1024, 1, 1);
 391 const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
 392
 394 ((real*)xsi, (real*)x, (real*)alpha, *n);
 396 }
397
// Compute the lambda residual: evaluate per-element terms into a temporary
// (*n x *m) matrix with relambda_kernel, then reduce each of the *m columns
// into relambda[i] via the shared reduction buffers.
// NOTE(review): the per-column reduction kernel's name is missing from this
// listing — verify against the original mma.cu.
// NOTE(review): unlike the surrounding functions, cudaMalloc/cudaFree of
// `temp` are not wrapped in CUDA_CHECK here — inconsistent error handling.
 398 void cuda_relambda(void* relambda, void* x, void* xupp, void* xlow,
 399 void* pij, void* qij, int* n, int* m) {
 400 const dim3 nthrds(1024, 1, 1);
 401 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
 402 const int nb = ((*n) + 1024 - 1)/ 1024;
 404
 405 if ( nb > mma_red_s){
 406 mma_red_s = nb;
 407 if (mma_bufred != NULL) {
 408 CUDA_CHECK(cudaFreeHost(mma_bufred));
 409 CUDA_CHECK(cudaFree(mma_bufred_d));
 410 }
 411 CUDA_CHECK(cudaMallocHost(&mma_bufred,nb*sizeof(real)));
 412 CUDA_CHECK(cudaMalloc(&mma_bufred_d, nb*sizeof(real)));
 413 }
 414 real* temp;
 415 cudaMalloc(&temp, (*n) * (*m) * sizeof(real));
 416 relambda_kernel<real> <<<nblcks, nthrds, 0, stream >>> (temp, (real*)x,
 417 (real*)xupp, (real*)xlow, (real*)pij, (real*)qij, *n, *m);
 418 for (int i = 0; i < (*m); i++) {
 420 (temp, mma_bufred_d, (*n),(*m), i);
 422 mmareduce_kernel<real> <<<1, 1024, 0, stream >>>
 423 (mma_bufred_d, nb);
 425 mma_copy_kernel<<<1, 1, 0, stream>>>((real*)relambda, mma_bufred_d, 1, i);
 428 }
 429 cudaFree(temp);
 430 }
431
// Element-wise constrained-subtraction helper over length-*n arrays a..d
// with scalar e; one thread per element.
// NOTE(review): the kernel name on the launch line is missing from this
// listing — verify the exact operation against the original mma.cu.
 432 void cuda_sub2cons2(void* a, void* b, void* c, void* d, real* e, int* n) {
 433
 434 const dim3 nthrds(1024, 1, 1);
 435 const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
 436
 438 (real*)a, (real*)b, (real*)c, (real*)d, *e, *n);
 440 }
441
 442 //max abs values of input
// Reduce the length-*n device array `a` to its maximum (per the comment
// above, maximum absolute value) and return it on the host. Uses the shared
// mma_bufred/mma_bufred_d buffers: per-block partial maxima, then a final
// single-block max_reduce_kernel, then an async copy to pinned host memory.
// NOTE(review): the first (per-block) reduction kernel name and the
// cudaMemcpyAsync kind/stream arguments are missing from this listing, and
// no stream synchronization is visible before mma_bufred[0] is read —
// verify the original mma.cu synchronizes before returning.
 443 real cuda_maxval(void* a, int* n) {
 444
 445 const dim3 nthrds(1024, 1, 1);
 446 const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
 447 const int nb = ((*n) + 1024 - 1) / 1024;
 449
 450 if (nb > mma_red_s) {
 451 mma_red_s = nb;
 452 if (mma_bufred != NULL) {
 453 CUDA_CHECK(cudaFreeHost(mma_bufred));
 454 CUDA_CHECK(cudaFree(mma_bufred_d));
 455 }
 456 CUDA_CHECK(cudaMallocHost(&mma_bufred, nb * sizeof(real)));
 457 CUDA_CHECK(cudaMalloc(&mma_bufred_d, nb * sizeof(real)));
 458 }
 459
 461 (real*)a, mma_bufred_d, (*n));
 463
 464 max_reduce_kernel<real><<<1, 1024, 0, stream>>>(
 465 mma_bufred_d, nb);
 467
 468 CUDA_CHECK(cudaMemcpyAsync(mma_bufred, mma_bufred_d, sizeof(real),
 471
 472 return mma_bufred[0];
 473 }
474
// Compute the delx residual/step term for the interior-point Newton system
// from x, the asymptotes, the p/q coefficients, move limits, lambda and the
// barrier parameter epsi; one thread per design variable.
// NOTE(review): the kernel-launch line is missing from this listing — verify
// against the original mma.cu.
 475 void cuda_delx(void* delx, void* x, void* xlow, void* xupp, void* pij,
 476 void* qij, void* p0j, void* q0j, void* alpha, void* beta, void* lambda,
 477 real* epsi, int* n, int* m) {
 478
 479 const dim3 nthrds(1024, 1, 1);
 480 const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
 481
 483 (real*)delx, (real*)x, (real*)xlow, (real*)xupp, (real*)pij,
 484 (real*)qij, (real*)p0j, (real*)q0j, (real*)alpha, (real*)beta,
 485 (real*)lambda, *epsi, *n, *m);
 487 }
488
// Build the GG constraint-gradient matrix of the Newton system from x, the
// asymptotes and the pij/qij coefficients; one thread per design variable.
// NOTE(review): the kernel name on the launch line is missing from this
// listing — verify against the original mma.cu.
 489 void cuda_GG(void* GG, void* x, void* xlow, void* xupp,
 490 void* pij, void* qij, int* n, int* m) {
 491 const dim3 nthrds(1024, 1, 1);
 492 const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
 494 ((real*)GG, (real*)x, (real*)xlow,(real*) xupp,
 495 (real*)pij, (real*) qij, *n,*m);
 497 }
498
// Assemble the diagonal term diagx of the Newton system from x, the bound
// multipliers (xsi, eta), the asymptotes, the p/q coefficients, the move
// limits and lambda; one thread per design variable.
// NOTE(review): the kernel name on the launch line is missing from this
// listing — verify against the original mma.cu.
 499 void cuda_diagx(void* diagx, void* x, void* xsi,void* xlow, void* xupp,
 500 void* p0j, void* q0j, void* pij, void* qij, void* alpha, void* beta,
 501 void* eta, void* lambda, int *n, int *m) {
 502 const dim3 nthrds(1024, 1, 1);
 503 const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
 505 ((real*)diagx, (real*)x, (real*)xsi,(real*)xlow,
 506 (real*) xupp,(real*)p0j, (real*) q0j, (real*)pij, (real*) qij,
 507 (real*)alpha, (real*) beta, (real*)eta, (real*) lambda, *n,*m);
 509 }
510
// Build the right-hand-side vector bb: for each constraint i, reduce a
// length-*n expression of GG, delx and diagx into bb[i] (per-block partial
// reduction -> mmareduce_kernel -> mma_copy_kernel). Grows the shared
// reduction buffers on demand.
// NOTE(review): the per-constraint reduction kernel's name is missing from
// this listing — verify against the original mma.cu.
 511 void cuda_bb(void* bb, void* GG, void* delx,void* diagx, int *n, int *m) {
 512 const dim3 nthrds(1024, 1, 1);
 513 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
 514 const int nb = ((*n) + 1024 - 1)/ 1024;
 517 if(nb > mma_red_s){
 518 mma_red_s = nb;
 519 if(mma_bufred != NULL){
 520 CUDA_CHECK(cudaFreeHost(mma_bufred));
 521 CUDA_CHECK(cudaFree(mma_bufred_d));
 522 }
 523 CUDA_CHECK(cudaMallocHost(&mma_bufred,nb*sizeof(real)));
 524 CUDA_CHECK(cudaMalloc(&mma_bufred_d, nb*sizeof(real)));
 525 }
 526 for (int i = 0; i < (*m); i++) {
 528 ((real*)GG,(real*)delx,(real*)diagx, mma_bufred_d, (*n),(*m), i);
 530 mmareduce_kernel<real> <<<1, 1024, 0, stream>>> (mma_bufred_d, nb);
 532 mma_copy_kernel<<<1, 1, 0, stream>>>((real*)bb, mma_bufred_d, 1, i);
 535 }
 536 }
537
// Build the (m+1) x (m+1) system matrix AA: for each (i, j), reduce a
// length-*n expression of GG and diagx into AA at offset i + j*(*m+1)
// (note the m+1 leading dimension, vs i + j*(*m) in cuda_Hess). Grows the
// shared reduction buffers on demand.
// NOTE(review): the per-pair reduction kernel's name is missing from this
// listing — verify against the original mma.cu.
 538 void cuda_AA(void* AA, void* GG, void* diagx, int *n, int *m) {
 539 const dim3 nthrds(1024, 1, 1);
 540 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
 541 const int nb = ((*n) + 1024 - 1)/ 1024;
 544 if(nb > mma_red_s){
 545 mma_red_s = nb;
 546 if(mma_bufred != NULL){
 547 CUDA_CHECK(cudaFreeHost(mma_bufred));
 548 CUDA_CHECK(cudaFree(mma_bufred_d));
 549 }
 550 CUDA_CHECK(cudaMallocHost(&mma_bufred,nb*sizeof(real)));
 551 CUDA_CHECK(cudaMalloc(&mma_bufred_d, nb*sizeof(real)));
 552 }
 553 for (int i = 0; i < (*m); i++){
 554 for (int j=0; j<(*m);j++){
 556 ((real*)GG,(real*)diagx, mma_bufred_d, (*n),(*m), i, j);
 558 mmareduce_kernel<real> <<<1, 1024, 0, stream>>> (mma_bufred_d, nb);
 560 mma_copy_kernel<<<1, 1, 0, stream>>>((real*)AA, mma_bufred_d, 1,
 561 i+j*(*m+1));
 564 }
 565 }
 566 }
567
// Recover the primal step dx from delx, diagx, GG and the multiplier step
// dlambda; one thread per design variable.
// NOTE(review): the kernel name on the launch line is missing from this
// listing — verify against the original mma.cu.
 568 void cuda_dx(void* dx,void* delx, void* diagx, void* GG, void* dlambda,
 569 int* n, int* m) {
 570 const dim3 nthrds(1024, 1, 1);
 571 const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
 573 ((real*)dx, (real*)delx, (real*)diagx,(real*) GG, (real*)dlambda,
 574 *n,*m);
 576 }
577
// Compute the lower-bound multiplier step dxsi from xsi, dx, x, alpha and
// the barrier parameter epsi; one thread per design variable.
// NOTE(review): the kernel name on the launch line is missing from this
// listing — verify against the original mma.cu.
 578 void cuda_dxsi(void* dxsi, void* xsi, void* dx,void* x,
 579 void* alpha, real*epsi, int* n) {
 580 const dim3 nthrds(1024, 1, 1);
 581 const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
 583 ((real*)dxsi, (real*)xsi, (real*)dx,(real*) x,
 584 (real*)alpha, *epsi,*n);
 586 }
587
// Compute the upper-bound multiplier step deta from eta, dx, x, beta and the
// barrier parameter epsi; mirror of cuda_dxsi for the upper bound.
// NOTE(review): the kernel name on the launch line is missing from this
// listing — verify against the original mma.cu.
 588 void cuda_deta(void* deta, void* eta, void* dx, void* x,
 589 void* beta, real* epsi, int* n) {
 590 const dim3 nthrds(1024, 1, 1);
 591 const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
 593 ((real*)deta, (real*)eta, (real*)dx, (real*)x,
 594 (real*)beta, *epsi, *n);
 596 }
597
// Compute the primal residual rex of the KKT system from x, the asymptotes,
// the p/q coefficients, lambda and the bound multipliers (xsi, eta); one
// thread per design variable.
// NOTE(review): the launch line, including the first kernel arguments, is
// partially missing from this listing — verify against the original mma.cu.
 598 void cuda_rex(void* rex, void* x, void* xlow, void* xupp, void* pij,
 599 void* p0j, void* qij, void* q0j, void* lambda, void* xsi, void* eta,
 600 int* n, int* m) {
 601 const dim3 nthrds(1024, 1, 1);
 602 const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
 605 (real*)xupp, (real*)pij, (real*)p0j, (real*)qij, (real*)q0j,
 606 (real*)lambda, (real*)xsi, (real*)eta, *n, *m);
 608 }
609
// Compute the y-residual rey from the penalty data (c, d), y, lambda and mu;
// one thread per element of the length-*n arrays.
// NOTE(review): the launch line is partially missing from this listing —
// verify against the original mma.cu.
 610 void cuda_rey(void* rey, void* c, void* d, void* y, void* lambda, void* mu,
 611 int* n) {
 612 const dim3 nthrds(1024, 1, 1);
 613 const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
 616 (real*)d, (real*)y, (real*)lambda, (real*)mu, * n);
 618 }
619
620
// Element-wise constrained-subtraction helper over length-*n arrays a..c
// with scalar d; three-array variant of cuda_sub2cons2.
// NOTE(review): the kernel name on the launch line is missing from this
// listing — verify the exact operation against the original mma.cu.
 621 void cuda_sub2cons(void * a,void * b,void * c, real *d, int * n) {
 622 const dim3 nthrds(1024, 1, 1);
 623 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
 625 ((real *) a, (real *) b, (real *) c, *d, *n);
 627 }
628
// Reduce the length-*n device array `a` to a scalar on the host via the
// shared reduction buffers (per-block partials, then mmareduce_kernel).
// The name suggests a norm (presumably the sum of squares — the per-block
// kernel's name is missing from this listing; TODO confirm against mma.cu).
// NOTE(review): no stream synchronization is visible between the async copy
// and the read of mma_bufred[0] — verify the original synchronizes.
 629 real cuda_norm(void* a, int* n) {
 630 const dim3 nthrds(1024, 1, 1);
 631 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
 632 const int nb = ((*n) + 1024 - 1)/ 1024;
 634 if(nb > mma_red_s){
 635 mma_red_s = nb;
 636 if(mma_bufred != NULL){
 637 CUDA_CHECK(cudaFreeHost(mma_bufred));
 638 CUDA_CHECK(cudaFree(mma_bufred_d));
 639 }
 640 CUDA_CHECK(cudaMallocHost(&mma_bufred,nb*sizeof(real)));
 641 CUDA_CHECK(cudaMalloc(&mma_bufred_d, nb*sizeof(real)));
 642 }
 644 ((real*)a, mma_bufred_d, (*n));
 646 mmareduce_kernel<real> <<<1, 1024, 0, stream >>> (mma_bufred_d, nb);
 648 CUDA_CHECK(cudaMemcpyAsync(mma_bufred, mma_bufred_d, sizeof(real),
 651 return mma_bufred[0];
 652 }
653
// Compute the dely term of the Newton system from the penalty data (c, d),
// y, lambda and the barrier parameter epsi; one thread per element.
// NOTE(review): the kernel name on the launch line is missing from this
// listing — verify against the original mma.cu.
 654 void cuda_dely(void* dely, void* c, void* d, void* y, void* lambda,
 655 real* epsi, int* n) {
 656 const dim3 nthrds(1024, 1, 1);
 657 const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
 659 ((real*)dely,(real*)c, (real*)d, (real*)y, (real*)lambda,*epsi, * n);
 661 }
662
// Two-array max reduction: reduce a length-*n expression of a, b and scalar
// cons to a single maximum on the host, via the shared reduction buffers
// and max_reduce_kernel.
// NOTE(review): the per-block kernel's name and the cudaMemcpyAsync
// kind/stream arguments are missing from this listing, and no stream
// synchronization is visible before mma_bufred[0] is read — verify against
// the original mma.cu.
 663 real cuda_maxval2(void* a, void* b, real* cons, int* n) {
 664 const dim3 nthrds(1024, 1, 1);
 665 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
 666 const int nb = ((*n) + 1024 - 1)/ 1024;
 668 if(nb > mma_red_s){
 669 mma_red_s = nb;
 670 if(mma_bufred != NULL) {
 671 CUDA_CHECK(cudaFreeHost(mma_bufred));
 672 CUDA_CHECK(cudaFree(mma_bufred_d));
 673 }
 674 CUDA_CHECK(cudaMallocHost(&mma_bufred,nb*sizeof(real)));
 675 CUDA_CHECK(cudaMalloc(&mma_bufred_d, nb*sizeof(real)));
 676 }
 678 ((real*)a, (real*)b, mma_bufred_d, *cons, *n);
 680 max_reduce_kernel<real> <<<1, 1024, 0,stream >>> (mma_bufred_d, nb);
 682 CUDA_CHECK(cudaMemcpyAsync(mma_bufred, mma_bufred_d, sizeof(real),
 685 return mma_bufred[0];
 686 }
687
// Three-array max reduction: same pattern as cuda_maxval2 but the per-block
// kernel also consumes array c.
// NOTE(review): the per-block kernel's name and the cudaMemcpyAsync
// kind/stream arguments are missing from this listing, and no stream
// synchronization is visible before mma_bufred[0] is read — verify against
// the original mma.cu.
 688 real cuda_maxval3(void* a, void* b, void* c, real* cons, int* n) {
 689 const dim3 nthrds(1024, 1, 1);
 690 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
 691 const int nb = ((*n) + 1024 - 1)/ 1024;
 693 if(nb > mma_red_s){
 694 mma_red_s = nb;
 695 if(mma_bufred != NULL) {
 696 CUDA_CHECK(cudaFreeHost(mma_bufred));
 697 CUDA_CHECK(cudaFree(mma_bufred_d));
 698 }
 699 CUDA_CHECK(cudaMallocHost(&mma_bufred,nb*sizeof(real)));
 700 CUDA_CHECK(cudaMalloc(&mma_bufred_d, nb*sizeof(real)));
 701 }
 703 ((real*)a, (real*)b, (real*)c, mma_bufred_d, *cons, *n);
 704 max_reduce_kernel<real> <<<1, 1024, 0,stream >>> (mma_bufred_d, nb);
 706 CUDA_CHECK(cudaMemcpyAsync(mma_bufred, mma_bufred_d, sizeof(real),
 709 return mma_bufred[0];
 710 }
711
// Compute the KKT primal residual rex at convergence-check time from the
// objective gradient df0dx, constraint gradients dfdx, the bound multipliers
// (xsi, eta) and lambda; one thread per design variable.
// NOTE(review): the kernel name on the launch line is missing from this
// listing — verify against the original mma.cu.
 712 void cuda_kkt_rex(void* rex, void* df0dx, void* dfdx, void* xsi,
 713 void* eta, void* lambda, int* n, int* m) {
 714 const dim3 nthrds(1024, 1, 1);
 715 const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
 717 ((real*)rex, (real*)df0dx, (real*)dfdx, (real*)xsi,
 718 (real*)eta, (real*)lambda, *n, *m);
 720 }
721
// Element-wise max-with-constants update of a from scalars b, c and array d;
// one thread per element of the length-*n arrays.
// NOTE(review): the kernel name on the launch line is missing from this
// listing — verify the exact operation against the original mma.cu.
 722 void cuda_maxcons(void* a, real* b, real* c, void* d, int* n) {
 723 const dim3 nthrds(1024, 1, 1);
 724 const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
 726 ((real*)a, *b, *c, (real*)d, *n);
 728 }
729
// Reduce a two-array length-*n expression of a and b to a scalar on the host
// (presumably a dot-product-style sum — the per-block kernel's name is
// missing from this listing; TODO confirm against mma.cu). Uses the shared
// reduction buffers and mmareduce_kernel.
// NOTE(review): no stream synchronization is visible between the async copy
// and the read of mma_bufred[0] — verify the original synchronizes.
 730 real cuda_lcsc2(void *a, void*b, int *n) {
 731 const dim3 nthrds(1024, 1, 1);
 732 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
 733 const int nb = ((*n) + 1024 - 1)/ 1024;
 735 if ( nb > mma_red_s){
 736 mma_red_s = nb;
 737 if (mma_bufred != NULL) {
 738 CUDA_CHECK(cudaFreeHost(mma_bufred));
 739 CUDA_CHECK(cudaFree(mma_bufred_d));
 740 }
 741 CUDA_CHECK(cudaMallocHost(&mma_bufred,nb*sizeof(real)));
 742 CUDA_CHECK(cudaMalloc(&mma_bufred_d, nb*sizeof(real)));
 743 }
 745 ((real*)a, (real*)b, mma_bufred_d, (*n));
 747 mmareduce_kernel<real> <<<1, 1024, 0, stream>>> (mma_bufred_d, nb);
 749 CUDA_CHECK(cudaMemcpyAsync(mma_bufred, mma_bufred_d, sizeof(real),
 752 return mma_bufred[0];
 753 }
754
// Global (across MPI ranks) sum of the length-*n device array `a`, in place.
// Compiled to a no-op when device-aware MPI is not available.
// NOTE(review): the actual allreduce call inside the #ifdef is missing from
// this listing (presumably device_mpi_allreduce from the included
// device_mpi_reduce header) — verify against the original mma.cu.
 755 void cuda_mpisum(void *a, int *n) {
 756#ifdef HAVE_DEVICE_MPI
 757 real* temp=(real*)a;
 760#endif
 761 }
762
// Element-wise add-inverse update of a from b and scalar c; one thread per
// element of the length-*n arrays.
// NOTE(review): the kernel name on the launch line is missing from this
// listing — verify the exact operation against the original mma.cu.
 763 void cuda_add2inv2(void* a, void *b, real* c, int* n) {
 764 const dim3 nthrds(1024, 1, 1);
 765 const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
 767 ((real*)a, (real*) b, *c, *n);
 769 }
770
// Element-wise two-term max update of a from scalar b, array c and scalar d;
// one thread per element of the length-*n arrays.
// NOTE(review): the kernel name on the launch line is missing from this
// listing — verify the exact operation against the original mma.cu.
 771 void cuda_max2(void* a, real* b, void* c, real* d, int* n) {
 772 const dim3 nthrds(1024, 1, 1);
 773 const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
 775 ((real*)a, *b, (real*)c,*d, *n);
 777 }
778
// Update the (m+1)-length right-hand side bb from dellambda, dely, the
// penalty data (d, mu, y) and the scalar delz; one thread per entry
// (note the grid is sized for *m + 1 entries).
// NOTE(review): the kernel name on the launch line is missing from this
// listing — verify against the original mma.cu.
 779 void cuda_updatebb(void* bb, void* dellambda, void* dely,void* d,
 780 void* mu, void* y, real* delz, int *m) {
 781 const dim3 nthrds(1024, 1, 1);
 782 const dim3 nblcks(((*m+1) + 1024 - 1) / 1024, 1, 1);
 784 ((real*) bb, (real*) dellambda, (real*) dely,(real*) d,
 785 (real*) mu, (real*) y, *delz, *m);
 787 }
788
// Update the (m+1) x (m+1) system matrix AA from the accumulated globaltmp_mm
// block plus the slack/multiplier/scaling data (s, lambda, d, mu, y, a,
// zeta, z); one thread per entry, grid sized for *m + 1.
// NOTE(review): the kernel name on the launch line is missing from this
// listing — verify against the original mma.cu.
 789 void cuda_updateAA(void* AA, void* globaltmp_mm, void* s, void* lambda,
 790 void* d, void*mu,void* y,void* a, real* zeta, real* z, int* m) {
 791 const dim3 nthrds(1024, 1, 1);
 792 const dim3 nblcks(((*m+1) + 1024 - 1) / 1024, 1, 1);
 794 ((real*) AA,(real*) globaltmp_mm, (real*) s, (real*) lambda,(real*) d,
 795 (real*)mu,(real*) y, (real*)a, *zeta, *z, *m);
 797 }
798
// Recover the y-step dy from dely, dlambda and the penalty data (d, mu, y);
// one thread per element of the length-*n arrays.
// NOTE(review): the kernel name on the launch line is missing from this
// listing — verify against the original mma.cu.
 799 void cuda_dy(void* dy, void* dely, void* dlambda,void* d, void* mu,
 800 void* y, int* n) {
 801 const dim3 nthrds(1024, 1, 1);
 802 const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
 804 ((real*) dy, (real*) dely, (real*) dlambda, (real*) d,
 805 (real*) mu,(real*) y, *n);
 807 }
808
809}/* extern "C" */
__global__ void heaviside_mapping_apply_kernel(const T beta, const T eta, T *__restrict__ X_out_d, T *__restrict__ X_in_d, const int n)