Neko-TOP
A portable framework for high-order spectral element flow toplogy optimization.
Loading...
Searching...
No Matches
mma.hip
Go to the documentation of this file.
1
37// System includes
38#include <stdio.h>
39#include <stdlib.h>
40
41// Device includes
42#include <hip/hip_runtime.h>
43#include <hipsolver/hipsolver.h>
44
45// Neko includes
46#include <neko/device/device_config.h>
47#include <neko/device/hip/check.h>
48#include <neko/math/bcknd/device/device_mpi_op.h>
49
50// Local includes
51#include "mma_kernel.h"
52
53extern "C" {
54
55int mma_red_s = 0;
56real * mma_bufred = NULL;
57real * mma_bufred_d = NULL;
58
59void hipSOLVER_wrapper(void* A, void* b, int n, int* jj) {
63
64 int lwork;
65 double *workspace;
66 int *ipiv;
67 int *info; // Device pointer for hipSOLVER info
68 int host_info = 0; // Host variable to store the info
69
70 // Workspace query
71 status = hipsolverDnDgetrf_bufferSize(handle, n, n, (double*)A, n, &lwork);
72 hipMalloc(&workspace, lwork * sizeof(double));
73 hipMalloc(&ipiv, n * sizeof(int));
74 hipMalloc(&info, sizeof(int));
75
76 // LU factorization and solve
77 hipsolverDnDgetrf(handle, n, n, (double*)A, n, workspace, ipiv, info);
78
79 // Copy info from device to host to check if factorization succeeded
81
82 if (host_info == 0) {
83 // Only solve if factorization was successful
84 hipsolverDnDgetrs(handle, HIPSOLVER_OP_N, n, 1, (double*)A, n, ipiv, (double*)b, n, info);
85 // Copy the final info value
87 }
88
89 // Return the actual info value through jj
90 *jj = host_info;
91
92 // Cleanup
97}
98
99void mma_prepare_aa_matrix_hip(void* AA, void* s, void* lambda,
100 void* d, void* mu, void* y,
101 void* a, real zeta, real z, int m) {
102 const int M = m;
103 const int matrix_size = M + 1;
104 const dim3 nthrds(256, 1, 1);
105 const dim3 nblcks((M + 256 - 1) / 256, 1, 1);
107
108 // Launch kernel to prepare AA matrix entirely on device
110 nblcks, nthrds, 0, stream,
111 (real*)AA, (real*)s, (real*)lambda, (real*)d,
112 (real*)mu, (real*)y, (real*)a, zeta, z, M);
113
115}
116
117void mma_prepare_hessian_hip(void* Hess, void* y, void* d,
118 void* mu, void* lambda, int m) {
119 const int M = m;
120 const dim3 nthrds(1024, 1, 1);
121 const dim3 nblcks((M + 1024 - 1) / 1024, 1, 1);
123
124 // Update diagonal elements
126 nblcks, nthrds, 0, stream,
127 (real*)Hess, (real*)y, (real*)d, (real*)mu, (real*)lambda, M);
129
130 // Synchronize to ensure diagonal updates are complete
132
133 // Choose kernel based on problem size
134 if (M <= 1024) {
135 // Single-block version (fast for small m)
136 const dim3 stab_nblcks(1, 1, 1);
139 (real*)Hess, M);
141 } else {
142 // Multi-block version (for large m)
143 // Compute trace on host (simple and reliable)
144 real* h_Hess = (real*)malloc(M * sizeof(real));
145
146 // Extract diagonal elements
147 for (int i = 0; i < M; i++) {
149 (real*)Hess + i * M + i,
150 sizeof(real),
152 }
154
155 // Compute trace and LM factor
156 real trace = 0.0;
157 for (int i = 0; i < M; i++) {
158 trace += h_Hess[i];
159 }
160 real lm_factor = fmax(-1.0e-4 * trace / M, 1.0e-7);
161
162 // Apply stabilization in parallel
164 nblcks, nthrds, 0, stream,
165 (real*)Hess, lm_factor, M);
167
168 free(h_Hess);
169 }
170}
171
172// Custom linear solver using kernel
173extern "C" void hip_custom_solver(void* A, void* b, int n, int* info) {
175
176 if (n <= 0) {
177 *info = -1; // Use CPU fallback
178 return;
179 }
180 const dim3 nthrds(1024, 1, 1);
181 const dim3 nblcks(1, 1, 1);
182
184 (real*)A, (real*)b, n);
185
187 if (err == hipSuccess) {
188 *info = 0; // GPU solver succeeded
189 } else {
190 *info = -1; // GPU failed
191 }
192}
193
194void delta_1dbeam_hip(void* Delta, real* L_total, real* Le,
195 int* offset, int* n) {
196 const dim3 nthrds(1024, 1, 1);
197 const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
200 (real*)Delta, *L_total, *Le, *offset, *n);
202}
203
204void hip_Hess(void* Hess, void* hijx, void* Ljjxinv, int *n, int *m) {
205 const dim3 nthrds(1024, 1, 1);
206 const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
207 const int nb = ((*n) + 1024 - 1) / 1024;
210
211 if (nb > mma_red_s) {
212 mma_red_s = nb;
213 if (mma_bufred != NULL) {
214 HIP_CHECK(hipHostFree(mma_bufred));
215 HIP_CHECK(hipFree(mma_bufred_d));
216 }
217 HIP_CHECK(hipHostMalloc(&mma_bufred, nb * sizeof(real)));
218 HIP_CHECK(hipMalloc(&mma_bufred_d, nb * sizeof(real)));
219 }
220
221 for (int i = 0; i < (*m); i++) {
222 for (int j = 0; j < (*m); j++) {
224 (real*)hijx, (real*)Ljjxinv, mma_bufred_d, (*n), (*m), i, j);
226
228 mma_bufred_d, nb);
230
231 hipLaunchKernelGGL(mma_copy_kernel, dim3(1), dim3(1), 0, stream,
232 (real*)Hess, mma_bufred_d, 1, i + j * (*m));
234
236 }
237 }
238}
239
240void mma_Ljjxinv_hip(void* Ljjxinv, void* pjlambda, void* qjlambda, void* x,
241 void* low, void* upp, void* alpha, void* beta, int* n) {
242 const dim3 nthrds(1024, 1, 1);
243 const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
246 (real*)x, (real*)low, (real*)upp, (real*)alpha, (real*)beta, *n);
248}
249
250void mma_dipsolvesub1_hip(void* x, void* pjlambda, void* qjlambda, void* low,
251 void* upp, void* alpha, void* beta, int* n) {
252 const dim3 nthrds(1024, 1, 1);
253 const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
256 (real*)low, (real*)upp, (real*)alpha, (real*)beta, *n);
258}
259
260void mattrans_v_mul_hip(void* output, void* pij, void* lambda, int* m, int* n) {
261 const dim3 nthrds(1024, 1, 1);
262 const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
264 (hipStream_t)glb_cmd_queue, (real*)output, (real*)pij, (real*)lambda, *m, *n);
266}
267
268void mma_gensub4_hip(const void* x, const void* low, const void* upp,
269 const void* pij, const void* qij,
270 const int* n, const int* m, void* bi) {
271
272 const int N = *n;
273 const int M = *m;
274
275 const dim3 nthrds(1024, 1, 1);
276 const dim3 nblcks((N + 1023) / 1024, 1, 1);
277 const int nb = (N + 1023) / 1024;
279
280 if (nb > mma_red_s) {
281 mma_red_s = nb;
282
283 if (mma_bufred != nullptr) {
284 HIP_CHECK(hipFreeHost(mma_bufred));
285 HIP_CHECK(hipFree(mma_bufred_d));
286 }
287
288 HIP_CHECK(hipHostMalloc(&mma_bufred, nb * sizeof(real)));
289 HIP_CHECK(hipMalloc(&mma_bufred_d, nb * sizeof(real)));
290 }
291
292 real* temp;
293 real* bi_d = static_cast<real*>(bi);
294 HIP_CHECK(hipMalloc(&temp, M * N * sizeof(real)));
295
297 static_cast<const real*>(x),
298 static_cast<const real*>(low),
299 static_cast<const real*>(upp),
300 static_cast<const real*>(pij),
301 static_cast<const real*>(qij),
302 temp, N, M);
303
304 for (int i = 0; i < M; ++i) {
306 temp, mma_bufred_d, N, M, i);
308
310 mma_bufred_d, nb);
312
314 bi_d + i, mma_bufred_d, sizeof(real),
316
318 }
319
320 HIP_CHECK(hipFree(temp));
321}
322
323
324void mma_gensub3_hip(void* x, void* df0dx, void* dfdx, void* low,
325 void* upp, void* xmin, void* xmax, void* alpha,
326 void* beta, void* p0j, void* q0j, void* pij,
327 void* qij, int* n, int* m) {
328 const dim3 nthrds(1024, 1, 1);
329 const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
330
333 (real*)x, (real*)df0dx, (real*)dfdx, (real*)low,
334 (real*)upp, (real*)xmin, (real*)xmax, (real*)alpha,
335 (real*)beta, (real*)p0j, (real*)q0j, (real*)pij,
336 (real*)qij, *n, *m);
337
339}
340
341void mma_gensub2_hip(void* low, void* upp, void* x, void* xold1,
342 void* xold2, void* xdiff, real* asydecr,
343 real* asyincr, int* n) {
344 const dim3 nthrds(1024, 1, 1);
345 const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
346
349 (real*)low, (real*)upp, (real*)x, (real*)xold1,
350 (real*)xold2, (real*)xdiff, *asydecr, *asyincr, *n);
351
353}
354
355void mma_gensub1_hip(void* low, void* upp, void* x, void* xmin, void* xmax,
356 real* asyinit, int* n) {
357 const dim3 nthrds(1024, 1, 1);
358 const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
359
362 (real*)low, (real*)upp, (real*)x, (real*)xmin, (real*)xmax,
363 *asyinit, *n);
364
366}
367
368void hip_mma_max(void* xsi, void* x, void* alpha, int* n) {
369 const dim3 nthrds(1024, 1, 1);
370 const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
371
374 (real*)xsi, (real*)x, (real*)alpha, *n);
375
377}
378
379void hip_relambda(void* relambda, void* x, void* xupp, void* xlow,
380 void* pij, void* qij, int* n, int* m) {
381 const dim3 nthrds(1024, 1, 1);
382 const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
383 const int nb = nblcks.x;
385
386 if (nb > mma_red_s) {
387 mma_red_s = nb;
388 if (mma_bufred != NULL) {
389 HIP_CHECK(hipHostFree(mma_bufred));
390 HIP_CHECK(hipFree(mma_bufred_d));
391 }
392 HIP_CHECK(hipHostMalloc(&mma_bufred, nb * sizeof(real)));
393 HIP_CHECK(hipMalloc(&mma_bufred_d, nb * sizeof(real)));
394 }
395
396 real* temp;
397 hipMalloc(&temp, (*n) * (*m) * sizeof(real));
398
400 temp, (real*)x, (real*)xupp, (real*)xlow,
401 (real*)pij, (real*)qij, *n, *m);
402
403 for (int i = 0; i < (*m); i++) {
405 temp, mma_bufred_d, (*n), (*m), i);
407
409 stream, mma_bufred_d, nb);
411
412 hipLaunchKernelGGL(mma_copy_kernel, dim3(1), dim3(1), 0, stream,
413 (real*)relambda, mma_bufred_d, 1, i);
415
417 }
418
419 hipFree(temp);
420}
421
422void hip_sub2cons2(void* a, void* b, void* c, void* d, real* e, int* n) {
423 const dim3 nthrds(1024, 1, 1);
424 const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
425
428 (real*)a, (real*)b, (real*)c, (real*)d, *e, *n);
429
431}
432
433real hip_maxval(void* a, int* n) {
434 const dim3 nthrds(1024, 1, 1);
435 const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
436 const int nb = nblcks.x;
438
439 if (nb > mma_red_s) {
440 mma_red_s = nb;
441 if (mma_bufred != NULL) {
442 HIP_CHECK(hipHostFree(mma_bufred));
443 HIP_CHECK(hipFree(mma_bufred_d));
444 }
445 HIP_CHECK(hipHostMalloc(&mma_bufred, nb * sizeof(real)));
446 HIP_CHECK(hipMalloc(&mma_bufred_d, nb * sizeof(real)));
447 }
448
450 (real*)a, mma_bufred_d, (*n));
452
454 mma_bufred_d, nb);
456
457 HIP_CHECK(hipMemcpyAsync(mma_bufred, mma_bufred_d, sizeof(real),
460
461 return mma_bufred[0];
462}
463
464
465void hip_delx(void* delx, void* x, void* xlow, void* xupp, void* pij,
466 void* qij, void* p0j, void* q0j, void* alpha, void* beta, void* lambda,
467 real* epsi, int* n, int* m) {
468 const dim3 nthrds(1024, 1, 1);
469 const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
470
473 (real*)delx, (real*)x, (real*)xlow, (real*)xupp, (real*)pij,
474 (real*)qij, (real*)p0j, (real*)q0j, (real*)alpha, (real*)beta,
475 (real*)lambda, *epsi, *n, *m);
477}
478
479void hip_GG(void* GG, void* x, void* xlow, void* xupp,
480 void* pij, void* qij, int* n, int* m) {
481 const dim3 nthrds(1024, 1, 1);
482 const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
483
486 (real*)GG, (real*)x, (real*)xlow, (real*)xupp, (real*)pij,
487 (real*)qij, *n, *m);
489}
490
491void hip_diagx(void* diagx, void* x, void* xsi, void* xlow, void* xupp,
492 void* p0j, void* q0j, void* pij, void* qij, void* alpha, void* beta,
493 void* eta, void* lambda, int *n, int *m) {
494 const dim3 nthrds(1024, 1, 1);
495 const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
496
499 (real*)diagx, (real*)x, (real*)xsi, (real*)xlow, (real*)xupp,
500 (real*)p0j, (real*)q0j, (real*)pij, (real*)qij, (real*)alpha,
501 (real*)beta, (real*)eta, (real*)lambda, *n, *m);
503}
504
505void hip_bb(void* bb, void* GG, void* delx, void* diagx, int *n, int *m) {
506 const dim3 nthrds(1024, 1, 1);
507 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
508 const int nb = ((*n) + 1024 - 1)/ 1024;
510
512
513 if (nb > mma_red_s) {
514 mma_red_s = nb;
515 if (mma_bufred != NULL) {
516 HIP_CHECK(hipHostFree(mma_bufred));
517 HIP_CHECK(hipFree(mma_bufred_d));
518 }
519 HIP_CHECK(hipHostMalloc(&mma_bufred, nb * sizeof(real)));
520 HIP_CHECK(hipMalloc(&mma_bufred_d, nb * sizeof(real)));
521 }
522
523 for (int i = 0; i < (*m); i++) {
525 (real*)GG, (real*)delx, (real*)diagx, mma_bufred_d, *n, *m, i);
527
529 mma_bufred_d, nb);
531
532 hipLaunchKernelGGL(mma_copy_kernel, 1, 1, 0, stream, (real*)bb,
533 mma_bufred_d, 1, i);
535
537 }
538}
539
540void hip_AA(void* AA, void* GG, void* diagx, int *n, int *m) {
541 const dim3 nthrds(1024, 1, 1);
542 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
543 const int nb = ((*n) + 1024 - 1)/ 1024;
545
547
548 if (nb > mma_red_s) {
549 mma_red_s = nb;
550 if (mma_bufred != NULL) {
551 HIP_CHECK(hipHostFree(mma_bufred));
552 HIP_CHECK(hipFree(mma_bufred_d));
553 }
554 HIP_CHECK(hipHostMalloc(&mma_bufred, nb * sizeof(real)));
555 HIP_CHECK(hipMalloc(&mma_bufred_d, nb * sizeof(real)));
556 }
557
558 for (int i = 0; i < (*m); i++) {
559 for (int j = 0; j < (*m); j++) {
561 (real*)GG, (real*)diagx, mma_bufred_d, *n, *m, i, j);
563
565 mma_bufred_d, nb);
567
568 hipLaunchKernelGGL(mma_copy_kernel, 1, 1, 0, stream,
569 (real*)AA, mma_bufred_d, 1, i + j * (*m + 1));
571
573 }
574 }
575}
576
577void hip_dx(void* dx, void* delx, void* diagx, void* GG, void* dlambda,
578 int* n, int* m) {
579 const dim3 nthrds(1024, 1, 1);
580 const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
581
584 (real*)dx, (real*)delx, (real*)diagx, (real*)GG, (real*)dlambda, *n, *m);
586}
587
588void hip_dxsi(void* dxsi, void* xsi, void* dx, void* x,
589 void* alpha, real* epsi, int* n) {
590 const dim3 nthrds(1024, 1, 1);
591 const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
592
595 (real*)dxsi, (real*)xsi, (real*)dx, (real*)x, (real*)alpha, *epsi, *n);
597}
598
599void hip_deta(void* deta, void* eta, void* dx, void* x,
600 void* beta, real* epsi, int* n) {
601 const dim3 nthrds(1024, 1, 1);
602 const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
603
606 (real*)deta, (real*)eta, (real*)dx, (real*)x, (real*)beta, *epsi, *n);
608}
609
610void hip_rex(void* rex, void* x, void* xlow, void* xupp, void* pij,
611 void* p0j, void* qij, void* q0j, void* lambda, void* xsi, void* eta,
612 int* n, int* m) {
613 const dim3 nthrds(1024, 1, 1);
614 const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
615
618 (real*)rex, (real*)x, (real*)xlow, (real*)xupp, (real*)pij, (real*)p0j,
619 (real*)qij, (real*)q0j, (real*)lambda, (real*)xsi, (real*)eta, *n, *m);
621}
622
623void hip_rey(void* rey, void* c, void* d, void* y, void* lambda, void* mu, int* n) {
624 const dim3 nthrds(1024, 1, 1);
625 const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
626
629 (real*)rey, (real*)c, (real*)d, (real*)y, (real*)lambda, (real*)mu, *n);
631}
632
633
634 // a_d = b_d * c_d - d
635void hip_sub2cons(void *a, void *b, void *c, real *d, int *n) {
636 const dim3 nthrds(1024, 1, 1);
637 const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
640 (real *)a, (real *)b, (real *)c, *d, *n);
642}
643
644
645// sum(a^2)
646real hip_norm(void* a, int* n) {
647 const dim3 nthrds(1024, 1, 1);
648 const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
649 const int nb = ((*n) + 1024 - 1) / 1024;
651
652 if (nb > mma_red_s) {
653 mma_red_s = nb;
654 if (mma_bufred != NULL) {
655 HIP_CHECK(hipFreeHost(mma_bufred));
656 HIP_CHECK(hipFree(mma_bufred_d));
657 }
658 HIP_CHECK(hipHostMalloc(&mma_bufred, nb * sizeof(real)));
659 HIP_CHECK(hipMalloc(&mma_bufred_d, nb * sizeof(real)));
660 }
661
663 (real*)a, mma_bufred_d, (*n));
665
667 mma_bufred_d, nb);
669
670 HIP_CHECK(hipMemcpyAsync(mma_bufred, mma_bufred_d, sizeof(real),
672
674
675 return mma_bufred[0];
676}
677
678
679void hip_dely(void* dely, void* c, void* d, void* y, void* lambda,
680 real* epsi, int* n) {
681 const dim3 nthrds(1024, 1, 1);
682 const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
685 (real*)dely, (real*)c, (real*)d, (real*)y, (real*)lambda, *epsi, *n);
687}
688
689
690real hip_maxval2(void* a, void* b, real* cons, int* n) {
691 const dim3 nthrds(1024, 1, 1);
692 const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
693 const int nb = ((*n) + 1024 - 1) / 1024;
695
696 if (nb > mma_red_s) {
697 mma_red_s = nb;
698 if (mma_bufred != NULL) {
699 HIP_CHECK(hipFreeHost(mma_bufred));
700 HIP_CHECK(hipFree(mma_bufred_d));
701 }
702 HIP_CHECK(hipHostMalloc(&mma_bufred, nb * sizeof(real)));
703 HIP_CHECK(hipMalloc(&mma_bufred_d, nb * sizeof(real)));
704 }
705
707 (real*)a, (real*)b, mma_bufred_d, *cons, *n);
709
711 mma_bufred_d, nb);
713
714 HIP_CHECK(hipMemcpyAsync(mma_bufred, mma_bufred_d, sizeof(real),
716
718
719 return mma_bufred[0];
720}
721
722
723real hip_maxval3(void* a, void* b, void* c, real* cons, int* n) {
724 const dim3 nthrds(1024, 1, 1);
725 const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
726 const int nb = ((*n) + 1024 - 1) / 1024;
728
729 if (nb > mma_red_s) {
730 mma_red_s = nb;
731 if (mma_bufred != NULL) {
732 HIP_CHECK(hipFreeHost(mma_bufred));
733 HIP_CHECK(hipFree(mma_bufred_d));
734 }
735 HIP_CHECK(hipHostMalloc(&mma_bufred, nb * sizeof(real)));
736 HIP_CHECK(hipMalloc(&mma_bufred_d, nb * sizeof(real)));
737 }
738
740 (real*)a, (real*)b, (real*)c, mma_bufred_d, *cons, *n);
742 mma_bufred_d, nb);
744
745 HIP_CHECK(hipMemcpyAsync(mma_bufred, mma_bufred_d, sizeof(real),
747
749
750 return mma_bufred[0];
751}
752
753
754void hip_kkt_rex(void* rex, void* df0dx, void* dfdx, void* xsi,
755 void* eta, void* lambda, int* n, int* m) {
756 const dim3 nthrds(1024, 1, 1);
757 const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
760 (real*)rex, (real*)df0dx, (real*)dfdx, (real*)xsi,
761 (real*)eta, (real*)lambda, *n, *m);
763}
764
765
766// a_d = max(b, c * d_d)
767void hip_maxcons(void* a, real* b, real* c, void* d, int* n) {
768 const dim3 nthrds(1024, 1, 1);
769 const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
772 (real*)a, *b, *c, (real*)d, *n);
774}
775
776
777real hip_lcsc2(void *a, void*b, int *n) {
778 const dim3 nthrds(1024, 1, 1);
779 const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
780 const int nb = ((*n) + 1024 - 1) / 1024;
782
783 if (nb > mma_red_s) {
784 mma_red_s = nb;
785 if (mma_bufred != NULL) {
786 HIP_CHECK(hipFreeHost(mma_bufred));
787 HIP_CHECK(hipFree(mma_bufred_d));
788 }
789 HIP_CHECK(hipHostMalloc(&mma_bufred, nb * sizeof(real)));
790 HIP_CHECK(hipMalloc(&mma_bufred_d, nb * sizeof(real)));
791 }
792
794 (real*)a, (real*)b, mma_bufred_d, (*n));
796
798 mma_bufred_d, nb);
800
801 HIP_CHECK(hipMemcpyAsync(mma_bufred, mma_bufred_d, sizeof(real),
803
805
806 return mma_bufred[0];
807}
808
809
810void hip_mpisum(void *a, int *n) {
811#ifdef HAVE_DEVICE_MPI
812 real* temp = (real*)a;
815#endif
816}
817
818
819void hip_add2inv2(void* a, void* b, real* c, int* n) {
820 const dim3 nthrds(1024, 1, 1);
821 const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
823
825 (real*)a, (real*)b, *c, *n);
827}
828
829void hip_max2(void* a, real* b, void* c, real* d, int* n) {
830 const dim3 nthrds(1024, 1, 1);
831 const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
833
835 (real*)a, *b, (real*)c, *d, *n);
837}
838
839void hip_updatebb(void* bb, void* dellambda, void* dely, void* d,
840 void* mu, void* y, real* delz, int* m) {
841 const dim3 nthrds(1024, 1, 1);
842 const dim3 nblcks(((*m + 1) + 1024 - 1) / 1024, 1, 1);
844
846 (real*)bb, (real*)dellambda, (real*)dely, (real*)d,
847 (real*)mu, (real*)y, *delz, *m);
849}
850
851void hip_updateAA(void* AA, void* globaltmp_mm, void* s, void* lambda,
852 void* d, void* mu, void* y, void* a,
853 real* zeta, real* z, int* m) {
854 const dim3 nthrds(1024, 1, 1);
855 const dim3 nblcks(((*m + 1) + 1024 - 1) / 1024, 1, 1);
857
859 (real*)AA, (real*)globaltmp_mm, (real*)s,
860 (real*)lambda, (real*)d, (real*)mu,
861 (real*)y, (real*)a, *zeta, *z, *m);
863}
864
865void hip_dy(void* dy, void* dely, void* dlambda, void* d,
866 void* mu, void* y, int* n) {
867 const dim3 nthrds(1024, 1, 1);
868 const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
870
872 (real*)dy, (real*)dely, (real*)dlambda, (real*)d,
873 (real*)mu, (real*)y, *n);
875}
876}
__global__ void convex_down_RAMP_mapping_apply_kernel(const T f_min, const T f_max, const T q, T *__restrict__ X_out_d, T *__restrict__ X_in_d, const int n)