35#include <hip/hip_runtime.h>
36#include "device/hip/check.h"
37#include "mma_kernel.h"
42#include "math/bcknd/device/device_mpi_reduce.h"
43#include "math/bcknd/device/device_mpi_op.h"
44#include "device/device_config.h"
47real * mma_bufred = NULL;
48real * mma_bufred_d = NULL;
50#include <hip/hip_runtime.h>
52void hip_Hess(
void* Hess,
void* hijx,
void* Ljjxinv,
int *n,
int *m) {
53 const dim3 nthrds(1024, 1, 1);
54 const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
55 const int nb = ((*n) + 1024 - 1) / 1024;
56 const hipStream_t stream = (hipStream_t) glb_cmd_queue;
57 hipStreamSynchronize(stream);
61 if (mma_bufred != NULL) {
62 HIP_CHECK(hipHostFree(mma_bufred));
63 HIP_CHECK(hipFree(mma_bufred_d));
65 HIP_CHECK(hipHostMalloc(&mma_bufred, nb *
sizeof(real)));
66 HIP_CHECK(hipMalloc(&mma_bufred_d, nb *
sizeof(real)));
69 for (
int i = 0; i < (*m); i++) {
70 for (
int j = 0; j < (*m); j++) {
71 hipLaunchKernelGGL(mmasumHess_kernel<real>, nblcks, nthrds, 0, stream,
72 (real*)hijx, (real*)Ljjxinv, mma_bufred_d, (*n), (*m), i, j);
73 HIP_CHECK(hipGetLastError());
75 hipLaunchKernelGGL(mmareduce_kernel<real>, dim3(1), dim3(1024), 0, stream,
77 HIP_CHECK(hipGetLastError());
79 hipLaunchKernelGGL(mma_copy_kernel, dim3(1), dim3(1), 0, stream,
80 (real*)Hess, mma_bufred_d, 1, i + j * (*m));
81 HIP_CHECK(hipGetLastError());
83 hipStreamSynchronize(stream);
88void mma_Ljjxinv_hip(
void* Ljjxinv,
void* pjlambda,
void* qjlambda,
void* x,
89 void* low,
void* upp,
void* alpha,
void* beta,
int* n) {
90 const dim3 nthrds(1024, 1, 1);
91 const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
92 hipLaunchKernelGGL(mma_Ljjxinv_kernel<real>, nblcks, nthrds, 0,
93 (hipStream_t)glb_cmd_queue, (real*)Ljjxinv, (real*)pjlambda, (real*)qjlambda,
94 (real*)x, (real*)low, (real*)upp, (real*)alpha, (real*)beta, *n);
95 HIP_CHECK(hipGetLastError());
98void mma_dipsolvesub1_hip(
void* x,
void* pjlambda,
void* qjlambda,
void* low,
99 void* upp,
void* alpha,
void* beta,
int* n) {
100 const dim3 nthrds(1024, 1, 1);
101 const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
102 hipLaunchKernelGGL(mma_dipsolvesub1_kernel<real>, nblcks, nthrds, 0,
103 (hipStream_t)glb_cmd_queue, (real*)x, (real*)pjlambda, (real*)qjlambda,
104 (real*)low, (real*)upp, (real*)alpha, (real*)beta, *n);
105 HIP_CHECK(hipGetLastError());
108void mattrans_v_mul_hip(
void* output,
void* pij,
void* lambda,
int* m,
int* n) {
109 const dim3 nthrds(1024, 1, 1);
110 const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
111 hipLaunchKernelGGL(mattrans_v_mul_kernel<real>, nblcks, nthrds, 0,
112 (hipStream_t)glb_cmd_queue, (real*)output, (real*)pij, (real*)lambda, *m, *n);
113 HIP_CHECK(hipGetLastError());
116void mma_gensub4_hip(
void* x,
void* low,
void* upp,
void* pij,
void* qij,
117 int* n,
int* m,
void* bi) {
118 const dim3 nthrds(1024, 1, 1);
119 const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
120 const int nb = ((*n) + 1024 - 1) / 1024;
121 const hipStream_t stream = (hipStream_t)glb_cmd_queue;
123 if (nb > mma_red_s) {
125 if (mma_bufred != NULL) {
126 HIP_CHECK(hipFreeHost(mma_bufred));
127 HIP_CHECK(hipFree(mma_bufred_d));
129 HIP_CHECK(hipHostMalloc(&mma_bufred,
131 HIP_CHECK(hipMalloc(&mma_bufred_d,
136 real* bi_d = (real*)bi;
137 hipMalloc(&temp, (*m) * (*n) *
sizeof(real));
139 hipLaunchKernelGGL(mma_sub4_kernel<real>, nblcks, nthrds, 0, stream,
140 (real*)x, (real*)low, (real*)upp, (real*)pij, (real*)qij,
143 for (
int i = 0; i < (*m); i++) {
144 hipLaunchKernelGGL(mmasum_kernel<real>, nblcks, nthrds, 0, stream,
145 temp, mma_bufred_d, (*n), (*m), i);
146 HIP_CHECK(hipGetLastError());
148 hipLaunchKernelGGL(mmareduce_kernel<real>, dim3(1), dim3(1024), 0, stream,
150 HIP_CHECK(hipGetLastError());
152 HIP_CHECK(hipMemcpyAsync(
153 bi_d + i, mma_bufred_d,
sizeof(real),
154 hipMemcpyDeviceToDevice, stream));
156 hipStreamSynchronize(stream);
162void mma_gensub3_hip(
void* x,
void* df0dx,
void* dfdx,
void* low,
163 void* upp,
void* xmin,
void* xmax,
void* alpha,
164 void* beta,
void* p0j,
void* q0j,
void* pij,
165 void* qij,
int* n,
int* m) {
166 const dim3 nthrds(1024, 1, 1);
167 const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
169 hipLaunchKernelGGL(mma_sub3_kernel<real>, nblcks, nthrds, 0,
170 (hipStream_t)glb_cmd_queue,
171 (real*)x, (real*)df0dx, (real*)dfdx, (real*)low,
172 (real*)upp, (real*)xmin, (real*)xmax, (real*)alpha,
173 (real*)beta, (real*)p0j, (real*)q0j, (real*)pij,
176 HIP_CHECK(hipGetLastError());
179void mma_gensub2_hip(
void* low,
void* upp,
void* x,
void* xold1,
180 void* xold2,
void* xdiff, real* asydecr,
181 real* asyincr,
int* n) {
182 const dim3 nthrds(1024, 1, 1);
183 const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
185 hipLaunchKernelGGL(mma_sub2_kernel<real>, nblcks, nthrds, 0,
186 (hipStream_t)glb_cmd_queue,
187 (real*)low, (real*)upp, (real*)x, (real*)xold1,
188 (real*)xold2, (real*)xdiff, *asydecr, *asyincr, *n);
190 HIP_CHECK(hipGetLastError());
193void mma_gensub1_hip(
void* low,
void* upp,
void* x,
void* xmin,
void* xmax,
194 real* asyinit,
int* n) {
195 const dim3 nthrds(1024, 1, 1);
196 const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
198 hipLaunchKernelGGL(mma_sub1_kernel<real>, nblcks, nthrds, 0,
199 (hipStream_t)glb_cmd_queue,
200 (real*)low, (real*)upp, (real*)x, (real*)xmin, (real*)xmax,
203 HIP_CHECK(hipGetLastError());
206void hip_mma_max(
void* xsi,
void* x,
void* alpha,
int* n) {
207 const dim3 nthrds(1024, 1, 1);
208 const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
210 hipLaunchKernelGGL(mma_max2_kernel<real>, nblcks, nthrds, 0,
211 (hipStream_t)glb_cmd_queue,
212 (real*)xsi, (real*)x, (real*)alpha, *n);
214 HIP_CHECK(hipGetLastError());
217void hip_relambda(
void* relambda,
void* x,
void* xupp,
void* xlow,
218 void* pij,
void* qij,
int* n,
int* m) {
219 const dim3 nthrds(1024, 1, 1);
220 const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
221 const int nb = nblcks.x;
222 const hipStream_t stream = (hipStream_t)glb_cmd_queue;
224 if (nb > mma_red_s) {
226 if (mma_bufred != NULL) {
227 HIP_CHECK(hipHostFree(mma_bufred));
228 HIP_CHECK(hipFree(mma_bufred_d));
230 HIP_CHECK(hipHostMalloc(&mma_bufred, nb *
sizeof(real)));
231 HIP_CHECK(hipMalloc(&mma_bufred_d, nb *
sizeof(real)));
235 hipMalloc(&temp, (*n) * (*m) *
sizeof(real));
237 hipLaunchKernelGGL(relambda_kernel<real>, nblcks, nthrds, 0, stream,
238 temp, (real*)x, (real*)xupp, (real*)xlow,
239 (real*)pij, (real*)qij, *n, *m);
241 for (
int i = 0; i < (*m); i++) {
242 hipLaunchKernelGGL(mmasum_kernel<real>, nblcks, nthrds, 0, stream,
243 temp, mma_bufred_d, (*n), (*m), i);
244 HIP_CHECK(hipGetLastError());
246 hipLaunchKernelGGL(mmareduce_kernel<real>, dim3(1), dim3(1024), 0,
247 stream, mma_bufred_d, nb);
248 HIP_CHECK(hipGetLastError());
250 hipLaunchKernelGGL(mma_copy_kernel, dim3(1), dim3(1), 0, stream,
251 (real*)relambda, mma_bufred_d, 1, i);
252 HIP_CHECK(hipGetLastError());
254 hipStreamSynchronize(stream);
260void hip_sub2cons2(
void* a,
void* b,
void* c,
void* d, real* e,
int* n) {
261 const dim3 nthrds(1024, 1, 1);
262 const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
264 hipLaunchKernelGGL(sub2cons2_kernel<real>, nblcks, nthrds, 0,
265 (hipStream_t)glb_cmd_queue,
266 (real*)a, (real*)b, (real*)c, (real*)d, *e, *n);
268 HIP_CHECK(hipGetLastError());
271real hip_maxval(
void* a,
int* n) {
272 const dim3 nthrds(1024, 1, 1);
273 const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
274 const int nb = nblcks.x;
275 const hipStream_t stream = (hipStream_t)glb_cmd_queue;
277 if (nb > mma_red_s) {
279 if (mma_bufred != NULL) {
280 HIP_CHECK(hipHostFree(mma_bufred));
281 HIP_CHECK(hipFree(mma_bufred_d));
283 HIP_CHECK(hipHostMalloc(&mma_bufred, nb *
sizeof(real)));
284 HIP_CHECK(hipMalloc(&mma_bufred_d, nb *
sizeof(real)));
287 hipLaunchKernelGGL(maxval_kernel<real>, nblcks, nthrds, 0, stream,
288 (real*)a, mma_bufred_d, (*n));
289 HIP_CHECK(hipGetLastError());
291 hipLaunchKernelGGL(max_reduce_kernel<real>, dim3(1), dim3(1024), 0, stream,
293 HIP_CHECK(hipGetLastError());
295 HIP_CHECK(hipMemcpyAsync(mma_bufred, mma_bufred_d,
sizeof(real),
296 hipMemcpyDeviceToHost, stream));
297 hipStreamSynchronize(stream);
299 return mma_bufred[0];
303void hip_delx(
void* delx,
void* x,
void* xlow,
void* xupp,
void* pij,
304 void* qij,
void* p0j,
void* q0j,
void* alpha,
void* beta,
void* lambda,
305 real* epsi,
int* n,
int* m) {
306 const dim3 nthrds(1024, 1, 1);
307 const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
309 hipLaunchKernelGGL(delx_kernel<real>, nblcks, nthrds, 0,
310 (hipStream_t)glb_cmd_queue,
311 (real*)delx, (real*)x, (real*)xlow, (real*)xupp, (real*)pij,
312 (real*)qij, (real*)p0j, (real*)q0j, (real*)alpha, (real*)beta,
313 (real*)lambda, *epsi, *n, *m);
314 HIP_CHECK(hipGetLastError());
317void hip_GG(
void* GG,
void* x,
void* xlow,
void* xupp,
318 void* pij,
void* qij,
int* n,
int* m) {
319 const dim3 nthrds(1024, 1, 1);
320 const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
322 hipLaunchKernelGGL(GG_kernel<real>, nblcks, nthrds, 0,
323 (hipStream_t)glb_cmd_queue,
324 (real*)GG, (real*)x, (real*)xlow, (real*)xupp, (real*)pij,
326 HIP_CHECK(hipGetLastError());
329void hip_diagx(
void* diagx,
void* x,
void* xsi,
void* xlow,
void* xupp,
330 void* p0j,
void* q0j,
void* pij,
void* qij,
void* alpha,
void* beta,
331 void* eta,
void* lambda,
int *n,
int *m) {
332 const dim3 nthrds(1024, 1, 1);
333 const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
335 hipLaunchKernelGGL(diagx_kernel<real>, nblcks, nthrds, 0,
336 (hipStream_t)glb_cmd_queue,
337 (real*)diagx, (real*)x, (real*)xsi, (real*)xlow, (real*)xupp,
338 (real*)p0j, (real*)q0j, (real*)pij, (real*)qij, (real*)alpha,
339 (real*)beta, (real*)eta, (real*)lambda, *n, *m);
340 HIP_CHECK(hipGetLastError());
343void hip_bb(
void* bb,
void* GG,
void* delx,
void* diagx,
int *n,
int *m) {
344 const dim3 nthrds(1024, 1, 1);
345 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
346 const int nb = ((*n) + 1024 - 1)/ 1024;
347 const hipStream_t stream = (hipStream_t)glb_cmd_queue;
349 hipStreamSynchronize(stream);
351 if (nb > mma_red_s) {
353 if (mma_bufred != NULL) {
354 HIP_CHECK(hipHostFree(mma_bufred));
355 HIP_CHECK(hipFree(mma_bufred_d));
357 HIP_CHECK(hipHostMalloc(&mma_bufred, nb *
sizeof(real)));
358 HIP_CHECK(hipMalloc(&mma_bufred_d, nb *
sizeof(real)));
361 for (
int i = 0; i < (*m); i++) {
362 hipLaunchKernelGGL(mmasumbb_kernel<real>, nblcks, nthrds, 0, stream,
363 (real*)GG, (real*)delx, (real*)diagx, mma_bufred_d, *n, *m, i);
364 HIP_CHECK(hipGetLastError());
366 hipLaunchKernelGGL(mmareduce_kernel<real>, 1, 1024, 0, stream,
368 HIP_CHECK(hipGetLastError());
370 hipLaunchKernelGGL(mma_copy_kernel, 1, 1, 0, stream, (real*)bb,
372 HIP_CHECK(hipGetLastError());
374 hipStreamSynchronize(stream);
378void hip_AA(
void* AA,
void* GG,
void* diagx,
int *n,
int *m) {
379 const dim3 nthrds(1024, 1, 1);
380 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
381 const int nb = ((*n) + 1024 - 1)/ 1024;
382 const hipStream_t stream = (hipStream_t)glb_cmd_queue;
384 hipStreamSynchronize(stream);
386 if (nb > mma_red_s) {
388 if (mma_bufred != NULL) {
389 HIP_CHECK(hipHostFree(mma_bufred));
390 HIP_CHECK(hipFree(mma_bufred_d));
392 HIP_CHECK(hipHostMalloc(&mma_bufred, nb *
sizeof(real)));
393 HIP_CHECK(hipMalloc(&mma_bufred_d, nb *
sizeof(real)));
396 for (
int i = 0; i < (*m); i++) {
397 for (
int j = 0; j < (*m); j++) {
398 hipLaunchKernelGGL(mmasumAA_kernel<real>, nblcks, nthrds, 0, stream,
399 (real*)GG, (real*)diagx, mma_bufred_d, *n, *m, i, j);
400 HIP_CHECK(hipGetLastError());
402 hipLaunchKernelGGL(mmareduce_kernel<real>, 1, 1024, 0, stream,
404 HIP_CHECK(hipGetLastError());
406 hipLaunchKernelGGL(mma_copy_kernel, 1, 1, 0, stream,
407 (real*)AA, mma_bufred_d, 1, i + j * (*m + 1));
408 HIP_CHECK(hipGetLastError());
410 hipStreamSynchronize(stream);
415void hip_dx(
void* dx,
void* delx,
void* diagx,
void* GG,
void* dlambda,
417 const dim3 nthrds(1024, 1, 1);
418 const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
420 hipLaunchKernelGGL(dx_kernel<real>, nblcks, nthrds, 0,
421 (hipStream_t)glb_cmd_queue,
422 (real*)dx, (real*)delx, (real*)diagx, (real*)GG, (real*)dlambda, *n, *m);
423 HIP_CHECK(hipGetLastError());
426void hip_dxsi(
void* dxsi,
void* xsi,
void* dx,
void* x,
427 void* alpha, real* epsi,
int* n) {
428 const dim3 nthrds(1024, 1, 1);
429 const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
431 hipLaunchKernelGGL(dxsi_kernel<real>, nblcks, nthrds, 0,
432 (hipStream_t)glb_cmd_queue,
433 (real*)dxsi, (real*)xsi, (real*)dx, (real*)x, (real*)alpha, *epsi, *n);
434 HIP_CHECK(hipGetLastError());
437void hip_deta(
void* deta,
void* eta,
void* dx,
void* x,
438 void* beta, real* epsi,
int* n) {
439 const dim3 nthrds(1024, 1, 1);
440 const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
442 hipLaunchKernelGGL(deta_kernel<real>, nblcks, nthrds, 0,
443 (hipStream_t)glb_cmd_queue,
444 (real*)deta, (real*)eta, (real*)dx, (real*)x, (real*)beta, *epsi, *n);
445 HIP_CHECK(hipGetLastError());
448void hip_rex(
void* rex,
void* x,
void* xlow,
void* xupp,
void* pij,
449 void* p0j,
void* qij,
void* q0j,
void* lambda,
void* xsi,
void* eta,
451 const dim3 nthrds(1024, 1, 1);
452 const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
454 hipLaunchKernelGGL(RexCalculation_kernel<real>, nblcks, nthrds, 0,
455 (hipStream_t)glb_cmd_queue,
456 (real*)rex, (real*)x, (real*)xlow, (real*)xupp, (real*)pij, (real*)p0j,
457 (real*)qij, (real*)q0j, (real*)lambda, (real*)xsi, (real*)eta, *n, *m);
458 HIP_CHECK(hipGetLastError());
461void hip_rey(
void* rey,
void* c,
void* d,
void* y,
void* lambda,
void* mu,
int* n) {
462 const dim3 nthrds(1024, 1, 1);
463 const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
465 hipLaunchKernelGGL(rey_calculation_kernel<real>, nblcks, nthrds, 0,
466 (hipStream_t)glb_cmd_queue,
467 (real*)rey, (real*)c, (real*)d, (real*)y, (real*)lambda, (real*)mu, *n);
468 HIP_CHECK(hipGetLastError());
473void hip_sub2cons(
void *a,
void *b,
void *c, real *d,
int *n) {
474 const dim3 nthrds(1024, 1, 1);
475 const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
476 hipLaunchKernelGGL(sub2cons_kernel<real>, nblcks, nthrds, 0,
477 (hipStream_t)glb_cmd_queue,
478 (real *)a, (real *)b, (real *)c, *d, *n);
479 HIP_CHECK(hipGetLastError());
484real hip_norm(
void* a,
int* n) {
485 const dim3 nthrds(1024, 1, 1);
486 const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
487 const int nb = ((*n) + 1024 - 1) / 1024;
488 const hipStream_t stream = (hipStream_t)glb_cmd_queue;
490 if (nb > mma_red_s) {
492 if (mma_bufred != NULL) {
493 HIP_CHECK(hipFreeHost(mma_bufred));
494 HIP_CHECK(hipFree(mma_bufred_d));
496 HIP_CHECK(hipHostMalloc(&mma_bufred, nb *
sizeof(real)));
497 HIP_CHECK(hipMalloc(&mma_bufred_d, nb *
sizeof(real)));
500 hipLaunchKernelGGL(norm_kernel<real>, nblcks, nthrds, 0, stream,
501 (real*)a, mma_bufred_d, (*n));
502 HIP_CHECK(hipGetLastError());
504 hipLaunchKernelGGL(mmareduce_kernel<real>, dim3(1), dim3(1024), 0, stream,
506 HIP_CHECK(hipGetLastError());
508 HIP_CHECK(hipMemcpyAsync(mma_bufred, mma_bufred_d,
sizeof(real),
509 hipMemcpyDeviceToHost, stream));
511 hipStreamSynchronize(stream);
513 return mma_bufred[0];
517void hip_dely(
void* dely,
void* c,
void* d,
void* y,
void* lambda,
518 real* epsi,
int* n) {
519 const dim3 nthrds(1024, 1, 1);
520 const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
521 hipLaunchKernelGGL(dely_kernel<real>, nblcks, nthrds, 0,
522 (hipStream_t)glb_cmd_queue,
523 (real*)dely, (real*)c, (real*)d, (real*)y, (real*)lambda, *epsi, *n);
524 HIP_CHECK(hipGetLastError());
528real hip_maxval2(
void* a,
void* b, real* cons,
int* n) {
529 const dim3 nthrds(1024, 1, 1);
530 const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
531 const int nb = ((*n) + 1024 - 1) / 1024;
532 const hipStream_t stream = (hipStream_t)glb_cmd_queue;
534 if (nb > mma_red_s) {
536 if (mma_bufred != NULL) {
537 HIP_CHECK(hipFreeHost(mma_bufred));
538 HIP_CHECK(hipFree(mma_bufred_d));
540 HIP_CHECK(hipHostMalloc(&mma_bufred, nb *
sizeof(real)));
541 HIP_CHECK(hipMalloc(&mma_bufred_d, nb *
sizeof(real)));
544 hipLaunchKernelGGL(maxval2_kernel<real>, nblcks, nthrds, 0, stream,
545 (real*)a, (real*)b, mma_bufred_d, *cons, *n);
546 HIP_CHECK(hipGetLastError());
548 hipLaunchKernelGGL(max_reduce_kernel<real>, dim3(1), dim3(1024), 0, stream,
550 HIP_CHECK(hipGetLastError());
552 HIP_CHECK(hipMemcpyAsync(mma_bufred, mma_bufred_d,
sizeof(real),
553 hipMemcpyDeviceToHost, stream));
555 hipStreamSynchronize(stream);
557 return mma_bufred[0];
561real hip_maxval3(
void* a,
void* b,
void* c, real* cons,
int* n) {
562 const dim3 nthrds(1024, 1, 1);
563 const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
564 const int nb = ((*n) + 1024 - 1) / 1024;
565 const hipStream_t stream = (hipStream_t)glb_cmd_queue;
567 if (nb > mma_red_s) {
569 if (mma_bufred != NULL) {
570 HIP_CHECK(hipFreeHost(mma_bufred));
571 HIP_CHECK(hipFree(mma_bufred_d));
573 HIP_CHECK(hipHostMalloc(&mma_bufred, nb *
sizeof(real)));
574 HIP_CHECK(hipMalloc(&mma_bufred_d, nb *
sizeof(real)));
577 hipLaunchKernelGGL(maxval3_kernel<real>, nblcks, nthrds, 0, stream,
578 (real*)a, (real*)b, (real*)c, mma_bufred_d, *cons, *n);
579 hipLaunchKernelGGL(max_reduce_kernel<real>, dim3(1), dim3(1024), 0, stream,
581 HIP_CHECK(hipGetLastError());
583 HIP_CHECK(hipMemcpyAsync(mma_bufred, mma_bufred_d,
sizeof(real),
584 hipMemcpyDeviceToHost, stream));
586 hipStreamSynchronize(stream);
588 return mma_bufred[0];
592void hip_kkt_rex(
void* rex,
void* df0dx,
void* dfdx,
void* xsi,
593 void* eta,
void* lambda,
int* n,
int* m) {
594 const dim3 nthrds(1024, 1, 1);
595 const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
596 hipLaunchKernelGGL(kkt_rex_kernel<real>, nblcks, nthrds, 0,
597 (hipStream_t)glb_cmd_queue,
598 (real*)rex, (real*)df0dx, (real*)dfdx, (real*)xsi,
599 (real*)eta, (real*)lambda, *n, *m);
600 HIP_CHECK(hipGetLastError());
605void hip_maxcons(
void* a, real* b, real* c,
void* d,
int* n) {
606 const dim3 nthrds(1024, 1, 1);
607 const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
608 hipLaunchKernelGGL(maxcons_kernel<real>, nblcks, nthrds, 0,
609 (hipStream_t)glb_cmd_queue,
610 (real*)a, *b, *c, (real*)d, *n);
611 HIP_CHECK(hipGetLastError());
615real hip_lcsc2(
void *a,
void*b,
int *n) {
616 const dim3 nthrds(1024, 1, 1);
617 const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
618 const int nb = ((*n) + 1024 - 1) / 1024;
619 const hipStream_t stream = (hipStream_t)glb_cmd_queue;
621 if (nb > mma_red_s) {
623 if (mma_bufred != NULL) {
624 HIP_CHECK(hipFreeHost(mma_bufred));
625 HIP_CHECK(hipFree(mma_bufred_d));
627 HIP_CHECK(hipHostMalloc(&mma_bufred, nb *
sizeof(real)));
628 HIP_CHECK(hipMalloc(&mma_bufred_d, nb *
sizeof(real)));
631 hipLaunchKernelGGL(glsc2_kernel<real>, nblcks, nthrds, 0, stream,
632 (real*)a, (real*)b, mma_bufred_d, (*n));
633 HIP_CHECK(hipGetLastError());
635 hipLaunchKernelGGL(mmareduce_kernel<real>, dim3(1), dim3(1024), 0, stream,
637 HIP_CHECK(hipGetLastError());
639 HIP_CHECK(hipMemcpyAsync(mma_bufred, mma_bufred_d,
sizeof(real),
640 hipMemcpyDeviceToHost, stream));
642 hipStreamSynchronize(stream);
644 return mma_bufred[0];
648void hip_mpisum(
void *a,
int *n) {
649#ifdef HAVE_DEVICE_MPI
650 real* temp = (real*)a;
651 hipStreamSynchronize(stream);
652 device_mpi_allreduce_inplace(temp, *n,
sizeof(real), DEVICE_MPI_SUM);
657void hip_add2inv2(
void* a,
void* b, real* c,
int* n) {
658 const dim3 nthrds(1024, 1, 1);
659 const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
660 const hipStream_t stream = (hipStream_t)glb_cmd_queue;
662 hipLaunchKernelGGL(add2inv2_kernel<real>, nblcks, nthrds, 0, stream,
663 (real*)a, (real*)b, *c, *n);
664 HIP_CHECK(hipGetLastError());
667void hip_max2(
void* a, real* b,
void* c, real* d,
int* n) {
668 const dim3 nthrds(1024, 1, 1);
669 const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
670 const hipStream_t stream = (hipStream_t)glb_cmd_queue;
672 hipLaunchKernelGGL(max2_kernel<real>, nblcks, nthrds, 0, stream,
673 (real*)a, *b, (real*)c, *d, *n);
674 HIP_CHECK(hipGetLastError());
677void hip_updatebb(
void* bb,
void* dellambda,
void* dely,
void* d,
678 void* mu,
void* y, real* delz,
int* m) {
679 const dim3 nthrds(1024, 1, 1);
680 const dim3 nblcks(((*m + 1) + 1024 - 1) / 1024, 1, 1);
681 const hipStream_t stream = (hipStream_t)glb_cmd_queue;
683 hipLaunchKernelGGL(updatebb_kernel<real>, nblcks, nthrds, 0, stream,
684 (real*)bb, (real*)dellambda, (real*)dely, (real*)d,
685 (real*)mu, (real*)y, *delz, *m);
686 HIP_CHECK(hipGetLastError());
689void hip_updateAA(
void* AA,
void* globaltmp_mm,
void* s,
void* lambda,
690 void* d,
void* mu,
void* y,
void* a,
691 real* zeta, real* z,
int* m) {
692 const dim3 nthrds(1024, 1, 1);
693 const dim3 nblcks(((*m + 1) + 1024 - 1) / 1024, 1, 1);
694 const hipStream_t stream = (hipStream_t)glb_cmd_queue;
696 hipLaunchKernelGGL(updateAA_kernel<real>, nblcks, nthrds, 0, stream,
697 (real*)AA, (real*)globaltmp_mm, (real*)s,
698 (real*)lambda, (real*)d, (real*)mu,
699 (real*)y, (real*)a, *zeta, *z, *m);
700 HIP_CHECK(hipGetLastError());
703void hip_dy(
void* dy,
void* dely,
void* dlambda,
void* d,
704 void* mu,
void* y,
int* n) {
705 const dim3 nthrds(1024, 1, 1);
706 const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
707 const hipStream_t stream = (hipStream_t)glb_cmd_queue;
709 hipLaunchKernelGGL(dy_kernel<real>, nblcks, nthrds, 0, stream,
710 (real*)dy, (real*)dely, (real*)dlambda, (real*)d,
711 (real*)mu, (real*)y, *n);
712 HIP_CHECK(hipGetLastError());