Neko-TOP
A portable framework for high-order spectral element flow toplogy optimization.
Loading...
Searching...
No Matches
mma.hip
1/*
2 Copyright (c) 2021-2025, The Neko Authors
3 All rights reserved.
4
5 Redistribution and use in source and binary forms, with or without
6 modification, are permitted provided that the following conditions
7 are met:
8
9 * Redistributions of source code must retain the above copyright
10 notice, this list of conditions and the following disclaimer.
11
12 * Redistributions in binary form must reproduce the above
13 copyright notice, this list of conditions and the following
14 disclaimer in the documentation and/or other materials provided
15 with the distribution.
16
17 * Neither the name of the authors nor the names of its
18 contributors may be used to endorse or promote products derived
19 from this software without specific prior written permission.
20
21 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22 "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23 LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
24 FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
25 COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
26 INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
27 BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
28 LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
29 CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
30 LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
31 ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
32 POSSIBILITY OF SUCH DAMAGE.
33*/
34
35// System includes
36#include <stdio.h>
37#include <stdlib.h>
38
39// Device includes
40#include <hip/hip_runtime.h>
41
42// Neko includes
43#include <neko/device/device_config.h>
44#include <neko/device/hip/check.h>
45#include <neko/math/bcknd/device/device_mpi_op.h>
46
47// Local includes
48#include "mma_kernel.h"
49
50extern "C" {
51
52int mma_red_s = 0;
53real * mma_bufred = NULL;
54real * mma_bufred_d = NULL;
55
56void delta_1dbeam_hip(void* Delta, real* L_total, real* Le,
57 int* offset, int* n) {
58 const dim3 nthrds(1024, 1, 1);
59 const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
60 hipLaunchKernelGGL(delta_1dbeam_kernel<real>,
61 nblcks, nthrds, 0, (hipStream_t)glb_cmd_queue,
62 (real*)Delta, *L_total, *Le, *offset, *n);
63 HIP_CHECK(hipGetLastError());
64}
65
66void hip_Hess(void* Hess, void* hijx, void* Ljjxinv, int *n, int *m) {
67 const dim3 nthrds(1024, 1, 1);
68 const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
69 const int nb = ((*n) + 1024 - 1) / 1024;
70 const hipStream_t stream = (hipStream_t) glb_cmd_queue;
71 hipStreamSynchronize(stream);
72
73 if (nb > mma_red_s) {
74 mma_red_s = nb;
75 if (mma_bufred != NULL) {
76 HIP_CHECK(hipHostFree(mma_bufred));
77 HIP_CHECK(hipFree(mma_bufred_d));
78 }
79 HIP_CHECK(hipHostMalloc(&mma_bufred, nb * sizeof(real)));
80 HIP_CHECK(hipMalloc(&mma_bufred_d, nb * sizeof(real)));
81 }
82
83 for (int i = 0; i < (*m); i++) {
84 for (int j = 0; j < (*m); j++) {
85 hipLaunchKernelGGL(mmasumHess_kernel<real>, nblcks, nthrds, 0, stream,
86 (real*)hijx, (real*)Ljjxinv, mma_bufred_d, (*n), (*m), i, j);
87 HIP_CHECK(hipGetLastError());
88
89 hipLaunchKernelGGL(mmareduce_kernel<real>, dim3(1), dim3(1024), 0, stream,
90 mma_bufred_d, nb);
91 HIP_CHECK(hipGetLastError());
92
93 hipLaunchKernelGGL(mma_copy_kernel, dim3(1), dim3(1), 0, stream,
94 (real*)Hess, mma_bufred_d, 1, i + j * (*m));
95 HIP_CHECK(hipGetLastError());
96
97 hipStreamSynchronize(stream);
98 }
99 }
100}
101
102void mma_Ljjxinv_hip(void* Ljjxinv, void* pjlambda, void* qjlambda, void* x,
103 void* low, void* upp, void* alpha, void* beta, int* n) {
104 const dim3 nthrds(1024, 1, 1);
105 const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
106 hipLaunchKernelGGL(mma_Ljjxinv_kernel<real>, nblcks, nthrds, 0,
107 (hipStream_t)glb_cmd_queue, (real*)Ljjxinv, (real*)pjlambda, (real*)qjlambda,
108 (real*)x, (real*)low, (real*)upp, (real*)alpha, (real*)beta, *n);
109 HIP_CHECK(hipGetLastError());
110}
111
112void mma_dipsolvesub1_hip(void* x, void* pjlambda, void* qjlambda, void* low,
113 void* upp, void* alpha, void* beta, int* n) {
114 const dim3 nthrds(1024, 1, 1);
115 const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
116 hipLaunchKernelGGL(mma_dipsolvesub1_kernel<real>, nblcks, nthrds, 0,
117 (hipStream_t)glb_cmd_queue, (real*)x, (real*)pjlambda, (real*)qjlambda,
118 (real*)low, (real*)upp, (real*)alpha, (real*)beta, *n);
119 HIP_CHECK(hipGetLastError());
120}
121
122void mattrans_v_mul_hip(void* output, void* pij, void* lambda, int* m, int* n) {
123 const dim3 nthrds(1024, 1, 1);
124 const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
125 hipLaunchKernelGGL(mattrans_v_mul_kernel<real>, nblcks, nthrds, 0,
126 (hipStream_t)glb_cmd_queue, (real*)output, (real*)pij, (real*)lambda, *m, *n);
127 HIP_CHECK(hipGetLastError());
128}
129
130void mma_gensub4_hip(const void* x, const void* low, const void* upp,
131 const void* pij, const void* qij,
132 const int* n, const int* m, void* bi) {
133
134 const int N = *n;
135 const int M = *m;
136
137 const dim3 nthrds(1024, 1, 1);
138 const dim3 nblcks((N + 1023) / 1024, 1, 1);
139 const int nb = (N + 1023) / 1024;
140 const hipStream_t stream = (hipStream_t)glb_cmd_queue;
141
142 if (nb > mma_red_s) {
143 mma_red_s = nb;
144
145 if (mma_bufred != nullptr) {
146 HIP_CHECK(hipFreeHost(mma_bufred));
147 HIP_CHECK(hipFree(mma_bufred_d));
148 }
149
150 HIP_CHECK(hipHostMalloc(&mma_bufred, nb * sizeof(real)));
151 HIP_CHECK(hipMalloc(&mma_bufred_d, nb * sizeof(real)));
152 }
153
154 real* temp;
155 real* bi_d = static_cast<real*>(bi);
156 HIP_CHECK(hipMalloc(&temp, M * N * sizeof(real)));
157
158 hipLaunchKernelGGL(mma_sub4_kernel<real>, nblcks, nthrds, 0, stream,
159 static_cast<const real*>(x),
160 static_cast<const real*>(low),
161 static_cast<const real*>(upp),
162 static_cast<const real*>(pij),
163 static_cast<const real*>(qij),
164 temp, N, M);
165
166 for (int i = 0; i < M; ++i) {
167 hipLaunchKernelGGL(mmasum_kernel<real>, nblcks, nthrds, 0, stream,
168 temp, mma_bufred_d, N, M, i);
169 HIP_CHECK(hipGetLastError());
170
171 hipLaunchKernelGGL(mmareduce_kernel<real>, dim3(1), dim3(1024), 0, stream,
172 mma_bufred_d, nb);
173 HIP_CHECK(hipGetLastError());
174
175 HIP_CHECK(hipMemcpyAsync(
176 bi_d + i, mma_bufred_d, sizeof(real),
177 hipMemcpyDeviceToDevice, stream));
178
179 HIP_CHECK(hipStreamSynchronize(stream));
180 }
181
182 HIP_CHECK(hipFree(temp));
183}
184
185
186void mma_gensub3_hip(void* x, void* df0dx, void* dfdx, void* low,
187 void* upp, void* xmin, void* xmax, void* alpha,
188 void* beta, void* p0j, void* q0j, void* pij,
189 void* qij, int* n, int* m) {
190 const dim3 nthrds(1024, 1, 1);
191 const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
192
193 hipLaunchKernelGGL(mma_sub3_kernel<real>, nblcks, nthrds, 0,
194 (hipStream_t)glb_cmd_queue,
195 (real*)x, (real*)df0dx, (real*)dfdx, (real*)low,
196 (real*)upp, (real*)xmin, (real*)xmax, (real*)alpha,
197 (real*)beta, (real*)p0j, (real*)q0j, (real*)pij,
198 (real*)qij, *n, *m);
199
200 HIP_CHECK(hipGetLastError());
201}
202
203void mma_gensub2_hip(void* low, void* upp, void* x, void* xold1,
204 void* xold2, void* xdiff, real* asydecr,
205 real* asyincr, int* n) {
206 const dim3 nthrds(1024, 1, 1);
207 const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
208
209 hipLaunchKernelGGL(mma_sub2_kernel<real>, nblcks, nthrds, 0,
210 (hipStream_t)glb_cmd_queue,
211 (real*)low, (real*)upp, (real*)x, (real*)xold1,
212 (real*)xold2, (real*)xdiff, *asydecr, *asyincr, *n);
213
214 HIP_CHECK(hipGetLastError());
215}
216
217void mma_gensub1_hip(void* low, void* upp, void* x, void* xmin, void* xmax,
218 real* asyinit, int* n) {
219 const dim3 nthrds(1024, 1, 1);
220 const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
221
222 hipLaunchKernelGGL(mma_sub1_kernel<real>, nblcks, nthrds, 0,
223 (hipStream_t)glb_cmd_queue,
224 (real*)low, (real*)upp, (real*)x, (real*)xmin, (real*)xmax,
225 *asyinit, *n);
226
227 HIP_CHECK(hipGetLastError());
228}
229
230void hip_mma_max(void* xsi, void* x, void* alpha, int* n) {
231 const dim3 nthrds(1024, 1, 1);
232 const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
233
234 hipLaunchKernelGGL(mma_max2_kernel<real>, nblcks, nthrds, 0,
235 (hipStream_t)glb_cmd_queue,
236 (real*)xsi, (real*)x, (real*)alpha, *n);
237
238 HIP_CHECK(hipGetLastError());
239}
240
241void hip_relambda(void* relambda, void* x, void* xupp, void* xlow,
242 void* pij, void* qij, int* n, int* m) {
243 const dim3 nthrds(1024, 1, 1);
244 const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
245 const int nb = nblcks.x;
246 const hipStream_t stream = (hipStream_t)glb_cmd_queue;
247
248 if (nb > mma_red_s) {
249 mma_red_s = nb;
250 if (mma_bufred != NULL) {
251 HIP_CHECK(hipHostFree(mma_bufred));
252 HIP_CHECK(hipFree(mma_bufred_d));
253 }
254 HIP_CHECK(hipHostMalloc(&mma_bufred, nb * sizeof(real)));
255 HIP_CHECK(hipMalloc(&mma_bufred_d, nb * sizeof(real)));
256 }
257
258 real* temp;
259 hipMalloc(&temp, (*n) * (*m) * sizeof(real));
260
261 hipLaunchKernelGGL(relambda_kernel<real>, nblcks, nthrds, 0, stream,
262 temp, (real*)x, (real*)xupp, (real*)xlow,
263 (real*)pij, (real*)qij, *n, *m);
264
265 for (int i = 0; i < (*m); i++) {
266 hipLaunchKernelGGL(mmasum_kernel<real>, nblcks, nthrds, 0, stream,
267 temp, mma_bufred_d, (*n), (*m), i);
268 HIP_CHECK(hipGetLastError());
269
270 hipLaunchKernelGGL(mmareduce_kernel<real>, dim3(1), dim3(1024), 0,
271 stream, mma_bufred_d, nb);
272 HIP_CHECK(hipGetLastError());
273
274 hipLaunchKernelGGL(mma_copy_kernel, dim3(1), dim3(1), 0, stream,
275 (real*)relambda, mma_bufred_d, 1, i);
276 HIP_CHECK(hipGetLastError());
277
278 hipStreamSynchronize(stream);
279 }
280
281 hipFree(temp);
282}
283
284void hip_sub2cons2(void* a, void* b, void* c, void* d, real* e, int* n) {
285 const dim3 nthrds(1024, 1, 1);
286 const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
287
288 hipLaunchKernelGGL(sub2cons2_kernel<real>, nblcks, nthrds, 0,
289 (hipStream_t)glb_cmd_queue,
290 (real*)a, (real*)b, (real*)c, (real*)d, *e, *n);
291
292 HIP_CHECK(hipGetLastError());
293}
294
295real hip_maxval(void* a, int* n) {
296 const dim3 nthrds(1024, 1, 1);
297 const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
298 const int nb = nblcks.x;
299 const hipStream_t stream = (hipStream_t)glb_cmd_queue;
300
301 if (nb > mma_red_s) {
302 mma_red_s = nb;
303 if (mma_bufred != NULL) {
304 HIP_CHECK(hipHostFree(mma_bufred));
305 HIP_CHECK(hipFree(mma_bufred_d));
306 }
307 HIP_CHECK(hipHostMalloc(&mma_bufred, nb * sizeof(real)));
308 HIP_CHECK(hipMalloc(&mma_bufred_d, nb * sizeof(real)));
309 }
310
311 hipLaunchKernelGGL(maxval_kernel<real>, nblcks, nthrds, 0, stream,
312 (real*)a, mma_bufred_d, (*n));
313 HIP_CHECK(hipGetLastError());
314
315 hipLaunchKernelGGL(max_reduce_kernel<real>, dim3(1), dim3(1024), 0, stream,
316 mma_bufred_d, nb);
317 HIP_CHECK(hipGetLastError());
318
319 HIP_CHECK(hipMemcpyAsync(mma_bufred, mma_bufred_d, sizeof(real),
320 hipMemcpyDeviceToHost, stream));
321 hipStreamSynchronize(stream);
322
323 return mma_bufred[0];
324}
325
326
327void hip_delx(void* delx, void* x, void* xlow, void* xupp, void* pij,
328 void* qij, void* p0j, void* q0j, void* alpha, void* beta, void* lambda,
329 real* epsi, int* n, int* m) {
330 const dim3 nthrds(1024, 1, 1);
331 const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
332
333 hipLaunchKernelGGL(delx_kernel<real>, nblcks, nthrds, 0,
334 (hipStream_t)glb_cmd_queue,
335 (real*)delx, (real*)x, (real*)xlow, (real*)xupp, (real*)pij,
336 (real*)qij, (real*)p0j, (real*)q0j, (real*)alpha, (real*)beta,
337 (real*)lambda, *epsi, *n, *m);
338 HIP_CHECK(hipGetLastError());
339}
340
341void hip_GG(void* GG, void* x, void* xlow, void* xupp,
342 void* pij, void* qij, int* n, int* m) {
343 const dim3 nthrds(1024, 1, 1);
344 const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
345
346 hipLaunchKernelGGL(GG_kernel<real>, nblcks, nthrds, 0,
347 (hipStream_t)glb_cmd_queue,
348 (real*)GG, (real*)x, (real*)xlow, (real*)xupp, (real*)pij,
349 (real*)qij, *n, *m);
350 HIP_CHECK(hipGetLastError());
351}
352
353void hip_diagx(void* diagx, void* x, void* xsi, void* xlow, void* xupp,
354 void* p0j, void* q0j, void* pij, void* qij, void* alpha, void* beta,
355 void* eta, void* lambda, int *n, int *m) {
356 const dim3 nthrds(1024, 1, 1);
357 const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
358
359 hipLaunchKernelGGL(diagx_kernel<real>, nblcks, nthrds, 0,
360 (hipStream_t)glb_cmd_queue,
361 (real*)diagx, (real*)x, (real*)xsi, (real*)xlow, (real*)xupp,
362 (real*)p0j, (real*)q0j, (real*)pij, (real*)qij, (real*)alpha,
363 (real*)beta, (real*)eta, (real*)lambda, *n, *m);
364 HIP_CHECK(hipGetLastError());
365}
366
367void hip_bb(void* bb, void* GG, void* delx, void* diagx, int *n, int *m) {
368 const dim3 nthrds(1024, 1, 1);
369 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
370 const int nb = ((*n) + 1024 - 1)/ 1024;
371 const hipStream_t stream = (hipStream_t)glb_cmd_queue;
372
373 hipStreamSynchronize(stream);
374
375 if (nb > mma_red_s) {
376 mma_red_s = nb;
377 if (mma_bufred != NULL) {
378 HIP_CHECK(hipHostFree(mma_bufred));
379 HIP_CHECK(hipFree(mma_bufred_d));
380 }
381 HIP_CHECK(hipHostMalloc(&mma_bufred, nb * sizeof(real)));
382 HIP_CHECK(hipMalloc(&mma_bufred_d, nb * sizeof(real)));
383 }
384
385 for (int i = 0; i < (*m); i++) {
386 hipLaunchKernelGGL(mmasumbb_kernel<real>, nblcks, nthrds, 0, stream,
387 (real*)GG, (real*)delx, (real*)diagx, mma_bufred_d, *n, *m, i);
388 HIP_CHECK(hipGetLastError());
389
390 hipLaunchKernelGGL(mmareduce_kernel<real>, 1, 1024, 0, stream,
391 mma_bufred_d, nb);
392 HIP_CHECK(hipGetLastError());
393
394 hipLaunchKernelGGL(mma_copy_kernel, 1, 1, 0, stream, (real*)bb,
395 mma_bufred_d, 1, i);
396 HIP_CHECK(hipGetLastError());
397
398 hipStreamSynchronize(stream);
399 }
400}
401
402void hip_AA(void* AA, void* GG, void* diagx, int *n, int *m) {
403 const dim3 nthrds(1024, 1, 1);
404 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
405 const int nb = ((*n) + 1024 - 1)/ 1024;
406 const hipStream_t stream = (hipStream_t)glb_cmd_queue;
407
408 hipStreamSynchronize(stream);
409
410 if (nb > mma_red_s) {
411 mma_red_s = nb;
412 if (mma_bufred != NULL) {
413 HIP_CHECK(hipHostFree(mma_bufred));
414 HIP_CHECK(hipFree(mma_bufred_d));
415 }
416 HIP_CHECK(hipHostMalloc(&mma_bufred, nb * sizeof(real)));
417 HIP_CHECK(hipMalloc(&mma_bufred_d, nb * sizeof(real)));
418 }
419
420 for (int i = 0; i < (*m); i++) {
421 for (int j = 0; j < (*m); j++) {
422 hipLaunchKernelGGL(mmasumAA_kernel<real>, nblcks, nthrds, 0, stream,
423 (real*)GG, (real*)diagx, mma_bufred_d, *n, *m, i, j);
424 HIP_CHECK(hipGetLastError());
425
426 hipLaunchKernelGGL(mmareduce_kernel<real>, 1, 1024, 0, stream,
427 mma_bufred_d, nb);
428 HIP_CHECK(hipGetLastError());
429
430 hipLaunchKernelGGL(mma_copy_kernel, 1, 1, 0, stream,
431 (real*)AA, mma_bufred_d, 1, i + j * (*m + 1));
432 HIP_CHECK(hipGetLastError());
433
434 hipStreamSynchronize(stream);
435 }
436 }
437}
438
439void hip_dx(void* dx, void* delx, void* diagx, void* GG, void* dlambda,
440 int* n, int* m) {
441 const dim3 nthrds(1024, 1, 1);
442 const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
443
444 hipLaunchKernelGGL(dx_kernel<real>, nblcks, nthrds, 0,
445 (hipStream_t)glb_cmd_queue,
446 (real*)dx, (real*)delx, (real*)diagx, (real*)GG, (real*)dlambda, *n, *m);
447 HIP_CHECK(hipGetLastError());
448}
449
450void hip_dxsi(void* dxsi, void* xsi, void* dx, void* x,
451 void* alpha, real* epsi, int* n) {
452 const dim3 nthrds(1024, 1, 1);
453 const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
454
455 hipLaunchKernelGGL(dxsi_kernel<real>, nblcks, nthrds, 0,
456 (hipStream_t)glb_cmd_queue,
457 (real*)dxsi, (real*)xsi, (real*)dx, (real*)x, (real*)alpha, *epsi, *n);
458 HIP_CHECK(hipGetLastError());
459}
460
461void hip_deta(void* deta, void* eta, void* dx, void* x,
462 void* beta, real* epsi, int* n) {
463 const dim3 nthrds(1024, 1, 1);
464 const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
465
466 hipLaunchKernelGGL(deta_kernel<real>, nblcks, nthrds, 0,
467 (hipStream_t)glb_cmd_queue,
468 (real*)deta, (real*)eta, (real*)dx, (real*)x, (real*)beta, *epsi, *n);
469 HIP_CHECK(hipGetLastError());
470}
471
472void hip_rex(void* rex, void* x, void* xlow, void* xupp, void* pij,
473 void* p0j, void* qij, void* q0j, void* lambda, void* xsi, void* eta,
474 int* n, int* m) {
475 const dim3 nthrds(1024, 1, 1);
476 const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
477
478 hipLaunchKernelGGL(RexCalculation_kernel<real>, nblcks, nthrds, 0,
479 (hipStream_t)glb_cmd_queue,
480 (real*)rex, (real*)x, (real*)xlow, (real*)xupp, (real*)pij, (real*)p0j,
481 (real*)qij, (real*)q0j, (real*)lambda, (real*)xsi, (real*)eta, *n, *m);
482 HIP_CHECK(hipGetLastError());
483}
484
485void hip_rey(void* rey, void* c, void* d, void* y, void* lambda, void* mu, int* n) {
486 const dim3 nthrds(1024, 1, 1);
487 const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
488
489 hipLaunchKernelGGL(rey_calculation_kernel<real>, nblcks, nthrds, 0,
490 (hipStream_t)glb_cmd_queue,
491 (real*)rey, (real*)c, (real*)d, (real*)y, (real*)lambda, (real*)mu, *n);
492 HIP_CHECK(hipGetLastError());
493}
494
495
496 // a_d = b_d * c_d - d
497void hip_sub2cons(void *a, void *b, void *c, real *d, int *n) {
498 const dim3 nthrds(1024, 1, 1);
499 const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
500 hipLaunchKernelGGL(sub2cons_kernel<real>, nblcks, nthrds, 0,
501 (hipStream_t)glb_cmd_queue,
502 (real *)a, (real *)b, (real *)c, *d, *n);
503 HIP_CHECK(hipGetLastError());
504}
505
506
507// sum(a^2)
508real hip_norm(void* a, int* n) {
509 const dim3 nthrds(1024, 1, 1);
510 const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
511 const int nb = ((*n) + 1024 - 1) / 1024;
512 const hipStream_t stream = (hipStream_t)glb_cmd_queue;
513
514 if (nb > mma_red_s) {
515 mma_red_s = nb;
516 if (mma_bufred != NULL) {
517 HIP_CHECK(hipFreeHost(mma_bufred));
518 HIP_CHECK(hipFree(mma_bufred_d));
519 }
520 HIP_CHECK(hipHostMalloc(&mma_bufred, nb * sizeof(real)));
521 HIP_CHECK(hipMalloc(&mma_bufred_d, nb * sizeof(real)));
522 }
523
524 hipLaunchKernelGGL(norm_kernel<real>, nblcks, nthrds, 0, stream,
525 (real*)a, mma_bufred_d, (*n));
526 HIP_CHECK(hipGetLastError());
527
528 hipLaunchKernelGGL(mmareduce_kernel<real>, dim3(1), dim3(1024), 0, stream,
529 mma_bufred_d, nb);
530 HIP_CHECK(hipGetLastError());
531
532 HIP_CHECK(hipMemcpyAsync(mma_bufred, mma_bufred_d, sizeof(real),
533 hipMemcpyDeviceToHost, stream));
534
535 hipStreamSynchronize(stream);
536
537 return mma_bufred[0];
538}
539
540
541void hip_dely(void* dely, void* c, void* d, void* y, void* lambda,
542 real* epsi, int* n) {
543 const dim3 nthrds(1024, 1, 1);
544 const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
545 hipLaunchKernelGGL(dely_kernel<real>, nblcks, nthrds, 0,
546 (hipStream_t)glb_cmd_queue,
547 (real*)dely, (real*)c, (real*)d, (real*)y, (real*)lambda, *epsi, *n);
548 HIP_CHECK(hipGetLastError());
549}
550
551
552real hip_maxval2(void* a, void* b, real* cons, int* n) {
553 const dim3 nthrds(1024, 1, 1);
554 const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
555 const int nb = ((*n) + 1024 - 1) / 1024;
556 const hipStream_t stream = (hipStream_t)glb_cmd_queue;
557
558 if (nb > mma_red_s) {
559 mma_red_s = nb;
560 if (mma_bufred != NULL) {
561 HIP_CHECK(hipFreeHost(mma_bufred));
562 HIP_CHECK(hipFree(mma_bufred_d));
563 }
564 HIP_CHECK(hipHostMalloc(&mma_bufred, nb * sizeof(real)));
565 HIP_CHECK(hipMalloc(&mma_bufred_d, nb * sizeof(real)));
566 }
567
568 hipLaunchKernelGGL(maxval2_kernel<real>, nblcks, nthrds, 0, stream,
569 (real*)a, (real*)b, mma_bufred_d, *cons, *n);
570 HIP_CHECK(hipGetLastError());
571
572 hipLaunchKernelGGL(max_reduce_kernel<real>, dim3(1), dim3(1024), 0, stream,
573 mma_bufred_d, nb);
574 HIP_CHECK(hipGetLastError());
575
576 HIP_CHECK(hipMemcpyAsync(mma_bufred, mma_bufred_d, sizeof(real),
577 hipMemcpyDeviceToHost, stream));
578
579 hipStreamSynchronize(stream);
580
581 return mma_bufred[0];
582}
583
584
585real hip_maxval3(void* a, void* b, void* c, real* cons, int* n) {
586 const dim3 nthrds(1024, 1, 1);
587 const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
588 const int nb = ((*n) + 1024 - 1) / 1024;
589 const hipStream_t stream = (hipStream_t)glb_cmd_queue;
590
591 if (nb > mma_red_s) {
592 mma_red_s = nb;
593 if (mma_bufred != NULL) {
594 HIP_CHECK(hipFreeHost(mma_bufred));
595 HIP_CHECK(hipFree(mma_bufred_d));
596 }
597 HIP_CHECK(hipHostMalloc(&mma_bufred, nb * sizeof(real)));
598 HIP_CHECK(hipMalloc(&mma_bufred_d, nb * sizeof(real)));
599 }
600
601 hipLaunchKernelGGL(maxval3_kernel<real>, nblcks, nthrds, 0, stream,
602 (real*)a, (real*)b, (real*)c, mma_bufred_d, *cons, *n);
603 hipLaunchKernelGGL(max_reduce_kernel<real>, dim3(1), dim3(1024), 0, stream,
604 mma_bufred_d, nb);
605 HIP_CHECK(hipGetLastError());
606
607 HIP_CHECK(hipMemcpyAsync(mma_bufred, mma_bufred_d, sizeof(real),
608 hipMemcpyDeviceToHost, stream));
609
610 hipStreamSynchronize(stream);
611
612 return mma_bufred[0];
613}
614
615
616void hip_kkt_rex(void* rex, void* df0dx, void* dfdx, void* xsi,
617 void* eta, void* lambda, int* n, int* m) {
618 const dim3 nthrds(1024, 1, 1);
619 const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
620 hipLaunchKernelGGL(kkt_rex_kernel<real>, nblcks, nthrds, 0,
621 (hipStream_t)glb_cmd_queue,
622 (real*)rex, (real*)df0dx, (real*)dfdx, (real*)xsi,
623 (real*)eta, (real*)lambda, *n, *m);
624 HIP_CHECK(hipGetLastError());
625}
626
627
628// a_d = max(b, c * d_d)
629void hip_maxcons(void* a, real* b, real* c, void* d, int* n) {
630 const dim3 nthrds(1024, 1, 1);
631 const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
632 hipLaunchKernelGGL(maxcons_kernel<real>, nblcks, nthrds, 0,
633 (hipStream_t)glb_cmd_queue,
634 (real*)a, *b, *c, (real*)d, *n);
635 HIP_CHECK(hipGetLastError());
636}
637
638
639real hip_lcsc2(void *a, void*b, int *n) {
640 const dim3 nthrds(1024, 1, 1);
641 const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
642 const int nb = ((*n) + 1024 - 1) / 1024;
643 const hipStream_t stream = (hipStream_t)glb_cmd_queue;
644
645 if (nb > mma_red_s) {
646 mma_red_s = nb;
647 if (mma_bufred != NULL) {
648 HIP_CHECK(hipFreeHost(mma_bufred));
649 HIP_CHECK(hipFree(mma_bufred_d));
650 }
651 HIP_CHECK(hipHostMalloc(&mma_bufred, nb * sizeof(real)));
652 HIP_CHECK(hipMalloc(&mma_bufred_d, nb * sizeof(real)));
653 }
654
655 hipLaunchKernelGGL(glsc2_kernel<real>, nblcks, nthrds, 0, stream,
656 (real*)a, (real*)b, mma_bufred_d, (*n));
657 HIP_CHECK(hipGetLastError());
658
659 hipLaunchKernelGGL(mmareduce_kernel<real>, dim3(1), dim3(1024), 0, stream,
660 mma_bufred_d, nb);
661 HIP_CHECK(hipGetLastError());
662
663 HIP_CHECK(hipMemcpyAsync(mma_bufred, mma_bufred_d, sizeof(real),
664 hipMemcpyDeviceToHost, stream));
665
666 hipStreamSynchronize(stream);
667
668 return mma_bufred[0];
669}
670
671
672void hip_mpisum(void *a, int *n) {
673#ifdef HAVE_DEVICE_MPI
674 real* temp = (real*)a;
675 hipStreamSynchronize(stream);
676 device_mpi_allreduce_inplace(temp, *n, sizeof(real), DEVICE_MPI_SUM);
677#endif
678}
679
680
681void hip_add2inv2(void* a, void* b, real* c, int* n) {
682 const dim3 nthrds(1024, 1, 1);
683 const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
684 const hipStream_t stream = (hipStream_t)glb_cmd_queue;
685
686 hipLaunchKernelGGL(add2inv2_kernel<real>, nblcks, nthrds, 0, stream,
687 (real*)a, (real*)b, *c, *n);
688 HIP_CHECK(hipGetLastError());
689}
690
691void hip_max2(void* a, real* b, void* c, real* d, int* n) {
692 const dim3 nthrds(1024, 1, 1);
693 const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
694 const hipStream_t stream = (hipStream_t)glb_cmd_queue;
695
696 hipLaunchKernelGGL(max2_kernel<real>, nblcks, nthrds, 0, stream,
697 (real*)a, *b, (real*)c, *d, *n);
698 HIP_CHECK(hipGetLastError());
699}
700
701void hip_updatebb(void* bb, void* dellambda, void* dely, void* d,
702 void* mu, void* y, real* delz, int* m) {
703 const dim3 nthrds(1024, 1, 1);
704 const dim3 nblcks(((*m + 1) + 1024 - 1) / 1024, 1, 1);
705 const hipStream_t stream = (hipStream_t)glb_cmd_queue;
706
707 hipLaunchKernelGGL(updatebb_kernel<real>, nblcks, nthrds, 0, stream,
708 (real*)bb, (real*)dellambda, (real*)dely, (real*)d,
709 (real*)mu, (real*)y, *delz, *m);
710 HIP_CHECK(hipGetLastError());
711}
712
713void hip_updateAA(void* AA, void* globaltmp_mm, void* s, void* lambda,
714 void* d, void* mu, void* y, void* a,
715 real* zeta, real* z, int* m) {
716 const dim3 nthrds(1024, 1, 1);
717 const dim3 nblcks(((*m + 1) + 1024 - 1) / 1024, 1, 1);
718 const hipStream_t stream = (hipStream_t)glb_cmd_queue;
719
720 hipLaunchKernelGGL(updateAA_kernel<real>, nblcks, nthrds, 0, stream,
721 (real*)AA, (real*)globaltmp_mm, (real*)s,
722 (real*)lambda, (real*)d, (real*)mu,
723 (real*)y, (real*)a, *zeta, *z, *m);
724 HIP_CHECK(hipGetLastError());
725}
726
727void hip_dy(void* dy, void* dely, void* dlambda, void* d,
728 void* mu, void* y, int* n) {
729 const dim3 nthrds(1024, 1, 1);
730 const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
731 const hipStream_t stream = (hipStream_t)glb_cmd_queue;
732
733 hipLaunchKernelGGL(dy_kernel<real>, nblcks, nthrds, 0, stream,
734 (real*)dy, (real*)dely, (real*)dlambda, (real*)d,
735 (real*)mu, (real*)y, *n);
736 HIP_CHECK(hipGetLastError());
737}
738}