Neko-TOP
A portable framework for high-order spectral element flow topology optimization.
mma.cu
/*
 Copyright (c) 2021-2025, The Neko Authors
 All rights reserved.

 Redistribution and use in source and binary forms, with or without
 modification, are permitted provided that the following conditions
 are met:

   * Redistributions of source code must retain the above copyright
     notice, this list of conditions and the following disclaimer.

   * Redistributions in binary form must reproduce the above
     copyright notice, this list of conditions and the following
     disclaimer in the documentation and/or other materials provided
     with the distribution.

   * Neither the name of the authors nor the names of its
     contributors may be used to endorse or promote products derived
     from this software without specific prior written permission.

 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
 COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
 ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 POSSIBILITY OF SUCH DAMAGE.
*/

// System includes
#include <stdio.h>
#include <stdlib.h>

// Device includes
#include <cuda_runtime.h>

// Neko includes
#include <neko/device/device_config.h>
#include <neko/device/cuda/check.h>
#include <neko/math/bcknd/device/device_mpi_reduce.h>
#include <neko/math/bcknd/device/device_mpi_op.h>

// Local includes
#include "mma_kernel.h"

extern "C" {

  // Scratch buffers for the block-wise reductions below. They are grown
  // on demand (never shrunk) and reused across calls.
  int mma_red_s = 0;
  real * mma_bufred = NULL;
  real * mma_bufred_d = NULL;

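  /*
   * Every reduction wrapper below repeats the same grow-on-demand logic
   * for these buffers. A consolidated helper along the lines of the
   * sketch below could replace the repeated blocks; it is illustrative
   * only and not part of the original interface.
   *
   *   static void mma_ensure_reduction_buffers(int nb) {
   *     if (nb > mma_red_s) {
   *       mma_red_s = nb;
   *       if (mma_bufred != NULL) {
   *         CUDA_CHECK(cudaFreeHost(mma_bufred));
   *         CUDA_CHECK(cudaFree(mma_bufred_d));
   *       }
   *       CUDA_CHECK(cudaMallocHost(&mma_bufred, nb * sizeof(real)));
   *       CUDA_CHECK(cudaMalloc(&mma_bufred_d, nb * sizeof(real)));
   *     }
   *   }
   */
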
  void delta_1dbeam_cuda(void* Delta, real* L_total, real* Le,
                         int* offset, int* n) {
    const dim3 nthrds(1024, 1, 1);
    const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
    delta_1dbeam_kernel<real><<<nblcks, nthrds, 0,
        (cudaStream_t) glb_cmd_queue>>>
      ((real*) Delta, *L_total, *Le, *offset, *n);
    CUDA_CHECK(cudaGetLastError());
  }

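  // Assembles the m x m matrix Hess entry-by-entry: for each (i, j) pair a
  // block-wise partial sum over the n design variables is computed on the
  // device (mmasumHess_kernel), reduced to a single value (mmareduce_kernel)
  // and copied into Hess(i + j*m).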
  void cuda_Hess(void* Hess, void* hijx, void* Ljjxinv, int* n, int* m) {
    const dim3 nthrds(1024, 1, 1);
    const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
    const int nb = ((*n) + 1024 - 1) / 1024;
    const cudaStream_t stream = (cudaStream_t) glb_cmd_queue;
    cudaStreamSynchronize(stream);

    if (nb > mma_red_s) {
      mma_red_s = nb;
      if (mma_bufred != NULL) {
        CUDA_CHECK(cudaFreeHost(mma_bufred));
        CUDA_CHECK(cudaFree(mma_bufred_d));
      }
      CUDA_CHECK(cudaMallocHost(&mma_bufred, nb * sizeof(real)));
      CUDA_CHECK(cudaMalloc(&mma_bufred_d, nb * sizeof(real)));
    }

    for (int i = 0; i < (*m); i++) {
      for (int j = 0; j < (*m); j++) {
        mmasumHess_kernel<real><<<nblcks, nthrds, 0, stream>>>
          ((real*) hijx, (real*) Ljjxinv, mma_bufred_d, (*n), (*m), i, j);
        CUDA_CHECK(cudaGetLastError());
        mmareduce_kernel<real><<<1, 1024, 0, stream>>>(mma_bufred_d, nb);
        CUDA_CHECK(cudaGetLastError());
        mma_copy_kernel<<<1, 1, 0, stream>>>((real*) Hess, mma_bufred_d, 1,
                                             i + j * (*m));
        CUDA_CHECK(cudaGetLastError());
        cudaStreamSynchronize(stream);
      }
    }
  }

  void mma_Ljjxinv_cuda(void* Ljjxinv, void* pjlambda, void* qjlambda,
                        void* x, void* low, void* upp, void* alpha,
                        void* beta, int* n) {
    const dim3 nthrds(1024, 1, 1);
    const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
    mma_Ljjxinv_kernel<real><<<nblcks, nthrds, 0,
        (cudaStream_t) glb_cmd_queue>>>
      ((real*) Ljjxinv, (real*) pjlambda, (real*) qjlambda, (real*) x,
       (real*) low, (real*) upp, (real*) alpha, (real*) beta, *n);
    CUDA_CHECK(cudaGetLastError());
  }

  void mma_dipsolvesub1_cuda(void* x, void* pjlambda, void* qjlambda,
                             void* low, void* upp, void* alpha, void* beta,
                             int* n) {
    const dim3 nthrds(1024, 1, 1);
    const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
    mma_dipsolvesub1_kernel<real><<<nblcks, nthrds, 0,
        (cudaStream_t) glb_cmd_queue>>>
      ((real*) x, (real*) pjlambda, (real*) qjlambda, (real*) low,
       (real*) upp, (real*) alpha, (real*) beta, *n);
    CUDA_CHECK(cudaGetLastError());
  }

  void mattrans_v_mul_cuda(void* output, void* pij, void* lambda,
                           int* m, int* n) {
    const dim3 nthrds(1024, 1, 1);
    const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
    mattrans_v_mul_kernel<real><<<nblcks, nthrds, 0,
        (cudaStream_t) glb_cmd_queue>>>
      ((real*) output, (real*) pij, (real*) lambda, *m, *n);
    CUDA_CHECK(cudaGetLastError());
  }

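  // Computes the m entries of bi for the MMA subproblem: mma_sub4_kernel
  // fills an n x m work array, and each column is then summed with the
  // two-stage reduction and written to bi(i) on the device.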
  void mma_gensub4_cuda(const void* x, const void* low, const void* upp,
                        const void* pij, const void* qij,
                        const int* n, const int* m, void* bi) {

    const int N = *n;
    const int M = *m;

    const dim3 nthrds(1024, 1, 1);
    const dim3 nblcks((N + 1023) / 1024, 1, 1);
    const int nb = (N + 1023) / 1024;
    const cudaStream_t stream = (cudaStream_t) glb_cmd_queue;

    if (nb > mma_red_s) {
      mma_red_s = nb;

      if (mma_bufred != NULL) {
        CUDA_CHECK(cudaFreeHost(mma_bufred));
        CUDA_CHECK(cudaFree(mma_bufred_d));
      }

      CUDA_CHECK(cudaMallocHost(&mma_bufred, nb * sizeof(real)));
      CUDA_CHECK(cudaMalloc(&mma_bufred_d, nb * sizeof(real)));
    }

    real* temp;
    real* bi_d = static_cast<real*>(bi);
    CUDA_CHECK(cudaMalloc(&temp, M * N * sizeof(real)));

    mma_sub4_kernel<real><<<nblcks, nthrds, 0, stream>>>(
        static_cast<const real*>(x),
        static_cast<const real*>(low),
        static_cast<const real*>(upp),
        static_cast<const real*>(pij),
        static_cast<const real*>(qij),
        temp, N, M);
    CUDA_CHECK(cudaGetLastError());

    for (int i = 0; i < M; ++i) {
      mmasum_kernel<real><<<nblcks, nthrds, 0, stream>>>(
          temp, mma_bufred_d, N, M, i);
      CUDA_CHECK(cudaGetLastError());

      mmareduce_kernel<real><<<1, 1024, 0, stream>>>(mma_bufred_d, nb);
      CUDA_CHECK(cudaGetLastError());

      CUDA_CHECK(cudaMemcpyAsync(
          bi_d + i, mma_bufred_d, sizeof(real),
          cudaMemcpyDeviceToDevice, stream));

      CUDA_CHECK(cudaStreamSynchronize(stream));
    }

    CUDA_CHECK(cudaFree(temp));
  }

  void mma_gensub3_cuda(void* x, void* df0dx, void* dfdx, void* low,
                        void* upp, void* xmin, void* xmax, void* alpha,
                        void* beta, void* p0j, void* q0j, void* pij,
                        void* qij, int* n, int* m) {
    const dim3 nthrds(1024, 1, 1);
    const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);

    mma_sub3_kernel<real><<<nblcks, nthrds, 0,
        (cudaStream_t) glb_cmd_queue>>>(
        (real*) x, (real*) df0dx, (real*) dfdx, (real*) low,
        (real*) upp, (real*) xmin, (real*) xmax, (real*) alpha,
        (real*) beta, (real*) p0j, (real*) q0j, (real*) pij,
        (real*) qij, *n, *m);

    CUDA_CHECK(cudaGetLastError());
  }

  void mma_gensub2_cuda(void* low, void* upp, void* x, void* xold1,
                        void* xold2, void* xdiff, real* asydecr,
                        real* asyincr, int* n) {
    const dim3 nthrds(1024, 1, 1);
    const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);

    mma_sub2_kernel<real><<<nblcks, nthrds, 0,
        (cudaStream_t) glb_cmd_queue>>>(
        (real*) low, (real*) upp, (real*) x, (real*) xold1,
        (real*) xold2, (real*) xdiff, *asydecr, *asyincr, *n);

    CUDA_CHECK(cudaGetLastError());
  }

  void mma_gensub1_cuda(void* low, void* upp, void* x, void* xmin,
                        void* xmax, real* asyinit, int* n) {
    const dim3 nthrds(1024, 1, 1);
    const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
    mma_sub1_kernel<real><<<nblcks, nthrds, 0,
        (cudaStream_t) glb_cmd_queue>>>
      ((real*) low, (real*) upp, (real*) x, (real*) xmin, (real*) xmax,
       *asyinit, *n);
    CUDA_CHECK(cudaGetLastError());
  }

  void cuda_mma_max(void* xsi, void* x, void* alpha, int* n) {
    const dim3 nthrds(1024, 1, 1);
    const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);

    mma_max2_kernel<real><<<nblcks, nthrds, 0,
        (cudaStream_t) glb_cmd_queue>>>
      ((real*) xsi, (real*) x, (real*) alpha, *n);
    CUDA_CHECK(cudaGetLastError());
  }

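  // Accumulates the m residuals relambda: relambda_kernel evaluates the
  // per-variable contributions into an n x m work array, and each column is
  // summed with the two-stage reduction before being copied into relambda(i).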
  void cuda_relambda(void* relambda, void* x, void* xupp, void* xlow,
                     void* pij, void* qij, int* n, int* m) {
    const dim3 nthrds(1024, 1, 1);
    const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
    const int nb = ((*n) + 1024 - 1) / 1024;
    const cudaStream_t stream = (cudaStream_t) glb_cmd_queue;

    if (nb > mma_red_s) {
      mma_red_s = nb;
      if (mma_bufred != NULL) {
        CUDA_CHECK(cudaFreeHost(mma_bufred));
        CUDA_CHECK(cudaFree(mma_bufred_d));
      }
      CUDA_CHECK(cudaMallocHost(&mma_bufred, nb * sizeof(real)));
      CUDA_CHECK(cudaMalloc(&mma_bufred_d, nb * sizeof(real)));
    }

    real* temp;
    CUDA_CHECK(cudaMalloc(&temp, (*n) * (*m) * sizeof(real)));
    relambda_kernel<real><<<nblcks, nthrds, 0, stream>>>
      (temp, (real*) x, (real*) xupp, (real*) xlow, (real*) pij,
       (real*) qij, *n, *m);
    CUDA_CHECK(cudaGetLastError());

    for (int i = 0; i < (*m); i++) {
      mmasum_kernel<real><<<nblcks, nthrds, 0, stream>>>
        (temp, mma_bufred_d, (*n), (*m), i);
      CUDA_CHECK(cudaGetLastError());
      mmareduce_kernel<real><<<1, 1024, 0, stream>>>(mma_bufred_d, nb);
      CUDA_CHECK(cudaGetLastError());
      mma_copy_kernel<<<1, 1, 0, stream>>>((real*) relambda, mma_bufred_d,
                                           1, i);
      CUDA_CHECK(cudaGetLastError());
      cudaStreamSynchronize(stream);
    }
    CUDA_CHECK(cudaFree(temp));
  }

  void cuda_sub2cons2(void* a, void* b, void* c, void* d, real* e, int* n) {

    const dim3 nthrds(1024, 1, 1);
    const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);

    sub2cons2_kernel<real><<<nblcks, nthrds, 0,
        (cudaStream_t) glb_cmd_queue>>>(
        (real*) a, (real*) b, (real*) c, (real*) d, *e, *n);
    CUDA_CHECK(cudaGetLastError());
  }

  // Max abs value of the input
  real cuda_maxval(void* a, int* n) {

    const dim3 nthrds(1024, 1, 1);
    const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
    const int nb = ((*n) + 1024 - 1) / 1024;
    const cudaStream_t stream = (cudaStream_t) glb_cmd_queue;

    if (nb > mma_red_s) {
      mma_red_s = nb;
      if (mma_bufred != NULL) {
        CUDA_CHECK(cudaFreeHost(mma_bufred));
        CUDA_CHECK(cudaFree(mma_bufred_d));
      }
      CUDA_CHECK(cudaMallocHost(&mma_bufred, nb * sizeof(real)));
      CUDA_CHECK(cudaMalloc(&mma_bufred_d, nb * sizeof(real)));
    }

    maxval_kernel<real><<<nblcks, nthrds, 0, stream>>>(
        (real*) a, mma_bufred_d, (*n));
    CUDA_CHECK(cudaGetLastError());

    max_reduce_kernel<real><<<1, 1024, 0, stream>>>(
        mma_bufred_d, nb);
    CUDA_CHECK(cudaGetLastError());

    CUDA_CHECK(cudaMemcpyAsync(mma_bufred, mma_bufred_d, sizeof(real),
                               cudaMemcpyDeviceToHost, stream));
    cudaStreamSynchronize(stream);

    return mma_bufred[0];
  }

  void cuda_delx(void* delx, void* x, void* xlow, void* xupp, void* pij,
                 void* qij, void* p0j, void* q0j, void* alpha, void* beta,
                 void* lambda, real* epsi, int* n, int* m) {

    const dim3 nthrds(1024, 1, 1);
    const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);

    delx_kernel<real><<<nblcks, nthrds, 0, (cudaStream_t) glb_cmd_queue>>>(
        (real*) delx, (real*) x, (real*) xlow, (real*) xupp, (real*) pij,
        (real*) qij, (real*) p0j, (real*) q0j, (real*) alpha, (real*) beta,
        (real*) lambda, *epsi, *n, *m);
    CUDA_CHECK(cudaGetLastError());
  }

  void cuda_GG(void* GG, void* x, void* xlow, void* xupp,
               void* pij, void* qij, int* n, int* m) {
    const dim3 nthrds(1024, 1, 1);
    const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
    GG_kernel<real><<<nblcks, nthrds, 0, (cudaStream_t) glb_cmd_queue>>>
      ((real*) GG, (real*) x, (real*) xlow, (real*) xupp,
       (real*) pij, (real*) qij, *n, *m);
    CUDA_CHECK(cudaGetLastError());
  }

  void cuda_diagx(void* diagx, void* x, void* xsi, void* xlow, void* xupp,
                  void* p0j, void* q0j, void* pij, void* qij, void* alpha,
                  void* beta, void* eta, void* lambda, int* n, int* m) {
    const dim3 nthrds(1024, 1, 1);
    const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
    diagx_kernel<real><<<nblcks, nthrds, 0, (cudaStream_t) glb_cmd_queue>>>
      ((real*) diagx, (real*) x, (real*) xsi, (real*) xlow,
       (real*) xupp, (real*) p0j, (real*) q0j, (real*) pij, (real*) qij,
       (real*) alpha, (real*) beta, (real*) eta, (real*) lambda, *n, *m);
    CUDA_CHECK(cudaGetLastError());
  }

  void cuda_bb(void* bb, void* GG, void* delx, void* diagx, int* n, int* m) {
    const dim3 nthrds(1024, 1, 1);
    const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
    const int nb = ((*n) + 1024 - 1) / 1024;
    const cudaStream_t stream = (cudaStream_t) glb_cmd_queue;
    cudaStreamSynchronize(stream);

    if (nb > mma_red_s) {
      mma_red_s = nb;
      if (mma_bufred != NULL) {
        CUDA_CHECK(cudaFreeHost(mma_bufred));
        CUDA_CHECK(cudaFree(mma_bufred_d));
      }
      CUDA_CHECK(cudaMallocHost(&mma_bufred, nb * sizeof(real)));
      CUDA_CHECK(cudaMalloc(&mma_bufred_d, nb * sizeof(real)));
    }

    for (int i = 0; i < (*m); i++) {
      mmasumbb_kernel<real><<<nblcks, nthrds, 0, stream>>>
        ((real*) GG, (real*) delx, (real*) diagx, mma_bufred_d,
         (*n), (*m), i);
      CUDA_CHECK(cudaGetLastError());
      mmareduce_kernel<real><<<1, 1024, 0, stream>>>(mma_bufred_d, nb);
      CUDA_CHECK(cudaGetLastError());
      mma_copy_kernel<<<1, 1, 0, stream>>>((real*) bb, mma_bufred_d, 1, i);
      CUDA_CHECK(cudaGetLastError());
      cudaStreamSynchronize(stream);
    }
  }

  void cuda_AA(void* AA, void* GG, void* diagx, int* n, int* m) {
    const dim3 nthrds(1024, 1, 1);
    const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
    const int nb = ((*n) + 1024 - 1) / 1024;
    const cudaStream_t stream = (cudaStream_t) glb_cmd_queue;
    cudaStreamSynchronize(stream);

    if (nb > mma_red_s) {
      mma_red_s = nb;
      if (mma_bufred != NULL) {
        CUDA_CHECK(cudaFreeHost(mma_bufred));
        CUDA_CHECK(cudaFree(mma_bufred_d));
      }
      CUDA_CHECK(cudaMallocHost(&mma_bufred, nb * sizeof(real)));
      CUDA_CHECK(cudaMalloc(&mma_bufred_d, nb * sizeof(real)));
    }

    for (int i = 0; i < (*m); i++) {
      for (int j = 0; j < (*m); j++) {
        mmasumAA_kernel<real><<<nblcks, nthrds, 0, stream>>>
          ((real*) GG, (real*) diagx, mma_bufred_d, (*n), (*m), i, j);
        CUDA_CHECK(cudaGetLastError());
        mmareduce_kernel<real><<<1, 1024, 0, stream>>>(mma_bufred_d, nb);
        CUDA_CHECK(cudaGetLastError());
        mma_copy_kernel<<<1, 1, 0, stream>>>((real*) AA, mma_bufred_d, 1,
                                             i + j * (*m + 1));
        CUDA_CHECK(cudaGetLastError());
        cudaStreamSynchronize(stream);
      }
    }
  }

  void cuda_dx(void* dx, void* delx, void* diagx, void* GG, void* dlambda,
               int* n, int* m) {
    const dim3 nthrds(1024, 1, 1);
    const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
    dx_kernel<real><<<nblcks, nthrds, 0, (cudaStream_t) glb_cmd_queue>>>
      ((real*) dx, (real*) delx, (real*) diagx, (real*) GG, (real*) dlambda,
       *n, *m);
    CUDA_CHECK(cudaGetLastError());
  }

  void cuda_dxsi(void* dxsi, void* xsi, void* dx, void* x,
                 void* alpha, real* epsi, int* n) {
    const dim3 nthrds(1024, 1, 1);
    const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
    dxsi_kernel<real><<<nblcks, nthrds, 0, (cudaStream_t) glb_cmd_queue>>>
      ((real*) dxsi, (real*) xsi, (real*) dx, (real*) x,
       (real*) alpha, *epsi, *n);
    CUDA_CHECK(cudaGetLastError());
  }

  void cuda_deta(void* deta, void* eta, void* dx, void* x,
                 void* beta, real* epsi, int* n) {
    const dim3 nthrds(1024, 1, 1);
    const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
    deta_kernel<real><<<nblcks, nthrds, 0, (cudaStream_t) glb_cmd_queue>>>
      ((real*) deta, (real*) eta, (real*) dx, (real*) x,
       (real*) beta, *epsi, *n);
    CUDA_CHECK(cudaGetLastError());
  }

  void cuda_rex(void* rex, void* x, void* xlow, void* xupp, void* pij,
                void* p0j, void* qij, void* q0j, void* lambda, void* xsi,
                void* eta, int* n, int* m) {
    const dim3 nthrds(1024, 1, 1);
    const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
    RexCalculation_kernel<real><<<nblcks, nthrds, 0,
        (cudaStream_t) glb_cmd_queue>>>
      ((real*) rex, (real*) x, (real*) xlow, (real*) xupp, (real*) pij,
       (real*) p0j, (real*) qij, (real*) q0j, (real*) lambda, (real*) xsi,
       (real*) eta, *n, *m);
    CUDA_CHECK(cudaGetLastError());
  }

  void cuda_rey(void* rey, void* c, void* d, void* y, void* lambda,
                void* mu, int* n) {
    const dim3 nthrds(1024, 1, 1);
    const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
    rey_calculation_kernel<real><<<nblcks, nthrds, 0,
        (cudaStream_t) glb_cmd_queue>>>
      ((real*) rey, (real*) c, (real*) d, (real*) y, (real*) lambda,
       (real*) mu, *n);
    CUDA_CHECK(cudaGetLastError());
  }

  void cuda_sub2cons(void* a, void* b, void* c, real* d, int* n) {
    const dim3 nthrds(1024, 1, 1);
    const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
    sub2cons_kernel<real><<<nblcks, nthrds, 0,
        (cudaStream_t) glb_cmd_queue>>>
      ((real*) a, (real*) b, (real*) c, *d, *n);
    CUDA_CHECK(cudaGetLastError());
  }

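  // Two-stage reduction of the per-entry quantity computed by norm_kernel
  // over the local n entries of a; the reduced value is copied back to the
  // host and returned for this rank only.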
  real cuda_norm(void* a, int* n) {
    const dim3 nthrds(1024, 1, 1);
    const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
    const int nb = ((*n) + 1024 - 1) / 1024;
    const cudaStream_t stream = (cudaStream_t) glb_cmd_queue;

    if (nb > mma_red_s) {
      mma_red_s = nb;
      if (mma_bufred != NULL) {
        CUDA_CHECK(cudaFreeHost(mma_bufred));
        CUDA_CHECK(cudaFree(mma_bufred_d));
      }
      CUDA_CHECK(cudaMallocHost(&mma_bufred, nb * sizeof(real)));
      CUDA_CHECK(cudaMalloc(&mma_bufred_d, nb * sizeof(real)));
    }

    norm_kernel<real><<<nblcks, nthrds, 0, stream>>>
      ((real*) a, mma_bufred_d, (*n));
    CUDA_CHECK(cudaGetLastError());
    mmareduce_kernel<real><<<1, 1024, 0, stream>>>(mma_bufred_d, nb);
    CUDA_CHECK(cudaGetLastError());
    CUDA_CHECK(cudaMemcpyAsync(mma_bufred, mma_bufred_d, sizeof(real),
                               cudaMemcpyDeviceToHost, stream));
    cudaStreamSynchronize(stream);
    return mma_bufred[0];
  }

  void cuda_dely(void* dely, void* c, void* d, void* y, void* lambda,
                 real* epsi, int* n) {
    const dim3 nthrds(1024, 1, 1);
    const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
    dely_kernel<real><<<nblcks, nthrds, 0, (cudaStream_t) glb_cmd_queue>>>
      ((real*) dely, (real*) c, (real*) d, (real*) y, (real*) lambda,
       *epsi, *n);
    CUDA_CHECK(cudaGetLastError());
  }

  real cuda_maxval2(void* a, void* b, real* cons, int* n) {
    const dim3 nthrds(1024, 1, 1);
    const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
    const int nb = ((*n) + 1024 - 1) / 1024;
    const cudaStream_t stream = (cudaStream_t) glb_cmd_queue;

    if (nb > mma_red_s) {
      mma_red_s = nb;
      if (mma_bufred != NULL) {
        CUDA_CHECK(cudaFreeHost(mma_bufred));
        CUDA_CHECK(cudaFree(mma_bufred_d));
      }
      CUDA_CHECK(cudaMallocHost(&mma_bufred, nb * sizeof(real)));
      CUDA_CHECK(cudaMalloc(&mma_bufred_d, nb * sizeof(real)));
    }

    maxval2_kernel<real><<<nblcks, nthrds, 0, stream>>>
      ((real*) a, (real*) b, mma_bufred_d, *cons, *n);
    CUDA_CHECK(cudaGetLastError());
    max_reduce_kernel<real><<<1, 1024, 0, stream>>>(mma_bufred_d, nb);
    CUDA_CHECK(cudaGetLastError());
    CUDA_CHECK(cudaMemcpyAsync(mma_bufred, mma_bufred_d, sizeof(real),
                               cudaMemcpyDeviceToHost, stream));
    cudaStreamSynchronize(stream);
    return mma_bufred[0];
  }

  real cuda_maxval3(void* a, void* b, void* c, real* cons, int* n) {
    const dim3 nthrds(1024, 1, 1);
    const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
    const int nb = ((*n) + 1024 - 1) / 1024;
    const cudaStream_t stream = (cudaStream_t) glb_cmd_queue;

    if (nb > mma_red_s) {
      mma_red_s = nb;
      if (mma_bufred != NULL) {
        CUDA_CHECK(cudaFreeHost(mma_bufred));
        CUDA_CHECK(cudaFree(mma_bufred_d));
      }
      CUDA_CHECK(cudaMallocHost(&mma_bufred, nb * sizeof(real)));
      CUDA_CHECK(cudaMalloc(&mma_bufred_d, nb * sizeof(real)));
    }

    maxval3_kernel<real><<<nblcks, nthrds, 0, stream>>>
      ((real*) a, (real*) b, (real*) c, mma_bufred_d, *cons, *n);
    CUDA_CHECK(cudaGetLastError());
    max_reduce_kernel<real><<<1, 1024, 0, stream>>>(mma_bufred_d, nb);
    CUDA_CHECK(cudaGetLastError());
    CUDA_CHECK(cudaMemcpyAsync(mma_bufred, mma_bufred_d, sizeof(real),
                               cudaMemcpyDeviceToHost, stream));
    cudaStreamSynchronize(stream);
    return mma_bufred[0];
  }

  void cuda_kkt_rex(void* rex, void* df0dx, void* dfdx, void* xsi,
                    void* eta, void* lambda, int* n, int* m) {
    const dim3 nthrds(1024, 1, 1);
    const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
    kkt_rex_kernel<real><<<nblcks, nthrds, 0,
        (cudaStream_t) glb_cmd_queue>>>
      ((real*) rex, (real*) df0dx, (real*) dfdx, (real*) xsi,
       (real*) eta, (real*) lambda, *n, *m);
    CUDA_CHECK(cudaGetLastError());
  }

  void cuda_maxcons(void* a, real* b, real* c, void* d, int* n) {
    const dim3 nthrds(1024, 1, 1);
    const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
    maxcons_kernel<real><<<nblcks, nthrds, 0,
        (cudaStream_t) glb_cmd_queue>>>
      ((real*) a, *b, *c, (real*) d, *n);
    CUDA_CHECK(cudaGetLastError());
  }

  real cuda_lcsc2(void* a, void* b, int* n) {
    const dim3 nthrds(1024, 1, 1);
    const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
    const int nb = ((*n) + 1024 - 1) / 1024;
    const cudaStream_t stream = (cudaStream_t) glb_cmd_queue;

    if (nb > mma_red_s) {
      mma_red_s = nb;
      if (mma_bufred != NULL) {
        CUDA_CHECK(cudaFreeHost(mma_bufred));
        CUDA_CHECK(cudaFree(mma_bufred_d));
      }
      CUDA_CHECK(cudaMallocHost(&mma_bufred, nb * sizeof(real)));
      CUDA_CHECK(cudaMalloc(&mma_bufred_d, nb * sizeof(real)));
    }

    glsc2_kernel<real><<<nblcks, nthrds, 0, stream>>>
      ((real*) a, (real*) b, mma_bufred_d, (*n));
    CUDA_CHECK(cudaGetLastError());
    mmareduce_kernel<real><<<1, 1024, 0, stream>>>(mma_bufred_d, nb);
    CUDA_CHECK(cudaGetLastError());
    CUDA_CHECK(cudaMemcpyAsync(mma_bufred, mma_bufred_d, sizeof(real),
                               cudaMemcpyDeviceToHost, stream));
    cudaStreamSynchronize(stream);
    return mma_bufred[0];
  }

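  // In-place sum over all MPI ranks of the n entries in a (device pointer);
  // a no-op unless Neko was built with device-aware MPI (HAVE_DEVICE_MPI).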
  void cuda_mpisum(void* a, int* n) {
#ifdef HAVE_DEVICE_MPI
    const cudaStream_t stream = (cudaStream_t) glb_cmd_queue;
    real* temp = (real*) a;
    cudaStreamSynchronize(stream);
    device_mpi_allreduce_inplace(temp, *n, sizeof(real), DEVICE_MPI_SUM);
#endif
  }

  void cuda_add2inv2(void* a, void* b, real* c, int* n) {
    const dim3 nthrds(1024, 1, 1);
    const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
    add2inv2_kernel<real><<<nblcks, nthrds, 0,
        (cudaStream_t) glb_cmd_queue>>>
      ((real*) a, (real*) b, *c, *n);
    CUDA_CHECK(cudaGetLastError());
  }

  void cuda_max2(void* a, real* b, void* c, real* d, int* n) {
    const dim3 nthrds(1024, 1, 1);
    const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
    max2_kernel<real><<<nblcks, nthrds, 0,
        (cudaStream_t) glb_cmd_queue>>>
      ((real*) a, *b, (real*) c, *d, *n);
    CUDA_CHECK(cudaGetLastError());
  }

  void cuda_updatebb(void* bb, void* dellambda, void* dely, void* d,
                     void* mu, void* y, real* delz, int* m) {
    const dim3 nthrds(1024, 1, 1);
    const dim3 nblcks(((*m + 1) + 1024 - 1) / 1024, 1, 1);
    updatebb_kernel<real><<<nblcks, nthrds, 0,
        (cudaStream_t) glb_cmd_queue>>>
      ((real*) bb, (real*) dellambda, (real*) dely, (real*) d,
       (real*) mu, (real*) y, *delz, *m);
    CUDA_CHECK(cudaGetLastError());
  }

  void cuda_updateAA(void* AA, void* globaltmp_mm, void* s, void* lambda,
                     void* d, void* mu, void* y, void* a, real* zeta,
                     real* z, int* m) {
    const dim3 nthrds(1024, 1, 1);
    const dim3 nblcks(((*m + 1) + 1024 - 1) / 1024, 1, 1);
    updateAA_kernel<real><<<nblcks, nthrds, 0,
        (cudaStream_t) glb_cmd_queue>>>
      ((real*) AA, (real*) globaltmp_mm, (real*) s, (real*) lambda,
       (real*) d, (real*) mu, (real*) y, (real*) a, *zeta, *z, *m);
    CUDA_CHECK(cudaGetLastError());
  }

  void cuda_dy(void* dy, void* dely, void* dlambda, void* d, void* mu,
               void* y, int* n) {
    const dim3 nthrds(1024, 1, 1);
    const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
    dy_kernel<real><<<nblcks, nthrds, 0,
        (cudaStream_t) glb_cmd_queue>>>
      ((real*) dy, (real*) dely, (real*) dlambda, (real*) d,
       (real*) mu, (real*) y, *n);
    CUDA_CHECK(cudaGetLastError());
  }

} /* extern "C" */
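
For reference, the wrappers above repeatedly use a two-stage reduction: one kernel writes a partial result per thread block into mma_bufred_d, and a second single-block kernel folds those partials into one value. The following stand-alone sketch illustrates that pattern; the kernel names and the plain double precision `real` are illustrative only and are not the kernels declared in mma_kernel.h.

// reduction_sketch.cu -- illustrative two-stage block-sum reduction
#include <cstdio>
#include <cuda_runtime.h>

typedef double real;  // assumption: Neko's configured precision may differ

__global__ void partial_sum_kernel(const real* a, real* buf, int n) {
  __shared__ real s[1024];
  const int tid = threadIdx.x;
  real sum = 0.0;
  // Grid-stride loop so any n is covered by the launched blocks
  for (int i = blockIdx.x * blockDim.x + tid; i < n;
       i += blockDim.x * gridDim.x)
    sum += a[i];
  s[tid] = sum;
  __syncthreads();
  for (int stride = blockDim.x / 2; stride > 0; stride >>= 1) {
    if (tid < stride) s[tid] += s[tid + stride];
    __syncthreads();
  }
  if (tid == 0) buf[blockIdx.x] = s[0];  // one partial sum per block
}

__global__ void final_reduce_kernel(real* buf, int nb) {
  __shared__ real s[1024];
  const int tid = threadIdx.x;
  real sum = 0.0;
  for (int i = tid; i < nb; i += blockDim.x)  // fold all partials into 1024 lanes
    sum += buf[i];
  s[tid] = sum;
  __syncthreads();
  for (int stride = blockDim.x / 2; stride > 0; stride >>= 1) {
    if (tid < stride) s[tid] += s[tid + stride];
    __syncthreads();
  }
  if (tid == 0) buf[0] = s[0];  // final result stored in buf[0]
}

int main() {
  const int n = 1 << 20;
  const int nb = (n + 1023) / 1024;
  real *a_d, *buf_d, result;

  cudaMalloc(&a_d, n * sizeof(real));
  cudaMalloc(&buf_d, nb * sizeof(real));

  // Fill a_d with ones so the expected sum equals n
  real* a_h = new real[n];
  for (int i = 0; i < n; i++) a_h[i] = 1.0;
  cudaMemcpy(a_d, a_h, n * sizeof(real), cudaMemcpyHostToDevice);

  partial_sum_kernel<<<nb, 1024>>>(a_d, buf_d, n);
  final_reduce_kernel<<<1, 1024>>>(buf_d, nb);
  cudaMemcpy(&result, buf_d, sizeof(real), cudaMemcpyDeviceToHost);

  printf("sum = %f (expected %d)\n", result, n);

  delete[] a_h;
  cudaFree(a_d);
  cudaFree(buf_d);
  return 0;
}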