Neko-TOP
A portable framework for high-order spectral element flow topology optimization.
mma.cu
/*
 Copyright (c) 2021-2025, The Neko Authors
 All rights reserved.

 Redistribution and use in source and binary forms, with or without
 modification, are permitted provided that the following conditions
 are met:

   * Redistributions of source code must retain the above copyright
     notice, this list of conditions and the following disclaimer.

   * Redistributions in binary form must reproduce the above
     copyright notice, this list of conditions and the following
     disclaimer in the documentation and/or other materials provided
     with the distribution.

   * Neither the name of the authors nor the names of its
     contributors may be used to endorse or promote products derived
     from this software without specific prior written permission.

 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
 COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
 ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 POSSIBILITY OF SUCH DAMAGE.
*/

// System includes
#include <stdio.h>
#include <stdlib.h>

// Device includes
#include <cuda_runtime.h>

// Neko includes
#include <neko/device/device_config.h>
#include <neko/device/cuda/check.h>
#include <neko/math/bcknd/device/device_mpi_reduce.h>
#include <neko/math/bcknd/device/device_mpi_op.h>

// Local includes
#include "mma_kernel.h"

extern "C" {

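  /* Lazily grown reduction workspace shared by the reduction-style wrappers
     below: mma_red_s is the number of blocks the buffers currently
     accommodate, mma_bufred is a pinned host buffer and mma_bufred_d its
     device counterpart holding per-block partial results. */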
  int mma_red_s = 0;
  real* mma_bufred = NULL;
  real* mma_bufred_d = NULL;

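  /* Assembles the m x m matrix Hess on the device: for each pair (i, j),
     mmasumHess_kernel produces per-block partial sums over the n design
     variables from hijx and Ljjxinv, mmareduce_kernel collapses them into a
     single value, and mma_copy_kernel writes it to Hess at the column-major
     index i + j*m. */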
  void cuda_Hess(void* Hess, void* hijx, void* Ljjxinv, int* n, int* m) {
    const dim3 nthrds(1024, 1, 1);
    const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
    const int nb = ((*n) + 1024 - 1) / 1024;
    const cudaStream_t stream = (cudaStream_t) glb_cmd_queue;
    cudaStreamSynchronize(stream);
    if (nb > mma_red_s) {
      mma_red_s = nb;
      if (mma_bufred != NULL) {
        CUDA_CHECK(cudaFreeHost(mma_bufred));
        CUDA_CHECK(cudaFree(mma_bufred_d));
      }
      CUDA_CHECK(cudaMallocHost(&mma_bufred, nb * sizeof(real)));
      CUDA_CHECK(cudaMalloc(&mma_bufred_d, nb * sizeof(real)));
    }
    for (int i = 0; i < (*m); i++) {
      for (int j = 0; j < (*m); j++) {
        mmasumHess_kernel<real><<<nblcks, nthrds, 0, stream>>>
          ((real*)hijx, (real*)Ljjxinv, mma_bufred_d, (*n), (*m), i, j);
        CUDA_CHECK(cudaGetLastError());
        mmareduce_kernel<real><<<1, 1024, 0, stream>>>(mma_bufred_d, nb);
        CUDA_CHECK(cudaGetLastError());
        mma_copy_kernel<<<1, 1, 0, stream>>>((real*)Hess, mma_bufred_d, 1,
                                             i + j * (*m));
        CUDA_CHECK(cudaGetLastError());
        cudaStreamSynchronize(stream);
      }
    }
  }

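  /* Elementwise kernel wrapper: computes Ljjxinv for each of the n design
     variables from pjlambda, qjlambda, the current iterate x, the asymptotes
     low/upp and the bounds alpha/beta. */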
  void mma_Ljjxinv_cuda(void* Ljjxinv, void* pjlambda, void* qjlambda, void* x,
                        void* low, void* upp, void* alpha, void* beta, int* n) {
    const dim3 nthrds(1024, 1, 1);
    const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
    mma_Ljjxinv_kernel<real>
      <<<nblcks, nthrds, 0, (cudaStream_t)glb_cmd_queue>>>
      ((real*)Ljjxinv, (real*)pjlambda, (real*)qjlambda, (real*)x, (real*)low,
       (real*)upp, (real*)alpha, (real*)beta, *n);
    CUDA_CHECK(cudaGetLastError());
  }

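  /* Elementwise kernel wrapper: updates the n entries of x from
     pjlambda/qjlambda, the asymptotes low/upp and the bounds alpha/beta. */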
  void mma_dipsolvesub1_cuda(void* x, void* pjlambda, void* qjlambda,
                             void* low, void* upp, void* alpha, void* beta,
                             int* n) {
    const dim3 nthrds(1024, 1, 1);
    const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
    mma_dipsolvesub1_kernel<real>
      <<<nblcks, nthrds, 0, (cudaStream_t)glb_cmd_queue>>>
      ((real*)x, (real*)pjlambda, (real*)qjlambda, (real*)low, (real*)upp,
       (real*)alpha, (real*)beta, *n);
    CUDA_CHECK(cudaGetLastError());
  }

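  /* Transposed matrix-vector product: one thread per design variable combines
     the m x n matrix pij with the length-m vector lambda into output. */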
  void mattrans_v_mul_cuda(void* output, void* pij, void* lambda,
                           int* m, int* n) {
    const dim3 nthrds(1024, 1, 1);
    const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
    mattrans_v_mul_kernel<real>
      <<<nblcks, nthrds, 0, (cudaStream_t)glb_cmd_queue>>>
      ((real*)output, (real*)pij, (real*)lambda, *m, *n);
    CUDA_CHECK(cudaGetLastError());
  }

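  /* Builds the constraint terms bi of the MMA subproblem: mma_sub4_kernel
     fills an m x n temporary with per-variable contributions, which are then
     sum-reduced over the n variables for each constraint i and copied into
     bi[i] on the device. */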
  void mma_gensub4_cuda(void* x, void* low, void* upp, void* pij, void* qij,
                        int* n, int* m, void* bi) {
    const dim3 nthrds(1024, 1, 1);
    const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
    const int nb = ((*n) + 1024 - 1) / 1024;
    const cudaStream_t stream = (cudaStream_t) glb_cmd_queue;

    if (nb > mma_red_s) {
      mma_red_s = nb;
      if (mma_bufred != NULL) {
        CUDA_CHECK(cudaFreeHost(mma_bufred));
        CUDA_CHECK(cudaFree(mma_bufred_d));
      }
      CUDA_CHECK(cudaMallocHost(&mma_bufred, nb * sizeof(real)));
      CUDA_CHECK(cudaMalloc(&mma_bufred_d, nb * sizeof(real)));
    }

    real* temp;
    real* bi_d = (real*)bi;
    CUDA_CHECK(cudaMalloc(&temp, (*m) * (*n) * sizeof(real)));

    mma_sub4_kernel<real><<<nblcks, nthrds, 0, stream>>>(
      (real*)x, (real*)low, (real*)upp, (real*)pij, (real*)qij,
      temp, *n, *m);
    CUDA_CHECK(cudaGetLastError());

    for (int i = 0; i < (*m); i++) {
      mmasum_kernel<real><<<nblcks, nthrds, 0, stream>>>(
        temp, mma_bufred_d, (*n), (*m), i);
      CUDA_CHECK(cudaGetLastError());

      mmareduce_kernel<real><<<1, 1024, 0, stream>>>(mma_bufred_d, nb);
      CUDA_CHECK(cudaGetLastError());

      CUDA_CHECK(cudaMemcpyAsync(bi_d + i, mma_bufred_d, sizeof(real),
                                 cudaMemcpyDeviceToDevice, stream));

      cudaStreamSynchronize(stream);
    }

    CUDA_CHECK(cudaFree(temp));
  }

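  /* Elementwise construction of the subproblem coefficients p0j, q0j, pij,
     qij and of the bounds alpha/beta from the current iterate x, the
     gradients df0dx/dfdx, the asymptotes low/upp and the box constraints
     xmin/xmax. */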
  void mma_gensub3_cuda(void* x, void* df0dx, void* dfdx, void* low,
                        void* upp, void* xmin, void* xmax, void* alpha,
                        void* beta, void* p0j, void* q0j, void* pij,
                        void* qij, int* n, int* m) {
    const dim3 nthrds(1024, 1, 1);
    const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);

    mma_sub3_kernel<real><<<nblcks, nthrds, 0, (cudaStream_t)glb_cmd_queue>>>(
      (real*)x, (real*)df0dx, (real*)dfdx, (real*)low,
      (real*)upp, (real*)xmin, (real*)xmax, (real*)alpha,
      (real*)beta, (real*)p0j, (real*)q0j, (real*)pij,
      (real*)qij, *n, *m);

    CUDA_CHECK(cudaGetLastError());
  }

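  /* Asymptote update for later iterations: low and upp are adapted from the
     iterates x, xold1 and xold2, scaled by the asyincr/asydecr factors and
     the variable range xdiff. */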
  void mma_gensub2_cuda(void* low, void* upp, void* x, void* xold1,
                        void* xold2, void* xdiff, real* asydecr,
                        real* asyincr, int* n) {
    const dim3 nthrds(1024, 1, 1);
    const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);

    mma_sub2_kernel<real><<<nblcks, nthrds, 0, (cudaStream_t)glb_cmd_queue>>>(
      (real*)low, (real*)upp, (real*)x, (real*)xold1,
      (real*)xold2, (real*)xdiff, *asydecr, *asyincr, *n);

    CUDA_CHECK(cudaGetLastError());
  }

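  /* Initial asymptotes: low and upp are placed around x using the asyinit
     factor and the range spanned by xmin and xmax. */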
  void mma_gensub1_cuda(void* low, void* upp, void* x, void* xmin, void* xmax,
                        real* asyinit, int* n) {
    const dim3 nthrds(1024, 1, 1);
    const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
    mma_sub1_kernel<real><<<nblcks, nthrds, 0, (cudaStream_t)glb_cmd_queue>>>
      ((real*)low, (real*)upp, (real*)x, (real*)xmin, (real*)xmax,
       *asyinit, *n);
    CUDA_CHECK(cudaGetLastError());
  }

  void cuda_mma_max(void* xsi, void* x, void* alpha, int* n) {
    const dim3 nthrds(1024, 1, 1);
    const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);

    mma_max2_kernel<real><<<nblcks, nthrds, 0, (cudaStream_t)glb_cmd_queue>>>
      ((real*)xsi, (real*)x, (real*)alpha, *n);
    CUDA_CHECK(cudaGetLastError());
  }

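  /* Constraint residual: relambda_kernel fills an m x n temporary with the
     per-variable terms built from pij, qij and the distances to xupp/xlow,
     which are then sum-reduced per constraint into relambda. */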
  void cuda_relambda(void* relambda, void* x, void* xupp, void* xlow,
                     void* pij, void* qij, int* n, int* m) {
    const dim3 nthrds(1024, 1, 1);
    const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
    const int nb = ((*n) + 1024 - 1) / 1024;
    const cudaStream_t stream = (cudaStream_t) glb_cmd_queue;

    if (nb > mma_red_s) {
      mma_red_s = nb;
      if (mma_bufred != NULL) {
        CUDA_CHECK(cudaFreeHost(mma_bufred));
        CUDA_CHECK(cudaFree(mma_bufred_d));
      }
      CUDA_CHECK(cudaMallocHost(&mma_bufred, nb * sizeof(real)));
      CUDA_CHECK(cudaMalloc(&mma_bufred_d, nb * sizeof(real)));
    }
    real* temp;
    CUDA_CHECK(cudaMalloc(&temp, (*n) * (*m) * sizeof(real)));
    relambda_kernel<real><<<nblcks, nthrds, 0, stream>>>(temp, (real*)x,
      (real*)xupp, (real*)xlow, (real*)pij, (real*)qij, *n, *m);
    CUDA_CHECK(cudaGetLastError());
    for (int i = 0; i < (*m); i++) {
      mmasum_kernel<real><<<nblcks, nthrds, 0, stream>>>
        (temp, mma_bufred_d, (*n), (*m), i);
      CUDA_CHECK(cudaGetLastError());
      mmareduce_kernel<real><<<1, 1024, 0, stream>>>(mma_bufred_d, nb);
      CUDA_CHECK(cudaGetLastError());
      mma_copy_kernel<<<1, 1, 0, stream>>>((real*)relambda, mma_bufred_d, 1, i);
      CUDA_CHECK(cudaGetLastError());
      cudaStreamSynchronize(stream);
    }
    CUDA_CHECK(cudaFree(temp));
  }

  void cuda_sub2cons2(void* a, void* b, void* c, void* d, real* e, int* n) {
    const dim3 nthrds(1024, 1, 1);
    const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);

    sub2cons2_kernel<real><<<nblcks, nthrds, 0, (cudaStream_t)glb_cmd_queue>>>(
      (real*)a, (real*)b, (real*)c, (real*)d, *e, *n);
    CUDA_CHECK(cudaGetLastError());
  }

  // Maximum absolute value of the n entries of the input vector a.
  real cuda_maxval(void* a, int* n) {
    const dim3 nthrds(1024, 1, 1);
    const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
    const int nb = ((*n) + 1024 - 1) / 1024;
    const cudaStream_t stream = (cudaStream_t) glb_cmd_queue;

    if (nb > mma_red_s) {
      mma_red_s = nb;
      if (mma_bufred != NULL) {
        CUDA_CHECK(cudaFreeHost(mma_bufred));
        CUDA_CHECK(cudaFree(mma_bufred_d));
      }
      CUDA_CHECK(cudaMallocHost(&mma_bufred, nb * sizeof(real)));
      CUDA_CHECK(cudaMalloc(&mma_bufred_d, nb * sizeof(real)));
    }

    maxval_kernel<real><<<nblcks, nthrds, 0, stream>>>(
      (real*)a, mma_bufred_d, (*n));
    CUDA_CHECK(cudaGetLastError());

    max_reduce_kernel<real><<<1, 1024, 0, stream>>>(mma_bufred_d, nb);
    CUDA_CHECK(cudaGetLastError());

    CUDA_CHECK(cudaMemcpyAsync(mma_bufred, mma_bufred_d, sizeof(real),
                               cudaMemcpyDeviceToHost, stream));
    cudaStreamSynchronize(stream);

    return mma_bufred[0];
  }

  void cuda_delx(void* delx, void* x, void* xlow, void* xupp, void* pij,
                 void* qij, void* p0j, void* q0j, void* alpha, void* beta,
                 void* lambda, real* epsi, int* n, int* m) {
    const dim3 nthrds(1024, 1, 1);
    const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);

    delx_kernel<real><<<nblcks, nthrds, 0, (cudaStream_t)glb_cmd_queue>>>(
      (real*)delx, (real*)x, (real*)xlow, (real*)xupp, (real*)pij,
      (real*)qij, (real*)p0j, (real*)q0j, (real*)alpha, (real*)beta,
      (real*)lambda, *epsi, *n, *m);
    CUDA_CHECK(cudaGetLastError());
  }

  void cuda_GG(void* GG, void* x, void* xlow, void* xupp,
               void* pij, void* qij, int* n, int* m) {
    const dim3 nthrds(1024, 1, 1);
    const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
    GG_kernel<real><<<nblcks, nthrds, 0, (cudaStream_t)glb_cmd_queue>>>
      ((real*)GG, (real*)x, (real*)xlow, (real*)xupp,
       (real*)pij, (real*)qij, *n, *m);
    CUDA_CHECK(cudaGetLastError());
  }

  void cuda_diagx(void* diagx, void* x, void* xsi, void* xlow, void* xupp,
                  void* p0j, void* q0j, void* pij, void* qij, void* alpha,
                  void* beta, void* eta, void* lambda, int* n, int* m) {
    const dim3 nthrds(1024, 1, 1);
    const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
    diagx_kernel<real><<<nblcks, nthrds, 0, (cudaStream_t)glb_cmd_queue>>>
      ((real*)diagx, (real*)x, (real*)xsi, (real*)xlow,
       (real*)xupp, (real*)p0j, (real*)q0j, (real*)pij, (real*)qij,
       (real*)alpha, (real*)beta, (real*)eta, (real*)lambda, *n, *m);
    CUDA_CHECK(cudaGetLastError());
  }

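  /* Per-constraint reduction: for each constraint i, mmasumbb_kernel forms
     per-block partial sums over the n variables from GG, delx and diagx,
     which are reduced and copied into bb[i]. */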
  void cuda_bb(void* bb, void* GG, void* delx, void* diagx, int* n, int* m) {
    const dim3 nthrds(1024, 1, 1);
    const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
    const int nb = ((*n) + 1024 - 1) / 1024;
    const cudaStream_t stream = (cudaStream_t) glb_cmd_queue;
    cudaStreamSynchronize(stream);
    if (nb > mma_red_s) {
      mma_red_s = nb;
      if (mma_bufred != NULL) {
        CUDA_CHECK(cudaFreeHost(mma_bufred));
        CUDA_CHECK(cudaFree(mma_bufred_d));
      }
      CUDA_CHECK(cudaMallocHost(&mma_bufred, nb * sizeof(real)));
      CUDA_CHECK(cudaMalloc(&mma_bufred_d, nb * sizeof(real)));
    }
    for (int i = 0; i < (*m); i++) {
      mmasumbb_kernel<real><<<nblcks, nthrds, 0, stream>>>
        ((real*)GG, (real*)delx, (real*)diagx, mma_bufred_d, (*n), (*m), i);
      CUDA_CHECK(cudaGetLastError());
      mmareduce_kernel<real><<<1, 1024, 0, stream>>>(mma_bufred_d, nb);
      CUDA_CHECK(cudaGetLastError());
      mma_copy_kernel<<<1, 1, 0, stream>>>((real*)bb, mma_bufred_d, 1, i);
      CUDA_CHECK(cudaGetLastError());
      cudaStreamSynchronize(stream);
    }
  }

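  /* System matrix assembly: for every pair (i, j), per-block partial sums of
     GG and diagx over the n variables are reduced and written to AA at index
     i + j*(m+1), i.e. into the leading m x m block of the (m+1) x (m+1)
     matrix. */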
  void cuda_AA(void* AA, void* GG, void* diagx, int* n, int* m) {
    const dim3 nthrds(1024, 1, 1);
    const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
    const int nb = ((*n) + 1024 - 1) / 1024;
    const cudaStream_t stream = (cudaStream_t) glb_cmd_queue;
    cudaStreamSynchronize(stream);
    if (nb > mma_red_s) {
      mma_red_s = nb;
      if (mma_bufred != NULL) {
        CUDA_CHECK(cudaFreeHost(mma_bufred));
        CUDA_CHECK(cudaFree(mma_bufred_d));
      }
      CUDA_CHECK(cudaMallocHost(&mma_bufred, nb * sizeof(real)));
      CUDA_CHECK(cudaMalloc(&mma_bufred_d, nb * sizeof(real)));
    }
    for (int i = 0; i < (*m); i++) {
      for (int j = 0; j < (*m); j++) {
        mmasumAA_kernel<real><<<nblcks, nthrds, 0, stream>>>
          ((real*)GG, (real*)diagx, mma_bufred_d, (*n), (*m), i, j);
        CUDA_CHECK(cudaGetLastError());
        mmareduce_kernel<real><<<1, 1024, 0, stream>>>(mma_bufred_d, nb);
        CUDA_CHECK(cudaGetLastError());
        mma_copy_kernel<<<1, 1, 0, stream>>>((real*)AA, mma_bufred_d, 1,
                                             i + j * (*m + 1));
        CUDA_CHECK(cudaGetLastError());
        cudaStreamSynchronize(stream);
      }
    }
  }

  void cuda_dx(void* dx, void* delx, void* diagx, void* GG, void* dlambda,
               int* n, int* m) {
    const dim3 nthrds(1024, 1, 1);
    const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
    dx_kernel<real><<<nblcks, nthrds, 0, (cudaStream_t)glb_cmd_queue>>>
      ((real*)dx, (real*)delx, (real*)diagx, (real*)GG, (real*)dlambda,
       *n, *m);
    CUDA_CHECK(cudaGetLastError());
  }

  void cuda_dxsi(void* dxsi, void* xsi, void* dx, void* x,
                 void* alpha, real* epsi, int* n) {
    const dim3 nthrds(1024, 1, 1);
    const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
    dxsi_kernel<real><<<nblcks, nthrds, 0, (cudaStream_t)glb_cmd_queue>>>
      ((real*)dxsi, (real*)xsi, (real*)dx, (real*)x,
       (real*)alpha, *epsi, *n);
    CUDA_CHECK(cudaGetLastError());
  }

  void cuda_deta(void* deta, void* eta, void* dx, void* x,
                 void* beta, real* epsi, int* n) {
    const dim3 nthrds(1024, 1, 1);
    const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
    deta_kernel<real><<<nblcks, nthrds, 0, (cudaStream_t)glb_cmd_queue>>>
      ((real*)deta, (real*)eta, (real*)dx, (real*)x,
       (real*)beta, *epsi, *n);
    CUDA_CHECK(cudaGetLastError());
  }

  void cuda_rex(void* rex, void* x, void* xlow, void* xupp, void* pij,
                void* p0j, void* qij, void* q0j, void* lambda, void* xsi,
                void* eta, int* n, int* m) {
    const dim3 nthrds(1024, 1, 1);
    const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
    RexCalculation_kernel<real>
      <<<nblcks, nthrds, 0, (cudaStream_t)glb_cmd_queue>>>
      ((real*)rex, (real*)x, (real*)xlow,
       (real*)xupp, (real*)pij, (real*)p0j, (real*)qij, (real*)q0j,
       (real*)lambda, (real*)xsi, (real*)eta, *n, *m);
    CUDA_CHECK(cudaGetLastError());
  }

  void cuda_rey(void* rey, void* c, void* d, void* y, void* lambda, void* mu,
                int* n) {
    const dim3 nthrds(1024, 1, 1);
    const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
    rey_calculation_kernel<real>
      <<<nblcks, nthrds, 0, (cudaStream_t)glb_cmd_queue>>>
      ((real*)rey, (real*)c, (real*)d, (real*)y, (real*)lambda, (real*)mu, *n);
    CUDA_CHECK(cudaGetLastError());
  }

  void cuda_sub2cons(void* a, void* b, void* c, real* d, int* n) {
    const dim3 nthrds(1024, 1, 1);
    const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
    sub2cons_kernel<real><<<nblcks, nthrds, 0, (cudaStream_t)glb_cmd_queue>>>
      ((real*)a, (real*)b, (real*)c, *d, *n);
    CUDA_CHECK(cudaGetLastError());
  }

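  /* Reduction wrapper: norm_kernel produces per-block partial results for the
     n entries of a, which are reduced on the device, copied back to the
     pinned host buffer and returned. */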
  real cuda_norm(void* a, int* n) {
    const dim3 nthrds(1024, 1, 1);
    const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
    const int nb = ((*n) + 1024 - 1) / 1024;
    const cudaStream_t stream = (cudaStream_t) glb_cmd_queue;
    if (nb > mma_red_s) {
      mma_red_s = nb;
      if (mma_bufred != NULL) {
        CUDA_CHECK(cudaFreeHost(mma_bufred));
        CUDA_CHECK(cudaFree(mma_bufred_d));
      }
      CUDA_CHECK(cudaMallocHost(&mma_bufred, nb * sizeof(real)));
      CUDA_CHECK(cudaMalloc(&mma_bufred_d, nb * sizeof(real)));
    }
    norm_kernel<real><<<nblcks, nthrds, 0, stream>>>
      ((real*)a, mma_bufred_d, (*n));
    CUDA_CHECK(cudaGetLastError());
    mmareduce_kernel<real><<<1, 1024, 0, stream>>>(mma_bufred_d, nb);
    CUDA_CHECK(cudaGetLastError());
    CUDA_CHECK(cudaMemcpyAsync(mma_bufred, mma_bufred_d, sizeof(real),
                               cudaMemcpyDeviceToHost, stream));
    cudaStreamSynchronize(stream);
    return mma_bufred[0];
  }

  void cuda_dely(void* dely, void* c, void* d, void* y, void* lambda,
                 real* epsi, int* n) {
    const dim3 nthrds(1024, 1, 1);
    const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
    dely_kernel<real><<<nblcks, nthrds, 0, (cudaStream_t)glb_cmd_queue>>>
      ((real*)dely, (real*)c, (real*)d, (real*)y, (real*)lambda, *epsi, *n);
    CUDA_CHECK(cudaGetLastError());
  }

  real cuda_maxval2(void* a, void* b, real* cons, int* n) {
    const dim3 nthrds(1024, 1, 1);
    const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
    const int nb = ((*n) + 1024 - 1) / 1024;
    const cudaStream_t stream = (cudaStream_t) glb_cmd_queue;
    if (nb > mma_red_s) {
      mma_red_s = nb;
      if (mma_bufred != NULL) {
        CUDA_CHECK(cudaFreeHost(mma_bufred));
        CUDA_CHECK(cudaFree(mma_bufred_d));
      }
      CUDA_CHECK(cudaMallocHost(&mma_bufred, nb * sizeof(real)));
      CUDA_CHECK(cudaMalloc(&mma_bufred_d, nb * sizeof(real)));
    }
    maxval2_kernel<real><<<nblcks, nthrds, 0, stream>>>
      ((real*)a, (real*)b, mma_bufred_d, *cons, *n);
    CUDA_CHECK(cudaGetLastError());
    max_reduce_kernel<real><<<1, 1024, 0, stream>>>(mma_bufred_d, nb);
    CUDA_CHECK(cudaGetLastError());
    CUDA_CHECK(cudaMemcpyAsync(mma_bufred, mma_bufred_d, sizeof(real),
                               cudaMemcpyDeviceToHost, stream));
    cudaStreamSynchronize(stream);
    return mma_bufred[0];
  }

  real cuda_maxval3(void* a, void* b, void* c, real* cons, int* n) {
    const dim3 nthrds(1024, 1, 1);
    const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
    const int nb = ((*n) + 1024 - 1) / 1024;
    const cudaStream_t stream = (cudaStream_t) glb_cmd_queue;
    if (nb > mma_red_s) {
      mma_red_s = nb;
      if (mma_bufred != NULL) {
        CUDA_CHECK(cudaFreeHost(mma_bufred));
        CUDA_CHECK(cudaFree(mma_bufred_d));
      }
      CUDA_CHECK(cudaMallocHost(&mma_bufred, nb * sizeof(real)));
      CUDA_CHECK(cudaMalloc(&mma_bufred_d, nb * sizeof(real)));
    }
    maxval3_kernel<real><<<nblcks, nthrds, 0, stream>>>
      ((real*)a, (real*)b, (real*)c, mma_bufred_d, *cons, *n);
    CUDA_CHECK(cudaGetLastError());
    max_reduce_kernel<real><<<1, 1024, 0, stream>>>(mma_bufred_d, nb);
    CUDA_CHECK(cudaGetLastError());
    CUDA_CHECK(cudaMemcpyAsync(mma_bufred, mma_bufred_d, sizeof(real),
                               cudaMemcpyDeviceToHost, stream));
    cudaStreamSynchronize(stream);
    return mma_bufred[0];
  }

  void cuda_kkt_rex(void* rex, void* df0dx, void* dfdx, void* xsi,
                    void* eta, void* lambda, int* n, int* m) {
    const dim3 nthrds(1024, 1, 1);
    const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
    kkt_rex_kernel<real><<<nblcks, nthrds, 0, (cudaStream_t)glb_cmd_queue>>>
      ((real*)rex, (real*)df0dx, (real*)dfdx, (real*)xsi,
       (real*)eta, (real*)lambda, *n, *m);
    CUDA_CHECK(cudaGetLastError());
  }

  void cuda_maxcons(void* a, real* b, real* c, void* d, int* n) {
    const dim3 nthrds(1024, 1, 1);
    const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
    maxcons_kernel<real><<<nblcks, nthrds, 0, (cudaStream_t)glb_cmd_queue>>>
      ((real*)a, *b, *c, (real*)d, *n);
    CUDA_CHECK(cudaGetLastError());
  }

  real cuda_lcsc2(void* a, void* b, int* n) {
    const dim3 nthrds(1024, 1, 1);
    const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
    const int nb = ((*n) + 1024 - 1) / 1024;
    const cudaStream_t stream = (cudaStream_t) glb_cmd_queue;
    if (nb > mma_red_s) {
      mma_red_s = nb;
      if (mma_bufred != NULL) {
        CUDA_CHECK(cudaFreeHost(mma_bufred));
        CUDA_CHECK(cudaFree(mma_bufred_d));
      }
      CUDA_CHECK(cudaMallocHost(&mma_bufred, nb * sizeof(real)));
      CUDA_CHECK(cudaMalloc(&mma_bufred_d, nb * sizeof(real)));
    }
    glsc2_kernel<real><<<nblcks, nthrds, 0, stream>>>
      ((real*)a, (real*)b, mma_bufred_d, (*n));
    CUDA_CHECK(cudaGetLastError());
    mmareduce_kernel<real><<<1, 1024, 0, stream>>>(mma_bufred_d, nb);
    CUDA_CHECK(cudaGetLastError());
    CUDA_CHECK(cudaMemcpyAsync(mma_bufred, mma_bufred_d, sizeof(real),
                               cudaMemcpyDeviceToHost, stream));
    cudaStreamSynchronize(stream);
    return mma_bufred[0];
  }

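  /* Sums the n entries of a in place across MPI ranks when device-aware MPI
     is enabled; compiled to a no-op otherwise. */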
  void cuda_mpisum(void* a, int* n) {
#ifdef HAVE_DEVICE_MPI
    const cudaStream_t stream = (cudaStream_t) glb_cmd_queue;
    real* temp = (real*)a;
    cudaStreamSynchronize(stream);
    device_mpi_allreduce_inplace(temp, *n, sizeof(real), DEVICE_MPI_SUM);
#endif
  }

  void cuda_add2inv2(void* a, void* b, real* c, int* n) {
    const dim3 nthrds(1024, 1, 1);
    const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
    add2inv2_kernel<real><<<nblcks, nthrds, 0, (cudaStream_t)glb_cmd_queue>>>
      ((real*)a, (real*)b, *c, *n);
    CUDA_CHECK(cudaGetLastError());
  }

  void cuda_max2(void* a, real* b, void* c, real* d, int* n) {
    const dim3 nthrds(1024, 1, 1);
    const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
    max2_kernel<real><<<nblcks, nthrds, 0, (cudaStream_t)glb_cmd_queue>>>
      ((real*)a, *b, (real*)c, *d, *n);
    CUDA_CHECK(cudaGetLastError());
  }

  void cuda_updatebb(void* bb, void* dellambda, void* dely, void* d,
                     void* mu, void* y, real* delz, int* m) {
    const dim3 nthrds(1024, 1, 1);
    const dim3 nblcks(((*m + 1) + 1024 - 1) / 1024, 1, 1);
    updatebb_kernel<real><<<nblcks, nthrds, 0, (cudaStream_t)glb_cmd_queue>>>
      ((real*)bb, (real*)dellambda, (real*)dely, (real*)d,
       (real*)mu, (real*)y, *delz, *m);
    CUDA_CHECK(cudaGetLastError());
  }

  void cuda_updateAA(void* AA, void* globaltmp_mm, void* s, void* lambda,
                     void* d, void* mu, void* y, void* a, real* zeta, real* z,
                     int* m) {
    const dim3 nthrds(1024, 1, 1);
    const dim3 nblcks(((*m + 1) + 1024 - 1) / 1024, 1, 1);
    updateAA_kernel<real><<<nblcks, nthrds, 0, (cudaStream_t)glb_cmd_queue>>>
      ((real*)AA, (real*)globaltmp_mm, (real*)s, (real*)lambda, (real*)d,
       (real*)mu, (real*)y, (real*)a, *zeta, *z, *m);
    CUDA_CHECK(cudaGetLastError());
  }

  void cuda_dy(void* dy, void* dely, void* dlambda, void* d, void* mu,
               void* y, int* n) {
    const dim3 nthrds(1024, 1, 1);
    const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
    dy_kernel<real><<<nblcks, nthrds, 0, (cudaStream_t)glb_cmd_queue>>>
      ((real*)dy, (real*)dely, (real*)dlambda, (real*)d,
       (real*)mu, (real*)y, *n);
    CUDA_CHECK(cudaGetLastError());
  }

} /* extern "C" */