Neko-TOP
A portable framework for high-order spectral element flow topology optimization.
mma.cu
/*
 Copyright (c) 2021-2025, The Neko Authors
 All rights reserved.

 Redistribution and use in source and binary forms, with or without
 modification, are permitted provided that the following conditions
 are met:

   * Redistributions of source code must retain the above copyright
     notice, this list of conditions and the following disclaimer.

   * Redistributions in binary form must reproduce the above
     copyright notice, this list of conditions and the following
     disclaimer in the documentation and/or other materials provided
     with the distribution.

   * Neither the name of the authors nor the names of its
     contributors may be used to endorse or promote products derived
     from this software without specific prior written permission.

 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
 COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
 ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 POSSIBILITY OF SUCH DAMAGE.
*/

#include "device/cuda/check.h"
#include "mma_kernel.h"
#include <stdio.h>
#include <stdlib.h>

extern "C" {
#include "math/bcknd/device/device_mpi_reduce.h"
#include "math/bcknd/device/device_mpi_op.h"
#include "device/device_config.h"

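  /*
   * Shared scratch space for the block-wise reductions in this file: a
   * pinned host buffer (mma_bufred) and a device buffer (mma_bufred_d),
   * sized by the number of blocks and grown on demand. They are reused
   * across calls, never freed here, and only ever accessed on the
   * glb_cmd_queue stream.
   */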
  int mma_red_s = 0;
  real * mma_bufred = NULL;
  real * mma_bufred_d = NULL;
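
  /*
   * cuda_Hess assembles an m x m matrix (stored column-major in Hess) whose
   * (i,j) entry is a sum over the n design variables computed by
   * mmasumHess_kernel from hijx and Ljjxinv: a per-block partial sum, a
   * single-block reduction, and a one-element copy into Hess at offset
   * i + j*m. Presumably this is the Hessian used by the MMA dual solve; the
   * exact formula lives in mma_kernel.h.
   */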
  void cuda_Hess(void* Hess, void* hijx, void* Ljjxinv, int *n, int *m) {
    const dim3 nthrds(1024, 1, 1);
    const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
    const int nb = ((*n) + 1024 - 1) / 1024;
    const cudaStream_t stream = (cudaStream_t) glb_cmd_queue;
    cudaStreamSynchronize(stream);
    if (nb > mma_red_s) {
      mma_red_s = nb;
      if (mma_bufred != NULL) {
        CUDA_CHECK(cudaFreeHost(mma_bufred));
        CUDA_CHECK(cudaFree(mma_bufred_d));
      }
      CUDA_CHECK(cudaMallocHost(&mma_bufred, nb * sizeof(real)));
      CUDA_CHECK(cudaMalloc(&mma_bufred_d, nb * sizeof(real)));
    }
    for (int i = 0; i < (*m); i++) {
      for (int j = 0; j < (*m); j++) {
        mmasumHess_kernel<real><<<nblcks, nthrds, 0, stream>>>
          ((real*)hijx, (real*)Ljjxinv, mma_bufred_d, (*n), (*m), i, j);
        CUDA_CHECK(cudaGetLastError());
        mmareduce_kernel<real><<<1, 1024, 0, stream>>>(mma_bufred_d, nb);
        CUDA_CHECK(cudaGetLastError());
        mma_copy_kernel<<<1, 1, 0, stream>>>((real*)Hess, mma_bufred_d, 1,
                                             i + j * (*m));
        CUDA_CHECK(cudaGetLastError());
        cudaStreamSynchronize(stream);
      }
    }
  }

  void mma_Ljjxinv_cuda(void* Ljjxinv, void* pjlambda, void* qjlambda, void* x,
                        void* low, void* upp, void* alpha, void* beta, int* n) {
    const dim3 nthrds(1024, 1, 1);
    const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
    mma_Ljjxinv_kernel<real><<<nblcks, nthrds, 0, (cudaStream_t)glb_cmd_queue>>>
      ((real*)Ljjxinv, (real*)pjlambda, (real*)qjlambda, (real*)x, (real*)low,
       (real*)upp, (real*)alpha, (real*)beta, *n);
    CUDA_CHECK(cudaGetLastError());
  }

  void mma_dipsolvesub1_cuda(void* x, void* pjlambda, void* qjlambda,
                             void* low, void* upp, void* alpha, void* beta,
                             int* n) {
    const dim3 nthrds(1024, 1, 1);
    const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
    mma_dipsolvesub1_kernel<real>
      <<<nblcks, nthrds, 0, (cudaStream_t)glb_cmd_queue>>>
      ((real*)x, (real*)pjlambda, (real*)qjlambda, (real*)low, (real*)upp,
       (real*)alpha, (real*)beta, *n);
    CUDA_CHECK(cudaGetLastError());
  }

  void mattrans_v_mul_cuda(void* output, void* pij, void* lambda,
                           int* m, int* n) {
    const dim3 nthrds(1024, 1, 1);
    const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
    mattrans_v_mul_kernel<real>
      <<<nblcks, nthrds, 0, (cudaStream_t)glb_cmd_queue>>>
      ((real*)output, (real*)pij, (real*)lambda, *m, *n);
    CUDA_CHECK(cudaGetLastError());
  }

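  /*
   * The mma_gensub* wrappers build the MMA approximation around the current
   * design point: gensub1 initialises the asymptotes (low/upp), gensub2
   * updates them from the two previous iterates, gensub3 forms the
   * approximation coefficients (p0j, q0j, pij, qij) and the move limits
   * alpha/beta, and gensub4 accumulates one sum per constraint into bi from
   * x, the asymptotes and pij/qij. These descriptions are inferred from the
   * kernel arguments; the exact expressions are implemented in mma_kernel.h.
   */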
  void mma_gensub4_cuda(void* x, void* low, void* upp, void* pij, void* qij,
                        int* n, int* m, void* bi) {
    const dim3 nthrds(1024, 1, 1);
    const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
    const int nb = ((*n) + 1024 - 1) / 1024;
    const cudaStream_t stream = (cudaStream_t)glb_cmd_queue;

    if (nb > mma_red_s) {
      mma_red_s = nb;
      if (mma_bufred != NULL) {
        CUDA_CHECK(cudaFreeHost(mma_bufred));
        CUDA_CHECK(cudaFree(mma_bufred_d));
      }
      CUDA_CHECK(cudaMallocHost(&mma_bufred, nb * sizeof(real)));
      CUDA_CHECK(cudaMalloc(&mma_bufred_d, nb * sizeof(real)));
    }

    real* temp;
    real* bi_d = (real*)bi;
    CUDA_CHECK(cudaMalloc(&temp, (*m) * (*n) * sizeof(real)));

    mma_sub4_kernel<real><<<nblcks, nthrds, 0, stream>>>(
      (real*)x, (real*)low, (real*)upp, (real*)pij, (real*)qij,
      temp, *n, *m);
    CUDA_CHECK(cudaGetLastError());

    /* One reduction per constraint: sum column i of temp into bi(i). */
    for (int i = 0; i < (*m); i++) {
      mmasum_kernel<real><<<nblcks, nthrds, 0, stream>>>(
        temp, mma_bufred_d, (*n), (*m), i);
      CUDA_CHECK(cudaGetLastError());

      mmareduce_kernel<real><<<1, 1024, 0, stream>>>(mma_bufred_d, nb);
      CUDA_CHECK(cudaGetLastError());

      CUDA_CHECK(cudaMemcpyAsync(bi_d + i, mma_bufred_d, sizeof(real),
                                 cudaMemcpyDeviceToDevice, stream));

      cudaStreamSynchronize(stream);
    }

    CUDA_CHECK(cudaFree(temp));
  }

  void mma_gensub3_cuda(void* x, void* df0dx, void* dfdx, void* low,
                        void* upp, void* xmin, void* xmax, void* alpha,
                        void* beta, void* p0j, void* q0j, void* pij,
                        void* qij, int* n, int* m) {
    const dim3 nthrds(1024, 1, 1);
    const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);

    mma_sub3_kernel<real><<<nblcks, nthrds, 0, (cudaStream_t)glb_cmd_queue>>>(
      (real*)x, (real*)df0dx, (real*)dfdx, (real*)low,
      (real*)upp, (real*)xmin, (real*)xmax, (real*)alpha,
      (real*)beta, (real*)p0j, (real*)q0j, (real*)pij,
      (real*)qij, *n, *m);

    CUDA_CHECK(cudaGetLastError());
  }

  void mma_gensub2_cuda(void* low, void* upp, void* x, void* xold1,
                        void* xold2, void* xdiff, real* asydecr,
                        real* asyincr, int* n) {
    const dim3 nthrds(1024, 1, 1);
    const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);

    mma_sub2_kernel<real><<<nblcks, nthrds, 0, (cudaStream_t)glb_cmd_queue>>>(
      (real*)low, (real*)upp, (real*)x, (real*)xold1,
      (real*)xold2, (real*)xdiff, *asydecr, *asyincr, *n);

    CUDA_CHECK(cudaGetLastError());
  }

  void mma_gensub1_cuda(void* low, void* upp, void* x, void* xmin, void* xmax,
                        real* asyinit, int* n) {
    const dim3 nthrds(1024, 1, 1);
    const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
    mma_sub1_kernel<real><<<nblcks, nthrds, 0, (cudaStream_t)glb_cmd_queue>>>
      ((real*)low, (real*)upp, (real*)x, (real*)xmin, (real*)xmax,
       *asyinit, *n);
    CUDA_CHECK(cudaGetLastError());
  }

  void cuda_mma_max(void* xsi, void* x, void* alpha, int* n) {
    const dim3 nthrds(1024, 1, 1);
    const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);

    mma_max2_kernel<real><<<nblcks, nthrds, 0, (cudaStream_t)glb_cmd_queue>>>
      ((real*)xsi, (real*)x, (real*)alpha, *n);
    CUDA_CHECK(cudaGetLastError());
  }

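  /*
   * cuda_relambda evaluates one sum per constraint: relambda_kernel fills an
   * n x m work array from x, the asymptotes and pij/qij, and each column is
   * then reduced to a single value written into relambda(i).
   */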
  void cuda_relambda(void* relambda, void* x, void* xupp, void* xlow,
                     void* pij, void* qij, int* n, int* m) {
    const dim3 nthrds(1024, 1, 1);
    const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
    const int nb = ((*n) + 1024 - 1) / 1024;
    const cudaStream_t stream = (cudaStream_t) glb_cmd_queue;

    if (nb > mma_red_s) {
      mma_red_s = nb;
      if (mma_bufred != NULL) {
        CUDA_CHECK(cudaFreeHost(mma_bufred));
        CUDA_CHECK(cudaFree(mma_bufred_d));
      }
      CUDA_CHECK(cudaMallocHost(&mma_bufred, nb * sizeof(real)));
      CUDA_CHECK(cudaMalloc(&mma_bufred_d, nb * sizeof(real)));
    }
    real* temp;
    CUDA_CHECK(cudaMalloc(&temp, (*n) * (*m) * sizeof(real)));
    relambda_kernel<real><<<nblcks, nthrds, 0, stream>>>(temp, (real*)x,
      (real*)xupp, (real*)xlow, (real*)pij, (real*)qij, *n, *m);
    CUDA_CHECK(cudaGetLastError());
    for (int i = 0; i < (*m); i++) {
      mmasum_kernel<real><<<nblcks, nthrds, 0, stream>>>
        (temp, mma_bufred_d, (*n), (*m), i);
      CUDA_CHECK(cudaGetLastError());
      mmareduce_kernel<real><<<1, 1024, 0, stream>>>(mma_bufred_d, nb);
      CUDA_CHECK(cudaGetLastError());
      mma_copy_kernel<<<1, 1, 0, stream>>>((real*)relambda, mma_bufred_d, 1, i);
      CUDA_CHECK(cudaGetLastError());
      cudaStreamSynchronize(stream);
    }
    CUDA_CHECK(cudaFree(temp));
  }

  void cuda_sub2cons2(void* a, void* b, void* c, void* d, real* e, int* n) {
    const dim3 nthrds(1024, 1, 1);
    const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);

    sub2cons2_kernel<real><<<nblcks, nthrds, 0, (cudaStream_t)glb_cmd_queue>>>(
      (real*)a, (real*)b, (real*)c, (real*)d, *e, *n);
    CUDA_CHECK(cudaGetLastError());
  }

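  /*
   * cuda_maxval and the related cuda_maxval2/cuda_maxval3 below follow the
   * same two-stage pattern: a grid-wide kernel writes one partial maximum
   * per block into mma_bufred_d, max_reduce_kernel collapses those partials
   * on a single block, and the final scalar is copied back to the pinned
   * host buffer and returned.
   */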
  real cuda_maxval(void* a, int* n) {
    const dim3 nthrds(1024, 1, 1);
    const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
    const int nb = ((*n) + 1024 - 1) / 1024;
    const cudaStream_t stream = (cudaStream_t)glb_cmd_queue;

    if (nb > mma_red_s) {
      mma_red_s = nb;
      if (mma_bufred != NULL) {
        CUDA_CHECK(cudaFreeHost(mma_bufred));
        CUDA_CHECK(cudaFree(mma_bufred_d));
      }
      CUDA_CHECK(cudaMallocHost(&mma_bufred, nb * sizeof(real)));
      CUDA_CHECK(cudaMalloc(&mma_bufred_d, nb * sizeof(real)));
    }

    maxval_kernel<real><<<nblcks, nthrds, 0, stream>>>(
      (real*)a, mma_bufred_d, (*n));
    CUDA_CHECK(cudaGetLastError());

    max_reduce_kernel<real><<<1, 1024, 0, stream>>>(mma_bufred_d, nb);
    CUDA_CHECK(cudaGetLastError());

    CUDA_CHECK(cudaMemcpyAsync(mma_bufred, mma_bufred_d, sizeof(real),
                               cudaMemcpyDeviceToHost, stream));
    cudaStreamSynchronize(stream);

    return mma_bufred[0];
  }

  void cuda_delx(void* delx, void* x, void* xlow, void* xupp, void* pij,
                 void* qij, void* p0j, void* q0j, void* alpha, void* beta,
                 void* lambda, real* epsi, int* n, int* m) {
    const dim3 nthrds(1024, 1, 1);
    const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);

    delx_kernel<real><<<nblcks, nthrds, 0, (cudaStream_t)glb_cmd_queue>>>(
      (real*)delx, (real*)x, (real*)xlow, (real*)xupp, (real*)pij,
      (real*)qij, (real*)p0j, (real*)q0j, (real*)alpha, (real*)beta,
      (real*)lambda, *epsi, *n, *m);
    CUDA_CHECK(cudaGetLastError());
  }

  void cuda_GG(void* GG, void* x, void* xlow, void* xupp,
               void* pij, void* qij, int* n, int* m) {
    const dim3 nthrds(1024, 1, 1);
    const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
    GG_kernel<real><<<nblcks, nthrds, 0, (cudaStream_t)glb_cmd_queue>>>
      ((real*)GG, (real*)x, (real*)xlow, (real*)xupp,
       (real*)pij, (real*)qij, *n, *m);
    CUDA_CHECK(cudaGetLastError());
  }

  void cuda_diagx(void* diagx, void* x, void* xsi, void* xlow, void* xupp,
                  void* p0j, void* q0j, void* pij, void* qij, void* alpha,
                  void* beta, void* eta, void* lambda, int *n, int *m) {
    const dim3 nthrds(1024, 1, 1);
    const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
    diagx_kernel<real><<<nblcks, nthrds, 0, (cudaStream_t)glb_cmd_queue>>>
      ((real*)diagx, (real*)x, (real*)xsi, (real*)xlow,
       (real*)xupp, (real*)p0j, (real*)q0j, (real*)pij, (real*)qij,
       (real*)alpha, (real*)beta, (real*)eta, (real*)lambda, *n, *m);
    CUDA_CHECK(cudaGetLastError());
  }

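  /*
   * cuda_bb and cuda_AA assemble, entry by entry, the right-hand side bb and
   * the (m+1) x (m+1) coefficient matrix AA, presumably of the reduced
   * linear system solved in each interior-point iteration. AA is addressed
   * with leading dimension m+1 (offset i + j*(m+1)). Each entry is a
   * reduction over the n design variables of GG, delx and diagx, reusing the
   * shared reduction buffers above.
   */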
  void cuda_bb(void* bb, void* GG, void* delx, void* diagx, int *n, int *m) {
    const dim3 nthrds(1024, 1, 1);
    const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
    const int nb = ((*n) + 1024 - 1) / 1024;
    const cudaStream_t stream = (cudaStream_t) glb_cmd_queue;
    cudaStreamSynchronize(stream);
    if (nb > mma_red_s) {
      mma_red_s = nb;
      if (mma_bufred != NULL) {
        CUDA_CHECK(cudaFreeHost(mma_bufred));
        CUDA_CHECK(cudaFree(mma_bufred_d));
      }
      CUDA_CHECK(cudaMallocHost(&mma_bufred, nb * sizeof(real)));
      CUDA_CHECK(cudaMalloc(&mma_bufred_d, nb * sizeof(real)));
    }
    for (int i = 0; i < (*m); i++) {
      mmasumbb_kernel<real><<<nblcks, nthrds, 0, stream>>>
        ((real*)GG, (real*)delx, (real*)diagx, mma_bufred_d, (*n), (*m), i);
      CUDA_CHECK(cudaGetLastError());
      mmareduce_kernel<real><<<1, 1024, 0, stream>>>(mma_bufred_d, nb);
      CUDA_CHECK(cudaGetLastError());
      mma_copy_kernel<<<1, 1, 0, stream>>>((real*)bb, mma_bufred_d, 1, i);
      CUDA_CHECK(cudaGetLastError());
      cudaStreamSynchronize(stream);
    }
  }

  void cuda_AA(void* AA, void* GG, void* diagx, int *n, int *m) {
    const dim3 nthrds(1024, 1, 1);
    const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
    const int nb = ((*n) + 1024 - 1) / 1024;
    const cudaStream_t stream = (cudaStream_t) glb_cmd_queue;
    cudaStreamSynchronize(stream);
    if (nb > mma_red_s) {
      mma_red_s = nb;
      if (mma_bufred != NULL) {
        CUDA_CHECK(cudaFreeHost(mma_bufred));
        CUDA_CHECK(cudaFree(mma_bufred_d));
      }
      CUDA_CHECK(cudaMallocHost(&mma_bufred, nb * sizeof(real)));
      CUDA_CHECK(cudaMalloc(&mma_bufred_d, nb * sizeof(real)));
    }
    for (int i = 0; i < (*m); i++) {
      for (int j = 0; j < (*m); j++) {
        mmasumAA_kernel<real><<<nblcks, nthrds, 0, stream>>>
          ((real*)GG, (real*)diagx, mma_bufred_d, (*n), (*m), i, j);
        CUDA_CHECK(cudaGetLastError());
        mmareduce_kernel<real><<<1, 1024, 0, stream>>>(mma_bufred_d, nb);
        CUDA_CHECK(cudaGetLastError());
        mma_copy_kernel<<<1, 1, 0, stream>>>((real*)AA, mma_bufred_d, 1,
                                             i + j * (*m + 1));
        CUDA_CHECK(cudaGetLastError());
        cudaStreamSynchronize(stream);
      }
    }
  }

  void cuda_dx(void* dx, void* delx, void* diagx, void* GG, void* dlambda,
               int* n, int* m) {
    const dim3 nthrds(1024, 1, 1);
    const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
    dx_kernel<real><<<nblcks, nthrds, 0, (cudaStream_t)glb_cmd_queue>>>
      ((real*)dx, (real*)delx, (real*)diagx, (real*)GG, (real*)dlambda,
       *n, *m);
    CUDA_CHECK(cudaGetLastError());
  }

  void cuda_dxsi(void* dxsi, void* xsi, void* dx, void* x,
                 void* alpha, real* epsi, int* n) {
    const dim3 nthrds(1024, 1, 1);
    const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
    dxsi_kernel<real><<<nblcks, nthrds, 0, (cudaStream_t)glb_cmd_queue>>>
      ((real*)dxsi, (real*)xsi, (real*)dx, (real*)x,
       (real*)alpha, *epsi, *n);
    CUDA_CHECK(cudaGetLastError());
  }

  void cuda_deta(void* deta, void* eta, void* dx, void* x,
                 void* beta, real* epsi, int* n) {
    const dim3 nthrds(1024, 1, 1);
    const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
    deta_kernel<real><<<nblcks, nthrds, 0, (cudaStream_t)glb_cmd_queue>>>
      ((real*)deta, (real*)eta, (real*)dx, (real*)x,
       (real*)beta, *epsi, *n);
    CUDA_CHECK(cudaGetLastError());
  }

  void cuda_rex(void* rex, void* x, void* xlow, void* xupp, void* pij,
                void* p0j, void* qij, void* q0j, void* lambda, void* xsi,
                void* eta, int* n, int* m) {
    const dim3 nthrds(1024, 1, 1);
    const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
    RexCalculation_kernel<real><<<nblcks, nthrds, 0,
      (cudaStream_t)glb_cmd_queue>>> ((real*)rex, (real*)x, (real*)xlow,
      (real*)xupp, (real*)pij, (real*)p0j, (real*)qij, (real*)q0j,
      (real*)lambda, (real*)xsi, (real*)eta, *n, *m);
    CUDA_CHECK(cudaGetLastError());
  }

  void cuda_rey(void* rey, void* c, void* d, void* y, void* lambda, void* mu,
                int* n) {
    const dim3 nthrds(1024, 1, 1);
    const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
    rey_calculation_kernel<real><<<nblcks, nthrds, 0,
      (cudaStream_t)glb_cmd_queue>>> ((real*)rey, (real*)c,
      (real*)d, (real*)y, (real*)lambda, (real*)mu, *n);
    CUDA_CHECK(cudaGetLastError());
  }

  void cuda_sub2cons(void * a, void * b, void * c, real *d, int * n) {
    const dim3 nthrds(1024, 1, 1);
    const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
    sub2cons_kernel<real><<<nblcks, nthrds, 0, (cudaStream_t)glb_cmd_queue>>>
      ((real *) a, (real *) b, (real *) c, *d, *n);
    CUDA_CHECK(cudaGetLastError());
  }

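  /*
   * cuda_norm reduces the output of norm_kernel over all n entries of a and
   * returns the resulting scalar on the host; whether this is the 2-norm or
   * its square depends on norm_kernel, which is defined in mma_kernel.h.
   */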
  real cuda_norm(void* a, int* n) {
    const dim3 nthrds(1024, 1, 1);
    const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
    const int nb = ((*n) + 1024 - 1) / 1024;
    const cudaStream_t stream = (cudaStream_t) glb_cmd_queue;
    if (nb > mma_red_s) {
      mma_red_s = nb;
      if (mma_bufred != NULL) {
        CUDA_CHECK(cudaFreeHost(mma_bufred));
        CUDA_CHECK(cudaFree(mma_bufred_d));
      }
      CUDA_CHECK(cudaMallocHost(&mma_bufred, nb * sizeof(real)));
      CUDA_CHECK(cudaMalloc(&mma_bufred_d, nb * sizeof(real)));
    }
    norm_kernel<real><<<nblcks, nthrds, 0, stream>>>
      ((real*)a, mma_bufred_d, (*n));
    CUDA_CHECK(cudaGetLastError());
    mmareduce_kernel<real><<<1, 1024, 0, stream>>>(mma_bufred_d, nb);
    CUDA_CHECK(cudaGetLastError());
    CUDA_CHECK(cudaMemcpyAsync(mma_bufred, mma_bufred_d, sizeof(real),
                               cudaMemcpyDeviceToHost, stream));
    cudaStreamSynchronize(stream);
    return mma_bufred[0];
  }

  void cuda_dely(void* dely, void* c, void* d, void* y, void* lambda,
                 real* epsi, int* n) {
    const dim3 nthrds(1024, 1, 1);
    const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
    dely_kernel<real><<<nblcks, nthrds, 0, (cudaStream_t)glb_cmd_queue>>>
      ((real*)dely, (real*)c, (real*)d, (real*)y, (real*)lambda, *epsi, *n);
    CUDA_CHECK(cudaGetLastError());
  }

  real cuda_maxval2(void* a, void* b, real* cons, int* n) {
    const dim3 nthrds(1024, 1, 1);
    const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
    const int nb = ((*n) + 1024 - 1) / 1024;
    const cudaStream_t stream = (cudaStream_t) glb_cmd_queue;
    if (nb > mma_red_s) {
      mma_red_s = nb;
      if (mma_bufred != NULL) {
        CUDA_CHECK(cudaFreeHost(mma_bufred));
        CUDA_CHECK(cudaFree(mma_bufred_d));
      }
      CUDA_CHECK(cudaMallocHost(&mma_bufred, nb * sizeof(real)));
      CUDA_CHECK(cudaMalloc(&mma_bufred_d, nb * sizeof(real)));
    }
    maxval2_kernel<real><<<nblcks, nthrds, 0, stream>>>
      ((real*)a, (real*)b, mma_bufred_d, *cons, *n);
    CUDA_CHECK(cudaGetLastError());
    max_reduce_kernel<real><<<1, 1024, 0, stream>>>(mma_bufred_d, nb);
    CUDA_CHECK(cudaGetLastError());
    CUDA_CHECK(cudaMemcpyAsync(mma_bufred, mma_bufred_d, sizeof(real),
                               cudaMemcpyDeviceToHost, stream));
    cudaStreamSynchronize(stream);
    return mma_bufred[0];
  }

  real cuda_maxval3(void* a, void* b, void* c, real* cons, int* n) {
    const dim3 nthrds(1024, 1, 1);
    const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
    const int nb = ((*n) + 1024 - 1) / 1024;
    const cudaStream_t stream = (cudaStream_t) glb_cmd_queue;
    if (nb > mma_red_s) {
      mma_red_s = nb;
      if (mma_bufred != NULL) {
        CUDA_CHECK(cudaFreeHost(mma_bufred));
        CUDA_CHECK(cudaFree(mma_bufred_d));
      }
      CUDA_CHECK(cudaMallocHost(&mma_bufred, nb * sizeof(real)));
      CUDA_CHECK(cudaMalloc(&mma_bufred_d, nb * sizeof(real)));
    }
    maxval3_kernel<real><<<nblcks, nthrds, 0, stream>>>
      ((real*)a, (real*)b, (real*)c, mma_bufred_d, *cons, *n);
    CUDA_CHECK(cudaGetLastError());
    max_reduce_kernel<real><<<1, 1024, 0, stream>>>(mma_bufred_d, nb);
    CUDA_CHECK(cudaGetLastError());
    CUDA_CHECK(cudaMemcpyAsync(mma_bufred, mma_bufred_d, sizeof(real),
                               cudaMemcpyDeviceToHost, stream));
    cudaStreamSynchronize(stream);
    return mma_bufred[0];
  }

  void cuda_kkt_rex(void* rex, void* df0dx, void* dfdx, void* xsi,
                    void* eta, void* lambda, int* n, int* m) {
    const dim3 nthrds(1024, 1, 1);
    const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
    kkt_rex_kernel<real><<<nblcks, nthrds, 0, (cudaStream_t)glb_cmd_queue>>>
      ((real*)rex, (real*)df0dx, (real*)dfdx, (real*)xsi,
       (real*)eta, (real*)lambda, *n, *m);
    CUDA_CHECK(cudaGetLastError());
  }

  void cuda_maxcons(void* a, real* b, real* c, void* d, int* n) {
    const dim3 nthrds(1024, 1, 1);
    const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
    maxcons_kernel<real><<<nblcks, nthrds, 0, (cudaStream_t)glb_cmd_queue>>>
      ((real*)a, *b, *c, (real*)d, *n);
    CUDA_CHECK(cudaGetLastError());
  }

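  /*
   * cuda_lcsc2 computes the local (per-rank) scalar product of two device
   * vectors with glsc2_kernel followed by the same block reduction and host
   * copy used elsewhere in this file; any global (MPI) sum is left to the
   * caller, e.g. via cuda_mpisum below.
   */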
  real cuda_lcsc2(void *a, void *b, int *n) {
    const dim3 nthrds(1024, 1, 1);
    const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
    const int nb = ((*n) + 1024 - 1) / 1024;
    const cudaStream_t stream = (cudaStream_t) glb_cmd_queue;
    if (nb > mma_red_s) {
      mma_red_s = nb;
      if (mma_bufred != NULL) {
        CUDA_CHECK(cudaFreeHost(mma_bufred));
        CUDA_CHECK(cudaFree(mma_bufred_d));
      }
      CUDA_CHECK(cudaMallocHost(&mma_bufred, nb * sizeof(real)));
      CUDA_CHECK(cudaMalloc(&mma_bufred_d, nb * sizeof(real)));
    }
    glsc2_kernel<real><<<nblcks, nthrds, 0, stream>>>
      ((real*)a, (real*)b, mma_bufred_d, (*n));
    CUDA_CHECK(cudaGetLastError());
    mmareduce_kernel<real><<<1, 1024, 0, stream>>>(mma_bufred_d, nb);
    CUDA_CHECK(cudaGetLastError());
    CUDA_CHECK(cudaMemcpyAsync(mma_bufred, mma_bufred_d, sizeof(real),
                               cudaMemcpyDeviceToHost, stream));
    cudaStreamSynchronize(stream);
    return mma_bufred[0];
  }

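  /*
   * cuda_mpisum performs an in-place sum-allreduce of a device array across
   * MPI ranks when built with device-aware MPI (HAVE_DEVICE_MPI); otherwise
   * it is a no-op. The stream is synchronised first so the data being
   * reduced is complete.
   */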
  void cuda_mpisum(void *a, int *n) {
#ifdef HAVE_DEVICE_MPI
    real* temp = (real*)a;
    const cudaStream_t stream = (cudaStream_t) glb_cmd_queue;
    cudaStreamSynchronize(stream);
    device_mpi_allreduce_inplace(temp, *n, sizeof(real), DEVICE_MPI_SUM);
#endif
  }

  void cuda_add2inv2(void* a, void *b, real* c, int* n) {
    const dim3 nthrds(1024, 1, 1);
    const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
    add2inv2_kernel<real><<<nblcks, nthrds, 0, (cudaStream_t)glb_cmd_queue>>>
      ((real*)a, (real*)b, *c, *n);
    CUDA_CHECK(cudaGetLastError());
  }

  void cuda_max2(void* a, real* b, void* c, real* d, int* n) {
    const dim3 nthrds(1024, 1, 1);
    const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
    max2_kernel<real><<<nblcks, nthrds, 0, (cudaStream_t)glb_cmd_queue>>>
      ((real*)a, *b, (real*)c, *d, *n);
    CUDA_CHECK(cudaGetLastError());
  }

  void cuda_updatebb(void* bb, void* dellambda, void* dely, void* d,
                     void* mu, void* y, real* delz, int *m) {
    const dim3 nthrds(1024, 1, 1);
    const dim3 nblcks(((*m + 1) + 1024 - 1) / 1024, 1, 1);
    updatebb_kernel<real><<<nblcks, nthrds, 0, (cudaStream_t)glb_cmd_queue>>>
      ((real*) bb, (real*) dellambda, (real*) dely, (real*) d,
       (real*) mu, (real*) y, *delz, *m);
    CUDA_CHECK(cudaGetLastError());
  }

  void cuda_updateAA(void* AA, void* globaltmp_mm, void* s, void* lambda,
                     void* d, void* mu, void* y, void* a, real* zeta, real* z,
                     int* m) {
    const dim3 nthrds(1024, 1, 1);
    const dim3 nblcks(((*m + 1) + 1024 - 1) / 1024, 1, 1);
    updateAA_kernel<real><<<nblcks, nthrds, 0, (cudaStream_t)glb_cmd_queue>>>
      ((real*) AA, (real*) globaltmp_mm, (real*) s, (real*) lambda, (real*) d,
       (real*) mu, (real*) y, (real*) a, *zeta, *z, *m);
    CUDA_CHECK(cudaGetLastError());
  }

  void cuda_dy(void* dy, void* dely, void* dlambda, void* d, void* mu,
               void* y, int* n) {
    const dim3 nthrds(1024, 1, 1);
    const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
    dy_kernel<real><<<nblcks, nthrds, 0, (cudaStream_t)glb_cmd_queue>>>
      ((real*) dy, (real*) dely, (real*) dlambda, (real*) d,
       (real*) mu, (real*) y, *n);
    CUDA_CHECK(cudaGetLastError());
  }
} /* extern "C" */