Neko-TOP
A portable framework for high-order spectral element flow topology optimization.
mma.cu
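CUDA backend for the MMA (Method of Moving Asymptotes) optimizer: C wrappers that launch the device kernels declared in the local mma_kernel.h on Neko's global command queue (glb_cmd_queue).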
/*
 Copyright (c) 2021-2025, The Neko Authors
 All rights reserved.

 Redistribution and use in source and binary forms, with or without
 modification, are permitted provided that the following conditions
 are met:

   * Redistributions of source code must retain the above copyright
     notice, this list of conditions and the following disclaimer.

   * Redistributions in binary form must reproduce the above
     copyright notice, this list of conditions and the following
     disclaimer in the documentation and/or other materials provided
     with the distribution.

   * Neither the name of the authors nor the names of its
     contributors may be used to endorse or promote products derived
     from this software without specific prior written permission.

 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
 COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
 ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 POSSIBILITY OF SUCH DAMAGE.
*/

// System includes
#include <stdio.h>
#include <stdlib.h>

// Device includes
#include <cuda_runtime.h>
#include <cusolverDn.h>

// Neko includes
#include <neko/device/device_config.h>
#include <neko/device/cuda/check.h>
#include <neko/math/bcknd/device/device_mpi_reduce.h>
#include <neko/math/bcknd/device/device_mpi_op.h>

// Local includes
#include "mma_kernel.h"


extern "C" {

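  /*
   * Scratch space shared by the reduction wrappers in this file. The buffers
   * are grown on demand to one entry per thread block (mma_red_s tracks the
   * current capacity): mma_bufred is pinned host memory used to read a
   * single result back, and mma_bufred_d holds the per-block partial
   * results on the device.
   */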
  int mma_red_s = 0;
  real* mma_bufred = NULL;
  real* mma_bufred_d = NULL;

  void mma_prepare_aa_matrix_cuda(void* AA, void* s, void* lambda,
                                  void* d, void* mu, void* y,
                                  void* a, real* zeta, real* z, int* m) {
    const dim3 nthrds(1024, 1, 1);
    const dim3 nblcks(((*m) + 1024 - 1) / 1024, 1, 1);

    // Launch kernel to prepare AA matrix
    mma_prepare_aa_matrix_kernel<real><<<nblcks, nthrds, 0,
        (cudaStream_t)glb_cmd_queue>>>(
        (real*)AA, (real*)s, (real*)lambda, (real*)d,
        (real*)mu, (real*)y, (real*)a,
        *zeta, *z, *m);

    CUDA_CHECK(cudaGetLastError());
  }

  void mma_prepare_hessian_cuda(void* Hess, void* y, void* d,
                                void* mu, void* lambda, int* m) {
    const int M = *m;
    const dim3 nthrds(1024, 1, 1);
    const dim3 nblcks((M + 1024 - 1) / 1024, 1, 1);
    const cudaStream_t stream = (cudaStream_t)glb_cmd_queue;

    // Update diagonal elements
    mma_update_hessian_diagonal_kernel<real><<<nblcks, nthrds, 0, stream>>>(
        (real*)Hess, (real*)y, (real*)d, (real*)mu, (real*)lambda, M);
    CUDA_CHECK(cudaGetLastError());

    // Synchronize to ensure diagonal updates are complete
    CUDA_CHECK(cudaStreamSynchronize(stream));

    // Choose kernel based on problem size
    if (M <= 1024) {
      // Single-block version (fast for small m)
      const dim3 stab_nblcks(1, 1, 1);
      mma_stabilize_hessian_single_kernel<real><<<stab_nblcks, nthrds, 0, stream>>>(
          (real*)Hess, M);
      CUDA_CHECK(cudaGetLastError());
    } else {
      // Multi-block version (for large m)
      // Compute trace on host (simple and reliable)
      real* h_Hess = (real*)malloc(M * sizeof(real));

      // Extract diagonal elements
      for (int i = 0; i < M; i++) {
        CUDA_CHECK(cudaMemcpyAsync(&h_Hess[i],
                                   (real*)Hess + i * M + i,
                                   sizeof(real),
                                   cudaMemcpyDeviceToHost, stream));
      }
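      // Note: the per-element loop above is kept simple; the same diagonal
      // could be fetched in one strided transfer instead, e.g. (a sketch,
      // not used here):
      //   cudaMemcpy2DAsync(h_Hess, sizeof(real),
      //                     (real*)Hess, (M + 1) * sizeof(real),
      //                     sizeof(real), M,
      //                     cudaMemcpyDeviceToHost, stream);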
      CUDA_CHECK(cudaStreamSynchronize(stream));

      // Compute trace and LM factor
      real trace = 0.0;
      for (int i = 0; i < M; i++) {
        trace += h_Hess[i];
      }
      real lm_factor = fmax(-1.0e-4 * trace / M, 1.0e-7);

      // Apply stabilization in parallel
      mma_stabilize_hessian_multi_kernel<real><<<nblcks, nthrds, 0, stream>>>(
          (real*)Hess, lm_factor, M);
      CUDA_CHECK(cudaGetLastError());

      free(h_Hess);
    }
  }

  void cuda_custom_solver(void* A, void* b, int n, int* info) {
    const cudaStream_t stream = (cudaStream_t)glb_cmd_queue;

    if (n <= 0) {
      *info = -1; // Use CPU fallback
      return;
    }
    const dim3 nthrds(1024, 1, 1);
    const dim3 nblcks(1, 1, 1);

    mma_small_lu_kernel<real><<<nblcks, nthrds, 0, stream>>>(
        (real*)A, (real*)b, n);

    cudaError_t err = cudaGetLastError();
    if (err == cudaSuccess) {
      *info = 0;  // GPU solver succeeded
    } else {
      *info = -1; // GPU failed
    }
  }

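  /*
   * Dense solve of A x = b via cuSOLVER: LU factorization (getrf) followed
   * by a triangular solve (getrs). The wrapper calls the double-precision
   * entry points directly, so it assumes `real` is double; only the
   * workspace query and the factorization/solve info values are checked.
   */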
  void cuSOLVER_wrapper(void* A, void* b, int n, int* jj) {
    cusolverDnHandle_t handle;
    cusolverStatus_t status;
    cusolverDnCreate(&handle);

    int lwork;
    double* workspace;
    int* ipiv;
    int* info;         // Device pointer for cuSOLVER info
    int host_info = 0; // Host variable to store the info

    // Workspace query
    status = cusolverDnDgetrf_bufferSize(handle, n, n, (double*)A, n, &lwork);
    if (status != CUSOLVER_STATUS_SUCCESS) {
      exit(EXIT_FAILURE);
    }
    cudaMalloc(&workspace, lwork * sizeof(double));
    cudaMalloc(&ipiv, n * sizeof(int));
    cudaMalloc(&info, sizeof(int));

    // LU factorization and solve
    cusolverDnDgetrf(handle, n, n, (double*)A, n, workspace, ipiv, info);

    // Copy info from device to host to check if factorization succeeded
    cudaMemcpy(&host_info, info, sizeof(int), cudaMemcpyDeviceToHost);

    if (host_info == 0) {
      // Only solve if factorization was successful
      cusolverDnDgetrs(handle, CUBLAS_OP_N, n, 1, (double*)A, n, ipiv,
                       (double*)b, n, info);
      // Copy the final info value
      cudaMemcpy(&host_info, info, sizeof(int), cudaMemcpyDeviceToHost);
    }

    // Return the actual info value through jj
    *jj = host_info;

    // Cleanup
    cudaFree(workspace);
    cudaFree(ipiv);
    cudaFree(info);
    cusolverDnDestroy(handle);
  }

  void custom_solve_linear_system(void* A, void* b, int n, int* info) {
    const cudaStream_t stream = (cudaStream_t)glb_cmd_queue;

    if (n <= 0) {
      *info = -1; // Use CPU fallback
      return;
    }
    const dim3 nthrds(1024, 1, 1);
    const dim3 nblcks(1, 1, 1);

    mma_small_lu_kernel<real><<<nblcks, nthrds, 0, stream>>>(
        (real*)A, (real*)b, n);

    cudaError_t err = cudaGetLastError();
    if (err == cudaSuccess) {
      *info = 0;  // GPU solver succeeded
    } else {
      *info = -1; // GPU failed, use CPU fallback
    }
  }

  void delta_1dbeam_cuda(void* Delta, real* L_total, real* Le,
                         int* offset, int* n) {
    const dim3 nthrds(1024, 1, 1);
    const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
    delta_1dbeam_kernel<real><<<nblcks, nthrds, 0, (cudaStream_t)glb_cmd_queue>>>
        ((real*)Delta, *L_total, *Le, *offset, *n);
    CUDA_CHECK(cudaGetLastError());
  }

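  /*
   * cuda_Hess, cuda_relambda, cuda_bb, cuda_AA and mma_gensub4_cuda all use
   * the same reduction pattern: a kernel writes one partial sum per block
   * into mma_bufred_d, mmareduce_kernel collapses the partials in a single
   * block, and the resulting scalar is copied into the destination array at
   * the required index (via mma_copy_kernel or a device-to-device memcpy).
   */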
  void cuda_Hess(void* Hess, void* hijx, void* Ljjxinv, int *n, int *m) {
    const dim3 nthrds(1024, 1, 1);
    const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
    const int nb = ((*n) + 1024 - 1) / 1024;
    const cudaStream_t stream = (cudaStream_t) glb_cmd_queue;
    cudaStreamSynchronize(stream);
    if (nb > mma_red_s) {
      mma_red_s = nb;
      if (mma_bufred != NULL) {
        CUDA_CHECK(cudaFreeHost(mma_bufred));
        CUDA_CHECK(cudaFree(mma_bufred_d));
      }
      CUDA_CHECK(cudaMallocHost(&mma_bufred, nb * sizeof(real)));
      CUDA_CHECK(cudaMalloc(&mma_bufred_d, nb * sizeof(real)));
    }
    for (int i = 0; i < (*m); i++) {
      for (int j = 0; j < (*m); j++) {
        mmasumHess_kernel<real><<<nblcks, nthrds, 0, stream>>>
            ((real*)hijx, (real*)Ljjxinv, mma_bufred_d, (*n), (*m), i, j);
        CUDA_CHECK(cudaGetLastError());
        mmareduce_kernel<real><<<1, 1024, 0, stream>>>(mma_bufred_d, nb);
        CUDA_CHECK(cudaGetLastError());
        mma_copy_kernel<<<1, 1, 0, stream>>>((real*)Hess, mma_bufred_d, 1,
                                             i + j * (*m));
        CUDA_CHECK(cudaGetLastError());
        cudaStreamSynchronize(stream);
      }
    }
  }

  void mma_Ljjxinv_cuda(void* Ljjxinv, void* pjlambda, void* qjlambda, void* x,
                        void* low, void* upp, void* alpha, void* beta, int* n) {
    const dim3 nthrds(1024, 1, 1);
    const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
    mma_Ljjxinv_kernel<real><<<nblcks, nthrds, 0, (cudaStream_t)glb_cmd_queue>>>
        ((real*)Ljjxinv, (real*)pjlambda, (real*)qjlambda, (real*)x, (real*)low,
         (real*)upp, (real*)alpha, (real*)beta, *n);
    CUDA_CHECK(cudaGetLastError());
  }

  void mma_dipsolvesub1_cuda(void* x, void* pjlambda, void* qjlambda,
                             void* low, void* upp, void* alpha, void* beta,
                             int* n) {
    const dim3 nthrds(1024, 1, 1);
    const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
    mma_dipsolvesub1_kernel<real><<<nblcks, nthrds, 0, (cudaStream_t)glb_cmd_queue>>>
        ((real*)x, (real*)pjlambda, (real*)qjlambda, (real*)low, (real*)upp,
         (real*)alpha, (real*)beta, *n);
    CUDA_CHECK(cudaGetLastError());
  }

  void mattrans_v_mul_cuda(void* output, void* pij, void* lambda,
                           int* m, int* n) {
    const dim3 nthrds(1024, 1, 1);
    const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
    mattrans_v_mul_kernel<real><<<nblcks, nthrds, 0, (cudaStream_t)glb_cmd_queue>>>
        ((real*)output, (real*)pij, (real*)lambda, *m, *n);
    CUDA_CHECK(cudaGetLastError());
  }

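  /*
   * The mma_gensub1..4 wrappers assemble the MMA subproblem data on the
   * device: the moving asymptotes low/upp, the bounds alpha/beta, the
   * approximation coefficients p0j/q0j/pij/qij and the constraint
   * constants bi.
   */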
  void mma_gensub4_cuda(const void* x, const void* low, const void* upp,
                        const void* pij, const void* qij,
                        const int* n, const int* m, void* bi) {

    const int N = *n;
    const int M = *m;

    const dim3 nthrds(1024, 1, 1);
    const dim3 nblcks((N + 1023) / 1024, 1, 1);
    const int nb = (N + 1023) / 1024;
    const cudaStream_t stream = (cudaStream_t)glb_cmd_queue;

    if (nb > mma_red_s) {
      mma_red_s = nb;

      if (mma_bufred != NULL) {
        CUDA_CHECK(cudaFreeHost(mma_bufred));
        CUDA_CHECK(cudaFree(mma_bufred_d));
      }

      CUDA_CHECK(cudaMallocHost(&mma_bufred, nb * sizeof(real)));
      CUDA_CHECK(cudaMalloc(&mma_bufred_d, nb * sizeof(real)));
    }

    real* temp;
    real* bi_d = static_cast<real*>(bi);
    CUDA_CHECK(cudaMalloc(&temp, M * N * sizeof(real)));

    mma_sub4_kernel<real><<<nblcks, nthrds, 0, stream>>>(
        static_cast<const real*>(x),
        static_cast<const real*>(low),
        static_cast<const real*>(upp),
        static_cast<const real*>(pij),
        static_cast<const real*>(qij),
        temp, N, M);

    for (int i = 0; i < M; ++i) {
      mmasum_kernel<real><<<nblcks, nthrds, 0, stream>>>(
          temp, mma_bufred_d, N, M, i);
      CUDA_CHECK(cudaGetLastError());

      mmareduce_kernel<real><<<1, 1024, 0, stream>>>(mma_bufred_d, nb);
      CUDA_CHECK(cudaGetLastError());

      CUDA_CHECK(cudaMemcpyAsync(
          bi_d + i, mma_bufred_d, sizeof(real),
          cudaMemcpyDeviceToDevice, stream));

      CUDA_CHECK(cudaStreamSynchronize(stream));
    }

    CUDA_CHECK(cudaFree(temp));
  }

  void mma_gensub3_cuda(void* x, void* df0dx, void* dfdx, void* low,
                        void* upp, void* xmin, void* xmax, void* alpha,
                        void* beta, void* p0j, void* q0j, void* pij,
                        void* qij, int* n, int* m) {
    const dim3 nthrds(1024, 1, 1);
    const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);

    mma_sub3_kernel<real><<<nblcks, nthrds, 0,
        (cudaStream_t)glb_cmd_queue>>>(
        (real*)x, (real*)df0dx, (real*)dfdx, (real*)low,
        (real*)upp, (real*)xmin, (real*)xmax, (real*)alpha,
        (real*)beta, (real*)p0j, (real*)q0j, (real*)pij,
        (real*)qij, *n, *m);

    CUDA_CHECK(cudaGetLastError());
  }

  void mma_gensub2_cuda(void* low, void* upp, void* x, void* xold1,
                        void* xold2, void* xdiff, real* asydecr,
                        real* asyincr, int* n) {
    const dim3 nthrds(1024, 1, 1);
    const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);

    mma_sub2_kernel<real><<<nblcks, nthrds, 0,
        (cudaStream_t)glb_cmd_queue>>>(
        (real*)low, (real*)upp, (real*)x, (real*)xold1,
        (real*)xold2, (real*)xdiff, *asydecr, *asyincr, *n);

    CUDA_CHECK(cudaGetLastError());
  }

  void mma_gensub1_cuda(void* low, void* upp, void* x, void* xmin, void* xmax,
                        real* asyinit, int* n) {
    const dim3 nthrds(1024, 1, 1);
    const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
    mma_sub1_kernel<real><<<nblcks, nthrds, 0, (cudaStream_t)glb_cmd_queue>>>
        ((real*)low, (real*)upp, (real*)x, (real*)xmin, (real*)xmax,
         *asyinit, *n);
    CUDA_CHECK(cudaGetLastError());
  }

  void cuda_mma_max(void* xsi, void* x, void* alpha, int* n) {
    const dim3 nthrds(1024, 1, 1);
    const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);

    mma_max2_kernel<real><<<nblcks, nthrds, 0, (cudaStream_t)glb_cmd_queue>>>
        ((real*)xsi, (real*)x, (real*)alpha, *n);
    CUDA_CHECK(cudaGetLastError());
  }

  void cuda_relambda(void* relambda, void* x, void* xupp, void* xlow,
                     void* pij, void* qij, int* n, int* m) {
    const dim3 nthrds(1024, 1, 1);
    const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
    const int nb = ((*n) + 1024 - 1) / 1024;
    const cudaStream_t stream = (cudaStream_t) glb_cmd_queue;

    if (nb > mma_red_s) {
      mma_red_s = nb;
      if (mma_bufred != NULL) {
        CUDA_CHECK(cudaFreeHost(mma_bufred));
        CUDA_CHECK(cudaFree(mma_bufred_d));
      }
      CUDA_CHECK(cudaMallocHost(&mma_bufred, nb * sizeof(real)));
      CUDA_CHECK(cudaMalloc(&mma_bufred_d, nb * sizeof(real)));
    }
    real* temp;
    cudaMalloc(&temp, (*n) * (*m) * sizeof(real));
    relambda_kernel<real><<<nblcks, nthrds, 0, stream>>>(temp, (real*)x,
        (real*)xupp, (real*)xlow, (real*)pij, (real*)qij, *n, *m);
    for (int i = 0; i < (*m); i++) {
      mmasum_kernel<real><<<nblcks, nthrds, 0, stream>>>
          (temp, mma_bufred_d, (*n), (*m), i);
      CUDA_CHECK(cudaGetLastError());
      mmareduce_kernel<real><<<1, 1024, 0, stream>>>(mma_bufred_d, nb);
      CUDA_CHECK(cudaGetLastError());
      mma_copy_kernel<<<1, 1, 0, stream>>>((real*)relambda, mma_bufred_d, 1, i);
      CUDA_CHECK(cudaGetLastError());
      cudaStreamSynchronize(stream);
    }
    cudaFree(temp);
  }

  void cuda_sub2cons2(void* a, void* b, void* c, void* d, real* e, int* n) {

    const dim3 nthrds(1024, 1, 1);
    const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);

    sub2cons2_kernel<real><<<nblcks, nthrds, 0, (cudaStream_t)glb_cmd_queue>>>(
        (real*)a, (real*)b, (real*)c, (real*)d, *e, *n);
    CUDA_CHECK(cudaGetLastError());
  }

  // Maximum absolute value of the input array (device reduction)
  real cuda_maxval(void* a, int* n) {

    const dim3 nthrds(1024, 1, 1);
    const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
    const int nb = ((*n) + 1024 - 1) / 1024;
    const cudaStream_t stream = (cudaStream_t)glb_cmd_queue;

    if (nb > mma_red_s) {
      mma_red_s = nb;
      if (mma_bufred != NULL) {
        CUDA_CHECK(cudaFreeHost(mma_bufred));
        CUDA_CHECK(cudaFree(mma_bufred_d));
      }
      CUDA_CHECK(cudaMallocHost(&mma_bufred, nb * sizeof(real)));
      CUDA_CHECK(cudaMalloc(&mma_bufred_d, nb * sizeof(real)));
    }

    maxval_kernel<real><<<nblcks, nthrds, 0, stream>>>(
        (real*)a, mma_bufred_d, (*n));
    CUDA_CHECK(cudaGetLastError());

    max_reduce_kernel<real><<<1, 1024, 0, stream>>>(
        mma_bufred_d, nb);
    CUDA_CHECK(cudaGetLastError());

    CUDA_CHECK(cudaMemcpyAsync(mma_bufred, mma_bufred_d, sizeof(real),
                               cudaMemcpyDeviceToHost, stream));
    cudaStreamSynchronize(stream);

    return mma_bufred[0];
  }

  void cuda_delx(void* delx, void* x, void* xlow, void* xupp, void* pij,
                 void* qij, void* p0j, void* q0j, void* alpha, void* beta,
                 void* lambda, real* epsi, int* n, int* m) {

    const dim3 nthrds(1024, 1, 1);
    const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);

    delx_kernel<real><<<nblcks, nthrds, 0, (cudaStream_t)glb_cmd_queue>>>(
        (real*)delx, (real*)x, (real*)xlow, (real*)xupp, (real*)pij,
        (real*)qij, (real*)p0j, (real*)q0j, (real*)alpha, (real*)beta,
        (real*)lambda, *epsi, *n, *m);
    CUDA_CHECK(cudaGetLastError());
  }

  void cuda_GG(void* GG, void* x, void* xlow, void* xupp,
               void* pij, void* qij, int* n, int* m) {
    const dim3 nthrds(1024, 1, 1);
    const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
    GG_kernel<real><<<nblcks, nthrds, 0, (cudaStream_t)glb_cmd_queue>>>
        ((real*)GG, (real*)x, (real*)xlow, (real*)xupp,
         (real*)pij, (real*)qij, *n, *m);
    CUDA_CHECK(cudaGetLastError());
  }

  void cuda_diagx(void* diagx, void* x, void* xsi, void* xlow, void* xupp,
                  void* p0j, void* q0j, void* pij, void* qij, void* alpha,
                  void* beta, void* eta, void* lambda, int *n, int *m) {
    const dim3 nthrds(1024, 1, 1);
    const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
    diagx_kernel<real><<<nblcks, nthrds, 0, (cudaStream_t)glb_cmd_queue>>>
        ((real*)diagx, (real*)x, (real*)xsi, (real*)xlow,
         (real*)xupp, (real*)p0j, (real*)q0j, (real*)pij, (real*)qij,
         (real*)alpha, (real*)beta, (real*)eta, (real*)lambda, *n, *m);
    CUDA_CHECK(cudaGetLastError());
  }

  void cuda_bb(void* bb, void* GG, void* delx, void* diagx, int *n, int *m) {
    const dim3 nthrds(1024, 1, 1);
    const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
    const int nb = ((*n) + 1024 - 1) / 1024;
    const cudaStream_t stream = (cudaStream_t) glb_cmd_queue;
    cudaStreamSynchronize(stream);
    if (nb > mma_red_s) {
      mma_red_s = nb;
      if (mma_bufred != NULL) {
        CUDA_CHECK(cudaFreeHost(mma_bufred));
        CUDA_CHECK(cudaFree(mma_bufred_d));
      }
      CUDA_CHECK(cudaMallocHost(&mma_bufred, nb * sizeof(real)));
      CUDA_CHECK(cudaMalloc(&mma_bufred_d, nb * sizeof(real)));
    }
    for (int i = 0; i < (*m); i++) {
      mmasumbb_kernel<real><<<nblcks, nthrds, 0, stream>>>
          ((real*)GG, (real*)delx, (real*)diagx, mma_bufred_d, (*n), (*m), i);
      CUDA_CHECK(cudaGetLastError());
      mmareduce_kernel<real><<<1, 1024, 0, stream>>>(mma_bufred_d, nb);
      CUDA_CHECK(cudaGetLastError());
      mma_copy_kernel<<<1, 1, 0, stream>>>((real*)bb, mma_bufred_d, 1, i);
      CUDA_CHECK(cudaGetLastError());
      cudaStreamSynchronize(stream);
    }
  }

  void cuda_AA(void* AA, void* GG, void* diagx, int *n, int *m) {
    const dim3 nthrds(1024, 1, 1);
    const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
    const int nb = ((*n) + 1024 - 1) / 1024;
    const cudaStream_t stream = (cudaStream_t) glb_cmd_queue;
    cudaStreamSynchronize(stream);
    if (nb > mma_red_s) {
      mma_red_s = nb;
      if (mma_bufred != NULL) {
        CUDA_CHECK(cudaFreeHost(mma_bufred));
        CUDA_CHECK(cudaFree(mma_bufred_d));
      }
      CUDA_CHECK(cudaMallocHost(&mma_bufred, nb * sizeof(real)));
      CUDA_CHECK(cudaMalloc(&mma_bufred_d, nb * sizeof(real)));
    }
    for (int i = 0; i < (*m); i++) {
      for (int j = 0; j < (*m); j++) {
        mmasumAA_kernel<real><<<nblcks, nthrds, 0, stream>>>
            ((real*)GG, (real*)diagx, mma_bufred_d, (*n), (*m), i, j);
        CUDA_CHECK(cudaGetLastError());
        mmareduce_kernel<real><<<1, 1024, 0, stream>>>(mma_bufred_d, nb);
        CUDA_CHECK(cudaGetLastError());
        mma_copy_kernel<<<1, 1, 0, stream>>>((real*)AA, mma_bufred_d, 1,
                                             i + j * (*m + 1));
        CUDA_CHECK(cudaGetLastError());
        cudaStreamSynchronize(stream);
      }
    }
  }

  void cuda_dx(void* dx, void* delx, void* diagx, void* GG, void* dlambda,
               int* n, int* m) {
    const dim3 nthrds(1024, 1, 1);
    const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
    dx_kernel<real><<<nblcks, nthrds, 0, (cudaStream_t)glb_cmd_queue>>>
        ((real*)dx, (real*)delx, (real*)diagx, (real*)GG, (real*)dlambda,
         *n, *m);
    CUDA_CHECK(cudaGetLastError());
  }

  void cuda_dxsi(void* dxsi, void* xsi, void* dx, void* x,
                 void* alpha, real* epsi, int* n) {
    const dim3 nthrds(1024, 1, 1);
    const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
    dxsi_kernel<real><<<nblcks, nthrds, 0, (cudaStream_t)glb_cmd_queue>>>
        ((real*)dxsi, (real*)xsi, (real*)dx, (real*)x,
         (real*)alpha, *epsi, *n);
    CUDA_CHECK(cudaGetLastError());
  }

  void cuda_deta(void* deta, void* eta, void* dx, void* x,
                 void* beta, real* epsi, int* n) {
    const dim3 nthrds(1024, 1, 1);
    const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
    deta_kernel<real><<<nblcks, nthrds, 0, (cudaStream_t)glb_cmd_queue>>>
        ((real*)deta, (real*)eta, (real*)dx, (real*)x,
         (real*)beta, *epsi, *n);
    CUDA_CHECK(cudaGetLastError());
  }

  void cuda_rex(void* rex, void* x, void* xlow, void* xupp, void* pij,
                void* p0j, void* qij, void* q0j, void* lambda, void* xsi,
                void* eta, int* n, int* m) {
    const dim3 nthrds(1024, 1, 1);
    const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
    RexCalculation_kernel<real><<<nblcks, nthrds, 0,
        (cudaStream_t)glb_cmd_queue>>>((real*)rex, (real*)x, (real*)xlow,
        (real*)xupp, (real*)pij, (real*)p0j, (real*)qij, (real*)q0j,
        (real*)lambda, (real*)xsi, (real*)eta, *n, *m);
    CUDA_CHECK(cudaGetLastError());
  }

  void cuda_rey(void* rey, void* c, void* d, void* y, void* lambda, void* mu,
                int* n) {
    const dim3 nthrds(1024, 1, 1);
    const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
    rey_calculation_kernel<real><<<nblcks, nthrds, 0,
        (cudaStream_t)glb_cmd_queue>>>((real*)rey, (real*)c,
        (real*)d, (real*)y, (real*)lambda, (real*)mu, *n);
    CUDA_CHECK(cudaGetLastError());
  }


  void cuda_sub2cons(void* a, void* b, void* c, real* d, int* n) {
    const dim3 nthrds(1024, 1, 1);
    const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
    sub2cons_kernel<real><<<nblcks, nthrds, 0, (cudaStream_t)glb_cmd_queue>>>
        ((real*)a, (real*)b, (real*)c, *d, *n);
    CUDA_CHECK(cudaGetLastError());
  }

  real cuda_norm(void* a, int* n) {
    const dim3 nthrds(1024, 1, 1);
    const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
    const int nb = ((*n) + 1024 - 1) / 1024;
    const cudaStream_t stream = (cudaStream_t) glb_cmd_queue;
    if (nb > mma_red_s) {
      mma_red_s = nb;
      if (mma_bufred != NULL) {
        CUDA_CHECK(cudaFreeHost(mma_bufred));
        CUDA_CHECK(cudaFree(mma_bufred_d));
      }
      CUDA_CHECK(cudaMallocHost(&mma_bufred, nb * sizeof(real)));
      CUDA_CHECK(cudaMalloc(&mma_bufred_d, nb * sizeof(real)));
    }
    norm_kernel<real><<<nblcks, nthrds, 0, stream>>>
        ((real*)a, mma_bufred_d, (*n));
    CUDA_CHECK(cudaGetLastError());
    mmareduce_kernel<real><<<1, 1024, 0, stream>>>(mma_bufred_d, nb);
    CUDA_CHECK(cudaGetLastError());
    CUDA_CHECK(cudaMemcpyAsync(mma_bufred, mma_bufred_d, sizeof(real),
                               cudaMemcpyDeviceToHost, stream));
    cudaStreamSynchronize(stream);
    return mma_bufred[0];
  }

  void cuda_dely(void* dely, void* c, void* d, void* y, void* lambda,
                 real* epsi, int* n) {
    const dim3 nthrds(1024, 1, 1);
    const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
    dely_kernel<real><<<nblcks, nthrds, 0, (cudaStream_t)glb_cmd_queue>>>
        ((real*)dely, (real*)c, (real*)d, (real*)y, (real*)lambda, *epsi, *n);
    CUDA_CHECK(cudaGetLastError());
  }

  real cuda_maxval2(void* a, void* b, real* cons, int* n) {
    const dim3 nthrds(1024, 1, 1);
    const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
    const int nb = ((*n) + 1024 - 1) / 1024;
    const cudaStream_t stream = (cudaStream_t) glb_cmd_queue;
    if (nb > mma_red_s) {
      mma_red_s = nb;
      if (mma_bufred != NULL) {
        CUDA_CHECK(cudaFreeHost(mma_bufred));
        CUDA_CHECK(cudaFree(mma_bufred_d));
      }
      CUDA_CHECK(cudaMallocHost(&mma_bufred, nb * sizeof(real)));
      CUDA_CHECK(cudaMalloc(&mma_bufred_d, nb * sizeof(real)));
    }
    maxval2_kernel<real><<<nblcks, nthrds, 0, stream>>>
        ((real*)a, (real*)b, mma_bufred_d, *cons, *n);
    CUDA_CHECK(cudaGetLastError());
    max_reduce_kernel<real><<<1, 1024, 0, stream>>>(mma_bufred_d, nb);
    CUDA_CHECK(cudaGetLastError());
    CUDA_CHECK(cudaMemcpyAsync(mma_bufred, mma_bufred_d, sizeof(real),
                               cudaMemcpyDeviceToHost, stream));
    cudaStreamSynchronize(stream);
    return mma_bufred[0];
  }

  real cuda_maxval3(void* a, void* b, void* c, real* cons, int* n) {
    const dim3 nthrds(1024, 1, 1);
    const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
    const int nb = ((*n) + 1024 - 1) / 1024;
    const cudaStream_t stream = (cudaStream_t) glb_cmd_queue;
    if (nb > mma_red_s) {
      mma_red_s = nb;
      if (mma_bufred != NULL) {
        CUDA_CHECK(cudaFreeHost(mma_bufred));
        CUDA_CHECK(cudaFree(mma_bufred_d));
      }
      CUDA_CHECK(cudaMallocHost(&mma_bufred, nb * sizeof(real)));
      CUDA_CHECK(cudaMalloc(&mma_bufred_d, nb * sizeof(real)));
    }
    maxval3_kernel<real><<<nblcks, nthrds, 0, stream>>>
        ((real*)a, (real*)b, (real*)c, mma_bufred_d, *cons, *n);
    CUDA_CHECK(cudaGetLastError());
    max_reduce_kernel<real><<<1, 1024, 0, stream>>>(mma_bufred_d, nb);
    CUDA_CHECK(cudaGetLastError());
    CUDA_CHECK(cudaMemcpyAsync(mma_bufred, mma_bufred_d, sizeof(real),
                               cudaMemcpyDeviceToHost, stream));
    cudaStreamSynchronize(stream);
    return mma_bufred[0];
  }

  void cuda_kkt_rex(void* rex, void* df0dx, void* dfdx, void* xsi,
                    void* eta, void* lambda, int* n, int* m) {
    const dim3 nthrds(1024, 1, 1);
    const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
    kkt_rex_kernel<real><<<nblcks, nthrds, 0, (cudaStream_t)glb_cmd_queue>>>
        ((real*)rex, (real*)df0dx, (real*)dfdx, (real*)xsi,
         (real*)eta, (real*)lambda, *n, *m);
    CUDA_CHECK(cudaGetLastError());
  }

  void cuda_maxcons(void* a, real* b, real* c, void* d, int* n) {
    const dim3 nthrds(1024, 1, 1);
    const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
    maxcons_kernel<real><<<nblcks, nthrds, 0, (cudaStream_t)glb_cmd_queue>>>
        ((real*)a, *b, *c, (real*)d, *n);
    CUDA_CHECK(cudaGetLastError());
  }

  real cuda_lcsc2(void* a, void* b, int* n) {
    const dim3 nthrds(1024, 1, 1);
    const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
    const int nb = ((*n) + 1024 - 1) / 1024;
    const cudaStream_t stream = (cudaStream_t) glb_cmd_queue;
    if (nb > mma_red_s) {
      mma_red_s = nb;
      if (mma_bufred != NULL) {
        CUDA_CHECK(cudaFreeHost(mma_bufred));
        CUDA_CHECK(cudaFree(mma_bufred_d));
      }
      CUDA_CHECK(cudaMallocHost(&mma_bufred, nb * sizeof(real)));
      CUDA_CHECK(cudaMalloc(&mma_bufred_d, nb * sizeof(real)));
    }
    glsc2_kernel<real><<<nblcks, nthrds, 0, stream>>>
        ((real*)a, (real*)b, mma_bufred_d, (*n));
    CUDA_CHECK(cudaGetLastError());
    mmareduce_kernel<real><<<1, 1024, 0, stream>>>(mma_bufred_d, nb);
    CUDA_CHECK(cudaGetLastError());
    CUDA_CHECK(cudaMemcpyAsync(mma_bufred, mma_bufred_d, sizeof(real),
                               cudaMemcpyDeviceToHost, stream));
    cudaStreamSynchronize(stream);
    return mma_bufred[0];
  }

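  /*
   * Element-wise sum across MPI ranks of the first n entries of a device
   * array, in place. Only active when built with device-aware MPI
   * (HAVE_DEVICE_MPI); otherwise the call is a no-op.
   */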
  void cuda_mpisum(void *a, int *n) {
#ifdef HAVE_DEVICE_MPI
    const cudaStream_t stream = (cudaStream_t) glb_cmd_queue;
    real* temp = (real*)a;
    cudaStreamSynchronize(stream);
    device_mpi_allreduce_inplace(temp, *n, sizeof(real), DEVICE_MPI_SUM);
#endif
  }

  void cuda_add2inv2(void* a, void* b, real* c, int* n) {
    const dim3 nthrds(1024, 1, 1);
    const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
    add2inv2_kernel<real><<<nblcks, nthrds, 0, (cudaStream_t)glb_cmd_queue>>>
        ((real*)a, (real*)b, *c, *n);
    CUDA_CHECK(cudaGetLastError());
  }

  void cuda_max2(void* a, real* b, void* c, real* d, int* n) {
    const dim3 nthrds(1024, 1, 1);
    const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
    max2_kernel<real><<<nblcks, nthrds, 0, (cudaStream_t)glb_cmd_queue>>>
        ((real*)a, *b, (real*)c, *d, *n);
    CUDA_CHECK(cudaGetLastError());
  }

  void cuda_updatebb(void* bb, void* dellambda, void* dely, void* d,
                     void* mu, void* y, real* delz, int *m) {
    const dim3 nthrds(1024, 1, 1);
    const dim3 nblcks(((*m + 1) + 1024 - 1) / 1024, 1, 1);
    updatebb_kernel<real><<<nblcks, nthrds, 0, (cudaStream_t)glb_cmd_queue>>>
        ((real*)bb, (real*)dellambda, (real*)dely, (real*)d,
         (real*)mu, (real*)y, *delz, *m);
    CUDA_CHECK(cudaGetLastError());
  }

  void cuda_updateAA(void* AA, void* globaltmp_mm, void* s, void* lambda,
                     void* d, void* mu, void* y, void* a, real* zeta,
                     real* z, int* m) {
    const dim3 nthrds(1024, 1, 1);
    const dim3 nblcks(((*m + 1) + 1024 - 1) / 1024, 1, 1);
    updateAA_kernel<real><<<nblcks, nthrds, 0, (cudaStream_t)glb_cmd_queue>>>
        ((real*)AA, (real*)globaltmp_mm, (real*)s, (real*)lambda, (real*)d,
         (real*)mu, (real*)y, (real*)a, *zeta, *z, *m);
    CUDA_CHECK(cudaGetLastError());
  }

  void cuda_dy(void* dy, void* dely, void* dlambda, void* d, void* mu,
               void* y, int* n) {
    const dim3 nthrds(1024, 1, 1);
    const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
    dy_kernel<real><<<nblcks, nthrds, 0, (cudaStream_t)glb_cmd_queue>>>
        ((real*)dy, (real*)dely, (real*)dlambda, (real*)d,
         (real*)mu, (real*)y, *n);
    CUDA_CHECK(cudaGetLastError());
  }

}/* extern "C" */