#include <cuda_runtime.h>

#include <neko/device/device_config.h>
#include <neko/device/cuda/check.h>
#include <neko/math/bcknd/device/device_mpi_reduce.h>
#include <neko/math/bcknd/device/device_mpi_op.h>

#include "mma_kernel.h"
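/*
 * C wrappers around the CUDA kernels of the MMA (Method of Moving Asymptotes)
 * backend. Each wrapper launches the corresponding kernel from mma_kernel.h
 * on the global command queue (glb_cmd_queue); arguments are passed as
 * pointers since the wrappers are intended to be called from Fortran.
 */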
extern "C" {
  real * mma_bufred = NULL;
  real * mma_bufred_d = NULL;
  int mma_red_s = 0;
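  /*
   * mma_bufred / mma_bufred_d are pinned-host and device scratch buffers
   * shared by all reduction wrappers below; mma_red_s tracks their current
   * capacity in blocks, and the buffers are reallocated whenever a call
   * needs more blocks. Every reduction follows the same two-stage pattern:
   * a per-block partial-sum kernel fills mma_bufred_d, a single-block
   * reduce kernel collapses the partials, and the result is either copied
   * into the destination array on the device or copied back to the host.
   */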
  void cuda_Hess(void* Hess, void* hijx, void* Ljjxinv, int* n, int* m) {
    const dim3 nthrds(1024, 1, 1);
    const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
    const int nb = ((*n) + 1024 - 1) / 1024;
    const cudaStream_t stream = (cudaStream_t) glb_cmd_queue;
    cudaStreamSynchronize(stream);

    /* Grow the shared reduction buffers if this call needs more blocks */
    if (nb > mma_red_s) {
      mma_red_s = nb;
      if (mma_bufred != NULL) {
        CUDA_CHECK(cudaFreeHost(mma_bufred));
        CUDA_CHECK(cudaFree(mma_bufred_d));
      }
      CUDA_CHECK(cudaMallocHost(&mma_bufred, nb * sizeof(real)));
      CUDA_CHECK(cudaMalloc(&mma_bufred_d, nb * sizeof(real)));
    }

    /* Assemble the m x m Hessian entry by entry via device reductions */
    for (int i = 0; i < (*m); i++) {
      for (int j = 0; j < (*m); j++) {
        mmasumHess_kernel<real><<<nblcks, nthrds, 0, stream>>>
          ((real*)hijx, (real*)Ljjxinv, mma_bufred_d, (*n), (*m), i, j);
        CUDA_CHECK(cudaGetLastError());
        mmareduce_kernel<real><<<1, 1024, 0, stream>>>(mma_bufred_d, nb);
        CUDA_CHECK(cudaGetLastError());
        /* A 0-based column-major offset into Hess is assumed here; the sum
           is symmetric in i and j, so the ordering does not change values */
        mma_copy_kernel<<<1, 1, 0, stream>>>((real*)Hess, mma_bufred_d, 1,
                                             i + j * (*m));
        CUDA_CHECK(cudaGetLastError());
      }
    }
    cudaStreamSynchronize(stream);
  }
  void mma_Ljjxinv_cuda(void* Ljjxinv, void* pjlambda, void* qjlambda,
                        void* x, void* low, void* upp, void* alpha,
                        void* beta, int* n) {
    const dim3 nthrds(1024, 1, 1);
    const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);

    mma_Ljjxinv_kernel<real><<<nblcks, nthrds, 0, (cudaStream_t)glb_cmd_queue>>>
      ((real*)Ljjxinv, (real*)pjlambda, (real*)qjlambda, (real*)x, (real*)low,
       (real*)upp, (real*)alpha, (real*)beta, *n);
    CUDA_CHECK(cudaGetLastError());
  }
  void mma_dipsolvesub1_cuda(void* x, void* pjlambda, void* qjlambda,
                             void* low, void* upp, void* alpha, void* beta,
                             int* n) {
    const dim3 nthrds(1024, 1, 1);
    const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);

    mma_dipsolvesub1_kernel<real><<<nblcks, nthrds, 0,
      (cudaStream_t)glb_cmd_queue>>>
      ((real*)x, (real*)pjlambda, (real*)qjlambda, (real*)low, (real*)upp,
       (real*)alpha, (real*)beta, *n);
    CUDA_CHECK(cudaGetLastError());
  }
  void mattrans_v_mul_cuda(void* output, void* pij, void* lambda,
                           int* m, int* n) {
    const dim3 nthrds(1024, 1, 1);
    const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);

    mattrans_v_mul_kernel<real><<<nblcks, nthrds, 0,
      (cudaStream_t)glb_cmd_queue>>>
      ((real*)output, (real*)pij, (real*)lambda, *m, *n);
    CUDA_CHECK(cudaGetLastError());
  }
  void mma_gensub4_cuda(void* x, void* low, void* upp, void* pij, void* qij,
                        int* n, int* m, void* bi) {
    const dim3 nthrds(1024, 1, 1);
    const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
    const int nb = ((*n) + 1024 - 1) / 1024;
    const cudaStream_t stream = (cudaStream_t)glb_cmd_queue;

    if (nb > mma_red_s) {
      mma_red_s = nb;
      if (mma_bufred != NULL) {
        CUDA_CHECK(cudaFreeHost(mma_bufred));
        CUDA_CHECK(cudaFree(mma_bufred_d));
      }
      CUDA_CHECK(cudaMallocHost(&mma_bufred, nb * sizeof(real)));
      CUDA_CHECK(cudaMalloc(&mma_bufred_d, nb * sizeof(real)));
    }

    real* temp;
    real* bi_d = (real*)bi;
    CUDA_CHECK(cudaMalloc(&temp, (*m) * (*n) * sizeof(real)));

    mma_sub4_kernel<real><<<nblcks, nthrds, 0, stream>>>(
      (real*)x, (real*)low, (real*)upp, (real*)pij, (real*)qij,
      temp, *n, *m);
    CUDA_CHECK(cudaGetLastError());

    /* Sum the n contributions of each of the m constraints into bi */
    for (int i = 0; i < (*m); i++) {
      mmasum_kernel<real><<<nblcks, nthrds, 0, stream>>>(
        temp, mma_bufred_d, (*n), (*m), i);
      CUDA_CHECK(cudaGetLastError());
      mmareduce_kernel<real><<<1, 1024, 0, stream>>>(mma_bufred_d, nb);
      CUDA_CHECK(cudaGetLastError());
      CUDA_CHECK(cudaMemcpyAsync(bi_d + i, mma_bufred_d, sizeof(real),
                                 cudaMemcpyDeviceToDevice, stream));
    }
    cudaStreamSynchronize(stream);
    CUDA_CHECK(cudaFree(temp));
  }
  void mma_gensub3_cuda(void* x, void* df0dx, void* dfdx, void* low,
                        void* upp, void* xmin, void* xmax, void* alpha,
                        void* beta, void* p0j, void* q0j, void* pij,
                        void* qij, int* n, int* m) {
    const dim3 nthrds(1024, 1, 1);
    const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);

    mma_sub3_kernel<real><<<nblcks, nthrds, 0,
      (cudaStream_t)glb_cmd_queue>>>(
      (real*)x, (real*)df0dx, (real*)dfdx, (real*)low,
      (real*)upp, (real*)xmin, (real*)xmax, (real*)alpha,
      (real*)beta, (real*)p0j, (real*)q0j, (real*)pij,
      (real*)qij, *n, *m);
    CUDA_CHECK(cudaGetLastError());
  }
  void mma_gensub2_cuda(void* low, void* upp, void* x, void* xold1,
                        void* xold2, void* xdiff, real* asydecr,
                        real* asyincr, int* n) {
    const dim3 nthrds(1024, 1, 1);
    const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);

    mma_sub2_kernel<real><<<nblcks, nthrds, 0,
      (cudaStream_t)glb_cmd_queue>>>(
      (real*)low, (real*)upp, (real*)x, (real*)xold1,
      (real*)xold2, (real*)xdiff, *asydecr, *asyincr, *n);
    CUDA_CHECK(cudaGetLastError());
  }
  void mma_gensub1_cuda(void* low, void* upp, void* x, void* xmin,
                        void* xmax, real* asyinit, int* n) {
    const dim3 nthrds(1024, 1, 1);
    const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);

    mma_sub1_kernel<real><<<nblcks, nthrds, 0, (cudaStream_t)glb_cmd_queue>>>
      ((real*)low, (real*)upp, (real*)x, (real*)xmin, (real*)xmax,
       *asyinit, *n);
    CUDA_CHECK(cudaGetLastError());
  }
  void cuda_mma_max(void* xsi, void* x, void* alpha, int* n) {
    const dim3 nthrds(1024, 1, 1);
    const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);

    mma_max2_kernel<real><<<nblcks, nthrds, 0, (cudaStream_t)glb_cmd_queue>>>
      ((real*)xsi, (real*)x, (real*)alpha, *n);
    CUDA_CHECK(cudaGetLastError());
  }
  void cuda_relambda(void* relambda, void* x, void* xupp, void* xlow,
                     void* pij, void* qij, int* n, int* m) {
    const dim3 nthrds(1024, 1, 1);
    const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
    const int nb = ((*n) + 1024 - 1) / 1024;
    const cudaStream_t stream = (cudaStream_t) glb_cmd_queue;

    if (nb > mma_red_s) {
      mma_red_s = nb;
      if (mma_bufred != NULL) {
        CUDA_CHECK(cudaFreeHost(mma_bufred));
        CUDA_CHECK(cudaFree(mma_bufred_d));
      }
      CUDA_CHECK(cudaMallocHost(&mma_bufred, nb * sizeof(real)));
      CUDA_CHECK(cudaMalloc(&mma_bufred_d, nb * sizeof(real)));
    }

    real* temp;
    CUDA_CHECK(cudaMalloc(&temp, (*n) * (*m) * sizeof(real)));

    relambda_kernel<real><<<nblcks, nthrds, 0, stream>>>(temp, (real*)x,
      (real*)xupp, (real*)xlow, (real*)pij, (real*)qij, *n, *m);
    CUDA_CHECK(cudaGetLastError());

    /* Reduce the n contributions of each constraint into relambda(i) */
    for (int i = 0; i < (*m); i++) {
      mmasum_kernel<real><<<nblcks, nthrds, 0, stream>>>
        (temp, mma_bufred_d, (*n), (*m), i);
      CUDA_CHECK(cudaGetLastError());
      mmareduce_kernel<real><<<1, 1024, 0, stream>>>(mma_bufred_d, nb);
      CUDA_CHECK(cudaGetLastError());
      mma_copy_kernel<<<1, 1, 0, stream>>>((real*)relambda, mma_bufred_d, 1, i);
      CUDA_CHECK(cudaGetLastError());
    }
    cudaStreamSynchronize(stream);
    CUDA_CHECK(cudaFree(temp));
  }
  void cuda_sub2cons2(void* a, void* b, void* c, void* d, real* e, int* n) {
    const dim3 nthrds(1024, 1, 1);
    const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);

    sub2cons2_kernel<real><<<nblcks, nthrds, 0, (cudaStream_t)glb_cmd_queue>>>(
      (real*)a, (real*)b, (real*)c, (real*)d, *e, *n);
    CUDA_CHECK(cudaGetLastError());
  }
  real cuda_maxval(void* a, int* n) {
    const dim3 nthrds(1024, 1, 1);
    const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
    const int nb = ((*n) + 1024 - 1) / 1024;
    const cudaStream_t stream = (cudaStream_t)glb_cmd_queue;

    if (nb > mma_red_s) {
      mma_red_s = nb;
      if (mma_bufred != NULL) {
        CUDA_CHECK(cudaFreeHost(mma_bufred));
        CUDA_CHECK(cudaFree(mma_bufred_d));
      }
      CUDA_CHECK(cudaMallocHost(&mma_bufred, nb * sizeof(real)));
      CUDA_CHECK(cudaMalloc(&mma_bufred_d, nb * sizeof(real)));
    }

    maxval_kernel<real><<<nblcks, nthrds, 0, stream>>>(
      (real*)a, mma_bufred_d, (*n));
    CUDA_CHECK(cudaGetLastError());
    max_reduce_kernel<real><<<1, 1024, 0, stream>>>(mma_bufred_d, nb);
    CUDA_CHECK(cudaGetLastError());

    /* Copy the final maximum back to the pinned host buffer */
    CUDA_CHECK(cudaMemcpyAsync(mma_bufred, mma_bufred_d, sizeof(real),
                               cudaMemcpyDeviceToHost, stream));
    cudaStreamSynchronize(stream);
    return mma_bufred[0];
  }
  void cuda_delx(void* delx, void* x, void* xlow, void* xupp, void* pij,
                 void* qij, void* p0j, void* q0j, void* alpha, void* beta,
                 void* lambda, real* epsi, int* n, int* m) {
    const dim3 nthrds(1024, 1, 1);
    const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);

    delx_kernel<real><<<nblcks, nthrds, 0, (cudaStream_t)glb_cmd_queue>>>(
      (real*)delx, (real*)x, (real*)xlow, (real*)xupp, (real*)pij,
      (real*)qij, (real*)p0j, (real*)q0j, (real*)alpha, (real*)beta,
      (real*)lambda, *epsi, *n, *m);
    CUDA_CHECK(cudaGetLastError());
  }
  void cuda_GG(void* GG, void* x, void* xlow, void* xupp, void* pij,
               void* qij, int* n, int* m) {
    const dim3 nthrds(1024, 1, 1);
    const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);

    GG_kernel<real><<<nblcks, nthrds, 0, (cudaStream_t)glb_cmd_queue>>>
      ((real*)GG, (real*)x, (real*)xlow, (real*)xupp,
       (real*)pij, (real*)qij, *n, *m);
    CUDA_CHECK(cudaGetLastError());
  }
  void cuda_diagx(void* diagx, void* x, void* xsi, void* xlow, void* xupp,
                  void* p0j, void* q0j, void* pij, void* qij, void* alpha,
                  void* beta, void* eta, void* lambda, int* n, int* m) {
    const dim3 nthrds(1024, 1, 1);
    const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);

    diagx_kernel<real><<<nblcks, nthrds, 0, (cudaStream_t)glb_cmd_queue>>>
      ((real*)diagx, (real*)x, (real*)xsi, (real*)xlow,
       (real*)xupp, (real*)p0j, (real*)q0j, (real*)pij, (real*)qij,
       (real*)alpha, (real*)beta, (real*)eta, (real*)lambda, *n, *m);
    CUDA_CHECK(cudaGetLastError());
  }
  void cuda_bb(void* bb, void* GG, void* delx, void* diagx, int* n, int* m) {
    const dim3 nthrds(1024, 1, 1);
    const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
    const int nb = ((*n) + 1024 - 1) / 1024;
    const cudaStream_t stream = (cudaStream_t) glb_cmd_queue;
    cudaStreamSynchronize(stream);

    if (nb > mma_red_s) {
      mma_red_s = nb;
      if (mma_bufred != NULL) {
        CUDA_CHECK(cudaFreeHost(mma_bufred));
        CUDA_CHECK(cudaFree(mma_bufred_d));
      }
      CUDA_CHECK(cudaMallocHost(&mma_bufred, nb * sizeof(real)));
      CUDA_CHECK(cudaMalloc(&mma_bufred_d, nb * sizeof(real)));
    }

    /* One reduction per constraint to build the right-hand side bb */
    for (int i = 0; i < (*m); i++) {
      mmasumbb_kernel<real><<<nblcks, nthrds, 0, stream>>>
        ((real*)GG, (real*)delx, (real*)diagx, mma_bufred_d, (*n), (*m), i);
      CUDA_CHECK(cudaGetLastError());
      mmareduce_kernel<real><<<1, 1024, 0, stream>>>(mma_bufred_d, nb);
      CUDA_CHECK(cudaGetLastError());
      mma_copy_kernel<<<1, 1, 0, stream>>>((real*)bb, mma_bufred_d, 1, i);
      CUDA_CHECK(cudaGetLastError());
    }
    cudaStreamSynchronize(stream);
  }
  void cuda_AA(void* AA, void* GG, void* diagx, int* n, int* m) {
    const dim3 nthrds(1024, 1, 1);
    const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
    const int nb = ((*n) + 1024 - 1) / 1024;
    const cudaStream_t stream = (cudaStream_t) glb_cmd_queue;
    cudaStreamSynchronize(stream);

    if (nb > mma_red_s) {
      mma_red_s = nb;
      if (mma_bufred != NULL) {
        CUDA_CHECK(cudaFreeHost(mma_bufred));
        CUDA_CHECK(cudaFree(mma_bufred_d));
      }
      CUDA_CHECK(cudaMallocHost(&mma_bufred, nb * sizeof(real)));
      CUDA_CHECK(cudaMalloc(&mma_bufred_d, nb * sizeof(real)));
    }

    /* Assemble the m x m block of AA entry by entry via device reductions */
    for (int i = 0; i < (*m); i++) {
      for (int j = 0; j < (*m); j++) {
        mmasumAA_kernel<real><<<nblcks, nthrds, 0, stream>>>
          ((real*)GG, (real*)diagx, mma_bufred_d, (*n), (*m), i, j);
        CUDA_CHECK(cudaGetLastError());
        mmareduce_kernel<real><<<1, 1024, 0, stream>>>(mma_bufred_d, nb);
        CUDA_CHECK(cudaGetLastError());
        /* The destination offset is assumed to be a 0-based column-major
           index of entry (i,j); the summed quantity is symmetric in i and
           j, so the loop ordering does not change the values */
        mma_copy_kernel<<<1, 1, 0, stream>>>((real*)AA, mma_bufred_d, 1,
                                             i + j * (*m));
        CUDA_CHECK(cudaGetLastError());
      }
    }
    cudaStreamSynchronize(stream);
  }
  void cuda_dx(void* dx, void* delx, void* diagx, void* GG, void* dlambda,
               int* n, int* m) {
    const dim3 nthrds(1024, 1, 1);
    const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);

    dx_kernel<real><<<nblcks, nthrds, 0, (cudaStream_t)glb_cmd_queue>>>
      ((real*)dx, (real*)delx, (real*)diagx, (real*)GG, (real*)dlambda,
       *n, *m);
    CUDA_CHECK(cudaGetLastError());
  }
  void cuda_dxsi(void* dxsi, void* xsi, void* dx, void* x, void* alpha,
                 real* epsi, int* n) {
    const dim3 nthrds(1024, 1, 1);
    const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);

    dxsi_kernel<real><<<nblcks, nthrds, 0, (cudaStream_t)glb_cmd_queue>>>
      ((real*)dxsi, (real*)xsi, (real*)dx, (real*)x,
       (real*)alpha, *epsi, *n);
    CUDA_CHECK(cudaGetLastError());
  }
  void cuda_deta(void* deta, void* eta, void* dx, void* x, void* beta,
                 real* epsi, int* n) {
    const dim3 nthrds(1024, 1, 1);
    const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);

    deta_kernel<real><<<nblcks, nthrds, 0, (cudaStream_t)glb_cmd_queue>>>
      ((real*)deta, (real*)eta, (real*)dx, (real*)x,
       (real*)beta, *epsi, *n);
    CUDA_CHECK(cudaGetLastError());
  }
  void cuda_rex(void* rex, void* x, void* xlow, void* xupp, void* pij,
                void* p0j, void* qij, void* q0j, void* lambda, void* xsi,
                void* eta, int* n, int* m) {
    const dim3 nthrds(1024, 1, 1);
    const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);

    RexCalculation_kernel<real><<<nblcks, nthrds, 0,
      (cudaStream_t)glb_cmd_queue>>>
      ((real*)rex, (real*)x, (real*)xlow,
       (real*)xupp, (real*)pij, (real*)p0j, (real*)qij, (real*)q0j,
       (real*)lambda, (real*)xsi, (real*)eta, *n, *m);
    CUDA_CHECK(cudaGetLastError());
  }
  void cuda_rey(void* rey, void* c, void* d, void* y, void* lambda,
                void* mu, int* n) {
    const dim3 nthrds(1024, 1, 1);
    const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);

    rey_calculation_kernel<real><<<nblcks, nthrds, 0,
      (cudaStream_t)glb_cmd_queue>>>
      ((real*)rey, (real*)c, (real*)d, (real*)y, (real*)lambda, (real*)mu, *n);
    CUDA_CHECK(cudaGetLastError());
  }
  void cuda_sub2cons(void* a, void* b, void* c, real* d, int* n) {
    const dim3 nthrds(1024, 1, 1);
    const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);

    sub2cons_kernel<real><<<nblcks, nthrds, 0, (cudaStream_t)glb_cmd_queue>>>
      ((real*)a, (real*)b, (real*)c, *d, *n);
    CUDA_CHECK(cudaGetLastError());
  }
  real cuda_norm(void* a, int* n) {
    const dim3 nthrds(1024, 1, 1);
    const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
    const int nb = ((*n) + 1024 - 1) / 1024;
    const cudaStream_t stream = (cudaStream_t) glb_cmd_queue;

    if (nb > mma_red_s) {
      mma_red_s = nb;
      if (mma_bufred != NULL) {
        CUDA_CHECK(cudaFreeHost(mma_bufred));
        CUDA_CHECK(cudaFree(mma_bufred_d));
      }
      CUDA_CHECK(cudaMallocHost(&mma_bufred, nb * sizeof(real)));
      CUDA_CHECK(cudaMalloc(&mma_bufred_d, nb * sizeof(real)));
    }

    norm_kernel<real><<<nblcks, nthrds, 0, stream>>>
      ((real*)a, mma_bufred_d, (*n));
    CUDA_CHECK(cudaGetLastError());
    mmareduce_kernel<real><<<1, 1024, 0, stream>>>(mma_bufred_d, nb);
    CUDA_CHECK(cudaGetLastError());
    CUDA_CHECK(cudaMemcpyAsync(mma_bufred, mma_bufred_d, sizeof(real),
                               cudaMemcpyDeviceToHost, stream));
    cudaStreamSynchronize(stream);
    return mma_bufred[0];
  }
  void cuda_dely(void* dely, void* c, void* d, void* y, void* lambda,
                 real* epsi, int* n) {
    const dim3 nthrds(1024, 1, 1);
    const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);

    dely_kernel<real><<<nblcks, nthrds, 0, (cudaStream_t)glb_cmd_queue>>>
      ((real*)dely, (real*)c, (real*)d, (real*)y, (real*)lambda, *epsi, *n);
    CUDA_CHECK(cudaGetLastError());
  }
  real cuda_maxval2(void* a, void* b, real* cons, int* n) {
    const dim3 nthrds(1024, 1, 1);
    const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
    const int nb = ((*n) + 1024 - 1) / 1024;
    const cudaStream_t stream = (cudaStream_t) glb_cmd_queue;

    if (nb > mma_red_s) {
      mma_red_s = nb;
      if (mma_bufred != NULL) {
        CUDA_CHECK(cudaFreeHost(mma_bufred));
        CUDA_CHECK(cudaFree(mma_bufred_d));
      }
      CUDA_CHECK(cudaMallocHost(&mma_bufred, nb * sizeof(real)));
      CUDA_CHECK(cudaMalloc(&mma_bufred_d, nb * sizeof(real)));
    }

    maxval2_kernel<real><<<nblcks, nthrds, 0, stream>>>
      ((real*)a, (real*)b, mma_bufred_d, *cons, *n);
    CUDA_CHECK(cudaGetLastError());
    max_reduce_kernel<real><<<1, 1024, 0, stream>>>(mma_bufred_d, nb);
    CUDA_CHECK(cudaGetLastError());
    CUDA_CHECK(cudaMemcpyAsync(mma_bufred, mma_bufred_d, sizeof(real),
                               cudaMemcpyDeviceToHost, stream));
    cudaStreamSynchronize(stream);
    return mma_bufred[0];
  }
  real cuda_maxval3(void* a, void* b, void* c, real* cons, int* n) {
    const dim3 nthrds(1024, 1, 1);
    const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
    const int nb = ((*n) + 1024 - 1) / 1024;
    const cudaStream_t stream = (cudaStream_t) glb_cmd_queue;

    if (nb > mma_red_s) {
      mma_red_s = nb;
      if (mma_bufred != NULL) {
        CUDA_CHECK(cudaFreeHost(mma_bufred));
        CUDA_CHECK(cudaFree(mma_bufred_d));
      }
      CUDA_CHECK(cudaMallocHost(&mma_bufred, nb * sizeof(real)));
      CUDA_CHECK(cudaMalloc(&mma_bufred_d, nb * sizeof(real)));
    }

    maxval3_kernel<real><<<nblcks, nthrds, 0, stream>>>
      ((real*)a, (real*)b, (real*)c, mma_bufred_d, *cons, *n);
    CUDA_CHECK(cudaGetLastError());
    max_reduce_kernel<real><<<1, 1024, 0, stream>>>(mma_bufred_d, nb);
    CUDA_CHECK(cudaGetLastError());
    CUDA_CHECK(cudaMemcpyAsync(mma_bufred, mma_bufred_d, sizeof(real),
                               cudaMemcpyDeviceToHost, stream));
    cudaStreamSynchronize(stream);
    return mma_bufred[0];
  }
  void cuda_kkt_rex(void* rex, void* df0dx, void* dfdx, void* xsi,
                    void* eta, void* lambda, int* n, int* m) {
    const dim3 nthrds(1024, 1, 1);
    const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);

    kkt_rex_kernel<real><<<nblcks, nthrds, 0, (cudaStream_t)glb_cmd_queue>>>
      ((real*)rex, (real*)df0dx, (real*)dfdx, (real*)xsi,
       (real*)eta, (real*)lambda, *n, *m);
    CUDA_CHECK(cudaGetLastError());
  }
  void cuda_maxcons(void* a, real* b, real* c, void* d, int* n) {
    const dim3 nthrds(1024, 1, 1);
    const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);

    maxcons_kernel<real><<<nblcks, nthrds, 0, (cudaStream_t)glb_cmd_queue>>>
      ((real*)a, *b, *c, (real*)d, *n);
    CUDA_CHECK(cudaGetLastError());
  }
  real cuda_lcsc2(void* a, void* b, int* n) {
    const dim3 nthrds(1024, 1, 1);
    const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
    const int nb = ((*n) + 1024 - 1) / 1024;
    const cudaStream_t stream = (cudaStream_t) glb_cmd_queue;

    if (nb > mma_red_s) {
      mma_red_s = nb;
      if (mma_bufred != NULL) {
        CUDA_CHECK(cudaFreeHost(mma_bufred));
        CUDA_CHECK(cudaFree(mma_bufred_d));
      }
      CUDA_CHECK(cudaMallocHost(&mma_bufred, nb * sizeof(real)));
      CUDA_CHECK(cudaMalloc(&mma_bufred_d, nb * sizeof(real)));
    }

    /* Local dot product of a and b */
    glsc2_kernel<real><<<nblcks, nthrds, 0, stream>>>
      ((real*)a, (real*)b, mma_bufred_d, (*n));
    CUDA_CHECK(cudaGetLastError());
    mmareduce_kernel<real><<<1, 1024, 0, stream>>>(mma_bufred_d, nb);
    CUDA_CHECK(cudaGetLastError());
    CUDA_CHECK(cudaMemcpyAsync(mma_bufred, mma_bufred_d, sizeof(real),
                               cudaMemcpyDeviceToHost, stream));
    cudaStreamSynchronize(stream);
    return mma_bufred[0];
  }
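  /* The reductions above operate on the local (per-rank) data only;
     cuda_mpisum below accumulates such partial results across MPI ranks
     when the device MPI backend (HAVE_DEVICE_MPI) is enabled. */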
  void cuda_mpisum(void* a, int* n) {
#ifdef HAVE_DEVICE_MPI
    real* temp = (real*)a;
    const cudaStream_t stream = (cudaStream_t) glb_cmd_queue;
    cudaStreamSynchronize(stream);
    device_mpi_allreduce_inplace(temp, *n, sizeof(real), DEVICE_MPI_SUM);
#endif
  }
  void cuda_add2inv2(void* a, void* b, real* c, int* n) {
    const dim3 nthrds(1024, 1, 1);
    const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);

    add2inv2_kernel<real><<<nblcks, nthrds, 0, (cudaStream_t)glb_cmd_queue>>>
      ((real*)a, (real*)b, *c, *n);
    CUDA_CHECK(cudaGetLastError());
  }
  void cuda_max2(void* a, real* b, void* c, real* d, int* n) {
    const dim3 nthrds(1024, 1, 1);
    const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);

    max2_kernel<real><<<nblcks, nthrds, 0, (cudaStream_t)glb_cmd_queue>>>
      ((real*)a, *b, (real*)c, *d, *n);
    CUDA_CHECK(cudaGetLastError());
  }
  void cuda_updatebb(void* bb, void* dellambda, void* dely, void* d,
                     void* mu, void* y, real* delz, int* m) {
    const dim3 nthrds(1024, 1, 1);
    const dim3 nblcks(((*m + 1) + 1024 - 1) / 1024, 1, 1);

    updatebb_kernel<real><<<nblcks, nthrds, 0, (cudaStream_t)glb_cmd_queue>>>
      ((real*)bb, (real*)dellambda, (real*)dely, (real*)d,
       (real*)mu, (real*)y, *delz, *m);
    CUDA_CHECK(cudaGetLastError());
  }
  void cuda_updateAA(void* AA, void* globaltmp_mm, void* s, void* lambda,
                     void* d, void* mu, void* y, void* a, real* zeta,
                     real* z, int* m) {
    const dim3 nthrds(1024, 1, 1);
    const dim3 nblcks(((*m + 1) + 1024 - 1) / 1024, 1, 1);

    updateAA_kernel<real><<<nblcks, nthrds, 0, (cudaStream_t)glb_cmd_queue>>>
      ((real*)AA, (real*)globaltmp_mm, (real*)s, (real*)lambda, (real*)d,
       (real*)mu, (real*)y, (real*)a, *zeta, *z, *m);
    CUDA_CHECK(cudaGetLastError());
  }
  void cuda_dy(void* dy, void* dely, void* dlambda, void* d, void* mu,
               void* y, int* n) {
    const dim3 nthrds(1024, 1, 1);
    const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);

    dy_kernel<real><<<nblcks, nthrds, 0, (cudaStream_t)glb_cmd_queue>>>
      ((real*)dy, (real*)dely, (real*)dlambda, (real*)d,
       (real*)mu, (real*)y, *n);
    CUDA_CHECK(cudaGetLastError());
  }
} /* extern "C" */