#include <cuda_runtime.h>
#include <neko/device/device_config.h>
#include <neko/device/cuda/check.h>
#include <neko/math/bcknd/device/device_mpi_reduce.h>
#include <neko/math/bcknd/device/device_mpi_op.h>
#include "mma_kernel.h"

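/* Persistent scratch buffers for the block-wise reductions used below: a
 * pinned host buffer for copying scalar results back to the host and a
 * device buffer holding the per-block partial results. They are grown on
 * demand and reused across calls. */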
real * mma_bufred = NULL;
real * mma_bufred_d = NULL;
int mma_red_s = 0; /* current capacity (in blocks) of the reduction buffers */

void delta_1dbeam_cuda(void* Delta, real* L_total, real* Le, int* offset,
                       int* n) {
  const dim3 nthrds(1024, 1, 1);
  const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
  delta_1dbeam_kernel<real><<<nblcks, nthrds, 0, (cudaStream_t)glb_cmd_queue>>>
    ((real*)Delta, *L_total, *Le, *offset, *n);
  CUDA_CHECK(cudaGetLastError());
}

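/* Assemble the m x m Hessian approximation: for each entry (i, j) the sum
 * over the n design variables is formed with a two-stage reduction
 * (per-block partial sums, then a single-block reduction) and the scalar
 * result is copied into Hess on the device. */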
void cuda_Hess(void* Hess, void* hijx, void* Ljjxinv, int *n, int *m) {
  const dim3 nthrds(1024, 1, 1);
  const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
  const int nb = ((*n) + 1024 - 1) / 1024;
  const cudaStream_t stream = (cudaStream_t) glb_cmd_queue;
  cudaStreamSynchronize(stream);

  if (nb > mma_red_s) {
    mma_red_s = nb;
    if (mma_bufred != NULL) {
      CUDA_CHECK(cudaFreeHost(mma_bufred));
      CUDA_CHECK(cudaFree(mma_bufred_d));
    }
    CUDA_CHECK(cudaMallocHost(&mma_bufred, nb * sizeof(real)));
    CUDA_CHECK(cudaMalloc(&mma_bufred_d, nb * sizeof(real)));
  }

  for (int i = 0; i < (*m); i++) {
    for (int j = 0; j < (*m); j++) {
      mmasumHess_kernel<real><<<nblcks, nthrds, 0, stream>>>
        ((real*)hijx, (real*)Ljjxinv, mma_bufred_d, (*n), (*m), i, j);
      CUDA_CHECK(cudaGetLastError());
      mmareduce_kernel<real><<<1, 1024, 0, stream>>>(mma_bufred_d, nb);
      CUDA_CHECK(cudaGetLastError());
      /* Destination offset i * (*m) + j for entry (i, j) is an assumption;
         the original offset expression was lost. */
      mma_copy_kernel<<<1, 1, 0, stream>>>((real*)Hess, mma_bufred_d, 1,
                                           i * (*m) + j);
      CUDA_CHECK(cudaGetLastError());
      cudaStreamSynchronize(stream);
    }
  }
}

void mma_Ljjxinv_cuda(void* Ljjxinv, void* pjlambda, void* qjlambda, void* x,
                      void* low, void* upp, void* alpha, void* beta, int* n) {
  const dim3 nthrds(1024, 1, 1);
  const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
  mma_Ljjxinv_kernel<real><<<nblcks, nthrds, 0, (cudaStream_t)glb_cmd_queue>>>
    ((real*)Ljjxinv, (real*)pjlambda, (real*)qjlambda, (real*)x, (real*)low,
     (real*)upp, (real*)alpha, (real*)beta, *n);
  CUDA_CHECK(cudaGetLastError());
}

void mma_dipsolvesub1_cuda(void* x, void* pjlambda, void* qjlambda, void* low,
                           void* upp, void* alpha, void* beta, int* n) {
  const dim3 nthrds(1024, 1, 1);
  const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
  mma_dipsolvesub1_kernel<real><<<nblcks, nthrds, 0,
                                  (cudaStream_t)glb_cmd_queue>>>
    ((real*)x, (real*)pjlambda, (real*)qjlambda, (real*)low, (real*)upp,
     (real*)alpha, (real*)beta, *n);
  CUDA_CHECK(cudaGetLastError());
}

void mattrans_v_mul_cuda(void* output, void* pij, void* lambda, int* m,
                         int* n) {
  const dim3 nthrds(1024, 1, 1);
  const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
  mattrans_v_mul_kernel<real><<<nblcks, nthrds, 0,
                                (cudaStream_t)glb_cmd_queue>>>
    ((real*)output, (real*)pij, (real*)lambda, *m, *n);
  CUDA_CHECK(cudaGetLastError());
}

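/* Compute the constraint terms bi: mma_sub4_kernel fills an M*N scratch
 * buffer with per-variable contributions, and for each of the M constraints
 * the N contributions are summed with the two-stage block reduction and the
 * scalar result is written into bi on the device. */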
void mma_gensub4_cuda(const void* x, const void* low, const void* upp,
                      const void* pij, const void* qij, const int* n,
                      const int* m, void* bi) {
  const int N = *n;
  const int M = *m;
  const dim3 nthrds(1024, 1, 1);
  const dim3 nblcks((N + 1023) / 1024, 1, 1);
  const int nb = (N + 1023) / 1024;
  const cudaStream_t stream = (cudaStream_t)glb_cmd_queue;

  if (nb > mma_red_s) {
    mma_red_s = nb;
    if (mma_bufred != NULL) {
      CUDA_CHECK(cudaFreeHost(mma_bufred));
      CUDA_CHECK(cudaFree(mma_bufred_d));
    }
    CUDA_CHECK(cudaMallocHost(&mma_bufred, nb * sizeof(real)));
    CUDA_CHECK(cudaMalloc(&mma_bufred_d, nb * sizeof(real)));
  }

  real* bi_d = static_cast<real*>(bi);
  real* temp = NULL;
  CUDA_CHECK(cudaMalloc(&temp, M * N * sizeof(real)));

  /* The trailing kernel arguments (temp, N, M) are an assumption; the
     original argument list was truncated here. */
  mma_sub4_kernel<real><<<nblcks, nthrds, 0, stream>>>(
      static_cast<const real*>(x),
      static_cast<const real*>(low),
      static_cast<const real*>(upp),
      static_cast<const real*>(pij),
      static_cast<const real*>(qij),
      temp, N, M);
  CUDA_CHECK(cudaGetLastError());

  for (int i = 0; i < M; ++i) {
    mmasum_kernel<real><<<nblcks, nthrds, 0, stream>>>(
        temp, mma_bufred_d, N, M, i);
    CUDA_CHECK(cudaGetLastError());

    mmareduce_kernel<real><<<1, 1024, 0, stream>>>(mma_bufred_d, nb);
    CUDA_CHECK(cudaGetLastError());

    CUDA_CHECK(cudaMemcpyAsync(bi_d + i, mma_bufred_d, sizeof(real),
                               cudaMemcpyDeviceToDevice, stream));
  }

  CUDA_CHECK(cudaStreamSynchronize(stream));
  CUDA_CHECK(cudaFree(temp));
}

void mma_gensub3_cuda(void* x, void* df0dx, void* dfdx, void* low, void* upp,
                      void* xmin, void* xmax, void* alpha, void* beta,
                      void* p0j, void* q0j, void* pij, void* qij, int* n,
                      int* m) {
  const dim3 nthrds(1024, 1, 1);
  const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);

  mma_sub3_kernel<real><<<nblcks, nthrds, 0, (cudaStream_t)glb_cmd_queue>>>(
      (real*)x, (real*)df0dx, (real*)dfdx, (real*)low,
      (real*)upp, (real*)xmin, (real*)xmax, (real*)alpha,
      (real*)beta, (real*)p0j, (real*)q0j, (real*)pij,
      (real*)qij, *n, *m);
  CUDA_CHECK(cudaGetLastError());
}

void mma_gensub2_cuda(void* low, void* upp, void* x, void* xold1, void* xold2,
                      void* xdiff, real* asydecr, real* asyincr, int* n) {
  const dim3 nthrds(1024, 1, 1);
  const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);

  mma_sub2_kernel<real><<<nblcks, nthrds, 0, (cudaStream_t)glb_cmd_queue>>>(
      (real*)low, (real*)upp, (real*)x, (real*)xold1,
      (real*)xold2, (real*)xdiff, *asydecr, *asyincr, *n);
  CUDA_CHECK(cudaGetLastError());
}

void mma_gensub1_cuda(void* low, void* upp, void* x, void* xmin, void* xmax,
                      real* asyinit, int* n) {
  const dim3 nthrds(1024, 1, 1);
  const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
  mma_sub1_kernel<real><<<nblcks, nthrds, 0, (cudaStream_t)glb_cmd_queue>>>
    ((real*)low, (real*)upp, (real*)x, (real*)xmin, (real*)xmax,
     *asyinit, *n);
  CUDA_CHECK(cudaGetLastError());
}

void cuda_mma_max(void* xsi, void* x, void* alpha, int* n) {
  const dim3 nthrds(1024, 1, 1);
  const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);

  mma_max2_kernel<real><<<nblcks, nthrds, 0, (cudaStream_t)glb_cmd_queue>>>
    ((real*)xsi, (real*)x, (real*)alpha, *n);
  CUDA_CHECK(cudaGetLastError());
}

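/* Evaluate the m constraint residuals relambda: relambda_kernel fills an
 * n*m scratch buffer with per-variable contributions, which are then summed
 * per constraint with the two-stage block reduction and copied into
 * relambda on the device. */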
void cuda_relambda(void* relambda, void* x, void* xupp, void* xlow, void* pij,
                   void* qij, int* n, int* m) {
  const dim3 nthrds(1024, 1, 1);
  const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
  const int nb = ((*n) + 1024 - 1) / 1024;
  const cudaStream_t stream = (cudaStream_t) glb_cmd_queue;

  if (nb > mma_red_s) {
    mma_red_s = nb;
    if (mma_bufred != NULL) {
      CUDA_CHECK(cudaFreeHost(mma_bufred));
      CUDA_CHECK(cudaFree(mma_bufred_d));
    }
    CUDA_CHECK(cudaMallocHost(&mma_bufred, nb * sizeof(real)));
    CUDA_CHECK(cudaMalloc(&mma_bufred_d, nb * sizeof(real)));
  }

  real* temp = NULL;
  CUDA_CHECK(cudaMalloc(&temp, (*n) * (*m) * sizeof(real)));
  relambda_kernel<real><<<nblcks, nthrds, 0, stream>>>(temp, (real*)x,
      (real*)xupp, (real*)xlow, (real*)pij, (real*)qij, *n, *m);
  for (int i = 0; i < (*m); i++) {
    mmasum_kernel<real><<<nblcks, nthrds, 0, stream>>>
      (temp, mma_bufred_d, (*n), (*m), i);
    CUDA_CHECK(cudaGetLastError());
    mmareduce_kernel<real><<<1, 1024, 0, stream>>>(mma_bufred_d, nb);
    CUDA_CHECK(cudaGetLastError());
    mma_copy_kernel<<<1, 1, 0, stream>>>((real*)relambda, mma_bufred_d, 1, i);
    CUDA_CHECK(cudaGetLastError());
    cudaStreamSynchronize(stream);
  }
  /* Release the scratch buffer. */
  CUDA_CHECK(cudaFree(temp));
}

void cuda_sub2cons2(void* a, void* b, void* c, void* d, real* e, int* n) {
  const dim3 nthrds(1024, 1, 1);
  const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);

  sub2cons2_kernel<real><<<nblcks, nthrds, 0, (cudaStream_t)glb_cmd_queue>>>(
      (real*)a, (real*)b, (real*)c, (real*)d, *e, *n);
  CUDA_CHECK(cudaGetLastError());
}

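/* Maximum value of a device array of length n: per-block maxima are
 * computed first, reduced to a single value on the device, and the scalar
 * is copied back through the pinned host buffer. */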
real cuda_maxval(void* a, int* n) {
  const dim3 nthrds(1024, 1, 1);
  const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
  const int nb = ((*n) + 1024 - 1) / 1024;
  const cudaStream_t stream = (cudaStream_t)glb_cmd_queue;

  if (nb > mma_red_s) {
    mma_red_s = nb;
    if (mma_bufred != NULL) {
      CUDA_CHECK(cudaFreeHost(mma_bufred));
      CUDA_CHECK(cudaFree(mma_bufred_d));
    }
    CUDA_CHECK(cudaMallocHost(&mma_bufred, nb * sizeof(real)));
    CUDA_CHECK(cudaMalloc(&mma_bufred_d, nb * sizeof(real)));
  }

  maxval_kernel<real><<<nblcks, nthrds, 0, stream>>>(
      (real*)a, mma_bufred_d, (*n));
  CUDA_CHECK(cudaGetLastError());

  max_reduce_kernel<real><<<1, 1024, 0, stream>>>(mma_bufred_d, nb);
  CUDA_CHECK(cudaGetLastError());

  CUDA_CHECK(cudaMemcpyAsync(mma_bufred, mma_bufred_d, sizeof(real),
                             cudaMemcpyDeviceToHost, stream));
  cudaStreamSynchronize(stream);

  return mma_bufred[0];
}

void cuda_delx(void* delx, void* x, void* xlow, void* xupp, void* pij,
               void* qij, void* p0j, void* q0j, void* alpha, void* beta,
               void* lambda, real* epsi, int* n, int* m) {
  const dim3 nthrds(1024, 1, 1);
  const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);

  delx_kernel<real><<<nblcks, nthrds, 0, (cudaStream_t)glb_cmd_queue>>>(
      (real*)delx, (real*)x, (real*)xlow, (real*)xupp, (real*)pij,
      (real*)qij, (real*)p0j, (real*)q0j, (real*)alpha, (real*)beta,
      (real*)lambda, *epsi, *n, *m);
  CUDA_CHECK(cudaGetLastError());
}

void cuda_GG(void* GG, void* x, void* xlow, void* xupp, void* pij, void* qij,
             int* n, int* m) {
  const dim3 nthrds(1024, 1, 1);
  const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
  GG_kernel<real><<<nblcks, nthrds, 0, (cudaStream_t)glb_cmd_queue>>>
    ((real*)GG, (real*)x, (real*)xlow, (real*)xupp,
     (real*)pij, (real*)qij, *n, *m);
  CUDA_CHECK(cudaGetLastError());
}

void cuda_diagx(void* diagx, void* x, void* xsi, void* xlow, void* xupp,
                void* p0j, void* q0j, void* pij, void* qij, void* alpha,
                void* beta, void* eta, void* lambda, int *n, int *m) {
  const dim3 nthrds(1024, 1, 1);
  const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
  diagx_kernel<real><<<nblcks, nthrds, 0, (cudaStream_t)glb_cmd_queue>>>
    ((real*)diagx, (real*)x, (real*)xsi, (real*)xlow,
     (real*)xupp, (real*)p0j, (real*)q0j, (real*)pij, (real*)qij,
     (real*)alpha, (real*)beta, (real*)eta, (real*)lambda, *n, *m);
  CUDA_CHECK(cudaGetLastError());
}

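/* Assemble bb: for each of the m constraints, the per-variable
 * contributions of GG, delx and diagx are summed with the two-stage block
 * reduction and copied into bb on the device. */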
void cuda_bb(void* bb, void* GG, void* delx, void* diagx, int *n, int *m) {
  const dim3 nthrds(1024, 1, 1);
  const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
  const int nb = ((*n) + 1024 - 1) / 1024;
  const cudaStream_t stream = (cudaStream_t) glb_cmd_queue;
  cudaStreamSynchronize(stream);

  if (nb > mma_red_s) {
    mma_red_s = nb;
    if (mma_bufred != NULL) {
      CUDA_CHECK(cudaFreeHost(mma_bufred));
      CUDA_CHECK(cudaFree(mma_bufred_d));
    }
    CUDA_CHECK(cudaMallocHost(&mma_bufred, nb * sizeof(real)));
    CUDA_CHECK(cudaMalloc(&mma_bufred_d, nb * sizeof(real)));
  }

  for (int i = 0; i < (*m); i++) {
    mmasumbb_kernel<real><<<nblcks, nthrds, 0, stream>>>
      ((real*)GG, (real*)delx, (real*)diagx, mma_bufred_d, (*n), (*m), i);
    CUDA_CHECK(cudaGetLastError());
    mmareduce_kernel<real><<<1, 1024, 0, stream>>>(mma_bufred_d, nb);
    CUDA_CHECK(cudaGetLastError());
    mma_copy_kernel<<<1, 1, 0, stream>>>((real*)bb, mma_bufred_d, 1, i);
    CUDA_CHECK(cudaGetLastError());
    cudaStreamSynchronize(stream);
  }
}

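/* Assemble the m x m coefficient block from GG and the diagonal diagx, one
 * entry (i, j) at a time, using the same sum-reduce-copy pattern as
 * cuda_Hess. */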
void cuda_AA(void* AA, void* GG, void* diagx, int *n, int *m) {
  const dim3 nthrds(1024, 1, 1);
  const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
  const int nb = ((*n) + 1024 - 1) / 1024;
  const cudaStream_t stream = (cudaStream_t) glb_cmd_queue;
  cudaStreamSynchronize(stream);

  if (nb > mma_red_s) {
    mma_red_s = nb;
    if (mma_bufred != NULL) {
      CUDA_CHECK(cudaFreeHost(mma_bufred));
      CUDA_CHECK(cudaFree(mma_bufred_d));
    }
    CUDA_CHECK(cudaMallocHost(&mma_bufred, nb * sizeof(real)));
    CUDA_CHECK(cudaMalloc(&mma_bufred_d, nb * sizeof(real)));
  }

  for (int i = 0; i < (*m); i++) {
    for (int j = 0; j < (*m); j++) {
      mmasumAA_kernel<real><<<nblcks, nthrds, 0, stream>>>
        ((real*)GG, (real*)diagx, mma_bufred_d, (*n), (*m), i, j);
      CUDA_CHECK(cudaGetLastError());
      mmareduce_kernel<real><<<1, 1024, 0, stream>>>(mma_bufred_d, nb);
      CUDA_CHECK(cudaGetLastError());
      /* Destination offset i * (*m) + j for entry (i, j) is an assumption;
         the original offset expression was lost. */
      mma_copy_kernel<<<1, 1, 0, stream>>>((real*)AA, mma_bufred_d, 1,
                                           i * (*m) + j);
      CUDA_CHECK(cudaGetLastError());
      cudaStreamSynchronize(stream);
    }
  }
}

void cuda_dx(void* dx, void* delx, void* diagx, void* GG, void* dlambda,
             int* n, int* m) {
  const dim3 nthrds(1024, 1, 1);
  const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
  dx_kernel<real><<<nblcks, nthrds, 0, (cudaStream_t)glb_cmd_queue>>>
    ((real*)dx, (real*)delx, (real*)diagx, (real*)GG, (real*)dlambda,
     *n, *m);
  CUDA_CHECK(cudaGetLastError());
}

void cuda_dxsi(void* dxsi, void* xsi, void* dx, void* x, void* alpha,
               real* epsi, int* n) {
  const dim3 nthrds(1024, 1, 1);
  const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
  dxsi_kernel<real><<<nblcks, nthrds, 0, (cudaStream_t)glb_cmd_queue>>>
    ((real*)dxsi, (real*)xsi, (real*)dx, (real*)x,
     (real*)alpha, *epsi, *n);
  CUDA_CHECK(cudaGetLastError());
}

void cuda_deta(void* deta, void* eta, void* dx, void* x, void* beta,
               real* epsi, int* n) {
  const dim3 nthrds(1024, 1, 1);
  const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
  deta_kernel<real><<<nblcks, nthrds, 0, (cudaStream_t)glb_cmd_queue>>>
    ((real*)deta, (real*)eta, (real*)dx, (real*)x,
     (real*)beta, *epsi, *n);
  CUDA_CHECK(cudaGetLastError());
}

void cuda_rex(void* rex, void* x, void* xlow, void* xupp, void* pij,
              void* p0j, void* qij, void* q0j, void* lambda, void* xsi,
              void* eta, int* n, int* m) {
  const dim3 nthrds(1024, 1, 1);
  const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
  RexCalculation_kernel<real><<<nblcks, nthrds, 0,
                                (cudaStream_t)glb_cmd_queue>>>
    ((real*)rex, (real*)x, (real*)xlow,
     (real*)xupp, (real*)pij, (real*)p0j, (real*)qij, (real*)q0j,
     (real*)lambda, (real*)xsi, (real*)eta, *n, *m);
  CUDA_CHECK(cudaGetLastError());
}

void cuda_rey(void* rey, void* c, void* d, void* y, void* lambda, void* mu,
              int* n) {
  const dim3 nthrds(1024, 1, 1);
  const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
  rey_calculation_kernel<real><<<nblcks, nthrds, 0,
                                 (cudaStream_t)glb_cmd_queue>>>
    ((real*)rey, (real*)c,
     (real*)d, (real*)y, (real*)lambda, (real*)mu, *n);
  CUDA_CHECK(cudaGetLastError());
}

void cuda_sub2cons(void* a, void* b, void* c, real* d, int* n) {
  const dim3 nthrds(1024, 1, 1);
  const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
  sub2cons_kernel<real><<<nblcks, nthrds, 0, (cudaStream_t)glb_cmd_queue>>>
    ((real*)a, (real*)b, (real*)c, *d, *n);
  CUDA_CHECK(cudaGetLastError());
}

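/* Sum-reduce the element-wise values produced by norm_kernel over an array
 * of length n and return the scalar on the host; any final square root is
 * assumed to be taken by the caller. */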
real cuda_norm(void* a, int* n) {
  const dim3 nthrds(1024, 1, 1);
  const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
  const int nb = ((*n) + 1024 - 1) / 1024;
  const cudaStream_t stream = (cudaStream_t) glb_cmd_queue;

  if (nb > mma_red_s) {
    mma_red_s = nb;
    if (mma_bufred != NULL) {
      CUDA_CHECK(cudaFreeHost(mma_bufred));
      CUDA_CHECK(cudaFree(mma_bufred_d));
    }
    CUDA_CHECK(cudaMallocHost(&mma_bufred, nb * sizeof(real)));
    CUDA_CHECK(cudaMalloc(&mma_bufred_d, nb * sizeof(real)));
  }

  norm_kernel<real><<<nblcks, nthrds, 0, stream>>>
    ((real*)a, mma_bufred_d, (*n));
  CUDA_CHECK(cudaGetLastError());
  mmareduce_kernel<real><<<1, 1024, 0, stream>>>(mma_bufred_d, nb);
  CUDA_CHECK(cudaGetLastError());
  CUDA_CHECK(cudaMemcpyAsync(mma_bufred, mma_bufred_d, sizeof(real),
                             cudaMemcpyDeviceToHost, stream));
  cudaStreamSynchronize(stream);
  return mma_bufred[0];
}

void cuda_dely(void* dely, void* c, void* d, void* y, void* lambda,
               real* epsi, int* n) {
  const dim3 nthrds(1024, 1, 1);
  const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
  dely_kernel<real><<<nblcks, nthrds, 0, (cudaStream_t)glb_cmd_queue>>>
    ((real*)dely, (real*)c, (real*)d, (real*)y, (real*)lambda, *epsi, *n);
  CUDA_CHECK(cudaGetLastError());
}

real cuda_maxval2(void* a, void* b, real* cons, int* n) {
  const dim3 nthrds(1024, 1, 1);
  const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
  const int nb = ((*n) + 1024 - 1) / 1024;
  const cudaStream_t stream = (cudaStream_t) glb_cmd_queue;

  if (nb > mma_red_s) {
    mma_red_s = nb;
    if (mma_bufred != NULL) {
      CUDA_CHECK(cudaFreeHost(mma_bufred));
      CUDA_CHECK(cudaFree(mma_bufred_d));
    }
    CUDA_CHECK(cudaMallocHost(&mma_bufred, nb * sizeof(real)));
    CUDA_CHECK(cudaMalloc(&mma_bufred_d, nb * sizeof(real)));
  }

  maxval2_kernel<real><<<nblcks, nthrds, 0, stream>>>
    ((real*)a, (real*)b, mma_bufred_d, *cons, *n);
  CUDA_CHECK(cudaGetLastError());
  max_reduce_kernel<real><<<1, 1024, 0, stream>>>(mma_bufred_d, nb);
  CUDA_CHECK(cudaGetLastError());
  CUDA_CHECK(cudaMemcpyAsync(mma_bufred, mma_bufred_d, sizeof(real),
                             cudaMemcpyDeviceToHost, stream));
  cudaStreamSynchronize(stream);
  return mma_bufred[0];
}

real cuda_maxval3(void* a, void* b, void* c, real* cons, int* n) {
  const dim3 nthrds(1024, 1, 1);
  const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
  const int nb = ((*n) + 1024 - 1) / 1024;
  const cudaStream_t stream = (cudaStream_t) glb_cmd_queue;

  if (nb > mma_red_s) {
    mma_red_s = nb;
    if (mma_bufred != NULL) {
      CUDA_CHECK(cudaFreeHost(mma_bufred));
      CUDA_CHECK(cudaFree(mma_bufred_d));
    }
    CUDA_CHECK(cudaMallocHost(&mma_bufred, nb * sizeof(real)));
    CUDA_CHECK(cudaMalloc(&mma_bufred_d, nb * sizeof(real)));
  }

  maxval3_kernel<real><<<nblcks, nthrds, 0, stream>>>
    ((real*)a, (real*)b, (real*)c, mma_bufred_d, *cons, *n);
  CUDA_CHECK(cudaGetLastError());
  max_reduce_kernel<real><<<1, 1024, 0, stream>>>(mma_bufred_d, nb);
  CUDA_CHECK(cudaGetLastError());
  CUDA_CHECK(cudaMemcpyAsync(mma_bufred, mma_bufred_d, sizeof(real),
                             cudaMemcpyDeviceToHost, stream));
  cudaStreamSynchronize(stream);
  return mma_bufred[0];
}

void cuda_kkt_rex(void* rex, void* df0dx, void* dfdx, void* xsi, void* eta,
                  void* lambda, int* n, int* m) {
  const dim3 nthrds(1024, 1, 1);
  const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
  kkt_rex_kernel<real><<<nblcks, nthrds, 0, (cudaStream_t)glb_cmd_queue>>>
    ((real*)rex, (real*)df0dx, (real*)dfdx, (real*)xsi,
     (real*)eta, (real*)lambda, *n, *m);
  CUDA_CHECK(cudaGetLastError());
}

void cuda_maxcons(void* a, real* b, real* c, void* d, int* n) {
  const dim3 nthrds(1024, 1, 1);
  const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
  maxcons_kernel<real><<<nblcks, nthrds, 0, (cudaStream_t)glb_cmd_queue>>>
    ((real*)a, *b, *c, (real*)d, *n);
  CUDA_CHECK(cudaGetLastError());
}

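/* Global dot product of two device arrays of length n, computed with the
 * two-stage block reduction and returned on the host. */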
real cuda_lcsc2(void* a, void* b, int* n) {
  const dim3 nthrds(1024, 1, 1);
  const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
  const int nb = ((*n) + 1024 - 1) / 1024;
  const cudaStream_t stream = (cudaStream_t) glb_cmd_queue;
  if (nb > mma_red_s) {
    mma_red_s = nb;
    if (mma_bufred != NULL) {
      CUDA_CHECK(cudaFreeHost(mma_bufred));
      CUDA_CHECK(cudaFree(mma_bufred_d));
    }
    CUDA_CHECK(cudaMallocHost(&mma_bufred, nb * sizeof(real)));
    CUDA_CHECK(cudaMalloc(&mma_bufred_d, nb * sizeof(real)));
  }

  glsc2_kernel<real><<<nblcks, nthrds, 0, stream>>>
    ((real*)a, (real*)b, mma_bufred_d, (*n));
  CUDA_CHECK(cudaGetLastError());
  mmareduce_kernel<real><<<1, 1024, 0, stream>>>(mma_bufred_d, nb);
  CUDA_CHECK(cudaGetLastError());
  CUDA_CHECK(cudaMemcpyAsync(mma_bufred, mma_bufred_d, sizeof(real),
                             cudaMemcpyDeviceToHost, stream));
  cudaStreamSynchronize(stream);
  return mma_bufred[0];
}

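/* In-place MPI sum reduction of a device array across ranks, used when
 * device-aware MPI is available. */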
void cuda_mpisum(void* a, int* n) {
#ifdef HAVE_DEVICE_MPI
  const cudaStream_t stream = (cudaStream_t) glb_cmd_queue;
  cudaStreamSynchronize(stream);
  device_mpi_allreduce_inplace(a, *n, sizeof(real), DEVICE_MPI_SUM);
#endif
}

void cuda_add2inv2(void* a, void* b, real* c, int* n) {
  const dim3 nthrds(1024, 1, 1);
  const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
  add2inv2_kernel<real><<<nblcks, nthrds, 0, (cudaStream_t)glb_cmd_queue>>>
    ((real*)a, (real*)b, *c, *n);
  CUDA_CHECK(cudaGetLastError());
}

void cuda_max2(void* a, real* b, void* c, real* d, int* n) {
  const dim3 nthrds(1024, 1, 1);
  const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
  max2_kernel<real><<<nblcks, nthrds, 0, (cudaStream_t)glb_cmd_queue>>>
    ((real*)a, *b, (real*)c, *d, *n);
  CUDA_CHECK(cudaGetLastError());
}

void cuda_updatebb(void* bb, void* dellambda, void* dely, void* d, void* mu,
                   void* y, real* delz, int *m) {
  const dim3 nthrds(1024, 1, 1);
  const dim3 nblcks(((*m + 1) + 1024 - 1) / 1024, 1, 1);
  updatebb_kernel<real><<<nblcks, nthrds, 0, (cudaStream_t)glb_cmd_queue>>>
    ((real*)bb, (real*)dellambda, (real*)dely, (real*)d,
     (real*)mu, (real*)y, *delz, *m);
  CUDA_CHECK(cudaGetLastError());
}

void cuda_updateAA(void* AA, void* globaltmp_mm, void* s, void* lambda,
                   void* d, void* mu, void* y, void* a, real* zeta, real* z,
                   int* m) {
  const dim3 nthrds(1024, 1, 1);
  const dim3 nblcks(((*m + 1) + 1024 - 1) / 1024, 1, 1);
  updateAA_kernel<real><<<nblcks, nthrds, 0, (cudaStream_t)glb_cmd_queue>>>
    ((real*)AA, (real*)globaltmp_mm, (real*)s, (real*)lambda, (real*)d,
     (real*)mu, (real*)y, (real*)a, *zeta, *z, *m);
  CUDA_CHECK(cudaGetLastError());
}

void cuda_dy(void* dy, void* dely, void* dlambda, void* d, void* mu, void* y,
             int* n) {
  const dim3 nthrds(1024, 1, 1);
  const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
  dy_kernel<real><<<nblcks, nthrds, 0, (cudaStream_t)glb_cmd_queue>>>
    ((real*)dy, (real*)dely, (real*)dlambda, (real*)d,
     (real*)mu, (real*)y, *n);
  CUDA_CHECK(cudaGetLastError());
}