#include <cuda_runtime.h>
#include <cusolverDn.h>

#include <neko/device/device_config.h>
#include <neko/device/cuda/check.h>
#include <neko/math/bcknd/device/device_mpi_reduce.h>
#include <neko/math/bcknd/device/device_mpi_op.h>

#include "mma_kernel.h"
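
/* Pinned host buffer and device buffer reused by the block-wise reductions
 * below; they are grown on demand and kept allocated between calls. */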
real* mma_bufred = NULL;
real* mma_bufred_d = NULL;
int mma_red_s = 0;
void mma_prepare_aa_matrix_cuda(void* AA, void* s, void* lambda, void* d,
                                void* mu, void* y, void* a, real* zeta,
                                real* z, int* m) {
  const dim3 nthrds(1024, 1, 1);
  const dim3 nblcks(((*m) + 1024 - 1) / 1024, 1, 1);

  mma_prepare_aa_matrix_kernel<real>
    <<<nblcks, nthrds, 0, (cudaStream_t)glb_cmd_queue>>>(
      (real*)AA, (real*)s, (real*)lambda, (real*)d,
      (real*)mu, (real*)y, (real*)a, *zeta, *z, *m);
  CUDA_CHECK(cudaGetLastError());
}

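/* Update the diagonal of the Hessian on the device and stabilise it:
 * directly for a single constraint, otherwise with a trace-scaled shift
 * computed on the host. */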
void mma_prepare_hessian_cuda(void* Hess, void* y, void* d, void* mu,
                              void* lambda, int* m) {
  const int M = *m;
  const dim3 nthrds(1024, 1, 1);
  const dim3 nblcks((M + 1024 - 1) / 1024, 1, 1);
  const cudaStream_t stream = (cudaStream_t)glb_cmd_queue;

  mma_update_hessian_diagonal_kernel<real><<<nblcks, nthrds, 0, stream>>>(
    (real*)Hess, (real*)y, (real*)d, (real*)mu, (real*)lambda, M);
  CUDA_CHECK(cudaGetLastError());

  CUDA_CHECK(cudaStreamSynchronize(stream));

  if (M == 1) {
    /* Single constraint: stabilise the 1x1 Hessian directly on the device. */
    const dim3 stab_nblcks(1, 1, 1);
    mma_stabilize_hessian_single_kernel<real>
      <<<stab_nblcks, nthrds, 0, stream>>>((real*)Hess, M);
    CUDA_CHECK(cudaGetLastError());
  } else {
    /* Copy the diagonal to the host and accumulate the trace. */
    real* h_Hess = (real*)malloc(M * sizeof(real));
    for (int i = 0; i < M; i++) {
      CUDA_CHECK(cudaMemcpyAsync(&h_Hess[i], (real*)Hess + i * M + i,
                                 sizeof(real), cudaMemcpyDeviceToHost,
                                 stream));
    }
    CUDA_CHECK(cudaStreamSynchronize(stream));

    real trace = 0.0;
    for (int i = 0; i < M; i++) {
      trace += h_Hess[i];
    }
    free(h_Hess);

    /* Trace-scaled diagonal shift applied on the device. */
    real lm_factor = fmax(-1.0e-4 * trace / M, 1.0e-7);
    mma_stabilize_hessian_multi_kernel<real><<<nblcks, nthrds, 0, stream>>>(
      (real*)Hess, lm_factor, M);
    CUDA_CHECK(cudaGetLastError());
  }
}

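/* Solve the small dense system A x = b with the single-block LU kernel;
 * info is set to 0 on a successful launch and 1 otherwise. */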
void cuda_custom_solver(void* A, void* b, int n, int* info) {
  const cudaStream_t stream = (cudaStream_t)glb_cmd_queue;

  const dim3 nthrds(1024, 1, 1);
  const dim3 nblcks(1, 1, 1);

  mma_small_lu_kernel<real><<<nblcks, nthrds, 0, stream>>>(
    (real*)A, (real*)b, n);

  cudaError_t err = cudaGetLastError();
  if (err == cudaSuccess) {
    *info = 0;
  } else {
    *info = 1;
  }
}

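/* Dense LU solve of A x = b through cuSOLVER (getrf + getrs); the
 * LAPACK-style status is returned to the caller through jj. */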
void cuSOLVER_wrapper(void* A, void* b, int n, int* jj) {
  cusolverDnHandle_t handle;
  cusolverStatus_t status;
  cusolverDnCreate(&handle);

  int lwork = 0;
  double* workspace = NULL;
  int* ipiv = NULL;
  int* info = NULL;
  int host_info = 0;

  status = cusolverDnDgetrf_bufferSize(handle, n, n, (double*)A, n, &lwork);
  if (status != CUSOLVER_STATUS_SUCCESS) {
    cusolverDnDestroy(handle);
    return;
  }

  cudaMalloc(&workspace, lwork * sizeof(double));
  cudaMalloc(&ipiv, n * sizeof(int));
  cudaMalloc(&info, sizeof(int));

  /* LU factorisation of A. */
  cusolverDnDgetrf(handle, n, n, (double*)A, n, workspace, ipiv, info);
  cudaMemcpy(&host_info, info, sizeof(int), cudaMemcpyDeviceToHost);

  if (host_info == 0) {
    /* Forward/backward substitution with the factors. */
    cusolverDnDgetrs(handle, CUBLAS_OP_N, n, 1, (double*)A, n, ipiv,
                     (double*)b, n, info);
    cudaMemcpy(&host_info, info, sizeof(int), cudaMemcpyDeviceToHost);
  }

  *jj = host_info;

  cudaFree(workspace);
  cudaFree(ipiv);
  cudaFree(info);
  cusolverDnDestroy(handle);
}

void custom_solve_linear_system(void* A, void* b, int n, int* info) {
  const cudaStream_t stream = (cudaStream_t)glb_cmd_queue;

  const dim3 nthrds(1024, 1, 1);
  const dim3 nblcks(1, 1, 1);

  mma_small_lu_kernel<real><<<nblcks, nthrds, 0, stream>>>(
    (real*)A, (real*)b, n);

  cudaError_t err = cudaGetLastError();
  if (err == cudaSuccess) {
    *info = 0;
  } else {
    *info = 1;
  }
}

void delta_1dbeam_cuda(void* Delta, real* L_total, real* Le, int* offset,
                       int* n) {
  const dim3 nthrds(1024, 1, 1);
  const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
  delta_1dbeam_kernel<real>
    <<<nblcks, nthrds, 0, (cudaStream_t)glb_cmd_queue>>>(
      (real*)Delta, *L_total, *Le, *offset, *n);
  CUDA_CHECK(cudaGetLastError());
}

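/* For every constraint pair (i, j), reduce an n-long contribution built from
 * hijx and Ljjxinv into a single value and store it as an entry of Hess. */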
void cuda_Hess(void* Hess, void* hijx, void* Ljjxinv, int* n, int* m) {
  const dim3 nthrds(1024, 1, 1);
  const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
  const int nb = ((*n) + 1024 - 1) / 1024;
  const cudaStream_t stream = (cudaStream_t)glb_cmd_queue;
  cudaStreamSynchronize(stream);

  if (nb > mma_red_s) {
    mma_red_s = nb;
    if (mma_bufred != NULL) {
      CUDA_CHECK(cudaFreeHost(mma_bufred));
      CUDA_CHECK(cudaFree(mma_bufred_d));
    }
    CUDA_CHECK(cudaMallocHost(&mma_bufred, nb * sizeof(real)));
    CUDA_CHECK(cudaMalloc(&mma_bufred_d, nb * sizeof(real)));
  }

  for (int i = 0; i < (*m); i++) {
    for (int j = 0; j < (*m); j++) {
      mmasumHess_kernel<real><<<nblcks, nthrds, 0, stream>>>(
        (real*)hijx, (real*)Ljjxinv, mma_bufred_d, (*n), (*m), i, j);
      CUDA_CHECK(cudaGetLastError());
      mmareduce_kernel<real><<<1, 1024, 0, stream>>>(mma_bufred_d, nb);
      CUDA_CHECK(cudaGetLastError());
      /* Store the reduced value as the (i,j) entry (column-major). */
      mma_copy_kernel<<<1, 1, 0, stream>>>((real*)Hess, mma_bufred_d, 1,
                                           i + j * (*m));
      CUDA_CHECK(cudaGetLastError());
      cudaStreamSynchronize(stream);
    }
  }
}

void mma_Ljjxinv_cuda(void* Ljjxinv, void* pjlambda, void* qjlambda, void* x,
                      void* low, void* upp, void* alpha, void* beta, int* n) {
  const dim3 nthrds(1024, 1, 1);
  const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
  mma_Ljjxinv_kernel<real>
    <<<nblcks, nthrds, 0, (cudaStream_t)glb_cmd_queue>>>(
      (real*)Ljjxinv, (real*)pjlambda, (real*)qjlambda, (real*)x, (real*)low,
      (real*)upp, (real*)alpha, (real*)beta, *n);
  CUDA_CHECK(cudaGetLastError());
}

void mma_dipsolvesub1_cuda(void* x, void* pjlambda, void* qjlambda, void* low,
                           void* upp, void* alpha, void* beta, int* n) {
  const dim3 nthrds(1024, 1, 1);
  const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
  mma_dipsolvesub1_kernel<real>
    <<<nblcks, nthrds, 0, (cudaStream_t)glb_cmd_queue>>>(
      (real*)x, (real*)pjlambda, (real*)qjlambda, (real*)low, (real*)upp,
      (real*)alpha, (real*)beta, *n);
  CUDA_CHECK(cudaGetLastError());
}

void mattrans_v_mul_cuda(void* output, void* pij, void* lambda, int* m,
                         int* n) {
  const dim3 nthrds(1024, 1, 1);
  const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
  mattrans_v_mul_kernel<real>
    <<<nblcks, nthrds, 0, (cudaStream_t)glb_cmd_queue>>>(
      (real*)output, (real*)pij, (real*)lambda, *m, *n);
  CUDA_CHECK(cudaGetLastError());
}

void mma_gensub4_cuda(const void* x, const void* low, const void* upp,
                      const void* pij, const void* qij, const int* n,
                      const int* m, void* bi) {
  const int N = *n;
  const int M = *m;

  const dim3 nthrds(1024, 1, 1);
  const dim3 nblcks((N + 1023) / 1024, 1, 1);
  const int nb = (N + 1023) / 1024;
  const cudaStream_t stream = (cudaStream_t)glb_cmd_queue;

  if (nb > mma_red_s) {
    mma_red_s = nb;
    if (mma_bufred != NULL) {
      CUDA_CHECK(cudaFreeHost(mma_bufred));
      CUDA_CHECK(cudaFree(mma_bufred_d));
    }
    CUDA_CHECK(cudaMallocHost(&mma_bufred, nb * sizeof(real)));
    CUDA_CHECK(cudaMalloc(&mma_bufred_d, nb * sizeof(real)));
  }

  real* bi_d = static_cast<real*>(bi);
  real* temp = NULL;
  CUDA_CHECK(cudaMalloc(&temp, M * N * sizeof(real)));

  mma_sub4_kernel<real><<<nblcks, nthrds, 0, stream>>>(
    static_cast<const real*>(x),
    static_cast<const real*>(low),
    static_cast<const real*>(upp),
    static_cast<const real*>(pij),
    static_cast<const real*>(qij),
    temp, N, M);
  CUDA_CHECK(cudaGetLastError());

  /* Reduce temp over the n direction for each constraint i, into bi(i). */
  for (int i = 0; i < M; ++i) {
    mmasum_kernel<real><<<nblcks, nthrds, 0, stream>>>(
      temp, mma_bufred_d, N, M, i);
    CUDA_CHECK(cudaGetLastError());

    mmareduce_kernel<real><<<1, 1024, 0, stream>>>(mma_bufred_d, nb);
    CUDA_CHECK(cudaGetLastError());

    CUDA_CHECK(cudaMemcpyAsync(bi_d + i, mma_bufred_d, sizeof(real),
                               cudaMemcpyDeviceToDevice, stream));
  }

  CUDA_CHECK(cudaStreamSynchronize(stream));

  CUDA_CHECK(cudaFree(temp));
}

void mma_gensub3_cuda(void* x, void* df0dx, void* dfdx, void* low, void* upp,
                      void* xmin, void* xmax, void* alpha, void* beta,
                      void* p0j, void* q0j, void* pij, void* qij, int* n,
                      int* m) {
  const dim3 nthrds(1024, 1, 1);
  const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);

  mma_sub3_kernel<real><<<nblcks, nthrds, 0, (cudaStream_t)glb_cmd_queue>>>(
    (real*)x, (real*)df0dx, (real*)dfdx, (real*)low,
    (real*)upp, (real*)xmin, (real*)xmax, (real*)alpha,
    (real*)beta, (real*)p0j, (real*)q0j, (real*)pij,
    (real*)qij, *n, *m);
  CUDA_CHECK(cudaGetLastError());
}

void mma_gensub2_cuda(void* low, void* upp, void* x, void* xold1, void* xold2,
                      void* xdiff, real* asydecr, real* asyincr, int* n) {
  const dim3 nthrds(1024, 1, 1);
  const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);

  mma_sub2_kernel<real><<<nblcks, nthrds, 0, (cudaStream_t)glb_cmd_queue>>>(
    (real*)low, (real*)upp, (real*)x, (real*)xold1,
    (real*)xold2, (real*)xdiff, *asydecr, *asyincr, *n);

  CUDA_CHECK(cudaGetLastError());
}

void mma_gensub1_cuda(void* low, void* upp, void* x, void* xmin, void* xmax,
                      real* asyinit, int* n) {
  const dim3 nthrds(1024, 1, 1);
  const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
  mma_sub1_kernel<real><<<nblcks, nthrds, 0, (cudaStream_t)glb_cmd_queue>>>(
    (real*)low, (real*)upp, (real*)x, (real*)xmin, (real*)xmax,
    *asyinit, *n);
  CUDA_CHECK(cudaGetLastError());
}

void cuda_mma_max(void* xsi, void* x, void* alpha, int* n) {
  const dim3 nthrds(1024, 1, 1);
  const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);

  mma_max2_kernel<real><<<nblcks, nthrds, 0, (cudaStream_t)glb_cmd_queue>>>(
    (real*)xsi, (real*)x, (real*)alpha, *n);
  CUDA_CHECK(cudaGetLastError());
}

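/* For each of the m constraints, sum the n-long contribution produced by
 * relambda_kernel and store the result in relambda(i). */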
void cuda_relambda(void* relambda, void* x, void* xupp, void* xlow, void* pij,
                   void* qij, int* n, int* m) {
  const dim3 nthrds(1024, 1, 1);
  const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
  const int nb = ((*n) + 1024 - 1) / 1024;
  const cudaStream_t stream = (cudaStream_t)glb_cmd_queue;

  if (nb > mma_red_s) {
    mma_red_s = nb;
    if (mma_bufred != NULL) {
      CUDA_CHECK(cudaFreeHost(mma_bufred));
      CUDA_CHECK(cudaFree(mma_bufred_d));
    }
    CUDA_CHECK(cudaMallocHost(&mma_bufred, nb * sizeof(real)));
    CUDA_CHECK(cudaMalloc(&mma_bufred_d, nb * sizeof(real)));
  }

  real* temp = NULL;
  cudaMalloc(&temp, (*n) * (*m) * sizeof(real));
  relambda_kernel<real><<<nblcks, nthrds, 0, stream>>>(
    temp, (real*)x, (real*)xupp, (real*)xlow, (real*)pij, (real*)qij, *n, *m);

  for (int i = 0; i < (*m); i++) {
    mmasum_kernel<real><<<nblcks, nthrds, 0, stream>>>(
      temp, mma_bufred_d, (*n), (*m), i);
    CUDA_CHECK(cudaGetLastError());
    mmareduce_kernel<real><<<1, 1024, 0, stream>>>(mma_bufred_d, nb);
    CUDA_CHECK(cudaGetLastError());
    mma_copy_kernel<<<1, 1, 0, stream>>>((real*)relambda, mma_bufred_d, 1, i);
    CUDA_CHECK(cudaGetLastError());
    cudaStreamSynchronize(stream);
  }

  cudaFree(temp);
}

void cuda_sub2cons2(void* a, void* b, void* c, void* d, real* e, int* n) {
  const dim3 nthrds(1024, 1, 1);
  const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);

  sub2cons2_kernel<real><<<nblcks, nthrds, 0, (cudaStream_t)glb_cmd_queue>>>(
    (real*)a, (real*)b, (real*)c, (real*)d, *e, *n);
  CUDA_CHECK(cudaGetLastError());
}

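/* Maximum entry of the device array a of length n: block-wise partial maxima
 * are reduced on the device and the result is copied back to the host. */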
real cuda_maxval(void* a, int* n) {
  const dim3 nthrds(1024, 1, 1);
  const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
  const int nb = ((*n) + 1024 - 1) / 1024;
  const cudaStream_t stream = (cudaStream_t)glb_cmd_queue;

  if (nb > mma_red_s) {
    mma_red_s = nb;
    if (mma_bufred != NULL) {
      CUDA_CHECK(cudaFreeHost(mma_bufred));
      CUDA_CHECK(cudaFree(mma_bufred_d));
    }
    CUDA_CHECK(cudaMallocHost(&mma_bufred, nb * sizeof(real)));
    CUDA_CHECK(cudaMalloc(&mma_bufred_d, nb * sizeof(real)));
  }

  maxval_kernel<real><<<nblcks, nthrds, 0, stream>>>(
    (real*)a, mma_bufred_d, (*n));
  CUDA_CHECK(cudaGetLastError());

  max_reduce_kernel<real><<<1, 1024, 0, stream>>>(mma_bufred_d, nb);
  CUDA_CHECK(cudaGetLastError());

  CUDA_CHECK(cudaMemcpyAsync(mma_bufred, mma_bufred_d, sizeof(real),
                             cudaMemcpyDeviceToHost, stream));
  cudaStreamSynchronize(stream);

  return mma_bufred[0];
}

void cuda_delx(void* delx, void* x, void* xlow, void* xupp, void* pij,
               void* qij, void* p0j, void* q0j, void* alpha, void* beta,
               void* lambda, real* epsi, int* n, int* m) {
  const dim3 nthrds(1024, 1, 1);
  const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);

  delx_kernel<real><<<nblcks, nthrds, 0, (cudaStream_t)glb_cmd_queue>>>(
    (real*)delx, (real*)x, (real*)xlow, (real*)xupp, (real*)pij,
    (real*)qij, (real*)p0j, (real*)q0j, (real*)alpha, (real*)beta,
    (real*)lambda, *epsi, *n, *m);
  CUDA_CHECK(cudaGetLastError());
}

void cuda_GG(void* GG, void* x, void* xlow, void* xupp, void* pij, void* qij,
             int* n, int* m) {
  const dim3 nthrds(1024, 1, 1);
  const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
  GG_kernel<real><<<nblcks, nthrds, 0, (cudaStream_t)glb_cmd_queue>>>(
    (real*)GG, (real*)x, (real*)xlow, (real*)xupp,
    (real*)pij, (real*)qij, *n, *m);
  CUDA_CHECK(cudaGetLastError());
}

void cuda_diagx(void* diagx, void* x, void* xsi, void* xlow, void* xupp,
                void* p0j, void* q0j, void* pij, void* qij, void* alpha,
                void* beta, void* eta, void* lambda, int* n, int* m) {
  const dim3 nthrds(1024, 1, 1);
  const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
  diagx_kernel<real><<<nblcks, nthrds, 0, (cudaStream_t)glb_cmd_queue>>>(
    (real*)diagx, (real*)x, (real*)xsi, (real*)xlow,
    (real*)xupp, (real*)p0j, (real*)q0j, (real*)pij, (real*)qij,
    (real*)alpha, (real*)beta, (real*)eta, (real*)lambda, *n, *m);
  CUDA_CHECK(cudaGetLastError());
}

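/* For each constraint i, reduce an n-long contribution built from GG, delx
 * and diagx into a single value and store it in bb(i). */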
void cuda_bb(void* bb, void* GG, void* delx, void* diagx, int* n, int* m) {
  const dim3 nthrds(1024, 1, 1);
  const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
  const int nb = ((*n) + 1024 - 1) / 1024;
  const cudaStream_t stream = (cudaStream_t)glb_cmd_queue;
  cudaStreamSynchronize(stream);

  if (nb > mma_red_s) {
    mma_red_s = nb;
    if (mma_bufred != NULL) {
      CUDA_CHECK(cudaFreeHost(mma_bufred));
      CUDA_CHECK(cudaFree(mma_bufred_d));
    }
    CUDA_CHECK(cudaMallocHost(&mma_bufred, nb * sizeof(real)));
    CUDA_CHECK(cudaMalloc(&mma_bufred_d, nb * sizeof(real)));
  }

  for (int i = 0; i < (*m); i++) {
    mmasumbb_kernel<real><<<nblcks, nthrds, 0, stream>>>(
      (real*)GG, (real*)delx, (real*)diagx, mma_bufred_d, (*n), (*m), i);
    CUDA_CHECK(cudaGetLastError());
    mmareduce_kernel<real><<<1, 1024, 0, stream>>>(mma_bufred_d, nb);
    CUDA_CHECK(cudaGetLastError());
    mma_copy_kernel<<<1, 1, 0, stream>>>((real*)bb, mma_bufred_d, 1, i);
    CUDA_CHECK(cudaGetLastError());
    cudaStreamSynchronize(stream);
  }
}

void cuda_AA(void* AA, void* GG, void* diagx, int* n, int* m) {
  const dim3 nthrds(1024, 1, 1);
  const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
  const int nb = ((*n) + 1024 - 1) / 1024;
  const cudaStream_t stream = (cudaStream_t)glb_cmd_queue;
  cudaStreamSynchronize(stream);

  if (nb > mma_red_s) {
    mma_red_s = nb;
    if (mma_bufred != NULL) {
      CUDA_CHECK(cudaFreeHost(mma_bufred));
      CUDA_CHECK(cudaFree(mma_bufred_d));
    }
    CUDA_CHECK(cudaMallocHost(&mma_bufred, nb * sizeof(real)));
    CUDA_CHECK(cudaMalloc(&mma_bufred_d, nb * sizeof(real)));
  }

  for (int i = 0; i < (*m); i++) {
    for (int j = 0; j < (*m); j++) {
      mmasumAA_kernel<real><<<nblcks, nthrds, 0, stream>>>(
        (real*)GG, (real*)diagx, mma_bufred_d, (*n), (*m), i, j);
      CUDA_CHECK(cudaGetLastError());
      mmareduce_kernel<real><<<1, 1024, 0, stream>>>(mma_bufred_d, nb);
      CUDA_CHECK(cudaGetLastError());
      /* Store the reduced value as the (i,j) entry (column-major). */
      mma_copy_kernel<<<1, 1, 0, stream>>>((real*)AA, mma_bufred_d, 1,
                                           i + j * (*m));
      CUDA_CHECK(cudaGetLastError());
      cudaStreamSynchronize(stream);
    }
  }
}

void cuda_dx(void* dx, void* delx, void* diagx, void* GG, void* dlambda,
             int* n, int* m) {
  const dim3 nthrds(1024, 1, 1);
  const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
  dx_kernel<real><<<nblcks, nthrds, 0, (cudaStream_t)glb_cmd_queue>>>(
    (real*)dx, (real*)delx, (real*)diagx, (real*)GG, (real*)dlambda,
    *n, *m);
  CUDA_CHECK(cudaGetLastError());
}

void cuda_dxsi(void* dxsi, void* xsi, void* dx, void* x, void* alpha,
               real* epsi, int* n) {
  const dim3 nthrds(1024, 1, 1);
  const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
  dxsi_kernel<real><<<nblcks, nthrds, 0, (cudaStream_t)glb_cmd_queue>>>(
    (real*)dxsi, (real*)xsi, (real*)dx, (real*)x,
    (real*)alpha, *epsi, *n);
  CUDA_CHECK(cudaGetLastError());
}

void cuda_deta(void* deta, void* eta, void* dx, void* x, void* beta,
               real* epsi, int* n) {
  const dim3 nthrds(1024, 1, 1);
  const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
  deta_kernel<real><<<nblcks, nthrds, 0, (cudaStream_t)glb_cmd_queue>>>(
    (real*)deta, (real*)eta, (real*)dx, (real*)x,
    (real*)beta, *epsi, *n);
  CUDA_CHECK(cudaGetLastError());
}

void cuda_rex(void* rex, void* x, void* xlow, void* xupp, void* pij,
              void* p0j, void* qij, void* q0j, void* lambda, void* xsi,
              void* eta, int* n, int* m) {
  const dim3 nthrds(1024, 1, 1);
  const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
  RexCalculation_kernel<real>
    <<<nblcks, nthrds, 0, (cudaStream_t)glb_cmd_queue>>>(
      (real*)rex, (real*)x, (real*)xlow,
      (real*)xupp, (real*)pij, (real*)p0j, (real*)qij, (real*)q0j,
      (real*)lambda, (real*)xsi, (real*)eta, *n, *m);
  CUDA_CHECK(cudaGetLastError());
}

void cuda_rey(void* rey, void* c, void* d, void* y, void* lambda, void* mu,
              int* n) {
  const dim3 nthrds(1024, 1, 1);
  const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
  rey_calculation_kernel<real>
    <<<nblcks, nthrds, 0, (cudaStream_t)glb_cmd_queue>>>(
      (real*)rey, (real*)c,
      (real*)d, (real*)y, (real*)lambda, (real*)mu, *n);
  CUDA_CHECK(cudaGetLastError());
}

void cuda_sub2cons(void* a, void* b, void* c, real* d, int* n) {
  const dim3 nthrds(1024, 1, 1);
  const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
  sub2cons_kernel<real><<<nblcks, nthrds, 0, (cudaStream_t)glb_cmd_queue>>>(
    (real*)a, (real*)b, (real*)c, *d, *n);
  CUDA_CHECK(cudaGetLastError());
}

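/* Norm of the device array a: block-wise partial results from norm_kernel
 * are reduced to a single value on the device and returned on the host. */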
real cuda_norm(void* a, int* n) {
  const dim3 nthrds(1024, 1, 1);
  const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
  const int nb = ((*n) + 1024 - 1) / 1024;
  const cudaStream_t stream = (cudaStream_t)glb_cmd_queue;

  if (nb > mma_red_s) {
    mma_red_s = nb;
    if (mma_bufred != NULL) {
      CUDA_CHECK(cudaFreeHost(mma_bufred));
      CUDA_CHECK(cudaFree(mma_bufred_d));
    }
    CUDA_CHECK(cudaMallocHost(&mma_bufred, nb * sizeof(real)));
    CUDA_CHECK(cudaMalloc(&mma_bufred_d, nb * sizeof(real)));
  }

  norm_kernel<real><<<nblcks, nthrds, 0, stream>>>(
    (real*)a, mma_bufred_d, (*n));
  CUDA_CHECK(cudaGetLastError());
  mmareduce_kernel<real><<<1, 1024, 0, stream>>>(mma_bufred_d, nb);
  CUDA_CHECK(cudaGetLastError());
  CUDA_CHECK(cudaMemcpyAsync(mma_bufred, mma_bufred_d, sizeof(real),
                             cudaMemcpyDeviceToHost, stream));
  cudaStreamSynchronize(stream);
  return mma_bufred[0];
}

void cuda_dely(void* dely, void* c, void* d, void* y, void* lambda,
               real* epsi, int* n) {
  const dim3 nthrds(1024, 1, 1);
  const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
  dely_kernel<real><<<nblcks, nthrds, 0, (cudaStream_t)glb_cmd_queue>>>(
    (real*)dely, (real*)c, (real*)d, (real*)y, (real*)lambda, *epsi, *n);
  CUDA_CHECK(cudaGetLastError());
}

real cuda_maxval2(void* a, void* b, real* cons, int* n) {
  const dim3 nthrds(1024, 1, 1);
  const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
  const int nb = ((*n) + 1024 - 1) / 1024;
  const cudaStream_t stream = (cudaStream_t)glb_cmd_queue;

  if (nb > mma_red_s) {
    mma_red_s = nb;
    if (mma_bufred != NULL) {
      CUDA_CHECK(cudaFreeHost(mma_bufred));
      CUDA_CHECK(cudaFree(mma_bufred_d));
    }
    CUDA_CHECK(cudaMallocHost(&mma_bufred, nb * sizeof(real)));
    CUDA_CHECK(cudaMalloc(&mma_bufred_d, nb * sizeof(real)));
  }

  maxval2_kernel<real><<<nblcks, nthrds, 0, stream>>>(
    (real*)a, (real*)b, mma_bufred_d, *cons, *n);
  CUDA_CHECK(cudaGetLastError());
  max_reduce_kernel<real><<<1, 1024, 0, stream>>>(mma_bufred_d, nb);
  CUDA_CHECK(cudaGetLastError());
  CUDA_CHECK(cudaMemcpyAsync(mma_bufred, mma_bufred_d, sizeof(real),
                             cudaMemcpyDeviceToHost, stream));
  cudaStreamSynchronize(stream);
  return mma_bufred[0];
}

real cuda_maxval3(void* a, void* b, void* c, real* cons, int* n) {
  const dim3 nthrds(1024, 1, 1);
  const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
  const int nb = ((*n) + 1024 - 1) / 1024;
  const cudaStream_t stream = (cudaStream_t)glb_cmd_queue;

  if (nb > mma_red_s) {
    mma_red_s = nb;
    if (mma_bufred != NULL) {
      CUDA_CHECK(cudaFreeHost(mma_bufred));
      CUDA_CHECK(cudaFree(mma_bufred_d));
    }
    CUDA_CHECK(cudaMallocHost(&mma_bufred, nb * sizeof(real)));
    CUDA_CHECK(cudaMalloc(&mma_bufred_d, nb * sizeof(real)));
  }

  maxval3_kernel<real><<<nblcks, nthrds, 0, stream>>>(
    (real*)a, (real*)b, (real*)c, mma_bufred_d, *cons, *n);
  CUDA_CHECK(cudaGetLastError());
  max_reduce_kernel<real><<<1, 1024, 0, stream>>>(mma_bufred_d, nb);
  CUDA_CHECK(cudaGetLastError());
  CUDA_CHECK(cudaMemcpyAsync(mma_bufred, mma_bufred_d, sizeof(real),
                             cudaMemcpyDeviceToHost, stream));
  cudaStreamSynchronize(stream);
  return mma_bufred[0];
}

void cuda_kkt_rex(void* rex, void* df0dx, void* dfdx, void* xsi, void* eta,
                  void* lambda, int* n, int* m) {
  const dim3 nthrds(1024, 1, 1);
  const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
  kkt_rex_kernel<real><<<nblcks, nthrds, 0, (cudaStream_t)glb_cmd_queue>>>(
    (real*)rex, (real*)df0dx, (real*)dfdx, (real*)xsi,
    (real*)eta, (real*)lambda, *n, *m);
  CUDA_CHECK(cudaGetLastError());
}

void cuda_maxcons(void* a, real* b, real* c, void* d, int* n) {
  const dim3 nthrds(1024, 1, 1);
  const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
  maxcons_kernel<real><<<nblcks, nthrds, 0, (cudaStream_t)glb_cmd_queue>>>(
    (real*)a, *b, *c, (real*)d, *n);
  CUDA_CHECK(cudaGetLastError());
}

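/* Local (per-rank) scalar product of the device arrays a and b of length n;
 * the global MPI sum is performed separately (see cuda_mpisum below). */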
real cuda_lcsc2(void* a, void* b, int* n) {
  const dim3 nthrds(1024, 1, 1);
  const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
  const int nb = ((*n) + 1024 - 1) / 1024;
  const cudaStream_t stream = (cudaStream_t)glb_cmd_queue;

  if (nb > mma_red_s) {
    mma_red_s = nb;
    if (mma_bufred != NULL) {
      CUDA_CHECK(cudaFreeHost(mma_bufred));
      CUDA_CHECK(cudaFree(mma_bufred_d));
    }
    CUDA_CHECK(cudaMallocHost(&mma_bufred, nb * sizeof(real)));
    CUDA_CHECK(cudaMalloc(&mma_bufred_d, nb * sizeof(real)));
  }

  glsc2_kernel<real><<<nblcks, nthrds, 0, stream>>>(
    (real*)a, (real*)b, mma_bufred_d, (*n));
  CUDA_CHECK(cudaGetLastError());
  mmareduce_kernel<real><<<1, 1024, 0, stream>>>(mma_bufred_d, nb);
  CUDA_CHECK(cudaGetLastError());
  CUDA_CHECK(cudaMemcpyAsync(mma_bufred, mma_bufred_d, sizeof(real),
                             cudaMemcpyDeviceToHost, stream));
  cudaStreamSynchronize(stream);
  return mma_bufred[0];
}

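/* In-place sum of the device array a across MPI ranks; only active when the
 * device-aware MPI backend (HAVE_DEVICE_MPI) is enabled. */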
void cuda_mpisum(void* a, int* n) {
#ifdef HAVE_DEVICE_MPI
  const cudaStream_t stream = (cudaStream_t)glb_cmd_queue;
  real* temp = (real*)a;
  cudaStreamSynchronize(stream);
  device_mpi_allreduce_inplace(temp, *n, sizeof(real), DEVICE_MPI_SUM);
#endif
}

void cuda_add2inv2(void* a, void* b, real* c, int* n) {
  const dim3 nthrds(1024, 1, 1);
  const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
  add2inv2_kernel<real><<<nblcks, nthrds, 0, (cudaStream_t)glb_cmd_queue>>>(
    (real*)a, (real*)b, *c, *n);
  CUDA_CHECK(cudaGetLastError());
}

void cuda_max2(void* a, real* b, void* c, real* d, int* n) {
  const dim3 nthrds(1024, 1, 1);
  const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
  max2_kernel<real><<<nblcks, nthrds, 0, (cudaStream_t)glb_cmd_queue>>>(
    (real*)a, *b, (real*)c, *d, *n);
  CUDA_CHECK(cudaGetLastError());
}

void cuda_updatebb(void* bb, void* dellambda, void* dely, void* d, void* mu,
                   void* y, real* delz, int* m) {
  const dim3 nthrds(1024, 1, 1);
  const dim3 nblcks(((*m + 1) + 1024 - 1) / 1024, 1, 1);
  updatebb_kernel<real><<<nblcks, nthrds, 0, (cudaStream_t)glb_cmd_queue>>>(
    (real*)bb, (real*)dellambda, (real*)dely, (real*)d,
    (real*)mu, (real*)y, *delz, *m);
  CUDA_CHECK(cudaGetLastError());
}

void cuda_updateAA(void* AA, void* globaltmp_mm, void* s, void* lambda,
                   void* d, void* mu, void* y, void* a, real* zeta, real* z,
                   int* m) {
  const dim3 nthrds(1024, 1, 1);
  const dim3 nblcks(((*m + 1) + 1024 - 1) / 1024, 1, 1);
  updateAA_kernel<real><<<nblcks, nthrds, 0, (cudaStream_t)glb_cmd_queue>>>(
    (real*)AA, (real*)globaltmp_mm, (real*)s, (real*)lambda, (real*)d,
    (real*)mu, (real*)y, (real*)a, *zeta, *z, *m);
  CUDA_CHECK(cudaGetLastError());
}

void cuda_dy(void* dy, void* dely, void* dlambda, void* d, void* mu, void* y,
             int* n) {
  const dim3 nthrds(1024, 1, 1);
  const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
  dy_kernel<real><<<nblcks, nthrds, 0, (cudaStream_t)glb_cmd_queue>>>(
    (real*)dy, (real*)dely, (real*)dlambda, (real*)d,
    (real*)mu, (real*)y, *n);
  CUDA_CHECK(cudaGetLastError());
}