42#include <cuda_runtime.h>
43#include <cusolverDn.h>
46#include <neko/device/device_config.h>
47#include <neko/device/cuda/check.h>
48#include <neko/math/bcknd/device/device_mpi_reduce.h>
49#include <neko/math/bcknd/device/device_mpi_op.h>
// Host wrapper: presumably assembles the small (m+1)x(m+1) "AA" system matrix
// of the MMA dual subproblem from the device arrays s, lambda, d, mu, y, a and
// the scalars zeta, z — TODO confirm against the launched kernel (body not
// visible in this excerpt). All void* arguments appear to be device pointers.
61 void mma_prepare_aa_matrix_cuda(
void*
AA,
void* s,
void* lambda,
62 void* d,
void* mu,
void* y,
63 void* a,
real* zeta,
real* z,
int* m) {
// Launch grid: ceil(*m / 1024) blocks, i.e. one thread per constraint.
65 const dim3 nblcks(((*m) + 1024 - 1) / 1024, 1, 1);
// Host wrapper: presumably builds the Hessian used by the MMA interior-point
// step from y, d, mu, lambda (device pointers) — TODO confirm; kernel launch
// and loop bodies are not visible in this excerpt.
76 void mma_prepare_hessian_cuda(
void*
Hess,
void* y,
void* d,
77 void* mu,
void* lambda,
int* m) {
// Host-side loop over constraints; M presumably equals *m — confirm.
104 for (
int i = 0; i <
M; i++) {
// Second pass over the same range; purpose elided in this excerpt.
114 for (
int i = 0; i <
M; i++) {
// Presumably an in-house dense linear solve of A x = b on the device (n x n
// system, solution overwriting b) — TODO confirm; body not visible here.
// info: output status flag, by analogy with LAPACK-style solvers — confirm.
128 void cuda_custom_solver(
void*
A,
void*
b,
int n,
int*
info) {
// Presumably solves A x = b via cuSOLVER dense routines (cusolverDn, see the
// <cusolverDn.h> include at the top of the file) — TODO confirm; body not
// visible in this excerpt. jj appears to be an output status/info flag.
149 void cuSOLVER_wrapper(
void*
A,
void*
b,
int n,
int*
jj) {
// Presumably the dispatch entry point for solving the n x n system A x = b,
// choosing between the custom solver and the cuSOLVER path — TODO confirm;
// body not visible in this excerpt.
193 void custom_solve_linear_system(
void*
A,
void*
b,
int n,
int*
info) {
// Launch grid: ceil(*n / 1024) blocks of (presumably) 1024 threads.
217 const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
// Host wrapper: presumably computes the m x m Hessian contribution
// Hess[i][j] from hijx and Ljjxinv via a per-element kernel plus a host-side
// reduction over nb partial blocks — TODO confirm; most of the body is not
// visible in this excerpt.
223 void cuda_Hess(
void*
Hess,
void*
hijx,
void*
Ljjxinv,
int *n,
int *m) {
// Launch grid and matching partial-result count: ceil(*n / 1024).
225 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
226 const int nb = ((*n) + 1024 - 1)/ 1024;
// NOTE(review): mma_bufred looks like a lazily managed host reduction
// buffer that is freed/reallocated when too small — confirm, logic elided.
231 if(mma_bufred !=
NULL){
// Doubly-nested host loop over the m x m entries of the Hessian.
238 for (
int i = 0; i < (*m); i++){
239 for (
int j=0;
j<(*m);
j++){
254 void* low,
void* upp,
void* alpha,
void* beta,
int* n) {
256 const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
264 void* low,
void* upp,
void* alpha,
void* beta,
int* n) {
266 const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
// Presumably computes output = P^T * lambda (matrix-transpose times vector)
// on the device — TODO confirm; remaining parameters and body are not
// visible in this excerpt.
273 void mattrans_v_mul_cuda(
void* output,
void* pij,
void* lambda,
// Launch grid: ceil(*n / 1024) blocks.
276 const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
// MMA subproblem step 4: presumably computes the constraint terms bi[i] by
// reducing pij/(upp-x) + qij/(x-low) over all n design variables for each of
// the m constraints — TODO confirm; kernel names and several body lines are
// elided in this excerpt. Inputs are device pointers; bi is the output.
282 void mma_gensub4_cuda(
const void* x,
const void* low,
const void* upp,
283 const void* pij,
const void* qij,
284 const int* n,
const int* m,
void* bi) {
// Number of partial reduction blocks: ceil(N / 1024).
291 const int nb = (
N + 1023) / 1024;
// Grow the reduction scratch buffers when the current capacity
// (mma_red_s) is exceeded — reallocation details elided.
294 if (
nb > mma_red_s) {
297 if (mma_bufred !=
NULL) {
// Kernel arguments: cast opaque device pointers to typed real arrays.
311 static_cast<const real*
>(x),
312 static_cast<const real*
>(low),
313 static_cast<const real*
>(upp),
314 static_cast<const real*
>(pij),
315 static_cast<const real*
>(qij),
// Per-constraint final reduction; M presumably equals *m — confirm.
318 for (
int i = 0; i <
M; ++i) {
320 temp, mma_bufred_d,
N,
M, i);
// Copy one scalar result per constraint back into bi_d + i.
327 bi_d + i, mma_bufred_d,
sizeof(
real),
// MMA subproblem step 3: presumably builds the approximation coefficients
// p0j, q0j, pij, qij (and the alpha/beta bounds) from the objective/constraint
// gradients df0dx, dfdx and the asymptotes low/upp — TODO confirm; kernel
// launch body not visible in this excerpt.
336 void mma_gensub3_cuda(
void* x,
void*
df0dx,
void*
dfdx,
void* low,
337 void* upp,
void* xmin,
void* xmax,
void* alpha,
void* beta,
338 void* p0j,
void* q0j,
void* pij,
void* qij,
int* n,
int* m) {
// Launch grid: ceil(*n / 1024) blocks — one thread per design variable.
340 const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
// MMA subproblem step 2: presumably updates the moving asymptotes low/upp
// from the current and two previous iterates (x, xold1, xold2) using the
// asydecr/asyincr factors — TODO confirm; kernel launch body not visible.
352 void mma_gensub2_cuda(
void* low,
void* upp,
void* x,
void* xold1,
353 void* xold2,
void*
xdiff,
real* asydecr,
real* asyincr,
int* n) {
// Launch grid: ceil(*n / 1024) blocks.
355 const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
// MMA subproblem step 1: presumably initializes the asymptotes low/upp from
// x and the box bounds xmin/xmax scaled by asyinit — TODO confirm; kernel
// launch body not visible in this excerpt.
365 void mma_gensub1_cuda(
void* low,
void* upp,
void* x,
void* xmin,
void* xmax,
366 real* asyinit,
int* n) {
// Launch grid: ceil(*n / 1024) blocks.
368 const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
// Presumably an element-wise max-style update of the dual variable xsi from
// x and alpha — TODO confirm; kernel launch body not visible here.
375 void cuda_mma_max(
void* xsi,
void* x,
void* alpha,
int* n) {
// Launch grid: ceil(*n / 1024) blocks.
377 const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
385 void* pij,
void* qij,
int* n,
int* m) {
387 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
388 const int nb = ((*n) + 1024 - 1)/ 1024;
391 if (
nb > mma_red_s){
393 if (mma_bufred !=
NULL) {
404 for (
int i = 0; i < (*m); i++) {
406 (temp, mma_bufred_d, (*n),(*m), i);
// Element-wise kernel wrapper over n entries; exact formula combining
// a, b, c, d and scalar e is elided in this excerpt — TODO confirm.
418 void cuda_sub2cons2(
void* a,
void*
b,
void* c,
void* d,
real*
e,
int* n) {
// Launch grid: ceil(*n / 1024) blocks.
421 const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
// Reduction: presumably returns the maximum value of the length-n device
// array a. Partial per-block results land in mma_bufred_d and the final
// scalar is read back from mma_bufred[0] on the host.
429 real cuda_maxval(
void* a,
int* n) {
// Launch grid and partial-result count: ceil(*n / 1024).
432 const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
433 const int nb = ((*n) + 1024 - 1) / 1024;
// Grow the reduction scratch buffers when capacity (mma_red_s) is
// exceeded — reallocation details elided in this excerpt.
436 if (
nb > mma_red_s) {
438 if (mma_bufred !=
NULL) {
// Kernel call site (name elided): reduce a into mma_bufred_d.
447 (
real*)a, mma_bufred_d, (*n));
// NOTE(review): assumes the device result has been copied/synced into
// host-visible mma_bufred before this read — confirm in the elided body.
458 return mma_bufred[0];
// Presumably computes the Newton residual/step component delx of the MMA
// interior-point iteration from x, the asymptotes xlow/xupp, the
// approximation coefficients pij/qij/p0j/q0j, the bounds alpha/beta and the
// multipliers lambda — TODO confirm; trailing parameters and the kernel
// launch body are not visible in this excerpt.
461 void cuda_delx(
void*
delx,
void* x,
void*
xlow,
void*
xupp,
void* pij,
462 void* qij,
void* p0j,
void* q0j,
void* alpha,
void* beta,
void* lambda,
// Launch grid: ceil(*n / 1024) blocks.
466 const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
// Presumably fills the m x n constraint-gradient matrix GG of the MMA
// subproblem from x, the asymptotes xlow/xupp and the coefficients pij/qij
// — TODO confirm; kernel launch body not visible in this excerpt.
475 void cuda_GG(
void*
GG,
void* x,
void*
xlow,
void*
xupp,
476 void* pij,
void* qij,
int* n,
int* m) {
// Launch grid: ceil(*n / 1024) blocks.
478 const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
// Presumably computes the diagonal diagx of the primal Hessian block of the
// MMA Newton system from x, xsi, the asymptotes, the p/q coefficients, the
// alpha/beta bounds and the multipliers eta/lambda — TODO confirm; kernel
// launch body not visible in this excerpt.
485 void cuda_diagx(
void*
diagx,
void* x,
void* xsi,
void*
xlow,
void*
xupp,
486 void* p0j,
void* q0j,
void* pij,
void* qij,
void* alpha,
void* beta,
487 void* eta,
void* lambda,
int *n,
int *m) {
// Launch grid: ceil(*n / 1024) blocks.
489 const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
// Presumably computes the length-m right-hand-side vector bb of the reduced
// Newton system via per-block partial sums of GG * delx / diagx followed by
// a per-constraint reduction — TODO confirm; most body lines are elided.
497 void cuda_bb(
void*
bb,
void*
GG,
void*
delx,
void*
diagx,
int *n,
int *m) {
// Launch grid and partial-result count: ceil(*n / 1024).
499 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
500 const int nb = ((*n) + 1024 - 1)/ 1024;
// Lazily (re)size the host reduction buffer — details elided.
505 if(mma_bufred !=
NULL){
// Final reduction, one pass per constraint row.
512 for (
int i = 0; i < (*m); i++) {
// Presumably computes the m x m reduced system matrix AA[i][j] ~
// sum_k GG[i][k] * GG[j][k] / diagx[k], using per-block partial sums then an
// m x m host-side loop over pairwise reductions — TODO confirm; most body
// lines are elided in this excerpt.
524 void cuda_AA(
void*
AA,
void*
GG,
void*
diagx,
int *n,
int *m) {
// Launch grid and partial-result count: ceil(*n / 1024).
526 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
527 const int nb = ((*n) + 1024 - 1)/ 1024;
// Lazily (re)size the host reduction buffer — details elided.
532 if(mma_bufred !=
NULL){
// Pairwise reduction over all (i, j) constraint combinations.
539 for (
int i = 0; i < (*m); i++){
540 for (
int j=0;
j<(*m);
j++){
557 const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
// Presumably computes the Newton update dxsi of the lower-bound multiplier
// xsi from dx and x — TODO confirm; trailing parameters and the kernel
// launch body are not visible in this excerpt.
564 void cuda_dxsi(
void*
dxsi,
void* xsi,
void*
dx,
void* x,
// Launch grid: ceil(*n / 1024) blocks.
567 const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
// Presumably computes the Newton update deta of the upper-bound multiplier
// eta from dx and x (mirror of cuda_dxsi) — TODO confirm; trailing
// parameters and the kernel launch body are not visible in this excerpt.
574 void cuda_deta(
void*
deta,
void* eta,
void*
dx,
void* x,
// Launch grid: ceil(*n / 1024) blocks.
577 const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
// Presumably evaluates the primal residual rex (stationarity w.r.t. x) of
// the MMA KKT system from x, the asymptotes, the p/q coefficients and the
// multipliers lambda, xsi, eta — TODO confirm; trailing parameters and the
// kernel launch body are not visible in this excerpt.
584 void cuda_rex(
void*
rex,
void* x,
void*
xlow,
void*
xupp,
void* pij,
585 void* p0j,
void* qij,
void* q0j,
void* lambda,
void* xsi,
void* eta,
// Launch grid: ceil(*n / 1024) blocks.
588 const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
// Presumably evaluates the residual rey (stationarity w.r.t. the slack
// variables y) from c, d, y, lambda and mu — TODO confirm; trailing
// parameters and the kernel launch body are not visible in this excerpt.
596 void cuda_rey(
void*
rey,
void* c,
void* d,
void* y,
void* lambda,
void* mu,
// Launch grid: ceil(*n / 1024) blocks.
599 const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
// Element-wise kernel wrapper over n entries combining a, b, c and scalar d;
// the exact formula is elided in this excerpt — TODO confirm.
607 void cuda_sub2cons(
void * a,
void *
b,
void * c,
real *d,
int * n) {
// Launch grid: ceil(*n / 1024) blocks.
609 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
// Reduction: presumably returns a norm (sum-of-squares style) of the
// length-n device array a; the final scalar is read from host-visible
// mma_bufred[0] after the device reduction — TODO confirm exact norm.
615 real cuda_norm(
void* a,
int* n) {
// Launch grid and partial-result count: ceil(*n / 1024).
617 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
618 const int nb = ((*n) + 1024 - 1)/ 1024;
// Lazily (re)size the host reduction buffer — details elided.
622 if(mma_bufred !=
NULL){
// Kernel call site (name elided): reduce a into mma_bufred_d.
630 ((
real*)a, mma_bufred_d, (*n));
// NOTE(review): assumes the result was synced to the host buffer first.
637 return mma_bufred[0];
// Presumably computes the Newton residual component dely from c, d, y and
// lambda — TODO confirm; trailing parameters and the kernel launch body are
// not visible in this excerpt.
640 void cuda_dely(
void*
dely,
void* c,
void* d,
void* y,
void* lambda,
// Launch grid: ceil(*n / 1024) blocks.
643 const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
651 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
652 const int nb = ((*n) + 1024 - 1)/ 1024;
656 if(mma_bufred !=
NULL) {
671 return mma_bufred[0];
// Reduction: presumably returns the maximum of an expression combining the
// three length-n device arrays a, b, c and the scalar cons (used for
// interior-point step-length limits) — TODO confirm; reduction kernel call
// is elided. Result is read back from host-visible mma_bufred[0].
674 real cuda_maxval3(
void* a,
void*
b,
void* c,
real*
cons,
int* n) {
// Launch grid and partial-result count: ceil(*n / 1024).
676 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
677 const int nb = ((*n) + 1024 - 1)/ 1024;
// Lazily (re)size the host reduction buffer — details elided.
681 if(mma_bufred !=
NULL) {
// NOTE(review): assumes the result was synced to the host buffer first.
695 return mma_bufred[0];
// Presumably evaluates the KKT stationarity residual rex of the original
// (non-approximated) problem from df0dx, dfdx and the multipliers xsi, eta,
// lambda — TODO confirm; kernel launch body not visible in this excerpt.
698 void cuda_kkt_rex(
void*
rex,
void*
df0dx,
void*
dfdx,
void* xsi,
699 void* eta,
void* lambda,
int* n,
int* m) {
// Launch grid: ceil(*n / 1024) blocks.
701 const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
// Element-wise kernel wrapper over n entries; presumably a clamped/max
// update of a using scalars b, c and array d — TODO confirm; body elided.
708 void cuda_maxcons(
void* a,
real*
b,
real* c,
void* d,
int* n) {
// Launch grid: ceil(*n / 1024) blocks.
710 const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
// Reduction: presumably a dot-product-style scalar over the two length-n
// device arrays a and b — TODO confirm; the reduction kernel name is elided.
// Result is read back from host-visible mma_bufred[0].
716 real cuda_lcsc2(
void *a,
void*
b,
int *n) {
// Launch grid and partial-result count: ceil(*n / 1024).
718 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
719 const int nb = ((*n) + 1024 - 1)/ 1024;
// Grow the reduction scratch buffers when capacity (mma_red_s) is
// exceeded — reallocation details elided in this excerpt.
721 if (
nb > mma_red_s){
723 if (mma_bufred !=
NULL) {
// Kernel call site (name elided): reduce a (dot) b into mma_bufred_d.
731 ((
real*)a, (
real*)
b, mma_bufred_d, (*n));
// NOTE(review): assumes the result was synced to the host buffer first.
738 return mma_bufred[0];
// Presumably performs an in-place MPI allreduce (sum) on the length-n device
// buffer a; compiled only with device-aware MPI support — TODO confirm, the
// guarded body is not visible in this excerpt.
741 void cuda_mpisum(
void *a,
int *n) {
742#ifdef HAVE_DEVICE_MPI
// Element-wise kernel wrapper over n entries; presumably an add-of-inverse
// style update of a from b and scalar c — TODO confirm; body elided.
749 void cuda_add2inv2(
void* a,
void *
b,
real* c,
int* n) {
// Launch grid: ceil(*n / 1024) blocks.
751 const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
// Element-wise kernel wrapper over n entries; presumably a two-term max
// update of a from scalar b, array c and scalar d — TODO confirm; body
// elided in this excerpt.
757 void cuda_max2(
void* a,
real*
b,
void* c,
real* d,
int* n) {
// Launch grid: ceil(*n / 1024) blocks.
759 const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
766 void* mu,
void* y,
real*
delz,
int *m) {
768 const dim3 nblcks(((*m+1) + 1024 - 1) / 1024, 1, 1);
// Presumably updates the (m+1)x(m+1) dual system matrix AA in place using
// the temporary globaltmp_mm and the current iterates s, lambda, d, mu, y,
// a, zeta, z — TODO confirm; kernel launch body not visible in this excerpt.
775 void cuda_updateAA(
void*
AA,
void*
globaltmp_mm,
void* s,
void* lambda,
776 void* d,
void*mu,
void* y,
void* a,
real* zeta,
real* z,
int* m) {
// Launch grid: ceil((*m + 1) / 1024) blocks — one thread per row of the
// (m+1)-sized system (constraints plus the z variable).
778 const dim3 nblcks(((*m+1) + 1024 - 1) / 1024, 1, 1);
// Presumably recovers the slack-variable Newton step dy from dely, dlambda,
// d and mu — TODO confirm; trailing parameters and the kernel launch body
// are not visible in this excerpt.
785 void cuda_dy(
void*
dy,
void*
dely,
void*
dlambda,
void* d,
void* mu,
// Launch grid: ceil(*n / 1024) blocks.
788 const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
__global__ void convex_down_RAMP_mapping_apply_kernel(const T f_min, const T f_max, const T q, T *__restrict__ X_out_d, T *__restrict__ X_in_d, const int n)