42#include <cuda_runtime.h>
43#include <cusolverDn.h>
46#include <neko/device/device_config.h>
47#include <neko/device/cuda/check.h>
48#include <neko/math/bcknd/device/device_mpi_reduce.h>
49#include <neko/math/bcknd/device/device_mpi_op.h>
61 void mma_update_hessian_z_cuda(
void*
Hess,
void* a,
int* m) {
75 void mma_prepare_aa_matrix_cuda(
void*
AA,
void* s,
void* lambda,
76 void* d,
void* mu,
void* y,
77 void* a,
real* zeta,
real* z,
int* m) {
79 const dim3 nblcks(((*m) + 1024 - 1) / 1024, 1, 1);
90 void mma_prepare_hessian_cuda(
void*
Hess,
void* y,
91 void* mu,
void* lambda,
int* m) {
118 for (
int i = 0; i <
M; i++) {
128 for (
int i = 0; i <
M; i++) {
142 void cuda_custom_solver(
void*
A,
void*
b,
int n,
int*
info) {
163 void cuSOLVER_wrapper(
void*
A,
void*
b,
int n,
int*
jj) {
207 void custom_solve_linear_system(
void*
A,
void*
b,
int n,
int*
info) {
231 const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
237 void cuda_Hess(
void*
Hess,
void*
hijx,
void*
Ljjxinv,
int *n,
int *m) {
239 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
240 const int nb = ((*n) + 1024 - 1)/ 1024;
245 if(mma_bufred !=
NULL){
252 for (
int i = 0; i < (*m); i++){
253 for (
int j=0;
j<(*m);
j++){
268 void* low,
void* upp,
void* alpha,
void* beta,
int* n) {
270 const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
278 void* low,
void* upp,
void* alpha,
void* beta,
int* n) {
280 const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
287 void mattrans_v_mul_cuda(
void* output,
void* pij,
void* lambda,
290 const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
296 void mma_gensub4_cuda(
const void* x,
const void* low,
const void* upp,
297 const void* pij,
const void* qij,
298 const int* n,
const int* m,
void* bi) {
305 const int nb = (
N + 1023) / 1024;
308 if (
nb > mma_red_s) {
311 if (mma_bufred !=
NULL) {
325 static_cast<const real*
>(x),
326 static_cast<const real*
>(low),
327 static_cast<const real*
>(upp),
328 static_cast<const real*
>(pij),
329 static_cast<const real*
>(qij),
332 for (
int i = 0; i <
M; ++i) {
334 temp, mma_bufred_d,
N,
M, i);
341 bi_d + i, mma_bufred_d,
sizeof(
real),
350 void mma_gensub3_cuda(
void* x,
void*
df0dx,
void*
dfdx,
void* low,
351 void* upp,
void* xmin,
void* xmax,
void* alpha,
void* beta,
352 void* p0j,
void* q0j,
void* pij,
void* qij,
int* n,
int* m) {
354 const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
366 void mma_gensub2_cuda(
void* low,
void* upp,
void* x,
void* xold1,
367 void* xold2,
void*
xdiff,
real* asydecr,
real* asyincr,
int* n) {
369 const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
379 void mma_gensub1_cuda(
void* low,
void* upp,
void* x,
void* xmin,
void* xmax,
380 real* asyinit,
int* n) {
382 const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
389 void cuda_mma_max(
void* xsi,
void* x,
void* alpha,
int* n) {
391 const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
399 void* pij,
void* qij,
int* n,
int* m) {
401 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
402 const int nb = ((*n) + 1024 - 1)/ 1024;
405 if (
nb > mma_red_s){
407 if (mma_bufred !=
NULL) {
418 for (
int i = 0; i < (*m); i++) {
420 (temp, mma_bufred_d, (*n),(*m), i);
432 void cuda_sub2cons2(
void* a,
void*
b,
void* c,
void* d,
real*
e,
int* n) {
435 const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
443 real cuda_maxval(
void* a,
int* n) {
446 const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
447 const int nb = ((*n) + 1024 - 1) / 1024;
450 if (
nb > mma_red_s) {
452 if (mma_bufred !=
NULL) {
461 (
real*)a, mma_bufred_d, (*n));
472 return mma_bufred[0];
475 void cuda_delx(
void*
delx,
void* x,
void*
xlow,
void*
xupp,
void* pij,
476 void* qij,
void* p0j,
void* q0j,
void* alpha,
void* beta,
void* lambda,
480 const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
489 void cuda_GG(
void*
GG,
void* x,
void*
xlow,
void*
xupp,
490 void* pij,
void* qij,
int* n,
int* m) {
492 const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
499 void cuda_diagx(
void*
diagx,
void* x,
void* xsi,
void*
xlow,
void*
xupp,
500 void* p0j,
void* q0j,
void* pij,
void* qij,
void* alpha,
void* beta,
501 void* eta,
void* lambda,
int *n,
int *m) {
503 const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
511 void cuda_bb(
void*
bb,
void*
GG,
void*
delx,
void*
diagx,
int *n,
int *m) {
513 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
514 const int nb = ((*n) + 1024 - 1)/ 1024;
519 if(mma_bufred !=
NULL){
526 for (
int i = 0; i < (*m); i++) {
538 void cuda_AA(
void*
AA,
void*
GG,
void*
diagx,
int *n,
int *m) {
540 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
541 const int nb = ((*n) + 1024 - 1)/ 1024;
546 if(mma_bufred !=
NULL){
553 for (
int i = 0; i < (*m); i++){
554 for (
int j=0;
j<(*m);
j++){
571 const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
578 void cuda_dxsi(
void*
dxsi,
void* xsi,
void*
dx,
void* x,
581 const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
588 void cuda_deta(
void*
deta,
void* eta,
void*
dx,
void* x,
591 const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
598 void cuda_rex(
void*
rex,
void* x,
void*
xlow,
void*
xupp,
void* pij,
599 void* p0j,
void* qij,
void* q0j,
void* lambda,
void* xsi,
void* eta,
602 const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
610 void cuda_rey(
void*
rey,
void* c,
void* d,
void* y,
void* lambda,
void* mu,
613 const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
621 void cuda_sub2cons(
void * a,
void *
b,
void * c,
real *d,
int * n) {
623 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
629 real cuda_norm(
void* a,
int* n) {
631 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
632 const int nb = ((*n) + 1024 - 1)/ 1024;
636 if(mma_bufred !=
NULL){
644 ((
real*)a, mma_bufred_d, (*n));
651 return mma_bufred[0];
654 void cuda_dely(
void*
dely,
void* c,
void* d,
void* y,
void* lambda,
657 const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
665 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
666 const int nb = ((*n) + 1024 - 1)/ 1024;
670 if(mma_bufred !=
NULL) {
685 return mma_bufred[0];
688 real cuda_maxval3(
void* a,
void*
b,
void* c,
real*
cons,
int* n) {
690 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
691 const int nb = ((*n) + 1024 - 1)/ 1024;
695 if(mma_bufred !=
NULL) {
709 return mma_bufred[0];
712 void cuda_kkt_rex(
void*
rex,
void*
df0dx,
void*
dfdx,
void* xsi,
713 void* eta,
void* lambda,
int* n,
int* m) {
715 const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
722 void cuda_maxcons(
void* a,
real*
b,
real* c,
void* d,
int* n) {
724 const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
730 real cuda_lcsc2(
void *a,
void*
b,
int *n) {
732 const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
733 const int nb = ((*n) + 1024 - 1)/ 1024;
735 if (
nb > mma_red_s){
737 if (mma_bufred !=
NULL) {
745 ((
real*)a, (
real*)
b, mma_bufred_d, (*n));
752 return mma_bufred[0];
755 void cuda_mpisum(
void *a,
int *n) {
756#ifdef HAVE_DEVICE_MPI
763 void cuda_add2inv2(
void* a,
void *
b,
real* c,
int* n) {
765 const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
771 void cuda_max2(
void* a,
real*
b,
void* c,
real* d,
int* n) {
773 const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
780 void* mu,
void* y,
real*
delz,
int *m) {
782 const dim3 nblcks(((*m+1) + 1024 - 1) / 1024, 1, 1);
789 void cuda_updateAA(
void*
AA,
void*
globaltmp_mm,
void* s,
void* lambda,
790 void* d,
void*mu,
void* y,
void* a,
real* zeta,
real* z,
int* m) {
792 const dim3 nblcks(((*m+1) + 1024 - 1) / 1024, 1, 1);
799 void cuda_dy(
void*
dy,
void*
dely,
void*
dlambda,
void* d,
void* mu,
802 const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
__global__ void heaviside_mapping_apply_kernel(const T beta, const T eta, T *__restrict__ X_out_d, T *__restrict__ X_in_d, const int n)