#include "device/cuda/check.h"
#include "mma_kernel.h"
#include "math/bcknd/device/device_mpi_reduce.h"
#include "math/bcknd/device/device_mpi_op.h"
#include "device/device_config.h"

/* Host and device buffers reused by the reduction wrappers below. */
real * mma_bufred = NULL;
real * mma_bufred_d = NULL;
/* Number of blocks the reduction buffers were last sized for. */
int mma_red_s = 0;

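/* Assemble the m-by-m matrix Hess from hijx and Ljjxinv: each entry (i, j)
 * is formed by a block-wise partial sum over the n design variables
 * (mmasumHess_kernel), a single-block reduction (mmareduce_kernel) and a
 * one-element copy into Hess. */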
void cuda_Hess(void* Hess, void* hijx, void* Ljjxinv, int* n, int* m) {
  const dim3 nthrds(1024, 1, 1);
  const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
  const int nb = ((*n) + 1024 - 1) / 1024;
  const cudaStream_t stream = (cudaStream_t) glb_cmd_queue;
  cudaStreamSynchronize(stream);

  if (mma_bufred != NULL) {
    CUDA_CHECK(cudaFreeHost(mma_bufred));
    CUDA_CHECK(cudaFree(mma_bufred_d));
  }
  CUDA_CHECK(cudaMallocHost(&mma_bufred, nb * sizeof(real)));
  CUDA_CHECK(cudaMalloc(&mma_bufred_d, nb * sizeof(real)));
  mma_red_s = nb;

  for (int i = 0; i < (*m); i++) {
    for (int j = 0; j < (*m); j++) {
      /* Per-block partial sums for entry (i, j). */
      mmasumHess_kernel<real><<<nblcks, nthrds, 0, stream>>>
        ((real*)hijx, (real*)Ljjxinv, mma_bufred_d, (*n), (*m), i, j);
      CUDA_CHECK(cudaGetLastError());
      /* Reduce the partial sums to a single value. */
      mmareduce_kernel<real><<<1, 1024, 0, stream>>>(mma_bufred_d, nb);
      CUDA_CHECK(cudaGetLastError());
      /* Copy the reduced value into Hess; the flat (i, j) offset below is a
         reconstruction, the original index expression was truncated. */
      mma_copy_kernel<<<1, 1, 0, stream>>>((real*)Hess, mma_bufred_d, 1,
                                           j * (*m) + i);
      CUDA_CHECK(cudaGetLastError());
      cudaStreamSynchronize(stream);
    }
  }
}

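/* Element-wise evaluation of Ljjxinv from pjlambda, qjlambda, the current
 * iterate x, the asymptotes (low, upp) and the bounds (alpha, beta). */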
void mma_Ljjxinv_cuda(void* Ljjxinv, void* pjlambda, void* qjlambda, void* x,
                      void* low, void* upp, void* alpha, void* beta, int* n) {
  const dim3 nthrds(1024, 1, 1);
  const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
  mma_Ljjxinv_kernel<real><<<nblcks, nthrds, 0, (cudaStream_t)glb_cmd_queue>>>
    ((real*)Ljjxinv, (real*)pjlambda, (real*)qjlambda, (real*)x, (real*)low,
     (real*)upp, (real*)alpha, (real*)beta, *n);
  CUDA_CHECK(cudaGetLastError());
}

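/* Element-wise solve for x in the dual interior-point subproblem, given
 * pjlambda, qjlambda, the asymptotes (low, upp) and the bounds (alpha, beta). */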
void mma_dipsolvesub1_cuda(void* x, void* pjlambda, void* qjlambda, void* low,
                           void* upp, void* alpha, void* beta, int* n) {
  const dim3 nthrds(1024, 1, 1);
  const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
  mma_dipsolvesub1_kernel<real><<<nblcks, nthrds, 0,
                                  (cudaStream_t)glb_cmd_queue>>>
    ((real*)x, (real*)pjlambda, (real*)qjlambda, (real*)low, (real*)upp,
     (real*)alpha, (real*)beta, *n);
  CUDA_CHECK(cudaGetLastError());
}

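/* Multiply the transpose of pij with the vector lambda
 * (mattrans_v_mul_kernel) and store the result in output. */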
void mattrans_v_mul_cuda(void* output, void* pij, void* lambda,
                         int* m, int* n) {
  const dim3 nthrds(1024, 1, 1);
  const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
  mattrans_v_mul_kernel<real><<<nblcks, nthrds, 0,
                                (cudaStream_t)glb_cmd_queue>>>
    ((real*)output, (real*)pij, (real*)lambda, *m, *n);
  CUDA_CHECK(cudaGetLastError());
}

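/* Compute the m entries of bi: mma_sub4_kernel fills an m-by-n scratch
 * array, which is then reduced constraint-by-constraint with
 * mmasum_kernel / mmareduce_kernel and copied into bi on the device. */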
void mma_gensub4_cuda(void* x, void* low, void* upp, void* pij, void* qij,
                      int* n, int* m, void* bi) {
  const dim3 nthrds(1024, 1, 1);
  const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
  const int nb = ((*n) + 1024 - 1) / 1024;
  const cudaStream_t stream = (cudaStream_t)glb_cmd_queue;

  /* (Re)allocate the reduction buffers if they are too small. */
  if (nb > mma_red_s) {
    mma_red_s = nb;
    if (mma_bufred != NULL) {
      CUDA_CHECK(cudaFreeHost(mma_bufred));
      CUDA_CHECK(cudaFree(mma_bufred_d));
    }
    CUDA_CHECK(cudaMallocHost(&mma_bufred, nb * sizeof(real)));
    CUDA_CHECK(cudaMalloc(&mma_bufred_d, nb * sizeof(real)));
  }

  real* bi_d = (real*)bi;
  /* Scratch array for the m-by-n summands produced by mma_sub4_kernel. */
  real* temp;
  CUDA_CHECK(cudaMalloc(&temp, (*m) * (*n) * sizeof(real)));

  /* Fill temp with the per-variable contributions (the trailing arguments of
     this call were truncated in the original listing and are reconstructed). */
  mma_sub4_kernel<real><<<nblcks, nthrds, 0, stream>>>(
    (real*)x, (real*)low, (real*)upp, (real*)pij, (real*)qij,
    temp, *n, *m);
  CUDA_CHECK(cudaGetLastError());

  /* Reduce each of the m rows of temp into one entry of bi. */
  for (int i = 0; i < (*m); i++) {
    mmasum_kernel<real><<<nblcks, nthrds, 0, stream>>>(
      temp, mma_bufred_d, (*n), (*m), i);
    CUDA_CHECK(cudaGetLastError());
    mmareduce_kernel<real><<<1, 1024, 0, stream>>>(mma_bufred_d, nb);
    CUDA_CHECK(cudaGetLastError());
    CUDA_CHECK(cudaMemcpyAsync(bi_d + i, mma_bufred_d, sizeof(real),
                               cudaMemcpyDeviceToDevice, stream));
  }
  cudaStreamSynchronize(stream);
  CUDA_CHECK(cudaFree(temp));
}

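/* Build the MMA approximation coefficients p0j, q0j, pij and qij from the
 * objective and constraint gradients (df0dx, dfdx), the asymptotes and the
 * variable bounds; one thread per design variable. */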
void mma_gensub3_cuda(void* x, void* df0dx, void* dfdx, void* low, void* upp,
                      void* xmin, void* xmax, void* alpha, void* beta,
                      void* p0j, void* q0j, void* pij, void* qij,
                      int* n, int* m) {
  const dim3 nthrds(1024, 1, 1);
  const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
  mma_sub3_kernel<real><<<nblcks, nthrds, 0, (cudaStream_t)glb_cmd_queue>>>(
    (real*)x, (real*)df0dx, (real*)dfdx, (real*)low,
    (real*)upp, (real*)xmin, (real*)xmax, (real*)alpha,
    (real*)beta, (real*)p0j, (real*)q0j, (real*)pij,
    (real*)qij, *n, *m);
  CUDA_CHECK(cudaGetLastError());
}

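/* Update the moving asymptotes low and upp from the last three iterates
 * (x, xold1, xold2) using the asydecr / asyincr factors. */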
void mma_gensub2_cuda(void* low, void* upp, void* x, void* xold1, void* xold2,
                      void* xdiff, real* asydecr, real* asyincr, int* n) {
  const dim3 nthrds(1024, 1, 1);
  const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
  mma_sub2_kernel<real><<<nblcks, nthrds, 0, (cudaStream_t)glb_cmd_queue>>>(
    (real*)low, (real*)upp, (real*)x, (real*)xold1,
    (real*)xold2, (real*)xdiff, *asydecr, *asyincr, *n);
  CUDA_CHECK(cudaGetLastError());
}

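/* Initialise the asymptotes low and upp around x using asyinit and the
 * variable bounds xmin, xmax. */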
void mma_gensub1_cuda(void* low, void* upp, void* x, void* xmin, void* xmax,
                      real* asyinit, int* n) {
  const dim3 nthrds(1024, 1, 1);
  const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
  mma_sub1_kernel<real><<<nblcks, nthrds, 0, (cudaStream_t)glb_cmd_queue>>>
    ((real*)low, (real*)upp, (real*)x, (real*)xmin, (real*)xmax,
     *asyinit, *n);
  CUDA_CHECK(cudaGetLastError());
}

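/* Element-wise update of xsi from x and alpha (mma_max2_kernel). */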
void cuda_mma_max(void* xsi, void* x, void* alpha, int* n) {
  const dim3 nthrds(1024, 1, 1);
  const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
  mma_max2_kernel<real><<<nblcks, nthrds, 0, (cudaStream_t)glb_cmd_queue>>>
    ((real*)xsi, (real*)x, (real*)alpha, *n);
  CUDA_CHECK(cudaGetLastError());
}

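/* Recompute relambda: relambda_kernel fills an n-by-m scratch array that is
 * reduced constraint-by-constraint into relambda on the device. */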
void cuda_relambda(void* relambda, void* x, void* xupp, void* xlow,
                   void* pij, void* qij, int* n, int* m) {
  const dim3 nthrds(1024, 1, 1);
  const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
  const int nb = ((*n) + 1024 - 1) / 1024;
  const cudaStream_t stream = (cudaStream_t) glb_cmd_queue;

  /* (Re)allocate the reduction buffers if they are too small. */
  if (nb > mma_red_s) {
    mma_red_s = nb;
    if (mma_bufred != NULL) {
      CUDA_CHECK(cudaFreeHost(mma_bufred));
      CUDA_CHECK(cudaFree(mma_bufred_d));
    }
    CUDA_CHECK(cudaMallocHost(&mma_bufred, nb * sizeof(real)));
    CUDA_CHECK(cudaMalloc(&mma_bufred_d, nb * sizeof(real)));
  }

  /* Scratch array for the n-by-m summands produced by relambda_kernel. */
  real* temp;
  CUDA_CHECK(cudaMalloc(&temp, (*n) * (*m) * sizeof(real)));
  relambda_kernel<real><<<nblcks, nthrds, 0, stream>>>(temp, (real*)x,
    (real*)xupp, (real*)xlow, (real*)pij, (real*)qij, *n, *m);

  for (int i = 0; i < (*m); i++) {
    mmasum_kernel<real><<<nblcks, nthrds, 0, stream>>>
      (temp, mma_bufred_d, (*n), (*m), i);
    CUDA_CHECK(cudaGetLastError());
    mmareduce_kernel<real><<<1, 1024, 0, stream>>>(mma_bufred_d, nb);
    CUDA_CHECK(cudaGetLastError());
    mma_copy_kernel<<<1, 1, 0, stream>>>((real*)relambda, mma_bufred_d, 1, i);
    CUDA_CHECK(cudaGetLastError());
    cudaStreamSynchronize(stream);
  }
  CUDA_CHECK(cudaFree(temp));
}

void cuda_sub2cons2(void* a, void* b, void* c, void* d, real* e, int* n) {
  const dim3 nthrds(1024, 1, 1);
  const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
  sub2cons2_kernel<real><<<nblcks, nthrds, 0, (cudaStream_t)glb_cmd_queue>>>(
    (real*)a, (real*)b, (real*)c, (real*)d, *e, *n);
  CUDA_CHECK(cudaGetLastError());
}

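/* Return the maximum entry of the n-element device array a
 * (per-block maxima followed by a single-block reduction). */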
real cuda_maxval(void* a, int* n) {
  const dim3 nthrds(1024, 1, 1);
  const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
  const int nb = ((*n) + 1024 - 1) / 1024;
  const cudaStream_t stream = (cudaStream_t)glb_cmd_queue;

  if (nb > mma_red_s) {
    mma_red_s = nb;
    if (mma_bufred != NULL) {
      CUDA_CHECK(cudaFreeHost(mma_bufred));
      CUDA_CHECK(cudaFree(mma_bufred_d));
    }
    CUDA_CHECK(cudaMallocHost(&mma_bufred, nb * sizeof(real)));
    CUDA_CHECK(cudaMalloc(&mma_bufred_d, nb * sizeof(real)));
  }

  /* Per-block maxima, then a single-block reduction to one value. */
  maxval_kernel<real><<<nblcks, nthrds, 0, stream>>>(
    (real*)a, mma_bufred_d, (*n));
  CUDA_CHECK(cudaGetLastError());
  max_reduce_kernel<real><<<1, 1024, 0, stream>>>(mma_bufred_d, nb);
  CUDA_CHECK(cudaGetLastError());

  CUDA_CHECK(cudaMemcpyAsync(mma_bufred, mma_bufred_d, sizeof(real),
                             cudaMemcpyDeviceToHost, stream));
  cudaStreamSynchronize(stream);
  return mma_bufred[0];
}

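/* Element-wise evaluation of delx from the current primal quantities,
 * the approximation coefficients and the multipliers lambda. */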
void cuda_delx(void* delx, void* x, void* xlow, void* xupp, void* pij,
               void* qij, void* p0j, void* q0j, void* alpha, void* beta,
               void* lambda, real* epsi, int* n, int* m) {
  const dim3 nthrds(1024, 1, 1);
  const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
  delx_kernel<real><<<nblcks, nthrds, 0, (cudaStream_t)glb_cmd_queue>>>(
    (real*)delx, (real*)x, (real*)xlow, (real*)xupp, (real*)pij,
    (real*)qij, (real*)p0j, (real*)q0j, (real*)alpha, (real*)beta,
    (real*)lambda, *epsi, *n, *m);
  CUDA_CHECK(cudaGetLastError());
}

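/* Fill GG from x, the asymptotes (xlow, xupp) and the pij / qij
 * coefficients; one thread per design variable. */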
void cuda_GG(void* GG, void* x, void* xlow, void* xupp,
             void* pij, void* qij, int* n, int* m) {
  const dim3 nthrds(1024, 1, 1);
  const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
  GG_kernel<real><<<nblcks, nthrds, 0, (cudaStream_t)glb_cmd_queue>>>
    ((real*)GG, (real*)x, (real*)xlow, (real*)xupp,
     (real*)pij, (real*)qij, *n, *m);
  CUDA_CHECK(cudaGetLastError());
}

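/* Element-wise evaluation of the diagonal diagx from the primal and dual
 * quantities; one thread per design variable. */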
void cuda_diagx(void* diagx, void* x, void* xsi, void* xlow, void* xupp,
                void* p0j, void* q0j, void* pij, void* qij, void* alpha,
                void* beta, void* eta, void* lambda, int* n, int* m) {
  const dim3 nthrds(1024, 1, 1);
  const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
  diagx_kernel<real><<<nblcks, nthrds, 0, (cudaStream_t)glb_cmd_queue>>>
    ((real*)diagx, (real*)x, (real*)xsi, (real*)xlow,
     (real*)xupp, (real*)p0j, (real*)q0j, (real*)pij, (real*)qij,
     (real*)alpha, (real*)beta, (real*)eta, (real*)lambda, *n, *m);
  CUDA_CHECK(cudaGetLastError());
}

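/* Assemble the m entries of bb: for each constraint a block-wise sum over
 * GG, delx and diagx, a single-block reduction and a one-element copy. */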
void cuda_bb(void* bb, void* GG, void* delx, void* diagx, int* n, int* m) {
  const dim3 nthrds(1024, 1, 1);
  const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
  const int nb = ((*n) + 1024 - 1) / 1024;
  const cudaStream_t stream = (cudaStream_t) glb_cmd_queue;
  cudaStreamSynchronize(stream);

  if (mma_bufred != NULL) {
    CUDA_CHECK(cudaFreeHost(mma_bufred));
    CUDA_CHECK(cudaFree(mma_bufred_d));
  }
  CUDA_CHECK(cudaMallocHost(&mma_bufred, nb * sizeof(real)));
  CUDA_CHECK(cudaMalloc(&mma_bufred_d, nb * sizeof(real)));
  mma_red_s = nb;

  for (int i = 0; i < (*m); i++) {
    mmasumbb_kernel<real><<<nblcks, nthrds, 0, stream>>>
      ((real*)GG, (real*)delx, (real*)diagx, mma_bufred_d, (*n), (*m), i);
    CUDA_CHECK(cudaGetLastError());
    mmareduce_kernel<real><<<1, 1024, 0, stream>>>(mma_bufred_d, nb);
    CUDA_CHECK(cudaGetLastError());
    mma_copy_kernel<<<1, 1, 0, stream>>>((real*)bb, mma_bufred_d, 1, i);
    CUDA_CHECK(cudaGetLastError());
    cudaStreamSynchronize(stream);
  }
}

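/* Assemble the m-by-m matrix AA from GG and diagx, one reduced entry
 * (i, j) at a time, following the same pattern as cuda_Hess. */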
void cuda_AA(void* AA, void* GG, void* diagx, int* n, int* m) {
  const dim3 nthrds(1024, 1, 1);
  const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
  const int nb = ((*n) + 1024 - 1) / 1024;
  const cudaStream_t stream = (cudaStream_t) glb_cmd_queue;
  cudaStreamSynchronize(stream);

  if (mma_bufred != NULL) {
    CUDA_CHECK(cudaFreeHost(mma_bufred));
    CUDA_CHECK(cudaFree(mma_bufred_d));
  }
  CUDA_CHECK(cudaMallocHost(&mma_bufred, nb * sizeof(real)));
  CUDA_CHECK(cudaMalloc(&mma_bufred_d, nb * sizeof(real)));
  mma_red_s = nb;

  for (int i = 0; i < (*m); i++) {
    for (int j = 0; j < (*m); j++) {
      mmasumAA_kernel<real><<<nblcks, nthrds, 0, stream>>>
        ((real*)GG, (real*)diagx, mma_bufred_d, (*n), (*m), i, j);
      CUDA_CHECK(cudaGetLastError());
      mmareduce_kernel<real><<<1, 1024, 0, stream>>>(mma_bufred_d, nb);
      CUDA_CHECK(cudaGetLastError());
      /* Copy the reduced value into AA; the flat (i, j) offset below is a
         reconstruction, the original index expression was truncated. */
      mma_copy_kernel<<<1, 1, 0, stream>>>((real*)AA, mma_bufred_d, 1,
                                           j * (*m) + i);
      CUDA_CHECK(cudaGetLastError());
      cudaStreamSynchronize(stream);
    }
  }
}

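/* Element-wise recovery of dx from delx, diagx, GG and dlambda. The n and m
 * arguments at the end of the signature and of the kernel call were
 * truncated in the original listing and are reconstructed below. */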
void cuda_dx(void* dx, void* delx, void* diagx, void* GG, void* dlambda,
             int* n, int* m) {
  const dim3 nthrds(1024, 1, 1);
  const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
  dx_kernel<real><<<nblcks, nthrds, 0, (cudaStream_t)glb_cmd_queue>>>
    ((real*)dx, (real*)delx, (real*)diagx, (real*)GG, (real*)dlambda,
     *n, *m);
  CUDA_CHECK(cudaGetLastError());
}

void cuda_dxsi(void* dxsi, void* xsi, void* dx, void* x,
               void* alpha, real* epsi, int* n) {
  const dim3 nthrds(1024, 1, 1);
  const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
  dxsi_kernel<real><<<nblcks, nthrds, 0, (cudaStream_t)glb_cmd_queue>>>
    ((real*)dxsi, (real*)xsi, (real*)dx, (real*)x,
     (real*)alpha, *epsi, *n);
  CUDA_CHECK(cudaGetLastError());
}

void cuda_deta(void* deta, void* eta, void* dx, void* x,
               void* beta, real* epsi, int* n) {
  const dim3 nthrds(1024, 1, 1);
  const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
  deta_kernel<real><<<nblcks, nthrds, 0, (cudaStream_t)glb_cmd_queue>>>
    ((real*)deta, (real*)eta, (real*)dx, (real*)x,
     (real*)beta, *epsi, *n);
  CUDA_CHECK(cudaGetLastError());
}

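/* Evaluate the residual rex for the design variables from the approximation
 * coefficients, the multipliers lambda and the bound multipliers xsi, eta. */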
void cuda_rex(void* rex, void* x, void* xlow, void* xupp, void* pij,
              void* p0j, void* qij, void* q0j, void* lambda, void* xsi,
              void* eta, int* n, int* m) {
  const dim3 nthrds(1024, 1, 1);
  const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
  RexCalculation_kernel<real><<<nblcks, nthrds, 0,
                                (cudaStream_t)glb_cmd_queue>>>
    ((real*)rex, (real*)x, (real*)xlow,
     (real*)xupp, (real*)pij, (real*)p0j, (real*)qij, (real*)q0j,
     (real*)lambda, (real*)xsi, (real*)eta, *n, *m);
  CUDA_CHECK(cudaGetLastError());
}

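/* Evaluate the residual rey for the auxiliary y variables from c, d,
 * lambda and mu. */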
void cuda_rey(void* rey, void* c, void* d, void* y, void* lambda, void* mu,
              int* n) {
  const dim3 nthrds(1024, 1, 1);
  const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
  rey_calculation_kernel<real><<<nblcks, nthrds, 0,
                                 (cudaStream_t)glb_cmd_queue>>>
    ((real*)rey, (real*)c, (real*)d, (real*)y, (real*)lambda, (real*)mu, *n);
  CUDA_CHECK(cudaGetLastError());
}

void cuda_sub2cons(void* a, void* b, void* c, real* d, int* n) {
  const dim3 nthrds(1024, 1, 1);
  const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
  sub2cons_kernel<real><<<nblcks, nthrds, 0, (cudaStream_t)glb_cmd_queue>>>
    ((real*)a, (real*)b, (real*)c, *d, *n);
  CUDA_CHECK(cudaGetLastError());
}

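/* Two-stage sum reduction of a via norm_kernel / mmareduce_kernel; the
 * reduced value is copied back to the host and returned. */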
real cuda_norm(void* a, int* n) {
  const dim3 nthrds(1024, 1, 1);
  const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
  const int nb = ((*n) + 1024 - 1) / 1024;
  const cudaStream_t stream = (cudaStream_t) glb_cmd_queue;

  if (mma_bufred != NULL) {
    CUDA_CHECK(cudaFreeHost(mma_bufred));
    CUDA_CHECK(cudaFree(mma_bufred_d));
  }
  CUDA_CHECK(cudaMallocHost(&mma_bufred, nb * sizeof(real)));
  CUDA_CHECK(cudaMalloc(&mma_bufred_d, nb * sizeof(real)));
  mma_red_s = nb;

  norm_kernel<real><<<nblcks, nthrds, 0, stream>>>
    ((real*)a, mma_bufred_d, (*n));
  CUDA_CHECK(cudaGetLastError());
  mmareduce_kernel<real><<<1, 1024, 0, stream>>>(mma_bufred_d, nb);
  CUDA_CHECK(cudaGetLastError());
  CUDA_CHECK(cudaMemcpyAsync(mma_bufred, mma_bufred_d, sizeof(real),
                             cudaMemcpyDeviceToHost, stream));
  cudaStreamSynchronize(stream);
  return mma_bufred[0];
}

void cuda_dely(void* dely, void* c, void* d, void* y, void* lambda,
               real* epsi, int* n) {
  const dim3 nthrds(1024, 1, 1);
  const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
  dely_kernel<real><<<nblcks, nthrds, 0, (cudaStream_t)glb_cmd_queue>>>
    ((real*)dely, (real*)c, (real*)d, (real*)y, (real*)lambda, *epsi, *n);
  CUDA_CHECK(cudaGetLastError());
}

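/* Maximum of an element-wise expression of a, b and the scalar cons
 * (maxval2_kernel followed by a single-block max reduction). */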
real cuda_maxval2(void* a, void* b, real* cons, int* n) {
  const dim3 nthrds(1024, 1, 1);
  const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
  const int nb = ((*n) + 1024 - 1) / 1024;
  const cudaStream_t stream = (cudaStream_t) glb_cmd_queue;

  if (mma_bufred != NULL) {
    CUDA_CHECK(cudaFreeHost(mma_bufred));
    CUDA_CHECK(cudaFree(mma_bufred_d));
  }
  CUDA_CHECK(cudaMallocHost(&mma_bufred, nb * sizeof(real)));
  CUDA_CHECK(cudaMalloc(&mma_bufred_d, nb * sizeof(real)));
  mma_red_s = nb;

  maxval2_kernel<real><<<nblcks, nthrds, 0, stream>>>
    ((real*)a, (real*)b, mma_bufred_d, *cons, *n);
  CUDA_CHECK(cudaGetLastError());
  max_reduce_kernel<real><<<1, 1024, 0, stream>>>(mma_bufred_d, nb);
  CUDA_CHECK(cudaGetLastError());
  CUDA_CHECK(cudaMemcpyAsync(mma_bufred, mma_bufred_d, sizeof(real),
                             cudaMemcpyDeviceToHost, stream));
  cudaStreamSynchronize(stream);
  return mma_bufred[0];
}

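/* Maximum of an element-wise expression of a, b, c and the scalar cons
 * (maxval3_kernel followed by a single-block max reduction). */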
real cuda_maxval3(void* a, void* b, void* c, real* cons, int* n) {
  const dim3 nthrds(1024, 1, 1);
  const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
  const int nb = ((*n) + 1024 - 1) / 1024;
  const cudaStream_t stream = (cudaStream_t) glb_cmd_queue;

  if (mma_bufred != NULL) {
    CUDA_CHECK(cudaFreeHost(mma_bufred));
    CUDA_CHECK(cudaFree(mma_bufred_d));
  }
  CUDA_CHECK(cudaMallocHost(&mma_bufred, nb * sizeof(real)));
  CUDA_CHECK(cudaMalloc(&mma_bufred_d, nb * sizeof(real)));
  mma_red_s = nb;

  maxval3_kernel<real><<<nblcks, nthrds, 0, stream>>>
    ((real*)a, (real*)b, (real*)c, mma_bufred_d, *cons, *n);
  CUDA_CHECK(cudaGetLastError());
  max_reduce_kernel<real><<<1, 1024, 0, stream>>>(mma_bufred_d, nb);
  CUDA_CHECK(cudaGetLastError());
  CUDA_CHECK(cudaMemcpyAsync(mma_bufred, mma_bufred_d, sizeof(real),
                             cudaMemcpyDeviceToHost, stream));
  cudaStreamSynchronize(stream);
  return mma_bufred[0];
}

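/* Evaluate the KKT residual rex from df0dx, dfdx, xsi, eta and lambda. */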
void cuda_kkt_rex(void* rex, void* df0dx, void* dfdx, void* xsi,
                  void* eta, void* lambda, int* n, int* m) {
  const dim3 nthrds(1024, 1, 1);
  const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
  kkt_rex_kernel<real><<<nblcks, nthrds, 0, (cudaStream_t)glb_cmd_queue>>>
    ((real*)rex, (real*)df0dx, (real*)dfdx, (real*)xsi,
     (real*)eta, (real*)lambda, *n, *m);
  CUDA_CHECK(cudaGetLastError());
}

void cuda_maxcons(void* a, real* b, real* c, void* d, int* n) {
  const dim3 nthrds(1024, 1, 1);
  const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
  maxcons_kernel<real><<<nblcks, nthrds, 0, (cudaStream_t)glb_cmd_queue>>>
    ((real*)a, *b, *c, (real*)d, *n);
  CUDA_CHECK(cudaGetLastError());
}

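/* Local (single-rank) dot product of a and b: glsc2_kernel followed by a
 * single-block reduction; the result is returned on the host. */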
real cuda_lcsc2(void* a, void* b, int* n) {
  const dim3 nthrds(1024, 1, 1);
  const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
  const int nb = ((*n) + 1024 - 1) / 1024;
  const cudaStream_t stream = (cudaStream_t) glb_cmd_queue;

  if (nb > mma_red_s) {
    mma_red_s = nb;
    if (mma_bufred != NULL) {
      CUDA_CHECK(cudaFreeHost(mma_bufred));
      CUDA_CHECK(cudaFree(mma_bufred_d));
    }
    CUDA_CHECK(cudaMallocHost(&mma_bufred, nb * sizeof(real)));
    CUDA_CHECK(cudaMalloc(&mma_bufred_d, nb * sizeof(real)));
  }

  glsc2_kernel<real><<<nblcks, nthrds, 0, stream>>>
    ((real*)a, (real*)b, mma_bufred_d, (*n));
  CUDA_CHECK(cudaGetLastError());
  mmareduce_kernel<real><<<1, 1024, 0, stream>>>(mma_bufred_d, nb);
  CUDA_CHECK(cudaGetLastError());
  CUDA_CHECK(cudaMemcpyAsync(mma_bufred, mma_bufred_d, sizeof(real),
                             cudaMemcpyDeviceToHost, stream));
  cudaStreamSynchronize(stream);
  return mma_bufred[0];
}

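/* In-place MPI sum reduction of the first n entries of the device array a;
 * only active when device-aware MPI is enabled. */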
void cuda_mpisum(void* a, int* n) {
#ifdef HAVE_DEVICE_MPI
  /* The local declarations below were lost in the original listing; a device
     pointer and the global command queue are assumed here. */
  real* temp = (real*) a;
  const cudaStream_t stream = (cudaStream_t) glb_cmd_queue;
  cudaStreamSynchronize(stream);
  device_mpi_allreduce_inplace(temp, *n, sizeof(real), DEVICE_MPI_SUM);
#endif
}

void cuda_add2inv2(void* a, void* b, real* c, int* n) {
  const dim3 nthrds(1024, 1, 1);
  const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
  add2inv2_kernel<real><<<nblcks, nthrds, 0, (cudaStream_t)glb_cmd_queue>>>
    ((real*)a, (real*)b, *c, *n);
  CUDA_CHECK(cudaGetLastError());
}

void cuda_max2(void* a, real* b, void* c, real* d, int* n) {
  const dim3 nthrds(1024, 1, 1);
  const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
  max2_kernel<real><<<nblcks, nthrds, 0, (cudaStream_t)glb_cmd_queue>>>
    ((real*)a, *b, (real*)c, *d, *n);
  CUDA_CHECK(cudaGetLastError());
}

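/* Update the m + 1 entries of the right-hand side bb from dellambda, dely,
 * d, mu, y and the scalar delz. */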
void cuda_updatebb(void* bb, void* dellambda, void* dely, void* d,
                   void* mu, void* y, real* delz, int* m) {
  const dim3 nthrds(1024, 1, 1);
  const dim3 nblcks(((*m + 1) + 1024 - 1) / 1024, 1, 1);
  updatebb_kernel<real><<<nblcks, nthrds, 0, (cudaStream_t)glb_cmd_queue>>>
    ((real*)bb, (real*)dellambda, (real*)dely, (real*)d,
     (real*)mu, (real*)y, *delz, *m);
  CUDA_CHECK(cudaGetLastError());
}

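/* Update the (m + 1)-sized system matrix contributions in AA from
 * globaltmp_mm, s, lambda, d, mu, y, a and the scalars zeta, z. */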
void cuda_updateAA(void* AA, void* globaltmp_mm, void* s, void* lambda,
                   void* d, void* mu, void* y, void* a, real* zeta, real* z,
                   int* m) {
  const dim3 nthrds(1024, 1, 1);
  const dim3 nblcks(((*m + 1) + 1024 - 1) / 1024, 1, 1);
  updateAA_kernel<real><<<nblcks, nthrds, 0, (cudaStream_t)glb_cmd_queue>>>
    ((real*)AA, (real*)globaltmp_mm, (real*)s, (real*)lambda, (real*)d,
     (real*)mu, (real*)y, (real*)a, *zeta, *z, *m);
  CUDA_CHECK(cudaGetLastError());
}

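/* Element-wise recovery of dy from dely, dlambda, d, mu and y. The trailing
 * y and n arguments of the signature were truncated in the original listing
 * and are reconstructed below. */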
void cuda_dy(void* dy, void* dely, void* dlambda, void* d, void* mu,
             void* y, int* n) {
  const dim3 nthrds(1024, 1, 1);
  const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
  dy_kernel<real><<<nblcks, nthrds, 0, (cudaStream_t)glb_cmd_queue>>>
    ((real*)dy, (real*)dely, (real*)dlambda, (real*)d,
     (real*)mu, (real*)y, *n);
  CUDA_CHECK(cudaGetLastError());
}