Neko-TOP
A portable framework for high-order spectral element flow topology optimization.
mma_kernel.h
#ifndef MMA_CUDA_KERNEL_H
#define MMA_CUDA_KERNEL_H

template <typename T>
__global__ void mma_prepare_aa_matrix_kernel(T* __restrict__ AA,
    const T* __restrict__ s, const T* __restrict__ lambda,
    const T* __restrict__ d, const T* __restrict__ mu,
    const T* __restrict__ y, const T* __restrict__ a,
    const T zeta, const T z, const int m) {
  const int tj = blockIdx.x * blockDim.x + threadIdx.x;
  const int matrix_size = m + 1;

  if (tj >= m) return;
  AA[tj * matrix_size + tj] += s[tj] / lambda[tj] +
      (T)1.0 / (d[tj] + mu[tj] / y[tj]);
  AA[tj * matrix_size + m] = a[tj]; // column m+1
  AA[m * matrix_size + tj] = a[tj]; // row m+1

  // Only the first thread updates the bottom-right corner element.
  if (tj == 0)
    AA[m * matrix_size + m] = -zeta / z;
}

// Update Hessian diagonal elements (y contributions for the dip subsolve)
template <typename T>
__global__ void mma_update_hessian_diagonal_kernel(T* __restrict__ Hess,
    const T* __restrict__ y, const T* __restrict__ d,
    const T* __restrict__ mu, const T* __restrict__ lambda, const int m) {
  int tj = blockIdx.x * blockDim.x + threadIdx.x;
  if (tj >= m) return;

  T diag = Hess[tj * m + tj];
  // Contribution from y terms (inactive constraints)
  if (y[tj] > (T)0.0) {
    if (fabs(d[tj]) >= (T)1.0e-15) {
      diag -= (T)1.0 / d[tj];
    }
    // else: skip - equivalent to subtracting 1.0/1.0e-8
  }

  // Contribution from -mu/lambda (eq 10)
  diag -= mu[tj] / lambda[tj];
  Hess[tj * m + tj] = diag;
}

// Heuristic Levenberg-Marquardt stabilization
// Single-block version for m <= 1024
template <typename T>
__global__ void mma_stabilize_hessian_single_kernel(T* __restrict__ Hess,
    const int m) {
  const int tid = threadIdx.x;

  // A single thread computes the trace and the LM factor
  if (tid == 0) {
    T trace = (T)0.0;
    for (int j = 0; j < m; j++) {
      trace += Hess[j * m + j];
    }
    T lm_factor = max((T)(-1.0e-4) * trace / m, (T)1.0e-7);

    // Apply to all diagonal elements
    for (int j = 0; j < m; j++) {
      Hess[j * m + j] -= lm_factor;
    }
  }
}

// Heuristic Levenberg-Marquardt stabilization
// Multi-block version for m > 1024
template <typename T>
__global__ void mma_stabilize_hessian_multi_kernel(T* __restrict__ Hess,
    const T lm_factor, const int m) {
  const int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < m) {
    Hess[i * m + i] -= lm_factor;
  }
}
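// ---------------------------------------------------------------------------
// Illustrative usage sketch (not part of the original header; assumes nvcc
// compilation). It shows one way a host could dispatch the two stabilization
// kernels above: the single-block variant computes the trace on the device in
// thread 0, while the multi-block variant expects lm_factor to be computed
// beforehand, here from a hypothetical host copy of the Hessian diagonal
// (diag_h). The helper name and the 256-thread block size are assumptions.
template <typename T>
inline void stabilize_hessian_example(T* Hess_d, const T* diag_h, const int m,
                                      cudaStream_t stream) {
  if (m <= 1024) {
    // One block is enough; only thread 0 does work in this variant.
    mma_stabilize_hessian_single_kernel<T><<<1, 1, 0, stream>>>(Hess_d, m);
  } else {
    // Reproduce the same heuristic factor on the host from the diagonal.
    T trace = T(0);
    for (int j = 0; j < m; j++)
      trace += diag_h[j];
    T lm_factor = T(-1.0e-4) * trace / m;
    if (lm_factor < T(1.0e-7)) lm_factor = T(1.0e-7);
    const int threads = 256;
    const int blocks = (m + threads - 1) / threads;
    mma_stabilize_hessian_multi_kernel<T>
        <<<blocks, threads, 0, stream>>>(Hess_d, lm_factor, m);
  }
}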
// Small linear solver for MMA (n <= 50)
template <typename T>
__global__ void mma_small_lu_kernel(T* __restrict__ A, T* __restrict__ b,
    const int n) {
  const int tid = threadIdx.x;

  // Handle the 1x1 case
  if (n == 1) {
    if (tid == 0 && abs(A[0]) > (T)1e-12)
      b[0] /= A[0];
    return;
  }

  // LU decomposition with partial pivoting
  for (int k = 0; k < n; k++) {
    // Pivoting - single thread
    if (tid == 0) {
      int max_row = k;
      T max_val = abs(A[k * n + k]);
      for (int i = k + 1; i < n; i++) {
        T val = abs(A[i * n + k]);
        if (val > max_val) {
          max_val = val;
          max_row = i;
        }
      }
      if (max_val > (T)1e-12 && max_row != k) {
        // Swap rows
        for (int j = k; j < n; j++) {
          T temp = A[k * n + j];
          A[k * n + j] = A[max_row * n + j];
          A[max_row * n + j] = temp;
        }
        // Swap rhs
        T temp_b = b[k];
        b[k] = b[max_row];
        b[max_row] = temp_b;
      }
    }
    __syncthreads();

    // Parallel elimination
    T diag = A[k * n + k];
    if (abs(diag) > (T)1e-12) {
      for (int i = tid + k + 1; i < n; i += blockDim.x) {
        T factor = A[i * n + k] / diag;
        A[i * n + k] = factor;
        for (int j = k + 1; j < n; j++) {
          A[i * n + j] -= factor * A[k * n + j];
        }
      }
    }
    __syncthreads();
  }

  // Parallel forward substitution
  for (int i = tid; i < n; i += blockDim.x) {
    T sum = b[i];
    for (int j = 0; j < i; j++) {
      sum -= A[i * n + j] * b[j];
    }
    b[i] = sum;
  }
  __syncthreads();

  // Parallel backward substitution
  for (int i = n - 1 - tid; i >= 0; i -= blockDim.x) {
    if (i >= 0) {
      T sum = b[i];
      for (int j = i + 1; j < n; j++) {
        sum -= A[i * n + j] * b[j];
      }
      if (abs(A[i * n + i]) > (T)1e-12) {
        b[i] = sum / A[i * n + i];
      }
    }
  }
  __syncthreads();
}
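// ---------------------------------------------------------------------------
// Illustrative launch sketch (not part of the original header; assumes nvcc
// compilation). Because the solver above relies on __syncthreads() between
// the pivoting, elimination, and substitution sweeps, it has to run in a
// single block. A_d and b_d are device pointers to the n*n matrix and the
// right-hand side; the 64-thread block size is an assumption sized for the
// stated n <= 50 limit.
template <typename T>
inline void solve_small_system_example(T* A_d, T* b_d, const int n,
                                       cudaStream_t stream) {
  // One block only: __syncthreads() cannot synchronize across blocks.
  mma_small_lu_kernel<T><<<1, 64, 0, stream>>>(A_d, b_d, n);
}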
template <typename T>
__global__ void delta_1dbeam_kernel(T* __restrict__ Delta,
    const T L_total, const T Le, const int offset, const int n) {
  int k = blockIdx.x * blockDim.x + threadIdx.x;
  if (k >= n) return;

  // Convert to 1-based indexing for the calculation
  int idx = k + 1;

  // First term: (L_total - Le*(offset+idx-1))^3
  T x1 = L_total - Le * static_cast<T>(offset + idx - 1);
  T term1 = x1 * x1 * x1;

  // Second term: (L_total - Le*(offset+idx))^3
  T x2 = L_total - Le * static_cast<T>(offset + idx);
  T term2 = x2 * x2 * x2;

  // Final result
  Delta[k] = (term1 - term2) / static_cast<T>(3.0);
}

template <typename T>
__global__ void mma_Ljjxinv_kernel(T* __restrict__ Ljjxinv,
    const T* __restrict__ pjlambda, const T* __restrict__ qjlambda,
    const T* __restrict__ x, const T* __restrict__ low,
    const T* __restrict__ upp,
    const T* __restrict__ alpha, const T* __restrict__ beta,
    const int n) {
  int tj = blockIdx.x * blockDim.x + threadIdx.x;
  if (tj >= n) return;

  // Load inputs into registers
  const T xt = x[tj];
  const T low_j = low[tj];
  const T upp_j = upp[tj];
  const T pj = pjlambda[tj];
  const T qj = qjlambda[tj];

  // Precompute reused differences
  const T diff_u = upp_j - xt;
  const T diff_l = xt - low_j;

  // Cube once (avoiding pow for speed)
  const T diff_u3 = diff_u * diff_u * diff_u;
  const T diff_l3 = diff_l * diff_l * diff_l;

  // Compute the inverse value safely
  T denom = 2.0 * pj / diff_u3 + 2.0 * qj / diff_l3;
  T val = -1.0 / denom;

  // Mask out active primal constraints
  bool active = (fabs(xt - alpha[tj]) <= T(1e-16)) ||
      (fabs(xt - beta[tj]) <= T(1e-16));

  Ljjxinv[tj] = active ? T(0.0) : val;
}

template <typename T>
__global__ void mma_dipsolvesub1_kernel(T* __restrict__ x,
    const T* __restrict__ pjlambda, const T* __restrict__ qjlambda,
    const T* __restrict__ low, const T* __restrict__ upp,
    const T* __restrict__ alpha, const T* __restrict__ beta,
    const int n) {

  int tj = blockIdx.x * blockDim.x + threadIdx.x;
  if (tj >= n) return;

  // Load inputs into registers
  const T pj_sqrt = sqrt(pjlambda[tj]);
  const T qj_sqrt = sqrt(qjlambda[tj]);
  const T denom = pj_sqrt + qj_sqrt;

  const T low_j = low[tj];
  const T upp_j = upp[tj];
  const T alpha_j = alpha[tj];
  const T beta_j = beta[tj];

  // Weighted average
  const T val = (pj_sqrt * low_j + qj_sqrt * upp_j) / denom;

  // Clamp x between alpha and beta using branchless min/max
  x[tj] = fmin(fmax(val, alpha_j), beta_j);
}

template <typename T>
__global__ void mattrans_v_mul_kernel(T* __restrict__ output,
    const T* __restrict__ pij, const T* __restrict__ lambda,
    const int m, const int n) {

  int tj = blockIdx.x * blockDim.x + threadIdx.x;
  if (tj >= n) return;

  T acc = 0.0;

  // Use register accumulation
  for (int i = 0; i < m; ++i) {
    acc += pij[i + tj * m] * lambda[i];
  }

  output[tj] = acc;
}

template <typename T>
__global__ void mma_sub1_kernel(
    T* __restrict__ xlow,
    T* __restrict__ xupp,
    const T* __restrict__ x,
    const T* __restrict__ xmin,
    const T* __restrict__ xmax,
    const T asyinit,
    const int n) {
  int tj = blockIdx.x * blockDim.x + threadIdx.x;
  if (tj >= n) return;

  // Load values once into registers (global memory is slow)
  const T xt = x[tj];
  const T xmin_j = xmin[tj];
  const T xmax_j = xmax[tj];

  // Reuse the xgap calculation
  const T xgap = xmax_j - xmin_j;
  const T offset = asyinit * xgap;

  xlow[tj] = xt - offset;
  xupp[tj] = xt + offset;
}
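// ---------------------------------------------------------------------------
// Illustrative launch sketch (not part of the original header; assumes nvcc
// compilation). The elementwise kernels in this file, with mma_sub1_kernel
// above as a typical example, map one thread to one design variable, so a 1D
// grid covering n is sufficient. The helper name and the 256-thread block
// size are assumptions, not values taken from the Neko-TOP sources.
template <typename T>
inline void launch_mma_sub1_example(T* xlow_d, T* xupp_d, const T* x_d,
                                    const T* xmin_d, const T* xmax_d,
                                    const T asyinit, const int n,
                                    cudaStream_t stream) {
  const int threads = 256;
  const int blocks = (n + threads - 1) / threads; // ceil(n / threads)
  mma_sub1_kernel<T><<<blocks, threads, 0, stream>>>(
      xlow_d, xupp_d, x_d, xmin_d, xmax_d, asyinit, n);
}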
template <typename T>
__global__ void mma_sub2_kernel(T* __restrict__ low, T* __restrict__ upp,
    const T* __restrict__ x, const T* __restrict__ xold1,
    const T* __restrict__ xold2, const T* __restrict__ xdiff,
    const T asydecr, const T asyincr, const int n) {
  int tj = blockIdx.x * blockDim.x + threadIdx.x;
  if (tj >= n) return;

  // Load data into registers (faster than repeated global-memory accesses)
  const T xval = x[tj];
  const T xold1val = xold1[tj];
  const T xold2val = xold2[tj];
  const T lowval = low[tj];
  const T uppval = upp[tj];
  const T xdiffval = xdiff[tj];

  // Compute the product
  const T prod = (xval - xold1val) * (xold1val - xold2val);

  // Compute asy_factor without branching
  T asy_factor = (prod < T(0)) ? asydecr :
                 (prod > T(0)) ? asyincr : T(1);

  // Update low and upp using fma (fused multiply-add) for numerical stability
  T new_low = fma(-asy_factor, (xold1val - lowval), xval);
  T new_upp = fma(asy_factor, (uppval - xold1val), xval);

  // Apply bounds
  new_low = max(new_low, xval - T(10.0) * xdiffval);
  new_low = min(new_low, xval - T(0.01) * xdiffval);

  new_upp = min(new_upp, xval + T(10.0) * xdiffval);
  new_upp = max(new_upp, xval + T(0.01) * xdiffval);

  // Write results back
  low[tj] = new_low;
  upp[tj] = new_upp;
}

template <typename T>
__global__ void mma_sub3_kernel(const T* __restrict__ x,
    const T* __restrict__ df0dx, const T* __restrict__ dfdx,
    T* __restrict__ low, T* __restrict__ upp, const T* __restrict__ xmin,
    const T* __restrict__ xmax, T* __restrict__ alpha, T* __restrict__ beta,
    T* __restrict__ p0j, T* __restrict__ q0j, T* __restrict__ pij,
    T* __restrict__ qij, const int n, const int m) {
  int tj = blockIdx.x * blockDim.x + threadIdx.x;
  if (tj >= n) return;

  // Load into registers once
  const T xt = x[tj];
  const T xmin_j = xmin[tj];
  const T xmax_j = xmax[tj];
  const T low_j = low[tj];
  const T upp_j = upp[tj];
  const T df0 = df0dx[tj];
  const T xgap = xmax_j - xmin_j;

  // Clamp helpers
  const T half_xgap = 0.5 * xgap;
  const T tenth_low_diff = T(0.1) * (xt - low_j);
  const T tenth_upp_diff = T(0.1) * (upp_j - xt);

  // Compute alpha and beta with fused max/min and fewer calls
  T alpha_val = max(max(xmin_j, low_j + tenth_low_diff), xt - half_xgap);
  T beta_val = min(min(xmax_j, upp_j - tenth_upp_diff), xt + half_xgap);

  alpha[tj] = alpha_val;
  beta[tj] = beta_val;

  // Avoid multiple pow calls, compute once
  const T upp_minus_x = upp_j - xt;
  const T x_minus_low = xt - low_j;
  const T upp_minus_x_sq = upp_minus_x * upp_minus_x;
  const T x_minus_low_sq = x_minus_low * x_minus_low;

  // Small epsilon for numerical stability
  const T eps = T(1e-5);
  const T inv_xgap = T(1.0) / max(eps, xgap);

  // Compute terms reused multiple times
  const T max_df0_pos = max(df0, T(0));
  const T max_df0_neg = max(-df0, T(0));

  p0j[tj] = upp_minus_x_sq * (T(1.001) * max_df0_pos +
      T(0.001) * max_df0_neg + eps * inv_xgap);
  q0j[tj] = x_minus_low_sq * (T(0.001) * max_df0_pos +
      T(1.001) * max_df0_neg + eps * inv_xgap);

  // Loop over m for pij and qij
  for (int i = 0; i < m; ++i) {
    int idx = i + tj * m;

    T dfdx_val = dfdx[idx];
    T max_pos = max(dfdx_val, T(0));
    T max_neg = max(-dfdx_val, T(0));

    pij[idx] = upp_minus_x_sq * (T(1.001) * max_pos +
        T(0.001) * max_neg + eps * inv_xgap);
    qij[idx] = x_minus_low_sq * (T(0.001) * max_pos +
        T(1.001) * max_neg + eps * inv_xgap);
  }
}

template <typename T>
__global__ void mma_sub4_kernel(
    const T* __restrict__ x,
    const T* __restrict__ low,
    const T* __restrict__ upp,
    const T* __restrict__ pij,
    const T* __restrict__ qij,
    T* __restrict__ temp,
    const int n,
    const int m) {

  int tj = blockIdx.x * blockDim.x + threadIdx.x;
  if (tj >= n) return;

  // Register caching
  const T xt = x[tj];
  const T low_j = low[tj];
  const T upp_j = upp[tj];

  const T denom_upp = upp_j - xt;
  const T denom_low = xt - low_j;

  const T eps = T(1e-12); // Precision-dependent epsilon
  const T inv_denom_upp = T(1) / max(denom_upp, eps);
  const T inv_denom_low = T(1) / max(denom_low, eps);

  const int base_idx = tj * m;

  for (int i = 0; i < m; ++i) {
    int idx = base_idx + i;
    temp[idx] = pij[idx] * inv_denom_upp + qij[idx] * inv_denom_low;
  }
}

template <typename T>
__global__ void mma_max2_kernel(
    T* __restrict__ xsi,
    const T* __restrict__ x,
    const T* __restrict__ alpha,
    const int n) {

  int tj = blockIdx.x * blockDim.x + threadIdx.x;
  if (tj >= n) return;

  const T eps = T(1e-12);
  T denom = max(x[tj] - alpha[tj], eps);
  xsi[tj] = max(T(1), T(1) / denom);
}

template <typename T>
__global__ void relambda_kernel(
    T* __restrict__ temp,
    const T* __restrict__ x,
    const T* __restrict__ xupp,
    const T* __restrict__ xlow,
    const T* __restrict__ pij,
    const T* __restrict__ qij,
    const int n,
    const int m) {

  int tj = blockIdx.x * blockDim.x + threadIdx.x;
  if (tj >= n) return;

  const T xt = x[tj];
  const T xup = xupp[tj];
  const T xlo = xlow[tj];

  // Prevent divide-by-zero using a small epsilon
  const T eps = T(1e-12);
  const T inv_denom_upp = T(1) / max(xup - xt, eps);
  const T inv_denom_low = T(1) / max(xt - xlo, eps);

  int base_idx = tj * m;
  for (int i = 0; i < m; ++i) {
    int idx = base_idx + i;
    temp[idx] = pij[idx] * inv_denom_upp + qij[idx] * inv_denom_low;
  }
}

template <typename T>
__global__ void sub2cons2_kernel(
    T* __restrict__ a,
    const T* __restrict__ b,
    const T* __restrict__ c,
    const T* __restrict__ d,
    const T e,
    const int n) {
  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  if (idx >= n) return;

  const T bt = b[idx];
  const T ct = c[idx];
  const T dt = d[idx];

  a[idx] = bt * (ct - dt) - e;
}

template <typename T>
__inline__ __device__ T max_reduce_warp(T val) {
  val = max(val, __shfl_down_sync(0xffffffff, val, 16));
  val = max(val, __shfl_down_sync(0xffffffff, val, 8));
  val = max(val, __shfl_down_sync(0xffffffff, val, 4));
  val = max(val, __shfl_down_sync(0xffffffff, val, 2));
  val = max(val, __shfl_down_sync(0xffffffff, val, 1));
  return val;
}
template <typename T>
__global__ void maxval_kernel(const T* __restrict__ a, T* temp, const int n) {
  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int stride = blockDim.x * gridDim.x;

  const unsigned int lane = threadIdx.x % warpSize;
  const unsigned int warp_id = threadIdx.x / warpSize;

  // Shared memory to store warp-level maxima
  __shared__ T shared[32]; // Assumes max 1024 threads per block

  T local_max = T(0);
  for (int i = idx; i < n; i += stride) {
    local_max = max(local_max, abs(a[i]));
  }

  // Warp-level reduction
  local_max = max_reduce_warp<T>(local_max);

  // Write warp-level results to shared memory
  if (lane == 0) {
    shared[warp_id] = local_max;
  }
  __syncthreads();

  // Let the first warp reduce the shared values
  local_max = (threadIdx.x < blockDim.x / warpSize) ? shared[lane] : T(0);

  if (warp_id == 0) {
    local_max = max_reduce_warp<T>(local_max);
  }

  if (threadIdx.x == 0) {
    temp[blockIdx.x] = local_max;
  }
}

template <typename T>
__global__ void max_reduce_kernel(T* __restrict__ bufred, const int n) {

  T maxval = T(0);
  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int stride = blockDim.x * gridDim.x;

  // Grid-stride loop to cover all elements
  for (int i = idx; i < n; i += stride) {
    maxval = max(maxval, bufred[i]);
  }

  __shared__ T shared[32]; // One slot per warp (max 1024 threads/block)

  unsigned int lane = threadIdx.x % warpSize;
  unsigned int wid = threadIdx.x / warpSize;

  // Warp-level max reduction
  maxval = max_reduce_warp<T>(maxval);

  // Store each warp's max value in shared memory
  if (lane == 0) {
    shared[wid] = maxval;
  }
  __syncthreads();

  // Let the first warp reduce the warp-level maxima
  maxval = (threadIdx.x < blockDim.x / warpSize) ? shared[lane] : T(0);
  if (wid == 0) {
    maxval = max_reduce_warp<T>(maxval);
  }

  // Write the block's maximum value back to bufred[blockIdx.x]
  if (threadIdx.x == 0) {
    bufred[blockIdx.x] = maxval;
  }
}
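// ---------------------------------------------------------------------------
// Illustrative reduction sketch (not part of the original header; assumes
// nvcc compilation). The two kernels above are designed as a pair:
// maxval_kernel writes one partial maximum of |a| per block into a scratch
// buffer, and max_reduce_kernel is then launched with a single block to
// collapse those partials in place, leaving the result in temp_d[0]. The
// helper name, launch sizes, and buffer sizing are assumptions; temp_d must
// hold at least one value per block of the first launch.
template <typename T>
inline void max_abs_example(const T* a_d, T* temp_d, const int n,
                            cudaStream_t stream) {
  const int threads = 1024;
  int blocks = (n + threads - 1) / threads;
  if (blocks > 1024) blocks = 1024; // the grid-stride loop covers the rest
  maxval_kernel<T><<<blocks, threads, 0, stream>>>(a_d, temp_d, n);
  // Second pass over the per-block partials; one block finishes the job.
  max_reduce_kernel<T><<<1, 1024, 0, stream>>>(temp_d, blocks);
}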
template <typename T>
__global__ void delx_kernel(
    T* __restrict__ delx,
    const T* __restrict__ x,
    const T* __restrict__ xlow,
    const T* __restrict__ xupp,
    const T* __restrict__ pij,
    const T* __restrict__ qij,
    const T* __restrict__ p0j,
    const T* __restrict__ q0j,
    const T* __restrict__ alpha,
    const T* __restrict__ beta,
    const T* __restrict__ lambda,
    const T epsi,
    const int n,
    const int m)
{
  int tj = blockIdx.x * blockDim.x + threadIdx.x;
  if (tj < n) {
    T xt = x[tj];
    T xlow_j = xlow[tj];
    T xupp_j = xupp[tj];
    T alpha_j = alpha[tj];
    T beta_j = beta[tj];

    // Precompute denominators squared for better performance
    T denom_low = xt - xlow_j;
    T denom_upp = xupp_j - xt;
    T denom_alpha = xt - alpha_j;
    T denom_beta = beta_j - xt;

    const T small_eps = T(1e-12);
    denom_low = (abs(denom_low) < small_eps) ? small_eps : denom_low;
    denom_upp = (abs(denom_upp) < small_eps) ? small_eps : denom_upp;
    denom_alpha = (abs(denom_alpha) < small_eps) ? small_eps : denom_alpha;
    denom_beta = (abs(denom_beta) < small_eps) ? small_eps : denom_beta;

    T sum = T(0);
    for (int i = 0; i < m; i++) {
      T lambda_i = lambda[i];
      sum += pij[i + tj * m] * lambda_i / (denom_upp * denom_upp)
          - qij[i + tj * m] * lambda_i / (denom_low * denom_low);
    }
    sum += p0j[tj] / (denom_upp * denom_upp)
        - q0j[tj] / (denom_low * denom_low)
        - epsi / denom_alpha
        + epsi / denom_beta;

    delx[tj] = sum;
  }
}

template <typename T>
__global__ void GG_kernel(
    T* __restrict__ GG,
    const T* __restrict__ x,
    const T* __restrict__ xlow,
    const T* __restrict__ xupp,
    const T* __restrict__ pij,
    const T* __restrict__ qij,
    const int n,
    const int m) {

  int tj = blockIdx.x * blockDim.x + threadIdx.x;
  if (tj < n) {
    T xt = x[tj];
    T xlow_j = xlow[tj];
    T xupp_j = xupp[tj];

    // Distances from bounds
    T diff_upper = xupp_j - xt;
    T diff_lower = xt - xlow_j;

    // Squared distances
    T diff_upper2 = diff_upper * diff_upper;
    T diff_lower2 = diff_lower * diff_lower;

    for (int i = 0; i < m; i++) {
      int idx = i + tj * m;
      GG[idx] = pij[idx] / diff_upper2 - qij[idx] / diff_lower2;
    }
  }
}

template <typename T>
__global__ void diagx_kernel(T* __restrict__ diagx, const T* __restrict__ x,
    const T* __restrict__ xsi, const T* __restrict__ xlow,
    const T* __restrict__ xupp, const T* __restrict__ p0j,
    const T* __restrict__ q0j, const T* __restrict__ pij,
    const T* __restrict__ qij, const T* alpha, const T* beta,
    const T* eta, const T* lambda, const int n, const int m) {
  int tj = blockIdx.x * blockDim.x + threadIdx.x;
  if (tj < n) {
    T sum = 0;
    T sum1 = 0;
    for (int i = 0; i < m; i++) {
      sum = sum + pij[tj * m + i] * lambda[i];
      sum1 = sum1 + qij[tj * m + i] * lambda[i];
    }
    diagx[tj] = (p0j[tj] + sum) / pow(xupp[tj] - x[tj], 3) +
        (q0j[tj] + sum1) / pow(x[tj] - xlow[tj], 3);
    diagx[tj] = 2.0 * diagx[tj] + xsi[tj] / (x[tj] - alpha[tj]) +
        eta[tj] / (beta[tj] - x[tj]);
  }
}

template <typename T>
__inline__ __device__ T reduce_warp(T val) {
  volatile T v = val;
  v += __shfl_down_sync(0xffffffff, v, 16);
  v += __shfl_down_sync(0xffffffff, v, 8);
  v += __shfl_down_sync(0xffffffff, v, 4);
  v += __shfl_down_sync(0xffffffff, v, 2);
  v += __shfl_down_sync(0xffffffff, v, 1);
  return v;
}

template <typename T>
__global__ void mmareduce_kernel(T* __restrict__ bufred, const int n) {

  T sum = 0;
  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;
  for (int i = idx; i < n; i += str) {
    sum += bufred[i];
  }

  __shared__ T shared[32];
  unsigned int lane = threadIdx.x % warpSize;
  unsigned int wid = threadIdx.x / warpSize;

  sum = reduce_warp<T>(sum);
  if (lane == 0)
    shared[wid] = sum;
  __syncthreads();

  sum = (threadIdx.x < blockDim.x / warpSize) ? shared[lane] : 0;
  if (wid == 0)
    sum = reduce_warp<T>(sum);

  if (threadIdx.x == 0)
    bufred[blockIdx.x] = sum;
}

template <typename T>
__global__ void mmasum_kernel(const T* __restrict__ a, T* __restrict__ buf_h,
    const int n, const int m, const int k) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  const unsigned int lane = threadIdx.x % warpSize;
  const unsigned int wid = threadIdx.x / warpSize;

  __shared__ T shared[32];
  T sum = 0;
  for (int i = idx; i < n; i += str) {
    sum += a[m * i + k];
  }

  sum = reduce_warp<T>(sum);
  if (lane == 0)
    shared[wid] = sum;
  __syncthreads();

  sum = (threadIdx.x < blockDim.x / warpSize) ? shared[lane] : 0;
  if (wid == 0)
    sum = reduce_warp<T>(sum);

  if (threadIdx.x == 0)
    buf_h[blockIdx.x] = sum;
}
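// ---------------------------------------------------------------------------
// Illustrative sketch (not part of the original header; assumes nvcc
// compilation). mmasum_kernel above leaves one partial sum per block in a
// scratch buffer, and mmareduce_kernel, defined earlier in this file, is the
// natural second stage that collapses those partials so that the total of
// column k ends up in buf_d[0]. The helper name and launch sizes are
// assumptions; buf_d must hold at least one value per block.
template <typename T>
inline void column_sum_example(const T* a_d, T* buf_d, const int n,
                               const int m, const int k,
                               cudaStream_t stream) {
  const int threads = 1024;
  int blocks = (n + threads - 1) / threads;
  if (blocks > 1024) blocks = 1024; // the grid-stride loop covers the rest
  mmasum_kernel<T><<<blocks, threads, 0, stream>>>(a_d, buf_d, n, m, k);
  mmareduce_kernel<T><<<1, 1024, 0, stream>>>(buf_d, blocks);
}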
template <typename T>
__global__ void mmasumbb_kernel(const T* __restrict__ GG,
    const T* __restrict__ delx, const T* __restrict__ diagx,
    T* __restrict__ buf_h, const int n, const int m, const int k) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  const unsigned int lane = threadIdx.x % warpSize;
  const unsigned int wid = threadIdx.x / warpSize;

  __shared__ T shared[32];
  T sum = 0;
  for (int i = idx; i < n; i += str) {
    sum += GG[k + i * m] * delx[i] / diagx[i];
  }

  sum = reduce_warp<T>(sum);
  if (lane == 0)
    shared[wid] = sum;
  __syncthreads();

  sum = (threadIdx.x < blockDim.x / warpSize) ? shared[lane] : 0;
  if (wid == 0)
    sum = reduce_warp<T>(sum);

  if (threadIdx.x == 0)
    buf_h[blockIdx.x] = sum;
}

template <typename T>
__global__ void mmasumHess_kernel(
    const T* __restrict__ hijx,
    const T* __restrict__ Ljjxinv,
    T* __restrict__ buf_h,
    const int n,
    const int m,
    const int k0,
    const int k1)
{
  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int stride = blockDim.x * gridDim.x;

  // Warp lane and warp id within the block
  const unsigned int lane = threadIdx.x % warpSize;
  const unsigned int wid = threadIdx.x / warpSize;

  __shared__ T shared[32];

  T sum = T(0);

  // Grid-stride loop for the global reduction
  for (int i = idx; i < n; i += stride) {
    // hijx is indexed as hijx[offset + row * m]
    T val0 = hijx[k0 + i * m];
    T val1 = hijx[k1 + i * m];
    sum += val0 * Ljjxinv[i] * val1;
  }

  // Warp-level reduction using reduce_warp
  sum = reduce_warp<T>(sum);

  // Write each warp's partial sum to shared memory
  if (lane == 0) {
    shared[wid] = sum;
  }
  __syncthreads();

  // The first warp reduces the warp sums in shared memory
  sum = (threadIdx.x < blockDim.x / warpSize) ? shared[lane] : T(0);
  if (wid == 0) {
    sum = reduce_warp<T>(sum);
  }

  // Write the final result from the first thread of the block
  if (threadIdx.x == 0) {
    buf_h[blockIdx.x] = sum;
  }
}

template <typename T>
__global__ void mmasumAA_kernel(const T* __restrict__ GG,
    const T* __restrict__ diagx, T* __restrict__ buf_h, const int n,
    const int m, const int k0, const int k1) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  const unsigned int lane = threadIdx.x % warpSize;
  const unsigned int wid = threadIdx.x / warpSize;

  __shared__ T shared[32];
  T sum = 0;
  for (int i = idx; i < n; i += str) {
    sum += GG[k0 + i * m] / diagx[i] * GG[k1 + i * m];
  }

  sum = reduce_warp<T>(sum);
  if (lane == 0)
    shared[wid] = sum;
  __syncthreads();

  sum = (threadIdx.x < blockDim.x / warpSize) ? shared[lane] : 0;
  if (wid == 0)
    sum = reduce_warp<T>(sum);

  if (threadIdx.x == 0)
    buf_h[blockIdx.x] = sum;
}

template <typename T>
__global__ void mma_copy_kernel(T* __restrict__ a, const T* __restrict__ b,
    const int n, const int m) {
  int tj = blockIdx.x * blockDim.x + threadIdx.x;
  if (tj < n)
    a[tj + m] = b[tj];
}

template <typename T>
__global__ void AA_kernel(T* __restrict__ temp, const T* __restrict__ GG,
    const T* __restrict__ diagx, const int n, const int m) {
  int tj = blockIdx.x * blockDim.x + threadIdx.x;
  if (tj < n) {
    for (int i0 = 0; i0 < m; i0++) {
      for (int i1 = 0; i1 < m; i1++) {
        temp[tj + i0 * (n + 1) + i1 * (m + 1) * (n + 1)] = GG[i0 * n + tj] *
            (1.0 / diagx[tj]) * GG[i1 * n + tj];
      }
    }
  }
}

template <typename T>
__global__ void dx_kernel(T* __restrict__ dx, const T* __restrict__ delx,
    const T* __restrict__ diagx, const T* __restrict__ GG,
    const T* __restrict__ dlambda, const int n, const int m) {
  int tj = blockIdx.x * blockDim.x + threadIdx.x;
  if (tj < n) {
    dx[tj] = -delx[tj] / diagx[tj];
    for (int i = 0; i < m; i++) {
      dx[tj] = dx[tj] - GG[tj * m + i] * dlambda[i] / diagx[tj];
    }
  }
}

template <typename T>
__global__ void dxsi_kernel(T* __restrict__ dxsi, const T* __restrict__ xsi,
    const T* __restrict__ dx, const T* __restrict__ x,
    const T* __restrict__ alpha, const T epsi, const int n) {
  int tj = blockIdx.x * blockDim.x + threadIdx.x;
  if (tj < n) {
    dxsi[tj] = -xsi[tj] + (epsi - dx[tj] * xsi[tj]) / (x[tj] - alpha[tj]);
  }
}

template <typename T>
__global__ void deta_kernel(T* __restrict__ deta, const T* __restrict__ eta,
    const T* __restrict__ dx, const T* __restrict__ x,
    const T* __restrict__ beta, const T epsi, const int n) {
  int tj = blockIdx.x * blockDim.x + threadIdx.x;
  if (tj < n) {
    deta[tj] = -eta[tj] + (epsi + dx[tj] * eta[tj]) / (beta[tj] - x[tj]);
  }
}

template <typename T>
__global__ void RexCalculation_kernel(
    T* __restrict__ rex,
    const T* __restrict__ x,
    const T* __restrict__ xlow,
    const T* __restrict__ xupp,
    const T* __restrict__ pij,
    const T* __restrict__ p0j,
    const T* __restrict__ qij,
    const T* __restrict__ q0j,
    const T* __restrict__ lambda,
    const T* __restrict__ xsi,
    const T* __restrict__ eta,
    const int n,
    const int m)
{
  int tj = blockIdx.x * blockDim.x + threadIdx.x;
  if (tj < n) {
    T upp_diff = xupp[tj] - x[tj];
    T low_diff = x[tj] - xlow[tj];
    T upp_diff_sq = upp_diff * upp_diff;
    T low_diff_sq = low_diff * low_diff;

    T sum = 0.0;
    for (int i = 0; i < m; ++i) {
      sum += pij[i + tj * m] * lambda[i] / upp_diff_sq
          - qij[i + tj * m] * lambda[i] / low_diff_sq;
    }

    rex[tj] = sum
        + p0j[tj] / upp_diff_sq
        - q0j[tj] / low_diff_sq
        - xsi[tj] + eta[tj];
  }
}

template <typename T>
__global__ void rey_calculation_kernel(T* __restrict__ rey,
    const T* __restrict__ c, const T* __restrict__ d,
    const T* __restrict__ y,
    const T* __restrict__ lambda, const T* __restrict__ mu, const int n) {
  int tj = blockIdx.x * blockDim.x + threadIdx.x;
  if (tj < n) {
    rey[tj] = c[tj] + d[tj] * y[tj] - lambda[tj] - mu[tj];
  }
}

template <typename T>
__global__ void norm_kernel(const T* __restrict__ a,
    T* __restrict__ buf_h, const int n) {
  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  const unsigned int lane = threadIdx.x % warpSize;
  const unsigned int wid = threadIdx.x / warpSize;

  __shared__ T shared[32];
  T sum = T(0);

  for (int i = idx; i < n; i += str) {
    T val = a[i];
    sum += val * val; // faster than pow(val, 2)
  }

  sum = reduce_warp<T>(sum); // warp-level sum reduction
  if (lane == 0)
    shared[wid] = sum;
  __syncthreads();

  sum = (threadIdx.x < blockDim.x / warpSize) ? shared[lane] : T(0);
  if (wid == 0)
    sum = reduce_warp<T>(sum);

  if (threadIdx.x == 0)
    buf_h[blockIdx.x] = sum;
}

template <typename T>
__global__ void sub2cons_kernel(T* __restrict__ a, const T* __restrict__ b,
    const T* __restrict__ c,
    const T d, const int n) {
  int tj = blockIdx.x * blockDim.x + threadIdx.x;
  if (tj < n) {
    a[tj] = b[tj] * c[tj] - d;
  }
}

template <typename T>
__global__ void dely_kernel(T* __restrict__ dely, const T* __restrict__ c,
    const T* __restrict__ d, const T* __restrict__ y,
    const T* __restrict__ lambda, const T epsi, const int n) {
  int tj = blockIdx.x * blockDim.x + threadIdx.x;
  if (tj < n) {
    dely[tj] = c[tj] + d[tj] * y[tj] - lambda[tj] - epsi / y[tj];
  }
}

template <typename T>
__global__ void maxval2_kernel(const T* __restrict__ a,
    const T* __restrict__ b,
    T* __restrict__ temp, const T cons, const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  const unsigned int lane = threadIdx.x % warpSize;
  const unsigned int wid = threadIdx.x / warpSize;

  __shared__ T shared[32];
  T maxval = cons * a[0] / b[0];
  for (int i = idx; i < n; i += str) {
    maxval = max(maxval, cons * a[i] / b[i]);
  }

  maxval = max_reduce_warp<T>(maxval);
  if (lane == 0)
    shared[wid] = maxval;
  __syncthreads();

  maxval = (threadIdx.x < blockDim.x / warpSize) ? shared[lane] : 0.0;
  if (wid == 0)
    maxval = max_reduce_warp<T>(maxval);

  if (threadIdx.x == 0)
    temp[blockIdx.x] = maxval;
}

template <typename T>
__global__ void maxval3_kernel(const T* __restrict__ a,
    const T* __restrict__ b,
    const T* __restrict__ c, T* __restrict__ temp, const T cons,
    const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  const unsigned int lane = threadIdx.x % warpSize;
  const unsigned int wid = threadIdx.x / warpSize;

  __shared__ T shared[32];
  T maxval = cons * a[0] / b[0];
  for (int i = idx; i < n; i += str) {
    maxval = max(maxval, cons * a[i] / (b[i] - c[i]));
  }

  maxval = max_reduce_warp<T>(maxval);
  if (lane == 0)
    shared[wid] = maxval;
  __syncthreads();

  maxval = (threadIdx.x < blockDim.x / warpSize) ? shared[lane] : 0;
  if (wid == 0)
    maxval = max_reduce_warp<T>(maxval);

  if (threadIdx.x == 0)
    temp[blockIdx.x] = maxval;
}

template <typename T>
__global__ void kkt_rex_kernel(T* __restrict__ rex,
    const T* __restrict__ df0dx,
    const T* __restrict__ dfdx, const T* __restrict__ xsi,
    const T* __restrict__ eta, const T* __restrict__ lambda, const int n,
    const int m) {
  int tj = blockIdx.x * blockDim.x + threadIdx.x;
  if (tj < n) {
    rex[tj] = 0.0;
    for (int i = 0; i < m; i++) {
      rex[tj] = rex[tj] + dfdx[i + tj * m] * lambda[i];
    }
    rex[tj] += df0dx[tj] - xsi[tj] + eta[tj];
  }
}

template <typename T>
__global__ void maxcons_kernel(T* __restrict__ a, const T b,
    const T c, const T* __restrict__ d, const int n) {
  int tj = blockIdx.x * blockDim.x + threadIdx.x;
  if (tj < n) {
    a[tj] = max(b, c * d[tj]);
  }
}

template <typename T>
__global__ void glsum_kernel(const T* a, T* buf_h, const int n) {
  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  const unsigned int lane = threadIdx.x % warpSize;
  const unsigned int wid = threadIdx.x / warpSize;

  __shared__ T shared[32];
  T sum = 0;
  for (int i = idx; i < n; i += str) {
    sum += a[i];
  }

  sum = reduce_warp<T>(sum);
  if (lane == 0)
    shared[wid] = sum;
  __syncthreads();

  sum = (threadIdx.x < blockDim.x / warpSize) ? shared[lane] : 0;
  if (wid == 0)
    sum = reduce_warp<T>(sum);

  if (threadIdx.x == 0)
    buf_h[blockIdx.x] = sum;
}

template <typename T>
__global__ void glsc2_kernel(const T* a,
    const T* b,
    T* buf_h,
    const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  const unsigned int lane = threadIdx.x % warpSize;
  const unsigned int wid = threadIdx.x / warpSize;

  __shared__ T shared[32];
  T sum = 0.0;
  for (int i = idx; i < n; i += str) {
    sum += a[i] * b[i];
  }

  sum = reduce_warp<T>(sum);
  if (lane == 0)
    shared[wid] = sum;
  __syncthreads();

  sum = (threadIdx.x < blockDim.x / warpSize) ? shared[lane] : 0;
  if (wid == 0)
    sum = reduce_warp<T>(sum);

  if (threadIdx.x == 0)
    buf_h[blockIdx.x] = sum;
}

template <typename T>
__global__ void add2inv2_kernel(T* __restrict__ a, const T* __restrict__ b,
    const T c, const int n) {
  int tj = blockIdx.x * blockDim.x + threadIdx.x;
  if (tj < n) {
    a[tj] = a[tj] + c / b[tj];
  }
}

template <typename T>
__global__ void max2_kernel(T* __restrict__ a, const T b,
    const T* __restrict__ c, const T d, const int n) {
  int tj = blockIdx.x * blockDim.x + threadIdx.x;
  if (tj < n) {
    a[tj] = max(b, d * c[tj]);
  }
}

template <typename T>
__global__ void updatebb_kernel(T* __restrict__ bb,
    const T* __restrict__ dellambda, const T* __restrict__ dely,
    const T* __restrict__ d, const T* __restrict__ mu,
    const T* __restrict__ y, const T delz, const int m) {
  int tj = blockIdx.x * blockDim.x + threadIdx.x;
  if (tj < m)
    bb[tj] = dellambda[tj] + dely[tj] / (d[tj] + mu[tj] / y[tj]) - bb[tj];
  else if (tj < m + 1)
    bb[tj] = delz;
}

template <typename T>
__global__ void updateAA_kernel(T* __restrict__ AA,
    const T* __restrict__ globaltmp_mm, const T* __restrict__ s,
    const T* __restrict__ lambda, const T* __restrict__ d,
    const T* __restrict__ mu, const T* __restrict__ y,
    const T* __restrict__ a,
    const T zeta, const T z, const int m) {
  int tj = blockIdx.x * blockDim.x + threadIdx.x;
  if (tj < m) {
    AA[tj + tj * (m + 1)] = globaltmp_mm[tj + tj * m] + (s[tj] / lambda[tj] +
        1.0 / (d[tj] + mu[tj] / y[tj]));
    AA[tj + m * (m + 1)] = a[tj];
    AA[m + tj * (m + 1)] = a[tj];
  }
  else if (tj < m + 1)
    AA[tj + tj * (m + 1)] = -zeta / z;
}

template <typename T>
__global__ void dy_kernel(T* __restrict__ dy, const T* __restrict__ dely,
    const T* __restrict__ dlambda, const T* __restrict__ d,
    const T* __restrict__ mu, const T* __restrict__ y, const int n) {
  int tj = blockIdx.x * blockDim.x + threadIdx.x;
  if (tj < n)
    dy[tj] = (-dely[tj] + dlambda[tj]) / (d[tj] + mu[tj] / y[tj]);
}

#endif