Neko-TOP
A portable framework for high-order spectral element flow topology optimization.
mma_kernel.h
#ifndef MMA_HIP_KERNEL_H
#define MMA_HIP_KERNEL_H

template <typename T>
__global__ void mma_prepare_aa_matrix_kernel(T* __restrict__ AA,
    const T* __restrict__ s, const T* __restrict__ lambda,
    const T* __restrict__ d, const T* __restrict__ mu,
    const T* __restrict__ y, const T* __restrict__ a,
    const T zeta, const T z, const int m) {
  const int tj = blockIdx.x * blockDim.x + threadIdx.x;
  const int matrix_size = m + 1;

  if (tj >= m) return;
  AA[tj * matrix_size + tj] += s[tj] / lambda[tj] +
      (T)1.0 / (d[tj] + mu[tj] / y[tj]);
  AA[tj * matrix_size + m] = a[tj]; // column m+1
  AA[m * matrix_size + tj] = a[tj]; // row m+1

  // Only first thread updates the bottom-right corner element.
  if (tj == 0)
    AA[m * matrix_size + m] = -zeta / z;
}

// Update Hessian diagonal elements (y contributions for dip subsolve)
template <typename T>
__global__ void mma_update_hessian_diagonal_kernel(T* __restrict__ Hess,
    const T* __restrict__ y, const T* __restrict__ d,
    const T* __restrict__ mu, const T* __restrict__ lambda, const int m) {
  int tj = blockIdx.x * blockDim.x + threadIdx.x;
  if (tj >= m) return;

  T diag = Hess[tj * m + tj];
  // Contribution from y terms (inactive constraints)
  if (y[tj] > (T)0.0) {
    if (fabs(d[tj]) >= (T)1.0e-15) {
      diag -= (T)1.0 / d[tj];
    }
    // else: skip - equivalent to subtracting 1.0/1.0e-8
  }

  // Contribution from -mu/lambda (eq 10)
  diag -= mu[tj] / lambda[tj];
  Hess[tj * m + tj] = diag;
}

// Heuristic Levenberg-Marquardt stabilization
// Single-block version for m <= 1024
template <typename T>
__global__ void mma_stabilize_hessian_single_kernel(T* __restrict__ Hess,
    const int m) {
  const int tid = threadIdx.x;

  // Single thread computes trace and LM factor
  if (tid == 0) {
    T trace = (T)0.0;
    for (int j = 0; j < m; j++) {
      trace += Hess[j * m + j];
    }
    T lm_factor = max((T)(-1.0e-4) * trace / m, (T)1.0e-7);

    // Apply to all diagonal elements
    for (int j = 0; j < m; j++) {
      Hess[j * m + j] -= lm_factor;
    }
  }
}

// Heuristic Levenberg-Marquardt stabilization
// Multi-block version for m > 1024
template <typename T>
__global__ void mma_stabilize_hessian_multi_kernel(T* __restrict__ Hess,
    const T lm_factor, const int m) {
  const int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < m) {
    Hess[i * m + i] -= lm_factor;
  }
}

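For context, a host-side wrapper around the two stabilization kernels above might look like the following sketch. It is an illustration only, not part of this header: the wrapper name, the block size, the stream argument, and the host-side computation of lm_factor for the multi-block path (mirroring the single-block formula from a precomputed trace) are all assumptions.

#include <hip/hip_runtime.h>
#include <algorithm>

// Hypothetical helper: applies the heuristic LM shift to the m x m Hessian.
// For m <= 1024 the single-block kernel computes the trace on the device;
// for larger m the shift is derived on the host from a precomputed trace.
template <typename T>
void stabilize_hessian(T* Hess_d, int m, T trace_h, hipStream_t stream) {
  if (m <= 1024) {
    mma_stabilize_hessian_single_kernel<T><<<1, 1024, 0, stream>>>(Hess_d, m);
  } else {
    const T lm_factor = std::max((T)(-1.0e-4) * trace_h / m, (T)1.0e-7);
    const int threads = 1024;
    const int blocks = (m + threads - 1) / threads;
    mma_stabilize_hessian_multi_kernel<T><<<blocks, threads, 0, stream>>>(
        Hess_d, lm_factor, m);
  }
}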
// Small linear solver efficient for (n <= 100)
template <typename T>
__global__ void mma_small_lu_kernel(T* __restrict__ A, T* __restrict__ b,
    const int n) {
  const int tid = threadIdx.x;

  // Handle 1x1 case
  if (n == 1) {
    if (tid == 0 && abs(A[0]) > (T)1e-12) b[0] /= A[0];
    return;
  }

  // LU decomposition with partial pivoting
  for (int k = 0; k < n; k++) {
    // Pivoting - single thread
    if (tid == 0) {
      int max_row = k;
      T max_val = abs(A[k * n + k]);
      for (int i = k + 1; i < n; i++) {
        T val = abs(A[i * n + k]);
        if (val > max_val) {
          max_val = val;
          max_row = i;
        }
      }
      if (max_val > (T)1e-12 && max_row != k) {
        // Swap rows
        for (int j = k; j < n; j++) {
          T temp = A[k * n + j];
          A[k * n + j] = A[max_row * n + j];
          A[max_row * n + j] = temp;
        }
        // Swap rhs
        T temp_b = b[k];
        b[k] = b[max_row];
        b[max_row] = temp_b;
      }
    }
    __syncthreads();

    // Parallel elimination
    T diag = A[k * n + k];
    if (abs(diag) > (T)1e-12) {
      for (int i = tid + k + 1; i < n; i += blockDim.x) {
        T factor = A[i * n + k] / diag;
        A[i * n + k] = factor;
        for (int j = k + 1; j < n; j++) {
          A[i * n + j] -= factor * A[k * n + j];
        }
      }
    }
    __syncthreads();
  }

  // Parallel forward substitution
  for (int i = tid; i < n; i += blockDim.x) {
    T sum = b[i];
    for (int j = 0; j < i; j++) {
      sum -= A[i * n + j] * b[j];
    }
    b[i] = sum;
  }
  __syncthreads();

  // Parallel backward substitution
  for (int i = n - 1 - tid; i >= 0; i -= blockDim.x) {
    if (i >= 0) {
      T sum = b[i];
      for (int j = i + 1; j < n; j++) {
        sum -= A[i * n + j] * b[j];
      }
      if (abs(A[i * n + i]) > (T)1e-12) {
        b[i] = sum / A[i * n + i];
      }
    }
  }
  __syncthreads();
}

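As a usage note, the LU kernel above is written to run in a single thread block (it synchronizes with __syncthreads between phases), so a host-side call could look like the sketch below. The helper name, the block size of 256, and the stream handling are illustrative assumptions; only the single-block requirement follows from the kernel itself.

#include <hip/hip_runtime.h>

// Hypothetical helper: solves A x = b in place for a small dense system.
// A_d is an n x n row-major matrix on the device; b_d holds the right-hand
// side and is overwritten with the solution.
template <typename T>
void solve_small_system(T* A_d, T* b_d, int n, hipStream_t stream) {
  mma_small_lu_kernel<T><<<1, 256, 0, stream>>>(A_d, b_d, n);
}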
template <typename T>
__global__ void delta_1dbeam_kernel(T* __restrict__ Delta,
    const T L_total, const T Le, const int offset, const int n) {
  int k = blockIdx.x * blockDim.x + threadIdx.x;
  if (k >= n) return;

  // Convert to 1-based indexing for the calculation
  int idx = k + 1;

  // Calculate first term: (L_total - Le*(offset+idx-1))^3
  T x1 = L_total - Le * static_cast<T>(offset + idx - 1);
  T term1 = x1 * x1 * x1;

  // Calculate second term: (L_total - Le*(offset+idx))^3
  T x2 = L_total - Le * static_cast<T>(offset + idx);
  T term2 = x2 * x2 * x2;

  // Final result
  Delta[k] = (term1 - term2) / static_cast<T>(3.0);
}

template <typename T>
__global__ void mma_Ljjxinv_kernel(T* __restrict__ Ljjxinv,
    const T* __restrict__ pjlambda, const T* __restrict__ qjlambda,
    const T* __restrict__ x, const T* __restrict__ low,
    const T* __restrict__ upp,
    const T* __restrict__ alpha, const T* __restrict__ beta,
    const int n) {
  int tj = blockIdx.x * blockDim.x + threadIdx.x;
  if (tj >= n) return;

  // Load inputs into registers
  const T xt = x[tj];
  const T low_j = low[tj];
  const T upp_j = upp[tj];
  const T pj = pjlambda[tj];
  const T qj = qjlambda[tj];

  // Precompute reused differences
  const T diff_u = upp_j - xt;
  const T diff_l = xt - low_j;

  // Cube once (avoiding pow for speed)
  const T diff_u3 = diff_u * diff_u * diff_u;
  const T diff_l3 = diff_l * diff_l * diff_l;

  // Compute inverse value safely
  T denom = 2.0 * pj / diff_u3 + 2.0 * qj / diff_l3;
  T val = -1.0 / denom;

  // Mask out active primal constraints
  bool active = (fabs(xt - alpha[tj]) <= T(1e-16)) ||
                (fabs(xt - beta[tj]) <= T(1e-16));

  Ljjxinv[tj] = active ? T(0.0) : val;
}

template <typename T>
__global__ void mma_dipsolvesub1_kernel(T* __restrict__ x,
    const T* __restrict__ pjlambda, const T* __restrict__ qjlambda,
    const T* __restrict__ low, const T* __restrict__ upp,
    const T* __restrict__ alpha, const T* __restrict__ beta,
    const int n) {

  int tj = blockIdx.x * blockDim.x + threadIdx.x;
  if (tj >= n) return;

  // Load inputs into registers
  const T pj_sqrt = sqrt(pjlambda[tj]);
  const T qj_sqrt = sqrt(qjlambda[tj]);
  const T denom = pj_sqrt + qj_sqrt;

  const T low_j = low[tj];
  const T upp_j = upp[tj];
  const T alpha_j = alpha[tj];
  const T beta_j = beta[tj];

  // Weighted average
  const T val = (pj_sqrt * low_j + qj_sqrt * upp_j) / denom;

  // Clamp x between alpha and beta using branchless min/max
  x[tj] = fmin(fmax(val, alpha_j), beta_j);
}

template <typename T>
__global__ void mattrans_v_mul_kernel(T* __restrict__ output,
    const T* __restrict__ pij, const T* __restrict__ lambda,
    const int m, const int n) {

  int tj = blockIdx.x * blockDim.x + threadIdx.x;
  if (tj >= n) return;

  T acc = 0.0;

  // Use register accumulation
  for (int i = 0; i < m; ++i) {
    acc += pij[i + tj * m] * lambda[i];
  }

  output[tj] = acc;
}

template <typename T>
__global__ void mma_sub1_kernel(
    T* __restrict__ xlow,
    T* __restrict__ xupp,
    const T* __restrict__ x,
    const T* __restrict__ xmin,
    const T* __restrict__ xmax,
    const T asyinit,
    const int n) {
  int tj = blockIdx.x * blockDim.x + threadIdx.x;
  if (tj >= n) return;

  // Load values once into registers (global memory is slow)
  const T xt = x[tj];
  const T xmin_j = xmin[tj];
  const T xmax_j = xmax[tj];

  // Reuse xgap calculation
  const T xgap = xmax_j - xmin_j;
  const T offset = asyinit * xgap;

  xlow[tj] = xt - offset;
  xupp[tj] = xt + offset;
}

template <typename T>
__global__ void mma_sub2_kernel(T* __restrict__ low, T* __restrict__ upp,
    const T* __restrict__ x, const T* __restrict__ xold1,
    const T* __restrict__ xold2, const T* __restrict__ xdiff,
    const T asydecr, const T asyincr, const int n) {
  int tj = blockIdx.x * blockDim.x + threadIdx.x;
  if (tj >= n) return;

  // Load data into registers (faster than repeated global memory accesses)
  const T xval = x[tj];
  const T xold1val = xold1[tj];
  const T xold2val = xold2[tj];
  const T lowval = low[tj];
  const T uppval = upp[tj];
  const T xdiffval = xdiff[tj];

  // Compute the product
  const T prod = (xval - xold1val) * (xold1val - xold2val);

  // Compute asy_factor without branching
  T asy_factor = (prod < T(0)) ? asydecr :
                 (prod > T(0)) ? asyincr : T(1);

  // Update low and upp using fma (fused multiply-add) for numerical stability
  T new_low = fma(-asy_factor, (xold1val - lowval), xval);
  T new_upp = fma(asy_factor, (uppval - xold1val), xval);

  // Apply bounds
  new_low = max(new_low, xval - T(10.0) * xdiffval);
  new_low = min(new_low, xval - T(0.01) * xdiffval);

  new_upp = min(new_upp, xval + T(10.0) * xdiffval);
  new_upp = max(new_upp, xval + T(0.01) * xdiffval);

  // Write results back
  low[tj] = new_low;
  upp[tj] = new_upp;
}

template <typename T>
__global__ void mma_sub3_kernel(const T* __restrict__ x,
    const T* __restrict__ df0dx, const T* __restrict__ dfdx,
    T* __restrict__ low, T* __restrict__ upp, const T* __restrict__ xmin,
    const T* __restrict__ xmax, T* __restrict__ alpha, T* __restrict__ beta,
    T* __restrict__ p0j, T* __restrict__ q0j, T* __restrict__ pij,
    T* __restrict__ qij, const int n, const int m) {
  int tj = blockIdx.x * blockDim.x + threadIdx.x;
  if (tj >= n) return;

  // Load into registers once
  const T xt = x[tj];
  const T xmin_j = xmin[tj];
  const T xmax_j = xmax[tj];
  const T low_j = low[tj];
  const T upp_j = upp[tj];
  const T df0 = df0dx[tj];
  const T xgap = xmax_j - xmin_j;

  // Clamp helpers
  const T half_xgap = 0.5 * xgap;
  const T tenth_low_diff = T(0.1) * (xt - low_j);
  const T tenth_upp_diff = T(0.1) * (upp_j - xt);

  // Compute alpha and beta with fused max/min and fewer calls
  T alpha_val = max(max(xmin_j, low_j + tenth_low_diff), xt - half_xgap);
  T beta_val = min(min(xmax_j, upp_j - tenth_upp_diff), xt + half_xgap);

  alpha[tj] = alpha_val;
  beta[tj] = beta_val;

  // Avoid multiple pow calls, compute once
  const T upp_minus_x = upp_j - xt;
  const T x_minus_low = xt - low_j;
  const T upp_minus_x_sq = upp_minus_x * upp_minus_x;
  const T x_minus_low_sq = x_minus_low * x_minus_low;

  // Small epsilon for numerical stability
  const T eps = T(1e-5);
  const T inv_xgap = T(1.0) / max(eps, xgap);

  // Compute terms reused multiple times
  const T max_df0_pos = max(df0, T(0));
  const T max_df0_neg = max(-df0, T(0));

  p0j[tj] = upp_minus_x_sq * (T(1.001) * max_df0_pos +
            T(0.001) * max_df0_neg + eps * inv_xgap);
  q0j[tj] = x_minus_low_sq * (T(0.001) * max_df0_pos +
            T(1.001) * max_df0_neg + eps * inv_xgap);

  // Loop over m for pij and qij
  for (int i = 0; i < m; ++i) {
    int idx = i + tj * m;

    T dfdx_val = dfdx[idx];
    T max_pos = max(dfdx_val, T(0));
    T max_neg = max(-dfdx_val, T(0));

    pij[idx] = upp_minus_x_sq * (T(1.001) * max_pos +
               T(0.001) * max_neg + eps * inv_xgap);
    qij[idx] = x_minus_low_sq * (T(0.001) * max_pos +
               T(1.001) * max_neg + eps * inv_xgap);
  }
}

template <typename T>
__global__ void mma_sub4_kernel(
    const T* __restrict__ x,
    const T* __restrict__ low,
    const T* __restrict__ upp,
    const T* __restrict__ pij,
    const T* __restrict__ qij,
    T* __restrict__ temp,
    const int n,
    const int m) {

  int tj = blockIdx.x * blockDim.x + threadIdx.x;
  if (tj >= n) return;

  // Register caching
  const T xt = x[tj];
  const T low_j = low[tj];
  const T upp_j = upp[tj];

  const T denom_upp = upp_j - xt;
  const T denom_low = xt - low_j;

  const T eps = T(1e-12); // Precision-dependent epsilon
  const T inv_denom_upp = T(1) / max(denom_upp, eps);
  const T inv_denom_low = T(1) / max(denom_low, eps);

  const int base_idx = tj * m;

  for (int i = 0; i < m; ++i) {
    int idx = base_idx + i;
    temp[idx] = pij[idx] * inv_denom_upp + qij[idx] * inv_denom_low;
  }
}

template <typename T>
__global__ void mma_max2_kernel(
    T* __restrict__ xsi,
    const T* __restrict__ x,
    const T* __restrict__ alpha,
    const int n) {

  int tj = blockIdx.x * blockDim.x + threadIdx.x;
  if (tj >= n) return;

  const T eps = T(1e-12);
  T denom = max(x[tj] - alpha[tj], eps);
  xsi[tj] = max(T(1), T(1) / denom);
}

template <typename T>
__global__ void relambda_kernel(
    T* __restrict__ temp,
    const T* __restrict__ x,
    const T* __restrict__ xupp,
    const T* __restrict__ xlow,
    const T* __restrict__ pij,
    const T* __restrict__ qij,
    const int n,
    const int m) {

  int tj = blockIdx.x * blockDim.x + threadIdx.x;
  if (tj >= n) return;

  const T xt = x[tj];
  const T xup = xupp[tj];
  const T xlo = xlow[tj];

  // Prevent divide-by-zero using small epsilon
  const T eps = T(1e-12);
  const T inv_denom_upp = T(1) / max(xup - xt, eps);
  const T inv_denom_low = T(1) / max(xt - xlo, eps);

  int base_idx = tj * m;
  for (int i = 0; i < m; ++i) {
    int idx = base_idx + i;
    temp[idx] = pij[idx] * inv_denom_upp + qij[idx] * inv_denom_low;
  }
}

template <typename T>
__global__ void sub2cons2_kernel(
    T* __restrict__ a,
    const T* __restrict__ b,
    const T* __restrict__ c,
    const T* __restrict__ d,
    const T e,
    const int n) {
  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  if (idx >= n) return;

  const T bt = b[idx];
  const T ct = c[idx];
  const T dt = d[idx];

  a[idx] = bt * (ct - dt) - e;
}

template <typename T>
__inline__ __device__ T max_reduce_warp(T val) {
  int w = warpSize;
  for (int offset = w / 2; offset > 0; offset /= 2)
    val = max(val, __shfl_down(val, offset, w));
  return val;
}

template <typename T>
__global__ void maxval_kernel(const T* __restrict__ a, T* temp, const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  const unsigned int lane = threadIdx.x % warpSize;
  const unsigned int wid = threadIdx.x / warpSize;

  __shared__ T shared[16];
  T maxval = 0.0;
  for (int i = idx; i < n; i += str) {
    maxval = max(maxval, abs(a[i]));
  }

  maxval = max_reduce_warp<T>(maxval);
  if (lane == 0)
    shared[wid] = maxval;
  __syncthreads();

  maxval = (threadIdx.x < blockDim.x / warpSize) ? shared[lane] : 0.0;
  if (wid == 0)
    maxval = max_reduce_warp<T>(maxval);

  if (threadIdx.x == 0)
    temp[blockIdx.x] = maxval;
}

template <typename T>
__global__ void max_reduce_kernel(T* __restrict__ bufred, const int n) {

  T maxval = T(0);
  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int stride = blockDim.x * gridDim.x;

  // Grid-stride loop to cover all elements
  for (int i = idx; i < n; i += stride) {
    maxval = max(maxval, bufred[i]);
  }

  __shared__ T shared[16]; // One slot per warp (max 1024 threads/block)

  unsigned int lane = threadIdx.x % warpSize;
  unsigned int wid = threadIdx.x / warpSize;

  // Warp-level max reduction
  maxval = max_reduce_warp<T>(maxval);

  // Store each warp's max value in shared memory
  if (lane == 0) {
    shared[wid] = maxval;
  }
  __syncthreads();

  // Let the first warp reduce the warp-level maxima
  maxval = (threadIdx.x < blockDim.x / warpSize) ? shared[lane] : T(0);
  if (wid == 0) {
    maxval = max_reduce_warp<T>(maxval);
  }

  // Write the block's maximum value back to bufred[blockIdx.x]
  if (threadIdx.x == 0) {
    bufred[blockIdx.x] = maxval;
  }
}

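The two kernels above are designed to be chained: maxval_kernel writes one partial maximum per block, and max_reduce_kernel (launched with a single block) folds those partials into element 0. A host-side sketch of that pattern follows. The helper name and launch sizes are assumptions; 1024 threads is chosen so that, with 64-wide wavefronts, at most 16 warp partials land in the shared[16] buffer.

#include <hip/hip_runtime.h>
#include <algorithm>

// Hypothetical helper: returns max(|a[i]|) over n device elements using the
// two kernels above. scratch_d must hold at least `blocks` elements.
template <typename T>
T device_max_abs(const T* a_d, T* scratch_d, int n, hipStream_t stream) {
  const int threads = 1024;
  const int blocks = std::min((n + threads - 1) / threads, 1024);
  // Pass 1: one partial maximum per block, written to scratch_d.
  maxval_kernel<T><<<blocks, threads, 0, stream>>>(a_d, scratch_d, n);
  // Pass 2: a single block folds the per-block partials into scratch_d[0].
  max_reduce_kernel<T><<<1, threads, 0, stream>>>(scratch_d, blocks);
  T result;
  hipMemcpyAsync(&result, scratch_d, sizeof(T), hipMemcpyDeviceToHost, stream);
  hipStreamSynchronize(stream);
  return result;
}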
template <typename T>
__global__ void delx_kernel(
    T* __restrict__ delx,
    const T* __restrict__ x,
    const T* __restrict__ xlow,
    const T* __restrict__ xupp,
    const T* __restrict__ pij,
    const T* __restrict__ qij,
    const T* __restrict__ p0j,
    const T* __restrict__ q0j,
    const T* __restrict__ alpha,
    const T* __restrict__ beta,
    const T* __restrict__ lambda,
    const T epsi,
    const int n,
    const int m)
{
  int tj = blockIdx.x * blockDim.x + threadIdx.x;
  if (tj < n) {
    T xt = x[tj];
    T xlow_j = xlow[tj];
    T xupp_j = xupp[tj];
    T alpha_j = alpha[tj];
    T beta_j = beta[tj];

    // Precompute denominators squared for better performance
    T denom_low = xt - xlow_j;
    T denom_upp = xupp_j - xt;
    T denom_alpha = xt - alpha_j;
    T denom_beta = beta_j - xt;

    const T small_eps = T(1e-12);
    denom_low = (abs(denom_low) < small_eps) ? small_eps : denom_low;
    denom_upp = (abs(denom_upp) < small_eps) ? small_eps : denom_upp;
    denom_alpha = (abs(denom_alpha) < small_eps) ? small_eps : denom_alpha;
    denom_beta = (abs(denom_beta) < small_eps) ? small_eps : denom_beta;

    T sum = T(0);
    for (int i = 0; i < m; i++) {
      T lambda_i = lambda[i];
      sum += pij[i + tj * m] * lambda_i / (denom_upp * denom_upp)
           - qij[i + tj * m] * lambda_i / (denom_low * denom_low);
    }
    sum += p0j[tj] / (denom_upp * denom_upp)
         - q0j[tj] / (denom_low * denom_low)
         - epsi / denom_alpha
         + epsi / denom_beta;

    delx[tj] = sum;
  }
}

template <typename T>
__global__ void GG_kernel(
    T* __restrict__ GG,
    const T* __restrict__ x,
    const T* __restrict__ xlow,
    const T* __restrict__ xupp,
    const T* __restrict__ pij,
    const T* __restrict__ qij,
    const int n,
    const int m) {

  int tj = blockIdx.x * blockDim.x + threadIdx.x;
  if (tj < n) {
    T xt = x[tj];
    T xlow_j = xlow[tj];
    T xupp_j = xupp[tj];

    // Distances from bounds
    T diff_upper = xupp_j - xt;
    T diff_lower = xt - xlow_j;

    // Squared distances
    T diff_upper2 = diff_upper * diff_upper;
    T diff_lower2 = diff_lower * diff_lower;

    for (int i = 0; i < m; i++) {
      int idx = i + tj * m;
      GG[idx] = pij[idx] / diff_upper2 - qij[idx] / diff_lower2;
    }
  }
}

template <typename T>
__global__ void diagx_kernel(T* __restrict__ diagx, const T* __restrict__ x,
    const T* __restrict__ xsi, const T* __restrict__ xlow,
    const T* __restrict__ xupp, const T* __restrict__ p0j,
    const T* __restrict__ q0j, const T* __restrict__ pij,
    const T* __restrict__ qij, const T* alpha, const T* beta,
    const T* eta, const T* lambda, const int n, const int m) {
  int tj = blockIdx.x * blockDim.x + threadIdx.x;
  if (tj < n) {
    T sum = 0;
    T sum1 = 0;
    for (int i = 0; i < m; i++) {
      sum = sum + pij[tj * m + i] * lambda[i];
      sum1 = sum1 + qij[tj * m + i] * lambda[i];
    }
    diagx[tj] = (p0j[tj] + sum) / pow(xupp[tj] - x[tj], 3) +
                (q0j[tj] + sum1) / pow(x[tj] - xlow[tj], 3);
    diagx[tj] = 2.0 * diagx[tj] + xsi[tj] / (x[tj] - alpha[tj]) +
                eta[tj] / (beta[tj] - x[tj]);
  }
}

template <typename T>
__inline__ __device__ T reduce_warp(T val) {
  int w = warpSize;
  for (int offset = w / 2; offset > 0; offset /= 2)
    val += __shfl_down(val, offset, w);
  return val;
}

template <typename T>
__global__ void mmareduce_kernel(T* __restrict__ bufred, const int n) {

  T sum = 0;
  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;
  for (int i = idx; i < n; i += str) {
    sum += bufred[i];
  }

  __shared__ T shared[16];
  unsigned int lane = threadIdx.x % warpSize;
  unsigned int wid = threadIdx.x / warpSize;

  sum = reduce_warp<T>(sum);
  if (lane == 0)
    shared[wid] = sum;
  __syncthreads();

  sum = (threadIdx.x < blockDim.x / warpSize) ? shared[lane] : 0;
  if (wid == 0)
    sum = reduce_warp<T>(sum);

  if (threadIdx.x == 0)
    bufred[blockIdx.x] = sum;
}

template <typename T>
__global__ void mmasum_kernel(const T* __restrict__ a, T* __restrict__ buf_h,
    const int n, const int m, const int k) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  const unsigned int lane = threadIdx.x % warpSize;
  const unsigned int wid = threadIdx.x / warpSize;

  __shared__ T shared[16];
  T sum = 0;
  for (int i = idx; i < n; i += str) {
    sum += a[m * i + k];
  }

  sum = reduce_warp<T>(sum);
  if (lane == 0)
    shared[wid] = sum;
  __syncthreads();

  sum = (threadIdx.x < blockDim.x / warpSize) ? shared[lane] : 0;
  if (wid == 0)
    sum = reduce_warp<T>(sum);

  if (threadIdx.x == 0)
    buf_h[blockIdx.x] = sum;
}

template <typename T>
__global__ void mmasumbb_kernel(const T* __restrict__ GG,
    const T* __restrict__ delx, const T* __restrict__ diagx,
    T* __restrict__ buf_h, const int n, const int m, const int k) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  const unsigned int lane = threadIdx.x % warpSize;
  const unsigned int wid = threadIdx.x / warpSize;

  __shared__ T shared[16];
  T sum = 0;
  for (int i = idx; i < n; i += str) {
    sum += GG[k + i * m] * delx[i] / diagx[i];
  }

  sum = reduce_warp<T>(sum);
  if (lane == 0)
    shared[wid] = sum;
  __syncthreads();

  sum = (threadIdx.x < blockDim.x / warpSize) ? shared[lane] : 0;
  if (wid == 0)
    sum = reduce_warp<T>(sum);

  if (threadIdx.x == 0)
    buf_h[blockIdx.x] = sum;
}

template <typename T>
__global__ void mmasumHess_kernel(
    const T* __restrict__ hijx,
    const T* __restrict__ Ljjxinv,
    T* __restrict__ buf_h,
    const int n,
    const int m,
    const int k0,
    const int k1)
{
  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int stride = blockDim.x * gridDim.x;

  // Warp lane and warp id within the block
  const unsigned int lane = threadIdx.x % warpSize;
  const unsigned int wid = threadIdx.x / warpSize;

  __shared__ T shared[16];

  T sum = T(0);

  // Grid-stride loop for global reduction
  for (int i = idx; i < n; i += stride) {
    // hijx is indexed as hijx[offset + row * m]
    T val0 = hijx[k0 + i * m];
    T val1 = hijx[k1 + i * m];
    sum += val0 * Ljjxinv[i] * val1;
  }

  // Warp-level reduction using reduce_warp
  sum = reduce_warp<T>(sum);

  // Write each warp's partial sum to shared memory
  if (lane == 0) {
    shared[wid] = sum;
  }
  __syncthreads();

  // First warp reduces the warp sums in shared memory
  sum = (threadIdx.x < blockDim.x / warpSize) ? shared[lane] : T(0);
  if (wid == 0) {
    sum = reduce_warp<T>(sum);
  }

  // Write final result from first thread of block
  if (threadIdx.x == 0) {
    buf_h[blockIdx.x] = sum;
  }
}

template <typename T>
__global__ void mmasumAA_kernel(const T* __restrict__ GG,
    const T* __restrict__ diagx, T* __restrict__ buf_h, const int n,
    const int m, const int k0, const int k1) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  const unsigned int lane = threadIdx.x % warpSize;
  const unsigned int wid = threadIdx.x / warpSize;

  __shared__ T shared[16];
  T sum = 0;
  for (int i = idx; i < n; i += str) {
    sum += GG[k0 + i * m] / diagx[i] * GG[k1 + i * m];
  }

  sum = reduce_warp<T>(sum);
  if (lane == 0)
    shared[wid] = sum;
  __syncthreads();

  sum = (threadIdx.x < blockDim.x / warpSize) ? shared[lane] : 0;
  if (wid == 0)
    sum = reduce_warp<T>(sum);

  if (threadIdx.x == 0)
    buf_h[blockIdx.x] = sum;
}

template <typename T>
__global__ void mma_copy_kernel(T* __restrict__ a, const T* __restrict__ b,
    const int n, const int m) {
  int tj = blockIdx.x * blockDim.x + threadIdx.x;
  if (tj < n)
    a[tj + m] = b[tj];
}

template <typename T>
__global__ void AA_kernel(T* __restrict__ temp, const T* __restrict__ GG,
    const T* __restrict__ diagx, const int n, const int m) {
  int tj = blockIdx.x * blockDim.x + threadIdx.x;
  if (tj < n) {
    for (int i0 = 0; i0 < m; i0++) {
      for (int i1 = 0; i1 < m; i1++) {
        temp[tj + i0 * (n + 1) + i1 * (m + 1) * (n + 1)] = GG[i0 * n + tj] *
            (1.0 / diagx[tj]) * GG[i1 * n + tj];
      }
    }
  }
}

template <typename T>
__global__ void dx_kernel(T* __restrict__ dx, const T* __restrict__ delx,
    const T* __restrict__ diagx, const T* __restrict__ GG,
    const T* __restrict__ dlambda, const int n, const int m) {
  int tj = blockIdx.x * blockDim.x + threadIdx.x;
  if (tj < n) {
    dx[tj] = -delx[tj] / diagx[tj];
    for (int i = 0; i < m; i++) {
      dx[tj] = dx[tj] - GG[tj * m + i] * dlambda[i] / diagx[tj];
    }
  }
}

template <typename T>
__global__ void dxsi_kernel(T* __restrict__ dxsi, const T* __restrict__ xsi,
    const T* __restrict__ dx, const T* __restrict__ x,
    const T* __restrict__ alpha, const T epsi, const int n) {
  int tj = blockIdx.x * blockDim.x + threadIdx.x;
  if (tj < n) {
    dxsi[tj] = -xsi[tj] + (epsi - dx[tj] * xsi[tj]) / (x[tj] - alpha[tj]);
  }
}

template <typename T>
__global__ void deta_kernel(T* __restrict__ deta, const T* __restrict__ eta,
    const T* __restrict__ dx, const T* __restrict__ x,
    const T* __restrict__ beta, const T epsi, const int n) {
  int tj = blockIdx.x * blockDim.x + threadIdx.x;
  if (tj < n) {
    deta[tj] = -eta[tj] + (epsi + dx[tj] * eta[tj]) / (beta[tj] - x[tj]);
  }
}

template <typename T>
__global__ void RexCalculation_kernel(
    T* __restrict__ rex,
    const T* __restrict__ x,
    const T* __restrict__ xlow,
    const T* __restrict__ xupp,
    const T* __restrict__ pij,
    const T* __restrict__ p0j,
    const T* __restrict__ qij,
    const T* __restrict__ q0j,
    const T* __restrict__ lambda,
    const T* __restrict__ xsi,
    const T* __restrict__ eta,
    const int n,
    const int m)
{
  int tj = blockIdx.x * blockDim.x + threadIdx.x;
  if (tj < n) {
    T upp_diff = xupp[tj] - x[tj];
    T low_diff = x[tj] - xlow[tj];
    T upp_diff_sq = upp_diff * upp_diff;
    T low_diff_sq = low_diff * low_diff;

    T sum = 0.0;
    for (int i = 0; i < m; ++i) {
      sum += pij[i + tj * m] * lambda[i] / upp_diff_sq
           - qij[i + tj * m] * lambda[i] / low_diff_sq;
    }

    rex[tj] = sum
            + p0j[tj] / upp_diff_sq
            - q0j[tj] / low_diff_sq
            - xsi[tj] + eta[tj];
  }
}

template <typename T>
__global__ void rey_calculation_kernel(T* __restrict__ rey,
    const T* __restrict__ c, const T* __restrict__ d, const T* __restrict__ y,
    const T* __restrict__ lambda, const T* __restrict__ mu, const int n) {
  int tj = blockIdx.x * blockDim.x + threadIdx.x;
  if (tj < n) {
    rey[tj] = c[tj] + d[tj] * y[tj] - lambda[tj] - mu[tj];
  }
}

template <typename T>
__global__ void norm_kernel(const T* __restrict__ a, T* __restrict__ buf_h,
    const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  const unsigned int lane = threadIdx.x % warpSize;
  const unsigned int wid = threadIdx.x / warpSize;

  __shared__ T shared[16];
  T sum = 0;
  for (int i = idx; i < n; i += str) {
    sum += pow(a[i], 2);
  }

  sum = reduce_warp<T>(sum);
  if (lane == 0)
    shared[wid] = sum;
  __syncthreads();

  sum = (threadIdx.x < blockDim.x / warpSize) ? shared[lane] : 0;
  if (wid == 0)
    sum = reduce_warp<T>(sum);

  if (threadIdx.x == 0)
    buf_h[blockIdx.x] = sum;
}

template <typename T>
__global__ void sub2cons_kernel(T* __restrict__ a, const T* __restrict__ b,
    const T* __restrict__ c,
    const T d, const int n) {
  int tj = blockIdx.x * blockDim.x + threadIdx.x;
  if (tj < n) {
    a[tj] = b[tj] * c[tj] - d;
  }
}

template <typename T>
__global__ void dely_kernel(T* __restrict__ dely, const T* __restrict__ c,
    const T* __restrict__ d, const T* __restrict__ y,
    const T* __restrict__ lambda, const T epsi, const int n) {
  int tj = blockIdx.x * blockDim.x + threadIdx.x;
  if (tj < n) {
    dely[tj] = c[tj] + d[tj] * y[tj] - lambda[tj] - epsi / y[tj];
  }
}

template <typename T>
__global__ void maxval2_kernel(const T* __restrict__ a,
    const T* __restrict__ b,
    T* __restrict__ temp, const T cons, const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  const unsigned int lane = threadIdx.x % warpSize;
  const unsigned int wid = threadIdx.x / warpSize;

  __shared__ T shared[16];
  T maxval = cons * a[0] / b[0];
  for (int i = idx; i < n; i += str) {
    maxval = max(maxval, cons * a[i] / b[i]);
  }

  maxval = max_reduce_warp<T>(maxval);
  if (lane == 0)
    shared[wid] = maxval;
  __syncthreads();

  maxval = (threadIdx.x < blockDim.x / warpSize) ? shared[lane] : 0.0;
  if (wid == 0)
    maxval = max_reduce_warp<T>(maxval);

  if (threadIdx.x == 0)
    temp[blockIdx.x] = maxval;
}

template <typename T>
__global__ void maxval3_kernel(const T* __restrict__ a,
    const T* __restrict__ b,
    const T* __restrict__ c, T* __restrict__ temp, const T cons,
    const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  const unsigned int lane = threadIdx.x % warpSize;
  const unsigned int wid = threadIdx.x / warpSize;

  __shared__ T shared[16];
  T maxval = cons * a[0] / b[0];
  for (int i = idx; i < n; i += str) {
    maxval = max(maxval, cons * a[i] / (b[i] - c[i]));
  }

  maxval = max_reduce_warp<T>(maxval);
  if (lane == 0)
    shared[wid] = maxval;
  __syncthreads();

  maxval = (threadIdx.x < blockDim.x / warpSize) ? shared[lane] : 0;
  if (wid == 0)
    maxval = max_reduce_warp<T>(maxval);

  if (threadIdx.x == 0)
    temp[blockIdx.x] = maxval;
}

template <typename T>
__global__ void kkt_rex_kernel(T* __restrict__ rex,
    const T* __restrict__ df0dx,
    const T* __restrict__ dfdx, const T* __restrict__ xsi,
    const T* __restrict__ eta, const T* __restrict__ lambda, const int n,
    const int m) {
  int tj = blockIdx.x * blockDim.x + threadIdx.x;
  if (tj < n) {
    rex[tj] = 0.0;
    for (int i = 0; i < m; i++) {
      rex[tj] = rex[tj] + dfdx[i + tj * m] * lambda[i];
    }
    rex[tj] += df0dx[tj] - xsi[tj] + eta[tj];
  }
}

template <typename T>
__global__ void maxcons_kernel(T* __restrict__ a, const T b,
    const T c, const T* __restrict__ d, const int n) {
  int tj = blockIdx.x * blockDim.x + threadIdx.x;
  if (tj < n) {
    a[tj] = max(b, c * d[tj]);
  }
}

template <typename T>
__global__ void glsum_kernel(const T* a, T* buf_h, const int n) {
  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  const unsigned int lane = threadIdx.x % warpSize;
  const unsigned int wid = threadIdx.x / warpSize;

  __shared__ T shared[16];
  T sum = 0;
  for (int i = idx; i < n; i += str) {
    sum += a[i];
  }

  sum = reduce_warp<T>(sum);
  if (lane == 0)
    shared[wid] = sum;
  __syncthreads();

  sum = (threadIdx.x < blockDim.x / warpSize) ? shared[lane] : 0;
  if (wid == 0)
    sum = reduce_warp<T>(sum);

  if (threadIdx.x == 0)
    buf_h[blockIdx.x] = sum;
}

template <typename T>
__global__ void glsc2_kernel(const T* a,
    const T* b,
    T* buf_h,
    const int n) {

  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = blockDim.x * gridDim.x;

  const unsigned int lane = threadIdx.x % warpSize;
  const unsigned int wid = threadIdx.x / warpSize;

  __shared__ T shared[16];
  T sum = 0.0;
  for (int i = idx; i < n; i += str) {
    sum += a[i] * b[i];
  }

  sum = reduce_warp<T>(sum);
  if (lane == 0)
    shared[wid] = sum;
  __syncthreads();

  sum = (threadIdx.x < blockDim.x / warpSize) ? shared[lane] : 0;
  if (wid == 0)
    sum = reduce_warp<T>(sum);

  if (threadIdx.x == 0)
    buf_h[blockIdx.x] = sum;
}

template <typename T>
__global__ void add2inv2_kernel(T* __restrict__ a, const T* __restrict__ b,
    const T c, const int n) {
  int tj = blockIdx.x * blockDim.x + threadIdx.x;
  if (tj < n) {
    a[tj] = a[tj] + c / b[tj];
  }
}

template <typename T>
__global__ void max2_kernel(T* __restrict__ a, const T b,
    const T* __restrict__ c, const T d, const int n) {
  int tj = blockIdx.x * blockDim.x + threadIdx.x;
  if (tj < n) {
    a[tj] = max(b, d * c[tj]);
  }
}

template <typename T>
__global__ void updatebb_kernel(T* __restrict__ bb,
    const T* __restrict__ dellambda, const T* __restrict__ dely,
    const T* __restrict__ d, const T* __restrict__ mu,
    const T* __restrict__ y, const T delz, const int m) {
  int tj = blockIdx.x * blockDim.x + threadIdx.x;
  if (tj < m)
    bb[tj] = dellambda[tj] + dely[tj] / (d[tj] + mu[tj] / y[tj]) - bb[tj];
  else if (tj < m + 1)
    bb[tj] = delz;
}

template <typename T>
__global__ void updateAA_kernel(T* __restrict__ AA,
    const T* __restrict__ globaltmp_mm, const T* __restrict__ s,
    const T* __restrict__ lambda, const T* __restrict__ d,
    const T* __restrict__ mu, const T* __restrict__ y,
    const T* __restrict__ a,
    const T zeta, const T z, const int m) {
  int tj = blockIdx.x * blockDim.x + threadIdx.x;
  if (tj < m) {
    AA[tj + tj * (m + 1)] = globaltmp_mm[tj + tj * m] + (s[tj] / lambda[tj] +
        1.0 / (d[tj] + mu[tj] / y[tj]));
    AA[tj + m * (m + 1)] = a[tj];
    AA[m + tj * (m + 1)] = a[tj];
  }
  else if (tj < m + 1)
    AA[tj + tj * (m + 1)] = -zeta / z;
}

template <typename T>
__global__ void dy_kernel(T* __restrict__ dy, const T* __restrict__ dely,
    const T* __restrict__ dlambda, const T* __restrict__ d,
    const T* __restrict__ mu, const T* __restrict__ y, const int n) {
  int tj = blockIdx.x * blockDim.x + threadIdx.x;
  if (tj < n)
    dy[tj] = (-dely[tj] + dlambda[tj]) / (d[tj] + mu[tj] / y[tj]);
}

#endif
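Most of the elementwise kernels in this header follow the same one-thread-per-entry pattern, so their launches differ only in the argument list. A representative host-side sketch for mma_sub1_kernel is shown below; the wrapper name and the block size of 256 are assumptions for illustration, not something fixed by the header.

#include <hip/hip_runtime.h>

// Hypothetical wrapper for a one-thread-per-entry kernel: each thread tj < n
// writes xlow[tj] and xupp[tj] from the initial asymptote rule.
template <typename T>
void launch_mma_sub1(T* xlow_d, T* xupp_d, const T* x_d, const T* xmin_d,
                     const T* xmax_d, T asyinit, int n, hipStream_t stream) {
  const int threads = 256;
  const int blocks = (n + threads - 1) / threads;
  mma_sub1_kernel<T><<<blocks, threads, 0, stream>>>(
      xlow_d, xupp_d, x_d, xmin_d, xmax_d, asyinit, n);
}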