Neko-TOP
A portable framework for high-order spectral element flow toplogy optimization.
Loading...
Searching...
No Matches
mma_device.f90
1
34
35submodule(mma) mma_device
36
37 use device_math, only: device_copy, device_cmult, device_cadd, device_cfill, &
38 device_add2, device_add3s2, device_invcol2, device_col2, device_col3, &
39 device_sub2, device_sub3, device_add2s2, device_cadd2, device_pwmax2, &
40 device_pwmin2, device_cpwmax2, device_glsum, device_cmult2
41 use device_mma_math, only: device_maxval, device_norm, device_lcsc2, &
42 device_maxval2, device_maxval3, device_mma_gensub3, &
43 device_mma_gensub4, device_mma_max, device_max2, device_rex, &
44 device_relambda, device_delx, device_add2inv2, device_gg, device_diagx, &
45 device_bb, device_updatebb, device_aa, device_updateaa, device_dx, &
46 device_dy, device_dxsi, device_deta, device_kkt_rex, &
47 device_mma_gensub2, device_mattrans_v_mul, device_mma_dipsolvesub1, &
48 device_mma_ljjxinv, device_hess, device_solve_linear_system, &
49 device_prepare_hessian, device_prepare_aa_matrix, device_update_hessian_z
50
51 use neko_config, only: neko_bcknd_device, neko_device_mpi
52 use device, only: device_to_host
53 use comm, only: neko_comm, pe_rank, mpi_real_precision
54 use mpi_f08, only: mpi_in_place, mpi_max, mpi_min
55 use profiler, only: profiler_start_region, profiler_end_region
56
57 implicit none
58
59contains
60
61 module subroutine mma_update_device(this, iter, x, df0dx, fval, dfdx)
62 ! ----------------------------------------------------- !
63 ! Update the design variable x by solving the convex !
64 ! approximation of the problem. !
65 ! !
66 ! This subroutine is called in each iteration of the !
67 ! optimization loop !
68 ! ----------------------------------------------------- !
69 class(mma_t), intent(inout) :: this
70 integer, intent(in) :: iter
71 type(c_ptr), intent(inout) :: x
72 type(c_ptr), intent(in) :: df0dx, fval, dfdx
73
74 if (.not. this%is_initialized) then
75 call neko_error("The MMA object is not initialized.")
76 end if
77
78 call profiler_start_region("MMA gensub")
79 ! generate a convex approximation of the problem
80 call mma_gensub_device(this, iter, x, df0dx, fval, dfdx)
81 call profiler_end_region("MMA gensub")
82
83 !solve the approximation problem using interior point method
84 call profiler_start_region("MMA subsolve")
85 if (this%subsolver .eq. "dip") then
86 call mma_subsolve_dip_device(this, x)
87 else if (this%subsolver .eq. "dpip") then
88 call mma_subsolve_dpip_device(this, x)
89 else
90 call neko_error("Unrecognized subsolver for MMA in mma_device.")
91 end if
92 call profiler_end_region("MMA subsolve")
93
94 this%is_updated = .true.
95 end subroutine mma_update_device
96
97 module subroutine mma_kkt_device(this, x, df0dx, fval, dfdx)
98 class(mma_t), intent(inout) :: this
99 type(c_ptr), intent(in) :: x, df0dx, fval, dfdx
100
101 if (this%subsolver .eq. "dip") then
102 call mma_dip_kkt_device(this, x, df0dx, fval, dfdx)
103 else
104 call mma_dpip_kkt_device(this, x, df0dx, fval, dfdx)
105 end if
106 end subroutine mma_kkt_device
107
109 ! point method (dip) subsolve of MMA algorithm.
110 module subroutine mma_dip_kkt_device(this, x, df0dx, fval, dfdx)
111 class(mma_t), intent(inout) :: this
112 type(c_ptr), intent(in) :: x, df0dx, fval, dfdx
113
114 type(vector_t), pointer :: relambda, remu
115 integer :: ind(2)
116
117 call this%scratch%request(relambda, ind(1), this%m, .false.)
118 call this%scratch%request(remu, ind(2), this%m, .false.)
119
120 ! relambda = fval - this%a%x * this%z - this%y%x + this%mu%x
121 call device_add3s2(relambda%x_d, fval, this%a%x_d, 1.0_rp, -this%z, &
122 this%m)
123 call device_sub2(relambda%x_d, this%y%x_d, this%m)
124 call device_add2(relambda%x_d, this%mu%x_d, this%m)
125
126 ! Compute residual for mu (eta in the paper)
127 call device_col3(remu%x_d, this%lambda%x_d, this%mu%x_d, this%m)
128
129 this%residumax = maxval([device_maxval(relambda%x_d, this%m), &
130 device_maxval(remu%x_d, this%m)])
131 this%residunorm = sqrt(device_norm(relambda%x_d, this%m)+ &
132 device_norm(remu%x_d, this%m))
133
134 call this%scratch%relinquish(ind)
135 end subroutine mma_dip_kkt_device
136
138 ! point method (dpip) subsolve of MMA algorithm.
139 module subroutine mma_dpip_kkt_device(this, x, df0dx, fval, dfdx)
140 class(mma_t), intent(inout) :: this
141 type(c_ptr), intent(in) :: x, df0dx, fval, dfdx
142
143 real(kind=rp) :: rez, rezeta
144 type(vector_t), pointer :: rey, relambda, remu, res
145 type(vector_t), pointer :: rex, rexsi, reeta
146 integer :: ierr, ind(7)
147 real(kind=rp) :: re_sq_norm
148
149 call this%scratch%request(rey, ind(1), this%m, .false.)
150 call this%scratch%request(relambda, ind(2), this%m, .false.)
151 call this%scratch%request(remu, ind(3), this%m, .false.)
152 call this%scratch%request(res, ind(4), this%m, .false.)
153
154 call this%scratch%request(rex, ind(5), this%n, .false.)
155 call this%scratch%request(rexsi, ind(6), this%n, .false.)
156 call this%scratch%request(reeta, ind(7), this%n, .false.)
157
158 call device_kkt_rex(rex%x_d, df0dx, dfdx, this%xsi%x_d, &
159 this%eta%x_d, this%lambda%x_d, this%n, this%m)
160
161 call device_col3(rey%x_d, this%d%x_d, this%y%x_d, this%m)
162 call device_add2(rey%x_d, this%c%x_d, this%m)
163 call device_sub2(rey%x_d, this%lambda%x_d, this%m)
164 call device_sub2(rey%x_d, this%mu%x_d, this%m)
165
166 rez = this%a0 - this%zeta - device_lcsc2(this%lambda%x_d, this%a%x_d, &
167 this%m)
168
169 call device_add3s2(relambda%x_d, fval, this%a%x_d, 1.0_rp, -this%z, &
170 this%m)
171 call device_sub2(relambda%x_d, this%y%x_d, this%m)
172 call device_add2(relambda%x_d, this%s%x_d, this%m)
173
174 call device_sub3(rexsi%x_d, x, this%xmin%x_d, this%n)
175 call device_col2(rexsi%x_d, this%xsi%x_d, this%n)
176
177 call device_sub3(reeta%x_d, this%xmax%x_d, x, this%n)
178 call device_col2(reeta%x_d, this%eta%x_d, this%n)
179
180 call device_col3(remu%x_d, this%mu%x_d, this%y%x_d, this%m)
181
182 rezeta = this%zeta * this%z
183
184 call device_col3(res%x_d, this%lambda%x_d, this%s%x_d, this%m)
185
186 this%residumax = maxval([ &
187 device_maxval(rex%x_d, this%n), &
188 device_maxval(rey%x_d, this%m), &
189 abs(rez), &
190 device_maxval(relambda%x_d, this%m), &
191 device_maxval(rexsi%x_d, this%n), &
192 device_maxval(reeta%x_d, this%n), &
193 device_maxval(remu%x_d, this%m), &
194 abs(rezeta), &
195 device_maxval(res%x_d, this%m)])
196
197 re_sq_norm = device_norm(rex%x_d, this%n) + &
198 device_norm(rexsi%x_d, this%n) + &
199 device_norm(reeta%x_d, this%n)
200
201 call mpi_allreduce(mpi_in_place, this%residumax, 1, &
202 mpi_real_precision, mpi_max, neko_comm, ierr)
203
204 call mpi_allreduce(mpi_in_place, re_sq_norm, 1, &
205 mpi_real_precision, mpi_sum, neko_comm, ierr)
206
207 this%residunorm = sqrt(( &
208 device_norm(rey%x_d, this%m) + &
209 rez**2 + &
210 device_norm(relambda%x_d, this%m) + &
211 device_norm(remu%x_d, this%m) + &
212 rezeta**2 + &
213 device_norm(res%x_d, this%m) &
214 ) + re_sq_norm)
215
216 call this%scratch%relinquish(ind)
217 end subroutine mma_dpip_kkt_device
218
219 !============================================================================!
220 ! private internal subroutines
221
223 subroutine mma_gensub_device(this, iter, x, df0dx, fval, dfdx)
224 ! ----------------------------------------------------- !
225 ! Generate the approximation sub problem by computing !
226 ! the lower and upper asymtotes and the other necessary !
227 ! parameters (alpha, beta, p0j, q0j, pij, qij, ...). !
228 ! ----------------------------------------------------- !
229 class(mma_t), intent(inout) :: this
230 type(c_ptr), intent(in) :: x
231 type(c_ptr), intent(in) :: df0dx
232 type(c_ptr), intent(in) :: fval
233 type(c_ptr), intent(in) :: dfdx
234
235 integer, intent(in) :: iter
236 integer :: ierr
237
238 type(vector_t), pointer :: x_diff, xmin_eff, xmax_eff
239 integer :: ind(3)
240
241 call this%scratch%request(x_diff, ind(1), this%n, .false.)
242 call this%scratch%request(xmin_eff, ind(2), this%n, .false.)
243 call this%scratch%request(xmax_eff, ind(3), this%n, .false.)
244
245 call device_copy(xmin_eff%x_d, this%xmin%x_d, this%n)
246 call device_copy(xmax_eff%x_d, this%xmax%x_d, this%n)
247
248 if (this%move_limit .gt. 0.0_rp) then
249 call device_cadd2(xmin_eff%x_d, x, -this%move_limit, this%n)
250 call device_pwmax2(xmin_eff%x_d, this%xmin%x_d, this%n)
251
252 call device_cadd2(xmax_eff%x_d, x, this%move_limit, this%n)
253 call device_pwmin2(xmax_eff%x_d, this%xmax%x_d, this%n)
254 end if
255
256 call device_sub3(x_diff%x_d, xmax_eff%x_d, xmin_eff%x_d, this%n)
257 call device_cpwmax2(x_diff%x_d, 1.0e-5_rp, this%n)
258
259 ! ------------------------------------------------------------------------ !
260 ! Setup the current asymptotes
261
262 if (iter .lt. 3) then
263 call device_copy(this%low%x_d, x, this%n)
264 call device_add2s2(this%low%x_d, x_diff%x_d, - this%asyinit, this%n)
265 call device_copy(this%upp%x_d, x, this%n)
266 call device_add2s2(this%upp%x_d, x_diff%x_d, this%asyinit, this%n)
267 else
268 call device_mma_gensub2(this%low%x_d, this%upp%x_d, x, &
269 this%xold1%x_d, this%xold2%x_d, x_diff%x_d, &
270 this%asydecr, this%asyincr, this%n)
271 end if
272
273 ! ------------------------------------------------------------------------ !
274 ! Calculate p0j, q0j, pij, qij, alpha, and beta
275
276 call device_mma_gensub3(x, df0dx, dfdx, this%low%x_d, &
277 this%upp%x_d, xmin_eff%x_d, xmax_eff%x_d, this%alpha%x_d, &
278 this%beta%x_d, this%p0j%x_d, this%q0j%x_d, this%pij%x_d, &
279 this%qij%x_d, this%n, this%m)
280
281 ! ------------------------------------------------------------------------ !
282 ! Computing bi as defined in page 5
283
284 call device_mma_gensub4(x, this%low%x_d, this%upp%x_d, this%pij%x_d, &
285 this%qij%x_d, this%n, this%m, this%bi%x_d)
286
287 call device_memcpy(this%bi%x, this%bi%x_d, this%m, device_to_host, &
288 sync = .true.)
289 call mpi_allreduce(mpi_in_place, this%bi%x, this%m, &
290 mpi_real_precision, mpi_sum, neko_comm, ierr)
291 call device_memcpy(this%bi%x, this%bi%x_d, this%m, host_to_device, &
292 sync = .true.)
293
294 call device_sub2(this%bi%x_d, fval, this%m)
295
296 call this%scratch%relinquish(ind)
297 end subroutine mma_gensub_device
298
301 subroutine mma_subsolve_dpip_device(this, designx_d)
302 class(mma_t), intent(inout) :: this
303 type(c_ptr), intent(in) :: designx_d
304 integer :: iter, itto, ierr
305 real(kind=rp) :: epsi, residual_max, residual_norm, z, zeta, rez, rezeta, &
306 delz, dz, dzeta, steg, zold, zetaold, new_residual
307 ! vectors with size m
308 type(vector_t) , pointer :: y, lambda, s, mu, rey, relambda, remu, res, &
309 dely, dellambda, dy, dlambda, ds, dmu, yold, lambdaold, sold, muold
310
311 ! vectors with size n
312 type(vector_t), pointer :: x, xsi, eta, rex, rexsi, reeta, &
313 delx, diagx, dx, dxsi, deta, xold, xsiold, etaold
314
315 type(vector_t), pointer :: bb
316 type(matrix_t), pointer :: GG
317 type(matrix_t), pointer :: AA
318
319 integer :: info
320 real(kind=rp) :: re_sq_norm
321
322 integer :: ind(35)
323
324 real(kind=rp) :: minimal_epsilon
325
326 call this%scratch%request(y, ind(1), this%m, .false.)
327 call this%scratch%request(lambda, ind(2), this%m, .false.)
328 call this%scratch%request(s, ind(3), this%m, .false.)
329 call this%scratch%request(mu, ind(4), this%m, .false.)
330 call this%scratch%request(rey, ind(5), this%m, .false.)
331 call this%scratch%request(relambda, ind(6), this%m, .false.)
332 call this%scratch%request(remu, ind(7), this%m, .false.)
333 call this%scratch%request(res, ind(8), this%m, .false.)
334 call this%scratch%request(dely, ind(9), this%m, .false.)
335 call this%scratch%request(dellambda, ind(10), this%m, .false.)
336 call this%scratch%request(dy, ind(11), this%m, .false.)
337 call this%scratch%request(dlambda, ind(12), this%m, .false.)
338 call this%scratch%request(ds, ind(13), this%m, .false.)
339 call this%scratch%request(dmu, ind(14), this%m, .false.)
340 call this%scratch%request(yold, ind(15), this%m, .false.)
341 call this%scratch%request(lambdaold, ind(16), this%m, .false.)
342 call this%scratch%request(sold, ind(17), this%m, .false.)
343 call this%scratch%request(muold, ind(18), this%m, .false.)
344 call this%scratch%request(x, ind(19), this%n, .false.)
345 call this%scratch%request(xsi, ind(20), this%n, .false.)
346 call this%scratch%request(eta, ind(21), this%n, .false.)
347 call this%scratch%request(rex, ind(22), this%n, .false.)
348 call this%scratch%request(rexsi, ind(23), this%n, .false.)
349 call this%scratch%request(reeta, ind(24), this%n, .false.)
350 call this%scratch%request(delx, ind(25), this%n, .false.)
351 call this%scratch%request(diagx, ind(26), this%n, .false.)
352 call this%scratch%request(dx, ind(27), this%n, .false.)
353 call this%scratch%request(dxsi, ind(28), this%n, .false.)
354 call this%scratch%request(deta, ind(29), this%n, .false.)
355 call this%scratch%request(xold, ind(30), this%n, .false.)
356 call this%scratch%request(xsiold, ind(31), this%n, .false.)
357 call this%scratch%request(etaold, ind(32), this%n, .false.)
358 call this%scratch%request(bb, ind(33), this%m+1, .false.)
359
360 call this%scratch%request(gg, ind(34), this%m, this%n, .false.)
361 call this%scratch%request(aa, ind(35), this%m+1, this%m+1, .false.)
362
363 ! ------------------------------------------------------------------------ !
364 ! initial value for the parameters in the subsolve based on
365 ! page 15 of "https://people.kth.se/~krille/mmagcmma.pdf"
366
367 epsi = 1.0_rp !100
368 call device_add3s2(x%x_d, this%alpha%x_d, this%beta%x_d, 0.5_rp, 0.5_rp, &
369 this%n)
370 call device_cfill(y%x_d, 1.0_rp, this%m)
371 z = 1.0_rp
372 zeta = 1.0_rp
373 call device_cfill(lambda%x_d, 1.0_rp, this%m)
374 call device_cfill(s%x_d, 1.0_rp, this%m)
375 call device_mma_max(xsi%x_d, x%x_d, this%alpha%x_d, this%n)
376 call device_mma_max(eta%x_d, this%beta%x_d, x%x_d, this%n)
377 call device_max2(mu%x_d, 1.0_rp, this%c%x_d, 0.5_rp, this%m)
378
379 ! ------------------------------------------------------------------------ !
380 ! Computing the minimal epsilon and choose the most conservative one
381
382 minimal_epsilon = max(0.9_rp * this%epsimin, 1.0e-12_rp)
383 call mpi_allreduce(mpi_in_place, minimal_epsilon, 1, &
384 mpi_real_precision, mpi_min, neko_comm, ierr)
385
386 ! ------------------------------------------------------------------------ !
387 ! The main loop of the dual-primal interior point method.
388
389 do while (epsi .gt. minimal_epsilon)
390
391 ! --------------------------------------------------------------------- !
392 ! Calculating residuals based on
393 ! "https://people.kth.se/~krille/mmagcmma.pdf" for the variables
394 ! x, y, z, lambda residuals based on eq(5.9a)-(5.9d), respectively.
395
396 associate(p0j => this%p0j, q0j => this%q0j, &
397 pij => this%pij, qij => this%qij, &
398 low => this%low, upp => this%upp, &
399 alpha => this%alpha, beta => this%beta, &
400 c => this%c, d => this%d, &
401 a0 => this%a0, a => this%a)
402
403 call device_rex(rex%x_d, x%x_d, low%x_d, upp%x_d, &
404 pij%x_d, p0j%x_d, qij%x_d, q0j%x_d, &
405 lambda%x_d, xsi%x_d, eta%x_d, this%n, this%m)
406
407 call device_col3(rey%x_d, d%x_d, y%x_d, this%m)
408 call device_add2(rey%x_d, c%x_d, this%m)
409 call device_sub2(rey%x_d, lambda%x_d, this%m)
410 call device_sub2(rey%x_d, mu%x_d, this%m)
411 rez = a0 - zeta - device_lcsc2(lambda%x_d, a%x_d, this%m)
412
413 call device_cfill(relambda%x_d, 0.0_rp, this%m)
414 call device_relambda(relambda%x_d, x%x_d, this%upp%x_d, &
415 low%x_d, pij%x_d, qij%x_d, this%n, this%m)
416
417 end associate
418
419 ! --------------------------------------------------------------------- !
420 ! Computing the norm of the residuals
421
422 ! Complete the computations of lambda residuals
423 call device_memcpy(relambda%x, relambda%x_d, this%m, device_to_host, &
424 sync = .true.)
425 call mpi_allreduce(mpi_in_place, relambda%x, this%m, &
426 mpi_real_precision, mpi_sum, neko_comm, ierr)
427 call device_memcpy(relambda%x, relambda%x_d, this%m, host_to_device, &
428 sync = .true.)
429
430 call device_add2s2(relambda%x_d, this%a%x_d, -z, this%m)
431 call device_sub2(relambda%x_d, y%x_d, this%m)
432 call device_add2(relambda%x_d, s%x_d, this%m)
433 call device_sub2(relambda%x_d, this%bi%x_d, this%m)
434
435 call device_sub3(rexsi%x_d, x%x_d, this%alpha%x_d, this%n)
436 call device_col2(rexsi%x_d, xsi%x_d, this%n)
437 call device_cadd(rexsi%x_d, - epsi, this%n)
438
439 call device_sub3(reeta%x_d, this%beta%x_d, x%x_d, this%n)
440 call device_col2(reeta%x_d, eta%x_d, this%n)
441 call device_cadd(reeta%x_d, - epsi, this%n)
442
443 call device_col3(remu%x_d, mu%x_d, y%x_d, this%m)
444 call device_cadd(remu%x_d, - epsi, this%m)
445
446 rezeta = zeta * z - epsi
447
448 call device_col3(res%x_d, lambda%x_d, s%x_d, this%m)
449 call device_cadd(res%x_d, - epsi, this%m)
450
451 ! Setup vectors of residuals and their norms
452 residual_max = maxval([device_maxval(rex%x_d, this%n), &
453 device_maxval(rey%x_d, this%m), abs(rez), &
454 device_maxval(relambda%x_d, this%m), &
455 device_maxval(rexsi%x_d, this%n), &
456 device_maxval(reeta%x_d, this%n), &
457 device_maxval(remu%x_d, this%m), abs(rezeta), &
458 device_maxval(res%x_d, this%m)])
459
460 re_sq_norm = device_norm(rex%x_d, this%n) + &
461 device_norm(rexsi%x_d, this%n) + device_norm(reeta%x_d, this%n)
462
463 call mpi_allreduce(mpi_in_place, residual_max, 1, &
464 mpi_real_precision, mpi_max, neko_comm, ierr)
465
466 call mpi_allreduce(mpi_in_place, re_sq_norm, &
467 1, mpi_real_precision, mpi_sum, neko_comm, ierr)
468
469 residual_norm = sqrt(device_norm(rey%x_d, this%m) + &
470 rez**2 + &
471 device_norm(relambda%x_d, this%m) + &
472 device_norm(remu%x_d, this%m)+ &
473 rezeta**2 + &
474 device_norm(res%x_d, this%m) &
475 + re_sq_norm)
476
477 ! --------------------------------------------------------------------- !
478 ! Internal loop
479
480 do iter = 1, this%max_iter
481
482 if (residual_max .lt. epsi) exit
483
484 call device_delx(delx%x_d, x%x_d, this%low%x_d, this%upp%x_d, &
485 this%pij%x_d, this%qij%x_d, this%p0j%x_d, this%q0j%x_d, &
486 this%alpha%x_d, this%beta%x_d, lambda%x_d, epsi, this%n, &
487 this%m)
488
489 call device_col3(dely%x_d, this%d%x_d, y%x_d, this%m)
490 call device_add2(dely%x_d, this%c%x_d, this%m)
491 call device_sub2(dely%x_d, lambda%x_d, this%m)
492 call device_add2inv2(dely%x_d, y%x_d, - epsi, this%m)
493 delz = this%a0 - device_lcsc2(lambda%x_d, this%a%x_d, this%m) - epsi/z
494
495 ! Accumulate sums for dellambda (the term gi(x))
496 call device_cfill(dellambda%x_d, 0.0_rp, this%m)
497 call device_relambda(dellambda%x_d, x%x_d, this%upp%x_d, &
498 this%low%x_d, this%pij%x_d, this%qij%x_d, this%n, this%m)
499
500 call device_memcpy(dellambda%x, dellambda%x_d, this%m, &
501 device_to_host, sync = .true.)
502 call mpi_allreduce(mpi_in_place, dellambda%x, this%m, &
503 mpi_real_precision, mpi_sum, neko_comm, ierr)
504 call device_memcpy(dellambda%x, dellambda%x_d, this%m, &
505 host_to_device, sync = .true.)
506
507 call device_add3s2(dellambda%x_d, dellambda%x_d, this%a%x_d, &
508 1.0_rp, -z, this%m)
509 call device_sub2(dellambda%x_d, y%x_d, this%m)
510 call device_sub2(dellambda%x_d, this%bi%x_d, this%m)
511 call device_add2inv2(dellambda%x_d, lambda%x_d, epsi, this%m)
512
513 call device_gg(gg%x_d, x%x_d, this%low%x_d, this%upp%x_d, &
514 this%pij%x_d, this%qij%x_d, this%n, this%m)
515
516 call device_diagx(diagx%x_d, x%x_d, xsi%x_d, this%low%x_d, &
517 this%upp%x_d, this%p0j%x_d, this%q0j%x_d, this%pij%x_d, &
518 this%qij%x_d, this%alpha%x_d, this%beta%x_d, eta%x_d, &
519 lambda%x_d, this%n, this%m)
520
521 !Here we only consider the case m<n in the matlab code
522 !assembling the right hand side matrix based on eq(5.20)
523 ! bb = [dellambda + dely/(this%d%x + &
524 ! (mu/y)) - matmul(GG,delx/diagx), delz ]
525
526 !--------------------------------------------------------------------!
527 ! for MPI computation of bb
528
529 call device_bb(bb%x_d, gg%x_d, delx%x_d, diagx%x_d, this%n, &
530 this%m)
531
532 call device_memcpy(bb%x, bb%x_d, this%m + 1, device_to_host, &
533 sync = .true.)
534 call mpi_allreduce(mpi_in_place, bb%x, this%m + 1, &
535 mpi_real_precision, mpi_sum, neko_comm, ierr)
536 call device_memcpy(bb%x, bb%x_d, this%m + 1, &
537 host_to_device, sync = .true.)
538
539 call device_updatebb(bb%x_d, dellambda%x_d, dely%x_d, &
540 this%d%x_d, mu%x_d, y%x_d, delz, this%m)
541
542 ! assembling the coefficients matrix AA based on eq(5.20)
543 ! AA(1:this%m,1:this%m) = &
544 ! matmul(matmul(GG,mma_diag(1/diagx)), transpose(GG))
545 ! !update diag(AA)
546 ! AA(1:this%m,1:this%m) = AA(1:this%m,1:this%m) + &
547 ! mma_diag(s/lambda + 1.0/(this%d%x + (mu/y)))
548
549 call device_cfill(aa%x_d, 0.0_rp, (this%m+1) * (this%m+1))
550 call device_aa(aa%x_d, gg%x_d, diagx%x_d, this%n, this%m)
551
552 call device_memcpy(aa%x, aa%x_d, (this%m+1) * (this%m+1), &
553 device_to_host, sync = .true.)
554 call mpi_allreduce(mpi_in_place, aa%x, &
555 (this%m + 1)**2, mpi_real_precision, mpi_sum, neko_comm, ierr)
556 call device_memcpy(aa%x, aa%x_d, (this%m+1) * (this%m+1), &
557 host_to_device, sync = .true.)
558
559 call device_prepare_aa_matrix(aa%x_d, s%x_d, lambda%x_d, &
560 this%d%x_d, mu%x_d, y%x_d, this%a%x_d, zeta, z, this%m)
561
562 ! Device solve for the linear system
563 call device_solve_linear_system(aa%x_d, bb%x_d, this%m + 1, info)
564 if (info .ne. 0) then
565 call neko_error("Linear solver failed on the device in " // &
566 "mma_subsolve_dpip")
567 end if
568
569 call device_copy(dlambda%x_d, bb%x_d, this%m)
570
571
572 !We need to write the last element of bb to dz so this is necessary
573 call device_memcpy(bb%x, bb%x_d, this%m+1, device_to_host, &
574 sync = .true.)
575 dz = bb%x(this%m + 1)
576
577
578 ! based on eq(5.19)
579 call device_dx(dx%x_d, delx%x_d, diagx%x_d, gg%x_d, &
580 dlambda%x_d, this%n, this%m)
581 call device_dy(dy%x_d, dely%x_d, dlambda%x_d, this%d%x_d, &
582 mu%x_d, y%x_d, this%m)
583 call device_dxsi(dxsi%x_d, xsi%x_d, dx%x_d, x%x_d, &
584 this%alpha%x_d, epsi, this%n)
585 call device_deta(deta%x_d, eta%x_d, dx%x_d, x%x_d, &
586 this%beta%x_d, epsi, this%n)
587
588 call device_col3(dmu%x_d, mu%x_d, dy%x_d, this%m)
589 call device_cmult(dmu%x_d, -1.0_rp, this%m)
590 call device_cadd(dmu%x_d, epsi, this%m)
591 call device_invcol2(dmu%x_d, y%x_d, this%m)
592 call device_sub2(dmu%x_d, mu%x_d, this%m)
593 dzeta = -zeta + (epsi - zeta * dz) / z
594 call device_col3(ds%x_d, dlambda%x_d, s%x_d, this%m)
595 call device_cmult(ds%x_d, -1.0_rp, this%m)
596 call device_cadd(ds%x_d, epsi, this%m)
597 call device_invcol2(ds%x_d, lambda%x_d, this%m)
598 call device_sub2(ds%x_d, s%x_d, this%m)
599
600 steg = maxval([1.0_rp, &
601 device_maxval2(dy%x_d, y%x_d, -1.01_rp, this%m), &
602 -1.01_rp * dz / z, &
603 device_maxval2(dlambda%x_d, lambda%x_d, -1.01_rp, this%m), &
604 device_maxval2(dxsi%x_d, xsi%x_d, -1.01_rp, this%n), &
605 device_maxval2(deta%x_d, eta%x_d, -1.01_rp, this%n), &
606 device_maxval2(dmu%x_d, mu%x_d, -1.01_rp, this%m), &
607 -1.01_rp * dzeta / zeta, &
608 device_maxval2(ds%x_d, s%x_d, -1.01_rp, this%m), &
609 device_maxval3(dx%x_d, x%x_d, this%alpha%x_d, -1.01_rp, this%n),&
610 device_maxval3(dx%x_d, this%beta%x_d, x%x_d, 1.01_rp, this%n)])
611
612 steg = 1.0_rp / steg
613
614 call device_copy(xold%x_d, x%x_d, this%n)
615 call device_copy(yold%x_d, y%x_d, this%m)
616 zold = z
617 call device_copy(lambdaold%x_d, lambda%x_d, this%m)
618 call device_copy(xsiold%x_d, xsi%x_d, this%n)
619 call device_copy(etaold%x_d, eta%x_d, this%n)
620 call device_copy(muold%x_d, mu%x_d, this%m)
621 zetaold = zeta
622 call device_copy(sold%x_d, s%x_d, this%m)
623
624 new_residual = 2.0_rp * residual_norm
625
626 ! Share the new_residual and steg values
627 call mpi_allreduce(mpi_in_place, steg, 1, &
628 mpi_real_precision, mpi_min, neko_comm, ierr)
629 call mpi_allreduce(mpi_in_place, new_residual, 1, &
630 mpi_real_precision, mpi_min, neko_comm, ierr)
631
632 ! The innermost loop to determine the suitable step length
633 ! using the Backtracking Line Search approach
634 itto = 0
635 do while ((new_residual .gt. residual_norm) .and. (itto .lt. 50))
636 itto = itto + 1
637
638 ! update the variables
639 call device_add3s2(x%x_d, xold%x_d, dx%x_d, 1.0_rp, steg, this%n)
640 call device_add3s2(y%x_d, yold%x_d, dy%x_d, 1.0_rp, steg, this%m)
641 z = zold + steg*dz
642 call device_add3s2(lambda%x_d, lambdaold%x_d, &
643 dlambda%x_d, 1.0_rp, steg, this%m)
644 call device_add3s2(xsi%x_d, xsiold%x_d, dxsi%x_d, &
645 1.0_rp, steg, this%n)
646 call device_add3s2(eta%x_d, etaold%x_d, deta%x_d, &
647 1.0_rp, steg, this%n)
648 call device_add3s2(mu%x_d, muold%x_d, dmu%x_d, &
649 1.0_rp, steg, this%m)
650 zeta = zetaold + steg*dzeta
651 call device_add3s2(s%x_d, sold%x_d, ds%x_d, 1.0_rp, &
652 steg, this%m)
653
654 ! Recompute the new_residual to see if this stepsize improves
655 ! the residue
656 call device_rex(rex%x_d, x%x_d, this%low%x_d, &
657 this%upp%x_d, this%pij%x_d, this%p0j%x_d, &
658 this%qij%x_d, this%q0j%x_d, lambda%x_d, xsi%x_d, &
659 eta%x_d, this%n, this%m)
660
661 call device_col3(rey%x_d, this%d%x_d, y%x_d, this%m)
662 call device_add2(rey%x_d, this%c%x_d, this%m)
663 call device_sub2(rey%x_d, lambda%x_d, this%m)
664 call device_sub2(rey%x_d, mu%x_d, this%m)
665
666 rez = this%a0 - zeta - device_lcsc2(lambda%x_d, this%a%x_d, this%m)
667
668 ! Accumulate sums for relambda (the term gi(x))
669 call device_cfill(relambda%x_d, 0.0_rp, this%m)
670 call device_relambda(relambda%x_d, x%x_d, this%upp%x_d, &
671 this%low%x_d, this%pij%x_d, this%qij%x_d, &
672 this%n, this%m)
673
674 call device_memcpy(relambda%x, relambda%x_d, this%m, &
675 device_to_host, sync = .true.)
676 call mpi_allreduce(mpi_in_place, relambda%x, this%m, &
677 mpi_real_precision, mpi_sum, neko_comm, ierr)
678 call device_memcpy(relambda%x, relambda%x_d, &
679 this%m, host_to_device, sync = .true.)
680
681 call device_add3s2(relambda%x_d, relambda%x_d, &
682 this%a%x_d, 1.0_rp, -z, this%m)
683 call device_sub2(relambda%x_d, y%x_d, this%m)
684 call device_add2(relambda%x_d, s%x_d, this%m)
685 call device_sub2(relambda%x_d, this%bi%x_d, this%m)
686
687 call device_sub3(rexsi%x_d, x%x_d, this%alpha%x_d, this%n)
688 call device_col2(rexsi%x_d, xsi%x_d, this%n)
689 call device_cadd(rexsi%x_d, - epsi, this%n)
690
691 call device_sub3(reeta%x_d, this%beta%x_d, x%x_d, this%n)
692 call device_col2(reeta%x_d, eta%x_d, this%n)
693 call device_cadd(reeta%x_d, - epsi, this%n)
694
695 call device_col3(remu%x_d, mu%x_d, y%x_d, this%m)
696 call device_cadd(remu%x_d, - epsi, this%m)
697
698 rezeta = zeta*z - epsi
699
700 call device_col3(res%x_d, lambda%x_d, s%x_d, this%m)
701 call device_cadd(res%x_d, - epsi, this%m)
702
703 ! Compute squared norms for the residuals
704 re_sq_norm = device_norm(rex%x_d, this%n) + &
705 device_norm(rexsi%x_d, this%n) + &
706 device_norm(reeta%x_d, this%n)
707 call mpi_allreduce(mpi_in_place, re_sq_norm, 1, &
708 mpi_real_precision, mpi_sum, neko_comm, ierr)
709
710 new_residual = sqrt(device_norm(rey%x_d, this%m) + &
711 rez**2 + &
712 device_norm(relambda%x_d, this%m) + &
713 device_norm(remu%x_d, this%m) + &
714 rezeta**2 + &
715 device_norm(res%x_d, this%m) + &
716 re_sq_norm)
717
718 call mpi_allreduce(mpi_in_place, new_residual, 1, &
719 mpi_real_precision, mpi_sum, neko_comm, ierr)
720
721 steg = steg / 2.0_rp
722
723 end do
724 steg = 2.0_rp * steg ! Correction for the final division by 2
725
726 ! Update the maximum and norm of the residuals
727 residual_norm = new_residual
728 residual_max = maxval([ &
729 device_maxval(rex%x_d, this%n), &
730 device_maxval(rey%x_d, this%m), &
731 abs(rez), &
732 device_maxval(relambda%x_d, this%m), &
733 device_maxval(rexsi%x_d, this%n), &
734 device_maxval(reeta%x_d, this%n), &
735 device_maxval(remu%x_d, this%m), &
736 abs(rezeta), &
737 device_maxval(res%x_d, this%m)])
738
739 call mpi_allreduce(mpi_in_place, residual_max, 1, &
740 mpi_real_precision, mpi_max, neko_comm, ierr)
741
742 end do
743
744 epsi = 0.1_rp * epsi
745 end do
746
747 ! Save the new designx
748 call device_copy(this%xold2%x_d, this%xold1%x_d, this%n)
749 call device_copy(this%xold1%x_d, designx_d, this%n)
750 call device_copy(designx_d, x%x_d, this%n)
751
752 ! update the parameters of the MMA object nesessary to compute KKT residual
753 call device_copy(this%y%x_d, y%x_d, this%m)
754 this%z = z
755 call device_copy(this%lambda%x_d, lambda%x_d, this%m)
756 this%zeta = zeta
757 call device_copy(this%xsi%x_d, xsi%x_d, this%n)
758 call device_copy(this%eta%x_d, eta%x_d, this%n)
759 call device_copy(this%mu%x_d, mu%x_d, this%m)
760 call device_copy(this%s%x_d, s%x_d, this%m)
761
762 !free all the initiated variables in this subroutine
763 call this%scratch%relinquish(ind)
764 end subroutine mma_subsolve_dpip_device
765
768 subroutine mma_subsolve_dip_device(this, designx_d)
769 class(mma_t), intent(inout) :: this
770 type(c_ptr), intent(in) :: designx_d
771 integer :: iter, ierr
772 real(kind=rp) :: epsi, residumax, z, steg
773 ! vectors with size m
774 type(vector_t), pointer :: y, lambda, mu, relambda, remu, dlambda, dmu, &
775 gradlambda, zerom, dd, dummy_m
776 ! vectors with size n
777 type(vector_t), pointer :: x, pjlambda, qjlambda
778
779 ! inverse of a diag matrix:
780 type(vector_t), pointer :: Ljjxinv ! [∇_x^2 Ljj]−1
781 type(matrix_t), pointer :: hijx ! ∇_x hij
782 type(matrix_t), pointer :: Hess
783
784 integer :: info, ind(17)
785
786 real(kind=rp) :: minimal_epsilon
787
788 call this%scratch%request(y, ind(1), this%m, .false.)
789 call this%scratch%request(lambda, ind(2), this%m, .false.)
790 call this%scratch%request(mu, ind(3), this%m, .false.)
791 call this%scratch%request(relambda, ind(4), this%m, .false.)
792 call this%scratch%request(remu, ind(5), this%m, .false.)
793 call this%scratch%request(dlambda, ind(6), this%m, .false.)
794 call this%scratch%request(dmu, ind(7), this%m, .false.)
795 call this%scratch%request(gradlambda, ind(8), this%m, .false.)
796 call this%scratch%request(zerom, ind(9), this%m, .false.)
797 call this%scratch%request(dd, ind(10), this%m, .false.)
798 call this%scratch%request(dummy_m, ind(11), this%m, .false.)
799
800 call this%scratch%request(x, ind(12), this%n, .false.)
801 call this%scratch%request(pjlambda,ind(13), this%n, .false.)
802 call this%scratch%request(qjlambda, ind(14), this%n, .false.)
803
804 call this%scratch%request(ljjxinv, ind(15), this%n, .false.)
805
806 call this%scratch%request(hijx, ind(16), this%m, this%n, .false.)
807 call this%scratch%request(hess, ind(17), this%m, this%m, .false.)
808
809 ! ------------------------------------------------------------------------ !
810 ! initial value for the parameters in the subsolve based on
811 ! page 15 of "https://people.kth.se/~krille/mmagcmma.pdf"
812
813 epsi = 1.0_rp !100
814 call device_cfill(y%x_d, 1.0_rp, this%m)
815 ! initialize lambda with an array of ones (change to this%c%x/2 if needed!)
816 call device_cfill(lambda%x_d, 1.0_rp, this%m)
817 call device_cmult2(dummy_m%x_d, this%c%x_d, 0.5_rp, this%m)
818 call device_pwmax2(lambda%x_d, dummy_m%x_d, this%m)
819
820 call device_cfill(mu%x_d, 1.0_rp, this%m)
821 z = 0.0_rp
822
823 ! ------------------------------------------------------------------------ !
824 ! Computing the minimal epsilon and choose the most conservative one
825
826 minimal_epsilon = max(0.9_rp * this%epsimin, 1.0e-12_rp)
827 call mpi_allreduce(mpi_in_place, minimal_epsilon, 1, &
828 mpi_real_precision, mpi_min, neko_comm, ierr)
829
830 ! ------------------------------------------------------------------------ !
831 ! The main loop of the dual-primal interior point method.
832
833 outer: do while (epsi .gt. minimal_epsilon)
834 ! calculating residuals based on
835 ! "https://people.kth.se/~krille/mmagcmma.pdf" for the variables
836 ! x, y, z, lambda residuals based on eq(5.9a)-(5.9d), respectively.
837 associate(p0j => this%p0j, q0j => this%q0j, &
838 pij => this%pij, qij => this%qij, &
839 low => this%low, upp => this%upp, &
840 alpha => this%alpha, beta => this%beta, &
841 c => this%c, a0 => this%a0, a => this%a)
842
843 ! minimize(L_x, L_y, L_z) and compute x(λ), y(λ), z(λ) for
844 ! the initial value of λ
845
846 ! Comput the value of y that minimizes L_y for the current λ
847 ! minimize (sum_{i=1}^{m} [ (c_i - λ_i) * y_i + 0.5 * y_i^2 ])
848 ! dL_y/dy =0 => y= (λ_i - c_i), ensure y>=0
849 call device_sub3(y%x_d, lambda%x_d, c%x_d, this%m)
850 call device_pwmax2(y%x_d, zerom%x_d, this%m)
851
852 ! Comput the value of z that minimizes L_z for the current λ
853 ! minimize ((a_0 - sum_{i=1}^{m} λ_i * a_i) * z + 0.5 * z^2)
854 ! ensure z>=0
855 call device_col3(dummy_m%x_d, lambda%x_d, a%x_d, this%m)
856 z = device_glsum(dummy_m%x_d, this%m)
857 z = max(0.0_rp, z - a0)
858
859 ! Comput the value of x that minimizes L_x for the current λ
860 ! minimize( sum_{j=1}^{n} [ (p_{0j} + sum_{i=1}^{m} λ_i *
861 ! p_{ij}) / (u_j - x_j) + (q_{0j} + sum_{i=1}^{m} λ_i * q_{ij}) /
862 ! (x_j - l_j) ] - sum_{i=1}^{m} λ_i * b_i)
863 call device_mattrans_v_mul(pjlambda%x_d, pij%x_d, lambda%x_d, this%m, this%n)
864 call device_mattrans_v_mul(qjlambda%x_d, qij%x_d, lambda%x_d, this%m, this%n)
865 call device_add2(pjlambda%x_d, p0j%x_d, this%n)
866 call device_add2(qjlambda%x_d, q0j%x_d, this%n)
867
868 call device_mma_dipsolvesub1(x%x_d, pjlambda%x_d, qjlambda%x_d, &
869 low%x_d, upp%x_d, alpha%x_d, beta%x_d, this%n)
870
871 call device_cfill(relambda%x_d, 0.0_rp, this%m)
872 call device_relambda(relambda%x_d, x%x_d, this%upp%x_d, &
873 low%x_d, pij%x_d, qij%x_d, this%n, this%m)
874
875 ! Global comminucation for relambda values
876
877 call device_memcpy(relambda%x, relambda%x_d, this%m, device_to_host, &
878 sync = .true.)
879 call mpi_allreduce(mpi_in_place, relambda%x, this%m, &
880 mpi_real_precision, mpi_sum, neko_comm, ierr)
881 call device_memcpy(relambda%x, relambda%x_d, this%m, &
882 host_to_device, sync = .true.)
883
884 call device_add2s2(relambda%x_d, this%a%x_d, -z, this%m)
885 call device_sub2(relambda%x_d, y%x_d, this%m)
886 call device_add2(relambda%x_d, mu%x_d, this%m)
887 call device_sub2(relambda%x_d, this%bi%x_d, this%m)
888
889 call device_col3(remu%x_d, mu%x_d, lambda%x_d, this%m)
890 call device_cadd(remu%x_d, -epsi, this%m)
891
892 residumax = maxval([device_maxval(relambda%x_d, this%m), &
893 device_maxval(remu%x_d, this%m)])
894
895 ! ------------------------------------------------------------------- !
896 ! Internal loop
897 do iter = 1, this%max_iter
898 !Check the condition
899 if (residumax .lt. epsi) exit
900
901 ! Compute dL(x, y, z, λ)/dλ for the updated x(λ), y(λ), z(λ)
902 ! based on the implementation in the following paper by Niels
903 ! https://doi.org/10.1007/s00158-012-0869-2
904 ! (https://github.com/topopt/TopOpt_in_PETSc/blob/master/MMA.cc)
905 ! The formula for gradlambda and relambda are basically the same:
906 ! thus, we utilise gradlambda = relambda - mu for efficiency
907 call device_copy(gradlambda%x_d, relambda%x_d, this%m)
908 call device_sub2(gradlambda%x_d, mu%x_d, this%m)
909
910 ! Update gradlambda as the right hand side for Newton's method(eq10)
911 call device_cfill(dummy_m%x_d, epsi, this%m)
912 call device_invcol2(dummy_m%x_d, lambda%x_d, this%m)
913 call device_add2(gradlambda%x_d, dummy_m%x_d, this%m)
914 call device_cmult(gradlambda%x_d, -1.0_rp, this%m)
915
916 ! Computing the Hessian as in equation (13) in
917 !! https://doi.org/10.1007/s00158-012-0869-2
918
919 !--------------contributions of x terms to Hess--------------------!
920 call device_mma_ljjxinv(ljjxinv%x_d, pjlambda%x_d, qjlambda%x_d, &
921 x%x_d, low%x_d, upp%x_d, alpha%x_d, beta%x_d, this%n)
922
923 call device_gg(hijx%x_d, x%x_d, this%low%x_d, this%upp%x_d, &
924 this%pij%x_d, this%qij%x_d, this%n, this%m)
925
926 call device_cfill(hess%x_d, 0.0_rp, (this%m) * (this%m) )
927 call device_hess(hess%x_d, hijx%x_d, ljjxinv%x_d, this%n, this%m)
928
929 ! download Hess to CPU, mpi reduce, upload to the device
930 call device_memcpy(hess%x, hess%x_d, this%m*this%m, device_to_host, &
931 sync = .true.)
932 call mpi_allreduce(mpi_in_place, hess%x, &
933 this%m*this%m, mpi_real_precision, mpi_sum, neko_comm, ierr)
934 call device_memcpy(hess%x, hess%x_d, this%m*this%m, &
935 host_to_device, sync = .true.)
936
937 !---------------contributions of z terms to Hess-------------------!
938 ! Only for inactive constraint, we consider contributions to Hess
939 ! based on the cpp code by Niels.
940 call device_col3(dummy_m%x_d, lambda%x_d, a%x_d, this%m)
941 if (device_glsum(dummy_m%x_d, this%m) .gt. 0.0_rp) then
942 call device_update_hessian_z(hess%x_d, a%x_d, this%m)
943 end if
944
945 !---------------contributions of y terms to Hess-------------------!
946 ! Only for inactive constraint, we consider contributions to Hess.
947 ! Note that if d(i) = 0, the y terms (just like z terms) will not
948 ! contribute to the Hessian matrix.
949 ! Note that since we use DGESV to solve LSE on CPU, we dont need
950 ! cuda kernel for this part
951 ! Also, improve the robustness by stablizing the Hess using
952 ! Levenberg-Marquardt algorithm (heuristically)
953 call device_prepare_hessian(hess%x_d, y%x_d, mu%x_d, lambda%x_d, &
954 this%m)
955
956 ! Device solve for the linear system
957 call device_solve_linear_system(hess%x_d, gradlambda%x_d, &
958 this%m, info)
959 if (info .ne. 0) then
960 call neko_error("Linear solver failed on the device in " // &
961 "mma_subsolve_dip")
962 end if
963
964 call device_copy(dlambda%x_d, gradlambda%x_d, this%m)
965
966 ! based on eq(11) for delta eta
967 call device_copy(dummy_m%x_d, dlambda%x_d, this%m)
968 call device_col2(dummy_m%x_d, mu%x_d, this%m)
969 call device_invcol2(dummy_m%x_d, lambda%x_d, this%m)
970
971 call device_cfill(dmu%x_d, epsi, this%m)
972 call device_invcol2(dmu%x_d, lambda%x_d, this%m)
973 call device_add2s2(dmu%x_d, dummy_m%x_d, -1.0_rp, this%m)
974 call device_sub2(dmu%x_d, mu%x_d, this%m)
975
976 steg = maxval([1.005_rp, device_maxval2(dlambda%x_d, lambda%x_d, &
977 -1.01_rp, this%m), device_maxval2(dmu%x_d, mu%x_d, -1.01_rp, &
978 this%m)])
979 steg = 1.0_rp / steg
980
981 call device_add2s2(lambda%x_d, dlambda%x_d, steg, this%m)
982 call device_add2s2(mu%x_d, dmu%x_d, steg, this%m)
983
984 ! minimize(L_x, L_y, L_z) and compute x(λ), y(λ), z(λ) for
985 ! the updated values of λ
986
987 ! Comput the value of y that minimizes L_y for the current λ
988 ! minimize (sum_{i=1}^{m} [ (c_i - λ_i) * y_i + 0.5 * y_i^2 ])
989 ! dL_y/dy =0 => y= (λ_i - c_i), ensure y>=0
990 call device_sub3(y%x_d, lambda%x_d, c%x_d, this%m)
991 call device_pwmax2(y%x_d, zerom%x_d, this%m)
992
993 ! Comput the value of z that minimizes L_z for the current λ
994 ! minimize ((a_0 - sum_{i=1}^{m} λ_i * a_i) * z + 0.5 * z^2)
995 ! ensure z>=0
996 call device_col3(dummy_m%x_d, lambda%x_d, a%x_d, this%m)
997 z = device_glsum(dummy_m%x_d, this%m)
998 z = max(0.0_rp, z - a0)
999
1000 ! Comput the value of x that minimizes L_x for the current λ
1001 ! minimize( sum_{j=1}^{n} [ (p_{0j} + sum_{i=1}^{m} λ_i *
1002 ! p_{ij}) / (u_j - x_j) + (q_{0j} + sum_{i=1}^{m} λ_i * q_{ij}) /
1003 ! (x_j - l_j) ] - sum_{i=1}^{m} λ_i * b_i)
1004 call device_mattrans_v_mul(pjlambda%x_d, pij%x_d, lambda%x_d, this%m, this%n)
1005 call device_mattrans_v_mul(qjlambda%x_d, qij%x_d, lambda%x_d, this%m, this%n)
1006 call device_add2(pjlambda%x_d, p0j%x_d, this%n)
1007 call device_add2(qjlambda%x_d, q0j%x_d, this%n)
1008
1009 call device_mma_dipsolvesub1(x%x_d, pjlambda%x_d, qjlambda%x_d, &
1010 low%x_d, upp%x_d, alpha%x_d, beta%x_d, this%n)
1011
1012 ! Compute the residual for the lambda and mu using eq(9) and eq(15)
1013
1014 call device_cfill(relambda%x_d, 0.0_rp, this%m)
1015 call device_relambda(relambda%x_d, x%x_d, this%upp%x_d, &
1016 low%x_d, pij%x_d, qij%x_d, this%n, this%m)
1017
1018 ! Global comminucation for relambda values
1019
1020 call device_memcpy(relambda%x, relambda%x_d, this%m, device_to_host, &
1021 sync = .true.)
1022 call mpi_allreduce(mpi_in_place, relambda%x, this%m, &
1023 mpi_real_precision, mpi_sum, neko_comm, ierr)
1024 call device_memcpy(relambda%x, relambda%x_d, this%m, &
1025 host_to_device, sync = .true.)
1026
1027 call device_add2s2(relambda%x_d, this%a%x_d, -z, this%m)
1028 call device_sub2(relambda%x_d, y%x_d, this%m)
1029 call device_add2(relambda%x_d, mu%x_d, this%m)
1030 call device_sub2(relambda%x_d, this%bi%x_d, this%m)
1031
1032 call device_col3(remu%x_d, mu%x_d, lambda%x_d, this%m)
1033 call device_cadd(remu%x_d, -epsi, this%m)
1034
1035 residumax = maxval([device_maxval(relambda%x_d, this%m), &
1036 device_maxval(remu%x_d, this%m)])
1037 end do
1038 end associate
1039 epsi = 0.1_rp * epsi
1040 end do outer
1041
1042 ! Save the new designx
1043 call device_copy(this%xold2%x_d, this%xold1%x_d, this%n)
1044 call device_copy(this%xold1%x_d, designx_d, this%n)
1045 call device_copy(designx_d, x%x_d, this%n)
1046
1047 ! update the parameters of the MMA object nesessary to compute KKT residual
1048 call device_copy(this%y%x_d, y%x_d, this%m)
1049 this%z = z
1050 call device_copy(this%lambda%x_d, lambda%x_d, this%m)
1051 call device_copy(this%mu%x_d, mu%x_d, this%m)
1052
1053 call this%scratch%relinquish(ind)
1054 end subroutine mma_subsolve_dip_device
1055
1056end submodule mma_device
MMA module.
Definition mma.f90:69