37#ifndef MMA_HIP_KERNEL_H
38#define MMA_HIP_KERNEL_H
62 const T zeta,
const T z,
const int m) {
68 (
T)1.0 / (d[
tj] + mu[
tj] / y[
tj]);
106 for (
int j = 0;
j < m;
j++) {
112 for (
int j = 0;
j < m;
j++) {
135 if (
tid == 0 &&
abs(
A[0]) > (
T)1
e-12)
b[0] /=
A[0];
140 for (
int k = 0; k < n; k++) {
145 for (
int i = k + 1; i < n; i++) {
154 for (
int j = k;
j < n;
j++) {
155 T temp =
A[k * n +
j];
170 for (
int i =
tid + k + 1; i < n; i +=
blockDim.x) {
173 for (
int j = k + 1;
j < n;
j++) {
184 for (
int j = 0;
j < i;
j++) {
192 for (
int i = n - 1 -
tid; i >= 0; i -=
blockDim.x) {
195 for (
int j = i + 1;
j < n;
j++) {
198 if (
abs(
A[i * n + i]) > (
T)1
e-12) {
199 b[i] =
sum /
A[i * n + i];
293 const int m,
const int n) {
301 for (
int i = 0; i < m; ++i) {
302 acc += pij[i +
tj * m] * lambda[i];
334template<
typename T >
338 const T asydecr,
const T asyincr,
const int n) {
356 (
prod >
T(0)) ? asyincr :
T(1);
425 for (
int i = 0; i < m; ++i) {
426 int idx = i +
tj * m;
467 for (
int i = 0; i < m; ++i) {
513 for (
int i = 0; i < m; ++i) {
529 if (
idx >= n)
return;
546template<
typename T >
557 for (
int i =
idx; i < n; i +=
str) {
653 for (
int i = 0; i < m; i++) {
692 for (
int i = 0; i < m; i++) {
693 int idx = i +
tj * m;
705 const T* eta,
const T* lambda,
const int n,
const int m) {
710 for (
int i = 0; i < m; i++) {
711 sum =
sum + pij[
tj *m+ i] * lambda[i];
717 eta[
tj] / (beta[
tj] - x[
tj]);
736 for (
int i =
idx; i < n; i +=
str)
759template<
typename T >
761 const int n,
const int m,
const int k) {
771 for (
int i =
idx; i < n; i +=
str)
773 sum += a[m * i + k ];
789template<
typename T >
802 for (
int i =
idx; i < n; i +=
str)
871template<
typename T >
874 const int m,
const int k0,
const int k1) {
884 for (
int i =
idx; i < n; i +=
str)
906 const int n,
const int m) {
919 for (
int i0 = 0;
i0 < m;
i0++) {
920 for (
int i1 = 0;
i1 < m;
i1++) {
921 temp[
tj +
i0 * (n + 1) +
i1 * (m + 1) * (n + 1)] =
GG[
i0 * n +
tj] *
936 for(
int i=0;i<m;i++){
989 for (
int i = 0; i < m; ++i) {
1001template <
typename T>
1012template<
typename T >
1024 for (
int i =
idx; i < n; i +=
str)
1045template <
typename T>
1048 const T d,
const int n) {
1056template <
typename T>
1068template<
typename T >
1080 for (
int i =
idx; i < n; i +=
str)
1100template<
typename T >
1112 for (
int i =
idx; i < n; i +=
str)
1132template <
typename T>
1140 for (
int i = 0; i < m; i++) {
1148template <
typename T>
1159 template<
typename T >
1169 for (
int i =
idx; i<n ; i +=
str)
1187 template<
typename T >
1201 for (
int i =
idx; i < n; i+=
str) {
1223template <
typename T>
1225 const T c,
const int n) {
1232template <
typename T>
1241template <
typename T>
1255template <
typename T>
1260 const T zeta,
const T z,
const int m) {
1265 1.0/ (d[
tj] + mu[
tj] / y[
tj]));
1270 AA[
tj+
tj*(m+1)]= -zeta/z;
1273template <
typename T>
__global__ void heaviside_mapping_apply_kernel(const T beta, const T eta, T *__restrict__ X_out_d, T *__restrict__ X_in_d, const int n)