37#ifndef MMA_CUDA_KERNEL_H
38#define MMA_CUDA_KERNEL_H
62 const T zeta,
const T z,
const int m) {
68 (
T)1.0 / (d[
tj] + mu[
tj] / y[
tj]);
106 for (
int j = 0;
j < m;
j++) {
112 for (
int j = 0;
j < m;
j++) {
135 if (
tid == 0 &&
abs(
A[0]) > (
T)1
e-12)
b[0] /=
A[0];
140 for (
int k = 0; k < n; k++) {
145 for (
int i = k + 1; i < n; i++) {
154 for (
int j = k;
j < n;
j++) {
155 T temp =
A[k * n +
j];
170 for (
int i =
tid + k + 1; i < n; i +=
blockDim.x) {
173 for (
int j = k + 1;
j < n;
j++) {
184 for (
int j = 0;
j < i;
j++) {
192 for (
int i = n - 1 -
tid; i >= 0; i -=
blockDim.x) {
195 for (
int j = i + 1;
j < n;
j++) {
198 if (
abs(
A[i * n + i]) > (
T)1
e-12) {
199 b[i] =
sum /
A[i * n + i];
292 const int m,
const int n) {
300 for (
int i = 0; i < m; ++i) {
301 acc += pij[i +
tj * m] * lambda[i];
334template<
typename T >
338 const T asydecr,
const T asyincr,
const int n) {
356 (
prod >
T(0)) ? asyincr :
T(1);
425 for (
int i = 0; i < m; ++i) {
426 int idx = i +
tj * m;
467 for (
int i = 0; i < m; ++i) {
516 for (
int i = 0; i < m; ++i) {
532 if (
idx >= n)
return;
668 for (
int i = 0; i < m; i++) {
707 for (
int i = 0; i < m; i++) {
708 int idx = i +
tj * m;
721 const T* eta,
const T* lambda,
const int n,
const int m) {
726 for (
int i = 0; i < m; i++) {
727 sum =
sum + pij[
tj *m+ i] * lambda[i];
733 eta[
tj] / (beta[
tj] - x[
tj]);
756 for (
int i =
idx; i < n; i +=
str)
779template<
typename T >
781 const int n,
const int m,
const int k) {
791 for (
int i =
idx; i < n; i +=
str)
793 sum += a[m * i + k ];
809template<
typename T >
822 for (
int i =
idx; i < n; i +=
str)
892template<
typename T >
895 const int m,
const int k0,
const int k1) {
905 for (
int i =
idx; i < n; i +=
str)
927 const int n,
const int m) {
940 for (
int i0 = 0;
i0 < m;
i0++) {
941 for (
int i1 = 0;
i1 < m;
i1++) {
942 temp[
tj +
i0 * (n + 1) +
i1 * (m + 1) * (n + 1)] =
GG[
i0 * n +
tj] *
957 for(
int i=0;i<m;i++){
1011 for (
int i = 0; i < m; ++i) {
1019 - xsi[
tj] + eta[
tj];
1024template <
typename T>
1035template <
typename T>
1047 for (
int i =
idx; i < n; i +=
str) {
1066template <
typename T>
1069 const T d,
const int n) {
1077template <
typename T>
1089template<
typename T >
1101 for (
int i =
idx; i < n; i +=
str)
1121template<
typename T >
1133 for (
int i =
idx; i < n; i +=
str)
1153template <
typename T>
1161 for (
int i = 0; i < m; i++) {
1169template <
typename T>
1180 template<
typename T >
1190 for (
int i =
idx; i<n ; i +=
str)
1208 template<
typename T >
1222 for (
int i =
idx; i < n; i+=
str) {
1244template <
typename T>
1246 const T c,
const int n) {
1253template <
typename T>
1262template <
typename T>
1276template <
typename T>
1281 const T zeta,
const T z,
const int m) {
1286 1.0/ (d[
tj] + mu[
tj] / y[
tj]));
1291 AA[
tj+
tj*(m+1)]= -zeta/z;
1294template <
typename T>
__global__ void heaviside_mapping_apply_kernel(const T beta, const T eta, T *__restrict__ X_out_d, T *__restrict__ X_in_d, const int n)