c@64
|
1 #ifndef ZMM_H
|
c@64
|
2 #define ZMM_H
|
c@64
|
3
|
/*
 * Blocking configuration for the generated matrix-multiply kernel.
 *
 * MB, NB and KB are all 48; the remaining size constants are the products
 * that follow from that: NBNB = MBNB = MBKB = NBKB = 48 * 48 = 2304,
 * NB2 = 2 * 48 = 96 and NBNB2 = 2 * 2304 = 4608.  The values are kept as
 * literals so they stay usable in #if expressions and token pasting.
 *
 * NOTE(review): ATL_mmMULADD, ATL_mmLAT, ATL_mmMU, ATL_mmNU and ATL_mmKU
 * presumably describe the kernel's multiply-add form, latency and unroll
 * factors -- confirm against the generator; nothing in this header reads
 * them.
 */
#define ATL_mmMULADD
#define ATL_mmLAT 1
#define ATL_mmMU 12
#define ATL_mmNU 1
#define ATL_mmKU 48
#define MB 48
#define NB 48
#define KB 48
#define NBNB 2304
#define MBNB 2304
#define MBKB 2304
#define NBKB 2304
#define NB2 96
#define NBNB2 4608

/* Scale or divide a count by the block size (48) or by one full block (2304). */
#define ATL_MulByNB(N_) ((N_) * 48)
#define ATL_DivByNB(N_) ((N_) / 48)
#define ATL_MulByNBNB(N_) ((N_) * 2304)
c@64
|
22 void ATL_zJIK48x48x48TN48x48x0_a1_b0(const int M, const int N, const int K, const TYPE alpha, const TYPE *A, const int lda, const TYPE *B, const int ldb, const TYPE beta, TYPE *C, const int ldc);
|
c@64
|
23 void ATL_zJIK48x48x48TN48x48x0_a1_b1(const int M, const int N, const int K, const TYPE alpha, const TYPE *A, const int lda, const TYPE *B, const int ldb, const TYPE beta, TYPE *C, const int ldc);
|
c@64
|
24 void ATL_zJIK48x48x48TN48x48x0_a1_bX(const int M, const int N, const int K, const TYPE alpha, const TYPE *A, const int lda, const TYPE *B, const int ldb, const TYPE beta, TYPE *C, const int ldc);
|
c@64
|
25
|
/*
 * NBmm_b1: expands to a brace-enclosed sequence of four kernel calls, in
 * this order:
 *
 *   1. _a1_bX(A_,        B_,        beta = ATL_rnone) -> C_
 *   2. _a1_b1(A_,        B_ + NBNB, beta = ATL_rone)  -> C_ + 1
 *   3. _a1_bX(A_ + NBNB, B_ + NBNB, beta = ATL_rnone) -> C_
 *   4. _a1_b1(A_ + NBNB, B_,        beta = ATL_rone)  -> C_ + 1
 *
 * m_, n_, k_, al_, lda_, ldb_ and ldc_ are forwarded unchanged to every
 * call.  be_ is accepted for signature parity with the other NBmm_* macros
 * but is not used.  A_, B_ and C_ are each expanded four times, so pass
 * expressions without side effects.
 *
 * The expansion is a plain compound statement (not do/while(0)); it is kept
 * that way so existing call sites compile unchanged.  Do not use it as the
 * unbraced body of an if that has an else.
 *
 * NOTE(review): presumably A_ and B_ each hold two real planes NBNB
 * elements apart, C_ / C_+1 are the two interleaved components of a complex
 * result, and the kernels compute C = A*B + beta*C, so the sequence
 * accumulates a complex product into C -- confirm against the kernel
 * documentation.
 */
#define NBmm_b1(m_, n_, k_, al_, A_, lda_, B_, ldb_, be_, C_, ldc_) \
{ \
   ATL_zJIK48x48x48TN48x48x0_a1_bX(m_, n_, k_, al_, (A_), lda_, (B_), ldb_, ATL_rnone, (C_), ldc_); \
   ATL_zJIK48x48x48TN48x48x0_a1_b1(m_, n_, k_, al_, (A_), lda_, (B_)+NBNB, ldb_, ATL_rone, (C_)+1, ldc_); \
   ATL_zJIK48x48x48TN48x48x0_a1_bX(m_, n_, k_, al_, (A_)+NBNB, lda_, (B_)+NBNB, ldb_, ATL_rnone, (C_), ldc_); \
   ATL_zJIK48x48x48TN48x48x0_a1_b1(m_, n_, k_, al_, (A_)+NBNB, lda_, (B_), ldb_, ATL_rone, (C_)+1, ldc_); \
}
/*
 * NBmm_b0: expands to a brace-enclosed sequence of four kernel calls, in
 * this order:
 *
 *   1. _a1_b0(A_,        B_,        beta = ATL_rzero) -> C_
 *   2. _a1_b0(A_,        B_ + NBNB, beta = ATL_rzero) -> C_ + 1
 *   3. _a1_bX(A_ + NBNB, B_ + NBNB, beta = ATL_rnone) -> C_
 *   4. _a1_b1(A_ + NBNB, B_,        beta = ATL_rone)  -> C_ + 1
 *
 * m_, n_, k_, al_, lda_, ldb_ and ldc_ are forwarded unchanged to every
 * call.  be_ is accepted for signature parity with the other NBmm_* macros
 * but is not used.  A_, B_ and C_ are each expanded four times, so pass
 * expressions without side effects.
 *
 * The expansion is a plain compound statement (not do/while(0)); it is kept
 * that way so existing call sites compile unchanged.  Do not use it as the
 * unbraced body of an if that has an else.
 *
 * NOTE(review): presumably this is the beta == 0 form of a complex block
 * product built from real kernels (first two calls overwrite C, last two
 * combine the remaining partial products) -- confirm against the kernel
 * documentation.
 */
#define NBmm_b0(m_, n_, k_, al_, A_, lda_, B_, ldb_, be_, C_, ldc_) \
{ \
   ATL_zJIK48x48x48TN48x48x0_a1_b0(m_, n_, k_, al_, (A_), lda_, (B_), ldb_, ATL_rzero, (C_), ldc_); \
   ATL_zJIK48x48x48TN48x48x0_a1_b0(m_, n_, k_, al_, (A_), lda_, (B_)+NBNB, ldb_, ATL_rzero, (C_)+1, ldc_); \
   ATL_zJIK48x48x48TN48x48x0_a1_bX(m_, n_, k_, al_, (A_)+NBNB, lda_, (B_)+NBNB, ldb_, ATL_rnone, (C_), ldc_); \
   ATL_zJIK48x48x48TN48x48x0_a1_b1(m_, n_, k_, al_, (A_)+NBNB, lda_, (B_), ldb_, ATL_rone, (C_)+1, ldc_); \
}
/*
 * NBmm_bX: expands to a brace-enclosed sequence of four kernel calls, in
 * this order:
 *
 *   1. _a1_bX(A_,        B_,        beta = -(be_))    -> C_
 *   2. _a1_bX(A_,        B_ + NBNB, beta = be_)       -> C_ + 1
 *   3. _a1_bX(A_ + NBNB, B_ + NBNB, beta = ATL_rnone) -> C_
 *   4. _a1_b1(A_ + NBNB, B_,        beta = ATL_rone)  -> C_ + 1
 *
 * m_, n_, k_, al_, lda_, ldb_ and ldc_ are forwarded unchanged to every
 * call.  be_ is expanded twice and A_, B_ and C_ four times each, so pass
 * expressions without side effects.
 *
 * The expansion is a plain compound statement (not do/while(0)); it is kept
 * that way so existing call sites compile unchanged.  Do not use it as the
 * unbraced body of an if that has an else.
 *
 * NOTE(review): presumably this is the general-beta form of a complex block
 * product built from real kernels; the negated beta in call 1 would be
 * undone by the ATL_rnone beta in call 3 -- confirm against the kernel
 * documentation.
 */
#define NBmm_bX(m_, n_, k_, al_, A_, lda_, B_, ldb_, be_, C_, ldc_) \
{ \
   ATL_zJIK48x48x48TN48x48x0_a1_bX(m_, n_, k_, al_, (A_), lda_, (B_), ldb_, -(be_), (C_), ldc_); \
   ATL_zJIK48x48x48TN48x48x0_a1_bX(m_, n_, k_, al_, (A_), lda_, (B_)+NBNB, ldb_, (be_), (C_)+1, ldc_); \
   ATL_zJIK48x48x48TN48x48x0_a1_bX(m_, n_, k_, al_, (A_)+NBNB, lda_, (B_)+NBNB, ldb_, ATL_rnone, (C_), ldc_); \
   ATL_zJIK48x48x48TN48x48x0_a1_b1(m_, n_, k_, al_, (A_)+NBNB, lda_, (B_), ldb_, ATL_rone, (C_)+1, ldc_); \
}
/*
 * Aliases mapping the rNBmm_* names onto the ATL_dJIK48x48x48TN48x48x0
 * kernels (one per beta variant: _b1, _b0, _bX).
 *
 * NOTE(review): the targets carry a "d" prefix, whereas the prototypes in
 * this header carry a "z" prefix; the ATL_dJIK* functions are not declared
 * here, so they are presumably declared by another header -- confirm
 * before calling through these aliases.
 */
#define rNBmm_b1 ATL_dJIK48x48x48TN48x48x0_a1_b1
#define rNBmm_b0 ATL_dJIK48x48x48TN48x48x0_a1_b0
#define rNBmm_bX ATL_dJIK48x48x48TN48x48x0_a1_bX
c@64
|
50
|
c@64
|
51 #endif
|