CPU矩阵运算库比较DNNL,mkl,eigen3

安装必要的库

Eigen3

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
执行命令:

sudo apt-get install libeigen3-dev
or sudo yum install eigen3
安装后执行以下命令

运行命令:

sudo cp -r /usr/include/eigen3/Eigen /usr/include

注意:参考cp指令 /usr/men /usr/zh 将目录/usr/men下的所有文件及其子目录复制到目录/usr/zh中

这个命令的说明:

因为eigen3被默认安装到了usr/include里了(系统默认的路径),在很多程序中include时经常使用#include<Eigen/Dense>而不是使用#include<eigen3/Eigen/Dense>所以要做一下处理,否则有些程序在编译时找不到Eigen/Dense而报错。上面指令将usr/include/eigen3文件夹中的Eigen文件递归的复制到上一层文件(直接放到/usr/include中,否则系统无法默认搜索到----->此时只能在CMakeLists.txt用include_libraries(绝对路径了))


//或第二种安装方法
#github 有个mirror,版本3.3.4 from 2017
git clone https://github.com/eigenteam/eigen-git-mirror

#安装
cd eigen-git-mirror
mkdir build
cd build
cmake ..
sudo make install

#安装后,头文件安装在/usr/local/include/eigen3

示例程序

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
//g++ -I/usr/local/include/eigen3 1.cpp -o 1
#include <iostream>
#include <Eigen/Dense>

using namespace std;
using namespace Eigen;

int main()
{

MatrixXd m = MatrixXd::Random(3,3); //随机数-1 ~1
cout << m << endl;
m = (m + MatrixXd::Constant(3,3,1.2))* 50;
cout << " m2 =" << m << endl;

MatrixXd n = MatrixXd::Constant(3,3,2);//常矩阵
cout << " n =" << n << endl;

VectorXd v(3);
v << 1,2,3;

cout << v<< endl;

cout << "m * v" << endl << m * v << endl; //向量矩阵计算

}

MKL

官网下载安装包https://software.intel.com/en-us/mkl

1
2
3
4
5
6
7
8
9
10
11
12
13
14
tar -zxvf l_mkl_2019.0.117.tgz
cd l_mkl_2019.0.117
./install.sh
source ~/intel/bin/compilervars.sh intel64


locate libdnnl.so.1
export LD_LIBRARY_PATH=/home/resources/yxwang/tf_install_source/serving1.13/dfsmn_opt_quant:$LD_LIBRARY_PATH

locate libmkl_sequential.so
export LD_LIBRARY_PATH=/home/public/wangyunxia_share/build_tf1.12mkl/Predict-ft/tensorflow_lib_h2_old/third_party/attnlibs/:$LD_LIBRARY_PATH


export LD_LIBRARY_PATH=/home/yx.wang/dnnl_lnx_1.1.1_cpu_iomp/lib:$LD_LIBRARY_PATH

MKL-DNNL

官网下载编译好的包即可,然后-I加头文件路径,-L加so路径,LD_LIBRARY_PATH实际的so代码路径

MKL_INCDIR=/opt/composer_xe_2013.2.146/mkl/include

MKL_LIBDIR=/opt/composerxe/mkl/lib/intel64/

MKL_RT_LIBDIR=/opt/composer_xe_2013/lib/intel64/

#DNNLROOT=/home/resources/lizhipeng/dense-matrix-mult/MKL/mkldnn_lnx_1.0.4_cpu_iomp

DNNLROOT=/home/yx.wang/dnnl_lnx_1.1.1_cpu_iomp

gcc \

-I $MKL_INCDIR -I $DNNLROOT/include \

-L $MKL_LIBDIR -L /home/yx.wang/dnnl_lnx_1.1.1_cpu_iomp/lib/ -ldnnl \

-L $MKL_RT_LIBDIR \

-O4 -msse2 -msse3 -msse4 \

dense_mult_mkl.c \

-Wl,–start-group -lmkl_sequential -lmkl_core -lmkl_intel_lp64 -Wl,–end-group \

-lm -lpthread \

-o a.out

示例测试程序

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
#include <math.h>
#include <omp.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/time.h>
#include <stdlib.h>
#include <mkl.h>
#include "dnnl.h"
#define REP(i,n) for ( i = 0; i < (n); ++i )
typedef int8_t test_type;

// using namespace std;
// using namespace dnnl;
void init_matrix(double* A, int dim1 , int dim2 ) {

int mod = 100003, prod = 7 , e = 1 , i = 0, j = 0;

REP(i,dim1) REP(j,dim2) {
e = (e*prod + 1)%mod;
A[i*dim2 + j] = e * .91739210437;
}
}

void init_matrix_float(float* A, int dim1 , int dim2 ) {

int mod = 100003, prod = 7 , e = 1 , i = 0, j = 0;

REP(i,dim1) REP(j,dim2) {
e = (e*prod + 1)%mod;
A[i*dim2 + j] = 0.0;
}
}

void init_matrix_X(test_type* A, int dim1 , int dim2 ) {

int mod = 100003, prod = 7 , e = 1 , i = 0, j = 0;

REP(i,dim1) REP(j,dim2) {
e = (e*prod + 1)%mod;
A[i*dim2 + j] = 0;
}
}

void print_matrix (double *A , int dim1 , int dim2 ) {

int i, j;

REP(i,dim1) {
REP(j,dim2) printf("%lf " , A[i*dim2 + j]);
puts("");
}
}

void print_matrix_float (float *A , int dim1 , int dim2 ) {

int i, j;

REP(i,dim1) {
REP(j,dim2) printf("%lf " , A[i*dim2 + j]);
puts("");
}
}


int main(int argc, char** argv){

int m , n , k , i , j , u , nrep = 1 , cnt = 0;
int tag = 1;

double totaltime , error = 0;

struct timeval before , after;

if ( argc <= 1 || argc >= 6 ) {
puts("./exe [nrep] <dim1> [dim2 dim3]");
exit(0);
}

if ( argc == 2 ) {

sscanf ( argv[1] , "%d" , &m );
n = k = m;
}
else if ( argc == 3 ) {

sscanf ( argv[1] , "%d" , &nrep );
sscanf ( argv[2] , "%d" , &m );
n = k = m;
}
else if ( argc == 4 ) {

sscanf ( argv[1] , "%d" , &m );
sscanf ( argv[2] , "%d" , &k );
sscanf ( argv[3] , "%d" , &n );
}
else if ( argc == 5 ) {

sscanf ( argv[1] , "%d" , &nrep );
sscanf ( argv[2] , "%d" , &m );
sscanf ( argv[3] , "%d" , &k );
sscanf ( argv[4] , "%d" , &n );
}
test_type *A, *B;
int32_t *C;
if(tag == 1){

A = (test_type *) malloc( m*k*sizeof( test_type ) );
B = (test_type *) malloc( k*n*sizeof( test_type ) );
C = (int32_t *) malloc( m*n*sizeof( int32_t ) );
init_matrix_X ( A , m , k );
init_matrix_X ( B , k , n );
}
// else if(tag == 2){
// A = (float *) malloc( m*k*sizeof( float ) );
// B = (float *) malloc( k*n*sizeof( float ) );
// C = (float *) malloc( m*n*sizeof( float ) );
// init_matrix_float ( A , m , k );
// init_matrix_float ( B , k , n );
// }
// else if(tag == 3){
// A = (double *) malloc( m*k*sizeof( double ) );
// B = (double *) malloc( k*n*sizeof( double ) );
// C = (double *) malloc( m*n*sizeof( double ) );
// init_matrix ( A , m , k );
// init_matrix ( B , k , n );
// }


//A = (double *) malloc( sizeof(double) * m * k );
//B = (double *) malloc( sizeof(double) * k * n );
//C = (double *) malloc( sizeof(double) * m * n );


printf ("Matrices Initialized [%dx%d , %dx%d , %dx%d ]\n" , m , k , k , n , m , n);

printf ("Running %d Iteration(s) \n" , nrep );

totaltime = 0;
const MKL_INT8 ao = 0;
const MKL_INT8 bo = 0;
MKL_INT32 co = 0;
CBLAS_OFFSET offsetc=CblasFixOffset;
MKL_INT lda=2, ldb=1, ldc=1;
float alpha=1.0f, beta=0.0f;
REP(i,nrep) {

double runtime;

gettimeofday ( &before , NULL );
// cblas_gemm_s8u8s32(CblasRowMajor, CblasNoTrans, CblasNoTrans, offsetc,
// m, n, k, alpha, A, k, ao, B, n, bo, beta, C, n, &co);
// cblas_gemm_s16s16s32(CblasRowMajor, CblasNoTrans, CblasNoTrans, offsetc,
// m, n, k, alpha, A, k, ao, B, n, bo, beta, C, n, &co);
//https://software.intel.com/en-us/onemkl-developer-reference-c-cblas-gemm-1
//cblas_sgemm ( CblasRowMajor, CblasNoTrans, CblasNoTrans , m , n , k , 1.0 , A , k , B , n , 0.0 , C , n );
//https://intel.github.io/mkl-dnn/group__dnnl__api__blas.html#ga2b763b7629846913507d88fba875cc26
dnnl_gemm_s8s8s32('N', 'N', 'F', m, n, k, 1.0f, A, k,ao, B, n, bo,0.0f, C, n, &co);
gettimeofday ( &after , NULL );

runtime = ( after.tv_usec - before.tv_usec )/1000000.0 + (after.tv_sec - before.tv_sec);

if ( nrep == 1 || i > 0 )
totaltime += runtime , cnt++;

printf(" [Iteration %d Time : %lf ]\n" , i+1 , runtime );

/*
REP(i,m) REP(j,n) {
double ans = 0;
REP(u,k) ans += A[i*k+u]*B[u*n+j];
error += (ans-C[i*n+j])*(ans-C[i*n+j]);
}

printf(" [Error = %lf ]\n" , error );
*/
}

printf("Average Time : %lf \n" , totaltime/cnt );

return 0;
}

Eigen

INCDIR=/usr/include/eigen3/

g++ \

-I $INCDIR \

-O4 -msse2 -msse3 -msse4 \

dense_mult_eigen.cpp \

-fopenmp \

-o ap.out

g++ \

-I $INCDIR \

-O4 -msse2 -msse3 -msse4 \

-DEIGEN_DONT_PARALLELIZE \

dense_mult_eigen.cpp \

-o a.out

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
/* dense_mult_eigen.cpp
* Copyright (C) 2013, Siddharth Gopal (gcdart AT gmail)
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of version 2.1 of the GNU Lesser General Public License
* as published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA, 02111-1307, USA,
* or visit the GNU web site, www.gnu.org.
*/

#define EIGEN_NO_DEBUG

#include <Eigen/Sparse>
#include <Eigen/Core>
#include <sys/time.h>
#include <cstdio>


using namespace std;
using namespace Eigen;

#define REP(i,n) for ( int i = 0; i < (n); ++i )
typedef Matrix< int8_t, Dynamic, Dynamic > MatrixX;

void init_matrix (MatrixXd& A, int dim1 , int dim2 ) {

int mod = 100003, prod = 7 , e = 1 , i = 0, j = 0;

REP(i,dim1) REP(j,dim2) {
e = (e*prod + 1)%mod;
A(i,j) = e * .91739210437;
}
}

void init_matrix_float(MatrixXf& A, int dim1 , int dim2 ) {

int mod = 100003, prod = 7 , e = 1 , i = 0, j = 0;

REP(i,dim1) REP(j,dim2) {
e = (e*prod + 1)%mod;
A(i*dim2 + j) = 0.0;
}
}

void init_matrix_X(MatrixX& A, int dim1 , int dim2 ) {

int mod = 100003, prod = 7 , e = 1 , i = 0, j = 0;

REP(i,dim1) REP(j,dim2) {
e = (e*prod + 1)%mod;
A(i*dim2 + j) = 0.0;
}
}

void print_matrix (MatrixXd& A, int dim1 , int dim2 ) {

int i, j;

REP(i,dim1) {
REP(j,dim2) printf("%lf " , A(i,j) );
puts("");
}
}


int main( int argc , char *argv[] ){

int m , n , k , i , j , u , nrep = 1 , cnt = 0;

double totaltime , error = 0;

struct timeval before , after;

if ( argc <= 1 || argc >= 6 ) {
puts("./exe [nrep] <dim1> [dim2 dim3]");
exit(0);
}

if ( argc == 2 ) {

sscanf ( argv[1] , "%d" , &m );
n = k = m;
}
else if ( argc == 3 ) {

sscanf ( argv[1] , "%d" , &nrep );
sscanf ( argv[2] , "%d" , &m );
n = k = m;
}
else if ( argc == 4 ) {

sscanf ( argv[1] , "%d" , &m );
sscanf ( argv[2] , "%d" , &k );
sscanf ( argv[3] , "%d" , &n );
}
else if ( argc == 5 ) {

sscanf ( argv[1] , "%d" , &nrep );
sscanf ( argv[2] , "%d" , &m );
sscanf ( argv[3] , "%d" , &k );
sscanf ( argv[4] , "%d" , &n );
}

MatrixX A(m,k) , B(k,n) , C(m,n);

init_matrix_X ( A , m , k );
init_matrix_X( B , k , n );

printf ("Matrices Initialized [%dx%d , %dx%d , %dx%d ]\n" , m , k , k , n , m , n);

printf ("Running %d Iteration(s) \n" , nrep );

totaltime = 0;

REP(i,nrep) {

double runtime;

gettimeofday ( &before , NULL );

C.noalias() = A * B;

gettimeofday ( &after , NULL );

runtime = ( after.tv_usec - before.tv_usec )/1000000.0 + (after.tv_sec - before.tv_sec);

if ( nrep == 1 || i > 0 )
totaltime += runtime , cnt++;

printf(" [Iteration %d Time : %lf ]\n" , i+1 , runtime );

/*
REP(i,m) REP(j,n) {
double ans = 0;
REP(u,k) ans += A[i*k+u]*B[u*n+j];
error += (ans-C[i*n+j])*(ans-C[i*n+j]);
}

printf(" [Error = %lf ]\n" , error );
*/
}

printf("Average Time : %lf \n" , totaltime/cnt );

return 0;
}

测试结果

AVX16 单线程CPU
矩阵大小 Float16 Int16 Int8
M N K MKLDNN MKL Eigen3 MKLDNN MKL Eigen3 MKLDNN MKL Eigen3
5 2048 1320 0.001295 0.001204 0.002072 \ 0.001089 0.005318 0.038058 0.000793 0.006
5 512 2048 0.000763 0.000498 0.000786 \ 0.000539 0.001926 0.017806 0.000354 0.002
5 2048 512 0.000679 0.000475 0.000801 \ 0.000490 0.002057 0.017584 0.000317 0.002
5 2048 2048 0.002650 0.002028 0.003185 \ 0.002116 0.007945 0.080426 0.001405 0.009
5 2457 2048 0.003639 0.002482 0.003942 \ 0.002531 0.009683 0.096175 0.001714 0.011:

OpenBLAS似乎是实现密集密集矩阵乘法的最佳库(至少是我测试过的机器的w.r.t)。随着核心数量的增加和矩阵的尺寸的增加,它可以很好地扩展。

对于多线程应用程序,

OpenBLAS>MKL>ATLAS>>EIGEN>ACML

对于单线程应用,

OpenBLAS≈ACML>MKL>ATLAS>>EIGEN

也许下次我会把稀疏稠密矩阵乘法的一些比较,

源代码

我已经附上了源代码的程序和绘图脚本在这里以及在