CPU矩阵运算库比较DNNL,mkl,eigen3

安装必要的库

Eigen3

执行命令：

sudo apt-get install libeigen3-dev
or sudo yum install eigen3
安装后执行以下命令

运行命令：

sudo cp -r /usr/include/eigen3/Eigen /usr/include

注意：参考cp指令 /usr/men /usr/zh 将目录/usr/men下的所有文件及其子目录复制到目录/usr/zh中

这个命令的说明：

因为eigen3被默认安装到了usr/include里了（系统默认的路径），在很多程序中include时经常使用#include<Eigen/Dense>而不是使用#include<eigen3/Eigen/Dense>所以要做一下处理，否则有些程序在编译时找不到Eigen/Dense而报错。上面指令将usr/include/eigen3文件夹中的Eigen文件递归的复制到上一层文件（直接放到/usr/include中，否则系统无法默认搜索到----->此时只能在CMakeLists.txt用include_libraries（绝对路径了））
  
 
//或第二种安装方法    
#github 有个mirror,版本3.3.4 from 2017
git clone https://github.com/eigenteam/eigen-git-mirror

#安装
cd eigen-git-mirror
mkdir build
cd build
cmake ..
sudo make install

#安装后,头文件安装在/usr/local/include/eigen3

示例程序

//g++ -I/usr/local/include/eigen3 1.cpp -o 1
#include <iostream>
#include <Eigen/Dense>

using namespace std;
using namespace Eigen;

int main()
{

    MatrixXd m = MatrixXd::Random(3,3); //随机数-1  ~1
    cout << m << endl;
    m = (m + MatrixXd::Constant(3,3,1.2))* 50;
    cout << " m2 =" << m << endl;

    MatrixXd n = MatrixXd::Constant(3,3,2);//常矩阵
    cout << " n =" << n << endl;

    VectorXd v(3);
    v << 1,2,3;

    cout << v<< endl;

    cout << "m * v" << endl << m * v << endl; //向量矩阵计算

}

MKL

官网下载安装包https://software.intel.com/en-us/mkl

tar -zxvf l_mkl_2019.0.117.tgz
cd l_mkl_2019.0.117
./install.sh
source ~/intel/bin/compilervars.sh intel64


locate libdnnl.so.1
export LD_LIBRARY_PATH=/home/resources/yxwang/tf_install_source/serving1.13/dfsmn_opt_quant:$LD_LIBRARY_PATH

locate libmkl_sequential.so
export LD_LIBRARY_PATH=/home/public/wangyunxia_share/build_tf1.12mkl/Predict-ft/tensorflow_lib_h2_old/third_party/attnlibs/:$LD_LIBRARY_PATH


export LD_LIBRARY_PATH=/home/yx.wang/dnnl_lnx_1.1.1_cpu_iomp/lib:$LD_LIBRARY_PATH

MKL-DNNL

官网下载编译好的包即可，然后-I加头文件路径，-L加so路径，LD_LIBRARY_PATH实际的so代码路径

MKL_INCDIR=/opt/composer_xe_2013.2.146/mkl/include

MKL_LIBDIR=/opt/composerxe/mkl/lib/intel64/

MKL_RT_LIBDIR=/opt/composer_xe_2013/lib/intel64/

#DNNLROOT=/home/resources/lizhipeng/dense-matrix-mult/MKL/mkldnn_lnx_1.0.4_cpu_iomp

DNNLROOT=/home/yx.wang/dnnl_lnx_1.1.1_cpu_iomp

gcc \

-I $MKL_INCDIR -I $DNNLROOT/include \

-L $MKL_LIBDIR -L /home/yx.wang/dnnl_lnx_1.1.1_cpu_iomp/lib/ -ldnnl \

-L $MKL_RT_LIBDIR \

-O4 -msse2 -msse3 -msse4 \

dense_mult_mkl.c \

-Wl,–start-group -lmkl_sequential -lmkl_core -lmkl_intel_lp64 -Wl,–end-group \

-lm -lpthread \

-o a.out

示例测试程序

#include <math.h>
#include <omp.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/time.h> 
#include <stdlib.h>
#include <mkl.h>
#include "dnnl.h"
#define REP(i,n) for ( i = 0; i < (n); ++i ) 
typedef int8_t test_type;

// using namespace std;
// using namespace dnnl;
void init_matrix(double* A, int dim1 , int dim2 ) {

  int mod = 100003, prod = 7 , e = 1 , i = 0, j = 0;

  REP(i,dim1) REP(j,dim2) {
    e = (e*prod + 1)%mod;
    A[i*dim2 + j] = e * .91739210437;
  }
}

void init_matrix_float(float* A, int dim1 , int dim2 ) {

  int mod = 100003, prod = 7 , e = 1 , i = 0, j = 0;

  REP(i,dim1) REP(j,dim2) {
    e = (e*prod + 1)%mod;
    A[i*dim2 + j] = 0.0;
  }
}

void init_matrix_X(test_type* A, int dim1 , int dim2 ) {

  int mod = 100003, prod = 7 , e = 1 , i = 0, j = 0;

  REP(i,dim1) REP(j,dim2) {
    e = (e*prod + 1)%mod;
    A[i*dim2 + j] = 0;
  }
}

void print_matrix (double *A , int dim1 , int dim2 ) {

  int i, j;

  REP(i,dim1) {
    REP(j,dim2) printf("%lf " , A[i*dim2 + j]);
    puts("");
  }
}

void print_matrix_float (float *A , int dim1 , int dim2 ) {

  int i, j;

  REP(i,dim1) {
    REP(j,dim2) printf("%lf " , A[i*dim2 + j]);
    puts("");
  }
}


int main(int argc, char** argv){

  int m , n , k , i , j , u , nrep = 1 , cnt = 0;
  int tag = 1;
  
  double totaltime , error = 0;

  struct timeval before , after;

  if ( argc <= 1 || argc >= 6 ) {
    puts("./exe [nrep] <dim1> [dim2 dim3]");
    exit(0);
  }

  if ( argc == 2 ) {

    sscanf ( argv[1] , "%d" , &m );
    n = k = m;
  }
  else if ( argc == 3 ) {

    sscanf ( argv[1] , "%d" , &nrep );
    sscanf ( argv[2] , "%d" , &m );
    n = k = m;
  }
  else if ( argc == 4 ) {

    sscanf ( argv[1] , "%d" , &m );
    sscanf ( argv[2] , "%d" , &k );
    sscanf ( argv[3] , "%d" , &n );
  }
  else if ( argc == 5 ) {

    sscanf ( argv[1] , "%d" , &nrep );
    sscanf ( argv[2] , "%d" , &m );
    sscanf ( argv[3] , "%d" , &k );
    sscanf ( argv[4] , "%d" , &n );
  }
  test_type *A, *B;
  int32_t *C;
  if(tag == 1){

      A = (test_type *) malloc( m*k*sizeof( test_type ) );
      B = (test_type *) malloc( k*n*sizeof( test_type ) );
      C = (int32_t *) malloc( m*n*sizeof( int32_t ) );
      init_matrix_X ( A , m , k );
      init_matrix_X ( B , k , n );
  }
  // else if(tag == 2){
  //     A = (float *) malloc( m*k*sizeof( float ) );
  //     B = (float *) malloc( k*n*sizeof( float ) );
  //     C = (float *) malloc( m*n*sizeof( float ) );
  //     init_matrix_float ( A , m , k );
  //     init_matrix_float ( B , k , n );
  // }
  // else if(tag == 3){
  //     A = (double *) malloc( m*k*sizeof( double ) );
  //     B = (double *) malloc( k*n*sizeof( double ) );
  //     C = (double *) malloc( m*n*sizeof( double ) );
  //     init_matrix ( A , m , k );
  //     init_matrix ( B , k , n );
  // }


  //A = (double *) malloc( sizeof(double) * m * k );
  //B = (double *) malloc( sizeof(double) * k * n );
  //C = (double *) malloc( sizeof(double) * m * n );


  printf ("Matrices Initialized [%dx%d , %dx%d , %dx%d ]\n" , m , k , k , n , m , n);

  printf ("Running %d Iteration(s) \n" , nrep );

  totaltime = 0;
  const MKL_INT8 ao = 0;
  const MKL_INT8 bo = 0;
  MKL_INT32 co = 0;
  CBLAS_OFFSET    offsetc=CblasFixOffset;
  MKL_INT         lda=2, ldb=1, ldc=1;
  float           alpha=1.0f, beta=0.0f;
  REP(i,nrep) {

    double runtime;

    gettimeofday ( &before , NULL );
    // cblas_gemm_s8u8s32(CblasRowMajor, CblasNoTrans, CblasNoTrans, offsetc,
                      // m, n, k, alpha, A, k, ao, B, n, bo, beta, C, n, &co);
    // cblas_gemm_s16s16s32(CblasRowMajor, CblasNoTrans, CblasNoTrans, offsetc,
                      // m, n, k, alpha, A, k, ao, B, n, bo, beta, C, n, &co);                  
    //https://software.intel.com/en-us/onemkl-developer-reference-c-cblas-gemm-1
    //cblas_sgemm ( CblasRowMajor, CblasNoTrans, CblasNoTrans , m , n , k , 1.0 , A , k , B , n , 0.0 , C , n );
    //https://intel.github.io/mkl-dnn/group__dnnl__api__blas.html#ga2b763b7629846913507d88fba875cc26
    dnnl_gemm_s8s8s32('N', 'N', 'F', m, n, k, 1.0f, A, k,ao, B, n, bo,0.0f, C, n, &co);
    gettimeofday ( &after , NULL );

    runtime = ( after.tv_usec - before.tv_usec )/1000000.0  + (after.tv_sec - before.tv_sec);

    if ( nrep == 1 || i > 0 ) 
      totaltime += runtime , cnt++;

    printf(" [Iteration %d Time : %lf ]\n" , i+1 , runtime );

    /*
    REP(i,m) REP(j,n) {
      double ans = 0;
      REP(u,k) ans += A[i*k+u]*B[u*n+j];
      error += (ans-C[i*n+j])*(ans-C[i*n+j]);
    }

    printf(" [Error = %lf ]\n" , error );
    */
  }

  printf("Average Time : %lf \n" , totaltime/cnt );

  return 0;
}

Eigen

INCDIR=/usr/include/eigen3/

g++ \

-I $INCDIR \

-O4 -msse2 -msse3 -msse4 \

dense_mult_eigen.cpp \

-fopenmp \

-o ap.out

g++ \

-I $INCDIR \

-O4 -msse2 -msse3 -msse4 \

-DEIGEN_DONT_PARALLELIZE \

dense_mult_eigen.cpp \

-o a.out

/* dense_mult_eigen.cpp
 * Copyright (C) 2013, Siddharth Gopal (gcdart AT gmail)
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of version 2.1 of the GNU Lesser General Public License
 * as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA, 02111-1307, USA,
 * or visit the GNU web site, www.gnu.org.
 */

#define EIGEN_NO_DEBUG

#include <Eigen/Sparse>
#include <Eigen/Core>
#include <sys/time.h> 
#include <cstdio>


using namespace std;
using namespace Eigen;

#define REP(i,n) for ( int i = 0; i < (n); ++i ) 
typedef Matrix< int8_t, Dynamic, Dynamic >  MatrixX;

void init_matrix (MatrixXd& A, int dim1 , int dim2 ) {

  int mod = 100003, prod = 7 , e = 1 , i = 0, j = 0;

  REP(i,dim1) REP(j,dim2) {
    e = (e*prod + 1)%mod;
    A(i,j) = e * .91739210437;
  }
}

void init_matrix_float(MatrixXf& A, int dim1 , int dim2 ) {

  int mod = 100003, prod = 7 , e = 1 , i = 0, j = 0;

  REP(i,dim1) REP(j,dim2) {
    e = (e*prod + 1)%mod;
    A(i*dim2 + j) = 0.0;
  }
}

void init_matrix_X(MatrixX& A, int dim1 , int dim2 ) {

  int mod = 100003, prod = 7 , e = 1 , i = 0, j = 0;

  REP(i,dim1) REP(j,dim2) {
    e = (e*prod + 1)%mod;
    A(i*dim2 + j) = 0.0;
  }
}

void print_matrix (MatrixXd& A, int dim1 , int dim2 ) {

  int i, j;

  REP(i,dim1) {
    REP(j,dim2) printf("%lf " , A(i,j) );
    puts("");
  }
}


int main( int argc , char *argv[] ){

  int m , n , k , i , j , u , nrep = 1 , cnt = 0;

  double totaltime , error = 0;

  struct timeval before , after;

  if ( argc <= 1 || argc >= 6 ) {
    puts("./exe [nrep] <dim1> [dim2 dim3]");
    exit(0);
  }

  if ( argc == 2 ) {

    sscanf ( argv[1] , "%d" , &m );
    n = k = m;
  }
  else if ( argc == 3 ) {

    sscanf ( argv[1] , "%d" , &nrep );
    sscanf ( argv[2] , "%d" , &m );
    n = k = m;
  }
  else if ( argc == 4 ) {

    sscanf ( argv[1] , "%d" , &m );
    sscanf ( argv[2] , "%d" , &k );
    sscanf ( argv[3] , "%d" , &n );
  }
  else if ( argc == 5 ) {

    sscanf ( argv[1] , "%d" , &nrep );
    sscanf ( argv[2] , "%d" , &m );
    sscanf ( argv[3] , "%d" , &k );
    sscanf ( argv[4] , "%d" , &n );
  }

  MatrixX A(m,k) , B(k,n) , C(m,n);

  init_matrix_X ( A , m , k );
  init_matrix_X( B , k , n );

  printf ("Matrices Initialized [%dx%d , %dx%d , %dx%d ]\n" , m , k , k , n , m , n);

  printf ("Running %d Iteration(s) \n" , nrep );

  totaltime = 0;

  REP(i,nrep) {

    double runtime;

    gettimeofday ( &before , NULL );
  
    C.noalias() = A * B;

    gettimeofday ( &after , NULL );

    runtime = ( after.tv_usec - before.tv_usec )/1000000.0  + (after.tv_sec - before.tv_sec);

    if ( nrep == 1 || i > 0 ) 
      totaltime += runtime , cnt++;

    printf(" [Iteration %d Time : %lf ]\n" , i+1 , runtime );

    /*
    REP(i,m) REP(j,n) {
      double ans = 0;
      REP(u,k) ans += A[i*k+u]*B[u*n+j];
      error += (ans-C[i*n+j])*(ans-C[i*n+j]);
    }

    printf(" [Error = %lf ]\n" , error );
    */
  }

  printf("Average Time : %lf \n" , totaltime/cnt );

  return 0;
}

测试结果

AVX16 单线程CPU
矩阵大小			Float16			Int16			Int8
M	N	K	MKLDNN	MKL	Eigen3	MKLDNN	MKL	Eigen3	MKLDNN	MKL	Eigen3
5	2048	1320	0.001295	0.001204	0.002072	\	0.001089	0.005318	0.038058	0.000793	0.006
5	512	2048	0.000763	0.000498	0.000786	\	0.000539	0.001926	0.017806	0.000354	0.002
5	2048	512	0.000679	0.000475	0.000801	\	0.000490	0.002057	0.017584	0.000317	0.002
5	2048	2048	0.002650	0.002028	0.003185	\	0.002116	0.007945	0.080426	0.001405	0.009
5	2457	2048	0.003639	0.002482	0.003942	\	0.002531	0.009683	0.096175	0.001714	0.011:

OpenBLAS似乎是实现密集密集矩阵乘法的最佳库（至少是我测试过的机器的w.r.t）。随着核心数量的增加和矩阵的尺寸的增加，它可以很好地扩展。

对于多线程应用程序，

OpenBLAS>MKL>ATLAS>>EIGEN>ACML

对于单线程应用，

OpenBLAS≈ACML>MKL>ATLAS>>EIGEN

也许下次我会把稀疏稠密矩阵乘法的一些比较，

源代码

我已经附上了源代码的程序和绘图脚本在这里以及在