
Installing and Using Intel MKL on Linux

1. Download

https://software.intel.com/en-us/mkl

Mirror (Baidu Netdisk): https://pan.baidu.com/s/1ysHRNqGOhL72YC7KZXU_uA  Password: 8ivh

For the latest version, please work out the download procedure yourself.

The downloaded file is named something like l_mkl_2017.3.196.tgz.

2. Installation

1) Extract the archive to any directory (the extracted files can be deleted after installation).

2) Run the installer

# ./install.sh

By default it installs to /opt/; the installation path can be configured.

3) Create a file named intel-mkl.conf under /etc/ld.so.conf.d with the following content:

/opt/intel/mkl/lib/intel64
/opt/intel/lib/intel64

Then run

# ldconfig -v

4) Run

$ . /opt/intel/mkl/bin/mklvars.sh intel64 mod

(The script sets environment variables, so it must be sourced as shown above for them to take effect in the current shell.) See: https://software.intel.com/en-us/mkl-linux-developer-guide-scripts-to-set-environment-variables

3. Usage

As an example, compile dgemm_example.c from the official documentation.

#define min(x,y) (((x) < (y)) ? (x) : (y))

#include <stdio.h>
#include <stdlib.h>
#include "mkl.h"

int main()
{
    double *A, *B, *C;
    int m, n, p, i, j;
    double alpha, beta;

    printf ("\n This example computes real matrix C=alpha*A*B+beta*C using \n"
            " Intel(R) MKL function dgemm, where A, B, and  C are matrices and \n"
            " alpha and beta are double precision scalars\n\n");

    m = 2000, p = 200, n = 1000;
    printf (" Initializing data for matrix multiplication C=A*B for matrix \n"
            " A(%ix%i) and matrix B(%ix%i)\n\n", m, p, p, n);
    alpha = 1.0; beta = 0.0;

    printf (" Allocating memory for matrices aligned on 64-byte boundary for better \n"
            " performance \n\n");
    A = (double *)mkl_malloc( m*p*sizeof( double ), 64 );
    B = (double *)mkl_malloc( p*n*sizeof( double ), 64 );
    C = (double *)mkl_malloc( m*n*sizeof( double ), 64 );
    if (A == NULL || B == NULL || C == NULL) {
        printf( "\n ERROR: Can't allocate memory for matrices. Aborting... \n\n");
        mkl_free(A);
        mkl_free(B);
        mkl_free(C);
        return 1;
    }

    printf (" Intializing matrix data \n\n");
    for (i = 0; i < (m*p); i++) {
        A[i] = (double)(i+1);
    }

    for (i = 0; i < (p*n); i++) {
        B[i] = (double)(-i-1);
    }

    for (i = 0; i < (m*n); i++) {
        C[i] = 0.0;
    }

    printf (" Computing matrix product using Intel(R) MKL dgemm function via CBLAS interface \n\n");
    cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans,
                m, n, p, alpha, A, p, B, n, beta, C, n);
    printf ("\n Computations completed.\n\n");

    printf (" Top left corner of matrix A: \n");
    for (i=0; i<min(m,6); i++) {
        for (j=0; j<min(p,6); j++) {
            printf ("%12.0f", A[j+i*p]);
        }
        printf ("\n");
    }

    printf ("\n Top left corner of matrix B: \n");
    for (i=0; i<min(p,6); i++) {
        for (j=0; j<min(n,6); j++) {
            printf ("%12.0f", B[j+i*n]);
        }
        printf ("\n");
    }

    printf ("\n Top left corner of matrix C: \n");
    for (i=0; i<min(m,6); i++) {
        for (j=0; j<min(n,6); j++) {
            printf ("%12.5G", C[j+i*n]);
        }
        printf ("\n");
    }

    printf ("\n Deallocating memory \n\n");
    mkl_free(A);
    mkl_free(B);
    mkl_free(C);

    printf (" Example completed. \n\n");
    return 0;
}

(Code download: https://software.intel.com/en-us/product-code-samples)

The compile command is:

$ gcc -I/opt/intel/mkl/include dgemm_example.c -lmkl_core -lmkl_intel_lp64 -lmkl_intel_thread -liomp5 -lpthread -lm -L/opt/intel/mkl/lib/intel64 -L/opt/intel/lib/intel64

or

$ gcc -I/opt/intel/mkl/include dgemm_example.c -lmkl_rt -L/opt/intel/mkl/lib/intel64 -L/opt/intel/lib/intel64

or, alternatively,

$ . /opt/intel/bin/compilervars.sh intel64

$ gcc dgemm_example.c -lmkl_rt

(This last approach works because the previous command sets the environment variables CPATH, LD_LIBRARY_PATH, and LIBRARY_PATH, so the compiler can find the required header and library files. When compiling C, headers are searched for in the directories listed in C_INCLUDE_PATH; for C++ the compiler uses CPLUS_INCLUDE_PATH; both C and C++ use CPATH.)
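To quickly confirm that the environment set up by compilervars.sh really lets the compiler and linker find MKL, a tiny test program can be built with nothing more than: gcc check_mkl.c -lmkl_rt. The following is only a minimal sketch (the file name check_mkl.c is arbitrary); it calls the standard CBLAS routine cblas_ddot and the MKL service function mkl_get_max_threads, both declared via mkl.h:

/* check_mkl.c - minimal MKL link check (hypothetical file name) */
#include <stdio.h>
#include "mkl.h"

int main(void)
{
    double x[3] = {1.0, 2.0, 3.0};
    double y[3] = {4.0, 5.0, 6.0};

    /* dot product via MKL's CBLAS interface: 1*4 + 2*5 + 3*6 = 32 */
    double d = cblas_ddot(3, x, 1, y, 1);

    printf("cblas_ddot result: %f (expected 32)\n", d);
    printf("MKL max threads:   %d\n", mkl_get_max_threads());
    return 0;
}

If this compiles without any -I or -L options and prints 32, the header and library paths provided by the environment variables are being picked up correctly.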

For how to link your application against the MKL libraries, see: https://software.intel.com/en-us/mkl-linux-developer-guide-linking-your-application-with-the-intel-math-kernel-library

(Study materials: https://software.intel.com/en-us/get-started-with-mkl-for-linux , https://software.intel.com/en-us/mkl-linux-developer-guide)

Postscript:

Intel's MPI library is installed the same way as MKL. After sourcing compilervars.sh you can compile source files that use the MPI library.
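Before moving on to the longer FGMRES example (gmres_test.c) below, a minimal MPI program can confirm that mpicc and the MPI runtime work. This is only a sketch (the file name mpi_hello.c is arbitrary); compile it with: mpicc mpi_hello.c -o mpi_hello, and run it with, for example: mpirun -np 4 ./mpi_hello.

/* mpi_hello.c - minimal MPI sanity check (hypothetical file name) */
#include <stdio.h>
#include "mpi.h"

int main(int argc, char *argv[])
{
    int rank, size, len;
    char name[MPI_MAX_PROCESSOR_NAME];

    MPI_Init(&argc, &argv);                  /* start the MPI runtime            */
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);    /* this process's rank              */
    MPI_Comm_size(MPI_COMM_WORLD, &size);    /* total number of processes        */
    MPI_Get_processor_name(name, &len);      /* host this rank is running on     */

    printf("Hello from rank %d of %d on %s\n", rank, size, name);

    MPI_Finalize();
    return 0;
}

Each rank should print one line; if that works, the environment is ready for the example below.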

gmres_test.c

/* Example to show how to use Intel's FGMRES with preconditioner to solve the linear system Ax=b in MPI.
 * Based on Intel's example: solverc/source/fgmres_full_funct_c.c
 * For CS51501 HW3 Part b
 *
 * Please read Intel Reference Manual, Chapter 6 Sparse Solve Routine, FGMRES Interface Description for the detail information.
 */
#include <stdio.h>
#include "mkl.h"
#include "mpi.h"

#define MASTER 0        // taskid of first task
#define RESTART 500
#define TOL 0.00000001
#define MAXIT 1000

void mpi_dgemv(const MKL_INT m, const MKL_INT local_m, const double *A, const double *u, double *v, double *local_u, double *local_v, int taskid, MPI_Comm comm);
void mpi_preconditioner_solver(const MKL_INT m, const MKL_INT local_m, const double *local_M, const double *u, double *v, double *local_u, int taskid, MPI_Comm comm);

int main(int argc, char *argv[])
{
    int taskid;         // a task identifier
    int numtasks;       // number of tasks in partition
    MPI_Comm comm;
    int m;              // size of the matrix
    int local_m;        // rows of matrix A sent to each worker
    double *A, *b, *exact_x, *x;
    double *temp_1, *temp_2;
    double *local_A, *local_v, *local_u;
    double *local_M;    // M is the preconditioner in this example, which is the diagonal element of A;
    int i, j, k;

    MPI_Init(&argc, &argv);
    comm = MPI_COMM_WORLD;
    MPI_Comm_rank(comm, &taskid);
    MPI_Comm_size(comm, &numtasks);
    if (taskid == MASTER) {    // initialization: A and b
        /* start modification 1: read A and b from mtx files in node 0 */
        m = 64;        // size of the matrix
        A = malloc(sizeof(double) * (m * m));
        // !!! A is in col-major
        for (j = 0; j < m; j++)
            for (i = 0; i < m; i++) {
                if (i == j)
                    *(A + j * m + i) = m * 100.0;
                else
                    *(A + j * m + i) = i + 1.0;
            }
        exact_x = malloc(sizeof(double) * m);
        for (i = 0; i < m; i++)
            *(exact_x + i) = 1.0;
        b = malloc(sizeof(double) * m);
        // b=A*ones(n,1)
        cblas_dgemv(CblasColMajor, CblasNoTrans, m, m, 1.0, A, m, exact_x, 1, 0.0, b, 1);
        /* end modification 1 */
    }

    MPI_Bcast(&m, 1, MPI_INT, MASTER, comm);    // send m from node MASTER to all other nodes.
    local_m = m / numtasks;
    local_A = malloc(sizeof(double) * (local_m * m));
    local_u = malloc(sizeof(double) * (local_m));
    local_v = malloc(sizeof(double) * m);
    // partition A and send A_i to local_A on node i
    MPI_Scatter(A, local_m * m, MPI_DOUBLE, local_A, local_m * m, MPI_DOUBLE, MASTER, comm);

    if (taskid == MASTER) {
        free(A);
        free(exact_x);
        // do not free b, it will be used for GMRES
    }

    /* start modification 2: generate preconditioner M
     * In this example, TA choose the diagonal elements of A as the preconditioner.
     * In HW3 part b, you should generate L and U here.
     */
    local_M = malloc(sizeof(double) * local_m);
    for (i = 0; i < local_m; i++)
        *(local_M + i) = *(local_A + taskid * local_m + i * m + i);
    /* end modification 2 */

    /*---------------------------------------------------------------------------
     * GMRES: Allocate storage for the ?par parameters and the solution vectors
     *---------------------------------------------------------------------------*/
    MKL_INT RCI_request;
    int RCI_flag;
    double dvar;
    int flag = 0;

    MKL_INT ipar[128];    // specifies the integer set of data for the RCI FGMRES computations
    double dpar[128];     // specifies the double precision set of data
    double *tmp;          // used to supply the double precision temporary space for the RCI FGMRES computations
    double *computed_solution;
    double *residual;
    double *f;
    MKL_INT itercount, ierr = 0;
    MKL_INT ivar;
    double b_2norm;
    char cvar = 'N';
    MKL_INT incx = 1;
    if (taskid == MASTER) {
        ipar[14] = RESTART;    // restart iteration number
        int n_tmp = (2 * ipar[14] + 1) * m + ipar[14] * (ipar[14] + 9) / 2 + 1;
        tmp = (double *) malloc(sizeof(double) * n_tmp);
        computed_solution = (double *) malloc(sizeof(double) * m);
        residual = (double *) malloc(sizeof(double) * m);
        f = (double *) malloc(sizeof(double) * m);

        ivar = m;
        /*---------------------------------------------------------------------------
         * Initialize the initial guess
         *---------------------------------------------------------------------------*/
        for (i = 0; i < m; i++) {
            computed_solution[i] = 0.5;
        }

        b_2norm = cblas_dnrm2(ivar, b, incx);
        // printf("b_2norm=%f\n",b_2norm);
        /*---------------------------------------------------------------------------
         * Initialize the solver
         *---------------------------------------------------------------------------*/
        dfgmres_init(&ivar, computed_solution, b, &RCI_request, ipar, dpar, tmp);
        RCI_flag = RCI_request;
    }
    MPI_Bcast(&RCI_flag, 1, MPI_INT, MASTER, comm);
    if (RCI_flag != 0)
        goto FAILED;

    if (taskid == MASTER) {
        /*---------------------------------------------------------------------------
         * GMRES: Set the desired parameters:
         *---------------------------------------------------------------------------*/
        ipar[14] = RESTART;    // restart iteration number
        ipar[7] = 1;     // do the stopping test
        ipar[10] = 1;    // use preconditioner
        dpar[0] = TOL;
        /*---------------------------------------------------------------------------
         * Check the correctness and consistency of the newly set parameters
         *---------------------------------------------------------------------------*/
        dfgmres_check(&ivar, computed_solution, b, &RCI_request, ipar, dpar, tmp);
        RCI_flag = RCI_request;
    }

    MPI_Bcast(&RCI_flag, 1, MPI_INT, MASTER, comm);
    if (RCI_flag != 0)
        goto FAILED;

    if (taskid == MASTER) {
        /*---------------------------------------------------------------------------
         * Print the info about the RCI FGMRES method
         *---------------------------------------------------------------------------*/
        printf("Some info about the current run of RCI FGMRES method:\n\n");
        if (ipar[7]) {
            printf("As ipar[7]=%d, the automatic test for the maximal number of ", ipar[7]);
            printf("iterations will be\nperformed\n");
        } else {
            printf("As ipar[7]=%d, the automatic test for the maximal number of ", ipar[7]);
            printf("iterations will be\nskipped\n");
        }
        printf("+++\n");
        if (ipar[8]) {
            printf("As ipar[8]=%d, the automatic residual test will be performed\n", ipar[8]);
        } else {
            printf("As ipar[8]=%d, the automatic residual test will be skipped\n", ipar[8]);
        }
        printf("+++\n");
        if (ipar[9]) {
            printf("As ipar[9]=%d, the user-defined stopping test will be ", ipar[9]);
            printf("requested via\nRCI_request=2\n");
        } else {
            printf("As ipar[9]=%d, the user-defined stopping test will not be ", ipar[9]);
            printf("requested, thus,\nRCI_request will not take the value 2\n");
        }
        printf("+++\n");
        if (ipar[10]) {
            printf("As ipar[10]=%d, the Preconditioned FGMRES iterations will be ", ipar[10]);
            printf("performed, thus,\nthe preconditioner action will be requested via ");
            printf("RCI_request=3\n");
        } else {
            printf("As ipar[10]=%d, the Preconditioned FGMRES iterations will not ", ipar[10]);
            printf("be performed,\nthus, RCI_request will not take the value 3\n");
        }
        printf("+++\n");
        if (ipar[11]) {
            printf("As ipar[11]=%d, the automatic test for the norm of the next ", ipar[11]);
            printf("generated vector is\nnot equal to zero up to rounding and ");
            printf("computational errors will be performed,\nthus, RCI_request will not ");
            printf("take the value 4\n");
        } else {
            printf("As ipar[11]=%d, the automatic test for the norm of the next ", ipar[11]);
            printf("generated vector is\nnot equal to zero up to rounding and ");
            printf("computational errors will be skipped,\nthus, the user-defined test ");
            printf("will be requested via RCI_request=4\n");
        }
        printf("+++\n\n");
    }
    /*---------------------------------------------------------------------------
     * Compute the solution by RCI (P)FGMRES solver with preconditioning
     * Reverse Communication starts here
     *---------------------------------------------------------------------------*/
  ONE:
    if (taskid == MASTER) {
        dfgmres(&ivar, computed_solution, b, &RCI_request, ipar, dpar, tmp);
        RCI_flag = RCI_request;
    }
    MPI_Bcast(&RCI_flag, 1, MPI_INT, MASTER, comm);    // send RCI_request from node MASTER to all other nodes.

    /*---------------------------------------------------------------------------
     * If RCI_request=0, then the solution was found with the required precision
     *---------------------------------------------------------------------------*/
    if (RCI_flag == 0)
        goto COMPLETE;
    /*---------------------------------------------------------------------------
     * If RCI_request=1, then compute the vector A*tmp[ipar[21]-1]
     * and put the result in vector tmp[ipar[22]-1]
     *---------------------------------------------------------------------------
     * NOTE that ipar[21] and ipar[22] contain FORTRAN style addresses,
     * therefore, in C code it is required to subtract 1 from them to get C style
     * addresses
     *---------------------------------------------------------------------------*/
    if (RCI_flag == 1) {
        if (taskid == MASTER) {
            temp_1 = &tmp[ipar[21] - 1];
            temp_2 = &tmp[ipar[22] - 1];
        }

        mpi_dgemv(m, local_m, local_A, temp_1, temp_2, local_u, local_v, taskid, comm);

        goto ONE;
    }
    /*---------------------------------------------------------------------------
     * If RCI_request=2, then do the user-defined stopping test
     * The residual stopping test for the computed solution is performed here
     *---------------------------------------------------------------------------*/
    if (RCI_flag == 2) {
        /* Request to the dfgmres_get routine to put the solution into b[N] via ipar[12]
           --------------------------------------------------------------------------------
           WARNING: beware that the call to dfgmres_get routine with ipar[12]=0 at this
           stage may destroy the convergence of the FGMRES method, therefore, only
           advanced users should exploit this option with care */
        if (taskid == MASTER) {
            ipar[12] = 1;
            /* Get the current FGMRES solution in the vector f */
            dfgmres_get(&ivar, computed_solution, f, &RCI_request, ipar, dpar, tmp, &itercount);
            temp_1 = f;
            temp_2 = residual;
        }
        /* Compute the current true residual via mpi mat_vec multiplication */
        mpi_dgemv(m, local_m, local_A, temp_1, temp_2, local_u, local_v, taskid, comm);

        if (taskid == MASTER) {
            dvar = -1.0E0;
            cblas_daxpy(ivar, dvar, b, incx, residual, incx);
            dvar = cblas_dnrm2(ivar, residual, incx);
            printf("iteration %d, relative residual:%e\n", itercount, dvar);
        }

        MPI_Bcast(&dvar, 1, MPI_DOUBLE, MASTER, comm);
        if (dvar < TOL) {
            goto COMPLETE;
        } else
            goto ONE;
    }
    /*---------------------------------------------------------------------------
     * If RCI_request=3, then apply the preconditioner on the vector
     * tmp[ipar[21]-1] and put the result in vector tmp[ipar[22]-1]
     *---------------------------------------------------------------------------
     * NOTE that ipar[21] and ipar[22] contain FORTRAN style addresses,
     * therefore, in C code it is required to subtract 1 from them to get C style
     * addresses
     *---------------------------------------------------------------------------*/
    if (RCI_flag == 3) {
        if (taskid == MASTER) {
            temp_1 = &tmp[ipar[21] - 1];
            temp_2 = &tmp[ipar[22] - 1];
        }
        /* start modification 3: solve L U temp_2 = temp_1 */
        mpi_preconditioner_solver(m, local_m, local_M, temp_1, temp_2, local_u, taskid, comm);
        /* end modification 3 */
        goto ONE;
    }
    /*---------------------------------------------------------------------------
     * If RCI_request=4, then check if the norm of the next generated vector is
     * not zero up to rounding and computational errors. The norm is contained
     * in dpar[6] parameter
     *---------------------------------------------------------------------------*/
    if (RCI_flag == 4) {
        if (taskid == MASTER)
            dvar = dpar[6];
        MPI_Bcast(&dvar, 1, MPI_DOUBLE, MASTER, comm);
        if (dvar < 1.0E-12) {
            goto COMPLETE;
        } else
            goto ONE;
    }
    /*---------------------------------------------------------------------------
     * If RCI_request=anything else, then dfgmres subroutine failed
     * to compute the solution vector: computed_solution[N]
     *---------------------------------------------------------------------------*/
    else {
        goto FAILED;
    }
    /*---------------------------------------------------------------------------
     * Reverse Communication ends here
     * Get the current iteration number and the FGMRES solution (DO NOT FORGET to
     * call dfgmres_get routine as computed_solution is still containing
     * the initial guess!). Request to dfgmres_get to put the solution
     * into vector computed_solution[N] via ipar[12]
     *---------------------------------------------------------------------------*/
  COMPLETE:
    if (taskid == MASTER) {
        ipar[12] = 0;
        dfgmres_get(&ivar, computed_solution, b, &RCI_request, ipar, dpar, tmp, &itercount);
        /*---------------------------------------------------------------------------
         * Print solution vector: computed_solution[N] and the number of iterations: itercount
         *---------------------------------------------------------------------------*/
        printf("The system has been solved in %d iterations \n", itercount);
        printf("The following solution has been obtained (first 4 elements): \n");
        for (i = 0; i < 4; i++) {
            printf("computed_solution[%d]=", i);
            printf("%e\n", computed_solution[i]);
        }
        /*-------------------------------------------------------------------------*/
        /* Release internal MKL memory that might be used for computations         */
        /* NOTE: It is important to call the routine below to avoid memory leaks   */
        /* unless you disable MKL Memory Manager                                   */
        /*-------------------------------------------------------------------------*/
        MKL_Free_Buffers();
        temp_1 = computed_solution;
        temp_2 = residual;
    }
    // compute the relative residual
    mpi_dgemv(m, local_m, local_A, temp_1, temp_2, local_u, local_v, taskid, comm);
    if (taskid == MASTER) {
        dvar = -1.0E0;
        cblas_daxpy(ivar, dvar, b, incx, residual, incx);
        dvar = cblas_dnrm2(ivar, residual, incx);
        printf("relative residual:%e\n", dvar / b_2norm);

        if (itercount < MAXIT && dvar < TOL)
            flag = 0;    // success
        else
            flag = 1;    // fail
    }

    MPI_Bcast(&flag, 1, MPI_INT, MASTER, comm);

    free(local_A);
    free(local_M);
    free(local_u);
    free(local_v);
    if (taskid == MASTER) {
        free(tmp);
        free(b);
        free(computed_solution);
        free(residual);
    }

    if (flag == 0) {
        MPI_Finalize();
        return 0;
    } else {
        MPI_Finalize();
        return 1;
    }
    /*-------------------------------------------------------------------------*/
    /* Release internal MKL memory that might be used for computations         */
    /* NOTE: It is important to call the routine below to avoid memory leaks   */
    /* unless you disable MKL Memory Manager                                   */
    /*-------------------------------------------------------------------------*/
  FAILED:
    if (taskid == MASTER) {
        printf("\nThis example FAILED as the solver has returned the ERROR code %d", RCI_request);
        MKL_Free_Buffers();
    }
    free(local_A);
    free(local_M);
    free(local_u);
    free(local_v);
    if (taskid == MASTER) {
        free(tmp);
        free(b);
        free(computed_solution);
        free(residual);
    }

    MPI_Finalize();
    return 1;
}

void mpi_dgemv(const MKL_INT m, const MKL_INT local_m, const double *local_A, const double *u, double *v, double *local_u, double *local_v, int taskid, MPI_Comm comm)
{
    // compute v=A*u in MPI
    CBLAS_LAYOUT layout = CblasColMajor;     // col major
    CBLAS_TRANSPOSE trans = CblasNoTrans;    // no transpose

    MPI_Scatter(u, local_m, MPI_DOUBLE, local_u, local_m, MPI_DOUBLE, MASTER, comm);    // send u_i from node MASTER to all other nodes.
    // printf("scatter finish at taskid=%d\n",taskid);
    // compute A_i * u_i
    cblas_dgemv(layout, trans, m, local_m, 1.0, local_A, m, local_u, 1, 0.0, local_v, 1);

    // Apply a reduction operation on all nodes and place the result in vector v.
    MPI_Reduce(local_v, v, m, MPI_DOUBLE, MPI_SUM, MASTER, comm);
}

void mpi_preconditioner_solver(const MKL_INT m, const MKL_INT local_m, const double *local_M, const double *u, double *v, double *local_u, int taskid, MPI_Comm comm)
{
    int i = 0;
    // printf("begin taskid=%d\n",taskid);
    MPI_Scatter(u, local_m, MPI_DOUBLE, local_u, local_m, MPI_DOUBLE, MASTER, comm);    // send u_i from node MASTER to all other nodes.
    // printf("taskid=%d\n",taskid);
    // compute Mi^(-1)*y_i at each node
    for (i = 0; i < local_m; i++)
        *(local_u + i) /= *(local_M + i);

    // Apply a gather operation on all nodes
    MPI_Gather(local_u, local_m, MPI_DOUBLE, v, local_m, MPI_DOUBLE, MASTER, comm);
}

Compile it with:

$ . /opt/intel/bin/compilervars.sh intel64

$ mpicc gmres_test.c -o gmres_test -lmkl_rt

---------------------
Author: chenjun15  Source: CSDN  Original: https://blog.csdn.net/chenjun15/article/details/75041932
Copyright notice: This is an original post by the author; please include a link to the original when reposting.