linux多執行緒程式設計--對三層for迴圈的優化
阿新 • • 發佈:2019-01-23
目標:將下面3層for迴圈的程式碼進行優化:
// Note from the original post: after changing the array to a vector, the
// reported run time of this program is 88 s.
//
// Single-threaded baseline: fills a 1000 x 5000 table and then recomputes
// every cell with acculate().
#include <iostream>
#include <vector>
#include <pthread.h>

using namespace std;

typedef vector< vector<long> > lvec;

// Not referenced anywhere in this program.
long arr[1000][5000] = {};

long acculate(int, int);
lvec& forarr(lvec&, long, long);

int main()
{
    lvec ivec;
    ivec.resize(1000);

    // Initialise every cell with its own column index.
    for (long i = 0; i < 1000; ++i) {
        ivec[i].reserve(5000);  // one allocation per row instead of repeated growth
        for (long j = 0; j < 5000; ++j)
            ivec[i].push_back(j);
    }
    cout << ivec[999][4999] << endl;

    // forarr() covers columns [0, 2500 + delta), so delta = 2500 processes the
    // whole row in one call. forarr() works in place, so its return value does
    // not need to be assigned back to ivec.
    for (long i = 0; i < 1000; ++i)
        forarr(ivec, i, 2500);

    cout << ivec[999][4999] << endl;
    return 0;
}

// Recomputes columns [0, 2500 + delta) of row f of in_vec in place.
//
// in_vec: table to update; row f must hold at least 2500 + delta elements.
// f:      row (frame) index.
// delta:  number of columns to process beyond the first 2500.
// Returns in_vec itself.
lvec& forarr(lvec& in_vec, long f, long delta)
{
    const long last = 2500 + delta;
    for (long j = 0; j < last; ++j)
        in_vec[f][j] = acculate(f, j);
    return in_vec;
}

// Returns the value of point n in frame m: the sum 0 + 1 + ... + 4999 plus m * n.
long acculate(int m, int n)
{
    long sum(0);
    for (long k = 0; k < 5000; ++k)
        sum += k;
    // Multiply in long so that large m and n do not overflow int.
    return sum + static_cast<long>(m) * n;
}
下面用8個執行緒平行計算後,執行時間為11s,
直接貼程式碼:
// Heading preserved from the original post: "My thoughts:" (it introduces the
// notes that follow this program).
//
// Multi-threaded version: every row (frame) of a 1000 x 5000 table is
// recomputed by 8 POSIX threads, each owning a contiguous range of columns.
#include <iostream>
#include <vector>
#include <pthread.h>
#include <sstream>

using namespace std;

typedef vector< vector<long> > lvec;

// Not referenced anywhere in this program.
long arr[1000][5000] = {};

// Arguments handed to one worker thread.
struct para
{
    lvec* longvec;                // table to update
    long f;                       // row (frame) index
    long start_state, end_state;  // half-open column range [start_state, end_state)
};

long acculate(int, int);
void forarr0(lvec&, long, long, long);
void* forarr(void*);
vector<int> aver (int, int);

int main()
{
    lvec ivec;
    ivec.resize(1000);

    // Initialise every cell with its own column index.
    for (long i = 0; i < 1000; ++i) {
        ivec[i].reserve(5000);  // one allocation per row instead of repeated growth
        for (long j = 0; j < 5000; ++j)
            ivec[i].push_back(j);
    }
    cout << ivec[999][4999] << endl;

    // Run with 8 threads; the author found a thread count equal to the number
    // of processors to work best.
    const int threads_num = 8;

    // Chunk sizes per thread, turned into cumulative end offsets:
    // after the loop i_vec[k] is the exclusive end column of thread k.
    vector<int> i_vec = aver(5000, threads_num);
    for (int i = 1; i < threads_num; ++i)
        i_vec[i] += i_vec[i - 1];

    // The column ranges do not depend on the frame, so build the argument
    // records once. The start of a range is selected by the THREAD index:
    // thread 0 starts at column 0, thread k at the end of thread k - 1.
    vector<struct para> struct_vec(threads_num);
    for (int ii = 0; ii < threads_num; ++ii) {
        struct_vec[ii].longvec = &ivec;
        struct_vec[ii].f = 0;
        struct_vec[ii].start_state = (ii == 0) ? 0 : i_vec[ii - 1];
        struct_vec[ii].end_state = i_vec[ii];
    }
    vector<pthread_t> pthreadT_vec(threads_num);

    for (long i = 0; i < 1000; ++i) {
        // No worker is running here (all were joined below), so the shared
        // argument records can be updated safely.
        for (int ii = 0; ii < threads_num; ++ii)
            struct_vec[ii].f = i;

        // Start all workers for this frame.
        int started = 0;
        int err = 0;
        for (; started < threads_num; ++started) {
            err = pthread_create(&pthreadT_vec[started], NULL, forarr, &struct_vec[started]);
            if (err != 0)
                break;
        }

        // Wait for every worker that was actually started.
        for (int ii = 0; ii < started; ++ii)
            pthread_join(pthreadT_vec[ii], NULL);

        if (err != 0) {
            cerr << "failed to create thread (error " << err << ")" << endl;
            return 1;
        }
    }
    cout << ivec[999][4999] << endl;
    return 0;
}

// Recomputes columns [st, ed) of row f of in_vec in place.
void forarr0(lvec& in_vec, long f, long st, long ed)
{
    for (long j = st; j < ed; ++j)
        in_vec[f][j] = acculate(f, j);
}

// Thread entry point. paralist must point to a struct para that stays valid
// until the thread has been joined. Always returns NULL.
void* forarr(void* paralist)
{
    struct para* p = static_cast<struct para*>(paralist);
    forarr0(*p->longvec, p->f, p->start_state, p->end_state);
    return NULL;
}

// Returns the value of point n in frame m: the sum 0 + 1 + ... + 4999 plus m * n.
long acculate(int m, int n)
{
    long sum(0);
    for (long k = 0; k < 5000; ++k)
        sum += k;
    // Multiply in long so that large m and n do not overflow int.
    return sum + static_cast<long>(m) * n;
}

// Splits x items into y chunk sizes that differ by at most one: the first
// x % y chunks receive one extra item. Returns an empty vector when y <= 0.
vector<int> aver (int x, int y)
{
    vector<int> invec;
    if (y <= 0)
        return invec;  // nothing to distribute over; also avoids dividing by zero

    const int m = x / y, n = x % y;
    invec.reserve(y);
    for (int i = 0; i < y; ++i)
        invec.push_back(i < n ? m + 1 : m);
    return invec;
}
1 當執行緒數小於等於cpu數時,執行時間與執行緒數基本成反比;我的電腦是4核8cpu(即8個邏輯cpu)的,故用8個執行緒最為合適;執行緒數再加大,多餘的執行緒其實還是處於等待狀態;
2 本程式只用到了linux下多執行緒處理的兩個最基本函式pthread_create和pthread_join,因為這裡各執行緒寫入的資料範圍互不重疊,執行緒間資料的讀寫沒有衝突;當可能產生衝突時,就要用到加鎖技術,見之後的學習。