Linux執行緒qps測試

阿新 • • 發佈：2019-01-10

本實驗源自該開源專案需求

https://github.com/yds086/HereticOS

實驗環境 OS : Centos 7.1 Kernel: 4.6.0 CPU : Intel(R) Xeon(R) CPU E5-2620 v2 @ 2.10GHz （開啟超執行緒） MEM : 48G DDR3 修改如下系統引數以建立儘量多的執行緒

/proc/sys/kernel/pid_max #作業系統執行緒數限制

/proc/sys/kernel/threads-max #作業系統執行緒數

max_user_process（ulimit -u）#系統限制某使用者下最多可以執行多少程序或執行緒

ulimit -s 512 #修改執行緒棧大小

/proc/sys/vm/max_map_count #單程序mmap的限制會影響單個程序可建立的執行緒數

/proc/sys/kernel/threads-max 這個值需要注意下：
4.6.0的核心中，該值：

threads = div64_u64((u64) totalram_pages * (u64) PAGE_SIZE,
				    (u64) THREAD_SIZE * 8UL);

即48GB的記憶體，可以建立的threads-max為:

totalram_pages = 49432356KB/4KB = 12358089

threads = 12358089*4kB / (8kB * 8) = 772380
理論上可以得到的執行緒數目應該是772380，但不知為何，實際threads-max引數最多可以設定到772380/2 = 386190。（註：很可能是因為x86_64核心自3.15起THREAD_SIZE為16kB而非8kB，代入上式得 12358089*4kB / (16kB * 8) = 386190，恰與實測上限一致。）

虛擬碼

// Task model (pseudo-code)
// Total number of completed sleeps, incremented by every task.
long long g_SleepIoCount=0;
// Counter value remembered by the monitor for its previous report.
long long g_SleepIoLastCount=0;
// Body of one simulated task: sleep for a fixed interval, then count one
// completed "IO", and repeat without end.
void IOTask()
{
	while(1)
	{
		Sleep(100); // interval in ms (100, 1000 and 10000 are the listed values)
		++g_SleepIoCount;
	}
}

void TestIo()
{
	//建立一組併發任務
	CreatTask(IOTask,1000000);
	//監測IO計數
	for(;;)
	{
		Sleep(3000)//3s統計一次
		printf("Sleep Iops %d",(g_SleepIoCount-g_SleepIoLastCount)/3);
	}
	
}

測試程式碼

#define _GNU_SOURCE
#include <errno.h>
#include <inttypes.h>
#include <pthread.h>
#include <sched.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
#include <unistd.h>
#include <sys/syscall.h>

/* Sleep interval of every task thread, in milliseconds; set from argv[2] in main(). */
uint32_t g_sleep_ms = 0; 
/* Number of task threads to create; set from argv[1] in main(). */
uint32_t g_threadcnt = 0;
/* Number of task threads that have entered their sleep loop; incremented atomically by sleep_task(). */
uint32_t g_running_threadcnt = 0;
/* Total number of completed sleeps across all task threads; incremented atomically by sleep_task(). */
uint64_t g_SleepIoCount = 0;
/* Set from argv[3]; main() pins itself to a core only when this is >= 0. */
int32_t  g_main_bind = -1;
/* Set from argv[4]; when >= 0 every task thread pins itself to this core index. */
int32_t  g_task_bind = -1;

/* NOTE(review): not referenced anywhere in the visible part of this file. */
#define USE_CORE_BIND 1
/* Sleep for x milliseconds by passing 1000 * x microseconds to usleep(). */
#define MSLEEP(x) usleep(1000 * (x))
/* Atomically add value to *ptr via the GCC __sync builtin; evaluates to the value before the addition. */
#define ATOMIC_FETCH_AND_ADD(ptr,value)    __sync_fetch_and_add((ptr), (value))


/*
 * Thread entry point of one benchmark task.
 *
 * When g_task_bind is non-negative the thread first pins itself to that core.
 * It then registers itself in g_running_threadcnt and loops without end:
 * sleep g_sleep_ms milliseconds, then bump g_SleepIoCount by one.
 *
 * para: unused.
 * Returns NULL only when pinning fails; in that case the thread is never
 * counted in g_running_threadcnt. Otherwise the function does not return.
 */
void *sleep_task(void* para)
{
    (void)para;

    if (g_task_bind >= 0)
    {
        cpu_set_t mask;
        CPU_ZERO(&mask);
        CPU_SET(g_task_bind, &mask);
        /* pthread_setaffinity_np() reports failure with a positive error
         * number, never a negative value, so compare against zero. */
        if (pthread_setaffinity_np(pthread_self(), sizeof(mask), &mask) != 0)
        {
            printf("Bind to Core Error !\n");
            return NULL;
        }
    }

    ATOMIC_FETCH_AND_ADD(&g_running_threadcnt, 1);
    for (;;)
    {
        MSLEEP(g_sleep_ms);
        ATOMIC_FETCH_AND_ADD(&g_SleepIoCount, 1);
    }
}

/*
 * gettid(): return the kernel thread id of the calling thread.
 *
 * glibc 2.30 and later declare gettid() in <unistd.h> when _GNU_SOURCE is in
 * effect; a second, static definition then fails to compile ("static
 * declaration follows non-static declaration"). The local wrapper around the
 * raw system call is therefore only provided when the C library does not
 * already supply the function.
 */
#if defined(__GLIBC__) && defined(__GLIBC_PREREQ) && defined(__USE_GNU)
#  if __GLIBC_PREREQ(2, 30)
#    define PTHREADTEST_LIBC_HAS_GETTID 1
#  endif
#endif

#ifndef PTHREADTEST_LIBC_HAS_GETTID
static inline pid_t gettid(void)
{
    /* syscall() returns long; the thread id always fits in pid_t. */
    return (pid_t)syscall(SYS_gettid);
}
#endif

/*
 * Run a shell command through popen() and append its standard output to the
 * NUL-terminated string already stored in result.
 *
 * cmd:    command line handed to the shell; NULL is ignored.
 * result: NUL-terminated destination string that the output is appended to;
 *         NULL is ignored. The function cannot see the capacity of this
 *         buffer, so the caller must either provide room for 1025 bytes or
 *         know that the command prints less than the buffer can hold.
 *
 * Appending stops once result holds 1024 characters; output beyond that is
 * discarded. If the command cannot be started, a message is printed and
 * result is left untouched.
 */
void execute_cmd(const char *cmd, char *result)
{
    const size_t result_limit = 1024; /* max characters kept in result */
    char line[1024];
    FILE *pipe_fp;
    size_t used;

    if (cmd == NULL || result == NULL)
    {
        return;
    }

    /* The command is passed to popen() directly; copying it into a fixed
     * local buffer first would overflow for long command lines. */
    pipe_fp = popen(cmd, "r");
    if (pipe_fp == NULL)
    {
        printf("popen %s error\n", cmd);
        return;
    }

    used = strlen(result);
    while (used < result_limit && fgets(line, sizeof(line), pipe_fp) != NULL)
    {
        size_t chunk = strlen(line);
        size_t room = result_limit - used;

        /* Truncate the last piece so the total never exceeds the limit. */
        if (chunk > room)
        {
            chunk = room;
        }
        memcpy(result + used, line, chunk);
        used += chunk;
        result[used] = '\0';
    }

    pclose(pipe_fp);
}

/*
 * Print one numeric field of /proc/<id>/status.
 *
 * id:    process/thread id used in the /proc path.
 * field: name grepped for in the status file (for example "VmRSS").
 * what:  wording inserted into the printed message.
 */
static void print_status_field(pid_t id, const char *field, const char *what)
{
    char cmd_string[128] = {0};
    /* Sized for the largest amount execute_cmd() may append (two chunks of
     * up to 1 KiB plus the terminator) instead of the former 128 bytes. */
    char cmd_result[2048] = {0};

    snprintf(cmd_string, sizeof(cmd_string),
             "cat /proc/%u/status | grep %s | cut -d : -f 2 | tr -cd \"[0-9]\"",
             (unsigned)id, field);
    execute_cmd(cmd_string, cmd_result);
    printf("Current Process Used %s %s !!!!\n", cmd_result, what);
}

/*
 * Print the resident memory, virtual memory and thread count of the calling
 * task as read from /proc/<tid>/status, then pause for three seconds.
 */
void print_process_info(void)
{
    pid_t my_pid = gettid();

    print_status_field(my_pid, "VmRSS", "physical memory");
    print_status_field(my_pid, "VmSize", "virtual memory");
    print_status_field(my_pid, "Threads", "threads");

    sleep(3);
}

void main(int argc, void* argv[])
{
    if (argc != 5)
    {
        printf("Usage:$s thread_cnt sleep_ms main_bind task_bind \n", argv[0]);
        return;
    }
    
    g_threadcnt = atoi(argv[1]);
    g_sleep_ms = atoi(argv[2]);
    g_main_bind = atoi(argv[3]);
    g_task_bind = atoi(argv[4]);

    if (g_main_bind >= 0)
    {
        cpu_set_t mask;
        CPU_ZERO(&mask);
        CPU_SET(1, &mask);
        if (pthread_setaffinity_np(pthread_self(), sizeof(mask), &mask) < 0)
        {
            printf("Main Bind to Core Error !\n");
            return;
        }
    }
 
    int ret = 0;
    uint32_t i;
    pthread_t thread;
    for (i = 0; i < g_threadcnt; i++)
    {
        if (i % 5000 == 0)
        {
            printf("Already create %d threads ....\n", i);
        }
    
        ret = pthread_create(&thread, NULL, &sleep_task, NULL);
        if (0 != ret)
        {
            printf("[ERROR]Create thread error, index:%d, ret:%d!!!\n", i, ret);
            return;
        }
    }
    
    //waiting for thread all running
    while (g_running_threadcnt != g_threadcnt)
    {
        printf("Running:%d  -  Total:%d \n", g_running_threadcnt, g_threadcnt);
        sleep(1);
    }
    printf("All the %d threads is running ....\n", g_running_threadcnt);

    print_process_info();
 
    //excute the test
    uint64_t last_cnt = 0;
    int test_cnt = 0;
    for (; test_cnt < 50; test_cnt++)
    {
        sleep(3);
        if (test_cnt != 0)
            printf("Sleep Iops %d  \n",(g_SleepIoCount-last_cnt)/3);
        last_cnt = g_SleepIoCount; // maybe not accurate ...   
    }
   
    print_process_info(); 
    return;
}

Makefile

# Build the thread benchmark binary "test" from pthreadtest.c.
# "clean" is declared phony so that a file named "clean" cannot mask it.
.PHONY: clean

test:pthreadtest.c
	gcc -g -O3 -o test ./pthreadtest.c -lpthread 
clean:
	rm -rf ./test

測試場景

單一核心上可以同時執行的最大執行緒數目

執行 ./test 325000 1000 1 2 ，即建立325000個執行緒，每個任務執行緒sleep 1s，同時主程序繫結在核心1上，任務執行緒均繫結在核心2上。

可以看出，主程序很快就建立完了325000個執行緒，但由於這些執行緒均繫結至核心2上，並且已經執行的執行緒只sleep 1s，導致需要執行的執行緒得不到時間片。（用島主的話說：“那就是被io上限約束了吧，再建立的都餓死，而不是都給點飯吃嗎”） 結論：經過測試，本實驗環境下，在sleep 1s情況下，建立24w左右的執行緒可為極限。

單一程序進行核心繫結，得出單核的qps極限值

執行./test 26000 100 1 2

調整建立的執行緒數目，發現大於26000的執行緒後，建立執行緒將變的困難，和測試場景1屬於同樣的問題。 結論：經過測試，本實驗環境下，在sleep 100ms情況下，單核qps極限值為25w左右，此時任務核心cpu跑滿100%
同樣的，時間擴大10倍至1s，執行緒數目擴大至250000，結果如下：（線性擴大至260000時，程式響應慢）

結論：經過測試，本實驗環境下，在sleep 1000ms情況下，單核qps極限值為24w左右，此時任務核心cpu跑滿100%
我的環境下/proc/sys/kernel/threads-max最多30w左右（記憶體限制），導致無法測試10s的情況

同時執行多個程序，並進行核心繫結

同時執行./test 26000 100 1 2 和 ./test 26000 100 3 4

測試結果：多核心的cpu，在此種測試結果下，qps基本成線性增加，（同時在物理核和超執行緒核心，會有一定影響）

將執行執行緒和sleep時間同時擴大10倍，即同時執行./test 240000 1000 1 2 和 ./test 240000 1000 3 4：測試結果如下

測試結果：多核心的cpu，在此種測試結果下，qps基本成線性增加，（同時在物理核和超執行緒核心，會有一定影響）

單一程序，不進行核心繫結

不進行核心繫結，由linux預設進行排程，理論上應該是25w * 12 的qps，在我的12核心cpu實驗環境下得到如下結果：執行 ./test 190000 100 -1 -1

此時的cpu基本跑滿

如前述所說，和超執行緒也有一定的關係，所以並不一定是完全線性的。

測試結果：19w的執行緒，sleep 100ms，基本可以達到190w的qps，再進一步建立執行緒比較困難。

測試彙總

任務數	sleep 100ms	sleep 1000ms
26000	25w qps	-
250000	-	24w qps
180000	180w qps	-

相關結論：
1）單個cpu核心的qps，可達25w qps；
2）多核心cpu，qps可擴充套件，基本符合線性，但超執行緒需關閉

Linux執行緒qps測試

虛擬碼

測試程式碼

測試場景

單一核心上可以同時執行的最大執行緒數目

單一程序進行核心繫結，得出單核的qps極限值

同時執行多個程序，並進行核心繫結

單一程序，不進行核心繫結

測試彙總

Linux執行緒qps測試

linux執行緒排程方式測試總結

Kafka的多工執行緒消費測試

Linux-執行緒互斥-鎖

Linux-執行緒

【linux執行緒】執行緒安全之條件變數

Linux 執行緒管理

SimpleDateFormat非執行緒安全測試

linux執行緒基礎概念及多執行緒程式設計

Linux 執行緒同步---條件變數

linux執行緒數限制與zabbix監控

記今天學習Linux執行緒遇到的關於sleep(0)的問題

Linux執行緒間死鎖分析

Linux 執行緒 ID 和設定名字

Linux執行緒同步之條件變數pthread_cond_t

c#多執行緒操作測試（阻塞執行緒，結束任務）

Linux執行緒和fork

多執行緒篇——初始Linux執行緒

Linux執行緒取消

【Linux 執行緒】同一個程序中的執行緒共享哪些資源

Linux執行緒qps測試

虛擬碼

測試程式碼

測試場景

單一核心上可以同時執行的最大執行緒數目

單一程序進行核心繫結，得出單核的qps極限值

同時執行多個程序，並進行核心繫結

單一程序，不進行核心繫結

測試彙總

相關推薦