マルチスレッド化しても速度がまったく向上しない - Cでpthreadを使用 - なぜですか？

マルチスレッドに慣れるために、「負荷の高い」計算を行う小さなCプログラムを書きました。これはマンデルブロ集合の画像を生成するもので、各ピクセルを個別に計算し、計算したピクセルを行単位でバッファに格納します。各スレッドには、全行数を等分した分の行が割り当てられます。たとえばスレッド数に2を指定すると、高さ1000行の画像は500行ずつ2つのブロックに分けて計算されるはずです。そのため計算時間はおよそ半分になると予想していましたが、まったく改善が見られません。なぜでしょうか？すべて正しく動作しており、ロジックにも問題はないように見えるので、理由がわかりません。ヒントをいただけると大変助かります。以下は、main関数と、mainから呼び出されるマンデルブロ集合の計算用の関数です。

/*
 * Store a 32-bit value at dst in little-endian byte order, which is the
 * layout the BMP header fields use. Writing byte by byte avoids storing
 * through a casted, possibly misaligned int pointer.
 */
static void put_le32(unsigned char *dst, unsigned long value)
{
    dst[0] = (unsigned char)(value & 0xFF);
    dst[1] = (unsigned char)((value >> 8) & 0xFF);
    dst[2] = (unsigned char)((value >> 16) & 0xFF);
    dst[3] = (unsigned char)((value >> 24) & 0xFF);
}

/*
 * Write exactly len bytes to fd, retrying after short writes.
 * Returns 0 on success and -1 as soon as write() reports an error or
 * makes no progress.
 */
static int write_all(int fd, const void *buf, size_t len)
{
    const char *pos = buf;

    while (len > 0) {
        ssize_t written = write(fd, pos, len);
        if (written <= 0) {
            return -1;
        }
        pos += written;
        len -= (size_t)written;
    }
    return 0;
}

/*
 * Render a Mandelbrot image as a 24-bit BMP on stdout, splitting the rows
 * across worker threads running myDo2().
 *
 * Arguments: argv[1] = width, argv[2] = height, argv[3] = thread count.
 * The elapsed wall-clock time (thread creation through the last join) is
 * reported on stderr.
 *
 * Returns 0 on success, 1 on bad arguments or a thread-creation failure,
 * 2 on a join failure, 3 on an allocation or output failure.
 */
int main(int argc, char ** argv, char ** envp) {
    (void)envp;

    if (argc != 4) {
        /* stdout carries the binary BMP stream, so diagnostics go to stderr */
        fprintf(stderr, "Bitte genau 3 Argumente eingeben.\n");
        return 1;
    }

    /* Timestamps for the stopwatch */
    struct timeval start, ende;

    width = str2num(argv[1]);
    height = str2num(argv[2]);
    int threads = str2num(argv[3]);

    /* Zero or negative values would lead to a division by zero below */
    if (width <= 0 || height <= 0 || threads <= 0) {
        fprintf(stderr, "Breite, Hoehe und Threadanzahl muessen groesser als 0 sein.\n");
        return 1;
    }
    /* Never use more threads than rows, so every chunk has at least one row */
    if (threads > height) {
        threads = height;
    }

    unsigned char info[BMPHEADER_SIZE] = {
        /* signature, file size (filled in below), reserved, pixel data offset */
        'B','M', 0,0,0,0, 0,0, 0,0, 54,0,0,0,
        /* info header size, width, height (filled in below), planes, bits per pixel */
        40,0,0,0, 0,0,0,0, 0,0,0,0, 1,0, 24,0,
        /* compression, pixel data size (filled in below) */
        0,0,0,0, 0,0,0,0
    };

    /* BMP lines must be of lengths divisible by 4 */
    char span[4] = { 0, 0, 0, 0 };
    int spanBytes = (4 - ((width * 3) % 4)) % 4;
    int psize = ((width * 3) + spanBytes) * height;

    put_le32(&info[2], (unsigned long)(BMPHEADER_SIZE + psize));
    put_le32(&info[18], (unsigned long)width);
    put_le32(&info[22], (unsigned long)height);
    put_le32(&info[34], (unsigned long)psize);

    if (write_all(1, info, BMPHEADER_SIZE) != 0) {
        fprintf(stderr, "Fehler beim Schreiben der Ausgabe\n");
        return 3;
    }

    /*
     * Create the chunks: exactly one per thread. The last chunk also takes
     * the rows left over by the integer division.
     */
    int i;
    int y;
    int reminder = height % threads;
    int blocksize = height / threads;
    int rounds = threads;
    int begin = 1;

    /* presumably getptr() allocates `rounds` threadinfo entries - TODO confirm */
    threadinfo *tinfoptr = getptr(rounds);
    if (tinfoptr == NULL) {
        fprintf(stderr, "Kein Speicher fuer die Thread-Infos\n");
        return 3;
    }

    for (i = 0; i < rounds; ++i) {
        threadinfo *chunk = tinfoptr + i;
        int res = blocksize * (i + 1);
        if (i == rounds - 1) {
            res = res + reminder;
        }

        chunk->from = begin;
        chunk->to = res;
        chunk->span = span;
        chunk->spanBytes = spanBytes;
        chunk->width = width;
        chunk->height = res - begin + 1;
        chunk->results = NULL;
        chunk->threadno = i + 1;
        chunk->blocksizeperthread = -1;

        /* Rows are inclusive on both ends, so the next chunk starts one past
           this chunk's last row; otherwise a row is computed twice. */
        begin = res + 1;
    }

    fprintf(stderr,"inti abgeschlossen, starte threads\n");

    pthread_t myThread[rounds];

    /* Start the stopwatch before the first thread exists: each worker starts
       computing as soon as it is created. */
    gettimeofday(&start, NULL);

    for (i = 0; i < rounds; ++i) {
        fprintf(stderr,"Rufe Thread %d auf\n", i + 1);
        if (pthread_create(&myThread[i], NULL, myDo2, (void*)(tinfoptr + i))) {
            fprintf(stderr, "Error creating thread\n");
            return 1;
        }
    }

    for (i = 0; i < rounds; ++i) {
        /* wait for each worker thread to finish */
        if (pthread_join(myThread[i], NULL)) {
            fprintf(stderr, "Error joining thread\n");
            return 2;
        }
    }
    /* Stop the stopwatch */
    gettimeofday(&ende, NULL);

    /* All workers are done: emit their rows in chunk order. */
    for (i = 0; i < rounds; i++) {
        threadinfo *chunk = tinfoptr + i;
        int l_blocksize = chunk->blocksizeperthread;

        if (chunk->results == NULL) {
            fprintf(stderr, "Thread %d hat keine Ergebnisse geliefert\n", i + 1);
            return 3;
        }
        /* blocksizeperthread holds the index of the last filled row */
        for (y = 0; y <= l_blocksize; y++) {
            /* write the row, then the padding up to a multiple of 4 bytes */
            if (write_all(1, chunk->results[y], (size_t)width * 3) != 0 ||
                write_all(1, span, (size_t)spanBytes) != 0) {
                fprintf(stderr, "Fehler beim Schreiben der Ausgabe\n");
                return 3;
            }
            free(chunk->results[y]);
        }
        free(chunk->results);
        chunk->results = NULL;
    }
    /* tinfoptr is not released here: how getptr() allocates it is not visible. */

    /* Subtract first, then scale, so the seconds part cannot overflow a long */
    long elapsed = (long)(ende.tv_sec - start.tv_sec) * 1000000L
                 + (long)(ende.tv_usec - start.tv_usec);
    fprintf(stderr, "\nDauer: %ld Mikrosekunden\n", elapsed);

    return 0;
}

そして、こちらがmainから呼び出される関数です：

void* myDo2(void* tiptr){ 
threadinfo* mythread = (threadinfo*)tiptr; 
//copy infos from struct to this thread 
int l_from = mythread->from; 
int l_to = mythread->to; 
int l_width = mythread->width; 
int l_height = mythread->height; 
// char **container = createMatrix(l_width*3,l_height); 
char **container = malloc (l_height * sizeof(char*)); 
for(int i = 0; i<l_height; i++){ 
    container[i] = malloc(l_width*3*sizeof(char)); 
} 

int x,y; 
char iterate=0; 
Complex c = {0,0}; 
Complex newz = {0,0}; 
float imageRelation = (float)l_width/(float)height; 
char blueGreenRed[3]; 
    //Ist Buffer für ganze Zeile: Breite * 3 wegen den 3 Bytes pro Pixel 
    char zeile[l_width*3]; 
    int counter = 0; 

for (y=l_from; y <= l_to; ++y) 
{ 
    for (x=1; x <= l_width; ++x) { 
     Complex z = {0,0}; 
     float quad=0; 

     c.re = zoom * (-1.0 + imageRelation * ((x-1.0)/(width-1.0))); 
     c.im = zoom * (0.5 - (y-1.0)/(height-1.0)); 

     // iterate 
     for (iterate=1; iterate < colorLimit && quad < quadLimit; ++iterate) { 
      quad = z.re * z.re + z.im * z.im; 

      newz.re = (z.re * z.re) - (z.im * z.im) + c.re; 
      newz.im = z.re * z.im * 2.0   + c.im; 

      z = newz; 
     } 
     toRGB(iterate, blueGreenRed); 
     //Kopiere 3 Bytes von bgr nach zeile + (x-1)*3 
     //Beachte: Die Variable zeile ist ein character array daher wird (x-1)*3 benutzt um 3 Byte Pakete pro Pixel in die Zeile zu laden. 
     memcpy((zeile + (x-1)*3), blueGreenRed, 3); 
    } 
    memcpy(container[counter], zeile, l_width*3); 
    counter++; 
} 

mythread->blocksizeperthread = counter-1; 
mythread->results = container; 
     fprintf(stderr, "Ich bin Thread-Nr. %d\n", mythread->threadno); 
     fprintf(stderr, "und habe eine Menge Zeilen von %d\n", mythread->blocksizeperthread); 
     fprintf(stderr, "und habe berechnet von %d\n", l_from); 
     fprintf(stderr, "und habe berechnet bis %d\n", l_to); 
return NULL; 
}

あらかじめ、どうもありがとうございます。 jbug

出典

2017-05-16 J. Bug

「マルチスレッド化しても速度がまったく向上しない」 - なぜ向上する**はず**だと考えるのですか？マルチスレッド化によって速度が向上することを保証する文献を示せますか？また、「速度」をどのように定義していますか？CPUサイクルの観点では、**常に**シングルスレッドのプログラムより多くのサイクルを必要とします。 – Olaf

並列処理、つまりマルチコアプロセッサ上でプロセスを別々のコアに振り分けることを検討してみてはどうでしょうか？これは、正しく実装され、かつ相応に重い処理を必要とするアルゴリズムであれば、測定可能な効率向上をもたらします。その場合は、これに関する[こちらの質問と議論](http://stackoverflow.com/q/19324306/645128)を参照してください。簡単に言えば、スレッドは、アプリケーションを実行しているOSが割り当てる同じ時間・処理能力を分け合います。プロセスを別々のコアに分けることで、それらを真に並列に実行でき、効率が向上します。 – ryyker

実際の同時実行をサポートするハードウェアがない可能性があります。コアの数とお使いのOSのアーキテクチャを確認してください。 –

答えは、このモデルはきちんと機能する、ということです。ただし、スレッドの開始・停止・同期のオーバーヘッドに見合うだけの十分な仕事を、各スレッドに与える必要があります。また、複数のスレッドを同時に実行できるコンピュータ（マルチコアマシン）で実行する必要があります。

あなたが提示したアプリケーションを、実際にコンパイルできるように修正しました。利用可能なCPUコアが多数あるLinuxマシン上でこれを実行し、myDo2ワーカースレッドに十分な仕事を与えたところ、次のような結果になりました。

./test width height num_threads 
./test 10000 10000 1 
Dauer: 17,660,185 Mikrosekunden 

./test 10000 10000 2 
Dauer: 7,864,508 Mikrosekunden 

./test 10000 10000 8 
Dauer: 1,100,126 Mikrosekunden

つまり、8スレッドにすると、全体の実時間（ウォールクロック時間）が17.6秒から1.1秒に短縮されたことになります。これは8倍を超える改善です（メモリとキャッシュの利用効率が向上したためです）。

しかし、もし私が各スレッドにあまり仕事を与えなければ、私の時間は改善されていないように見え、実際にはある時点で悪化していきます。

./test 10 10 1 
Dauer: 70 Mikrosekunden 

./test 10 10 2 
Dauer: 60 Mikrosekunden 

./test 10 10 4 
Dauer: 205 Mikrosekunden

ここでは、スレッドを開始・停止し、同期するためのオーバーヘッドが、スレッド内部で行われる作業量よりも大きくなっていることがわかります。

プログラミングモデルは機能しますが、正しく使用する必要があります。

このコードはRedHat上で、次のコマンドを使ってコンパイルしました：

gcc -std=gnu99 test.c -o test -lpthread

#include <stdio.h> 
#include <stdlib.h> 
#include <sys/time.h> 
#include <pthread.h> 
#include <string.h> 

/*
 * Per-thread work description, filled in by main() before the thread is
 * created and completed by myDo2() with the computed rows.
 */
typedef struct _threadinfo 
{ 
    int from;               /* first image row to compute (1-based, inclusive) */
    int to;                 /* last image row to compute (inclusive) */
    int width;              /* pixels per row; each row buffer holds width*3 bytes */
    int height;             /* number of rows in this chunk; myDo2 allocates this many row buffers */
    int blocksizeperthread; /* set by main to -1; myDo2 stores (rows computed - 1) here */
    char **results;         /* set by main to NULL; myDo2 stores its array of row buffers here */
    int threadno;           /* 1-based thread number, used in the log messages */
} threadinfo; 

/* Complex number as used by the z = z*z + c iteration in myDo2(). */
typedef struct _cplx 
{ 
    float re;   /* real part */
    float im;   /* imaginary part */
} Complex; 

/*
 * Worker thread entry point: runs the Mandelbrot iteration for the rows
 * from..to (1-based, inclusive) described by the threadinfo in tiptr.
 *
 * On success the row buffers (width*3 bytes each) are published in
 * ->results and ->blocksizeperthread is set to the index of the last
 * filled row. If the chunk is empty or memory cannot be allocated,
 * ->results stays NULL and ->blocksizeperthread stays -1.
 * Always returns NULL.
 */
void* myDo2(void *tiptr)
{
    threadinfo *mythread = (threadinfo *)tiptr;
    /* copy the chunk description from the struct into locals */
    int l_from = mythread->from;
    int l_to = mythread->to;
    int l_width = mythread->width;
    int l_height = mythread->height;

    /* "nothing computed" state, kept on every failure path */
    mythread->results = NULL;
    mythread->blocksizeperthread = -1;

    if (l_width <= 0 || l_height <= 0)
    {
        fprintf(stderr, "Thread %d: leerer Block\n", mythread->threadno);
        return NULL;
    }

    /* one buffer per row of this chunk */
    char **container = malloc((size_t)l_height * sizeof *container);
    if (container == NULL)
    {
        fprintf(stderr, "Thread %d: kein Speicher\n", mythread->threadno);
        return NULL;
    }
    for (int i = 0; i < l_height; i++)
    {
        container[i] = malloc((size_t)l_width * 3);
        if (container[i] == NULL)
        {
            /* release what was allocated so far and give up */
            while (i > 0)
            {
                free(container[--i]);
            }
            free(container);
            fprintf(stderr, "Thread %d: kein Speicher\n", mythread->threadno);
            return NULL;
        }
    }

    int x, y;
    char iterate = 0;
    Complex c = { 0, 0 };
    Complex newz = { 0, 0 };
    /* NOTE(review): the aspect ratio and the c.im mapping below use this
       chunk's row count (l_height), because threadinfo carries no total
       image height - confirm whether per-chunk scaling is intended. */
    float imageRelation = (float)l_width/(float)l_height;
    int counter = 0;
    float zoom = 1.0;
    float colorLimit = 10.0;
    float quadLimit = 10.0;

    /* counter < l_height keeps the writes inside the allocated rows even if
       from/to and height disagree */
    for (y = l_from; y <= l_to && counter < l_height; ++y)
    {
        /* pixels go straight into the result row, no intermediate line buffer */
        char *row = container[counter];

        for (x = 1; x <= l_width; ++x)
        {
            Complex z = { 0, 0 };
            float quad = 0;

            c.re = zoom * (-1.0 + imageRelation * ((x - 1.0)/(l_width - 1.0)));
            c.im = zoom * (0.5 - (y - 1.0)/(l_height - 1.0));

            /* iterate z = z*z + c until the limit is reached or |z|^2 escapes */
            for (iterate = 1; iterate < colorLimit && quad < quadLimit; ++iterate)
            {
                quad = z.re * z.re + z.im * z.im;

                newz.re = (z.re * z.re) - (z.im * z.im) + c.re;
                newz.im = z.re * z.im * 2.0   + c.im;

                z = newz;
            }

            /* No toRGB() is available in this listing. Store the iteration
               count in all three channels so the row holds defined data
               instead of a copy of an uninitialized buffer. Each pixel
               occupies 3 bytes, hence the (x-1)*3 offset. */
            char *pixel = row + (x - 1) * 3;
            pixel[0] = iterate;
            pixel[1] = iterate;
            pixel[2] = iterate;
        }
        counter++;
    }

    mythread->blocksizeperthread = counter - 1;
    mythread->results = container;
    fprintf(stderr, "Ich bin Thread-Nr. %d\n", mythread->threadno);
    fprintf(stderr, "und habe eine Menge Zeilen von %d\n", mythread->blocksizeperthread);
    fprintf(stderr, "und habe berechnet von %d\n", l_from);
    fprintf(stderr, "und habe berechnet bis %d\n", l_to);
    return NULL;
}

/*
 * Parse text as a strictly positive decimal int.
 * Returns 0 and stores the value in *out on success; returns -1 if text
 * is not a number, has trailing characters, is not positive, or does not
 * fit into an int.
 */
static int parse_positive_int(const char *text, int *out)
{
    char *end;
    long value = strtol(text, &end, 10);

    if (end == text || *end != '\0' || value <= 0 || value != (long)(int)value)
    {
        return -1;
    }
    *out = (int)value;
    return 0;
}

/*
 * Benchmark driver: splits `height` rows into one chunk per thread, runs
 * myDo2() on every chunk and reports the elapsed wall-clock time
 * (thread creation through the last join) on stderr.
 *
 * Arguments: argv[1] = width, argv[2] = height, argv[3] = thread count.
 *
 * Returns 0 on success, 1 on bad arguments, an allocation failure or a
 * thread-creation failure, 2 on a join failure.
 */
int main(int argc, char **argv, char **envp)
{
    (void)envp;

    if (argc != 4)
    {
        printf("Bitte genau 3 Argumente eingeben.\n");
        return 1;
    }

    /* Timestamps for the stopwatch */
    struct timeval start, ende;
    int width;
    int height;
    int threads;

    /* Zero, negative or non-numeric values would lead to a division by
       zero in the chunk computation below */
    if (parse_positive_int(argv[1], &width) != 0 ||
        parse_positive_int(argv[2], &height) != 0 ||
        parse_positive_int(argv[3], &threads) != 0)
    {
        fprintf(stderr, "Breite, Hoehe und Threadanzahl muessen positive ganze Zahlen sein.\n");
        return 1;
    }
    /* Never use more threads than rows, so every chunk has at least one row */
    if (threads > height)
    {
        threads = height;
    }

    /*
     * Create the chunks: exactly one per thread. The last chunk also takes
     * the rows left over by the integer division.
     */
    int i;
    int reminder = height % threads;
    int blocksize = height / threads;
    int rounds = threads;
    int begin = 1;

    threadinfo *tinfoptr = malloc(sizeof *tinfoptr * (size_t)rounds);
    if (tinfoptr == NULL)
    {
        fprintf(stderr, "Kein Speicher fuer die Thread-Infos\n");
        return 1;
    }

    for (i = 0; i < rounds; ++i)
    {
        threadinfo *chunk = tinfoptr + i;
        int res = blocksize * (i + 1);
        if (i == rounds - 1)
        {
            res = res + reminder;
        }

        chunk->from = begin;
        chunk->to = res;
        chunk->width = width;
        chunk->height = res - begin + 1;
        chunk->results = NULL;
        chunk->threadno = i + 1;
        chunk->blocksizeperthread = -1;

        /* Rows are inclusive on both ends, so the next chunk starts one past
           this chunk's last row; otherwise a row is computed twice. */
        begin = res + 1;
    }

    fprintf(stderr, "inti abgeschlossen, starte threads\n");

    pthread_t myThread[rounds];

    /* Start the stopwatch before the first thread exists: each worker starts
       computing as soon as it is created. */
    gettimeofday(&start, NULL);

    for (i = 0; i < rounds; ++i)
    {
        fprintf(stderr, "Rufe Thread %d auf\n", i + 1);
        if (pthread_create(&myThread[i], NULL, myDo2, (void *)(tinfoptr + i)))
        {
            fprintf(stderr, "Error creating thread\n");
            return 1;
        }
    }

    for (i = 0; i < rounds; ++i)
    {
        /* wait for each worker thread to finish */
        if (pthread_join(myThread[i], NULL))
        {
            fprintf(stderr, "Error joining thread\n");
            return 2;
        }
    }
    /* Stop the stopwatch */
    gettimeofday(&ende, NULL);

    /* Subtract first, then scale, so the seconds part cannot overflow a long */
    long elapsed = (long)(ende.tv_sec - start.tv_sec) * 1000000L
                 + (long)(ende.tv_usec - start.tv_usec);
    fprintf(stderr, "\nDauer: %ld Mikrosekunden\n", elapsed);

    /* Release the rows the workers produced; blocksizeperthread holds the
       index of the last row a worker filled (-1 if it produced nothing). */
    for (i = 0; i < rounds; ++i)
    {
        threadinfo *chunk = tinfoptr + i;
        if (chunk->results != NULL)
        {
            for (int row = 0; row <= chunk->blocksizeperthread; ++row)
            {
                free(chunk->results[row]);
            }
            free(chunk->results);
        }
    }
    free(tinfoptr);

    return 0;
}

出典

2017-05-16 19:14:59 Russ

とても的確で良い回答です :) ありがとうございました。確実にマルチコアであるとわかっているコンピュータに環境を切り替えました。あなたの回答を見る前は、仮想のLinuxインスタンス内でテストしていました。この仮想インスタンスはクアッドコアのコンピュータ上で動作しているので、（cat /proc/stat で確認したとおり）仮想マシンにはコアが1つしかなくても、スレッドは何らかの形でうまく変換されて本当に並列実行されるのだろうと思い込んでいました。ともあれ、ありがとうございました。 –

マルチスレッド化しても速度がまったく向上しない - Cでpthreadを使用 - なぜですか？

答えて

関連する問題