OpenMPの実装がシリアルの実装よりも遅い

現在、OpenMPに慣れようとしています。練習のために私はOpenMPで貪欲な "学習"アルゴリズムを実装しました。それから私は私が私のシリアル実装と私のプログラムは、OpenMP 1が遅くなかれ重要でやっているどのように多くの反復に関係なくと比較OpenMPの実装がシリアルの実装よりも遅い

time ./a.out

との時間を測定しました。それはこの実装で12Sを取った私の自宅のPCで

#include <omp.h> 
#include <iostream> 
#include <vector> 
#include <cstdlib> 
#include <cmath> 
#include <stdio.h> 
#include <ctime> 

#define THREADS 4 

using namespace std; 

struct TrainData { 
    double input; 
    double output; 
}; 

//Long Term Memory struct 
struct LTM { 
     double a; //paramter a of the polynom 
     double b; 
     double c; 
     double score; //score to be minimized! 

     LTM() 
     { 
      a=0; 
      b=0; 
      c=0; 
      score=0; 
     } 

     //random LTM with paramters from low to high (including low and high) 
     LTM(int low, int high) 
     { 
      score=0; 
      a= rand() % high + low; 
      b= rand() % high + low; 
      c= rand() % high + low; 

     } 

     LTM(double _a, double _b, double _c) 
     { 
      a=_a; 
      b=_b; 
      c=_c; 
     } 

     void print() 
     { 
      cout<<"Score: "<<score<<endl; 
      cout<<"a: "<<a<<" b: "<<b<<" c: "<<c<<endl; 
     } 
}; 

//the acutal polynom function evaluating with passed LTM 
inline double evaluate(LTM &ltm, const double &x) 
{ 
    double ret; 
    ret = ltm.a*x*x + ltm.b*x + ltm.c; 

    return ret; 
} 


//scoring function calculates the Root Mean Square error (RMS) 
inline double score_function(LTM &ltmnew, vector<TrainData> &td) 
{ 
    double score; 
    double val; 
    int tdsize=td.size(); 
    score=0; 

    for(int i=0; i< tdsize; i++) 
    { 
     val = (td.at(i)).output - evaluate(ltmnew, (td.at(i)).input); 
     val *= val; 
     score += val; 
    } 

    score /= (double)tdsize; 

    score = sqrt(score); 

    return score; 
} 

LTM iterate(int iterations, vector<TrainData> td, int low, int high) 
{ 
    LTM fav = LTM(low,high); 
    fav.score = score_function(fav, td); 
    fav.print(); 
    LTM favs[THREADS]; // array for collecting the favorites of each thread 

    #pragma omp parallel num_threads(THREADS) firstprivate(fav, low, high, td) 
    { 
     #pragma omp master 
     printf("Threads: %d\n", omp_get_num_threads()); 

     LTM cand; 
     #pragma omp for private(cand) 
     for(int i=0; i<iterations; i++) 
     { 
      cand = LTM(low, high); 
      cand.score = score_function(cand, td); 

      if(cand.score < fav.score) 
       fav = cand; 
     } 

     //save the favorite before ending the parallel section 
     #pragma omp critical 
     favs[omp_get_thread_num()] = fav; 
    } 

    //search for the best one in the array 
    for(int i=0; i<THREADS; i++) 
    { 
     if(favs[i].score < fav.score) 
      fav=favs[i]; 
    } 

    return fav; 
} 

//generate training data from -50 up to 50 with the train LTM 
void generateTrainData(vector<TrainData> *td, LTM train) 
{ 
    #pragma omp parallel for schedule(dynamic, 25) 
    for(int i=-50; i< 50; i++) 
    { 
     struct TrainData d; 
     d.input = i; 
     d.output = evaluate(train, (double)i); 
     #pragma omp critical 
     td->push_back(d); 

     //cout<<"input: "<<d.input<<" -> "<<d.output<<endl; 
    } 

} 

int main(int argc, char *argv[]) 
{ 

    int its= 10000000; //number of iterations 
    int a=2; 
    int b=4; 
    int c=6; 

    srand(time(NULL)); 
    LTM pol = LTM(a,b,c); //original polynom parameters 
    vector<TrainData> td; 

    //first genarte some training data and save it to td 
    generateTrainData(&td, pol); 

    //try to find the best solution 
    LTM fav = iterate(its, td, 1, 6); 


    printf("Final: a=%f b=%f c=%f score: %f\n", fav.a, fav.b, fav.c, fav.score); 

    return 0; 
}

：ここ

は私のコードで、コメントがうまくいけば、すべてを説明する必要があります。シリアルは6秒しかありません。反復回数を係数10で増やすと、約2分/ 1分（omp/serial）になります。

誰でも手伝ってもらえますか？

出典

2017-05-19 Talto

'rand（）'は、グローバルな状態を使用しているため、並行コンテキストでは使用できません。代わりに 'drand48_r（）'を使用してください。 –

私の最初の質問のおかげで、私はパフォーマンスの問題を解決することができました。

コメントのように問題は、私が使用していたrand（）関数だと言いました。私はそれらを適切なスレッドセーフdrand48_r（）に置き換えました。

同様：

... 
LTM(double low, double high, struct drand48_data *buff) 
{ 
    score=0; 
    double x; 
    drand48_r(buff,&x); 
    a= low + x * (high - low); 
    drand48_r(buff,&x); 
    b= low + x * (high - low); 
    drand48_r(buff,&x); 
    c= low + x * (high - low); 

} 
...

今私は1秒未満の時間を持って！ありがとう！ :)

出典

2017-05-19 12:08:45 Talto

また、より良いAPIを備えた[C++ 11ランダムジェネレータ]（http://en.cppreference.com/w/cpp/numeric/random）を使用して、PRNGアルゴリズムの選択肢を増やすことができます。出力を異なる統計分布に適合させる。ランダムなエンジン変数を '#pragma omp parallel'ブロック内に宣言するだけで、各スレッドのローカルになります。 – Wyzard

OpenMPの実装がシリアルの実装よりも遅い

答えて

関連する問題