C＃TPLよりもC++ PPLの方が速いですか？

私はTPLのParallel.ForEach PPLのparallel_for_each対を比較するために、フィボナッチfonctionを使用し、非常に簡単なアプリをコード化され、その結果が8つのコアを搭載したPC上で、本当に変だった、C＃のは、C++ 11秒速くです。C＃TPLよりもC++ PPLの方が速いですか？

vs2010とvs 2011の両方のプレビューで同じ結果が得られます。

は

C＃コード：

using System; 
using System.Collections.Generic; 
using System.Linq; 
using System.Text; 
using System.Collections.Concurrent; 
using System.Threading.Tasks; 
using System.Diagnostics; 

namespace ConsoleApplication1 
{ 
    class Program 
    { 

      static void Main(string[] args) 
      { 
       var ll = new ConcurrentQueue<Tuple<int, int>>(); 
       var a = new int[12] { 40, 41, 42, 43, 44, 45, 46, 47, 35, 25, 36, 37 }; 

       long elapsed = time_call(() => 
       { 
        Parallel.ForEach(a, (n) => { ll.Enqueue(new Tuple<int, int>(n, fibonacci(n))); }); 
       }); 

       Console.WriteLine("TPL C# elapsed time: " + elapsed + "\n\r"); 
       foreach (var ss in ll) 
       { 
        Console.WriteLine(String.Format("fib<{0}>: {1}", ss.Item1, +ss.Item2)); 
       } 

       Console.ReadLine(); 
      } 

      static long time_call(Action f) 
      { 
       var p = Stopwatch.StartNew(); 
       p.Start(); 
       f(); 
       p.Stop(); 
       return p.ElapsedMilliseconds; 
      } 

      Computes the nth Fibonacci number. 
      static int fibonacci(int n) 
      { 
       if (n < 2) return n; 
       return fibonacci(n - 1) + fibonacci(n - 2); 
      } 
     } 
    }

C++コード：

#include <windows.h> 
#include <ppl.h> 
#include <concurrent_vector.h> 
#include <array> 
#include <tuple> 
#include <algorithm> 
#include <iostream> 

using namespace Concurrency; 
using namespace std; 

template <class Function> 
__int64 time_call(Function&& f) { 
    __int64 begin = GetTickCount(); 
    f(); 
    return GetTickCount() - begin; 
} 

// Computes the nth Fibonacci number. 
int fibonacci(int n) { 
    if (n < 2) return n; 
    return fibonacci(n-1) + fibonacci(n-2); 
} 

int wmain() { 
    __int64 elapsed; 
    array<int, 12> a ={ 40, 41, 42, 43, 44, 45, 46, 47, 35, 25, 36, 37 }; 
    concurrent_vector<tuple<int,int>> results2; 

    elapsed = time_call([&]{ 
     parallel_for_each(a.begin(), a.end(), [&](int n) { 
      results2.push_back(make_tuple(n, fibonacci(n))); 
     }); 
    }); 

    wcout << L"PPL time: " << elapsed << L" ms" << endl << endl; 
    for_each (results2.begin(), results2.end(), [](tuple<int,int>& pair) { 
     wcout << L"fib(" << get<0>(pair) << L"): " << get<1>(pair) << endl; 
    }); 

    cin.ignore(); 
}

あなたは私のC++コードの一部は、私が間違っmのところ、私をポイントしてくださいできますか？

task_group tasks; 
    elapsed = time_call([&] 
    { 
     for_each(begin(a), end(a), [&](int n) 
     { 
      tasks.run([&,n]{results2.push_back(make_tuple(n, fibonacci(n)));}); 
     }); 
     tasks.wait();

出典

2012-01-03 user1127781

適切にフォーマットされたソースコードを投稿してください。これらのステートメントの間の空白のリンクは、私がかなり多くスクロールするように強制するので、特に刺激的です。 –

私が飛び交うのは、あなたがあなたの例でさまざまなコレクションを使用しているということです。 C＃バージョンはキューを使用し、C++バージョンはベクトルを使用します。 –

また、各例で 'fibonacci'関数の呼び出しだけをタイムアウトしましたか？ –

のGetTickCount（http://msdn.microsoft.com/en-us/library/windows/desktop/ms724408%28v=vs：

幅group_taskは、私はC＃のコードのように同じ時間を持っています。 85％29.aspx）関数は、ネイティブ側で渡された時間を測定するために使用されていますが、まったく正確ではありません。説明はこうです：

"GetTickCount関数の解像度は、通常は10ミリ秒から16ミリ秒の範囲内にあるシステムタイマの解像度に制限されています。

GetSystemTimeを使用している私の経験からは、Windows Vista上ではより良い結果が得られます（XPが正しく覚えていれば勝利XPではチックカウントと同じ解像度になります）。または、サブミルスの解像度を提供するAPIのいくつかをきめ細かく測定するために使用できます。おそらく大規模なデータセットを構築することは、有意義なデータを得るために役立つでしょう。ここで

出典

2012-01-03 14:14:23 Ghita

parallel_for_eachのC++コードを task_groupに変更した場合、問題はGetTickCount関数でもコンテナでもないと思います。 – user1127781

OPによると、両者の間に11秒の差があるので、おそらくティックカウントの精度には関係しません。 –

はラーフルVパティルマイクロソフトチームによる説明

こんにちは、これを起動するための

おかげです。実際には、がデフォルトの並列for *に関連付けられたオーバーヘッドを特定しました。特に、の反復回数が少なく、ワークサイズが可変である場合に発生します。のデフォルトの並列処理は、作業を8つのコア（8つのコア）に分割して開始します。作業が終了すると、作業は動的に負荷分散されます。ほとんどの場合（多くの場合、の反復回数が多い）、反復処理の基礎となる作業がうまくいかない場合は、が理解されます（ライブラリーを呼び出すとします）が、受け入れられないオーバーヘッドを伴う場合があります。

解決策は、代替の実装で確認したとおりです。そのためには、Visual Studioの次のバージョンでは「」という並列化プログラムを「シンプル」としてください。具体的にはという具合に、より良いパフォーマンスのがあります。

PSは： - ので、あなたがワークロードに応じてわずかに異なる性能特性を見ることができます彼らは反復回数を経る方法を各実装のためのC＃とC++並列に少し異なるアルゴリズムを使用します。それは使用しているので、

フィボナッチは、メモリの過度の量を使用するプロセスを引き起こし計算する再帰を使用：

よろしく

出典

2012-01-04 08:12:42 user1127781

はのは、それらを一つずつ解決しましょう、あなたのコードを持ついくつかの問題があります結果を計算するコールスタック。再帰的なフィボナッチを持つということは、C＃/ C++の並列フレームワークを比較していないことを意味しています。コールスタックメカニズムを比較しています。フィボナッチをもっと速く計算することができます：

int fibonacci(int n) 
{ 
    int curr = 1, prev = 0, total = 0; 
    for (int i = 0; i < n; i++) 
    { 
     int pc = curr; 
     curr += prev; 
     total += curr; 
     prev = pc; 
    } 
    return total; 
}

この関数を使用すると、測定可能な値を取得するために少なくとも100万回実行する必要がありました。代わりのGetTickCountの

使用GetTickCount64：

template <class Function> 
__int64 time_call(Function&& f) 
{ 
    __int64 begin = ::GetTickCount64(); 
    f(); 
    return GetTickCount64() - begin; 
}

ような小さな体でのparallel_forを実行すると、パフォーマンスに実際に有害である可能性があります。より細かい関数を使用する方が良いでしょう。心の中でこれらの形質に

は、ここでのコードはC++である：間の数値のため1200万フィボナッチ計算を実行するための

using System; 
using System.Collections.Generic; 
using System.Linq; 
using System.Text; 
using System.Threading.Tasks; 
using System.Collections.Concurrent; 
using System.Diagnostics; 

namespace ParallelFiboCS 
{ 
    class Program 
    { 
     const int NUMBER_OF_REPETITIONS = 1000000; 
     const int MIN_FIBO = 37; 
     const int MAX_FIBO = 49; 
     static void Main(string[] args) 
     { 
      var ll = new ConcurrentQueue<Tuple<int, int>>(); 

      var a = new int[MAX_FIBO - MIN_FIBO]; 
      for (int i = MIN_FIBO; i < MAX_FIBO; i++) 
      { 
       a[i - MIN_FIBO] = i; 
      } 

      long elapsed = time_call(() => 
      { 
       Parallel.ForEach(a, (n) => 
       { 
        for (int i = 0; i < NUMBER_OF_REPETITIONS; i++) 
        { 
         ll.Enqueue(new Tuple<int, int>(n, fibonacci(n))); 
        } 
       }); 
      }); 

      Console.WriteLine("TPL C# elapsed time: " + elapsed + "\n\r"); 

      Console.ReadLine(); 
     } 

     static long time_call(Action f) 
     { 
      var p = Stopwatch.StartNew(); 
      p.Start(); 
      f(); 
      p.Stop(); 
      return p.ElapsedMilliseconds; 
     } 

     static int fibonacci(int n) 
     { 
      int curr = 1, prev = 0, total = 0; 
      for (int i = 0; i < n; i++) 
      { 
       int pc = curr; 
       curr += prev; 
       total += curr; 
       prev = pc; 
      } 
      return total; 
     } 
    } 
}

平均時間：

// ParallelFibo.cpp : Defines the entry point for the console application. 
// 

#include "stdafx.h" 
#include <windows.h> 
#include <ppl.h> 
#include <concurrent_vector.h> 
#include <array> 
#include <tuple> 
#include <algorithm> 
#include <iostream> 
#include <random> 

using namespace Concurrency; 
using namespace std; 

template <class Function> 
__int64 time_call(Function&& f) 
{ 
    __int64 begin = ::GetTickCount64(); 
    f(); 
    return GetTickCount64() - begin; 
} 

// Computes the nth Fibonacci number. 
inline int fibonacci(int n) 
{ 
    int curr = 1, prev = 0, total = 0; 
    for (int i = 0; i < n; i++) 
    { 
     int pc = curr; 
     curr += prev; 
     total += curr; 
     prev = pc; 
    } 
    return total; 
} 

#define NUMBER_OF_REPETITIONS 1000000 
#define MIN_FIBO    37 
#define MAX_FIBO    49 

int main() 
{ 
    __int64 elapsed; 
    vector<int> v; 
    for (int i = MIN_FIBO; i < MAX_FIBO; i++) 
    { 
     v.emplace_back(i); 
    } 

    concurrent_vector<tuple<int, int>> results; 
    elapsed = time_call([&] { 
     parallel_for(MIN_FIBO, MAX_FIBO, [&](int n) { 
      for (int i = 0; i < NUMBER_OF_REPETITIONS; i++) 
      { 
       results.push_back(make_tuple(n, fibonacci(n))); 
      } 
     }); 
    }); 
    wcout << L"PPL time: " << elapsed << L" ms" << endl << endl; 
    cin.ignore(); 
    return 0; 
}

そしてここでは、C＃のコードです37と49：

C++：513ms

のC＃：2527ms

出典

2015-07-17 22:59:59

C＃TPLよりもC++ PPLの方が速いですか？

答えて

関連する問題