CUDAの行列の列を最大のパフォーマンスで正規化する方法は？

CUDAでマトリックス列を効果的に正規化する方法は？CUDAの行列の列を最大のパフォーマンスで正規化する方法は？

私の行列は列長に格納され、標準サイズは2000x200です。

操作は次のMATLABコードで表すことができます。

A = rand(2000,200); 

A = exp(A); 
A = A./repmat(sum(A,1), [size(A,1) 1]);

これは、Thrust、cuBLASおよび/またはcuNPPによって効果的に行うことができますか？

4つのカーネルを含む迅速な実装を以下に示します。

特に、cublasDgemv（）によって実装された列合計ステップの場合、パフォーマンスを向上させるためにこれらを1または2のカーネルで実行できるかどうかわかりません。

#include <cuda.h> 
#include <curand.h> 
#include <cublas_v2.h> 
#include <thrust/device_vector.h> 
#include <thrust/device_ptr.h> 
#include <thrust/transform.h> 
#include <thrust/iterator/constant_iterator.h> 
#include <math.h> 

struct Exp 
{ 
    __host__ __device__ void operator()(double& x) 
    { 
     x = exp(x); 
    } 
}; 

struct Inv 
{ 
    __host__ __device__ void operator()(double& x) 
    { 
     x = (double) 1.0/x; 
    } 
}; 

int main() 
{ 
    cudaDeviceSetCacheConfig(cudaFuncCachePreferShared); 
    cublasHandle_t hd; 
    curandGenerator_t rng; 
    cublasCreate(&hd); 
    curandCreateGenerator(&rng, CURAND_RNG_PSEUDO_DEFAULT); 

    const size_t m = 2000, n = 200; 
    const double c1 = 1.0; 
    const double c0 = 0.0; 

    thrust::device_vector<double> A(m * n); 
    thrust::device_vector<double> sum(1 * n); 
    thrust::device_vector<double> one(m * n, 1.0); 

    double* pA = thrust::raw_pointer_cast(&A[0]); 
    double* pSum = thrust::raw_pointer_cast(&sum[0]); 
    double* pOne = thrust::raw_pointer_cast(&one[0]); 

    for (int i = 0; i < 100; i++) 
    { 
     curandGenerateUniformDouble(rng, pA, A.size()); 


     thrust::for_each(A.begin(), A.end(), Exp()); 

     cublasDgemv(hd, CUBLAS_OP_T, m, n, 
       &c1, pA, m, pOne, 1, &c0, pSum, 1); 

     thrust::for_each(sum.begin(), sum.end(), Inv()); 

     cublasDdgmm(hd, CUBLAS_SIDE_RIGHT, m, n, pA, m, pSum, 1, pA, m); 
    } 

    curandDestroyGenerator(rng); 
    cublasDestroy(hd); 

    return 0; 
}

出典

2013-01-08 kangshiyin

はい、それはCUDAで効果的に行うことができます。あなたが望むものを達成するために書いたCUDAコードをいくつか表示してください。 – sgarizvi

コードが追加されました。パフォーマンス改善を求める – kangshiyin

M2090の3つのアプローチのパフォーマンスをCUDA 5.0と比較しました。

[173.179米国] CUBLAS実装当該@talonmies
からthrust::reduce_by_key [1
[733.734米国純粋なスラスト実装を示すように。
1. CUBLAS、この場合に最高の性能を持っている、ことを508ミリ秒] thrust::inclusive_scan_by_key
$Performance on A_{2,000 x 200}$

純粋なスラストの実装を見ることができます。
両方thrust::reduce_by_key & thrust::inclusive_scan_by_key余分なオーバーヘッドにつながる複数のカーネルを起動します。
thrust::inclusive_scan_by_keyは、より長いカーネル時間の理由の1つであるthrust::reduce_by_keyと比較して、はるかに多くのデータをDRAMに書き込みます。
cublasとthrustアプローチの主なパフォーマンスの違いは、行列の列の合計です。 thrust::reduce_by_keyは、バリアント長のセグメントの削減を行うように設計されているため、推力はおそらく遅くなりますが、cublas_gemv()は固定長セグメント（行/列）にのみ適用できます。

行列Aがカーネル起動オーバーヘッドを無視するのに十分な大きさである場合、cublas appoachは最高のパフォーマンスを発揮します。 A_ {20,000 x 2,000}のプロファイリング結果を以下に示します。 @talonmiesによって示されるようにcublasSgemv呼び出しで最初for_each動作を融着

$Performance on A_{20,000 x 2,000}$

さらにパフォーマンスを向上させるが、私は手で書かれたカーネルの代わりthrust::reduce_by_keyを使用すべきであると思うかもしれません。

3つのアプローチのコードを以下に示します。

#include <cuda.h> 
#include <curand.h> 
#include <cublas_v2.h> 
#include <thrust/device_vector.h> 
#include <thrust/device_ptr.h> 
#include <thrust/transform.h> 
#include <thrust/reduce.h> 
#include <thrust/scan.h> 
#include <thrust/iterator/counting_iterator.h> 
#include <thrust/iterator/transform_iterator.h> 
#include <thrust/iterator/discard_iterator.h> 
#include <thrust/iterator/permutation_iterator.h> 
#include <math.h> 

struct Exp: public thrust::unary_function<double, double> 
{ 
    __host__ __device__ double operator()(double x) 
    { 
     return exp(x); 
    } 
}; 

struct Inv: public thrust::unary_function<double, double> 
{ 
    __host__ __device__ double operator()(double x) 
    { 
     return (double) 1.0/x; 
    } 
}; 

template<typename T> 
struct MulC: public thrust::unary_function<T, T> 
{ 
    T C; 
    __host__ __device__ MulC(T c) : 
     C(c) 
    { 
    } 
    __host__ __device__ T operator()(T x) 
    { 
     return x * C; 
    } 
}; 

template<typename T> 
struct line2col: public thrust::unary_function<T, T> 
{ 
    T C; 
    __host__ __device__ line2col(T C) : 
      C(C) 
    { 
    } 

    __host__ __device__ T operator()(T i) 
    { 
     return i/C; 
    } 
}; 

int main() 
{ 
    cudaDeviceSetCacheConfig(cudaFuncCachePreferShared); 
    cublasHandle_t hd; 
    curandGenerator_t rng; 
    cublasCreate(&hd); 
    curandCreateGenerator(&rng, CURAND_RNG_PSEUDO_DEFAULT); 

    const size_t m = 2000, n = 200; 
    const double c1 = 1.0; 
    const double c0 = 0.0; 

    thrust::device_vector<double> A(m * n); 
    thrust::device_vector<double> B(m * n); 
    thrust::device_vector<double> C(m * n); 
    thrust::device_vector<double> sum1(1 * n); 
    thrust::device_vector<double> sum2(1 * n); 
    thrust::device_vector<double> one(m * n, 1); 

    double* pA = thrust::raw_pointer_cast(&A[0]); 
    double* pB = thrust::raw_pointer_cast(&B[0]); 
    double* pSum1 = thrust::raw_pointer_cast(&sum1[0]); 
    double* pSum2 = thrust::raw_pointer_cast(&sum2[0]); 
    double* pOne = thrust::raw_pointer_cast(&one[0]); 

    curandGenerateUniformDouble(rng, pA, A.size()); 

    const int count = 2; 

    for (int i = 0; i < count; i++) 
    { 
     thrust::transform(A.begin(), A.end(), B.begin(), Exp()); 
     cublasDgemv(hd, CUBLAS_OP_T, m, n, &c1, pB, m, pOne, 1, &c0, pSum1, 1); 
     thrust::transform(sum1.begin(), sum1.end(), sum1.begin(), Inv()); 
     cublasDdgmm(hd, CUBLAS_SIDE_RIGHT, m, n, pB, m, pSum2, 1, pB, m); 
    } 

    for (int i = 0; i < count; i++) 
    { 
     thrust::reduce_by_key(
       thrust::make_transform_iterator(thrust::make_counting_iterator(0), line2col<int>(m)), 
       thrust::make_transform_iterator(thrust::make_counting_iterator(0), line2col<int>(m)) + A.size(), 
       thrust::make_transform_iterator(A.begin(), Exp()), 
       thrust::make_discard_iterator(), 
       sum2.begin()); 
     thrust::transform(
       A.begin(), A.end(), 
       thrust::make_permutation_iterator(
         sum2.begin(), 
         thrust::make_transform_iterator(thrust::make_counting_iterator(0), line2col<int>(m))), 
       C.begin(), 
       thrust::divides<double>()); 
    } 

    for (int i = 0; i < count; i++) 
    { 
     thrust::inclusive_scan_by_key(
       thrust::make_transform_iterator(thrust::make_counting_iterator(0), line2col<int>(m)), 
       thrust::make_transform_iterator(thrust::make_counting_iterator(0), line2col<int>(m)) + A.size(), 
       thrust::make_transform_iterator(A.begin(), Exp()), 
       C.begin()); 
     thrust::copy(
       thrust::make_permutation_iterator(
         C.begin() + m - 1, 
         thrust::make_transform_iterator(thrust::make_counting_iterator(0), MulC<int>(m))), 
       thrust::make_permutation_iterator(
         C.begin() + m - 1, 
         thrust::make_transform_iterator(thrust::make_counting_iterator(0), MulC<int>(m))) + n, 
       sum2.begin()); 
     thrust::transform(
       A.begin(), A.end(), 
       thrust::make_permutation_iterator(
         sum2.begin(), 
         thrust::make_transform_iterator(thrust::make_counting_iterator(0), line2col<int>(m))), 
       C.begin(), 
       thrust::divides<double>()); 
    } 

    curandDestroyGenerator(rng); 
    cublasDestroy(hd); 

    return 0; 
}

出典

2013-01-09 08:41:15 kangshiyin

あなたは、あなたが同様に推力でこれを行うことができ、次のように

array A = randu(2000, 2000); 
A = exp(A); 
A /= tile(sum(A, 0), A.dims(0), 1);

をArrayFireを使用することができます。しかし、（プレーンベクタとは対照的に）行列を使って作業するなら、効率的ではないforループで行う必要があります。

免責事項私はAccelereyesの開発元であり、arrayfireに取り組んでいます。

EDIT私は、要求通りに新しいベンチマークを生成するために取り組んでいます。

EDITexpのパフォーマンスバグは、このベンチマークのためにコードで見つかりました。私たちはそれを見直して修正しています。

出典

2013-01-08 23:24:59

ありがとう！コードがMatlabのようにシンプルになることは印象的です。また、あなたのコードのパフォーマンスを私のものと比較できますか？私はArrayFireライブラリを手に持っていないので。 – kangshiyin

@EricShiyinKang結果が更新されました。 –

あなたのベンチマークコードには、cublas/thrustアプローチのプールタイミング結果につながる問題があると思います。ここに変更された[bench.cu]（https://gist.github.com/786a7ea0597be5ab10d3） – kangshiyin

最初のfor_eachオペレーションをcublasSgemvコールと融合させて、単一のreduce_by_keyコールにすることができるはずです。

struct Accessor : public thrust::unary_function<int,int> 
{ 
    int lda; 
    __host__ __device__ Accessor(int _lda) : lda(_lda) {}; 
    __host__ __device__ int operator()(const int& idx) 
    { 
     return idx/lda; 
    } 
}; 

struct Exp : public thrust::unary_function<double,double> 
{ 
    __host__ __device__ double operator()(const double& x) 
    { 
     return exp(x); 
    } 
}; 

struct Inv : public thrust::unary_function<double,double> 
{ 
    __host__ __device__ double operator()(const double& x) 
    { 
     return double(1.0)/x; 
    } 
};

あなたはその後、

Accessor columns(m); 
thrust::reduce_by_key(
     thrust::make_transform_iterator(thrust::make_counting_iterator(int(0)), columns), 
     thrust::make_transform_iterator(thrust::make_counting_iterator(int(m*n)), columns), 
     thrust::make_transform_iterator(A.begin(), Exp()), 
     thrust::make_discard_iterator(), 
     sum.begin()); 

thrust::for_each(sum.begin(), sum.end(), Inv()); 

cublasDdgmm(hd, CUBLAS_SIDE_RIGHT, m, n, pA, m, pSum, 1, pA, m);

として正規化された出力を計算することができます：あなたが定義した場合/としてファンクタを再定義：

[免責事項ブラウザで書かれたすべてのコードをしてテストされていない、自己責任で使用します]

ファンシーイテレータを使用することで、カーネルコールの数を減らすこととは別に、大規模な単位行列の必要性がなくなり、合計と累乗演算を行うためにメモリフットプリントとメモリトランザクションの総数が削減されます。

出典

2013-01-09 03:43:47 talonmies

イテレータは実際には_fancy_です。私はcublasとthrustアプローチを比較しました。 'thrust :: reduce_by_key'はメモリ帯域幅を小さくする必要があるかもしれませんが、' cublasDgemv'に比べてまだ遅いです。何か案は？ – kangshiyin

私は、相対的なパフォーマンスがあなたが使用するGPUとタイプにかなり依存すると考えています。 32ビットタイプを使用する別のGPUでは、純粋なCUBLAS実装よりもパフォーマンスが向上しています。推力の開発者は、現在の推力を実施して以来、最先端技術の削減が少し前に進んだことを認めていますが、一般的に減速パターンのような木は常に効率が悪く、FMADのストリームこの場合。 – talonmies

また、 'thrust_for_each'ではなく' thrust :: transform'を試してみることをお勧めします。いくつかの場合（間違いなく）、私は 'for_each'より少し速いことがわかりました。しかし、おそらくパフォーマンスはそれほど変わらないでしょう。 – talonmies

CUDAの行列の列を最大のパフォーマンスで正規化する方法は？

答えて

関連する問題