共有メモリでのCUDA行列の転置

共有メモリを使用してGPUに行列転置関数を実装する必要があります。私はシンプルな方法でそれをやったSMとの試みもうまく動作する共有メモリなしでやった。しかし残念なことに、計算が正しくなく、私は理由を理解できません。完全な実例はhereとこの質問の最後にあります。共有メモリでのCUDA行列の転置

EDIT 1

私はさらに、私は間違った値を有する結果の最初のインデックスは、（二次元的にので、平坦化マトリックスの、matr[0][32]）指数32であることを知っています。

これ以上の情報が必要な場合は、喜んでそれらを提供します。

動作しない機能に類似している全体のコードからの短い抜粋を以下に示す：

__global__ void notSoNaivaTransKernel(float *matrB, float *matrA, const int width, 
    const int height, const int nreps) 
{ 
    __shared__ float tile[TILE_DIM][TILE_DIM + 1]; 
    int blockIdx_y = blockIdx.x; 
    int blockIdx_x = (blockIdx.x + blockIdx.y) % gridDim.x; 
    int xIndex = blockIdx_x * TILE_DIM + threadIdx.x; 
    int yIndex = blockIdx_y * TILE_DIM + threadIdx.y; 
    int index_in = xIndex + (yIndex)* width; 

    xIndex = blockIdx_y * TILE_DIM + threadIdx.x; 
    yIndex = blockIdx_x * TILE_DIM + threadIdx.y; 
    int index_out = xIndex + (yIndex)* height; 

    int r, i; 
#pragma unroll 
    for (r = 0; r < nreps; r++) 
    { 
#pragma unroll 
     for (i = 0; i < TILE_DIM; i += BLOCK_ROWS) 
      tile[threadIdx.y + i][threadIdx.x] = matrA[index_in + i * width]; 

     __syncthreads(); 

#pragma unroll 
     for (i = 0; i < TILE_DIM; i += BLOCK_ROWS) 
      if (index_in + i * width < width * height) 
       matrB[index_out + i * height] = tile[threadIdx.x][threadIdx.y + i]; 
    } 
}

出力は次のようになり：ここ

Avg. CPU Transpose Time: 0.106048 ms, Bandwidth: 3.771873 GB/s 

Avg. GPU Naive Trans Time: 0.009871 ms, bandwidth: 40.520836 GB/s 
    Correct: 50000, Wrong: 0 

Avg. GPU Trans with SM Time: 0.007598 ms, bandwidth: 52.643482 GB/s 
    Correct: 12352, Wrong: 37648

フル実施例です。私はそれから不必要なコードのほとんどを取り除いたので、詰め物は少なくなりました：

#include "cuda_runtime.h" 
#include "device_functions.h" 
#include "device_launch_parameters.h" 

#include <chrono> 
#include <time.h> 
#include <stdio.h> 
#include <stdlib.h> 

#define TILE_DIM 32 
#define BLOCK_ROWS 8 
#define BLOCK_COLS 32 

cudaError_t matrMagicCuda(float *matrB, float *matrA, const int width, const int height, const int nreps, const int operation); 
void cpuMatrTrans(float *matrB, float *matrA, const int width, const int height, const int nreps); 
__global__ void naiveTransKernel(float *matrB, float *matrA, const int width, const int height, const int nreps); 
__global__ void notSoNaivaTransKernel(float *matrB, float *matrA, const int width, const int height, const int nreps); 

int main() 
{ 
    int i, width, height, nreps, size, wrong, correct; 
    double cpuTime, cpuBandwidth; 
    cudaError_t cudaStatus; 

    float *matrA, *matrATC, *matrATG, *matrAC; 

    srand(time(NULL)); 

    nreps = 10000; 
    width = 500; 
    height = 100; 
    size = width * height; 

    matrA = (float*)malloc(size * sizeof(float)); // matrix A 
    matrAC = (float*)malloc(size * sizeof(float)); // matrix A copied 
    matrATC = (float*)malloc(size * sizeof(float)); // matrix A transposed by CPU 
    matrATG = (float*)malloc(size * sizeof(float)); // matrix A transposed by GPU 

    for (i = 0; i < size; i++) 
    { 
     matrA[i] = (float)i; 
    } 

    auto start = std::chrono::high_resolution_clock::now(); 

    //CPU Transpose 
    cpuMatrTrans(matrATC, matrA, width, height, nreps); 

    auto end = std::chrono::high_resolution_clock::now(); 

    std::chrono::duration<double> diff = end - start; 
    cpuTime = (diff.count() * 1000)/nreps; 
    cpuBandwidth = (sizeof(float) * size * 2)/(cpuTime * 1000000);//scaling from ms to s and B to GB doen implicitly, shortened in fraction, times two for read and write 
    printf("Avg. CPU Transpose Time: %f ms, Bandwidth: %f GB/s\n\n", cpuTime, cpuBandwidth); 

    correct = 0; 
    wrong = 0; 

    //Naive transpose 
    matrMagicCuda(matrATG, matrA, width, height, nreps, 1); 

    //Check if calc was correct 
    for (i = 0; i < size; i++) 
    { 
     if (matrATC[i] != matrATG[i]) 
     { 
      /*printf("ERROR - %d - ATC:%f - ATG:%f\n\n", i, matrATC[i], matrATG[i]); 
      return;*/ 
      wrong++; 
     } 
     else 
     { 
      correct++; 
     } 
    } 

    printf("\tCorrect: %d, Wrong: %d\n\n", correct, wrong); 
    correct = 0; 
    wrong = 0; 

    //Transpose with shared memory 
    matrMagicCuda(matrATG, matrA, width, height, nreps, 2); 

    //Check if calc was correct 
    for (i = 0; i < size; i++) 
    { 
     if (matrATC[i] != matrATG[i]) 
     { 
      /*printf("ERROR - %d - ATC:%f - ATG:%f\n\n", i, matrATC[i], matrATG[i]); 
      return;*/ 
      wrong++; 
     } 
     else 
     { 
      correct++; 
     } 
    } 

    //printf("\tTranspose with SM on GPU was executed correctly.\n\n"); 
    printf("\tCorrect: %d, Wrong: %d\n\n", correct, wrong); 
    correct = 0; 
    wrong = 0; 

    // cudaDeviceReset must be called before exiting in order for profiling and 
    // tracing tools such as Nsight and Visual Profiler to show complete traces. 
    cudaStatus = cudaDeviceReset(); 
    if (cudaStatus != cudaSuccess) 
    { 
     fprintf(stderr, "cudaDeviceReset failed!\n"); 
     return 1; 
    } 

    return 0; 
} 

cudaError_t matrMagicCuda(float *matrB, float *matrA, const int width, const int height, const int nreps, const int operation) 
{ 
    float elapsed = 0; 
    float *dev_matrA = 0; 
    float *dev_matrB = 0; 
    cudaError_t cudaStatus; 
    dim3 dim_grid, dim_block; 
    double gpuBandwidth; 

    int size = width * height; 

    dim_block.x = TILE_DIM; 
    dim_block.y = BLOCK_ROWS; 
    dim_block.z = 1; 

    dim_grid.x = (width + TILE_DIM - 1)/TILE_DIM; 
    dim_grid.y = (height + TILE_DIM - 1)/TILE_DIM; 
    dim_grid.z = 1; 

    // Choose which GPU to run on, change this on a multi-GPU system. 
    cudaStatus = cudaSetDevice(0); 
    if (cudaStatus != cudaSuccess) 
    { 
     fprintf(stderr, "cudaSetDevice failed! Do you have a CUDA-capable GPU installed?"); 
     goto Error; 
    } 

    // Allocate GPU buffers for three matrix 
    cudaStatus = cudaMalloc((void**)&dev_matrA, size * sizeof(float)); 
    if (cudaStatus != cudaSuccess) 
    { 
     fprintf(stderr, "cudaMalloc failed!"); 
     goto Error; 
    } 

    cudaStatus = cudaMalloc((void**)&dev_matrB, size * sizeof(float)); 
    if (cudaStatus != cudaSuccess) 
    { 
     fprintf(stderr, "cudaMalloc failed!"); 
     goto Error; 
    } 

    // Copy input matrix from host memory to GPU buffers. 
    cudaStatus = cudaMemcpy(dev_matrA, matrA, size * sizeof(float), cudaMemcpyHostToDevice); 
    if (cudaStatus != cudaSuccess) 
    { 
     fprintf(stderr, "cudaMemcpy failed!"); 
     goto Error; 
    } 

    cudaEvent_t start, stop; 
    cudaEventCreate(&start); 
    cudaEventCreate(&stop); 

    switch (operation) 
    { 
     case(1): 
     { 
      cudaEventRecord(start); 
      // Launch a kernel on the GPU with one thread for each element. 
      naiveTransKernel << <dim_grid, dim_block >> >(dev_matrB, dev_matrA, width, height, nreps); 

      cudaEventRecord(stop); 
      cudaEventSynchronize(stop); 

      cudaEventElapsedTime(&elapsed, start, stop); 
      cudaEventDestroy(start); 
      cudaEventDestroy(stop); 

      elapsed /= nreps; 

      gpuBandwidth = (sizeof(float) * size * 2)/(elapsed * 1000000);//scaling from ms to s and B to GB doen implicitly, shortened in fraction, times two for read and write 
      printf("Avg. GPU Naive Trans Time: %f ms, bandwidth: %f GB/s\n", elapsed, gpuBandwidth); 

      break; 
     } 

     case(2): 
     { 
      cudaEventRecord(start); 
      // Launch a kernel on the GPU with one thread for each element. 
      notSoNaivaTransKernel << <dim_grid, dim_block >> >(dev_matrB, dev_matrA, width, height, nreps); 

      cudaEventRecord(stop); 
      cudaEventSynchronize(stop); 

      cudaEventElapsedTime(&elapsed, start, stop); 
      cudaEventDestroy(start); 
      cudaEventDestroy(stop); 

      elapsed /= nreps; 

      gpuBandwidth = (sizeof(float) * size * 2)/(elapsed * 1000000);//scaling from ms to s and B to GB doen implicitly, shortened in fraction, times two for read and write 
      printf("Avg. GPU Trans with SM Time: %f ms, bandwidth: %f GB/s\n", elapsed, gpuBandwidth); 

      break; 
     } 

    default: 
     printf("No matching opcode was found.\n"); 
    } 

    // Check for any errors launching the kernel 
    cudaStatus = cudaGetLastError(); 
    if (cudaStatus != cudaSuccess) 
    { 
     fprintf(stderr, "Kernel launch failed: %s\n", cudaGetErrorString(cudaStatus)); 
     goto Error; 
    } 

    // cudaDeviceSynchronize waits for the kernel to finish, and returns 
    // any errors encountered during the launch. 
    cudaStatus = cudaDeviceSynchronize(); 
    if (cudaStatus != cudaSuccess) 
    { 
     fprintf(stderr, "cudaDeviceSynchronize returned error code %d after launching Kernel!\n", cudaStatus); 
     goto Error; 
    } 

    // Copy output matrix from GPU buffer to host memory. 
    cudaStatus = cudaMemcpy(matrB, dev_matrB, size * sizeof(float), cudaMemcpyDeviceToHost); 
    if (cudaStatus != cudaSuccess) 
    { 
     fprintf(stderr, "cudaMemcpy failed!"); 
     goto Error; 
    } 

Error: 
    cudaFree(dev_matrB); 
    cudaFree(dev_matrA); 

    return cudaStatus; 
} 

void cpuMatrTrans(float *matrB, float *matrA, const int width, const int height, const int nreps) 
{ 
    int i, j, r; 

#pragma unroll 
    for (r = 0; r < nreps; r++) 
#pragma unroll 
     for (i = 0; i < height; i++) 
#pragma unroll 
      for (j = 0; j < width; j++) 
       matrB[j * height + i] = matrA[i * width + j]; 
} 

__global__ void naiveTransKernel(float *matrB, float *matrA, const int width, const int height, const int nreps) 
{ 
    int i, r; 
    int row = blockIdx.x * TILE_DIM + threadIdx.x; 
    int col = blockIdx.y * TILE_DIM + threadIdx.y; 
    int index_in = row + width * col; 
    int index_out = col + height * row; 

#pragma unroll 
    for (r = 0; r < nreps; r++) 
#pragma unroll 
     for (i = 0; i < TILE_DIM; i += BLOCK_ROWS) 
      if (index_in + i * width < width * height) 
       matrB[index_out + i] = matrA[index_in + i * width]; 
} 

__global__ void notSoNaivaTransKernel(float *matrB, float *matrA, const int width, const int height, const int nreps) 
{ 
    __shared__ float tile[TILE_DIM][TILE_DIM + 1]; 
    int blockIdx_y = blockIdx.x; 
    int blockIdx_x = (blockIdx.x + blockIdx.y) % gridDim.x; 
    int xIndex = blockIdx_x * TILE_DIM + threadIdx.x; 
    int yIndex = blockIdx_y * TILE_DIM + threadIdx.y; 
    int index_in = xIndex + (yIndex)* width; 

    xIndex = blockIdx_y * TILE_DIM + threadIdx.x; 
    yIndex = blockIdx_x * TILE_DIM + threadIdx.y; 
    int index_out = xIndex + (yIndex)* height; 

    int r, i; 
#pragma unroll 
    for (r = 0; r < nreps; r++) 
    { 
#pragma unroll 
     for (i = 0; i < TILE_DIM; i += BLOCK_ROWS) 
      tile[threadIdx.y + i][threadIdx.x] = matrA[index_in + i * width]; 

     __syncthreads(); 

#pragma unroll 
     for (i = 0; i < TILE_DIM; i += BLOCK_ROWS) 
      if (index_in + i * width < width * height) 
       matrB[index_out + i * height] = tile[threadIdx.x][threadIdx.y + i]; 
    } 
}

出典

2016-11-22 JRsz

明らかに、非正方行列転置を実行できるようにしたいですか？または正方行列転置のみ？あなたはあなたの質問に外部リンクではなく[mcve] *を含めるべきです。 –

はい、私は非正方行列にも転置をしたいと思います。わかりやすくするために、私は外に置いたが、あなたが望むように、私はそれを質問にも追加する。 – JRsz

コードが範囲外アクセスを行っています。 CUDAコードに問題がある場合はいつでも[適切なcudaエラーチェック]（http://stackoverflow.com/questions/14038589）を使用して、 'cuda-memcheck'でコードを実行してください。あなたがエラー出力を理解していなくても、あなたを助けようとしている他の人にとっては役に立ちます。また、これはおそらく学習の練習であると理解していますが、深刻な作業のためには、cublas [geam]（http://docs.nvidia.com/cuda/cublas/index.html#cublas- lt-t-gt-geam）を含む。 –

このコードにはいくつかの問題がありました。私はそれらのすべてをカバーすることができるか分からない。

おそらく、最も重要な問題は、適切な2Dスレッドチェックが不足していること（および理解が不足していること）でした。アルゴリズムでは、問題のサイズよりも両方の次元が大きいスレッドのグリッドが作成されます。これにより、両方の次元で論理的なスレッドがマトリックスの次元のの外側に作成されます。

次のような2Dスレッドチェックを作成しようとしました：

 if (index_in + i * width < width * height)

これは動作しません。私が3x3の行列と4x4のスレッドのグリッドを持っているとします。（3,0）にあるスレッドは、あなたの行列に対しては範囲外ですが、2Dスレッドのチェックに合格します。

この場合、適切なスレッドチェックでは、製品としてではなく、各ディメンションを個別にテストする必要があります。

この論理エラーは、あなたの「ナイーブな」トランスポーズカーネルにも存在し、cuda-memcheckでコードを実行すると確認できます。それは正しく動作するように見えても、そのカーネルの範囲外のアクセスエラーを示します。

他にも様々な問題がありました。これらのほとんどは、共有メモリーカーネルでの索引作成と関連がありました。 shared memory transposeのために必要なインデックス作成操作を理解していることは私には分かりません。二つの別々のインデックスは、我々はこのような場合に行う必要があることを転置あります

は、ブロック（タイル）をトランスポーズは/
がスレッドインデックスの

移調が読め行われるスレッドインデックスを転置インデックス共有メモリを書き込む。これは、共有メモリの読み書きにthreadIdx.xとthreadIdx.yの使用を逆転させて正しく説明しました。しかし、私が知る限りでは、ブロックインデックスの逆転のためのインデックス生成（この逆転は、のグローバルメモリメモリへの読み書き中に使用されました）が壊れています。それが整理されるもう一つの大きな問題でした。

次のコードでは、これらを持っており、他のいくつかの問題が修正され、私のために正常に動作しているように見えます：

$ cat t33.cu 
#include "cuda_runtime.h" 
#include "device_functions.h" 
#include "device_launch_parameters.h" 

#include <chrono> 
#include <time.h> 
#include <stdio.h> 
#include <stdlib.h> 

#define TILE_DIM 32 
#define BLOCK_ROWS 8 
#define BLOCK_COLS 32 

cudaError_t matrMagicCuda(float *matrB, float *matrA, const int width, const int height, const int nreps, const int operation); 
void cpuMatrTrans(float *matrB, float *matrA, const int width, const int height, const int nreps); 
__global__ void naiveTransKernel(float *matrB, float *matrA, const int width, const int height, const int nreps); 
__global__ void notSoNaivaTransKernel(float *matrB, float *matrA, const int width, const int height, const int nreps); 

int main() 
{ 
    int i, width, height, nreps, size, wrong, correct; 
    double cpuTime, cpuBandwidth; 
    cudaError_t cudaStatus; 

    float *matrA, *matrATC, *matrATG, *matrAC; 

    srand(time(NULL)); 

    nreps = 10000; 
    width = 500; 
    height = 100; 


    size = width * height; 

    matrA = (float*)malloc(size * sizeof(float)); // matrix A 
    matrAC = (float*)malloc(size * sizeof(float)); // matrix A copied 
    matrATC = (float*)malloc(size * sizeof(float)); // matrix A transposed by CPU 
    matrATG = (float*)malloc(size * sizeof(float)); // matrix A transposed by GPU 

    for (i = 0; i < size; i++) 
    { 
     matrA[i] = (float)i; 
    } 

    auto start = std::chrono::high_resolution_clock::now(); 

    //CPU Transpose 
    cpuMatrTrans(matrATC, matrA, width, height, nreps); 

    auto end = std::chrono::high_resolution_clock::now(); 

    std::chrono::duration<double> diff = end - start; 
    cpuTime = (diff.count() * 1000)/nreps; 
    cpuBandwidth = (sizeof(float) * size * 2)/(cpuTime * 1000000);//scaling from ms to s and B to GB doen implicitly, shortened in fraction, times two for read and write 
    printf("Avg. CPU Transpose Time: %f ms, Bandwidth: %f GB/s\n\n", cpuTime, cpuBandwidth); 

    correct = 0; 
    wrong = 0; 

    //Naive transpose 
    memset(matrATG, 0, size*sizeof(float)); 
    matrMagicCuda(matrATG, matrA, width, height, nreps, 1); 

    //Check if calc was correct 
    for (i = 0; i < size; i++) 
    { 
     if (matrATC[i] != matrATG[i]) 
     { 
      /*printf("ERROR - %d - ATC:%f - ATG:%f\n\n", i, matrATC[i], matrATG[i]); 
      return;*/ 
      wrong++; 
     } 
     else 
     { 
      correct++; 
     } 
    } 

    printf("\tCorrect: %d, Wrong: %d\n\n", correct, wrong); 
    correct = 0; 
    wrong = 0; 

    //Transpose with shared memory 
    memset(matrATG, 0, size*sizeof(float)); 
    matrMagicCuda(matrATG, matrA, width, height, nreps, 2); 

    //Check if calc was correct 
    for (i = 0; i < size; i++) 
    { 
     if (matrATC[i] != matrATG[i]) 
     { 
      /*printf("ERROR - %d - ATC:%f - ATG:%f\n\n", i, matrATC[i], matrATG[i]); 
      return;*/ 
      wrong++; 
     } 
     else 
     { 
      correct++; 
     } 
    } 

    //printf("\tTranspose with SM on GPU was executed correctly.\n\n"); 
    printf("\tCorrect: %d, Wrong: %d\n\n", correct, wrong); 
    correct = 0; 
    wrong = 0; 

    // cudaDeviceReset must be called before exiting in order for profiling and 
    // tracing tools such as Nsight and Visual Profiler to show complete traces. 
    cudaStatus = cudaDeviceReset(); 
    if (cudaStatus != cudaSuccess) 
    { 
     fprintf(stderr, "cudaDeviceReset failed!\n"); 
     return 1; 
    } 

    return 0; 
} 

cudaError_t matrMagicCuda(float *matrB, float *matrA, const int width, const int height, const int nreps, const int operation) 
{ 
    float elapsed = 0; 
    float *dev_matrA = 0; 
    float *dev_matrB = 0; 
    cudaError_t cudaStatus; 
    dim3 dim_grid, dim_block; 
    double gpuBandwidth; 

    int size = width * height; 

    dim_block.x = TILE_DIM; 
    dim_block.y = BLOCK_ROWS; 
    dim_block.z = 1; 

    dim_grid.x = (width + TILE_DIM - 1)/TILE_DIM; 
    dim_grid.y = (height + TILE_DIM - 1)/TILE_DIM; 
    dim_grid.z = 1; 

    // Choose which GPU to run on, change this on a multi-GPU system. 
    cudaStatus = cudaSetDevice(0); 
    if (cudaStatus != cudaSuccess) 
    { 
     fprintf(stderr, "cudaSetDevice failed! Do you have a CUDA-capable GPU installed?"); 
     goto Error; 
    } 

    // Allocate GPU buffers for three matrix 
    cudaStatus = cudaMalloc((void**)&dev_matrA, size * sizeof(float)); 
    if (cudaStatus != cudaSuccess) 
    { 
     fprintf(stderr, "cudaMalloc failed!"); 
     goto Error; 
    } 

    cudaStatus = cudaMalloc((void**)&dev_matrB, size * sizeof(float)); 
    if (cudaStatus != cudaSuccess) 
    { 
     fprintf(stderr, "cudaMalloc failed!"); 
     goto Error; 
    } 

    // Copy input matrix from host memory to GPU buffers. 
    cudaStatus = cudaMemcpy(dev_matrA, matrA, size * sizeof(float), cudaMemcpyHostToDevice); 
    if (cudaStatus != cudaSuccess) 
    { 
     fprintf(stderr, "cudaMemcpy failed!"); 
     goto Error; 
    } 
    cudaMemset(dev_matrB, 0, size * sizeof(float)); 
    cudaEvent_t start, stop; 
    cudaEventCreate(&start); 
    cudaEventCreate(&stop); 

    switch (operation) 
    { 
     case(1): 
     { 
      cudaEventRecord(start); 
      // Launch a kernel on the GPU with one thread for each element. 
      naiveTransKernel << <dim_grid, dim_block >> >(dev_matrB, dev_matrA, width, height, nreps); 

      cudaEventRecord(stop); 
      cudaEventSynchronize(stop); 

      cudaEventElapsedTime(&elapsed, start, stop); 
      cudaEventDestroy(start); 
      cudaEventDestroy(stop); 

      elapsed /= nreps; 

      gpuBandwidth = (sizeof(float) * size * 2)/(elapsed * 1000000);//scaling from ms to s and B to GB doen implicitly, shortened in fraction, times two for read and write 
      printf("Avg. GPU Naive Trans Time: %f ms, bandwidth: %f GB/s\n", elapsed, gpuBandwidth); 

      break; 
     } 

     case(2): 
     { 
      cudaEventRecord(start); 
      // Launch a kernel on the GPU with one thread for each element. 
      notSoNaivaTransKernel << <dim_grid, dim_block >> >(dev_matrB, dev_matrA, width, height, nreps); 

      cudaEventRecord(stop); 
      cudaEventSynchronize(stop); 

      cudaEventElapsedTime(&elapsed, start, stop); 
      cudaEventDestroy(start); 
      cudaEventDestroy(stop); 

      elapsed /= nreps; 

      gpuBandwidth = (sizeof(float) * size * 2)/(elapsed * 1000000);//scaling from ms to s and B to GB doen implicitly, shortened in fraction, times two for read and write 
      printf("Avg. GPU Trans with SM Time: %f ms, bandwidth: %f GB/s\n", elapsed, gpuBandwidth); 

      break; 
     } 

    default: 
     printf("No matching opcode was found.\n"); 
    } 

    // Check for any errors launching the kernel 
    cudaStatus = cudaGetLastError(); 
    if (cudaStatus != cudaSuccess) 
    { 
     fprintf(stderr, "Kernel launch failed: %s\n", cudaGetErrorString(cudaStatus)); 
     goto Error; 
    } 

    // cudaDeviceSynchronize waits for the kernel to finish, and returns 
    // any errors encountered during the launch. 
    cudaStatus = cudaDeviceSynchronize(); 
    if (cudaStatus != cudaSuccess) 
    { 
     fprintf(stderr, "cudaDeviceSynchronize returned error code %d after launching Kernel!\n", cudaStatus); 
     goto Error; 
    } 

    // Copy output matrix from GPU buffer to host memory. 
    cudaStatus = cudaMemcpy(matrB, dev_matrB, size * sizeof(float), cudaMemcpyDeviceToHost); 
    if (cudaStatus != cudaSuccess) 
    { 
     fprintf(stderr, "cudaMemcpy failed!"); 
     goto Error; 
    } 

Error: 
    cudaFree(dev_matrB); 
    cudaFree(dev_matrA); 

    return cudaStatus; 
} 

void cpuMatrTrans(float *matrB, float *matrA, const int width, const int height, const int nreps) 
{ 
    int i, j, r; 

#pragma unroll 
    for (r = 0; r < nreps; r++) 
#pragma unroll 
     for (i = 0; i < height; i++) 
#pragma unroll 
      for (j = 0; j < width; j++) 
       matrB[j * height + i] = matrA[i * width + j]; 
} 

__global__ void naiveTransKernel(float *matrB, float *matrA, const int width, const int height, const int nreps) 
{ 
    int i, r; 
    int col = blockIdx.x * TILE_DIM + threadIdx.x; 
    int row = blockIdx.y * TILE_DIM + threadIdx.y; 
    int index_in = col + width * row; 
    int index_out = row + height * col; 

#pragma unroll 
    for (r = 0; r < nreps; r++) 
#pragma unroll 
     for (i = 0; i < TILE_DIM; i += BLOCK_ROWS) 
      if ((row+i<height) && (col < width)) 
       matrB[index_out + i] = matrA[index_in + i * width]; 
} 

__global__ void notSoNaivaTransKernel(float *matrB, float *matrA, const int width, const int height, const int nreps) 
{ 
    __shared__ float tile[TILE_DIM][TILE_DIM + 1]; 
    int ciIndex = blockIdx.x * TILE_DIM + threadIdx.x; 
    int riIndex = blockIdx.y * TILE_DIM + threadIdx.y; 
    int coIndex = blockIdx.y * TILE_DIM + threadIdx.x; 
    int roIndex = blockIdx.x * TILE_DIM + threadIdx.y; 
    int index_in = ciIndex + (riIndex)* width; 
    int index_out = coIndex + (roIndex)* height; 


    int r, i; 
#pragma unroll 
    for (r = 0; r < nreps; r++) 
    { 
#pragma unroll 
     for (i = 0; i < TILE_DIM; i += BLOCK_ROWS) 
      if ((ciIndex<width) && (riIndex+i < height)) 
       tile[threadIdx.y + i][threadIdx.x] = matrA[index_in + i * width]; 
     __syncthreads(); 

#pragma unroll 
     for (i = 0; i < TILE_DIM; i += BLOCK_ROWS) 
      if ((coIndex<height) && (roIndex+i < width)) 
       matrB[index_out + i*height] = tile[threadIdx.x][threadIdx.y + i]; 
     __syncthreads(); 
    } 
} 
$ nvcc -std=c++11 -arch=sm_61 -o t33 t33.cu 
t33.cu(25): warning: variable "matrAC" was set but never used 

t33.cu(25): warning: variable "matrAC" was set but never used 

$ cuda-memcheck ./t33 
========= CUDA-MEMCHECK 
Avg. CPU Transpose Time: 0.143087 ms, Bandwidth: 2.795509 GB/s 

Avg. GPU Naive Trans Time: 0.028587 ms, bandwidth: 13.992195 GB/s 
     Correct: 50000, Wrong: 0 

Avg. GPU Trans with SM Time: 0.040328 ms, bandwidth: 9.918678 GB/s 
     Correct: 50000, Wrong: 0 

========= ERROR SUMMARY: 0 errors 
$ ./t33 
Avg. CPU Transpose Time: 0.140469 ms, Bandwidth: 2.847594 GB/s 

Avg. GPU Naive Trans Time: 0.003828 ms, bandwidth: 104.505440 GB/s 
     Correct: 50000, Wrong: 0 

Avg. GPU Trans with SM Time: 0.000715 ms, bandwidth: 559.206604 GB/s 
     Correct: 50000, Wrong: 0 

$

注：コードは、帯域幅を測定しようとします。ただし、ここで測定された帯域幅はキャッシュ帯域幅の影響を受けることに注意してください。ここでは、ほとんどのGPUのL2キャッシュに収まるように、マトリクスサイズ（500x100 = 200Kbyteずつの入力と出力）を簡単に小さくできます。この事実と、同じトランスポーズを複数回実行しているという事実（nreps）は、多くの作業がL2キャッシュから直接操作されていることを意味します。したがって、上記の「最適化された」ケースでは、GPUの使用可能なメモリ帯域幅を大幅に超える報告された帯域幅番号が表示されます（この場合はPascal Titan Xとなるため、約340GB/sのメインメモリ帯域幅が利用できます）。これは、この測定には、メインメモリの帯域幅の少なくとも2倍の帯域幅を持つL2キャッシュのメリットが含まれているためです。この効果は、大幅に大きなマトリックスサイズを使用したり、またはnrepsを1に減らすことで排除できます。

出典

2016-11-23 03:58:48

これは講義のためのものであり、私は非常に新しく、すでに索引付けに精通していないことが分かっています。あなたのソリューションは機能し、私はあなたの助けに非常に感謝しています。私はそれのすべての行を理解しようとするので、将来私はこれらの間違いをもう一度しません。 sidenoteでは、私のアルゴリズムは講師のものです：/正確なインデックス、ブロック、グリッドのレイアウトを理解するのに役立つ参考資料がありますか？あなたが提供したリンクがこれを忘れると、これは忘れてしまいます。私は間違いなくそれを読むでしょう！そして、あなたがそこに持っているいいBTUのGPU！ Matrix 5000x1000は私のドライバをクラッシュさせます:( – JRsz

私が指し示すことができる最良の参考文献は私の答えで既にリンクされているものですが、プレゼンテーションを簡単にするために、正方形の行列を持つ場合と、あなたの提示されたアルゴリズムの多くの点は、BLOCK_ROWSリテラルの使用に似ているようですが、各スレッドがいくつかの転置を実行する特定のカーネルループメソッド –

ドライバのクラッシュに関しては、Windowsプラットフォームで作業していることが明らかです。WDDMモードのWindows GPUの場合（おそらくあなたのテスト）、GPUカーネルの実行はWDDMのTDRウォッチドッグで約2秒に制限されています。これを回避するには（Googleの「WDDM TDR cuda」）あなたのドライバーがクラッシュする可能性が最も高いです。より大きな行列の転置と多数の繰返しは、カーネルに2秒以上かかることになります（それは純粋なカーネルかもしれません）。だから、あなたは 'nreps'を減らしてこれを回避しようとすることもできます。 –

共有メモリでのCUDA行列の転置

答えて

関連する問題