2011-11-11 21 views
0

私はコードをcudaデバイス上で並列に動作させようとしていますが、gfxドライバはプログラム実行中にクラッシュし続けますが、このプログラム、他のcudaコードうまく動作します。そしてそれは間違ったアンサーを与えるが、私はそれがクラッシュのためだと思うだろう!CUDAプログラムがクラッシュしたドライバ

注:Quadro 2000M GFXカードで実行されています。

ここに私のパラレルバージョンのコードがあります。あなたが長い実行中のカーネルはWindowsが応答しなくなってきてからシステムを防ぐために、あなたのドライバを停止します持っている場合は

#include <stdio.h> 
#include <time.h> 
#include <math.h> 

#define N 16 
#define threads 512 
#define MaxBlocks 6500 

__global__ void cudaCalculateBestPath(int *M, int *bestToDiagPathSum, int *bestFromDiagPathSum, 
     unsigned int *bestToDiagPathCode, unsigned int *bestFromDiagPathCode) 
{ 
    int x = ((unsigned int)1 << (N-1)); 
    unsigned int currentPathCode = blockIdx.x * threads + threadIdx.x; 
    // This while is for if we are over the max amount of blocks 
    while(currentPathCode < x) 
    { 
     int test = ((unsigned int)1 << (N-1)); 
     if(currentPathCode >= test) 
      return; 

     unsigned int ui = currentPathCode; 

     int toDiagPathSum = M[0]; 
     int toDiagRow = 0; 
     int toDiagCol = 0; 

     int fromDiagPathSum = M[(N-1)*N+N-1]; 
     int fromDiagRow = N-1; 
     int fromDiagCol = N-1; 

     for (int i = 0; i < N-1; i++) 
     { 
      if (ui % 2 == 0) 
      { 
       toDiagCol++;     // horizontal move 
       fromDiagCol--; 
      } 
      else        
      { 
       toDiagRow++;     // vertical move 
       fromDiagRow--; 
      } 
      toDiagPathSum += M[toDiagRow*N+toDiagCol]; 
      fromDiagPathSum += M[fromDiagRow*N+fromDiagCol]; 
      ui = ui >> 1; 
     } 

     if (toDiagPathSum < bestToDiagPathSum[toDiagRow]) 
     { 
      bestToDiagPathSum[toDiagRow] = toDiagPathSum; 
      bestToDiagPathCode[toDiagRow] = currentPathCode; 
     } 

     if (fromDiagPathSum < bestFromDiagPathSum[fromDiagRow]) 
     { 
      bestFromDiagPathSum[fromDiagRow] = fromDiagPathSum; 
      bestFromDiagPathCode[fromDiagRow] = currentPathCode; 
     } 

     // Next run 
     currentPathCode = blockDim.x + gridDim.x; 
    } 
} 

int main() 
{ 
    clock_t start = clock(); 

    //--- create and initialize M matrix (including best path) 

    int M[N*N]; 
    for (int row = 0; row < N; row++) 
     for (int col = 0; col < N; col++) 
      M[row*N+col] = 2; 

#pragma region Test Path 
    M[ 0*N+0] = 1; 
    M[ 0*N+1] = 1; 
    M[ 0*N+2] = 1; 
    M[ 1*N+2] = 1; 
    M[ 1*N+3] = 1; 
    M[ 2*N+3] = 1; 
    M[ 3*N+3] = 1; 
    M[ 3*N+4] = 1; 
    M[ 3*N+5] = 1; 
    M[ 3*N+6] = 1; 
    M[ 4*N+6] = 1; 
    M[ 5*N+6] = 1; 
    M[ 6*N+6] = 1; 
    M[ 7*N+6] = 1; 
    M[ 8*N+6] = 1; 
    M[ 9*N+6] = 1; 
    M[10*N+6] = 1; 
    M[10*N+7] = 1; 
    M[10*N+8] = 1; 
    M[10*N+9] = 1; 
    M[11*N+9] = 1; 
    M[11*N+0] = 1; 
    M[12*N+0] = 1; 
    M[13*N+10] = 1; 
    M[13*N+11] = 1; 
    M[13*N+12] = 1; 
    M[14*N+12] = 1; 
    M[15*N+12] = 1; 
    M[15*N+13] = 1; 
    M[15*N+14] = 1; 
    M[15*N+15] = 1; 
#pragma endregion Test Path 

    //--- create and initialize bestToDiag and bestFromDiag arrays 

    int bestToDiagPathSum[N]; 
    int bestFromDiagPathSum[N]; 

    unsigned int bestToDiagPathCode[N]; 
    unsigned int bestFromDiagPathCode[N]; 

    int biggerThanMaxPathSum = 256*N + 1; 
    for (int i = 0; i < N; i++) 
    { 
     bestToDiagPathSum[i] = biggerThanMaxPathSum; 
     bestFromDiagPathSum[i] = biggerThanMaxPathSum; 
    } 

    //--- iterate through path codes, updating bestToDiag and bestFromDiag arrays 

    int x = ((unsigned int)1 << (N-1)); 

    // Trick for celin the total blocks 
    int TotalBlocks = (x+threads)/threads; 
    if(TotalBlocks > MaxBlocks) 
     TotalBlocks = MaxBlocks; 

    int *dev_M, *dev_bestToDiagPathSum, *dev_bestFromDiagPathSum; 
    unsigned int *dev_bestToDiagPathCode, *dev_bestFromDiagPathCode; 

    // allocate memory 
    cudaMalloc((void**)&dev_M, N*N*sizeof(int)); 
    cudaMalloc((void**)&dev_bestToDiagPathSum, N*sizeof(int)); 
    cudaMalloc((void**)&dev_bestFromDiagPathSum, N*sizeof(int)); 
    cudaMalloc((void**)&dev_bestToDiagPathCode, N*sizeof(int)); 
    cudaMalloc((void**)&dev_bestFromDiagPathCode, N*sizeof(int)); 

    // Copy memory to device 
    cudaMemcpy(dev_M, M, N*N*sizeof(int), cudaMemcpyHostToDevice); 
    cudaMemcpy(dev_bestToDiagPathSum, bestToDiagPathSum, N*sizeof(int), cudaMemcpyHostToDevice); 
    cudaMemcpy(dev_bestFromDiagPathSum, bestFromDiagPathSum, N*sizeof(int), cudaMemcpyHostToDevice); 
    cudaMemcpy(dev_bestToDiagPathCode, bestToDiagPathCode, N*sizeof(int), cudaMemcpyHostToDevice); 
    cudaMemcpy(dev_bestFromDiagPathCode, bestFromDiagPathCode, N*sizeof(int), cudaMemcpyHostToDevice); 

    // Run code on device 
    printf("Blocks: %d\n", TotalBlocks); 
    printf("Threads: %d\n\n", threads); 
    cudaCalculateBestPath<<<TotalBlocks,threads>>>(dev_M, dev_bestToDiagPathSum, dev_bestFromDiagPathSum, 
     dev_bestToDiagPathCode, dev_bestFromDiagPathCode); 

    // Insert code here to run while the GPU is running. 

    // Copy the mem back 
    cudaMemcpy(M, dev_M, N*N*sizeof(int), cudaMemcpyDeviceToHost); 
    cudaMemcpy(bestToDiagPathSum, dev_bestToDiagPathSum, N*sizeof(int), cudaMemcpyDeviceToHost); 
    cudaMemcpy(bestFromDiagPathSum, dev_bestFromDiagPathSum, N*sizeof(int), cudaMemcpyDeviceToHost); 
    cudaMemcpy(bestToDiagPathCode, dev_bestToDiagPathCode, N*sizeof(int), cudaMemcpyDeviceToHost); 
    cudaMemcpy(bestFromDiagPathCode, dev_bestFromDiagPathCode, N*sizeof(int), cudaMemcpyDeviceToHost); 

    int bestPathSum = biggerThanMaxPathSum; 
    unsigned int bestPathCodePrefix; 
    unsigned int bestPathCodeSuffix; 

    int tempSum; 

    for (int i = 0; i < N; i++) 
    { 
     tempSum = bestToDiagPathSum[i] + bestFromDiagPathSum[i] - M[i*N+(N-1-i)]; 
     if (tempSum < bestPathSum) 
     { 
      bestPathSum = tempSum; 
      bestPathCodePrefix = bestToDiagPathCode[i]; 
      bestPathCodeSuffix = bestFromDiagPathCode[i]; 
     } 
    } 

    //--- output best path sum and best path diagram 

    printf("Best Path Sum = %d\n\n",bestPathSum); 

    M[0] = -M[0]; 
    int toDiagRow = 0; 
    int toDiagCol = 0; 
    unsigned int ui = bestPathCodePrefix; 
    for (int i = 0; i < N-1; i++) 
    { 
     if (ui % 2 == 0) 
      toDiagCol++;     // horizontal move 
     else        
      toDiagRow++;     // vertical move 
     M[toDiagRow*N+toDiagCol] = -M[toDiagRow*N+toDiagCol]; 
     ui = ui >> 1; 
    } 

    M[(N-1)*N+N-1] = -M[(N-1)*N+N-1]; 
    int fromDiagRow = N-1; 
    int fromDiagCol = N-1; 
    ui = bestPathCodeSuffix; 
    for (int i = 0; i < N-2; i++) 
    { 
     if (ui % 2 == 0) 
      fromDiagCol--;     // horizontal move 
     else        
      fromDiagRow--;     // vertical move 
     M[fromDiagRow*N+fromDiagCol] = -M[fromDiagRow*N+fromDiagCol]; 
     ui = ui >> 1; 
    } 

    for (int row = N-1; row >= 0; row--) 
    { 
     for (int col = 0; col <= N-1; col++) 
      if (M[row*N+col] < 0) 
      { 
       printf("*"); 
       M[row*N+col] = -M[row*N+col]; 
      } 
      else 
       printf("%d",M[row*N+col]); 
     printf("\n"); 
    } 

    printf("\nTime elapsed: %f", ((double)clock() - start)/CLOCKS_PER_SEC); 

    int dummyReadForPause; 
    scanf_s("%d",&dummyReadForPause); 

    return 0; 
} 

シーケンシャルコード

#include <stdio.h> 
#include <time.h> 
#include <math.h> 

#define N 16 
#define threads 512 
#define MaxBlocks 6500 

int main() 
{ 
    clock_t start = clock(); 

    //--- create and initialize M matrix (including best path) 

    int M[N*N]; 
    for (int row = 0; row < N; row++) 
     for (int col = 0; col < N; col++) 
      M[row*N+col] = 2; 

#pragma region Test Path 
    M[ 0*N+0] = 1; 
    M[ 0*N+1] = 1; 
    M[ 0*N+2] = 1; 
    M[ 1*N+2] = 1; 
    M[ 1*N+3] = 1; 
    M[ 2*N+3] = 1; 
    M[ 3*N+3] = 1; 
    M[ 3*N+4] = 1; 
    M[ 3*N+5] = 1; 
    M[ 3*N+6] = 1; 
    M[ 4*N+6] = 1; 
    M[ 5*N+6] = 1; 
    M[ 6*N+6] = 1; 
    M[ 7*N+6] = 1; 
    M[ 8*N+6] = 1; 
    M[ 9*N+6] = 1; 
    M[10*N+6] = 1; 
    M[10*N+7] = 1; 
    M[10*N+8] = 1; 
    M[10*N+9] = 1; 
    M[11*N+9] = 1; 
    M[11*N+0] = 1; 
    M[12*N+0] = 1; 
    M[13*N+10] = 1; 
    M[13*N+11] = 1; 
    M[13*N+12] = 1; 
    M[14*N+12] = 1; 
    M[15*N+12] = 1; 
    M[15*N+13] = 1; 
    M[15*N+14] = 1; 
    M[15*N+15] = 1; 
#pragma endregion Test Path 

    //--- create and initialize bestToDiag and bestFromDiag arrays 

    int bestToDiagPathSum[N]; 
    int bestFromDiagPathSum[N]; 

    unsigned int bestToDiagPathCode[N]; 
    unsigned int bestFromDiagPathCode[N]; 

    int biggerThanMaxPathSum = 256*N + 1; 
    for (int i = 0; i < N; i++) 
    { 
     bestToDiagPathSum[i] = biggerThanMaxPathSum; 
     bestFromDiagPathSum[i] = biggerThanMaxPathSum; 
    } 

    //--- iterate through path codes, updating bestToDiag and bestFromDiag arrays 
    int toDiagPathSum, toDiagRow, toDiagCol; 
    int fromDiagPathSum, fromDiagRow, fromDiagCol; 
    unsigned int ui; 
    int x = ((unsigned int)1 << (N-1)); 

    for(unsigned int currentPathCode = 0; currentPathCode < x; currentPathCode++) 
    { 
     ui = currentPathCode; 

     toDiagPathSum = M[0]; 
     toDiagRow = 0; 
     toDiagCol = 0; 

     fromDiagPathSum = M[(N-1)*N+N-1]; 
     fromDiagRow = N-1; 
     fromDiagCol = N-1; 

     for (int i = 0; i < N-1; i++) 
     { 
      if (ui % 2 == 0) 
      { 
       toDiagCol++;     // horizontal move 
       fromDiagCol--; 
      } 
      else        
      { 
       toDiagRow++;     // vertical move 
       fromDiagRow--; 
      } 
      toDiagPathSum += M[toDiagRow*N+toDiagCol]; 
      fromDiagPathSum += M[fromDiagRow*N+fromDiagCol]; 
      ui = ui >> 1; 
     } 

     if (toDiagPathSum < bestToDiagPathSum[toDiagRow]) 
     { 
      bestToDiagPathSum[toDiagRow] = toDiagPathSum; 
      bestToDiagPathCode[toDiagRow] = currentPathCode; 
     } 

     if (fromDiagPathSum < bestFromDiagPathSum[fromDiagRow]) 
     { 
      bestFromDiagPathSum[fromDiagRow] = fromDiagPathSum; 
      bestFromDiagPathCode[fromDiagRow] = currentPathCode; 
     } 
    } 

    int bestPathSum = biggerThanMaxPathSum; 
    unsigned int bestPathCodePrefix; 
    unsigned int bestPathCodeSuffix; 

    int tempSum; 

    for (int i = 0; i < N; i++) 
    { 
     tempSum = bestToDiagPathSum[i] + bestFromDiagPathSum[i] - M[i*N+(N-1-i)]; 
     if (tempSum < bestPathSum) 
     { 
      bestPathSum = tempSum; 
      bestPathCodePrefix = bestToDiagPathCode[i]; 
      bestPathCodeSuffix = bestFromDiagPathCode[i]; 
     } 
    } 

    //--- output best path sum and best path diagram 

    printf("Best Path Sum = %d\n\n",bestPathSum); 

    M[0] = -M[0]; 
    toDiagRow = 0; 
    toDiagCol = 0; 
    ui = bestPathCodePrefix; 
    for (int i = 0; i < N-1; i++) 
    { 
     if (ui % 2 == 0) 
      toDiagCol++;     // horizontal move 
     else        
      toDiagRow++;     // vertical move 
     M[toDiagRow*N+toDiagCol] = -M[toDiagRow*N+toDiagCol]; 
     ui = ui >> 1; 
    } 

    M[(N-1)*N+N-1] = -M[(N-1)*N+N-1]; 
    fromDiagRow = N-1; 
    fromDiagCol = N-1; 
    ui = bestPathCodeSuffix; 
    for (int i = 0; i < N-2; i++) 
    { 
     if (ui % 2 == 0) 
      fromDiagCol--;     // horizontal move 
     else        
      fromDiagRow--;     // vertical move 
     M[fromDiagRow*N+fromDiagCol] = -M[fromDiagRow*N+fromDiagCol]; 
     ui = ui >> 1; 
    } 

    for (int row = N-1; row >= 0; row--) 
    { 
     for (int col = 0; col <= N-1; col++) 
      if (M[row*N+col] < 0) 
      { 
       printf("*"); 
       M[row*N+col] = -M[row*N+col]; 
      } 
      else 
       printf("%d",M[row*N+col]); 
     printf("\n"); 
    } 

    printf("\nTime elapsed: %f", ((double)clock() - start)/CLOCKS_PER_SEC); 

    int dummyReadForPause; 
    scanf_s("%d",&dummyReadForPause); 

    return 0; 
} 
+1

あなたのCUDAコードは非常に複雑です。各スレッドが別々のメモリ位置にアクセスしていることは明らかですか?並列コード内での分岐はお勧めしませんが、コード内のインデックス変数( 'ui')も変更しています。 –

+0

もし私がN数を6のような低い数に下げれば、それはうまくいけばいいけど、もしそれがクラッシュすれば、それはウォッチドッグの問題かもしれないと思う。 – Androme

答えて

関連する問題