CUDA：idxが行列を正しくインデックスしない

-3

CUDAで次のカーネルを書いています。以下のコードは正しく動作します。

// Copies the p_t1 field of every element of `ingoing` into `outgoing`.
//   ingoing  - source array of N nodes
//   outgoing - destination array of N nodes
//   N        - number of elements in both arrays
// Only threads whose global index satisfies 0 < idx < N do any work; the
// thread with idx == 0 executes nothing.
__global__ void pagerank(Node *ingoing, Node *outgoing, int N) { 
     int j; 
     // Global index of this thread in a 1-D launch.
     int idx = threadIdx.x + blockIdx.x * blockDim.x; 
     // The lower bound is strict, so idx == 0 fails this test.
     if ((idx > 0) && (idx < N)){ 
      // Each thread that passes the guard walks the whole array, so all N
      // elements (including element 0) are copied by every such thread.
      for(j=0;j<N;j++){ 
      outgoing[j].p_t1=ingoing[j].p_t1; 
      } 
      //outgoing[idx].p_t1=ingoing[idx].p_t1; 

     } 
    }

何が問題なのですか？ idxが行列を正しく索引付けしないのはなぜですか？

コード全体を以下に示します。理解するのはそれほど簡単ではないかもしれません。問題は、カーネル内で次のように書いた場合に起こります。

// Per-thread variant: the statement writes only the element at index idx.
outgoing[idx].p_t1=ingoing[idx].p_t1;

この場合、main関数の最後でoutgoing[idx].p_t1フィールドを出力すると0が表示されます。一方、次のように書いた場合は正しい値が表示されます。

// Loop variant: copies p_t1 for every j in [0, N), independent of idx.
for(j=0;j<N;j++){ 
    outgoing[j].p_t1=ingoing[j].p_t1; 
}

何が間違っているのでしょうか？ 「main関数で0が出力される」とのことですが、実際にはすべてのエントリではなく、インデックス0だけを見ているのではないでしょうか。最初のバージョンでは、インデックス0はコードによって処理されません。

/******************** Includes - Defines ****************/
#include "pagerank_serial.h"
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <math.h>
#include <assert.h>
#include <string.h>
#include <sys/time.h>
#include <fcntl.h>
#include <cuda.h>
#include "string.h"

/******************** Defines ****************/
// Number of nodes
int N;
// Convergence threshold and algorithm's parameter d
double threshold, d;
// Table of node's data
Node *Nodes;

/*
 * Copies the p_t1 field of every element of `ingoing` into `outgoing`.
 *
 *   ingoing  - device array of N source nodes
 *   outgoing - device array of N destination nodes
 *   N        - number of elements in both arrays (shadows the global N)
 *
 * Expects a 1-D launch. Only threads with 0 < idx < N do any work.
 *
 * NOTE(review): the body is kept exactly as posted because it is the subject
 * of the question. The strict lower bound means the thread with idx == 0 does
 * nothing, and every other in-range thread copies the whole array.
 */
__global__ void pagerank(Node *ingoing, Node *outgoing, int N)
{
    int j;
    int idx = threadIdx.x + blockIdx.x * blockDim.x;
    if ((idx > 0) && (idx < N)) {
        for (j = 0; j < N; j++) {
            outgoing[j].p_t1 = ingoing[j].p_t1;
        }
        //outgoing[idx].p_t1=ingoing[idx].p_t1;
    }
}

/***** Read graph connections from txt file *****/
/*
 * Reads "from<TAB>to" integer pairs from `filename` and appends each `to`
 * index to Nodes[from].To_id, incrementing Nodes[from].con_size.
 * Exits the program if the file cannot be opened. Reading stops at the first
 * line that does not parse as two integers.
 */
void Read_from_txt_file(char* filename)
{
    FILE *fid;
    int from_idx, to_idx;
    int temp_size;

    fid = fopen(filename, "r");
    if (fid == NULL) {
        printf("Error opening data file\n");
        // Continuing would pass a NULL stream to fscanf.
        exit(EXIT_FAILURE);
    }

    // Loop on the number of converted fields. Testing feof() first, or
    // treating fscanf's EOF (-1) as "true", processes stale values at
    // end-of-file and spins forever on a malformed line.
    while (fscanf(fid, "%d\t%d\n", &from_idx, &to_idx) == 2) {
        if (from_idx < 0 || from_idx >= N) {
            fprintf(stderr, "Skipping edge with out-of-range source %d\n", from_idx);
            continue;
        }
        Nodes[from_idx].con_size++;
        temp_size = Nodes[from_idx].con_size;
        //Nodes[from_idx].To_id =(int*) realloc(Nodes[from_idx].To_id, temp_size * sizeof(int));
        // NOTE(review): To_id's capacity is declared in pagerank_serial.h,
        // which is not visible here - confirm con_size cannot exceed it.
        Nodes[from_idx].To_id[temp_size - 1] = to_idx;
    }
    //printf("End of connections insertion!\n");
    fclose(fid);
}

/***** Read P vector from txt file*****/
/*
 * Reads up to N doubles from "P.txt" into Nodes[i].p_t1, in file order.
 * Prints a message and returns without touching Nodes if the file cannot be
 * opened.
 */
void Read_P_from_txt_file()
{
    FILE *fid;
    double temp_P;
    int index = 0;

    fid = fopen("P.txt", "r");
    if (fid == NULL) {
        printf("Error opening the Probabilities file\n");
        return;
    }

    // P's values are double!
    while (index < N && fscanf(fid, "%lf\n", &temp_P) == 1) {
        Nodes[index].p_t1 = temp_P;
        index++;
    }
    //printf("End of P insertion!");
    fclose(fid);
}

/***** Read E vector from txt file*****/
/*
 * Reads up to N doubles from "E.txt" into Nodes[i].e, in file order.
 * Prints a message and returns without touching Nodes if the file cannot be
 * opened.
 */
void Read_E_from_txt_file()
{
    FILE *fid;
    double temp_E;
    int index = 0;

    fid = fopen("E.txt", "r");
    if (fid == NULL) {
        printf("Error opening the E file\n");
        return;
    }

    // E's values are double!
    while (index < N && fscanf(fid, "%lf\n", &temp_E) == 1) {
        Nodes[index].e = temp_E;
        index++;
    }
    //printf("End of E insertion!");
    fclose(fid);
}

/***** Create P and E with equal probability *****/
/*
 * Sets p_t0 = 0 and p_t1 = e = 1/N for every node, then asserts that both
 * distributions sum to 1 within a small floating-point tolerance.
 */
void Random_P_E()
{
    int i;
    // Sum of P (it must be =1)
    double sum_P_1 = 0;
    // Sum of E (it must be =1)
    double sum_E_1 = 0;

    // Arrays initialization
    for (i = 0; i < N; i++) {
        Nodes[i].p_t0 = 0;
        Nodes[i].p_t1 = 1.0 / N;
        sum_P_1 = sum_P_1 + Nodes[i].p_t1;
        Nodes[i].e = 1.0 / N;
        sum_E_1 = sum_E_1 + Nodes[i].e;
    }

    // Assert sum of probabilities is =1.
    // The posted code used `=` (assignment), which always passed. Compare
    // with a tolerance because N additions of 1/N are not exactly 1.
    //printf("Sum of P = %f\n",sum_P_1);
    assert(fabs(sum_P_1 - 1.0) < 1e-6);
    //printf("Sum of E = %f\n",sum_E_1);
    assert(fabs(sum_E_1 - 1.0) < 1e-6);
}

/***** Main function *****/
/*
 * Usage: <program> <graph filename> <N> <threshold> <d>
 *
 * Loads the graph, initialises uniform probabilities, copies the node table
 * to the device, runs the pagerank kernel once, and prints the kernel time
 * followed by the first (at most 100) p_t1 values copied back.
 *
 * NOTE(review): CUDA runtime return codes are still not checked here, exactly
 * as in the posted program. A failed cudaMalloc/cudaMemcpy/launch therefore
 * goes unnoticed and the output stays at calloc's zeros.
 */
int main(int argc, char** argv)
{
    int blockSize;   // The launch configurator returned block size
    int minGridSize; // The minimum grid size needed to achieve the maximum occupancy for a full device launch
    int gridSize;    // The actual grid size needed, based on input size
    int i;

    // Check input arguments
    if (argc < 5) {
        printf("Error in arguments! Four arguments required: graph filename, N, threshold and d\n");
        return EXIT_FAILURE;
    }

    // get arguments (argv[1] is used directly; copying it into a fixed
    // 256-byte buffer could overflow on long paths)
    char *filename = argv[1];
    N = atoi(argv[2]);
    threshold = atof(argv[3]);
    d = atof(argv[4]);

    if (N <= 0) {
        printf("Error in arguments! N must be a positive integer\n");
        return EXIT_FAILURE;
    }

    // Allocate zero-initialised memory for N nodes
    Nodes = (Node*) calloc((size_t)N, sizeof(Node));
    if (Nodes == NULL) {
        printf("Error allocating memory for %d nodes\n", N);
        return EXIT_FAILURE;
    }
    for (i = 0; i < N; i++) {
        Nodes[i].con_size = 0;
        //Nodes[i].To_id = (int*) malloc(sizeof(int));
    }

    Read_from_txt_file(filename);

    // set random probabilities
    Random_P_E();

    Node *h_ingoing;
    Node *h_outgoing;
    h_ingoing = Nodes;
    h_outgoing = (Node *)calloc((size_t)N, sizeof *h_outgoing);
    if (h_outgoing == NULL) {
        printf("Error allocating memory for %d nodes\n", N);
        free(Nodes);
        return EXIT_FAILURE;
    }

    Node *d_ingoing;
    Node *d_outgoing;
    cudaMalloc(&d_ingoing, N * sizeof *d_ingoing);
    cudaMalloc(&d_outgoing, N * sizeof *d_outgoing);
    cudaMemcpy(d_ingoing, h_ingoing, N * sizeof *h_ingoing, cudaMemcpyHostToDevice);
    cudaMemcpy(d_outgoing, h_outgoing, N * sizeof *h_outgoing, cudaMemcpyHostToDevice);

    float elapsed_ms;
    cudaEvent_t begin, end;

    // The last argument is the block-size limit passed to the configurator.
    cudaOccupancyMaxPotentialBlockSize(&minGridSize, &blockSize, pagerank, 0, N);
    // Round up according to array size
    gridSize = (N + blockSize - 1) / blockSize;
    printf("Gridsize, blockzise : %d , %d \n", gridSize, blockSize);

    cudaEventCreate(&begin);
    cudaEventCreate(&end);
    cudaEventRecord(begin, 0);

    // The kernel takes exactly three parameters. The posted call also passed
    // threshold and d, which does not compile against the definition above.
    pagerank<<<gridSize, blockSize>>>(d_ingoing, d_outgoing, N);

    cudaEventRecord(end, 0);
    cudaEventSynchronize(end);
    cudaEventElapsedTime(&elapsed_ms, begin, end);

    cudaMemcpy(h_outgoing, d_outgoing, N * sizeof *h_outgoing, cudaMemcpyDeviceToHost);

    printf("%f\n", elapsed_ms);
    printf("\n");

    // Print final probabilities (never read past the end of the table)
    for (i = 0; i < 100 && i < N; i++) {
        printf("P_t1[%d] = %f\n", i, h_outgoing[i].p_t1);
    }
    printf("\n");

    // Release device, event and host resources
    cudaEventDestroy(begin);
    cudaEventDestroy(end);
    cudaFree(d_ingoing);
    cudaFree(d_outgoing);
    free(h_outgoing);
    free(Nodes);

    printf("End of program!\n");
    return (EXIT_SUCCESS);
}

出典

2016-04-22 Haris

どのようなエラーが出ていますか？私は人間コンパイラではありません... –

カーネルを呼び出す側のコードがないと、何が起きているのかを判断するのは難しいですが、質問の意図は理解できたと思います。 –

私はコード全体を書いた、今説明してもらえますか？ありがとう – Haris

((idx > 0) && (idx < N))はidx=0の場合はfalseです。

さらに、お客様のコードでは、Nodeタイプの定義がありません。これは、コード内で何がうまくいかないのかをよりよく理解するためには必須です。

Nodeのサイズや内容、およびコンパイル時に使用している構造体のパッキング設定によっては、ホスト側のNodeのサイズがデバイス側のNodeのサイズと異なる可能性があります。printfでサイズを出力して確認するか、デバッガを使用してください。

また、起動時にエラーを確認していないようです。カーネルコールの後にcudaPeekAtLastErrorとcudaDeviceSynchronizeを追加して、エラーが発生していないことを確かめてください。（cuda Runtime APIからの他のメソッド呼び出しでも、コードでチェックされないエラーが返されることがあります）。

EDIT: 問題を再現するために、あなたのコードにできるだけ近い形で以下のコードを書きました。私のカードには十分なメモリがないため、ノード数を少なくしています。

// One graph node as used by this reproduction. The field order is unchanged,
// so the memory layout copied between host and device stays the same.
struct Node
{
    double p_t0;        // probability value; not written anywhere in this reproduction
    double p_t1;        // probability value copied by the pagerank kernel
    double e;           // per-node value; not written anywhere in this reproduction
    int    To_id[460];  // fixed-capacity array of target node indices
    int    con_size;    // presumably the number of used To_id entries - confirm against the caller
};

// Copies p_t1 from ingoing to outgoing, one element per thread.
//   ingoing  - source array of N nodes
//   outgoing - destination array of N nodes
//   N        - number of elements in both arrays
// Expects a 1-D launch. Index 0 is deliberately left untouched, mirroring the
// guard in the question's kernel.
__global__ void pagerank(Node* ingoing, Node* outgoing, int N)
{
    const int gid = blockIdx.x * blockDim.x + threadIdx.x ;

    // Nothing to do for index 0 or for threads past the end of the arrays.
    if (gid <= 0 || gid >= N)
        return ;

    outgoing[gid].p_t1 = ingoing[gid].p_t1 ;
}

#include <cstdio>
#include <cstdlib>

// Evaluates the CUDA runtime call `a` exactly once. On any result other than
// cudaSuccess it prints the file, line, numeric error code and error string to
// stderr and terminates the process with exit code 1.
// Wrapped in do { } while (0) so that `cudaCheck(x);` is a single statement
// and stays safe inside an unbraced if/else.
#define cudaCheck(a)                                                          \
    do {                                                                      \
        cudaError_t cudaCheck_err_ = (a) ;                                    \
        if (cudaCheck_err_ != cudaSuccess) {                                  \
            fprintf(stderr, "[ERROR @ %s : %d ] : (%d) - %s\n",               \
                    __FILE__, __LINE__, (int)cudaCheck_err_,                  \
                    cudaGetErrorString(cudaCheck_err_)) ;                     \
            ::exit(1) ;                                                       \
        }                                                                     \
    } while (0)

// Host-side reproduction driver.
// Fills N nodes with p_t1 = i + 1, copies them to the device, runs the
// pagerank kernel once, copies the result back, prints the first 100 values
// and reports every element whose p_t1 differs from i + 1.
// Returns 0 on completion and 1 if a host allocation fails. Any failing CUDA
// call terminates the program through cudaCheck.
int main() 
{ 
    // int N = 916428 ; // does not fit on my GPU 
    int N = 400000 ; 

    int blockSize; 
    int minGridSize; 
    int gridSize; 

    // calloc rather than malloc: only p_t1 is assigned below, and the whole
    // struct is copied to the device, so the other fields must not be
    // uninitialised.
    Node* Nodes = (Node*)calloc(N, sizeof (Node)) ; 
    Node* h_ingoing = Nodes; 
    Node* h_outgoing = (Node*)calloc(N, sizeof *h_outgoing) ; 

    if (Nodes == NULL || h_outgoing == NULL) 
    { 
        fprintf (stderr, "Host allocation of %d nodes failed\n", N) ; 
        free (h_outgoing) ; 
        free (Nodes) ; 
        return 1 ; 
    } 

    for (int i = 0 ; i < N ; ++i) 
        Nodes[i].p_t1 = (double)i+1; 

    Node* d_ingoing ; 
    Node* d_outgoing ; 

    cudaCheck (cudaMalloc(&d_ingoing, N * sizeof *d_ingoing)); 
    cudaCheck (cudaMalloc(&d_outgoing, N * sizeof *d_outgoing)); 

    cudaCheck (cudaMemcpy (d_ingoing, h_ingoing, N * sizeof *h_ingoing, cudaMemcpyHostToDevice)); 
    cudaCheck (cudaMemcpy (d_outgoing, h_outgoing, N * sizeof *h_outgoing, cudaMemcpyHostToDevice)); 

    float elapsedMs ; 

    cudaEvent_t begin, end ; 

    //blockSize = 256 ; 
    // The last argument is the block-size limit given to the configurator.
    cudaCheck (cudaOccupancyMaxPotentialBlockSize (&minGridSize, &blockSize, pagerank, 0, N)) ; 
    gridSize = (N + blockSize -1)/blockSize ; 

    printf ("Configuration = <<< %d , %d >>>\n", gridSize, blockSize) ; 

    cudaCheck (cudaEventCreate (&begin)) ; 
    cudaCheck (cudaEventCreate (&end)) ; 

    cudaCheck (cudaEventRecord (begin, 0)) ; 

    pagerank <<< gridSize, blockSize >>> (d_ingoing, d_outgoing, N) ; 
    // A launch does not return a status; invalid configurations are only
    // visible through cudaGetLastError.
    cudaCheck (cudaGetLastError ()) ; 

    cudaCheck (cudaEventRecord (end, 0)) ; 

    // Waiting on `end` also surfaces errors raised while the kernel ran.
    cudaCheck (cudaEventSynchronize (end)) ; 
    cudaCheck (cudaEventElapsedTime (&elapsedMs, begin, end)) ; 
    printf ("Kernel time = %f ms\n", elapsedMs) ; 

    cudaCheck (cudaMemcpy (h_outgoing, d_outgoing, N * sizeof *h_outgoing, cudaMemcpyDeviceToHost)) ; 

    for (int i = 0 ; i < 100 && i < N ; ++i) 
    { 
        printf ("P_t1[%d] = %f\n", i, h_outgoing[i].p_t1) ; 
    } 

    // The kernel's guard skips index 0, so that element keeps calloc's zero
    // and is reported here.
    for (int i = 0 ; i < N ; ++i) 
    { 
        if (h_outgoing[i].p_t1 != (double)(i+1)) 
            printf ("Error @ %d : %lf <> %lf\n", i, h_outgoing[i].p_t1, (double)(i+1)); 
    } 

    // Release events, device buffers and host buffers.
    cudaCheck (cudaEventDestroy (begin)) ; 
    cudaCheck (cudaEventDestroy (end)) ; 
    cudaCheck (cudaFree (d_ingoing)) ; 
    cudaCheck (cudaFree (d_outgoing)) ; 
    free (h_outgoing) ; 
    free (Nodes) ; 

    return 0 ; 
}

回答の最初の草案で問題として指摘したインデックス0を除き、すべての出力は正しい値になります。

出典

2016-04-24 13:02:05

ありがとう。私は以下のノードの説明を追加しました。これは何か変わるでしょうか？ – Haris

ノードは大きな構造体です。ランタイムエラーが発生していませんか？ –

実際、ノード配列（916428個のノードがあります）の合計サイズは約1.7GBです。Tesla K20mで実行しています。いいえ、少なくともデバッグツールを使わない限り、エラーは発生していません – Haris

CUDA：idxが行列を正しくインデックスしない

答えて

関連する問題