2017-06-09 22 views
0

ベクトルを並列に追加する方が非平行な加算よりも速い方法を比較しようとしています。並列加算を高速化するためには、大きな数字が必要です。私のNが10000であれば、私はまだそれを実行することができます。 0xC00000FD:私のNが100000であるときしかし、私はbasicCuda.exeで0x00D25B89でCUDA未処理の例外とスタックのオーバーフローをプログラミングする

未処理の例外を取得し、私はこの問題をどのように修正すればよい

: オーバーフロー(0x00000000に、0x002F2000パラメータ)をスタック?

#include "cuda_runtime.h" 
#include "device_launch_parameters.h" 
#include <iostream> 
#include <stdio.h> 
#include <time.h> 
#include <chrono> 
cudaError_t addWithCuda(int *c, const int *a, const int *b, uint64_t N); 

__global__ void addKernel(int *c, const int *a, const int *b, uint64_t N) 
{ 
    int i = threadIdx.x + blockIdx.x * blockDim.x; 
    if (i < N) { 
     c[i] = a[i] * b[i]; 
    } 
} 


void randomizeArr(int arr[], int size) { 

    unsigned int randNum; 
    srand(time(NULL)); 
    for (int i = 0; i < size; i++) { 
     randNum = rand() % 100 + 1; 
     arr[i] = randNum; 
    } 
} 

void addWithCPU(int c[], int a[], int b[], int size) { 
    for (int i = 0; i < size; i++) { 
     c[i] = a[i] + b[i]; 
    } 
} 

#define N (10000) // Number of elements each array has 
#define M 1024 // 512 Threads Per Block 
int main() 
{ 
    const uint64_t arSize = N; 
    int a[arSize] = { 0 }; 
    int b[arSize] = { 0 }; 
    randomizeArr(a, arSize); 
    randomizeArr(b, arSize); 
    int c[arSize] = { 0 }; 
    int d[arSize] = { 0 }; 

    // Add vectors in parallel. 
    int iterations = 100; 
    cudaError cudaStatus; 
    auto begin = std::chrono::high_resolution_clock::now(); 
    for (uint32_t i = 0; i < iterations; ++i) 
    { 
     cudaStatus = addWithCuda(c, a, b, arSize); 
    } 
    auto end = std::chrono::high_resolution_clock::now(); 
    auto duration = std::chrono::duration_cast<std::chrono::nanoseconds>(end - begin).count(); 
    std::cout << "Parallel : " << duration/iterations << "ns." << std::endl; 

    // Add vectors NOT in parallel. 
    auto begin2 = std::chrono::high_resolution_clock::now(); 
    for (uint32_t i = 0; i < iterations; ++i) 
    { 
     addWithCPU(d, a, b, arSize); 
    } 
    auto end2 = std::chrono::high_resolution_clock::now(); 
    auto duration2 = std::chrono::duration_cast<std::chrono::nanoseconds>(end2 - begin2).count(); 
    std::cout << "Not Parallel : " << duration2/iterations << "ns." << std::endl; 


    if (cudaStatus != cudaSuccess) { 
     fprintf(stderr, "addWithCuda failed!"); 
     std::getchar(); 
     return 1; 
    } 


    // cudaDeviceReset must be called before exiting in order for profiling and 
    // tracing tools such as Nsight and Visual Profiler to show complete traces. 
    cudaStatus = cudaDeviceReset(); 
    if (cudaStatus != cudaSuccess) { 
     fprintf(stderr, "cudaDeviceReset failed!"); 
     return 1; 
    } 
    std::getchar(); 

    return 0; 
} 

// Helper function for using CUDA to add vectors in parallel. 
cudaError_t addWithCuda(int *c, const int *a, const int *b, uint64_t size) 
{ 
    int *dev_a = 0; 
    int *dev_b = 0; 
    int *dev_c = 0; 
    cudaError_t cudaStatus; 

    // Choose which GPU to run on, change this on a multi-GPU system. 
    cudaStatus = cudaSetDevice(0); 
    if (cudaStatus != cudaSuccess) { 
     fprintf(stderr, "cudaSetDevice failed! Do you have a CUDA-capable GPU installed?"); 
     goto Error; 
    } 

    // Allocate GPU buffers for three vectors (two input, one output) . 
    cudaStatus = cudaMalloc((void**)&dev_c, size * sizeof(int)); 
    if (cudaStatus != cudaSuccess) { 
     fprintf(stderr, "cudaMalloc failed!"); 
     goto Error; 
    } 

    cudaStatus = cudaMalloc((void**)&dev_a, size * sizeof(int)); 
    if (cudaStatus != cudaSuccess) { 
     fprintf(stderr, "cudaMalloc failed!"); 
     goto Error; 
    } 

    cudaStatus = cudaMalloc((void**)&dev_b, size * sizeof(int)); 
    if (cudaStatus != cudaSuccess) { 
     fprintf(stderr, "cudaMalloc failed!"); 
     goto Error; 
    } 

    // Copy input vectors from host memory to GPU buffers. 
    cudaStatus = cudaMemcpy(dev_a, a, size * sizeof(int), cudaMemcpyHostToDevice); 
    if (cudaStatus != cudaSuccess) { 
     fprintf(stderr, "cudaMemcpy failed!"); 
     goto Error; 
    } 

    cudaStatus = cudaMemcpy(dev_b, b, size * sizeof(int), cudaMemcpyHostToDevice); 
    if (cudaStatus != cudaSuccess) { 
     fprintf(stderr, "cudaMemcpy failed!"); 
     goto Error; 
    } 

    // Launch a kernel on the GPU with one thread for each element. 
    addKernel<<<(N + M - 1)/ M, M>>>(dev_c, dev_a, dev_b, N); 

    // Check for any errors launching the kernel 
    cudaStatus = cudaGetLastError(); 
    if (cudaStatus != cudaSuccess) { 
     fprintf(stderr, "addKernel launch failed: %s\n", cudaGetErrorString(cudaStatus)); 
     goto Error; 
    } 

    // cudaDeviceSynchronize waits for the kernel to finish, and returns 
    // any errors encountered during the launch. 
    cudaStatus = cudaDeviceSynchronize(); 
    if (cudaStatus != cudaSuccess) { 
     fprintf(stderr, "cudaDeviceSynchronize returned error code %d after launching addKernel!\n", cudaStatus); 
     goto Error; 
    } 

    // Copy output vector from GPU buffer to host memory. 
    cudaStatus = cudaMemcpy(c, dev_c, size * sizeof(int), cudaMemcpyDeviceToHost); 
    if (cudaStatus != cudaSuccess) { 
     fprintf(stderr, "cudaMemcpy failed!"); 
     goto Error; 
    } 

Error: 
    cudaFree(dev_c); 
    cudaFree(dev_a); 
    cudaFree(dev_b); 

    return cudaStatus; 
} 

答えて

2

すべての配列:

int a[arSize] 
int b[arSize] 
int c[arSize] 
int d[arSize] 

は "メイン" 機能でスタック上に作成されます。 arSize = 100000、sizeof(int)= 4の場合、1600000バイト(1.5 MB)の割り当てをお願いします。

代わりに、あなたはnewでメモリを割り当てることができます:

int* a = new int[arSize](); 

(すべての配列値が0に初期化されることに注意し、また c++ initial value of dynamic arrayを参照してください)