CUDAプログラムが期待通りに動作しない

-1

私はCUDA/C++を試していて、nボディシミュレータを作ることに決めました。 4096個の粒子間の重力引力をシミュレートします。それは約2または3FPSで動作し、私は完全になぜそうは分かりません。使用しているグラフィックスカードはGTX 980 Tiですので、プログラムがスムーズに動くことを期待しています。私はそれがおそらく最高のものに最適化されていないと理解していますが、それほどゆっくり動くとは思っていませんでした。CUDAプログラムが期待通りに動作しない

コードはプロトタイプであると仮定されているため、コードはどのような方法でもきちんと書かれていません。

main.cu

#include <Windows.h> 
#include <GL/glew.h> 
#include <GL/freeglut.h> 
#include <iostream> 
#include <vector> 
#include <math.h> 
#include "Particle.h" 
#include <cuda_runtime.h> 

#include <device_launch_parameters.h> 
#include <ctime> 
#include <string> 

#define N 4096 
#define DT 0.00001 
# define M_PI   3.14159265358979323846 /* pi */ 

using namespace std; 

Particle p[N]; 

int frames = 0; 
clock_t starttime = clock(); 
clock_t timepassed = 0; 
bool first = true; 
float fps = 0.0f; 

__global__ void updateParticle(Particle* out, Particle *pin) 
{ 
    int i = blockIdx.x * blockDim.x + threadIdx.x; 
    double velx = 0; 
    double vely = 0; 
    out[i].mass = pin[i].mass; 
    for(int j = 0; j < N; j++) 
    { 
     if (i == j || pin[j].mass == 0 || pin[i].mass == 0) 
      continue; 
     double difx = pin[i].posx - pin[j].posx; 
     double dify = pin[i].posy - pin[j].posy; 
     double len = difx * difx + dify * dify; 
     if (len == 0) 
      continue; 
     double force = (pin[i].mass * pin[j].mass)/len; 
     len = sqrt(len); 
     double dirx = -difx/len; 
     double diry = -dify/len; 
     dirx *= force; 
     diry *= force; 
     velx += (dirx/pin[i].mass + pin[i].velx) * DT; 
     vely += (diry/pin[i].mass + pin[i].vely) * DT; 
    } 
    out[i].posx = pin[i].posx + velx; 
    out[i].posy = pin[i].posy + vely; 
    out[i].velx = pin[i].velx; 
    out[i].vely = pin[i].vely; 

    while (out[i].posx > 1) 
     out[i].posx--; 
    while (out[i].posx < -1) 
     out[i].posx++; 
    while (out[i].posy > 1) 
     out[i].posy--; 
    while (out[i].posy < -1) 
     out[i].posy++; 
} 

void changeViewPort(int w, int h) 
{ 
    glViewport(0, 0, w, h); 

} 

void renderMore() 
{ 

    for (int i = 0; i < N; ++i) 
    { 
     if (p[i].mass == 0) 
      continue; 
     if (p[i].mass == 1) 
      glColor3f(1, 1, 1); 
     else 
      glColor3f(1, 0, 0); 
     glBegin(GL_LINE_LOOP); 
     for (int j = 0; j <= 4; j++) { 
      double angle = 2 * M_PI * j/300; 
      double x = cos(angle) * 0.001; 
      double y = sin(angle) * 0.001; 
      x *= p[i].mass; 
      y *= p[i].mass; 
      glVertex2d(x + p[i].posx, y + p[i].posy); 
     } 
     glEnd(); 
    } 

} 

void render(void) 
{ 
    if(first) 
    { 
     frames = 0; 
     starttime = clock(); 
     first = false; 
    } 
    glClear(GL_COLOR_BUFFER_BIT | GL_DEPTH_BUFFER_BIT); 
    renderMore(); 
    glutSwapBuffers(); 
    frames++; 
} 

void moveCuda(Particle* in, Particle* out) 
{ 
    Particle *device_p = nullptr; 
    Particle *device_res = nullptr; 
    cudaError_t cudaStatus; 

    int size = N * sizeof(Particle); 

    cudaStatus = cudaMalloc((void**)&device_res, size); 
    if (cudaStatus != cudaSuccess) { 
     fprintf(stderr, "cudaMalloc failed!"); 
    } 

    cudaStatus = cudaMalloc((void**)&device_p, size); 
    if (cudaStatus != cudaSuccess) { 
     fprintf(stderr, "cudaMalloc failed!"); 
    } 

    cudaStatus = cudaSetDevice(0); 
    if (cudaStatus != cudaSuccess) { 
     fprintf(stderr, "cudaSetDevice failed! Do you have a CUDA-capable GPU installed?"); 
    } 

    // Copy input vectors from host memory to GPU buffers. 
    cudaStatus = cudaMemcpy(device_p, in, size, cudaMemcpyHostToDevice); 
    if (cudaStatus != cudaSuccess) { 
     fprintf(stderr, "cudaMemcpy failed!"); 
    } 

    updateParticle << <N/1024, 1024 >> >(device_res, device_p); 

    cudaStatus = cudaGetLastError(); 
    if (cudaStatus != cudaSuccess) { 
     fprintf(stderr, "kernel launch failed: %s\n", cudaGetErrorString(cudaStatus)); 
    } 

    cudaStatus = cudaDeviceSynchronize(); 
    if (cudaStatus != cudaSuccess) { 
     fprintf(stderr, "cudaDeviceSynchronize returned error code %d after launching addKernel!\n", cudaStatus); 
    } 

    cudaStatus = cudaMemcpy(out, device_res, size, cudaMemcpyDeviceToHost); 
    if (cudaStatus != cudaSuccess) { 
     fprintf(stderr, "cudaMemcpy failed!"); 
    } 

    cudaFree(device_res); 
    cudaFree(device_p); 
} 

void update(int) 
{ 
    Particle temp[N] = {}; 
    moveCuda(p, temp); 
    for (int i = 0; i < N; ++i) 
     p[i] = temp[i]; 
    fps = (double)frames/((clock() - starttime)/1000); 
     const string a = "FPS: " + to_string(fps); 
     glutSetWindowTitle(a.c_str()); 

    glutTimerFunc(100.0/60, update, -1); 
} 

void idle() 
{ 
    glutPostRedisplay(); 
} 

int main(int argc, char* argv[]) 
{ 
    for (int i = 0; i < N; ++i) 
    { 
     p[i] = Particle(); 
    } 
    // Initialize GLUT 
    glutInit(&argc, argv); 
    // Set up some memory buffers for our display 
    glutInitDisplayMode(GLUT_DOUBLE | GLUT_RGBA | GLUT_DEPTH); 
    // Set the window size 
    glutInitWindowSize(1000, 1000); 
    // Create the window with the title "Hello,GL" 
    glutCreateWindow("Hello World"); 
    // Bind the two functions (above) to respond when necessary 
    glutReshapeFunc(changeViewPort); 
    glutDisplayFunc(render); 
    glutTimerFunc(100.0/60, update, -1); 
    glutIdleFunc(idle); 


    // Very important! This initializes the entry points in the OpenGL driver so we can 
    // call all the functions in the API. 
    GLenum err = glewInit(); 
    if (GLEW_OK != err) { 
     fprintf(stderr, "GLEW error"); 
     return 1; 
    } 

    render(); 
    glutMainLoop(); 
    return 0; 
}

Particle.cpp

#include "Particle.h" 
#include "stdlib.h" 
#include <host_defines.h> 

Particle::Particle() 
{ 
    posx = (((double)rand()/(RAND_MAX)) * 2) - 1; 
    posy = (((double)rand()/(RAND_MAX)) * 2) - 1; 
    velx = ((((double)rand()/(RAND_MAX)) * 2) - 1)/4; 
    vely = ((((double)rand()/(RAND_MAX)) * 2) - 1)/4; 
    mass = 1; 
}

Particle.h

#pragma once 

class Particle 
{ 
public: 
    Particle(); 
    void Update(); 
    double posx; 
    double posy; 
    double velx; 
    double vely; 
    double mass; 
};

Iは、グラフィックス・デバイスを設定する行を削除すると、それがエラーをスローするが、継続2〜3フレーム/秒で動作します。これは私がグラフィックカードを手に入れることができないことを示しているかもしれませんが、私はこれをどうしたらいいのか分かりません。私はそれをcudaSetDevice（0）に設定すると、エラーをスローしません。グラフィックスカードが動作しており、モニタが接続されていて動作しています。

誰かが指針やアドバイスを提供できるのであれば、私は大いに感謝しています。

出典

2017-04-18 Jacob Morris

エラー： 'Particle'にメンバー 'p'がありません。誰かがコンパイルできるコードを提供してください。 –

コードを更新しました。今すぐコンパイルされます。 –

まず、CUDA nbody sample codeを勉強したいと思うかもしれません。それは、私よりもうまく書かれたコードを公開するより良い仕事です。また、そのサンプルには、参考になるthis chapterへのリンクが含まれています。

元のコードよりもはるかに高速に動作するように見えるコードを表示します。ここで私が適用される一般的な戦略は以下のとおりです。

あなたはリリースプロジェクト、ないデバッグプロジェクトを構築していることを確認してください。
不要cudaMalloc/cudaFreeまたはcudaMemcpy操作をしないでください。これらの割り当ては一度実行してから再利用することをお勧めします。ホストコードで何か（位置、速度）を変更していないので、moveCudaの繰り返しごとにデバイスを更新する必要はありません。デバイスにデータを残すだけです。これにより、私たちはOpenGLの処理を実行できるように、私たちを1つのcudaMemcpyオペレーションに縮小しました（しかし以下を参照）。私はこれから約3倍のブーストを得るように見えました。また、不要なコピーを避けるために、「ピンポン」バッファー戦略を実装しました。
doubleの代わりにfloatを使用してください。これにはいくつかの利点があります。まず、半分のデータを検索するので、メモリトラフィックを削減します。第2に、使用しているGPUがfloatのスループット（数学演算）が劇的に高く、doubleよりも大きいです。私はこれがコンピューティングバウンドのカーネルだとは思わないので、メモリトラフィックが大きな問題だと思います。私はこれからさらに3倍のブーストを得るように見えました。
粒子をAoSからSoAに変換します。このトピックについては、cudaタグの他の多くの場所でも説明していますので、ここではこれを検討しません。私はこれを完全には実行していませんでした。代わりに部分的な変換（別の配列への質量の除去）を行い、残ったfloat4量の速度x/yと位置x/yに対して「ベクトル荷重」戦略を使用しました。 hereは、AoS→SoA変換とそれがなぜ重要であるか、そしてここで取り上げたベクトルロード「ショートカット」の両方について議論する答えの例です。
4096は、現代GPUのスレッド数が比較的少ないです。 1024スレッドブロックから512スレッドブロックに切り替えることで、小さなメリットが得られます。これにより、カーネルはGPU上で利用可能なSMをいっぱいにする機会が少し増えます。 SMが4つ以下の場合はそれほど大きな違いはありませんが、980 TiにはのSMがあるため、最高のパフォーマンスを目指すにはSMに少なくとも1つのブロックを置くことが最善の方法です。したがって、256スレッドのブロックを試してみることもできます（合計16スレッドブロック）。
これは、計算のセット、かなり「高価」である：
```
len = sqrt(len); 
double dirx = -difx/len; 
double diry = -dify/len; 
```
それはrsqrtf()がsqrtf()として計算しようとして簡単であることが判明し、それを私たちは、その後の浮動小数点除算演算を回すことができます浮動小数点乗算演算に変換します。これらの基本的な手順で

、私は非常に古いGPUで約30fpsのに取得することができた、あなたはおそらくそれよりももっと良いものを目撃する必要があります。私はLinuxで作業していましたが、私が行った変更のどれもが窓の下で "壊れる"べきではないと思います。

#include <GL/glew.h> 
#include <GL/freeglut.h> 
#include <iostream> 
#include <vector> 
#include <math.h> 
#include <ctime> 
#include <string> 
#include <cstdlib> 
#include <cstdio> 
#include <time.h> 
#define N 4096 
#define DT 0.00001 
#define M_PI   3.14159265358979323846 /* pi */ 


class Particle 
{ 
public: 
    Particle(); 
    float4 p; 
}; 

Particle::Particle() 
{ 
    p.x = (((double)rand()/(RAND_MAX)) * 2) - 1; 
    p.y = (((double)rand()/(RAND_MAX)) * 2) - 1; 
    p.z = ((((double)rand()/(RAND_MAX)) * 2) - 1)/4; 
    p.w = ((((double)rand()/(RAND_MAX)) * 2) - 1)/4; 
} 
const int size = N * sizeof(Particle); 


using namespace std; 

Particle p[N]; 
float pmass[N]; 
Particle *d_p1, *d_p2; 
float *d_pmass1, *d_pmass2; 
int ping_pong = 0; 
float et; 
cudaEvent_t start, stop; 
int frames = 0; 
clock_t starttime = clock(); 
clock_t timepassed = 0; 
bool first = true; 
float fps = 0.0f; 

__global__ void updateParticle(Particle * __restrict__ out, float * __restrict__ pmass_out, const Particle * __restrict__ pin, const float * __restrict__ pmass_in) 
{ 
    int i = blockIdx.x * blockDim.x + threadIdx.x; 
    float velx = 0; 
    float vely = 0; 
    Particle my_i = pin[i]; 
    float my_mass_i = pmass_in[i]; 
    pmass_out[i] = my_mass_i; 
    for(int j = 0; j < N; j++) 
    { 
     float my_mass_j = pmass_in[j]; 
     if (i == j || my_mass_i == 0 || my_mass_j == 0) 
      continue; 
     Particle my_j = pin[j]; 
     float difx = my_i.p.x - my_j.p.x; 
     float dify = my_i.p.y - my_j.p.y; 
     float len = difx * difx + dify * dify; 
     if (len == 0) 
      continue; 
     float force = (my_mass_i * my_mass_j)/len; 
     len = rsqrtf(len); 
     float dirx = -difx * len; 
     float diry = -dify * len; 
     dirx *= force; 
     diry *= force; 
     velx += (dirx/my_mass_i + my_i.p.z) * DT; 
     vely += (diry/my_mass_i + my_i.p.w) * DT; 
    } 
    Particle my_out_i = my_i; 
    my_out_i.p.x = my_i.p.x + velx; 
    my_out_i.p.y = my_i.p.y + vely; 
    my_out_i.p.z = my_i.p.z; 
    my_out_i.p.w = my_i.p.w; 

    if (my_out_i.p.x > 1) 
     my_out_i.p.x = 1; 
    if (my_out_i.p.x < -1) 
     my_out_i.p.x = -1; 
    if (my_out_i.p.y > 1) 
     my_out_i.p.y = 1; 
    if (my_out_i.p.y < -1) 
     my_out_i.p.y = -1; 
    out[i] = my_out_i; 
} 

void changeViewPort(int w, int h) 
{ 
    glViewport(0, 0, w, h); 

} 

void renderMore() 
{ 

    for (int i = 0; i < N; ++i) 
    { 
     if (pmass[i] == 0) 
      continue; 
     if (pmass[i] == 1) 
      glColor3f(1, 1, 1); 
     else 
      glColor3f(1, 0, 0); 
     glBegin(GL_LINE_LOOP); 
     for (int j = 0; j <= 4; j++) { 
      double angle = 2 * M_PI * j/300; 
      double x = cos(angle) * 0.001; 
      double y = sin(angle) * 0.001; 
      x *= pmass[i]; 
      y *= pmass[i]; 
      glVertex2d(x + p[i].p.x, y + p[i].p.y); 
     } 
     glEnd(); 
    } 

} 

void render(void) 
{ 
    if(first) 
    { 
     frames = 0; 
     starttime = clock(); 
     first = false; 
    } 
    glClear(GL_COLOR_BUFFER_BIT | GL_DEPTH_BUFFER_BIT); 
    renderMore(); 
    glutSwapBuffers(); 
    frames++; 
} 

void moveCuda(Particle* in, Particle* out) 
{ 
    Particle *d_pi; 
    Particle *d_po; 
    float *d_pmassi, *d_pmasso; 
    cudaError_t cudaStatus; 
    if (ping_pong) { 
     d_pi = d_p2; 
     d_po = d_p1; 
     d_pmassi = d_pmass2; 
     d_pmasso = d_pmass1; 
     ping_pong = 0;} 
    else { 
     d_pi = d_p1; 
     d_po = d_p2; 
     d_pmassi = d_pmass1; 
     d_pmasso = d_pmass2; 
     ping_pong = 1;} 
    cudaEventRecord(start); 
    cudaStatus = cudaSetDevice(0); 
    if (cudaStatus != cudaSuccess) { 
     fprintf(stderr, "cudaSetDevice failed! Do you have a CUDA-capable GPU installed?"); 
    } 
    updateParticle << <N/256, 256 >> >(d_po, d_pmasso, d_pi, d_pmassi); 
    if (cudaStatus != cudaSuccess) { 
     fprintf(stderr, "kernel launch failed: %s\n", cudaGetErrorString(cudaStatus)); 
    } 

    cudaStatus = cudaDeviceSynchronize(); 
    if (cudaStatus != cudaSuccess) { 
     fprintf(stderr, "cudaDeviceSynchronize returned error code %d after launching addKernel!\n", cudaStatus); 
    } 
    cudaEventRecord(stop); 

    cudaStatus = cudaMemcpy(out, d_po, size, cudaMemcpyDeviceToHost); 
    if (cudaStatus != cudaSuccess) { 
     fprintf(stderr, "cudaMemcpy failed!"); 
    } 
    //cudaEventRecord(stop); 
    cudaEventSynchronize(stop); 
    cudaEventElapsedTime(&et, start, stop); 

} 

void update(int) 
{ 
    Particle temp[N] = {}; 
    moveCuda(p, temp); 
    for (int i = 0; i < N; ++i) 
     p[i] = temp[i]; 
    char a[64]; 
    fps = (float)frames/((clock() - starttime)/CLOCKS_PER_SEC); 
     sprintf(a, "FPS: %f, et: %f\0", fps, et); 
     glutSetWindowTitle(a); 

    glutTimerFunc(100.0/60, update, -1); 
} 

void idle() 
{ 
    glutPostRedisplay(); 
} 

int main(int argc, char* argv[]) 
{ 
    for (int i = 0; i < N; ++i) 
    { 
     p[i] = Particle(); 
     pmass[i] = 1; 
    // p[i].p(); 
    } 
    cudaMalloc((void**)&d_p2, size); 
    cudaMalloc((void**)&d_p1, size); 
    cudaMalloc((void**)&d_pmass2, N*sizeof(float)); 
    cudaMalloc((void**)&d_pmass1, N*sizeof(float)); 
    cudaMemcpy(d_p1, p, size, cudaMemcpyHostToDevice); 
    cudaMemcpy(d_pmass1, pmass, N*sizeof(float), cudaMemcpyHostToDevice); 
    cudaEventCreate(&start); cudaEventCreate(&stop); 
    // Initialize GLUT 
    glutInit(&argc, argv); 
    // Set up some memory buffers for our display 
    glutInitDisplayMode(GLUT_DOUBLE | GLUT_RGBA | GLUT_DEPTH); 
    // Set the window size 
    glutInitWindowSize(1000, 1000); 
    // Create the window with the title "Hello,GL" 
    glutCreateWindow("Hello World"); 
    // Bind the two functions (above) to respond when necessary 
    glutReshapeFunc(changeViewPort); 
    glutDisplayFunc(render); 
    glutTimerFunc(100.0/60, update, -1); 
    glutIdleFunc(idle); 


    // Very important! This initializes the entry points in the OpenGL driver so we can 
    // call all the functions in the API. 
    GLenum err = glewInit(); 
    if (GLEW_OK != err) { 
     fprintf(stderr, "GLEW error"); 
     return 1; 
    } 

    render(); 
    glutMainLoop(); 
    return 0; 
}

私は（私はあなたではなかったと思います）、これは欠陥のないコードであることを主張しませんが、それはあなたの元のコードとほぼ同じでグラフィカルに振る舞うように見えました。例えば、あなたのコードでは、あなたはカーネルの最後にこれを持っている：右私には見えません

out[i].velx = pin[i].velx; 
out[i].vely = pin[i].vely;

が、それは性能が、ここで議論されているの中心ではありません。

質量が常に1または0であることが分かっている場合は、このコードをさらに大幅に最適化することができますが、それを追求していません。

ここに残っているデバイス - >ホストコピーを取り除き、データをGPUに永久に移動するには、CUDA/OpenGLの相互運用戦略について考えてみてください。繰り返しになりますが、CUDAのサンプルコードはロードマップになる可能性があります.CUDA/GL interopを使い始める場合は、this presentationは少し古いですが、良い出発点です。

出典

2017-04-19 03:13:00

CUDAプログラムが期待通りに動作しない

答えて

関連する問題