OpenCLでリダクション（縮約）を行うベストプラクティスは何ですか？

結合法則を満たす二項演算（ここでは「+」と呼びます）を考えます。このとき a1 + a2 + a3 + a4 + ... は並列に計算できます。まず

b1 = a1 + a2
b2 = a3 + a4

を並列に計算し、次に

c1 = b1 + b2
c2 = b3 + b4

を計算し、以降も前のステップの結果に対して同じ処理を、要素が1つになるまで繰り返します。

私はOpenCLを学んでいるところで、このアプローチを実装して配列の全要素の合計を求めようとしています。この技術については初心者なので、プログラムに奇妙な点があるかもしれません。

これがメインプログラムです。バッファの要素が1つだけになるまで、カーネルを繰り返し実行します：

#include <stdio.h> 
#include <stdlib.h> 
#include <fcntl.h> 
#include <unistd.h> 
#include <sys/mman.h> 
#include <sys/stat.h> 
#include <CL/cl.h> 

#define N (64*64*64*64) 

#include <sys/time.h> 
#include <stdlib.h> 

/*
 * Return the current wall-clock time as fractional seconds.
 *
 * The value combines the seconds and microseconds fields reported by
 * gettimeofday(); it is intended for measuring elapsed intervals by
 * subtracting two readings.
 */
double gettime()
{
    struct timeval now;
    double whole_seconds;
    double microseconds;

    gettimeofday(&now, NULL);

    whole_seconds = (double)now.tv_sec;
    microseconds = (double)now.tv_usec;

    /* Scale the microsecond part down to seconds before adding it on. */
    return whole_seconds + (microseconds * 0.000001);
}

int main() 
{ 
    int i, fd, res = 0; 
    void* kernel_source = MAP_FAILED; 

    cl_context context; 
    cl_context_properties properties[3]; 
    cl_kernel kernel; 
    cl_command_queue command_queue; 
    cl_program program; 
    cl_int err; 
    cl_uint num_of_platforms=0; 
    cl_platform_id platform_id; 
    cl_device_id device_id; 
    cl_uint num_of_devices=0; 
    cl_mem input, output; 
    size_t global, local; 

    cl_float *array = malloc (sizeof (cl_float)*N); 
    cl_float *array2 = malloc (sizeof (cl_float)*N); 
    for (i=0; i<N; i++) array[i] = i; 

    fd = open ("kernel.cl", O_RDONLY); 
    if (fd == -1) { 
     perror ("Cannot open kernel"); 
     res = 1; 
     goto cleanup; 
    } 
    struct stat s; 

    res = fstat (fd, &s); 
    if (res == -1) { 
     perror ("Cannot stat() kernel"); 
     res = 1; 
     goto cleanup; 
    } 

    kernel_source = mmap (NULL, s.st_size, PROT_READ, MAP_PRIVATE, fd, 0); 
    if (kernel_source == MAP_FAILED) { 
     perror ("Cannot map() kernel"); 
     res = 1; 
     goto cleanup; 
    } 

    if (clGetPlatformIDs (1, &platform_id, &num_of_platforms) != CL_SUCCESS) { 
     printf("Unable to get platform_id\n"); 
     res = 1; 
     goto cleanup; 
    } 

    if (clGetDeviceIDs(platform_id, CL_DEVICE_TYPE_GPU, 1, &device_id, 
         &num_of_devices) != CL_SUCCESS) 
    { 
     printf("Unable to get device_id\n"); 
     res = 1; 
     goto cleanup; 
    } 
    properties[0]= CL_CONTEXT_PLATFORM; 
    properties[1]= (cl_context_properties) platform_id; 
    properties[2]= 0; 
    context = clCreateContext(properties,1,&device_id,NULL,NULL,&err); 
    command_queue = clCreateCommandQueue(context, device_id, 0, &err); 
    program = clCreateProgramWithSource(context, 1, (const char**)&kernel_source, NULL, &err); 


    if (clBuildProgram(program, 0, NULL, NULL, NULL, NULL) != CL_SUCCESS) { 
     char buffer[4096]; 
     size_t len; 

     printf("Error building program\n"); 
     clGetProgramBuildInfo (program, device_id, CL_PROGRAM_BUILD_LOG, sizeof (buffer), buffer, &len); 
     printf ("%s\n", buffer); 
     res = 1; 
     goto cleanup; 
    } 

    kernel = clCreateKernel(program, "reduce", &err); 
    if (err != CL_SUCCESS) { 
     printf("Unable to create kernel\n"); 
     res = 1; 
     goto cleanup; 
    } 

    // create buffers for the input and ouput 
    input = clCreateBuffer(context, CL_MEM_READ_ONLY, 
          sizeof(cl_float) * N, NULL, NULL); 
    output = clCreateBuffer(context, CL_MEM_WRITE_ONLY, 
          sizeof(cl_float) * N, NULL, NULL); 

    // load data into the input buffer 
    clEnqueueWriteBuffer(command_queue, input, CL_TRUE, 0, 
          sizeof(cl_float) * N, array, 0, NULL, NULL); 

    size_t size = N; 
    cl_mem tmp; 
    double time = gettime(); 
    while (size > 1) 
    { 
     // set the argument list for the kernel command 
     clSetKernelArg(kernel, 0, sizeof(cl_mem), &input); 
     clSetKernelArg(kernel, 1, sizeof(cl_mem), &output); 
     global = size; 
     local = 64; 

     // enqueue the kernel command for execution 
     clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL, &global, 
          &local, 0, NULL, NULL); 
     clFinish(command_queue); 
     size = size/64; 
     tmp = output; 
     output = input; 
     input = tmp; 
    } 
    cl_float answer[1]; 
    clEnqueueReadBuffer(command_queue, tmp, CL_TRUE, 0, 
         sizeof(cl_float), array, 0, NULL, NULL); 
    time = gettime() - time; 
    printf ("%f %f\n", array[0], time); 

cleanup: 
    free (array); 
    free (array2); 
    clReleaseMemObject(input); 
    clReleaseMemObject(output); 
    clReleaseProgram(program); 
    clReleaseKernel(kernel); 
    clReleaseCommandQueue(command_queue); 
    clReleaseContext(context); 

    if (kernel_source != MAP_FAILED) munmap (kernel_source, s.st_size); 
    if (fd != -1) close (fd); 

    _Exit (res); // Kludge 
    return res; 
}

このように、バッファの要素が1つになるまでカーネルを繰り返し実行します。以下がそのカーネルです：

/*
 * Partial-sum kernel.
 *
 * Each work-item adds up a run of consecutive floats from `input` and
 * stores the total in `output` at its own global index.  The run length is
 * the work-group size, and the run for work-item g starts at element
 * g * (work-group size).
 *
 * No bounds are checked here: the caller must size `input` so that every
 * launched work-item's run lies inside the buffer.
 */
__kernel void reduce (__global float *input, __global float *output)
{
    const size_t item = get_global_id (0);
    const size_t run_length = get_local_size (0);
    const size_t first = run_length * item;   /* start of this item's run */
    float total = 0;
    size_t offset = 0;

    /* Accumulate in ascending index order. */
    while (offset < run_length) {
        total += input[first + offset];
        offset++;
    }

    output[item] = total;
}

（上の最初のコードがメインプログラム、直前のコードがカーネルです。）これはOpenCLで要素の合計を計算する正しいアプローチでしょうか？gettimeで測定した実行時間は、CPU上の単純なループ（clang 4.0.0、-O2 -ffast-math フラグでコンパイル）と比べて約10倍遅くなっています。使用しているハードウェア：AMD Ryzen 5 1600X と AMD Radeon HD 6950。

出典

2017-10-05 shamaz.mazum

パフォーマンスを向上させるためにできることはいくつかあります。

まず、ループ内の clFinish 呼び出しを取り除きます。clFinish があると、カーネルを1回実行するたびに、コマンドキュー全体がホストとの同期ポイントに達するまで待たなければなりませんが、これは不要です。必要な同期は、カーネルが順番に実行されることだけです。アウトオブオーダーのキュー（あなたのプログラムはそもそも要求していません）を使っている場合でも、イベントオブジェクトを使えば簡単に順序を保証できます。

size_t size = N; 
size_t total_expected_events = 0; 
for(size_t event_count = size; event_count > 1; event_count /= 64) 
    total_expected_events++; 
cl_event * events = malloc(total_expected_events * sizeof(cl_event)); 
cl_mem tmp; 
double time = gettime(); 
size_t event_index = 0; 
while (size > 1) 
{ 
    // set the argument list for the kernel command 
    clSetKernelArg(kernel, 0, sizeof(cl_mem), &input); 
    clSetKernelArg(kernel, 1, sizeof(cl_mem), &output); 
    global = size; 
    local = 64; 

    if(event_index == 0) 
     // enqueue the kernel command for execution 
     clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL, &global, 
          &local, 0, NULL, events); 
    else 
     clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL, &global, 
          &local, 1, events + (event_index - 1), events + event_index); 
    size = size/64; 
    tmp = output; 
    output = input; 
    input = tmp; 
    event_index++; 
} 
clFinish(command_queue); 
for(; event_index > 0; event_index--) 
    clReleaseEvent(events[event_index-1]); 
free(events); 
cl_float answer[1]; 
clEnqueueReadBuffer(command_queue, tmp, CL_TRUE, 0, 
        sizeof(cl_float), array, 0, NULL, NULL);

もう1つ検討すべきなのは、同じカーネルを複数回呼び出して段階的に処理するのではなく、1つのカーネルの中でリダクションをすべて実行することです。こちらがその一例になり得ますが、必要以上に複雑かもしれません。

出典

2017-10-05 20:27:16 Xirema

clFinish を削除するという有用なアドバイスをありがとうございます。AMDの記事を参考に、作業をワークグループに分散し、ローカルメモリを活用するようにカーネルを改善できました。ただ、その記事にはまだ分かりにくい点があります。例えば、なぜ演算の順序を変える（演算の可換性を利用する）必要があるのでしょうか。私の理解では、ワークアイテムが隙間なく連続した要素を読み込むほうが効率が良い、ということだと思うのですが、合っていますか？また、記事に出てくる SIMD ウェーブフロントとは何でしょうか？ –

さまざまなGPUメーカー（NVIDIA、AMD、Intelなど）のOpenCL最適化ガイドを確認してください。これらのガイドは、用語も含めてGPUの仕組みをかなり分かりやすく紹介しています。 – pmdj

ところで、[このリンク](http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.225.1324&rep=rep1&type=pdf)を見つけました。非常に役に立ちます。 –

OpenCLでリダクション（縮約）を行うベストプラクティスは何ですか？

答えて

関連する問題