2013-05-15

Gradient descent optimization in CUDA

My first relatively large CUDA project is coding gradient descent optimization for machine learning. I would like to benefit from the wisdom of the crowd about any native CUDA features that might be of quick use in such a project. Any ideas/suggestions?


What kind of gradient descent implementation do you have in mind? You can find some interesting examples, with various methods and results, [**here**](http://blog.accelereyes.com/blog/2011/09/20/optimization-methods-for-deep-learning/). There is also [**this other post**](http://adnanboz.wordpress.com/2012/02/25/large-scale-machine-learning-using-nvidia-cuda/) about machine learning and GPGPU. So could you give us some more information about your problem? – BenC


Thanks for the links, but I am not trying to learn GD itself; I just want to learn which CUDA features could be useful for such a project – erogol


The problem is that this kind of question may be too broad. There are linear algebra libraries ([MAGMA](https://developer.nvidia.com/magma), [CUBLAS](https://developer.nvidia.com/cublas)) and math libraries ([CUDA Math Library](https://developer.nvidia.com/cuda-math-library)), and if you want a development-oriented library, [Thrust](http://thrust.github.io/) is definitely a good choice. You can check [**this list**](https://developer.nvidia.com/technologies/Libraries) on the NVIDIA website. – BenC
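
To give an idea of how compact such library-based code can be, here is a minimal, hypothetical sketch of a single gradient descent update x <- x - gamma * g written with Thrust; the vectors d_x and d_g, their contents and the step size gamma are made-up placeholders, not something from the discussion above:

#include <thrust/device_vector.h> 
#include <thrust/transform.h> 

// --- Functor for one descent update: returns x - gamma * g (gamma assumed constant) 
struct UpdateStep { 
    const float gamma; 
    UpdateStep(float _gamma) : gamma(_gamma) {} 
    __host__ __device__ float operator()(const float x, const float g) const { return x - gamma * g; } 
}; 

int main() { 
    const int   M     = 5; 
    const float gamma = 0.001f; 

    thrust::device_vector<float> d_x(M, 1.2f);   // --- current iterate (placeholder values) 
    thrust::device_vector<float> d_g(M, 0.5f);   // --- gradient (placeholder values) 

    // --- Element-wise update x <- x - gamma * g, executed on the GPU 
    thrust::transform(d_x.begin(), d_x.end(), d_g.begin(), d_x.begin(), UpdateStep(gamma)); 

    return 0; 
} 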

Answer


Gradient descent (also known as steepest descent) aims at finding a local minimum of a multivariable function F(x) by taking steps proportional to the negative of the gradient of F(x) at the current point. The update rule is the following:

x_(n+1) = x_n - gamma_n * grad F(x_n)

where the step size gamma_n is allowed to change at every step and can be determined, for example, by a line search.

Implementing the above update rule in CUDA is rather easy. Below, I provide a full example using the Rosenbrock function as the cost functional to be optimized, exploiting its analytical gradient and considering a constant step size across the iterations (i.e., gamma_n = gamma). The Utilities.cu and Utilities.cuh files are available at OrangeOwlSolutions/CUDA_Utilities and are omitted here. The example implements the CPU approach as well as the GPU approach.
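
For reference, the Rosenbrock cost function used as the functional to be minimized below, and the analytical partial derivatives implemented in CostFunctionGradientCPU and CostFunctionGradientGPU, are (in 1-based notation):

f(x) = sum_{i=1}^{M-1} [ 100 * (x_{i+1} - x_i^2)^2 + (x_i - 1)^2 ]

df/dx_1 = -400 * x_1 * (x_2 - x_1^2) + 2 * (x_1 - 1) 
df/dx_i = -400 * x_i * (x_{i+1} - x_i^2) + 2 * (x_i - 1) + 200 * (x_i - x_{i-1}^2),  for 1 < i < M 
df/dx_M = 200 * (x_M - x_{M-1}^2) 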

**kernel.cu** 

#include <stdio.h> 
#include <float.h> 

#include "cuda_runtime.h" 
#include "device_launch_parameters.h" 

#include "GradientDescentCPU.h" 
#include "GradientDescentGPU.cuh" 

#include "Utilities.cuh" 

/********/ 
/* MAIN */ 
/********/ 

int main() 
{ 
    /********************/ 
    /* INPUT PARAMETERS */ 
    /********************/ 

    // --- Number of unknowns 
    const int M = 5; 

    // --- Starting point 
    float *h_x0 = (float*)malloc(M * sizeof(float)); 
    for (int i=0; i<M; i++) h_x0[i] = 1.2f; 

    // --- Termination tolerance 
    const float tol = 1.e-6f; 

    // --- Maximum number of allowed iterations 
    const int maxiter = 10000; 

    // --- Step size 
    const float alpha = 0.001f; 

    // --- Derivative step 
    const float h = 0.0001f; 

    // --- Minimum allowed perturbations 
    const float dxmin = 1e-5f; 

    /*********************/ 
    /* OUTPUT PARAMETERS */ 
    /*********************/ 

    // --- Optimal point 
    float* h_xopt = (float*)malloc(M * sizeof(float)); 
    for (int i=0; i<M; i++) h_xopt[i] = 0.f; 

    // --- Optimal functional 
    float fopt = 0.f; 

    // --- Number of performed iterations 
    int niter = 0; 

    // --- Gradient norm at optimal point 
    float gnorm = 0.f; 

    // --- Distance between last and penultimate solutions found 
    float dx = 0.f; 

    /***************************/ 
    /* OPTIMIZATION - CPU CASE */ 
    /***************************/ 

    GradientDescentCPU(h_x0, tol, maxiter, alpha, h, dxmin, M, h_xopt, &fopt, &niter, &gnorm, &dx); 

    printf("Solution found - CPU case:\n"); 
    printf("fopt = %f; niter = %i; gnorm = %f; dx = %f\n", fopt, niter, gnorm, dx); 
    printf("\n\n"); 

#ifdef VERBOSE 
    printf("Found minimum - CPU case:\n"); 
    for (int i=0; i<M; i++) printf("i = %i; h_xopt = %f\n", i, h_xopt[i]); 
    printf("\n\n"); 
#endif 

    /***************************/ 
    /* OPTIMIZATION - GPU CASE */ 
    /***************************/ 

    // --- Starting point 
    float *d_x0; gpuErrchk(cudaMalloc((void**)&d_x0,  M * sizeof(float))); 
    gpuErrchk(cudaMemcpy(d_x0, h_x0, M * sizeof(float), cudaMemcpyHostToDevice)); 
    // --- Optimal point 
    float *d_xopt; gpuErrchk(cudaMalloc((void**)&d_xopt, M * sizeof(float))); 

    GradientDescentGPU(d_x0, tol, maxiter, alpha, h, dxmin, M, d_xopt, &fopt, &niter, &gnorm, &dx); 

    printf("Solution found - GPU case:\n"); 
    printf("fopt = %f; niter = %i; gnorm = %f; dx = %f\n", fopt, niter, gnorm, dx); 
    printf("\n\n"); 

#ifdef VERBOSE 
    gpuErrchk(cudaMemcpy(h_xopt, d_xopt, M * sizeof(float), cudaMemcpyDeviceToHost)); 
    printf("Found minimum - GPU case:\n"); 
    for (int i=0; i<M; i++) printf("i = %i; h_xopt = %f\n", i, h_xopt[i]); 
    printf("\n\n"); 
#endif 
    return 0; 
} 

**GradientDescentCPU.cu** 

#include <stdlib.h> 
#include <math.h> 
#include <float.h> 

#include "cuda_runtime.h" 
#include "device_launch_parameters.h" 

#include "GradientDescentGPU.cuh" 

/*******************************/ 
/* GRADIENT DESCENT - CPU CASE */ 
/*******************************/ 
// --- Version using finite differences 
//void CostFunctionGradientCPU(float * __restrict h_x, float * __restrict h_g, const float h, const int M) { 
// 
// for (int i=0; i<M; i++) { 
//  h_x[i] = h_x[i] + h/2.f; 
//  h_g[i] = CostFunction(h_x, M); 
//  h_x[i] = h_x[i] - h; 
//  h_g[i] = (h_g[i] - CostFunction(h_x, M))/(2.f * h); 
//  h_x[i] = h_x[i] + h/2.f; 
// } 
//} 

// --- Version using analytical gradient (Rosenbrock function) 
void CostFunctionGradientCPU(float * __restrict h_x, float * __restrict h_g, const float h, const int M) { 

    h_g[0] = -400.f * (h_x[1] - h_x[0] * h_x[0]) * h_x[0] + 2.f * (h_x[0] - 1.f); 
    for (int i=1; i<M-1; i++) { 
     h_g[i] = -400.f * h_x[i] * (h_x[i+1] - h_x[i] * h_x[i]) + 2.f * (h_x[i] - 1.f) + 200.f * (h_x[i] - h_x[i-1] * h_x[i-1]); 
    } 
    h_g[M-1] = 200.f * (h_x[M-1] - h_x[M-2] * h_x[M-2]); 
} 

/********/ 
/* NORM */ 
/********/ 

float normCPU(const float * __restrict h_x, const int M) { 

    float sum = 0.f; 
    for(int i=0; i<M; i++) sum = sum + h_x[i] * h_x[i]; 

    return sqrt(sum); 

} 

/****************************************/ 
/* GRADIENT DESCENT FUNCTION - CPU CASE */ 
/****************************************/ 

// x0  - Starting point 
// tol  - Termination tolerance 
// maxiter - Maximum number of allowed iterations 
// alpha - Step size 
// dxmin - Minimum allowed perturbations 

void GradientDescentCPU(const float * __restrict h_x0, const float tol, const int maxiter, const float alpha, const float h, const float dxmin, const int M, 
          float * __restrict h_xopt, float *fopt, int *niter, float *gnorm, float *dx) { 

    // --- Initialize gradient norm, optimization vector, iteration counter, perturbation 

    *gnorm = FLT_MAX; 

    float *h_x = (float *)malloc(M * sizeof(float)); 
    for (int i=0; i<M; i++) h_x[i] = h_x0[i]; 

    *niter = 0; 

    *dx = FLT_MAX; 

    // --- Allocating space for the gradient, for the new actual solution and for the difference between actual and old solutions 
    float *h_g  = (float *)malloc(M * sizeof(float)); 
    float *h_xnew = (float *)malloc(M * sizeof(float)); 
    float *h_xdiff = (float *)malloc(M * sizeof(float)); 

    // --- Gradient Descent iterations 
    while ((*gnorm >= tol) && (*niter <= maxiter) && (*dx >= dxmin)) { 

     // --- Calculate gradient 
     CostFunctionGradientCPU(h_x, h_g, h, M); 
     *gnorm = normCPU(h_g, M); 

     // --- Take step: 
     for (int i=0; i<M; i++) h_xnew[i] = h_x[i] - alpha * h_g[i]; 

     // --- Update termination metrics 
     *niter = *niter + 1; 
     for (int i=0; i<M; i++) h_xdiff[i] = h_xnew[i] - h_x[i]; 
     *dx = normCPU(h_xdiff, M); 
     for (int i=0; i<M; i++) h_x[i] = h_xnew[i]; 
    } 

    for (int i=0; i<M; i++) h_xopt[i] = h_x[i]; 
    *fopt = CostFunction(h_xopt, M); 
    *niter = *niter - 1; 

    // --- Free temporary host memory 
    free(h_x); 
    free(h_g); 
    free(h_xnew); 
    free(h_xdiff); 

} 

**GradientDescentCPU.h** 

#ifndef GRADIENT_DESCENT_CPU 
#define GRADIENT_DESCENT_CPU 

void GradientDescentCPU(const float * __restrict, const float, const int, const float, const float, const float, const int, 
           float * __restrict, float *, int *, float *, float *); 

#endif 

**GradientDescentGPU.cu** 

#include <math.h> 
#include <float.h> 

#include "cuda_runtime.h" 
#include "device_launch_parameters.h" 

#include <thrust/device_ptr.h> 
#include <thrust/inner_product.h> 
#include <thrust/transform_reduce.h> 
#include <thrust/functional.h> 
#include <thrust/tuple.h> 
#include <thrust/iterator/zip_iterator.h> 

#include "Utilities.cuh" 

#define BLOCK_SIZE 256 

//#define VERBOSE 
//#define DEBUG 

/***********************************/ 
/* COST FUNCTION - CPU & GPU CASES */ 
/***********************************/ 
__host__ __device__ float CostFunction(const float * __restrict h_x, const int M) { 

    // --- Rosenbrock function 
    float sum = 0.f; 
    for (int i=0; i<M-1; i++) { 
     float temp1 = (h_x[i+1] - h_x[i] * h_x[i]); 
     float temp2 = (h_x[i] - 1.f); 
     sum = sum + 100.f * temp1 * temp1 + temp2 * temp2; 
    } 
    return sum; 
} 

/*******************************/ 
/* GRADIENT DESCENT - GPU CASE */ 
/*******************************/ 

// --- Version using finite differences 
//__device__ void CostFunctionGradientGPU(float * __restrict d_x, float * __restrict d_g, const float h, const int tid, const int M) { 
// 
// int test1, test2; 
// float h_test1_plus, h_test1_minus, h_test2_plus, h_test2_minus, temp1_plus, temp1_minus, temp2_plus, temp2_minus; 
// 
// // --- Rosenbrock function 
// float sum_plus = 0.f, sum_minus = 0.f; 
// for (int i=0; i<M-1; i++) { 
//  h_test1_plus = d_x[i] + (h/2.f) * (tid == i); 
//  h_test1_minus = d_x[i] - (h/2.f) * (tid == i); 
//  h_test2_plus = d_x[i + 1] + (h/2.f) * (tid == (i + 1)); 
//  h_test2_minus = d_x[i + 1] - (h/2.f) * (tid == (i + 1)); 
//  temp1_plus  = (h_test2_plus - h_test1_plus * h_test1_plus); 
//  temp2_plus  = (h_test1_plus - 1.f); 
//  temp1_minus  = (h_test2_minus - h_test1_minus * h_test1_minus); 
//  temp2_minus  = (h_test1_minus - 1.f); 
//  sum_plus  = sum_plus + 100.f * temp1_plus * temp1_plus + temp2_plus * temp2_plus; 
//  sum_minus  = sum_minus + 100.f * temp1_minus * temp1_minus + temp2_minus * temp2_minus; 
// } 
// d_g[tid] = (sum_plus - sum_minus)/(2.f * h); 
//} 

// --- Version using analytical gradient (Rosenbrock function) 
__device__ void CostFunctionGradientGPU(float * __restrict d_x, float * __restrict d_g, const float h, const int tid, const int M) { 

    if (tid == 0) d_g[0] = -400.f * (d_x[1] - d_x[0] * d_x[0]) * d_x[0] + 2.f * (d_x[0] - 1.f); 
    else if (tid == M-1) d_g[M-1] = 200.f * (d_x[M-1] - d_x[M-2] * d_x[M-2]); 
    else { 
     // --- Each interior thread computes only its own gradient component 
     d_g[tid] = -400.f * d_x[tid] * (d_x[tid+1] - d_x[tid] * d_x[tid]) + 2.f * (d_x[tid] - 1.f) + 200.f * (d_x[tid] - d_x[tid-1] * d_x[tid-1]); 
    } 
} 

/*******************/ 
/* STEP - GPU CASE */ 
/*******************/ 
__global__ void StepGPU(float * __restrict d_x, float * __restrict d_xnew, float * __restrict d_xdiff, float * __restrict d_g, const float alpha, const float h, const int M) { 

    const int tid = threadIdx.x + blockIdx.x * blockDim.x; 

    if (tid < M) { 

     // --- Calculate gradient 
     CostFunctionGradientGPU(d_x, d_g, h, tid, M); 

     // --- Take step 
     d_xnew[tid] = d_x[tid] - alpha * d_g[tid]; 

     // --- Update termination metrics 
     d_xdiff[tid] = d_xnew[tid] - d_x[tid]; 

     // --- Update current solution 
     d_x[tid] = d_xnew[tid]; 
    } 

} 

/***********************************/ 
/* COST FUNCTION STRUCT - GPU CASE */ 
/***********************************/ 

// --- Rosenbrock function struct for thrust reduction 
struct CostFunctionStructGPU{ 
template <typename Tuple> 
    __host__ __device__ float operator()(Tuple a) { 

     float temp1 = (thrust::get<1>(a) - thrust::get<0>(a) * thrust::get<0>(a)); 
     float temp2 = (thrust::get<0>(a) - 1.f); 

     return 100.f * temp1 * temp1 + temp2 * temp2; 
    } 
}; 


/****************************************/ 
/* GRADIENT DESCENT FUNCTION - GPU CASE */ 
/****************************************/ 

// x0  - Starting point 
// tol  - Termination tolerance 
// maxiter - Maximum number of allowed iterations 
// alpha - Step size 
// dxmin - Minimum allowed perturbations 

void GradientDescentGPU(const float * __restrict__ d_x0, const float tol, const int maxiter, const float alpha, const float h, 
         const float dxmin, const int M, float * __restrict__ d_xopt, float *fopt, int *niter, float *gnorm, float *dx) { 

    thrust::device_ptr<float> dev_ptr_xopt  = thrust::device_pointer_cast(d_xopt); 

    // --- Initialize gradient norm, optimization vector, iteration counter, perturbation 
    *gnorm = FLT_MAX; 

    float *d_x;   gpuErrchk(cudaMalloc((void**)&d_x, M * sizeof(float))); 
    gpuErrchk(cudaMemcpy(d_x, d_x0, M * sizeof(float), cudaMemcpyDeviceToDevice)); 

    *niter = 0; 

    *dx = FLT_MAX; 

    // --- Allocating space for the gradient, for the new actual solution and for the difference between actual and old solutions 
    float *d_g;   gpuErrchk(cudaMalloc((void**)&d_g, M * sizeof(float)));   thrust::device_ptr<float> dev_ptr_g  = thrust::device_pointer_cast(d_g); 
    float *d_xnew;  gpuErrchk(cudaMalloc((void**)&d_xnew, M * sizeof(float)));  
    float *d_xdiff;  gpuErrchk(cudaMalloc((void**)&d_xdiff, M * sizeof(float)));  thrust::device_ptr<float> dev_ptr_xdiff = thrust::device_pointer_cast(d_xdiff); 

    // --- Gradient Descent iterations 
    while ((*gnorm >= tol) && (*niter <= maxiter) && (*dx >= dxmin)) { 

     // --- Iteration step 
     StepGPU<<<iDivUp(M, BLOCK_SIZE), BLOCK_SIZE>>>(d_x, d_xnew, d_xdiff, d_g, alpha, h, M); 
#ifdef DEBUG 
     gpuErrchk(cudaPeekAtLastError()); 
     gpuErrchk(cudaDeviceSynchronize()); 
#endif 

     *gnorm = sqrt(thrust::inner_product(dev_ptr_g,  dev_ptr_g + M,  dev_ptr_g,  0.0f)); 
     *dx  = sqrt(thrust::inner_product(dev_ptr_xdiff, dev_ptr_xdiff + M, dev_ptr_xdiff, 0.0f)); 
     *niter = *niter + 1; 

    } 

    gpuErrchk(cudaMemcpy(d_xopt, d_x, M * sizeof(float), cudaMemcpyDeviceToDevice)); 

    // --- Functional calculation at the optimal point: the Rosenbrock terms are evaluated on the 
    //     (x_i, x_{i+1}) pairs provided by the zip iterator and summed up by the reduction 
    *fopt = thrust::transform_reduce(thrust::make_zip_iterator(thrust::make_tuple(dev_ptr_xopt, dev_ptr_xopt + 1)), 
            thrust::make_zip_iterator(thrust::make_tuple(dev_ptr_xopt + M - 1, dev_ptr_xopt + M)), 
            CostFunctionStructGPU(), 0.0f, thrust::plus<float>()); 

    *niter = *niter - 1; 

    // --- Free temporary device memory 
    gpuErrchk(cudaFree(d_x)); 
    gpuErrchk(cudaFree(d_g)); 
    gpuErrchk(cudaFree(d_xnew)); 
    gpuErrchk(cudaFree(d_xdiff)); 

} 

**GradientDescentGPU.cuh** 

#ifndef GRADIENT_DESCENT_GPU 
#define GRADIENT_DESCENT_GPU 

void GradientDescentGPU(const float * __restrict__, const float, const int, const float, const float, const float, const int, 
           float * __restrict__, float *, int *, float *, float *); 

__host__ __device__ float CostFunction(const float * __restrict, const int); 

#endif 
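
For completeness, one possible way to compile and run the example (a sketch, assuming the Utilities.cu/Utilities.cuh files from the repository linked above are in the same directory and with the -arch flag adjusted to your GPU) is:

nvcc -o gradient_descent kernel.cu GradientDescentCPU.cu GradientDescentGPU.cu Utilities.cu 
./gradient_descent 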