-3
私のプログラムを実行する何らかの理由で、デバイス変数の値が0になっています。私がcudaカーネルを実行する直前に、デバイス変数は正しい値を持っています。出力画像は元の画像サイズの黒だけです。すべてのメモリ割り当てとホスト間のコピーは正しいようです。Cuda - カーネル実行後のデバイス値
ありがとうございました!
あなたのコードは、元画像に書いている// Includes, system
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <math.h>
#ifdef _WIN32
# define WINDOWS_LEAN_AND_MEAN
# define NOMINMAX
# include <windows.h>
#endif
#define Image_Size 512
#define Kernel_Size 3
// Includes CUDA
#include <cuda_runtime.h>
// Utilities and timing functions
#include "./inc/helper_functions.h" // includes cuda.h and cuda_runtime_api.h
// CUDA helper functions
#include "./inc/helper_cuda.h" // helper functions for CUDA error check
const char *imageFilename = "lena_bw.pgm";
const char *sampleName = "simpleTexture";
#define C_PI 3.141592653589793238462643383279502884197169399375
void __global__ SwirlCu(int width, int height, int stride, float *pRawBitmapOrig, float *pBitmapCopy, double factor)
{
// This function effectively swirls an image
// This CUDA kernel is basically the exact same code as the CPU-only, except it has a slightly different setup
// Each thread on the GPU will process exactly one pixel
// Before doing anything, we need to determine the current pixel we are calculating in this thread
// Original code used i as y, and j as x. We will do the same so we can just re-use CPU code in the CUDA kernel
int i = blockIdx.y * blockDim.y + threadIdx.y;
int j = blockIdx.x * blockDim.x + threadIdx.x;
// Test to see if we're testing a valid pixel
if (i >= height || j >= width) return; // Don't bother doing the calculation. We're not in a valid pixel location
double cX = (double)width/2.0f;
double cY = (double)height/2.0f;
double relY = cY-i;
double relX = j-cX;
// relX and relY are points in our UV space
// Calculate the angle our points are relative to UV origin. Everything is in radians.
double originalAngle;
if (relX != 0)
{
originalAngle = atan(abs(relY)/abs(relX));
if (relX > 0 && relY < 0) originalAngle = 2.0f*C_PI - originalAngle;
else if (relX <= 0 && relY >=0) originalAngle = C_PI-originalAngle;
else if (relX <=0 && relY <0) originalAngle += C_PI;
}
else
{
// Take care of rare special case
if (relY >= 0) originalAngle = 0.5f * C_PI;
else originalAngle = 1.5f * C_PI;
}
// Calculate the distance from the center of the UV using pythagorean distance
double radius = sqrt(relX*relX + relY*relY);
// Use any equation we want to determine how much to rotate image by
//double newAngle = originalAngle + factor*radius; // a progressive twist
double newAngle = originalAngle + 1/(factor*radius+(4.0f/C_PI));
// Transform source UV coordinates back into bitmap coordinates
int srcX = (int)(floor(radius * cos(newAngle)+0.5f));
int srcY = (int)(floor(radius * sin(newAngle)+0.5f));
srcX += cX;
srcY += cY;
srcY = height - srcY;
// Clamp the source to legal image pixel
if (srcX < 0) srcX = 0;
else if (srcX >= width) srcX = width-1;
if (srcY < 0) srcY = 0;
else if (srcY >= height) srcY = height-1;
// Set the pixel color
// Since each thread writes to exactly 1 unique pixel, we don't have to do anything special here
pRawBitmapOrig[i*stride/4 + j] = pBitmapCopy[srcY*stride/4 + srcX];
}
////////////////////////////////////////////////////////////////////////////////
// Declaration, forward
void runTest(int argc, char **argv);
////////////////////////////////////////////////////////////////////////////////
// Program main
////////////////////////////////////////////////////////////////////////////////
int main(int argc, char **argv)
{
printf("%s starting...\n", sampleName);
// Process command-line arguments
if (argc > 1)
{
if (checkCmdLineFlag(argc, (const char **) argv, "input"))
{
getCmdLineArgumentString(argc,(const char **) argv,"input",(char **) &imageFilename);
}
else if (checkCmdLineFlag(argc, (const char **) argv, "reference"))
{
printf("-reference flag should be used with -input flag");
exit(EXIT_FAILURE);
}
}
runTest(argc, argv);
cudaDeviceReset();
printf("%s completed",
sampleName);
//exit(testResult ? EXIT_SUCCESS : EXIT_FAILURE);
}
////////////////////////////////////////////////////////////////////////////////
//! Run a simple test for CUDA
////////////////////////////////////////////////////////////////////////////////
void runTest(int argc, char **argv)
{
int devID = findCudaDevice(argc, (const char **) argv);
unsigned int kernel_bytes = Kernel_Size * Kernel_Size * sizeof(float);
// load image from disk
float *hData = NULL;
float *host_array_kernel = 0;
float *device_array_Image = 0;
float *device_array_kernel = 0;
float *device_array_Result = 0;
unsigned int width, height;
char *imagePath = sdkFindFilePath(imageFilename, argv[0]);
if (imagePath == NULL)
{
printf("Unable to source image file: %s\n", imageFilename);
exit(EXIT_FAILURE);
}
sdkLoadPGM(imagePath, &hData, &width, &height);
unsigned int size = width * height * sizeof(float);
printf("Loaded '%s', %d x %d pixels\n", imageFilename, width, height);
// Allocation of device arrays using CudaMalloc
cudaMalloc((void**)&device_array_Image, size);
cudaMalloc((void**)&device_array_kernel, kernel_bytes);
cudaMalloc((void**)&device_array_Result, size);
host_array_kernel = (float*)malloc(kernel_bytes); // kernel
// Allocate mem for the result on host side
float *hOutputDataSharp = (float *) malloc(size);
GenerateKernel (host_array_kernel);
// copy arrays and kernel from host to device
checkCudaErrors(cudaMemcpy(device_array_Image, hData, size, cudaMemcpyHostToDevice));
checkCudaErrors(cudaMemcpy(device_array_kernel, host_array_kernel, kernel_bytes, cudaMemcpyHostToDevice));
dim3 dimBlock(16, 16, 1);
dim3 dimGrid(width/dimBlock.x, height/dimBlock.y, 1);
//Do the Convolution
printf("DImage : '%.8f'\n",device_array_Image);
printf("DKernel : '%.8f'\n",device_array_kernel);
//serialConvolution(hData, host_array_kernel ,hOutputDataSharp);
SwirlCu<<<512, 512>>>(width, height, width*4, device_array_Image,device_array_Result, 0.005f);
printf("DResult : '%.8f'\n",device_array_Result);
checkCudaErrors(cudaDeviceSynchronize());
cudaMemcpy(hOutputDataSharp,device_array_Result, size, cudaMemcpyDeviceToHost);
printf("HResult : '%.8f'\n",hOutputDataSharp);
// Write result to file
char outputSharp[1024];
strcpy(outputSharp, imagePath);
strcpy(outputSharp, "data/serial_sharptest.pgm");
sdkSavePGM(outputSharp, hOutputDataSharp, width, height);
cudaFree(device_array_Result);
cudaFree(device_array_Image);
cudaFree(device_array_kernel);
free(hData);
free(imagePath);
//free(host_array_Image);
free(host_array_kernel);
free(hOutputDataSharp);
//free(hOutputImage);
//free(hOutputKernel);
}
エラーチェック?? –
エラーが発生していません。生成された画像は、処理された画像の代わりに黒で表示されます。私は本質的に数日前に始まったので、まだそれを理解することを把握するために来て、私は本当にクーダに新しいです。 – Alpha
@Alpha:コードにエラーチェックがない場合、エラーを生成していないことをどのように知っていますか? – talonmies