CudaFreeがメモリを解放するように見えないのはなぜですか？

私はデバイスメモリを割り当て、コピーし、GPUで計算を実行し、結果をコピーして、割り当てられたデバイスメモリを解放しようとしています。私は限界を超えていないことを確認したかったので、いくつかの配列をダンプするのに十分なメモリが共有メモリ空間にあるかどうかを見たいと思っていました。CudaFreeがメモリを解放するように見えないのはなぜですか？

デバイスメモリを割り当てると、エラーは返されません。 cudaMemGetInfoを使用して割り当てられたメモリ量を確認すると、cudaMallocはメモリを割り当てていないように見えます。また、メモリを解放しようとすると、1つのポインタだけが解放されたように見えます。

私は、GPUメモリをセットアップしてカーネルを起動するために、matlab Mexfunctionインターフェイスを使用しています。この時点では、私はカーネルを呼び出していなくても、結果の単位行列を返すだけです。ここで

cudaError_t cudaErr; 
size_t freeMem = 0; 
size_t totalMem = 0; 
size_t allocMem = 0; 
cudaMemGetInfo(&freeMem, &totalMem); 
mexPrintf("Memory avaliable: Free: %lu, Total: %lu\n",freeMem, totalMem); 

/* Pointers for the device memory */ 
double *devicePulseDelay, *deviceTarDistance, *deviceScattDistance, *deviceScatterers; 
double *deviceReceivedReal, *deviceReceivedImag; 

/* Allocate memory on the device for the arrays. */ 
mexPrintf("Allocating memory.\n"); 
cudaErr = cudaMalloc((void **) &devicePulseDelay, sizeof(double)*512); 
if (cudaErr != cudaSuccess) 
{ 
    mexPrintf("could not allocate memory to devicePulseDelay\n"); 
    mexPrintf("Error: %s\n",cudaGetErrorString(cudaErr)); 
} 
cudaMemGetInfo(&allocMem, &totalMem); 
mexPrintf("devicePulseDelay: Memory avaliable: Free: %lu, Total: %lu, Consumed: %lu\n",allocMem, totalMem,(freeMem - allocMem)); 
cudaErr = cudaMalloc((void **) &deviceTarDistance, sizeof(double)*512); 
if (cudaErr != cudaSuccess) 
{ 
    mexPrintf("could not allocate memory to deviceTarDistance\n"); 
    mexPrintf("Error: %s\n",cudaGetErrorString(cudaErr)); 
} 
cudaMemGetInfo(&allocMem, &totalMem); 
mexPrintf("deviceTarDistance: Memory avaliable: Free: %lu, Total: %lu, Consumed: %lu\n",allocMem, totalMem,(freeMem - allocMem)); 
cudaErr = cudaMalloc((void **) &deviceScattDistance, sizeof(double)*999*512); 
if (cudaErr != cudaSuccess) 
{ 
    mexPrintf("could not allocate memory to deviceScattDistance\n"); 
    mexPrintf("Error: %s\n",cudaGetErrorString(cudaErr)); 
} 
cudaMemGetInfo(&allocMem, &totalMem); 
mexPrintf("deviceScattDistance: Memory avaliable: Free: %lu, Total: %lu, Consumed: %lu\n",allocMem, totalMem,(freeMem - allocMem)); 
cudaErr = cudaMalloc((void **) &deviceScatterers, sizeof(double)*999); 
if (cudaErr != cudaSuccess) 
{ 
    mexPrintf("could not allocate memory to deviceScatterers\n"); 
    mexPrintf("Error: %s\n",cudaGetErrorString(cudaErr)); 
} 
cudaMemGetInfo(&allocMem, &totalMem); 
mexPrintf("deviceScatterers: Memory avaliable: Free: %lu, Total: %lu, Consumed: %lu\n",allocMem, totalMem,(freeMem - allocMem)); 
cudaErr = cudaMalloc((void **) &deviceReceivedReal, sizeof(double)*999*512); 
if (cudaErr != cudaSuccess) 
{ 
    mexPrintf("could not allocate memory to deviceReceivedReal\n"); 
    mexPrintf("Error: %s\n",cudaGetErrorString(cudaErr)); 
} 
cudaMemGetInfo(&allocMem, &totalMem); 
mexPrintf("deviceReceivedReal: Memory avaliable: Free: %lu, Total: %lu, Consumed: %lu\n",allocMem, totalMem,(freeMem - allocMem)); 
cudaErr = cudaMalloc((void **) &deviceReceivedImag, sizeof(double)*999*512); 
if (cudaErr != cudaSuccess) 
{ 
    mexPrintf("could not allocate memory to deviceReceivedImag\n"); 
    mexPrintf("Error: %s\n",cudaGetErrorString(cudaErr)); 
} 
cudaMemGetInfo(&allocMem, &totalMem); 
mexPrintf("deviceReceivedImag: Memory avaliable: Free: %lu, Total: %lu, Consumed: %lu\n", allocMem, totalMem,(freeMem - allocMem)); 

/* copy the input arrays across to the device */ 
mexPrintf("\nCopying memory.\n"); 
cudaErr = cudaMemcpy(devicePulseDelay, pulseDelay, sizeof(double)*512,cudaMemcpyHostToDevice); 
if (cudaErr != cudaSuccess) 
{ 
    mexPrintf("could not copy to devicePulseDelay\n"); 
    mexPrintf("Error: %s\n",cudaGetErrorString(cudaErr)); 
} 
cudaMemGetInfo(&allocMem, &totalMem); 
mexPrintf("devicePulseDelay: Memory avaliable: Free: %lu, Total: %lu, Consumed: %lu\n",allocMem, totalMem,(freeMem - allocMem)); 
cudaErr = cudaMemcpy(deviceTarDistance, tarDistance, sizeof(double)*512,cudaMemcpyHostToDevice); 
if (cudaErr != cudaSuccess) 
{ 
    mexPrintf("could not copy to deviceTarDistance\n"); 
    mexPrintf("Error: %s\n",cudaGetErrorString(cudaErr)); 
} 
cudaMemGetInfo(&allocMem, &totalMem); 
mexPrintf("deviceTarDistance: Memory avaliable: Free: %lu, Total: %lu, Consumed: %lu\n",allocMem, totalMem,(freeMem - allocMem)); 
cudaErr = cudaMemcpy(deviceScattDistance, scattDistance, sizeof(double)*999*512,cudaMemcpyHostToDevice); 
if (cudaErr != cudaSuccess) 
{ 
    mexPrintf("could not copy to deviceScattDistance\n"); 
    mexPrintf("Error: %s\n",cudaGetErrorString(cudaErr)); 
} 
cudaMemGetInfo(&allocMem, &totalMem); 
mexPrintf("deviceScattDistance: Memory avaliable: Free: %lu, Total: %lu, Consumed: %lu\n",allocMem, totalMem,(freeMem - allocMem)); 
cudaErr = cudaMemcpy(deviceScatterers, scatterers, sizeof(double)*999,cudaMemcpyHostToDevice); 
if (cudaErr != cudaSuccess) 
{ 
    mexPrintf("could not copy to deviceScatterers\n"); 
    mexPrintf("Error: %s\n",cudaGetErrorString(cudaErr)); 
} 
cudaMemGetInfo(&allocMem, &totalMem); 
mexPrintf("deviceScatterers: Memory avaliable: Free: %lu, Total: %lu, Consumed: %lu\n",allocMem, totalMem,(freeMem - allocMem)); 

/* call the kernel */ 
// launchKernel<<<1,512>>>(........); 

/* retireve the output */ 
cudaErr = cudaMemcpy(receivedReal, deviceReceivedReal, sizeof(double)*512*512,cudaMemcpyDeviceToHost); 
if (cudaErr != cudaSuccess) 
{ 
    mexPrintf("could not copy to receivedReal\n"); 
    mexPrintf("Error: %s\n",cudaGetErrorString(cudaErr)); 
} 
cudaMemGetInfo(&allocMem, &totalMem); 
mexPrintf("receivedReal: Memory avaliable: Free: %lu, Total: %lu, Consumed: %lu\n",allocMem, totalMem,(freeMem - allocMem)); 
cudaErr = cudaMemcpy(receivedImag, deviceReceivedImag, sizeof(double)*512*512,cudaMemcpyDeviceToHost); 
if (cudaErr != cudaSuccess) 
{ 
    mexPrintf("could not copy to receivedImag\n"); 
    mexPrintf("Error: %s\n",cudaGetErrorString(cudaErr)); 
} 
cudaMemGetInfo(&allocMem, &totalMem); 
mexPrintf("receivedImag: Memory avaliable: Free: %lu, Total: %lu, Consumed: %lu\n",allocMem, totalMem,(freeMem - allocMem)); 

/* free the memory. */ 
mexPrintf("\nFree'ing memory.\n"); 
cudaMemGetInfo(&freeMem, &totalMem); 
mexPrintf("Before freeing: Free %lu, Total: %lu\n", freeMem, totalMem); 
cudaErr = cudaFree(devicePulseDelay); 
if (cudaErr != cudaSuccess) 
{ 
    mexPrintf("could free devicePulseDelay\n"); 
    mexPrintf("Error: %s\n",cudaGetErrorString(cudaErr)); 
} 
cudaMemGetInfo(&allocMem, &totalMem); 
mexPrintf("devicePulseDelay: Memory avaliable: Free: %lu, Total: %lu, Free'd: %lu\n",allocMem, totalMem,(allocMem - freeMem)); 
cudaErr = cudaFree(deviceTarDistance); 
if (cudaErr != cudaSuccess) 
{ 
    mexPrintf("could free deviceTarDistance\n"); 
    mexPrintf("Error: %s\n",cudaGetErrorString(cudaErr)); 
} 
cudaMemGetInfo(&allocMem, &totalMem); 
mexPrintf("deviceTarDistance: Memory avaliable: Free: %lu, Total: %lu, Free'd: %lu\n",allocMem, totalMem,(allocMem - freeMem)); 
cudaErr = cudaFree(deviceScattDistance); 
if (cudaErr != cudaSuccess) 
{ 
    mexPrintf("could free deviceScattDistance\n"); 
    mexPrintf("Error: %s\n",cudaGetErrorString(cudaErr)); 
} 
cudaMemGetInfo(&allocMem, &totalMem); 
mexPrintf("deviceScattDistance: Memory avaliable: Free: %lu, Total: %lu, Free'd: %lu\n",allocMem, totalMem,(allocMem - freeMem)); 
cudaErr = cudaFree(deviceScatterers); 
if (cudaErr != cudaSuccess) 
{ 
    mexPrintf("could free deviceScatterers\n"); 
    mexPrintf("Error: %s\n",cudaGetErrorString(cudaErr)); 
} 
cudaMemGetInfo(&allocMem, &totalMem); 
mexPrintf("deviceScatterers: Memory avaliable: Free: %lu, Total: %lu, Free'd: %lu\n",allocMem, totalMem,(allocMem - freeMem)); 
cudaErr = cudaFree(deviceReceivedReal); 
if (cudaErr != cudaSuccess) 
{ 
    mexPrintf("could free deviceReceivedReal\n"); 
    mexPrintf("Error: %s\n",cudaGetErrorString(cudaErr)); 
} 
cudaMemGetInfo(&allocMem, &totalMem); 
mexPrintf("deviceReceivedReal: Memory avaliable: Free: %lu, Total: %lu, Free'd: %lu\n",allocMem, totalMem,(allocMem - freeMem)); 
cudaErr = cudaFree(deviceReceivedImag); 
if (cudaErr != cudaSuccess) 
{ 
    mexPrintf("could free deviceReceivedImag\n"); 
    mexPrintf("Error: %s\n",cudaGetErrorString(cudaErr)); 
} 
cudaMemGetInfo(&allocMem, &totalMem); 
mexPrintf("deviceReceivedImag: Memory avaliable: Free: %lu, Total: %lu, Free'd: %lu\n",allocMem, totalMem,(allocMem - freeMem));

これからの出力である：私は欠けていることは明らかに何かがあるよう

 
Memory avaliable: Free: 2523959296, Total: 2818572288 
Allocating memory. 
devicePulseDelay: Memory avaliable: Free: 2522910720, Total: 2818572288, Consumed: 1048576 
deviceTarDistance: Memory avaliable: Free: 2522910720, Total: 2818572288, Consumed: 1048576 
deviceScattDistance: Memory avaliable: Free: 2518716416, Total: 2818572288, Consumed: 5242880 
deviceScatterers: Memory avaliable: Free: 2517667840, Total: 2818572288, Consumed: 6291456 
deviceReceivedReal: Memory avaliable: Free: 2515570688, Total: 2818572288, Consumed: 8388608 
deviceReceivedImag: Memory avaliable: Free: 2513473536, Total: 2818572288, Consumed: 10485760 

Copying memory. 
devicePulseDelay: Memory avaliable: Free: 2513473536, Total: 2818572288, Consumed: 10485760 
deviceTarDistance: Memory avaliable: Free: 2513473536, Total: 2818572288, Consumed: 10485760 
deviceScattDistance: Memory avaliable: Free: 2513473536, Total: 2818572288, Consumed: 10485760 
deviceScatterers: Memory avaliable: Free: 2513473536, Total: 2818572288, Consumed: 10485760 
receivedReal: Memory avaliable: Free: 2513473536, Total: 2818572288, Consumed: 10485760 
receivedImag: Memory avaliable: Free: 2513473536, Total: 2818572288, Consumed: 10485760 

Free'ing memory. 
Before freeing: Free 2513473536, Total: 2818572288 
devicePulseDelay: Memory avaliable: Free: 2513473536, Total: 2818572288, Free'd: 0 
deviceTarDistance: Memory avaliable: Free: 2513473536, Total: 2818572288, Free'd: 0 
deviceScattDistance: Memory avaliable: Free: 2513473536, Total: 2818572288, Free'd: 0 
deviceScatterers: Memory avaliable: Free: 2514522112, Total: 2818572288, Free'd: 1048576 
deviceReceivedReal: Memory avaliable: Free: 2514522112, Total: 2818572288, Free'd: 1048576 
deviceReceivedImag: Memory avaliable: Free: 2514522112, Total: 2818572288, Free'd: 1048576

私は感じています。誰が何が起こっているのかを説明するのに役立つことができる？

EDIT：プラットフォームは、Tesla C2050 GPuカードを搭載したWindows 7です。

出典

2012-05-01 Beau Bellamy

このコードはどのプラットフォームで実行していますか？ – talonmies

cudaMemGetInfo（）を呼び出す前にallocMemとtotalMemの値をゼロにし、cudaMemGetInfo（）の戻り値をチェックしてください。 –

は、cudaMemGetInfo（）を呼び出す前にallocMemとtotalMemをゼロにしても違いはありません。また、cudaMemGetInfo呼び出しはエラーを返しませんでした。参考までに、私のプラットフォームはTesla C2050 GPUカードのWindows 7です。 –

mallocは、呼び出されたときにホストオペレーティングシステムから直接メモリ割り当てを取得し、freeは、呼び出されたときに直接ホストに戻すという誤った考え方です。しかし、彼らはほとんど常にそのようには動作しません。代わりに、標準ライブラリはホストOSと対話することによって、機敏に拡張され、縮小されるfree'dとmalloc'dのメモリの循環リストを保持します（How do malloc() and free() work?あなたが興味を持っているならば、詳細）。どのように動作しているかにかかわらず、これは、OSが自由であるほど多くのメモリを割り当てることは通常不可能であり、割り振りは空きメモリの量を変更しないように見えることがあり、 freeは、OSがフリーであると言うメモリ量に影響を与えないことがあります。

私はこれをサポートする経験的な証拠しかありませんが、CUDAはまったく同じように動作すると私は信じています。コンテキストは独自のmallocとfreeのメモリのリストを保持し、そのリストに保持されているメモリをホストドライバ/ウィンドウマネージャとして拡張して縮小し、GPU自身が許可します。すべてのハードウェアには特有のMMUページサイズがあり、NVIDIA GPUのページサイズがかなり大きいという証拠があります。これは、cudaMallocコールでは粗粒度があることを意味し、mallocは空きメモリの量に影響を与えないか、または要求されたメモリよりも多くのメモリを消費することがあることを意味し、時にはfreeコールが影響を与えないように見えますCUDAドライバのページサイズの動作を示すのに役立つ小さなツールを見つけることができます（here）。ただし、CUDA APIの初期バージョン用に書かれていて、現代バージョンでコンパイルするにはいくつかの変更が必要です。私はこれが観察している行動のもっともらしい説明であると信じています。

MacOS 10に投稿したコードの簡略化されたバージョンを実行するとします。GT200ファミリデバイスとの6：

#include <cstdio> 

#define mexPrintf printf 

inline void gpuAssert(cudaError_t code, char *file, int line, 
       bool abort=true) 
{ 
    if (code != cudaSuccess) 
    { 
     mexPrintf("GPUassert: %s %s %d\n", cudaGetErrorString(code), 
      file, line); 
     if (abort) exit(code); 
    } 
} 

#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); } 

inline void gpuMemReport(size_t * avail, size_t * total, 
     const char * title = 0, const size_t * free = 0, const bool sense = true) 
{ 
    char tstring[32] = { '\0' }; 
    gpuErrchk(cudaMemGetInfo(avail, total)); 

    if (free) { 
     if (title) { 
      strncpy(tstring, title, 31); 
     } 
     mexPrintf("%s Memory avaliable: Free: %zu, Total: %zu, %s: %zu\n", 
       tstring, *avail, *total, (sense) ? "Allocated\0" : "Freed\0", 
       (sense) ? (*free - *avail) : (*avail - *free)); 
    } else { 
     mexPrintf("Memory avaliable: Free: %zu, Total: %zu\n", *avail, *total); 
    } 
} 

int main() 
{ 
    size_t freeMem = 0; 
    size_t totalMem = 0; 
    size_t allocMem = 0; 

    gpuErrchk(cudaFree(0)); 
    gpuMemReport(&freeMem, &totalMem); 

    double *devicePulseDelay, *deviceTarDistance, *deviceScattDistance, *deviceScatterers; 
    double *deviceReceivedReal, *deviceReceivedImag; 

    mexPrintf("Allocating memory.\n"); 
    gpuErrchk(cudaMalloc((void **) &devicePulseDelay, sizeof(double)*512)); 
    gpuMemReport(&allocMem, &totalMem, "devicePulseDelay:", &freeMem); 

    gpuErrchk(cudaMalloc((void **) &deviceTarDistance, sizeof(double)*512)); 
    gpuMemReport(&allocMem, &totalMem, "deviceTarDistance:", &freeMem); 

    gpuErrchk(cudaMalloc((void **) &deviceScattDistance, sizeof(double)*999*512)); 
    gpuMemReport(&allocMem, &totalMem, "deviceScattDistance:", &freeMem); 

    gpuErrchk(cudaMalloc((void **) &deviceScatterers, sizeof(double)*999)); 
    gpuMemReport(&allocMem, &totalMem, "deviceScatterers:", &freeMem); 

    gpuErrchk(cudaMalloc((void **) &deviceReceivedReal, sizeof(double)*999*512)); 
    gpuMemReport(&allocMem, &totalMem, "deviceReceivedReal:", &freeMem); 

    gpuErrchk(cudaMalloc((void **) &deviceReceivedImag, sizeof(double)*999*512)); 
    gpuMemReport(&allocMem, &totalMem, "deviceReceivedImag:", &freeMem); 

    mexPrintf("\nFree'ing memory.\n"); 
    gpuMemReport(&freeMem, &totalMem); 

    gpuErrchk(cudaFree(devicePulseDelay)); 
    gpuMemReport(&allocMem, &totalMem, "devicePulseDelay:", &freeMem, false); 

    gpuErrchk(cudaFree(deviceTarDistance)); 
    gpuMemReport(&allocMem, &totalMem, "deviceTarDistance:", &freeMem, false); 

    gpuErrchk(cudaFree(deviceScattDistance)); 
    gpuMemReport(&allocMem, &totalMem, "deviceScattDistance:", &freeMem, false); 

    gpuErrchk(cudaFree(deviceScatterers)); 
    gpuMemReport(&allocMem, &totalMem, "deviceScatterers:", &freeMem, false); 

    gpuErrchk(cudaFree(deviceReceivedReal)); 
    gpuMemReport(&allocMem, &totalMem, "deviceReceivedReal:", &freeMem, false); 

    gpuErrchk(cudaFree(deviceReceivedImag)); 
    gpuMemReport(&allocMem, &totalMem, "deviceReceivedImag:", &freeMem, false); 

    return 0; 
}

は、私は別の結果を得るが、また同じ現象を示す1：

Allocating memory. 
devicePulseDelay: Memory avaliable: Free: 202870784, Total: 265027584, Allocated: 1048576 
deviceTarDistance: Memory avaliable: Free: 202870784, Total: 265027584, Allocated: 1048576 
deviceScattDistance: Memory avaliable: Free: 198778880, Total: 265027584, Allocated: 5140480 
deviceScatterers: Memory avaliable: Free: 197730304, Total: 265027584, Allocated: 6189056 
deviceReceivedReal: Memory avaliable: Free: 193638400, Total: 265027584, Allocated: 10280960 
deviceReceivedImag: Memory avaliable: Free: 189546496, Total: 265027584, Allocated: 14372864 

Free'ing memory. 
Memory avaliable: Free: 189546496, Total: 265027584 
devicePulseDelay: Memory avaliable: Free: 189546496, Total: 265027584, Freed: 0 
deviceTarDistance: Memory avaliable: Free: 190595072, Total: 265027584, Freed: 1048576 
deviceScattDistance: Memory avaliable: Free: 194686976, Total: 265027584, Freed: 5140480 
deviceScatterers: Memory avaliable: Free: 195735552, Total: 265027584, Freed: 6189056 
deviceReceivedReal: Memory avaliable: Free: 199827456, Total: 265027584, Freed: 10280960 
deviceReceivedImag: Memory avaliable: Free: 203919360, Total: 265027584, Freed: 14372864

行動が同様にハードウェアに依存/ホストOSであることを示唆しています。

出典

2012-05-02 07:17:10 talonmies

CudaFreeがメモリを解放するように見えないのはなぜですか？

答えて

関連する問題