2017-08-24 14 views
1

DataParallelDistributedDataParallelを使用しないように、コードを少し変更しました。コードは以下の通りである:PyTorchがCudaランタイムエラーを返す

import argparse 
import os 
import shutil 
import time 

import torch 
import torch.nn as nn 
import torch.nn.parallel 
import torch.backends.cudnn as cudnn 
import torch.distributed as dist 
import torch.optim 
import torch.utils.data 
import torch.utils.data.distributed 
import torchvision.transforms as transforms 
import torchvision.datasets as datasets 
import torchvision.models as models 

model_names = sorted(name for name in models.__dict__ 
    if name.islower() and not name.startswith("__") 
    and callable(models.__dict__[name])) 

parser = argparse.ArgumentParser(description='PyTorch ImageNet Training') 
parser.add_argument('data', metavar='DIR', 
        help='path to dataset') 
parser.add_argument('--arch', '-a', metavar='ARCH', default='resnet18', 
        choices=model_names, 
        help='model architecture: ' + 
         ' | '.join(model_names) + 
         ' (default: resnet18)') 
parser.add_argument('-j', '--workers', default=4, type=int, metavar='N', 
        help='number of data loading workers (default: 4)') 
parser.add_argument('--epochs', default=90, type=int, metavar='N', 
        help='number of total epochs to run') 
parser.add_argument('--start-epoch', default=0, type=int, metavar='N', 
        help='manual epoch number (useful on restarts)') 
parser.add_argument('-b', '--batch-size', default=256, type=int, 
        metavar='N', help='mini-batch size (default: 256)') 
parser.add_argument('--lr', '--learning-rate', default=0.1, type=float, 
        metavar='LR', help='initial learning rate') 
parser.add_argument('--momentum', default=0.9, type=float, metavar='M', 
        help='momentum') 
parser.add_argument('--weight-decay', '--wd', default=1e-4, type=float, 
        metavar='W', help='weight decay (default: 1e-4)') 
parser.add_argument('--print-freq', '-p', default=10, type=int, 
        metavar='N', help='print frequency (default: 10)') 
parser.add_argument('--resume', default='', type=str, metavar='PATH', 
        help='path to latest checkpoint (default: none)') 
parser.add_argument('-e', '--evaluate', dest='evaluate', action='store_true', 
        help='evaluate model on validation set') 
parser.add_argument('--pretrained', dest='pretrained', action='store_true', 
        help='use pre-trained model') 
parser.add_argument('--world-size', default=1, type=int, 
        help='number of distributed processes') 
parser.add_argument('--dist-url', default='tcp://224.66.41.62:23456', type=str, 
        help='url used to set up distributed training') 
parser.add_argument('--dist-backend', default='gloo', type=str, 
        help='distributed backend') 

best_prec1 = 0 


def main(): 
    global args, best_prec1 
    args = parser.parse_args() 

    args.distributed = args.world_size > 1 

    if args.distributed: 
     dist.init_process_group(backend=args.dist_backend, init_method=args.dist_url, 
           world_size=args.world_size) 

    # create model 
    if args.pretrained: 
     print("=> using pre-trained model '{}'".format(args.arch)) 
     model = models.__dict__[args.arch](pretrained=True) 
    else: 
     print("=> creating model '{}'".format(args.arch)) 
     model = models.__dict__[args.arch]() 

    if not args.distributed: 
     if args.arch.startswith('alexnet') or args.arch.startswith('vgg'): 
      #model.features = torch.nn.DataParallel(model.features) 
      model.cuda() 
     #else: 
      #model = torch.nn.DataParallel(model).cuda() 
    else: 
     model.cuda() 
     #model = torch.nn.parallel.DistributedDataParallel(model) 

    # define loss function (criterion) and optimizer 
    criterion = nn.CrossEntropyLoss().cuda() 

    optimizer = torch.optim.SGD(model.parameters(), args.lr, 
           momentum=args.momentum, 
           weight_decay=args.weight_decay) 

    # optionally resume from a checkpoint 
    if args.resume: 
     if os.path.isfile(args.resume): 
      print("=> loading checkpoint '{}'".format(args.resume)) 
      checkpoint = torch.load(args.resume) 
      args.start_epoch = checkpoint['epoch'] 
      best_prec1 = checkpoint['best_prec1'] 
      model.load_state_dict(checkpoint['state_dict']) 
      optimizer.load_state_dict(checkpoint['optimizer']) 
      print("=> loaded checkpoint '{}' (epoch {})" 
        .format(args.resume, checkpoint['epoch'])) 
     else: 
      print("=> no checkpoint found at '{}'".format(args.resume)) 

    cudnn.benchmark = True 

    # Data loading code 
    traindir = os.path.join(args.data, 'train') 
    valdir = os.path.join(args.data, 'val') 
    normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406], 
            std=[0.229, 0.224, 0.225]) 

    train_dataset = datasets.ImageFolder(
     traindir, 
     transforms.Compose([ 
      transforms.RandomSizedCrop(224), 
      transforms.RandomHorizontalFlip(), 
      transforms.ToTensor(), 
      normalize, 
     ])) 

    if args.distributed: 
     train_sampler = torch.utils.data.distributed.DistributedSampler(train_dataset) 
    else: 
     train_sampler = None 

    train_loader = torch.utils.data.DataLoader(
     train_dataset, batch_size=args.batch_size, shuffle=(train_sampler is None), 
     num_workers=args.workers, pin_memory=True, sampler=train_sampler) 

    val_loader = torch.utils.data.DataLoader(
     datasets.ImageFolder(valdir, transforms.Compose([ 
      transforms.Scale(256), 
      transforms.CenterCrop(224), 
      transforms.ToTensor(), 
      normalize, 
     ])), 
     batch_size=args.batch_size, shuffle=False, 
     num_workers=args.workers, pin_memory=True) 

    if args.evaluate: 
     validate(val_loader, model, criterion) 
     return 

    for epoch in range(args.start_epoch, args.epochs): 
     if args.distributed: 
      train_sampler.set_epoch(epoch) 
     adjust_learning_rate(optimizer, epoch) 

     # train for one epoch 
     train(train_loader, model, criterion, optimizer, epoch) 

     # evaluate on validation set 
     prec1 = validate(val_loader, model, criterion) 

     # remember best [email protected] and save checkpoint 
     is_best = prec1 > best_prec1 
     best_prec1 = max(prec1, best_prec1) 
     save_checkpoint({ 
      'epoch': epoch + 1, 
      'arch': args.arch, 
      'state_dict': model.state_dict(), 
      'best_prec1': best_prec1, 
      'optimizer' : optimizer.state_dict(), 
     }, is_best) 


def train(train_loader, model, criterion, optimizer, epoch): 
    batch_time = AverageMeter() 
    data_time = AverageMeter() 
    losses = AverageMeter() 
    top1 = AverageMeter() 
    top5 = AverageMeter() 

    # switch to train mode 
    model.train() 

    end = time.time() 
    for i, (input, target) in enumerate(train_loader): 
     # measure data loading time 
     data_time.update(time.time() - end) 

     target = target.cuda(async=True) 
     input_var = torch.autograd.Variable(input) 
     target_var = torch.autograd.Variable(target) 

     # compute output 
     output = model(input_var) 
     loss = criterion(output, target_var) 

     # measure accuracy and record loss 
     prec1, prec5 = accuracy(output.data, target, topk=(1, 5)) 
     losses.update(loss.data[0], input.size(0)) 
     top1.update(prec1[0], input.size(0)) 
     top5.update(prec5[0], input.size(0)) 

     # compute gradient and do SGD step 
     optimizer.zero_grad() 
     loss.backward() 
     optimizer.step() 

     # measure elapsed time 
     batch_time.update(time.time() - end) 
     end = time.time() 

     if i % args.print_freq == 0: 
      print('Epoch: [{0}][{1}/{2}]\t' 
        'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t' 
        'Data {data_time.val:.3f} ({data_time.avg:.3f})\t' 
        'Loss {loss.val:.4f} ({loss.avg:.4f})\t' 
        '[email protected] {top1.val:.3f} ({top1.avg:.3f})\t' 
        '[email protected] {top5.val:.3f} ({top5.avg:.3f})'.format(
        epoch, i, len(train_loader), batch_time=batch_time, 
        data_time=data_time, loss=losses, top1=top1, top5=top5)) 


def validate(val_loader, model, criterion): 
    batch_time = AverageMeter() 
    losses = AverageMeter() 
    top1 = AverageMeter() 
    top5 = AverageMeter() 

    # switch to evaluate mode 
    model.eval() 

    end = time.time() 
    for i, (input, target) in enumerate(val_loader): 
     target = target.cuda(async=True) 
     input_var = torch.autograd.Variable(input, volatile=True) 
     target_var = torch.autograd.Variable(target, volatile=True) 

     # compute output 
     output = model(input_var) 
     loss = criterion(output, target_var) 

     # measure accuracy and record loss 
     prec1, prec5 = accuracy(output.data, target, topk=(1, 5)) 
     losses.update(loss.data[0], input.size(0)) 
     top1.update(prec1[0], input.size(0)) 
     top5.update(prec5[0], input.size(0)) 

     # measure elapsed time 
     batch_time.update(time.time() - end) 
     end = time.time() 

     if i % args.print_freq == 0: 
      print('Test: [{0}/{1}]\t' 
        'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t' 
        'Loss {loss.val:.4f} ({loss.avg:.4f})\t' 
        '[email protected] {top1.val:.3f} ({top1.avg:.3f})\t' 
        '[email protected] {top5.val:.3f} ({top5.avg:.3f})'.format(
        i, len(val_loader), batch_time=batch_time, loss=losses, 
        top1=top1, top5=top5)) 

    print(' * [email protected] {top1.avg:.3f} [email protected] {top5.avg:.3f}' 
      .format(top1=top1, top5=top5)) 

    return top1.avg 


def save_checkpoint(state, is_best, filename='checkpoint.pth.tar'): 
    torch.save(state, filename) 
    if is_best: 
     shutil.copyfile(filename, 'model_best.pth.tar') 


class AverageMeter(object): 
    """Computes and stores the average and current value""" 
    def __init__(self): 
     self.reset() 

    def reset(self): 
     self.val = 0 
     self.avg = 0 
     self.sum = 0 
     self.count = 0 

    def update(self, val, n=1): 
     self.val = val 
     self.sum += val * n 
     self.count += n 
     self.avg = self.sum/self.count 


def adjust_learning_rate(optimizer, epoch): 
    """Sets the learning rate to the initial LR decayed by 10 every 30 epochs""" 
    lr = args.lr * (0.1 ** (epoch // 30)) 
    for param_group in optimizer.param_groups: 
     param_group['lr'] = lr 


def accuracy(output, target, topk=(1,)): 
    """Computes the [email protected] for the specified values of k""" 
    maxk = max(topk) 
    batch_size = target.size(0) 

    _, pred = output.topk(maxk, 1, True, True) 
    pred = pred.t() 
    correct = pred.eq(target.view(1, -1).expand_as(pred)) 

    res = [] 
    for k in topk: 
     correct_k = correct[:k].view(-1).float().sum(0, keepdim=True) 
     res.append(correct_k.mul_(100.0/batch_size)) 
    return res 


if __name__ == '__main__': 
    main() 

そして、私はalexnet neuralnetアーキテクチャと画像のセットでこのコードを実行すると、それは次のようにある奇妙なCUDAのエラーを与える:

=> creating model 'alexnet' 
THCudaCheck FAIL file=/pytorch/torch/lib/THC/THCGeneral.c line=70 error=30 : unknown error 
Traceback (most recent call last): 
    File "imagenet2.py", line 319, in <module> 
    main() 
    File "imagenet2.py", line 87, in main 
    model.cuda() 
    File "/usr/local/lib/python2.7/dist-packages/torch/nn/modules/module.py", line 147, in cuda 
    return self._apply(lambda t: t.cuda(device_id)) 
    File "/usr/local/lib/python2.7/dist-packages/torch/nn/modules/module.py", line 118, in _apply 
    module._apply(fn) 
    File "/usr/local/lib/python2.7/dist-packages/torch/nn/modules/module.py", line 118, in _apply 
    module._apply(fn) 
    File "/usr/local/lib/python2.7/dist-packages/torch/nn/modules/module.py", line 118, in _apply 
    module._apply(fn) 
    File "/usr/local/lib/python2.7/dist-packages/torch/nn/modules/module.py", line 124, in _apply 
    param.data = fn(param.data) 
    File "/usr/local/lib/python2.7/dist-packages/torch/nn/modules/module.py", line 147, in <lambda> 
    return self._apply(lambda t: t.cuda(device_id)) 
    File "/usr/local/lib/python2.7/dist-packages/torch/_utils.py", line 66, in _cuda 
    return new_type(self.size()).copy_(self, async) 
    File "/usr/local/lib/python2.7/dist-packages/torch/cuda/__init__.py", line 266, in _lazy_new 
    _lazy_init() 
    File "/usr/local/lib/python2.7/dist-packages/torch/cuda/__init__.py", line 85, in _lazy_init 
    torch._C._cuda_init() 
RuntimeError: cuda runtime error (30) : unknown error at /pytorch/torch/lib/THC/THCGeneral.c:70 

コードの実行に使用されるコマンド:python imagenet.py --world-size 1 --arch 'alexnet' <image_folder>

どこが間違っていましたか?

PS:AWS g2.2xlarge Ubuntuインスタンスで実行しています。次のように

CUDAのバージョンは次のとおりです。

nvcc: NVIDIA (R) Cuda compiler driver 
Copyright (c) 2005-2016 NVIDIA Corporation 
Built on Tue_Jan_10_13:22:03_CST_2017 
Cuda compilation tools, release 8.0, V8.0.61 

答えて

1
  1. CUDNNは役に立たないエラーメッセージを与えます。デバッグの場合は、net.cpu()を使用してCPU上のネットをテストするか、単純にnet.cuda()を削除してください。トレーニング、バリデーション、アウトプット変数でも同じことをしなければなりません。

  2. これは、224x224以外のサイズの画像で、事前に訓練されたAlexNetを使用したということです。ドキュメントによると、画像サイズが少なくとも224x224であれば動作するはずです。

  3. これはおそらく、pytorchのAlexNetの実装でハードコーディングされたパラメータによるテンソルシェーピングの問題です。 vision/torchvision/models/alexnet.pyでは、それは

    x = x.view(x.size(0), -1) 
    

x = x.view(x.size(0), 256 * 6 * 6) 

変更、それを言うライン44でこれは、異なる画像サイズで動作できるようにする必要があります。

  1. この変更をgithubリポジトリにサブビューしましたが、まだ更新されていないと思います。
関連する問題