Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Validation loss curve #13

Rajat-Mehta opened this issue Aug 3, 2019 · 1 comment

Validation loss curve #13

Rajat-Mehta opened this issue Aug 3, 2019 · 1 comment


Copy link

Rajat-Mehta commented Aug 3, 2019

Is there a way to plot validation loss curves along with the training loss during the Training process?

Here is my train method:

def train_model(model, criterion, optimizer, scheduler, num_epochs=25):
    """Train ``model`` with a hard-mined triplet objective, one 'train' and one 'val' pass per epoch.

    For every batch the function mines, per anchor feature, the hardest
    negative (highest-scoring sample with a different label, drawn from a
    random pool of positives) and the hardest positive (lowest-scoring of the
    anchor's 4 positives), then optimises a hinge loss on the score gap plus an
    ``opt.alpha``-weighted regulariser.

    Relies on names defined elsewhere in the module: ``dataloaders``,
    ``dataset_sizes``, ``opt`` (``batchsize``, ``poolsize``, ``margin``,
    ``alpha``, ``PCB``), ``use_gpu``, ``fp16``, ``amp``, ``version``,
    ``start_epoch`` and ``save_network``.

    Args:
        model: network returning ``(outputs, feature)`` for a batch of images.
        criterion: classification loss; only used in the ``opt.PCB`` branch.
        optimizer: optimizer updated during the 'train' phase.
        scheduler: learning-rate scheduler, stepped once per epoch.
        num_epochs: index one past the last epoch to run.

    Returns:
        The trained ``model`` (also saved via ``save_network(model, 'last')``).
    """
    since = time.time()

    best_model_wts = model.state_dict()
    best_acc = 0.0
    last_margin = 0.0

    # Resume from ``start_epoch`` so epoch numbers stay absolute.
    for epoch in range(start_epoch, num_epochs):
        print('Epoch {}/{}'.format(epoch, num_epochs - 1))
        print('-' * 10)

        # Each epoch has a training and validation phase
        for phase in ['train', 'val']:
            # Training mode for 'train', evaluation mode otherwise.
            model.train(phase == 'train')

            running_loss = 0.0
            running_corrects = 0.0
            running_margin = 0.0
            running_reg = 0.0
            # Iterate over data.
            for data in dataloaders[phase]:
                # get the inputs
                inputs, labels, pos, pos_labels = data
                now_batch_size, c, h, w = inputs.shape

                # The reshapes below assume a full batch, so skip the last
                # incomplete one and move on to the next epoch.
                if now_batch_size < opt.batchsize:
                    continue
                pos = pos.view(4 * opt.batchsize, c, h, w)
                # copy pos 4times
                pos_labels = pos_labels.repeat(4).reshape(4, opt.batchsize)
                pos_labels = pos_labels.transpose(0, 1).reshape(4 * opt.batchsize)

                # wrap them in Variable
                if use_gpu:
                    inputs = Variable(inputs.cuda())
                    pos = Variable(pos.cuda())
                    labels = Variable(labels.cuda())
                else:
                    inputs, labels = Variable(inputs), Variable(labels)

                # zero the parameter gradients
                optimizer.zero_grad()

                # forward
                if phase == 'val':
                    # No graph is needed for validation statistics.
                    with torch.no_grad():
                        outputs, f = model(inputs)
                        _, pf = model(pos)
                else:
                    outputs, f = model(inputs)
                    _, pf = model(pos)
                neg_labels = pos_labels
                # hard-neg
                # ----------------------------------
                nf_data = pf  # 128*512

                # 128 is too much, we use pool size = 64
                rand = np.random.permutation(4 * opt.batchsize)[0:opt.poolsize]
                nf_data = nf_data[rand, :]
                neg_labels = neg_labels[rand]
                nf_t = nf_data.transpose(0, 1)  # 512*128
                # Mining uses detached features (.data) so it adds nothing to the graph.
                score = torch.mm(f.data, nf_t)  # cosine 32*128
                score, rank = score.sort(dim=1, descending=True)  # score high == hard
                labels_cpu = labels.cpu()
                nf_hard = torch.zeros(f.shape).cuda()
                for k in range(now_batch_size):
                    hard = rank[k, :]
                    for kk in hard:
                        now_label = neg_labels[kk]
                        anchor_label = labels_cpu[k]
                        if now_label != anchor_label:
                            nf_hard[k, :] = nf_data[kk, :]
                            # Candidates are sorted hardest-first: stop at the
                            # first true negative, otherwise the easiest wins.
                            break

                # hard-pos
                # ----------------------------------
                pf_hard = torch.zeros(f.shape).cuda()  # 32*512
                for k in range(now_batch_size):
                    pf_data = pf[4 * k:4 * k + 4, :]
                    pf_t = pf_data.transpose(0, 1)  # 512*4
                    ff = f.data[k, :].reshape(1, -1)  # 1*512
                    score = torch.mm(ff, pf_t)  # cosine
                    score, rank = score.sort(dim=1, descending=False)  # score low == hard
                    pf_hard[k, :] = pf_data[rank[0][0], :]

                # loss
                # ---------------------------------
                criterion_triplet = nn.MarginRankingLoss(margin=opt.margin)
                pscore = torch.sum(f * pf_hard, dim=1)
                nscore = torch.sum(f * nf_hard, dim=1)
                y = torch.ones(now_batch_size)
                y = Variable(y.cuda())

                if not opt.PCB:
                    _, preds = torch.max(outputs.data, 1)
                    reg = torch.sum((1 + nscore) ** 2) + torch.sum((-1 + pscore) ** 2)
                    loss = torch.sum(torch.nn.functional.relu(nscore + opt.margin - pscore))  # Here I use sum
                    loss_triplet = loss + opt.alpha * reg
                else:
                    # NOTE(review): this branch defines neither `loss_triplet`
                    # nor `reg`, which the backward pass and statistics below
                    # read -- confirm the intended PCB objective.
                    part = {}
                    sm = nn.Softmax(dim=1)
                    num_part = 6
                    for i in range(num_part):
                        part[i] = outputs[i]

                    score = sm(part[0]) + sm(part[1]) + sm(part[2]) + sm(part[3]) + sm(part[4]) + sm(part[5])
                    _, preds = torch.max(score.data, 1)

                    loss = criterion(part[0], labels)
                    for i in range(num_part - 1):
                        loss += criterion(part[i + 1], labels)

                # backward + optimize only if in training phase
                if phase == 'train':
                    if fp16:  # we use optimier to backward loss
                        with amp.scale_loss(loss_triplet, optimizer) as scaled_loss:
                            scaled_loss.backward()
                    else:
                        loss_triplet.backward()
                    optimizer.step()
                # statistics
                if int(version[0]) > 0 or int(version[2]) > 3:  # for the new version like 0.4.0 and 0.5.0
                    running_loss += loss_triplet.item()  # * opt.batchsize
                else:  # for the old version like 0.3.0 and 0.3.1
                    running_loss += loss_triplet.data[0]  # *opt.batchsize
                running_corrects += float(torch.sum(pscore > nscore + opt.margin))
                running_margin += float(torch.sum(pscore - nscore))
                # Accumulate a plain float, like the statistics above, so the
                # autograd graph of every batch is not kept alive.
                running_reg += float(reg)

            if phase == 'train':
                # Advance the LR schedule once per epoch, after the optimizer
                # steps. NOTE(review): placement is an assumption -- confirm
                # against the original training script.
                scheduler.step()

            # Only full batches are processed, so normalise by the number of
            # samples actually seen.
            datasize = dataset_sizes[phase] // opt.batchsize * opt.batchsize
            epoch_loss = running_loss / datasize
            epoch_reg = opt.alpha * running_reg / datasize
            epoch_acc = running_corrects / datasize
            epoch_margin = running_margin / datasize

            print('now_margin: %.4f' % opt.margin)
            print('{} Loss: {:.4f} Reg: {:.4f} Acc: {:.4f} MeanMargin: {:.4f}'.format(
                phase, epoch_loss, epoch_reg, epoch_acc, epoch_margin))

            # deep copy the model
            if epoch_margin > last_margin:
                last_margin = epoch_margin
                last_model_wts = model.state_dict()

            if phase == 'val':
                last_model_wts = model.state_dict()
                # Checkpoint every 10th epoch.
                if epoch % 10 == 9:
                    save_network(model, epoch)

        time_elapsed = time.time() - since
        print('Training complete in {:.0f}m {:.0f}s'.format(
            time_elapsed // 60, time_elapsed % 60))

    time_elapsed = time.time() - since
    print('Training complete in {:.0f}m {:.0f}s'.format(
        time_elapsed // 60, time_elapsed % 60))

    # Save the final weights under the 'last' tag.
    save_network(model, 'last')
    return model

I tried adding a 'val' phase to your train method ("for phase in ['train', 'val']"), but I get the following error while looping through the validation dataloader (at the line "for data in dataloaders[phase]"), probably because of dataloaders['val']:

Traceback (most recent call last):
  File "", line 593, in <module>
  File "", line 323, in train_model
    for data in dataloaders[phase]:
  File "/home/rajat/MyPC/DFKI/MasterThesis/Vehicle_Reidentification/veri/lib/python3.6/site-packages/torch/utils/data/dataloader.py", line 582, in __next__
    return self._process_next_batch(batch)
  File "/home/rajat/MyPC/DFKI/MasterThesis/Vehicle_Reidentification/veri/lib/python3.6/site-packages/torch/utils/data/dataloader.py", line 608, in _process_next_batch
    raise batch.exc_type(batch.exc_msg)
ZeroDivisionError: Traceback (most recent call last):
  File "/home/rajat/MyPC/DFKI/MasterThesis/Vehicle_Reidentification/veri/lib/python3.6/site-packages/torch/utils/data/_utils/worker.py", line 99, in _worker_loop
    samples = collate_fn([dataset[i] for i in batch_indices])
  File "/home/rajat/MyPC/DFKI/MasterThesis/Vehicle_Reidentification/veri/lib/python3.6/site-packages/torch/utils/data/_utils/worker.py", line 99, in <listcomp>
    samples = collate_fn([dataset[i] for i in batch_indices])
  File "/home/rajat/MyPC/DFKI/MasterThesis/Vehicle_Reidentification/", line 47, in __getitem__
    pos_path = self._get_pos_sample(target, index)
  File "/home/rajat/MyPC/DFKI/MasterThesis/Vehicle_Reidentification/", line 32, in _get_pos_sample
    t = i%len(rand)
ZeroDivisionError: integer division or modulo by zero
Copy link

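This result is expected. In the 'val' set there is only one snapshot per identity, so presumably no other image of the same identity is available to serve as a positive sample.
The ZeroDivisionError is caused by t = i % len(rand): len(rand) is zero!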

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
None yet
None yet

No branches or pull requests

2 participants