Validation loss curve #13

Open
Rajat-Mehta opened this issue Aug 3, 2019 · 1 comment

Rajat-Mehta commented Aug 3, 2019

Is there a way to plot validation loss curves along with the training loss during the training process?

Here is my train method:

def train_model(model, criterion, optimizer, scheduler, num_epochs=25):
    since = time.time()

    best_model_wts = model.state_dict()  # note: unused below; the margin-based copy is kept in last_model_wts
    best_acc = 0.0
    last_margin = 0.0

    for epoch in range(num_epochs - start_epoch):
        epoch = epoch + start_epoch
        print('Epoch {}/{}'.format(epoch, num_epochs - 1))
        print('-' * 10)

        # Each epoch has a training and validation phase
        for phase in ['train', 'val']:
            if phase == 'train':
                scheduler.step()
                model.train(True)  # Set model to training mode
            else:
                model.train(False)  # Set model to evaluate mode

            running_loss = 0.0
            running_corrects = 0.0
            running_margin = 0.0
            running_reg = 0.0
            # Iterate over data.
            for data in dataloaders[phase]:
                # get the inputs
                inputs, labels, pos, pos_labels = data
                now_batch_size, c, h, w = inputs.shape

                if now_batch_size < opt.batchsize:  # skip the last incomplete batch
                    continue
                pos = pos.view(4*opt.batchsize, c, h, w)
                # repeat the labels 4 times to match the 4 positives per anchor
                pos_labels = pos_labels.repeat(4).reshape(4, opt.batchsize)
                pos_labels = pos_labels.transpose(0, 1).reshape(4*opt.batchsize)

                # wrap them in Variable
                if use_gpu:
                    inputs = Variable(inputs.cuda())
                    pos = Variable(pos.cuda())
                    labels = Variable(labels.cuda())
                else:
                    inputs, labels = Variable(inputs), Variable(labels)

                # zero the parameter gradients
                optimizer.zero_grad()

                # forward
                if phase == 'val':
                    with torch.no_grad():
                        outputs, f = model(inputs)
                        _, pf = model(pos)
                else:
                    # model_eval = copy.deepcopy(model)
                    # model_eval = model_eval.eval()
                    outputs, f = model(inputs)
                    _, pf = model(pos)
                # pf = Variable( pf, requires_grad=True)
                neg_labels = pos_labels
                # hard-neg
                # ----------------------------------
                nf_data = pf  # 128*512

                # 4*batchsize candidates are too many, so subsample a pool of opt.poolsize (e.g. 64)
                rand = np.random.permutation(4*opt.batchsize)[0:opt.poolsize]
                nf_data = nf_data[rand, :]
                neg_labels = neg_labels[rand]
                nf_t = nf_data.transpose(0, 1)  # 512*128
                score = torch.mm(f.data, nf_t)  # cosine 32*128
                score, rank = score.sort(dim=1, descending=True)  # score high == hard
                labels_cpu = labels.cpu()
                nf_hard = torch.zeros(f.shape).cuda()
                for k in range(now_batch_size):
                    hard = rank[k, :]
                    for kk in hard:
                        now_label = neg_labels[kk] 
                        anchor_label = labels_cpu[k]
                        if now_label != anchor_label:
                            nf_hard[k, :] = nf_data[kk, :]
                            break

                # hard-pos
                # ----------------------------------
                pf_hard = torch.zeros(f.shape).cuda()  # 32*512
                for k in range(now_batch_size):
                    pf_data = pf[4*k:4*k+4, :]
                    pf_t = pf_data.transpose(0, 1)  # 512*4
                    ff = f.data[k, :].reshape(1, -1)  # 1*512
                    score = torch.mm(ff, pf_t)  # cosine
                    score, rank = score.sort(dim=1, descending=False)  # score low == hard
                    pf_hard[k, :] = pf_data[rank[0][0], :]

                # loss
                # ---------------------------------
                criterion_triplet = nn.MarginRankingLoss(margin=opt.margin)  # note: criterion_triplet and y are unused; the loss is computed manually below
                pscore = torch.sum(f * pf_hard, dim=1)  # cosine similarity to the hard positive
                nscore = torch.sum(f * nf_hard, dim=1)  # cosine similarity to the hard negative
                y = torch.ones(now_batch_size)
                y = Variable(y.cuda())

                if not opt.PCB:
                    _, preds = torch.max(outputs.data, 1)
                    #loss = criterion(outputs, labels)
                    #loss_triplet = criterion_triplet(f, pf, nf)
                    reg = torch.sum((1+nscore)**2) + torch.sum((-1+pscore)**2)
                    loss = torch.sum(torch.nn.functional.relu(nscore + opt.margin - pscore))  #Here I use sum
                    loss_triplet = loss + opt.alpha*reg
                else:
                    part = {}
                    sm = nn.Softmax(dim=1)
                    num_part = 6
                    for i in range(num_part):
                        part[i] = outputs[i]

                    score = sm(part[0]) + sm(part[1]) + sm(part[2]) + sm(part[3]) + sm(part[4]) + sm(part[5])
                    _, preds = torch.max(score.data, 1)

                    loss = criterion(part[0], labels)
                    for i in range(num_part-1):
                        loss += criterion(part[i+1], labels)
                    loss_triplet = loss  # keep the name used in the backward pass below

                # backward + optimize only if in training phase
                if phase == 'train':
                    if fp16:  # use the optimizer to backward the scaled loss
                        with amp.scale_loss(loss_triplet, optimizer) as scaled_loss:
                            scaled_loss.backward()
                    else:
                        loss_triplet.backward()
                    optimizer.step()
                # statistics
                if int(version[0]) > 0 or int(version[2]) > 3:  # for new versions like 0.4.0 and 0.5.0
                    running_loss += loss_triplet.item()  # * opt.batchsize
                else:  # for old versions like 0.3.0 and 0.3.1
                    running_loss += loss_triplet.data[0]  # * opt.batchsize
                # print( loss_triplet.item())
                running_corrects += float(torch.sum(pscore > nscore + opt.margin))
                running_margin += float(torch.sum(pscore - nscore))
                running_reg += float(reg)  # convert to a Python float so the autograd graph is not retained

            datasize = dataset_sizes[phase]//opt.batchsize * opt.batchsize
            epoch_loss = running_loss / datasize
            epoch_reg = opt.alpha * running_reg / datasize
            epoch_acc = running_corrects / datasize
            epoch_margin = running_margin / datasize

            #if epoch_acc>0.75:
            #    opt.margin = min(opt.margin+0.02, 1.0)
            print('now_margin: %.4f' % opt.margin)
            print('{} Loss: {:.4f} Reg: {:.4f} Acc: {:.4f} MeanMargin: {:.4f}'.format(
                phase, epoch_loss, epoch_reg, epoch_acc, epoch_margin))

            y_loss[phase].append(epoch_loss)
            y_err[phase].append(1.0-epoch_acc)
            # deep copy the model
            if epoch_margin>last_margin:
                last_margin = epoch_margin
                last_model_wts = model.state_dict()

            if phase == 'val':
                last_model_wts = model.state_dict()
                if epoch % 10 == 9:
                    save_network(model, epoch)
                draw_curve(epoch)
        time_elapsed = time.time() - since
        print('Elapsed so far: {:.0f}m {:.0f}s'.format(
            time_elapsed // 60, time_elapsed % 60))
        print()

    time_elapsed = time.time() - since
    print('Training complete in {:.0f}m {:.0f}s'.format(
        time_elapsed // 60, time_elapsed % 60))
    #print('Best val Acc: {:4f}'.format(best_acc))

    # load best model weights
    model.load_state_dict(last_model_wts)
    save_network(model, 'last')
    return model
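
For reference, here is a minimal sketch of a draw_curve that would plot both the train and val curves collected in y_loss and y_err above. It assumes matplotlib is available; the actual draw_curve in the repository may differ:

import matplotlib
matplotlib.use('Agg')  # render to a file; no display needed
import matplotlib.pyplot as plt

x_epoch = []
fig = plt.figure()
ax0 = fig.add_subplot(121, title='loss')
ax1 = fig.add_subplot(122, title='top-1 error')

def draw_curve(current_epoch):
    x_epoch.append(current_epoch)
    ax0.plot(x_epoch, y_loss['train'], 'bo-', label='train')
    ax0.plot(x_epoch, y_loss['val'], 'ro-', label='val')
    ax1.plot(x_epoch, y_err['train'], 'bo-', label='train')
    ax1.plot(x_epoch, y_err['val'], 'ro-', label='val')
    if current_epoch == 0:  # add the legend only once
        ax0.legend()
        ax1.legend()
    fig.savefig('train_curves.jpg')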

I tried to add a 'val' phase to your train method ("for phase in ['train', 'val']"), but while looping through the val dataloader (at the line "for data in dataloaders[phase]") I get the following error, probably because of dataloaders['val']:

Traceback (most recent call last):
  File "train_siamese.py", line 593, in <module>
    num_epochs=150)
  File "train_siamese.py", line 323, in train_model
    for data in dataloaders[phase]:
  File "/home/rajat/MyPC/DFKI/MasterThesis/Vehicle_Reidentification/veri/lib/python3.6/site-packages/torch/utils/data/dataloader.py", line 582, in __next__
    return self._process_next_batch(batch)
  File "/home/rajat/MyPC/DFKI/MasterThesis/Vehicle_Reidentification/veri/lib/python3.6/site-packages/torch/utils/data/dataloader.py", line 608, in _process_next_batch
    raise batch.exc_type(batch.exc_msg)
ZeroDivisionError: Traceback (most recent call last):
  File "/home/rajat/MyPC/DFKI/MasterThesis/Vehicle_Reidentification/veri/lib/python3.6/site-packages/torch/utils/data/_utils/worker.py", line 99, in _worker_loop
    samples = collate_fn([dataset[i] for i in batch_indices])
  File "/home/rajat/MyPC/DFKI/MasterThesis/Vehicle_Reidentification/veri/lib/python3.6/site-packages/torch/utils/data/_utils/worker.py", line 99, in <listcomp>
    samples = collate_fn([dataset[i] for i in batch_indices])
  File "/home/rajat/MyPC/DFKI/MasterThesis/Vehicle_Reidentification/tripletfolder.py", line 47, in __getitem__
    pos_path = self._get_pos_sample(target, index)
  File "/home/rajat/MyPC/DFKI/MasterThesis/Vehicle_Reidentification/tripletfolder.py", line 32, in _get_pos_sample
    t = i%len(rand)
ZeroDivisionError: integer division or modulo by zero
@CynicalHeart commented

Of course you get this result: in the 'val' set there is only one snapshot per identity.
The ZeroDivisionError is caused by t = i % len(rand), because len(rand) is zero.
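
tripletfolder.py is not shown in this thread, so the sketch below is only a hypothetical reconstruction of _get_pos_sample based on the traceback (numpy as np, pos_index, and self.samples are assumptions). One way to avoid the crash is to fall back to the anchor image itself whenever an identity has no other sample:

def _get_pos_sample(self, target, index):
    # indices of all images that share the anchor's label
    pos_index = np.argwhere(np.asarray(self.targets) == target).flatten()
    # a positive should normally be a different image of the same identity
    pos_index = np.setdiff1d(pos_index, index)
    if len(pos_index) == 0:
        # single-image identity (common in a val split):
        # reuse the anchor instead of dividing by zero in i % len(rand)
        pos_index = np.asarray([index])
    rand = np.random.permutation(len(pos_index))
    result_path = []
    for i in range(4):  # the loader stacks 4 positives per anchor
        t = i % len(rand)
        result_path.append(self.samples[pos_index[rand[t]]][0])
    return result_path

Alternatively, skip the hard-positive mining in the 'val' phase entirely and track only the classification loss there, since mining positives is only meaningful when each identity has several images.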
