# task/classification/deit/configs/DeiT_base_patch16_224_in1k_2n16c_dp_fp16o2.yaml

# Run-wide settings shared by the whole job.
Global:
  # Starting weights (none configured), output location, and target device.
  checkpoint: null
  pretrained_model: null
  output_dir: ./output/
  device: gpu

  # Checkpoint saving options.
  save_interval: 1
  max_num_latest_checkpoint: 0

  # Evaluation while training; presumably `eval_interval` is counted in
  # `eval_unit`s -- confirm against the trainer.
  eval_during_train: True
  eval_interval: 1
  eval_unit: epoch

  # Schedule length, gradient accumulation, logging, and seeding.
  accum_steps: 1
  epochs: 300
  print_batch_step: 10
  use_visualdl: False
  seed: 2021

# FP16 (mixed-precision) settings.
# `level` names the precision mode and `GradScaler` holds loss-scaling options
# (presumably forwarded to the framework's gradient scaler -- confirm).
FP16:
  level: O2
  GradScaler:
    # Initial loss scale; 65536 = 2^16.
    init_loss_scaling: 65536.0
    
# Distributed-training strategy; data parallelism is the only option set here.
DistributedStrategy:
  data_parallel: True

# Network definition: which model to build plus its regularisation rates and
# the number of output classes.
Model:
  name: DeiT_base_patch16_224
  drop_path_rate: 0.1
  drop_rate: 0.0
  class_num: 1000
 
# Loss function config for the training/eval process.
# Each phase lists its loss terms as a sequence; every term carries a `weight`
# (presumably the factor applied when terms are combined -- confirm).
Loss:
  Train:
    - CELoss:
        weight: 1.0
  Eval:
    - CELoss:
        weight: 1.0
        
# Learning-rate schedule: scheduler name plus its settings.
# Floats are written with an explicit decimal point ("1.0e-3", not "1e-3"):
# YAML 1.1 loaders such as PyYAML resolve the dot-less form as a string, so the
# dotted form guarantees a numeric value regardless of how the consumer
# post-processes strings.
LRScheduler:
  name: TimmCosine
  learning_rate: 1.0e-3
  eta_min: 1.0e-5
  warmup_epoch: 5
  warmup_start_lr: 1.0e-6
  # Presumably the granularity at which the schedule advances -- confirm.
  decay_unit: epoch

# Optimizer settings.
Optimizer:
  name: AdamW
  # NOTE(review): YAML has no tuple syntax, so this loads as the string
  # "(0.9, 0.999)"; presumably the consumer evaluates it -- confirm before
  # changing the spelling.
  betas: (0.9, 0.999)
  # Written with a decimal point so YAML 1.1 loaders (e.g. PyYAML) resolve a
  # float; the dot-less "1e-8" resolves as a string there.
  eps: 1.0e-8
  weight_decay: 0.05
  # Names excluded from weight decay -- presumably matched against parameter
  # names; verify against the optimizer builder.
  no_weight_decay_name: ["cls_token", "pos_embed", "norm"]
  use_master_param: True
  exp_avg_force_fp32: True

# Data loaders for training and evaluation.
DataLoader:
  # Training pipeline: dataset with per-sample transforms, batch-level
  # transforms, sampler, and loader options.
  Train:
    dataset:
      name: ImageNetDataset
      image_root: ./dataset/ILSVRC2012/
      cls_label_path: ./dataset/ILSVRC2012/train_list.txt
      # Per-sample ops, given as a sequence (presumably applied top to
      # bottom -- confirm against the dataset implementation).
      transform_ops:
        - DecodeImage:
            to_rgb: True
            channel_first: False
        - RandCropImage:
            size: 224
            interpolation: bicubic
            backend: pil
        - RandFlipImage:
            flip_code: 1
        - TimmAutoAugment:
            config_str: rand-m9-mstd0.5-inc1
            interpolation: bicubic
            img_size: 224
            mean: [0.485, 0.456, 0.406]
        - NormalizeImage:
            # NOTE(review): "1.0/255.0" is a YAML string, not a number;
            # presumably the op evaluates it -- confirm before rewriting.
            scale: 1.0/255.0
            mean: [0.485, 0.456, 0.406]
            std: [0.229, 0.224, 0.225]
            order: ''
        - RandomErasing:
            EPSILON: 0.25
            sl: 0.02
            # NOTE(review): loads as the string "1.0/3.0", not a number;
            # presumably evaluated by the op -- confirm.
            sh: 1.0/3.0
            r1: 0.3
            attempt: 10
            use_log_aspect: True
            mode: pixel
        # Empty value (loads as null): the op is listed with no arguments.
        - ToCHWImage:
      # Batch-level ops: one sampler entry configuring Mixup and Cutmix,
      # each with prob 0.5 and label-smoothing-style `epsilon` 0.1
      # (exact selection semantics live in the sampler op -- TODO confirm).
      batch_transform_ops:
        - TransformOpSampler:
            Mixup:
              alpha: 0.8
              prob: 0.5
              epsilon: 0.1
              class_num: 1000
            Cutmix:
              alpha: 1.0
              prob: 0.5
              epsilon: 0.1
              class_num: 1000
    sampler:
      name: RepeatedAugSampler
      batch_size: 64
      drop_last: False
      shuffle: True
    loader:
      num_workers: 8
      use_shared_memory: True

  # Evaluation pipeline: decode, resize, center-crop, normalize, and layout
  # conversion; the sampler is configured without shuffling.
  Eval:
    dataset:
      name: ImageNetDataset
      image_root: ./dataset/ILSVRC2012/
      cls_label_path: ./dataset/ILSVRC2012/val_list.txt
      # Per-sample ops, given as a sequence (presumably applied top to
      # bottom -- confirm against the dataset implementation).
      transform_ops:
        - DecodeImage:
            to_rgb: True
            channel_first: False
        - ResizeImage:
            resize_short: 256
            interpolation: bicubic
            backend: pil
        - CenterCropImage:
            size: 224
        - NormalizeImage:
            # NOTE(review): "1.0/255.0" is a YAML string, not a number;
            # presumably the op evaluates it -- confirm before rewriting.
            scale: 1.0/255.0
            mean: [0.485, 0.456, 0.406]
            std: [0.229, 0.224, 0.225]
            order: ''
        # Empty value (loads as null): the op is listed with no arguments.
        - ToCHWImage:
    sampler:
      name: DistributedBatchSampler
      batch_size: 256
      drop_last: False
      shuffle: False
    loader:
      num_workers: 8
      use_shared_memory: True

# Evaluation metrics: top-k accuracy with k = 1 and k = 5.
Metric:
  Eval:
    - TopkAcc:
        topk: [1, 5]

# Model export settings.
Export:
  export_type: paddle
  # NOTE(review): `None` is not YAML null -- it loads as the string "None".
  # Presumably it marks a dynamic leading (batch) dimension; confirm how the
  # exporter interprets it before changing the spelling.
  input_shape: [None, 3, 224, 224]