diff --git a/configs/openseed/openseed_swinl_lang_decouple.yaml b/configs/openseed/openseed_swinl_lang_decouple.yaml
new file mode 100644
index 0000000..b9795a8
--- /dev/null
+++ b/configs/openseed/openseed_swinl_lang_decouple.yaml
@@ -0,0 +1,460 @@
+# --------------------------------------------------------
+# X-Decoder -- Generalized Decoding for Pixel, Image, and Language
+# ------------------------------------------------------------------------
+# OpenSeeD
+# Copyright (c) 2023 IDEA. All Rights Reserved.
+# Licensed under the Apache License, Version 2.0 [see LICENSE for details]
+# ------------------------------------------------------------------------
+# Modified from MaskDINO https://github.com/IDEA-Research/MaskDINO by Hao Zhang and Feng Li.
+# ------------------------------------------------------------------------
+
+##################
+# Task settings
+##################
+WEIGHT: ''
+PORT: 53711
+VERBOSE: true
+OUTPUT_DIR: '../../data/output/test'
+# misc
+LOADER:
+  JOINT: True
+  KEY_DATASET: 'coco'
+# model
+MODEL:
+  NAME: openseed_model_decouple_train
+  HEAD: openseed_head
+  MASK_ON: false
+  KEYPOINT_ON: false
+  LOAD_PROPOSALS: false
+  DIM_PROJ: 512
+  BACKBONE_DIM: 768
+  BACKGROUND: False
+  WEIGHTS: ''
+  TEXT:
+    ARCH: encoder
+    NAME: transformer
+    TOKENIZER: clip
+    CONTEXT_LENGTH: 18 # 77
+    WIDTH: 512
+    HEADS: 8
+    LAYERS: 12 # 6
+    AUTOGRESSIVE: True
+  BACKBONE:
+    NAME: swin
+    PRETRAINED: 'https://github.com/SwinTransformer/storage/releases/download/v1.0.0/swin_large_patch4_window12_384_22k.pth'
+    LOAD_PRETRAINED: true
+    SWIN:
+      PRETRAIN_IMG_SIZE: 384
+      PATCH_SIZE: 4
+      EMBED_DIM: 192
+      DEPTHS: [ 2, 2, 18, 2 ]
+      NUM_HEADS: [ 6, 12, 24, 48 ]
+      WINDOW_SIZE: 12
+      MLP_RATIO: 4.0
+      QKV_BIAS: true
+      QK_SCALE: ~
+      DROP_RATE: 0.0
+      ATTN_DROP_RATE: 0.0
+      DROP_PATH_RATE: 0.3
+      APE: false
+      PATCH_NORM: true
+      USE_CHECKPOINT: false
+      OUT_FEATURES: [ 'res2', 'res3', 'res4', 'res5' ]
+  ENCODER:
+    NAME: encoder_deform
+    IGNORE_VALUE: 255
+    NUM_CLASSES: 133
+    LOSS_WEIGHT: 1.0
+    CONVS_DIM: 256
+    MASK_DIM: 256
+    NORM: "GN"
+    IN_FEATURES: ["res2", "res3", "res4", "res5"]
+    DEFORMABLE_TRANSFORMER_ENCODER_IN_FEATURES: ["res2", "res3", "res4", "res5"]
+    COMMON_STRIDE: 4
+    TRANSFORMER_ENC_LAYERS: 6
+    TOTAL_NUM_FEATURE_LEVELS: 5
+    NUM_FEATURE_LEVELS: 4
+    FEATURE_ORDER: "low2high"
+  DECODER:
+    NAME: openseed_decoder_decouple
+    TRANSFORMER_IN_FEATURE: "multi_scale_pixel_decoder"
+    mask_neg_o365: True
+    MASK: True
+    BOX: True
+    GROUNDING:
+      ENABLED: False
+      MAX_LEN: 5
+      TEXT_WEIGHT: 2.0
+      CLASS_WEIGHT: 0.5
+    CAPTION:
+      ENABLED: False
+      PHRASE_PROB: 0.0
+      SIM_THRES: 0.95
+    CAPTIONING:
+      ENABLED: False
+      STEP: 50
+    RETRIEVAL:
+      ENABLED: False
+      DIM_IMG: 768
+      ENSEMBLE: True
+    OPENIMAGE:
+      ENABLED: False
+      NEGATIVE_SAMPLES: 5
+      GROUNDING:
+        ENABLED: False
+        MAX_LEN: 5
+    DEEP_SUPERVISION: True
+    NO_OBJECT_WEIGHT: 0.1
+    CLASS_WEIGHT: 4.0
+    MASK_WEIGHT: 5.0
+    DICE_WEIGHT: 5.0
+    BOX_WEIGHT: 5.0
+    GIOU_WEIGHT: 2.0
+    COST_CLASS_WEIGHT: 4.0
+    COST_DICE_WEIGHT: 5.0
+    COST_MASK_WEIGHT: 5.0
+    COST_BOX_WEIGHT: 5.0
+    COST_GIOU_WEIGHT: 2.0
+    HIDDEN_DIM: 256
+    NUM_OBJECT_QUERIES: 300
+    NHEADS: 8
+    DROPOUT: 0.0
+    DIM_FEEDFORWARD: 2048
+    ENC_LAYERS: 0
+    PRE_NORM: False
+    ENFORCE_INPUT_PROJ: False
+    SIZE_DIVISIBILITY: 32
+    DEC_LAYERS: 9  # 9 decoder layers, add one for the loss on learnable query
+    TRAIN_NUM_POINTS: 12544
+    OVERSAMPLE_RATIO: 3.0
+    IMPORTANCE_SAMPLE_RATIO: 0.75
+    TWO_STAGE: True
+    INITIALIZE_BOX_TYPE: 'no'
+    DN: seg
+    DN_NOISE_SCALE: 0.4
+    DN_NUM: 100
+    INITIAL_PRED: True
+    LEARN_TGT: False
+    TOTAL_NUM_FEATURE_LEVELS: 5
+    SEMANTIC_CE_LOSS: False
+    PANO_BOX_LOSS: False
+    COCO: True
+    O365: True
+    TEST:
+      SEMANTIC_ON: True
+      INSTANCE_ON: True
+      PANOPTIC_ON: True
+      OVERLAP_THRESHOLD: 0.8
+      OBJECT_MASK_THRESHOLD: 0.25
+      SEM_SEG_POSTPROCESSING_BEFORE_INFERENCE: false
+      TEST_FOUCUS_ON_BOX: False
+      PANO_TRANSFORM_EVAL: True
+      PANO_TEMPERATURE: 0.06
+
+TEST:
+  EVAL_PERIOD: 5000
+  PRECISE_BN:
+    NUM_ITER: 1
+    ENABLED: False
+  AUG:
+    ENABLED: False
+
+COCO:
+  INPUT:
+    MIN_SIZE_TEST: 800
+    MAX_SIZE_TEST: 1333
+    IMAGE_SIZE: 1024
+    MIN_SCALE: 0.1
+    MAX_SCALE: 2.0
+    DATASET_MAPPER_NAME: "coco_panoptic_lsj"
+    IGNORE_VALUE: 255
+    COLOR_AUG_SSD: False
+    SIZE_DIVISIBILITY: 32
+    RANDOM_FLIP: "horizontal"
+    MASK_FORMAT: "polygon"
+    FORMAT: "RGB"
+    CROP:
+      ENABLED: True
+  DATASET:
+    DATASET: 'coco'
+  TEST:
+    DETECTIONS_PER_IMAGE: 300
+    NAME: coco_eval
+    IOU_TYPE: ['bbox', 'segm']
+    USE_MULTISCALE: false
+    BATCH_SIZE_TOTAL: 8
+    MODEL_FILE: ''
+    AUG:
+      ENABLED: False
+  TRAIN:
+    BATCH_SIZE_TOTAL: 16
+    BATCH_SIZE_PER_GPU: 2
+    SHUFFLE: true
+  DATALOADER:
+    FILTER_EMPTY_ANNOTATIONS: False
+    NUM_WORKERS: 2
+    LOAD_PROPOSALS: False
+    SAMPLER_TRAIN: "TrainingSampler"
+    ASPECT_RATIO_GROUPING: True
+
+VLP:
+  INPUT:
+    IMAGE_SIZE: 224
+    DATASET_MAPPER_NAME: "vlpretrain"
+    IGNORE_VALUE: 255
+    COLOR_AUG_SSD: False
+    SIZE_DIVISIBILITY: 32
+    MASK_FORMAT: "polygon"
+    FORMAT: "RGB"
+    CROP:
+      ENABLED: True
+  TRAIN:
+    BATCH_SIZE_TOTAL: 2
+    BATCH_SIZE_PER_GPU: 2
+  TEST:
+    BATCH_SIZE_TOTAL: 256
+  DATALOADER:
+    FILTER_EMPTY_ANNOTATIONS: False
+    NUM_WORKERS: 16
+    LOAD_PROPOSALS: False
+    SAMPLER_TRAIN: "TrainingSampler"
+    ASPECT_RATIO_GROUPING: True
+
+INPUT:
+  PIXEL_MEAN: [123.675, 116.280, 103.530]
+  PIXEL_STD: [58.395, 57.120, 57.375]
+
+DATASETS:
+  TRAIN: ["coco_2017_train_panoptic"] #, "object365_train_od"]
+  # open vocabulary segmentation evaluation.
+  TEST: ["ade20k_panoptic_val"]
+  CLASS_CONCAT: false
+  SIZE_DIVISIBILITY: 32
+  PROPOSAL_FILES_TRAIN: []
+
+DATALOADER:
+  FILTER_EMPTY_ANNOTATIONS: False
+  NUM_WORKERS: 16
+  LOAD_PROPOSALS: False
+  SAMPLER_TRAIN: "TrainingSampler"
+  ASPECT_RATIO_GROUPING: True
+
+# Detectron2 training config for optimizer and lr scheduler
+SOLVER:
+  BASE_LR_END: 0.0
+  MOMENTUM: 0.9
+  NESTEROV: False
+  CHECKPOINT_PERIOD: 5000
+  IMS_PER_BATCH: 16
+  REFERENCE_WORLD_SIZE: 0
+  BIAS_LR_FACTOR: 1.0
+  WEIGHT_DECAY_BIAS: None
+  # original
+  BASE_LR: 0.0001
+  STEPS: [327778, 355092]
+  MAX_ITER: 368750
+  GAMMA: 0.1
+  WARMUP_FACTOR: 1.0
+  WARMUP_ITERS: 10
+  WARMUP_METHOD: "linear"
+  WEIGHT_DECAY: 0.05
+  OPTIMIZER: "ADAMW"
+  LR_SCHEDULER_NAME: "WarmupMultiStepLR"
+  LR_MULTIPLIER:
+    backbone: 0.1
+    lang_encoder: 0.1
+  WEIGHT_DECAY_NORM: 0.0
+  WEIGHT_DECAY_EMBED: 0.0
+  CLIP_GRADIENTS:
+    ENABLED: True
+    CLIP_TYPE: "full_model"
+    CLIP_VALUE: 0.01
+    NORM_TYPE: 2.0
+  AMP:
+    ENABLED: True
+
+OBJECT365:
+  INPUT:
+    MIN_SIZE_TRAIN: 800
+    MAX_SIZE_TRAIN: 1333
+    MIN_SIZE_TRAIN_SAMPLING: 'choice'
+    MIN_SIZE_TEST: 800
+    MAX_SIZE_TEST: 1333
+    IMAGE_SIZE: 1024
+    MIN_SCALE: 0.1
+    MAX_SCALE: 2.0
+    DATASET_MAPPER_NAME: "object365"
+    IGNORE_VALUE: 255
+    COLOR_AUG_SSD: False
+    SIZE_DIVISIBILITY: 32
+    RANDOM_FLIP: "horizontal"
+    MASK_FORMAT: "polygon"
+    FORMAT: "RGB"
+    CROP:
+      ENABLED: True
+      TYPE: "relative_range"
+      SIZE: [0.9, 0.9]
+  DATASET:
+    DATASET: 'object365'
+  TEST:
+    DETECTIONS_PER_IMAGE: 100
+    NAME: coco_eval
+    IOU_TYPE: ['bbox']
+    USE_MULTISCALE: false
+    BATCH_SIZE_TOTAL: 1
+    MODEL_FILE: ''
+    AUG:
+      ENABLED: False
+  TRAIN:
+    ASPECT_RATIO_GROUPING: true
+    BATCH_SIZE_TOTAL: 2
+    BATCH_SIZE_PER_GPU: 2
+    SHUFFLE: true
+  DATALOADER:
+    FILTER_EMPTY_ANNOTATIONS: False
+    NUM_WORKERS: 0
+    LOAD_PROPOSALS: False
+    SAMPLER_TRAIN: "TrainingSampler"
+    ASPECT_RATIO_GROUPING: True
+
+# Evaluation Dataset
+ADE20K:
+  INPUT:
+    MIN_SIZE_TRAIN: [320, 384, 448, 512, 576, 640, 704, 768, 832, 896, 960, 1024, 1088, 1152, 1216, 1280]
+    MIN_SIZE_TRAIN_SAMPLING: "choice"
+    MIN_SIZE_TEST: 640
+    MAX_SIZE_TRAIN: 2560
+    MAX_SIZE_TEST: 2560
+    MASK_FORMAT: "polygon"
+    CROP:
+      ENABLED: True
+      TYPE: "absolute"
+      SIZE: [640, 640]
+      SINGLE_CATEGORY_MAX_AREA: 1.0
+    IGNORE_VALUE: 255
+    COLOR_AUG_SSD: True
+    SIZE_DIVISIBILITY: 640  # used in dataset mapper
+    DATASET_MAPPER_NAME: "mask_former_panoptic"
+    FORMAT: "RGB"
+  DATASET:
+    DATASET: 'ade'
+  TRAIN:
+    ASPECT_RATIO_GROUPING: true
+    BATCH_SIZE_TOTAL: 16
+    BATCH_SIZE_PER_GPU: 2
+    SHUFFLE: true
+  TEST:
+    DETECTIONS_PER_IMAGE: 100
+    NAME: coco_eval
+    IOU_TYPE: ['bbox', 'segm']
+    USE_MULTISCALE: false
+    BATCH_SIZE_TOTAL: 1
+    MODEL_FILE: ''
+    AUG:
+      ENABLED: False
+  DATALOADER:
+    FILTER_EMPTY_ANNOTATIONS: False
+    NUM_WORKERS: 8
+    LOAD_PROPOSALS: False
+    SAMPLER_TRAIN: "TrainingSampler"
+    ASPECT_RATIO_GROUPING: True
+
+
+REF:
+  INPUT:
+    PIXEL_MEAN: [123.675, 116.280, 103.530]
+    PIXEL_STD: [58.395, 57.120, 57.375]
+    MIN_SIZE_TEST: 512
+    MAX_SIZE_TEST: 1024
+    FORMAT: "RGB"
+  DATALOADER:
+    FILTER_EMPTY_ANNOTATIONS: False
+    NUM_WORKERS: 0
+    LOAD_PROPOSALS: False
+    SAMPLER_TRAIN: "TrainingSampler"
+    ASPECT_RATIO_GROUPING: False
+  TEST:
+    BATCH_SIZE_TOTAL: 8
+
+SUN:
+  INPUT:
+    PIXEL_MEAN: [123.675, 116.280, 103.530]
+    PIXEL_STD: [58.395, 57.120, 57.375]
+    MIN_SIZE_TEST: 512
+    MAX_SIZE_TEST: 1024
+  DATALOADER:
+    FILTER_EMPTY_ANNOTATIONS: False
+    NUM_WORKERS: 0
+    LOAD_PROPOSALS: False
+    SAMPLER_TRAIN: "TrainingSampler"
+    ASPECT_RATIO_GROUPING: False
+  TEST:
+    BATCH_SIZE_TOTAL: 8
+
+SCAN:
+  INPUT:
+    PIXEL_MEAN: [123.675, 116.280, 103.530]
+    PIXEL_STD: [58.395, 57.120, 57.375]
+    MIN_SIZE_TEST: 512
+    MAX_SIZE_TEST: 1024
+  DATALOADER:
+    FILTER_EMPTY_ANNOTATIONS: False
+    NUM_WORKERS: 0
+    LOAD_PROPOSALS: False
+    SAMPLER_TRAIN: "TrainingSampler"
+    ASPECT_RATIO_GROUPING: False
+  TEST:
+    BATCH_SIZE_TOTAL: 8
+
+BDD:
+  INPUT:
+    PIXEL_MEAN: [123.675, 116.280, 103.530]
+    PIXEL_STD: [58.395, 57.120, 57.375]
+    MIN_SIZE_TEST: 800
+    MAX_SIZE_TEST: 1333
+  DATALOADER:
+    FILTER_EMPTY_ANNOTATIONS: False
+    NUM_WORKERS: 0
+    LOAD_PROPOSALS: False
+    SAMPLER_TRAIN: "TrainingSampler"
+    ASPECT_RATIO_GROUPING: False
+  TEST:
+    BATCH_SIZE_TOTAL: 8
+
+CITY:
+  INPUT:
+    MIN_SIZE_TRAIN: [ 512, 614, 716, 819, 921, 1024, 1126, 1228, 1331, 1433, 1536, 1638, 1740, 1843, 1945, 2048 ]
+    MIN_SIZE_TRAIN_SAMPLING: "choice"
+    MIN_SIZE_TEST: 1024
+    MAX_SIZE_TRAIN: 4096
+    MAX_SIZE_TEST: 2048
+    CROP:
+      ENABLED: True
+      TYPE: "absolute"
+      SIZE: [ 512, 1024 ]
+      SINGLE_CATEGORY_MAX_AREA: 1.0
+    IGNORE_VALUE: 255
+    COLOR_AUG_SSD: True
+    SIZE_DIVISIBILITY: -1
+    FORMAT: "RGB"
+    DATASET_MAPPER_NAME: "mask_former_panoptic"
+    MASK_FORMAT: "polygon"
+  TEST:
+    EVAL_PERIOD: 5000
+    BATCH_SIZE_TOTAL: 1
+    AUG:
+      ENABLED: False
+      MIN_SIZES: [ 512, 768, 1024, 1280, 1536, 1792 ]
+      MAX_SIZE: 4096
+      FLIP: True
+  DATALOADER:
+    FILTER_EMPTY_ANNOTATIONS: True
+    NUM_WORKERS: 2
+    LOAD_PROPOSALS: False
+    SAMPLER_TRAIN: "TrainingSampler"
+    ASPECT_RATIO_GROUPING: True
+  TRAIN:
+    ASPECT_RATIO_GROUPING: true
+    BATCH_SIZE_TOTAL: 2
+    BATCH_SIZE_PER_GPU: 2
+    SHUFFLE: true
\ No newline at end of file
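
For reference, a minimal sketch of how the nested sections of this config could be inspected, assuming PyYAML is installed and the file has been saved at the path shown in the diff. OpenSeeD's own entry points load and merge configs through their own utilities, so this is only an illustration of the YAML structure, not the project's loading code.

```python
# Minimal sketch: parse the config added by this diff and read a few nested keys.
# Assumes PyYAML is available and the file exists at the path from the diff header.
import yaml

with open("configs/openseed/openseed_swinl_lang_decouple.yaml") as f:
    cfg = yaml.safe_load(f)

# Decoder and optimizer settings defined above.
print(cfg["MODEL"]["DECODER"]["NUM_OBJECT_QUERIES"])        # 300
print(cfg["SOLVER"]["BASE_LR"], cfg["SOLVER"]["OPTIMIZER"])  # 0.0001 ADAMW

# Training on COCO panoptic, open-vocabulary evaluation on ADE20K panoptic.
print(cfg["DATASETS"]["TRAIN"], cfg["DATASETS"]["TEST"])

# Per-dataset loader settings live under their own top-level keys (COCO, OBJECT365, ...).
print(cfg["COCO"]["TRAIN"]["BATCH_SIZE_TOTAL"])              # 16
```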