visionbranch_stage1_pretrain.yaml (from DAMO-NLP-SG/Video-LLaMA)
model:
  arch: video_llama
  model_type: pretrain_vicuna
  freeze_vit: True
  freeze_qformer: True

  # Q-Former
  num_query_token: 32

  # If you want to train models based on LLaMA-2-chat, the checkpoints can be
  # downloaded from our provided Hugging Face repo, i.e.
  # https://huggingface.co/DAMO-NLP-SG/Video-LLaMA-2-13B-Pretrained
  # Pick exactly one of: "ckpt/vicuna-13b/", "ckpt/vicuna-7b/",
  # "ckpt/llama-2-7b-chat-hf", "ckpt/llama-2-13b-chat-hf"
  llama_model: "ckpt/vicuna-13b/"
  imagebind_ckpt_path: "ckpt/imagebind_path/"

  # Pick the projection checkpoint matching the LLM size, e.g.
  # '/mnt/workspace/ckpt/pretrained_minigpt4_7b.pth' for the 7B model.
  llama_proj_model: 'ckpt/pretrained_minigpt4.pth'

  # only train the vision branch
  equip_audio_branch: False  # whether to equip the audio branch
  frozen_llama_proj: False
  frozen_video_Qformer: False
  frozen_audio_Qformer: True

  fusion_head_layers: 2
  max_frame_pos: 32
  fusion_header_type: "seqTransf"
  num_video_query_token: 32
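
  # Editor's note: with freeze_vit / freeze_qformer set to True, the audio
  # branch disabled, and frozen_llama_proj / frozen_video_Qformer set to
  # False, stage 1 trains only the video Q-Former and the llama_proj layer.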
datasets:
  webvid:
    data_type: video
    build_info:
      anno_dir: path/webvid/webvid_train_data/filter_annotations/
      videos_dir: path/webvid/webvid_train_data/videos/
    vis_processor:
      train:
        name: "alpro_video_train"
        n_frms: 8
        image_size: 224
    text_processor:
      train:
        name: "blip_caption"
    sample_ratio: 100

  cc_sbu_align:
    data_type: images
    build_info:
      storage: /path/LLaVA_cc3m
    vis_processor:
      train:
        name: "blip2_image_train"
        image_size: 224
    text_processor:
      train:
        name: "blip_caption"
    sample_ratio: 24
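
# Editor's note: the per-dataset sample_ratio values are relative sampling
# weights, so training draws roughly 100/(100+24) ~= 81% of its samples from
# webvid and the rest from cc_sbu_align (assuming the weighted dataset-mixing
# dataloader this codebase uses for multi-dataset training).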
run:
  task: video_text_pretrain
  # optimizer
  lr_sched: "linear_warmup_cosine_lr"
  init_lr: 1e-4
  min_lr: 8e-5
  warmup_lr: 1e-6

  weight_decay: 0.05
  max_epoch: 5
  batch_size_train: 32
  batch_size_eval: 32
  num_workers: 8
  warmup_steps: 2500
  iters_per_epoch: 2500
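  # Editor's note: iters_per_epoch (2500) x max_epoch (5) gives 12,500
  # optimizer steps in total, so warmup_steps: 2500 makes the linear warmup
  # span exactly the first epoch before the cosine decay toward min_lr.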
  seed: 42
  output_dir: "output/videollama_stage1_pretrain"

  amp: True
  resume_ckpt_path: null

  evaluate: False
  train_splits: ["train"]

  device: "cuda"
  world_size: 1
  dist_url: "env://"
  distributed: True
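
# A minimal launch sketch (assumption: the entry point and config path follow
# the upstream Video-LLaMA README; set --nproc_per_node to your GPU count):
#   torchrun --nproc_per_node=8 train.py \
#     --cfg-path ./train_configs/visionbranch_stage1_pretrain.yaml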