Commit ae599bb: update code
ttengwang committed May 15, 2023 · 1 parent 446fed4
Showing 67 changed files with 8,923 additions and 2 deletions.
1 change: 1 addition & 0 deletions .gitignore
@@ -0,0 +1 @@
*.pyc
3 changes: 3 additions & 0 deletions .gitmodules
@@ -0,0 +1,3 @@
[submodule "flm/pycocoevalcap"]
path = flm/pycocoevalcap
url = https://github.com/salaniz/pycocoevalcap.git
86 changes: 84 additions & 2 deletions README.md
@@ -16,9 +16,88 @@ Free language modeling (FLM) is a new language modeling method that enables a 10
<img src="imgs/pipeline.png" width = "50%" />
</p>

## Install
```bash
pip install -r requirements.txt
```
## Dataset Preparation
We follow [ViLT](https://github.com/dandelin/ViLT) and use `pyarrow` to serialize the datasets. See this [link](https://github.com/dandelin/ViLT/blob/master/DATA.md) for details.

## Pretraining
```bash
export MASTER_ADDR=$DIST_0_IP
export MASTER_PORT=$DIST_0_PORT
export NODE_RANK=$DIST_RANK

python run.py with data_root=<DATA_DIR> exp_name="pretrain_FLM_4m" \
num_gpus=8 resume_from=None fix_exp_version=True \
flm text_roberta image_size=288 clip32 causal_flm \
precision=16 max_steps=30000 learning_rate=0.00008 \
batch_size=4096 per_gpu_batchsize=64 warmup_steps=0.05
```
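In the command above, `batch_size` is the effective global batch while `per_gpu_batchsize` is what fits on one device; in the ViLT/METER codebases this repo builds on, the gap is typically closed by gradient accumulation, and a fractional `warmup_steps` is read as a fraction of `max_steps`. A minimal sketch of that arithmetic (the function name and exact rounding are assumptions, not this repo's code):

```python
# How the batch settings presumably relate (ViLT-style convention):
# the global batch_size is reached by accumulating gradients over
# several per-GPU micro-batches. Numbers match the command above.
def grad_accum_steps(batch_size, per_gpu_batchsize, num_gpus, num_nodes=1):
    return max(batch_size // (per_gpu_batchsize * num_gpus * num_nodes), 1)

steps = grad_accum_steps(batch_size=4096, per_gpu_batchsize=64, num_gpus=8)
# steps == 8: each optimizer step accumulates 8 micro-batches of 64 per GPU.

# warmup_steps=0.05, if interpreted as a fraction of max_steps:
warmup = int(0.05 * 30000)  # 1500 warmup steps
```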

## Evaluation on Downstream Tasks
#### Visual Question Answering (VQA v2)
```bash
# training: 4 GPUs
python run.py with data_root=<DOWNSTREAM_DATA_DIR> exp_name="pretrain_FLM_4m_ft_vqa_train" \
num_gpus=4 resume_from=None fix_exp_version=True load_path="flm_pretrain.ckpt" \
ft_vqa text_roberta image_size=576 clip32 causal_flm \
learning_rate=0.000005 batch_size=512 per_gpu_batchsize=32 log_dir='result_ft' clip_randaug

# testing: 4 GPUs
python run.py with data_root=<DOWNSTREAM_DATA_DIR> exp_name="pretrain_FLM_4m_ft_vqa_test" \
num_gpus=4 load_path="flm_vqa.ckpt" \
ft_vqa text_roberta image_size=576 clip32 causal_flm \
per_gpu_batchsize=32 log_dir='result_ft' test_only=True skip_test_step=True
```

#### Natural Language for Visual Reasoning
```bash
# training: 1 GPU
python run.py with data_root=<DOWNSTREAM_DATA_DIR> exp_name="pretrain_FLM_4m_ft_nlvr2_train" \
num_gpus=1 resume_from=None fix_exp_version=True load_path="flm_pretrain.ckpt" \
ft_nlvr2 text_roberta image_size=288 clip32 causal_flm \
learning_rate=0.00001 batch_size=256 per_gpu_batchsize=32 log_dir='result_ft' clip_randaug

# testing: 1 GPU
python run.py with data_root=<DOWNSTREAM_DATA_DIR> exp_name="pretrain_FLM_4m_ft_nlvr2_test" \
num_gpus=1 load_path="flm_nlvr2.ckpt" \
ft_nlvr2 text_roberta image_size=288 clip32 causal_flm \
per_gpu_batchsize=32 log_dir='result_ft' test_only=True skip_test_step=True
```

#### Image Captioning
```bash
# training: 4 GPUs
python run.py with data_root=<DOWNSTREAM_DATA_DIR> exp_name="pretrain_FLM_4m_ft_cap_coco_train" \
num_gpus=4 resume_from=None fix_exp_version=True load_path="flm_pretrain.ckpt" \
ft_cap_coco text_roberta image_size=288 clip32 causal_flm \
learning_rate=0.000003 batch_size=256 per_gpu_batchsize=64 log_dir='result_ft' clip_randaug

# testing: 4 GPUs
python run.py with data_root=<DOWNSTREAM_DATA_DIR> exp_name="pretrain_FLM_4m_ft_cap_coco_test" \
num_gpus=4 load_path="flm_cap_coco.ckpt" \
ft_cap_coco text_roberta image_size=384 clip32 causal_flm \
per_gpu_batchsize=64 log_dir='result_ft' test_only=True skip_test_step=True
```
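Caption quality is scored with the `pycocoevalcap` submodule added in this commit (BLEU, METEOR, CIDEr, etc.). As a toy illustration of the simplest of these metrics, BLEU-1 is clipped unigram precision scaled by a brevity penalty; the sketch below is illustrative only and is not the submodule's implementation (which handles multiple references and n-grams up to 4):

```python
import math
from collections import Counter

def bleu1(candidate, reference):
    """Toy BLEU-1 against a single reference:
    clipped unigram precision times the brevity penalty."""
    cand, ref = candidate.split(), reference.split()
    ref_counts = Counter(ref)
    # Each candidate word is credited at most as often as it appears in the reference
    clipped = sum(min(c, ref_counts[w]) for w, c in Counter(cand).items())
    precision = clipped / len(cand)
    # Penalize candidates shorter than the reference
    bp = 1.0 if len(cand) > len(ref) else math.exp(1 - len(ref) / len(cand))
    return bp * precision
```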

#### Image-Text Retrieval

```bash
# training: 8 GPUs
python run.py with data_root=<DOWNSTREAM_DATA_DIR> exp_name="pretrain_FLM_4m_ft_irtr_f30k_train" \
num_gpus=8 resume_from=None fix_exp_version=True load_path="flm_pretrain.ckpt" \
ft_irtr_f30k text_roberta image_size=384 clip32 causal_flm precision=16 \
learning_rate=0.000005 batch_size=512 per_gpu_batchsize=8 log_dir='result_ft' clip_randaug

# testing: 8 GPUs
python run.py with data_root=<DOWNSTREAM_DATA_DIR> exp_name="pretrain_FLM_4m_ft_irtr_f30k_test" \
num_gpus=8 load_path="flm_irtr_f30k.ckpt" \
ft_irtr_f30k text_roberta image_size=384 clip32 causal_flm \
per_gpu_batchsize=8 log_dir='result_ft' test_only=True skip_test_step=True
```
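Retrieval checkpoints are usually scored by Recall@K: given a similarity matrix between images and texts with matched pairs on the diagonal, count how often the true match ranks in the top K. The sketch below illustrates that metric in general; it is an assumption for exposition, not this repo's evaluation code.

```python
def recall_at_k(sims, k):
    """sims[i][j]: similarity of query i to candidate j, with the
    ground-truth match at j == i. Returns the fraction of queries
    whose match appears among the top-k scored candidates."""
    hits = 0
    for i, row in enumerate(sims):
        # Rank candidate indices by similarity, highest first
        ranked = sorted(range(len(row)), key=lambda j: row[j], reverse=True)
        if i in ranked[:k]:
            hits += 1
    return hits / len(sims)
```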

Coming soon.

## Citation
```
@@ -30,4 +109,7 @@ Coming soon.
archivePrefix={arXiv},
primaryClass={cs.CV}
}
```

## Acknowledgements
The code is heavily based on [METER](https://github.com/zdou0830/METER) and [ViLT](https://github.com/dandelin/ViLT).
1 change: 1 addition & 0 deletions data/coco_caption/captions_val2014.json


Empty file added flm/__init__.py
Empty file.