Simplify running commands (single-node and multi-node)

1. Update training logs(glint360k) 2. Update install docs 3. Fix distributed training
deepinsight · Mar 21, 2021 · 8b11677 · 8b11677
1 parent 873e94c
commit 8b11677
Show file tree

Hide file tree

Showing 6 changed files with 81 additions and 52 deletions.
diff --git a/recognition/arcface_torch/README.md b/recognition/arcface_torch/README.md
@@ -1,48 +1,34 @@
 # Arcface Pytorch (Distributed Version of ArcFace)
 
-
 ## Contents
 
 ## Set Up
 ```shell
 torch >= 1.6.0
-```
-
-## Train on a single node 
-If you want to use 8 GPU to train, you should set `--nproc_per_node=8` and set `CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 ` 
-If you want to use 4 GPU to train, you should set `--nproc_per_node=4` and set `CUDA_VISIBLE_DEVICES=0,1,2,3` 
-If you want to use 1 GPU to train, you should set `--nproc_per_node=1` ... 
+``` 
+For more details, see [install.md](docs/install.md) in docs.
 
+## Training
+### 1. Single node, 1 GPU:
 ```shell
-export OMP_NUM_THREADS=4
-export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 
-python -m torch.distributed.launch \ 
---nproc_per_node=8 --nnodes=1 \
---node_rank=0 --master_addr="127.0.0.1" \
---master_port=1234 train.py
-ps -ef | grep "train" | grep -v grep | awk '{print "kill -9 "$2}' | sh
+python -m torch.distributed.launch --nproc_per_node=1 --nnodes=1 --node_rank=0 --master_addr="127.0.0.1" --master_port=1234 train.py
 ```
-
-## Train on multi-node
+### 2. Single node, 8 GPUs:
 ```shell
-pass
+python -m torch.distributed.launch --nproc_per_node=8 --nnodes=1 --node_rank=0 --master_addr="127.0.0.1" --master_port=1234 train.py
 ```
-
-## Evaluation
+### 3. Multiple nodes, each node 8 GPUs: 
+Node 0: 
+```shell
+python -m torch.distributed.launch --nproc_per_node=8 --nnodes=2 --node_rank=0 --master_addr="ip1" --master_port=1234 train.py
+```
+Node 1: 
 ```shell
-# model-prefix your model path
-# image-path your IJBC path
-# result-dir your result path
-# network your backbone
-CUDA_VISIBLE_DEVICES=0,1 python eval_ijbc.py \
---model-prefix ms1mv3_arcface_r50/backbone.pth \
---image-path IJB_release/IJBC \
---result-dir ms1mv3_arcface_r50 \
---batch-size 128 \
---job ms1mv3_arcface_r50 \
---target IJBC \
---network iresnet50
+python -m torch.distributed.launch --nproc_per_node=8 --nnodes=2 --node_rank=1 --master_addr="ip1" --master_port=1234 train.py
 ```
+
+
+## Evaluation IJBC
 More details see [eval.md](docs/eval.md) in docs.
 
 ## Speed Benchmark
@@ -89,14 +75,12 @@ All Model Can be found in here.
 ### Glint360k
 | Datasets | log |backbone | IJBC(1e-05) | IJBC(1e-04) |agedb30|cfp_fp|lfw | 
 | :---: | :--- |:--- | :--- | :--- |:--- |:--- |:--- |
-| Glint360k-Cosface |[log](https://raw.githubusercontent.com/anxiangsir/insightface_arcface_log/master/glint360k_cosface_r100/training.log) |r100 | 96.19 | 97.39 | 98.52 | 99.26 | 99.83 |
-| Glint360k-Cosface |[log](https://raw.githubusercontent.com/anxiangsir/insightface_arcface_log/master/glint360k_cosface_r100_fp16_0.1/training.log)|r100-fp16-sample-0.1 | 95.95 | 97.35 | 98.57 | 99.30 | 99.85 |
-| Glint360k-Cosface | - | - | - | - | - | - | - |
-| Glint360k-Cosface | - | - | - | - | - | - | - |
-| Glint360k-Cosface | - | - | - | - | - | - | - |
-
-
-
+| Glint360k-Cosface |[log](https://raw.githubusercontent.com/anxiangsir/insightface_arcface_log/master/glint360k_cosface_r18_fp16_0.1/training.log) |r18-fp16-0.1 | 93.16 | 95.33 | 97.72 | 97.73 | 99.77 |
+| Glint360k-Cosface |[log](https://raw.githubusercontent.com/anxiangsir/insightface_arcface_log/master/glint360k_cosface_r34_fp16_0.1/training.log) |r34-fp16-0.1 | 95.16 | 96.56 | 98.33 | 98.78 | 99.82 |
+| Glint360k-Cosface |[log](https://raw.githubusercontent.com/anxiangsir/insightface_arcface_log/master/glint360k_cosface_r50_fp16_0.1/training.log) |r50-fp16-0.1 | 95.61 | 96.97 | 98.38 | 99.20 | 99.83 |
+| Glint360k-Cosface |[log](https://raw.githubusercontent.com/anxiangsir/insightface_arcface_log/master/glint360k_cosface_r100_fp16_0.1/training.log)|r100-fp16-0.1 | 95.88 | 97.32 | 98.48 | 99.29 | 99.82 |
+
+The `0.1` suffix in the backbone name means the sample rate is 0.1.
 
 For more details, see [modelzoo.md](docs/modelzoo.md) in docs.
 
@@ -121,3 +105,4 @@ We test on PyTorch versions 1.6.0, 1.7.1, and 1.8.0. Please create an issue if y
  year={2020}
 }
 ```
+7
diff --git a/recognition/arcface_torch/config.py b/recognition/arcface_torch/config.py
@@ -37,6 +37,9 @@ def lr_step_func(epoch):
  config.lr_func = lr_step_func
 
 elif config.dataset == "glint360k":
+ # make training faster
+ # our RAM is 256G
+ # mount -t tmpfs -o size=140G tmpfs /train_tmp
  config.rec = "/train_tmp/glint360k"
  config.num_classes = 360232
  config.num_image = 17091657

diff --git a/recognition/arcface_torch/docs/eval.md b/recognition/arcface_torch/docs/eval.md
@@ -1,15 +1,20 @@
 ## Eval IJBC
 
 ```shell
+# model-prefix your model path
+# image-path your IJBC path
+# result-dir your result path
+# network your backbone
 CUDA_VISIBLE_DEVICES=0,1 python eval_ijbc.py \
---model-prefix tmp_models/backbone.pth \
---image-path /data/anxiang/IJB_release/IJBC \
---result-dir result \
+--model-prefix ms1mv3_arcface_r50/backbone.pth \
+--image-path IJB_release/IJBC \
+--result-dir ms1mv3_arcface_r50 \
 --batch-size 128 \
---job cosface \
+--job ms1mv3_arcface_r50 \
 --target IJBC \
 --network iresnet50
 ```
 
 ## Eval MegaFace
+pass
 
diff --git a/recognition/arcface_torch/docs/install.md b/recognition/arcface_torch/docs/install.md
@@ -0,0 +1,36 @@
+## v1.7.1 
+### Linux and Windows 
+```shell
+# CUDA 11.0
+pip install torch==1.7.1+cu110 torchvision==0.8.2+cu110 torchaudio==0.7.2 -f https://download.pytorch.org/whl/torch_stable.html
+
+# CUDA 10.2
+pip install torch==1.7.1 torchvision==0.8.2 torchaudio==0.7.2
+
+# CUDA 10.1
+pip install torch==1.7.1+cu101 torchvision==0.8.2+cu101 torchaudio==0.7.2 -f https://download.pytorch.org/whl/torch_stable.html
+
+# CUDA 9.2
+pip install torch==1.7.1+cu92 torchvision==0.8.2+cu92 torchaudio==0.7.2 -f https://download.pytorch.org/whl/torch_stable.html
+
+# CPU only
+pip install torch==1.7.1+cpu torchvision==0.8.2+cpu torchaudio==0.7.2 -f https://download.pytorch.org/whl/torch_stable.html
+```
+
+
+## v1.6.0 
+
+### Linux and Windows
+```shell
+# CUDA 10.2
+pip install torch==1.6.0 torchvision==0.7.0
+
+# CUDA 10.1
+pip install torch==1.6.0+cu101 torchvision==0.7.0+cu101 -f https://download.pytorch.org/whl/torch_stable.html
+
+# CUDA 9.2
+pip install torch==1.6.0+cu92 torchvision==0.7.0+cu92 -f https://download.pytorch.org/whl/torch_stable.html
+
+# CPU only
+pip install torch==1.6.0+cpu torchvision==0.7.0+cpu -f https://download.pytorch.org/whl/torch_stable.html
+```
diff --git a/recognition/arcface_torch/run.sh b/recognition/arcface_torch/run.sh
@@ -1,4 +1,2 @@
-export OMP_NUM_THREADS=4
-CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 python -m torch.distributed.launch --nproc_per_node=8 --nnodes=1 \
---node_rank=0 --master_addr="127.0.0.1" --master_port=1234 train.py
+CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 python -m torch.distributed.launch --nproc_per_node=8 --nnodes=1 --node_rank=0 --master_addr="127.0.0.1" --master_port=1234 train.py
 ps -ef | grep "train" | grep -v grep | awk '{print "kill -9 "$2}' | sh
diff --git a/recognition/arcface_torch/train.py b/recognition/arcface_torch/train.py
@@ -22,11 +22,13 @@
 
 
 def main(args):
- dist.init_process_group(backend='nccl', init_method='env://')
+
+ world_size = int(os.environ['WORLD_SIZE'])
+ rank = int(os.environ['RANK'])
+ dist_url = "tcp://{}:{}".format(os.environ["MASTER_ADDR"], os.environ["MASTER_PORT"])
+ dist.init_process_group(backend='nccl', init_method=dist_url, rank=rank, world_size=world_size)
  local_rank = args.local_rank
  torch.cuda.set_device(local_rank)
- rank = dist.get_rank()
- world_size = dist.get_world_size()
 
  if not os.path.exists(cfg.output) and rank is 0:
  os.makedirs(cfg.output)
@@ -124,8 +126,8 @@ def main(args):
 if __name__ == "__main__":
  parser = argparse.ArgumentParser(description='PyTorch ArcFace Training')
  parser.add_argument('--local_rank', type=int, default=0, help='local_rank')
- parser.add_argument('--network', type=str, default="iresnet50", help="backbone network")
- parser.add_argument('--loss', type=str, default="ArcFace", help="loss function")
- parser.add_argument('--resume', type=int, default=0, help="model resuming")
+ parser.add_argument('--network', type=str, default='iresnet50', help='backbone network')
+ parser.add_argument('--loss', type=str, default='ArcFace', help='loss function')
+ parser.add_argument('--resume', type=int, default=0, help='model resuming')
  args_ = parser.parse_args()
  main(args_)