v-iashin · Junxi-Chen · Jan 18, 2024 · v-iashin · Jan 23, 2024 · v-iashin
diff --git a/configs/i3d.yml b/configs/i3d.yml
@@ -6,9 +6,13 @@ streams: null # Streams to use for feature extraction (e.g. 'rgb' or 'flow'). Bo
 flow_type: 'pwc' # Flow to use in I3D. 'pwc' (PWCNet) is faster while 'raft' (RAFT) is more accurate.
 extraction_fps: null # For original video fps, leave as "null" (None)
 
+# Feature Augmentation
+augment: null # Crop augmentation to use for feature extraction. Can be null (a single center crop), 'five_crop', or 'ten_crop'
+
 # Extraction Parameters
 device: 'cuda:0' # device as in `torch`, can be 'cpu'
 on_extraction: 'print' # what to do once the features are extracted. Can be ['print', 'save_numpy', 'save_pickle']
+save_option: null # What to output. Use 'rgb_only' to keep only the rgb features (all other keys are skipped); leave as null to output everything
 output_path: './output' # where to store results if saved
 tmp_path: './tmp' # folder to store the temporary files used for extraction (frames or aud files)
 keep_tmp_files: false # to keep temp files after feature extraction.

diff --git a/docs/models/i3d.md b/docs/models/i3d.md
@@ -44,6 +44,7 @@ You may test it yourself by providing `--show_pred` flag.
 | `video_paths` | `null` | A list of videos for feature extraction. E.g. `"[./sample/v_ZNVhz7ctTq0.mp4, ./sample/v_GGSY1Qvo990.mp4]"` or just one path `"./sample/v_GGSY1Qvo990.mp4"`. |
 | `file_with_video_paths` | `null` | A path to a text file with video paths (one path per line). Hint: given a folder `./dataset` with `.mp4` files one could use: `find ./dataset -name "*mp4" > ./video_paths.txt`. |
 | `on_extraction` | `print` | If `print`, the features are printed to the terminal. If `save_numpy` or `save_pickle`, the features are saved to either `.npy` file or `.pkl`. |
+| `save_option` | `null` | If `rgb_only`, only the `rgb` features are kept and every other output key is skipped. If `null`, all outputs are printed or saved. |
 | `output_path` | `"./output"` | A path to a folder for storing the extracted features (if `on_extraction` is either `save_numpy` or `save_pickle`). |
 | `keep_tmp_files` | `false` | If `true`, the reencoded videos will be kept in `tmp_path`. |
 | `tmp_path` | `"./tmp"` | A path to a folder for storing temporal files (e.g. reencoded videos). |

diff --git a/models/_base/base_extractor.py b/models/_base/base_extractor.py
@@ -18,9 +18,11 @@ def __init__(self,
  output_path: str,
  keep_tmp_files: bool,
  device: str,
+ save_option=None,
  ) -> None:
  self.feature_type = feature_type
  self.on_extraction = on_extraction
+ self.save_option = save_option
  self.tmp_path = tmp_path
  self.output_path = output_path
  self.keep_tmp_files = keep_tmp_files
@@ -76,6 +78,11 @@ def action_on_extraction(
  return
 
  for key, value in feats_dict.items():
+ if self.save_option == 'rgb_only':
+ if key != 'rgb':
+ continue
+ else:
+ key = None
  if self.on_extraction == 'print':
  print(key)
  print(value)
@@ -84,11 +91,18 @@ def action_on_extraction(
  elif self.on_extraction in ['save_numpy', 'save_pickle']:
  # make dir if doesn't exist
  os.makedirs(self.output_path, exist_ok=True)
- fpath = make_path(self.output_path, video_path, key, action2ext[self.on_extraction])
- if key != 'fps' and len(value) == 0:
- print(f'Warning: the value is empty for {key} @ {fpath}')
  # save the info behind the each key
- action2savefn[self.on_extraction](fpath, value)
+ if len(value.shape) < 3:
+ fpath = make_path(self.output_path, video_path, key, action2ext[self.on_extraction])
+ if key != 'fps' and len(value) == 0:
+ print(f'Warning: the value is empty for {key} @ {fpath}')
+ action2savefn[self.on_extraction](fpath, value)
+ else:
+ for i in range(value.shape[0]):
+ fpath = make_path(self.output_path, video_path, key, action2ext[self.on_extraction], i)
+ if key != 'fps' and len(value) == 0:
+ print(f'Warning: the value is empty for {key} @ {fpath}')
+ action2savefn[self.on_extraction](fpath, value[i, :])
  else:
  raise NotImplementedError(f'on_extraction: {self.on_extraction} is not implemented')
 

diff --git a/models/i3d/extract_i3d.py b/models/i3d/extract_i3d.py
@@ -24,6 +24,7 @@ def __init__(self, args) -> None:
  super().__init__(
  feature_type=args.feature_type,
  on_extraction=args.on_extraction,
+ save_option=args.save_option,
  tmp_path=args.tmp_path,
  output_path=args.output_path,
  keep_tmp_files=args.keep_tmp_files,
@@ -38,15 +39,24 @@ def __init__(self, args) -> None:
  self.extraction_fps = args.extraction_fps
  self.step_size = 64 if args.step_size is None else args.step_size
  self.stack_size = 64 if args.stack_size is None else args.stack_size
+ self.aug_type = args.augment
  self.resize_transforms = torchvision.transforms.Compose([
  torchvision.transforms.ToPILImage(),
  ResizeImproved(self.min_side_size),
  PILToTensor(),
  ToFloat(),
  ])
+ if self.aug_type is None:
+ aug_transform = TensorCenterCrop(self.central_crop_size)
+ elif self.aug_type == 'five_crop':
+ aug_transform = torchvision.transforms.FiveCrop(self.central_crop_size)
+ self.num_crop = 5
+ elif self.aug_type == 'ten_crop':
+ aug_transform = torchvision.transforms.TenCrop(self.central_crop_size)
+ self.num_crop = 10
  self.i3d_transforms = {
  'rgb': torchvision.transforms.Compose([
- TensorCenterCrop(self.central_crop_size),
+ aug_transform,
  ScaleTo1_1(),
  PermuteAndUnsqueeze()
  ]),
@@ -82,8 +92,12 @@ def extract(self, video_path: str) -> Dict[str, np.ndarray]:
  # timestamp when the last frame in the stack begins (when the old frame of the last pair ends)
  timestamps_ms = []
  rgb_stack = []
- feats_dict = {stream: [] for stream in self.streams}
-
+
+ if self.aug_type is not None:
+ feats_dict = {stream: [[] for _ in range(self.num_crop)] for stream in self.streams}
+ else:
+ feats_dict = {stream: [] for stream in self.streams}
+
  # sometimes when the target fps is 1 or 2, the first frame of the reencoded video is missing
  # and cap.read returns None but the rest of the frames are ok. timestep is 0.0 for the 2nd frame in
  # this case
@@ -113,7 +127,11 @@ def extract(self, video_path: str) -> Dict[str, np.ndarray]:
  if len(rgb_stack) - 1 == self.stack_size:
  batch_feats_dict = self.run_on_a_stack(rgb_stack, stack_counter, padder)
  for stream in self.streams:
- feats_dict[stream].extend(batch_feats_dict[stream].tolist())
+ if isinstance(batch_feats_dict[stream], tuple):
+ for i in range(len(batch_feats_dict[stream])):
+ feats_dict[stream][i].extend(batch_feats_dict[stream][i].tolist())
+ else:
+ feats_dict[stream].extend(batch_feats_dict[stream].tolist())
  # leaving the elements if step_size < stack_size so they will not be loaded again
  # if step_size == stack_size one element is left because the flow between the last element
  # in the prev list and the first element in the current list
@@ -161,8 +179,11 @@ def run_on_a_stack(self, rgb_stack, stack_counter, padder=None) -> Dict[str, tor
  raise NotImplementedError
  # apply transforms depending on the stream (flow or rgb)
  stream_slice = self.i3d_transforms[stream](stream_slice)
- # extract features for a stream
- batch_feats_dict[stream] = models[stream](stream_slice, features=True) # (B, 1024)
+ if isinstance(stream_slice, tuple):
+ # extract features for a stream
+ batch_feats_dict[stream] = tuple([models[stream](stream_crop, features=True) for stream_crop in stream_slice])
+ else: 
+ batch_feats_dict[stream] = models[stream](stream_slice, features=True) # (B, 1024)
  # add features to the output dict
  self.maybe_show_pred(stream_slice, self.name2module['model'][stream], stack_counter)
 

diff --git a/models/transforms.py b/models/transforms.py
@@ -145,13 +145,17 @@ def __call__(self, tensor: torch.FloatTensor) -> torch.FloatTensor:
 
 class ScaleTo1_1(object):
 
- def __call__(self, tensor: torch.FloatTensor) -> torch.FloatTensor:
+ def __call__(self, tensor):
+ if isinstance(tensor, tuple):
+ return tuple([(2 * t / 255) - 1 for t in tensor])
  return (2 * tensor / 255) - 1
 
 
 class PermuteAndUnsqueeze(object):
 
- def __call__(self, tensor: torch.FloatTensor) -> torch.FloatTensor:
+ def __call__(self, tensor):
+ if isinstance(tensor, tuple):
+ return tuple([t.permute(1, 0, 2, 3).unsqueeze(0) for t in tensor])
  return tensor.permute(1, 0, 2, 3).unsqueeze(0)
 
 

diff --git a/utils/utils.py b/utils/utils.py
@@ -50,9 +50,18 @@ def show_predictions_on_dataset(logits: torch.FloatTensor, dataset: Union[str, L
  print(f'{logit:8.3f} | {smax:.3f} | {cls}')
  print()
 
-def make_path(output_root, video_path, output_key, ext):
+def make_path(output_root, video_path, output_key, ext, idx=None):
  # extract file name and change the extention
- fname = f'{Path(video_path).stem}_{output_key}{ext}'
+ if idx is not None:
+ if output_key is not None:
+ fname = f'{Path(video_path).stem}_{output_key}_{idx}{ext}'
+ else:
+ fname = f'{Path(video_path).stem}_{idx}{ext}'
+ else:
+ if output_key is not None:
+ fname = f'{Path(video_path).stem}_{output_key}{ext}'
+ else:
+ fname = f'{Path(video_path).stem}_{idx}{ext}'
  # construct the paths to save the features
  return os.path.join(output_root, fname)
 
@@ -131,8 +140,7 @@ def form_list_from_user_input(
  to_shuffle: bool = True,
  ) -> list:
  '''User specifies either list of videos in the cmd or a path to a file with video paths. This function
- transforms the user input into a list of paths. Files are expected to be formatted with a single
- video-path in each line.
+ transforms the user input into a list of paths.
 
  Args:
  video_paths (Union[str, ListConfig, None], optional): a list of video paths. Defaults to None.