Next (#502)
* Validate the overrides from facefusion.ini

* Break down cli testing

* Remove architecture lookup to support old driver

* Remove architecture lookup to support old driver

* Remove hwaccel auto

* Respect the output video resolution

* Bump next version

* Full directml support (#501)

* Introduce conditional thread management for DML support

* Finish migration to thread helpers

* Introduce dynamic frame colorizer sizes

* Introduce dynamic frame colorizer sizes

* Add 192x192 to frame colorizer

* Fix async audio
henryruhs committed Apr 19, 2024
1 parent 092dfbb commit 4efa5b2
Showing 30 changed files with 350 additions and 191 deletions.
1 change: 1 addition & 0 deletions README.md
@@ -96,6 +96,7 @@ frame processors:
--face-swapper-model {blendswap_256,inswapper_128,inswapper_128_fp16,simswap_256,simswap_512_unofficial,uniface_256} choose the model responsible for swapping the face
--frame-colorizer-model {ddcolor,ddcolor_artistic,deoldify,deoldify_artistic,deoldify_stable} choose the model responsible for colorizing the frame
--frame-colorizer-blend [0-100] blend the colorized into the previous frame
--frame-colorizer-size {192x192,256x256,384x384,512x512} specify the size of the frame provided to the frame colorizer
--frame-enhancer-model {lsdir_x4,nomos8k_sc_x4,real_esrgan_x2,real_esrgan_x2_fp16,real_esrgan_x4,real_esrgan_x4_fp16,real_hatgan_x4,span_kendata_x4} choose the model responsible for enhancing the frame
--frame-enhancer-blend [0-100] blend the enhanced into the previous frame
--lip-syncer-model {wav2lip_gan} choose the model responsible for syncing the lips
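For reference, a hedged example invocation exercising the new option; the python run.py entry point, the --frame-processors, -t and -o flags, and the file paths are assumptions drawn from the project's usual CLI, not from this diff:

python run.py --frame-processors frame_colorizer --frame-colorizer-model ddcolor --frame-colorizer-size 192x192 -t target.mp4 -o output.mp4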
1 change: 1 addition & 0 deletions facefusion.ini
@@ -63,6 +63,7 @@ face_enhancer_blend =
face_swapper_model =
frame_colorizer_model =
frame_colorizer_blend =
frame_colorizer_size =
frame_enhancer_model =
frame_enhancer_blend =
lip_syncer_model =
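The same override can be pinned in facefusion.ini; a minimal sketch, assuming the keys live under a [frame_processors] section (the section name is not shown in this hunk). With the validation added in core.py below, a value outside the documented choices now aborts at startup instead of failing mid-run:

[frame_processors]
frame_colorizer_model = ddcolor
frame_colorizer_size = 192x192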
14 changes: 7 additions & 7 deletions facefusion/content_analyser.py
@@ -1,22 +1,21 @@
from typing import Any
from functools import lru_cache
from time import sleep
import threading
import cv2
import numpy
import onnxruntime
from tqdm import tqdm

import facefusion.globals
from facefusion import process_manager, wording
from facefusion.thread_helper import thread_lock, conditional_thread_semaphore
from facefusion.typing import VisionFrame, ModelSet, Fps
from facefusion.execution import apply_execution_provider_options
from facefusion.vision import get_video_frame, count_video_frame_total, read_image, detect_video_fps
from facefusion.filesystem import resolve_relative_path, is_file
from facefusion.download import conditional_download

CONTENT_ANALYSER = None
THREAD_LOCK : threading.Lock = threading.Lock()
MODELS : ModelSet =\
{
'open_nsfw':
@@ -33,7 +32,7 @@
def get_content_analyser() -> Any:
global CONTENT_ANALYSER

with THREAD_LOCK:
with thread_lock():
while process_manager.is_checking():
sleep(0.5)
if CONTENT_ANALYSER is None:
@@ -72,10 +71,11 @@ def analyse_stream(vision_frame : VisionFrame, video_fps : Fps) -> bool:
def analyse_frame(vision_frame : VisionFrame) -> bool:
content_analyser = get_content_analyser()
vision_frame = prepare_frame(vision_frame)
probability = content_analyser.run(None,
{
content_analyser.get_inputs()[0].name: vision_frame
})[0][0][1]
with conditional_thread_semaphore(facefusion.globals.execution_providers):
probability = content_analyser.run(None,
{
content_analyser.get_inputs()[0].name: vision_frame
})[0][0][1]
return probability > PROBABILITY_LIMIT


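The new facefusion/thread_helper.py is among the 30 changed files but not shown in this excerpt. Here is a plausible sketch of its contract, inferred from the call sites above; the DmlExecutionProvider check is an assumption based on the 'Full directml support' commit message:

import threading
from contextlib import nullcontext
from typing import List, Union, ContextManager

THREAD_LOCK : threading.Lock = threading.Lock()
THREAD_SEMAPHORE : threading.Semaphore = threading.Semaphore()
NULL_CONTEXT : ContextManager[None] = nullcontext()


def thread_lock() -> threading.Lock:
    # shared lock guarding lazy model initialization
    return THREAD_LOCK


def thread_semaphore() -> threading.Semaphore:
    # shared semaphore that always serializes callers
    return THREAD_SEMAPHORE


def conditional_thread_semaphore(execution_providers : List[str]) -> Union[threading.Semaphore, ContextManager[None]]:
    # serialize inference only for providers assumed unsafe to run
    # sessions concurrently; everyone else gets a no-op context manager
    if 'DmlExecutionProvider' in execution_providers:
        return THREAD_SEMAPHORE
    return NULL_CONTEXT

Under this reading, wrapping each session run in conditional_thread_semaphore(facefusion.globals.execution_providers) costs CUDA and CPU users nothing, while DirectML users are protected from concurrent inference. The same pattern recurs in the face_analyser and face_masker diffs below.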
14 changes: 14 additions & 0 deletions facefusion/core.py
@@ -108,6 +108,19 @@ def cli() -> None:
run(program)


def validate_args(program : ArgumentParser) -> None:
try:
for action in program._actions:
if action.default:
if isinstance(action.default, list):
for default in action.default:
program._check_value(action, default)
else:
program._check_value(action, action.default)
except Exception as exception:
program.error(str(exception))


def apply_args(program : ArgumentParser) -> None:
args = program.parse_args()
# general
@@ -185,6 +198,7 @@ def apply_args(program : ArgumentParser) -> None:


def run(program : ArgumentParser) -> None:
validate_args(program)
apply_args(program)
logger.init(facefusion.globals.log_level)

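To make the new check concrete, a standalone sketch of the failure it catches; the flag value is hypothetical, and _check_value is the same private argparse helper that validate_args() relies on above:

from argparse import ArgumentParser

program = ArgumentParser()
# a bad facefusion.ini override would land here as the argparse default
program.add_argument('--frame-colorizer-size', default = '1024x1024', choices = [ '192x192', '256x256', '384x384', '512x512' ])

for action in program._actions:
    if action.default and not isinstance(action.default, list):
        program._check_value(action, action.default)  # raises ArgumentError: invalid choice: '1024x1024'

Because run() calls validate_args() before apply_args(), such an override now aborts with a usage error instead of reaching the frame processors.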
9 changes: 4 additions & 5 deletions facefusion/execution.py
@@ -11,14 +11,14 @@ def encode_execution_providers(execution_providers : List[str]) -> List[str]:
return [ execution_provider.replace('ExecutionProvider', '').lower() for execution_provider in execution_providers ]


def decode_execution_providers(execution_providers: List[str]) -> List[str]:
def decode_execution_providers(execution_providers : List[str]) -> List[str]:
available_execution_providers = onnxruntime.get_available_providers()
encoded_execution_providers = encode_execution_providers(available_execution_providers)

return [ execution_provider for execution_provider, encoded_execution_provider in zip(available_execution_providers, encoded_execution_providers) if any(execution_provider in encoded_execution_provider for execution_provider in execution_providers) ]


def apply_execution_provider_options(execution_providers: List[str]) -> List[Any]:
def apply_execution_provider_options(execution_providers : List[str]) -> List[Any]:
execution_providers_with_options : List[Any] = []

for execution_provider in execution_providers:
@@ -64,13 +64,12 @@ def detect_execution_devices() -> List[ExecutionDevice]:
'framework':
{
'name': 'CUDA',
'version': root_element.find('cuda_version').text,
'version': root_element.find('cuda_version').text
},
'product':
{
'vendor': 'NVIDIA',
'name': gpu_element.find('product_name').text.replace('NVIDIA ', ''),
'architecture': gpu_element.find('product_architecture').text,
'name': gpu_element.find('product_name').text.replace('NVIDIA ', '')
},
'video_memory':
{
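A quick sketch of the round trip the two provider helpers give you (actual output depends on the local onnxruntime build):

import onnxruntime
from facefusion.execution import encode_execution_providers, decode_execution_providers

# e.g. [ 'CUDAExecutionProvider', 'CPUExecutionProvider' ] -> [ 'cuda', 'cpu' ]
print(encode_execution_providers(onnxruntime.get_available_providers()))
# user-facing tokens back to provider names, e.g. [ 'cuda' ] -> [ 'CUDAExecutionProvider' ]
print(decode_execution_providers([ 'cuda' ]))

The other change in this file drops the product_architecture lookup from detect_execution_devices(); per the 'Remove architecture lookup to support old driver' commit, older NVIDIA drivers apparently do not report that field.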
50 changes: 26 additions & 24 deletions facefusion/face_analyser.py
@@ -1,6 +1,5 @@
from typing import Any, Optional, List, Tuple
from time import sleep
import threading
import cv2
import numpy
import onnxruntime
@@ -13,12 +12,11 @@
from facefusion.execution import apply_execution_provider_options
from facefusion.download import conditional_download
from facefusion.filesystem import resolve_relative_path, is_file
from facefusion.thread_helper import thread_lock, thread_semaphore, conditional_thread_semaphore
from facefusion.typing import VisionFrame, Face, FaceSet, FaceAnalyserOrder, FaceAnalyserAge, FaceAnalyserGender, ModelSet, BoundingBox, FaceLandmarkSet, FaceLandmark5, FaceLandmark68, Score, FaceScoreSet, Embedding
from facefusion.vision import resize_frame_resolution, unpack_resolution

FACE_ANALYSER = None
THREAD_SEMAPHORE : threading.Semaphore = threading.Semaphore()
THREAD_LOCK : threading.Lock = threading.Lock()
MODELS : ModelSet =\
{
'face_detector_retinaface':
@@ -85,7 +83,7 @@ def get_face_analyser() -> Any:
face_detectors = {}
face_landmarkers = {}

with THREAD_LOCK:
with thread_lock():
while process_manager.is_checking():
sleep(0.5)
if FACE_ANALYSER is None:
@@ -185,7 +183,7 @@ def detect_with_retinaface(vision_frame : VisionFrame, face_detector_size : str)
score_list = []

detect_vision_frame = prepare_detect_frame(temp_vision_frame, face_detector_size)
with THREAD_SEMAPHORE:
with thread_semaphore():
detections = face_detector.run(None,
{
face_detector.get_inputs()[0].name: detect_vision_frame
@@ -227,7 +225,7 @@ def detect_with_scrfd(vision_frame : VisionFrame, face_detector_size : str) -> T
score_list = []

detect_vision_frame = prepare_detect_frame(temp_vision_frame, face_detector_size)
with THREAD_SEMAPHORE:
with thread_semaphore():
detections = face_detector.run(None,
{
face_detector.get_inputs()[0].name: detect_vision_frame
@@ -266,7 +264,7 @@ def detect_with_yoloface(vision_frame : VisionFrame, face_detector_size : str) -
score_list = []

detect_vision_frame = prepare_detect_frame(temp_vision_frame, face_detector_size)
with THREAD_SEMAPHORE:
with thread_semaphore():
detections = face_detector.run(None,
{
face_detector.get_inputs()[0].name: detect_vision_frame
@@ -304,7 +302,7 @@ def detect_with_yunet(vision_frame : VisionFrame, face_detector_size : str) -> T

face_detector.setInputSize((temp_vision_frame.shape[1], temp_vision_frame.shape[0]))
face_detector.setScoreThreshold(facefusion.globals.face_detector_score)
with THREAD_SEMAPHORE:
with thread_semaphore():
_, detections = face_detector.detect(temp_vision_frame)
if numpy.any(detections):
for detection in detections:
@@ -380,10 +378,11 @@ def calc_embedding(temp_vision_frame : VisionFrame, face_landmark_5 : FaceLandma
crop_vision_frame = crop_vision_frame / 127.5 - 1
crop_vision_frame = crop_vision_frame[:, :, ::-1].transpose(2, 0, 1).astype(numpy.float32)
crop_vision_frame = numpy.expand_dims(crop_vision_frame, axis = 0)
embedding = face_recognizer.run(None,
{
face_recognizer.get_inputs()[0].name: crop_vision_frame
})[0]
with conditional_thread_semaphore(facefusion.globals.execution_providers):
embedding = face_recognizer.run(None,
{
face_recognizer.get_inputs()[0].name: crop_vision_frame
})[0]
embedding = embedding.ravel()
normed_embedding = embedding / numpy.linalg.norm(embedding)
return embedding, normed_embedding
@@ -399,10 +398,11 @@ def detect_face_landmark_68(temp_vision_frame : VisionFrame, bounding_box : Boun
crop_vision_frame[:, :, 0] = cv2.createCLAHE(clipLimit = 2).apply(crop_vision_frame[:, :, 0])
crop_vision_frame = cv2.cvtColor(crop_vision_frame, cv2.COLOR_Lab2RGB)
crop_vision_frame = crop_vision_frame.transpose(2, 0, 1).astype(numpy.float32) / 255.0
face_landmark_68, face_heatmap = face_landmarker.run(None,
{
face_landmarker.get_inputs()[0].name: [ crop_vision_frame ]
})
with conditional_thread_semaphore(facefusion.globals.execution_providers):
face_landmark_68, face_heatmap = face_landmarker.run(None,
{
face_landmarker.get_inputs()[0].name: [ crop_vision_frame ]
})
face_landmark_68 = face_landmark_68[:, :, :2][0] / 64
face_landmark_68 = face_landmark_68.reshape(1, -1, 2) * 256
face_landmark_68 = cv2.transform(face_landmark_68, cv2.invertAffineTransform(affine_matrix))
@@ -416,10 +416,11 @@ def expand_face_landmark_68_from_5(face_landmark_5 : FaceLandmark5) -> FaceLandm
face_landmarker = get_face_analyser().get('face_landmarkers').get('68_5')
affine_matrix = estimate_matrix_by_face_landmark_5(face_landmark_5, 'ffhq_512', (1, 1))
face_landmark_5 = cv2.transform(face_landmark_5.reshape(1, -1, 2), affine_matrix).reshape(-1, 2)
face_landmark_68_5 = face_landmarker.run(None,
{
face_landmarker.get_inputs()[0].name: [ face_landmark_5 ]
})[0][0]
with conditional_thread_semaphore(facefusion.globals.execution_providers):
face_landmark_68_5 = face_landmarker.run(None,
{
face_landmarker.get_inputs()[0].name: [ face_landmark_5 ]
})[0][0]
face_landmark_68_5 = cv2.transform(face_landmark_68_5.reshape(1, -1, 2), cv2.invertAffineTransform(affine_matrix)).reshape(-1, 2)
return face_landmark_68_5

@@ -432,10 +433,11 @@ def detect_gender_age(temp_vision_frame : VisionFrame, bounding_box : BoundingBo
crop_vision_frame, affine_matrix = warp_face_by_translation(temp_vision_frame, translation, scale, (96, 96))
crop_vision_frame = crop_vision_frame[:, :, ::-1].transpose(2, 0, 1).astype(numpy.float32)
crop_vision_frame = numpy.expand_dims(crop_vision_frame, axis = 0)
prediction = gender_age.run(None,
{
gender_age.get_inputs()[0].name: crop_vision_frame
})[0][0]
with conditional_thread_semaphore(facefusion.globals.execution_providers):
prediction = gender_age.run(None,
{
gender_age.get_inputs()[0].name: crop_vision_frame
})[0][0]
gender = int(numpy.argmax(prediction[:2]))
age = int(numpy.round(prediction[2] * 100))
return gender, age
25 changes: 13 additions & 12 deletions facefusion/face_masker.py
@@ -2,21 +2,20 @@
from cv2.typing import Size
from functools import lru_cache
from time import sleep
import threading
import cv2
import numpy
import onnxruntime

import facefusion.globals
from facefusion import process_manager
from facefusion.thread_helper import thread_lock, conditional_thread_semaphore
from facefusion.typing import FaceLandmark68, VisionFrame, Mask, Padding, FaceMaskRegion, ModelSet
from facefusion.execution import apply_execution_provider_options
from facefusion.filesystem import resolve_relative_path, is_file
from facefusion.download import conditional_download

FACE_OCCLUDER = None
FACE_PARSER = None
THREAD_LOCK : threading.Lock = threading.Lock()
MODELS : ModelSet =\
{
'face_occluder':
@@ -48,7 +47,7 @@
def get_face_occluder() -> Any:
global FACE_OCCLUDER

with THREAD_LOCK:
with thread_lock():
while process_manager.is_checking():
sleep(0.5)
if FACE_OCCLUDER is None:
@@ -60,7 +59,7 @@ def get_face_occluder() -> Any:
def get_face_parser() -> Any:
global FACE_PARSER

with THREAD_LOCK:
with thread_lock():
while process_manager.is_checking():
sleep(0.5)
if FACE_PARSER is None:
@@ -120,10 +119,11 @@ def create_occlusion_mask(crop_vision_frame : VisionFrame) -> Mask:
prepare_vision_frame = cv2.resize(crop_vision_frame, face_occluder.get_inputs()[0].shape[1:3][::-1])
prepare_vision_frame = numpy.expand_dims(prepare_vision_frame, axis = 0).astype(numpy.float32) / 255
prepare_vision_frame = prepare_vision_frame.transpose(0, 1, 2, 3)
occlusion_mask : Mask = face_occluder.run(None,
{
face_occluder.get_inputs()[0].name: prepare_vision_frame
})[0][0]
with conditional_thread_semaphore(facefusion.globals.execution_providers):
occlusion_mask : Mask = face_occluder.run(None,
{
face_occluder.get_inputs()[0].name: prepare_vision_frame
})[0][0]
occlusion_mask = occlusion_mask.transpose(0, 1, 2).clip(0, 1).astype(numpy.float32)
occlusion_mask = cv2.resize(occlusion_mask, crop_vision_frame.shape[:2][::-1])
occlusion_mask = (cv2.GaussianBlur(occlusion_mask.clip(0, 1), (0, 0), 5).clip(0.5, 1) - 0.5) * 2
@@ -135,10 +135,11 @@ def create_region_mask(crop_vision_frame : VisionFrame, face_mask_regions : List
prepare_vision_frame = cv2.flip(cv2.resize(crop_vision_frame, (512, 512)), 1)
prepare_vision_frame = numpy.expand_dims(prepare_vision_frame, axis = 0).astype(numpy.float32)[:, :, ::-1] / 127.5 - 1
prepare_vision_frame = prepare_vision_frame.transpose(0, 3, 1, 2)
region_mask : Mask = face_parser.run(None,
{
face_parser.get_inputs()[0].name: prepare_vision_frame
})[0][0]
with conditional_thread_semaphore(facefusion.globals.execution_providers):
region_mask : Mask = face_parser.run(None,
{
face_parser.get_inputs()[0].name: prepare_vision_frame
})[0][0]
region_mask = numpy.isin(region_mask.argmax(0), [ FACE_MASK_REGIONS[region] for region in face_mask_regions ])
region_mask = cv2.resize(region_mask.astype(numpy.float32), crop_vision_frame.shape[:2][::-1])
region_mask = (cv2.GaussianBlur(region_mask.clip(0, 1), (0, 0), 5).clip(0.5, 1) - 0.5) * 2
20 changes: 10 additions & 10 deletions facefusion/ffmpeg.py
@@ -44,16 +44,16 @@ def extract_frames(target_path : str, temp_video_resolution : str, temp_video_fp
trim_frame_start = facefusion.globals.trim_frame_start
trim_frame_end = facefusion.globals.trim_frame_end
temp_frames_pattern = get_temp_frames_pattern(target_path, '%04d')
commands = [ '-hwaccel', 'auto', '-i', target_path, '-q:v', '0' ]
commands = [ '-i', target_path, '-s', str(temp_video_resolution), '-q:v', '0' ]

if trim_frame_start is not None and trim_frame_end is not None:
commands.extend([ '-vf', 'trim=start_frame=' + str(trim_frame_start) + ':end_frame=' + str(trim_frame_end) + ',scale=' + str(temp_video_resolution) + ',fps=' + str(temp_video_fps) ])
commands.extend([ '-vf', 'trim=start_frame=' + str(trim_frame_start) + ':end_frame=' + str(trim_frame_end) + ',fps=' + str(temp_video_fps) ])
elif trim_frame_start is not None:
commands.extend([ '-vf', 'trim=start_frame=' + str(trim_frame_start) + ',scale=' + str(temp_video_resolution) + ',fps=' + str(temp_video_fps) ])
commands.extend([ '-vf', 'trim=start_frame=' + str(trim_frame_start) + ',fps=' + str(temp_video_fps) ])
elif trim_frame_end is not None:
commands.extend([ '-vf', 'trim=end_frame=' + str(trim_frame_end) + ',scale=' + str(temp_video_resolution) + ',fps=' + str(temp_video_fps) ])
commands.extend([ '-vf', 'trim=end_frame=' + str(trim_frame_end) + ',fps=' + str(temp_video_fps) ])
else:
commands.extend([ '-vf', 'scale=' + str(temp_video_resolution) + ',fps=' + str(temp_video_fps) ])
commands.extend([ '-vf', 'fps=' + str(temp_video_fps) ])
commands.extend([ '-vsync', '0', temp_frames_pattern ])
return run_ffmpeg(commands)

@@ -62,7 +62,7 @@ def merge_video(target_path : str, output_video_resolution : str, output_video_f
temp_video_fps = restrict_video_fps(target_path, output_video_fps)
temp_output_video_path = get_temp_output_video_path(target_path)
temp_frames_pattern = get_temp_frames_pattern(target_path, '%04d')
commands = [ '-hwaccel', 'auto', '-s', str(output_video_resolution), '-r', str(temp_video_fps), '-i', temp_frames_pattern, '-c:v', facefusion.globals.output_video_encoder ]
commands = [ '-r', str(temp_video_fps), '-i', temp_frames_pattern, '-s', str(output_video_resolution), '-c:v', facefusion.globals.output_video_encoder ]

if facefusion.globals.output_video_encoder in [ 'libx264', 'libx265' ]:
output_video_compression = round(51 - (facefusion.globals.output_video_quality * 0.51))
@@ -83,13 +83,13 @@ def copy_image(target_path : str, output_path : str, temp_image_resolution : str) -> bool:
def copy_image(target_path : str, output_path : str, temp_image_resolution : str) -> bool:
is_webp = filetype.guess_mime(target_path) == 'image/webp'
temp_image_compression = 100 if is_webp else 0
commands = [ '-i', target_path, '-q:v', str(temp_image_compression), '-vf', 'scale=' + str(temp_image_resolution), '-y', output_path ]
commands = [ '-i', target_path, '-s', str(temp_image_resolution), '-q:v', str(temp_image_compression), '-y', output_path ]
return run_ffmpeg(commands)


def finalize_image(output_path : str, output_image_resolution : str) -> bool:
output_image_compression = round(31 - (facefusion.globals.output_image_quality * 0.31))
commands = [ '-i', output_path, '-q:v', str(output_image_compression), '-vf', 'scale=' + str(output_image_resolution), '-y', output_path ]
commands = [ '-i', output_path, '-s', str(output_image_resolution), '-q:v', str(output_image_compression), '-y', output_path ]
return run_ffmpeg(commands)


@@ -106,7 +106,7 @@ def restore_audio(target_path : str, output_path : str, output_video_fps : Fps)
trim_frame_start = facefusion.globals.trim_frame_start
trim_frame_end = facefusion.globals.trim_frame_end
temp_output_video_path = get_temp_output_video_path(target_path)
commands = [ '-hwaccel', 'auto', '-i', temp_output_video_path ]
commands = [ '-i', temp_output_video_path ]

if trim_frame_start is not None:
start_time = trim_frame_start / output_video_fps
@@ -120,7 +120,7 @@

def replace_audio(target_path : str, audio_path : str, output_path : str) -> bool:
temp_output_path = get_temp_output_video_path(target_path)
commands = [ '-hwaccel', 'auto', '-i', temp_output_path, '-i', audio_path, '-af', 'apad', '-shortest', '-y', output_path ]
commands = [ '-i', temp_output_path, '-i', audio_path, '-af', 'apad', '-shortest', '-y', output_path ]
return run_ffmpeg(commands)
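To illustrate the net effect on extract_frames, the argument list it now assembles for a trimmed clip; all values are hypothetical, and run_ffmpeg() is assumed to prepend the ffmpeg binary and its global flags:

commands = [ '-i', 'target.mp4', '-s', '1280x720', '-q:v', '0' ]
commands.extend([ '-vf', 'trim=start_frame=10:end_frame=250,fps=25.0' ])
commands.extend([ '-vsync', '0', 'frames/%04d.jpg' ])
# roughly: ffmpeg -i target.mp4 -s 1280x720 -q:v 0 -vf trim=start_frame=10:end_frame=250,fps=25.0 -vsync 0 frames/%04d.jpg

Two changes run through every hunk in this file: '-hwaccel auto' is gone (the 'Remove hwaccel auto' commit), and scaling moves from the '-vf' filter chain to the '-s' option, matching the 'Respect the output video resolution' commit.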

