From a14307c683534e7b7cb64ef7cbee68454a7f5386 Mon Sep 17 00:00:00 2001
From: Starg <starg@users.osdn.me>
Date: Thu, 28 Dec 2023 01:00:23 +0900
Subject: [PATCH 1/5] [wasapi] Fix COM initialization in console versions

---
 timidity/timidity.c |  8 +++++---
 timidity/wasapi_a.c | 18 +++++++++---------
 2 files changed, 14 insertions(+), 12 deletions(-)

diff --git a/timidity/timidity.c b/timidity/timidity.c
index 88a24910..387c5a28 100644
--- a/timidity/timidity.c
+++ b/timidity/timidity.c
@@ -9081,7 +9081,7 @@ extern int volatile save_playlist_once_before_exit_flag;
 #endif /* IA_W32GUI */
 
 
-#if defined ( IA_W32GUI ) || defined ( IA_W32G_SYN )
+#ifdef __W32__
 static int CoInitializeOK = 0;
 #endif
 
@@ -9238,9 +9238,11 @@ int main(int argc, char **argv)
 		return 0;
 	}
 	timidity_start_initialize();
-#if defined (IA_W32GUI) || defined (IA_W32G_SYN)
+#ifdef __W32__
 	if (SUCCEEDED(CoInitializeEx(NULL, COINIT_MULTITHREADED | COINIT_DISABLE_OLE1DDE)))
 		CoInitializeOK = 1;
+#endif
+#if defined (IA_W32GUI) || defined (IA_W32G_SYN)
 	w32g_initialize();
 	for (c = 1; c < argc; c++)
 		if (is_directory(argv[c])) {
@@ -9367,9 +9369,9 @@ int main(int argc, char **argv)
 	/* CUI, SYN */
 	main_ret = timidity_play_main(nfiles, files);
 	w32_atexit = 0;
-#ifdef IA_W32G_SYN
 	if (CoInitializeOK)
 		CoUninitialize();
+#ifdef IA_W32G_SYN
 	w32g_uninitialize();
 #endif /* IA_W32G_SYN */
 #else
diff --git a/timidity/wasapi_a.c b/timidity/wasapi_a.c
index 584ad20f..51fa1cbc 100644
--- a/timidity/wasapi_a.c
+++ b/timidity/wasapi_a.c
@@ -854,15 +854,15 @@ void close_output(void)
 		CloseHandle(hEventTcv);
 		hEventTcv = NULL;
 	}
-	if(IsCoInit && CoInitThreadId == GetCurrentThreadId()){
-		CoUninitialize();
-		IsCoInit = 0;
-	}
 	BufferFrames = 0;
 	free_avrt();
 #ifdef USE_TEMP_ENCODE
 	reset_temporary_encoding();
 #endif
+	if(IsCoInit && CoInitThreadId == GetCurrentThreadId()){
+		CoUninitialize();
+		IsCoInit = 0;
+	}
 	IsOpened = 0;
 }
 
@@ -881,7 +881,11 @@ int open_output(void)
 	int device_id;
 
 	close_output();	
-		
+
+	if(check_hresult_failed(CoInitializeEx(NULL, COINIT_MULTITHREADED | COINIT_DISABLE_OLE1DDE), "CoInitializeEx()"))
+		goto error;	
+	IsCoInit = 1;
+	CoInitThreadId = GetCurrentThreadId();
 	if(!get_winver()){
 		ctl->cmsg(CMSG_WARNING, VERB_NORMAL, "WASAPI ERROR! WASAPI require Windows Vista and later.");
 		return -1;
@@ -900,10 +904,6 @@ int open_output(void)
 	hEventTcv = CreateEvent(NULL,FALSE,FALSE,NULL); // auto reset
 	if(!hEventTcv)
 		goto error;	
-    if(check_hresult_failed(CoInitializeEx(NULL, COINIT_MULTITHREADED | COINIT_DISABLE_OLE1DDE), "CoInitializeEx()"))
-		goto error;	
-	IsCoInit = 1;
-	CoInitThreadId = GetCurrentThreadId();
 	if(!get_device(&pMMDevice, device_id))
 		goto error;
 	if(check_hresult_failed(IMMDevice_Activate(pMMDevice, &tim_IID_IAudioClient, CLSCTX_INPROC_SERVER, NULL, (void**)&pAudioClient), "IMMDevice::Activate()"))

From 9a2d532cd73c8e775f722ec131a5ce68fff329c0 Mon Sep 17 00:00:00 2001
From: starg2 <75976488+starg2@users.noreply.github.com>
Date: Sat, 16 Mar 2024 17:24:27 +0900
Subject: [PATCH 2/5] [opus] Update libopus to 1.5.1

---
 opus/CMakeLists.txt                           |    6 +-
 opus/COPYING                                  |    4 +-
 opus/README                                   |   34 +-
 opus/celt/arch.h                              |    3 +
 opus/celt/arm/arm_celt_map.c                  |   31 +-
 opus/celt/arm/armcpu.c                        |   62 +-
 opus/celt/arm/armcpu.h                        |   13 +
 opus/celt/arm/celt_neon_intr.c                |   83 +-
 opus/celt/arm/pitch_neon_intr.c               |   58 +-
 opus/celt/bands.c                             |   11 +-
 opus/celt/celt.h                              |   25 +-
 opus/celt/celt_decoder.c                      |  419 +++++--
 opus/celt/celt_encoder.c                      |   84 +-
 opus/celt/celt_lpc.c                          |   99 +-
 opus/celt/celt_lpc.h                          |    2 +-
 opus/celt/cpu_support.h                       |   10 +-
 opus/celt/ecintrin.h                          |    4 +
 opus/celt/entdec.c                            |   21 +
 opus/celt/entdec.h                            |   10 +
 opus/celt/entenc.c                            |   11 +
 opus/celt/entenc.h                            |    9 +
 opus/celt/fixed_debug.h                       |   71 +-
 opus/celt/fixed_generic.h                     |   14 +-
 opus/celt/float_cast.h                        |   62 +-
 opus/celt/kiss_fft.h                          |   12 +-
 opus/celt/laplace.c                           |  101 ++
 opus/celt/laplace.h                           |    9 +
 opus/celt/mathops.h                           |   12 +-
 opus/celt/mips/celt_mipsr1.h                  |    6 +-
 opus/celt/mips/mdct_mipsr1.h                  |    6 +-
 opus/celt/mips/vq_mipsr1.h                    |    6 +-
 opus/celt/modes.c                             |    3 +
 opus/celt/os_support.h                        |   15 +-
 opus/celt/pitch.c                             |   34 +-
 opus/celt/pitch.h                             |   11 +
 opus/celt/rate.c                              |    2 +
 opus/celt/stack_alloc.h                       |    6 +-
 opus/celt/tests/test_unit_cwrs32.c            |    1 +
 opus/celt/tests/test_unit_dft.c               |    4 +-
 opus/celt/tests/test_unit_entropy.c           |    4 +-
 opus/celt/tests/test_unit_laplace.c           |    1 +
 opus/celt/tests/test_unit_mathops.c           |    6 +-
 opus/celt/tests/test_unit_mdct.c              |    4 +-
 opus/celt/tests/test_unit_rotation.c          |    1 +
 opus/celt/x86/celt_lpc_sse.h                  |    5 +-
 opus/celt/x86/celt_lpc_sse4_1.c               |   13 +-
 opus/celt/x86/pitch_avx.c                     |  101 ++
 opus/celt/x86/pitch_sse.h                     |   48 +-
 opus/celt/x86/pitch_sse4_1.c                  |   51 +-
 opus/celt/x86/vq_sse.h                        |    6 +-
 opus/celt/x86/vq_sse2.c                       |    8 +-
 opus/celt/x86/x86_arch_macros.h               |   47 +
 opus/celt/x86/x86_celt_map.c                  |   20 +
 opus/celt/x86/x86cpu.c                        |   51 +-
 opus/celt/x86/x86cpu.h                        |   73 +-
 opus/include/opus/opus.h                      |  126 +-
 opus/include/opus/opus_custom.h               |    7 +-
 opus/include/opus/opus_defines.h              |   37 +-
 opus/include/opus/opus_multistream.h          |    2 +-
 opus/silk/API.h                               |   23 +-
 opus/silk/CNG.c                               |    4 +
 opus/silk/LPC_fit.c                           |    3 +-
 opus/silk/MacroCount.h                        |    2 +-
 opus/silk/MacroDebug.h                        |   54 +-
 opus/silk/NSQ.c                               |   48 +-
 opus/silk/NSQ_del_dec.c                       |   60 +-
 opus/silk/PLC.c                               |   67 +-
 opus/silk/PLC.h                               |    3 +
 opus/silk/SigProc_FIX.h                       |   10 +-
 opus/silk/VQ_WMat_EC.c                        |    4 +-
 opus/silk/arm/LPC_inv_pred_gain_neon_intr.c   |   22 +-
 opus/silk/arm/NSQ_del_dec_arm.h               |    4 +-
 opus/silk/arm/NSQ_del_dec_neon_intr.c         |   28 +-
 opus/silk/arm/NSQ_neon.h                      |    4 +-
 opus/silk/arm/arm_silk_map.c                  |    7 +-
 opus/silk/bwexpander_32.c                     |    3 +-
 opus/silk/control.h                           |   11 +
 opus/silk/control_codec.c                     |    2 +-
 opus/silk/debug.c                             |   12 +-
 opus/silk/debug.h                             |   25 +-
 opus/silk/dec_API.c                           |   69 +-
 opus/silk/decode_frame.c                      |   61 +-
 opus/silk/define.h                            |    1 +
 opus/silk/enc_API.c                           |   19 +-
 opus/silk/fixed/LTP_scale_ctrl_FIX.c          |   11 +-
 .../warped_autocorrelation_FIX_neon_intr.c    |    9 +-
 opus/silk/fixed/burg_modified_FIX.c           |    8 +-
 opus/silk/fixed/encode_frame_FIX.c            |   18 +-
 opus/silk/fixed/find_pred_coefs_FIX.c         |    3 +-
 opus/silk/fixed/vector_ops_FIX.c              |    2 +-
 .../silk/fixed/x86/burg_modified_FIX_sse4_1.c |   69 +-
 opus/silk/fixed/x86/vector_ops_FIX_sse4_1.c   |   43 +-
 opus/silk/float/LTP_scale_ctrl_FLP.c          |   10 +-
 opus/silk/float/SigProc_FLP.h                 |   14 +-
 opus/silk/float/autocorrelation_FLP.c         |    5 +-
 opus/silk/float/burg_modified_FLP.c           |    5 +-
 opus/silk/float/corrMatrix_FLP.c              |   10 +-
 opus/silk/float/encode_frame_FLP.c            |   17 +-
 opus/silk/float/find_LPC_FLP.c                |    7 +-
 opus/silk/float/find_LTP_FLP.c                |    7 +-
 opus/silk/float/find_pitch_lags_FLP.c         |    2 +-
 opus/silk/float/find_pred_coefs_FLP.c         |    7 +-
 opus/silk/float/inner_product_FLP.c           |    2 +-
 opus/silk/float/main_FLP.h                    |   12 +-
 opus/silk/float/noise_shape_analysis_FLP.c    |    2 +-
 opus/silk/float/pitch_analysis_core_FLP.c     |    2 +-
 opus/silk/float/warped_autocorrelation_FLP.c  |    6 +-
 opus/silk/float/wrappers_FLP.c                |   10 +-
 opus/silk/float/x86/inner_product_FLP_avx2.c  |   85 ++
 opus/silk/init_decoder.c                      |   33 +-
 opus/silk/init_encoder.c                      |    4 +
 opus/silk/main.h                              |   70 +-
 opus/silk/mips/NSQ_del_dec_mipsr1.h           |    6 +-
 opus/silk/mips/macros_mipsr1.h                |    6 +-
 opus/silk/stereo_LR_to_MS.c                   |    8 +-
 opus/silk/stereo_MS_to_LR.c                   |    4 +-
 opus/silk/structs.h                           |   28 +
 opus/silk/tests/test_unit_LPC_inv_pred_gain.c |    3 +-
 opus/silk/typedef.h                           |    3 +
 opus/silk/x86/NSQ_del_dec_avx2.c              | 1075 +++++++++++++++++
 opus/silk/x86/NSQ_del_dec_sse4_1.c            |  206 ++--
 opus/silk/x86/NSQ_sse4_1.c                    |  247 ++--
 opus/silk/x86/SigProc_FIX_sse.h               |   49 +-
 opus/silk/x86/VAD_sse4_1.c                    |   30 +-
 opus/silk/x86/VQ_WMat_EC_sse4_1.c             |  189 +--
 opus/silk/x86/main_sse.h                      |  287 +++--
 opus/silk/x86/x86_silk_map.c                  |  113 +-
 opus/src/analysis.c                           |    8 +-
 opus/src/extensions.c                         |  315 +++++
 opus/src/mapping_matrix.c                     |  561 +++++++++
 opus/src/mapping_matrix.h                     |   12 +
 opus/src/mlp.c                                |   42 +-
 opus/src/mlp.h                                |   20 +-
 opus/src/mlp_data.c                           |    6 +-
 opus/src/opus.c                               |   10 +-
 opus/src/opus_decoder.c                       |  510 +++++++-
 opus/src/opus_demo.c                          |  359 +++++-
 opus/src/opus_encoder.c                       |  726 +++++++----
 opus/src/opus_multistream_decoder.c           |   11 +-
 opus/src/opus_multistream_encoder.c           |   11 +-
 opus/src/opus_private.h                       |   28 +-
 opus/src/opus_projection_encoder.c            |   42 +
 opus/src/repacketizer.c                       |  144 ++-
 opus/src/repacketizer_demo.c                  |   44 +-
 opus/src/tansig_table.h                       |   45 -
 145 files changed, 6594 insertions(+), 1558 deletions(-)
 create mode 100644 opus/celt/x86/pitch_avx.c
 create mode 100644 opus/celt/x86/x86_arch_macros.h
 create mode 100644 opus/silk/float/x86/inner_product_FLP_avx2.c
 create mode 100644 opus/silk/x86/NSQ_del_dec_avx2.c
 create mode 100644 opus/src/extensions.c
 delete mode 100644 opus/src/tansig_table.h

diff --git a/opus/CMakeLists.txt b/opus/CMakeLists.txt
index adb21233..afd72104 100644
--- a/opus/CMakeLists.txt
+++ b/opus/CMakeLists.txt
@@ -32,6 +32,7 @@ add_library(
 
     src/analysis.c
     src/analysis.h
+    src/extensions.c
     src/mlp.c
     src/mlp.h
     src/mlp_data.c
@@ -43,7 +44,6 @@ add_library(
     src/opus_multistream_encoder.c
     src/opus_private.h
     src/repacketizer.c
-    src/tansig_table.h
 
     celt/arch.h
     celt/bands.c
@@ -95,12 +95,14 @@ add_library(
     celt/_kiss_fft_guts.h
     celt/x86/celt_lpc_sse4_1.c
     celt/x86/celt_lpc_sse.h
+    celt/x86/pitch_avx.c
     celt/x86/pitch_sse.c
     celt/x86/pitch_sse.h
     celt/x86/pitch_sse2.c
     celt/x86/pitch_sse4_1.c
     celt/x86/vq_sse.h
     celt/x86/vq_sse2.c
+    celt/x86/x86_arch_macros.h
     celt/x86/x86cpu.c
     celt/x86/x86cpu.h
     celt/x86/x86_celt_map.c
@@ -234,7 +236,9 @@ add_library(
     silk/float/structs_FLP.h
     silk/float/warped_autocorrelation_FLP.c
     silk/float/wrappers_FLP.c
+    silk/float/x86/inner_product_FLP_avx2.c
     silk/x86/main_sse.h
+    silk/x86/NSQ_del_dec_avx2.c
     silk/x86/NSQ_del_dec_sse4_1.c
     silk/x86/NSQ_sse4_1.c
     silk/x86/SigProc_FIX_sse.h
diff --git a/opus/COPYING b/opus/COPYING
index 9c739c34..75711467 100644
--- a/opus/COPYING
+++ b/opus/COPYING
@@ -1,7 +1,7 @@
-Copyright 2001-2011 Xiph.Org, Skype Limited, Octasic,
+Copyright 2001-2023 Xiph.Org, Skype Limited, Octasic,
                     Jean-Marc Valin, Timothy B. Terriberry,
                     CSIRO, Gregory Maxwell, Mark Borgerding,
-                    Erik de Castro Lopo
+                    Erik de Castro Lopo, Mozilla, Amazon
 
 Redistribution and use in source and binary forms, with or without
 modification, are permitted provided that the following conditions
diff --git a/opus/README b/opus/README
index 27fddf96..bcf2376d 100644
--- a/opus/README
+++ b/opus/README
@@ -22,7 +22,7 @@ This package implements a shared library for encoding and decoding raw Opus
 bitstreams. Raw Opus bitstreams should be used over RTP according to
  https://tools.ietf.org/html/rfc7587
 
-The package also includes a number of test  tools used for testing the
+The package also includes a number of test tools used for testing the
 correct operation of the library. The bitstreams read/written by these
 tools should not be used for Opus file distribution: They include
 additional debugging data and cannot support seeking.
@@ -35,10 +35,32 @@ An opus-tools package is available which provides encoding and decoding of
 Ogg encapsulated Opus files and includes a number of useful features.
 
 Opus-tools can be found at:
- https://git.xiph.org/?p=opus-tools.git
+ https://gitlab.xiph.org/xiph/opus-tools.git
 or on the main Opus website:
  https://opus-codec.org/
 
+== Deep Learning and Opus ==
+
+Lossy networks continue to be a challenge for real-time communications.
+While the original implementation of Opus provides an excellent packet loss
+concealment mechanism, the team has continued to advance the methodology used
+to improve audio quality in challenging network environments.
+
+In Opus 1.5, we added a deep learning based redundancy encoder that enhances
+audio in lossy networks by embedding one second of recovery data in the padding
+data of each packet. The underlying algorithm behind encoding and decoding the
+recovery data is called the deep redundancy (DRED) algorithm. By leveraging
+the padding data within the packet, Opus 1.5 is fully backward compatible with
+prior revisions of Opus. Please see the README under the "dnn" subdirectory to
+understand DRED.
+
+DRED was developed by a team that Amazon Web Services initially sponsored,
+who open-sourced the implementation and began the
+standardization process at the IETF:
+  https://datatracker.ietf.org/doc/draft-ietf-mlcodec-opus-extension/
+The license behind Opus or the intellectual property position of Opus does
+not change with Opus 1.5.
+
 == Compiling libopus ==
 
 To build from a distribution tarball, you only need to do the following:
@@ -68,7 +90,7 @@ On Apple macOS, install Xcode and brew.sh, then in the Terminal enter:
 
 1) Clone the repository:
 
-    % git clone https://git.xiph.org/opus.git
+    % git clone https://gitlab.xiph.org/xiph/opus.git
     % cd opus
 
 2) Compiling the source
@@ -77,6 +99,8 @@ On Apple macOS, install Xcode and brew.sh, then in the Terminal enter:
     % ./configure
     % make
 
+On x86, it's a good idea to use a -march= option that allows the use of AVX2.
+
 3) Install the codec libraries (optional)
 
     % sudo make install
@@ -133,6 +157,10 @@ To run compare the code to these test vectors:
     % tar -zxf opus_testvectors-rfc8251.tar.gz
     % ./tests/run_vectors.sh ./ opus_newvectors 48000
 
+== Compiling libopus for Windows and alternative build systems ==
+
+See cmake/README.md or meson/README.md.
+
 == Portability notes ==
 
 This implementation uses floating-point by default but can be compiled to
diff --git a/opus/celt/arch.h b/opus/celt/arch.h
index 08b07db5..3845c3a0 100644
--- a/opus/celt/arch.h
+++ b/opus/celt/arch.h
@@ -73,6 +73,9 @@ __attribute__((noreturn))
 void celt_fatal(const char *str, const char *file, int line)
 {
    fprintf (stderr, "Fatal (internal) error in %s, line %d: %s\n", file, line, str);
+#if defined(_MSC_VER)
+   _set_abort_behavior( 0, _WRITE_ABORT_MSG);
+#endif
    abort();
 }
 #endif
diff --git a/opus/celt/arm/arm_celt_map.c b/opus/celt/arm/arm_celt_map.c
index ca988b66..cbaea495 100644
--- a/opus/celt/arm/arm_celt_map.c
+++ b/opus/celt/arm/arm_celt_map.c
@@ -40,7 +40,8 @@ opus_val32 (*const CELT_INNER_PROD_IMPL[OPUS_ARCHMASK+1])(const opus_val16 *x, c
   celt_inner_prod_c,   /* ARMv4 */
   celt_inner_prod_c,   /* EDSP */
   celt_inner_prod_c,   /* Media */
-  celt_inner_prod_neon /* NEON */
+  celt_inner_prod_neon,/* NEON */
+  celt_inner_prod_neon /* DOTPROD */
 };
 
 void (*const DUAL_INNER_PROD_IMPL[OPUS_ARCHMASK+1])(const opus_val16 *x, const opus_val16 *y01, const opus_val16 *y02,
@@ -48,7 +49,8 @@ void (*const DUAL_INNER_PROD_IMPL[OPUS_ARCHMASK+1])(const opus_val16 *x, const o
   dual_inner_prod_c,   /* ARMv4 */
   dual_inner_prod_c,   /* EDSP */
   dual_inner_prod_c,   /* Media */
-  dual_inner_prod_neon /* NEON */
+  dual_inner_prod_neon,/* NEON */
+  dual_inner_prod_neon /* DOTPROD */
 };
 # endif
 
@@ -61,7 +63,8 @@ opus_val32 (*const CELT_PITCH_XCORR_IMPL[OPUS_ARCHMASK+1])(const opus_val16 *,
   celt_pitch_xcorr_c,               /* ARMv4 */
   MAY_HAVE_EDSP(celt_pitch_xcorr),  /* EDSP */
   MAY_HAVE_MEDIA(celt_pitch_xcorr), /* Media */
-  MAY_HAVE_NEON(celt_pitch_xcorr)   /* NEON */
+  MAY_HAVE_NEON(celt_pitch_xcorr),  /* NEON */
+  MAY_HAVE_NEON(celt_pitch_xcorr)   /* DOTPROD */
 };
 
 #  endif
@@ -72,7 +75,8 @@ void (*const CELT_PITCH_XCORR_IMPL[OPUS_ARCHMASK+1])(const opus_val16 *,
   celt_pitch_xcorr_c,              /* ARMv4 */
   celt_pitch_xcorr_c,              /* EDSP */
   celt_pitch_xcorr_c,              /* Media */
-  celt_pitch_xcorr_float_neon      /* Neon */
+  celt_pitch_xcorr_float_neon,     /* Neon */
+  celt_pitch_xcorr_float_neon      /* DOTPROD */
 };
 #  endif
 # endif /* FIXED_POINT */
@@ -90,6 +94,7 @@ void (*const XCORR_KERNEL_IMPL[OPUS_ARCHMASK + 1])(
   xcorr_kernel_c,                /* EDSP */
   xcorr_kernel_c,                /* Media */
   xcorr_kernel_neon_fixed,       /* Neon */
+  xcorr_kernel_neon_fixed        /* DOTPROD */
 };
 
 #endif
@@ -101,14 +106,16 @@ int (*const OPUS_FFT_ALLOC_ARCH_IMPL[OPUS_ARCHMASK+1])(kiss_fft_state *st) = {
    opus_fft_alloc_arch_c,        /* ARMv4 */
    opus_fft_alloc_arch_c,        /* EDSP */
    opus_fft_alloc_arch_c,        /* Media */
-   opus_fft_alloc_arm_neon       /* Neon with NE10 library support */
+   opus_fft_alloc_arm_neon,      /* Neon with NE10 library support */
+   opus_fft_alloc_arm_neon       /* DOTPROD with NE10 library support */
 };
 
 void (*const OPUS_FFT_FREE_ARCH_IMPL[OPUS_ARCHMASK+1])(kiss_fft_state *st) = {
    opus_fft_free_arch_c,         /* ARMv4 */
    opus_fft_free_arch_c,         /* EDSP */
    opus_fft_free_arch_c,         /* Media */
-   opus_fft_free_arm_neon        /* Neon with NE10 */
+   opus_fft_free_arm_neon,       /* Neon with NE10 */
+   opus_fft_free_arm_neon        /* DOTPROD with NE10 */
 };
 #   endif /* CUSTOM_MODES */
 
@@ -118,7 +125,8 @@ void (*const OPUS_FFT[OPUS_ARCHMASK+1])(const kiss_fft_state *cfg,
    opus_fft_c,                   /* ARMv4 */
    opus_fft_c,                   /* EDSP */
    opus_fft_c,                   /* Media */
-   opus_fft_neon                 /* Neon with NE10 */
+   opus_fft_neon,                /* Neon with NE10 */
+   opus_fft_neon                 /* DOTPROD with NE10 */
 };
 
 void (*const OPUS_IFFT[OPUS_ARCHMASK+1])(const kiss_fft_state *cfg,
@@ -127,7 +135,8 @@ void (*const OPUS_IFFT[OPUS_ARCHMASK+1])(const kiss_fft_state *cfg,
    opus_ifft_c,                   /* ARMv4 */
    opus_ifft_c,                   /* EDSP */
    opus_ifft_c,                   /* Media */
-   opus_ifft_neon                 /* Neon with NE10 */
+   opus_ifft_neon,                /* Neon with NE10 */
+   opus_ifft_neon                 /* DOTPROD with NE10 */
 };
 
 void (*const CLT_MDCT_FORWARD_IMPL[OPUS_ARCHMASK+1])(const mdct_lookup *l,
@@ -139,7 +148,8 @@ void (*const CLT_MDCT_FORWARD_IMPL[OPUS_ARCHMASK+1])(const mdct_lookup *l,
    clt_mdct_forward_c,           /* ARMv4 */
    clt_mdct_forward_c,           /* EDSP */
    clt_mdct_forward_c,           /* Media */
-   clt_mdct_forward_neon         /* Neon with NE10 */
+   clt_mdct_forward_neon,        /* Neon with NE10 */
+   clt_mdct_forward_neon         /* DOTPROD with NE10 */
 };
 
 void (*const CLT_MDCT_BACKWARD_IMPL[OPUS_ARCHMASK+1])(const mdct_lookup *l,
@@ -151,7 +161,8 @@ void (*const CLT_MDCT_BACKWARD_IMPL[OPUS_ARCHMASK+1])(const mdct_lookup *l,
    clt_mdct_backward_c,           /* ARMv4 */
    clt_mdct_backward_c,           /* EDSP */
    clt_mdct_backward_c,           /* Media */
-   clt_mdct_backward_neon         /* Neon with NE10 */
+   clt_mdct_backward_neon,        /* Neon with NE10 */
+   clt_mdct_backward_neon         /* DOTPROD with NE10 */
 };
 
 #  endif /* HAVE_ARM_NE10 */
diff --git a/opus/celt/arm/armcpu.c b/opus/celt/arm/armcpu.c
index 694a63b7..06a53435 100644
--- a/opus/celt/arm/armcpu.c
+++ b/opus/celt/arm/armcpu.c
@@ -43,6 +43,7 @@
 #define OPUS_CPU_ARM_EDSP_FLAG  (1<<OPUS_ARCH_ARM_EDSP)
 #define OPUS_CPU_ARM_MEDIA_FLAG (1<<OPUS_ARCH_ARM_MEDIA)
 #define OPUS_CPU_ARM_NEON_FLAG  (1<<OPUS_ARCH_ARM_NEON)
+#define OPUS_CPU_ARM_DOTPROD_FLAG  (1<<OPUS_ARCH_ARM_DOTPROD)
 
 #if defined(_MSC_VER)
 /*For GetExceptionCode() and EXCEPTION_ILLEGAL_INSTRUCTION.*/
@@ -93,6 +94,8 @@ static OPUS_INLINE opus_uint32 opus_cpu_capabilities(void){
 
 #elif defined(__linux__)
 /* Linux based */
+#include <stdio.h>
+
 opus_uint32 opus_cpu_capabilities(void)
 {
   opus_uint32 flags = 0;
@@ -124,6 +127,14 @@ opus_uint32 opus_cpu_capabilities(void)
         p = strstr(buf, " neon");
         if(p != NULL && (p[5] == ' ' || p[5] == '\n'))
           flags |= OPUS_CPU_ARM_NEON_FLAG;
+        p = strstr(buf, " asimd");
+        if(p != NULL && (p[6] == ' ' || p[6] == '\n'))
+          flags |= OPUS_CPU_ARM_NEON_FLAG | OPUS_CPU_ARM_MEDIA_FLAG | OPUS_CPU_ARM_EDSP_FLAG;
+#  endif
+#  if defined(OPUS_ARM_MAY_HAVE_DOTPROD)
+        p = strstr(buf, " asimddp");
+        if(p != NULL && (p[8] == ' ' || p[8] == '\n'))
+          flags |= OPUS_CPU_ARM_DOTPROD_FLAG;
 #  endif
       }
 # endif
@@ -142,10 +153,44 @@ opus_uint32 opus_cpu_capabilities(void)
 # endif
     }
 
+#if defined(OPUS_ARM_PRESUME_AARCH64_NEON_INTR)
+    flags |= OPUS_CPU_ARM_EDSP_FLAG | OPUS_CPU_ARM_MEDIA_FLAG | OPUS_CPU_ARM_NEON_FLAG;
+# if defined(OPUS_ARM_PRESUME_DOTPROD)
+    flags |= OPUS_CPU_ARM_DOTPROD_FLAG;
+# endif
+#endif
+
     fclose(cpuinfo);
   }
   return flags;
 }
+
+#elif defined(__APPLE__)
+#include <sys/types.h>
+#include <sys/sysctl.h>
+
+opus_uint32 opus_cpu_capabilities(void)
+{
+  opus_uint32 flags = 0;
+
+#if defined(OPUS_ARM_MAY_HAVE_DOTPROD)
+  size_t size = sizeof(uint32_t);
+  uint32_t value = 0;
+  if (!sysctlbyname("hw.optional.arm.FEAT_DotProd", &value, &size, NULL, 0) && value)
+  {
+    flags |= OPUS_CPU_ARM_DOTPROD_FLAG;
+  }
+#endif
+
+#if defined(OPUS_ARM_PRESUME_AARCH64_NEON_INTR)
+  flags |= OPUS_CPU_ARM_EDSP_FLAG | OPUS_CPU_ARM_MEDIA_FLAG | OPUS_CPU_ARM_NEON_FLAG;
+# if defined(OPUS_ARM_PRESUME_DOTPROD)
+  flags |= OPUS_CPU_ARM_DOTPROD_FLAG;
+# endif
+#endif
+  return flags;
+}
+
 #else
 /* The feature registers which can tell us what the processor supports are
  * accessible in priveleged modes only, so we can't have a general user-space
@@ -154,7 +199,7 @@ opus_uint32 opus_cpu_capabilities(void)
    "your platform.  Reconfigure with --disable-rtcd (or send patches)."
 #endif
 
-int opus_select_arch(void)
+static int opus_select_arch_impl(void)
 {
   opus_uint32 flags = opus_cpu_capabilities();
   int arch = 0;
@@ -178,8 +223,21 @@ int opus_select_arch(void)
   }
   arch++;
 
-  celt_assert(arch == OPUS_ARCH_ARM_NEON);
+  if(!(flags & OPUS_CPU_ARM_DOTPROD_FLAG)) {
+    celt_assert(arch == OPUS_ARCH_ARM_NEON);
+    return arch;
+  }
+  arch++;
+
+  celt_assert(arch == OPUS_ARCH_ARM_DOTPROD);
   return arch;
 }
 
+int opus_select_arch(void) {
+  int arch = opus_select_arch_impl();
+#ifdef FUZZING
+  arch = rand()%(arch+1);
+#endif
+  return arch;
+}
 #endif
diff --git a/opus/celt/arm/armcpu.h b/opus/celt/arm/armcpu.h
index 820262ff..6d5803d8 100644
--- a/opus/celt/arm/armcpu.h
+++ b/opus/celt/arm/armcpu.h
@@ -46,6 +46,12 @@
 #  define MAY_HAVE_NEON(name) MAY_HAVE_MEDIA(name)
 # endif
 
+# if defined(OPUS_ARM_MAY_HAVE_DOTPROD)
+#  define MAY_HAVE_DOTPROD(name) name ## _dotprod
+# else
+#  define MAY_HAVE_DOTPROD(name) MAY_HAVE_NEON(name)
+# endif
+
 # if defined(OPUS_ARM_PRESUME_EDSP)
 #  define PRESUME_EDSP(name) name ## _edsp
 # else
@@ -64,6 +70,12 @@
 #  define PRESUME_NEON(name) PRESUME_MEDIA(name)
 # endif
 
+# if defined(OPUS_ARM_PRESUME_DOTPROD)
+#  define PRESUME_DOTPROD(name) name ## _dotprod
+# else
+#  define PRESUME_DOTPROD(name) PRESUME_NEON(name)
+# endif
+
 # if defined(OPUS_HAVE_RTCD)
 int opus_select_arch(void);
 
@@ -71,6 +83,7 @@ int opus_select_arch(void);
 #define OPUS_ARCH_ARM_EDSP  (1)
 #define OPUS_ARCH_ARM_MEDIA (2)
 #define OPUS_ARCH_ARM_NEON  (3)
+#define OPUS_ARCH_ARM_DOTPROD  (4)
 
 # endif
 
diff --git a/opus/celt/arm/celt_neon_intr.c b/opus/celt/arm/celt_neon_intr.c
index effda769..250f8362 100644
--- a/opus/celt/arm/celt_neon_intr.c
+++ b/opus/celt/arm/celt_neon_intr.c
@@ -38,6 +38,8 @@
 #include "../pitch.h"
 
 #if defined(FIXED_POINT)
+#include <string.h>
+
 void xcorr_kernel_neon_fixed(const opus_val16 * x, const opus_val16 * y, opus_val32 sum[4], int len)
 {
    int j;
@@ -47,7 +49,10 @@ void xcorr_kernel_neon_fixed(const opus_val16 * x, const opus_val16 * y, opus_va
    int16x4_t y0 = vld1_s16(y);
    y += 4;
 
-   for (j = 0; j + 8 <= len; j += 8)
+   /* This loop loads one y value more than we actually need.
+      Therefore we have to stop as soon as there are 8 or fewer samples left
+       (instead of 7), to avoid reading past the end of the array. */
+   for (j = 0; j + 8 < len; j += 8)
    {
       /* Load x[0...7] */
       int16x8_t xx = vld1q_s16(x);
@@ -80,23 +85,79 @@ void xcorr_kernel_neon_fixed(const opus_val16 * x, const opus_val16 * y, opus_va
       x += 8;
       y += 8;
    }
-
-   for (; j < len; j++)
-   {
-      int16x4_t x0 = vld1_dup_s16(x);  /* load next x */
+   if (j + 4 < len) {
+      /* Load x[0...3] */
+      int16x4_t x0 = vld1_s16(x);
+      /* Load y[4...7] */
+      int16x4_t y4 = vld1_s16(y);
+      int32x4_t a0 = vmlal_lane_s16(a, y0, x0, 0);
+      int16x4_t y1 = vext_s16(y0, y4, 1);
+      int32x4_t a1 = vmlal_lane_s16(a0, y1, x0, 1);
+      int16x4_t y2 = vext_s16(y0, y4, 2);
+      int32x4_t a2 = vmlal_lane_s16(a1, y2, x0, 2);
+      int16x4_t y3 = vext_s16(y0, y4, 3);
+      int32x4_t a3 = vmlal_lane_s16(a2, y3, x0, 3);
+      y0 = y4;
+      a = a3;
+      x += 4;
+      y += 4;
+      j += 4;
+   }
+   if (j + 2 < len) {
+      /* Load x[0...1] */
+      int16x4x2_t xx = vld2_dup_s16(x);
+      int16x4_t x0 = xx.val[0];
+      int16x4_t x1 = xx.val[1];
+      /* Load y[4...5].
+         We would like to use vld1_dup_s32(), but casting the pointer would
+          break strict aliasing rules and potentially have alignment issues.
+         Fortunately the compiler seems capable of translating this memcpy()
+          and vdup_n_s32() into the equivalent vld1_dup_s32().*/
+      int32_t yy;
+      memcpy(&yy, y, sizeof(yy));
+      int16x4_t y4 = vreinterpret_s16_s32(vdup_n_s32(yy));
       int32x4_t a0 = vmlal_s16(a, y0, x0);
-
-      int16x4_t y4 = vld1_dup_s16(y);  /* load next y */
-      y0 = vext_s16(y0, y4, 1);
+      int16x4_t y1 = vext_s16(y0, y4, 1);
+      /* Replace bottom copy of {y[5], y[4]} in y4 with {y[3], y[2]} from y0,
+          using VSRI instead of VEXT, since it's a data-processing
+          instruction. */
+      y0 = vreinterpret_s16_s64(vsri_n_s64(vreinterpret_s64_s16(y4),
+       vreinterpret_s64_s16(y0), 32));
+      int32x4_t a1 = vmlal_s16(a0, y1, x1);
+      a = a1;
+      x += 2;
+      y += 2;
+      j += 2;
+   }
+   if (j + 1 < len) {
+      /* Load next x. */
+      int16x4_t x0 = vld1_dup_s16(x);
+      int32x4_t a0 = vmlal_s16(a, y0, x0);
+      /* Load last y. */
+      int16x4_t y4 = vld1_dup_s16(y);
+      y0 = vreinterpret_s16_s64(vsri_n_s64(vreinterpret_s64_s16(y4),
+       vreinterpret_s64_s16(y0), 16));
       a = a0;
       x++;
-      y++;
    }
-
-   vst1q_s32(sum, a);
+   /* Load last x. */
+   int16x4_t x0 = vld1_dup_s16(x);
+   int32x4_t a0 = vmlal_s16(a, y0, x0);
+   vst1q_s32(sum, a0);
 }
 
 #else
+
+#if defined(__ARM_FEATURE_FMA) && defined(__ARM_ARCH_ISA_A64)
+/* If we can, force the compiler to use an FMA instruction rather than break
+ *    vmlaq_lane_f32() into fmul/fadd. */
+#ifdef vmlaq_lane_f32
+#undef vmlaq_lane_f32
+#endif
+#define vmlaq_lane_f32(a,b,c,lane) vfmaq_lane_f32(a,b,c,lane)
+#endif
+
+
 /*
  * Function: xcorr_kernel_neon_float
  * ---------------------------------
diff --git a/opus/celt/arm/pitch_neon_intr.c b/opus/celt/arm/pitch_neon_intr.c
index 1ac38c43..43885f52 100644
--- a/opus/celt/arm/pitch_neon_intr.c
+++ b/opus/celt/arm/pitch_neon_intr.c
@@ -130,6 +130,13 @@ void dual_inner_prod_neon(const opus_val16 *x, const opus_val16 *y01, const opus
 
 /* ========================================================================== */
 
+#ifdef __ARM_FEATURE_FMA
+/* If we can, force the compiler to use an FMA instruction rather than break
+   vmlaq_f32() into fmul/fadd. */
+#define vmlaq_f32(a,b,c) vfmaq_f32(a,b,c)
+#endif
+
+
 #ifdef OPUS_CHECK_ASM
 
 /* This part of code simulates floating-point NEON operations. */
@@ -137,22 +144,27 @@ void dual_inner_prod_neon(const opus_val16 *x, const opus_val16 *y01, const opus
 /* celt_inner_prod_neon_float_c_simulation() simulates the floating-point   */
 /* operations of celt_inner_prod_neon(), and both functions should have bit */
 /* exact output.                                                            */
-static opus_val32 celt_inner_prod_neon_float_c_simulation(const opus_val16 *x, const opus_val16 *y, int N)
+static opus_val32 celt_inner_prod_neon_float_c_simulation(const opus_val16 *x, const opus_val16 *y, float *err, int N)
 {
    int i;
+   *err = 0;
    opus_val32 xy, xy0 = 0, xy1 = 0, xy2 = 0, xy3 = 0;
    for (i = 0; i < N - 3; i += 4) {
       xy0 = MAC16_16(xy0, x[i + 0], y[i + 0]);
       xy1 = MAC16_16(xy1, x[i + 1], y[i + 1]);
       xy2 = MAC16_16(xy2, x[i + 2], y[i + 2]);
       xy3 = MAC16_16(xy3, x[i + 3], y[i + 3]);
+      *err += ABS32(xy0)+ABS32(xy1)+ABS32(xy2)+ABS32(xy3);
    }
    xy0 += xy2;
    xy1 += xy3;
    xy = xy0 + xy1;
+   *err += ABS32(xy1)+ABS32(xy0)+ABS32(xy);
    for (; i < N; i++) {
       xy = MAC16_16(xy, x[i], y[i]);
+      *err += ABS32(xy);
    }
+   *err = *err*2e-7 + N*1e-37;
    return xy;
 }
 
@@ -160,32 +172,10 @@ static opus_val32 celt_inner_prod_neon_float_c_simulation(const opus_val16 *x, c
 /* operations of dual_inner_prod_neon(), and both functions should have bit */
 /* exact output.                                                            */
 static void dual_inner_prod_neon_float_c_simulation(const opus_val16 *x, const opus_val16 *y01, const opus_val16 *y02,
-      int N, opus_val32 *xy1, opus_val32 *xy2)
+      int N, opus_val32 *xy1, opus_val32 *xy2, float *err)
 {
-   int i;
-   opus_val32 xy01, xy02, xy01_0 = 0, xy01_1 = 0, xy01_2 = 0, xy01_3 = 0, xy02_0 = 0, xy02_1 = 0, xy02_2 = 0, xy02_3 = 0;
-   for (i = 0; i < N - 3; i += 4) {
-      xy01_0 = MAC16_16(xy01_0, x[i + 0], y01[i + 0]);
-      xy01_1 = MAC16_16(xy01_1, x[i + 1], y01[i + 1]);
-      xy01_2 = MAC16_16(xy01_2, x[i + 2], y01[i + 2]);
-      xy01_3 = MAC16_16(xy01_3, x[i + 3], y01[i + 3]);
-      xy02_0 = MAC16_16(xy02_0, x[i + 0], y02[i + 0]);
-      xy02_1 = MAC16_16(xy02_1, x[i + 1], y02[i + 1]);
-      xy02_2 = MAC16_16(xy02_2, x[i + 2], y02[i + 2]);
-      xy02_3 = MAC16_16(xy02_3, x[i + 3], y02[i + 3]);
-   }
-   xy01_0 += xy01_2;
-   xy02_0 += xy02_2;
-   xy01_1 += xy01_3;
-   xy02_1 += xy02_3;
-   xy01 = xy01_0 + xy01_1;
-   xy02 = xy02_0 + xy02_1;
-   for (; i < N; i++) {
-      xy01 = MAC16_16(xy01, x[i], y01[i]);
-      xy02 = MAC16_16(xy02, x[i], y02[i]);
-   }
-   *xy1 = xy01;
-   *xy2 = xy02;
+   *xy1 = celt_inner_prod_neon_float_c_simulation(x, y01, &err[0], N);
+   *xy2 = celt_inner_prod_neon_float_c_simulation(x, y02, &err[1], N);
 }
 
 #endif /* OPUS_CHECK_ASM */
@@ -225,7 +215,12 @@ opus_val32 celt_inner_prod_neon(const opus_val16 *x, const opus_val16 *y, int N)
     }
 
 #ifdef OPUS_CHECK_ASM
-    celt_assert(ABS32(celt_inner_prod_neon_float_c_simulation(x, y, N) - xy) <= VERY_SMALL);
+    {
+        float err, res;
+        res = celt_inner_prod_neon_float_c_simulation(x, y, &err, N);
+        /*if (ABS32(res - xy) > err) fprintf(stderr, "%g %g %g\n", res, xy, err);*/
+        celt_assert(ABS32(res - xy) <= err);
+    }
 #endif
 
     return xy;
@@ -280,9 +275,12 @@ void dual_inner_prod_neon(const opus_val16 *x, const opus_val16 *y01, const opus
 #ifdef OPUS_CHECK_ASM
     {
         opus_val32 xy1_c, xy2_c;
-        dual_inner_prod_neon_float_c_simulation(x, y01, y02, N, &xy1_c, &xy2_c);
-        celt_assert(ABS32(xy1_c - *xy1) <= VERY_SMALL);
-        celt_assert(ABS32(xy2_c - *xy2) <= VERY_SMALL);
+        float err[2];
+        dual_inner_prod_neon_float_c_simulation(x, y01, y02, N, &xy1_c, &xy2_c, err);
+        /*if (ABS32(xy1_c - *xy1) > err[0]) fprintf(stderr, "dual1 fail: %g %g %g\n", xy1_c, *xy1, err[0]);
+        if (ABS32(xy2_c - *xy2) > err[1]) fprintf(stderr, "dual2 fail: %g %g %g\n", xy2_c, *xy2, err[1]);*/
+        celt_assert(ABS32(xy1_c - *xy1) <= err[0]);
+        celt_assert(ABS32(xy2_c - *xy2) <= err[1]);
     }
 #endif
 }
diff --git a/opus/celt/bands.c b/opus/celt/bands.c
index 2702963c..6785e08e 100644
--- a/opus/celt/bands.c
+++ b/opus/celt/bands.c
@@ -901,7 +901,7 @@ static void compute_theta(struct band_ctx *ctx, struct split_ctx *sctx,
    sctx->itheta = itheta;
    sctx->qalloc = qalloc;
 }
-static unsigned quant_band_n1(struct band_ctx *ctx, celt_norm *X, celt_norm *Y, int b,
+static unsigned quant_band_n1(struct band_ctx *ctx, celt_norm *X, celt_norm *Y,
       celt_norm *lowband_out)
 {
    int c;
@@ -926,7 +926,6 @@ static unsigned quant_band_n1(struct band_ctx *ctx, celt_norm *X, celt_norm *Y,
             sign = ec_dec_bits(ec, 1);
          }
          ctx->remaining_bits -= 1<<BITRES;
-         b-=1<<BITRES;
       }
       if (ctx->resynth)
          x[0] = sign ? -NORM_SCALING : NORM_SCALING;
@@ -1134,7 +1133,7 @@ static unsigned quant_band(struct band_ctx *ctx, celt_norm *X,
    /* Special case for one sample */
    if (N==1)
    {
-      return quant_band_n1(ctx, X, NULL, b, lowband_out);
+      return quant_band_n1(ctx, X, NULL, lowband_out);
    }
 
    if (tf_change>0)
@@ -1256,7 +1255,7 @@ static unsigned quant_band_stereo(struct band_ctx *ctx, celt_norm *X, celt_norm
    /* Special case for one sample */
    if (N==1)
    {
-      return quant_band_n1(ctx, X, Y, b, lowband_out);
+      return quant_band_n1(ctx, X, Y, lowband_out);
    }
 
    orig_fill = fill;
@@ -1381,6 +1380,7 @@ static unsigned quant_band_stereo(struct band_ctx *ctx, celt_norm *X, celt_norm
    return cm;
 }
 
+#ifndef DISABLE_UPDATE_DRAFT
 static void special_hybrid_folding(const CELTMode *m, celt_norm *norm, celt_norm *norm2, int start, int M, int dual_stereo)
 {
    int n1, n2;
@@ -1393,6 +1393,7 @@ static void special_hybrid_folding(const CELTMode *m, celt_norm *norm, celt_norm
    if (dual_stereo)
       OPUS_COPY(&norm2[n1], &norm2[2*n1 - n2], n2-n1);
 }
+#endif
 
 void quant_all_bands(int encode, const CELTMode *m, int start, int end,
       celt_norm *X_, celt_norm *Y_, unsigned char *collapse_masks,
@@ -1449,7 +1450,7 @@ void quant_all_bands(int encode, const CELTMode *m, int start, int end,
    if (encode && resynth)
       lowband_scratch = _lowband_scratch;
    else
-      lowband_scratch = X_+M*eBands[m->nbEBands-1];
+      lowband_scratch = X_+M*eBands[m->effEBands-1];
    ALLOC(X_save, resynth_alloc, celt_norm);
    ALLOC(Y_save, resynth_alloc, celt_norm);
    ALLOC(X_save2, resynth_alloc, celt_norm);
diff --git a/opus/celt/celt.h b/opus/celt/celt.h
index 24b6b2b5..2f501951 100644
--- a/opus/celt/celt.h
+++ b/opus/celt/celt.h
@@ -42,6 +42,10 @@
 #include "entdec.h"
 #include "arch.h"
 
+#ifdef ENABLE_DEEP_PLC
+#include "lpcnet.h"
+#endif
+
 #ifdef __cplusplus
 extern "C" {
 #endif
@@ -149,6 +153,13 @@ int celt_decoder_get_size(int channels);
 
 int celt_decoder_init(CELTDecoder *st, opus_int32 sampling_rate, int channels);
 
+int celt_decode_with_ec_dred(CELTDecoder * OPUS_RESTRICT st, const unsigned char *data,
+      int len, opus_val16 * OPUS_RESTRICT pcm, int frame_size, ec_dec *dec, int accum
+#ifdef ENABLE_DEEP_PLC
+      ,LPCNetPLCState *lpcnet
+#endif
+      );
+
 int celt_decode_with_ec(OpusCustomDecoder * OPUS_RESTRICT st, const unsigned char *data,
       int len, opus_val16 * OPUS_RESTRICT pcm, int frame_size, ec_dec *dec, int accum);
 
@@ -225,23 +236,13 @@ void comb_filter(opus_val32 *y, opus_val32 *x, int T0, int T1, int N,
       opus_val16 g0, opus_val16 g1, int tapset0, int tapset1,
       const opus_val16 *window, int overlap, int arch);
 
-#ifdef NON_STATIC_COMB_FILTER_CONST_C
-void comb_filter_const_c(opus_val32 *y, opus_val32 *x, int T, int N,
-                         opus_val16 g10, opus_val16 g11, opus_val16 g12);
-#endif
-
-#ifndef OVERRIDE_COMB_FILTER_CONST
-# define comb_filter_const(y, x, T, N, g10, g11, g12, arch) \
-    ((void)(arch),comb_filter_const_c(y, x, T, N, g10, g11, g12))
-#endif
-
 void init_caps(const CELTMode *m,int *cap,int LM,int C);
 
 #ifdef RESYNTH
-void deemphasis(celt_sig *in[], opus_val16 *pcm, int N, int C, int downsample, const opus_val16 *coef, celt_sig *mem);
+void deemphasis(celt_sig *in[], opus_val16 *pcm, int N, int C, int downsample, const opus_val16 *coef, celt_sig *mem, int accum);
 void celt_synthesis(const CELTMode *mode, celt_norm *X, celt_sig * out_syn[],
       opus_val16 *oldBandE, int start, int effEnd, int C, int CC, int isTransient,
-      int LM, int downsample, int silence);
+      int LM, int downsample, int silence, int arch);
 #endif
 
 #ifdef __cplusplus
diff --git a/opus/celt/celt_decoder.c b/opus/celt/celt_decoder.c
index e6efce93..743c2031 100644
--- a/opus/celt/celt_decoder.c
+++ b/opus/celt/celt_decoder.c
@@ -51,6 +51,11 @@
 #include "celt_lpc.h"
 #include "vq.h"
 
+#ifdef ENABLE_DEEP_PLC
+#include "lpcnet.h"
+#include "lpcnet_private.h"
+#endif
+
 /* The maximum pitch lag to allow in the pitch-based PLC. It's possible to save
    CPU time in the PLC pitch search by making this smaller than MAX_PERIOD. The
    current value corresponds to a pitch of 66.67 Hz. */
@@ -59,9 +64,6 @@
    pitch of 480 Hz. */
 #define PLC_PITCH_LAG_MIN (100)
 
-#if defined(SMALL_FOOTPRINT) && defined(FIXED_POINT)
-#define NORM_ALIASING_HACK
-#endif
 /**********************************************************************/
 /*                                                                    */
 /*                             DECODER                                */
@@ -69,6 +71,9 @@
 /**********************************************************************/
 #define DECODE_BUFFER_SIZE 2048
 
+#define PLC_UPDATE_FRAMES 4
+#define PLC_UPDATE_SAMPLES (PLC_UPDATE_FRAMES*FRAME_SIZE)
+
 /** Decoder state
  @brief Decoder state
  */
@@ -82,6 +87,7 @@ struct OpusCustomDecoder {
    int start, end;
    int signalling;
    int disable_inv;
+   int complexity;
    int arch;
 
    /* Everything beyond this point gets cleared on a reset */
@@ -90,7 +96,7 @@ struct OpusCustomDecoder {
    opus_uint32 rng;
    int error;
    int last_pitch_index;
-   int loss_count;
+   int loss_duration;
    int skip_plc;
    int postfilter_period;
    int postfilter_period_old;
@@ -98,11 +104,18 @@ struct OpusCustomDecoder {
    opus_val16 postfilter_gain_old;
    int postfilter_tapset;
    int postfilter_tapset_old;
+   int prefilter_and_fold;
 
    celt_sig preemph_memD[2];
 
+#ifdef ENABLE_DEEP_PLC
+   opus_int16 plc_pcm[PLC_UPDATE_SAMPLES];
+   int plc_fill;
+   float plc_preemphasis_mem;
+#endif
+
    celt_sig _decode_mem[1]; /* Size = channels*(DECODE_BUFFER_SIZE+mode->overlap) */
-   /* opus_val16 lpc[],  Size = channels*LPC_ORDER */
+   /* opus_val16 lpc[],  Size = channels*CELT_LPC_ORDER */
    /* opus_val16 oldEBands[], Size = 2*mode->nbEBands */
    /* opus_val16 oldLogE[], Size = 2*mode->nbEBands */
    /* opus_val16 oldLogE2[], Size = 2*mode->nbEBands */
@@ -117,13 +130,19 @@ void validate_celt_decoder(CELTDecoder *st)
 #ifndef CUSTOM_MODES
    celt_assert(st->mode == opus_custom_mode_create(48000, 960, NULL));
    celt_assert(st->overlap == 120);
+   celt_assert(st->end <= 21);
+#else
+/* From Section 4.3 in the spec: "The normal CELT layer uses 21 of those bands,
+   though Opus Custom (see Section 6.2) may use a different number of bands"
+
+   Check if it's within the maximum number of Bark frequency bands instead */
+   celt_assert(st->end <= 25);
 #endif
    celt_assert(st->channels == 1 || st->channels == 2);
    celt_assert(st->stream_channels == 1 || st->stream_channels == 2);
    celt_assert(st->downsample > 0);
    celt_assert(st->start == 0 || st->start == 17);
    celt_assert(st->start < st->end);
-   celt_assert(st->end <= 21);
 #ifdef OPUS_ARCHMASK
    celt_assert(st->arch >= 0);
    celt_assert(st->arch <= OPUS_ARCHMASK);
@@ -151,7 +170,7 @@ OPUS_CUSTOM_NOSTATIC int opus_custom_decoder_get_size(const CELTMode *mode, int
 {
    int size = sizeof(struct CELTDecoder)
             + (channels*(DECODE_BUFFER_SIZE+mode->overlap)-1)*sizeof(celt_sig)
-            + channels*LPC_ORDER*sizeof(opus_val16)
+            + channels*CELT_LPC_ORDER*sizeof(opus_val16)
             + 4*2*mode->nbEBands*sizeof(opus_val16);
    return size;
 }
@@ -493,7 +512,100 @@ static int celt_plc_pitch_search(celt_sig *decode_mem[2], int C, int arch)
    return pitch_index;
 }
 
-static void celt_decode_lost(CELTDecoder * OPUS_RESTRICT st, int N, int LM)
+static void prefilter_and_fold(CELTDecoder * OPUS_RESTRICT st, int N)
+{
+   int c;
+   int CC;
+   int i;
+   int overlap;
+   celt_sig *decode_mem[2];
+   const OpusCustomMode *mode;
+   VARDECL(opus_val32, etmp);
+   mode = st->mode;
+   overlap = st->overlap;
+   CC = st->channels;
+   ALLOC(etmp, overlap, opus_val32);
+   c=0; do {
+      decode_mem[c] = st->_decode_mem + c*(DECODE_BUFFER_SIZE+overlap);
+   } while (++c<CC);
+
+   c=0; do {
+      /* Apply the pre-filter to the MDCT overlap for the next frame because
+         the post-filter will be re-applied in the decoder after the MDCT
+         overlap. */
+      comb_filter(etmp, decode_mem[c]+DECODE_BUFFER_SIZE-N,
+         st->postfilter_period_old, st->postfilter_period, overlap,
+         -st->postfilter_gain_old, -st->postfilter_gain,
+         st->postfilter_tapset_old, st->postfilter_tapset, NULL, 0, st->arch);
+
+      /* Simulate TDAC on the concealed audio so that it blends with the
+         MDCT of the next frame. */
+      for (i=0;i<overlap/2;i++)
+      {
+         decode_mem[c][DECODE_BUFFER_SIZE-N+i] =
+            MULT16_32_Q15(mode->window[i], etmp[overlap-1-i])
+            + MULT16_32_Q15(mode->window[overlap-i-1], etmp[i]);
+      }
+   } while (++c<CC);
+}
+
+#ifdef ENABLE_DEEP_PLC
+
+#define SINC_ORDER 48
+/* h=cos(pi/2*abs(sin([-24:24]/48*pi*23./24)).^2);
+   b=sinc([-24:24]/3*1.02).*h;
+   b=b/sum(b); */
+static const float sinc_filter[SINC_ORDER+1] = {
+    4.2931e-05f, -0.000190293f, -0.000816132f, -0.000637162f, 0.00141662f, 0.00354764f, 0.00184368f, -0.00428274f,
+    -0.00856105f, -0.0034003f, 0.00930201f, 0.0159616f, 0.00489785f, -0.0169649f, -0.0259484f, -0.00596856f,
+    0.0286551f, 0.0405872f, 0.00649994f, -0.0509284f, -0.0716655f, -0.00665212f,  0.134336f,  0.278927f,
+    0.339995f,  0.278927f,  0.134336f, -0.00665212f, -0.0716655f, -0.0509284f, 0.00649994f, 0.0405872f,
+    0.0286551f, -0.00596856f, -0.0259484f, -0.0169649f, 0.00489785f, 0.0159616f, 0.00930201f, -0.0034003f,
+    -0.00856105f, -0.00428274f, 0.00184368f, 0.00354764f, 0.00141662f, -0.000637162f, -0.000816132f, -0.000190293f,
+    4.2931e-05f
+};
+
+void update_plc_state(LPCNetPLCState *lpcnet, celt_sig *decode_mem[2], float *plc_preemphasis_mem, int CC)
+{
+   int i;
+   int tmp_read_post, tmp_fec_skip;
+   int offset;
+   celt_sig buf48k[DECODE_BUFFER_SIZE];
+   opus_int16 buf16k[PLC_UPDATE_SAMPLES];
+   if (CC == 1) OPUS_COPY(buf48k, decode_mem[0], DECODE_BUFFER_SIZE);
+   else {
+      for (i=0;i<DECODE_BUFFER_SIZE;i++) {
+         buf48k[i] = .5*(decode_mem[0][i] + decode_mem[1][i]);
+      }
+   }
+   /* Down-sample the last 40 ms. */
+   for (i=1;i<DECODE_BUFFER_SIZE;i++) buf48k[i] += PREEMPHASIS*buf48k[i-1];
+   *plc_preemphasis_mem = buf48k[DECODE_BUFFER_SIZE-1];
+   offset = DECODE_BUFFER_SIZE-SINC_ORDER-1 - 3*(PLC_UPDATE_SAMPLES-1);
+   celt_assert(3*(PLC_UPDATE_SAMPLES-1) + SINC_ORDER + offset == DECODE_BUFFER_SIZE-1);
+   for (i=0;i<PLC_UPDATE_SAMPLES;i++) {
+      int j;
+      float sum = 0;
+      for (j=0;j<SINC_ORDER+1;j++) {
+         sum += buf48k[3*i + j + offset]*sinc_filter[j];
+      }
+      buf16k[i] = float2int(MIN32(32767.f, MAX32(-32767.f, sum)));
+   }
+   tmp_read_post = lpcnet->fec_read_pos;
+   tmp_fec_skip = lpcnet->fec_skip;
+   for (i=0;i<PLC_UPDATE_FRAMES;i++) {
+      lpcnet_plc_update(lpcnet, &buf16k[FRAME_SIZE*i]);
+   }
+   lpcnet->fec_read_pos = tmp_read_post;
+   lpcnet->fec_skip = tmp_fec_skip;
+}
+#endif
+
+static void celt_decode_lost(CELTDecoder * OPUS_RESTRICT st, int N, int LM
+#ifdef ENABLE_DEEP_PLC
+      ,LPCNetPLCState *lpcnet
+#endif
+      )
 {
    int c;
    int i;
@@ -506,7 +618,7 @@ static void celt_decode_lost(CELTDecoder * OPUS_RESTRICT st, int N, int LM)
    int nbEBands;
    int overlap;
    int start;
-   int loss_count;
+   int loss_duration;
    int noise_based;
    const opus_int16 *eBands;
    SAVE_STACK;
@@ -521,22 +633,22 @@ static void celt_decode_lost(CELTDecoder * OPUS_RESTRICT st, int N, int LM)
       out_syn[c] = decode_mem[c]+DECODE_BUFFER_SIZE-N;
    } while (++c<C);
    lpc = (opus_val16*)(st->_decode_mem+(DECODE_BUFFER_SIZE+overlap)*C);
-   oldBandE = lpc+C*LPC_ORDER;
+   oldBandE = lpc+C*CELT_LPC_ORDER;
    oldLogE = oldBandE + 2*nbEBands;
    oldLogE2 = oldLogE + 2*nbEBands;
    backgroundLogE = oldLogE2  + 2*nbEBands;
 
-   loss_count = st->loss_count;
+   loss_duration = st->loss_duration;
    start = st->start;
-   noise_based = loss_count >= 5 || start != 0 || st->skip_plc;
+#ifdef ENABLE_DEEP_PLC
+   noise_based = start != 0 || (lpcnet->fec_fill_pos == 0 && (st->skip_plc || loss_duration >= 80));
+#else
+   noise_based = loss_duration >= 40 || start != 0 || st->skip_plc;
+#endif
    if (noise_based)
    {
       /* Noise-based PLC/CNG */
-#ifdef NORM_ALIASING_HACK
-      celt_norm *X;
-#else
       VARDECL(celt_norm, X);
-#endif
       opus_uint32 seed;
       int end;
       int effEnd;
@@ -544,16 +656,18 @@ static void celt_decode_lost(CELTDecoder * OPUS_RESTRICT st, int N, int LM)
       end = st->end;
       effEnd = IMAX(start, IMIN(end, mode->effEBands));
 
-#ifdef NORM_ALIASING_HACK
-      /* This is an ugly hack that breaks aliasing rules and would be easily broken,
-         but it saves almost 4kB of stack. */
-      X = (celt_norm*)(out_syn[C-1]+overlap/2);
-#else
       ALLOC(X, C*N, celt_norm);   /**< Interleaved normalised MDCTs */
-#endif
+      c=0; do {
+         OPUS_MOVE(decode_mem[c], decode_mem[c]+N,
+               DECODE_BUFFER_SIZE-N+overlap);
+      } while (++c<C);
+
+      if (st->prefilter_and_fold) {
+         prefilter_and_fold(st, N);
+      }
 
       /* Energy decay */
-      decay = loss_count==0 ? QCONST16(1.5f, DB_SHIFT) : QCONST16(.5f, DB_SHIFT);
+      decay = loss_duration==0 ? QCONST16(1.5f, DB_SHIFT) : QCONST16(.5f, DB_SHIFT);
       c=0; do
       {
          for (i=start;i<end;i++)
@@ -579,12 +693,10 @@ static void celt_decode_lost(CELTDecoder * OPUS_RESTRICT st, int N, int LM)
       }
       st->rng = seed;
 
-      c=0; do {
-         OPUS_MOVE(decode_mem[c], decode_mem[c]+N,
-               DECODE_BUFFER_SIZE-N+(overlap>>1));
-      } while (++c<C);
-
       celt_synthesis(mode, X, out_syn, oldBandE, start, effEnd, C, C, 0, LM, st->downsample, 0, st->arch);
+      st->prefilter_and_fold = 0;
+      /* Skip regular PLC until we get two consecutive packets. */
+      st->skip_plc = 1;
    } else {
       int exc_length;
       /* Pitch-based PLC */
@@ -592,12 +704,14 @@ static void celt_decode_lost(CELTDecoder * OPUS_RESTRICT st, int N, int LM)
       opus_val16 *exc;
       opus_val16 fade = Q15ONE;
       int pitch_index;
-      VARDECL(opus_val32, etmp);
       VARDECL(opus_val16, _exc);
       VARDECL(opus_val16, fir_tmp);
 
-      if (loss_count == 0)
+      if (loss_duration == 0)
       {
+#ifdef ENABLE_DEEP_PLC
+        if (lpcnet->loaded) update_plc_state(lpcnet, decode_mem, &st->plc_preemphasis_mem, C);
+#endif
          st->last_pitch_index = pitch_index = celt_plc_pitch_search(decode_mem, C, st->arch);
       } else {
          pitch_index = st->last_pitch_index;
@@ -608,10 +722,9 @@ static void celt_decode_lost(CELTDecoder * OPUS_RESTRICT st, int N, int LM)
          decaying signal, but we can't get more than MAX_PERIOD. */
       exc_length = IMIN(2*pitch_index, MAX_PERIOD);
 
-      ALLOC(etmp, overlap, opus_val32);
-      ALLOC(_exc, MAX_PERIOD+LPC_ORDER, opus_val16);
+      ALLOC(_exc, MAX_PERIOD+CELT_LPC_ORDER, opus_val16);
       ALLOC(fir_tmp, exc_length, opus_val16);
-      exc = _exc+LPC_ORDER;
+      exc = _exc+CELT_LPC_ORDER;
       window = mode->window;
       c=0; do {
          opus_val16 decay;
@@ -623,16 +736,16 @@ static void celt_decode_lost(CELTDecoder * OPUS_RESTRICT st, int N, int LM)
          int j;
 
          buf = decode_mem[c];
-         for (i=0;i<MAX_PERIOD+LPC_ORDER;i++)
-            exc[i-LPC_ORDER] = ROUND16(buf[DECODE_BUFFER_SIZE-MAX_PERIOD-LPC_ORDER+i], SIG_SHIFT);
+         for (i=0;i<MAX_PERIOD+CELT_LPC_ORDER;i++)
+            exc[i-CELT_LPC_ORDER] = SROUND16(buf[DECODE_BUFFER_SIZE-MAX_PERIOD-CELT_LPC_ORDER+i], SIG_SHIFT);
 
-         if (loss_count == 0)
+         if (loss_duration == 0)
          {
-            opus_val32 ac[LPC_ORDER+1];
+            opus_val32 ac[CELT_LPC_ORDER+1];
             /* Compute LPC coefficients for the last MAX_PERIOD samples before
                the first loss so we can work in the excitation-filter domain. */
             _celt_autocorr(exc, ac, window, overlap,
-                   LPC_ORDER, MAX_PERIOD, st->arch);
+                   CELT_LPC_ORDER, MAX_PERIOD, st->arch);
             /* Add a noise floor of -40 dB. */
 #ifdef FIXED_POINT
             ac[0] += SHR32(ac[0],13);
@@ -640,7 +753,7 @@ static void celt_decode_lost(CELTDecoder * OPUS_RESTRICT st, int N, int LM)
             ac[0] *= 1.0001f;
 #endif
             /* Use lag windowing to stabilize the Levinson-Durbin recursion. */
-            for (i=1;i<=LPC_ORDER;i++)
+            for (i=1;i<=CELT_LPC_ORDER;i++)
             {
                /*ac[i] *= exp(-.5*(2*M_PI*.002*i)*(2*M_PI*.002*i));*/
 #ifdef FIXED_POINT
@@ -649,7 +762,7 @@ static void celt_decode_lost(CELTDecoder * OPUS_RESTRICT st, int N, int LM)
                ac[i] -= ac[i]*(0.008f*0.008f)*i*i;
 #endif
             }
-            _celt_lpc(lpc+c*LPC_ORDER, ac, LPC_ORDER);
+            _celt_lpc(lpc+c*CELT_LPC_ORDER, ac, CELT_LPC_ORDER);
 #ifdef FIXED_POINT
          /* For fixed-point, apply bandwidth expansion until we can guarantee that
             no overflow can happen in the IIR filter. This means:
@@ -657,13 +770,13 @@ static void celt_decode_lost(CELTDecoder * OPUS_RESTRICT st, int N, int LM)
          while (1) {
             opus_val16 tmp=Q15ONE;
             opus_val32 sum=QCONST16(1., SIG_SHIFT);
-            for (i=0;i<LPC_ORDER;i++)
-               sum += ABS16(lpc[c*LPC_ORDER+i]);
+            for (i=0;i<CELT_LPC_ORDER;i++)
+               sum += ABS16(lpc[c*CELT_LPC_ORDER+i]);
             if (sum < 65535) break;
-            for (i=0;i<LPC_ORDER;i++)
+            for (i=0;i<CELT_LPC_ORDER;i++)
             {
                tmp = MULT16_16_Q15(QCONST16(.99f,15), tmp);
-               lpc[c*LPC_ORDER+i] = MULT16_16_Q15(lpc[c*LPC_ORDER+i], tmp);
+               lpc[c*CELT_LPC_ORDER+i] = MULT16_16_Q15(lpc[c*CELT_LPC_ORDER+i], tmp);
             }
          }
 #endif
@@ -673,8 +786,8 @@ static void celt_decode_lost(CELTDecoder * OPUS_RESTRICT st, int N, int LM)
          {
             /* Compute the excitation for exc_length samples before the loss. We need the copy
                because celt_fir() cannot filter in-place. */
-            celt_fir(exc+MAX_PERIOD-exc_length, lpc+c*LPC_ORDER,
-                  fir_tmp, exc_length, LPC_ORDER, st->arch);
+            celt_fir(exc+MAX_PERIOD-exc_length, lpc+c*CELT_LPC_ORDER,
+                  fir_tmp, exc_length, CELT_LPC_ORDER, st->arch);
             OPUS_COPY(exc+MAX_PERIOD-exc_length, fir_tmp, exc_length);
          }
 
@@ -726,21 +839,21 @@ static void celt_decode_lost(CELTDecoder * OPUS_RESTRICT st, int N, int LM)
                         exc[extrapolation_offset+j])), SIG_SHIFT);
             /* Compute the energy of the previously decoded signal whose
                excitation we're copying. */
-            tmp = ROUND16(
+            tmp = SROUND16(
                   buf[DECODE_BUFFER_SIZE-MAX_PERIOD-N+extrapolation_offset+j],
                   SIG_SHIFT);
             S1 += SHR32(MULT16_16(tmp, tmp), 10);
          }
          {
-            opus_val16 lpc_mem[LPC_ORDER];
+            opus_val16 lpc_mem[CELT_LPC_ORDER];
             /* Copy the last decoded samples (prior to the overlap region) to
                synthesis filter memory so we can have a continuous signal. */
-            for (i=0;i<LPC_ORDER;i++)
-               lpc_mem[i] = ROUND16(buf[DECODE_BUFFER_SIZE-N-1-i], SIG_SHIFT);
+            for (i=0;i<CELT_LPC_ORDER;i++)
+               lpc_mem[i] = SROUND16(buf[DECODE_BUFFER_SIZE-N-1-i], SIG_SHIFT);
             /* Apply the synthesis filter to convert the excitation back into
                the signal domain. */
-            celt_iir(buf+DECODE_BUFFER_SIZE-N, lpc+c*LPC_ORDER,
-                  buf+DECODE_BUFFER_SIZE-N, extrapolation_len, LPC_ORDER,
+            celt_iir(buf+DECODE_BUFFER_SIZE-N, lpc+c*CELT_LPC_ORDER,
+                  buf+DECODE_BUFFER_SIZE-N, extrapolation_len, CELT_LPC_ORDER,
                   lpc_mem, st->arch);
 #ifdef FIXED_POINT
             for (i=0; i < extrapolation_len; i++)
@@ -755,7 +868,7 @@ static void celt_decode_lost(CELTDecoder * OPUS_RESTRICT st, int N, int LM)
             opus_val32 S2=0;
             for (i=0;i<extrapolation_len;i++)
             {
-               opus_val16 tmp = ROUND16(buf[DECODE_BUFFER_SIZE-N+i], SIG_SHIFT);
+               opus_val16 tmp = SROUND16(buf[DECODE_BUFFER_SIZE-N+i], SIG_SHIFT);
                S2 += SHR32(MULT16_16(tmp, tmp), 10);
             }
             /* This checks for an "explosion" in the synthesis. */
@@ -787,42 +900,85 @@ static void celt_decode_lost(CELTDecoder * OPUS_RESTRICT st, int N, int LM)
             }
          }
 
-         /* Apply the pre-filter to the MDCT overlap for the next frame because
-            the post-filter will be re-applied in the decoder after the MDCT
-            overlap. */
-         comb_filter(etmp, buf+DECODE_BUFFER_SIZE,
-              st->postfilter_period, st->postfilter_period, overlap,
-              -st->postfilter_gain, -st->postfilter_gain,
-              st->postfilter_tapset, st->postfilter_tapset, NULL, 0, st->arch);
-
-         /* Simulate TDAC on the concealed audio so that it blends with the
-            MDCT of the next frame. */
-         for (i=0;i<overlap/2;i++)
-         {
-            buf[DECODE_BUFFER_SIZE+i] =
-               MULT16_32_Q15(window[i], etmp[overlap-1-i])
-               + MULT16_32_Q15(window[overlap-i-1], etmp[i]);
-         }
       } while (++c<C);
+
+#ifdef ENABLE_DEEP_PLC
+      if (lpcnet->loaded && (st->complexity >= 5 || lpcnet->fec_fill_pos > 0)) {
+         float overlap_mem;
+         int samples_needed16k;
+         celt_sig *buf;
+         VARDECL(float, buf_copy);
+         buf = decode_mem[0];
+         ALLOC(buf_copy, C*overlap, float);
+         c=0; do {
+            OPUS_COPY(buf_copy+c*overlap, &decode_mem[c][DECODE_BUFFER_SIZE-N], overlap);
+         } while (++c<C);
+
+         /* Need enough samples from the PLC to cover the frame size, resampling delay,
+            and the overlap at the end. */
+         samples_needed16k = (N+SINC_ORDER+overlap)/3;
+         if (loss_duration == 0) {
+            st->plc_fill = 0;
+         }
+         while (st->plc_fill < samples_needed16k) {
+            lpcnet_plc_conceal(lpcnet, &st->plc_pcm[st->plc_fill]);
+            st->plc_fill += FRAME_SIZE;
+         }
+         /* Resample to 48 kHz. */
+         for (i=0;i<(N+overlap)/3;i++) {
+            int j;
+            float sum;
+            for (sum=0, j=0;j<17;j++) sum += 3*st->plc_pcm[i+j]*sinc_filter[3*j];
+            buf[DECODE_BUFFER_SIZE-N+3*i] = sum;
+            for (sum=0, j=0;j<16;j++) sum += 3*st->plc_pcm[i+j+1]*sinc_filter[3*j+2];
+            buf[DECODE_BUFFER_SIZE-N+3*i+1] = sum;
+            for (sum=0, j=0;j<16;j++) sum += 3*st->plc_pcm[i+j+1]*sinc_filter[3*j+1];
+            buf[DECODE_BUFFER_SIZE-N+3*i+2] = sum;
+         }
+         OPUS_MOVE(st->plc_pcm, &st->plc_pcm[N/3], st->plc_fill-N/3);
+         st->plc_fill -= N/3;
+         for (i=0;i<N;i++) {
+            float tmp = buf[DECODE_BUFFER_SIZE-N+i];
+            buf[DECODE_BUFFER_SIZE-N+i] -= PREEMPHASIS*st->plc_preemphasis_mem;
+            st->plc_preemphasis_mem = tmp;
+         }
+         overlap_mem = st->plc_preemphasis_mem;
+         for (i=0;i<overlap;i++) {
+            float tmp = buf[DECODE_BUFFER_SIZE+i];
+            buf[DECODE_BUFFER_SIZE+i] -= PREEMPHASIS*overlap_mem;
+            overlap_mem = tmp;
+         }
+         /* For now, we just do mono PLC. */
+         if (C==2) OPUS_COPY(decode_mem[1], decode_mem[0], DECODE_BUFFER_SIZE+overlap);
+         c=0; do {
+            /* Cross-fade with 48-kHz non-neural PLC for the first 2.5 ms to avoid a discontinuity. */
+            if (loss_duration == 0) {
+               for (i=0;i<overlap;i++) decode_mem[c][DECODE_BUFFER_SIZE-N+i] = (1-window[i])*buf_copy[c*overlap+i] + (window[i])*decode_mem[c][DECODE_BUFFER_SIZE-N+i];
+            }
+         } while (++c<C);
+      }
+#endif
+      st->prefilter_and_fold = 1;
    }
 
-   st->loss_count = loss_count+1;
+   /* Saturate to something large to avoid wrap-around. */
+   st->loss_duration = IMIN(10000, loss_duration+(1<<LM));
 
    RESTORE_STACK;
 }
 
-int celt_decode_with_ec(CELTDecoder * OPUS_RESTRICT st, const unsigned char *data,
-      int len, opus_val16 * OPUS_RESTRICT pcm, int frame_size, ec_dec *dec, int accum)
+int celt_decode_with_ec_dred(CELTDecoder * OPUS_RESTRICT st, const unsigned char *data,
+      int len, opus_val16 * OPUS_RESTRICT pcm, int frame_size, ec_dec *dec, int accum
+#ifdef ENABLE_DEEP_PLC
+      ,LPCNetPLCState *lpcnet
+#endif
+      )
 {
    int c, i, N;
    int spread_decision;
    opus_int32 bits;
    ec_dec _dec;
-#ifdef NORM_ALIASING_HACK
-   celt_norm *X;
-#else
    VARDECL(celt_norm, X);
-#endif
    VARDECL(int, fine_quant);
    VARDECL(int, pulses);
    VARDECL(int, cap);
@@ -862,6 +1018,7 @@ int celt_decode_with_ec(CELTDecoder * OPUS_RESTRICT st, const unsigned char *dat
    int nbEBands;
    int overlap;
    const opus_int16 *eBands;
+   opus_val16 max_background_increase;
    ALLOC_STACK;
 
    VALIDATE_CELT_DECODER(st);
@@ -874,7 +1031,7 @@ int celt_decode_with_ec(CELTDecoder * OPUS_RESTRICT st, const unsigned char *dat
    frame_size *= st->downsample;
 
    lpc = (opus_val16*)(st->_decode_mem+(DECODE_BUFFER_SIZE+overlap)*CC);
-   oldBandE = lpc+CC*LPC_ORDER;
+   oldBandE = lpc+CC*CELT_LPC_ORDER;
    oldLogE = oldBandE + 2*nbEBands;
    oldLogE2 = oldLogE + 2*nbEBands;
    backgroundLogE = oldLogE2  + 2*nbEBands;
@@ -928,15 +1085,25 @@ int celt_decode_with_ec(CELTDecoder * OPUS_RESTRICT st, const unsigned char *dat
 
    if (data == NULL || len<=1)
    {
-      celt_decode_lost(st, N, LM);
+      celt_decode_lost(st, N, LM
+#ifdef ENABLE_DEEP_PLC
+      , lpcnet
+#endif
+                      );
       deemphasis(out_syn, pcm, N, CC, st->downsample, mode->preemph, st->preemph_memD, accum);
       RESTORE_STACK;
       return frame_size/st->downsample;
    }
+#ifdef ENABLE_DEEP_PLC
+   else {
+      /* FIXME: This is a bit of a hack just to make sure opus_decode_native() knows we're no longer in PLC. */
+      if (lpcnet) lpcnet->blend = 0;
+   }
+#endif
 
    /* Check if there are at least two packets received consecutively before
     * turning on the pitch-based PLC */
-   st->skip_plc = st->loss_count != 0;
+   if (st->loss_duration == 0) st->skip_plc = 0;
 
    if (dec == NULL)
    {
@@ -999,6 +1166,36 @@ int celt_decode_with_ec(CELTDecoder * OPUS_RESTRICT st, const unsigned char *dat
 
    /* Decode the global flags (first symbols in the stream) */
    intra_ener = tell+3<=total_bits ? ec_dec_bit_logp(dec, 3) : 0;
+   /* If recovering from packet loss, make sure we make the energy prediction safe to reduce the
+      risk of getting loud artifacts. */
+   if (!intra_ener && st->loss_duration != 0) {
+      c=0; do
+      {
+         opus_val16 safety = 0;
+         int missing = IMIN(10, st->loss_duration>>LM);
+         if (LM==0) safety = QCONST16(1.5f,DB_SHIFT);
+         else if (LM==1) safety = QCONST16(.5f,DB_SHIFT);
+         for (i=start;i<end;i++)
+         {
+            if (oldBandE[c*nbEBands+i] < MAX16(oldLogE[c*nbEBands+i], oldLogE2[c*nbEBands+i])) {
+               /* If energy is going down already, continue the trend. */
+               opus_val32 slope;
+               opus_val32 E0, E1, E2;
+               E0 = oldBandE[c*nbEBands+i];
+               E1 = oldLogE[c*nbEBands+i];
+               E2 = oldLogE2[c*nbEBands+i];
+               slope = MAX32(E1 - E0, HALF32(E2 - E0));
+               E0 -= MAX32(0, (1+missing)*slope);
+               oldBandE[c*nbEBands+i] = MAX32(-QCONST16(20.f,DB_SHIFT), E0);
+            } else {
+               /* Otherwise take the min of the last frames. */
+               oldBandE[c*nbEBands+i] = MIN16(MIN16(oldBandE[c*nbEBands+i], oldLogE[c*nbEBands+i]), oldLogE2[c*nbEBands+i]);
+            }
+            /* Shorter frames have more natural fluctuations -- play it safe. */
+            oldBandE[c*nbEBands+i] -= safety;
+         }
+      } while (++c<2);
+   }
    /* Get band energies */
    unquant_coarse_energy(mode, start, end, oldBandE,
          intra_ener, dec, C, LM);
@@ -1066,19 +1263,13 @@ int celt_decode_with_ec(CELTDecoder * OPUS_RESTRICT st, const unsigned char *dat
    unquant_fine_energy(mode, start, end, oldBandE, fine_quant, dec, C);
 
    c=0; do {
-      OPUS_MOVE(decode_mem[c], decode_mem[c]+N, DECODE_BUFFER_SIZE-N+overlap/2);
+      OPUS_MOVE(decode_mem[c], decode_mem[c]+N, DECODE_BUFFER_SIZE-N+overlap);
    } while (++c<CC);
 
    /* Decode fixed codebook */
    ALLOC(collapse_masks, C*nbEBands, unsigned char);
 
-#ifdef NORM_ALIASING_HACK
-   /* This is an ugly hack that breaks aliasing rules and would be easily broken,
-      but it saves almost 4kB of stack. */
-   X = (celt_norm*)(out_syn[CC-1]+overlap/2);
-#else
    ALLOC(X, C*N, celt_norm);   /**< Interleaved normalised MDCTs */
-#endif
 
    quant_all_bands(0, mode, start, end, X, C==2 ? X+N : NULL, collapse_masks,
          NULL, pulses, shortBlocks, spread_decision, dual_stereo, intensity, tf_res,
@@ -1102,7 +1293,9 @@ int celt_decode_with_ec(CELTDecoder * OPUS_RESTRICT st, const unsigned char *dat
       for (i=0;i<C*nbEBands;i++)
          oldBandE[i] = -QCONST16(28.f,DB_SHIFT);
    }
-
+   if (st->prefilter_and_fold) {
+      prefilter_and_fold(st, N);
+   }
    celt_synthesis(mode, X, out_syn, oldBandE, start, effEnd,
                   C, CC, isTransient, LM, st->downsample, silence, st->arch);
 
@@ -1134,25 +1327,21 @@ int celt_decode_with_ec(CELTDecoder * OPUS_RESTRICT st, const unsigned char *dat
    if (C==1)
       OPUS_COPY(&oldBandE[nbEBands], oldBandE, nbEBands);
 
-   /* In case start or end were to change */
    if (!isTransient)
    {
-      opus_val16 max_background_increase;
       OPUS_COPY(oldLogE2, oldLogE, 2*nbEBands);
       OPUS_COPY(oldLogE, oldBandE, 2*nbEBands);
-      /* In normal circumstances, we only allow the noise floor to increase by
-         up to 2.4 dB/second, but when we're in DTX, we allow up to 6 dB
-         increase for each update.*/
-      if (st->loss_count < 10)
-         max_background_increase = M*QCONST16(0.001f,DB_SHIFT);
-      else
-         max_background_increase = QCONST16(1.f,DB_SHIFT);
-      for (i=0;i<2*nbEBands;i++)
-         backgroundLogE[i] = MIN16(backgroundLogE[i] + max_background_increase, oldBandE[i]);
    } else {
       for (i=0;i<2*nbEBands;i++)
          oldLogE[i] = MIN16(oldLogE[i], oldBandE[i]);
    }
+   /* In normal circumstances, we only allow the noise floor to increase by
+      up to 2.4 dB/second, but when we're in DTX we give the weight of
+      all missing packets to the update packet. */
+   max_background_increase = IMIN(160, st->loss_duration+M)*QCONST16(0.001f,DB_SHIFT);
+   for (i=0;i<2*nbEBands;i++)
+      backgroundLogE[i] = MIN16(backgroundLogE[i] + max_background_increase, oldBandE[i]);
+   /* In case start or end were to change */
    c=0; do
    {
       for (i=0;i<start;i++)
@@ -1169,7 +1358,8 @@ int celt_decode_with_ec(CELTDecoder * OPUS_RESTRICT st, const unsigned char *dat
    st->rng = dec->rng;
 
    deemphasis(out_syn, pcm, N, CC, st->downsample, mode->preemph, st->preemph_memD, accum);
-   st->loss_count = 0;
+   st->loss_duration = 0;
+   st->prefilter_and_fold = 0;
    RESTORE_STACK;
    if (ec_tell(dec) > 8*len)
       return OPUS_INTERNAL_ERROR;
@@ -1178,6 +1368,15 @@ int celt_decode_with_ec(CELTDecoder * OPUS_RESTRICT st, const unsigned char *dat
    return frame_size/st->downsample;
 }
 
+int celt_decode_with_ec(CELTDecoder * OPUS_RESTRICT st, const unsigned char *data,
+      int len, opus_val16 * OPUS_RESTRICT pcm, int frame_size, ec_dec *dec, int accum)
+{  /* Thin wrapper: forwards every argument unchanged to celt_decode_with_ec_dred() and returns its result. */
+   return celt_decode_with_ec_dred(st, data, len, pcm, frame_size, dec, accum
+#ifdef ENABLE_DEEP_PLC
+       , NULL  /* the extra trailing argument exists only in ENABLE_DEEP_PLC builds; this wrapper always passes NULL */
+#endif
+       );
+}
 
 #ifdef CUSTOM_MODES
 
@@ -1251,6 +1450,26 @@ int opus_custom_decoder_ctl(CELTDecoder * OPUS_RESTRICT st, int request, ...)
    va_start(ap, request);
    switch (request)
    {
+      case OPUS_SET_COMPLEXITY_REQUEST:
+      {
+          opus_int32 value = va_arg(ap, opus_int32);
+          if(value<0 || value>10)
+          {
+             goto bad_arg;
+          }
+          st->complexity = value;
+      }
+      break;
+      case OPUS_GET_COMPLEXITY_REQUEST:
+      {
+          opus_int32 *value = va_arg(ap, opus_int32*);
+          if (!value)
+          {
+             goto bad_arg;
+          }
+          *value = st->complexity;
+      }
+      break;
       case CELT_SET_START_BAND_REQUEST:
       {
          opus_int32 value = va_arg(ap, opus_int32);
@@ -1297,7 +1516,7 @@ int opus_custom_decoder_ctl(CELTDecoder * OPUS_RESTRICT st, int request, ...)
          int i;
          opus_val16 *lpc, *oldBandE, *oldLogE, *oldLogE2;
          lpc = (opus_val16*)(st->_decode_mem+(DECODE_BUFFER_SIZE+st->overlap)*st->channels);
-         oldBandE = lpc+st->channels*LPC_ORDER;
+         oldBandE = lpc+st->channels*CELT_LPC_ORDER;
          oldLogE = oldBandE + 2*st->mode->nbEBands;
          oldLogE2 = oldLogE + 2*st->mode->nbEBands;
          OPUS_CLEAR((char*)&st->DECODER_RESET_START,
diff --git a/opus/celt/celt_encoder.c b/opus/celt/celt_encoder.c
index 44cb0850..7f32a801 100644
--- a/opus/celt/celt_encoder.c
+++ b/opus/celt/celt_encoder.c
@@ -281,6 +281,9 @@ static int transient_analysis(const opus_val32 * OPUS_RESTRICT in, int len, int
       /* High-pass filter: (1 - 2*z^-1 + z^-2) / (1 - z^-1 + .5*z^-2) */
       for (i=0;i<len;i++)
       {
+#ifndef FIXED_POINT
+         float mem00;
+#endif
          opus_val32 x,y;
          x = SHR32(in[i+c*len],SIG_SHIFT);
          y = ADD32(mem0, x);
@@ -288,8 +291,13 @@ static int transient_analysis(const opus_val32 * OPUS_RESTRICT in, int len, int
          mem0 = mem1 + y - SHL32(x,1);
          mem1 = x - SHR32(y,1);
 #else
+         /* Original code:
          mem0 = mem1 + y - 2*x;
          mem1 = x - .5f*y;
+         Modified code to shorten dependency chains: */
+         mem00=mem0;
+         mem0 = mem0 - x + .5f*mem1;
+         mem1 =  x - mem00;
 #endif
          tmp[i] = SROUND16(y, 2);
          /*printf("%f ", tmp[i]);*/
@@ -322,10 +330,11 @@ static int transient_analysis(const opus_val32 * OPUS_RESTRICT in, int len, int
 #ifdef FIXED_POINT
          /* FIXME: Use PSHR16() instead */
          tmp[i] = mem0 + PSHR32(x2-mem0,forward_shift);
+         mem0 = tmp[i];
 #else
-         tmp[i] = mem0 + MULT16_16_P15(forward_decay,x2-mem0);
+         mem0 = x2 + (1.f-forward_decay)*mem0;
+         tmp[i] = forward_decay*mem0;
 #endif
-         mem0 = tmp[i];
       }
 
       mem0=0;
@@ -337,11 +346,13 @@ static int transient_analysis(const opus_val32 * OPUS_RESTRICT in, int len, int
 #ifdef FIXED_POINT
          /* FIXME: Use PSHR16() instead */
          tmp[i] = mem0 + PSHR32(tmp[i]-mem0,3);
-#else
-         tmp[i] = mem0 + MULT16_16_P15(QCONST16(0.125f,15),tmp[i]-mem0);
-#endif
          mem0 = tmp[i];
          maxE = MAX16(maxE, mem0);
+#else
+         mem0 = tmp[i] + 0.875f*mem0;
+         tmp[i] = 0.125f*mem0;
+         maxE = MAX16(maxE, 0.125f*mem0);
+#endif
       }
       /*for (i=0;i<len2;i++)printf("%f ", tmp[i]/mean);printf("\n");*/
 
@@ -967,7 +978,7 @@ static opus_val16 median_of_3(const opus_val16 *x)
       return t0;
 }
 
-static opus_val16 dynalloc_analysis(const opus_val16 *bandLogE, const opus_val16 *bandLogE2,
+static opus_val16 dynalloc_analysis(const opus_val16 *bandLogE, const opus_val16 *bandLogE2, const opus_val16 *oldBandE,
       int nbEBands, int start, int end, int C, int *offsets, int lsb_depth, const opus_int16 *logN,
       int isTransient, int vbr, int constrained_vbr, const opus_int16 *eBands, int LM,
       int effectiveBytes, opus_int32 *tot_boost_, int lfe, opus_val16 *surround_dynalloc,
@@ -978,9 +989,11 @@ static opus_val16 dynalloc_analysis(const opus_val16 *bandLogE, const opus_val16
    opus_val16 maxDepth;
    VARDECL(opus_val16, follower);
    VARDECL(opus_val16, noise_floor);
+   VARDECL(opus_val16, bandLogE3);
    SAVE_STACK;
    ALLOC(follower, C*nbEBands, opus_val16);
    ALLOC(noise_floor, C*nbEBands, opus_val16);
+   ALLOC(bandLogE3, nbEBands, opus_val16);
    OPUS_CLEAR(offsets, nbEBands);
    /* Dynamic allocation code */
    maxDepth=-QCONST16(31.9f, DB_SHIFT);
@@ -1033,8 +1046,10 @@ static opus_val16 dynalloc_analysis(const opus_val16 *bandLogE, const opus_val16
          printf("%d ", spread_weight[i]);
       printf("\n");*/
    }
-   /* Make sure that dynamic allocation can't make us bust the budget */
-   if (effectiveBytes > 50 && LM>=1 && !lfe)
+   /* Make sure that dynamic allocation can't make us bust the budget.
+      We enable the feature starting at 24 kb/s for 20-ms frames
+      and 96 kb/s for 2.5 ms frames.  */
+   if (effectiveBytes >= (30 + 5*LM) && !lfe)
    {
       int last=0;
       c=0;do
@@ -1042,30 +1057,38 @@ static opus_val16 dynalloc_analysis(const opus_val16 *bandLogE, const opus_val16
          opus_val16 offset;
          opus_val16 tmp;
          opus_val16 *f;
+         OPUS_COPY(bandLogE3, &bandLogE2[c*nbEBands], end);
+         if (LM==0) {
+            /* For 2.5 ms frames, the first 8 bands have just one bin, so the
+               energy is highly unreliable (high variance). For that reason,
+               we take the max with the previous energy so that at least 2 bins
+               are getting used. */
+            for (i=0;i<IMIN(8,end);i++) bandLogE3[i] = MAX16(bandLogE2[c*nbEBands+i], oldBandE[c*nbEBands+i]);
+         }
          f = &follower[c*nbEBands];
-         f[0] = bandLogE2[c*nbEBands];
+         f[0] = bandLogE3[0];
          for (i=1;i<end;i++)
          {
             /* The last band to be at least 3 dB higher than the previous one
                is the last we'll consider. Otherwise, we run into problems on
                bandlimited signals. */
-            if (bandLogE2[c*nbEBands+i] > bandLogE2[c*nbEBands+i-1]+QCONST16(.5f,DB_SHIFT))
+            if (bandLogE3[i] > bandLogE3[i-1]+QCONST16(.5f,DB_SHIFT))
                last=i;
-            f[i] = MIN16(f[i-1]+QCONST16(1.5f,DB_SHIFT), bandLogE2[c*nbEBands+i]);
+            f[i] = MIN16(f[i-1]+QCONST16(1.5f,DB_SHIFT), bandLogE3[i]);
          }
          for (i=last-1;i>=0;i--)
-            f[i] = MIN16(f[i], MIN16(f[i+1]+QCONST16(2.f,DB_SHIFT), bandLogE2[c*nbEBands+i]));
+            f[i] = MIN16(f[i], MIN16(f[i+1]+QCONST16(2.f,DB_SHIFT), bandLogE3[i]));
 
          /* Combine with a median filter to avoid dynalloc triggering unnecessarily.
             The "offset" value controls how conservative we are -- a higher offset
             reduces the impact of the median filter and makes dynalloc use more bits. */
          offset = QCONST16(1.f, DB_SHIFT);
          for (i=2;i<end-2;i++)
-            f[i] = MAX16(f[i], median_of_5(&bandLogE2[c*nbEBands+i-2])-offset);
-         tmp = median_of_3(&bandLogE2[c*nbEBands])-offset;
+            f[i] = MAX16(f[i], median_of_5(&bandLogE3[i-2])-offset);
+         tmp = median_of_3(&bandLogE3[0])-offset;
          f[0] = MAX16(f[0], tmp);
          f[1] = MAX16(f[1], tmp);
-         tmp = median_of_3(&bandLogE2[c*nbEBands+end-3])-offset;
+         tmp = median_of_3(&bandLogE3[end-3])-offset;
          f[end-2] = MAX16(f[end-2], tmp);
          f[end-1] = MAX16(f[end-1], tmp);
 
@@ -1565,13 +1588,16 @@ int celt_encode_with_ec(CELTEncoder * OPUS_RESTRICT st, const opus_val16 * pcm,
       vbr_rate = 0;
       tmp = st->bitrate*frame_size;
       if (tell>1)
-         tmp += tell;
+         tmp += tell*mode->Fs;
       if (st->bitrate!=OPUS_BITRATE_MAX)
+      {
          nbCompressedBytes = IMAX(2, IMIN(nbCompressedBytes,
                (tmp+4*mode->Fs)/(8*mode->Fs)-!!st->signalling));
+         ec_enc_shrink(enc, nbCompressedBytes);
+      }
       effectiveBytes = nbCompressedBytes - nbFilledBytes;
    }
-   equiv_rate = ((opus_int32)nbCompressedBytes*8*50 >> (3-LM)) - (40*C+20)*((400>>LM) - 50);
+   equiv_rate = ((opus_int32)nbCompressedBytes*8*50 << (3-LM)) - (40*C+20)*((400>>LM) - 50);
    if (st->bitrate != OPUS_BITRATE_MAX)
       equiv_rate = IMIN(equiv_rate, st->bitrate - (40*C+20)*((400>>LM) - 50));
 
@@ -1719,8 +1745,11 @@ int celt_encode_with_ec(CELTEncoder * OPUS_RESTRICT st, const opus_val16 * pcm,
       compute_mdcts(mode, 0, in, freq, C, CC, LM, st->upsample, st->arch);
       compute_band_energies(mode, freq, bandE, effEnd, C, LM, st->arch);
       amp2Log2(mode, effEnd, end, bandE, bandLogE2, C);
-      for (i=0;i<C*nbEBands;i++)
-         bandLogE2[i] += HALF16(SHL16(LM, DB_SHIFT));
+      for (c=0;c<C;c++)
+      {
+         for (i=0;i<end;i++)
+            bandLogE2[nbEBands*c+i] += HALF16(SHL16(LM, DB_SHIFT));
+      }
    }
 
    compute_mdcts(mode, shortBlocks, in, freq, C, CC, LM, st->upsample, st->arch);
@@ -1856,8 +1885,11 @@ int celt_encode_with_ec(CELTEncoder * OPUS_RESTRICT st, const opus_val16 * pcm,
          compute_band_energies(mode, freq, bandE, effEnd, C, LM, st->arch);
          amp2Log2(mode, effEnd, end, bandE, bandLogE, C);
          /* Compensate for the scaling of short vs long mdcts */
-         for (i=0;i<C*nbEBands;i++)
-            bandLogE2[i] += HALF16(SHL16(LM, DB_SHIFT));
+         for (c=0;c<C;c++)
+         {
+            for (i=0;i<end;i++)
+               bandLogE2[nbEBands*c+i] += HALF16(SHL16(LM, DB_SHIFT));
+         }
          tf_estimate = QCONST16(.2f,14);
       }
    }
@@ -1876,7 +1908,7 @@ int celt_encode_with_ec(CELTEncoder * OPUS_RESTRICT st, const opus_val16 * pcm,
    ALLOC(importance, nbEBands, int);
    ALLOC(spread_weight, nbEBands, int);
 
-   maxDepth = dynalloc_analysis(bandLogE, bandLogE2, nbEBands, start, end, C, offsets,
+   maxDepth = dynalloc_analysis(bandLogE, bandLogE2, oldBandE, nbEBands, start, end, C, offsets,
          st->lsb_depth, mode->logN, isTransient, st->vbr, st->constrained_vbr,
          eBands, LM, effectiveBytes, &tot_boost, st->lfe, surround_dynalloc, &st->analysis, importance, spread_weight);
 
@@ -2240,7 +2272,7 @@ int celt_encode_with_ec(CELTEncoder * OPUS_RESTRICT st, const opus_val16 * pcm,
       if (anti_collapse_on)
       {
          anti_collapse(mode, X, collapse_masks, LM, C, N,
-               start, end, oldBandE, oldLogE, oldLogE2, pulses, st->rng);
+               start, end, oldBandE, oldLogE, oldLogE2, pulses, st->rng, st->arch);
       }
 
       c=0; do {
@@ -2259,15 +2291,15 @@ int celt_encode_with_ec(CELTEncoder * OPUS_RESTRICT st, const opus_val16 * pcm,
          st->prefilter_period_old=IMAX(st->prefilter_period_old, COMBFILTER_MINPERIOD);
          comb_filter(out_mem[c], out_mem[c], st->prefilter_period_old, st->prefilter_period, mode->shortMdctSize,
                st->prefilter_gain_old, st->prefilter_gain, st->prefilter_tapset_old, st->prefilter_tapset,
-               mode->window, overlap);
+               mode->window, overlap, st->arch);
          if (LM!=0)
             comb_filter(out_mem[c]+mode->shortMdctSize, out_mem[c]+mode->shortMdctSize, st->prefilter_period, pitch_index, N-mode->shortMdctSize,
                   st->prefilter_gain, gain1, st->prefilter_tapset, prefilter_tapset,
-                  mode->window, overlap);
+                  mode->window, overlap, st->arch);
       } while (++c<CC);
 
       /* We reuse freq[] as scratch space for the de-emphasis */
-      deemphasis(out_mem, (opus_val16*)pcm, N, CC, st->upsample, mode->preemph, st->preemph_memD);
+      deemphasis(out_mem, (opus_val16*)pcm, N, CC, st->upsample, mode->preemph, st->preemph_memD, 0);
       st->prefilter_period_old = st->prefilter_period;
       st->prefilter_gain_old = st->prefilter_gain;
       st->prefilter_tapset_old = st->prefilter_tapset;
diff --git a/opus/celt/celt_lpc.c b/opus/celt/celt_lpc.c
index 8ecb693e..fabca65c 100644
--- a/opus/celt/celt_lpc.c
+++ b/opus/celt/celt_lpc.c
@@ -44,23 +44,27 @@ int          p
    opus_val32 r;
    opus_val32 error = ac[0];
 #ifdef FIXED_POINT
-   opus_val32 lpc[LPC_ORDER];
+   opus_val32 lpc[CELT_LPC_ORDER];
 #else
    float *lpc = _lpc;
 #endif
 
    OPUS_CLEAR(lpc, p);
+#ifdef FIXED_POINT
    if (ac[0] != 0)
+#else
+   if (ac[0] > 1e-10f)
+#endif
    {
       for (i = 0; i < p; i++) {
          /* Sum up this iteration's reflection coefficient */
          opus_val32 rr = 0;
          for (j = 0; j < i; j++)
             rr += MULT32_32_Q31(lpc[j],ac[i - j]);
-         rr += SHR32(ac[i + 1],3);
-         r = -frac_div32(SHL32(rr,3), error);
+         rr += SHR32(ac[i + 1],6);
+         r = -frac_div32(SHL32(rr,6), error);
          /*  Update LPC coefficients and total error */
-         lpc[i] = SHR32(r,3);
+         lpc[i] = SHR32(r,6);
          for (j = 0; j < (i+1)>>1; j++)
          {
             opus_val32 tmp1, tmp2;
@@ -73,17 +77,61 @@ int          p
          error = error - MULT32_32_Q31(MULT32_32_Q31(r,r),error);
          /* Bail out once we get 30 dB gain */
 #ifdef FIXED_POINT
-         if (error<SHR32(ac[0],10))
+         if (error<=SHR32(ac[0],10))
             break;
 #else
-         if (error<.001f*ac[0])
+         if (error<=.001f*ac[0])
             break;
 #endif
       }
    }
 #ifdef FIXED_POINT
-   for (i=0;i<p;i++)
-      _lpc[i] = ROUND16(lpc[i],16);
+   {
+      /* Convert the int32 lpcs to int16 and ensure there are no wrap-arounds.
+         This reuses the logic in silk_LPC_fit() and silk_bwexpander_32(). Any bug
+         fixes should also be applied there. */
+      int iter, idx = 0;
+      opus_val32 maxabs, absval, chirp_Q16, chirp_minus_one_Q16;
+
+      for (iter = 0; iter < 10; iter++) {
+         maxabs = 0;
+         for (i = 0; i < p; i++) {
+            absval = ABS32(lpc[i]);
+            if (absval > maxabs) {
+               maxabs = absval;
+               idx = i;
+            }
+         }
+         maxabs = PSHR32(maxabs, 13);  /* Q25->Q12 */
+
+         if (maxabs > 32767) {
+            maxabs = MIN32(maxabs, 163838);
+            chirp_Q16 = QCONST32(0.999, 16) - DIV32(SHL32(maxabs - 32767, 14),
+                                                    SHR32(MULT32_32_32(maxabs, idx + 1), 2));
+            chirp_minus_one_Q16 = chirp_Q16 - 65536;
+
+            /* Apply bandwidth expansion. */
+            for (i = 0; i < p - 1; i++) {
+               lpc[i] = MULT32_32_Q16(chirp_Q16, lpc[i]);
+               chirp_Q16 += PSHR32(MULT32_32_32(chirp_Q16, chirp_minus_one_Q16), 16);
+            }
+            lpc[p - 1] = MULT32_32_Q16(chirp_Q16, lpc[p - 1]);
+         } else {
+            break;
+         }
+      }
+
+      if (iter == 10) {
+         /* If the coeffs still do not fit into the 16 bit range after 10 iterations,
+            fall back to the A(z)=1 filter. */
+         OPUS_CLEAR(lpc, p);
+         _lpc[0] = 4096;  /* Q12 */
+      } else {
+         for (i = 0; i < p; i++) {
+            _lpc[i] = EXTRACT16(PSHR32(lpc[i], 13));  /* Q25->Q12 */
+         }
+      }
+   }
 #endif
 }
 
@@ -110,18 +158,28 @@ void celt_fir_c(
       sum[1] = SHL32(EXTEND32(x[i+1]), SIG_SHIFT);
       sum[2] = SHL32(EXTEND32(x[i+2]), SIG_SHIFT);
       sum[3] = SHL32(EXTEND32(x[i+3]), SIG_SHIFT);
-      xcorr_kernel(rnum, x+i-ord, sum, ord, arch);
-      y[i  ] = ROUND16(sum[0], SIG_SHIFT);
-      y[i+1] = ROUND16(sum[1], SIG_SHIFT);
-      y[i+2] = ROUND16(sum[2], SIG_SHIFT);
-      y[i+3] = ROUND16(sum[3], SIG_SHIFT);
+#if defined(OPUS_CHECK_ASM) && defined(FIXED_POINT)
+      {
+         opus_val32 sum_c[4];
+         memcpy(sum_c, sum, sizeof(sum_c));
+         xcorr_kernel_c(rnum, x+i-ord, sum_c, ord);
+#endif
+         xcorr_kernel(rnum, x+i-ord, sum, ord, arch);
+#if defined(OPUS_CHECK_ASM) && defined(FIXED_POINT)
+         celt_assert(memcmp(sum, sum_c, sizeof(sum)) == 0);
+      }
+#endif
+      y[i  ] = SROUND16(sum[0], SIG_SHIFT);
+      y[i+1] = SROUND16(sum[1], SIG_SHIFT);
+      y[i+2] = SROUND16(sum[2], SIG_SHIFT);
+      y[i+3] = SROUND16(sum[3], SIG_SHIFT);
    }
    for (;i<N;i++)
    {
       opus_val32 sum = SHL32(EXTEND32(x[i]), SIG_SHIFT);
       for (j=0;j<ord;j++)
          sum = MAC16_16(sum,rnum[j],x[i+j-ord]);
-      y[i] = ROUND16(sum, SIG_SHIFT);
+      y[i] = SROUND16(sum, SIG_SHIFT);
    }
    RESTORE_STACK;
 }
@@ -174,8 +232,17 @@ void celt_iir(const opus_val32 *_x,
       sum[1]=_x[i+1];
       sum[2]=_x[i+2];
       sum[3]=_x[i+3];
-      xcorr_kernel(rden, y+i, sum, ord, arch);
-
+#if defined(OPUS_CHECK_ASM) && defined(FIXED_POINT)
+      {
+         opus_val32 sum_c[4];
+         memcpy(sum_c, sum, sizeof(sum_c));
+         xcorr_kernel_c(rden, y+i, sum_c, ord);
+#endif
+         xcorr_kernel(rden, y+i, sum, ord, arch);
+#if defined(OPUS_CHECK_ASM) && defined(FIXED_POINT)
+         celt_assert(memcmp(sum, sum_c, sizeof(sum)) == 0);
+      }
+#endif
       /* Patch up the result to compensate for the fact that this is an IIR */
       y[i+ord  ] = -SROUND16(sum[0],SIG_SHIFT);
       _y[i  ] = sum[0];
diff --git a/opus/celt/celt_lpc.h b/opus/celt/celt_lpc.h
index a4c5fd6e..97dee82f 100644
--- a/opus/celt/celt_lpc.h
+++ b/opus/celt/celt_lpc.h
@@ -35,7 +35,7 @@
 #include "x86/celt_lpc_sse.h"
 #endif
 
-#define LPC_ORDER 24
+#define CELT_LPC_ORDER 24
 
 void _celt_lpc(opus_val16 *_lpc, const opus_val32 *ac, int p);
 
diff --git a/opus/celt/cpu_support.h b/opus/celt/cpu_support.h
index 68fc6067..9f13d8ae 100644
--- a/opus/celt/cpu_support.h
+++ b/opus/celt/cpu_support.h
@@ -35,18 +35,20 @@
   (defined(OPUS_ARM_ASM) || defined(OPUS_ARM_MAY_HAVE_NEON_INTR))
 #include "arm/armcpu.h"
 
-/* We currently support 4 ARM variants:
+/* We currently support 5 ARM variants:
  * arch[0] -> ARMv4
  * arch[1] -> ARMv5E
  * arch[2] -> ARMv6
  * arch[3] -> NEON
+ * arch[4] -> NEON+DOTPROD
  */
-#define OPUS_ARCHMASK 3
+#define OPUS_ARCHMASK 7
 
-#elif (defined(OPUS_X86_MAY_HAVE_SSE) && !defined(OPUS_X86_PRESUME_SSE)) || \
+#elif defined(OPUS_HAVE_RTCD) && \
+  ((defined(OPUS_X86_MAY_HAVE_SSE) && !defined(OPUS_X86_PRESUME_SSE)) || \
   (defined(OPUS_X86_MAY_HAVE_SSE2) && !defined(OPUS_X86_PRESUME_SSE2)) || \
   (defined(OPUS_X86_MAY_HAVE_SSE4_1) && !defined(OPUS_X86_PRESUME_SSE4_1)) || \
-  (defined(OPUS_X86_MAY_HAVE_AVX) && !defined(OPUS_X86_PRESUME_AVX))
+  (defined(OPUS_X86_MAY_HAVE_AVX2) && !defined(OPUS_X86_PRESUME_AVX2)))
 
 #include "x86/x86cpu.h"
 /* We currently support 5 x86 variants:
diff --git a/opus/celt/ecintrin.h b/opus/celt/ecintrin.h
index 2263cff6..66a4c36e 100644
--- a/opus/celt/ecintrin.h
+++ b/opus/celt/ecintrin.h
@@ -49,7 +49,11 @@
   This macro should only be used for implementing ec_ilog(), if it is defined.
   All other code should use EC_ILOG() instead.*/
 #if defined(_MSC_VER) && (_MSC_VER >= 1400)
+#if defined(_MSC_VER) && (_MSC_VER >= 1910)
+# include <intrin0.h> /* Improve compiler throughput. */
+#else
 # include <intrin.h>
+#endif
 /*In _DEBUG mode this is not an intrinsic by default.*/
 # pragma intrinsic(_BitScanReverse)
 
diff --git a/opus/celt/entdec.c b/opus/celt/entdec.c
index 0b3433ed..027aa24b 100644
--- a/opus/celt/entdec.c
+++ b/opus/celt/entdec.c
@@ -195,6 +195,27 @@ int ec_dec_icdf(ec_dec *_this,const unsigned char *_icdf,unsigned _ftb){
   return ret;
 }
 
+int ec_dec_icdf16(ec_dec *_this,const opus_uint16 *_icdf,unsigned _ftb){ /*Decodes one symbol using a table of 16-bit inverse CDF entries.*/
+  opus_uint32 r;
+  opus_uint32 d;
+  opus_uint32 s;
+  opus_uint32 t;
+  int         ret;
+  s=_this->rng; /*Initial bound is the whole current range.*/
+  d=_this->val;
+  r=s>>_ftb; /*Current range divided by 2**_ftb.*/
+  ret=-1; /*Pre-incremented in the loop, so the scan starts at _icdf[0].*/
+  do{
+    t=s; /*Remember the previous bound.*/
+    s=IMUL32(r,_icdf[++ret]); /*Next bound: r times the next table entry.*/
+  }
+  while(d<s); /*Stop at the first entry whose bound does not exceed val.*/
+  _this->val=d-s; /*Rebase val onto the selected bound.*/
+  _this->rng=t-s; /*New range is the gap between the last two bounds.*/
+  ec_dec_normalize(_this);
+  return ret; /*Index of the table entry at which the scan stopped.*/
+}
+
 opus_uint32 ec_dec_uint(ec_dec *_this,opus_uint32 _ft){
   unsigned ft;
   unsigned s;
diff --git a/opus/celt/entdec.h b/opus/celt/entdec.h
index 025fc187..c81f26fd 100644
--- a/opus/celt/entdec.h
+++ b/opus/celt/entdec.h
@@ -81,6 +81,16 @@ int ec_dec_bit_logp(ec_dec *_this,unsigned _logp);
   Return: The decoded symbol s.*/
 int ec_dec_icdf(ec_dec *_this,const unsigned char *_icdf,unsigned _ftb);
 
+/*Decodes a symbol given an "inverse" CDF table.
+  No call to ec_dec_update() is necessary after this call.
+  _icdf: The "inverse" CDF, such that symbol s falls in the range
+          [s>0?ft-_icdf[s-1]:0,ft-_icdf[s]), where ft=1<<_ftb.
+         The values must be monotonically non-increasing, and the last value
+          must be 0.
+  _ftb: The number of bits of precision in the cumulative distribution.
+  Return: The decoded symbol s.*/
+int ec_dec_icdf16(ec_dec *_this,const opus_uint16 *_icdf,unsigned _ftb);
+
 /*Extracts a raw unsigned integer with a non-power-of-2 range from the stream.
   The bits must have been encoded with ec_enc_uint().
   No call to ec_dec_update() is necessary after this call.
diff --git a/opus/celt/entenc.c b/opus/celt/entenc.c
index f1750d25..69c6f835 100644
--- a/opus/celt/entenc.c
+++ b/opus/celt/entenc.c
@@ -172,6 +172,17 @@ void ec_enc_icdf(ec_enc *_this,int _s,const unsigned char *_icdf,unsigned _ftb){
   ec_enc_normalize(_this);
 }
 
+void ec_enc_icdf16(ec_enc *_this,int _s,const opus_uint16 *_icdf,unsigned _ftb){ /*Encodes symbol _s using a table of 16-bit inverse CDF entries.*/
+  opus_uint32 r;
+  r=_this->rng>>_ftb; /*Current range divided by 2**_ftb.*/
+  if(_s>0){ /*Not the first symbol: both ends of its interval come from the table.*/
+    _this->val+=_this->rng-IMUL32(r,_icdf[_s-1]); /*Advance val by rng-r*_icdf[_s-1].*/
+    _this->rng=IMUL32(r,_icdf[_s-1]-_icdf[_s]); /*New range: r times the drop between the two adjacent entries.*/
+  }
+  else _this->rng-=IMUL32(r,_icdf[_s]); /*_s<=0 branch (expected _s==0): val is left unchanged, rng shrinks by r*_icdf[_s].*/
+  ec_enc_normalize(_this);
+}
+
 void ec_enc_uint(ec_enc *_this,opus_uint32 _fl,opus_uint32 _ft){
   unsigned  ft;
   unsigned  fl;
diff --git a/opus/celt/entenc.h b/opus/celt/entenc.h
index f502eaf6..010874bb 100644
--- a/opus/celt/entenc.h
+++ b/opus/celt/entenc.h
@@ -64,6 +64,15 @@ void ec_enc_bit_logp(ec_enc *_this,int _val,unsigned _logp);
   _ftb: The number of bits of precision in the cumulative distribution.*/
 void ec_enc_icdf(ec_enc *_this,int _s,const unsigned char *_icdf,unsigned _ftb);
 
+/*Encodes a symbol given an "inverse" CDF table.
+  _s:    The index of the symbol to encode.
+  _icdf: The "inverse" CDF, such that symbol _s falls in the range
+          [_s>0?ft-_icdf[_s-1]:0,ft-_icdf[_s]), where ft=1<<_ftb.
+         The values must be monotonically non-increasing, and the last value
+          must be 0.
+  _ftb: The number of bits of precision in the cumulative distribution.*/
+void ec_enc_icdf16(ec_enc *_this,int _s,const opus_uint16 *_icdf,unsigned _ftb);
+
 /*Encodes a raw unsigned integer in the stream.
   _fl: The integer to encode.
   _ft: The number of integers that can be encoded (one more than the max).
diff --git a/opus/celt/fixed_debug.h b/opus/celt/fixed_debug.h
index f4352952..ef2e5d02 100644
--- a/opus/celt/fixed_debug.h
+++ b/opus/celt/fixed_debug.h
@@ -167,7 +167,7 @@ static OPUS_INLINE short SHR16_(int a, int shift, char *file, int line)
 #define SHL16(a, shift) SHL16_(a, shift, __FILE__, __LINE__)
 static OPUS_INLINE short SHL16_(int a, int shift, char *file, int line)
 {
-   int res;
+   opus_int32 res;
    if (!VERIFY_SHORT(a) || !VERIFY_SHORT(shift))
    {
       fprintf (stderr, "SHL16: inputs are not short: %d %d in %s: line %d\n", a, shift, file, line);
@@ -175,7 +175,7 @@ static OPUS_INLINE short SHL16_(int a, int shift, char *file, int line)
       celt_assert(0);
 #endif
    }
-   res = a<<shift;
+   res = (opus_int32)((opus_uint32)a<<shift);
    if (!VERIFY_SHORT(res))
    {
       fprintf (stderr, "SHL16: output is not short: %d in %s: line %d\n", res, file, line);
@@ -214,15 +214,15 @@ static OPUS_INLINE int SHL32_(opus_int64 a, int shift, char *file, int line)
    opus_int64  res;
    if (!VERIFY_INT(a) || !VERIFY_SHORT(shift))
    {
-      fprintf (stderr, "SHL32: inputs are not int: %lld %d in %s: line %d\n", a, shift, file, line);
+      fprintf (stderr, "SHL32: inputs are not int: %lld %d in %s: line %d\n", (long long)a, shift, file, line);
 #ifdef FIXED_DEBUG_ASSERT
       celt_assert(0);
 #endif
    }
-   res = a<<shift;
+   res = (opus_int64)((opus_uint64)a<<shift);
    if (!VERIFY_INT(res))
    {
-      fprintf (stderr, "SHL32: output is not int: %lld<<%d = %lld in %s: line %d\n", a, shift, res, file, line);
+      fprintf (stderr, "SHL32: output is not int: %lld<<%d = %lld in %s: line %d\n", (long long)a, shift, (long long)res, file, line);
 #ifdef FIXED_DEBUG_ASSERT
       celt_assert(0);
 #endif
@@ -339,7 +339,7 @@ static OPUS_INLINE unsigned int UADD32_(opus_uint64 a, opus_uint64 b, char *file
    opus_uint64 res;
    if (!VERIFY_UINT(a) || !VERIFY_UINT(b))
    {
-      fprintf (stderr, "UADD32: inputs are not uint32: %llu %llu in %s: line %d\n", a, b, file, line);
+      fprintf (stderr, "UADD32: inputs are not uint32: %llu %llu in %s: line %d\n", (unsigned long long)a, (unsigned long long)b, file, line);
 #ifdef FIXED_DEBUG_ASSERT
       celt_assert(0);
 #endif
@@ -347,7 +347,7 @@ static OPUS_INLINE unsigned int UADD32_(opus_uint64 a, opus_uint64 b, char *file
    res = a+b;
    if (!VERIFY_UINT(res))
    {
-      fprintf (stderr, "UADD32: output is not uint32: %llu in %s: line %d\n", res, file, line);
+      fprintf (stderr, "UADD32: output is not uint32: %llu in %s: line %d\n", (unsigned long long)res, file, line);
 #ifdef FIXED_DEBUG_ASSERT
       celt_assert(0);
 #endif
@@ -363,14 +363,14 @@ static OPUS_INLINE unsigned int USUB32_(opus_uint64 a, opus_uint64 b, char *file
    opus_uint64 res;
    if (!VERIFY_UINT(a) || !VERIFY_UINT(b))
    {
-      fprintf (stderr, "USUB32: inputs are not uint32: %llu %llu in %s: line %d\n", a, b, file, line);
+      fprintf (stderr, "USUB32: inputs are not uint32: %llu %llu in %s: line %d\n", (unsigned long long)a, (unsigned long long)b, file, line);
 #ifdef FIXED_DEBUG_ASSERT
       celt_assert(0);
 #endif
    }
    if (a<b)
    {
-      fprintf (stderr, "USUB32: inputs underflow: %llu < %llu in %s: line %d\n", a, b, file, line);
+      fprintf (stderr, "USUB32: inputs underflow: %llu < %llu in %s: line %d\n", (unsigned long long)a, (unsigned long long)b, file, line);
 #ifdef FIXED_DEBUG_ASSERT
       celt_assert(0);
 #endif
@@ -378,7 +378,7 @@ static OPUS_INLINE unsigned int USUB32_(opus_uint64 a, opus_uint64 b, char *file
    res = a-b;
    if (!VERIFY_UINT(res))
    {
-      fprintf (stderr, "USUB32: output is not uint32: %llu - %llu = %llu in %s: line %d\n", a, b, res, file, line);
+      fprintf (stderr, "USUB32: output is not uint32: %llu - %llu = %llu in %s: line %d\n", (unsigned long long)a, (unsigned long long)b, (unsigned long long)res, file, line);
 #ifdef FIXED_DEBUG_ASSERT
       celt_assert(0);
 #endif
@@ -410,6 +410,51 @@ static OPUS_INLINE short MULT16_16_16(int a, int b)
    return res;
 }
 
+/* 32x32 multiplication whose result is expected to fit in 32 bits; prints a diagnostic to stderr when VERIFY_INT() fails for an input or for the product */
+static OPUS_INLINE int MULT32_32_32(opus_int64 a, opus_int64 b)
+{
+   opus_int64 res;
+   if (!VERIFY_INT(a) || !VERIFY_INT(b))
+   {
+      fprintf (stderr, "MULT32_32_32: inputs are not int: %lld %lld\n", (long long)a, (long long)b);
+#ifdef FIXED_DEBUG_ASSERT
+      celt_assert(0);
+#endif
+   }
+   res = a*b;   /* product held in an opus_int64 so the check below sees the unwrapped value */
+   if (!VERIFY_INT(res))
+   {
+      fprintf (stderr, "MULT32_32_32: output is not int: %lld\n", (long long)res);
+#ifdef FIXED_DEBUG_ASSERT
+      celt_assert(0);
+#endif
+   }
+   celt_mips+=5;   /* complexity accounting: celt_mips is the counter printed by PRINT_MIPS */
+   return res;   /* implicitly converted to the int return type */
+}
+
+static OPUS_INLINE int MULT32_32_Q16(opus_int64 a, opus_int64 b)   /* 32x32 multiplication followed by a 16-bit right shift, with checks on inputs and result */
+{
+   opus_int64 res;
+   if (!VERIFY_INT(a) || !VERIFY_INT(b))
+   {
+      fprintf (stderr, "MULT32_32_Q16: inputs are not int: %lld %lld\n", (long long)a, (long long)b);
+#ifdef FIXED_DEBUG_ASSERT
+      celt_assert(0);
+#endif
+   }
+   res = ((opus_int64)(a)*(opus_int64)(b)) >> 16;   /* product formed in opus_int64, then shifted right by 16 bits */
+   if (!VERIFY_INT(res))
+   {
+      fprintf (stderr, "MULT32_32_Q16: output is not int: %lld*%lld=%lld\n", (long long)a, (long long)b, (long long)res);
+#ifdef FIXED_DEBUG_ASSERT
+      celt_assert(0);
+#endif
+   }
+   celt_mips+=5;   /* complexity accounting: celt_mips is the counter printed by PRINT_MIPS */
+   return res;   /* implicitly converted to the int return type */
+}
+
 #define MULT16_16(a, b) MULT16_16_(a, b, __FILE__, __LINE__)
 static OPUS_INLINE int MULT16_16_(int a, int b, char *file, int line)
 {
@@ -446,7 +491,7 @@ static OPUS_INLINE int MULT16_32_QX_(int a, opus_int64 b, int Q, char *file, int
       celt_assert(0);
 #endif
    }
-   if (ABS32(b)>=((opus_val32)(1)<<(15+Q)))
+   if (ABS32(b)>=((opus_int64)(1)<<(16+Q)))
    {
       fprintf (stderr, "MULT16_32_Q%d: second operand too large: %d %d in %s: line %d\n", Q, (int)a, (int)b, file, line);
 #ifdef FIXED_DEBUG_ASSERT
@@ -479,7 +524,7 @@ static OPUS_INLINE int MULT16_32_PX_(int a, opus_int64 b, int Q, char *file, int
       celt_assert(0);
 #endif
    }
-   if (ABS32(b)>=((opus_int64)(1)<<(15+Q)))
+   if (ABS32(b)>=((opus_int64)(1)<<(16+Q)))
    {
       fprintf (stderr, "MULT16_32_Q%d: second operand too large: %d %d in %s: line %d\n\n", Q, (int)a, (int)b,file, line);
 #ifdef FIXED_DEBUG_ASSERT
@@ -786,6 +831,6 @@ static OPUS_INLINE opus_val16 SIG2WORD16_generic(celt_sig x)
 
 
 #undef PRINT_MIPS
-#define PRINT_MIPS(file) do {fprintf (file, "total complexity = %llu MIPS\n", celt_mips);} while (0);
+#define PRINT_MIPS(file) do {fprintf (file, "total complexity = %llu MIPS\n", (unsigned long long)celt_mips);} while (0);
 
 #endif
diff --git a/opus/celt/fixed_generic.h b/opus/celt/fixed_generic.h
index 5f4abda7..8f29d46b 100644
--- a/opus/celt/fixed_generic.h
+++ b/opus/celt/fixed_generic.h
@@ -57,6 +57,13 @@
 #define MULT16_32_Q15(a,b) ADD32(SHL(MULT16_16((a),SHR((b),16)),1), SHR(MULT16_16SU((a),((b)&0x0000ffff)),15))
 #endif
 
+/** 32x32 multiplication, followed by a 16-bit shift right. Result fits in 32 bits */
+#if OPUS_FAST_INT64
+#define MULT32_32_Q16(a,b) ((opus_val32)SHR((opus_int64)(a)*(opus_int64)(b),16))
+#else
+#define MULT32_32_Q16(a,b) (ADD32(ADD32(ADD32((opus_val32)(SHR32(((opus_uint32)((a)&0x0000ffff)*(opus_uint32)((b)&0x0000ffff)),16)), MULT16_16SU(SHR32(a,16),((b)&0x0000ffff))), MULT16_16SU(SHR32(b,16),((a)&0x0000ffff))), SHL32(MULT16_16(SHR32(a,16),SHR32(b,16)),16)))
+#endif
+
 /** 32x32 multiplication, followed by a 31-bit shift right. Results fits in 32 bits */
 #if OPUS_FAST_INT64
 #define MULT32_32_Q31(a,b) ((opus_val32)SHR((opus_int64)(a)*(opus_int64)(b),31))
@@ -102,9 +109,9 @@
 
 #define SATURATE16(x) (EXTRACT16((x)>32767 ? 32767 : (x)<-32768 ? -32768 : (x)))
 
-/** Shift by a and round-to-neareast 32-bit value. Result is a 16-bit value */
+/** Shift by a and round-to-nearest 32-bit value. Result is a 16-bit value */
 #define ROUND16(x,a) (EXTRACT16(PSHR32((x),(a))))
-/** Shift by a and round-to-neareast 32-bit value. Result is a saturated 16-bit value */
+/** Shift by a and round-to-nearest 32-bit value. Result is a saturated 16-bit value */
 #define SROUND16(x,a) EXTRACT16(SATURATE(PSHR32(x,a), 32767));
 
 /** Divide by two */
@@ -131,6 +138,9 @@
 /** 16x16 multiplication where the result fits in 16 bits */
 #define MULT16_16_16(a,b)     ((((opus_val16)(a))*((opus_val16)(b))))
 
+/** 32x32 multiplication where the result fits in 32 bits */
+#define MULT32_32_32(a,b)     ((((opus_val32)(a))*((opus_val32)(b))))
+
 /* (opus_val32)(opus_val16) gives TI compiler a hint that it's 16x16->32 multiply */
 /** 16x16 multiplication where the result fits in 32 bits */
 #define MULT16_16(a,b)     (((opus_val32)(opus_val16)(a))*((opus_val32)(opus_val16)(b)))
diff --git a/opus/celt/float_cast.h b/opus/celt/float_cast.h
index 889dae96..8915a5fd 100644
--- a/opus/celt/float_cast.h
+++ b/opus/celt/float_cast.h
@@ -67,7 +67,39 @@
 #include <xmmintrin.h>
 static OPUS_INLINE opus_int32 float2int(float x) {return _mm_cvt_ss2si(_mm_set_ss(x));}
 
-#elif defined(HAVE_LRINTF)
+#elif (defined(_MSC_VER) && _MSC_VER >= 1400) && (defined(_M_X64) || (defined(_M_IX86_FP) && _M_IX86_FP >= 1))
+
+        #include <xmmintrin.h>
+        static OPUS_INLINE opus_int32 float2int(float value)
+        {
+                /* _mm_load_ss will generate the same code as _mm_set_ss
+                ** in _MSC_VER >= 1914 with /O2, so keep _mm_load_ss
+                ** for backward compatibility.
+                */
+                return _mm_cvtss_si32(_mm_load_ss(&value));
+        }
+
+#elif (defined(_MSC_VER) && _MSC_VER >= 1400) && defined (_M_IX86)
+
+        #include <math.h>
+
+        /*      Win32 doesn't seem to have these functions.
+        **      Therefore implement OPUS_INLINE versions of these functions here.
+        */
+
+        static OPUS_INLINE opus_int32
+        float2int (float flt)
+        {       int intgr;
+
+                _asm
+                {       fld flt
+                        fistp intgr
+                } ;
+
+                return intgr ;
+        }
+
+#elif defined(HAVE_LRINTF) && defined(__STDC_VERSION__) && __STDC_VERSION__ >= 199901L
 
 /*      These defines enable functionality introduced with the 1999 ISO C
 **      standard. They must be defined before the inclusion of math.h to
@@ -85,7 +117,7 @@ static OPUS_INLINE opus_int32 float2int(float x) {return _mm_cvt_ss2si(_mm_set_s
 #include <math.h>
 #define float2int(x) lrintf(x)
 
-#elif (defined(HAVE_LRINT))
+#elif defined(HAVE_LRINT) && defined(__STDC_VERSION__) && __STDC_VERSION__ >= 199901L
 
 #define _ISOC9X_SOURCE 1
 #define _ISOC99_SOURCE 1
@@ -96,32 +128,6 @@ static OPUS_INLINE opus_int32 float2int(float x) {return _mm_cvt_ss2si(_mm_set_s
 #include <math.h>
 #define float2int(x) lrint(x)
 
-#elif (defined(_MSC_VER) && _MSC_VER >= 1400) && (defined(_M_X64) || (defined(_M_IX86_FP) && _M_IX86_FP >= 1))
-        #include <xmmintrin.h>
-
-        static __inline long int float2int(float value)
-        {
-                return _mm_cvtss_si32(_mm_load_ss(&value));
-        }
-#elif (defined(_MSC_VER) && _MSC_VER >= 1400) && defined (_M_IX86)
-        #include <math.h>
-
-        /*      Win32 doesn't seem to have these functions.
-        **      Therefore implement OPUS_INLINE versions of these functions here.
-        */
-
-        static __inline long int
-        float2int (float flt)
-        {       int intgr;
-
-                _asm
-                {       fld flt
-                        fistp intgr
-                } ;
-
-                return intgr ;
-        }
-
 #else
 
 #if (defined(__GNUC__) && defined(__STDC__) && __STDC__ && __STDC_VERSION__ >= 199901L)
diff --git a/opus/celt/kiss_fft.h b/opus/celt/kiss_fft.h
index bffa2bfa..267f72f9 100644
--- a/opus/celt/kiss_fft.h
+++ b/opus/celt/kiss_fft.h
@@ -52,6 +52,10 @@ extern "C" {
 #  define kiss_fft_scalar opus_int32
 #  define kiss_twiddle_scalar opus_int16
 
+/* Some 32-bit CPUs would load/store a kiss_twiddle_cpx with a single memory
+ * access, and could benefit from additional alignment.
+ */
+#  define KISS_TWIDDLE_CPX_ALIGNMENT (sizeof(opus_int32))
 
 #else
 # ifndef kiss_fft_scalar
@@ -62,6 +66,12 @@ extern "C" {
 # endif
 #endif
 
+#if defined(__GNUC__) && defined(KISS_TWIDDLE_CPX_ALIGNMENT)
+#define KISS_TWIDDLE_CPX_ALIGNED __attribute__((aligned(KISS_TWIDDLE_CPX_ALIGNMENT)))
+#else
+#define KISS_TWIDDLE_CPX_ALIGNED
+#endif
+
 typedef struct {
     kiss_fft_scalar r;
     kiss_fft_scalar i;
@@ -70,7 +80,7 @@ typedef struct {
 typedef struct {
    kiss_twiddle_scalar r;
    kiss_twiddle_scalar i;
-}kiss_twiddle_cpx;
+} KISS_TWIDDLE_CPX_ALIGNED kiss_twiddle_cpx;
 
 #define MAXFACTORS 8
 /* e.g. an fft of length 128 has 4 factors
diff --git a/opus/celt/laplace.c b/opus/celt/laplace.c
index a7bca874..21809666 100644
--- a/opus/celt/laplace.c
+++ b/opus/celt/laplace.c
@@ -132,3 +132,104 @@ int ec_laplace_decode(ec_dec *dec, unsigned fs, int decay)
    ec_dec_update(dec, fl, IMIN(fl+fs,32768), 32768);
    return val;
 }
+
+void ec_laplace_encode_p0(ec_enc *enc, int value, opus_uint16 p0, opus_uint16 decay)
+{
+   int s;
+   opus_uint16 sign_icdf[3];
+   sign_icdf[0] = 32768-p0;
+   sign_icdf[1] = sign_icdf[0]/2;
+   sign_icdf[2] = 0;
+   s = value == 0 ? 0 : (value > 0 ? 1 : 2);
+   ec_enc_icdf16(enc, s, sign_icdf, 15);
+   value = abs(value);
+   if (value)
+   {
+      int i;
+      opus_uint16 icdf[8];
+      icdf[0] = IMAX(7, decay);
+      for (i=1;i<7;i++)
+      {
+         icdf[i] = IMAX(7-i, (icdf[i-1] * (opus_int32)decay) >> 15);
+      }
+      icdf[7] = 0;
+      value--;
+      do {
+         ec_enc_icdf16(enc, IMIN(value, 7), icdf, 15);
+         value -= 7;
+      } while (value >= 0);
+   }
+}
+
+int ec_laplace_decode_p0(ec_dec *dec, opus_uint16 p0, opus_uint16 decay)
+{
+   int s;
+   int value;
+   opus_uint16 sign_icdf[3];
+   sign_icdf[0] = 32768-p0;
+   sign_icdf[1] = sign_icdf[0]/2;
+   sign_icdf[2] = 0;
+   s = ec_dec_icdf16(dec, sign_icdf, 15);
+   if (s==2) s = -1;
+   if (s != 0)
+   {
+      int i;
+      int v;
+      opus_uint16 icdf[8];
+      icdf[0] = IMAX(7, decay);
+      for (i=1;i<7;i++)
+      {
+         icdf[i] = IMAX(7-i, (icdf[i-1] * (opus_int32)decay) >> 15);
+      }
+      icdf[7] = 0;
+      value = 1;
+      do {
+         v = ec_dec_icdf16(dec, icdf, 15);
+         value += v;
+      } while (v == 7);
+      return s*value;
+   } else return 0;
+}
+
+#if 0
+
+#include <stdio.h>
+#define NB_VALS 10
+#define DATA_SIZE 10000
+int main() {
+   ec_enc enc;
+   ec_dec dec;
+   unsigned char *ptr;
+   int i;
+   int decay, p0;
+   int val[NB_VALS] = {6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
+   /*for (i=0;i<NB_VALS;i++) {
+      val[i] = -log(rand()/(float)RAND_MAX);
+      if (rand()%2) val[i] = -val[i];
+   }*/
+   p0 = 16000;
+   decay = 16000;
+   ptr = (unsigned char *)malloc(DATA_SIZE);
+   ec_enc_init(&enc,ptr,DATA_SIZE);
+   for (i=0;i<NB_VALS;i++) {
+      printf("%d ", val[i]);
+   }
+   printf("\n");
+   for (i=0;i<NB_VALS;i++) {
+      ec_laplace_encode_p0(&enc, val[i], p0, decay);
+   }
+
+   ec_enc_done(&enc);
+
+   ec_dec_init(&dec,ec_get_buffer(&enc),ec_range_bytes(&enc));
+
+   for (i=0;i<NB_VALS;i++) {
+      val[i] = ec_laplace_decode_p0(&dec, p0, decay);
+   }
+   for (i=0;i<NB_VALS;i++) {
+      printf("%d ", val[i]);
+   }
+   printf("\n");
+}
+
+#endif
diff --git a/opus/celt/laplace.h b/opus/celt/laplace.h
index 46c14b5d..8010ad97 100644
--- a/opus/celt/laplace.h
+++ b/opus/celt/laplace.h
@@ -26,6 +26,9 @@
    SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */
 
+#ifndef LAPLACE_H
+#define LAPLACE_H
+
 #include "entenc.h"
 #include "entdec.h"
 
@@ -46,3 +49,9 @@ void ec_laplace_encode(ec_enc *enc, int *value, unsigned fs, int decay);
  @return Value decoded
  */
 int ec_laplace_decode(ec_dec *dec, unsigned fs, int decay);
+
+
+int ec_laplace_decode_p0(ec_dec *dec, opus_uint16 p0, opus_uint16 decay);
+void ec_laplace_encode_p0(ec_enc *enc, int value, opus_uint16 p0, opus_uint16 decay);
+
+#endif
diff --git a/opus/celt/mathops.h b/opus/celt/mathops.h
index 5e86ff0d..e2eece29 100644
--- a/opus/celt/mathops.h
+++ b/opus/celt/mathops.h
@@ -137,7 +137,7 @@ static OPUS_INLINE float celt_log2(float x)
    } in;
    in.f = x;
    integer = (in.i>>23)-127;
-   in.i -= integer<<23;
+   in.i -= (opus_uint32)integer<<23;
    frac = in.f - 1.5f;
    frac = -0.41445418f + frac*(0.95909232f
           + frac*(-0.33951290f + frac*0.16541097f));
@@ -153,14 +153,14 @@ static OPUS_INLINE float celt_exp2(float x)
       float f;
       opus_uint32 i;
    } res;
-   integer = floor(x);
+   integer = (int)floor(x);
    if (integer < -50)
       return 0;
    frac = x-integer;
    /* K0 = 1, K1 = log(2), K2 = 3-4*log(2), K3 = 3*log(2) - 2 */
    res.f = 0.99992522f + frac * (0.69583354f
            + frac * (0.22606716f + 0.078024523f*frac));
-   res.i = (res.i + (integer<<23)) & 0x7fffffff;
+   res.i = (res.i + ((opus_uint32)integer<<23)) & 0x7fffffff;
    return res.f;
 }
 
@@ -230,6 +230,12 @@ static OPUS_INLINE opus_val32 celt_exp2_frac(opus_val16 x)
    frac = SHL16(x, 4);
    return ADD16(D0, MULT16_16_Q15(frac, ADD16(D1, MULT16_16_Q15(frac, ADD16(D2 , MULT16_16_Q15(D3,frac))))));
 }
+
+#undef D0
+#undef D1
+#undef D2
+#undef D3
+
 /** Base-2 exponential approximation (2^x). (Q10 input, Q16 output) */
 static OPUS_INLINE opus_val32 celt_exp2(opus_val16 x)
 {
diff --git a/opus/celt/mips/celt_mipsr1.h b/opus/celt/mips/celt_mipsr1.h
index c332fe04..d1b25c20 100644
--- a/opus/celt/mips/celt_mipsr1.h
+++ b/opus/celt/mips/celt_mipsr1.h
@@ -27,8 +27,8 @@
    SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */
 
-#ifndef __CELT_MIPSR1_H__
-#define __CELT_MIPSR1_H__
+#ifndef CELT_MIPSR1_H__
+#define CELT_MIPSR1_H__
 
 #ifdef HAVE_CONFIG_H
 #include "config.h"
@@ -149,4 +149,4 @@ void comb_filter(opus_val32 *y, opus_val32 *x, int T0, int T1, int N,
    }
 }
 
-#endif /* __CELT_MIPSR1_H__ */
+#endif /* CELT_MIPSR1_H__ */
diff --git a/opus/celt/mips/mdct_mipsr1.h b/opus/celt/mips/mdct_mipsr1.h
index 2934dab7..7456c181 100644
--- a/opus/celt/mips/mdct_mipsr1.h
+++ b/opus/celt/mips/mdct_mipsr1.h
@@ -38,8 +38,8 @@
    MDCT implementation in FFMPEG, but has differences in signs, ordering
    and scaling in many places.
 */
-#ifndef __MDCT_MIPSR1_H__
-#define __MDCT_MIPSR1_H__
+#ifndef MDCT_MIPSR1_H__
+#define MDCT_MIPSR1_H__
 
 #ifndef SKIP_CONFIG_H
 #ifdef HAVE_CONFIG_H
@@ -285,4 +285,4 @@ void clt_mdct_backward(const mdct_lookup *l, kiss_fft_scalar *in, kiss_fft_scala
       }
    }
 }
-#endif /* __MDCT_MIPSR1_H__ */
+#endif /* MDCT_MIPSR1_H__ */
diff --git a/opus/celt/mips/vq_mipsr1.h b/opus/celt/mips/vq_mipsr1.h
index f26a33e7..1621c562 100644
--- a/opus/celt/mips/vq_mipsr1.h
+++ b/opus/celt/mips/vq_mipsr1.h
@@ -26,8 +26,8 @@
    SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */
 
-#ifndef __VQ_MIPSR1_H__
-#define __VQ_MIPSR1_H__
+#ifndef VQ_MIPSR1_H__
+#define VQ_MIPSR1_H__
 
 #ifdef HAVE_CONFIG_H
 #include "config.h"
@@ -113,4 +113,4 @@ void renormalise_vector(celt_norm *X, int N, opus_val16 gain, int arch)
    /*return celt_sqrt(E);*/
 }
 
-#endif /* __VQ_MIPSR1_H__ */
+#endif /* VQ_MIPSR1_H__ */
diff --git a/opus/celt/modes.c b/opus/celt/modes.c
index 390c5e8a..23f7cde6 100644
--- a/opus/celt/modes.c
+++ b/opus/celt/modes.c
@@ -173,7 +173,10 @@ static void compute_allocation_table(CELTMode *mode)
    mode->nbAllocVectors = BITALLOC_SIZE;
    allocVectors = opus_alloc(sizeof(unsigned char)*(BITALLOC_SIZE*mode->nbEBands));
    if (allocVectors==NULL)
+   {
+      mode->allocVectors = NULL;
       return;
+   }
 
    /* Check for standard mode */
    if (mode->Fs == 400*(opus_int32)mode->shortMdctSize)
diff --git a/opus/celt/os_support.h b/opus/celt/os_support.h
index a2171971..7d2d3781 100644
--- a/opus/celt/os_support.h
+++ b/opus/celt/os_support.h
@@ -39,10 +39,9 @@
 #include "opus_defines.h"
 
 #include <string.h>
-#include <stdio.h>
 #include <stdlib.h>
 
-/** Opus wrapper for malloc(). To do your own dynamic allocation, all you need to do is replace this function and opus_free */
+/** Opus wrapper for malloc(). To do your own dynamic allocation replace this function, opus_realloc, and opus_free */
 #ifndef OVERRIDE_OPUS_ALLOC
 static OPUS_INLINE void *opus_alloc (size_t size)
 {
@@ -50,7 +49,15 @@ static OPUS_INLINE void *opus_alloc (size_t size)
 }
 #endif
 
-/** Same as celt_alloc(), except that the area is only needed inside a CELT call (might cause problem with wideband though) */
+#ifndef OVERRIDE_OPUS_REALLOC
+static OPUS_INLINE void *opus_realloc (void *ptr, size_t size)
+{
+   return realloc(ptr, size);
+}
+#endif
+
+/** Used only for non-threadsafe pseudostack.
+    If desired, this can always return the same area of memory rather than allocating a new one every time. */
 #ifndef OVERRIDE_OPUS_ALLOC_SCRATCH
 static OPUS_INLINE void *opus_alloc_scratch (size_t size)
 {
@@ -59,7 +66,7 @@ static OPUS_INLINE void *opus_alloc_scratch (size_t size)
 }
 #endif
 
-/** Opus wrapper for free(). To do your own dynamic allocation, all you need to do is replace this function and opus_alloc */
+/** Opus wrapper for free(). To do your own dynamic allocation replace this function, opus_realloc, and opus_alloc */
 #ifndef OVERRIDE_OPUS_FREE
 static OPUS_INLINE void opus_free (void *ptr)
 {
diff --git a/opus/celt/pitch.c b/opus/celt/pitch.c
index 872582a4..e33c60a3 100644
--- a/opus/celt/pitch.c
+++ b/opus/celt/pitch.c
@@ -161,17 +161,26 @@ void pitch_downsample(celt_sig * OPUS_RESTRICT x[], opus_val16 * OPUS_RESTRICT x
       shift=0;
    if (C==2)
       shift++;
-#endif
    for (i=1;i<len>>1;i++)
-      x_lp[i] = SHR32(HALF32(HALF32(x[0][(2*i-1)]+x[0][(2*i+1)])+x[0][2*i]), shift);
-   x_lp[0] = SHR32(HALF32(HALF32(x[0][1])+x[0][0]), shift);
+      x_lp[i] = SHR32(x[0][(2*i-1)], shift+2) + SHR32(x[0][(2*i+1)], shift+2) + SHR32(x[0][2*i], shift+1);
+   x_lp[0] = SHR32(x[0][1], shift+2) + SHR32(x[0][0], shift+1);
    if (C==2)
    {
       for (i=1;i<len>>1;i++)
-         x_lp[i] += SHR32(HALF32(HALF32(x[1][(2*i-1)]+x[1][(2*i+1)])+x[1][2*i]), shift);
-      x_lp[0] += SHR32(HALF32(HALF32(x[1][1])+x[1][0]), shift);
+         x_lp[i] += SHR32(x[1][(2*i-1)], shift+2) + SHR32(x[1][(2*i+1)], shift+2) + SHR32(x[1][2*i], shift+1);
+      x_lp[0] += SHR32(x[1][1], shift+2) + SHR32(x[1][0], shift+1);
    }
-
+#else
+   for (i=1;i<len>>1;i++)
+      x_lp[i] = .25f*x[0][(2*i-1)] + .25f*x[0][(2*i+1)] + .5f*x[0][2*i];
+   x_lp[0] = .25f*x[0][1] + .5f*x[0][0];
+   if (C==2)
+   {
+      for (i=1;i<len>>1;i++)
+         x_lp[i] += .25f*x[1][(2*i-1)] + .25f*x[1][(2*i+1)] + .5f*x[1][2*i];
+      x_lp[0] += .25f*x[1][1] + .5f*x[1][0];
+   }
+#endif
    _celt_autocorr(x_lp, ac, NULL, 0,
                   4, len>>1, arch);
 
@@ -249,11 +258,20 @@ celt_pitch_xcorr_c(const opus_val16 *_x, const opus_val16 *_y,
    opus_val32 maxcorr=1;
 #endif
    celt_assert(max_pitch>0);
-   celt_sig_assert((((unsigned char *)_x-(unsigned char *)NULL)&3)==0);
+   celt_sig_assert(((size_t)_x&3)==0);
    for (i=0;i<max_pitch-3;i+=4)
    {
       opus_val32 sum[4]={0,0,0,0};
-      xcorr_kernel(_x, _y+i, sum, len, arch);
+#if defined(OPUS_CHECK_ASM) && defined(FIXED_POINT)
+      {
+         opus_val32 sum_c[4]={0,0,0,0};
+         xcorr_kernel_c(_x, _y+i, sum_c, len);
+#endif
+         xcorr_kernel(_x, _y+i, sum, len, arch);
+#if defined(OPUS_CHECK_ASM) && defined(FIXED_POINT)
+         celt_assert(memcmp(sum, sum_c, sizeof(sum)) == 0);
+      }
+#endif
       xcorr[i]=sum[0];
       xcorr[i+1]=sum[1];
       xcorr[i+2]=sum[2];
diff --git a/opus/celt/pitch.h b/opus/celt/pitch.h
index e425f56a..dd0e2beb 100644
--- a/opus/celt/pitch.h
+++ b/opus/celt/pitch.h
@@ -189,4 +189,15 @@ celt_pitch_xcorr_c(const opus_val16 *_x, const opus_val16 *_y,
 # define celt_pitch_xcorr celt_pitch_xcorr_c
 #endif
 
+#ifdef NON_STATIC_COMB_FILTER_CONST_C
+void comb_filter_const_c(opus_val32 *y, opus_val32 *x, int T, int N,
+                         opus_val16 g10, opus_val16 g11, opus_val16 g12);
+#endif
+
+#ifndef OVERRIDE_COMB_FILTER_CONST
+# define comb_filter_const(y, x, T, N, g10, g11, g12, arch) \
+    ((void)(arch),comb_filter_const_c(y, x, T, N, g10, g11, g12))
+#endif
+
+
 #endif
diff --git a/opus/celt/rate.c b/opus/celt/rate.c
index 465e1ba2..7f7ad3fa 100644
--- a/opus/celt/rate.c
+++ b/opus/celt/rate.c
@@ -356,6 +356,8 @@ static OPUS_INLINE int interp_bits2pulses(const CELTMode *m, int start, int end,
             else
                depth_threshold = 0;
 #ifdef FUZZING
+            (void)signalBandwidth;
+            (void)depth_threshold;
             if ((rand()&0x1) == 0)
 #else
             if (codedBands<=start+2 || (band_bits > (depth_threshold*band_width<<LM<<BITRES)>>4 && j<=signalBandwidth))
diff --git a/opus/celt/stack_alloc.h b/opus/celt/stack_alloc.h
index 2b51c8d8..e2739bdf 100644
--- a/opus/celt/stack_alloc.h
+++ b/opus/celt/stack_alloc.h
@@ -40,7 +40,7 @@
 #endif
 
 #ifdef USE_ALLOCA
-# ifdef WIN32
+# ifdef _WIN32
 #  include <malloc.h>
 # else
 #  ifdef HAVE_ALLOCA_H
@@ -102,7 +102,7 @@
 
 #define VARDECL(type, var) type *var
 
-# ifdef WIN32
+# ifdef _WIN32
 #  define ALLOC(var, size, type) var = ((type*)_alloca(sizeof(type)*(size)))
 # else
 #  define ALLOC(var, size, type) var = ((type*)alloca(sizeof(type)*(size)))
@@ -141,7 +141,7 @@ extern char *global_stack_top;
 #else
 
 #define ALIGN(stack, size) ((stack) += ((size) - (long)(stack)) & ((size) - 1))
-#define PUSH(stack, size, type) (ALIGN((stack),sizeof(type)/sizeof(char)),(stack)+=(size)*(sizeof(type)/sizeof(char)),(type*)((stack)-(size)*(sizeof(type)/sizeof(char))))
+#define PUSH(stack, size, type) (ALIGN((stack),sizeof(type)/(sizeof(char))),(stack)+=(size)*(sizeof(type)/(sizeof(char))),(type*)((stack)-(size)*(sizeof(type)/(sizeof(char)))))
 #if 0 /* Set this to 1 to instrument pseudostack usage */
 #define RESTORE_STACK (printf("%ld %s:%d\n", global_stack-scratch_ptr, __FILE__, __LINE__),global_stack = _saved_stack)
 #else
diff --git a/opus/celt/tests/test_unit_cwrs32.c b/opus/celt/tests/test_unit_cwrs32.c
index 36dd8af5..f6b8ac4b 100644
--- a/opus/celt/tests/test_unit_cwrs32.c
+++ b/opus/celt/tests/test_unit_cwrs32.c
@@ -157,5 +157,6 @@ int main(void){
       /*printf("\n");*/
     }
   }
+  RESTORE_STACK;
   return 0;
 }
diff --git a/opus/celt/tests/test_unit_dft.c b/opus/celt/tests/test_unit_dft.c
index 70f8f493..ad6c60a0 100644
--- a/opus/celt/tests/test_unit_dft.c
+++ b/opus/celt/tests/test_unit_dft.c
@@ -144,8 +144,9 @@ void test1d(int nfft,int isinverse,int arch)
 
 int main(int argc,char ** argv)
 {
+    int arch;
     ALLOC_STACK;
-    int arch = opus_select_arch();
+    arch = opus_select_arch();
 
     if (argc>1) {
         int k;
@@ -175,5 +176,6 @@ int main(int argc,char ** argv)
         test1d(480,1,arch);
 #endif
     }
+    RESTORE_STACK;
     return ret;
 }
diff --git a/opus/celt/tests/test_unit_entropy.c b/opus/celt/tests/test_unit_entropy.c
index 7f674529..b1619b74 100644
--- a/opus/celt/tests/test_unit_entropy.c
+++ b/opus/celt/tests/test_unit_entropy.c
@@ -104,7 +104,7 @@ int main(int _argc,char **_argv){
   nbits=ec_tell_frac(&enc);
   ec_enc_done(&enc);
   fprintf(stderr,
-   "Encoded %0.2lf bits of entropy to %0.2lf bits (%0.3lf%% wasted).\n",
+   "Encoded %0.2f bits of entropy to %0.2f bits (%0.3f%% wasted).\n",
    entropy,ldexp(nbits,-3),100*(nbits-ldexp(entropy,3))/nbits);
   fprintf(stderr,"Packed to %li bytes.\n",(long)ec_range_bytes(&enc));
   ec_dec_init(&dec,ptr,DATA_SIZE);
@@ -129,7 +129,7 @@ int main(int _argc,char **_argv){
   nbits2=ec_tell_frac(&dec);
   if(nbits!=nbits2){
     fprintf(stderr,
-     "Reported number of bits used was %0.2lf, should be %0.2lf.\n",
+     "Reported number of bits used was %0.2f, should be %0.2f.\n",
      ldexp(nbits2,-3),ldexp(nbits,-3));
     ret=-1;
   }
diff --git a/opus/celt/tests/test_unit_laplace.c b/opus/celt/tests/test_unit_laplace.c
index 727bf012..9832e5e5 100644
--- a/opus/celt/tests/test_unit_laplace.c
+++ b/opus/celt/tests/test_unit_laplace.c
@@ -89,5 +89,6 @@ int main(void)
    }
 
    free(ptr);
+   RESTORE_STACK;
    return ret;
 }
diff --git a/opus/celt/tests/test_unit_mathops.c b/opus/celt/tests/test_unit_mathops.c
index 874e9adf..0615448d 100644
--- a/opus/celt/tests/test_unit_mathops.c
+++ b/opus/celt/tests/test_unit_mathops.c
@@ -143,7 +143,7 @@ void testbitexactlog2tan(void)
 void testlog2(void)
 {
    float x;
-   for (x=0.001;x<1677700.0;x+=(x/8.0))
+   for (x=0.001f;x<1677700.0;x+=(x/8.0))
    {
       float error = fabs((1.442695040888963387*log(x))-celt_log2(x));
       if (error>0.0009)
@@ -157,7 +157,7 @@ void testlog2(void)
 void testexp2(void)
 {
    float x;
-   for (x=-11.0;x<24.0;x+=0.0007)
+   for (x=-11.0;x<24.0;x+=0.0007f)
    {
       float error = fabs(x-(1.442695040888963387*log(celt_exp2(x))));
       if (error>0.0002)
@@ -171,7 +171,7 @@ void testexp2(void)
 void testexp2log2(void)
 {
    float x;
-   for (x=-11.0;x<24.0;x+=0.0007)
+   for (x=-11.0;x<24.0;x+=0.0007f)
    {
       float error = fabs(x-(celt_log2(celt_exp2(x))));
       if (error>0.001)
diff --git a/opus/celt/tests/test_unit_mdct.c b/opus/celt/tests/test_unit_mdct.c
index 4a563ccf..70dc042e 100644
--- a/opus/celt/tests/test_unit_mdct.c
+++ b/opus/celt/tests/test_unit_mdct.c
@@ -184,8 +184,9 @@ void test1d(int nfft,int isinverse,int arch)
 
 int main(int argc,char ** argv)
 {
+    int arch;
     ALLOC_STACK;
-    int arch = opus_select_arch();
+    arch = opus_select_arch();
 
     if (argc>1) {
         int k;
@@ -223,5 +224,6 @@ int main(int argc,char ** argv)
         test1d(1920,1,arch);
 #endif
     }
+    RESTORE_STACK;
     return ret;
 }
diff --git a/opus/celt/tests/test_unit_rotation.c b/opus/celt/tests/test_unit_rotation.c
index 8a31b3f2..6d05499b 100644
--- a/opus/celt/tests/test_unit_rotation.c
+++ b/opus/celt/tests/test_unit_rotation.c
@@ -82,5 +82,6 @@ int main(void)
    test_rotation(23, 5);
    test_rotation(50, 3);
    test_rotation(80, 1);
+   RESTORE_STACK;
    return ret;
 }
diff --git a/opus/celt/x86/celt_lpc_sse.h b/opus/celt/x86/celt_lpc_sse.h
index 7d1ecf75..90e69ecf 100644
--- a/opus/celt/x86/celt_lpc_sse.h
+++ b/opus/celt/x86/celt_lpc_sse.h
@@ -33,7 +33,6 @@
 #endif
 
 #if defined(OPUS_X86_MAY_HAVE_SSE4_1) && defined(FIXED_POINT)
-#define OVERRIDE_CELT_FIR
 
 void celt_fir_sse4_1(
          const opus_val16 *x,
@@ -44,10 +43,11 @@ void celt_fir_sse4_1(
          int arch);
 
 #if defined(OPUS_X86_PRESUME_SSE4_1)
+#define OVERRIDE_CELT_FIR
 #define celt_fir(x, num, y, N, ord, arch) \
     ((void)arch, celt_fir_sse4_1(x, num, y, N, ord, arch))
 
-#else
+#elif defined(OPUS_HAVE_RTCD)
 
 extern void (*const CELT_FIR_IMPL[OPUS_ARCHMASK + 1])(
          const opus_val16 *x,
@@ -57,6 +57,7 @@ extern void (*const CELT_FIR_IMPL[OPUS_ARCHMASK + 1])(
          int ord,
          int arch);
 
+#define OVERRIDE_CELT_FIR
 #  define celt_fir(x, num, y, N, ord, arch) \
     ((*CELT_FIR_IMPL[(arch) & OPUS_ARCHMASK])(x, num, y, N, ord, arch))
 
diff --git a/opus/celt/x86/celt_lpc_sse4_1.c b/opus/celt/x86/celt_lpc_sse4_1.c
index 54785688..daf59d24 100644
--- a/opus/celt/x86/celt_lpc_sse4_1.c
+++ b/opus/celt/x86/celt_lpc_sse4_1.c
@@ -64,9 +64,16 @@ void celt_fir_sse4_1(const opus_val16 *x,
    {
       opus_val32 sums[4] = {0};
       __m128i vecSum, vecX;
-
-      xcorr_kernel(rnum, x+i-ord, sums, ord, arch);
-
+#if defined(OPUS_CHECK_ASM)
+      {
+         opus_val32 sums_c[4] = {0};
+         xcorr_kernel_c(rnum, x+i-ord, sums_c, ord);
+#endif
+         xcorr_kernel(rnum, x+i-ord, sums, ord, arch);
+#if defined(OPUS_CHECK_ASM)
+         celt_assert(memcmp(sums, sums_c, sizeof(sums)) == 0);
+      }
+#endif
       vecSum = _mm_loadu_si128((__m128i *)sums);
       vecSum = _mm_add_epi32(vecSum, vecNoA);
       vecSum = _mm_srai_epi32(vecSum, SIG_SHIFT);
diff --git a/opus/celt/x86/pitch_avx.c b/opus/celt/x86/pitch_avx.c
new file mode 100644
index 00000000..f731762d
--- /dev/null
+++ b/opus/celt/x86/pitch_avx.c
@@ -0,0 +1,101 @@
+/* Copyright (c) 2023 Amazon */
+/*
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions
+   are met:
+
+   - Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+
+   - Redistributions in binary form must reproduce the above copyright
+   notice, this list of conditions and the following disclaimer in the
+   documentation and/or other materials provided with the distribution.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+
+#include <immintrin.h>
+#include "x86cpu.h"
+#include "pitch.h"
+
+#if defined(OPUS_X86_MAY_HAVE_AVX2) && !defined(FIXED_POINT)
+
+/* Like the "regular" xcorr_kernel(), but computes 8 results at a time. */
+static void xcorr_kernel_avx(const float *x, const float *y, float sum[8], int len)
+{
+    __m256 xsum0, xsum1, xsum2, xsum3, xsum4, xsum5, xsum6, xsum7;
+    xsum7 = xsum6 = xsum5 = xsum4 = xsum3 = xsum2 = xsum1 = xsum0 = _mm256_setzero_ps();
+    int i;
+    __m256 x0;
+    /* Compute 8 inner products using partial sums. */
+    for (i=0;i<len-7;i+=8)
+    {
+        x0 = _mm256_loadu_ps(x+i);
+        xsum0 = _mm256_fmadd_ps(x0, _mm256_loadu_ps(y+i  ), xsum0);
+        xsum1 = _mm256_fmadd_ps(x0, _mm256_loadu_ps(y+i+1), xsum1);
+        xsum2 = _mm256_fmadd_ps(x0, _mm256_loadu_ps(y+i+2), xsum2);
+        xsum3 = _mm256_fmadd_ps(x0, _mm256_loadu_ps(y+i+3), xsum3);
+        xsum4 = _mm256_fmadd_ps(x0, _mm256_loadu_ps(y+i+4), xsum4);
+        xsum5 = _mm256_fmadd_ps(x0, _mm256_loadu_ps(y+i+5), xsum5);
+        xsum6 = _mm256_fmadd_ps(x0, _mm256_loadu_ps(y+i+6), xsum6);
+        xsum7 = _mm256_fmadd_ps(x0, _mm256_loadu_ps(y+i+7), xsum7);
+    }
+    if (i != len) {
+        static const int mask[15] = {-1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0, 0};
+        __m256i m;
+        m = _mm256_loadu_si256((__m256i*)(void*)(mask + 7+i-len));
+        x0 = _mm256_maskload_ps(x+i, m);
+        xsum0 = _mm256_fmadd_ps(x0, _mm256_maskload_ps(y+i  , m), xsum0);
+        xsum1 = _mm256_fmadd_ps(x0, _mm256_maskload_ps(y+i+1, m), xsum1);
+        xsum2 = _mm256_fmadd_ps(x0, _mm256_maskload_ps(y+i+2, m), xsum2);
+        xsum3 = _mm256_fmadd_ps(x0, _mm256_maskload_ps(y+i+3, m), xsum3);
+        xsum4 = _mm256_fmadd_ps(x0, _mm256_maskload_ps(y+i+4, m), xsum4);
+        xsum5 = _mm256_fmadd_ps(x0, _mm256_maskload_ps(y+i+5, m), xsum5);
+        xsum6 = _mm256_fmadd_ps(x0, _mm256_maskload_ps(y+i+6, m), xsum6);
+        xsum7 = _mm256_fmadd_ps(x0, _mm256_maskload_ps(y+i+7, m), xsum7);
+    }
+    /* 8 horizontal adds. */
+    /* Compute [0 4] [1 5] [2 6] [3 7] */
+    xsum0 = _mm256_add_ps(_mm256_permute2f128_ps(xsum0, xsum4, 2<<4), _mm256_permute2f128_ps(xsum0, xsum4, 1 | (3<<4)));
+    xsum1 = _mm256_add_ps(_mm256_permute2f128_ps(xsum1, xsum5, 2<<4), _mm256_permute2f128_ps(xsum1, xsum5, 1 | (3<<4)));
+    xsum2 = _mm256_add_ps(_mm256_permute2f128_ps(xsum2, xsum6, 2<<4), _mm256_permute2f128_ps(xsum2, xsum6, 1 | (3<<4)));
+    xsum3 = _mm256_add_ps(_mm256_permute2f128_ps(xsum3, xsum7, 2<<4), _mm256_permute2f128_ps(xsum3, xsum7, 1 | (3<<4)));
+    /* Compute [0 1 4 5] [2 3 6 7] */
+    xsum0 = _mm256_hadd_ps(xsum0, xsum1);
+    xsum1 = _mm256_hadd_ps(xsum2, xsum3);
+    /* Compute [0 1 2 3 4 5 6 7] */
+    xsum0 = _mm256_hadd_ps(xsum0, xsum1);
+    _mm256_storeu_ps(sum, xsum0);
+}
+
+void celt_pitch_xcorr_avx2(const float *_x, const float *_y, float *xcorr, int len, int max_pitch, int arch)
+{
+   int i;
+   celt_assert(max_pitch>0);
+   (void)arch;
+   for (i=0;i<max_pitch-7;i+=8)
+   {
+      xcorr_kernel_avx(_x, _y+i, &xcorr[i], len);
+   }
+   for (;i<max_pitch;i++)
+   {
+      xcorr[i] = celt_inner_prod(_x, _y+i, len, arch);
+   }
+}
+
+#endif
diff --git a/opus/celt/x86/pitch_sse.h b/opus/celt/x86/pitch_sse.h
index e5f87ab5..127581f3 100644
--- a/opus/celt/x86/pitch_sse.h
+++ b/opus/celt/x86/pitch_sse.h
@@ -63,7 +63,7 @@ void xcorr_kernel_sse(
 #define xcorr_kernel(x, y, sum, len, arch) \
     ((void)arch, xcorr_kernel_sse(x, y, sum, len))
 
-#elif (defined(OPUS_X86_MAY_HAVE_SSE4_1) && defined(FIXED_POINT)) || (defined(OPUS_X86_MAY_HAVE_SSE) && !defined(FIXED_POINT))
+#elif defined(OPUS_HAVE_RTCD) &&  ((defined(OPUS_X86_MAY_HAVE_SSE4_1) && defined(FIXED_POINT)) || (defined(OPUS_X86_MAY_HAVE_SSE) && !defined(FIXED_POINT)))
 
 extern void (*const XCORR_KERNEL_IMPL[OPUS_ARCHMASK + 1])(
                     const opus_val16 *x,
@@ -91,7 +91,7 @@ opus_val32 celt_inner_prod_sse2(
     int               N);
 #endif
 
-#if defined(OPUS_X86_MAY_HAVE_SSE2) && !defined(FIXED_POINT)
+#if defined(OPUS_X86_MAY_HAVE_SSE) && !defined(FIXED_POINT)
 opus_val32 celt_inner_prod_sse(
     const opus_val16 *x,
     const opus_val16 *y,
@@ -115,8 +115,8 @@ opus_val32 celt_inner_prod_sse(
     ((void)arch, celt_inner_prod_sse(x, y, N))
 
 
-#elif ((defined(OPUS_X86_MAY_HAVE_SSE4_1) || defined(OPUS_X86_MAY_HAVE_SSE2)) && defined(FIXED_POINT)) || \
-    (defined(OPUS_X86_MAY_HAVE_SSE) && !defined(FIXED_POINT))
+#elif defined(OPUS_HAVE_RTCD) && (((defined(OPUS_X86_MAY_HAVE_SSE4_1) || defined(OPUS_X86_MAY_HAVE_SSE2)) && defined(FIXED_POINT)) || \
+    (defined(OPUS_X86_MAY_HAVE_SSE) && !defined(FIXED_POINT)))
 
 extern opus_val32 (*const CELT_INNER_PROD_IMPL[OPUS_ARCHMASK + 1])(
                     const opus_val16 *x,
@@ -131,12 +131,6 @@ extern opus_val32 (*const CELT_INNER_PROD_IMPL[OPUS_ARCHMASK + 1])(
 
 #if defined(OPUS_X86_MAY_HAVE_SSE) && !defined(FIXED_POINT)
 
-#define OVERRIDE_DUAL_INNER_PROD
-#define OVERRIDE_COMB_FILTER_CONST
-
-#undef dual_inner_prod
-#undef comb_filter_const
-
 void dual_inner_prod_sse(const opus_val16 *x,
     const opus_val16 *y01,
     const opus_val16 *y02,
@@ -154,13 +148,17 @@ void comb_filter_const_sse(opus_val32 *y,
 
 
 #if defined(OPUS_X86_PRESUME_SSE)
+#define OVERRIDE_DUAL_INNER_PROD
+#define OVERRIDE_COMB_FILTER_CONST
 # define dual_inner_prod(x, y01, y02, N, xy1, xy2, arch) \
     ((void)(arch),dual_inner_prod_sse(x, y01, y02, N, xy1, xy2))
 
 # define comb_filter_const(y, x, T, N, g10, g11, g12, arch) \
     ((void)(arch),comb_filter_const_sse(y, x, T, N, g10, g11, g12))
-#else
+#elif defined(OPUS_HAVE_RTCD)
 
+#define OVERRIDE_DUAL_INNER_PROD
+#define OVERRIDE_COMB_FILTER_CONST
 extern void (*const DUAL_INNER_PROD_IMPL[OPUS_ARCHMASK + 1])(
               const opus_val16 *x,
               const opus_val16 *y01,
@@ -187,6 +185,32 @@ extern void (*const COMB_FILTER_CONST_IMPL[OPUS_ARCHMASK + 1])(
 #define NON_STATIC_COMB_FILTER_CONST_C
 
 #endif
-#endif
+
+void celt_pitch_xcorr_avx2(const float *_x, const float *_y, float *xcorr, int len, int max_pitch, int arch);
+
+#if defined(OPUS_X86_PRESUME_AVX2)
+
+#define OVERRIDE_PITCH_XCORR
+# define celt_pitch_xcorr celt_pitch_xcorr_avx2
+
+#elif defined(OPUS_HAVE_RTCD) && defined(OPUS_X86_MAY_HAVE_AVX2)
+
+#define OVERRIDE_PITCH_XCORR
+extern void (*const PITCH_XCORR_IMPL[OPUS_ARCHMASK + 1])(
+              const float *_x,
+              const float *_y,
+              float *xcorr,
+              int len,
+              int max_pitch,
+              int arch
+              );
+
+#define celt_pitch_xcorr(_x, _y, xcorr, len, max_pitch, arch) \
+    ((*PITCH_XCORR_IMPL[(arch) & OPUS_ARCHMASK])(_x, _y, xcorr, len, max_pitch, arch))
+
+
+#endif /* OPUS_X86_PRESUME_AVX2 || (OPUS_HAVE_RTCD && OPUS_X86_MAY_HAVE_AVX2) */
+
+#endif /* OPUS_X86_MAY_HAVE_SSE && !FIXED_POINT */
 
 #endif
diff --git a/opus/celt/x86/pitch_sse4_1.c b/opus/celt/x86/pitch_sse4_1.c
index a092c68b..2bc57830 100644
--- a/opus/celt/x86/pitch_sse4_1.c
+++ b/opus/celt/x86/pitch_sse4_1.c
@@ -117,6 +117,14 @@ void xcorr_kernel_sse4_1(const opus_val16 * x, const opus_val16 * y, opus_val32
     __m128i sum0, sum1, sum2, sum3, vecSum;
     __m128i initSum;
 
+#ifdef OPUS_CHECK_ASM
+    opus_val32 sum_c[4];
+    for (j=0;j<4;j++) {
+      sum_c[j] = sum[j];
+    }
+    xcorr_kernel_c(x, y, sum_c, len);
+#endif
+
     celt_assert(len >= 3);
 
     sum0 = _mm_setzero_si128();
@@ -177,19 +185,56 @@ void xcorr_kernel_sse4_1(const opus_val16 * x, const opus_val16 * y, opus_val32
         vecSum = _mm_add_epi32(vecSum, sum2);
     }
 
-    for (;j<len;j++)
+    vecX = OP_CVTEPI16_EPI32_M64(&x[len - 4]);
+    if (len - j == 3)
     {
-        vecX = OP_CVTEPI16_EPI32_M64(&x[j + 0]);
-        vecX0 = _mm_shuffle_epi32(vecX, 0x00);
+        vecX0 = _mm_shuffle_epi32(vecX, 0x55);
+        vecX1 = _mm_shuffle_epi32(vecX, 0xaa);
+        vecX2 = _mm_shuffle_epi32(vecX, 0xff);
 
         vecY0 = OP_CVTEPI16_EPI32_M64(&y[j + 0]);
+        vecY1 = OP_CVTEPI16_EPI32_M64(&y[j + 1]);
+        vecY2 = OP_CVTEPI16_EPI32_M64(&y[j + 2]);
 
         sum0 = _mm_mullo_epi32(vecX0, vecY0);
+        sum1 = _mm_mullo_epi32(vecX1, vecY1);
+        sum2 = _mm_mullo_epi32(vecX2, vecY2);
+
+        vecSum = _mm_add_epi32(vecSum, sum0);
+        vecSum = _mm_add_epi32(vecSum, sum1);
+        vecSum = _mm_add_epi32(vecSum, sum2);
+    }
+    else if (len - j == 2)
+    {
+        vecX0 = _mm_shuffle_epi32(vecX, 0xaa);
+        vecX1 = _mm_shuffle_epi32(vecX, 0xff);
+
+        vecY0 = OP_CVTEPI16_EPI32_M64(&y[j + 0]);
+        vecY1 = OP_CVTEPI16_EPI32_M64(&y[j + 1]);
+
+        sum0 = _mm_mullo_epi32(vecX0, vecY0);
+        sum1 = _mm_mullo_epi32(vecX1, vecY1);
+
+        vecSum = _mm_add_epi32(vecSum, sum0);
+        vecSum = _mm_add_epi32(vecSum, sum1);
+    }
+    else if (len - j == 1)
+    {
+        vecX0 = _mm_shuffle_epi32(vecX, 0xff);
+
+        vecY0 = OP_CVTEPI16_EPI32_M64(&y[j + 0]);
+
+        sum0 = _mm_mullo_epi32(vecX0, vecY0);
+
         vecSum = _mm_add_epi32(vecSum, sum0);
     }
 
     initSum = _mm_loadu_si128((__m128i *)(&sum[0]));
     initSum = _mm_add_epi32(initSum, vecSum);
     _mm_storeu_si128((__m128i *)sum, initSum);
+
+#ifdef OPUS_CHECK_ASM
+    celt_assert(!memcmp(sum_c, sum, sizeof(sum_c)));
+#endif
 }
 #endif
diff --git a/opus/celt/x86/vq_sse.h b/opus/celt/x86/vq_sse.h
index b4efe8f2..444503b6 100644
--- a/opus/celt/x86/vq_sse.h
+++ b/opus/celt/x86/vq_sse.h
@@ -28,16 +28,18 @@
 #define VQ_SSE_H
 
 #if defined(OPUS_X86_MAY_HAVE_SSE2) && !defined(FIXED_POINT)
-#define OVERRIDE_OP_PVQ_SEARCH
 
 opus_val16 op_pvq_search_sse2(celt_norm *_X, int *iy, int K, int N, int arch);
 
 #if defined(OPUS_X86_PRESUME_SSE2)
+
+#define OVERRIDE_OP_PVQ_SEARCH
 #define op_pvq_search(x, iy, K, N, arch) \
     (op_pvq_search_sse2(x, iy, K, N, arch))
 
-#else
+#elif defined(OPUS_HAVE_RTCD)
 
+#define OVERRIDE_OP_PVQ_SEARCH
 extern opus_val16 (*const OP_PVQ_SEARCH_IMPL[OPUS_ARCHMASK + 1])(
       celt_norm *_X, int *iy, int K, int N, int arch);
 
diff --git a/opus/celt/x86/vq_sse2.c b/opus/celt/x86/vq_sse2.c
index 77504286..4c4ebf8e 100644
--- a/opus/celt/x86/vq_sse2.c
+++ b/opus/celt/x86/vq_sse2.c
@@ -75,7 +75,7 @@ opus_val16 op_pvq_search_sse2(celt_norm *_X, int *iy, int K, int N, int arch)
       sums = _mm_add_ps(sums, x4);
       /* Clear y and iy in case we don't do the projection. */
       _mm_storeu_ps(&y[j], _mm_setzero_ps());
-      _mm_storeu_si128((__m128i*)&iy[j], _mm_setzero_si128());
+      _mm_storeu_si128((__m128i*)(void*)&iy[j], _mm_setzero_si128());
       _mm_storeu_ps(&X[j], x4);
       _mm_storeu_ps(&signy[j], s4);
    }
@@ -116,7 +116,7 @@ opus_val16 op_pvq_search_sse2(celt_norm *_X, int *iy, int K, int N, int arch)
          rx4 = _mm_mul_ps(x4, rcp4);
          iy4 = _mm_cvttps_epi32(rx4);
          pulses_sum = _mm_add_epi32(pulses_sum, iy4);
-         _mm_storeu_si128((__m128i*)&iy[j], iy4);
+         _mm_storeu_si128((__m128i*)(void*)&iy[j], iy4);
          y4 = _mm_cvtepi32_ps(iy4);
          xy4 = _mm_add_ps(xy4, _mm_mul_ps(x4, y4));
          yy4 = _mm_add_ps(yy4, _mm_mul_ps(y4, y4));
@@ -205,10 +205,10 @@ opus_val16 op_pvq_search_sse2(celt_norm *_X, int *iy, int K, int N, int arch)
    {
       __m128i y4;
       __m128i s4;
-      y4 = _mm_loadu_si128((__m128i*)&iy[j]);
+      y4 = _mm_loadu_si128((__m128i*)(void*)&iy[j]);
       s4 = _mm_castps_si128(_mm_loadu_ps(&signy[j]));
       y4 = _mm_xor_si128(_mm_add_epi32(y4, s4), s4);
-      _mm_storeu_si128((__m128i*)&iy[j], y4);
+      _mm_storeu_si128((__m128i*)(void*)&iy[j], y4);
    }
    RESTORE_STACK;
    return yy;
diff --git a/opus/celt/x86/x86_arch_macros.h b/opus/celt/x86/x86_arch_macros.h
new file mode 100644
index 00000000..975b443e
--- /dev/null
+++ b/opus/celt/x86/x86_arch_macros.h
@@ -0,0 +1,47 @@
+/* Copyright (c) 2023 Amazon */
+/*
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions
+   are met:
+
+   - Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+
+   - Redistributions in binary form must reproduce the above copyright
+   notice, this list of conditions and the following disclaimer in the
+   documentation and/or other materials provided with the distribution.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR
+   CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#ifdef _MSC_VER
+
+# ifdef OPUS_X86_MAY_HAVE_SSE
+#  ifndef __SSE__
+#   define __SSE__
+#  endif
+# endif
+
+# ifdef OPUS_X86_MAY_HAVE_SSE2
+#  ifndef __SSE2__
+#   define __SSE2__
+#  endif
+# endif
+
+# ifdef OPUS_X86_MAY_HAVE_SSE4_1
+#  ifndef __SSE4_1__
+#   define __SSE4_1__
+#  endif
+# endif
+
+#endif
diff --git a/opus/celt/x86/x86_celt_map.c b/opus/celt/x86/x86_celt_map.c
index d39d88ed..ba8eafe6 100644
--- a/opus/celt/x86/x86_celt_map.c
+++ b/opus/celt/x86/x86_celt_map.c
@@ -90,6 +90,26 @@ opus_val32 (*const CELT_INNER_PROD_IMPL[OPUS_ARCHMASK + 1])(
 
 # else
 
+#if defined(OPUS_X86_MAY_HAVE_AVX2) && !defined(OPUS_X86_PRESUME_AVX2)
+
+void (*const PITCH_XCORR_IMPL[OPUS_ARCHMASK + 1])(
+         const float *_x,
+         const float *_y,
+         float *xcorr,
+         int len,
+         int max_pitch,
+         int arch
+) = {
+  celt_pitch_xcorr_c,                /* non-sse */
+  celt_pitch_xcorr_c,
+  celt_pitch_xcorr_c,
+  celt_pitch_xcorr_c,
+  MAY_HAVE_AVX2(celt_pitch_xcorr)
+};
+
+#endif
+
+
 #if defined(OPUS_X86_MAY_HAVE_SSE) && !defined(OPUS_X86_PRESUME_SSE)
 
 void (*const XCORR_KERNEL_IMPL[OPUS_ARCHMASK + 1])(
diff --git a/opus/celt/x86/x86cpu.c b/opus/celt/x86/x86cpu.c
index 080eb25e..2e7c32ae 100644
--- a/opus/celt/x86/x86cpu.c
+++ b/opus/celt/x86/x86cpu.c
@@ -35,11 +35,11 @@
 #include "pitch.h"
 #include "x86cpu.h"
 
-#if (defined(OPUS_X86_MAY_HAVE_SSE) && !defined(OPUS_X86_PRESUME_SSE)) || \
+#if defined(OPUS_HAVE_RTCD) && \
+  ((defined(OPUS_X86_MAY_HAVE_SSE) && !defined(OPUS_X86_PRESUME_SSE)) || \
   (defined(OPUS_X86_MAY_HAVE_SSE2) && !defined(OPUS_X86_PRESUME_SSE2)) || \
   (defined(OPUS_X86_MAY_HAVE_SSE4_1) && !defined(OPUS_X86_PRESUME_SSE4_1)) || \
-  (defined(OPUS_X86_MAY_HAVE_AVX) && !defined(OPUS_X86_PRESUME_AVX))
-
+  (defined(OPUS_X86_MAY_HAVE_AVX2) && !defined(OPUS_X86_PRESUME_AVX2)))
 
 #if defined(_MSC_VER)
 
@@ -68,7 +68,8 @@ static void cpuid(unsigned int CPUInfo[4], unsigned int InfoType)
         "=r" (CPUInfo[1]),
         "=c" (CPUInfo[2]),
         "=d" (CPUInfo[3]) :
-        "0" (InfoType)
+        /* We clear ECX to avoid a valgrind false-positive prior to v3.17.0. */
+        "0" (InfoType), "2" (0)
     );
 #else
     __asm__ __volatile__ (
@@ -77,11 +78,22 @@ static void cpuid(unsigned int CPUInfo[4], unsigned int InfoType)
         "=b" (CPUInfo[1]),
         "=c" (CPUInfo[2]),
         "=d" (CPUInfo[3]) :
-        "0" (InfoType)
+        /* We clear ECX to avoid a valgrind false-positive prior to v3.17.0. */
+        "0" (InfoType), "2" (0)
     );
 #endif
 #elif defined(CPU_INFO_BY_C)
-    __get_cpuid(InfoType, &(CPUInfo[0]), &(CPUInfo[1]), &(CPUInfo[2]), &(CPUInfo[3]));
+    /* We use __get_cpuid_count to clear ECX to avoid a valgrind false-positive
+        prior to v3.17.0.*/
+    if (!__get_cpuid_count(InfoType, 0, &(CPUInfo[0]), &(CPUInfo[1]), &(CPUInfo[2]), &(CPUInfo[3]))) {
+        /* Our function cannot fail, but __get_cpuid{_count} can.
+           Returning all zeroes will effectively disable all SIMD, which is
+            what we want on CPUs that don't support CPUID. */
+        CPUInfo[3] = CPUInfo[2] = CPUInfo[1] = CPUInfo[0] = 0;
+    }
+#else
+# error "Configured to use x86 RTCD, but no CPU detection method available. " \
+ "Reconfigure with --disable-rtcd (or send patches)."
 #endif
 }
 
@@ -93,12 +105,12 @@ typedef struct CPU_Feature{
     int HW_SSE2;
     int HW_SSE41;
     /*  SIMD: 256-bit */
-    int HW_AVX;
+    int HW_AVX2;
 } CPU_Feature;
 
 static void opus_cpu_feature_check(CPU_Feature *cpu_feature)
 {
-    unsigned int info[4] = {0};
+    unsigned int info[4];
     unsigned int nIds = 0;
 
     cpuid(info, 0);
@@ -109,17 +121,23 @@ static void opus_cpu_feature_check(CPU_Feature *cpu_feature)
         cpu_feature->HW_SSE = (info[3] & (1 << 25)) != 0;
         cpu_feature->HW_SSE2 = (info[3] & (1 << 26)) != 0;
         cpu_feature->HW_SSE41 = (info[2] & (1 << 19)) != 0;
-        cpu_feature->HW_AVX = (info[2] & (1 << 28)) != 0;
+        cpu_feature->HW_AVX2 = (info[2] & (1 << 28)) != 0 && (info[2] & (1 << 12)) != 0;
+        if (cpu_feature->HW_AVX2 && nIds >= 7) {
+            cpuid(info, 7);
+            cpu_feature->HW_AVX2 = cpu_feature->HW_AVX2 && (info[1] & (1 << 5)) != 0;
+        } else {
+            cpu_feature->HW_AVX2 = 0;
+        }
     }
     else {
         cpu_feature->HW_SSE = 0;
         cpu_feature->HW_SSE2 = 0;
         cpu_feature->HW_SSE41 = 0;
-        cpu_feature->HW_AVX = 0;
+        cpu_feature->HW_AVX2 = 0;
     }
 }
 
-int opus_select_arch(void)
+static int opus_select_arch_impl(void)
 {
     CPU_Feature cpu_feature;
     int arch;
@@ -145,7 +163,7 @@ int opus_select_arch(void)
     }
     arch++;
 
-    if (!cpu_feature.HW_AVX)
+    if (!cpu_feature.HW_AVX2)
     {
         return arch;
     }
@@ -154,4 +172,13 @@ int opus_select_arch(void)
     return arch;
 }
 
+int opus_select_arch(void) {
+    int arch = opus_select_arch_impl();
+#ifdef FUZZING
+    /* Randomly downgrade the architecture. */
+    arch = rand()%(arch+1);
+#endif
+    return arch;
+}
+
 #endif
diff --git a/opus/celt/x86/x86cpu.h b/opus/celt/x86/x86cpu.h
index 1e2bf17b..8ae9be8d 100644
--- a/opus/celt/x86/x86cpu.h
+++ b/opus/celt/x86/x86cpu.h
@@ -46,50 +46,53 @@
 #  define MAY_HAVE_SSE4_1(name) name ## _c
 # endif
 
-# if defined(OPUS_X86_MAY_HAVE_AVX)
-#  define MAY_HAVE_AVX(name) name ## _avx
+# if defined(OPUS_X86_MAY_HAVE_AVX2)
+#  define MAY_HAVE_AVX2(name) name ## _avx2
 # else
-#  define MAY_HAVE_AVX(name) name ## _c
+#  define MAY_HAVE_AVX2(name) name ## _c
 # endif
 
-# if defined(OPUS_HAVE_RTCD)
+# if defined(OPUS_HAVE_RTCD) && \
+  ((defined(OPUS_X86_MAY_HAVE_SSE) && !defined(OPUS_X86_PRESUME_SSE)) || \
+  (defined(OPUS_X86_MAY_HAVE_SSE2) && !defined(OPUS_X86_PRESUME_SSE2)) || \
+  (defined(OPUS_X86_MAY_HAVE_SSE4_1) && !defined(OPUS_X86_PRESUME_SSE4_1)) || \
+  (defined(OPUS_X86_MAY_HAVE_AVX2) && !defined(OPUS_X86_PRESUME_AVX2)))
 int opus_select_arch(void);
 # endif
 
-/*gcc appears to emit MOVDQA's to load the argument of an _mm_cvtepi8_epi32()
-  or _mm_cvtepi16_epi32() when optimizations are disabled, even though the
-  actual PMOVSXWD instruction takes an m32 or m64. Unlike a normal memory
-  reference, these require 16-byte alignment and load a full 16 bytes (instead
-  of 4 or 8), possibly reading out of bounds.
-
-  We can insert an explicit MOVD or MOVQ using _mm_cvtsi32_si128() or
-  _mm_loadl_epi64(), which should have the same semantics as an m32 or m64
-  reference in the PMOVSXWD instruction itself, but gcc is not smart enough to
-  optimize this out when optimizations ARE enabled.
-
-  Clang, in contrast, requires us to do this always for _mm_cvtepi8_epi32
-  (which is fair, since technically the compiler is always allowed to do the
-  dereference before invoking the function implementing the intrinsic).
-  However, it is smart enough to eliminate the extra MOVD instruction.
-  For _mm_cvtepi16_epi32, it does the right thing, though does *not* optimize out
-  the extra MOVQ if it's specified explicitly */
-
-# if defined(__clang__) || !defined(__OPTIMIZE__)
-#  define OP_CVTEPI8_EPI32_M32(x) \
- (_mm_cvtepi8_epi32(_mm_cvtsi32_si128(*(int *)(x))))
-# else
+# if defined(OPUS_X86_MAY_HAVE_SSE2)
+#  include "opus_defines.h"
+
+/*MOVD should not impose any alignment restrictions, but the C standard does,
+   and UBSan will report errors if we actually make unaligned accesses.
+  Use this to work around those restrictions (which should hopefully all get
+   optimized to a single MOVD instruction).
+  GCC implemented _mm_loadu_si32() since GCC 11; HOWEVER, there is a bug!
+  https://gcc.gnu.org/bugzilla/show_bug.cgi?id=99754 */
+#  if !defined(_MSC_VER) && !OPUS_GNUC_PREREQ(11,3) && !(defined(__clang__) && (__clang_major__ >= 8))
+#   include <string.h>
+#   include <emmintrin.h>
+
+#   ifdef _mm_loadu_si32
+#    undef _mm_loadu_si32
+#   endif
+#   define _mm_loadu_si32 WORKAROUND_mm_loadu_si32
+static inline __m128i WORKAROUND_mm_loadu_si32(void const* mem_addr) {
+  int val;
+  memcpy(&val, mem_addr, sizeof(val));
+  return _mm_cvtsi32_si128(val);
+}
+#  elif defined(_MSC_VER)
+    /* MSVC needs this for _mm_loadu_si32 */
+#   include <immintrin.h>
+#  endif
+
 #  define OP_CVTEPI8_EPI32_M32(x) \
- (_mm_cvtepi8_epi32(*(__m128i *)(x)))
-#endif
+ (_mm_cvtepi8_epi32(_mm_loadu_si32(x)))
 
-/* similar reasoning about the instruction sequence as in the 32-bit macro above,
- */
-# if defined(__clang__) || !defined(__OPTIMIZE__)
-#  define OP_CVTEPI16_EPI32_M64(x) \
- (_mm_cvtepi16_epi32(_mm_loadl_epi64((__m128i *)(x))))
-# else
 #  define OP_CVTEPI16_EPI32_M64(x) \
- (_mm_cvtepi16_epi32(*(__m128i *)(x)))
+ (_mm_cvtepi16_epi32(_mm_loadl_epi64((__m128i *)(void*)(x))))
+
 # endif
 
 #endif
diff --git a/opus/include/opus/opus.h b/opus/include/opus/opus.h
index d282f21d..eadeda75 100644
--- a/opus/include/opus/opus.h
+++ b/opus/include/opus/opus.h
@@ -103,7 +103,7 @@ extern "C" {
   * @endcode
   *
   * where opus_encoder_get_size() returns the required size for the encoder state. Note that
-  * future versions of this code may change the size, so no assuptions should be made about it.
+  * future versions of this code may change the size, so no assumptions should be made about it.
   *
   * The encoder state is always continuous in memory and only a shallow copy is sufficient
   * to copy it (e.g. memcpy())
@@ -198,7 +198,7 @@ OPUS_EXPORT OPUS_WARN_UNUSED_RESULT int opus_encoder_get_size(int channels);
  *                                     This must be one of 8000, 12000, 16000,
  *                                     24000, or 48000.
  * @param [in] channels <tt>int</tt>: Number of channels (1 or 2) in input signal
- * @param [in] application <tt>int</tt>: Coding mode (@ref OPUS_APPLICATION_VOIP/@ref OPUS_APPLICATION_AUDIO/@ref OPUS_APPLICATION_RESTRICTED_LOWDELAY)
+ * @param [in] application <tt>int</tt>: Coding mode (one of @ref OPUS_APPLICATION_VOIP, @ref OPUS_APPLICATION_AUDIO, or @ref OPUS_APPLICATION_RESTRICTED_LOWDELAY)
  * @param [out] error <tt>int*</tt>: @ref opus_errorcodes
  * @note Regardless of the sampling rate and number channels selected, the Opus encoder
  * can switch to a lower audio bandwidth or number of channels if the bitrate
@@ -222,7 +222,7 @@ OPUS_EXPORT OPUS_WARN_UNUSED_RESULT OpusEncoder *opus_encoder_create(
  *                                      This must be one of 8000, 12000, 16000,
  *                                      24000, or 48000.
   * @param [in] channels <tt>int</tt>: Number of channels (1 or 2) in input signal
-  * @param [in] application <tt>int</tt>: Coding mode (OPUS_APPLICATION_VOIP/OPUS_APPLICATION_AUDIO/OPUS_APPLICATION_RESTRICTED_LOWDELAY)
+  * @param [in] application <tt>int</tt>: Coding mode (one of OPUS_APPLICATION_VOIP, OPUS_APPLICATION_AUDIO, or OPUS_APPLICATION_RESTRICTED_LOWDELAY)
   * @retval #OPUS_OK Success or @ref opus_errorcodes
   */
 OPUS_EXPORT int opus_encoder_init(
@@ -357,7 +357,7 @@ OPUS_EXPORT int opus_encoder_ctl(OpusEncoder *st, int request, ...) OPUS_ARG_NON
   * error = opus_decoder_init(dec, Fs, channels);
   * @endcode
   * where opus_decoder_get_size() returns the required size for the decoder state. Note that
-  * future versions of this code may change the size, so no assuptions should be made about it.
+  * future versions of this code may change the size, so no assumptions should be made about it.
   *
   * The decoder state is always continuous in memory and only a shallow copy is sufficient
   * to copy it (e.g. memcpy())
@@ -398,6 +398,21 @@ OPUS_EXPORT int opus_encoder_ctl(OpusEncoder *st, int request, ...) OPUS_ARG_NON
   */
 typedef struct OpusDecoder OpusDecoder;
 
+/** Opus DRED decoder.
+  * This contains the complete state of an Opus DRED decoder.
+  * It is position independent and can be freely copied.
+  * @see opus_dred_decoder_create,opus_dred_decoder_init
+  */
+typedef struct OpusDREDDecoder OpusDREDDecoder;
+
+
+/** Opus DRED state.
+  * This contains the complete state of an Opus DRED packet.
+  * It is position independent and can be freely copied.
+  * @see opus_dred_alloc,opus_dred_free
+  */
+typedef struct OpusDRED OpusDRED;
+
 /** Gets the size of an <code>OpusDecoder</code> structure.
   * @param [in] channels <tt>int</tt>: Number of channels.
   *                                    This must be 1 or 2.
@@ -511,6 +526,101 @@ OPUS_EXPORT int opus_decoder_ctl(OpusDecoder *st, int request, ...) OPUS_ARG_NON
   */
 OPUS_EXPORT void opus_decoder_destroy(OpusDecoder *st);
 
+/** Gets the size of an <code>OpusDREDDecoder</code> structure.
+  * @returns The size in bytes.
+  */
+OPUS_EXPORT int opus_dred_decoder_get_size(void);
+
+/** Allocates and initializes an OpusDREDDecoder state.
+  * @param [out] error <tt>int*</tt>: #OPUS_OK Success or @ref opus_errorcodes
+  */
+OPUS_EXPORT OpusDREDDecoder *opus_dred_decoder_create(int *error);
+
+/** Initializes an <code>OpusDREDDecoder</code> state.
+  * @param[in] dec <tt>OpusDREDDecoder*</tt>: State to be initialized.
+  */
+OPUS_EXPORT int opus_dred_decoder_init(OpusDREDDecoder *dec);
+
+/** Frees an <code>OpusDREDDecoder</code> allocated by opus_dred_decoder_create().
+  * @param[in] dec <tt>OpusDREDDecoder*</tt>: State to be freed.
+  */
+OPUS_EXPORT void opus_dred_decoder_destroy(OpusDREDDecoder *dec);
+
+/** Perform a CTL function on an Opus DRED decoder.
+  *
+  * Generally the request and subsequent arguments are generated
+  * by a convenience macro.
+  * @param dred_dec <tt>OpusDREDDecoder*</tt>: DRED Decoder state.
+  * @param request This and all remaining parameters should be replaced by one
+  *                of the convenience macros in @ref opus_genericctls or
+  *                @ref opus_decoderctls.
+  * @see opus_genericctls
+  * @see opus_decoderctls
+  */
+OPUS_EXPORT int opus_dred_decoder_ctl(OpusDREDDecoder *dred_dec, int request, ...);
+
+/** Gets the size of an <code>OpusDRED</code> structure.
+  * @returns The size in bytes.
+  */
+OPUS_EXPORT int opus_dred_get_size(void);
+
+/** Allocates and initializes a DRED state.
+  * @param [out] error <tt>int*</tt>: #OPUS_OK Success or @ref opus_errorcodes
+  */
+OPUS_EXPORT OpusDRED *opus_dred_alloc(int *error);
+
+/** Frees an <code>OpusDRED</code> allocated by opus_dred_alloc().
+  * @param[in] dec <tt>OpusDRED*</tt>: State to be freed.
+  */
+OPUS_EXPORT void opus_dred_free(OpusDRED *dec);
+
+/** Decode an Opus DRED packet.
+  * @param [in] dred_dec <tt>OpusDREDDecoder*</tt>: DRED Decoder state
+  * @param [in] dred <tt>OpusDRED*</tt>: DRED state
+  * @param [in] data <tt>char*</tt>: Input payload
+  * @param [in] len <tt>opus_int32</tt>: Number of bytes in payload
+  * @param [in] max_dred_samples <tt>opus_int32</tt>: Maximum number of DRED samples that may be needed (if available in the packet).
+  * @param [in] sampling_rate <tt>opus_int32</tt>: Sampling rate used for max_dred_samples argument. Need not match the actual sampling rate of the decoder.
+  * @param [out] dred_end <tt>opus_int32*</tt>: Number of non-encoded (silence) samples between the DRED timestamp and the last DRED sample.
+  * @param [in] defer_processing <tt>int</tt>: Flag (0 or 1). If set to one, the CPU-intensive part of the DRED decoding is deferred until opus_dred_process() is called.
+  * @returns Offset (positive) of the first decoded DRED samples, zero if no DRED is present, or @ref opus_errorcodes
+  */
+OPUS_EXPORT int opus_dred_parse(OpusDREDDecoder *dred_dec, OpusDRED *dred, const unsigned char *data, opus_int32 len, opus_int32 max_dred_samples, opus_int32 sampling_rate, int *dred_end, int defer_processing) OPUS_ARG_NONNULL(1);
+
+/** Finish decoding an Opus DRED packet. The function only needs to be called if opus_dred_parse() was called with defer_processing=1.
+  * The source and destination will often be the same DRED state.
+  * @param [in] dred_dec <tt>OpusDREDDecoder*</tt>: DRED Decoder state
+  * @param [in] src <tt>OpusDRED*</tt>: Source DRED state to start the processing from.
+  * @param [out] dst <tt>OpusDRED*</tt>: Destination DRED state to store the updated state after processing.
+  * @returns @ref opus_errorcodes
+  */
+OPUS_EXPORT int opus_dred_process(OpusDREDDecoder *dred_dec, const OpusDRED *src, OpusDRED *dst);
+
+/** Decode audio from an Opus DRED packet with 16-bit integer output.
+  * @param [in] st <tt>OpusDecoder*</tt>: Decoder state
+  * @param [in] dred <tt>OpusDRED*</tt>: DRED state
+  * @param [in] dred_offset <tt>opus_int32</tt>: position of the redundancy to decode (in samples before the beginning of the real audio data in the packet).
+  * @param [out] pcm <tt>opus_int16*</tt>: Output signal (interleaved if 2 channels). length
+  *  is frame_size*channels*sizeof(opus_int16)
+  * @param [in] frame_size Number of samples per channel to decode in \a pcm.
+  *  frame_size <b>must</b> be a multiple of 2.5 ms.
+  * @returns Number of decoded samples or @ref opus_errorcodes
+  */
+OPUS_EXPORT int opus_decoder_dred_decode(OpusDecoder *st, const OpusDRED *dred, opus_int32 dred_offset, opus_int16 *pcm, opus_int32 frame_size);
+
+/** Decode audio from an Opus DRED packet with floating point output.
+  * @param [in] st <tt>OpusDecoder*</tt>: Decoder state
+  * @param [in] dred <tt>OpusDRED*</tt>: DRED state
+  * @param [in] dred_offset <tt>opus_int32</tt>: position of the redundancy to decode (in samples before the beginning of the real audio data in the packet).
+  * @param [out] pcm <tt>float*</tt>: Output signal (interleaved if 2 channels). length
+  *  is frame_size*channels*sizeof(float)
+  * @param [in] frame_size Number of samples per channel to decode in \a pcm.
+  *  frame_size <b>must</b> be a multiple of 2.5 ms.
+  * @returns Number of decoded samples or @ref opus_errorcodes
+  */
+OPUS_EXPORT int opus_decoder_dred_decode_float(OpusDecoder *st, const OpusDRED *dred, opus_int32 dred_offset, float *pcm, opus_int32 frame_size);
+
+
 /** Parse an opus packet into one or more frames.
   * Opus_decode will perform this operation internally so most applications do
   * not need to use this function.
@@ -583,6 +693,14 @@ OPUS_EXPORT OPUS_WARN_UNUSED_RESULT int opus_packet_get_nb_frames(const unsigned
   */
 OPUS_EXPORT OPUS_WARN_UNUSED_RESULT int opus_packet_get_nb_samples(const unsigned char packet[], opus_int32 len, opus_int32 Fs) OPUS_ARG_NONNULL(1);
 
+/** Checks whether an Opus packet has LBRR.
+  * @param [in] packet <tt>char*</tt>: Opus packet
+  * @param [in] len <tt>opus_int32</tt>: Length of packet
+  * @returns 1 if LBRR is present, 0 otherwise
+  * @retval OPUS_INVALID_PACKET The compressed data passed is corrupted or of an unsupported type
+  */
+OPUS_EXPORT OPUS_WARN_UNUSED_RESULT int opus_packet_has_lbrr(const unsigned char packet[], opus_int32 len);
+
 /** Gets the number of samples of an Opus packet.
   * @param [in] dec <tt>OpusDecoder*</tt>: Decoder state
   * @param [in] packet <tt>char*</tt>: Opus packet
diff --git a/opus/include/opus/opus_custom.h b/opus/include/opus/opus_custom.h
index 41f36bf2..2f22d4b3 100644
--- a/opus/include/opus/opus_custom.h
+++ b/opus/include/opus/opus_custom.h
@@ -104,7 +104,8 @@ typedef struct OpusCustomDecoder OpusCustomDecoder;
 /** The mode contains all the information necessary to create an
     encoder. Both the encoder and decoder need to be initialized
     with exactly the same mode, otherwise the output will be
-    corrupted.
+    corrupted. The mode MUST NOT BE DESTROYED until the encoders and
+    decoders that use it are destroyed as well.
    @brief Mode configuration
  */
 typedef struct OpusCustomMode OpusCustomMode;
@@ -178,7 +179,7 @@ OPUS_CUSTOM_EXPORT OPUS_WARN_UNUSED_RESULT OpusCustomEncoder *opus_custom_encode
 ) OPUS_ARG_NONNULL(1);
 
 
-/** Destroys a an encoder state.
+/** Destroys an encoder state.
   * @param[in] st <tt>OpusCustomEncoder*</tt>: State to be freed.
   */
 OPUS_CUSTOM_EXPORT void opus_custom_encoder_destroy(OpusCustomEncoder *st);
@@ -286,7 +287,7 @@ OPUS_CUSTOM_EXPORT OPUS_WARN_UNUSED_RESULT OpusCustomDecoder *opus_custom_decode
     int *error
 ) OPUS_ARG_NONNULL(1);
 
-/** Destroys a an decoder state.
+/** Destroys a decoder state.
   * @param[in] st <tt>OpusCustomDecoder*</tt>: State to be freed.
   */
 OPUS_CUSTOM_EXPORT void opus_custom_decoder_destroy(OpusCustomDecoder *st);
diff --git a/opus/include/opus/opus_defines.h b/opus/include/opus/opus_defines.h
index d141418b..cd8f4dde 100644
--- a/opus/include/opus/opus_defines.h
+++ b/opus/include/opus/opus_defines.h
@@ -64,7 +64,7 @@ extern "C" {
 /**Export control for opus functions */
 
 #ifndef OPUS_EXPORT
-# if defined(WIN32)
+# if defined(_WIN32)
 #  if defined(OPUS_BUILD) && defined(DLL_EXPORT)
 #   define OPUS_EXPORT __declspec(dllexport)
 #  else
@@ -169,15 +169,32 @@ extern "C" {
 #define OPUS_SET_PHASE_INVERSION_DISABLED_REQUEST 4046
 #define OPUS_GET_PHASE_INVERSION_DISABLED_REQUEST 4047
 #define OPUS_GET_IN_DTX_REQUEST              4049
+#define OPUS_SET_DRED_DURATION_REQUEST 4050
+#define OPUS_GET_DRED_DURATION_REQUEST 4051
+#define OPUS_SET_DNN_BLOB_REQUEST 4052
+/*#define OPUS_GET_DNN_BLOB_REQUEST 4053 */
 
 /** Defines for the presence of extended APIs. */
 #define OPUS_HAVE_OPUS_PROJECTION_H
 
 /* Macros to trigger compilation errors when the wrong types are provided to a CTL */
 #define __opus_check_int(x) (((void)((x) == (opus_int32)0)), (opus_int32)(x))
+
+#ifdef DISABLE_PTR_CHECK
+/* Disable checks to prevent ubsan from complaining about NULL checks
+   in test_opus_api. */
+#define __opus_check_int_ptr(ptr) (ptr)
+#define __opus_check_uint_ptr(ptr) (ptr)
+#define __opus_check_uint8_ptr(ptr) (ptr)
+#define __opus_check_val16_ptr(ptr) (ptr)
+#define __opus_check_void_ptr(ptr) (ptr)
+#else
 #define __opus_check_int_ptr(ptr) ((ptr) + ((ptr) - (opus_int32*)(ptr)))
 #define __opus_check_uint_ptr(ptr) ((ptr) + ((ptr) - (opus_uint32*)(ptr)))
+#define __opus_check_uint8_ptr(ptr) ((ptr) + ((ptr) - (opus_uint8*)(ptr)))
 #define __opus_check_val16_ptr(ptr) ((ptr) + ((ptr) - (opus_val16*)(ptr)))
+#define __opus_check_void_ptr(x) ((void)((void *)0 == (x)), (x))
+#endif
 /** @endcond */
 
 /** @defgroup opus_ctlvalues Pre-defined values for CTL interface
@@ -482,7 +499,8 @@ extern "C" {
   * @param[in] x <tt>opus_int32</tt>: Allowed values:
   * <dl>
   * <dt>0</dt><dd>Disable inband FEC (default).</dd>
-  * <dt>1</dt><dd>Enable inband FEC.</dd>
+  * <dt>1</dt><dd>Inband FEC enabled. If the packet loss rate is sufficiently high, Opus will automatically switch to SILK even at high rates to enable use of that FEC.</dd>
+  * <dt>2</dt><dd>Inband FEC enabled, but does not necessarily switch to SILK if we have music.</dd>
   * </dl>
   * @hideinitializer */
 #define OPUS_SET_INBAND_FEC(x) OPUS_SET_INBAND_FEC_REQUEST, __opus_check_int(x)
@@ -491,7 +509,8 @@ extern "C" {
   * @param[out] x <tt>opus_int32 *</tt>: Returns one of the following values:
   * <dl>
   * <dt>0</dt><dd>Inband FEC disabled (default).</dd>
-  * <dt>1</dt><dd>Inband FEC enabled.</dd>
+  * <dt>1</dt><dd>Inband FEC enabled. If the packet loss rate is sufficiently high, Opus will automatically switch to SILK even at high rates to enable use of that FEC.</dd>
+  * <dt>2</dt><dd>Inband FEC enabled, but does not necessarily switch to SILK if we have music.</dd>
   * </dl>
   * @hideinitializer */
 #define OPUS_GET_INBAND_FEC(x) OPUS_GET_INBAND_FEC_REQUEST, __opus_check_int_ptr(x)
@@ -618,6 +637,18 @@ extern "C" {
   * @hideinitializer */
 #define OPUS_GET_PREDICTION_DISABLED(x) OPUS_GET_PREDICTION_DISABLED_REQUEST, __opus_check_int_ptr(x)
 
+/** If non-zero, enables Deep Redundancy (DRED) and uses the specified maximum number of 10-ms redundant frames.
+  * @hideinitializer */
+#define OPUS_SET_DRED_DURATION(x) OPUS_SET_DRED_DURATION_REQUEST, __opus_check_int(x)
+/** Gets the encoder's configured Deep Redundancy (DRED) maximum number of frames.
+  * @hideinitializer */
+#define OPUS_GET_DRED_DURATION(x) OPUS_GET_DRED_DURATION_REQUEST, __opus_check_int_ptr(x)
+
+/** Provide external DNN weights from binary object (only when explicitly built without the weights)
+  * @hideinitializer */
+#define OPUS_SET_DNN_BLOB(data, len) OPUS_SET_DNN_BLOB_REQUEST, __opus_check_void_ptr(data), __opus_check_int(len)
+
+
 /**@}*/
 
 /** @defgroup opus_genericctls Generic CTLs
diff --git a/opus/include/opus/opus_multistream.h b/opus/include/opus/opus_multistream.h
index babcee69..824cc55a 100644
--- a/opus/include/opus/opus_multistream.h
+++ b/opus/include/opus/opus_multistream.h
@@ -143,7 +143,7 @@ extern "C" {
   * <a href="https://www.xiph.org/vorbis/doc/Vorbis_I_spec.html#x1-810004.3.9">Vorbis
   * channel ordering</a>. A decoder may wish to apply an additional permutation
   * to the mapping the encoder used to achieve a different output channel
-  * order (e.g. for outputing in WAV order).
+  * order (e.g. for outputting in WAV order).
   *
   * Each multistream packet contains an Opus packet for each stream, and all of
   * the Opus packets in a single multistream packet must have the same
diff --git a/opus/silk/API.h b/opus/silk/API.h
index 4d90ff9a..878965c7 100644
--- a/opus/silk/API.h
+++ b/opus/silk/API.h
@@ -34,6 +34,10 @@ POSSIBILITY OF SUCH DAMAGE.
 #include "entenc.h"
 #include "entdec.h"
 
+#ifdef ENABLE_DEEP_PLC
+#include "lpcnet_private.h"
+#endif
+
 #ifdef __cplusplus
 extern "C"
 {
@@ -88,6 +92,16 @@ opus_int silk_Encode(                                   /* O    Returns error co
 /* Decoder functions                    */
 /****************************************/
 
+
+/***********************************************/
+/* Load OSCE models from external data pointer */
+/***********************************************/
+opus_int silk_LoadOSCEModels(
+    void *decState,                                     /* I/O  State                                           */
+    const unsigned char *data,                          /* I    pointer to binary blob                          */
+    int len                                             /* I    length of binary blob data                      */
+);
+
 /***********************************************/
 /* Get size in bytes of the Silk decoder state */
 /***********************************************/
@@ -96,8 +110,12 @@ opus_int silk_Get_Decoder_Size(                         /* O    Returns error co
 );
 
 /*************************/
-/* Init or Reset decoder */
+/* Init and Reset decoder */
 /*************************/
+opus_int silk_ResetDecoder(                              /* O    Returns error code                              */
+    void                            *decState            /* I/O  State                                           */
+);
+
 opus_int silk_InitDecoder(                              /* O    Returns error code                              */
     void                            *decState           /* I/O  State                                           */
 );
@@ -113,6 +131,9 @@ opus_int silk_Decode(                                   /* O    Returns error co
     ec_dec                          *psRangeDec,        /* I/O  Compressor data structure                       */
     opus_int16                      *samplesOut,        /* O    Decoded output speech vector                    */
     opus_int32                      *nSamplesOut,       /* O    Number of samples decoded                       */
+#ifdef ENABLE_DEEP_PLC
+    LPCNetPLCState                  *lpcnet,
+#endif
     int                             arch                /* I    Run-time architecture                           */
 );
 
diff --git a/opus/silk/CNG.c b/opus/silk/CNG.c
index ef8e38df..2a910099 100644
--- a/opus/silk/CNG.c
+++ b/opus/silk/CNG.c
@@ -118,6 +118,10 @@ void silk_CNG(
         /* Smooth gains */
         for( i = 0; i < psDec->nb_subfr; i++ ) {
             psCNG->CNG_smth_Gain_Q16 += silk_SMULWB( psDecCtrl->Gains_Q16[ i ] - psCNG->CNG_smth_Gain_Q16, CNG_GAIN_SMTH_Q16 );
+            /* If the smoothed gain is 3 dB greater than this subframe's gain, use this subframe's gain to adapt faster. */
+            if( silk_SMULWW( psCNG->CNG_smth_Gain_Q16, CNG_GAIN_SMTH_THRESHOLD_Q16 ) > psDecCtrl->Gains_Q16[ i ] ) {
+                psCNG->CNG_smth_Gain_Q16 = psDecCtrl->Gains_Q16[ i ];
+            }
         }
     }
 
diff --git a/opus/silk/LPC_fit.c b/opus/silk/LPC_fit.c
index cdea4f3a..c0690a1f 100644
--- a/opus/silk/LPC_fit.c
+++ b/opus/silk/LPC_fit.c
@@ -31,7 +31,8 @@ POSSIBILITY OF SUCH DAMAGE.
 
 #include "SigProc_FIX.h"
 
-/* Convert int32 coefficients to int16 coefs and make sure there's no wrap-around */
+/* Convert int32 coefficients to int16 coefs and make sure there's no wrap-around.
+   This logic is reused in _celt_lpc(). Any bug fixes should also be applied there. */
 void silk_LPC_fit(
     opus_int16                  *a_QOUT,            /* O    Output signal                                               */
     opus_int32                    *a_QIN,             /* I/O  Input signal                                                */
diff --git a/opus/silk/MacroCount.h b/opus/silk/MacroCount.h
index 78100ffe..dab2f57a 100644
--- a/opus/silk/MacroCount.h
+++ b/opus/silk/MacroCount.h
@@ -27,9 +27,9 @@ POSSIBILITY OF SUCH DAMAGE.
 
 #ifndef SIGPROCFIX_API_MACROCOUNT_H
 #define SIGPROCFIX_API_MACROCOUNT_H
-#include <stdio.h>
 
 #ifdef    silk_MACRO_COUNT
+#include <stdio.h>
 #define varDefine opus_int64 ops_count = 0;
 
 extern opus_int64 ops_count;
diff --git a/opus/silk/MacroDebug.h b/opus/silk/MacroDebug.h
index 8dd4ce2e..3110da9a 100644
--- a/opus/silk/MacroDebug.h
+++ b/opus/silk/MacroDebug.h
@@ -55,7 +55,7 @@ static OPUS_INLINE opus_int16 silk_ADD16_(opus_int16 a, opus_int16 b, char *file
 static OPUS_INLINE opus_int32 silk_ADD32_(opus_int32 a, opus_int32 b, char *file, int line){
     opus_int32 ret;
 
-    ret = a + b;
+    ret = (opus_int32)((opus_uint32)a + (opus_uint32)b);
     if ( ret != silk_ADD_SAT32( a, b ) )
     {
         fprintf (stderr, "silk_ADD32(%d, %d) in %s: line %d\n", a, b, file, line);
@@ -101,9 +101,9 @@ static OPUS_INLINE opus_int16 silk_SUB16_(opus_int16 a, opus_int16 b, char *file
 #undef silk_SUB32
 #define silk_SUB32(a,b) silk_SUB32_((a), (b), __FILE__, __LINE__)
 static OPUS_INLINE opus_int32 silk_SUB32_(opus_int32 a, opus_int32 b, char *file, int line){
-    opus_int32 ret;
+    opus_int64 ret;
 
-    ret = a - b;
+    ret = a - (opus_int64)b;
     if ( ret != silk_SUB_SAT32( a, b ) )
     {
         fprintf (stderr, "silk_SUB32(%d, %d) in %s: line %d\n", a, b, file, line);
@@ -257,7 +257,7 @@ static OPUS_INLINE opus_int64 silk_SUB_SAT64_( opus_int64 a64, opus_int64 b64, c
 static OPUS_INLINE opus_int32 silk_MUL_(opus_int32 a32, opus_int32 b32, char *file, int line){
     opus_int32 ret;
     opus_int64 ret64;
-    ret = a32 * b32;
+    ret = (opus_int32)((opus_uint32)a32 * (opus_uint32)b32);
     ret64 = (opus_int64)a32 * (opus_int64)b32;
     if ( (opus_int64)ret != ret64 )
     {
@@ -333,8 +333,8 @@ static OPUS_INLINE opus_int32 silk_SMULWB_(opus_int32 a32, opus_int32 b32, char
 #define silk_SMLAWB(a,b,c) silk_SMLAWB_((a), (b), (c), __FILE__, __LINE__)
 static OPUS_INLINE opus_int32 silk_SMLAWB_(opus_int32 a32, opus_int32 b32, opus_int32 c32, char *file, int line){
     opus_int32 ret;
-    ret = silk_ADD32( a32, silk_SMULWB( b32, c32 ) );
-    if ( silk_ADD32( a32, silk_SMULWB( b32, c32 ) ) != silk_ADD_SAT32( a32, silk_SMULWB( b32, c32 ) ) )
+    ret = silk_ADD32_ovflw( a32, silk_SMULWB( b32, c32 ) );
+    if ( ret != silk_ADD_SAT32( a32, silk_SMULWB( b32, c32 ) ) )
     {
         fprintf (stderr, "silk_SMLAWB(%d, %d, %d) in %s: line %d\n", a32, b32, c32, file, line);
 #ifdef FIXED_DEBUG_ASSERT
@@ -465,7 +465,7 @@ static OPUS_INLINE opus_int32 silk_SMULWW_(opus_int32 a32, opus_int32 b32, char
 
     if ( fail )
     {
-        fprintf (stderr, "silk_SMULWT(%d, %d) in %s: line %d\n", a32, b32, file, line);
+        fprintf (stderr, "silk_SMULWW(%d, %d) in %s: line %d\n", a32, b32, file, line);
 #ifdef FIXED_DEBUG_ASSERT
         silk_assert( 0 );
 #endif
@@ -491,12 +491,6 @@ static OPUS_INLINE opus_int32 silk_SMLAWW_(opus_int32 a32, opus_int32 b32, opus_
     return ret;
 }
 
-/* Multiply-accumulate macros that allow overflow in the addition (ie, no asserts in debug mode) */
-#undef  silk_MLA_ovflw
-#define silk_MLA_ovflw(a32, b32, c32)    ((a32) + ((b32) * (c32)))
-#undef  silk_SMLABB_ovflw
-#define silk_SMLABB_ovflw(a32, b32, c32)    ((a32) + ((opus_int32)((opus_int16)(b32))) * (opus_int32)((opus_int16)(c32)))
-
 /* no checking needed for silk_SMULL
    no checking needed for silk_SMLAL
    no checking needed for silk_SMLALBB
@@ -546,10 +540,10 @@ static OPUS_INLINE opus_int32 silk_DIV32_16_(opus_int32 a32, opus_int32 b32, cha
 static OPUS_INLINE opus_int8 silk_LSHIFT8_(opus_int8 a, opus_int32 shift, char *file, int line){
     opus_int8 ret;
     int       fail = 0;
-    ret = a << shift;
+    ret = (opus_int8)((opus_uint8)a << shift);
     fail |= shift < 0;
     fail |= shift >= 8;
-    fail |= (opus_int64)ret != ((opus_int64)a) << shift;
+    fail |= (opus_int64)ret != (opus_int64)(((opus_uint64)a) << shift);
     if ( fail )
     {
         fprintf (stderr, "silk_LSHIFT8(%d, %d) in %s: line %d\n", a, shift, file, line);
@@ -565,10 +559,10 @@ static OPUS_INLINE opus_int8 silk_LSHIFT8_(opus_int8 a, opus_int32 shift, char *
 static OPUS_INLINE opus_int16 silk_LSHIFT16_(opus_int16 a, opus_int32 shift, char *file, int line){
     opus_int16 ret;
     int        fail = 0;
-    ret = a << shift;
+    ret = (opus_int16)((opus_uint16)a << shift);
     fail |= shift < 0;
     fail |= shift >= 16;
-    fail |= (opus_int64)ret != ((opus_int64)a) << shift;
+    fail |= (opus_int64)ret != (opus_int64)(((opus_uint64)a) << shift);
     if ( fail )
     {
         fprintf (stderr, "silk_LSHIFT16(%d, %d) in %s: line %d\n", a, shift, file, line);
@@ -584,10 +578,10 @@ static OPUS_INLINE opus_int16 silk_LSHIFT16_(opus_int16 a, opus_int32 shift, cha
 static OPUS_INLINE opus_int32 silk_LSHIFT32_(opus_int32 a, opus_int32 shift, char *file, int line){
     opus_int32 ret;
     int        fail = 0;
-    ret = a << shift;
+    ret = (opus_int32)((opus_uint32)a << shift);
     fail |= shift < 0;
     fail |= shift >= 32;
-    fail |= (opus_int64)ret != ((opus_int64)a) << shift;
+    fail |= (opus_int64)ret != (opus_int64)(((opus_uint64)a) << shift);
     if ( fail )
     {
         fprintf (stderr, "silk_LSHIFT32(%d, %d) in %s: line %d\n", a, shift, file, line);
@@ -603,7 +597,7 @@ static OPUS_INLINE opus_int32 silk_LSHIFT32_(opus_int32 a, opus_int32 shift, cha
 static OPUS_INLINE opus_int64 silk_LSHIFT64_(opus_int64 a, opus_int shift, char *file, int line){
     opus_int64 ret;
     int        fail = 0;
-    ret = a << shift;
+    ret = (opus_int64)((opus_uint64)a << shift);
     fail |= shift < 0;
     fail |= shift >= 64;
     fail |= (ret>>shift) != ((opus_int64)a);
@@ -714,8 +708,8 @@ static OPUS_INLINE opus_uint32 silk_RSHIFT_uint_(opus_uint32 a, opus_int32 shift
 #define silk_ADD_LSHIFT(a,b,c) silk_ADD_LSHIFT_((a), (b), (c), __FILE__, __LINE__)
 static OPUS_INLINE int silk_ADD_LSHIFT_(int a, int b, int shift, char *file, int line){
     opus_int16 ret;
-    ret = a + (b << shift);
-    if ( (shift < 0) || (shift>15) || ((opus_int64)ret != (opus_int64)a + (((opus_int64)b) << shift)) )
+    ret = a + (opus_int16)((opus_uint16)b << shift);
+    if ( (shift < 0) || (shift>15) || ((opus_int64)ret != (opus_int64)a + (opus_int64)(((opus_uint64)b) << shift)) )
     {
         fprintf (stderr, "silk_ADD_LSHIFT(%d, %d, %d) in %s: line %d\n", a, b, shift, file, line);
 #ifdef FIXED_DEBUG_ASSERT
@@ -729,8 +723,8 @@ static OPUS_INLINE int silk_ADD_LSHIFT_(int a, int b, int shift, char *file, int
 #define silk_ADD_LSHIFT32(a,b,c) silk_ADD_LSHIFT32_((a), (b), (c), __FILE__, __LINE__)
 static OPUS_INLINE opus_int32 silk_ADD_LSHIFT32_(opus_int32 a, opus_int32 b, opus_int32 shift, char *file, int line){
     opus_int32 ret;
-    ret = a + (b << shift);
-    if ( (shift < 0) || (shift>31) || ((opus_int64)ret != (opus_int64)a + (((opus_int64)b) << shift)) )
+    ret = silk_ADD32_ovflw(a, (opus_int32)((opus_uint32)b << shift));
+    if ( (shift < 0) || (shift>31) || ((opus_int64)ret != (opus_int64)a + (opus_int64)(((opus_uint64)b) << shift)) )
     {
         fprintf (stderr, "silk_ADD_LSHIFT32(%d, %d, %d) in %s: line %d\n", a, b, shift, file, line);
 #ifdef FIXED_DEBUG_ASSERT
@@ -774,7 +768,7 @@ static OPUS_INLINE int silk_ADD_RSHIFT_(int a, int b, int shift, char *file, int
 #define silk_ADD_RSHIFT32(a,b,c) silk_ADD_RSHIFT32_((a), (b), (c), __FILE__, __LINE__)
 static OPUS_INLINE opus_int32 silk_ADD_RSHIFT32_(opus_int32 a, opus_int32 b, opus_int32 shift, char *file, int line){
     opus_int32 ret;
-    ret = a + (b >> shift);
+    ret = silk_ADD32_ovflw(a, (b >> shift));
     if ( (shift < 0) || (shift>31) || ((opus_int64)ret != (opus_int64)a + (((opus_int64)b) >> shift)) )
     {
         fprintf (stderr, "silk_ADD_RSHIFT32(%d, %d, %d) in %s: line %d\n", a, b, shift, file, line);
@@ -804,8 +798,8 @@ static OPUS_INLINE opus_uint32 silk_ADD_RSHIFT_uint_(opus_uint32 a, opus_uint32
 #define silk_SUB_LSHIFT32(a,b,c) silk_SUB_LSHIFT32_((a), (b), (c), __FILE__, __LINE__)
 static OPUS_INLINE opus_int32 silk_SUB_LSHIFT32_(opus_int32 a, opus_int32 b, opus_int32 shift, char *file, int line){
     opus_int32 ret;
-    ret = a - (b << shift);
-    if ( (shift < 0) || (shift>31) || ((opus_int64)ret != (opus_int64)a - (((opus_int64)b) << shift)) )
+    ret = silk_SUB32_ovflw(a, (opus_int32)((opus_uint32)b << shift));
+    if ( (shift < 0) || (shift>31) || ((opus_int64)ret != (opus_int64)a - (opus_int64)(((opus_uint64)b) << shift)) )
     {
         fprintf (stderr, "silk_SUB_LSHIFT32(%d, %d, %d) in %s: line %d\n", a, b, shift, file, line);
 #ifdef FIXED_DEBUG_ASSERT
@@ -819,7 +813,7 @@ static OPUS_INLINE opus_int32 silk_SUB_LSHIFT32_(opus_int32 a, opus_int32 b, opu
 #define silk_SUB_RSHIFT32(a,b,c) silk_SUB_RSHIFT32_((a), (b), (c), __FILE__, __LINE__)
 static OPUS_INLINE opus_int32 silk_SUB_RSHIFT32_(opus_int32 a, opus_int32 b, opus_int32 shift, char *file, int line){
     opus_int32 ret;
-    ret = a - (b >> shift);
+    ret = silk_SUB32_ovflw(a, (b >> shift));
     if ( (shift < 0) || (shift>31) || ((opus_int64)ret != (opus_int64)a - (((opus_int64)b) >> shift)) )
     {
         fprintf (stderr, "silk_SUB_RSHIFT32(%d, %d, %d) in %s: line %d\n", a, b, shift, file, line);
@@ -835,7 +829,7 @@ static OPUS_INLINE opus_int32 silk_SUB_RSHIFT32_(opus_int32 a, opus_int32 b, opu
 static OPUS_INLINE opus_int32 silk_RSHIFT_ROUND_(opus_int32 a, opus_int32 shift, char *file, int line){
     opus_int32 ret;
     ret = shift == 1 ? (a >> 1) + (a & 1) : ((a >> (shift - 1)) + 1) >> 1;
-    /* the marco definition can't handle a shift of zero */
+    /* the macro definition can't handle a shift of zero */
     if ( (shift <= 0) || (shift>31) || ((opus_int64)ret != ((opus_int64)a + ((opus_int64)1 << (shift - 1))) >> shift) )
     {
         fprintf (stderr, "silk_RSHIFT_ROUND(%d, %d) in %s: line %d\n", a, shift, file, line);
@@ -850,7 +844,7 @@ static OPUS_INLINE opus_int32 silk_RSHIFT_ROUND_(opus_int32 a, opus_int32 shift,
 #define silk_RSHIFT_ROUND64(a,b) silk_RSHIFT_ROUND64_((a), (b), __FILE__, __LINE__)
 static OPUS_INLINE opus_int64 silk_RSHIFT_ROUND64_(opus_int64 a, opus_int32 shift, char *file, int line){
     opus_int64 ret;
-    /* the marco definition can't handle a shift of zero */
+    /* the macro definition can't handle a shift of zero */
     if ( (shift <= 0) || (shift>=64) )
     {
         fprintf (stderr, "silk_RSHIFT_ROUND64(%lld, %d) in %s: line %d\n", (long long)a, shift, file, line);
diff --git a/opus/silk/NSQ.c b/opus/silk/NSQ.c
index 1d64d8e2..1caa829b 100644
--- a/opus/silk/NSQ.c
+++ b/opus/silk/NSQ.c
@@ -75,21 +75,21 @@ static OPUS_INLINE void silk_noise_shape_quantizer(
 
 void silk_NSQ_c
 (
-    const silk_encoder_state    *psEncC,                                    /* I    Encoder State                   */
-    silk_nsq_state              *NSQ,                                       /* I/O  NSQ state                       */
-    SideInfoIndices             *psIndices,                                 /* I/O  Quantization Indices            */
+    const silk_encoder_state    *psEncC,                                      /* I    Encoder State                   */
+    silk_nsq_state              *NSQ,                                         /* I/O  NSQ state                       */
+    SideInfoIndices             *psIndices,                                   /* I/O  Quantization Indices            */
     const opus_int16            x16[],                                        /* I    Input                           */
-    opus_int8                   pulses[],                                   /* O    Quantized pulse signal          */
-    const opus_int16            PredCoef_Q12[ 2 * MAX_LPC_ORDER ],          /* I    Short term prediction coefs     */
-    const opus_int16            LTPCoef_Q14[ LTP_ORDER * MAX_NB_SUBFR ],    /* I    Long term prediction coefs      */
-    const opus_int16            AR_Q13[ MAX_NB_SUBFR * MAX_SHAPE_LPC_ORDER ], /* I Noise shaping coefs              */
-    const opus_int              HarmShapeGain_Q14[ MAX_NB_SUBFR ],          /* I    Long term shaping coefs         */
-    const opus_int              Tilt_Q14[ MAX_NB_SUBFR ],                   /* I    Spectral tilt                   */
-    const opus_int32            LF_shp_Q14[ MAX_NB_SUBFR ],                 /* I    Low frequency shaping coefs     */
-    const opus_int32            Gains_Q16[ MAX_NB_SUBFR ],                  /* I    Quantization step sizes         */
-    const opus_int              pitchL[ MAX_NB_SUBFR ],                     /* I    Pitch lags                      */
-    const opus_int              Lambda_Q10,                                 /* I    Rate/distortion tradeoff        */
-    const opus_int              LTP_scale_Q14                               /* I    LTP state scaling               */
+    opus_int8                   pulses[],                                     /* O    Quantized pulse signal          */
+    const opus_int16            *PredCoef_Q12,                                /* I    Short term prediction coefs     */
+    const opus_int16            LTPCoef_Q14[ LTP_ORDER * MAX_NB_SUBFR ],      /* I    Long term prediction coefs      */
+    const opus_int16            AR_Q13[ MAX_NB_SUBFR * MAX_SHAPE_LPC_ORDER ], /* I    Noise shaping coefs             */
+    const opus_int              HarmShapeGain_Q14[ MAX_NB_SUBFR ],            /* I    Long term shaping coefs         */
+    const opus_int              Tilt_Q14[ MAX_NB_SUBFR ],                     /* I    Spectral tilt                   */
+    const opus_int32            LF_shp_Q14[ MAX_NB_SUBFR ],                   /* I    Low frequency shaping coefs     */
+    const opus_int32            Gains_Q16[ MAX_NB_SUBFR ],                    /* I    Quantization step sizes         */
+    const opus_int              pitchL[ MAX_NB_SUBFR ],                       /* I    Pitch lags                      */
+    const opus_int              Lambda_Q10,                                   /* I    Rate/distortion tradeoff        */
+    const opus_int              LTP_scale_Q14                                 /* I    LTP state scaling               */
 )
 {
     opus_int            k, lag, start_idx, LSF_interpolation_flag;
@@ -173,9 +173,9 @@ void silk_NSQ_c
     RESTORE_STACK;
 }
 
-/***********************************/
-/* silk_noise_shape_quantizer  */
-/***********************************/
+/******************************/
+/* silk_noise_shape_quantizer */
+/******************************/
 
 #if !defined(OPUS_X86_MAY_HAVE_SSE4_1)
 static OPUS_INLINE
@@ -258,17 +258,17 @@ void silk_noise_shape_quantizer(
         celt_assert( lag > 0 || signalType != TYPE_VOICED );
 
         /* Combine prediction and noise shaping signals */
-        tmp1 = silk_SUB32( silk_LSHIFT32( LPC_pred_Q10, 2 ), n_AR_Q12 );        /* Q12 */
-        tmp1 = silk_SUB32( tmp1, n_LF_Q12 );                                    /* Q12 */
+        tmp1 = silk_SUB32_ovflw( silk_LSHIFT32( LPC_pred_Q10, 2 ), n_AR_Q12 );  /* Q12 */
+        tmp1 = silk_SUB32_ovflw( tmp1, n_LF_Q12 );                              /* Q12 */
         if( lag > 0 ) {
             /* Symmetric, packed FIR coefficients */
-            n_LTP_Q13 = silk_SMULWB( silk_ADD32( shp_lag_ptr[ 0 ], shp_lag_ptr[ -2 ] ), HarmShapeFIRPacked_Q14 );
+            n_LTP_Q13 = silk_SMULWB( silk_ADD_SAT32( shp_lag_ptr[ 0 ], shp_lag_ptr[ -2 ] ), HarmShapeFIRPacked_Q14 );
             n_LTP_Q13 = silk_SMLAWT( n_LTP_Q13, shp_lag_ptr[ -1 ],                      HarmShapeFIRPacked_Q14 );
             n_LTP_Q13 = silk_LSHIFT( n_LTP_Q13, 1 );
             shp_lag_ptr++;
 
             tmp2 = silk_SUB32( LTP_pred_Q13, n_LTP_Q13 );                       /* Q13 */
-            tmp1 = silk_ADD_LSHIFT32( tmp2, tmp1, 1 );                          /* Q13 */
+            tmp1 = silk_ADD32_ovflw( tmp2, silk_LSHIFT32( tmp1, 1 ) );          /* Q13 */
             tmp1 = silk_RSHIFT_ROUND( tmp1, 3 );                                /* Q10 */
         } else {
             tmp1 = silk_RSHIFT_ROUND( tmp1, 2 );                                /* Q10 */
@@ -340,7 +340,7 @@ void silk_noise_shape_quantizer(
 
         /* Add predictions */
         LPC_exc_Q14 = silk_ADD_LSHIFT32( exc_Q14, LTP_pred_Q13, 1 );
-        xq_Q14      = silk_ADD_LSHIFT32( LPC_exc_Q14, LPC_pred_Q10, 4 );
+        xq_Q14      = silk_ADD32_ovflw( LPC_exc_Q14, silk_LSHIFT32( LPC_pred_Q10, 4 ) );
 
         /* Scale XQ back to normal level before saving */
         xq[ i ] = (opus_int16)silk_SAT16( silk_RSHIFT_ROUND( silk_SMULWW( xq_Q14, Gain_Q10 ), 8 ) );
@@ -349,10 +349,10 @@ void silk_noise_shape_quantizer(
         psLPC_Q14++;
         *psLPC_Q14 = xq_Q14;
         NSQ->sDiff_shp_Q14 = silk_SUB_LSHIFT32( xq_Q14, x_sc_Q10[ i ], 4 );
-        sLF_AR_shp_Q14 = silk_SUB_LSHIFT32( NSQ->sDiff_shp_Q14, n_AR_Q12, 2 );
+        sLF_AR_shp_Q14 = silk_SUB32_ovflw( NSQ->sDiff_shp_Q14, silk_LSHIFT32( n_AR_Q12, 2 ) );
         NSQ->sLF_AR_shp_Q14 = sLF_AR_shp_Q14;
 
-        NSQ->sLTP_shp_Q14[ NSQ->sLTP_shp_buf_idx ] = silk_SUB_LSHIFT32( sLF_AR_shp_Q14, n_LF_Q12, 2 );
+        NSQ->sLTP_shp_Q14[ NSQ->sLTP_shp_buf_idx ] = silk_SUB32_ovflw(sLF_AR_shp_Q14, silk_LSHIFT32(n_LF_Q12, 2));
         sLTP_Q15[ NSQ->sLTP_buf_idx ] = silk_LSHIFT( LPC_exc_Q14, 1 );
         NSQ->sLTP_shp_buf_idx++;
         NSQ->sLTP_buf_idx++;
diff --git a/opus/silk/NSQ_del_dec.c b/opus/silk/NSQ_del_dec.c
index 3fd9fa0d..e8dadf15 100644
--- a/opus/silk/NSQ_del_dec.c
+++ b/opus/silk/NSQ_del_dec.c
@@ -115,21 +115,21 @@ static OPUS_INLINE void silk_noise_shape_quantizer_del_dec(
 );
 
 void silk_NSQ_del_dec_c(
-    const silk_encoder_state    *psEncC,                                    /* I    Encoder State                   */
-    silk_nsq_state              *NSQ,                                       /* I/O  NSQ state                       */
-    SideInfoIndices             *psIndices,                                 /* I/O  Quantization Indices            */
+    const silk_encoder_state    *psEncC,                                      /* I    Encoder State                   */
+    silk_nsq_state              *NSQ,                                         /* I/O  NSQ state                       */
+    SideInfoIndices             *psIndices,                                   /* I/O  Quantization Indices            */
     const opus_int16            x16[],                                        /* I    Input                           */
-    opus_int8                   pulses[],                                   /* O    Quantized pulse signal          */
-    const opus_int16            PredCoef_Q12[ 2 * MAX_LPC_ORDER ],          /* I    Short term prediction coefs     */
-    const opus_int16            LTPCoef_Q14[ LTP_ORDER * MAX_NB_SUBFR ],    /* I    Long term prediction coefs      */
-    const opus_int16            AR_Q13[ MAX_NB_SUBFR * MAX_SHAPE_LPC_ORDER ], /* I Noise shaping coefs              */
-    const opus_int              HarmShapeGain_Q14[ MAX_NB_SUBFR ],          /* I    Long term shaping coefs         */
-    const opus_int              Tilt_Q14[ MAX_NB_SUBFR ],                   /* I    Spectral tilt                   */
-    const opus_int32            LF_shp_Q14[ MAX_NB_SUBFR ],                 /* I    Low frequency shaping coefs     */
-    const opus_int32            Gains_Q16[ MAX_NB_SUBFR ],                  /* I    Quantization step sizes         */
-    const opus_int              pitchL[ MAX_NB_SUBFR ],                     /* I    Pitch lags                      */
-    const opus_int              Lambda_Q10,                                 /* I    Rate/distortion tradeoff        */
-    const opus_int              LTP_scale_Q14                               /* I    LTP state scaling               */
+    opus_int8                   pulses[],                                     /* O    Quantized pulse signal          */
+    const opus_int16            *PredCoef_Q12,                                /* I    Short term prediction coefs     */
+    const opus_int16            LTPCoef_Q14[ LTP_ORDER * MAX_NB_SUBFR ],      /* I    Long term prediction coefs      */
+    const opus_int16            AR_Q13[ MAX_NB_SUBFR * MAX_SHAPE_LPC_ORDER ], /* I    Noise shaping coefs             */
+    const opus_int              HarmShapeGain_Q14[ MAX_NB_SUBFR ],            /* I    Long term shaping coefs         */
+    const opus_int              Tilt_Q14[ MAX_NB_SUBFR ],                     /* I    Spectral tilt                   */
+    const opus_int32            LF_shp_Q14[ MAX_NB_SUBFR ],                   /* I    Low frequency shaping coefs     */
+    const opus_int32            Gains_Q16[ MAX_NB_SUBFR ],                    /* I    Quantization step sizes         */
+    const opus_int              pitchL[ MAX_NB_SUBFR ],                       /* I    Pitch lags                      */
+    const opus_int              Lambda_Q10,                                   /* I    Rate/distortion tradeoff        */
+    const opus_int              LTP_scale_Q14                                 /* I    LTP state scaling               */
 )
 {
     opus_int            i, k, lag, start_idx, LSF_interpolation_flag, Winner_ind, subfr;
@@ -394,8 +394,8 @@ static OPUS_INLINE void silk_noise_shape_quantizer_del_dec(
         /* Long-term shaping */
         if( lag > 0 ) {
             /* Symmetric, packed FIR coefficients */
-            n_LTP_Q14 = silk_SMULWB( silk_ADD32( shp_lag_ptr[ 0 ], shp_lag_ptr[ -2 ] ), HarmShapeFIRPacked_Q14 );
-            n_LTP_Q14 = silk_SMLAWT( n_LTP_Q14, shp_lag_ptr[ -1 ],                      HarmShapeFIRPacked_Q14 );
+            n_LTP_Q14 = silk_SMULWB( silk_ADD_SAT32( shp_lag_ptr[ 0 ], shp_lag_ptr[ -2 ] ), HarmShapeFIRPacked_Q14 );
+            n_LTP_Q14 = silk_SMLAWT( n_LTP_Q14, shp_lag_ptr[ -1 ], HarmShapeFIRPacked_Q14 );
             n_LTP_Q14 = silk_SUB_LSHIFT32( LTP_pred_Q14, n_LTP_Q14, 2 );            /* Q12 -> Q14 */
             shp_lag_ptr++;
         } else {
@@ -423,18 +423,18 @@ static OPUS_INLINE void silk_noise_shape_quantizer_del_dec(
             /* Output of lowpass section */
             tmp2 = silk_SMLAWB( psDD->Diff_Q14, psDD->sAR2_Q14[ 0 ], warping_Q16 );
             /* Output of allpass section */
-            tmp1 = silk_SMLAWB( psDD->sAR2_Q14[ 0 ], psDD->sAR2_Q14[ 1 ] - tmp2, warping_Q16 );
+            tmp1 = silk_SMLAWB( psDD->sAR2_Q14[ 0 ], silk_SUB32_ovflw(psDD->sAR2_Q14[ 1 ], tmp2), warping_Q16 );
             psDD->sAR2_Q14[ 0 ] = tmp2;
             n_AR_Q14 = silk_RSHIFT( shapingLPCOrder, 1 );
             n_AR_Q14 = silk_SMLAWB( n_AR_Q14, tmp2, AR_shp_Q13[ 0 ] );
             /* Loop over allpass sections */
             for( j = 2; j < shapingLPCOrder; j += 2 ) {
                 /* Output of allpass section */
-                tmp2 = silk_SMLAWB( psDD->sAR2_Q14[ j - 1 ], psDD->sAR2_Q14[ j + 0 ] - tmp1, warping_Q16 );
+                tmp2 = silk_SMLAWB( psDD->sAR2_Q14[ j - 1 ], silk_SUB32_ovflw(psDD->sAR2_Q14[ j + 0 ], tmp1), warping_Q16 );
                 psDD->sAR2_Q14[ j - 1 ] = tmp1;
                 n_AR_Q14 = silk_SMLAWB( n_AR_Q14, tmp1, AR_shp_Q13[ j - 1 ] );
                 /* Output of allpass section */
-                tmp1 = silk_SMLAWB( psDD->sAR2_Q14[ j + 0 ], psDD->sAR2_Q14[ j + 1 ] - tmp2, warping_Q16 );
+                tmp1 = silk_SMLAWB( psDD->sAR2_Q14[ j + 0 ], silk_SUB32_ovflw(psDD->sAR2_Q14[ j + 1 ], tmp2), warping_Q16 );
                 psDD->sAR2_Q14[ j + 0 ] = tmp2;
                 n_AR_Q14 = silk_SMLAWB( n_AR_Q14, tmp2, AR_shp_Q13[ j ] );
             }
@@ -451,9 +451,9 @@ static OPUS_INLINE void silk_noise_shape_quantizer_del_dec(
 
             /* Input minus prediction plus noise feedback                       */
             /* r = x[ i ] - LTP_pred - LPC_pred + n_AR + n_Tilt + n_LF + n_LTP  */
-            tmp1 = silk_ADD32( n_AR_Q14, n_LF_Q14 );                                    /* Q14 */
-            tmp2 = silk_ADD32( n_LTP_Q14, LPC_pred_Q14 );                               /* Q13 */
-            tmp1 = silk_SUB32( tmp2, tmp1 );                                            /* Q13 */
+            tmp1 = silk_ADD_SAT32( n_AR_Q14, n_LF_Q14 );                                /* Q14 */
+            tmp2 = silk_ADD32_ovflw( n_LTP_Q14, LPC_pred_Q14 );                         /* Q13 */
+            tmp1 = silk_SUB_SAT32( tmp2, tmp1 );                                        /* Q13 */
             tmp1 = silk_RSHIFT_ROUND( tmp1, 4 );                                        /* Q10 */
 
             r_Q10 = silk_SUB32( x_Q10[ i ], tmp1 );                                     /* residual error Q10 */
@@ -530,12 +530,12 @@ static OPUS_INLINE void silk_noise_shape_quantizer_del_dec(
 
             /* Add predictions */
             LPC_exc_Q14 = silk_ADD32( exc_Q14, LTP_pred_Q14 );
-            xq_Q14      = silk_ADD32( LPC_exc_Q14, LPC_pred_Q14 );
+            xq_Q14      = silk_ADD32_ovflw( LPC_exc_Q14, LPC_pred_Q14 );
 
             /* Update states */
-            psSS[ 0 ].Diff_Q14     = silk_SUB_LSHIFT32( xq_Q14, x_Q10[ i ], 4 );
-            sLF_AR_shp_Q14         = silk_SUB32( psSS[ 0 ].Diff_Q14, n_AR_Q14 );
-            psSS[ 0 ].sLTP_shp_Q14 = silk_SUB32( sLF_AR_shp_Q14, n_LF_Q14 );
+            psSS[ 0 ].Diff_Q14     = silk_SUB32_ovflw( xq_Q14, silk_LSHIFT32( x_Q10[ i ], 4 ) );
+            sLF_AR_shp_Q14         = silk_SUB32_ovflw( psSS[ 0 ].Diff_Q14, n_AR_Q14 );
+            psSS[ 0 ].sLTP_shp_Q14 = silk_SUB_SAT32( sLF_AR_shp_Q14, n_LF_Q14 );
             psSS[ 0 ].LF_AR_Q14    = sLF_AR_shp_Q14;
             psSS[ 0 ].LPC_exc_Q14  = LPC_exc_Q14;
             psSS[ 0 ].xq_Q14       = xq_Q14;
@@ -550,12 +550,12 @@ static OPUS_INLINE void silk_noise_shape_quantizer_del_dec(
 
             /* Add predictions */
             LPC_exc_Q14 = silk_ADD32( exc_Q14, LTP_pred_Q14 );
-            xq_Q14      = silk_ADD32( LPC_exc_Q14, LPC_pred_Q14 );
+            xq_Q14      = silk_ADD32_ovflw( LPC_exc_Q14, LPC_pred_Q14 );
 
             /* Update states */
-            psSS[ 1 ].Diff_Q14     = silk_SUB_LSHIFT32( xq_Q14, x_Q10[ i ], 4 );
-            sLF_AR_shp_Q14         = silk_SUB32( psSS[ 1 ].Diff_Q14, n_AR_Q14 );
-            psSS[ 1 ].sLTP_shp_Q14 = silk_SUB32( sLF_AR_shp_Q14, n_LF_Q14 );
+            psSS[ 1 ].Diff_Q14     = silk_SUB32_ovflw( xq_Q14, silk_LSHIFT32( x_Q10[ i ], 4 ) );
+            sLF_AR_shp_Q14         = silk_SUB32_ovflw( psSS[ 1 ].Diff_Q14, n_AR_Q14 );
+            psSS[ 1 ].sLTP_shp_Q14 = silk_SUB_SAT32( sLF_AR_shp_Q14, n_LF_Q14 );
             psSS[ 1 ].LF_AR_Q14    = sLF_AR_shp_Q14;
             psSS[ 1 ].LPC_exc_Q14  = LPC_exc_Q14;
             psSS[ 1 ].xq_Q14       = xq_Q14;
diff --git a/opus/silk/PLC.c b/opus/silk/PLC.c
index f8939165..b35bf750 100644
--- a/opus/silk/PLC.c
+++ b/opus/silk/PLC.c
@@ -33,6 +33,10 @@ POSSIBILITY OF SUCH DAMAGE.
 #include "stack_alloc.h"
 #include "PLC.h"
 
+#ifdef ENABLE_DEEP_PLC
+#include "lpcnet.h"
+#endif
+
 #define NB_ATT 2
 static const opus_int16 HARM_ATT_Q15[NB_ATT]              = { 32440, 31130 }; /* 0.99, 0.95 */
 static const opus_int16 PLC_RAND_ATTENUATE_V_Q15[NB_ATT]  = { 31130, 26214 }; /* 0.95, 0.8 */
@@ -47,6 +51,9 @@ static OPUS_INLINE void silk_PLC_conceal(
     silk_decoder_state                  *psDec,             /* I/O Decoder state        */
     silk_decoder_control                *psDecCtrl,         /* I/O Decoder control      */
     opus_int16                          frame[],            /* O LPC residual signal    */
+#ifdef ENABLE_DEEP_PLC
+    LPCNetPLCState                      *lpcnet,
+#endif
     int                                 arch                /* I  Run-time architecture */
 );
 
@@ -67,6 +74,9 @@ void silk_PLC(
     silk_decoder_control                *psDecCtrl,         /* I/O Decoder control      */
     opus_int16                          frame[],            /* I/O  signal              */
     opus_int                            lost,               /* I Loss flag              */
+#ifdef ENABLE_DEEP_PLC
+    LPCNetPLCState                      *lpcnet,
+#endif
     int                                 arch                /* I Run-time architecture  */
 )
 {
@@ -80,7 +90,11 @@ void silk_PLC(
         /****************************/
         /* Generate Signal          */
         /****************************/
-        silk_PLC_conceal( psDec, psDecCtrl, frame, arch );
+        silk_PLC_conceal( psDec, psDecCtrl, frame,
+#ifdef ENABLE_DEEP_PLC
+            lpcnet,
+#endif
+            arch );
 
         psDec->lossCnt++;
     } else {
@@ -88,6 +102,14 @@ void silk_PLC(
         /* Update state             */
         /****************************/
         silk_PLC_update( psDec, psDecCtrl );
+#ifdef ENABLE_DEEP_PLC
+        if ( lpcnet != NULL && psDec->sPLC.fs_kHz == 16 ) {
+            int k;
+            for( k = 0; k < psDec->nb_subfr; k += 2 ) {
+                lpcnet_plc_update( lpcnet, frame + k * psDec->subfr_length );
+            }
+        }
+#endif
     }
 }
 
@@ -195,6 +217,9 @@ static OPUS_INLINE void silk_PLC_conceal(
     silk_decoder_state                  *psDec,             /* I/O Decoder state        */
     silk_decoder_control                *psDecCtrl,         /* I/O Decoder control      */
     opus_int16                          frame[],            /* O LPC residual signal    */
+#ifdef ENABLE_DEEP_PLC
+    LPCNetPLCState                      *lpcnet,
+#endif
     int                                 arch                /* I Run-time architecture  */
 )
 {
@@ -328,10 +353,8 @@ static OPUS_INLINE void silk_PLC_conceal(
         for( j = 0; j < LTP_ORDER; j++ ) {
             B_Q14[ j ] = silk_RSHIFT( silk_SMULBB( harm_Gain_Q15, B_Q14[ j ] ), 15 );
         }
-        if ( psDec->indices.signalType != TYPE_NO_VOICE_ACTIVITY ) {
-            /* Gradually reduce excitation gain */
-            rand_scale_Q14 = silk_RSHIFT( silk_SMULBB( rand_scale_Q14, rand_Gain_Q15 ), 15 );
-        }
+        /* Gradually reduce excitation gain */
+        rand_scale_Q14 = silk_RSHIFT( silk_SMULBB( rand_scale_Q14, rand_Gain_Q15 ), 15 );
 
         /* Slowly increase pitch lag */
         psPLC->pitchL_Q8 = silk_SMLAWB( psPLC->pitchL_Q8, psPLC->pitchL_Q8, PITCH_DRIFT_FAC_Q16 );
@@ -373,6 +396,24 @@ static OPUS_INLINE void silk_PLC_conceal(
         /* Scale with Gain */
         frame[ i ] = (opus_int16)silk_SAT16( silk_SAT16( silk_RSHIFT_ROUND( silk_SMULWW( sLPC_Q14_ptr[ MAX_LPC_ORDER + i ], prevGain_Q10[ 1 ] ), 8 ) ) );
     }
+#ifdef ENABLE_DEEP_PLC
+    if ( lpcnet != NULL && lpcnet->loaded && psDec->sPLC.fs_kHz == 16 ) {
+        int run_deep_plc = psDec->sPLC.enable_deep_plc || lpcnet->fec_fill_pos != 0;
+        if( run_deep_plc ) {
+            for( k = 0; k < psDec->nb_subfr; k += 2 ) {
+                lpcnet_plc_conceal( lpcnet, frame + k * psDec->subfr_length );
+            }
+            /* We *should* be able to copy only from psDec->frame_length-MAX_LPC_ORDER, i.e. the last MAX_LPC_ORDER samples. */
+            for( i = 0; i < psDec->frame_length; i++ ) {
+                sLPC_Q14_ptr[ MAX_LPC_ORDER + i ] = (int)floor(.5 + frame[ i ] * (float)(1 << 24) / prevGain_Q10[ 1 ] );
+            }
+        } else {
+          for( k = 0; k < psDec->nb_subfr; k += 2 ) {
+              lpcnet_plc_update( lpcnet, frame + k * psDec->subfr_length );
+          }
+        }
+    }
+#endif
 
     /* Save LPC state */
     silk_memcpy( psDec->sLPC_Q14_buf, &sLPC_Q14_ptr[ psDec->frame_length ], MAX_LPC_ORDER * sizeof( opus_int32 ) );
@@ -433,12 +474,16 @@ void silk_PLC_glue_frames(
                 slope_Q16 = silk_DIV32_16( ( (opus_int32)1 << 16 ) - gain_Q16, length );
                 /* Make slope 4x steeper to avoid missing onsets after DTX */
                 slope_Q16 = silk_LSHIFT( slope_Q16, 2 );
-
-                for( i = 0; i < length; i++ ) {
-                    frame[ i ] = silk_SMULWB( gain_Q16, frame[ i ] );
-                    gain_Q16 += slope_Q16;
-                    if( gain_Q16 > (opus_int32)1 << 16 ) {
-                        break;
+#ifdef ENABLE_DEEP_PLC
+                if ( psDec->sPLC.fs_kHz != 16 )
+#endif
+                {
+                    for( i = 0; i < length; i++ ) {
+                        frame[ i ] = silk_SMULWB( gain_Q16, frame[ i ] );
+                        gain_Q16 += slope_Q16;
+                        if( gain_Q16 > (opus_int32)1 << 16 ) {
+                            break;
+                        }
                     }
                 }
             }
diff --git a/opus/silk/PLC.h b/opus/silk/PLC.h
index 6438f516..1bebb786 100644
--- a/opus/silk/PLC.h
+++ b/opus/silk/PLC.h
@@ -49,6 +49,9 @@ void silk_PLC(
     silk_decoder_control                *psDecCtrl,         /* I/O Decoder control      */
     opus_int16                          frame[],            /* I/O  signal              */
     opus_int                            lost,               /* I Loss flag              */
+#ifdef ENABLE_DEEP_PLC
+    LPCNetPLCState                      *lpcnet,
+#endif
     int                                 arch                /* I Run-time architecture  */
 );
 
diff --git a/opus/silk/SigProc_FIX.h b/opus/silk/SigProc_FIX.h
index f9ae3263..fbdfa82e 100644
--- a/opus/silk/SigProc_FIX.h
+++ b/opus/silk/SigProc_FIX.h
@@ -381,7 +381,7 @@ opus_int32 silk_inner_prod_aligned_scale(
     const opus_int              len                 /*    I vector lengths                                              */
 );
 
-opus_int64 silk_inner_prod16_aligned_64_c(
+opus_int64 silk_inner_prod16_c(
     const opus_int16            *inVec1,            /*    I input vector 1                                              */
     const opus_int16            *inVec2,            /*    I input vector 2                                              */
     const opus_int              len                 /*    I vector lengths                                              */
@@ -609,12 +609,14 @@ static OPUS_INLINE opus_int64 silk_max_64(opus_int64 a, opus_int64 b)
 /* the following seems faster on x86 */
 #define silk_SMMUL(a32, b32)                (opus_int32)silk_RSHIFT64(silk_SMULL((a32), (b32)), 32)
 
-#if !defined(OPUS_X86_MAY_HAVE_SSE4_1)
+#if !defined(OVERRIDE_silk_burg_modified)
 #define silk_burg_modified(res_nrg, res_nrg_Q, A_Q16, x, minInvGain_Q30, subfr_length, nb_subfr, D, arch) \
     ((void)(arch), silk_burg_modified_c(res_nrg, res_nrg_Q, A_Q16, x, minInvGain_Q30, subfr_length, nb_subfr, D, arch))
+#endif
 
-#define silk_inner_prod16_aligned_64(inVec1, inVec2, len, arch) \
-    ((void)(arch),silk_inner_prod16_aligned_64_c(inVec1, inVec2, len))
+#if !defined(OVERRIDE_silk_inner_prod16)
+#define silk_inner_prod16(inVec1, inVec2, len, arch) \
+    ((void)(arch),silk_inner_prod16_c(inVec1, inVec2, len))
 #endif
 
 #include "Inlines.h"
diff --git a/opus/silk/VQ_WMat_EC.c b/opus/silk/VQ_WMat_EC.c
index 0f3d545c..245a7e4b 100644
--- a/opus/silk/VQ_WMat_EC.c
+++ b/opus/silk/VQ_WMat_EC.c
@@ -64,7 +64,7 @@ void silk_VQ_WMat_EC_c(
     *rate_dist_Q8 = silk_int32_MAX;
     *res_nrg_Q15 = silk_int32_MAX;
     cb_row_Q7 = cb_Q7;
-    /* In things go really bad, at least *ind is set to something safe. */
+    /* If things go really bad, at least *ind is set to something safe. */
     *ind = 0;
     for( k = 0; k < L; k++ ) {
         opus_int32 penalty;
@@ -115,7 +115,7 @@ void silk_VQ_WMat_EC_c(
         if( sum1_Q15 >= 0 ) {
             /* Translate residual energy to bits using high-rate assumption (6 dB ==> 1 bit/sample) */
             bits_res_Q8 = silk_SMULBB( subfr_len, silk_lin2log( sum1_Q15 + penalty) - (15 << 7) );
-            /* In the following line we reduce the codelength component by half ("-1"); seems to slghtly improve quality */
+            /* In the following line we reduce the codelength component by half ("-1"); seems to slightly improve quality */
             bits_tot_Q8 = silk_ADD_LSHIFT32( bits_res_Q8, cl_Q5[ k ], 3-1 );
             if( bits_tot_Q8 <= *rate_dist_Q8 ) {
                 *rate_dist_Q8 = bits_tot_Q8;
diff --git a/opus/silk/arm/LPC_inv_pred_gain_neon_intr.c b/opus/silk/arm/LPC_inv_pred_gain_neon_intr.c
index ab426bcd..726e6667 100644
--- a/opus/silk/arm/LPC_inv_pred_gain_neon_intr.c
+++ b/opus/silk/arm/LPC_inv_pred_gain_neon_intr.c
@@ -210,19 +210,23 @@ opus_int32 silk_LPC_inverse_pred_gain_neon(         /* O   Returns inverse predi
         /* Increase Q domain of the AR coefficients */
         t0_s16x8 = vld1q_s16( A_Q12 +  0 );
         t1_s16x8 = vld1q_s16( A_Q12 +  8 );
-        t2_s16x8 = vld1q_s16( A_Q12 + 16 );
+        if ( order > 16 ) {
+          t2_s16x8 = vld1q_s16( A_Q12 + 16 );
+        }
         t0_s32x4 = vpaddlq_s16( t0_s16x8 );
 
         switch( order - leftover )
         {
         case 24:
             t0_s32x4 = vpadalq_s16( t0_s32x4, t2_s16x8 );
+            vst1q_s32( Atmp_QA + 16, vshll_n_s16( vget_low_s16 ( t2_s16x8 ), QA - 12 ) );
+            vst1q_s32( Atmp_QA + 20, vshll_n_s16( vget_high_s16( t2_s16x8 ), QA - 12 ) );
             /* FALLTHROUGH */
 
         case 16:
             t0_s32x4 = vpadalq_s16( t0_s32x4, t1_s16x8 );
-            vst1q_s32( Atmp_QA + 16, vshll_n_s16( vget_low_s16 ( t2_s16x8 ), QA - 12 ) );
-            vst1q_s32( Atmp_QA + 20, vshll_n_s16( vget_high_s16( t2_s16x8 ), QA - 12 ) );
+            vst1q_s32( Atmp_QA +  8, vshll_n_s16( vget_low_s16 ( t1_s16x8 ), QA - 12 ) );
+            vst1q_s32( Atmp_QA + 12, vshll_n_s16( vget_high_s16( t1_s16x8 ), QA - 12 ) );
             /* FALLTHROUGH */
 
         case 8:
@@ -230,8 +234,8 @@ opus_int32 silk_LPC_inverse_pred_gain_neon(         /* O   Returns inverse predi
             const int32x2_t t_s32x2 = vpadd_s32( vget_low_s32( t0_s32x4 ), vget_high_s32( t0_s32x4 ) );
             const int64x1_t t_s64x1 = vpaddl_s32( t_s32x2 );
             DC_resp = vget_lane_s32( vreinterpret_s32_s64( t_s64x1 ), 0 );
-            vst1q_s32( Atmp_QA +  8, vshll_n_s16( vget_low_s16 ( t1_s16x8 ), QA - 12 ) );
-            vst1q_s32( Atmp_QA + 12, vshll_n_s16( vget_high_s16( t1_s16x8 ), QA - 12 ) );
+            vst1q_s32( Atmp_QA + 0, vshll_n_s16( vget_low_s16 ( t0_s16x8 ), QA - 12 ) );
+            vst1q_s32( Atmp_QA + 4, vshll_n_s16( vget_high_s16( t0_s16x8 ), QA - 12 ) );
         }
         break;
 
@@ -246,16 +250,22 @@ opus_int32 silk_LPC_inverse_pred_gain_neon(         /* O   Returns inverse predi
         case 6:
             DC_resp += (opus_int32)A_Q12[ 5 ];
             DC_resp += (opus_int32)A_Q12[ 4 ];
+            Atmp_QA[ order - leftover + 5 ] = silk_LSHIFT32( (opus_int32)A_Q12[ 5 ], QA - 12 );
+            Atmp_QA[ order - leftover + 4 ] = silk_LSHIFT32( (opus_int32)A_Q12[ 4 ], QA - 12 );
             /* FALLTHROUGH */
 
         case 4:
             DC_resp += (opus_int32)A_Q12[ 3 ];
             DC_resp += (opus_int32)A_Q12[ 2 ];
+            Atmp_QA[ order - leftover + 3 ] = silk_LSHIFT32( (opus_int32)A_Q12[ 3 ], QA - 12 );
+            Atmp_QA[ order - leftover + 2 ] = silk_LSHIFT32( (opus_int32)A_Q12[ 2 ], QA - 12 );
             /* FALLTHROUGH */
 
         case 2:
             DC_resp += (opus_int32)A_Q12[ 1 ];
             DC_resp += (opus_int32)A_Q12[ 0 ];
+            Atmp_QA[ order - leftover + 1 ] = silk_LSHIFT32( (opus_int32)A_Q12[ 1 ], QA - 12 );
+            Atmp_QA[ order - leftover + 0 ] = silk_LSHIFT32( (opus_int32)A_Q12[ 0 ], QA - 12 );
             /* FALLTHROUGH */
 
         default:
@@ -266,8 +276,6 @@ opus_int32 silk_LPC_inverse_pred_gain_neon(         /* O   Returns inverse predi
         if( DC_resp >= 4096 ) {
             invGain_Q30 = 0;
         } else {
-            vst1q_s32( Atmp_QA + 0, vshll_n_s16( vget_low_s16 ( t0_s16x8 ), QA - 12 ) );
-            vst1q_s32( Atmp_QA + 4, vshll_n_s16( vget_high_s16( t0_s16x8 ), QA - 12 ) );
             invGain_Q30 = LPC_inverse_pred_gain_QA_neon( Atmp_QA, order );
         }
     }
diff --git a/opus/silk/arm/NSQ_del_dec_arm.h b/opus/silk/arm/NSQ_del_dec_arm.h
index 9e76e169..0c4fcfcc 100644
--- a/opus/silk/arm/NSQ_del_dec_arm.h
+++ b/opus/silk/arm/NSQ_del_dec_arm.h
@@ -34,7 +34,7 @@ POSSIBILITY OF SUCH DAMAGE.
 void silk_NSQ_del_dec_neon(
     const silk_encoder_state *psEncC, silk_nsq_state *NSQ,
     SideInfoIndices *psIndices, const opus_int16 x16[], opus_int8 pulses[],
-    const opus_int16 PredCoef_Q12[2 * MAX_LPC_ORDER],
+    const opus_int16 *PredCoef_Q12,
     const opus_int16 LTPCoef_Q14[LTP_ORDER * MAX_NB_SUBFR],
     const opus_int16 AR_Q13[MAX_NB_SUBFR * MAX_SHAPE_LPC_ORDER],
     const opus_int HarmShapeGain_Q14[MAX_NB_SUBFR],
@@ -65,7 +65,7 @@ void silk_NSQ_del_dec_neon(
 extern void (*const SILK_NSQ_DEL_DEC_IMPL[OPUS_ARCHMASK + 1])(
     const silk_encoder_state *psEncC, silk_nsq_state *NSQ,
     SideInfoIndices *psIndices, const opus_int16 x16[], opus_int8 pulses[],
-    const opus_int16 PredCoef_Q12[2 * MAX_LPC_ORDER],
+    const opus_int16 *PredCoef_Q12,
     const opus_int16 LTPCoef_Q14[LTP_ORDER * MAX_NB_SUBFR],
     const opus_int16 AR_Q13[MAX_NB_SUBFR * MAX_SHAPE_LPC_ORDER],
     const opus_int HarmShapeGain_Q14[MAX_NB_SUBFR],
diff --git a/opus/silk/arm/NSQ_del_dec_neon_intr.c b/opus/silk/arm/NSQ_del_dec_neon_intr.c
index 212410f3..668dde6d 100644
--- a/opus/silk/arm/NSQ_del_dec_neon_intr.c
+++ b/opus/silk/arm/NSQ_del_dec_neon_intr.c
@@ -35,6 +35,7 @@ POSSIBILITY OF SUCH DAMAGE.
 #endif
 #include "main.h"
 #include "stack_alloc.h"
+#include "os_support.h"
 
 /* NEON intrinsics optimization now can only parallelize up to 4 delay decision states.    */
 /* If there are more states, C function is called, and this optimization must be expanded. */
@@ -220,7 +221,7 @@ void silk_NSQ_del_dec_neon(
     SideInfoIndices             *psIndices,                                 /* I/O  Quantization Indices            */
     const opus_int16            x16[],                                      /* I    Input                           */
     opus_int8                   pulses[],                                   /* O    Quantized pulse signal          */
-    const opus_int16            PredCoef_Q12[ 2 * MAX_LPC_ORDER ],          /* I    Short term prediction coefs     */
+    const opus_int16            *PredCoef_Q12,                              /* I    Short term prediction coefs     */
     const opus_int16            LTPCoef_Q14[ LTP_ORDER * MAX_NB_SUBFR ],    /* I    Long term prediction coefs      */
     const opus_int16            AR_Q13[ MAX_NB_SUBFR * MAX_SHAPE_LPC_ORDER ], /* I Noise shaping coefs              */
     const opus_int              HarmShapeGain_Q14[ MAX_NB_SUBFR ],          /* I    Long term shaping coefs         */
@@ -279,6 +280,7 @@ void silk_NSQ_del_dec_neon(
 
         /* Initialize delayed decision states */
         ALLOC( psDelDec, 1, NSQ_del_decs_struct );
+        OPUS_CLEAR(psDelDec, 1);
         /* Only RandState and RD_Q10 need to be initialized to 0. */
         silk_memset( psDelDec->RandState, 0, sizeof( psDelDec->RandState ) );
         vst1q_s32( psDelDec->RD_Q10, vdupq_n_s32( 0 ) );
@@ -587,6 +589,7 @@ static OPUS_INLINE void silk_noise_shape_quantizer_del_dec_neon(
     silk_assert( nStatesDelayedDecision > 0 );
     silk_assert( ( shapingLPCOrder & 1 ) == 0 );   /* check that order is even */
     ALLOC( psSampleState, 2, NSQ_samples_struct );
+    OPUS_CLEAR(psSampleState, 2);
 
     shp_lag_ptr  = &NSQ->sLTP_shp_Q14[ NSQ->sLTP_shp_buf_idx - lag + HARM_SHAPE_FIR_TAPS / 2 ];
     pred_lag_ptr = &sLTP_Q15[ NSQ->sLTP_buf_idx - lag + LTP_ORDER / 2 ];
@@ -711,23 +714,26 @@ static OPUS_INLINE void silk_noise_shape_quantizer_del_dec_neon(
                 const int rdo_offset = Lambda_Q10/2 - 512;
                 const uint16x4_t greaterThanRdo = vcgt_s16( q1_Q10_s16x4, vdup_n_s16( rdo_offset ) );
                 const uint16x4_t lessThanMinusRdo = vclt_s16( q1_Q10_s16x4, vdup_n_s16( -rdo_offset ) );
+                int16x4_t signed_offset = vbsl_s16( greaterThanRdo, vdup_n_s16( -rdo_offset ), vdup_n_s16( 0 ) );
+                signed_offset = vbsl_s16( lessThanMinusRdo, vdup_n_s16( rdo_offset ), signed_offset );
                 /* If Lambda_Q10 > 32767, then q1_Q0, q1_Q10 and q2_Q10 must change to 32-bit. */
                 silk_assert( Lambda_Q10 <= 32767 );
 
                 q1_Q0_s16x4 = vreinterpret_s16_u16( vclt_s16( q1_Q10_s16x4, vdup_n_s16( 0 ) ) );
-                q1_Q0_s16x4 = vbsl_s16( greaterThanRdo, vsub_s16( q1_Q10_s16x4, vdup_n_s16( rdo_offset ) ), q1_Q0_s16x4 );
-                q1_Q0_s16x4 = vbsl_s16( lessThanMinusRdo, vadd_s16( q1_Q10_s16x4, vdup_n_s16( rdo_offset ) ), q1_Q0_s16x4 );
+                q1_Q0_s16x4 = vbsl_s16(vorr_u16(greaterThanRdo, lessThanMinusRdo), vadd_s16( q1_Q10_s16x4 , signed_offset), q1_Q0_s16x4);
                 q1_Q0_s16x4 = vshr_n_s16( q1_Q0_s16x4, 10 );
             }
             {
                 const uint16x4_t equal0_u16x4 = vceq_s16( q1_Q0_s16x4, vdup_n_s16( 0 ) );
                 const uint16x4_t equalMinus1_u16x4 = vceq_s16( q1_Q0_s16x4, vdup_n_s16( -1 ) );
                 const uint16x4_t lessThanMinus1_u16x4 = vclt_s16( q1_Q0_s16x4, vdup_n_s16( -1 ) );
-                int16x4_t tmp1_s16x4, tmp2_s16x4;
+                int16x4_t tmp1_s16x4, tmp2_s16x4, tmp_summand_s16x4;
 
                 q1_Q10_s16x4 = vshl_n_s16( q1_Q0_s16x4, 10 );
-                tmp1_s16x4 = vadd_s16( q1_Q10_s16x4, vdup_n_s16( offset_Q10 - QUANT_LEVEL_ADJUST_Q10 ) );
-                q1_Q10_s16x4 = vadd_s16( q1_Q10_s16x4, vdup_n_s16( offset_Q10 + QUANT_LEVEL_ADJUST_Q10 ) );
+                tmp_summand_s16x4 = vand_s16( vreinterpret_s16_u16(vcge_s16(q1_Q0_s16x4, vdup_n_s16(0))), vdup_n_s16( offset_Q10 - QUANT_LEVEL_ADJUST_Q10 ) );
+                tmp1_s16x4 = vadd_s16( q1_Q10_s16x4, tmp_summand_s16x4 );
+                tmp_summand_s16x4 = vbsl_s16( lessThanMinus1_u16x4, vdup_n_s16( offset_Q10 + QUANT_LEVEL_ADJUST_Q10 ), vdup_n_s16(0) );
+                q1_Q10_s16x4 = vadd_s16( q1_Q10_s16x4,  tmp_summand_s16x4);
                 q1_Q10_s16x4 = vbsl_s16( lessThanMinus1_u16x4, q1_Q10_s16x4, tmp1_s16x4 );
                 q1_Q10_s16x4 = vbsl_s16( equal0_u16x4, vdup_n_s16( offset_Q10 ), q1_Q10_s16x4 );
                 q1_Q10_s16x4 = vbsl_s16( equalMinus1_u16x4, vdup_n_s16( offset_Q10 - ( 1024 - QUANT_LEVEL_ADJUST_Q10 ) ), q1_Q10_s16x4 );
@@ -818,6 +824,13 @@ static OPUS_INLINE void silk_noise_shape_quantizer_del_dec_neon(
             }
         }
 
+        /* clear unused part of RD_Q10 to avoid overflows */
+        if( nStatesDelayedDecision < NEON_MAX_DEL_DEC_STATES )
+        {
+            OPUS_CLEAR(psSampleState[0].RD_Q10 + nStatesDelayedDecision, NEON_MAX_DEL_DEC_STATES - nStatesDelayedDecision);
+            OPUS_CLEAR(psSampleState[1].RD_Q10 + nStatesDelayedDecision, NEON_MAX_DEL_DEC_STATES - nStatesDelayedDecision);
+        }
+
         /* Increase RD values of expired states */
         {
             uint32x4_t t_u32x4;
@@ -896,7 +909,8 @@ static OPUS_INLINE void silk_noise_shape_quantizer_del_dec_neon(
         vst1q_s32( psDelDec->Pred_Q15[ *smpl_buf_idx ], vshlq_n_s32( vld1q_s32( psSampleState[ 0 ].LPC_exc_Q14 ), 1 ) );
         vst1q_s32( psDelDec->Shape_Q14[ *smpl_buf_idx ], vld1q_s32( psSampleState[ 0 ].sLTP_shp_Q14 ) );
         tmp1_s32x4 = vrshrq_n_s32( tmp1_s32x4, 10 );
-        tmp1_s32x4 = vaddq_s32( vld1q_s32( psDelDec->Seed ), tmp1_s32x4 );
+        tmp1_s32x4 = vreinterpretq_s32_u32( vaddq_u32( vreinterpretq_u32_s32(
+            vld1q_s32( psDelDec->Seed ) ), vreinterpretq_u32_s32( tmp1_s32x4 ) ) );
         vst1q_s32( psDelDec->Seed, tmp1_s32x4 );
         vst1q_s32( psDelDec->RandState[ *smpl_buf_idx ], tmp1_s32x4 );
         vst1q_s32( psDelDec->RD_Q10, vld1q_s32( psSampleState[ 0 ].RD_Q10 ) );
diff --git a/opus/silk/arm/NSQ_neon.h b/opus/silk/arm/NSQ_neon.h
index b31d9442..f03d8ddd 100644
--- a/opus/silk/arm/NSQ_neon.h
+++ b/opus/silk/arm/NSQ_neon.h
@@ -73,7 +73,7 @@ static OPUS_INLINE void silk_short_prediction_create_arch_coef_neon(opus_int32 *
 #elif defined(OPUS_HAVE_RTCD) && defined(OPUS_ARM_MAY_HAVE_NEON_INTR)
 
 #define silk_short_prediction_create_arch_coef(out, in, order) \
-    do { if (arch == OPUS_ARCH_ARM_NEON) { silk_short_prediction_create_arch_coef_neon(out, in, order); } } while (0)
+    do { if (arch >= OPUS_ARCH_ARM_NEON) { silk_short_prediction_create_arch_coef_neon(out, in, order); } } while (0)
 
 #endif
 
@@ -95,7 +95,7 @@ opus_int32 silk_NSQ_noise_shape_feedback_loop_neon(const opus_int32 *data0, opus
    (coef vs. coefRev) so can't use the usual IMPL table implementation */
 #undef silk_noise_shape_quantizer_short_prediction
 #define silk_noise_shape_quantizer_short_prediction(in, coef, coefRev, order, arch)  \
-    (arch == OPUS_ARCH_ARM_NEON ? \
+    (arch >= OPUS_ARCH_ARM_NEON ? \
         silk_noise_shape_quantizer_short_prediction_neon(in, coefRev, order) : \
         silk_noise_shape_quantizer_short_prediction_c(in, coef, order))
 
diff --git a/opus/silk/arm/arm_silk_map.c b/opus/silk/arm/arm_silk_map.c
index 0b9bfec2..a91f79b5 100644
--- a/opus/silk/arm/arm_silk_map.c
+++ b/opus/silk/arm/arm_silk_map.c
@@ -49,6 +49,7 @@ void (*const SILK_BIQUAD_ALT_STRIDE2_IMPL[OPUS_ARCHMASK + 1])(
       silk_biquad_alt_stride2_c,    /* EDSP */
       silk_biquad_alt_stride2_c,    /* Media */
       silk_biquad_alt_stride2_neon, /* Neon */
+      silk_biquad_alt_stride2_neon, /* dotprod */
 };
 
 opus_int32 (*const SILK_LPC_INVERSE_PRED_GAIN_IMPL[OPUS_ARCHMASK + 1])( /* O   Returns inverse prediction gain in energy domain, Q30        */
@@ -59,6 +60,7 @@ opus_int32 (*const SILK_LPC_INVERSE_PRED_GAIN_IMPL[OPUS_ARCHMASK + 1])( /* O   R
       silk_LPC_inverse_pred_gain_c,    /* EDSP */
       silk_LPC_inverse_pred_gain_c,    /* Media */
       silk_LPC_inverse_pred_gain_neon, /* Neon */
+      silk_LPC_inverse_pred_gain_neon, /* dotprod */
 };
 
 void  (*const SILK_NSQ_DEL_DEC_IMPL[OPUS_ARCHMASK + 1])(
@@ -67,7 +69,7 @@ void  (*const SILK_NSQ_DEL_DEC_IMPL[OPUS_ARCHMASK + 1])(
         SideInfoIndices             *psIndices,                                 /* I/O  Quantization Indices            */
         const opus_int16            x16[],                                      /* I    Input                           */
         opus_int8                   pulses[],                                   /* O    Quantized pulse signal          */
-        const opus_int16            PredCoef_Q12[ 2 * MAX_LPC_ORDER ],          /* I    Short term prediction coefs     */
+        const opus_int16            *PredCoef_Q12,                              /* I    Short term prediction coefs     */
         const opus_int16            LTPCoef_Q14[ LTP_ORDER * MAX_NB_SUBFR ],    /* I    Long term prediction coefs      */
         const opus_int16            AR_Q13[ MAX_NB_SUBFR * MAX_SHAPE_LPC_ORDER ], /* I Noise shaping coefs              */
         const opus_int              HarmShapeGain_Q14[ MAX_NB_SUBFR ],          /* I    Long term shaping coefs         */
@@ -82,6 +84,7 @@ void  (*const SILK_NSQ_DEL_DEC_IMPL[OPUS_ARCHMASK + 1])(
       silk_NSQ_del_dec_c,    /* EDSP */
       silk_NSQ_del_dec_c,    /* Media */
       silk_NSQ_del_dec_neon, /* Neon */
+      silk_NSQ_del_dec_neon, /* dotprod */
 };
 
 /*There is no table for silk_noise_shape_quantizer_short_prediction because the
@@ -97,6 +100,7 @@ opus_int32
   silk_NSQ_noise_shape_feedback_loop_c,    /* EDSP */
   silk_NSQ_noise_shape_feedback_loop_c,    /* Media */
   silk_NSQ_noise_shape_feedback_loop_neon, /* NEON */
+  silk_NSQ_noise_shape_feedback_loop_neon, /* dotprod */
 };
 
 # endif
@@ -116,6 +120,7 @@ void (*const SILK_WARPED_AUTOCORRELATION_FIX_IMPL[OPUS_ARCHMASK + 1])(
       silk_warped_autocorrelation_FIX_c,    /* EDSP */
       silk_warped_autocorrelation_FIX_c,    /* Media */
       silk_warped_autocorrelation_FIX_neon, /* Neon */
+      silk_warped_autocorrelation_FIX_neon, /* dotprod */
 };
 
 # endif
diff --git a/opus/silk/bwexpander_32.c b/opus/silk/bwexpander_32.c
index d0010f73..0f32b9df 100644
--- a/opus/silk/bwexpander_32.c
+++ b/opus/silk/bwexpander_32.c
@@ -31,7 +31,8 @@ POSSIBILITY OF SUCH DAMAGE.
 
 #include "SigProc_FIX.h"
 
-/* Chirp (bandwidth expand) LP AR filter */
+/* Chirp (bandwidth expand) LP AR filter.
+   This logic is reused in _celt_lpc(). Any bug fixes should also be applied there. */
 void silk_bwexpander_32(
     opus_int32                  *ar,                /* I/O  AR filter to be expanded (without leading 1)                */
     const opus_int              d,                  /* I    Length of ar                                                */
diff --git a/opus/silk/control.h b/opus/silk/control.h
index b76ec33c..f5633e62 100644
--- a/opus/silk/control.h
+++ b/opus/silk/control.h
@@ -77,6 +77,9 @@ typedef struct {
     /* I:   Flag to enable in-band Forward Error Correction (FEC); 0/1                      */
     opus_int useInBandFEC;
 
+    /* I:   Flag to enable in-band Deep REDundancy (DRED); 0/1                              */
+    opus_int useDRED;
+
     /* I:   Flag to actually code in-band Forward Error Correction (FEC) in the current packet; 0/1 */
     opus_int LBRR_coded;
 
@@ -141,6 +144,14 @@ typedef struct {
 
     /* O:   Pitch lag of previous frame (0 if unvoiced), measured in samples at 48 kHz      */
     opus_int prevPitchLag;
+
+    /* I:   Enable Deep PLC                                                                 */
+    opus_int enable_deep_plc;
+
+#ifdef ENABLE_OSCE
+    /* I: OSCE method */
+    opus_int osce_method;
+#endif
 } silk_DecControlStruct;
 
 #ifdef __cplusplus
diff --git a/opus/silk/control_codec.c b/opus/silk/control_codec.c
index 52aa8fde..784ffe66 100644
--- a/opus/silk/control_codec.c
+++ b/opus/silk/control_codec.c
@@ -415,7 +415,7 @@ static OPUS_INLINE opus_int silk_setup_LBRR(
             /* Previous packet did not have LBRR, and was therefore coded at a higher bitrate */
             psEncC->LBRR_GainIncreases = 7;
         } else {
-            psEncC->LBRR_GainIncreases = silk_max_int( 7 - silk_SMULWB( (opus_int32)psEncC->PacketLoss_perc, SILK_FIX_CONST( 0.4, 16 ) ), 2 );
+            psEncC->LBRR_GainIncreases = silk_max_int( 7 - silk_SMULWB( (opus_int32)psEncC->PacketLoss_perc, SILK_FIX_CONST( 0.2, 16 ) ), 3 );
         }
     }
 
diff --git a/opus/silk/debug.c b/opus/silk/debug.c
index 9253faf7..46a24a47 100644
--- a/opus/silk/debug.c
+++ b/opus/silk/debug.c
@@ -29,19 +29,23 @@ POSSIBILITY OF SUCH DAMAGE.
 #include "config.h"
 #endif
 
+typedef int prevent_empty_translation_unit_warning;
+
 #include "debug.h"
+
+#if SILK_DEBUG || SILK_TIC_TOC
 #include "SigProc_FIX.h"
+#endif
 
 #if SILK_TIC_TOC
 
-#ifdef _WIN32
-
 #if (defined(_WIN32) || defined(_WINCE))
 #include <windows.h>    /* timer */
 #else   /* Linux or Mac*/
 #include <sys/time.h>
 #endif
 
+#ifdef _WIN32
 unsigned long silk_GetHighResolutionTime(void) /* O  time in usec*/
 {
     /* Returns a time counter in microsec   */
@@ -65,7 +69,7 @@ unsigned long GetHighResolutionTime(void) /* O  time in usec*/
 int           silk_Timer_nTimers = 0;
 int           silk_Timer_depth_ctr = 0;
 char          silk_Timer_tags[silk_NUM_TIMERS_MAX][silk_NUM_TIMERS_MAX_TAG_LEN];
-#ifdef WIN32
+#ifdef _WIN32
 LARGE_INTEGER silk_Timer_start[silk_NUM_TIMERS_MAX];
 #else
 unsigned long silk_Timer_start[silk_NUM_TIMERS_MAX];
@@ -76,7 +80,7 @@ opus_int64     silk_Timer_sum[silk_NUM_TIMERS_MAX];
 opus_int64     silk_Timer_max[silk_NUM_TIMERS_MAX];
 opus_int64     silk_Timer_depth[silk_NUM_TIMERS_MAX];
 
-#ifdef WIN32
+#ifdef _WIN32
 void silk_TimerSave(char *file_name)
 {
     if( silk_Timer_nTimers > 0 )
diff --git a/opus/silk/debug.h b/opus/silk/debug.h
index 6f68c1ca..36163e47 100644
--- a/opus/silk/debug.h
+++ b/opus/silk/debug.h
@@ -28,28 +28,29 @@ POSSIBILITY OF SUCH DAMAGE.
 #ifndef SILK_DEBUG_H
 #define SILK_DEBUG_H
 
-#include "typedef.h"
-#include <stdio.h>      /* file writing */
-#include <string.h>     /* strcpy, strcmp */
-
-#ifdef  __cplusplus
-extern "C"
-{
-#endif
-
-unsigned long GetHighResolutionTime(void); /* O  time in usec*/
-
 /* Set to 1 to enable DEBUG_STORE_DATA() macros for dumping
  * intermediate signals from the codec.
  */
 #define SILK_DEBUG 0
 
 /* Flag for using timers */
-#define SILK_TIC_TOC    0
+#define SILK_TIC_TOC 0
 
+#if SILK_DEBUG || SILK_TIC_TOC
+#include "typedef.h"
+#include <string.h>     /* strcpy, strcmp */
+#include <stdio.h>      /* file writing */
+#endif
+
+#ifdef  __cplusplus
+extern "C"
+{
+#endif
 
 #if SILK_TIC_TOC
 
+unsigned long GetHighResolutionTime(void); /* O  time in usec*/
+
 #if (defined(_WIN32) || defined(_WINCE))
 #include <windows.h>    /* timer */
 #else   /* Linux or Mac*/
diff --git a/opus/silk/dec_API.c b/opus/silk/dec_API.c
index 7d5ca7fb..c1091d13 100644
--- a/opus/silk/dec_API.c
+++ b/opus/silk/dec_API.c
@@ -33,6 +33,11 @@ POSSIBILITY OF SUCH DAMAGE.
 #include "stack_alloc.h"
 #include "os_support.h"
 
+#ifdef ENABLE_OSCE
+#include "osce.h"
+#include "osce_structs.h"
+#endif
+
 /************************/
 /* Decoder Super Struct */
 /************************/
@@ -42,12 +47,33 @@ typedef struct {
     opus_int                         nChannelsAPI;
     opus_int                         nChannelsInternal;
     opus_int                         prev_decode_only_middle;
+#ifdef ENABLE_OSCE
+    OSCEModel                        osce_model;
+#endif
 } silk_decoder;
 
 /*********************/
 /* Decoder functions */
 /*********************/
 
+
+/* Load OSCE model data into the decoder state; without ENABLE_OSCE the arguments are ignored and SILK_NO_ERROR is returned. */
+opus_int silk_LoadOSCEModels(void *decState, const unsigned char *data, int len)
+{
+#ifdef ENABLE_OSCE
+    opus_int ret = SILK_NO_ERROR;
+
+    ret = osce_load_models(&((silk_decoder *)decState)->osce_model, data, len);
+    ((silk_decoder *)decState)->osce_model.loaded = (ret == 0); /* flag the model as loaded only when osce_load_models() returned 0 */
+    return ret;
+#else
+    (void) decState; /* casts silence unused-parameter warnings in this configuration */
+    (void) data;
+    (void) len;
+    return SILK_NO_ERROR;
+#endif
+}
+
 opus_int silk_Get_Decoder_Size(                         /* O    Returns error code                              */
     opus_int                        *decSizeBytes       /* O    Number of bytes in SILK decoder state           */
 )
@@ -60,12 +86,37 @@ opus_int silk_Get_Decoder_Size(                         /* O    Returns error co
 }
 
 /* Reset decoder state */
+opus_int silk_ResetDecoder(                              /* O    Returns error code                              */
+    void                            *decState           /* I/O  State                                           */
+)
+{
+    opus_int n, ret = SILK_NO_ERROR;
+    silk_decoder_state *channel_state = ((silk_decoder *)decState)->channel_state;
+
+    for( n = 0; n < DECODER_NUM_CHANNELS; n++ ) {
+        ret  = silk_reset_decoder( &channel_state[ n ] );
+    }
+    silk_memset(&((silk_decoder *)decState)->sStereo, 0, sizeof(((silk_decoder *)decState)->sStereo));
+    /* Not strictly needed, but it's cleaner that way */
+    ((silk_decoder *)decState)->prev_decode_only_middle = 0;
+
+    return ret;
+}
+
+
 opus_int silk_InitDecoder(                              /* O    Returns error code                              */
     void                            *decState           /* I/O  State                                           */
 )
 {
     opus_int n, ret = SILK_NO_ERROR;
     silk_decoder_state *channel_state = ((silk_decoder *)decState)->channel_state;
+#ifdef ENABLE_OSCE
+    ((silk_decoder *)decState)->osce_model.loaded = 0;
+#endif
+#ifndef USE_WEIGHTS_FILE
+    /* load osce models */
+    silk_LoadOSCEModels(decState, NULL, 0);
+#endif
 
     for( n = 0; n < DECODER_NUM_CHANNELS; n++ ) {
         ret  = silk_init_decoder( &channel_state[ n ] );
@@ -86,6 +137,9 @@ opus_int silk_Decode(                                   /* O    Returns error co
     ec_dec                          *psRangeDec,        /* I/O  Compressor data structure                       */
     opus_int16                      *samplesOut,        /* O    Decoded output speech vector                    */
     opus_int32                      *nSamplesOut,       /* O    Number of samples decoded                       */
+#ifdef ENABLE_DEEP_PLC
+    LPCNetPLCState                  *lpcnet,
+#endif
     int                             arch                /* I    Run-time architecture                           */
 )
 {
@@ -278,6 +332,7 @@ opus_int silk_Decode(                                   /* O    Returns error co
         has_side = !psDec->prev_decode_only_middle
               || (decControl->nChannelsInternal == 2 && lostFlag == FLAG_DECODE_LBRR && channel_state[1].LBRR_flags[ channel_state[1].nFramesDecoded ] == 1 );
     }
+    channel_state[ 0 ].sPLC.enable_deep_plc = decControl->enable_deep_plc;
     /* Call decoder for one frame */
     for( n = 0; n < decControl->nChannelsInternal; n++ ) {
         if( n == 0 || has_side ) {
@@ -297,7 +352,19 @@ opus_int silk_Decode(                                   /* O    Returns error co
             } else {
                 condCoding = CODE_CONDITIONALLY;
             }
-            ret += silk_decode_frame( &channel_state[ n ], psRangeDec, &samplesOut1_tmp[ n ][ 2 ], &nSamplesOutDec, lostFlag, condCoding, arch);
+#ifdef ENABLE_OSCE
+            if ( channel_state[n].osce.method != decControl->osce_method ) {
+                osce_reset( &channel_state[n].osce, decControl->osce_method );
+            }
+#endif
+            ret += silk_decode_frame( &channel_state[ n ], psRangeDec, &samplesOut1_tmp[ n ][ 2 ], &nSamplesOutDec, lostFlag, condCoding,
+#ifdef ENABLE_DEEP_PLC
+                n == 0 ? lpcnet : NULL,
+#endif
+#ifdef ENABLE_OSCE
+                &psDec->osce_model,
+#endif
+                arch);
         } else {
             silk_memset( &samplesOut1_tmp[ n ][ 2 ], 0, nSamplesOutDec * sizeof( opus_int16 ) );
         }
diff --git a/opus/silk/decode_frame.c b/opus/silk/decode_frame.c
index e73825b2..9bc4ca2b 100644
--- a/opus/silk/decode_frame.c
+++ b/opus/silk/decode_frame.c
@@ -33,6 +33,10 @@ POSSIBILITY OF SUCH DAMAGE.
 #include "stack_alloc.h"
 #include "PLC.h"
 
+#ifdef ENABLE_OSCE
+#include "osce.h"
+#endif
+
 /****************/
 /* Decode frame */
 /****************/
@@ -43,6 +47,12 @@ opus_int silk_decode_frame(
     opus_int32                  *pN,                            /* O    Pointer to size of output frame             */
     opus_int                    lostFlag,                       /* I    0: no loss, 1 loss, 2 decode fec            */
     opus_int                    condCoding,                     /* I    The type of conditional coding to use       */
+#ifdef ENABLE_DEEP_PLC
+    LPCNetPLCState              *lpcnet,
+#endif
+#ifdef ENABLE_OSCE
+    OSCEModel                   *osce_model,
+#endif
     int                         arch                            /* I    Run-time architecture                       */
 )
 {
@@ -61,6 +71,10 @@ opus_int silk_decode_frame(
         ( lostFlag == FLAG_DECODE_LBRR && psDec->LBRR_flags[ psDec->nFramesDecoded ] == 1 ) )
     {
         VARDECL( opus_int16, pulses );
+#ifdef ENABLE_OSCE
+        opus_int32  ec_start;
+        ec_start = ec_tell(psRangeDec);
+#endif
         ALLOC( pulses, (L + SHELL_CODEC_FRAME_LENGTH - 1) &
                        ~(SHELL_CODEC_FRAME_LENGTH - 1), opus_int16 );
         /*********************************************/
@@ -84,10 +98,29 @@ opus_int silk_decode_frame(
         /********************************************************/
         silk_decode_core( psDec, psDecCtrl, pOut, pulses, arch );
 
+        /*************************/
+        /* Update output buffer. */
+        /*************************/
+        celt_assert( psDec->ltp_mem_length >= psDec->frame_length );
+        mv_len = psDec->ltp_mem_length - psDec->frame_length;
+        silk_memmove( psDec->outBuf, &psDec->outBuf[ psDec->frame_length ], mv_len * sizeof(opus_int16) );
+        silk_memcpy( &psDec->outBuf[ mv_len ], pOut, psDec->frame_length * sizeof( opus_int16 ) );
+
+#ifdef ENABLE_OSCE
+        /********************************************************/
+        /* Run SILK enhancer                                    */
+        /********************************************************/
+        osce_enhance_frame( osce_model, psDec, psDecCtrl, pOut, ec_tell(psRangeDec) - ec_start, arch );
+#endif
+
         /********************************************************/
         /* Update PLC state                                     */
         /********************************************************/
-        silk_PLC( psDec, psDecCtrl, pOut, 0, arch );
+        silk_PLC( psDec, psDecCtrl, pOut, 0,
+#ifdef ENABLE_DEEP_PLC
+            lpcnet,
+#endif
+            arch );
 
         psDec->lossCnt = 0;
         psDec->prevSignalType = psDec->indices.signalType;
@@ -97,17 +130,23 @@ opus_int silk_decode_frame(
         psDec->first_frame_after_reset = 0;
     } else {
         /* Handle packet loss by extrapolation */
-        psDec->indices.signalType = psDec->prevSignalType;
-        silk_PLC( psDec, psDecCtrl, pOut, 1, arch );
-    }
+        silk_PLC( psDec, psDecCtrl, pOut, 1,
+#ifdef ENABLE_DEEP_PLC
+            lpcnet,
+#endif
+            arch );
 
-    /*************************/
-    /* Update output buffer. */
-    /*************************/
-    celt_assert( psDec->ltp_mem_length >= psDec->frame_length );
-    mv_len = psDec->ltp_mem_length - psDec->frame_length;
-    silk_memmove( psDec->outBuf, &psDec->outBuf[ psDec->frame_length ], mv_len * sizeof(opus_int16) );
-    silk_memcpy( &psDec->outBuf[ mv_len ], pOut, psDec->frame_length * sizeof( opus_int16 ) );
+#ifdef ENABLE_OSCE
+        osce_reset( &psDec->osce, psDec->osce.method );
+#endif
+        /*************************/
+        /* Update output buffer. */
+        /*************************/
+        celt_assert( psDec->ltp_mem_length >= psDec->frame_length );
+        mv_len = psDec->ltp_mem_length - psDec->frame_length;
+        silk_memmove( psDec->outBuf, &psDec->outBuf[ psDec->frame_length ], mv_len * sizeof(opus_int16) );
+        silk_memcpy( &psDec->outBuf[ mv_len ], pOut, psDec->frame_length * sizeof( opus_int16 ) );
+    }
 
     /************************************************/
     /* Comfort noise generation / estimation        */
diff --git a/opus/silk/define.h b/opus/silk/define.h
index 247cb0bf..491c86f3 100644
--- a/opus/silk/define.h
+++ b/opus/silk/define.h
@@ -225,6 +225,7 @@ extern "C"
 /* Defines for CN generation */
 #define CNG_BUF_MASK_MAX                        255     /* 2^floor(log2(MAX_FRAME_LENGTH))-1    */
 #define CNG_GAIN_SMTH_Q16                       4634    /* 0.25^(1/4)                           */
+#define CNG_GAIN_SMTH_THRESHOLD_Q16             46396   /* -3 dB                                */
 #define CNG_NLSF_SMTH_Q16                       16348   /* 0.25                                 */
 
 #ifdef __cplusplus
diff --git a/opus/silk/enc_API.c b/opus/silk/enc_API.c
index 55a33f37..369caddd 100644
--- a/opus/silk/enc_API.c
+++ b/opus/silk/enc_API.c
@@ -41,6 +41,10 @@ POSSIBILITY OF SUCH DAMAGE.
 #include "main_FLP.h"
 #endif
 
+#ifdef ENABLE_DRED
+#include "dred_encoder.h"
+#endif
+
 /***************************************/
 /* Read control structure from encoder */
 /***************************************/
@@ -270,6 +274,7 @@ opus_int silk_Encode(                                   /* O    Returns error co
                        psEnc->state_Fxx[ 0 ].sCmn.fs_kHz * 1000 );
     ALLOC( buf, nSamplesFromInputMax, opus_int16 );
     while( 1 ) {
+        int curr_nBitsUsedLBRR = 0;
         nSamplesToBuffer  = psEnc->state_Fxx[ 0 ].sCmn.frame_length - psEnc->state_Fxx[ 0 ].sCmn.inputBufIx;
         nSamplesToBuffer  = silk_min( nSamplesToBuffer, nSamplesToBufferMax );
         nSamplesFromInput = silk_DIV32_16( nSamplesToBuffer * psEnc->state_Fxx[ 0 ].sCmn.API_fs_Hz, psEnc->state_Fxx[ 0 ].sCmn.fs_kHz * 1000 );
@@ -342,6 +347,7 @@ opus_int silk_Encode(                                   /* O    Returns error co
                 opus_uint8 iCDF[ 2 ] = { 0, 0 };
                 iCDF[ 0 ] = 256 - silk_RSHIFT( 256, ( psEnc->state_Fxx[ 0 ].sCmn.nFramesPerPacket + 1 ) * encControl->nChannelsInternal );
                 ec_enc_icdf( psRangeEnc, 0, iCDF, 8 );
+                curr_nBitsUsedLBRR = ec_tell( psRangeEnc );
 
                 /* Encode any LBRR data from previous packet */
                 /* Encode LBRR flags */
@@ -386,8 +392,7 @@ opus_int silk_Encode(                                   /* O    Returns error co
                 for( n = 0; n < encControl->nChannelsInternal; n++ ) {
                     silk_memset( psEnc->state_Fxx[ n ].sCmn.LBRR_flags, 0, sizeof( psEnc->state_Fxx[ n ].sCmn.LBRR_flags ) );
                 }
-
-                psEnc->nBitsUsedLBRR = ec_tell( psRangeEnc );
+                curr_nBitsUsedLBRR = ec_tell( psRangeEnc ) - curr_nBitsUsedLBRR;
             }
 
             silk_HP_variable_cutoff( psEnc->state_Fxx );
@@ -396,6 +401,16 @@ opus_int silk_Encode(                                   /* O    Returns error co
             nBits = silk_DIV32_16( silk_MUL( encControl->bitRate, encControl->payloadSize_ms ), 1000 );
             /* Subtract bits used for LBRR */
             if( !prefillFlag ) {
+                /* psEnc->nBitsUsedLBRR is an exponential moving average of the LBRR usage,
+                   except that for the first LBRR frame it does no averaging and for the first
+                   frame after LBRR, it goes back to zero immediately. */
+                if ( curr_nBitsUsedLBRR < 10 ) {
+                    psEnc->nBitsUsedLBRR = 0;
+                } else if ( psEnc->nBitsUsedLBRR < 10) {
+                    psEnc->nBitsUsedLBRR = curr_nBitsUsedLBRR;
+                } else {
+                    psEnc->nBitsUsedLBRR = ( psEnc->nBitsUsedLBRR + curr_nBitsUsedLBRR ) / 2;
+                }
                 nBits -= psEnc->nBitsUsedLBRR;
             }
             /* Divide by number of uncoded frames left in packet */
diff --git a/opus/silk/fixed/LTP_scale_ctrl_FIX.c b/opus/silk/fixed/LTP_scale_ctrl_FIX.c
index 3dcedef8..db1016e0 100644
--- a/opus/silk/fixed/LTP_scale_ctrl_FIX.c
+++ b/opus/silk/fixed/LTP_scale_ctrl_FIX.c
@@ -42,9 +42,14 @@ void silk_LTP_scale_ctrl_FIX(
 
     if( condCoding == CODE_INDEPENDENTLY ) {
         /* Only scale if first frame in packet */
-        round_loss = psEnc->sCmn.PacketLoss_perc + psEnc->sCmn.nFramesPerPacket;
-        psEnc->sCmn.indices.LTP_scaleIndex = (opus_int8)silk_LIMIT(
-            silk_SMULWB( silk_SMULBB( round_loss, psEncCtrl->LTPredCodGain_Q7 ), SILK_FIX_CONST( 0.1, 9 ) ), 0, 2 );
+        round_loss = psEnc->sCmn.PacketLoss_perc * psEnc->sCmn.nFramesPerPacket;
+        if ( psEnc->sCmn.LBRR_flag ) {
+            /* LBRR reduces the effective loss. In practice, it does not square the loss because
+               losses aren't independent, but that still seems to work best. We also never go below 2%. */
+            round_loss = 2 + silk_SMULBB( round_loss, round_loss ) / 100;
+        }
+        psEnc->sCmn.indices.LTP_scaleIndex = silk_SMULBB( psEncCtrl->LTPredCodGain_Q7, round_loss ) > silk_log2lin( 128*7 + 2900-psEnc->sCmn.SNR_dB_Q7 );
+        psEnc->sCmn.indices.LTP_scaleIndex += silk_SMULBB( psEncCtrl->LTPredCodGain_Q7, round_loss ) > silk_log2lin( 128*7 + 3900-psEnc->sCmn.SNR_dB_Q7 );
     } else {
         /* Default is minimum scaling */
         psEnc->sCmn.indices.LTP_scaleIndex = 0;
diff --git a/opus/silk/fixed/arm/warped_autocorrelation_FIX_neon_intr.c b/opus/silk/fixed/arm/warped_autocorrelation_FIX_neon_intr.c
index 00a70cb5..6f3be025 100644
--- a/opus/silk/fixed/arm/warped_autocorrelation_FIX_neon_intr.c
+++ b/opus/silk/fixed/arm/warped_autocorrelation_FIX_neon_intr.c
@@ -84,7 +84,9 @@ void silk_warped_autocorrelation_FIX_neon(
         silk_assert( ( order & 1 ) == 0 );
         silk_assert( 2 * QS - QC >= 0 );
 
-        ALLOC( input_QST, length + 2 * MAX_SHAPE_LPC_ORDER, opus_int32 );
+        /* The additional +4 is to ensure a later vld1q_s32 call does not overflow.               */
+        /* Strictly, only +3 is needed but +4 simplifies initialization using the 4x32 neon load. */
+        ALLOC( input_QST, length + 2 * MAX_SHAPE_LPC_ORDER + 4, opus_int32 );
 
         input_QS = input_QST;
         /* input_QS has zero paddings in the beginning and end. */
@@ -121,6 +123,8 @@ void silk_warped_autocorrelation_FIX_neon(
         vst1q_s32( input_QS, vdupq_n_s32( 0 ) );
         input_QS += 4;
         vst1q_s32( input_QS, vdupq_n_s32( 0 ) );
+        input_QS += 4;
+        vst1q_s32( input_QS, vdupq_n_s32( 0 ) );
         input_QS = input_QST + MAX_SHAPE_LPC_ORDER - orderT;
 
         /* The following loop runs ( length + order ) times, with ( order ) extra epilogues.                  */
@@ -153,7 +157,8 @@ void silk_warped_autocorrelation_FIX_neon(
             opus_int o = orderT;
             int32x4_t state_QS_s32x4[ 3 ][ 2 ];
 
-            ALLOC( state, length + orderT, opus_int32 );
+            /* The additional +4 is to ensure a later vld1q_s32 call does not overflow. */
+            ALLOC( state, length + order + 4, opus_int32 );
             state_QS_s32x4[ 2 ][ 1 ] = vdupq_n_s32( 0 );
 
             /* Calculate 8 taps of all inputs in each loop. */
diff --git a/opus/silk/fixed/burg_modified_FIX.c b/opus/silk/fixed/burg_modified_FIX.c
index 274d4b28..185a12b1 100644
--- a/opus/silk/fixed/burg_modified_FIX.c
+++ b/opus/silk/fixed/burg_modified_FIX.c
@@ -68,7 +68,7 @@ void silk_burg_modified_c(
     celt_assert( subfr_length * nb_subfr <= MAX_FRAME_SIZE );
 
     /* Compute autocorrelations, added over subframes */
-    C0_64 = silk_inner_prod16_aligned_64( x, x, subfr_length*nb_subfr, arch );
+    C0_64 = silk_inner_prod16( x, x, subfr_length*nb_subfr, arch );
     lz = silk_CLZ64(C0_64);
     rshifts = 32 + 1 + N_BITS_HEAD_ROOM - lz;
     if (rshifts > MAX_RSHIFTS) rshifts = MAX_RSHIFTS;
@@ -87,7 +87,7 @@ void silk_burg_modified_c(
             x_ptr = x + s * subfr_length;
             for( n = 1; n < D + 1; n++ ) {
                 C_first_row[ n - 1 ] += (opus_int32)silk_RSHIFT64(
-                    silk_inner_prod16_aligned_64( x_ptr, x_ptr + n, subfr_length - n, arch ), rshifts );
+                    silk_inner_prod16( x_ptr, x_ptr + n, subfr_length - n, arch ), rshifts );
             }
         }
     } else {
@@ -150,7 +150,7 @@ void silk_burg_modified_c(
                     C_first_row[ k ] = silk_MLA( C_first_row[ k ], x1, x_ptr[ n - k - 1 ]            ); /* Q( -rshifts ) */
                     C_last_row[ k ]  = silk_MLA( C_last_row[ k ],  x2, x_ptr[ subfr_length - n + k ] ); /* Q( -rshifts ) */
                     Atmp1 = silk_RSHIFT_ROUND( Af_QA[ k ], QA - 17 );                                   /* Q17 */
-                    /* We sometimes have get overflows in the multiplications (even beyond +/- 2^32),
+                    /* We sometimes get overflows in the multiplications (even beyond +/- 2^32),
                        but they cancel each other and the real result seems to always fit in a 32-bit
                        signed integer. This was determined experimentally, not theoretically (unfortunately). */
                     tmp1 = silk_MLA_ovflw( tmp1, x_ptr[ n - k - 1 ],            Atmp1 );                      /* Q17 */
@@ -253,7 +253,7 @@ void silk_burg_modified_c(
         if( rshifts > 0 ) {
             for( s = 0; s < nb_subfr; s++ ) {
                 x_ptr = x + s * subfr_length;
-                C0 -= (opus_int32)silk_RSHIFT64( silk_inner_prod16_aligned_64( x_ptr, x_ptr, D, arch ), rshifts );
+                C0 -= (opus_int32)silk_RSHIFT64( silk_inner_prod16( x_ptr, x_ptr, D, arch ), rshifts );
             }
         } else {
             for( s = 0; s < nb_subfr; s++ ) {
diff --git a/opus/silk/fixed/encode_frame_FIX.c b/opus/silk/fixed/encode_frame_FIX.c
index a02bf87d..7c83360b 100644
--- a/opus/silk/fixed/encode_frame_FIX.c
+++ b/opus/silk/fixed/encode_frame_FIX.c
@@ -105,8 +105,11 @@ opus_int silk_encode_frame_FIX(
     opus_int     gain_lock[ MAX_NB_SUBFR ] = {0};
     opus_int16   best_gain_mult[ MAX_NB_SUBFR ];
     opus_int     best_sum[ MAX_NB_SUBFR ];
+    opus_int     bits_margin;
     SAVE_STACK;
 
+    /* For CBR, 5 bits below budget is close enough. For VBR, allow up to 25% below the cap if we initially busted the budget. */
+    bits_margin = useCBR ? 5 : maxBits/4;
     /* This is totally unnecessary but many compilers (including gcc) are too dumb to realise it */
     LastGainIndex_copy2 = nBits_lower = nBits_upper = gainMult_lower = gainMult_upper = 0;
 
@@ -282,7 +285,7 @@ opus_int silk_encode_frame_FIX(
                     gainMult_upper = gainMult_Q8;
                     gainsID_upper = gainsID;
                 }
-            } else if( nBits < maxBits - 5 ) {
+            } else if( nBits < maxBits - bits_margin ) {
                 found_lower = 1;
                 nBits_lower = nBits;
                 gainMult_lower = gainMult_Q8;
@@ -296,7 +299,7 @@ opus_int silk_encode_frame_FIX(
                     LastGainIndex_copy2 = psEnc->sShape.LastGainIndex;
                 }
             } else {
-                /* Within 5 bits of budget: close enough */
+                /* Close enough */
                 break;
             }
 
@@ -318,17 +321,10 @@ opus_int silk_encode_frame_FIX(
             if( ( found_lower & found_upper ) == 0 ) {
                 /* Adjust gain according to high-rate rate/distortion curve */
                 if( nBits > maxBits ) {
-                    if (gainMult_Q8 < 16384) {
-                        gainMult_Q8 *= 2;
-                    } else {
-                        gainMult_Q8 = 32767;
-                    }
+                    gainMult_Q8 = silk_min_32( 1024, gainMult_Q8*3/2 );
                 } else {
-                    opus_int32 gain_factor_Q16;
-                    gain_factor_Q16 = silk_log2lin( silk_LSHIFT( nBits - maxBits, 7 ) / psEnc->sCmn.frame_length + SILK_FIX_CONST( 16, 7 ) );
-                    gainMult_Q8 = silk_SMULWB( gain_factor_Q16, gainMult_Q8 );
+                    gainMult_Q8 = silk_max_32( 64, gainMult_Q8*4/5 );
                 }
-
             } else {
                 /* Adjust gain by interpolating */
                 gainMult_Q8 = gainMult_lower + silk_DIV32_16( silk_MUL( gainMult_upper - gainMult_lower, maxBits - nBits_lower ), nBits_upper - nBits_lower );
diff --git a/opus/silk/fixed/find_pred_coefs_FIX.c b/opus/silk/fixed/find_pred_coefs_FIX.c
index 606d8633..ad363fb7 100644
--- a/opus/silk/fixed/find_pred_coefs_FIX.c
+++ b/opus/silk/fixed/find_pred_coefs_FIX.c
@@ -42,7 +42,8 @@ void silk_find_pred_coefs_FIX(
 {
     opus_int         i;
     opus_int32       invGains_Q16[ MAX_NB_SUBFR ], local_gains[ MAX_NB_SUBFR ];
-    opus_int16       NLSF_Q15[ MAX_LPC_ORDER ];
+    /* Set NLSF_Q15 to zero so we don't copy junk to the state. */
+    opus_int16       NLSF_Q15[ MAX_LPC_ORDER ]={0};
     const opus_int16 *x_ptr;
     opus_int16       *x_pre_ptr;
     VARDECL( opus_int16, LPC_in_pre );
diff --git a/opus/silk/fixed/vector_ops_FIX.c b/opus/silk/fixed/vector_ops_FIX.c
index d9498001..dcf84070 100644
--- a/opus/silk/fixed/vector_ops_FIX.c
+++ b/opus/silk/fixed/vector_ops_FIX.c
@@ -87,7 +87,7 @@ opus_int32 silk_inner_prod_aligned(
 #endif
 }
 
-opus_int64 silk_inner_prod16_aligned_64_c(
+opus_int64 silk_inner_prod16_c(
     const opus_int16            *inVec1,            /*    I input vector 1                                              */
     const opus_int16            *inVec2,            /*    I input vector 2                                              */
     const opus_int              len                 /*    I vector lengths                                              */
diff --git a/opus/silk/fixed/x86/burg_modified_FIX_sse4_1.c b/opus/silk/fixed/x86/burg_modified_FIX_sse4_1.c
index bbb1ce0f..e58bf079 100644
--- a/opus/silk/fixed/x86/burg_modified_FIX_sse4_1.c
+++ b/opus/silk/fixed/x86/burg_modified_FIX_sse4_1.c
@@ -1,5 +1,5 @@
-/* Copyright (c) 2014, Cisco Systems, INC
-   Written by XiangMingZhu WeiZhou MinPeng YanWang
+/* Copyright (c) 2014-2020, Cisco Systems, INC
+   Written by XiangMingZhu WeiZhou MinPeng YanWang FrancisQuiers
 
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions
@@ -42,7 +42,7 @@
 #define MAX_FRAME_SIZE              384             /* subfr_length * nb_subfr = ( 0.005 * 16000 + 16 ) * 4 = 384 */
 
 #define QA                          25
-#define N_BITS_HEAD_ROOM            2
+#define N_BITS_HEAD_ROOM            3
 #define MIN_RSHIFTS                 -16
 #define MAX_RSHIFTS                 (32 - QA)
 
@@ -59,7 +59,7 @@ void silk_burg_modified_sse4_1(
     int                         arch                /* I    Run-time architecture                                       */
 )
 {
-    opus_int         k, n, s, lz, rshifts, rshifts_extra, reached_max_gain;
+    opus_int         k, n, s, lz, rshifts, reached_max_gain;
     opus_int32       C0, num, nrg, rc_Q31, invGain_Q30, Atmp_QA, Atmp1, tmp1, tmp2, x1, x2;
     const opus_int16 *x_ptr;
     opus_int32       C_first_row[ SILK_MAX_ORDER_LPC ];
@@ -68,6 +68,7 @@ void silk_burg_modified_sse4_1(
     opus_int32       CAf[ SILK_MAX_ORDER_LPC + 1 ];
     opus_int32       CAb[ SILK_MAX_ORDER_LPC + 1 ];
     opus_int32       xcorr[ SILK_MAX_ORDER_LPC ];
+    opus_int64       C0_64;
 
     __m128i FIRST_3210, LAST_3210, ATMP_3210, TMP1_3210, TMP2_3210, T1_3210, T2_3210, PTR_3210, SUBFR_3210, X1_3210, X2_3210;
     __m128i CONST1 = _mm_set1_epi32(1);
@@ -75,23 +76,18 @@ void silk_burg_modified_sse4_1(
     celt_assert( subfr_length * nb_subfr <= MAX_FRAME_SIZE );
 
     /* Compute autocorrelations, added over subframes */
-    silk_sum_sqr_shift( &C0, &rshifts, x, nb_subfr * subfr_length );
-    if( rshifts > MAX_RSHIFTS ) {
-        C0 = silk_LSHIFT32( C0, rshifts - MAX_RSHIFTS );
-        silk_assert( C0 > 0 );
-        rshifts = MAX_RSHIFTS;
+    C0_64 = silk_inner_prod16( x, x, subfr_length*nb_subfr, arch );
+    lz = silk_CLZ64(C0_64);
+    rshifts = 32 + 1 + N_BITS_HEAD_ROOM - lz;
+    if (rshifts > MAX_RSHIFTS) rshifts = MAX_RSHIFTS;
+    if (rshifts < MIN_RSHIFTS) rshifts = MIN_RSHIFTS;
+
+    if (rshifts > 0) {
+        C0 = (opus_int32)silk_RSHIFT64(C0_64, rshifts );
     } else {
-        lz = silk_CLZ32( C0 ) - 1;
-        rshifts_extra = N_BITS_HEAD_ROOM - lz;
-        if( rshifts_extra > 0 ) {
-            rshifts_extra = silk_min( rshifts_extra, MAX_RSHIFTS - rshifts );
-            C0 = silk_RSHIFT32( C0, rshifts_extra );
-        } else {
-            rshifts_extra = silk_max( rshifts_extra, MIN_RSHIFTS - rshifts );
-            C0 = silk_LSHIFT32( C0, -rshifts_extra );
-        }
-        rshifts += rshifts_extra;
+        C0 = silk_LSHIFT32((opus_int32)C0_64, -rshifts );
     }
+
     CAb[ 0 ] = CAf[ 0 ] = C0 + silk_SMMUL( SILK_FIX_CONST( FIND_LPC_COND_FAC, 32 ), C0 ) + 1;                                /* Q(-rshifts) */
     silk_memset( C_first_row, 0, SILK_MAX_ORDER_LPC * sizeof( opus_int32 ) );
     if( rshifts > 0 ) {
@@ -99,7 +95,7 @@ void silk_burg_modified_sse4_1(
             x_ptr = x + s * subfr_length;
             for( n = 1; n < D + 1; n++ ) {
                 C_first_row[ n - 1 ] += (opus_int32)silk_RSHIFT64(
-                    silk_inner_prod16_aligned_64( x_ptr, x_ptr + n, subfr_length - n, arch ), rshifts );
+                    silk_inner_prod16( x_ptr, x_ptr + n, subfr_length - n, arch ), rshifts );
             }
         }
     } else {
@@ -203,8 +199,11 @@ void silk_burg_modified_sse4_1(
                     C_first_row[ k ] = silk_MLA( C_first_row[ k ], x1, x_ptr[ n - k - 1 ]            ); /* Q( -rshifts ) */
                     C_last_row[ k ]  = silk_MLA( C_last_row[ k ],  x2, x_ptr[ subfr_length - n + k ] ); /* Q( -rshifts ) */
                     Atmp1 = silk_RSHIFT_ROUND( Af_QA[ k ], QA - 17 );                                   /* Q17 */
-                    tmp1 = silk_MLA( tmp1, x_ptr[ n - k - 1 ],            Atmp1 );                      /* Q17 */
-                    tmp2 = silk_MLA( tmp2, x_ptr[ subfr_length - n + k ], Atmp1 );                      /* Q17 */
+                    /* We sometimes get overflows in the multiplications (even beyond +/- 2^32),
+                       but they cancel each other and the real result seems to always fit in a 32-bit
+                       signed integer. This was determined experimentally, not theoretically (unfortunately). */
+                    tmp1 = silk_MLA_ovflw( tmp1, x_ptr[ n - k - 1 ],            Atmp1 );                      /* Q17 */
+                    tmp2 = silk_MLA_ovflw( tmp2, x_ptr[ subfr_length - n + k ], Atmp1 );                      /* Q17 */
                 }
 
                 tmp1 = -tmp1;                /* Q17 */
@@ -350,7 +349,7 @@ void silk_burg_modified_sse4_1(
         if( rshifts > 0 ) {
             for( s = 0; s < nb_subfr; s++ ) {
                 x_ptr = x + s * subfr_length;
-                C0 -= (opus_int32)silk_RSHIFT64( silk_inner_prod16_aligned_64( x_ptr, x_ptr, D, arch ), rshifts );
+                C0 -= (opus_int32)silk_RSHIFT64( silk_inner_prod16( x_ptr, x_ptr, D, arch ), rshifts );
             }
         } else {
             for( s = 0; s < nb_subfr; s++ ) {
@@ -374,4 +373,28 @@ void silk_burg_modified_sse4_1(
         *res_nrg = silk_SMLAWW( nrg, silk_SMMUL( SILK_FIX_CONST( FIND_LPC_COND_FAC, 32 ), C0 ), -tmp1 );/* Q( -rshifts ) */
         *res_nrg_Q = -rshifts;
     }
+
+#ifdef OPUS_CHECK_ASM
+    {
+        opus_int32 res_nrg_c = 0;
+        opus_int res_nrg_Q_c = 0;
+        opus_int32 A_Q16_c[ MAX_LPC_ORDER ] = {0};
+
+        silk_burg_modified_c(
+            &res_nrg_c,
+            &res_nrg_Q_c,
+            A_Q16_c,
+            x,
+            minInvGain_Q30,
+            subfr_length,
+            nb_subfr,
+            D,
+            0
+        );
+
+        silk_assert( *res_nrg == res_nrg_c );
+        silk_assert( *res_nrg_Q == res_nrg_Q_c );
+        silk_assert( !memcmp( A_Q16, A_Q16_c, D * sizeof( *A_Q16 ) ) );
+    }
+#endif
 }
diff --git a/opus/silk/fixed/x86/vector_ops_FIX_sse4_1.c b/opus/silk/fixed/x86/vector_ops_FIX_sse4_1.c
index c1e90564..a46289bb 100644
--- a/opus/silk/fixed/x86/vector_ops_FIX_sse4_1.c
+++ b/opus/silk/fixed/x86/vector_ops_FIX_sse4_1.c
@@ -36,40 +36,38 @@
 
 #include "SigProc_FIX.h"
 #include "pitch.h"
+#include "celt/x86/x86cpu.h"
 
-opus_int64 silk_inner_prod16_aligned_64_sse4_1(
+opus_int64 silk_inner_prod16_sse4_1(
     const opus_int16            *inVec1,            /*    I input vector 1                                              */
     const opus_int16            *inVec2,            /*    I input vector 2                                              */
     const opus_int              len                 /*    I vector lengths                                              */
 )
 {
-    opus_int  i, dataSize8;
+    opus_int  i, dataSize4;
     opus_int64 sum;
 
-    __m128i xmm_tempa;
-    __m128i inVec1_76543210, acc1;
-    __m128i inVec2_76543210, acc2;
+    __m128i xmm_prod_20, xmm_prod_31;
+    __m128i inVec1_3210, acc1;
+    __m128i inVec2_3210, acc2;
 
     sum = 0;
-    dataSize8 = len & ~7;
+    dataSize4 = len & ~3;
 
     acc1 = _mm_setzero_si128();
     acc2 = _mm_setzero_si128();
 
-    for( i = 0; i < dataSize8; i += 8 ) {
-        inVec1_76543210 = _mm_loadu_si128( (__m128i *)(&inVec1[i + 0] ) );
-        inVec2_76543210 = _mm_loadu_si128( (__m128i *)(&inVec2[i + 0] ) );
+    for( i = 0; i < dataSize4; i += 4 ) {
+        inVec1_3210 = OP_CVTEPI16_EPI32_M64( &inVec1[i + 0] );
+        inVec2_3210 = OP_CVTEPI16_EPI32_M64( &inVec2[i + 0] );
+        xmm_prod_20 = _mm_mul_epi32( inVec1_3210, inVec2_3210 );
 
-        /* only when all 4 operands are -32768 (0x8000), this results in wrap around */
-        inVec1_76543210 = _mm_madd_epi16( inVec1_76543210, inVec2_76543210 );
+        inVec1_3210 = _mm_shuffle_epi32( inVec1_3210, _MM_SHUFFLE( 0, 3, 2, 1 ) );
+        inVec2_3210 = _mm_shuffle_epi32( inVec2_3210, _MM_SHUFFLE( 0, 3, 2, 1 ) );
+        xmm_prod_31 = _mm_mul_epi32( inVec1_3210, inVec2_3210 );
 
-        xmm_tempa       = _mm_cvtepi32_epi64( inVec1_76543210 );
-        /* equal shift right 8 bytes */
-        inVec1_76543210 = _mm_shuffle_epi32( inVec1_76543210, _MM_SHUFFLE( 0, 0, 3, 2 ) );
-        inVec1_76543210 = _mm_cvtepi32_epi64( inVec1_76543210 );
-
-        acc1 = _mm_add_epi64( acc1, xmm_tempa );
-        acc2 = _mm_add_epi64( acc2, inVec1_76543210 );
+        acc1 = _mm_add_epi64( acc1, xmm_prod_20 );
+        acc2 = _mm_add_epi64( acc2, xmm_prod_31 );
     }
 
     acc1 = _mm_add_epi64( acc1, acc2 );
@@ -81,8 +79,15 @@ opus_int64 silk_inner_prod16_aligned_64_sse4_1(
     _mm_storel_epi64( (__m128i *)&sum, acc1 );
 
     for( ; i < len; i++ ) {
-        sum = silk_SMLABB( sum, inVec1[ i ], inVec2[ i ] );
+        sum = silk_SMLALBB( sum, inVec1[ i ], inVec2[ i ] );
+    }
+
+#ifdef OPUS_CHECK_ASM
+    {
+        opus_int64 sum_c = silk_inner_prod16_c( inVec1, inVec2, len );
+        silk_assert( sum == sum_c );
     }
+#endif
 
     return sum;
 }
diff --git a/opus/silk/float/LTP_scale_ctrl_FLP.c b/opus/silk/float/LTP_scale_ctrl_FLP.c
index 8dbe29d0..6f30ff09 100644
--- a/opus/silk/float/LTP_scale_ctrl_FLP.c
+++ b/opus/silk/float/LTP_scale_ctrl_FLP.c
@@ -41,8 +41,14 @@ void silk_LTP_scale_ctrl_FLP(
 
     if( condCoding == CODE_INDEPENDENTLY ) {
         /* Only scale if first frame in packet */
-        round_loss = psEnc->sCmn.PacketLoss_perc + psEnc->sCmn.nFramesPerPacket;
-        psEnc->sCmn.indices.LTP_scaleIndex = (opus_int8)silk_LIMIT( round_loss * psEncCtrl->LTPredCodGain * 0.1f, 0.0f, 2.0f );
+        round_loss = psEnc->sCmn.PacketLoss_perc * psEnc->sCmn.nFramesPerPacket;
+        if ( psEnc->sCmn.LBRR_flag ) {
+            /* LBRR reduces the effective loss. In practice, it does not square the loss because
+               losses aren't independent, but that still seems to work best. We also never go below 2%. */
+            round_loss = 2 + silk_SMULBB( round_loss, round_loss) / 100;
+        }
+        psEnc->sCmn.indices.LTP_scaleIndex = silk_SMULBB( psEncCtrl->LTPredCodGain, round_loss ) > silk_log2lin( 2900 - psEnc->sCmn.SNR_dB_Q7 );
+        psEnc->sCmn.indices.LTP_scaleIndex += silk_SMULBB( psEncCtrl->LTPredCodGain, round_loss ) > silk_log2lin( 3900 - psEnc->sCmn.SNR_dB_Q7 );
     } else {
         /* Default is minimum scaling */
         psEnc->sCmn.indices.LTP_scaleIndex = 0;
diff --git a/opus/silk/float/SigProc_FLP.h b/opus/silk/float/SigProc_FLP.h
index 953de8b0..ff9281b8 100644
--- a/opus/silk/float/SigProc_FLP.h
+++ b/opus/silk/float/SigProc_FLP.h
@@ -30,6 +30,7 @@ POSSIBILITY OF SUCH DAMAGE.
 
 #include "SigProc_FIX.h"
 #include "float_cast.h"
+#include "main.h"
 #include <math.h>
 
 #ifdef  __cplusplus
@@ -73,7 +74,8 @@ void silk_autocorrelation_FLP(
     silk_float          *results,           /* O    result (length correlationCount)                            */
     const silk_float    *inputData,         /* I    input data to correlate                                     */
     opus_int            inputDataSize,      /* I    length of input                                             */
-    opus_int            correlationCount    /* I    number of correlation taps to compute                       */
+    opus_int            correlationCount,    /* I    number of correlation taps to compute                       */
+    int                 arch
 );
 
 opus_int silk_pitch_analysis_core_FLP(      /* O    Voicing estimate: 0 voiced, 1 unvoiced                      */
@@ -105,7 +107,8 @@ silk_float silk_burg_modified_FLP(          /* O    returns residual energy
     const silk_float    minInvGain,         /* I    minimum inverse prediction gain                             */
     const opus_int      subfr_length,       /* I    input signal subframe length (incl. D preceding samples)    */
     const opus_int      nb_subfr,           /* I    number of subframes stacked in x                            */
-    const opus_int      D                   /* I    order                                                       */
+    const opus_int      D,                  /* I    order                                                       */
+    int                 arch
 );
 
 /* multiply a vector by a constant */
@@ -124,12 +127,17 @@ void silk_scale_copy_vector_FLP(
 );
 
 /* inner product of two silk_float arrays, with result as double */
-double silk_inner_product_FLP(
+double silk_inner_product_FLP_c(
     const silk_float    *data1,
     const silk_float    *data2,
     opus_int            dataSize
 );
 
+#ifndef OVERRIDE_inner_product_FLP
+#define silk_inner_product_FLP(data1, data2, dataSize, arch) ((void)arch,silk_inner_product_FLP_c(data1, data2, dataSize))
+#endif
+
+
 /* sum of squares of a silk_float array, with result as double */
 double silk_energy_FLP(
     const silk_float    *data,
diff --git a/opus/silk/float/autocorrelation_FLP.c b/opus/silk/float/autocorrelation_FLP.c
index 8b8a9e65..4253b26e 100644
--- a/opus/silk/float/autocorrelation_FLP.c
+++ b/opus/silk/float/autocorrelation_FLP.c
@@ -37,7 +37,8 @@ void silk_autocorrelation_FLP(
     silk_float          *results,           /* O    result (length correlationCount)                            */
     const silk_float    *inputData,         /* I    input data to correlate                                     */
     opus_int            inputDataSize,      /* I    length of input                                             */
-    opus_int            correlationCount    /* I    number of correlation taps to compute                       */
+    opus_int            correlationCount,    /* I    number of correlation taps to compute                       */
+    int                 arch
 )
 {
     opus_int i;
@@ -47,6 +48,6 @@ void silk_autocorrelation_FLP(
     }
 
     for( i = 0; i < correlationCount; i++ ) {
-        results[ i ] =  (silk_float)silk_inner_product_FLP( inputData, inputData + i, inputDataSize - i );
+        results[ i ] =  (silk_float)silk_inner_product_FLP( inputData, inputData + i, inputDataSize - i, arch );
     }
 }
diff --git a/opus/silk/float/burg_modified_FLP.c b/opus/silk/float/burg_modified_FLP.c
index 756b76a3..f5bef5dd 100644
--- a/opus/silk/float/burg_modified_FLP.c
+++ b/opus/silk/float/burg_modified_FLP.c
@@ -42,7 +42,8 @@ silk_float silk_burg_modified_FLP(          /* O    returns residual energy
     const silk_float    minInvGain,         /* I    minimum inverse prediction gain                             */
     const opus_int      subfr_length,       /* I    input signal subframe length (incl. D preceding samples)    */
     const opus_int      nb_subfr,           /* I    number of subframes stacked in x                            */
-    const opus_int      D                   /* I    order                                                       */
+    const opus_int      D,                  /* I    order                                                       */
+    int                 arch
 )
 {
     opus_int         k, n, s, reached_max_gain;
@@ -60,7 +61,7 @@ silk_float silk_burg_modified_FLP(          /* O    returns residual energy
     for( s = 0; s < nb_subfr; s++ ) {
         x_ptr = x + s * subfr_length;
         for( n = 1; n < D + 1; n++ ) {
-            C_first_row[ n - 1 ] += silk_inner_product_FLP( x_ptr, x_ptr + n, subfr_length - n );
+            C_first_row[ n - 1 ] += silk_inner_product_FLP( x_ptr, x_ptr + n, subfr_length - n, arch );
         }
     }
     silk_memcpy( C_last_row, C_first_row, SILK_MAX_ORDER_LPC * sizeof( double ) );
diff --git a/opus/silk/float/corrMatrix_FLP.c b/opus/silk/float/corrMatrix_FLP.c
index eae6a1cf..eef6e8aa 100644
--- a/opus/silk/float/corrMatrix_FLP.c
+++ b/opus/silk/float/corrMatrix_FLP.c
@@ -41,7 +41,8 @@ void silk_corrVector_FLP(
     const silk_float                *t,                                 /* I    Target vector [L]                           */
     const opus_int                  L,                                  /* I    Length of vecors                            */
     const opus_int                  Order,                              /* I    Max lag for correlation                     */
-    silk_float                      *Xt                                 /* O    X'*t correlation vector [order]             */
+    silk_float                      *Xt,                                /* O    X'*t correlation vector [order]             */
+    int                             arch
 )
 {
     opus_int lag;
@@ -50,7 +51,7 @@ void silk_corrVector_FLP(
     ptr1 = &x[ Order - 1 ];                     /* Points to first sample of column 0 of X: X[:,0] */
     for( lag = 0; lag < Order; lag++ ) {
         /* Calculate X[:,lag]'*t */
-        Xt[ lag ] = (silk_float)silk_inner_product_FLP( ptr1, t, L );
+        Xt[ lag ] = (silk_float)silk_inner_product_FLP( ptr1, t, L, arch );
         ptr1--;                                 /* Next column of X */
     }
 }
@@ -60,7 +61,8 @@ void silk_corrMatrix_FLP(
     const silk_float                *x,                                 /* I    x vector [ L+order-1 ] used to create X     */
     const opus_int                  L,                                  /* I    Length of vectors                           */
     const opus_int                  Order,                              /* I    Max lag for correlation                     */
-    silk_float                      *XX                                 /* O    X'*X correlation matrix [order x order]     */
+    silk_float                      *XX,                                /* O    X'*X correlation matrix [order x order]     */
+    int                             arch
 )
 {
     opus_int j, lag;
@@ -79,7 +81,7 @@ void silk_corrMatrix_FLP(
     ptr2 = &x[ Order - 2 ];                     /* First sample of column 1 of X */
     for( lag = 1; lag < Order; lag++ ) {
         /* Calculate X[:,0]'*X[:,lag] */
-        energy = silk_inner_product_FLP( ptr1, ptr2, L );
+        energy = silk_inner_product_FLP( ptr1, ptr2, L, arch );
         matrix_ptr( XX, lag, 0, Order ) = ( silk_float )energy;
         matrix_ptr( XX, 0, lag, Order ) = ( silk_float )energy;
         /* Calculate X[:,j]'*X[:,j + lag] */
diff --git a/opus/silk/float/encode_frame_FLP.c b/opus/silk/float/encode_frame_FLP.c
index b029c3f5..8a327c56 100644
--- a/opus/silk/float/encode_frame_FLP.c
+++ b/opus/silk/float/encode_frame_FLP.c
@@ -107,7 +107,10 @@ opus_int silk_encode_frame_FLP(
     opus_int     gain_lock[ MAX_NB_SUBFR ] = {0};
     opus_int16   best_gain_mult[ MAX_NB_SUBFR ];
     opus_int     best_sum[ MAX_NB_SUBFR ];
+    opus_int     bits_margin;
 
+    /* For CBR, 5 bits below budget is close enough. For VBR, allow up to 25% below the cap if we initially busted the budget. */
+    bits_margin = useCBR ? 5 : maxBits/4;
     /* This is totally unnecessary but many compilers (including gcc) are too dumb to realise it */
     LastGainIndex_copy2 = nBits_lower = nBits_upper = gainMult_lower = gainMult_upper = 0;
 
@@ -270,7 +273,7 @@ opus_int silk_encode_frame_FLP(
                     gainMult_upper = gainMult_Q8;
                     gainsID_upper = gainsID;
                 }
-            } else if( nBits < maxBits - 5 ) {
+            } else if( nBits < maxBits - bits_margin ) {
                 found_lower = 1;
                 nBits_lower = nBits;
                 gainMult_lower = gainMult_Q8;
@@ -284,7 +287,7 @@ opus_int silk_encode_frame_FLP(
                     LastGainIndex_copy2 = psEnc->sShape.LastGainIndex;
                 }
             } else {
-                /* Within 5 bits of budget: close enough */
+                /* Close enough */
                 break;
             }
 
@@ -306,15 +309,9 @@ opus_int silk_encode_frame_FLP(
             if( ( found_lower & found_upper ) == 0 ) {
                 /* Adjust gain according to high-rate rate/distortion curve */
                 if( nBits > maxBits ) {
-                    if (gainMult_Q8 < 16384) {
-                        gainMult_Q8 *= 2;
-                    } else {
-                        gainMult_Q8 = 32767;
-                    }
+                    gainMult_Q8 = silk_min_32( 1024, gainMult_Q8*3/2 );
                 } else {
-                    opus_int32 gain_factor_Q16;
-                    gain_factor_Q16 = silk_log2lin( silk_LSHIFT( nBits - maxBits, 7 ) / psEnc->sCmn.frame_length + SILK_FIX_CONST( 16, 7 ) );
-                    gainMult_Q8 = silk_SMULWB( gain_factor_Q16, gainMult_Q8 );
+                    gainMult_Q8 = silk_max_32( 64, gainMult_Q8*4/5 );
                 }
             } else {
                 /* Adjust gain by interpolating */
diff --git a/opus/silk/float/find_LPC_FLP.c b/opus/silk/float/find_LPC_FLP.c
index fa3ffe7f..6ccd711d 100644
--- a/opus/silk/float/find_LPC_FLP.c
+++ b/opus/silk/float/find_LPC_FLP.c
@@ -38,7 +38,8 @@ void silk_find_LPC_FLP(
     silk_encoder_state              *psEncC,                            /* I/O  Encoder state                               */
     opus_int16                      NLSF_Q15[],                         /* O    NLSFs                                       */
     const silk_float                x[],                                /* I    Input signal                                */
-    const silk_float                minInvGain                          /* I    Inverse of max prediction gain              */
+    const silk_float                minInvGain,                         /* I    Inverse of max prediction gain              */
+    int                             arch
 )
 {
     opus_int    k, subfr_length;
@@ -56,12 +57,12 @@ void silk_find_LPC_FLP(
     psEncC->indices.NLSFInterpCoef_Q2 = 4;
 
     /* Burg AR analysis for the full frame */
-    res_nrg = silk_burg_modified_FLP( a, x, minInvGain, subfr_length, psEncC->nb_subfr, psEncC->predictLPCOrder );
+    res_nrg = silk_burg_modified_FLP( a, x, minInvGain, subfr_length, psEncC->nb_subfr, psEncC->predictLPCOrder, arch );
 
     if( psEncC->useInterpolatedNLSFs && !psEncC->first_frame_after_reset && psEncC->nb_subfr == MAX_NB_SUBFR ) {
         /* Optimal solution for last 10 ms; subtract residual energy here, as that's easier than        */
         /* adding it to the residual energy of the first 10 ms in each iteration of the search below    */
-        res_nrg -= silk_burg_modified_FLP( a_tmp, x + ( MAX_NB_SUBFR / 2 ) * subfr_length, minInvGain, subfr_length, MAX_NB_SUBFR / 2, psEncC->predictLPCOrder );
+        res_nrg -= silk_burg_modified_FLP( a_tmp, x + ( MAX_NB_SUBFR / 2 ) * subfr_length, minInvGain, subfr_length, MAX_NB_SUBFR / 2, psEncC->predictLPCOrder, arch );
 
         /* Convert to NLSFs */
         silk_A2NLSF_FLP( NLSF_Q15, a_tmp, psEncC->predictLPCOrder );
diff --git a/opus/silk/float/find_LTP_FLP.c b/opus/silk/float/find_LTP_FLP.c
index f9706493..90aeeac0 100644
--- a/opus/silk/float/find_LTP_FLP.c
+++ b/opus/silk/float/find_LTP_FLP.c
@@ -38,7 +38,8 @@ void silk_find_LTP_FLP(
     const silk_float                r_ptr[],                            /* I    LPC residual                                */
     const opus_int                  lag[ MAX_NB_SUBFR ],                /* I    LTP lags                                    */
     const opus_int                  subfr_length,                       /* I    Subframe length                             */
-    const opus_int                  nb_subfr                            /* I    number of subframes                         */
+    const opus_int                  nb_subfr,                           /* I    number of subframes                         */
+    int                             arch
 )
 {
     opus_int   k;
@@ -50,8 +51,8 @@ void silk_find_LTP_FLP(
     XX_ptr = XX;
     for( k = 0; k < nb_subfr; k++ ) {
         lag_ptr = r_ptr - ( lag[ k ] + LTP_ORDER / 2 );
-        silk_corrMatrix_FLP( lag_ptr, subfr_length, LTP_ORDER, XX_ptr );
-        silk_corrVector_FLP( lag_ptr, r_ptr, subfr_length, LTP_ORDER, xX_ptr );
+        silk_corrMatrix_FLP( lag_ptr, subfr_length, LTP_ORDER, XX_ptr, arch );
+        silk_corrVector_FLP( lag_ptr, r_ptr, subfr_length, LTP_ORDER, xX_ptr, arch );
         xx = ( silk_float )silk_energy_FLP( r_ptr, subfr_length + LTP_ORDER );
         temp = 1.0f / silk_max( xx, LTP_CORR_INV_MAX * 0.5f * ( XX_ptr[ 0 ] + XX_ptr[ 24 ] ) + 1.0f );
         silk_scale_vector_FLP( XX_ptr, temp, LTP_ORDER * LTP_ORDER );
diff --git a/opus/silk/float/find_pitch_lags_FLP.c b/opus/silk/float/find_pitch_lags_FLP.c
index dedbcd28..1f6bd599 100644
--- a/opus/silk/float/find_pitch_lags_FLP.c
+++ b/opus/silk/float/find_pitch_lags_FLP.c
@@ -82,7 +82,7 @@ void silk_find_pitch_lags_FLP(
     silk_apply_sine_window_FLP( Wsig_ptr, x_buf_ptr, 2, psEnc->sCmn.la_pitch );
 
     /* Calculate autocorrelation sequence */
-    silk_autocorrelation_FLP( auto_corr, Wsig, psEnc->sCmn.pitch_LPC_win_length, psEnc->sCmn.pitchEstimationLPCOrder + 1 );
+    silk_autocorrelation_FLP( auto_corr, Wsig, psEnc->sCmn.pitch_LPC_win_length, psEnc->sCmn.pitchEstimationLPCOrder + 1, arch );
 
     /* Add white noise, as a fraction of the energy */
     auto_corr[ 0 ] += auto_corr[ 0 ] * FIND_PITCH_WHITE_NOISE_FRACTION + 1;
diff --git a/opus/silk/float/find_pred_coefs_FLP.c b/opus/silk/float/find_pred_coefs_FLP.c
index dcf7c520..f3c54cf4 100644
--- a/opus/silk/float/find_pred_coefs_FLP.c
+++ b/opus/silk/float/find_pred_coefs_FLP.c
@@ -44,7 +44,8 @@ void silk_find_pred_coefs_FLP(
     silk_float       XXLTP[ MAX_NB_SUBFR * LTP_ORDER * LTP_ORDER ];
     silk_float       xXLTP[ MAX_NB_SUBFR * LTP_ORDER ];
     silk_float       invGains[ MAX_NB_SUBFR ];
-    opus_int16       NLSF_Q15[ MAX_LPC_ORDER ];
+    /* Set NLSF_Q15 to zero so we don't copy junk to the state. */
+    opus_int16       NLSF_Q15[ MAX_LPC_ORDER ]={0};
     const silk_float *x_ptr;
     silk_float       *x_pre_ptr, LPC_in_pre[ MAX_NB_SUBFR * MAX_LPC_ORDER + MAX_FRAME_LENGTH ];
     silk_float       minInvGain;
@@ -62,7 +63,7 @@ void silk_find_pred_coefs_FLP(
         celt_assert( psEnc->sCmn.ltp_mem_length - psEnc->sCmn.predictLPCOrder >= psEncCtrl->pitchL[ 0 ] + LTP_ORDER / 2 );
 
         /* LTP analysis */
-        silk_find_LTP_FLP( XXLTP, xXLTP, res_pitch, psEncCtrl->pitchL, psEnc->sCmn.subfr_length, psEnc->sCmn.nb_subfr );
+        silk_find_LTP_FLP( XXLTP, xXLTP, res_pitch, psEncCtrl->pitchL, psEnc->sCmn.subfr_length, psEnc->sCmn.nb_subfr, psEnc->sCmn.arch );
 
         /* Quantize LTP gain parameters */
         silk_quant_LTP_gains_FLP( psEncCtrl->LTPCoef, psEnc->sCmn.indices.LTPIndex, &psEnc->sCmn.indices.PERIndex,
@@ -101,7 +102,7 @@ void silk_find_pred_coefs_FLP(
     }
 
     /* LPC_in_pre contains the LTP-filtered input for voiced, and the unfiltered input for unvoiced */
-    silk_find_LPC_FLP( &psEnc->sCmn, NLSF_Q15, LPC_in_pre, minInvGain );
+    silk_find_LPC_FLP( &psEnc->sCmn, NLSF_Q15, LPC_in_pre, minInvGain, psEnc->sCmn.arch );
 
     /* Quantize LSFs */
     silk_process_NLSFs_FLP( &psEnc->sCmn, psEncCtrl->PredCoef, NLSF_Q15, psEnc->sCmn.prev_NLSFq_Q15 );
diff --git a/opus/silk/float/inner_product_FLP.c b/opus/silk/float/inner_product_FLP.c
index cdd39d24..88b160ab 100644
--- a/opus/silk/float/inner_product_FLP.c
+++ b/opus/silk/float/inner_product_FLP.c
@@ -32,7 +32,7 @@ POSSIBILITY OF SUCH DAMAGE.
 #include "SigProc_FLP.h"
 
 /* inner product of two silk_float arrays, with result as double */
-double silk_inner_product_FLP(
+double silk_inner_product_FLP_c(
     const silk_float    *data1,
     const silk_float    *data2,
     opus_int            dataSize
diff --git a/opus/silk/float/main_FLP.h b/opus/silk/float/main_FLP.h
index 5dc0ccf4..2e4435cc 100644
--- a/opus/silk/float/main_FLP.h
+++ b/opus/silk/float/main_FLP.h
@@ -138,7 +138,8 @@ void silk_find_LPC_FLP(
     silk_encoder_state              *psEncC,                            /* I/O  Encoder state                               */
     opus_int16                      NLSF_Q15[],                         /* O    NLSFs                                       */
     const silk_float                x[],                                /* I    Input signal                                */
-    const silk_float                minInvGain                          /* I    Prediction gain from LTP (dB)               */
+    const silk_float                minInvGain,                         /* I    Prediction gain from LTP (dB)               */
+    int                             arch
 );
 
 /* LTP analysis */
@@ -148,7 +149,8 @@ void silk_find_LTP_FLP(
     const silk_float                r_ptr[],                            /* I    LPC residual                                */
     const opus_int                  lag[  MAX_NB_SUBFR ],               /* I    LTP lags                                    */
     const opus_int                  subfr_length,                       /* I    Subframe length                             */
-    const opus_int                  nb_subfr                            /* I    number of subframes                         */
+    const opus_int                  nb_subfr,                           /* I    number of subframes                         */
+    int                             arch
 );
 
 void silk_LTP_analysis_filter_FLP(
@@ -221,7 +223,8 @@ void silk_corrMatrix_FLP(
     const silk_float                *x,                                 /* I    x vector [ L+order-1 ] used to create X     */
     const opus_int                  L,                                  /* I    Length of vectors                           */
     const opus_int                  Order,                              /* I    Max lag for correlation                     */
-    silk_float                      *XX                                 /* O    X'*X correlation matrix [order x order]     */
+    silk_float                      *XX,                                /* O    X'*X correlation matrix [order x order]     */
+    int                             arch
 );
 
 /* Calculates correlation vector X'*t */
@@ -230,7 +233,8 @@ void silk_corrVector_FLP(
     const silk_float                *t,                                 /* I    Target vector [L]                           */
     const opus_int                  L,                                  /* I    Length of vecors                            */
     const opus_int                  Order,                              /* I    Max lag for correlation                     */
-    silk_float                      *Xt                                 /* O    X'*t correlation vector [order]             */
+    silk_float                      *Xt,                                /* O    X'*t correlation vector [order]             */
+    int                             arch
 );
 
 /* Apply sine window to signal vector.  */
diff --git a/opus/silk/float/noise_shape_analysis_FLP.c b/opus/silk/float/noise_shape_analysis_FLP.c
index cb3d8a50..0b5ea952 100644
--- a/opus/silk/float/noise_shape_analysis_FLP.c
+++ b/opus/silk/float/noise_shape_analysis_FLP.c
@@ -255,7 +255,7 @@ void silk_noise_shape_analysis_FLP(
                 psEnc->sCmn.shapeWinLength, psEnc->sCmn.shapingLPCOrder );
         } else {
             /* Calculate regular auto correlation */
-            silk_autocorrelation_FLP( auto_corr, x_windowed, psEnc->sCmn.shapeWinLength, psEnc->sCmn.shapingLPCOrder + 1 );
+            silk_autocorrelation_FLP( auto_corr, x_windowed, psEnc->sCmn.shapeWinLength, psEnc->sCmn.shapingLPCOrder + 1, psEnc->sCmn.arch );
         }
 
         /* Add white noise, as a fraction of energy */
diff --git a/opus/silk/float/pitch_analysis_core_FLP.c b/opus/silk/float/pitch_analysis_core_FLP.c
index f351bc37..0530a883 100644
--- a/opus/silk/float/pitch_analysis_core_FLP.c
+++ b/opus/silk/float/pitch_analysis_core_FLP.c
@@ -291,7 +291,7 @@ opus_int silk_pitch_analysis_core_FLP(      /* O    Voicing estimate: 0 voiced,
         for( j = 0; j < length_d_comp; j++ ) {
             d = d_comp[ j ];
             basis_ptr = target_ptr - d;
-            cross_corr = silk_inner_product_FLP( basis_ptr, target_ptr, sf_length_8kHz );
+            cross_corr = silk_inner_product_FLP( basis_ptr, target_ptr, sf_length_8kHz, arch );
             if( cross_corr > 0.0f ) {
                 energy = silk_energy_FLP( basis_ptr, sf_length_8kHz );
                 C[ k ][ d ] = (silk_float)( 2 * cross_corr / ( energy + energy_tmp ) );
diff --git a/opus/silk/float/warped_autocorrelation_FLP.c b/opus/silk/float/warped_autocorrelation_FLP.c
index 09186e73..116dab92 100644
--- a/opus/silk/float/warped_autocorrelation_FLP.c
+++ b/opus/silk/float/warped_autocorrelation_FLP.c
@@ -54,11 +54,13 @@ void silk_warped_autocorrelation_FLP(
         /* Loop over allpass sections */
         for( i = 0; i < order; i += 2 ) {
             /* Output of allpass section */
-            tmp2 = state[ i ] + warping * ( state[ i + 1 ] - tmp1 );
+            /* We deliberately use two multiplications instead of factoring the expression to
+               reduce the length of the dependency chain (tmp1->tmp2->tmp1... ). */
+            tmp2 = state[ i ] + warping * state[ i + 1 ] - warping * tmp1;
             state[ i ] = tmp1;
             C[ i ] += state[ 0 ] * tmp1;
             /* Output of allpass section */
-            tmp1 = state[ i + 1 ] + warping * ( state[ i + 2 ] - tmp2 );
+            tmp1 = state[ i + 1 ] + warping * state[ i + 2 ] - warping * tmp2;
             state[ i + 1 ] = tmp2;
             C[ i + 1 ] += state[ 0 ] * tmp2;
         }
diff --git a/opus/silk/float/wrappers_FLP.c b/opus/silk/float/wrappers_FLP.c
index ad90b874..c0c183e3 100644
--- a/opus/silk/float/wrappers_FLP.c
+++ b/opus/silk/float/wrappers_FLP.c
@@ -190,12 +190,14 @@ void silk_quant_LTP_gains_FLP(
     opus_int32 XX_Q17[ MAX_NB_SUBFR * LTP_ORDER * LTP_ORDER ];
     opus_int32 xX_Q17[ MAX_NB_SUBFR * LTP_ORDER ];
 
-    for( i = 0; i < nb_subfr * LTP_ORDER * LTP_ORDER; i++ ) {
+    i = 0;
+    do {
         XX_Q17[ i ] = (opus_int32)silk_float2int( XX[ i ] * 131072.0f );
-    }
-    for( i = 0; i < nb_subfr * LTP_ORDER; i++ ) {
+    } while ( ++i < nb_subfr * LTP_ORDER * LTP_ORDER );
+    i = 0;
+    do {
         xX_Q17[ i ] = (opus_int32)silk_float2int( xX[ i ] * 131072.0f );
-    }
+    } while ( ++i < nb_subfr * LTP_ORDER );
 
     silk_quant_LTP_gains( B_Q14, cbk_index, periodicity_index, sum_log_gain_Q7, &pred_gain_dB_Q7, XX_Q17, xX_Q17, subfr_len, nb_subfr, arch );
 
diff --git a/opus/silk/float/x86/inner_product_FLP_avx2.c b/opus/silk/float/x86/inner_product_FLP_avx2.c
new file mode 100644
index 00000000..4a2daaf5
--- /dev/null
+++ b/opus/silk/float/x86/inner_product_FLP_avx2.c
@@ -0,0 +1,85 @@
+/***********************************************************************
+Copyright (c) 2006-2011, Skype Limited. All rights reserved.
+              2023 Amazon
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+- Redistributions of source code must retain the above copyright notice,
+this list of conditions and the following disclaimer.
+- Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+- Neither the name of Internet Society, IETF or IETF Trust, nor the
+names of specific contributors, may be used to endorse or promote
+products derived from this software without specific prior written
+permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+POSSIBILITY OF SUCH DAMAGE.
+***********************************************************************/
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include "SigProc_FLP.h"
+#include <immintrin.h>
+
+
+/* inner product of two silk_float arrays, with result as double */
+double silk_inner_product_FLP_avx2(
+    const silk_float    *data1,
+    const silk_float    *data2,
+    opus_int            dataSize
+)
+{
+    opus_int i;
+    __m256d accum1, accum2;
+    double   result;
+
+    /* Main loop: 8 elements per iteration, as two 4-wide FMAs into separate accumulators */
+    result = 0.0;
+    accum1 = accum2 = _mm256_setzero_pd();
+    for( i = 0; i < dataSize - 7; i += 8 ) {
+        __m128  x1f, x2f;
+        __m256d x1d, x2d;
+        x1f = _mm_loadu_ps( &data1[ i ] );
+        x2f = _mm_loadu_ps( &data2[ i ] );
+        x1d = _mm256_cvtps_pd( x1f );
+        x2d = _mm256_cvtps_pd( x2f );
+        accum1 = _mm256_fmadd_pd( x1d, x2d, accum1 );
+        x1f = _mm_loadu_ps( &data1[ i + 4 ] );
+        x2f = _mm_loadu_ps( &data2[ i + 4 ] );
+        x1d = _mm256_cvtps_pd( x1f );
+        x2d = _mm256_cvtps_pd( x2f );
+        accum2 = _mm256_fmadd_pd( x1d, x2d, accum2 );
+    }
+    for( ; i < dataSize - 3; i += 4 ) {
+        __m128  x1f, x2f;
+        __m256d x1d, x2d;
+        x1f = _mm_loadu_ps( &data1[ i ] );
+        x2f = _mm_loadu_ps( &data2[ i ] );
+        x1d = _mm256_cvtps_pd( x1f );
+        x2d = _mm256_cvtps_pd( x2f );
+        accum1 = _mm256_fmadd_pd( x1d, x2d, accum1 );
+    }
+    accum1 = _mm256_add_pd(accum1, accum2);
+    accum1 = _mm256_add_pd(accum1, _mm256_permute2f128_pd(accum1, accum1, 1));
+    accum1 = _mm256_hadd_pd(accum1,accum1);
+    result = _mm256_cvtsd_f64(accum1);
+
+    /* add any remaining products */
+    for( ; i < dataSize; i++ ) {
+        result += data1[ i ] * (double)data2[ i ];
+    }
+
+    return result;
+}
diff --git a/opus/silk/init_decoder.c b/opus/silk/init_decoder.c
index 16c03dcd..01bc4b7a 100644
--- a/opus/silk/init_decoder.c
+++ b/opus/silk/init_decoder.c
@@ -31,15 +31,21 @@ POSSIBILITY OF SUCH DAMAGE.
 
 #include "main.h"
 
+#ifdef ENABLE_OSCE
+#include "osce.h"
+#endif
+
+#include "structs.h"
+
 /************************/
-/* Init Decoder State   */
+/* Reset Decoder State  */
 /************************/
-opus_int silk_init_decoder(
+opus_int silk_reset_decoder(
     silk_decoder_state          *psDec                          /* I/O  Decoder state pointer                       */
 )
 {
     /* Clear the entire encoder state, except anything copied */
-    silk_memset( psDec, 0, sizeof( silk_decoder_state ) );
+    silk_memset( &psDec->SILK_DECODER_STATE_RESET_START, 0, sizeof( silk_decoder_state ) - ((char*) &psDec->SILK_DECODER_STATE_RESET_START - (char*)psDec) );
 
     /* Used to deactivate LSF interpolation */
     psDec->first_frame_after_reset = 1;
@@ -52,6 +58,27 @@ opus_int silk_init_decoder(
     /* Reset PLC state */
     silk_PLC_Reset( psDec );
 
+#ifdef ENABLE_OSCE
+    /* Reset OSCE state and method */
+    osce_reset(&psDec->osce, OSCE_DEFAULT_METHOD);
+#endif
+
+    return 0;
+}
+
+
+/************************/
+/* Init Decoder State   */
+/************************/
+opus_int silk_init_decoder(
+    silk_decoder_state          *psDec                          /* I/O  Decoder state pointer                       */
+)
+{
+    /* Clear the entire decoder state */
+    silk_memset( psDec, 0, sizeof( silk_decoder_state ) );
+
+    silk_reset_decoder( psDec );
+
     return(0);
 }
 
diff --git a/opus/silk/init_encoder.c b/opus/silk/init_encoder.c
index 65995c33..10d41287 100644
--- a/opus/silk/init_encoder.c
+++ b/opus/silk/init_encoder.c
@@ -36,6 +36,10 @@ POSSIBILITY OF SUCH DAMAGE.
 #include "tuning_parameters.h"
 #include "cpu_support.h"
 
+#ifdef ENABLE_DRED
+#include "dred_encoder.h"
+#endif
+
 /*********************************/
 /* Initialize Silk Encoder state */
 /*********************************/
diff --git a/opus/silk/main.h b/opus/silk/main.h
index 1a33eed5..cd576d8c 100644
--- a/opus/silk/main.h
+++ b/opus/silk/main.h
@@ -247,21 +247,21 @@ void silk_VQ_WMat_EC_c(
 /************************************/
 
 void silk_NSQ_c(
-    const silk_encoder_state    *psEncC,                                    /* I    Encoder State                   */
-    silk_nsq_state              *NSQ,                                       /* I/O  NSQ state                       */
-    SideInfoIndices             *psIndices,                                 /* I/O  Quantization Indices            */
-    const opus_int16            x16[],                                      /* I    Input                           */
-    opus_int8                   pulses[],                                   /* O    Quantized pulse signal          */
-    const opus_int16            PredCoef_Q12[ 2 * MAX_LPC_ORDER ],          /* I    Short term prediction coefs     */
-    const opus_int16            LTPCoef_Q14[ LTP_ORDER * MAX_NB_SUBFR ],    /* I    Long term prediction coefs      */
-    const opus_int16            AR_Q13[ MAX_NB_SUBFR * MAX_SHAPE_LPC_ORDER ], /* I  Noise shaping coefs             */
-    const opus_int              HarmShapeGain_Q14[ MAX_NB_SUBFR ],          /* I    Long term shaping coefs         */
-    const opus_int              Tilt_Q14[ MAX_NB_SUBFR ],                   /* I    Spectral tilt                   */
-    const opus_int32            LF_shp_Q14[ MAX_NB_SUBFR ],                 /* I    Low frequency shaping coefs     */
-    const opus_int32            Gains_Q16[ MAX_NB_SUBFR ],                  /* I    Quantization step sizes         */
-    const opus_int              pitchL[ MAX_NB_SUBFR ],                     /* I    Pitch lags                      */
-    const opus_int              Lambda_Q10,                                 /* I    Rate/distortion tradeoff        */
-    const opus_int              LTP_scale_Q14                               /* I    LTP state scaling               */
+    const silk_encoder_state    *psEncC,                                      /* I    Encoder State                   */
+    silk_nsq_state              *NSQ,                                         /* I/O  NSQ state                       */
+    SideInfoIndices             *psIndices,                                   /* I/O  Quantization Indices            */
+    const opus_int16            x16[],                                        /* I    Input                           */
+    opus_int8                   pulses[],                                     /* O    Quantized pulse signal          */
+    const opus_int16            *PredCoef_Q12,                                /* I    Short term prediction coefs     */
+    const opus_int16            LTPCoef_Q14[ LTP_ORDER * MAX_NB_SUBFR ],      /* I    Long term prediction coefs      */
+    const opus_int16            AR_Q13[ MAX_NB_SUBFR * MAX_SHAPE_LPC_ORDER ], /* I    Noise shaping coefs             */
+    const opus_int              HarmShapeGain_Q14[ MAX_NB_SUBFR ],            /* I    Long term shaping coefs         */
+    const opus_int              Tilt_Q14[ MAX_NB_SUBFR ],                     /* I    Spectral tilt                   */
+    const opus_int32            LF_shp_Q14[ MAX_NB_SUBFR ],                   /* I    Low frequency shaping coefs     */
+    const opus_int32            Gains_Q16[ MAX_NB_SUBFR ],                    /* I    Quantization step sizes         */
+    const opus_int              pitchL[ MAX_NB_SUBFR ],                       /* I    Pitch lags                      */
+    const opus_int              Lambda_Q10,                                   /* I    Rate/distortion tradeoff        */
+    const opus_int              LTP_scale_Q14                                 /* I    LTP state scaling               */
 );
 
 #if !defined(OVERRIDE_silk_NSQ)
@@ -273,21 +273,21 @@ void silk_NSQ_c(
 
 /* Noise shaping using delayed decision */
 void silk_NSQ_del_dec_c(
-    const silk_encoder_state    *psEncC,                                    /* I    Encoder State                   */
-    silk_nsq_state              *NSQ,                                       /* I/O  NSQ state                       */
-    SideInfoIndices             *psIndices,                                 /* I/O  Quantization Indices            */
-    const opus_int16            x16[],                                      /* I    Input                           */
-    opus_int8                   pulses[],                                   /* O    Quantized pulse signal          */
-    const opus_int16            PredCoef_Q12[ 2 * MAX_LPC_ORDER ],          /* I    Short term prediction coefs     */
-    const opus_int16            LTPCoef_Q14[ LTP_ORDER * MAX_NB_SUBFR ],    /* I    Long term prediction coefs      */
-    const opus_int16            AR_Q13[ MAX_NB_SUBFR * MAX_SHAPE_LPC_ORDER ], /* I  Noise shaping coefs             */
-    const opus_int              HarmShapeGain_Q14[ MAX_NB_SUBFR ],          /* I    Long term shaping coefs         */
-    const opus_int              Tilt_Q14[ MAX_NB_SUBFR ],                   /* I    Spectral tilt                   */
-    const opus_int32            LF_shp_Q14[ MAX_NB_SUBFR ],                 /* I    Low frequency shaping coefs     */
-    const opus_int32            Gains_Q16[ MAX_NB_SUBFR ],                  /* I    Quantization step sizes         */
-    const opus_int              pitchL[ MAX_NB_SUBFR ],                     /* I    Pitch lags                      */
-    const opus_int              Lambda_Q10,                                 /* I    Rate/distortion tradeoff        */
-    const opus_int              LTP_scale_Q14                               /* I    LTP state scaling               */
+    const silk_encoder_state    *psEncC,                                      /* I    Encoder State                   */
+    silk_nsq_state              *NSQ,                                         /* I/O  NSQ state                       */
+    SideInfoIndices             *psIndices,                                   /* I/O  Quantization Indices            */
+    const opus_int16            x16[],                                        /* I    Input                           */
+    opus_int8                   pulses[],                                     /* O    Quantized pulse signal          */
+    const opus_int16            *PredCoef_Q12,                                /* I    Short term prediction coefs     */
+    const opus_int16            LTPCoef_Q14[ LTP_ORDER * MAX_NB_SUBFR ],      /* I    Long term prediction coefs      */
+    const opus_int16            AR_Q13[ MAX_NB_SUBFR * MAX_SHAPE_LPC_ORDER ], /* I    Noise shaping coefs             */
+    const opus_int              HarmShapeGain_Q14[ MAX_NB_SUBFR ],            /* I    Long term shaping coefs         */
+    const opus_int              Tilt_Q14[ MAX_NB_SUBFR ],                     /* I    Spectral tilt                   */
+    const opus_int32            LF_shp_Q14[ MAX_NB_SUBFR ],                   /* I    Low frequency shaping coefs     */
+    const opus_int32            Gains_Q16[ MAX_NB_SUBFR ],                    /* I    Quantization step sizes         */
+    const opus_int              pitchL[ MAX_NB_SUBFR ],                       /* I    Pitch lags                      */
+    const opus_int              Lambda_Q10,                                   /* I    Rate/distortion tradeoff        */
+    const opus_int              LTP_scale_Q14                                 /* I    LTP state scaling               */
 );
 
 #if !defined(OVERRIDE_silk_NSQ_del_dec)
@@ -389,6 +389,10 @@ void silk_NLSF_decode(
 /****************************************************/
 /* Decoder Functions                                */
 /****************************************************/
+opus_int silk_reset_decoder(
+    silk_decoder_state          *psDec                          /* I/O  Decoder state pointer                       */
+);
+
 opus_int silk_init_decoder(
     silk_decoder_state          *psDec                          /* I/O  Decoder state pointer                       */
 );
@@ -410,6 +414,12 @@ opus_int silk_decode_frame(
     opus_int32                  *pN,                            /* O    Pointer to size of output frame             */
     opus_int                    lostFlag,                       /* I    0: no loss, 1 loss, 2 decode fec            */
     opus_int                    condCoding,                     /* I    The type of conditional coding to use       */
+#ifdef ENABLE_DEEP_PLC
+    LPCNetPLCState              *lpcnet,
+#endif
+#ifdef ENABLE_OSCE
+    OSCEModel                   *osce_model,
+#endif
     int                         arch                            /* I    Run-time architecture                       */
 );
 
diff --git a/opus/silk/mips/NSQ_del_dec_mipsr1.h b/opus/silk/mips/NSQ_del_dec_mipsr1.h
index cd70713a..85bfb637 100644
--- a/opus/silk/mips/NSQ_del_dec_mipsr1.h
+++ b/opus/silk/mips/NSQ_del_dec_mipsr1.h
@@ -25,8 +25,8 @@ ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 POSSIBILITY OF SUCH DAMAGE.
 ***********************************************************************/
 
-#ifndef __NSQ_DEL_DEC_MIPSR1_H__
-#define __NSQ_DEL_DEC_MIPSR1_H__
+#ifndef NSQ_DEL_DEC_MIPSR1_H__
+#define NSQ_DEL_DEC_MIPSR1_H__
 
 #ifdef HAVE_CONFIG_H
 #include "config.h"
@@ -407,4 +407,4 @@ static inline void silk_noise_shape_quantizer_del_dec(
     }
 }
 
-#endif /* __NSQ_DEL_DEC_MIPSR1_H__ */
+#endif /* NSQ_DEL_DEC_MIPSR1_H__ */
diff --git a/opus/silk/mips/macros_mipsr1.h b/opus/silk/mips/macros_mipsr1.h
index 12ed981a..af408802 100644
--- a/opus/silk/mips/macros_mipsr1.h
+++ b/opus/silk/mips/macros_mipsr1.h
@@ -26,8 +26,8 @@ POSSIBILITY OF SUCH DAMAGE.
 ***********************************************************************/
 
 
-#ifndef __SILK_MACROS_MIPSR1_H__
-#define __SILK_MACROS_MIPSR1_H__
+#ifndef SILK_MACROS_MIPSR1_H__
+#define SILK_MACROS_MIPSR1_H__
 
 #define mips_clz(x) __builtin_clz(x)
 
@@ -89,4 +89,4 @@ static inline opus_int32 silk_CLZ32(opus_int32 in32)
     return re32;
 }
 
-#endif /* __SILK_MACROS_MIPSR1_H__ */
+#endif /* SILK_MACROS_MIPSR1_H__ */
diff --git a/opus/silk/stereo_LR_to_MS.c b/opus/silk/stereo_LR_to_MS.c
index c8226663..751452cb 100644
--- a/opus/silk/stereo_LR_to_MS.c
+++ b/opus/silk/stereo_LR_to_MS.c
@@ -77,7 +77,7 @@ void silk_stereo_LR_to_MS(
     ALLOC( LP_mid, frame_length, opus_int16 );
     ALLOC( HP_mid, frame_length, opus_int16 );
     for( n = 0; n < frame_length; n++ ) {
-        sum = silk_RSHIFT_ROUND( silk_ADD_LSHIFT( mid[ n ] + (opus_int32)mid[ n + 2 ], mid[ n + 1 ], 1 ), 2 );
+        sum = silk_RSHIFT_ROUND( silk_ADD_LSHIFT32( mid[ n ] + (opus_int32)mid[ n + 2 ], mid[ n + 1 ], 1 ), 2 );
         LP_mid[ n ] = sum;
         HP_mid[ n ] = mid[ n + 1 ] - sum;
     }
@@ -86,7 +86,7 @@ void silk_stereo_LR_to_MS(
     ALLOC( LP_side, frame_length, opus_int16 );
     ALLOC( HP_side, frame_length, opus_int16 );
     for( n = 0; n < frame_length; n++ ) {
-        sum = silk_RSHIFT_ROUND( silk_ADD_LSHIFT( side[ n ] + (opus_int32)side[ n + 2 ], side[ n + 1 ], 1 ), 2 );
+        sum = silk_RSHIFT_ROUND( silk_ADD_LSHIFT32( side[ n ] + (opus_int32)side[ n + 2 ], side[ n + 1 ], 1 ), 2 );
         LP_side[ n ] = sum;
         HP_side[ n ] = side[ n + 1 ] - sum;
     }
@@ -207,7 +207,7 @@ void silk_stereo_LR_to_MS(
         pred0_Q13 += delta0_Q13;
         pred1_Q13 += delta1_Q13;
         w_Q24   += deltaw_Q24;
-        sum = silk_LSHIFT( silk_ADD_LSHIFT( mid[ n ] + (opus_int32)mid[ n + 2 ], mid[ n + 1 ], 1 ), 9 );    /* Q11 */
+        sum = silk_LSHIFT( silk_ADD_LSHIFT32( mid[ n ] + (opus_int32)mid[ n + 2 ], mid[ n + 1 ], 1 ), 9 );    /* Q11 */
         sum = silk_SMLAWB( silk_SMULWB( w_Q24, side[ n + 1 ] ), sum, pred0_Q13 );               /* Q8  */
         sum = silk_SMLAWB( sum, silk_LSHIFT( (opus_int32)mid[ n + 1 ], 11 ), pred1_Q13 );       /* Q8  */
         x2[ n - 1 ] = (opus_int16)silk_SAT16( silk_RSHIFT_ROUND( sum, 8 ) );
@@ -217,7 +217,7 @@ void silk_stereo_LR_to_MS(
     pred1_Q13 = -pred_Q13[ 1 ];
     w_Q24     =  silk_LSHIFT( width_Q14, 10 );
     for( n = STEREO_INTERP_LEN_MS * fs_kHz; n < frame_length; n++ ) {
-        sum = silk_LSHIFT( silk_ADD_LSHIFT( mid[ n ] + (opus_int32)mid[ n + 2 ], mid[ n + 1 ], 1 ), 9 );    /* Q11 */
+        sum = silk_LSHIFT( silk_ADD_LSHIFT32( mid[ n ] + (opus_int32)mid[ n + 2 ], mid[ n + 1 ], 1 ), 9 );    /* Q11 */
         sum = silk_SMLAWB( silk_SMULWB( w_Q24, side[ n + 1 ] ), sum, pred0_Q13 );               /* Q8  */
         sum = silk_SMLAWB( sum, silk_LSHIFT( (opus_int32)mid[ n + 1 ], 11 ), pred1_Q13 );       /* Q8  */
         x2[ n - 1 ] = (opus_int16)silk_SAT16( silk_RSHIFT_ROUND( sum, 8 ) );
diff --git a/opus/silk/stereo_MS_to_LR.c b/opus/silk/stereo_MS_to_LR.c
index 62521a4f..1e01bb6e 100644
--- a/opus/silk/stereo_MS_to_LR.c
+++ b/opus/silk/stereo_MS_to_LR.c
@@ -59,7 +59,7 @@ void silk_stereo_MS_to_LR(
     for( n = 0; n < STEREO_INTERP_LEN_MS * fs_kHz; n++ ) {
         pred0_Q13 += delta0_Q13;
         pred1_Q13 += delta1_Q13;
-        sum = silk_LSHIFT( silk_ADD_LSHIFT( x1[ n ] + x1[ n + 2 ], x1[ n + 1 ], 1 ), 9 );       /* Q11 */
+        sum = silk_LSHIFT( silk_ADD_LSHIFT32( x1[ n ] + (opus_int32)x1[ n + 2 ], x1[ n + 1 ], 1 ), 9 );       /* Q11 */
         sum = silk_SMLAWB( silk_LSHIFT( (opus_int32)x2[ n + 1 ], 8 ), sum, pred0_Q13 );         /* Q8  */
         sum = silk_SMLAWB( sum, silk_LSHIFT( (opus_int32)x1[ n + 1 ], 11 ), pred1_Q13 );        /* Q8  */
         x2[ n + 1 ] = (opus_int16)silk_SAT16( silk_RSHIFT_ROUND( sum, 8 ) );
@@ -67,7 +67,7 @@ void silk_stereo_MS_to_LR(
     pred0_Q13 = pred_Q13[ 0 ];
     pred1_Q13 = pred_Q13[ 1 ];
     for( n = STEREO_INTERP_LEN_MS * fs_kHz; n < frame_length; n++ ) {
-        sum = silk_LSHIFT( silk_ADD_LSHIFT( x1[ n ] + x1[ n + 2 ], x1[ n + 1 ], 1 ), 9 );       /* Q11 */
+        sum = silk_LSHIFT( silk_ADD_LSHIFT32( x1[ n ] + (opus_int32)x1[ n + 2 ], x1[ n + 1 ], 1 ), 9 );       /* Q11 */
         sum = silk_SMLAWB( silk_LSHIFT( (opus_int32)x2[ n + 1 ], 8 ), sum, pred0_Q13 );         /* Q8  */
         sum = silk_SMLAWB( sum, silk_LSHIFT( (opus_int32)x1[ n + 1 ], 11 ), pred1_Q13 );        /* Q8  */
         x2[ n + 1 ] = (opus_int16)silk_SAT16( silk_RSHIFT_ROUND( sum, 8 ) );
diff --git a/opus/silk/structs.h b/opus/silk/structs.h
index 3380c757..38243be1 100644
--- a/opus/silk/structs.h
+++ b/opus/silk/structs.h
@@ -34,6 +34,21 @@ POSSIBILITY OF SUCH DAMAGE.
 #include "entenc.h"
 #include "entdec.h"
 
+#ifdef ENABLE_DEEP_PLC
+#include "lpcnet.h"
+#include "lpcnet_private.h"
+#endif
+
+#ifdef ENABLE_DRED
+#include "dred_encoder.h"
+#include "dred_decoder.h"
+#endif
+
+#ifdef ENABLE_OSCE
+#include "osce_config.h"
+#include "osce_structs.h"
+#endif
+
 #ifdef __cplusplus
 extern "C"
 {
@@ -228,6 +243,14 @@ typedef struct {
 } silk_encoder_state;
 
 
+#ifdef ENABLE_OSCE
+typedef struct {
+    OSCEFeatureState features;
+    OSCEState state;
+    int method;
+} silk_OSCE_struct;
+#endif
+
 /* Struct for Packet Loss Concealment */
 typedef struct {
     opus_int32                  pitchL_Q8;                          /* Pitch lag to use for voiced concealment                          */
@@ -243,6 +266,7 @@ typedef struct {
     opus_int                    fs_kHz;
     opus_int                    nb_subfr;
     opus_int                    subfr_length;
+    opus_int                    enable_deep_plc;
 } silk_PLC_struct;
 
 /* Struct for CNG */
@@ -259,6 +283,10 @@ typedef struct {
 /* Decoder state                */
 /********************************/
 typedef struct {
+#ifdef ENABLE_OSCE
+    silk_OSCE_struct            osce;
+#endif
+#define SILK_DECODER_STATE_RESET_START prev_gain_Q16
     opus_int32                  prev_gain_Q16;
     opus_int32                  exc_Q14[ MAX_FRAME_LENGTH ];
     opus_int32                  sLPC_Q14_buf[ MAX_LPC_ORDER ];
diff --git a/opus/silk/tests/test_unit_LPC_inv_pred_gain.c b/opus/silk/tests/test_unit_LPC_inv_pred_gain.c
index 67067cea..acdd31af 100644
--- a/opus/silk/tests/test_unit_LPC_inv_pred_gain.c
+++ b/opus/silk/tests/test_unit_LPC_inv_pred_gain.c
@@ -43,6 +43,7 @@ int check_stability(opus_int16 *A_Q12, int order) {
     int i;
     int j;
     int sum_a, sum_abs_a;
+    double y[SILK_MAX_ORDER_LPC] = {0};
     sum_a = sum_abs_a = 0;
     for( j = 0; j < order; j++ ) {
         sum_a += A_Q12[ j ];
@@ -57,7 +58,6 @@ int check_stability(opus_int16 *A_Q12, int order) {
     if( sum_abs_a < 4096 ) {
         return 1;
     }
-    double y[SILK_MAX_ORDER_LPC] = {0};
     y[0] = 1;
     for( i = 0; i < 10000; i++ ) {
         double sum = 0;
@@ -125,5 +125,6 @@ int main(void) {
         }
     }
     printf("silk_LPC_inverse_pred_gain() optimization passed\n");
+    RESTORE_STACK;
     return 0;
 }
diff --git a/opus/silk/typedef.h b/opus/silk/typedef.h
index 97b7e709..793d2c0c 100644
--- a/opus/silk/typedef.h
+++ b/opus/silk/typedef.h
@@ -67,6 +67,9 @@ __attribute__((noreturn))
 static OPUS_INLINE void _silk_fatal(const char *str, const char *file, int line)
 {
    fprintf (stderr, "Fatal (internal) error in %s, line %d: %s\n", file, line, str);
+#if defined(_MSC_VER)
+   _set_abort_behavior( 0, _WRITE_ABORT_MSG);
+#endif
    abort();
 }
 #  define silk_assert(COND) {if (!(COND)) {silk_fatal("assertion failed: " #COND);}}
diff --git a/opus/silk/x86/NSQ_del_dec_avx2.c b/opus/silk/x86/NSQ_del_dec_avx2.c
new file mode 100644
index 00000000..43485871
--- /dev/null
+++ b/opus/silk/x86/NSQ_del_dec_avx2.c
@@ -0,0 +1,1075 @@
+/***********************************************************************
+Copyright (c) 2021 Google Inc.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+- Redistributions of source code must retain the above copyright notice,
+this list of conditions and the following disclaimer.
+- Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+- Neither the name of Internet Society, IETF or IETF Trust, nor the
+names of specific contributors, may be used to endorse or promote
+products derived from this software without specific prior written
+permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+POSSIBILITY OF SUCH DAMAGE.
+***********************************************************************/
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#ifdef OPUS_CHECK_ASM
+#include <string.h>
+#endif
+
+#include "opus_defines.h"
+#include <immintrin.h>
+
+#include "main.h"
+#include "stack_alloc.h"
+#include "NSQ.h"
+#include "celt/x86/x86cpu.h"
+
+/* Asserts the assumptions of this AVX2 path; returns nonzero only if nStatesDelayedDecision is 3 or 4 */
+static OPUS_INLINE int verify_assumptions(const silk_encoder_state *psEncC)
+{
+    /* This optimization relies on the assumptions below.     */
+    /* They are fundamental, hence asserts are used. Should   */
+    /* any assert trigger, all related code must be revisited */
+    /* to make sure it still behaves the same as the C        */
+    /* implementation.                                        */
+    silk_assert(MAX_DEL_DEC_STATES  <= 4      &&
+                MAX_FRAME_LENGTH     % 4 == 0 &&
+                MAX_SUB_FRAME_LENGTH % 4 == 0 &&
+                LTP_MEM_LENGTH_MS    % 4 == 0 );
+    silk_assert(psEncC->fs_kHz ==  8 ||
+                psEncC->fs_kHz == 12 ||
+                psEncC->fs_kHz == 16 );
+    silk_assert(psEncC->nb_subfr <= MAX_NB_SUBFR &&
+                psEncC->nb_subfr > 0             );
+    silk_assert(psEncC->nStatesDelayedDecision <= MAX_DEL_DEC_STATES &&
+                psEncC->nStatesDelayedDecision > 0                   );
+    silk_assert(psEncC->ltp_mem_length == psEncC->fs_kHz * LTP_MEM_LENGTH_MS);
+
+    /* Regressions were observed on certain AMD Zen CPUs when      */
+    /* nStatesDelayedDecision is 1 or 2. Ideally we should detect  */
+    /* these CPUs and enable this optimization on others; however, */
+    /* there is no good way to do so under current OPUS framework. */
+    return psEncC->nStatesDelayedDecision == 3 ||
+           psEncC->nStatesDelayedDecision == 4;
+}
+
+/* Stand-ins for the GCC/Clang builtins and type used below, which MSVC does not provide */
+#ifdef _MSC_VER
+#include <Intsafe.h>
+#define __m128i_u __m128i
+static inline int __builtin_sadd_overflow(opus_int32 a, opus_int32 b, opus_int32* res)
+{
+    *res = a+b;
+    return (*res ^ a) & (*res ^ b) & 0x80000000; /* sign bit set iff the sum's sign differs from both a and b */
+}
+static inline int __builtin_ctz(unsigned int x)
+{
+    DWORD res = 0;
+    return _BitScanForward(&res, x) ? res : 32; /* 32 if _BitScanForward finds no set bit */
+}
+#endif
+
+static OPUS_INLINE __m128i silk_cvtepi64_epi32_high(__m256i num)
+{
+    return _mm256_castsi256_si128(_mm256_permutevar8x32_epi32(num, _mm256_set_epi32(0, 0, 0, 0, 7, 5, 3, 1)));
+}
+
+static OPUS_INLINE opus_int16 silk_sat16(opus_int32 num)
+{
+    num = num > silk_int16_MAX ? silk_int16_MAX : num;
+    num = num < silk_int16_MIN ? silk_int16_MIN : num;
+    return num;
+}
+
+static OPUS_INLINE opus_int32 silk_sar_round_32(opus_int32 a, int bits)
+{
+    silk_assert(bits > 0 && bits < 31);
+    a += 1 << (bits-1);
+    return a >> bits;
+}
+
+static OPUS_INLINE opus_int64 silk_sar_round_smulww(opus_int32 a, opus_int32 b, int bits)
+{
+    silk_assert(bits > 0 && bits < 63);
+#ifdef OPUS_CHECK_ASM
+    return silk_RSHIFT_ROUND(silk_SMULWW(a, b), bits); /* reuse the C macros when OPUS_CHECK_ASM is defined */
+#else
+    /* This 64-bit code is more correct, but unlike the C code it does not overflow in some rare cases. */
+    silk_assert(bits > 0 && bits < 63);
+    opus_int64 t = ((opus_int64)a) * ((opus_int64)b);
+    bits += 16; /* fold the >> 16 of SMULWW into the final shift */
+    t += 1ull << (bits-1); /* rounding offset: half of the shifted-out range */
+    return t >> bits;
+#endif
+}
+
+static OPUS_INLINE opus_int32 silk_add_sat32(opus_int32 a, opus_int32 b)
+{
+    opus_int32 sum;
+    if (__builtin_sadd_overflow(a, b, &sum))
+    {
+        return a >= 0 ? silk_int32_MAX : silk_int32_MIN;
+    }
+    return sum;
+}
+
+static OPUS_INLINE __m128i silk_mm_srai_round_epi32(__m128i a, int bits)
+{
+    silk_assert(bits > 0 && bits < 31);
+    return _mm_srai_epi32(_mm_add_epi32(a, _mm_set1_epi32(1 << (bits - 1))), bits);
+}
+
+/* add/subtract with output saturated */
+static OPUS_INLINE __m128i silk_mm_add_sat_epi32(__m128i a, __m128i b)
+{
+    __m128i r = _mm_add_epi32(a, b);
+    __m128i OF = _mm_and_si128(_mm_xor_si128(a, r), _mm_xor_si128(b, r));           /* OF = (sum ^ a) & (sum ^ b)   */
+    __m128i SAT = _mm_add_epi32(_mm_srli_epi32(a, 31), _mm_set1_epi32(0x7FFFFFFF)); /* SAT = (a >> 31) + 0x7FFFFFFF */
+    return _mm_blendv_epi8(r, SAT, _mm_srai_epi32(OF, 31));
+}
+static OPUS_INLINE __m128i silk_mm_sub_sat_epi32(__m128i a, __m128i b)
+{
+    __m128i r = _mm_sub_epi32(a, b);
+    __m128i OF = _mm_andnot_si128(_mm_xor_si128(b, r), _mm_xor_si128(a, r));        /* OF = (sum ^ a) & (sum ^ ~b) = (sum ^ a) & ~(sum ^ b) */
+    __m128i SAT = _mm_add_epi32(_mm_srli_epi32(a, 31), _mm_set1_epi32(0x7FFFFFFF)); /* SAT = (a >> 31) + 0x7FFFFFFF                         */
+    return _mm_blendv_epi8(r, SAT, _mm_srai_epi32(OF, 31));
+}
+static OPUS_INLINE __m256i silk_mm256_sub_sat_epi32(__m256i a, __m256i b)
+{
+    __m256i r = _mm256_sub_epi32(a, b);
+    __m256i OF = _mm256_andnot_si256(_mm256_xor_si256(b, r), _mm256_xor_si256(a, r));        /* OF = (sum ^ a) & (sum ^ ~b) = (sum ^ a) & ~(sum ^ b) */
+    __m256i SAT = _mm256_add_epi32(_mm256_srli_epi32(a, 31), _mm256_set1_epi32(0x7FFFFFFF)); /* SAT = (a >> 31) + 0x7FFFFFFF                         */
+    return _mm256_blendv_epi8(r, SAT, _mm256_srai_epi32(OF, 31));
+}
+
+static OPUS_INLINE __m128i silk_mm_limit_epi32(__m128i num, opus_int32 limit1, opus_int32 limit2)
+{
+    opus_int32 lo = limit1 < limit2 ? limit1 : limit2;
+    opus_int32 hi = limit1 > limit2 ? limit1 : limit2;
+
+    num = _mm_min_epi32(num, _mm_set1_epi32(hi));
+    num = _mm_max_epi32(num, _mm_set1_epi32(lo));
+    return num;
+}
+
+/* cond < 0 ? -num : num */
+static OPUS_INLINE __m128i silk_mm_sign_epi32(__m128i num, __m128i cond)
+{
+    return _mm_sign_epi32(num, _mm_or_si128(cond, _mm_set1_epi32(1)));
+}
+static OPUS_INLINE __m256i silk_mm256_sign_epi32(__m256i num, __m256i cond)
+{
+    return _mm256_sign_epi32(num, _mm256_or_si256(cond, _mm256_set1_epi32(1)));
+}
+
+/* (a32 * b32) >> 16 */
+static OPUS_INLINE __m128i silk_mm_smulww_epi32(__m128i a, opus_int32 b)
+{
+    return silk_cvtepi64_epi32_high(_mm256_slli_epi64(_mm256_mul_epi32(_mm256_cvtepi32_epi64(a), _mm256_set1_epi32(b)), 16));
+}
+
+/* (a32 * (opus_int32)((opus_int16)(b32))) >> 16; the output has to be a 32-bit int */
+static OPUS_INLINE __m128i silk_mm_smulwb_epi32(__m128i a, opus_int32 b)
+{
+    return silk_cvtepi64_epi32_high(_mm256_mul_epi32(_mm256_cvtepi32_epi64(a), _mm256_set1_epi32(silk_LSHIFT(b, 16))));
+}
+
+/* (opus_int32)((opus_int16)(a32)) * (opus_int32)((opus_int16)(b32)); the output has to be a 32-bit int */
+static OPUS_INLINE __m256i silk_mm256_smulbb_epi32(__m256i a, __m256i b)
+{
+    const char FF = (char)0xFF; /* shuffle index with the top bit set: that output byte is zeroed */
+    __m256i msk = _mm256_set_epi8(
+        FF, FF, FF, FF, FF, FF, FF, FF, 13, 12, 9, 8, 5, 4, 1, 0,
+        FF, FF, FF, FF, FF, FF, FF, FF, 13, 12, 9, 8, 5, 4, 1, 0);
+    __m256i lo = _mm256_mullo_epi16(a, b);
+    __m256i hi = _mm256_mulhi_epi16(a, b);
+    lo = _mm256_shuffle_epi8(lo, msk); /* keep the 16-bit result at the bottom of each 32-bit element */
+    hi = _mm256_shuffle_epi8(hi, msk);
+    return _mm256_unpacklo_epi16(lo, hi); /* interleave low/high halves into 32-bit products */
+}
+
+static OPUS_INLINE __m256i silk_mm256_reverse_epi32(__m256i v)
+{
+    v = _mm256_shuffle_epi32(v, 0x1B);
+    v = _mm256_permute4x64_epi64(v, 0x4E);
+    return v;
+}
+
+static OPUS_INLINE opus_int32 silk_mm256_hsum_epi32(__m256i v)
+{
+    __m128i sum = _mm_add_epi32(_mm256_extracti128_si256(v, 1), _mm256_extracti128_si256(v, 0));
+    sum = _mm_add_epi32(sum, _mm_shuffle_epi32(sum, 0x4E));
+    sum = _mm_add_epi32(sum, _mm_shuffle_epi32(sum, 0xB1));
+    return _mm_cvtsi128_si32(sum);
+}
+
+static OPUS_INLINE __m128i silk_mm_hmin_epi32(__m128i num)
+{
+    num = _mm_min_epi32(num, _mm_shuffle_epi32(num, 0x4E)); /* 0123 -> 2301 */
+    num = _mm_min_epi32(num, _mm_shuffle_epi32(num, 0xB1)); /* 0123 -> 1032 */
+    return num;
+}
+
+static OPUS_INLINE __m128i silk_mm_hmax_epi32(__m128i num)
+{
+    num = _mm_max_epi32(num, _mm_shuffle_epi32(num, 0x4E)); /* 0123 -> 2301 */
+    num = _mm_max_epi32(num, _mm_shuffle_epi32(num, 0xB1)); /* 0123 -> 1032 */
+    return num; /* every lane now holds the maximum of the four inputs */
+}
+
+static OPUS_INLINE __m128i silk_mm_mask_hmin_epi32(__m128i num, __m128i mask)
+{
+    num = _mm_blendv_epi8(num, _mm_set1_epi32(silk_int32_MAX), mask);
+    return silk_mm_hmin_epi32(num);
+}
+
+static OPUS_INLINE __m128i silk_mm_mask_hmax_epi32(__m128i num, __m128i mask)
+{
+    num = _mm_blendv_epi8(num, _mm_set1_epi32(silk_int32_MIN), mask);
+    return silk_mm_hmax_epi32(num);
+}
+
+static OPUS_INLINE __m128i silk_mm256_rand_epi32(__m128i seed)
+{
+    seed = _mm_mullo_epi32(seed, _mm_set1_epi32(RAND_MULTIPLIER));
+    seed = _mm_add_epi32(seed, _mm_set1_epi32(RAND_INCREMENT));
+    return seed;
+}
+
+static OPUS_INLINE opus_int32 silk_index_of_first_equal_epi32(__m128i a, __m128i b)
+{
+    unsigned int mask = _mm_movemask_epi8(_mm_cmpeq_epi32(a, b)) & 0x1111;
+    silk_assert(mask != 0);
+    return __builtin_ctz(mask) >> 2;
+}
+
+static __m128i silk_index_to_selector(opus_int32 index)
+{
+    silk_assert(index < 4);
+    index <<= 2;
+    return _mm_set_epi8(
+        index + 3, index + 2, index + 1, index + 0,
+        index + 3, index + 2, index + 1, index + 0,
+        index + 3, index + 2, index + 1, index + 0,
+        index + 3, index + 2, index + 1, index + 0);
+}
+
+static opus_int32 silk_select_winner(__m128i num, __m128i selector)
+{
+    return _mm_cvtsi128_si32(_mm_shuffle_epi8(num, selector));
+}
+
+typedef struct
+{
+    __m128i RandState;
+    __m128i Q_Q10;
+    __m128i Xq_Q14;
+    __m128i Pred_Q15;
+    __m128i Shape_Q14;
+} NSQ_del_dec_sample_struct;
+
+typedef struct
+{
+    __m128i sLPC_Q14[MAX_SUB_FRAME_LENGTH + NSQ_LPC_BUF_LENGTH];
+    __m128i LF_AR_Q14;
+    __m128i Seed;
+    __m128i SeedInit;
+    __m128i RD_Q10;
+    __m128i Diff_Q14;
+    __m128i sAR2_Q14[MAX_SHAPE_LPC_ORDER];
+    NSQ_del_dec_sample_struct Samples[DECISION_DELAY];
+} NSQ_del_dec_struct;
+
+static OPUS_INLINE void silk_nsq_del_dec_scale_states_avx2(
+    const silk_encoder_state *psEncC,          /* I    Encoder State                   */
+    silk_nsq_state *NSQ,                       /* I/O  NSQ state                       */
+    NSQ_del_dec_struct *psDelDec,              /* I/O  Delayed decision states         */
+    const opus_int16 x16[],                    /* I    Input                           */
+    opus_int32 x_sc_Q10[MAX_SUB_FRAME_LENGTH], /* O    Input scaled with 1/Gain in Q10 */
+    const opus_int16 sLTP[],                   /* I    Re-whitened LTP state in Q0     */
+    opus_int32 sLTP_Q15[],                     /* O    LTP state matching scaled input */
+    opus_int subfr,                            /* I    Subframe number                 */
+    const opus_int LTP_scale_Q14,              /* I    LTP state scaling               */
+    const opus_int32 Gains_Q16[MAX_NB_SUBFR],  /* I                                    */
+    const opus_int pitchL[MAX_NB_SUBFR],       /* I    Pitch lag                       */
+    const opus_int signal_type,                /* I    Signal type                     */
+    const opus_int decisionDelay               /* I    Decision delay                  */
+);
+
+/*******************************************/
+/* LPC analysis filter                     */
+/* NB! State is kept internally and the    */
+/* filter always starts with zero state    */
+/* first d output samples are set to zero  */
+/*******************************************/
+static OPUS_INLINE void silk_LPC_analysis_filter_avx2(
+    opus_int16                  *out,               /* O    Output signal                           */
+    const opus_int16            *in,                /* I    Input signal                            */
+    const opus_int16            *B,                 /* I    MA prediction coefficients, Q12 [order] */
+    const opus_int32            len,                /* I    Signal length                           */
+    const opus_int32            order               /* I    Filter order                            */
+);
+
+/******************************************/
+/* Noise shape quantizer for one subframe */
+/******************************************/
+static OPUS_INLINE void silk_noise_shape_quantizer_del_dec_avx2(
+    silk_nsq_state *NSQ,                        /* I/O  NSQ state                          */
+    NSQ_del_dec_struct psDelDec[],              /* I/O  Delayed decision states            */
+    opus_int signalType,                        /* I    Signal type                        */
+    const opus_int32 x_Q10[],                   /* I                                       */
+    opus_int8 pulses[],                         /* O                                       */
+    opus_int16 xq[],                            /* O                                       */
+    opus_int32 sLTP_Q15[],                      /* I/O  LTP filter state                   */
+    opus_int32 delayedGain_Q10[DECISION_DELAY], /* I/O  Gain delay buffer                  */
+    const opus_int16 a_Q12[],                   /* I    Short term prediction coefs        */
+    const opus_int16 b_Q14[],                   /* I    Long term prediction coefs         */
+    const opus_int16 AR_shp_Q13[],              /* I    Noise shaping coefs                */
+    opus_int lag,                               /* I    Pitch lag                          */
+    opus_int32 HarmShapeFIRPacked_Q14,          /* I                                       */
+    opus_int Tilt_Q14,                          /* I    Spectral tilt                      */
+    opus_int32 LF_shp_Q14,                      /* I                                       */
+    opus_int32 Gain_Q16,                        /* I                                       */
+    opus_int Lambda_Q10,                        /* I                                       */
+    opus_int offset_Q10,                        /* I                                       */
+    opus_int length,                            /* I    Input length                       */
+    opus_int subfr,                             /* I    Subframe number                    */
+    opus_int shapingLPCOrder,                   /* I    Shaping LPC filter order           */
+    opus_int predictLPCOrder,                   /* I    Prediction filter order            */
+    opus_int warping_Q16,                       /* I                                       */
+    __m128i MaskDelDec,                         /* I    Mask of states in decision tree    */
+    opus_int *smpl_buf_idx,                     /* I/O  Index to newest samples in buffers */
+    opus_int decisionDelay                      /* I                                       */
+);
+
+void silk_NSQ_del_dec_avx2(
+    const silk_encoder_state *psEncC,                            /* I    Encoder State               */
+    silk_nsq_state *NSQ,                                         /* I/O  NSQ state                   */
+    SideInfoIndices *psIndices,                                  /* I/O  Quantization Indices        */
+    const opus_int16 x16[],                                      /* I    Input                       */
+    opus_int8 pulses[],                                          /* O    Quantized pulse signal      */
+    const opus_int16 *PredCoef_Q12,                              /* I    Short term prediction coefs */
+    const opus_int16 LTPCoef_Q14[LTP_ORDER * MAX_NB_SUBFR],      /* I    Long term prediction coefs  */
+    const opus_int16 AR_Q13[MAX_NB_SUBFR * MAX_SHAPE_LPC_ORDER], /* I    Noise shaping coefs         */
+    const opus_int HarmShapeGain_Q14[MAX_NB_SUBFR],              /* I    Long term shaping coefs     */
+    const opus_int Tilt_Q14[MAX_NB_SUBFR],                       /* I    Spectral tilt               */
+    const opus_int32 LF_shp_Q14[MAX_NB_SUBFR],                   /* I    Low frequency shaping coefs */
+    const opus_int32 Gains_Q16[MAX_NB_SUBFR],                    /* I    Quantization step sizes     */
+    const opus_int32 pitchL[MAX_NB_SUBFR],                       /* I    Pitch lags                  */
+    const opus_int Lambda_Q10,                                   /* I    Rate/distortion tradeoff    */
+    const opus_int LTP_scale_Q14                                 /* I    LTP state scaling           */
+)
+{
+#ifdef OPUS_CHECK_ASM
+    silk_nsq_state NSQ_c;
+    SideInfoIndices psIndices_c;
+    opus_int8 pulses_c[MAX_FRAME_LENGTH];
+    const opus_int8 *const pulses_a = pulses;
+
+    silk_memcpy(&NSQ_c, NSQ, sizeof(NSQ_c));
+    silk_memcpy(&psIndices_c, psIndices, sizeof(psIndices_c));
+    silk_memcpy(pulses_c, pulses, sizeof(pulses_c));
+    silk_NSQ_del_dec_c(psEncC, &NSQ_c, &psIndices_c, x16, pulses_c, PredCoef_Q12, LTPCoef_Q14, AR_Q13, HarmShapeGain_Q14, Tilt_Q14, LF_shp_Q14, Gains_Q16,
+                       pitchL, Lambda_Q10, LTP_scale_Q14);
+#endif
+
+    if (!verify_assumptions(psEncC))
+    {
+        silk_NSQ_del_dec_c(psEncC, NSQ, psIndices, x16, pulses, PredCoef_Q12, LTPCoef_Q14, AR_Q13, HarmShapeGain_Q14, Tilt_Q14, LF_shp_Q14, Gains_Q16, pitchL, Lambda_Q10, LTP_scale_Q14);
+        return;
+    }
+
+    opus_int i, k, lag, start_idx, LSF_interpolation_flag, Winner_ind, subfr;
+    opus_int last_smple_idx, smpl_buf_idx, decisionDelay;
+    const opus_int16 *A_Q12, *B_Q14, *AR_shp_Q13;
+    opus_int16 *pxq;
+    VARDECL(opus_int32, sLTP_Q15);
+    VARDECL(opus_int16, sLTP);
+    opus_int32 HarmShapeFIRPacked_Q14;
+    opus_int offset_Q10;
+    opus_int32 Gain_Q10;
+    opus_int32 x_sc_Q10[MAX_SUB_FRAME_LENGTH];
+    opus_int32 delayedGain_Q10[DECISION_DELAY];
+    NSQ_del_dec_struct psDelDec = {0};
+    NSQ_del_dec_sample_struct *psSample;
+    __m128i RDmin_Q10, MaskDelDec, Winner_selector;
+    SAVE_STACK;
+
+    MaskDelDec = _mm_cvtepi8_epi32(_mm_cvtsi32_si128(0xFFFFFF00ul << ((psEncC->nStatesDelayedDecision - 1) << 3)));
+
+    /* Set unvoiced lag to the previous one, overwrite later for voiced */
+    lag = NSQ->lagPrev;
+
+    silk_assert(NSQ->prev_gain_Q16 != 0);
+    psDelDec.Seed = _mm_and_si128(
+        _mm_add_epi32(_mm_set_epi32(3, 2, 1, 0), _mm_set1_epi32(psIndices->Seed)),
+        _mm_set1_epi32(3));
+    psDelDec.SeedInit = psDelDec.Seed;
+    psDelDec.RD_Q10 = _mm_setzero_si128();
+    psDelDec.LF_AR_Q14 = _mm_set1_epi32(NSQ->sLF_AR_shp_Q14);
+    psDelDec.Diff_Q14 = _mm_set1_epi32(NSQ->sDiff_shp_Q14);
+    psDelDec.Samples[0].Shape_Q14 = _mm_set1_epi32(NSQ->sLTP_shp_Q14[psEncC->ltp_mem_length - 1]);
+    for (i = 0; i < NSQ_LPC_BUF_LENGTH; i++)
+    {
+        psDelDec.sLPC_Q14[i] = _mm_set1_epi32(NSQ->sLPC_Q14[i]);
+    }
+    for (i = 0; i < MAX_SHAPE_LPC_ORDER; i++)
+    {
+        psDelDec.sAR2_Q14[i] = _mm_set1_epi32(NSQ->sAR2_Q14[i]);
+    }
+
+    offset_Q10 = silk_Quantization_Offsets_Q10[psIndices->signalType >> 1][psIndices->quantOffsetType];
+    smpl_buf_idx = 0; /* index of oldest samples */
+
+    decisionDelay = silk_min_int(DECISION_DELAY, psEncC->subfr_length);
+
+    /* For voiced frames limit the decision delay to lower than the pitch lag */
+    if (psIndices->signalType == TYPE_VOICED)
+    {
+        for (k = 0; k < psEncC->nb_subfr; k++)
+        {
+            decisionDelay = silk_min_int(decisionDelay, pitchL[k] - LTP_ORDER / 2 - 1);
+        }
+    }
+    else
+    {
+        if (lag > 0)
+        {
+            decisionDelay = silk_min_int(decisionDelay, lag - LTP_ORDER / 2 - 1);
+        }
+    }
+
+    if (psIndices->NLSFInterpCoef_Q2 == 4)
+    {
+        LSF_interpolation_flag = 0;
+    }
+    else
+    {
+        LSF_interpolation_flag = 1;
+    }
+
+    ALLOC(sLTP_Q15, psEncC->ltp_mem_length + psEncC->frame_length, opus_int32);
+    ALLOC(sLTP, psEncC->ltp_mem_length + psEncC->frame_length, opus_int16);
+    /* Set up pointers to start of sub frame */
+    pxq = &NSQ->xq[psEncC->ltp_mem_length];
+    NSQ->sLTP_shp_buf_idx = psEncC->ltp_mem_length;
+    NSQ->sLTP_buf_idx = psEncC->ltp_mem_length;
+    subfr = 0;
+    for (k = 0; k < psEncC->nb_subfr; k++)
+    {
+        A_Q12 = &PredCoef_Q12[((k >> 1) | (1 ^ LSF_interpolation_flag)) * MAX_LPC_ORDER];
+        B_Q14 = &LTPCoef_Q14[k * LTP_ORDER];
+        AR_shp_Q13 = &AR_Q13[k * MAX_SHAPE_LPC_ORDER];
+
+        /* Noise shape parameters */
+        silk_assert(HarmShapeGain_Q14[k] >= 0);
+        HarmShapeFIRPacked_Q14  =                          silk_RSHIFT( HarmShapeGain_Q14[ k ], 2 );
+        HarmShapeFIRPacked_Q14 |= silk_LSHIFT( (opus_int32)silk_RSHIFT( HarmShapeGain_Q14[ k ], 1 ), 16 );
+
+        NSQ->rewhite_flag = 0;
+        if (psIndices->signalType == TYPE_VOICED)
+        {
+            /* Voiced */
+            lag = pitchL[k];
+
+            /* Re-whitening */
+            if ((k & (3 ^ (LSF_interpolation_flag << 1))) == 0)
+            {
+                if (k == 2)
+                {
+                    /* RESET DELAYED DECISIONS */
+                    /* Find winner */
+                    RDmin_Q10 = silk_mm_mask_hmin_epi32(psDelDec.RD_Q10, MaskDelDec);
+                    Winner_ind = silk_index_of_first_equal_epi32(RDmin_Q10, psDelDec.RD_Q10);
+                    Winner_selector = silk_index_to_selector(Winner_ind);
+                    psDelDec.RD_Q10 = _mm_add_epi32(
+                        psDelDec.RD_Q10,
+                        _mm_blendv_epi8(
+                            _mm_set1_epi32(silk_int32_MAX >> 4),
+                            _mm_setzero_si128(),
+                            _mm_cvtepi8_epi32(_mm_cvtsi32_si128(0xFFU << (unsigned)(Winner_ind << 3)))));
+
+                    /* Copy final part of signals from winner state to output and long-term filter states */
+                    last_smple_idx = smpl_buf_idx + decisionDelay;
+                    for (i = 0; i < decisionDelay; i++)
+                    {
+                        last_smple_idx = (last_smple_idx + DECISION_DELAY - 1) % DECISION_DELAY;
+                        psSample = &psDelDec.Samples[last_smple_idx];
+                        pulses[i - decisionDelay] =
+                            (opus_int8)silk_sar_round_32(silk_select_winner(psSample->Q_Q10, Winner_selector), 10);
+                        pxq[i - decisionDelay] =
+                            silk_sat16((opus_int32)silk_sar_round_smulww(silk_select_winner(psSample->Xq_Q14, Winner_selector), Gains_Q16[1], 14));
+                        NSQ->sLTP_shp_Q14[NSQ->sLTP_shp_buf_idx - decisionDelay + i] =
+                            silk_select_winner(psSample->Shape_Q14, Winner_selector);
+                    }
+
+                    subfr = 0;
+                }
+
+                /* Rewhiten with new A coefs */
+                start_idx = psEncC->ltp_mem_length - lag - psEncC->predictLPCOrder - LTP_ORDER / 2;
+                silk_assert(start_idx > 0);
+
+                silk_LPC_analysis_filter_avx2(&sLTP[start_idx], &NSQ->xq[start_idx + k * psEncC->subfr_length],
+                                              A_Q12, psEncC->ltp_mem_length - start_idx, psEncC->predictLPCOrder);
+
+                NSQ->sLTP_buf_idx = psEncC->ltp_mem_length;
+                NSQ->rewhite_flag = 1;
+            }
+        }
+
+        silk_nsq_del_dec_scale_states_avx2(psEncC, NSQ, &psDelDec, x16, x_sc_Q10, sLTP, sLTP_Q15, k,
+                                           LTP_scale_Q14, Gains_Q16, pitchL, psIndices->signalType, decisionDelay);
+
+        silk_noise_shape_quantizer_del_dec_avx2(NSQ, &psDelDec, psIndices->signalType, x_sc_Q10, pulses, pxq, sLTP_Q15,
+                                                delayedGain_Q10, A_Q12, B_Q14, AR_shp_Q13, lag, HarmShapeFIRPacked_Q14, Tilt_Q14[k], LF_shp_Q14[k],
+                                                Gains_Q16[k], Lambda_Q10, offset_Q10, psEncC->subfr_length, subfr++, psEncC->shapingLPCOrder,
+                                                psEncC->predictLPCOrder, psEncC->warping_Q16, MaskDelDec, &smpl_buf_idx, decisionDelay);
+
+        x16 += psEncC->subfr_length;
+        pulses += psEncC->subfr_length;
+        pxq += psEncC->subfr_length;
+    }
+
+    /* Find winner */
+    RDmin_Q10 = silk_mm_mask_hmin_epi32(psDelDec.RD_Q10, MaskDelDec);
+    Winner_selector = silk_index_to_selector(silk_index_of_first_equal_epi32(RDmin_Q10, psDelDec.RD_Q10));
+
+    /* Copy final part of signals from winner state to output and long-term filter states */
+    psIndices->Seed = silk_select_winner(psDelDec.SeedInit, Winner_selector);
+    last_smple_idx = smpl_buf_idx + decisionDelay;
+    Gain_Q10 = Gains_Q16[psEncC->nb_subfr - 1] >> 6;
+    for (i = 0; i < decisionDelay; i++)
+    {
+        last_smple_idx = (last_smple_idx + DECISION_DELAY - 1) % DECISION_DELAY;
+        psSample = &psDelDec.Samples[last_smple_idx];
+
+        pulses[i - decisionDelay] =
+            (opus_int8)silk_sar_round_32(silk_select_winner(psSample->Q_Q10, Winner_selector), 10);
+        pxq[i - decisionDelay] =
+            silk_sat16((opus_int32)silk_sar_round_smulww(silk_select_winner(psSample->Xq_Q14, Winner_selector), Gain_Q10, 8));
+        NSQ->sLTP_shp_Q14[NSQ->sLTP_shp_buf_idx - decisionDelay + i] =
+            silk_select_winner(psSample->Shape_Q14, Winner_selector);
+    }
+    for (i = 0; i < NSQ_LPC_BUF_LENGTH; i++)
+    {
+        NSQ->sLPC_Q14[i] = silk_select_winner(psDelDec.sLPC_Q14[i], Winner_selector);
+    }
+    for (i = 0; i < MAX_SHAPE_LPC_ORDER; i++)
+    {
+        NSQ->sAR2_Q14[i] = silk_select_winner(psDelDec.sAR2_Q14[i], Winner_selector);
+    }
+
+    /* Update states */
+    NSQ->sLF_AR_shp_Q14 = silk_select_winner(psDelDec.LF_AR_Q14, Winner_selector);
+    NSQ->sDiff_shp_Q14 = silk_select_winner(psDelDec.Diff_Q14, Winner_selector);
+    NSQ->lagPrev = pitchL[psEncC->nb_subfr - 1];
+
+    /* Save quantized speech signal */
+    silk_memmove(NSQ->xq, &NSQ->xq[psEncC->frame_length], psEncC->ltp_mem_length * sizeof(opus_int16));
+    silk_memmove(NSQ->sLTP_shp_Q14, &NSQ->sLTP_shp_Q14[psEncC->frame_length], psEncC->ltp_mem_length * sizeof(opus_int32));
+
+#ifdef OPUS_CHECK_ASM
+    silk_assert(!memcmp(&NSQ_c, NSQ, sizeof(NSQ_c)));
+    silk_assert(!memcmp(&psIndices_c, psIndices, sizeof(psIndices_c)));
+    silk_assert(!memcmp(pulses_c, pulses_a, sizeof(pulses_c)));
+#endif
+
+    RESTORE_STACK;
+}
+/* Short-term prediction for four states at once (one state per 32-bit lane of each buf32[] entry): per lane, accumulates order/2 plus, for k = 0..order-1, the high dword of buf32[-k] * (coef16[k] << 16). */
+static OPUS_INLINE __m128i silk_noise_shape_quantizer_short_prediction_x4(const __m128i *buf32, const opus_int16 *coef16, opus_int order)
+{
+    __m256i out;
+    silk_assert(order == 10 || order == 16);
+
+    /* Seed every dword with order/2: avoids introducing a bias because the high-dword products (like silk_SMLAWB()) always round to -inf */
+    out = _mm256_set1_epi32(order >> 1);
+    out = _mm256_add_epi32(out, _mm256_mul_epi32(_mm256_cvtepi32_epi64(buf32[-0]), _mm256_set1_epi32(silk_LSHIFT(coef16[0], 16)))); /* High DWORD */
+    out = _mm256_add_epi32(out, _mm256_mul_epi32(_mm256_cvtepi32_epi64(buf32[-1]), _mm256_set1_epi32(silk_LSHIFT(coef16[1], 16)))); /* High DWORD */
+    out = _mm256_add_epi32(out, _mm256_mul_epi32(_mm256_cvtepi32_epi64(buf32[-2]), _mm256_set1_epi32(silk_LSHIFT(coef16[2], 16)))); /* High DWORD */
+    out = _mm256_add_epi32(out, _mm256_mul_epi32(_mm256_cvtepi32_epi64(buf32[-3]), _mm256_set1_epi32(silk_LSHIFT(coef16[3], 16)))); /* High DWORD */
+    out = _mm256_add_epi32(out, _mm256_mul_epi32(_mm256_cvtepi32_epi64(buf32[-4]), _mm256_set1_epi32(silk_LSHIFT(coef16[4], 16)))); /* High DWORD */
+    out = _mm256_add_epi32(out, _mm256_mul_epi32(_mm256_cvtepi32_epi64(buf32[-5]), _mm256_set1_epi32(silk_LSHIFT(coef16[5], 16)))); /* High DWORD */
+    out = _mm256_add_epi32(out, _mm256_mul_epi32(_mm256_cvtepi32_epi64(buf32[-6]), _mm256_set1_epi32(silk_LSHIFT(coef16[6], 16)))); /* High DWORD */
+    out = _mm256_add_epi32(out, _mm256_mul_epi32(_mm256_cvtepi32_epi64(buf32[-7]), _mm256_set1_epi32(silk_LSHIFT(coef16[7], 16)))); /* High DWORD */
+    out = _mm256_add_epi32(out, _mm256_mul_epi32(_mm256_cvtepi32_epi64(buf32[-8]), _mm256_set1_epi32(silk_LSHIFT(coef16[8], 16)))); /* High DWORD */
+    out = _mm256_add_epi32(out, _mm256_mul_epi32(_mm256_cvtepi32_epi64(buf32[-9]), _mm256_set1_epi32(silk_LSHIFT(coef16[9], 16)))); /* High DWORD */
+
+    if (order == 16) /* taps 10..15 exist only for the 16th-order filter */
+    {
+        out = _mm256_add_epi32(out, _mm256_mul_epi32(_mm256_cvtepi32_epi64(buf32[-10]), _mm256_set1_epi32(silk_LSHIFT(coef16[10], 16)))); /* High DWORD */
+        out = _mm256_add_epi32(out, _mm256_mul_epi32(_mm256_cvtepi32_epi64(buf32[-11]), _mm256_set1_epi32(silk_LSHIFT(coef16[11], 16)))); /* High DWORD */
+        out = _mm256_add_epi32(out, _mm256_mul_epi32(_mm256_cvtepi32_epi64(buf32[-12]), _mm256_set1_epi32(silk_LSHIFT(coef16[12], 16)))); /* High DWORD */
+        out = _mm256_add_epi32(out, _mm256_mul_epi32(_mm256_cvtepi32_epi64(buf32[-13]), _mm256_set1_epi32(silk_LSHIFT(coef16[13], 16)))); /* High DWORD */
+        out = _mm256_add_epi32(out, _mm256_mul_epi32(_mm256_cvtepi32_epi64(buf32[-14]), _mm256_set1_epi32(silk_LSHIFT(coef16[14], 16)))); /* High DWORD */
+        out = _mm256_add_epi32(out, _mm256_mul_epi32(_mm256_cvtepi32_epi64(buf32[-15]), _mm256_set1_epi32(silk_LSHIFT(coef16[15], 16)))); /* High DWORD */
+    }
+    return silk_cvtepi64_epi32_high(out); /* presumably packs the high dword of each 64-bit lane into one __m128i (helper defined elsewhere) */
+}
+
+/* Noise shape quantizer for one subframe (delayed-decision search, AVX2).        */
+/* Per-state values sit one per 32-bit lane of an __m128i; the two candidate     */
+/* quantizations of every state occupy the low and high halves of an __m256i.    */
+static OPUS_INLINE void silk_noise_shape_quantizer_del_dec_avx2(
+    silk_nsq_state *NSQ,                        /* I/O  NSQ state                          */
+    NSQ_del_dec_struct *psDelDec,               /* I/O  Delayed decision states            */
+    opus_int signalType,                        /* I    Signal type                        */
+    const opus_int32 x_Q10[],                   /* I    Scaled input, Q10                  */
+    opus_int8 pulses[],                         /* O    Quantized pulse signal             */
+    opus_int16 xq[],                            /* O    Quantized signal (gain applied)    */
+    opus_int32 sLTP_Q15[],                      /* I/O  LTP filter state                   */
+    opus_int32 delayedGain_Q10[DECISION_DELAY], /* I/O  Gain delay buffer                  */
+    const opus_int16 a_Q12[],                   /* I    Short term prediction coefs        */
+    const opus_int16 b_Q14[],                   /* I    Long term prediction coefs         */
+    const opus_int16 AR_shp_Q13[],              /* I    Noise shaping coefs                */
+    opus_int lag,                               /* I    Pitch lag                          */
+    opus_int32 HarmShapeFIRPacked_Q14,          /* I    Packed harmonic shaping FIR taps   */
+    opus_int Tilt_Q14,                          /* I    Spectral tilt                      */
+    opus_int32 LF_shp_Q14,                      /* I    Low frequency shaping coefs        */
+    opus_int32 Gain_Q16,                        /* I    Quantization step size             */
+    opus_int Lambda_Q10,                        /* I    Rate/distortion tradeoff           */
+    opus_int offset_Q10,                        /* I    Quantization offset                */
+    opus_int length,                            /* I    Input length                       */
+    opus_int subfr,                             /* I    Subframe number                    */
+    opus_int shapingLPCOrder,                   /* I    Shaping LPC filter order           */
+    opus_int predictLPCOrder,                   /* I    Prediction filter order            */
+    opus_int warping_Q16,                       /* I    Warping coef of allpass sections   */
+    __m128i MaskDelDec,                         /* I    Mask of states in decision tree    */
+    opus_int *smpl_buf_idx,                     /* I/O  Index to newest samples in buffers */
+    opus_int decisionDelay                      /* I    Output delay in samples            */
+)
+{
+    int i;
+    opus_int32 *shp_lag_ptr = &NSQ->sLTP_shp_Q14[NSQ->sLTP_shp_buf_idx - lag + HARM_SHAPE_FIR_TAPS / 2];
+    opus_int32 *pred_lag_ptr = &sLTP_Q15[NSQ->sLTP_buf_idx - lag + LTP_ORDER / 2];
+    opus_int32 Gain_Q10 = Gain_Q16 >> 6;
+
+    for (i = 0; i < length; i++)
+    {
+        /* Perform common calculations used in all states */
+        /* The SS_* vectors below stand in for NSQ_sample_struct: */
+        /* Low  128 bits => 1st set (one 32-bit lane per state) */
+        /* High 128 bits => 2nd set (one 32-bit lane per state) */
+        int j;
+        __m256i SS_Q_Q10;
+        __m256i SS_RD_Q10;
+        __m256i SS_xq_Q14;
+        __m256i SS_LF_AR_Q14;
+        __m256i SS_Diff_Q14;
+        __m256i SS_sLTP_shp_Q14;
+        __m256i SS_LPC_exc_Q14;
+        __m256i exc_Q14;
+        __m256i q_Q10, rr_Q10, rd_Q10;
+        __m256i mask;
+        __m128i LPC_pred_Q14, n_AR_Q14;
+        __m128i RDmin_Q10, RDmax_Q10;
+        __m128i n_LF_Q14;
+        __m128i r_Q10, q1_Q0, q1_Q10, q2_Q10;
+        __m128i Winner_rand_state, Winner_selector;
+        __m128i tmp0, tmp1;
+        NSQ_del_dec_sample_struct *psLastSample, *psSample;
+        opus_int32 RDmin_ind, RDmax_ind, last_smple_idx;
+        opus_int32 LTP_pred_Q14, n_LTP_Q14;
+
+        /* Long-term prediction (scalar: the same value is used for every state) */
+        if (signalType == TYPE_VOICED)
+        {
+            /* Unrolled loop */
+            /* Avoids introducing a bias because silk_SMLAWB() always rounds to -inf */
+            LTP_pred_Q14 = 2;
+            LTP_pred_Q14 += silk_SMULWB(pred_lag_ptr[-0], b_Q14[0]);
+            LTP_pred_Q14 += silk_SMULWB(pred_lag_ptr[-1], b_Q14[1]);
+            LTP_pred_Q14 += silk_SMULWB(pred_lag_ptr[-2], b_Q14[2]);
+            LTP_pred_Q14 += silk_SMULWB(pred_lag_ptr[-3], b_Q14[3]);
+            LTP_pred_Q14 += silk_SMULWB(pred_lag_ptr[-4], b_Q14[4]);
+            LTP_pred_Q14 = silk_LSHIFT(LTP_pred_Q14, 1); /* Q13 -> Q14 */
+            pred_lag_ptr++;
+        }
+        else
+        {
+            LTP_pred_Q14 = 0;
+        }
+
+        /* Long-term shaping (scalar: the same value is used for every state) */
+        if (lag > 0)
+        {
+            /* Symmetric, packed FIR coefficients */
+            n_LTP_Q14 = silk_add_sat32(shp_lag_ptr[0], shp_lag_ptr[-2]);
+            n_LTP_Q14 = silk_SMULWB(n_LTP_Q14, HarmShapeFIRPacked_Q14);
+            n_LTP_Q14 = n_LTP_Q14 + silk_SMULWT(shp_lag_ptr[-1], HarmShapeFIRPacked_Q14);
+            n_LTP_Q14 = LTP_pred_Q14 - (silk_LSHIFT(n_LTP_Q14, 2)); /* Q12 -> Q14 */
+            shp_lag_ptr++;
+        }
+        else
+        {
+            n_LTP_Q14 = 0;
+        }
+
+        /* BEGIN Updating Delayed Decision States */
+
+        /* Generate dither */
+        psDelDec->Seed = silk_mm256_rand_epi32(psDelDec->Seed);
+
+        /* Short-term prediction */
+        LPC_pred_Q14 = silk_noise_shape_quantizer_short_prediction_x4(&psDelDec->sLPC_Q14[NSQ_LPC_BUF_LENGTH - 1 + i], a_Q12, predictLPCOrder);
+        LPC_pred_Q14 = _mm_slli_epi32(LPC_pred_Q14, 4); /* Q10 -> Q14 */
+
+        /* Noise shape feedback */
+        silk_assert(shapingLPCOrder > 0);
+        silk_assert((shapingLPCOrder & 1) == 0); /* check that order is even */
+        /* Output of lowpass section */
+        tmp0 = _mm_add_epi32(psDelDec->Diff_Q14, silk_mm_smulwb_epi32(psDelDec->sAR2_Q14[0], warping_Q16));
+        n_AR_Q14 = _mm_set1_epi32(shapingLPCOrder >> 1);
+        for (j = 0; j < shapingLPCOrder - 1; j++)
+        {
+            /* Output of allpass section */
+            tmp1 = psDelDec->sAR2_Q14[j];
+            psDelDec->sAR2_Q14[j] = tmp0;
+            n_AR_Q14 = _mm_add_epi32(n_AR_Q14, silk_mm_smulwb_epi32(tmp0, AR_shp_Q13[j]));
+            tmp0 = _mm_add_epi32(tmp1, silk_mm_smulwb_epi32(_mm_sub_epi32(psDelDec->sAR2_Q14[j + 1], tmp0), warping_Q16));
+        }
+        psDelDec->sAR2_Q14[shapingLPCOrder - 1] = tmp0;
+        n_AR_Q14 = _mm_add_epi32(n_AR_Q14, silk_mm_smulwb_epi32(tmp0, AR_shp_Q13[shapingLPCOrder - 1]));
+
+        n_AR_Q14 = _mm_slli_epi32(n_AR_Q14, 1);                                                  /* Q11 -> Q12 */
+        n_AR_Q14 = _mm_add_epi32(n_AR_Q14, silk_mm_smulwb_epi32(psDelDec->LF_AR_Q14, Tilt_Q14)); /* Q12 */
+        n_AR_Q14 = _mm_slli_epi32(n_AR_Q14, 2);                                                  /* Q12 -> Q14 */
+
+        tmp0 = silk_mm_smulwb_epi32(psDelDec->Samples[*smpl_buf_idx].Shape_Q14, LF_shp_Q14); /* Q12 */
+        tmp1 = silk_mm_smulwb_epi32(psDelDec->LF_AR_Q14, LF_shp_Q14 >> 16);                  /* Q12 */
+        n_LF_Q14 = _mm_add_epi32(tmp0, tmp1);                                                /* Q12 */
+        n_LF_Q14 = _mm_slli_epi32(n_LF_Q14, 2);                                              /* Q12 -> Q14 */
+
+        /* Input minus prediction plus noise feedback                       */
+        /* r = x[ i ] - LTP_pred - LPC_pred + n_AR + n_Tilt + n_LF + n_LTP  */
+        tmp0 = silk_mm_add_sat_epi32(n_AR_Q14, n_LF_Q14);              /* Q14 */
+        tmp1 = _mm_add_epi32(_mm_set1_epi32(n_LTP_Q14), LPC_pred_Q14); /* Q14 */
+        tmp0 = silk_mm_sub_sat_epi32(tmp1, tmp0);                      /* Q14 */
+        tmp0 = silk_mm_srai_round_epi32(tmp0, 4);                      /* Q14 -> Q10 */
+
+        r_Q10 = _mm_sub_epi32(_mm_set1_epi32(x_Q10[i]), tmp0); /* residual error Q10 */
+
+        /* Flip sign depending on dither */
+        r_Q10 = silk_mm_sign_epi32(r_Q10, psDelDec->Seed);
+        r_Q10 = silk_mm_limit_epi32(r_Q10, -(31 << 10), 30 << 10);
+
+        /* Find two quantization level candidates and measure their rate-distortion */
+        q1_Q10 = _mm_sub_epi32(r_Q10, _mm_set1_epi32(offset_Q10));
+        q1_Q0 = _mm_srai_epi32(q1_Q10, 10);
+        if (Lambda_Q10 > 2048)
+        {
+            /* For aggressive RDO, the bias becomes more than one pulse. */
+            tmp0 = _mm_sub_epi32(_mm_abs_epi32(q1_Q10), _mm_set1_epi32(Lambda_Q10 / 2 - 512)); /* rdo_offset */
+            q1_Q0 = _mm_srai_epi32(q1_Q10, 31);
+            tmp1 = _mm_cmpgt_epi32(tmp0, _mm_setzero_si128());
+            tmp0 = _mm_srai_epi32(silk_mm_sign_epi32(tmp0, q1_Q10), 10);
+            q1_Q0 = _mm_blendv_epi8(q1_Q0, tmp0, tmp1);
+        }
+
+        tmp0 = _mm_sign_epi32(_mm_set1_epi32(QUANT_LEVEL_ADJUST_Q10), q1_Q0);
+        q1_Q10 = _mm_sub_epi32(_mm_slli_epi32(q1_Q0, 10), tmp0);
+        q1_Q10 = _mm_add_epi32(q1_Q10, _mm_set1_epi32(offset_Q10));
+
+        /* check if q1_Q0 is 0 or -1: (q1_Q0 >>> 31) + q1_Q0 is zero only for those two values */
+        tmp0 = _mm_add_epi32(_mm_srli_epi32(q1_Q0, 31), q1_Q0);
+        tmp1 = _mm_cmpeq_epi32(tmp0, _mm_setzero_si128());
+        tmp0 = _mm_blendv_epi8(_mm_set1_epi32(1024), _mm_set1_epi32(1024 - QUANT_LEVEL_ADJUST_Q10), tmp1);
+        q2_Q10 = _mm_add_epi32(q1_Q10, tmp0);
+        q_Q10 = _mm256_set_m128i(q2_Q10, q1_Q10);
+
+        rr_Q10 = _mm256_sub_epi32(_mm256_broadcastsi128_si256(r_Q10), q_Q10);
+        rd_Q10 = _mm256_abs_epi32(q_Q10);
+        rr_Q10 = silk_mm256_smulbb_epi32(rr_Q10, rr_Q10);
+        rd_Q10 = silk_mm256_smulbb_epi32(rd_Q10, _mm256_set1_epi32(Lambda_Q10));
+        rd_Q10 = _mm256_add_epi32(rd_Q10, rr_Q10);
+        rd_Q10 = _mm256_srai_epi32(rd_Q10, 10);
+        /* mask lanes are all-ones where q1 has the smaller cost; after the blends the low half holds each state's cheaper candidate and the high half the other one */
+        mask = _mm256_broadcastsi128_si256(_mm_cmplt_epi32(_mm256_extracti128_si256(rd_Q10, 0), _mm256_extracti128_si256(rd_Q10, 1)));
+        SS_RD_Q10 = _mm256_add_epi32(
+            _mm256_broadcastsi128_si256(psDelDec->RD_Q10),
+            _mm256_blendv_epi8(
+                _mm256_permute2x128_si256(rd_Q10, rd_Q10, 0x1),
+                rd_Q10,
+                mask));
+        SS_Q_Q10 = _mm256_blendv_epi8(
+            _mm256_permute2x128_si256(q_Q10, q_Q10, 0x1),
+            q_Q10,
+            mask);
+
+        /* Update states for best and second best quantization */
+
+        /* Quantized excitation */
+        exc_Q14 = silk_mm256_sign_epi32(_mm256_slli_epi32(SS_Q_Q10, 4), _mm256_broadcastsi128_si256(psDelDec->Seed));
+
+        /* Add predictions */
+        exc_Q14 = _mm256_add_epi32(exc_Q14, _mm256_set1_epi32(LTP_pred_Q14));
+        SS_LPC_exc_Q14 = _mm256_slli_epi32(exc_Q14, 1);
+        SS_xq_Q14 = _mm256_add_epi32(exc_Q14, _mm256_broadcastsi128_si256(LPC_pred_Q14));
+
+        /* Update states */
+        SS_Diff_Q14 = _mm256_sub_epi32(SS_xq_Q14, _mm256_set1_epi32(silk_LSHIFT(x_Q10[i], 4)));
+        SS_LF_AR_Q14 = _mm256_sub_epi32(SS_Diff_Q14, _mm256_broadcastsi128_si256(n_AR_Q14));
+        SS_sLTP_shp_Q14 = silk_mm256_sub_sat_epi32(SS_LF_AR_Q14, _mm256_broadcastsi128_si256(n_LF_Q14));
+
+        /* END Updating Delayed Decision States */
+        /* Step the circular sample buffer back one slot (mod DECISION_DELAY); last_smple_idx is the slot decisionDelay entries ahead of it */
+        *smpl_buf_idx = (*smpl_buf_idx + DECISION_DELAY - 1) % DECISION_DELAY;
+        last_smple_idx = (*smpl_buf_idx + decisionDelay) % DECISION_DELAY;
+        psLastSample = &psDelDec->Samples[last_smple_idx];
+
+        /* Find winner */
+        RDmin_Q10 = silk_mm_mask_hmin_epi32(_mm256_castsi256_si128(SS_RD_Q10), MaskDelDec);
+        Winner_selector = silk_index_to_selector(silk_index_of_first_equal_epi32(RDmin_Q10, _mm256_castsi256_si128(SS_RD_Q10)));
+
+        /* Increase RD values of expired states: lanes whose delayed RandState differs from the winner's get silk_int32_MAX >> 4 added */
+        Winner_rand_state = _mm_shuffle_epi8(psLastSample->RandState, Winner_selector);
+
+        SS_RD_Q10 = _mm256_blendv_epi8(
+            _mm256_add_epi32(SS_RD_Q10, _mm256_set1_epi32(silk_int32_MAX >> 4)),
+            SS_RD_Q10,
+            _mm256_broadcastsi128_si256(_mm_cmpeq_epi32(psLastSample->RandState, Winner_rand_state)));
+
+        /* find worst in first set */
+        RDmax_Q10 = silk_mm_mask_hmax_epi32(_mm256_extracti128_si256(SS_RD_Q10, 0), MaskDelDec);
+        /* find best in second set */
+        RDmin_Q10 = silk_mm_mask_hmin_epi32(_mm256_extracti128_si256(SS_RD_Q10, 1), MaskDelDec);
+
+        /* Replace a state if best from second set outperforms worst in first set */
+        tmp0 = _mm_cmplt_epi32(RDmin_Q10, RDmax_Q10);
+        if (!_mm_test_all_zeros(tmp0, tmp0))
+        {
+            int t;
+            RDmax_ind = silk_index_of_first_equal_epi32(RDmax_Q10, _mm256_extracti128_si256(SS_RD_Q10, 0));
+            RDmin_ind = silk_index_of_first_equal_epi32(RDmin_Q10, _mm256_extracti128_si256(SS_RD_Q10, 1));
+            tmp1 = _mm_cvtepi8_epi32(_mm_cvtsi32_si128(0xFFU << (unsigned)(RDmax_ind << 3)));
+            tmp0 = _mm_blendv_epi8(
+                _mm_set_epi8(0xF, 0xE, 0xD, 0xC, 0xB, 0xA, 0x9, 0x8, 0x7, 0x6, 0x5, 0x4, 0x3, 0x2, 0x1, 0x0),
+                silk_index_to_selector(RDmin_ind),
+                tmp1); /* identity byte shuffle, except lane RDmax_ind takes silk_index_to_selector(RDmin_ind) (presumably "read lane RDmin_ind") */
+            for (t = i; t < MAX_SUB_FRAME_LENGTH + NSQ_LPC_BUF_LENGTH; t++)
+            {
+                psDelDec->sLPC_Q14[t] = _mm_shuffle_epi8(psDelDec->sLPC_Q14[t], tmp0);
+            }
+            psDelDec->Seed = _mm_shuffle_epi8(psDelDec->Seed, tmp0);
+            psDelDec->SeedInit = _mm_shuffle_epi8(psDelDec->SeedInit, tmp0);
+            for (t = 0; t < MAX_SHAPE_LPC_ORDER; t++)
+            {
+                psDelDec->sAR2_Q14[t] = _mm_shuffle_epi8(psDelDec->sAR2_Q14[t], tmp0);
+            }
+            for (t = 0; t < DECISION_DELAY; t++)
+            {
+                psDelDec->Samples[t].RandState = _mm_shuffle_epi8(psDelDec->Samples[t].RandState, tmp0);
+                psDelDec->Samples[t].Q_Q10 = _mm_shuffle_epi8(psDelDec->Samples[t].Q_Q10, tmp0);
+                psDelDec->Samples[t].Xq_Q14 = _mm_shuffle_epi8(psDelDec->Samples[t].Xq_Q14, tmp0);
+                psDelDec->Samples[t].Pred_Q15 = _mm_shuffle_epi8(psDelDec->Samples[t].Pred_Q15, tmp0);
+                psDelDec->Samples[t].Shape_Q14 = _mm_shuffle_epi8(psDelDec->Samples[t].Shape_Q14, tmp0);
+            }
+            mask = _mm256_castsi128_si256(_mm_blendv_epi8(_mm_set_epi32(0x3, 0x2, 0x1, 0x0), _mm_set1_epi32(RDmin_ind + 4), tmp1)); /* dword permute: low-half lane RDmax_ind is fetched from high-half lane RDmin_ind; only the low half of each result is read below */
+            SS_Q_Q10 = _mm256_permutevar8x32_epi32(SS_Q_Q10, mask);
+            SS_RD_Q10 = _mm256_permutevar8x32_epi32(SS_RD_Q10, mask);
+            SS_xq_Q14 = _mm256_permutevar8x32_epi32(SS_xq_Q14, mask);
+            SS_LF_AR_Q14 = _mm256_permutevar8x32_epi32(SS_LF_AR_Q14, mask);
+            SS_Diff_Q14 = _mm256_permutevar8x32_epi32(SS_Diff_Q14, mask);
+            SS_sLTP_shp_Q14 = _mm256_permutevar8x32_epi32(SS_sLTP_shp_Q14, mask);
+            SS_LPC_exc_Q14 = _mm256_permutevar8x32_epi32(SS_LPC_exc_Q14, mask);
+        }
+
+        /* Write samples from winner to output and long-term filter states (decisionDelay samples behind i; skipped while subfr == 0 and i < decisionDelay) */
+        if (subfr > 0 || i >= decisionDelay)
+        {
+            pulses[i - decisionDelay] =
+                (opus_int8)silk_sar_round_32(silk_select_winner(psLastSample->Q_Q10, Winner_selector), 10);
+            xq[i - decisionDelay] =
+                silk_sat16((opus_int32)silk_sar_round_smulww(silk_select_winner(psLastSample->Xq_Q14, Winner_selector), delayedGain_Q10[last_smple_idx], 8));
+            NSQ->sLTP_shp_Q14[NSQ->sLTP_shp_buf_idx - decisionDelay] =
+                silk_select_winner(psLastSample->Shape_Q14, Winner_selector);
+            sLTP_Q15[NSQ->sLTP_buf_idx - decisionDelay] =
+                silk_select_winner(psLastSample->Pred_Q15, Winner_selector);
+        }
+        NSQ->sLTP_shp_buf_idx++;
+        NSQ->sLTP_buf_idx++;
+
+        /* Update states: keep only the low half (first set) of every SS_* vector */
+        psSample = &psDelDec->Samples[*smpl_buf_idx];
+        psDelDec->Seed = _mm_add_epi32(psDelDec->Seed, silk_mm_srai_round_epi32(_mm256_castsi256_si128(SS_Q_Q10), 10));
+        psDelDec->LF_AR_Q14 = _mm256_castsi256_si128(SS_LF_AR_Q14);
+        psDelDec->Diff_Q14 = _mm256_castsi256_si128(SS_Diff_Q14);
+        psDelDec->sLPC_Q14[i + NSQ_LPC_BUF_LENGTH] = _mm256_castsi256_si128(SS_xq_Q14);
+        psDelDec->RD_Q10 = _mm256_castsi256_si128(SS_RD_Q10);
+        psSample->Xq_Q14 = _mm256_castsi256_si128(SS_xq_Q14);
+        psSample->Q_Q10 = _mm256_castsi256_si128(SS_Q_Q10);
+        psSample->Pred_Q15 = _mm256_castsi256_si128(SS_LPC_exc_Q14);
+        psSample->Shape_Q14 = _mm256_castsi256_si128(SS_sLTP_shp_Q14);
+        psSample->RandState = psDelDec->Seed;
+        delayedGain_Q10[*smpl_buf_idx] = Gain_Q10;
+    }
+    /* Update LPC states: move the NSQ_LPC_BUF_LENGTH entries starting at sLPC_Q14[length] to the front of the buffer */
+    for (i = 0; i < NSQ_LPC_BUF_LENGTH; i++)
+    {
+        psDelDec->sLPC_Q14[i] = (&psDelDec->sLPC_Q14[length])[i];
+    }
+}
+/* Prepares subframe `subfr`: writes x16[] scaled by the inverse of Gains_Q16[subfr] to x_sc_Q10[], fills sLTP_Q15[] from sLTP[] after re-whitening, and, when the gain differs from NSQ->prev_gain_Q16, rescales the stored filter and decision states by prev_gain / gain. */
+static OPUS_INLINE void silk_nsq_del_dec_scale_states_avx2(
+    const silk_encoder_state *psEncC,          /* I    Encoder State                   */
+    silk_nsq_state *NSQ,                       /* I/O  NSQ state                       */
+    NSQ_del_dec_struct *psDelDec,              /* I/O  Delayed decision states         */
+    const opus_int16 x16[],                    /* I    Input                           */
+    opus_int32 x_sc_Q10[MAX_SUB_FRAME_LENGTH], /* O    Input scaled with 1/Gain in Q10 */
+    const opus_int16 sLTP[],                   /* I    Re-whitened LTP state in Q0     */
+    opus_int32 sLTP_Q15[],                     /* O    LTP state matching scaled input */
+    opus_int subfr,                            /* I    Subframe number                 */
+    const opus_int LTP_scale_Q14,              /* I    LTP state scaling               */
+    const opus_int32 Gains_Q16[MAX_NB_SUBFR],  /* I    Quantization step sizes         */
+    const opus_int pitchL[MAX_NB_SUBFR],       /* I    Pitch lag                       */
+    const opus_int signal_type,                /* I    Signal type                     */
+    const opus_int decisionDelay               /* I    Decision delay                  */
+)
+{
+    int i;
+    opus_int lag;
+    opus_int32 gain_adj_Q16, inv_gain_Q31, inv_gain_Q26;
+    NSQ_del_dec_sample_struct *psSample;
+
+    lag = pitchL[subfr];
+    inv_gain_Q31 = silk_INVERSE32_varQ(silk_max(Gains_Q16[subfr], 1), 47);
+    silk_assert(inv_gain_Q31 != 0);
+
+    /* Scale input, four samples per iteration (NOTE(review): reads and writes in groups of 4, so subfr_length is presumably a multiple of 4 - confirm against the caller's checks) */
+    inv_gain_Q26 = silk_sar_round_32(inv_gain_Q31, 5);
+    for (i = 0; i < psEncC->subfr_length; i+=4)
+    {
+        __m256i x = _mm256_cvtepi16_epi64(_mm_loadu_si64(&x16[i]));
+        x = _mm256_slli_epi64(_mm256_mul_epi32(x, _mm256_set1_epi32(inv_gain_Q26)), 16);
+        _mm_storeu_si128((__m128i_u*)&x_sc_Q10[i], silk_cvtepi64_epi32_high(x));
+    }
+
+    /* After rewhitening the LTP state is un-scaled, so scale with inv_gain_Q16 */
+    if (NSQ->rewhite_flag)
+    {
+        if (subfr == 0)
+        {
+            /* Do LTP downscaling (first subframe only) */
+            inv_gain_Q31 = silk_LSHIFT(silk_SMULWB(inv_gain_Q31, LTP_scale_Q14), 2);
+        }
+        for (i = NSQ->sLTP_buf_idx - lag - LTP_ORDER / 2; i < NSQ->sLTP_buf_idx; i++)
+        {
+            silk_assert(i < MAX_FRAME_LENGTH);
+            sLTP_Q15[i] = silk_SMULWB(inv_gain_Q31, sLTP[i]);
+        }
+    }
+
+    /* Adjust for changing gain: gain_adj_Q16 = prev_gain / gain is applied to every stored state */
+    if (Gains_Q16[subfr] != NSQ->prev_gain_Q16)
+    {
+        gain_adj_Q16 = silk_DIV32_varQ(NSQ->prev_gain_Q16, Gains_Q16[subfr], 16);
+
+        /* Scale long-term shaping state, four entries per iteration */
+        for (i = NSQ->sLTP_shp_buf_idx - psEncC->ltp_mem_length; i < NSQ->sLTP_shp_buf_idx; i+=4)
+        {
+            __m128i_u* p = (__m128i_u*)&NSQ->sLTP_shp_Q14[i];
+            *p = silk_mm_smulww_epi32(*p, gain_adj_Q16);
+        }
+
+        /* Scale long-term prediction state (stops at sLTP_buf_idx - decisionDelay; Samples[].Pred_Q15 is scaled separately below) */
+        if (signal_type == TYPE_VOICED && NSQ->rewhite_flag == 0)
+        {
+            for (i = NSQ->sLTP_buf_idx - lag - LTP_ORDER / 2; i < NSQ->sLTP_buf_idx - decisionDelay; i++)
+            {
+                sLTP_Q15[i] = ((opus_int64)sLTP_Q15[i]) * ((opus_int64)gain_adj_Q16) >> 16;
+            }
+        }
+
+        /* Scale scalar states */
+        psDelDec->LF_AR_Q14 = silk_mm_smulww_epi32(psDelDec->LF_AR_Q14, gain_adj_Q16);
+        psDelDec->Diff_Q14 = silk_mm_smulww_epi32(psDelDec->Diff_Q14, gain_adj_Q16);
+
+        /* Scale short-term prediction and shaping states */
+        for (i = 0; i < NSQ_LPC_BUF_LENGTH; i++)
+        {
+            psDelDec->sLPC_Q14[i] = silk_mm_smulww_epi32(psDelDec->sLPC_Q14[i], gain_adj_Q16);
+        }
+        for (i = 0; i < DECISION_DELAY; i++)
+        {
+            psSample = &psDelDec->Samples[i];
+            psSample->Pred_Q15 = silk_mm_smulww_epi32(psSample->Pred_Q15, gain_adj_Q16);
+            psSample->Shape_Q14 = silk_mm_smulww_epi32(psSample->Shape_Q14, gain_adj_Q16);
+        }
+        for (i = 0; i < MAX_SHAPE_LPC_ORDER; i++)
+        {
+            psDelDec->sAR2_Q14[i] = silk_mm_smulww_epi32(psDelDec->sAR2_Q14[i], gain_adj_Q16);
+        }
+
+        /* Remember this subframe's gain (not its inverse) for the next comparison */
+        NSQ->prev_gain_Q16 = Gains_Q16[subfr];
+    }
+}
+/* LPC analysis filter: for i in [order, len), out[i] = sat16(((in[i] << 12) - sum over k < order of B[k] * in[i-1-k]) scaled back to Q0); the first `order` outputs are set to zero. Only order 10 or 16 is accepted. */
+static OPUS_INLINE void silk_LPC_analysis_filter_avx2(
+    opus_int16                  *out,               /* O    Output signal                           */
+    const opus_int16            *in,                /* I    Input signal                            */
+    const opus_int16            *B,                 /* I    MA prediction coefficients, Q12 [order] */
+    const opus_int32            len,                /* I    Signal length                           */
+    const opus_int32            order               /* I    Filter order                            */
+)
+{
+    int i;
+    opus_int32       out32_Q12, out32;
+    silk_assert(order == 10 || order == 16);
+
+    for(i = order; i < len; i++ )
+    {
+        const opus_int16 *in_ptr = &in[ i ];
+        /* Allowing wrap around so that two wraps can cancel each other. The rare
+           cases where the result wraps around can only be triggered by invalid streams*/
+        /* Taps 0..7: in_ptr[-8..-1] times B[0..7], with B reversed (presumably by silk_mm256_reverse_epi32) so that B[k] meets in_ptr[-1-k] */
+        __m256i in_v = _mm256_cvtepi16_epi32(_mm_loadu_si128((__m128i_u*)&in_ptr[-8]));
+        __m256i B_v  = _mm256_cvtepi16_epi32(_mm_loadu_si128((__m128i_u*)&      B[0]));
+        __m256i sum = _mm256_mullo_epi32(in_v, silk_mm256_reverse_epi32(B_v));
+        if (order > 10)
+        {
+            in_v = _mm256_cvtepi16_epi32(_mm_loadu_si128((__m128i_u*)&in_ptr[-16])); /* taps 8..15: in_ptr[-16..-9] */
+            B_v  = _mm256_cvtepi16_epi32(_mm_loadu_si128((__m128i_u*)&B       [8]));
+            B_v  = silk_mm256_reverse_epi32(B_v);
+        }
+        else
+        {
+            in_v = _mm256_cvtepi16_epi32(_mm_loadu_si32(&in_ptr[-10])); /* taps 8..9 only: two 16-bit values in lanes 0 and 1 */
+            B_v  = _mm256_cvtepi16_epi32(_mm_loadu_si32(&B       [8]));
+            B_v  = _mm256_shuffle_epi32(B_v, 0x01); /* swap lanes 0 and 1 so B[9] meets in_ptr[-10] and B[8] meets in_ptr[-9] */
+        }
+        sum = _mm256_add_epi32(sum, _mm256_mullo_epi32(in_v, B_v));
+
+        out32_Q12 = silk_mm256_hsum_epi32(sum);
+
+        /* Subtract prediction */
+        out32_Q12 = silk_SUB32_ovflw( silk_LSHIFT( (opus_int32)*in_ptr, 12 ), out32_Q12 );
+
+        /* Scale to Q0 */
+        out32 = silk_sar_round_32(out32_Q12, 12);
+
+        /* Saturate output */
+        out[ i ] = silk_sat16(out32);
+    }
+
+    /* Set first `order` output samples to zero */
+    silk_memset( out, 0, order * sizeof( opus_int16 ) );
+}
diff --git a/opus/silk/x86/NSQ_del_dec_sse4_1.c b/opus/silk/x86/NSQ_del_dec_sse4_1.c
index 2c75ede2..5937682d 100644
--- a/opus/silk/x86/NSQ_del_dec_sse4_1.c
+++ b/opus/silk/x86/NSQ_del_dec_sse4_1.c
@@ -1,5 +1,5 @@
-/* Copyright (c) 2014, Cisco Systems, INC
-   Written by XiangMingZhu WeiZhou MinPeng YanWang
+/* Copyright (c) 2014-2020, Cisco Systems, INC
+   Written by XiangMingZhu WeiZhou MinPeng YanWang FrancisQuiers
 
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions
@@ -46,6 +46,7 @@ typedef struct {
     opus_int32 Shape_Q14[ DECISION_DELAY ];
     opus_int32 sAR2_Q14[ MAX_SHAPE_LPC_ORDER ];
     opus_int32 LF_AR_Q14;
+    opus_int32 Diff_Q14;
     opus_int32 Seed;
     opus_int32 SeedInit;
     opus_int32 RD_Q10;
@@ -56,6 +57,7 @@ typedef struct {
     opus_int32 RD_Q10;
     opus_int32 xq_Q14;
     opus_int32 LF_AR_Q14;
+    opus_int32 Diff_Q14;
     opus_int32 sLTP_shp_Q14;
     opus_int32 LPC_exc_Q14;
 } NSQ_sample_struct;
@@ -66,7 +68,7 @@ static OPUS_INLINE void silk_nsq_del_dec_scale_states_sse4_1(
     const silk_encoder_state *psEncC,               /* I    Encoder State                       */
     silk_nsq_state      *NSQ,                       /* I/O  NSQ state                           */
     NSQ_del_dec_struct  psDelDec[],                 /* I/O  Delayed decision states             */
-    const opus_int32    x_Q3[],                     /* I    Input in Q3                         */
+    const opus_int16    x16[],                      /* I    Input                               */
     opus_int32          x_sc_Q10[],                 /* O    Input scaled with 1/Gain in Q10     */
     const opus_int16    sLTP[],                     /* I    Re-whitened LTP state in Q0         */
     opus_int32          sLTP_Q15[],                 /* O    LTP state matching scaled input     */
@@ -112,21 +114,21 @@ static OPUS_INLINE void silk_noise_shape_quantizer_del_dec_sse4_1(
 );
 
 void silk_NSQ_del_dec_sse4_1(
-    const silk_encoder_state    *psEncC,                                    /* I    Encoder State                   */
-    silk_nsq_state              *NSQ,                                       /* I/O  NSQ state                       */
-    SideInfoIndices             *psIndices,                                 /* I/O  Quantization Indices            */
-    const opus_int32            x_Q3[],                                     /* I    Prefiltered input signal        */
-    opus_int8                   pulses[],                                   /* O    Quantized pulse signal          */
-    const opus_int16            PredCoef_Q12[ 2 * MAX_LPC_ORDER ],          /* I    Short term prediction coefs     */
-    const opus_int16            LTPCoef_Q14[ LTP_ORDER * MAX_NB_SUBFR ],    /* I    Long term prediction coefs      */
-    const opus_int16            AR2_Q13[ MAX_NB_SUBFR * MAX_SHAPE_LPC_ORDER ], /* I Noise shaping coefs             */
-    const opus_int              HarmShapeGain_Q14[ MAX_NB_SUBFR ],          /* I    Long term shaping coefs         */
-    const opus_int              Tilt_Q14[ MAX_NB_SUBFR ],                   /* I    Spectral tilt                   */
-    const opus_int32            LF_shp_Q14[ MAX_NB_SUBFR ],                 /* I    Low frequency shaping coefs     */
-    const opus_int32            Gains_Q16[ MAX_NB_SUBFR ],                  /* I    Quantization step sizes         */
-    const opus_int              pitchL[ MAX_NB_SUBFR ],                     /* I    Pitch lags                      */
-    const opus_int              Lambda_Q10,                                 /* I    Rate/distortion tradeoff        */
-    const opus_int              LTP_scale_Q14                               /* I    LTP state scaling               */
+    const silk_encoder_state    *psEncC,                                      /* I    Encoder State                   */
+    silk_nsq_state              *NSQ,                                         /* I/O  NSQ state                       */
+    SideInfoIndices             *psIndices,                                   /* I/O  Quantization Indices            */
+    const opus_int16            x16[],                                        /* I    Input                           */
+    opus_int8                   pulses[],                                     /* O    Quantized pulse signal          */
+    const opus_int16            *PredCoef_Q12,                                /* I    Short term prediction coefs     */
+    const opus_int16            LTPCoef_Q14[ LTP_ORDER * MAX_NB_SUBFR ],      /* I    Long term prediction coefs      */
+    const opus_int16            AR_Q13[ MAX_NB_SUBFR * MAX_SHAPE_LPC_ORDER ], /* I    Noise shaping coefs             */
+    const opus_int              HarmShapeGain_Q14[ MAX_NB_SUBFR ],            /* I    Long term shaping coefs         */
+    const opus_int              Tilt_Q14[ MAX_NB_SUBFR ],                     /* I    Spectral tilt                   */
+    const opus_int32            LF_shp_Q14[ MAX_NB_SUBFR ],                   /* I    Low frequency shaping coefs     */
+    const opus_int32            Gains_Q16[ MAX_NB_SUBFR ],                    /* I    Quantization step sizes         */
+    const opus_int              pitchL[ MAX_NB_SUBFR ],                       /* I    Pitch lags                      */
+    const opus_int              Lambda_Q10,                                   /* I    Rate/distortion tradeoff        */
+    const opus_int              LTP_scale_Q14                                 /* I    LTP state scaling               */
 )
 {
     opus_int            i, k, lag, start_idx, LSF_interpolation_flag, Winner_ind, subfr;
@@ -142,8 +144,39 @@ void silk_NSQ_del_dec_sse4_1(
     VARDECL( opus_int32, delayedGain_Q10 );
     VARDECL( NSQ_del_dec_struct, psDelDec );
     NSQ_del_dec_struct  *psDD;
+#ifdef OPUS_CHECK_ASM
+    silk_nsq_state NSQ_c;
+    SideInfoIndices psIndices_c;
+    opus_int8 pulses_c[ MAX_FRAME_LENGTH ];
+    const opus_int8 *const pulses_a = pulses;
+#endif
     SAVE_STACK;
 
+#ifdef OPUS_CHECK_ASM
+    ( void )pulses_a;
+    silk_memcpy( &NSQ_c, NSQ, sizeof( NSQ_c ) );
+    silk_memcpy( &psIndices_c, psIndices, sizeof( psIndices_c ) );
+    silk_assert( psEncC->nb_subfr * psEncC->subfr_length <= MAX_FRAME_LENGTH );
+    silk_memcpy( pulses_c, pulses, psEncC->nb_subfr * psEncC->subfr_length * sizeof( pulses[0] ) );
+    silk_NSQ_del_dec_c(
+        psEncC,
+        &NSQ_c,
+        &psIndices_c,
+        x16,
+        pulses_c,
+        PredCoef_Q12,
+        LTPCoef_Q14,
+        AR_Q13,
+        HarmShapeGain_Q14,
+        Tilt_Q14,
+        LF_shp_Q14,
+        Gains_Q16,
+        pitchL,
+        Lambda_Q10,
+        LTP_scale_Q14
+    );
+#endif
+
     /* Set unvoiced lag to the previous one, overwrite later for voiced */
     lag = NSQ->lagPrev;
 
@@ -158,6 +191,7 @@ void silk_NSQ_del_dec_sse4_1(
         psDD->SeedInit       = psDD->Seed;
         psDD->RD_Q10         = 0;
         psDD->LF_AR_Q14      = NSQ->sLF_AR_shp_Q14;
+        psDD->Diff_Q14       = NSQ->sDiff_shp_Q14;
         psDD->Shape_Q14[ 0 ] = NSQ->sLTP_shp_Q14[ psEncC->ltp_mem_length - 1 ];
         silk_memcpy( psDD->sLPC_Q14, NSQ->sLPC_Q14, NSQ_LPC_BUF_LENGTH * sizeof( opus_int32 ) );
         silk_memcpy( psDD->sAR2_Q14, NSQ->sAR2_Q14, sizeof( NSQ->sAR2_Q14 ) );
@@ -185,8 +219,7 @@ void silk_NSQ_del_dec_sse4_1(
         LSF_interpolation_flag = 1;
     }
 
-    ALLOC( sLTP_Q15,
-           psEncC->ltp_mem_length + psEncC->frame_length, opus_int32 );
+    ALLOC( sLTP_Q15, psEncC->ltp_mem_length + psEncC->frame_length, opus_int32 );
     ALLOC( sLTP, psEncC->ltp_mem_length + psEncC->frame_length, opus_int16 );
     ALLOC( x_sc_Q10, psEncC->subfr_length, opus_int32 );
     ALLOC( delayedGain_Q10, DECISION_DELAY, opus_int32 );
@@ -198,7 +231,7 @@ void silk_NSQ_del_dec_sse4_1(
     for( k = 0; k < psEncC->nb_subfr; k++ ) {
         A_Q12      = &PredCoef_Q12[ ( ( k >> 1 ) | ( 1 - LSF_interpolation_flag ) ) * MAX_LPC_ORDER ];
         B_Q14      = &LTPCoef_Q14[ k * LTP_ORDER           ];
-        AR_shp_Q13 = &AR2_Q13[     k * MAX_SHAPE_LPC_ORDER ];
+        AR_shp_Q13 = &AR_Q13[     k * MAX_SHAPE_LPC_ORDER ];
 
         /* Noise shape parameters */
         silk_assert( HarmShapeGain_Q14[ k ] >= 0 );
@@ -257,7 +290,7 @@ void silk_NSQ_del_dec_sse4_1(
             }
         }
 
-        silk_nsq_del_dec_scale_states_sse4_1( psEncC, NSQ, psDelDec, x_Q3, x_sc_Q10, sLTP, sLTP_Q15, k,
+        silk_nsq_del_dec_scale_states_sse4_1( psEncC, NSQ, psDelDec, x16, x_sc_Q10, sLTP, sLTP_Q15, k,
             psEncC->nStatesDelayedDecision, LTP_scale_Q14, Gains_Q16, pitchL, psIndices->signalType, decisionDelay );
 
         silk_noise_shape_quantizer_del_dec_sse4_1( NSQ, psDelDec, psIndices->signalType, x_sc_Q10, pulses, pxq, sLTP_Q15,
@@ -265,7 +298,7 @@ void silk_NSQ_del_dec_sse4_1(
             Gains_Q16[ k ], Lambda_Q10, offset_Q10, psEncC->subfr_length, subfr++, psEncC->shapingLPCOrder,
             psEncC->predictLPCOrder, psEncC->warping_Q16, psEncC->nStatesDelayedDecision, &smpl_buf_idx, decisionDelay );
 
-        x_Q3   += psEncC->subfr_length;
+        x16    += psEncC->subfr_length;
         pulses += psEncC->subfr_length;
         pxq    += psEncC->subfr_length;
     }
@@ -288,6 +321,7 @@ void silk_NSQ_del_dec_sse4_1(
     for( i = 0; i < decisionDelay; i++ ) {
         last_smple_idx = ( last_smple_idx - 1 ) % DECISION_DELAY;
         if( last_smple_idx < 0 ) last_smple_idx += DECISION_DELAY;
+
         pulses[   i - decisionDelay ] = (opus_int8)silk_RSHIFT_ROUND( psDD->Q_Q10[ last_smple_idx ], 10 );
         pxq[ i - decisionDelay ] = (opus_int16)silk_SAT16( silk_RSHIFT_ROUND(
             silk_SMULWW( psDD->Xq_Q14[ last_smple_idx ], Gain_Q10 ), 8 ) );
@@ -298,11 +332,19 @@ void silk_NSQ_del_dec_sse4_1(
 
     /* Update states */
     NSQ->sLF_AR_shp_Q14 = psDD->LF_AR_Q14;
+    NSQ->sDiff_shp_Q14  = psDD->Diff_Q14;
     NSQ->lagPrev        = pitchL[ psEncC->nb_subfr - 1 ];
 
     /* Save quantized speech signal */
     silk_memmove( NSQ->xq,           &NSQ->xq[           psEncC->frame_length ], psEncC->ltp_mem_length * sizeof( opus_int16 ) );
     silk_memmove( NSQ->sLTP_shp_Q14, &NSQ->sLTP_shp_Q14[ psEncC->frame_length ], psEncC->ltp_mem_length * sizeof( opus_int32 ) );
+
+#ifdef OPUS_CHECK_ASM
+    silk_assert( !memcmp( &NSQ_c, NSQ, sizeof( NSQ_c ) ) );
+    silk_assert( !memcmp( &psIndices_c, psIndices, sizeof( psIndices_c ) ) );
+    silk_assert( !memcmp( pulses_c, pulses_a, psEncC->nb_subfr * psEncC->subfr_length * sizeof( pulses[0] ) ) );
+#endif
+
     RESTORE_STACK;
 }
 
@@ -345,6 +387,8 @@ static OPUS_INLINE void silk_noise_shape_quantizer_del_dec_sse4_1(
     opus_int32   q1_Q0, q1_Q10, q2_Q10, exc_Q14, LPC_exc_Q14, xq_Q14, Gain_Q10;
     opus_int32   tmp1, tmp2, sLF_AR_shp_Q14;
     opus_int32   *pred_lag_ptr, *shp_lag_ptr, *psLPC_Q14;
+    int rdo_offset;
+
     VARDECL( NSQ_sample_pair, psSampleState );
     NSQ_del_dec_struct *psDD;
     NSQ_sample_struct  *psSS;
@@ -356,6 +400,8 @@ static OPUS_INLINE void silk_noise_shape_quantizer_del_dec_sse4_1(
     celt_assert( nStatesDelayedDecision > 0 );
     ALLOC( psSampleState, nStatesDelayedDecision, NSQ_sample_pair );
 
+    rdo_offset = (Lambda_Q10 >> 1) - 512;
+
     shp_lag_ptr  = &NSQ->sLTP_shp_Q14[ NSQ->sLTP_shp_buf_idx - lag + HARM_SHAPE_FIR_TAPS / 2 ];
     pred_lag_ptr = &sLTP_Q15[ NSQ->sLTP_buf_idx - lag + LTP_ORDER / 2 ];
     Gain_Q10     = silk_RSHIFT( Gain_Q16, 6 );
@@ -382,7 +428,7 @@ static OPUS_INLINE void silk_noise_shape_quantizer_del_dec_sse4_1(
             LTP_pred_Q14 = 2;
             {
                 __m128i tmpa, tmpb, pred_lag_ptr_tmp;
-                pred_lag_ptr_tmp    = _mm_loadu_si128( (__m128i *)(&pred_lag_ptr[ -3 ] ) );
+                pred_lag_ptr_tmp    = _mm_loadu_si128( (__m128i *)(void*)(&pred_lag_ptr[ -3 ] ) );
                 pred_lag_ptr_tmp    = _mm_shuffle_epi32( pred_lag_ptr_tmp, 0x1B );
                 tmpa                = _mm_mul_epi32( pred_lag_ptr_tmp, b_Q12_0123 );
                 tmpa                = _mm_srli_si128( tmpa, 2 );
@@ -407,8 +453,8 @@ static OPUS_INLINE void silk_noise_shape_quantizer_del_dec_sse4_1(
         /* Long-term shaping */
         if( lag > 0 ) {
             /* Symmetric, packed FIR coefficients */
-            n_LTP_Q14 = silk_SMULWB( silk_ADD32( shp_lag_ptr[ 0 ], shp_lag_ptr[ -2 ] ), HarmShapeFIRPacked_Q14 );
-            n_LTP_Q14 = silk_SMLAWT( n_LTP_Q14, shp_lag_ptr[ -1 ],                      HarmShapeFIRPacked_Q14 );
+            n_LTP_Q14 = silk_SMULWB( silk_ADD_SAT32( shp_lag_ptr[ 0 ], shp_lag_ptr[ -2 ] ), HarmShapeFIRPacked_Q14 );
+            n_LTP_Q14 = silk_SMLAWT( n_LTP_Q14, shp_lag_ptr[ -1 ], HarmShapeFIRPacked_Q14 );
             n_LTP_Q14 = silk_SUB_LSHIFT32( LTP_pred_Q14, n_LTP_Q14, 2 );            /* Q12 -> Q14 */
             shp_lag_ptr++;
         } else {
@@ -437,7 +483,7 @@ static OPUS_INLINE void silk_noise_shape_quantizer_del_dec_sse4_1(
                 tmpb = _mm_setzero_si128();
 
                 /* step 1 */
-                psLPC_Q14_tmp   = _mm_loadu_si128( (__m128i *)(&psLPC_Q14[ -3 ] ) ); /* -3, -2 , -1, 0 */
+                psLPC_Q14_tmp   = _mm_loadu_si128( (__m128i *)(void*)(&psLPC_Q14[ -3 ] ) ); /* -3, -2 , -1, 0 */
                 psLPC_Q14_tmp   = _mm_shuffle_epi32( psLPC_Q14_tmp, 0x1B );      /* 0, -1, -2, -3 */
                 tmpa            = _mm_mul_epi32( psLPC_Q14_tmp, a_Q12_0123 );    /* 0, -1, -2, -3 * 0123 -> 0*0, 2*-2 */
 
@@ -451,7 +497,7 @@ static OPUS_INLINE void silk_noise_shape_quantizer_del_dec_sse4_1(
                 tmpb            = _mm_add_epi32( tmpb, psLPC_Q14_tmp );
 
                 /* step 2 */
-                psLPC_Q14_tmp   = _mm_loadu_si128( (__m128i *)(&psLPC_Q14[ -7 ] ) );
+                psLPC_Q14_tmp   = _mm_loadu_si128( (__m128i *)(void*)(&psLPC_Q14[ -7 ] ) );
                 psLPC_Q14_tmp   = _mm_shuffle_epi32( psLPC_Q14_tmp, 0x1B );
                 tmpa            = _mm_mul_epi32( psLPC_Q14_tmp, a_Q12_4567 );
                 tmpa            = _mm_srli_epi64( tmpa, 16 );
@@ -466,7 +512,7 @@ static OPUS_INLINE void silk_noise_shape_quantizer_del_dec_sse4_1(
                 if ( opus_likely( predictLPCOrder == 16 ) )
                 {
                     /* step 3 */
-                    psLPC_Q14_tmp   = _mm_loadu_si128( (__m128i *)(&psLPC_Q14[ -11 ] ) );
+                    psLPC_Q14_tmp   = _mm_loadu_si128( (__m128i *)(void*)(&psLPC_Q14[ -11 ] ) );
                     psLPC_Q14_tmp   = _mm_shuffle_epi32( psLPC_Q14_tmp, 0x1B );
                     tmpa            = _mm_mul_epi32( psLPC_Q14_tmp, a_Q12_89AB );
                     tmpa            = _mm_srli_epi64( tmpa, 16 );
@@ -478,8 +524,8 @@ static OPUS_INLINE void silk_noise_shape_quantizer_del_dec_sse4_1(
                     psLPC_Q14_tmp   = _mm_srli_epi64( psLPC_Q14_tmp, 16 );
                     tmpb            = _mm_add_epi32( tmpb, psLPC_Q14_tmp );
 
-                    /* setp 4 */
-                    psLPC_Q14_tmp   = _mm_loadu_si128( (__m128i *)(&psLPC_Q14[ -15 ] ) );
+                    /* step 4 */
+                    psLPC_Q14_tmp   = _mm_loadu_si128( (__m128i *)(void*)(&psLPC_Q14[ -15 ] ) );
                     psLPC_Q14_tmp   = _mm_shuffle_epi32( psLPC_Q14_tmp, 0x1B );
                     tmpa            = _mm_mul_epi32( psLPC_Q14_tmp, a_Q12_CDEF );
                     tmpa            = _mm_srli_epi64( tmpa, 16 );
@@ -511,22 +557,22 @@ static OPUS_INLINE void silk_noise_shape_quantizer_del_dec_sse4_1(
                 LPC_pred_Q14 = silk_LSHIFT( LPC_pred_Q14, 4 ); /* Q10 -> Q14 */
 
                 /* Noise shape feedback */
-                silk_assert( ( shapingLPCOrder & 1 ) == 0 );   /* check that order is even */
+                celt_assert( ( shapingLPCOrder & 1 ) == 0 );   /* check that order is even */
                 /* Output of lowpass section */
-                tmp2 = silk_SMLAWB( psLPC_Q14[ 0 ], psDD->sAR2_Q14[ 0 ], warping_Q16 );
+                tmp2 = silk_SMLAWB( psDD->Diff_Q14, psDD->sAR2_Q14[ 0 ], warping_Q16 );
                 /* Output of allpass section */
-                tmp1 = silk_SMLAWB( psDD->sAR2_Q14[ 0 ], psDD->sAR2_Q14[ 1 ] - tmp2, warping_Q16 );
+                tmp1 = silk_SMLAWB( psDD->sAR2_Q14[ 0 ], silk_SUB32_ovflw(psDD->sAR2_Q14[ 1 ], tmp2), warping_Q16 );
                 psDD->sAR2_Q14[ 0 ] = tmp2;
                 n_AR_Q14 = silk_RSHIFT( shapingLPCOrder, 1 );
                 n_AR_Q14 = silk_SMLAWB( n_AR_Q14, tmp2, AR_shp_Q13[ 0 ] );
                 /* Loop over allpass sections */
                 for( j = 2; j < shapingLPCOrder; j += 2 ) {
                     /* Output of allpass section */
-                    tmp2 = silk_SMLAWB( psDD->sAR2_Q14[ j - 1 ], psDD->sAR2_Q14[ j + 0 ] - tmp1, warping_Q16 );
+                    tmp2 = silk_SMLAWB( psDD->sAR2_Q14[ j - 1 ], silk_SUB32_ovflw(psDD->sAR2_Q14[ j + 0 ], tmp1), warping_Q16 );
                     psDD->sAR2_Q14[ j - 1 ] = tmp1;
                     n_AR_Q14 = silk_SMLAWB( n_AR_Q14, tmp1, AR_shp_Q13[ j - 1 ] );
                     /* Output of allpass section */
-                    tmp1 = silk_SMLAWB( psDD->sAR2_Q14[ j + 0 ], psDD->sAR2_Q14[ j + 1 ] - tmp2, warping_Q16 );
+                    tmp1 = silk_SMLAWB( psDD->sAR2_Q14[ j + 0 ], silk_SUB32_ovflw(psDD->sAR2_Q14[ j + 1 ], tmp2), warping_Q16 );
                     psDD->sAR2_Q14[ j + 0 ] = tmp2;
                     n_AR_Q14 = silk_SMLAWB( n_AR_Q14, tmp2, AR_shp_Q13[ j ] );
                 }
@@ -543,9 +589,9 @@ static OPUS_INLINE void silk_noise_shape_quantizer_del_dec_sse4_1(
 
                 /* Input minus prediction plus noise feedback                       */
                 /* r = x[ i ] - LTP_pred - LPC_pred + n_AR + n_Tilt + n_LF + n_LTP  */
-                tmp1 = silk_ADD32( n_AR_Q14, n_LF_Q14 );                                    /* Q14 */
-                tmp2 = silk_ADD32( n_LTP_Q14, LPC_pred_Q14 );                               /* Q13 */
-                tmp1 = silk_SUB32( tmp2, tmp1 );                                            /* Q13 */
+                tmp1 = silk_ADD_SAT32( n_AR_Q14, n_LF_Q14 );                                /* Q14 */
+                tmp2 = silk_ADD32_ovflw( n_LTP_Q14, LPC_pred_Q14 );                         /* Q13 */
+                tmp1 = silk_SUB_SAT32( tmp2, tmp1 );                                        /* Q13 */
                 tmp1 = silk_RSHIFT_ROUND( tmp1, 4 );                                        /* Q10 */
 
                 r_Q10 = silk_SUB32( x_Q10[ i ], tmp1 );                                     /* residual error Q10 */
@@ -559,6 +605,18 @@ static OPUS_INLINE void silk_noise_shape_quantizer_del_dec_sse4_1(
                 /* Find two quantization level candidates and measure their rate-distortion */
                 q1_Q10 = silk_SUB32( r_Q10, offset_Q10 );
                 q1_Q0 = silk_RSHIFT( q1_Q10, 10 );
+                if (Lambda_Q10 > 2048) {
+                    /* For aggressive RDO, the bias becomes more than one pulse. */
+                    if (q1_Q10 > rdo_offset) {
+                        q1_Q0 = silk_RSHIFT( q1_Q10 - rdo_offset, 10 );
+                    } else if (q1_Q10 < -rdo_offset) {
+                        q1_Q0 = silk_RSHIFT( q1_Q10 + rdo_offset, 10 );
+                    } else if (q1_Q10 < 0) {
+                        q1_Q0 = -1;
+                    } else {
+                        q1_Q0 = 0;
+                    }
+                }
                 if( q1_Q0 > 0 ) {
                     q1_Q10  = silk_SUB32( silk_LSHIFT( q1_Q0, 10 ), QUANT_LEVEL_ADJUST_Q10 );
                     q1_Q10  = silk_ADD32( q1_Q10, offset_Q10 );
@@ -609,11 +667,12 @@ static OPUS_INLINE void silk_noise_shape_quantizer_del_dec_sse4_1(
 
                 /* Add predictions */
                 LPC_exc_Q14 = silk_ADD32( exc_Q14, LTP_pred_Q14 );
-                xq_Q14      = silk_ADD32( LPC_exc_Q14, LPC_pred_Q14 );
+                xq_Q14      = silk_ADD32_ovflw( LPC_exc_Q14, LPC_pred_Q14 );
 
                 /* Update states */
-                sLF_AR_shp_Q14         = silk_SUB32( xq_Q14, n_AR_Q14 );
-                psSS[ 0 ].sLTP_shp_Q14 = silk_SUB32( sLF_AR_shp_Q14, n_LF_Q14 );
+                psSS[ 0 ].Diff_Q14     = silk_SUB32_ovflw( xq_Q14, silk_LSHIFT32( x_Q10[ i ], 4 ) );
+                sLF_AR_shp_Q14         = silk_SUB32_ovflw( psSS[ 0 ].Diff_Q14, n_AR_Q14 );
+                psSS[ 0 ].sLTP_shp_Q14 = silk_SUB_SAT32( sLF_AR_shp_Q14, n_LF_Q14 );
                 psSS[ 0 ].LF_AR_Q14    = sLF_AR_shp_Q14;
                 psSS[ 0 ].LPC_exc_Q14  = LPC_exc_Q14;
                 psSS[ 0 ].xq_Q14       = xq_Q14;
@@ -626,14 +685,14 @@ static OPUS_INLINE void silk_noise_shape_quantizer_del_dec_sse4_1(
                     exc_Q14 = -exc_Q14;
                 }
 
-
                 /* Add predictions */
                 LPC_exc_Q14 = silk_ADD32( exc_Q14, LTP_pred_Q14 );
-                xq_Q14      = silk_ADD32( LPC_exc_Q14, LPC_pred_Q14 );
+                xq_Q14      = silk_ADD32_ovflw( LPC_exc_Q14, LPC_pred_Q14 );
 
                 /* Update states */
-                sLF_AR_shp_Q14         = silk_SUB32( xq_Q14, n_AR_Q14 );
-                psSS[ 1 ].sLTP_shp_Q14 = silk_SUB32( sLF_AR_shp_Q14, n_LF_Q14 );
+                psSS[ 1 ].Diff_Q14     = silk_SUB32_ovflw( xq_Q14, silk_LSHIFT32( x_Q10[ i ], 4 ) );
+                sLF_AR_shp_Q14         = silk_SUB32_ovflw( psSS[ 1 ].Diff_Q14, n_AR_Q14 );
+                psSS[ 1 ].sLTP_shp_Q14 = silk_SUB_SAT32( sLF_AR_shp_Q14, n_LF_Q14 );
                 psSS[ 1 ].LF_AR_Q14    = sLF_AR_shp_Q14;
                 psSS[ 1 ].LPC_exc_Q14  = LPC_exc_Q14;
                 psSS[ 1 ].xq_Q14       = xq_Q14;
@@ -705,6 +764,7 @@ static OPUS_INLINE void silk_noise_shape_quantizer_del_dec_sse4_1(
             psDD                                     = &psDelDec[ k ];
             psSS                                     = &psSampleState[ k ][ 0 ];
             psDD->LF_AR_Q14                          = psSS->LF_AR_Q14;
+            psDD->Diff_Q14                           = psSS->Diff_Q14;
             psDD->sLPC_Q14[ NSQ_LPC_BUF_LENGTH + i ] = psSS->xq_Q14;
             psDD->Xq_Q14[    *smpl_buf_idx ]         = psSS->xq_Q14;
             psDD->Q_Q10[     *smpl_buf_idx ]         = psSS->Q_Q10;
@@ -728,7 +788,7 @@ static OPUS_INLINE void silk_nsq_del_dec_scale_states_sse4_1(
     const silk_encoder_state *psEncC,               /* I    Encoder State                       */
     silk_nsq_state      *NSQ,                       /* I/O  NSQ state                           */
     NSQ_del_dec_struct  psDelDec[],                 /* I/O  Delayed decision states             */
-    const opus_int32    x_Q3[],                     /* I    Input in Q3                         */
+    const opus_int16    x16[],                      /* I    Input                               */
     opus_int32          x_sc_Q10[],                 /* O    Input scaled with 1/Gain in Q10     */
     const opus_int16    sLTP[],                     /* I    Re-whitened LTP state in Q0         */
     opus_int32          sLTP_Q15[],                 /* O    LTP state matching scaled input     */
@@ -742,51 +802,41 @@ static OPUS_INLINE void silk_nsq_del_dec_scale_states_sse4_1(
 )
 {
     opus_int            i, k, lag;
-    opus_int32          gain_adj_Q16, inv_gain_Q31, inv_gain_Q23;
+    opus_int32          gain_adj_Q16, inv_gain_Q31, inv_gain_Q26;
     NSQ_del_dec_struct  *psDD;
-    __m128i xmm_inv_gain_Q23, xmm_x_Q3_x2x0, xmm_x_Q3_x3x1;
+    __m128i xmm_inv_gain_Q26, xmm_x16_x2x0, xmm_x16_x3x1;
 
     lag          = pitchL[ subfr ];
     inv_gain_Q31 = silk_INVERSE32_varQ( silk_max( Gains_Q16[ subfr ], 1 ), 47 );
-
     silk_assert( inv_gain_Q31 != 0 );
 
-    /* Calculate gain adjustment factor */
-    if( Gains_Q16[ subfr ] != NSQ->prev_gain_Q16 ) {
-        gain_adj_Q16 =  silk_DIV32_varQ( NSQ->prev_gain_Q16, Gains_Q16[ subfr ], 16 );
-    } else {
-        gain_adj_Q16 = (opus_int32)1 << 16;
-    }
-
     /* Scale input */
-    inv_gain_Q23 = silk_RSHIFT_ROUND( inv_gain_Q31, 8 );
+    inv_gain_Q26 = silk_RSHIFT_ROUND( inv_gain_Q31, 5 );
 
-    /* prepare inv_gain_Q23 in packed 4 32-bits */
-    xmm_inv_gain_Q23 = _mm_set1_epi32(inv_gain_Q23);
+    /* broadcast inv_gain_Q26 to all four 32-bit lanes */
+    xmm_inv_gain_Q26 = _mm_set1_epi32(inv_gain_Q26);
 
     for( i = 0; i < psEncC->subfr_length - 3; i += 4 ) {
-        xmm_x_Q3_x2x0 = _mm_loadu_si128( (__m128i *)(&(x_Q3[ i ] ) ) );
+        xmm_x16_x2x0 = OP_CVTEPI16_EPI32_M64( &(x16[ i ] ) );
+
         /* equal shift right 4 bytes*/
-        xmm_x_Q3_x3x1 = _mm_shuffle_epi32( xmm_x_Q3_x2x0, _MM_SHUFFLE( 0, 3, 2, 1 ) );
+        xmm_x16_x3x1 = _mm_shuffle_epi32( xmm_x16_x2x0, _MM_SHUFFLE( 0, 3, 2, 1 ) );
 
-        xmm_x_Q3_x2x0 = _mm_mul_epi32( xmm_x_Q3_x2x0, xmm_inv_gain_Q23 );
-        xmm_x_Q3_x3x1 = _mm_mul_epi32( xmm_x_Q3_x3x1, xmm_inv_gain_Q23 );
+        xmm_x16_x2x0 = _mm_mul_epi32( xmm_x16_x2x0, xmm_inv_gain_Q26 );
+        xmm_x16_x3x1 = _mm_mul_epi32( xmm_x16_x3x1, xmm_inv_gain_Q26 );
 
-        xmm_x_Q3_x2x0 = _mm_srli_epi64( xmm_x_Q3_x2x0, 16 );
-        xmm_x_Q3_x3x1 = _mm_slli_epi64( xmm_x_Q3_x3x1, 16 );
+        xmm_x16_x2x0 = _mm_srli_epi64( xmm_x16_x2x0, 16 );
+        xmm_x16_x3x1 = _mm_slli_epi64( xmm_x16_x3x1, 16 );
 
-        xmm_x_Q3_x2x0 = _mm_blend_epi16( xmm_x_Q3_x2x0, xmm_x_Q3_x3x1, 0xCC );
+        xmm_x16_x2x0 = _mm_blend_epi16( xmm_x16_x2x0, xmm_x16_x3x1, 0xCC );
 
-        _mm_storeu_si128( (__m128i *)(&(x_sc_Q10[ i ])), xmm_x_Q3_x2x0 );
+        _mm_storeu_si128( (__m128i *)(void*)(&(x_sc_Q10[ i ] ) ), xmm_x16_x2x0 );
     }
 
     for( ; i < psEncC->subfr_length; i++ ) {
-        x_sc_Q10[ i ] = silk_SMULWW( x_Q3[ i ], inv_gain_Q23 );
+        x_sc_Q10[ i ] = silk_SMULWW( x16[ i ], inv_gain_Q26 );
     }
 
-    /* Save inverse gain */
-    NSQ->prev_gain_Q16 = Gains_Q16[ subfr ];
-
     /* After rewhitening the LTP state is un-scaled, so scale with inv_gain_Q16 */
     if( NSQ->rewhite_flag ) {
         if( subfr == 0 ) {
@@ -800,7 +850,9 @@ static OPUS_INLINE void silk_nsq_del_dec_scale_states_sse4_1(
     }
 
     /* Adjust for changing gain */
-    if( gain_adj_Q16 != (opus_int32)1 << 16 ) {
+    if( Gains_Q16[ subfr ] != NSQ->prev_gain_Q16 ) {
+        gain_adj_Q16 =  silk_DIV32_varQ( NSQ->prev_gain_Q16, Gains_Q16[ subfr ], 16 );
+
         /* Scale long-term shaping state */
         {
             __m128i xmm_gain_adj_Q16, xmm_sLTP_shp_Q14_x2x0, xmm_sLTP_shp_Q14_x3x1;
@@ -810,7 +862,7 @@ static OPUS_INLINE void silk_nsq_del_dec_scale_states_sse4_1(
 
             for( i = NSQ->sLTP_shp_buf_idx - psEncC->ltp_mem_length; i < NSQ->sLTP_shp_buf_idx - 3; i += 4 )
             {
-                xmm_sLTP_shp_Q14_x2x0 = _mm_loadu_si128( (__m128i *)(&(NSQ->sLTP_shp_Q14[ i ] ) ) );
+                xmm_sLTP_shp_Q14_x2x0 = _mm_loadu_si128( (__m128i *)(void*)(&(NSQ->sLTP_shp_Q14[ i ] ) ) );
                 /* equal shift right 4 bytes*/
                 xmm_sLTP_shp_Q14_x3x1 = _mm_shuffle_epi32( xmm_sLTP_shp_Q14_x2x0, _MM_SHUFFLE( 0, 3, 2, 1 ) );
 
@@ -822,7 +874,7 @@ static OPUS_INLINE void silk_nsq_del_dec_scale_states_sse4_1(
 
                 xmm_sLTP_shp_Q14_x2x0 = _mm_blend_epi16( xmm_sLTP_shp_Q14_x2x0, xmm_sLTP_shp_Q14_x3x1, 0xCC );
 
-                _mm_storeu_si128( (__m128i *)(&(NSQ->sLTP_shp_Q14[ i ] ) ), xmm_sLTP_shp_Q14_x2x0 );
+                _mm_storeu_si128( (__m128i *)(void*)(&(NSQ->sLTP_shp_Q14[ i ] ) ), xmm_sLTP_shp_Q14_x2x0 );
             }
 
             for( ; i < NSQ->sLTP_shp_buf_idx; i++ ) {
@@ -841,6 +893,7 @@ static OPUS_INLINE void silk_nsq_del_dec_scale_states_sse4_1(
 
                 /* Scale scalar states */
                 psDD->LF_AR_Q14 = silk_SMULWW( gain_adj_Q16, psDD->LF_AR_Q14 );
+                psDD->Diff_Q14 = silk_SMULWW( gain_adj_Q16, psDD->Diff_Q14 );
 
                 /* Scale short-term prediction and shaping states */
                 for( i = 0; i < NSQ_LPC_BUF_LENGTH; i++ ) {
@@ -855,5 +908,8 @@ static OPUS_INLINE void silk_nsq_del_dec_scale_states_sse4_1(
                 }
             }
         }
+
+        /* Save the gain (not its inverse); the gain-change check above compares against it */
+        NSQ->prev_gain_Q16 = Gains_Q16[ subfr ];
     }
 }
diff --git a/opus/silk/x86/NSQ_sse4_1.c b/opus/silk/x86/NSQ_sse4_1.c
index b0315e35..3c9aca7b 100644
--- a/opus/silk/x86/NSQ_sse4_1.c
+++ b/opus/silk/x86/NSQ_sse4_1.c
@@ -1,5 +1,5 @@
-/* Copyright (c) 2014, Cisco Systems, INC
-   Written by XiangMingZhu WeiZhou MinPeng YanWang
+/* Copyright (c) 2014-2020, Cisco Systems, INC
+   Written by XiangMingZhu WeiZhou MinPeng YanWang FrancisQuiers
 
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions
@@ -37,17 +37,17 @@
 #include "stack_alloc.h"
 
 static OPUS_INLINE void silk_nsq_scale_states_sse4_1(
-    const silk_encoder_state *psEncC,           /* I    Encoder State                   */
-    silk_nsq_state      *NSQ,                   /* I/O  NSQ state                       */
-    const opus_int32    x_Q3[],                 /* I    input in Q3                     */
-    opus_int32          x_sc_Q10[],             /* O    input scaled with 1/Gain        */
-    const opus_int16    sLTP[],                 /* I    re-whitened LTP state in Q0     */
-    opus_int32          sLTP_Q15[],             /* O    LTP state matching scaled input */
-    opus_int            subfr,                  /* I    subframe number                 */
-    const opus_int      LTP_scale_Q14,          /* I                                    */
-    const opus_int32    Gains_Q16[ MAX_NB_SUBFR ], /* I                                 */
-    const opus_int      pitchL[ MAX_NB_SUBFR ], /* I    Pitch lag                       */
-    const opus_int      signal_type             /* I    Signal type                     */
+    const silk_encoder_state *psEncC,              /* I    Encoder State                   */
+    silk_nsq_state      *NSQ,                      /* I/O  NSQ state                       */
+    const opus_int16    x16[],                     /* I    input                           */
+    opus_int32          x_sc_Q10[],                /* O    input scaled with 1/Gain        */
+    const opus_int16    sLTP[],                    /* I    re-whitened LTP state in Q0     */
+    opus_int32          sLTP_Q15[],                /* O    LTP state matching scaled input */
+    opus_int            subfr,                     /* I    subframe number                 */
+    const opus_int      LTP_scale_Q14,             /* I                                    */
+    const opus_int32    Gains_Q16[ MAX_NB_SUBFR ], /* I                                    */
+    const opus_int      pitchL[ MAX_NB_SUBFR ],    /* I    Pitch lag                       */
+    const opus_int      signal_type                /* I    Signal type                     */
 );
 
 static OPUS_INLINE void silk_noise_shape_quantizer_10_16_sse4_1(
@@ -65,27 +65,28 @@ static OPUS_INLINE void silk_noise_shape_quantizer_10_16_sse4_1(
     opus_int            Tilt_Q14,               /* I    Spectral tilt                   */
     opus_int32          LF_shp_Q14,             /* I                                    */
     opus_int32          Gain_Q16,               /* I                                    */
+    opus_int            Lambda_Q10,             /* I                                    */
     opus_int            offset_Q10,             /* I                                    */
     opus_int            length,                 /* I    Input length                    */
     opus_int32          table[][4]              /* I                                    */
 );
 
 void silk_NSQ_sse4_1(
-    const silk_encoder_state    *psEncC,                                    /* I    Encoder State                   */
-    silk_nsq_state              *NSQ,                                       /* I/O  NSQ state                       */
-    SideInfoIndices             *psIndices,                                 /* I/O  Quantization Indices            */
-    const opus_int32            x_Q3[],                                     /* I    Prefiltered input signal        */
-    opus_int8                   pulses[],                                   /* O    Quantized pulse signal          */
-    const opus_int16            PredCoef_Q12[ 2 * MAX_LPC_ORDER ],          /* I    Short term prediction coefs     */
-    const opus_int16            LTPCoef_Q14[ LTP_ORDER * MAX_NB_SUBFR ],    /* I    Long term prediction coefs      */
-    const opus_int16            AR2_Q13[ MAX_NB_SUBFR * MAX_SHAPE_LPC_ORDER ], /* I Noise shaping coefs             */
-    const opus_int              HarmShapeGain_Q14[ MAX_NB_SUBFR ],          /* I    Long term shaping coefs         */
-    const opus_int              Tilt_Q14[ MAX_NB_SUBFR ],                   /* I    Spectral tilt                   */
-    const opus_int32            LF_shp_Q14[ MAX_NB_SUBFR ],                 /* I    Low frequency shaping coefs     */
-    const opus_int32            Gains_Q16[ MAX_NB_SUBFR ],                  /* I    Quantization step sizes         */
-    const opus_int              pitchL[ MAX_NB_SUBFR ],                     /* I    Pitch lags                      */
-    const opus_int              Lambda_Q10,                                 /* I    Rate/distortion tradeoff        */
-    const opus_int              LTP_scale_Q14                               /* I    LTP state scaling               */
+    const silk_encoder_state    *psEncC,                                      /* I    Encoder State                   */
+    silk_nsq_state              *NSQ,                                         /* I/O  NSQ state                       */
+    SideInfoIndices             *psIndices,                                   /* I/O  Quantization Indices            */
+    const opus_int16            x16[],                                        /* I    Input                           */
+    opus_int8                   pulses[],                                     /* O    Quantized pulse signal          */
+    const opus_int16            *PredCoef_Q12,                                /* I    Short term prediction coefs     */
+    const opus_int16            LTPCoef_Q14[ LTP_ORDER * MAX_NB_SUBFR ],      /* I    Long term prediction coefs      */
+    const opus_int16            AR_Q13[ MAX_NB_SUBFR * MAX_SHAPE_LPC_ORDER ], /* I    Noise shaping coefs             */
+    const opus_int              HarmShapeGain_Q14[ MAX_NB_SUBFR ],            /* I    Long term shaping coefs         */
+    const opus_int              Tilt_Q14[ MAX_NB_SUBFR ],                     /* I    Spectral tilt                   */
+    const opus_int32            LF_shp_Q14[ MAX_NB_SUBFR ],                   /* I    Low frequency shaping coefs     */
+    const opus_int32            Gains_Q16[ MAX_NB_SUBFR ],                    /* I    Quantization step sizes         */
+    const opus_int              pitchL[ MAX_NB_SUBFR ],                       /* I    Pitch lags                      */
+    const opus_int              Lambda_Q10,                                   /* I    Rate/distortion tradeoff        */
+    const opus_int              LTP_scale_Q14                                 /* I    LTP state scaling               */
 )
 {
     opus_int            k, lag, start_idx, LSF_interpolation_flag;
@@ -101,8 +102,41 @@ void silk_NSQ_sse4_1(
     opus_int32   tmp1;
     opus_int32   q1_Q10, q2_Q10, rd1_Q20, rd2_Q20;
 
+#ifdef OPUS_CHECK_ASM
+    silk_nsq_state NSQ_c;
+    SideInfoIndices psIndices_c;
+    opus_int8 pulses_c[ MAX_FRAME_LENGTH ];
+    const opus_int8 *const pulses_a = pulses;
+#endif
+
     SAVE_STACK;
 
+#ifdef OPUS_CHECK_ASM
+    ( void )pulses_a;
+    silk_memcpy( &NSQ_c, NSQ, sizeof( NSQ_c ) );
+    silk_memcpy( &psIndices_c, psIndices, sizeof( psIndices_c ) );
+    silk_assert( psEncC->nb_subfr * psEncC->subfr_length <= MAX_FRAME_LENGTH );
+    silk_memcpy( pulses_c, pulses, psEncC->nb_subfr * psEncC->subfr_length * sizeof( pulses[0] ) );
+
+    silk_NSQ_c(
+        psEncC,
+        &NSQ_c,
+        &psIndices_c,
+        x16,
+        pulses_c,
+        PredCoef_Q12,
+        LTPCoef_Q14,
+        AR_Q13,
+        HarmShapeGain_Q14,
+        Tilt_Q14,
+        LF_shp_Q14,
+        Gains_Q16,
+        pitchL,
+        Lambda_Q10,
+        LTP_scale_Q14
+    );
+#endif
+
     NSQ->rand_seed = psIndices->Seed;
 
     /* Set unvoiced lag to the previous one, overwrite later for voiced */
@@ -172,8 +206,7 @@ void silk_NSQ_sse4_1(
         LSF_interpolation_flag = 1;
     }
 
-    ALLOC( sLTP_Q15,
-           psEncC->ltp_mem_length + psEncC->frame_length, opus_int32 );
+    ALLOC( sLTP_Q15, psEncC->ltp_mem_length + psEncC->frame_length, opus_int32 );
     ALLOC( sLTP, psEncC->ltp_mem_length + psEncC->frame_length, opus_int16 );
     ALLOC( x_sc_Q10, psEncC->subfr_length, opus_int32 );
     /* Set up pointers to start of sub frame */
@@ -183,7 +216,7 @@ void silk_NSQ_sse4_1(
     for( k = 0; k < psEncC->nb_subfr; k++ ) {
         A_Q12      = &PredCoef_Q12[ (( k >> 1 ) | ( 1 - LSF_interpolation_flag )) * MAX_LPC_ORDER ];
         B_Q14      = &LTPCoef_Q14[ k * LTP_ORDER ];
-        AR_shp_Q13 = &AR2_Q13[     k * MAX_SHAPE_LPC_ORDER ];
+        AR_shp_Q13 = &AR_Q13[ k * MAX_SHAPE_LPC_ORDER ];
 
         /* Noise shape parameters */
         silk_assert( HarmShapeGain_Q14[ k ] >= 0 );
@@ -209,12 +242,12 @@ void silk_NSQ_sse4_1(
             }
         }
 
-        silk_nsq_scale_states_sse4_1( psEncC, NSQ, x_Q3, x_sc_Q10, sLTP, sLTP_Q15, k, LTP_scale_Q14, Gains_Q16, pitchL, psIndices->signalType );
+        silk_nsq_scale_states_sse4_1( psEncC, NSQ, x16, x_sc_Q10, sLTP, sLTP_Q15, k, LTP_scale_Q14, Gains_Q16, pitchL, psIndices->signalType );
 
         if ( opus_likely( ( 10 == psEncC->shapingLPCOrder ) && ( 16 == psEncC->predictLPCOrder) ) )
         {
             silk_noise_shape_quantizer_10_16_sse4_1( NSQ, psIndices->signalType, x_sc_Q10, pulses, pxq, sLTP_Q15, A_Q12, B_Q14,
-                AR_shp_Q13, lag, HarmShapeFIRPacked_Q14, Tilt_Q14[ k ], LF_shp_Q14[ k ], Gains_Q16[ k ],
+                AR_shp_Q13, lag, HarmShapeFIRPacked_Q14, Tilt_Q14[ k ], LF_shp_Q14[ k ], Gains_Q16[ k ], Lambda_Q10,
                 offset_Q10, psEncC->subfr_length, &(table[32]) );
         }
         else
@@ -224,7 +257,7 @@ void silk_NSQ_sse4_1(
                 offset_Q10, psEncC->subfr_length, psEncC->shapingLPCOrder, psEncC->predictLPCOrder, psEncC->arch );
         }
 
-        x_Q3   += psEncC->subfr_length;
+        x16    += psEncC->subfr_length;
         pulses += psEncC->subfr_length;
         pxq    += psEncC->subfr_length;
     }
@@ -235,12 +268,19 @@ void silk_NSQ_sse4_1(
     /* Save quantized speech and noise shaping signals */
     silk_memmove( NSQ->xq,           &NSQ->xq[           psEncC->frame_length ], psEncC->ltp_mem_length * sizeof( opus_int16 ) );
     silk_memmove( NSQ->sLTP_shp_Q14, &NSQ->sLTP_shp_Q14[ psEncC->frame_length ], psEncC->ltp_mem_length * sizeof( opus_int32 ) );
+
+#ifdef OPUS_CHECK_ASM
+    silk_assert( !memcmp( &NSQ_c, NSQ, sizeof( NSQ_c ) ) );
+    silk_assert( !memcmp( &psIndices_c, psIndices, sizeof( psIndices_c ) ) );
+    silk_assert( !memcmp( pulses_c, pulses_a, psEncC->nb_subfr * psEncC->subfr_length * sizeof( pulses[0] ) ) );
+#endif
+
     RESTORE_STACK;
 }
 
-/***********************************/
-/* silk_noise_shape_quantizer_10_16  */
-/***********************************/
+/************************************/
+/* silk_noise_shape_quantizer_10_16 */
+/************************************/
 static OPUS_INLINE void silk_noise_shape_quantizer_10_16_sse4_1(
     silk_nsq_state      *NSQ,                   /* I/O  NSQ state                       */
     opus_int            signalType,             /* I    Signal type                     */
@@ -256,6 +296,7 @@ static OPUS_INLINE void silk_noise_shape_quantizer_10_16_sse4_1(
     opus_int            Tilt_Q14,               /* I    Spectral tilt                   */
     opus_int32          LF_shp_Q14,             /* I                                    */
     opus_int32          Gain_Q16,               /* I                                    */
+    opus_int            Lambda_Q10,             /* I                                    */
     opus_int            offset_Q10,             /* I                                    */
     opus_int            length,                 /* I    Input length                    */
     opus_int32          table[][4]              /* I                                    */
@@ -264,7 +305,7 @@ static OPUS_INLINE void silk_noise_shape_quantizer_10_16_sse4_1(
     opus_int     i;
     opus_int32   LTP_pred_Q13, LPC_pred_Q10, n_AR_Q12, n_LTP_Q13;
     opus_int32   n_LF_Q12, r_Q10, q1_Q0, q1_Q10, q2_Q10;
-    opus_int32   exc_Q14, LPC_exc_Q14, xq_Q14, Gain_Q10;
+    opus_int32   exc_Q14, LPC_exc_Q14, xq_Q14, Gain_Q10, sDiff_shp_Q14;
     opus_int32   tmp1, tmp2, sLF_AR_shp_Q14;
     opus_int32   *psLPC_Q14, *shp_lag_ptr, *pred_lag_ptr;
 
@@ -279,6 +320,8 @@ static OPUS_INLINE void silk_noise_shape_quantizer_10_16_sse4_1(
     __m128i sAR2_Q14_hi_76543210, sAR2_Q14_lo_76543210;
     __m128i AR_shp_Q13_76543210;
 
+    int rdo_offset = (Lambda_Q10 >> 1) - 512;
+
     shp_lag_ptr  = &NSQ->sLTP_shp_Q14[ NSQ->sLTP_shp_buf_idx - lag + HARM_SHAPE_FIR_TAPS / 2 ];
     pred_lag_ptr = &sLTP_Q15[ NSQ->sLTP_buf_idx - lag + LTP_ORDER / 2 ];
     Gain_Q10     = silk_RSHIFT( Gain_Q16, 6 );
@@ -288,27 +331,28 @@ static OPUS_INLINE void silk_noise_shape_quantizer_10_16_sse4_1(
 
     sLF_AR_shp_Q14 = NSQ->sLF_AR_shp_Q14;
     xq_Q14         = psLPC_Q14[ 0 ];
+    sDiff_shp_Q14  = NSQ->sDiff_shp_Q14;
     LTP_pred_Q13   = 0;
 
     /* load a_Q12 */
     xmm_one = _mm_set_epi8( 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14 );
 
     /* load a_Q12[0] - a_Q12[7] */
-    a_Q12_01234567 = _mm_loadu_si128( (__m128i *)(&a_Q12[ 0 ] ) );
+    a_Q12_01234567 = _mm_loadu_si128( (__m128i *)(void*)(&a_Q12[ 0 ] ) );
     /* load a_Q12[ 8 ] - a_Q12[ 15 ] */
-    a_Q12_89ABCDEF = _mm_loadu_si128( (__m128i *)(&a_Q12[ 8 ] ) );
+    a_Q12_89ABCDEF = _mm_loadu_si128( (__m128i *)(void*)(&a_Q12[ 8 ] ) );
 
     a_Q12_01234567 = _mm_shuffle_epi8( a_Q12_01234567, xmm_one );
     a_Q12_89ABCDEF = _mm_shuffle_epi8( a_Q12_89ABCDEF, xmm_one );
 
     /* load AR_shp_Q13 */
-    AR_shp_Q13_76543210 = _mm_loadu_si128( (__m128i *)(&AR_shp_Q13[0] ) );
+    AR_shp_Q13_76543210 = _mm_loadu_si128( (__m128i *)(void*)(&AR_shp_Q13[0] ) );
 
     /* load psLPC_Q14 */
     xmm_one = _mm_set_epi8(15, 14, 11, 10, 7, 6, 3, 2, 13, 12, 9, 8, 5, 4, 1, 0 );
 
-    xmm_tempa = _mm_loadu_si128( (__m128i *)(&psLPC_Q14[-16]) );
-    xmm_tempb = _mm_loadu_si128( (__m128i *)(&psLPC_Q14[-12]) );
+    xmm_tempa = _mm_loadu_si128( (__m128i *)(void*)(&psLPC_Q14[-16]) );
+    xmm_tempb = _mm_loadu_si128( (__m128i *)(void*)(&psLPC_Q14[-12]) );
 
     xmm_tempa = _mm_shuffle_epi8( xmm_tempa, xmm_one );
     xmm_tempb = _mm_shuffle_epi8( xmm_tempb, xmm_one );
@@ -316,8 +360,8 @@ static OPUS_INLINE void silk_noise_shape_quantizer_10_16_sse4_1(
     psLPC_Q14_hi_89ABCDEF = _mm_unpackhi_epi64( xmm_tempa, xmm_tempb );
     psLPC_Q14_lo_89ABCDEF = _mm_unpacklo_epi64( xmm_tempa, xmm_tempb );
 
-    xmm_tempa = _mm_loadu_si128( (__m128i *)(&psLPC_Q14[ -8 ]) );
-    xmm_tempb = _mm_loadu_si128( (__m128i *)(&psLPC_Q14[ -4 ]) );
+    xmm_tempa = _mm_loadu_si128( (__m128i *)(void*)(&psLPC_Q14[ -8 ]) );
+    xmm_tempb = _mm_loadu_si128( (__m128i *)(void*)(&psLPC_Q14[ -4 ]) );
 
     xmm_tempa = _mm_shuffle_epi8( xmm_tempa, xmm_one );
     xmm_tempb = _mm_shuffle_epi8( xmm_tempb, xmm_one );
@@ -326,8 +370,8 @@ static OPUS_INLINE void silk_noise_shape_quantizer_10_16_sse4_1(
     psLPC_Q14_lo_01234567 = _mm_unpacklo_epi64( xmm_tempa, xmm_tempb );
 
     /* load sAR2_Q14 */
-    xmm_tempa = _mm_loadu_si128( (__m128i *)(&(NSQ->sAR2_Q14[ 0 ]) ) );
-    xmm_tempb = _mm_loadu_si128( (__m128i *)(&(NSQ->sAR2_Q14[ 4 ]) ) );
+    xmm_tempa = _mm_loadu_si128( (__m128i *)(void*)(&(NSQ->sAR2_Q14[ 0 ]) ) );
+    xmm_tempb = _mm_loadu_si128( (__m128i *)(void*)(&(NSQ->sAR2_Q14[ 4 ]) ) );
 
     xmm_tempa = _mm_shuffle_epi8( xmm_tempa, xmm_one );
     xmm_tempb = _mm_shuffle_epi8( xmm_tempb, xmm_one );
@@ -399,7 +443,7 @@ static OPUS_INLINE void silk_noise_shape_quantizer_10_16_sse4_1(
                 b_Q14_0123 = _mm_shuffle_epi32( b_Q14_3210, 0x1B );
 
                 /* loaded: [0] [-1] [-2] [-3] */
-                pred_lag_ptr_0123 = _mm_loadu_si128( (__m128i *)(&pred_lag_ptr[ -3 ] ) );
+                pred_lag_ptr_0123 = _mm_loadu_si128( (__m128i *)(void*)(&pred_lag_ptr[ -3 ] ) );
                 /* shuffle to [-3] [-2] [-1] [0] and to new xmm */
                 xmm_tempa = _mm_shuffle_epi32( pred_lag_ptr_0123, 0x1B );
                 /*64-bit multiply, a[2] * b[-2], a[0] * b[0] */
@@ -430,8 +474,8 @@ static OPUS_INLINE void silk_noise_shape_quantizer_10_16_sse4_1(
         sAR2_Q14_hi_76543210 = _mm_slli_si128( sAR2_Q14_hi_76543210, 2 );
         sAR2_Q14_lo_76543210 = _mm_slli_si128( sAR2_Q14_lo_76543210, 2 );
 
-        sAR2_Q14_hi_76543210 = _mm_insert_epi16( sAR2_Q14_hi_76543210, (xq_Q14 >> 16), 0 );
-        sAR2_Q14_lo_76543210 = _mm_insert_epi16( sAR2_Q14_lo_76543210, (xq_Q14),       0 );
+        sAR2_Q14_hi_76543210 = _mm_insert_epi16( sAR2_Q14_hi_76543210, (sDiff_shp_Q14 >> 16), 0 );
+        sAR2_Q14_lo_76543210 = _mm_insert_epi16( sAR2_Q14_lo_76543210, (sDiff_shp_Q14),       0 );
 
         /* high part, use pmaddwd, results in 4 32-bit */
         xmm_hi_07 = _mm_madd_epi16( sAR2_Q14_hi_76543210, AR_shp_Q13_76543210 );
@@ -462,14 +506,14 @@ static OPUS_INLINE void silk_noise_shape_quantizer_10_16_sse4_1(
         n_LF_Q12 = silk_SMULWB( NSQ->sLTP_shp_Q14[ NSQ->sLTP_shp_buf_idx - 1 ], LF_shp_Q14 );
         n_LF_Q12 = silk_SMLAWT( n_LF_Q12, sLF_AR_shp_Q14, LF_shp_Q14 );
 
-        silk_assert( lag > 0 || signalType != TYPE_VOICED );
+        celt_assert( lag > 0 || signalType != TYPE_VOICED );
 
         /* Combine prediction and noise shaping signals */
         tmp1 = silk_SUB32( silk_LSHIFT32( LPC_pred_Q10, 2 ), n_AR_Q12 );        /* Q12 */
         tmp1 = silk_SUB32( tmp1, n_LF_Q12 );                                    /* Q12 */
         if( lag > 0 ) {
             /* Symmetric, packed FIR coefficients */
-            n_LTP_Q13 = silk_SMULWB( silk_ADD32( shp_lag_ptr[ 0 ], shp_lag_ptr[ -2 ] ), HarmShapeFIRPacked_Q14 );
+            n_LTP_Q13 = silk_SMULWB( silk_ADD_SAT32( shp_lag_ptr[ 0 ], shp_lag_ptr[ -2 ] ), HarmShapeFIRPacked_Q14 );
             n_LTP_Q13 = silk_SMLAWT( n_LTP_Q13, shp_lag_ptr[ -1 ],                      HarmShapeFIRPacked_Q14 );
             n_LTP_Q13 = silk_LSHIFT( n_LTP_Q13, 1 );
             shp_lag_ptr++;
@@ -495,6 +539,18 @@ static OPUS_INLINE void silk_noise_shape_quantizer_10_16_sse4_1(
         /* Find two quantization level candidates and measure their rate-distortion */
         q1_Q10 = silk_SUB32( r_Q10, offset_Q10 );
         q1_Q0 = silk_RSHIFT( q1_Q10, 10 );
+        if (Lambda_Q10 > 2048) {
+            /* For aggressive RDO, the bias becomes more than one pulse. */
+            if (q1_Q10 > rdo_offset) {
+                q1_Q0 = silk_RSHIFT( q1_Q10 - rdo_offset, 10 );
+            } else if (q1_Q10 < -rdo_offset) {
+                q1_Q0 = silk_RSHIFT( q1_Q10 + rdo_offset, 10 );
+            } else if (q1_Q10 < 0) {
+                q1_Q0 = -1;
+            } else {
+                q1_Q0 = 0;
+            }
+        }
 
         q1_Q10 = table[q1_Q0][0];
         q2_Q10 = table[q1_Q0][1];
@@ -519,7 +575,8 @@ static OPUS_INLINE void silk_noise_shape_quantizer_10_16_sse4_1(
         /* Update states */
         psLPC_Q14++;
         *psLPC_Q14 = xq_Q14;
-        sLF_AR_shp_Q14 = silk_SUB_LSHIFT32( xq_Q14, n_AR_Q12, 2 );
+        NSQ->sDiff_shp_Q14 = silk_SUB_LSHIFT32( xq_Q14, x_sc_Q10[ i ], 4 );
+        sLF_AR_shp_Q14 = silk_SUB_LSHIFT32( NSQ->sDiff_shp_Q14, n_AR_Q12, 2 );
 
         NSQ->sLTP_shp_Q14[ NSQ->sLTP_shp_buf_idx ] = silk_SUB_LSHIFT32( sLF_AR_shp_Q14, n_LF_Q12, 2 );
         sLTP_Q15[ NSQ->sLTP_buf_idx ] = silk_LSHIFT( LPC_exc_Q14, 1 );
@@ -538,8 +595,8 @@ static OPUS_INLINE void silk_noise_shape_quantizer_10_16_sse4_1(
     /* write back sAR2_Q14 */
     xmm_tempa = _mm_unpackhi_epi16( sAR2_Q14_lo_76543210, sAR2_Q14_hi_76543210 );
     xmm_tempb = _mm_unpacklo_epi16( sAR2_Q14_lo_76543210, sAR2_Q14_hi_76543210 );
-    _mm_storeu_si128( (__m128i *)(&NSQ->sAR2_Q14[ 4 ]), xmm_tempa );
-    _mm_storeu_si128( (__m128i *)(&NSQ->sAR2_Q14[ 0 ]), xmm_tempb );
+    _mm_storeu_si128( (__m128i *)(void*)(&NSQ->sAR2_Q14[ 4 ]), xmm_tempa );
+    _mm_storeu_si128( (__m128i *)(void*)(&NSQ->sAR2_Q14[ 0 ]), xmm_tempb );
 
     /* xq[ i ] = (opus_int16)silk_SAT16( silk_RSHIFT_ROUND( silk_SMULWW( psLPC_Q14[ i ], Gain_Q10 ), 8 ) ); */
     {
@@ -555,8 +612,8 @@ static OPUS_INLINE void silk_noise_shape_quantizer_10_16_sse4_1(
         /* process xq */
         for (i = 0; i < length - 7; i += 8)
         {
-            xmm_xq_Q14_3210 = _mm_loadu_si128( (__m128i *)(&(psLPC_Q14[ i + 0 ] ) ) );
-            xmm_xq_Q14_7654 = _mm_loadu_si128( (__m128i *)(&(psLPC_Q14[ i + 4 ] ) ) );
+            xmm_xq_Q14_3210 = _mm_loadu_si128( (__m128i *)(void*)(&(psLPC_Q14[ i + 0 ] ) ) );
+            xmm_xq_Q14_7654 = _mm_loadu_si128( (__m128i *)(void*)(&(psLPC_Q14[ i + 4 ] ) ) );
 
             /* equal shift right 4 bytes*/
             xmm_xq_Q14_x3x1 = _mm_shuffle_epi32( xmm_xq_Q14_3210, _MM_SHUFFLE( 0, 3, 2, 1 ) );
@@ -587,7 +644,7 @@ static OPUS_INLINE void silk_noise_shape_quantizer_10_16_sse4_1(
             xmm_xq_Q14_3210 = _mm_packs_epi32( xmm_xq_Q14_3210, xmm_xq_Q14_7654 );
 
             /* save to xq */
-            _mm_storeu_si128( (__m128i *)(&xq[ i ] ), xmm_xq_Q14_3210 );
+            _mm_storeu_si128( (__m128i *)(void*)(&xq[ i ] ), xmm_xq_Q14_3210 );
         }
     }
     for ( ; i < length; i++)
@@ -600,64 +657,54 @@ static OPUS_INLINE void silk_noise_shape_quantizer_10_16_sse4_1(
 }
 
 static OPUS_INLINE void silk_nsq_scale_states_sse4_1(
-    const silk_encoder_state *psEncC,           /* I    Encoder State                   */
-    silk_nsq_state      *NSQ,                   /* I/O  NSQ state                       */
-    const opus_int32    x_Q3[],                 /* I    input in Q3                     */
-    opus_int32          x_sc_Q10[],             /* O    input scaled with 1/Gain        */
-    const opus_int16    sLTP[],                 /* I    re-whitened LTP state in Q0     */
-    opus_int32          sLTP_Q15[],             /* O    LTP state matching scaled input */
-    opus_int            subfr,                  /* I    subframe number                 */
-    const opus_int      LTP_scale_Q14,          /* I                                    */
-    const opus_int32    Gains_Q16[ MAX_NB_SUBFR ], /* I                                 */
-    const opus_int      pitchL[ MAX_NB_SUBFR ], /* I    Pitch lag                       */
-    const opus_int      signal_type             /* I    Signal type                     */
+    const silk_encoder_state *psEncC,              /* I    Encoder State                   */
+    silk_nsq_state      *NSQ,                      /* I/O  NSQ state                       */
+    const opus_int16    x16[],                     /* I    input                           */
+    opus_int32          x_sc_Q10[],                /* O    input scaled with 1/Gain        */
+    const opus_int16    sLTP[],                    /* I    re-whitened LTP state in Q0     */
+    opus_int32          sLTP_Q15[],                /* O    LTP state matching scaled input */
+    opus_int            subfr,                     /* I    subframe number                 */
+    const opus_int      LTP_scale_Q14,             /* I                                    */
+    const opus_int32    Gains_Q16[ MAX_NB_SUBFR ], /* I                                    */
+    const opus_int      pitchL[ MAX_NB_SUBFR ],    /* I    Pitch lag                       */
+    const opus_int      signal_type                /* I    Signal type                     */
 )
 {
     opus_int   i, lag;
-    opus_int32 gain_adj_Q16, inv_gain_Q31, inv_gain_Q23;
-    __m128i xmm_inv_gain_Q23, xmm_x_Q3_x2x0, xmm_x_Q3_x3x1;
+    opus_int32 gain_adj_Q16, inv_gain_Q31, inv_gain_Q26;
+    __m128i xmm_inv_gain_Q26, xmm_x16_x2x0, xmm_x16_x3x1;
 
     lag          = pitchL[ subfr ];
     inv_gain_Q31 = silk_INVERSE32_varQ( silk_max( Gains_Q16[ subfr ], 1 ), 47 );
     silk_assert( inv_gain_Q31 != 0 );
 
-    /* Calculate gain adjustment factor */
-    if( Gains_Q16[ subfr ] != NSQ->prev_gain_Q16 ) {
-        gain_adj_Q16 =  silk_DIV32_varQ( NSQ->prev_gain_Q16, Gains_Q16[ subfr ], 16 );
-    } else {
-        gain_adj_Q16 = (opus_int32)1 << 16;
-    }
-
     /* Scale input */
-    inv_gain_Q23 = silk_RSHIFT_ROUND( inv_gain_Q31, 8 );
+    inv_gain_Q26 = silk_RSHIFT_ROUND( inv_gain_Q31, 5 );
 
-    /* prepare inv_gain_Q23 in packed 4 32-bits */
-    xmm_inv_gain_Q23 = _mm_set1_epi32(inv_gain_Q23);
+    /* prepare inv_gain_Q26 in packed 4 32-bits */
+    xmm_inv_gain_Q26 = _mm_set1_epi32(inv_gain_Q26);
 
     for( i = 0; i < psEncC->subfr_length - 3; i += 4 ) {
-        xmm_x_Q3_x2x0 = _mm_loadu_si128( (__m128i *)(&(x_Q3[ i ] ) ) );
+        xmm_x16_x2x0 = OP_CVTEPI16_EPI32_M64( &(x16[ i ] ) );
 
         /* equal shift right 4 bytes*/
-        xmm_x_Q3_x3x1 = _mm_shuffle_epi32( xmm_x_Q3_x2x0, _MM_SHUFFLE( 0, 3, 2, 1 ) );
+        xmm_x16_x3x1 = _mm_shuffle_epi32( xmm_x16_x2x0, _MM_SHUFFLE( 0, 3, 2, 1 ) );
 
-        xmm_x_Q3_x2x0 = _mm_mul_epi32( xmm_x_Q3_x2x0, xmm_inv_gain_Q23 );
-        xmm_x_Q3_x3x1 = _mm_mul_epi32( xmm_x_Q3_x3x1, xmm_inv_gain_Q23 );
+        xmm_x16_x2x0 = _mm_mul_epi32( xmm_x16_x2x0, xmm_inv_gain_Q26 );
+        xmm_x16_x3x1 = _mm_mul_epi32( xmm_x16_x3x1, xmm_inv_gain_Q26 );
 
-        xmm_x_Q3_x2x0 = _mm_srli_epi64( xmm_x_Q3_x2x0, 16 );
-        xmm_x_Q3_x3x1 = _mm_slli_epi64( xmm_x_Q3_x3x1, 16 );
+        xmm_x16_x2x0 = _mm_srli_epi64( xmm_x16_x2x0, 16 );
+        xmm_x16_x3x1 = _mm_slli_epi64( xmm_x16_x3x1, 16 );
 
-        xmm_x_Q3_x2x0 = _mm_blend_epi16( xmm_x_Q3_x2x0, xmm_x_Q3_x3x1, 0xCC );
+        xmm_x16_x2x0 = _mm_blend_epi16( xmm_x16_x2x0, xmm_x16_x3x1, 0xCC );
 
-        _mm_storeu_si128( (__m128i *)(&(x_sc_Q10[ i ] ) ), xmm_x_Q3_x2x0 );
+        _mm_storeu_si128( (__m128i *)(void*)(&(x_sc_Q10[ i ] ) ), xmm_x16_x2x0 );
     }
 
     for( ; i < psEncC->subfr_length; i++ ) {
-        x_sc_Q10[ i ] = silk_SMULWW( x_Q3[ i ], inv_gain_Q23 );
+        x_sc_Q10[ i ] = silk_SMULWW( x16[ i ], inv_gain_Q26 );
     }
 
-    /* Save inverse gain */
-    NSQ->prev_gain_Q16 = Gains_Q16[ subfr ];
-
     /* After rewhitening the LTP state is un-scaled, so scale with inv_gain_Q16 */
     if( NSQ->rewhite_flag ) {
         if( subfr == 0 ) {
@@ -671,16 +718,18 @@ static OPUS_INLINE void silk_nsq_scale_states_sse4_1(
     }
 
     /* Adjust for changing gain */
-    if( gain_adj_Q16 != (opus_int32)1 << 16 ) {
-        /* Scale long-term shaping state */
+    if( Gains_Q16[ subfr ] != NSQ->prev_gain_Q16 ) {
         __m128i xmm_gain_adj_Q16, xmm_sLTP_shp_Q14_x2x0, xmm_sLTP_shp_Q14_x3x1;
+        gain_adj_Q16 =  silk_DIV32_varQ( NSQ->prev_gain_Q16, Gains_Q16[ subfr ], 16 );
+
+        /* Scale long-term shaping state */
 
         /* prepare gain_adj_Q16 in packed 4 32-bits */
         xmm_gain_adj_Q16 = _mm_set1_epi32(gain_adj_Q16);
 
         for( i = NSQ->sLTP_shp_buf_idx - psEncC->ltp_mem_length; i < NSQ->sLTP_shp_buf_idx - 3; i += 4 )
         {
-            xmm_sLTP_shp_Q14_x2x0 = _mm_loadu_si128( (__m128i *)(&(NSQ->sLTP_shp_Q14[ i ] ) ) );
+            xmm_sLTP_shp_Q14_x2x0 = _mm_loadu_si128( (__m128i *)(void*)(&(NSQ->sLTP_shp_Q14[ i ] ) ) );
             /* equal shift right 4 bytes*/
             xmm_sLTP_shp_Q14_x3x1 = _mm_shuffle_epi32( xmm_sLTP_shp_Q14_x2x0, _MM_SHUFFLE( 0, 3, 2, 1 ) );
 
@@ -692,7 +741,7 @@ static OPUS_INLINE void silk_nsq_scale_states_sse4_1(
 
             xmm_sLTP_shp_Q14_x2x0 = _mm_blend_epi16( xmm_sLTP_shp_Q14_x2x0, xmm_sLTP_shp_Q14_x3x1, 0xCC );
 
-            _mm_storeu_si128( (__m128i *)(&(NSQ->sLTP_shp_Q14[ i ] ) ), xmm_sLTP_shp_Q14_x2x0 );
+            _mm_storeu_si128( (__m128i *)(void*)(&(NSQ->sLTP_shp_Q14[ i ] ) ), xmm_sLTP_shp_Q14_x2x0 );
         }
 
         for( ; i < NSQ->sLTP_shp_buf_idx; i++ ) {
@@ -707,6 +756,7 @@ static OPUS_INLINE void silk_nsq_scale_states_sse4_1(
         }
 
         NSQ->sLF_AR_shp_Q14 = silk_SMULWW( gain_adj_Q16, NSQ->sLF_AR_shp_Q14 );
+        NSQ->sDiff_shp_Q14 = silk_SMULWW( gain_adj_Q16, NSQ->sDiff_shp_Q14 );
 
         /* Scale short-term prediction and shaping states */
         for( i = 0; i < NSQ_LPC_BUF_LENGTH; i++ ) {
@@ -715,5 +765,8 @@ static OPUS_INLINE void silk_nsq_scale_states_sse4_1(
         for( i = 0; i < MAX_SHAPE_LPC_ORDER; i++ ) {
             NSQ->sAR2_Q14[ i ] = silk_SMULWW( gain_adj_Q16, NSQ->sAR2_Q14[ i ] );
         }
+
+        /* Save the gain (not its inverse) for the next gain-change check */
+        NSQ->prev_gain_Q16 = Gains_Q16[ subfr ];
     }
 }
diff --git a/opus/silk/x86/SigProc_FIX_sse.h b/opus/silk/x86/SigProc_FIX_sse.h
index 61efa8da..89a5ec88 100644
--- a/opus/silk/x86/SigProc_FIX_sse.h
+++ b/opus/silk/x86/SigProc_FIX_sse.h
@@ -26,13 +26,13 @@
 */
 
 #ifndef SIGPROC_FIX_SSE_H
-#define SIGPROC_FIX_SSE_H
+# define SIGPROC_FIX_SSE_H
 
-#ifdef HAVE_CONFIG_H
-#include "config.h"
-#endif
+# ifdef HAVE_CONFIG_H
+#  include "config.h"
+# endif
 
-#if defined(OPUS_X86_MAY_HAVE_SSE4_1)
+# if defined(OPUS_X86_MAY_HAVE_SSE4_1)
 void silk_burg_modified_sse4_1(
     opus_int32                  *res_nrg,           /* O    Residual energy                                             */
     opus_int                    *res_nrg_Q,         /* O    Residual energy Q value                                     */
@@ -45,11 +45,13 @@ void silk_burg_modified_sse4_1(
     int                         arch                /* I    Run-time architecture                                       */
 );
 
-#if defined(OPUS_X86_PRESUME_SSE4_1)
-#define silk_burg_modified(res_nrg, res_nrg_Q, A_Q16, x, minInvGain_Q30, subfr_length, nb_subfr, D, arch) \
-    ((void)(arch), silk_burg_modified_sse4_1(res_nrg, res_nrg_Q, A_Q16, x, minInvGain_Q30, subfr_length, nb_subfr, D, arch))
+#  if defined(OPUS_X86_PRESUME_SSE4_1)
+
+#   define OVERRIDE_silk_burg_modified
+#   define silk_burg_modified(res_nrg, res_nrg_Q, A_Q16, x, minInvGain_Q30, subfr_length, nb_subfr, D, arch) \
+       ((void)(arch), silk_burg_modified_sse4_1(res_nrg, res_nrg_Q, A_Q16, x, minInvGain_Q30, subfr_length, nb_subfr, D, arch))
 
-#else
+#  elif defined(OPUS_HAVE_RTCD)
 
 extern void (*const SILK_BURG_MODIFIED_IMPL[OPUS_ARCHMASK + 1])(
     opus_int32                  *res_nrg,           /* O    Residual energy                                             */
@@ -62,33 +64,36 @@ extern void (*const SILK_BURG_MODIFIED_IMPL[OPUS_ARCHMASK + 1])(
     const opus_int              D,                  /* I    Order                                                       */
     int                         arch                /* I    Run-time architecture                                       */);
 
-#  define silk_burg_modified(res_nrg, res_nrg_Q, A_Q16, x, minInvGain_Q30, subfr_length, nb_subfr, D, arch) \
-    ((*SILK_BURG_MODIFIED_IMPL[(arch) & OPUS_ARCHMASK])(res_nrg, res_nrg_Q, A_Q16, x, minInvGain_Q30, subfr_length, nb_subfr, D, arch))
+#   define OVERRIDE_silk_burg_modified
+#   define silk_burg_modified(res_nrg, res_nrg_Q, A_Q16, x, minInvGain_Q30, subfr_length, nb_subfr, D, arch) \
+     ((*SILK_BURG_MODIFIED_IMPL[(arch) & OPUS_ARCHMASK])(res_nrg, res_nrg_Q, A_Q16, x, minInvGain_Q30, subfr_length, nb_subfr, D, arch))
 
-#endif
+#  endif
 
-opus_int64 silk_inner_prod16_aligned_64_sse4_1(
+opus_int64 silk_inner_prod16_sse4_1(
     const opus_int16 *inVec1,
     const opus_int16 *inVec2,
     const opus_int   len
 );
 
 
-#if defined(OPUS_X86_PRESUME_SSE4_1)
+#  if defined(OPUS_X86_PRESUME_SSE4_1)
 
-#define silk_inner_prod16_aligned_64(inVec1, inVec2, len, arch) \
-    ((void)(arch),silk_inner_prod16_aligned_64_sse4_1(inVec1, inVec2, len))
+#   define OVERRIDE_silk_inner_prod16
+#   define silk_inner_prod16(inVec1, inVec2, len, arch) \
+       ((void)(arch),silk_inner_prod16_sse4_1(inVec1, inVec2, len))
 
-#else
+#  elif defined(OPUS_HAVE_RTCD)
 
-extern opus_int64 (*const SILK_INNER_PROD16_ALIGNED_64_IMPL[OPUS_ARCHMASK + 1])(
+extern opus_int64 (*const SILK_INNER_PROD16_IMPL[OPUS_ARCHMASK + 1])(
                     const opus_int16 *inVec1,
                     const opus_int16 *inVec2,
                     const opus_int   len);
 
-#  define silk_inner_prod16_aligned_64(inVec1, inVec2, len, arch) \
-    ((*SILK_INNER_PROD16_ALIGNED_64_IMPL[(arch) & OPUS_ARCHMASK])(inVec1, inVec2, len))
+#   define OVERRIDE_silk_inner_prod16
+#   define silk_inner_prod16(inVec1, inVec2, len, arch) \
+     ((*SILK_INNER_PROD16_IMPL[(arch) & OPUS_ARCHMASK])(inVec1, inVec2, len))
 
-#endif
-#endif
+#  endif
+# endif
 #endif
diff --git a/opus/silk/x86/VAD_sse4_1.c b/opus/silk/x86/VAD_sse4_1.c
index d02ddf4a..9e06bc79 100644
--- a/opus/silk/x86/VAD_sse4_1.c
+++ b/opus/silk/x86/VAD_sse4_1.c
@@ -1,5 +1,5 @@
-/* Copyright (c) 2014, Cisco Systems, INC
-   Written by XiangMingZhu WeiZhou MinPeng YanWang
+/* Copyright (c) 2014-2020, Cisco Systems, INC
+   Written by XiangMingZhu WeiZhou MinPeng YanWang FrancisQuiers
 
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions
@@ -63,6 +63,14 @@ opus_int silk_VAD_GetSA_Q8_sse4_1(                  /* O    Return value, 0 if s
 
     SAVE_STACK;
 
+#ifdef OPUS_CHECK_ASM
+    silk_encoder_state psEncC_c;
+    opus_int ret_c;
+
+    silk_memcpy( &psEncC_c, psEncC, sizeof( psEncC_c ) );
+    ret_c = silk_VAD_GetSA_Q8_c( &psEncC_c, pIn );
+#endif
+
     /* Safety checks */
     silk_assert( VAD_N_BANDS == 4 );
     celt_assert( MAX_FRAME_LENGTH >= psEncC->frame_length );
@@ -136,7 +144,7 @@ opus_int silk_VAD_GetSA_Q8_sse4_1(                  /* O    Return value, 0 if s
 
             for( i = 0; i < dec_subframe_length - 7; i += 8 )
             {
-                xmm_X   = _mm_loadu_si128( (__m128i *)&(X[ X_offset[ b ] + i + dec_subframe_offset ] ) );
+                xmm_X   = _mm_loadu_si128( (__m128i *)(void*)&(X[ X_offset[ b ] + i + dec_subframe_offset ] ) );
                 xmm_X   = _mm_srai_epi16( xmm_X, 3 );
                 xmm_X   = _mm_madd_epi16( xmm_X, xmm_X );
                 xmm_acc = _mm_add_epi32( xmm_acc, xmm_X );
@@ -233,15 +241,14 @@ opus_int silk_VAD_GetSA_Q8_sse4_1(                  /* O    Return value, 0 if s
         speech_nrg += ( b + 1 ) * silk_RSHIFT( Xnrg[ b ] - psSilk_VAD->NL[ b ], 4 );
     }
 
+    if( psEncC->frame_length == 20 * psEncC->fs_kHz ) {
+        speech_nrg = silk_RSHIFT32( speech_nrg, 1 );
+    }
     /* Power scaling */
     if( speech_nrg <= 0 ) {
         SA_Q15 = silk_RSHIFT( SA_Q15, 1 );
-    } else if( speech_nrg < 32768 ) {
-        if( psEncC->frame_length == 10 * psEncC->fs_kHz ) {
-            speech_nrg = silk_LSHIFT_SAT32( speech_nrg, 16 );
-        } else {
-            speech_nrg = silk_LSHIFT_SAT32( speech_nrg, 15 );
-        }
+    } else if( speech_nrg < 16384 ) {
+        speech_nrg = silk_LSHIFT32( speech_nrg, 16 );
 
         /* square-root */
         speech_nrg = silk_SQRT_APPROX( speech_nrg );
@@ -272,6 +279,11 @@ opus_int silk_VAD_GetSA_Q8_sse4_1(                  /* O    Return value, 0 if s
         psEncC->input_quality_bands_Q15[ b ] = silk_sigm_Q15( silk_RSHIFT( SNR_Q7 - 16 * 128, 4 ) );
     }
 
+#ifdef OPUS_CHECK_ASM
+    silk_assert( ret == ret_c );
+    silk_assert( !memcmp( &psEncC_c, psEncC, sizeof( psEncC_c ) ) );
+#endif
+
     RESTORE_STACK;
     return( ret );
 }
diff --git a/opus/silk/x86/VQ_WMat_EC_sse4_1.c b/opus/silk/x86/VQ_WMat_EC_sse4_1.c
index 74d6c6d0..df4626b6 100644
--- a/opus/silk/x86/VQ_WMat_EC_sse4_1.c
+++ b/opus/silk/x86/VQ_WMat_EC_sse4_1.c
@@ -1,5 +1,5 @@
-/* Copyright (c) 2014, Cisco Systems, INC
-   Written by XiangMingZhu WeiZhou MinPeng YanWang
+/* Copyright (c) 2014-2020, Cisco Systems, INC
+   Written by XiangMingZhu WeiZhou MinPeng YanWang FrancisQuiers
 
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions
@@ -38,105 +38,136 @@
 /* Entropy constrained matrix-weighted VQ, hard-coded to 5-element vectors, for a single input data vector */
 void silk_VQ_WMat_EC_sse4_1(
     opus_int8                   *ind,                           /* O    index of best codebook vector               */
-    opus_int32                  *rate_dist_Q14,                 /* O    best weighted quant error + mu * rate       */
+    opus_int32                  *res_nrg_Q15,                   /* O    best residual energy                        */
+    opus_int32                  *rate_dist_Q8,                  /* O    best total bitrate                          */
     opus_int                    *gain_Q7,                       /* O    sum of absolute LTP coefficients            */
-    const opus_int16            *in_Q14,                        /* I    input vector to be quantized                */
-    const opus_int32            *W_Q18,                         /* I    weighting matrix                            */
+    const opus_int32            *XX_Q17,                        /* I    correlation matrix                          */
+    const opus_int32            *xX_Q17,                        /* I    correlation vector                          */
     const opus_int8             *cb_Q7,                         /* I    codebook                                    */
     const opus_uint8            *cb_gain_Q7,                    /* I    codebook effective gain                     */
     const opus_uint8            *cl_Q5,                         /* I    code length for each codebook vector        */
-    const opus_int              mu_Q9,                          /* I    tradeoff betw. weighted error and rate      */
+    const opus_int              subfr_len,                      /* I    number of samples per subframe              */
     const opus_int32            max_gain_Q7,                    /* I    maximum sum of absolute LTP coefficients    */
-    opus_int                    L                               /* I    number of vectors in codebook               */
+    const opus_int              L                               /* I    number of vectors in codebook               */
 )
 {
     opus_int   k, gain_tmp_Q7;
     const opus_int8 *cb_row_Q7;
-    opus_int16 diff_Q14[ 5 ];
-    opus_int32 sum1_Q14, sum2_Q16;
+    opus_int32 neg_xX_Q24[ 5 ];
+    opus_int32 sum1_Q15, sum2_Q24;
+    opus_int32 bits_res_Q8, bits_tot_Q8;
+    __m128i v_XX_31_Q17, v_XX_42_Q17, v_cb_row_31_Q7, v_cb_row_42_Q7, v_acc1_Q24, v_acc2_Q24;
+
+    /* Negate and convert to new Q domain */
+    neg_xX_Q24[ 0 ] = -silk_LSHIFT32( xX_Q17[ 0 ], 7 );
+    neg_xX_Q24[ 1 ] = -silk_LSHIFT32( xX_Q17[ 1 ], 7 );
+    neg_xX_Q24[ 2 ] = -silk_LSHIFT32( xX_Q17[ 2 ], 7 );
+    neg_xX_Q24[ 3 ] = -silk_LSHIFT32( xX_Q17[ 3 ], 7 );
+    neg_xX_Q24[ 4 ] = -silk_LSHIFT32( xX_Q17[ 4 ], 7 );
+
+    v_XX_31_Q17 = _mm_loadu_si128( (__m128i *)(void*)(&XX_Q17[ 1 ] ) );
+    v_XX_42_Q17 = _mm_shuffle_epi32( v_XX_31_Q17, _MM_SHUFFLE( 0, 3, 2, 1 ) );
 
-    __m128i C_tmp1, C_tmp2, C_tmp3, C_tmp4, C_tmp5;
     /* Loop over codebook */
-    *rate_dist_Q14 = silk_int32_MAX;
+    *rate_dist_Q8 = silk_int32_MAX;
+    *res_nrg_Q15 = silk_int32_MAX;
     cb_row_Q7 = cb_Q7;
+    /* If things go really bad, at least *ind is set to something safe. */
+    *ind = 0;
     for( k = 0; k < L; k++ ) {
+        opus_int32 penalty;
         gain_tmp_Q7 = cb_gain_Q7[k];
-
-        diff_Q14[ 0 ] = in_Q14[ 0 ] - silk_LSHIFT( cb_row_Q7[ 0 ], 7 );
-
-        C_tmp1 = OP_CVTEPI16_EPI32_M64( &in_Q14[ 1 ] );
-        C_tmp2 = OP_CVTEPI8_EPI32_M32( &cb_row_Q7[ 1 ] );
-        C_tmp2 = _mm_slli_epi32( C_tmp2, 7 );
-        C_tmp1 = _mm_sub_epi32( C_tmp1, C_tmp2 );
-
-        diff_Q14[ 1 ] = _mm_extract_epi16( C_tmp1, 0 );
-        diff_Q14[ 2 ] = _mm_extract_epi16( C_tmp1, 2 );
-        diff_Q14[ 3 ] = _mm_extract_epi16( C_tmp1, 4 );
-        diff_Q14[ 4 ] = _mm_extract_epi16( C_tmp1, 6 );
-
         /* Weighted rate */
-        sum1_Q14 = silk_SMULBB( mu_Q9, cl_Q5[ k ] );
+        /* Quantization error: 1 - 2 * xX * cb + cb' * XX * cb */
+        sum1_Q15 = SILK_FIX_CONST( 1.001, 15 );
 
         /* Penalty for too large gain */
-        sum1_Q14 = silk_ADD_LSHIFT32( sum1_Q14, silk_max( silk_SUB32( gain_tmp_Q7, max_gain_Q7 ), 0 ), 10 );
-
-        silk_assert( sum1_Q14 >= 0 );
-
-        /* first row of W_Q18 */
-        C_tmp3 = _mm_loadu_si128( (__m128i *)(&W_Q18[ 1 ] ) );
-        C_tmp4 = _mm_mul_epi32( C_tmp3, C_tmp1 );
-        C_tmp4 = _mm_srli_si128( C_tmp4, 2 );
-
-        C_tmp1 = _mm_shuffle_epi32( C_tmp1, _MM_SHUFFLE( 0, 3, 2, 1 ) ); /* shift right 4 bytes */
-        C_tmp3 = _mm_shuffle_epi32( C_tmp3, _MM_SHUFFLE( 0, 3, 2, 1 ) ); /* shift right 4 bytes */
-
-        C_tmp5 = _mm_mul_epi32( C_tmp3, C_tmp1 );
-        C_tmp5 = _mm_srli_si128( C_tmp5, 2 );
-
-        C_tmp5 = _mm_add_epi32( C_tmp4, C_tmp5 );
-        C_tmp5 = _mm_slli_epi32( C_tmp5, 1 );
-
-        C_tmp5 = _mm_add_epi32( C_tmp5, _mm_shuffle_epi32( C_tmp5, _MM_SHUFFLE( 0, 0, 0, 2 ) ) );
-        sum2_Q16 = _mm_cvtsi128_si32( C_tmp5 );
-
-        sum2_Q16 = silk_SMLAWB( sum2_Q16, W_Q18[  0 ], diff_Q14[ 0 ] );
-        sum1_Q14 = silk_SMLAWB( sum1_Q14, sum2_Q16,    diff_Q14[ 0 ] );
-
-        /* second row of W_Q18 */
-        sum2_Q16 = silk_SMULWB(           W_Q18[  7 ], diff_Q14[ 2 ] );
-        sum2_Q16 = silk_SMLAWB( sum2_Q16, W_Q18[  8 ], diff_Q14[ 3 ] );
-        sum2_Q16 = silk_SMLAWB( sum2_Q16, W_Q18[  9 ], diff_Q14[ 4 ] );
-        sum2_Q16 = silk_LSHIFT( sum2_Q16, 1 );
-        sum2_Q16 = silk_SMLAWB( sum2_Q16, W_Q18[  6 ], diff_Q14[ 1 ] );
-        sum1_Q14 = silk_SMLAWB( sum1_Q14, sum2_Q16,    diff_Q14[ 1 ] );
-
-        /* third row of W_Q18 */
-        sum2_Q16 = silk_SMULWB(           W_Q18[ 13 ], diff_Q14[ 3 ] );
-        sum2_Q16 = silk_SMLAWB( sum2_Q16, W_Q18[ 14 ], diff_Q14[ 4 ] );
-        sum2_Q16 = silk_LSHIFT( sum2_Q16, 1 );
-        sum2_Q16 = silk_SMLAWB( sum2_Q16, W_Q18[ 12 ], diff_Q14[ 2 ] );
-        sum1_Q14 = silk_SMLAWB( sum1_Q14, sum2_Q16,    diff_Q14[ 2 ] );
-
-        /* fourth row of W_Q18 */
-        sum2_Q16 = silk_SMULWB(           W_Q18[ 19 ], diff_Q14[ 4 ] );
-        sum2_Q16 = silk_LSHIFT( sum2_Q16, 1 );
-        sum2_Q16 = silk_SMLAWB( sum2_Q16, W_Q18[ 18 ], diff_Q14[ 3 ] );
-        sum1_Q14 = silk_SMLAWB( sum1_Q14, sum2_Q16,    diff_Q14[ 3 ] );
-
-        /* last row of W_Q18 */
-        sum2_Q16 = silk_SMULWB(           W_Q18[ 24 ], diff_Q14[ 4 ] );
-        sum1_Q14 = silk_SMLAWB( sum1_Q14, sum2_Q16,    diff_Q14[ 4 ] );
-
-        silk_assert( sum1_Q14 >= 0 );
+        penalty = silk_LSHIFT32( silk_max( silk_SUB32( gain_tmp_Q7, max_gain_Q7 ), 0 ), 11 );
+
+        /* first row of XX_Q17 */
+        v_cb_row_31_Q7 = OP_CVTEPI8_EPI32_M32( &cb_row_Q7[ 1 ] );
+        v_cb_row_42_Q7 = _mm_shuffle_epi32( v_cb_row_31_Q7, _MM_SHUFFLE( 0, 3, 2, 1 ) );
+        v_cb_row_31_Q7 = _mm_mul_epi32( v_XX_31_Q17, v_cb_row_31_Q7 );
+        v_cb_row_42_Q7 = _mm_mul_epi32( v_XX_42_Q17, v_cb_row_42_Q7 );
+        v_acc1_Q24 = _mm_add_epi64( v_cb_row_31_Q7, v_cb_row_42_Q7);
+        v_acc2_Q24 = _mm_shuffle_epi32( v_acc1_Q24, _MM_SHUFFLE( 1, 0, 3, 2 ) );
+        v_acc1_Q24 = _mm_add_epi64( v_acc1_Q24, v_acc2_Q24);
+        sum2_Q24 = _mm_cvtsi128_si32( v_acc1_Q24 );
+        sum2_Q24 = silk_ADD32( neg_xX_Q24[ 0 ], sum2_Q24 );
+        sum2_Q24 = silk_LSHIFT32( sum2_Q24, 1 );
+        sum2_Q24 = silk_MLA( sum2_Q24,        XX_Q17[  0 ], cb_row_Q7[ 0 ] );
+        sum1_Q15 = silk_SMLAWB( sum1_Q15,        sum2_Q24,  cb_row_Q7[ 0 ] );
+
+        /* second row of XX_Q17 */
+        sum2_Q24 = silk_MLA( neg_xX_Q24[ 1 ], XX_Q17[  7 ], cb_row_Q7[ 2 ] );
+        sum2_Q24 = silk_MLA( sum2_Q24,        XX_Q17[  8 ], cb_row_Q7[ 3 ] );
+        sum2_Q24 = silk_MLA( sum2_Q24,        XX_Q17[  9 ], cb_row_Q7[ 4 ] );
+        sum2_Q24 = silk_LSHIFT32( sum2_Q24, 1 );
+        sum2_Q24 = silk_MLA( sum2_Q24,        XX_Q17[  6 ], cb_row_Q7[ 1 ] );
+        sum1_Q15 = silk_SMLAWB( sum1_Q15,        sum2_Q24,  cb_row_Q7[ 1 ] );
+
+        /* third row of XX_Q17 */
+        sum2_Q24 = silk_MLA( neg_xX_Q24[ 2 ], XX_Q17[ 13 ], cb_row_Q7[ 3 ] );
+        sum2_Q24 = silk_MLA( sum2_Q24,        XX_Q17[ 14 ], cb_row_Q7[ 4 ] );
+        sum2_Q24 = silk_LSHIFT32( sum2_Q24, 1 );
+        sum2_Q24 = silk_MLA( sum2_Q24,        XX_Q17[ 12 ], cb_row_Q7[ 2 ] );
+        sum1_Q15 = silk_SMLAWB( sum1_Q15,        sum2_Q24,  cb_row_Q7[ 2 ] );
+
+        /* fourth row of XX_Q17 */
+        sum2_Q24 = silk_MLA( neg_xX_Q24[ 3 ], XX_Q17[ 19 ], cb_row_Q7[ 4 ] );
+        sum2_Q24 = silk_LSHIFT32( sum2_Q24, 1 );
+        sum2_Q24 = silk_MLA( sum2_Q24,        XX_Q17[ 18 ], cb_row_Q7[ 3 ] );
+        sum1_Q15 = silk_SMLAWB( sum1_Q15,        sum2_Q24,  cb_row_Q7[ 3 ] );
+
+        /* last row of XX_Q17 */
+        sum2_Q24 = silk_LSHIFT32( neg_xX_Q24[ 4 ], 1 );
+        sum2_Q24 = silk_MLA( sum2_Q24,        XX_Q17[ 24 ], cb_row_Q7[ 4 ] );
+        sum1_Q15 = silk_SMLAWB( sum1_Q15,        sum2_Q24,  cb_row_Q7[ 4 ] );
 
         /* find best */
-        if( sum1_Q14 < *rate_dist_Q14 ) {
-            *rate_dist_Q14 = sum1_Q14;
-            *ind = (opus_int8)k;
-            *gain_Q7 = gain_tmp_Q7;
+        if( sum1_Q15 >= 0 ) {
+            /* Translate residual energy to bits using high-rate assumption (6 dB ==> 1 bit/sample) */
+            bits_res_Q8 = silk_SMULBB( subfr_len, silk_lin2log( sum1_Q15 + penalty) - (15 << 7) );
+            /* In the following line we reduce the codelength component by half ("-1"); seems to slightly improve quality */
+            bits_tot_Q8 = silk_ADD_LSHIFT32( bits_res_Q8, cl_Q5[ k ], 3-1 );
+            if( bits_tot_Q8 <= *rate_dist_Q8 ) {
+                *rate_dist_Q8 = bits_tot_Q8;
+                *res_nrg_Q15 = sum1_Q15 + penalty;
+                *ind = (opus_int8)k;
+                *gain_Q7 = gain_tmp_Q7;
+            }
         }
 
         /* Go to next cbk vector */
         cb_row_Q7 += LTP_ORDER;
     }
+
+#ifdef OPUS_CHECK_ASM
+    {
+        opus_int8  ind_c = 0;
+        opus_int32 res_nrg_Q15_c = 0;
+        opus_int32 rate_dist_Q8_c = 0;
+        opus_int   gain_Q7_c = 0;
+
+        silk_VQ_WMat_EC_c(
+            &ind_c,
+            &res_nrg_Q15_c,
+            &rate_dist_Q8_c,
+            &gain_Q7_c,
+            XX_Q17,
+            xX_Q17,
+            cb_Q7,
+            cb_gain_Q7,
+            cl_Q5,
+            subfr_len,
+            max_gain_Q7,
+            L
+        );
+
+        silk_assert( *ind == ind_c );
+        silk_assert( *res_nrg_Q15 == res_nrg_Q15_c );
+        silk_assert( *rate_dist_Q8 == rate_dist_Q8_c );
+        silk_assert( *gain_Q7 == gain_Q7_c );
+    }
+#endif
 }
diff --git a/opus/silk/x86/main_sse.h b/opus/silk/x86/main_sse.h
index 2f15d448..b254d53e 100644
--- a/opus/silk/x86/main_sse.h
+++ b/opus/silk/x86/main_sse.h
@@ -26,171 +26,195 @@
 */
 
 #ifndef MAIN_SSE_H
-#define MAIN_SSE_H
+# define MAIN_SSE_H
 
-#ifdef HAVE_CONFIG_H
-#include "config.h"
-#endif
+# ifdef HAVE_CONFIG_H
+#  include "config.h"
+# endif
 
 # if defined(OPUS_X86_MAY_HAVE_SSE4_1)
 
-#if 0 /* FIXME: SSE disabled until silk_VQ_WMat_EC_sse4_1() gets updated. */
-#  define OVERRIDE_silk_VQ_WMat_EC
-
 void silk_VQ_WMat_EC_sse4_1(
     opus_int8                   *ind,                           /* O    index of best codebook vector               */
-    opus_int32                  *rate_dist_Q14,                 /* O    best weighted quant error + mu * rate       */
+    opus_int32                  *res_nrg_Q15,                   /* O    best residual energy                        */
+    opus_int32                  *rate_dist_Q8,                  /* O    best total bitrate                          */
     opus_int                    *gain_Q7,                       /* O    sum of absolute LTP coefficients            */
-    const opus_int16            *in_Q14,                        /* I    input vector to be quantized                */
-    const opus_int32            *W_Q18,                         /* I    weighting matrix                            */
+    const opus_int32            *XX_Q17,                        /* I    correlation matrix                          */
+    const opus_int32            *xX_Q17,                        /* I    correlation vector                          */
     const opus_int8             *cb_Q7,                         /* I    codebook                                    */
     const opus_uint8            *cb_gain_Q7,                    /* I    codebook effective gain                     */
     const opus_uint8            *cl_Q5,                         /* I    code length for each codebook vector        */
-    const opus_int              mu_Q9,                          /* I    tradeoff betw. weighted error and rate      */
+    const opus_int              subfr_len,                      /* I    number of samples per subframe              */
     const opus_int32            max_gain_Q7,                    /* I    maximum sum of absolute LTP coefficients    */
-    opus_int                    L                               /* I    number of vectors in codebook               */
+    const opus_int              L                               /* I    number of vectors in codebook               */
 );
 
-#if defined OPUS_X86_PRESUME_SSE4_1
+#  if defined OPUS_X86_PRESUME_SSE4_1
 
-#define silk_VQ_WMat_EC(ind, rate_dist_Q14, gain_Q7, in_Q14, W_Q18, cb_Q7, cb_gain_Q7, cl_Q5, \
-                          mu_Q9, max_gain_Q7, L, arch) \
-    ((void)(arch),silk_VQ_WMat_EC_sse4_1(ind, rate_dist_Q14, gain_Q7, in_Q14, W_Q18, cb_Q7, cb_gain_Q7, cl_Q5, \
-                          mu_Q9, max_gain_Q7, L))
+#   define OVERRIDE_silk_VQ_WMat_EC
+#   define silk_VQ_WMat_EC(ind, res_nrg_Q15, rate_dist_Q8, gain_Q7, XX_Q17, xX_Q17, cb_Q7, cb_gain_Q7, cl_Q5, \
+                           subfr_len, max_gain_Q7, L, arch) \
+    ((void)(arch),silk_VQ_WMat_EC_sse4_1(ind, res_nrg_Q15, rate_dist_Q8, gain_Q7, XX_Q17, xX_Q17, cb_Q7, cb_gain_Q7, cl_Q5, \
+                          subfr_len, max_gain_Q7, L))
 
-#else
+#  elif defined(OPUS_HAVE_RTCD)
 
 extern void (*const SILK_VQ_WMAT_EC_IMPL[OPUS_ARCHMASK + 1])(
     opus_int8                   *ind,                           /* O    index of best codebook vector               */
-    opus_int32                  *rate_dist_Q14,                 /* O    best weighted quant error + mu * rate       */
+    opus_int32                  *res_nrg_Q15,                   /* O    best residual energy                        */
+    opus_int32                  *rate_dist_Q8,                  /* O    best total bitrate                          */
     opus_int                    *gain_Q7,                       /* O    sum of absolute LTP coefficients            */
-    const opus_int16            *in_Q14,                        /* I    input vector to be quantized                */
-    const opus_int32            *W_Q18,                         /* I    weighting matrix                            */
+    const opus_int32            *XX_Q17,                        /* I    correlation matrix                          */
+    const opus_int32            *xX_Q17,                        /* I    correlation vector                          */
     const opus_int8             *cb_Q7,                         /* I    codebook                                    */
     const opus_uint8            *cb_gain_Q7,                    /* I    codebook effective gain                     */
     const opus_uint8            *cl_Q5,                         /* I    code length for each codebook vector        */
-    const opus_int              mu_Q9,                          /* I    tradeoff betw. weighted error and rate      */
+    const opus_int              subfr_len,                      /* I    number of samples per subframe              */
     const opus_int32            max_gain_Q7,                    /* I    maximum sum of absolute LTP coefficients    */
-    opus_int                    L                               /* I    number of vectors in codebook               */
+    const opus_int              L                               /* I    number of vectors in codebook               */
 );
 
-#  define silk_VQ_WMat_EC(ind, rate_dist_Q14, gain_Q7, in_Q14, W_Q18, cb_Q7, cb_gain_Q7, cl_Q5, \
-                          mu_Q9, max_gain_Q7, L, arch) \
-    ((*SILK_VQ_WMAT_EC_IMPL[(arch) & OPUS_ARCHMASK])(ind, rate_dist_Q14, gain_Q7, in_Q14, W_Q18, cb_Q7, cb_gain_Q7, cl_Q5, \
-                          mu_Q9, max_gain_Q7, L))
+#   define OVERRIDE_silk_VQ_WMat_EC
+#   define silk_VQ_WMat_EC(ind, res_nrg_Q15, rate_dist_Q8, gain_Q7, XX_Q17, xX_Q17, cb_Q7, cb_gain_Q7, cl_Q5, \
+                           subfr_len, max_gain_Q7, L, arch) \
+    ((*SILK_VQ_WMAT_EC_IMPL[(arch) & OPUS_ARCHMASK])(ind, res_nrg_Q15, rate_dist_Q8, gain_Q7, XX_Q17, xX_Q17, cb_Q7, cb_gain_Q7, cl_Q5, \
+                          subfr_len, max_gain_Q7, L))
 
-#endif
-#endif
-
-#if 0 /* FIXME: SSE disabled until the NSQ code gets updated. */
-#  define OVERRIDE_silk_NSQ
+#  endif
 
 void silk_NSQ_sse4_1(
-    const silk_encoder_state    *psEncC,                                    /* I    Encoder State                   */
-    silk_nsq_state              *NSQ,                                       /* I/O  NSQ state                       */
-    SideInfoIndices             *psIndices,                                 /* I/O  Quantization Indices            */
-    const opus_int32            x_Q3[],                                     /* I    Prefiltered input signal        */
-    opus_int8                   pulses[],                                   /* O    Quantized pulse signal          */
-    const opus_int16            PredCoef_Q12[ 2 * MAX_LPC_ORDER ],          /* I    Short term prediction coefs     */
-    const opus_int16            LTPCoef_Q14[ LTP_ORDER * MAX_NB_SUBFR ],    /* I    Long term prediction coefs      */
-    const opus_int16            AR2_Q13[ MAX_NB_SUBFR * MAX_SHAPE_LPC_ORDER ], /* I Noise shaping coefs             */
-    const opus_int              HarmShapeGain_Q14[ MAX_NB_SUBFR ],          /* I    Long term shaping coefs         */
-    const opus_int              Tilt_Q14[ MAX_NB_SUBFR ],                   /* I    Spectral tilt                   */
-    const opus_int32            LF_shp_Q14[ MAX_NB_SUBFR ],                 /* I    Low frequency shaping coefs     */
-    const opus_int32            Gains_Q16[ MAX_NB_SUBFR ],                  /* I    Quantization step sizes         */
-    const opus_int              pitchL[ MAX_NB_SUBFR ],                     /* I    Pitch lags                      */
-    const opus_int              Lambda_Q10,                                 /* I    Rate/distortion tradeoff        */
-    const opus_int              LTP_scale_Q14                               /* I    LTP state scaling               */
+    const silk_encoder_state    *psEncC,                                      /* I    Encoder State                   */
+    silk_nsq_state              *NSQ,                                         /* I/O  NSQ state                       */
+    SideInfoIndices             *psIndices,                                   /* I/O  Quantization Indices            */
+    const opus_int16            x16[],                                        /* I    Input                           */
+    opus_int8                   pulses[],                                     /* O    Quantized pulse signal          */
+    const opus_int16            *PredCoef_Q12,                                /* I    Short term prediction coefs     */
+    const opus_int16            LTPCoef_Q14[ LTP_ORDER * MAX_NB_SUBFR ],      /* I    Long term prediction coefs      */
+    const opus_int16            AR_Q13[ MAX_NB_SUBFR * MAX_SHAPE_LPC_ORDER ], /* I    Noise shaping coefs             */
+    const opus_int              HarmShapeGain_Q14[ MAX_NB_SUBFR ],            /* I    Long term shaping coefs         */
+    const opus_int              Tilt_Q14[ MAX_NB_SUBFR ],                     /* I    Spectral tilt                   */
+    const opus_int32            LF_shp_Q14[ MAX_NB_SUBFR ],                   /* I    Low frequency shaping coefs     */
+    const opus_int32            Gains_Q16[ MAX_NB_SUBFR ],                    /* I    Quantization step sizes         */
+    const opus_int              pitchL[ MAX_NB_SUBFR ],                       /* I    Pitch lags                      */
+    const opus_int              Lambda_Q10,                                   /* I    Rate/distortion tradeoff        */
+    const opus_int              LTP_scale_Q14                                 /* I    LTP state scaling               */
 );
 
-#if defined OPUS_X86_PRESUME_SSE4_1
+#  if defined OPUS_X86_PRESUME_SSE4_1
 
-#define silk_NSQ(psEncC, NSQ, psIndices, x_Q3, pulses, PredCoef_Q12, LTPCoef_Q14, AR2_Q13, \
-                   HarmShapeGain_Q14, Tilt_Q14, LF_shp_Q14, Gains_Q16, pitchL, Lambda_Q10, LTP_scale_Q14, arch) \
+#   define OVERRIDE_silk_NSQ
+#   define silk_NSQ(psEncC, NSQ, psIndices, x_Q3, pulses, PredCoef_Q12, LTPCoef_Q14, AR2_Q13, \
+                    HarmShapeGain_Q14, Tilt_Q14, LF_shp_Q14, Gains_Q16, pitchL, Lambda_Q10, LTP_scale_Q14, arch) \
     ((void)(arch),silk_NSQ_sse4_1(psEncC, NSQ, psIndices, x_Q3, pulses, PredCoef_Q12, LTPCoef_Q14, AR2_Q13, \
                    HarmShapeGain_Q14, Tilt_Q14, LF_shp_Q14, Gains_Q16, pitchL, Lambda_Q10, LTP_scale_Q14))
 
-#else
+#  elif defined(OPUS_HAVE_RTCD)
 
 extern void (*const SILK_NSQ_IMPL[OPUS_ARCHMASK + 1])(
-    const silk_encoder_state    *psEncC,                                    /* I    Encoder State                   */
-    silk_nsq_state              *NSQ,                                       /* I/O  NSQ state                       */
-    SideInfoIndices             *psIndices,                                 /* I/O  Quantization Indices            */
-    const opus_int32            x_Q3[],                                     /* I    Prefiltered input signal        */
-    opus_int8                   pulses[],                                   /* O    Quantized pulse signal          */
-    const opus_int16            PredCoef_Q12[ 2 * MAX_LPC_ORDER ],          /* I    Short term prediction coefs     */
-    const opus_int16            LTPCoef_Q14[ LTP_ORDER * MAX_NB_SUBFR ],    /* I    Long term prediction coefs      */
-    const opus_int16            AR2_Q13[ MAX_NB_SUBFR * MAX_SHAPE_LPC_ORDER ], /* I Noise shaping coefs             */
-    const opus_int              HarmShapeGain_Q14[ MAX_NB_SUBFR ],          /* I    Long term shaping coefs         */
-    const opus_int              Tilt_Q14[ MAX_NB_SUBFR ],                   /* I    Spectral tilt                   */
-    const opus_int32            LF_shp_Q14[ MAX_NB_SUBFR ],                 /* I    Low frequency shaping coefs     */
-    const opus_int32            Gains_Q16[ MAX_NB_SUBFR ],                  /* I    Quantization step sizes         */
-    const opus_int              pitchL[ MAX_NB_SUBFR ],                     /* I    Pitch lags                      */
-    const opus_int              Lambda_Q10,                                 /* I    Rate/distortion tradeoff        */
-    const opus_int              LTP_scale_Q14                               /* I    LTP state scaling               */
+    const silk_encoder_state    *psEncC,                                      /* I    Encoder State                   */
+    silk_nsq_state              *NSQ,                                         /* I/O  NSQ state                       */
+    SideInfoIndices             *psIndices,                                   /* I/O  Quantization Indices            */
+    const opus_int16            x16[],                                        /* I    Input                           */
+    opus_int8                   pulses[],                                     /* O    Quantized pulse signal          */
+    const opus_int16            *PredCoef_Q12,                                /* I    Short term prediction coefs     */
+    const opus_int16            LTPCoef_Q14[ LTP_ORDER * MAX_NB_SUBFR ],      /* I    Long term prediction coefs      */
+    const opus_int16            AR_Q13[ MAX_NB_SUBFR * MAX_SHAPE_LPC_ORDER ], /* I    Noise shaping coefs             */
+    const opus_int              HarmShapeGain_Q14[ MAX_NB_SUBFR ],            /* I    Long term shaping coefs         */
+    const opus_int              Tilt_Q14[ MAX_NB_SUBFR ],                     /* I    Spectral tilt                   */
+    const opus_int32            LF_shp_Q14[ MAX_NB_SUBFR ],                   /* I    Low frequency shaping coefs     */
+    const opus_int32            Gains_Q16[ MAX_NB_SUBFR ],                    /* I    Quantization step sizes         */
+    const opus_int              pitchL[ MAX_NB_SUBFR ],                       /* I    Pitch lags                      */
+    const opus_int              Lambda_Q10,                                   /* I    Rate/distortion tradeoff        */
+    const opus_int              LTP_scale_Q14                                 /* I    LTP state scaling               */
 );
 
-#  define silk_NSQ(psEncC, NSQ, psIndices, x_Q3, pulses, PredCoef_Q12, LTPCoef_Q14, AR2_Q13, \
-                   HarmShapeGain_Q14, Tilt_Q14, LF_shp_Q14, Gains_Q16, pitchL, Lambda_Q10, LTP_scale_Q14, arch) \
+#   define OVERRIDE_silk_NSQ
+#   define silk_NSQ(psEncC, NSQ, psIndices, x_Q3, pulses, PredCoef_Q12, LTPCoef_Q14, AR2_Q13, \
+                    HarmShapeGain_Q14, Tilt_Q14, LF_shp_Q14, Gains_Q16, pitchL, Lambda_Q10, LTP_scale_Q14, arch) \
     ((*SILK_NSQ_IMPL[(arch) & OPUS_ARCHMASK])(psEncC, NSQ, psIndices, x_Q3, pulses, PredCoef_Q12, LTPCoef_Q14, AR2_Q13, \
                    HarmShapeGain_Q14, Tilt_Q14, LF_shp_Q14, Gains_Q16, pitchL, Lambda_Q10, LTP_scale_Q14))
 
-#endif
-
-#  define OVERRIDE_silk_NSQ_del_dec
+#  endif
 
 void silk_NSQ_del_dec_sse4_1(
-    const silk_encoder_state    *psEncC,                                    /* I    Encoder State                   */
-    silk_nsq_state              *NSQ,                                       /* I/O  NSQ state                       */
-    SideInfoIndices             *psIndices,                                 /* I/O  Quantization Indices            */
-    const opus_int32            x_Q3[],                                     /* I    Prefiltered input signal        */
-    opus_int8                   pulses[],                                   /* O    Quantized pulse signal          */
-    const opus_int16            PredCoef_Q12[ 2 * MAX_LPC_ORDER ],          /* I    Short term prediction coefs     */
-    const opus_int16            LTPCoef_Q14[ LTP_ORDER * MAX_NB_SUBFR ],    /* I    Long term prediction coefs      */
-    const opus_int16            AR2_Q13[ MAX_NB_SUBFR * MAX_SHAPE_LPC_ORDER ], /* I Noise shaping coefs             */
-    const opus_int              HarmShapeGain_Q14[ MAX_NB_SUBFR ],          /* I    Long term shaping coefs         */
-    const opus_int              Tilt_Q14[ MAX_NB_SUBFR ],                   /* I    Spectral tilt                   */
-    const opus_int32            LF_shp_Q14[ MAX_NB_SUBFR ],                 /* I    Low frequency shaping coefs     */
-    const opus_int32            Gains_Q16[ MAX_NB_SUBFR ],                  /* I    Quantization step sizes         */
-    const opus_int              pitchL[ MAX_NB_SUBFR ],                     /* I    Pitch lags                      */
-    const opus_int              Lambda_Q10,                                 /* I    Rate/distortion tradeoff        */
-    const opus_int              LTP_scale_Q14                               /* I    LTP state scaling               */
+    const silk_encoder_state    *psEncC,                                      /* I    Encoder State                   */
+    silk_nsq_state              *NSQ,                                         /* I/O  NSQ state                       */
+    SideInfoIndices             *psIndices,                                   /* I/O  Quantization Indices            */
+    const opus_int16            x16[],                                        /* I    Input                           */
+    opus_int8                   pulses[],                                     /* O    Quantized pulse signal          */
+    const opus_int16            *PredCoef_Q12,                                /* I    Short term prediction coefs     */
+    const opus_int16            LTPCoef_Q14[ LTP_ORDER * MAX_NB_SUBFR ],      /* I    Long term prediction coefs      */
+    const opus_int16            AR_Q13[ MAX_NB_SUBFR * MAX_SHAPE_LPC_ORDER ], /* I    Noise shaping coefs             */
+    const opus_int              HarmShapeGain_Q14[ MAX_NB_SUBFR ],            /* I    Long term shaping coefs         */
+    const opus_int              Tilt_Q14[ MAX_NB_SUBFR ],                     /* I    Spectral tilt                   */
+    const opus_int32            LF_shp_Q14[ MAX_NB_SUBFR ],                   /* I    Low frequency shaping coefs     */
+    const opus_int32            Gains_Q16[ MAX_NB_SUBFR ],                    /* I    Quantization step sizes         */
+    const opus_int              pitchL[ MAX_NB_SUBFR ],                       /* I    Pitch lags                      */
+    const opus_int              Lambda_Q10,                                   /* I    Rate/distortion tradeoff        */
+    const opus_int              LTP_scale_Q14                                 /* I    LTP state scaling               */
+);
+
+void silk_NSQ_del_dec_avx2(
+    const silk_encoder_state *psEncC,                            /* I    Encoder State               */
+    silk_nsq_state *NSQ,                                         /* I/O  NSQ state                   */
+    SideInfoIndices *psIndices,                                  /* I/O  Quantization Indices        */
+    const opus_int16 x16[],                                      /* I    Input                       */
+    opus_int8 pulses[],                                          /* O    Quantized pulse signal      */
+    const opus_int16 *PredCoef_Q12,                              /* I    Short term prediction coefs */
+    const opus_int16 LTPCoef_Q14[LTP_ORDER * MAX_NB_SUBFR],      /* I    Long term prediction coefs  */
+    const opus_int16 AR_Q13[MAX_NB_SUBFR * MAX_SHAPE_LPC_ORDER], /* I    Noise shaping coefs         */
+    const opus_int HarmShapeGain_Q14[MAX_NB_SUBFR],              /* I    Long term shaping coefs     */
+    const opus_int Tilt_Q14[MAX_NB_SUBFR],                       /* I    Spectral tilt               */
+    const opus_int32 LF_shp_Q14[MAX_NB_SUBFR],                   /* I    Low frequency shaping coefs */
+    const opus_int32 Gains_Q16[MAX_NB_SUBFR],                    /* I    Quantization step sizes     */
+    const opus_int32 pitchL[MAX_NB_SUBFR],                       /* I    Pitch lags                  */
+    const opus_int Lambda_Q10,                                   /* I    Rate/distortion tradeoff    */
+    const opus_int LTP_scale_Q14                                 /* I    LTP state scaling           */
 );
 
-#if defined OPUS_X86_PRESUME_SSE4_1
+#  if defined (OPUS_X86_PRESUME_AVX2)
 
-#define silk_NSQ_del_dec(psEncC, NSQ, psIndices, x_Q3, pulses, PredCoef_Q12, LTPCoef_Q14, AR2_Q13, \
-                           HarmShapeGain_Q14, Tilt_Q14, LF_shp_Q14, Gains_Q16, pitchL, Lambda_Q10, LTP_scale_Q14, arch) \
-    ((void)(arch),silk_NSQ_del_dec_sse4_1(psEncC, NSQ, psIndices, x_Q3, pulses, PredCoef_Q12, LTPCoef_Q14, AR2_Q13, \
+#   define OVERRIDE_silk_NSQ_del_dec
+#   define silk_NSQ_del_dec(psEncC, NSQ, psIndices, x16, pulses, PredCoef_Q12, LTPCoef_Q14, AR_Q13, \
+                            HarmShapeGain_Q14, Tilt_Q14, LF_shp_Q14, Gains_Q16, pitchL, Lambda_Q10, LTP_scale_Q14, arch) \
+    ((void)(arch),silk_NSQ_del_dec_avx2(psEncC, NSQ, psIndices, x16, pulses, PredCoef_Q12, LTPCoef_Q14, AR_Q13, \
                            HarmShapeGain_Q14, Tilt_Q14, LF_shp_Q14, Gains_Q16, pitchL, Lambda_Q10, LTP_scale_Q14))
 
-#else
+#  elif defined (OPUS_X86_PRESUME_SSE4_1) && !defined(OPUS_X86_MAY_HAVE_AVX2)
+
+#   define OVERRIDE_silk_NSQ_del_dec
+#   define silk_NSQ_del_dec(psEncC, NSQ, psIndices, x16, pulses, PredCoef_Q12, LTPCoef_Q14, AR_Q13, \
+                            HarmShapeGain_Q14, Tilt_Q14, LF_shp_Q14, Gains_Q16, pitchL, Lambda_Q10, LTP_scale_Q14, arch) \
+    ((void)(arch),silk_NSQ_del_dec_sse4_1(psEncC, NSQ, psIndices, x16, pulses, PredCoef_Q12, LTPCoef_Q14, AR_Q13, \
+                           HarmShapeGain_Q14, Tilt_Q14, LF_shp_Q14, Gains_Q16, pitchL, Lambda_Q10, LTP_scale_Q14))
+
+#  elif defined(OPUS_HAVE_RTCD)
 
 extern void (*const SILK_NSQ_DEL_DEC_IMPL[OPUS_ARCHMASK + 1])(
-    const silk_encoder_state    *psEncC,                                    /* I    Encoder State                   */
-    silk_nsq_state              *NSQ,                                       /* I/O  NSQ state                       */
-    SideInfoIndices             *psIndices,                                 /* I/O  Quantization Indices            */
-    const opus_int32            x_Q3[],                                     /* I    Prefiltered input signal        */
-    opus_int8                   pulses[],                                   /* O    Quantized pulse signal          */
-    const opus_int16            PredCoef_Q12[ 2 * MAX_LPC_ORDER ],          /* I    Short term prediction coefs     */
-    const opus_int16            LTPCoef_Q14[ LTP_ORDER * MAX_NB_SUBFR ],    /* I    Long term prediction coefs      */
-    const opus_int16            AR2_Q13[ MAX_NB_SUBFR * MAX_SHAPE_LPC_ORDER ], /* I Noise shaping coefs             */
-    const opus_int              HarmShapeGain_Q14[ MAX_NB_SUBFR ],          /* I    Long term shaping coefs         */
-    const opus_int              Tilt_Q14[ MAX_NB_SUBFR ],                   /* I    Spectral tilt                   */
-    const opus_int32            LF_shp_Q14[ MAX_NB_SUBFR ],                 /* I    Low frequency shaping coefs     */
-    const opus_int32            Gains_Q16[ MAX_NB_SUBFR ],                  /* I    Quantization step sizes         */
-    const opus_int              pitchL[ MAX_NB_SUBFR ],                     /* I    Pitch lags                      */
-    const opus_int              Lambda_Q10,                                 /* I    Rate/distortion tradeoff        */
-    const opus_int              LTP_scale_Q14                               /* I    LTP state scaling               */
+    const silk_encoder_state    *psEncC,                                      /* I    Encoder State                   */
+    silk_nsq_state              *NSQ,                                         /* I/O  NSQ state                       */
+    SideInfoIndices             *psIndices,                                   /* I/O  Quantization Indices            */
+    const opus_int16            x16[],                                        /* I    Input                           */
+    opus_int8                   pulses[],                                     /* O    Quantized pulse signal          */
+    const opus_int16            *PredCoef_Q12,                                /* I    Short term prediction coefs     */
+    const opus_int16            LTPCoef_Q14[ LTP_ORDER * MAX_NB_SUBFR ],      /* I    Long term prediction coefs      */
+    const opus_int16            AR_Q13[ MAX_NB_SUBFR * MAX_SHAPE_LPC_ORDER ], /* I    Noise shaping coefs             */
+    const opus_int              HarmShapeGain_Q14[ MAX_NB_SUBFR ],            /* I    Long term shaping coefs         */
+    const opus_int              Tilt_Q14[ MAX_NB_SUBFR ],                     /* I    Spectral tilt                   */
+    const opus_int32            LF_shp_Q14[ MAX_NB_SUBFR ],                   /* I    Low frequency shaping coefs     */
+    const opus_int32            Gains_Q16[ MAX_NB_SUBFR ],                    /* I    Quantization step sizes         */
+    const opus_int              pitchL[ MAX_NB_SUBFR ],                       /* I    Pitch lags                      */
+    const opus_int              Lambda_Q10,                                   /* I    Rate/distortion tradeoff        */
+    const opus_int              LTP_scale_Q14                                 /* I    LTP state scaling               */
 );
 
-#  define silk_NSQ_del_dec(psEncC, NSQ, psIndices, x_Q3, pulses, PredCoef_Q12, LTPCoef_Q14, AR2_Q13, \
-                           HarmShapeGain_Q14, Tilt_Q14, LF_shp_Q14, Gains_Q16, pitchL, Lambda_Q10, LTP_scale_Q14, arch) \
-    ((*SILK_NSQ_DEL_DEC_IMPL[(arch) & OPUS_ARCHMASK])(psEncC, NSQ, psIndices, x_Q3, pulses, PredCoef_Q12, LTPCoef_Q14, AR2_Q13, \
+#   define OVERRIDE_silk_NSQ_del_dec
+#   define silk_NSQ_del_dec(psEncC, NSQ, psIndices, x16, pulses, PredCoef_Q12, LTPCoef_Q14, AR_Q13, \
+                            HarmShapeGain_Q14, Tilt_Q14, LF_shp_Q14, Gains_Q16, pitchL, Lambda_Q10, LTP_scale_Q14, arch) \
+    ((*SILK_NSQ_DEL_DEC_IMPL[(arch) & OPUS_ARCHMASK])(psEncC, NSQ, psIndices, x16, pulses, PredCoef_Q12, LTPCoef_Q14, AR_Q13, \
                            HarmShapeGain_Q14, Tilt_Q14, LF_shp_Q14, Gains_Q16, pitchL, Lambda_Q10, LTP_scale_Q14))
 
-#endif
-#endif
+#  endif
 
 void silk_noise_shape_quantizer(
     silk_nsq_state      *NSQ,                   /* I/O  NSQ state                       */
@@ -223,25 +247,52 @@ void silk_VAD_GetNoiseLevels(
     silk_VAD_state              *psSilk_VAD         /* I/O  Pointer to Silk VAD state                   */
 );
 
-#  define OVERRIDE_silk_VAD_GetSA_Q8
-
 opus_int silk_VAD_GetSA_Q8_sse4_1(
     silk_encoder_state *psEnC,
     const opus_int16   pIn[]
 );
 
-#if defined(OPUS_X86_PRESUME_SSE4_1)
-#define silk_VAD_GetSA_Q8(psEnC, pIn, arch) ((void)(arch),silk_VAD_GetSA_Q8_sse4_1(psEnC, pIn))
+#  if defined(OPUS_X86_PRESUME_SSE4_1)
 
-#else
+#   define OVERRIDE_silk_VAD_GetSA_Q8
+#   define silk_VAD_GetSA_Q8(psEnC, pIn, arch) ((void)(arch),silk_VAD_GetSA_Q8_sse4_1(psEnC, pIn))
 
-#  define silk_VAD_GetSA_Q8(psEnC, pIn, arch) \
-     ((*SILK_VAD_GETSA_Q8_IMPL[(arch) & OPUS_ARCHMASK])(psEnC, pIn))
+#  elif defined(OPUS_HAVE_RTCD)
 
 extern opus_int (*const SILK_VAD_GETSA_Q8_IMPL[OPUS_ARCHMASK + 1])(
      silk_encoder_state *psEnC,
      const opus_int16   pIn[]);
 
+#   define OVERRIDE_silk_VAD_GetSA_Q8
+#   define silk_VAD_GetSA_Q8(psEnC, pIn, arch) \
+      ((*SILK_VAD_GETSA_Q8_IMPL[(arch) & OPUS_ARCHMASK])(psEnC, pIn))
+
+#  endif
+
+#ifndef FIXED_POINT
+double silk_inner_product_FLP_avx2(
+    const silk_float    *data1,
+    const silk_float    *data2,
+    opus_int            dataSize
+);
+
+#if defined (OPUS_X86_PRESUME_AVX2)
+
+#define OVERRIDE_inner_product_FLP
+#define silk_inner_product_FLP(data1, data2, dataSize, arch) ((void)arch,silk_inner_product_FLP_avx2(data1, data2, dataSize))
+
+#elif defined(OPUS_HAVE_RTCD) && defined(OPUS_X86_MAY_HAVE_AVX2)
+
+#define OVERRIDE_inner_product_FLP
+extern double (*const SILK_INNER_PRODUCT_FLP_IMPL[OPUS_ARCHMASK + 1])(
+    const silk_float    *data1,
+    const silk_float    *data2,
+    opus_int            dataSize
+);
+
+#define silk_inner_product_FLP(data1, data2, dataSize, arch) ((void)arch,(*SILK_INNER_PRODUCT_FLP_IMPL[(arch) & OPUS_ARCHMASK])(data1, data2, dataSize))
+
+#endif
 #endif
 
 # endif
diff --git a/opus/silk/x86/x86_silk_map.c b/opus/silk/x86/x86_silk_map.c
index 32dcc3ca..39ad7527 100644
--- a/opus/silk/x86/x86_silk_map.c
+++ b/opus/silk/x86/x86_silk_map.c
@@ -32,25 +32,28 @@
 #include "celt/x86/x86cpu.h"
 #include "structs.h"
 #include "SigProc_FIX.h"
+#ifndef FIXED_POINT
+#include "SigProc_FLP.h"
+#endif
 #include "pitch.h"
 #include "main.h"
 
-#if !defined(OPUS_X86_PRESUME_SSE4_1)
+#if defined(OPUS_HAVE_RTCD) && !defined(OPUS_X86_PRESUME_AVX2)
 
 #if defined(FIXED_POINT)
 
 #include "fixed/main_FIX.h"
 
-opus_int64 (*const SILK_INNER_PROD16_ALIGNED_64_IMPL[ OPUS_ARCHMASK + 1 ] )(
+opus_int64 (*const SILK_INNER_PROD16_IMPL[ OPUS_ARCHMASK + 1 ] )(
     const opus_int16 *inVec1,
     const opus_int16 *inVec2,
     const opus_int   len
 ) = {
-  silk_inner_prod16_aligned_64_c,                  /* non-sse */
-  silk_inner_prod16_aligned_64_c,
-  silk_inner_prod16_aligned_64_c,
-  MAY_HAVE_SSE4_1( silk_inner_prod16_aligned_64 ), /* sse4.1 */
-  MAY_HAVE_SSE4_1( silk_inner_prod16_aligned_64 )  /* avx */
+  silk_inner_prod16_c,                  /* non-sse */
+  silk_inner_prod16_c,
+  silk_inner_prod16_c,
+  MAY_HAVE_SSE4_1( silk_inner_prod16 ), /* sse4.1 */
+  MAY_HAVE_SSE4_1( silk_inner_prod16 )  /* avx */
 };
 
 #endif
@@ -66,23 +69,22 @@ opus_int (*const SILK_VAD_GETSA_Q8_IMPL[ OPUS_ARCHMASK + 1 ] )(
   MAY_HAVE_SSE4_1( silk_VAD_GetSA_Q8 )  /* avx */
 };
 
-#if 0 /* FIXME: SSE disabled until the NSQ code gets updated. */
 void (*const SILK_NSQ_IMPL[ OPUS_ARCHMASK + 1 ] )(
-    const silk_encoder_state    *psEncC,                                    /* I    Encoder State                   */
-    silk_nsq_state              *NSQ,                                       /* I/O  NSQ state                       */
-    SideInfoIndices             *psIndices,                                 /* I/O  Quantization Indices            */
-    const opus_int32            x_Q3[],                                     /* I    Prefiltered input signal        */
-    opus_int8                   pulses[],                                   /* O    Quantized pulse signal          */
-    const opus_int16            PredCoef_Q12[ 2 * MAX_LPC_ORDER ],          /* I    Short term prediction coefs     */
-    const opus_int16            LTPCoef_Q14[ LTP_ORDER * MAX_NB_SUBFR ],    /* I    Long term prediction coefs      */
-    const opus_int16            AR2_Q13[ MAX_NB_SUBFR * MAX_SHAPE_LPC_ORDER ], /* I Noise shaping coefs             */
-    const opus_int              HarmShapeGain_Q14[ MAX_NB_SUBFR ],          /* I    Long term shaping coefs         */
-    const opus_int              Tilt_Q14[ MAX_NB_SUBFR ],                   /* I    Spectral tilt                   */
-    const opus_int32            LF_shp_Q14[ MAX_NB_SUBFR ],                 /* I    Low frequency shaping coefs     */
-    const opus_int32            Gains_Q16[ MAX_NB_SUBFR ],                  /* I    Quantization step sizes         */
-    const opus_int              pitchL[ MAX_NB_SUBFR ],                     /* I    Pitch lags                      */
-    const opus_int              Lambda_Q10,                                 /* I    Rate/distortion tradeoff        */
-    const opus_int              LTP_scale_Q14                               /* I    LTP state scaling               */
+    const silk_encoder_state    *psEncC,                                      /* I    Encoder State                   */
+    silk_nsq_state              *NSQ,                                         /* I/O  NSQ state                       */
+    SideInfoIndices             *psIndices,                                   /* I/O  Quantization Indices            */
+    const opus_int16            x16[],                                        /* I    Input                           */
+    opus_int8                   pulses[],                                     /* O    Quantized pulse signal          */
+    const opus_int16            *PredCoef_Q12,                                /* I    Short term prediction coefs     */
+    const opus_int16            LTPCoef_Q14[ LTP_ORDER * MAX_NB_SUBFR ],      /* I    Long term prediction coefs      */
+    const opus_int16            AR_Q13[ MAX_NB_SUBFR * MAX_SHAPE_LPC_ORDER ], /* I    Noise shaping coefs             */
+    const opus_int              HarmShapeGain_Q14[ MAX_NB_SUBFR ],            /* I    Long term shaping coefs         */
+    const opus_int              Tilt_Q14[ MAX_NB_SUBFR ],                     /* I    Spectral tilt                   */
+    const opus_int32            LF_shp_Q14[ MAX_NB_SUBFR ],                   /* I    Low frequency shaping coefs     */
+    const opus_int32            Gains_Q16[ MAX_NB_SUBFR ],                    /* I    Quantization step sizes         */
+    const opus_int              pitchL[ MAX_NB_SUBFR ],                       /* I    Pitch lags                      */
+    const opus_int              Lambda_Q10,                                   /* I    Rate/distortion tradeoff        */
+    const opus_int              LTP_scale_Q14                                 /* I    LTP state scaling               */
 ) = {
   silk_NSQ_c,                  /* non-sse */
   silk_NSQ_c,
@@ -90,21 +92,20 @@ void (*const SILK_NSQ_IMPL[ OPUS_ARCHMASK + 1 ] )(
   MAY_HAVE_SSE4_1( silk_NSQ ), /* sse4.1 */
   MAY_HAVE_SSE4_1( silk_NSQ )  /* avx */
 };
-#endif
 
-#if 0 /* FIXME: SSE disabled until silk_VQ_WMat_EC_sse4_1() gets updated. */
 void (*const SILK_VQ_WMAT_EC_IMPL[ OPUS_ARCHMASK + 1 ] )(
     opus_int8                   *ind,                           /* O    index of best codebook vector               */
-    opus_int32                  *rate_dist_Q14,                 /* O    best weighted quant error + mu * rate       */
+    opus_int32                  *res_nrg_Q15,                   /* O    best residual energy                        */
+    opus_int32                  *rate_dist_Q8,                  /* O    best total bitrate                          */
     opus_int                    *gain_Q7,                       /* O    sum of absolute LTP coefficients            */
-    const opus_int16            *in_Q14,                        /* I    input vector to be quantized                */
-    const opus_int32            *W_Q18,                         /* I    weighting matrix                            */
+    const opus_int32            *XX_Q17,                        /* I    correlation matrix                          */
+    const opus_int32            *xX_Q17,                        /* I    correlation vector                          */
     const opus_int8             *cb_Q7,                         /* I    codebook                                    */
     const opus_uint8            *cb_gain_Q7,                    /* I    codebook effective gain                     */
     const opus_uint8            *cl_Q5,                         /* I    code length for each codebook vector        */
-    const opus_int              mu_Q9,                          /* I    tradeoff betw. weighted error and rate      */
+    const opus_int              subfr_len,                      /* I    number of samples per subframe              */
     const opus_int32            max_gain_Q7,                    /* I    maximum sum of absolute LTP coefficients    */
-    opus_int                    L                               /* I    number of vectors in codebook               */
+    const opus_int              L                               /* I    number of vectors in codebook               */
 ) = {
   silk_VQ_WMat_EC_c,                  /* non-sse */
   silk_VQ_WMat_EC_c,
@@ -112,33 +113,30 @@ void (*const SILK_VQ_WMAT_EC_IMPL[ OPUS_ARCHMASK + 1 ] )(
   MAY_HAVE_SSE4_1( silk_VQ_WMat_EC ), /* sse4.1 */
   MAY_HAVE_SSE4_1( silk_VQ_WMat_EC )  /* avx */
 };
-#endif
 
-#if 0 /* FIXME: SSE disabled until the NSQ code gets updated. */
 void (*const SILK_NSQ_DEL_DEC_IMPL[ OPUS_ARCHMASK + 1 ] )(
-    const silk_encoder_state    *psEncC,                                    /* I    Encoder State                   */
-    silk_nsq_state              *NSQ,                                       /* I/O  NSQ state                       */
-    SideInfoIndices             *psIndices,                                 /* I/O  Quantization Indices            */
-    const opus_int32            x_Q3[],                                     /* I    Prefiltered input signal        */
-    opus_int8                   pulses[],                                   /* O    Quantized pulse signal          */
-    const opus_int16            PredCoef_Q12[ 2 * MAX_LPC_ORDER ],          /* I    Short term prediction coefs     */
-    const opus_int16            LTPCoef_Q14[ LTP_ORDER * MAX_NB_SUBFR ],    /* I    Long term prediction coefs      */
-    const opus_int16            AR2_Q13[ MAX_NB_SUBFR * MAX_SHAPE_LPC_ORDER ], /* I Noise shaping coefs             */
-    const opus_int              HarmShapeGain_Q14[ MAX_NB_SUBFR ],          /* I    Long term shaping coefs         */
-    const opus_int              Tilt_Q14[ MAX_NB_SUBFR ],                   /* I    Spectral tilt                   */
-    const opus_int32            LF_shp_Q14[ MAX_NB_SUBFR ],                 /* I    Low frequency shaping coefs     */
-    const opus_int32            Gains_Q16[ MAX_NB_SUBFR ],                  /* I    Quantization step sizes         */
-    const opus_int              pitchL[ MAX_NB_SUBFR ],                     /* I    Pitch lags                      */
-    const opus_int              Lambda_Q10,                                 /* I    Rate/distortion tradeoff        */
-    const opus_int              LTP_scale_Q14                               /* I    LTP state scaling               */
+    const silk_encoder_state    *psEncC,                                      /* I    Encoder State                   */
+    silk_nsq_state              *NSQ,                                         /* I/O  NSQ state                       */
+    SideInfoIndices             *psIndices,                                   /* I/O  Quantization Indices            */
+    const opus_int16            x16[],                                        /* I    Input                           */
+    opus_int8                   pulses[],                                     /* O    Quantized pulse signal          */
+    const opus_int16            *PredCoef_Q12,                                /* I    Short term prediction coefs     */
+    const opus_int16            LTPCoef_Q14[ LTP_ORDER * MAX_NB_SUBFR ],      /* I    Long term prediction coefs      */
+    const opus_int16            AR_Q13[ MAX_NB_SUBFR * MAX_SHAPE_LPC_ORDER ], /* I    Noise shaping coefs             */
+    const opus_int              HarmShapeGain_Q14[ MAX_NB_SUBFR ],            /* I    Long term shaping coefs         */
+    const opus_int              Tilt_Q14[ MAX_NB_SUBFR ],                     /* I    Spectral tilt                   */
+    const opus_int32            LF_shp_Q14[ MAX_NB_SUBFR ],                   /* I    Low frequency shaping coefs     */
+    const opus_int32            Gains_Q16[ MAX_NB_SUBFR ],                    /* I    Quantization step sizes         */
+    const opus_int              pitchL[ MAX_NB_SUBFR ],                       /* I    Pitch lags                      */
+    const opus_int              Lambda_Q10,                                   /* I    Rate/distortion tradeoff        */
+    const opus_int              LTP_scale_Q14                                 /* I    LTP state scaling               */
 ) = {
   silk_NSQ_del_dec_c,                  /* non-sse */
   silk_NSQ_del_dec_c,
   silk_NSQ_del_dec_c,
   MAY_HAVE_SSE4_1( silk_NSQ_del_dec ), /* sse4.1 */
-  MAY_HAVE_SSE4_1( silk_NSQ_del_dec )  /* avx */
+  MAY_HAVE_AVX2( silk_NSQ_del_dec )  /* avx */
 };
-#endif
 
 #if defined(FIXED_POINT)
 
@@ -161,4 +159,21 @@ void (*const SILK_BURG_MODIFIED_IMPL[ OPUS_ARCHMASK + 1 ] )(
 };
 
 #endif
+
+#ifndef FIXED_POINT
+
+double (*const SILK_INNER_PRODUCT_FLP_IMPL[ OPUS_ARCHMASK + 1 ] )(
+    const silk_float    *data1,
+    const silk_float    *data2,
+    opus_int            dataSize
+) = {
+  silk_inner_product_FLP_c,                  /* non-sse */
+  silk_inner_product_FLP_c,
+  silk_inner_product_FLP_c,
+  silk_inner_product_FLP_c, /* sse4.1 */
+  MAY_HAVE_AVX2( silk_inner_product_FLP )  /* avx */
+};
+
+#endif
+
 #endif
diff --git a/opus/src/analysis.c b/opus/src/analysis.c
index cb46dec5..1f580138 100644
--- a/opus/src/analysis.c
+++ b/opus/src/analysis.c
@@ -31,7 +31,9 @@
 
 #define ANALYSIS_C
 
+#ifdef MLP_TRAINING
 #include <stdio.h>
+#endif
 
 #include "mathops.h"
 #include "kiss_fft.h"
@@ -927,9 +929,9 @@ static void tonality_analysis(TonalityAnalysisState *tonal, const CELTMode *celt
     features[23] = info->tonality_slope + 0.069216f;
     features[24] = tonal->lowECount - 0.067930f;
 
-    compute_dense(&layer0, layer_out, features);
-    compute_gru(&layer1, tonal->rnn_state, layer_out);
-    compute_dense(&layer2, frame_probs, tonal->rnn_state);
+    analysis_compute_dense(&layer0, layer_out, features);
+    analysis_compute_gru(&layer1, tonal->rnn_state, layer_out);
+    analysis_compute_dense(&layer2, frame_probs, tonal->rnn_state);
 
     /* Probability of speech or music vs noise */
     info->activity_probability = frame_probs[1];
diff --git a/opus/src/extensions.c b/opus/src/extensions.c
new file mode 100644
index 00000000..bb6c0b02
--- /dev/null
+++ b/opus/src/extensions.c
@@ -0,0 +1,315 @@
+/* Copyright (c) 2022 Amazon */
+/*
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions
+   are met:
+
+   - Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+
+   - Redistributions in binary form must reproduce the above copyright
+   notice, this list of conditions and the following disclaimer in the
+   documentation and/or other materials provided with the distribution.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+
+#include "opus_types.h"
+#include "opus_defines.h"
+#include "arch.h"
+#include "os_support.h"
+#include "opus_private.h"
+
+
+/* Advance *data past the extension at its head. Returns the bytes left after it, or -1 if it is
+   truncated. *header_size receives the header byte count (left untouched when len is 0). */
+opus_int32 skip_extension(const unsigned char **data, opus_int32 len, opus_int32 *header_size)
+{
+   int id, L;
+   if (len==0)
+      return 0;
+   id = **data>>1; /* upper seven bits of the first byte */
+   L = **data&1; /* lowest bit of the first byte */
+   if (id == 0 && L == 1) /* one-byte element with no payload */
+   {
+      *header_size = 1;
+      if (len < 1) /* never true when len >= 0: len == 0 already returned above */
+         return -1;
+      (*data)++;
+      len--;
+      return len;
+   } else if (id > 0 && id < 32) /* short form: L is the payload size (0 or 1 byte) */
+   {
+      if (len < 1+L)
+         return -1;
+      *data += 1+L;
+      len -= 1+L;
+      *header_size = 1;
+      return len;
+   } else { /* id == 0 with L == 0, or id >= 32 */
+      if (L==0) /* no length field: the element consumes every remaining byte */
+      {
+         *data += len;
+         *header_size = 1;
+         return 0;
+      } else {
+         opus_int32 bytes=0; /* payload size, summed from the length bytes */
+         *header_size = 1;
+         do { /* each length byte is added to the total; a byte below 255 ends the run */
+            (*data)++;
+            len--;
+            if (len == 0) /* input ended before the length field did */
+               return -1;
+            bytes += **data;
+            (*header_size)++;
+         } while (**data == 255);
+         (*data)++; /* step over the final length byte */
+         len--;
+         if (bytes <= len)
+         {
+            len -= bytes;
+            *data += bytes;
+         } else { /* declared payload is larger than what is left */
+            return -1;
+         }
+         return len;
+      }
+   }
+}
+
+/* Count the extensions (ID > 1) in data[0..len), skipping padding and separators; OPUS_INVALID_PACKET if one is truncated. */
+opus_int32 opus_packet_extensions_count(const unsigned char *data, opus_int32 len)
+{
+   opus_int32 curr_len; /* bytes not yet walked */
+   opus_int32 count=0;
+   const unsigned char *curr_data = data;
+
+   celt_assert(len >= 0);
+   celt_assert(data != NULL || len == 0);
+
+   curr_len = len;
+   while (curr_len > 0)
+   {
+      int id;
+      opus_int32 header_size; /* required by skip_extension(); not used here */
+      id = *curr_data>>1; /* read the ID before skip_extension() moves curr_data */
+      curr_len = skip_extension(&curr_data, curr_len, &header_size);
+      if (curr_len < 0)
+         return OPUS_INVALID_PACKET;
+      if (id > 1) /* IDs 0 and 1 are not counted */
+         count++;
+   }
+   return count;
+}
+
+/* Extract extensions (ID > 1) from Opus padding into extensions[]. On entry *nb_extensions is the array capacity; on return it is the number stored. */
+opus_int32 opus_packet_extensions_parse(const unsigned char *data, opus_int32 len, opus_extension_data *extensions, opus_int32 *nb_extensions)
+{
+   const unsigned char *curr_data;
+   opus_int32 curr_len;
+   int curr_frame=0; /* frame index assigned to the extensions that follow */
+   opus_int32 count=0;
+
+   celt_assert(len >= 0);
+   celt_assert(data != NULL || len == 0);
+   celt_assert(nb_extensions != NULL);
+   celt_assert(extensions != NULL || *nb_extensions == 0);
+
+   curr_data = data;
+   curr_len = len;
+   while (curr_len > 0)
+   {
+      int id;
+      opus_int32 header_size;
+      opus_extension_data curr_ext; /* only filled in when id > 1 */
+      id = *curr_data>>1;
+      if (id > 1)
+      {
+         curr_ext.id = id;
+         curr_ext.frame = curr_frame;
+         curr_ext.data = curr_data; /* header start for now; moved to the payload below */
+      } else if (id == 1) /* frame separator */
+      {
+         int L = *curr_data&1;
+         if (L==0)
+            curr_frame++;
+         else {
+            if (curr_len >= 2)
+               curr_frame += curr_data[1]; /* explicit increment stored in the next byte */
+            /* Else we're at the end and it doesn't matter. */
+         }
+         if (curr_frame >= 48)
+         {
+            *nb_extensions = count;
+            return OPUS_INVALID_PACKET;
+         }
+      }
+      curr_len = skip_extension(&curr_data, curr_len, &header_size);
+      /* A negative remaining length means the element was truncated. */
+      if (curr_len < 0)
+      {
+         *nb_extensions = count;
+         return OPUS_INVALID_PACKET;
+      }
+      celt_assert(curr_data - data == len - curr_len);
+      if (id > 1)
+      {
+         if (count == *nb_extensions) /* output array is full; *nb_extensions already equals count */
+         {
+             return OPUS_BUFFER_TOO_SMALL;
+         }
+         curr_ext.len = curr_data - curr_ext.data - header_size; /* payload bytes only */
+         curr_ext.data += header_size; /* points into the input buffer; nothing is copied */
+         extensions[count++] = curr_ext;
+      }
+   }
+   celt_assert(curr_len == 0);
+   *nb_extensions = count;
+   return OPUS_OK;
+}
+/* Serialize extensions frame by frame into data (only sized when data is NULL); returns bytes used or a negative OPUS_* error. */
+opus_int32 opus_packet_extensions_generate(unsigned char *data, opus_int32 len, const opus_extension_data  *extensions, opus_int32 nb_extensions, int pad)
+{
+   int max_frame=0;
+   opus_int32 i;
+   int frame;
+   int curr_frame = 0; /* frame the output stream is currently positioned at */
+   opus_int32 pos = 0; /* bytes produced so far */
+   opus_int32 written = 0; /* extensions emitted so far */
+
+   celt_assert(len >= 0);
+
+   for (i=0;i<nb_extensions;i++)
+   {
+      max_frame = IMAX(max_frame, extensions[i].frame);
+      if (extensions[i].id < 2 || extensions[i].id > 127 || extensions[i].frame < 0)
+         return OPUS_BAD_ARG; /* frame < 0 would never match the loop below and be dropped silently */
+   }
+   if (max_frame >= 48) return OPUS_BAD_ARG;
+   for (frame=0;frame<=max_frame;frame++)
+   {
+      for (i=0;i<nb_extensions;i++)
+      {
+         if (extensions[i].frame == frame)
+         {
+            /* Insert separator when needed. */
+            if (frame != curr_frame) {
+               int diff = frame - curr_frame;
+               if (len-pos < 2)
+                  return OPUS_BUFFER_TOO_SMALL;
+               if (diff == 1) {
+                  if (data) data[pos] = 0x02; /* ID 1, L=0: advance one frame */
+                  pos++;
+               } else {
+                  if (data) data[pos] = 0x03; /* ID 1, L=1: increment follows */
+                  pos++;
+                  if (data) data[pos] = diff;
+                  pos++;
+               }
+               curr_frame = frame;
+            }
+            if (extensions[i].id < 32) /* short form: payload is 0 or 1 byte */
+            {
+               if (extensions[i].len < 0 || extensions[i].len > 1)
+                  return OPUS_BAD_ARG;
+               if (len-pos < extensions[i].len+1)
+                  return OPUS_BUFFER_TOO_SMALL;
+               if (data) data[pos] = (extensions[i].id<<1) + extensions[i].len;
+               pos++;
+               if (extensions[i].len > 0) {
+                  if (data) data[pos] = extensions[i].data[0];
+                  pos++;
+               }
+            } else { /* long form */
+               int last;
+               opus_int32 length_bytes;
+               if (extensions[i].len < 0)
+                  return OPUS_BAD_ARG;
+               last = (written == nb_extensions - 1); /* final extension is written with L=0 and no length field */
+               length_bytes = 1 + extensions[i].len/255;
+               if (last)
+                  length_bytes = 0;
+               if (len-pos < 1 + length_bytes + extensions[i].len)
+                  return OPUS_BUFFER_TOO_SMALL;
+               if (data) data[pos] = (extensions[i].id<<1) + !last;
+               pos++;
+               if (!last)
+               {
+                  opus_int32 j;
+                  for (j=0;j<extensions[i].len/255;j++) { /* one 255 byte per full 255 of payload */
+                     if (data) data[pos] = 255;
+                     pos++;
+                  }
+                  if (data) data[pos] = extensions[i].len % 255; /* terminating byte, always below 255 */
+                  pos++;
+               }
+               if (data) OPUS_COPY(&data[pos], extensions[i].data, extensions[i].len);
+               pos += extensions[i].len;
+            }
+            written++;
+         }
+      }
+   }
+   /* If we need to pad, just prepend 0x01 bytes. Even better would be to fill the
+      end with zeros, but that requires checking that turning the last extension into
+      an L=1 case still fits. */
+   if (pad && pos < len)
+   {
+      opus_int32 padding = len - pos;
+      if (data) {
+         OPUS_MOVE(data+padding, data, pos); /* shift the payload to the end of the buffer */
+         for (i=0;i<padding;i++)
+            data[i] = 0x01;
+      }
+      pos += padding;
+   }
+   return pos;
+}
+
+#if 0 /* Disabled manual round-trip demo: generate, dump, count, parse. Never compiled. */
+#include <stdio.h>
+int main()
+{
+   opus_extension_data ext[] = {{2, 0, (const unsigned char *)"a", 1},
+   {32, 10, (const unsigned char *)"DRED", 4},
+   {33, 1, (const unsigned char *)"NOT DRED", 8},
+   {3, 4, (const unsigned char *)NULL, 0}
+   };
+   opus_extension_data ext2[10]; /* receives the parsed extensions */
+   int i, len;
+   int nb_ext = 10; /* capacity of ext2 on entry to the parser */
+   unsigned char packet[10000];
+   len = opus_packet_extensions_generate(packet, 32, ext, 4, 1); /* 32-byte budget, padding enabled */
+   for (i=0;i<len;i++) /* hex dump, 16 bytes per row */
+   {
+      printf("%#04x ", packet[i]);
+      if (i%16 == 15)
+         printf("\n");
+   }
+   printf("\n");
+   printf("count = %d\n", opus_packet_extensions_count(packet, len));
+   opus_packet_extensions_parse(packet, len, ext2, &nb_ext);
+   for (i=0;i<nb_ext;i++) /* print id, frame, payload bytes and length of each parsed extension */
+   {
+      int j;
+      printf("%d %d {", ext2[i].id, ext2[i].frame);
+      for (j=0;j<ext2[i].len;j++) printf("%#04x ", ext2[i].data[j]);
+      printf("} %d\n", ext2[i].len);
+   }
+}
+#endif /* 0 */
diff --git a/opus/src/mapping_matrix.c b/opus/src/mapping_matrix.c
index 31298af0..3f78ab59 100644
--- a/opus/src/mapping_matrix.c
+++ b/opus/src/mapping_matrix.c
@@ -302,6 +302,287 @@ const opus_int16 mapping_matrix_toa_mixing_data[324] = {
          0,      0,      0,  32767
 };
 
+const MappingMatrix mapping_matrix_fourthoa_mixing = { 27, 27, 0 }; /* 27 x 27 = 729, the size of mapping_matrix_fourthoa_mixing_data below; the fields are presumably {rows, cols, gain} -- confirm in mapping_matrix.h. */
+const opus_int16 mapping_matrix_fourthoa_mixing_data[729] = {
+      9243,      0,  16010,      0,      0,      0,  20669,      0,
+         0,      0,      0,      0,  24456,      0,      0,      0,
+         0,      0,      0,      0,  27731,      0,      0,      0,
+         0,      0,      0,   9243,      0,  10884,  11741,      0,
+         0,   3995,  17849,   9626,      0,      0,      0,  -5727,
+     14399,  17315,   7625,      0,      0,      0,      0, -11747,
+      2574,  18637,  15552,   5930,      0,      0,   9243, -14302,
+     -2682,  -6677,  13337,   5357,  -9464,   2501, -11170,   4770,
+     -5911,  11501,   5858,   5369,   4951,  17901, -19071,  -2397,
+     -9281,  -9198,   7576,  -4294,   7773,  -8997,  -3399,      0,
+         0,   9243,   9940,  11991,  -3705,  -5144,  16647,   7057,
+     -6206,  -5941,  -2698, -10194,  16781,  -1788,  -6256, -11772,
+      4935,   3912,  -6062, -13039,   9446,  -9758,  -3521, -15058,
+     11089,    565,      0,      0,   9243, -15376,   3720,   2461,
+     -5285,  -7989,  -8660,   1278, -16087,  15811,  -3249,  10500,
+     -7757,  -1680,  -9890,  -8153,  10884,  11022,   2847,  12828,
+      5137,  -2053,   8666,  -5684,  14776,      0,      0,   9243,
+    -10577,  10304,  -6186,   9139, -15222,   2507,  -8902,  -5140,
+      -145,  15562, -10596,  -7311,  -6197,  -8753,   8667,  -6014,
+      -281,  15033,    938, -11859,    548,  -8456,  16735,  -3654,
+         0,      0,   9243,   8974,   4839, -12343, -15472,   6066,
+     -7501,  -8343,   5015,  15920, -12374,  -4559,  -9400,   6271,
+      4011,   5191,  -9932,  14438,   4828,  -8768,   1909,  12059,
+     -1565,   4707, -13711,      0,      0,   9243,  15799,   2085,
+     -1534,  -3386,   4602,  -9808,   -447, -17267, -18054,  -1167,
+    -13525,  -4644,   1313,  -5951,   5397,   7485,  -7056,   2584,
+     -8120,   8669,    788,  13177,   2109,  18349,      0,      0,
+      9243,  12371, -10036,   1597,   2760, -17341,   1848,  -2239,
+    -10509,  -8474,  -4577,  11164,   7935,   1441,  17430,  -3436,
+     -3713,  15936,   4184,   2647, -11730,    341, -15934,   6462,
+      6581,      0,      0,   9243,  -8963,   2184,  13084, -16381,
+     -2734,  -9757,   3991,   6345, -18297,  -5912,   7604,  -4849,
+    -11100,   2290,  -4304, -13305,  -7488,  12338,   4805,   8505,
+     -7014,  -4779,  -1761, -14597,      0,      0,   9243,   1301,
+    -15498,   3799,    690,  -2816,  18718,  -8223,    889,    255,
+     -1768,   4485, -19951,  13097,  -2278,    167,     78,   -740,
+      3324,  -6139,  19488, -17925,   4283,   -486,     20,      0,
+         0,   9243, -13470,  -6719,   5452, -10257,  12641,  -4873,
+     -5116, -10595,   5856,  11389,   1502,  10876,   -608,  11765,
+    -13218,  13911,  -7373,  -2070, -13679,  -4154,   5536,  -2138,
+     16643,    451,      0,      0,   9243,   2455,  -3679, -15387,
+     -5277,  -1261,  -8697,   7906,  16112,   8147,   3208,  -1690,
+      7687,  10593,  -9796, -15852, -10884,  -5616,   2881,   2032,
+      5246, -12735,  -8796,  10928,  14833,      0,      0,   9243,
+     -6849,   2775, -14202,  13586,  -2655,  -9402,  -5505,  10809,
+    -18013,   6231,   5444,  -6041,  11288,   4958,  -4078,  18799,
+     -9368,  -9291,   4535,   7383,   9405,  -7391,  -2121,  -4336,
+         0,      0,   9243,   6423,  -9040,  11548,  10359,  -8109,
+      -450, -14580,   6431,  10857, -15475,   3569,   9707,   6416,
+     -9607,    521,   8528, -18391,  11049,   3815, -10423,   6860,
+      6860,   -883,  -4221,      0,      0,   9243,  11932,  -5968,
+     -8850, -14749,  -9946,  -6026,   7377,  -4472,   5206,  14547,
+     -3406,  10508,   2526,   4411,  14543,   8444,  -5822,    347,
+     12347,  -1709,  -9158,    105, -16265, -12642,      0,      0,
+      9243,  13044,   -150,   9282,  16910,   -274, -10332,   -194,
+     -5864,   5428,   -420, -12196,    344,  -8679,    145, -18554,
+    -12695,   -152, -14635,    503,  10389,    358,   5076,    522,
+    -16100,      0,      0,   9243,  -8374, -13590,  -1221,   1428,
+     15896,  12005,   2318,  -4793,   2590,  -3209, -20390,  -6256,
+     -2974,  10766,   1202,   -876,  -6597,   5004,  19896,  -1541,
+      2902, -16788,  -3062,   1340,      0,      0,   9243,   9879,
+     10267,   7300,  10073,  14167,   2416,  10469,  -3094,   2899,
+     17092,   9762,  -7400,   7214,  -5250,  -8238,  -3989,   5578,
+     16392,  -1050, -11848,   -776,  -5034, -15850,  -5882,      0,
+         0,   9243,  -4974,  -9068,  12221,  -8490,   6299,   -388,
+    -15478,   8702,  -9920,  12723,  -2810,   9668,   6905, -13040,
+      4325,  -9456,  16856,  -9159,  -2909, -10476,   7149,   9387,
+     -7350,    233,      0,      0,   9243,   3627, -13823,  -7218,
+     -3656,  -7002,  12776,  13935,   2719,   2446,   8352,   9252,
+     -7676, -18413,  -6212,   -429,  -1272,  -6335, -13356,  -9510,
+       295,  18926,   9934,   1112,   -382,      0,      0,   9243,
+     -6383,  -9343, -11326,  10097,   8329,    223,  14780,   6114,
+    -10348, -15590,  -4195,   9257,  -7445,  -9439,   -323,   7902,
+     18117,  12101,  -3142, -10944,  -5577,   7327,    566,  -4133,
+         0,      0,   9243,   2626,    865,  15769,   5783,    317,
+    -10244,   1905,  16884,   9144,    826,  -2420,  -1972, -14536,
+      2413,  16939,  12500,   1482,  -4906,   -578,  10096,  -3476,
+    -14323,   2745,  16105,      0,      0,   9243,  -8975,  12086,
+      5450,  -6832, -15149,   7333,   9200,  -3550,   -362, -13645,
+    -15525,  -1391,   9428,  -7091,  -5442,   3105,   -820, -17685,
+     -9175,  -9462,   5572,  -9191, -12325,  -2180,      0,      0,
+      9243,   -114,  11576, -11058,    177,   -185,   5875, -17880,
+      8539,   -198,    339,   -173,  -3411, -16698,  16336,  -6369,
+       193,   -430,    408,    -75, -10806,  -7225,  19670, -13817,
+      4665,      0,      0,      0,      0,      0,      0,      0,
+         0,      0,      0,      0,      0,      0,      0,      0,
+         0,      0,      0,      0,      0,      0,      0,      0,
+         0,      0,      0,      0,  32767,      0,      0,      0,
+         0,      0,      0,      0,      0,      0,      0,      0,
+         0,      0,      0,      0,      0,      0,      0,      0,
+         0,      0,      0,      0,      0,      0,      0,      0,
+     32767
+};
+
+const MappingMatrix mapping_matrix_fifthoa_mixing = { 38, 38, 0 }; /* 38 x 38 = 1444, the size of mapping_matrix_fifthoa_mixing_data below; the fields are presumably {rows, cols, gain} -- confirm in mapping_matrix.h. */
+const opus_int16 mapping_matrix_fifthoa_mixing_data[1444] = {
+      9243,      0,  16010,      0,      0,      0,  20669,      0,
+         0,      0,      0,      0,  24456,      0,      0,      0,
+         0,      0,      0,      0,  27731,      0,      0,      0,
+         0,      0,      0,      0,      0,      0,  30657,      0,
+         0,      0,      0,      0,      0,      0,   9243,      0,
+     -7023,  14387,      0,      0,  -4369, -14112,  14455,      0,
+         0,      0,  10931,   -510, -16777,  14031,      0,      0,
+         0,      0,  -5118,  14286,   4343, -18465,  13374,      0,
+         0,      0,      0,      0,  -6494, -12221,  11761,   8513,
+    -19458,  12605,      0,      0,   9243, -14128,   5093,   5547,
+    -10946, -10050,  -7197,   3945, -11790,   7142,  -9213,   6529,
+     -9701,  -2563,  -9923, -14846,  16521,   6816,   2764,  14103,
+      1118,  -5537,   2977, -14168,   1228,   4866,  17430,   -528,
+     10639,   2641,  10437,  -1037,  11460,   1098,   1296,  15737,
+         0,      0,   9243,   1128, -14775,   6062,    955,  -2329,
+     16069, -12511,   2477,    579,  -2333,   3440, -14197,  18478,
+     -6050,    940,    303,  -1604,   4106,  -4223,   9829, -22688,
+     10647,  -2604,    334,    145,   -927,   3203,  -6017,   4507,
+     -3812,  24212, -15600,   5198,  -1023,    110,      0,      0,
+      9243,   1158,  12997,   9277,   1501,   2103,  10097,  16840,
+      5916,   1402,   3225,   2488,   2929,  19916,  12706,   3585,
+      1137,   3415,   4698,   2078,  -5442,  16634,  18511,   8731,
+      2095,    850,   3061,   5733,   5225,    960, -11728,   7689,
+     20588,  14659,   5642,   1187,      0,      0,   9243,  -4663,
+     -3081, -15003,   9771,   2007,  -9185,   6457,  14199, -14357,
+     -4976,   3554,   6625,  11434,  -7231, -11297,  17760,   8291,
+     -6267,  -3368,   6712, -10837,  -9107,   6524,   6793, -19531,
+    -11338,   7934,   7335,  -2205,  -9215,  -7094,  10659,   6243,
+     -4337,  -1250,      0,      0,   9243, -13515,   7679,  -3831,
+      7232, -14496,  -3201,  -4109, -11731,   8828,   9178,  -1901,
+    -10848,   -539, -14888,   9626, -10860,  12703,   3824,  12334,
+     -7104,   3496,  -6203,  13852,   5461,  -2109, -17277,   7837,
+     -4714,  13901,   4097,   3940,   7647,   8546,   8688, -10986,
+         0,      0,   9243,   8113,  -9860,   9657,  10943, -11174,
+      1426, -13300,   1915,   8178, -17833,   6805,   8309,   8100,
+     -3121,  -4742,   2683, -15111,  15688,   2358, -11590,   2807,
+      2746,   8762,  -7430,  -2251,  -5481,  16370,  -4081,  -9694,
+      5872, -11539,   -714,  -9492,  15177,  -6126,      0,      0,
+      9243,   9933,  -9215,  -8528, -11831, -12785,    -62,  10976,
+     -1811,   5593,  18018,   6100,   9455,  -5237,   2758,   8971,
+      2743,  -9659, -13517,   5330, -10737,  -4576,  -2069, -15491,
+     -8749,  -7226,  -5237,   9191,   -181, -12277,   2815,  10540,
+       -27,  14741,  16703,   3103,      0,      0,   9243, -10067,
+     -8881,  -8723,  12265,  12487,   -793,  10821,  -1762,  -6021,
+    -18002,  -5072,   9912,  -4395,   2587,   9368,  -2767,  10021,
+     12259,  -6468, -10113,  -5605,  -1761, -15590,  -9430,   7800,
+      5092,  -8835,   2293,  12314,   1222,  10671,   -329,  13745,
+     17349,   3563,      0,      0,   9243,  -6485,  12991,  -6743,
+      6108, -11768,  10080, -12236,    238,  -2883,  13115, -13907,
+      2900, -14460,    511,   2564,    186,  -7019,  19094, -11597,
+     -5472, -12058,    744,   6243,  -2384,    930,    501, -11778,
+     21214,  -5330, -11746,  -5542,    827,  10475,  -6418,   1132,
+         0,      0,   9243,   3862,   5238, -14627,  -7891,   2826,
+     -7015, -10701,  13900,  11410,  -6831,  -1679,  -9861,   6359,
+     12032, -11660, -14041,  11199,   1713,  -3895,    657,  14749,
+     -3017, -11445,   8380,  15575, -15236,   -346,   7690,   -923,
+     10317,   3498, -13545,    354,   9093,  -4476,      0,      0,
+      9243,  -8417,  13183,   3418,  -4018, -15498,  10685,   6294,
+     -4132,   1419,  -8755, -18818,   3926,   7642,  -9001,  -3235,
+      2125,   3506, -13037, -16570,  -4337,   6729, -13404,  -7991,
+        59,    443,   5804,   6005, -15011,  -9060, -11044,   3679,
+    -15434, -13685,    161,   1185,      0,      0,   9243,  -5288,
+      6773, -13508,   9977,  -5002,  -4784, -12780,  10790, -12942,
+     11168,    519, -10890,   1326,  12078,  -6274,  13780, -16427,
+      2186,   5352,  -4328,  13671,   2364,  -7963,   1080, -12568,
+     19336,  -6557,  -8574,   4084,   7277,  10433,  -9273,  -3178,
+      1516,   3817,      0,      0,   9243,   9660,   7817,  10093,
+     13619,  10548,  -2942,  11021,    597,   9663,  17594,   1736,
+    -10794,   1814,    771,  -8469,   1041,  14155,   7891,  -8597,
+     -7498,  -8982,    346, -12407, -11848,  -6809,   1686,   9181,
+     -8306, -10247,   3538, -10706,   -364,  -8047, -19188,  -8493,
+         0,      0,   9243,  -7163,  -1020,  14282, -14289,   1021,
+    -10208,  -2036,  10660, -18919,   2410,   6564,   2323, -13088,
+     -1798,   3365, -19498,   3619,  12022,  -1858,   9978,   3705,
+     -8969,   -643,  -5794, -15523,   4123,  15113,  -3949,  -6265,
+     -3596,  12490,   2946,  -2688,   1225, -14570,      0,      0,
+      9243, -12187,    772, -10354,  17623,  -1314, -10262,  -1117,
+     -2885,  -9937,   2249,  11267,  -1763,   9572,   -368,  16506,
+     -6510,  -1438, -15014,   2402,  10157,   2041,   2458,   2389,
+    -19346,  19860,  -1041,   8067,  -3704, -10931,   2743,  -9286,
+       606, -13399,  -3095,   7924,      0,      0,   9243,  15545,
+     -2367,  -3011,  -6538,  -5139,  -9657,    995, -16242, -15706,
+      2557, -12952,   5226,   2508,   6353,  10156,  13593,   6966,
+      4795,   8960,   8183,  -1735,  11914,  -4504,  14149,  11727,
+     -6665,  10460,  -3962,  10145,  -7648,  -1965,  -9845,  -6764,
+     -6938, -16633,      0,      0,   9243,   3098,  12983,  -8841,
+     -3826,   5618,  10053, -16031,   4787,   3283,  -8209,   6632,
+      2856, -18922,  10272,  -2055,  -2344,   7987, -11939,   5516,
+     -5520, -15739,  14940,  -5001,    530,   1465,  -6306,  13388,
+    -13243,   2513, -11772,  -7170,  16572,  -8384,   1426,    168,
+         0,      0,   9243, -15767,  -2008,  -1916,   4220,   4422,
+     -9846,    537, -17105,  17650,  -1400,  13589,   4481,   1651,
+      5677,   6701,  -9241,  -6642,  -3252,  -7827,   8792,   -951,
+     13182,  -2522,  17586, -17005,   3845, -12562,   2213, -11472,
+     -6688,  -1394,  -8970,  -4769,  -7316, -11753,      0,      0,
+      9243, -13344,  -3829,   7975, -14863,   7136,  -8561,  -4265,
+     -7992,   -801,   9405,   8912,   7937,  -5326,   5057, -17681,
+     15207,    575,   7717, -11360,   4847,   6789,   4150,  12686,
+    -10050,  16730, -12063,    322, -12920,  -3313, -10267,   1980,
+     -6948,   7112,   7972,   8042,      0,      0,   9243,   7791,
+     -1021,  13949,  15180,  -1111, -10208,  -1989,   9348,  19199,
+     -2561,  -7140,   2323, -12782,  -1577,    817,  18164,  -3673,
+    -12771,   2022,   9978,   3620,  -7865,   -156,  -9155,  11924,
+     -3842, -15336,   4196,   6814,  -3596,  12199,   2583,   -652,
+      1936, -17637,      0,      0,   9243,  -4810, -15144,  -1958,
+      1315,  10175,  17406,   4142,  -1348,    263,  -3292, -15632,
+    -17046,  -6363,   3374,    605,   -227,   -748,   5997,  20334,
+     14481,   8277,  -6146,  -1717,      5,     27,    712,   1542,
+     -9197, -23572, -10163,  -9595,   9425,   3539,    -17,    -72,
+         0,      0,   9243,  -7366,   8261,  11568, -11901,  -8499,
+     -2079,  13347,   5556, -12049, -16247,  -2282, -10529,   3584,
+      7585,  -1577,  -8464, -18652,  -8902,   5913,  -8688,  -9287,
+      4156,  -2442,  -7089,  -2993, -14485, -13949,   5422,   8459,
+      1638, -13285,  -2531,  -1826, -12132,  -9456,      0,      0,
+      9243,  11716,    698, -10889, -17818,   1143, -10275,  -1062,
+     -1305,  12057,  -2057, -10855,  -1595,  10088,   -150,  15043,
+      2978,   1578,  15225,  -2090,  10201,   1943,   1115,   1969,
+    -20211, -17636,    430,  -9826,   3391,  10572,   2485,  -9826,
+       248, -12259,  -2924,  12131,      0,      0,   9243,   4361,
+     -4594, -14703,  -8956,  -2798,  -7781,   9434,  13769,  12936,
+      6800,  -2400,   9082,   8091, -10453, -11023, -15786, -11136,
+      3285,   4153,   2658, -14002,  -5051,   9489,   7000,  17206,
+     15024,  -2777,  -8491,    -42, -10626,    141,  13053,   2366,
+     -6662,  -2231,      0,      0,   9243,   -752, -11933, -10646,
+      1119,   1254,   6890,  17745,   7875,  -1203,  -2207,  -1251,
+      2024, -17706, -15532,  -5600,   1128,   2691,   2800,    683,
+     -9927,   9661,  19706,  12522,   3889,   -978,  -2789,  -3992,
+     -2440,    206,  12695,   2921, -17173, -18575,  -9616,  -2657,
+         0,      0,   9243,   4791, -15001,  -2887,  -1931, -10037,
+     16885,   6048,  -1020,     46,   4789,  15191, -15922,  -9154,
+      2530,    823,    252,   -130,  -8608, -19335,  12613,  11651,
+     -4549,  -2314,   -172,   -101,   -784,    265,  12975,  21741,
+     -7551, -13101,   6856,   4710,    535,    -46,      0,      0,
+      9243, -12153, -10395,    754,  -1281,  17644,   2735,  -1095,
+    -10274,   8359,   2200, -12593,   7083,    782,  17650,  -1573,
+      1685, -16282,  -2164,   -530, -11878,     32, -17359,   3065,
+      6651,  -5212,  -3628,  19365,    965,  13180,   8243,   -818,
+      7746,  -3645, -14323,   1670,      0,      0,   9243,  -6961,
+    -11198,   9081,  -8829,  10887,   4833, -14202,   2374,  -6524,
+     16339,  -9417,   4737,  12284,  -4394,  -2691,  -2683,  13690,
+    -18539,   2830, -11438,  -3692,   4985,   5648,  -4628,    514,
+      6225, -18409,  12672,   5311,  11170,  -6928,  -3407,  -7595,
+     10737,  -3977,      0,      0,   9243,  12099, -10405,   1294,
+      2187, -17582,   2760,  -1880, -10105,  -8058,  -3760,  12583,
+      7058,   1346,  17376,  -2667,  -2829,  15710,   3705,    468,
+    -11880,     50, -17123,   5201,   6230,   4698,   6098, -18716,
+     -1665, -13088,   8285,  -1400,   7696,  -6196, -13429,   2770,
+         0,      0,   9243,   8602,  13392,   1722,   2070,  16090,
+     11359,   3222,  -4960,  -2638,   4581,  20106,   5099,   4026,
+    -10978,  -1778,  -1314,  -6620,   6988,  18701,  -2965,   3745,
+    -16745,  -4461,   1300,    584,  -3646, -11588,   8350,  11847,
+    -10050,   2372, -20010,  -7809,   3608,    887,      0,      0,
+      9243,  14252,  -1958,   7026,  13986,  -3899,  -9870,  -1922,
+    -10736,  -3693,  -4527, -12333,   4376,  -6080,   3475, -18537,
+    -19222,   1355, -10843,   6913,   8869,   3408,   8323,   6804,
+     -5141, -13648,   7800,   2649,   7171,  10505,  -6548,   5179,
+     -5505,  13299,   2086,  15579,      0,      0,   9243,  11323,
+      9021,  -6835, -10810,  14267,   -489,  -8613,  -5689,    639,
+    -16117,   6224,  -9731,  -3757,  -8482,  10882,   7873,   1080,
+    -11447,  -6791, -10388,   4099,  -6025,  18396,  -5407,  -7536,
+     14714,    984,   1267, -13940,  -1889,   8416,    666,  16762,
+    -10106,  -3418,      0,      0,   9243,    871,   4833,  15238,
+      1855,    588,  -7508,  10287,  16162,   2857,   1481,   -443,
+     -9392,  -7758,  12910,  16506,   3837,   2588,   -581,   -851,
+      1928, -14879,  -5066,  14950,  16498,   4773,   3842,   -425,
+     -1785,    -82,  10578,  -1435, -15554,  -2459,  16520,  16250,
+         0,      0,   9243,  14762,   5967,   1673,   3450,  12303,
+     -6027,   1394, -15022, -14571,   3402,  -4217, -10507,   -478,
+    -14813,  -5131,  -6634, -16293,    -82, -15276,  -1705,  -1731,
+       358,  -5738,  13681,  12503,  -8200,  -3023,  -3290,  -7384,
+      9272,   -837,  14328,  -1064,  16913,   7915,      0,      0,
+         0,      0,      0,      0,      0,      0,      0,      0,
+         0,      0,      0,      0,      0,      0,      0,      0,
+         0,      0,      0,      0,      0,      0,      0,      0,
+         0,      0,      0,      0,      0,      0,      0,      0,
+         0,      0,      0,      0,  32767,      0,      0,      0,
+         0,      0,      0,      0,      0,      0,      0,      0,
+         0,      0,      0,      0,      0,      0,      0,      0,
+         0,      0,      0,      0,      0,      0,      0,      0,
+         0,      0,      0,      0,      0,      0,      0,      0,
+         0,      0,      0,  32767
+};
+
 const MappingMatrix mapping_matrix_foa_demixing = { 6, 6, 0 };
 const opus_int16 mapping_matrix_foa_demixing_data[36] = {
      16384,  16384,  16384,  16384,      0,      0,      0,  23170,
@@ -376,3 +657,283 @@ const opus_int16 mapping_matrix_toa_demixing_data[324] = {
          0,      0,      0,  32767
 };
 
+const MappingMatrix mapping_matrix_fourthoa_demixing = { 27, 27, 0 }; /* 27 x 27 = 729, the size of mapping_matrix_fourthoa_demixing_data below; the fields are presumably {rows, cols, gain} -- confirm in mapping_matrix.h. */
+const opus_int16 mapping_matrix_fourthoa_demixing_data[729] = {
+      4870,   4484,   4870,   4347,   4440,   4726,   4683,   4821,
+      4883,   4842,   4603,   4484,   4683,   4698,   4234,   4368,
+      4603,   4783,   4783,   4820,   4821,   4347,   4820,   4440,
+      4698,      0,      0,    101,     84,  -7818,   4640,  -7178,
+     -5492,   4629,   8384,   6547,  -4966,    617,  -6345,   1061,
+     -3241,   2939,   5549,   6390,  -4434,   4994,  -2610,   1993,
+     -2873,   1547,  -4356,   -164,      0,      0,   8797,   5074,
+     -1553,   5383,   1906,   5297,   2722,   1158,  -5226,   1311,
+     -7760,  -3327,  -1940,   1586,  -4093,  -2951,   -214,  -6873,
+      5450,  -4875,  -7193,  -4438,    558,   5593,   5607,      0,
+         0,    -26,   5761,  -3723,  -1460,   1195,  -3065,  -6357,
+     -1175,    608,   6965,   2310,   2759,  -8023,  -7138,   5162,
+     -3624,   5006,   -809,   3592,   6209,  -4159,  -4968,   8150,
+      2513,  -5702,      0,      0,    301,    109,   7161,  -2462,
+     -2443,   5044,  -7125,  -2256,   1967,  -9107,    259,  -4928,
+     -2592,   6514,   4111,  -7236,   8695,    635,   5009,  -4025,
+     -1937,   4794,   3420,  -3507,   -400,      0,      0,   -134,
+        85,   2771,   7842,  -3649,  -8225,   2866,   2586,  -9200,
+     -1945,  -1563,   6155,   -720,  -1061,  -3494,  -4513,   -487,
+      8389,   7317,   3348,  -3721,   3806,    371,  -6896,     70,
+         0,      0,  10919,   2072,  -4867,   3472,  -4429,   1721,
+     -4066,  -5193,   1032,  -5253,   9501,  -2017,  -3971,  -5261,
+      -306,  -2737,  -5137,   5713,   1237,     -8,   6387,    364,
+     -5423,   3364,   2888,      0,      0,    -48,   8946,   1048,
+     -2691,    602,  -4332,  -4302,   -514,  -1730,   2459,  -4328,
+     -2156,   3335,  -2748,  -6029,   4023,    155,    897,   5268,
+     -8380,   7625,   7395,    508,   3945,  -8951,      0,      0,
+        39,   4151,  -5965,  -3398,  -7006,  -3534,   2697,  -8989,
+     -5237,   2913,     46,  -5540,   8196,   5766,   2711,  -2520,
+     -3043,  -2146,   -948,   4965,   1806,   2472,   8988,  -1266,
+      4840,      0,      0,   -407,   -189,   2179,  -1627,   6516,
+       259,   7196,  -9449,  -4905,  -9766,    561,   4021,   3371,
+     -8650,   5032,   3329,   2534,    641,   2224,  -5747,   1047,
+     -4074,   5252,    -24,    674,      0,      0,    664,    237,
+     -2837,  -4072,  -1205,   8252,  -5875,  -1670,  -2743,  -3984,
+       381,   5059,   1765,   2666,  -8295,   7403,   1154,  -2086,
+      7622,   7105,   3677,  -6943,   1050,  -6632,   -694,      0,
+         0,    382,   -133,   5699,   7650,   5154,  -5713,  -1645,
+     -6902,   6181,   4450,   1151,    410,   -993,   3829,   2444,
+     -2405,  -6618,  -9514,   5366,  -1896,   5844,  -2886,  -1524,
+     -7321,  -1007,      0,      0,  12767,  -2530,   3183,  -1409,
+     -4015,  -2894,  -5155,  -1710,   3841,  -2107, -10274,   5119,
+      3979,  -4010,   5550,   4822,   -746,  -2507,  -3080,   4289,
+     -3675,   4333,  -1416,  -1230,  -1122,      0,      0,     17,
+      8048,   2398,  -2167,    -73,  -3606,   3125,    398,    731,
+     -5973,   5705,  -1032,   4679,   7305,   3134,   1301,  -3858,
+       -89,   2938,   4359,  -9155,  -4805,  -8407,   3673,  -8645,
+         0,      0,    187,   7355,   3145,  -6719,  -4432,  -5939,
+      2541,  -2810,   9723,    778,  -1105,   5687,  -4174,   2534,
+     -4461,   1017,   -244,   5481,  -1655,  -6765,  -3350,  -4894,
+      1592,  -2318,   8827,      0,      0,    196,   3588,   9631,
+      3063,  -4564,   6043,   2683,   2595,  -2488,  -2186,    173,
+     -6059,  -8270,  -2386,    409,   7441,  -8608,    376,  -4364,
+      2321,   -280,     97,   8331,  -3022,  -4721,      0,      0,
+       117,   -748, -10833,   1533,   4200,  -2875,   -997,   -109,
+     -3661,  -6119,   4454,   8808,  -9189,   8294,   1521,   7265,
+     -2348,  -5094,   -948,  -5400,  -3193,   8914,   5763,   1716,
+     -1070,      0,      0,   2497,    399,  -5201,  -2038,   7843,
+      -376,   7567,  -5073,   7616,  -5537,   2086,  -3453,  -5544,
+       -56, -11648,  -1314,   3546,  -3432,   -117,   8694,  -4245,
+      9621,   3098,  -2582,  -2351,      0,      0,   4386,  -3104,
+     -3132, -10512,    566,   5217,   5128,   4967,   1348,   7035,
+     -1470,     91,   -125,  -3548,   8244,  -3029, -10033,   2186,
+      9745,  -6440,  -2074,   3638,  -1477,  -7045,   -562,      0,
+         0,   2154,   8116,  -6102,   6570,  12998,   -712,  -4126,
+     -4996,     30,   1571,  -6393, -12794,    425,   5036,   1190,
+      5763,   5653,  12933,  -6671,   5197,  -2964,  -3316,  -6354,
+    -10554,  -2652,      0,      0,  12618,  -3737,     93,  -5901,
+      4262,  -3364,   4444,   3103,  -2767,   3403,   4925,  -2584,
+      -989,   4977,  -3714,  -1965,   3076,    326,  -2946,  -2568,
+      1026,  -2980,   3362,  -6132,  -5966,      0,      0,   6001,
+        48,  -1979,  -7275,   3476,  -2096,  10591,   3793,   2629,
+      -447, -14747,  -3689,  -5525,   8358,   6883,  -9703,  -4556,
+      7471,   2965,   4056,  13221,  -7327,  -3073,  -2353,  -6720,
+         0,      0,    621,  11034,    -44,  -2828,   5978,  -1850,
+     -1772,   3894,  -7471,  -1397,    945,  -2028,  -2928,  -2240,
+      3172,   2222,   4544,  -4243,  -5645,   3745,   2573,   3511,
+     -8206,  -7286,   5700,      0,      0,    321,  10818,  -4982,
+      7813,   -749,   9907,   1360,  -1443,    568,  -1103,   2305,
+      6045,   2270,  -1063,  -1920,  -3073,   5893,  -3476, -11346,
+     -1657,   -588,   2957,  -2287,  -8527,  -8041,      0,      0,
+       119,   -268,   2372,  -3040,   4979,  -3789,  -5630,  10619,
+      5900,  -5109,  -4585,  -3862,  10467,  -3527,   -385, -10034,
+     -9991,   4860,    984,   2362,   2311,  -6804,   6324,    433,
+      5291,      0,      0,      0,      0,      0,      0,      0,
+         0,      0,      0,      0,      0,      0,      0,      0,
+         0,      0,      0,      0,      0,      0,      0,      0,
+         0,      0,      0,      0,  32767,      0,      0,      0,
+         0,      0,      0,      0,      0,      0,      0,      0,
+         0,      0,      0,      0,      0,      0,      0,      0,
+         0,      0,      0,      0,      0,      0,      0,      0,
+     32767
+};
+
+const MappingMatrix mapping_matrix_fifthoa_demixing = { 38, 38, 0 }; /* 38 x 38 = 1444, the size of mapping_matrix_fifthoa_demixing_data below; the fields are presumably {rows, cols, gain} -- confirm in mapping_matrix.h. */
+const opus_int16 mapping_matrix_fifthoa_demixing_data[1444] = {
+      3188,   3247,   3268,   3368,   3368,   3138,   3268,   3099,
+      3211,   3368,   3099,   3247,   3211,   3368,   3368,   3368,
+      3149,   3268,   3247,   3211,   3099,   3188,   3138,   3149,
+      3099,   3188,   3368,   3149,   3188,   3247,   3268,   3138,
+      3211,   3368,   3138,   3149,      0,      0,    118,    -47,
+     -5011,    282,    333,  -1497,  -4584,   2908,   3388,  -3647,
+     -2493,   1139,  -2882,  -1719,   3604,  -2543,  -4328,   5443,
+      1286,  -5498,  -4583,   2510,  -1743,  -2556,   4168,   1446,
+      -290,   1812,  -4074,  -2377,   4152,   2847,   4991,   3980,
+       393,   5072,      0,      0,   5489,  -2235,   1507,  -5326,
+      4609,  -1096,   2926,  -3427,  -3301,  -3078,   4226,   1730,
+      4627,   2561,   2966,   -592,    143,   -677,   4617,   -755,
+      -956,   -433,  -5138,   3037,    157,  -1394,  -4498,  -4984,
+     -3661,  -4112,  -3756,   4628,   -570,   3356,   1605,   1803,
+         0,      0,   -162,   5162,   2132,   2392,   3556,  -5141,
+     -1536,   2975,  -3001,  -3350,  -2231,  -5230,   1294,  -4965,
+      3494,   5230,  -3292,  -1359,  -2945,   -773,   2670,   4867,
+      -660,   3720,  -3415,  -5112,  -3700,  -1211,    407,   3013,
+       763,    591,   2481,  -2657,   5210,    784,      0,      0,
+      -156,    338,  -4246,    510,    462,   3296,   2846,   3333,
+     -4292,   4574,   1940,  -2986,  -1275,   3701,   5022,  -5250,
+      5780,  -2676,  -1180,   1516,  -4852,   4877,    342,  -3923,
+     -5703,  -2920,    379,   -657,   -361,  -3346,   1044,    795,
+      5257,  -4004,    698,   1115,      0,      0,     47,   -140,
+     -3292,  -1097,    652,    855,  -5260,  -3691,  -4470,   4521,
+     -3863,   1093,  -5552,  -2016,   3831,    334,   -456,  -1532,
+      2068,   1788,   2054,   -295,   3668,  -2820,    328,   -994,
+       295,  -3301,   5770,   4282,  -6353,   5632,  -1371,   5005,
+       238,   4041,      0,      0,   6764,  -1659,  -2730,   5726,
+      3715,  -3216,   -933,    531,    -52,   -345,   3022,  -2818,
+      4005,  -1617,  -1189,  -3748,  -3403,  -3592,   4040,  -3553,
+     -2806,  -3444,   6023,   -711,  -3298,  -2503,   2548,   5564,
+       940,   1848,   1207,   4010,  -3488,   -358,  -2511,  -1966,
+         0,      0,    -64,  -5039,   1403,  -4455,   6240,   2189,
+     -1716,  -4348,   4183,   3951,  -4042,  -3606,   2399,  -4563,
+      4050,   -612,   -395,    348,  -5791,    391,  -1440,   -735,
+      1398,   4359,   -518,   2969,   6556,   1951,   -518,  -4993,
+      -925,    998,   -569,  -2934,   3460,    420,      0,      0,
+        16,   5482,  -4122,    770,   2082,   5020,  -3961,    485,
+      -584,   -793,      3,   5222,  -1416,   3673,     78,   3549,
+      -937,  -5723,   1673,  -6162,  -2540,   3082,   -355,   1838,
+      -615,   4601,   2832,   -359,  -3346,    668,  -3393,  -1583,
+     -3774,  -2206,   5754,  -4961,      0,      0,   -328,    299,
+      2470,    317,    525,  -4494,   2805,   2617,   2383,  -2363,
+     -1037,   4085,    895,  -4622,   3218,  -6607,  -3381,  -5933,
+      1397,   6394,   -446,   5694,     14,  -4510,   4329,   3690,
+      -334,      0,   2932,  -2478,  -2944,   -577,   -599,   -230,
+      1553,  -4736,      0,      0,   -324,    142,  -3252,   -867,
+      1111,  -1882,   3378,  -6055,   6502,  -6840,   4280,  -2694,
+     -2876,   4190,   6454,    655,   1061,    626,  -2669,   -798,
+      3192,   -985,   -898,  -5482,   -548,   2315,   -558,   1302,
+       900,   5747,  -1325,   1599,  -1384,  -5749,    624,   1110,
+         0,      0,    321,    312,   2188,   1322,    237,    708,
+      -304,   2463,   1500,  -1094,  -5112,  -1010,  -6799,    646,
+       992,   1969,   3423,  -3996,   2628,   4451,   3432,  -2833,
+     -6101,   -330,  -3768,     -3,   -707,   5961,  -4037,  -3736,
+      4080,   7254,  -4113,   2151,     54,  -2150,      0,      0,
+      7735,   4064,  -3884,  -5240,    577,   2229,  -3947,   2914,
+      3555,   4011,    774,  -3519,   1985,  -3701,  -3824,    330,
+      -905,   2085,   1155,   2176,   3006,    340,  -5533,  -3264,
+      -902,   3114,    344,  -5060,   1524,   1805,   1926,   2350,
+      1905,  -3203,  -2762,  -4162,      0,      0,    193,   -151,
+     -1434,   6289,   7354,   4234,    169,   2868,  -1977,  -1375,
+     -4987,   2345,   2742,    599,    939,  -4837,   2688,    991,
+     -6907,    716,  -1542,  -4346,  -1833,   1493,   3134,   2903,
+     -7019,  -2835,     93,   4395,    621,    870,  -2357,   -975,
+     -2933,   -127,      0,      0,   -616,  -5968,  -3479,  -1651,
+      4932,  -2445,  -5512,  -1451,    691,    739,    479,   4227,
+     -2886,   3853,      8,   -501,    188,   1990,   3842,   2270,
+      1662,   -174,   1290,   2456,     67,  -3267,  -5535,    483,
+      5721,  -1642,   6501,  -3432,   1184,  -3246,   4101,  -4880,
+         0,      0,   -465,   5264,  -4812,    682,   1683,  -4539,
+      2916,  -1985,   2899,   3324,   1060,  -4398,   -745,  -2137,
+     -3827,   1044,   6225,   3609,   -532,   1980,  -6001,    564,
+      -209,  -1299,   5336,  -3605,  -1484,     37,     19,  -1295,
+      -665,   -385,  -6773,   3651,   6153,  -1291,      0,      0,
+       193,   -415,   5166,   -110,    626,   6743,  -2860,   1425,
+      1101,  -1341,     80,  -4533,    249,   4231,   -119,  -6009,
+     -2970,   5170,   -822,  -2610,   4527,   5948,    182,  -2589,
+       837,  -5471,    371,    -43,    373,   -665,  -1233,   -626,
+     -7353,   2606,   1339,  -1398,      0,      0,   -533,    147,
+      2075,   -672,   1043,   3503,   4402,  -4971,  -3287,   3731,
+     -2606,   3817,   1972,  -5603,   5114,   1185,  -1318,   1906,
+      3018,  -1999,    343,  -1943,    207,  -6744,    913,  -4060,
+       645,   -349,  -5667,   4766,   5575,  -1733,   1116,    160,
+      1534,  -5690,      0,      0,   -137,    -36,   1556,   1325,
+      1553,  -2230,   1188,   5296,  -5104,   4673,   6295,    498,
+     -4723,    933,   2994,   4067,  -4700,   1758,  -4116,  -1252,
+      2444,  -4092,   1653,  -2802,   5069,   1133,    790,  -2355,
+      -934,  -6304,   1642,   2045,  -4259,  -3873,   -213,    215,
+         0,      0,   -364,    423,   4888,  -1316,    118,   -950,
+      4027,    114,   2961,  -3136,  -3012,   -883,  -6192,   1340,
+     -3210,  -1193,   1376,   3128,   1596,  -2994,  -3194,    533,
+      8502,   2487,  -1485,   1032,    301,  -8007,   -577,    887,
+       297,   7778,   3121,  -1901,    -94,  -6401,      0,      0,
+      9260,  -1845,    668,   2787,  -2255,   2699,  -2512,  -3737,
+     -3675,  -3601,  -1803,    210,  -1701,  -1442,  -2700,   3457,
+      2868,   2079,  -2113,   3178,   1277,   3578,   5240,  -2482,
+      3324,   1020,  -4027,   3835,  -3758,  -3633,  -3170,  -1310,
+      2509,  -3110,    713,    174,      0,      0,   -399,   4969,
+     -2321,  -7744,   6494,  -3776,   1478,    758,  -1794,  -2233,
+     -4059,   4932,   2770,   4761,  -3475,   1243,    829,   -651,
+     -5358,   -436,   2381,   1360,   2561,  -3118,    858,  -4366,
+      3933,   3646,    -43,  -1310,    -16,    924,   1197,   1415,
+     -5036,   -376,      0,      0,    100,   1410,   1290,   3199,
+      7091,  -3638,  -2641,   1118,     45,   -441,    794,   -974,
+     -5033,    889,    438,  -3102,    895,   3555,   4672,   4795,
+      1129,  -2408,  -2153,   1742,    159,  -2040,   7578,  -2006,
+     -5737,   1986,  -5568,  -6413,   2428,  -1387,  -2441,    667,
+         0,      0,    -37,  -6031,  -4434,   -904,   3290,   1806,
+      4736,   2516,  -5905,  -5927,   1754,  -4300,  -2468,  -2203,
+     -4836,   -672,   1444,  -1591,  -1631,  -1789,   4311,   -153,
+      -688,  -1222,   1058,   3139,   4659,   -353,   1543,   1838,
+      2180,  -1448,   2432,   6277,   5304,  -1692,      0,      0,
+      -280,   4506,    807,   -477,    823,   3550,   1427,  -1856,
+     -3003,  -3501,  -1203,   2679,    933,    778,  -4954,  -1977,
+     -7458,   4687,    435,   7045,  -4053,  -3130,    257,  -3917,
+     -6165,   1889,    927,    235,   1889,  -1097,   1985,    630,
+     -2172,  -2130,   7080,   4810,      0,      0,   -300,    496,
+      2808,    279,    667,  -7179,  -2661,   -526,  -2832,   1751,
+      2849,   4829,   -906,  -4151,  -1124,  -3062,   8166,   5361,
+     -1656,  -6017,   3265,   2551,   -864,   -432,  -6966,   6295,
+      -168,    901,    442,   -582,    269,    236,  -3574,    799,
+       472,    565,      0,      0,    805,  -2466,   6208,  -4592,
+      -170,  -6701,  -5610,   3678,  -4242,   4561,   -724,  -5534,
+      2415,   7354,   2761,   2699,   -349,   3822,  -2372,   1756,
+     -5523,  -3445,   -588,  -5749,  -3986,   9804,  -3871,   5375,
+     -2308,   5504,  -2766,  -1651,   1472,   6832,   2705,  -5104,
+         0,      0,   -700,  -1179,   4402,    400,   1383,    939,
+     -1342,   6013,   2577,  -3472,    472,   2883,   1450,  -3917,
+      2849,   5084,   4990,   5392,    342,  -4925,  -3329,  -5372,
+     -2674,  -6035,  -5072,   -836,    179,   2506,   7987,  -3647,
+     -8202,  -1437,   1891,   2400,   1607,  -3611,      0,      0,
+     -4706,  -4003,   9928,   -379,   5557,   3738,  -8789,    685,
+      1937,  -5157,  13388,   7995,  -4119,  -9909,  -5079,   4804,
+      5586,    774,  -5430,    299,  -9943,   3264,  -3690,  -3901,
+     -1133,  -6199,   3182,   1544,   5467,   3686,  -2639,   4068,
+      1163,   -185,  -1299,   -506,      0,      0,    843,   1005,
+     -1059,    467,  -1279,  -2259,   6057,  -1694,  -5885,   5342,
+     -5160,  -3748,  -1382,   4420,   -697,  -2000,  -3808,   3100,
+      2685,  -4073,    531,    318,  -7822,   2414,   2901,   3399,
+     -1340,   8449,   3685,    463,  -3341,   2423,   2304,  -2723,
+        84,  -2622,      0,      0,  12088,   -265,   2562,   -435,
+     -4348,  -2426,   3538,   1552,   1279,    883,  -4166,   2634,
+     -6130,   2994,   3729,  -1570,   -601,  -1753,  -5124,  -2788,
+     -2096,  -1920,  -2649,   2793,  -1079,  -1952,   2983,  -1530,
+      2499,   1769,   1492,  -6757,  -2108,   2841,   1466,   2597,
+         0,      0,  -3830,  -4093,   2448,  12720,   7737,   -665,
+      -832,  -9257,   2971,  -2400,    791,   1873,   1072,   -587,
+     -7440,   8055,   1531,  -4736,    616,  -1782,  -2982,   9663,
+     -5057,  -5926,   1610,  -4489,   7033,  -8658,   6010,  -5673,
+      5648,    812,   -271,  -1802,  -4500,   4392,      0,      0,
+      -888,   -327,   3373,  -1084,   7959,   2430,   1898,  -2360,
+     -1820,  -1377,  -1090,  -4436,  -3422,  -1106,  -3230,   3876,
+       -41,  -5128,   6375,  -1848,  -3824,   5844,    617,  -1957,
+      4232,   1345,  -1439,    -83,   3046,   -214,   5458,  -5566,
+     -4387,  -3738,  -5740,   8657,      0,      0,   6978,   6239,
+     -3686,   -981,  -2854,     78,   5859,   -357,   4618,   7391,
+      -138,    971,  -5799,   2135,   4478,  -7004,  -5949,   1668,
+     -6933,  -1163,   7010,  -5624,   2990,   6192,  -8075,   3567,
+     -8308,   2236,  -5098,  -2120,  -4355,  -4238,   4955,  10230,
+       692,  -5606,      0,      0,  -1348,  -7069,    -12,  -4927,
+      1211,    651,   1360,   7744,   3404,   5069,  -2438,   -105,
+      2332,   1494,  -4686,   1336,  -3628,   -881,   2474,   1736,
+       -26,   -257,   2135,  -4452,    446,   -641,  -4704,   2605,
+     -6436,   6662,  -4939,    990,  -1100,  -3782,   5028,   4753,
+         0,      0,  -2875,   6410,   3518,   3950,   1271,    869,
+     -2842,  -5837,   1532,  -2899,   1140,   -597,   1712,  -1988,
+     -4819,  -4783,   4773,  -8796,   2240,  -4596,   3565,  -4853,
+      -556,  -3974,   7366,  -4370,   3113,  -3548,   3552,  -5450,
+      3869,   2514,   6736,  -4570,   6074,   3151,      0,      0,
+         0,      0,      0,      0,      0,      0,      0,      0,
+         0,      0,      0,      0,      0,      0,      0,      0,
+         0,      0,      0,      0,      0,      0,      0,      0,
+         0,      0,      0,      0,      0,      0,      0,      0,
+         0,      0,      0,      0,  32767,      0,      0,      0,
+         0,      0,      0,      0,      0,      0,      0,      0,
+         0,      0,      0,      0,      0,      0,      0,      0,
+         0,      0,      0,      0,      0,      0,      0,      0,
+         0,      0,      0,      0,      0,      0,      0,      0,
+         0,      0,      0,  32767
+};
diff --git a/opus/src/mapping_matrix.h b/opus/src/mapping_matrix.h
index 98bc82df..53646cb1 100644
--- a/opus/src/mapping_matrix.h
+++ b/opus/src/mapping_matrix.h
@@ -117,6 +117,12 @@ extern const opus_int16 mapping_matrix_soa_mixing_data[121];
 extern const MappingMatrix mapping_matrix_toa_mixing;
 extern const opus_int16 mapping_matrix_toa_mixing_data[324];
 
+extern const MappingMatrix mapping_matrix_fourthoa_mixing;
+extern const opus_int16 mapping_matrix_fourthoa_mixing_data[729];
+
+extern const MappingMatrix mapping_matrix_fifthoa_mixing;
+extern const opus_int16 mapping_matrix_fifthoa_mixing_data[1444];
+
 extern const MappingMatrix mapping_matrix_foa_demixing;
 extern const opus_int16 mapping_matrix_foa_demixing_data[36];
 
@@ -126,6 +132,12 @@ extern const opus_int16 mapping_matrix_soa_demixing_data[121];
 extern const MappingMatrix mapping_matrix_toa_demixing;
 extern const opus_int16 mapping_matrix_toa_demixing_data[324];
 
+extern const MappingMatrix mapping_matrix_fourthoa_demixing;
+extern const opus_int16 mapping_matrix_fourthoa_demixing_data[729];
+
+extern const MappingMatrix mapping_matrix_fifthoa_demixing;
+extern const opus_int16 mapping_matrix_fifthoa_demixing_data[1444];
+
 #ifdef __cplusplus
 }
 #endif
diff --git a/opus/src/mlp.c b/opus/src/mlp.c
index 964c6a98..e658ccde 100644
--- a/opus/src/mlp.c
+++ b/opus/src/mlp.c
@@ -33,35 +33,23 @@
 #include "opus_types.h"
 #include "opus_defines.h"
 #include "arch.h"
-#include "tansig_table.h"
 #include "mlp.h"
 
+#define fmadd(a, b, c) ((a)*(b)+(c))
 static OPUS_INLINE float tansig_approx(float x)
 {
-    int i;
-    float y, dy;
-    float sign=1;
-    /* Tests are reversed to catch NaNs */
-    if (!(x<8))
-        return 1;
-    if (!(x>-8))
-        return -1;
-#ifndef FIXED_POINT
-    /* Another check in case of -ffast-math */
-    if (celt_isnan(x))
-       return 0;
-#endif
-    if (x<0)
-    {
-       x=-x;
-       sign=-1;
-    }
-    i = (int)floor(.5f+25*x);
-    x -= .04f*i;
-    y = tansig_table[i];
-    dy = 1-y*y;
-    y = y + x*dy*(1 - y*x);
-    return sign*y;
+    const float N0 = 952.52801514f;
+    const float N1 = 96.39235687f;
+    const float N2 = 0.60863042f;
+    const float D0 = 952.72399902f;
+    const float D1 = 413.36801147f;
+    const float D2 = 11.88600922f;
+    float X2, num, den;
+    X2 = x*x;
+    num = fmadd(fmadd(N2, X2, N1), X2, N0);
+    den = fmadd(fmadd(D2, X2, D1), X2, D0);
+    num = num*x/den;
+    return MAX32(-1.f, MIN32(1.f, num));
 }
 
 static OPUS_INLINE float sigmoid_approx(float x)
@@ -79,7 +67,7 @@ static void gemm_accum(float *out, const opus_int8 *weights, int rows, int cols,
    }
 }
 
-void compute_dense(const DenseLayer *layer, float *output, const float *input)
+void analysis_compute_dense(const AnalysisDenseLayer *layer, float *output, const float *input)
 {
    int i;
    int N, M;
@@ -101,7 +89,7 @@ void compute_dense(const DenseLayer *layer, float *output, const float *input)
    }
 }
 
-void compute_gru(const GRULayer *gru, float *state, const float *input)
+void analysis_compute_gru(const AnalysisGRULayer *gru, float *state, const float *input)
 {
    int i;
    int N, M;
diff --git a/opus/src/mlp.h b/opus/src/mlp.h
index d7670550..e6b1a8f7 100644
--- a/opus/src/mlp.h
+++ b/opus/src/mlp.h
@@ -24,8 +24,8 @@
    SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */
 
-#ifndef _MLP_H_
-#define _MLP_H_
+#ifndef MLP_H_
+#define MLP_H_
 
 #include "opus_types.h"
 
@@ -39,7 +39,7 @@ typedef struct {
   int nb_inputs;
   int nb_neurons;
   int sigmoid;
-} DenseLayer;
+} AnalysisDenseLayer;
 
 typedef struct {
   const opus_int8 *bias;
@@ -47,14 +47,14 @@ typedef struct {
   const opus_int8 *recurrent_weights;
   int nb_inputs;
   int nb_neurons;
-} GRULayer;
+} AnalysisGRULayer;
 
-extern const DenseLayer layer0;
-extern const GRULayer layer1;
-extern const DenseLayer layer2;
+extern const AnalysisDenseLayer layer0;
+extern const AnalysisGRULayer layer1;
+extern const AnalysisDenseLayer layer2;
 
-void compute_dense(const DenseLayer *layer, float *output, const float *input);
+void analysis_compute_dense(const AnalysisDenseLayer *layer, float *output, const float *input);
 
-void compute_gru(const GRULayer *gru, float *state, const float *input);
+void analysis_compute_gru(const AnalysisGRULayer *gru, float *state, const float *input);
 
-#endif /* _MLP_H_ */
+#endif /* MLP_H_ */
diff --git a/opus/src/mlp_data.c b/opus/src/mlp_data.c
index ae4178df..65f7448e 100644
--- a/opus/src/mlp_data.c
+++ b/opus/src/mlp_data.c
@@ -651,20 +651,20 @@ static const opus_int8 layer2_bias[2] = {
    14, 117
 };
 
-const DenseLayer layer0 = {
+const AnalysisDenseLayer layer0 = {
    layer0_bias,
    layer0_weights,
    25, 32, 0
 };
 
-const GRULayer layer1 = {
+const AnalysisGRULayer layer1 = {
    layer1_bias,
    layer1_weights,
    layer1_recur_weights,
    32, 24
 };
 
-const DenseLayer layer2 = {
+const AnalysisDenseLayer layer2 = {
    layer2_bias,
    layer2_weights,
    24, 2, 1
diff --git a/opus/src/opus.c b/opus/src/opus.c
index 538b5ea7..816a4dd5 100644
--- a/opus/src/opus.c
+++ b/opus/src/opus.c
@@ -194,7 +194,8 @@ int opus_packet_get_samples_per_frame(const unsigned char *data,
 int opus_packet_parse_impl(const unsigned char *data, opus_int32 len,
       int self_delimited, unsigned char *out_toc,
       const unsigned char *frames[48], opus_int16 size[48],
-      int *payload_offset, opus_int32 *packet_offset)
+      int *payload_offset, opus_int32 *packet_offset,
+      const unsigned char **padding, opus_int32 *padding_len)
 {
    int i, bytes;
    int count;
@@ -337,6 +338,11 @@ int opus_packet_parse_impl(const unsigned char *data, opus_int32 len,
       data += size[i];
    }
 
+   if (padding != NULL)
+   {
+      *padding = data;
+      *padding_len = pad;
+   }
    if (packet_offset)
       *packet_offset = pad+(opus_int32)(data-data0);
 
@@ -351,6 +357,6 @@ int opus_packet_parse(const unsigned char *data, opus_int32 len,
       opus_int16 size[48], int *payload_offset)
 {
    return opus_packet_parse_impl(data, len, 0, out_toc,
-                                 frames, size, payload_offset, NULL);
+                                 frames, size, payload_offset, NULL, NULL, NULL);
 }
 
diff --git a/opus/src/opus_decoder.c b/opus/src/opus_decoder.c
index 9113638a..b57c8094 100644
--- a/opus/src/opus_decoder.c
+++ b/opus/src/opus_decoder.c
@@ -52,6 +52,15 @@
 #include "mathops.h"
 #include "cpu_support.h"
 
+#ifdef ENABLE_DEEP_PLC
+#include "dred_rdovae_dec_data.h"
+#include "dred_rdovae_dec.h"
+#endif
+
+#ifdef ENABLE_OSCE
+#include "osce.h"
+#endif
+
 struct OpusDecoder {
    int          celt_dec_offset;
    int          silk_dec_offset;
@@ -59,7 +68,11 @@ struct OpusDecoder {
    opus_int32   Fs;          /** Sampling rate (at the API level) */
    silk_DecControlStruct DecControl;
    int          decode_gain;
+   int          complexity;
    int          arch;
+#ifdef ENABLE_DEEP_PLC
+    LPCNetPLCState lpcnet;
+#endif
 
    /* Everything beyond this point gets cleared on a reset */
 #define OPUS_DECODER_RESET_START stream_channels
@@ -135,6 +148,7 @@ int opus_decoder_init(OpusDecoder *st, opus_int32 Fs, int channels)
    silk_dec = (char*)st+st->silk_dec_offset;
    celt_dec = (CELTDecoder*)((char*)st+st->celt_dec_offset);
    st->stream_channels = st->channels = channels;
+   st->complexity = 0;
 
    st->Fs = Fs;
    st->DecControl.API_sampleRate = st->Fs;
@@ -152,6 +166,9 @@ int opus_decoder_init(OpusDecoder *st, opus_int32 Fs, int channels)
 
    st->prev_mode = 0;
    st->frame_size = Fs/400;
+#ifdef ENABLE_DEEP_PLC
+    lpcnet_plc_init( &st->lpcnet);
+#endif
    st->arch = opus_select_arch();
    return OPUS_OK;
 }
@@ -278,7 +295,8 @@ static int opus_decode_frame(OpusDecoder *st, const unsigned char *data,
       ec_dec_init(&dec,(unsigned char*)data,len);
    } else {
       audiosize = frame_size;
-      mode = st->prev_mode;
+      /* Run PLC using last used mode (CELT if we ended with CELT redundancy) */
+      mode = st->prev_redundancy ? MODE_CELT_ONLY : st->prev_mode;
       bandwidth = 0;
 
       if (mode == 0)
@@ -369,7 +387,7 @@ static int opus_decode_frame(OpusDecoder *st, const unsigned char *data,
          pcm_ptr = pcm_silk;
 
       if (st->prev_mode==MODE_CELT_ONLY)
-         silk_InitDecoder( silk_dec );
+         silk_ResetDecoder( silk_dec );
 
       /* The SILK PLC cannot produce frames of less than 10 ms */
       st->DecControl.payloadSize_ms = IMAX(10, 1000 * audiosize / st->Fs);
@@ -393,14 +411,28 @@ static int opus_decode_frame(OpusDecoder *st, const unsigned char *data,
            st->DecControl.internalSampleRate = 16000;
         }
      }
+     st->DecControl.enable_deep_plc = st->complexity >= 5;
+#ifdef ENABLE_OSCE
+     st->DecControl.osce_method = OSCE_METHOD_NONE;
+#ifndef DISABLE_LACE
+     if (st->complexity >= 6) {st->DecControl.osce_method = OSCE_METHOD_LACE;}
+#endif
+#ifndef DISABLE_NOLACE
+     if (st->complexity >= 7) {st->DecControl.osce_method = OSCE_METHOD_NOLACE;}
+#endif
+#endif
 
-     lost_flag = data == NULL ? 1 : 2 * decode_fec;
+     lost_flag = data == NULL ? 1 : 2 * !!decode_fec;
      decoded_samples = 0;
      do {
         /* Call SILK decoder */
         int first_frame = decoded_samples == 0;
         silk_ret = silk_Decode( silk_dec, &st->DecControl,
-                                lost_flag, first_frame, &dec, pcm_ptr, &silk_frame_size, st->arch );
+                                lost_flag, first_frame, &dec, pcm_ptr, &silk_frame_size,
+#ifdef ENABLE_DEEP_PLC
+                                &st->lpcnet,
+#endif
+                                st->arch );
         if( silk_ret ) {
            if (lost_flag) {
               /* PLC failure should not be fatal */
@@ -419,7 +451,7 @@ static int opus_decode_frame(OpusDecoder *st, const unsigned char *data,
 
    start_band = 0;
    if (!decode_fec && mode != MODE_CELT_ONLY && data != NULL
-    && ec_tell(&dec)+17+20*(st->mode == MODE_HYBRID) <= 8*len)
+    && ec_tell(&dec)+17+20*(mode == MODE_HYBRID) <= 8*len)
    {
       /* Check if we have a redundant 0-8 kHz band */
       if (mode == MODE_HYBRID)
@@ -499,6 +531,11 @@ static int opus_decode_frame(OpusDecoder *st, const unsigned char *data,
    /* 5 ms redundant frame for CELT->SILK*/
    if (redundancy && celt_to_silk)
    {
+      /* If the previous frame did not use CELT (the first redundancy frame in
+         a transition from SILK may have been lost) then the CELT decoder is
+         stale at this point and the redundancy audio is not useful, however
+         the final range is still needed (for testing), so the redundancy is
+         always decoded but the decoded audio may not be used */
       MUST_SUCCEED(celt_decoder_ctl(celt_dec, CELT_SET_START_BAND(0)));
       celt_decode_with_ec(celt_dec, data+len, redundancy_bytes,
                           redundant_audio, F5, NULL, 0);
@@ -515,8 +552,12 @@ static int opus_decode_frame(OpusDecoder *st, const unsigned char *data,
       if (mode != st->prev_mode && st->prev_mode > 0 && !st->prev_redundancy)
          MUST_SUCCEED(celt_decoder_ctl(celt_dec, OPUS_RESET_STATE));
       /* Decode CELT */
-      celt_ret = celt_decode_with_ec(celt_dec, decode_fec ? NULL : data,
-                                     len, pcm, celt_frame_size, &dec, celt_accum);
+      celt_ret = celt_decode_with_ec_dred(celt_dec, decode_fec ? NULL : data,
+                                     len, pcm, celt_frame_size, &dec, celt_accum
+#ifdef ENABLE_DEEP_PLC
+                                     , &st->lpcnet
+#endif
+                                     );
    } else {
       unsigned char silence[2] = {0xFF, 0xFF};
       if (!celt_accum)
@@ -561,7 +602,10 @@ static int opus_decode_frame(OpusDecoder *st, const unsigned char *data,
       smooth_fade(pcm+st->channels*(frame_size-F2_5), redundant_audio+st->channels*F2_5,
                   pcm+st->channels*(frame_size-F2_5), F2_5, st->channels, window, st->Fs);
    }
-   if (redundancy && celt_to_silk)
+   /* 5ms redundant frame for CELT->SILK; ignore if the previous frame did not
+      use CELT (the first redundancy frame in a transition from SILK may have
+      been lost) */
+   if (redundancy && celt_to_silk && (st->prev_mode != MODE_SILK_ONLY || st->prev_redundancy))
    {
       for (c=0;c<st->channels;c++)
       {
@@ -625,7 +669,7 @@ static int opus_decode_frame(OpusDecoder *st, const unsigned char *data,
 
 int opus_decode_native(OpusDecoder *st, const unsigned char *data,
       opus_int32 len, opus_val16 *pcm, int frame_size, int decode_fec,
-      int self_delimited, opus_int32 *packet_offset, int soft_clip)
+      int self_delimited, opus_int32 *packet_offset, int soft_clip, const OpusDRED *dred, opus_int32 dred_offset)
 {
    int i, nb_samples;
    int count, offset;
@@ -639,6 +683,35 @@ int opus_decode_native(OpusDecoder *st, const unsigned char *data,
    /* For FEC/PLC, frame_size has to be to have a multiple of 2.5 ms */
    if ((decode_fec || len==0 || data==NULL) && frame_size%(st->Fs/400)!=0)
       return OPUS_BAD_ARG;
+#ifdef ENABLE_DRED
+   if (dred != NULL && dred->process_stage == 2) {
+      int F10;
+      int features_per_frame;
+      int needed_feature_frames;
+      int init_frames;
+      lpcnet_plc_fec_clear(&st->lpcnet);
+      F10 = st->Fs/100;
+      /* if blend==0, the last PLC call was "update" and we need to feed two extra 10-ms frames. */
+      init_frames = (st->lpcnet.blend == 0) ? 2 : 0;
+      features_per_frame = IMAX(1, frame_size/F10);
+      needed_feature_frames = init_frames + features_per_frame;
+      lpcnet_plc_fec_clear(&st->lpcnet);
+      for (i=0;i<needed_feature_frames;i++) {
+         int feature_offset;
+         /* We floor instead of rounding because 5-ms overlap compensates for the missing 0.5 rounding offset. */
+         feature_offset = init_frames - i - 2 + (int)floor(((float)dred_offset + dred->dred_offset*F10/4)/F10);
+         if (feature_offset <= 4*dred->nb_latents-1 && feature_offset >= 0) {
+           lpcnet_plc_fec_add(&st->lpcnet, dred->fec_features+feature_offset*DRED_NUM_FEATURES);
+         } else {
+           if (feature_offset >= 0) lpcnet_plc_fec_add(&st->lpcnet, NULL);
+         }
+
+      }
+   }
+#else
+   (void)dred;
+   (void)dred_offset;
+#endif
    if (len==0 || data==NULL)
    {
       int pcm_count=0;
@@ -663,7 +736,7 @@ int opus_decode_native(OpusDecoder *st, const unsigned char *data,
    packet_stream_channels = opus_packet_get_nb_channels(data);
 
    count = opus_packet_parse_impl(data, len, self_delimited, &toc, NULL,
-                                  size, &offset, packet_offset);
+                                  size, &offset, packet_offset, NULL, NULL);
    if (count<0)
       return count;
 
@@ -675,12 +748,12 @@ int opus_decode_native(OpusDecoder *st, const unsigned char *data,
       int ret;
       /* If no FEC can be present, run the PLC (recursive call) */
       if (frame_size < packet_frame_size || packet_mode == MODE_CELT_ONLY || st->mode == MODE_CELT_ONLY)
-         return opus_decode_native(st, NULL, 0, pcm, frame_size, 0, 0, NULL, soft_clip);
+         return opus_decode_native(st, NULL, 0, pcm, frame_size, 0, 0, NULL, soft_clip, NULL, 0);
       /* Otherwise, run the PLC on everything except the size for which we might have FEC */
       duration_copy = st->last_packet_duration;
       if (frame_size-packet_frame_size!=0)
       {
-         ret = opus_decode_native(st, NULL, 0, pcm, frame_size-packet_frame_size, 0, 0, NULL, soft_clip);
+         ret = opus_decode_native(st, NULL, 0, pcm, frame_size-packet_frame_size, 0, 0, NULL, soft_clip, NULL, 0);
          if (ret<0)
          {
             st->last_packet_duration = duration_copy;
@@ -744,7 +817,7 @@ int opus_decode(OpusDecoder *st, const unsigned char *data,
 {
    if(frame_size<=0)
       return OPUS_BAD_ARG;
-   return opus_decode_native(st, data, len, pcm, frame_size, decode_fec, 0, NULL, 0);
+   return opus_decode_native(st, data, len, pcm, frame_size, decode_fec, 0, NULL, 0, NULL, 0);
 }
 
 #ifndef DISABLE_FLOAT_API
@@ -772,7 +845,7 @@ int opus_decode_float(OpusDecoder *st, const unsigned char *data,
    celt_assert(st->channels == 1 || st->channels == 2);
    ALLOC(out, frame_size*st->channels, opus_int16);
 
-   ret = opus_decode_native(st, data, len, out, frame_size, decode_fec, 0, NULL, 0);
+   ret = opus_decode_native(st, data, len, out, frame_size, decode_fec, 0, NULL, 0, NULL, 0);
    if (ret > 0)
    {
       for (i=0;i<ret*st->channels;i++)
@@ -810,7 +883,7 @@ int opus_decode(OpusDecoder *st, const unsigned char *data,
    celt_assert(st->channels == 1 || st->channels == 2);
    ALLOC(out, frame_size*st->channels, float);
 
-   ret = opus_decode_native(st, data, len, out, frame_size, decode_fec, 0, NULL, 1);
+   ret = opus_decode_native(st, data, len, out, frame_size, decode_fec, 0, NULL, 1, NULL, 0);
    if (ret > 0)
    {
       for (i=0;i<ret*st->channels;i++)
@@ -825,7 +898,7 @@ int opus_decode_float(OpusDecoder *st, const unsigned char *data,
 {
    if(frame_size<=0)
       return OPUS_BAD_ARG;
-   return opus_decode_native(st, data, len, pcm, frame_size, decode_fec, 0, NULL, 0);
+   return opus_decode_native(st, data, len, pcm, frame_size, decode_fec, 0, NULL, 0, NULL, 0);
 }
 
 #endif
@@ -855,6 +928,27 @@ int opus_decoder_ctl(OpusDecoder *st, int request, ...)
       *value = st->bandwidth;
    }
    break;
+   case OPUS_SET_COMPLEXITY_REQUEST:
+   {
+       opus_int32 value = va_arg(ap, opus_int32);
+       if(value<0 || value>10)
+       {
+          goto bad_arg;
+       }
+       st->complexity = value;
+       celt_decoder_ctl(celt_dec, OPUS_SET_COMPLEXITY(value));
+   }
+   break;
+   case OPUS_GET_COMPLEXITY_REQUEST:
+   {
+       opus_int32 *value = va_arg(ap, opus_int32*);
+       if (!value)
+       {
+          goto bad_arg;
+       }
+       *value = st->complexity;
+   }
+   break;
    case OPUS_GET_FINAL_RANGE_REQUEST:
    {
       opus_uint32 *value = va_arg(ap, opus_uint32*);
@@ -872,9 +966,12 @@ int opus_decoder_ctl(OpusDecoder *st, int request, ...)
             ((char*)&st->OPUS_DECODER_RESET_START - (char*)st));
 
       celt_decoder_ctl(celt_dec, OPUS_RESET_STATE);
-      silk_InitDecoder( silk_dec );
+      silk_ResetDecoder( silk_dec );
       st->stream_channels = st->channels;
       st->frame_size = st->Fs/400;
+#ifdef ENABLE_DEEP_PLC
+      lpcnet_plc_reset( &st->lpcnet );
+#endif
    }
    break;
    case OPUS_GET_SAMPLE_RATE_REQUEST:
@@ -950,6 +1047,20 @@ int opus_decoder_ctl(OpusDecoder *st, int request, ...)
        ret = celt_decoder_ctl(celt_dec, OPUS_GET_PHASE_INVERSION_DISABLED(value));
    }
    break;
+#ifdef USE_WEIGHTS_FILE
+   case OPUS_SET_DNN_BLOB_REQUEST:
+   {
+       const unsigned char *data = va_arg(ap, const unsigned char *);
+       opus_int32 len = va_arg(ap, opus_int32);
+       if(len<0 || data == NULL)
+       {
+          goto bad_arg;
+       }
+       ret = lpcnet_plc_load_model(&st->lpcnet, data, len);
+       ret = silk_LoadOSCEModels(silk_dec, data, len) || ret;
+   }
+   break;
+#endif
    default:
       /*fprintf(stderr, "unknown opus_decoder_ctl() request: %d", request);*/
       ret = OPUS_UNIMPLEMENTED;
@@ -1025,8 +1136,373 @@ int opus_packet_get_nb_samples(const unsigned char packet[], opus_int32 len,
       return samples;
 }
 
+/* Reports whether the LBRR flag bit(s) in the first byte of the first frame
+   are set: 1/0, or the non-positive opus_packet_parse() result on failure. */
+int opus_packet_has_lbrr(const unsigned char packet[], opus_int32 len)
+{
+   int ret;
+   const unsigned char *frames[48];
+   opus_int16 size[48];
+   int packet_mode, packet_frame_size, packet_stream_channels;
+   int nb_frames=1, lbrr;
+
+   packet_mode = opus_packet_get_mode(packet);
+   if (packet_mode == MODE_CELT_ONLY) return 0;
+   packet_frame_size = opus_packet_get_samples_per_frame(packet, 48000);
+   if (packet_frame_size > 960) nb_frames = packet_frame_size/960;
+   packet_stream_channels = opus_packet_get_nb_channels(packet);
+   ret = opus_packet_parse(packet, len, NULL, frames, size, NULL);
+   if (ret <= 0) return ret;
+   /* A zero-length first frame has no byte to inspect; reading it would overrun. */
+   if (size[0] == 0) return 0;
+   lbrr = (frames[0][0] >> (7-nb_frames)) & 0x1;
+   if (packet_stream_channels == 2)
+      lbrr = lbrr || ((frames[0][0] >> (6-2*nb_frames)) & 0x1);
+   return lbrr;
+}
+
 int opus_decoder_get_nb_samples(const OpusDecoder *dec,
       const unsigned char packet[], opus_int32 len)
 {
    return opus_packet_get_nb_samples(packet, len, dec->Fs);
 }
+
+struct OpusDREDDecoder {
+#ifdef ENABLE_DRED
+   RDOVAEDec model;
+#endif
+   int loaded;
+   int arch;
+   opus_uint32 magic;
+};
+
+#if defined(ENABLE_DRED) && (defined(ENABLE_HARDENING) || defined(ENABLE_ASSERTIONS))
+static void validate_dred_decoder(OpusDREDDecoder *st)
+{
+   celt_assert(st->magic == 0xD8EDDEC0);
+#ifdef OPUS_ARCHMASK
+   celt_assert(st->arch >= 0);
+   celt_assert(st->arch <= OPUS_ARCHMASK);
+#endif
+}
+#define VALIDATE_DRED_DECODER(st) validate_dred_decoder(st)
+#else
+#define VALIDATE_DRED_DECODER(st)
+#endif
+
+
+int opus_dred_decoder_get_size(void)
+{
+  return sizeof(OpusDREDDecoder);
+}
+
+#ifdef ENABLE_DRED
+int dred_decoder_load_model(OpusDREDDecoder *dec, const unsigned char *data, int len)
+{ /* Loads model weights from a blob: OPUS_OK on success, OPUS_BAD_ARG otherwise. */
+    WeightArray *list = NULL; /* stays NULL if parse_weights() yields no list */
+    int ret = -1;
+    parse_weights(&list, data, len);
+    if (list != NULL) ret = init_rdovaedec(&dec->model, list); /* never initialize from a missing list */
+    opus_free(list);
+    if (ret == 0) dec->loaded = 1;
+    return (ret == 0) ? OPUS_OK : OPUS_BAD_ARG;
+}
+#endif
+
+int opus_dred_decoder_init(OpusDREDDecoder *dec)
+{ /* Initializes a caller-allocated DRED decoder: clears loaded, selects arch, stamps magic. */
+   int ret = 0; /* stays 0 when the built-in weights branch below is compiled out */
+   dec->loaded = 0;
+#if defined(ENABLE_DRED) && !defined(USE_WEIGHTS_FILE)
+   ret = init_rdovaedec(&dec->model, rdovaedec_arrays);
+   if (ret == 0) dec->loaded = 1; /* only a successful model init marks the decoder as loaded */
+#endif
+   dec->arch = opus_select_arch();
+   /* To make sure nobody forgets to init, use a magic number. */
+   dec->magic = 0xD8EDDEC0;
+   return (ret == 0) ? OPUS_OK : OPUS_UNIMPLEMENTED; /* a failed model init is reported as OPUS_UNIMPLEMENTED */
+}
+
+/* Allocates and initializes a DRED decoder. Returns the decoder, or NULL on
+   failure; when error is non-NULL it receives OPUS_ALLOC_FAIL if the
+   allocation failed, otherwise the result of opus_dred_decoder_init(). */
+OpusDREDDecoder *opus_dred_decoder_create(int *error)
+{
+   OpusDREDDecoder *state;
+   int status;
+
+   state = (OpusDREDDecoder *)opus_alloc(opus_dred_decoder_get_size());
+   if (state == NULL) {
+      if (error != NULL) *error = OPUS_ALLOC_FAIL;
+      return NULL;
+   }
+   status = opus_dred_decoder_init(state);
+   if (error != NULL) *error = status;
+   if (status == OPUS_OK)
+      return state;
+   /* Initialization failed: release the half-built decoder. */
+   opus_free(state);
+   return NULL;
+}
+
+void opus_dred_decoder_destroy(OpusDREDDecoder *dec)
+{ /* Releases a DRED decoder with opus_free(); the magic write is skipped when dec is NULL. */
+   if (dec) dec->magic = 0xDE57801D; /* replace the init-time magic (0xD8EDDEC0) that the validation assert checks for */
+   opus_free(dec);
+}
+
+/* Handles control requests for a DRED decoder; returns OPUS_OK or an OPUS_* error code. */
+int opus_dred_decoder_ctl(OpusDREDDecoder *dred_dec, int request, ...)
+{
+#ifdef ENABLE_DRED
+   int ret = OPUS_OK;
+   va_list ap;
+   va_start(ap, request);
+   (void)dred_dec;
+   switch (request)
+   {
+# ifdef USE_WEIGHTS_FILE
+   case OPUS_SET_DNN_BLOB_REQUEST:
+   {
+      const unsigned char *data = va_arg(ap, const unsigned char *);
+      opus_int32 len = va_arg(ap, opus_int32);
+      if(len<0 || data == NULL)
+      {
+         goto bad_arg;
+      }
+      ret = dred_decoder_load_model(dred_dec, data, len); /* leave through the common exit so va_end() always runs */
+   }
+   break;
+# endif
+   default:
+     /* Every other request is reported as unimplemented. */
+     ret = OPUS_UNIMPLEMENTED;
+     break;
+  }
+  va_end(ap);
+  return ret;
+# ifdef USE_WEIGHTS_FILE
+bad_arg:
+  va_end(ap);
+  return OPUS_BAD_ARG;
+# endif
+#else
+  (void)dred_dec;
+  (void)request;
+  return OPUS_UNIMPLEMENTED;
+#endif
+}
+
+#ifdef ENABLE_DRED
+static int dred_find_payload(const unsigned char *data, opus_int32 len, const unsigned char **payload, int *dred_frame_offset)
+{
+   const unsigned char *data0;
+   int len0;
+   int frame = 0;
+   int ret;
+   const unsigned char *frames[48];
+   opus_int16 size[48];
+   int frame_size;
+
+   *payload = NULL;
+   /* Get the padding section of the packet. */
+   ret = opus_packet_parse_impl(data, len, 0, NULL, frames, size, NULL, NULL, &data0, &len0);
+   if (ret < 0)
+      return ret;
+   frame_size = opus_packet_get_samples_per_frame(data, 48000);
+   data = data0;
+   len = len0;
+   /* Scan extensions in order until we find the earliest frame with DRED data. */
+   while (len > 0)
+   {
+      opus_int32 header_size;
+      int id, L;
+      len0 = len;
+      data0 = data;
+      id = *data0 >> 1;
+      L = *data0 & 0x1;
+      len = skip_extension(&data, len, &header_size);
+      if (len < 0)
+         break;
+      if (id == 1)
+      {
+         if (L==0)
+         {
+            frame++;
+         } else {
+            frame += data0[1];
+         }
+      } else if (id == DRED_EXTENSION_ID)
+      {
+         const unsigned char *curr_payload;
+         opus_int32 curr_payload_len;
+         curr_payload = data0+header_size;
+         curr_payload_len = (data-data0)-header_size;
+         /* DRED position in the packet, in units of 2.5 ms like for the signaled DRED offset. */
+         *dred_frame_offset = frame*frame_size/120;
+#ifdef DRED_EXPERIMENTAL_VERSION
+         /* Check that temporary extension type and version match.
+            This check will be removed once extension is finalized. */
+         if (curr_payload_len > DRED_EXPERIMENTAL_BYTES && curr_payload[0] == 'D' && curr_payload[1] == DRED_EXPERIMENTAL_VERSION) {
+            *payload = curr_payload+2;
+            return curr_payload_len-2;
+         }
+#else
+         if (curr_payload_len > 0) {
+            *payload = curr_payload;
+            return curr_payload_len;
+         }
+#endif
+      }
+   }
+   return 0;
+}
+#endif
+
+int opus_dred_get_size(void)
+{
+#ifdef ENABLE_DRED
+  return sizeof(OpusDRED);
+#else
+  return 0;
+#endif
+}
+
+/* Allocates an OpusDRED state. Returns the new state, or NULL on failure.
+   When error is non-NULL it always receives a status: OPUS_OK on success,
+   OPUS_ALLOC_FAIL if allocation failed, OPUS_UNIMPLEMENTED without DRED. */
+OpusDRED *opus_dred_alloc(int *error)
+{
+#ifdef ENABLE_DRED
+  OpusDRED *dec;
+  dec = (OpusDRED *)opus_alloc(opus_dred_get_size());
+  /* Report success too, so callers never read an unset *error. */
+  if (error)
+    *error = (dec == NULL) ? OPUS_ALLOC_FAIL : OPUS_OK;
+  return dec;
+#else
+  if (error)
+    *error = OPUS_UNIMPLEMENTED;
+  return NULL;
+#endif
+}
+
+void opus_dred_free(OpusDRED *dec)
+{
+#ifdef ENABLE_DRED
+  opus_free(dec);
+#else
+  (void)dec;
+#endif
+}
+
+int opus_dred_parse(OpusDREDDecoder *dred_dec, OpusDRED *dred, const unsigned char *data, opus_int32 len, opus_int32 max_dred_samples, opus_int32 sampling_rate, int *dred_end, int defer_processing)
+{ /* Locates the DRED payload in a packet and decodes it into dred; returns a sample count (>= 0) or a negative error code. */
+#ifdef ENABLE_DRED
+   const unsigned char *payload;
+   opus_int32 payload_len;
+   int dred_frame_offset=0;
+   VALIDATE_DRED_DECODER(dred_dec);
+   if (!dred_dec->loaded) return OPUS_UNIMPLEMENTED;
+   dred->process_stage = -1; /* a stage that opus_dred_process() rejects; presumably updated by dred_ec_decode() - confirm */
+   payload_len = dred_find_payload(data, len, &payload, &dred_frame_offset);
+   if (payload_len < 0)
+      return payload_len;
+   if (payload != NULL)
+   {
+      int offset;
+      int min_feature_frames;
+      offset = 100*max_dred_samples/sampling_rate; /* max_dred_samples expressed in 10 ms units (100 per second) */
+      min_feature_frames = IMIN(2 + offset, 2*DRED_NUM_REDUNDANCY_FRAMES);
+      dred_ec_decode(dred, payload, payload_len, min_feature_frames, dred_frame_offset);
+      if (!defer_processing)
+         opus_dred_process(dred_dec, dred, dred); /* processed in place; the return value is not checked here */
+      if (dred_end) *dred_end = IMAX(0, -dred->dred_offset*sampling_rate/400); /* sampling_rate/400 = samples per 2.5 ms */
+      return IMAX(0, dred->nb_latents*sampling_rate/25 - dred->dred_offset* sampling_rate/400);
+   }
+   if (dred_end) *dred_end = 0; /* no DRED payload was found in this packet */
+   return 0;
+#else
+   (void)dred_dec;
+   (void)dred;
+   (void)data;
+   (void)len;
+   (void)max_dred_samples;
+   (void)sampling_rate;
+   (void)defer_processing;
+   (void)dred_end;
+   return OPUS_UNIMPLEMENTED;
+#endif
+}
+
+/* Calls DRED_rdovae_decode_all() on parsed DRED data. src is copied to dst
+   first when they differ; data already at process_stage 2 is left untouched. */
+int opus_dred_process(OpusDREDDecoder *dred_dec, const OpusDRED *src, OpusDRED *dst)
+{
+#ifdef ENABLE_DRED
+   if (dred_dec == NULL || src == NULL || dst == NULL || (src->process_stage != 1 && src->process_stage != 2)) return OPUS_BAD_ARG;
+   VALIDATE_DRED_DECODER(dred_dec);
+   if (!dred_dec->loaded) return OPUS_UNIMPLEMENTED;
+   if (src != dst) OPUS_COPY(dst, src, 1);
+   if (dst->process_stage != 2) {
+      DRED_rdovae_decode_all(&dred_dec->model, dst->fec_features, dst->state, dst->latents, dst->nb_latents, dred_dec->arch);
+      dst->process_stage = 2;
+   }
+   return OPUS_OK;
+#else
+   (void)dred_dec;
+   (void)src;
+   (void)dst;
+   return OPUS_UNIMPLEMENTED;
+#endif
+}
+
+int opus_decoder_dred_decode(OpusDecoder *st, const OpusDRED *dred, opus_int32 dred_offset, opus_int16 *pcm, opus_int32 frame_size)
+{ /* 16-bit variant: decodes with no packet data (NULL, 0) plus the given DRED state, then converts to opus_int16. */
+#ifdef ENABLE_DRED
+   VARDECL(float, out);
+   int ret, i;
+   ALLOC_STACK;
+
+   if(frame_size<=0)
+   {
+      RESTORE_STACK;
+      return OPUS_BAD_ARG;
+   }
+
+   celt_assert(st->channels == 1 || st->channels == 2);
+   ALLOC(out, frame_size*st->channels, float); /* temporary float buffer, one slot per sample per channel */
+
+   ret = opus_decode_native(st, NULL, 0, out, frame_size, 0, 0, NULL, 1, dred, dred_offset); /* soft_clip argument is 1 here */
+   if (ret > 0)
+   {
+      for (i=0;i<ret*st->channels;i++)
+         pcm[i] = FLOAT2INT16(out[i]);
+   }
+   RESTORE_STACK;
+   return ret; /* sample count from opus_decode_native(), or its error code */
+#else
+   (void)st;
+   (void)dred;
+   (void)dred_offset;
+   (void)pcm;
+   (void)frame_size;
+   return OPUS_UNIMPLEMENTED;
+#endif
+}
+
+int opus_decoder_dred_decode_float(OpusDecoder *st, const OpusDRED *dred, opus_int32 dred_offset, float *pcm, opus_int32 frame_size)
+{ /* Float variant: forwards to opus_decode_native() with no packet data (NULL, 0) and the given DRED state. */
+#ifdef ENABLE_DRED
+   if(frame_size<=0)
+      return OPUS_BAD_ARG;
+   return opus_decode_native(st, NULL, 0, pcm, frame_size, 0, 0, NULL, 0, dred, dred_offset); /* soft_clip argument is 0 here */
+#else
+   (void)st;
+   (void)dred;
+   (void)dred_offset;
+   (void)pcm;
+   (void)frame_size;
+   return OPUS_UNIMPLEMENTED;
+#endif
+}
diff --git a/opus/src/opus_demo.c b/opus/src/opus_demo.c
index 4cc26a6c..2876fff8 100644
--- a/opus/src/opus_demo.c
+++ b/opus/src/opus_demo.c
@@ -39,9 +39,80 @@
 #include "opus_types.h"
 #include "opus_private.h"
 #include "opus_multistream.h"
+#ifdef ENABLE_LOSSGEN
+#include "lossgen.h"
+#endif
 
 #define MAX_PACKET 1500
 
+#ifdef USE_WEIGHTS_FILE
+# if __unix__
+#  include <fcntl.h>
+#  include <sys/mman.h>
+#  include <unistd.h>
+#  include <sys/stat.h>
+/* When available, mmap() is preferable to reading the file, as it leads to
+   better resource utilization, especially if multiple processes are using the same
+   file (mapping will be shared in cache). */
+void *load_blob(const char *filename, int *len) { /* maps filename read-only; returns the mapping and stores its size in *len, or NULL with *len = 0 */
+  int fd;
+  void *data;
+  struct stat st;
+  if (stat(filename, &st)) { /* non-zero means stat() failed */
+     *len = 0;
+     return NULL;
+  }
+  *len = st.st_size; /* NOTE(review): st_size is narrowed to int here; presumably the file is expected to be well under INT_MAX, confirm */
+  fd = open(filename, O_RDONLY);
+  if (fd<0) {
+     *len = 0;
+     return NULL;
+  }
+  data = mmap(NULL, *len, PROT_READ, MAP_SHARED, fd, 0); /* read-only shared mapping of the whole file, starting at offset 0 */
+  if (data == MAP_FAILED) { /* normalise the failure to the NULL / zero-length convention used above */
+     *len = 0;
+     data = NULL;
+  }
+  close(fd); /* closed on both the success and the MAP_FAILED path; the mapping is returned after this */
+  return data;
+}
+void free_blob(void *blob, int len) { /* releases a mapping obtained from the mmap-based load_blob() */
+  if (blob) munmap(blob, len); /* a NULL blob (failed load) is skipped */
+}
+# else
+/* Fallback loader for platforms without mmap(): reads the whole file into a
+   malloc()ed buffer. Returns the buffer with its size stored in *len, or NULL
+   with *len set to 0 on failure. Release the buffer with free_blob(). */
+void *load_blob(const char *filename, int *len) {
+  FILE *file;
+  void *data = NULL;
+  /* "rb": the blob is binary; text mode may translate or truncate it. */
+  file = fopen(filename, "rb");
+  if (file == NULL) {
+    perror("could not open blob file");
+    *len = 0;
+    return NULL;
+  }
+  /* Measure the file; a failed ftell() gives -1 and is rejected below. */
+  fseek(file, 0L, SEEK_END);
+  *len = ftell(file);
+  fseek(file, 0L, SEEK_SET);
+  if (*len > 0)
+     data = malloc(*len);
+  /* A short read is reported through *len; failures report 0. */
+  *len = data != NULL ? (int)fread(data, 1, *len, file) : 0;
+  /* Close on every path: the stream used to leak, even on success. */
+  fclose(file);
+  return data;
+}
+void free_blob(void *blob, int len) { /* releases a buffer obtained from the malloc-based load_blob() */
+  free(blob);
+  (void)len; /* unused here; the parameter keeps the signature identical to the mmap variant */
+}
+# endif
+#endif
+
+
 void print_usage( char* argv[] )
 {
     fprintf(stderr, "Usage: %s [-e] <application> <sampling rate (Hz)> <channels (1/2)> "
@@ -58,11 +129,17 @@ void print_usage( char* argv[] )
     fprintf(stderr, "-bandwidth <NB|MB|WB|SWB|FB> : audio bandwidth (from narrowband to fullband); default: sampling rate\n" );
     fprintf(stderr, "-framesize <2.5|5|10|20|40|60|80|100|120> : frame size in ms; default: 20 \n" );
     fprintf(stderr, "-max_payload <bytes> : maximum payload size in bytes, default: 1024\n" );
-    fprintf(stderr, "-complexity <comp>   : complexity, 0 (lowest) ... 10 (highest); default: 10\n" );
+    fprintf(stderr, "-complexity <comp>   : encoder complexity, 0 (lowest) ... 10 (highest); default: 10\n" );
+    fprintf(stderr, "-dec_complexity <comp> : decoder complexity, 0 (lowest) ... 10 (highest); default: 0\n" );
     fprintf(stderr, "-inbandfec           : enable SILK inband FEC\n" );
     fprintf(stderr, "-forcemono           : force mono encoding, even for stereo input\n" );
     fprintf(stderr, "-dtx                 : enable SILK DTX\n" );
-    fprintf(stderr, "-loss <perc>         : simulate packet loss, in percent (0-100); default: 0\n" );
+    fprintf(stderr, "-loss <perc>         : optimize for loss percentage and simulate packet loss, in percent (0-100); default: 0\n" );
+#ifdef ENABLE_LOSSGEN
+    fprintf(stderr, "-sim_loss <perc>     : simulate realistic (bursty) packet loss from percentage, using generative model\n" );
+#endif
+    fprintf(stderr, "-lossfile <file>     : simulate packet loss, reading loss from file\n" );
+    fprintf(stderr, "-dred <frames>       : add Deep REDundancy (in units of 10-ms frames)\n" );
 }
 
 static void int_to_char(opus_uint32 i, unsigned char ch[4])
@@ -80,6 +157,7 @@ static opus_uint32 char_to_int(unsigned char ch[4])
 }
 
 #define check_encoder_option(decode_only, opt) do {if (decode_only) {fprintf(stderr, "option %s is only for encoding\n", opt); goto failure;}} while(0)
+#define check_decoder_option(encode_only, opt) do {if (encode_only) {fprintf(stderr, "option %s is only for decoding\n", opt); goto failure;}} while(0)
 
 static const int silk8_test[][4] = {
       {MODE_SILK_ONLY, OPUS_BANDWIDTH_NARROWBAND, 960*3, 1},
@@ -207,6 +285,68 @@ static OpusDecoder *ms_opus_decoder_create(opus_int32 Fs, int channels, int *err
 }
 #endif
 
+
+#ifdef ENABLE_OSCE_TRAINING_DATA
+#define COMPLEXITY_MIN 0 /* lower bound of the complexity drawn in new_random_setting() */
+#define COMPLEXITY_MAX 10 /* upper bound (inclusive) of that draw */
+
+#define PACKET_LOSS_PERC_MIN 0 /* packet-loss percentage is drawn from MIN..MAX ... */
+#define PACKET_LOSS_PERC_MAX 50
+#define PACKET_LOSS_PERC_STEP 5 /* ... in increments of STEP */
+
+#define CBR_BITRATE_LIMIT 8000 /* new_random_setting() always enables VBR below this bitrate */
+
+#define NUM_BITRATES 102 /* number of entries in bitrates[] */
+static int bitrates[NUM_BITRATES] = { /* candidate values passed to OPUS_SET_BITRATE, in ascending order */
+        6000,  6060,  6120,  6180,  6240,  6300,  6360,  6420,  6480,
+        6525,  6561,  6598,  6634,  6670,  6707,  6743,  6780,  6816,
+        6853,  6889,  6926,  6962,  6999,  7042,  7085,  7128,  7171,
+        7215,  7258,  7301,  7344,  7388,  7431,  7474,  7512,  7541,
+        7570,  7599,  7628,  7657,  7686,  7715,  7744,  7773,  7802,
+        7831,  7860,  7889,  7918,  7947,  7976,  8013,  8096,  8179,
+        8262,  8344,  8427,  8511,  8605,  8699,  8792,  8886,  8980,
+        9100,  9227,  9354,  9480,  9561,  9634,  9706,  9779,  9851,
+        9924,  9996, 10161, 10330, 10499, 10698, 10898, 11124, 11378,
+       11575, 11719, 11862, 12014, 12345, 12751, 13195, 13561, 13795,
+       14069, 14671, 15403, 15790, 16371, 17399, 17968, 19382, 20468,
+       22000, 32000, 64000
+};
+
+static int randint(int min, int max, int step) /* random value of the form min + k*step, drawn with rand() */
+{
+    double r = ((double) rand())/ (RAND_MAX + 1.); /* r lies in [0, 1): the divisor is one more than rand()'s maximum */
+    int d;
+
+    d = ((int) ((max + 1 - min) * r / step) * step) + min; /* scale r to the span, truncate to a whole number of steps, then offset by min */
+
+    return d;
+}
+
+static void new_random_setting(OpusEncoder *enc) /* draws a random bitrate, complexity, loss percentage and VBR flag and applies them to enc */
+{
+    int bitrate_bps;
+    int complexity;
+    int packet_loss_perc;
+    int use_vbr;
+
+    bitrate_bps = bitrates[randint(0, NUM_BITRATES - 1, 1)]; /* random index into the bitrates[] table */
+    complexity  = randint(COMPLEXITY_MIN, COMPLEXITY_MAX, 1);
+    packet_loss_perc = randint(PACKET_LOSS_PERC_MIN, PACKET_LOSS_PERC_MAX, PACKET_LOSS_PERC_STEP);
+    use_vbr = bitrate_bps < CBR_BITRATE_LIMIT ? 1 : randint(0, 1, 1); /* VBR is forced below the limit, otherwise chosen at random */
+
+    if (1) /* always true: the new settings are logged on every call */
+    {
+        printf("changing settings to %d\t%d\t%d\t%d\n", bitrate_bps, complexity, packet_loss_perc, use_vbr);
+    }
+
+    opus_encoder_ctl(enc, OPUS_SET_BITRATE(bitrate_bps)); /* return values of the four ctl calls are not checked */
+    opus_encoder_ctl(enc, OPUS_SET_COMPLEXITY(complexity));
+    opus_encoder_ctl(enc, OPUS_SET_PACKET_LOSS_PERC(packet_loss_perc));
+    opus_encoder_ctl(enc, OPUS_SET_VBR(use_vbr));
+}
+
+#endif
+
 int main(int argc, char *argv[])
 {
     int err;
@@ -215,21 +355,28 @@ int main(int argc, char *argv[])
     FILE *fout=NULL;
     OpusEncoder *enc=NULL;
     OpusDecoder *dec=NULL;
+    OpusDRED *dred=NULL;
+    OpusDREDDecoder *dred_dec=NULL;
     int args;
-    int len[2];
+    int len;
     int frame_size, channels;
     opus_int32 bitrate_bps=0;
-    unsigned char *data[2] = {NULL, NULL};
+    unsigned char *data = NULL;
     unsigned char *fbytes=NULL;
     opus_int32 sampling_rate;
     int use_vbr;
     int max_payload_bytes;
     int complexity;
+    int dec_complexity;
     int use_inbandfec;
     int use_dtx;
     int forcechannels;
     int cvbr = 0;
     int packet_loss_perc;
+#ifdef ENABLE_LOSSGEN
+    float lossgen_perc = -1.f;
+    LossGenState lossgen;
+#endif
     opus_int32 count=0, count_act=0;
     int k;
     opus_int32 skip=0;
@@ -243,8 +390,7 @@ int main(int argc, char *argv[])
     int bandwidth=OPUS_AUTO;
     const char *bandwidth_string;
     int lost = 0, lost_prev = 1;
-    int toggle = 0;
-    opus_uint32 enc_final_range[2];
+    opus_uint32 enc_final_range;
     opus_uint32 dec_final_range;
     int encode_only=0, decode_only=0;
     int max_frame_size = 48000*2;
@@ -264,6 +410,19 @@ int main(int argc, char *argv[])
     int variable_duration=OPUS_FRAMESIZE_ARG;
     int delayed_decision=0;
     int ret = EXIT_FAILURE;
+    int lost_count=0;
+    FILE *packet_loss_file=NULL;
+    int dred_duration=0;
+#ifdef ENABLE_OSCE_TRAINING_DATA
+    int silk_random_switching = 0;
+    int silk_frame_counter = 0;
+#endif
+#ifdef USE_WEIGHTS_FILE
+    int blob_len;
+    void *blob_data;
+    const char *filename = "weights_blob.bin";
+    blob_data = load_blob(filename, &blob_len);
+#endif
 
     if (argc < 5 )
     {
@@ -335,6 +494,7 @@ int main(int argc, char *argv[])
     use_vbr = 1;
     max_payload_bytes = MAX_PACKET;
     complexity = 10;
+    dec_complexity = 0;
     use_inbandfec = 0;
     forcechannels = OPUS_AUTO;
     use_dtx = 0;
@@ -400,6 +560,10 @@ int main(int argc, char *argv[])
             check_encoder_option(decode_only, "-complexity");
             complexity = atoi( argv[ args + 1 ] );
             args += 2;
+        } else if( strcmp( argv[ args ], "-dec_complexity" ) == 0 ) {
+            check_decoder_option(encode_only, "-dec_complexity");
+            dec_complexity = atoi( argv[ args + 1 ] );
+            args += 2;
         } else if( strcmp( argv[ args ], "-inbandfec" ) == 0 ) {
             use_inbandfec = 1;
             args++;
@@ -422,6 +586,22 @@ int main(int argc, char *argv[])
         } else if( strcmp( argv[ args ], "-loss" ) == 0 ) {
             packet_loss_perc = atoi( argv[ args + 1 ] );
             args += 2;
+#ifdef ENABLE_LOSSGEN
+        } else if( strcmp( argv[ args ], "-sim_loss" ) == 0 ) {
+            lossgen_perc = atof( argv[ args + 1 ] );
+            lossgen_init(&lossgen);
+            args += 2;
+#endif
+        } else if( strcmp( argv[ args ], "-lossfile" ) == 0 ) {
+            packet_loss_file = fopen( argv[ args + 1 ], "r" );
+            if (packet_loss_file == NULL) {
+                fprintf(stderr, "failed to open loss file %s\n", argv[ args + 1 ] );
+                exit(1);
+            }
+            args += 2;
+        } else if( strcmp( argv[ args ], "-dred" ) == 0 ) {
+            dred_duration = atoi( argv[ args + 1 ] );
+            args += 2;
         } else if( strcmp( argv[ args ], "-sweep" ) == 0 ) {
             check_encoder_option(decode_only, "-sweep");
             sweep_bps = atoi( argv[ args + 1 ] );
@@ -473,6 +653,12 @@ int main(int argc, char *argv[])
             mode_list = celt_hq_test;
             nb_modes_in_list = 4;
             args++;
+#ifdef ENABLE_OSCE_TRAINING_DATA
+        } else if( strcmp( argv[ args ], "-silk_random_switching" ) == 0 ){
+            silk_random_switching = atoi( argv[ args + 1 ] );
+            printf("switching encoding parameters every %dth frame\n", silk_random_switching);
+            args += 2;
+#endif
         } else {
             printf( "Error: unrecognized setting: %s\n\n", argv[ args ] );
             print_usage( argv );
@@ -537,6 +723,10 @@ int main(int argc, char *argv[])
        opus_encoder_ctl(enc, OPUS_GET_LOOKAHEAD(&skip));
        opus_encoder_ctl(enc, OPUS_SET_LSB_DEPTH(16));
        opus_encoder_ctl(enc, OPUS_SET_EXPERT_FRAME_DURATION(variable_duration));
+       if (dred_duration > 0)
+       {
+          opus_encoder_ctl(enc, OPUS_SET_DRED_DURATION(dred_duration));
+       }
     }
     if (!encode_only)
     {
@@ -546,9 +736,8 @@ int main(int argc, char *argv[])
           fprintf(stderr, "Cannot create decoder: %s\n", opus_strerror(err));
           goto failure;
        }
+       opus_decoder_ctl(dec, OPUS_SET_COMPLEXITY(dec_complexity));
     }
-
-
     switch(bandwidth)
     {
     case OPUS_BANDWIDTH_NARROWBAND:
@@ -587,10 +776,7 @@ int main(int argc, char *argv[])
     out = (short*)malloc(max_frame_size*channels*sizeof(short));
     /* We need to allocate for 16-bit PCM data, but we store it as unsigned char. */
     fbytes = (unsigned char*)malloc(max_frame_size*channels*sizeof(short));
-    data[0] = (unsigned char*)calloc(max_payload_bytes,sizeof(unsigned char));
-    if ( use_inbandfec ) {
-        data[1] = (unsigned char*)calloc(max_payload_bytes,sizeof(unsigned char));
-    }
+    data = (unsigned char*)calloc(max_payload_bytes,sizeof(unsigned char));
     if(delayed_decision)
     {
        if (frame_size==sampling_rate/400)
@@ -614,6 +800,13 @@ int main(int argc, char *argv[])
        opus_encoder_ctl(enc, OPUS_SET_EXPERT_FRAME_DURATION(variable_duration));
        frame_size = 2*48000;
     }
+    dred_dec = opus_dred_decoder_create(&err);
+    dred = opus_dred_alloc(&err);
+#ifdef USE_WEIGHTS_FILE
+    if (enc) opus_encoder_ctl(enc, OPUS_SET_DNN_BLOB(blob_data, blob_len));
+    if (dec) opus_decoder_ctl(dec, OPUS_SET_DNN_BLOB(blob_data, blob_len));
+    if (dred_dec) opus_dred_decoder_ctl(dred_dec, OPUS_SET_DNN_BLOB(blob_data, blob_len));
+#endif
     while (!stop)
     {
         if (delayed_celt)
@@ -652,22 +845,22 @@ int main(int argc, char *argv[])
             num_read = fread(ch, 1, 4, fin);
             if (num_read!=4)
                 break;
-            len[toggle] = char_to_int(ch);
-            if (len[toggle]>max_payload_bytes || len[toggle]<0)
+            len = char_to_int(ch);
+            if (len>max_payload_bytes || len<0)
             {
-                fprintf(stderr, "Invalid payload length: %d\n",len[toggle]);
+                fprintf(stderr, "Invalid payload length: %d\n",len);
                 break;
             }
             num_read = fread(ch, 1, 4, fin);
             if (num_read!=4)
                 break;
-            enc_final_range[toggle] = char_to_int(ch);
-            num_read = fread(data[toggle], 1, len[toggle], fin);
-            if (num_read!=(size_t)len[toggle])
+            enc_final_range = char_to_int(ch);
+            num_read = fread(data, 1, len, fin);
+            if (num_read!=(size_t)len)
             {
                 fprintf(stderr, "Ran out of input, "
                                 "expecting %d bytes got %d\n",
-                                len[toggle],(int)num_read);
+                                len,(int)num_read);
                 break;
             }
         } else {
@@ -679,6 +872,15 @@ int main(int argc, char *argv[])
                 opus_encoder_ctl(enc, OPUS_SET_FORCE_CHANNELS(mode_list[curr_mode][3]));
                 frame_size = mode_list[curr_mode][2];
             }
+#ifdef ENABLE_OSCE_TRAINING_DATA
+            if (silk_random_switching)
+            {
+                silk_frame_counter += 1;
+                if (silk_frame_counter % silk_random_switching == 0) {
+                    new_random_setting(enc);
+                }
+            }
+#endif
             num_read = fread(fbytes, sizeof(short)*channels, frame_size-remaining, fin);
             curr_read = (int)num_read;
             tot_in += curr_read;
@@ -696,8 +898,13 @@ int main(int argc, char *argv[])
                 if (encode_only || decode_only)
                    stop = 1;
             }
-            len[toggle] = opus_encode(enc, in, frame_size, data[toggle], max_payload_bytes);
-            nb_encoded = opus_packet_get_samples_per_frame(data[toggle], sampling_rate)*opus_packet_get_nb_frames(data[toggle], len[toggle]);
+            len = opus_encode(enc, in, frame_size, data, max_payload_bytes);
+            if (len < 0)
+            {
+                fprintf (stderr, "opus_encode() returned %d\n", len);
+                goto failure;
+            }
+            nb_encoded = opus_packet_get_samples_per_frame(data, sampling_rate)*opus_packet_get_nb_frames(data, len);
             remaining = frame_size-nb_encoded;
             for(i=0;i<remaining*channels;i++)
                in[i] = in[nb_encoded*channels+i];
@@ -716,12 +923,7 @@ int main(int argc, char *argv[])
                   bitrate_bps = 1000;
                opus_encoder_ctl(enc, OPUS_SET_BITRATE(bitrate_bps));
             }
-            opus_encoder_ctl(enc, OPUS_GET_FINAL_RANGE(&enc_final_range[toggle]));
-            if (len[toggle] < 0)
-            {
-                fprintf (stderr, "opus_encode() returned %d\n", len[toggle]);
-                goto failure;
-            }
+            opus_encoder_ctl(enc, OPUS_GET_FINAL_RANGE(&enc_final_range));
             curr_mode_count += frame_size;
             if (curr_mode_count > mode_switch_time && curr_mode < nb_modes_in_list-1)
             {
@@ -731,56 +933,84 @@ int main(int argc, char *argv[])
         }
 
 #if 0 /* This is for testing the padding code, do not enable by default */
-        if (len[toggle]<1275)
+        if (len<1275)
         {
-           int new_len = len[toggle]+rand()%(max_payload_bytes-len[toggle]);
-           if ((err = opus_packet_pad(data[toggle], len[toggle], new_len)) != OPUS_OK)
+           int new_len = len+rand()%(max_payload_bytes-len);
+           if ((err = opus_packet_pad(data, len, new_len)) != OPUS_OK)
            {
               fprintf(stderr, "padding failed: %s\n", opus_strerror(err));
               goto failure;
            }
-           len[toggle] = new_len;
+           len = new_len;
         }
 #endif
         if (encode_only)
         {
             unsigned char int_field[4];
-            int_to_char(len[toggle], int_field);
+            int_to_char(len, int_field);
             if (fwrite(int_field, 1, 4, fout) != 4) {
                fprintf(stderr, "Error writing.\n");
                goto failure;
             }
-            int_to_char(enc_final_range[toggle], int_field);
+            int_to_char(enc_final_range, int_field);
             if (fwrite(int_field, 1, 4, fout) != 4) {
                fprintf(stderr, "Error writing.\n");
                goto failure;
             }
-            if (fwrite(data[toggle], 1, len[toggle], fout) != (unsigned)len[toggle]) {
+            if (fwrite(data, 1, len, fout) != (unsigned)len) {
                fprintf(stderr, "Error writing.\n");
                goto failure;
             }
             tot_samples += nb_encoded;
         } else {
-            opus_int32 output_samples;
-            lost = len[toggle]==0 || (packet_loss_perc>0 && rand()%100 < packet_loss_perc);
+            int fr;
+            int run_decoder;
+            int dred_input=0;
+            int dred_end=0;
+            if (packet_loss_file != NULL) {
+                if ( fscanf(packet_loss_file, "%d", &lost) != 1) {
+                    lost = 0;
+                }
+#ifdef ENABLE_LOSSGEN
+            } else if (lossgen_perc >= 0) {
+               lost = sample_loss(&lossgen, lossgen_perc*.01f);
+#endif
+            } else {
+              lost = (packet_loss_perc>0) && (rand()%100 < packet_loss_perc);
+            }
+            if (len == 0) lost = 1;
             if (lost)
-               opus_decoder_ctl(dec, OPUS_GET_LAST_PACKET_DURATION(&output_samples));
-            else
-               output_samples = max_frame_size;
-            if( count >= use_inbandfec ) {
-                /* delay by one packet when using in-band FEC */
-                if( use_inbandfec  ) {
-                    if( lost_prev ) {
-                        /* attempt to decode with in-band FEC from next packet */
-                        opus_decoder_ctl(dec, OPUS_GET_LAST_PACKET_DURATION(&output_samples));
-                        output_samples = opus_decode(dec, lost ? NULL : data[toggle], len[toggle], out, output_samples, 1);
-                    } else {
-                        /* regular decode */
-                        output_samples = max_frame_size;
-                        output_samples = opus_decode(dec, data[1-toggle], len[1-toggle], out, output_samples, 0);
-                    }
+            {
+               lost_count++;
+               run_decoder = 0;
+            } else {
+               run_decoder= 1;
+            }
+            if (run_decoder)
+                run_decoder += lost_count;
+            if (!lost && lost_count > 0) {
+                opus_int32 output_samples=0;
+                opus_decoder_ctl(dec, OPUS_GET_LAST_PACKET_DURATION(&output_samples));
+                dred_input = lost_count*output_samples;
+                /* Only decode the amount we need to fill in the gap. */
+                ret = opus_dred_parse(dred_dec, dred, data, len, IMIN(48000, IMAX(0, dred_input)), sampling_rate, &dred_end, 0);
+                dred_input = ret > 0 ? ret : 0;
+            }
+            /* FIXME: Figure out how to trigger the decoder when the last packet of the file is lost. */
+            for (fr=0;fr<run_decoder;fr++) {
+                opus_int32 output_samples=0;
+                if (fr == lost_count-1 && opus_packet_has_lbrr(data, len)) {
+                   opus_decoder_ctl(dec, OPUS_GET_LAST_PACKET_DURATION(&output_samples));
+                   output_samples = opus_decode(dec, data, len, out, output_samples, 1);
+                } else if (fr < lost_count) {
+                   opus_decoder_ctl(dec, OPUS_GET_LAST_PACKET_DURATION(&output_samples));
+                   if (dred_input > 0)
+                      output_samples = opus_decoder_dred_decode(dec, dred, (lost_count-fr)*output_samples, out, output_samples);
+                   else
+                      output_samples = opus_decode(dec, NULL, 0, out, output_samples, 0);
                 } else {
-                    output_samples = opus_decode(dec, lost ? NULL : data[toggle], len[toggle], out, output_samples, 0);
+                   output_samples = max_frame_size;
+                   output_samples = opus_decode(dec, data, len, out, output_samples, 0);
                 }
                 if (output_samples>0)
                 {
@@ -817,24 +1047,26 @@ int main(int argc, char *argv[])
         if (!encode_only)
            opus_decoder_ctl(dec, OPUS_GET_FINAL_RANGE(&dec_final_range));
         /* compare final range encoder rng values of encoder and decoder */
-        if( enc_final_range[toggle^use_inbandfec]!=0  && !encode_only
+        if( enc_final_range!=0  && !encode_only
          && !lost && !lost_prev
-         && dec_final_range != enc_final_range[toggle^use_inbandfec] ) {
+         && dec_final_range != enc_final_range ) {
             fprintf (stderr, "Error: Range coder state mismatch "
                              "between encoder and decoder "
                              "in frame %ld: 0x%8lx vs 0x%8lx\n",
                          (long)count,
-                         (unsigned long)enc_final_range[toggle^use_inbandfec],
+                         (unsigned long)enc_final_range,
                          (unsigned long)dec_final_range);
             goto failure;
         }
 
         lost_prev = lost;
+        if (!lost)
+           lost_count = 0;
         if( count >= use_inbandfec ) {
             /* count bits */
-            bits += len[toggle]*8;
-            bits_max = ( len[toggle]*8 > bits_max ) ? len[toggle]*8 : bits_max;
-            bits2 += len[toggle]*len[toggle]*64;
+            bits += len*8;
+            bits_max = ( len*8 > bits_max ) ? len*8 : bits_max;
+            bits2 += len*len*64;
             if (!decode_only)
             {
                 nrg = 0.0;
@@ -843,13 +1075,12 @@ int main(int argc, char *argv[])
                 }
                 nrg /= frame_size * channels;
                 if( nrg > 1e5 ) {
-                    bits_act += len[toggle]*8;
+                    bits_act += len*8;
                     count_act++;
                 }
             }
         }
         count++;
-        toggle = (toggle + use_inbandfec) & 1;
     }
 
     if(decode_only && count > 0)
@@ -879,8 +1110,9 @@ int main(int argc, char *argv[])
 failure:
     opus_encoder_destroy(enc);
     opus_decoder_destroy(dec);
-    free(data[0]);
-    free(data[1]);
+    opus_dred_free(dred);
+    opus_dred_decoder_destroy(dred_dec);
+    free(data);
     if (fin)
         fclose(fin);
     if (fout)
@@ -888,5 +1120,8 @@ int main(int argc, char *argv[])
     free(in);
     free(out);
     free(fbytes);
+#ifdef USE_WEIGHTS_FILE
+    free_blob(blob_data, blob_len);
+#endif
     return ret;
 }
diff --git a/opus/src/opus_encoder.c b/opus/src/opus_encoder.c
index e98ac5b8..d18d582f 100644
--- a/opus/src/opus_encoder.c
+++ b/opus/src/opus_encoder.c
@@ -45,11 +45,19 @@
 #include "analysis.h"
 #include "mathops.h"
 #include "tuning_parameters.h"
+
+#ifdef ENABLE_DRED
+#include "dred_coding.h"
+#endif
+
 #ifdef FIXED_POINT
 #include "fixed/structs_FIX.h"
 #else
 #include "float/structs_FLP.h"
 #endif
+#ifdef ENABLE_OSCE_TRAINING_DATA
+#include <stdio.h>
+#endif
 
 #define MAX_ENCODER_BUFFER 480
 
@@ -67,6 +75,9 @@ struct OpusEncoder {
     int          celt_enc_offset;
     int          silk_enc_offset;
     silk_EncControlStruct silk_mode;
+#ifdef ENABLE_DRED
+    DREDEnc      dred_encoder;
+#endif
     int          application;
     int          channels;
     int          delay_compensation;
@@ -87,6 +98,7 @@ struct OpusEncoder {
     int          lfe;
     int          arch;
     int          use_dtx;                 /* general DTX for both SILK and CELT */
+    int          fec_config;
 #ifndef DISABLE_FLOAT_API
     TonalityAnalysisState analysis;
 #endif
@@ -112,8 +124,16 @@ struct OpusEncoder {
     opus_val16   delay_buffer[MAX_ENCODER_BUFFER*2];
 #ifndef DISABLE_FLOAT_API
     int          detected_bandwidth;
-    int          nb_no_activity_frames;
+    int          nb_no_activity_ms_Q1;
     opus_val32   peak_signal_energy;
+#endif
+#ifdef ENABLE_DRED
+    int          dred_duration;
+    int          dred_q0;
+    int          dred_dQ;
+    int          dred_qmax;
+    int          dred_target_chunks;
+    unsigned char activity_mem[DRED_MAX_FRAMES*4]; /* 2.5ms resolution*/
 #endif
     int          nonfinal_frame; /* current frame is not the final in a packet */
     opus_uint32  rangeFinal;
@@ -223,6 +243,7 @@ int opus_encoder_init(OpusEncoder* st, opus_int32 Fs, int channels, int applicat
     st->silk_mode.packetLossPercentage      = 0;
     st->silk_mode.complexity                = 9;
     st->silk_mode.useInBandFEC              = 0;
+    st->silk_mode.useDRED                   = 0;
     st->silk_mode.useDTX                    = 0;
     st->silk_mode.useCBR                    = 0;
     st->silk_mode.reducedDependency         = 0;
@@ -235,6 +256,11 @@ int opus_encoder_init(OpusEncoder* st, opus_int32 Fs, int channels, int applicat
     celt_encoder_ctl(celt_enc, CELT_SET_SIGNALLING(0));
     celt_encoder_ctl(celt_enc, OPUS_SET_COMPLEXITY(st->silk_mode.complexity));
 
+#ifdef ENABLE_DRED
+    /* Initialize DRED Encoder */
+    dred_encoder_init( &st->dred_encoder, Fs, channels );
+#endif
+
     st->use_vbr = 1;
     /* Makes constrained VBR the default (safer for real-time use) */
     st->vbr_constraint = 1;
@@ -543,6 +569,73 @@ OpusEncoder *opus_encoder_create(opus_int32 Fs, int channels, int application, i
    return st;
 }
 
+#ifdef ENABLE_DRED
+
+static const float dred_bits_table[16] = {73.2f, 68.1f, 62.5f, 57.0f, 51.5f, 45.7f, 39.9f, 32.4f, 26.4f, 20.4f, 16.3f, 13.f, 9.3f, 8.2f, 7.2f, 6.4f}; /* bit cost added per quantizer value; index is the quantizer (0..15) */
+static int estimate_dred_bitrate(int q0, int dQ, int qmax, int duration, opus_int32 target_bits, int *target_chunks) { /* returns an estimated size in bits (not a rate); target_chunks is optional */
+   int dred_chunks;
+   int i;
+   float bits;
+   /* Signaling DRED costs 3 bytes. */
+   bits = 8*(3+DRED_EXPERIMENTAL_BYTES);
+   /* Approximation for the size of the IS. */
+   bits += 50.f+dred_bits_table[q0]; /* q0 indexes the 16-entry table directly, so it must be in 0..15 */
+   dred_chunks = IMIN((duration+5)/4, DRED_NUM_REDUNDANCY_FRAMES/2); /* chunk count derived from duration, capped at half of DRED_NUM_REDUNDANCY_FRAMES */
+   if (target_chunks != NULL) *target_chunks = 0; /* stays 0 if not even the first chunk fits */
+   for (i=0;i<dred_chunks;i++) {
+      int q = compute_quantizer(q0, dQ, qmax, i);
+      bits += dred_bits_table[q];
+      if (target_chunks != NULL && bits < target_bits) *target_chunks = i+1; /* last chunk count whose running total is still below target_bits */
+   }
+   return (int)floor(.5f+bits); /* total over all dred_chunks, rounded to the nearest integer */
+}
+
+static opus_int32 compute_dred_bitrate(OpusEncoder *st, opus_int32 bitrate_bps, int frame_size) /* returns the bitrate to spend on DRED and stores the chosen quantizer settings in st */
+{
+   float dred_frac;
+   int bitrate_offset;
+   opus_int32 dred_bitrate;
+   opus_int32 target_dred_bitrate;
+   int target_chunks;
+   opus_int32 max_dred_bits;
+   int q0, dQ, qmax;
+   if (st->silk_mode.useInBandFEC) { /* with in-band FEC: smaller DRED fraction (at most .7) and a larger bitrate offset */
+      dred_frac = MIN16(.7f, 3.f*st->silk_mode.packetLossPercentage/100.f);
+      bitrate_offset = 20000;
+   } else {
+      if (st->silk_mode.packetLossPercentage > 5) {
+         dred_frac = MIN16(.8f, .55f + st->silk_mode.packetLossPercentage/100.f);
+      } else {
+         dred_frac = 12*st->silk_mode.packetLossPercentage/100.f; /* linear in the loss percentage up to 5% */
+      }
+      bitrate_offset = 12000;
+   }
+   /* Account for the fact that longer packets require less redundancy. */
+   dred_frac = dred_frac/(dred_frac + (1-dred_frac)*(frame_size*50.f)/st->Fs); /* frame_size*50/Fs equals 1 when the frame spans Fs/50 samples */
+   /* Approximate fit based on a few experiments. Could probably be improved. */
+   q0 = IMIN(15, IMAX(4, 51 - 3*EC_ILOG(IMAX(1, bitrate_bps-bitrate_offset)))); /* clamped to 4..15 */
+   dQ = bitrate_bps-bitrate_offset > 36000 ? 3 : 5;
+   qmax = 15;
+   target_dred_bitrate = IMAX(0, (int)(dred_frac*(bitrate_bps-bitrate_offset))); /* never negative, even when bitrate_bps is below the offset */
+   if (st->dred_duration > 0) {
+      opus_int32 target_bits = target_dred_bitrate*frame_size/st->Fs; /* the target rate expressed as bits for one frame */
+      max_dred_bits = estimate_dred_bitrate(q0, dQ, qmax, st->dred_duration, target_bits, &target_chunks);
+   } else { /* no DRED duration configured: nothing to spend */
+      max_dred_bits = 0;
+      target_chunks=0;
+   }
+   dred_bitrate = IMIN(target_dred_bitrate, max_dred_bits*st->Fs/frame_size); /* cap by the estimate, converted from bits per frame back to a rate */
+   /* If we can't afford enough bits, don't bother with DRED at all. */
+   if (target_chunks < 2)
+      dred_bitrate = 0;
+   st->dred_q0 = q0; /* the settings are stored even when dred_bitrate ends up 0 */
+   st->dred_dQ = dQ;
+   st->dred_qmax = qmax;
+   st->dred_target_chunks = target_chunks;
+   return dred_bitrate;
+}
+#endif
+
 static opus_int32 user_bitrate_to_bitrate(OpusEncoder *st, int frame_size, int max_data_bytes)
 {
   if(!frame_size)frame_size=st->Fs/400;
@@ -871,7 +964,7 @@ static opus_val32 compute_frame_energy(const opus_val16 *pcm, int frame_size, in
 
    /* Compute the right shift required in the MAC to avoid an overflow */
    max_shift = celt_ilog2(len);
-   shift = IMAX(0, (celt_ilog2(sample_max) << 1) + max_shift - 28);
+   shift = IMAX(0, (celt_ilog2(1+sample_max) << 1) + max_shift - 28);
 
    /* Compute the energy */
    for (i=0; i<len; i++)
@@ -892,149 +985,35 @@ static opus_val32 compute_frame_energy(const opus_val16 *pcm, int frame_size, in
 #endif
 
 /* Decides if DTX should be turned on (=1) or off (=0) */
-static int decide_dtx_mode(float activity_probability,    /* probability that current frame contains speech/music */
-                           int *nb_no_activity_frames,    /* number of consecutive frames with no activity */
-                           opus_val32 peak_signal_energy, /* peak energy of desired signal detected so far */
-                           const opus_val16 *pcm,         /* input pcm signal */
-                           int frame_size,                /* frame size */
-                           int channels,
-                           int is_silence,                 /* only digital silence detected in this frame */
-                           int arch
-                          )
-{
-   opus_val32 noise_energy;
-
-   if (!is_silence)
-   {
-      if (activity_probability < DTX_ACTIVITY_THRESHOLD)  /* is noise */
-      {
-         noise_energy = compute_frame_energy(pcm, frame_size, channels, arch);
-
-         /* but is sufficiently quiet */
-         is_silence = peak_signal_energy >= (PSEUDO_SNR_THRESHOLD * noise_energy);
-      }
-   }
+static int decide_dtx_mode(opus_int activity,            /* indicates if this frame contains speech/music */
+                           int *nb_no_activity_ms_Q1,    /* number of consecutive milliseconds with no activity, in Q1 */
+                           int frame_size_ms_Q1          /* number of milliseconds in this update, in Q1 */
+                           )
 
-   if (is_silence)
+{
+   if (!activity)
    {
-      /* The number of consecutive DTX frames should be within the allowed bounds */
-      (*nb_no_activity_frames)++;
-
-      if (*nb_no_activity_frames > NB_SPEECH_FRAMES_BEFORE_DTX)
+      /* The number of consecutive DTX frames should be within the allowed bounds.
+         Note that the allowed bound is defined in the SILK headers and assumes 20 ms
+         frames. As this function can be called with any frame length, a conversion to
+         milliseconds is done before the comparisons. */
+      (*nb_no_activity_ms_Q1) += frame_size_ms_Q1;
+      if (*nb_no_activity_ms_Q1 > NB_SPEECH_FRAMES_BEFORE_DTX*20*2)
       {
-         if (*nb_no_activity_frames <= (NB_SPEECH_FRAMES_BEFORE_DTX + MAX_CONSECUTIVE_DTX))
+         if (*nb_no_activity_ms_Q1 <= (NB_SPEECH_FRAMES_BEFORE_DTX + MAX_CONSECUTIVE_DTX)*20*2)
             /* Valid frame for DTX! */
             return 1;
          else
-            (*nb_no_activity_frames) = NB_SPEECH_FRAMES_BEFORE_DTX;
+            (*nb_no_activity_ms_Q1) = NB_SPEECH_FRAMES_BEFORE_DTX*20*2;
       }
    } else
-      (*nb_no_activity_frames) = 0;
+      (*nb_no_activity_ms_Q1) = 0;
 
    return 0;
 }
 
 #endif
 
-static opus_int32 encode_multiframe_packet(OpusEncoder *st,
-                                           const opus_val16 *pcm,
-                                           int nb_frames,
-                                           int frame_size,
-                                           unsigned char *data,
-                                           opus_int32 out_data_bytes,
-                                           int to_celt,
-                                           int lsb_depth,
-                                           int float_api)
-{
-   int i;
-   int ret = 0;
-   VARDECL(unsigned char, tmp_data);
-   int bak_mode, bak_bandwidth, bak_channels, bak_to_mono;
-   VARDECL(OpusRepacketizer, rp);
-   int max_header_bytes;
-   opus_int32 bytes_per_frame;
-   opus_int32 cbr_bytes;
-   opus_int32 repacketize_len;
-   int tmp_len;
-   ALLOC_STACK;
-
-   /* Worst cases:
-    * 2 frames: Code 2 with different compressed sizes
-    * >2 frames: Code 3 VBR */
-   max_header_bytes = nb_frames == 2 ? 3 : (2+(nb_frames-1)*2);
-
-   if (st->use_vbr || st->user_bitrate_bps==OPUS_BITRATE_MAX)
-      repacketize_len = out_data_bytes;
-   else {
-      cbr_bytes = 3*st->bitrate_bps/(3*8*st->Fs/(frame_size*nb_frames));
-      repacketize_len = IMIN(cbr_bytes, out_data_bytes);
-   }
-   bytes_per_frame = IMIN(1276, 1+(repacketize_len-max_header_bytes)/nb_frames);
-
-   ALLOC(tmp_data, nb_frames*bytes_per_frame, unsigned char);
-   ALLOC(rp, 1, OpusRepacketizer);
-   opus_repacketizer_init(rp);
-
-   bak_mode = st->user_forced_mode;
-   bak_bandwidth = st->user_bandwidth;
-   bak_channels = st->force_channels;
-
-   st->user_forced_mode = st->mode;
-   st->user_bandwidth = st->bandwidth;
-   st->force_channels = st->stream_channels;
-
-   bak_to_mono = st->silk_mode.toMono;
-   if (bak_to_mono)
-      st->force_channels = 1;
-   else
-      st->prev_channels = st->stream_channels;
-
-   for (i=0;i<nb_frames;i++)
-   {
-      st->silk_mode.toMono = 0;
-      st->nonfinal_frame = i<(nb_frames-1);
-
-      /* When switching from SILK/Hybrid to CELT, only ask for a switch at the last frame */
-      if (to_celt && i==nb_frames-1)
-         st->user_forced_mode = MODE_CELT_ONLY;
-
-      tmp_len = opus_encode_native(st, pcm+i*(st->channels*frame_size), frame_size,
-         tmp_data+i*bytes_per_frame, bytes_per_frame, lsb_depth, NULL, 0, 0, 0, 0,
-         NULL, float_api);
-
-      if (tmp_len<0)
-      {
-         RESTORE_STACK;
-         return OPUS_INTERNAL_ERROR;
-      }
-
-      ret = opus_repacketizer_cat(rp, tmp_data+i*bytes_per_frame, tmp_len);
-
-      if (ret<0)
-      {
-         RESTORE_STACK;
-         return OPUS_INTERNAL_ERROR;
-      }
-   }
-
-   ret = opus_repacketizer_out_range_impl(rp, 0, nb_frames, data, repacketize_len, 0, !st->use_vbr);
-
-   if (ret<0)
-   {
-      RESTORE_STACK;
-      return OPUS_INTERNAL_ERROR;
-   }
-
-   /* Discard configs that were forced locally for the purpose of repacketization */
-   st->user_forced_mode = bak_mode;
-   st->user_bandwidth = bak_bandwidth;
-   st->force_channels = bak_channels;
-   st->silk_mode.toMono = bak_to_mono;
-
-   RESTORE_STACK;
-   return ret;
-}
-
 static int compute_redundancy_bytes(opus_int32 max_data_bytes, opus_int32 bitrate_bps, int frame_rate, int channels)
 {
    int redundancy_bytes_cap;
@@ -1063,6 +1042,18 @@ static int compute_redundancy_bytes(opus_int32 max_data_bytes, opus_int32 bitrat
    return redundancy_bytes;
 }
 
+static opus_int32 opus_encode_frame_native(OpusEncoder *st, const opus_val16 *pcm, int frame_size,
+                unsigned char *data, opus_int32 max_data_bytes,
+                int float_api, int first_frame,
+#ifdef ENABLE_DRED
+                opus_int32 dred_bitrate_bps,
+#endif
+#ifndef DISABLE_FLOAT_API
+                AnalysisInfo *analysis_info, int is_silence,
+#endif
+                int redundancy, int celt_to_silk, int prefill,
+                opus_int32 equiv_rate, int to_celt);
+
 opus_int32 opus_encode_native(OpusEncoder *st, const opus_val16 *pcm, int frame_size,
                 unsigned char *data, opus_int32 out_data_bytes, int lsb_depth,
                 const void *analysis_pcm, opus_int32 analysis_size, int c1, int c2,
@@ -1072,28 +1063,17 @@ opus_int32 opus_encode_native(OpusEncoder *st, const opus_val16 *pcm, int frame_
     CELTEncoder *celt_enc;
     int i;
     int ret=0;
-    opus_int32 nBytes;
-    ec_enc enc;
-    int bytes_target;
     int prefill=0;
-    int start_band = 0;
     int redundancy = 0;
-    int redundancy_bytes = 0; /* Number of bytes to use for redundancy frame */
     int celt_to_silk = 0;
-    VARDECL(opus_val16, pcm_buf);
-    int nb_compr_bytes;
     int to_celt = 0;
-    opus_uint32 redundant_rng = 0;
-    int cutoff_Hz, hp_freq_smth1;
     int voice_est; /* Probability of voice in Q7 */
     opus_int32 equiv_rate;
-    int delay_compensation;
     int frame_rate;
     opus_int32 max_rate; /* Max bitrate we're allowed to use */
     int curr_bandwidth;
-    opus_val16 HB_gain;
     opus_int32 max_data_bytes; /* Max number of bytes we're allowed to use */
-    int total_buffer;
+    opus_int32 cbr_bytes=-1;
     opus_val16 stereo_width;
     const CELTMode *celt_mode;
 #ifndef DISABLE_FLOAT_API
@@ -1102,8 +1082,9 @@ opus_int32 opus_encode_native(OpusEncoder *st, const opus_val16 *pcm, int frame_
     int analysis_read_subframe_bak=-1;
     int is_silence = 0;
 #endif
-    VARDECL(opus_val16, tmp_prefill);
-
+#ifdef ENABLE_DRED
+    opus_int32 dred_bitrate_bps;
+#endif
     ALLOC_STACK;
 
     max_data_bytes = IMIN(1276, out_data_bytes);
@@ -1124,10 +1105,6 @@ opus_int32 opus_encode_native(OpusEncoder *st, const opus_val16 *pcm, int frame_
 
     silk_enc = (char*)st+st->silk_enc_offset;
     celt_enc = (CELTEncoder*)((char*)st+st->celt_enc_offset);
-    if (st->application == OPUS_APPLICATION_RESTRICTED_LOWDELAY)
-       delay_compensation = 0;
-    else
-       delay_compensation = st->delay_compensation;
 
     lsb_depth = IMIN(lsb_depth, st->lsb_depth);
 
@@ -1205,21 +1182,24 @@ opus_int32 opus_encode_native(OpusEncoder *st, const opus_val16 *pcm, int frame_
        stereo_width = compute_stereo_width(pcm, frame_size, st->Fs, &st->width_mem);
     else
        stereo_width = 0;
-    total_buffer = delay_compensation;
     st->bitrate_bps = user_bitrate_to_bitrate(st, frame_size, max_data_bytes);
 
     frame_rate = st->Fs/frame_size;
     if (!st->use_vbr)
     {
-       int cbrBytes;
        /* Multiply by 12 to make sure the division is exact. */
        int frame_rate12 = 12*st->Fs/frame_size;
        /* We need to make sure that "int" values always fit in 16 bits. */
-       cbrBytes = IMIN( (12*st->bitrate_bps/8 + frame_rate12/2)/frame_rate12, max_data_bytes);
-       st->bitrate_bps = cbrBytes*(opus_int32)frame_rate12*8/12;
+       cbr_bytes = IMIN( (12*st->bitrate_bps/8 + frame_rate12/2)/frame_rate12, max_data_bytes);
+       st->bitrate_bps = cbr_bytes*(opus_int32)frame_rate12*8/12;
        /* Make sure we provide at least one byte to avoid failing. */
-       max_data_bytes = IMAX(1, cbrBytes);
+       max_data_bytes = IMAX(1, cbr_bytes);
     }
+#ifdef ENABLE_DRED
+    /* Allocate some of the bits to DRED if needed. */
+    dred_bitrate_bps = compute_dred_bitrate(st, st->bitrate_bps, frame_size);
+    st->bitrate_bps -= dred_bitrate_bps;
+#endif
     if (max_data_bytes<3 || st->bitrate_bps < 3*frame_rate*8
        || (frame_rate<50 && (max_data_bytes*frame_rate<300 || st->bitrate_bps < 2400)))
     {
@@ -1313,6 +1293,8 @@ opus_int32 opus_encode_native(OpusEncoder *st, const opus_val16 *pcm, int frame_
         st->stream_channels = st->force_channels;
     } else {
 #ifdef FUZZING
+        (void)stereo_music_threshold;
+        (void)stereo_voice_threshold;
        /* Random mono/stereo decision */
        if (st->channels == 2 && (rand()&0x1F)==0)
           st->stream_channels = 3-st->stream_channels;
@@ -1351,6 +1333,8 @@ opus_int32 opus_encode_native(OpusEncoder *st, const opus_val16 *pcm, int frame_
     } else if (st->user_forced_mode == OPUS_AUTO)
     {
 #ifdef FUZZING
+        (void)stereo_width;
+        (void)mode_thresholds;
        /* Random mode switching */
        if ((rand()&0xF)==0)
        {
@@ -1388,8 +1372,9 @@ opus_int32 opus_encode_native(OpusEncoder *st, const opus_val16 *pcm, int frame_
 
        st->mode = (equiv_rate >= threshold) ? MODE_CELT_ONLY: MODE_SILK_ONLY;
 
-       /* When FEC is enabled and there's enough packet loss, use SILK */
-       if (st->silk_mode.useInBandFEC && st->silk_mode.packetLossPercentage > (128-voice_est)>>4)
+       /* When FEC is enabled and there's enough packet loss, use SILK,
+          unless FEC is set to 2, in which case we don't switch to SILK if we're confident we have music. */
+       if (st->silk_mode.useInBandFEC && st->silk_mode.packetLossPercentage > (128-voice_est)>>4 && (st->fec_config != 2 || voice_est > 25))
           st->mode = MODE_SILK_ONLY;
        /* When encoding voice and DTX is enabled but the generalized DTX cannot be used,
           use SILK in order to make use of its DTX. */
@@ -1568,6 +1553,15 @@ opus_int32 opus_encode_native(OpusEncoder *st, const opus_val16 *pcm, int frame_
     {
        int enc_frame_size;
        int nb_frames;
+       VARDECL(unsigned char, tmp_data);
+       VARDECL(OpusRepacketizer, rp);
+       int max_header_bytes;
+       opus_int32 repacketize_len;
+       opus_int32 max_len_sum;
+       opus_int32 tot_size=0;
+       unsigned char *curr_data;
+       int tmp_len;
+       int dtx_count = 0;
 
        if (st->mode == MODE_SILK_ONLY)
        {
@@ -1586,17 +1580,186 @@ opus_int32 opus_encode_native(OpusEncoder *st, const opus_val16 *pcm, int frame_
 #ifndef DISABLE_FLOAT_API
        if (analysis_read_pos_bak!= -1)
        {
+          /* Reset analysis position to the beginning of the first frame so we
+             can use it one frame at a time. */
           st->analysis.read_pos = analysis_read_pos_bak;
           st->analysis.read_subframe = analysis_read_subframe_bak;
        }
 #endif
 
-       ret = encode_multiframe_packet(st, pcm, nb_frames, enc_frame_size, data,
-                                      out_data_bytes, to_celt, lsb_depth, float_api);
+       /* Worst cases:
+        * 2 frames: Code 2 with different compressed sizes
+        * >2 frames: Code 3 VBR */
+       max_header_bytes = nb_frames == 2 ? 3 : (2+(nb_frames-1)*2);
+
+       if (st->use_vbr || st->user_bitrate_bps==OPUS_BITRATE_MAX)
+          repacketize_len = out_data_bytes;
+       else {
+          celt_assert(cbr_bytes>=0);
+          repacketize_len = IMIN(cbr_bytes, out_data_bytes);
+       }
+       max_len_sum = nb_frames + repacketize_len - max_header_bytes;
+
+       ALLOC(tmp_data, max_len_sum, unsigned char);
+       curr_data = tmp_data;
+       ALLOC(rp, 1, OpusRepacketizer);
+       opus_repacketizer_init(rp);
 
+
+       int bak_to_mono = st->silk_mode.toMono;
+       if (bak_to_mono)
+          st->force_channels = 1;
+       else
+          st->prev_channels = st->stream_channels;
+
+       for (i=0;i<nb_frames;i++)
+       {
+          int first_frame;
+          int frame_to_celt;
+          int frame_redundancy;
+          opus_int32 curr_max;
+          /* Attempt DRED encoding until we have a non-DTX frame. In case of DTX refresh,
+             that allows for DRED not to be in the first frame. */
+          first_frame = (i == 0) || (i == dtx_count);
+          st->silk_mode.toMono = 0;
+          st->nonfinal_frame = i<(nb_frames-1);
+
+          /* When switching from SILK/Hybrid to CELT, only ask for a switch at the last frame */
+          frame_to_celt = to_celt && i==nb_frames-1;
+          frame_redundancy = redundancy && (frame_to_celt || (!to_celt && i==0));
+
+          curr_max = IMIN(3*st->bitrate_bps/(3*8*st->Fs/enc_frame_size), max_len_sum/nb_frames);
+#ifdef ENABLE_DRED
+          curr_max = IMIN(curr_max, (max_len_sum-3*dred_bitrate_bps/(3*8*st->Fs/frame_size))/nb_frames);
+          if (first_frame) curr_max += 3*dred_bitrate_bps/(3*8*st->Fs/frame_size);
+#endif
+          curr_max = IMIN(max_len_sum-tot_size, curr_max);
+#ifndef DISABLE_FLOAT_API
+          if (analysis_read_pos_bak != -1) {
+            is_silence = is_digital_silence(pcm, frame_size, st->channels, lsb_depth);
+            /* Get analysis for current frame. */
+            tonality_get_info(&st->analysis, &analysis_info, enc_frame_size);
+          }
+#endif
+
+          tmp_len = opus_encode_frame_native(st, pcm+i*(st->channels*enc_frame_size), enc_frame_size, curr_data, curr_max, float_api, first_frame,
+#ifdef ENABLE_DRED
+          dred_bitrate_bps,
+#endif
+#ifndef DISABLE_FLOAT_API
+          &analysis_info,
+          is_silence,
+#endif
+                    frame_redundancy, celt_to_silk, prefill,
+                    equiv_rate, frame_to_celt
+              );
+          if (tmp_len<0)
+          {
+             RESTORE_STACK;
+             return OPUS_INTERNAL_ERROR;
+          } else if (tmp_len==1) {
+             dtx_count++;
+          }
+          ret = opus_repacketizer_cat(rp, curr_data, tmp_len);
+
+          if (ret<0)
+          {
+             RESTORE_STACK;
+             return OPUS_INTERNAL_ERROR;
+          }
+          tot_size += tmp_len;
+          curr_data += tmp_len;
+       }
+       ret = opus_repacketizer_out_range_impl(rp, 0, nb_frames, data, repacketize_len, 0, !st->use_vbr && (dtx_count != nb_frames), NULL, 0);
+       if (ret<0)
+       {
+          ret = OPUS_INTERNAL_ERROR;
+       }
+       st->silk_mode.toMono = bak_to_mono;
        RESTORE_STACK;
        return ret;
+    } else {
+      ret = opus_encode_frame_native(st, pcm, frame_size, data, max_data_bytes, float_api, 1,
+#ifdef ENABLE_DRED
+                dred_bitrate_bps,
+#endif
+#ifndef DISABLE_FLOAT_API
+                &analysis_info,
+                is_silence,
+#endif
+                redundancy, celt_to_silk, prefill,
+                equiv_rate, to_celt
+          );
+      RESTORE_STACK;
+      return ret;
     }
+}
+
+static opus_int32 opus_encode_frame_native(OpusEncoder *st, const opus_val16 *pcm, int frame_size,
+                unsigned char *data, opus_int32 max_data_bytes,
+                int float_api, int first_frame,
+#ifdef ENABLE_DRED
+                opus_int32 dred_bitrate_bps,
+#endif
+#ifndef DISABLE_FLOAT_API
+                AnalysisInfo *analysis_info, int is_silence,
+#endif
+                int redundancy, int celt_to_silk, int prefill,
+                opus_int32 equiv_rate, int to_celt)
+{
+    void *silk_enc;
+    CELTEncoder *celt_enc;
+    const CELTMode *celt_mode;
+    int i;
+    int ret=0;
+    opus_int32 nBytes;
+    ec_enc enc;
+    int bytes_target;
+    int start_band = 0;
+    int redundancy_bytes = 0; /* Number of bytes to use for redundancy frame */
+    int nb_compr_bytes;
+    opus_uint32 redundant_rng = 0;
+    int cutoff_Hz;
+    int hp_freq_smth1;
+    opus_val16 HB_gain;
+    int apply_padding;
+    int frame_rate;
+    int curr_bandwidth;
+    int delay_compensation;
+    int total_buffer;
+    opus_int activity = VAD_NO_DECISION;
+    VARDECL(opus_val16, pcm_buf);
+    VARDECL(opus_val16, tmp_prefill);
+    SAVE_STACK;
+
+    st->rangeFinal = 0;
+    silk_enc = (char*)st+st->silk_enc_offset;
+    celt_enc = (CELTEncoder*)((char*)st+st->celt_enc_offset);
+    celt_encoder_ctl(celt_enc, CELT_GET_MODE(&celt_mode));
+    curr_bandwidth = st->bandwidth;
+    if (st->application == OPUS_APPLICATION_RESTRICTED_LOWDELAY)
+       delay_compensation = 0;
+    else
+       delay_compensation = st->delay_compensation;
+    total_buffer = delay_compensation;
+
+    frame_rate = st->Fs/frame_size;
+
+#ifndef DISABLE_FLOAT_API
+    if (is_silence)
+    {
+       activity = !is_silence;
+    } else if (analysis_info->valid)
+    {
+       activity = analysis_info->activity_probability >= DTX_ACTIVITY_THRESHOLD;
+       if (!activity)
+       {
+           /* Mark as active if this noise frame is sufficiently loud */
+           opus_val32 noise_energy = compute_frame_energy(pcm, frame_size, st->channels, st->arch);
+           activity = st->peak_signal_energy < (PSEUDO_SNR_THRESHOLD * noise_energy);
+       }
+    }
+#endif
 
     /* For the first frame at a new SILK bandwidth */
     if (st->silk_bw_switch)
@@ -1604,7 +1767,7 @@ opus_int32 opus_encode_native(OpusEncoder *st, const opus_val16 *pcm, int frame_
        redundancy = 1;
        celt_to_silk = 1;
        st->silk_bw_switch = 0;
-       /* Do a prefill without reseting the sampling rate control. */
+       /* Do a prefill without resetting the sampling rate control. */
        prefill=2;
     }
 
@@ -1644,6 +1807,25 @@ opus_int32 opus_encode_native(OpusEncoder *st, const opus_val16 *pcm, int frame_
     if (st->application == OPUS_APPLICATION_VOIP)
     {
        hp_cutoff(pcm, cutoff_Hz, &pcm_buf[total_buffer*st->channels], st->hp_mem, frame_size, st->channels, st->Fs, st->arch);
+
+#ifdef ENABLE_OSCE_TRAINING_DATA
+       /* write out high pass filtered clean signal*/
+       static FILE *fout =NULL;
+       if (fout == NULL)
+       {
+         fout = fopen("clean_hp.s16", "wb");
+       }
+
+       {
+         int idx;
+         opus_int16 tmp;
+         for (idx = 0; idx < frame_size; idx++)
+         {
+            tmp = (opus_int16) (32768 * pcm_buf[total_buffer + idx] + 0.5f);
+            fwrite(&tmp, sizeof(tmp), 1, fout);
+         }
+       }
+#endif
     } else {
        dc_reject(pcm, 3, &pcm_buf[total_buffer*st->channels], st->hp_mem, frame_size, st->channels, st->Fs);
     }
@@ -1660,15 +1842,30 @@ opus_int32 opus_encode_native(OpusEncoder *st, const opus_val16 *pcm, int frame_
           st->hp_mem[0] = st->hp_mem[1] = st->hp_mem[2] = st->hp_mem[3] = 0;
        }
     }
+#else
+    (void)float_api;
 #endif
 
+#ifdef ENABLE_DRED
+    if ( st->dred_duration > 0 && st->dred_encoder.loaded ) {
+        int frame_size_400Hz;
+        /* DRED Encoder */
+        dred_compute_latents( &st->dred_encoder, &pcm_buf[total_buffer*st->channels], frame_size, total_buffer, st->arch );
+        frame_size_400Hz = frame_size*400/st->Fs;
+        OPUS_MOVE(&st->activity_mem[frame_size_400Hz], st->activity_mem, 4*DRED_MAX_FRAMES-frame_size_400Hz);
+        for (i=0;i<frame_size_400Hz;i++)
+           st->activity_mem[i] = activity;
+    } else {
+        st->dred_encoder.latents_buffer_fill = 0;
+        OPUS_CLEAR(st->activity_mem, DRED_MAX_FRAMES);
+    }
+#endif
 
     /* SILK processing */
     HB_gain = Q15ONE;
     if (st->mode != MODE_CELT_ONLY)
     {
         opus_int32 total_bitRate, celt_rate;
-        opus_int activity;
 #ifdef FIXED_POINT
        const opus_int16 *pcm_silk;
 #else
@@ -1676,14 +1873,6 @@ opus_int32 opus_encode_native(OpusEncoder *st, const opus_val16 *pcm, int frame_
        ALLOC(pcm_silk, st->channels*frame_size, opus_int16);
 #endif
 
-        activity = VAD_NO_DECISION;
-#ifndef DISABLE_FLOAT_API
-        if( analysis_info.valid ) {
-            /* Inform SILK about the Opus VAD decision */
-            activity = ( analysis_info.activity_probability >= DTX_ACTIVITY_THRESHOLD );
-        }
-#endif
-
         /* Distribute bits between SILK and CELT */
         total_bitRate = 8 * bytes_target * frame_rate;
         if( st->mode == MODE_HYBRID ) {
@@ -1765,7 +1954,7 @@ opus_int32 opus_encode_native(OpusEncoder *st, const opus_val16 *pcm, int frame_
         st->silk_mode.maxInternalSampleRate = 16000;
         if (st->mode == MODE_SILK_ONLY)
         {
-           opus_int32 effective_max_rate = max_rate;
+           opus_int32 effective_max_rate = frame_rate*max_data_bytes*8;
            if (frame_rate > 50)
               effective_max_rate = effective_max_rate*2/3;
            if (effective_max_rate < 8000)
@@ -1795,9 +1984,19 @@ opus_int32 opus_encode_native(OpusEncoder *st, const opus_val16 *pcm, int frame_
         }
         if (st->silk_mode.useCBR)
         {
+           /* When we're in CBR mode, but we have non-SILK data to encode, switch SILK to VBR with cap to
+              save on complexity. Any variations will be absorbed by CELT and/or DRED and we can still
+              produce a constant bitrate without wasting bits. */
+#ifdef ENABLE_DRED
+           if (st->mode == MODE_HYBRID || dred_bitrate_bps > 0)
+#else
            if (st->mode == MODE_HYBRID)
+#endif
            {
-              st->silk_mode.maxBits = IMIN(st->silk_mode.maxBits, st->silk_mode.bitRate * frame_size / st->Fs);
+              /* Allow SILK to steal up to 25% of the remaining bits */
+              opus_int16 other_bits = IMAX(0, st->silk_mode.maxBits - st->silk_mode.bitRate * frame_size / st->Fs);
+              st->silk_mode.maxBits = IMAX(0, st->silk_mode.maxBits - other_bits*3/4);
+              st->silk_mode.useCBR = 0;
            }
         } else {
            /* Constrained VBR. */
@@ -1910,26 +2109,10 @@ opus_int32 opus_encode_native(OpusEncoder *st, const opus_val16 *pcm, int frame_
     if (st->mode != MODE_SILK_ONLY)
     {
         opus_val32 celt_pred=2;
-        celt_encoder_ctl(celt_enc, OPUS_SET_VBR(0));
         /* We may still decide to disable prediction later */
         if (st->silk_mode.reducedDependency)
            celt_pred = 0;
         celt_encoder_ctl(celt_enc, CELT_SET_PREDICTION(celt_pred));
-
-        if (st->mode == MODE_HYBRID)
-        {
-            if( st->use_vbr ) {
-                celt_encoder_ctl(celt_enc, OPUS_SET_BITRATE(st->bitrate_bps-st->silk_mode.bitRate));
-                celt_encoder_ctl(celt_enc, OPUS_SET_VBR_CONSTRAINT(0));
-            }
-        } else {
-            if (st->use_vbr)
-            {
-                celt_encoder_ctl(celt_enc, OPUS_SET_VBR(1));
-                celt_encoder_ctl(celt_enc, OPUS_SET_VBR_CONSTRAINT(st->vbr_constraint));
-                celt_encoder_ctl(celt_enc, OPUS_SET_BITRATE(st->bitrate_bps));
-            }
-        }
     }
 
     ALLOC(tmp_prefill, st->channels*st->Fs/400, opus_val16);
@@ -2023,13 +2206,27 @@ opus_int32 opus_encode_native(OpusEncoder *st, const opus_val16 *pcm, int frame_
         ec_enc_done(&enc);
         nb_compr_bytes = ret;
     } else {
-       nb_compr_bytes = (max_data_bytes-1)-redundancy_bytes;
-       ec_enc_shrink(&enc, nb_compr_bytes);
+        nb_compr_bytes = (max_data_bytes-1)-redundancy_bytes;
+#ifdef ENABLE_DRED
+        if (st->dred_duration > 0)
+        {
+            int max_celt_bytes;
+            opus_int32 dred_bytes = dred_bitrate_bps/(frame_rate*8);
+            /* Allow CELT to steal up to 25% of the remaining bits. */
+            max_celt_bytes = nb_compr_bytes - dred_bytes*3/4;
+            /* But try to give CELT at least 5 bytes to prevent a mismatch with
+               the redundancy signaling. */
+            max_celt_bytes = IMAX((ec_tell(&enc)+7)/8 + 5, max_celt_bytes);
+            /* Subject to the original max. */
+            nb_compr_bytes = IMIN(nb_compr_bytes, max_celt_bytes);
+        }
+#endif
+        ec_enc_shrink(&enc, nb_compr_bytes);
     }
 
 #ifndef DISABLE_FLOAT_API
     if (redundancy || st->mode != MODE_SILK_ONLY)
-       celt_encoder_ctl(celt_enc, CELT_SET_ANALYSIS(&analysis_info));
+       celt_encoder_ctl(celt_enc, CELT_SET_ANALYSIS(analysis_info));
 #endif
     if (st->mode == MODE_HYBRID) {
        SILKInfo info;
@@ -2059,6 +2256,34 @@ opus_int32 opus_encode_native(OpusEncoder *st, const opus_val16 *pcm, int frame_
 
     if (st->mode != MODE_SILK_ONLY)
     {
+        celt_encoder_ctl(celt_enc, OPUS_SET_VBR(st->use_vbr));
+        if (st->mode == MODE_HYBRID)
+        {
+            if( st->use_vbr ) {
+                celt_encoder_ctl(celt_enc, OPUS_SET_BITRATE(st->bitrate_bps-st->silk_mode.bitRate));
+                celt_encoder_ctl(celt_enc, OPUS_SET_VBR_CONSTRAINT(0));
+            }
+        } else {
+            if (st->use_vbr)
+            {
+                celt_encoder_ctl(celt_enc, OPUS_SET_VBR(1));
+                celt_encoder_ctl(celt_enc, OPUS_SET_VBR_CONSTRAINT(st->vbr_constraint));
+                celt_encoder_ctl(celt_enc, OPUS_SET_BITRATE(st->bitrate_bps));
+            }
+        }
+#ifdef ENABLE_DRED
+        /* When using DRED in CBR mode, we can actually make the CELT part VBR and have DRED pick up the slack. */
+        if (!st->use_vbr && st->dred_duration > 0)
+        {
+            opus_int32 celt_bitrate = st->bitrate_bps;
+            celt_encoder_ctl(celt_enc, OPUS_SET_VBR(1));
+            celt_encoder_ctl(celt_enc, OPUS_SET_VBR_CONSTRAINT(0));
+            if (st->mode == MODE_HYBRID) {
+                celt_bitrate -= st->silk_mode.bitRate;
+            }
+            celt_encoder_ctl(celt_enc, OPUS_SET_BITRATE(celt_bitrate));
+        }
+#endif
         if (st->mode != st->prev_mode && st->prev_mode > 0)
         {
            unsigned char dummy[2];
@@ -2071,10 +2296,6 @@ opus_int32 opus_encode_native(OpusEncoder *st, const opus_val16 *pcm, int frame_
         /* If false, we already busted the budget and we'll end up with a "PLC frame" */
         if (ec_tell(&enc) <= 8*nb_compr_bytes)
         {
-           /* Set the bitrate again if it was overridden in the redundancy code above*/
-           if (redundancy && celt_to_silk && st->mode==MODE_HYBRID && st->use_vbr)
-              celt_encoder_ctl(celt_enc, OPUS_SET_BITRATE(st->bitrate_bps-st->silk_mode.bitRate));
-           celt_encoder_ctl(celt_enc, OPUS_SET_VBR(st->use_vbr));
            ret = celt_encode_with_ec(celt_enc, pcm_buf, frame_size, NULL, nb_compr_bytes, &enc);
            if (ret < 0)
            {
@@ -2082,10 +2303,10 @@ opus_int32 opus_encode_native(OpusEncoder *st, const opus_val16 *pcm, int frame_
               return OPUS_INTERNAL_ERROR;
            }
            /* Put CELT->SILK redundancy data in the right place. */
-           if (redundancy && celt_to_silk && st->mode==MODE_HYBRID && st->use_vbr)
+           if (redundancy && celt_to_silk && st->mode==MODE_HYBRID && nb_compr_bytes != ret)
            {
               OPUS_MOVE(data+ret, data+nb_compr_bytes, redundancy_bytes);
-              nb_compr_bytes = nb_compr_bytes+redundancy_bytes;
+              nb_compr_bytes = ret+redundancy_bytes;
            }
         }
     }
@@ -2142,10 +2363,9 @@ opus_int32 opus_encode_native(OpusEncoder *st, const opus_val16 *pcm, int frame_
 
     /* DTX decision */
 #ifndef DISABLE_FLOAT_API
-    if (st->use_dtx && (analysis_info.valid || is_silence))
+    if (st->use_dtx && (analysis_info->valid || is_silence))
     {
-       if (decide_dtx_mode(analysis_info.activity_probability, &st->nb_no_activity_frames,
-             st->peak_signal_energy, pcm, frame_size, st->channels, is_silence, st->arch))
+       if (decide_dtx_mode(activity, &st->nb_no_activity_ms_Q1, 2*1000*frame_size/st->Fs))
        {
           st->rangeFinal = 0;
           data[0] = gen_toc(st->mode, st->Fs/frame_size, curr_bandwidth, st->stream_channels);
@@ -2153,7 +2373,7 @@ opus_int32 opus_encode_native(OpusEncoder *st, const opus_val16 *pcm, int frame_
           return 1;
        }
     } else {
-       st->nb_no_activity_frames = 0;
+       st->nb_no_activity_ms_Q1 = 0;
     }
 #endif
 
@@ -2181,7 +2401,51 @@ opus_int32 opus_encode_native(OpusEncoder *st, const opus_val16 *pcm, int frame_
     }
     /* Count ToC and redundancy */
     ret += 1+redundancy_bytes;
-    if (!st->use_vbr)
+    apply_padding = !st->use_vbr;
+#ifdef ENABLE_DRED
+    if (st->dred_duration > 0 && st->dred_encoder.loaded && first_frame) {
+       opus_extension_data extension;
+       unsigned char buf[DRED_MAX_DATA_SIZE];
+       int dred_chunks;
+       int dred_bytes_left;
+       dred_chunks = IMIN((st->dred_duration+5)/4, DRED_NUM_REDUNDANCY_FRAMES/2);
+       if (st->use_vbr) dred_chunks = IMIN(dred_chunks, st->dred_target_chunks);
+       /* Remaining space for DRED, accounting for the cost of the 3 extra bytes for code 3, padding length, and extension number. */
+       dred_bytes_left = IMIN(DRED_MAX_DATA_SIZE, max_data_bytes-ret-3);
+       /* Account for the extra bytes required to signal large padding length. */
+       dred_bytes_left -= (dred_bytes_left+1+DRED_EXPERIMENTAL_BYTES)/255;
+       /* Check whether we actually have something to encode. */
+       if (dred_chunks >= 1 && dred_bytes_left >= DRED_MIN_BYTES+DRED_EXPERIMENTAL_BYTES) {
+           int dred_bytes;
+#ifdef DRED_EXPERIMENTAL_VERSION
+           /* Add temporary extension type and version.
+              These bytes will be removed once extension is finalized. */
+           buf[0] = 'D';
+           buf[1] = DRED_EXPERIMENTAL_VERSION;
+#endif
+           dred_bytes = dred_encode_silk_frame(&st->dred_encoder, buf+DRED_EXPERIMENTAL_BYTES, dred_chunks, dred_bytes_left-DRED_EXPERIMENTAL_BYTES,
+                                               st->dred_q0, st->dred_dQ, st->dred_qmax, st->activity_mem, st->arch);
+           if (dred_bytes > 0) {
+              dred_bytes += DRED_EXPERIMENTAL_BYTES;
+              celt_assert(dred_bytes <= dred_bytes_left);
+              extension.id = DRED_EXTENSION_ID;
+              extension.frame = 0;
+              extension.data = buf;
+              extension.len = dred_bytes;
+              ret = opus_packet_pad_impl(data, ret, max_data_bytes, !st->use_vbr, &extension, 1);
+              if (ret < 0)
+              {
+                 RESTORE_STACK;
+                 return OPUS_INTERNAL_ERROR;
+              }
+              apply_padding = 0;
+           }
+       }
+    }
+#else
+    (void)first_frame; /* Avoids a warning about first_frame being unused. */
+#endif
+    if (apply_padding)
     {
        if (opus_packet_pad(data, ret, max_data_bytes) != OPUS_OK)
        {
@@ -2448,11 +2712,12 @@ int opus_encoder_ctl(OpusEncoder *st, int request, ...)
         case OPUS_SET_INBAND_FEC_REQUEST:
         {
             opus_int32 value = va_arg(ap, opus_int32);
-            if(value<0 || value>1)
+            if(value<0 || value>2)
             {
                goto bad_arg;
             }
-            st->silk_mode.useInBandFEC = value;
+            st->fec_config = value;
+            st->silk_mode.useInBandFEC = (value != 0);
         }
         break;
         case OPUS_GET_INBAND_FEC_REQUEST:
@@ -2462,7 +2727,7 @@ int opus_encoder_ctl(OpusEncoder *st, int request, ...)
             {
                goto bad_arg;
             }
-            *value = st->silk_mode.useInBandFEC;
+            *value = st->fec_config;
         }
         break;
         case OPUS_SET_PACKET_LOSS_PERC_REQUEST:
@@ -2679,6 +2944,29 @@ int opus_encoder_ctl(OpusEncoder *st, int request, ...)
             celt_encoder_ctl(celt_enc, OPUS_GET_PHASE_INVERSION_DISABLED(value));
         }
         break;
+#ifdef ENABLE_DRED
+        case OPUS_SET_DRED_DURATION_REQUEST:
+        {
+            opus_int32 value = va_arg(ap, opus_int32);
+            if(value<0 || value>DRED_MAX_FRAMES)
+            {
+               goto bad_arg;
+            }
+            st->dred_duration = value;
+            st->silk_mode.useDRED = !!value;
+        }
+        break;
+        case OPUS_GET_DRED_DURATION_REQUEST:
+        {
+            opus_int32 *value = va_arg(ap, opus_int32*);
+            if (!value)
+            {
+               goto bad_arg;
+            }
+            *value = st->dred_duration;
+        }
+        break;
+#endif
         case OPUS_RESET_STATE:
         {
            void *silk_enc;
@@ -2694,6 +2982,10 @@ int opus_encoder_ctl(OpusEncoder *st, int request, ...)
 
            celt_encoder_ctl(celt_enc, OPUS_RESET_STATE);
            silk_InitEncoder( silk_enc, st->arch, &dummy );
+#ifdef ENABLE_DRED
+           /* Initialize DRED Encoder */
+           dred_encoder_reset( &st->dred_encoder );
+#endif
            st->stream_channels = st->channels;
            st->hybrid_stereo_width_Q14 = 1 << 14;
            st->prev_HB_gain = Q15ONE;
@@ -2736,17 +3028,17 @@ int opus_encoder_ctl(OpusEncoder *st, int request, ...)
             }
             if (st->silk_mode.useDTX && (st->prev_mode == MODE_SILK_ONLY || st->prev_mode == MODE_HYBRID)) {
                 /* DTX determined by Silk. */
-                int n;
-                void *silk_enc = (char*)st+st->silk_enc_offset;
-                *value = 1;
-                for (n=0;n<st->silk_mode.nChannelsInternal;n++) {
-                    *value = *value && ((silk_encoder*)silk_enc)->state_Fxx[n].sCmn.noSpeechCounter >= NB_SPEECH_FRAMES_BEFORE_DTX;
+                silk_encoder *silk_enc = (silk_encoder*)(void *)((char*)st+st->silk_enc_offset);
+                *value = silk_enc->state_Fxx[0].sCmn.noSpeechCounter >= NB_SPEECH_FRAMES_BEFORE_DTX;
+                /* Stereo: check second channel unless only the middle channel was encoded. */
+                if(*value == 1 && st->silk_mode.nChannelsInternal == 2 && silk_enc->prev_decode_only_middle == 0) {
+                    *value = silk_enc->state_Fxx[1].sCmn.noSpeechCounter >= NB_SPEECH_FRAMES_BEFORE_DTX;
                 }
             }
 #ifndef DISABLE_FLOAT_API
             else if (st->use_dtx) {
                 /* DTX determined by Opus. */
-                *value = st->nb_no_activity_frames >= NB_SPEECH_FRAMES_BEFORE_DTX;
+                *value = st->nb_no_activity_ms_Q1 >= NB_SPEECH_FRAMES_BEFORE_DTX*20*2;
             }
 #endif
             else {
@@ -2754,7 +3046,21 @@ int opus_encoder_ctl(OpusEncoder *st, int request, ...)
             }
         }
         break;
-
+#ifdef USE_WEIGHTS_FILE
+        case OPUS_SET_DNN_BLOB_REQUEST:
+        {
+            const unsigned char *data = va_arg(ap, const unsigned char *);
+            opus_int32 len = va_arg(ap, opus_int32);
+            if(len<0 || data == NULL)
+            {
+               goto bad_arg;
+            }
+#ifdef ENABLE_DRED
+            ret = dred_encoder_load_model(&st->dred_encoder, data, len);
+#endif
+        }
+        break;
+#endif
         case CELT_GET_MODE_REQUEST:
         {
            const CELTMode ** value = va_arg(ap, const CELTMode**);
diff --git a/opus/src/opus_multistream_decoder.c b/opus/src/opus_multistream_decoder.c
index 0018517a..4ae877a7 100644
--- a/opus/src/opus_multistream_decoder.c
+++ b/opus/src/opus_multistream_decoder.c
@@ -162,7 +162,7 @@ static int opus_multistream_packet_validate(const unsigned char *data,
       if (len<=0)
          return OPUS_INVALID_PACKET;
       count = opus_packet_parse_impl(data, len, s!=nb_streams-1, &toc, NULL,
-                                     size, NULL, &packet_offset);
+                                     size, NULL, &packet_offset, NULL, NULL);
       if (count<0)
          return count;
       tmp_samples = opus_packet_get_nb_samples(data, packet_offset, Fs);
@@ -250,9 +250,12 @@ int opus_multistream_decode_native(
          return OPUS_INTERNAL_ERROR;
       }
       packet_offset = 0;
-      ret = opus_decode_native(dec, data, len, buf, frame_size, decode_fec, s!=st->layout.nb_streams-1, &packet_offset, soft_clip);
-      data += packet_offset;
-      len -= packet_offset;
+      ret = opus_decode_native(dec, data, len, buf, frame_size, decode_fec, s!=st->layout.nb_streams-1, &packet_offset, soft_clip, NULL, 0);
+      if (!do_plc)
+      {
+        data += packet_offset;
+        len -= packet_offset;
+      }
       if (ret <= 0)
       {
          RESTORE_STACK;
diff --git a/opus/src/opus_multistream_encoder.c b/opus/src/opus_multistream_encoder.c
index 93204a14..1725ade7 100644
--- a/opus/src/opus_multistream_encoder.c
+++ b/opus/src/opus_multistream_encoder.c
@@ -443,7 +443,8 @@ static int opus_multistream_encoder_init_impl(
    char *ptr;
 
    if ((channels>255) || (channels<1) || (coupled_streams>streams) ||
-       (streams<1) || (coupled_streams<0) || (streams>255-coupled_streams))
+       (streams<1) || (coupled_streams<0) || (streams>255-coupled_streams) ||
+       (streams+coupled_streams>channels))
       return OPUS_BAD_ARG;
 
    st->arch = opus_select_arch();
@@ -459,8 +460,7 @@ static int opus_multistream_encoder_init_impl(
       st->layout.mapping[i] = mapping[i];
    if (!validate_layout(&st->layout))
       return OPUS_BAD_ARG;
-   if (mapping_type == MAPPING_TYPE_SURROUND &&
-       !validate_encoder_layout(&st->layout))
+   if (!validate_encoder_layout(&st->layout))
       return OPUS_BAD_ARG;
    if (mapping_type == MAPPING_TYPE_AMBISONICS &&
        !validate_ambisonics(st->layout.nb_channels, NULL, NULL))
@@ -595,7 +595,8 @@ OpusMSEncoder *opus_multistream_encoder_create(
    int ret;
    OpusMSEncoder *st;
    if ((channels>255) || (channels<1) || (coupled_streams>streams) ||
-       (streams<1) || (coupled_streams<0) || (streams>255-coupled_streams))
+       (streams<1) || (coupled_streams<0) || (streams>255-coupled_streams) ||
+       (streams+coupled_streams>channels))
    {
       if (error)
          *error = OPUS_BAD_ARG;
@@ -1002,7 +1003,7 @@ int opus_multistream_encode_native
          return OPUS_INTERNAL_ERROR;
       }
       len = opus_repacketizer_out_range_impl(&rp, 0, opus_repacketizer_get_nb_frames(&rp),
-            data, max_data_bytes-tot_size, s != st->layout.nb_streams-1, !vbr && s == st->layout.nb_streams-1);
+            data, max_data_bytes-tot_size, s != st->layout.nb_streams-1, !vbr && s == st->layout.nb_streams-1, NULL, 0);
       data += len;
       tot_size += len;
    }
diff --git a/opus/src/opus_private.h b/opus/src/opus_private.h
index 5e2463f5..364c21ce 100644
--- a/opus/src/opus_private.h
+++ b/opus/src/opus_private.h
@@ -42,8 +42,17 @@ struct OpusRepacketizer {
    const unsigned char *frames[48];
    opus_int16 len[48];
    int framesize;
+   const unsigned char *paddings[48];
+   opus_int32 padding_len[48];
 };
 
+typedef struct {
+   int id;
+   int frame;
+   const unsigned char *data;
+   opus_int32 len;
+} opus_extension_data;
+
 typedef struct ChannelLayout {
    int nb_channels;
    int nb_streams;
@@ -148,7 +157,7 @@ opus_int32 opus_encode_native(OpusEncoder *st, const opus_val16 *pcm, int frame_
 
 int opus_decode_native(OpusDecoder *st, const unsigned char *data, opus_int32 len,
       opus_val16 *pcm, int frame_size, int decode_fec, int self_delimited,
-      opus_int32 *packet_offset, int soft_clip);
+      opus_int32 *packet_offset, int soft_clip, const OpusDRED *dred, opus_int32 dred_offset);
 
 /* Make sure everything is properly aligned. */
 static OPUS_INLINE int align(int i)
@@ -162,13 +171,18 @@ static OPUS_INLINE int align(int i)
     return ((i + alignment - 1) / alignment) * alignment;
 }
 
+/* More than that is ridiculous for now (3 * max frames per packet)*/
+opus_int32 skip_extension(const unsigned char **data, opus_int32 len, opus_int32 *header_size);
+
 int opus_packet_parse_impl(const unsigned char *data, opus_int32 len,
       int self_delimited, unsigned char *out_toc,
       const unsigned char *frames[48], opus_int16 size[48],
-      int *payload_offset, opus_int32 *packet_offset);
+      int *payload_offset, opus_int32 *packet_offset,
+      const unsigned char **padding, opus_int32 *padding_len);
 
 opus_int32 opus_repacketizer_out_range_impl(OpusRepacketizer *rp, int begin, int end,
-      unsigned char *data, opus_int32 maxlen, int self_delimited, int pad);
+      unsigned char *data, opus_int32 maxlen, int self_delimited, int pad,
+      const opus_extension_data *extensions, int nb_extensions);
 
 int pad_frame(unsigned char *data, opus_int32 len, opus_int32 new_len);
 
@@ -198,4 +212,12 @@ int opus_multistream_decode_native(
   void *user_data
 );
 
+opus_int32 opus_packet_extensions_parse(const unsigned char *data, opus_int32 len, opus_extension_data *extensions, opus_int32 *nb_extensions);
+
+opus_int32 opus_packet_extensions_generate(unsigned char *data, opus_int32 len, const opus_extension_data  *extensions, int nb_extensions, int pad);
+
+opus_int32 opus_packet_extensions_count(const unsigned char *data, opus_int32 len);
+
+opus_int32 opus_packet_pad_impl(unsigned char *data, opus_int32 len, opus_int32 new_len, int pad, const opus_extension_data  *extensions, int nb_extensions);
+
 #endif /* OPUS_PRIVATE_H */
diff --git a/opus/src/opus_projection_encoder.c b/opus/src/opus_projection_encoder.c
index 06fb2d25..92813ad0 100644
--- a/opus/src/opus_projection_encoder.c
+++ b/opus/src/opus_projection_encoder.c
@@ -177,6 +177,20 @@ opus_int32 opus_projection_ambisonics_encoder_get_size(int channels,
     demixing_matrix_rows = mapping_matrix_toa_demixing.rows;
     demixing_matrix_cols = mapping_matrix_toa_demixing.cols;
   }
+  else if (order_plus_one == 5)
+  {
+    mixing_matrix_rows = mapping_matrix_fourthoa_mixing.rows;
+    mixing_matrix_cols = mapping_matrix_fourthoa_mixing.cols;
+    demixing_matrix_rows = mapping_matrix_fourthoa_demixing.rows;
+    demixing_matrix_cols = mapping_matrix_fourthoa_demixing.cols;
+  }
+  else if (order_plus_one == 6)
+  {
+    mixing_matrix_rows = mapping_matrix_fifthoa_mixing.rows;
+    mixing_matrix_cols = mapping_matrix_fifthoa_mixing.cols;
+    demixing_matrix_rows = mapping_matrix_fifthoa_demixing.rows;
+    demixing_matrix_cols = mapping_matrix_fifthoa_demixing.cols;
+  }
   else
     return 0;
 
@@ -245,6 +259,20 @@ int opus_projection_ambisonics_encoder_init(OpusProjectionEncoder *st, opus_int3
         mapping_matrix_toa_mixing_data,
         sizeof(mapping_matrix_toa_mixing_data));
     }
+    else if (order_plus_one == 5)
+    {
+      mapping_matrix_init(mixing_matrix, mapping_matrix_fourthoa_mixing.rows,
+        mapping_matrix_fourthoa_mixing.cols, mapping_matrix_fourthoa_mixing.gain,
+        mapping_matrix_fourthoa_mixing_data,
+        sizeof(mapping_matrix_fourthoa_mixing_data));
+    }
+    else if (order_plus_one == 6)
+    {
+      mapping_matrix_init(mixing_matrix, mapping_matrix_fifthoa_mixing.rows,
+        mapping_matrix_fifthoa_mixing.cols, mapping_matrix_fifthoa_mixing.gain,
+        mapping_matrix_fifthoa_mixing_data,
+        sizeof(mapping_matrix_fifthoa_mixing_data));
+    }
     else
       return OPUS_BAD_ARG;
 
@@ -275,6 +303,20 @@ int opus_projection_ambisonics_encoder_init(OpusProjectionEncoder *st, opus_int3
         mapping_matrix_toa_demixing.cols, mapping_matrix_toa_demixing.gain,
         mapping_matrix_toa_demixing_data,
         sizeof(mapping_matrix_toa_demixing_data));
+    }
+      else if (order_plus_one == 5)
+    {
+      mapping_matrix_init(demixing_matrix, mapping_matrix_fourthoa_demixing.rows,
+        mapping_matrix_fourthoa_demixing.cols, mapping_matrix_fourthoa_demixing.gain,
+        mapping_matrix_fourthoa_demixing_data,
+        sizeof(mapping_matrix_fourthoa_demixing_data));
+    }
+    else if (order_plus_one == 6)
+    {
+      mapping_matrix_init(demixing_matrix, mapping_matrix_fifthoa_demixing.rows,
+        mapping_matrix_fifthoa_demixing.cols, mapping_matrix_fifthoa_demixing.gain,
+        mapping_matrix_fifthoa_demixing_data,
+        sizeof(mapping_matrix_fifthoa_demixing_data));
     }
     else
       return OPUS_BAD_ARG;
diff --git a/opus/src/repacketizer.c b/opus/src/repacketizer.c
index bda44a14..6a7a8b3d 100644
--- a/opus/src/repacketizer.c
+++ b/opus/src/repacketizer.c
@@ -32,6 +32,7 @@
 #include "opus.h"
 #include "opus_private.h"
 #include "os_support.h"
+#include "stack_alloc.h"
 
 
 int opus_repacketizer_get_size(void)
@@ -82,10 +83,19 @@ static int opus_repacketizer_cat_impl(OpusRepacketizer *rp, const unsigned char
       return OPUS_INVALID_PACKET;
    }
 
-   ret=opus_packet_parse_impl(data, len, self_delimited, &tmp_toc, &rp->frames[rp->nb_frames], &rp->len[rp->nb_frames], NULL, NULL);
+   ret=opus_packet_parse_impl(data, len, self_delimited, &tmp_toc, &rp->frames[rp->nb_frames], &rp->len[rp->nb_frames],
+       NULL, NULL, &rp->paddings[rp->nb_frames], &rp->padding_len[rp->nb_frames]);
    if(ret<1)return ret;
 
-   rp->nb_frames += curr_nb_frames;
+   /* set padding length to zero for all but the first frame */
+   while (curr_nb_frames > 1)
+   {
+      rp->nb_frames++;
+      rp->padding_len[rp->nb_frames] = 0;
+      rp->paddings[rp->nb_frames] = NULL;
+      curr_nb_frames--;
+   }
+   rp->nb_frames++;
    return OPUS_OK;
 }
 
@@ -100,17 +110,23 @@ int opus_repacketizer_get_nb_frames(OpusRepacketizer *rp)
 }
 
 opus_int32 opus_repacketizer_out_range_impl(OpusRepacketizer *rp, int begin, int end,
-      unsigned char *data, opus_int32 maxlen, int self_delimited, int pad)
+      unsigned char *data, opus_int32 maxlen, int self_delimited, int pad, const opus_extension_data *extensions, int nb_extensions)
 {
    int i, count;
    opus_int32 tot_size;
    opus_int16 *len;
    const unsigned char **frames;
    unsigned char * ptr;
+   int ones_begin=0, ones_end=0;
+   int ext_begin=0, ext_len=0;
+   int ext_count, total_ext_count;
+   VARDECL(opus_extension_data, all_extensions);
+   SAVE_STACK;
 
    if (begin<0 || begin>=end || end>rp->nb_frames)
    {
       /*fprintf(stderr, "%d %d %d\n", begin, end, rp->nb_frames);*/
+      RESTORE_STACK;
       return OPUS_BAD_ARG;
    }
    count = end-begin;
@@ -122,13 +138,50 @@ opus_int32 opus_repacketizer_out_range_impl(OpusRepacketizer *rp, int begin, int
    else
       tot_size = 0;
 
+   /* figure out total number of extensions */
+   total_ext_count = nb_extensions;
+   for (i=begin;i<end;i++)
+   {
+      int n = opus_packet_extensions_count(rp->paddings[i], rp->padding_len[i]);
+      if (n > 0) total_ext_count += n;
+   }
+   ALLOC(all_extensions, total_ext_count ? total_ext_count : ALLOC_NONE, opus_extension_data);
+   /* copy over any extensions that were passed in */
+   for (ext_count=0;ext_count<nb_extensions;ext_count++)
+   {
+      all_extensions[ext_count] = extensions[ext_count];
+   }
+
+   /* incorporate any extensions from the repacketizer padding */
+   for (i=begin;i<end;i++)
+   {
+      int frame_ext_count, j;
+      frame_ext_count = total_ext_count - ext_count;
+      int ret = opus_packet_extensions_parse(rp->paddings[i], rp->padding_len[i],
+         &all_extensions[ext_count], &frame_ext_count);
+      if (ret<0)
+      {
+         RESTORE_STACK;
+         return OPUS_INTERNAL_ERROR;
+      }
+      /* renumber the extension frame numbers */
+      for (j=0;j<frame_ext_count;j++)
+      {
+         all_extensions[ext_count+j].frame += i-begin;
+      }
+      ext_count += frame_ext_count;
+   }
+
    ptr = data;
    if (count==1)
    {
       /* Code 0 */
       tot_size += len[0]+1;
       if (tot_size > maxlen)
+      {
+         RESTORE_STACK;
          return OPUS_BUFFER_TOO_SMALL;
+      }
       *ptr++ = rp->toc&0xFC;
    } else if (count==2)
    {
@@ -137,18 +190,24 @@ opus_int32 opus_repacketizer_out_range_impl(OpusRepacketizer *rp, int begin, int
          /* Code 1 */
          tot_size += 2*len[0]+1;
          if (tot_size > maxlen)
+         {
+            RESTORE_STACK;
             return OPUS_BUFFER_TOO_SMALL;
+         }
          *ptr++ = (rp->toc&0xFC) | 0x1;
       } else {
          /* Code 2 */
          tot_size += len[0]+len[1]+2+(len[0]>=252);
          if (tot_size > maxlen)
+         {
+            RESTORE_STACK;
             return OPUS_BUFFER_TOO_SMALL;
+         }
          *ptr++ = (rp->toc&0xFC) | 0x2;
          ptr += encode_size(len[0], ptr);
       }
    }
-   if (count > 2 || (pad && tot_size < maxlen))
+   if (count > 2 || (pad && tot_size < maxlen) || ext_count > 0)
    {
       /* Code 3 */
       int vbr;
@@ -177,22 +236,45 @@ opus_int32 opus_repacketizer_out_range_impl(OpusRepacketizer *rp, int begin, int
          tot_size += len[count-1];
 
          if (tot_size > maxlen)
+         {
+            RESTORE_STACK;
             return OPUS_BUFFER_TOO_SMALL;
+         }
          *ptr++ = (rp->toc&0xFC) | 0x3;
          *ptr++ = count | 0x80;
       } else {
          tot_size += count*len[0]+2;
          if (tot_size > maxlen)
+         {
+            RESTORE_STACK;
             return OPUS_BUFFER_TOO_SMALL;
+         }
          *ptr++ = (rp->toc&0xFC) | 0x3;
          *ptr++ = count;
       }
       pad_amount = pad ? (maxlen-tot_size) : 0;
+      if (ext_count>0)
+      {
+         /* figure out how much space we need for the extensions */
+         ext_len = opus_packet_extensions_generate(NULL, maxlen-tot_size, all_extensions, ext_count, 0);
+         if (ext_len < 0) { RESTORE_STACK; return ext_len; } /* unwind SAVE_STACK like every other early exit */
+         if (!pad)
+            pad_amount = ext_len + ext_len/254 + 1;
+      }
       if (pad_amount != 0)
       {
          int nb_255s;
          data[1] |= 0x40;
          nb_255s = (pad_amount-1)/255;
+         if (tot_size + ext_len + nb_255s + 1 > maxlen)
+         {
+            RESTORE_STACK;
+            return OPUS_BUFFER_TOO_SMALL;
+         }
+         ext_begin = tot_size+pad_amount-ext_len;
+         /* Prepend 0x01 padding */
+         ones_begin = tot_size+nb_255s+1;
+         ones_end = tot_size+pad_amount-ext_len;
          for (i=0;i<nb_255s;i++)
             *ptr++ = 255;
          *ptr++ = pad_amount-255*nb_255s-1;
@@ -218,42 +300,62 @@ opus_int32 opus_repacketizer_out_range_impl(OpusRepacketizer *rp, int begin, int
       OPUS_MOVE(ptr, frames[i], len[i]);
       ptr += len[i];
    }
-   if (pad)
+   if (ext_len > 0) {
+      int ret = opus_packet_extensions_generate(&data[ext_begin], ext_len, all_extensions, ext_count, 0);
+      celt_assert(ret == ext_len);
+   }
+   for (i=ones_begin;i<ones_end;i++)
+      data[i] = 0x01;
+   if (pad && ext_count==0)
    {
       /* Fill padding with zeros. */
       while (ptr<data+maxlen)
          *ptr++=0;
    }
+   RESTORE_STACK;
    return tot_size;
 }
 
 opus_int32 opus_repacketizer_out_range(OpusRepacketizer *rp, int begin, int end, unsigned char *data, opus_int32 maxlen)
 {
-   return opus_repacketizer_out_range_impl(rp, begin, end, data, maxlen, 0, 0);
+   return opus_repacketizer_out_range_impl(rp, begin, end, data, maxlen, 0, 0, NULL, 0);
 }
 
 opus_int32 opus_repacketizer_out(OpusRepacketizer *rp, unsigned char *data, opus_int32 maxlen)
 {
-   return opus_repacketizer_out_range_impl(rp, 0, rp->nb_frames, data, maxlen, 0, 0);
+   return opus_repacketizer_out_range_impl(rp, 0, rp->nb_frames, data, maxlen, 0, 0, NULL, 0);
 }
 
-int opus_packet_pad(unsigned char *data, opus_int32 len, opus_int32 new_len)
+opus_int32 opus_packet_pad_impl(unsigned char *data, opus_int32 len, opus_int32 new_len, int pad, const opus_extension_data  *extensions, int nb_extensions)
 {
    OpusRepacketizer rp;
    opus_int32 ret;
+   VARDECL(unsigned char, copy);
+   SAVE_STACK;
    if (len < 1)
       return OPUS_BAD_ARG;
    if (len==new_len)
       return OPUS_OK;
    else if (len > new_len)
       return OPUS_BAD_ARG;
+   ALLOC(copy, len, unsigned char);
    opus_repacketizer_init(&rp);
    /* Moving payload to the end of the packet so we can do in-place padding */
-   OPUS_MOVE(data+new_len-len, data, len);
-   ret = opus_repacketizer_cat(&rp, data+new_len-len, len);
+   OPUS_COPY(copy, data, len);
+   ret = opus_repacketizer_cat(&rp, copy, len);
    if (ret != OPUS_OK)
       return ret;
-   ret = opus_repacketizer_out_range_impl(&rp, 0, rp.nb_frames, data, new_len, 0, 1);
+   ret = opus_repacketizer_out_range_impl(&rp, 0, rp.nb_frames, data, new_len, 0, pad, extensions, nb_extensions);
+   RESTORE_STACK;
+   return ret;
+}
+
+int opus_packet_pad(unsigned char *data, opus_int32 len, opus_int32 new_len)
+{
+   opus_int32 ret;
+   ALLOC_STACK;
+   ret = opus_packet_pad_impl(data, len, new_len, 1, NULL, 0);
+   RESTORE_STACK;
    if (ret > 0)
       return OPUS_OK;
    else
@@ -264,13 +366,19 @@ opus_int32 opus_packet_unpad(unsigned char *data, opus_int32 len)
 {
    OpusRepacketizer rp;
    opus_int32 ret;
+   int i;
    if (len < 1)
       return OPUS_BAD_ARG;
    opus_repacketizer_init(&rp);
    ret = opus_repacketizer_cat(&rp, data, len);
    if (ret < 0)
       return ret;
-   ret = opus_repacketizer_out_range_impl(&rp, 0, rp.nb_frames, data, len, 0, 0);
+   /* Discard all padding and extensions. */
+   for (i=0;i<rp.nb_frames;i++) {
+      rp.padding_len[i] = 0;
+      rp.paddings[i] = NULL;
+   }
+   ret = opus_repacketizer_out_range_impl(&rp, 0, rp.nb_frames, data, len, 0, 0, NULL, 0);
    celt_assert(ret > 0 && ret <= len);
    return ret;
 }
@@ -297,7 +405,7 @@ int opus_multistream_packet_pad(unsigned char *data, opus_int32 len, opus_int32
       if (len<=0)
          return OPUS_INVALID_PACKET;
       count = opus_packet_parse_impl(data, len, 1, &toc, NULL,
-                                     size, NULL, &packet_offset);
+                                     size, NULL, &packet_offset, NULL, NULL);
       if (count<0)
          return count;
       data += packet_offset;
@@ -324,18 +432,24 @@ opus_int32 opus_multistream_packet_unpad(unsigned char *data, opus_int32 len, in
    for (s=0;s<nb_streams;s++)
    {
       opus_int32 ret;
+      int i;
       int self_delimited = s!=nb_streams-1;
       if (len<=0)
          return OPUS_INVALID_PACKET;
       opus_repacketizer_init(&rp);
       ret = opus_packet_parse_impl(data, len, self_delimited, &toc, NULL,
-                                     size, NULL, &packet_offset);
+                                     size, NULL, &packet_offset, NULL, NULL);
       if (ret<0)
          return ret;
       ret = opus_repacketizer_cat_impl(&rp, data, packet_offset, self_delimited);
       if (ret < 0)
          return ret;
-      ret = opus_repacketizer_out_range_impl(&rp, 0, rp.nb_frames, dst, len, self_delimited, 0);
+      /* Discard all padding and extensions. */
+      for (i=0;i<rp.nb_frames;i++) {
+         rp.padding_len[i] = 0;
+         rp.paddings[i] = NULL;
+      }
+      ret = opus_repacketizer_out_range_impl(&rp, 0, rp.nb_frames, dst, len, self_delimited, 0, NULL, 0);
       if (ret < 0)
          return ret;
       else
diff --git a/opus/src/repacketizer_demo.c b/opus/src/repacketizer_demo.c
index dc05c1b3..43de7019 100644
--- a/opus/src/repacketizer_demo.c
+++ b/opus/src/repacketizer_demo.c
@@ -119,7 +119,19 @@ int main(int argc, char *argv[])
       for (i=0;i<nb_packets;i++)
       {
          unsigned char ch[4];
-         err = fread(ch, 1, 4, fin);
+         if (fread(ch, 1, 4, fin)!=4)
+         {
+             if (feof(fin))
+             {
+                eof = 1;
+             } else {
+                fprintf(stderr, "Error reading payload length.\n");
+                fclose(fin);
+                fclose(fout);
+                return EXIT_FAILURE;
+             }
+             break;
+         }
          len[i] = char_to_int(ch);
          /*fprintf(stderr, "in len = %d\n", len[i]);*/
          if (len[i]>1500 || len[i]<0)
@@ -135,13 +147,31 @@ int main(int argc, char *argv[])
              }
              break;
          }
-         err = fread(ch, 1, 4, fin);
-         rng[i] = char_to_int(ch);
-         err = fread(packets[i], 1, len[i], fin);
-         if (feof(fin))
+         if (fread(ch, 1, 4, fin)!=4)
          {
-            eof = 1;
-            break;
+             if (feof(fin))
+             {
+                eof = 1;
+             } else {
+                fprintf(stderr, "Error reading.\n");
+                fclose(fin);
+                fclose(fout);
+                return EXIT_FAILURE;
+             }
+             break;
+         }
+         rng[i] = char_to_int(ch);
+         if (fread(packets[i], len[i], 1, fin)!=1) {
+             if (feof(fin))
+             {
+                eof = 1;
+             } else {
+                fprintf(stderr, "Error reading packet of %u bytes.\n", len[i]);
+                fclose(fin);
+                fclose(fout);
+                return EXIT_FAILURE;
+             }
+             break;
          }
          err = opus_repacketizer_cat(rp, packets[i], len[i]);
          if (err!=OPUS_OK)
diff --git a/opus/src/tansig_table.h b/opus/src/tansig_table.h
deleted file mode 100644
index c76f844a..00000000
--- a/opus/src/tansig_table.h
+++ /dev/null
@@ -1,45 +0,0 @@
-/* This file is auto-generated by gen_tables */
-
-static const float tansig_table[201] = {
-0.000000f, 0.039979f, 0.079830f, 0.119427f, 0.158649f,
-0.197375f, 0.235496f, 0.272905f, 0.309507f, 0.345214f,
-0.379949f, 0.413644f, 0.446244f, 0.477700f, 0.507977f,
-0.537050f, 0.564900f, 0.591519f, 0.616909f, 0.641077f,
-0.664037f, 0.685809f, 0.706419f, 0.725897f, 0.744277f,
-0.761594f, 0.777888f, 0.793199f, 0.807569f, 0.821040f,
-0.833655f, 0.845456f, 0.856485f, 0.866784f, 0.876393f,
-0.885352f, 0.893698f, 0.901468f, 0.908698f, 0.915420f,
-0.921669f, 0.927473f, 0.932862f, 0.937863f, 0.942503f,
-0.946806f, 0.950795f, 0.954492f, 0.957917f, 0.961090f,
-0.964028f, 0.966747f, 0.969265f, 0.971594f, 0.973749f,
-0.975743f, 0.977587f, 0.979293f, 0.980869f, 0.982327f,
-0.983675f, 0.984921f, 0.986072f, 0.987136f, 0.988119f,
-0.989027f, 0.989867f, 0.990642f, 0.991359f, 0.992020f,
-0.992631f, 0.993196f, 0.993718f, 0.994199f, 0.994644f,
-0.995055f, 0.995434f, 0.995784f, 0.996108f, 0.996407f,
-0.996682f, 0.996937f, 0.997172f, 0.997389f, 0.997590f,
-0.997775f, 0.997946f, 0.998104f, 0.998249f, 0.998384f,
-0.998508f, 0.998623f, 0.998728f, 0.998826f, 0.998916f,
-0.999000f, 0.999076f, 0.999147f, 0.999213f, 0.999273f,
-0.999329f, 0.999381f, 0.999428f, 0.999472f, 0.999513f,
-0.999550f, 0.999585f, 0.999617f, 0.999646f, 0.999673f,
-0.999699f, 0.999722f, 0.999743f, 0.999763f, 0.999781f,
-0.999798f, 0.999813f, 0.999828f, 0.999841f, 0.999853f,
-0.999865f, 0.999875f, 0.999885f, 0.999893f, 0.999902f,
-0.999909f, 0.999916f, 0.999923f, 0.999929f, 0.999934f,
-0.999939f, 0.999944f, 0.999948f, 0.999952f, 0.999956f,
-0.999959f, 0.999962f, 0.999965f, 0.999968f, 0.999970f,
-0.999973f, 0.999975f, 0.999977f, 0.999978f, 0.999980f,
-0.999982f, 0.999983f, 0.999984f, 0.999986f, 0.999987f,
-0.999988f, 0.999989f, 0.999990f, 0.999990f, 0.999991f,
-0.999992f, 0.999992f, 0.999993f, 0.999994f, 0.999994f,
-0.999994f, 0.999995f, 0.999995f, 0.999996f, 0.999996f,
-0.999996f, 0.999997f, 0.999997f, 0.999997f, 0.999997f,
-0.999997f, 0.999998f, 0.999998f, 0.999998f, 0.999998f,
-0.999998f, 0.999998f, 0.999999f, 0.999999f, 0.999999f,
-0.999999f, 0.999999f, 0.999999f, 0.999999f, 0.999999f,
-0.999999f, 0.999999f, 0.999999f, 0.999999f, 0.999999f,
-1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
-1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
-1.000000f,
-};

From 25ec8dc8170551438630e92085b4f44c728a85a6 Mon Sep 17 00:00:00 2001
From: starg2 <75976488+starg2@users.noreply.github.com>
Date: Sat, 16 Mar 2024 17:34:30 +0900
Subject: [PATCH 3/5] [opus] Enable runtime detection of AVX2

---
 opus/CMakeLists.txt    | 2 ++
 opus/celt/x86/x86cpu.c | 2 +-
 2 files changed, 3 insertions(+), 1 deletion(-)

diff --git a/opus/CMakeLists.txt b/opus/CMakeLists.txt
index afd72104..fc299bc3 100644
--- a/opus/CMakeLists.txt
+++ b/opus/CMakeLists.txt
@@ -12,9 +12,11 @@ add_definitions(
     -DOPUS_BUILD
     -DFLOAT_APPROX
     -DDLL_EXPORT
+    -DOPUS_HAVE_RTCD
     -DOPUS_X86_MAY_HAVE_SSE
     -DOPUS_X86_MAY_HAVE_SSE2
     -DOPUS_X86_MAY_HAVE_SSE4_1
+    -DOPUS_X86_MAY_HAVE_AVX2
     -DOPUS_X86_PRESUME_SSE
     -DOPUS_X86_PRESUME_SSE2
     -DOPUS_X86_PRESUME_SSE4_1
diff --git a/opus/celt/x86/x86cpu.c b/opus/celt/x86/x86cpu.c
index 2e7c32ae..81cff4f4 100644
--- a/opus/celt/x86/x86cpu.c
+++ b/opus/celt/x86/x86cpu.c
@@ -41,7 +41,7 @@
   (defined(OPUS_X86_MAY_HAVE_SSE4_1) && !defined(OPUS_X86_PRESUME_SSE4_1)) || \
   (defined(OPUS_X86_MAY_HAVE_AVX2) && !defined(OPUS_X86_PRESUME_AVX2)))
 
-#if defined(_MSC_VER)
+#if defined(_WIN32)
 
 #include <intrin.h>
 static _inline void cpuid(unsigned int CPUInfo[4], unsigned int InfoType)

From e2f9bf59e3dc855a797387a0d39441f7ea5bcb0f Mon Sep 17 00:00:00 2001
From: starg2 <75976488+starg2@users.noreply.github.com>
Date: Sat, 16 Mar 2024 17:42:36 +0900
Subject: [PATCH 4/5] [opus] Limit runtime AVX2 detection to MSVC and always
 enable AVX2 for AVX2/AVX512 builds

---
 opus/CMakeLists.txt | 14 ++++++++++++--
 1 file changed, 12 insertions(+), 2 deletions(-)

diff --git a/opus/CMakeLists.txt b/opus/CMakeLists.txt
index fc299bc3..caa71c26 100644
--- a/opus/CMakeLists.txt
+++ b/opus/CMakeLists.txt
@@ -12,16 +12,26 @@ add_definitions(
     -DOPUS_BUILD
     -DFLOAT_APPROX
     -DDLL_EXPORT
-    -DOPUS_HAVE_RTCD
     -DOPUS_X86_MAY_HAVE_SSE
     -DOPUS_X86_MAY_HAVE_SSE2
     -DOPUS_X86_MAY_HAVE_SSE4_1
-    -DOPUS_X86_MAY_HAVE_AVX2
     -DOPUS_X86_PRESUME_SSE
     -DOPUS_X86_PRESUME_SSE2
     -DOPUS_X86_PRESUME_SSE4_1
 )
 
+if("${TIM41_X86_SIMD_LEVEL}" MATCHES "^(AVX2|AVX512)$")
+    add_definitions(
+        -DOPUS_X86_MAY_HAVE_AVX2
+        -DOPUS_X86_PRESUME_AVX2
+    )
+elseif(MSVC)
+    add_definitions(
+        -DOPUS_HAVE_RTCD
+        -DOPUS_X86_MAY_HAVE_AVX2
+    )
+endif()
+
 add_library(
     opus SHARED
 

From 3bb6b1fe8485333037c5841b738bc98d86664684 Mon Sep 17 00:00:00 2001
From: starg2 <75976488+starg2@users.noreply.github.com>
Date: Sat, 16 Mar 2024 17:51:46 +0900
Subject: [PATCH 5/5] [opus] Exclude AVX2 sources unless necessary

---
 opus/CMakeLists.txt | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

diff --git a/opus/CMakeLists.txt b/opus/CMakeLists.txt
index caa71c26..d6f1e699 100644
--- a/opus/CMakeLists.txt
+++ b/opus/CMakeLists.txt
@@ -248,9 +248,7 @@ add_library(
     silk/float/structs_FLP.h
     silk/float/warped_autocorrelation_FLP.c
     silk/float/wrappers_FLP.c
-    silk/float/x86/inner_product_FLP_avx2.c
     silk/x86/main_sse.h
-    silk/x86/NSQ_del_dec_avx2.c
     silk/x86/NSQ_del_dec_sse4_1.c
     silk/x86/NSQ_sse4_1.c
     silk/x86/SigProc_FIX_sse.h
@@ -259,4 +257,13 @@ add_library(
     silk/x86/x86_silk_map.c
 )
 
+if("${TIM41_X86_SIMD_LEVEL}" MATCHES "^(AVX2|AVX512)$" OR MSVC)
+    target_sources(
+        opus
+        PRIVATE
+            silk/float/x86/inner_product_FLP_avx2.c
+            silk/x86/NSQ_del_dec_avx2.c
+    )
+endif()
+
 install(TARGETS opus)