From a14307c683534e7b7cb64ef7cbee68454a7f5386 Mon Sep 17 00:00:00 2001 From: Starg Date: Thu, 28 Dec 2023 01:00:23 +0900 Subject: [PATCH 1/5] [wasapi] Fix COM initialization in console versions --- timidity/timidity.c | 8 +++++--- timidity/wasapi_a.c | 18 +++++++++--------- 2 files changed, 14 insertions(+), 12 deletions(-) diff --git a/timidity/timidity.c b/timidity/timidity.c index 88a24910..387c5a28 100644 --- a/timidity/timidity.c +++ b/timidity/timidity.c @@ -9081,7 +9081,7 @@ extern int volatile save_playlist_once_before_exit_flag; #endif /* IA_W32GUI */ -#if defined ( IA_W32GUI ) || defined ( IA_W32G_SYN ) +#ifdef __W32__ static int CoInitializeOK = 0; #endif @@ -9238,9 +9238,11 @@ int main(int argc, char **argv) return 0; } timidity_start_initialize(); -#if defined (IA_W32GUI) || defined (IA_W32G_SYN) +#ifdef __W32__ if (SUCCEEDED(CoInitializeEx(NULL, COINIT_MULTITHREADED | COINIT_DISABLE_OLE1DDE))) CoInitializeOK = 1; +#endif +#if defined (IA_W32GUI) || defined (IA_W32G_SYN) w32g_initialize(); for (c = 1; c < argc; c++) if (is_directory(argv[c])) { @@ -9367,9 +9369,9 @@ int main(int argc, char **argv) /* CUI, SYN */ main_ret = timidity_play_main(nfiles, files); w32_atexit = 0; -#ifdef IA_W32G_SYN if (CoInitializeOK) CoUninitialize(); +#ifdef IA_W32G_SYN w32g_uninitialize(); #endif /* IA_W32G_SYN */ #else diff --git a/timidity/wasapi_a.c b/timidity/wasapi_a.c index 584ad20f..51fa1cbc 100644 --- a/timidity/wasapi_a.c +++ b/timidity/wasapi_a.c @@ -854,15 +854,15 @@ void close_output(void) CloseHandle(hEventTcv); hEventTcv = NULL; } - if(IsCoInit && CoInitThreadId == GetCurrentThreadId()){ - CoUninitialize(); - IsCoInit = 0; - } BufferFrames = 0; free_avrt(); #ifdef USE_TEMP_ENCODE reset_temporary_encoding(); #endif + if(IsCoInit && CoInitThreadId == GetCurrentThreadId()){ + CoUninitialize(); + IsCoInit = 0; + } IsOpened = 0; } @@ -881,7 +881,11 @@ int open_output(void) int device_id; close_output(); - + + if(check_hresult_failed(CoInitializeEx(NULL, COINIT_MULTITHREADED | COINIT_DISABLE_OLE1DDE), "CoInitializeEx()")) + goto error; + IsCoInit = 1; + CoInitThreadId = GetCurrentThreadId(); if(!get_winver()){ ctl->cmsg(CMSG_WARNING, VERB_NORMAL, "WASAPI ERROR! WASAPI require Windows Vista and later."); return -1; @@ -900,10 +904,6 @@ int open_output(void) hEventTcv = CreateEvent(NULL,FALSE,FALSE,NULL); // auto reset if(!hEventTcv) goto error; - if(check_hresult_failed(CoInitializeEx(NULL, COINIT_MULTITHREADED | COINIT_DISABLE_OLE1DDE), "CoInitializeEx()")) - goto error; - IsCoInit = 1; - CoInitThreadId = GetCurrentThreadId(); if(!get_device(&pMMDevice, device_id)) goto error; if(check_hresult_failed(IMMDevice_Activate(pMMDevice, &tim_IID_IAudioClient, CLSCTX_INPROC_SERVER, NULL, (void**)&pAudioClient), "IMMDevice::Activate()")) From 9a2d532cd73c8e775f722ec131a5ce68fff329c0 Mon Sep 17 00:00:00 2001 From: starg2 <75976488+starg2@users.noreply.github.com> Date: Sat, 16 Mar 2024 17:24:27 +0900 Subject: [PATCH 2/5] [opus] Update libopus to 1.5.1 --- opus/CMakeLists.txt | 6 +- opus/COPYING | 4 +- opus/README | 34 +- opus/celt/arch.h | 3 + opus/celt/arm/arm_celt_map.c | 31 +- opus/celt/arm/armcpu.c | 62 +- opus/celt/arm/armcpu.h | 13 + opus/celt/arm/celt_neon_intr.c | 83 +- opus/celt/arm/pitch_neon_intr.c | 58 +- opus/celt/bands.c | 11 +- opus/celt/celt.h | 25 +- opus/celt/celt_decoder.c | 419 +++++-- opus/celt/celt_encoder.c | 84 +- opus/celt/celt_lpc.c | 99 +- opus/celt/celt_lpc.h | 2 +- opus/celt/cpu_support.h | 10 +- opus/celt/ecintrin.h | 4 + opus/celt/entdec.c | 21 + opus/celt/entdec.h | 10 + opus/celt/entenc.c | 11 + opus/celt/entenc.h | 9 + opus/celt/fixed_debug.h | 71 +- opus/celt/fixed_generic.h | 14 +- opus/celt/float_cast.h | 62 +- opus/celt/kiss_fft.h | 12 +- opus/celt/laplace.c | 101 ++ opus/celt/laplace.h | 9 + opus/celt/mathops.h | 12 +- opus/celt/mips/celt_mipsr1.h | 6 +- opus/celt/mips/mdct_mipsr1.h | 6 +- opus/celt/mips/vq_mipsr1.h | 6 +- opus/celt/modes.c | 3 + opus/celt/os_support.h | 15 +- opus/celt/pitch.c | 34 +- opus/celt/pitch.h | 11 + opus/celt/rate.c | 2 + opus/celt/stack_alloc.h | 6 +- opus/celt/tests/test_unit_cwrs32.c | 1 + opus/celt/tests/test_unit_dft.c | 4 +- opus/celt/tests/test_unit_entropy.c | 4 +- opus/celt/tests/test_unit_laplace.c | 1 + opus/celt/tests/test_unit_mathops.c | 6 +- opus/celt/tests/test_unit_mdct.c | 4 +- opus/celt/tests/test_unit_rotation.c | 1 + opus/celt/x86/celt_lpc_sse.h | 5 +- opus/celt/x86/celt_lpc_sse4_1.c | 13 +- opus/celt/x86/pitch_avx.c | 101 ++ opus/celt/x86/pitch_sse.h | 48 +- opus/celt/x86/pitch_sse4_1.c | 51 +- opus/celt/x86/vq_sse.h | 6 +- opus/celt/x86/vq_sse2.c | 8 +- opus/celt/x86/x86_arch_macros.h | 47 + opus/celt/x86/x86_celt_map.c | 20 + opus/celt/x86/x86cpu.c | 51 +- opus/celt/x86/x86cpu.h | 73 +- opus/include/opus/opus.h | 126 +- opus/include/opus/opus_custom.h | 7 +- opus/include/opus/opus_defines.h | 37 +- opus/include/opus/opus_multistream.h | 2 +- opus/silk/API.h | 23 +- opus/silk/CNG.c | 4 + opus/silk/LPC_fit.c | 3 +- opus/silk/MacroCount.h | 2 +- opus/silk/MacroDebug.h | 54 +- opus/silk/NSQ.c | 48 +- opus/silk/NSQ_del_dec.c | 60 +- opus/silk/PLC.c | 67 +- opus/silk/PLC.h | 3 + opus/silk/SigProc_FIX.h | 10 +- opus/silk/VQ_WMat_EC.c | 4 +- opus/silk/arm/LPC_inv_pred_gain_neon_intr.c | 22 +- opus/silk/arm/NSQ_del_dec_arm.h | 4 +- opus/silk/arm/NSQ_del_dec_neon_intr.c | 28 +- opus/silk/arm/NSQ_neon.h | 4 +- opus/silk/arm/arm_silk_map.c | 7 +- opus/silk/bwexpander_32.c | 3 +- opus/silk/control.h | 11 + opus/silk/control_codec.c | 2 +- opus/silk/debug.c | 12 +- opus/silk/debug.h | 25 +- opus/silk/dec_API.c | 69 +- opus/silk/decode_frame.c | 61 +- opus/silk/define.h | 1 + opus/silk/enc_API.c | 19 +- opus/silk/fixed/LTP_scale_ctrl_FIX.c | 11 +- .../warped_autocorrelation_FIX_neon_intr.c | 9 +- opus/silk/fixed/burg_modified_FIX.c | 8 +- opus/silk/fixed/encode_frame_FIX.c | 18 +- opus/silk/fixed/find_pred_coefs_FIX.c | 3 +- opus/silk/fixed/vector_ops_FIX.c | 2 +- .../silk/fixed/x86/burg_modified_FIX_sse4_1.c | 69 +- opus/silk/fixed/x86/vector_ops_FIX_sse4_1.c | 43 +- opus/silk/float/LTP_scale_ctrl_FLP.c | 10 +- opus/silk/float/SigProc_FLP.h | 14 +- opus/silk/float/autocorrelation_FLP.c | 5 +- opus/silk/float/burg_modified_FLP.c | 5 +- opus/silk/float/corrMatrix_FLP.c | 10 +- opus/silk/float/encode_frame_FLP.c | 17 +- opus/silk/float/find_LPC_FLP.c | 7 +- opus/silk/float/find_LTP_FLP.c | 7 +- opus/silk/float/find_pitch_lags_FLP.c | 2 +- opus/silk/float/find_pred_coefs_FLP.c | 7 +- opus/silk/float/inner_product_FLP.c | 2 +- opus/silk/float/main_FLP.h | 12 +- opus/silk/float/noise_shape_analysis_FLP.c | 2 +- opus/silk/float/pitch_analysis_core_FLP.c | 2 +- opus/silk/float/warped_autocorrelation_FLP.c | 6 +- opus/silk/float/wrappers_FLP.c | 10 +- opus/silk/float/x86/inner_product_FLP_avx2.c | 85 ++ opus/silk/init_decoder.c | 33 +- opus/silk/init_encoder.c | 4 + opus/silk/main.h | 70 +- opus/silk/mips/NSQ_del_dec_mipsr1.h | 6 +- opus/silk/mips/macros_mipsr1.h | 6 +- opus/silk/stereo_LR_to_MS.c | 8 +- opus/silk/stereo_MS_to_LR.c | 4 +- opus/silk/structs.h | 28 + opus/silk/tests/test_unit_LPC_inv_pred_gain.c | 3 +- opus/silk/typedef.h | 3 + opus/silk/x86/NSQ_del_dec_avx2.c | 1075 +++++++++++++++++ opus/silk/x86/NSQ_del_dec_sse4_1.c | 206 ++-- opus/silk/x86/NSQ_sse4_1.c | 247 ++-- opus/silk/x86/SigProc_FIX_sse.h | 49 +- opus/silk/x86/VAD_sse4_1.c | 30 +- opus/silk/x86/VQ_WMat_EC_sse4_1.c | 189 +-- opus/silk/x86/main_sse.h | 287 +++-- opus/silk/x86/x86_silk_map.c | 113 +- opus/src/analysis.c | 8 +- opus/src/extensions.c | 315 +++++ opus/src/mapping_matrix.c | 561 +++++++++ opus/src/mapping_matrix.h | 12 + opus/src/mlp.c | 42 +- opus/src/mlp.h | 20 +- opus/src/mlp_data.c | 6 +- opus/src/opus.c | 10 +- opus/src/opus_decoder.c | 510 +++++++- opus/src/opus_demo.c | 359 +++++- opus/src/opus_encoder.c | 726 +++++++---- opus/src/opus_multistream_decoder.c | 11 +- opus/src/opus_multistream_encoder.c | 11 +- opus/src/opus_private.h | 28 +- opus/src/opus_projection_encoder.c | 42 + opus/src/repacketizer.c | 144 ++- opus/src/repacketizer_demo.c | 44 +- opus/src/tansig_table.h | 45 - 145 files changed, 6594 insertions(+), 1558 deletions(-) create mode 100644 opus/celt/x86/pitch_avx.c create mode 100644 opus/celt/x86/x86_arch_macros.h create mode 100644 opus/silk/float/x86/inner_product_FLP_avx2.c create mode 100644 opus/silk/x86/NSQ_del_dec_avx2.c create mode 100644 opus/src/extensions.c delete mode 100644 opus/src/tansig_table.h diff --git a/opus/CMakeLists.txt b/opus/CMakeLists.txt index adb21233..afd72104 100644 --- a/opus/CMakeLists.txt +++ b/opus/CMakeLists.txt @@ -32,6 +32,7 @@ add_library( src/analysis.c src/analysis.h + src/extensions.c src/mlp.c src/mlp.h src/mlp_data.c @@ -43,7 +44,6 @@ add_library( src/opus_multistream_encoder.c src/opus_private.h src/repacketizer.c - src/tansig_table.h celt/arch.h celt/bands.c @@ -95,12 +95,14 @@ add_library( celt/_kiss_fft_guts.h celt/x86/celt_lpc_sse4_1.c celt/x86/celt_lpc_sse.h + celt/x86/pitch_avx.c celt/x86/pitch_sse.c celt/x86/pitch_sse.h celt/x86/pitch_sse2.c celt/x86/pitch_sse4_1.c celt/x86/vq_sse.h celt/x86/vq_sse2.c + celt/x86/x86_arch_macros.h celt/x86/x86cpu.c celt/x86/x86cpu.h celt/x86/x86_celt_map.c @@ -234,7 +236,9 @@ add_library( silk/float/structs_FLP.h silk/float/warped_autocorrelation_FLP.c silk/float/wrappers_FLP.c + silk/float/x86/inner_product_FLP_avx2.c silk/x86/main_sse.h + silk/x86/NSQ_del_dec_avx2.c silk/x86/NSQ_del_dec_sse4_1.c silk/x86/NSQ_sse4_1.c silk/x86/SigProc_FIX_sse.h diff --git a/opus/COPYING b/opus/COPYING index 9c739c34..75711467 100644 --- a/opus/COPYING +++ b/opus/COPYING @@ -1,7 +1,7 @@ -Copyright 2001-2011 Xiph.Org, Skype Limited, Octasic, +Copyright 2001-2023 Xiph.Org, Skype Limited, Octasic, Jean-Marc Valin, Timothy B. Terriberry, CSIRO, Gregory Maxwell, Mark Borgerding, - Erik de Castro Lopo + Erik de Castro Lopo, Mozilla, Amazon Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions diff --git a/opus/README b/opus/README index 27fddf96..bcf2376d 100644 --- a/opus/README +++ b/opus/README @@ -22,7 +22,7 @@ This package implements a shared library for encoding and decoding raw Opus bitstreams. Raw Opus bitstreams should be used over RTP according to https://tools.ietf.org/html/rfc7587 -The package also includes a number of test tools used for testing the +The package also includes a number of test tools used for testing the correct operation of the library. The bitstreams read/written by these tools should not be used for Opus file distribution: They include additional debugging data and cannot support seeking. @@ -35,10 +35,32 @@ An opus-tools package is available which provides encoding and decoding of Ogg encapsulated Opus files and includes a number of useful features. Opus-tools can be found at: - https://git.xiph.org/?p=opus-tools.git + https://gitlab.xiph.org/xiph/opus-tools.git or on the main Opus website: https://opus-codec.org/ +== Deep Learning and Opus == + +Lossy networks continue to be a challenge for real-time communications. +While the original implementation of Opus provides an excellent packet loss +concealment mechanism, the team has continued to advance the methodology used +to improve audio quality in challenge network environments. + +In Opus 1.5, we added a deep learning based redundancy encoder that enhances +audio in lossy networks by embedding one second of recovery data in the padding +data of each packet. The underlying algorithm behind encoding and decoding the +recovery data is called the deep redundancy (DRED) algorithm. By leveraging +the padding data within the packet, Opus 1.5 is fully backward compatible with +prior revisions of Opus. Please see the README under the "dnn" subdirectory to +understand DRED. + +DRED was developed by a team that Amazon Web Services initially sponsored, +who open-sourced the implementation as well as began the +standardization process at the IETF: + https://datatracker.ietf.org/doc/draft-ietf-mlcodec-opus-extension/ +The license behind Opus or the intellectual property position of Opus does +not change with Opus 1.5. + == Compiling libopus == To build from a distribution tarball, you only need to do the following: @@ -68,7 +90,7 @@ On Apple macOS, install Xcode and brew.sh, then in the Terminal enter: 1) Clone the repository: - % git clone https://git.xiph.org/opus.git + % git clone https://gitlab.xiph.org/xiph/opus.git % cd opus 2) Compiling the source @@ -77,6 +99,8 @@ On Apple macOS, install Xcode and brew.sh, then in the Terminal enter: % ./configure % make +On x86, it's a good idea to use a -march= option that allows the use of AVX2. + 3) Install the codec libraries (optional) % sudo make install @@ -133,6 +157,10 @@ To run compare the code to these test vectors: % tar -zxf opus_testvectors-rfc8251.tar.gz % ./tests/run_vectors.sh ./ opus_newvectors 48000 +== Compiling libopus for Windows and alternative build systems == + +See cmake/README.md or meson/README.md. + == Portability notes == This implementation uses floating-point by default but can be compiled to diff --git a/opus/celt/arch.h b/opus/celt/arch.h index 08b07db5..3845c3a0 100644 --- a/opus/celt/arch.h +++ b/opus/celt/arch.h @@ -73,6 +73,9 @@ __attribute__((noreturn)) void celt_fatal(const char *str, const char *file, int line) { fprintf (stderr, "Fatal (internal) error in %s, line %d: %s\n", file, line, str); +#if defined(_MSC_VER) + _set_abort_behavior( 0, _WRITE_ABORT_MSG); +#endif abort(); } #endif diff --git a/opus/celt/arm/arm_celt_map.c b/opus/celt/arm/arm_celt_map.c index ca988b66..cbaea495 100644 --- a/opus/celt/arm/arm_celt_map.c +++ b/opus/celt/arm/arm_celt_map.c @@ -40,7 +40,8 @@ opus_val32 (*const CELT_INNER_PROD_IMPL[OPUS_ARCHMASK+1])(const opus_val16 *x, c celt_inner_prod_c, /* ARMv4 */ celt_inner_prod_c, /* EDSP */ celt_inner_prod_c, /* Media */ - celt_inner_prod_neon /* NEON */ + celt_inner_prod_neon,/* NEON */ + celt_inner_prod_neon /* DOTPROD */ }; void (*const DUAL_INNER_PROD_IMPL[OPUS_ARCHMASK+1])(const opus_val16 *x, const opus_val16 *y01, const opus_val16 *y02, @@ -48,7 +49,8 @@ void (*const DUAL_INNER_PROD_IMPL[OPUS_ARCHMASK+1])(const opus_val16 *x, const o dual_inner_prod_c, /* ARMv4 */ dual_inner_prod_c, /* EDSP */ dual_inner_prod_c, /* Media */ - dual_inner_prod_neon /* NEON */ + dual_inner_prod_neon,/* NEON */ + dual_inner_prod_neon /* DOTPROD */ }; # endif @@ -61,7 +63,8 @@ opus_val32 (*const CELT_PITCH_XCORR_IMPL[OPUS_ARCHMASK+1])(const opus_val16 *, celt_pitch_xcorr_c, /* ARMv4 */ MAY_HAVE_EDSP(celt_pitch_xcorr), /* EDSP */ MAY_HAVE_MEDIA(celt_pitch_xcorr), /* Media */ - MAY_HAVE_NEON(celt_pitch_xcorr) /* NEON */ + MAY_HAVE_NEON(celt_pitch_xcorr), /* NEON */ + MAY_HAVE_NEON(celt_pitch_xcorr) /* DOTPROD */ }; # endif @@ -72,7 +75,8 @@ void (*const CELT_PITCH_XCORR_IMPL[OPUS_ARCHMASK+1])(const opus_val16 *, celt_pitch_xcorr_c, /* ARMv4 */ celt_pitch_xcorr_c, /* EDSP */ celt_pitch_xcorr_c, /* Media */ - celt_pitch_xcorr_float_neon /* Neon */ + celt_pitch_xcorr_float_neon, /* Neon */ + celt_pitch_xcorr_float_neon /* DOTPROD */ }; # endif # endif /* FIXED_POINT */ @@ -90,6 +94,7 @@ void (*const XCORR_KERNEL_IMPL[OPUS_ARCHMASK + 1])( xcorr_kernel_c, /* EDSP */ xcorr_kernel_c, /* Media */ xcorr_kernel_neon_fixed, /* Neon */ + xcorr_kernel_neon_fixed /* DOTPROD */ }; #endif @@ -101,14 +106,16 @@ int (*const OPUS_FFT_ALLOC_ARCH_IMPL[OPUS_ARCHMASK+1])(kiss_fft_state *st) = { opus_fft_alloc_arch_c, /* ARMv4 */ opus_fft_alloc_arch_c, /* EDSP */ opus_fft_alloc_arch_c, /* Media */ - opus_fft_alloc_arm_neon /* Neon with NE10 library support */ + opus_fft_alloc_arm_neon, /* Neon with NE10 library support */ + opus_fft_alloc_arm_neon /* DOTPROD with NE10 library support */ }; void (*const OPUS_FFT_FREE_ARCH_IMPL[OPUS_ARCHMASK+1])(kiss_fft_state *st) = { opus_fft_free_arch_c, /* ARMv4 */ opus_fft_free_arch_c, /* EDSP */ opus_fft_free_arch_c, /* Media */ - opus_fft_free_arm_neon /* Neon with NE10 */ + opus_fft_free_arm_neon, /* Neon with NE10 */ + opus_fft_free_arm_neon /* DOTPROD with NE10 */ }; # endif /* CUSTOM_MODES */ @@ -118,7 +125,8 @@ void (*const OPUS_FFT[OPUS_ARCHMASK+1])(const kiss_fft_state *cfg, opus_fft_c, /* ARMv4 */ opus_fft_c, /* EDSP */ opus_fft_c, /* Media */ - opus_fft_neon /* Neon with NE10 */ + opus_fft_neon, /* Neon with NE10 */ + opus_fft_neon /* DOTPROD with NE10 */ }; void (*const OPUS_IFFT[OPUS_ARCHMASK+1])(const kiss_fft_state *cfg, @@ -127,7 +135,8 @@ void (*const OPUS_IFFT[OPUS_ARCHMASK+1])(const kiss_fft_state *cfg, opus_ifft_c, /* ARMv4 */ opus_ifft_c, /* EDSP */ opus_ifft_c, /* Media */ - opus_ifft_neon /* Neon with NE10 */ + opus_ifft_neon, /* Neon with NE10 */ + opus_ifft_neon /* DOTPROD with NE10 */ }; void (*const CLT_MDCT_FORWARD_IMPL[OPUS_ARCHMASK+1])(const mdct_lookup *l, @@ -139,7 +148,8 @@ void (*const CLT_MDCT_FORWARD_IMPL[OPUS_ARCHMASK+1])(const mdct_lookup *l, clt_mdct_forward_c, /* ARMv4 */ clt_mdct_forward_c, /* EDSP */ clt_mdct_forward_c, /* Media */ - clt_mdct_forward_neon /* Neon with NE10 */ + clt_mdct_forward_neon, /* Neon with NE10 */ + clt_mdct_forward_neon /* DOTPROD with NE10 */ }; void (*const CLT_MDCT_BACKWARD_IMPL[OPUS_ARCHMASK+1])(const mdct_lookup *l, @@ -151,7 +161,8 @@ void (*const CLT_MDCT_BACKWARD_IMPL[OPUS_ARCHMASK+1])(const mdct_lookup *l, clt_mdct_backward_c, /* ARMv4 */ clt_mdct_backward_c, /* EDSP */ clt_mdct_backward_c, /* Media */ - clt_mdct_backward_neon /* Neon with NE10 */ + clt_mdct_backward_neon, /* Neon with NE10 */ + clt_mdct_backward_neon /* DOTPROD with NE10 */ }; # endif /* HAVE_ARM_NE10 */ diff --git a/opus/celt/arm/armcpu.c b/opus/celt/arm/armcpu.c index 694a63b7..06a53435 100644 --- a/opus/celt/arm/armcpu.c +++ b/opus/celt/arm/armcpu.c @@ -43,6 +43,7 @@ #define OPUS_CPU_ARM_EDSP_FLAG (1< + opus_uint32 opus_cpu_capabilities(void) { opus_uint32 flags = 0; @@ -124,6 +127,14 @@ opus_uint32 opus_cpu_capabilities(void) p = strstr(buf, " neon"); if(p != NULL && (p[5] == ' ' || p[5] == '\n')) flags |= OPUS_CPU_ARM_NEON_FLAG; + p = strstr(buf, " asimd"); + if(p != NULL && (p[6] == ' ' || p[6] == '\n')) + flags |= OPUS_CPU_ARM_NEON_FLAG | OPUS_CPU_ARM_MEDIA_FLAG | OPUS_CPU_ARM_EDSP_FLAG; +# endif +# if defined(OPUS_ARM_MAY_HAVE_DOTPROD) + p = strstr(buf, " asimddp"); + if(p != NULL && (p[8] == ' ' || p[8] == '\n')) + flags |= OPUS_CPU_ARM_DOTPROD_FLAG; # endif } # endif @@ -142,10 +153,44 @@ opus_uint32 opus_cpu_capabilities(void) # endif } +#if defined(OPUS_ARM_PRESUME_AARCH64_NEON_INTR) + flags |= OPUS_CPU_ARM_EDSP_FLAG | OPUS_CPU_ARM_MEDIA_FLAG | OPUS_CPU_ARM_NEON_FLAG; +# if defined(OPUS_ARM_PRESUME_DOTPROD) + flags |= OPUS_CPU_ARM_DOTPROD_FLAG; +# endif +#endif + fclose(cpuinfo); } return flags; } + +#elif defined(__APPLE__) +#include +#include + +opus_uint32 opus_cpu_capabilities(void) +{ + opus_uint32 flags = 0; + +#if defined(OPUS_ARM_MAY_HAVE_DOTPROD) + size_t size = sizeof(uint32_t); + uint32_t value = 0; + if (!sysctlbyname("hw.optional.arm.FEAT_DotProd", &value, &size, NULL, 0) && value) + { + flags |= OPUS_CPU_ARM_DOTPROD_FLAG; + } +#endif + +#if defined(OPUS_ARM_PRESUME_AARCH64_NEON_INTR) + flags |= OPUS_CPU_ARM_EDSP_FLAG | OPUS_CPU_ARM_MEDIA_FLAG | OPUS_CPU_ARM_NEON_FLAG; +# if defined(OPUS_ARM_PRESUME_DOTPROD) + flags |= OPUS_CPU_ARM_DOTPROD_FLAG; +# endif +#endif + return flags; +} + #else /* The feature registers which can tell us what the processor supports are * accessible in priveleged modes only, so we can't have a general user-space @@ -154,7 +199,7 @@ opus_uint32 opus_cpu_capabilities(void) "your platform. Reconfigure with --disable-rtcd (or send patches)." #endif -int opus_select_arch(void) +static int opus_select_arch_impl(void) { opus_uint32 flags = opus_cpu_capabilities(); int arch = 0; @@ -178,8 +223,21 @@ int opus_select_arch(void) } arch++; - celt_assert(arch == OPUS_ARCH_ARM_NEON); + if(!(flags & OPUS_CPU_ARM_DOTPROD_FLAG)) { + celt_assert(arch == OPUS_ARCH_ARM_NEON); + return arch; + } + arch++; + + celt_assert(arch == OPUS_ARCH_ARM_DOTPROD); return arch; } +int opus_select_arch(void) { + int arch = opus_select_arch_impl(); +#ifdef FUZZING + arch = rand()%(arch+1); +#endif + return arch; +} #endif diff --git a/opus/celt/arm/armcpu.h b/opus/celt/arm/armcpu.h index 820262ff..6d5803d8 100644 --- a/opus/celt/arm/armcpu.h +++ b/opus/celt/arm/armcpu.h @@ -46,6 +46,12 @@ # define MAY_HAVE_NEON(name) MAY_HAVE_MEDIA(name) # endif +# if defined(OPUS_ARM_MAY_HAVE_DOTPROD) +# define MAY_HAVE_DOTPROD(name) name ## _dotprod +# else +# define MAY_HAVE_DOTPROD(name) MAY_HAVE_NEON(name) +# endif + # if defined(OPUS_ARM_PRESUME_EDSP) # define PRESUME_EDSP(name) name ## _edsp # else @@ -64,6 +70,12 @@ # define PRESUME_NEON(name) PRESUME_MEDIA(name) # endif +# if defined(OPUS_ARM_PRESUME_DOTPROD) +# define PRESUME_DOTPROD(name) name ## _dotprod +# else +# define PRESUME_DOTPROD(name) PRESUME_NEON(name) +# endif + # if defined(OPUS_HAVE_RTCD) int opus_select_arch(void); @@ -71,6 +83,7 @@ int opus_select_arch(void); #define OPUS_ARCH_ARM_EDSP (1) #define OPUS_ARCH_ARM_MEDIA (2) #define OPUS_ARCH_ARM_NEON (3) +#define OPUS_ARCH_ARM_DOTPROD (4) # endif diff --git a/opus/celt/arm/celt_neon_intr.c b/opus/celt/arm/celt_neon_intr.c index effda769..250f8362 100644 --- a/opus/celt/arm/celt_neon_intr.c +++ b/opus/celt/arm/celt_neon_intr.c @@ -38,6 +38,8 @@ #include "../pitch.h" #if defined(FIXED_POINT) +#include + void xcorr_kernel_neon_fixed(const opus_val16 * x, const opus_val16 * y, opus_val32 sum[4], int len) { int j; @@ -47,7 +49,10 @@ void xcorr_kernel_neon_fixed(const opus_val16 * x, const opus_val16 * y, opus_va int16x4_t y0 = vld1_s16(y); y += 4; - for (j = 0; j + 8 <= len; j += 8) + /* This loop loads one y value more than we actually need. + Therefore we have to stop as soon as there are 8 or fewer samples left + (instead of 7), to avoid reading past the end of the array. */ + for (j = 0; j + 8 < len; j += 8) { /* Load x[0...7] */ int16x8_t xx = vld1q_s16(x); @@ -80,23 +85,79 @@ void xcorr_kernel_neon_fixed(const opus_val16 * x, const opus_val16 * y, opus_va x += 8; y += 8; } - - for (; j < len; j++) - { - int16x4_t x0 = vld1_dup_s16(x); /* load next x */ + if (j + 4 < len) { + /* Load x[0...3] */ + int16x4_t x0 = vld1_s16(x); + /* Load y[4...7] */ + int16x4_t y4 = vld1_s16(y); + int32x4_t a0 = vmlal_lane_s16(a, y0, x0, 0); + int16x4_t y1 = vext_s16(y0, y4, 1); + int32x4_t a1 = vmlal_lane_s16(a0, y1, x0, 1); + int16x4_t y2 = vext_s16(y0, y4, 2); + int32x4_t a2 = vmlal_lane_s16(a1, y2, x0, 2); + int16x4_t y3 = vext_s16(y0, y4, 3); + int32x4_t a3 = vmlal_lane_s16(a2, y3, x0, 3); + y0 = y4; + a = a3; + x += 4; + y += 4; + j += 4; + } + if (j + 2 < len) { + /* Load x[0...1] */ + int16x4x2_t xx = vld2_dup_s16(x); + int16x4_t x0 = xx.val[0]; + int16x4_t x1 = xx.val[1]; + /* Load y[4...5]. + We would like to use vld1_dup_s32(), but casting the pointer would + break strict aliasing rules and potentially have alignment issues. + Fortunately the compiler seems capable of translating this memcpy() + and vdup_n_s32() into the equivalent vld1_dup_s32().*/ + int32_t yy; + memcpy(&yy, y, sizeof(yy)); + int16x4_t y4 = vreinterpret_s16_s32(vdup_n_s32(yy)); int32x4_t a0 = vmlal_s16(a, y0, x0); - - int16x4_t y4 = vld1_dup_s16(y); /* load next y */ - y0 = vext_s16(y0, y4, 1); + int16x4_t y1 = vext_s16(y0, y4, 1); + /* Replace bottom copy of {y[5], y[4]} in y4 with {y[3], y[2]} from y0, + using VSRI instead of VEXT, since it's a data-processing + instruction. */ + y0 = vreinterpret_s16_s64(vsri_n_s64(vreinterpret_s64_s16(y4), + vreinterpret_s64_s16(y0), 32)); + int32x4_t a1 = vmlal_s16(a0, y1, x1); + a = a1; + x += 2; + y += 2; + j += 2; + } + if (j + 1 < len) { + /* Load next x. */ + int16x4_t x0 = vld1_dup_s16(x); + int32x4_t a0 = vmlal_s16(a, y0, x0); + /* Load last y. */ + int16x4_t y4 = vld1_dup_s16(y); + y0 = vreinterpret_s16_s64(vsri_n_s64(vreinterpret_s64_s16(y4), + vreinterpret_s64_s16(y0), 16)); a = a0; x++; - y++; } - - vst1q_s32(sum, a); + /* Load last x. */ + int16x4_t x0 = vld1_dup_s16(x); + int32x4_t a0 = vmlal_s16(a, y0, x0); + vst1q_s32(sum, a0); } #else + +#if defined(__ARM_FEATURE_FMA) && defined(__ARM_ARCH_ISA_A64) +/* If we can, force the compiler to use an FMA instruction rather than break + * vmlaq_f32() into fmul/fadd. */ +#ifdef vmlaq_lane_f32 +#undef vmlaq_lane_f32 +#endif +#define vmlaq_lane_f32(a,b,c,lane) vfmaq_lane_f32(a,b,c,lane) +#endif + + /* * Function: xcorr_kernel_neon_float * --------------------------------- diff --git a/opus/celt/arm/pitch_neon_intr.c b/opus/celt/arm/pitch_neon_intr.c index 1ac38c43..43885f52 100644 --- a/opus/celt/arm/pitch_neon_intr.c +++ b/opus/celt/arm/pitch_neon_intr.c @@ -130,6 +130,13 @@ void dual_inner_prod_neon(const opus_val16 *x, const opus_val16 *y01, const opus /* ========================================================================== */ +#ifdef __ARM_FEATURE_FMA +/* If we can, force the compiler to use an FMA instruction rather than break + vmlaq_f32() into fmul/fadd. */ +#define vmlaq_f32(a,b,c) vfmaq_f32(a,b,c) +#endif + + #ifdef OPUS_CHECK_ASM /* This part of code simulates floating-point NEON operations. */ @@ -137,22 +144,27 @@ void dual_inner_prod_neon(const opus_val16 *x, const opus_val16 *y01, const opus /* celt_inner_prod_neon_float_c_simulation() simulates the floating-point */ /* operations of celt_inner_prod_neon(), and both functions should have bit */ /* exact output. */ -static opus_val32 celt_inner_prod_neon_float_c_simulation(const opus_val16 *x, const opus_val16 *y, int N) +static opus_val32 celt_inner_prod_neon_float_c_simulation(const opus_val16 *x, const opus_val16 *y, float *err, int N) { int i; + *err = 0; opus_val32 xy, xy0 = 0, xy1 = 0, xy2 = 0, xy3 = 0; for (i = 0; i < N - 3; i += 4) { xy0 = MAC16_16(xy0, x[i + 0], y[i + 0]); xy1 = MAC16_16(xy1, x[i + 1], y[i + 1]); xy2 = MAC16_16(xy2, x[i + 2], y[i + 2]); xy3 = MAC16_16(xy3, x[i + 3], y[i + 3]); + *err += ABS32(xy0)+ABS32(xy1)+ABS32(xy2)+ABS32(xy3); } xy0 += xy2; xy1 += xy3; xy = xy0 + xy1; + *err += ABS32(xy1)+ABS32(xy0)+ABS32(xy); for (; i < N; i++) { xy = MAC16_16(xy, x[i], y[i]); + *err += ABS32(xy); } + *err = *err*2e-7 + N*1e-37; return xy; } @@ -160,32 +172,10 @@ static opus_val32 celt_inner_prod_neon_float_c_simulation(const opus_val16 *x, c /* operations of dual_inner_prod_neon(), and both functions should have bit */ /* exact output. */ static void dual_inner_prod_neon_float_c_simulation(const opus_val16 *x, const opus_val16 *y01, const opus_val16 *y02, - int N, opus_val32 *xy1, opus_val32 *xy2) + int N, opus_val32 *xy1, opus_val32 *xy2, float *err) { - int i; - opus_val32 xy01, xy02, xy01_0 = 0, xy01_1 = 0, xy01_2 = 0, xy01_3 = 0, xy02_0 = 0, xy02_1 = 0, xy02_2 = 0, xy02_3 = 0; - for (i = 0; i < N - 3; i += 4) { - xy01_0 = MAC16_16(xy01_0, x[i + 0], y01[i + 0]); - xy01_1 = MAC16_16(xy01_1, x[i + 1], y01[i + 1]); - xy01_2 = MAC16_16(xy01_2, x[i + 2], y01[i + 2]); - xy01_3 = MAC16_16(xy01_3, x[i + 3], y01[i + 3]); - xy02_0 = MAC16_16(xy02_0, x[i + 0], y02[i + 0]); - xy02_1 = MAC16_16(xy02_1, x[i + 1], y02[i + 1]); - xy02_2 = MAC16_16(xy02_2, x[i + 2], y02[i + 2]); - xy02_3 = MAC16_16(xy02_3, x[i + 3], y02[i + 3]); - } - xy01_0 += xy01_2; - xy02_0 += xy02_2; - xy01_1 += xy01_3; - xy02_1 += xy02_3; - xy01 = xy01_0 + xy01_1; - xy02 = xy02_0 + xy02_1; - for (; i < N; i++) { - xy01 = MAC16_16(xy01, x[i], y01[i]); - xy02 = MAC16_16(xy02, x[i], y02[i]); - } - *xy1 = xy01; - *xy2 = xy02; + *xy1 = celt_inner_prod_neon_float_c_simulation(x, y01, &err[0], N); + *xy2 = celt_inner_prod_neon_float_c_simulation(x, y02, &err[1], N); } #endif /* OPUS_CHECK_ASM */ @@ -225,7 +215,12 @@ opus_val32 celt_inner_prod_neon(const opus_val16 *x, const opus_val16 *y, int N) } #ifdef OPUS_CHECK_ASM - celt_assert(ABS32(celt_inner_prod_neon_float_c_simulation(x, y, N) - xy) <= VERY_SMALL); + { + float err, res; + res = celt_inner_prod_neon_float_c_simulation(x, y, &err, N); + /*if (ABS32(res - xy) > err) fprintf(stderr, "%g %g %g\n", res, xy, err);*/ + celt_assert(ABS32(res - xy) <= err); + } #endif return xy; @@ -280,9 +275,12 @@ void dual_inner_prod_neon(const opus_val16 *x, const opus_val16 *y01, const opus #ifdef OPUS_CHECK_ASM { opus_val32 xy1_c, xy2_c; - dual_inner_prod_neon_float_c_simulation(x, y01, y02, N, &xy1_c, &xy2_c); - celt_assert(ABS32(xy1_c - *xy1) <= VERY_SMALL); - celt_assert(ABS32(xy2_c - *xy2) <= VERY_SMALL); + float err[2]; + dual_inner_prod_neon_float_c_simulation(x, y01, y02, N, &xy1_c, &xy2_c, err); + /*if (ABS32(xy1_c - *xy1) > err[0]) fprintf(stderr, "dual1 fail: %g %g %g\n", xy1_c, *xy1, err[0]); + if (ABS32(xy2_c - *xy2) > err[1]) fprintf(stderr, "dual2 fail: %g %g %g\n", xy2_c, *xy2, err[1]);*/ + celt_assert(ABS32(xy1_c - *xy1) <= err[0]); + celt_assert(ABS32(xy2_c - *xy2) <= err[1]); } #endif } diff --git a/opus/celt/bands.c b/opus/celt/bands.c index 2702963c..6785e08e 100644 --- a/opus/celt/bands.c +++ b/opus/celt/bands.c @@ -901,7 +901,7 @@ static void compute_theta(struct band_ctx *ctx, struct split_ctx *sctx, sctx->itheta = itheta; sctx->qalloc = qalloc; } -static unsigned quant_band_n1(struct band_ctx *ctx, celt_norm *X, celt_norm *Y, int b, +static unsigned quant_band_n1(struct band_ctx *ctx, celt_norm *X, celt_norm *Y, celt_norm *lowband_out) { int c; @@ -926,7 +926,6 @@ static unsigned quant_band_n1(struct band_ctx *ctx, celt_norm *X, celt_norm *Y, sign = ec_dec_bits(ec, 1); } ctx->remaining_bits -= 1<resynth) x[0] = sign ? -NORM_SCALING : NORM_SCALING; @@ -1134,7 +1133,7 @@ static unsigned quant_band(struct band_ctx *ctx, celt_norm *X, /* Special case for one sample */ if (N==1) { - return quant_band_n1(ctx, X, NULL, b, lowband_out); + return quant_band_n1(ctx, X, NULL, lowband_out); } if (tf_change>0) @@ -1256,7 +1255,7 @@ static unsigned quant_band_stereo(struct band_ctx *ctx, celt_norm *X, celt_norm /* Special case for one sample */ if (N==1) { - return quant_band_n1(ctx, X, Y, b, lowband_out); + return quant_band_n1(ctx, X, Y, lowband_out); } orig_fill = fill; @@ -1381,6 +1380,7 @@ static unsigned quant_band_stereo(struct band_ctx *ctx, celt_norm *X, celt_norm return cm; } +#ifndef DISABLE_UPDATE_DRAFT static void special_hybrid_folding(const CELTMode *m, celt_norm *norm, celt_norm *norm2, int start, int M, int dual_stereo) { int n1, n2; @@ -1393,6 +1393,7 @@ static void special_hybrid_folding(const CELTMode *m, celt_norm *norm, celt_norm if (dual_stereo) OPUS_COPY(&norm2[n1], &norm2[2*n1 - n2], n2-n1); } +#endif void quant_all_bands(int encode, const CELTMode *m, int start, int end, celt_norm *X_, celt_norm *Y_, unsigned char *collapse_masks, @@ -1449,7 +1450,7 @@ void quant_all_bands(int encode, const CELTMode *m, int start, int end, if (encode && resynth) lowband_scratch = _lowband_scratch; else - lowband_scratch = X_+M*eBands[m->nbEBands-1]; + lowband_scratch = X_+M*eBands[m->effEBands-1]; ALLOC(X_save, resynth_alloc, celt_norm); ALLOC(Y_save, resynth_alloc, celt_norm); ALLOC(X_save2, resynth_alloc, celt_norm); diff --git a/opus/celt/celt.h b/opus/celt/celt.h index 24b6b2b5..2f501951 100644 --- a/opus/celt/celt.h +++ b/opus/celt/celt.h @@ -42,6 +42,10 @@ #include "entdec.h" #include "arch.h" +#ifdef ENABLE_DEEP_PLC +#include "lpcnet.h" +#endif + #ifdef __cplusplus extern "C" { #endif @@ -149,6 +153,13 @@ int celt_decoder_get_size(int channels); int celt_decoder_init(CELTDecoder *st, opus_int32 sampling_rate, int channels); +int celt_decode_with_ec_dred(CELTDecoder * OPUS_RESTRICT st, const unsigned char *data, + int len, opus_val16 * OPUS_RESTRICT pcm, int frame_size, ec_dec *dec, int accum +#ifdef ENABLE_DEEP_PLC + ,LPCNetPLCState *lpcnet +#endif + ); + int celt_decode_with_ec(OpusCustomDecoder * OPUS_RESTRICT st, const unsigned char *data, int len, opus_val16 * OPUS_RESTRICT pcm, int frame_size, ec_dec *dec, int accum); @@ -225,23 +236,13 @@ void comb_filter(opus_val32 *y, opus_val32 *x, int T0, int T1, int N, opus_val16 g0, opus_val16 g1, int tapset0, int tapset1, const opus_val16 *window, int overlap, int arch); -#ifdef NON_STATIC_COMB_FILTER_CONST_C -void comb_filter_const_c(opus_val32 *y, opus_val32 *x, int T, int N, - opus_val16 g10, opus_val16 g11, opus_val16 g12); -#endif - -#ifndef OVERRIDE_COMB_FILTER_CONST -# define comb_filter_const(y, x, T, N, g10, g11, g12, arch) \ - ((void)(arch),comb_filter_const_c(y, x, T, N, g10, g11, g12)) -#endif - void init_caps(const CELTMode *m,int *cap,int LM,int C); #ifdef RESYNTH -void deemphasis(celt_sig *in[], opus_val16 *pcm, int N, int C, int downsample, const opus_val16 *coef, celt_sig *mem); +void deemphasis(celt_sig *in[], opus_val16 *pcm, int N, int C, int downsample, const opus_val16 *coef, celt_sig *mem, int accum); void celt_synthesis(const CELTMode *mode, celt_norm *X, celt_sig * out_syn[], opus_val16 *oldBandE, int start, int effEnd, int C, int CC, int isTransient, - int LM, int downsample, int silence); + int LM, int downsample, int silence, int arch); #endif #ifdef __cplusplus diff --git a/opus/celt/celt_decoder.c b/opus/celt/celt_decoder.c index e6efce93..743c2031 100644 --- a/opus/celt/celt_decoder.c +++ b/opus/celt/celt_decoder.c @@ -51,6 +51,11 @@ #include "celt_lpc.h" #include "vq.h" +#ifdef ENABLE_DEEP_PLC +#include "lpcnet.h" +#include "lpcnet_private.h" +#endif + /* The maximum pitch lag to allow in the pitch-based PLC. It's possible to save CPU time in the PLC pitch search by making this smaller than MAX_PERIOD. The current value corresponds to a pitch of 66.67 Hz. */ @@ -59,9 +64,6 @@ pitch of 480 Hz. */ #define PLC_PITCH_LAG_MIN (100) -#if defined(SMALL_FOOTPRINT) && defined(FIXED_POINT) -#define NORM_ALIASING_HACK -#endif /**********************************************************************/ /* */ /* DECODER */ @@ -69,6 +71,9 @@ /**********************************************************************/ #define DECODE_BUFFER_SIZE 2048 +#define PLC_UPDATE_FRAMES 4 +#define PLC_UPDATE_SAMPLES (PLC_UPDATE_FRAMES*FRAME_SIZE) + /** Decoder state @brief Decoder state */ @@ -82,6 +87,7 @@ struct OpusCustomDecoder { int start, end; int signalling; int disable_inv; + int complexity; int arch; /* Everything beyond this point gets cleared on a reset */ @@ -90,7 +96,7 @@ struct OpusCustomDecoder { opus_uint32 rng; int error; int last_pitch_index; - int loss_count; + int loss_duration; int skip_plc; int postfilter_period; int postfilter_period_old; @@ -98,11 +104,18 @@ struct OpusCustomDecoder { opus_val16 postfilter_gain_old; int postfilter_tapset; int postfilter_tapset_old; + int prefilter_and_fold; celt_sig preemph_memD[2]; +#ifdef ENABLE_DEEP_PLC + opus_int16 plc_pcm[PLC_UPDATE_SAMPLES]; + int plc_fill; + float plc_preemphasis_mem; +#endif + celt_sig _decode_mem[1]; /* Size = channels*(DECODE_BUFFER_SIZE+mode->overlap) */ - /* opus_val16 lpc[], Size = channels*LPC_ORDER */ + /* opus_val16 lpc[], Size = channels*CELT_LPC_ORDER */ /* opus_val16 oldEBands[], Size = 2*mode->nbEBands */ /* opus_val16 oldLogE[], Size = 2*mode->nbEBands */ /* opus_val16 oldLogE2[], Size = 2*mode->nbEBands */ @@ -117,13 +130,19 @@ void validate_celt_decoder(CELTDecoder *st) #ifndef CUSTOM_MODES celt_assert(st->mode == opus_custom_mode_create(48000, 960, NULL)); celt_assert(st->overlap == 120); + celt_assert(st->end <= 21); +#else +/* From Section 4.3 in the spec: "The normal CELT layer uses 21 of those bands, + though Opus Custom (see Section 6.2) may use a different number of bands" + + Check if it's within the maximum number of Bark frequency bands instead */ + celt_assert(st->end <= 25); #endif celt_assert(st->channels == 1 || st->channels == 2); celt_assert(st->stream_channels == 1 || st->stream_channels == 2); celt_assert(st->downsample > 0); celt_assert(st->start == 0 || st->start == 17); celt_assert(st->start < st->end); - celt_assert(st->end <= 21); #ifdef OPUS_ARCHMASK celt_assert(st->arch >= 0); celt_assert(st->arch <= OPUS_ARCHMASK); @@ -151,7 +170,7 @@ OPUS_CUSTOM_NOSTATIC int opus_custom_decoder_get_size(const CELTMode *mode, int { int size = sizeof(struct CELTDecoder) + (channels*(DECODE_BUFFER_SIZE+mode->overlap)-1)*sizeof(celt_sig) - + channels*LPC_ORDER*sizeof(opus_val16) + + channels*CELT_LPC_ORDER*sizeof(opus_val16) + 4*2*mode->nbEBands*sizeof(opus_val16); return size; } @@ -493,7 +512,100 @@ static int celt_plc_pitch_search(celt_sig *decode_mem[2], int C, int arch) return pitch_index; } -static void celt_decode_lost(CELTDecoder * OPUS_RESTRICT st, int N, int LM) +static void prefilter_and_fold(CELTDecoder * OPUS_RESTRICT st, int N) +{ + int c; + int CC; + int i; + int overlap; + celt_sig *decode_mem[2]; + const OpusCustomMode *mode; + VARDECL(opus_val32, etmp); + mode = st->mode; + overlap = st->overlap; + CC = st->channels; + ALLOC(etmp, overlap, opus_val32); + c=0; do { + decode_mem[c] = st->_decode_mem + c*(DECODE_BUFFER_SIZE+overlap); + } while (++cpostfilter_period_old, st->postfilter_period, overlap, + -st->postfilter_gain_old, -st->postfilter_gain, + st->postfilter_tapset_old, st->postfilter_tapset, NULL, 0, st->arch); + + /* Simulate TDAC on the concealed audio so that it blends with the + MDCT of the next frame. */ + for (i=0;iwindow[i], etmp[overlap-1-i]) + + MULT16_32_Q15(mode->window[overlap-i-1], etmp[i]); + } + } while (++cfec_read_pos; + tmp_fec_skip = lpcnet->fec_skip; + for (i=0;ifec_read_pos = tmp_read_post; + lpcnet->fec_skip = tmp_fec_skip; +} +#endif + +static void celt_decode_lost(CELTDecoder * OPUS_RESTRICT st, int N, int LM +#ifdef ENABLE_DEEP_PLC + ,LPCNetPLCState *lpcnet +#endif + ) { int c; int i; @@ -506,7 +618,7 @@ static void celt_decode_lost(CELTDecoder * OPUS_RESTRICT st, int N, int LM) int nbEBands; int overlap; int start; - int loss_count; + int loss_duration; int noise_based; const opus_int16 *eBands; SAVE_STACK; @@ -521,22 +633,22 @@ static void celt_decode_lost(CELTDecoder * OPUS_RESTRICT st, int N, int LM) out_syn[c] = decode_mem[c]+DECODE_BUFFER_SIZE-N; } while (++c_decode_mem+(DECODE_BUFFER_SIZE+overlap)*C); - oldBandE = lpc+C*LPC_ORDER; + oldBandE = lpc+C*CELT_LPC_ORDER; oldLogE = oldBandE + 2*nbEBands; oldLogE2 = oldLogE + 2*nbEBands; backgroundLogE = oldLogE2 + 2*nbEBands; - loss_count = st->loss_count; + loss_duration = st->loss_duration; start = st->start; - noise_based = loss_count >= 5 || start != 0 || st->skip_plc; +#ifdef ENABLE_DEEP_PLC + noise_based = start != 0 || (lpcnet->fec_fill_pos == 0 && (st->skip_plc || loss_duration >= 80)); +#else + noise_based = loss_duration >= 40 || start != 0 || st->skip_plc; +#endif if (noise_based) { /* Noise-based PLC/CNG */ -#ifdef NORM_ALIASING_HACK - celt_norm *X; -#else VARDECL(celt_norm, X); -#endif opus_uint32 seed; int end; int effEnd; @@ -544,16 +656,18 @@ static void celt_decode_lost(CELTDecoder * OPUS_RESTRICT st, int N, int LM) end = st->end; effEnd = IMAX(start, IMIN(end, mode->effEBands)); -#ifdef NORM_ALIASING_HACK - /* This is an ugly hack that breaks aliasing rules and would be easily broken, - but it saves almost 4kB of stack. */ - X = (celt_norm*)(out_syn[C-1]+overlap/2); -#else ALLOC(X, C*N, celt_norm); /**< Interleaved normalised MDCTs */ -#endif + c=0; do { + OPUS_MOVE(decode_mem[c], decode_mem[c]+N, + DECODE_BUFFER_SIZE-N+overlap); + } while (++cprefilter_and_fold) { + prefilter_and_fold(st, N); + } /* Energy decay */ - decay = loss_count==0 ? QCONST16(1.5f, DB_SHIFT) : QCONST16(.5f, DB_SHIFT); + decay = loss_duration==0 ? QCONST16(1.5f, DB_SHIFT) : QCONST16(.5f, DB_SHIFT); c=0; do { for (i=start;irng = seed; - c=0; do { - OPUS_MOVE(decode_mem[c], decode_mem[c]+N, - DECODE_BUFFER_SIZE-N+(overlap>>1)); - } while (++cdownsample, 0, st->arch); + st->prefilter_and_fold = 0; + /* Skip regular PLC until we get two consecutive packets. */ + st->skip_plc = 1; } else { int exc_length; /* Pitch-based PLC */ @@ -592,12 +704,14 @@ static void celt_decode_lost(CELTDecoder * OPUS_RESTRICT st, int N, int LM) opus_val16 *exc; opus_val16 fade = Q15ONE; int pitch_index; - VARDECL(opus_val32, etmp); VARDECL(opus_val16, _exc); VARDECL(opus_val16, fir_tmp); - if (loss_count == 0) + if (loss_duration == 0) { +#ifdef ENABLE_DEEP_PLC + if (lpcnet->loaded) update_plc_state(lpcnet, decode_mem, &st->plc_preemphasis_mem, C); +#endif st->last_pitch_index = pitch_index = celt_plc_pitch_search(decode_mem, C, st->arch); } else { pitch_index = st->last_pitch_index; @@ -608,10 +722,9 @@ static void celt_decode_lost(CELTDecoder * OPUS_RESTRICT st, int N, int LM) decaying signal, but we can't get more than MAX_PERIOD. */ exc_length = IMIN(2*pitch_index, MAX_PERIOD); - ALLOC(etmp, overlap, opus_val32); - ALLOC(_exc, MAX_PERIOD+LPC_ORDER, opus_val16); + ALLOC(_exc, MAX_PERIOD+CELT_LPC_ORDER, opus_val16); ALLOC(fir_tmp, exc_length, opus_val16); - exc = _exc+LPC_ORDER; + exc = _exc+CELT_LPC_ORDER; window = mode->window; c=0; do { opus_val16 decay; @@ -623,16 +736,16 @@ static void celt_decode_lost(CELTDecoder * OPUS_RESTRICT st, int N, int LM) int j; buf = decode_mem[c]; - for (i=0;iarch); + CELT_LPC_ORDER, MAX_PERIOD, st->arch); /* Add a noise floor of -40 dB. */ #ifdef FIXED_POINT ac[0] += SHR32(ac[0],13); @@ -640,7 +753,7 @@ static void celt_decode_lost(CELTDecoder * OPUS_RESTRICT st, int N, int LM) ac[0] *= 1.0001f; #endif /* Use lag windowing to stabilize the Levinson-Durbin recursion. */ - for (i=1;i<=LPC_ORDER;i++) + for (i=1;i<=CELT_LPC_ORDER;i++) { /*ac[i] *= exp(-.5*(2*M_PI*.002*i)*(2*M_PI*.002*i));*/ #ifdef FIXED_POINT @@ -649,7 +762,7 @@ static void celt_decode_lost(CELTDecoder * OPUS_RESTRICT st, int N, int LM) ac[i] -= ac[i]*(0.008f*0.008f)*i*i; #endif } - _celt_lpc(lpc+c*LPC_ORDER, ac, LPC_ORDER); + _celt_lpc(lpc+c*CELT_LPC_ORDER, ac, CELT_LPC_ORDER); #ifdef FIXED_POINT /* For fixed-point, apply bandwidth expansion until we can guarantee that no overflow can happen in the IIR filter. This means: @@ -657,13 +770,13 @@ static void celt_decode_lost(CELTDecoder * OPUS_RESTRICT st, int N, int LM) while (1) { opus_val16 tmp=Q15ONE; opus_val32 sum=QCONST16(1., SIG_SHIFT); - for (i=0;iarch); + celt_fir(exc+MAX_PERIOD-exc_length, lpc+c*CELT_LPC_ORDER, + fir_tmp, exc_length, CELT_LPC_ORDER, st->arch); OPUS_COPY(exc+MAX_PERIOD-exc_length, fir_tmp, exc_length); } @@ -726,21 +839,21 @@ static void celt_decode_lost(CELTDecoder * OPUS_RESTRICT st, int N, int LM) exc[extrapolation_offset+j])), SIG_SHIFT); /* Compute the energy of the previously decoded signal whose excitation we're copying. */ - tmp = ROUND16( + tmp = SROUND16( buf[DECODE_BUFFER_SIZE-MAX_PERIOD-N+extrapolation_offset+j], SIG_SHIFT); S1 += SHR32(MULT16_16(tmp, tmp), 10); } { - opus_val16 lpc_mem[LPC_ORDER]; + opus_val16 lpc_mem[CELT_LPC_ORDER]; /* Copy the last decoded samples (prior to the overlap region) to synthesis filter memory so we can have a continuous signal. */ - for (i=0;iarch); #ifdef FIXED_POINT for (i=0; i < extrapolation_len; i++) @@ -755,7 +868,7 @@ static void celt_decode_lost(CELTDecoder * OPUS_RESTRICT st, int N, int LM) opus_val32 S2=0; for (i=0;ipostfilter_period, st->postfilter_period, overlap, - -st->postfilter_gain, -st->postfilter_gain, - st->postfilter_tapset, st->postfilter_tapset, NULL, 0, st->arch); - - /* Simulate TDAC on the concealed audio so that it blends with the - MDCT of the next frame. */ - for (i=0;iloaded && (st->complexity >= 5 || lpcnet->fec_fill_pos > 0)) { + float overlap_mem; + int samples_needed16k; + celt_sig *buf; + VARDECL(float, buf_copy); + buf = decode_mem[0]; + ALLOC(buf_copy, C*overlap, float); + c=0; do { + OPUS_COPY(buf_copy+c*overlap, &decode_mem[c][DECODE_BUFFER_SIZE-N], overlap); + } while (++cplc_fill = 0; + } + while (st->plc_fill < samples_needed16k) { + lpcnet_plc_conceal(lpcnet, &st->plc_pcm[st->plc_fill]); + st->plc_fill += FRAME_SIZE; + } + /* Resample to 48 kHz. */ + for (i=0;i<(N+overlap)/3;i++) { + int j; + float sum; + for (sum=0, j=0;j<17;j++) sum += 3*st->plc_pcm[i+j]*sinc_filter[3*j]; + buf[DECODE_BUFFER_SIZE-N+3*i] = sum; + for (sum=0, j=0;j<16;j++) sum += 3*st->plc_pcm[i+j+1]*sinc_filter[3*j+2]; + buf[DECODE_BUFFER_SIZE-N+3*i+1] = sum; + for (sum=0, j=0;j<16;j++) sum += 3*st->plc_pcm[i+j+1]*sinc_filter[3*j+1]; + buf[DECODE_BUFFER_SIZE-N+3*i+2] = sum; + } + OPUS_MOVE(st->plc_pcm, &st->plc_pcm[N/3], st->plc_fill-N/3); + st->plc_fill -= N/3; + for (i=0;iplc_preemphasis_mem; + st->plc_preemphasis_mem = tmp; + } + overlap_mem = st->plc_preemphasis_mem; + for (i=0;iprefilter_and_fold = 1; } - st->loss_count = loss_count+1; + /* Saturate to soemthing large to avoid wrap-around. */ + st->loss_duration = IMIN(10000, loss_duration+(1<downsample; lpc = (opus_val16*)(st->_decode_mem+(DECODE_BUFFER_SIZE+overlap)*CC); - oldBandE = lpc+CC*LPC_ORDER; + oldBandE = lpc+CC*CELT_LPC_ORDER; oldLogE = oldBandE + 2*nbEBands; oldLogE2 = oldLogE + 2*nbEBands; backgroundLogE = oldLogE2 + 2*nbEBands; @@ -928,15 +1085,25 @@ int celt_decode_with_ec(CELTDecoder * OPUS_RESTRICT st, const unsigned char *dat if (data == NULL || len<=1) { - celt_decode_lost(st, N, LM); + celt_decode_lost(st, N, LM +#ifdef ENABLE_DEEP_PLC + , lpcnet +#endif + ); deemphasis(out_syn, pcm, N, CC, st->downsample, mode->preemph, st->preemph_memD, accum); RESTORE_STACK; return frame_size/st->downsample; } +#ifdef ENABLE_DEEP_PLC + else { + /* FIXME: This is a bit of a hack just to make sure opus_decode_native() knows we're no longer in PLC. */ + if (lpcnet) lpcnet->blend = 0; + } +#endif /* Check if there are at least two packets received consecutively before * turning on the pitch-based PLC */ - st->skip_plc = st->loss_count != 0; + if (st->loss_duration == 0) st->skip_plc = 0; if (dec == NULL) { @@ -999,6 +1166,36 @@ int celt_decode_with_ec(CELTDecoder * OPUS_RESTRICT st, const unsigned char *dat /* Decode the global flags (first symbols in the stream) */ intra_ener = tell+3<=total_bits ? ec_dec_bit_logp(dec, 3) : 0; + /* If recovering from packet loss, make sure we make the energy prediction safe to reduce the + risk of getting loud artifacts. */ + if (!intra_ener && st->loss_duration != 0) { + c=0; do + { + opus_val16 safety = 0; + int missing = IMIN(10, st->loss_duration>>LM); + if (LM==0) safety = QCONST16(1.5f,DB_SHIFT); + else if (LM==1) safety = QCONST16(.5f,DB_SHIFT); + for (i=start;iprefilter_and_fold) { + prefilter_and_fold(st, N); + } celt_synthesis(mode, X, out_syn, oldBandE, start, effEnd, C, CC, isTransient, LM, st->downsample, silence, st->arch); @@ -1134,25 +1327,21 @@ int celt_decode_with_ec(CELTDecoder * OPUS_RESTRICT st, const unsigned char *dat if (C==1) OPUS_COPY(&oldBandE[nbEBands], oldBandE, nbEBands); - /* In case start or end were to change */ if (!isTransient) { - opus_val16 max_background_increase; OPUS_COPY(oldLogE2, oldLogE, 2*nbEBands); OPUS_COPY(oldLogE, oldBandE, 2*nbEBands); - /* In normal circumstances, we only allow the noise floor to increase by - up to 2.4 dB/second, but when we're in DTX, we allow up to 6 dB - increase for each update.*/ - if (st->loss_count < 10) - max_background_increase = M*QCONST16(0.001f,DB_SHIFT); - else - max_background_increase = QCONST16(1.f,DB_SHIFT); - for (i=0;i<2*nbEBands;i++) - backgroundLogE[i] = MIN16(backgroundLogE[i] + max_background_increase, oldBandE[i]); } else { for (i=0;i<2*nbEBands;i++) oldLogE[i] = MIN16(oldLogE[i], oldBandE[i]); } + /* In normal circumstances, we only allow the noise floor to increase by + up to 2.4 dB/second, but when we're in DTX we give the weight of + all missing packets to the update packet. */ + max_background_increase = IMIN(160, st->loss_duration+M)*QCONST16(0.001f,DB_SHIFT); + for (i=0;i<2*nbEBands;i++) + backgroundLogE[i] = MIN16(backgroundLogE[i] + max_background_increase, oldBandE[i]); + /* In case start or end were to change */ c=0; do { for (i=0;irng = dec->rng; deemphasis(out_syn, pcm, N, CC, st->downsample, mode->preemph, st->preemph_memD, accum); - st->loss_count = 0; + st->loss_duration = 0; + st->prefilter_and_fold = 0; RESTORE_STACK; if (ec_tell(dec) > 8*len) return OPUS_INTERNAL_ERROR; @@ -1178,6 +1368,15 @@ int celt_decode_with_ec(CELTDecoder * OPUS_RESTRICT st, const unsigned char *dat return frame_size/st->downsample; } +int celt_decode_with_ec(CELTDecoder * OPUS_RESTRICT st, const unsigned char *data, + int len, opus_val16 * OPUS_RESTRICT pcm, int frame_size, ec_dec *dec, int accum) +{ + return celt_decode_with_ec_dred(st, data, len, pcm, frame_size, dec, accum +#ifdef ENABLE_DEEP_PLC + , NULL +#endif + ); +} #ifdef CUSTOM_MODES @@ -1251,6 +1450,26 @@ int opus_custom_decoder_ctl(CELTDecoder * OPUS_RESTRICT st, int request, ...) va_start(ap, request); switch (request) { + case OPUS_SET_COMPLEXITY_REQUEST: + { + opus_int32 value = va_arg(ap, opus_int32); + if(value<0 || value>10) + { + goto bad_arg; + } + st->complexity = value; + } + break; + case OPUS_GET_COMPLEXITY_REQUEST: + { + opus_int32 *value = va_arg(ap, opus_int32*); + if (!value) + { + goto bad_arg; + } + *value = st->complexity; + } + break; case CELT_SET_START_BAND_REQUEST: { opus_int32 value = va_arg(ap, opus_int32); @@ -1297,7 +1516,7 @@ int opus_custom_decoder_ctl(CELTDecoder * OPUS_RESTRICT st, int request, ...) int i; opus_val16 *lpc, *oldBandE, *oldLogE, *oldLogE2; lpc = (opus_val16*)(st->_decode_mem+(DECODE_BUFFER_SIZE+st->overlap)*st->channels); - oldBandE = lpc+st->channels*LPC_ORDER; + oldBandE = lpc+st->channels*CELT_LPC_ORDER; oldLogE = oldBandE + 2*st->mode->nbEBands; oldLogE2 = oldLogE + 2*st->mode->nbEBands; OPUS_CLEAR((char*)&st->DECODER_RESET_START, diff --git a/opus/celt/celt_encoder.c b/opus/celt/celt_encoder.c index 44cb0850..7f32a801 100644 --- a/opus/celt/celt_encoder.c +++ b/opus/celt/celt_encoder.c @@ -281,6 +281,9 @@ static int transient_analysis(const opus_val32 * OPUS_RESTRICT in, int len, int /* High-pass filter: (1 - 2*z^-1 + z^-2) / (1 - z^-1 + .5*z^-2) */ for (i=0;i 50 && LM>=1 && !lfe) + /* Make sure that dynamic allocation can't make us bust the budget. + We enable the feature starting at 24 kb/s for 20-ms frames + and 96 kb/s for 2.5 ms frames. */ + if (effectiveBytes >= (30 + 5*LM) && !lfe) { int last=0; c=0;do @@ -1042,30 +1057,38 @@ static opus_val16 dynalloc_analysis(const opus_val16 *bandLogE, const opus_val16 opus_val16 offset; opus_val16 tmp; opus_val16 *f; + OPUS_COPY(bandLogE3, &bandLogE2[c*nbEBands], end); + if (LM==0) { + /* For 2.5 ms frames, the first 8 bands have just one bin, so the + energy is highly unreliable (high variance). For that reason, + we take the max with the previous energy so that at least 2 bins + are getting used. */ + for (i=0;i bandLogE2[c*nbEBands+i-1]+QCONST16(.5f,DB_SHIFT)) + if (bandLogE3[i] > bandLogE3[i-1]+QCONST16(.5f,DB_SHIFT)) last=i; - f[i] = MIN16(f[i-1]+QCONST16(1.5f,DB_SHIFT), bandLogE2[c*nbEBands+i]); + f[i] = MIN16(f[i-1]+QCONST16(1.5f,DB_SHIFT), bandLogE3[i]); } for (i=last-1;i>=0;i--) - f[i] = MIN16(f[i], MIN16(f[i+1]+QCONST16(2.f,DB_SHIFT), bandLogE2[c*nbEBands+i])); + f[i] = MIN16(f[i], MIN16(f[i+1]+QCONST16(2.f,DB_SHIFT), bandLogE3[i])); /* Combine with a median filter to avoid dynalloc triggering unnecessarily. The "offset" value controls how conservative we are -- a higher offset reduces the impact of the median filter and makes dynalloc use more bits. */ offset = QCONST16(1.f, DB_SHIFT); for (i=2;ibitrate*frame_size; if (tell>1) - tmp += tell; + tmp += tell*mode->Fs; if (st->bitrate!=OPUS_BITRATE_MAX) + { nbCompressedBytes = IMAX(2, IMIN(nbCompressedBytes, (tmp+4*mode->Fs)/(8*mode->Fs)-!!st->signalling)); + ec_enc_shrink(enc, nbCompressedBytes); + } effectiveBytes = nbCompressedBytes - nbFilledBytes; } - equiv_rate = ((opus_int32)nbCompressedBytes*8*50 >> (3-LM)) - (40*C+20)*((400>>LM) - 50); + equiv_rate = ((opus_int32)nbCompressedBytes*8*50 << (3-LM)) - (40*C+20)*((400>>LM) - 50); if (st->bitrate != OPUS_BITRATE_MAX) equiv_rate = IMIN(equiv_rate, st->bitrate - (40*C+20)*((400>>LM) - 50)); @@ -1719,8 +1745,11 @@ int celt_encode_with_ec(CELTEncoder * OPUS_RESTRICT st, const opus_val16 * pcm, compute_mdcts(mode, 0, in, freq, C, CC, LM, st->upsample, st->arch); compute_band_energies(mode, freq, bandE, effEnd, C, LM, st->arch); amp2Log2(mode, effEnd, end, bandE, bandLogE2, C); - for (i=0;iupsample, st->arch); @@ -1856,8 +1885,11 @@ int celt_encode_with_ec(CELTEncoder * OPUS_RESTRICT st, const opus_val16 * pcm, compute_band_energies(mode, freq, bandE, effEnd, C, LM, st->arch); amp2Log2(mode, effEnd, end, bandE, bandLogE, C); /* Compensate for the scaling of short vs long mdcts */ - for (i=0;ilsb_depth, mode->logN, isTransient, st->vbr, st->constrained_vbr, eBands, LM, effectiveBytes, &tot_boost, st->lfe, surround_dynalloc, &st->analysis, importance, spread_weight); @@ -2240,7 +2272,7 @@ int celt_encode_with_ec(CELTEncoder * OPUS_RESTRICT st, const opus_val16 * pcm, if (anti_collapse_on) { anti_collapse(mode, X, collapse_masks, LM, C, N, - start, end, oldBandE, oldLogE, oldLogE2, pulses, st->rng); + start, end, oldBandE, oldLogE, oldLogE2, pulses, st->rng, st->arch); } c=0; do { @@ -2259,15 +2291,15 @@ int celt_encode_with_ec(CELTEncoder * OPUS_RESTRICT st, const opus_val16 * pcm, st->prefilter_period_old=IMAX(st->prefilter_period_old, COMBFILTER_MINPERIOD); comb_filter(out_mem[c], out_mem[c], st->prefilter_period_old, st->prefilter_period, mode->shortMdctSize, st->prefilter_gain_old, st->prefilter_gain, st->prefilter_tapset_old, st->prefilter_tapset, - mode->window, overlap); + mode->window, overlap, st->arch); if (LM!=0) comb_filter(out_mem[c]+mode->shortMdctSize, out_mem[c]+mode->shortMdctSize, st->prefilter_period, pitch_index, N-mode->shortMdctSize, st->prefilter_gain, gain1, st->prefilter_tapset, prefilter_tapset, - mode->window, overlap); + mode->window, overlap, st->arch); } while (++cupsample, mode->preemph, st->preemph_memD); + deemphasis(out_mem, (opus_val16*)pcm, N, CC, st->upsample, mode->preemph, st->preemph_memD, 0); st->prefilter_period_old = st->prefilter_period; st->prefilter_gain_old = st->prefilter_gain; st->prefilter_tapset_old = st->prefilter_tapset; diff --git a/opus/celt/celt_lpc.c b/opus/celt/celt_lpc.c index 8ecb693e..fabca65c 100644 --- a/opus/celt/celt_lpc.c +++ b/opus/celt/celt_lpc.c @@ -44,23 +44,27 @@ int p opus_val32 r; opus_val32 error = ac[0]; #ifdef FIXED_POINT - opus_val32 lpc[LPC_ORDER]; + opus_val32 lpc[CELT_LPC_ORDER]; #else float *lpc = _lpc; #endif OPUS_CLEAR(lpc, p); +#ifdef FIXED_POINT if (ac[0] != 0) +#else + if (ac[0] > 1e-10f) +#endif { for (i = 0; i < p; i++) { /* Sum up this iteration's reflection coefficient */ opus_val32 rr = 0; for (j = 0; j < i; j++) rr += MULT32_32_Q31(lpc[j],ac[i - j]); - rr += SHR32(ac[i + 1],3); - r = -frac_div32(SHL32(rr,3), error); + rr += SHR32(ac[i + 1],6); + r = -frac_div32(SHL32(rr,6), error); /* Update LPC coefficients and total error */ - lpc[i] = SHR32(r,3); + lpc[i] = SHR32(r,6); for (j = 0; j < (i+1)>>1; j++) { opus_val32 tmp1, tmp2; @@ -73,17 +77,61 @@ int p error = error - MULT32_32_Q31(MULT32_32_Q31(r,r),error); /* Bail out once we get 30 dB gain */ #ifdef FIXED_POINT - if (error maxabs) { + maxabs = absval; + idx = i; + } + } + maxabs = PSHR32(maxabs, 13); /* Q25->Q12 */ + + if (maxabs > 32767) { + maxabs = MIN32(maxabs, 163838); + chirp_Q16 = QCONST32(0.999, 16) - DIV32(SHL32(maxabs - 32767, 14), + SHR32(MULT32_32_32(maxabs, idx + 1), 2)); + chirp_minus_one_Q16 = chirp_Q16 - 65536; + + /* Apply bandwidth expansion. */ + for (i = 0; i < p - 1; i++) { + lpc[i] = MULT32_32_Q16(chirp_Q16, lpc[i]); + chirp_Q16 += PSHR32(MULT32_32_32(chirp_Q16, chirp_minus_one_Q16), 16); + } + lpc[p - 1] = MULT32_32_Q16(chirp_Q16, lpc[p - 1]); + } else { + break; + } + } + + if (iter == 10) { + /* If the coeffs still do not fit into the 16 bit range after 10 iterations, + fall back to the A(z)=1 filter. */ + OPUS_CLEAR(lpc, p); + _lpc[0] = 4096; /* Q12 */ + } else { + for (i = 0; i < p; i++) { + _lpc[i] = EXTRACT16(PSHR32(lpc[i], 13)); /* Q25->Q12 */ + } + } + } #endif } @@ -110,18 +158,28 @@ void celt_fir_c( sum[1] = SHL32(EXTEND32(x[i+1]), SIG_SHIFT); sum[2] = SHL32(EXTEND32(x[i+2]), SIG_SHIFT); sum[3] = SHL32(EXTEND32(x[i+3]), SIG_SHIFT); - xcorr_kernel(rnum, x+i-ord, sum, ord, arch); - y[i ] = ROUND16(sum[0], SIG_SHIFT); - y[i+1] = ROUND16(sum[1], SIG_SHIFT); - y[i+2] = ROUND16(sum[2], SIG_SHIFT); - y[i+3] = ROUND16(sum[3], SIG_SHIFT); +#if defined(OPUS_CHECK_ASM) && defined(FIXED_POINT) + { + opus_val32 sum_c[4]; + memcpy(sum_c, sum, sizeof(sum_c)); + xcorr_kernel_c(rnum, x+i-ord, sum_c, ord); +#endif + xcorr_kernel(rnum, x+i-ord, sum, ord, arch); +#if defined(OPUS_CHECK_ASM) && defined(FIXED_POINT) + celt_assert(memcmp(sum, sum_c, sizeof(sum)) == 0); + } +#endif + y[i ] = SROUND16(sum[0], SIG_SHIFT); + y[i+1] = SROUND16(sum[1], SIG_SHIFT); + y[i+2] = SROUND16(sum[2], SIG_SHIFT); + y[i+3] = SROUND16(sum[3], SIG_SHIFT); } for (;i ARMv4 * arch[1] -> ARMv5E * arch[2] -> ARMv6 * arch[3] -> NEON + * arch[4] -> NEON+DOTPROD */ -#define OPUS_ARCHMASK 3 +#define OPUS_ARCHMASK 7 -#elif (defined(OPUS_X86_MAY_HAVE_SSE) && !defined(OPUS_X86_PRESUME_SSE)) || \ +#elif defined(OPUS_HAVE_RTCD) && \ + ((defined(OPUS_X86_MAY_HAVE_SSE) && !defined(OPUS_X86_PRESUME_SSE)) || \ (defined(OPUS_X86_MAY_HAVE_SSE2) && !defined(OPUS_X86_PRESUME_SSE2)) || \ (defined(OPUS_X86_MAY_HAVE_SSE4_1) && !defined(OPUS_X86_PRESUME_SSE4_1)) || \ - (defined(OPUS_X86_MAY_HAVE_AVX) && !defined(OPUS_X86_PRESUME_AVX)) + (defined(OPUS_X86_MAY_HAVE_AVX2) && !defined(OPUS_X86_PRESUME_AVX2))) #include "x86/x86cpu.h" /* We currently support 5 x86 variants: diff --git a/opus/celt/ecintrin.h b/opus/celt/ecintrin.h index 2263cff6..66a4c36e 100644 --- a/opus/celt/ecintrin.h +++ b/opus/celt/ecintrin.h @@ -49,7 +49,11 @@ This macro should only be used for implementing ec_ilog(), if it is defined. All other code should use EC_ILOG() instead.*/ #if defined(_MSC_VER) && (_MSC_VER >= 1400) +#if defined(_MSC_VER) && (_MSC_VER >= 1910) +# include /* Improve compiler throughput. */ +#else # include +#endif /*In _DEBUG mode this is not an intrinsic by default.*/ # pragma intrinsic(_BitScanReverse) diff --git a/opus/celt/entdec.c b/opus/celt/entdec.c index 0b3433ed..027aa24b 100644 --- a/opus/celt/entdec.c +++ b/opus/celt/entdec.c @@ -195,6 +195,27 @@ int ec_dec_icdf(ec_dec *_this,const unsigned char *_icdf,unsigned _ftb){ return ret; } +int ec_dec_icdf16(ec_dec *_this,const opus_uint16 *_icdf,unsigned _ftb){ + opus_uint32 r; + opus_uint32 d; + opus_uint32 s; + opus_uint32 t; + int ret; + s=_this->rng; + d=_this->val; + r=s>>_ftb; + ret=-1; + do{ + t=s; + s=IMUL32(r,_icdf[++ret]); + } + while(dval=d-s; + _this->rng=t-s; + ec_dec_normalize(_this); + return ret; +} + opus_uint32 ec_dec_uint(ec_dec *_this,opus_uint32 _ft){ unsigned ft; unsigned s; diff --git a/opus/celt/entdec.h b/opus/celt/entdec.h index 025fc187..c81f26fd 100644 --- a/opus/celt/entdec.h +++ b/opus/celt/entdec.h @@ -81,6 +81,16 @@ int ec_dec_bit_logp(ec_dec *_this,unsigned _logp); Return: The decoded symbol s.*/ int ec_dec_icdf(ec_dec *_this,const unsigned char *_icdf,unsigned _ftb); +/*Decodes a symbol given an "inverse" CDF table. + No call to ec_dec_update() is necessary after this call. + _icdf: The "inverse" CDF, such that symbol s falls in the range + [s>0?ft-_icdf[s-1]:0,ft-_icdf[s]), where ft=1<<_ftb. + The values must be monotonically non-increasing, and the last value + must be 0. + _ftb: The number of bits of precision in the cumulative distribution. + Return: The decoded symbol s.*/ +int ec_dec_icdf16(ec_dec *_this,const opus_uint16 *_icdf,unsigned _ftb); + /*Extracts a raw unsigned integer with a non-power-of-2 range from the stream. The bits must have been encoded with ec_enc_uint(). No call to ec_dec_update() is necessary after this call. diff --git a/opus/celt/entenc.c b/opus/celt/entenc.c index f1750d25..69c6f835 100644 --- a/opus/celt/entenc.c +++ b/opus/celt/entenc.c @@ -172,6 +172,17 @@ void ec_enc_icdf(ec_enc *_this,int _s,const unsigned char *_icdf,unsigned _ftb){ ec_enc_normalize(_this); } +void ec_enc_icdf16(ec_enc *_this,int _s,const opus_uint16 *_icdf,unsigned _ftb){ + opus_uint32 r; + r=_this->rng>>_ftb; + if(_s>0){ + _this->val+=_this->rng-IMUL32(r,_icdf[_s-1]); + _this->rng=IMUL32(r,_icdf[_s-1]-_icdf[_s]); + } + else _this->rng-=IMUL32(r,_icdf[_s]); + ec_enc_normalize(_this); +} + void ec_enc_uint(ec_enc *_this,opus_uint32 _fl,opus_uint32 _ft){ unsigned ft; unsigned fl; diff --git a/opus/celt/entenc.h b/opus/celt/entenc.h index f502eaf6..010874bb 100644 --- a/opus/celt/entenc.h +++ b/opus/celt/entenc.h @@ -64,6 +64,15 @@ void ec_enc_bit_logp(ec_enc *_this,int _val,unsigned _logp); _ftb: The number of bits of precision in the cumulative distribution.*/ void ec_enc_icdf(ec_enc *_this,int _s,const unsigned char *_icdf,unsigned _ftb); +/*Encodes a symbol given an "inverse" CDF table. + _s: The index of the symbol to encode. + _icdf: The "inverse" CDF, such that symbol _s falls in the range + [_s>0?ft-_icdf[_s-1]:0,ft-_icdf[_s]), where ft=1<<_ftb. + The values must be monotonically non-increasing, and the last value + must be 0. + _ftb: The number of bits of precision in the cumulative distribution.*/ +void ec_enc_icdf16(ec_enc *_this,int _s,const opus_uint16 *_icdf,unsigned _ftb); + /*Encodes a raw unsigned integer in the stream. _fl: The integer to encode. _ft: The number of integers that can be encoded (one more than the max). diff --git a/opus/celt/fixed_debug.h b/opus/celt/fixed_debug.h index f4352952..ef2e5d02 100644 --- a/opus/celt/fixed_debug.h +++ b/opus/celt/fixed_debug.h @@ -167,7 +167,7 @@ static OPUS_INLINE short SHR16_(int a, int shift, char *file, int line) #define SHL16(a, shift) SHL16_(a, shift, __FILE__, __LINE__) static OPUS_INLINE short SHL16_(int a, int shift, char *file, int line) { - int res; + opus_int32 res; if (!VERIFY_SHORT(a) || !VERIFY_SHORT(shift)) { fprintf (stderr, "SHL16: inputs are not short: %d %d in %s: line %d\n", a, shift, file, line); @@ -175,7 +175,7 @@ static OPUS_INLINE short SHL16_(int a, int shift, char *file, int line) celt_assert(0); #endif } - res = a<> 16; + if (!VERIFY_INT(res)) + { + fprintf (stderr, "MULT32_32_Q16: output is not int: %lld*%lld=%lld\n", (long long)a, (long long)b, (long long)res); +#ifdef FIXED_DEBUG_ASSERT + celt_assert(0); +#endif + } + celt_mips+=5; + return res; +} + #define MULT16_16(a, b) MULT16_16_(a, b, __FILE__, __LINE__) static OPUS_INLINE int MULT16_16_(int a, int b, char *file, int line) { @@ -446,7 +491,7 @@ static OPUS_INLINE int MULT16_32_QX_(int a, opus_int64 b, int Q, char *file, int celt_assert(0); #endif } - if (ABS32(b)>=((opus_val32)(1)<<(15+Q))) + if (ABS32(b)>=((opus_int64)(1)<<(16+Q))) { fprintf (stderr, "MULT16_32_Q%d: second operand too large: %d %d in %s: line %d\n", Q, (int)a, (int)b, file, line); #ifdef FIXED_DEBUG_ASSERT @@ -479,7 +524,7 @@ static OPUS_INLINE int MULT16_32_PX_(int a, opus_int64 b, int Q, char *file, int celt_assert(0); #endif } - if (ABS32(b)>=((opus_int64)(1)<<(15+Q))) + if (ABS32(b)>=((opus_int64)(1)<<(16+Q))) { fprintf (stderr, "MULT16_32_Q%d: second operand too large: %d %d in %s: line %d\n\n", Q, (int)a, (int)b,file, line); #ifdef FIXED_DEBUG_ASSERT @@ -786,6 +831,6 @@ static OPUS_INLINE opus_val16 SIG2WORD16_generic(celt_sig x) #undef PRINT_MIPS -#define PRINT_MIPS(file) do {fprintf (file, "total complexity = %llu MIPS\n", celt_mips);} while (0); +#define PRINT_MIPS(file) do {fprintf (file, "total complexity = %llu MIPS\n", (unsigned long long)celt_mips);} while (0); #endif diff --git a/opus/celt/fixed_generic.h b/opus/celt/fixed_generic.h index 5f4abda7..8f29d46b 100644 --- a/opus/celt/fixed_generic.h +++ b/opus/celt/fixed_generic.h @@ -57,6 +57,13 @@ #define MULT16_32_Q15(a,b) ADD32(SHL(MULT16_16((a),SHR((b),16)),1), SHR(MULT16_16SU((a),((b)&0x0000ffff)),15)) #endif +/** 32x32 multiplication, followed by a 16-bit shift right. Results fits in 32 bits */ +#if OPUS_FAST_INT64 +#define MULT32_32_Q16(a,b) ((opus_val32)SHR((opus_int64)(a)*(opus_int64)(b),16)) +#else +#define MULT32_32_Q16(a,b) (ADD32(ADD32(ADD32((opus_val32)(SHR32(((opus_uint32)((a)&0x0000ffff)*(opus_uint32)((b)&0x0000ffff)),16)), MULT16_16SU(SHR32(a,16),((b)&0x0000ffff))), MULT16_16SU(SHR32(b,16),((a)&0x0000ffff))), SHL32(MULT16_16(SHR32(a,16),SHR32(b,16)),16))) +#endif + /** 32x32 multiplication, followed by a 31-bit shift right. Results fits in 32 bits */ #if OPUS_FAST_INT64 #define MULT32_32_Q31(a,b) ((opus_val32)SHR((opus_int64)(a)*(opus_int64)(b),31)) @@ -102,9 +109,9 @@ #define SATURATE16(x) (EXTRACT16((x)>32767 ? 32767 : (x)<-32768 ? -32768 : (x))) -/** Shift by a and round-to-neareast 32-bit value. Result is a 16-bit value */ +/** Shift by a and round-to-nearest 32-bit value. Result is a 16-bit value */ #define ROUND16(x,a) (EXTRACT16(PSHR32((x),(a)))) -/** Shift by a and round-to-neareast 32-bit value. Result is a saturated 16-bit value */ +/** Shift by a and round-to-nearest 32-bit value. Result is a saturated 16-bit value */ #define SROUND16(x,a) EXTRACT16(SATURATE(PSHR32(x,a), 32767)); /** Divide by two */ @@ -131,6 +138,9 @@ /** 16x16 multiplication where the result fits in 16 bits */ #define MULT16_16_16(a,b) ((((opus_val16)(a))*((opus_val16)(b)))) +/** 32x32 multiplication where the result fits in 32 bits */ +#define MULT32_32_32(a,b) ((((opus_val32)(a))*((opus_val32)(b)))) + /* (opus_val32)(opus_val16) gives TI compiler a hint that it's 16x16->32 multiply */ /** 16x16 multiplication where the result fits in 32 bits */ #define MULT16_16(a,b) (((opus_val32)(opus_val16)(a))*((opus_val32)(opus_val16)(b))) diff --git a/opus/celt/float_cast.h b/opus/celt/float_cast.h index 889dae96..8915a5fd 100644 --- a/opus/celt/float_cast.h +++ b/opus/celt/float_cast.h @@ -67,7 +67,39 @@ #include static OPUS_INLINE opus_int32 float2int(float x) {return _mm_cvt_ss2si(_mm_set_ss(x));} -#elif defined(HAVE_LRINTF) +#elif (defined(_MSC_VER) && _MSC_VER >= 1400) && (defined(_M_X64) || (defined(_M_IX86_FP) && _M_IX86_FP >= 1)) + + #include + static OPUS_INLINE opus_int32 float2int(float value) + { + /* _mm_load_ss will generate same code as _mm_set_ss + ** in _MSC_VER >= 1914 /02 so keep __mm_load__ss + ** for backward compatibility. + */ + return _mm_cvtss_si32(_mm_load_ss(&value)); + } + +#elif (defined(_MSC_VER) && _MSC_VER >= 1400) && defined (_M_IX86) + + #include + + /* Win32 doesn't seem to have these functions. + ** Therefore implement OPUS_INLINE versions of these functions here. + */ + + static OPUS_INLINE opus_int32 + float2int (float flt) + { int intgr; + + _asm + { fld flt + fistp intgr + } ; + + return intgr ; + } + +#elif defined(HAVE_LRINTF) && defined(__STDC_VERSION__) && __STDC_VERSION__ >= 199901L /* These defines enable functionality introduced with the 1999 ISO C ** standard. They must be defined before the inclusion of math.h to @@ -85,7 +117,7 @@ static OPUS_INLINE opus_int32 float2int(float x) {return _mm_cvt_ss2si(_mm_set_s #include #define float2int(x) lrintf(x) -#elif (defined(HAVE_LRINT)) +#elif defined(HAVE_LRINT) && defined(__STDC_VERSION__) && __STDC_VERSION__ >= 199901L #define _ISOC9X_SOURCE 1 #define _ISOC99_SOURCE 1 @@ -96,32 +128,6 @@ static OPUS_INLINE opus_int32 float2int(float x) {return _mm_cvt_ss2si(_mm_set_s #include #define float2int(x) lrint(x) -#elif (defined(_MSC_VER) && _MSC_VER >= 1400) && (defined(_M_X64) || (defined(_M_IX86_FP) && _M_IX86_FP >= 1)) - #include - - static __inline long int float2int(float value) - { - return _mm_cvtss_si32(_mm_load_ss(&value)); - } -#elif (defined(_MSC_VER) && _MSC_VER >= 1400) && defined (_M_IX86) - #include - - /* Win32 doesn't seem to have these functions. - ** Therefore implement OPUS_INLINE versions of these functions here. - */ - - static __inline long int - float2int (float flt) - { int intgr; - - _asm - { fld flt - fistp intgr - } ; - - return intgr ; - } - #else #if (defined(__GNUC__) && defined(__STDC__) && __STDC__ && __STDC_VERSION__ >= 199901L) diff --git a/opus/celt/kiss_fft.h b/opus/celt/kiss_fft.h index bffa2bfa..267f72f9 100644 --- a/opus/celt/kiss_fft.h +++ b/opus/celt/kiss_fft.h @@ -52,6 +52,10 @@ extern "C" { # define kiss_fft_scalar opus_int32 # define kiss_twiddle_scalar opus_int16 +/* Some 32-bit CPUs would load/store a kiss_twiddle_cpx with a single memory + * access, and could benefit from additional alignment. + */ +# define KISS_TWIDDLE_CPX_ALIGNMENT (sizeof(opus_int32)) #else # ifndef kiss_fft_scalar @@ -62,6 +66,12 @@ extern "C" { # endif #endif +#if defined(__GNUC__) && defined(KISS_TWIDDLE_CPX_ALIGNMENT) +#define KISS_TWIDDLE_CPX_ALIGNED __attribute__((aligned(KISS_TWIDDLE_CPX_ALIGNMENT))) +#else +#define KISS_TWIDDLE_CPX_ALIGNED +#endif + typedef struct { kiss_fft_scalar r; kiss_fft_scalar i; @@ -70,7 +80,7 @@ typedef struct { typedef struct { kiss_twiddle_scalar r; kiss_twiddle_scalar i; -}kiss_twiddle_cpx; +} KISS_TWIDDLE_CPX_ALIGNED kiss_twiddle_cpx; #define MAXFACTORS 8 /* e.g. an fft of length 128 has 4 factors diff --git a/opus/celt/laplace.c b/opus/celt/laplace.c index a7bca874..21809666 100644 --- a/opus/celt/laplace.c +++ b/opus/celt/laplace.c @@ -132,3 +132,104 @@ int ec_laplace_decode(ec_dec *dec, unsigned fs, int decay) ec_dec_update(dec, fl, IMIN(fl+fs,32768), 32768); return val; } + +void ec_laplace_encode_p0(ec_enc *enc, int value, opus_uint16 p0, opus_uint16 decay) +{ + int s; + opus_uint16 sign_icdf[3]; + sign_icdf[0] = 32768-p0; + sign_icdf[1] = sign_icdf[0]/2; + sign_icdf[2] = 0; + s = value == 0 ? 0 : (value > 0 ? 1 : 2); + ec_enc_icdf16(enc, s, sign_icdf, 15); + value = abs(value); + if (value) + { + int i; + opus_uint16 icdf[8]; + icdf[0] = IMAX(7, decay); + for (i=1;i<7;i++) + { + icdf[i] = IMAX(7-i, (icdf[i-1] * (opus_int32)decay) >> 15); + } + icdf[7] = 0; + value--; + do { + ec_enc_icdf16(enc, IMIN(value, 7), icdf, 15); + value -= 7; + } while (value >= 0); + } +} + +int ec_laplace_decode_p0(ec_dec *dec, opus_uint16 p0, opus_uint16 decay) +{ + int s; + int value; + opus_uint16 sign_icdf[3]; + sign_icdf[0] = 32768-p0; + sign_icdf[1] = sign_icdf[0]/2; + sign_icdf[2] = 0; + s = ec_dec_icdf16(dec, sign_icdf, 15); + if (s==2) s = -1; + if (s != 0) + { + int i; + int v; + opus_uint16 icdf[8]; + icdf[0] = IMAX(7, decay); + for (i=1;i<7;i++) + { + icdf[i] = IMAX(7-i, (icdf[i-1] * (opus_int32)decay) >> 15); + } + icdf[7] = 0; + value = 1; + do { + v = ec_dec_icdf16(dec, icdf, 15); + value += v; + } while (v == 7); + return s*value; + } else return 0; +} + +#if 0 + +#include +#define NB_VALS 10 +#define DATA_SIZE 10000 +int main() { + ec_enc enc; + ec_dec dec; + unsigned char *ptr; + int i; + int decay, p0; + int val[NB_VALS] = {6, 7, 8, 9, 10, 11, 12, 13, 14, 15}; + /*for (i=0;i>23)-127; - in.i -= integer<<23; + in.i -= (opus_uint32)integer<<23; frac = in.f - 1.5f; frac = -0.41445418f + frac*(0.95909232f + frac*(-0.33951290f + frac*0.16541097f)); @@ -153,14 +153,14 @@ static OPUS_INLINE float celt_exp2(float x) float f; opus_uint32 i; } res; - integer = floor(x); + integer = (int)floor(x); if (integer < -50) return 0; frac = x-integer; /* K0 = 1, K1 = log(2), K2 = 3-4*log(2), K3 = 3*log(2) - 2 */ res.f = 0.99992522f + frac * (0.69583354f + frac * (0.22606716f + 0.078024523f*frac)); - res.i = (res.i + (integer<<23)) & 0x7fffffff; + res.i = (res.i + ((opus_uint32)integer<<23)) & 0x7fffffff; return res.f; } @@ -230,6 +230,12 @@ static OPUS_INLINE opus_val32 celt_exp2_frac(opus_val16 x) frac = SHL16(x, 4); return ADD16(D0, MULT16_16_Q15(frac, ADD16(D1, MULT16_16_Q15(frac, ADD16(D2 , MULT16_16_Q15(D3,frac)))))); } + +#undef D0 +#undef D1 +#undef D2 +#undef D3 + /** Base-2 exponential approximation (2^x). (Q10 input, Q16 output) */ static OPUS_INLINE opus_val32 celt_exp2(opus_val16 x) { diff --git a/opus/celt/mips/celt_mipsr1.h b/opus/celt/mips/celt_mipsr1.h index c332fe04..d1b25c20 100644 --- a/opus/celt/mips/celt_mipsr1.h +++ b/opus/celt/mips/celt_mipsr1.h @@ -27,8 +27,8 @@ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ -#ifndef __CELT_MIPSR1_H__ -#define __CELT_MIPSR1_H__ +#ifndef CELT_MIPSR1_H__ +#define CELT_MIPSR1_H__ #ifdef HAVE_CONFIG_H #include "config.h" @@ -149,4 +149,4 @@ void comb_filter(opus_val32 *y, opus_val32 *x, int T0, int T1, int N, } } -#endif /* __CELT_MIPSR1_H__ */ +#endif /* CELT_MIPSR1_H__ */ diff --git a/opus/celt/mips/mdct_mipsr1.h b/opus/celt/mips/mdct_mipsr1.h index 2934dab7..7456c181 100644 --- a/opus/celt/mips/mdct_mipsr1.h +++ b/opus/celt/mips/mdct_mipsr1.h @@ -38,8 +38,8 @@ MDCT implementation in FFMPEG, but has differences in signs, ordering and scaling in many places. */ -#ifndef __MDCT_MIPSR1_H__ -#define __MDCT_MIPSR1_H__ +#ifndef MDCT_MIPSR1_H__ +#define MDCT_MIPSR1_H__ #ifndef SKIP_CONFIG_H #ifdef HAVE_CONFIG_H @@ -285,4 +285,4 @@ void clt_mdct_backward(const mdct_lookup *l, kiss_fft_scalar *in, kiss_fft_scala } } } -#endif /* __MDCT_MIPSR1_H__ */ +#endif /* MDCT_MIPSR1_H__ */ diff --git a/opus/celt/mips/vq_mipsr1.h b/opus/celt/mips/vq_mipsr1.h index f26a33e7..1621c562 100644 --- a/opus/celt/mips/vq_mipsr1.h +++ b/opus/celt/mips/vq_mipsr1.h @@ -26,8 +26,8 @@ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ -#ifndef __VQ_MIPSR1_H__ -#define __VQ_MIPSR1_H__ +#ifndef VQ_MIPSR1_H__ +#define VQ_MIPSR1_H__ #ifdef HAVE_CONFIG_H #include "config.h" @@ -113,4 +113,4 @@ void renormalise_vector(celt_norm *X, int N, opus_val16 gain, int arch) /*return celt_sqrt(E);*/ } -#endif /* __VQ_MIPSR1_H__ */ +#endif /* VQ_MIPSR1_H__ */ diff --git a/opus/celt/modes.c b/opus/celt/modes.c index 390c5e8a..23f7cde6 100644 --- a/opus/celt/modes.c +++ b/opus/celt/modes.c @@ -173,7 +173,10 @@ static void compute_allocation_table(CELTMode *mode) mode->nbAllocVectors = BITALLOC_SIZE; allocVectors = opus_alloc(sizeof(unsigned char)*(BITALLOC_SIZE*mode->nbEBands)); if (allocVectors==NULL) + { + mode->allocVectors = NULL; return; + } /* Check for standard mode */ if (mode->Fs == 400*(opus_int32)mode->shortMdctSize) diff --git a/opus/celt/os_support.h b/opus/celt/os_support.h index a2171971..7d2d3781 100644 --- a/opus/celt/os_support.h +++ b/opus/celt/os_support.h @@ -39,10 +39,9 @@ #include "opus_defines.h" #include -#include #include -/** Opus wrapper for malloc(). To do your own dynamic allocation, all you need to do is replace this function and opus_free */ +/** Opus wrapper for malloc(). To do your own dynamic allocation replace this function, opus_realloc, and opus_free */ #ifndef OVERRIDE_OPUS_ALLOC static OPUS_INLINE void *opus_alloc (size_t size) { @@ -50,7 +49,15 @@ static OPUS_INLINE void *opus_alloc (size_t size) } #endif -/** Same as celt_alloc(), except that the area is only needed inside a CELT call (might cause problem with wideband though) */ +#ifndef OVERRIDE_OPUS_REALLOC +static OPUS_INLINE void *opus_realloc (void *ptr, size_t size) +{ + return realloc(ptr, size); +} +#endif + +/** Used only for non-threadsafe pseudostack. + If desired, this can always return the same area of memory rather than allocating a new one every time. */ #ifndef OVERRIDE_OPUS_ALLOC_SCRATCH static OPUS_INLINE void *opus_alloc_scratch (size_t size) { @@ -59,7 +66,7 @@ static OPUS_INLINE void *opus_alloc_scratch (size_t size) } #endif -/** Opus wrapper for free(). To do your own dynamic allocation, all you need to do is replace this function and opus_alloc */ +/** Opus wrapper for free(). To do your own dynamic allocation replace this function, opus_realloc, and opus_free */ #ifndef OVERRIDE_OPUS_FREE static OPUS_INLINE void opus_free (void *ptr) { diff --git a/opus/celt/pitch.c b/opus/celt/pitch.c index 872582a4..e33c60a3 100644 --- a/opus/celt/pitch.c +++ b/opus/celt/pitch.c @@ -161,17 +161,26 @@ void pitch_downsample(celt_sig * OPUS_RESTRICT x[], opus_val16 * OPUS_RESTRICT x shift=0; if (C==2) shift++; -#endif for (i=1;i>1;i++) - x_lp[i] = SHR32(HALF32(HALF32(x[0][(2*i-1)]+x[0][(2*i+1)])+x[0][2*i]), shift); - x_lp[0] = SHR32(HALF32(HALF32(x[0][1])+x[0][0]), shift); + x_lp[i] = SHR32(x[0][(2*i-1)], shift+2) + SHR32(x[0][(2*i+1)], shift+2) + SHR32(x[0][2*i], shift+1); + x_lp[0] = SHR32(x[0][1], shift+2) + SHR32(x[0][0], shift+1); if (C==2) { for (i=1;i>1;i++) - x_lp[i] += SHR32(HALF32(HALF32(x[1][(2*i-1)]+x[1][(2*i+1)])+x[1][2*i]), shift); - x_lp[0] += SHR32(HALF32(HALF32(x[1][1])+x[1][0]), shift); + x_lp[i] += SHR32(x[1][(2*i-1)], shift+2) + SHR32(x[1][(2*i+1)], shift+2) + SHR32(x[1][2*i], shift+1); + x_lp[0] += SHR32(x[1][1], shift+2) + SHR32(x[1][0], shift+1); } - +#else + for (i=1;i>1;i++) + x_lp[i] = .25f*x[0][(2*i-1)] + .25f*x[0][(2*i+1)] + .5f*x[0][2*i]; + x_lp[0] = .25f*x[0][1] + .5f*x[0][0]; + if (C==2) + { + for (i=1;i>1;i++) + x_lp[i] += .25f*x[1][(2*i-1)] + .25f*x[1][(2*i+1)] + .5f*x[1][2*i]; + x_lp[0] += .25f*x[1][1] + .5f*x[1][0]; + } +#endif _celt_autocorr(x_lp, ac, NULL, 0, 4, len>>1, arch); @@ -249,11 +258,20 @@ celt_pitch_xcorr_c(const opus_val16 *_x, const opus_val16 *_y, opus_val32 maxcorr=1; #endif celt_assert(max_pitch>0); - celt_sig_assert((((unsigned char *)_x-(unsigned char *)NULL)&3)==0); + celt_sig_assert(((size_t)_x&3)==0); for (i=0;i (depth_threshold*band_width<>4 && j<=signalBandwidth)) diff --git a/opus/celt/stack_alloc.h b/opus/celt/stack_alloc.h index 2b51c8d8..e2739bdf 100644 --- a/opus/celt/stack_alloc.h +++ b/opus/celt/stack_alloc.h @@ -40,7 +40,7 @@ #endif #ifdef USE_ALLOCA -# ifdef WIN32 +# ifdef _WIN32 # include # else # ifdef HAVE_ALLOCA_H @@ -102,7 +102,7 @@ #define VARDECL(type, var) type *var -# ifdef WIN32 +# ifdef _WIN32 # define ALLOC(var, size, type) var = ((type*)_alloca(sizeof(type)*(size))) # else # define ALLOC(var, size, type) var = ((type*)alloca(sizeof(type)*(size))) @@ -141,7 +141,7 @@ extern char *global_stack_top; #else #define ALIGN(stack, size) ((stack) += ((size) - (long)(stack)) & ((size) - 1)) -#define PUSH(stack, size, type) (ALIGN((stack),sizeof(type)/sizeof(char)),(stack)+=(size)*(sizeof(type)/sizeof(char)),(type*)((stack)-(size)*(sizeof(type)/sizeof(char)))) +#define PUSH(stack, size, type) (ALIGN((stack),sizeof(type)/(sizeof(char))),(stack)+=(size)*(sizeof(type)/(sizeof(char))),(type*)((stack)-(size)*(sizeof(type)/(sizeof(char))))) #if 0 /* Set this to 1 to instrument pseudostack usage */ #define RESTORE_STACK (printf("%ld %s:%d\n", global_stack-scratch_ptr, __FILE__, __LINE__),global_stack = _saved_stack) #else diff --git a/opus/celt/tests/test_unit_cwrs32.c b/opus/celt/tests/test_unit_cwrs32.c index 36dd8af5..f6b8ac4b 100644 --- a/opus/celt/tests/test_unit_cwrs32.c +++ b/opus/celt/tests/test_unit_cwrs32.c @@ -157,5 +157,6 @@ int main(void){ /*printf("\n");*/ } } + RESTORE_STACK; return 0; } diff --git a/opus/celt/tests/test_unit_dft.c b/opus/celt/tests/test_unit_dft.c index 70f8f493..ad6c60a0 100644 --- a/opus/celt/tests/test_unit_dft.c +++ b/opus/celt/tests/test_unit_dft.c @@ -144,8 +144,9 @@ void test1d(int nfft,int isinverse,int arch) int main(int argc,char ** argv) { + int arch; ALLOC_STACK; - int arch = opus_select_arch(); + arch = opus_select_arch(); if (argc>1) { int k; @@ -175,5 +176,6 @@ int main(int argc,char ** argv) test1d(480,1,arch); #endif } + RESTORE_STACK; return ret; } diff --git a/opus/celt/tests/test_unit_entropy.c b/opus/celt/tests/test_unit_entropy.c index 7f674529..b1619b74 100644 --- a/opus/celt/tests/test_unit_entropy.c +++ b/opus/celt/tests/test_unit_entropy.c @@ -104,7 +104,7 @@ int main(int _argc,char **_argv){ nbits=ec_tell_frac(&enc); ec_enc_done(&enc); fprintf(stderr, - "Encoded %0.2lf bits of entropy to %0.2lf bits (%0.3lf%% wasted).\n", + "Encoded %0.2f bits of entropy to %0.2f bits (%0.3f%% wasted).\n", entropy,ldexp(nbits,-3),100*(nbits-ldexp(entropy,3))/nbits); fprintf(stderr,"Packed to %li bytes.\n",(long)ec_range_bytes(&enc)); ec_dec_init(&dec,ptr,DATA_SIZE); @@ -129,7 +129,7 @@ int main(int _argc,char **_argv){ nbits2=ec_tell_frac(&dec); if(nbits!=nbits2){ fprintf(stderr, - "Reported number of bits used was %0.2lf, should be %0.2lf.\n", + "Reported number of bits used was %0.2f, should be %0.2f.\n", ldexp(nbits2,-3),ldexp(nbits,-3)); ret=-1; } diff --git a/opus/celt/tests/test_unit_laplace.c b/opus/celt/tests/test_unit_laplace.c index 727bf012..9832e5e5 100644 --- a/opus/celt/tests/test_unit_laplace.c +++ b/opus/celt/tests/test_unit_laplace.c @@ -89,5 +89,6 @@ int main(void) } free(ptr); + RESTORE_STACK; return ret; } diff --git a/opus/celt/tests/test_unit_mathops.c b/opus/celt/tests/test_unit_mathops.c index 874e9adf..0615448d 100644 --- a/opus/celt/tests/test_unit_mathops.c +++ b/opus/celt/tests/test_unit_mathops.c @@ -143,7 +143,7 @@ void testbitexactlog2tan(void) void testlog2(void) { float x; - for (x=0.001;x<1677700.0;x+=(x/8.0)) + for (x=0.001f;x<1677700.0;x+=(x/8.0)) { float error = fabs((1.442695040888963387*log(x))-celt_log2(x)); if (error>0.0009) @@ -157,7 +157,7 @@ void testlog2(void) void testexp2(void) { float x; - for (x=-11.0;x<24.0;x+=0.0007) + for (x=-11.0;x<24.0;x+=0.0007f) { float error = fabs(x-(1.442695040888963387*log(celt_exp2(x)))); if (error>0.0002) @@ -171,7 +171,7 @@ void testexp2(void) void testexp2log2(void) { float x; - for (x=-11.0;x<24.0;x+=0.0007) + for (x=-11.0;x<24.0;x+=0.0007f) { float error = fabs(x-(celt_log2(celt_exp2(x)))); if (error>0.001) diff --git a/opus/celt/tests/test_unit_mdct.c b/opus/celt/tests/test_unit_mdct.c index 4a563ccf..70dc042e 100644 --- a/opus/celt/tests/test_unit_mdct.c +++ b/opus/celt/tests/test_unit_mdct.c @@ -184,8 +184,9 @@ void test1d(int nfft,int isinverse,int arch) int main(int argc,char ** argv) { + int arch; ALLOC_STACK; - int arch = opus_select_arch(); + arch = opus_select_arch(); if (argc>1) { int k; @@ -223,5 +224,6 @@ int main(int argc,char ** argv) test1d(1920,1,arch); #endif } + RESTORE_STACK; return ret; } diff --git a/opus/celt/tests/test_unit_rotation.c b/opus/celt/tests/test_unit_rotation.c index 8a31b3f2..6d05499b 100644 --- a/opus/celt/tests/test_unit_rotation.c +++ b/opus/celt/tests/test_unit_rotation.c @@ -82,5 +82,6 @@ int main(void) test_rotation(23, 5); test_rotation(50, 3); test_rotation(80, 1); + RESTORE_STACK; return ret; } diff --git a/opus/celt/x86/celt_lpc_sse.h b/opus/celt/x86/celt_lpc_sse.h index 7d1ecf75..90e69ecf 100644 --- a/opus/celt/x86/celt_lpc_sse.h +++ b/opus/celt/x86/celt_lpc_sse.h @@ -33,7 +33,6 @@ #endif #if defined(OPUS_X86_MAY_HAVE_SSE4_1) && defined(FIXED_POINT) -#define OVERRIDE_CELT_FIR void celt_fir_sse4_1( const opus_val16 *x, @@ -44,10 +43,11 @@ void celt_fir_sse4_1( int arch); #if defined(OPUS_X86_PRESUME_SSE4_1) +#define OVERRIDE_CELT_FIR #define celt_fir(x, num, y, N, ord, arch) \ ((void)arch, celt_fir_sse4_1(x, num, y, N, ord, arch)) -#else +#elif defined(OPUS_HAVE_RTCD) extern void (*const CELT_FIR_IMPL[OPUS_ARCHMASK + 1])( const opus_val16 *x, @@ -57,6 +57,7 @@ extern void (*const CELT_FIR_IMPL[OPUS_ARCHMASK + 1])( int ord, int arch); +#define OVERRIDE_CELT_FIR # define celt_fir(x, num, y, N, ord, arch) \ ((*CELT_FIR_IMPL[(arch) & OPUS_ARCHMASK])(x, num, y, N, ord, arch)) diff --git a/opus/celt/x86/celt_lpc_sse4_1.c b/opus/celt/x86/celt_lpc_sse4_1.c index 54785688..daf59d24 100644 --- a/opus/celt/x86/celt_lpc_sse4_1.c +++ b/opus/celt/x86/celt_lpc_sse4_1.c @@ -64,9 +64,16 @@ void celt_fir_sse4_1(const opus_val16 *x, { opus_val32 sums[4] = {0}; __m128i vecSum, vecX; - - xcorr_kernel(rnum, x+i-ord, sums, ord, arch); - +#if defined(OPUS_CHECK_ASM) + { + opus_val32 sums_c[4] = {0}; + xcorr_kernel_c(rnum, x+i-ord, sums_c, ord); +#endif + xcorr_kernel(rnum, x+i-ord, sums, ord, arch); +#if defined(OPUS_CHECK_ASM) + celt_assert(memcmp(sums, sums_c, sizeof(sums)) == 0); + } +#endif vecSum = _mm_loadu_si128((__m128i *)sums); vecSum = _mm_add_epi32(vecSum, vecNoA); vecSum = _mm_srai_epi32(vecSum, SIG_SHIFT); diff --git a/opus/celt/x86/pitch_avx.c b/opus/celt/x86/pitch_avx.c new file mode 100644 index 00000000..f731762d --- /dev/null +++ b/opus/celt/x86/pitch_avx.c @@ -0,0 +1,101 @@ +/* Copyright (c) 2023 Amazon */ +/* + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER + OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +#ifdef HAVE_CONFIG_H +#include "config.h" +#endif + + +#include +#include "x86cpu.h" +#include "pitch.h" + +#if defined(OPUS_X86_MAY_HAVE_AVX2) && !defined(FIXED_POINT) + +/* Like the "regular" xcorr_kernel(), but computes 8 results at a time. */ +static void xcorr_kernel_avx(const float *x, const float *y, float sum[8], int len) +{ + __m256 xsum0, xsum1, xsum2, xsum3, xsum4, xsum5, xsum6, xsum7; + xsum7 = xsum6 = xsum5 = xsum4 = xsum3 = xsum2 = xsum1 = xsum0 = _mm256_setzero_ps(); + int i; + __m256 x0; + /* Compute 8 inner products using partial sums. */ + for (i=0;i0); + (void)arch; + for (i=0;i= 3); sum0 = _mm_setzero_si128(); @@ -177,19 +185,56 @@ void xcorr_kernel_sse4_1(const opus_val16 * x, const opus_val16 * y, opus_val32 vecSum = _mm_add_epi32(vecSum, sum2); } - for (;jHW_SSE = (info[3] & (1 << 25)) != 0; cpu_feature->HW_SSE2 = (info[3] & (1 << 26)) != 0; cpu_feature->HW_SSE41 = (info[2] & (1 << 19)) != 0; - cpu_feature->HW_AVX = (info[2] & (1 << 28)) != 0; + cpu_feature->HW_AVX2 = (info[2] & (1 << 28)) != 0 && (info[2] & (1 << 12)) != 0; + if (cpu_feature->HW_AVX2 && nIds >= 7) { + cpuid(info, 7); + cpu_feature->HW_AVX2 = cpu_feature->HW_AVX2 && (info[1] & (1 << 5)) != 0; + } else { + cpu_feature->HW_AVX2 = 0; + } } else { cpu_feature->HW_SSE = 0; cpu_feature->HW_SSE2 = 0; cpu_feature->HW_SSE41 = 0; - cpu_feature->HW_AVX = 0; + cpu_feature->HW_AVX2 = 0; } } -int opus_select_arch(void) +static int opus_select_arch_impl(void) { CPU_Feature cpu_feature; int arch; @@ -145,7 +163,7 @@ int opus_select_arch(void) } arch++; - if (!cpu_feature.HW_AVX) + if (!cpu_feature.HW_AVX2) { return arch; } @@ -154,4 +172,13 @@ int opus_select_arch(void) return arch; } +int opus_select_arch(void) { + int arch = opus_select_arch_impl(); +#ifdef FUZZING + /* Randomly downgrade the architecture. */ + arch = rand()%(arch+1); +#endif + return arch; +} + #endif diff --git a/opus/celt/x86/x86cpu.h b/opus/celt/x86/x86cpu.h index 1e2bf17b..8ae9be8d 100644 --- a/opus/celt/x86/x86cpu.h +++ b/opus/celt/x86/x86cpu.h @@ -46,50 +46,53 @@ # define MAY_HAVE_SSE4_1(name) name ## _c # endif -# if defined(OPUS_X86_MAY_HAVE_AVX) -# define MAY_HAVE_AVX(name) name ## _avx +# if defined(OPUS_X86_MAY_HAVE_AVX2) +# define MAY_HAVE_AVX2(name) name ## _avx2 # else -# define MAY_HAVE_AVX(name) name ## _c +# define MAY_HAVE_AVX2(name) name ## _c # endif -# if defined(OPUS_HAVE_RTCD) +# if defined(OPUS_HAVE_RTCD) && \ + ((defined(OPUS_X86_MAY_HAVE_SSE) && !defined(OPUS_X86_PRESUME_SSE)) || \ + (defined(OPUS_X86_MAY_HAVE_SSE2) && !defined(OPUS_X86_PRESUME_SSE2)) || \ + (defined(OPUS_X86_MAY_HAVE_SSE4_1) && !defined(OPUS_X86_PRESUME_SSE4_1)) || \ + (defined(OPUS_X86_MAY_HAVE_AVX2) && !defined(OPUS_X86_PRESUME_AVX2))) int opus_select_arch(void); # endif -/*gcc appears to emit MOVDQA's to load the argument of an _mm_cvtepi8_epi32() - or _mm_cvtepi16_epi32() when optimizations are disabled, even though the - actual PMOVSXWD instruction takes an m32 or m64. Unlike a normal memory - reference, these require 16-byte alignment and load a full 16 bytes (instead - of 4 or 8), possibly reading out of bounds. - - We can insert an explicit MOVD or MOVQ using _mm_cvtsi32_si128() or - _mm_loadl_epi64(), which should have the same semantics as an m32 or m64 - reference in the PMOVSXWD instruction itself, but gcc is not smart enough to - optimize this out when optimizations ARE enabled. - - Clang, in contrast, requires us to do this always for _mm_cvtepi8_epi32 - (which is fair, since technically the compiler is always allowed to do the - dereference before invoking the function implementing the intrinsic). - However, it is smart enough to eliminate the extra MOVD instruction. - For _mm_cvtepi16_epi32, it does the right thing, though does *not* optimize out - the extra MOVQ if it's specified explicitly */ - -# if defined(__clang__) || !defined(__OPTIMIZE__) -# define OP_CVTEPI8_EPI32_M32(x) \ - (_mm_cvtepi8_epi32(_mm_cvtsi32_si128(*(int *)(x)))) -# else +# if defined(OPUS_X86_MAY_HAVE_SSE2) +# include "opus_defines.h" + +/*MOVD should not impose any alignment restrictions, but the C standard does, + and UBSan will report errors if we actually make unaligned accesses. + Use this to work around those restrictions (which should hopefully all get + optimized to a single MOVD instruction). + GCC implemented _mm_loadu_si32() since GCC 11; HOWEVER, there is a bug! + https://gcc.gnu.org/bugzilla/show_bug.cgi?id=99754 */ +# if !defined(_MSC_VER) && !OPUS_GNUC_PREREQ(11,3) && !(defined(__clang__) && (__clang_major__ >= 8)) +# include +# include + +# ifdef _mm_loadu_si32 +# undef _mm_loadu_si32 +# endif +# define _mm_loadu_si32 WORKAROUND_mm_loadu_si32 +static inline __m128i WORKAROUND_mm_loadu_si32(void const* mem_addr) { + int val; + memcpy(&val, mem_addr, sizeof(val)); + return _mm_cvtsi32_si128(val); +} +# elif defined(_MSC_VER) + /* MSVC needs this for _mm_loadu_si32 */ +# include +# endif + # define OP_CVTEPI8_EPI32_M32(x) \ - (_mm_cvtepi8_epi32(*(__m128i *)(x))) -#endif + (_mm_cvtepi8_epi32(_mm_loadu_si32(x))) -/* similar reasoning about the instruction sequence as in the 32-bit macro above, - */ -# if defined(__clang__) || !defined(__OPTIMIZE__) -# define OP_CVTEPI16_EPI32_M64(x) \ - (_mm_cvtepi16_epi32(_mm_loadl_epi64((__m128i *)(x)))) -# else # define OP_CVTEPI16_EPI32_M64(x) \ - (_mm_cvtepi16_epi32(*(__m128i *)(x))) + (_mm_cvtepi16_epi32(_mm_loadl_epi64((__m128i *)(void*)(x)))) + # endif #endif diff --git a/opus/include/opus/opus.h b/opus/include/opus/opus.h index d282f21d..eadeda75 100644 --- a/opus/include/opus/opus.h +++ b/opus/include/opus/opus.h @@ -103,7 +103,7 @@ extern "C" { * @endcode * * where opus_encoder_get_size() returns the required size for the encoder state. Note that - * future versions of this code may change the size, so no assuptions should be made about it. + * future versions of this code may change the size, so no assumptions should be made about it. * * The encoder state is always continuous in memory and only a shallow copy is sufficient * to copy it (e.g. memcpy()) @@ -198,7 +198,7 @@ OPUS_EXPORT OPUS_WARN_UNUSED_RESULT int opus_encoder_get_size(int channels); * This must be one of 8000, 12000, 16000, * 24000, or 48000. * @param [in] channels int: Number of channels (1 or 2) in input signal - * @param [in] application int: Coding mode (@ref OPUS_APPLICATION_VOIP/@ref OPUS_APPLICATION_AUDIO/@ref OPUS_APPLICATION_RESTRICTED_LOWDELAY) + * @param [in] application int: Coding mode (one of @ref OPUS_APPLICATION_VOIP, @ref OPUS_APPLICATION_AUDIO, or @ref OPUS_APPLICATION_RESTRICTED_LOWDELAY) * @param [out] error int*: @ref opus_errorcodes * @note Regardless of the sampling rate and number channels selected, the Opus encoder * can switch to a lower audio bandwidth or number of channels if the bitrate @@ -222,7 +222,7 @@ OPUS_EXPORT OPUS_WARN_UNUSED_RESULT OpusEncoder *opus_encoder_create( * This must be one of 8000, 12000, 16000, * 24000, or 48000. * @param [in] channels int: Number of channels (1 or 2) in input signal - * @param [in] application int: Coding mode (OPUS_APPLICATION_VOIP/OPUS_APPLICATION_AUDIO/OPUS_APPLICATION_RESTRICTED_LOWDELAY) + * @param [in] application int: Coding mode (one of OPUS_APPLICATION_VOIP, OPUS_APPLICATION_AUDIO, or OPUS_APPLICATION_RESTRICTED_LOWDELAY) * @retval #OPUS_OK Success or @ref opus_errorcodes */ OPUS_EXPORT int opus_encoder_init( @@ -357,7 +357,7 @@ OPUS_EXPORT int opus_encoder_ctl(OpusEncoder *st, int request, ...) OPUS_ARG_NON * error = opus_decoder_init(dec, Fs, channels); * @endcode * where opus_decoder_get_size() returns the required size for the decoder state. Note that - * future versions of this code may change the size, so no assuptions should be made about it. + * future versions of this code may change the size, so no assumptions should be made about it. * * The decoder state is always continuous in memory and only a shallow copy is sufficient * to copy it (e.g. memcpy()) @@ -398,6 +398,21 @@ OPUS_EXPORT int opus_encoder_ctl(OpusEncoder *st, int request, ...) OPUS_ARG_NON */ typedef struct OpusDecoder OpusDecoder; +/** Opus DRED decoder. + * This contains the complete state of an Opus DRED decoder. + * It is position independent and can be freely copied. + * @see opus_dred_decoder_create,opus_dred_decoder_init + */ +typedef struct OpusDREDDecoder OpusDREDDecoder; + + +/** Opus DRED state. + * This contains the complete state of an Opus DRED packet. + * It is position independent and can be freely copied. + * @see opus_dred_create,opus_dred_init + */ +typedef struct OpusDRED OpusDRED; + /** Gets the size of an OpusDecoder structure. * @param [in] channels int: Number of channels. * This must be 1 or 2. @@ -511,6 +526,101 @@ OPUS_EXPORT int opus_decoder_ctl(OpusDecoder *st, int request, ...) OPUS_ARG_NON */ OPUS_EXPORT void opus_decoder_destroy(OpusDecoder *st); +/** Gets the size of an OpusDREDDecoder structure. + * @returns The size in bytes. + */ +OPUS_EXPORT int opus_dred_decoder_get_size(void); + +/** Allocates and initializes an OpusDREDDecoder state. + * @param [out] error int*: #OPUS_OK Success or @ref opus_errorcodes + */ +OPUS_EXPORT OpusDREDDecoder *opus_dred_decoder_create(int *error); + +/** Initializes an OpusDREDDecoder state. + * @param[in] dec OpusDREDDecoder*: State to be initialized. + */ +OPUS_EXPORT int opus_dred_decoder_init(OpusDREDDecoder *dec); + +/** Frees an OpusDREDDecoder allocated by opus_dred_decoder_create(). + * @param[in] dec OpusDREDDecoder*: State to be freed. + */ +OPUS_EXPORT void opus_dred_decoder_destroy(OpusDREDDecoder *dec); + +/** Perform a CTL function on an Opus DRED decoder. + * + * Generally the request and subsequent arguments are generated + * by a convenience macro. + * @param dred_dec OpusDREDDecoder*: DRED Decoder state. + * @param request This and all remaining parameters should be replaced by one + * of the convenience macros in @ref opus_genericctls or + * @ref opus_decoderctls. + * @see opus_genericctls + * @see opus_decoderctls + */ +OPUS_EXPORT int opus_dred_decoder_ctl(OpusDREDDecoder *dred_dec, int request, ...); + +/** Gets the size of an OpusDRED structure. + * @returns The size in bytes. + */ +OPUS_EXPORT int opus_dred_get_size(void); + +/** Allocates and initializes a DRED state. + * @param [out] error int*: #OPUS_OK Success or @ref opus_errorcodes + */ +OPUS_EXPORT OpusDRED *opus_dred_alloc(int *error); + +/** Frees an OpusDRED allocated by opus_dred_create(). + * @param[in] dec OpusDRED*: State to be freed. + */ +OPUS_EXPORT void opus_dred_free(OpusDRED *dec); + +/** Decode an Opus DRED packet. + * @param [in] dred_dec OpusDRED*: DRED Decoder state + * @param [in] dred OpusDRED*: DRED state + * @param [in] data char*: Input payload + * @param [in] len opus_int32: Number of bytes in payload + * @param [in] max_dred_samples opus_int32: Maximum number of DRED samples that may be needed (if available in the packet). + * @param [in] sampling_rate opus_int32: Sampling rate used for max_dred_samples argument. Needs not match the actual sampling rate of the decoder. + * @param [out] dred_end opus_int32*: Number of non-encoded (silence) samples between the DRED timestamp and the last DRED sample. + * @param [in] defer_processing int: Flag (0 or 1). If set to one, the CPU-intensive part of the DRED decoding is deferred until opus_dred_process() is called. + * @returns Offset (positive) of the first decoded DRED samples, zero if no DRED is present, or @ref opus_errorcodes + */ +OPUS_EXPORT int opus_dred_parse(OpusDREDDecoder *dred_dec, OpusDRED *dred, const unsigned char *data, opus_int32 len, opus_int32 max_dred_samples, opus_int32 sampling_rate, int *dred_end, int defer_processing) OPUS_ARG_NONNULL(1); + +/** Finish decoding an Opus DRED packet. The function only needs to be called if opus_dred_parse() was called with defer_processing=1. + * The source and destination will often be the same DRED state. + * @param [in] dred_dec OpusDRED*: DRED Decoder state + * @param [in] src OpusDRED*: Source DRED state to start the processing from. + * @param [out] dst OpusDRED*: Destination DRED state to store the updated state after processing. + * @returns @ref opus_errorcodes + */ +OPUS_EXPORT int opus_dred_process(OpusDREDDecoder *dred_dec, const OpusDRED *src, OpusDRED *dst); + +/** Decode audio from an Opus DRED packet with floating point output. + * @param [in] st OpusDecoder*: Decoder state + * @param [in] dred OpusDRED*: DRED state + * @param [in] dred_offset opus_int32: position of the redundancy to decode (in samples before the beginning of the real audio data in the packet). + * @param [out] pcm opus_int16*: Output signal (interleaved if 2 channels). length + * is frame_size*channels*sizeof(opus_int16) + * @param [in] frame_size Number of samples per channel to decode in \a pcm. + * frame_size must be a multiple of 2.5 ms. + * @returns Number of decoded samples or @ref opus_errorcodes + */ +OPUS_EXPORT int opus_decoder_dred_decode(OpusDecoder *st, const OpusDRED *dred, opus_int32 dred_offset, opus_int16 *pcm, opus_int32 frame_size); + +/** Decode audio from an Opus DRED packet with floating point output. + * @param [in] st OpusDecoder*: Decoder state + * @param [in] dred OpusDRED*: DRED state + * @param [in] dred_offset opus_int32: position of the redundancy to decode (in samples before the beginning of the real audio data in the packet). + * @param [out] pcm float*: Output signal (interleaved if 2 channels). length + * is frame_size*channels*sizeof(float) + * @param [in] frame_size Number of samples per channel to decode in \a pcm. + * frame_size must be a multiple of 2.5 ms. + * @returns Number of decoded samples or @ref opus_errorcodes + */ +OPUS_EXPORT int opus_decoder_dred_decode_float(OpusDecoder *st, const OpusDRED *dred, opus_int32 dred_offset, float *pcm, opus_int32 frame_size); + + /** Parse an opus packet into one or more frames. * Opus_decode will perform this operation internally so most applications do * not need to use this function. @@ -583,6 +693,14 @@ OPUS_EXPORT OPUS_WARN_UNUSED_RESULT int opus_packet_get_nb_frames(const unsigned */ OPUS_EXPORT OPUS_WARN_UNUSED_RESULT int opus_packet_get_nb_samples(const unsigned char packet[], opus_int32 len, opus_int32 Fs) OPUS_ARG_NONNULL(1); +/** Checks whether an Opus packet has LBRR. + * @param [in] packet char*: Opus packet + * @param [in] len opus_int32: Length of packet + * @returns 1 is LBRR is present, 0 otherwise + * @retval OPUS_INVALID_PACKET The compressed data passed is corrupted or of an unsupported type + */ +OPUS_EXPORT OPUS_WARN_UNUSED_RESULT int opus_packet_has_lbrr(const unsigned char packet[], opus_int32 len); + /** Gets the number of samples of an Opus packet. * @param [in] dec OpusDecoder*: Decoder state * @param [in] packet char*: Opus packet diff --git a/opus/include/opus/opus_custom.h b/opus/include/opus/opus_custom.h index 41f36bf2..2f22d4b3 100644 --- a/opus/include/opus/opus_custom.h +++ b/opus/include/opus/opus_custom.h @@ -104,7 +104,8 @@ typedef struct OpusCustomDecoder OpusCustomDecoder; /** The mode contains all the information necessary to create an encoder. Both the encoder and decoder need to be initialized with exactly the same mode, otherwise the output will be - corrupted. + corrupted. The mode MUST NOT BE DESTROYED until the encoders and + decoders that use it are destroyed as well. @brief Mode configuration */ typedef struct OpusCustomMode OpusCustomMode; @@ -178,7 +179,7 @@ OPUS_CUSTOM_EXPORT OPUS_WARN_UNUSED_RESULT OpusCustomEncoder *opus_custom_encode ) OPUS_ARG_NONNULL(1); -/** Destroys a an encoder state. +/** Destroys an encoder state. * @param[in] st OpusCustomEncoder*: State to be freed. */ OPUS_CUSTOM_EXPORT void opus_custom_encoder_destroy(OpusCustomEncoder *st); @@ -286,7 +287,7 @@ OPUS_CUSTOM_EXPORT OPUS_WARN_UNUSED_RESULT OpusCustomDecoder *opus_custom_decode int *error ) OPUS_ARG_NONNULL(1); -/** Destroys a an decoder state. +/** Destroys a decoder state. * @param[in] st OpusCustomDecoder*: State to be freed. */ OPUS_CUSTOM_EXPORT void opus_custom_decoder_destroy(OpusCustomDecoder *st); diff --git a/opus/include/opus/opus_defines.h b/opus/include/opus/opus_defines.h index d141418b..cd8f4dde 100644 --- a/opus/include/opus/opus_defines.h +++ b/opus/include/opus/opus_defines.h @@ -64,7 +64,7 @@ extern "C" { /**Export control for opus functions */ #ifndef OPUS_EXPORT -# if defined(WIN32) +# if defined(_WIN32) # if defined(OPUS_BUILD) && defined(DLL_EXPORT) # define OPUS_EXPORT __declspec(dllexport) # else @@ -169,15 +169,32 @@ extern "C" { #define OPUS_SET_PHASE_INVERSION_DISABLED_REQUEST 4046 #define OPUS_GET_PHASE_INVERSION_DISABLED_REQUEST 4047 #define OPUS_GET_IN_DTX_REQUEST 4049 +#define OPUS_SET_DRED_DURATION_REQUEST 4050 +#define OPUS_GET_DRED_DURATION_REQUEST 4051 +#define OPUS_SET_DNN_BLOB_REQUEST 4052 +/*#define OPUS_GET_DNN_BLOB_REQUEST 4053 */ /** Defines for the presence of extended APIs. */ #define OPUS_HAVE_OPUS_PROJECTION_H /* Macros to trigger compilation errors when the wrong types are provided to a CTL */ #define __opus_check_int(x) (((void)((x) == (opus_int32)0)), (opus_int32)(x)) + +#ifdef DISABLE_PTR_CHECK +/* Disable checks to prevent ubsan from complaining about NULL checks + in test_opus_api. */ +#define __opus_check_int_ptr(ptr) (ptr) +#define __opus_check_uint_ptr(ptr) (ptr) +#define __opus_check_uint8_ptr(ptr) (ptr) +#define __opus_check_val16_ptr(ptr) (ptr) +#define __opus_check_void_ptr(ptr) (ptr) +#else #define __opus_check_int_ptr(ptr) ((ptr) + ((ptr) - (opus_int32*)(ptr))) #define __opus_check_uint_ptr(ptr) ((ptr) + ((ptr) - (opus_uint32*)(ptr))) +#define __opus_check_uint8_ptr(ptr) ((ptr) + ((ptr) - (opus_uint8*)(ptr))) #define __opus_check_val16_ptr(ptr) ((ptr) + ((ptr) - (opus_val16*)(ptr))) +#define __opus_check_void_ptr(x) ((void)((void *)0 == (x)), (x)) +#endif /** @endcond */ /** @defgroup opus_ctlvalues Pre-defined values for CTL interface @@ -482,7 +499,8 @@ extern "C" { * @param[in] x opus_int32: Allowed values: *
*
0
Disable inband FEC (default).
- *
1
Enable inband FEC.
+ *
1
Inband FEC enabled. If the packet loss rate is sufficiently high, Opus will automatically switch to SILK even at high rates to enable use of that FEC.
+ *
2
Inband FEC enabled, but does not necessarily switch to SILK if we have music.
*
* @hideinitializer */ #define OPUS_SET_INBAND_FEC(x) OPUS_SET_INBAND_FEC_REQUEST, __opus_check_int(x) @@ -491,7 +509,8 @@ extern "C" { * @param[out] x opus_int32 *: Returns one of the following values: *
*
0
Inband FEC disabled (default).
- *
1
Inband FEC enabled.
+ *
1
Inband FEC enabled. If the packet loss rate is sufficiently high, Opus will automatically switch to SILK even at high rates to enable use of that FEC.
+ *
2
Inband FEC enabled, but does not necessarily switch to SILK if we have music.
*
* @hideinitializer */ #define OPUS_GET_INBAND_FEC(x) OPUS_GET_INBAND_FEC_REQUEST, __opus_check_int_ptr(x) @@ -618,6 +637,18 @@ extern "C" { * @hideinitializer */ #define OPUS_GET_PREDICTION_DISABLED(x) OPUS_GET_PREDICTION_DISABLED_REQUEST, __opus_check_int_ptr(x) +/** If non-zero, enables Deep Redundancy (DRED) and use the specified maximum number of 10-ms redundant frames + * @hideinitializer */ +#define OPUS_SET_DRED_DURATION(x) OPUS_SET_DRED_DURATION_REQUEST, __opus_check_int(x) +/** Gets the encoder's configured Deep Redundancy (DRED) maximum number of frames. + * @hideinitializer */ +#define OPUS_GET_DRED_DURATION(x) OPUS_GET_DRED_DURATION_REQUEST, __opus_check_int_ptr(x) + +/** Provide external DNN weights from binary object (only when explicitly built without the weights) + * @hideinitializer */ +#define OPUS_SET_DNN_BLOB(data, len) OPUS_SET_DNN_BLOB_REQUEST, __opus_check_void_ptr(data), __opus_check_int(len) + + /**@}*/ /** @defgroup opus_genericctls Generic CTLs diff --git a/opus/include/opus/opus_multistream.h b/opus/include/opus/opus_multistream.h index babcee69..824cc55a 100644 --- a/opus/include/opus/opus_multistream.h +++ b/opus/include/opus/opus_multistream.h @@ -143,7 +143,7 @@ extern "C" { * Vorbis * channel ordering. A decoder may wish to apply an additional permutation * to the mapping the encoder used to achieve a different output channel - * order (e.g. for outputing in WAV order). + * order (e.g. for outputting in WAV order). * * Each multistream packet contains an Opus packet for each stream, and all of * the Opus packets in a single multistream packet must have the same diff --git a/opus/silk/API.h b/opus/silk/API.h index 4d90ff9a..878965c7 100644 --- a/opus/silk/API.h +++ b/opus/silk/API.h @@ -34,6 +34,10 @@ POSSIBILITY OF SUCH DAMAGE. #include "entenc.h" #include "entdec.h" +#ifdef ENABLE_DEEP_PLC +#include "lpcnet_private.h" +#endif + #ifdef __cplusplus extern "C" { @@ -88,6 +92,16 @@ opus_int silk_Encode( /* O Returns error co /* Decoder functions */ /****************************************/ + +/***********************************************/ +/* Load OSCE models from external data pointer */ +/***********************************************/ +opus_int silk_LoadOSCEModels( + void *decState, /* O I/O State */ + const unsigned char *data, /* I pointer to binary blob */ + int len /* I length of binary blob data */ +); + /***********************************************/ /* Get size in bytes of the Silk decoder state */ /***********************************************/ @@ -96,8 +110,12 @@ opus_int silk_Get_Decoder_Size( /* O Returns error co ); /*************************/ -/* Init or Reset decoder */ +/* Init and Reset decoder */ /*************************/ +opus_int silk_ResetDecoder( /* O Returns error code */ + void *decState /* I/O State */ +); + opus_int silk_InitDecoder( /* O Returns error code */ void *decState /* I/O State */ ); @@ -113,6 +131,9 @@ opus_int silk_Decode( /* O Returns error co ec_dec *psRangeDec, /* I/O Compressor data structure */ opus_int16 *samplesOut, /* O Decoded output speech vector */ opus_int32 *nSamplesOut, /* O Number of samples decoded */ +#ifdef ENABLE_DEEP_PLC + LPCNetPLCState *lpcnet, +#endif int arch /* I Run-time architecture */ ); diff --git a/opus/silk/CNG.c b/opus/silk/CNG.c index ef8e38df..2a910099 100644 --- a/opus/silk/CNG.c +++ b/opus/silk/CNG.c @@ -118,6 +118,10 @@ void silk_CNG( /* Smooth gains */ for( i = 0; i < psDec->nb_subfr; i++ ) { psCNG->CNG_smth_Gain_Q16 += silk_SMULWB( psDecCtrl->Gains_Q16[ i ] - psCNG->CNG_smth_Gain_Q16, CNG_GAIN_SMTH_Q16 ); + /* If the smoothed gain is 3 dB greater than this subframe's gain, use this subframe's gain to adapt faster. */ + if( silk_SMULWW( psCNG->CNG_smth_Gain_Q16, CNG_GAIN_SMTH_THRESHOLD_Q16 ) > psDecCtrl->Gains_Q16[ i ] ) { + psCNG->CNG_smth_Gain_Q16 = psDecCtrl->Gains_Q16[ i ]; + } } } diff --git a/opus/silk/LPC_fit.c b/opus/silk/LPC_fit.c index cdea4f3a..c0690a1f 100644 --- a/opus/silk/LPC_fit.c +++ b/opus/silk/LPC_fit.c @@ -31,7 +31,8 @@ POSSIBILITY OF SUCH DAMAGE. #include "SigProc_FIX.h" -/* Convert int32 coefficients to int16 coefs and make sure there's no wrap-around */ +/* Convert int32 coefficients to int16 coefs and make sure there's no wrap-around. + This logic is reused in _celt_lpc(). Any bug fixes should also be applied there. */ void silk_LPC_fit( opus_int16 *a_QOUT, /* O Output signal */ opus_int32 *a_QIN, /* I/O Input signal */ diff --git a/opus/silk/MacroCount.h b/opus/silk/MacroCount.h index 78100ffe..dab2f57a 100644 --- a/opus/silk/MacroCount.h +++ b/opus/silk/MacroCount.h @@ -27,9 +27,9 @@ POSSIBILITY OF SUCH DAMAGE. #ifndef SIGPROCFIX_API_MACROCOUNT_H #define SIGPROCFIX_API_MACROCOUNT_H -#include #ifdef silk_MACRO_COUNT +#include #define varDefine opus_int64 ops_count = 0; extern opus_int64 ops_count; diff --git a/opus/silk/MacroDebug.h b/opus/silk/MacroDebug.h index 8dd4ce2e..3110da9a 100644 --- a/opus/silk/MacroDebug.h +++ b/opus/silk/MacroDebug.h @@ -55,7 +55,7 @@ static OPUS_INLINE opus_int16 silk_ADD16_(opus_int16 a, opus_int16 b, char *file static OPUS_INLINE opus_int32 silk_ADD32_(opus_int32 a, opus_int32 b, char *file, int line){ opus_int32 ret; - ret = a + b; + ret = (opus_int32)((opus_uint32)a + (opus_uint32)b); if ( ret != silk_ADD_SAT32( a, b ) ) { fprintf (stderr, "silk_ADD32(%d, %d) in %s: line %d\n", a, b, file, line); @@ -101,9 +101,9 @@ static OPUS_INLINE opus_int16 silk_SUB16_(opus_int16 a, opus_int16 b, char *file #undef silk_SUB32 #define silk_SUB32(a,b) silk_SUB32_((a), (b), __FILE__, __LINE__) static OPUS_INLINE opus_int32 silk_SUB32_(opus_int32 a, opus_int32 b, char *file, int line){ - opus_int32 ret; + opus_int64 ret; - ret = a - b; + ret = a - (opus_int64)b; if ( ret != silk_SUB_SAT32( a, b ) ) { fprintf (stderr, "silk_SUB32(%d, %d) in %s: line %d\n", a, b, file, line); @@ -257,7 +257,7 @@ static OPUS_INLINE opus_int64 silk_SUB_SAT64_( opus_int64 a64, opus_int64 b64, c static OPUS_INLINE opus_int32 silk_MUL_(opus_int32 a32, opus_int32 b32, char *file, int line){ opus_int32 ret; opus_int64 ret64; - ret = a32 * b32; + ret = (opus_int32)((opus_uint32)a32 * (opus_uint32)b32); ret64 = (opus_int64)a32 * (opus_int64)b32; if ( (opus_int64)ret != ret64 ) { @@ -333,8 +333,8 @@ static OPUS_INLINE opus_int32 silk_SMULWB_(opus_int32 a32, opus_int32 b32, char #define silk_SMLAWB(a,b,c) silk_SMLAWB_((a), (b), (c), __FILE__, __LINE__) static OPUS_INLINE opus_int32 silk_SMLAWB_(opus_int32 a32, opus_int32 b32, opus_int32 c32, char *file, int line){ opus_int32 ret; - ret = silk_ADD32( a32, silk_SMULWB( b32, c32 ) ); - if ( silk_ADD32( a32, silk_SMULWB( b32, c32 ) ) != silk_ADD_SAT32( a32, silk_SMULWB( b32, c32 ) ) ) + ret = silk_ADD32_ovflw( a32, silk_SMULWB( b32, c32 ) ); + if ( ret != silk_ADD_SAT32( a32, silk_SMULWB( b32, c32 ) ) ) { fprintf (stderr, "silk_SMLAWB(%d, %d, %d) in %s: line %d\n", a32, b32, c32, file, line); #ifdef FIXED_DEBUG_ASSERT @@ -465,7 +465,7 @@ static OPUS_INLINE opus_int32 silk_SMULWW_(opus_int32 a32, opus_int32 b32, char if ( fail ) { - fprintf (stderr, "silk_SMULWT(%d, %d) in %s: line %d\n", a32, b32, file, line); + fprintf (stderr, "silk_SMULWW(%d, %d) in %s: line %d\n", a32, b32, file, line); #ifdef FIXED_DEBUG_ASSERT silk_assert( 0 ); #endif @@ -491,12 +491,6 @@ static OPUS_INLINE opus_int32 silk_SMLAWW_(opus_int32 a32, opus_int32 b32, opus_ return ret; } -/* Multiply-accumulate macros that allow overflow in the addition (ie, no asserts in debug mode) */ -#undef silk_MLA_ovflw -#define silk_MLA_ovflw(a32, b32, c32) ((a32) + ((b32) * (c32))) -#undef silk_SMLABB_ovflw -#define silk_SMLABB_ovflw(a32, b32, c32) ((a32) + ((opus_int32)((opus_int16)(b32))) * (opus_int32)((opus_int16)(c32))) - /* no checking needed for silk_SMULL no checking needed for silk_SMLAL no checking needed for silk_SMLALBB @@ -546,10 +540,10 @@ static OPUS_INLINE opus_int32 silk_DIV32_16_(opus_int32 a32, opus_int32 b32, cha static OPUS_INLINE opus_int8 silk_LSHIFT8_(opus_int8 a, opus_int32 shift, char *file, int line){ opus_int8 ret; int fail = 0; - ret = a << shift; + ret = (opus_int8)((opus_uint8)a << shift); fail |= shift < 0; fail |= shift >= 8; - fail |= (opus_int64)ret != ((opus_int64)a) << shift; + fail |= (opus_int64)ret != (opus_int64)(((opus_uint64)a) << shift); if ( fail ) { fprintf (stderr, "silk_LSHIFT8(%d, %d) in %s: line %d\n", a, shift, file, line); @@ -565,10 +559,10 @@ static OPUS_INLINE opus_int8 silk_LSHIFT8_(opus_int8 a, opus_int32 shift, char * static OPUS_INLINE opus_int16 silk_LSHIFT16_(opus_int16 a, opus_int32 shift, char *file, int line){ opus_int16 ret; int fail = 0; - ret = a << shift; + ret = (opus_int16)((opus_uint16)a << shift); fail |= shift < 0; fail |= shift >= 16; - fail |= (opus_int64)ret != ((opus_int64)a) << shift; + fail |= (opus_int64)ret != (opus_int64)(((opus_uint64)a) << shift); if ( fail ) { fprintf (stderr, "silk_LSHIFT16(%d, %d) in %s: line %d\n", a, shift, file, line); @@ -584,10 +578,10 @@ static OPUS_INLINE opus_int16 silk_LSHIFT16_(opus_int16 a, opus_int32 shift, cha static OPUS_INLINE opus_int32 silk_LSHIFT32_(opus_int32 a, opus_int32 shift, char *file, int line){ opus_int32 ret; int fail = 0; - ret = a << shift; + ret = (opus_int32)((opus_uint32)a << shift); fail |= shift < 0; fail |= shift >= 32; - fail |= (opus_int64)ret != ((opus_int64)a) << shift; + fail |= (opus_int64)ret != (opus_int64)(((opus_uint64)a) << shift); if ( fail ) { fprintf (stderr, "silk_LSHIFT32(%d, %d) in %s: line %d\n", a, shift, file, line); @@ -603,7 +597,7 @@ static OPUS_INLINE opus_int32 silk_LSHIFT32_(opus_int32 a, opus_int32 shift, cha static OPUS_INLINE opus_int64 silk_LSHIFT64_(opus_int64 a, opus_int shift, char *file, int line){ opus_int64 ret; int fail = 0; - ret = a << shift; + ret = (opus_int64)((opus_uint64)a << shift); fail |= shift < 0; fail |= shift >= 64; fail |= (ret>>shift) != ((opus_int64)a); @@ -714,8 +708,8 @@ static OPUS_INLINE opus_uint32 silk_RSHIFT_uint_(opus_uint32 a, opus_int32 shift #define silk_ADD_LSHIFT(a,b,c) silk_ADD_LSHIFT_((a), (b), (c), __FILE__, __LINE__) static OPUS_INLINE int silk_ADD_LSHIFT_(int a, int b, int shift, char *file, int line){ opus_int16 ret; - ret = a + (b << shift); - if ( (shift < 0) || (shift>15) || ((opus_int64)ret != (opus_int64)a + (((opus_int64)b) << shift)) ) + ret = a + (opus_int16)((opus_uint16)b << shift); + if ( (shift < 0) || (shift>15) || ((opus_int64)ret != (opus_int64)a + (opus_int64)(((opus_uint64)b) << shift)) ) { fprintf (stderr, "silk_ADD_LSHIFT(%d, %d, %d) in %s: line %d\n", a, b, shift, file, line); #ifdef FIXED_DEBUG_ASSERT @@ -729,8 +723,8 @@ static OPUS_INLINE int silk_ADD_LSHIFT_(int a, int b, int shift, char *file, int #define silk_ADD_LSHIFT32(a,b,c) silk_ADD_LSHIFT32_((a), (b), (c), __FILE__, __LINE__) static OPUS_INLINE opus_int32 silk_ADD_LSHIFT32_(opus_int32 a, opus_int32 b, opus_int32 shift, char *file, int line){ opus_int32 ret; - ret = a + (b << shift); - if ( (shift < 0) || (shift>31) || ((opus_int64)ret != (opus_int64)a + (((opus_int64)b) << shift)) ) + ret = silk_ADD32_ovflw(a, (opus_int32)((opus_uint32)b << shift)); + if ( (shift < 0) || (shift>31) || ((opus_int64)ret != (opus_int64)a + (opus_int64)(((opus_uint64)b) << shift)) ) { fprintf (stderr, "silk_ADD_LSHIFT32(%d, %d, %d) in %s: line %d\n", a, b, shift, file, line); #ifdef FIXED_DEBUG_ASSERT @@ -774,7 +768,7 @@ static OPUS_INLINE int silk_ADD_RSHIFT_(int a, int b, int shift, char *file, int #define silk_ADD_RSHIFT32(a,b,c) silk_ADD_RSHIFT32_((a), (b), (c), __FILE__, __LINE__) static OPUS_INLINE opus_int32 silk_ADD_RSHIFT32_(opus_int32 a, opus_int32 b, opus_int32 shift, char *file, int line){ opus_int32 ret; - ret = a + (b >> shift); + ret = silk_ADD32_ovflw(a, (b >> shift)); if ( (shift < 0) || (shift>31) || ((opus_int64)ret != (opus_int64)a + (((opus_int64)b) >> shift)) ) { fprintf (stderr, "silk_ADD_RSHIFT32(%d, %d, %d) in %s: line %d\n", a, b, shift, file, line); @@ -804,8 +798,8 @@ static OPUS_INLINE opus_uint32 silk_ADD_RSHIFT_uint_(opus_uint32 a, opus_uint32 #define silk_SUB_LSHIFT32(a,b,c) silk_SUB_LSHIFT32_((a), (b), (c), __FILE__, __LINE__) static OPUS_INLINE opus_int32 silk_SUB_LSHIFT32_(opus_int32 a, opus_int32 b, opus_int32 shift, char *file, int line){ opus_int32 ret; - ret = a - (b << shift); - if ( (shift < 0) || (shift>31) || ((opus_int64)ret != (opus_int64)a - (((opus_int64)b) << shift)) ) + ret = silk_SUB32_ovflw(a, (opus_int32)((opus_uint32)b << shift)); + if ( (shift < 0) || (shift>31) || ((opus_int64)ret != (opus_int64)a - (opus_int64)(((opus_uint64)b) << shift)) ) { fprintf (stderr, "silk_SUB_LSHIFT32(%d, %d, %d) in %s: line %d\n", a, b, shift, file, line); #ifdef FIXED_DEBUG_ASSERT @@ -819,7 +813,7 @@ static OPUS_INLINE opus_int32 silk_SUB_LSHIFT32_(opus_int32 a, opus_int32 b, opu #define silk_SUB_RSHIFT32(a,b,c) silk_SUB_RSHIFT32_((a), (b), (c), __FILE__, __LINE__) static OPUS_INLINE opus_int32 silk_SUB_RSHIFT32_(opus_int32 a, opus_int32 b, opus_int32 shift, char *file, int line){ opus_int32 ret; - ret = a - (b >> shift); + ret = silk_SUB32_ovflw(a, (b >> shift)); if ( (shift < 0) || (shift>31) || ((opus_int64)ret != (opus_int64)a - (((opus_int64)b) >> shift)) ) { fprintf (stderr, "silk_SUB_RSHIFT32(%d, %d, %d) in %s: line %d\n", a, b, shift, file, line); @@ -835,7 +829,7 @@ static OPUS_INLINE opus_int32 silk_SUB_RSHIFT32_(opus_int32 a, opus_int32 b, opu static OPUS_INLINE opus_int32 silk_RSHIFT_ROUND_(opus_int32 a, opus_int32 shift, char *file, int line){ opus_int32 ret; ret = shift == 1 ? (a >> 1) + (a & 1) : ((a >> (shift - 1)) + 1) >> 1; - /* the marco definition can't handle a shift of zero */ + /* the macro definition can't handle a shift of zero */ if ( (shift <= 0) || (shift>31) || ((opus_int64)ret != ((opus_int64)a + ((opus_int64)1 << (shift - 1))) >> shift) ) { fprintf (stderr, "silk_RSHIFT_ROUND(%d, %d) in %s: line %d\n", a, shift, file, line); @@ -850,7 +844,7 @@ static OPUS_INLINE opus_int32 silk_RSHIFT_ROUND_(opus_int32 a, opus_int32 shift, #define silk_RSHIFT_ROUND64(a,b) silk_RSHIFT_ROUND64_((a), (b), __FILE__, __LINE__) static OPUS_INLINE opus_int64 silk_RSHIFT_ROUND64_(opus_int64 a, opus_int32 shift, char *file, int line){ opus_int64 ret; - /* the marco definition can't handle a shift of zero */ + /* the macro definition can't handle a shift of zero */ if ( (shift <= 0) || (shift>=64) ) { fprintf (stderr, "silk_RSHIFT_ROUND64(%lld, %d) in %s: line %d\n", (long long)a, shift, file, line); diff --git a/opus/silk/NSQ.c b/opus/silk/NSQ.c index 1d64d8e2..1caa829b 100644 --- a/opus/silk/NSQ.c +++ b/opus/silk/NSQ.c @@ -75,21 +75,21 @@ static OPUS_INLINE void silk_noise_shape_quantizer( void silk_NSQ_c ( - const silk_encoder_state *psEncC, /* I Encoder State */ - silk_nsq_state *NSQ, /* I/O NSQ state */ - SideInfoIndices *psIndices, /* I/O Quantization Indices */ + const silk_encoder_state *psEncC, /* I Encoder State */ + silk_nsq_state *NSQ, /* I/O NSQ state */ + SideInfoIndices *psIndices, /* I/O Quantization Indices */ const opus_int16 x16[], /* I Input */ - opus_int8 pulses[], /* O Quantized pulse signal */ - const opus_int16 PredCoef_Q12[ 2 * MAX_LPC_ORDER ], /* I Short term prediction coefs */ - const opus_int16 LTPCoef_Q14[ LTP_ORDER * MAX_NB_SUBFR ], /* I Long term prediction coefs */ - const opus_int16 AR_Q13[ MAX_NB_SUBFR * MAX_SHAPE_LPC_ORDER ], /* I Noise shaping coefs */ - const opus_int HarmShapeGain_Q14[ MAX_NB_SUBFR ], /* I Long term shaping coefs */ - const opus_int Tilt_Q14[ MAX_NB_SUBFR ], /* I Spectral tilt */ - const opus_int32 LF_shp_Q14[ MAX_NB_SUBFR ], /* I Low frequency shaping coefs */ - const opus_int32 Gains_Q16[ MAX_NB_SUBFR ], /* I Quantization step sizes */ - const opus_int pitchL[ MAX_NB_SUBFR ], /* I Pitch lags */ - const opus_int Lambda_Q10, /* I Rate/distortion tradeoff */ - const opus_int LTP_scale_Q14 /* I LTP state scaling */ + opus_int8 pulses[], /* O Quantized pulse signal */ + const opus_int16 *PredCoef_Q12, /* I Short term prediction coefs */ + const opus_int16 LTPCoef_Q14[ LTP_ORDER * MAX_NB_SUBFR ], /* I Long term prediction coefs */ + const opus_int16 AR_Q13[ MAX_NB_SUBFR * MAX_SHAPE_LPC_ORDER ], /* I Noise shaping coefs */ + const opus_int HarmShapeGain_Q14[ MAX_NB_SUBFR ], /* I Long term shaping coefs */ + const opus_int Tilt_Q14[ MAX_NB_SUBFR ], /* I Spectral tilt */ + const opus_int32 LF_shp_Q14[ MAX_NB_SUBFR ], /* I Low frequency shaping coefs */ + const opus_int32 Gains_Q16[ MAX_NB_SUBFR ], /* I Quantization step sizes */ + const opus_int pitchL[ MAX_NB_SUBFR ], /* I Pitch lags */ + const opus_int Lambda_Q10, /* I Rate/distortion tradeoff */ + const opus_int LTP_scale_Q14 /* I LTP state scaling */ ) { opus_int k, lag, start_idx, LSF_interpolation_flag; @@ -173,9 +173,9 @@ void silk_NSQ_c RESTORE_STACK; } -/***********************************/ -/* silk_noise_shape_quantizer */ -/***********************************/ +/******************************/ +/* silk_noise_shape_quantizer */ +/******************************/ #if !defined(OPUS_X86_MAY_HAVE_SSE4_1) static OPUS_INLINE @@ -258,17 +258,17 @@ void silk_noise_shape_quantizer( celt_assert( lag > 0 || signalType != TYPE_VOICED ); /* Combine prediction and noise shaping signals */ - tmp1 = silk_SUB32( silk_LSHIFT32( LPC_pred_Q10, 2 ), n_AR_Q12 ); /* Q12 */ - tmp1 = silk_SUB32( tmp1, n_LF_Q12 ); /* Q12 */ + tmp1 = silk_SUB32_ovflw( silk_LSHIFT32( LPC_pred_Q10, 2 ), n_AR_Q12 ); /* Q12 */ + tmp1 = silk_SUB32_ovflw( tmp1, n_LF_Q12 ); /* Q12 */ if( lag > 0 ) { /* Symmetric, packed FIR coefficients */ - n_LTP_Q13 = silk_SMULWB( silk_ADD32( shp_lag_ptr[ 0 ], shp_lag_ptr[ -2 ] ), HarmShapeFIRPacked_Q14 ); + n_LTP_Q13 = silk_SMULWB( silk_ADD_SAT32( shp_lag_ptr[ 0 ], shp_lag_ptr[ -2 ] ), HarmShapeFIRPacked_Q14 ); n_LTP_Q13 = silk_SMLAWT( n_LTP_Q13, shp_lag_ptr[ -1 ], HarmShapeFIRPacked_Q14 ); n_LTP_Q13 = silk_LSHIFT( n_LTP_Q13, 1 ); shp_lag_ptr++; tmp2 = silk_SUB32( LTP_pred_Q13, n_LTP_Q13 ); /* Q13 */ - tmp1 = silk_ADD_LSHIFT32( tmp2, tmp1, 1 ); /* Q13 */ + tmp1 = silk_ADD32_ovflw( tmp2, silk_LSHIFT32( tmp1, 1 ) ); /* Q13 */ tmp1 = silk_RSHIFT_ROUND( tmp1, 3 ); /* Q10 */ } else { tmp1 = silk_RSHIFT_ROUND( tmp1, 2 ); /* Q10 */ @@ -340,7 +340,7 @@ void silk_noise_shape_quantizer( /* Add predictions */ LPC_exc_Q14 = silk_ADD_LSHIFT32( exc_Q14, LTP_pred_Q13, 1 ); - xq_Q14 = silk_ADD_LSHIFT32( LPC_exc_Q14, LPC_pred_Q10, 4 ); + xq_Q14 = silk_ADD32_ovflw( LPC_exc_Q14, silk_LSHIFT32( LPC_pred_Q10, 4 ) ); /* Scale XQ back to normal level before saving */ xq[ i ] = (opus_int16)silk_SAT16( silk_RSHIFT_ROUND( silk_SMULWW( xq_Q14, Gain_Q10 ), 8 ) ); @@ -349,10 +349,10 @@ void silk_noise_shape_quantizer( psLPC_Q14++; *psLPC_Q14 = xq_Q14; NSQ->sDiff_shp_Q14 = silk_SUB_LSHIFT32( xq_Q14, x_sc_Q10[ i ], 4 ); - sLF_AR_shp_Q14 = silk_SUB_LSHIFT32( NSQ->sDiff_shp_Q14, n_AR_Q12, 2 ); + sLF_AR_shp_Q14 = silk_SUB32_ovflw( NSQ->sDiff_shp_Q14, silk_LSHIFT32( n_AR_Q12, 2 ) ); NSQ->sLF_AR_shp_Q14 = sLF_AR_shp_Q14; - NSQ->sLTP_shp_Q14[ NSQ->sLTP_shp_buf_idx ] = silk_SUB_LSHIFT32( sLF_AR_shp_Q14, n_LF_Q12, 2 ); + NSQ->sLTP_shp_Q14[ NSQ->sLTP_shp_buf_idx ] = silk_SUB32_ovflw(sLF_AR_shp_Q14, silk_LSHIFT32(n_LF_Q12, 2)); sLTP_Q15[ NSQ->sLTP_buf_idx ] = silk_LSHIFT( LPC_exc_Q14, 1 ); NSQ->sLTP_shp_buf_idx++; NSQ->sLTP_buf_idx++; diff --git a/opus/silk/NSQ_del_dec.c b/opus/silk/NSQ_del_dec.c index 3fd9fa0d..e8dadf15 100644 --- a/opus/silk/NSQ_del_dec.c +++ b/opus/silk/NSQ_del_dec.c @@ -115,21 +115,21 @@ static OPUS_INLINE void silk_noise_shape_quantizer_del_dec( ); void silk_NSQ_del_dec_c( - const silk_encoder_state *psEncC, /* I Encoder State */ - silk_nsq_state *NSQ, /* I/O NSQ state */ - SideInfoIndices *psIndices, /* I/O Quantization Indices */ + const silk_encoder_state *psEncC, /* I Encoder State */ + silk_nsq_state *NSQ, /* I/O NSQ state */ + SideInfoIndices *psIndices, /* I/O Quantization Indices */ const opus_int16 x16[], /* I Input */ - opus_int8 pulses[], /* O Quantized pulse signal */ - const opus_int16 PredCoef_Q12[ 2 * MAX_LPC_ORDER ], /* I Short term prediction coefs */ - const opus_int16 LTPCoef_Q14[ LTP_ORDER * MAX_NB_SUBFR ], /* I Long term prediction coefs */ - const opus_int16 AR_Q13[ MAX_NB_SUBFR * MAX_SHAPE_LPC_ORDER ], /* I Noise shaping coefs */ - const opus_int HarmShapeGain_Q14[ MAX_NB_SUBFR ], /* I Long term shaping coefs */ - const opus_int Tilt_Q14[ MAX_NB_SUBFR ], /* I Spectral tilt */ - const opus_int32 LF_shp_Q14[ MAX_NB_SUBFR ], /* I Low frequency shaping coefs */ - const opus_int32 Gains_Q16[ MAX_NB_SUBFR ], /* I Quantization step sizes */ - const opus_int pitchL[ MAX_NB_SUBFR ], /* I Pitch lags */ - const opus_int Lambda_Q10, /* I Rate/distortion tradeoff */ - const opus_int LTP_scale_Q14 /* I LTP state scaling */ + opus_int8 pulses[], /* O Quantized pulse signal */ + const opus_int16 *PredCoef_Q12, /* I Short term prediction coefs */ + const opus_int16 LTPCoef_Q14[ LTP_ORDER * MAX_NB_SUBFR ], /* I Long term prediction coefs */ + const opus_int16 AR_Q13[ MAX_NB_SUBFR * MAX_SHAPE_LPC_ORDER ], /* I Noise shaping coefs */ + const opus_int HarmShapeGain_Q14[ MAX_NB_SUBFR ], /* I Long term shaping coefs */ + const opus_int Tilt_Q14[ MAX_NB_SUBFR ], /* I Spectral tilt */ + const opus_int32 LF_shp_Q14[ MAX_NB_SUBFR ], /* I Low frequency shaping coefs */ + const opus_int32 Gains_Q16[ MAX_NB_SUBFR ], /* I Quantization step sizes */ + const opus_int pitchL[ MAX_NB_SUBFR ], /* I Pitch lags */ + const opus_int Lambda_Q10, /* I Rate/distortion tradeoff */ + const opus_int LTP_scale_Q14 /* I LTP state scaling */ ) { opus_int i, k, lag, start_idx, LSF_interpolation_flag, Winner_ind, subfr; @@ -394,8 +394,8 @@ static OPUS_INLINE void silk_noise_shape_quantizer_del_dec( /* Long-term shaping */ if( lag > 0 ) { /* Symmetric, packed FIR coefficients */ - n_LTP_Q14 = silk_SMULWB( silk_ADD32( shp_lag_ptr[ 0 ], shp_lag_ptr[ -2 ] ), HarmShapeFIRPacked_Q14 ); - n_LTP_Q14 = silk_SMLAWT( n_LTP_Q14, shp_lag_ptr[ -1 ], HarmShapeFIRPacked_Q14 ); + n_LTP_Q14 = silk_SMULWB( silk_ADD_SAT32( shp_lag_ptr[ 0 ], shp_lag_ptr[ -2 ] ), HarmShapeFIRPacked_Q14 ); + n_LTP_Q14 = silk_SMLAWT( n_LTP_Q14, shp_lag_ptr[ -1 ], HarmShapeFIRPacked_Q14 ); n_LTP_Q14 = silk_SUB_LSHIFT32( LTP_pred_Q14, n_LTP_Q14, 2 ); /* Q12 -> Q14 */ shp_lag_ptr++; } else { @@ -423,18 +423,18 @@ static OPUS_INLINE void silk_noise_shape_quantizer_del_dec( /* Output of lowpass section */ tmp2 = silk_SMLAWB( psDD->Diff_Q14, psDD->sAR2_Q14[ 0 ], warping_Q16 ); /* Output of allpass section */ - tmp1 = silk_SMLAWB( psDD->sAR2_Q14[ 0 ], psDD->sAR2_Q14[ 1 ] - tmp2, warping_Q16 ); + tmp1 = silk_SMLAWB( psDD->sAR2_Q14[ 0 ], silk_SUB32_ovflw(psDD->sAR2_Q14[ 1 ], tmp2), warping_Q16 ); psDD->sAR2_Q14[ 0 ] = tmp2; n_AR_Q14 = silk_RSHIFT( shapingLPCOrder, 1 ); n_AR_Q14 = silk_SMLAWB( n_AR_Q14, tmp2, AR_shp_Q13[ 0 ] ); /* Loop over allpass sections */ for( j = 2; j < shapingLPCOrder; j += 2 ) { /* Output of allpass section */ - tmp2 = silk_SMLAWB( psDD->sAR2_Q14[ j - 1 ], psDD->sAR2_Q14[ j + 0 ] - tmp1, warping_Q16 ); + tmp2 = silk_SMLAWB( psDD->sAR2_Q14[ j - 1 ], silk_SUB32_ovflw(psDD->sAR2_Q14[ j + 0 ], tmp1), warping_Q16 ); psDD->sAR2_Q14[ j - 1 ] = tmp1; n_AR_Q14 = silk_SMLAWB( n_AR_Q14, tmp1, AR_shp_Q13[ j - 1 ] ); /* Output of allpass section */ - tmp1 = silk_SMLAWB( psDD->sAR2_Q14[ j + 0 ], psDD->sAR2_Q14[ j + 1 ] - tmp2, warping_Q16 ); + tmp1 = silk_SMLAWB( psDD->sAR2_Q14[ j + 0 ], silk_SUB32_ovflw(psDD->sAR2_Q14[ j + 1 ], tmp2), warping_Q16 ); psDD->sAR2_Q14[ j + 0 ] = tmp2; n_AR_Q14 = silk_SMLAWB( n_AR_Q14, tmp2, AR_shp_Q13[ j ] ); } @@ -451,9 +451,9 @@ static OPUS_INLINE void silk_noise_shape_quantizer_del_dec( /* Input minus prediction plus noise feedback */ /* r = x[ i ] - LTP_pred - LPC_pred + n_AR + n_Tilt + n_LF + n_LTP */ - tmp1 = silk_ADD32( n_AR_Q14, n_LF_Q14 ); /* Q14 */ - tmp2 = silk_ADD32( n_LTP_Q14, LPC_pred_Q14 ); /* Q13 */ - tmp1 = silk_SUB32( tmp2, tmp1 ); /* Q13 */ + tmp1 = silk_ADD_SAT32( n_AR_Q14, n_LF_Q14 ); /* Q14 */ + tmp2 = silk_ADD32_ovflw( n_LTP_Q14, LPC_pred_Q14 ); /* Q13 */ + tmp1 = silk_SUB_SAT32( tmp2, tmp1 ); /* Q13 */ tmp1 = silk_RSHIFT_ROUND( tmp1, 4 ); /* Q10 */ r_Q10 = silk_SUB32( x_Q10[ i ], tmp1 ); /* residual error Q10 */ @@ -530,12 +530,12 @@ static OPUS_INLINE void silk_noise_shape_quantizer_del_dec( /* Add predictions */ LPC_exc_Q14 = silk_ADD32( exc_Q14, LTP_pred_Q14 ); - xq_Q14 = silk_ADD32( LPC_exc_Q14, LPC_pred_Q14 ); + xq_Q14 = silk_ADD32_ovflw( LPC_exc_Q14, LPC_pred_Q14 ); /* Update states */ - psSS[ 0 ].Diff_Q14 = silk_SUB_LSHIFT32( xq_Q14, x_Q10[ i ], 4 ); - sLF_AR_shp_Q14 = silk_SUB32( psSS[ 0 ].Diff_Q14, n_AR_Q14 ); - psSS[ 0 ].sLTP_shp_Q14 = silk_SUB32( sLF_AR_shp_Q14, n_LF_Q14 ); + psSS[ 0 ].Diff_Q14 = silk_SUB32_ovflw( xq_Q14, silk_LSHIFT32( x_Q10[ i ], 4 ) ); + sLF_AR_shp_Q14 = silk_SUB32_ovflw( psSS[ 0 ].Diff_Q14, n_AR_Q14 ); + psSS[ 0 ].sLTP_shp_Q14 = silk_SUB_SAT32( sLF_AR_shp_Q14, n_LF_Q14 ); psSS[ 0 ].LF_AR_Q14 = sLF_AR_shp_Q14; psSS[ 0 ].LPC_exc_Q14 = LPC_exc_Q14; psSS[ 0 ].xq_Q14 = xq_Q14; @@ -550,12 +550,12 @@ static OPUS_INLINE void silk_noise_shape_quantizer_del_dec( /* Add predictions */ LPC_exc_Q14 = silk_ADD32( exc_Q14, LTP_pred_Q14 ); - xq_Q14 = silk_ADD32( LPC_exc_Q14, LPC_pred_Q14 ); + xq_Q14 = silk_ADD32_ovflw( LPC_exc_Q14, LPC_pred_Q14 ); /* Update states */ - psSS[ 1 ].Diff_Q14 = silk_SUB_LSHIFT32( xq_Q14, x_Q10[ i ], 4 ); - sLF_AR_shp_Q14 = silk_SUB32( psSS[ 1 ].Diff_Q14, n_AR_Q14 ); - psSS[ 1 ].sLTP_shp_Q14 = silk_SUB32( sLF_AR_shp_Q14, n_LF_Q14 ); + psSS[ 1 ].Diff_Q14 = silk_SUB32_ovflw( xq_Q14, silk_LSHIFT32( x_Q10[ i ], 4 ) ); + sLF_AR_shp_Q14 = silk_SUB32_ovflw( psSS[ 1 ].Diff_Q14, n_AR_Q14 ); + psSS[ 1 ].sLTP_shp_Q14 = silk_SUB_SAT32( sLF_AR_shp_Q14, n_LF_Q14 ); psSS[ 1 ].LF_AR_Q14 = sLF_AR_shp_Q14; psSS[ 1 ].LPC_exc_Q14 = LPC_exc_Q14; psSS[ 1 ].xq_Q14 = xq_Q14; diff --git a/opus/silk/PLC.c b/opus/silk/PLC.c index f8939165..b35bf750 100644 --- a/opus/silk/PLC.c +++ b/opus/silk/PLC.c @@ -33,6 +33,10 @@ POSSIBILITY OF SUCH DAMAGE. #include "stack_alloc.h" #include "PLC.h" +#ifdef ENABLE_DEEP_PLC +#include "lpcnet.h" +#endif + #define NB_ATT 2 static const opus_int16 HARM_ATT_Q15[NB_ATT] = { 32440, 31130 }; /* 0.99, 0.95 */ static const opus_int16 PLC_RAND_ATTENUATE_V_Q15[NB_ATT] = { 31130, 26214 }; /* 0.95, 0.8 */ @@ -47,6 +51,9 @@ static OPUS_INLINE void silk_PLC_conceal( silk_decoder_state *psDec, /* I/O Decoder state */ silk_decoder_control *psDecCtrl, /* I/O Decoder control */ opus_int16 frame[], /* O LPC residual signal */ +#ifdef ENABLE_DEEP_PLC + LPCNetPLCState *lpcnet, +#endif int arch /* I Run-time architecture */ ); @@ -67,6 +74,9 @@ void silk_PLC( silk_decoder_control *psDecCtrl, /* I/O Decoder control */ opus_int16 frame[], /* I/O signal */ opus_int lost, /* I Loss flag */ +#ifdef ENABLE_DEEP_PLC + LPCNetPLCState *lpcnet, +#endif int arch /* I Run-time architecture */ ) { @@ -80,7 +90,11 @@ void silk_PLC( /****************************/ /* Generate Signal */ /****************************/ - silk_PLC_conceal( psDec, psDecCtrl, frame, arch ); + silk_PLC_conceal( psDec, psDecCtrl, frame, +#ifdef ENABLE_DEEP_PLC + lpcnet, +#endif + arch ); psDec->lossCnt++; } else { @@ -88,6 +102,14 @@ void silk_PLC( /* Update state */ /****************************/ silk_PLC_update( psDec, psDecCtrl ); +#ifdef ENABLE_DEEP_PLC + if ( lpcnet != NULL && psDec->sPLC.fs_kHz == 16 ) { + int k; + for( k = 0; k < psDec->nb_subfr; k += 2 ) { + lpcnet_plc_update( lpcnet, frame + k * psDec->subfr_length ); + } + } +#endif } } @@ -195,6 +217,9 @@ static OPUS_INLINE void silk_PLC_conceal( silk_decoder_state *psDec, /* I/O Decoder state */ silk_decoder_control *psDecCtrl, /* I/O Decoder control */ opus_int16 frame[], /* O LPC residual signal */ +#ifdef ENABLE_DEEP_PLC + LPCNetPLCState *lpcnet, +#endif int arch /* I Run-time architecture */ ) { @@ -328,10 +353,8 @@ static OPUS_INLINE void silk_PLC_conceal( for( j = 0; j < LTP_ORDER; j++ ) { B_Q14[ j ] = silk_RSHIFT( silk_SMULBB( harm_Gain_Q15, B_Q14[ j ] ), 15 ); } - if ( psDec->indices.signalType != TYPE_NO_VOICE_ACTIVITY ) { - /* Gradually reduce excitation gain */ - rand_scale_Q14 = silk_RSHIFT( silk_SMULBB( rand_scale_Q14, rand_Gain_Q15 ), 15 ); - } + /* Gradually reduce excitation gain */ + rand_scale_Q14 = silk_RSHIFT( silk_SMULBB( rand_scale_Q14, rand_Gain_Q15 ), 15 ); /* Slowly increase pitch lag */ psPLC->pitchL_Q8 = silk_SMLAWB( psPLC->pitchL_Q8, psPLC->pitchL_Q8, PITCH_DRIFT_FAC_Q16 ); @@ -373,6 +396,24 @@ static OPUS_INLINE void silk_PLC_conceal( /* Scale with Gain */ frame[ i ] = (opus_int16)silk_SAT16( silk_SAT16( silk_RSHIFT_ROUND( silk_SMULWW( sLPC_Q14_ptr[ MAX_LPC_ORDER + i ], prevGain_Q10[ 1 ] ), 8 ) ) ); } +#ifdef ENABLE_DEEP_PLC + if ( lpcnet != NULL && lpcnet->loaded && psDec->sPLC.fs_kHz == 16 ) { + int run_deep_plc = psDec->sPLC.enable_deep_plc || lpcnet->fec_fill_pos != 0; + if( run_deep_plc ) { + for( k = 0; k < psDec->nb_subfr; k += 2 ) { + lpcnet_plc_conceal( lpcnet, frame + k * psDec->subfr_length ); + } + /* We *should* be able to copy only from psDec->frame_length-MAX_LPC_ORDER, i.e. the last MAX_LPC_ORDER samples. */ + for( i = 0; i < psDec->frame_length; i++ ) { + sLPC_Q14_ptr[ MAX_LPC_ORDER + i ] = (int)floor(.5 + frame[ i ] * (float)(1 << 24) / prevGain_Q10[ 1 ] ); + } + } else { + for( k = 0; k < psDec->nb_subfr; k += 2 ) { + lpcnet_plc_update( lpcnet, frame + k * psDec->subfr_length ); + } + } + } +#endif /* Save LPC state */ silk_memcpy( psDec->sLPC_Q14_buf, &sLPC_Q14_ptr[ psDec->frame_length ], MAX_LPC_ORDER * sizeof( opus_int32 ) ); @@ -433,12 +474,16 @@ void silk_PLC_glue_frames( slope_Q16 = silk_DIV32_16( ( (opus_int32)1 << 16 ) - gain_Q16, length ); /* Make slope 4x steeper to avoid missing onsets after DTX */ slope_Q16 = silk_LSHIFT( slope_Q16, 2 ); - - for( i = 0; i < length; i++ ) { - frame[ i ] = silk_SMULWB( gain_Q16, frame[ i ] ); - gain_Q16 += slope_Q16; - if( gain_Q16 > (opus_int32)1 << 16 ) { - break; +#ifdef ENABLE_DEEP_PLC + if ( psDec->sPLC.fs_kHz != 16 ) +#endif + { + for( i = 0; i < length; i++ ) { + frame[ i ] = silk_SMULWB( gain_Q16, frame[ i ] ); + gain_Q16 += slope_Q16; + if( gain_Q16 > (opus_int32)1 << 16 ) { + break; + } } } } diff --git a/opus/silk/PLC.h b/opus/silk/PLC.h index 6438f516..1bebb786 100644 --- a/opus/silk/PLC.h +++ b/opus/silk/PLC.h @@ -49,6 +49,9 @@ void silk_PLC( silk_decoder_control *psDecCtrl, /* I/O Decoder control */ opus_int16 frame[], /* I/O signal */ opus_int lost, /* I Loss flag */ +#ifdef ENABLE_DEEP_PLC + LPCNetPLCState *lpcnet, +#endif int arch /* I Run-time architecture */ ); diff --git a/opus/silk/SigProc_FIX.h b/opus/silk/SigProc_FIX.h index f9ae3263..fbdfa82e 100644 --- a/opus/silk/SigProc_FIX.h +++ b/opus/silk/SigProc_FIX.h @@ -381,7 +381,7 @@ opus_int32 silk_inner_prod_aligned_scale( const opus_int len /* I vector lengths */ ); -opus_int64 silk_inner_prod16_aligned_64_c( +opus_int64 silk_inner_prod16_c( const opus_int16 *inVec1, /* I input vector 1 */ const opus_int16 *inVec2, /* I input vector 2 */ const opus_int len /* I vector lengths */ @@ -609,12 +609,14 @@ static OPUS_INLINE opus_int64 silk_max_64(opus_int64 a, opus_int64 b) /* the following seems faster on x86 */ #define silk_SMMUL(a32, b32) (opus_int32)silk_RSHIFT64(silk_SMULL((a32), (b32)), 32) -#if !defined(OPUS_X86_MAY_HAVE_SSE4_1) +#if !defined(OVERRIDE_silk_burg_modified) #define silk_burg_modified(res_nrg, res_nrg_Q, A_Q16, x, minInvGain_Q30, subfr_length, nb_subfr, D, arch) \ ((void)(arch), silk_burg_modified_c(res_nrg, res_nrg_Q, A_Q16, x, minInvGain_Q30, subfr_length, nb_subfr, D, arch)) +#endif -#define silk_inner_prod16_aligned_64(inVec1, inVec2, len, arch) \ - ((void)(arch),silk_inner_prod16_aligned_64_c(inVec1, inVec2, len)) +#if !defined(OVERRIDE_silk_inner_prod16) +#define silk_inner_prod16(inVec1, inVec2, len, arch) \ + ((void)(arch),silk_inner_prod16_c(inVec1, inVec2, len)) #endif #include "Inlines.h" diff --git a/opus/silk/VQ_WMat_EC.c b/opus/silk/VQ_WMat_EC.c index 0f3d545c..245a7e4b 100644 --- a/opus/silk/VQ_WMat_EC.c +++ b/opus/silk/VQ_WMat_EC.c @@ -64,7 +64,7 @@ void silk_VQ_WMat_EC_c( *rate_dist_Q8 = silk_int32_MAX; *res_nrg_Q15 = silk_int32_MAX; cb_row_Q7 = cb_Q7; - /* In things go really bad, at least *ind is set to something safe. */ + /* If things go really bad, at least *ind is set to something safe. */ *ind = 0; for( k = 0; k < L; k++ ) { opus_int32 penalty; @@ -115,7 +115,7 @@ void silk_VQ_WMat_EC_c( if( sum1_Q15 >= 0 ) { /* Translate residual energy to bits using high-rate assumption (6 dB ==> 1 bit/sample) */ bits_res_Q8 = silk_SMULBB( subfr_len, silk_lin2log( sum1_Q15 + penalty) - (15 << 7) ); - /* In the following line we reduce the codelength component by half ("-1"); seems to slghtly improve quality */ + /* In the following line we reduce the codelength component by half ("-1"); seems to slightly improve quality */ bits_tot_Q8 = silk_ADD_LSHIFT32( bits_res_Q8, cl_Q5[ k ], 3-1 ); if( bits_tot_Q8 <= *rate_dist_Q8 ) { *rate_dist_Q8 = bits_tot_Q8; diff --git a/opus/silk/arm/LPC_inv_pred_gain_neon_intr.c b/opus/silk/arm/LPC_inv_pred_gain_neon_intr.c index ab426bcd..726e6667 100644 --- a/opus/silk/arm/LPC_inv_pred_gain_neon_intr.c +++ b/opus/silk/arm/LPC_inv_pred_gain_neon_intr.c @@ -210,19 +210,23 @@ opus_int32 silk_LPC_inverse_pred_gain_neon( /* O Returns inverse predi /* Increase Q domain of the AR coefficients */ t0_s16x8 = vld1q_s16( A_Q12 + 0 ); t1_s16x8 = vld1q_s16( A_Q12 + 8 ); - t2_s16x8 = vld1q_s16( A_Q12 + 16 ); + if ( order > 16 ) { + t2_s16x8 = vld1q_s16( A_Q12 + 16 ); + } t0_s32x4 = vpaddlq_s16( t0_s16x8 ); switch( order - leftover ) { case 24: t0_s32x4 = vpadalq_s16( t0_s32x4, t2_s16x8 ); + vst1q_s32( Atmp_QA + 16, vshll_n_s16( vget_low_s16 ( t2_s16x8 ), QA - 12 ) ); + vst1q_s32( Atmp_QA + 20, vshll_n_s16( vget_high_s16( t2_s16x8 ), QA - 12 ) ); /* FALLTHROUGH */ case 16: t0_s32x4 = vpadalq_s16( t0_s32x4, t1_s16x8 ); - vst1q_s32( Atmp_QA + 16, vshll_n_s16( vget_low_s16 ( t2_s16x8 ), QA - 12 ) ); - vst1q_s32( Atmp_QA + 20, vshll_n_s16( vget_high_s16( t2_s16x8 ), QA - 12 ) ); + vst1q_s32( Atmp_QA + 8, vshll_n_s16( vget_low_s16 ( t1_s16x8 ), QA - 12 ) ); + vst1q_s32( Atmp_QA + 12, vshll_n_s16( vget_high_s16( t1_s16x8 ), QA - 12 ) ); /* FALLTHROUGH */ case 8: @@ -230,8 +234,8 @@ opus_int32 silk_LPC_inverse_pred_gain_neon( /* O Returns inverse predi const int32x2_t t_s32x2 = vpadd_s32( vget_low_s32( t0_s32x4 ), vget_high_s32( t0_s32x4 ) ); const int64x1_t t_s64x1 = vpaddl_s32( t_s32x2 ); DC_resp = vget_lane_s32( vreinterpret_s32_s64( t_s64x1 ), 0 ); - vst1q_s32( Atmp_QA + 8, vshll_n_s16( vget_low_s16 ( t1_s16x8 ), QA - 12 ) ); - vst1q_s32( Atmp_QA + 12, vshll_n_s16( vget_high_s16( t1_s16x8 ), QA - 12 ) ); + vst1q_s32( Atmp_QA + 0, vshll_n_s16( vget_low_s16 ( t0_s16x8 ), QA - 12 ) ); + vst1q_s32( Atmp_QA + 4, vshll_n_s16( vget_high_s16( t0_s16x8 ), QA - 12 ) ); } break; @@ -246,16 +250,22 @@ opus_int32 silk_LPC_inverse_pred_gain_neon( /* O Returns inverse predi case 6: DC_resp += (opus_int32)A_Q12[ 5 ]; DC_resp += (opus_int32)A_Q12[ 4 ]; + Atmp_QA[ order - leftover + 5 ] = silk_LSHIFT32( (opus_int32)A_Q12[ 5 ], QA - 12 ); + Atmp_QA[ order - leftover + 4 ] = silk_LSHIFT32( (opus_int32)A_Q12[ 4 ], QA - 12 ); /* FALLTHROUGH */ case 4: DC_resp += (opus_int32)A_Q12[ 3 ]; DC_resp += (opus_int32)A_Q12[ 2 ]; + Atmp_QA[ order - leftover + 3 ] = silk_LSHIFT32( (opus_int32)A_Q12[ 3 ], QA - 12 ); + Atmp_QA[ order - leftover + 2 ] = silk_LSHIFT32( (opus_int32)A_Q12[ 2 ], QA - 12 ); /* FALLTHROUGH */ case 2: DC_resp += (opus_int32)A_Q12[ 1 ]; DC_resp += (opus_int32)A_Q12[ 0 ]; + Atmp_QA[ order - leftover + 1 ] = silk_LSHIFT32( (opus_int32)A_Q12[ 1 ], QA - 12 ); + Atmp_QA[ order - leftover + 0 ] = silk_LSHIFT32( (opus_int32)A_Q12[ 0 ], QA - 12 ); /* FALLTHROUGH */ default: @@ -266,8 +276,6 @@ opus_int32 silk_LPC_inverse_pred_gain_neon( /* O Returns inverse predi if( DC_resp >= 4096 ) { invGain_Q30 = 0; } else { - vst1q_s32( Atmp_QA + 0, vshll_n_s16( vget_low_s16 ( t0_s16x8 ), QA - 12 ) ); - vst1q_s32( Atmp_QA + 4, vshll_n_s16( vget_high_s16( t0_s16x8 ), QA - 12 ) ); invGain_Q30 = LPC_inverse_pred_gain_QA_neon( Atmp_QA, order ); } } diff --git a/opus/silk/arm/NSQ_del_dec_arm.h b/opus/silk/arm/NSQ_del_dec_arm.h index 9e76e169..0c4fcfcc 100644 --- a/opus/silk/arm/NSQ_del_dec_arm.h +++ b/opus/silk/arm/NSQ_del_dec_arm.h @@ -34,7 +34,7 @@ POSSIBILITY OF SUCH DAMAGE. void silk_NSQ_del_dec_neon( const silk_encoder_state *psEncC, silk_nsq_state *NSQ, SideInfoIndices *psIndices, const opus_int16 x16[], opus_int8 pulses[], - const opus_int16 PredCoef_Q12[2 * MAX_LPC_ORDER], + const opus_int16 *PredCoef_Q12, const opus_int16 LTPCoef_Q14[LTP_ORDER * MAX_NB_SUBFR], const opus_int16 AR_Q13[MAX_NB_SUBFR * MAX_SHAPE_LPC_ORDER], const opus_int HarmShapeGain_Q14[MAX_NB_SUBFR], @@ -65,7 +65,7 @@ void silk_NSQ_del_dec_neon( extern void (*const SILK_NSQ_DEL_DEC_IMPL[OPUS_ARCHMASK + 1])( const silk_encoder_state *psEncC, silk_nsq_state *NSQ, SideInfoIndices *psIndices, const opus_int16 x16[], opus_int8 pulses[], - const opus_int16 PredCoef_Q12[2 * MAX_LPC_ORDER], + const opus_int16 *PredCoef_Q12, const opus_int16 LTPCoef_Q14[LTP_ORDER * MAX_NB_SUBFR], const opus_int16 AR_Q13[MAX_NB_SUBFR * MAX_SHAPE_LPC_ORDER], const opus_int HarmShapeGain_Q14[MAX_NB_SUBFR], diff --git a/opus/silk/arm/NSQ_del_dec_neon_intr.c b/opus/silk/arm/NSQ_del_dec_neon_intr.c index 212410f3..668dde6d 100644 --- a/opus/silk/arm/NSQ_del_dec_neon_intr.c +++ b/opus/silk/arm/NSQ_del_dec_neon_intr.c @@ -35,6 +35,7 @@ POSSIBILITY OF SUCH DAMAGE. #endif #include "main.h" #include "stack_alloc.h" +#include "os_support.h" /* NEON intrinsics optimization now can only parallelize up to 4 delay decision states. */ /* If there are more states, C function is called, and this optimization must be expanded. */ @@ -220,7 +221,7 @@ void silk_NSQ_del_dec_neon( SideInfoIndices *psIndices, /* I/O Quantization Indices */ const opus_int16 x16[], /* I Input */ opus_int8 pulses[], /* O Quantized pulse signal */ - const opus_int16 PredCoef_Q12[ 2 * MAX_LPC_ORDER ], /* I Short term prediction coefs */ + const opus_int16 *PredCoef_Q12, /* I Short term prediction coefs */ const opus_int16 LTPCoef_Q14[ LTP_ORDER * MAX_NB_SUBFR ], /* I Long term prediction coefs */ const opus_int16 AR_Q13[ MAX_NB_SUBFR * MAX_SHAPE_LPC_ORDER ], /* I Noise shaping coefs */ const opus_int HarmShapeGain_Q14[ MAX_NB_SUBFR ], /* I Long term shaping coefs */ @@ -279,6 +280,7 @@ void silk_NSQ_del_dec_neon( /* Initialize delayed decision states */ ALLOC( psDelDec, 1, NSQ_del_decs_struct ); + OPUS_CLEAR(psDelDec, 1); /* Only RandState and RD_Q10 need to be initialized to 0. */ silk_memset( psDelDec->RandState, 0, sizeof( psDelDec->RandState ) ); vst1q_s32( psDelDec->RD_Q10, vdupq_n_s32( 0 ) ); @@ -587,6 +589,7 @@ static OPUS_INLINE void silk_noise_shape_quantizer_del_dec_neon( silk_assert( nStatesDelayedDecision > 0 ); silk_assert( ( shapingLPCOrder & 1 ) == 0 ); /* check that order is even */ ALLOC( psSampleState, 2, NSQ_samples_struct ); + OPUS_CLEAR(psSampleState, 2); shp_lag_ptr = &NSQ->sLTP_shp_Q14[ NSQ->sLTP_shp_buf_idx - lag + HARM_SHAPE_FIR_TAPS / 2 ]; pred_lag_ptr = &sLTP_Q15[ NSQ->sLTP_buf_idx - lag + LTP_ORDER / 2 ]; @@ -711,23 +714,26 @@ static OPUS_INLINE void silk_noise_shape_quantizer_del_dec_neon( const int rdo_offset = Lambda_Q10/2 - 512; const uint16x4_t greaterThanRdo = vcgt_s16( q1_Q10_s16x4, vdup_n_s16( rdo_offset ) ); const uint16x4_t lessThanMinusRdo = vclt_s16( q1_Q10_s16x4, vdup_n_s16( -rdo_offset ) ); + int16x4_t signed_offset = vbsl_s16( greaterThanRdo, vdup_n_s16( -rdo_offset ), vdup_n_s16( 0 ) ); + signed_offset = vbsl_s16( lessThanMinusRdo, vdup_n_s16( rdo_offset ), signed_offset ); /* If Lambda_Q10 > 32767, then q1_Q0, q1_Q10 and q2_Q10 must change to 32-bit. */ silk_assert( Lambda_Q10 <= 32767 ); q1_Q0_s16x4 = vreinterpret_s16_u16( vclt_s16( q1_Q10_s16x4, vdup_n_s16( 0 ) ) ); - q1_Q0_s16x4 = vbsl_s16( greaterThanRdo, vsub_s16( q1_Q10_s16x4, vdup_n_s16( rdo_offset ) ), q1_Q0_s16x4 ); - q1_Q0_s16x4 = vbsl_s16( lessThanMinusRdo, vadd_s16( q1_Q10_s16x4, vdup_n_s16( rdo_offset ) ), q1_Q0_s16x4 ); + q1_Q0_s16x4 = vbsl_s16(vorr_u16(greaterThanRdo, lessThanMinusRdo), vadd_s16( q1_Q10_s16x4 , signed_offset), q1_Q0_s16x4); q1_Q0_s16x4 = vshr_n_s16( q1_Q0_s16x4, 10 ); } { const uint16x4_t equal0_u16x4 = vceq_s16( q1_Q0_s16x4, vdup_n_s16( 0 ) ); const uint16x4_t equalMinus1_u16x4 = vceq_s16( q1_Q0_s16x4, vdup_n_s16( -1 ) ); const uint16x4_t lessThanMinus1_u16x4 = vclt_s16( q1_Q0_s16x4, vdup_n_s16( -1 ) ); - int16x4_t tmp1_s16x4, tmp2_s16x4; + int16x4_t tmp1_s16x4, tmp2_s16x4, tmp_summand_s16x4; q1_Q10_s16x4 = vshl_n_s16( q1_Q0_s16x4, 10 ); - tmp1_s16x4 = vadd_s16( q1_Q10_s16x4, vdup_n_s16( offset_Q10 - QUANT_LEVEL_ADJUST_Q10 ) ); - q1_Q10_s16x4 = vadd_s16( q1_Q10_s16x4, vdup_n_s16( offset_Q10 + QUANT_LEVEL_ADJUST_Q10 ) ); + tmp_summand_s16x4 = vand_s16( vreinterpret_s16_u16(vcge_s16(q1_Q0_s16x4, vdup_n_s16(0))), vdup_n_s16( offset_Q10 - QUANT_LEVEL_ADJUST_Q10 ) ); + tmp1_s16x4 = vadd_s16( q1_Q10_s16x4, tmp_summand_s16x4 ); + tmp_summand_s16x4 = vbsl_s16( lessThanMinus1_u16x4, vdup_n_s16( offset_Q10 + QUANT_LEVEL_ADJUST_Q10 ), vdup_n_s16(0) ); + q1_Q10_s16x4 = vadd_s16( q1_Q10_s16x4, tmp_summand_s16x4); q1_Q10_s16x4 = vbsl_s16( lessThanMinus1_u16x4, q1_Q10_s16x4, tmp1_s16x4 ); q1_Q10_s16x4 = vbsl_s16( equal0_u16x4, vdup_n_s16( offset_Q10 ), q1_Q10_s16x4 ); q1_Q10_s16x4 = vbsl_s16( equalMinus1_u16x4, vdup_n_s16( offset_Q10 - ( 1024 - QUANT_LEVEL_ADJUST_Q10 ) ), q1_Q10_s16x4 ); @@ -818,6 +824,13 @@ static OPUS_INLINE void silk_noise_shape_quantizer_del_dec_neon( } } + /* clear unused part of RD_Q10 to avoid overflows */ + if( nStatesDelayedDecision < NEON_MAX_DEL_DEC_STATES ) + { + OPUS_CLEAR(psSampleState[0].RD_Q10 + nStatesDelayedDecision, NEON_MAX_DEL_DEC_STATES - nStatesDelayedDecision); + OPUS_CLEAR(psSampleState[1].RD_Q10 + nStatesDelayedDecision, NEON_MAX_DEL_DEC_STATES - nStatesDelayedDecision); + } + /* Increase RD values of expired states */ { uint32x4_t t_u32x4; @@ -896,7 +909,8 @@ static OPUS_INLINE void silk_noise_shape_quantizer_del_dec_neon( vst1q_s32( psDelDec->Pred_Q15[ *smpl_buf_idx ], vshlq_n_s32( vld1q_s32( psSampleState[ 0 ].LPC_exc_Q14 ), 1 ) ); vst1q_s32( psDelDec->Shape_Q14[ *smpl_buf_idx ], vld1q_s32( psSampleState[ 0 ].sLTP_shp_Q14 ) ); tmp1_s32x4 = vrshrq_n_s32( tmp1_s32x4, 10 ); - tmp1_s32x4 = vaddq_s32( vld1q_s32( psDelDec->Seed ), tmp1_s32x4 ); + tmp1_s32x4 = vreinterpretq_s32_u32( vaddq_u32( vreinterpretq_u32_s32( + vld1q_s32( psDelDec->Seed ) ), vreinterpretq_u32_s32( tmp1_s32x4 ) ) ); vst1q_s32( psDelDec->Seed, tmp1_s32x4 ); vst1q_s32( psDelDec->RandState[ *smpl_buf_idx ], tmp1_s32x4 ); vst1q_s32( psDelDec->RD_Q10, vld1q_s32( psSampleState[ 0 ].RD_Q10 ) ); diff --git a/opus/silk/arm/NSQ_neon.h b/opus/silk/arm/NSQ_neon.h index b31d9442..f03d8ddd 100644 --- a/opus/silk/arm/NSQ_neon.h +++ b/opus/silk/arm/NSQ_neon.h @@ -73,7 +73,7 @@ static OPUS_INLINE void silk_short_prediction_create_arch_coef_neon(opus_int32 * #elif defined(OPUS_HAVE_RTCD) && defined(OPUS_ARM_MAY_HAVE_NEON_INTR) #define silk_short_prediction_create_arch_coef(out, in, order) \ - do { if (arch == OPUS_ARCH_ARM_NEON) { silk_short_prediction_create_arch_coef_neon(out, in, order); } } while (0) + do { if (arch >= OPUS_ARCH_ARM_NEON) { silk_short_prediction_create_arch_coef_neon(out, in, order); } } while (0) #endif @@ -95,7 +95,7 @@ opus_int32 silk_NSQ_noise_shape_feedback_loop_neon(const opus_int32 *data0, opus (coef vs. coefRev) so can't use the usual IMPL table implementation */ #undef silk_noise_shape_quantizer_short_prediction #define silk_noise_shape_quantizer_short_prediction(in, coef, coefRev, order, arch) \ - (arch == OPUS_ARCH_ARM_NEON ? \ + (arch >= OPUS_ARCH_ARM_NEON ? \ silk_noise_shape_quantizer_short_prediction_neon(in, coefRev, order) : \ silk_noise_shape_quantizer_short_prediction_c(in, coef, order)) diff --git a/opus/silk/arm/arm_silk_map.c b/opus/silk/arm/arm_silk_map.c index 0b9bfec2..a91f79b5 100644 --- a/opus/silk/arm/arm_silk_map.c +++ b/opus/silk/arm/arm_silk_map.c @@ -49,6 +49,7 @@ void (*const SILK_BIQUAD_ALT_STRIDE2_IMPL[OPUS_ARCHMASK + 1])( silk_biquad_alt_stride2_c, /* EDSP */ silk_biquad_alt_stride2_c, /* Media */ silk_biquad_alt_stride2_neon, /* Neon */ + silk_biquad_alt_stride2_neon, /* dotprod */ }; opus_int32 (*const SILK_LPC_INVERSE_PRED_GAIN_IMPL[OPUS_ARCHMASK + 1])( /* O Returns inverse prediction gain in energy domain, Q30 */ @@ -59,6 +60,7 @@ opus_int32 (*const SILK_LPC_INVERSE_PRED_GAIN_IMPL[OPUS_ARCHMASK + 1])( /* O R silk_LPC_inverse_pred_gain_c, /* EDSP */ silk_LPC_inverse_pred_gain_c, /* Media */ silk_LPC_inverse_pred_gain_neon, /* Neon */ + silk_LPC_inverse_pred_gain_neon, /* dotprod */ }; void (*const SILK_NSQ_DEL_DEC_IMPL[OPUS_ARCHMASK + 1])( @@ -67,7 +69,7 @@ void (*const SILK_NSQ_DEL_DEC_IMPL[OPUS_ARCHMASK + 1])( SideInfoIndices *psIndices, /* I/O Quantization Indices */ const opus_int16 x16[], /* I Input */ opus_int8 pulses[], /* O Quantized pulse signal */ - const opus_int16 PredCoef_Q12[ 2 * MAX_LPC_ORDER ], /* I Short term prediction coefs */ + const opus_int16 *PredCoef_Q12, /* I Short term prediction coefs */ const opus_int16 LTPCoef_Q14[ LTP_ORDER * MAX_NB_SUBFR ], /* I Long term prediction coefs */ const opus_int16 AR_Q13[ MAX_NB_SUBFR * MAX_SHAPE_LPC_ORDER ], /* I Noise shaping coefs */ const opus_int HarmShapeGain_Q14[ MAX_NB_SUBFR ], /* I Long term shaping coefs */ @@ -82,6 +84,7 @@ void (*const SILK_NSQ_DEL_DEC_IMPL[OPUS_ARCHMASK + 1])( silk_NSQ_del_dec_c, /* EDSP */ silk_NSQ_del_dec_c, /* Media */ silk_NSQ_del_dec_neon, /* Neon */ + silk_NSQ_del_dec_neon, /* dotprod */ }; /*There is no table for silk_noise_shape_quantizer_short_prediction because the @@ -97,6 +100,7 @@ opus_int32 silk_NSQ_noise_shape_feedback_loop_c, /* EDSP */ silk_NSQ_noise_shape_feedback_loop_c, /* Media */ silk_NSQ_noise_shape_feedback_loop_neon, /* NEON */ + silk_NSQ_noise_shape_feedback_loop_neon, /* dotprod */ }; # endif @@ -116,6 +120,7 @@ void (*const SILK_WARPED_AUTOCORRELATION_FIX_IMPL[OPUS_ARCHMASK + 1])( silk_warped_autocorrelation_FIX_c, /* EDSP */ silk_warped_autocorrelation_FIX_c, /* Media */ silk_warped_autocorrelation_FIX_neon, /* Neon */ + silk_warped_autocorrelation_FIX_neon, /* dotprod */ }; # endif diff --git a/opus/silk/bwexpander_32.c b/opus/silk/bwexpander_32.c index d0010f73..0f32b9df 100644 --- a/opus/silk/bwexpander_32.c +++ b/opus/silk/bwexpander_32.c @@ -31,7 +31,8 @@ POSSIBILITY OF SUCH DAMAGE. #include "SigProc_FIX.h" -/* Chirp (bandwidth expand) LP AR filter */ +/* Chirp (bandwidth expand) LP AR filter. + This logic is reused in _celt_lpc(). Any bug fixes should also be applied there. */ void silk_bwexpander_32( opus_int32 *ar, /* I/O AR filter to be expanded (without leading 1) */ const opus_int d, /* I Length of ar */ diff --git a/opus/silk/control.h b/opus/silk/control.h index b76ec33c..f5633e62 100644 --- a/opus/silk/control.h +++ b/opus/silk/control.h @@ -77,6 +77,9 @@ typedef struct { /* I: Flag to enable in-band Forward Error Correction (FEC); 0/1 */ opus_int useInBandFEC; + /* I: Flag to enable in-band Deep REDundancy (DRED); 0/1 */ + opus_int useDRED; + /* I: Flag to actually code in-band Forward Error Correction (FEC) in the current packet; 0/1 */ opus_int LBRR_coded; @@ -141,6 +144,14 @@ typedef struct { /* O: Pitch lag of previous frame (0 if unvoiced), measured in samples at 48 kHz */ opus_int prevPitchLag; + + /* I: Enable Deep PLC */ + opus_int enable_deep_plc; + +#ifdef ENABLE_OSCE + /* I: OSCE method */ + opus_int osce_method; +#endif } silk_DecControlStruct; #ifdef __cplusplus diff --git a/opus/silk/control_codec.c b/opus/silk/control_codec.c index 52aa8fde..784ffe66 100644 --- a/opus/silk/control_codec.c +++ b/opus/silk/control_codec.c @@ -415,7 +415,7 @@ static OPUS_INLINE opus_int silk_setup_LBRR( /* Previous packet did not have LBRR, and was therefore coded at a higher bitrate */ psEncC->LBRR_GainIncreases = 7; } else { - psEncC->LBRR_GainIncreases = silk_max_int( 7 - silk_SMULWB( (opus_int32)psEncC->PacketLoss_perc, SILK_FIX_CONST( 0.4, 16 ) ), 2 ); + psEncC->LBRR_GainIncreases = silk_max_int( 7 - silk_SMULWB( (opus_int32)psEncC->PacketLoss_perc, SILK_FIX_CONST( 0.2, 16 ) ), 3 ); } } diff --git a/opus/silk/debug.c b/opus/silk/debug.c index 9253faf7..46a24a47 100644 --- a/opus/silk/debug.c +++ b/opus/silk/debug.c @@ -29,19 +29,23 @@ POSSIBILITY OF SUCH DAMAGE. #include "config.h" #endif +typedef int prevent_empty_translation_unit_warning; + #include "debug.h" + +#if SILK_DEBUG || SILK_TIC_TOC #include "SigProc_FIX.h" +#endif #if SILK_TIC_TOC -#ifdef _WIN32 - #if (defined(_WIN32) || defined(_WINCE)) #include /* timer */ #else /* Linux or Mac*/ #include #endif +#ifdef _WIN32 unsigned long silk_GetHighResolutionTime(void) /* O time in usec*/ { /* Returns a time counter in microsec */ @@ -65,7 +69,7 @@ unsigned long GetHighResolutionTime(void) /* O time in usec*/ int silk_Timer_nTimers = 0; int silk_Timer_depth_ctr = 0; char silk_Timer_tags[silk_NUM_TIMERS_MAX][silk_NUM_TIMERS_MAX_TAG_LEN]; -#ifdef WIN32 +#ifdef _WIN32 LARGE_INTEGER silk_Timer_start[silk_NUM_TIMERS_MAX]; #else unsigned long silk_Timer_start[silk_NUM_TIMERS_MAX]; @@ -76,7 +80,7 @@ opus_int64 silk_Timer_sum[silk_NUM_TIMERS_MAX]; opus_int64 silk_Timer_max[silk_NUM_TIMERS_MAX]; opus_int64 silk_Timer_depth[silk_NUM_TIMERS_MAX]; -#ifdef WIN32 +#ifdef _WIN32 void silk_TimerSave(char *file_name) { if( silk_Timer_nTimers > 0 ) diff --git a/opus/silk/debug.h b/opus/silk/debug.h index 6f68c1ca..36163e47 100644 --- a/opus/silk/debug.h +++ b/opus/silk/debug.h @@ -28,28 +28,29 @@ POSSIBILITY OF SUCH DAMAGE. #ifndef SILK_DEBUG_H #define SILK_DEBUG_H -#include "typedef.h" -#include /* file writing */ -#include /* strcpy, strcmp */ - -#ifdef __cplusplus -extern "C" -{ -#endif - -unsigned long GetHighResolutionTime(void); /* O time in usec*/ - /* Set to 1 to enable DEBUG_STORE_DATA() macros for dumping * intermediate signals from the codec. */ #define SILK_DEBUG 0 /* Flag for using timers */ -#define SILK_TIC_TOC 0 +#define SILK_TIC_TOC 0 +#if SILK_DEBUG || SILK_TIC_TOC +#include "typedef.h" +#include /* strcpy, strcmp */ +#include /* file writing */ +#endif + +#ifdef __cplusplus +extern "C" +{ +#endif #if SILK_TIC_TOC +unsigned long GetHighResolutionTime(void); /* O time in usec*/ + #if (defined(_WIN32) || defined(_WINCE)) #include /* timer */ #else /* Linux or Mac*/ diff --git a/opus/silk/dec_API.c b/opus/silk/dec_API.c index 7d5ca7fb..c1091d13 100644 --- a/opus/silk/dec_API.c +++ b/opus/silk/dec_API.c @@ -33,6 +33,11 @@ POSSIBILITY OF SUCH DAMAGE. #include "stack_alloc.h" #include "os_support.h" +#ifdef ENABLE_OSCE +#include "osce.h" +#include "osce_structs.h" +#endif + /************************/ /* Decoder Super Struct */ /************************/ @@ -42,12 +47,33 @@ typedef struct { opus_int nChannelsAPI; opus_int nChannelsInternal; opus_int prev_decode_only_middle; +#ifdef ENABLE_OSCE + OSCEModel osce_model; +#endif } silk_decoder; /*********************/ /* Decoder functions */ /*********************/ + + +opus_int silk_LoadOSCEModels(void *decState, const unsigned char *data, int len) +{ +#ifdef ENABLE_OSCE + opus_int ret = SILK_NO_ERROR; + + ret = osce_load_models(&((silk_decoder *)decState)->osce_model, data, len); + ((silk_decoder *)decState)->osce_model.loaded = (ret == 0); + return ret; +#else + (void) decState; + (void) data; + (void) len; + return SILK_NO_ERROR; +#endif +} + opus_int silk_Get_Decoder_Size( /* O Returns error code */ opus_int *decSizeBytes /* O Number of bytes in SILK decoder state */ ) @@ -60,12 +86,37 @@ opus_int silk_Get_Decoder_Size( /* O Returns error co } /* Reset decoder state */ +opus_int silk_ResetDecoder( /* O Returns error code */ + void *decState /* I/O State */ +) +{ + opus_int n, ret = SILK_NO_ERROR; + silk_decoder_state *channel_state = ((silk_decoder *)decState)->channel_state; + + for( n = 0; n < DECODER_NUM_CHANNELS; n++ ) { + ret = silk_reset_decoder( &channel_state[ n ] ); + } + silk_memset(&((silk_decoder *)decState)->sStereo, 0, sizeof(((silk_decoder *)decState)->sStereo)); + /* Not strictly needed, but it's cleaner that way */ + ((silk_decoder *)decState)->prev_decode_only_middle = 0; + + return ret; +} + + opus_int silk_InitDecoder( /* O Returns error code */ void *decState /* I/O State */ ) { opus_int n, ret = SILK_NO_ERROR; silk_decoder_state *channel_state = ((silk_decoder *)decState)->channel_state; +#ifdef ENABLE_OSCE + ((silk_decoder *)decState)->osce_model.loaded = 0; +#endif +#ifndef USE_WEIGHTS_FILE + /* load osce models */ + silk_LoadOSCEModels(decState, NULL, 0); +#endif for( n = 0; n < DECODER_NUM_CHANNELS; n++ ) { ret = silk_init_decoder( &channel_state[ n ] ); @@ -86,6 +137,9 @@ opus_int silk_Decode( /* O Returns error co ec_dec *psRangeDec, /* I/O Compressor data structure */ opus_int16 *samplesOut, /* O Decoded output speech vector */ opus_int32 *nSamplesOut, /* O Number of samples decoded */ +#ifdef ENABLE_DEEP_PLC + LPCNetPLCState *lpcnet, +#endif int arch /* I Run-time architecture */ ) { @@ -278,6 +332,7 @@ opus_int silk_Decode( /* O Returns error co has_side = !psDec->prev_decode_only_middle || (decControl->nChannelsInternal == 2 && lostFlag == FLAG_DECODE_LBRR && channel_state[1].LBRR_flags[ channel_state[1].nFramesDecoded ] == 1 ); } + channel_state[ 0 ].sPLC.enable_deep_plc = decControl->enable_deep_plc; /* Call decoder for one frame */ for( n = 0; n < decControl->nChannelsInternal; n++ ) { if( n == 0 || has_side ) { @@ -297,7 +352,19 @@ opus_int silk_Decode( /* O Returns error co } else { condCoding = CODE_CONDITIONALLY; } - ret += silk_decode_frame( &channel_state[ n ], psRangeDec, &samplesOut1_tmp[ n ][ 2 ], &nSamplesOutDec, lostFlag, condCoding, arch); +#ifdef ENABLE_OSCE + if ( channel_state[n].osce.method != decControl->osce_method ) { + osce_reset( &channel_state[n].osce, decControl->osce_method ); + } +#endif + ret += silk_decode_frame( &channel_state[ n ], psRangeDec, &samplesOut1_tmp[ n ][ 2 ], &nSamplesOutDec, lostFlag, condCoding, +#ifdef ENABLE_DEEP_PLC + n == 0 ? lpcnet : NULL, +#endif +#ifdef ENABLE_OSCE + &psDec->osce_model, +#endif + arch); } else { silk_memset( &samplesOut1_tmp[ n ][ 2 ], 0, nSamplesOutDec * sizeof( opus_int16 ) ); } diff --git a/opus/silk/decode_frame.c b/opus/silk/decode_frame.c index e73825b2..9bc4ca2b 100644 --- a/opus/silk/decode_frame.c +++ b/opus/silk/decode_frame.c @@ -33,6 +33,10 @@ POSSIBILITY OF SUCH DAMAGE. #include "stack_alloc.h" #include "PLC.h" +#ifdef ENABLE_OSCE +#include "osce.h" +#endif + /****************/ /* Decode frame */ /****************/ @@ -43,6 +47,12 @@ opus_int silk_decode_frame( opus_int32 *pN, /* O Pointer to size of output frame */ opus_int lostFlag, /* I 0: no loss, 1 loss, 2 decode fec */ opus_int condCoding, /* I The type of conditional coding to use */ +#ifdef ENABLE_DEEP_PLC + LPCNetPLCState *lpcnet, +#endif +#ifdef ENABLE_OSCE + OSCEModel *osce_model, +#endif int arch /* I Run-time architecture */ ) { @@ -61,6 +71,10 @@ opus_int silk_decode_frame( ( lostFlag == FLAG_DECODE_LBRR && psDec->LBRR_flags[ psDec->nFramesDecoded ] == 1 ) ) { VARDECL( opus_int16, pulses ); +#ifdef ENABLE_OSCE + opus_int32 ec_start; + ec_start = ec_tell(psRangeDec); +#endif ALLOC( pulses, (L + SHELL_CODEC_FRAME_LENGTH - 1) & ~(SHELL_CODEC_FRAME_LENGTH - 1), opus_int16 ); /*********************************************/ @@ -84,10 +98,29 @@ opus_int silk_decode_frame( /********************************************************/ silk_decode_core( psDec, psDecCtrl, pOut, pulses, arch ); + /*************************/ + /* Update output buffer. */ + /*************************/ + celt_assert( psDec->ltp_mem_length >= psDec->frame_length ); + mv_len = psDec->ltp_mem_length - psDec->frame_length; + silk_memmove( psDec->outBuf, &psDec->outBuf[ psDec->frame_length ], mv_len * sizeof(opus_int16) ); + silk_memcpy( &psDec->outBuf[ mv_len ], pOut, psDec->frame_length * sizeof( opus_int16 ) ); + +#ifdef ENABLE_OSCE + /********************************************************/ + /* Run SILK enhancer */ + /********************************************************/ + osce_enhance_frame( osce_model, psDec, psDecCtrl, pOut, ec_tell(psRangeDec) - ec_start, arch ); +#endif + /********************************************************/ /* Update PLC state */ /********************************************************/ - silk_PLC( psDec, psDecCtrl, pOut, 0, arch ); + silk_PLC( psDec, psDecCtrl, pOut, 0, +#ifdef ENABLE_DEEP_PLC + lpcnet, +#endif + arch ); psDec->lossCnt = 0; psDec->prevSignalType = psDec->indices.signalType; @@ -97,17 +130,23 @@ opus_int silk_decode_frame( psDec->first_frame_after_reset = 0; } else { /* Handle packet loss by extrapolation */ - psDec->indices.signalType = psDec->prevSignalType; - silk_PLC( psDec, psDecCtrl, pOut, 1, arch ); - } + silk_PLC( psDec, psDecCtrl, pOut, 1, +#ifdef ENABLE_DEEP_PLC + lpcnet, +#endif + arch ); - /*************************/ - /* Update output buffer. */ - /*************************/ - celt_assert( psDec->ltp_mem_length >= psDec->frame_length ); - mv_len = psDec->ltp_mem_length - psDec->frame_length; - silk_memmove( psDec->outBuf, &psDec->outBuf[ psDec->frame_length ], mv_len * sizeof(opus_int16) ); - silk_memcpy( &psDec->outBuf[ mv_len ], pOut, psDec->frame_length * sizeof( opus_int16 ) ); +#ifdef ENABLE_OSCE + osce_reset( &psDec->osce, psDec->osce.method ); +#endif + /*************************/ + /* Update output buffer. */ + /*************************/ + celt_assert( psDec->ltp_mem_length >= psDec->frame_length ); + mv_len = psDec->ltp_mem_length - psDec->frame_length; + silk_memmove( psDec->outBuf, &psDec->outBuf[ psDec->frame_length ], mv_len * sizeof(opus_int16) ); + silk_memcpy( &psDec->outBuf[ mv_len ], pOut, psDec->frame_length * sizeof( opus_int16 ) ); + } /************************************************/ /* Comfort noise generation / estimation */ diff --git a/opus/silk/define.h b/opus/silk/define.h index 247cb0bf..491c86f3 100644 --- a/opus/silk/define.h +++ b/opus/silk/define.h @@ -225,6 +225,7 @@ extern "C" /* Defines for CN generation */ #define CNG_BUF_MASK_MAX 255 /* 2^floor(log2(MAX_FRAME_LENGTH))-1 */ #define CNG_GAIN_SMTH_Q16 4634 /* 0.25^(1/4) */ +#define CNG_GAIN_SMTH_THRESHOLD_Q16 46396 /* -3 dB */ #define CNG_NLSF_SMTH_Q16 16348 /* 0.25 */ #ifdef __cplusplus diff --git a/opus/silk/enc_API.c b/opus/silk/enc_API.c index 55a33f37..369caddd 100644 --- a/opus/silk/enc_API.c +++ b/opus/silk/enc_API.c @@ -41,6 +41,10 @@ POSSIBILITY OF SUCH DAMAGE. #include "main_FLP.h" #endif +#ifdef ENABLE_DRED +#include "dred_encoder.h" +#endif + /***************************************/ /* Read control structure from encoder */ /***************************************/ @@ -270,6 +274,7 @@ opus_int silk_Encode( /* O Returns error co psEnc->state_Fxx[ 0 ].sCmn.fs_kHz * 1000 ); ALLOC( buf, nSamplesFromInputMax, opus_int16 ); while( 1 ) { + int curr_nBitsUsedLBRR = 0; nSamplesToBuffer = psEnc->state_Fxx[ 0 ].sCmn.frame_length - psEnc->state_Fxx[ 0 ].sCmn.inputBufIx; nSamplesToBuffer = silk_min( nSamplesToBuffer, nSamplesToBufferMax ); nSamplesFromInput = silk_DIV32_16( nSamplesToBuffer * psEnc->state_Fxx[ 0 ].sCmn.API_fs_Hz, psEnc->state_Fxx[ 0 ].sCmn.fs_kHz * 1000 ); @@ -342,6 +347,7 @@ opus_int silk_Encode( /* O Returns error co opus_uint8 iCDF[ 2 ] = { 0, 0 }; iCDF[ 0 ] = 256 - silk_RSHIFT( 256, ( psEnc->state_Fxx[ 0 ].sCmn.nFramesPerPacket + 1 ) * encControl->nChannelsInternal ); ec_enc_icdf( psRangeEnc, 0, iCDF, 8 ); + curr_nBitsUsedLBRR = ec_tell( psRangeEnc ); /* Encode any LBRR data from previous packet */ /* Encode LBRR flags */ @@ -386,8 +392,7 @@ opus_int silk_Encode( /* O Returns error co for( n = 0; n < encControl->nChannelsInternal; n++ ) { silk_memset( psEnc->state_Fxx[ n ].sCmn.LBRR_flags, 0, sizeof( psEnc->state_Fxx[ n ].sCmn.LBRR_flags ) ); } - - psEnc->nBitsUsedLBRR = ec_tell( psRangeEnc ); + curr_nBitsUsedLBRR = ec_tell( psRangeEnc ) - curr_nBitsUsedLBRR; } silk_HP_variable_cutoff( psEnc->state_Fxx ); @@ -396,6 +401,16 @@ opus_int silk_Encode( /* O Returns error co nBits = silk_DIV32_16( silk_MUL( encControl->bitRate, encControl->payloadSize_ms ), 1000 ); /* Subtract bits used for LBRR */ if( !prefillFlag ) { + /* psEnc->nBitsUsedLBRR is an exponential moving average of the LBRR usage, + except that for the first LBRR frame it does no averaging and for the first + frame after after LBRR, it goes back to zero immediately. */ + if ( curr_nBitsUsedLBRR < 10 ) { + psEnc->nBitsUsedLBRR = 0; + } else if ( psEnc->nBitsUsedLBRR < 10) { + psEnc->nBitsUsedLBRR = curr_nBitsUsedLBRR; + } else { + psEnc->nBitsUsedLBRR = ( psEnc->nBitsUsedLBRR + curr_nBitsUsedLBRR ) / 2; + } nBits -= psEnc->nBitsUsedLBRR; } /* Divide by number of uncoded frames left in packet */ diff --git a/opus/silk/fixed/LTP_scale_ctrl_FIX.c b/opus/silk/fixed/LTP_scale_ctrl_FIX.c index 3dcedef8..db1016e0 100644 --- a/opus/silk/fixed/LTP_scale_ctrl_FIX.c +++ b/opus/silk/fixed/LTP_scale_ctrl_FIX.c @@ -42,9 +42,14 @@ void silk_LTP_scale_ctrl_FIX( if( condCoding == CODE_INDEPENDENTLY ) { /* Only scale if first frame in packet */ - round_loss = psEnc->sCmn.PacketLoss_perc + psEnc->sCmn.nFramesPerPacket; - psEnc->sCmn.indices.LTP_scaleIndex = (opus_int8)silk_LIMIT( - silk_SMULWB( silk_SMULBB( round_loss, psEncCtrl->LTPredCodGain_Q7 ), SILK_FIX_CONST( 0.1, 9 ) ), 0, 2 ); + round_loss = psEnc->sCmn.PacketLoss_perc * psEnc->sCmn.nFramesPerPacket; + if ( psEnc->sCmn.LBRR_flag ) { + /* LBRR reduces the effective loss. In practice, it does not square the loss because + losses aren't independent, but that still seems to work best. We also never go below 2%. */ + round_loss = 2 + silk_SMULBB( round_loss, round_loss ) / 100; + } + psEnc->sCmn.indices.LTP_scaleIndex = silk_SMULBB( psEncCtrl->LTPredCodGain_Q7, round_loss ) > silk_log2lin( 128*7 + 2900-psEnc->sCmn.SNR_dB_Q7 ); + psEnc->sCmn.indices.LTP_scaleIndex += silk_SMULBB( psEncCtrl->LTPredCodGain_Q7, round_loss ) > silk_log2lin( 128*7 + 3900-psEnc->sCmn.SNR_dB_Q7 ); } else { /* Default is minimum scaling */ psEnc->sCmn.indices.LTP_scaleIndex = 0; diff --git a/opus/silk/fixed/arm/warped_autocorrelation_FIX_neon_intr.c b/opus/silk/fixed/arm/warped_autocorrelation_FIX_neon_intr.c index 00a70cb5..6f3be025 100644 --- a/opus/silk/fixed/arm/warped_autocorrelation_FIX_neon_intr.c +++ b/opus/silk/fixed/arm/warped_autocorrelation_FIX_neon_intr.c @@ -84,7 +84,9 @@ void silk_warped_autocorrelation_FIX_neon( silk_assert( ( order & 1 ) == 0 ); silk_assert( 2 * QS - QC >= 0 ); - ALLOC( input_QST, length + 2 * MAX_SHAPE_LPC_ORDER, opus_int32 ); + /* The additional +4 is to ensure a later vld1q_s32 call does not overflow. */ + /* Strictly, only +3 is needed but +4 simplifies initialization using the 4x32 neon load. */ + ALLOC( input_QST, length + 2 * MAX_SHAPE_LPC_ORDER + 4, opus_int32 ); input_QS = input_QST; /* input_QS has zero paddings in the beginning and end. */ @@ -121,6 +123,8 @@ void silk_warped_autocorrelation_FIX_neon( vst1q_s32( input_QS, vdupq_n_s32( 0 ) ); input_QS += 4; vst1q_s32( input_QS, vdupq_n_s32( 0 ) ); + input_QS += 4; + vst1q_s32( input_QS, vdupq_n_s32( 0 ) ); input_QS = input_QST + MAX_SHAPE_LPC_ORDER - orderT; /* The following loop runs ( length + order ) times, with ( order ) extra epilogues. */ @@ -153,7 +157,8 @@ void silk_warped_autocorrelation_FIX_neon( opus_int o = orderT; int32x4_t state_QS_s32x4[ 3 ][ 2 ]; - ALLOC( state, length + orderT, opus_int32 ); + /* The additional +4 is to ensure a later vld1q_s32 call does not overflow. */ + ALLOC( state, length + order + 4, opus_int32 ); state_QS_s32x4[ 2 ][ 1 ] = vdupq_n_s32( 0 ); /* Calculate 8 taps of all inputs in each loop. */ diff --git a/opus/silk/fixed/burg_modified_FIX.c b/opus/silk/fixed/burg_modified_FIX.c index 274d4b28..185a12b1 100644 --- a/opus/silk/fixed/burg_modified_FIX.c +++ b/opus/silk/fixed/burg_modified_FIX.c @@ -68,7 +68,7 @@ void silk_burg_modified_c( celt_assert( subfr_length * nb_subfr <= MAX_FRAME_SIZE ); /* Compute autocorrelations, added over subframes */ - C0_64 = silk_inner_prod16_aligned_64( x, x, subfr_length*nb_subfr, arch ); + C0_64 = silk_inner_prod16( x, x, subfr_length*nb_subfr, arch ); lz = silk_CLZ64(C0_64); rshifts = 32 + 1 + N_BITS_HEAD_ROOM - lz; if (rshifts > MAX_RSHIFTS) rshifts = MAX_RSHIFTS; @@ -87,7 +87,7 @@ void silk_burg_modified_c( x_ptr = x + s * subfr_length; for( n = 1; n < D + 1; n++ ) { C_first_row[ n - 1 ] += (opus_int32)silk_RSHIFT64( - silk_inner_prod16_aligned_64( x_ptr, x_ptr + n, subfr_length - n, arch ), rshifts ); + silk_inner_prod16( x_ptr, x_ptr + n, subfr_length - n, arch ), rshifts ); } } } else { @@ -150,7 +150,7 @@ void silk_burg_modified_c( C_first_row[ k ] = silk_MLA( C_first_row[ k ], x1, x_ptr[ n - k - 1 ] ); /* Q( -rshifts ) */ C_last_row[ k ] = silk_MLA( C_last_row[ k ], x2, x_ptr[ subfr_length - n + k ] ); /* Q( -rshifts ) */ Atmp1 = silk_RSHIFT_ROUND( Af_QA[ k ], QA - 17 ); /* Q17 */ - /* We sometimes have get overflows in the multiplications (even beyond +/- 2^32), + /* We sometimes get overflows in the multiplications (even beyond +/- 2^32), but they cancel each other and the real result seems to always fit in a 32-bit signed integer. This was determined experimentally, not theoretically (unfortunately). */ tmp1 = silk_MLA_ovflw( tmp1, x_ptr[ n - k - 1 ], Atmp1 ); /* Q17 */ @@ -253,7 +253,7 @@ void silk_burg_modified_c( if( rshifts > 0 ) { for( s = 0; s < nb_subfr; s++ ) { x_ptr = x + s * subfr_length; - C0 -= (opus_int32)silk_RSHIFT64( silk_inner_prod16_aligned_64( x_ptr, x_ptr, D, arch ), rshifts ); + C0 -= (opus_int32)silk_RSHIFT64( silk_inner_prod16( x_ptr, x_ptr, D, arch ), rshifts ); } } else { for( s = 0; s < nb_subfr; s++ ) { diff --git a/opus/silk/fixed/encode_frame_FIX.c b/opus/silk/fixed/encode_frame_FIX.c index a02bf87d..7c83360b 100644 --- a/opus/silk/fixed/encode_frame_FIX.c +++ b/opus/silk/fixed/encode_frame_FIX.c @@ -105,8 +105,11 @@ opus_int silk_encode_frame_FIX( opus_int gain_lock[ MAX_NB_SUBFR ] = {0}; opus_int16 best_gain_mult[ MAX_NB_SUBFR ]; opus_int best_sum[ MAX_NB_SUBFR ]; + opus_int bits_margin; SAVE_STACK; + /* For CBR, 5 bits below budget is close enough. For VBR, allow up to 25% below the cap if we initially busted the budget. */ + bits_margin = useCBR ? 5 : maxBits/4; /* This is totally unnecessary but many compilers (including gcc) are too dumb to realise it */ LastGainIndex_copy2 = nBits_lower = nBits_upper = gainMult_lower = gainMult_upper = 0; @@ -282,7 +285,7 @@ opus_int silk_encode_frame_FIX( gainMult_upper = gainMult_Q8; gainsID_upper = gainsID; } - } else if( nBits < maxBits - 5 ) { + } else if( nBits < maxBits - bits_margin ) { found_lower = 1; nBits_lower = nBits; gainMult_lower = gainMult_Q8; @@ -296,7 +299,7 @@ opus_int silk_encode_frame_FIX( LastGainIndex_copy2 = psEnc->sShape.LastGainIndex; } } else { - /* Within 5 bits of budget: close enough */ + /* Close enough */ break; } @@ -318,17 +321,10 @@ opus_int silk_encode_frame_FIX( if( ( found_lower & found_upper ) == 0 ) { /* Adjust gain according to high-rate rate/distortion curve */ if( nBits > maxBits ) { - if (gainMult_Q8 < 16384) { - gainMult_Q8 *= 2; - } else { - gainMult_Q8 = 32767; - } + gainMult_Q8 = silk_min_32( 1024, gainMult_Q8*3/2 ); } else { - opus_int32 gain_factor_Q16; - gain_factor_Q16 = silk_log2lin( silk_LSHIFT( nBits - maxBits, 7 ) / psEnc->sCmn.frame_length + SILK_FIX_CONST( 16, 7 ) ); - gainMult_Q8 = silk_SMULWB( gain_factor_Q16, gainMult_Q8 ); + gainMult_Q8 = silk_max_32( 64, gainMult_Q8*4/5 ); } - } else { /* Adjust gain by interpolating */ gainMult_Q8 = gainMult_lower + silk_DIV32_16( silk_MUL( gainMult_upper - gainMult_lower, maxBits - nBits_lower ), nBits_upper - nBits_lower ); diff --git a/opus/silk/fixed/find_pred_coefs_FIX.c b/opus/silk/fixed/find_pred_coefs_FIX.c index 606d8633..ad363fb7 100644 --- a/opus/silk/fixed/find_pred_coefs_FIX.c +++ b/opus/silk/fixed/find_pred_coefs_FIX.c @@ -42,7 +42,8 @@ void silk_find_pred_coefs_FIX( { opus_int i; opus_int32 invGains_Q16[ MAX_NB_SUBFR ], local_gains[ MAX_NB_SUBFR ]; - opus_int16 NLSF_Q15[ MAX_LPC_ORDER ]; + /* Set to NLSF_Q15 to zero so we don't copy junk to the state. */ + opus_int16 NLSF_Q15[ MAX_LPC_ORDER ]={0}; const opus_int16 *x_ptr; opus_int16 *x_pre_ptr; VARDECL( opus_int16, LPC_in_pre ); diff --git a/opus/silk/fixed/vector_ops_FIX.c b/opus/silk/fixed/vector_ops_FIX.c index d9498001..dcf84070 100644 --- a/opus/silk/fixed/vector_ops_FIX.c +++ b/opus/silk/fixed/vector_ops_FIX.c @@ -87,7 +87,7 @@ opus_int32 silk_inner_prod_aligned( #endif } -opus_int64 silk_inner_prod16_aligned_64_c( +opus_int64 silk_inner_prod16_c( const opus_int16 *inVec1, /* I input vector 1 */ const opus_int16 *inVec2, /* I input vector 2 */ const opus_int len /* I vector lengths */ diff --git a/opus/silk/fixed/x86/burg_modified_FIX_sse4_1.c b/opus/silk/fixed/x86/burg_modified_FIX_sse4_1.c index bbb1ce0f..e58bf079 100644 --- a/opus/silk/fixed/x86/burg_modified_FIX_sse4_1.c +++ b/opus/silk/fixed/x86/burg_modified_FIX_sse4_1.c @@ -1,5 +1,5 @@ -/* Copyright (c) 2014, Cisco Systems, INC - Written by XiangMingZhu WeiZhou MinPeng YanWang +/* Copyright (c) 2014-2020, Cisco Systems, INC + Written by XiangMingZhu WeiZhou MinPeng YanWang FrancisQuiers Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions @@ -42,7 +42,7 @@ #define MAX_FRAME_SIZE 384 /* subfr_length * nb_subfr = ( 0.005 * 16000 + 16 ) * 4 = 384 */ #define QA 25 -#define N_BITS_HEAD_ROOM 2 +#define N_BITS_HEAD_ROOM 3 #define MIN_RSHIFTS -16 #define MAX_RSHIFTS (32 - QA) @@ -59,7 +59,7 @@ void silk_burg_modified_sse4_1( int arch /* I Run-time architecture */ ) { - opus_int k, n, s, lz, rshifts, rshifts_extra, reached_max_gain; + opus_int k, n, s, lz, rshifts, reached_max_gain; opus_int32 C0, num, nrg, rc_Q31, invGain_Q30, Atmp_QA, Atmp1, tmp1, tmp2, x1, x2; const opus_int16 *x_ptr; opus_int32 C_first_row[ SILK_MAX_ORDER_LPC ]; @@ -68,6 +68,7 @@ void silk_burg_modified_sse4_1( opus_int32 CAf[ SILK_MAX_ORDER_LPC + 1 ]; opus_int32 CAb[ SILK_MAX_ORDER_LPC + 1 ]; opus_int32 xcorr[ SILK_MAX_ORDER_LPC ]; + opus_int64 C0_64; __m128i FIRST_3210, LAST_3210, ATMP_3210, TMP1_3210, TMP2_3210, T1_3210, T2_3210, PTR_3210, SUBFR_3210, X1_3210, X2_3210; __m128i CONST1 = _mm_set1_epi32(1); @@ -75,23 +76,18 @@ void silk_burg_modified_sse4_1( celt_assert( subfr_length * nb_subfr <= MAX_FRAME_SIZE ); /* Compute autocorrelations, added over subframes */ - silk_sum_sqr_shift( &C0, &rshifts, x, nb_subfr * subfr_length ); - if( rshifts > MAX_RSHIFTS ) { - C0 = silk_LSHIFT32( C0, rshifts - MAX_RSHIFTS ); - silk_assert( C0 > 0 ); - rshifts = MAX_RSHIFTS; + C0_64 = silk_inner_prod16( x, x, subfr_length*nb_subfr, arch ); + lz = silk_CLZ64(C0_64); + rshifts = 32 + 1 + N_BITS_HEAD_ROOM - lz; + if (rshifts > MAX_RSHIFTS) rshifts = MAX_RSHIFTS; + if (rshifts < MIN_RSHIFTS) rshifts = MIN_RSHIFTS; + + if (rshifts > 0) { + C0 = (opus_int32)silk_RSHIFT64(C0_64, rshifts ); } else { - lz = silk_CLZ32( C0 ) - 1; - rshifts_extra = N_BITS_HEAD_ROOM - lz; - if( rshifts_extra > 0 ) { - rshifts_extra = silk_min( rshifts_extra, MAX_RSHIFTS - rshifts ); - C0 = silk_RSHIFT32( C0, rshifts_extra ); - } else { - rshifts_extra = silk_max( rshifts_extra, MIN_RSHIFTS - rshifts ); - C0 = silk_LSHIFT32( C0, -rshifts_extra ); - } - rshifts += rshifts_extra; + C0 = silk_LSHIFT32((opus_int32)C0_64, -rshifts ); } + CAb[ 0 ] = CAf[ 0 ] = C0 + silk_SMMUL( SILK_FIX_CONST( FIND_LPC_COND_FAC, 32 ), C0 ) + 1; /* Q(-rshifts) */ silk_memset( C_first_row, 0, SILK_MAX_ORDER_LPC * sizeof( opus_int32 ) ); if( rshifts > 0 ) { @@ -99,7 +95,7 @@ void silk_burg_modified_sse4_1( x_ptr = x + s * subfr_length; for( n = 1; n < D + 1; n++ ) { C_first_row[ n - 1 ] += (opus_int32)silk_RSHIFT64( - silk_inner_prod16_aligned_64( x_ptr, x_ptr + n, subfr_length - n, arch ), rshifts ); + silk_inner_prod16( x_ptr, x_ptr + n, subfr_length - n, arch ), rshifts ); } } } else { @@ -203,8 +199,11 @@ void silk_burg_modified_sse4_1( C_first_row[ k ] = silk_MLA( C_first_row[ k ], x1, x_ptr[ n - k - 1 ] ); /* Q( -rshifts ) */ C_last_row[ k ] = silk_MLA( C_last_row[ k ], x2, x_ptr[ subfr_length - n + k ] ); /* Q( -rshifts ) */ Atmp1 = silk_RSHIFT_ROUND( Af_QA[ k ], QA - 17 ); /* Q17 */ - tmp1 = silk_MLA( tmp1, x_ptr[ n - k - 1 ], Atmp1 ); /* Q17 */ - tmp2 = silk_MLA( tmp2, x_ptr[ subfr_length - n + k ], Atmp1 ); /* Q17 */ + /* We sometimes get overflows in the multiplications (even beyond +/- 2^32), + but they cancel each other and the real result seems to always fit in a 32-bit + signed integer. This was determined experimentally, not theoretically (unfortunately). */ + tmp1 = silk_MLA_ovflw( tmp1, x_ptr[ n - k - 1 ], Atmp1 ); /* Q17 */ + tmp2 = silk_MLA_ovflw( tmp2, x_ptr[ subfr_length - n + k ], Atmp1 ); /* Q17 */ } tmp1 = -tmp1; /* Q17 */ @@ -350,7 +349,7 @@ void silk_burg_modified_sse4_1( if( rshifts > 0 ) { for( s = 0; s < nb_subfr; s++ ) { x_ptr = x + s * subfr_length; - C0 -= (opus_int32)silk_RSHIFT64( silk_inner_prod16_aligned_64( x_ptr, x_ptr, D, arch ), rshifts ); + C0 -= (opus_int32)silk_RSHIFT64( silk_inner_prod16( x_ptr, x_ptr, D, arch ), rshifts ); } } else { for( s = 0; s < nb_subfr; s++ ) { @@ -374,4 +373,28 @@ void silk_burg_modified_sse4_1( *res_nrg = silk_SMLAWW( nrg, silk_SMMUL( SILK_FIX_CONST( FIND_LPC_COND_FAC, 32 ), C0 ), -tmp1 );/* Q( -rshifts ) */ *res_nrg_Q = -rshifts; } + +#ifdef OPUS_CHECK_ASM + { + opus_int32 res_nrg_c = 0; + opus_int res_nrg_Q_c = 0; + opus_int32 A_Q16_c[ MAX_LPC_ORDER ] = {0}; + + silk_burg_modified_c( + &res_nrg_c, + &res_nrg_Q_c, + A_Q16_c, + x, + minInvGain_Q30, + subfr_length, + nb_subfr, + D, + 0 + ); + + silk_assert( *res_nrg == res_nrg_c ); + silk_assert( *res_nrg_Q == res_nrg_Q_c ); + silk_assert( !memcmp( A_Q16, A_Q16_c, D * sizeof( *A_Q16 ) ) ); + } +#endif } diff --git a/opus/silk/fixed/x86/vector_ops_FIX_sse4_1.c b/opus/silk/fixed/x86/vector_ops_FIX_sse4_1.c index c1e90564..a46289bb 100644 --- a/opus/silk/fixed/x86/vector_ops_FIX_sse4_1.c +++ b/opus/silk/fixed/x86/vector_ops_FIX_sse4_1.c @@ -36,40 +36,38 @@ #include "SigProc_FIX.h" #include "pitch.h" +#include "celt/x86/x86cpu.h" -opus_int64 silk_inner_prod16_aligned_64_sse4_1( +opus_int64 silk_inner_prod16_sse4_1( const opus_int16 *inVec1, /* I input vector 1 */ const opus_int16 *inVec2, /* I input vector 2 */ const opus_int len /* I vector lengths */ ) { - opus_int i, dataSize8; + opus_int i, dataSize4; opus_int64 sum; - __m128i xmm_tempa; - __m128i inVec1_76543210, acc1; - __m128i inVec2_76543210, acc2; + __m128i xmm_prod_20, xmm_prod_31; + __m128i inVec1_3210, acc1; + __m128i inVec2_3210, acc2; sum = 0; - dataSize8 = len & ~7; + dataSize4 = len & ~3; acc1 = _mm_setzero_si128(); acc2 = _mm_setzero_si128(); - for( i = 0; i < dataSize8; i += 8 ) { - inVec1_76543210 = _mm_loadu_si128( (__m128i *)(&inVec1[i + 0] ) ); - inVec2_76543210 = _mm_loadu_si128( (__m128i *)(&inVec2[i + 0] ) ); + for( i = 0; i < dataSize4; i += 4 ) { + inVec1_3210 = OP_CVTEPI16_EPI32_M64( &inVec1[i + 0] ); + inVec2_3210 = OP_CVTEPI16_EPI32_M64( &inVec2[i + 0] ); + xmm_prod_20 = _mm_mul_epi32( inVec1_3210, inVec2_3210 ); - /* only when all 4 operands are -32768 (0x8000), this results in wrap around */ - inVec1_76543210 = _mm_madd_epi16( inVec1_76543210, inVec2_76543210 ); + inVec1_3210 = _mm_shuffle_epi32( inVec1_3210, _MM_SHUFFLE( 0, 3, 2, 1 ) ); + inVec2_3210 = _mm_shuffle_epi32( inVec2_3210, _MM_SHUFFLE( 0, 3, 2, 1 ) ); + xmm_prod_31 = _mm_mul_epi32( inVec1_3210, inVec2_3210 ); - xmm_tempa = _mm_cvtepi32_epi64( inVec1_76543210 ); - /* equal shift right 8 bytes */ - inVec1_76543210 = _mm_shuffle_epi32( inVec1_76543210, _MM_SHUFFLE( 0, 0, 3, 2 ) ); - inVec1_76543210 = _mm_cvtepi32_epi64( inVec1_76543210 ); - - acc1 = _mm_add_epi64( acc1, xmm_tempa ); - acc2 = _mm_add_epi64( acc2, inVec1_76543210 ); + acc1 = _mm_add_epi64( acc1, xmm_prod_20 ); + acc2 = _mm_add_epi64( acc2, xmm_prod_31 ); } acc1 = _mm_add_epi64( acc1, acc2 ); @@ -81,8 +79,15 @@ opus_int64 silk_inner_prod16_aligned_64_sse4_1( _mm_storel_epi64( (__m128i *)&sum, acc1 ); for( ; i < len; i++ ) { - sum = silk_SMLABB( sum, inVec1[ i ], inVec2[ i ] ); + sum = silk_SMLALBB( sum, inVec1[ i ], inVec2[ i ] ); + } + +#ifdef OPUS_CHECK_ASM + { + opus_int64 sum_c = silk_inner_prod16_c( inVec1, inVec2, len ); + silk_assert( sum == sum_c ); } +#endif return sum; } diff --git a/opus/silk/float/LTP_scale_ctrl_FLP.c b/opus/silk/float/LTP_scale_ctrl_FLP.c index 8dbe29d0..6f30ff09 100644 --- a/opus/silk/float/LTP_scale_ctrl_FLP.c +++ b/opus/silk/float/LTP_scale_ctrl_FLP.c @@ -41,8 +41,14 @@ void silk_LTP_scale_ctrl_FLP( if( condCoding == CODE_INDEPENDENTLY ) { /* Only scale if first frame in packet */ - round_loss = psEnc->sCmn.PacketLoss_perc + psEnc->sCmn.nFramesPerPacket; - psEnc->sCmn.indices.LTP_scaleIndex = (opus_int8)silk_LIMIT( round_loss * psEncCtrl->LTPredCodGain * 0.1f, 0.0f, 2.0f ); + round_loss = psEnc->sCmn.PacketLoss_perc * psEnc->sCmn.nFramesPerPacket; + if ( psEnc->sCmn.LBRR_flag ) { + /* LBRR reduces the effective loss. In practice, it does not square the loss because + losses aren't independent, but that still seems to work best. We also never go below 2%. */ + round_loss = 2 + silk_SMULBB( round_loss, round_loss) / 100; + } + psEnc->sCmn.indices.LTP_scaleIndex = silk_SMULBB( psEncCtrl->LTPredCodGain, round_loss ) > silk_log2lin( 2900 - psEnc->sCmn.SNR_dB_Q7 ); + psEnc->sCmn.indices.LTP_scaleIndex += silk_SMULBB( psEncCtrl->LTPredCodGain, round_loss ) > silk_log2lin( 3900 - psEnc->sCmn.SNR_dB_Q7 ); } else { /* Default is minimum scaling */ psEnc->sCmn.indices.LTP_scaleIndex = 0; diff --git a/opus/silk/float/SigProc_FLP.h b/opus/silk/float/SigProc_FLP.h index 953de8b0..ff9281b8 100644 --- a/opus/silk/float/SigProc_FLP.h +++ b/opus/silk/float/SigProc_FLP.h @@ -30,6 +30,7 @@ POSSIBILITY OF SUCH DAMAGE. #include "SigProc_FIX.h" #include "float_cast.h" +#include "main.h" #include #ifdef __cplusplus @@ -73,7 +74,8 @@ void silk_autocorrelation_FLP( silk_float *results, /* O result (length correlationCount) */ const silk_float *inputData, /* I input data to correlate */ opus_int inputDataSize, /* I length of input */ - opus_int correlationCount /* I number of correlation taps to compute */ + opus_int correlationCount, /* I number of correlation taps to compute */ + int arch ); opus_int silk_pitch_analysis_core_FLP( /* O Voicing estimate: 0 voiced, 1 unvoiced */ @@ -105,7 +107,8 @@ silk_float silk_burg_modified_FLP( /* O returns residual energy const silk_float minInvGain, /* I minimum inverse prediction gain */ const opus_int subfr_length, /* I input signal subframe length (incl. D preceding samples) */ const opus_int nb_subfr, /* I number of subframes stacked in x */ - const opus_int D /* I order */ + const opus_int D, /* I order */ + int arch ); /* multiply a vector by a constant */ @@ -124,12 +127,17 @@ void silk_scale_copy_vector_FLP( ); /* inner product of two silk_float arrays, with result as double */ -double silk_inner_product_FLP( +double silk_inner_product_FLP_c( const silk_float *data1, const silk_float *data2, opus_int dataSize ); +#ifndef OVERRIDE_inner_product_FLP +#define silk_inner_product_FLP(data1, data2, dataSize, arch) ((void)arch,silk_inner_product_FLP_c(data1, data2, dataSize)) +#endif + + /* sum of squares of a silk_float array, with result as double */ double silk_energy_FLP( const silk_float *data, diff --git a/opus/silk/float/autocorrelation_FLP.c b/opus/silk/float/autocorrelation_FLP.c index 8b8a9e65..4253b26e 100644 --- a/opus/silk/float/autocorrelation_FLP.c +++ b/opus/silk/float/autocorrelation_FLP.c @@ -37,7 +37,8 @@ void silk_autocorrelation_FLP( silk_float *results, /* O result (length correlationCount) */ const silk_float *inputData, /* I input data to correlate */ opus_int inputDataSize, /* I length of input */ - opus_int correlationCount /* I number of correlation taps to compute */ + opus_int correlationCount, /* I number of correlation taps to compute */ + int arch ) { opus_int i; @@ -47,6 +48,6 @@ void silk_autocorrelation_FLP( } for( i = 0; i < correlationCount; i++ ) { - results[ i ] = (silk_float)silk_inner_product_FLP( inputData, inputData + i, inputDataSize - i ); + results[ i ] = (silk_float)silk_inner_product_FLP( inputData, inputData + i, inputDataSize - i, arch ); } } diff --git a/opus/silk/float/burg_modified_FLP.c b/opus/silk/float/burg_modified_FLP.c index 756b76a3..f5bef5dd 100644 --- a/opus/silk/float/burg_modified_FLP.c +++ b/opus/silk/float/burg_modified_FLP.c @@ -42,7 +42,8 @@ silk_float silk_burg_modified_FLP( /* O returns residual energy const silk_float minInvGain, /* I minimum inverse prediction gain */ const opus_int subfr_length, /* I input signal subframe length (incl. D preceding samples) */ const opus_int nb_subfr, /* I number of subframes stacked in x */ - const opus_int D /* I order */ + const opus_int D, /* I order */ + int arch ) { opus_int k, n, s, reached_max_gain; @@ -60,7 +61,7 @@ silk_float silk_burg_modified_FLP( /* O returns residual energy for( s = 0; s < nb_subfr; s++ ) { x_ptr = x + s * subfr_length; for( n = 1; n < D + 1; n++ ) { - C_first_row[ n - 1 ] += silk_inner_product_FLP( x_ptr, x_ptr + n, subfr_length - n ); + C_first_row[ n - 1 ] += silk_inner_product_FLP( x_ptr, x_ptr + n, subfr_length - n, arch ); } } silk_memcpy( C_last_row, C_first_row, SILK_MAX_ORDER_LPC * sizeof( double ) ); diff --git a/opus/silk/float/corrMatrix_FLP.c b/opus/silk/float/corrMatrix_FLP.c index eae6a1cf..eef6e8aa 100644 --- a/opus/silk/float/corrMatrix_FLP.c +++ b/opus/silk/float/corrMatrix_FLP.c @@ -41,7 +41,8 @@ void silk_corrVector_FLP( const silk_float *t, /* I Target vector [L] */ const opus_int L, /* I Length of vecors */ const opus_int Order, /* I Max lag for correlation */ - silk_float *Xt /* O X'*t correlation vector [order] */ + silk_float *Xt, /* O X'*t correlation vector [order] */ + int arch ) { opus_int lag; @@ -50,7 +51,7 @@ void silk_corrVector_FLP( ptr1 = &x[ Order - 1 ]; /* Points to first sample of column 0 of X: X[:,0] */ for( lag = 0; lag < Order; lag++ ) { /* Calculate X[:,lag]'*t */ - Xt[ lag ] = (silk_float)silk_inner_product_FLP( ptr1, t, L ); + Xt[ lag ] = (silk_float)silk_inner_product_FLP( ptr1, t, L, arch ); ptr1--; /* Next column of X */ } } @@ -60,7 +61,8 @@ void silk_corrMatrix_FLP( const silk_float *x, /* I x vector [ L+order-1 ] used to create X */ const opus_int L, /* I Length of vectors */ const opus_int Order, /* I Max lag for correlation */ - silk_float *XX /* O X'*X correlation matrix [order x order] */ + silk_float *XX, /* O X'*X correlation matrix [order x order] */ + int arch ) { opus_int j, lag; @@ -79,7 +81,7 @@ void silk_corrMatrix_FLP( ptr2 = &x[ Order - 2 ]; /* First sample of column 1 of X */ for( lag = 1; lag < Order; lag++ ) { /* Calculate X[:,0]'*X[:,lag] */ - energy = silk_inner_product_FLP( ptr1, ptr2, L ); + energy = silk_inner_product_FLP( ptr1, ptr2, L, arch ); matrix_ptr( XX, lag, 0, Order ) = ( silk_float )energy; matrix_ptr( XX, 0, lag, Order ) = ( silk_float )energy; /* Calculate X[:,j]'*X[:,j + lag] */ diff --git a/opus/silk/float/encode_frame_FLP.c b/opus/silk/float/encode_frame_FLP.c index b029c3f5..8a327c56 100644 --- a/opus/silk/float/encode_frame_FLP.c +++ b/opus/silk/float/encode_frame_FLP.c @@ -107,7 +107,10 @@ opus_int silk_encode_frame_FLP( opus_int gain_lock[ MAX_NB_SUBFR ] = {0}; opus_int16 best_gain_mult[ MAX_NB_SUBFR ]; opus_int best_sum[ MAX_NB_SUBFR ]; + opus_int bits_margin; + /* For CBR, 5 bits below budget is close enough. For VBR, allow up to 25% below the cap if we initially busted the budget. */ + bits_margin = useCBR ? 5 : maxBits/4; /* This is totally unnecessary but many compilers (including gcc) are too dumb to realise it */ LastGainIndex_copy2 = nBits_lower = nBits_upper = gainMult_lower = gainMult_upper = 0; @@ -270,7 +273,7 @@ opus_int silk_encode_frame_FLP( gainMult_upper = gainMult_Q8; gainsID_upper = gainsID; } - } else if( nBits < maxBits - 5 ) { + } else if( nBits < maxBits - bits_margin ) { found_lower = 1; nBits_lower = nBits; gainMult_lower = gainMult_Q8; @@ -284,7 +287,7 @@ opus_int silk_encode_frame_FLP( LastGainIndex_copy2 = psEnc->sShape.LastGainIndex; } } else { - /* Within 5 bits of budget: close enough */ + /* Close enough */ break; } @@ -306,15 +309,9 @@ opus_int silk_encode_frame_FLP( if( ( found_lower & found_upper ) == 0 ) { /* Adjust gain according to high-rate rate/distortion curve */ if( nBits > maxBits ) { - if (gainMult_Q8 < 16384) { - gainMult_Q8 *= 2; - } else { - gainMult_Q8 = 32767; - } + gainMult_Q8 = silk_min_32( 1024, gainMult_Q8*3/2 ); } else { - opus_int32 gain_factor_Q16; - gain_factor_Q16 = silk_log2lin( silk_LSHIFT( nBits - maxBits, 7 ) / psEnc->sCmn.frame_length + SILK_FIX_CONST( 16, 7 ) ); - gainMult_Q8 = silk_SMULWB( gain_factor_Q16, gainMult_Q8 ); + gainMult_Q8 = silk_max_32( 64, gainMult_Q8*4/5 ); } } else { /* Adjust gain by interpolating */ diff --git a/opus/silk/float/find_LPC_FLP.c b/opus/silk/float/find_LPC_FLP.c index fa3ffe7f..6ccd711d 100644 --- a/opus/silk/float/find_LPC_FLP.c +++ b/opus/silk/float/find_LPC_FLP.c @@ -38,7 +38,8 @@ void silk_find_LPC_FLP( silk_encoder_state *psEncC, /* I/O Encoder state */ opus_int16 NLSF_Q15[], /* O NLSFs */ const silk_float x[], /* I Input signal */ - const silk_float minInvGain /* I Inverse of max prediction gain */ + const silk_float minInvGain, /* I Inverse of max prediction gain */ + int arch ) { opus_int k, subfr_length; @@ -56,12 +57,12 @@ void silk_find_LPC_FLP( psEncC->indices.NLSFInterpCoef_Q2 = 4; /* Burg AR analysis for the full frame */ - res_nrg = silk_burg_modified_FLP( a, x, minInvGain, subfr_length, psEncC->nb_subfr, psEncC->predictLPCOrder ); + res_nrg = silk_burg_modified_FLP( a, x, minInvGain, subfr_length, psEncC->nb_subfr, psEncC->predictLPCOrder, arch ); if( psEncC->useInterpolatedNLSFs && !psEncC->first_frame_after_reset && psEncC->nb_subfr == MAX_NB_SUBFR ) { /* Optimal solution for last 10 ms; subtract residual energy here, as that's easier than */ /* adding it to the residual energy of the first 10 ms in each iteration of the search below */ - res_nrg -= silk_burg_modified_FLP( a_tmp, x + ( MAX_NB_SUBFR / 2 ) * subfr_length, minInvGain, subfr_length, MAX_NB_SUBFR / 2, psEncC->predictLPCOrder ); + res_nrg -= silk_burg_modified_FLP( a_tmp, x + ( MAX_NB_SUBFR / 2 ) * subfr_length, minInvGain, subfr_length, MAX_NB_SUBFR / 2, psEncC->predictLPCOrder, arch ); /* Convert to NLSFs */ silk_A2NLSF_FLP( NLSF_Q15, a_tmp, psEncC->predictLPCOrder ); diff --git a/opus/silk/float/find_LTP_FLP.c b/opus/silk/float/find_LTP_FLP.c index f9706493..90aeeac0 100644 --- a/opus/silk/float/find_LTP_FLP.c +++ b/opus/silk/float/find_LTP_FLP.c @@ -38,7 +38,8 @@ void silk_find_LTP_FLP( const silk_float r_ptr[], /* I LPC residual */ const opus_int lag[ MAX_NB_SUBFR ], /* I LTP lags */ const opus_int subfr_length, /* I Subframe length */ - const opus_int nb_subfr /* I number of subframes */ + const opus_int nb_subfr, /* I number of subframes */ + int arch ) { opus_int k; @@ -50,8 +51,8 @@ void silk_find_LTP_FLP( XX_ptr = XX; for( k = 0; k < nb_subfr; k++ ) { lag_ptr = r_ptr - ( lag[ k ] + LTP_ORDER / 2 ); - silk_corrMatrix_FLP( lag_ptr, subfr_length, LTP_ORDER, XX_ptr ); - silk_corrVector_FLP( lag_ptr, r_ptr, subfr_length, LTP_ORDER, xX_ptr ); + silk_corrMatrix_FLP( lag_ptr, subfr_length, LTP_ORDER, XX_ptr, arch ); + silk_corrVector_FLP( lag_ptr, r_ptr, subfr_length, LTP_ORDER, xX_ptr, arch ); xx = ( silk_float )silk_energy_FLP( r_ptr, subfr_length + LTP_ORDER ); temp = 1.0f / silk_max( xx, LTP_CORR_INV_MAX * 0.5f * ( XX_ptr[ 0 ] + XX_ptr[ 24 ] ) + 1.0f ); silk_scale_vector_FLP( XX_ptr, temp, LTP_ORDER * LTP_ORDER ); diff --git a/opus/silk/float/find_pitch_lags_FLP.c b/opus/silk/float/find_pitch_lags_FLP.c index dedbcd28..1f6bd599 100644 --- a/opus/silk/float/find_pitch_lags_FLP.c +++ b/opus/silk/float/find_pitch_lags_FLP.c @@ -82,7 +82,7 @@ void silk_find_pitch_lags_FLP( silk_apply_sine_window_FLP( Wsig_ptr, x_buf_ptr, 2, psEnc->sCmn.la_pitch ); /* Calculate autocorrelation sequence */ - silk_autocorrelation_FLP( auto_corr, Wsig, psEnc->sCmn.pitch_LPC_win_length, psEnc->sCmn.pitchEstimationLPCOrder + 1 ); + silk_autocorrelation_FLP( auto_corr, Wsig, psEnc->sCmn.pitch_LPC_win_length, psEnc->sCmn.pitchEstimationLPCOrder + 1, arch ); /* Add white noise, as a fraction of the energy */ auto_corr[ 0 ] += auto_corr[ 0 ] * FIND_PITCH_WHITE_NOISE_FRACTION + 1; diff --git a/opus/silk/float/find_pred_coefs_FLP.c b/opus/silk/float/find_pred_coefs_FLP.c index dcf7c520..f3c54cf4 100644 --- a/opus/silk/float/find_pred_coefs_FLP.c +++ b/opus/silk/float/find_pred_coefs_FLP.c @@ -44,7 +44,8 @@ void silk_find_pred_coefs_FLP( silk_float XXLTP[ MAX_NB_SUBFR * LTP_ORDER * LTP_ORDER ]; silk_float xXLTP[ MAX_NB_SUBFR * LTP_ORDER ]; silk_float invGains[ MAX_NB_SUBFR ]; - opus_int16 NLSF_Q15[ MAX_LPC_ORDER ]; + /* Set to NLSF_Q15 to zero so we don't copy junk to the state. */ + opus_int16 NLSF_Q15[ MAX_LPC_ORDER ]={0}; const silk_float *x_ptr; silk_float *x_pre_ptr, LPC_in_pre[ MAX_NB_SUBFR * MAX_LPC_ORDER + MAX_FRAME_LENGTH ]; silk_float minInvGain; @@ -62,7 +63,7 @@ void silk_find_pred_coefs_FLP( celt_assert( psEnc->sCmn.ltp_mem_length - psEnc->sCmn.predictLPCOrder >= psEncCtrl->pitchL[ 0 ] + LTP_ORDER / 2 ); /* LTP analysis */ - silk_find_LTP_FLP( XXLTP, xXLTP, res_pitch, psEncCtrl->pitchL, psEnc->sCmn.subfr_length, psEnc->sCmn.nb_subfr ); + silk_find_LTP_FLP( XXLTP, xXLTP, res_pitch, psEncCtrl->pitchL, psEnc->sCmn.subfr_length, psEnc->sCmn.nb_subfr, psEnc->sCmn.arch ); /* Quantize LTP gain parameters */ silk_quant_LTP_gains_FLP( psEncCtrl->LTPCoef, psEnc->sCmn.indices.LTPIndex, &psEnc->sCmn.indices.PERIndex, @@ -101,7 +102,7 @@ void silk_find_pred_coefs_FLP( } /* LPC_in_pre contains the LTP-filtered input for voiced, and the unfiltered input for unvoiced */ - silk_find_LPC_FLP( &psEnc->sCmn, NLSF_Q15, LPC_in_pre, minInvGain ); + silk_find_LPC_FLP( &psEnc->sCmn, NLSF_Q15, LPC_in_pre, minInvGain, psEnc->sCmn.arch ); /* Quantize LSFs */ silk_process_NLSFs_FLP( &psEnc->sCmn, psEncCtrl->PredCoef, NLSF_Q15, psEnc->sCmn.prev_NLSFq_Q15 ); diff --git a/opus/silk/float/inner_product_FLP.c b/opus/silk/float/inner_product_FLP.c index cdd39d24..88b160ab 100644 --- a/opus/silk/float/inner_product_FLP.c +++ b/opus/silk/float/inner_product_FLP.c @@ -32,7 +32,7 @@ POSSIBILITY OF SUCH DAMAGE. #include "SigProc_FLP.h" /* inner product of two silk_float arrays, with result as double */ -double silk_inner_product_FLP( +double silk_inner_product_FLP_c( const silk_float *data1, const silk_float *data2, opus_int dataSize diff --git a/opus/silk/float/main_FLP.h b/opus/silk/float/main_FLP.h index 5dc0ccf4..2e4435cc 100644 --- a/opus/silk/float/main_FLP.h +++ b/opus/silk/float/main_FLP.h @@ -138,7 +138,8 @@ void silk_find_LPC_FLP( silk_encoder_state *psEncC, /* I/O Encoder state */ opus_int16 NLSF_Q15[], /* O NLSFs */ const silk_float x[], /* I Input signal */ - const silk_float minInvGain /* I Prediction gain from LTP (dB) */ + const silk_float minInvGain, /* I Prediction gain from LTP (dB) */ + int arch ); /* LTP analysis */ @@ -148,7 +149,8 @@ void silk_find_LTP_FLP( const silk_float r_ptr[], /* I LPC residual */ const opus_int lag[ MAX_NB_SUBFR ], /* I LTP lags */ const opus_int subfr_length, /* I Subframe length */ - const opus_int nb_subfr /* I number of subframes */ + const opus_int nb_subfr, /* I number of subframes */ + int arch ); void silk_LTP_analysis_filter_FLP( @@ -221,7 +223,8 @@ void silk_corrMatrix_FLP( const silk_float *x, /* I x vector [ L+order-1 ] used to create X */ const opus_int L, /* I Length of vectors */ const opus_int Order, /* I Max lag for correlation */ - silk_float *XX /* O X'*X correlation matrix [order x order] */ + silk_float *XX, /* O X'*X correlation matrix [order x order] */ + int arch ); /* Calculates correlation vector X'*t */ @@ -230,7 +233,8 @@ void silk_corrVector_FLP( const silk_float *t, /* I Target vector [L] */ const opus_int L, /* I Length of vecors */ const opus_int Order, /* I Max lag for correlation */ - silk_float *Xt /* O X'*t correlation vector [order] */ + silk_float *Xt, /* O X'*t correlation vector [order] */ + int arch ); /* Apply sine window to signal vector. */ diff --git a/opus/silk/float/noise_shape_analysis_FLP.c b/opus/silk/float/noise_shape_analysis_FLP.c index cb3d8a50..0b5ea952 100644 --- a/opus/silk/float/noise_shape_analysis_FLP.c +++ b/opus/silk/float/noise_shape_analysis_FLP.c @@ -255,7 +255,7 @@ void silk_noise_shape_analysis_FLP( psEnc->sCmn.shapeWinLength, psEnc->sCmn.shapingLPCOrder ); } else { /* Calculate regular auto correlation */ - silk_autocorrelation_FLP( auto_corr, x_windowed, psEnc->sCmn.shapeWinLength, psEnc->sCmn.shapingLPCOrder + 1 ); + silk_autocorrelation_FLP( auto_corr, x_windowed, psEnc->sCmn.shapeWinLength, psEnc->sCmn.shapingLPCOrder + 1, psEnc->sCmn.arch ); } /* Add white noise, as a fraction of energy */ diff --git a/opus/silk/float/pitch_analysis_core_FLP.c b/opus/silk/float/pitch_analysis_core_FLP.c index f351bc37..0530a883 100644 --- a/opus/silk/float/pitch_analysis_core_FLP.c +++ b/opus/silk/float/pitch_analysis_core_FLP.c @@ -291,7 +291,7 @@ opus_int silk_pitch_analysis_core_FLP( /* O Voicing estimate: 0 voiced, for( j = 0; j < length_d_comp; j++ ) { d = d_comp[ j ]; basis_ptr = target_ptr - d; - cross_corr = silk_inner_product_FLP( basis_ptr, target_ptr, sf_length_8kHz ); + cross_corr = silk_inner_product_FLP( basis_ptr, target_ptr, sf_length_8kHz, arch ); if( cross_corr > 0.0f ) { energy = silk_energy_FLP( basis_ptr, sf_length_8kHz ); C[ k ][ d ] = (silk_float)( 2 * cross_corr / ( energy + energy_tmp ) ); diff --git a/opus/silk/float/warped_autocorrelation_FLP.c b/opus/silk/float/warped_autocorrelation_FLP.c index 09186e73..116dab92 100644 --- a/opus/silk/float/warped_autocorrelation_FLP.c +++ b/opus/silk/float/warped_autocorrelation_FLP.c @@ -54,11 +54,13 @@ void silk_warped_autocorrelation_FLP( /* Loop over allpass sections */ for( i = 0; i < order; i += 2 ) { /* Output of allpass section */ - tmp2 = state[ i ] + warping * ( state[ i + 1 ] - tmp1 ); + /* We voluntarily use two multiples instead of factoring the expression to + reduce the length of the dependency chain (tmp1->tmp2->tmp1... ). */ + tmp2 = state[ i ] + warping * state[ i + 1 ] - warping * tmp1; state[ i ] = tmp1; C[ i ] += state[ 0 ] * tmp1; /* Output of allpass section */ - tmp1 = state[ i + 1 ] + warping * ( state[ i + 2 ] - tmp2 ); + tmp1 = state[ i + 1 ] + warping * state[ i + 2 ] - warping * tmp2; state[ i + 1 ] = tmp2; C[ i + 1 ] += state[ 0 ] * tmp2; } diff --git a/opus/silk/float/wrappers_FLP.c b/opus/silk/float/wrappers_FLP.c index ad90b874..c0c183e3 100644 --- a/opus/silk/float/wrappers_FLP.c +++ b/opus/silk/float/wrappers_FLP.c @@ -190,12 +190,14 @@ void silk_quant_LTP_gains_FLP( opus_int32 XX_Q17[ MAX_NB_SUBFR * LTP_ORDER * LTP_ORDER ]; opus_int32 xX_Q17[ MAX_NB_SUBFR * LTP_ORDER ]; - for( i = 0; i < nb_subfr * LTP_ORDER * LTP_ORDER; i++ ) { + i = 0; + do { XX_Q17[ i ] = (opus_int32)silk_float2int( XX[ i ] * 131072.0f ); - } - for( i = 0; i < nb_subfr * LTP_ORDER; i++ ) { + } while ( ++i < nb_subfr * LTP_ORDER * LTP_ORDER ); + i = 0; + do { xX_Q17[ i ] = (opus_int32)silk_float2int( xX[ i ] * 131072.0f ); - } + } while ( ++i < nb_subfr * LTP_ORDER ); silk_quant_LTP_gains( B_Q14, cbk_index, periodicity_index, sum_log_gain_Q7, &pred_gain_dB_Q7, XX_Q17, xX_Q17, subfr_len, nb_subfr, arch ); diff --git a/opus/silk/float/x86/inner_product_FLP_avx2.c b/opus/silk/float/x86/inner_product_FLP_avx2.c new file mode 100644 index 00000000..4a2daaf5 --- /dev/null +++ b/opus/silk/float/x86/inner_product_FLP_avx2.c @@ -0,0 +1,85 @@ +/*********************************************************************** +Copyright (c) 2006-2011, Skype Limited. All rights reserved. + 2023 Amazon +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: +- Redistributions of source code must retain the above copyright notice, +this list of conditions and the following disclaimer. +- Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. +- Neither the name of Internet Society, IETF or IETF Trust, nor the +names of specific contributors, may be used to endorse or promote +products derived from this software without specific prior written +permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. +***********************************************************************/ + +#ifdef HAVE_CONFIG_H +#include "config.h" +#endif + +#include "SigProc_FLP.h" +#include + + +/* inner product of two silk_float arrays, with result as double */ +double silk_inner_product_FLP_avx2( + const silk_float *data1, + const silk_float *data2, + opus_int dataSize +) +{ + opus_int i; + __m256d accum1, accum2; + double result; + + /* 4x unrolled loop */ + result = 0.0; + accum1 = accum2 = _mm256_setzero_pd(); + for( i = 0; i < dataSize - 7; i += 8 ) { + __m128 x1f, x2f; + __m256d x1d, x2d; + x1f = _mm_loadu_ps( &data1[ i ] ); + x2f = _mm_loadu_ps( &data2[ i ] ); + x1d = _mm256_cvtps_pd( x1f ); + x2d = _mm256_cvtps_pd( x2f ); + accum1 = _mm256_fmadd_pd( x1d, x2d, accum1 ); + x1f = _mm_loadu_ps( &data1[ i + 4 ] ); + x2f = _mm_loadu_ps( &data2[ i + 4 ] ); + x1d = _mm256_cvtps_pd( x1f ); + x2d = _mm256_cvtps_pd( x2f ); + accum2 = _mm256_fmadd_pd( x1d, x2d, accum2 ); + } + for( ; i < dataSize - 3; i += 4 ) { + __m128 x1f, x2f; + __m256d x1d, x2d; + x1f = _mm_loadu_ps( &data1[ i ] ); + x2f = _mm_loadu_ps( &data2[ i ] ); + x1d = _mm256_cvtps_pd( x1f ); + x2d = _mm256_cvtps_pd( x2f ); + accum1 = _mm256_fmadd_pd( x1d, x2d, accum1 ); + } + accum1 = _mm256_add_pd(accum1, accum2); + accum1 = _mm256_add_pd(accum1, _mm256_permute2f128_pd(accum1, accum1, 1)); + accum1 = _mm256_hadd_pd(accum1,accum1); + result = _mm256_cvtsd_f64(accum1); + + /* add any remaining products */ + for( ; i < dataSize; i++ ) { + result += data1[ i ] * (double)data2[ i ]; + } + + return result; +} diff --git a/opus/silk/init_decoder.c b/opus/silk/init_decoder.c index 16c03dcd..01bc4b7a 100644 --- a/opus/silk/init_decoder.c +++ b/opus/silk/init_decoder.c @@ -31,15 +31,21 @@ POSSIBILITY OF SUCH DAMAGE. #include "main.h" +#ifdef ENABLE_OSCE +#include "osce.h" +#endif + +#include "structs.h" + /************************/ -/* Init Decoder State */ +/* Reset Decoder State */ /************************/ -opus_int silk_init_decoder( +opus_int silk_reset_decoder( silk_decoder_state *psDec /* I/O Decoder state pointer */ ) { /* Clear the entire encoder state, except anything copied */ - silk_memset( psDec, 0, sizeof( silk_decoder_state ) ); + silk_memset( &psDec->SILK_DECODER_STATE_RESET_START, 0, sizeof( silk_decoder_state ) - ((char*) &psDec->SILK_DECODER_STATE_RESET_START - (char*)psDec) ); /* Used to deactivate LSF interpolation */ psDec->first_frame_after_reset = 1; @@ -52,6 +58,27 @@ opus_int silk_init_decoder( /* Reset PLC state */ silk_PLC_Reset( psDec ); +#ifdef ENABLE_OSCE + /* Reset OSCE state and method */ + osce_reset(&psDec->osce, OSCE_DEFAULT_METHOD); +#endif + + return 0; +} + + +/************************/ +/* Init Decoder State */ +/************************/ +opus_int silk_init_decoder( + silk_decoder_state *psDec /* I/O Decoder state pointer */ +) +{ + /* Clear the entire encoder state, except anything copied */ + silk_memset( psDec, 0, sizeof( silk_decoder_state ) ); + + silk_reset_decoder( psDec ); + return(0); } diff --git a/opus/silk/init_encoder.c b/opus/silk/init_encoder.c index 65995c33..10d41287 100644 --- a/opus/silk/init_encoder.c +++ b/opus/silk/init_encoder.c @@ -36,6 +36,10 @@ POSSIBILITY OF SUCH DAMAGE. #include "tuning_parameters.h" #include "cpu_support.h" +#ifdef ENABLE_DRED +#include "dred_encoder.h" +#endif + /*********************************/ /* Initialize Silk Encoder state */ /*********************************/ diff --git a/opus/silk/main.h b/opus/silk/main.h index 1a33eed5..cd576d8c 100644 --- a/opus/silk/main.h +++ b/opus/silk/main.h @@ -247,21 +247,21 @@ void silk_VQ_WMat_EC_c( /************************************/ void silk_NSQ_c( - const silk_encoder_state *psEncC, /* I Encoder State */ - silk_nsq_state *NSQ, /* I/O NSQ state */ - SideInfoIndices *psIndices, /* I/O Quantization Indices */ - const opus_int16 x16[], /* I Input */ - opus_int8 pulses[], /* O Quantized pulse signal */ - const opus_int16 PredCoef_Q12[ 2 * MAX_LPC_ORDER ], /* I Short term prediction coefs */ - const opus_int16 LTPCoef_Q14[ LTP_ORDER * MAX_NB_SUBFR ], /* I Long term prediction coefs */ - const opus_int16 AR_Q13[ MAX_NB_SUBFR * MAX_SHAPE_LPC_ORDER ], /* I Noise shaping coefs */ - const opus_int HarmShapeGain_Q14[ MAX_NB_SUBFR ], /* I Long term shaping coefs */ - const opus_int Tilt_Q14[ MAX_NB_SUBFR ], /* I Spectral tilt */ - const opus_int32 LF_shp_Q14[ MAX_NB_SUBFR ], /* I Low frequency shaping coefs */ - const opus_int32 Gains_Q16[ MAX_NB_SUBFR ], /* I Quantization step sizes */ - const opus_int pitchL[ MAX_NB_SUBFR ], /* I Pitch lags */ - const opus_int Lambda_Q10, /* I Rate/distortion tradeoff */ - const opus_int LTP_scale_Q14 /* I LTP state scaling */ + const silk_encoder_state *psEncC, /* I Encoder State */ + silk_nsq_state *NSQ, /* I/O NSQ state */ + SideInfoIndices *psIndices, /* I/O Quantization Indices */ + const opus_int16 x16[], /* I Input */ + opus_int8 pulses[], /* O Quantized pulse signal */ + const opus_int16 *PredCoef_Q12, /* I Short term prediction coefs */ + const opus_int16 LTPCoef_Q14[ LTP_ORDER * MAX_NB_SUBFR ], /* I Long term prediction coefs */ + const opus_int16 AR_Q13[ MAX_NB_SUBFR * MAX_SHAPE_LPC_ORDER ], /* I Noise shaping coefs */ + const opus_int HarmShapeGain_Q14[ MAX_NB_SUBFR ], /* I Long term shaping coefs */ + const opus_int Tilt_Q14[ MAX_NB_SUBFR ], /* I Spectral tilt */ + const opus_int32 LF_shp_Q14[ MAX_NB_SUBFR ], /* I Low frequency shaping coefs */ + const opus_int32 Gains_Q16[ MAX_NB_SUBFR ], /* I Quantization step sizes */ + const opus_int pitchL[ MAX_NB_SUBFR ], /* I Pitch lags */ + const opus_int Lambda_Q10, /* I Rate/distortion tradeoff */ + const opus_int LTP_scale_Q14 /* I LTP state scaling */ ); #if !defined(OVERRIDE_silk_NSQ) @@ -273,21 +273,21 @@ void silk_NSQ_c( /* Noise shaping using delayed decision */ void silk_NSQ_del_dec_c( - const silk_encoder_state *psEncC, /* I Encoder State */ - silk_nsq_state *NSQ, /* I/O NSQ state */ - SideInfoIndices *psIndices, /* I/O Quantization Indices */ - const opus_int16 x16[], /* I Input */ - opus_int8 pulses[], /* O Quantized pulse signal */ - const opus_int16 PredCoef_Q12[ 2 * MAX_LPC_ORDER ], /* I Short term prediction coefs */ - const opus_int16 LTPCoef_Q14[ LTP_ORDER * MAX_NB_SUBFR ], /* I Long term prediction coefs */ - const opus_int16 AR_Q13[ MAX_NB_SUBFR * MAX_SHAPE_LPC_ORDER ], /* I Noise shaping coefs */ - const opus_int HarmShapeGain_Q14[ MAX_NB_SUBFR ], /* I Long term shaping coefs */ - const opus_int Tilt_Q14[ MAX_NB_SUBFR ], /* I Spectral tilt */ - const opus_int32 LF_shp_Q14[ MAX_NB_SUBFR ], /* I Low frequency shaping coefs */ - const opus_int32 Gains_Q16[ MAX_NB_SUBFR ], /* I Quantization step sizes */ - const opus_int pitchL[ MAX_NB_SUBFR ], /* I Pitch lags */ - const opus_int Lambda_Q10, /* I Rate/distortion tradeoff */ - const opus_int LTP_scale_Q14 /* I LTP state scaling */ + const silk_encoder_state *psEncC, /* I Encoder State */ + silk_nsq_state *NSQ, /* I/O NSQ state */ + SideInfoIndices *psIndices, /* I/O Quantization Indices */ + const opus_int16 x16[], /* I Input */ + opus_int8 pulses[], /* O Quantized pulse signal */ + const opus_int16 *PredCoef_Q12, /* I Short term prediction coefs */ + const opus_int16 LTPCoef_Q14[ LTP_ORDER * MAX_NB_SUBFR ], /* I Long term prediction coefs */ + const opus_int16 AR_Q13[ MAX_NB_SUBFR * MAX_SHAPE_LPC_ORDER ], /* I Noise shaping coefs */ + const opus_int HarmShapeGain_Q14[ MAX_NB_SUBFR ], /* I Long term shaping coefs */ + const opus_int Tilt_Q14[ MAX_NB_SUBFR ], /* I Spectral tilt */ + const opus_int32 LF_shp_Q14[ MAX_NB_SUBFR ], /* I Low frequency shaping coefs */ + const opus_int32 Gains_Q16[ MAX_NB_SUBFR ], /* I Quantization step sizes */ + const opus_int pitchL[ MAX_NB_SUBFR ], /* I Pitch lags */ + const opus_int Lambda_Q10, /* I Rate/distortion tradeoff */ + const opus_int LTP_scale_Q14 /* I LTP state scaling */ ); #if !defined(OVERRIDE_silk_NSQ_del_dec) @@ -389,6 +389,10 @@ void silk_NLSF_decode( /****************************************************/ /* Decoder Functions */ /****************************************************/ +opus_int silk_reset_decoder( + silk_decoder_state *psDec /* I/O Decoder state pointer */ +); + opus_int silk_init_decoder( silk_decoder_state *psDec /* I/O Decoder state pointer */ ); @@ -410,6 +414,12 @@ opus_int silk_decode_frame( opus_int32 *pN, /* O Pointer to size of output frame */ opus_int lostFlag, /* I 0: no loss, 1 loss, 2 decode fec */ opus_int condCoding, /* I The type of conditional coding to use */ +#ifdef ENABLE_DEEP_PLC + LPCNetPLCState *lpcnet, +#endif +#ifdef ENABLE_OSCE + OSCEModel *osce_model, +#endif int arch /* I Run-time architecture */ ); diff --git a/opus/silk/mips/NSQ_del_dec_mipsr1.h b/opus/silk/mips/NSQ_del_dec_mipsr1.h index cd70713a..85bfb637 100644 --- a/opus/silk/mips/NSQ_del_dec_mipsr1.h +++ b/opus/silk/mips/NSQ_del_dec_mipsr1.h @@ -25,8 +25,8 @@ ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ***********************************************************************/ -#ifndef __NSQ_DEL_DEC_MIPSR1_H__ -#define __NSQ_DEL_DEC_MIPSR1_H__ +#ifndef NSQ_DEL_DEC_MIPSR1_H__ +#define NSQ_DEL_DEC_MIPSR1_H__ #ifdef HAVE_CONFIG_H #include "config.h" @@ -407,4 +407,4 @@ static inline void silk_noise_shape_quantizer_del_dec( } } -#endif /* __NSQ_DEL_DEC_MIPSR1_H__ */ +#endif /* NSQ_DEL_DEC_MIPSR1_H__ */ diff --git a/opus/silk/mips/macros_mipsr1.h b/opus/silk/mips/macros_mipsr1.h index 12ed981a..af408802 100644 --- a/opus/silk/mips/macros_mipsr1.h +++ b/opus/silk/mips/macros_mipsr1.h @@ -26,8 +26,8 @@ POSSIBILITY OF SUCH DAMAGE. ***********************************************************************/ -#ifndef __SILK_MACROS_MIPSR1_H__ -#define __SILK_MACROS_MIPSR1_H__ +#ifndef SILK_MACROS_MIPSR1_H__ +#define SILK_MACROS_MIPSR1_H__ #define mips_clz(x) __builtin_clz(x) @@ -89,4 +89,4 @@ static inline opus_int32 silk_CLZ32(opus_int32 in32) return re32; } -#endif /* __SILK_MACROS_MIPSR1_H__ */ +#endif /* SILK_MACROS_MIPSR1_H__ */ diff --git a/opus/silk/stereo_LR_to_MS.c b/opus/silk/stereo_LR_to_MS.c index c8226663..751452cb 100644 --- a/opus/silk/stereo_LR_to_MS.c +++ b/opus/silk/stereo_LR_to_MS.c @@ -77,7 +77,7 @@ void silk_stereo_LR_to_MS( ALLOC( LP_mid, frame_length, opus_int16 ); ALLOC( HP_mid, frame_length, opus_int16 ); for( n = 0; n < frame_length; n++ ) { - sum = silk_RSHIFT_ROUND( silk_ADD_LSHIFT( mid[ n ] + (opus_int32)mid[ n + 2 ], mid[ n + 1 ], 1 ), 2 ); + sum = silk_RSHIFT_ROUND( silk_ADD_LSHIFT32( mid[ n ] + (opus_int32)mid[ n + 2 ], mid[ n + 1 ], 1 ), 2 ); LP_mid[ n ] = sum; HP_mid[ n ] = mid[ n + 1 ] - sum; } @@ -86,7 +86,7 @@ void silk_stereo_LR_to_MS( ALLOC( LP_side, frame_length, opus_int16 ); ALLOC( HP_side, frame_length, opus_int16 ); for( n = 0; n < frame_length; n++ ) { - sum = silk_RSHIFT_ROUND( silk_ADD_LSHIFT( side[ n ] + (opus_int32)side[ n + 2 ], side[ n + 1 ], 1 ), 2 ); + sum = silk_RSHIFT_ROUND( silk_ADD_LSHIFT32( side[ n ] + (opus_int32)side[ n + 2 ], side[ n + 1 ], 1 ), 2 ); LP_side[ n ] = sum; HP_side[ n ] = side[ n + 1 ] - sum; } @@ -207,7 +207,7 @@ void silk_stereo_LR_to_MS( pred0_Q13 += delta0_Q13; pred1_Q13 += delta1_Q13; w_Q24 += deltaw_Q24; - sum = silk_LSHIFT( silk_ADD_LSHIFT( mid[ n ] + (opus_int32)mid[ n + 2 ], mid[ n + 1 ], 1 ), 9 ); /* Q11 */ + sum = silk_LSHIFT( silk_ADD_LSHIFT32( mid[ n ] + (opus_int32)mid[ n + 2 ], mid[ n + 1 ], 1 ), 9 ); /* Q11 */ sum = silk_SMLAWB( silk_SMULWB( w_Q24, side[ n + 1 ] ), sum, pred0_Q13 ); /* Q8 */ sum = silk_SMLAWB( sum, silk_LSHIFT( (opus_int32)mid[ n + 1 ], 11 ), pred1_Q13 ); /* Q8 */ x2[ n - 1 ] = (opus_int16)silk_SAT16( silk_RSHIFT_ROUND( sum, 8 ) ); @@ -217,7 +217,7 @@ void silk_stereo_LR_to_MS( pred1_Q13 = -pred_Q13[ 1 ]; w_Q24 = silk_LSHIFT( width_Q14, 10 ); for( n = STEREO_INTERP_LEN_MS * fs_kHz; n < frame_length; n++ ) { - sum = silk_LSHIFT( silk_ADD_LSHIFT( mid[ n ] + (opus_int32)mid[ n + 2 ], mid[ n + 1 ], 1 ), 9 ); /* Q11 */ + sum = silk_LSHIFT( silk_ADD_LSHIFT32( mid[ n ] + (opus_int32)mid[ n + 2 ], mid[ n + 1 ], 1 ), 9 ); /* Q11 */ sum = silk_SMLAWB( silk_SMULWB( w_Q24, side[ n + 1 ] ), sum, pred0_Q13 ); /* Q8 */ sum = silk_SMLAWB( sum, silk_LSHIFT( (opus_int32)mid[ n + 1 ], 11 ), pred1_Q13 ); /* Q8 */ x2[ n - 1 ] = (opus_int16)silk_SAT16( silk_RSHIFT_ROUND( sum, 8 ) ); diff --git a/opus/silk/stereo_MS_to_LR.c b/opus/silk/stereo_MS_to_LR.c index 62521a4f..1e01bb6e 100644 --- a/opus/silk/stereo_MS_to_LR.c +++ b/opus/silk/stereo_MS_to_LR.c @@ -59,7 +59,7 @@ void silk_stereo_MS_to_LR( for( n = 0; n < STEREO_INTERP_LEN_MS * fs_kHz; n++ ) { pred0_Q13 += delta0_Q13; pred1_Q13 += delta1_Q13; - sum = silk_LSHIFT( silk_ADD_LSHIFT( x1[ n ] + x1[ n + 2 ], x1[ n + 1 ], 1 ), 9 ); /* Q11 */ + sum = silk_LSHIFT( silk_ADD_LSHIFT32( x1[ n ] + (opus_int32)x1[ n + 2 ], x1[ n + 1 ], 1 ), 9 ); /* Q11 */ sum = silk_SMLAWB( silk_LSHIFT( (opus_int32)x2[ n + 1 ], 8 ), sum, pred0_Q13 ); /* Q8 */ sum = silk_SMLAWB( sum, silk_LSHIFT( (opus_int32)x1[ n + 1 ], 11 ), pred1_Q13 ); /* Q8 */ x2[ n + 1 ] = (opus_int16)silk_SAT16( silk_RSHIFT_ROUND( sum, 8 ) ); @@ -67,7 +67,7 @@ void silk_stereo_MS_to_LR( pred0_Q13 = pred_Q13[ 0 ]; pred1_Q13 = pred_Q13[ 1 ]; for( n = STEREO_INTERP_LEN_MS * fs_kHz; n < frame_length; n++ ) { - sum = silk_LSHIFT( silk_ADD_LSHIFT( x1[ n ] + x1[ n + 2 ], x1[ n + 1 ], 1 ), 9 ); /* Q11 */ + sum = silk_LSHIFT( silk_ADD_LSHIFT32( x1[ n ] + (opus_int32)x1[ n + 2 ], x1[ n + 1 ], 1 ), 9 ); /* Q11 */ sum = silk_SMLAWB( silk_LSHIFT( (opus_int32)x2[ n + 1 ], 8 ), sum, pred0_Q13 ); /* Q8 */ sum = silk_SMLAWB( sum, silk_LSHIFT( (opus_int32)x1[ n + 1 ], 11 ), pred1_Q13 ); /* Q8 */ x2[ n + 1 ] = (opus_int16)silk_SAT16( silk_RSHIFT_ROUND( sum, 8 ) ); diff --git a/opus/silk/structs.h b/opus/silk/structs.h index 3380c757..38243be1 100644 --- a/opus/silk/structs.h +++ b/opus/silk/structs.h @@ -34,6 +34,21 @@ POSSIBILITY OF SUCH DAMAGE. #include "entenc.h" #include "entdec.h" +#ifdef ENABLE_DEEP_PLC +#include "lpcnet.h" +#include "lpcnet_private.h" +#endif + +#ifdef ENABLE_DRED +#include "dred_encoder.h" +#include "dred_decoder.h" +#endif + +#ifdef ENABLE_OSCE +#include "osce_config.h" +#include "osce_structs.h" +#endif + #ifdef __cplusplus extern "C" { @@ -228,6 +243,14 @@ typedef struct { } silk_encoder_state; +#ifdef ENABLE_OSCE +typedef struct { + OSCEFeatureState features; + OSCEState state; + int method; +} silk_OSCE_struct; +#endif + /* Struct for Packet Loss Concealment */ typedef struct { opus_int32 pitchL_Q8; /* Pitch lag to use for voiced concealment */ @@ -243,6 +266,7 @@ typedef struct { opus_int fs_kHz; opus_int nb_subfr; opus_int subfr_length; + opus_int enable_deep_plc; } silk_PLC_struct; /* Struct for CNG */ @@ -259,6 +283,10 @@ typedef struct { /* Decoder state */ /********************************/ typedef struct { +#ifdef ENABLE_OSCE + silk_OSCE_struct osce; +#endif +#define SILK_DECODER_STATE_RESET_START prev_gain_Q16 opus_int32 prev_gain_Q16; opus_int32 exc_Q14[ MAX_FRAME_LENGTH ]; opus_int32 sLPC_Q14_buf[ MAX_LPC_ORDER ]; diff --git a/opus/silk/tests/test_unit_LPC_inv_pred_gain.c b/opus/silk/tests/test_unit_LPC_inv_pred_gain.c index 67067cea..acdd31af 100644 --- a/opus/silk/tests/test_unit_LPC_inv_pred_gain.c +++ b/opus/silk/tests/test_unit_LPC_inv_pred_gain.c @@ -43,6 +43,7 @@ int check_stability(opus_int16 *A_Q12, int order) { int i; int j; int sum_a, sum_abs_a; + double y[SILK_MAX_ORDER_LPC] = {0}; sum_a = sum_abs_a = 0; for( j = 0; j < order; j++ ) { sum_a += A_Q12[ j ]; @@ -57,7 +58,6 @@ int check_stability(opus_int16 *A_Q12, int order) { if( sum_abs_a < 4096 ) { return 1; } - double y[SILK_MAX_ORDER_LPC] = {0}; y[0] = 1; for( i = 0; i < 10000; i++ ) { double sum = 0; @@ -125,5 +125,6 @@ int main(void) { } } printf("silk_LPC_inverse_pred_gain() optimization passed\n"); + RESTORE_STACK; return 0; } diff --git a/opus/silk/typedef.h b/opus/silk/typedef.h index 97b7e709..793d2c0c 100644 --- a/opus/silk/typedef.h +++ b/opus/silk/typedef.h @@ -67,6 +67,9 @@ __attribute__((noreturn)) static OPUS_INLINE void _silk_fatal(const char *str, const char *file, int line) { fprintf (stderr, "Fatal (internal) error in %s, line %d: %s\n", file, line, str); +#if defined(_MSC_VER) + _set_abort_behavior( 0, _WRITE_ABORT_MSG); +#endif abort(); } # define silk_assert(COND) {if (!(COND)) {silk_fatal("assertion failed: " #COND);}} diff --git a/opus/silk/x86/NSQ_del_dec_avx2.c b/opus/silk/x86/NSQ_del_dec_avx2.c new file mode 100644 index 00000000..43485871 --- /dev/null +++ b/opus/silk/x86/NSQ_del_dec_avx2.c @@ -0,0 +1,1075 @@ +/*********************************************************************** +Copyright (c) 2021 Google Inc. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: +- Redistributions of source code must retain the above copyright notice, +this list of conditions and the following disclaimer. +- Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. +- Neither the name of Internet Society, IETF or IETF Trust, nor the +names of specific contributors, may be used to endorse or promote +products derived from this software without specific prior written +permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. +***********************************************************************/ + +#ifdef HAVE_CONFIG_H +#include "config.h" +#endif + +#ifdef OPUS_CHECK_ASM +#include +#endif + +#include "opus_defines.h" +#include + +#include "main.h" +#include "stack_alloc.h" +#include "NSQ.h" +#include "celt/x86/x86cpu.h" + +/* Returns TRUE if all assumptions met */ +static OPUS_INLINE int verify_assumptions(const silk_encoder_state *psEncC) +{ + /* This optimization is based on these assumptions */ + /* These assumptions are fundamental and hence assert are */ + /* used. Should any assert triggers, we have to re-visit */ + /* all related code to make sure it still functions the */ + /* same as the C implementation. */ + silk_assert(MAX_DEL_DEC_STATES <= 4 && + MAX_FRAME_LENGTH % 4 == 0 && + MAX_SUB_FRAME_LENGTH % 4 == 0 && + LTP_MEM_LENGTH_MS % 4 == 0 ); + silk_assert(psEncC->fs_kHz == 8 || + psEncC->fs_kHz == 12 || + psEncC->fs_kHz == 16 ); + silk_assert(psEncC->nb_subfr <= MAX_NB_SUBFR && + psEncC->nb_subfr > 0 ); + silk_assert(psEncC->nStatesDelayedDecision <= MAX_DEL_DEC_STATES && + psEncC->nStatesDelayedDecision > 0 ); + silk_assert(psEncC->ltp_mem_length == psEncC->fs_kHz * LTP_MEM_LENGTH_MS); + + /* Regressions were observed on certain AMD Zen CPUs when */ + /* nStatesDelayedDecision is 1 or 2. Ideally we should detect */ + /* these CPUs and enable this optimization on others; however, */ + /* there is no good way to do so under current OPUS framework. */ + return psEncC->nStatesDelayedDecision == 3 || + psEncC->nStatesDelayedDecision == 4; +} + +/* Intrinsics not defined on MSVC */ +#ifdef _MSC_VER +#include +#define __m128i_u __m128i +static inline int __builtin_sadd_overflow(opus_int32 a, opus_int32 b, opus_int32* res) +{ + *res = a+b; + return (*res ^ a) & (*res ^ b) & 0x80000000; +} +static inline int __builtin_ctz(unsigned int x) +{ + DWORD res = 0; + return _BitScanForward(&res, x) ? res : 32; +} +#endif + +static OPUS_INLINE __m128i silk_cvtepi64_epi32_high(__m256i num) +{ + return _mm256_castsi256_si128(_mm256_permutevar8x32_epi32(num, _mm256_set_epi32(0, 0, 0, 0, 7, 5, 3, 1))); +} + +static OPUS_INLINE opus_int16 silk_sat16(opus_int32 num) +{ + num = num > silk_int16_MAX ? silk_int16_MAX : num; + num = num < silk_int16_MIN ? silk_int16_MIN : num; + return num; +} + +static OPUS_INLINE opus_int32 silk_sar_round_32(opus_int32 a, int bits) +{ + silk_assert(bits > 0 && bits < 31); + a += 1 << (bits-1); + return a >> bits; +} + +static OPUS_INLINE opus_int64 silk_sar_round_smulww(opus_int32 a, opus_int32 b, int bits) +{ + silk_assert(bits > 0 && bits < 63); +#ifdef OPUS_CHECK_ASM + return silk_RSHIFT_ROUND(silk_SMULWW(a, b), bits); +#else + /* This code is more correct, but it won't overflow like the C code in some rare cases. */ + silk_assert(bits > 0 && bits < 63); + opus_int64 t = ((opus_int64)a) * ((opus_int64)b); + bits += 16; + t += 1ull << (bits-1); + return t >> bits; +#endif +} + +static OPUS_INLINE opus_int32 silk_add_sat32(opus_int32 a, opus_int32 b) +{ + opus_int32 sum; + if (__builtin_sadd_overflow(a, b, &sum)) + { + return a >= 0 ? silk_int32_MAX : silk_int32_MIN; + } + return sum; +} + +static OPUS_INLINE __m128i silk_mm_srai_round_epi32(__m128i a, int bits) +{ + silk_assert(bits > 0 && bits < 31); + return _mm_srai_epi32(_mm_add_epi32(a, _mm_set1_epi32(1 << (bits - 1))), bits); +} + +/* add/subtract with output saturated */ +static OPUS_INLINE __m128i silk_mm_add_sat_epi32(__m128i a, __m128i b) +{ + __m128i r = _mm_add_epi32(a, b); + __m128i OF = _mm_and_si128(_mm_xor_si128(a, r), _mm_xor_si128(b, r)); /* OF = (sum ^ a) & (sum ^ b) */ + __m128i SAT = _mm_add_epi32(_mm_srli_epi32(a, 31), _mm_set1_epi32(0x7FFFFFFF)); /* SAT = (a >> 31) + 0x7FFFFFFF */ + return _mm_blendv_epi8(r, SAT, _mm_srai_epi32(OF, 31)); +} +static OPUS_INLINE __m128i silk_mm_sub_sat_epi32(__m128i a, __m128i b) +{ + __m128i r = _mm_sub_epi32(a, b); + __m128i OF = _mm_andnot_si128(_mm_xor_si128(b, r), _mm_xor_si128(a, r)); /* OF = (sum ^ a) & (sum ^ ~b) = (sum ^ a) & ~(sum ^ b) */ + __m128i SAT = _mm_add_epi32(_mm_srli_epi32(a, 31), _mm_set1_epi32(0x7FFFFFFF)); /* SAT = (a >> 31) + 0x7FFFFFFF */ + return _mm_blendv_epi8(r, SAT, _mm_srai_epi32(OF, 31)); +} +static OPUS_INLINE __m256i silk_mm256_sub_sat_epi32(__m256i a, __m256i b) +{ + __m256i r = _mm256_sub_epi32(a, b); + __m256i OF = _mm256_andnot_si256(_mm256_xor_si256(b, r), _mm256_xor_si256(a, r)); /* OF = (sum ^ a) & (sum ^ ~b) = (sum ^ a) & ~(sum ^ b) */ + __m256i SAT = _mm256_add_epi32(_mm256_srli_epi32(a, 31), _mm256_set1_epi32(0x7FFFFFFF)); /* SAT = (a >> 31) + 0x7FFFFFFF */ + return _mm256_blendv_epi8(r, SAT, _mm256_srai_epi32(OF, 31)); +} + +static OPUS_INLINE __m128i silk_mm_limit_epi32(__m128i num, opus_int32 limit1, opus_int32 limit2) +{ + opus_int32 lo = limit1 < limit2 ? limit1 : limit2; + opus_int32 hi = limit1 > limit2 ? limit1 : limit2; + + num = _mm_min_epi32(num, _mm_set1_epi32(hi)); + num = _mm_max_epi32(num, _mm_set1_epi32(lo)); + return num; +} + +/* cond < 0 ? -num : num */ +static OPUS_INLINE __m128i silk_mm_sign_epi32(__m128i num, __m128i cond) +{ + return _mm_sign_epi32(num, _mm_or_si128(cond, _mm_set1_epi32(1))); +} +static OPUS_INLINE __m256i silk_mm256_sign_epi32(__m256i num, __m256i cond) +{ + return _mm256_sign_epi32(num, _mm256_or_si256(cond, _mm256_set1_epi32(1))); +} + +/* (a32 * b32) >> 16 */ +static OPUS_INLINE __m128i silk_mm_smulww_epi32(__m128i a, opus_int32 b) +{ + return silk_cvtepi64_epi32_high(_mm256_slli_epi64(_mm256_mul_epi32(_mm256_cvtepi32_epi64(a), _mm256_set1_epi32(b)), 16)); +} + +/* (a32 * (opus_int32)((opus_int16)(b32))) >> 16 output have to be 32bit int */ +static OPUS_INLINE __m128i silk_mm_smulwb_epi32(__m128i a, opus_int32 b) +{ + return silk_cvtepi64_epi32_high(_mm256_mul_epi32(_mm256_cvtepi32_epi64(a), _mm256_set1_epi32(silk_LSHIFT(b, 16)))); +} + +/* (opus_int32)((opus_int16)(a3))) * (opus_int32)((opus_int16)(b32)) output have to be 32bit int */ +static OPUS_INLINE __m256i silk_mm256_smulbb_epi32(__m256i a, __m256i b) +{ + const char FF = (char)0xFF; + __m256i msk = _mm256_set_epi8( + FF, FF, FF, FF, FF, FF, FF, FF, 13, 12, 9, 8, 5, 4, 1, 0, + FF, FF, FF, FF, FF, FF, FF, FF, 13, 12, 9, 8, 5, 4, 1, 0); + __m256i lo = _mm256_mullo_epi16(a, b); + __m256i hi = _mm256_mulhi_epi16(a, b); + lo = _mm256_shuffle_epi8(lo, msk); + hi = _mm256_shuffle_epi8(hi, msk); + return _mm256_unpacklo_epi16(lo, hi); +} + +static OPUS_INLINE __m256i silk_mm256_reverse_epi32(__m256i v) +{ + v = _mm256_shuffle_epi32(v, 0x1B); + v = _mm256_permute4x64_epi64(v, 0x4E); + return v; +} + +static OPUS_INLINE opus_int32 silk_mm256_hsum_epi32(__m256i v) +{ + __m128i sum = _mm_add_epi32(_mm256_extracti128_si256(v, 1), _mm256_extracti128_si256(v, 0)); + sum = _mm_add_epi32(sum, _mm_shuffle_epi32(sum, 0x4E)); + sum = _mm_add_epi32(sum, _mm_shuffle_epi32(sum, 0xB1)); + return _mm_cvtsi128_si32(sum); +} + +static OPUS_INLINE __m128i silk_mm_hmin_epi32(__m128i num) +{ + num = _mm_min_epi32(num, _mm_shuffle_epi32(num, 0x4E)); /* 0123 -> 2301 */ + num = _mm_min_epi32(num, _mm_shuffle_epi32(num, 0xB1)); /* 0123 -> 1032 */ + return num; +} + +static OPUS_INLINE __m128i silk_mm_hmax_epi32(__m128i num) +{ + num = _mm_max_epi32(num, _mm_shuffle_epi32(num, 0x4E)); /* 0123 -> 2310 */ + num = _mm_max_epi32(num, _mm_shuffle_epi32(num, 0xB1)); /* 0123 -> 1032 */ + return num; +} + +static OPUS_INLINE __m128i silk_mm_mask_hmin_epi32(__m128i num, __m128i mask) +{ + num = _mm_blendv_epi8(num, _mm_set1_epi32(silk_int32_MAX), mask); + return silk_mm_hmin_epi32(num); +} + +static OPUS_INLINE __m128i silk_mm_mask_hmax_epi32(__m128i num, __m128i mask) +{ + num = _mm_blendv_epi8(num, _mm_set1_epi32(silk_int32_MIN), mask); + return silk_mm_hmax_epi32(num); +} + +static OPUS_INLINE __m128i silk_mm256_rand_epi32(__m128i seed) +{ + seed = _mm_mullo_epi32(seed, _mm_set1_epi32(RAND_MULTIPLIER)); + seed = _mm_add_epi32(seed, _mm_set1_epi32(RAND_INCREMENT)); + return seed; +} + +static OPUS_INLINE opus_int32 silk_index_of_first_equal_epi32(__m128i a, __m128i b) +{ + unsigned int mask = _mm_movemask_epi8(_mm_cmpeq_epi32(a, b)) & 0x1111; + silk_assert(mask != 0); + return __builtin_ctz(mask) >> 2; +} + +static __m128i silk_index_to_selector(opus_int32 index) +{ + silk_assert(index < 4); + index <<= 2; + return _mm_set_epi8( + index + 3, index + 2, index + 1, index + 0, + index + 3, index + 2, index + 1, index + 0, + index + 3, index + 2, index + 1, index + 0, + index + 3, index + 2, index + 1, index + 0); +} + +static opus_int32 silk_select_winner(__m128i num, __m128i selector) +{ + return _mm_cvtsi128_si32(_mm_shuffle_epi8(num, selector)); +} + +typedef struct +{ + __m128i RandState; + __m128i Q_Q10; + __m128i Xq_Q14; + __m128i Pred_Q15; + __m128i Shape_Q14; +} NSQ_del_dec_sample_struct; + +typedef struct +{ + __m128i sLPC_Q14[MAX_SUB_FRAME_LENGTH + NSQ_LPC_BUF_LENGTH]; + __m128i LF_AR_Q14; + __m128i Seed; + __m128i SeedInit; + __m128i RD_Q10; + __m128i Diff_Q14; + __m128i sAR2_Q14[MAX_SHAPE_LPC_ORDER]; + NSQ_del_dec_sample_struct Samples[DECISION_DELAY]; +} NSQ_del_dec_struct; + +static OPUS_INLINE void silk_nsq_del_dec_scale_states_avx2( + const silk_encoder_state *psEncC, /* I Encoder State */ + silk_nsq_state *NSQ, /* I/O NSQ state */ + NSQ_del_dec_struct *psDelDec, /* I/O Delayed decision states */ + const opus_int16 x16[], /* I Input */ + opus_int32 x_sc_Q10[MAX_SUB_FRAME_LENGTH], /* O Input scaled with 1/Gain in Q10 */ + const opus_int16 sLTP[], /* I Re-whitened LTP state in Q0 */ + opus_int32 sLTP_Q15[], /* O LTP state matching scaled input */ + opus_int subfr, /* I Subframe number */ + const opus_int LTP_scale_Q14, /* I LTP state scaling */ + const opus_int32 Gains_Q16[MAX_NB_SUBFR], /* I */ + const opus_int pitchL[MAX_NB_SUBFR], /* I Pitch lag */ + const opus_int signal_type, /* I Signal type */ + const opus_int decisionDelay /* I Decision delay */ +); + +/*******************************************/ +/* LPC analysis filter */ +/* NB! State is kept internally and the */ +/* filter always starts with zero state */ +/* first d output samples are set to zero */ +/*******************************************/ +static OPUS_INLINE void silk_LPC_analysis_filter_avx2( + opus_int16 *out, /* O Output signal */ + const opus_int16 *in, /* I Input signal */ + const opus_int16 *B, /* I MA prediction coefficients, Q12 [order] */ + const opus_int32 len, /* I Signal length */ + const opus_int32 order /* I Filter order */ +); + +/******************************************/ +/* Noise shape quantizer for one subframe */ +/******************************************/ +static OPUS_INLINE void silk_noise_shape_quantizer_del_dec_avx2( + silk_nsq_state *NSQ, /* I/O NSQ state */ + NSQ_del_dec_struct psDelDec[], /* I/O Delayed decision states */ + opus_int signalType, /* I Signal type */ + const opus_int32 x_Q10[], /* I */ + opus_int8 pulses[], /* O */ + opus_int16 xq[], /* O */ + opus_int32 sLTP_Q15[], /* I/O LTP filter state */ + opus_int32 delayedGain_Q10[DECISION_DELAY], /* I/O Gain delay buffer */ + const opus_int16 a_Q12[], /* I Short term prediction coefs */ + const opus_int16 b_Q14[], /* I Long term prediction coefs */ + const opus_int16 AR_shp_Q13[], /* I Noise shaping coefs */ + opus_int lag, /* I Pitch lag */ + opus_int32 HarmShapeFIRPacked_Q14, /* I */ + opus_int Tilt_Q14, /* I Spectral tilt */ + opus_int32 LF_shp_Q14, /* I */ + opus_int32 Gain_Q16, /* I */ + opus_int Lambda_Q10, /* I */ + opus_int offset_Q10, /* I */ + opus_int length, /* I Input length */ + opus_int subfr, /* I Subframe number */ + opus_int shapingLPCOrder, /* I Shaping LPC filter order */ + opus_int predictLPCOrder, /* I Prediction filter order */ + opus_int warping_Q16, /* I */ + __m128i MaskDelDec, /* I Mask of states in decision tree */ + opus_int *smpl_buf_idx, /* I/O Index to newest samples in buffers */ + opus_int decisionDelay /* I */ +); + +void silk_NSQ_del_dec_avx2( + const silk_encoder_state *psEncC, /* I Encoder State */ + silk_nsq_state *NSQ, /* I/O NSQ state */ + SideInfoIndices *psIndices, /* I/O Quantization Indices */ + const opus_int16 x16[], /* I Input */ + opus_int8 pulses[], /* O Quantized pulse signal */ + const opus_int16 *PredCoef_Q12, /* I Short term prediction coefs */ + const opus_int16 LTPCoef_Q14[LTP_ORDER * MAX_NB_SUBFR], /* I Long term prediction coefs */ + const opus_int16 AR_Q13[MAX_NB_SUBFR * MAX_SHAPE_LPC_ORDER], /* I Noise shaping coefs */ + const opus_int HarmShapeGain_Q14[MAX_NB_SUBFR], /* I Long term shaping coefs */ + const opus_int Tilt_Q14[MAX_NB_SUBFR], /* I Spectral tilt */ + const opus_int32 LF_shp_Q14[MAX_NB_SUBFR], /* I Low frequency shaping coefs */ + const opus_int32 Gains_Q16[MAX_NB_SUBFR], /* I Quantization step sizes */ + const opus_int32 pitchL[MAX_NB_SUBFR], /* I Pitch lags */ + const opus_int Lambda_Q10, /* I Rate/distortion tradeoff */ + const opus_int LTP_scale_Q14 /* I LTP state scaling */ +) +{ +#ifdef OPUS_CHECK_ASM + silk_nsq_state NSQ_c; + SideInfoIndices psIndices_c; + opus_int8 pulses_c[MAX_FRAME_LENGTH]; + const opus_int8 *const pulses_a = pulses; + + silk_memcpy(&NSQ_c, NSQ, sizeof(NSQ_c)); + silk_memcpy(&psIndices_c, psIndices, sizeof(psIndices_c)); + silk_memcpy(pulses_c, pulses, sizeof(pulses_c)); + silk_NSQ_del_dec_c(psEncC, &NSQ_c, &psIndices_c, x16, pulses_c, PredCoef_Q12, LTPCoef_Q14, AR_Q13, HarmShapeGain_Q14, Tilt_Q14, LF_shp_Q14, Gains_Q16, + pitchL, Lambda_Q10, LTP_scale_Q14); +#endif + + if (!verify_assumptions(psEncC)) + { + silk_NSQ_del_dec_c(psEncC, NSQ, psIndices, x16, pulses, PredCoef_Q12, LTPCoef_Q14, AR_Q13, HarmShapeGain_Q14, Tilt_Q14, LF_shp_Q14, Gains_Q16, pitchL, Lambda_Q10, LTP_scale_Q14); + return; + } + + opus_int i, k, lag, start_idx, LSF_interpolation_flag, Winner_ind, subfr; + opus_int last_smple_idx, smpl_buf_idx, decisionDelay; + const opus_int16 *A_Q12, *B_Q14, *AR_shp_Q13; + opus_int16 *pxq; + VARDECL(opus_int32, sLTP_Q15); + VARDECL(opus_int16, sLTP); + opus_int32 HarmShapeFIRPacked_Q14; + opus_int offset_Q10; + opus_int32 Gain_Q10; + opus_int32 x_sc_Q10[MAX_SUB_FRAME_LENGTH]; + opus_int32 delayedGain_Q10[DECISION_DELAY]; + NSQ_del_dec_struct psDelDec = {0}; + NSQ_del_dec_sample_struct *psSample; + __m128i RDmin_Q10, MaskDelDec, Winner_selector; + SAVE_STACK; + + MaskDelDec = _mm_cvtepi8_epi32(_mm_cvtsi32_si128(0xFFFFFF00ul << ((psEncC->nStatesDelayedDecision - 1) << 3))); + + /* Set unvoiced lag to the previous one, overwrite later for voiced */ + lag = NSQ->lagPrev; + + silk_assert(NSQ->prev_gain_Q16 != 0); + psDelDec.Seed = _mm_and_si128( + _mm_add_epi32(_mm_set_epi32(3, 2, 1, 0), _mm_set1_epi32(psIndices->Seed)), + _mm_set1_epi32(3)); + psDelDec.SeedInit = psDelDec.Seed; + psDelDec.RD_Q10 = _mm_setzero_si128(); + psDelDec.LF_AR_Q14 = _mm_set1_epi32(NSQ->sLF_AR_shp_Q14); + psDelDec.Diff_Q14 = _mm_set1_epi32(NSQ->sDiff_shp_Q14); + psDelDec.Samples[0].Shape_Q14 = _mm_set1_epi32(NSQ->sLTP_shp_Q14[psEncC->ltp_mem_length - 1]); + for (i = 0; i < NSQ_LPC_BUF_LENGTH; i++) + { + psDelDec.sLPC_Q14[i] = _mm_set1_epi32(NSQ->sLPC_Q14[i]); + } + for (i = 0; i < MAX_SHAPE_LPC_ORDER; i++) + { + psDelDec.sAR2_Q14[i] = _mm_set1_epi32(NSQ->sAR2_Q14[i]); + } + + offset_Q10 = silk_Quantization_Offsets_Q10[psIndices->signalType >> 1][psIndices->quantOffsetType]; + smpl_buf_idx = 0; /* index of oldest samples */ + + decisionDelay = silk_min_int(DECISION_DELAY, psEncC->subfr_length); + + /* For voiced frames limit the decision delay to lower than the pitch lag */ + if (psIndices->signalType == TYPE_VOICED) + { + for (k = 0; k < psEncC->nb_subfr; k++) + { + decisionDelay = silk_min_int(decisionDelay, pitchL[k] - LTP_ORDER / 2 - 1); + } + } + else + { + if (lag > 0) + { + decisionDelay = silk_min_int(decisionDelay, lag - LTP_ORDER / 2 - 1); + } + } + + if (psIndices->NLSFInterpCoef_Q2 == 4) + { + LSF_interpolation_flag = 0; + } + else + { + LSF_interpolation_flag = 1; + } + + ALLOC(sLTP_Q15, psEncC->ltp_mem_length + psEncC->frame_length, opus_int32); + ALLOC(sLTP, psEncC->ltp_mem_length + psEncC->frame_length, opus_int16); + /* Set up pointers to start of sub frame */ + pxq = &NSQ->xq[psEncC->ltp_mem_length]; + NSQ->sLTP_shp_buf_idx = psEncC->ltp_mem_length; + NSQ->sLTP_buf_idx = psEncC->ltp_mem_length; + subfr = 0; + for (k = 0; k < psEncC->nb_subfr; k++) + { + A_Q12 = &PredCoef_Q12[((k >> 1) | (1 ^ LSF_interpolation_flag)) * MAX_LPC_ORDER]; + B_Q14 = <PCoef_Q14[k * LTP_ORDER]; + AR_shp_Q13 = &AR_Q13[k * MAX_SHAPE_LPC_ORDER]; + + /* Noise shape parameters */ + silk_assert(HarmShapeGain_Q14[k] >= 0); + HarmShapeFIRPacked_Q14 = silk_RSHIFT( HarmShapeGain_Q14[ k ], 2 ); + HarmShapeFIRPacked_Q14 |= silk_LSHIFT( (opus_int32)silk_RSHIFT( HarmShapeGain_Q14[ k ], 1 ), 16 ); + + NSQ->rewhite_flag = 0; + if (psIndices->signalType == TYPE_VOICED) + { + /* Voiced */ + lag = pitchL[k]; + + /* Re-whitening */ + if ((k & (3 ^ (LSF_interpolation_flag << 1))) == 0) + { + if (k == 2) + { + /* RESET DELAYED DECISIONS */ + /* Find winner */ + RDmin_Q10 = silk_mm_mask_hmin_epi32(psDelDec.RD_Q10, MaskDelDec); + Winner_ind = silk_index_of_first_equal_epi32(RDmin_Q10, psDelDec.RD_Q10); + Winner_selector = silk_index_to_selector(Winner_ind); + psDelDec.RD_Q10 = _mm_add_epi32( + psDelDec.RD_Q10, + _mm_blendv_epi8( + _mm_set1_epi32(silk_int32_MAX >> 4), + _mm_setzero_si128(), + _mm_cvtepi8_epi32(_mm_cvtsi32_si128(0xFFU << (unsigned)(Winner_ind << 3))))); + + /* Copy final part of signals from winner state to output and long-term filter states */ + last_smple_idx = smpl_buf_idx + decisionDelay; + for (i = 0; i < decisionDelay; i++) + { + last_smple_idx = (last_smple_idx + DECISION_DELAY - 1) % DECISION_DELAY; + psSample = &psDelDec.Samples[last_smple_idx]; + pulses[i - decisionDelay] = + (opus_int8)silk_sar_round_32(silk_select_winner(psSample->Q_Q10, Winner_selector), 10); + pxq[i - decisionDelay] = + silk_sat16((opus_int32)silk_sar_round_smulww(silk_select_winner(psSample->Xq_Q14, Winner_selector), Gains_Q16[1], 14)); + NSQ->sLTP_shp_Q14[NSQ->sLTP_shp_buf_idx - decisionDelay + i] = + silk_select_winner(psSample->Shape_Q14, Winner_selector); + } + + subfr = 0; + } + + /* Rewhiten with new A coefs */ + start_idx = psEncC->ltp_mem_length - lag - psEncC->predictLPCOrder - LTP_ORDER / 2; + silk_assert(start_idx > 0); + + silk_LPC_analysis_filter_avx2(&sLTP[start_idx], &NSQ->xq[start_idx + k * psEncC->subfr_length], + A_Q12, psEncC->ltp_mem_length - start_idx, psEncC->predictLPCOrder); + + NSQ->sLTP_buf_idx = psEncC->ltp_mem_length; + NSQ->rewhite_flag = 1; + } + } + + silk_nsq_del_dec_scale_states_avx2(psEncC, NSQ, &psDelDec, x16, x_sc_Q10, sLTP, sLTP_Q15, k, + LTP_scale_Q14, Gains_Q16, pitchL, psIndices->signalType, decisionDelay); + + silk_noise_shape_quantizer_del_dec_avx2(NSQ, &psDelDec, psIndices->signalType, x_sc_Q10, pulses, pxq, sLTP_Q15, + delayedGain_Q10, A_Q12, B_Q14, AR_shp_Q13, lag, HarmShapeFIRPacked_Q14, Tilt_Q14[k], LF_shp_Q14[k], + Gains_Q16[k], Lambda_Q10, offset_Q10, psEncC->subfr_length, subfr++, psEncC->shapingLPCOrder, + psEncC->predictLPCOrder, psEncC->warping_Q16, MaskDelDec, &smpl_buf_idx, decisionDelay); + + x16 += psEncC->subfr_length; + pulses += psEncC->subfr_length; + pxq += psEncC->subfr_length; + } + + /* Find winner */ + RDmin_Q10 = silk_mm_mask_hmin_epi32(psDelDec.RD_Q10, MaskDelDec); + Winner_selector = silk_index_to_selector(silk_index_of_first_equal_epi32(RDmin_Q10, psDelDec.RD_Q10)); + + /* Copy final part of signals from winner state to output and long-term filter states */ + psIndices->Seed = silk_select_winner(psDelDec.SeedInit, Winner_selector); + last_smple_idx = smpl_buf_idx + decisionDelay; + Gain_Q10 = Gains_Q16[psEncC->nb_subfr - 1] >> 6; + for (i = 0; i < decisionDelay; i++) + { + last_smple_idx = (last_smple_idx + DECISION_DELAY - 1) % DECISION_DELAY; + psSample = &psDelDec.Samples[last_smple_idx]; + + pulses[i - decisionDelay] = + (opus_int8)silk_sar_round_32(silk_select_winner(psSample->Q_Q10, Winner_selector), 10); + pxq[i - decisionDelay] = + silk_sat16((opus_int32)silk_sar_round_smulww(silk_select_winner(psSample->Xq_Q14, Winner_selector), Gain_Q10, 8)); + NSQ->sLTP_shp_Q14[NSQ->sLTP_shp_buf_idx - decisionDelay + i] = + silk_select_winner(psSample->Shape_Q14, Winner_selector); + } + for (i = 0; i < NSQ_LPC_BUF_LENGTH; i++) + { + NSQ->sLPC_Q14[i] = silk_select_winner(psDelDec.sLPC_Q14[i], Winner_selector); + } + for (i = 0; i < MAX_SHAPE_LPC_ORDER; i++) + { + NSQ->sAR2_Q14[i] = silk_select_winner(psDelDec.sAR2_Q14[i], Winner_selector); + } + + /* Update states */ + NSQ->sLF_AR_shp_Q14 = silk_select_winner(psDelDec.LF_AR_Q14, Winner_selector); + NSQ->sDiff_shp_Q14 = silk_select_winner(psDelDec.Diff_Q14, Winner_selector); + NSQ->lagPrev = pitchL[psEncC->nb_subfr - 1]; + + /* Save quantized speech signal */ + silk_memmove(NSQ->xq, &NSQ->xq[psEncC->frame_length], psEncC->ltp_mem_length * sizeof(opus_int16)); + silk_memmove(NSQ->sLTP_shp_Q14, &NSQ->sLTP_shp_Q14[psEncC->frame_length], psEncC->ltp_mem_length * sizeof(opus_int32)); + +#ifdef OPUS_CHECK_ASM + silk_assert(!memcmp(&NSQ_c, NSQ, sizeof(NSQ_c))); + silk_assert(!memcmp(&psIndices_c, psIndices, sizeof(psIndices_c))); + silk_assert(!memcmp(pulses_c, pulses_a, sizeof(pulses_c))); +#endif + + RESTORE_STACK; +} + +static OPUS_INLINE __m128i silk_noise_shape_quantizer_short_prediction_x4(const __m128i *buf32, const opus_int16 *coef16, opus_int order) +{ + __m256i out; + silk_assert(order == 10 || order == 16); + + /* Avoids introducing a bias because silk_SMLAWB() always rounds to -inf */ + out = _mm256_set1_epi32(order >> 1); + out = _mm256_add_epi32(out, _mm256_mul_epi32(_mm256_cvtepi32_epi64(buf32[-0]), _mm256_set1_epi32(silk_LSHIFT(coef16[0], 16)))); /* High DWORD */ + out = _mm256_add_epi32(out, _mm256_mul_epi32(_mm256_cvtepi32_epi64(buf32[-1]), _mm256_set1_epi32(silk_LSHIFT(coef16[1], 16)))); /* High DWORD */ + out = _mm256_add_epi32(out, _mm256_mul_epi32(_mm256_cvtepi32_epi64(buf32[-2]), _mm256_set1_epi32(silk_LSHIFT(coef16[2], 16)))); /* High DWORD */ + out = _mm256_add_epi32(out, _mm256_mul_epi32(_mm256_cvtepi32_epi64(buf32[-3]), _mm256_set1_epi32(silk_LSHIFT(coef16[3], 16)))); /* High DWORD */ + out = _mm256_add_epi32(out, _mm256_mul_epi32(_mm256_cvtepi32_epi64(buf32[-4]), _mm256_set1_epi32(silk_LSHIFT(coef16[4], 16)))); /* High DWORD */ + out = _mm256_add_epi32(out, _mm256_mul_epi32(_mm256_cvtepi32_epi64(buf32[-5]), _mm256_set1_epi32(silk_LSHIFT(coef16[5], 16)))); /* High DWORD */ + out = _mm256_add_epi32(out, _mm256_mul_epi32(_mm256_cvtepi32_epi64(buf32[-6]), _mm256_set1_epi32(silk_LSHIFT(coef16[6], 16)))); /* High DWORD */ + out = _mm256_add_epi32(out, _mm256_mul_epi32(_mm256_cvtepi32_epi64(buf32[-7]), _mm256_set1_epi32(silk_LSHIFT(coef16[7], 16)))); /* High DWORD */ + out = _mm256_add_epi32(out, _mm256_mul_epi32(_mm256_cvtepi32_epi64(buf32[-8]), _mm256_set1_epi32(silk_LSHIFT(coef16[8], 16)))); /* High DWORD */ + out = _mm256_add_epi32(out, _mm256_mul_epi32(_mm256_cvtepi32_epi64(buf32[-9]), _mm256_set1_epi32(silk_LSHIFT(coef16[9], 16)))); /* High DWORD */ + + if (order == 16) + { + out = _mm256_add_epi32(out, _mm256_mul_epi32(_mm256_cvtepi32_epi64(buf32[-10]), _mm256_set1_epi32(silk_LSHIFT(coef16[10], 16)))); /* High DWORD */ + out = _mm256_add_epi32(out, _mm256_mul_epi32(_mm256_cvtepi32_epi64(buf32[-11]), _mm256_set1_epi32(silk_LSHIFT(coef16[11], 16)))); /* High DWORD */ + out = _mm256_add_epi32(out, _mm256_mul_epi32(_mm256_cvtepi32_epi64(buf32[-12]), _mm256_set1_epi32(silk_LSHIFT(coef16[12], 16)))); /* High DWORD */ + out = _mm256_add_epi32(out, _mm256_mul_epi32(_mm256_cvtepi32_epi64(buf32[-13]), _mm256_set1_epi32(silk_LSHIFT(coef16[13], 16)))); /* High DWORD */ + out = _mm256_add_epi32(out, _mm256_mul_epi32(_mm256_cvtepi32_epi64(buf32[-14]), _mm256_set1_epi32(silk_LSHIFT(coef16[14], 16)))); /* High DWORD */ + out = _mm256_add_epi32(out, _mm256_mul_epi32(_mm256_cvtepi32_epi64(buf32[-15]), _mm256_set1_epi32(silk_LSHIFT(coef16[15], 16)))); /* High DWORD */ + } + return silk_cvtepi64_epi32_high(out); +} + +/******************************************/ +/* Noise shape quantizer for one subframe */ +/******************************************/ +static OPUS_INLINE void silk_noise_shape_quantizer_del_dec_avx2( + silk_nsq_state *NSQ, /* I/O NSQ state */ + NSQ_del_dec_struct *psDelDec, /* I/O Delayed decision states */ + opus_int signalType, /* I Signal type */ + const opus_int32 x_Q10[], /* I */ + opus_int8 pulses[], /* O */ + opus_int16 xq[], /* O */ + opus_int32 sLTP_Q15[], /* I/O LTP filter state */ + opus_int32 delayedGain_Q10[DECISION_DELAY], /* I/O Gain delay buffer */ + const opus_int16 a_Q12[], /* I Short term prediction coefs */ + const opus_int16 b_Q14[], /* I Long term prediction coefs */ + const opus_int16 AR_shp_Q13[], /* I Noise shaping coefs */ + opus_int lag, /* I Pitch lag */ + opus_int32 HarmShapeFIRPacked_Q14, /* I */ + opus_int Tilt_Q14, /* I Spectral tilt */ + opus_int32 LF_shp_Q14, /* I */ + opus_int32 Gain_Q16, /* I */ + opus_int Lambda_Q10, /* I */ + opus_int offset_Q10, /* I */ + opus_int length, /* I Input length */ + opus_int subfr, /* I Subframe number */ + opus_int shapingLPCOrder, /* I Shaping LPC filter order */ + opus_int predictLPCOrder, /* I Prediction filter order */ + opus_int warping_Q16, /* I */ + __m128i MaskDelDec, /* I Mask of states in decision tree */ + opus_int *smpl_buf_idx, /* I/O Index to newest samples in buffers */ + opus_int decisionDelay /* I */ +) +{ + int i; + opus_int32 *shp_lag_ptr = &NSQ->sLTP_shp_Q14[NSQ->sLTP_shp_buf_idx - lag + HARM_SHAPE_FIR_TAPS / 2]; + opus_int32 *pred_lag_ptr = &sLTP_Q15[NSQ->sLTP_buf_idx - lag + LTP_ORDER / 2]; + opus_int32 Gain_Q10 = Gain_Q16 >> 6; + + for (i = 0; i < length; i++) + { + /* Perform common calculations used in all states */ + /* NSQ_sample_struct */ + /* Low 128 bits => 1st set */ + /* High 128 bits => 2nd set */ + int j; + __m256i SS_Q_Q10; + __m256i SS_RD_Q10; + __m256i SS_xq_Q14; + __m256i SS_LF_AR_Q14; + __m256i SS_Diff_Q14; + __m256i SS_sLTP_shp_Q14; + __m256i SS_LPC_exc_Q14; + __m256i exc_Q14; + __m256i q_Q10, rr_Q10, rd_Q10; + __m256i mask; + __m128i LPC_pred_Q14, n_AR_Q14; + __m128i RDmin_Q10, RDmax_Q10; + __m128i n_LF_Q14; + __m128i r_Q10, q1_Q0, q1_Q10, q2_Q10; + __m128i Winner_rand_state, Winner_selector; + __m128i tmp0, tmp1; + NSQ_del_dec_sample_struct *psLastSample, *psSample; + opus_int32 RDmin_ind, RDmax_ind, last_smple_idx; + opus_int32 LTP_pred_Q14, n_LTP_Q14; + + /* Long-term prediction */ + if (signalType == TYPE_VOICED) + { + /* Unrolled loop */ + /* Avoids introducing a bias because silk_SMLAWB() always rounds to -inf */ + LTP_pred_Q14 = 2; + LTP_pred_Q14 += silk_SMULWB(pred_lag_ptr[-0], b_Q14[0]); + LTP_pred_Q14 += silk_SMULWB(pred_lag_ptr[-1], b_Q14[1]); + LTP_pred_Q14 += silk_SMULWB(pred_lag_ptr[-2], b_Q14[2]); + LTP_pred_Q14 += silk_SMULWB(pred_lag_ptr[-3], b_Q14[3]); + LTP_pred_Q14 += silk_SMULWB(pred_lag_ptr[-4], b_Q14[4]); + LTP_pred_Q14 = silk_LSHIFT(LTP_pred_Q14, 1); /* Q13 -> Q14 */ + pred_lag_ptr++; + } + else + { + LTP_pred_Q14 = 0; + } + + /* Long-term shaping */ + if (lag > 0) + { + /* Symmetric, packed FIR coefficients */ + n_LTP_Q14 = silk_add_sat32(shp_lag_ptr[0], shp_lag_ptr[-2]); + n_LTP_Q14 = silk_SMULWB(n_LTP_Q14, HarmShapeFIRPacked_Q14); + n_LTP_Q14 = n_LTP_Q14 + silk_SMULWT(shp_lag_ptr[-1], HarmShapeFIRPacked_Q14); + n_LTP_Q14 = LTP_pred_Q14 - (silk_LSHIFT(n_LTP_Q14, 2)); /* Q12 -> Q14 */ + shp_lag_ptr++; + } + else + { + n_LTP_Q14 = 0; + } + + /* BEGIN Updating Delayed Decision States */ + + /* Generate dither */ + psDelDec->Seed = silk_mm256_rand_epi32(psDelDec->Seed); + + /* Short-term prediction */ + LPC_pred_Q14 = silk_noise_shape_quantizer_short_prediction_x4(&psDelDec->sLPC_Q14[NSQ_LPC_BUF_LENGTH - 1 + i], a_Q12, predictLPCOrder); + LPC_pred_Q14 = _mm_slli_epi32(LPC_pred_Q14, 4); /* Q10 -> Q14 */ + + /* Noise shape feedback */ + silk_assert(shapingLPCOrder > 0); + silk_assert((shapingLPCOrder & 1) == 0); /* check that order is even */ + /* Output of lowpass section */ + tmp0 = _mm_add_epi32(psDelDec->Diff_Q14, silk_mm_smulwb_epi32(psDelDec->sAR2_Q14[0], warping_Q16)); + n_AR_Q14 = _mm_set1_epi32(shapingLPCOrder >> 1); + for (j = 0; j < shapingLPCOrder - 1; j++) + { + /* Output of allpass section */ + tmp1 = psDelDec->sAR2_Q14[j]; + psDelDec->sAR2_Q14[j] = tmp0; + n_AR_Q14 = _mm_add_epi32(n_AR_Q14, silk_mm_smulwb_epi32(tmp0, AR_shp_Q13[j])); + tmp0 = _mm_add_epi32(tmp1, silk_mm_smulwb_epi32(_mm_sub_epi32(psDelDec->sAR2_Q14[j + 1], tmp0), warping_Q16)); + } + psDelDec->sAR2_Q14[shapingLPCOrder - 1] = tmp0; + n_AR_Q14 = _mm_add_epi32(n_AR_Q14, silk_mm_smulwb_epi32(tmp0, AR_shp_Q13[shapingLPCOrder - 1])); + + n_AR_Q14 = _mm_slli_epi32(n_AR_Q14, 1); /* Q11 -> Q12 */ + n_AR_Q14 = _mm_add_epi32(n_AR_Q14, silk_mm_smulwb_epi32(psDelDec->LF_AR_Q14, Tilt_Q14)); /* Q12 */ + n_AR_Q14 = _mm_slli_epi32(n_AR_Q14, 2); /* Q12 -> Q14 */ + + tmp0 = silk_mm_smulwb_epi32(psDelDec->Samples[*smpl_buf_idx].Shape_Q14, LF_shp_Q14); /* Q12 */ + tmp1 = silk_mm_smulwb_epi32(psDelDec->LF_AR_Q14, LF_shp_Q14 >> 16); /* Q12 */ + n_LF_Q14 = _mm_add_epi32(tmp0, tmp1); /* Q12 */ + n_LF_Q14 = _mm_slli_epi32(n_LF_Q14, 2); /* Q12 -> Q14 */ + + /* Input minus prediction plus noise feedback */ + /* r = x[ i ] - LTP_pred - LPC_pred + n_AR + n_Tilt + n_LF + n_LTP */ + tmp0 = silk_mm_add_sat_epi32(n_AR_Q14, n_LF_Q14); /* Q14 */ + tmp1 = _mm_add_epi32(_mm_set1_epi32(n_LTP_Q14), LPC_pred_Q14); /* Q13 */ + tmp0 = silk_mm_sub_sat_epi32(tmp1, tmp0); /* Q13 */ + tmp0 = silk_mm_srai_round_epi32(tmp0, 4); /* Q10 */ + + r_Q10 = _mm_sub_epi32(_mm_set1_epi32(x_Q10[i]), tmp0); /* residual error Q10 */ + + /* Flip sign depending on dither */ + r_Q10 = silk_mm_sign_epi32(r_Q10, psDelDec->Seed); + r_Q10 = silk_mm_limit_epi32(r_Q10, -(31 << 10), 30 << 10); + + /* Find two quantization level candidates and measure their rate-distortion */ + q1_Q10 = _mm_sub_epi32(r_Q10, _mm_set1_epi32(offset_Q10)); + q1_Q0 = _mm_srai_epi32(q1_Q10, 10); + if (Lambda_Q10 > 2048) + { + /* For aggressive RDO, the bias becomes more than one pulse. */ + tmp0 = _mm_sub_epi32(_mm_abs_epi32(q1_Q10), _mm_set1_epi32(Lambda_Q10 / 2 - 512)); /* rdo_offset */ + q1_Q0 = _mm_srai_epi32(q1_Q10, 31); + tmp1 = _mm_cmpgt_epi32(tmp0, _mm_setzero_si128()); + tmp0 = _mm_srai_epi32(silk_mm_sign_epi32(tmp0, q1_Q10), 10); + q1_Q0 = _mm_blendv_epi8(q1_Q0, tmp0, tmp1); + } + + tmp0 = _mm_sign_epi32(_mm_set1_epi32(QUANT_LEVEL_ADJUST_Q10), q1_Q0); + q1_Q10 = _mm_sub_epi32(_mm_slli_epi32(q1_Q0, 10), tmp0); + q1_Q10 = _mm_add_epi32(q1_Q10, _mm_set1_epi32(offset_Q10)); + + /* check if q1_Q0 is 0 or -1 */ + tmp0 = _mm_add_epi32(_mm_srli_epi32(q1_Q0, 31), q1_Q0); + tmp1 = _mm_cmpeq_epi32(tmp0, _mm_setzero_si128()); + tmp0 = _mm_blendv_epi8(_mm_set1_epi32(1024), _mm_set1_epi32(1024 - QUANT_LEVEL_ADJUST_Q10), tmp1); + q2_Q10 = _mm_add_epi32(q1_Q10, tmp0); + q_Q10 = _mm256_set_m128i(q2_Q10, q1_Q10); + + rr_Q10 = _mm256_sub_epi32(_mm256_broadcastsi128_si256(r_Q10), q_Q10); + rd_Q10 = _mm256_abs_epi32(q_Q10); + rr_Q10 = silk_mm256_smulbb_epi32(rr_Q10, rr_Q10); + rd_Q10 = silk_mm256_smulbb_epi32(rd_Q10, _mm256_set1_epi32(Lambda_Q10)); + rd_Q10 = _mm256_add_epi32(rd_Q10, rr_Q10); + rd_Q10 = _mm256_srai_epi32(rd_Q10, 10); + + mask = _mm256_broadcastsi128_si256(_mm_cmplt_epi32(_mm256_extracti128_si256(rd_Q10, 0), _mm256_extracti128_si256(rd_Q10, 1))); + SS_RD_Q10 = _mm256_add_epi32( + _mm256_broadcastsi128_si256(psDelDec->RD_Q10), + _mm256_blendv_epi8( + _mm256_permute2x128_si256(rd_Q10, rd_Q10, 0x1), + rd_Q10, + mask)); + SS_Q_Q10 = _mm256_blendv_epi8( + _mm256_permute2x128_si256(q_Q10, q_Q10, 0x1), + q_Q10, + mask); + + /* Update states for best and second best quantization */ + + /* Quantized excitation */ + exc_Q14 = silk_mm256_sign_epi32(_mm256_slli_epi32(SS_Q_Q10, 4), _mm256_broadcastsi128_si256(psDelDec->Seed)); + + /* Add predictions */ + exc_Q14 = _mm256_add_epi32(exc_Q14, _mm256_set1_epi32(LTP_pred_Q14)); + SS_LPC_exc_Q14 = _mm256_slli_epi32(exc_Q14, 1); + SS_xq_Q14 = _mm256_add_epi32(exc_Q14, _mm256_broadcastsi128_si256(LPC_pred_Q14)); + + /* Update states */ + SS_Diff_Q14 = _mm256_sub_epi32(SS_xq_Q14, _mm256_set1_epi32(silk_LSHIFT(x_Q10[i], 4))); + SS_LF_AR_Q14 = _mm256_sub_epi32(SS_Diff_Q14, _mm256_broadcastsi128_si256(n_AR_Q14)); + SS_sLTP_shp_Q14 = silk_mm256_sub_sat_epi32(SS_LF_AR_Q14, _mm256_broadcastsi128_si256(n_LF_Q14)); + + /* END Updating Delayed Decision States */ + + *smpl_buf_idx = (*smpl_buf_idx + DECISION_DELAY - 1) % DECISION_DELAY; + last_smple_idx = (*smpl_buf_idx + decisionDelay) % DECISION_DELAY; + psLastSample = &psDelDec->Samples[last_smple_idx]; + + /* Find winner */ + RDmin_Q10 = silk_mm_mask_hmin_epi32(_mm256_castsi256_si128(SS_RD_Q10), MaskDelDec); + Winner_selector = silk_index_to_selector(silk_index_of_first_equal_epi32(RDmin_Q10, _mm256_castsi256_si128(SS_RD_Q10))); + + /* Increase RD values of expired states */ + Winner_rand_state = _mm_shuffle_epi8(psLastSample->RandState, Winner_selector); + + SS_RD_Q10 = _mm256_blendv_epi8( + _mm256_add_epi32(SS_RD_Q10, _mm256_set1_epi32(silk_int32_MAX >> 4)), + SS_RD_Q10, + _mm256_broadcastsi128_si256(_mm_cmpeq_epi32(psLastSample->RandState, Winner_rand_state))); + + /* find worst in first set */ + RDmax_Q10 = silk_mm_mask_hmax_epi32(_mm256_extracti128_si256(SS_RD_Q10, 0), MaskDelDec); + /* find best in second set */ + RDmin_Q10 = silk_mm_mask_hmin_epi32(_mm256_extracti128_si256(SS_RD_Q10, 1), MaskDelDec); + + /* Replace a state if best from second set outperforms worst in first set */ + tmp0 = _mm_cmplt_epi32(RDmin_Q10, RDmax_Q10); + if (!_mm_test_all_zeros(tmp0, tmp0)) + { + int t; + RDmax_ind = silk_index_of_first_equal_epi32(RDmax_Q10, _mm256_extracti128_si256(SS_RD_Q10, 0)); + RDmin_ind = silk_index_of_first_equal_epi32(RDmin_Q10, _mm256_extracti128_si256(SS_RD_Q10, 1)); + tmp1 = _mm_cvtepi8_epi32(_mm_cvtsi32_si128(0xFFU << (unsigned)(RDmax_ind << 3))); + tmp0 = _mm_blendv_epi8( + _mm_set_epi8(0xF, 0xE, 0xD, 0xC, 0xB, 0xA, 0x9, 0x8, 0x7, 0x6, 0x5, 0x4, 0x3, 0x2, 0x1, 0x0), + silk_index_to_selector(RDmin_ind), + tmp1); + for (t = i; t < MAX_SUB_FRAME_LENGTH + NSQ_LPC_BUF_LENGTH; t++) + { + psDelDec->sLPC_Q14[t] = _mm_shuffle_epi8(psDelDec->sLPC_Q14[t], tmp0); + } + psDelDec->Seed = _mm_shuffle_epi8(psDelDec->Seed, tmp0); + psDelDec->SeedInit = _mm_shuffle_epi8(psDelDec->SeedInit, tmp0); + for (t = 0; t < MAX_SHAPE_LPC_ORDER; t++) + { + psDelDec->sAR2_Q14[t] = _mm_shuffle_epi8(psDelDec->sAR2_Q14[t], tmp0); + } + for (t = 0; t < DECISION_DELAY; t++) + { + psDelDec->Samples[t].RandState = _mm_shuffle_epi8(psDelDec->Samples[t].RandState, tmp0); + psDelDec->Samples[t].Q_Q10 = _mm_shuffle_epi8(psDelDec->Samples[t].Q_Q10, tmp0); + psDelDec->Samples[t].Xq_Q14 = _mm_shuffle_epi8(psDelDec->Samples[t].Xq_Q14, tmp0); + psDelDec->Samples[t].Pred_Q15 = _mm_shuffle_epi8(psDelDec->Samples[t].Pred_Q15, tmp0); + psDelDec->Samples[t].Shape_Q14 = _mm_shuffle_epi8(psDelDec->Samples[t].Shape_Q14, tmp0); + } + mask = _mm256_castsi128_si256(_mm_blendv_epi8(_mm_set_epi32(0x3, 0x2, 0x1, 0x0), _mm_set1_epi32(RDmin_ind + 4), tmp1)); + SS_Q_Q10 = _mm256_permutevar8x32_epi32(SS_Q_Q10, mask); + SS_RD_Q10 = _mm256_permutevar8x32_epi32(SS_RD_Q10, mask); + SS_xq_Q14 = _mm256_permutevar8x32_epi32(SS_xq_Q14, mask); + SS_LF_AR_Q14 = _mm256_permutevar8x32_epi32(SS_LF_AR_Q14, mask); + SS_Diff_Q14 = _mm256_permutevar8x32_epi32(SS_Diff_Q14, mask); + SS_sLTP_shp_Q14 = _mm256_permutevar8x32_epi32(SS_sLTP_shp_Q14, mask); + SS_LPC_exc_Q14 = _mm256_permutevar8x32_epi32(SS_LPC_exc_Q14, mask); + } + + /* Write samples from winner to output and long-term filter states */ + if (subfr > 0 || i >= decisionDelay) + { + pulses[i - decisionDelay] = + (opus_int8)silk_sar_round_32(silk_select_winner(psLastSample->Q_Q10, Winner_selector), 10); + xq[i - decisionDelay] = + silk_sat16((opus_int32)silk_sar_round_smulww(silk_select_winner(psLastSample->Xq_Q14, Winner_selector), delayedGain_Q10[last_smple_idx], 8)); + NSQ->sLTP_shp_Q14[NSQ->sLTP_shp_buf_idx - decisionDelay] = + silk_select_winner(psLastSample->Shape_Q14, Winner_selector); + sLTP_Q15[NSQ->sLTP_buf_idx - decisionDelay] = + silk_select_winner(psLastSample->Pred_Q15, Winner_selector); + } + NSQ->sLTP_shp_buf_idx++; + NSQ->sLTP_buf_idx++; + + /* Update states */ + psSample = &psDelDec->Samples[*smpl_buf_idx]; + psDelDec->Seed = _mm_add_epi32(psDelDec->Seed, silk_mm_srai_round_epi32(_mm256_castsi256_si128(SS_Q_Q10), 10)); + psDelDec->LF_AR_Q14 = _mm256_castsi256_si128(SS_LF_AR_Q14); + psDelDec->Diff_Q14 = _mm256_castsi256_si128(SS_Diff_Q14); + psDelDec->sLPC_Q14[i + NSQ_LPC_BUF_LENGTH] = _mm256_castsi256_si128(SS_xq_Q14); + psDelDec->RD_Q10 = _mm256_castsi256_si128(SS_RD_Q10); + psSample->Xq_Q14 = _mm256_castsi256_si128(SS_xq_Q14); + psSample->Q_Q10 = _mm256_castsi256_si128(SS_Q_Q10); + psSample->Pred_Q15 = _mm256_castsi256_si128(SS_LPC_exc_Q14); + psSample->Shape_Q14 = _mm256_castsi256_si128(SS_sLTP_shp_Q14); + psSample->RandState = psDelDec->Seed; + delayedGain_Q10[*smpl_buf_idx] = Gain_Q10; + } + /* Update LPC states */ + for (i = 0; i < NSQ_LPC_BUF_LENGTH; i++) + { + psDelDec->sLPC_Q14[i] = (&psDelDec->sLPC_Q14[length])[i]; + } +} + +static OPUS_INLINE void silk_nsq_del_dec_scale_states_avx2( + const silk_encoder_state *psEncC, /* I Encoder State */ + silk_nsq_state *NSQ, /* I/O NSQ state */ + NSQ_del_dec_struct *psDelDec, /* I/O Delayed decision states */ + const opus_int16 x16[], /* I Input */ + opus_int32 x_sc_Q10[MAX_SUB_FRAME_LENGTH], /* O Input scaled with 1/Gain in Q10 */ + const opus_int16 sLTP[], /* I Re-whitened LTP state in Q0 */ + opus_int32 sLTP_Q15[], /* O LTP state matching scaled input */ + opus_int subfr, /* I Subframe number */ + const opus_int LTP_scale_Q14, /* I LTP state scaling */ + const opus_int32 Gains_Q16[MAX_NB_SUBFR], /* I */ + const opus_int pitchL[MAX_NB_SUBFR], /* I Pitch lag */ + const opus_int signal_type, /* I Signal type */ + const opus_int decisionDelay /* I Decision delay */ +) +{ + int i; + opus_int lag; + opus_int32 gain_adj_Q16, inv_gain_Q31, inv_gain_Q26; + NSQ_del_dec_sample_struct *psSample; + + lag = pitchL[subfr]; + inv_gain_Q31 = silk_INVERSE32_varQ(silk_max(Gains_Q16[subfr], 1), 47); + silk_assert(inv_gain_Q31 != 0); + + /* Scale input */ + inv_gain_Q26 = silk_sar_round_32(inv_gain_Q31, 5); + for (i = 0; i < psEncC->subfr_length; i+=4) + { + __m256i x = _mm256_cvtepi16_epi64(_mm_loadu_si64(&x16[i])); + x = _mm256_slli_epi64(_mm256_mul_epi32(x, _mm256_set1_epi32(inv_gain_Q26)), 16); + _mm_storeu_si128((__m128i_u*)&x_sc_Q10[i], silk_cvtepi64_epi32_high(x)); + } + + /* After rewhitening the LTP state is un-scaled, so scale with inv_gain_Q16 */ + if (NSQ->rewhite_flag) + { + if (subfr == 0) + { + /* Do LTP downscaling */ + inv_gain_Q31 = silk_LSHIFT(silk_SMULWB(inv_gain_Q31, LTP_scale_Q14), 2); + } + for (i = NSQ->sLTP_buf_idx - lag - LTP_ORDER / 2; i < NSQ->sLTP_buf_idx; i++) + { + silk_assert(i < MAX_FRAME_LENGTH); + sLTP_Q15[i] = silk_SMULWB(inv_gain_Q31, sLTP[i]); + } + } + + /* Adjust for changing gain */ + if (Gains_Q16[subfr] != NSQ->prev_gain_Q16) + { + gain_adj_Q16 = silk_DIV32_varQ(NSQ->prev_gain_Q16, Gains_Q16[subfr], 16); + + /* Scale long-term shaping state */ + for (i = NSQ->sLTP_shp_buf_idx - psEncC->ltp_mem_length; i < NSQ->sLTP_shp_buf_idx; i+=4) + { + __m128i_u* p = (__m128i_u*)&NSQ->sLTP_shp_Q14[i]; + *p = silk_mm_smulww_epi32(*p, gain_adj_Q16); + } + + /* Scale long-term prediction state */ + if (signal_type == TYPE_VOICED && NSQ->rewhite_flag == 0) + { + for (i = NSQ->sLTP_buf_idx - lag - LTP_ORDER / 2; i < NSQ->sLTP_buf_idx - decisionDelay; i++) + { + sLTP_Q15[i] = ((opus_int64)sLTP_Q15[i]) * ((opus_int64)gain_adj_Q16) >> 16; + } + } + + /* Scale scalar states */ + psDelDec->LF_AR_Q14 = silk_mm_smulww_epi32(psDelDec->LF_AR_Q14, gain_adj_Q16); + psDelDec->Diff_Q14 = silk_mm_smulww_epi32(psDelDec->Diff_Q14, gain_adj_Q16); + + /* Scale short-term prediction and shaping states */ + for (i = 0; i < NSQ_LPC_BUF_LENGTH; i++) + { + psDelDec->sLPC_Q14[i] = silk_mm_smulww_epi32(psDelDec->sLPC_Q14[i], gain_adj_Q16); + } + for (i = 0; i < DECISION_DELAY; i++) + { + psSample = &psDelDec->Samples[i]; + psSample->Pred_Q15 = silk_mm_smulww_epi32(psSample->Pred_Q15, gain_adj_Q16); + psSample->Shape_Q14 = silk_mm_smulww_epi32(psSample->Shape_Q14, gain_adj_Q16); + } + for (i = 0; i < MAX_SHAPE_LPC_ORDER; i++) + { + psDelDec->sAR2_Q14[i] = silk_mm_smulww_epi32(psDelDec->sAR2_Q14[i], gain_adj_Q16); + } + + /* Save inverse gain */ + NSQ->prev_gain_Q16 = Gains_Q16[subfr]; + } +} + +static OPUS_INLINE void silk_LPC_analysis_filter_avx2( + opus_int16 *out, /* O Output signal */ + const opus_int16 *in, /* I Input signal */ + const opus_int16 *B, /* I MA prediction coefficients, Q12 [order] */ + const opus_int32 len, /* I Signal length */ + const opus_int32 order /* I Filter order */ +) +{ + int i; + opus_int32 out32_Q12, out32; + silk_assert(order == 10 || order == 16); + + for(i = order; i < len; i++ ) + { + const opus_int16 *in_ptr = &in[ i ]; + /* Allowing wrap around so that two wraps can cancel each other. The rare + cases where the result wraps around can only be triggered by invalid streams*/ + + __m256i in_v = _mm256_cvtepi16_epi32(_mm_loadu_si128((__m128i_u*)&in_ptr[-8])); + __m256i B_v = _mm256_cvtepi16_epi32(_mm_loadu_si128((__m128i_u*)& B[0])); + __m256i sum = _mm256_mullo_epi32(in_v, silk_mm256_reverse_epi32(B_v)); + if (order > 10) + { + in_v = _mm256_cvtepi16_epi32(_mm_loadu_si128((__m128i_u*)&in_ptr[-16])); + B_v = _mm256_cvtepi16_epi32(_mm_loadu_si128((__m128i_u*)&B [8])); + B_v = silk_mm256_reverse_epi32(B_v); + } + else + { + in_v = _mm256_cvtepi16_epi32(_mm_loadu_si32(&in_ptr[-10])); + B_v = _mm256_cvtepi16_epi32(_mm_loadu_si32(&B [8])); + B_v = _mm256_shuffle_epi32(B_v, 0x01); + } + sum = _mm256_add_epi32(sum, _mm256_mullo_epi32(in_v, B_v)); + + out32_Q12 = silk_mm256_hsum_epi32(sum); + + /* Subtract prediction */ + out32_Q12 = silk_SUB32_ovflw( silk_LSHIFT( (opus_int32)*in_ptr, 12 ), out32_Q12 ); + + /* Scale to Q0 */ + out32 = silk_sar_round_32(out32_Q12, 12); + + /* Saturate output */ + out[ i ] = silk_sat16(out32); + } + + /* Set first d output samples to zero */ + silk_memset( out, 0, order * sizeof( opus_int16 ) ); +} diff --git a/opus/silk/x86/NSQ_del_dec_sse4_1.c b/opus/silk/x86/NSQ_del_dec_sse4_1.c index 2c75ede2..5937682d 100644 --- a/opus/silk/x86/NSQ_del_dec_sse4_1.c +++ b/opus/silk/x86/NSQ_del_dec_sse4_1.c @@ -1,5 +1,5 @@ -/* Copyright (c) 2014, Cisco Systems, INC - Written by XiangMingZhu WeiZhou MinPeng YanWang +/* Copyright (c) 2014-2020, Cisco Systems, INC + Written by XiangMingZhu WeiZhou MinPeng YanWang FrancisQuiers Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions @@ -46,6 +46,7 @@ typedef struct { opus_int32 Shape_Q14[ DECISION_DELAY ]; opus_int32 sAR2_Q14[ MAX_SHAPE_LPC_ORDER ]; opus_int32 LF_AR_Q14; + opus_int32 Diff_Q14; opus_int32 Seed; opus_int32 SeedInit; opus_int32 RD_Q10; @@ -56,6 +57,7 @@ typedef struct { opus_int32 RD_Q10; opus_int32 xq_Q14; opus_int32 LF_AR_Q14; + opus_int32 Diff_Q14; opus_int32 sLTP_shp_Q14; opus_int32 LPC_exc_Q14; } NSQ_sample_struct; @@ -66,7 +68,7 @@ static OPUS_INLINE void silk_nsq_del_dec_scale_states_sse4_1( const silk_encoder_state *psEncC, /* I Encoder State */ silk_nsq_state *NSQ, /* I/O NSQ state */ NSQ_del_dec_struct psDelDec[], /* I/O Delayed decision states */ - const opus_int32 x_Q3[], /* I Input in Q3 */ + const opus_int16 x16[], /* I Input */ opus_int32 x_sc_Q10[], /* O Input scaled with 1/Gain in Q10 */ const opus_int16 sLTP[], /* I Re-whitened LTP state in Q0 */ opus_int32 sLTP_Q15[], /* O LTP state matching scaled input */ @@ -112,21 +114,21 @@ static OPUS_INLINE void silk_noise_shape_quantizer_del_dec_sse4_1( ); void silk_NSQ_del_dec_sse4_1( - const silk_encoder_state *psEncC, /* I Encoder State */ - silk_nsq_state *NSQ, /* I/O NSQ state */ - SideInfoIndices *psIndices, /* I/O Quantization Indices */ - const opus_int32 x_Q3[], /* I Prefiltered input signal */ - opus_int8 pulses[], /* O Quantized pulse signal */ - const opus_int16 PredCoef_Q12[ 2 * MAX_LPC_ORDER ], /* I Short term prediction coefs */ - const opus_int16 LTPCoef_Q14[ LTP_ORDER * MAX_NB_SUBFR ], /* I Long term prediction coefs */ - const opus_int16 AR2_Q13[ MAX_NB_SUBFR * MAX_SHAPE_LPC_ORDER ], /* I Noise shaping coefs */ - const opus_int HarmShapeGain_Q14[ MAX_NB_SUBFR ], /* I Long term shaping coefs */ - const opus_int Tilt_Q14[ MAX_NB_SUBFR ], /* I Spectral tilt */ - const opus_int32 LF_shp_Q14[ MAX_NB_SUBFR ], /* I Low frequency shaping coefs */ - const opus_int32 Gains_Q16[ MAX_NB_SUBFR ], /* I Quantization step sizes */ - const opus_int pitchL[ MAX_NB_SUBFR ], /* I Pitch lags */ - const opus_int Lambda_Q10, /* I Rate/distortion tradeoff */ - const opus_int LTP_scale_Q14 /* I LTP state scaling */ + const silk_encoder_state *psEncC, /* I Encoder State */ + silk_nsq_state *NSQ, /* I/O NSQ state */ + SideInfoIndices *psIndices, /* I/O Quantization Indices */ + const opus_int16 x16[], /* I Input */ + opus_int8 pulses[], /* O Quantized pulse signal */ + const opus_int16 *PredCoef_Q12, /* I Short term prediction coefs */ + const opus_int16 LTPCoef_Q14[ LTP_ORDER * MAX_NB_SUBFR ], /* I Long term prediction coefs */ + const opus_int16 AR_Q13[ MAX_NB_SUBFR * MAX_SHAPE_LPC_ORDER ], /* I Noise shaping coefs */ + const opus_int HarmShapeGain_Q14[ MAX_NB_SUBFR ], /* I Long term shaping coefs */ + const opus_int Tilt_Q14[ MAX_NB_SUBFR ], /* I Spectral tilt */ + const opus_int32 LF_shp_Q14[ MAX_NB_SUBFR ], /* I Low frequency shaping coefs */ + const opus_int32 Gains_Q16[ MAX_NB_SUBFR ], /* I Quantization step sizes */ + const opus_int pitchL[ MAX_NB_SUBFR ], /* I Pitch lags */ + const opus_int Lambda_Q10, /* I Rate/distortion tradeoff */ + const opus_int LTP_scale_Q14 /* I LTP state scaling */ ) { opus_int i, k, lag, start_idx, LSF_interpolation_flag, Winner_ind, subfr; @@ -142,8 +144,39 @@ void silk_NSQ_del_dec_sse4_1( VARDECL( opus_int32, delayedGain_Q10 ); VARDECL( NSQ_del_dec_struct, psDelDec ); NSQ_del_dec_struct *psDD; +#ifdef OPUS_CHECK_ASM + silk_nsq_state NSQ_c; + SideInfoIndices psIndices_c; + opus_int8 pulses_c[ MAX_FRAME_LENGTH ]; + const opus_int8 *const pulses_a = pulses; +#endif SAVE_STACK; +#ifdef OPUS_CHECK_ASM + ( void )pulses_a; + silk_memcpy( &NSQ_c, NSQ, sizeof( NSQ_c ) ); + silk_memcpy( &psIndices_c, psIndices, sizeof( psIndices_c ) ); + silk_assert( psEncC->nb_subfr * psEncC->subfr_length <= MAX_FRAME_LENGTH ); + silk_memcpy( pulses_c, pulses, psEncC->nb_subfr * psEncC->subfr_length * sizeof( pulses[0] ) ); + silk_NSQ_del_dec_c( + psEncC, + &NSQ_c, + &psIndices_c, + x16, + pulses_c, + PredCoef_Q12, + LTPCoef_Q14, + AR_Q13, + HarmShapeGain_Q14, + Tilt_Q14, + LF_shp_Q14, + Gains_Q16, + pitchL, + Lambda_Q10, + LTP_scale_Q14 + ); +#endif + /* Set unvoiced lag to the previous one, overwrite later for voiced */ lag = NSQ->lagPrev; @@ -158,6 +191,7 @@ void silk_NSQ_del_dec_sse4_1( psDD->SeedInit = psDD->Seed; psDD->RD_Q10 = 0; psDD->LF_AR_Q14 = NSQ->sLF_AR_shp_Q14; + psDD->Diff_Q14 = NSQ->sDiff_shp_Q14; psDD->Shape_Q14[ 0 ] = NSQ->sLTP_shp_Q14[ psEncC->ltp_mem_length - 1 ]; silk_memcpy( psDD->sLPC_Q14, NSQ->sLPC_Q14, NSQ_LPC_BUF_LENGTH * sizeof( opus_int32 ) ); silk_memcpy( psDD->sAR2_Q14, NSQ->sAR2_Q14, sizeof( NSQ->sAR2_Q14 ) ); @@ -185,8 +219,7 @@ void silk_NSQ_del_dec_sse4_1( LSF_interpolation_flag = 1; } - ALLOC( sLTP_Q15, - psEncC->ltp_mem_length + psEncC->frame_length, opus_int32 ); + ALLOC( sLTP_Q15, psEncC->ltp_mem_length + psEncC->frame_length, opus_int32 ); ALLOC( sLTP, psEncC->ltp_mem_length + psEncC->frame_length, opus_int16 ); ALLOC( x_sc_Q10, psEncC->subfr_length, opus_int32 ); ALLOC( delayedGain_Q10, DECISION_DELAY, opus_int32 ); @@ -198,7 +231,7 @@ void silk_NSQ_del_dec_sse4_1( for( k = 0; k < psEncC->nb_subfr; k++ ) { A_Q12 = &PredCoef_Q12[ ( ( k >> 1 ) | ( 1 - LSF_interpolation_flag ) ) * MAX_LPC_ORDER ]; B_Q14 = <PCoef_Q14[ k * LTP_ORDER ]; - AR_shp_Q13 = &AR2_Q13[ k * MAX_SHAPE_LPC_ORDER ]; + AR_shp_Q13 = &AR_Q13[ k * MAX_SHAPE_LPC_ORDER ]; /* Noise shape parameters */ silk_assert( HarmShapeGain_Q14[ k ] >= 0 ); @@ -257,7 +290,7 @@ void silk_NSQ_del_dec_sse4_1( } } - silk_nsq_del_dec_scale_states_sse4_1( psEncC, NSQ, psDelDec, x_Q3, x_sc_Q10, sLTP, sLTP_Q15, k, + silk_nsq_del_dec_scale_states_sse4_1( psEncC, NSQ, psDelDec, x16, x_sc_Q10, sLTP, sLTP_Q15, k, psEncC->nStatesDelayedDecision, LTP_scale_Q14, Gains_Q16, pitchL, psIndices->signalType, decisionDelay ); silk_noise_shape_quantizer_del_dec_sse4_1( NSQ, psDelDec, psIndices->signalType, x_sc_Q10, pulses, pxq, sLTP_Q15, @@ -265,7 +298,7 @@ void silk_NSQ_del_dec_sse4_1( Gains_Q16[ k ], Lambda_Q10, offset_Q10, psEncC->subfr_length, subfr++, psEncC->shapingLPCOrder, psEncC->predictLPCOrder, psEncC->warping_Q16, psEncC->nStatesDelayedDecision, &smpl_buf_idx, decisionDelay ); - x_Q3 += psEncC->subfr_length; + x16 += psEncC->subfr_length; pulses += psEncC->subfr_length; pxq += psEncC->subfr_length; } @@ -288,6 +321,7 @@ void silk_NSQ_del_dec_sse4_1( for( i = 0; i < decisionDelay; i++ ) { last_smple_idx = ( last_smple_idx - 1 ) % DECISION_DELAY; if( last_smple_idx < 0 ) last_smple_idx += DECISION_DELAY; + pulses[ i - decisionDelay ] = (opus_int8)silk_RSHIFT_ROUND( psDD->Q_Q10[ last_smple_idx ], 10 ); pxq[ i - decisionDelay ] = (opus_int16)silk_SAT16( silk_RSHIFT_ROUND( silk_SMULWW( psDD->Xq_Q14[ last_smple_idx ], Gain_Q10 ), 8 ) ); @@ -298,11 +332,19 @@ void silk_NSQ_del_dec_sse4_1( /* Update states */ NSQ->sLF_AR_shp_Q14 = psDD->LF_AR_Q14; + NSQ->sDiff_shp_Q14 = psDD->Diff_Q14; NSQ->lagPrev = pitchL[ psEncC->nb_subfr - 1 ]; /* Save quantized speech signal */ silk_memmove( NSQ->xq, &NSQ->xq[ psEncC->frame_length ], psEncC->ltp_mem_length * sizeof( opus_int16 ) ); silk_memmove( NSQ->sLTP_shp_Q14, &NSQ->sLTP_shp_Q14[ psEncC->frame_length ], psEncC->ltp_mem_length * sizeof( opus_int32 ) ); + +#ifdef OPUS_CHECK_ASM + silk_assert( !memcmp( &NSQ_c, NSQ, sizeof( NSQ_c ) ) ); + silk_assert( !memcmp( &psIndices_c, psIndices, sizeof( psIndices_c ) ) ); + silk_assert( !memcmp( pulses_c, pulses_a, psEncC->nb_subfr * psEncC->subfr_length * sizeof( pulses[0] ) ) ); +#endif + RESTORE_STACK; } @@ -345,6 +387,8 @@ static OPUS_INLINE void silk_noise_shape_quantizer_del_dec_sse4_1( opus_int32 q1_Q0, q1_Q10, q2_Q10, exc_Q14, LPC_exc_Q14, xq_Q14, Gain_Q10; opus_int32 tmp1, tmp2, sLF_AR_shp_Q14; opus_int32 *pred_lag_ptr, *shp_lag_ptr, *psLPC_Q14; + int rdo_offset; + VARDECL( NSQ_sample_pair, psSampleState ); NSQ_del_dec_struct *psDD; NSQ_sample_struct *psSS; @@ -356,6 +400,8 @@ static OPUS_INLINE void silk_noise_shape_quantizer_del_dec_sse4_1( celt_assert( nStatesDelayedDecision > 0 ); ALLOC( psSampleState, nStatesDelayedDecision, NSQ_sample_pair ); + rdo_offset = (Lambda_Q10 >> 1) - 512; + shp_lag_ptr = &NSQ->sLTP_shp_Q14[ NSQ->sLTP_shp_buf_idx - lag + HARM_SHAPE_FIR_TAPS / 2 ]; pred_lag_ptr = &sLTP_Q15[ NSQ->sLTP_buf_idx - lag + LTP_ORDER / 2 ]; Gain_Q10 = silk_RSHIFT( Gain_Q16, 6 ); @@ -382,7 +428,7 @@ static OPUS_INLINE void silk_noise_shape_quantizer_del_dec_sse4_1( LTP_pred_Q14 = 2; { __m128i tmpa, tmpb, pred_lag_ptr_tmp; - pred_lag_ptr_tmp = _mm_loadu_si128( (__m128i *)(&pred_lag_ptr[ -3 ] ) ); + pred_lag_ptr_tmp = _mm_loadu_si128( (__m128i *)(void*)(&pred_lag_ptr[ -3 ] ) ); pred_lag_ptr_tmp = _mm_shuffle_epi32( pred_lag_ptr_tmp, 0x1B ); tmpa = _mm_mul_epi32( pred_lag_ptr_tmp, b_Q12_0123 ); tmpa = _mm_srli_si128( tmpa, 2 ); @@ -407,8 +453,8 @@ static OPUS_INLINE void silk_noise_shape_quantizer_del_dec_sse4_1( /* Long-term shaping */ if( lag > 0 ) { /* Symmetric, packed FIR coefficients */ - n_LTP_Q14 = silk_SMULWB( silk_ADD32( shp_lag_ptr[ 0 ], shp_lag_ptr[ -2 ] ), HarmShapeFIRPacked_Q14 ); - n_LTP_Q14 = silk_SMLAWT( n_LTP_Q14, shp_lag_ptr[ -1 ], HarmShapeFIRPacked_Q14 ); + n_LTP_Q14 = silk_SMULWB( silk_ADD_SAT32( shp_lag_ptr[ 0 ], shp_lag_ptr[ -2 ] ), HarmShapeFIRPacked_Q14 ); + n_LTP_Q14 = silk_SMLAWT( n_LTP_Q14, shp_lag_ptr[ -1 ], HarmShapeFIRPacked_Q14 ); n_LTP_Q14 = silk_SUB_LSHIFT32( LTP_pred_Q14, n_LTP_Q14, 2 ); /* Q12 -> Q14 */ shp_lag_ptr++; } else { @@ -437,7 +483,7 @@ static OPUS_INLINE void silk_noise_shape_quantizer_del_dec_sse4_1( tmpb = _mm_setzero_si128(); /* step 1 */ - psLPC_Q14_tmp = _mm_loadu_si128( (__m128i *)(&psLPC_Q14[ -3 ] ) ); /* -3, -2 , -1, 0 */ + psLPC_Q14_tmp = _mm_loadu_si128( (__m128i *)(void*)(&psLPC_Q14[ -3 ] ) ); /* -3, -2 , -1, 0 */ psLPC_Q14_tmp = _mm_shuffle_epi32( psLPC_Q14_tmp, 0x1B ); /* 0, -1, -2, -3 */ tmpa = _mm_mul_epi32( psLPC_Q14_tmp, a_Q12_0123 ); /* 0, -1, -2, -3 * 0123 -> 0*0, 2*-2 */ @@ -451,7 +497,7 @@ static OPUS_INLINE void silk_noise_shape_quantizer_del_dec_sse4_1( tmpb = _mm_add_epi32( tmpb, psLPC_Q14_tmp ); /* step 2 */ - psLPC_Q14_tmp = _mm_loadu_si128( (__m128i *)(&psLPC_Q14[ -7 ] ) ); + psLPC_Q14_tmp = _mm_loadu_si128( (__m128i *)(void*)(&psLPC_Q14[ -7 ] ) ); psLPC_Q14_tmp = _mm_shuffle_epi32( psLPC_Q14_tmp, 0x1B ); tmpa = _mm_mul_epi32( psLPC_Q14_tmp, a_Q12_4567 ); tmpa = _mm_srli_epi64( tmpa, 16 ); @@ -466,7 +512,7 @@ static OPUS_INLINE void silk_noise_shape_quantizer_del_dec_sse4_1( if ( opus_likely( predictLPCOrder == 16 ) ) { /* step 3 */ - psLPC_Q14_tmp = _mm_loadu_si128( (__m128i *)(&psLPC_Q14[ -11 ] ) ); + psLPC_Q14_tmp = _mm_loadu_si128( (__m128i *)(void*)(&psLPC_Q14[ -11 ] ) ); psLPC_Q14_tmp = _mm_shuffle_epi32( psLPC_Q14_tmp, 0x1B ); tmpa = _mm_mul_epi32( psLPC_Q14_tmp, a_Q12_89AB ); tmpa = _mm_srli_epi64( tmpa, 16 ); @@ -478,8 +524,8 @@ static OPUS_INLINE void silk_noise_shape_quantizer_del_dec_sse4_1( psLPC_Q14_tmp = _mm_srli_epi64( psLPC_Q14_tmp, 16 ); tmpb = _mm_add_epi32( tmpb, psLPC_Q14_tmp ); - /* setp 4 */ - psLPC_Q14_tmp = _mm_loadu_si128( (__m128i *)(&psLPC_Q14[ -15 ] ) ); + /* step 4 */ + psLPC_Q14_tmp = _mm_loadu_si128( (__m128i *)(void*)(&psLPC_Q14[ -15 ] ) ); psLPC_Q14_tmp = _mm_shuffle_epi32( psLPC_Q14_tmp, 0x1B ); tmpa = _mm_mul_epi32( psLPC_Q14_tmp, a_Q12_CDEF ); tmpa = _mm_srli_epi64( tmpa, 16 ); @@ -511,22 +557,22 @@ static OPUS_INLINE void silk_noise_shape_quantizer_del_dec_sse4_1( LPC_pred_Q14 = silk_LSHIFT( LPC_pred_Q14, 4 ); /* Q10 -> Q14 */ /* Noise shape feedback */ - silk_assert( ( shapingLPCOrder & 1 ) == 0 ); /* check that order is even */ + celt_assert( ( shapingLPCOrder & 1 ) == 0 ); /* check that order is even */ /* Output of lowpass section */ - tmp2 = silk_SMLAWB( psLPC_Q14[ 0 ], psDD->sAR2_Q14[ 0 ], warping_Q16 ); + tmp2 = silk_SMLAWB( psDD->Diff_Q14, psDD->sAR2_Q14[ 0 ], warping_Q16 ); /* Output of allpass section */ - tmp1 = silk_SMLAWB( psDD->sAR2_Q14[ 0 ], psDD->sAR2_Q14[ 1 ] - tmp2, warping_Q16 ); + tmp1 = silk_SMLAWB( psDD->sAR2_Q14[ 0 ], silk_SUB32_ovflw(psDD->sAR2_Q14[ 1 ], tmp2), warping_Q16 ); psDD->sAR2_Q14[ 0 ] = tmp2; n_AR_Q14 = silk_RSHIFT( shapingLPCOrder, 1 ); n_AR_Q14 = silk_SMLAWB( n_AR_Q14, tmp2, AR_shp_Q13[ 0 ] ); /* Loop over allpass sections */ for( j = 2; j < shapingLPCOrder; j += 2 ) { /* Output of allpass section */ - tmp2 = silk_SMLAWB( psDD->sAR2_Q14[ j - 1 ], psDD->sAR2_Q14[ j + 0 ] - tmp1, warping_Q16 ); + tmp2 = silk_SMLAWB( psDD->sAR2_Q14[ j - 1 ], silk_SUB32_ovflw(psDD->sAR2_Q14[ j + 0 ], tmp1), warping_Q16 ); psDD->sAR2_Q14[ j - 1 ] = tmp1; n_AR_Q14 = silk_SMLAWB( n_AR_Q14, tmp1, AR_shp_Q13[ j - 1 ] ); /* Output of allpass section */ - tmp1 = silk_SMLAWB( psDD->sAR2_Q14[ j + 0 ], psDD->sAR2_Q14[ j + 1 ] - tmp2, warping_Q16 ); + tmp1 = silk_SMLAWB( psDD->sAR2_Q14[ j + 0 ], silk_SUB32_ovflw(psDD->sAR2_Q14[ j + 1 ], tmp2), warping_Q16 ); psDD->sAR2_Q14[ j + 0 ] = tmp2; n_AR_Q14 = silk_SMLAWB( n_AR_Q14, tmp2, AR_shp_Q13[ j ] ); } @@ -543,9 +589,9 @@ static OPUS_INLINE void silk_noise_shape_quantizer_del_dec_sse4_1( /* Input minus prediction plus noise feedback */ /* r = x[ i ] - LTP_pred - LPC_pred + n_AR + n_Tilt + n_LF + n_LTP */ - tmp1 = silk_ADD32( n_AR_Q14, n_LF_Q14 ); /* Q14 */ - tmp2 = silk_ADD32( n_LTP_Q14, LPC_pred_Q14 ); /* Q13 */ - tmp1 = silk_SUB32( tmp2, tmp1 ); /* Q13 */ + tmp1 = silk_ADD_SAT32( n_AR_Q14, n_LF_Q14 ); /* Q14 */ + tmp2 = silk_ADD32_ovflw( n_LTP_Q14, LPC_pred_Q14 ); /* Q13 */ + tmp1 = silk_SUB_SAT32( tmp2, tmp1 ); /* Q13 */ tmp1 = silk_RSHIFT_ROUND( tmp1, 4 ); /* Q10 */ r_Q10 = silk_SUB32( x_Q10[ i ], tmp1 ); /* residual error Q10 */ @@ -559,6 +605,18 @@ static OPUS_INLINE void silk_noise_shape_quantizer_del_dec_sse4_1( /* Find two quantization level candidates and measure their rate-distortion */ q1_Q10 = silk_SUB32( r_Q10, offset_Q10 ); q1_Q0 = silk_RSHIFT( q1_Q10, 10 ); + if (Lambda_Q10 > 2048) { + /* For aggressive RDO, the bias becomes more than one pulse. */ + if (q1_Q10 > rdo_offset) { + q1_Q0 = silk_RSHIFT( q1_Q10 - rdo_offset, 10 ); + } else if (q1_Q10 < -rdo_offset) { + q1_Q0 = silk_RSHIFT( q1_Q10 + rdo_offset, 10 ); + } else if (q1_Q10 < 0) { + q1_Q0 = -1; + } else { + q1_Q0 = 0; + } + } if( q1_Q0 > 0 ) { q1_Q10 = silk_SUB32( silk_LSHIFT( q1_Q0, 10 ), QUANT_LEVEL_ADJUST_Q10 ); q1_Q10 = silk_ADD32( q1_Q10, offset_Q10 ); @@ -609,11 +667,12 @@ static OPUS_INLINE void silk_noise_shape_quantizer_del_dec_sse4_1( /* Add predictions */ LPC_exc_Q14 = silk_ADD32( exc_Q14, LTP_pred_Q14 ); - xq_Q14 = silk_ADD32( LPC_exc_Q14, LPC_pred_Q14 ); + xq_Q14 = silk_ADD32_ovflw( LPC_exc_Q14, LPC_pred_Q14 ); /* Update states */ - sLF_AR_shp_Q14 = silk_SUB32( xq_Q14, n_AR_Q14 ); - psSS[ 0 ].sLTP_shp_Q14 = silk_SUB32( sLF_AR_shp_Q14, n_LF_Q14 ); + psSS[ 0 ].Diff_Q14 = silk_SUB32_ovflw( xq_Q14, silk_LSHIFT32( x_Q10[ i ], 4 ) ); + sLF_AR_shp_Q14 = silk_SUB32_ovflw( psSS[ 0 ].Diff_Q14, n_AR_Q14 ); + psSS[ 0 ].sLTP_shp_Q14 = silk_SUB_SAT32( sLF_AR_shp_Q14, n_LF_Q14 ); psSS[ 0 ].LF_AR_Q14 = sLF_AR_shp_Q14; psSS[ 0 ].LPC_exc_Q14 = LPC_exc_Q14; psSS[ 0 ].xq_Q14 = xq_Q14; @@ -626,14 +685,14 @@ static OPUS_INLINE void silk_noise_shape_quantizer_del_dec_sse4_1( exc_Q14 = -exc_Q14; } - /* Add predictions */ LPC_exc_Q14 = silk_ADD32( exc_Q14, LTP_pred_Q14 ); - xq_Q14 = silk_ADD32( LPC_exc_Q14, LPC_pred_Q14 ); + xq_Q14 = silk_ADD32_ovflw( LPC_exc_Q14, LPC_pred_Q14 ); /* Update states */ - sLF_AR_shp_Q14 = silk_SUB32( xq_Q14, n_AR_Q14 ); - psSS[ 1 ].sLTP_shp_Q14 = silk_SUB32( sLF_AR_shp_Q14, n_LF_Q14 ); + psSS[ 1 ].Diff_Q14 = silk_SUB32_ovflw( xq_Q14, silk_LSHIFT32( x_Q10[ i ], 4 ) ); + sLF_AR_shp_Q14 = silk_SUB32_ovflw( psSS[ 1 ].Diff_Q14, n_AR_Q14 ); + psSS[ 1 ].sLTP_shp_Q14 = silk_SUB_SAT32( sLF_AR_shp_Q14, n_LF_Q14 ); psSS[ 1 ].LF_AR_Q14 = sLF_AR_shp_Q14; psSS[ 1 ].LPC_exc_Q14 = LPC_exc_Q14; psSS[ 1 ].xq_Q14 = xq_Q14; @@ -705,6 +764,7 @@ static OPUS_INLINE void silk_noise_shape_quantizer_del_dec_sse4_1( psDD = &psDelDec[ k ]; psSS = &psSampleState[ k ][ 0 ]; psDD->LF_AR_Q14 = psSS->LF_AR_Q14; + psDD->Diff_Q14 = psSS->Diff_Q14; psDD->sLPC_Q14[ NSQ_LPC_BUF_LENGTH + i ] = psSS->xq_Q14; psDD->Xq_Q14[ *smpl_buf_idx ] = psSS->xq_Q14; psDD->Q_Q10[ *smpl_buf_idx ] = psSS->Q_Q10; @@ -728,7 +788,7 @@ static OPUS_INLINE void silk_nsq_del_dec_scale_states_sse4_1( const silk_encoder_state *psEncC, /* I Encoder State */ silk_nsq_state *NSQ, /* I/O NSQ state */ NSQ_del_dec_struct psDelDec[], /* I/O Delayed decision states */ - const opus_int32 x_Q3[], /* I Input in Q3 */ + const opus_int16 x16[], /* I Input */ opus_int32 x_sc_Q10[], /* O Input scaled with 1/Gain in Q10 */ const opus_int16 sLTP[], /* I Re-whitened LTP state in Q0 */ opus_int32 sLTP_Q15[], /* O LTP state matching scaled input */ @@ -742,51 +802,41 @@ static OPUS_INLINE void silk_nsq_del_dec_scale_states_sse4_1( ) { opus_int i, k, lag; - opus_int32 gain_adj_Q16, inv_gain_Q31, inv_gain_Q23; + opus_int32 gain_adj_Q16, inv_gain_Q31, inv_gain_Q26; NSQ_del_dec_struct *psDD; - __m128i xmm_inv_gain_Q23, xmm_x_Q3_x2x0, xmm_x_Q3_x3x1; + __m128i xmm_inv_gain_Q26, xmm_x16_x2x0, xmm_x16_x3x1; lag = pitchL[ subfr ]; inv_gain_Q31 = silk_INVERSE32_varQ( silk_max( Gains_Q16[ subfr ], 1 ), 47 ); - silk_assert( inv_gain_Q31 != 0 ); - /* Calculate gain adjustment factor */ - if( Gains_Q16[ subfr ] != NSQ->prev_gain_Q16 ) { - gain_adj_Q16 = silk_DIV32_varQ( NSQ->prev_gain_Q16, Gains_Q16[ subfr ], 16 ); - } else { - gain_adj_Q16 = (opus_int32)1 << 16; - } - /* Scale input */ - inv_gain_Q23 = silk_RSHIFT_ROUND( inv_gain_Q31, 8 ); + inv_gain_Q26 = silk_RSHIFT_ROUND( inv_gain_Q31, 5 ); - /* prepare inv_gain_Q23 in packed 4 32-bits */ - xmm_inv_gain_Q23 = _mm_set1_epi32(inv_gain_Q23); + /* prepare inv_gain_Q26 in packed 4 32-bits */ + xmm_inv_gain_Q26 = _mm_set1_epi32(inv_gain_Q26); for( i = 0; i < psEncC->subfr_length - 3; i += 4 ) { - xmm_x_Q3_x2x0 = _mm_loadu_si128( (__m128i *)(&(x_Q3[ i ] ) ) ); + xmm_x16_x2x0 = OP_CVTEPI16_EPI32_M64( &(x16[ i ] ) ); + /* equal shift right 4 bytes*/ - xmm_x_Q3_x3x1 = _mm_shuffle_epi32( xmm_x_Q3_x2x0, _MM_SHUFFLE( 0, 3, 2, 1 ) ); + xmm_x16_x3x1 = _mm_shuffle_epi32( xmm_x16_x2x0, _MM_SHUFFLE( 0, 3, 2, 1 ) ); - xmm_x_Q3_x2x0 = _mm_mul_epi32( xmm_x_Q3_x2x0, xmm_inv_gain_Q23 ); - xmm_x_Q3_x3x1 = _mm_mul_epi32( xmm_x_Q3_x3x1, xmm_inv_gain_Q23 ); + xmm_x16_x2x0 = _mm_mul_epi32( xmm_x16_x2x0, xmm_inv_gain_Q26 ); + xmm_x16_x3x1 = _mm_mul_epi32( xmm_x16_x3x1, xmm_inv_gain_Q26 ); - xmm_x_Q3_x2x0 = _mm_srli_epi64( xmm_x_Q3_x2x0, 16 ); - xmm_x_Q3_x3x1 = _mm_slli_epi64( xmm_x_Q3_x3x1, 16 ); + xmm_x16_x2x0 = _mm_srli_epi64( xmm_x16_x2x0, 16 ); + xmm_x16_x3x1 = _mm_slli_epi64( xmm_x16_x3x1, 16 ); - xmm_x_Q3_x2x0 = _mm_blend_epi16( xmm_x_Q3_x2x0, xmm_x_Q3_x3x1, 0xCC ); + xmm_x16_x2x0 = _mm_blend_epi16( xmm_x16_x2x0, xmm_x16_x3x1, 0xCC ); - _mm_storeu_si128( (__m128i *)(&(x_sc_Q10[ i ])), xmm_x_Q3_x2x0 ); + _mm_storeu_si128( (__m128i *)(void*)(&(x_sc_Q10[ i ] ) ), xmm_x16_x2x0 ); } for( ; i < psEncC->subfr_length; i++ ) { - x_sc_Q10[ i ] = silk_SMULWW( x_Q3[ i ], inv_gain_Q23 ); + x_sc_Q10[ i ] = silk_SMULWW( x16[ i ], inv_gain_Q26 ); } - /* Save inverse gain */ - NSQ->prev_gain_Q16 = Gains_Q16[ subfr ]; - /* After rewhitening the LTP state is un-scaled, so scale with inv_gain_Q16 */ if( NSQ->rewhite_flag ) { if( subfr == 0 ) { @@ -800,7 +850,9 @@ static OPUS_INLINE void silk_nsq_del_dec_scale_states_sse4_1( } /* Adjust for changing gain */ - if( gain_adj_Q16 != (opus_int32)1 << 16 ) { + if( Gains_Q16[ subfr ] != NSQ->prev_gain_Q16 ) { + gain_adj_Q16 = silk_DIV32_varQ( NSQ->prev_gain_Q16, Gains_Q16[ subfr ], 16 ); + /* Scale long-term shaping state */ { __m128i xmm_gain_adj_Q16, xmm_sLTP_shp_Q14_x2x0, xmm_sLTP_shp_Q14_x3x1; @@ -810,7 +862,7 @@ static OPUS_INLINE void silk_nsq_del_dec_scale_states_sse4_1( for( i = NSQ->sLTP_shp_buf_idx - psEncC->ltp_mem_length; i < NSQ->sLTP_shp_buf_idx - 3; i += 4 ) { - xmm_sLTP_shp_Q14_x2x0 = _mm_loadu_si128( (__m128i *)(&(NSQ->sLTP_shp_Q14[ i ] ) ) ); + xmm_sLTP_shp_Q14_x2x0 = _mm_loadu_si128( (__m128i *)(void*)(&(NSQ->sLTP_shp_Q14[ i ] ) ) ); /* equal shift right 4 bytes*/ xmm_sLTP_shp_Q14_x3x1 = _mm_shuffle_epi32( xmm_sLTP_shp_Q14_x2x0, _MM_SHUFFLE( 0, 3, 2, 1 ) ); @@ -822,7 +874,7 @@ static OPUS_INLINE void silk_nsq_del_dec_scale_states_sse4_1( xmm_sLTP_shp_Q14_x2x0 = _mm_blend_epi16( xmm_sLTP_shp_Q14_x2x0, xmm_sLTP_shp_Q14_x3x1, 0xCC ); - _mm_storeu_si128( (__m128i *)(&(NSQ->sLTP_shp_Q14[ i ] ) ), xmm_sLTP_shp_Q14_x2x0 ); + _mm_storeu_si128( (__m128i *)(void*)(&(NSQ->sLTP_shp_Q14[ i ] ) ), xmm_sLTP_shp_Q14_x2x0 ); } for( ; i < NSQ->sLTP_shp_buf_idx; i++ ) { @@ -841,6 +893,7 @@ static OPUS_INLINE void silk_nsq_del_dec_scale_states_sse4_1( /* Scale scalar states */ psDD->LF_AR_Q14 = silk_SMULWW( gain_adj_Q16, psDD->LF_AR_Q14 ); + psDD->Diff_Q14 = silk_SMULWW( gain_adj_Q16, psDD->Diff_Q14 ); /* Scale short-term prediction and shaping states */ for( i = 0; i < NSQ_LPC_BUF_LENGTH; i++ ) { @@ -855,5 +908,8 @@ static OPUS_INLINE void silk_nsq_del_dec_scale_states_sse4_1( } } } + + /* Save inverse gain */ + NSQ->prev_gain_Q16 = Gains_Q16[ subfr ]; } } diff --git a/opus/silk/x86/NSQ_sse4_1.c b/opus/silk/x86/NSQ_sse4_1.c index b0315e35..3c9aca7b 100644 --- a/opus/silk/x86/NSQ_sse4_1.c +++ b/opus/silk/x86/NSQ_sse4_1.c @@ -1,5 +1,5 @@ -/* Copyright (c) 2014, Cisco Systems, INC - Written by XiangMingZhu WeiZhou MinPeng YanWang +/* Copyright (c) 2014-2020, Cisco Systems, INC + Written by XiangMingZhu WeiZhou MinPeng YanWang FrancisQuiers Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions @@ -37,17 +37,17 @@ #include "stack_alloc.h" static OPUS_INLINE void silk_nsq_scale_states_sse4_1( - const silk_encoder_state *psEncC, /* I Encoder State */ - silk_nsq_state *NSQ, /* I/O NSQ state */ - const opus_int32 x_Q3[], /* I input in Q3 */ - opus_int32 x_sc_Q10[], /* O input scaled with 1/Gain */ - const opus_int16 sLTP[], /* I re-whitened LTP state in Q0 */ - opus_int32 sLTP_Q15[], /* O LTP state matching scaled input */ - opus_int subfr, /* I subframe number */ - const opus_int LTP_scale_Q14, /* I */ - const opus_int32 Gains_Q16[ MAX_NB_SUBFR ], /* I */ - const opus_int pitchL[ MAX_NB_SUBFR ], /* I Pitch lag */ - const opus_int signal_type /* I Signal type */ + const silk_encoder_state *psEncC, /* I Encoder State */ + silk_nsq_state *NSQ, /* I/O NSQ state */ + const opus_int16 x16[], /* I input */ + opus_int32 x_sc_Q10[], /* O input scaled with 1/Gain */ + const opus_int16 sLTP[], /* I re-whitened LTP state in Q0 */ + opus_int32 sLTP_Q15[], /* O LTP state matching scaled input */ + opus_int subfr, /* I subframe number */ + const opus_int LTP_scale_Q14, /* I */ + const opus_int32 Gains_Q16[ MAX_NB_SUBFR ], /* I */ + const opus_int pitchL[ MAX_NB_SUBFR ], /* I Pitch lag */ + const opus_int signal_type /* I Signal type */ ); static OPUS_INLINE void silk_noise_shape_quantizer_10_16_sse4_1( @@ -65,27 +65,28 @@ static OPUS_INLINE void silk_noise_shape_quantizer_10_16_sse4_1( opus_int Tilt_Q14, /* I Spectral tilt */ opus_int32 LF_shp_Q14, /* I */ opus_int32 Gain_Q16, /* I */ + opus_int Lambda_Q10, /* I */ opus_int offset_Q10, /* I */ opus_int length, /* I Input length */ opus_int32 table[][4] /* I */ ); void silk_NSQ_sse4_1( - const silk_encoder_state *psEncC, /* I Encoder State */ - silk_nsq_state *NSQ, /* I/O NSQ state */ - SideInfoIndices *psIndices, /* I/O Quantization Indices */ - const opus_int32 x_Q3[], /* I Prefiltered input signal */ - opus_int8 pulses[], /* O Quantized pulse signal */ - const opus_int16 PredCoef_Q12[ 2 * MAX_LPC_ORDER ], /* I Short term prediction coefs */ - const opus_int16 LTPCoef_Q14[ LTP_ORDER * MAX_NB_SUBFR ], /* I Long term prediction coefs */ - const opus_int16 AR2_Q13[ MAX_NB_SUBFR * MAX_SHAPE_LPC_ORDER ], /* I Noise shaping coefs */ - const opus_int HarmShapeGain_Q14[ MAX_NB_SUBFR ], /* I Long term shaping coefs */ - const opus_int Tilt_Q14[ MAX_NB_SUBFR ], /* I Spectral tilt */ - const opus_int32 LF_shp_Q14[ MAX_NB_SUBFR ], /* I Low frequency shaping coefs */ - const opus_int32 Gains_Q16[ MAX_NB_SUBFR ], /* I Quantization step sizes */ - const opus_int pitchL[ MAX_NB_SUBFR ], /* I Pitch lags */ - const opus_int Lambda_Q10, /* I Rate/distortion tradeoff */ - const opus_int LTP_scale_Q14 /* I LTP state scaling */ + const silk_encoder_state *psEncC, /* I Encoder State */ + silk_nsq_state *NSQ, /* I/O NSQ state */ + SideInfoIndices *psIndices, /* I/O Quantization Indices */ + const opus_int16 x16[], /* I Input */ + opus_int8 pulses[], /* O Quantized pulse signal */ + const opus_int16 *PredCoef_Q12, /* I Short term prediction coefs */ + const opus_int16 LTPCoef_Q14[ LTP_ORDER * MAX_NB_SUBFR ], /* I Long term prediction coefs */ + const opus_int16 AR_Q13[ MAX_NB_SUBFR * MAX_SHAPE_LPC_ORDER ], /* I Noise shaping coefs */ + const opus_int HarmShapeGain_Q14[ MAX_NB_SUBFR ], /* I Long term shaping coefs */ + const opus_int Tilt_Q14[ MAX_NB_SUBFR ], /* I Spectral tilt */ + const opus_int32 LF_shp_Q14[ MAX_NB_SUBFR ], /* I Low frequency shaping coefs */ + const opus_int32 Gains_Q16[ MAX_NB_SUBFR ], /* I Quantization step sizes */ + const opus_int pitchL[ MAX_NB_SUBFR ], /* I Pitch lags */ + const opus_int Lambda_Q10, /* I Rate/distortion tradeoff */ + const opus_int LTP_scale_Q14 /* I LTP state scaling */ ) { opus_int k, lag, start_idx, LSF_interpolation_flag; @@ -101,8 +102,41 @@ void silk_NSQ_sse4_1( opus_int32 tmp1; opus_int32 q1_Q10, q2_Q10, rd1_Q20, rd2_Q20; +#ifdef OPUS_CHECK_ASM + silk_nsq_state NSQ_c; + SideInfoIndices psIndices_c; + opus_int8 pulses_c[ MAX_FRAME_LENGTH ]; + const opus_int8 *const pulses_a = pulses; +#endif + SAVE_STACK; +#ifdef OPUS_CHECK_ASM + ( void )pulses_a; + silk_memcpy( &NSQ_c, NSQ, sizeof( NSQ_c ) ); + silk_memcpy( &psIndices_c, psIndices, sizeof( psIndices_c ) ); + silk_assert( psEncC->nb_subfr * psEncC->subfr_length <= MAX_FRAME_LENGTH ); + silk_memcpy( pulses_c, pulses, psEncC->nb_subfr * psEncC->subfr_length * sizeof( pulses[0] ) ); + + silk_NSQ_c( + psEncC, + &NSQ_c, + &psIndices_c, + x16, + pulses_c, + PredCoef_Q12, + LTPCoef_Q14, + AR_Q13, + HarmShapeGain_Q14, + Tilt_Q14, + LF_shp_Q14, + Gains_Q16, + pitchL, + Lambda_Q10, + LTP_scale_Q14 + ); +#endif + NSQ->rand_seed = psIndices->Seed; /* Set unvoiced lag to the previous one, overwrite later for voiced */ @@ -172,8 +206,7 @@ void silk_NSQ_sse4_1( LSF_interpolation_flag = 1; } - ALLOC( sLTP_Q15, - psEncC->ltp_mem_length + psEncC->frame_length, opus_int32 ); + ALLOC( sLTP_Q15, psEncC->ltp_mem_length + psEncC->frame_length, opus_int32 ); ALLOC( sLTP, psEncC->ltp_mem_length + psEncC->frame_length, opus_int16 ); ALLOC( x_sc_Q10, psEncC->subfr_length, opus_int32 ); /* Set up pointers to start of sub frame */ @@ -183,7 +216,7 @@ void silk_NSQ_sse4_1( for( k = 0; k < psEncC->nb_subfr; k++ ) { A_Q12 = &PredCoef_Q12[ (( k >> 1 ) | ( 1 - LSF_interpolation_flag )) * MAX_LPC_ORDER ]; B_Q14 = <PCoef_Q14[ k * LTP_ORDER ]; - AR_shp_Q13 = &AR2_Q13[ k * MAX_SHAPE_LPC_ORDER ]; + AR_shp_Q13 = &AR_Q13[ k * MAX_SHAPE_LPC_ORDER ]; /* Noise shape parameters */ silk_assert( HarmShapeGain_Q14[ k ] >= 0 ); @@ -209,12 +242,12 @@ void silk_NSQ_sse4_1( } } - silk_nsq_scale_states_sse4_1( psEncC, NSQ, x_Q3, x_sc_Q10, sLTP, sLTP_Q15, k, LTP_scale_Q14, Gains_Q16, pitchL, psIndices->signalType ); + silk_nsq_scale_states_sse4_1( psEncC, NSQ, x16, x_sc_Q10, sLTP, sLTP_Q15, k, LTP_scale_Q14, Gains_Q16, pitchL, psIndices->signalType ); if ( opus_likely( ( 10 == psEncC->shapingLPCOrder ) && ( 16 == psEncC->predictLPCOrder) ) ) { silk_noise_shape_quantizer_10_16_sse4_1( NSQ, psIndices->signalType, x_sc_Q10, pulses, pxq, sLTP_Q15, A_Q12, B_Q14, - AR_shp_Q13, lag, HarmShapeFIRPacked_Q14, Tilt_Q14[ k ], LF_shp_Q14[ k ], Gains_Q16[ k ], + AR_shp_Q13, lag, HarmShapeFIRPacked_Q14, Tilt_Q14[ k ], LF_shp_Q14[ k ], Gains_Q16[ k ], Lambda_Q10, offset_Q10, psEncC->subfr_length, &(table[32]) ); } else @@ -224,7 +257,7 @@ void silk_NSQ_sse4_1( offset_Q10, psEncC->subfr_length, psEncC->shapingLPCOrder, psEncC->predictLPCOrder, psEncC->arch ); } - x_Q3 += psEncC->subfr_length; + x16 += psEncC->subfr_length; pulses += psEncC->subfr_length; pxq += psEncC->subfr_length; } @@ -235,12 +268,19 @@ void silk_NSQ_sse4_1( /* Save quantized speech and noise shaping signals */ silk_memmove( NSQ->xq, &NSQ->xq[ psEncC->frame_length ], psEncC->ltp_mem_length * sizeof( opus_int16 ) ); silk_memmove( NSQ->sLTP_shp_Q14, &NSQ->sLTP_shp_Q14[ psEncC->frame_length ], psEncC->ltp_mem_length * sizeof( opus_int32 ) ); + +#ifdef OPUS_CHECK_ASM + silk_assert( !memcmp( &NSQ_c, NSQ, sizeof( NSQ_c ) ) ); + silk_assert( !memcmp( &psIndices_c, psIndices, sizeof( psIndices_c ) ) ); + silk_assert( !memcmp( pulses_c, pulses_a, psEncC->nb_subfr * psEncC->subfr_length * sizeof( pulses[0] ) ) ); +#endif + RESTORE_STACK; } -/***********************************/ -/* silk_noise_shape_quantizer_10_16 */ -/***********************************/ +/************************************/ +/* silk_noise_shape_quantizer_10_16 */ +/************************************/ static OPUS_INLINE void silk_noise_shape_quantizer_10_16_sse4_1( silk_nsq_state *NSQ, /* I/O NSQ state */ opus_int signalType, /* I Signal type */ @@ -256,6 +296,7 @@ static OPUS_INLINE void silk_noise_shape_quantizer_10_16_sse4_1( opus_int Tilt_Q14, /* I Spectral tilt */ opus_int32 LF_shp_Q14, /* I */ opus_int32 Gain_Q16, /* I */ + opus_int Lambda_Q10, /* I */ opus_int offset_Q10, /* I */ opus_int length, /* I Input length */ opus_int32 table[][4] /* I */ @@ -264,7 +305,7 @@ static OPUS_INLINE void silk_noise_shape_quantizer_10_16_sse4_1( opus_int i; opus_int32 LTP_pred_Q13, LPC_pred_Q10, n_AR_Q12, n_LTP_Q13; opus_int32 n_LF_Q12, r_Q10, q1_Q0, q1_Q10, q2_Q10; - opus_int32 exc_Q14, LPC_exc_Q14, xq_Q14, Gain_Q10; + opus_int32 exc_Q14, LPC_exc_Q14, xq_Q14, Gain_Q10, sDiff_shp_Q14; opus_int32 tmp1, tmp2, sLF_AR_shp_Q14; opus_int32 *psLPC_Q14, *shp_lag_ptr, *pred_lag_ptr; @@ -279,6 +320,8 @@ static OPUS_INLINE void silk_noise_shape_quantizer_10_16_sse4_1( __m128i sAR2_Q14_hi_76543210, sAR2_Q14_lo_76543210; __m128i AR_shp_Q13_76543210; + int rdo_offset = (Lambda_Q10 >> 1) - 512; + shp_lag_ptr = &NSQ->sLTP_shp_Q14[ NSQ->sLTP_shp_buf_idx - lag + HARM_SHAPE_FIR_TAPS / 2 ]; pred_lag_ptr = &sLTP_Q15[ NSQ->sLTP_buf_idx - lag + LTP_ORDER / 2 ]; Gain_Q10 = silk_RSHIFT( Gain_Q16, 6 ); @@ -288,27 +331,28 @@ static OPUS_INLINE void silk_noise_shape_quantizer_10_16_sse4_1( sLF_AR_shp_Q14 = NSQ->sLF_AR_shp_Q14; xq_Q14 = psLPC_Q14[ 0 ]; + sDiff_shp_Q14 = NSQ->sDiff_shp_Q14; LTP_pred_Q13 = 0; /* load a_Q12 */ xmm_one = _mm_set_epi8( 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14 ); /* load a_Q12[0] - a_Q12[7] */ - a_Q12_01234567 = _mm_loadu_si128( (__m128i *)(&a_Q12[ 0 ] ) ); + a_Q12_01234567 = _mm_loadu_si128( (__m128i *)(void*)(&a_Q12[ 0 ] ) ); /* load a_Q12[ 8 ] - a_Q12[ 15 ] */ - a_Q12_89ABCDEF = _mm_loadu_si128( (__m128i *)(&a_Q12[ 8 ] ) ); + a_Q12_89ABCDEF = _mm_loadu_si128( (__m128i *)(void*)(&a_Q12[ 8 ] ) ); a_Q12_01234567 = _mm_shuffle_epi8( a_Q12_01234567, xmm_one ); a_Q12_89ABCDEF = _mm_shuffle_epi8( a_Q12_89ABCDEF, xmm_one ); /* load AR_shp_Q13 */ - AR_shp_Q13_76543210 = _mm_loadu_si128( (__m128i *)(&AR_shp_Q13[0] ) ); + AR_shp_Q13_76543210 = _mm_loadu_si128( (__m128i *)(void*)(&AR_shp_Q13[0] ) ); /* load psLPC_Q14 */ xmm_one = _mm_set_epi8(15, 14, 11, 10, 7, 6, 3, 2, 13, 12, 9, 8, 5, 4, 1, 0 ); - xmm_tempa = _mm_loadu_si128( (__m128i *)(&psLPC_Q14[-16]) ); - xmm_tempb = _mm_loadu_si128( (__m128i *)(&psLPC_Q14[-12]) ); + xmm_tempa = _mm_loadu_si128( (__m128i *)(void*)(&psLPC_Q14[-16]) ); + xmm_tempb = _mm_loadu_si128( (__m128i *)(void*)(&psLPC_Q14[-12]) ); xmm_tempa = _mm_shuffle_epi8( xmm_tempa, xmm_one ); xmm_tempb = _mm_shuffle_epi8( xmm_tempb, xmm_one ); @@ -316,8 +360,8 @@ static OPUS_INLINE void silk_noise_shape_quantizer_10_16_sse4_1( psLPC_Q14_hi_89ABCDEF = _mm_unpackhi_epi64( xmm_tempa, xmm_tempb ); psLPC_Q14_lo_89ABCDEF = _mm_unpacklo_epi64( xmm_tempa, xmm_tempb ); - xmm_tempa = _mm_loadu_si128( (__m128i *)(&psLPC_Q14[ -8 ]) ); - xmm_tempb = _mm_loadu_si128( (__m128i *)(&psLPC_Q14[ -4 ]) ); + xmm_tempa = _mm_loadu_si128( (__m128i *)(void*)(&psLPC_Q14[ -8 ]) ); + xmm_tempb = _mm_loadu_si128( (__m128i *)(void*)(&psLPC_Q14[ -4 ]) ); xmm_tempa = _mm_shuffle_epi8( xmm_tempa, xmm_one ); xmm_tempb = _mm_shuffle_epi8( xmm_tempb, xmm_one ); @@ -326,8 +370,8 @@ static OPUS_INLINE void silk_noise_shape_quantizer_10_16_sse4_1( psLPC_Q14_lo_01234567 = _mm_unpacklo_epi64( xmm_tempa, xmm_tempb ); /* load sAR2_Q14 */ - xmm_tempa = _mm_loadu_si128( (__m128i *)(&(NSQ->sAR2_Q14[ 0 ]) ) ); - xmm_tempb = _mm_loadu_si128( (__m128i *)(&(NSQ->sAR2_Q14[ 4 ]) ) ); + xmm_tempa = _mm_loadu_si128( (__m128i *)(void*)(&(NSQ->sAR2_Q14[ 0 ]) ) ); + xmm_tempb = _mm_loadu_si128( (__m128i *)(void*)(&(NSQ->sAR2_Q14[ 4 ]) ) ); xmm_tempa = _mm_shuffle_epi8( xmm_tempa, xmm_one ); xmm_tempb = _mm_shuffle_epi8( xmm_tempb, xmm_one ); @@ -399,7 +443,7 @@ static OPUS_INLINE void silk_noise_shape_quantizer_10_16_sse4_1( b_Q14_0123 = _mm_shuffle_epi32( b_Q14_3210, 0x1B ); /* loaded: [0] [-1] [-2] [-3] */ - pred_lag_ptr_0123 = _mm_loadu_si128( (__m128i *)(&pred_lag_ptr[ -3 ] ) ); + pred_lag_ptr_0123 = _mm_loadu_si128( (__m128i *)(void*)(&pred_lag_ptr[ -3 ] ) ); /* shuffle to [-3] [-2] [-1] [0] and to new xmm */ xmm_tempa = _mm_shuffle_epi32( pred_lag_ptr_0123, 0x1B ); /*64-bit multiply, a[2] * b[-2], a[0] * b[0] */ @@ -430,8 +474,8 @@ static OPUS_INLINE void silk_noise_shape_quantizer_10_16_sse4_1( sAR2_Q14_hi_76543210 = _mm_slli_si128( sAR2_Q14_hi_76543210, 2 ); sAR2_Q14_lo_76543210 = _mm_slli_si128( sAR2_Q14_lo_76543210, 2 ); - sAR2_Q14_hi_76543210 = _mm_insert_epi16( sAR2_Q14_hi_76543210, (xq_Q14 >> 16), 0 ); - sAR2_Q14_lo_76543210 = _mm_insert_epi16( sAR2_Q14_lo_76543210, (xq_Q14), 0 ); + sAR2_Q14_hi_76543210 = _mm_insert_epi16( sAR2_Q14_hi_76543210, (sDiff_shp_Q14 >> 16), 0 ); + sAR2_Q14_lo_76543210 = _mm_insert_epi16( sAR2_Q14_lo_76543210, (sDiff_shp_Q14), 0 ); /* high part, use pmaddwd, results in 4 32-bit */ xmm_hi_07 = _mm_madd_epi16( sAR2_Q14_hi_76543210, AR_shp_Q13_76543210 ); @@ -462,14 +506,14 @@ static OPUS_INLINE void silk_noise_shape_quantizer_10_16_sse4_1( n_LF_Q12 = silk_SMULWB( NSQ->sLTP_shp_Q14[ NSQ->sLTP_shp_buf_idx - 1 ], LF_shp_Q14 ); n_LF_Q12 = silk_SMLAWT( n_LF_Q12, sLF_AR_shp_Q14, LF_shp_Q14 ); - silk_assert( lag > 0 || signalType != TYPE_VOICED ); + celt_assert( lag > 0 || signalType != TYPE_VOICED ); /* Combine prediction and noise shaping signals */ tmp1 = silk_SUB32( silk_LSHIFT32( LPC_pred_Q10, 2 ), n_AR_Q12 ); /* Q12 */ tmp1 = silk_SUB32( tmp1, n_LF_Q12 ); /* Q12 */ if( lag > 0 ) { /* Symmetric, packed FIR coefficients */ - n_LTP_Q13 = silk_SMULWB( silk_ADD32( shp_lag_ptr[ 0 ], shp_lag_ptr[ -2 ] ), HarmShapeFIRPacked_Q14 ); + n_LTP_Q13 = silk_SMULWB( silk_ADD_SAT32( shp_lag_ptr[ 0 ], shp_lag_ptr[ -2 ] ), HarmShapeFIRPacked_Q14 ); n_LTP_Q13 = silk_SMLAWT( n_LTP_Q13, shp_lag_ptr[ -1 ], HarmShapeFIRPacked_Q14 ); n_LTP_Q13 = silk_LSHIFT( n_LTP_Q13, 1 ); shp_lag_ptr++; @@ -495,6 +539,18 @@ static OPUS_INLINE void silk_noise_shape_quantizer_10_16_sse4_1( /* Find two quantization level candidates and measure their rate-distortion */ q1_Q10 = silk_SUB32( r_Q10, offset_Q10 ); q1_Q0 = silk_RSHIFT( q1_Q10, 10 ); + if (Lambda_Q10 > 2048) { + /* For aggressive RDO, the bias becomes more than one pulse. */ + if (q1_Q10 > rdo_offset) { + q1_Q0 = silk_RSHIFT( q1_Q10 - rdo_offset, 10 ); + } else if (q1_Q10 < -rdo_offset) { + q1_Q0 = silk_RSHIFT( q1_Q10 + rdo_offset, 10 ); + } else if (q1_Q10 < 0) { + q1_Q0 = -1; + } else { + q1_Q0 = 0; + } + } q1_Q10 = table[q1_Q0][0]; q2_Q10 = table[q1_Q0][1]; @@ -519,7 +575,8 @@ static OPUS_INLINE void silk_noise_shape_quantizer_10_16_sse4_1( /* Update states */ psLPC_Q14++; *psLPC_Q14 = xq_Q14; - sLF_AR_shp_Q14 = silk_SUB_LSHIFT32( xq_Q14, n_AR_Q12, 2 ); + NSQ->sDiff_shp_Q14 = silk_SUB_LSHIFT32( xq_Q14, x_sc_Q10[ i ], 4 ); + sLF_AR_shp_Q14 = silk_SUB_LSHIFT32( NSQ->sDiff_shp_Q14, n_AR_Q12, 2 ); NSQ->sLTP_shp_Q14[ NSQ->sLTP_shp_buf_idx ] = silk_SUB_LSHIFT32( sLF_AR_shp_Q14, n_LF_Q12, 2 ); sLTP_Q15[ NSQ->sLTP_buf_idx ] = silk_LSHIFT( LPC_exc_Q14, 1 ); @@ -538,8 +595,8 @@ static OPUS_INLINE void silk_noise_shape_quantizer_10_16_sse4_1( /* write back sAR2_Q14 */ xmm_tempa = _mm_unpackhi_epi16( sAR2_Q14_lo_76543210, sAR2_Q14_hi_76543210 ); xmm_tempb = _mm_unpacklo_epi16( sAR2_Q14_lo_76543210, sAR2_Q14_hi_76543210 ); - _mm_storeu_si128( (__m128i *)(&NSQ->sAR2_Q14[ 4 ]), xmm_tempa ); - _mm_storeu_si128( (__m128i *)(&NSQ->sAR2_Q14[ 0 ]), xmm_tempb ); + _mm_storeu_si128( (__m128i *)(void*)(&NSQ->sAR2_Q14[ 4 ]), xmm_tempa ); + _mm_storeu_si128( (__m128i *)(void*)(&NSQ->sAR2_Q14[ 0 ]), xmm_tempb ); /* xq[ i ] = (opus_int16)silk_SAT16( silk_RSHIFT_ROUND( silk_SMULWW( psLPC_Q14[ i ], Gain_Q10 ), 8 ) ); */ { @@ -555,8 +612,8 @@ static OPUS_INLINE void silk_noise_shape_quantizer_10_16_sse4_1( /* process xq */ for (i = 0; i < length - 7; i += 8) { - xmm_xq_Q14_3210 = _mm_loadu_si128( (__m128i *)(&(psLPC_Q14[ i + 0 ] ) ) ); - xmm_xq_Q14_7654 = _mm_loadu_si128( (__m128i *)(&(psLPC_Q14[ i + 4 ] ) ) ); + xmm_xq_Q14_3210 = _mm_loadu_si128( (__m128i *)(void*)(&(psLPC_Q14[ i + 0 ] ) ) ); + xmm_xq_Q14_7654 = _mm_loadu_si128( (__m128i *)(void*)(&(psLPC_Q14[ i + 4 ] ) ) ); /* equal shift right 4 bytes*/ xmm_xq_Q14_x3x1 = _mm_shuffle_epi32( xmm_xq_Q14_3210, _MM_SHUFFLE( 0, 3, 2, 1 ) ); @@ -587,7 +644,7 @@ static OPUS_INLINE void silk_noise_shape_quantizer_10_16_sse4_1( xmm_xq_Q14_3210 = _mm_packs_epi32( xmm_xq_Q14_3210, xmm_xq_Q14_7654 ); /* save to xq */ - _mm_storeu_si128( (__m128i *)(&xq[ i ] ), xmm_xq_Q14_3210 ); + _mm_storeu_si128( (__m128i *)(void*)(&xq[ i ] ), xmm_xq_Q14_3210 ); } } for ( ; i < length; i++) @@ -600,64 +657,54 @@ static OPUS_INLINE void silk_noise_shape_quantizer_10_16_sse4_1( } static OPUS_INLINE void silk_nsq_scale_states_sse4_1( - const silk_encoder_state *psEncC, /* I Encoder State */ - silk_nsq_state *NSQ, /* I/O NSQ state */ - const opus_int32 x_Q3[], /* I input in Q3 */ - opus_int32 x_sc_Q10[], /* O input scaled with 1/Gain */ - const opus_int16 sLTP[], /* I re-whitened LTP state in Q0 */ - opus_int32 sLTP_Q15[], /* O LTP state matching scaled input */ - opus_int subfr, /* I subframe number */ - const opus_int LTP_scale_Q14, /* I */ - const opus_int32 Gains_Q16[ MAX_NB_SUBFR ], /* I */ - const opus_int pitchL[ MAX_NB_SUBFR ], /* I Pitch lag */ - const opus_int signal_type /* I Signal type */ + const silk_encoder_state *psEncC, /* I Encoder State */ + silk_nsq_state *NSQ, /* I/O NSQ state */ + const opus_int16 x16[], /* I input */ + opus_int32 x_sc_Q10[], /* O input scaled with 1/Gain */ + const opus_int16 sLTP[], /* I re-whitened LTP state in Q0 */ + opus_int32 sLTP_Q15[], /* O LTP state matching scaled input */ + opus_int subfr, /* I subframe number */ + const opus_int LTP_scale_Q14, /* I */ + const opus_int32 Gains_Q16[ MAX_NB_SUBFR ], /* I */ + const opus_int pitchL[ MAX_NB_SUBFR ], /* I Pitch lag */ + const opus_int signal_type /* I Signal type */ ) { opus_int i, lag; - opus_int32 gain_adj_Q16, inv_gain_Q31, inv_gain_Q23; - __m128i xmm_inv_gain_Q23, xmm_x_Q3_x2x0, xmm_x_Q3_x3x1; + opus_int32 gain_adj_Q16, inv_gain_Q31, inv_gain_Q26; + __m128i xmm_inv_gain_Q26, xmm_x16_x2x0, xmm_x16_x3x1; lag = pitchL[ subfr ]; inv_gain_Q31 = silk_INVERSE32_varQ( silk_max( Gains_Q16[ subfr ], 1 ), 47 ); silk_assert( inv_gain_Q31 != 0 ); - /* Calculate gain adjustment factor */ - if( Gains_Q16[ subfr ] != NSQ->prev_gain_Q16 ) { - gain_adj_Q16 = silk_DIV32_varQ( NSQ->prev_gain_Q16, Gains_Q16[ subfr ], 16 ); - } else { - gain_adj_Q16 = (opus_int32)1 << 16; - } - /* Scale input */ - inv_gain_Q23 = silk_RSHIFT_ROUND( inv_gain_Q31, 8 ); + inv_gain_Q26 = silk_RSHIFT_ROUND( inv_gain_Q31, 5 ); - /* prepare inv_gain_Q23 in packed 4 32-bits */ - xmm_inv_gain_Q23 = _mm_set1_epi32(inv_gain_Q23); + /* prepare inv_gain_Q26 in packed 4 32-bits */ + xmm_inv_gain_Q26 = _mm_set1_epi32(inv_gain_Q26); for( i = 0; i < psEncC->subfr_length - 3; i += 4 ) { - xmm_x_Q3_x2x0 = _mm_loadu_si128( (__m128i *)(&(x_Q3[ i ] ) ) ); + xmm_x16_x2x0 = OP_CVTEPI16_EPI32_M64( &(x16[ i ] ) ); /* equal shift right 4 bytes*/ - xmm_x_Q3_x3x1 = _mm_shuffle_epi32( xmm_x_Q3_x2x0, _MM_SHUFFLE( 0, 3, 2, 1 ) ); + xmm_x16_x3x1 = _mm_shuffle_epi32( xmm_x16_x2x0, _MM_SHUFFLE( 0, 3, 2, 1 ) ); - xmm_x_Q3_x2x0 = _mm_mul_epi32( xmm_x_Q3_x2x0, xmm_inv_gain_Q23 ); - xmm_x_Q3_x3x1 = _mm_mul_epi32( xmm_x_Q3_x3x1, xmm_inv_gain_Q23 ); + xmm_x16_x2x0 = _mm_mul_epi32( xmm_x16_x2x0, xmm_inv_gain_Q26 ); + xmm_x16_x3x1 = _mm_mul_epi32( xmm_x16_x3x1, xmm_inv_gain_Q26 ); - xmm_x_Q3_x2x0 = _mm_srli_epi64( xmm_x_Q3_x2x0, 16 ); - xmm_x_Q3_x3x1 = _mm_slli_epi64( xmm_x_Q3_x3x1, 16 ); + xmm_x16_x2x0 = _mm_srli_epi64( xmm_x16_x2x0, 16 ); + xmm_x16_x3x1 = _mm_slli_epi64( xmm_x16_x3x1, 16 ); - xmm_x_Q3_x2x0 = _mm_blend_epi16( xmm_x_Q3_x2x0, xmm_x_Q3_x3x1, 0xCC ); + xmm_x16_x2x0 = _mm_blend_epi16( xmm_x16_x2x0, xmm_x16_x3x1, 0xCC ); - _mm_storeu_si128( (__m128i *)(&(x_sc_Q10[ i ] ) ), xmm_x_Q3_x2x0 ); + _mm_storeu_si128( (__m128i *)(void*)(&(x_sc_Q10[ i ] ) ), xmm_x16_x2x0 ); } for( ; i < psEncC->subfr_length; i++ ) { - x_sc_Q10[ i ] = silk_SMULWW( x_Q3[ i ], inv_gain_Q23 ); + x_sc_Q10[ i ] = silk_SMULWW( x16[ i ], inv_gain_Q26 ); } - /* Save inverse gain */ - NSQ->prev_gain_Q16 = Gains_Q16[ subfr ]; - /* After rewhitening the LTP state is un-scaled, so scale with inv_gain_Q16 */ if( NSQ->rewhite_flag ) { if( subfr == 0 ) { @@ -671,16 +718,18 @@ static OPUS_INLINE void silk_nsq_scale_states_sse4_1( } /* Adjust for changing gain */ - if( gain_adj_Q16 != (opus_int32)1 << 16 ) { - /* Scale long-term shaping state */ + if( Gains_Q16[ subfr ] != NSQ->prev_gain_Q16 ) { __m128i xmm_gain_adj_Q16, xmm_sLTP_shp_Q14_x2x0, xmm_sLTP_shp_Q14_x3x1; + gain_adj_Q16 = silk_DIV32_varQ( NSQ->prev_gain_Q16, Gains_Q16[ subfr ], 16 ); + + /* Scale long-term shaping state */ /* prepare gain_adj_Q16 in packed 4 32-bits */ xmm_gain_adj_Q16 = _mm_set1_epi32(gain_adj_Q16); for( i = NSQ->sLTP_shp_buf_idx - psEncC->ltp_mem_length; i < NSQ->sLTP_shp_buf_idx - 3; i += 4 ) { - xmm_sLTP_shp_Q14_x2x0 = _mm_loadu_si128( (__m128i *)(&(NSQ->sLTP_shp_Q14[ i ] ) ) ); + xmm_sLTP_shp_Q14_x2x0 = _mm_loadu_si128( (__m128i *)(void*)(&(NSQ->sLTP_shp_Q14[ i ] ) ) ); /* equal shift right 4 bytes*/ xmm_sLTP_shp_Q14_x3x1 = _mm_shuffle_epi32( xmm_sLTP_shp_Q14_x2x0, _MM_SHUFFLE( 0, 3, 2, 1 ) ); @@ -692,7 +741,7 @@ static OPUS_INLINE void silk_nsq_scale_states_sse4_1( xmm_sLTP_shp_Q14_x2x0 = _mm_blend_epi16( xmm_sLTP_shp_Q14_x2x0, xmm_sLTP_shp_Q14_x3x1, 0xCC ); - _mm_storeu_si128( (__m128i *)(&(NSQ->sLTP_shp_Q14[ i ] ) ), xmm_sLTP_shp_Q14_x2x0 ); + _mm_storeu_si128( (__m128i *)(void*)(&(NSQ->sLTP_shp_Q14[ i ] ) ), xmm_sLTP_shp_Q14_x2x0 ); } for( ; i < NSQ->sLTP_shp_buf_idx; i++ ) { @@ -707,6 +756,7 @@ static OPUS_INLINE void silk_nsq_scale_states_sse4_1( } NSQ->sLF_AR_shp_Q14 = silk_SMULWW( gain_adj_Q16, NSQ->sLF_AR_shp_Q14 ); + NSQ->sDiff_shp_Q14 = silk_SMULWW( gain_adj_Q16, NSQ->sDiff_shp_Q14 ); /* Scale short-term prediction and shaping states */ for( i = 0; i < NSQ_LPC_BUF_LENGTH; i++ ) { @@ -715,5 +765,8 @@ static OPUS_INLINE void silk_nsq_scale_states_sse4_1( for( i = 0; i < MAX_SHAPE_LPC_ORDER; i++ ) { NSQ->sAR2_Q14[ i ] = silk_SMULWW( gain_adj_Q16, NSQ->sAR2_Q14[ i ] ); } + + /* Save inverse gain */ + NSQ->prev_gain_Q16 = Gains_Q16[ subfr ]; } } diff --git a/opus/silk/x86/SigProc_FIX_sse.h b/opus/silk/x86/SigProc_FIX_sse.h index 61efa8da..89a5ec88 100644 --- a/opus/silk/x86/SigProc_FIX_sse.h +++ b/opus/silk/x86/SigProc_FIX_sse.h @@ -26,13 +26,13 @@ */ #ifndef SIGPROC_FIX_SSE_H -#define SIGPROC_FIX_SSE_H +# define SIGPROC_FIX_SSE_H -#ifdef HAVE_CONFIG_H -#include "config.h" -#endif +# ifdef HAVE_CONFIG_H +# include "config.h" +# endif -#if defined(OPUS_X86_MAY_HAVE_SSE4_1) +# if defined(OPUS_X86_MAY_HAVE_SSE4_1) void silk_burg_modified_sse4_1( opus_int32 *res_nrg, /* O Residual energy */ opus_int *res_nrg_Q, /* O Residual energy Q value */ @@ -45,11 +45,13 @@ void silk_burg_modified_sse4_1( int arch /* I Run-time architecture */ ); -#if defined(OPUS_X86_PRESUME_SSE4_1) -#define silk_burg_modified(res_nrg, res_nrg_Q, A_Q16, x, minInvGain_Q30, subfr_length, nb_subfr, D, arch) \ - ((void)(arch), silk_burg_modified_sse4_1(res_nrg, res_nrg_Q, A_Q16, x, minInvGain_Q30, subfr_length, nb_subfr, D, arch)) +# if defined(OPUS_X86_PRESUME_SSE4_1) + +# define OVERRIDE_silk_burg_modified +# define silk_burg_modified(res_nrg, res_nrg_Q, A_Q16, x, minInvGain_Q30, subfr_length, nb_subfr, D, arch) \ + ((void)(arch), silk_burg_modified_sse4_1(res_nrg, res_nrg_Q, A_Q16, x, minInvGain_Q30, subfr_length, nb_subfr, D, arch)) -#else +# elif defined(OPUS_HAVE_RTCD) extern void (*const SILK_BURG_MODIFIED_IMPL[OPUS_ARCHMASK + 1])( opus_int32 *res_nrg, /* O Residual energy */ @@ -62,33 +64,36 @@ extern void (*const SILK_BURG_MODIFIED_IMPL[OPUS_ARCHMASK + 1])( const opus_int D, /* I Order */ int arch /* I Run-time architecture */); -# define silk_burg_modified(res_nrg, res_nrg_Q, A_Q16, x, minInvGain_Q30, subfr_length, nb_subfr, D, arch) \ - ((*SILK_BURG_MODIFIED_IMPL[(arch) & OPUS_ARCHMASK])(res_nrg, res_nrg_Q, A_Q16, x, minInvGain_Q30, subfr_length, nb_subfr, D, arch)) +# define OVERRIDE_silk_burg_modified +# define silk_burg_modified(res_nrg, res_nrg_Q, A_Q16, x, minInvGain_Q30, subfr_length, nb_subfr, D, arch) \ + ((*SILK_BURG_MODIFIED_IMPL[(arch) & OPUS_ARCHMASK])(res_nrg, res_nrg_Q, A_Q16, x, minInvGain_Q30, subfr_length, nb_subfr, D, arch)) -#endif +# endif -opus_int64 silk_inner_prod16_aligned_64_sse4_1( +opus_int64 silk_inner_prod16_sse4_1( const opus_int16 *inVec1, const opus_int16 *inVec2, const opus_int len ); -#if defined(OPUS_X86_PRESUME_SSE4_1) +# if defined(OPUS_X86_PRESUME_SSE4_1) -#define silk_inner_prod16_aligned_64(inVec1, inVec2, len, arch) \ - ((void)(arch),silk_inner_prod16_aligned_64_sse4_1(inVec1, inVec2, len)) +# define OVERRIDE_silk_inner_prod16 +# define silk_inner_prod16(inVec1, inVec2, len, arch) \ + ((void)(arch),silk_inner_prod16_sse4_1(inVec1, inVec2, len)) -#else +# elif defined(OPUS_HAVE_RTCD) -extern opus_int64 (*const SILK_INNER_PROD16_ALIGNED_64_IMPL[OPUS_ARCHMASK + 1])( +extern opus_int64 (*const SILK_INNER_PROD16_IMPL[OPUS_ARCHMASK + 1])( const opus_int16 *inVec1, const opus_int16 *inVec2, const opus_int len); -# define silk_inner_prod16_aligned_64(inVec1, inVec2, len, arch) \ - ((*SILK_INNER_PROD16_ALIGNED_64_IMPL[(arch) & OPUS_ARCHMASK])(inVec1, inVec2, len)) +# define OVERRIDE_silk_inner_prod16 +# define silk_inner_prod16(inVec1, inVec2, len, arch) \ + ((*SILK_INNER_PROD16_IMPL[(arch) & OPUS_ARCHMASK])(inVec1, inVec2, len)) -#endif -#endif +# endif +# endif #endif diff --git a/opus/silk/x86/VAD_sse4_1.c b/opus/silk/x86/VAD_sse4_1.c index d02ddf4a..9e06bc79 100644 --- a/opus/silk/x86/VAD_sse4_1.c +++ b/opus/silk/x86/VAD_sse4_1.c @@ -1,5 +1,5 @@ -/* Copyright (c) 2014, Cisco Systems, INC - Written by XiangMingZhu WeiZhou MinPeng YanWang +/* Copyright (c) 2014-2020, Cisco Systems, INC + Written by XiangMingZhu WeiZhou MinPeng YanWang FrancisQuiers Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions @@ -63,6 +63,14 @@ opus_int silk_VAD_GetSA_Q8_sse4_1( /* O Return value, 0 if s SAVE_STACK; +#ifdef OPUS_CHECK_ASM + silk_encoder_state psEncC_c; + opus_int ret_c; + + silk_memcpy( &psEncC_c, psEncC, sizeof( psEncC_c ) ); + ret_c = silk_VAD_GetSA_Q8_c( &psEncC_c, pIn ); +#endif + /* Safety checks */ silk_assert( VAD_N_BANDS == 4 ); celt_assert( MAX_FRAME_LENGTH >= psEncC->frame_length ); @@ -136,7 +144,7 @@ opus_int silk_VAD_GetSA_Q8_sse4_1( /* O Return value, 0 if s for( i = 0; i < dec_subframe_length - 7; i += 8 ) { - xmm_X = _mm_loadu_si128( (__m128i *)&(X[ X_offset[ b ] + i + dec_subframe_offset ] ) ); + xmm_X = _mm_loadu_si128( (__m128i *)(void*)&(X[ X_offset[ b ] + i + dec_subframe_offset ] ) ); xmm_X = _mm_srai_epi16( xmm_X, 3 ); xmm_X = _mm_madd_epi16( xmm_X, xmm_X ); xmm_acc = _mm_add_epi32( xmm_acc, xmm_X ); @@ -233,15 +241,14 @@ opus_int silk_VAD_GetSA_Q8_sse4_1( /* O Return value, 0 if s speech_nrg += ( b + 1 ) * silk_RSHIFT( Xnrg[ b ] - psSilk_VAD->NL[ b ], 4 ); } + if( psEncC->frame_length == 20 * psEncC->fs_kHz ) { + speech_nrg = silk_RSHIFT32( speech_nrg, 1 ); + } /* Power scaling */ if( speech_nrg <= 0 ) { SA_Q15 = silk_RSHIFT( SA_Q15, 1 ); - } else if( speech_nrg < 32768 ) { - if( psEncC->frame_length == 10 * psEncC->fs_kHz ) { - speech_nrg = silk_LSHIFT_SAT32( speech_nrg, 16 ); - } else { - speech_nrg = silk_LSHIFT_SAT32( speech_nrg, 15 ); - } + } else if( speech_nrg < 16384 ) { + speech_nrg = silk_LSHIFT32( speech_nrg, 16 ); /* square-root */ speech_nrg = silk_SQRT_APPROX( speech_nrg ); @@ -272,6 +279,11 @@ opus_int silk_VAD_GetSA_Q8_sse4_1( /* O Return value, 0 if s psEncC->input_quality_bands_Q15[ b ] = silk_sigm_Q15( silk_RSHIFT( SNR_Q7 - 16 * 128, 4 ) ); } +#ifdef OPUS_CHECK_ASM + silk_assert( ret == ret_c ); + silk_assert( !memcmp( &psEncC_c, psEncC, sizeof( psEncC_c ) ) ); +#endif + RESTORE_STACK; return( ret ); } diff --git a/opus/silk/x86/VQ_WMat_EC_sse4_1.c b/opus/silk/x86/VQ_WMat_EC_sse4_1.c index 74d6c6d0..df4626b6 100644 --- a/opus/silk/x86/VQ_WMat_EC_sse4_1.c +++ b/opus/silk/x86/VQ_WMat_EC_sse4_1.c @@ -1,5 +1,5 @@ -/* Copyright (c) 2014, Cisco Systems, INC - Written by XiangMingZhu WeiZhou MinPeng YanWang +/* Copyright (c) 2014-2020, Cisco Systems, INC + Written by XiangMingZhu WeiZhou MinPeng YanWang FrancisQuiers Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions @@ -38,105 +38,136 @@ /* Entropy constrained matrix-weighted VQ, hard-coded to 5-element vectors, for a single input data vector */ void silk_VQ_WMat_EC_sse4_1( opus_int8 *ind, /* O index of best codebook vector */ - opus_int32 *rate_dist_Q14, /* O best weighted quant error + mu * rate */ + opus_int32 *res_nrg_Q15, /* O best residual energy */ + opus_int32 *rate_dist_Q8, /* O best total bitrate */ opus_int *gain_Q7, /* O sum of absolute LTP coefficients */ - const opus_int16 *in_Q14, /* I input vector to be quantized */ - const opus_int32 *W_Q18, /* I weighting matrix */ + const opus_int32 *XX_Q17, /* I correlation matrix */ + const opus_int32 *xX_Q17, /* I correlation vector */ const opus_int8 *cb_Q7, /* I codebook */ const opus_uint8 *cb_gain_Q7, /* I codebook effective gain */ const opus_uint8 *cl_Q5, /* I code length for each codebook vector */ - const opus_int mu_Q9, /* I tradeoff betw. weighted error and rate */ + const opus_int subfr_len, /* I number of samples per subframe */ const opus_int32 max_gain_Q7, /* I maximum sum of absolute LTP coefficients */ - opus_int L /* I number of vectors in codebook */ + const opus_int L /* I number of vectors in codebook */ ) { opus_int k, gain_tmp_Q7; const opus_int8 *cb_row_Q7; - opus_int16 diff_Q14[ 5 ]; - opus_int32 sum1_Q14, sum2_Q16; + opus_int32 neg_xX_Q24[ 5 ]; + opus_int32 sum1_Q15, sum2_Q24; + opus_int32 bits_res_Q8, bits_tot_Q8; + __m128i v_XX_31_Q17, v_XX_42_Q17, v_cb_row_31_Q7, v_cb_row_42_Q7, v_acc1_Q24, v_acc2_Q24; + + /* Negate and convert to new Q domain */ + neg_xX_Q24[ 0 ] = -silk_LSHIFT32( xX_Q17[ 0 ], 7 ); + neg_xX_Q24[ 1 ] = -silk_LSHIFT32( xX_Q17[ 1 ], 7 ); + neg_xX_Q24[ 2 ] = -silk_LSHIFT32( xX_Q17[ 2 ], 7 ); + neg_xX_Q24[ 3 ] = -silk_LSHIFT32( xX_Q17[ 3 ], 7 ); + neg_xX_Q24[ 4 ] = -silk_LSHIFT32( xX_Q17[ 4 ], 7 ); + + v_XX_31_Q17 = _mm_loadu_si128( (__m128i *)(void*)(&XX_Q17[ 1 ] ) ); + v_XX_42_Q17 = _mm_shuffle_epi32( v_XX_31_Q17, _MM_SHUFFLE( 0, 3, 2, 1 ) ); - __m128i C_tmp1, C_tmp2, C_tmp3, C_tmp4, C_tmp5; /* Loop over codebook */ - *rate_dist_Q14 = silk_int32_MAX; + *rate_dist_Q8 = silk_int32_MAX; + *res_nrg_Q15 = silk_int32_MAX; cb_row_Q7 = cb_Q7; + /* If things go really bad, at least *ind is set to something safe. */ + *ind = 0; for( k = 0; k < L; k++ ) { + opus_int32 penalty; gain_tmp_Q7 = cb_gain_Q7[k]; - - diff_Q14[ 0 ] = in_Q14[ 0 ] - silk_LSHIFT( cb_row_Q7[ 0 ], 7 ); - - C_tmp1 = OP_CVTEPI16_EPI32_M64( &in_Q14[ 1 ] ); - C_tmp2 = OP_CVTEPI8_EPI32_M32( &cb_row_Q7[ 1 ] ); - C_tmp2 = _mm_slli_epi32( C_tmp2, 7 ); - C_tmp1 = _mm_sub_epi32( C_tmp1, C_tmp2 ); - - diff_Q14[ 1 ] = _mm_extract_epi16( C_tmp1, 0 ); - diff_Q14[ 2 ] = _mm_extract_epi16( C_tmp1, 2 ); - diff_Q14[ 3 ] = _mm_extract_epi16( C_tmp1, 4 ); - diff_Q14[ 4 ] = _mm_extract_epi16( C_tmp1, 6 ); - /* Weighted rate */ - sum1_Q14 = silk_SMULBB( mu_Q9, cl_Q5[ k ] ); + /* Quantization error: 1 - 2 * xX * cb + cb' * XX * cb */ + sum1_Q15 = SILK_FIX_CONST( 1.001, 15 ); /* Penalty for too large gain */ - sum1_Q14 = silk_ADD_LSHIFT32( sum1_Q14, silk_max( silk_SUB32( gain_tmp_Q7, max_gain_Q7 ), 0 ), 10 ); - - silk_assert( sum1_Q14 >= 0 ); - - /* first row of W_Q18 */ - C_tmp3 = _mm_loadu_si128( (__m128i *)(&W_Q18[ 1 ] ) ); - C_tmp4 = _mm_mul_epi32( C_tmp3, C_tmp1 ); - C_tmp4 = _mm_srli_si128( C_tmp4, 2 ); - - C_tmp1 = _mm_shuffle_epi32( C_tmp1, _MM_SHUFFLE( 0, 3, 2, 1 ) ); /* shift right 4 bytes */ - C_tmp3 = _mm_shuffle_epi32( C_tmp3, _MM_SHUFFLE( 0, 3, 2, 1 ) ); /* shift right 4 bytes */ - - C_tmp5 = _mm_mul_epi32( C_tmp3, C_tmp1 ); - C_tmp5 = _mm_srli_si128( C_tmp5, 2 ); - - C_tmp5 = _mm_add_epi32( C_tmp4, C_tmp5 ); - C_tmp5 = _mm_slli_epi32( C_tmp5, 1 ); - - C_tmp5 = _mm_add_epi32( C_tmp5, _mm_shuffle_epi32( C_tmp5, _MM_SHUFFLE( 0, 0, 0, 2 ) ) ); - sum2_Q16 = _mm_cvtsi128_si32( C_tmp5 ); - - sum2_Q16 = silk_SMLAWB( sum2_Q16, W_Q18[ 0 ], diff_Q14[ 0 ] ); - sum1_Q14 = silk_SMLAWB( sum1_Q14, sum2_Q16, diff_Q14[ 0 ] ); - - /* second row of W_Q18 */ - sum2_Q16 = silk_SMULWB( W_Q18[ 7 ], diff_Q14[ 2 ] ); - sum2_Q16 = silk_SMLAWB( sum2_Q16, W_Q18[ 8 ], diff_Q14[ 3 ] ); - sum2_Q16 = silk_SMLAWB( sum2_Q16, W_Q18[ 9 ], diff_Q14[ 4 ] ); - sum2_Q16 = silk_LSHIFT( sum2_Q16, 1 ); - sum2_Q16 = silk_SMLAWB( sum2_Q16, W_Q18[ 6 ], diff_Q14[ 1 ] ); - sum1_Q14 = silk_SMLAWB( sum1_Q14, sum2_Q16, diff_Q14[ 1 ] ); - - /* third row of W_Q18 */ - sum2_Q16 = silk_SMULWB( W_Q18[ 13 ], diff_Q14[ 3 ] ); - sum2_Q16 = silk_SMLAWB( sum2_Q16, W_Q18[ 14 ], diff_Q14[ 4 ] ); - sum2_Q16 = silk_LSHIFT( sum2_Q16, 1 ); - sum2_Q16 = silk_SMLAWB( sum2_Q16, W_Q18[ 12 ], diff_Q14[ 2 ] ); - sum1_Q14 = silk_SMLAWB( sum1_Q14, sum2_Q16, diff_Q14[ 2 ] ); - - /* fourth row of W_Q18 */ - sum2_Q16 = silk_SMULWB( W_Q18[ 19 ], diff_Q14[ 4 ] ); - sum2_Q16 = silk_LSHIFT( sum2_Q16, 1 ); - sum2_Q16 = silk_SMLAWB( sum2_Q16, W_Q18[ 18 ], diff_Q14[ 3 ] ); - sum1_Q14 = silk_SMLAWB( sum1_Q14, sum2_Q16, diff_Q14[ 3 ] ); - - /* last row of W_Q18 */ - sum2_Q16 = silk_SMULWB( W_Q18[ 24 ], diff_Q14[ 4 ] ); - sum1_Q14 = silk_SMLAWB( sum1_Q14, sum2_Q16, diff_Q14[ 4 ] ); - - silk_assert( sum1_Q14 >= 0 ); + penalty = silk_LSHIFT32( silk_max( silk_SUB32( gain_tmp_Q7, max_gain_Q7 ), 0 ), 11 ); + + /* first row of XX_Q17 */ + v_cb_row_31_Q7 = OP_CVTEPI8_EPI32_M32( &cb_row_Q7[ 1 ] ); + v_cb_row_42_Q7 = _mm_shuffle_epi32( v_cb_row_31_Q7, _MM_SHUFFLE( 0, 3, 2, 1 ) ); + v_cb_row_31_Q7 = _mm_mul_epi32( v_XX_31_Q17, v_cb_row_31_Q7 ); + v_cb_row_42_Q7 = _mm_mul_epi32( v_XX_42_Q17, v_cb_row_42_Q7 ); + v_acc1_Q24 = _mm_add_epi64( v_cb_row_31_Q7, v_cb_row_42_Q7); + v_acc2_Q24 = _mm_shuffle_epi32( v_acc1_Q24, _MM_SHUFFLE( 1, 0, 3, 2 ) ); + v_acc1_Q24 = _mm_add_epi64( v_acc1_Q24, v_acc2_Q24); + sum2_Q24 = _mm_cvtsi128_si32( v_acc1_Q24 ); + sum2_Q24 = silk_ADD32( neg_xX_Q24[ 0 ], sum2_Q24 ); + sum2_Q24 = silk_LSHIFT32( sum2_Q24, 1 ); + sum2_Q24 = silk_MLA( sum2_Q24, XX_Q17[ 0 ], cb_row_Q7[ 0 ] ); + sum1_Q15 = silk_SMLAWB( sum1_Q15, sum2_Q24, cb_row_Q7[ 0 ] ); + + /* second row of XX_Q17 */ + sum2_Q24 = silk_MLA( neg_xX_Q24[ 1 ], XX_Q17[ 7 ], cb_row_Q7[ 2 ] ); + sum2_Q24 = silk_MLA( sum2_Q24, XX_Q17[ 8 ], cb_row_Q7[ 3 ] ); + sum2_Q24 = silk_MLA( sum2_Q24, XX_Q17[ 9 ], cb_row_Q7[ 4 ] ); + sum2_Q24 = silk_LSHIFT32( sum2_Q24, 1 ); + sum2_Q24 = silk_MLA( sum2_Q24, XX_Q17[ 6 ], cb_row_Q7[ 1 ] ); + sum1_Q15 = silk_SMLAWB( sum1_Q15, sum2_Q24, cb_row_Q7[ 1 ] ); + + /* third row of XX_Q17 */ + sum2_Q24 = silk_MLA( neg_xX_Q24[ 2 ], XX_Q17[ 13 ], cb_row_Q7[ 3 ] ); + sum2_Q24 = silk_MLA( sum2_Q24, XX_Q17[ 14 ], cb_row_Q7[ 4 ] ); + sum2_Q24 = silk_LSHIFT32( sum2_Q24, 1 ); + sum2_Q24 = silk_MLA( sum2_Q24, XX_Q17[ 12 ], cb_row_Q7[ 2 ] ); + sum1_Q15 = silk_SMLAWB( sum1_Q15, sum2_Q24, cb_row_Q7[ 2 ] ); + + /* fourth row of XX_Q17 */ + sum2_Q24 = silk_MLA( neg_xX_Q24[ 3 ], XX_Q17[ 19 ], cb_row_Q7[ 4 ] ); + sum2_Q24 = silk_LSHIFT32( sum2_Q24, 1 ); + sum2_Q24 = silk_MLA( sum2_Q24, XX_Q17[ 18 ], cb_row_Q7[ 3 ] ); + sum1_Q15 = silk_SMLAWB( sum1_Q15, sum2_Q24, cb_row_Q7[ 3 ] ); + + /* last row of XX_Q17 */ + sum2_Q24 = silk_LSHIFT32( neg_xX_Q24[ 4 ], 1 ); + sum2_Q24 = silk_MLA( sum2_Q24, XX_Q17[ 24 ], cb_row_Q7[ 4 ] ); + sum1_Q15 = silk_SMLAWB( sum1_Q15, sum2_Q24, cb_row_Q7[ 4 ] ); /* find best */ - if( sum1_Q14 < *rate_dist_Q14 ) { - *rate_dist_Q14 = sum1_Q14; - *ind = (opus_int8)k; - *gain_Q7 = gain_tmp_Q7; + if( sum1_Q15 >= 0 ) { + /* Translate residual energy to bits using high-rate assumption (6 dB ==> 1 bit/sample) */ + bits_res_Q8 = silk_SMULBB( subfr_len, silk_lin2log( sum1_Q15 + penalty) - (15 << 7) ); + /* In the following line we reduce the codelength component by half ("-1"); seems to slightly improve quality */ + bits_tot_Q8 = silk_ADD_LSHIFT32( bits_res_Q8, cl_Q5[ k ], 3-1 ); + if( bits_tot_Q8 <= *rate_dist_Q8 ) { + *rate_dist_Q8 = bits_tot_Q8; + *res_nrg_Q15 = sum1_Q15 + penalty; + *ind = (opus_int8)k; + *gain_Q7 = gain_tmp_Q7; + } } /* Go to next cbk vector */ cb_row_Q7 += LTP_ORDER; } + +#ifdef OPUS_CHECK_ASM + { + opus_int8 ind_c = 0; + opus_int32 res_nrg_Q15_c = 0; + opus_int32 rate_dist_Q8_c = 0; + opus_int gain_Q7_c = 0; + + silk_VQ_WMat_EC_c( + &ind_c, + &res_nrg_Q15_c, + &rate_dist_Q8_c, + &gain_Q7_c, + XX_Q17, + xX_Q17, + cb_Q7, + cb_gain_Q7, + cl_Q5, + subfr_len, + max_gain_Q7, + L + ); + + silk_assert( *ind == ind_c ); + silk_assert( *res_nrg_Q15 == res_nrg_Q15_c ); + silk_assert( *rate_dist_Q8 == rate_dist_Q8_c ); + silk_assert( *gain_Q7 == gain_Q7_c ); + } +#endif } diff --git a/opus/silk/x86/main_sse.h b/opus/silk/x86/main_sse.h index 2f15d448..b254d53e 100644 --- a/opus/silk/x86/main_sse.h +++ b/opus/silk/x86/main_sse.h @@ -26,171 +26,195 @@ */ #ifndef MAIN_SSE_H -#define MAIN_SSE_H +# define MAIN_SSE_H -#ifdef HAVE_CONFIG_H -#include "config.h" -#endif +# ifdef HAVE_CONFIG_H +# include "config.h" +# endif # if defined(OPUS_X86_MAY_HAVE_SSE4_1) -#if 0 /* FIXME: SSE disabled until silk_VQ_WMat_EC_sse4_1() gets updated. */ -# define OVERRIDE_silk_VQ_WMat_EC - void silk_VQ_WMat_EC_sse4_1( opus_int8 *ind, /* O index of best codebook vector */ - opus_int32 *rate_dist_Q14, /* O best weighted quant error + mu * rate */ + opus_int32 *res_nrg_Q15, /* O best residual energy */ + opus_int32 *rate_dist_Q8, /* O best total bitrate */ opus_int *gain_Q7, /* O sum of absolute LTP coefficients */ - const opus_int16 *in_Q14, /* I input vector to be quantized */ - const opus_int32 *W_Q18, /* I weighting matrix */ + const opus_int32 *XX_Q17, /* I correlation matrix */ + const opus_int32 *xX_Q17, /* I correlation vector */ const opus_int8 *cb_Q7, /* I codebook */ const opus_uint8 *cb_gain_Q7, /* I codebook effective gain */ const opus_uint8 *cl_Q5, /* I code length for each codebook vector */ - const opus_int mu_Q9, /* I tradeoff betw. weighted error and rate */ + const opus_int subfr_len, /* I number of samples per subframe */ const opus_int32 max_gain_Q7, /* I maximum sum of absolute LTP coefficients */ - opus_int L /* I number of vectors in codebook */ + const opus_int L /* I number of vectors in codebook */ ); -#if defined OPUS_X86_PRESUME_SSE4_1 +# if defined OPUS_X86_PRESUME_SSE4_1 -#define silk_VQ_WMat_EC(ind, rate_dist_Q14, gain_Q7, in_Q14, W_Q18, cb_Q7, cb_gain_Q7, cl_Q5, \ - mu_Q9, max_gain_Q7, L, arch) \ - ((void)(arch),silk_VQ_WMat_EC_sse4_1(ind, rate_dist_Q14, gain_Q7, in_Q14, W_Q18, cb_Q7, cb_gain_Q7, cl_Q5, \ - mu_Q9, max_gain_Q7, L)) +# define OVERRIDE_silk_VQ_WMat_EC +# define silk_VQ_WMat_EC(ind, res_nrg_Q15, rate_dist_Q8, gain_Q7, XX_Q17, xX_Q17, cb_Q7, cb_gain_Q7, cl_Q5, \ + subfr_len, max_gain_Q7, L, arch) \ + ((void)(arch),silk_VQ_WMat_EC_sse4_1(ind, res_nrg_Q15, rate_dist_Q8, gain_Q7, XX_Q17, xX_Q17, cb_Q7, cb_gain_Q7, cl_Q5, \ + subfr_len, max_gain_Q7, L)) -#else +# elif defined(OPUS_HAVE_RTCD) extern void (*const SILK_VQ_WMAT_EC_IMPL[OPUS_ARCHMASK + 1])( opus_int8 *ind, /* O index of best codebook vector */ - opus_int32 *rate_dist_Q14, /* O best weighted quant error + mu * rate */ + opus_int32 *res_nrg_Q15, /* O best residual energy */ + opus_int32 *rate_dist_Q8, /* O best total bitrate */ opus_int *gain_Q7, /* O sum of absolute LTP coefficients */ - const opus_int16 *in_Q14, /* I input vector to be quantized */ - const opus_int32 *W_Q18, /* I weighting matrix */ + const opus_int32 *XX_Q17, /* I correlation matrix */ + const opus_int32 *xX_Q17, /* I correlation vector */ const opus_int8 *cb_Q7, /* I codebook */ const opus_uint8 *cb_gain_Q7, /* I codebook effective gain */ const opus_uint8 *cl_Q5, /* I code length for each codebook vector */ - const opus_int mu_Q9, /* I tradeoff betw. weighted error and rate */ + const opus_int subfr_len, /* I number of samples per subframe */ const opus_int32 max_gain_Q7, /* I maximum sum of absolute LTP coefficients */ - opus_int L /* I number of vectors in codebook */ + const opus_int L /* I number of vectors in codebook */ ); -# define silk_VQ_WMat_EC(ind, rate_dist_Q14, gain_Q7, in_Q14, W_Q18, cb_Q7, cb_gain_Q7, cl_Q5, \ - mu_Q9, max_gain_Q7, L, arch) \ - ((*SILK_VQ_WMAT_EC_IMPL[(arch) & OPUS_ARCHMASK])(ind, rate_dist_Q14, gain_Q7, in_Q14, W_Q18, cb_Q7, cb_gain_Q7, cl_Q5, \ - mu_Q9, max_gain_Q7, L)) +# define OVERRIDE_silk_VQ_WMat_EC +# define silk_VQ_WMat_EC(ind, res_nrg_Q15, rate_dist_Q8, gain_Q7, XX_Q17, xX_Q17, cb_Q7, cb_gain_Q7, cl_Q5, \ + subfr_len, max_gain_Q7, L, arch) \ + ((*SILK_VQ_WMAT_EC_IMPL[(arch) & OPUS_ARCHMASK])(ind, res_nrg_Q15, rate_dist_Q8, gain_Q7, XX_Q17, xX_Q17, cb_Q7, cb_gain_Q7, cl_Q5, \ + subfr_len, max_gain_Q7, L)) -#endif -#endif - -#if 0 /* FIXME: SSE disabled until the NSQ code gets updated. */ -# define OVERRIDE_silk_NSQ +# endif void silk_NSQ_sse4_1( - const silk_encoder_state *psEncC, /* I Encoder State */ - silk_nsq_state *NSQ, /* I/O NSQ state */ - SideInfoIndices *psIndices, /* I/O Quantization Indices */ - const opus_int32 x_Q3[], /* I Prefiltered input signal */ - opus_int8 pulses[], /* O Quantized pulse signal */ - const opus_int16 PredCoef_Q12[ 2 * MAX_LPC_ORDER ], /* I Short term prediction coefs */ - const opus_int16 LTPCoef_Q14[ LTP_ORDER * MAX_NB_SUBFR ], /* I Long term prediction coefs */ - const opus_int16 AR2_Q13[ MAX_NB_SUBFR * MAX_SHAPE_LPC_ORDER ], /* I Noise shaping coefs */ - const opus_int HarmShapeGain_Q14[ MAX_NB_SUBFR ], /* I Long term shaping coefs */ - const opus_int Tilt_Q14[ MAX_NB_SUBFR ], /* I Spectral tilt */ - const opus_int32 LF_shp_Q14[ MAX_NB_SUBFR ], /* I Low frequency shaping coefs */ - const opus_int32 Gains_Q16[ MAX_NB_SUBFR ], /* I Quantization step sizes */ - const opus_int pitchL[ MAX_NB_SUBFR ], /* I Pitch lags */ - const opus_int Lambda_Q10, /* I Rate/distortion tradeoff */ - const opus_int LTP_scale_Q14 /* I LTP state scaling */ + const silk_encoder_state *psEncC, /* I Encoder State */ + silk_nsq_state *NSQ, /* I/O NSQ state */ + SideInfoIndices *psIndices, /* I/O Quantization Indices */ + const opus_int16 x16[], /* I Input */ + opus_int8 pulses[], /* O Quantized pulse signal */ + const opus_int16 *PredCoef_Q12, /* I Short term prediction coefs */ + const opus_int16 LTPCoef_Q14[ LTP_ORDER * MAX_NB_SUBFR ], /* I Long term prediction coefs */ + const opus_int16 AR_Q13[ MAX_NB_SUBFR * MAX_SHAPE_LPC_ORDER ], /* I Noise shaping coefs */ + const opus_int HarmShapeGain_Q14[ MAX_NB_SUBFR ], /* I Long term shaping coefs */ + const opus_int Tilt_Q14[ MAX_NB_SUBFR ], /* I Spectral tilt */ + const opus_int32 LF_shp_Q14[ MAX_NB_SUBFR ], /* I Low frequency shaping coefs */ + const opus_int32 Gains_Q16[ MAX_NB_SUBFR ], /* I Quantization step sizes */ + const opus_int pitchL[ MAX_NB_SUBFR ], /* I Pitch lags */ + const opus_int Lambda_Q10, /* I Rate/distortion tradeoff */ + const opus_int LTP_scale_Q14 /* I LTP state scaling */ ); -#if defined OPUS_X86_PRESUME_SSE4_1 +# if defined OPUS_X86_PRESUME_SSE4_1 -#define silk_NSQ(psEncC, NSQ, psIndices, x_Q3, pulses, PredCoef_Q12, LTPCoef_Q14, AR2_Q13, \ - HarmShapeGain_Q14, Tilt_Q14, LF_shp_Q14, Gains_Q16, pitchL, Lambda_Q10, LTP_scale_Q14, arch) \ +# define OVERRIDE_silk_NSQ +# define silk_NSQ(psEncC, NSQ, psIndices, x_Q3, pulses, PredCoef_Q12, LTPCoef_Q14, AR2_Q13, \ + HarmShapeGain_Q14, Tilt_Q14, LF_shp_Q14, Gains_Q16, pitchL, Lambda_Q10, LTP_scale_Q14, arch) \ ((void)(arch),silk_NSQ_sse4_1(psEncC, NSQ, psIndices, x_Q3, pulses, PredCoef_Q12, LTPCoef_Q14, AR2_Q13, \ HarmShapeGain_Q14, Tilt_Q14, LF_shp_Q14, Gains_Q16, pitchL, Lambda_Q10, LTP_scale_Q14)) -#else +# elif defined(OPUS_HAVE_RTCD) extern void (*const SILK_NSQ_IMPL[OPUS_ARCHMASK + 1])( - const silk_encoder_state *psEncC, /* I Encoder State */ - silk_nsq_state *NSQ, /* I/O NSQ state */ - SideInfoIndices *psIndices, /* I/O Quantization Indices */ - const opus_int32 x_Q3[], /* I Prefiltered input signal */ - opus_int8 pulses[], /* O Quantized pulse signal */ - const opus_int16 PredCoef_Q12[ 2 * MAX_LPC_ORDER ], /* I Short term prediction coefs */ - const opus_int16 LTPCoef_Q14[ LTP_ORDER * MAX_NB_SUBFR ], /* I Long term prediction coefs */ - const opus_int16 AR2_Q13[ MAX_NB_SUBFR * MAX_SHAPE_LPC_ORDER ], /* I Noise shaping coefs */ - const opus_int HarmShapeGain_Q14[ MAX_NB_SUBFR ], /* I Long term shaping coefs */ - const opus_int Tilt_Q14[ MAX_NB_SUBFR ], /* I Spectral tilt */ - const opus_int32 LF_shp_Q14[ MAX_NB_SUBFR ], /* I Low frequency shaping coefs */ - const opus_int32 Gains_Q16[ MAX_NB_SUBFR ], /* I Quantization step sizes */ - const opus_int pitchL[ MAX_NB_SUBFR ], /* I Pitch lags */ - const opus_int Lambda_Q10, /* I Rate/distortion tradeoff */ - const opus_int LTP_scale_Q14 /* I LTP state scaling */ + const silk_encoder_state *psEncC, /* I Encoder State */ + silk_nsq_state *NSQ, /* I/O NSQ state */ + SideInfoIndices *psIndices, /* I/O Quantization Indices */ + const opus_int16 x16[], /* I Input */ + opus_int8 pulses[], /* O Quantized pulse signal */ + const opus_int16 *PredCoef_Q12, /* I Short term prediction coefs */ + const opus_int16 LTPCoef_Q14[ LTP_ORDER * MAX_NB_SUBFR ], /* I Long term prediction coefs */ + const opus_int16 AR_Q13[ MAX_NB_SUBFR * MAX_SHAPE_LPC_ORDER ], /* I Noise shaping coefs */ + const opus_int HarmShapeGain_Q14[ MAX_NB_SUBFR ], /* I Long term shaping coefs */ + const opus_int Tilt_Q14[ MAX_NB_SUBFR ], /* I Spectral tilt */ + const opus_int32 LF_shp_Q14[ MAX_NB_SUBFR ], /* I Low frequency shaping coefs */ + const opus_int32 Gains_Q16[ MAX_NB_SUBFR ], /* I Quantization step sizes */ + const opus_int pitchL[ MAX_NB_SUBFR ], /* I Pitch lags */ + const opus_int Lambda_Q10, /* I Rate/distortion tradeoff */ + const opus_int LTP_scale_Q14 /* I LTP state scaling */ ); -# define silk_NSQ(psEncC, NSQ, psIndices, x_Q3, pulses, PredCoef_Q12, LTPCoef_Q14, AR2_Q13, \ - HarmShapeGain_Q14, Tilt_Q14, LF_shp_Q14, Gains_Q16, pitchL, Lambda_Q10, LTP_scale_Q14, arch) \ +# define OVERRIDE_silk_NSQ +# define silk_NSQ(psEncC, NSQ, psIndices, x_Q3, pulses, PredCoef_Q12, LTPCoef_Q14, AR2_Q13, \ + HarmShapeGain_Q14, Tilt_Q14, LF_shp_Q14, Gains_Q16, pitchL, Lambda_Q10, LTP_scale_Q14, arch) \ ((*SILK_NSQ_IMPL[(arch) & OPUS_ARCHMASK])(psEncC, NSQ, psIndices, x_Q3, pulses, PredCoef_Q12, LTPCoef_Q14, AR2_Q13, \ HarmShapeGain_Q14, Tilt_Q14, LF_shp_Q14, Gains_Q16, pitchL, Lambda_Q10, LTP_scale_Q14)) -#endif - -# define OVERRIDE_silk_NSQ_del_dec +# endif void silk_NSQ_del_dec_sse4_1( - const silk_encoder_state *psEncC, /* I Encoder State */ - silk_nsq_state *NSQ, /* I/O NSQ state */ - SideInfoIndices *psIndices, /* I/O Quantization Indices */ - const opus_int32 x_Q3[], /* I Prefiltered input signal */ - opus_int8 pulses[], /* O Quantized pulse signal */ - const opus_int16 PredCoef_Q12[ 2 * MAX_LPC_ORDER ], /* I Short term prediction coefs */ - const opus_int16 LTPCoef_Q14[ LTP_ORDER * MAX_NB_SUBFR ], /* I Long term prediction coefs */ - const opus_int16 AR2_Q13[ MAX_NB_SUBFR * MAX_SHAPE_LPC_ORDER ], /* I Noise shaping coefs */ - const opus_int HarmShapeGain_Q14[ MAX_NB_SUBFR ], /* I Long term shaping coefs */ - const opus_int Tilt_Q14[ MAX_NB_SUBFR ], /* I Spectral tilt */ - const opus_int32 LF_shp_Q14[ MAX_NB_SUBFR ], /* I Low frequency shaping coefs */ - const opus_int32 Gains_Q16[ MAX_NB_SUBFR ], /* I Quantization step sizes */ - const opus_int pitchL[ MAX_NB_SUBFR ], /* I Pitch lags */ - const opus_int Lambda_Q10, /* I Rate/distortion tradeoff */ - const opus_int LTP_scale_Q14 /* I LTP state scaling */ + const silk_encoder_state *psEncC, /* I Encoder State */ + silk_nsq_state *NSQ, /* I/O NSQ state */ + SideInfoIndices *psIndices, /* I/O Quantization Indices */ + const opus_int16 x16[], /* I Input */ + opus_int8 pulses[], /* O Quantized pulse signal */ + const opus_int16 *PredCoef_Q12, /* I Short term prediction coefs */ + const opus_int16 LTPCoef_Q14[ LTP_ORDER * MAX_NB_SUBFR ], /* I Long term prediction coefs */ + const opus_int16 AR_Q13[ MAX_NB_SUBFR * MAX_SHAPE_LPC_ORDER ], /* I Noise shaping coefs */ + const opus_int HarmShapeGain_Q14[ MAX_NB_SUBFR ], /* I Long term shaping coefs */ + const opus_int Tilt_Q14[ MAX_NB_SUBFR ], /* I Spectral tilt */ + const opus_int32 LF_shp_Q14[ MAX_NB_SUBFR ], /* I Low frequency shaping coefs */ + const opus_int32 Gains_Q16[ MAX_NB_SUBFR ], /* I Quantization step sizes */ + const opus_int pitchL[ MAX_NB_SUBFR ], /* I Pitch lags */ + const opus_int Lambda_Q10, /* I Rate/distortion tradeoff */ + const opus_int LTP_scale_Q14 /* I LTP state scaling */ +); + +void silk_NSQ_del_dec_avx2( + const silk_encoder_state *psEncC, /* I Encoder State */ + silk_nsq_state *NSQ, /* I/O NSQ state */ + SideInfoIndices *psIndices, /* I/O Quantization Indices */ + const opus_int16 x16[], /* I Input */ + opus_int8 pulses[], /* O Quantized pulse signal */ + const opus_int16 *PredCoef_Q12, /* I Short term prediction coefs */ + const opus_int16 LTPCoef_Q14[LTP_ORDER * MAX_NB_SUBFR], /* I Long term prediction coefs */ + const opus_int16 AR_Q13[MAX_NB_SUBFR * MAX_SHAPE_LPC_ORDER], /* I Noise shaping coefs */ + const opus_int HarmShapeGain_Q14[MAX_NB_SUBFR], /* I Long term shaping coefs */ + const opus_int Tilt_Q14[MAX_NB_SUBFR], /* I Spectral tilt */ + const opus_int32 LF_shp_Q14[MAX_NB_SUBFR], /* I Low frequency shaping coefs */ + const opus_int32 Gains_Q16[MAX_NB_SUBFR], /* I Quantization step sizes */ + const opus_int32 pitchL[MAX_NB_SUBFR], /* I Pitch lags */ + const opus_int Lambda_Q10, /* I Rate/distortion tradeoff */ + const opus_int LTP_scale_Q14 /* I LTP state scaling */ ); -#if defined OPUS_X86_PRESUME_SSE4_1 +# if defined (OPUS_X86_PRESUME_AVX2) -#define silk_NSQ_del_dec(psEncC, NSQ, psIndices, x_Q3, pulses, PredCoef_Q12, LTPCoef_Q14, AR2_Q13, \ - HarmShapeGain_Q14, Tilt_Q14, LF_shp_Q14, Gains_Q16, pitchL, Lambda_Q10, LTP_scale_Q14, arch) \ - ((void)(arch),silk_NSQ_del_dec_sse4_1(psEncC, NSQ, psIndices, x_Q3, pulses, PredCoef_Q12, LTPCoef_Q14, AR2_Q13, \ +# define OVERRIDE_silk_NSQ_del_dec +# define silk_NSQ_del_dec(psEncC, NSQ, psIndices, x16, pulses, PredCoef_Q12, LTPCoef_Q14, AR_Q13, \ + HarmShapeGain_Q14, Tilt_Q14, LF_shp_Q14, Gains_Q16, pitchL, Lambda_Q10, LTP_scale_Q14, arch) \ + ((void)(arch),silk_NSQ_del_dec_avx2(psEncC, NSQ, psIndices, x16, pulses, PredCoef_Q12, LTPCoef_Q14, AR_Q13, \ HarmShapeGain_Q14, Tilt_Q14, LF_shp_Q14, Gains_Q16, pitchL, Lambda_Q10, LTP_scale_Q14)) -#else +# elif defined (OPUS_X86_PRESUME_SSE4_1) && !defined(OPUS_X86_MAY_HAVE_AVX2) + +# define OVERRIDE_silk_NSQ_del_dec +# define silk_NSQ_del_dec(psEncC, NSQ, psIndices, x16, pulses, PredCoef_Q12, LTPCoef_Q14, AR_Q13, \ + HarmShapeGain_Q14, Tilt_Q14, LF_shp_Q14, Gains_Q16, pitchL, Lambda_Q10, LTP_scale_Q14, arch) \ + ((void)(arch),silk_NSQ_del_dec_sse4_1(psEncC, NSQ, psIndices, x16, pulses, PredCoef_Q12, LTPCoef_Q14, AR_Q13, \ + HarmShapeGain_Q14, Tilt_Q14, LF_shp_Q14, Gains_Q16, pitchL, Lambda_Q10, LTP_scale_Q14)) + +# elif defined(OPUS_HAVE_RTCD) extern void (*const SILK_NSQ_DEL_DEC_IMPL[OPUS_ARCHMASK + 1])( - const silk_encoder_state *psEncC, /* I Encoder State */ - silk_nsq_state *NSQ, /* I/O NSQ state */ - SideInfoIndices *psIndices, /* I/O Quantization Indices */ - const opus_int32 x_Q3[], /* I Prefiltered input signal */ - opus_int8 pulses[], /* O Quantized pulse signal */ - const opus_int16 PredCoef_Q12[ 2 * MAX_LPC_ORDER ], /* I Short term prediction coefs */ - const opus_int16 LTPCoef_Q14[ LTP_ORDER * MAX_NB_SUBFR ], /* I Long term prediction coefs */ - const opus_int16 AR2_Q13[ MAX_NB_SUBFR * MAX_SHAPE_LPC_ORDER ], /* I Noise shaping coefs */ - const opus_int HarmShapeGain_Q14[ MAX_NB_SUBFR ], /* I Long term shaping coefs */ - const opus_int Tilt_Q14[ MAX_NB_SUBFR ], /* I Spectral tilt */ - const opus_int32 LF_shp_Q14[ MAX_NB_SUBFR ], /* I Low frequency shaping coefs */ - const opus_int32 Gains_Q16[ MAX_NB_SUBFR ], /* I Quantization step sizes */ - const opus_int pitchL[ MAX_NB_SUBFR ], /* I Pitch lags */ - const opus_int Lambda_Q10, /* I Rate/distortion tradeoff */ - const opus_int LTP_scale_Q14 /* I LTP state scaling */ + const silk_encoder_state *psEncC, /* I Encoder State */ + silk_nsq_state *NSQ, /* I/O NSQ state */ + SideInfoIndices *psIndices, /* I/O Quantization Indices */ + const opus_int16 x16[], /* I Input */ + opus_int8 pulses[], /* O Quantized pulse signal */ + const opus_int16 *PredCoef_Q12, /* I Short term prediction coefs */ + const opus_int16 LTPCoef_Q14[ LTP_ORDER * MAX_NB_SUBFR ], /* I Long term prediction coefs */ + const opus_int16 AR_Q13[ MAX_NB_SUBFR * MAX_SHAPE_LPC_ORDER ], /* I Noise shaping coefs */ + const opus_int HarmShapeGain_Q14[ MAX_NB_SUBFR ], /* I Long term shaping coefs */ + const opus_int Tilt_Q14[ MAX_NB_SUBFR ], /* I Spectral tilt */ + const opus_int32 LF_shp_Q14[ MAX_NB_SUBFR ], /* I Low frequency shaping coefs */ + const opus_int32 Gains_Q16[ MAX_NB_SUBFR ], /* I Quantization step sizes */ + const opus_int pitchL[ MAX_NB_SUBFR ], /* I Pitch lags */ + const opus_int Lambda_Q10, /* I Rate/distortion tradeoff */ + const opus_int LTP_scale_Q14 /* I LTP state scaling */ ); -# define silk_NSQ_del_dec(psEncC, NSQ, psIndices, x_Q3, pulses, PredCoef_Q12, LTPCoef_Q14, AR2_Q13, \ - HarmShapeGain_Q14, Tilt_Q14, LF_shp_Q14, Gains_Q16, pitchL, Lambda_Q10, LTP_scale_Q14, arch) \ - ((*SILK_NSQ_DEL_DEC_IMPL[(arch) & OPUS_ARCHMASK])(psEncC, NSQ, psIndices, x_Q3, pulses, PredCoef_Q12, LTPCoef_Q14, AR2_Q13, \ +# define OVERRIDE_silk_NSQ_del_dec +# define silk_NSQ_del_dec(psEncC, NSQ, psIndices, x16, pulses, PredCoef_Q12, LTPCoef_Q14, AR_Q13, \ + HarmShapeGain_Q14, Tilt_Q14, LF_shp_Q14, Gains_Q16, pitchL, Lambda_Q10, LTP_scale_Q14, arch) \ + ((*SILK_NSQ_DEL_DEC_IMPL[(arch) & OPUS_ARCHMASK])(psEncC, NSQ, psIndices, x16, pulses, PredCoef_Q12, LTPCoef_Q14, AR_Q13, \ HarmShapeGain_Q14, Tilt_Q14, LF_shp_Q14, Gains_Q16, pitchL, Lambda_Q10, LTP_scale_Q14)) -#endif -#endif +# endif void silk_noise_shape_quantizer( silk_nsq_state *NSQ, /* I/O NSQ state */ @@ -223,25 +247,52 @@ void silk_VAD_GetNoiseLevels( silk_VAD_state *psSilk_VAD /* I/O Pointer to Silk VAD state */ ); -# define OVERRIDE_silk_VAD_GetSA_Q8 - opus_int silk_VAD_GetSA_Q8_sse4_1( silk_encoder_state *psEnC, const opus_int16 pIn[] ); -#if defined(OPUS_X86_PRESUME_SSE4_1) -#define silk_VAD_GetSA_Q8(psEnC, pIn, arch) ((void)(arch),silk_VAD_GetSA_Q8_sse4_1(psEnC, pIn)) +# if defined(OPUS_X86_PRESUME_SSE4_1) -#else +# define OVERRIDE_silk_VAD_GetSA_Q8 +# define silk_VAD_GetSA_Q8(psEnC, pIn, arch) ((void)(arch),silk_VAD_GetSA_Q8_sse4_1(psEnC, pIn)) -# define silk_VAD_GetSA_Q8(psEnC, pIn, arch) \ - ((*SILK_VAD_GETSA_Q8_IMPL[(arch) & OPUS_ARCHMASK])(psEnC, pIn)) +# elif defined(OPUS_HAVE_RTCD) extern opus_int (*const SILK_VAD_GETSA_Q8_IMPL[OPUS_ARCHMASK + 1])( silk_encoder_state *psEnC, const opus_int16 pIn[]); +# define OVERRIDE_silk_VAD_GetSA_Q8 +# define silk_VAD_GetSA_Q8(psEnC, pIn, arch) \ + ((*SILK_VAD_GETSA_Q8_IMPL[(arch) & OPUS_ARCHMASK])(psEnC, pIn)) + +# endif + +#ifndef FIXED_POINT +double silk_inner_product_FLP_avx2( + const silk_float *data1, + const silk_float *data2, + opus_int dataSize +); + +#if defined (OPUS_X86_PRESUME_AVX2) + +#define OVERRIDE_inner_product_FLP +#define silk_inner_product_FLP(data1, data2, dataSize, arch) ((void)arch,silk_inner_product_FLP_avx2(data1, data2, dataSize)) + +#elif defined(OPUS_HAVE_RTCD) && defined(OPUS_X86_MAY_HAVE_AVX2) + +#define OVERRIDE_inner_product_FLP +extern double (*const SILK_INNER_PRODUCT_FLP_IMPL[OPUS_ARCHMASK + 1])( + const silk_float *data1, + const silk_float *data2, + opus_int dataSize +); + +#define silk_inner_product_FLP(data1, data2, dataSize, arch) ((void)arch,(*SILK_INNER_PRODUCT_FLP_IMPL[(arch) & OPUS_ARCHMASK])(data1, data2, dataSize)) + +#endif #endif # endif diff --git a/opus/silk/x86/x86_silk_map.c b/opus/silk/x86/x86_silk_map.c index 32dcc3ca..39ad7527 100644 --- a/opus/silk/x86/x86_silk_map.c +++ b/opus/silk/x86/x86_silk_map.c @@ -32,25 +32,28 @@ #include "celt/x86/x86cpu.h" #include "structs.h" #include "SigProc_FIX.h" +#ifndef FIXED_POINT +#include "SigProc_FLP.h" +#endif #include "pitch.h" #include "main.h" -#if !defined(OPUS_X86_PRESUME_SSE4_1) +#if defined(OPUS_HAVE_RTCD) && !defined(OPUS_X86_PRESUME_AVX2) #if defined(FIXED_POINT) #include "fixed/main_FIX.h" -opus_int64 (*const SILK_INNER_PROD16_ALIGNED_64_IMPL[ OPUS_ARCHMASK + 1 ] )( +opus_int64 (*const SILK_INNER_PROD16_IMPL[ OPUS_ARCHMASK + 1 ] )( const opus_int16 *inVec1, const opus_int16 *inVec2, const opus_int len ) = { - silk_inner_prod16_aligned_64_c, /* non-sse */ - silk_inner_prod16_aligned_64_c, - silk_inner_prod16_aligned_64_c, - MAY_HAVE_SSE4_1( silk_inner_prod16_aligned_64 ), /* sse4.1 */ - MAY_HAVE_SSE4_1( silk_inner_prod16_aligned_64 ) /* avx */ + silk_inner_prod16_c, /* non-sse */ + silk_inner_prod16_c, + silk_inner_prod16_c, + MAY_HAVE_SSE4_1( silk_inner_prod16 ), /* sse4.1 */ + MAY_HAVE_SSE4_1( silk_inner_prod16 ) /* avx */ }; #endif @@ -66,23 +69,22 @@ opus_int (*const SILK_VAD_GETSA_Q8_IMPL[ OPUS_ARCHMASK + 1 ] )( MAY_HAVE_SSE4_1( silk_VAD_GetSA_Q8 ) /* avx */ }; -#if 0 /* FIXME: SSE disabled until the NSQ code gets updated. */ void (*const SILK_NSQ_IMPL[ OPUS_ARCHMASK + 1 ] )( - const silk_encoder_state *psEncC, /* I Encoder State */ - silk_nsq_state *NSQ, /* I/O NSQ state */ - SideInfoIndices *psIndices, /* I/O Quantization Indices */ - const opus_int32 x_Q3[], /* I Prefiltered input signal */ - opus_int8 pulses[], /* O Quantized pulse signal */ - const opus_int16 PredCoef_Q12[ 2 * MAX_LPC_ORDER ], /* I Short term prediction coefs */ - const opus_int16 LTPCoef_Q14[ LTP_ORDER * MAX_NB_SUBFR ], /* I Long term prediction coefs */ - const opus_int16 AR2_Q13[ MAX_NB_SUBFR * MAX_SHAPE_LPC_ORDER ], /* I Noise shaping coefs */ - const opus_int HarmShapeGain_Q14[ MAX_NB_SUBFR ], /* I Long term shaping coefs */ - const opus_int Tilt_Q14[ MAX_NB_SUBFR ], /* I Spectral tilt */ - const opus_int32 LF_shp_Q14[ MAX_NB_SUBFR ], /* I Low frequency shaping coefs */ - const opus_int32 Gains_Q16[ MAX_NB_SUBFR ], /* I Quantization step sizes */ - const opus_int pitchL[ MAX_NB_SUBFR ], /* I Pitch lags */ - const opus_int Lambda_Q10, /* I Rate/distortion tradeoff */ - const opus_int LTP_scale_Q14 /* I LTP state scaling */ + const silk_encoder_state *psEncC, /* I Encoder State */ + silk_nsq_state *NSQ, /* I/O NSQ state */ + SideInfoIndices *psIndices, /* I/O Quantization Indices */ + const opus_int16 x16[], /* I Input */ + opus_int8 pulses[], /* O Quantized pulse signal */ + const opus_int16 *PredCoef_Q12, /* I Short term prediction coefs */ + const opus_int16 LTPCoef_Q14[ LTP_ORDER * MAX_NB_SUBFR ], /* I Long term prediction coefs */ + const opus_int16 AR_Q13[ MAX_NB_SUBFR * MAX_SHAPE_LPC_ORDER ], /* I Noise shaping coefs */ + const opus_int HarmShapeGain_Q14[ MAX_NB_SUBFR ], /* I Long term shaping coefs */ + const opus_int Tilt_Q14[ MAX_NB_SUBFR ], /* I Spectral tilt */ + const opus_int32 LF_shp_Q14[ MAX_NB_SUBFR ], /* I Low frequency shaping coefs */ + const opus_int32 Gains_Q16[ MAX_NB_SUBFR ], /* I Quantization step sizes */ + const opus_int pitchL[ MAX_NB_SUBFR ], /* I Pitch lags */ + const opus_int Lambda_Q10, /* I Rate/distortion tradeoff */ + const opus_int LTP_scale_Q14 /* I LTP state scaling */ ) = { silk_NSQ_c, /* non-sse */ silk_NSQ_c, @@ -90,21 +92,20 @@ void (*const SILK_NSQ_IMPL[ OPUS_ARCHMASK + 1 ] )( MAY_HAVE_SSE4_1( silk_NSQ ), /* sse4.1 */ MAY_HAVE_SSE4_1( silk_NSQ ) /* avx */ }; -#endif -#if 0 /* FIXME: SSE disabled until silk_VQ_WMat_EC_sse4_1() gets updated. */ void (*const SILK_VQ_WMAT_EC_IMPL[ OPUS_ARCHMASK + 1 ] )( opus_int8 *ind, /* O index of best codebook vector */ - opus_int32 *rate_dist_Q14, /* O best weighted quant error + mu * rate */ + opus_int32 *res_nrg_Q15, /* O best residual energy */ + opus_int32 *rate_dist_Q8, /* O best total bitrate */ opus_int *gain_Q7, /* O sum of absolute LTP coefficients */ - const opus_int16 *in_Q14, /* I input vector to be quantized */ - const opus_int32 *W_Q18, /* I weighting matrix */ + const opus_int32 *XX_Q17, /* I correlation matrix */ + const opus_int32 *xX_Q17, /* I correlation vector */ const opus_int8 *cb_Q7, /* I codebook */ const opus_uint8 *cb_gain_Q7, /* I codebook effective gain */ const opus_uint8 *cl_Q5, /* I code length for each codebook vector */ - const opus_int mu_Q9, /* I tradeoff betw. weighted error and rate */ + const opus_int subfr_len, /* I number of samples per subframe */ const opus_int32 max_gain_Q7, /* I maximum sum of absolute LTP coefficients */ - opus_int L /* I number of vectors in codebook */ + const opus_int L /* I number of vectors in codebook */ ) = { silk_VQ_WMat_EC_c, /* non-sse */ silk_VQ_WMat_EC_c, @@ -112,33 +113,30 @@ void (*const SILK_VQ_WMAT_EC_IMPL[ OPUS_ARCHMASK + 1 ] )( MAY_HAVE_SSE4_1( silk_VQ_WMat_EC ), /* sse4.1 */ MAY_HAVE_SSE4_1( silk_VQ_WMat_EC ) /* avx */ }; -#endif -#if 0 /* FIXME: SSE disabled until the NSQ code gets updated. */ void (*const SILK_NSQ_DEL_DEC_IMPL[ OPUS_ARCHMASK + 1 ] )( - const silk_encoder_state *psEncC, /* I Encoder State */ - silk_nsq_state *NSQ, /* I/O NSQ state */ - SideInfoIndices *psIndices, /* I/O Quantization Indices */ - const opus_int32 x_Q3[], /* I Prefiltered input signal */ - opus_int8 pulses[], /* O Quantized pulse signal */ - const opus_int16 PredCoef_Q12[ 2 * MAX_LPC_ORDER ], /* I Short term prediction coefs */ - const opus_int16 LTPCoef_Q14[ LTP_ORDER * MAX_NB_SUBFR ], /* I Long term prediction coefs */ - const opus_int16 AR2_Q13[ MAX_NB_SUBFR * MAX_SHAPE_LPC_ORDER ], /* I Noise shaping coefs */ - const opus_int HarmShapeGain_Q14[ MAX_NB_SUBFR ], /* I Long term shaping coefs */ - const opus_int Tilt_Q14[ MAX_NB_SUBFR ], /* I Spectral tilt */ - const opus_int32 LF_shp_Q14[ MAX_NB_SUBFR ], /* I Low frequency shaping coefs */ - const opus_int32 Gains_Q16[ MAX_NB_SUBFR ], /* I Quantization step sizes */ - const opus_int pitchL[ MAX_NB_SUBFR ], /* I Pitch lags */ - const opus_int Lambda_Q10, /* I Rate/distortion tradeoff */ - const opus_int LTP_scale_Q14 /* I LTP state scaling */ + const silk_encoder_state *psEncC, /* I Encoder State */ + silk_nsq_state *NSQ, /* I/O NSQ state */ + SideInfoIndices *psIndices, /* I/O Quantization Indices */ + const opus_int16 x16[], /* I Input */ + opus_int8 pulses[], /* O Quantized pulse signal */ + const opus_int16 *PredCoef_Q12, /* I Short term prediction coefs */ + const opus_int16 LTPCoef_Q14[ LTP_ORDER * MAX_NB_SUBFR ], /* I Long term prediction coefs */ + const opus_int16 AR_Q13[ MAX_NB_SUBFR * MAX_SHAPE_LPC_ORDER ], /* I Noise shaping coefs */ + const opus_int HarmShapeGain_Q14[ MAX_NB_SUBFR ], /* I Long term shaping coefs */ + const opus_int Tilt_Q14[ MAX_NB_SUBFR ], /* I Spectral tilt */ + const opus_int32 LF_shp_Q14[ MAX_NB_SUBFR ], /* I Low frequency shaping coefs */ + const opus_int32 Gains_Q16[ MAX_NB_SUBFR ], /* I Quantization step sizes */ + const opus_int pitchL[ MAX_NB_SUBFR ], /* I Pitch lags */ + const opus_int Lambda_Q10, /* I Rate/distortion tradeoff */ + const opus_int LTP_scale_Q14 /* I LTP state scaling */ ) = { silk_NSQ_del_dec_c, /* non-sse */ silk_NSQ_del_dec_c, silk_NSQ_del_dec_c, MAY_HAVE_SSE4_1( silk_NSQ_del_dec ), /* sse4.1 */ - MAY_HAVE_SSE4_1( silk_NSQ_del_dec ) /* avx */ + MAY_HAVE_AVX2( silk_NSQ_del_dec ) /* avx */ }; -#endif #if defined(FIXED_POINT) @@ -161,4 +159,21 @@ void (*const SILK_BURG_MODIFIED_IMPL[ OPUS_ARCHMASK + 1 ] )( }; #endif + +#ifndef FIXED_POINT + +double (*const SILK_INNER_PRODUCT_FLP_IMPL[ OPUS_ARCHMASK + 1 ] )( + const silk_float *data1, + const silk_float *data2, + opus_int dataSize +) = { + silk_inner_product_FLP_c, /* non-sse */ + silk_inner_product_FLP_c, + silk_inner_product_FLP_c, + silk_inner_product_FLP_c, /* sse4.1 */ + MAY_HAVE_AVX2( silk_inner_product_FLP ) /* avx */ +}; + +#endif + #endif diff --git a/opus/src/analysis.c b/opus/src/analysis.c index cb46dec5..1f580138 100644 --- a/opus/src/analysis.c +++ b/opus/src/analysis.c @@ -31,7 +31,9 @@ #define ANALYSIS_C +#ifdef MLP_TRAINING #include +#endif #include "mathops.h" #include "kiss_fft.h" @@ -927,9 +929,9 @@ static void tonality_analysis(TonalityAnalysisState *tonal, const CELTMode *celt features[23] = info->tonality_slope + 0.069216f; features[24] = tonal->lowECount - 0.067930f; - compute_dense(&layer0, layer_out, features); - compute_gru(&layer1, tonal->rnn_state, layer_out); - compute_dense(&layer2, frame_probs, tonal->rnn_state); + analysis_compute_dense(&layer0, layer_out, features); + analysis_compute_gru(&layer1, tonal->rnn_state, layer_out); + analysis_compute_dense(&layer2, frame_probs, tonal->rnn_state); /* Probability of speech or music vs noise */ info->activity_probability = frame_probs[1]; diff --git a/opus/src/extensions.c b/opus/src/extensions.c new file mode 100644 index 00000000..bb6c0b02 --- /dev/null +++ b/opus/src/extensions.c @@ -0,0 +1,315 @@ +/* Copyright (c) 2022 Amazon */ +/* + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER + OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +#ifdef HAVE_CONFIG_H +#include "config.h" +#endif + + +#include "opus_types.h" +#include "opus_defines.h" +#include "arch.h" +#include "os_support.h" +#include "opus_private.h" + + +/* Given an extension payload, advance data to the next extension and return the + length of the remaining extensions. */ +opus_int32 skip_extension(const unsigned char **data, opus_int32 len, opus_int32 *header_size) +{ + int id, L; + if (len==0) + return 0; + id = **data>>1; + L = **data&1; + if (id == 0 && L == 1) + { + *header_size = 1; + if (len < 1) + return -1; + (*data)++; + len--; + return len; + } else if (id > 0 && id < 32) + { + if (len < 1+L) + return -1; + *data += 1+L; + len -= 1+L; + *header_size = 1; + return len; + } else { + if (L==0) + { + *data += len; + *header_size = 1; + return 0; + } else { + opus_int32 bytes=0; + *header_size = 1; + do { + (*data)++; + len--; + if (len == 0) + return -1; + bytes += **data; + (*header_size)++; + } while (**data == 255); + (*data)++; + len--; + if (bytes <= len) + { + len -= bytes; + *data += bytes; + } else { + return -1; + } + return len; + } + } +} + +/* Count the number of extensions, excluding real padding and separators. */ +opus_int32 opus_packet_extensions_count(const unsigned char *data, opus_int32 len) +{ + opus_int32 curr_len; + opus_int32 count=0; + const unsigned char *curr_data = data; + + celt_assert(len >= 0); + celt_assert(data != NULL || len == 0); + + curr_len = len; + while (curr_len > 0) + { + int id; + opus_int32 header_size; + id = *curr_data>>1; + curr_len = skip_extension(&curr_data, curr_len, &header_size); + if (curr_len < 0) + return OPUS_INVALID_PACKET; + if (id > 1) + count++; + } + return count; +} + +/* Extract extensions from Opus padding (excluding real padding and separators) */ +opus_int32 opus_packet_extensions_parse(const unsigned char *data, opus_int32 len, opus_extension_data *extensions, opus_int32 *nb_extensions) +{ + const unsigned char *curr_data; + opus_int32 curr_len; + int curr_frame=0; + opus_int32 count=0; + + celt_assert(len >= 0); + celt_assert(data != NULL || len == 0); + celt_assert(nb_extensions != NULL); + celt_assert(extensions != NULL || *nb_extensions == 0); + + curr_data = data; + curr_len = len; + while (curr_len > 0) + { + int id; + opus_int32 header_size; + opus_extension_data curr_ext; + id = *curr_data>>1; + if (id > 1) + { + curr_ext.id = id; + curr_ext.frame = curr_frame; + curr_ext.data = curr_data; + } else if (id == 1) + { + int L = *curr_data&1; + if (L==0) + curr_frame++; + else { + if (curr_len >= 2) + curr_frame += curr_data[1]; + /* Else we're at the end and it doesn't matter. */ + } + if (curr_frame >= 48) + { + *nb_extensions = count; + return OPUS_INVALID_PACKET; + } + } + curr_len = skip_extension(&curr_data, curr_len, &header_size); + /* printf("curr_len = %d, header_size = %d\n", curr_len, header_size); */ + if (curr_len < 0) + { + *nb_extensions = count; + return OPUS_INVALID_PACKET; + } + celt_assert(curr_data - data == len - curr_len); + if (id > 1) + { + if (count == *nb_extensions) + { + return OPUS_BUFFER_TOO_SMALL; + } + curr_ext.len = curr_data - curr_ext.data - header_size; + curr_ext.data += header_size; + extensions[count++] = curr_ext; + } + } + celt_assert(curr_len == 0); + *nb_extensions = count; + return OPUS_OK; +} + +opus_int32 opus_packet_extensions_generate(unsigned char *data, opus_int32 len, const opus_extension_data *extensions, opus_int32 nb_extensions, int pad) +{ + int max_frame=0; + opus_int32 i; + int frame; + int curr_frame = 0; + opus_int32 pos = 0; + opus_int32 written = 0; + + celt_assert(len >= 0); + + for (i=0;i 127) + return OPUS_BAD_ARG; + } + if (max_frame >= 48) return OPUS_BAD_ARG; + for (frame=0;frame<=max_frame;frame++) + { + for (i=0;i 1) + return OPUS_BAD_ARG; + if (len-pos < extensions[i].len+1) + return OPUS_BUFFER_TOO_SMALL; + if (data) data[pos] = (extensions[i].id<<1) + extensions[i].len; + pos++; + if (extensions[i].len > 0) { + if (data) data[pos] = extensions[i].data[0]; + pos++; + } + } else { + int last; + opus_int32 length_bytes; + if (extensions[i].len < 0) + return OPUS_BAD_ARG; + last = (written == nb_extensions - 1); + length_bytes = 1 + extensions[i].len/255; + if (last) + length_bytes = 0; + if (len-pos < 1 + length_bytes + extensions[i].len) + return OPUS_BUFFER_TOO_SMALL; + if (data) data[pos] = (extensions[i].id<<1) + !last; + pos++; + if (!last) + { + opus_int32 j; + for (j=0;j +int main() +{ + opus_extension_data ext[] = {{2, 0, (const unsigned char *)"a", 1}, + {32, 10, (const unsigned char *)"DRED", 4}, + {33, 1, (const unsigned char *)"NOT DRED", 8}, + {3, 4, (const unsigned char *)NULL, 0} + }; + opus_extension_data ext2[10]; + int i, len; + int nb_ext = 10; + unsigned char packet[10000]; + len = opus_packet_extensions_generate(packet, 32, ext, 4, 1); + for (i=0;i-8)) - return -1; -#ifndef FIXED_POINT - /* Another check in case of -ffast-math */ - if (celt_isnan(x)) - return 0; -#endif - if (x<0) - { - x=-x; - sign=-1; - } - i = (int)floor(.5f+25*x); - x -= .04f*i; - y = tansig_table[i]; - dy = 1-y*y; - y = y + x*dy*(1 - y*x); - return sign*y; + const float N0 = 952.52801514f; + const float N1 = 96.39235687f; + const float N2 = 0.60863042f; + const float D0 = 952.72399902f; + const float D1 = 413.36801147f; + const float D2 = 11.88600922f; + float X2, num, den; + X2 = x*x; + num = fmadd(fmadd(N2, X2, N1), X2, N0); + den = fmadd(fmadd(D2, X2, D1), X2, D0); + num = num*x/den; + return MAX32(-1.f, MIN32(1.f, num)); } static OPUS_INLINE float sigmoid_approx(float x) @@ -79,7 +67,7 @@ static void gemm_accum(float *out, const opus_int8 *weights, int rows, int cols, } } -void compute_dense(const DenseLayer *layer, float *output, const float *input) +void analysis_compute_dense(const AnalysisDenseLayer *layer, float *output, const float *input) { int i; int N, M; @@ -101,7 +89,7 @@ void compute_dense(const DenseLayer *layer, float *output, const float *input) } } -void compute_gru(const GRULayer *gru, float *state, const float *input) +void analysis_compute_gru(const AnalysisGRULayer *gru, float *state, const float *input) { int i; int N, M; diff --git a/opus/src/mlp.h b/opus/src/mlp.h index d7670550..e6b1a8f7 100644 --- a/opus/src/mlp.h +++ b/opus/src/mlp.h @@ -24,8 +24,8 @@ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ -#ifndef _MLP_H_ -#define _MLP_H_ +#ifndef MLP_H_ +#define MLP_H_ #include "opus_types.h" @@ -39,7 +39,7 @@ typedef struct { int nb_inputs; int nb_neurons; int sigmoid; -} DenseLayer; +} AnalysisDenseLayer; typedef struct { const opus_int8 *bias; @@ -47,14 +47,14 @@ typedef struct { const opus_int8 *recurrent_weights; int nb_inputs; int nb_neurons; -} GRULayer; +} AnalysisGRULayer; -extern const DenseLayer layer0; -extern const GRULayer layer1; -extern const DenseLayer layer2; +extern const AnalysisDenseLayer layer0; +extern const AnalysisGRULayer layer1; +extern const AnalysisDenseLayer layer2; -void compute_dense(const DenseLayer *layer, float *output, const float *input); +void analysis_compute_dense(const AnalysisDenseLayer *layer, float *output, const float *input); -void compute_gru(const GRULayer *gru, float *state, const float *input); +void analysis_compute_gru(const AnalysisGRULayer *gru, float *state, const float *input); -#endif /* _MLP_H_ */ +#endif /* MLP_H_ */ diff --git a/opus/src/mlp_data.c b/opus/src/mlp_data.c index ae4178df..65f7448e 100644 --- a/opus/src/mlp_data.c +++ b/opus/src/mlp_data.c @@ -651,20 +651,20 @@ static const opus_int8 layer2_bias[2] = { 14, 117 }; -const DenseLayer layer0 = { +const AnalysisDenseLayer layer0 = { layer0_bias, layer0_weights, 25, 32, 0 }; -const GRULayer layer1 = { +const AnalysisGRULayer layer1 = { layer1_bias, layer1_weights, layer1_recur_weights, 32, 24 }; -const DenseLayer layer2 = { +const AnalysisDenseLayer layer2 = { layer2_bias, layer2_weights, 24, 2, 1 diff --git a/opus/src/opus.c b/opus/src/opus.c index 538b5ea7..816a4dd5 100644 --- a/opus/src/opus.c +++ b/opus/src/opus.c @@ -194,7 +194,8 @@ int opus_packet_get_samples_per_frame(const unsigned char *data, int opus_packet_parse_impl(const unsigned char *data, opus_int32 len, int self_delimited, unsigned char *out_toc, const unsigned char *frames[48], opus_int16 size[48], - int *payload_offset, opus_int32 *packet_offset) + int *payload_offset, opus_int32 *packet_offset, + const unsigned char **padding, opus_int32 *padding_len) { int i, bytes; int count; @@ -337,6 +338,11 @@ int opus_packet_parse_impl(const unsigned char *data, opus_int32 len, data += size[i]; } + if (padding != NULL) + { + *padding = data; + *padding_len = pad; + } if (packet_offset) *packet_offset = pad+(opus_int32)(data-data0); @@ -351,6 +357,6 @@ int opus_packet_parse(const unsigned char *data, opus_int32 len, opus_int16 size[48], int *payload_offset) { return opus_packet_parse_impl(data, len, 0, out_toc, - frames, size, payload_offset, NULL); + frames, size, payload_offset, NULL, NULL, NULL); } diff --git a/opus/src/opus_decoder.c b/opus/src/opus_decoder.c index 9113638a..b57c8094 100644 --- a/opus/src/opus_decoder.c +++ b/opus/src/opus_decoder.c @@ -52,6 +52,15 @@ #include "mathops.h" #include "cpu_support.h" +#ifdef ENABLE_DEEP_PLC +#include "dred_rdovae_dec_data.h" +#include "dred_rdovae_dec.h" +#endif + +#ifdef ENABLE_OSCE +#include "osce.h" +#endif + struct OpusDecoder { int celt_dec_offset; int silk_dec_offset; @@ -59,7 +68,11 @@ struct OpusDecoder { opus_int32 Fs; /** Sampling rate (at the API level) */ silk_DecControlStruct DecControl; int decode_gain; + int complexity; int arch; +#ifdef ENABLE_DEEP_PLC + LPCNetPLCState lpcnet; +#endif /* Everything beyond this point gets cleared on a reset */ #define OPUS_DECODER_RESET_START stream_channels @@ -135,6 +148,7 @@ int opus_decoder_init(OpusDecoder *st, opus_int32 Fs, int channels) silk_dec = (char*)st+st->silk_dec_offset; celt_dec = (CELTDecoder*)((char*)st+st->celt_dec_offset); st->stream_channels = st->channels = channels; + st->complexity = 0; st->Fs = Fs; st->DecControl.API_sampleRate = st->Fs; @@ -152,6 +166,9 @@ int opus_decoder_init(OpusDecoder *st, opus_int32 Fs, int channels) st->prev_mode = 0; st->frame_size = Fs/400; +#ifdef ENABLE_DEEP_PLC + lpcnet_plc_init( &st->lpcnet); +#endif st->arch = opus_select_arch(); return OPUS_OK; } @@ -278,7 +295,8 @@ static int opus_decode_frame(OpusDecoder *st, const unsigned char *data, ec_dec_init(&dec,(unsigned char*)data,len); } else { audiosize = frame_size; - mode = st->prev_mode; + /* Run PLC using last used mode (CELT if we ended with CELT redundancy) */ + mode = st->prev_redundancy ? MODE_CELT_ONLY : st->prev_mode; bandwidth = 0; if (mode == 0) @@ -369,7 +387,7 @@ static int opus_decode_frame(OpusDecoder *st, const unsigned char *data, pcm_ptr = pcm_silk; if (st->prev_mode==MODE_CELT_ONLY) - silk_InitDecoder( silk_dec ); + silk_ResetDecoder( silk_dec ); /* The SILK PLC cannot produce frames of less than 10 ms */ st->DecControl.payloadSize_ms = IMAX(10, 1000 * audiosize / st->Fs); @@ -393,14 +411,28 @@ static int opus_decode_frame(OpusDecoder *st, const unsigned char *data, st->DecControl.internalSampleRate = 16000; } } + st->DecControl.enable_deep_plc = st->complexity >= 5; +#ifdef ENABLE_OSCE + st->DecControl.osce_method = OSCE_METHOD_NONE; +#ifndef DISABLE_LACE + if (st->complexity >= 6) {st->DecControl.osce_method = OSCE_METHOD_LACE;} +#endif +#ifndef DISABLE_NOLACE + if (st->complexity >= 7) {st->DecControl.osce_method = OSCE_METHOD_NOLACE;} +#endif +#endif - lost_flag = data == NULL ? 1 : 2 * decode_fec; + lost_flag = data == NULL ? 1 : 2 * !!decode_fec; decoded_samples = 0; do { /* Call SILK decoder */ int first_frame = decoded_samples == 0; silk_ret = silk_Decode( silk_dec, &st->DecControl, - lost_flag, first_frame, &dec, pcm_ptr, &silk_frame_size, st->arch ); + lost_flag, first_frame, &dec, pcm_ptr, &silk_frame_size, +#ifdef ENABLE_DEEP_PLC + &st->lpcnet, +#endif + st->arch ); if( silk_ret ) { if (lost_flag) { /* PLC failure should not be fatal */ @@ -419,7 +451,7 @@ static int opus_decode_frame(OpusDecoder *st, const unsigned char *data, start_band = 0; if (!decode_fec && mode != MODE_CELT_ONLY && data != NULL - && ec_tell(&dec)+17+20*(st->mode == MODE_HYBRID) <= 8*len) + && ec_tell(&dec)+17+20*(mode == MODE_HYBRID) <= 8*len) { /* Check if we have a redundant 0-8 kHz band */ if (mode == MODE_HYBRID) @@ -499,6 +531,11 @@ static int opus_decode_frame(OpusDecoder *st, const unsigned char *data, /* 5 ms redundant frame for CELT->SILK*/ if (redundancy && celt_to_silk) { + /* If the previous frame did not use CELT (the first redundancy frame in + a transition from SILK may have been lost) then the CELT decoder is + stale at this point and the redundancy audio is not useful, however + the final range is still needed (for testing), so the redundancy is + always decoded but the decoded audio may not be used */ MUST_SUCCEED(celt_decoder_ctl(celt_dec, CELT_SET_START_BAND(0))); celt_decode_with_ec(celt_dec, data+len, redundancy_bytes, redundant_audio, F5, NULL, 0); @@ -515,8 +552,12 @@ static int opus_decode_frame(OpusDecoder *st, const unsigned char *data, if (mode != st->prev_mode && st->prev_mode > 0 && !st->prev_redundancy) MUST_SUCCEED(celt_decoder_ctl(celt_dec, OPUS_RESET_STATE)); /* Decode CELT */ - celt_ret = celt_decode_with_ec(celt_dec, decode_fec ? NULL : data, - len, pcm, celt_frame_size, &dec, celt_accum); + celt_ret = celt_decode_with_ec_dred(celt_dec, decode_fec ? NULL : data, + len, pcm, celt_frame_size, &dec, celt_accum +#ifdef ENABLE_DEEP_PLC + , &st->lpcnet +#endif + ); } else { unsigned char silence[2] = {0xFF, 0xFF}; if (!celt_accum) @@ -561,7 +602,10 @@ static int opus_decode_frame(OpusDecoder *st, const unsigned char *data, smooth_fade(pcm+st->channels*(frame_size-F2_5), redundant_audio+st->channels*F2_5, pcm+st->channels*(frame_size-F2_5), F2_5, st->channels, window, st->Fs); } - if (redundancy && celt_to_silk) + /* 5ms redundant frame for CELT->SILK; ignore if the previous frame did not + use CELT (the first redundancy frame in a transition from SILK may have + been lost) */ + if (redundancy && celt_to_silk && (st->prev_mode != MODE_SILK_ONLY || st->prev_redundancy)) { for (c=0;cchannels;c++) { @@ -625,7 +669,7 @@ static int opus_decode_frame(OpusDecoder *st, const unsigned char *data, int opus_decode_native(OpusDecoder *st, const unsigned char *data, opus_int32 len, opus_val16 *pcm, int frame_size, int decode_fec, - int self_delimited, opus_int32 *packet_offset, int soft_clip) + int self_delimited, opus_int32 *packet_offset, int soft_clip, const OpusDRED *dred, opus_int32 dred_offset) { int i, nb_samples; int count, offset; @@ -639,6 +683,35 @@ int opus_decode_native(OpusDecoder *st, const unsigned char *data, /* For FEC/PLC, frame_size has to be to have a multiple of 2.5 ms */ if ((decode_fec || len==0 || data==NULL) && frame_size%(st->Fs/400)!=0) return OPUS_BAD_ARG; +#ifdef ENABLE_DRED + if (dred != NULL && dred->process_stage == 2) { + int F10; + int features_per_frame; + int needed_feature_frames; + int init_frames; + lpcnet_plc_fec_clear(&st->lpcnet); + F10 = st->Fs/100; + /* if blend==0, the last PLC call was "update" and we need to feed two extra 10-ms frames. */ + init_frames = (st->lpcnet.blend == 0) ? 2 : 0; + features_per_frame = IMAX(1, frame_size/F10); + needed_feature_frames = init_frames + features_per_frame; + lpcnet_plc_fec_clear(&st->lpcnet); + for (i=0;idred_offset*F10/4)/F10); + if (feature_offset <= 4*dred->nb_latents-1 && feature_offset >= 0) { + lpcnet_plc_fec_add(&st->lpcnet, dred->fec_features+feature_offset*DRED_NUM_FEATURES); + } else { + if (feature_offset >= 0) lpcnet_plc_fec_add(&st->lpcnet, NULL); + } + + } + } +#else + (void)dred; + (void)dred_offset; +#endif if (len==0 || data==NULL) { int pcm_count=0; @@ -663,7 +736,7 @@ int opus_decode_native(OpusDecoder *st, const unsigned char *data, packet_stream_channels = opus_packet_get_nb_channels(data); count = opus_packet_parse_impl(data, len, self_delimited, &toc, NULL, - size, &offset, packet_offset); + size, &offset, packet_offset, NULL, NULL); if (count<0) return count; @@ -675,12 +748,12 @@ int opus_decode_native(OpusDecoder *st, const unsigned char *data, int ret; /* If no FEC can be present, run the PLC (recursive call) */ if (frame_size < packet_frame_size || packet_mode == MODE_CELT_ONLY || st->mode == MODE_CELT_ONLY) - return opus_decode_native(st, NULL, 0, pcm, frame_size, 0, 0, NULL, soft_clip); + return opus_decode_native(st, NULL, 0, pcm, frame_size, 0, 0, NULL, soft_clip, NULL, 0); /* Otherwise, run the PLC on everything except the size for which we might have FEC */ duration_copy = st->last_packet_duration; if (frame_size-packet_frame_size!=0) { - ret = opus_decode_native(st, NULL, 0, pcm, frame_size-packet_frame_size, 0, 0, NULL, soft_clip); + ret = opus_decode_native(st, NULL, 0, pcm, frame_size-packet_frame_size, 0, 0, NULL, soft_clip, NULL, 0); if (ret<0) { st->last_packet_duration = duration_copy; @@ -744,7 +817,7 @@ int opus_decode(OpusDecoder *st, const unsigned char *data, { if(frame_size<=0) return OPUS_BAD_ARG; - return opus_decode_native(st, data, len, pcm, frame_size, decode_fec, 0, NULL, 0); + return opus_decode_native(st, data, len, pcm, frame_size, decode_fec, 0, NULL, 0, NULL, 0); } #ifndef DISABLE_FLOAT_API @@ -772,7 +845,7 @@ int opus_decode_float(OpusDecoder *st, const unsigned char *data, celt_assert(st->channels == 1 || st->channels == 2); ALLOC(out, frame_size*st->channels, opus_int16); - ret = opus_decode_native(st, data, len, out, frame_size, decode_fec, 0, NULL, 0); + ret = opus_decode_native(st, data, len, out, frame_size, decode_fec, 0, NULL, 0, NULL, 0); if (ret > 0) { for (i=0;ichannels;i++) @@ -810,7 +883,7 @@ int opus_decode(OpusDecoder *st, const unsigned char *data, celt_assert(st->channels == 1 || st->channels == 2); ALLOC(out, frame_size*st->channels, float); - ret = opus_decode_native(st, data, len, out, frame_size, decode_fec, 0, NULL, 1); + ret = opus_decode_native(st, data, len, out, frame_size, decode_fec, 0, NULL, 1, NULL, 0); if (ret > 0) { for (i=0;ichannels;i++) @@ -825,7 +898,7 @@ int opus_decode_float(OpusDecoder *st, const unsigned char *data, { if(frame_size<=0) return OPUS_BAD_ARG; - return opus_decode_native(st, data, len, pcm, frame_size, decode_fec, 0, NULL, 0); + return opus_decode_native(st, data, len, pcm, frame_size, decode_fec, 0, NULL, 0, NULL, 0); } #endif @@ -855,6 +928,27 @@ int opus_decoder_ctl(OpusDecoder *st, int request, ...) *value = st->bandwidth; } break; + case OPUS_SET_COMPLEXITY_REQUEST: + { + opus_int32 value = va_arg(ap, opus_int32); + if(value<0 || value>10) + { + goto bad_arg; + } + st->complexity = value; + celt_decoder_ctl(celt_dec, OPUS_SET_COMPLEXITY(value)); + } + break; + case OPUS_GET_COMPLEXITY_REQUEST: + { + opus_int32 *value = va_arg(ap, opus_int32*); + if (!value) + { + goto bad_arg; + } + *value = st->complexity; + } + break; case OPUS_GET_FINAL_RANGE_REQUEST: { opus_uint32 *value = va_arg(ap, opus_uint32*); @@ -872,9 +966,12 @@ int opus_decoder_ctl(OpusDecoder *st, int request, ...) ((char*)&st->OPUS_DECODER_RESET_START - (char*)st)); celt_decoder_ctl(celt_dec, OPUS_RESET_STATE); - silk_InitDecoder( silk_dec ); + silk_ResetDecoder( silk_dec ); st->stream_channels = st->channels; st->frame_size = st->Fs/400; +#ifdef ENABLE_DEEP_PLC + lpcnet_plc_reset( &st->lpcnet ); +#endif } break; case OPUS_GET_SAMPLE_RATE_REQUEST: @@ -950,6 +1047,20 @@ int opus_decoder_ctl(OpusDecoder *st, int request, ...) ret = celt_decoder_ctl(celt_dec, OPUS_GET_PHASE_INVERSION_DISABLED(value)); } break; +#ifdef USE_WEIGHTS_FILE + case OPUS_SET_DNN_BLOB_REQUEST: + { + const unsigned char *data = va_arg(ap, const unsigned char *); + opus_int32 len = va_arg(ap, opus_int32); + if(len<0 || data == NULL) + { + goto bad_arg; + } + ret = lpcnet_plc_load_model(&st->lpcnet, data, len); + ret = silk_LoadOSCEModels(silk_dec, data, len) || ret; + } + break; +#endif default: /*fprintf(stderr, "unknown opus_decoder_ctl() request: %d", request);*/ ret = OPUS_UNIMPLEMENTED; @@ -1025,8 +1136,373 @@ int opus_packet_get_nb_samples(const unsigned char packet[], opus_int32 len, return samples; } +int opus_packet_has_lbrr(const unsigned char packet[], opus_int32 len) +{ + int ret; + const unsigned char *frames[48]; + opus_int16 size[48]; + int packet_mode, packet_frame_size, packet_stream_channels; + int nb_frames=1; + int lbrr; + + packet_mode = opus_packet_get_mode(packet); + if (packet_mode == MODE_CELT_ONLY) + return 0; + packet_frame_size = opus_packet_get_samples_per_frame(packet, 48000); + if (packet_frame_size > 960) + nb_frames = packet_frame_size/960; + packet_stream_channels = opus_packet_get_nb_channels(packet); + ret = opus_packet_parse(packet, len, NULL, frames, size, NULL); + if (ret <= 0) + return ret; + lbrr = (frames[0][0] >> (7-nb_frames)) & 0x1; + if (packet_stream_channels == 2) + lbrr = lbrr || ((frames[0][0] >> (6-2*nb_frames)) & 0x1); + return lbrr; +} + int opus_decoder_get_nb_samples(const OpusDecoder *dec, const unsigned char packet[], opus_int32 len) { return opus_packet_get_nb_samples(packet, len, dec->Fs); } + +struct OpusDREDDecoder { +#ifdef ENABLE_DRED + RDOVAEDec model; +#endif + int loaded; + int arch; + opus_uint32 magic; +}; + +#if defined(ENABLE_DRED) && (defined(ENABLE_HARDENING) || defined(ENABLE_ASSERTIONS)) +static void validate_dred_decoder(OpusDREDDecoder *st) +{ + celt_assert(st->magic == 0xD8EDDEC0); +#ifdef OPUS_ARCHMASK + celt_assert(st->arch >= 0); + celt_assert(st->arch <= OPUS_ARCHMASK); +#endif +} +#define VALIDATE_DRED_DECODER(st) validate_dred_decoder(st) +#else +#define VALIDATE_DRED_DECODER(st) +#endif + + +int opus_dred_decoder_get_size(void) +{ + return sizeof(OpusDREDDecoder); +} + +#ifdef ENABLE_DRED +int dred_decoder_load_model(OpusDREDDecoder *dec, const unsigned char *data, int len) +{ + WeightArray *list; + int ret; + parse_weights(&list, data, len); + ret = init_rdovaedec(&dec->model, list); + opus_free(list); + if (ret == 0) dec->loaded = 1; + return (ret == 0) ? OPUS_OK : OPUS_BAD_ARG; +} +#endif + +int opus_dred_decoder_init(OpusDREDDecoder *dec) +{ + int ret = 0; + dec->loaded = 0; +#if defined(ENABLE_DRED) && !defined(USE_WEIGHTS_FILE) + ret = init_rdovaedec(&dec->model, rdovaedec_arrays); + if (ret == 0) dec->loaded = 1; +#endif + dec->arch = opus_select_arch(); + /* To make sure nobody forgets to init, use a magic number. */ + dec->magic = 0xD8EDDEC0; + return (ret == 0) ? OPUS_OK : OPUS_UNIMPLEMENTED; +} + +OpusDREDDecoder *opus_dred_decoder_create(int *error) +{ + int ret; + OpusDREDDecoder *dec; + dec = (OpusDREDDecoder *)opus_alloc(opus_dred_decoder_get_size()); + if (dec == NULL) + { + if (error) + *error = OPUS_ALLOC_FAIL; + return NULL; + } + ret = opus_dred_decoder_init(dec); + if (error) + *error = ret; + if (ret != OPUS_OK) + { + opus_free(dec); + dec = NULL; + } + return dec; +} + +void opus_dred_decoder_destroy(OpusDREDDecoder *dec) +{ + if (dec) dec->magic = 0xDE57801D; + opus_free(dec); +} + +int opus_dred_decoder_ctl(OpusDREDDecoder *dred_dec, int request, ...) +{ +#ifdef ENABLE_DRED + int ret = OPUS_OK; + va_list ap; + + va_start(ap, request); + (void)dred_dec; + switch (request) + { +# ifdef USE_WEIGHTS_FILE + case OPUS_SET_DNN_BLOB_REQUEST: + { + const unsigned char *data = va_arg(ap, const unsigned char *); + opus_int32 len = va_arg(ap, opus_int32); + if(len<0 || data == NULL) + { + goto bad_arg; + } + return dred_decoder_load_model(dred_dec, data, len); + } + break; +# endif + default: + /*fprintf(stderr, "unknown opus_decoder_ctl() request: %d", request);*/ + ret = OPUS_UNIMPLEMENTED; + break; + } + va_end(ap); + return ret; +# ifdef USE_WEIGHTS_FILE +bad_arg: + va_end(ap); + return OPUS_BAD_ARG; +# endif +#else + (void)dred_dec; + (void)request; + return OPUS_UNIMPLEMENTED; +#endif +} + +#ifdef ENABLE_DRED +static int dred_find_payload(const unsigned char *data, opus_int32 len, const unsigned char **payload, int *dred_frame_offset) +{ + const unsigned char *data0; + int len0; + int frame = 0; + int ret; + const unsigned char *frames[48]; + opus_int16 size[48]; + int frame_size; + + *payload = NULL; + /* Get the padding section of the packet. */ + ret = opus_packet_parse_impl(data, len, 0, NULL, frames, size, NULL, NULL, &data0, &len0); + if (ret < 0) + return ret; + frame_size = opus_packet_get_samples_per_frame(data, 48000); + data = data0; + len = len0; + /* Scan extensions in order until we find the earliest frame with DRED data. */ + while (len > 0) + { + opus_int32 header_size; + int id, L; + len0 = len; + data0 = data; + id = *data0 >> 1; + L = *data0 & 0x1; + len = skip_extension(&data, len, &header_size); + if (len < 0) + break; + if (id == 1) + { + if (L==0) + { + frame++; + } else { + frame += data0[1]; + } + } else if (id == DRED_EXTENSION_ID) + { + const unsigned char *curr_payload; + opus_int32 curr_payload_len; + curr_payload = data0+header_size; + curr_payload_len = (data-data0)-header_size; + /* DRED position in the packet, in units of 2.5 ms like for the signaled DRED offset. */ + *dred_frame_offset = frame*frame_size/120; +#ifdef DRED_EXPERIMENTAL_VERSION + /* Check that temporary extension type and version match. + This check will be removed once extension is finalized. */ + if (curr_payload_len > DRED_EXPERIMENTAL_BYTES && curr_payload[0] == 'D' && curr_payload[1] == DRED_EXPERIMENTAL_VERSION) { + *payload = curr_payload+2; + return curr_payload_len-2; + } +#else + if (curr_payload_len > 0) { + *payload = curr_payload; + return curr_payload_len; + } +#endif + } + } + return 0; +} +#endif + +int opus_dred_get_size(void) +{ +#ifdef ENABLE_DRED + return sizeof(OpusDRED); +#else + return 0; +#endif +} + +OpusDRED *opus_dred_alloc(int *error) +{ +#ifdef ENABLE_DRED + OpusDRED *dec; + dec = (OpusDRED *)opus_alloc(opus_dred_get_size()); + if (dec == NULL) + { + if (error) + *error = OPUS_ALLOC_FAIL; + return NULL; + } + return dec; +#else + if (error) + *error = OPUS_UNIMPLEMENTED; + return NULL; +#endif +} + +void opus_dred_free(OpusDRED *dec) +{ +#ifdef ENABLE_DRED + opus_free(dec); +#else + (void)dec; +#endif +} + +int opus_dred_parse(OpusDREDDecoder *dred_dec, OpusDRED *dred, const unsigned char *data, opus_int32 len, opus_int32 max_dred_samples, opus_int32 sampling_rate, int *dred_end, int defer_processing) +{ +#ifdef ENABLE_DRED + const unsigned char *payload; + opus_int32 payload_len; + int dred_frame_offset=0; + VALIDATE_DRED_DECODER(dred_dec); + if (!dred_dec->loaded) return OPUS_UNIMPLEMENTED; + dred->process_stage = -1; + payload_len = dred_find_payload(data, len, &payload, &dred_frame_offset); + if (payload_len < 0) + return payload_len; + if (payload != NULL) + { + int offset; + int min_feature_frames; + offset = 100*max_dred_samples/sampling_rate; + min_feature_frames = IMIN(2 + offset, 2*DRED_NUM_REDUNDANCY_FRAMES); + dred_ec_decode(dred, payload, payload_len, min_feature_frames, dred_frame_offset); + if (!defer_processing) + opus_dred_process(dred_dec, dred, dred); + if (dred_end) *dred_end = IMAX(0, -dred->dred_offset*sampling_rate/400); + return IMAX(0, dred->nb_latents*sampling_rate/25 - dred->dred_offset* sampling_rate/400); + } + if (dred_end) *dred_end = 0; + return 0; +#else + (void)dred_dec; + (void)dred; + (void)data; + (void)len; + (void)max_dred_samples; + (void)sampling_rate; + (void)defer_processing; + (void)dred_end; + return OPUS_UNIMPLEMENTED; +#endif +} + +int opus_dred_process(OpusDREDDecoder *dred_dec, const OpusDRED *src, OpusDRED *dst) +{ +#ifdef ENABLE_DRED + if (dred_dec == NULL || src == NULL || dst == NULL || (src->process_stage != 1 && src->process_stage != 2)) + return OPUS_BAD_ARG; + VALIDATE_DRED_DECODER(dred_dec); + if (!dred_dec->loaded) return OPUS_UNIMPLEMENTED; + if (src != dst) + OPUS_COPY(dst, src, 1); + if (dst->process_stage == 2) + return OPUS_OK; + DRED_rdovae_decode_all(&dred_dec->model, dst->fec_features, dst->state, dst->latents, dst->nb_latents, dred_dec->arch); + dst->process_stage = 2; + return OPUS_OK; +#else + (void)dred_dec; + (void)src; + (void)dst; + return OPUS_UNIMPLEMENTED; +#endif +} + +int opus_decoder_dred_decode(OpusDecoder *st, const OpusDRED *dred, opus_int32 dred_offset, opus_int16 *pcm, opus_int32 frame_size) +{ +#ifdef ENABLE_DRED + VARDECL(float, out); + int ret, i; + ALLOC_STACK; + + if(frame_size<=0) + { + RESTORE_STACK; + return OPUS_BAD_ARG; + } + + celt_assert(st->channels == 1 || st->channels == 2); + ALLOC(out, frame_size*st->channels, float); + + ret = opus_decode_native(st, NULL, 0, out, frame_size, 0, 0, NULL, 1, dred, dred_offset); + if (ret > 0) + { + for (i=0;ichannels;i++) + pcm[i] = FLOAT2INT16(out[i]); + } + RESTORE_STACK; + return ret; +#else + (void)st; + (void)dred; + (void)dred_offset; + (void)pcm; + (void)frame_size; + return OPUS_UNIMPLEMENTED; +#endif +} + +int opus_decoder_dred_decode_float(OpusDecoder *st, const OpusDRED *dred, opus_int32 dred_offset, float *pcm, opus_int32 frame_size) +{ +#ifdef ENABLE_DRED + if(frame_size<=0) + return OPUS_BAD_ARG; + return opus_decode_native(st, NULL, 0, pcm, frame_size, 0, 0, NULL, 0, dred, dred_offset); +#else + (void)st; + (void)dred; + (void)dred_offset; + (void)pcm; + (void)frame_size; + return OPUS_UNIMPLEMENTED; +#endif +} diff --git a/opus/src/opus_demo.c b/opus/src/opus_demo.c index 4cc26a6c..2876fff8 100644 --- a/opus/src/opus_demo.c +++ b/opus/src/opus_demo.c @@ -39,9 +39,80 @@ #include "opus_types.h" #include "opus_private.h" #include "opus_multistream.h" +#ifdef ENABLE_LOSSGEN +#include "lossgen.h" +#endif #define MAX_PACKET 1500 +#ifdef USE_WEIGHTS_FILE +# if __unix__ +# include +# include +# include +# include +/* When available, mmap() is preferable to reading the file, as it leads to + better resource utilization, especially if multiple processes are using the same + file (mapping will be shared in cache). */ +void *load_blob(const char *filename, int *len) { + int fd; + void *data; + struct stat st; + if (stat(filename, &st)) { + *len = 0; + return NULL; + } + *len = st.st_size; + fd = open(filename, O_RDONLY); + if (fd<0) { + *len = 0; + return NULL; + } + data = mmap(NULL, *len, PROT_READ, MAP_SHARED, fd, 0); + if (data == MAP_FAILED) { + *len = 0; + data = NULL; + } + close(fd); + return data; +} +void free_blob(void *blob, int len) { + if (blob) munmap(blob, len); +} +# else +void *load_blob(const char *filename, int *len) { + FILE *file; + void *data; + file = fopen(filename, "r"); + if (file == NULL) + { + perror("could not open blob file"); + *len = 0; + return NULL; + } + fseek(file, 0L, SEEK_END); + *len = ftell(file); + fseek(file, 0L, SEEK_SET); + if (*len <= 0) { + *len = 0; + return NULL; + } + data = malloc(*len); + if (!data) { + *len = 0; + return NULL; + } + *len = fread(data, 1, *len, file); + return data; +} +void free_blob(void *blob, int len) { + free(blob); + (void)len; +} +# endif +#endif + + void print_usage( char* argv[] ) { fprintf(stderr, "Usage: %s [-e] " @@ -58,11 +129,17 @@ void print_usage( char* argv[] ) fprintf(stderr, "-bandwidth : audio bandwidth (from narrowband to fullband); default: sampling rate\n" ); fprintf(stderr, "-framesize <2.5|5|10|20|40|60|80|100|120> : frame size in ms; default: 20 \n" ); fprintf(stderr, "-max_payload : maximum payload size in bytes, default: 1024\n" ); - fprintf(stderr, "-complexity : complexity, 0 (lowest) ... 10 (highest); default: 10\n" ); + fprintf(stderr, "-complexity : encoder complexity, 0 (lowest) ... 10 (highest); default: 10\n" ); + fprintf(stderr, "-dec_complexity : decoder complexity, 0 (lowest) ... 10 (highest); default: 0\n" ); fprintf(stderr, "-inbandfec : enable SILK inband FEC\n" ); fprintf(stderr, "-forcemono : force mono encoding, even for stereo input\n" ); fprintf(stderr, "-dtx : enable SILK DTX\n" ); - fprintf(stderr, "-loss : simulate packet loss, in percent (0-100); default: 0\n" ); + fprintf(stderr, "-loss : optimize for loss percentage and simulate packet loss, in percent (0-100); default: 0\n" ); +#ifdef ENABLE_LOSSGEN + fprintf(stderr, "-sim_loss : simulate realistic (bursty) packet loss from percentage, using generative model\n" ); +#endif + fprintf(stderr, "-lossfile : simulate packet loss, reading loss from file\n" ); + fprintf(stderr, "-dred : add Deep REDundancy (in units of 10-ms frames)\n" ); } static void int_to_char(opus_uint32 i, unsigned char ch[4]) @@ -80,6 +157,7 @@ static opus_uint32 char_to_int(unsigned char ch[4]) } #define check_encoder_option(decode_only, opt) do {if (decode_only) {fprintf(stderr, "option %s is only for encoding\n", opt); goto failure;}} while(0) +#define check_decoder_option(encode_only, opt) do {if (encode_only) {fprintf(stderr, "option %s is only for decoding\n", opt); goto failure;}} while(0) static const int silk8_test[][4] = { {MODE_SILK_ONLY, OPUS_BANDWIDTH_NARROWBAND, 960*3, 1}, @@ -207,6 +285,68 @@ static OpusDecoder *ms_opus_decoder_create(opus_int32 Fs, int channels, int *err } #endif + +#ifdef ENABLE_OSCE_TRAINING_DATA +#define COMPLEXITY_MIN 0 +#define COMPLEXITY_MAX 10 + +#define PACKET_LOSS_PERC_MIN 0 +#define PACKET_LOSS_PERC_MAX 50 +#define PACKET_LOSS_PERC_STEP 5 + +#define CBR_BITRATE_LIMIT 8000 + +#define NUM_BITRATES 102 +static int bitrates[NUM_BITRATES] = { + 6000, 6060, 6120, 6180, 6240, 6300, 6360, 6420, 6480, + 6525, 6561, 6598, 6634, 6670, 6707, 6743, 6780, 6816, + 6853, 6889, 6926, 6962, 6999, 7042, 7085, 7128, 7171, + 7215, 7258, 7301, 7344, 7388, 7431, 7474, 7512, 7541, + 7570, 7599, 7628, 7657, 7686, 7715, 7744, 7773, 7802, + 7831, 7860, 7889, 7918, 7947, 7976, 8013, 8096, 8179, + 8262, 8344, 8427, 8511, 8605, 8699, 8792, 8886, 8980, + 9100, 9227, 9354, 9480, 9561, 9634, 9706, 9779, 9851, + 9924, 9996, 10161, 10330, 10499, 10698, 10898, 11124, 11378, + 11575, 11719, 11862, 12014, 12345, 12751, 13195, 13561, 13795, + 14069, 14671, 15403, 15790, 16371, 17399, 17968, 19382, 20468, + 22000, 32000, 64000 +}; + +static int randint(int min, int max, int step) +{ + double r = ((double) rand())/ (RAND_MAX + 1.); + int d; + + d = ((int) ((max + 1 - min) * r / step) * step) + min; + + return d; +} + +static void new_random_setting(OpusEncoder *enc) +{ + int bitrate_bps; + int complexity; + int packet_loss_perc; + int use_vbr; + + bitrate_bps = bitrates[randint(0, NUM_BITRATES - 1, 1)]; + complexity = randint(COMPLEXITY_MIN, COMPLEXITY_MAX, 1); + packet_loss_perc = randint(PACKET_LOSS_PERC_MIN, PACKET_LOSS_PERC_MAX, PACKET_LOSS_PERC_STEP); + use_vbr = bitrate_bps < CBR_BITRATE_LIMIT ? 1 : randint(0, 1, 1); + + if (1) + { + printf("changing settings to %d\t%d\t%d\t%d\n", bitrate_bps, complexity, packet_loss_perc, use_vbr); + } + + opus_encoder_ctl(enc, OPUS_SET_BITRATE(bitrate_bps)); + opus_encoder_ctl(enc, OPUS_SET_COMPLEXITY(complexity)); + opus_encoder_ctl(enc, OPUS_SET_PACKET_LOSS_PERC(packet_loss_perc)); + opus_encoder_ctl(enc, OPUS_SET_VBR(use_vbr)); +} + +#endif + int main(int argc, char *argv[]) { int err; @@ -215,21 +355,28 @@ int main(int argc, char *argv[]) FILE *fout=NULL; OpusEncoder *enc=NULL; OpusDecoder *dec=NULL; + OpusDRED *dred=NULL; + OpusDREDDecoder *dred_dec=NULL; int args; - int len[2]; + int len; int frame_size, channels; opus_int32 bitrate_bps=0; - unsigned char *data[2] = {NULL, NULL}; + unsigned char *data = NULL; unsigned char *fbytes=NULL; opus_int32 sampling_rate; int use_vbr; int max_payload_bytes; int complexity; + int dec_complexity; int use_inbandfec; int use_dtx; int forcechannels; int cvbr = 0; int packet_loss_perc; +#ifdef ENABLE_LOSSGEN + float lossgen_perc = -1.f; + LossGenState lossgen; +#endif opus_int32 count=0, count_act=0; int k; opus_int32 skip=0; @@ -243,8 +390,7 @@ int main(int argc, char *argv[]) int bandwidth=OPUS_AUTO; const char *bandwidth_string; int lost = 0, lost_prev = 1; - int toggle = 0; - opus_uint32 enc_final_range[2]; + opus_uint32 enc_final_range; opus_uint32 dec_final_range; int encode_only=0, decode_only=0; int max_frame_size = 48000*2; @@ -264,6 +410,19 @@ int main(int argc, char *argv[]) int variable_duration=OPUS_FRAMESIZE_ARG; int delayed_decision=0; int ret = EXIT_FAILURE; + int lost_count=0; + FILE *packet_loss_file=NULL; + int dred_duration=0; +#ifdef ENABLE_OSCE_TRAINING_DATA + int silk_random_switching = 0; + int silk_frame_counter = 0; +#endif +#ifdef USE_WEIGHTS_FILE + int blob_len; + void *blob_data; + const char *filename = "weights_blob.bin"; + blob_data = load_blob(filename, &blob_len); +#endif if (argc < 5 ) { @@ -335,6 +494,7 @@ int main(int argc, char *argv[]) use_vbr = 1; max_payload_bytes = MAX_PACKET; complexity = 10; + dec_complexity = 0; use_inbandfec = 0; forcechannels = OPUS_AUTO; use_dtx = 0; @@ -400,6 +560,10 @@ int main(int argc, char *argv[]) check_encoder_option(decode_only, "-complexity"); complexity = atoi( argv[ args + 1 ] ); args += 2; + } else if( strcmp( argv[ args ], "-dec_complexity" ) == 0 ) { + check_decoder_option(encode_only, "-dec_complexity"); + dec_complexity = atoi( argv[ args + 1 ] ); + args += 2; } else if( strcmp( argv[ args ], "-inbandfec" ) == 0 ) { use_inbandfec = 1; args++; @@ -422,6 +586,22 @@ int main(int argc, char *argv[]) } else if( strcmp( argv[ args ], "-loss" ) == 0 ) { packet_loss_perc = atoi( argv[ args + 1 ] ); args += 2; +#ifdef ENABLE_LOSSGEN + } else if( strcmp( argv[ args ], "-sim_loss" ) == 0 ) { + lossgen_perc = atof( argv[ args + 1 ] ); + lossgen_init(&lossgen); + args += 2; +#endif + } else if( strcmp( argv[ args ], "-lossfile" ) == 0 ) { + packet_loss_file = fopen( argv[ args + 1 ], "r" ); + if (packet_loss_file == NULL) { + fprintf(stderr, "failed to open loss file %s\n", argv[ args + 1 ] ); + exit(1); + } + args += 2; + } else if( strcmp( argv[ args ], "-dred" ) == 0 ) { + dred_duration = atoi( argv[ args + 1 ] ); + args += 2; } else if( strcmp( argv[ args ], "-sweep" ) == 0 ) { check_encoder_option(decode_only, "-sweep"); sweep_bps = atoi( argv[ args + 1 ] ); @@ -473,6 +653,12 @@ int main(int argc, char *argv[]) mode_list = celt_hq_test; nb_modes_in_list = 4; args++; +#ifdef ENABLE_OSCE_TRAINING_DATA + } else if( strcmp( argv[ args ], "-silk_random_switching" ) == 0 ){ + silk_random_switching = atoi( argv[ args + 1 ] ); + printf("switching encoding parameters every %dth frame\n", silk_random_switching); + args += 2; +#endif } else { printf( "Error: unrecognized setting: %s\n\n", argv[ args ] ); print_usage( argv ); @@ -537,6 +723,10 @@ int main(int argc, char *argv[]) opus_encoder_ctl(enc, OPUS_GET_LOOKAHEAD(&skip)); opus_encoder_ctl(enc, OPUS_SET_LSB_DEPTH(16)); opus_encoder_ctl(enc, OPUS_SET_EXPERT_FRAME_DURATION(variable_duration)); + if (dred_duration > 0) + { + opus_encoder_ctl(enc, OPUS_SET_DRED_DURATION(dred_duration)); + } } if (!encode_only) { @@ -546,9 +736,8 @@ int main(int argc, char *argv[]) fprintf(stderr, "Cannot create decoder: %s\n", opus_strerror(err)); goto failure; } + opus_decoder_ctl(dec, OPUS_SET_COMPLEXITY(dec_complexity)); } - - switch(bandwidth) { case OPUS_BANDWIDTH_NARROWBAND: @@ -587,10 +776,7 @@ int main(int argc, char *argv[]) out = (short*)malloc(max_frame_size*channels*sizeof(short)); /* We need to allocate for 16-bit PCM data, but we store it as unsigned char. */ fbytes = (unsigned char*)malloc(max_frame_size*channels*sizeof(short)); - data[0] = (unsigned char*)calloc(max_payload_bytes,sizeof(unsigned char)); - if ( use_inbandfec ) { - data[1] = (unsigned char*)calloc(max_payload_bytes,sizeof(unsigned char)); - } + data = (unsigned char*)calloc(max_payload_bytes,sizeof(unsigned char)); if(delayed_decision) { if (frame_size==sampling_rate/400) @@ -614,6 +800,13 @@ int main(int argc, char *argv[]) opus_encoder_ctl(enc, OPUS_SET_EXPERT_FRAME_DURATION(variable_duration)); frame_size = 2*48000; } + dred_dec = opus_dred_decoder_create(&err); + dred = opus_dred_alloc(&err); +#ifdef USE_WEIGHTS_FILE + if (enc) opus_encoder_ctl(enc, OPUS_SET_DNN_BLOB(blob_data, blob_len)); + if (dec) opus_decoder_ctl(dec, OPUS_SET_DNN_BLOB(blob_data, blob_len)); + if (dred_dec) opus_dred_decoder_ctl(dred_dec, OPUS_SET_DNN_BLOB(blob_data, blob_len)); +#endif while (!stop) { if (delayed_celt) @@ -652,22 +845,22 @@ int main(int argc, char *argv[]) num_read = fread(ch, 1, 4, fin); if (num_read!=4) break; - len[toggle] = char_to_int(ch); - if (len[toggle]>max_payload_bytes || len[toggle]<0) + len = char_to_int(ch); + if (len>max_payload_bytes || len<0) { - fprintf(stderr, "Invalid payload length: %d\n",len[toggle]); + fprintf(stderr, "Invalid payload length: %d\n",len); break; } num_read = fread(ch, 1, 4, fin); if (num_read!=4) break; - enc_final_range[toggle] = char_to_int(ch); - num_read = fread(data[toggle], 1, len[toggle], fin); - if (num_read!=(size_t)len[toggle]) + enc_final_range = char_to_int(ch); + num_read = fread(data, 1, len, fin); + if (num_read!=(size_t)len) { fprintf(stderr, "Ran out of input, " "expecting %d bytes got %d\n", - len[toggle],(int)num_read); + len,(int)num_read); break; } } else { @@ -679,6 +872,15 @@ int main(int argc, char *argv[]) opus_encoder_ctl(enc, OPUS_SET_FORCE_CHANNELS(mode_list[curr_mode][3])); frame_size = mode_list[curr_mode][2]; } +#ifdef ENABLE_OSCE_TRAINING_DATA + if (silk_random_switching) + { + silk_frame_counter += 1; + if (silk_frame_counter % silk_random_switching == 0) { + new_random_setting(enc); + } + } +#endif num_read = fread(fbytes, sizeof(short)*channels, frame_size-remaining, fin); curr_read = (int)num_read; tot_in += curr_read; @@ -696,8 +898,13 @@ int main(int argc, char *argv[]) if (encode_only || decode_only) stop = 1; } - len[toggle] = opus_encode(enc, in, frame_size, data[toggle], max_payload_bytes); - nb_encoded = opus_packet_get_samples_per_frame(data[toggle], sampling_rate)*opus_packet_get_nb_frames(data[toggle], len[toggle]); + len = opus_encode(enc, in, frame_size, data, max_payload_bytes); + if (len < 0) + { + fprintf (stderr, "opus_encode() returned %d\n", len); + goto failure; + } + nb_encoded = opus_packet_get_samples_per_frame(data, sampling_rate)*opus_packet_get_nb_frames(data, len); remaining = frame_size-nb_encoded; for(i=0;i mode_switch_time && curr_mode < nb_modes_in_list-1) { @@ -731,56 +933,84 @@ int main(int argc, char *argv[]) } #if 0 /* This is for testing the padding code, do not enable by default */ - if (len[toggle]<1275) + if (len<1275) { - int new_len = len[toggle]+rand()%(max_payload_bytes-len[toggle]); - if ((err = opus_packet_pad(data[toggle], len[toggle], new_len)) != OPUS_OK) + int new_len = len+rand()%(max_payload_bytes-len); + if ((err = opus_packet_pad(data, len, new_len)) != OPUS_OK) { fprintf(stderr, "padding failed: %s\n", opus_strerror(err)); goto failure; } - len[toggle] = new_len; + len = new_len; } #endif if (encode_only) { unsigned char int_field[4]; - int_to_char(len[toggle], int_field); + int_to_char(len, int_field); if (fwrite(int_field, 1, 4, fout) != 4) { fprintf(stderr, "Error writing.\n"); goto failure; } - int_to_char(enc_final_range[toggle], int_field); + int_to_char(enc_final_range, int_field); if (fwrite(int_field, 1, 4, fout) != 4) { fprintf(stderr, "Error writing.\n"); goto failure; } - if (fwrite(data[toggle], 1, len[toggle], fout) != (unsigned)len[toggle]) { + if (fwrite(data, 1, len, fout) != (unsigned)len) { fprintf(stderr, "Error writing.\n"); goto failure; } tot_samples += nb_encoded; } else { - opus_int32 output_samples; - lost = len[toggle]==0 || (packet_loss_perc>0 && rand()%100 < packet_loss_perc); + int fr; + int run_decoder; + int dred_input=0; + int dred_end=0; + if (packet_loss_file != NULL) { + if ( fscanf(packet_loss_file, "%d", &lost) != 1) { + lost = 0; + } +#ifdef ENABLE_LOSSGEN + } else if (lossgen_perc >= 0) { + lost = sample_loss(&lossgen, lossgen_perc*.01f); +#endif + } else { + lost = (packet_loss_perc>0) && (rand()%100 < packet_loss_perc); + } + if (len == 0) lost = 1; if (lost) - opus_decoder_ctl(dec, OPUS_GET_LAST_PACKET_DURATION(&output_samples)); - else - output_samples = max_frame_size; - if( count >= use_inbandfec ) { - /* delay by one packet when using in-band FEC */ - if( use_inbandfec ) { - if( lost_prev ) { - /* attempt to decode with in-band FEC from next packet */ - opus_decoder_ctl(dec, OPUS_GET_LAST_PACKET_DURATION(&output_samples)); - output_samples = opus_decode(dec, lost ? NULL : data[toggle], len[toggle], out, output_samples, 1); - } else { - /* regular decode */ - output_samples = max_frame_size; - output_samples = opus_decode(dec, data[1-toggle], len[1-toggle], out, output_samples, 0); - } + { + lost_count++; + run_decoder = 0; + } else { + run_decoder= 1; + } + if (run_decoder) + run_decoder += lost_count; + if (!lost && lost_count > 0) { + opus_int32 output_samples=0; + opus_decoder_ctl(dec, OPUS_GET_LAST_PACKET_DURATION(&output_samples)); + dred_input = lost_count*output_samples; + /* Only decode the amount we need to fill in the gap. */ + ret = opus_dred_parse(dred_dec, dred, data, len, IMIN(48000, IMAX(0, dred_input)), sampling_rate, &dred_end, 0); + dred_input = ret > 0 ? ret : 0; + } + /* FIXME: Figure out how to trigger the decoder when the last packet of the file is lost. */ + for (fr=0;fr 0) + output_samples = opus_decoder_dred_decode(dec, dred, (lost_count-fr)*output_samples, out, output_samples); + else + output_samples = opus_decode(dec, NULL, 0, out, output_samples, 0); } else { - output_samples = opus_decode(dec, lost ? NULL : data[toggle], len[toggle], out, output_samples, 0); + output_samples = max_frame_size; + output_samples = opus_decode(dec, data, len, out, output_samples, 0); } if (output_samples>0) { @@ -817,24 +1047,26 @@ int main(int argc, char *argv[]) if (!encode_only) opus_decoder_ctl(dec, OPUS_GET_FINAL_RANGE(&dec_final_range)); /* compare final range encoder rng values of encoder and decoder */ - if( enc_final_range[toggle^use_inbandfec]!=0 && !encode_only + if( enc_final_range!=0 && !encode_only && !lost && !lost_prev - && dec_final_range != enc_final_range[toggle^use_inbandfec] ) { + && dec_final_range != enc_final_range ) { fprintf (stderr, "Error: Range coder state mismatch " "between encoder and decoder " "in frame %ld: 0x%8lx vs 0x%8lx\n", (long)count, - (unsigned long)enc_final_range[toggle^use_inbandfec], + (unsigned long)enc_final_range, (unsigned long)dec_final_range); goto failure; } lost_prev = lost; + if (!lost) + lost_count = 0; if( count >= use_inbandfec ) { /* count bits */ - bits += len[toggle]*8; - bits_max = ( len[toggle]*8 > bits_max ) ? len[toggle]*8 : bits_max; - bits2 += len[toggle]*len[toggle]*64; + bits += len*8; + bits_max = ( len*8 > bits_max ) ? len*8 : bits_max; + bits2 += len*len*64; if (!decode_only) { nrg = 0.0; @@ -843,13 +1075,12 @@ int main(int argc, char *argv[]) } nrg /= frame_size * channels; if( nrg > 1e5 ) { - bits_act += len[toggle]*8; + bits_act += len*8; count_act++; } } } count++; - toggle = (toggle + use_inbandfec) & 1; } if(decode_only && count > 0) @@ -879,8 +1110,9 @@ int main(int argc, char *argv[]) failure: opus_encoder_destroy(enc); opus_decoder_destroy(dec); - free(data[0]); - free(data[1]); + opus_dred_free(dred); + opus_dred_decoder_destroy(dred_dec); + free(data); if (fin) fclose(fin); if (fout) @@ -888,5 +1120,8 @@ int main(int argc, char *argv[]) free(in); free(out); free(fbytes); +#ifdef USE_WEIGHTS_FILE + free_blob(blob_data, blob_len); +#endif return ret; } diff --git a/opus/src/opus_encoder.c b/opus/src/opus_encoder.c index e98ac5b8..d18d582f 100644 --- a/opus/src/opus_encoder.c +++ b/opus/src/opus_encoder.c @@ -45,11 +45,19 @@ #include "analysis.h" #include "mathops.h" #include "tuning_parameters.h" + +#ifdef ENABLE_DRED +#include "dred_coding.h" +#endif + #ifdef FIXED_POINT #include "fixed/structs_FIX.h" #else #include "float/structs_FLP.h" #endif +#ifdef ENABLE_OSCE_TRAINING_DATA +#include +#endif #define MAX_ENCODER_BUFFER 480 @@ -67,6 +75,9 @@ struct OpusEncoder { int celt_enc_offset; int silk_enc_offset; silk_EncControlStruct silk_mode; +#ifdef ENABLE_DRED + DREDEnc dred_encoder; +#endif int application; int channels; int delay_compensation; @@ -87,6 +98,7 @@ struct OpusEncoder { int lfe; int arch; int use_dtx; /* general DTX for both SILK and CELT */ + int fec_config; #ifndef DISABLE_FLOAT_API TonalityAnalysisState analysis; #endif @@ -112,8 +124,16 @@ struct OpusEncoder { opus_val16 delay_buffer[MAX_ENCODER_BUFFER*2]; #ifndef DISABLE_FLOAT_API int detected_bandwidth; - int nb_no_activity_frames; + int nb_no_activity_ms_Q1; opus_val32 peak_signal_energy; +#endif +#ifdef ENABLE_DRED + int dred_duration; + int dred_q0; + int dred_dQ; + int dred_qmax; + int dred_target_chunks; + unsigned char activity_mem[DRED_MAX_FRAMES*4]; /* 2.5ms resolution*/ #endif int nonfinal_frame; /* current frame is not the final in a packet */ opus_uint32 rangeFinal; @@ -223,6 +243,7 @@ int opus_encoder_init(OpusEncoder* st, opus_int32 Fs, int channels, int applicat st->silk_mode.packetLossPercentage = 0; st->silk_mode.complexity = 9; st->silk_mode.useInBandFEC = 0; + st->silk_mode.useDRED = 0; st->silk_mode.useDTX = 0; st->silk_mode.useCBR = 0; st->silk_mode.reducedDependency = 0; @@ -235,6 +256,11 @@ int opus_encoder_init(OpusEncoder* st, opus_int32 Fs, int channels, int applicat celt_encoder_ctl(celt_enc, CELT_SET_SIGNALLING(0)); celt_encoder_ctl(celt_enc, OPUS_SET_COMPLEXITY(st->silk_mode.complexity)); +#ifdef ENABLE_DRED + /* Initialize DRED Encoder */ + dred_encoder_init( &st->dred_encoder, Fs, channels ); +#endif + st->use_vbr = 1; /* Makes constrained VBR the default (safer for real-time use) */ st->vbr_constraint = 1; @@ -543,6 +569,73 @@ OpusEncoder *opus_encoder_create(opus_int32 Fs, int channels, int application, i return st; } +#ifdef ENABLE_DRED + +static const float dred_bits_table[16] = {73.2f, 68.1f, 62.5f, 57.0f, 51.5f, 45.7f, 39.9f, 32.4f, 26.4f, 20.4f, 16.3f, 13.f, 9.3f, 8.2f, 7.2f, 6.4f}; +static int estimate_dred_bitrate(int q0, int dQ, int qmax, int duration, opus_int32 target_bits, int *target_chunks) { + int dred_chunks; + int i; + float bits; + /* Signaling DRED costs 3 bytes. */ + bits = 8*(3+DRED_EXPERIMENTAL_BYTES); + /* Approximation for the size of the IS. */ + bits += 50.f+dred_bits_table[q0]; + dred_chunks = IMIN((duration+5)/4, DRED_NUM_REDUNDANCY_FRAMES/2); + if (target_chunks != NULL) *target_chunks = 0; + for (i=0;isilk_mode.useInBandFEC) { + dred_frac = MIN16(.7f, 3.f*st->silk_mode.packetLossPercentage/100.f); + bitrate_offset = 20000; + } else { + if (st->silk_mode.packetLossPercentage > 5) { + dred_frac = MIN16(.8f, .55f + st->silk_mode.packetLossPercentage/100.f); + } else { + dred_frac = 12*st->silk_mode.packetLossPercentage/100.f; + } + bitrate_offset = 12000; + } + /* Account for the fact that longer packets require less redundancy. */ + dred_frac = dred_frac/(dred_frac + (1-dred_frac)*(frame_size*50.f)/st->Fs); + /* Approximate fit based on a few experiments. Could probably be improved. */ + q0 = IMIN(15, IMAX(4, 51 - 3*EC_ILOG(IMAX(1, bitrate_bps-bitrate_offset)))); + dQ = bitrate_bps-bitrate_offset > 36000 ? 3 : 5; + qmax = 15; + target_dred_bitrate = IMAX(0, (int)(dred_frac*(bitrate_bps-bitrate_offset))); + if (st->dred_duration > 0) { + opus_int32 target_bits = target_dred_bitrate*frame_size/st->Fs; + max_dred_bits = estimate_dred_bitrate(q0, dQ, qmax, st->dred_duration, target_bits, &target_chunks); + } else { + max_dred_bits = 0; + target_chunks=0; + } + dred_bitrate = IMIN(target_dred_bitrate, max_dred_bits*st->Fs/frame_size); + /* If we can't afford enough bits, don't bother with DRED at all. */ + if (target_chunks < 2) + dred_bitrate = 0; + st->dred_q0 = q0; + st->dred_dQ = dQ; + st->dred_qmax = qmax; + st->dred_target_chunks = target_chunks; + return dred_bitrate; +} +#endif + static opus_int32 user_bitrate_to_bitrate(OpusEncoder *st, int frame_size, int max_data_bytes) { if(!frame_size)frame_size=st->Fs/400; @@ -871,7 +964,7 @@ static opus_val32 compute_frame_energy(const opus_val16 *pcm, int frame_size, in /* Compute the right shift required in the MAC to avoid an overflow */ max_shift = celt_ilog2(len); - shift = IMAX(0, (celt_ilog2(sample_max) << 1) + max_shift - 28); + shift = IMAX(0, (celt_ilog2(1+sample_max) << 1) + max_shift - 28); /* Compute the energy */ for (i=0; i= (PSEUDO_SNR_THRESHOLD * noise_energy); - } - } +static int decide_dtx_mode(opus_int activity, /* indicates if this frame contains speech/music */ + int *nb_no_activity_ms_Q1, /* number of consecutive milliseconds with no activity, in Q1 */ + int frame_size_ms_Q1 /* number of miliseconds in this update, in Q1 */ + ) - if (is_silence) +{ + if (!activity) { - /* The number of consecutive DTX frames should be within the allowed bounds */ - (*nb_no_activity_frames)++; - - if (*nb_no_activity_frames > NB_SPEECH_FRAMES_BEFORE_DTX) + /* The number of consecutive DTX frames should be within the allowed bounds. + Note that the allowed bound is defined in the SILK headers and assumes 20 ms + frames. As this function can be called with any frame length, a conversion to + milliseconds is done before the comparisons. */ + (*nb_no_activity_ms_Q1) += frame_size_ms_Q1; + if (*nb_no_activity_ms_Q1 > NB_SPEECH_FRAMES_BEFORE_DTX*20*2) { - if (*nb_no_activity_frames <= (NB_SPEECH_FRAMES_BEFORE_DTX + MAX_CONSECUTIVE_DTX)) + if (*nb_no_activity_ms_Q1 <= (NB_SPEECH_FRAMES_BEFORE_DTX + MAX_CONSECUTIVE_DTX)*20*2) /* Valid frame for DTX! */ return 1; else - (*nb_no_activity_frames) = NB_SPEECH_FRAMES_BEFORE_DTX; + (*nb_no_activity_ms_Q1) = NB_SPEECH_FRAMES_BEFORE_DTX*20*2; } } else - (*nb_no_activity_frames) = 0; + (*nb_no_activity_ms_Q1) = 0; return 0; } #endif -static opus_int32 encode_multiframe_packet(OpusEncoder *st, - const opus_val16 *pcm, - int nb_frames, - int frame_size, - unsigned char *data, - opus_int32 out_data_bytes, - int to_celt, - int lsb_depth, - int float_api) -{ - int i; - int ret = 0; - VARDECL(unsigned char, tmp_data); - int bak_mode, bak_bandwidth, bak_channels, bak_to_mono; - VARDECL(OpusRepacketizer, rp); - int max_header_bytes; - opus_int32 bytes_per_frame; - opus_int32 cbr_bytes; - opus_int32 repacketize_len; - int tmp_len; - ALLOC_STACK; - - /* Worst cases: - * 2 frames: Code 2 with different compressed sizes - * >2 frames: Code 3 VBR */ - max_header_bytes = nb_frames == 2 ? 3 : (2+(nb_frames-1)*2); - - if (st->use_vbr || st->user_bitrate_bps==OPUS_BITRATE_MAX) - repacketize_len = out_data_bytes; - else { - cbr_bytes = 3*st->bitrate_bps/(3*8*st->Fs/(frame_size*nb_frames)); - repacketize_len = IMIN(cbr_bytes, out_data_bytes); - } - bytes_per_frame = IMIN(1276, 1+(repacketize_len-max_header_bytes)/nb_frames); - - ALLOC(tmp_data, nb_frames*bytes_per_frame, unsigned char); - ALLOC(rp, 1, OpusRepacketizer); - opus_repacketizer_init(rp); - - bak_mode = st->user_forced_mode; - bak_bandwidth = st->user_bandwidth; - bak_channels = st->force_channels; - - st->user_forced_mode = st->mode; - st->user_bandwidth = st->bandwidth; - st->force_channels = st->stream_channels; - - bak_to_mono = st->silk_mode.toMono; - if (bak_to_mono) - st->force_channels = 1; - else - st->prev_channels = st->stream_channels; - - for (i=0;isilk_mode.toMono = 0; - st->nonfinal_frame = i<(nb_frames-1); - - /* When switching from SILK/Hybrid to CELT, only ask for a switch at the last frame */ - if (to_celt && i==nb_frames-1) - st->user_forced_mode = MODE_CELT_ONLY; - - tmp_len = opus_encode_native(st, pcm+i*(st->channels*frame_size), frame_size, - tmp_data+i*bytes_per_frame, bytes_per_frame, lsb_depth, NULL, 0, 0, 0, 0, - NULL, float_api); - - if (tmp_len<0) - { - RESTORE_STACK; - return OPUS_INTERNAL_ERROR; - } - - ret = opus_repacketizer_cat(rp, tmp_data+i*bytes_per_frame, tmp_len); - - if (ret<0) - { - RESTORE_STACK; - return OPUS_INTERNAL_ERROR; - } - } - - ret = opus_repacketizer_out_range_impl(rp, 0, nb_frames, data, repacketize_len, 0, !st->use_vbr); - - if (ret<0) - { - RESTORE_STACK; - return OPUS_INTERNAL_ERROR; - } - - /* Discard configs that were forced locally for the purpose of repacketization */ - st->user_forced_mode = bak_mode; - st->user_bandwidth = bak_bandwidth; - st->force_channels = bak_channels; - st->silk_mode.toMono = bak_to_mono; - - RESTORE_STACK; - return ret; -} - static int compute_redundancy_bytes(opus_int32 max_data_bytes, opus_int32 bitrate_bps, int frame_rate, int channels) { int redundancy_bytes_cap; @@ -1063,6 +1042,18 @@ static int compute_redundancy_bytes(opus_int32 max_data_bytes, opus_int32 bitrat return redundancy_bytes; } +static opus_int32 opus_encode_frame_native(OpusEncoder *st, const opus_val16 *pcm, int frame_size, + unsigned char *data, opus_int32 max_data_bytes, + int float_api, int first_frame, +#ifdef ENABLE_DRED + opus_int32 dred_bitrate_bps, +#endif +#ifndef DISABLE_FLOAT_API + AnalysisInfo *analysis_info, int is_silence, +#endif + int redundancy, int celt_to_silk, int prefill, + opus_int32 equiv_rate, int to_celt); + opus_int32 opus_encode_native(OpusEncoder *st, const opus_val16 *pcm, int frame_size, unsigned char *data, opus_int32 out_data_bytes, int lsb_depth, const void *analysis_pcm, opus_int32 analysis_size, int c1, int c2, @@ -1072,28 +1063,17 @@ opus_int32 opus_encode_native(OpusEncoder *st, const opus_val16 *pcm, int frame_ CELTEncoder *celt_enc; int i; int ret=0; - opus_int32 nBytes; - ec_enc enc; - int bytes_target; int prefill=0; - int start_band = 0; int redundancy = 0; - int redundancy_bytes = 0; /* Number of bytes to use for redundancy frame */ int celt_to_silk = 0; - VARDECL(opus_val16, pcm_buf); - int nb_compr_bytes; int to_celt = 0; - opus_uint32 redundant_rng = 0; - int cutoff_Hz, hp_freq_smth1; int voice_est; /* Probability of voice in Q7 */ opus_int32 equiv_rate; - int delay_compensation; int frame_rate; opus_int32 max_rate; /* Max bitrate we're allowed to use */ int curr_bandwidth; - opus_val16 HB_gain; opus_int32 max_data_bytes; /* Max number of bytes we're allowed to use */ - int total_buffer; + opus_int32 cbr_bytes=-1; opus_val16 stereo_width; const CELTMode *celt_mode; #ifndef DISABLE_FLOAT_API @@ -1102,8 +1082,9 @@ opus_int32 opus_encode_native(OpusEncoder *st, const opus_val16 *pcm, int frame_ int analysis_read_subframe_bak=-1; int is_silence = 0; #endif - VARDECL(opus_val16, tmp_prefill); - +#ifdef ENABLE_DRED + opus_int32 dred_bitrate_bps; +#endif ALLOC_STACK; max_data_bytes = IMIN(1276, out_data_bytes); @@ -1124,10 +1105,6 @@ opus_int32 opus_encode_native(OpusEncoder *st, const opus_val16 *pcm, int frame_ silk_enc = (char*)st+st->silk_enc_offset; celt_enc = (CELTEncoder*)((char*)st+st->celt_enc_offset); - if (st->application == OPUS_APPLICATION_RESTRICTED_LOWDELAY) - delay_compensation = 0; - else - delay_compensation = st->delay_compensation; lsb_depth = IMIN(lsb_depth, st->lsb_depth); @@ -1205,21 +1182,24 @@ opus_int32 opus_encode_native(OpusEncoder *st, const opus_val16 *pcm, int frame_ stereo_width = compute_stereo_width(pcm, frame_size, st->Fs, &st->width_mem); else stereo_width = 0; - total_buffer = delay_compensation; st->bitrate_bps = user_bitrate_to_bitrate(st, frame_size, max_data_bytes); frame_rate = st->Fs/frame_size; if (!st->use_vbr) { - int cbrBytes; /* Multiply by 12 to make sure the division is exact. */ int frame_rate12 = 12*st->Fs/frame_size; /* We need to make sure that "int" values always fit in 16 bits. */ - cbrBytes = IMIN( (12*st->bitrate_bps/8 + frame_rate12/2)/frame_rate12, max_data_bytes); - st->bitrate_bps = cbrBytes*(opus_int32)frame_rate12*8/12; + cbr_bytes = IMIN( (12*st->bitrate_bps/8 + frame_rate12/2)/frame_rate12, max_data_bytes); + st->bitrate_bps = cbr_bytes*(opus_int32)frame_rate12*8/12; /* Make sure we provide at least one byte to avoid failing. */ - max_data_bytes = IMAX(1, cbrBytes); + max_data_bytes = IMAX(1, cbr_bytes); } +#ifdef ENABLE_DRED + /* Allocate some of the bits to DRED if needed. */ + dred_bitrate_bps = compute_dred_bitrate(st, st->bitrate_bps, frame_size); + st->bitrate_bps -= dred_bitrate_bps; +#endif if (max_data_bytes<3 || st->bitrate_bps < 3*frame_rate*8 || (frame_rate<50 && (max_data_bytes*frame_rate<300 || st->bitrate_bps < 2400))) { @@ -1313,6 +1293,8 @@ opus_int32 opus_encode_native(OpusEncoder *st, const opus_val16 *pcm, int frame_ st->stream_channels = st->force_channels; } else { #ifdef FUZZING + (void)stereo_music_threshold; + (void)stereo_voice_threshold; /* Random mono/stereo decision */ if (st->channels == 2 && (rand()&0x1F)==0) st->stream_channels = 3-st->stream_channels; @@ -1351,6 +1333,8 @@ opus_int32 opus_encode_native(OpusEncoder *st, const opus_val16 *pcm, int frame_ } else if (st->user_forced_mode == OPUS_AUTO) { #ifdef FUZZING + (void)stereo_width; + (void)mode_thresholds; /* Random mode switching */ if ((rand()&0xF)==0) { @@ -1388,8 +1372,9 @@ opus_int32 opus_encode_native(OpusEncoder *st, const opus_val16 *pcm, int frame_ st->mode = (equiv_rate >= threshold) ? MODE_CELT_ONLY: MODE_SILK_ONLY; - /* When FEC is enabled and there's enough packet loss, use SILK */ - if (st->silk_mode.useInBandFEC && st->silk_mode.packetLossPercentage > (128-voice_est)>>4) + /* When FEC is enabled and there's enough packet loss, use SILK. + Unless the FEC is set to 2, in which case we don't switch to SILK if we're confident we have music. */ + if (st->silk_mode.useInBandFEC && st->silk_mode.packetLossPercentage > (128-voice_est)>>4 && (st->fec_config != 2 || voice_est > 25)) st->mode = MODE_SILK_ONLY; /* When encoding voice and DTX is enabled but the generalized DTX cannot be used, use SILK in order to make use of its DTX. */ @@ -1568,6 +1553,15 @@ opus_int32 opus_encode_native(OpusEncoder *st, const opus_val16 *pcm, int frame_ { int enc_frame_size; int nb_frames; + VARDECL(unsigned char, tmp_data); + VARDECL(OpusRepacketizer, rp); + int max_header_bytes; + opus_int32 repacketize_len; + opus_int32 max_len_sum; + opus_int32 tot_size=0; + unsigned char *curr_data; + int tmp_len; + int dtx_count = 0; if (st->mode == MODE_SILK_ONLY) { @@ -1586,17 +1580,186 @@ opus_int32 opus_encode_native(OpusEncoder *st, const opus_val16 *pcm, int frame_ #ifndef DISABLE_FLOAT_API if (analysis_read_pos_bak!= -1) { + /* Reset analysis position to the beginning of the first frame so we + can use it one frame at a time. */ st->analysis.read_pos = analysis_read_pos_bak; st->analysis.read_subframe = analysis_read_subframe_bak; } #endif - ret = encode_multiframe_packet(st, pcm, nb_frames, enc_frame_size, data, - out_data_bytes, to_celt, lsb_depth, float_api); + /* Worst cases: + * 2 frames: Code 2 with different compressed sizes + * >2 frames: Code 3 VBR */ + max_header_bytes = nb_frames == 2 ? 3 : (2+(nb_frames-1)*2); + + if (st->use_vbr || st->user_bitrate_bps==OPUS_BITRATE_MAX) + repacketize_len = out_data_bytes; + else { + celt_assert(cbr_bytes>=0); + repacketize_len = IMIN(cbr_bytes, out_data_bytes); + } + max_len_sum = nb_frames + repacketize_len - max_header_bytes; + + ALLOC(tmp_data, max_len_sum, unsigned char); + curr_data = tmp_data; + ALLOC(rp, 1, OpusRepacketizer); + opus_repacketizer_init(rp); + + int bak_to_mono = st->silk_mode.toMono; + if (bak_to_mono) + st->force_channels = 1; + else + st->prev_channels = st->stream_channels; + + for (i=0;isilk_mode.toMono = 0; + st->nonfinal_frame = i<(nb_frames-1); + + /* When switching from SILK/Hybrid to CELT, only ask for a switch at the last frame */ + frame_to_celt = to_celt && i==nb_frames-1; + frame_redundancy = redundancy && (frame_to_celt || (!to_celt && i==0)); + + curr_max = IMIN(3*st->bitrate_bps/(3*8*st->Fs/enc_frame_size), max_len_sum/nb_frames); +#ifdef ENABLE_DRED + curr_max = IMIN(curr_max, (max_len_sum-3*dred_bitrate_bps/(3*8*st->Fs/frame_size))/nb_frames); + if (first_frame) curr_max += 3*dred_bitrate_bps/(3*8*st->Fs/frame_size); +#endif + curr_max = IMIN(max_len_sum-tot_size, curr_max); +#ifndef DISABLE_FLOAT_API + if (analysis_read_pos_bak != -1) { + is_silence = is_digital_silence(pcm, frame_size, st->channels, lsb_depth); + /* Get analysis for current frame. */ + tonality_get_info(&st->analysis, &analysis_info, enc_frame_size); + } +#endif + + tmp_len = opus_encode_frame_native(st, pcm+i*(st->channels*enc_frame_size), enc_frame_size, curr_data, curr_max, float_api, first_frame, +#ifdef ENABLE_DRED + dred_bitrate_bps, +#endif +#ifndef DISABLE_FLOAT_API + &analysis_info, + is_silence, +#endif + frame_redundancy, celt_to_silk, prefill, + equiv_rate, frame_to_celt + ); + if (tmp_len<0) + { + RESTORE_STACK; + return OPUS_INTERNAL_ERROR; + } else if (tmp_len==1) { + dtx_count++; + } + ret = opus_repacketizer_cat(rp, curr_data, tmp_len); + + if (ret<0) + { + RESTORE_STACK; + return OPUS_INTERNAL_ERROR; + } + tot_size += tmp_len; + curr_data += tmp_len; + } + ret = opus_repacketizer_out_range_impl(rp, 0, nb_frames, data, repacketize_len, 0, !st->use_vbr && (dtx_count != nb_frames), NULL, 0); + if (ret<0) + { + ret = OPUS_INTERNAL_ERROR; + } + st->silk_mode.toMono = bak_to_mono; RESTORE_STACK; return ret; + } else { + ret = opus_encode_frame_native(st, pcm, frame_size, data, max_data_bytes, float_api, 1, +#ifdef ENABLE_DRED + dred_bitrate_bps, +#endif +#ifndef DISABLE_FLOAT_API + &analysis_info, + is_silence, +#endif + redundancy, celt_to_silk, prefill, + equiv_rate, to_celt + ); + RESTORE_STACK; + return ret; } +} + +static opus_int32 opus_encode_frame_native(OpusEncoder *st, const opus_val16 *pcm, int frame_size, + unsigned char *data, opus_int32 max_data_bytes, + int float_api, int first_frame, +#ifdef ENABLE_DRED + opus_int32 dred_bitrate_bps, +#endif +#ifndef DISABLE_FLOAT_API + AnalysisInfo *analysis_info, int is_silence, +#endif + int redundancy, int celt_to_silk, int prefill, + opus_int32 equiv_rate, int to_celt) +{ + void *silk_enc; + CELTEncoder *celt_enc; + const CELTMode *celt_mode; + int i; + int ret=0; + opus_int32 nBytes; + ec_enc enc; + int bytes_target; + int start_band = 0; + int redundancy_bytes = 0; /* Number of bytes to use for redundancy frame */ + int nb_compr_bytes; + opus_uint32 redundant_rng = 0; + int cutoff_Hz; + int hp_freq_smth1; + opus_val16 HB_gain; + int apply_padding; + int frame_rate; + int curr_bandwidth; + int delay_compensation; + int total_buffer; + opus_int activity = VAD_NO_DECISION; + VARDECL(opus_val16, pcm_buf); + VARDECL(opus_val16, tmp_prefill); + SAVE_STACK; + + st->rangeFinal = 0; + silk_enc = (char*)st+st->silk_enc_offset; + celt_enc = (CELTEncoder*)((char*)st+st->celt_enc_offset); + celt_encoder_ctl(celt_enc, CELT_GET_MODE(&celt_mode)); + curr_bandwidth = st->bandwidth; + if (st->application == OPUS_APPLICATION_RESTRICTED_LOWDELAY) + delay_compensation = 0; + else + delay_compensation = st->delay_compensation; + total_buffer = delay_compensation; + + frame_rate = st->Fs/frame_size; + +#ifndef DISABLE_FLOAT_API + if (is_silence) + { + activity = !is_silence; + } else if (analysis_info->valid) + { + activity = analysis_info->activity_probability >= DTX_ACTIVITY_THRESHOLD; + if (!activity) + { + /* Mark as active if this noise frame is sufficiently loud */ + opus_val32 noise_energy = compute_frame_energy(pcm, frame_size, st->channels, st->arch); + activity = st->peak_signal_energy < (PSEUDO_SNR_THRESHOLD * noise_energy); + } + } +#endif /* For the first frame at a new SILK bandwidth */ if (st->silk_bw_switch) @@ -1604,7 +1767,7 @@ opus_int32 opus_encode_native(OpusEncoder *st, const opus_val16 *pcm, int frame_ redundancy = 1; celt_to_silk = 1; st->silk_bw_switch = 0; - /* Do a prefill without reseting the sampling rate control. */ + /* Do a prefill without resetting the sampling rate control. */ prefill=2; } @@ -1644,6 +1807,25 @@ opus_int32 opus_encode_native(OpusEncoder *st, const opus_val16 *pcm, int frame_ if (st->application == OPUS_APPLICATION_VOIP) { hp_cutoff(pcm, cutoff_Hz, &pcm_buf[total_buffer*st->channels], st->hp_mem, frame_size, st->channels, st->Fs, st->arch); + +#ifdef ENABLE_OSCE_TRAINING_DATA + /* write out high pass filtered clean signal*/ + static FILE *fout =NULL; + if (fout == NULL) + { + fout = fopen("clean_hp.s16", "wb"); + } + + { + int idx; + opus_int16 tmp; + for (idx = 0; idx < frame_size; idx++) + { + tmp = (opus_int16) (32768 * pcm_buf[total_buffer + idx] + 0.5f); + fwrite(&tmp, sizeof(tmp), 1, fout); + } + } +#endif } else { dc_reject(pcm, 3, &pcm_buf[total_buffer*st->channels], st->hp_mem, frame_size, st->channels, st->Fs); } @@ -1660,15 +1842,30 @@ opus_int32 opus_encode_native(OpusEncoder *st, const opus_val16 *pcm, int frame_ st->hp_mem[0] = st->hp_mem[1] = st->hp_mem[2] = st->hp_mem[3] = 0; } } +#else + (void)float_api; #endif +#ifdef ENABLE_DRED + if ( st->dred_duration > 0 && st->dred_encoder.loaded ) { + int frame_size_400Hz; + /* DRED Encoder */ + dred_compute_latents( &st->dred_encoder, &pcm_buf[total_buffer*st->channels], frame_size, total_buffer, st->arch ); + frame_size_400Hz = frame_size*400/st->Fs; + OPUS_MOVE(&st->activity_mem[frame_size_400Hz], st->activity_mem, 4*DRED_MAX_FRAMES-frame_size_400Hz); + for (i=0;iactivity_mem[i] = activity; + } else { + st->dred_encoder.latents_buffer_fill = 0; + OPUS_CLEAR(st->activity_mem, DRED_MAX_FRAMES); + } +#endif /* SILK processing */ HB_gain = Q15ONE; if (st->mode != MODE_CELT_ONLY) { opus_int32 total_bitRate, celt_rate; - opus_int activity; #ifdef FIXED_POINT const opus_int16 *pcm_silk; #else @@ -1676,14 +1873,6 @@ opus_int32 opus_encode_native(OpusEncoder *st, const opus_val16 *pcm, int frame_ ALLOC(pcm_silk, st->channels*frame_size, opus_int16); #endif - activity = VAD_NO_DECISION; -#ifndef DISABLE_FLOAT_API - if( analysis_info.valid ) { - /* Inform SILK about the Opus VAD decision */ - activity = ( analysis_info.activity_probability >= DTX_ACTIVITY_THRESHOLD ); - } -#endif - /* Distribute bits between SILK and CELT */ total_bitRate = 8 * bytes_target * frame_rate; if( st->mode == MODE_HYBRID ) { @@ -1765,7 +1954,7 @@ opus_int32 opus_encode_native(OpusEncoder *st, const opus_val16 *pcm, int frame_ st->silk_mode.maxInternalSampleRate = 16000; if (st->mode == MODE_SILK_ONLY) { - opus_int32 effective_max_rate = max_rate; + opus_int32 effective_max_rate = frame_rate*max_data_bytes*8; if (frame_rate > 50) effective_max_rate = effective_max_rate*2/3; if (effective_max_rate < 8000) @@ -1795,9 +1984,19 @@ opus_int32 opus_encode_native(OpusEncoder *st, const opus_val16 *pcm, int frame_ } if (st->silk_mode.useCBR) { + /* When we're in CBR mode, but we have non-SILK data to encode, switch SILK to VBR with cap to + save on complexity. Any variations will be absorbed by CELT and/or DRED and we can still + produce a constant bitrate without wasting bits. */ +#ifdef ENABLE_DRED + if (st->mode == MODE_HYBRID || dred_bitrate_bps > 0) +#else if (st->mode == MODE_HYBRID) +#endif { - st->silk_mode.maxBits = IMIN(st->silk_mode.maxBits, st->silk_mode.bitRate * frame_size / st->Fs); + /* Allow SILK to steal up to 25% of the remaining bits */ + opus_int16 other_bits = IMAX(0, st->silk_mode.maxBits - st->silk_mode.bitRate * frame_size / st->Fs); + st->silk_mode.maxBits = IMAX(0, st->silk_mode.maxBits - other_bits*3/4); + st->silk_mode.useCBR = 0; } } else { /* Constrained VBR. */ @@ -1910,26 +2109,10 @@ opus_int32 opus_encode_native(OpusEncoder *st, const opus_val16 *pcm, int frame_ if (st->mode != MODE_SILK_ONLY) { opus_val32 celt_pred=2; - celt_encoder_ctl(celt_enc, OPUS_SET_VBR(0)); /* We may still decide to disable prediction later */ if (st->silk_mode.reducedDependency) celt_pred = 0; celt_encoder_ctl(celt_enc, CELT_SET_PREDICTION(celt_pred)); - - if (st->mode == MODE_HYBRID) - { - if( st->use_vbr ) { - celt_encoder_ctl(celt_enc, OPUS_SET_BITRATE(st->bitrate_bps-st->silk_mode.bitRate)); - celt_encoder_ctl(celt_enc, OPUS_SET_VBR_CONSTRAINT(0)); - } - } else { - if (st->use_vbr) - { - celt_encoder_ctl(celt_enc, OPUS_SET_VBR(1)); - celt_encoder_ctl(celt_enc, OPUS_SET_VBR_CONSTRAINT(st->vbr_constraint)); - celt_encoder_ctl(celt_enc, OPUS_SET_BITRATE(st->bitrate_bps)); - } - } } ALLOC(tmp_prefill, st->channels*st->Fs/400, opus_val16); @@ -2023,13 +2206,27 @@ opus_int32 opus_encode_native(OpusEncoder *st, const opus_val16 *pcm, int frame_ ec_enc_done(&enc); nb_compr_bytes = ret; } else { - nb_compr_bytes = (max_data_bytes-1)-redundancy_bytes; - ec_enc_shrink(&enc, nb_compr_bytes); + nb_compr_bytes = (max_data_bytes-1)-redundancy_bytes; +#ifdef ENABLE_DRED + if (st->dred_duration > 0) + { + int max_celt_bytes; + opus_int32 dred_bytes = dred_bitrate_bps/(frame_rate*8); + /* Allow CELT to steal up to 25% of the remaining bits. */ + max_celt_bytes = nb_compr_bytes - dred_bytes*3/4; + /* But try to give CELT at least 5 bytes to prevent a mismatch with + the redundancy signaling. */ + max_celt_bytes = IMAX((ec_tell(&enc)+7)/8 + 5, max_celt_bytes); + /* Subject to the original max. */ + nb_compr_bytes = IMIN(nb_compr_bytes, max_celt_bytes); + } +#endif + ec_enc_shrink(&enc, nb_compr_bytes); } #ifndef DISABLE_FLOAT_API if (redundancy || st->mode != MODE_SILK_ONLY) - celt_encoder_ctl(celt_enc, CELT_SET_ANALYSIS(&analysis_info)); + celt_encoder_ctl(celt_enc, CELT_SET_ANALYSIS(analysis_info)); #endif if (st->mode == MODE_HYBRID) { SILKInfo info; @@ -2059,6 +2256,34 @@ opus_int32 opus_encode_native(OpusEncoder *st, const opus_val16 *pcm, int frame_ if (st->mode != MODE_SILK_ONLY) { + celt_encoder_ctl(celt_enc, OPUS_SET_VBR(st->use_vbr)); + if (st->mode == MODE_HYBRID) + { + if( st->use_vbr ) { + celt_encoder_ctl(celt_enc, OPUS_SET_BITRATE(st->bitrate_bps-st->silk_mode.bitRate)); + celt_encoder_ctl(celt_enc, OPUS_SET_VBR_CONSTRAINT(0)); + } + } else { + if (st->use_vbr) + { + celt_encoder_ctl(celt_enc, OPUS_SET_VBR(1)); + celt_encoder_ctl(celt_enc, OPUS_SET_VBR_CONSTRAINT(st->vbr_constraint)); + celt_encoder_ctl(celt_enc, OPUS_SET_BITRATE(st->bitrate_bps)); + } + } +#ifdef ENABLE_DRED + /* When Using DRED CBR, we can actually make the CELT part VBR and have DRED pick up the slack. */ + if (!st->use_vbr && st->dred_duration > 0) + { + opus_int32 celt_bitrate = st->bitrate_bps; + celt_encoder_ctl(celt_enc, OPUS_SET_VBR(1)); + celt_encoder_ctl(celt_enc, OPUS_SET_VBR_CONSTRAINT(0)); + if (st->mode == MODE_HYBRID) { + celt_bitrate -= st->silk_mode.bitRate; + } + celt_encoder_ctl(celt_enc, OPUS_SET_BITRATE(celt_bitrate)); + } +#endif if (st->mode != st->prev_mode && st->prev_mode > 0) { unsigned char dummy[2]; @@ -2071,10 +2296,6 @@ opus_int32 opus_encode_native(OpusEncoder *st, const opus_val16 *pcm, int frame_ /* If false, we already busted the budget and we'll end up with a "PLC frame" */ if (ec_tell(&enc) <= 8*nb_compr_bytes) { - /* Set the bitrate again if it was overridden in the redundancy code above*/ - if (redundancy && celt_to_silk && st->mode==MODE_HYBRID && st->use_vbr) - celt_encoder_ctl(celt_enc, OPUS_SET_BITRATE(st->bitrate_bps-st->silk_mode.bitRate)); - celt_encoder_ctl(celt_enc, OPUS_SET_VBR(st->use_vbr)); ret = celt_encode_with_ec(celt_enc, pcm_buf, frame_size, NULL, nb_compr_bytes, &enc); if (ret < 0) { @@ -2082,10 +2303,10 @@ opus_int32 opus_encode_native(OpusEncoder *st, const opus_val16 *pcm, int frame_ return OPUS_INTERNAL_ERROR; } /* Put CELT->SILK redundancy data in the right place. */ - if (redundancy && celt_to_silk && st->mode==MODE_HYBRID && st->use_vbr) + if (redundancy && celt_to_silk && st->mode==MODE_HYBRID && nb_compr_bytes != ret) { OPUS_MOVE(data+ret, data+nb_compr_bytes, redundancy_bytes); - nb_compr_bytes = nb_compr_bytes+redundancy_bytes; + nb_compr_bytes = ret+redundancy_bytes; } } } @@ -2142,10 +2363,9 @@ opus_int32 opus_encode_native(OpusEncoder *st, const opus_val16 *pcm, int frame_ /* DTX decision */ #ifndef DISABLE_FLOAT_API - if (st->use_dtx && (analysis_info.valid || is_silence)) + if (st->use_dtx && (analysis_info->valid || is_silence)) { - if (decide_dtx_mode(analysis_info.activity_probability, &st->nb_no_activity_frames, - st->peak_signal_energy, pcm, frame_size, st->channels, is_silence, st->arch)) + if (decide_dtx_mode(activity, &st->nb_no_activity_ms_Q1, 2*1000*frame_size/st->Fs)) { st->rangeFinal = 0; data[0] = gen_toc(st->mode, st->Fs/frame_size, curr_bandwidth, st->stream_channels); @@ -2153,7 +2373,7 @@ opus_int32 opus_encode_native(OpusEncoder *st, const opus_val16 *pcm, int frame_ return 1; } } else { - st->nb_no_activity_frames = 0; + st->nb_no_activity_ms_Q1 = 0; } #endif @@ -2181,7 +2401,51 @@ opus_int32 opus_encode_native(OpusEncoder *st, const opus_val16 *pcm, int frame_ } /* Count ToC and redundancy */ ret += 1+redundancy_bytes; - if (!st->use_vbr) + apply_padding = !st->use_vbr; +#ifdef ENABLE_DRED + if (st->dred_duration > 0 && st->dred_encoder.loaded && first_frame) { + opus_extension_data extension; + unsigned char buf[DRED_MAX_DATA_SIZE]; + int dred_chunks; + int dred_bytes_left; + dred_chunks = IMIN((st->dred_duration+5)/4, DRED_NUM_REDUNDANCY_FRAMES/2); + if (st->use_vbr) dred_chunks = IMIN(dred_chunks, st->dred_target_chunks); + /* Remaining space for DRED, accounting for cost the 3 extra bytes for code 3, padding length, and extension number. */ + dred_bytes_left = IMIN(DRED_MAX_DATA_SIZE, max_data_bytes-ret-3); + /* Account for the extra bytes required to signal large padding length. */ + dred_bytes_left -= (dred_bytes_left+1+DRED_EXPERIMENTAL_BYTES)/255; + /* Check whether we actually have something to encode. */ + if (dred_chunks >= 1 && dred_bytes_left >= DRED_MIN_BYTES+DRED_EXPERIMENTAL_BYTES) { + int dred_bytes; +#ifdef DRED_EXPERIMENTAL_VERSION + /* Add temporary extension type and version. + These bytes will be removed once extension is finalized. */ + buf[0] = 'D'; + buf[1] = DRED_EXPERIMENTAL_VERSION; +#endif + dred_bytes = dred_encode_silk_frame(&st->dred_encoder, buf+DRED_EXPERIMENTAL_BYTES, dred_chunks, dred_bytes_left-DRED_EXPERIMENTAL_BYTES, + st->dred_q0, st->dred_dQ, st->dred_qmax, st->activity_mem, st->arch); + if (dred_bytes > 0) { + dred_bytes += DRED_EXPERIMENTAL_BYTES; + celt_assert(dred_bytes <= dred_bytes_left); + extension.id = DRED_EXTENSION_ID; + extension.frame = 0; + extension.data = buf; + extension.len = dred_bytes; + ret = opus_packet_pad_impl(data, ret, max_data_bytes, !st->use_vbr, &extension, 1); + if (ret < 0) + { + RESTORE_STACK; + return OPUS_INTERNAL_ERROR; + } + apply_padding = 0; + } + } + } +#else + (void)first_frame; /* Avoids a warning about first_frame being unused. */ +#endif + if (apply_padding) { if (opus_packet_pad(data, ret, max_data_bytes) != OPUS_OK) { @@ -2448,11 +2712,12 @@ int opus_encoder_ctl(OpusEncoder *st, int request, ...) case OPUS_SET_INBAND_FEC_REQUEST: { opus_int32 value = va_arg(ap, opus_int32); - if(value<0 || value>1) + if(value<0 || value>2) { goto bad_arg; } - st->silk_mode.useInBandFEC = value; + st->fec_config = value; + st->silk_mode.useInBandFEC = (value != 0); } break; case OPUS_GET_INBAND_FEC_REQUEST: @@ -2462,7 +2727,7 @@ int opus_encoder_ctl(OpusEncoder *st, int request, ...) { goto bad_arg; } - *value = st->silk_mode.useInBandFEC; + *value = st->fec_config; } break; case OPUS_SET_PACKET_LOSS_PERC_REQUEST: @@ -2679,6 +2944,29 @@ int opus_encoder_ctl(OpusEncoder *st, int request, ...) celt_encoder_ctl(celt_enc, OPUS_GET_PHASE_INVERSION_DISABLED(value)); } break; +#ifdef ENABLE_DRED + case OPUS_SET_DRED_DURATION_REQUEST: + { + opus_int32 value = va_arg(ap, opus_int32); + if(value<0 || value>DRED_MAX_FRAMES) + { + goto bad_arg; + } + st->dred_duration = value; + st->silk_mode.useDRED = !!value; + } + break; + case OPUS_GET_DRED_DURATION_REQUEST: + { + opus_int32 *value = va_arg(ap, opus_int32*); + if (!value) + { + goto bad_arg; + } + *value = st->dred_duration; + } + break; +#endif case OPUS_RESET_STATE: { void *silk_enc; @@ -2694,6 +2982,10 @@ int opus_encoder_ctl(OpusEncoder *st, int request, ...) celt_encoder_ctl(celt_enc, OPUS_RESET_STATE); silk_InitEncoder( silk_enc, st->arch, &dummy ); +#ifdef ENABLE_DRED + /* Initialize DRED Encoder */ + dred_encoder_reset( &st->dred_encoder ); +#endif st->stream_channels = st->channels; st->hybrid_stereo_width_Q14 = 1 << 14; st->prev_HB_gain = Q15ONE; @@ -2736,17 +3028,17 @@ int opus_encoder_ctl(OpusEncoder *st, int request, ...) } if (st->silk_mode.useDTX && (st->prev_mode == MODE_SILK_ONLY || st->prev_mode == MODE_HYBRID)) { /* DTX determined by Silk. */ - int n; - void *silk_enc = (char*)st+st->silk_enc_offset; - *value = 1; - for (n=0;nsilk_mode.nChannelsInternal;n++) { - *value = *value && ((silk_encoder*)silk_enc)->state_Fxx[n].sCmn.noSpeechCounter >= NB_SPEECH_FRAMES_BEFORE_DTX; + silk_encoder *silk_enc = (silk_encoder*)(void *)((char*)st+st->silk_enc_offset); + *value = silk_enc->state_Fxx[0].sCmn.noSpeechCounter >= NB_SPEECH_FRAMES_BEFORE_DTX; + /* Stereo: check second channel unless only the middle channel was encoded. */ + if(*value == 1 && st->silk_mode.nChannelsInternal == 2 && silk_enc->prev_decode_only_middle == 0) { + *value = silk_enc->state_Fxx[1].sCmn.noSpeechCounter >= NB_SPEECH_FRAMES_BEFORE_DTX; } } #ifndef DISABLE_FLOAT_API else if (st->use_dtx) { /* DTX determined by Opus. */ - *value = st->nb_no_activity_frames >= NB_SPEECH_FRAMES_BEFORE_DTX; + *value = st->nb_no_activity_ms_Q1 >= NB_SPEECH_FRAMES_BEFORE_DTX*20*2; } #endif else { @@ -2754,7 +3046,21 @@ int opus_encoder_ctl(OpusEncoder *st, int request, ...) } } break; - +#ifdef USE_WEIGHTS_FILE + case OPUS_SET_DNN_BLOB_REQUEST: + { + const unsigned char *data = va_arg(ap, const unsigned char *); + opus_int32 len = va_arg(ap, opus_int32); + if(len<0 || data == NULL) + { + goto bad_arg; + } +#ifdef ENABLE_DRED + ret = dred_encoder_load_model(&st->dred_encoder, data, len); +#endif + } + break; +#endif case CELT_GET_MODE_REQUEST: { const CELTMode ** value = va_arg(ap, const CELTMode**); diff --git a/opus/src/opus_multistream_decoder.c b/opus/src/opus_multistream_decoder.c index 0018517a..4ae877a7 100644 --- a/opus/src/opus_multistream_decoder.c +++ b/opus/src/opus_multistream_decoder.c @@ -162,7 +162,7 @@ static int opus_multistream_packet_validate(const unsigned char *data, if (len<=0) return OPUS_INVALID_PACKET; count = opus_packet_parse_impl(data, len, s!=nb_streams-1, &toc, NULL, - size, NULL, &packet_offset); + size, NULL, &packet_offset, NULL, NULL); if (count<0) return count; tmp_samples = opus_packet_get_nb_samples(data, packet_offset, Fs); @@ -250,9 +250,12 @@ int opus_multistream_decode_native( return OPUS_INTERNAL_ERROR; } packet_offset = 0; - ret = opus_decode_native(dec, data, len, buf, frame_size, decode_fec, s!=st->layout.nb_streams-1, &packet_offset, soft_clip); - data += packet_offset; - len -= packet_offset; + ret = opus_decode_native(dec, data, len, buf, frame_size, decode_fec, s!=st->layout.nb_streams-1, &packet_offset, soft_clip, NULL, 0); + if (!do_plc) + { + data += packet_offset; + len -= packet_offset; + } if (ret <= 0) { RESTORE_STACK; diff --git a/opus/src/opus_multistream_encoder.c b/opus/src/opus_multistream_encoder.c index 93204a14..1725ade7 100644 --- a/opus/src/opus_multistream_encoder.c +++ b/opus/src/opus_multistream_encoder.c @@ -443,7 +443,8 @@ static int opus_multistream_encoder_init_impl( char *ptr; if ((channels>255) || (channels<1) || (coupled_streams>streams) || - (streams<1) || (coupled_streams<0) || (streams>255-coupled_streams)) + (streams<1) || (coupled_streams<0) || (streams>255-coupled_streams) || + (streams+coupled_streams>channels)) return OPUS_BAD_ARG; st->arch = opus_select_arch(); @@ -459,8 +460,7 @@ static int opus_multistream_encoder_init_impl( st->layout.mapping[i] = mapping[i]; if (!validate_layout(&st->layout)) return OPUS_BAD_ARG; - if (mapping_type == MAPPING_TYPE_SURROUND && - !validate_encoder_layout(&st->layout)) + if (!validate_encoder_layout(&st->layout)) return OPUS_BAD_ARG; if (mapping_type == MAPPING_TYPE_AMBISONICS && !validate_ambisonics(st->layout.nb_channels, NULL, NULL)) @@ -595,7 +595,8 @@ OpusMSEncoder *opus_multistream_encoder_create( int ret; OpusMSEncoder *st; if ((channels>255) || (channels<1) || (coupled_streams>streams) || - (streams<1) || (coupled_streams<0) || (streams>255-coupled_streams)) + (streams<1) || (coupled_streams<0) || (streams>255-coupled_streams) || + (streams+coupled_streams>channels)) { if (error) *error = OPUS_BAD_ARG; @@ -1002,7 +1003,7 @@ int opus_multistream_encode_native return OPUS_INTERNAL_ERROR; } len = opus_repacketizer_out_range_impl(&rp, 0, opus_repacketizer_get_nb_frames(&rp), - data, max_data_bytes-tot_size, s != st->layout.nb_streams-1, !vbr && s == st->layout.nb_streams-1); + data, max_data_bytes-tot_size, s != st->layout.nb_streams-1, !vbr && s == st->layout.nb_streams-1, NULL, 0); data += len; tot_size += len; } diff --git a/opus/src/opus_private.h b/opus/src/opus_private.h index 5e2463f5..364c21ce 100644 --- a/opus/src/opus_private.h +++ b/opus/src/opus_private.h @@ -42,8 +42,17 @@ struct OpusRepacketizer { const unsigned char *frames[48]; opus_int16 len[48]; int framesize; + const unsigned char *paddings[48]; + opus_int32 padding_len[48]; }; +typedef struct { + int id; + int frame; + const unsigned char *data; + opus_int32 len; +} opus_extension_data; + typedef struct ChannelLayout { int nb_channels; int nb_streams; @@ -148,7 +157,7 @@ opus_int32 opus_encode_native(OpusEncoder *st, const opus_val16 *pcm, int frame_ int opus_decode_native(OpusDecoder *st, const unsigned char *data, opus_int32 len, opus_val16 *pcm, int frame_size, int decode_fec, int self_delimited, - opus_int32 *packet_offset, int soft_clip); + opus_int32 *packet_offset, int soft_clip, const OpusDRED *dred, opus_int32 dred_offset); /* Make sure everything is properly aligned. */ static OPUS_INLINE int align(int i) @@ -162,13 +171,18 @@ static OPUS_INLINE int align(int i) return ((i + alignment - 1) / alignment) * alignment; } +/* More than that is ridiculous for now (3 * max frames per packet)*/ +opus_int32 skip_extension(const unsigned char **data, opus_int32 len, opus_int32 *header_size); + int opus_packet_parse_impl(const unsigned char *data, opus_int32 len, int self_delimited, unsigned char *out_toc, const unsigned char *frames[48], opus_int16 size[48], - int *payload_offset, opus_int32 *packet_offset); + int *payload_offset, opus_int32 *packet_offset, + const unsigned char **padding, opus_int32 *padding_len); opus_int32 opus_repacketizer_out_range_impl(OpusRepacketizer *rp, int begin, int end, - unsigned char *data, opus_int32 maxlen, int self_delimited, int pad); + unsigned char *data, opus_int32 maxlen, int self_delimited, int pad, + const opus_extension_data *extensions, int nb_extensions); int pad_frame(unsigned char *data, opus_int32 len, opus_int32 new_len); @@ -198,4 +212,12 @@ int opus_multistream_decode_native( void *user_data ); +opus_int32 opus_packet_extensions_parse(const unsigned char *data, opus_int32 len, opus_extension_data *extensions, opus_int32 *nb_extensions); + +opus_int32 opus_packet_extensions_generate(unsigned char *data, opus_int32 len, const opus_extension_data *extensions, int nb_extensions, int pad); + +opus_int32 opus_packet_extensions_count(const unsigned char *data, opus_int32 len); + +opus_int32 opus_packet_pad_impl(unsigned char *data, opus_int32 len, opus_int32 new_len, int pad, const opus_extension_data *extensions, int nb_extensions); + #endif /* OPUS_PRIVATE_H */ diff --git a/opus/src/opus_projection_encoder.c b/opus/src/opus_projection_encoder.c index 06fb2d25..92813ad0 100644 --- a/opus/src/opus_projection_encoder.c +++ b/opus/src/opus_projection_encoder.c @@ -177,6 +177,20 @@ opus_int32 opus_projection_ambisonics_encoder_get_size(int channels, demixing_matrix_rows = mapping_matrix_toa_demixing.rows; demixing_matrix_cols = mapping_matrix_toa_demixing.cols; } + else if (order_plus_one == 5) + { + mixing_matrix_rows = mapping_matrix_fourthoa_mixing.rows; + mixing_matrix_cols = mapping_matrix_fourthoa_mixing.cols; + demixing_matrix_rows = mapping_matrix_fourthoa_demixing.rows; + demixing_matrix_cols = mapping_matrix_fourthoa_demixing.cols; + } + else if (order_plus_one == 6) + { + mixing_matrix_rows = mapping_matrix_fifthoa_mixing.rows; + mixing_matrix_cols = mapping_matrix_fifthoa_mixing.cols; + demixing_matrix_rows = mapping_matrix_fifthoa_demixing.rows; + demixing_matrix_cols = mapping_matrix_fifthoa_demixing.cols; + } else return 0; @@ -245,6 +259,20 @@ int opus_projection_ambisonics_encoder_init(OpusProjectionEncoder *st, opus_int3 mapping_matrix_toa_mixing_data, sizeof(mapping_matrix_toa_mixing_data)); } + else if (order_plus_one == 5) + { + mapping_matrix_init(mixing_matrix, mapping_matrix_fourthoa_mixing.rows, + mapping_matrix_fourthoa_mixing.cols, mapping_matrix_fourthoa_mixing.gain, + mapping_matrix_fourthoa_mixing_data, + sizeof(mapping_matrix_fourthoa_mixing_data)); + } + else if (order_plus_one == 6) + { + mapping_matrix_init(mixing_matrix, mapping_matrix_fifthoa_mixing.rows, + mapping_matrix_fifthoa_mixing.cols, mapping_matrix_fifthoa_mixing.gain, + mapping_matrix_fifthoa_mixing_data, + sizeof(mapping_matrix_fifthoa_mixing_data)); + } else return OPUS_BAD_ARG; @@ -275,6 +303,20 @@ int opus_projection_ambisonics_encoder_init(OpusProjectionEncoder *st, opus_int3 mapping_matrix_toa_demixing.cols, mapping_matrix_toa_demixing.gain, mapping_matrix_toa_demixing_data, sizeof(mapping_matrix_toa_demixing_data)); + } + else if (order_plus_one == 5) + { + mapping_matrix_init(demixing_matrix, mapping_matrix_fourthoa_demixing.rows, + mapping_matrix_fourthoa_demixing.cols, mapping_matrix_fourthoa_demixing.gain, + mapping_matrix_fourthoa_demixing_data, + sizeof(mapping_matrix_fourthoa_demixing_data)); + } + else if (order_plus_one == 6) + { + mapping_matrix_init(demixing_matrix, mapping_matrix_fifthoa_demixing.rows, + mapping_matrix_fifthoa_demixing.cols, mapping_matrix_fifthoa_demixing.gain, + mapping_matrix_fifthoa_demixing_data, + sizeof(mapping_matrix_fifthoa_demixing_data)); } else return OPUS_BAD_ARG; diff --git a/opus/src/repacketizer.c b/opus/src/repacketizer.c index bda44a14..6a7a8b3d 100644 --- a/opus/src/repacketizer.c +++ b/opus/src/repacketizer.c @@ -32,6 +32,7 @@ #include "opus.h" #include "opus_private.h" #include "os_support.h" +#include "stack_alloc.h" int opus_repacketizer_get_size(void) @@ -82,10 +83,19 @@ static int opus_repacketizer_cat_impl(OpusRepacketizer *rp, const unsigned char return OPUS_INVALID_PACKET; } - ret=opus_packet_parse_impl(data, len, self_delimited, &tmp_toc, &rp->frames[rp->nb_frames], &rp->len[rp->nb_frames], NULL, NULL); + ret=opus_packet_parse_impl(data, len, self_delimited, &tmp_toc, &rp->frames[rp->nb_frames], &rp->len[rp->nb_frames], + NULL, NULL, &rp->paddings[rp->nb_frames], &rp->padding_len[rp->nb_frames]); if(ret<1)return ret; - rp->nb_frames += curr_nb_frames; + /* set padding length to zero for all but the first frame */ + while (curr_nb_frames > 1) + { + rp->nb_frames++; + rp->padding_len[rp->nb_frames] = 0; + rp->paddings[rp->nb_frames] = NULL; + curr_nb_frames--; + } + rp->nb_frames++; return OPUS_OK; } @@ -100,17 +110,23 @@ int opus_repacketizer_get_nb_frames(OpusRepacketizer *rp) } opus_int32 opus_repacketizer_out_range_impl(OpusRepacketizer *rp, int begin, int end, - unsigned char *data, opus_int32 maxlen, int self_delimited, int pad) + unsigned char *data, opus_int32 maxlen, int self_delimited, int pad, const opus_extension_data *extensions, int nb_extensions) { int i, count; opus_int32 tot_size; opus_int16 *len; const unsigned char **frames; unsigned char * ptr; + int ones_begin=0, ones_end=0; + int ext_begin=0, ext_len=0; + int ext_count, total_ext_count; + VARDECL(opus_extension_data, all_extensions); + SAVE_STACK; if (begin<0 || begin>=end || end>rp->nb_frames) { /*fprintf(stderr, "%d %d %d\n", begin, end, rp->nb_frames);*/ + RESTORE_STACK; return OPUS_BAD_ARG; } count = end-begin; @@ -122,13 +138,50 @@ opus_int32 opus_repacketizer_out_range_impl(OpusRepacketizer *rp, int begin, int else tot_size = 0; + /* figure out total number of extensions */ + total_ext_count = nb_extensions; + for (i=begin;ipaddings[i], rp->padding_len[i]); + if (n > 0) total_ext_count += n; + } + ALLOC(all_extensions, total_ext_count ? total_ext_count : ALLOC_NONE, opus_extension_data); + /* copy over any extensions that were passed in */ + for (ext_count=0;ext_countpaddings[i], rp->padding_len[i], + &all_extensions[ext_count], &frame_ext_count); + if (ret<0) + { + RESTORE_STACK; + return OPUS_INTERNAL_ERROR; + } + /* renumber the extension frame numbers */ + for (j=0;j maxlen) + { + RESTORE_STACK; return OPUS_BUFFER_TOO_SMALL; + } *ptr++ = rp->toc&0xFC; } else if (count==2) { @@ -137,18 +190,24 @@ opus_int32 opus_repacketizer_out_range_impl(OpusRepacketizer *rp, int begin, int /* Code 1 */ tot_size += 2*len[0]+1; if (tot_size > maxlen) + { + RESTORE_STACK; return OPUS_BUFFER_TOO_SMALL; + } *ptr++ = (rp->toc&0xFC) | 0x1; } else { /* Code 2 */ tot_size += len[0]+len[1]+2+(len[0]>=252); if (tot_size > maxlen) + { + RESTORE_STACK; return OPUS_BUFFER_TOO_SMALL; + } *ptr++ = (rp->toc&0xFC) | 0x2; ptr += encode_size(len[0], ptr); } } - if (count > 2 || (pad && tot_size < maxlen)) + if (count > 2 || (pad && tot_size < maxlen) || ext_count > 0) { /* Code 3 */ int vbr; @@ -177,22 +236,45 @@ opus_int32 opus_repacketizer_out_range_impl(OpusRepacketizer *rp, int begin, int tot_size += len[count-1]; if (tot_size > maxlen) + { + RESTORE_STACK; return OPUS_BUFFER_TOO_SMALL; + } *ptr++ = (rp->toc&0xFC) | 0x3; *ptr++ = count | 0x80; } else { tot_size += count*len[0]+2; if (tot_size > maxlen) + { + RESTORE_STACK; return OPUS_BUFFER_TOO_SMALL; + } *ptr++ = (rp->toc&0xFC) | 0x3; *ptr++ = count; } pad_amount = pad ? (maxlen-tot_size) : 0; + if (ext_count>0) + { + /* figure out how much space we need for the extensions */ + ext_len = opus_packet_extensions_generate(NULL, maxlen-tot_size, all_extensions, ext_count, 0); + if (ext_len < 0) return ext_len; + if (!pad) + pad_amount = ext_len + ext_len/254 + 1; + } if (pad_amount != 0) { int nb_255s; data[1] |= 0x40; nb_255s = (pad_amount-1)/255; + if (tot_size + ext_len + nb_255s + 1 > maxlen) + { + RESTORE_STACK; + return OPUS_BUFFER_TOO_SMALL; + } + ext_begin = tot_size+pad_amount-ext_len; + /* Prepend 0x01 padding */ + ones_begin = tot_size+nb_255s+1; + ones_end = tot_size+pad_amount-ext_len; for (i=0;i 0) { + int ret = opus_packet_extensions_generate(&data[ext_begin], ext_len, all_extensions, ext_count, 0); + celt_assert(ret == ext_len); + } + for (i=ones_begin;inb_frames, data, maxlen, 0, 0); + return opus_repacketizer_out_range_impl(rp, 0, rp->nb_frames, data, maxlen, 0, 0, NULL, 0); } -int opus_packet_pad(unsigned char *data, opus_int32 len, opus_int32 new_len) +opus_int32 opus_packet_pad_impl(unsigned char *data, opus_int32 len, opus_int32 new_len, int pad, const opus_extension_data *extensions, int nb_extensions) { OpusRepacketizer rp; opus_int32 ret; + VARDECL(unsigned char, copy); + SAVE_STACK; if (len < 1) return OPUS_BAD_ARG; if (len==new_len) return OPUS_OK; else if (len > new_len) return OPUS_BAD_ARG; + ALLOC(copy, len, unsigned char); opus_repacketizer_init(&rp); /* Moving payload to the end of the packet so we can do in-place padding */ - OPUS_MOVE(data+new_len-len, data, len); - ret = opus_repacketizer_cat(&rp, data+new_len-len, len); + OPUS_COPY(copy, data, len); + ret = opus_repacketizer_cat(&rp, copy, len); if (ret != OPUS_OK) return ret; - ret = opus_repacketizer_out_range_impl(&rp, 0, rp.nb_frames, data, new_len, 0, 1); + ret = opus_repacketizer_out_range_impl(&rp, 0, rp.nb_frames, data, new_len, 0, pad, extensions, nb_extensions); + RESTORE_STACK; + return ret; +} + +int opus_packet_pad(unsigned char *data, opus_int32 len, opus_int32 new_len) +{ + opus_int32 ret; + ALLOC_STACK; + ret = opus_packet_pad_impl(data, len, new_len, 1, NULL, 0); + RESTORE_STACK; if (ret > 0) return OPUS_OK; else @@ -264,13 +366,19 @@ opus_int32 opus_packet_unpad(unsigned char *data, opus_int32 len) { OpusRepacketizer rp; opus_int32 ret; + int i; if (len < 1) return OPUS_BAD_ARG; opus_repacketizer_init(&rp); ret = opus_repacketizer_cat(&rp, data, len); if (ret < 0) return ret; - ret = opus_repacketizer_out_range_impl(&rp, 0, rp.nb_frames, data, len, 0, 0); + /* Discard all padding and extensions. */ + for (i=0;i 0 && ret <= len); return ret; } @@ -297,7 +405,7 @@ int opus_multistream_packet_pad(unsigned char *data, opus_int32 len, opus_int32 if (len<=0) return OPUS_INVALID_PACKET; count = opus_packet_parse_impl(data, len, 1, &toc, NULL, - size, NULL, &packet_offset); + size, NULL, &packet_offset, NULL, NULL); if (count<0) return count; data += packet_offset; @@ -324,18 +432,24 @@ opus_int32 opus_multistream_packet_unpad(unsigned char *data, opus_int32 len, in for (s=0;s1500 || len[i]<0) @@ -135,13 +147,31 @@ int main(int argc, char *argv[]) } break; } - err = fread(ch, 1, 4, fin); - rng[i] = char_to_int(ch); - err = fread(packets[i], 1, len[i], fin); - if (feof(fin)) + if (fread(ch, 1, 4, fin)!=4) { - eof = 1; - break; + if (feof(fin)) + { + eof = 1; + } else { + fprintf(stderr, "Error reading.\n"); + fclose(fin); + fclose(fout); + return EXIT_FAILURE; + } + break; + } + rng[i] = char_to_int(ch); + if (fread(packets[i], len[i], 1, fin)!=1) { + if (feof(fin)) + { + eof = 1; + } else { + fprintf(stderr, "Error reading packet of %u bytes.\n", len[i]); + fclose(fin); + fclose(fout); + return EXIT_FAILURE; + } + break; } err = opus_repacketizer_cat(rp, packets[i], len[i]); if (err!=OPUS_OK) diff --git a/opus/src/tansig_table.h b/opus/src/tansig_table.h deleted file mode 100644 index c76f844a..00000000 --- a/opus/src/tansig_table.h +++ /dev/null @@ -1,45 +0,0 @@ -/* This file is auto-generated by gen_tables */ - -static const float tansig_table[201] = { -0.000000f, 0.039979f, 0.079830f, 0.119427f, 0.158649f, -0.197375f, 0.235496f, 0.272905f, 0.309507f, 0.345214f, -0.379949f, 0.413644f, 0.446244f, 0.477700f, 0.507977f, -0.537050f, 0.564900f, 0.591519f, 0.616909f, 0.641077f, -0.664037f, 0.685809f, 0.706419f, 0.725897f, 0.744277f, -0.761594f, 0.777888f, 0.793199f, 0.807569f, 0.821040f, -0.833655f, 0.845456f, 0.856485f, 0.866784f, 0.876393f, -0.885352f, 0.893698f, 0.901468f, 0.908698f, 0.915420f, -0.921669f, 0.927473f, 0.932862f, 0.937863f, 0.942503f, -0.946806f, 0.950795f, 0.954492f, 0.957917f, 0.961090f, -0.964028f, 0.966747f, 0.969265f, 0.971594f, 0.973749f, -0.975743f, 0.977587f, 0.979293f, 0.980869f, 0.982327f, -0.983675f, 0.984921f, 0.986072f, 0.987136f, 0.988119f, -0.989027f, 0.989867f, 0.990642f, 0.991359f, 0.992020f, -0.992631f, 0.993196f, 0.993718f, 0.994199f, 0.994644f, -0.995055f, 0.995434f, 0.995784f, 0.996108f, 0.996407f, -0.996682f, 0.996937f, 0.997172f, 0.997389f, 0.997590f, -0.997775f, 0.997946f, 0.998104f, 0.998249f, 0.998384f, -0.998508f, 0.998623f, 0.998728f, 0.998826f, 0.998916f, -0.999000f, 0.999076f, 0.999147f, 0.999213f, 0.999273f, -0.999329f, 0.999381f, 0.999428f, 0.999472f, 0.999513f, -0.999550f, 0.999585f, 0.999617f, 0.999646f, 0.999673f, -0.999699f, 0.999722f, 0.999743f, 0.999763f, 0.999781f, -0.999798f, 0.999813f, 0.999828f, 0.999841f, 0.999853f, -0.999865f, 0.999875f, 0.999885f, 0.999893f, 0.999902f, -0.999909f, 0.999916f, 0.999923f, 0.999929f, 0.999934f, -0.999939f, 0.999944f, 0.999948f, 0.999952f, 0.999956f, -0.999959f, 0.999962f, 0.999965f, 0.999968f, 0.999970f, -0.999973f, 0.999975f, 0.999977f, 0.999978f, 0.999980f, -0.999982f, 0.999983f, 0.999984f, 0.999986f, 0.999987f, -0.999988f, 0.999989f, 0.999990f, 0.999990f, 0.999991f, -0.999992f, 0.999992f, 0.999993f, 0.999994f, 0.999994f, -0.999994f, 0.999995f, 0.999995f, 0.999996f, 0.999996f, -0.999996f, 0.999997f, 0.999997f, 0.999997f, 0.999997f, -0.999997f, 0.999998f, 0.999998f, 0.999998f, 0.999998f, -0.999998f, 0.999998f, 0.999999f, 0.999999f, 0.999999f, -0.999999f, 0.999999f, 0.999999f, 0.999999f, 0.999999f, -0.999999f, 0.999999f, 0.999999f, 0.999999f, 0.999999f, -1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, -1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, -1.000000f, -}; From 25ec8dc8170551438630e92085b4f44c728a85a6 Mon Sep 17 00:00:00 2001 From: starg2 <75976488+starg2@users.noreply.github.com> Date: Sat, 16 Mar 2024 17:34:30 +0900 Subject: [PATCH 3/5] [opus] Enable runtime detection of AVX2 --- opus/CMakeLists.txt | 2 ++ opus/celt/x86/x86cpu.c | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/opus/CMakeLists.txt b/opus/CMakeLists.txt index afd72104..fc299bc3 100644 --- a/opus/CMakeLists.txt +++ b/opus/CMakeLists.txt @@ -12,9 +12,11 @@ add_definitions( -DOPUS_BUILD -DFLOAT_APPROX -DDLL_EXPORT + -DOPUS_HAVE_RTCD -DOPUS_X86_MAY_HAVE_SSE -DOPUS_X86_MAY_HAVE_SSE2 -DOPUS_X86_MAY_HAVE_SSE4_1 + -DOPUS_X86_MAY_HAVE_AVX2 -DOPUS_X86_PRESUME_SSE -DOPUS_X86_PRESUME_SSE2 -DOPUS_X86_PRESUME_SSE4_1 diff --git a/opus/celt/x86/x86cpu.c b/opus/celt/x86/x86cpu.c index 2e7c32ae..81cff4f4 100644 --- a/opus/celt/x86/x86cpu.c +++ b/opus/celt/x86/x86cpu.c @@ -41,7 +41,7 @@ (defined(OPUS_X86_MAY_HAVE_SSE4_1) && !defined(OPUS_X86_PRESUME_SSE4_1)) || \ (defined(OPUS_X86_MAY_HAVE_AVX2) && !defined(OPUS_X86_PRESUME_AVX2))) -#if defined(_MSC_VER) +#if defined(_WIN32) #include static _inline void cpuid(unsigned int CPUInfo[4], unsigned int InfoType) From e2f9bf59e3dc855a797387a0d39441f7ea5bcb0f Mon Sep 17 00:00:00 2001 From: starg2 <75976488+starg2@users.noreply.github.com> Date: Sat, 16 Mar 2024 17:42:36 +0900 Subject: [PATCH 4/5] [opus] Limit runtime AVX2 detection to MSVC and always enable AVX2 for AVX2/AVX512 builds --- opus/CMakeLists.txt | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/opus/CMakeLists.txt b/opus/CMakeLists.txt index fc299bc3..caa71c26 100644 --- a/opus/CMakeLists.txt +++ b/opus/CMakeLists.txt @@ -12,16 +12,26 @@ add_definitions( -DOPUS_BUILD -DFLOAT_APPROX -DDLL_EXPORT - -DOPUS_HAVE_RTCD -DOPUS_X86_MAY_HAVE_SSE -DOPUS_X86_MAY_HAVE_SSE2 -DOPUS_X86_MAY_HAVE_SSE4_1 - -DOPUS_X86_MAY_HAVE_AVX2 -DOPUS_X86_PRESUME_SSE -DOPUS_X86_PRESUME_SSE2 -DOPUS_X86_PRESUME_SSE4_1 ) +if("${TIM41_X86_SIMD_LEVEL}" MATCHES "^(AVX2|AVX512)$") + add_definitions( + -DOPUS_X86_MAY_HAVE_AVX2 + -DOPUS_X86_PRESUME_AVX2 + ) +elseif(MSVC) + add_definitions( + -DOPUS_HAVE_RTCD + -DOPUS_X86_MAY_HAVE_AVX2 + ) +endif() + add_library( opus SHARED From 3bb6b1fe8485333037c5841b738bc98d86664684 Mon Sep 17 00:00:00 2001 From: starg2 <75976488+starg2@users.noreply.github.com> Date: Sat, 16 Mar 2024 17:51:46 +0900 Subject: [PATCH 5/5] [opus] Exclude AVX2 sources unless necessary --- opus/CMakeLists.txt | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/opus/CMakeLists.txt b/opus/CMakeLists.txt index caa71c26..d6f1e699 100644 --- a/opus/CMakeLists.txt +++ b/opus/CMakeLists.txt @@ -248,9 +248,7 @@ add_library( silk/float/structs_FLP.h silk/float/warped_autocorrelation_FLP.c silk/float/wrappers_FLP.c - silk/float/x86/inner_product_FLP_avx2.c silk/x86/main_sse.h - silk/x86/NSQ_del_dec_avx2.c silk/x86/NSQ_del_dec_sse4_1.c silk/x86/NSQ_sse4_1.c silk/x86/SigProc_FIX_sse.h @@ -259,4 +257,13 @@ add_library( silk/x86/x86_silk_map.c ) +if("${TIM41_X86_SIMD_LEVEL}" MATCHES "^(AVX2|AVX512)$" OR MSVC) + target_sources( + opus + PRIVATE + silk/float/x86/inner_product_FLP_avx2.c + silk/x86/NSQ_del_dec_avx2.c + ) +endif() + install(TARGETS opus)