update version to v2.1.5

update version to v2.1.4
update version to v2.1.2
2026-05-21 09:22:11 +00:00 · 2024-08-27 09:56:16 +08:00 · 2024-05-13 13:48:59 +08:00 · 2024-01-25 10:08:14 +08:00 · 2023-10-30 11:24:27 +08:00 · 2023-08-08 19:30:55 +08:00
1907 changed files with 511005 additions and 39413 deletions
--- a/AT32F415_periph_lib_V2.1.5.chm
+++ b/AT32F415_periph_lib_V2.1.5.chm
--- a/29
+++ b/29
@@ -0,0 +1,29 @@
 BSD 3-Clause License
 Copyright (c) 2021, ArteryTek
 All rights reserved.
 Redistribution and use in source and binary forms, with or without
 modification, are permitted provided that the following conditions are met:
 * Redistributions of source code must retain the above copyright notice, this
  list of conditions and the following disclaimer.
 * Redistributions in binary form must reproduce the above copyright notice,
  this list of conditions and the following disclaimer in the documentation
  and/or other materials provided with the distribution.
 * Neither the name of the copyright holder nor the names of its
  contributors may be used to endorse or promote products derived from
  this software without specific prior written permission.
 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
 FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
 SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
--- a/document/AT32F415固件库BSP&Pack应用指南.pdf
+++ b/document/AT32F415固件库BSP&Pack应用指南.pdf
--- a/document/ReleaseNotes_AT32F415_Firmware_Library.pdf
+++ b/document/ReleaseNotes_AT32F415_Firmware_Library.pdf
--- a/libraries/cmsis/cm4/device_support/at32f415.h
+++ b/libraries/cmsis/cm4/device_support/at32f415.h
@@ -1,8 +1,6 @@
 /**
  **************************************************************************
  * @file     at32f415.h
  * @version  v2.0.0
  * @date     2021-11-26
  * @brief    at32f415 header file
  **************************************************************************
  *                       Copyright notice & Disclaimer
@@ -58,7 +56,7 @@ extern "C" {
    !defined (AT32F415KBU7_4) && !defined (AT32F415R8T7)   && !defined (AT32F415R8T7_7) && \
    !defined (AT32F415C8T7)   && !defined (AT32F415K8U7_4)
-    #error "Please select first the target at32f4xx device used in your application (in at32f4xx.h file)"
+    #error "Please select first the target device used in your application (in at32f415.h file)"
 #endif
 #if defined (AT32F415RCT7)   || defined (AT32F415RCT7_7) || defined (AT32F415CCT7)   || \
@@ -70,6 +68,47 @@ extern "C" {
    #define AT32F415xx
 #endif
 /**
  * define with package
  */
 #if defined (AT32F415RCT7)   || defined (AT32F415RCT7_7) || defined (AT32F415RBT7)   || \
    defined (AT32F415RBT7_7) || defined (AT32F415R8T7)   || defined (AT32F415R8T7_7)
    #define AT32F415Rx
 #endif
 #if defined (AT32F415CCT7)   || defined (AT32F415CCU7)   || defined (AT32F415CBT7)   || \
    defined (AT32F415CBU7)   || defined (AT32F415C8T7)
    #define AT32F415Cx
 #endif
 #if defined (AT32F415KCU7_4) || defined (AT32F415KBU7_4) || defined (AT32F415K8U7_4)
    #define AT32F415Kx
 #endif
 /**
  * define with memory density
  */
 #if defined (AT32F415R8T7)   || defined (AT32F415R8T7_7) || defined (AT32F415C8T7)   || \
    defined (AT32F415K8U7_4)
    #define AT32F415x8
 #endif
 #if defined (AT32F415RBT7)   || defined (AT32F415RBT7_7) || defined (AT32F415CBT7)   || \
    defined (AT32F415CBU7)   || defined (AT32F415KBU7_4)
    #define AT32F415xB
 #endif
 #if defined (AT32F415RCT7)   || defined (AT32F415RCT7_7) || defined (AT32F415CCT7)   || \
    defined (AT32F415CCU7)   || defined (AT32F415KCU7_4)
    #define AT32F415xC
 #endif
 #ifndef USE_STDPERIPH_DRIVER
 /**
  * @brief comment the line below if you will not use the peripherals drivers.
@@ -88,8 +127,8 @@ extern "C" {
  * @brief at32f415 standard peripheral library version number
  */
 #define __AT32F415_LIBRARY_VERSION_MAJOR    (0x02) /*!< [31:24] major version */
-#define __AT32F415_LIBRARY_VERSION_MIDDLE   (0x00) /*!< [23:16] middle version */
+#define __AT32F415_LIBRARY_VERSION_MIDDLE   (0x01) /*!< [23:16] middle version */
-#define __AT32F415_LIBRARY_VERSION_MINOR    (0x00) /*!< [15:8]  minor version */
+#define __AT32F415_LIBRARY_VERSION_MINOR    (0x05) /*!< [15:8]  minor version */
 #define __AT32F415_LIBRARY_VERSION_RC       (0x00) /*!< [7:0]  release candidate */
 #define __AT32F415_LIBRARY_VERSION          ((__AT32F415_LIBRARY_VERSION_MAJOR << 24)  | \
                                             (__AT32F415_LIBRARY_VERSION_MIDDLE << 16) | \
@@ -187,7 +226,7 @@ typedef enum IRQn
    OTGFS1_IRQn                 = 67,     /*!< otgfs1 global interrupt                              */
    CMP1_IRQn                   = 70,     /*!< comparator1 global interrupt                         */
    CMP2_IRQn                   = 71,     /*!< comparator2 global interrupt                         */
-    DMA2_Channel6_7_IRQn        = 75,     /*!< dma2 channel 6 and channel 7 global interrupt        */
+    DMA2_Channel6_7_IRQn        = 75      /*!< dma2 channel 6 and channel 7 global interrupt        */
 } IRQn_Type;
@@ -361,6 +400,7 @@ typedef enum {ERROR = 0, SUCCESS = !ERROR} error_status;
  * @}
  */
 #include "at32f415_def.h"
 #include "at32f415_conf.h"
 #ifdef __cplusplus
--- a/libraries/cmsis/cm4/device_support/at32f415_conf_template.h
+++ b/libraries/cmsis/cm4/device_support/at32f415_conf_template.h
@@ -1,8 +1,6 @@
 /**
  **************************************************************************
  * @file     at32f415_conf.h
  * @version  v2.0.0
  * @date     2021-11-26
  * @brief    at32f415 config header file
  **************************************************************************
  *                       Copyright notice & Disclaimer
@@ -34,21 +32,22 @@ extern "C" {
 /**
-  * @brief in the following line adjust the value of high speed exernal crystal (hext)
+  * @brief in the following line adjust the value of high speed external crystal (hext)
  * used in your application
  * tip: to avoid modifying this file each time you need to use different hext, you
  *      can define the hext value in your toolchain compiler preprocessor.
  */
 #if !defined  HEXT_VALUE
-#define HEXT_VALUE               ((uint32_t)8000000) /*!< value of the high speed exernal crystal in hz */
+#define HEXT_VALUE               ((uint32_t)8000000) /*!< value of the high speed external crystal in hz */
 #endif
 /**
-  * @brief in the following line adjust the high speed exernal crystal (hext) startup
+  * @brief in the following line adjust the high speed external crystal (hext) startup
  * timeout value
  */
-#define HEXT_STARTUP_TIMEOUT     ((uint16_t)0x3000) /*!< time out for hext start up */
+#define HEXT_STARTUP_TIMEOUT             ((uint16_t)0x3000)  /*!< time out for hext start up */
-#define HICK_VALUE               ((uint32_t)8000000) /*!< value of the high speed internal clock in hz */
+#define HICK_VALUE                       ((uint32_t)8000000) /*!< value of the high speed internal clock in hz */
 #define LEXT_VALUE                       ((uint32_t)32768)   /*!< value of the low speed external clock in hz */
 /* module define -------------------------------------------------------------*/
 #define CRM_MODULE_ENABLED
--- a/libraries/cmsis/cm4/device_support/startup/gcc/linker/AT32F415x8_FLASH.ld
+++ b/libraries/cmsis/cm4/device_support/startup/gcc/linker/AT32F415x8_FLASH.ld
@@ -22,7 +22,7 @@
 ENTRY(Reset_Handler)
 /* Highest address of the user mode stack */
-_estack = 0x20007FFF;    /* end of RAM */
+_estack = 0x20008000;    /* end of RAM */
 /* Generate a link error if heap and stack don't fit into RAM */
 _Min_Heap_Size = 0x200;      /* required amount of heap  */
@@ -134,12 +134,12 @@ SECTIONS
  /* User_heap_stack section, used to check that there is enough RAM left */
  ._user_heap_stack :
  {
-    . = ALIGN(4);
+    . = ALIGN(8);
    PROVIDE ( end = . );
    PROVIDE ( _end = . );
    . = . + _Min_Heap_Size;
    . = . + _Min_Stack_Size;
-    . = ALIGN(4);
+    . = ALIGN(8);
  } >RAM
  /* Remove information from the standard libraries */
--- a/libraries/cmsis/cm4/device_support/startup/gcc/linker/AT32F415xB_FLASH.ld
+++ b/libraries/cmsis/cm4/device_support/startup/gcc/linker/AT32F415xB_FLASH.ld
@@ -22,7 +22,7 @@
 ENTRY(Reset_Handler)
 /* Highest address of the user mode stack */
-_estack = 0x20007FFF;    /* end of RAM */
+_estack = 0x20008000;    /* end of RAM */
 /* Generate a link error if heap and stack don't fit into RAM */
 _Min_Heap_Size = 0x200;      /* required amount of heap  */
@@ -134,12 +134,12 @@ SECTIONS
  /* User_heap_stack section, used to check that there is enough RAM left */
  ._user_heap_stack :
  {
-    . = ALIGN(4);
+    . = ALIGN(8);
    PROVIDE ( end = . );
    PROVIDE ( _end = . );
    . = . + _Min_Heap_Size;
    . = . + _Min_Stack_Size;
-    . = ALIGN(4);
+    . = ALIGN(8);
  } >RAM
  /* Remove information from the standard libraries */
--- a/libraries/cmsis/cm4/device_support/startup/gcc/linker/AT32F415xC_FLASH.ld
+++ b/libraries/cmsis/cm4/device_support/startup/gcc/linker/AT32F415xC_FLASH.ld
@@ -22,7 +22,7 @@
 ENTRY(Reset_Handler)
 /* Highest address of the user mode stack */
-_estack = 0x20007FFF;    /* end of RAM */
+_estack = 0x20008000;    /* end of RAM */
 /* Generate a link error if heap and stack don't fit into RAM */
 _Min_Heap_Size = 0x200;      /* required amount of heap  */
@@ -134,12 +134,12 @@ SECTIONS
  /* User_heap_stack section, used to check that there is enough RAM left */
  ._user_heap_stack :
  {
-    . = ALIGN(4);
+    . = ALIGN(8);
    PROVIDE ( end = . );
    PROVIDE ( _end = . );
    . = . + _Min_Heap_Size;
    . = . + _Min_Stack_Size;
-    . = ALIGN(4);
+    . = ALIGN(8);
  } >RAM
  /* Remove information from the standard libraries */
--- a/libraries/cmsis/cm4/device_support/startup/gcc/startup_at32f415.s
+++ b/libraries/cmsis/cm4/device_support/startup/gcc/startup_at32f415.s
@@ -1,8 +1,6 @@
 /**
  ******************************************************************************
  * @file     startup_at32f415.s
  * @version  v2.0.0
  * @date     2021-11-26
  * @brief    at32f415xx devices vector table for gcc toolchain.
  *           this module performs:
  *           - set the initial sp
@@ -134,8 +132,8 @@ g_pfnVectors:
  /* External Interrupts */
  .word  WWDT_IRQHandler                     /* Window Watchdog Timer                   */
  .word  PVM_IRQHandler                      /* PVM through EXINT Line detect           */
-  .word  TAMPER_IRQHandler                   /* Tamper                                  */
+  .word  TAMP_STAMP_IRQHandler               /* Tamper and TimeStamps through the EXINT line */
-  .word  ERTC_IRQHandler                     /* ERTC                                    */
+  .word  ERTC_WKUP_IRQHandler                /* ERTC Wakeup through the EXINT line      */
  .word  FLASH_IRQHandler                    /* Flash                                   */
  .word  CRM_IRQHandler                      /* CRM                                     */
  .word  EXINT0_IRQHandler                   /* EXINT Line 0                            */
@@ -249,11 +247,11 @@ g_pfnVectors:
   .weak      PVM_IRQHandler
   .thumb_set PVM_IRQHandler,Default_Handler
-   .weak      TAMPER_IRQHandler
+   .weak      TAMP_STAMP_IRQHandler
-   .thumb_set TAMPER_IRQHandler,Default_Handler
+   .thumb_set TAMP_STAMP_IRQHandler,Default_Handler
-   .weak      ERTC_IRQHandler
+   .weak      ERTC_WKUP_IRQHandler
-   .thumb_set ERTC_IRQHandler,Default_Handler
+   .thumb_set ERTC_WKUP_IRQHandler,Default_Handler
   .weak      FLASH_IRQHandler
   .thumb_set FLASH_IRQHandler,Default_Handler
--- a/libraries/cmsis/cm4/device_support/startup/iar/startup_at32f415.s
+++ b/libraries/cmsis/cm4/device_support/startup/iar/startup_at32f415.s
@@ -1,7 +1,5 @@
 ;**************************************************************************
 ;* @file     startup_at32f415.s
 ;* @version  v2.0.0
 ;* @date     2021-11-26
 ;* @brief    at32f415 startup file for IAR Systems
 ;**************************************************************************
 ;
@@ -46,8 +44,8 @@ __vector_table
        ; External Interrupts
        DCD     WWDT_IRQHandler                     ; Window Watchdog Timer
        DCD     PVM_IRQHandler                      ; PVM through EXINT Line detect
-        DCD     TAMPER_IRQHandler                   ; Tamper
+        DCD     TAMP_STAMP_IRQHandler               ; Tamper and TimeStamps through the EXINT line
-        DCD     ERTC_IRQHandler                     ; ERTC
+        DCD     ERTC_WKUP_IRQHandler                ; ERTC Wakeup through the EXINT line
        DCD     FLASH_IRQHandler                    ; Flash
        DCD     CRM_IRQHandler                      ; CRM
        DCD     EXINT0_IRQHandler                   ; EXINT Line 0
@@ -190,15 +188,15 @@ WWDT_IRQHandler
 PVM_IRQHandler
        B PVM_IRQHandler
-        PUBWEAK TAMPER_IRQHandler
+        PUBWEAK TAMP_STAMP_IRQHandler
        SECTION .text:CODE:REORDER:NOROOT(1)
-TAMPER_IRQHandler
+TAMP_STAMP_IRQHandler
-        B TAMPER_IRQHandler
+        B TAMP_STAMP_IRQHandler
-        PUBWEAK ERTC_IRQHandler
+        PUBWEAK ERTC_WKUP_IRQHandler
        SECTION .text:CODE:REORDER:NOROOT(1)
-ERTC_IRQHandler
+ERTC_WKUP_IRQHandler
-        B ERTC_IRQHandler
+        B ERTC_WKUP_IRQHandler
        PUBWEAK FLASH_IRQHandler
        SECTION .text:CODE:REORDER:NOROOT(1)
--- a/libraries/cmsis/cm4/device_support/startup/mdk/startup_at32f415.s
+++ b/libraries/cmsis/cm4/device_support/startup/mdk/startup_at32f415.s
@@ -1,8 +1,7 @@
 ;**************************************************************************
 ;* @file     startup_at32f415.s
 ;* @version  v2.0.0
 ;* @date     2021-11-26
 ;* @brief    at32f415 startup file for keil
 ;* <<< Use Configuration Wizard in Context Menu >>>  
 ;**************************************************************************
 ;
--- a/libraries/cmsis/cm4/device_support/system_at32f415.c
+++ b/libraries/cmsis/cm4/device_support/system_at32f415.c
@@ -1,8 +1,6 @@
 /**
  **************************************************************************
  * @file     system_at32f415.c
  * @version  v2.0.0
  * @date     2021-11-26
  * @brief    contains all the functions for cmsis cortex-m4 system source file
  **************************************************************************
  *                       Copyright notice & Disclaimer
@@ -63,6 +61,11 @@ unsigned int system_core_clock           = HICK_VALUE; /*!< system clock frequen
  */
 void SystemInit (void)
 {
  /* enable low power mode */
  CRM->apb1en_bit.pwcen = 1;
  *(volatile uint8_t *)(0x40007050) |= (uint8_t)(0x1 << 2);
  CRM->apb1en_bit.pwcen = 0;
  /* reset the crm clock configuration to the default reset state(for debug purpose) */
  /* set hicken bit */
  CRM->ctrl_bit.hicken = TRUE;
@@ -76,6 +79,9 @@ void SystemInit (void)
  /* wait sclk switch status */
  while(CRM->cfg_bit.sclksts != CRM_SCLK_HICK);
  /* reset hexten, hextbyps, cfden and pllen bits */
  CRM->ctrl &= ~(0x010D0000U);
  /* reset cfg register, include sclk switch, ahbdiv, apb1div, apb2div, adcdiv,
     clkout pllrcs, pllhextdiv, pllmult, usbdiv and pllrange bits */
  CRM->cfg = 0;
@@ -83,9 +89,6 @@ void SystemInit (void)
  /* reset pllfr, pllms, pllns and pllfref bits */
  CRM->pll = (0x00001F10U);
  /* reset hexten, hextbyps, cfden and pllen bits */
  CRM->ctrl &= ~(0x010D0000U);
  /* reset clkout[3], usbbufs, hickdiv, clkoutdiv */
  CRM->misc1 = 0x00100000;
@@ -188,7 +191,7 @@ void system_core_clock_update(void)
            pllrcsfreq = HEXT_VALUE;
          }
        }
-        system_core_clock = (pllrcsfreq * pll_ns) / (pll_ms * (0x1 << pll_fr));
+        system_core_clock = (uint32_t)(((uint64_t)pllrcsfreq * pll_ns) / (pll_ms * (0x1 << pll_fr)));
      }
      break;
    default:
--- a/libraries/cmsis/cm4/device_support/system_at32f415.h
+++ b/libraries/cmsis/cm4/device_support/system_at32f415.h
@@ -1,8 +1,6 @@
 /**
  **************************************************************************
  * @file     system_at32f415.h
  * @version  v2.0.0
  * @date     2021-11-26
  * @brief    cmsis cortex-m4 system header file.
  **************************************************************************
  *                       Copyright notice & Disclaimer
@@ -45,6 +43,11 @@ extern "C" {
 #define HEXT_STABLE_DELAY                (5000u)
 #define PLL_STABLE_DELAY                 (500u)
 #define SystemCoreClock                  system_core_clock
 #define DUMMY_NOP()                      {__NOP();__NOP();__NOP();__NOP();__NOP(); \
                                          __NOP();__NOP();__NOP();__NOP();__NOP(); \
                                          __NOP();__NOP();__NOP();__NOP();__NOP(); \
                                          __NOP();__NOP();__NOP();__NOP();__NOP();}
 /**
  * @}
--- a/libraries/cmsis/dsp/ComputeLibrary/Include/NEMath.h
+++ b/libraries/cmsis/dsp/ComputeLibrary/Include/NEMath.h
@@ -0,0 +1,414 @@
 /*
 * Copyright (c) 2016, 2019 ARM Limited.
 *
 * SPDX-License-Identifier: MIT
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to
 * deal in the Software without restriction, including without limitation the
 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 * sell copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
 #ifndef __ARM_COMPUTE_NEMATH_H__
 #define __ARM_COMPUTE_NEMATH_H__
 #if defined(ARM_MATH_NEON)
 /** Calculate floor of a vector.
 *
 * @param[in] val Input vector value in F32 format.
 *
 * @return The calculated floor vector.
 */
 static inline float32x4_t vfloorq_f32(float32x4_t val);
 /** Calculate inverse square root.
 *
 * @param[in] x Input value.
 *
 * @return The calculated inverse square root.
 */
 static inline float32x2_t vinvsqrt_f32(float32x2_t x);
 /** Calculate inverse square root.
 *
 * @param[in] x Input value.
 *
 * @return The calculated inverse square root.
 */
 static inline float32x4_t vinvsqrtq_f32(float32x4_t x);
 /** Calculate reciprocal.
 *
 * @param[in] x Input value.
 *
 * @return The calculated reciprocal.
 */
 static inline float32x2_t vinv_f32(float32x2_t x);
 /** Calculate reciprocal.
 *
 * @param[in] x Input value.
 *
 * @return The calculated reciprocal.
 */
 static inline float32x4_t vinvq_f32(float32x4_t x);
 /** Perform a 7th degree polynomial approximation using Estrin's method.
 *
 * @param[in] x      Input vector value in F32 format.
 * @param[in] coeffs Polynomial coefficients table. (array of flattened float32x4_t vectors)
 *
 * @return The calculated approximation.
 */
 static inline float32x4_t vtaylor_polyq_f32(float32x4_t x, const float32_t *coeffs);
 /** Calculate exponential
 *
 * @param[in] x Input vector value in F32 format.
 *
 * @return The calculated exponent.
 */
 static inline float32x4_t vexpq_f32(float32x4_t x);
 /** Calculate logarithm
 *
 * @param[in] x Input vector value in F32 format.
 *
 * @return The calculated logarithm.
 */
 static inline float32x4_t vlogq_f32(float32x4_t x);
 /** Calculate hyperbolic tangent.
 *
 * tanh(x) = (e^2x - 1)/(e^2x + 1)
 *
 * @note We clamp x to [-10,10] (the CONST_MIN_TANH / CONST_MAX_TANH values used
 *       by the definition) to avoid overflowing issues.
 *
 * @param[in] val Input vector value in F32 format.
 *
 * @return The calculated Hyperbolic Tangent.
 */
 static inline float32x4_t vtanhq_f32(float32x4_t val);
 /** Calculate n power of a number.
 *
 * pow(x,n) = e^(n*log(x))
 *
 * @param[in] val Input vector value in F32 format.
 * @param[in] n   Powers to raise the input to.
 *
 * @return The calculated power.
 */
 static inline float32x4_t vpowq_f32(float32x4_t val, float32x4_t n);
 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
 /** Calculate hyperbolic tangent.
 *
 * tanh(x) = (e^2x - 1)/(e^2x + 1)
 *
 * @note We clamp x to [-10,10] (the CONST_MIN_TANH / CONST_MAX_TANH values used
 *       by the definition) to avoid overflowing issues.
 *
 * @param[in] val Input vector value in F16 format.
 *
 * @return The calculated Hyperbolic Tangent.
 */
 static inline float16x8_t vtanhq_f16(float16x8_t val);
 /** Calculate reciprocal.
 *
 * @param[in] x Input value.
 *
 * @return The calculated reciprocal.
 */
 static inline float16x4_t vinv_f16(float16x4_t x);
 /** Calculate reciprocal.
 *
 * @param[in] x Input value.
 *
 * @return The calculated reciprocal.
 */
 static inline float16x8_t vinvq_f16(float16x8_t x);
 /** Calculate inverse square root.
 *
 * @param[in] x Input value.
 *
 * @return The calculated inverse square root.
 */
 static inline float16x4_t vinvsqrt_f16(float16x4_t x);
 /** Calculate inverse square root.
 *
 * @param[in] x Input value.
 *
 * @return The calculated inverse square root.
 */
 static inline float16x8_t vinvsqrtq_f16(float16x8_t x);
 /** Calculate exponential
 *
 * @param[in] x Input vector value in F16 format.
 *
 * @return The calculated exponent.
 */
 static inline float16x8_t vexpq_f16(float16x8_t x);
 /** Calculate n power of a number.
 *
 * pow(x,n) = e^(n*log(x))
 *
 * @param[in] val Input vector value in F16 format.
 * @param[in] n   Powers to raise the input to.
 *
 * @return The calculated power.
 */
 static inline float16x8_t vpowq_f16(float16x8_t val, float16x8_t n);
 #endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
 /** Exponent polynomial coefficients */
 extern const float32_t exp_tab[4*8];
 /** Logarithm polynomial coefficients */
 extern const float32_t log_tab[4*8];
 #ifndef DOXYGEN_SKIP_THIS
 /* Floor of each F32 lane: round-trip the input through int32
  * (vcvtq_s32_f32 / vcvtq_f32_s32), then subtract 1 in every lane where the
  * round-tripped value is greater than the input. */
 inline float32x4_t vfloorq_f32(float32x4_t val)
 {
    static const float32_t ONE_LANES[4] = {1.f,1.f,1.f,1.f};
    const float32x4_t round_trip  = vcvtq_f32_s32(vcvtq_s32_f32(val));
    const float32x4_t minus_one   = vsubq_f32(round_trip, vld1q_f32(ONE_LANES));
    /* lanes set where the int conversion landed above the original value */
    const uint32x4_t  above_input = vcgtq_f32(round_trip, val);
    return vbslq_f32(above_input, minus_one, round_trip);
 }
 /* Reciprocal square root of each lane (2 x F32): start from the vrsqrte_f32
  * estimate and apply two vrsqrts_f32 refinement steps. */
 inline float32x2_t vinvsqrt_f32(float32x2_t x)
 {
    float32x2_t estimate = vrsqrte_f32(x);
    for(int pass = 0; pass < 2; ++pass)
    {
        const float32x2_t correction = vrsqrts_f32(vmul_f32(x, estimate), estimate);
        estimate = vmul_f32(correction, estimate);
    }
    return estimate;
 }
 /* Reciprocal square root of each lane (4 x F32): the vrsqrteq_f32 estimate is
  * refined twice, each time multiplying it by the vrsqrtsq_f32 step value. */
 inline float32x4_t vinvsqrtq_f32(float32x4_t x)
 {
    const float32x4_t initial = vrsqrteq_f32(x);
    const float32x4_t first   = vmulq_f32(vrsqrtsq_f32(vmulq_f32(x, initial), initial), initial);
    const float32x4_t second  = vmulq_f32(vrsqrtsq_f32(vmulq_f32(x, first), first), first);
    return second;
 }
 /* Reciprocal of each lane (2 x F32): start from the vrecpe_f32 estimate and
  * apply two vrecps_f32 refinement steps. */
 inline float32x2_t vinv_f32(float32x2_t x)
 {
    float32x2_t estimate = vrecpe_f32(x);
    for(int pass = 0; pass < 2; ++pass)
    {
        const float32x2_t correction = vrecps_f32(x, estimate);
        estimate = vmul_f32(correction, estimate);
    }
    return estimate;
 }
 /* Reciprocal of each lane (4 x F32): the vrecpeq_f32 estimate is refined
  * twice, each time multiplying it by the vrecpsq_f32 step value. */
 inline float32x4_t vinvq_f32(float32x4_t x)
 {
    const float32x4_t initial = vrecpeq_f32(x);
    const float32x4_t first   = vmulq_f32(vrecpsq_f32(x, initial), initial);
    const float32x4_t second  = vmulq_f32(vrecpsq_f32(x, first), first);
    return second;
 }
 /* Evaluate the polynomial described by coeffs at x, lane by lane.
  * coeffs is read as eight consecutive groups of four floats; group k starts
  * at coeffs[4*k]. With g(k) denoting group k, the result is
  *   (g0 + g4*x) + (g2 + g6*x)*x^2 + ((g1 + g5*x) + (g3 + g7*x)*x^2)*x^4
  * computed with multiply-accumulate intrinsics. */
 inline float32x4_t vtaylor_polyq_f32(float32x4_t x, const float32_t *coeffs)
 {
    /* pairwise terms, linear in x */
    const float32x4_t pair_04 = vmlaq_f32(vld1q_f32(coeffs + 4*0), vld1q_f32(coeffs + 4*4), x);
    const float32x4_t pair_26 = vmlaq_f32(vld1q_f32(coeffs + 4*2), vld1q_f32(coeffs + 4*6), x);
    const float32x4_t pair_15 = vmlaq_f32(vld1q_f32(coeffs + 4*1), vld1q_f32(coeffs + 4*5), x);
    const float32x4_t pair_37 = vmlaq_f32(vld1q_f32(coeffs + 4*3), vld1q_f32(coeffs + 4*7), x);
    /* powers of x used to combine the pairs */
    const float32x4_t x_sq   = vmulq_f32(x, x);
    const float32x4_t x_quad = vmulq_f32(x_sq, x_sq);
    const float32x4_t lower  = vmlaq_f32(pair_04, pair_26, x_sq);
    const float32x4_t upper  = vmlaq_f32(pair_15, pair_37, x_sq);
    return vmlaq_f32(lower, upper, x_quad);
 }
 /* exp(x) for each F32 lane: reduce the argument by a multiple of ln(2),
  * evaluate the exp_tab polynomial on the remainder, then fold the multiple
  * back in through integer arithmetic on the float bit pattern. */
 inline float32x4_t vexpq_f32(float32x4_t x)
 {
    static const float32_t LN2_LANES[4]     = {0.6931471805f,0.6931471805f,0.6931471805f,0.6931471805f}; // ln(2)
    static const float32_t INV_LN2_LANES[4] = {1.4426950408f,1.4426950408f,1.4426950408f,1.4426950408f}; // 1/ln(2)
    static const float32_t ZERO_LANES[4]    = {0.f,0.f,0.f,0.f};
    static const int32_t   MIN_EXP_LANES[4] = {-126,-126,-126,-126};
    // Range reduction: m = int(x * 1/ln(2)), reduced = x - m * ln(2)
    const int32x4_t   m       = vcvtq_s32_f32(vmulq_f32(x, vld1q_f32(INV_LN2_LANES)));
    const float32x4_t reduced = vmlsq_f32(x, vcvtq_f32_s32(m), vld1q_f32(LN2_LANES));
    // Polynomial approximation on the reduced argument
    const float32x4_t approx = vtaylor_polyq_f32(reduced, exp_tab);
    // Reconstruct: saturating add of (m << 23) to the raw bits of the approximation
    const int32x4_t   rebuilt_bits = vqaddq_s32(vreinterpretq_s32_f32(approx), vqshlq_n_s32(m, 23));
    const float32x4_t rebuilt      = vreinterpretq_f32_s32(rebuilt_bits);
    // Lanes where m < -126 are replaced by 0
    const uint32x4_t below_min = vcltq_s32(m, vld1q_s32(MIN_EXP_LANES));
    return vbslq_f32(below_min, vld1q_f32(ZERO_LANES), rebuilt);
 }
 /* log(x) for each F32 lane: split off m = (bits >> 23) - 127, evaluate the
  * log_tab polynomial on the value whose bit pattern has (m << 23) removed,
  * and add m * ln(2) back. */
 inline float32x4_t vlogq_f32(float32x4_t x)
 {
    static const int32_t   BIAS_LANES[4] = {127,127,127,127};           // 127
    static const float32_t LN2_LANES[4]  = {0.6931471805f,0.6931471805f,0.6931471805f,0.6931471805f}; // ln(2)
    // Extract exponent
    const uint32x4_t  shifted_bits  = vshrq_n_u32(vreinterpretq_u32_f32(x), 23);
    const int32x4_t   m             = vsubq_s32(vreinterpretq_s32_u32(shifted_bits), vld1q_s32(BIAS_LANES));
    const int32x4_t   residual_bits = vsubq_s32(vreinterpretq_s32_f32(x), vshlq_n_s32(m, 23));
    const float32x4_t residual      = vreinterpretq_f32_s32(residual_bits);
    // Polynomial Approximation
    const float32x4_t approx = vtaylor_polyq_f32(residual, log_tab);
    // Reconstruct
    return vmlaq_f32(approx, vcvtq_f32_s32(m), vld1q_f32(LN2_LANES));
 }
 /* tanh for each F32 lane, computed as (e^2x - 1) / (e^2x + 1) after clamping
  * the input to [-10, 10]; the division is done by multiplying with
  * vinvq_f32 of the denominator. */
 inline float32x4_t vtanhq_f32(float32x4_t val)
 {
    static const float32_t ONE_LANES[4]   = {1.f,1.f,1.f,1.f};
    static const float32_t TWO_LANES[4]   = {2.f,2.f,2.f,2.f};
    static const float32_t LOWER_LANES[4] = {-10.f,-10.f,-10.f,-10.f};
    static const float32_t UPPER_LANES[4] = {10.f,10.f,10.f,10.f};
    /* clamp: max against the lower bound first, then min against the upper bound */
    const float32x4_t floored     = vmaxq_f32(val, vld1q_f32(LOWER_LANES));
    const float32x4_t clamped     = vminq_f32(floored, vld1q_f32(UPPER_LANES));
    const float32x4_t exp_2x      = vexpq_f32(vmulq_f32(vld1q_f32(TWO_LANES), clamped));
    const float32x4_t numerator   = vsubq_f32(exp_2x, vld1q_f32(ONE_LANES));
    const float32x4_t denominator = vaddq_f32(exp_2x, vld1q_f32(ONE_LANES));
    return vmulq_f32(numerator, vinvq_f32(denominator));
 }
 /* pow(val, n) for each F32 lane, computed as exp(n * log(val)). */
 inline float32x4_t vpowq_f32(float32x4_t val, float32x4_t n)
 {
    const float32x4_t log_val   = vlogq_f32(val);
    const float32x4_t exp_input = vmulq_f32(n, log_val);
    return vexpq_f32(exp_input);
 }
 #endif /* DOXYGEN_SKIP_THIS */
 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
 /** Exponent polynomial coefficients */
 /** Logarithm polynomial coefficients */
 #ifndef DOXYGEN_SKIP_THIS
 /** Calculate floor of a vector.
 *
 * The input is round-tripped through int16 (vcvtq_s16_f16 / vcvtq_f16_s16);
 * lanes where the round-tripped value is greater than the input get 1
 * subtracted.
 *
 * Declared static: this function has no earlier `static inline` prototype in
 * this header, and a plain `inline` definition in C99 provides no external
 * definition, which can leave an unresolved symbol when a call is not inlined.
 *
 * @param[in] val Input vector value in F16 format.
 *
 * @return The calculated floor vector.
 */
 static inline float16x8_t vfloorq_f16(float16x8_t val)
 {
    static const float16_t CONST_1[8] = {1.f,1.f,1.f,1.f,1.f,1.f,1.f,1.f};
    const int16x8_t   z = vcvtq_s16_f16(val);
    const float16x8_t r = vcvtq_f16_s16(z);
    /* select r - 1 where r > val, r otherwise */
    return vbslq_f16(vcgtq_f16(r, val), vsubq_f16(r, vld1q_f16(CONST_1)), r);
 }
 /* Reciprocal square root of each lane (4 x F16): start from the vrsqrte_f16
  * estimate and apply two vrsqrts_f16 refinement steps. */
 inline float16x4_t vinvsqrt_f16(float16x4_t x)
 {
    float16x4_t estimate = vrsqrte_f16(x);
    for(int pass = 0; pass < 2; ++pass)
    {
        const float16x4_t correction = vrsqrts_f16(vmul_f16(x, estimate), estimate);
        estimate = vmul_f16(correction, estimate);
    }
    return estimate;
 }
 /* Reciprocal square root of each lane (8 x F16): the vrsqrteq_f16 estimate is
  * refined twice, each time multiplying it by the vrsqrtsq_f16 step value. */
 inline float16x8_t vinvsqrtq_f16(float16x8_t x)
 {
    const float16x8_t initial = vrsqrteq_f16(x);
    const float16x8_t first   = vmulq_f16(vrsqrtsq_f16(vmulq_f16(x, initial), initial), initial);
    const float16x8_t second  = vmulq_f16(vrsqrtsq_f16(vmulq_f16(x, first), first), first);
    return second;
 }
 /* Reciprocal of each lane (4 x F16): start from the vrecpe_f16 estimate and
  * apply two vrecps_f16 refinement steps. */
 inline float16x4_t vinv_f16(float16x4_t x)
 {
    float16x4_t estimate = vrecpe_f16(x);
    for(int pass = 0; pass < 2; ++pass)
    {
        const float16x4_t correction = vrecps_f16(x, estimate);
        estimate = vmul_f16(correction, estimate);
    }
    return estimate;
 }
 /* Reciprocal of each lane (8 x F16): the vrecpeq_f16 estimate is refined
  * twice, each time multiplying it by the vrecpsq_f16 step value. */
 inline float16x8_t vinvq_f16(float16x8_t x)
 {
    const float16x8_t initial = vrecpeq_f16(x);
    const float16x8_t first   = vmulq_f16(vrecpsq_f16(x, initial), initial);
    const float16x8_t second  = vmulq_f16(vrecpsq_f16(x, first), first);
    return second;
 }
 /* tanh for each F16 lane, computed as (e^2x - 1) / (e^2x + 1) after clamping
  * the input to [-10, 10]; the division is done by multiplying with
  * vinvq_f16 of the denominator. */
 inline float16x8_t vtanhq_f16(float16x8_t val)
 {
    const float16_t ONE_LANES[8]   = {1.f,1.f,1.f,1.f,1.f,1.f,1.f,1.f};
    const float16_t TWO_LANES[8]   = {2.f,2.f,2.f,2.f,2.f,2.f,2.f,2.f};
    const float16_t LOWER_LANES[8] = {-10.f,-10.f,-10.f,-10.f,-10.f,-10.f,-10.f,-10.f};
    const float16_t UPPER_LANES[8] = {10.f,10.f,10.f,10.f,10.f,10.f,10.f,10.f};
    /* clamp: max against the lower bound first, then min against the upper bound */
    const float16x8_t floored     = vmaxq_f16(val, vld1q_f16(LOWER_LANES));
    const float16x8_t clamped     = vminq_f16(floored, vld1q_f16(UPPER_LANES));
    const float16x8_t exp_2x      = vexpq_f16(vmulq_f16(vld1q_f16(TWO_LANES), clamped));
    const float16x8_t numerator   = vsubq_f16(exp_2x, vld1q_f16(ONE_LANES));
    const float16x8_t denominator = vaddq_f16(exp_2x, vld1q_f16(ONE_LANES));
    return vmulq_f16(numerator, vinvq_f16(denominator));
 }
 /** Perform a polynomial approximation, F16 counterpart of vtaylor_polyq_f32.
 *
 * The coefficient groups are loaded into vectors with vld1q_f16 before use:
 * vaddq_f16 / vmulq_f16 take float16x8_t operands, so passing the raw
 * `const float16_t *` addresses (as the previous code did) is a type error.
 *
 * Declared static: this function has no earlier `static inline` prototype in
 * this header, and a plain `inline` definition in C99 provides no external
 * definition.
 *
 * @param[in] x      Input vector value in F16 format.
 * @param[in] coeffs Polynomial coefficients table, read as eight consecutive
 *                   groups of eight float16_t values (group k starts at
 *                   coeffs[8*k]).
 *
 * @return The calculated approximation.
 */
 static inline float16x8_t vtaylor_polyq_f16(float16x8_t x, const float16_t *coeffs)
 {
    /* pairwise terms, linear in x */
    const float16x8_t A   = vaddq_f16(vld1q_f16(&coeffs[8*0]), vmulq_f16(vld1q_f16(&coeffs[8*4]), x));
    const float16x8_t B   = vaddq_f16(vld1q_f16(&coeffs[8*2]), vmulq_f16(vld1q_f16(&coeffs[8*6]), x));
    const float16x8_t C   = vaddq_f16(vld1q_f16(&coeffs[8*1]), vmulq_f16(vld1q_f16(&coeffs[8*5]), x));
    const float16x8_t D   = vaddq_f16(vld1q_f16(&coeffs[8*3]), vmulq_f16(vld1q_f16(&coeffs[8*7]), x));
    const float16x8_t x2  = vmulq_f16(x, x);
    const float16x8_t x4  = vmulq_f16(x2, x2);
    /* (A + B*x^2) + (C + D*x^2)*x^4 */
    const float16x8_t res = vaddq_f16(vaddq_f16(A, vmulq_f16(B, x2)), vmulq_f16(vaddq_f16(C, vmulq_f16(D, x2)), x4));
    return res;
 }
 /* exp(x) for each F16 lane: widen both halves to F32, run vexpq_f32 on each,
  * and narrow the two results back into a single F16 vector. */
 inline float16x8_t vexpq_f16(float16x8_t x)
 {
    // TODO (COMPMID-1535) : Revisit FP16 approximations
    const float32x4_t upper_f32 = vcvt_f32_f16(vget_high_f16(x));
    const float32x4_t lower_f32 = vcvt_f32_f16(vget_low_f16(x));
    const float16x4_t lower_exp = vcvt_f16_f32(vexpq_f32(lower_f32));
    /* the narrowed low half fills the low lanes, the converted high half the high lanes */
    return vcvt_high_f16_f32(lower_exp, vexpq_f32(upper_f32));
 }
 /** Calculate logarithm
 *
 * Both halves of the input are widened to F32, passed through vlogq_f32, and
 * narrowed back into a single F16 vector.
 *
 * Declared static: this function has no earlier `static inline` prototype in
 * this header, and a plain `inline` definition in C99 provides no external
 * definition, which can leave an unresolved symbol when a call is not inlined.
 *
 * @param[in] x Input vector value in F16 format.
 *
 * @return The calculated logarithm.
 */
 static inline float16x8_t vlogq_f16(float16x8_t x)
 {
    // TODO (COMPMID-1535) : Revisit FP16 approximations
    const float32x4_t x_high = vcvt_f32_f16(vget_high_f16(x));
    const float32x4_t x_low  = vcvt_f32_f16(vget_low_f16(x));
    const float16x8_t res = vcvt_high_f16_f32(vcvt_f16_f32(vlogq_f32(x_low)), vlogq_f32(x_high));
    return res;
 }
 /* pow(val, n) for each F16 lane: each half is widened to F32, evaluated as
  * exp(n * log(val)) with the F32 routines, and the two narrowed results are
  * combined (low half first). */
 inline float16x8_t vpowq_f16(float16x8_t val, float16x8_t n)
 {
    // TODO (giaiod01) - COMPMID-1535
    const float32x4_t exponent_lo = vcvt_f32_f16(vget_low_f16(n));
    const float32x4_t exponent_hi = vcvt_f32_f16(vget_high_f16(n));
    const float32x4_t base_lo     = vcvt_f32_f16(vget_low_f16(val));
    const float32x4_t base_hi     = vcvt_f32_f16(vget_high_f16(val));
    const float32x4_t log_lo      = vlogq_f32(base_lo);
    const float32x4_t log_hi      = vlogq_f32(base_hi);
    const float32x4_t power_lo    = vexpq_f32(vmulq_f32(exponent_lo, log_lo));
    const float32x4_t power_hi    = vexpq_f32(vmulq_f32(exponent_hi, log_hi));
    return vcombine_f16(vcvt_f16_f32(power_lo), vcvt_f16_f32(power_hi));
 }
 #endif /* DOXYGEN_SKIP_THIS */
 #endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
 #endif
 #endif /* __ARM_COMPUTE_NEMATH_H__ */
--- a/libraries/cmsis/dsp/ComputeLibrary/LICENSE.txt
+++ b/libraries/cmsis/dsp/ComputeLibrary/LICENSE.txt
@@ -0,0 +1,21 @@
 MIT License
 Copyright (c) 2017-2019 ARM Software
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
 in the Software without restriction, including without limitation the rights
 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 copies of the Software, and to permit persons to whom the Software is
 furnished to do so, subject to the following conditions:
 The above copyright notice and this permission notice shall be included in all
 copies or substantial portions of the Software.
 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 SOFTWARE.
--- a/libraries/cmsis/dsp/ComputeLibrary/README.md
+++ b/libraries/cmsis/dsp/ComputeLibrary/README.md
@@ -0,0 +1,19 @@
 README
 ======
 This folder contains two files imported, with slight modifications, from the ComputeLibrary:
    NEMath.h and arm_cl_tables.c 
 In the original compute library, there are instead two other files:
    NEMath.h and NEMath.inl
 NEMath.inl is included from NEMath.h whereas in this CMSIS DSP implementation, there is no NEMath.inl and its content is copied into NEMath.h
 The tables contained in NEMath.inl have been moved to arm_cl_tables.c. Finally, the files are written in C for the CMSIS DSP library, whereas they are written in C++ in the original Compute Library.
 Otherwise, the features and implementations are the same : a few optimized Neon functions.
 The license covering those files is different: it is an MIT license.
 Other parts of the CMSIS-DSP are covered with an Apache-2.0 license.
--- a/libraries/cmsis/dsp/ComputeLibrary/Source/arm_cl_tables.c
+++ b/libraries/cmsis/dsp/ComputeLibrary/Source/arm_cl_tables.c
@@ -0,0 +1,55 @@
 /*
 * Copyright (c) 2016, 2019 ARM Limited.
 *
 * SPDX-License-Identifier: MIT
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to
 * deal in the Software without restriction, including without limitation the
 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 * sell copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
 #include "arm_math.h"
 #include "NEMath.h"
 #if defined(ARM_MATH_NEON)
 /** Exponent polynomial coefficients
  *
  * Layout: 8 coefficients, each written four times in a row (4*8 = 32 floats),
  * so coefficient k occupies exp_tab[4*k] .. exp_tab[4*k + 3].
  * NOTE(review): the four-fold replication presumably lets one float32x4_t
  * load yield a coefficient in every lane - confirm against the consumer in
  * NEMath.h.
  * NOTE(review): row by row the values are close to (not exactly) 1/n! for
  * n = 0, 4, 2, 6, 1, 5, 3, 7; the row order must match the order in which the
  * polynomial evaluation indexes the table.
  * Only compiled when ARM_MATH_NEON is defined.
  */
 const float32_t exp_tab[4*8] =
 {
        1.f,1.f,1.f,1.f,
        0.0416598916054f,0.0416598916054f,0.0416598916054f,0.0416598916054f,
        0.500000596046f,0.500000596046f,0.500000596046f,0.500000596046f,
        0.0014122662833f,0.0014122662833f,0.0014122662833f,0.0014122662833f,
        1.00000011921f,1.00000011921f,1.00000011921f,1.00000011921f,
        0.00833693705499f,0.00833693705499f,0.00833693705499f,0.00833693705499f,
        0.166665703058f,0.166665703058f,0.166665703058f,0.166665703058f,
        0.000195780929062f,0.000195780929062f,0.000195780929062f,0.000195780929062f
 };
 /** Logarithm polynomial coefficients
  *
  * Layout: 8 coefficients, each written four times in a row (4*8 = 32 floats),
  * so coefficient k occupies log_tab[4*k] .. log_tab[4*k + 3].
  * NOTE(review): the four-fold replication presumably lets one float32x4_t
  * load yield a coefficient in every lane, and the row order presumably
  * matches the indexing used by the polynomial evaluation in NEMath.h -
  * confirm against that consumer before reordering anything.
  * Only compiled when ARM_MATH_NEON is defined.
  */
 const float32_t log_tab[4*8] =
 {
        -2.29561495781f,-2.29561495781f,-2.29561495781f,-2.29561495781f,
        -2.47071170807f,-2.47071170807f,-2.47071170807f,-2.47071170807f,
        -5.68692588806f,-5.68692588806f,-5.68692588806f,-5.68692588806f,
        -0.165253549814f,-0.165253549814f,-0.165253549814f,-0.165253549814f,
        5.17591238022f,5.17591238022f,5.17591238022f,5.17591238022f,
        0.844007015228f,0.844007015228f,0.844007015228f,0.844007015228f,
        4.58445882797f,4.58445882797f,4.58445882797f,4.58445882797f,
        0.0141278216615f,0.0141278216615f,0.0141278216615f,0.0141278216615f
 };
 #endif
--- a/libraries/cmsis/dsp/PrivateInclude/arm_sorting.h
+++ b/libraries/cmsis/dsp/PrivateInclude/arm_sorting.h
@@ -0,0 +1,200 @@
 /******************************************************************************
 * @file     arm_sorting.h
 * @brief    Private header file for CMSIS DSP Library
 * @version  V1.7.0
 * @date     2019
 ******************************************************************************/
 /*
 * Copyright (c) 2010-2019 Arm Limited or its affiliates. All rights reserved.
 *
 * SPDX-License-Identifier: Apache-2.0
 *
 * Licensed under the Apache License, Version 2.0 (the License); you may
 * not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an AS IS BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 #ifndef _ARM_SORTING_H_
 #define _ARM_SORTING_H_
 #include "arm_math.h"
 #ifdef   __cplusplus
 extern "C"
 {
 #endif
  /**
   * @brief      Private prototype of the float32 bubble-sort routine
   *             (declaration only; the implementation is not in this header).
   * @param[in]  S          points to an instance of the sorting structure.
   * @param[in]  pSrc       points to the block of input data.
   *                        NOTE(review): declared non-const although tagged [in];
   *                        confirm whether the implementation writes to it.
   * @param[out] pDst       points to the block of output data.
   * @param[in]  blockSize  number of samples to process.
   */
  void arm_bubble_sort_f32(
    const arm_sort_instance_f32 * S,
          float32_t * pSrc,
          float32_t * pDst,
    uint32_t blockSize);
   /**
   * @brief      Private prototype of the float32 heap-sort routine
   *             (declaration only; the implementation is not in this header).
   * @param[in]  S          points to an instance of the sorting structure.
   * @param[in]  pSrc       points to the block of input data.
   *                        NOTE(review): declared non-const although tagged [in];
   *                        confirm whether the implementation writes to it.
   * @param[out] pDst       points to the block of output data.
   * @param[in]  blockSize  number of samples to process.
   */
  void arm_heap_sort_f32(
    const arm_sort_instance_f32 * S,
          float32_t * pSrc,
          float32_t * pDst,
    uint32_t blockSize);
  /**
   * @brief      Private prototype of the float32 insertion-sort routine
   *             (declaration only; the implementation is not in this header).
   * @param[in]  S          points to an instance of the sorting structure.
   * @param[in]  pSrc       points to the block of input data.
   *                        NOTE(review): declared non-const although tagged [in];
   *                        confirm whether the implementation writes to it.
   * @param[out] pDst       points to the block of output data.
   * @param[in]  blockSize  number of samples to process.
   */
  void arm_insertion_sort_f32(
    const arm_sort_instance_f32 * S,
          float32_t *pSrc,
          float32_t* pDst,
    uint32_t blockSize);
  /**
   * @brief      Private prototype of the float32 quick-sort routine
   *             (declaration only; the implementation is not in this header).
   * @param[in]  S          points to an instance of the sorting structure.
   * @param[in]  pSrc       points to the block of input data.
   *                        NOTE(review): declared non-const although tagged [in];
   *                        confirm whether the implementation writes to it.
   * @param[out] pDst       points to the block of output data.
   * @param[in]  blockSize  number of samples to process.
   */
  void arm_quick_sort_f32(
    const arm_sort_instance_f32 * S,
          float32_t * pSrc,
          float32_t * pDst,
    uint32_t blockSize);
  /**
   * @brief      Private prototype of the float32 selection-sort routine
   *             (declaration only; the implementation is not in this header).
   * @param[in]  S          points to an instance of the sorting structure.
   * @param[in]  pSrc       points to the block of input data.
   *                        NOTE(review): declared non-const although tagged [in];
   *                        confirm whether the implementation writes to it.
   * @param[out] pDst       points to the block of output data.
   * @param[in]  blockSize  number of samples to process.
   */
  void arm_selection_sort_f32(
    const arm_sort_instance_f32 * S,
          float32_t * pSrc,
          float32_t * pDst,
    uint32_t blockSize);
  /**
   * @brief      Private prototype of the float32 bitonic-sort routine
   *             (declaration only; the implementation is not in this header).
   * @param[in]  S          points to an instance of the sorting structure.
   * @param[in]  pSrc       points to the block of input data.
   *                        NOTE(review): declared non-const although tagged [in];
   *                        confirm whether the implementation writes to it.
   * @param[out] pDst       points to the block of output data.
   * @param[in]  blockSize  number of samples to process.
   */
  void arm_bitonic_sort_f32(
    const arm_sort_instance_f32 * S,
          float32_t * pSrc,
          float32_t * pDst,
          uint32_t blockSize);
 #if defined(ARM_MATH_NEON)
 /*
  * Exchange a.val[1] with b.val[0].
  * Both arguments must be modifiable lvalues whose .val[] elements are
  * float32x4_t. Arguments are parenthesized so that expressions such as *p
  * or arr[i] can be passed safely.
  */
 #define vtrn256_128q(a, b)                       \
 do {                                             \
     float32x4_t vtrn128_temp = (a).val[1];       \
     (a).val[1] = (b).val[0];                     \
     (b).val[0] = vtrn128_temp;                   \
 } while (0)
 /*
  * 64-bit transpose of two float32x4_t lvalues. On exit:
  *   a = { low half of old a,  low half of old b }
  *   b = { high half of old a, high half of old b }
  * All four halves are read before either argument is written.
  * The temporaries carry a macro-specific prefix so they cannot capture a
  * caller variable (the previous names ab/cd/ef/gh could), and the arguments
  * are parenthesized where they are assigned.
  */
 #define vtrn128_64q(a, b)                                          \
 do {                                                               \
     float32x2_t vtrn128_64_a_lo = vget_low_f32(a);                 \
     float32x2_t vtrn128_64_b_lo = vget_low_f32(b);                 \
     float32x2_t vtrn128_64_a_hi = vget_high_f32(a);                \
     float32x2_t vtrn128_64_b_hi = vget_high_f32(b);                \
     (a) = vcombine_f32(vtrn128_64_a_lo, vtrn128_64_b_lo);          \
     (b) = vcombine_f32(vtrn128_64_a_hi, vtrn128_64_b_hi);          \
 } while (0)
 /*
  * 64-bit transpose applied to both 128-bit halves of two lvalues whose
  * .val[0] / .val[1] are float32x4_t. On exit:
  *   a.val[0] = { low(old a.val[0]),  low(old b.val[0])  }
  *   a.val[1] = { low(old a.val[1]),  low(old b.val[1])  }
  *   b.val[0] = { high(old a.val[0]), high(old b.val[0]) }
  *   b.val[1] = { high(old a.val[1]), high(old b.val[1]) }
  * All eight halves are read before anything is written.
  * The temporaries carry a macro-specific prefix so they cannot capture a
  * caller variable (the previous names a_0..b_3 could), and the arguments are
  * parenthesized.
  */
 #define vtrn256_64q(a, b)                                              \
 do {                                                                   \
     float32x2_t vtrn256_64_a0_lo = vget_low_f32((a).val[0]);           \
     float32x2_t vtrn256_64_a0_hi = vget_high_f32((a).val[0]);          \
     float32x2_t vtrn256_64_a1_lo = vget_low_f32((a).val[1]);           \
     float32x2_t vtrn256_64_a1_hi = vget_high_f32((a).val[1]);          \
     float32x2_t vtrn256_64_b0_lo = vget_low_f32((b).val[0]);           \
     float32x2_t vtrn256_64_b0_hi = vget_high_f32((b).val[0]);          \
     float32x2_t vtrn256_64_b1_lo = vget_low_f32((b).val[1]);           \
     float32x2_t vtrn256_64_b1_hi = vget_high_f32((b).val[1]);          \
     (a).val[0] = vcombine_f32(vtrn256_64_a0_lo, vtrn256_64_b0_lo);     \
     (a).val[1] = vcombine_f32(vtrn256_64_a1_lo, vtrn256_64_b1_lo);     \
     (b).val[0] = vcombine_f32(vtrn256_64_a0_hi, vtrn256_64_b0_hi);     \
     (b).val[1] = vcombine_f32(vtrn256_64_a1_hi, vtrn256_64_b1_hi);     \
 } while (0)
 /*
  * 32-bit transpose of two float32x4_t lvalues: runs vtrnq_f32 on (a, b) and
  * writes result element 0 back to a and element 1 back to b.
  */
 #define vtrn128_32q(a, b)                                    \
 do {                                                         \
     float32x4x2_t vtrn128_32_pair = vtrnq_f32((a), (b));     \
     (b) = vtrn128_32_pair.val[1];                            \
     (a) = vtrn128_32_pair.val[0];                            \
 } while (0)
 /*
  * 32-bit transpose applied to both 128-bit halves of two lvalues whose
  * .val[0] / .val[1] are float32x4_t: vtrnq_f32 is run on the .val[0] pair
  * and on the .val[1] pair; element 0 of each result goes back to a and
  * element 1 to b.
  * The arguments themselves are parenthesized (previously the parentheses
  * wrapped "a.val[0]" as a whole, which does not protect an expression
  * argument).
  */
 #define vtrn256_32q(a, b)                                                 \
 do {                                                                      \
     float32x4x2_t vtrn32_tmp_1 = vtrnq_f32((a).val[0], (b).val[0]);       \
     float32x4x2_t vtrn32_tmp_2 = vtrnq_f32((a).val[1], (b).val[1]);       \
     (a).val[0] = vtrn32_tmp_1.val[0];                                     \
     (a).val[1] = vtrn32_tmp_2.val[0];                                     \
     (b).val[0] = vtrn32_tmp_1.val[1];                                     \
     (b).val[1] = vtrn32_tmp_2.val[1];                                     \
 } while (0)
 /*
  * Lane-wise compare-exchange of two float32x4_t lvalues:
  * afterwards a holds vminq_f32 and b holds vmaxq_f32 of the original pair.
  */
 #define vminmaxq(a, b)                                   \
 do {                                                     \
     float32x4_t vminmax_lo = vminq_f32((a), (b));        \
     float32x4_t vminmax_hi = vmaxq_f32((a), (b));        \
     (a) = vminmax_lo;                                    \
     (b) = vminmax_hi;                                    \
 } while (0)
 /*
  * Lane-wise compare-exchange of two lvalues whose .val[0] / .val[1] are
  * float32x4_t: afterwards each a.val[i] holds vminq_f32 and each b.val[i]
  * holds vmaxq_f32 of the original a.val[i] / b.val[i].
  * A copy of a is taken first so the maxima are computed from the original
  * values. Arguments are parenthesized so expression arguments are safe.
  */
 #define vminmax256q(a, b)                                         \
 do {                                                              \
     float32x4x2_t minmax256_tmp = (a);                            \
     (a).val[0] = vminq_f32((a).val[0], (b).val[0]);               \
     (a).val[1] = vminq_f32((a).val[1], (b).val[1]);               \
     (b).val[0] = vmaxq_f32(minmax256_tmp.val[0], (b).val[0]);     \
     (b).val[1] = vmaxq_f32(minmax256_tmp.val[1], (b).val[1]);     \
 } while (0)
 /*
  * Expression macro: builds a vector from vrev64_f32 of the high half of 'a'
  * followed by vrev64_f32 of the low half of 'a', i.e. the four float lanes
  * of 'a' in reverse order.
  * 'a' is expanded twice, so do not pass an expression with side effects.
  */
 #define vrev128q_f32(a) \
        vcombine_f32(vrev64_f32(vget_high_f32(a)), vrev64_f32(vget_low_f32(a)))
 /*
  * In-place reversal of the eight float lanes held in a.val[0] / a.val[1]:
  * each 128-bit half is lane-reversed with vrev128q_f32 and the two halves
  * swap places.
  * 'a' must be a modifiable lvalue; it is parenthesized so expression
  * arguments such as *p are safe.
  */
 #define vrev256q_f32(a)                                    \
 do {                                                       \
     float32x4_t rev_tmp = vrev128q_f32((a).val[0]);        \
     (a).val[0] = vrev128q_f32((a).val[1]);                 \
     (a).val[1] = rev_tmp;                                  \
 } while (0)
 /*
  * Load four floats from p with vld1q_f32 into 'a', then reverse their lane
  * order with vrev128q_f32.
  * The load and the reversal stay separate statements on purpose:
  * vrev128q_f32 expands its argument twice, so nesting the load inside it
  * would perform the load twice. 'a' is parenthesized where it is assigned.
  */
 #define vldrev128q_f32(a, p)       \
 do {                               \
     (a) = vld1q_f32(p);            \
     (a) = vrev128q_f32(a);         \
 } while (0)
 #endif /* ARM_MATH_NEON */
 #ifdef   __cplusplus
 }
 #endif
 #endif /* _ARM_SORTING_H_ */
--- a/libraries/cmsis/dsp/PrivateInclude/arm_vec_fft.h
+++ b/libraries/cmsis/dsp/PrivateInclude/arm_vec_fft.h
@@ -0,0 +1,58 @@
 /******************************************************************************
 * @file     arm_vec_fft.h
 * @brief    Private header file for CMSIS DSP Library
 * @version  V1.7.0
 * @date     07. January 2020
 ******************************************************************************/
 /*
 * Copyright (c) 2010-2020 Arm Limited or its affiliates. All rights reserved.
 *
 * SPDX-License-Identifier: Apache-2.0
 *
 * Licensed under the Apache License, Version 2.0 (the License); you may
 * not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an AS IS BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 #ifndef _ARM_VEC_FFT_H_
 #define _ARM_VEC_FFT_H_
 #include "arm_math.h"
 #include "arm_helium_utils.h"
 #ifdef   __cplusplus
 extern "C"
 {
 #endif
 #if (defined(ARM_MATH_MVEF) || defined(ARM_MATH_MVEI) || defined(ARM_MATH_HELIUM)) && !defined(ARM_MATH_AUTOVECTORIZE)
 /*
  * Shorthand for the MVE complex-arithmetic intrinsic sequences used by the
  * vectorized FFT code. Every macro expands to a single expression.
  *
  *  - *_ADD_A_ixB / *_SUB_A_ixB  : vcaddq with a 90 / 270 degree rotation
  *    (the _FX_ variants use the halving form vhcaddq).
  *  - MULT_FLT_*                 : vcmulq followed by an accumulating vcmlaq
  *    with a 90 / 270 degree rotation.
  *  - MULT_FX_*                  : vqdmlsdh(x)q / vqdmladh(x)q pair; the
  *    inner call starts from vuninitializedq_s32() cast to the type of A.
  *
  * The two MULT_FX macros no longer end in ';' so they can be used inside
  * larger expressions and in unbraced if/else bodies; call sites terminate
  * the statement themselves.
  */
 #define MVE_CMPLX_ADD_A_ixB(A, B)           vcaddq_rot90(A,B)
 #define MVE_CMPLX_SUB_A_ixB(A,B)            vcaddq_rot270(A,B)
 #define MVE_CMPLX_MULT_FLT_AxB(A,B)         vcmlaq_rot90(vcmulq(A, B), A, B)
 #define MVE_CMPLX_MULT_FLT_Conj_AxB(A,B)    vcmlaq_rot270(vcmulq(A, B), A, B)
 #define MVE_CMPLX_MULT_FX_AxB(A,B)          vqdmladhxq(vqdmlsdhq((__typeof(A))vuninitializedq_s32(), A, B), A, B)
 #define MVE_CMPLX_MULT_FX_AxConjB(A,B)      vqdmladhq(vqdmlsdhxq((__typeof(A))vuninitializedq_s32(), A, B), A, B)
 #define MVE_CMPLX_ADD_FX_A_ixB(A, B)        vhcaddq_rot90(A,B)
 #define MVE_CMPLX_SUB_FX_A_ixB(A,B)         vhcaddq_rot270(A,B)
 #endif /* (defined(ARM_MATH_MVEF) || defined(ARM_MATH_MVEI) || defined(ARM_MATH_HELIUM)) && !defined(ARM_MATH_AUTOVECTORIZE) */
 #ifdef   __cplusplus
 }
 #endif
 #endif /* _ARM_VEC_FFT_H_ */
--- a/libraries/cmsis/dsp/PrivateInclude/arm_vec_filtering.h
+++ b/libraries/cmsis/dsp/PrivateInclude/arm_vec_filtering.h
--- a/libraries/cmsis/dsp/Source/BasicMathFunctions/BasicMathFunctions.c
+++ b/libraries/cmsis/dsp/Source/BasicMathFunctions/BasicMathFunctions.c
@@ -0,0 +1,75 @@
 /* ----------------------------------------------------------------------
 * Project:      CMSIS DSP Library
 * Title:        BasicMathFunctions.c
 * Description:  Combination of all basic math function source files.
 *
 * $Date:        16. March 2020
 * $Revision:    V1.1.0
 *
 * Target Processor: Cortex-M cores
 * -------------------------------------------------------------------- */
 /*
 * Copyright (C) 2019-2020 ARM Limited or its affiliates. All rights reserved.
 *
 * SPDX-License-Identifier: Apache-2.0
 *
 * Licensed under the Apache License, Version 2.0 (the License); you may
 * not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an AS IS BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 #include "arm_abs_f32.c"
 #include "arm_abs_q15.c"
 #include "arm_abs_q31.c"
 #include "arm_abs_q7.c"
 #include "arm_add_f32.c"
 #include "arm_add_q15.c"
 #include "arm_add_q31.c"
 #include "arm_add_q7.c"
 #include "arm_and_u16.c"
 #include "arm_and_u32.c"
 #include "arm_and_u8.c"
 #include "arm_dot_prod_f32.c"
 #include "arm_dot_prod_q15.c"
 #include "arm_dot_prod_q31.c"
 #include "arm_dot_prod_q7.c"
 #include "arm_mult_f32.c"
 #include "arm_mult_q15.c"
 #include "arm_mult_q31.c"
 #include "arm_mult_q7.c"
 #include "arm_negate_f32.c"
 #include "arm_negate_q15.c"
 #include "arm_negate_q31.c"
 #include "arm_negate_q7.c"
 #include "arm_not_u16.c"
 #include "arm_not_u32.c"
 #include "arm_not_u8.c"
 #include "arm_offset_f32.c"
 #include "arm_offset_q15.c"
 #include "arm_offset_q31.c"
 #include "arm_offset_q7.c"
 #include "arm_or_u16.c"
 #include "arm_or_u32.c"
 #include "arm_or_u8.c"
 #include "arm_scale_f32.c"
 #include "arm_scale_q15.c"
 #include "arm_scale_q31.c"
 #include "arm_scale_q7.c"
 #include "arm_shift_q15.c"
 #include "arm_shift_q31.c"
 #include "arm_shift_q7.c"
 #include "arm_sub_f32.c"
 #include "arm_sub_q15.c"
 #include "arm_sub_q31.c"
 #include "arm_sub_q7.c"
 #include "arm_xor_u16.c"
 #include "arm_xor_u32.c"
 #include "arm_xor_u8.c"
--- a/libraries/cmsis/dsp/Source/BasicMathFunctions/CMakeLists.txt
+++ b/libraries/cmsis/dsp/Source/BasicMathFunctions/CMakeLists.txt
@@ -0,0 +1,19 @@
 # Builds the CMSISDSPBasicMath static library from the C sources in this
 # directory.
 cmake_minimum_required (VERSION 3.6)
 project(CMSISDSPBasicMath)
 # configLib / configDsp are project-provided modules; they are not defined in
 # this file, so they must be reachable through CMAKE_MODULE_PATH set by the
 # including project.
 include(configLib)
 include(configDsp)
 # Collect every .c file whose name contains an underscore.
 # NOTE(review): this pattern skips BasicMathFunctions.c, which #includes the
 # individual arm_*.c files - presumably to avoid compiling each function
 # twice; confirm.
 # The glob is evaluated at configure time only: re-run CMake after adding or
 # removing source files.
 file(GLOB SRC "./*_*.c")
 add_library(CMSISDSPBasicMath STATIC ${SRC})
 # Apply the shared library / DSP configuration. ROOT is expected to be set by
 # the including project (it is not defined here).
 configLib(CMSISDSPBasicMath ${ROOT})
 configDsp(CMSISDSPBasicMath ${ROOT})
 ### Includes
 # DSP is expected to be set by the including project (it is not defined here).
 target_include_directories(CMSISDSPBasicMath PUBLIC "${DSP}/Include")
--- a/libraries/cmsis/dsp/Source/BasicMathFunctions/arm_abs_f32.c
+++ b/libraries/cmsis/dsp/Source/BasicMathFunctions/arm_abs_f32.c
@@ -0,0 +1,196 @@
 /* ----------------------------------------------------------------------
 * Project:      CMSIS DSP Library
 * Title:        arm_abs_f32.c
 * Description:  Floating-point vector absolute value
 *
 * $Date:        18. March 2019
 * $Revision:    V1.6.0
 *
 * Target Processor: Cortex-M cores
 * -------------------------------------------------------------------- */
 /*
 * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
 *
 * SPDX-License-Identifier: Apache-2.0
 *
 * Licensed under the Apache License, Version 2.0 (the License); you may
 * not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an AS IS BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 #include "arm_math.h"
 #include <math.h>
 /**
  @ingroup groupMath
 */
 /**
  @defgroup BasicAbs Vector Absolute Value
  Computes the absolute value of a vector on an element-by-element basis.
  <pre>
      pDst[n] = abs(pSrc[n]),   0 <= n < blockSize.
  </pre>
  The functions support in-place computation allowing the source and
  destination pointers to reference the same memory buffer.
  There are separate functions for floating-point, Q7, Q15, and Q31 data types.
 */
 /**
  @addtogroup BasicAbs
  @{
 */
 /**
  @brief         Floating-point vector absolute value.
  @param[in]     pSrc       points to the input vector
  @param[out]    pDst       points to the output vector
  @param[in]     blockSize  number of samples in each vector
  @return        none
 */
 #if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE)
 #include "arm_helium_utils.h"
 /*
  * Helium (MVE) implementation: processes four floats per iteration and
  * handles the final 1-3 samples with a predicated load/store pair, so no
  * memory outside pSrc[0 .. blockSize-1] / pDst[0 .. blockSize-1] is accessed.
  */
 void arm_abs_f32(
   const float32_t * pSrc,
         float32_t * pDst,
         uint32_t blockSize)
 {
     uint32_t blkCnt;                               /* Loop counter */
     f32x4_t vec1;
     f32x4_t res;

     /* Compute 4 outputs at a time */
     blkCnt = blockSize >> 2U;
     while (blkCnt > 0U)
     {
         /* C = |A| */
         /* Calculate absolute values and then store the results in the destination buffer. */
         vec1 = vld1q(pSrc);
         res = vabsq(vec1);
         vst1q(pDst, res);

         /* Increment pointers */
         pSrc += 4;
         pDst += 4;

         /* Decrement the loop counter */
         blkCnt--;
     }

     /* Tail */
     blkCnt = blockSize & 0x3;
     if (blkCnt > 0U)
     {
       /* C = |A| */
       /* p0 enables only the first blkCnt lanes */
       mve_pred16_t p0 = vctp32q(blkCnt);

       /* Predicated (zeroing) load: disabled lanes are not read from memory,
          which avoids reading up to three floats past the end of pSrc. */
       vec1 = vld1q_z(pSrc, p0);
       vstrwq_p(pDst, vabsq(vec1), p0);
     }
 }
 #else
 /*
  * Scalar / NEON implementation.
  * Depending on the build flags the bulk of the data is handled four samples
  * at a time (NEON intrinsics or a manually unrolled loop); whatever is left
  * is processed one sample at a time by the final loop.
  */
 void arm_abs_f32(
   const float32_t * pSrc,
         float32_t * pDst,
         uint32_t blockSize)
 {
         uint32_t blkCnt;                               /* Remaining iterations of the current loop */

 #if defined(ARM_MATH_NEON) && !defined(ARM_MATH_AUTOVECTORIZE)
     /* NEON: load four floats, take |x| per lane, store four floats */
     for (blkCnt = blockSize >> 2U; blkCnt > 0U; blkCnt--)
     {
         /* C = |A| */
         vst1q_f32(pDst, vabsq_f32(vld1q_f32(pSrc)));

         pSrc += 4;
         pDst += 4;
     }

     /* Samples left over for the scalar loop */
     blkCnt = blockSize & 0x3;
 #else
 #if defined (ARM_MATH_LOOPUNROLL) && !defined(ARM_MATH_AUTOVECTORIZE)
   /* Manually unrolled: four outputs per iteration, in ascending order */
   for (blkCnt = blockSize >> 2U; blkCnt > 0U; blkCnt--)
   {
     /* C = |A| */
     pDst[0] = fabsf(pSrc[0]);
     pDst[1] = fabsf(pSrc[1]);
     pDst[2] = fabsf(pSrc[2]);
     pDst[3] = fabsf(pSrc[3]);

     pSrc += 4;
     pDst += 4;
   }

   /* Samples left over for the scalar loop */
   blkCnt = blockSize % 0x4U;
 #else
   /* No unrolling: the scalar loop handles every sample */
   blkCnt = blockSize;
 #endif /* #if defined (ARM_MATH_LOOPUNROLL) */
 #endif /* #if defined(ARM_MATH_NEON) */

   /* Scalar loop: one output per iteration */
   for (; blkCnt > 0U; blkCnt--)
   {
     /* C = |A| */
     *pDst++ = fabsf(*pSrc++);
   }
 }
 #endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
 /**
  @} end of BasicAbs group
 */
--- a/libraries/cmsis/dsp/Source/BasicMathFunctions/arm_abs_q15.c
+++ b/libraries/cmsis/dsp/Source/BasicMathFunctions/arm_abs_q15.c
@@ -0,0 +1,178 @@
 /* ----------------------------------------------------------------------
 * Project:      CMSIS DSP Library
 * Title:        arm_abs_q15.c
 * Description:  Q15 vector absolute value
 *
 * $Date:        18. March 2019
 * $Revision:    V1.6.0
 *
 * Target Processor: Cortex-M cores
 * -------------------------------------------------------------------- */
 /*
 * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
 *
 * SPDX-License-Identifier: Apache-2.0
 *
 * Licensed under the Apache License, Version 2.0 (the License); you may
 * not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an AS IS BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 #include "arm_math.h"
 /**
  @ingroup groupMath
 */
 /**
  @addtogroup BasicAbs
  @{
 */
 /**
  @brief         Q15 vector absolute value.
  @param[in]     pSrc       points to the input vector
  @param[out]    pDst       points to the output vector
  @param[in]     blockSize  number of samples in each vector
  @return        none
  @par           Scaling and Overflow Behavior
                   The function uses saturating arithmetic.
                   The Q15 value -1 (0x8000) will be saturated to the maximum allowable positive value 0x7FFF.
 */
 #if defined(ARM_MATH_MVEI)
 #include "arm_helium_utils.h"
 /*
  * Helium (MVE) implementation: processes eight Q15 samples per iteration
  * with the saturating vqabsq and handles the final 1-7 samples with a
  * predicated load/store pair, so no memory outside the caller's buffers is
  * accessed.
  */
 void arm_abs_q15(
     const q15_t * pSrc,
     q15_t * pDst,
     uint32_t blockSize)
 {
     uint32_t  blkCnt;           /* loop counters */
     q15x8_t vecSrc;

     /* Compute 8 outputs at a time */
     blkCnt = blockSize >> 3;
     while (blkCnt > 0U)
     {
         /*
          * C = |A|
          * Calculate absolute and then store the results in the destination buffer.
          */
         vecSrc = vld1q(pSrc);
         vst1q(pDst, vqabsq(vecSrc));
         /*
          * Decrement the blockSize loop counter
          */
         blkCnt--;
         /*
          * advance vector source and destination pointers
          */
         pSrc += 8;
         pDst += 8;
     }
     /*
      * tail
      */
     blkCnt = blockSize & 7;
     if (blkCnt > 0U)
     {
         /* p0 enables only the first blkCnt 16-bit lanes */
         mve_pred16_t p0 = vctp16q(blkCnt);

         /* Predicated (zeroing) load: disabled lanes are not read from memory,
            which avoids reading up to seven samples past the end of pSrc. */
         vecSrc = vld1q_z(pSrc, p0);
         vstrhq_p(pDst, vqabsq(vecSrc), p0);
     }
 }
 #else
 /*
  * Saturating absolute value of one Q15 sample (used only by the function
  * below and undefined again after it). Positive inputs are returned as-is;
  * otherwise the DSP build uses __QSUB16(0, x) and the portable build maps
  * 0x8000 to 0x7fff and negates everything else.
  */
 #if defined (ARM_MATH_DSP)
 #define ARM_ABS_Q15_SAT(x)  (((x) > 0) ? (x) : (q15_t)__QSUB16(0, (x)))
 #else
 #define ARM_ABS_Q15_SAT(x)  (((x) > 0) ? (x) : (((x) == (q15_t) 0x8000) ? 0x7fff : -(x)))
 #endif

 /*
  * Scalar implementation: optionally unrolled four samples per iteration,
  * followed by a one-sample-at-a-time loop for whatever is left.
  */
 void arm_abs_q15(
   const q15_t * pSrc,
         q15_t * pDst,
         uint32_t blockSize)
 {
         uint32_t blkCnt;                               /* Remaining iterations of the current loop */
         q15_t in;                                      /* Current input sample */

 #if defined (ARM_MATH_LOOPUNROLL)
   /* Manually unrolled: four outputs per iteration */
   for (blkCnt = blockSize >> 2U; blkCnt > 0U; blkCnt--)
   {
     /* C = |A| (-1 saturates to 0x7fff) */
     in = *pSrc++;
     *pDst++ = ARM_ABS_Q15_SAT(in);

     in = *pSrc++;
     *pDst++ = ARM_ABS_Q15_SAT(in);

     in = *pSrc++;
     *pDst++ = ARM_ABS_Q15_SAT(in);

     in = *pSrc++;
     *pDst++ = ARM_ABS_Q15_SAT(in);
   }

   /* Samples left over for the scalar loop */
   blkCnt = blockSize % 0x4U;
 #else
   /* No unrolling: the scalar loop handles every sample */
   blkCnt = blockSize;
 #endif /* #if defined (ARM_MATH_LOOPUNROLL) */

   /* Scalar loop: one output per iteration */
   for (; blkCnt > 0U; blkCnt--)
   {
     /* C = |A| (-1 saturates to 0x7fff) */
     in = *pSrc++;
     *pDst++ = ARM_ABS_Q15_SAT(in);
   }
 }
 #undef ARM_ABS_Q15_SAT
 #endif /* defined(ARM_MATH_MVEI) */
 /**
  @} end of BasicAbs group
 */
--- a/libraries/cmsis/dsp/Source/BasicMathFunctions/arm_abs_q31.c
+++ b/libraries/cmsis/dsp/Source/BasicMathFunctions/arm_abs_q31.c
@@ -0,0 +1,208 @@
 /* ----------------------------------------------------------------------
 * Project:      CMSIS DSP Library
 * Title:        arm_abs_q31.c
 * Description:  Q31 vector absolute value
 *
 * $Date:        18. March 2019
 * $Revision:    V1.6.0
 *
 * Target Processor: Cortex-M cores
 * -------------------------------------------------------------------- */
 /*
 * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
 *
 * SPDX-License-Identifier: Apache-2.0
 *
 * Licensed under the Apache License, Version 2.0 (the License); you may
 * not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an AS IS BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 #include "arm_math.h"
 /**
  @ingroup groupMath
 */
 /**
  @addtogroup BasicAbs
  @{
 */
 /**
  @brief         Q31 vector absolute value.
  @param[in]     pSrc       points to the input vector
  @param[out]    pDst       points to the output vector
  @param[in]     blockSize  number of samples in each vector
  @return        none
  @par           Scaling and Overflow Behavior
                   The function uses saturating arithmetic.
                   The Q31 value -1 (0x80000000) will be saturated to the maximum allowable positive value 0x7FFFFFFF.
 */
 #if defined(ARM_MATH_MVEI)
 #include "arm_helium_utils.h"
 /*
  * Helium (MVE) implementation: processes four Q31 samples per iteration
  * with the saturating vqabsq and handles the final 1-3 samples with a
  * predicated load/store pair, so no memory outside the caller's buffers is
  * accessed.
  */
 void arm_abs_q31(
     const q31_t * pSrc,
     q31_t * pDst,
     uint32_t blockSize)
 {
     uint32_t  blkCnt;           /* Loop counters */
     q31x4_t vecSrc;

     /* Compute 4 outputs at a time */
     blkCnt = blockSize >> 2;
     while (blkCnt > 0U)
     {
         /*
          * C = |A|
          * Calculate absolute and then store the results in the destination buffer.
          */
         vecSrc = vld1q(pSrc);
         vst1q(pDst, vqabsq(vecSrc));
         /*
          * Decrement the blockSize loop counter
          */
         blkCnt--;
         /*
          * Advance vector source and destination pointers
          */
         pSrc += 4;
         pDst += 4;
     }
     /*
      * Tail
      */
     blkCnt = blockSize & 3;
     if (blkCnt > 0U)
     {
         /* p0 enables only the first blkCnt 32-bit lanes */
         mve_pred16_t p0 = vctp32q(blkCnt);

         /* Predicated (zeroing) load: disabled lanes are not read from memory,
            which avoids reading up to three samples past the end of pSrc. */
         vecSrc = vld1q_z(pSrc, p0);
         vstrwq_p(pDst, vqabsq(vecSrc), p0);
     }
 }
 #else
 /*
  * Saturating absolute value of one Q31 sample (used only by the function
  * below and undefined again after it). Positive inputs are returned as-is;
  * otherwise the DSP build uses __QSUB(0, x) and the portable build maps
  * INT32_MIN to INT32_MAX and negates everything else.
  */
 #if defined (ARM_MATH_DSP)
 #define ARM_ABS_Q31_SAT(x)  (((x) > 0) ? (x) : (q31_t)__QSUB(0, (x)))
 #else
 #define ARM_ABS_Q31_SAT(x)  (((x) > 0) ? (x) : (((x) == INT32_MIN) ? INT32_MAX : -(x)))
 #endif

 /*
  * Scalar / NEON implementation.
  * Depending on the build flags the bulk of the data is handled four samples
  * at a time (NEON intrinsics or a manually unrolled loop); whatever is left
  * is processed one sample at a time by the final loop.
  */
 void arm_abs_q31(
   const q31_t * pSrc,
         q31_t * pDst,
         uint32_t blockSize)
 {
         uint32_t blkCnt;                               /* Remaining iterations of the current loop */
         q31_t in;                                      /* Current input sample */

 #if defined(ARM_MATH_NEON)
     /* NEON: load four samples, saturating |x| per lane, store four samples */
     for (blkCnt = blockSize >> 2U; blkCnt > 0U; blkCnt--)
     {
         /* C = |A| */
         vst1q_s32(pDst, vqabsq_s32(vld1q_s32(pSrc)));

         pSrc += 4;
         pDst += 4;
     }

     /* Samples left over for the scalar loop */
     blkCnt = blockSize & 0x3;
 #else
 #if defined (ARM_MATH_LOOPUNROLL)
   /* Manually unrolled: four outputs per iteration */
   for (blkCnt = blockSize >> 2U; blkCnt > 0U; blkCnt--)
   {
     /* C = |A| (-1 saturates to 0x7fffffff) */
     in = *pSrc++;
     *pDst++ = ARM_ABS_Q31_SAT(in);

     in = *pSrc++;
     *pDst++ = ARM_ABS_Q31_SAT(in);

     in = *pSrc++;
     *pDst++ = ARM_ABS_Q31_SAT(in);

     in = *pSrc++;
     *pDst++ = ARM_ABS_Q31_SAT(in);
   }

   /* Samples left over for the scalar loop */
   blkCnt = blockSize % 0x4U;
 #else
   /* No unrolling: the scalar loop handles every sample */
   blkCnt = blockSize;
 #endif /* #if defined (ARM_MATH_LOOPUNROLL) */
 #endif /* #if defined (ARM_MATH_NEON) */

   /* Scalar loop: one output per iteration */
   for (; blkCnt > 0U; blkCnt--)
   {
     /* C = |A| (-1 saturates to 0x7fffffff) */
     in = *pSrc++;
     *pDst++ = ARM_ABS_Q31_SAT(in);
   }
 }
 #undef ARM_ABS_Q31_SAT
 #endif /* #if defined (ARM_MATH_MVEI) */
 /**
  @} end of BasicAbs group
 */
--- a/libraries/cmsis/dsp/Source/BasicMathFunctions/arm_abs_q7.c
+++ b/libraries/cmsis/dsp/Source/BasicMathFunctions/arm_abs_q7.c
@@ -0,0 +1,180 @@
 /* ----------------------------------------------------------------------
 * Project:      CMSIS DSP Library
 * Title:        arm_abs_q7.c
 * Description:  Q7 vector absolute value
 *
 * $Date:        18. March 2019
 * $Revision:    V1.6.0
 *
 * Target Processor: Cortex-M cores
 * -------------------------------------------------------------------- */
 /*
 * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
 *
 * SPDX-License-Identifier: Apache-2.0
 *
 * Licensed under the Apache License, Version 2.0 (the License); you may
 * not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an AS IS BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 #include "arm_math.h"
 /**
  @ingroup groupMath
 */
 /**
  @addtogroup BasicAbs
  @{
 */
 /**
  @brief         Q7 vector absolute value.
  @param[in]     pSrc       points to the input vector
  @param[out]    pDst       points to the output vector
  @param[in]     blockSize  number of samples in each vector
  @return        none
  @par           Conditions for optimum performance
                   Input and output buffers should be 32-bit aligned
  @par           Scaling and Overflow Behavior
                   The function uses saturating arithmetic.
                   The Q7 value -1 (0x80) will be saturated to the maximum allowable positive value 0x7F.
 */
 #if defined(ARM_MATH_MVEI)
 #include "arm_helium_utils.h"
 /*
  * Helium (MVE) variant: handles 16 Q7 samples per iteration with the
  * saturating absolute-value intrinsic, then finishes the remaining
  * (blockSize & 0xF) samples under a tail predicate.
  */
 void arm_abs_q7(
    const q7_t * pSrc,
    q7_t * pDst,
    uint32_t blockSize)
 {
    uint32_t  blkCnt;           /* loop counters */
    q7x16_t vecSrc;
    /* Compute 16 outputs at a time */
    blkCnt = blockSize >> 4;
    while (blkCnt > 0U)
    {
        /*
         * C = |A|
         * Calculate absolute and then store the results in the destination buffer.
         */
        vecSrc = vld1q(pSrc);
        vst1q(pDst, vqabsq(vecSrc));
        /*
         * Decrement the blockSize loop counter
         */
        blkCnt--;
        /*
         * advance vector source and destination pointers
         */
        pSrc += 16;
        pDst += 16;
    }
    /*
     * tail
     */
    blkCnt = blockSize & 0xF;
    if (blkCnt > 0U)
    {
        /* Predicate with only the first blkCnt byte lanes active */
        mve_pred16_t p0 = vctp8q(blkCnt);
        /*
         * Predicated (zeroing) load: inactive lanes are not fetched, so the
         * tail never reads past the end of the source buffer.
         */
        vecSrc = vld1q_z(pSrc, p0);
        vstrbq_p(pDst, vqabsq(vecSrc), p0);
    }
 }
 #else
 /*
  * Saturating absolute value of one Q7 sample. The argument is expanded more
  * than once, so only pass a plain variable. With ARM_MATH_DSP the negation is
  * done by __QSUB8(0, x); otherwise 0x80 is mapped to 0x7f explicitly.
  */
 #if defined (ARM_MATH_DSP)
 #define ARM_ABS_Q7_SAMPLE(x)  (((x) > 0) ? (x) : (q7_t)__QSUB8(0, (x)))
 #else
 #define ARM_ABS_Q7_SAMPLE(x)  (((x) > 0) ? (x) : (((x) == (q7_t) 0x80) ? (q7_t) 0x7f : -(x)))
 #endif
 /*
  * Scalar variant: optionally unrolled by four (ARM_MATH_LOOPUNROLL), followed
  * by a one-sample-at-a-time loop for whatever is left.
  */
 void arm_abs_q7(
   const q7_t * pSrc,
         q7_t * pDst,
         uint32_t blockSize)
 {
         uint32_t remaining;                            /* Samples (or groups of 4) still to process */
         q7_t sample;                                   /* Current input sample */
 #if defined (ARM_MATH_LOOPUNROLL)
   /* Unrolled part: four outputs per iteration, C = |A| */
   for (remaining = blockSize >> 2U; remaining > 0U; remaining--)
   {
     sample = *pSrc++;
     *pDst++ = ARM_ABS_Q7_SAMPLE(sample);
     sample = *pSrc++;
     *pDst++ = ARM_ABS_Q7_SAMPLE(sample);
     sample = *pSrc++;
     *pDst++ = ARM_ABS_Q7_SAMPLE(sample);
     sample = *pSrc++;
     *pDst++ = ARM_ABS_Q7_SAMPLE(sample);
   }
   /* Samples that did not fill a complete group of four */
   remaining = blockSize % 0x4U;
 #else
   /* No unrolling: every sample goes through the loop below */
   remaining = blockSize;
 #endif /* #if defined (ARM_MATH_LOOPUNROLL) */
   for (; remaining > 0U; remaining--)
   {
     /* C = |A| (an input of 0x80 saturates to 0x7f) */
     sample = *pSrc++;
     *pDst++ = ARM_ABS_Q7_SAMPLE(sample);
   }
 }
 #undef ARM_ABS_Q7_SAMPLE
 #endif /* defined(ARM_MATH_MVEI) */
 /**
  @} end of BasicAbs group
 */
--- a/libraries/cmsis/dsp/Source/BasicMathFunctions/arm_add_f32.c
+++ b/libraries/cmsis/dsp/Source/BasicMathFunctions/arm_add_f32.c
@@ -0,0 +1,199 @@
 /* ----------------------------------------------------------------------
 * Project:      CMSIS DSP Library
 * Title:        arm_add_f32.c
 * Description:  Floating-point vector addition
 *
 * $Date:        18. March 2019
 * $Revision:    V1.6.0
 *
 * Target Processor: Cortex-M cores
 * -------------------------------------------------------------------- */
 /*
 * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
 *
 * SPDX-License-Identifier: Apache-2.0
 *
 * Licensed under the Apache License, Version 2.0 (the License); you may
 * not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an AS IS BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 #include "arm_math.h"
 /**
  @ingroup groupMath
 */
 /**
  @defgroup BasicAdd Vector Addition
  Element-by-element addition of two vectors.
  <pre>
      pDst[n] = pSrcA[n] + pSrcB[n],   0 <= n < blockSize.
  </pre>
  There are separate functions for floating-point, Q7, Q15, and Q31 data types.
 */
 /**
  @addtogroup BasicAdd
  @{
 */
 /**
  @brief         Floating-point vector addition.
  @param[in]     pSrcA      points to first input vector
  @param[in]     pSrcB      points to second input vector
  @param[out]    pDst       points to output vector
  @param[in]     blockSize  number of samples in each vector
  @return        none
 */
 #if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE)
 #include "arm_helium_utils.h"
 /*
  * Helium (MVE) variant: adds four float32 lanes per iteration, then finishes
  * the remaining (blockSize & 0x3) samples under a tail predicate.
  */
 void arm_add_f32(
   const float32_t * pSrcA,
   const float32_t * pSrcB,
         float32_t * pDst,
         uint32_t blockSize)
 {
    uint32_t blkCnt;                               /* Loop counter */
    f32x4_t vec1;
    f32x4_t vec2;
    f32x4_t res;
    /* Compute 4 outputs at a time */
    blkCnt = blockSize >> 2U;
    while (blkCnt > 0U)
    {
        /* C = A + B */
        /* Add and then store the results in the destination buffer. */
        vec1 = vld1q(pSrcA);
        vec2 = vld1q(pSrcB);
        res = vaddq(vec1, vec2);
        vst1q(pDst, res);
        /* Increment pointers */
        pSrcA += 4;
        pSrcB += 4;
        pDst += 4;
        /* Decrement the loop counter */
        blkCnt--;
    }
    /* Tail */
    blkCnt = blockSize & 0x3;
    if (blkCnt > 0U)
    {
      /* C = A + B */
      /* Predicate with only the first blkCnt 32-bit lanes active */
      mve_pred16_t p0 = vctp32q(blkCnt);
      /*
       * Predicated (zeroing) loads: inactive lanes are not fetched, so the
       * tail never reads past the end of either input buffer and the add
       * only ever sees zeros in the lanes that are not stored.
       */
      vec1 = vld1q_z(pSrcA, p0);
      vec2 = vld1q_z(pSrcB, p0);
      vstrwq_p(pDst, vaddq(vec1,vec2), p0);
    }
 }
 #else
 /*
  * Scalar / NEON variant. Depending on the build flags the bulk of the data is
  * processed four samples at a time (NEON intrinsics or a manually unrolled
  * loop); the final loop handles whatever is left one sample at a time.
  */
 void arm_add_f32(
   const float32_t * pSrcA,
   const float32_t * pSrcB,
         float32_t * pDst,
         uint32_t blockSize)
 {
         uint32_t remaining;                            /* Samples (or groups of 4) still to process */
 #if defined(ARM_MATH_NEON) && !defined(ARM_MATH_AUTOVECTORIZE)
   /* NEON: C = A + B on four lanes per iteration */
   for (remaining = blockSize >> 2U; remaining > 0U; remaining--)
   {
     vst1q_f32(pDst, vaddq_f32(vld1q_f32(pSrcA), vld1q_f32(pSrcB)));
     pSrcA += 4;
     pSrcB += 4;
     pDst += 4;
   }
   /* Samples that did not fill a complete vector */
   remaining = blockSize & 0x3;
 #elif defined (ARM_MATH_LOOPUNROLL) && !defined(ARM_MATH_AUTOVECTORIZE)
   /* Manual unrolling: four scalar additions per iteration */
   for (remaining = blockSize >> 2U; remaining > 0U; remaining--)
   {
     pDst[0] = pSrcA[0] + pSrcB[0];
     pDst[1] = pSrcA[1] + pSrcB[1];
     pDst[2] = pSrcA[2] + pSrcB[2];
     pDst[3] = pSrcA[3] + pSrcB[3];
     pSrcA += 4;
     pSrcB += 4;
     pDst += 4;
   }
   /* Samples that did not fill a complete group of four */
   remaining = blockSize % 0x4U;
 #else
   /* Neither NEON nor unrolling: every sample goes through the loop below */
   remaining = blockSize;
 #endif /* NEON / ARM_MATH_LOOPUNROLL selection */
   for (; remaining > 0U; remaining--)
   {
     /* C = A + B */
     *pDst++ = (*pSrcA++) + (*pSrcB++);
   }
 }
 #endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
 /**
  @} end of BasicAdd group
 */
--- a/libraries/cmsis/dsp/Source/BasicMathFunctions/arm_add_q15.c
+++ b/libraries/cmsis/dsp/Source/BasicMathFunctions/arm_add_q15.c
@@ -0,0 +1,176 @@
 /* ----------------------------------------------------------------------
 * Project:      CMSIS DSP Library
 * Title:        arm_add_q15.c
 * Description:  Q15 vector addition
 *
 * $Date:        18. March 2019
 * $Revision:    V1.6.0
 *
 * Target Processor: Cortex-M cores
 * -------------------------------------------------------------------- */
 /*
 * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
 *
 * SPDX-License-Identifier: Apache-2.0
 *
 * Licensed under the Apache License, Version 2.0 (the License); you may
 * not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an AS IS BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 #include "arm_math.h"
 /**
  @ingroup groupMath
 */
 /**
  @addtogroup BasicAdd
  @{
 */
 /**
  @brief         Q15 vector addition.
  @param[in]     pSrcA      points to the first input vector
  @param[in]     pSrcB      points to the second input vector
  @param[out]    pDst       points to the output vector
  @param[in]     blockSize  number of samples in each vector
  @return        none
  @par           Scaling and Overflow Behavior
                   The function uses saturating arithmetic.
                   Results outside of the allowable Q15 range [0x8000 0x7FFF] are saturated.
 */
 #if defined(ARM_MATH_MVEI)
 #include "arm_helium_utils.h"
 /*
  * Helium (MVE) variant: saturating addition of eight Q15 lanes per iteration,
  * then the remaining (blockSize & 7) samples under a tail predicate.
  */
 void arm_add_q15(
    const q15_t * pSrcA,
    const q15_t * pSrcB,
    q15_t * pDst,
    uint32_t blockSize)
 {
    uint32_t  blkCnt;           /* loop counters */
    q15x8_t vecA;
    q15x8_t vecB;
    /* Compute 8 outputs at a time */
    blkCnt = blockSize >> 3;
    while (blkCnt > 0U)
    {
        /*
         * C = A + B
         * Add and then store the results in the destination buffer.
         */
        vecA = vld1q(pSrcA);
        vecB = vld1q(pSrcB);
        vst1q(pDst, vqaddq(vecA, vecB));
        /*
         * Decrement the blockSize loop counter
         */
        blkCnt--;
        /*
         * advance vector source and destination pointers
         */
        pSrcA  += 8;
        pSrcB  += 8;
        pDst   += 8;
    }
    /*
     * tail
     */
    blkCnt = blockSize & 7;
    if (blkCnt > 0U)
    {
        /* Predicate with only the first blkCnt 16-bit lanes active */
        mve_pred16_t p0 = vctp16q(blkCnt);
        /*
         * Predicated (zeroing) loads: inactive lanes are not fetched, so the
         * tail never reads past the end of either input buffer.
         */
        vecA = vld1q_z(pSrcA, p0);
        vecB = vld1q_z(pSrcB, p0);
        vstrhq_p(pDst, vqaddq(vecA, vecB), p0);
    }
 }
 #else
 /*
  * Scalar variant: optionally unrolled by four (ARM_MATH_LOOPUNROLL). With
  * ARM_MATH_DSP the unrolled part moves two packed Q15 samples per 32-bit word
  * and adds them with __QADD16; otherwise each sum is widened to q31_t and
  * clamped to 16 bits with __SSAT.
  */
 void arm_add_q15(
   const q15_t * pSrcA,
   const q15_t * pSrcB,
         q15_t * pDst,
         uint32_t blockSize)
 {
         uint32_t remaining;                            /* Samples (or groups of 4) still to process */
 #if defined (ARM_MATH_LOOPUNROLL)
 #if defined (ARM_MATH_DSP)
   q31_t pairA0, pairA1;                                /* Two packed samples each from source A */
   q31_t pairB0, pairB1;                                /* Two packed samples each from source B */
 #endif
   /* Unrolled part: four outputs per iteration, C = A + B */
   for (remaining = blockSize >> 2U; remaining > 0U; remaining--)
   {
 #if defined (ARM_MATH_DSP)
     /* Fetch 2 x 2 samples from A, then 2 x 2 samples from B */
     pairA0 = read_q15x2_ia ((q15_t **) &pSrcA);
     pairA1 = read_q15x2_ia ((q15_t **) &pSrcA);
     pairB0 = read_q15x2_ia ((q15_t **) &pSrcB);
     pairB1 = read_q15x2_ia ((q15_t **) &pSrcB);
     /* Add pairwise and write the packed results back */
     write_q15x2_ia (&pDst, __QADD16(pairA0, pairB0));
     write_q15x2_ia (&pDst, __QADD16(pairA1, pairB1));
 #else
     pDst[0] = (q15_t) __SSAT(((q31_t) pSrcA[0] + pSrcB[0]), 16);
     pDst[1] = (q15_t) __SSAT(((q31_t) pSrcA[1] + pSrcB[1]), 16);
     pDst[2] = (q15_t) __SSAT(((q31_t) pSrcA[2] + pSrcB[2]), 16);
     pDst[3] = (q15_t) __SSAT(((q31_t) pSrcA[3] + pSrcB[3]), 16);
     pSrcA += 4;
     pSrcB += 4;
     pDst += 4;
 #endif
   }
   /* Samples that did not fill a complete group of four */
   remaining = blockSize % 0x4U;
 #else
   /* No unrolling: every sample goes through the loop below */
   remaining = blockSize;
 #endif /* #if defined (ARM_MATH_LOOPUNROLL) */
   for (; remaining > 0U; remaining--)
   {
     /* C = A + B, one sample at a time */
 #if defined (ARM_MATH_DSP)
     *pDst++ = (q15_t) __QADD16(*pSrcA++, *pSrcB++);
 #else
     *pDst++ = (q15_t) __SSAT(((q31_t) *pSrcA++ + *pSrcB++), 16);
 #endif
   }
 }
 #endif /* defined(ARM_MATH_MVEI) */
 /**
  @} end of BasicAdd group
 */
--- a/libraries/cmsis/dsp/Source/BasicMathFunctions/arm_add_q31.c
+++ b/libraries/cmsis/dsp/Source/BasicMathFunctions/arm_add_q31.c
@@ -0,0 +1,159 @@
 /* ----------------------------------------------------------------------
 * Project:      CMSIS DSP Library
 * Title:        arm_add_q31.c
 * Description:  Q31 vector addition
 *
 * $Date:        18. March 2019
 * $Revision:    V1.6.0
 *
 * Target Processor: Cortex-M cores
 * -------------------------------------------------------------------- */
 /*
 * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
 *
 * SPDX-License-Identifier: Apache-2.0
 *
 * Licensed under the Apache License, Version 2.0 (the License); you may
 * not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an AS IS BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 #include "arm_math.h"
 /**
  @ingroup groupMath
 */
 /**
  @addtogroup BasicAdd
  @{
 */
 /**
  @brief         Q31 vector addition.
  @param[in]     pSrcA      points to the first input vector
  @param[in]     pSrcB      points to the second input vector
  @param[out]    pDst       points to the output vector
  @param[in]     blockSize  number of samples in each vector
  @return        none
  @par           Scaling and Overflow Behavior
                   The function uses saturating arithmetic.
                   Results outside of the allowable Q31 range [0x80000000 0x7FFFFFFF] are saturated.
 */
 #if defined(ARM_MATH_MVEI)
 #include "arm_helium_utils.h"
 /*
  * Helium (MVE) variant: saturating addition of four Q31 lanes per iteration,
  * then the remaining (blockSize & 3) samples under a tail predicate.
  */
 void arm_add_q31(
   const q31_t * pSrcA,
   const q31_t * pSrcB,
         q31_t * pDst,
         uint32_t blockSize)
 {
    uint32_t blkCnt;            /* loop counter */
    q31x4_t vecA;
    q31x4_t vecB;
    /* Compute 4 outputs at a time */
    blkCnt = blockSize >> 2;
    while (blkCnt > 0U)
    {
        /*
         * C = A + B
         * Add and then store the results in the destination buffer.
         */
        vecA = vld1q(pSrcA);
        vecB = vld1q(pSrcB);
        vst1q(pDst, vqaddq(vecA, vecB));
        /*
         * Decrement the blockSize loop counter
         */
        blkCnt--;
        /*
         * advance vector source and destination pointers
         */
        pSrcA  += 4;
        pSrcB  += 4;
        pDst   += 4;
    }
    /*
     * tail
     */
    blkCnt = blockSize & 3;
    if (blkCnt > 0U)
    {
        /* Predicate with only the first blkCnt 32-bit lanes active */
        mve_pred16_t p0 = vctp32q(blkCnt);
        /*
         * Predicated (zeroing) loads: inactive lanes are not fetched, so the
         * tail never reads past the end of either input buffer.
         */
        vecA = vld1q_z(pSrcA, p0);
        vecB = vld1q_z(pSrcB, p0);
        vstrwq_p(pDst, vqaddq(vecA, vecB), p0);
    }
 }
 #else
 /*
  * Scalar variant: each output is __QADD(A, B). With ARM_MATH_LOOPUNROLL the
  * bulk of the data is handled four samples per iteration; the final loop
  * processes the rest one sample at a time.
  */
 void arm_add_q31(
   const q31_t * pSrcA,
   const q31_t * pSrcB,
         q31_t * pDst,
         uint32_t blockSize)
 {
         uint32_t remaining;                            /* Samples (or groups of 4) still to process */
 #if defined (ARM_MATH_LOOPUNROLL)
   /* Unrolled part: four outputs per iteration, C = A + B */
   for (remaining = blockSize >> 2U; remaining > 0U; remaining--)
   {
     pDst[0] = __QADD(pSrcA[0], pSrcB[0]);
     pDst[1] = __QADD(pSrcA[1], pSrcB[1]);
     pDst[2] = __QADD(pSrcA[2], pSrcB[2]);
     pDst[3] = __QADD(pSrcA[3], pSrcB[3]);
     pSrcA += 4;
     pSrcB += 4;
     pDst += 4;
   }
   /* Samples that did not fill a complete group of four */
   remaining = blockSize % 0x4U;
 #else
   /* No unrolling: every sample goes through the loop below */
   remaining = blockSize;
 #endif /* #if defined (ARM_MATH_LOOPUNROLL) */
   for (; remaining > 0U; remaining--)
   {
     /* C = A + B */
     *pDst++ = __QADD(*pSrcA++, *pSrcB++);
   }
 }
 #endif /* defined(ARM_MATH_MVEI) */
 /**
  @} end of BasicAdd group
 */
--- a/libraries/cmsis/dsp/Source/BasicMathFunctions/arm_add_q7.c
+++ b/libraries/cmsis/dsp/Source/BasicMathFunctions/arm_add_q7.c
@@ -0,0 +1,158 @@
 /* ----------------------------------------------------------------------
 * Project:      CMSIS DSP Library
 * Title:        arm_add_q7.c
 * Description:  Q7 vector addition
 *
 * $Date:        18. March 2019
 * $Revision:    V1.6.0
 *
 * Target Processor: Cortex-M cores
 * -------------------------------------------------------------------- */
 /*
 * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
 *
 * SPDX-License-Identifier: Apache-2.0
 *
 * Licensed under the Apache License, Version 2.0 (the License); you may
 * not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an AS IS BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 #include "arm_math.h"
 /**
  @ingroup groupMath
 */
 /**
  @addtogroup BasicAdd
  @{
 */
 /**
  @brief         Q7 vector addition.
  @param[in]     pSrcA      points to the first input vector
  @param[in]     pSrcB      points to the second input vector
  @param[out]    pDst       points to the output vector
  @param[in]     blockSize  number of samples in each vector
  @return        none
  @par           Scaling and Overflow Behavior
                   The function uses saturating arithmetic.
                   Results outside of the allowable Q7 range [0x80 0x7F] are saturated.
 */
 #if defined(ARM_MATH_MVEI)
 #include "arm_helium_utils.h"
 /*
  * Helium (MVE) variant: saturating addition of sixteen Q7 lanes per
  * iteration, then the remaining (blockSize & 0xF) samples under a tail
  * predicate.
  */
 void arm_add_q7(
    const q7_t * pSrcA,
    const q7_t * pSrcB,
    q7_t * pDst,
    uint32_t blockSize)
 {
    uint32_t  blkCnt;           /* loop counters */
    q7x16_t vecA;
    q7x16_t vecB;
    /* Compute 16 outputs at a time */
    blkCnt = blockSize >> 4;
    while (blkCnt > 0U)
    {
        /*
         * C = A + B
         * Add and then store the results in the destination buffer.
         */
        vecA = vld1q(pSrcA);
        vecB = vld1q(pSrcB);
        vst1q(pDst, vqaddq(vecA, vecB));
        /*
         * Decrement the blockSize loop counter
         */
        blkCnt--;
        /*
         * advance vector source and destination pointers
         */
        pSrcA  += 16;
        pSrcB  += 16;
        pDst   += 16;
    }
    /*
     * tail
     */
    blkCnt = blockSize & 0xF;
    if (blkCnt > 0U)
    {
        /* Predicate with only the first blkCnt byte lanes active */
        mve_pred16_t p0 = vctp8q(blkCnt);
        /*
         * Predicated (zeroing) loads: inactive lanes are not fetched, so the
         * tail never reads past the end of either input buffer.
         */
        vecA = vld1q_z(pSrcA, p0);
        vecB = vld1q_z(pSrcB, p0);
        vstrbq_p(pDst, vqaddq(vecA, vecB), p0);
    }
 }
 #else
 /*
  * Scalar variant: optionally unrolled by four (ARM_MATH_LOOPUNROLL). With
  * ARM_MATH_DSP the unrolled part moves four packed Q7 samples per 32-bit word
  * and adds them with __QADD8; otherwise each sum is widened and clamped to
  * 8 bits with __SSAT.
  */
 void arm_add_q7(
   const q7_t * pSrcA,
   const q7_t * pSrcB,
         q7_t * pDst,
         uint32_t blockSize)
 {
         uint32_t remaining;                            /* Samples (or groups of 4) still to process */
 #if defined (ARM_MATH_LOOPUNROLL)
 #if defined (ARM_MATH_DSP)
   q31_t quadA;                                         /* Four packed samples from source A */
   q31_t quadB;                                         /* Four packed samples from source B */
 #endif
   /* Unrolled part: four outputs per iteration, C = A + B */
   for (remaining = blockSize >> 2U; remaining > 0U; remaining--)
   {
 #if defined (ARM_MATH_DSP)
     /* Fetch four samples from each source, add them lane-wise, store four results */
     quadA = read_q7x4_ia ((q7_t **) &pSrcA);
     quadB = read_q7x4_ia ((q7_t **) &pSrcB);
     write_q7x4_ia (&pDst, __QADD8 (quadA, quadB));
 #else
     pDst[0] = (q7_t) __SSAT ((q15_t) pSrcA[0] + pSrcB[0], 8);
     pDst[1] = (q7_t) __SSAT ((q15_t) pSrcA[1] + pSrcB[1], 8);
     pDst[2] = (q7_t) __SSAT ((q15_t) pSrcA[2] + pSrcB[2], 8);
     pDst[3] = (q7_t) __SSAT ((q15_t) pSrcA[3] + pSrcB[3], 8);
     pSrcA += 4;
     pSrcB += 4;
     pDst += 4;
 #endif
   }
   /* Samples that did not fill a complete group of four */
   remaining = blockSize % 0x4U;
 #else
   /* No unrolling: every sample goes through the loop below */
   remaining = blockSize;
 #endif /* #if defined (ARM_MATH_LOOPUNROLL) */
   for (; remaining > 0U; remaining--)
   {
     /* C = A + B, clamped to 8 bits */
     *pDst++ = (q7_t) __SSAT((q15_t) *pSrcA++ + *pSrcB++, 8);
   }
 }
 #endif /* defined(ARM_MATH_MVEI) */
 /**
  @} end of BasicAdd group
 */
--- a/libraries/cmsis/dsp/Source/BasicMathFunctions/arm_and_u16.c
+++ b/libraries/cmsis/dsp/Source/BasicMathFunctions/arm_and_u16.c
@@ -0,0 +1,137 @@
 /* ----------------------------------------------------------------------
 * Project:      CMSIS DSP Library
 * Title:        arm_and_u16.c
 * Description:  uint16_t bitwise AND
 *
 * $Date:        14 November 2019
 * $Revision:    V1.6.0
 *
 * Target Processor: Cortex-M cores
 * -------------------------------------------------------------------- */
 /*
 * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
 *
 * SPDX-License-Identifier: Apache-2.0
 *
 * Licensed under the Apache License, Version 2.0 (the License); you may
 * not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an AS IS BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 #include "arm_math.h"
 /**
  @ingroup groupMath
 */
 /**
  @defgroup And Vector bitwise AND
  Compute the logical bitwise AND.
  There are separate functions for uint32_t, uint16_t, and uint8_t data types.
 */
 /**
  @addtogroup And
  @{
 */
 /**
  @brief         Compute the logical bitwise AND of two fixed-point vectors.
  @param[in]     pSrcA      points to input vector A
  @param[in]     pSrcB      points to input vector B
  @param[out]    pDst       points to output vector
  @param[in]     blockSize  number of samples in each vector
  @return        none
 */
 /*
  * Element-wise bitwise AND of two uint16_t vectors: pDst[n] = pSrcA[n] & pSrcB[n]
  * for 0 <= n < blockSize. Three build-time variants: Helium (MVE) with a
  * predicated tail, NEON with a scalar tail, and a plain scalar loop.
  */
 void arm_and_u16(
    const uint16_t * pSrcA,
    const uint16_t * pSrcB,
          uint16_t * pDst,
          uint32_t blockSize)
 {
    uint32_t blkCnt;      /* Loop counter */
 #if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
    /* Unsigned vectors: must match the uint16_t buffers and vandq_u16 */
    uint16x8_t vecSrcA, vecSrcB;
    /* Compute 8 outputs at a time */
    blkCnt = blockSize >> 3;
    while (blkCnt > 0U)
    {
        vecSrcA = vld1q(pSrcA);
        vecSrcB = vld1q(pSrcB);
        vst1q(pDst, vandq_u16(vecSrcA, vecSrcB) );
        pSrcA += 8;
        pSrcB += 8;
        pDst  += 8;
        /* Decrement the loop counter */
        blkCnt--;
    }
    /* Tail */
    blkCnt = blockSize & 7;
    if (blkCnt > 0U)
    {
        /* Predicate with only the first blkCnt 16-bit lanes active */
        mve_pred16_t p0 = vctp16q(blkCnt);
        /*
         * Predicated (zeroing) loads: inactive lanes are not fetched, so the
         * tail never reads past the end of either input buffer.
         */
        vecSrcA = vld1q_z(pSrcA, p0);
        vecSrcB = vld1q_z(pSrcB, p0);
        vstrhq_p(pDst, vandq_u16(vecSrcA, vecSrcB), p0);
    }
 #else
 #if defined(ARM_MATH_NEON) && !defined(ARM_MATH_AUTOVECTORIZE)
    uint16x8_t vecA, vecB;
    /* Compute 8 outputs at a time */
    blkCnt = blockSize >> 3U;
    while (blkCnt > 0U)
    {
        vecA = vld1q_u16(pSrcA);
        vecB = vld1q_u16(pSrcB);
        vst1q_u16(pDst, vandq_u16(vecA, vecB) );
        pSrcA += 8;
        pSrcB += 8;
        pDst  += 8;
        /* Decrement the loop counter */
        blkCnt--;
    }
    /* Tail: the left-over samples go through the scalar loop below */
    blkCnt = blockSize & 7;
 #else
    /* Initialize blkCnt with number of samples */
    blkCnt = blockSize;
 #endif
    while (blkCnt > 0U)
    {
        *pDst++ = (*pSrcA++)&(*pSrcB++);
        /* Decrement the loop counter */
        blkCnt--;
    }
 #endif /* if defined(ARM_MATH_MVEI) */
 }
 /**
  @} end of And group
 */
--- a/libraries/cmsis/dsp/Source/BasicMathFunctions/arm_and_u32.c
+++ b/libraries/cmsis/dsp/Source/BasicMathFunctions/arm_and_u32.c
@@ -0,0 +1,129 @@
 /* ----------------------------------------------------------------------
 * Project:      CMSIS DSP Library
 * Title:        arm_and_u32.c
 * Description:  uint32_t bitwise AND
 *
 * $Date:        14 November 2019
 * $Revision:    V1.6.0
 *
 * Target Processor: Cortex-M cores
 * -------------------------------------------------------------------- */
 /*
 * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
 *
 * SPDX-License-Identifier: Apache-2.0
 *
 * Licensed under the Apache License, Version 2.0 (the License); you may
 * not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an AS IS BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 #include "arm_math.h"
 /**
  @ingroup groupMath
 */
 /**
  @addtogroup And
  @{
 */
 /**
  @brief         Compute the logical bitwise AND of two fixed-point vectors.
  @param[in]     pSrcA      points to input vector A
  @param[in]     pSrcB      points to input vector B
  @param[out]    pDst       points to output vector
  @param[in]     blockSize  number of samples in each vector
  @return        none
 */
 /*
  * Element-wise bitwise AND of two uint32_t vectors: pDst[n] = pSrcA[n] & pSrcB[n]
  * for 0 <= n < blockSize. Three build-time variants: Helium (MVE) with a
  * predicated tail, NEON with a scalar tail, and a plain scalar loop.
  */
 void arm_and_u32(
    const uint32_t * pSrcA,
    const uint32_t * pSrcB,
          uint32_t * pDst,
          uint32_t blockSize)
 {
    uint32_t blkCnt;      /* Loop counter */
 #if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
    /* Unsigned vectors: must match the uint32_t buffers and vandq_u32 */
    uint32x4_t vecSrcA, vecSrcB;
    /* Compute 4 outputs at a time */
    blkCnt = blockSize >> 2;
    while (blkCnt > 0U)
    {
        vecSrcA = vld1q(pSrcA);
        vecSrcB = vld1q(pSrcB);
        vst1q(pDst, vandq_u32(vecSrcA, vecSrcB) );
        pSrcA += 4;
        pSrcB += 4;
        pDst  += 4;
        /* Decrement the loop counter */
        blkCnt--;
    }
    /* Tail */
    blkCnt = blockSize & 3;
    if (blkCnt > 0U)
    {
        /* Predicate with only the first blkCnt 32-bit lanes active */
        mve_pred16_t p0 = vctp32q(blkCnt);
        /*
         * Predicated (zeroing) loads: inactive lanes are not fetched, so the
         * tail never reads past the end of either input buffer.
         */
        vecSrcA = vld1q_z(pSrcA, p0);
        vecSrcB = vld1q_z(pSrcB, p0);
        vstrwq_p(pDst, vandq_u32(vecSrcA, vecSrcB), p0);
    }
 #else
 #if defined(ARM_MATH_NEON) && !defined(ARM_MATH_AUTOVECTORIZE)
    uint32x4_t vecA, vecB;
    /* Compute 4 outputs at a time */
    blkCnt = blockSize >> 2U;
    while (blkCnt > 0U)
    {
        vecA = vld1q_u32(pSrcA);
        vecB = vld1q_u32(pSrcB);
        vst1q_u32(pDst, vandq_u32(vecA, vecB) );
        pSrcA += 4;
        pSrcB += 4;
        pDst  += 4;
        /* Decrement the loop counter */
        blkCnt--;
    }
    /* Tail: the left-over samples go through the scalar loop below */
    blkCnt = blockSize & 3;
 #else
    /* Initialize blkCnt with number of samples */
    blkCnt = blockSize;
 #endif
    while (blkCnt > 0U)
    {
        *pDst++ = (*pSrcA++)&(*pSrcB++);
        /* Decrement the loop counter */
        blkCnt--;
    }
 #endif /* if defined(ARM_MATH_MVEI) */
 }
 /**
  @} end of And group
 */
--- a/libraries/cmsis/dsp/Source/BasicMathFunctions/arm_and_u8.c
+++ b/libraries/cmsis/dsp/Source/BasicMathFunctions/arm_and_u8.c
@@ -0,0 +1,130 @@
 /* ----------------------------------------------------------------------
 * Project:      CMSIS DSP Library
 * Title:        arm_and_u8.c
 * Description:  uint8_t bitwise AND
 *
 * $Date:        14 November 2019
 * $Revision:    V1.6.0
 *
 * Target Processor: Cortex-M cores
 * -------------------------------------------------------------------- */
 /*
 * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
 *
 * SPDX-License-Identifier: Apache-2.0
 *
 * Licensed under the Apache License, Version 2.0 (the License); you may
 * not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an AS IS BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 #include "arm_math.h"
 /**
  @ingroup groupMath
 */
 /**
  @addtogroup And
  @{
 */
 /**
  @brief         Compute the logical bitwise AND of two fixed-point vectors.
  @param[in]     pSrcA      points to input vector A
  @param[in]     pSrcB      points to input vector B
  @param[out]    pDst       points to output vector
  @param[in]     blockSize  number of samples in each vector
  @return        none
 */
 /*
  * Element-wise bitwise AND of two uint8_t vectors: pDst[n] = pSrcA[n] & pSrcB[n]
  * for 0 <= n < blockSize. Three build-time variants: Helium (MVE) with a
  * predicated tail, NEON with a scalar tail, and a plain scalar loop.
  */
 void arm_and_u8(
    const uint8_t * pSrcA,
    const uint8_t * pSrcB,
          uint8_t * pDst,
          uint32_t blockSize)
 {
    uint32_t blkCnt;      /* Loop counter */
 #if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
    /* Unsigned vectors: must match the uint8_t buffers and vandq_u8 */
    uint8x16_t vecSrcA, vecSrcB;
    /* Compute 16 outputs at a time */
    blkCnt = blockSize >> 4;
    while (blkCnt > 0U)
    {
        vecSrcA = vld1q(pSrcA);
        vecSrcB = vld1q(pSrcB);
        vst1q(pDst, vandq_u8(vecSrcA, vecSrcB) );
        pSrcA += 16;
        pSrcB += 16;
        pDst  += 16;
        /* Decrement the loop counter */
        blkCnt--;
    }
    /* Tail */
    blkCnt = blockSize & 0xF;
    if (blkCnt > 0U)
    {
        /* Predicate with only the first blkCnt byte lanes active */
        mve_pred16_t p0 = vctp8q(blkCnt);
        /*
         * Predicated (zeroing) loads: inactive lanes are not fetched, so the
         * tail never reads past the end of either input buffer.
         */
        vecSrcA = vld1q_z(pSrcA, p0);
        vecSrcB = vld1q_z(pSrcB, p0);
        vstrbq_p(pDst, vandq_u8(vecSrcA, vecSrcB), p0);
    }
 #else
 #if defined(ARM_MATH_NEON) && !defined(ARM_MATH_AUTOVECTORIZE)
    uint8x16_t vecA, vecB;
    /* Compute 16 outputs at a time */
    blkCnt = blockSize >> 4U;
    while (blkCnt > 0U)
    {
        vecA = vld1q_u8(pSrcA);
        vecB = vld1q_u8(pSrcB);
        vst1q_u8(pDst, vandq_u8(vecA, vecB) );
        pSrcA += 16;
        pSrcB += 16;
        pDst  += 16;
        /* Decrement the loop counter */
        blkCnt--;
    }
    /* Tail: the left-over samples go through the scalar loop below */
    blkCnt = blockSize & 0xF;
 #else
    /* Initialize blkCnt with number of samples */
    blkCnt = blockSize;
 #endif
    while (blkCnt > 0U)
    {
        *pDst++ = (*pSrcA++)&(*pSrcB++);
        /* Decrement the loop counter */
        blkCnt--;
    }
 #endif /* if defined(ARM_MATH_MVEI) */
 }
 /**
  @} end of And group
 */
--- a/libraries/cmsis/dsp/Source/BasicMathFunctions/arm_dot_prod_f32.c
+++ b/libraries/cmsis/dsp/Source/BasicMathFunctions/arm_dot_prod_f32.c
@@ -0,0 +1,226 @@
 /* ----------------------------------------------------------------------
 * Project:      CMSIS DSP Library
 * Title:        arm_dot_prod_f32.c
 * Description:  Floating-point dot product
 *
 * $Date:        18. March 2019
 * $Revision:    V1.6.0
 *
 * Target Processor: Cortex-M cores
 * -------------------------------------------------------------------- */
 /*
 * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
 *
 * SPDX-License-Identifier: Apache-2.0
 *
 * Licensed under the Apache License, Version 2.0 (the License); you may
 * not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an AS IS BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 #include "arm_math.h"
 /**
  @ingroup groupMath
 */
 /**
  @defgroup BasicDotProd Vector Dot Product
  Computes the dot product of two vectors.
  The vectors are multiplied element-by-element and then summed.
  <pre>
      sum = pSrcA[0]*pSrcB[0] + pSrcA[1]*pSrcB[1] + ... + pSrcA[blockSize-1]*pSrcB[blockSize-1]
  </pre>
  There are separate functions for floating-point, Q7, Q15, and Q31 data types.
 */
 /**
  @addtogroup BasicDotProd
  @{
 */
 /**
  @brief         Dot product of floating-point vectors.
  @param[in]     pSrcA      points to the first input vector.
  @param[in]     pSrcB      points to the second input vector.
  @param[in]     blockSize  number of samples in each vector.
  @param[out]    result     output result returned here.
  @return        none
 */
 #if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE)
 #include "arm_helium_utils.h"
 /*
  * Helium (MVE) variant of the floating-point dot product.
  * Writes sum(pSrcA[n] * pSrcB[n]) for 0 <= n < blockSize to *result;
  * the result is 0.0f when blockSize is 0.
  */
 void arm_dot_prod_f32(
     const float32_t * pSrcA,
     const float32_t * pSrcB,
     uint32_t    blockSize,
     float32_t * result)
 {
     f32x4_t vecA, vecB;
     f32x4_t vecSum;             /* Four per-lane partial sums */
     uint32_t blkCnt;
     float32_t sum = 0.0f;
     vecSum = vdupq_n_f32(0.0f);
     /* Compute 4 products at a time */
     blkCnt = blockSize >> 2U;
     while (blkCnt > 0U)
     {
         /* Multiply four sample pairs and accumulate them lane by lane */
         vecA = vld1q(pSrcA);
         pSrcA += 4;
         vecB = vld1q(pSrcB);
         pSrcB += 4;
         vecSum = vfmaq(vecSum, vecA, vecB);
         /* Decrement the loop counter */
         blkCnt --;
     }
     /* Tail: 1 to 3 leftover samples */
     blkCnt = blockSize & 3;
     if (blkCnt > 0U)
     {
         mve_pred16_t p0 = vctp32q(blkCnt);
         /* Predicated loads so that nothing beyond blockSize samples is read */
         vecA = vld1q_z(pSrcA, p0);
         vecB = vld1q_z(pSrcB, p0);
         /* Only the lanes enabled by p0 are accumulated */
         vecSum = vfmaq_m(vecSum, vecA, vecB, p0);
     }
     /* Reduce the four partial sums to a single scalar */
     sum = vecAddAcrossF32Mve(vecSum);
     /* Store result in destination buffer */
     *result = sum;
 }
 #else
 /*
  * NEON / scalar variant of the floating-point dot product.
  * Writes sum(pSrcA[n] * pSrcB[n]) for 0 <= n < blockSize to *result;
  * the result is 0.0f when blockSize is 0.
  */
 void arm_dot_prod_f32(
   const float32_t * pSrcA,
   const float32_t * pSrcB,
         uint32_t blockSize,
         float32_t * result)
 {
         uint32_t blkCnt;                               /* Loop counter */
         float32_t sum = 0.0f;                          /* Temporary return variable */
 #if defined(ARM_MATH_NEON) && !defined(ARM_MATH_AUTOVECTORIZE)
     f32x4_t vec1;
     f32x4_t vec2;
     f32x4_t accum = vdupq_n_f32(0);
     /* Compute 4 products at a time */
     blkCnt = blockSize >> 2U;
     while (blkCnt > 0U)
     {
         /* Load inside the loop so that only samples that are actually
          * multiplied are ever read from the input buffers. */
         vec1 = vld1q_f32(pSrcA);
         vec2 = vld1q_f32(pSrcB);
         /* Accumulate four products lane by lane */
         accum = vmlaq_f32(accum, vec1, vec2);
         /* Increment pointers */
         pSrcA += 4;
         pSrcB += 4;
         /* Decrement the loop counter */
         blkCnt--;
     }
     /* Reduce the four partial sums to a single scalar */
 #if defined(__aarch64__)
     sum = vpadds_f32(vpadd_f32(vget_low_f32(accum), vget_high_f32(accum)));
 #else
     {
         f32x2_t tmp = vpadd_f32(vget_low_f32(accum), vget_high_f32(accum));
         sum = vget_lane_f32(tmp, 0) + vget_lane_f32(tmp, 1);
     }
 #endif
     /* Tail: the remaining 0 to 3 samples are handled by the scalar loop */
     blkCnt = blockSize & 0x3;
 #else
 #if defined (ARM_MATH_LOOPUNROLL) && !defined(ARM_MATH_AUTOVECTORIZE)
   /* Loop unrolling: Compute 4 products at a time */
   blkCnt = blockSize >> 2U;
   while (blkCnt > 0U)
   {
     /* Multiply four sample pairs and add them to the running sum */
     sum += (*pSrcA++) * (*pSrcB++);
     sum += (*pSrcA++) * (*pSrcB++);
     sum += (*pSrcA++) * (*pSrcB++);
     sum += (*pSrcA++) * (*pSrcB++);
     /* Decrement loop counter */
     blkCnt--;
   }
   /* Loop unrolling: the remaining 0 to 3 samples go through the loop below */
   blkCnt = blockSize % 0x4U;
 #else
   /* Initialize blkCnt with number of samples */
   blkCnt = blockSize;
 #endif /* #if defined (ARM_MATH_LOOPUNROLL) */
 #endif /* #if defined(ARM_MATH_NEON) */
   while (blkCnt > 0U)
   {
     /* Multiply one sample pair and add it to the running sum */
     sum += (*pSrcA++) * (*pSrcB++);
     /* Decrement loop counter */
     blkCnt--;
   }
   /* Store result in destination buffer */
   *result = sum;
 }
 #endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
 /**
  @} end of BasicDotProd group
 */
--- a/libraries/cmsis/dsp/Source/BasicMathFunctions/arm_dot_prod_q15.c
+++ b/libraries/cmsis/dsp/Source/BasicMathFunctions/arm_dot_prod_q15.c
@@ -0,0 +1,172 @@
 /* ----------------------------------------------------------------------
 * Project:      CMSIS DSP Library
 * Title:        arm_dot_prod_q15.c
 * Description:  Q15 dot product
 *
 * $Date:        18. March 2019
 * $Revision:    V1.6.0
 *
 * Target Processor: Cortex-M cores
 * -------------------------------------------------------------------- */
 /*
 * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
 *
 * SPDX-License-Identifier: Apache-2.0
 *
 * Licensed under the Apache License, Version 2.0 (the License); you may
 * not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an AS IS BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 #include "arm_math.h"
 /**
  @ingroup groupMath
 */
 /**
  @addtogroup BasicDotProd
  @{
 */
 /**
  @brief         Dot product of Q15 vectors.
  @param[in]     pSrcA      points to the first input vector
  @param[in]     pSrcB      points to the second input vector
  @param[in]     blockSize  number of samples in each vector
  @param[out]    result     output result returned here
  @return        none
  @par           Scaling and Overflow Behavior
                   The intermediate multiplications are in 1.15 x 1.15 = 2.30 format and these
                   results are added to a 64-bit accumulator in 34.30 format.
                   Nonsaturating additions are used and given that there are 33 guard bits in the accumulator
                   there is no risk of overflow.
                   The return result is in 34.30 format.
 */
 #if defined(ARM_MATH_MVEI)
 #include "arm_helium_utils.h"
 /*
  * Helium (MVE) variant of the Q15 dot product.
  * Accumulates pSrcA[n] * pSrcB[n] for 0 <= n < blockSize into a 64-bit
  * sum and writes it to *result; the result is 0 when blockSize is 0.
  */
 void arm_dot_prod_q15(
     const q15_t * pSrcA,
     const q15_t * pSrcB,
     uint32_t blockSize,
     q63_t * result)
 {
     uint32_t  blkCnt;           /* Loop counter */
     q15x8_t vecA;
     q15x8_t vecB;
     q63_t     sum = 0LL;        /* 64-bit accumulator */
     /* Compute 8 products at a time */
     blkCnt = blockSize >> 3;
     while (blkCnt > 0U)
     {
         /* Multiply eight sample pairs and add all products to sum */
         vecA = vld1q(pSrcA);
         vecB = vld1q(pSrcB);
         sum = vmlaldavaq(sum, vecA, vecB);
         /* Decrement the loop counter */
         blkCnt--;
         /* Advance the source pointers */
         pSrcA += 8;
         pSrcB += 8;
     }
     /* Tail: 1 to 7 leftover samples */
     blkCnt = blockSize & 7;
     if (blkCnt > 0U)
     {
         mve_pred16_t p0 = vctp16q(blkCnt);
         /* Predicated loads so that nothing beyond blockSize samples is read */
         vecA = vld1q_z(pSrcA, p0);
         vecB = vld1q_z(pSrcB, p0);
         /* Only the lanes enabled by p0 contribute to the sum */
         sum = vmlaldavaq_p(sum, vecA, vecB, p0);
     }
     *result = sum;
 }
 #else
 /*
  * Scalar / DSP-extension variant of the Q15 dot product.
  * Every product pSrcA[n] * pSrcB[n] is added, without saturation, to a
  * 64-bit accumulator that is finally written to *result.
  */
 void arm_dot_prod_q15(
   const q15_t * pSrcA,
   const q15_t * pSrcB,
         uint32_t blockSize,
         q63_t * result)
 {
   uint32_t remaining;                            /* Samples still to be processed */
   q63_t acc = 0;                                 /* 64-bit running sum of products */
 #if defined (ARM_MATH_LOOPUNROLL)
   /* Unrolled section: four sample pairs per pass */
   for (remaining = blockSize >> 2U; remaining > 0U; remaining--)
   {
 #if defined (ARM_MATH_DSP)
     /* Two packed reads per input, i.e. four q15 samples per pass */
     acc = __SMLALD(read_q15x2_ia ((q15_t **) &pSrcA), read_q15x2_ia ((q15_t **) &pSrcB), acc);
     acc = __SMLALD(read_q15x2_ia ((q15_t **) &pSrcA), read_q15x2_ia ((q15_t **) &pSrcB), acc);
 #else
     acc += (q63_t)((q31_t) pSrcA[0] * pSrcB[0]);
     acc += (q63_t)((q31_t) pSrcA[1] * pSrcB[1]);
     acc += (q63_t)((q31_t) pSrcA[2] * pSrcB[2]);
     acc += (q63_t)((q31_t) pSrcA[3] * pSrcB[3]);
     pSrcA += 4;
     pSrcB += 4;
 #endif
   }
   /* The 0 to 3 samples that did not fill a group of four come next */
   remaining = blockSize & 0x3U;
 #else
   /* No unrolling: the loop below handles every sample */
   remaining = blockSize;
 #endif /* #if defined (ARM_MATH_LOOPUNROLL) */
   for (; remaining > 0U; remaining--)
   {
     /* One product per pass */
     acc += (q63_t)((q31_t) *pSrcA * *pSrcB);
     pSrcA++;
     pSrcB++;
   }
   /* Hand the accumulated sum (34.30 format) back to the caller */
   *result = acc;
 }
 #endif /* defined(ARM_MATH_MVEI) */
 /**
  @} end of BasicDotProd group
 */
--- a/libraries/cmsis/dsp/Source/BasicMathFunctions/arm_dot_prod_q31.c
+++ b/libraries/cmsis/dsp/Source/BasicMathFunctions/arm_dot_prod_q31.c
@@ -0,0 +1,174 @@
 /* ----------------------------------------------------------------------
 * Project:      CMSIS DSP Library
 * Title:        arm_dot_prod_q31.c
 * Description:  Q31 dot product
 *
 * $Date:        18. March 2019
 * $Revision:    V1.6.0
 *
 * Target Processor: Cortex-M cores
 * -------------------------------------------------------------------- */
 /*
 * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
 *
 * SPDX-License-Identifier: Apache-2.0
 *
 * Licensed under the Apache License, Version 2.0 (the License); you may
 * not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an AS IS BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 #include "arm_math.h"
 /**
  @ingroup groupMath
 */
 /**
  @addtogroup BasicDotProd
  @{
 */
 /**
  @brief         Dot product of Q31 vectors.
  @param[in]     pSrcA      points to the first input vector.
  @param[in]     pSrcB      points to the second input vector.
  @param[in]     blockSize  number of samples in each vector.
  @param[out]    result     output result returned here.
  @return        none
  @par           Scaling and Overflow Behavior
                   The intermediate multiplications are in 1.31 x 1.31 = 2.62 format and these
                   are truncated to 2.48 format by discarding the lower 14 bits.
                   The 2.48 result is then added without saturation to a 64-bit accumulator in 16.48 format.
                   There are 15 guard bits in the accumulator and there is no risk of overflow as long as
                   the length of the vectors is less than 2^16 elements.
                   The return result is in 16.48 format.
 */
 #if defined(ARM_MATH_MVEI)
 #include "arm_helium_utils.h"
 /*
  * Helium (MVE) variant of the Q31 dot product.
  * Accumulates pSrcA[n] * pSrcB[n] for 0 <= n < blockSize, rescales the
  * accumulator once at the end and writes it to *result.
  */
 void arm_dot_prod_q31(
     const q31_t * pSrcA,
     const q31_t * pSrcB,
     uint32_t blockSize,
     q63_t * result)
 {
     uint32_t  blkCnt;           /* Loop counter */
     q31x4_t vecA;
     q31x4_t vecB;
     q63_t     sum = 0LL;        /* 64-bit accumulator */
     /* Compute 4 products at a time */
     blkCnt = blockSize >> 2;
     while (blkCnt > 0U)
     {
         /* Multiply four sample pairs and add all products to sum */
         vecA = vld1q(pSrcA);
         vecB = vld1q(pSrcB);
         sum = vrmlaldavhaq(sum, vecA, vecB);
         /* Decrement the loop counter */
         blkCnt--;
         /* Advance the source pointers */
         pSrcA += 4;
         pSrcB += 4;
     }
     /* Tail: 1 to 3 leftover samples */
     blkCnt = blockSize & 3;
     if (blkCnt > 0U)
     {
         mve_pred16_t p0 = vctp32q(blkCnt);
         /* Predicated loads so that nothing beyond blockSize samples is read */
         vecA = vld1q_z(pSrcA, p0);
         vecB = vld1q_z(pSrcB, p0);
         /* Only the lanes enabled by p0 contribute to the sum */
         sum = vrmlaldavhaq_p(sum, vecA, vecB, p0);
     }
     /*
      * vrmlaldavhaq provides extra intermediate accumulator headroom,
      * limiting the need for intermediate scaling.
      * The scalar variant uses a 2.48 accumulator format by right shifting
      * each product by 14; here the 16.48 output conversion is performed
      * once, outside the loop, by shifting the accumulator right by 6.
      */
     *result = asrl(sum, (14 - 8));
 }
 #else
 /*
  * Scalar variant of the Q31 dot product.
  * Each 64-bit product pSrcA[n] * pSrcB[n] is shifted right by 14 bits and
  * added, without saturation, to a 64-bit accumulator written to *result.
  */
 void arm_dot_prod_q31(
   const q31_t * pSrcA,
   const q31_t * pSrcB,
         uint32_t blockSize,
         q63_t * result)
 {
   uint32_t remaining;                            /* Samples still to be processed */
   q63_t acc = 0;                                 /* 64-bit running sum */
 #if defined (ARM_MATH_LOOPUNROLL)
   /* Unrolled section: four sample pairs per pass */
   for (remaining = blockSize >> 2U; remaining > 0U; remaining--)
   {
     acc += ((q63_t) pSrcA[0] * pSrcB[0]) >> 14U;
     acc += ((q63_t) pSrcA[1] * pSrcB[1]) >> 14U;
     acc += ((q63_t) pSrcA[2] * pSrcB[2]) >> 14U;
     acc += ((q63_t) pSrcA[3] * pSrcB[3]) >> 14U;
     pSrcA += 4;
     pSrcB += 4;
   }
   /* The 0 to 3 samples that did not fill a group of four come next */
   remaining = blockSize & 0x3U;
 #else
   /* No unrolling: the loop below handles every sample */
   remaining = blockSize;
 #endif /* #if defined (ARM_MATH_LOOPUNROLL) */
   for (; remaining > 0U; remaining--)
   {
     /* One scaled product per pass */
     acc += ((q63_t) *pSrcA * *pSrcB) >> 14U;
     pSrcA++;
     pSrcB++;
   }
   /* Hand the accumulated sum (16.48 format) back to the caller */
   *result = acc;
 }
 #endif /* defined(ARM_MATH_MVEI) */
 /**
  @} end of BasicDotProd group
 */
--- a/libraries/cmsis/dsp/Source/BasicMathFunctions/arm_dot_prod_q7.c
+++ b/libraries/cmsis/dsp/Source/BasicMathFunctions/arm_dot_prod_q7.c
@@ -0,0 +1,191 @@
 /* ----------------------------------------------------------------------
 * Project:      CMSIS DSP Library
 * Title:        arm_dot_prod_q7.c
 * Description:  Q7 dot product
 *
 * $Date:        18. March 2019
 * $Revision:    V1.6.0
 *
 * Target Processor: Cortex-M cores
 * -------------------------------------------------------------------- */
 /*
 * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
 *
 * SPDX-License-Identifier: Apache-2.0
 *
 * Licensed under the Apache License, Version 2.0 (the License); you may
 * not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an AS IS BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 #include "arm_math.h"
 /**
  @ingroup groupMath
 */
 /**
  @addtogroup BasicDotProd
  @{
 */
 /**
  @brief         Dot product of Q7 vectors.
  @param[in]     pSrcA      points to the first input vector
  @param[in]     pSrcB      points to the second input vector
  @param[in]     blockSize  number of samples in each vector
  @param[out]    result     output result returned here
  @return        none
  @par           Scaling and Overflow Behavior
                   The intermediate multiplications are in 1.7 x 1.7 = 2.14 format and these
                   results are added to an accumulator in 18.14 format.
                   Nonsaturating additions are used and there is no danger of wrap around as long as
                   the vectors are less than 2^18 elements long.
                   The return result is in 18.14 format.
 */
 #if defined(ARM_MATH_MVEI)
 #include "arm_helium_utils.h"
 /*
  * Helium (MVE) variant of the Q7 dot product.
  * Accumulates pSrcA[n] * pSrcB[n] for 0 <= n < blockSize into a 32-bit
  * sum and writes it to *result; the result is 0 when blockSize is 0.
  */
 void arm_dot_prod_q7(
     const q7_t * pSrcA,
     const q7_t * pSrcB,
     uint32_t blockSize,
     q31_t * result)
 {
     uint32_t  blkCnt;           /* Loop counter */
     q7x16_t vecA;
     q7x16_t vecB;
     q31_t     sum = 0;          /* 32-bit accumulator */
     /* Compute 16 products at a time */
     blkCnt = blockSize >> 4;
     while (blkCnt > 0U)
     {
         /* Multiply sixteen sample pairs and add all products to sum */
         vecA = vld1q(pSrcA);
         vecB = vld1q(pSrcB);
         sum = vmladavaq(sum, vecA, vecB);
         /* Decrement the loop counter */
         blkCnt--;
         /* Advance the source pointers */
         pSrcA += 16;
         pSrcB += 16;
     }
     /* Tail: 1 to 15 leftover samples */
     blkCnt = blockSize & 0xF;
     if (blkCnt > 0U)
     {
         mve_pred16_t p0 = vctp8q(blkCnt);
         /* Predicated loads so that nothing beyond blockSize samples is read */
         vecA = vld1q_z(pSrcA, p0);
         vecB = vld1q_z(pSrcB, p0);
         /* Only the lanes enabled by p0 contribute to the sum */
         sum = vmladavaq_p(sum, vecA, vecB, p0);
     }
     *result = sum;
 }
 #else
 /*
  * Scalar / DSP-extension variant of the Q7 dot product.
  * Every product pSrcA[n] * pSrcB[n] is added, without saturation, to a
  * 32-bit accumulator that is finally written to *result.
  */
 void arm_dot_prod_q7(
   const q7_t * pSrcA,
   const q7_t * pSrcB,
         uint32_t blockSize,
         q31_t * result)
 {
   uint32_t remaining;                            /* Samples still to be processed */
   q31_t acc = 0;                                 /* 32-bit running sum of products */
 #if defined (ARM_MATH_LOOPUNROLL)
 #if defined (ARM_MATH_DSP)
   q31_t packedA, packedB;                        /* Four packed q7 samples per input */
   q31_t aRot, aRaw, bRot, bRaw;                  /* Sample pairs widened to q15 */
 #endif
   /* Unrolled section: four sample pairs per pass */
   for (remaining = blockSize >> 2U; remaining > 0U; remaining--)
   {
 #if defined (ARM_MATH_DSP)
     /* Fetch four samples from each source in a single read */
     packedA = read_q7x4_ia ((q7_t **) &pSrcA);
     packedB = read_q7x4_ia ((q7_t **) &pSrcB);
     /* Widen two q7 samples of each word to q15 (word rotated by 8 first) */
     aRot = __SXTB16(__ROR(packedA, 8));
     bRot = __SXTB16(__ROR(packedB, 8));
     /* Widen the remaining two samples of each word */
     aRaw = __SXTB16(packedA);
     bRaw = __SXTB16(packedB);
     /* Multiply and accumulate two sample pairs at a time */
     acc = __SMLAD(aRot, bRot, acc);
     acc = __SMLAD(aRaw, bRaw, acc);
 #else
     acc += (q31_t) ((q15_t) pSrcA[0] * pSrcB[0]);
     acc += (q31_t) ((q15_t) pSrcA[1] * pSrcB[1]);
     acc += (q31_t) ((q15_t) pSrcA[2] * pSrcB[2]);
     acc += (q31_t) ((q15_t) pSrcA[3] * pSrcB[3]);
     pSrcA += 4;
     pSrcB += 4;
 #endif
   }
   /* The 0 to 3 samples that did not fill a group of four come next */
   remaining = blockSize & 0x3U;
 #else
   /* No unrolling: the loop below handles every sample */
   remaining = blockSize;
 #endif /* #if defined (ARM_MATH_LOOPUNROLL) */
   for (; remaining > 0U; remaining--)
   {
     /* One product per pass */
     acc += (q31_t) ((q15_t) *pSrcA * *pSrcB);
     pSrcA++;
     pSrcB++;
   }
   /* Hand the accumulated sum (18.14 format) back to the caller */
   *result = acc;
 }
 #endif /* defined(ARM_MATH_MVEI) */
 /**
  @} end of BasicDotProd group
 */
--- a/libraries/cmsis/dsp/Source/BasicMathFunctions/arm_mult_f32.c
+++ b/libraries/cmsis/dsp/Source/BasicMathFunctions/arm_mult_f32.c
@@ -0,0 +1,200 @@
 /* ----------------------------------------------------------------------
 * Project:      CMSIS DSP Library
 * Title:        arm_mult_f32.c
 * Description:  Floating-point vector multiplication
 *
 * $Date:        18. March 2019
 * $Revision:    V1.6.0
 *
 * Target Processor: Cortex-M cores
 * -------------------------------------------------------------------- */
 /*
 * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
 *
 * SPDX-License-Identifier: Apache-2.0
 *
 * Licensed under the Apache License, Version 2.0 (the License); you may
 * not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an AS IS BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 #include "arm_math.h"
 /**
  @ingroup groupMath
 */
 /**
  @defgroup BasicMult Vector Multiplication
  Element-by-element multiplication of two vectors.
  <pre>
      pDst[n] = pSrcA[n] * pSrcB[n],   0 <= n < blockSize.
  </pre>
  There are separate functions for floating-point, Q7, Q15, and Q31 data types.
 */
 /**
  @addtogroup BasicMult
  @{
 */
 /**
  @brief         Floating-point vector multiplication.
  @param[in]     pSrcA      points to the first input vector.
  @param[in]     pSrcB      points to the second input vector.
  @param[out]    pDst       points to the output vector.
  @param[in]     blockSize  number of samples in each vector.
  @return        none
 */
 #if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE)
 #include "arm_helium_utils.h"
 /*
  * Helium (MVE) variant of the floating-point vector multiplication:
  * pDst[n] = pSrcA[n] * pSrcB[n] for 0 <= n < blockSize.
  */
 void arm_mult_f32(
   const float32_t * pSrcA,
   const float32_t * pSrcB,
         float32_t * pDst,
         uint32_t blockSize)
 {
     uint32_t blkCnt;                               /* Loop counter */
     f32x4_t vec1;
     f32x4_t vec2;
     f32x4_t res;
     /* Compute 4 outputs at a time */
     blkCnt = blockSize >> 2U;
     while (blkCnt > 0U)
     {
         /* C = A * B */
         /* Multiply the inputs and then store the results in the destination buffer. */
         vec1 = vld1q(pSrcA);
         vec2 = vld1q(pSrcB);
         res = vmulq(vec1, vec2);
         vst1q(pDst, res);
         /* Increment pointers */
         pSrcA += 4;
         pSrcB += 4;
         pDst += 4;
         /* Decrement the loop counter */
         blkCnt--;
     }
     /* Tail: 1 to 3 leftover samples */
     blkCnt = blockSize & 0x3;
     if (blkCnt > 0U)
     {
       /* C = A * B */
       mve_pred16_t p0 = vctp32q(blkCnt);
       /* Predicated loads so that nothing beyond blockSize samples is read */
       vec1 = vld1q_z(pSrcA, p0);
       vec2 = vld1q_z(pSrcB, p0);
       /* Predicated store so that nothing beyond blockSize samples is written */
       vstrwq_p(pDst, vmulq(vec1,vec2), p0);
     }
 }
 #else
 /*
  * NEON / scalar variant of the floating-point vector multiplication:
  * pDst[n] = pSrcA[n] * pSrcB[n] for 0 <= n < blockSize.
  */
 void arm_mult_f32(
   const float32_t * pSrcA,
   const float32_t * pSrcB,
         float32_t * pDst,
         uint32_t blockSize)
 {
     uint32_t remaining;                            /* Samples still to be processed */
 #if defined(ARM_MATH_NEON) && !defined(ARM_MATH_AUTOVECTORIZE)
     f32x4_t lhs;
     f32x4_t rhs;
     /* Vector section: four products per pass */
     for (remaining = blockSize >> 2U; remaining > 0U; remaining--)
     {
         lhs = vld1q_f32(pSrcA);
         rhs = vld1q_f32(pSrcB);
         vst1q_f32(pDst, vmulq_f32(lhs, rhs));
         /* Step all three buffers past the four samples just handled */
         pSrcA += 4;
         pSrcB += 4;
         pDst += 4;
     }
     /* The 0 to 3 samples that did not fill a vector come next */
     remaining = blockSize & 0x3;
 #else
 #if defined (ARM_MATH_LOOPUNROLL) && !defined(ARM_MATH_AUTOVECTORIZE)
   /* Unrolled section: four products per pass */
   for (remaining = blockSize >> 2U; remaining > 0U; remaining--)
   {
     pDst[0] = pSrcA[0] * pSrcB[0];
     pDst[1] = pSrcA[1] * pSrcB[1];
     pDst[2] = pSrcA[2] * pSrcB[2];
     pDst[3] = pSrcA[3] * pSrcB[3];
     pSrcA += 4;
     pSrcB += 4;
     pDst += 4;
   }
   /* The 0 to 3 samples that did not fill a group of four come next */
   remaining = blockSize & 0x3U;
 #else
   /* No vector unit and no unrolling: the loop below handles every sample */
   remaining = blockSize;
 #endif /* #if defined (ARM_MATH_LOOPUNROLL) */
 #endif /* #if defined(ARM_MATH_NEON) */
   for (; remaining > 0U; remaining--)
   {
     /* One product per pass */
     *pDst = (*pSrcA) * (*pSrcB);
     pSrcA++;
     pSrcB++;
     pDst++;
   }
 }
 #endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
 /**
  @} end of BasicMult group
 */
--- a/libraries/cmsis/dsp/Source/BasicMathFunctions/arm_mult_q15.c
+++ b/libraries/cmsis/dsp/Source/BasicMathFunctions/arm_mult_q15.c
@@ -0,0 +1,192 @@
 /* ----------------------------------------------------------------------
 * Project:      CMSIS DSP Library
 * Title:        arm_mult_q15.c
 * Description:  Q15 vector multiplication
 *
 * $Date:        18. March 2019
 * $Revision:    V1.6.0
 *
 * Target Processor: Cortex-M cores
 * -------------------------------------------------------------------- */
 /*
 * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
 *
 * SPDX-License-Identifier: Apache-2.0
 *
 * Licensed under the Apache License, Version 2.0 (the License); you may
 * not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an AS IS BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 #include "arm_math.h"
 /**
  @ingroup groupMath
 */
 /**
  @addtogroup BasicMult
  @{
 */
 /**
  @brief         Q15 vector multiplication
  @param[in]     pSrcA      points to first input vector
  @param[in]     pSrcB      points to second input vector
  @param[out]    pDst       points to output vector
  @param[in]     blockSize  number of samples in each vector
  @return        none
  @par           Scaling and Overflow Behavior
                   The function uses saturating arithmetic.
                   Results outside of the allowable Q15 range [0x8000 0x7FFF] are saturated.
 */
 #if defined(ARM_MATH_MVEI)
 #include "arm_helium_utils.h"
 /*
  * Helium (MVE) variant of the Q15 vector multiplication:
  * each output is the product of the matching inputs, computed with
  * vqdmulhq, for 0 <= n < blockSize.
  */
 void arm_mult_q15(
     const q15_t * pSrcA,
     const q15_t * pSrcB,
     q15_t * pDst,
     uint32_t blockSize)
 {
     uint32_t  blkCnt;           /* Loop counter */
     q15x8_t vecA, vecB;
     /* Compute 8 outputs at a time */
     blkCnt = blockSize >> 3;
     while (blkCnt > 0U)
     {
         /*
          * C = A * B
          * Multiply the inputs and then store the results in the destination buffer.
          */
         vecA = vld1q(pSrcA);
         vecB = vld1q(pSrcB);
         vst1q(pDst, vqdmulhq(vecA, vecB));
         /* Decrement the loop counter */
         blkCnt--;
         /* Advance the source and destination pointers */
         pSrcA  += 8;
         pSrcB  += 8;
         pDst   += 8;
     }
     /* Tail: 1 to 7 leftover samples */
     blkCnt = blockSize & 7;
     if (blkCnt > 0U)
     {
         mve_pred16_t p0 = vctp16q(blkCnt);
         /* Predicated loads so that nothing beyond blockSize samples is read */
         vecA = vld1q_z(pSrcA, p0);
         vecB = vld1q_z(pSrcB, p0);
         /* Predicated store so that nothing beyond blockSize samples is written */
         vstrhq_p(pDst, vqdmulhq(vecA, vecB), p0);
     }
 }
 #else
 /*
  * Scalar / DSP-extension variant of the Q15 vector multiplication.
  * Each 32-bit product pSrcA[n] * pSrcB[n] is shifted right by 15 bits,
  * saturated to 16 bits with __SSAT and stored in pDst[n].
  */
 void arm_mult_q15(
   const q15_t * pSrcA,
   const q15_t * pSrcB,
         q15_t * pDst,
         uint32_t blockSize)
 {
   uint32_t remaining;                            /* Samples still to be processed */
 #if defined (ARM_MATH_LOOPUNROLL)
 #if defined (ARM_MATH_DSP)
   q31_t pairA1, pairA2, pairB1, pairB2;          /* Packed input sample pairs */
   q15_t res1, res2, res3, res4;                  /* Saturated 16-bit results */
   q31_t prod1, prod2, prod3, prod4;              /* Full-precision products */
 #endif
   /* Unrolled section: four outputs per pass */
   for (remaining = blockSize >> 2U; remaining > 0U; remaining--)
   {
 #if defined (ARM_MATH_DSP)
     /* Fetch two samples at a time, alternating between the sources */
     pairA1 = read_q15x2_ia ((q15_t **) &pSrcA);
     pairB1 = read_q15x2_ia ((q15_t **) &pSrcB);
     pairA2 = read_q15x2_ia ((q15_t **) &pSrcA);
     pairB2 = read_q15x2_ia ((q15_t **) &pSrcB);
     /* Multiply matching halfwords: upper with upper, lower with lower */
     prod1 = (q31_t) ((q15_t) (pairA1 >> 16) * (q15_t) (pairB1 >> 16));
     prod2 = (q31_t) ((q15_t) (pairA1      ) * (q15_t) (pairB1      ));
     prod3 = (q31_t) ((q15_t) (pairA2 >> 16) * (q15_t) (pairB2 >> 16));
     prod4 = (q31_t) ((q15_t) (pairA2      ) * (q15_t) (pairB2      ));
     /* Scale back down and saturate to 16 bits */
     res1 = (q15_t) __SSAT(prod1 >> 15, 16);
     res2 = (q15_t) __SSAT(prod2 >> 15, 16);
     res3 = (q15_t) __SSAT(prod3 >> 15, 16);
     res4 = (q15_t) __SSAT(prod4 >> 15, 16);
     /* Re-pack two results per word; the halfword order depends on endianness */
 #ifndef ARM_MATH_BIG_ENDIAN
     write_q15x2_ia (&pDst, __PKHBT(res2, res1, 16));
     write_q15x2_ia (&pDst, __PKHBT(res4, res3, 16));
 #else
     write_q15x2_ia (&pDst, __PKHBT(res1, res2, 16));
     write_q15x2_ia (&pDst, __PKHBT(res3, res4, 16));
 #endif /* #ifndef ARM_MATH_BIG_ENDIAN */
 #else
     pDst[0] = (q15_t) __SSAT((((q31_t) (pSrcA[0]) * (pSrcB[0])) >> 15), 16);
     pDst[1] = (q15_t) __SSAT((((q31_t) (pSrcA[1]) * (pSrcB[1])) >> 15), 16);
     pDst[2] = (q15_t) __SSAT((((q31_t) (pSrcA[2]) * (pSrcB[2])) >> 15), 16);
     pDst[3] = (q15_t) __SSAT((((q31_t) (pSrcA[3]) * (pSrcB[3])) >> 15), 16);
     pSrcA += 4;
     pSrcB += 4;
     pDst += 4;
 #endif
   }
   /* The 0 to 3 samples that did not fill a group of four come next */
   remaining = blockSize & 0x3U;
 #else
   /* No unrolling: the loop below handles every sample */
   remaining = blockSize;
 #endif /* #if defined (ARM_MATH_LOOPUNROLL) */
   for (; remaining > 0U; remaining--)
   {
     /* One saturated product per pass */
     *pDst = (q15_t) __SSAT((((q31_t) (*pSrcA) * (*pSrcB)) >> 15), 16);
     pSrcA++;
     pSrcB++;
     pDst++;
   }
 }
 #endif /* defined(ARM_MATH_MVEI) */
 /**
  @} end of BasicMult group
 */
--- a/libraries/cmsis/dsp/Source/BasicMathFunctions/arm_mult_q31.c
+++ b/libraries/cmsis/dsp/Source/BasicMathFunctions/arm_mult_q31.c
@@ -0,0 +1,168 @@
 /* ----------------------------------------------------------------------
 * Project:      CMSIS DSP Library
 * Title:        arm_mult_q31.c
 * Description:  Q31 vector multiplication
 *
 * $Date:        18. March 2019
 * $Revision:    V1.6.0
 *
 * Target Processor: Cortex-M cores
 * -------------------------------------------------------------------- */
 /*
 * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
 *
 * SPDX-License-Identifier: Apache-2.0
 *
 * Licensed under the Apache License, Version 2.0 (the License); you may
 * not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an AS IS BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 #include "arm_math.h"
 /**
  @ingroup groupMath
 */
 /**
  @addtogroup BasicMult
  @{
 */
 /**
  @brief         Q31 vector multiplication.
  @param[in]     pSrcA      points to the first input vector.
  @param[in]     pSrcB      points to the second input vector.
  @param[out]    pDst       points to the output vector.
  @param[in]     blockSize  number of samples in each vector.
  @return        none
  @par           Scaling and Overflow Behavior
                   The function uses saturating arithmetic.
                   Results outside of the allowable Q31 range [0x80000000 0x7FFFFFFF] are saturated.
 */
 #if defined(ARM_MATH_MVEI)
 #include "arm_helium_utils.h"
 /*
  * Helium (MVE) implementation of the Q31 element-wise product.
  * pSrcA, pSrcB : input vectors, blockSize samples each
  * pDst         : output vector, blockSize samples
  * Every output is produced by vqdmulhq (saturating doubling multiply, high half).
  */
 void arm_mult_q31(
    const q31_t * pSrcA,
    const q31_t * pSrcB,
    q31_t * pDst,
    uint32_t blockSize)
 {
    uint32_t  blkCnt;           /* loop counter */
    q31x4_t vecA, vecB;         /* four Q31 lanes from each input */
    /* Main loop: compute 4 outputs at a time */
    blkCnt = blockSize >> 2;
    while (blkCnt > 0U)
    {
        /*
         * C = A * B
         * Multiply the inputs and then store the results in the destination buffer.
         */
        vecA = vld1q(pSrcA);
        vecB = vld1q(pSrcB);
        vst1q(pDst, vqdmulhq(vecA, vecB));
        /* Decrement the loop counter */
        blkCnt--;
        /* Advance source and destination pointers by one vector */
        pSrcA  += 4;
        pSrcB  += 4;
        pDst   += 4;
    }
    /*
     * Tail: 1 to 3 samples left.
     */
    blkCnt = blockSize & 3;
    if (blkCnt > 0U)
    {
        mve_pred16_t p0 = vctp32q(blkCnt);
        /*
         * Predicated (zeroing) loads: only the blkCnt active lanes are read,
         * so no memory beyond the end of either input buffer is touched.
         */
        vecA = vld1q_z(pSrcA, p0);
        vecB = vld1q_z(pSrcB, p0);
        /* Only the active lanes are written back */
        vstrwq_p(pDst, vqdmulhq(vecA, vecB), p0);
    }
 }
 #else
 /*
  * Scalar implementation of the Q31 element-wise product.
  * pSrcA, pSrcB : input vectors, blockSize samples each
  * pDst         : output vector, blockSize samples
  *
  * Each sample keeps the upper 32 bits of the 64-bit product, clamps them to
  * 31 bits with __SSAT and then doubles the value to return to Q31 scaling.
  * The doubling is done on an unsigned copy: left-shifting a negative signed
  * value is undefined behaviour in ISO C, while the unsigned shift yields the
  * same bit pattern without relying on it.
  */
 void arm_mult_q31(
   const q31_t * pSrcA,
   const q31_t * pSrcB,
         q31_t * pDst,
         uint32_t blockSize)
 {
         uint32_t blkCnt;                               /* Loop counter */
         q31_t out;                                     /* Temporary output variable */
 #if defined (ARM_MATH_LOOPUNROLL)
   /* Loop unrolling: Compute 4 outputs at a time */
   blkCnt = blockSize >> 2U;
   while (blkCnt > 0U)
   {
     /* C = A * B */
     /* Multiply inputs and store result in destination buffer. */
     out = ((q63_t) *pSrcA++ * *pSrcB++) >> 32;
     out = __SSAT(out, 31);
     *pDst++ = (q31_t) ((uint32_t) out << 1U);
     out = ((q63_t) *pSrcA++ * *pSrcB++) >> 32;
     out = __SSAT(out, 31);
     *pDst++ = (q31_t) ((uint32_t) out << 1U);
     out = ((q63_t) *pSrcA++ * *pSrcB++) >> 32;
     out = __SSAT(out, 31);
     *pDst++ = (q31_t) ((uint32_t) out << 1U);
     out = ((q63_t) *pSrcA++ * *pSrcB++) >> 32;
     out = __SSAT(out, 31);
     *pDst++ = (q31_t) ((uint32_t) out << 1U);
     /* Decrement loop counter */
     blkCnt--;
   }
   /* Loop unrolling: Compute remaining outputs */
   blkCnt = blockSize % 0x4U;
 #else
   /* Initialize blkCnt with number of samples */
   blkCnt = blockSize;
 #endif /* #if defined (ARM_MATH_LOOPUNROLL) */
   while (blkCnt > 0U)
   {
     /* C = A * B */
     /* Multiply inputs and store result in destination buffer. */
     out = ((q63_t) *pSrcA++ * *pSrcB++) >> 32;
     out = __SSAT(out, 31);
     /* Unsigned shift: see the function header comment */
     *pDst++ = (q31_t) ((uint32_t) out << 1U);
     /* Decrement loop counter */
     blkCnt--;
   }
 }
 #endif /* defined(ARM_MATH_MVEI) */
 /**
  @} end of BasicMult group
 */
--- a/libraries/cmsis/dsp/Source/BasicMathFunctions/arm_mult_q7.c
+++ b/libraries/cmsis/dsp/Source/BasicMathFunctions/arm_mult_q7.c
@@ -0,0 +1,168 @@
 /* ----------------------------------------------------------------------
 * Project:      CMSIS DSP Library
 * Title:        arm_mult_q7.c
 * Description:  Q7 vector multiplication
 *
 * $Date:        18. March 2019
 * $Revision:    V1.6.0
 *
 * Target Processor: Cortex-M cores
 * -------------------------------------------------------------------- */
 /*
 * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
 *
 * SPDX-License-Identifier: Apache-2.0
 *
 * Licensed under the Apache License, Version 2.0 (the License); you may
 * not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an AS IS BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 #include "arm_math.h"
 /**
  @ingroup groupMath
 */
 /**
  @addtogroup BasicMult
  @{
 */
 /**
  @brief         Q7 vector multiplication
  @param[in]     pSrcA      points to the first input vector
  @param[in]     pSrcB      points to the second input vector
  @param[out]    pDst       points to the output vector
  @param[in]     blockSize  number of samples in each vector
  @return        none
  @par           Scaling and Overflow Behavior
                   The function uses saturating arithmetic.
                   Results outside of the allowable Q7 range [0x80 0x7F] are saturated.
 */
 #if defined(ARM_MATH_MVEI)
 #include "arm_helium_utils.h"
 /*
  * Helium (MVE) implementation of the Q7 element-wise product.
  * pSrcA, pSrcB : input vectors, blockSize samples each
  * pDst         : output vector, blockSize samples
  * Every output is produced by vqdmulhq (saturating doubling multiply, high half).
  */
 void arm_mult_q7(
    const q7_t * pSrcA,
    const q7_t * pSrcB,
    q7_t * pDst,
    uint32_t blockSize)
 {
    uint32_t  blkCnt;           /* loop counter */
    q7x16_t vecA, vecB;         /* sixteen Q7 lanes from each input */
    /* Main loop: compute 16 outputs at a time */
    blkCnt = blockSize >> 4;
    while (blkCnt > 0U)
    {
        /*
         * C = A * B
         * Multiply the inputs and then store the results in the destination buffer.
         */
        vecA = vld1q(pSrcA);
        vecB = vld1q(pSrcB);
        vst1q(pDst, vqdmulhq(vecA, vecB));
        /* Decrement the loop counter */
        blkCnt--;
        /* Advance source and destination pointers by one vector */
        pSrcA  += 16;
        pSrcB  += 16;
        pDst   += 16;
    }
    /*
     * Tail: 1 to 15 samples left.
     */
    blkCnt = blockSize & 0xF;
    if (blkCnt > 0U)
    {
        mve_pred16_t p0 = vctp8q(blkCnt);
        /*
         * Predicated (zeroing) loads: only the blkCnt active lanes are read,
         * so no memory beyond the end of either input buffer is touched.
         */
        vecA = vld1q_z(pSrcA, p0);
        vecB = vld1q_z(pSrcB, p0);
        /* Only the active lanes are written back */
        vstrbq_p(pDst, vqdmulhq(vecA, vecB), p0);
    }
 }
 #else
 /*
  * Scalar implementation of the Q7 element-wise product.
  * pSrcA, pSrcB : input vectors, blockSize samples each
  * pDst         : output vector, blockSize samples
  * Each product is shifted right by 7 and saturated to 8 bits with __SSAT.
  */
 void arm_mult_q7(
   const q7_t * pSrcA,
   const q7_t * pSrcB,
         q7_t * pDst,
         uint32_t blockSize)
 {
         uint32_t blkCnt;                               /* Iterations left in the current loop */
 #if defined (ARM_MATH_LOOPUNROLL)
 #if defined (ARM_MATH_DSP)
   q7_t res0, res1, res2, res3;                   /* Saturated products awaiting one packed store */
 #endif
   /* Unrolled section: four products per pass */
   for (blkCnt = blockSize >> 2U; blkCnt > 0U; blkCnt--)
   {
 #if defined (ARM_MATH_DSP)
     /* Compute four results, then emit them with a single 32-bit write */
     res0 = (q7_t) __SSAT((((q15_t) (*pSrcA++) * (*pSrcB++)) >> 7), 8);
     res1 = (q7_t) __SSAT((((q15_t) (*pSrcA++) * (*pSrcB++)) >> 7), 8);
     res2 = (q7_t) __SSAT((((q15_t) (*pSrcA++) * (*pSrcB++)) >> 7), 8);
     res3 = (q7_t) __SSAT((((q15_t) (*pSrcA++) * (*pSrcB++)) >> 7), 8);
     write_q7x4_ia (&pDst, __PACKq7(res0, res1, res2, res3));
 #else
     /* Compute and store the four results one by one */
     *pDst++ = (q7_t) __SSAT((((q15_t) (*pSrcA++) * (*pSrcB++)) >> 7), 8);
     *pDst++ = (q7_t) __SSAT((((q15_t) (*pSrcA++) * (*pSrcB++)) >> 7), 8);
     *pDst++ = (q7_t) __SSAT((((q15_t) (*pSrcA++) * (*pSrcB++)) >> 7), 8);
     *pDst++ = (q7_t) __SSAT((((q15_t) (*pSrcA++) * (*pSrcB++)) >> 7), 8);
 #endif
   }
   /* Samples that did not fill a group of four */
   blkCnt = blockSize & 0x3U;
 #else
   /* No unrolling: every sample goes through the loop below */
   blkCnt = blockSize;
 #endif /* #if defined (ARM_MATH_LOOPUNROLL) */
   for (; blkCnt > 0U; blkCnt--)
   {
     /* C = A * B, one sample per pass */
     *pDst++ = (q7_t) __SSAT((((q15_t) (*pSrcA++) * (*pSrcB++)) >> 7), 8);
   }
 }
 #endif /* defined(ARM_MATH_MVEI) */
 /**
  @} end of BasicMult group
 */
--- a/libraries/cmsis/dsp/Source/BasicMathFunctions/arm_negate_f32.c
+++ b/libraries/cmsis/dsp/Source/BasicMathFunctions/arm_negate_f32.c
@@ -0,0 +1,192 @@
 /* ----------------------------------------------------------------------
 * Project:      CMSIS DSP Library
 * Title:        arm_negate_f32.c
 * Description:  Negates floating-point vectors
 *
 * $Date:        18. March 2019
 * $Revision:    V1.6.0
 *
 * Target Processor: Cortex-M cores
 * -------------------------------------------------------------------- */
 /*
 * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
 *
 * SPDX-License-Identifier: Apache-2.0
 *
 * Licensed under the Apache License, Version 2.0 (the License); you may
 * not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an AS IS BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 #include "arm_math.h"
 /**
  @ingroup groupMath
 */
 /**
  @defgroup BasicNegate Vector Negate
  Negates the elements of a vector.
  <pre>
      pDst[n] = -pSrc[n],   0 <= n < blockSize.
  </pre>
  The functions support in-place computation allowing the source and
  destination pointers to reference the same memory buffer.
  There are separate functions for floating-point, Q7, Q15, and Q31 data types.
 */
 /**
  @addtogroup BasicNegate
  @{
 */
 /**
  @brief         Negates the elements of a floating-point vector.
  @param[in]     pSrc       points to input vector.
  @param[out]    pDst       points to output vector.
  @param[in]     blockSize  number of samples in each vector.
  @return        none
 */
 #if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE)
 #include "arm_helium_utils.h"
 /*
  * Helium (MVE) implementation of floating-point vector negation.
  * pSrc      : input vector, blockSize samples
  * pDst      : output vector, blockSize samples
  */
 void arm_negate_f32(
   const float32_t * pSrc,
         float32_t * pDst,
         uint32_t blockSize)
 {
    uint32_t blkCnt;                               /* Loop counter */
    f32x4_t vec1;                                  /* Four input samples */
    f32x4_t res;                                   /* Four negated samples */
    /* Main loop: compute 4 outputs at a time */
    blkCnt = blockSize >> 2U;
    while (blkCnt > 0U)
    {
        /* C = -A */
        /* Negate and then store the results in the destination buffer. */
        vec1 = vld1q(pSrc);
        res = vnegq(vec1);
        vst1q(pDst, res);
        /* Increment pointers */
        pSrc += 4;
        pDst += 4;
        /* Decrement the loop counter */
        blkCnt--;
    }
    /* Tail: 1 to 3 samples left */
    blkCnt = blockSize & 0x3;
    if (blkCnt > 0U)
    {
      /* C = -A */
      mve_pred16_t p0 = vctp32q(blkCnt);
      /*
       * Predicated (zeroing) load: only the blkCnt active lanes are read,
       * so no memory beyond the end of the input buffer is touched.
       */
      vec1 = vld1q_z((float32_t const *) pSrc, p0);
      /* Only the active lanes are written back */
      vstrwq_p(pDst, vnegq(vec1), p0);
    }
 }
 #else
 /*
  * Scalar / NEON implementation of floating-point vector negation.
  * pSrc      : input vector, blockSize samples
  * pDst      : output vector, blockSize samples
  * A vector or unrolled section (selected at compile time) handles groups of
  * four samples; the final loop handles whatever is left.
  */
 void arm_negate_f32(
   const float32_t * pSrc,
         float32_t * pDst,
         uint32_t blockSize)
 {
         uint32_t blkCnt;                               /* Iterations left in the current loop */
 #if defined(ARM_MATH_NEON_EXPERIMENTAL) && !defined(ARM_MATH_AUTOVECTORIZE)
    /* NEON section: four samples per pass */
    for (blkCnt = blockSize >> 2U; blkCnt > 0U; blkCnt--)
    {
        /* C = -A on a whole vector */
        const f32x4_t negated = vnegq_f32(vld1q_f32(pSrc));
        vst1q_f32(pDst, negated);
        /* Step both pointers past the vector just handled */
        pSrc += 4;
        pDst += 4;
    }
    /* Samples that did not fill a vector */
    blkCnt = blockSize & 0x3;
 #elif defined (ARM_MATH_LOOPUNROLL) && !defined(ARM_MATH_AUTOVECTORIZE)
   /* Unrolled section: four samples per pass */
   for (blkCnt = blockSize >> 2U; blkCnt > 0U; blkCnt--)
   {
     /* C = -A */
     *pDst++ = -*pSrc++;
     *pDst++ = -*pSrc++;
     *pDst++ = -*pSrc++;
     *pDst++ = -*pSrc++;
   }
   /* Samples that did not fill a group of four */
   blkCnt = blockSize & 0x3U;
 #else
   /* No vector or unrolled section: every sample goes through the loop below */
   blkCnt = blockSize;
 #endif /* NEON / loop-unrolling selection */
   for (; blkCnt > 0U; blkCnt--)
   {
     /* C = -A, one sample per pass */
     *pDst++ = -*pSrc++;
   }
 }
 #endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
 /**
  @} end of BasicNegate group
 */
--- a/libraries/cmsis/dsp/Source/BasicMathFunctions/arm_negate_q15.c
+++ b/libraries/cmsis/dsp/Source/BasicMathFunctions/arm_negate_q15.c
@@ -0,0 +1,171 @@
 /* ----------------------------------------------------------------------
 * Project:      CMSIS DSP Library
 * Title:        arm_negate_q15.c
 * Description:  Negates Q15 vectors
 *
 * $Date:        18. March 2019
 * $Revision:    V1.6.0
 *
 * Target Processor: Cortex-M cores
 * -------------------------------------------------------------------- */
 /*
 * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
 *
 * SPDX-License-Identifier: Apache-2.0
 *
 * Licensed under the Apache License, Version 2.0 (the License); you may
 * not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an AS IS BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 #include "arm_math.h"
 /**
  @ingroup groupMath
 */
 /**
  @addtogroup BasicNegate
  @{
 */
 /**
  @brief         Negates the elements of a Q15 vector.
  @param[in]     pSrc       points to the input vector.
  @param[out]    pDst       points to the output vector.
  @param[in]     blockSize  number of samples in each vector.
  @return        none
  @par           Conditions for optimum performance
                   Input and output buffers should be aligned on a 32-bit boundary.
  @par           Scaling and Overflow Behavior
                   The function uses saturating arithmetic.
                   The Q15 value -1 (0x8000) is saturated to the maximum allowable positive value 0x7FFF.
 */
 #if defined(ARM_MATH_MVEI)
 #include "arm_helium_utils.h"
 /*
  * Helium (MVE) implementation of Q15 vector negation.
  * pSrc      : input vector, blockSize samples
  * pDst      : output vector, blockSize samples
  * Negation uses vqnegq, the saturating negate.
  */
 void arm_negate_q15(
    const q15_t  * pSrc,
    q15_t  * pDst,
    uint32_t blockSize)
 {
    uint32_t  blkCnt;           /* loop counter */
    q15x8_t vecSrc;             /* eight Q15 input lanes */
    /* Main loop: compute 8 outputs at a time */
    blkCnt = blockSize >> 3;
    while (blkCnt > 0U)
    {
        /*
         * C = -A
         * Negate and then store the results in the destination buffer.
         */
        vecSrc = vld1q(pSrc);
        vst1q(pDst, vqnegq(vecSrc));
        /* Decrement the loop counter */
        blkCnt--;
        /* Advance source and destination pointers by one vector */
        pSrc += 8;
        pDst += 8;
    }
    /*
     * Tail: 1 to 7 samples left.
     */
    blkCnt = blockSize & 7;
    if (blkCnt > 0U)
    {
        mve_pred16_t p0 = vctp16q(blkCnt);
        /*
         * Predicated (zeroing) load: only the blkCnt active lanes are read,
         * so no memory beyond the end of the input buffer is touched.
         */
        vecSrc = vld1q_z(pSrc, p0);
        /* Only the active lanes are written back */
        vstrhq_p(pDst, vqnegq(vecSrc), p0);
    }
 }
 #else
 /*
  * Scalar implementation of Q15 vector negation.
  * pSrc      : input vector, blockSize samples
  * pDst      : output vector, blockSize samples
  * The input 0x8000 has no positive counterpart and is mapped to 0x7fff.
  */
 void arm_negate_q15(
   const q15_t * pSrc,
         q15_t * pDst,
         uint32_t blockSize)
 {
         uint32_t blkCnt;                               /* Iterations left in the current loop */
         q15_t sample;                                  /* Input sample being negated */
 #if defined (ARM_MATH_LOOPUNROLL)
 #if defined (ARM_MATH_DSP)
   q31_t pair;                                   /* Two Q15 samples packed in one word */
 #endif
   /* Unrolled section: four samples per pass */
   for (blkCnt = blockSize >> 2U; blkCnt > 0U; blkCnt--)
   {
 #if defined (ARM_MATH_DSP)
     /* C = -A: subtract each packed pair from zero with __QSUB16 */
     pair = read_q15x2_ia ((q15_t **) &pSrc);
     write_q15x2_ia (&pDst, __QSUB16(0, pair));
     pair = read_q15x2_ia ((q15_t **) &pSrc);
     write_q15x2_ia (&pDst, __QSUB16(0, pair));
 #else
     /* C = -A, with 0x8000 replaced by 0x7fff */
     sample = *pSrc++;
     *pDst++ = (sample != (q15_t) 0x8000) ? -sample : (q15_t) 0x7fff;
     sample = *pSrc++;
     *pDst++ = (sample != (q15_t) 0x8000) ? -sample : (q15_t) 0x7fff;
     sample = *pSrc++;
     *pDst++ = (sample != (q15_t) 0x8000) ? -sample : (q15_t) 0x7fff;
     sample = *pSrc++;
     *pDst++ = (sample != (q15_t) 0x8000) ? -sample : (q15_t) 0x7fff;
 #endif
   }
   /* Samples that did not fill a group of four */
   blkCnt = blockSize & 0x3U;
 #else
   /* No unrolling: every sample goes through the loop below */
   blkCnt = blockSize;
 #endif /* #if defined (ARM_MATH_LOOPUNROLL) */
   for (; blkCnt > 0U; blkCnt--)
   {
     /* C = -A, one sample per pass */
     sample = *pSrc++;
     *pDst++ = (sample != (q15_t) 0x8000) ? -sample : (q15_t) 0x7fff;
   }
 }
 #endif /* defined(ARM_MATH_MVEI) */
 /**
  @} end of BasicNegate group
 */
--- a/libraries/cmsis/dsp/Source/BasicMathFunctions/arm_negate_q31.c
+++ b/libraries/cmsis/dsp/Source/BasicMathFunctions/arm_negate_q31.c
@@ -0,0 +1,178 @@
 /* ----------------------------------------------------------------------
 * Project:      CMSIS DSP Library
 * Title:        arm_negate_q31.c
 * Description:  Negates Q31 vectors
 *
 * $Date:        18. March 2019
 * $Revision:    V1.6.0
 *
 * Target Processor: Cortex-M cores
 * -------------------------------------------------------------------- */
 /*
 * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
 *
 * SPDX-License-Identifier: Apache-2.0
 *
 * Licensed under the Apache License, Version 2.0 (the License); you may
 * not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an AS IS BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 #include "arm_math.h"
 /**
  @ingroup groupMath
 */
 /**
  @addtogroup BasicNegate
  @{
 */
 /**
  @brief         Negates the elements of a Q31 vector.
  @param[in]     pSrc       points to the input vector.
  @param[out]    pDst       points to the output vector.
  @param[in]     blockSize   number of samples in each vector.
  @return        none
  @par           Scaling and Overflow Behavior
                   The function uses saturating arithmetic.
                   The Q31 value -1 (0x80000000) is saturated to the maximum allowable positive value 0x7FFFFFFF.
 */
 #if defined(ARM_MATH_MVEI)
 #include "arm_helium_utils.h"
 /*
  * Helium (MVE) implementation of Q31 vector negation.
  * pSrc      : input vector, blockSize samples
  * pDst      : output vector, blockSize samples
  * Negation uses vqnegq, the saturating negate.
  */
 void arm_negate_q31(
    const q31_t * pSrc,
    q31_t * pDst,
    uint32_t blockSize)
 {
    uint32_t  blkCnt;           /* loop counter */
    q31x4_t vecSrc;             /* four Q31 input lanes */
    /* Main loop: compute 4 outputs at a time */
    blkCnt = blockSize >> 2;
    while (blkCnt > 0U)
    {
        /*
         * C = -A
         * Negate and then store the results in the destination buffer.
         */
        vecSrc = vld1q(pSrc);
        vst1q(pDst, vqnegq(vecSrc));
        /* Decrement the loop counter */
        blkCnt--;
        /* Advance source and destination pointers by one vector */
        pSrc += 4;
        pDst += 4;
    }
    /*
     * Tail: 1 to 3 samples left.
     */
    blkCnt = blockSize & 3;
    if (blkCnt > 0U)
    {
        mve_pred16_t p0 = vctp32q(blkCnt);
        /*
         * Predicated (zeroing) load: only the blkCnt active lanes are read,
         * so no memory beyond the end of the input buffer is touched.
         */
        vecSrc = vld1q_z(pSrc, p0);
        /* Only the active lanes are written back */
        vstrwq_p(pDst, vqnegq(vecSrc), p0);
    }
 }
 #else
 /*
  * Scalar implementation of Q31 vector negation.
  * pSrc      : input vector, blockSize samples
  * pDst      : output vector, blockSize samples
  * With ARM_MATH_DSP each sample is subtracted from zero with __QSUB;
  * otherwise INT32_MIN is mapped to INT32_MAX and all other values are negated.
  */
 void arm_negate_q31(
   const q31_t * pSrc,
         q31_t * pDst,
         uint32_t blockSize)
 {
         uint32_t blkCnt;                               /* Iterations left in the current loop */
         q31_t sample;                                  /* Input sample being negated */
 #if defined (ARM_MATH_LOOPUNROLL)
   /* Unrolled section: four samples per pass */
   for (blkCnt = blockSize >> 2U; blkCnt > 0U; blkCnt--)
   {
     /* C = -A */
 #if defined (ARM_MATH_DSP)
     sample = *pSrc++;
     *pDst++ = __QSUB(0, sample);
     sample = *pSrc++;
     *pDst++ = __QSUB(0, sample);
     sample = *pSrc++;
     *pDst++ = __QSUB(0, sample);
     sample = *pSrc++;
     *pDst++ = __QSUB(0, sample);
 #else
     sample = *pSrc++;
     *pDst++ = (sample != INT32_MIN) ? -sample : INT32_MAX;
     sample = *pSrc++;
     *pDst++ = (sample != INT32_MIN) ? -sample : INT32_MAX;
     sample = *pSrc++;
     *pDst++ = (sample != INT32_MIN) ? -sample : INT32_MAX;
     sample = *pSrc++;
     *pDst++ = (sample != INT32_MIN) ? -sample : INT32_MAX;
 #endif
   }
   /* Samples that did not fill a group of four */
   blkCnt = blockSize & 0x3U;
 #else
   /* No unrolling: every sample goes through the loop below */
   blkCnt = blockSize;
 #endif /* #if defined (ARM_MATH_LOOPUNROLL) */
   for (; blkCnt > 0U; blkCnt--)
   {
     /* C = -A, one sample per pass */
     sample = *pSrc++;
 #if defined (ARM_MATH_DSP)
     *pDst++ = __QSUB(0, sample);
 #else
     *pDst++ = (sample != INT32_MIN) ? -sample : INT32_MAX;
 #endif
   }
 }
 #endif /* defined(ARM_MATH_MVEI) */
 /**
  @} end of BasicNegate group
 */
--- a/libraries/cmsis/dsp/Source/BasicMathFunctions/arm_negate_q7.c
+++ b/libraries/cmsis/dsp/Source/BasicMathFunctions/arm_negate_q7.c
@@ -0,0 +1,171 @@
 /* ----------------------------------------------------------------------
 * Project:      CMSIS DSP Library
 * Title:        arm_negate_q7.c
 * Description:  Negates Q7 vectors
 *
 * $Date:        18. March 2019
 * $Revision:    V1.6.0
 *
 * Target Processor: Cortex-M cores
 * -------------------------------------------------------------------- */
 /*
 * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
 *
 * SPDX-License-Identifier: Apache-2.0
 *
 * Licensed under the Apache License, Version 2.0 (the License); you may
 * not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an AS IS BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 #include "arm_math.h"
 /**
  @ingroup groupMath
 */
 /**
  @addtogroup BasicNegate
  @{
 */
 /**
  @brief         Negates the elements of a Q7 vector.
  @param[in]     pSrc       points to the input vector.
  @param[out]    pDst       points to the output vector.
  @param[in]     blockSize   number of samples in each vector.
  @return        none
  @par           Scaling and Overflow Behavior
                   The function uses saturating arithmetic.
                   The Q7 value -1 (0x80) is saturated to the maximum allowable positive value 0x7F.
 */
 #if defined(ARM_MATH_MVEI)
 #include "arm_helium_utils.h"
 /*
  * Helium (MVE) implementation of Q7 vector negation.
  * pSrc      : input vector, blockSize samples
  * pDst      : output vector, blockSize samples
  * Negation uses vqnegq, the saturating negate.
  */
 void arm_negate_q7(
    const q7_t   * pSrc,
    q7_t   * pDst,
    uint32_t blockSize)
 {
    uint32_t  blkCnt;           /* loop counter */
    q7x16_t vecSrc;             /* sixteen Q7 input lanes */
    /* Main loop: compute 16 outputs at a time */
    blkCnt = blockSize >> 4;
    while (blkCnt > 0U)
    {
        /*
         * C = -A
         * Negate and then store the results in the destination buffer.
         */
        vecSrc = vld1q(pSrc);
        vst1q(pDst, vqnegq(vecSrc));
        /* Decrement the loop counter */
        blkCnt--;
        /* Advance source and destination pointers by one vector */
        pSrc += 16;
        pDst += 16;
    }
    /*
     * Tail: 1 to 15 samples left.
     */
    blkCnt = blockSize & 0xF;
    if (blkCnt > 0U)
    {
        mve_pred16_t p0 = vctp8q(blkCnt);
        /*
         * Predicated (zeroing) load: only the blkCnt active lanes are read,
         * so no memory beyond the end of the input buffer is touched.
         */
        vecSrc = vld1q_z(pSrc, p0);
        /* Only the active lanes are written back */
        vstrbq_p(pDst, vqnegq(vecSrc), p0);
    }
 }
 #else
 /*
  * Scalar implementation of Q7 vector negation.
  * pSrc      : input vector, blockSize samples
  * pDst      : output vector, blockSize samples
  * Without ARM_MATH_DSP the input 0x80 is mapped to 0x7f and all other values
  * are negated; with ARM_MATH_DSP the work is done by __QSUB8.
  */
 void arm_negate_q7(
   const q7_t * pSrc,
         q7_t * pDst,
         uint32_t blockSize)
 {
         uint32_t blkCnt;                               /* Iterations left in the current loop */
         q7_t sample;                                   /* Input sample being negated */
 #if defined (ARM_MATH_LOOPUNROLL)
 #if defined (ARM_MATH_DSP)
   q31_t quad;                                   /* Four Q7 samples packed in one word */
 #endif
   /* Unrolled section: four samples per pass */
   for (blkCnt = blockSize >> 2U; blkCnt > 0U; blkCnt--)
   {
 #if defined (ARM_MATH_DSP)
     /* C = -A: subtract the four packed samples from zero with __QSUB8 */
     quad = read_q7x4_ia ((q7_t **) &pSrc);
     write_q7x4_ia (&pDst, __QSUB8(0, quad));
 #else
     /* C = -A, with 0x80 replaced by 0x7f */
     sample = *pSrc++;
     *pDst++ = (sample != (q7_t) 0x80) ? -sample : (q7_t) 0x7f;
     sample = *pSrc++;
     *pDst++ = (sample != (q7_t) 0x80) ? -sample : (q7_t) 0x7f;
     sample = *pSrc++;
     *pDst++ = (sample != (q7_t) 0x80) ? -sample : (q7_t) 0x7f;
     sample = *pSrc++;
     *pDst++ = (sample != (q7_t) 0x80) ? -sample : (q7_t) 0x7f;
 #endif
   }
   /* Samples that did not fill a group of four */
   blkCnt = blockSize & 0x3U;
 #else
   /* No unrolling: every sample goes through the loop below */
   blkCnt = blockSize;
 #endif /* #if defined (ARM_MATH_LOOPUNROLL) */
   for (; blkCnt > 0U; blkCnt--)
   {
     /* C = -A, one sample per pass */
     sample = *pSrc++;
 #if defined (ARM_MATH_DSP)
     *pDst++ = (q7_t) __QSUB8(0, sample);
 #else
     *pDst++ = (sample != (q7_t) 0x80) ? -sample : (q7_t) 0x7f;
 #endif
   }
 }
 #endif /* defined(ARM_MATH_MVEI) */
 /**
  @} end of BasicNegate group
 */
--- a/libraries/cmsis/dsp/Source/BasicMathFunctions/arm_not_u16.c
+++ b/libraries/cmsis/dsp/Source/BasicMathFunctions/arm_not_u16.c
@@ -0,0 +1,130 @@
 /* ----------------------------------------------------------------------
 * Project:      CMSIS DSP Library
 * Title:        arm_not_u16.c
 * Description:  uint16_t bitwise NOT
 *
 * $Date:        14 November 2019
 * $Revision:    V1.6.0
 *
 * Target Processor: Cortex-M cores
 * -------------------------------------------------------------------- */
 /*
 * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
 *
 * SPDX-License-Identifier: Apache-2.0
 *
 * Licensed under the Apache License, Version 2.0 (the License); you may
 * not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an AS IS BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 #include "arm_math.h"
 /**
  @ingroup groupMath
 */
 /**
  @defgroup Not Vector bitwise NOT
  Compute the logical bitwise NOT.
  There are separate functions for uint32_t, uint16_t, and uint8_t data types.
 */
 /**
  @addtogroup Not
  @{
 */
 /**
  @brief         Compute the logical bitwise NOT of a fixed-point vector.
  @param[in]     pSrc       points to input vector
  @param[out]    pDst       points to output vector
  @param[in]     blockSize  number of samples in each vector
  @return        none
 */
 /*
  * Bitwise NOT of a uint16_t vector: pDst[n] = ~pSrc[n] for 0 <= n < blockSize.
  * pSrc      : input vector, blockSize samples
  * pDst      : output vector, blockSize samples
  * A Helium (MVE), NEON or plain scalar body is selected at compile time.
  */
 void arm_not_u16(
    const uint16_t * pSrc,
          uint16_t * pDst,
          uint32_t blockSize)
 {
    uint32_t blkCnt;      /* Loop counter */
 #if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
    /* Unsigned vector type: matches both the uint16_t load and vmvnq_u16 */
    uint16x8_t vecSrc;
    /* Compute 8 outputs at a time */
    blkCnt = blockSize >> 3;
    while (blkCnt > 0U)
    {
        vecSrc = vld1q(pSrc);
        vst1q(pDst, vmvnq_u16(vecSrc) );
        pSrc += 8;
        pDst += 8;
        /* Decrement the loop counter */
        blkCnt--;
    }
    /* Tail: 1 to 7 samples left */
    blkCnt = blockSize & 7;
    if (blkCnt > 0U)
    {
        mve_pred16_t p0 = vctp16q(blkCnt);
        /*
         * Predicated (zeroing) load: only the blkCnt active lanes are read,
         * so no memory beyond the end of the input buffer is touched.
         */
        vecSrc = vld1q_z(pSrc, p0);
        /* Only the active lanes are written back */
        vstrhq_p(pDst, vmvnq_u16(vecSrc), p0);
    }
 #else
 #if defined(ARM_MATH_NEON) && !defined(ARM_MATH_AUTOVECTORIZE)
    uint16x8_t inV;
    /* Compute 8 outputs at a time */
    blkCnt = blockSize >> 3U;
    while (blkCnt > 0U)
    {
        inV = vld1q_u16(pSrc);
        vst1q_u16(pDst, vmvnq_u16(inV) );
        pSrc += 8;
        pDst += 8;
        /* Decrement the loop counter */
        blkCnt--;
    }
    /* Tail: samples that did not fill a vector go through the scalar loop */
    blkCnt = blockSize & 7;
 #else
    /* Initialize blkCnt with number of samples */
    blkCnt = blockSize;
 #endif
    while (blkCnt > 0U)
    {
        /* The complement is computed on the promoted int and truncated back to 16 bits on store */
        *pDst++ = ~(*pSrc++);
        /* Decrement the loop counter */
        blkCnt--;
    }
 #endif /* if defined(ARM_MATH_MVEI) */
 }
 /**
  @} end of Not group
 */
--- a/libraries/cmsis/dsp/Source/BasicMathFunctions/arm_not_u32.c
+++ b/libraries/cmsis/dsp/Source/BasicMathFunctions/arm_not_u32.c
@@ -0,0 +1,122 @@
 /* ----------------------------------------------------------------------
 * Project:      CMSIS DSP Library
 * Title:        arm_not_u32.c
 * Description:  uint32_t bitwise NOT
 *
 * $Date:        14 November 2019
 * $Revision:    V1.6.0
 *
 * Target Processor: Cortex-M cores
 * -------------------------------------------------------------------- */
 /*
 * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
 *
 * SPDX-License-Identifier: Apache-2.0
 *
 * Licensed under the Apache License, Version 2.0 (the License); you may
 * not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an AS IS BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 #include "arm_math.h"
 /**
  @ingroup groupMath
 */
 /**
  @addtogroup Not
  @{
 */
 /**
  @brief         Compute the logical bitwise NOT of a fixed-point vector.
  @param[in]     pSrc       points to input vector
  @param[out]    pDst       points to output vector
  @param[in]     blockSize  number of samples in each vector
  @return        none
 */
 /*
  * Bitwise NOT of a uint32_t vector: pDst[n] = ~pSrc[n] for 0 <= n < blockSize.
  * pSrc      : input vector, blockSize samples
  * pDst      : output vector, blockSize samples
  * A Helium (MVE), NEON or plain scalar body is selected at compile time.
  */
 void arm_not_u32(
    const uint32_t * pSrc,
          uint32_t * pDst,
          uint32_t blockSize)
 {
    uint32_t blkCnt;      /* Loop counter */
 #if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
    /* Unsigned vector type: matches both the uint32_t load and vmvnq_u32 */
    uint32x4_t vecSrc;
    /* Compute 4 outputs at a time */
    blkCnt = blockSize >> 2;
    while (blkCnt > 0U)
    {
        vecSrc = vld1q(pSrc);
        vst1q(pDst, vmvnq_u32(vecSrc) );
        pSrc += 4;
        pDst += 4;
        /* Decrement the loop counter */
        blkCnt--;
    }
    /* Tail: 1 to 3 samples left */
    blkCnt = blockSize & 3;
    if (blkCnt > 0U)
    {
        mve_pred16_t p0 = vctp32q(blkCnt);
        /*
         * Predicated (zeroing) load: only the blkCnt active lanes are read,
         * so no memory beyond the end of the input buffer is touched.
         */
        vecSrc = vld1q_z(pSrc, p0);
        /* Only the active lanes are written back */
        vstrwq_p(pDst, vmvnq_u32(vecSrc), p0);
    }
 #else
 #if defined(ARM_MATH_NEON) && !defined(ARM_MATH_AUTOVECTORIZE)
    uint32x4_t inV;
    /* Compute 4 outputs at a time */
    blkCnt = blockSize >> 2U;
    while (blkCnt > 0U)
    {
        inV = vld1q_u32(pSrc);
        vst1q_u32(pDst, vmvnq_u32(inV) );
        pSrc += 4;
        pDst += 4;
        /* Decrement the loop counter */
        blkCnt--;
    }
    /* Tail: samples that did not fill a vector go through the scalar loop */
    blkCnt = blockSize & 3;
 #else
    /* Initialize blkCnt with number of samples */
    blkCnt = blockSize;
 #endif
    while (blkCnt > 0U)
    {
        *pDst++ = ~(*pSrc++);
        /* Decrement the loop counter */
        blkCnt--;
    }
 #endif /* if defined(ARM_MATH_MVEI) */
 }
 /**
  @} end of Not group
 */
--- a/libraries/cmsis/dsp/Source/BasicMathFunctions/arm_not_u8.c
+++ b/libraries/cmsis/dsp/Source/BasicMathFunctions/arm_not_u8.c
@@ -0,0 +1,122 @@
 /* ----------------------------------------------------------------------
 * Project:      CMSIS DSP Library
 * Title:        arm_not_u8.c
 * Description:  uint8_t bitwise NOT
 *
 * $Date:        14 November 2019
 * $Revision:    V1.6.0
 *
 * Target Processor: Cortex-M cores
 * -------------------------------------------------------------------- */
 /*
 * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
 *
 * SPDX-License-Identifier: Apache-2.0
 *
 * Licensed under the Apache License, Version 2.0 (the License); you may
 * not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an AS IS BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 #include "arm_math.h"
 /**
  @ingroup groupMath
 */
 /**
  @addtogroup Not
  @{
 */
 /**
  @brief         Compute the bitwise NOT (one's complement) of every element of a uint8_t vector.
  @param[in]     pSrc       points to input vector
  @param[out]    pDst       points to output vector
  @param[in]     blockSize  number of samples in each vector
  @return        none
 */
 void arm_not_u8(
     const uint8_t * pSrc,
           uint8_t * pDst,
           uint32_t blockSize)
 {
     uint32_t blkCnt;      /* Loop counter */
 #if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
     /* Unsigned vector type so that it matches the uint8_t pointers and vmvnq_u8 */
     uint8x16_t vecSrc;
     /* Compute 16 outputs at a time */
     blkCnt = blockSize >> 4;
     while (blkCnt > 0U)
     {
         vecSrc = vld1q(pSrc);
         vst1q(pDst, vmvnq_u8(vecSrc));
         pSrc += 16;
         pDst += 16;
         /* Decrement the loop counter */
         blkCnt--;
     }
     /* Tail: the remaining 1..15 samples are handled with lane predication */
     blkCnt = blockSize & 0xF;
     if (blkCnt > 0U)
     {
         mve_pred16_t p0 = vctp8q(blkCnt);
         /* Predicated (zeroing) load: lanes past the end of pSrc are not read */
         vecSrc = vld1q_z(pSrc, p0);
         vstrbq_p(pDst, vmvnq_u8(vecSrc), p0);
     }
 #else
 #if defined(ARM_MATH_NEON) && !defined(ARM_MATH_AUTOVECTORIZE)
     uint8x16_t inV;
     /* Compute 16 outputs at a time */
     blkCnt = blockSize >> 4U;
     while (blkCnt > 0U)
     {
         inV = vld1q_u8(pSrc);
         vst1q_u8(pDst, vmvnq_u8(inV));
         pSrc += 16;
         pDst += 16;
         /* Decrement the loop counter */
         blkCnt--;
     }
     /* Tail: leftover samples fall through to the scalar loop below */
     blkCnt = blockSize & 0xF;
 #else
     /* No vector unit selected: the scalar loop processes every sample */
     blkCnt = blockSize;
 #endif
     while (blkCnt > 0U)
     {
         /* ~ operates on the promoted int; the cast narrows back to 8 bits */
         *pDst++ = (uint8_t) ~(*pSrc++);
         /* Decrement the loop counter */
         blkCnt--;
     }
 #endif /* if defined(ARM_MATH_MVEI) */
 }
 /**
  @} end of Not group
 */
--- a/libraries/cmsis/dsp/Source/BasicMathFunctions/arm_offset_f32.c
+++ b/libraries/cmsis/dsp/Source/BasicMathFunctions/arm_offset_f32.c
@@ -0,0 +1,196 @@
 /* ----------------------------------------------------------------------
 * Project:      CMSIS DSP Library
 * Title:        arm_offset_f32.c
 * Description:  Floating-point vector offset
 *
 * $Date:        18. March 2019
 * $Revision:    V1.6.0
 *
 * Target Processor: Cortex-M cores
 * -------------------------------------------------------------------- */
 /*
 * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
 *
 * SPDX-License-Identifier: Apache-2.0
 *
 * Licensed under the Apache License, Version 2.0 (the License); you may
 * not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an AS IS BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 #include "arm_math.h"
 /**
  @ingroup groupMath
 */
 /**
  @defgroup BasicOffset Vector Offset
  Adds a constant offset to each element of a vector.
  <pre>
      pDst[n] = pSrc[n] + offset,   0 <= n < blockSize.
  </pre>
  The functions support in-place computation allowing the source and
  destination pointers to reference the same memory buffer.
  There are separate functions for floating-point, Q7, Q15, and Q31 data types.
 */
 /**
  @addtogroup BasicOffset
  @{
 */
 /**
  @brief         Adds a constant offset to a floating-point vector.
  @param[in]     pSrc       points to the input vector
  @param[in]     offset     is the offset to be added
  @param[out]    pDst       points to the output vector
  @param[in]     blockSize  number of samples in each vector
  @return        none
 */
 #if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE)
 #include "arm_helium_utils.h"
 void arm_offset_f32(
   const float32_t * pSrc,
         float32_t offset,
         float32_t * pDst,
         uint32_t blockSize)
 {
     uint32_t blkCnt;                               /* Loop counter */
     f32x4_t vec1;
     /* Compute 4 outputs at a time */
     blkCnt = blockSize >> 2U;
     while (blkCnt > 0U)
     {
         /* C = A + offset */
         /* Add offset and then store the results in the destination buffer. */
         vec1 = vld1q(pSrc);
         vst1q(pDst, vaddq(vec1, offset));
         /* Increment pointers */
         pSrc += 4;
         pDst += 4;
         /* Decrement the loop counter */
         blkCnt--;
     }
     /* Tail: the remaining 1..3 samples are handled with lane predication */
     blkCnt = blockSize & 0x3;
     if (blkCnt > 0U)
     {
         mve_pred16_t p0 = vctp32q(blkCnt);
         /* Predicated (zeroing) load: lanes past the end of pSrc are not read,
            so no out-of-bounds data is fed into the floating-point add */
         vec1 = vld1q_z(pSrc, p0);
         vstrwq_p(pDst, vaddq(vec1, offset), p0);
     }
 }
 #else
 void arm_offset_f32(
   const float32_t * pSrc,
         float32_t offset,
         float32_t * pDst,
         uint32_t blockSize)
 {
     uint32_t blkCnt;                               /* Loop counter */
 #if defined(ARM_MATH_NEON_EXPERIMENTAL) && !defined(ARM_MATH_AUTOVECTORIZE)
     f32x4_t vec1;
     f32x4_t res;
     /* Compute 4 outputs at a time */
     blkCnt = blockSize >> 2U;
     while (blkCnt > 0U)
     {
         /* C = A + offset */
         /* Add offset and then store the results in the destination buffer. */
         vec1 = vld1q_f32(pSrc);
         res = vaddq_f32(vec1, vdupq_n_f32(offset));
         vst1q_f32(pDst, res);
         /* Increment pointers */
         pSrc += 4;
         pDst += 4;
         /* Decrement the loop counter */
         blkCnt--;
     }
     /* Tail: leftover samples fall through to the scalar loop below */
     blkCnt = blockSize & 0x3;
 #else
 #if defined (ARM_MATH_LOOPUNROLL) && !defined(ARM_MATH_AUTOVECTORIZE)
     /* Loop unrolling: Compute 4 outputs at a time */
     blkCnt = blockSize >> 2U;
     while (blkCnt > 0U)
     {
         /* C = A + offset */
         /* Add offset and store result in destination buffer. */
         *pDst++ = (*pSrc++) + offset;
         *pDst++ = (*pSrc++) + offset;
         *pDst++ = (*pSrc++) + offset;
         *pDst++ = (*pSrc++) + offset;
         /* Decrement loop counter */
         blkCnt--;
     }
     /* Loop unrolling: Compute remaining outputs */
     blkCnt = blockSize % 0x4U;
 #else
     /* Initialize blkCnt with number of samples */
     blkCnt = blockSize;
 #endif /* #if defined (ARM_MATH_LOOPUNROLL) */
 #endif /* #if defined(ARM_MATH_NEON_EXPERIMENTAL) */
     while (blkCnt > 0U)
     {
         /* C = A + offset */
         /* Add offset and store result in destination buffer. */
         *pDst++ = (*pSrc++) + offset;
         /* Decrement loop counter */
         blkCnt--;
     }
 }
 #endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
 /**
  @} end of BasicOffset group
 */
--- a/libraries/cmsis/dsp/Source/BasicMathFunctions/arm_offset_q15.c
+++ b/libraries/cmsis/dsp/Source/BasicMathFunctions/arm_offset_q15.c
@@ -0,0 +1,168 @@
 /* ----------------------------------------------------------------------
 * Project:      CMSIS DSP Library
 * Title:        arm_offset_q15.c
 * Description:  Q15 vector offset
 *
 * $Date:        18. March 2019
 * $Revision:    V1.6.0
 *
 * Target Processor: Cortex-M cores
 * -------------------------------------------------------------------- */
 /*
 * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
 *
 * SPDX-License-Identifier: Apache-2.0
 *
 * Licensed under the Apache License, Version 2.0 (the License); you may
 * not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an AS IS BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 #include "arm_math.h"
 /**
  @ingroup groupMath
 */
 /**
  @addtogroup BasicOffset
  @{
 */
 /**
  @brief         Adds a constant offset to a Q15 vector.
  @param[in]     pSrc       points to the input vector
  @param[in]     offset     is the offset to be added
  @param[out]    pDst       points to the output vector
  @param[in]     blockSize  number of samples in each vector
  @return        none
  @par           Scaling and Overflow Behavior
                   The function uses saturating arithmetic.
                   Results outside of the allowable Q15 range [0x8000 0x7FFF] are saturated.
 */
 #if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
 #include "arm_helium_utils.h"
 void arm_offset_q15(
     const q15_t * pSrc,
     q15_t   offset,
     q15_t * pDst,
     uint32_t blockSize)
 {
     uint32_t  blkCnt;           /* Loop counter */
     q15x8_t vecSrc;
     /* Compute 8 outputs at a time */
     blkCnt = blockSize >> 3;
     while (blkCnt > 0U)
     {
         /*
          * C = A + offset
          * Add offset and then store the result in the destination buffer.
          */
         vecSrc = vld1q(pSrc);
         vst1q(pDst, vqaddq(vecSrc, offset));
         /* Decrement the loop counter */
         blkCnt--;
         /* Advance vector source and destination pointers */
         pSrc += 8;
         pDst += 8;
     }
     /* Tail: the remaining 1..7 samples are handled with lane predication */
     blkCnt = blockSize & 7;
     if (blkCnt > 0U)
     {
         mve_pred16_t p0 = vctp16q(blkCnt);
         /* Predicated (zeroing) load: lanes past the end of pSrc are not read */
         vecSrc = vld1q_z(pSrc, p0);
         vstrhq_p(pDst, vqaddq(vecSrc, offset), p0);
     }
 }
 #else
 void arm_offset_q15(
   const q15_t * pSrc,
         q15_t offset,
         q15_t * pDst,
         uint32_t blockSize)
 {
     uint32_t blkCnt;                               /* Loop counter */
 #if defined (ARM_MATH_LOOPUNROLL)
 #if defined (ARM_MATH_DSP)
     q31_t offset_packed;                           /* Offset packed to 32 bit */
     /* Offset is packed to 32 bit in order to use SIMD32 for addition */
     offset_packed = __PKHBT(offset, offset, 16);
 #endif
     /* Loop unrolling: Compute 4 outputs at a time */
     blkCnt = blockSize >> 2U;
     while (blkCnt > 0U)
     {
         /* C = A + offset */
 #if defined (ARM_MATH_DSP)
         /* Add offset and store result in destination buffer (2 samples at a time). */
         write_q15x2_ia (&pDst, __QADD16(read_q15x2_ia ((q15_t **) &pSrc), offset_packed));
         write_q15x2_ia (&pDst, __QADD16(read_q15x2_ia ((q15_t **) &pSrc), offset_packed));
 #else
         *pDst++ = (q15_t) __SSAT(((q31_t) *pSrc++ + offset), 16);
         *pDst++ = (q15_t) __SSAT(((q31_t) *pSrc++ + offset), 16);
         *pDst++ = (q15_t) __SSAT(((q31_t) *pSrc++ + offset), 16);
         *pDst++ = (q15_t) __SSAT(((q31_t) *pSrc++ + offset), 16);
 #endif
         /* Decrement loop counter */
         blkCnt--;
     }
     /* Loop unrolling: Compute remaining outputs */
     blkCnt = blockSize % 0x4U;
 #else
     /* Initialize blkCnt with number of samples */
     blkCnt = blockSize;
 #endif /* #if defined (ARM_MATH_LOOPUNROLL) */
     while (blkCnt > 0U)
     {
         /* C = A + offset */
         /* Add offset and store result in destination buffer. */
 #if defined (ARM_MATH_DSP)
         /* Single sample: only the low halfword of the result survives the q15_t cast */
         *pDst++ = (q15_t) __QADD16(*pSrc++, offset);
 #else
         *pDst++ = (q15_t) __SSAT(((q31_t) *pSrc++ + offset), 16);
 #endif
         /* Decrement loop counter */
         blkCnt--;
     }
 }
 #endif /* defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE) */
 /**
  @} end of BasicOffset group
 */
--- a/libraries/cmsis/dsp/Source/BasicMathFunctions/arm_offset_q31.c
+++ b/libraries/cmsis/dsp/Source/BasicMathFunctions/arm_offset_q31.c
@@ -0,0 +1,175 @@
 /* ----------------------------------------------------------------------
 * Project:      CMSIS DSP Library
 * Title:        arm_offset_q31.c
 * Description:  Q31 vector offset
 *
 * $Date:        18. March 2019
 * $Revision:    V1.6.0
 *
 * Target Processor: Cortex-M cores
 * -------------------------------------------------------------------- */
 /*
 * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
 *
 * SPDX-License-Identifier: Apache-2.0
 *
 * Licensed under the Apache License, Version 2.0 (the License); you may
 * not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an AS IS BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 #include "arm_math.h"
 /**
  @ingroup groupMath
 */
 /**
  @addtogroup BasicOffset
  @{
 */
 /**
  @brief         Adds a constant offset to a Q31 vector.
  @param[in]     pSrc       points to the input vector
  @param[in]     offset     is the offset to be added
  @param[out]    pDst       points to the output vector
  @param[in]     blockSize  number of samples in each vector
  @return        none
  @par           Scaling and Overflow Behavior
                   The function uses saturating arithmetic.
                   Results outside of the allowable Q31 range [0x80000000 0x7FFFFFFF] are saturated.
 */
 #if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
 #include "arm_helium_utils.h"
 void arm_offset_q31(
     const q31_t * pSrc,
     q31_t   offset,
     q31_t * pDst,
     uint32_t blockSize)
 {
     uint32_t  blkCnt;           /* Loop counter */
     q31x4_t vecSrc;
     /* Compute 4 outputs at a time */
     blkCnt = blockSize >> 2;
     while (blkCnt > 0U)
     {
         /*
          * C = A + offset
          * Add offset and then store the result in the destination buffer.
          */
         vecSrc = vld1q(pSrc);
         vst1q(pDst, vqaddq(vecSrc, offset));
         /* Decrement the loop counter */
         blkCnt--;
         /* Advance vector source and destination pointers */
         pSrc += 4;
         pDst += 4;
     }
     /* Tail: the remaining 1..3 samples are handled with lane predication */
     blkCnt = blockSize & 3;
     if (blkCnt > 0U)
     {
         mve_pred16_t p0 = vctp32q(blkCnt);
         /* Predicated (zeroing) load: lanes past the end of pSrc are not read */
         vecSrc = vld1q_z(pSrc, p0);
         vstrwq_p(pDst, vqaddq(vecSrc, offset), p0);
     }
 }
 #else
 void arm_offset_q31(
   const q31_t * pSrc,
         q31_t offset,
         q31_t * pDst,
         uint32_t blockSize)
 {
     uint32_t blkCnt;                               /* Loop counter */
 #if defined (ARM_MATH_LOOPUNROLL)
     /* Loop unrolling: Compute 4 outputs at a time */
     blkCnt = blockSize >> 2U;
     while (blkCnt > 0U)
     {
         /* C = A + offset */
         /* Add offset and store result in destination buffer. */
 #if defined (ARM_MATH_DSP)
         *pDst++ = __QADD(*pSrc++, offset);
         *pDst++ = __QADD(*pSrc++, offset);
         *pDst++ = __QADD(*pSrc++, offset);
         *pDst++ = __QADD(*pSrc++, offset);
 #else
         /* Widen to 64 bit before adding so the sum cannot wrap, then clip to Q31 */
         *pDst++ = (q31_t) clip_q63_to_q31((q63_t) *pSrc++ + offset);
         *pDst++ = (q31_t) clip_q63_to_q31((q63_t) *pSrc++ + offset);
         *pDst++ = (q31_t) clip_q63_to_q31((q63_t) *pSrc++ + offset);
         *pDst++ = (q31_t) clip_q63_to_q31((q63_t) *pSrc++ + offset);
 #endif
         /* Decrement loop counter */
         blkCnt--;
     }
     /* Loop unrolling: Compute remaining outputs */
     blkCnt = blockSize % 0x4U;
 #else
     /* Initialize blkCnt with number of samples */
     blkCnt = blockSize;
 #endif /* #if defined (ARM_MATH_LOOPUNROLL) */
     while (blkCnt > 0U)
     {
         /* C = A + offset */
         /* Add offset and store result in destination buffer. */
 #if defined (ARM_MATH_DSP)
         *pDst++ = __QADD(*pSrc++, offset);
 #else
         *pDst++ = (q31_t) clip_q63_to_q31((q63_t) *pSrc++ + offset);
 #endif
         /* Decrement loop counter */
         blkCnt--;
     }
 }
 #endif /* defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE) */
 /**
  @} end of BasicOffset group
 */
--- a/libraries/cmsis/dsp/Source/BasicMathFunctions/arm_offset_q7.c
+++ b/libraries/cmsis/dsp/Source/BasicMathFunctions/arm_offset_q7.c
@@ -0,0 +1,162 @@
 /* ----------------------------------------------------------------------
 * Project:      CMSIS DSP Library
 * Title:        arm_offset_q7.c
 * Description:  Q7 vector offset
 *
 * $Date:        18. March 2019
 * $Revision:    V1.6.0
 *
 * Target Processor: Cortex-M cores
 * -------------------------------------------------------------------- */
 /*
 * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
 *
 * SPDX-License-Identifier: Apache-2.0
 *
 * Licensed under the Apache License, Version 2.0 (the License); you may
 * not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an AS IS BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 #include "arm_math.h"
 /**
  @ingroup groupMath
 */
 /**
  @addtogroup BasicOffset
  @{
 */
 /**
  @brief         Adds a constant offset to a Q7 vector.
  @param[in]     pSrc       points to the input vector
  @param[in]     offset     is the offset to be added
  @param[out]    pDst       points to the output vector
  @param[in]     blockSize  number of samples in each vector
  @return        none
  @par           Scaling and Overflow Behavior
                   The function uses saturating arithmetic.
                   Results outside of the allowable Q7 range [0x80 0x7F] are saturated.
 */
 #if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
 #include "arm_helium_utils.h"
 void arm_offset_q7(
     const q7_t * pSrc,
     q7_t   offset,
     q7_t * pDst,
     uint32_t blockSize)
 {
     uint32_t  blkCnt;           /* Loop counter */
     q7x16_t vecSrc;
     /* Compute 16 outputs at a time */
     blkCnt = blockSize >> 4;
     while (blkCnt > 0U)
     {
         /*
          * C = A + offset
          * Add offset and then store the result in the destination buffer.
          */
         vecSrc = vld1q(pSrc);
         vst1q(pDst, vqaddq(vecSrc, offset));
         /* Decrement the loop counter */
         blkCnt--;
         /* Advance vector source and destination pointers */
         pSrc += 16;
         pDst += 16;
     }
     /* Tail: the remaining 1..15 samples are handled with lane predication */
     blkCnt = blockSize & 0xF;
     if (blkCnt > 0U)
     {
         mve_pred16_t p0 = vctp8q(blkCnt);
         /* Predicated (zeroing) load: lanes past the end of pSrc are not read */
         vecSrc = vld1q_z(pSrc, p0);
         vstrbq_p(pDst, vqaddq(vecSrc, offset), p0);
     }
 }
 #else
 void arm_offset_q7(
   const q7_t * pSrc,
         q7_t offset,
         q7_t * pDst,
         uint32_t blockSize)
 {
     uint32_t blkCnt;                               /* Loop counter */
 #if defined (ARM_MATH_LOOPUNROLL)
 #if defined (ARM_MATH_DSP)
     q31_t offset_packed;                           /* Offset packed to 32 bit */
     /* Offset is packed to 32 bit in order to use SIMD32 for addition */
     offset_packed = __PACKq7(offset, offset, offset, offset);
 #endif
     /* Loop unrolling: Compute 4 outputs at a time */
     blkCnt = blockSize >> 2U;
     while (blkCnt > 0U)
     {
         /* C = A + offset */
 #if defined (ARM_MATH_DSP)
         /* Add offset and store result in destination buffer (4 samples at a time). */
         write_q7x4_ia (&pDst, __QADD8(read_q7x4_ia ((q7_t **) &pSrc), offset_packed));
 #else
         *pDst++ = (q7_t) __SSAT(*pSrc++ + offset, 8);
         *pDst++ = (q7_t) __SSAT(*pSrc++ + offset, 8);
         *pDst++ = (q7_t) __SSAT(*pSrc++ + offset, 8);
         *pDst++ = (q7_t) __SSAT(*pSrc++ + offset, 8);
 #endif
         /* Decrement loop counter */
         blkCnt--;
     }
     /* Loop unrolling: Compute remaining outputs */
     blkCnt = blockSize % 0x4U;
 #else
     /* Initialize blkCnt with number of samples */
     blkCnt = blockSize;
 #endif /* #if defined (ARM_MATH_LOOPUNROLL) */
     while (blkCnt > 0U)
     {
         /* C = A + offset */
         /* Add offset and store result in destination buffer. */
         *pDst++ = (q7_t) __SSAT((q15_t) *pSrc++ + offset, 8);
         /* Decrement loop counter */
         blkCnt--;
     }
 }
 #endif /* defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE) */
 /**
  @} end of BasicOffset group
 */
--- a/libraries/cmsis/dsp/Source/BasicMathFunctions/arm_or_u16.c
+++ b/libraries/cmsis/dsp/Source/BasicMathFunctions/arm_or_u16.c
@@ -0,0 +1,137 @@
 /* ----------------------------------------------------------------------
 * Project:      CMSIS DSP Library
 * Title:        arm_or_u16.c
 * Description:  uint16_t bitwise inclusive OR
 *
 * $Date:        14 November 2019
 * $Revision:    V1.6.0
 *
 * Target Processor: Cortex-M cores
 * -------------------------------------------------------------------- */
 /*
 * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
 *
 * SPDX-License-Identifier: Apache-2.0
 *
 * Licensed under the Apache License, Version 2.0 (the License); you may
 * not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an AS IS BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 #include "arm_math.h"
 /**
  @ingroup groupMath
 */
 /**
  @defgroup Or Vector bitwise inclusive OR
  Compute the logical bitwise OR.
  There are separate functions for uint32_t, uint16_t, and uint8_t data types.
 */
 /**
  @addtogroup Or
  @{
 */
 /**
  @brief         Compute the bitwise inclusive OR of two uint16_t vectors, element by element.
  @param[in]     pSrcA      points to input vector A
  @param[in]     pSrcB      points to input vector B
  @param[out]    pDst       points to output vector
  @param[in]     blockSize  number of samples in each vector
  @return        none
 */
 void arm_or_u16(
     const uint16_t * pSrcA,
     const uint16_t * pSrcB,
           uint16_t * pDst,
           uint32_t blockSize)
 {
     uint32_t blkCnt;      /* Loop counter */
 #if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
     /* Unsigned vector type so that it matches the uint16_t pointers and vorrq_u16 */
     uint16x8_t vecSrcA, vecSrcB;
     /* Compute 8 outputs at a time */
     blkCnt = blockSize >> 3;
     while (blkCnt > 0U)
     {
         vecSrcA = vld1q(pSrcA);
         vecSrcB = vld1q(pSrcB);
         vst1q(pDst, vorrq_u16(vecSrcA, vecSrcB));
         pSrcA += 8;
         pSrcB += 8;
         pDst  += 8;
         /* Decrement the loop counter */
         blkCnt--;
     }
     /* Tail: the remaining 1..7 samples are handled with lane predication */
     blkCnt = blockSize & 7;
     if (blkCnt > 0U)
     {
         mve_pred16_t p0 = vctp16q(blkCnt);
         /* Predicated (zeroing) loads: lanes past the end of the inputs are not read */
         vecSrcA = vld1q_z(pSrcA, p0);
         vecSrcB = vld1q_z(pSrcB, p0);
         vstrhq_p(pDst, vorrq_u16(vecSrcA, vecSrcB), p0);
     }
 #else
 #if defined(ARM_MATH_NEON) && !defined(ARM_MATH_AUTOVECTORIZE)
     uint16x8_t vecA, vecB;
     /* Compute 8 outputs at a time */
     blkCnt = blockSize >> 3U;
     while (blkCnt > 0U)
     {
         vecA = vld1q_u16(pSrcA);
         vecB = vld1q_u16(pSrcB);
         vst1q_u16(pDst, vorrq_u16(vecA, vecB));
         pSrcA += 8;
         pSrcB += 8;
         pDst  += 8;
         /* Decrement the loop counter */
         blkCnt--;
     }
     /* Tail: leftover samples fall through to the scalar loop below */
     blkCnt = blockSize & 7;
 #else
     /* No vector unit selected: the scalar loop processes every sample */
     blkCnt = blockSize;
 #endif
     while (blkCnt > 0U)
     {
         /* | operates on the promoted ints; the cast narrows back to 16 bits */
         *pDst++ = (uint16_t) ((*pSrcA++) | (*pSrcB++));
         /* Decrement the loop counter */
         blkCnt--;
     }
 #endif /* if defined(ARM_MATH_MVEI) */
 }
 /**
  @} end of Or group
 */
--- a/libraries/cmsis/dsp/Source/BasicMathFunctions/arm_or_u32.c
+++ b/libraries/cmsis/dsp/Source/BasicMathFunctions/arm_or_u32.c
@@ -0,0 +1,128 @@
 /* ----------------------------------------------------------------------
 * Project:      CMSIS DSP Library
 * Title:        arm_or_u32.c
 * Description:  uint32_t bitwise inclusive OR
 *
 * $Date:        14 November 2019
 * $Revision:    V1.6.0
 *
 * Target Processor: Cortex-M cores
 * -------------------------------------------------------------------- */
 /*
 * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
 *
 * SPDX-License-Identifier: Apache-2.0
 *
 * Licensed under the Apache License, Version 2.0 (the License); you may
 * not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an AS IS BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 #include "arm_math.h"
 /**
  @ingroup groupMath
 */
 /**
  @addtogroup Or
  @{
 */
 /**
  @brief         Compute the bitwise inclusive OR of two uint32_t vectors, element by element.
  @param[in]     pSrcA      points to input vector A
  @param[in]     pSrcB      points to input vector B
  @param[out]    pDst       points to output vector
  @param[in]     blockSize  number of samples in each vector
  @return        none
 */
 void arm_or_u32(
     const uint32_t * pSrcA,
     const uint32_t * pSrcB,
           uint32_t * pDst,
           uint32_t blockSize)
 {
     uint32_t blkCnt;      /* Loop counter */
 #if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
     /* Unsigned vector type so that it matches the uint32_t pointers and vorrq_u32 */
     uint32x4_t vecSrcA, vecSrcB;
     /* Compute 4 outputs at a time */
     blkCnt = blockSize >> 2;
     while (blkCnt > 0U)
     {
         vecSrcA = vld1q(pSrcA);
         vecSrcB = vld1q(pSrcB);
         vst1q(pDst, vorrq_u32(vecSrcA, vecSrcB));
         pSrcA += 4;
         pSrcB += 4;
         pDst  += 4;
         /* Decrement the loop counter */
         blkCnt--;
     }
     /* Tail: the remaining 1..3 samples are handled with lane predication */
     blkCnt = blockSize & 3;
     if (blkCnt > 0U)
     {
         mve_pred16_t p0 = vctp32q(blkCnt);
         /* Predicated (zeroing) loads: lanes past the end of the inputs are not read */
         vecSrcA = vld1q_z(pSrcA, p0);
         vecSrcB = vld1q_z(pSrcB, p0);
         vstrwq_p(pDst, vorrq_u32(vecSrcA, vecSrcB), p0);
     }
 #else
 #if defined(ARM_MATH_NEON) && !defined(ARM_MATH_AUTOVECTORIZE)
     uint32x4_t vecA, vecB;
     /* Compute 4 outputs at a time */
     blkCnt = blockSize >> 2U;
     while (blkCnt > 0U)
     {
         vecA = vld1q_u32(pSrcA);
         vecB = vld1q_u32(pSrcB);
         vst1q_u32(pDst, vorrq_u32(vecA, vecB));
         pSrcA += 4;
         pSrcB += 4;
         pDst  += 4;
         /* Decrement the loop counter */
         blkCnt--;
     }
     /* Tail: leftover samples fall through to the scalar loop below */
     blkCnt = blockSize & 3;
 #else
     /* No vector unit selected: the scalar loop processes every sample */
     blkCnt = blockSize;
 #endif
     while (blkCnt > 0U)
     {
         *pDst++ = (*pSrcA++) | (*pSrcB++);
         /* Decrement the loop counter */
         blkCnt--;
     }
 #endif /* if defined(ARM_MATH_MVEI) */
 }
 /**
  @} end of Or group
 */
--- a/libraries/cmsis/dsp/Source/BasicMathFunctions/arm_or_u8.c
+++ b/libraries/cmsis/dsp/Source/BasicMathFunctions/arm_or_u8.c
@@ -0,0 +1,128 @@
 /* ----------------------------------------------------------------------
 * Project:      CMSIS DSP Library
 * Title:        arm_or_u8.c
 * Description:  uint8_t bitwise inclusive OR
 *
 * $Date:        14 November 2019
 * $Revision:    V1.6.0
 *
 * Target Processor: Cortex-M cores
 * -------------------------------------------------------------------- */
 /*
 * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
 *
 * SPDX-License-Identifier: Apache-2.0
 *
 * Licensed under the Apache License, Version 2.0 (the License); you may
 * not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an AS IS BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 #include "arm_math.h"
 /**
  @ingroup groupMath
 */
 /**
  @addtogroup Or
  @{
 */
 /**
  @brief         Compute the bitwise inclusive OR of two uint8_t vectors, element by element.
  @param[in]     pSrcA      points to input vector A
  @param[in]     pSrcB      points to input vector B
  @param[out]    pDst       points to output vector
  @param[in]     blockSize  number of samples in each vector
  @return        none
 */
 void arm_or_u8(
     const uint8_t * pSrcA,
     const uint8_t * pSrcB,
           uint8_t * pDst,
           uint32_t blockSize)
 {
     uint32_t blkCnt;      /* Loop counter */
 #if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
     /* Unsigned vector type so that it matches the uint8_t pointers and vorrq_u8 */
     uint8x16_t vecSrcA, vecSrcB;
     /* Compute 16 outputs at a time */
     blkCnt = blockSize >> 4;
     while (blkCnt > 0U)
     {
         vecSrcA = vld1q(pSrcA);
         vecSrcB = vld1q(pSrcB);
         vst1q(pDst, vorrq_u8(vecSrcA, vecSrcB));
         pSrcA += 16;
         pSrcB += 16;
         pDst  += 16;
         /* Decrement the loop counter */
         blkCnt--;
     }
     /* Tail: the remaining 1..15 samples are handled with lane predication */
     blkCnt = blockSize & 0xF;
     if (blkCnt > 0U)
     {
         mve_pred16_t p0 = vctp8q(blkCnt);
         /* Predicated (zeroing) loads: lanes past the end of the inputs are not read */
         vecSrcA = vld1q_z(pSrcA, p0);
         vecSrcB = vld1q_z(pSrcB, p0);
         vstrbq_p(pDst, vorrq_u8(vecSrcA, vecSrcB), p0);
     }
 #else
 #if defined(ARM_MATH_NEON) && !defined(ARM_MATH_AUTOVECTORIZE)
     uint8x16_t vecA, vecB;
     /* Compute 16 outputs at a time */
     blkCnt = blockSize >> 4U;
     while (blkCnt > 0U)
     {
         vecA = vld1q_u8(pSrcA);
         vecB = vld1q_u8(pSrcB);
         vst1q_u8(pDst, vorrq_u8(vecA, vecB));
         pSrcA += 16;
         pSrcB += 16;
         pDst  += 16;
         /* Decrement the loop counter */
         blkCnt--;
     }
     /* Tail: leftover samples fall through to the scalar loop below */
     blkCnt = blockSize & 0xF;
 #else
     /* No vector unit selected: the scalar loop processes every sample */
     blkCnt = blockSize;
 #endif
     while (blkCnt > 0U)
     {
         /* | operates on the promoted ints; the cast narrows back to 8 bits */
         *pDst++ = (uint8_t) ((*pSrcA++) | (*pSrcB++));
         /* Decrement the loop counter */
         blkCnt--;
     }
 #endif /* if defined(ARM_MATH_MVEI) */
 }
 /**
  @} end of Or group
 */
--- a/libraries/cmsis/dsp/Source/BasicMathFunctions/arm_scale_f32.c
+++ b/libraries/cmsis/dsp/Source/BasicMathFunctions/arm_scale_f32.c
@@ -0,0 +1,216 @@
 /* ----------------------------------------------------------------------
 * Project:      CMSIS DSP Library
 * Title:        arm_scale_f32.c
 * Description:  Multiplies a floating-point vector by a scalar
 *
 * $Date:        18. March 2019
 * $Revision:    V1.6.0
 *
 * Target Processor: Cortex-M cores
 * -------------------------------------------------------------------- */
 /*
 * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
 *
 * SPDX-License-Identifier: Apache-2.0
 *
 * Licensed under the Apache License, Version 2.0 (the License); you may
 * not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an AS IS BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 #include "arm_math.h"
 /**
  @ingroup groupMath
 */
 /**
  @defgroup BasicScale Vector Scale
  Multiply a vector by a scalar value.  For floating-point data, the algorithm used is:
  <pre>
      pDst[n] = pSrc[n] * scale,   0 <= n < blockSize.
  </pre>
  In the fixed-point Q7, Q15, and Q31 functions, <code>scale</code> is represented by
  a fractional multiplication <code>scaleFract</code> and an arithmetic shift <code>shift</code>.
  The shift allows the gain of the scaling operation to exceed 1.0.
  The algorithm used with fixed-point data is:
  <pre>
      pDst[n] = (pSrc[n] * scaleFract) << shift,   0 <= n < blockSize.
  </pre>
  The overall scale factor applied to the fixed-point data is
  <pre>
      scale = scaleFract * 2^shift.
  </pre>
  The functions support in-place computation allowing the source and destination
  pointers to reference the same memory buffer.
 */
 /**
  @addtogroup BasicScale
  @{
 */
 /**
  @brief         Multiplies a floating-point vector by a scalar.
  @param[in]     pSrc       points to the input vector
  @param[in]     scale      scale factor to be applied
  @param[out]    pDst       points to the output vector
  @param[in]     blockSize  number of samples in each vector
  @return        none
 */
 #if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE)
 #include "arm_helium_utils.h"
 void arm_scale_f32(
   const float32_t * pSrc,
         float32_t scale,
         float32_t * pDst,
         uint32_t blockSize)
 {
     /* Helium (MVE) build: pDst[n] = pSrc[n] * scale for 0 <= n < blockSize. */
     uint32_t blkCnt;                               /* Loop counter */
     f32x4_t vec1;
     f32x4_t res;
     /* Compute 4 outputs at a time */
     blkCnt = blockSize >> 2U;
     while (blkCnt > 0U)
     {
         /* C = A * scale */
         /* Scale the input and then store the results in the destination buffer. */
         vec1 = vld1q(pSrc);
         res = vmulq(vec1,scale);
         vst1q(pDst, res);
         /* Increment pointers */
         pSrc += 4;
         pDst += 4;
         /* Decrement the loop counter */
         blkCnt--;
     }
     /* Tail: 1..3 remaining samples */
     blkCnt = blockSize & 0x3;
     if (blkCnt > 0U)
     {
         mve_pred16_t p0 = vctp32q(blkCnt);
         /* Predicated load so that only the remaining blkCnt samples are read;
            a full-width load here would touch memory past the end of pSrc. */
         vec1 = vldrwq_z_f32(pSrc, p0);
         vstrwq_p(pDst, vmulq(vec1, scale), p0);
     }
 }
 #else
 void arm_scale_f32(
   const float32_t *pSrc,
         float32_t scale,
         float32_t *pDst,
         uint32_t blockSize)
 {
   /* Portable build: pDst[n] = pSrc[n] * scale for 0 <= n < blockSize. */
   uint32_t remaining;                            /* Samples still to be processed */
 #if defined(ARM_MATH_NEON_EXPERIMENTAL)
   f32x4_t vIn;
   f32x4_t vOut;
   /* Vector section: four samples per pass */
   for (remaining = blockSize >> 2U; remaining > 0U; remaining--)
   {
     /* Load, multiply by the broadcast scale factor, store */
     vIn = vld1q_f32(pSrc);
     vOut = vmulq_f32(vIn, vdupq_n_f32(scale));
     vst1q_f32(pDst, vOut);
     pSrc += 4;
     pDst += 4;
   }
   /* Leftover samples are finished by the scalar loop */
   remaining = blockSize & 0x3;
 #elif defined (ARM_MATH_LOOPUNROLL)
   /* Unrolled section: four samples per pass */
   for (remaining = blockSize >> 2U; remaining > 0U; remaining--)
   {
     float32_t s0, s1, s2, s3;
     /* Read and scale all four inputs before any output is written */
     s0 = pSrc[0] * scale;
     s1 = pSrc[1] * scale;
     s2 = pSrc[2] * scale;
     s3 = pSrc[3] * scale;
     pSrc += 4;
     pDst[0] = s0;
     pDst[1] = s1;
     pDst[2] = s2;
     pDst[3] = s3;
     pDst += 4;
   }
   /* Leftover samples are finished by the scalar loop */
   remaining = blockSize % 0x4U;
 #else
   /* No vector or unrolled section: the scalar loop handles every sample */
   remaining = blockSize;
 #endif /* ARM_MATH_NEON_EXPERIMENTAL / ARM_MATH_LOOPUNROLL */
   for (; remaining > 0U; remaining--)
   {
     /* C = A * scale */
     *pDst++ = (*pSrc++) * scale;
   }
 }
 #endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
 /**
  @} end of BasicScale group
 */
--- a/libraries/cmsis/dsp/Source/BasicMathFunctions/arm_scale_q15.c
+++ b/libraries/cmsis/dsp/Source/BasicMathFunctions/arm_scale_q15.c
@@ -0,0 +1,201 @@
 /* ----------------------------------------------------------------------
 * Project:      CMSIS DSP Library
 * Title:        arm_scale_q15.c
 * Description:  Multiplies a Q15 vector by a scalar
 *
 * $Date:        18. March 2019
 * $Revision:    V1.6.0
 *
 * Target Processor: Cortex-M cores
 * -------------------------------------------------------------------- */
 /*
 * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
 *
 * SPDX-License-Identifier: Apache-2.0
 *
 * Licensed under the Apache License, Version 2.0 (the License); you may
 * not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an AS IS BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 #include "arm_math.h"
 /**
  @ingroup groupMath
 */
 /**
  @addtogroup BasicScale
  @{
 */
 /**
  @brief         Multiplies a Q15 vector by a scalar.
  @param[in]     pSrc       points to the input vector
  @param[in]     scaleFract fractional portion of the scale value
  @param[in]     shift      number of bits to shift the result by
  @param[out]    pDst       points to the output vector
  @param[in]     blockSize  number of samples in each vector
  @return        none
  @par           Scaling and Overflow Behavior
                   The input data <code>*pSrc</code> and <code>scaleFract</code> are in 1.15 format.
                   These are multiplied to yield a 2.30 intermediate result and this is shifted with saturation to 1.15 format.
 */
 #if defined(ARM_MATH_MVEI)
 #include "arm_helium_utils.h"
 void arm_scale_q15(
    const q15_t * pSrc,
    q15_t   scaleFract,
    int8_t  shift,
    q15_t * pDst,
    uint32_t blockSize)
 {
    /* Helium (MVE) build: each sample is multiplied by scaleFract and the
       result is shifted by (shift + 1) with vqshlq_r before being stored. */
    uint32_t  blkCnt;           /* loop counters */
    q15x8_t vecSrc;
    q15x8_t vecDst;
    /* Compute 8 outputs at a time */
    blkCnt = blockSize >> 3;
    while (blkCnt > 0U)
    {
        /*
         * C = A * scale
         * Scale the input and then store the result in the destination buffer.
         */
        vecSrc = vld1q(pSrc);
        vecDst = vmulhq(vecSrc, vdupq_n_s16(scaleFract));
        vecDst = vqshlq_r(vecDst, shift + 1);
        vst1q(pDst, vecDst);
        /*
         * Decrement the blockSize loop counter
         */
        blkCnt--;
        /*
         * advance vector source and destination pointers
         */
        pSrc += 8;
        pDst += 8;
    }
    /*
     * tail: 1..7 remaining samples
     */
    blkCnt = blockSize & 7;
    if (blkCnt > 0U)
    {
        mve_pred16_t p0 = vctp16q(blkCnt);
        /* Predicated load so that only the remaining blkCnt samples are read;
           a full-width load here would touch memory past the end of pSrc. */
        vecSrc = vldrhq_z_s16(pSrc, p0);
        vecDst = vmulhq(vecSrc, vdupq_n_s16(scaleFract));
        vecDst = vqshlq_r(vecDst, shift + 1);
        vstrhq_p(pDst, vecDst, p0);
    }
 }
 #else
 void arm_scale_q15(
   const q15_t *pSrc,
         q15_t scaleFract,
         int8_t shift,
         q15_t *pDst,
         uint32_t blockSize)
 {
   /* Portable build: each sample is multiplied by scaleFract, shifted right
      by (15 - shift) and saturated to 16 bits. */
   uint32_t remaining;                            /* Samples still to be processed */
   int8_t rshift = 15 - shift;                    /* Right shift applied to each product */
 #if defined (ARM_MATH_LOOPUNROLL)
   /* Unrolled section: four results per pass */
   for (remaining = blockSize >> 2U; remaining > 0U; remaining--)
   {
 #if defined (ARM_MATH_DSP)
     q31_t pairA, pairB;                          /* Two packed Q15 samples each */
     q31_t acc1, acc2, acc3, acc4;                /* Scaled and shifted products */
     q15_t sat1, sat2, sat3, sat4;                /* Saturated results */
     /* Fetch four samples as two packed words */
     pairA = read_q15x2_ia ((q15_t **) &pSrc);
     pairB = read_q15x2_ia ((q15_t **) &pSrc);
     /* Multiply each half-word by the scale fraction, then shift */
     acc1 = ((q31_t) ((q15_t) (pairA >> 16) * scaleFract)) >> rshift;
     acc2 = ((q31_t) ((q15_t) (pairA      ) * scaleFract)) >> rshift;
     acc3 = ((q31_t) ((q15_t) (pairB >> 16) * scaleFract)) >> rshift;
     acc4 = ((q31_t) ((q15_t) (pairB      ) * scaleFract)) >> rshift;
     /* Clamp to the 16-bit range */
     sat1 = (q15_t) (__SSAT(acc1, 16));
     sat2 = (q15_t) (__SSAT(acc2, 16));
     sat3 = (q15_t) (__SSAT(acc3, 16));
     sat4 = (q15_t) (__SSAT(acc4, 16));
     /* Re-pack in the same half-word positions the inputs came from */
     write_q15x2_ia (&pDst, __PKHBT(sat2, sat1, 16));
     write_q15x2_ia (&pDst, __PKHBT(sat4, sat3, 16));
 #else
     /* One sample at a time, four times over */
     *pDst++ = (q15_t) (__SSAT(((q31_t) *pSrc++ * scaleFract) >> rshift, 16));
     *pDst++ = (q15_t) (__SSAT(((q31_t) *pSrc++ * scaleFract) >> rshift, 16));
     *pDst++ = (q15_t) (__SSAT(((q31_t) *pSrc++ * scaleFract) >> rshift, 16));
     *pDst++ = (q15_t) (__SSAT(((q31_t) *pSrc++ * scaleFract) >> rshift, 16));
 #endif /* #if defined (ARM_MATH_DSP) */
   }
   /* Leftover samples are finished by the scalar loop */
   remaining = blockSize % 0x4U;
 #else
   /* No unrolled section: the scalar loop handles every sample */
   remaining = blockSize;
 #endif /* #if defined (ARM_MATH_LOOPUNROLL) */
   for (; remaining > 0U; remaining--)
   {
     /* C = A * scale */
     *pDst++ = (q15_t) (__SSAT(((q31_t) *pSrc++ * scaleFract) >> rshift, 16));
   }
 }
 #endif /* defined(ARM_MATH_MVEI) */
 /**
  @} end of BasicScale group
 */
--- a/libraries/cmsis/dsp/Source/BasicMathFunctions/arm_scale_q31.c
+++ b/libraries/cmsis/dsp/Source/BasicMathFunctions/arm_scale_q31.c
@@ -0,0 +1,244 @@
 /* ----------------------------------------------------------------------
 * Project:      CMSIS DSP Library
 * Title:        arm_scale_q31.c
 * Description:  Multiplies a Q31 vector by a scalar
 *
 * $Date:        18. March 2019
 * $Revision:    V1.6.0
 *
 * Target Processor: Cortex-M cores
 * -------------------------------------------------------------------- */
 /*
 * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
 *
 * SPDX-License-Identifier: Apache-2.0
 *
 * Licensed under the Apache License, Version 2.0 (the License); you may
 * not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an AS IS BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 #include "arm_math.h"
 /**
  @ingroup groupMath
 */
 /**
  @addtogroup BasicScale
  @{
 */
 /**
  @brief         Multiplies a Q31 vector by a scalar.
  @param[in]     pSrc       points to the input vector
  @param[in]     scaleFract fractional portion of the scale value
  @param[in]     shift      number of bits to shift the result by
  @param[out]    pDst       points to the output vector
  @param[in]     blockSize  number of samples in each vector
  @return        none
  @par           Scaling and Overflow Behavior
                   The input data <code>*pSrc</code> and <code>scaleFract</code> are in 1.31 format.
                   These are multiplied to yield a 2.62 intermediate result and this is shifted with saturation to 1.31 format.
 */
 #if defined(ARM_MATH_MVEI)
 #include "arm_helium_utils.h"
 void arm_scale_q31(
    const q31_t * pSrc,
    q31_t   scaleFract,
    int8_t  shift,
    q31_t * pDst,
    uint32_t blockSize)
 {
    /* Helium (MVE) build: each sample is multiplied by scaleFract and the
       result is shifted by (shift + 1) with vqshlq_r before being stored. */
    uint32_t  blkCnt;           /* loop counters */
    q31x4_t vecSrc;
    q31x4_t vecDst;
    /* Compute 4 outputs at a time */
    blkCnt = blockSize >> 2;
    while (blkCnt > 0U)
    {
        /*
         * C = A * scale
         * Scale the input and then store the result in the destination buffer.
         */
        vecSrc = vld1q(pSrc);
        vecDst = vmulhq(vecSrc, vdupq_n_s32(scaleFract));
        vecDst = vqshlq_r(vecDst, shift + 1);
        vst1q(pDst, vecDst);
        /*
         * Decrement the blockSize loop counter
         */
        blkCnt--;
        /*
         * advance vector source and destination pointers
         */
        pSrc += 4;
        pDst += 4;
    }
    /*
     * tail: 1..3 remaining samples
     */
    blkCnt = blockSize & 3;
    if (blkCnt > 0U)
    {
        mve_pred16_t p0 = vctp32q(blkCnt);
        /* Predicated load so that only the remaining blkCnt samples are read;
           a full-width load here would touch memory past the end of pSrc. */
        vecSrc = vldrwq_z_s32(pSrc, p0);
        vecDst = vmulhq(vecSrc, vdupq_n_s32(scaleFract));
        vecDst = vqshlq_r(vecDst, shift + 1);
        vstrwq_p(pDst, vecDst, p0);
    }
 }
 #else
 void arm_scale_q31(
   const q31_t *pSrc,
         q31_t scaleFract,
         int8_t shift,
         q31_t *pDst,
         uint32_t blockSize)
 {
   /* Portable build: the upper 32 bits of (sample * scaleFract) are shifted
      by (shift + 1); left shifts are checked and saturated, right shifts are
      applied directly. */
   uint32_t remaining;                            /* Samples still to be processed */
   q31_t prod, res;                               /* Upper product word / shifted result */
   int8_t totalShift = shift + 1;                 /* Shift applied to the upper product word */
   int8_t negShift = (totalShift & 0x80);         /* Non-zero when totalShift is negative */
 #if defined (ARM_MATH_LOOPUNROLL)
   /* Unrolled section: four results per pass */
   remaining = blockSize >> 2U;
   if (negShift != 0U)
   {
     /* Negative total shift: plain right shift, no saturation check */
     for (; remaining > 0U; remaining--)
     {
       prod = ((q63_t) *pSrc++ * scaleFract) >> 32;
       res = prod >> -totalShift;
       *pDst++ = res;
       prod = ((q63_t) *pSrc++ * scaleFract) >> 32;
       res = prod >> -totalShift;
       *pDst++ = res;
       prod = ((q63_t) *pSrc++ * scaleFract) >> 32;
       res = prod >> -totalShift;
       *pDst++ = res;
       prod = ((q63_t) *pSrc++ * scaleFract) >> 32;
       res = prod >> -totalShift;
       *pDst++ = res;
     }
   }
   else
   {
     /* Non-negative total shift: left shift, replaced by the signed extreme
        whenever shifting back does not reproduce the product */
     for (; remaining > 0U; remaining--)
     {
       prod = ((q63_t) *pSrc++ * scaleFract) >> 32;
       res = prod << totalShift;
       if (prod != (res >> totalShift))
         res = 0x7FFFFFFF ^ (prod >> 31);
       *pDst++ = res;
       prod = ((q63_t) *pSrc++ * scaleFract) >> 32;
       res = prod << totalShift;
       if (prod != (res >> totalShift))
         res = 0x7FFFFFFF ^ (prod >> 31);
       *pDst++ = res;
       prod = ((q63_t) *pSrc++ * scaleFract) >> 32;
       res = prod << totalShift;
       if (prod != (res >> totalShift))
         res = 0x7FFFFFFF ^ (prod >> 31);
       *pDst++ = res;
       prod = ((q63_t) *pSrc++ * scaleFract) >> 32;
       res = prod << totalShift;
       if (prod != (res >> totalShift))
         res = 0x7FFFFFFF ^ (prod >> 31);
       *pDst++ = res;
     }
   }
   /* Leftover samples are finished by the scalar loops */
   remaining = blockSize % 0x4U;
 #else
   /* No unrolled section: the scalar loops handle every sample */
   remaining = blockSize;
 #endif /* #if defined (ARM_MATH_LOOPUNROLL) */
   if (negShift != 0U)
   {
     for (; remaining > 0U; remaining--)
     {
       /* C = A * scale, right-shift variant */
       prod = ((q63_t) *pSrc++ * scaleFract) >> 32;
       res = prod >> -totalShift;
       *pDst++ = res;
     }
   }
   else
   {
     for (; remaining > 0U; remaining--)
     {
       /* C = A * scale, saturating left-shift variant */
       prod = ((q63_t) *pSrc++ * scaleFract) >> 32;
       res = prod << totalShift;
       if (prod != (res >> totalShift))
         res = 0x7FFFFFFF ^ (prod >> 31);
       *pDst++ = res;
     }
   }
 }
 #endif /* defined(ARM_MATH_MVEI) */
 /**
  @} end of BasicScale group
 */
--- a/libraries/cmsis/dsp/Source/BasicMathFunctions/arm_scale_q7.c
+++ b/libraries/cmsis/dsp/Source/BasicMathFunctions/arm_scale_q7.c
@@ -0,0 +1,186 @@
 /* ----------------------------------------------------------------------
 * Project:      CMSIS DSP Library
 * Title:        arm_scale_q7.c
 * Description:  Multiplies a Q7 vector by a scalar
 *
 * $Date:        18. March 2019
 * $Revision:    V1.6.0
 *
 * Target Processor: Cortex-M cores
 * -------------------------------------------------------------------- */
 /*
 * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
 *
 * SPDX-License-Identifier: Apache-2.0
 *
 * Licensed under the Apache License, Version 2.0 (the License); you may
 * not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an AS IS BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 #include "arm_math.h"
 /**
  @ingroup groupMath
 */
 /**
  @addtogroup BasicScale
  @{
 */
 /**
  @brief         Multiplies a Q7 vector by a scalar.
  @param[in]     pSrc       points to the input vector
  @param[in]     scaleFract fractional portion of the scale value
  @param[in]     shift      number of bits to shift the result by
  @param[out]    pDst       points to the output vector
  @param[in]     blockSize  number of samples in each vector
  @return        none
  @par           Scaling and Overflow Behavior
                   The input data <code>*pSrc</code> and <code>scaleFract</code> are in 1.7 format.
                   These are multiplied to yield a 2.14 intermediate result and this is shifted with saturation to 1.7 format.
 */
 #if defined(ARM_MATH_MVEI)
 #include "arm_helium_utils.h"
 void arm_scale_q7(
    const q7_t * pSrc,
    q7_t   scaleFract,
    int8_t  shift,
    q7_t * pDst,
    uint32_t blockSize)
 {
    /* Helium (MVE) build: each sample is multiplied by scaleFract and the
       result is shifted by (shift + 1) with vqshlq_r before being stored. */
    uint32_t  blkCnt;           /* loop counters */
    q7x16_t vecSrc;
    q7x16_t vecDst;
    /* Compute 16 outputs at a time */
    blkCnt = blockSize >> 4;
    while (blkCnt > 0U)
    {
        /*
         * C = A * scale
         * Scale the input and then store the result in the destination buffer.
         */
        vecSrc = vld1q(pSrc);
        vecDst = vmulhq(vecSrc, vdupq_n_s8(scaleFract));
        vecDst = vqshlq_r(vecDst, shift + 1);
        vst1q(pDst, vecDst);
        /*
         * Decrement the blockSize loop counter
         */
        blkCnt--;
        /*
         * advance vector source and destination pointers
         */
        pSrc += 16;
        pDst += 16;
    }
    /*
     * tail: 1..15 remaining samples
     */
    blkCnt = blockSize & 0xF;
    if (blkCnt > 0U)
    {
        mve_pred16_t p0 = vctp8q(blkCnt);
        /* Predicated load so that only the remaining blkCnt samples are read;
           a full-width load here would touch memory past the end of pSrc. */
        vecSrc = vldrbq_z_s8(pSrc, p0);
        vecDst = vmulhq(vecSrc, vdupq_n_s8(scaleFract));
        vecDst = vqshlq_r(vecDst, shift + 1);
        vstrbq_p(pDst, vecDst, p0);
    }
 }
 #else
 void arm_scale_q7(
   const q7_t * pSrc,
         q7_t scaleFract,
         int8_t shift,
         q7_t * pDst,
         uint32_t blockSize)
 {
   /* Portable build: each sample is multiplied by scaleFract, shifted right
      by (7 - shift) and saturated to 8 bits. */
   uint32_t remaining;                            /* Samples still to be processed */
   int8_t rshift = 7 - shift;                     /* Right shift applied to each product */
 #if defined (ARM_MATH_LOOPUNROLL)
   /* Unrolled section: four results per pass */
   for (remaining = blockSize >> 2U; remaining > 0U; remaining--)
   {
 #if defined (ARM_MATH_DSP)
     q7_t a0, a1, a2, a3;                         /* Input samples */
     q7_t r0, r1, r2, r3;                         /* Saturated results */
     /* Fetch four inputs */
     a0 = pSrc[0];
     a1 = pSrc[1];
     a2 = pSrc[2];
     a3 = pSrc[3];
     pSrc += 4;
     /* Scale, shift and clamp each one */
     r0 = (q7_t) (__SSAT(((a0) * scaleFract) >> rshift, 8));
     r1 = (q7_t) (__SSAT(((a1) * scaleFract) >> rshift, 8));
     r2 = (q7_t) (__SSAT(((a2) * scaleFract) >> rshift, 8));
     r3 = (q7_t) (__SSAT(((a3) * scaleFract) >> rshift, 8));
     /* Emit all four results with a single packed write */
     write_q7x4_ia (&pDst, __PACKq7(r0, r1, r2, r3));
 #else
     /* One sample at a time, four times over */
     *pDst++ = (q7_t) (__SSAT((((q15_t) *pSrc++ * scaleFract) >> rshift), 8));
     *pDst++ = (q7_t) (__SSAT((((q15_t) *pSrc++ * scaleFract) >> rshift), 8));
     *pDst++ = (q7_t) (__SSAT((((q15_t) *pSrc++ * scaleFract) >> rshift), 8));
     *pDst++ = (q7_t) (__SSAT((((q15_t) *pSrc++ * scaleFract) >> rshift), 8));
 #endif /* #if defined (ARM_MATH_DSP) */
   }
   /* Leftover samples are finished by the scalar loop */
   remaining = blockSize % 0x4U;
 #else
   /* No unrolled section: the scalar loop handles every sample */
   remaining = blockSize;
 #endif /* #if defined (ARM_MATH_LOOPUNROLL) */
   for (; remaining > 0U; remaining--)
   {
     /* C = A * scale */
     *pDst++ = (q7_t) (__SSAT((((q15_t) *pSrc++ * scaleFract) >> rshift), 8));
   }
 }
 #endif /* defined(ARM_MATH_MVEI) */
 /**
  @} end of BasicScale group
 */
--- a/libraries/cmsis/dsp/Source/BasicMathFunctions/arm_shift_q15.c
+++ b/libraries/cmsis/dsp/Source/BasicMathFunctions/arm_shift_q15.c
@@ -0,0 +1,251 @@
 /* ----------------------------------------------------------------------
 * Project:      CMSIS DSP Library
 * Title:        arm_shift_q15.c
 * Description:  Shifts the elements of a Q15 vector by a specified number of bits
 *
 * $Date:        18. March 2019
 * $Revision:    V1.6.0
 *
 * Target Processor: Cortex-M cores
 * -------------------------------------------------------------------- */
 /*
 * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
 *
 * SPDX-License-Identifier: Apache-2.0
 *
 * Licensed under the Apache License, Version 2.0 (the License); you may
 * not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an AS IS BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 #include "arm_math.h"
 /**
  @ingroup groupMath
 */
 /**
  @addtogroup BasicShift
  @{
 */
 /**
  @brief         Shifts the elements of a Q15 vector by a specified number of bits.
  @param[in]     pSrc       points to the input vector
  @param[in]     shiftBits  number of bits to shift.  A positive value shifts left; a negative value shifts right.
  @param[out]    pDst       points to the output vector
  @param[in]     blockSize  number of samples in each vector
  @return        none
  @par           Scaling and Overflow Behavior
                   The function uses saturating arithmetic.
                   Results outside of the allowable Q15 range [0x8000 0x7FFF] are saturated.
 */
 #if defined(ARM_MATH_MVEI)
 #include "arm_helium_utils.h"
 void arm_shift_q15(
    const q15_t * pSrc,
    int8_t shiftBits,
    q15_t * pDst,
    uint32_t blockSize)
 {
    /* Helium (MVE) build: every sample is shifted by shiftBits with
       vqshlq_r and stored to pDst. */
    uint32_t  blkCnt;           /* loop counters */
    q15x8_t vecSrc;
    q15x8_t vecDst;
    /* Compute 8 outputs at a time */
    blkCnt = blockSize >> 3;
    while (blkCnt > 0U)
    {
        /*
         * C = A (>> or <<) shiftBits
         * Shift the input and then store the result in the destination buffer.
         */
        vecSrc = vld1q(pSrc);
        vecDst = vqshlq_r(vecSrc, shiftBits);
        vst1q(pDst, vecDst);
        /*
         * Decrement the blockSize loop counter
         */
        blkCnt--;
        /*
         * advance vector source and destination pointers
         */
        pSrc += 8;
        pDst += 8;
    }
    /*
     * tail: 1..7 remaining samples
     */
    blkCnt = blockSize & 7;
    if (blkCnt > 0U)
    {
        mve_pred16_t p0 = vctp16q(blkCnt);
        /* Predicated load so that only the remaining blkCnt samples are read;
           a full-width load here would touch memory past the end of pSrc. */
        vecSrc = vldrhq_z_s16(pSrc, p0);
        vecDst = vqshlq_r(vecSrc, shiftBits);
        vstrhq_p(pDst, vecDst, p0);
    }
 }
 #else
 void arm_shift_q15(
  const q15_t * pSrc,
        int8_t shiftBits,
        q15_t * pDst,
        uint32_t blockSize)
 {
        /* Portable build. A non-negative shiftBits shifts each sample left and
           saturates the result to 16 bits; a negative shiftBits shifts each
           sample right by -shiftBits with no saturation step. */
        uint32_t blkCnt;                               /* Loop counter */
        uint8_t sign = (shiftBits & 0x80);             /* Bit 7 of shiftBits: non-zero when shiftBits is negative */
 #if defined (ARM_MATH_LOOPUNROLL)
 #if defined (ARM_MATH_DSP)
  q15_t in1, in2;                                /* Temporary input variables */
 #endif
  /* Loop unrolling: Compute 4 outputs at a time */
  blkCnt = blockSize >> 2U;
  /* If the shift value is non-negative then do left shift else right shift */
  if (sign == 0U)
  {
    while (blkCnt > 0U)
    {
      /* C = A << shiftBits */
 #if defined (ARM_MATH_DSP)
      /* read 2 samples from source */
      in1 = *pSrc++;
      in2 = *pSrc++;
      /* Shift and saturate both inputs, pack them into one word and store it.
         The operand order handed to __PKHBT depends on the endianness setting. */
 #ifndef ARM_MATH_BIG_ENDIAN
      write_q15x2_ia (&pDst, __PKHBT(__SSAT((in1 << shiftBits), 16),
                                     __SSAT((in2 << shiftBits), 16), 16));
 #else
      write_q15x2_ia (&pDst, __PKHBT(__SSAT((in2 << shiftBits), 16),
                                      __SSAT((in1 << shiftBits), 16), 16));
 #endif /* #ifndef ARM_MATH_BIG_ENDIAN */
      /* read 2 more samples from source and handle them the same way */
      in1 = *pSrc++;
      in2 = *pSrc++;
 #ifndef ARM_MATH_BIG_ENDIAN
      write_q15x2_ia (&pDst, __PKHBT(__SSAT((in1 << shiftBits), 16),
                                     __SSAT((in2 << shiftBits), 16), 16));
 #else
      write_q15x2_ia (&pDst, __PKHBT(__SSAT((in2 << shiftBits), 16),
                                     __SSAT((in1 << shiftBits), 16), 16));
 #endif /* #ifndef ARM_MATH_BIG_ENDIAN */
 #else
      /* Widen each sample to q31_t before shifting, then saturate to 16 bits */
      *pDst++ = __SSAT(((q31_t) *pSrc++ << shiftBits), 16);
      *pDst++ = __SSAT(((q31_t) *pSrc++ << shiftBits), 16);
      *pDst++ = __SSAT(((q31_t) *pSrc++ << shiftBits), 16);
      *pDst++ = __SSAT(((q31_t) *pSrc++ << shiftBits), 16);
 #endif
      /* Decrement loop counter */
      blkCnt--;
    }
  }
  else
  {
    while (blkCnt > 0U)
    {
      /* C = A >> (-shiftBits) */
 #if defined (ARM_MATH_DSP)
      /* read 2 samples from source */
      in1 = *pSrc++;
      in2 = *pSrc++;
      /* Shift both inputs right, pack them into one word and store it.
         The operand order handed to __PKHBT depends on the endianness setting. */
 #ifndef ARM_MATH_BIG_ENDIAN
      write_q15x2_ia (&pDst, __PKHBT((in1 >> -shiftBits),
                                     (in2 >> -shiftBits), 16));
 #else
      write_q15x2_ia (&pDst, __PKHBT((in2 >> -shiftBits),
                                     (in1 >> -shiftBits), 16));
 #endif /* #ifndef ARM_MATH_BIG_ENDIAN */
      /* read 2 more samples from source and handle them the same way */
      in1 = *pSrc++;
      in2 = *pSrc++;
 #ifndef ARM_MATH_BIG_ENDIAN
      write_q15x2_ia (&pDst, __PKHBT((in1 >> -shiftBits),
                                     (in2 >> -shiftBits), 16));
 #else
      write_q15x2_ia (&pDst, __PKHBT((in2 >> -shiftBits),
                                     (in1 >> -shiftBits), 16));
 #endif /* #ifndef ARM_MATH_BIG_ENDIAN */
 #else
      /* Right shift needs no saturation step */
      *pDst++ = (*pSrc++ >> -shiftBits);
      *pDst++ = (*pSrc++ >> -shiftBits);
      *pDst++ = (*pSrc++ >> -shiftBits);
      *pDst++ = (*pSrc++ >> -shiftBits);
 #endif
      /* Decrement loop counter */
      blkCnt--;
    }
  }
  /* Loop unrolling: Compute remaining outputs */
  blkCnt = blockSize % 0x4U;
 #else
  /* Initialize blkCnt with number of samples */
  blkCnt = blockSize;
 #endif /* #if defined (ARM_MATH_LOOPUNROLL) */
  /* If the shift value is non-negative then do left shift else right shift */
  if (sign == 0U)
  {
    while (blkCnt > 0U)
    {
      /* C = A << shiftBits */
      /* Shift input, saturate to 16 bits and store result in destination buffer. */
      *pDst++ = __SSAT(((q31_t) *pSrc++ << shiftBits), 16);
      /* Decrement loop counter */
      blkCnt--;
    }
  }
  else
  {
    while (blkCnt > 0U)
    {
      /* C = A >> (-shiftBits) */
      /* Shift input and store result in destination buffer. */
      *pDst++ = (*pSrc++ >> -shiftBits);
      /* Decrement loop counter */
      blkCnt--;
    }
  }
 }
 #endif /* defined(ARM_MATH_MVEI) */
 /**
  @} end of BasicShift group
 */
--- a/libraries/cmsis/dsp/Source/BasicMathFunctions/arm_shift_q31.c
+++ b/libraries/cmsis/dsp/Source/BasicMathFunctions/arm_shift_q31.c
@@ -0,0 +1,232 @@
 /* ----------------------------------------------------------------------
 * Project:      CMSIS DSP Library
 * Title:        arm_shift_q31.c
 * Description:  Shifts the elements of a Q31 vector by a specified number of bits
 *
 * $Date:        18. March 2019
 * $Revision:    V1.6.0
 *
 * Target Processor: Cortex-M cores
 * -------------------------------------------------------------------- */
 /*
 * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
 *
 * SPDX-License-Identifier: Apache-2.0
 *
 * Licensed under the Apache License, Version 2.0 (the License); you may
 * not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an AS IS BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 #include "arm_math.h"
 /**
  @ingroup groupMath
 */
 /**
  @defgroup BasicShift Vector Shift
  Shifts the elements of a fixed-point vector by a specified number of bits.
  There are separate functions for Q7, Q15, and Q31 data types.
  The underlying algorithm used is:
  <pre>
      pDst[n] = pSrc[n] << shift,   0 <= n < blockSize.
  </pre>
  If <code>shift</code> is positive then the elements of the vector are shifted to the left.
  If <code>shift</code> is negative then the elements of the vector are shifted to the right.
  The functions support in-place computation allowing the source and destination
  pointers to reference the same memory buffer.
 */
 /**
  @addtogroup BasicShift
  @{
 */
 /**
  @brief         Shifts the elements of a Q31 vector a specified number of bits.
  @param[in]     pSrc       points to the input vector
  @param[in]     shiftBits  number of bits to shift.  A positive value shifts left; a negative value shifts right.
  @param[out]    pDst       points to the output vector
  @param[in]     blockSize  number of samples in the vector
  @return        none
  @par           Scaling and Overflow Behavior
                   The function uses saturating arithmetic.
                   Results outside of the allowable Q31 range [0x80000000 0x7FFFFFFF] are saturated.
 */
 #if defined(ARM_MATH_MVEI)
 #include "arm_helium_utils.h"
 /* Helium (MVE) implementation: shifts four Q31 samples per vector operation. */
 void arm_shift_q31(
    const q31_t * pSrc,
    int8_t shiftBits,
    q31_t * pDst,
    uint32_t blockSize)
 {
    uint32_t  blkCnt;           /* Number of full vectors, then number of tail samples */
    q31x4_t vecSrc;             /* Four input samples */
    q31x4_t vecDst;             /* Four shifted output samples */
    /* Main loop: one full vector (4 samples) per iteration */
    blkCnt = blockSize >> 2;
    while (blkCnt > 0U)
    {
        /*
         * C = A (>> or <<) shiftBits
         * vqshlq_r receives the signed shift count, so the same call covers
         * both shift directions.
         */
        vecSrc = vld1q((q31_t const *) pSrc);
        vecDst = vqshlq_r(vecSrc, shiftBits);
        vst1q(pDst, vecDst);
        /*
         * Decrement the vector loop counter
         */
        blkCnt--;
        /*
         * Advance source and destination pointers by one vector
         */
        pSrc += 4;
        pDst += 4;
    }
    /*
     * Tail: 1 to 3 remaining samples
     */
    blkCnt = blockSize & 3;
    if (blkCnt > 0U)
    {
        /* Predicate enabling only the first blkCnt lanes */
        mve_pred16_t p0 = vctp32q(blkCnt);
        /*
         * Predicated (zeroing) load: lanes beyond blkCnt are not fetched, so
         * nothing past the end of the input buffer is read or shifted.
         */
        vecSrc = vld1q_z((q31_t const *) pSrc, p0);
        vecDst = vqshlq_r(vecSrc, shiftBits);
        /* Predicated store: only the first blkCnt outputs are written */
        vstrwq_p(pDst, vecDst, p0);
    }
 }
 #else
 /* Scalar implementation, compiled when ARM_MATH_MVEI is not defined. */
 void arm_shift_q31(
   const q31_t * pSrc,
         int8_t shiftBits,
         q31_t * pDst,
         uint32_t blockSize)
 {
         uint32_t blkCnt;                               /* Groups of 4, then single samples, left to process */
         uint8_t isRightShift = (shiftBits & 0x80);     /* Non-zero when shiftBits is negative */
 #if defined (ARM_MATH_LOOPUNROLL)
   q31_t sample, shifted;                         /* Input sample and its left-shifted value */
   /* Loop unrolling: process 4 samples per iteration */
   blkCnt = blockSize >> 2U;
   if (isRightShift != 0U)
   {
     /* Negative shiftBits: C = A >> -shiftBits */
     for (; blkCnt > 0U; blkCnt--)
     {
       pDst[0] = (pSrc[0] >> -shiftBits);
       pDst[1] = (pSrc[1] >> -shiftBits);
       pDst[2] = (pSrc[2] >> -shiftBits);
       pDst[3] = (pSrc[3] >> -shiftBits);
       pSrc += 4;
       pDst += 4;
     }
   }
   else
   {
     /*
      * Non-negative shiftBits: C = A << shiftBits.
      * If shifting back does not give the input again, the value is replaced
      * by 0x7FFFFFFF ^ (input >> 31).
      */
     for (; blkCnt > 0U; blkCnt--)
     {
       sample = pSrc[0];
       shifted = sample << shiftBits;
       if (sample != (shifted >> shiftBits))
         shifted = 0x7FFFFFFF ^ (sample >> 31);
       pDst[0] = shifted;
       sample = pSrc[1];
       shifted = sample << shiftBits;
       if (sample != (shifted >> shiftBits))
         shifted = 0x7FFFFFFF ^ (sample >> 31);
       pDst[1] = shifted;
       sample = pSrc[2];
       shifted = sample << shiftBits;
       if (sample != (shifted >> shiftBits))
         shifted = 0x7FFFFFFF ^ (sample >> 31);
       pDst[2] = shifted;
       sample = pSrc[3];
       shifted = sample << shiftBits;
       if (sample != (shifted >> shiftBits))
         shifted = 0x7FFFFFFF ^ (sample >> 31);
       pDst[3] = shifted;
       pSrc += 4;
       pDst += 4;
     }
   }
   /* Loop unrolling: 0 to 3 samples remain */
   blkCnt = blockSize & 0x3U;
 #else
   /* No unrolling: every sample goes through the single-sample loops below */
   blkCnt = blockSize;
 #endif /* #if defined (ARM_MATH_LOOPUNROLL) */
   if (isRightShift != 0U)
   {
     /* Negative shiftBits: C = A >> -shiftBits */
     for (; blkCnt > 0U; blkCnt--)
     {
       *pDst++ = (*pSrc++ >> -shiftBits);
     }
   }
   else
   {
     /* Non-negative shiftBits: shift in 64 bits, then clip to Q31 */
     for (; blkCnt > 0U; blkCnt--)
     {
       *pDst++ = clip_q63_to_q31((q63_t) *pSrc++ << shiftBits);
     }
   }
 }
 #endif /* defined(ARM_MATH_MVEI) */
 /**
  @} end of BasicShift group
 */
--- a/libraries/cmsis/dsp/Source/BasicMathFunctions/arm_shift_q7.c
+++ b/libraries/cmsis/dsp/Source/BasicMathFunctions/arm_shift_q7.c
@@ -0,0 +1,225 @@
 /* ----------------------------------------------------------------------
 * Project:      CMSIS DSP Library
 * Title:        arm_shift_q7.c
 * Description:  Processing function for the Q7 Shifting
 *
 * $Date:        18. March 2019
 * $Revision:    V1.6.0
 *
 * Target Processor: Cortex-M cores
 * -------------------------------------------------------------------- */
 /*
 * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
 *
 * SPDX-License-Identifier: Apache-2.0
 *
 * Licensed under the Apache License, Version 2.0 (the License); you may
 * not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an AS IS BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 #include "arm_math.h"
 /**
  @ingroup groupMath
 */
 /**
  @addtogroup BasicShift
  @{
 */
 /**
  @brief         Shifts the elements of a Q7 vector a specified number of bits
  @param[in]     pSrc       points to the input vector
  @param[in]     shiftBits  number of bits to shift.  A positive value shifts left; a negative value shifts right.
  @param[out]    pDst       points to the output vector
  @param[in]     blockSize  number of samples in each vector
  @return        none
  @par           Conditions for optimum performance
                   Input and output buffers should be aligned on a 32-bit boundary
  @par           Scaling and Overflow Behavior
                   The function uses saturating arithmetic.
                   Results outside of the allowable Q7 range [0x80 0x7F] are saturated.
 */
 #if defined(ARM_MATH_MVEI)
 #include "arm_helium_utils.h"
 /* Helium (MVE) implementation: shifts sixteen Q7 samples per vector operation. */
 void arm_shift_q7(
    const q7_t * pSrc,
    int8_t shiftBits,
    q7_t * pDst,
    uint32_t blockSize)
 {
    uint32_t  blkCnt;           /* Number of full vectors, then number of tail samples */
    q7x16_t vecSrc;             /* Sixteen input samples */
    q7x16_t vecDst;             /* Sixteen shifted output samples */
    /* Main loop: one full vector (16 samples) per iteration */
    blkCnt = blockSize >> 4;
    while (blkCnt > 0U)
    {
        /*
         * C = A (>> or <<) shiftBits
         * vqshlq_r receives the signed shift count, so the same call covers
         * both shift directions.
         */
        vecSrc = vld1q(pSrc);
        vecDst = vqshlq_r(vecSrc, shiftBits);
        vst1q(pDst, vecDst);
        /*
         * Decrement the vector loop counter
         */
        blkCnt--;
        /*
         * Advance source and destination pointers by one vector
         */
        pSrc += 16;
        pDst += 16;
    }
    /*
     * Tail: 1 to 15 remaining samples
     */
    blkCnt = blockSize & 0xF;
    if (blkCnt > 0U)
    {
        /* Predicate enabling only the first blkCnt lanes */
        mve_pred16_t p0 = vctp8q(blkCnt);
        /*
         * Predicated (zeroing) load: lanes beyond blkCnt are not fetched, so
         * nothing past the end of the input buffer is read or shifted.
         */
        vecSrc = vld1q_z(pSrc, p0);
        vecDst = vqshlq_r(vecSrc, shiftBits);
        /* Predicated store: only the first blkCnt outputs are written */
        vstrbq_p(pDst, vecDst, p0);
    }
 }
 #else
 /* Scalar implementation, compiled when ARM_MATH_MVEI is not defined. */
 void arm_shift_q7(
   const q7_t * pSrc,
         int8_t shiftBits,
         q7_t * pDst,
         uint32_t blockSize)
 {
         uint32_t blkCnt;                               /* Groups of 4, then single samples, left to process */
         uint8_t isRightShift = (shiftBits & 0x80);     /* Non-zero when shiftBits is negative */
 #if defined (ARM_MATH_LOOPUNROLL)
 #if defined (ARM_MATH_DSP)
   q7_t s0, s1, s2, s3;                           /* Four inputs read ahead of the packed store */
 #endif
   /* Loop unrolling: process 4 samples per iteration */
   blkCnt = blockSize >> 2U;
   if (isRightShift != 0U)
   {
     /* Negative shiftBits: C = A >> -shiftBits */
     for (; blkCnt > 0U; blkCnt--)
     {
 #if defined (ARM_MATH_DSP)
       /* Fetch all four inputs before anything is written */
       s0 = pSrc[0];
       s1 = pSrc[1];
       s2 = pSrc[2];
       s3 = pSrc[3];
       pSrc += 4;
       /* Pack the four results and store them with a single write */
       write_q7x4_ia (&pDst, __PACKq7((s0 >> -shiftBits),
                                      (s1 >> -shiftBits),
                                      (s2 >> -shiftBits),
                                      (s3 >> -shiftBits) ));
 #else
       pDst[0] = (pSrc[0] >> -shiftBits);
       pDst[1] = (pSrc[1] >> -shiftBits);
       pDst[2] = (pSrc[2] >> -shiftBits);
       pDst[3] = (pSrc[3] >> -shiftBits);
       pSrc += 4;
       pDst += 4;
 #endif
     }
   }
   else
   {
     /* Non-negative shiftBits: C = A << shiftBits, saturated to 8 bits */
     for (; blkCnt > 0U; blkCnt--)
     {
 #if defined (ARM_MATH_DSP)
       /* Fetch all four inputs before anything is written */
       s0 = pSrc[0];
       s1 = pSrc[1];
       s2 = pSrc[2];
       s3 = pSrc[3];
       pSrc += 4;
       /* Saturate each result, pack them and store with a single write */
       write_q7x4_ia (&pDst, __PACKq7(__SSAT((s0 << shiftBits), 8),
                                      __SSAT((s1 << shiftBits), 8),
                                      __SSAT((s2 << shiftBits), 8),
                                      __SSAT((s3 << shiftBits), 8) ));
 #else
       pDst[0] = (q7_t) __SSAT(((q15_t) pSrc[0] << shiftBits), 8);
       pDst[1] = (q7_t) __SSAT(((q15_t) pSrc[1] << shiftBits), 8);
       pDst[2] = (q7_t) __SSAT(((q15_t) pSrc[2] << shiftBits), 8);
       pDst[3] = (q7_t) __SSAT(((q15_t) pSrc[3] << shiftBits), 8);
       pSrc += 4;
       pDst += 4;
 #endif
     }
   }
   /* Loop unrolling: 0 to 3 samples remain */
   blkCnt = blockSize & 0x3U;
 #else
   /* No unrolling: every sample goes through the single-sample loops below */
   blkCnt = blockSize;
 #endif /* #if defined (ARM_MATH_LOOPUNROLL) */
   if (isRightShift != 0U)
   {
     /* Negative shiftBits: C = A >> -shiftBits */
     for (; blkCnt > 0U; blkCnt--)
     {
       *pDst++ = (*pSrc++ >> -shiftBits);
     }
   }
   else
   {
     /* Non-negative shiftBits: C = A << shiftBits, saturated to 8 bits */
     for (; blkCnt > 0U; blkCnt--)
     {
       *pDst++ = (q7_t) __SSAT(((q15_t) *pSrc++ << shiftBits), 8);
     }
   }
 }
 #endif /* defined(ARM_MATH_MVEI) */
 /**
  @} end of BasicShift group
 */
--- a/libraries/cmsis/dsp/Source/BasicMathFunctions/arm_sub_f32.c
+++ b/libraries/cmsis/dsp/Source/BasicMathFunctions/arm_sub_f32.c
@@ -0,0 +1,202 @@
 /* ----------------------------------------------------------------------
 * Project:      CMSIS DSP Library
 * Title:        arm_sub_f32.c
 * Description:  Floating-point vector subtraction
 *
 * $Date:        18. March 2019
 * $Revision:    V1.6.0
 *
 * Target Processor: Cortex-M cores
 * -------------------------------------------------------------------- */
 /*
 * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
 *
 * SPDX-License-Identifier: Apache-2.0
 *
 * Licensed under the Apache License, Version 2.0 (the License); you may
 * not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an AS IS BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 #include "arm_math.h"
 /**
  @ingroup groupMath
 */
 /**
  @defgroup BasicSub Vector Subtraction
  Element-by-element subtraction of two vectors.
  <pre>
      pDst[n] = pSrcA[n] - pSrcB[n],   0 <= n < blockSize.
  </pre>
  There are separate functions for floating-point, Q7, Q15, and Q31 data types.
 */
 /**
  @addtogroup BasicSub
  @{
 */
 /**
  @brief         Floating-point vector subtraction.
  @param[in]     pSrcA      points to the first input vector
  @param[in]     pSrcB      points to the second input vector
  @param[out]    pDst       points to the output vector
  @param[in]     blockSize  number of samples in each vector
  @return        none
 */
 #if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE)
 #include "arm_helium_utils.h"
 /* Helium (MVE) implementation: subtracts four float32 samples per vector operation. */
 void arm_sub_f32(
   const float32_t * pSrcA,
   const float32_t * pSrcB,
         float32_t * pDst,
         uint32_t blockSize)
 {
    uint32_t blkCnt;                               /* Number of full vectors, then number of tail samples */
    f32x4_t vec1;                                  /* Four samples of A */
    f32x4_t vec2;                                  /* Four samples of B */
    f32x4_t res;                                   /* Four differences A - B */
    /* Main loop: one full vector (4 samples) per iteration */
    blkCnt = blockSize >> 2U;
    while (blkCnt > 0U)
    {
        /* C = A - B */
        /* Subtract and then store the results in the destination buffer. */
        vec1 = vld1q(pSrcA);
        vec2 = vld1q(pSrcB);
        res = vsubq(vec1, vec2);
        vst1q(pDst, res);
        /* Increment pointers */
        pSrcA += 4;
        pSrcB += 4;
        pDst += 4;
        /* Decrement the loop counter */
        blkCnt--;
    }
    /* Tail: 1 to 3 remaining samples */
    blkCnt = blockSize & 0x3;
    if (blkCnt > 0U)
    {
      /* C = A - B */
      /* Predicate enabling only the first blkCnt lanes */
      mve_pred16_t p0 = vctp32q(blkCnt);
      /*
       * Predicated (zeroing) loads: lanes beyond blkCnt are not fetched, so
       * nothing past the end of either input buffer is read or subtracted.
       */
      vec1 = vld1q_z(pSrcA, p0);
      vec2 = vld1q_z(pSrcB, p0);
      /* Predicated store: only the first blkCnt outputs are written */
      vstrwq_p(pDst, vsubq(vec1,vec2), p0);
    }
 }
 #else
 /* Scalar / NEON implementation, compiled when the Helium (MVEF) version is not selected. */
 void arm_sub_f32(
   const float32_t * pSrcA,
   const float32_t * pSrcB,
         float32_t * pDst,
         uint32_t blockSize)
 {
         uint32_t blkCnt;                               /* Groups of 4, then single samples, left to process */
 #if defined(ARM_MATH_NEON) && !defined(ARM_MATH_AUTOVECTORIZE)
    f32x4_t lhs;                                   /* Four samples of A */
    f32x4_t rhs;                                   /* Four samples of B */
    /* NEON: subtract 4 samples per iteration */
    for (blkCnt = blockSize >> 2U; blkCnt > 0U; blkCnt--)
    {
        /* C = A - B */
        lhs = vld1q_f32(pSrcA);
        rhs = vld1q_f32(pSrcB);
        vst1q_f32(pDst, vsubq_f32(lhs, rhs));
        /* Step all three pointers past the processed vector */
        pSrcA += 4;
        pSrcB += 4;
        pDst += 4;
    }
    /* Tail: 0 to 3 samples remain for the scalar loop */
    blkCnt = blockSize & 0x3;
 #else
 #if defined (ARM_MATH_LOOPUNROLL) && !defined(ARM_MATH_AUTOVECTORIZE)
   /* Loop unrolling: subtract 4 samples per iteration */
   for (blkCnt = blockSize >> 2U; blkCnt > 0U; blkCnt--)
   {
     /* C = A - B */
     pDst[0] = pSrcA[0] - pSrcB[0];
     pDst[1] = pSrcA[1] - pSrcB[1];
     pDst[2] = pSrcA[2] - pSrcB[2];
     pDst[3] = pSrcA[3] - pSrcB[3];
     pSrcA += 4;
     pSrcB += 4;
     pDst += 4;
   }
   /* Loop unrolling: 0 to 3 samples remain */
   blkCnt = blockSize & 0x3U;
 #else
   /* No unrolling: every sample goes through the single-sample loop below */
   blkCnt = blockSize;
 #endif /* #if defined (ARM_MATH_LOOPUNROLL) */
 #endif /* #if defined(ARM_MATH_NEON) */
   /* Remaining samples, one at a time: C = A - B */
   for (; blkCnt > 0U; blkCnt--)
   {
     *pDst++ = (*pSrcA++) - (*pSrcB++);
   }
 }
 #endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
 /**
  @} end of BasicSub group
 */
--- a/libraries/cmsis/dsp/Source/BasicMathFunctions/arm_sub_q15.c
+++ b/libraries/cmsis/dsp/Source/BasicMathFunctions/arm_sub_q15.c
@@ -0,0 +1,178 @@
 /* ----------------------------------------------------------------------
 * Project:      CMSIS DSP Library
 * Title:        arm_sub_q15.c
 * Description:  Q15 vector subtraction
 *
 * $Date:        18. March 2019
 * $Revision:    V1.6.0
 *
 * Target Processor: Cortex-M cores
 * -------------------------------------------------------------------- */
 /*
 * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
 *
 * SPDX-License-Identifier: Apache-2.0
 *
 * Licensed under the Apache License, Version 2.0 (the License); you may
 * not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an AS IS BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 #include "arm_math.h"
 /**
  @ingroup groupMath
 */
 /**
  @addtogroup BasicSub
  @{
 */
 /**
  @brief         Q15 vector subtraction.
  @param[in]     pSrcA      points to the first input vector
  @param[in]     pSrcB      points to the second input vector
  @param[out]    pDst       points to the output vector
  @param[in]     blockSize  number of samples in each vector
  @return        none
  @par           Scaling and Overflow Behavior
                   The function uses saturating arithmetic.
                   Results outside of the allowable Q15 range [0x8000 0x7FFF] are saturated.
 */
 #if defined(ARM_MATH_MVEI)
 #include "arm_helium_utils.h"
 /* Helium (MVE) implementation: subtracts eight Q15 samples per vector operation. */
 void arm_sub_q15(
    const q15_t * pSrcA,
    const q15_t * pSrcB,
    q15_t * pDst,
    uint32_t blockSize)
 {
    uint32_t  blkCnt;           /* Number of full vectors, then number of tail samples */
    q15x8_t vecA;               /* Eight samples of A */
    q15x8_t vecB;               /* Eight samples of B */
    /* Main loop: one full vector (8 samples) per iteration */
    blkCnt = blockSize >> 3;
    while (blkCnt > 0U)
    {
        /*
         * C = A - B
         * Saturating subtract, then store the results in the destination buffer.
         */
        vecA = vld1q(pSrcA);
        vecB = vld1q(pSrcB);
        vst1q(pDst, vqsubq(vecA, vecB));
        /*
         * Decrement the vector loop counter
         */
        blkCnt--;
        /*
         * Advance source and destination pointers by one vector
         */
        pSrcA  += 8;
        pSrcB  += 8;
        pDst   += 8;
    }
    /*
     * Tail: 1 to 7 remaining samples
     */
    blkCnt = blockSize & 7;
    if (blkCnt > 0U)
    {
        /* Predicate enabling only the first blkCnt lanes */
        mve_pred16_t p0 = vctp16q(blkCnt);
        /*
         * Predicated (zeroing) loads: lanes beyond blkCnt are not fetched, so
         * nothing past the end of either input buffer is read or subtracted.
         */
        vecA = vld1q_z(pSrcA, p0);
        vecB = vld1q_z(pSrcB, p0);
        /* Predicated store: only the first blkCnt outputs are written */
        vstrhq_p(pDst, vqsubq(vecA, vecB), p0);
    }
 }
 #else
 /* Scalar implementation, compiled when ARM_MATH_MVEI is not defined. */
 void arm_sub_q15(
   const q15_t * pSrcA,
   const q15_t * pSrcB,
         q15_t * pDst,
         uint32_t blockSize)
 {
         uint32_t blkCnt;                               /* Groups of 4, then single samples, left to process */
 #if defined (ARM_MATH_LOOPUNROLL)
 #if defined (ARM_MATH_DSP)
   q31_t a01, a23;                                /* Packed sample pairs read from A */
   q31_t b01, b23;                                /* Packed sample pairs read from B */
 #endif
   /* Loop unrolling: process 4 samples per iteration */
   for (blkCnt = blockSize >> 2U; blkCnt > 0U; blkCnt--)
   {
     /* C = A - B */
 #if defined (ARM_MATH_DSP)
     /* Read two packed pairs from each source before anything is written */
     a01 = read_q15x2_ia ((q15_t **) &pSrcA);
     b01 = read_q15x2_ia ((q15_t **) &pSrcB);
     a23 = read_q15x2_ia ((q15_t **) &pSrcA);
     b23 = read_q15x2_ia ((q15_t **) &pSrcB);
     /* Subtract each packed pair with __QSUB16 and store two samples per write */
     write_q15x2_ia (&pDst, __QSUB16(a01, b01));
     write_q15x2_ia (&pDst, __QSUB16(a23, b23));
 #else
     /* Subtract in 32 bits, then saturate to 16 bits */
     pDst[0] = (q15_t) __SSAT(((q31_t) pSrcA[0] - pSrcB[0]), 16);
     pDst[1] = (q15_t) __SSAT(((q31_t) pSrcA[1] - pSrcB[1]), 16);
     pDst[2] = (q15_t) __SSAT(((q31_t) pSrcA[2] - pSrcB[2]), 16);
     pDst[3] = (q15_t) __SSAT(((q31_t) pSrcA[3] - pSrcB[3]), 16);
     pSrcA += 4;
     pSrcB += 4;
     pDst += 4;
 #endif
   }
   /* Loop unrolling: 0 to 3 samples remain */
   blkCnt = blockSize & 0x3U;
 #else
   /* No unrolling: every sample goes through the single-sample loop below */
   blkCnt = blockSize;
 #endif /* #if defined (ARM_MATH_LOOPUNROLL) */
   /* Remaining samples, one at a time: C = A - B */
   for (; blkCnt > 0U; blkCnt--)
   {
 #if defined (ARM_MATH_DSP)
     /* Only the low 16 bits of the __QSUB16 result are kept by the cast */
     *pDst++ = (q15_t) __QSUB16(*pSrcA++, *pSrcB++);
 #else
     *pDst++ = (q15_t) __SSAT(((q31_t) *pSrcA++ - *pSrcB++), 16);
 #endif
   }
 }
 #endif /* defined(ARM_MATH_MVEI) */
 /**
  @} end of BasicSub group
 */
--- a/libraries/cmsis/dsp/Source/BasicMathFunctions/arm_sub_q31.c
+++ b/libraries/cmsis/dsp/Source/BasicMathFunctions/arm_sub_q31.c
@@ -0,0 +1,159 @@
 /* ----------------------------------------------------------------------
 * Project:      CMSIS DSP Library
 * Title:        arm_sub_q31.c
 * Description:  Q31 vector subtraction
 *
 * $Date:        18. March 2019
 * $Revision:    V1.6.0
 *
 * Target Processor: Cortex-M cores
 * -------------------------------------------------------------------- */
 /*
 * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
 *
 * SPDX-License-Identifier: Apache-2.0
 *
 * Licensed under the Apache License, Version 2.0 (the License); you may
 * not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an AS IS BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 #include "arm_math.h"
 /**
  @ingroup groupMath
 */
 /**
  @addtogroup BasicSub
  @{
 */
 /**
  @brief         Q31 vector subtraction.
  @param[in]     pSrcA      points to the first input vector
  @param[in]     pSrcB      points to the second input vector
  @param[out]    pDst       points to the output vector
  @param[in]     blockSize  number of samples in each vector
  @return        none
  @par           Scaling and Overflow Behavior
                   The function uses saturating arithmetic.
                   Results outside of the allowable Q31 range [0x80000000 0x7FFFFFFF] are saturated.
 */
 #if defined(ARM_MATH_MVEI)
 #include "arm_helium_utils.h"
 /* Helium (MVE) implementation: subtracts four Q31 samples per vector operation. */
 void arm_sub_q31(
   const q31_t * pSrcA,
   const q31_t * pSrcB,
         q31_t * pDst,
         uint32_t blockSize)
 {
    uint32_t blkCnt;            /* Number of full vectors, then number of tail samples */
    q31x4_t vecA;               /* Four samples of A */
    q31x4_t vecB;               /* Four samples of B */
    /* Main loop: one full vector (4 samples) per iteration */
    blkCnt = blockSize >> 2;
    while (blkCnt > 0U)
    {
        /*
         * C = A - B
         * Saturating subtract, then store the results in the destination buffer.
         */
        vecA = vld1q(pSrcA);
        vecB = vld1q(pSrcB);
        vst1q(pDst, vqsubq(vecA, vecB));
        /*
         * Decrement the vector loop counter
         */
        blkCnt--;
        /*
         * Advance source and destination pointers by one vector
         */
        pSrcA  += 4;
        pSrcB  += 4;
        pDst   += 4;
    }
    /*
     * Tail: 1 to 3 remaining samples
     */
    blkCnt = blockSize & 3;
    if (blkCnt > 0U)
    {
        /* Predicate enabling only the first blkCnt lanes */
        mve_pred16_t p0 = vctp32q(blkCnt);
        /*
         * Predicated (zeroing) loads: lanes beyond blkCnt are not fetched, so
         * nothing past the end of either input buffer is read or subtracted.
         */
        vecA = vld1q_z(pSrcA, p0);
        vecB = vld1q_z(pSrcB, p0);
        /* Predicated store: only the first blkCnt outputs are written */
        vstrwq_p(pDst, vqsubq(vecA, vecB), p0);
    }
 }
 #else
 /* Scalar implementation, compiled when ARM_MATH_MVEI is not defined. */
 void arm_sub_q31(
   const q31_t * pSrcA,
   const q31_t * pSrcB,
         q31_t * pDst,
         uint32_t blockSize)
 {
         uint32_t blkCnt;                               /* Groups of 4, then single samples, left to process */
 #if defined (ARM_MATH_LOOPUNROLL)
   /* Loop unrolling: process 4 samples per iteration */
   for (blkCnt = blockSize >> 2U; blkCnt > 0U; blkCnt--)
   {
     /* C = A - B, computed with __QSUB */
     pDst[0] = __QSUB(pSrcA[0], pSrcB[0]);
     pDst[1] = __QSUB(pSrcA[1], pSrcB[1]);
     pDst[2] = __QSUB(pSrcA[2], pSrcB[2]);
     pDst[3] = __QSUB(pSrcA[3], pSrcB[3]);
     pSrcA += 4;
     pSrcB += 4;
     pDst += 4;
   }
   /* Loop unrolling: 0 to 3 samples remain */
   blkCnt = blockSize & 0x3U;
 #else
   /* No unrolling: every sample goes through the single-sample loop below */
   blkCnt = blockSize;
 #endif /* #if defined (ARM_MATH_LOOPUNROLL) */
   /* Remaining samples, one at a time: C = A - B */
   for (; blkCnt > 0U; blkCnt--)
   {
     *pDst++ = __QSUB(*pSrcA++, *pSrcB++);
   }
 }
 #endif /* defined(ARM_MATH_MVEI) */
 /**
  @} end of BasicSub group
 */
--- a/libraries/cmsis/dsp/Source/BasicMathFunctions/arm_sub_q7.c
+++ b/libraries/cmsis/dsp/Source/BasicMathFunctions/arm_sub_q7.c
@@ -0,0 +1,158 @@
 /* ----------------------------------------------------------------------
 * Project:      CMSIS DSP Library
 * Title:        arm_sub_q7.c
 * Description:  Q7 vector subtraction
 *
 * $Date:        18. March 2019
 * $Revision:    V1.6.0
 *
 * Target Processor: Cortex-M cores
 * -------------------------------------------------------------------- */
 /*
 * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
 *
 * SPDX-License-Identifier: Apache-2.0
 *
 * Licensed under the Apache License, Version 2.0 (the License); you may
 * not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an AS IS BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 #include "arm_math.h"
 /**
  @ingroup groupMath
 */
 /**
  @addtogroup BasicSub
  @{
 */
 /**
  @brief         Q7 vector subtraction.
  @param[in]     pSrcA      points to the first input vector
  @param[in]     pSrcB      points to the second input vector
  @param[out]    pDst       points to the output vector
  @param[in]     blockSize  number of samples in each vector
  @return        none
  @par           Scaling and Overflow Behavior
                   The function uses saturating arithmetic.
                   Results outside of the allowable Q7 range [0x80 0x7F] will be saturated.
 */
 #if defined(ARM_MATH_MVEI)
 #include "arm_helium_utils.h"
 /* Helium (MVE) implementation: subtracts sixteen Q7 samples per vector operation. */
 void arm_sub_q7(
    const q7_t * pSrcA,
    const q7_t * pSrcB,
    q7_t * pDst,
    uint32_t blockSize)
 {
    uint32_t  blkCnt;           /* Number of full vectors, then number of tail samples */
    q7x16_t vecA;               /* Sixteen samples of A */
    q7x16_t vecB;               /* Sixteen samples of B */
    /* Main loop: one full vector (16 samples) per iteration */
    blkCnt = blockSize >> 4;
    while (blkCnt > 0U)
    {
        /*
         * C = A - B
         * Saturating subtract, then store the results in the destination buffer.
         */
        vecA = vld1q(pSrcA);
        vecB = vld1q(pSrcB);
        vst1q(pDst, vqsubq(vecA, vecB));
        /*
         * Decrement the vector loop counter
         */
        blkCnt--;
        /*
         * Advance source and destination pointers by one vector
         */
        pSrcA  += 16;
        pSrcB  += 16;
        pDst   += 16;
    }
    /*
     * Tail: 1 to 15 remaining samples
     */
    blkCnt = blockSize & 0xF;
    if (blkCnt > 0U)
    {
        /* Predicate enabling only the first blkCnt lanes */
        mve_pred16_t p0 = vctp8q(blkCnt);
        /*
         * Predicated (zeroing) loads: lanes beyond blkCnt are not fetched, so
         * nothing past the end of either input buffer is read or subtracted.
         */
        vecA = vld1q_z(pSrcA, p0);
        vecB = vld1q_z(pSrcB, p0);
        /* Predicated store: only the first blkCnt outputs are written */
        vstrbq_p(pDst, vqsubq(vecA, vecB), p0);
    }
 }
 #else
 /* Scalar implementation, compiled when ARM_MATH_MVEI is not defined. */
 void arm_sub_q7(
   const q7_t * pSrcA,
   const q7_t * pSrcB,
         q7_t * pDst,
         uint32_t blockSize)
 {
         uint32_t blkCnt;                               /* Groups of 4, then single samples, left to process */
 #if defined (ARM_MATH_LOOPUNROLL)
   /* Loop unrolling: process 4 samples per iteration */
   for (blkCnt = blockSize >> 2U; blkCnt > 0U; blkCnt--)
   {
     /* C = A - B */
 #if defined (ARM_MATH_DSP)
     q31_t packedA;                               /* Four samples of A in one word */
     q31_t packedB;                               /* Four samples of B in one word */
     /* Read four samples from each source, subtract with __QSUB8, store with one write */
     packedA = read_q7x4_ia ((q7_t **) &pSrcA);
     packedB = read_q7x4_ia ((q7_t **) &pSrcB);
     write_q7x4_ia (&pDst, __QSUB8(packedA, packedB));
 #else
     /* Subtract in a wider type, then saturate to 8 bits */
     pDst[0] = (q7_t) __SSAT((q15_t) pSrcA[0] - pSrcB[0], 8);
     pDst[1] = (q7_t) __SSAT((q15_t) pSrcA[1] - pSrcB[1], 8);
     pDst[2] = (q7_t) __SSAT((q15_t) pSrcA[2] - pSrcB[2], 8);
     pDst[3] = (q7_t) __SSAT((q15_t) pSrcA[3] - pSrcB[3], 8);
     pSrcA += 4;
     pSrcB += 4;
     pDst += 4;
 #endif
   }
   /* Loop unrolling: 0 to 3 samples remain */
   blkCnt = blockSize & 0x3U;
 #else
   /* No unrolling: every sample goes through the single-sample loop below */
   blkCnt = blockSize;
 #endif /* #if defined (ARM_MATH_LOOPUNROLL) */
   /* Remaining samples, one at a time: C = A - B, saturated to 8 bits */
   for (; blkCnt > 0U; blkCnt--)
   {
     *pDst++ = (q7_t) __SSAT((q15_t) *pSrcA++ - *pSrcB++, 8);
   }
 }
 #endif /* defined(ARM_MATH_MVEI) */
 /**
  @} end of BasicSub group
 */
--- a/libraries/cmsis/dsp/Source/BasicMathFunctions/arm_xor_u16.c
+++ b/libraries/cmsis/dsp/Source/BasicMathFunctions/arm_xor_u16.c
@@ -0,0 +1,137 @@
 /* ----------------------------------------------------------------------
 * Project:      CMSIS DSP Library
 * Title:        arm_xor_u16.c
 * Description:  uint16_t bitwise exclusive OR
 *
 * $Date:        14 November 2019
 * $Revision:    V1.6.0
 *
 * Target Processor: Cortex-M cores
 * -------------------------------------------------------------------- */
 /*
 * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
 *
 * SPDX-License-Identifier: Apache-2.0
 *
 * Licensed under the Apache License, Version 2.0 (the License); you may
 * not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an AS IS BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 #include "arm_math.h"
 /**
  @ingroup groupMath
 */
 /**
  @defgroup Xor Vector bitwise exclusive OR
  Compute the logical bitwise XOR.
  There are separate functions for uint32_t, uint16_t, and uint8_t data types.
 */
 /**
  @addtogroup Xor
  @{
 */
 /**
  @brief         Compute the logical bitwise XOR of two fixed-point vectors.
  @param[in]     pSrcA      points to input vector A
  @param[in]     pSrcB      points to input vector B
  @param[out]    pDst       points to output vector
  @param[in]     blockSize  number of samples in each vector
  @return        none
 */
 /*
  * Element-wise XOR of two uint16_t vectors: pDst[n] = pSrcA[n] ^ pSrcB[n].
  * Three build variants: Helium (MVE), NEON with a scalar tail, and plain scalar.
  */
 void arm_xor_u16(
    const uint16_t * pSrcA,
    const uint16_t * pSrcB,
          uint16_t * pDst,
          uint32_t blockSize)
 {
    uint32_t blkCnt;      /* Loop counter */
 #if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
    /* Unsigned vectors, matching the uint16_t pointers and veorq_u16 */
    uint16x8_t vecSrcA, vecSrcB;
    /* Compute 8 outputs at a time */
    blkCnt = blockSize >> 3;
    while (blkCnt > 0U)
    {
        vecSrcA = vld1q(pSrcA);
        vecSrcB = vld1q(pSrcB);
        vst1q(pDst, veorq_u16(vecSrcA, vecSrcB) );
        pSrcA += 8;
        pSrcB += 8;
        pDst  += 8;
        /* Decrement the loop counter */
        blkCnt--;
    }
    /* Tail: 1 to 7 remaining samples */
    blkCnt = blockSize & 7;
    if (blkCnt > 0U)
    {
        /* Predicate enabling only the first blkCnt lanes */
        mve_pred16_t p0 = vctp16q(blkCnt);
        /*
         * Predicated (zeroing) loads: lanes beyond blkCnt are not fetched, so
         * nothing past the end of either input buffer is read.
         */
        vecSrcA = vld1q_z(pSrcA, p0);
        vecSrcB = vld1q_z(pSrcB, p0);
        /* Predicated store: only the first blkCnt outputs are written */
        vstrhq_p(pDst, veorq_u16(vecSrcA, vecSrcB), p0);
    }
 #else
 #if defined(ARM_MATH_NEON) && !defined(ARM_MATH_AUTOVECTORIZE)
    uint16x8_t vecA, vecB;
    /* Compute 8 outputs at a time */
    blkCnt = blockSize >> 3U;
    while (blkCnt > 0U)
    {
        vecA = vld1q_u16(pSrcA);
        vecB = vld1q_u16(pSrcB);
        vst1q_u16(pDst, veorq_u16(vecA, vecB) );
        pSrcA += 8;
        pSrcB += 8;
        pDst  += 8;
        /* Decrement the loop counter */
        blkCnt--;
    }
    /* Tail: 0 to 7 samples remain for the scalar loop */
    blkCnt = blockSize & 7;
 #else
    /* Initialize blkCnt with number of samples */
    blkCnt = blockSize;
 #endif
    /* Scalar loop: one sample per iteration */
    while (blkCnt > 0U)
    {
        *pDst++ = (*pSrcA++)^(*pSrcB++);
        /* Decrement the loop counter */
        blkCnt--;
    }
 #endif /* if defined(ARM_MATH_MVEI) */
 }
 /**
  @} end of Xor group
 */
--- a/libraries/cmsis/dsp/Source/BasicMathFunctions/arm_xor_u32.c
+++ b/libraries/cmsis/dsp/Source/BasicMathFunctions/arm_xor_u32.c
@@ -0,0 +1,129 @@
 /* ----------------------------------------------------------------------
 * Project:      CMSIS DSP Library
 * Title:        arm_xor_u32.c
 * Description:  uint32_t bitwise exclusive OR
 *
 * $Date:        14 November 2019
 * $Revision:    V1.6.0
 *
 * Target Processor: Cortex-M cores
 * -------------------------------------------------------------------- */
 /*
 * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
 *
 * SPDX-License-Identifier: Apache-2.0
 *
 * Licensed under the Apache License, Version 2.0 (the License); you may
 * not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an AS IS BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 #include "arm_math.h"
 /**
  @ingroup groupMath
 */
 /**
  @addtogroup Xor
  @{
 */
 /**
  @brief         Compute the logical bitwise XOR of two fixed-point vectors.
  @param[in]     pSrcA      points to input vector A
  @param[in]     pSrcB      points to input vector B
  @param[out]    pDst       points to output vector
  @param[in]     blockSize  number of samples in each vector
  @return        none
  @par           Details
                   pDst[i] = pSrcA[i] ^ pSrcB[i] for i in [0, blockSize).
                   Nothing is written when blockSize is 0. The vectorised
                   variants never read the inputs beyond blockSize elements.
 */
 void arm_xor_u32(
    const uint32_t * pSrcA,
    const uint32_t * pSrcB,
          uint32_t * pDst,
          uint32_t blockSize)
 {
    uint32_t blkCnt;      /* Loop counter */
 #if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
    /* Unsigned lanes: matches the pointer element type and veorq_u32 */
    uint32x4_t vecSrcA, vecSrcB;
    /* Compute 4 outputs at a time */
    blkCnt = blockSize >> 2;
    while (blkCnt > 0U)
    {
        vecSrcA = vld1q(pSrcA);
        vecSrcB = vld1q(pSrcB);
        vst1q(pDst, veorq_u32(vecSrcA, vecSrcB));
        pSrcA += 4;
        pSrcB += 4;
        pDst  += 4;
        /* Decrement the loop counter */
        blkCnt--;
    }
    /* Tail: 1..3 samples left */
    blkCnt = blockSize & 3;
    if (blkCnt > 0U)
    {
        mve_pred16_t p0 = vctp32q(blkCnt);
        /* Predicated (zeroing) loads: lanes beyond the remaining samples are
           not read, so the inputs are never accessed past blockSize elements.
           The store below is predicated with the same mask. */
        vecSrcA = vld1q_z(pSrcA, p0);
        vecSrcB = vld1q_z(pSrcB, p0);
        vstrwq_p(pDst, veorq_u32(vecSrcA, vecSrcB), p0);
    }
 #else
 #if defined(ARM_MATH_NEON) && !defined(ARM_MATH_AUTOVECTORIZE)
    uint32x4_t vecA, vecB;
    /* Compute 4 outputs at a time */
    blkCnt = blockSize >> 2U;
    while (blkCnt > 0U)
    {
        vecA = vld1q_u32(pSrcA);
        vecB = vld1q_u32(pSrcB);
        vst1q_u32(pDst, veorq_u32(vecA, vecB));
        pSrcA += 4;
        pSrcB += 4;
        pDst  += 4;
        /* Decrement the loop counter */
        blkCnt--;
    }
    /* Tail: the leftover samples are handled by the scalar loop below */
    blkCnt = blockSize & 3;
 #else
    /* Initialize blkCnt with number of samples */
    blkCnt = blockSize;
 #endif
    /* Scalar loop: the whole vector (generic build) or the NEON remainder */
    while (blkCnt > 0U)
    {
        *pDst++ = (*pSrcA++)^(*pSrcB++);
        /* Decrement the loop counter */
        blkCnt--;
    }
 #endif /* if defined(ARM_MATH_MVEI) */
 }
 /**
  @} end of Xor group
 */
--- a/libraries/cmsis/dsp/Source/BasicMathFunctions/arm_xor_u8.c
+++ b/libraries/cmsis/dsp/Source/BasicMathFunctions/arm_xor_u8.c
@@ -0,0 +1,129 @@
 /* ----------------------------------------------------------------------
 * Project:      CMSIS DSP Library
 * Title:        arm_xor_u8.c
 * Description:  uint8_t bitwise exclusive OR
 *
 * $Date:        14 November 2019
 * $Revision:    V1.6.0
 *
 * Target Processor: Cortex-M cores
 * -------------------------------------------------------------------- */
 /*
 * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
 *
 * SPDX-License-Identifier: Apache-2.0
 *
 * Licensed under the Apache License, Version 2.0 (the License); you may
 * not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an AS IS BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 #include "arm_math.h"
 /**
  @ingroup groupMath
 */
 /**
  @addtogroup Xor
  @{
 */
 /**
  @brief         Compute the logical bitwise XOR of two fixed-point vectors.
  @param[in]     pSrcA      points to input vector A
  @param[in]     pSrcB      points to input vector B
  @param[out]    pDst       points to output vector
  @param[in]     blockSize  number of samples in each vector
  @return        none
  @par           Details
                   pDst[i] = pSrcA[i] ^ pSrcB[i] for i in [0, blockSize).
                   Nothing is written when blockSize is 0. The vectorised
                   variants never read the inputs beyond blockSize elements.
 */
 void arm_xor_u8(
    const uint8_t * pSrcA,
    const uint8_t * pSrcB,
          uint8_t * pDst,
          uint32_t blockSize)
 {
    uint32_t blkCnt;      /* Loop counter */
 #if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
    /* Unsigned lanes: matches the pointer element type and veorq_u8 */
    uint8x16_t vecSrcA, vecSrcB;
    /* Compute 16 outputs at a time */
    blkCnt = blockSize >> 4;
    while (blkCnt > 0U)
    {
        vecSrcA = vld1q(pSrcA);
        vecSrcB = vld1q(pSrcB);
        vst1q(pDst, veorq_u8(vecSrcA, vecSrcB));
        pSrcA += 16;
        pSrcB += 16;
        pDst  += 16;
        /* Decrement the loop counter */
        blkCnt--;
    }
    /* Tail: 1..15 samples left */
    blkCnt = blockSize & 0xF;
    if (blkCnt > 0U)
    {
        mve_pred16_t p0 = vctp8q(blkCnt);
        /* Predicated (zeroing) loads: lanes beyond the remaining samples are
           not read, so the inputs are never accessed past blockSize elements.
           The store below is predicated with the same mask. */
        vecSrcA = vld1q_z(pSrcA, p0);
        vecSrcB = vld1q_z(pSrcB, p0);
        vstrbq_p(pDst, veorq_u8(vecSrcA, vecSrcB), p0);
    }
 #else
 #if defined(ARM_MATH_NEON) && !defined(ARM_MATH_AUTOVECTORIZE)
    uint8x16_t vecA, vecB;
    /* Compute 16 outputs at a time */
    blkCnt = blockSize >> 4U;
    while (blkCnt > 0U)
    {
        vecA = vld1q_u8(pSrcA);
        vecB = vld1q_u8(pSrcB);
        vst1q_u8(pDst, veorq_u8(vecA, vecB));
        pSrcA += 16;
        pSrcB += 16;
        pDst  += 16;
        /* Decrement the loop counter */
        blkCnt--;
    }
    /* Tail: the leftover samples are handled by the scalar loop below */
    blkCnt = blockSize & 0xF;
 #else
    /* Initialize blkCnt with number of samples */
    blkCnt = blockSize;
 #endif
    /* Scalar loop: the whole vector (generic build) or the NEON remainder */
    while (blkCnt > 0U)
    {
        *pDst++ = (*pSrcA++)^(*pSrcB++);
        /* Decrement the loop counter */
        blkCnt--;
    }
 #endif /* if defined(ARM_MATH_MVEI) */
 }
 /**
  @} end of Xor group
 */
--- a/libraries/cmsis/dsp/Source/BayesFunctions/BayesFunctions.c
+++ b/libraries/cmsis/dsp/Source/BayesFunctions/BayesFunctions.c
@@ -0,0 +1,29 @@
 /* ----------------------------------------------------------------------
 * Project:      CMSIS DSP Library
 * Title:        BayesFunctions.c
 * Description:  Combination of all bayes function source files.
 *
 * $Date:        16. March 2020
 * $Revision:    V1.0.0
 *
 * Target Processor: Cortex-M cores
 * -------------------------------------------------------------------- */
 /*
 * Copyright (C) 2020 ARM Limited or its affiliates. All rights reserved.
 *
 * SPDX-License-Identifier: Apache-2.0
 *
 * Licensed under the Apache License, Version 2.0 (the License); you may
 * not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an AS IS BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 #include "arm_gaussian_naive_bayes_predict_f32.c"
--- a/libraries/cmsis/dsp/Source/BayesFunctions/CMakeLists.txt
+++ b/libraries/cmsis/dsp/Source/BayesFunctions/CMakeLists.txt
@@ -0,0 +1,19 @@
 cmake_minimum_required (VERSION 3.6)
 project(CMSISDSPBayes)
 # configLib() and configDsp() are provided by helper modules that must be
 # reachable through CMAKE_MODULE_PATH (set up by the parent project).
 include(configLib)
 include(configDsp)
 # Sources are listed explicitly instead of globbing "./*_*.c": a glob is only
 # evaluated at configure time, so files added later are silently missed until
 # CMake is re-run, and additions do not show up in review diffs.
 # NOTE(review): the list mirrors the single kernel aggregated by
 # BayesFunctions.c - confirm no other *_*.c source lives in this directory.
 # SRC is kept (as an absolute path, like file(GLOB) produced) in case the
 # config helpers read it.
 set(SRC
   "${CMAKE_CURRENT_SOURCE_DIR}/arm_gaussian_naive_bayes_predict_f32.c")
 add_library(CMSISDSPBayes STATIC ${SRC})
 configLib(CMSISDSPBayes ${ROOT})
 configDsp(CMSISDSPBayes ${ROOT})
 ### Includes
 target_include_directories(CMSISDSPBayes PUBLIC "${DSP}/Include")
--- a/libraries/cmsis/dsp/Source/BayesFunctions/arm_gaussian_naive_bayes_predict_f32.c
+++ b/libraries/cmsis/dsp/Source/BayesFunctions/arm_gaussian_naive_bayes_predict_f32.c
@@ -0,0 +1,397 @@
 /* ----------------------------------------------------------------------
 * Project:      CMSIS DSP Library
 * Title:        arm_gaussian_naive_bayes_predict_f32.c
 * Description:  Naive Gaussian Bayesian Estimator
 *
 *
 * Target Processor: Cortex-M and Cortex-A cores
 * -------------------------------------------------------------------- */
 /*
 * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
 *
 * SPDX-License-Identifier: Apache-2.0
 *
 * Licensed under the Apache License, Version 2.0 (the License); you may
 * not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an AS IS BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 #include "arm_math.h"
 #include <limits.h>
 #include <math.h>
 #define PI_F 3.1415926535897932384626433832795f
 #define DPI_F (2.0f*3.1415926535897932384626433832795f)
 /**
 * @addtogroup groupBayes
 * @{
 */
 /**
 * @brief Naive Gaussian Bayesian Estimator
 *
 * @param[in]  *S         points to a naive bayes instance structure
 * @param[in]  *in        points to the elements of the input vector.
 * @param[in]  *pBuffer   points to a buffer of length numberOfClasses
 * @return The predicted class
 *
 * @par If the number of classes is large, the MVE version consumes a lot of
 * stack, since the log priors are stored in an array on the stack.
 *
 */
 #if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE)
 #include "arm_helium_utils.h"
 #include "arm_vec_math.h"
 /* Helium (MVE) variant.
    For every class, accumulates over the feature dimensions
        -0.5 * sum(log(2*PI*(sigma + epsilon)))
        -0.5 * sum((in - theta)^2 / (sigma + epsilon))
    adds log(classPrior), stores the score in pBuffer[class] and returns the
    index of the largest score.
    The log priors are kept in a variable-length array on the stack, so stack
    usage grows with S->numberOfClasses (which must be non-zero for the VLA
    to be well defined). */
 uint32_t arm_gaussian_naive_bayes_predict_f32(const arm_gaussian_naive_bayes_instance_f32 *S,
   const float32_t * in,
   float32_t *pBuffer)
 {
    uint32_t         nbClass;
    const float32_t *pTheta = S->theta;
    const float32_t *pSigma = S->sigma;
    float32_t      *buffer = pBuffer;
    const float32_t *pIn;
    float32_t       result;
    f32x4_t         vsigma;
    float32_t       tmp;
    f32x4_t         vacc1, vacc2;
    uint32_t        index;
    float32_t       logclassPriors[S->numberOfClasses];
    float32_t      *pLogPrior = logclassPriors;
    /* Log of every class prior, computed once up front */
    arm_vlog_f32((float32_t *) S->classPriors, logclassPriors, S->numberOfClasses);
    /* pTheta / pSigma walk through the per-class parameter rows without being
       reset, so each class consumes the next S->vectorDimension entries */
    for (nbClass = 0; nbClass < S->numberOfClasses; nbClass++) {
        /* The same input vector is scored against every class */
        pIn = in;
        vacc1 = vdupq_n_f32(0);
        vacc2 = vdupq_n_f32(0);
        uint32_t         blkCnt =S->vectorDimension >> 2;
        while (blkCnt > 0U) {
            f32x4_t         vinvSigma, vtmp;
            vsigma = vaddq_n_f32(vld1q(pSigma), S->epsilon);
            vacc1 = vaddq(vacc1, vlogq_f32(vmulq_n_f32(vsigma, 2.0f * PI)));
            vinvSigma = vrecip_medprec_f32(vsigma);
            vtmp = vsubq(vld1q(pIn), vld1q(pTheta));
            /* squaring */
            vtmp = vmulq(vtmp, vtmp);
            vacc2 = vfmaq(vacc2, vtmp, vinvSigma);
            pIn += 4;
            pTheta += 4;
            pSigma += 4;
            blkCnt--;
        }
        /* Tail: 1..3 dimensions left */
        blkCnt = S->vectorDimension & 3;
        if (blkCnt > 0U) {
            mve_pred16_t    p0 = vctp32q(blkCnt);
            f32x4_t         vinvSigma, vtmp;
            /* Predicated (zeroing) loads: only the remaining lanes are read,
               so in / theta / sigma are never accessed past their last
               element. Inactive lanes hold don't-care values that the
               predicated accumulations below discard. */
            vsigma = vaddq_n_f32(vld1q_z(pSigma, p0), S->epsilon);
            vacc1 =
                vaddq_m_f32(vacc1, vacc1, vlogq_f32(vmulq_n_f32(vsigma, 2.0f * PI)), p0);
            vinvSigma = vrecip_medprec_f32(vsigma);
            vtmp = vsubq(vld1q_z(pIn, p0), vld1q_z(pTheta, p0));
            /* squaring */
            vtmp = vmulq(vtmp, vtmp);
            vacc2 = vfmaq_m_f32(vacc2, vtmp, vinvSigma, p0);
            pTheta += blkCnt;
            pSigma += blkCnt;
        }
        /* Horizontal sums of both accumulators, scaled by -0.5 */
        tmp = -0.5f * vecAddAcrossF32Mve(vacc1);
        tmp -= 0.5f * vecAddAcrossF32Mve(vacc2);
        *buffer = tmp + *pLogPrior++;
        buffer++;
    }
    /* The predicted class is the position of the largest score */
    arm_max_f32(pBuffer, S->numberOfClasses, &result, &index);
    return (index);
 }
 #else
 #if defined(ARM_MATH_NEON)
 #include "NEMath.h"
 /* NEON variant.
    Scores the input vector against every class and returns the index of the
    class with the largest score. Per class the score written to pBuffer is
        log(prior) - 0.5 * sum(log(2*pi*s)) - 0.5 * sum((in - theta)^2 / s)
    with s = sigma + epsilon, summed over the S->vectorDimension features.
    Classes are processed two at a time; an odd last class is handled by a
    second, single-class loop. */
 uint32_t arm_gaussian_naive_bayes_predict_f32(const arm_gaussian_naive_bayes_instance_f32 *S,
   const float32_t * in,
   float32_t *pBuffer)
 {
    const float32_t *pPrior = S->classPriors;
    /* pTheta/pSigma walk the parameters of the first class of a pair,
       pTheta1/pSigma1 those of the second class, which start
       S->vectorDimension entries further on */
    const float32_t *pTheta = S->theta;
    const float32_t *pSigma = S->sigma;
    const float32_t *pTheta1 = S->theta + S->vectorDimension;
    const float32_t *pSigma1 = S->sigma + S->vectorDimension;
    float32_t *buffer = pBuffer;
    const float32_t *pIn=in;
    float32_t result;
    float32_t sigma,sigma1;
    float32_t tmp,tmp1;
    uint32_t index;
    uint32_t vecBlkCnt;
    uint32_t classBlkCnt;
    float32x4_t epsilonV;
    float32x4_t sigmaV,sigmaV1;
    float32x4_t tmpV,tmpVb,tmpV1;
    float32x2_t tmpV2;
    float32x4_t thetaV,thetaV1;
    float32x4_t inV;
    epsilonV = vdupq_n_f32(S->epsilon);
    /* Number of class pairs */
    classBlkCnt = S->numberOfClasses >> 1;
    while(classBlkCnt > 0)
    {
        /* Both classes of the pair are scored against the same input */
        pIn = in;
        /* Scalar scores start from the log priors of the two classes */
        tmp = logf(*pPrior++);
        tmp1 = logf(*pPrior++);
        /* Vector partial sums for the first / second class */
        tmpV = vdupq_n_f32(0.0f);
        tmpV1 = vdupq_n_f32(0.0f);
        /* Four feature dimensions per iteration */
        vecBlkCnt = S->vectorDimension >> 2;
        while(vecBlkCnt > 0)
        {
           sigmaV = vld1q_f32(pSigma);
           thetaV = vld1q_f32(pTheta);
           sigmaV1 = vld1q_f32(pSigma1);
           thetaV1 = vld1q_f32(pTheta1);
           inV = vld1q_f32(pIn);
           sigmaV = vaddq_f32(sigmaV, epsilonV);
           sigmaV1 = vaddq_f32(sigmaV1, epsilonV);
           /* tmpV -= 0.5 * log(2*pi*s); vlogq_f32 comes from NEMath.h and is
              presumably a vector natural log - TODO confirm */
           tmpVb = vmulq_n_f32(sigmaV,DPI_F);
           tmpVb = vlogq_f32(tmpVb);
           tmpV = vmlsq_n_f32(tmpV,tmpVb,0.5f);
           tmpVb = vmulq_n_f32(sigmaV1,DPI_F);
           tmpVb = vlogq_f32(tmpVb);
           tmpV1 = vmlsq_n_f32(tmpV1,tmpVb,0.5f);
           /* tmpV -= 0.5 * (in - theta)^2 * vinvq_f32(s); vinvq_f32 comes
              from NEMath.h and is presumably a vector reciprocal, whereas
              the scalar tail below uses an exact division - TODO confirm */
           tmpVb = vsubq_f32(inV,thetaV);
           tmpVb = vmulq_f32(tmpVb,tmpVb);
           tmpVb = vmulq_f32(tmpVb, vinvq_f32(sigmaV));
           tmpV = vmlsq_n_f32(tmpV,tmpVb,0.5f);
           tmpVb = vsubq_f32(inV,thetaV1);
           tmpVb = vmulq_f32(tmpVb,tmpVb);
           tmpVb = vmulq_f32(tmpVb, vinvq_f32(sigmaV1));
           tmpV1 = vmlsq_n_f32(tmpV1,tmpVb,0.5f);
           pIn += 4;
           pTheta += 4;
           pSigma += 4;
           pTheta1 += 4;
           pSigma1 += 4;
           vecBlkCnt--;
        }
        /* Horizontal sum of the four lanes, folded into the scalar scores */
        tmpV2 = vpadd_f32(vget_low_f32(tmpV),vget_high_f32(tmpV));
        tmp += vget_lane_f32(tmpV2, 0) + vget_lane_f32(tmpV2, 1);
        tmpV2 = vpadd_f32(vget_low_f32(tmpV1),vget_high_f32(tmpV1));
        tmp1 += vget_lane_f32(tmpV2, 0) + vget_lane_f32(tmpV2, 1);
        /* Remaining 0..3 dimensions, scalar */
        vecBlkCnt = S->vectorDimension & 3;
        while(vecBlkCnt > 0)
        {
           sigma = *pSigma + S->epsilon;
           sigma1 = *pSigma1 + S->epsilon;
           tmp -= 0.5f*logf(2.0f * PI_F * sigma);
           tmp -= 0.5f*(*pIn - *pTheta) * (*pIn - *pTheta) / sigma;
           tmp1 -= 0.5f*logf(2.0f * PI_F * sigma1);
           tmp1 -= 0.5f*(*pIn - *pTheta1) * (*pIn - *pTheta1) / sigma1;
           pIn++;
           pTheta++;
           pSigma++;
           pTheta1++;
           pSigma1++;
           vecBlkCnt--;
        }
        *buffer++ = tmp;
        *buffer++ = tmp1;
        /* Each pointer has advanced by one class (vectorDimension entries);
           skip one more so that the pair of pointers lands on the next two
           classes */
        pSigma += S->vectorDimension;
        pTheta += S->vectorDimension;
        pSigma1 += S->vectorDimension;
        pTheta1 += S->vectorDimension;
        classBlkCnt--;
    }
    /* Odd number of classes: score the last one on its own (runs 0 or 1
       times); only pTheta/pSigma are used here */
    classBlkCnt = S->numberOfClasses & 1;
    while(classBlkCnt > 0)
    {
        pIn = in;
        tmp = logf(*pPrior++);
        tmpV = vdupq_n_f32(0.0f);
        vecBlkCnt = S->vectorDimension >> 2;
        while(vecBlkCnt > 0)
        {
           sigmaV = vld1q_f32(pSigma);
           thetaV = vld1q_f32(pTheta);
           inV = vld1q_f32(pIn);
           sigmaV = vaddq_f32(sigmaV, epsilonV);
           tmpVb = vmulq_n_f32(sigmaV,DPI_F);
           tmpVb = vlogq_f32(tmpVb);
           tmpV = vmlsq_n_f32(tmpV,tmpVb,0.5f);
           tmpVb = vsubq_f32(inV,thetaV);
           tmpVb = vmulq_f32(tmpVb,tmpVb);
           tmpVb = vmulq_f32(tmpVb, vinvq_f32(sigmaV));
           tmpV = vmlsq_n_f32(tmpV,tmpVb,0.5f);
           pIn += 4;
           pTheta += 4;
           pSigma += 4;
           vecBlkCnt--;
        }
        /* Horizontal sum of the four lanes */
        tmpV2 = vpadd_f32(vget_low_f32(tmpV),vget_high_f32(tmpV));
        tmp += vget_lane_f32(tmpV2, 0) + vget_lane_f32(tmpV2, 1);
        /* Remaining 0..3 dimensions, scalar */
        vecBlkCnt = S->vectorDimension & 3;
        while(vecBlkCnt > 0)
        {
           sigma = *pSigma + S->epsilon;
           tmp -= 0.5f*logf(2.0f * PI_F * sigma);
           tmp -= 0.5f*(*pIn - *pTheta) * (*pIn - *pTheta) / sigma;
           pIn++;
           pTheta++;
           pSigma++;
           vecBlkCnt--;
        }
        *buffer++ = tmp;
        classBlkCnt--;
    }
    /* The predicted class is the position of the largest score in pBuffer */
    arm_max_f32(pBuffer,S->numberOfClasses,&result,&index);
    return(index);
 }
 #else
 /**
 * @brief Naive Gaussian Bayesian Estimator (portable scalar implementation)
 *
 * @param[in]  *S         points to a naive bayes instance structure
 * @param[in]  *in        points to the elements of the input vector.
 * @param[in]  *pBuffer   points to a buffer of length numberOfClasses;
 *                        receives one score per class
 * @return The predicted class, i.e. the index of the largest score
 *
 * @par For each class the score is
 *      log(prior) - 0.5 * sum(log(2*pi*s)) - 0.5 * sum((in - theta)^2 / s)
 *      with s = sigma + epsilon, summed over the vectorDimension features.
 *
 */
 uint32_t arm_gaussian_naive_bayes_predict_f32(const arm_gaussian_naive_bayes_instance_f32 *S,
   const float32_t * in,
   float32_t *pBuffer)
 {
    /* Cursors into the model; pMean and pSpread run through all classes
       without being reset, one vectorDimension-long row per class */
    const float32_t *pClassPrior = S->classPriors;
    const float32_t *pMean = S->theta;
    const float32_t *pSpread = S->sigma;
    float32_t bestScore;
    uint32_t bestClass;
    uint32_t classIdx;
    for(classIdx = 0; classIdx < S->numberOfClasses; classIdx++)
    {
        /* Every class is evaluated against the same input vector */
        const float32_t *pSample = in;
        float32_t logSum = 0.0f;
        float32_t distSum = 0.0f;
        float32_t score;
        uint32_t dimIdx;
        for(dimIdx = 0; dimIdx < S->vectorDimension; dimIdx++)
        {
           const float32_t spread = *pSpread++ + S->epsilon;
           const float32_t delta = *pSample++ - *pMean++;
           logSum += logf(2.0f * PI_F * spread);
           distSum += delta * delta / spread;
        }
        score = -0.5f * logSum;
        score -= 0.5f * distSum;
        pBuffer[classIdx] = score + logf(*pClassPrior++);
    }
    /* arm_max_f32 reports the largest score and its position */
    arm_max_f32(pBuffer,S->numberOfClasses,&bestScore,&bestClass);
    return(bestClass);
 }
 #endif
 #endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
 /**
 * @} end of groupBayes group
 */
--- a/libraries/cmsis/dsp/Source/CMakeLists.txt
+++ b/libraries/cmsis/dsp/Source/CMakeLists.txt
@@ -0,0 +1,280 @@
 cmake_minimum_required (VERSION 3.6)
 # CMP0077 (option() honours an existing normal variable) was introduced in
 # CMake 3.13. Setting an unknown policy is a hard error, so guard it to keep
 # the declared minimum of 3.6 usable.
 if(POLICY CMP0077)
   cmake_policy(SET CMP0077 NEW)
 endif()
 project(CMSISDSP)
 # DSP Sources
 # ROOT must be provided by the including project; DSP is the CMSIS-DSP root.
 set(DSP ${ROOT}/CMSIS/DSP)
 # Make the helper modules (configLib, configDsp, ...) visible to include().
 list(APPEND CMAKE_MODULE_PATH ${DSP}/Source)
 list(APPEND CMAKE_MODULE_PATH ${DSP})
 include(configLib)
 # ---- Acceleration / numerical behaviour -------------------------------
 option(NEON             "Neon acceleration"                             OFF)
 option(NEONEXPERIMENTAL "Neon experimental acceleration"                OFF)
 option(LOOPUNROLL       "Loop unrolling"                                ON)
 option(ROUNDING         "Rounding"                                      OFF)
 option(MATRIXCHECK      "Matrix Checks"                                 OFF)
 option(HELIUM           "Helium acceleration (MVEF and MVEI supported)" OFF)
 option(MVEF             "MVEF intrinsics supported"                     OFF)
 option(MVEI             "MVEI intrinsics supported"                     OFF)
 # ---- Function groups to build -----------------------------------------
 # Dependencies between the groups are NOT tracked by this script: enabling
 # one group may require enabling others by hand.
 option(BASICMATH   "Basic Math Functions"             ON)
 option(COMPLEXMATH "Complex Math Functions"           ON)
 option(CONTROLLER  "Controller Functions"             ON)
 option(FASTMATH    "Fast Math Functions"              ON)
 option(FILTERING   "Filtering Functions"              ON)
 option(MATRIX      "Matrix Functions"                 ON)
 option(STATISTICS  "Statistics Functions"             ON)
 option(SUPPORT     "Support Functions"                ON)
 option(TRANSFORM   "Transform Functions"              ON)
 option(SVM         "Support Vector Machine Functions" ON)
 option(BAYES       "Bayesian Estimators"              ON)
 option(DISTANCE    "Distance Functions"               ON)
 # ---- Data table selection ---------------------------------------------
 # OFF (default): every table is included.
 option(CONFIGTABLE "Configuration of table allowed"    OFF)
 # With CONFIGTABLE ON: include all interpolation tables.
 option(ALLFAST     "All interpolation tables included" OFF)
 # With CONFIGTABLE ON: include all FFT tables.
 option(ALLFFT      "All fft tables included"           OFF)
 # Per-feature switches for features that pull in a (possibly large) data
 # table. They are only taken into account when CONFIGTABLE is ON.
 option(ARM_COS_F32      "cos f32"      OFF)
 option(ARM_COS_Q31      "cos q31"      OFF)
 option(ARM_COS_Q15      "cos q15"      OFF)
 option(ARM_SIN_F32      "sin f32"      OFF)
 option(ARM_SIN_Q31      "sin q31"      OFF)
 option(ARM_SIN_Q15      "sin q15"      OFF)
 option(ARM_SIN_COS_F32  "sin cos f32"  OFF)
 option(ARM_SIN_COS_Q31  "sin cos q31"  OFF)
 option(ARM_LMS_NORM_Q31 "lms norm q31" OFF)
 option(ARM_LMS_NORM_Q15 "lms norm q15" OFF)
 # The remaining switches follow a regular <KIND>_<FORMAT>_<LENGTH> scheme and
 # are generated; the help text is the lower-cased name with spaces.
 # CFFT_{F64,F32,Q31,Q15}_{16..4096}
 foreach(cmsisdsp_fmt F64 F32 Q31 Q15)
   string(TOLOWER "${cmsisdsp_fmt}" cmsisdsp_fmt_lc)
   foreach(cmsisdsp_len 16 32 64 128 256 512 1024 2048 4096)
     option(CFFT_${cmsisdsp_fmt}_${cmsisdsp_len}
            "cfft ${cmsisdsp_fmt_lc} ${cmsisdsp_len}" OFF)
   endforeach()
 endforeach()
 # RFFT_FAST_{F32,F64}_{32..4096} and RFFT_{F32,F64}_{128,512,2048,8192}
 foreach(cmsisdsp_fmt F32 F64)
   string(TOLOWER "${cmsisdsp_fmt}" cmsisdsp_fmt_lc)
   foreach(cmsisdsp_len 32 64 128 256 512 1024 2048 4096)
     option(RFFT_FAST_${cmsisdsp_fmt}_${cmsisdsp_len}
            "rfft fast ${cmsisdsp_fmt_lc} ${cmsisdsp_len}" OFF)
   endforeach()
   foreach(cmsisdsp_len 128 512 2048 8192)
     option(RFFT_${cmsisdsp_fmt}_${cmsisdsp_len}
            "rfft ${cmsisdsp_fmt_lc} ${cmsisdsp_len}" OFF)
   endforeach()
 endforeach()
 # RFFT_{Q31,Q15}_{32..8192}
 foreach(cmsisdsp_fmt Q31 Q15)
   string(TOLOWER "${cmsisdsp_fmt}" cmsisdsp_fmt_lc)
   foreach(cmsisdsp_len 32 64 128 256 512 1024 2048 4096 8192)
     option(RFFT_${cmsisdsp_fmt}_${cmsisdsp_len}
            "rfft ${cmsisdsp_fmt_lc} ${cmsisdsp_len}" OFF)
   endforeach()
 endforeach()
 # DCT4_{F32,Q31,Q15}_{128,512,2048,8192}
 foreach(cmsisdsp_fmt F32 Q31 Q15)
   string(TOLOWER "${cmsisdsp_fmt}" cmsisdsp_fmt_lc)
   foreach(cmsisdsp_len 128 512 2048 8192)
     option(DCT4_${cmsisdsp_fmt}_${cmsisdsp_len}
            "dct4 ${cmsisdsp_fmt_lc} ${cmsisdsp_len}" OFF)
   endforeach()
 endforeach()
 # Do not leak the loop helpers into the sub-directories added below.
 unset(cmsisdsp_fmt)
 unset(cmsisdsp_fmt_lc)
 unset(cmsisdsp_len)
 ###########################
 #
 # CMSIS DSP
 #
 ###########################
 # CMSISDSP is an umbrella INTERFACE target: it owns no sources and simply
 # forwards every enabled component library to whoever links against it.
 add_library(CMSISDSP INTERFACE)
 if(BASICMATH)
   add_subdirectory(BasicMathFunctions)
   target_link_libraries(CMSISDSP INTERFACE CMSISDSPBasicMath)
 endif()
 if(COMPLEXMATH)
   add_subdirectory(ComplexMathFunctions)
   target_link_libraries(CMSISDSP INTERFACE CMSISDSPComplexMath)
 endif()
 # Controller, FastMath and Filtering use the "fast" (interpolation) tables;
 # ARM_FAST_ALLOW_TABLES is defined for them when table selection is enabled.
 if(CONTROLLER)
   add_subdirectory(ControllerFunctions)
   target_link_libraries(CMSISDSP INTERFACE CMSISDSPController)
   if(CONFIGTABLE)
     target_compile_definitions(CMSISDSPController PUBLIC ARM_FAST_ALLOW_TABLES)
   endif()
 endif()
 if(FASTMATH)
   add_subdirectory(FastMathFunctions)
   target_link_libraries(CMSISDSP INTERFACE CMSISDSPFastMath)
   if(CONFIGTABLE)
     target_compile_definitions(CMSISDSPFastMath PUBLIC ARM_FAST_ALLOW_TABLES)
   endif()
 endif()
 if(FILTERING)
   add_subdirectory(FilteringFunctions)
   target_link_libraries(CMSISDSP INTERFACE CMSISDSPFiltering)
   if(CONFIGTABLE)
     target_compile_definitions(CMSISDSPFiltering PUBLIC ARM_FAST_ALLOW_TABLES)
   endif()
 endif()
 if(MATRIX)
   add_subdirectory(MatrixFunctions)
   target_link_libraries(CMSISDSP INTERFACE CMSISDSPMatrix)
 endif()
 if(STATISTICS)
   add_subdirectory(StatisticsFunctions)
   target_link_libraries(CMSISDSP INTERFACE CMSISDSPStatistics)
 endif()
 if(SUPPORT)
   add_subdirectory(SupportFunctions)
   target_link_libraries(CMSISDSP INTERFACE CMSISDSPSupport)
 endif()
 # Transform uses the FFT tables; ARM_FFT_ALLOW_TABLES is defined for it when
 # table selection is enabled.
 if(TRANSFORM)
   add_subdirectory(TransformFunctions)
   target_link_libraries(CMSISDSP INTERFACE CMSISDSPTransform)
   if(CONFIGTABLE)
     target_compile_definitions(CMSISDSPTransform PUBLIC ARM_FFT_ALLOW_TABLES)
   endif()
 endif()
 # The common tables are built as soon as one table consumer is enabled.
 # The CommonTables project also adds the ComputeLibrary tables used by SVM
 # and Distance when Neon is ON.
 if(FILTERING OR CONTROLLER OR FASTMATH OR TRANSFORM OR SVM OR DISTANCE)
   add_subdirectory(CommonTables)
   # FFT tables inclusion is allowed
   if(TRANSFORM AND CONFIGTABLE)
     target_compile_definitions(CMSISDSPCommon PUBLIC ARM_FFT_ALLOW_TABLES)
   endif()
   # Fast tables inclusion is allowed
   if(CONFIGTABLE AND (FILTERING OR CONTROLLER OR FASTMATH))
     target_compile_definitions(CMSISDSPCommon PUBLIC ARM_FAST_ALLOW_TABLES)
   endif()
   target_link_libraries(CMSISDSP INTERFACE CMSISDSPCommon)
 endif()
 if(SVM)
   add_subdirectory(SVMFunctions)
   target_link_libraries(CMSISDSP INTERFACE CMSISDSPSVM)
 endif()
 if(BAYES)
   add_subdirectory(BayesFunctions)
   target_link_libraries(CMSISDSP INTERFACE CMSISDSPBayes)
 endif()
 if(DISTANCE)
   add_subdirectory(DistanceFunctions)
   target_link_libraries(CMSISDSP INTERFACE CMSISDSPDistance)
 endif()
 ### Includes
 target_include_directories(CMSISDSP INTERFACE "${DSP}/Include")
--- a/libraries/cmsis/dsp/Source/CommonTables/CMakeLists.txt
+++ b/libraries/cmsis/dsp/Source/CommonTables/CMakeLists.txt
@@ -0,0 +1,41 @@
 cmake_minimum_required(VERSION 3.6)
 project(CMSISDSPCommon)
 # Helper modules providing configLib() / configDsp(); they are looked up
 # through CMAKE_MODULE_PATH.
 include(configLib)
 include(configDsp)
 # Static library holding the constant tables shared by the DSP components.
 add_library(CMSISDSPCommon STATIC arm_common_tables.c)
 configLib(CMSISDSPCommon ${ROOT})
 configDsp(CMSISDSPCommon ${ROOT})
 # The "all tables" shortcuts only apply when table selection is enabled.
 if(CONFIGTABLE)
   if(ALLFFT)
     target_compile_definitions(CMSISDSPCommon PUBLIC ARM_ALL_FFT_TABLES)
   endif()
   if(ALLFAST)
     target_compile_definitions(CMSISDSPCommon PUBLIC ARM_ALL_FAST_TABLES)
   endif()
 endif()
 # fft() and interpol() are defined by the modules of the same name and are
 # applied to this target.
 # NOTE(review): presumably they translate the per-table options into compile
 # definitions - confirm in fft.cmake / interpol.cmake.
 include(fft)
 fft(CMSISDSPCommon)
 include(interpol)
 interpol(CMSISDSPCommon)
 target_sources(CMSISDSPCommon PRIVATE arm_const_structs.c)
 ### Includes
 target_include_directories(CMSISDSPCommon PUBLIC "${DSP}/Include")
 # Extra table sources that are only built for the vector back-ends.
 if(NEON OR NEONEXPERIMENTAL)
   target_sources(CMSISDSPCommon PRIVATE "${DSP}/ComputeLibrary/Source/arm_cl_tables.c")
 endif()
 if(HELIUM OR MVEF)
   target_sources(CMSISDSPCommon PRIVATE "${DSP}/Source/CommonTables/arm_mve_tables.c")
 endif()
--- a/libraries/cmsis/dsp/Source/CommonTables/CommonTables.c
+++ b/libraries/cmsis/dsp/Source/CommonTables/CommonTables.c
@@ -0,0 +1,31 @@
 /* ----------------------------------------------------------------------
 * Project:      CMSIS DSP Library
 * Title:        CommonTables.c
 * Description:  Combination of all common table source files.
 *
 * $Date:        08. January 2020
 * $Revision:    V1.1.0
 *
 * Target Processor: Cortex-M cores
 * -------------------------------------------------------------------- */
 /*
 * Copyright (C) 2019-2020 ARM Limited or its affiliates. All rights reserved.
 *
 * SPDX-License-Identifier: Apache-2.0
 *
 * Licensed under the Apache License, Version 2.0 (the License); you may
 * not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an AS IS BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 /*
  * Aggregate translation unit: the individual common-table source files are
  * textually included so that compiling this one file builds all of them.
  */
 #include "arm_common_tables.c"
 #include "arm_const_structs.c"
 #include "arm_mve_tables.c"
--- a/libraries/cmsis/dsp/Source/CommonTables/arm_common_tables.c
+++ b/libraries/cmsis/dsp/Source/CommonTables/arm_common_tables.c
--- a/libraries/cmsis/dsp/Source/CommonTables/arm_const_structs.c
+++ b/libraries/cmsis/dsp/Source/CommonTables/arm_const_structs.c
@@ -0,0 +1,663 @@
 /* ----------------------------------------------------------------------
 * Project:      CMSIS DSP Library
 * Title:        arm_const_structs.c
 * Description:  Constant structs that are initialized for user convenience.
 *               For example, some can be given as arguments to the arm_cfft_f32() or arm_rfft_f32() functions.
 *
 * $Date:        27. January 2017
 * $Revision:    V.1.5.1
 *
 * Target Processor: Cortex-M cores
 * -------------------------------------------------------------------- */
 /*
 * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
 *
 * SPDX-License-Identifier: Apache-2.0
 *
 * Licensed under the Apache License, Version 2.0 (the License); you may
 * not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an AS IS BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 #include "arm_math.h"
 #include "arm_const_structs.h"
 /*
 ALLOW TABLE is true when config table is enabled and the Transform folder is included
 for compilation.
 */
 #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_FFT_ALLOW_TABLES)
 /*
  * Double-precision (f64) complex FFT instances, one per length from 16 to 4096.
  * Each initializer gives the length, the twiddleCoefF64_<len> table, the
  * armBitRevIndexTableF64_<len> table and that table's *_TABLE_LENGTH macro.
  * When ARM_DSP_CONFIG_TABLES is defined, an instance is compiled only if
  * ARM_ALL_FFT_TABLES is defined or both of its per-length ARM_TABLE_* macros are.
  */
 #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F64_16) && defined(ARM_TABLE_BITREVIDX_FLT64_16))
 const arm_cfft_instance_f64 arm_cfft_sR_f64_len16 = {
  16, (const float64_t *)twiddleCoefF64_16, armBitRevIndexTableF64_16, ARMBITREVINDEXTABLEF64_16_TABLE_LENGTH
 };
 #endif
 #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F64_32) && defined(ARM_TABLE_BITREVIDX_FLT64_32))
 const arm_cfft_instance_f64 arm_cfft_sR_f64_len32 = {
  32, (const float64_t *)twiddleCoefF64_32, armBitRevIndexTableF64_32, ARMBITREVINDEXTABLEF64_32_TABLE_LENGTH
 };
 #endif
 #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F64_64) && defined(ARM_TABLE_BITREVIDX_FLT64_64))
 const arm_cfft_instance_f64 arm_cfft_sR_f64_len64 = {
  64, (const float64_t *)twiddleCoefF64_64, armBitRevIndexTableF64_64, ARMBITREVINDEXTABLEF64_64_TABLE_LENGTH
 };
 #endif
 #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F64_128) && defined(ARM_TABLE_BITREVIDX_FLT64_128))
 const arm_cfft_instance_f64 arm_cfft_sR_f64_len128 = {
  128, (const float64_t *)twiddleCoefF64_128, armBitRevIndexTableF64_128, ARMBITREVINDEXTABLEF64_128_TABLE_LENGTH
 };
 #endif
 #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F64_256) && defined(ARM_TABLE_BITREVIDX_FLT64_256))
 const arm_cfft_instance_f64 arm_cfft_sR_f64_len256 = {
  256, (const float64_t *)twiddleCoefF64_256, armBitRevIndexTableF64_256, ARMBITREVINDEXTABLEF64_256_TABLE_LENGTH
 };
 #endif
 #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F64_512) && defined(ARM_TABLE_BITREVIDX_FLT64_512))
 const arm_cfft_instance_f64 arm_cfft_sR_f64_len512 = {
  512, (const float64_t *)twiddleCoefF64_512, armBitRevIndexTableF64_512, ARMBITREVINDEXTABLEF64_512_TABLE_LENGTH
 };
 #endif
 #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F64_1024) && defined(ARM_TABLE_BITREVIDX_FLT64_1024))
 const arm_cfft_instance_f64 arm_cfft_sR_f64_len1024 = {
  1024, (const float64_t *)twiddleCoefF64_1024, armBitRevIndexTableF64_1024, ARMBITREVINDEXTABLEF64_1024_TABLE_LENGTH
 };
 #endif
 #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F64_2048) && defined(ARM_TABLE_BITREVIDX_FLT64_2048))
 const arm_cfft_instance_f64 arm_cfft_sR_f64_len2048 = {
  2048, (const float64_t *)twiddleCoefF64_2048, armBitRevIndexTableF64_2048, ARMBITREVINDEXTABLEF64_2048_TABLE_LENGTH
 };
 #endif
 #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F64_4096) && defined(ARM_TABLE_BITREVIDX_FLT64_4096))
 const arm_cfft_instance_f64 arm_cfft_sR_f64_len4096 = {
  4096, (const float64_t *)twiddleCoefF64_4096, armBitRevIndexTableF64_4096, ARMBITREVINDEXTABLEF64_4096_TABLE_LENGTH
 };
 #endif
 /*
  * Single-precision (f32) complex FFT instances, one per length from 16 to 4096.
  * Each initializer gives the length, the twiddleCoef_<len> table, the
  * armBitRevIndexTable<len> table and that table's *_TABLE_LENGTH macro.
  * The whole group is skipped when ARM_MATH_MVEF is defined without
  * ARM_MATH_AUTOVECTORIZE.
  */
 #if !defined(ARM_MATH_MVEF) || defined(ARM_MATH_AUTOVECTORIZE)
 /*
 Those structures cannot be used to initialize the MVE version of the FFT F32 instances.
 So they are not compiled when MVE is defined.
 For the MVE version, the new arm_cfft_init_f32 must be used.
 */
 #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F32_16) && defined(ARM_TABLE_BITREVIDX_FLT_16))
 const arm_cfft_instance_f32 arm_cfft_sR_f32_len16 = {
  16, twiddleCoef_16, armBitRevIndexTable16, ARMBITREVINDEXTABLE_16_TABLE_LENGTH
 };
 #endif
 #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F32_32) && defined(ARM_TABLE_BITREVIDX_FLT_32))
 const arm_cfft_instance_f32 arm_cfft_sR_f32_len32 = {
  32, twiddleCoef_32, armBitRevIndexTable32, ARMBITREVINDEXTABLE_32_TABLE_LENGTH
 };
 #endif
 #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F32_64) && defined(ARM_TABLE_BITREVIDX_FLT_64))
 const arm_cfft_instance_f32 arm_cfft_sR_f32_len64 = {
  64, twiddleCoef_64, armBitRevIndexTable64, ARMBITREVINDEXTABLE_64_TABLE_LENGTH
 };
 #endif
 #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F32_128) && defined(ARM_TABLE_BITREVIDX_FLT_128))
 const arm_cfft_instance_f32 arm_cfft_sR_f32_len128 = {
  128, twiddleCoef_128, armBitRevIndexTable128, ARMBITREVINDEXTABLE_128_TABLE_LENGTH
 };
 #endif
 #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F32_256) && defined(ARM_TABLE_BITREVIDX_FLT_256))
 const arm_cfft_instance_f32 arm_cfft_sR_f32_len256 = {
  256, twiddleCoef_256, armBitRevIndexTable256, ARMBITREVINDEXTABLE_256_TABLE_LENGTH
 };
 #endif
 #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F32_512) && defined(ARM_TABLE_BITREVIDX_FLT_512))
 const arm_cfft_instance_f32 arm_cfft_sR_f32_len512 = {
  512, twiddleCoef_512, armBitRevIndexTable512, ARMBITREVINDEXTABLE_512_TABLE_LENGTH
 };
 #endif
 #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F32_1024) && defined(ARM_TABLE_BITREVIDX_FLT_1024))
 const arm_cfft_instance_f32 arm_cfft_sR_f32_len1024 = {
  1024, twiddleCoef_1024, armBitRevIndexTable1024, ARMBITREVINDEXTABLE_1024_TABLE_LENGTH
 };
 #endif
 #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F32_2048) && defined(ARM_TABLE_BITREVIDX_FLT_2048))
 const arm_cfft_instance_f32 arm_cfft_sR_f32_len2048 = {
  2048, twiddleCoef_2048, armBitRevIndexTable2048, ARMBITREVINDEXTABLE_2048_TABLE_LENGTH
 };
 #endif
 #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F32_4096) && defined(ARM_TABLE_BITREVIDX_FLT_4096))
 const arm_cfft_instance_f32 arm_cfft_sR_f32_len4096 = {
  4096, twiddleCoef_4096, armBitRevIndexTable4096, ARMBITREVINDEXTABLE_4096_TABLE_LENGTH
 };
 #endif
 #endif /* !defined(ARM_MATH_MVEF) || defined(ARM_MATH_AUTOVECTORIZE) */
 /*
  * Fixed-point complex FFT instances: nine q31 instances followed by nine q15
  * instances, for lengths 16 to 4096. For a given length the q31 and q15
  * instances use the same armBitRevIndexTable_fixed_<len> table and differ only
  * in the twiddle table (twiddleCoef_<len>_q31 vs twiddleCoef_<len>_q15).
  * The whole group is skipped when ARM_MATH_MVEI is defined.
  */
 #if !defined(ARM_MATH_MVEI)
 /*
 Those structures cannot be used to initialize the MVE version of the FFT Q31 instances.
 So they are not compiled when MVE is defined.
 For the MVE version, the new arm_cfft_init_f32 must be used.
 */
 /* NOTE(review): the comment above names arm_cfft_init_f32 in the fixed-point
    section; presumably the Q31/Q15 init functions are meant - confirm. */
 /* q31 instances */
 #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_Q31_16) && defined(ARM_TABLE_BITREVIDX_FXT_16))
 const arm_cfft_instance_q31 arm_cfft_sR_q31_len16 = {
  16, twiddleCoef_16_q31, armBitRevIndexTable_fixed_16, ARMBITREVINDEXTABLE_FIXED_16_TABLE_LENGTH
 };
 #endif
 #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_Q31_32) && defined(ARM_TABLE_BITREVIDX_FXT_32))
 const arm_cfft_instance_q31 arm_cfft_sR_q31_len32 = {
  32, twiddleCoef_32_q31, armBitRevIndexTable_fixed_32, ARMBITREVINDEXTABLE_FIXED_32_TABLE_LENGTH
 };
 #endif
 #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_Q31_64) && defined(ARM_TABLE_BITREVIDX_FXT_64))
 const arm_cfft_instance_q31 arm_cfft_sR_q31_len64 = {
  64, twiddleCoef_64_q31, armBitRevIndexTable_fixed_64, ARMBITREVINDEXTABLE_FIXED_64_TABLE_LENGTH
 };
 #endif
 #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_Q31_128) && defined(ARM_TABLE_BITREVIDX_FXT_128))
 const arm_cfft_instance_q31 arm_cfft_sR_q31_len128 = {
  128, twiddleCoef_128_q31, armBitRevIndexTable_fixed_128, ARMBITREVINDEXTABLE_FIXED_128_TABLE_LENGTH
 };
 #endif
 #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_Q31_256) && defined(ARM_TABLE_BITREVIDX_FXT_256))
 const arm_cfft_instance_q31 arm_cfft_sR_q31_len256 = {
  256, twiddleCoef_256_q31, armBitRevIndexTable_fixed_256, ARMBITREVINDEXTABLE_FIXED_256_TABLE_LENGTH
 };
 #endif
 #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_Q31_512) && defined(ARM_TABLE_BITREVIDX_FXT_512))
 const arm_cfft_instance_q31 arm_cfft_sR_q31_len512 = {
  512, twiddleCoef_512_q31, armBitRevIndexTable_fixed_512, ARMBITREVINDEXTABLE_FIXED_512_TABLE_LENGTH
 };
 #endif
 #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_Q31_1024) && defined(ARM_TABLE_BITREVIDX_FXT_1024))
 const arm_cfft_instance_q31 arm_cfft_sR_q31_len1024 = {
  1024, twiddleCoef_1024_q31, armBitRevIndexTable_fixed_1024, ARMBITREVINDEXTABLE_FIXED_1024_TABLE_LENGTH
 };
 #endif
 #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_Q31_2048) && defined(ARM_TABLE_BITREVIDX_FXT_2048))
 const arm_cfft_instance_q31 arm_cfft_sR_q31_len2048 = {
  2048, twiddleCoef_2048_q31, armBitRevIndexTable_fixed_2048, ARMBITREVINDEXTABLE_FIXED_2048_TABLE_LENGTH
 };
 #endif
 #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_Q31_4096) && defined(ARM_TABLE_BITREVIDX_FXT_4096))
 const arm_cfft_instance_q31 arm_cfft_sR_q31_len4096 = {
  4096, twiddleCoef_4096_q31, armBitRevIndexTable_fixed_4096, ARMBITREVINDEXTABLE_FIXED_4096_TABLE_LENGTH
 };
 #endif
 /* q15 instances */
 #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_Q15_16) && defined(ARM_TABLE_BITREVIDX_FXT_16))
 const arm_cfft_instance_q15 arm_cfft_sR_q15_len16 = {
  16, twiddleCoef_16_q15, armBitRevIndexTable_fixed_16, ARMBITREVINDEXTABLE_FIXED_16_TABLE_LENGTH
 };
 #endif
 #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_Q15_32) && defined(ARM_TABLE_BITREVIDX_FXT_32))
 const arm_cfft_instance_q15 arm_cfft_sR_q15_len32 = {
  32, twiddleCoef_32_q15, armBitRevIndexTable_fixed_32, ARMBITREVINDEXTABLE_FIXED_32_TABLE_LENGTH
 };
 #endif
 #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_Q15_64) && defined(ARM_TABLE_BITREVIDX_FXT_64))
 const arm_cfft_instance_q15 arm_cfft_sR_q15_len64 = {
  64, twiddleCoef_64_q15, armBitRevIndexTable_fixed_64, ARMBITREVINDEXTABLE_FIXED_64_TABLE_LENGTH
 };
 #endif
 #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_Q15_128) && defined(ARM_TABLE_BITREVIDX_FXT_128))
 const arm_cfft_instance_q15 arm_cfft_sR_q15_len128 = {
  128, twiddleCoef_128_q15, armBitRevIndexTable_fixed_128, ARMBITREVINDEXTABLE_FIXED_128_TABLE_LENGTH
 };
 #endif
 #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_Q15_256) && defined(ARM_TABLE_BITREVIDX_FXT_256))
 const arm_cfft_instance_q15 arm_cfft_sR_q15_len256 = {
  256, twiddleCoef_256_q15, armBitRevIndexTable_fixed_256, ARMBITREVINDEXTABLE_FIXED_256_TABLE_LENGTH
 };
 #endif
 #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_Q15_512) && defined(ARM_TABLE_BITREVIDX_FXT_512))
 const arm_cfft_instance_q15 arm_cfft_sR_q15_len512 = {
  512, twiddleCoef_512_q15, armBitRevIndexTable_fixed_512, ARMBITREVINDEXTABLE_FIXED_512_TABLE_LENGTH
 };
 #endif
 #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_Q15_1024) && defined(ARM_TABLE_BITREVIDX_FXT_1024))
 const arm_cfft_instance_q15 arm_cfft_sR_q15_len1024 = {
  1024, twiddleCoef_1024_q15, armBitRevIndexTable_fixed_1024, ARMBITREVINDEXTABLE_FIXED_1024_TABLE_LENGTH
 };
 #endif
 #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_Q15_2048) && defined(ARM_TABLE_BITREVIDX_FXT_2048))
 const arm_cfft_instance_q15 arm_cfft_sR_q15_len2048 = {
  2048, twiddleCoef_2048_q15, armBitRevIndexTable_fixed_2048, ARMBITREVINDEXTABLE_FIXED_2048_TABLE_LENGTH
 };
 #endif
 #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_Q15_4096) && defined(ARM_TABLE_BITREVIDX_FXT_4096))
 const arm_cfft_instance_q15 arm_cfft_sR_q15_len4096 = {
  4096, twiddleCoef_4096_q15, armBitRevIndexTable_fixed_4096, ARMBITREVINDEXTABLE_FIXED_4096_TABLE_LENGTH
 };
 #endif
 #endif /* !defined(ARM_MATH_MVEI) */
 /* Structure for real-value inputs */
 /*
  * Real FFT "fast" instances. Each instance of real length N embeds a complex
  * FFT description of length N/2 (N/2 twiddle table, N/2 bit-reversal table and
  * its length macro), followed by N and the N-point real twiddle table.
  *
  * When ARM_DSP_CONFIG_TABLES is defined, each guard therefore tests the macros
  * of the tables the initializer actually references: the N/2 complex twiddle
  * and bit-reversal tables plus the N-point RFFT twiddle table. This matches the
  * q31/q15 RFFT instances in this file, which are also keyed on the half-length
  * complex FFT tables.
  */
 /* Double precision structs */
 #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F64_16) && defined(ARM_TABLE_BITREVIDX_FLT64_16) && defined(ARM_TABLE_TWIDDLECOEF_RFFT_F64_32))
 const arm_rfft_fast_instance_f64 arm_rfft_fast_sR_f64_len32 = {
  { 16, (const float64_t *)twiddleCoefF64_16, armBitRevIndexTableF64_16, ARMBITREVINDEXTABLEF64_16_TABLE_LENGTH },
  32U,
  (float64_t *)twiddleCoefF64_rfft_32
 };
 #endif
 #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F64_32) && defined(ARM_TABLE_BITREVIDX_FLT64_32) && defined(ARM_TABLE_TWIDDLECOEF_RFFT_F64_64))
 const arm_rfft_fast_instance_f64 arm_rfft_fast_sR_f64_len64 = {
  { 32, (const float64_t *)twiddleCoefF64_32, armBitRevIndexTableF64_32, ARMBITREVINDEXTABLEF64_32_TABLE_LENGTH },
  64U,
  (float64_t *)twiddleCoefF64_rfft_64
 };
 #endif
 #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F64_64) && defined(ARM_TABLE_BITREVIDX_FLT64_64) && defined(ARM_TABLE_TWIDDLECOEF_RFFT_F64_128))
 const arm_rfft_fast_instance_f64 arm_rfft_fast_sR_f64_len128 = {
  { 64, (const float64_t *)twiddleCoefF64_64, armBitRevIndexTableF64_64, ARMBITREVINDEXTABLEF64_64_TABLE_LENGTH },
  128U,
  (float64_t *)twiddleCoefF64_rfft_128
 };
 #endif
 #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F64_128) && defined(ARM_TABLE_BITREVIDX_FLT64_128) && defined(ARM_TABLE_TWIDDLECOEF_RFFT_F64_256))
 const arm_rfft_fast_instance_f64 arm_rfft_fast_sR_f64_len256 = {
  { 128, (const float64_t *)twiddleCoefF64_128, armBitRevIndexTableF64_128, ARMBITREVINDEXTABLEF64_128_TABLE_LENGTH },
  256U,
  (float64_t *)twiddleCoefF64_rfft_256
 };
 #endif
 #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F64_256) && defined(ARM_TABLE_BITREVIDX_FLT64_256) && defined(ARM_TABLE_TWIDDLECOEF_RFFT_F64_512))
 const arm_rfft_fast_instance_f64 arm_rfft_fast_sR_f64_len512 = {
  { 256, (const float64_t *)twiddleCoefF64_256, armBitRevIndexTableF64_256, ARMBITREVINDEXTABLEF64_256_TABLE_LENGTH },
  512U,
  (float64_t *)twiddleCoefF64_rfft_512
 };
 #endif
 #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F64_512) && defined(ARM_TABLE_BITREVIDX_FLT64_512) && defined(ARM_TABLE_TWIDDLECOEF_RFFT_F64_1024))
 const arm_rfft_fast_instance_f64 arm_rfft_fast_sR_f64_len1024 = {
  { 512, (const float64_t *)twiddleCoefF64_512, armBitRevIndexTableF64_512, ARMBITREVINDEXTABLEF64_512_TABLE_LENGTH },
  1024U,
  (float64_t *)twiddleCoefF64_rfft_1024
 };
 #endif
 #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F64_1024) && defined(ARM_TABLE_BITREVIDX_FLT64_1024) && defined(ARM_TABLE_TWIDDLECOEF_RFFT_F64_2048))
 const arm_rfft_fast_instance_f64 arm_rfft_fast_sR_f64_len2048 = {
  { 1024, (const float64_t *)twiddleCoefF64_1024, armBitRevIndexTableF64_1024, ARMBITREVINDEXTABLEF64_1024_TABLE_LENGTH },
  2048U,
  (float64_t *)twiddleCoefF64_rfft_2048
 };
 #endif
 #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F64_2048) && defined(ARM_TABLE_BITREVIDX_FLT64_2048) && defined(ARM_TABLE_TWIDDLECOEF_RFFT_F64_4096))
 const arm_rfft_fast_instance_f64 arm_rfft_fast_sR_f64_len4096 = {
  { 2048, (const float64_t *)twiddleCoefF64_2048, armBitRevIndexTableF64_2048, ARMBITREVINDEXTABLEF64_2048_TABLE_LENGTH },
  4096U,
  (float64_t *)twiddleCoefF64_rfft_4096
 };
 #endif
 /* Single precision structs; skipped when ARM_MATH_MVEF is defined without ARM_MATH_AUTOVECTORIZE */
 #if !defined(ARM_MATH_MVEF) || defined(ARM_MATH_AUTOVECTORIZE)
 #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F32_16) && defined(ARM_TABLE_BITREVIDX_FLT_16) && defined(ARM_TABLE_TWIDDLECOEF_RFFT_F32_32))
 const arm_rfft_fast_instance_f32 arm_rfft_fast_sR_f32_len32 = {
  { 16, twiddleCoef_16, armBitRevIndexTable16, ARMBITREVINDEXTABLE_16_TABLE_LENGTH },
  32U,
  (float32_t *)twiddleCoef_rfft_32
 };
 #endif
 #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F32_32) && defined(ARM_TABLE_BITREVIDX_FLT_32) && defined(ARM_TABLE_TWIDDLECOEF_RFFT_F32_64))
 const arm_rfft_fast_instance_f32 arm_rfft_fast_sR_f32_len64 = {
  { 32, twiddleCoef_32, armBitRevIndexTable32, ARMBITREVINDEXTABLE_32_TABLE_LENGTH },
  64U,
  (float32_t *)twiddleCoef_rfft_64
 };
 #endif
 #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F32_64) && defined(ARM_TABLE_BITREVIDX_FLT_64) && defined(ARM_TABLE_TWIDDLECOEF_RFFT_F32_128))
 const arm_rfft_fast_instance_f32 arm_rfft_fast_sR_f32_len128 = {
  { 64, twiddleCoef_64, armBitRevIndexTable64, ARMBITREVINDEXTABLE_64_TABLE_LENGTH },
  128U,
  (float32_t *)twiddleCoef_rfft_128
 };
 #endif
 #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F32_128) && defined(ARM_TABLE_BITREVIDX_FLT_128) && defined(ARM_TABLE_TWIDDLECOEF_RFFT_F32_256))
 const arm_rfft_fast_instance_f32 arm_rfft_fast_sR_f32_len256 = {
  { 128, twiddleCoef_128, armBitRevIndexTable128, ARMBITREVINDEXTABLE_128_TABLE_LENGTH },
  256U,
  (float32_t *)twiddleCoef_rfft_256
 };
 #endif
 #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F32_256) && defined(ARM_TABLE_BITREVIDX_FLT_256) && defined(ARM_TABLE_TWIDDLECOEF_RFFT_F32_512))
 const arm_rfft_fast_instance_f32 arm_rfft_fast_sR_f32_len512 = {
  { 256, twiddleCoef_256, armBitRevIndexTable256, ARMBITREVINDEXTABLE_256_TABLE_LENGTH },
  512U,
  (float32_t *)twiddleCoef_rfft_512
 };
 #endif
 #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F32_512) && defined(ARM_TABLE_BITREVIDX_FLT_512) && defined(ARM_TABLE_TWIDDLECOEF_RFFT_F32_1024))
 const arm_rfft_fast_instance_f32 arm_rfft_fast_sR_f32_len1024 = {
  { 512, twiddleCoef_512, armBitRevIndexTable512, ARMBITREVINDEXTABLE_512_TABLE_LENGTH },
  1024U,
  (float32_t *)twiddleCoef_rfft_1024
 };
 #endif
 #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F32_1024) && defined(ARM_TABLE_BITREVIDX_FLT_1024) && defined(ARM_TABLE_TWIDDLECOEF_RFFT_F32_2048))
 const arm_rfft_fast_instance_f32 arm_rfft_fast_sR_f32_len2048 = {
  { 1024, twiddleCoef_1024, armBitRevIndexTable1024, ARMBITREVINDEXTABLE_1024_TABLE_LENGTH },
  2048U,
  (float32_t *)twiddleCoef_rfft_2048
 };
 #endif
 #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F32_2048) && defined(ARM_TABLE_BITREVIDX_FLT_2048) && defined(ARM_TABLE_TWIDDLECOEF_RFFT_F32_4096))
 const arm_rfft_fast_instance_f32 arm_rfft_fast_sR_f32_len4096 = {
  { 2048, twiddleCoef_2048, armBitRevIndexTable2048, ARMBITREVINDEXTABLE_2048_TABLE_LENGTH },
  4096U,
  (float32_t *)twiddleCoef_rfft_4096
 };
 #endif
 #endif /* #if !defined(ARM_MATH_MVEF) || defined(ARM_MATH_AUTOVECTORIZE) */
 /*
  * Fixed-point real FFT instances (q31 first, then q15) for real lengths 32 to
  * 8192. Each instance ends with a pointer to the complex FFT instance of half
  * its length (arm_cfft_sR_q31_len<N/2> / arm_cfft_sR_q15_len<N/2>) and shares
  * the realCoefA/realCoefB tables of its type. The fourth initializer halves as
  * the length doubles, so length * value is 8192 in every entry.
  * The second and third initializers are 0 and 1 throughout; presumably these
  * are the inverse-transform and bit-reversal flags - confirm against the
  * struct definition.
  * The whole group is skipped when ARM_MATH_MVEI is defined.
  */
 /* q31_t */
 #if !defined(ARM_MATH_MVEI)
 /*
 Those structures cannot be used to initialize the MVE version of the FFT Q31 instances.
 So they are not compiled when MVE is defined.
 For the MVE version, the new arm_cfft_init_f32 must be used.
 */
 /* NOTE(review): the comment above names arm_cfft_init_f32 although these are
    fixed-point RFFT instances; presumably a fixed-point init function is
    meant - confirm. */
 #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_REALCOEF_Q31) && defined(ARM_TABLE_TWIDDLECOEF_Q31_16) && defined(ARM_TABLE_BITREVIDX_FXT_16))
 const arm_rfft_instance_q31 arm_rfft_sR_q31_len32 = {
  32U,
  0,
  1,
  256U,
  (q31_t*)realCoefAQ31,
  (q31_t*)realCoefBQ31,
  &arm_cfft_sR_q31_len16
 };
 #endif
 #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_REALCOEF_Q31) && defined(ARM_TABLE_TWIDDLECOEF_Q31_32) && defined(ARM_TABLE_BITREVIDX_FXT_32))
 const arm_rfft_instance_q31 arm_rfft_sR_q31_len64 = {
  64U,
  0,
  1,
  128U,
  (q31_t*)realCoefAQ31,
  (q31_t*)realCoefBQ31,
  &arm_cfft_sR_q31_len32
 };
 #endif
 #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_REALCOEF_Q31) && defined(ARM_TABLE_TWIDDLECOEF_Q31_64) && defined(ARM_TABLE_BITREVIDX_FXT_64))
 const arm_rfft_instance_q31 arm_rfft_sR_q31_len128 = {
  128U,
  0,
  1,
  64U,
  (q31_t*)realCoefAQ31,
  (q31_t*)realCoefBQ31,
  &arm_cfft_sR_q31_len64
 };
 #endif
 #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_REALCOEF_Q31) && defined(ARM_TABLE_TWIDDLECOEF_Q31_128) && defined(ARM_TABLE_BITREVIDX_FXT_128))
 const arm_rfft_instance_q31 arm_rfft_sR_q31_len256 = {
  256U,
  0,
  1,
  32U,
  (q31_t*)realCoefAQ31,
  (q31_t*)realCoefBQ31,
  &arm_cfft_sR_q31_len128
 };
 #endif
 #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_REALCOEF_Q31) && defined(ARM_TABLE_TWIDDLECOEF_Q31_256) && defined(ARM_TABLE_BITREVIDX_FXT_256))
 const arm_rfft_instance_q31 arm_rfft_sR_q31_len512 = {
  512U,
  0,
  1,
  16U,
  (q31_t*)realCoefAQ31,
  (q31_t*)realCoefBQ31,
  &arm_cfft_sR_q31_len256
 };
 #endif
 #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_REALCOEF_Q31) && defined(ARM_TABLE_TWIDDLECOEF_Q31_512) && defined(ARM_TABLE_BITREVIDX_FXT_512))
 const arm_rfft_instance_q31 arm_rfft_sR_q31_len1024 = {
  1024U,
  0,
  1,
  8U,
  (q31_t*)realCoefAQ31,
  (q31_t*)realCoefBQ31,
  &arm_cfft_sR_q31_len512
 };
 #endif
 #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_REALCOEF_Q31) && defined(ARM_TABLE_TWIDDLECOEF_Q31_1024) && defined(ARM_TABLE_BITREVIDX_FXT_1024))
 const arm_rfft_instance_q31 arm_rfft_sR_q31_len2048 = {
  2048U,
  0,
  1,
  4U,
  (q31_t*)realCoefAQ31,
  (q31_t*)realCoefBQ31,
  &arm_cfft_sR_q31_len1024
 };
 #endif
 #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_REALCOEF_Q31) && defined(ARM_TABLE_TWIDDLECOEF_Q31_2048) && defined(ARM_TABLE_BITREVIDX_FXT_2048))
 const arm_rfft_instance_q31 arm_rfft_sR_q31_len4096 = {
  4096U,
  0,
  1,
  2U,
  (q31_t*)realCoefAQ31,
  (q31_t*)realCoefBQ31,
  &arm_cfft_sR_q31_len2048
 };
 #endif
 #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_REALCOEF_Q31) && defined(ARM_TABLE_TWIDDLECOEF_Q31_4096) && defined(ARM_TABLE_BITREVIDX_FXT_4096))
 const arm_rfft_instance_q31 arm_rfft_sR_q31_len8192 = {
  8192U,
  0,
  1,
  1U,
  (q31_t*)realCoefAQ31,
  (q31_t*)realCoefBQ31,
  &arm_cfft_sR_q31_len4096
 };
 #endif
 /* q15_t */
 #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_REALCOEF_Q15) && defined(ARM_TABLE_TWIDDLECOEF_Q15_16) && defined(ARM_TABLE_BITREVIDX_FXT_16))
 const arm_rfft_instance_q15 arm_rfft_sR_q15_len32 = {
  32U,
  0,
  1,
  256U,
  (q15_t*)realCoefAQ15,
  (q15_t*)realCoefBQ15,
  &arm_cfft_sR_q15_len16
 };
 #endif
 #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_REALCOEF_Q15) && defined(ARM_TABLE_TWIDDLECOEF_Q15_32) && defined(ARM_TABLE_BITREVIDX_FXT_32))
 const arm_rfft_instance_q15 arm_rfft_sR_q15_len64 = {
  64U,
  0,
  1,
  128U,
  (q15_t*)realCoefAQ15,
  (q15_t*)realCoefBQ15,
  &arm_cfft_sR_q15_len32
 };
 #endif
 #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_REALCOEF_Q15) && defined(ARM_TABLE_TWIDDLECOEF_Q15_64) && defined(ARM_TABLE_BITREVIDX_FXT_64))
 const arm_rfft_instance_q15 arm_rfft_sR_q15_len128 = {
  128U,
  0,
  1,
  64U,
  (q15_t*)realCoefAQ15,
  (q15_t*)realCoefBQ15,
  &arm_cfft_sR_q15_len64
 };
 #endif
 #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_REALCOEF_Q15) && defined(ARM_TABLE_TWIDDLECOEF_Q15_128) && defined(ARM_TABLE_BITREVIDX_FXT_128))
 const arm_rfft_instance_q15 arm_rfft_sR_q15_len256 = {
  256U,
  0,
  1,
  32U,
  (q15_t*)realCoefAQ15,
  (q15_t*)realCoefBQ15,
  &arm_cfft_sR_q15_len128
 };
 #endif
 #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_REALCOEF_Q15) && defined(ARM_TABLE_TWIDDLECOEF_Q15_256) && defined(ARM_TABLE_BITREVIDX_FXT_256))
 const arm_rfft_instance_q15 arm_rfft_sR_q15_len512 = {
  512U,
  0,
  1,
  16U,
  (q15_t*)realCoefAQ15,
  (q15_t*)realCoefBQ15,
  &arm_cfft_sR_q15_len256
 };
 #endif
 #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_REALCOEF_Q15) && defined(ARM_TABLE_TWIDDLECOEF_Q15_512) && defined(ARM_TABLE_BITREVIDX_FXT_512))
 const arm_rfft_instance_q15 arm_rfft_sR_q15_len1024 = {
  1024U,
  0,
  1,
  8U,
  (q15_t*)realCoefAQ15,
  (q15_t*)realCoefBQ15,
  &arm_cfft_sR_q15_len512
 };
 #endif
 #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_REALCOEF_Q15) && defined(ARM_TABLE_TWIDDLECOEF_Q15_1024) && defined(ARM_TABLE_BITREVIDX_FXT_1024))
 const arm_rfft_instance_q15 arm_rfft_sR_q15_len2048 = {
  2048U,
  0,
  1,
  4U,
  (q15_t*)realCoefAQ15,
  (q15_t*)realCoefBQ15,
  &arm_cfft_sR_q15_len1024
 };
 #endif
 #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_REALCOEF_Q15) && defined(ARM_TABLE_TWIDDLECOEF_Q15_2048) && defined(ARM_TABLE_BITREVIDX_FXT_2048))
 const arm_rfft_instance_q15 arm_rfft_sR_q15_len4096 = {
  4096U,
  0,
  1,
  2U,
  (q15_t*)realCoefAQ15,
  (q15_t*)realCoefBQ15,
  &arm_cfft_sR_q15_len2048
 };
 #endif
 #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_REALCOEF_Q15) && defined(ARM_TABLE_TWIDDLECOEF_Q15_4096) && defined(ARM_TABLE_BITREVIDX_FXT_4096))
 const arm_rfft_instance_q15 arm_rfft_sR_q15_len8192 = {
  8192U,
  0,
  1,
  1U,
  (q15_t*)realCoefAQ15,
  (q15_t*)realCoefBQ15,
  &arm_cfft_sR_q15_len4096
 };
 #endif
 #endif /* !defined(ARM_MATH_MVEI) */
 #endif
--- a/libraries/cmsis/dsp/Source/CommonTables/arm_mve_tables.c
+++ b/libraries/cmsis/dsp/Source/CommonTables/arm_mve_tables.c
--- a/libraries/cmsis/dsp/Source/ComplexMathFunctions/CMakeLists.txt
+++ b/libraries/cmsis/dsp/Source/ComplexMathFunctions/CMakeLists.txt
@@ -0,0 +1,53 @@
 cmake_minimum_required (VERSION 3.6)
 project(CMSISDSPComplexMath)
 include(configLib)
 include(configDsp)
 # NOTE(review): SRC is not referenced anywhere in this file (sources are listed
 # explicitly below). Kept in case one of the included helpers reads it - confirm
 # and remove if not.
 file(GLOB SRC "./*_*.c")
 # The always-built sources are passed straight to add_library(): creating a
 # library with no sources and filling it in later through target_sources() is
 # only accepted from CMake 3.11, while this file declares 3.6 as its minimum.
 add_library(CMSISDSPComplexMath STATIC
   arm_cmplx_conj_f32.c
   arm_cmplx_conj_q15.c
   arm_cmplx_conj_q31.c
   arm_cmplx_dot_prod_f32.c
   arm_cmplx_dot_prod_q15.c
   arm_cmplx_dot_prod_q31.c
   arm_cmplx_mag_f32.c
   arm_cmplx_mag_squared_f32.c
   arm_cmplx_mag_squared_q15.c
   arm_cmplx_mag_squared_q31.c
   arm_cmplx_mult_cmplx_f32.c
   arm_cmplx_mult_cmplx_q15.c
   arm_cmplx_mult_cmplx_q31.c
   arm_cmplx_mult_real_f32.c
   arm_cmplx_mult_real_q15.c
   arm_cmplx_mult_real_q31.c
 )
 configLib(CMSISDSPComplexMath ${ROOT})
 configDsp(CMSISDSPComplexMath ${ROOT})
 include(interpol)
 # Configure this library's own target. The call previously named
 # CMSISDSPFastMath, a target that is not created in this file, so the settings
 # made by interpol() never reached the complex-math sources.
 interpol(CMSISDSPComplexMath)
 if (CONFIGTABLE AND ALLFAST)
    target_compile_definitions(CMSISDSPComplexMath PUBLIC ARM_ALL_FAST_TABLES)
 endif()
 # MVE code is using a table for computing the fast sqrt arm_cmplx_mag_q31
 # There is the possibility of not compiling this function and not including
 # the table.
 if (NOT CONFIGTABLE OR ALLFAST OR ARM_CMPLX_MAG_Q31 OR (NOT HELIUM AND NOT MVEI))
   target_sources(CMSISDSPComplexMath PRIVATE arm_cmplx_mag_q31.c)
 endif()
 if (NOT CONFIGTABLE OR ALLFAST OR ARM_CMPLX_MAG_Q15 OR (NOT HELIUM AND NOT MVEI))
   target_sources(CMSISDSPComplexMath PRIVATE arm_cmplx_mag_q15.c)
 endif()
 ### Includes
 target_include_directories(CMSISDSPComplexMath PUBLIC "${DSP}/Include")
--- a/libraries/cmsis/dsp/Source/ComplexMathFunctions/ComplexMathFunctions.c
+++ b/libraries/cmsis/dsp/Source/ComplexMathFunctions/ComplexMathFunctions.c
@@ -0,0 +1,46 @@
 /* ----------------------------------------------------------------------
 * Project:      CMSIS DSP Library
 * Title:        ComplexMathFunctions.c
 * Description:  Combination of all complex math function source files.
 *
 * $Date:        18. March 2019
 * $Revision:    V1.0.0
 *
 * Target Processor: Cortex-M cores
 * -------------------------------------------------------------------- */
 /*
 * Copyright (C) 2019 ARM Limited or its affiliates. All rights reserved.
 *
 * SPDX-License-Identifier: Apache-2.0
 *
 * Licensed under the Apache License, Version 2.0 (the License); you may
 * not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an AS IS BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 #include "arm_cmplx_conj_f32.c"
 #include "arm_cmplx_conj_q15.c"
 #include "arm_cmplx_conj_q31.c"
 #include "arm_cmplx_dot_prod_f32.c"
 #include "arm_cmplx_dot_prod_q15.c"
 #include "arm_cmplx_dot_prod_q31.c"
 #include "arm_cmplx_mag_f32.c"
 #include "arm_cmplx_mag_q15.c"
 #include "arm_cmplx_mag_q31.c"
 #include "arm_cmplx_mag_squared_f32.c"
 #include "arm_cmplx_mag_squared_q15.c"
 #include "arm_cmplx_mag_squared_q31.c"
 #include "arm_cmplx_mult_cmplx_f32.c"
 #include "arm_cmplx_mult_cmplx_q15.c"
 #include "arm_cmplx_mult_cmplx_q31.c"
 #include "arm_cmplx_mult_real_f32.c"
 #include "arm_cmplx_mult_real_q15.c"
 #include "arm_cmplx_mult_real_q31.c"
--- a/libraries/cmsis/dsp/Source/ComplexMathFunctions/arm_cmplx_conj_f32.c
+++ b/libraries/cmsis/dsp/Source/ComplexMathFunctions/arm_cmplx_conj_f32.c
@@ -0,0 +1,213 @@
 /* ----------------------------------------------------------------------
 * Project:      CMSIS DSP Library
 * Title:        arm_cmplx_conj_f32.c
 * Description:  Floating-point complex conjugate
 *
 * $Date:        18. March 2019
 * $Revision:    V1.6.0
 *
 * Target Processor: Cortex-M cores
 * -------------------------------------------------------------------- */
 /*
 * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
 *
 * SPDX-License-Identifier: Apache-2.0
 *
 * Licensed under the Apache License, Version 2.0 (the License); you may
 * not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an AS IS BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 #include "arm_math.h"
 /**
  @ingroup groupCmplxMath
 */
 /**
  @defgroup cmplx_conj Complex Conjugate
  Conjugates the elements of a complex data vector.
  The <code>pSrc</code> points to the source data and
  <code>pDst</code> points to the destination data where the result should be written.
  <code>numSamples</code> specifies the number of complex samples
  and the data in each array is stored in an interleaved fashion
  (real, imag, real, imag, ...).
  Each array has a total of <code>2*numSamples</code> values.
  The underlying algorithm is used:
  <pre>
  for (n = 0; n < numSamples; n++) {
      pDst[(2*n)  ] =  pSrc[(2*n)  ];    // real part
      pDst[(2*n)+1] = -pSrc[(2*n)+1];    // imag part
  }
  </pre>
  There are separate functions for floating-point, Q15, and Q31 data types.
 */
 /**
  @addtogroup cmplx_conj
  @{
 */
 /**
  @brief         Floating-point complex conjugate.
  @param[in]     pSrc        points to the input vector
  @param[out]    pDst        points to the output vector
  @param[in]     numSamples  number of samples in each vector
  @return        none
 */
 #if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE)
 void arm_cmplx_conj_f32(
    const float32_t * pSrc,
    float32_t * pDst,
    uint32_t numSamples)
 {
    /* Lane multipliers: real lanes are kept (+1), imaginary lanes are negated (-1) */
    static const float32_t cmplx_conj_sign[4] = { 1.0f, -1.0f, 1.0f, -1.0f };
    uint32_t totalValues = numSamples * CMPLX_DIM;   /* number of float values to process */
    uint32_t remaining;                              /* loop counter */
    f32x4_t lanes;
    f32x4_t signs;
    /* Bring the sign pattern into a vector register once */
    signs = *(f32x4_t *) cmplx_conj_sign;
    /* Vector part: 4 float values (2 complex samples) per iteration */
    for (remaining = totalValues >> 2U; remaining > 0U; remaining--)
    {
        lanes = vld1q(pSrc);
        vst1q(pDst, vmulq(lanes, signs));
        /* Step both pointers past the 4 values just handled */
        pSrc += 4;
        pDst += 4;
    }
    /* Tail: whatever complex samples are left, one at a time */
    for (remaining = (totalValues & 0x3) >> 1; remaining > 0U; remaining--)
    {
      /* C[0] + jC[1] = A[0] + j(-1)A[1] */
      pDst[0] =  pSrc[0];
      pDst[1] = -pSrc[1];
      pSrc += 2;
      pDst += 2;
    }
 }
 #else
 void arm_cmplx_conj_f32(
   const float32_t * pSrc,
         float32_t * pDst,
         uint32_t numSamples)
 {
         uint32_t remaining;                            /* Complex samples still to process */
 #if defined(ARM_MATH_NEON) && !defined(ARM_MATH_AUTOVECTORIZE)
    float32x4_t zeros;
    float32x4x2_t planes;
    zeros = vdupq_n_f32(0.0f);
    /* NEON part: 4 complex samples per iteration */
    for (remaining = numSamples >> 2U; remaining > 0U; remaining--)
    {
      /* C[0]+jC[1] = A[0]+(-1)*jA[1] */
      /* De-interleaving load: val[0] holds real parts, val[1] imaginary parts */
      planes = vld2q_f32(pSrc);
      /* Negate the imaginary plane as (0 - imag) */
      planes.val[1] = vsubq_f32(zeros, planes.val[1]);
      vst2q_f32(pDst, planes);
      /* 8 floats consumed and produced */
      pSrc += 8;
      pDst += 8;
    }
    /* Samples left for the scalar loop below */
    remaining = numSamples & 0x3;
 #else
 #if defined (ARM_MATH_LOOPUNROLL) && !defined(ARM_MATH_AUTOVECTORIZE)
   /* Unrolled part: 4 complex samples per iteration */
   for (remaining = numSamples >> 2U; remaining > 0U; remaining--)
   {
     /* C[0] + jC[1] = A[0]+ j(-1)A[1] */
     pDst[0] =  pSrc[0];
     pDst[1] = -pSrc[1];
     pDst[2] =  pSrc[2];
     pDst[3] = -pSrc[3];
     pDst[4] =  pSrc[4];
     pDst[5] = -pSrc[5];
     pDst[6] =  pSrc[6];
     pDst[7] = -pSrc[7];
     pSrc += 8;
     pDst += 8;
   }
   /* Samples left for the scalar loop below */
   remaining = numSamples % 0x4U;
 #else
   /* No unrolling: the scalar loop handles every sample */
   remaining = numSamples;
 #endif /* #if defined (ARM_MATH_LOOPUNROLL) */
 #endif /* #if defined (ARM_MATH_NEON) */
   for (; remaining > 0U; remaining--)
   {
     /* C[0] + jC[1] = A[0]+ j(-1)A[1] */
     pDst[0] =  pSrc[0];
     pDst[1] = -pSrc[1];
     pSrc += 2;
     pDst += 2;
   }
 }
 #endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
 /**
  @} end of cmplx_conj group
 */
--- a/libraries/cmsis/dsp/Source/ComplexMathFunctions/arm_cmplx_conj_q15.c
+++ b/libraries/cmsis/dsp/Source/ComplexMathFunctions/arm_cmplx_conj_q15.c
@@ -0,0 +1,207 @@
 /* ----------------------------------------------------------------------
 * Project:      CMSIS DSP Library
 * Title:        arm_cmplx_conj_q15.c
 * Description:  Q15 complex conjugate
 *
 * $Date:        18. March 2019
 * $Revision:    V1.6.0
 *
 * Target Processor: Cortex-M cores
 * -------------------------------------------------------------------- */
 /*
 * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
 *
 * SPDX-License-Identifier: Apache-2.0
 *
 * Licensed under the Apache License, Version 2.0 (the License); you may
 * not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an AS IS BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 #include "arm_math.h"
 /**
  @ingroup groupCmplxMath
 */
 /**
  @addtogroup cmplx_conj
  @{
 */
 /**
  @brief         Q15 complex conjugate.
  @param[in]     pSrc        points to the input vector
  @param[out]    pDst        points to the output vector
  @param[in]     numSamples  number of samples in each vector
  @return        none
  @par           Scaling and Overflow Behavior
                   The function uses saturating arithmetic.
                   The Q15 value -1 (0x8000) is saturated to the maximum allowable positive value 0x7FFF.
 */
 #if defined(ARM_MATH_MVEI)
 void arm_cmplx_conj_q15(
   const q15_t * pSrc,
         q15_t * pDst,
         uint32_t numSamples)
 {
    uint32_t totalValues = numSamples * CMPLX_DIM;   /* number of q15 values to process */
    uint32_t remaining;                              /* loop counter */
    q31_t imag;                                      /* widened imaginary part for the tail */
    q15x8x2_t planes;
    q15x8_t zeros;
    zeros = vdupq_n_s16(0);
    /* Vector part: 16 q15 values (8 complex samples) per iteration */
    for (remaining = totalValues >> 4U; remaining > 0U; remaining--)
    {
        /* De-interleaving load: val[0] holds real parts, val[1] imaginary parts */
        planes = vld2q(pSrc);
        /* Saturating (0 - imag) */
        planes.val[1] = vqsubq(zeros, planes.val[1]);
        vst2q(pDst, planes);
        /* Step both pointers past the 16 values just handled */
        pSrc += 16;
        pDst += 16;
    }
    /* Tail: remaining complex samples, one at a time */
    for (remaining = (totalValues & 0xF) >> 1; remaining > 0U; remaining--)
    {
      /* C[0] + jC[1] = A[0]+ j(-1)A[1] */
      pDst[0] = pSrc[0];
      imag = pSrc[1];
      /* Negate in 32 bits, then saturate back to 16 bits */
      pDst[1] = __SSAT(-imag, 16);
      pSrc += 2;
      pDst += 2;
    }
 }
 #else
 void arm_cmplx_conj_q15(
   const q15_t * pSrc,
         q15_t * pDst,
         uint32_t numSamples)
 {
         uint32_t remaining;                            /* Complex samples still to process */
         q31_t val0;                                    /* Temporary input variable */
 #if defined (ARM_MATH_LOOPUNROLL) && defined (ARM_MATH_DSP)
         q31_t val1, val2, val3;                        /* Extra temporaries for the packed path */
 #endif
 #if defined (ARM_MATH_LOOPUNROLL)
   /* Unrolled part: 4 complex samples per iteration */
   for (remaining = numSamples >> 2U; remaining > 0U; remaining--)
   {
     /* C[0] + jC[1] = A[0]+ j(-1)A[1] */
 #if defined (ARM_MATH_DSP)
     /* Fetch four complex samples, each as one packed pair of q15 values */
     val0 = read_q15x2_ia ((q15_t **) &pSrc);
     val1 = read_q15x2_ia ((q15_t **) &pSrc);
     val2 = read_q15x2_ia ((q15_t **) &pSrc);
     val3 = read_q15x2_ia ((q15_t **) &pSrc);
     /* Exchange add/subtract against zero; the variant depends on which
        halfword carries the imaginary part for the configured endianness */
 #ifndef ARM_MATH_BIG_ENDIAN
     val0 = __QASX(0, val0);
     val1 = __QASX(0, val1);
     val2 = __QASX(0, val2);
     val3 = __QASX(0, val3);
 #else
     val0 = __QSAX(0, val0);
     val1 = __QSAX(0, val1);
     val2 = __QSAX(0, val2);
     val3 = __QSAX(0, val3);
 #endif /* #ifndef ARM_MATH_BIG_ENDIAN */
     /* Swap the two halfwords of every packed result */
     val0 = ((uint32_t) val0 >> 16) | ((uint32_t) val0 << 16);
     val1 = ((uint32_t) val1 >> 16) | ((uint32_t) val1 << 16);
     val2 = ((uint32_t) val2 >> 16) | ((uint32_t) val2 << 16);
     val3 = ((uint32_t) val3 >> 16) | ((uint32_t) val3 << 16);
     /* Store the four conjugated samples */
     write_q15x2_ia (&pDst, val0);
     write_q15x2_ia (&pDst, val1);
     write_q15x2_ia (&pDst, val2);
     write_q15x2_ia (&pDst, val3);
 #else
     /* Plain C: copy the real part, negate the imaginary part with saturation */
     pDst[0] = pSrc[0];
     val0 = pSrc[1];
     pDst[1] = (val0 == (q15_t) 0x8000) ? (q15_t) 0x7fff : -val0;
     pDst[2] = pSrc[2];
     val0 = pSrc[3];
     pDst[3] = (val0 == (q15_t) 0x8000) ? (q15_t) 0x7fff : -val0;
     pDst[4] = pSrc[4];
     val0 = pSrc[5];
     pDst[5] = (val0 == (q15_t) 0x8000) ? (q15_t) 0x7fff : -val0;
     pDst[6] = pSrc[6];
     val0 = pSrc[7];
     pDst[7] = (val0 == (q15_t) 0x8000) ? (q15_t) 0x7fff : -val0;
     pSrc += 8;
     pDst += 8;
 #endif /* #if defined (ARM_MATH_DSP) */
   }
   /* Samples left for the scalar loop below */
   remaining = numSamples % 0x4U;
 #else
   /* No unrolling: the scalar loop handles every sample */
   remaining = numSamples;
 #endif /* #if defined (ARM_MATH_LOOPUNROLL) */
   for (; remaining > 0U; remaining--)
   {
     /* C[0] + jC[1] = A[0]+ j(-1)A[1] */
     pDst[0] = pSrc[0];
     val0 = pSrc[1];
 #if defined (ARM_MATH_DSP)
     pDst[1] = __SSAT(-val0, 16);
 #else
     pDst[1] = (val0 == (q15_t) 0x8000) ? (q15_t) 0x7fff : -val0;
 #endif
     pSrc += 2;
     pDst += 2;
   }
 }
 #endif /* defined(ARM_MATH_MVEI) */
 /**
  @} end of cmplx_conj group
 */
--- a/libraries/cmsis/dsp/Source/ComplexMathFunctions/arm_cmplx_conj_q31.c
+++ b/libraries/cmsis/dsp/Source/ComplexMathFunctions/arm_cmplx_conj_q31.c
@@ -0,0 +1,193 @@
 /* ----------------------------------------------------------------------
 * Project:      CMSIS DSP Library
 * Title:        arm_cmplx_conj_q31.c
 * Description:  Q31 complex conjugate
 *
 * $Date:        18. March 2019
 * $Revision:    V1.6.0
 *
 * Target Processor: Cortex-M cores
 * -------------------------------------------------------------------- */
 /*
 * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
 *
 * SPDX-License-Identifier: Apache-2.0
 *
 * Licensed under the Apache License, Version 2.0 (the License); you may
 * not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an AS IS BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 #include "arm_math.h"
 /**
  @ingroup groupCmplxMath
 */
 /**
  @addtogroup cmplx_conj
  @{
 */
 /**
  @brief         Q31 complex conjugate.
  @param[in]     pSrc        points to the input vector
  @param[out]    pDst        points to the output vector
  @param[in]     numSamples  number of samples in each vector
  @return        none
  @par           Scaling and Overflow Behavior
                   The function uses saturating arithmetic.
                   The Q31 value -1 (0x80000000) is saturated to the maximum allowable positive value 0x7FFFFFFF.
 */
 #if defined(ARM_MATH_MVEI)
 void arm_cmplx_conj_q31(
   const q31_t * pSrc,
         q31_t * pDst,
         uint32_t numSamples)
 {
    uint32_t totalValues = numSamples * CMPLX_DIM;   /* number of q31 values to process */
    uint32_t remaining;                              /* loop counter */
    q31_t imag;                                      /* imaginary part for the tail */
    q31x4x2_t planes;
    q31x4_t zeros;
    zeros = vdupq_n_s32(0);
    /* Vector part: 8 q31 values (4 complex samples) per iteration */
    for (remaining = totalValues >> 3U; remaining > 0U; remaining--)
    {
        /* De-interleaving load: val[0] holds real parts, val[1] imaginary parts */
        planes = vld2q(pSrc);
        /* Saturating (0 - imag) */
        planes.val[1] = vqsubq(zeros, planes.val[1]);
        vst2q(pDst, planes);
        /* Step both pointers past the 8 values just handled */
        pSrc += 8;
        pDst += 8;
    }
    /* Tail: remaining complex samples, one at a time */
    for (remaining = (totalValues & 0x7) >> 1; remaining > 0U; remaining--)
    {
      /* C[0] + jC[1] = A[0]+ j(-1)A[1] */
      pDst[0] = pSrc[0];
      imag = pSrc[1];
      pDst[1] = __QSUB(0, imag);
      pSrc += 2;
      pDst += 2;
    }
 }
 #else
 void arm_cmplx_conj_q31(
   const q31_t * pSrc,
         q31_t * pDst,
         uint32_t numSamples)
 {
         uint32_t remaining;                            /* Complex samples still to process */
         q31_t imag;                                    /* Imaginary part being negated */
 #if defined (ARM_MATH_LOOPUNROLL)
   /* Unrolled part: 4 complex samples per iteration */
   for (remaining = numSamples >> 2U; remaining > 0U; remaining--)
   {
     /* C[0] + jC[1] = A[0]+ j(-1)A[1] */
     /* Each sample: copy the real part, then store the saturated negation of the imaginary part */
 #if defined (ARM_MATH_DSP)
     pDst[0] = pSrc[0];
     imag = pSrc[1];
     pDst[1] = __QSUB(0, imag);
     pDst[2] = pSrc[2];
     imag = pSrc[3];
     pDst[3] = __QSUB(0, imag);
     pDst[4] = pSrc[4];
     imag = pSrc[5];
     pDst[5] = __QSUB(0, imag);
     pDst[6] = pSrc[6];
     imag = pSrc[7];
     pDst[7] = __QSUB(0, imag);
 #else
     pDst[0] = pSrc[0];
     imag = pSrc[1];
     pDst[1] = (imag == INT32_MIN) ? INT32_MAX : -imag;
     pDst[2] = pSrc[2];
     imag = pSrc[3];
     pDst[3] = (imag == INT32_MIN) ? INT32_MAX : -imag;
     pDst[4] = pSrc[4];
     imag = pSrc[5];
     pDst[5] = (imag == INT32_MIN) ? INT32_MAX : -imag;
     pDst[6] = pSrc[6];
     imag = pSrc[7];
     pDst[7] = (imag == INT32_MIN) ? INT32_MAX : -imag;
 #endif
     pSrc += 8;
     pDst += 8;
   }
   /* Samples left for the scalar loop below */
   remaining = numSamples % 0x4U;
 #else
   /* No unrolling: the scalar loop handles every sample */
   remaining = numSamples;
 #endif /* #if defined (ARM_MATH_LOOPUNROLL) */
   for (; remaining > 0U; remaining--)
   {
     /* C[0] + jC[1] = A[0]+ j(-1)A[1] */
     pDst[0] = pSrc[0];
     imag = pSrc[1];
 #if defined (ARM_MATH_DSP)
     pDst[1] = __QSUB(0, imag);
 #else
     pDst[1] = (imag == INT32_MIN) ? INT32_MAX : -imag;
 #endif
     pSrc += 2;
     pDst += 2;
   }
 }
 #endif /* defined(ARM_MATH_MVEI) */
 /**
  @} end of cmplx_conj group
 */
--- a/libraries/cmsis/dsp/Source/ComplexMathFunctions/arm_cmplx_dot_prod_f32.c
+++ b/libraries/cmsis/dsp/Source/ComplexMathFunctions/arm_cmplx_dot_prod_f32.c
@@ -0,0 +1,302 @@
 /* ----------------------------------------------------------------------
 * Project:      CMSIS DSP Library
 * Title:        arm_cmplx_dot_prod_f32.c
 * Description:  Floating-point complex dot product
 *
 * $Date:        18. March 2019
 * $Revision:    V1.6.0
 *
 * Target Processor: Cortex-M cores
 * -------------------------------------------------------------------- */
 /*
 * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
 *
 * SPDX-License-Identifier: Apache-2.0
 *
 * Licensed under the Apache License, Version 2.0 (the License); you may
 * not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an AS IS BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 #include "arm_math.h"
 /**
  @ingroup groupCmplxMath
 */
 /**
  @defgroup cmplx_dot_prod Complex Dot Product
  Computes the dot product of two complex vectors.
  The vectors are multiplied element-by-element and then summed.
  The <code>pSrcA</code> points to the first complex input vector and
  <code>pSrcB</code> points to the second complex input vector.
  <code>numSamples</code> specifies the number of complex samples
  and the data in each array is stored in an interleaved fashion
  (real, imag, real, imag, ...).
  Each array has a total of <code>2*numSamples</code> values.
  The underlying algorithm is used:
  <pre>
  realResult = 0;
  imagResult = 0;
  for (n = 0; n < numSamples; n++) {
      realResult += pSrcA[(2*n)+0] * pSrcB[(2*n)+0] - pSrcA[(2*n)+1] * pSrcB[(2*n)+1];
      imagResult += pSrcA[(2*n)+0] * pSrcB[(2*n)+1] + pSrcA[(2*n)+1] * pSrcB[(2*n)+0];
  }
  </pre>
  There are separate functions for floating-point, Q15, and Q31 data types.
 */
 /**
  @addtogroup cmplx_dot_prod
  @{
 */
 /**
  @brief         Floating-point complex dot product.
  @param[in]     pSrcA       points to the first input vector
  @param[in]     pSrcB       points to the second input vector
  @param[in]     numSamples  number of samples in each vector
  @param[out]    realResult  real part of the result returned here
  @param[out]    imagResult  imaginary part of the result returned here
  @return        none
 */
 #if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE)
 void arm_cmplx_dot_prod_f32(
    const float32_t * pSrcA,
    const float32_t * pSrcB,
    uint32_t numSamples,
    float32_t * realResult,
    float32_t * imagResult)
 {
    uint32_t totalValues = numSamples * CMPLX_DIM;  /* number of float values per input */
    uint32_t remaining;                             /* loop counter */
    float32_t real_sum, imag_sum;
    f32x4_t vecA, vecB;
    f32x4_t acc = vdupq_n_f32(0.0f);
    float32_t aRe, aIm, bRe, bIm;
    /* Vector part: 4 float values (2 complex samples) per iteration */
    for (remaining = totalValues >> 2U; remaining > 0U; remaining--)
    {
        vecA = vld1q(pSrcA);
        vecB = vld1q(pSrcB);
        /* Two complex multiply-accumulate steps (plain and rot90 variants) */
        acc = vcmlaq(acc, vecA, vecB);
        acc = vcmlaq_rot90(acc, vecA, vecB);
        /* Step both inputs past the 4 values just handled */
        pSrcA += 4;
        pSrcB += 4;
    }
    /* Even lanes are folded into the real sum, odd lanes into the imaginary sum */
    real_sum = vgetq_lane(acc, 0) + vgetq_lane(acc, 2);
    imag_sum = vgetq_lane(acc, 1) + vgetq_lane(acc, 3);
    /* Tail: remaining complex samples, one at a time */
    for (remaining = (totalValues & 3) >> 1; remaining > 0U; remaining--)
    {
      aRe = pSrcA[0];
      aIm = pSrcA[1];
      bRe = pSrcB[0];
      bIm = pSrcB[1];
      pSrcA += 2;
      pSrcB += 2;
      /* (aRe + j aIm) * (bRe + j bIm) */
      real_sum += aRe * bRe;
      real_sum -= aIm * bIm;
      imag_sum += aRe * bIm;
      imag_sum += aIm * bRe;
    }
    /* Hand both parts of the result back to the caller */
    *realResult = real_sum;
    *imagResult = imag_sum;
 }
 #else
 void arm_cmplx_dot_prod_f32(
   const float32_t * pSrcA,
   const float32_t * pSrcB,
         uint32_t numSamples,
         float32_t * realResult,
         float32_t * imagResult)
 {
         uint32_t remaining;                            /* Complex samples still to process */
         float32_t real_sum = 0.0f, imag_sum = 0.0f;    /* Running real / imaginary totals */
         float32_t aRe, aIm, bRe, bIm;
 #if defined(ARM_MATH_NEON) && !defined(ARM_MATH_AUTOVECTORIZE)
    float32x4x2_t inA, inB;
    float32x4_t sumRe, sumIm;
    float32x2_t folded;
    sumRe = vdupq_n_f32(0.0f);
    sumIm = vdupq_n_f32(0.0f);
    /* NEON part: 8 complex samples per iteration, as two groups of 4 */
    for (remaining = numSamples >> 3U; remaining > 0U; remaining--)
    {
        /* First group; de-interleaving load: val[0] = real parts, val[1] = imaginary parts */
        inA = vld2q_f32(pSrcA);
        inB = vld2q_f32(pSrcB);
        pSrcA += 8;
        pSrcB += 8;
        /* Re{C} = Re{A}*Re{B} - Im{A}*Im{B} */
        sumRe = vmlaq_f32(sumRe, inA.val[0], inB.val[0]);
        sumRe = vmlsq_f32(sumRe, inA.val[1], inB.val[1]);
        /* Im{C} = Re{A}*Im{B} + Im{A}*Re{B} */
        sumIm = vmlaq_f32(sumIm, inA.val[1], inB.val[0]);
        sumIm = vmlaq_f32(sumIm, inA.val[0], inB.val[1]);
        /* Second group, same arithmetic */
        inA = vld2q_f32(pSrcA);
        inB = vld2q_f32(pSrcB);
        pSrcA += 8;
        pSrcB += 8;
        sumRe = vmlaq_f32(sumRe, inA.val[0], inB.val[0]);
        sumRe = vmlsq_f32(sumRe, inA.val[1], inB.val[1]);
        sumIm = vmlaq_f32(sumIm, inA.val[1], inB.val[0]);
        sumIm = vmlaq_f32(sumIm, inA.val[0], inB.val[1]);
    }
    /* Reduce each 4-lane accumulator to a scalar and add it to the totals */
    folded = vpadd_f32(vget_low_f32(sumRe), vget_high_f32(sumRe));
    real_sum += vget_lane_f32(folded, 0) + vget_lane_f32(folded, 1);
    folded = vpadd_f32(vget_low_f32(sumIm), vget_high_f32(sumIm));
    imag_sum += vget_lane_f32(folded, 0) + vget_lane_f32(folded, 1);
    /* Samples left for the scalar loop below */
    remaining = numSamples & 0x7;
 #else
 #if defined (ARM_MATH_LOOPUNROLL) && !defined(ARM_MATH_AUTOVECTORIZE)
   /* Unrolled part: 4 complex samples per iteration */
   for (remaining = numSamples >> 2U; remaining > 0U; remaining--)
   {
     /* Sample 0 */
     aRe = pSrcA[0];
     aIm = pSrcA[1];
     bRe = pSrcB[0];
     bIm = pSrcB[1];
     real_sum += aRe * bRe;
     real_sum -= aIm * bIm;
     imag_sum += aRe * bIm;
     imag_sum += aIm * bRe;
     /* Sample 1 */
     aRe = pSrcA[2];
     aIm = pSrcA[3];
     bRe = pSrcB[2];
     bIm = pSrcB[3];
     real_sum += aRe * bRe;
     real_sum -= aIm * bIm;
     imag_sum += aRe * bIm;
     imag_sum += aIm * bRe;
     /* Sample 2 */
     aRe = pSrcA[4];
     aIm = pSrcA[5];
     bRe = pSrcB[4];
     bIm = pSrcB[5];
     real_sum += aRe * bRe;
     real_sum -= aIm * bIm;
     imag_sum += aRe * bIm;
     imag_sum += aIm * bRe;
     /* Sample 3 */
     aRe = pSrcA[6];
     aIm = pSrcA[7];
     bRe = pSrcB[6];
     bIm = pSrcB[7];
     real_sum += aRe * bRe;
     real_sum -= aIm * bIm;
     imag_sum += aRe * bIm;
     imag_sum += aIm * bRe;
     pSrcA += 8;
     pSrcB += 8;
   }
   /* Samples left for the scalar loop below */
   remaining = numSamples % 0x4U;
 #else
   /* No unrolling: the scalar loop handles every sample */
   remaining = numSamples;
 #endif /* #if defined (ARM_MATH_LOOPUNROLL) */
 #endif /* #if defined(ARM_MATH_NEON) */
   for (; remaining > 0U; remaining--)
   {
     aRe = pSrcA[0];
     aIm = pSrcA[1];
     bRe = pSrcB[0];
     bIm = pSrcB[1];
     pSrcA += 2;
     pSrcB += 2;
     /* (aRe + j aIm) * (bRe + j bIm) */
     real_sum += aRe * bRe;
     real_sum -= aIm * bIm;
     imag_sum += aRe * bIm;
     imag_sum += aIm * bRe;
   }
   /* Hand both parts of the result back to the caller */
   *realResult = real_sum;
   *imagResult = imag_sum;
 }
 #endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
 /**
  @} end of cmplx_dot_prod group
 */
--- a/libraries/cmsis/dsp/Source/ComplexMathFunctions/arm_cmplx_dot_prod_q15.c
+++ b/libraries/cmsis/dsp/Source/ComplexMathFunctions/arm_cmplx_dot_prod_q15.c
@@ -0,0 +1,234 @@
 /* ----------------------------------------------------------------------
 * Project:      CMSIS DSP Library
 * Title:        arm_cmplx_dot_prod_q15.c
 * Description:  Processing function for the Q15 Complex Dot product
 *
 * $Date:        18. March 2019
 * $Revision:    V1.6.0
 *
 * Target Processor: Cortex-M cores
 * -------------------------------------------------------------------- */
 /*
 * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
 *
 * SPDX-License-Identifier: Apache-2.0
 *
 * Licensed under the Apache License, Version 2.0 (the License); you may
 * not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an AS IS BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 #include "arm_math.h"
 /**
  @ingroup groupCmplxMath
 */
 /**
  @addtogroup cmplx_dot_prod
  @{
 */
 /**
  @brief         Q15 complex dot product.
  @param[in]     pSrcA       points to the first input vector
  @param[in]     pSrcB       points to the second input vector
  @param[in]     numSamples  number of samples in each vector
  @param[out]    realResult  real part of the result returned here
  @param[out]    imagResult  imaginary part of the result returned here
  @return        none
  @par           Scaling and Overflow Behavior
                   The function is implemented using an internal 64-bit accumulator.
                   The intermediate 1.15 by 1.15 multiplications are performed with full precision and yield a 2.30 result.
                   These are accumulated in a 64-bit accumulator with 34.30 precision.
                   As a final step, the accumulators are converted to 8.24 format.
                   The return results <code>realResult</code> and <code>imagResult</code> are in 8.24 format.
 */
 #if defined(ARM_MATH_MVEI)
 void arm_cmplx_dot_prod_q15(
   const q15_t * pSrcA,
   const q15_t * pSrcB,
         uint32_t numSamples,
         q31_t * realResult,
         q31_t * imagResult)
 {
   /*
    * MVE (Helium) implementation of the Q15 complex dot product.
    * Inputs are interleaved (real, imag, ...) q15 vectors of numSamples complex
    * values each. Products are accumulated in 64-bit accumulators and the two
    * results are written as q31 values after a right shift by 6.
    *
    * The vector loop loads and accumulates exactly one 8-lane vector per
    * iteration, so the iteration count (blockSize >> 3) matches the amount of
    * data consumed and nothing is read beyond the 2*numSamples values of either
    * input. The previous form consumed two vectors per counted iteration and
    * pre-loaded/looked ahead one vector, which both over-read the inputs and
    * accumulated data past the end for numSamples >= 4.
    */
   uint32_t blockSize = numSamples * CMPLX_DIM;  /* total number of q15 values per input */
   uint32_t blkCnt;                              /* loop counter */
   q15_t a0,b0,c0,d0;
   q63_t accReal = 0LL;
   q63_t accImag = 0LL;
   q15x8_t vecSrcA, vecSrcB;
   /* Compute 4 complex samples (one vector of 8 q15 values) at a time */
   blkCnt = blockSize >> 3;
   while (blkCnt > 0U)
   {
       vecSrcA = vld1q(pSrcA);
       vecSrcB = vld1q(pSrcB);
       /* Real part: re(A)*re(B) - im(A)*im(B), summed across the vector */
       accReal = vmlsldavaq(accReal, vecSrcA, vecSrcB);
       /* Imaginary part: re(A)*im(B) + im(A)*re(B) (exchanged lanes), summed across the vector */
       accImag = vmlaldavaxq(accImag, vecSrcA, vecSrcB);
       /* Advance both inputs past the 8 values just handled */
       pSrcA += 8;
       pSrcB += 8;
       /*
        * Decrement the loop counter
        */
       blkCnt--;
   }
   /* Tail: the 0..3 complex samples that do not fill a whole vector */
   blkCnt = (blockSize & 7) >> 1;
   while (blkCnt > 0U)
   {
     a0 = *pSrcA++;
     b0 = *pSrcA++;
     c0 = *pSrcB++;
     d0 = *pSrcB++;
     accReal += (q31_t)a0 * c0;
     accImag += (q31_t)a0 * d0;
     accReal -= (q31_t)b0 * d0;
     accImag += (q31_t)b0 * c0;
     /* Decrement loop counter */
     blkCnt--;
   }
   /* Store real and imaginary result in 8.24 format  */
   /* Convert real data in 34.30 to 8.24 by 6 right shifts */
   *realResult = (q31_t) (accReal >> 6);
   /* Convert imaginary data in 34.30 to 8.24 by 6 right shifts */
   *imagResult = (q31_t) (accImag >> 6);
 }
 #else
 void arm_cmplx_dot_prod_q15(
   const q15_t * pSrcA,
   const q15_t * pSrcB,
         uint32_t numSamples,
         q31_t * realResult,
         q31_t * imagResult)
 {
         uint32_t remaining;                            /* Complex samples still to process */
         q63_t real_sum = 0, imag_sum = 0;              /* 64-bit running totals */
         q15_t aRe, aIm, bRe, bIm;
 #if defined (ARM_MATH_LOOPUNROLL)
   /* Unrolled part: 4 complex samples per iteration */
   for (remaining = numSamples >> 2U; remaining > 0U; remaining--)
   {
     /* Sample 0 */
     aRe = pSrcA[0];
     aIm = pSrcA[1];
     bRe = pSrcB[0];
     bIm = pSrcB[1];
     real_sum += (q31_t)aRe * bRe;
     real_sum -= (q31_t)aIm * bIm;
     imag_sum += (q31_t)aRe * bIm;
     imag_sum += (q31_t)aIm * bRe;
     /* Sample 1 */
     aRe = pSrcA[2];
     aIm = pSrcA[3];
     bRe = pSrcB[2];
     bIm = pSrcB[3];
     real_sum += (q31_t)aRe * bRe;
     real_sum -= (q31_t)aIm * bIm;
     imag_sum += (q31_t)aRe * bIm;
     imag_sum += (q31_t)aIm * bRe;
     /* Sample 2 */
     aRe = pSrcA[4];
     aIm = pSrcA[5];
     bRe = pSrcB[4];
     bIm = pSrcB[5];
     real_sum += (q31_t)aRe * bRe;
     real_sum -= (q31_t)aIm * bIm;
     imag_sum += (q31_t)aRe * bIm;
     imag_sum += (q31_t)aIm * bRe;
     /* Sample 3 */
     aRe = pSrcA[6];
     aIm = pSrcA[7];
     bRe = pSrcB[6];
     bIm = pSrcB[7];
     real_sum += (q31_t)aRe * bRe;
     real_sum -= (q31_t)aIm * bIm;
     imag_sum += (q31_t)aRe * bIm;
     imag_sum += (q31_t)aIm * bRe;
     pSrcA += 8;
     pSrcB += 8;
   }
   /* Samples left for the scalar loop below */
   remaining = numSamples % 0x4U;
 #else
   /* No unrolling: the scalar loop handles every sample */
   remaining = numSamples;
 #endif /* #if defined (ARM_MATH_LOOPUNROLL) */
   for (; remaining > 0U; remaining--)
   {
     aRe = pSrcA[0];
     aIm = pSrcA[1];
     bRe = pSrcB[0];
     bIm = pSrcB[1];
     pSrcA += 2;
     pSrcB += 2;
     /* (aRe + j aIm) * (bRe + j bIm), each product widened to q31 before accumulation */
     real_sum += (q31_t)aRe * bRe;
     real_sum -= (q31_t)aIm * bIm;
     imag_sum += (q31_t)aRe * bIm;
     imag_sum += (q31_t)aIm * bRe;
   }
   /* Store real and imaginary result in 8.24 format  */
   /* Convert real data in 34.30 to 8.24 by 6 right shifts */
   *realResult = (q31_t) (real_sum >> 6);
   /* Convert imaginary data in 34.30 to 8.24 by 6 right shifts */
   *imagResult = (q31_t) (imag_sum >> 6);
 }
 #endif /* defined(ARM_MATH_MVEI) */
 /**
  @} end of cmplx_dot_prod group
 */
--- a/libraries/cmsis/dsp/Source/ComplexMathFunctions/arm_cmplx_dot_prod_q31.c
+++ b/libraries/cmsis/dsp/Source/ComplexMathFunctions/arm_cmplx_dot_prod_q31.c
@@ -0,0 +1,220 @@
 /* ----------------------------------------------------------------------
 * Project:      CMSIS DSP Library
 * Title:        arm_cmplx_dot_prod_q31.c
 * Description:  Q31 complex dot product
 *
 * $Date:        18. March 2019
 * $Revision:    V1.6.0
 *
 * Target Processor: Cortex-M cores
 * -------------------------------------------------------------------- */
 /*
 * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
 *
 * SPDX-License-Identifier: Apache-2.0
 *
 * Licensed under the Apache License, Version 2.0 (the License); you may
 * not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an AS IS BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 #include "arm_math.h"
 /**
  @ingroup groupCmplxMath
 */
 /**
  @addtogroup cmplx_dot_prod
  @{
 */
 /**
  @brief         Q31 complex dot product.
  @param[in]     pSrcA       points to the first input vector
  @param[in]     pSrcB       points to the second input vector
  @param[in]     numSamples  number of samples in each vector
  @param[out]    realResult  real part of the result returned here
  @param[out]    imagResult  imaginary part of the result returned here
  @return        none
  @par           Scaling and Overflow Behavior
                   The function is implemented using an internal 64-bit accumulator.
                   The intermediate 1.31 by 1.31 multiplications are performed with 64-bit precision and then shifted to 16.48 format.
                   The internal real and imaginary accumulators are in 16.48 format and provide 15 guard bits.
                   Additions are nonsaturating and no overflow will occur as long as <code>numSamples</code> is less than 32768.
                   The return results <code>realResult</code> and <code>imagResult</code> are in 16.48 format.
                   Input down scaling is not required.
 */
 #if defined(ARM_MATH_MVEI)
 void arm_cmplx_dot_prod_q31(
  const q31_t * pSrcA,
  const q31_t * pSrcB,
        uint32_t numSamples,
        q63_t * realResult,
        q63_t * imagResult)
 {
    /*
     * Helium (MVE) variant of the Q31 complex dot product.
     * The interleaved inputs are consumed four q31 values (two complex
     * samples) per vector iteration; a leftover complex sample, if any,
     * is handled by the scalar loop at the end.
     */
    const uint32_t totalElems = numSamples * CMPLX_DIM;  /* numSamples scaled by CMPLX_DIM */
    uint32_t remaining;                                  /* iterations left in the current loop */
    q63_t sumRe = 0LL;                                   /* running real part */
    q63_t sumIm = 0LL;                                   /* running imaginary part */

    /* Vector part: one 4-lane load from each input per pass */
    for (remaining = totalElems >> 2U; remaining > 0U; remaining--)
    {
        q31x4_t vecA = vld1q(pSrcA);
        q31x4_t vecB = vld1q(pSrcB);

        sumRe = vrmlsldavhaq(sumRe, vecA, vecB);
        sumIm = vrmlaldavhaxq(sumIm, vecA, vecB);

        pSrcA += 4;
        pSrcB += 4;
    }

    /*
     * Bring the vector sums to the same scaling as the ">> 14" products of
     * the scalar loop below.
     * NOTE(review): the (14 - 8) shift presumably accounts for 8 bits already
     * dropped by the high-half accumulate intrinsics - confirm against ACLE.
     */
    sumRe = asrl(sumRe, (14 - 8));
    sumIm = asrl(sumIm, (14 - 8));

    /* Scalar part: the complex sample the vector loop could not cover */
    for (remaining = (totalElems & 3) >> 1; remaining > 0U; remaining--)
    {
        const q31_t reA = *pSrcA++;
        const q31_t imA = *pSrcA++;
        const q31_t reB = *pSrcB++;
        const q31_t imB = *pSrcB++;

        /* (reA + j*imA) * (reB + j*imB), every product scaled by >> 14 */
        sumRe += ((q63_t)reA * reB) >> 14;
        sumRe -= ((q63_t)imA * imB) >> 14;
        sumIm += ((q63_t)reA * imB) >> 14;
        sumIm += ((q63_t)imA * reB) >> 14;
    }

    /* Hand both accumulators back to the caller */
    *realResult = sumRe;
    *imagResult = sumIm;
 }
 #else
 void arm_cmplx_dot_prod_q31(
  const q31_t * pSrcA,
  const q31_t * pSrcB,
        uint32_t numSamples,
        q63_t * realResult,
        q63_t * imagResult)
 {
        /*
         * Portable C variant of the Q31 complex dot product.
         * Each 64-bit product is shifted right by 14 before it is added to
         * the 64-bit accumulators, which are written out unchanged.
         */
        uint32_t remaining;                            /* complex samples left in the current loop */
        q63_t accRe = 0, accIm = 0;                    /* running real / imaginary sums */
        q31_t reA, imA, reB, imB;                      /* current sample of A and of B */
 #if defined (ARM_MATH_LOOPUNROLL)
   /* Unrolled part: four complex samples (eight q31 values per input) per pass */
   for (remaining = numSamples >> 2U; remaining > 0U; remaining--)
   {
     /* sample 0 */
     reA = pSrcA[0];
     imA = pSrcA[1];
     reB = pSrcB[0];
     imB = pSrcB[1];
     accRe += ((q63_t)reA * reB) >> 14;
     accRe -= ((q63_t)imA * imB) >> 14;
     accIm += ((q63_t)reA * imB) >> 14;
     accIm += ((q63_t)imA * reB) >> 14;
     /* sample 1 */
     reA = pSrcA[2];
     imA = pSrcA[3];
     reB = pSrcB[2];
     imB = pSrcB[3];
     accRe += ((q63_t)reA * reB) >> 14;
     accRe -= ((q63_t)imA * imB) >> 14;
     accIm += ((q63_t)reA * imB) >> 14;
     accIm += ((q63_t)imA * reB) >> 14;
     /* sample 2 */
     reA = pSrcA[4];
     imA = pSrcA[5];
     reB = pSrcB[4];
     imB = pSrcB[5];
     accRe += ((q63_t)reA * reB) >> 14;
     accRe -= ((q63_t)imA * imB) >> 14;
     accIm += ((q63_t)reA * imB) >> 14;
     accIm += ((q63_t)imA * reB) >> 14;
     /* sample 3 */
     reA = pSrcA[6];
     imA = pSrcA[7];
     reB = pSrcB[6];
     imB = pSrcB[7];
     accRe += ((q63_t)reA * reB) >> 14;
     accRe -= ((q63_t)imA * imB) >> 14;
     accIm += ((q63_t)reA * imB) >> 14;
     accIm += ((q63_t)imA * reB) >> 14;
     /* step both inputs past the four samples just consumed */
     pSrcA += 8;
     pSrcB += 8;
   }
   /* Up to three samples are left for the loop below */
   remaining = numSamples & 0x3U;
 #else
   /* No unrolling: the loop below walks every sample */
   remaining = numSamples;
 #endif /* #if defined (ARM_MATH_LOOPUNROLL) */
   for (; remaining > 0U; remaining--)
   {
     reA = *pSrcA++;
     imA = *pSrcA++;
     reB = *pSrcB++;
     imB = *pSrcB++;
     /* (reA + j*imA) * (reB + j*imB), every product scaled by >> 14 */
     accRe += ((q63_t)reA * reB) >> 14;
     accRe -= ((q63_t)imA * imB) >> 14;
     accIm += ((q63_t)reA * imB) >> 14;
     accIm += ((q63_t)imA * reB) >> 14;
   }
   /* Write the 64-bit sums to the caller's result locations */
   *realResult = accRe;
   *imagResult = accIm;
 }
 #endif /* defined(ARM_MATH_MVEI) */
 /**
  @} end of cmplx_dot_prod group
 */
--- a/libraries/cmsis/dsp/Source/ComplexMathFunctions/arm_cmplx_mag_f32.c
+++ b/libraries/cmsis/dsp/Source/ComplexMathFunctions/arm_cmplx_mag_f32.c
@@ -0,0 +1,273 @@
 /* ----------------------------------------------------------------------
 * Project:      CMSIS DSP Library
 * Title:        arm_cmplx_mag_f32.c
 * Description:  Floating-point complex magnitude
 *
 * $Date:        18. March 2019
 * $Revision:    V1.6.0
 *
 * Target Processor: Cortex-M cores
 * -------------------------------------------------------------------- */
 /*
 * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
 *
 * SPDX-License-Identifier: Apache-2.0
 *
 * Licensed under the Apache License, Version 2.0 (the License); you may
 * not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an AS IS BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 #include "arm_math.h"
 /**
  @ingroup groupCmplxMath
 */
 /**
  @defgroup cmplx_mag Complex Magnitude
  Computes the magnitude of the elements of a complex data vector.
  The <code>pSrc</code> points to the source data and
  <code>pDst</code> points to where the result should be written.
  <code>numSamples</code> specifies the number of complex samples
  in the input array and the data is stored in an interleaved fashion
  (real, imag, real, imag, ...).
  The input array has a total of <code>2*numSamples</code> values;
  the output array has a total of <code>numSamples</code> values.
  The underlying algorithm is:
  <pre>
  for (n = 0; n < numSamples; n++) {
      pDst[n] = sqrt(pSrc[(2*n)+0]^2 + pSrc[(2*n)+1]^2);
  }
  </pre>
  There are separate functions for floating-point, Q15, and Q31 data types.
 */
 /**
  @addtogroup cmplx_mag
  @{
 */
 /**
  @brief         Floating-point complex magnitude.
  @param[in]     pSrc        points to input vector
  @param[out]    pDst        points to output vector
  @param[in]     numSamples  number of samples in each vector
  @return        none
 */
 #if defined(ARM_MATH_NEON) && !defined(ARM_MATH_AUTOVECTORIZE)
 #include "arm_vec_math.h"
 #endif
 #if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE)
 #include "arm_helium_utils.h"
 /*
  * Helium (MVE) implementation of the floating-point complex magnitude.
  * Writes sqrt(real^2 + imag^2) to pDst for each of the numSamples
  * interleaved (real, imag) pairs read from pSrc: four samples per vector
  * iteration, then a scalar loop for the remaining zero to three samples.
  */
 void arm_cmplx_mag_f32(
  const float32_t * pSrc,
        float32_t * pDst,
        uint32_t numSamples)
 {
    int32_t blockSize = numSamples;  /* number of complex samples (signed copy) */
    uint32_t  blkCnt;           /* iterations left in the current loop */
    f32x4x2_t vecSrc;           /* val[0] / val[1] are squared and summed below */
    f32x4_t sum;                /* real^2 + imag^2, later the magnitude itself */
    float32_t real, imag;                      /* Temporary variables to hold input values */
    /* Compute 4 complex samples at a time */
    blkCnt = blockSize >> 2;
    while (blkCnt > 0U)
    {
        q31x4_t newtonStartVec;
        f32x4_t sumHalf, invSqrt;
        /* 8 floats are consumed per pass; presumably vld2q de-interleaves
         * them into real (val[0]) and imaginary (val[1]) lanes - confirm */
        vecSrc = vld2q(pSrc);
        pSrc += 8;
        /* sum = val[0]^2 + val[1]^2 */
        sum = vmulq(vecSrc.val[0], vecSrc.val[0]);
        sum = vfmaq(sum, vecSrc.val[1], vecSrc.val[1]);
        /*
         * inlined Fast SQRT using inverse SQRT newton-raphson method
         */
        /* compute initial value: magic constant minus the lanes of sum, viewed
         * as 32-bit integers, shifted right by one */
        newtonStartVec = vdupq_n_s32(INVSQRT_MAGIC_F32) - vshrq((q31x4_t) sum, 1);
        sumHalf = sum * 0.5f;
        /*
         * compute 3 x iterations
         *
         * The more iterations, the more accuracy.
         * If you need to trade a bit of accuracy for more performance,
         * you can comment out the 3rd use of the macro.
         */
        INVSQRT_NEWTON_MVE_F32(invSqrt, sumHalf, (f32x4_t) newtonStartVec);
        INVSQRT_NEWTON_MVE_F32(invSqrt, sumHalf, invSqrt);
        INVSQRT_NEWTON_MVE_F32(invSqrt, sumHalf, invSqrt);
        /*
         * set negative values to 0
         * (only lanes selected by the "invSqrt < 0" predicate are overwritten)
         */
        invSqrt = vdupq_m(invSqrt, 0.0f, vcmpltq(invSqrt, 0.0f));
        /*
         * sqrt(x) = x * invSqrt(x)
         */
        sum = vmulq(sum, invSqrt);
        /* four magnitudes are stored per pass */
        vst1q(pDst, sum);
        pDst += 4;
        /*
         * Decrement the blockSize loop counter
         */
        blkCnt--;
    }
    /*
     * tail: the 0..3 samples not covered by the vector loop
     */
    blkCnt = blockSize & 3;
    while (blkCnt > 0U)
    {
      /* C[0] = sqrt(A[0] * A[0] + A[1] * A[1]) */
      real = *pSrc++;
      imag = *pSrc++;
      /* store result in destination buffer. */
      arm_sqrt_f32((real * real) + (imag * imag), pDst++);
      /* Decrement loop counter */
      blkCnt--;
    }
 }
 #else
 void arm_cmplx_mag_f32(
  const float32_t * pSrc,
        float32_t * pDst,
        uint32_t numSamples)
 {
   /*
    * Scalar / NEON implementation of the floating-point complex magnitude:
    * pDst[n] = sqrt(real[n]^2 + imag[n]^2) for interleaved (real, imag) input.
    */
   uint32_t remaining;                            /* samples left in the current loop */
   float32_t re, im;                              /* current complex sample */
 #if defined(ARM_MATH_NEON) && !defined(ARM_MATH_AUTOVECTORIZE)
   /* NEON part: eight magnitudes per pass, handled as two groups of four */
   for (remaining = numSamples >> 3; remaining > 0U; remaining--)
   {
     /* both groups are loaded before anything is stored */
     float32x4x2_t groupA = vld2q_f32(pSrc);
     float32x4x2_t groupB = vld2q_f32(pSrc + 8);
     /* val[0]^2 + val[1]^2 for each group */
     float32x4_t magSqA = vaddq_f32(vmulq_f32(groupA.val[0], groupA.val[0]),
                                    vmulq_f32(groupA.val[1], groupA.val[1]));
     float32x4_t magSqB = vaddq_f32(vmulq_f32(groupB.val[0], groupB.val[0]),
                                    vmulq_f32(groupB.val[1], groupB.val[1]));
     pSrc += 16;
     /* square roots go out in sample order */
     vst1q_f32(pDst, __arm_vec_sqrt_f32_neon(magSqA));
     vst1q_f32(pDst + 4, __arm_vec_sqrt_f32_neon(magSqB));
     pDst += 8;
   }
   /* Up to seven samples are left for the scalar loop */
   remaining = numSamples & 7;
 #else
 #if defined (ARM_MATH_LOOPUNROLL) && !defined(ARM_MATH_AUTOVECTORIZE)
   /* Unrolled part: four magnitudes per pass */
   for (remaining = numSamples >> 2U; remaining > 0U; remaining--)
   {
     /* each sample is read, then its result written, before the next read */
     re = pSrc[0];
     im = pSrc[1];
     arm_sqrt_f32((re * re) + (im * im), &pDst[0]);
     re = pSrc[2];
     im = pSrc[3];
     arm_sqrt_f32((re * re) + (im * im), &pDst[1]);
     re = pSrc[4];
     im = pSrc[5];
     arm_sqrt_f32((re * re) + (im * im), &pDst[2]);
     re = pSrc[6];
     im = pSrc[7];
     arm_sqrt_f32((re * re) + (im * im), &pDst[3]);
     pSrc += 8;
     pDst += 4;
   }
   /* Up to three samples are left for the loop below */
   remaining = numSamples & 0x3U;
 #else
   /* No unrolling: the loop below walks every sample */
   remaining = numSamples;
 #endif /* #if defined (ARM_MATH_LOOPUNROLL) */
 #endif /* #if defined(ARM_MATH_NEON) */
   for (; remaining > 0U; remaining--)
   {
     /* one (real, imag) pair in, one magnitude out */
     re = *pSrc++;
     im = *pSrc++;
     arm_sqrt_f32((re * re) + (im * im), pDst++);
   }
 }
 #endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
 /**
  @} end of cmplx_mag group
 */
--- a/libraries/cmsis/dsp/Source/ComplexMathFunctions/arm_cmplx_mag_q15.c
+++ b/libraries/cmsis/dsp/Source/ComplexMathFunctions/arm_cmplx_mag_q15.c
@@ -0,0 +1,221 @@
 /* ----------------------------------------------------------------------
 * Project:      CMSIS DSP Library
 * Title:        arm_cmplx_mag_q15.c
 * Description:  Q15 complex magnitude
 *
 * $Date:        18. March 2019
 * $Revision:    V1.6.0
 *
 * Target Processor: Cortex-M cores
 * -------------------------------------------------------------------- */
 /*
 * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
 *
 * SPDX-License-Identifier: Apache-2.0
 *
 * Licensed under the Apache License, Version 2.0 (the License); you may
 * not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an AS IS BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 #include "arm_math.h"
 /**
  @ingroup groupCmplxMath
 */
 /**
  @addtogroup cmplx_mag
  @{
 */
 /**
  @brief         Q15 complex magnitude.
  @param[in]     pSrc        points to input vector
  @param[out]    pDst        points to output vector
  @param[in]     numSamples  number of samples in each vector
  @return        none
  @par           Scaling and Overflow Behavior
                   The function implements 1.15 by 1.15 multiplications and finally output is converted into 2.14 format.
 */
 #if defined(ARM_MATH_MVEI)
 #include "arm_helium_utils.h"
 /*
  * Helium (MVE) implementation of the Q15 complex magnitude.
  * Eight interleaved (real, imag) samples are processed per vector
  * iteration; the remaining zero to seven samples go through the scalar
  * tail. Results are stored in 2.14 format.
  */
 void arm_cmplx_mag_q15(
  const q15_t * pSrc,
        q15_t * pDst,
        uint32_t numSamples)
 {
    int32_t blockSize = numSamples;  /* number of complex samples */
    uint32_t  blkCnt;           /* iterations left in the current loop */
    q15x8x2_t vecSrc;           /* val[0] / val[1] are squared and summed below */
    q15x8_t sum;                /* sum of squares, then the magnitude */
    q31_t in;                   /* one packed pair of q15 values for the tail */
    q31_t acc0;                 /* real^2 + imag^2 for the tail */
    /* Vector part: 16 q15 values in, 8 magnitudes out per pass */
    blkCnt = blockSize >> 3;
    while (blkCnt > 0U)
    {
        vecSrc = vld2q(pSrc);
        pSrc += 16;
        /* saturating add of the two high-half squares, then halve */
        sum = vqaddq(vmulhq(vecSrc.val[0], vecSrc.val[0]),
                     vmulhq(vecSrc.val[1], vecSrc.val[1]));
        sum = vshrq(sum, 1);
        sum = FAST_VSQRT_Q15(sum);
        vst1q(pDst, sum);
        pDst += 8;
        /*
         * Decrement the blockSize loop counter
         */
        blkCnt--;
    }
    /*
     * tail: the 0..7 samples not covered by the vector loop
     */
    blkCnt = blockSize & 7;
    while (blkCnt > 0U)
    {
      /* C[0] = sqrt(A[0] * A[0] + A[1] * A[1]) */
      in = read_q15x2_ia ((q15_t **) &pSrc);
      acc0 = __SMUAD(in, in);
      /*
       * store result in 2.14 format in destination buffer.
       * A sum of two squares is never negative, but it reaches 2^31 when
       * both parts are -32768, which a signed q31_t cannot represent.
       * Shifting the bit pattern as unsigned yields 16384 for that input
       * (assuming __SMUAD returns the wrapped 32-bit sum) instead of
       * handing a negative argument to the square root.
       */
      arm_sqrt_q15((q15_t) ((uint32_t) acc0 >> 17), pDst++);
      /* Decrement loop counter */
      blkCnt--;
    }
 }
 #else
 /*
  * Portable implementation of the Q15 complex magnitude.
  * For each interleaved (real, imag) pair read from pSrc, the sum of the
  * two squares is shifted right by 17 and passed to arm_sqrt_q15, which
  * writes the 2.14-format result to pDst.
  *
  * With ARM_MATH_DSP the sum of squares comes from __SMUAD on a packed
  * pair. That sum is never negative, but it reaches 2^31 when both parts
  * are -32768, which a signed q31_t cannot represent. The shift is
  * therefore done on the unsigned bit pattern (assuming __SMUAD returns
  * the wrapped 32-bit sum), giving 16384 for that input - the same value
  * the non-DSP branch obtains through its 64-bit addition.
  */
 void arm_cmplx_mag_q15(
  const q15_t * pSrc,
        q15_t * pDst,
        uint32_t numSamples)
 {
        uint32_t blkCnt;                               /* Loop counter */
 #if defined (ARM_MATH_DSP)
        q31_t in;                                      /* Packed pair of q15 values */
        q31_t acc0;                                    /* real^2 + imag^2 */
 #else
        q15_t real, imag;                              /* Temporary input variables */
        q31_t acc0, acc1;                              /* real^2 and imag^2 */
 #endif
 #if defined (ARM_MATH_LOOPUNROLL)
   /* Loop unrolling: Compute 4 outputs at a time */
   blkCnt = numSamples >> 2U;
   while (blkCnt > 0U)
   {
     /* C[0] = sqrt(A[0] * A[0] + A[1] * A[1]) */
 #if defined (ARM_MATH_DSP)
     in = read_q15x2_ia ((q15_t **) &pSrc);
     acc0 = __SMUAD(in, in);
     /* store result in 2.14 format in destination buffer (unsigned shift, see header) */
     arm_sqrt_q15((q15_t) ((uint32_t) acc0 >> 17), pDst++);
     in = read_q15x2_ia ((q15_t **) &pSrc);
     acc0 = __SMUAD(in, in);
     arm_sqrt_q15((q15_t) ((uint32_t) acc0 >> 17), pDst++);
     in = read_q15x2_ia ((q15_t **) &pSrc);
     acc0 = __SMUAD(in, in);
     arm_sqrt_q15((q15_t) ((uint32_t) acc0 >> 17), pDst++);
     in = read_q15x2_ia ((q15_t **) &pSrc);
     acc0 = __SMUAD(in, in);
     arm_sqrt_q15((q15_t) ((uint32_t) acc0 >> 17), pDst++);
 #else
     real = *pSrc++;
     imag = *pSrc++;
     acc0 = ((q31_t) real * real);
     acc1 = ((q31_t) imag * imag);
     /* store result in 2.14 format in destination buffer; the 64-bit sum cannot overflow */
     arm_sqrt_q15((q15_t) (((q63_t) acc0 + acc1) >> 17), pDst++);
     real = *pSrc++;
     imag = *pSrc++;
     acc0 = ((q31_t) real * real);
     acc1 = ((q31_t) imag * imag);
     arm_sqrt_q15((q15_t) (((q63_t) acc0 + acc1) >> 17), pDst++);
     real = *pSrc++;
     imag = *pSrc++;
     acc0 = ((q31_t) real * real);
     acc1 = ((q31_t) imag * imag);
     arm_sqrt_q15((q15_t) (((q63_t) acc0 + acc1) >> 17), pDst++);
     real = *pSrc++;
     imag = *pSrc++;
     acc0 = ((q31_t) real * real);
     acc1 = ((q31_t) imag * imag);
     arm_sqrt_q15((q15_t) (((q63_t) acc0 + acc1) >> 17), pDst++);
 #endif /* #if defined (ARM_MATH_DSP) */
     /* Decrement loop counter */
     blkCnt--;
   }
   /* Loop unrolling: Compute remaining outputs */
   blkCnt = numSamples % 0x4U;
 #else
   /* Initialize blkCnt with number of samples */
   blkCnt = numSamples;
 #endif /* #if defined (ARM_MATH_LOOPUNROLL) */
   while (blkCnt > 0U)
   {
     /* C[0] = sqrt(A[0] * A[0] + A[1] * A[1]) */
 #if defined (ARM_MATH_DSP)
     in = read_q15x2_ia ((q15_t **) &pSrc);
     acc0 = __SMUAD(in, in);
     /* store result in 2.14 format in destination buffer (unsigned shift, see header) */
     arm_sqrt_q15((q15_t) ((uint32_t) acc0 >> 17), pDst++);
 #else
     real = *pSrc++;
     imag = *pSrc++;
     acc0 = ((q31_t) real * real);
     acc1 = ((q31_t) imag * imag);
     /* store result in 2.14 format in destination buffer. */
     arm_sqrt_q15((q15_t) (((q63_t) acc0 + acc1) >> 17), pDst++);
 #endif
     /* Decrement loop counter */
     blkCnt--;
   }
 }
 #endif /* defined(ARM_MATH_MVEI) */
 /**
  @} end of cmplx_mag group
 */
--- a/libraries/cmsis/dsp/Source/ComplexMathFunctions/arm_cmplx_mag_q31.c
+++ b/libraries/cmsis/dsp/Source/ComplexMathFunctions/arm_cmplx_mag_q31.c
@@ -0,0 +1,201 @@
 /* ----------------------------------------------------------------------
 * Project:      CMSIS DSP Library
 * Title:        arm_cmplx_mag_q31.c
 * Description:  Q31 complex magnitude
 *
 * $Date:        18. March 2019
 * $Revision:    V1.6.0
 *
 * Target Processor: Cortex-M cores
 * -------------------------------------------------------------------- */
 /*
 * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
 *
 * SPDX-License-Identifier: Apache-2.0
 *
 * Licensed under the Apache License, Version 2.0 (the License); you may
 * not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an AS IS BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 #include "arm_math.h"
 /**
  @ingroup groupCmplxMath
 */
 /**
  @addtogroup cmplx_mag
  @{
 */
 /**
  @brief         Q31 complex magnitude.
  @param[in]     pSrc        points to input vector
  @param[out]    pDst        points to output vector
  @param[in]     numSamples  number of samples in each vector
  @return        none
  @par           Scaling and Overflow Behavior
                   The function implements 1.31 by 1.31 multiplications and finally output is converted into 2.30 format.
                   Input down scaling is not required.
 */
 #if defined(ARM_MATH_MVEI)
 #include "arm_helium_utils.h"
 void arm_cmplx_mag_q31(
  const q31_t * pSrc,
        q31_t * pDst,
        uint32_t numSamples)
 {
    /*
     * Helium (MVE) implementation of the Q31 complex magnitude.
     * Four interleaved (real, imag) samples per vector iteration, then a
     * scalar loop for the remaining zero to three samples.
     */
    int32_t sampleCount = numSamples;   /* signed copy of the sample count */
    uint32_t remaining;                 /* iterations left in the current loop */

    /* Vector part: 8 q31 values in, 4 magnitudes out per pass */
    for (remaining = sampleCount >> 2; remaining > 0U; remaining--)
    {
        q31x4x2_t cplx = vld2q(pSrc);
        q31x4_t magSq;

        /* saturating add of the two high-half squares, then halve */
        magSq = vqaddq(vmulhq(cplx.val[0], cplx.val[0]),
                       vmulhq(cplx.val[1], cplx.val[1]));
        magSq = vshrq(magSq, 1);
        /*
         * The square root below relies on a table. Compilation flags exist
         * that leave this table out; in that case arm_cmplx_mag_q31 must not
         * be built and linked.
         */
        magSq = FAST_VSQRT_Q31(magSq);
        vst1q(pDst, magSq);

        pSrc += 8;
        pDst += 4;
    }

    /* Scalar part: samples the vector loop did not reach */
    for (remaining = sampleCount & 3; remaining > 0U; remaining--)
    {
        const q31_t re = *pSrc++;
        const q31_t im = *pSrc++;
        /* each square is computed in 64 bits and shifted right by 33 */
        const q31_t sqRe = (q31_t) (((q63_t) re * re) >> 33);
        const q31_t sqIm = (q31_t) (((q63_t) im * im) >> 33);

        /* result is stored in 2.30 format */
        arm_sqrt_q31(sqRe + sqIm, pDst++);
    }
 }
 #else
 void arm_cmplx_mag_q31(
  const q31_t * pSrc,
        q31_t * pDst,
        uint32_t numSamples)
 {
        /*
         * Portable implementation of the Q31 complex magnitude.
         * Each square is formed in 64 bits and shifted right by 33; the sum
         * of the two is passed to arm_sqrt_q31, which writes the 2.30-format
         * result to pDst.
         */
        uint32_t remaining;                            /* samples left in the current loop */
        q31_t re, im;                                  /* current complex sample */
        q31_t sqRe, sqIm;                              /* scaled squares of re and im */
 #if defined (ARM_MATH_LOOPUNROLL)
   /* Unrolled part: four magnitudes per pass */
   for (remaining = numSamples >> 2U; remaining > 0U; remaining--)
   {
     /* each sample is read, then its result written, before the next read */
     re = pSrc[0];
     im = pSrc[1];
     sqRe = (q31_t) (((q63_t) re * re) >> 33);
     sqIm = (q31_t) (((q63_t) im * im) >> 33);
     arm_sqrt_q31(sqRe + sqIm, &pDst[0]);
     re = pSrc[2];
     im = pSrc[3];
     sqRe = (q31_t) (((q63_t) re * re) >> 33);
     sqIm = (q31_t) (((q63_t) im * im) >> 33);
     arm_sqrt_q31(sqRe + sqIm, &pDst[1]);
     re = pSrc[4];
     im = pSrc[5];
     sqRe = (q31_t) (((q63_t) re * re) >> 33);
     sqIm = (q31_t) (((q63_t) im * im) >> 33);
     arm_sqrt_q31(sqRe + sqIm, &pDst[2]);
     re = pSrc[6];
     im = pSrc[7];
     sqRe = (q31_t) (((q63_t) re * re) >> 33);
     sqIm = (q31_t) (((q63_t) im * im) >> 33);
     arm_sqrt_q31(sqRe + sqIm, &pDst[3]);
     pSrc += 8;
     pDst += 4;
   }
   /* Up to three samples are left for the loop below */
   remaining = numSamples & 0x3U;
 #else
   /* No unrolling: the loop below walks every sample */
   remaining = numSamples;
 #endif /* #if defined (ARM_MATH_LOOPUNROLL) */
   for (; remaining > 0U; remaining--)
   {
     re = *pSrc++;
     im = *pSrc++;
     sqRe = (q31_t) (((q63_t) re * re) >> 33);
     sqIm = (q31_t) (((q63_t) im * im) >> 33);
     /* result is stored in 2.30 format */
     arm_sqrt_q31(sqRe + sqIm, pDst++);
   }
 }
 #endif /* defined(ARM_MATH_MVEI) */
 /**
  @} end of cmplx_mag group
 */
--- a/libraries/cmsis/dsp/Source/ComplexMathFunctions/arm_cmplx_mag_squared_f32.c
+++ b/libraries/cmsis/dsp/Source/ComplexMathFunctions/arm_cmplx_mag_squared_f32.c
@@ -0,0 +1,235 @@
 /* ----------------------------------------------------------------------
 * Project:      CMSIS DSP Library
 * Title:        arm_cmplx_mag_squared_f32.c
 * Description:  Floating-point complex magnitude squared
 *
 * $Date:        18. March 2019
 * $Revision:    V1.6.0
 *
 * Target Processor: Cortex-M cores
 * -------------------------------------------------------------------- */
 /*
 * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
 *
 * SPDX-License-Identifier: Apache-2.0
 *
 * Licensed under the Apache License, Version 2.0 (the License); you may
 * not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an AS IS BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 #include "arm_math.h"
 /**
  @ingroup groupCmplxMath
 */
 /**
  @defgroup cmplx_mag_squared Complex Magnitude Squared
  Computes the magnitude squared of the elements of a complex data vector.
  The <code>pSrc</code> points to the source data and
  <code>pDst</code> points to where the result should be written.
  <code>numSamples</code> specifies the number of complex samples
  in the input array and the data is stored in an interleaved fashion
  (real, imag, real, imag, ...).
  The input array has a total of <code>2*numSamples</code> values;
  the output array has a total of <code>numSamples</code> values.
  The underlying algorithm is:
  <pre>
  for (n = 0; n < numSamples; n++) {
      pDst[n] = pSrc[(2*n)+0]^2 + pSrc[(2*n)+1]^2;
  }
  </pre>
  There are separate functions for floating-point, Q15, and Q31 data types.
 */
 /**
  @addtogroup cmplx_mag_squared
  @{
 */
 /**
  @brief         Floating-point complex magnitude squared.
  @param[in]     pSrc        points to input vector
  @param[out]    pDst        points to output vector
  @param[in]     numSamples  number of samples in each vector
  @return        none
 */
 #if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE)
 void arm_cmplx_mag_squared_f32(
  const float32_t * pSrc,
        float32_t * pDst,
        uint32_t numSamples)
 {
    /*
     * Helium (MVE) implementation of the floating-point complex magnitude
     * squared: four interleaved (real, imag) samples per vector iteration,
     * then a scalar loop for the remaining zero to three samples.
     */
    int32_t sampleCount = numSamples;   /* signed copy of the sample count */
    uint32_t remaining;                 /* iterations left in the current loop */

    /* Vector part: 8 floats in, 4 results out per pass */
    for (remaining = sampleCount >> 2; remaining > 0U; remaining--)
    {
        f32x4x2_t cplx = vld2q(pSrc);
        f32x4_t magSq;

        /* val[0]^2, then val[1]^2 folded in with a multiply-accumulate */
        magSq = vmulq(cplx.val[0], cplx.val[0]);
        magSq = vfmaq(magSq, cplx.val[1], cplx.val[1]);
        vst1q(pDst, magSq);

        pSrc += 8;
        pDst += 4;
    }

    /* Scalar part: samples the vector loop did not reach */
    for (remaining = sampleCount & 3; remaining > 0U; remaining--)
    {
        const float32_t re = *pSrc++;
        const float32_t im = *pSrc++;

        /* real^2 + imag^2 goes straight to the output */
        *pDst++ = (re * re) + (im * im);
    }
 }
 #else
 void arm_cmplx_mag_squared_f32(
  const float32_t * pSrc,
        float32_t * pDst,
        uint32_t numSamples)
 {
        /*
         * Scalar / NEON implementation of the floating-point complex
         * magnitude squared: pDst[n] = real[n]^2 + imag[n]^2 for interleaved
         * (real, imag) input. No square root is taken.
         */
        uint32_t remaining;                            /* samples left in the current loop */
        float32_t re, im;                              /* current complex sample */
 #if defined(ARM_MATH_NEON) && !defined(ARM_MATH_AUTOVECTORIZE)
   /* NEON part: eight results per pass, handled as two groups of four */
   for (remaining = numSamples >> 3; remaining > 0U; remaining--)
   {
     /* both groups are loaded before anything is stored */
     float32x4x2_t groupA = vld2q_f32(pSrc);
     float32x4x2_t groupB = vld2q_f32(pSrc + 8);
     /* out = (val[0] * val[0]) + (val[1] * val[1]) for each group */
     float32x4_t magSqA = vaddq_f32(vmulq_f32(groupA.val[0], groupA.val[0]),
                                    vmulq_f32(groupA.val[1], groupA.val[1]));
     float32x4_t magSqB = vaddq_f32(vmulq_f32(groupB.val[0], groupB.val[0]),
                                    vmulq_f32(groupB.val[1], groupB.val[1]));
     pSrc += 16;
     /* results go out in sample order */
     vst1q_f32(pDst, magSqA);
     vst1q_f32(pDst + 4, magSqB);
     pDst += 8;
   }
   /* Up to seven samples are left for the scalar loop */
   remaining = numSamples & 7;
 #else
 #if defined (ARM_MATH_LOOPUNROLL) && !defined(ARM_MATH_AUTOVECTORIZE)
   /* Unrolled part: four results per pass */
   for (remaining = numSamples >> 2U; remaining > 0U; remaining--)
   {
     /* each sample is read, then its result written, before the next read */
     re = pSrc[0];
     im = pSrc[1];
     pDst[0] = (re * re) + (im * im);
     re = pSrc[2];
     im = pSrc[3];
     pDst[1] = (re * re) + (im * im);
     re = pSrc[4];
     im = pSrc[5];
     pDst[2] = (re * re) + (im * im);
     re = pSrc[6];
     im = pSrc[7];
     pDst[3] = (re * re) + (im * im);
     pSrc += 8;
     pDst += 4;
   }
   /* Up to three samples are left for the loop below */
   remaining = numSamples & 0x3U;
 #else
   /* No unrolling: the loop below walks every sample */
   remaining = numSamples;
 #endif /* #if defined (ARM_MATH_LOOPUNROLL) */
 #endif /* #if defined(ARM_MATH_NEON) */
   for (; remaining > 0U; remaining--)
   {
     /* one (real, imag) pair in, one squared magnitude out */
     re = *pSrc++;
     im = *pSrc++;
     *pDst++ = (re * re) + (im * im);
   }
 }
 #endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
 /**
  @} end of cmplx_mag_squared group
 */
--- a/libraries/cmsis/dsp/Source/ComplexMathFunctions/arm_cmplx_mag_squared_q15.c
+++ b/libraries/cmsis/dsp/Source/ComplexMathFunctions/arm_cmplx_mag_squared_q15.c
@@ -0,0 +1,221 @@
 /* ----------------------------------------------------------------------
 * Project:      CMSIS DSP Library
 * Title:        arm_cmplx_mag_squared_q15.c
 * Description:  Q15 complex magnitude squared
 *
 * $Date:        18. March 2019
 * $Revision:    V1.6.0
 *
 * Target Processor: Cortex-M cores
 * -------------------------------------------------------------------- */
 /*
 * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
 *
 * SPDX-License-Identifier: Apache-2.0
 *
 * Licensed under the Apache License, Version 2.0 (the License); you may
 * not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an AS IS BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 #include "arm_math.h"
 /**
  @ingroup groupCmplxMath
 */
 /**
  @addtogroup cmplx_mag_squared
  @{
 */
 /**
  @brief         Q15 complex magnitude squared.
  @param[in]     pSrc        points to input vector
  @param[out]    pDst        points to output vector
  @param[in]     numSamples  number of samples in each vector
  @return        none
  @par           Scaling and Overflow Behavior
                   The function implements 1.15 by 1.15 multiplications and finally output is converted into 3.13 format.
 */
 #if defined(ARM_MATH_MVEI)
 /*
  * Q15 complex magnitude squared, Helium (MVE integer) build.
  * For each of the numSamples interleaved (real, imag) pairs read from pSrc,
  * one real^2 + imag^2 value is written to pDst in 3.13 format.
  */
 void arm_cmplx_mag_squared_q15(
  const q15_t * pSrc,
        q15_t * pDst,
        uint32_t numSamples)
 {
  uint32_t  blkCnt;           /* loop counter */
  q31_t in;                   /* one packed (real, imag) pair */
  q31_t acc0;                 /* real^2 + imag^2 of that pair */
  q15x8x2_t vecSrc;           /* de-interleaved input: val[0] = reals, val[1] = imags */
  q15x8_t vReal, vImag;       /* per-lane squares */
  q15x8_t vMagSq;             /* per-lane results */
  /*
   * Vector loop: 8 complex samples per pass. The count is derived directly
   * from the unsigned numSamples so the shift never acts on a negative value.
   */
  blkCnt = numSamples >> 3U;
  while (blkCnt > 0U)
  {
    vecSrc = vld2q(pSrc);
    vReal = vmulhq(vecSrc.val[0], vecSrc.val[0]);
    vImag = vmulhq(vecSrc.val[1], vecSrc.val[1]);
    /* saturating add, then one more right shift to reach 3.13 format */
    vMagSq = vqaddq(vReal, vImag);
    vMagSq = vshrq(vMagSq, 1);
    vst1q(pDst, vMagSq);
    /* 16 q15 inputs consumed, 8 q15 outputs produced */
    pSrc += 16;
    pDst += 8;
    /* Decrement the loop counter */
    blkCnt --;
  }
  /*
   * Tail: the remaining 0..7 samples, one at a time
   */
  blkCnt = numSamples & 7U;
  while (blkCnt > 0U)
  {
    /* C[0] = (A[0] * A[0] + A[1] * A[1]) */
    in = read_q15x2_ia ((q15_t **) &pSrc);
    acc0 = __SMUAD(in, in);
    /*
     * store result in 3.13 format in destination buffer.
     * With real = imag = -32768 the sum is exactly 2^31, which does not fit
     * a signed q31_t. A sum of two squares is never negative, so it is
     * shifted as an unsigned value to keep the result positive.
     */
    *pDst++ = (q15_t) ((uint32_t) acc0 >> 17);
    /* Decrement loop counter */
    blkCnt--;
  }
 }
 #else
 /*
  * Q15 complex magnitude squared, scalar build.
  * For each of the numSamples interleaved (real, imag) pairs read from pSrc,
  * one real^2 + imag^2 value is written to pDst in 3.13 format.
  * With ARM_MATH_DSP the pair is loaded as one word and squared/summed by
  * __SMUAD; otherwise the two squares are summed in 64 bits.
  */
 void arm_cmplx_mag_squared_q15(
  const q15_t * pSrc,
        q15_t * pDst,
        uint32_t numSamples)
 {
        uint32_t blkCnt;                               /* Loop counter */
 #if defined (ARM_MATH_DSP)
        q31_t in;                                      /* One packed (real, imag) pair */
        q31_t acc0;                                    /* real^2 + imag^2 of that pair */
 #else
        q15_t real, imag;                              /* Temporary input variables */
        q31_t acc0, acc1;                              /* real^2 and imag^2 */
 #endif
 #if defined (ARM_MATH_LOOPUNROLL)
  /* Loop unrolling: Compute 4 outputs at a time */
  blkCnt = numSamples >> 2U;
  while (blkCnt > 0U)
  {
    /* C[0] = (A[0] * A[0] + A[1] * A[1]) */
 #if defined (ARM_MATH_DSP)
    /*
     * store result in 3.13 format in destination buffer.
     * With real = imag = -32768 the __SMUAD sum is exactly 2^31, which does
     * not fit a signed q31_t. A sum of two squares is never negative, so it
     * is shifted as an unsigned value; this matches the 64-bit path below.
     */
    in = read_q15x2_ia ((q15_t **) &pSrc);
    acc0 = __SMUAD(in, in);
    *pDst++ = (q15_t) ((uint32_t) acc0 >> 17);
    in = read_q15x2_ia ((q15_t **) &pSrc);
    acc0 = __SMUAD(in, in);
    *pDst++ = (q15_t) ((uint32_t) acc0 >> 17);
    in = read_q15x2_ia ((q15_t **) &pSrc);
    acc0 = __SMUAD(in, in);
    *pDst++ = (q15_t) ((uint32_t) acc0 >> 17);
    in = read_q15x2_ia ((q15_t **) &pSrc);
    acc0 = __SMUAD(in, in);
    *pDst++ = (q15_t) ((uint32_t) acc0 >> 17);
 #else
    real = *pSrc++;
    imag = *pSrc++;
    acc0 = ((q31_t) real * real);
    acc1 = ((q31_t) imag * imag);
    /* store result in 3.13 format in destination buffer (sum widened to 64 bits). */
    *pDst++ = (q15_t) (((q63_t) acc0 + acc1) >> 17);
    real = *pSrc++;
    imag = *pSrc++;
    acc0 = ((q31_t) real * real);
    acc1 = ((q31_t) imag * imag);
    *pDst++ = (q15_t) (((q63_t) acc0 + acc1) >> 17);
    real = *pSrc++;
    imag = *pSrc++;
    acc0 = ((q31_t) real * real);
    acc1 = ((q31_t) imag * imag);
    *pDst++ = (q15_t) (((q63_t) acc0 + acc1) >> 17);
    real = *pSrc++;
    imag = *pSrc++;
    acc0 = ((q31_t) real * real);
    acc1 = ((q31_t) imag * imag);
    *pDst++ = (q15_t) (((q63_t) acc0 + acc1) >> 17);
 #endif /* #if defined (ARM_MATH_DSP) */
    /* Decrement loop counter */
    blkCnt--;
  }
  /* Loop unrolling: Compute remaining outputs */
  blkCnt = numSamples % 0x4U;
 #else
  /* Initialize blkCnt with number of samples */
  blkCnt = numSamples;
 #endif /* #if defined (ARM_MATH_LOOPUNROLL) */
  while (blkCnt > 0U)
  {
    /* C[0] = (A[0] * A[0] + A[1] * A[1]) */
 #if defined (ARM_MATH_DSP)
    in = read_q15x2_ia ((q15_t **) &pSrc);
    acc0 = __SMUAD(in, in);
    /* store result in 3.13 format; unsigned shift keeps the 2^31 corner case positive. */
    *pDst++ = (q15_t) ((uint32_t) acc0 >> 17);
 #else
    real = *pSrc++;
    imag = *pSrc++;
    acc0 = ((q31_t) real * real);
    acc1 = ((q31_t) imag * imag);
    /* store result in 3.13 format in destination buffer. */
    *pDst++ = (q15_t) (((q63_t) acc0 + acc1) >> 17);
 #endif
    /* Decrement loop counter */
    blkCnt--;
  }
 }
 #endif /* defined(ARM_MATH_MVEI) */
 /**
  @} end of cmplx_mag_squared group
 */
--- a/libraries/cmsis/dsp/Source/ComplexMathFunctions/arm_cmplx_mag_squared_q31.c
+++ b/libraries/cmsis/dsp/Source/ComplexMathFunctions/arm_cmplx_mag_squared_q31.c
@@ -0,0 +1,187 @@
 /* ----------------------------------------------------------------------
 * Project:      CMSIS DSP Library
 * Title:        arm_cmplx_mag_squared_q31.c
 * Description:  Q31 complex magnitude squared
 *
 * $Date:        18. March 2019
 * $Revision:    V1.6.0
 *
 * Target Processor: Cortex-M cores
 * -------------------------------------------------------------------- */
 /*
 * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
 *
 * SPDX-License-Identifier: Apache-2.0
 *
 * Licensed under the Apache License, Version 2.0 (the License); you may
 * not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an AS IS BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 #include "arm_math.h"
 /**
  @ingroup groupCmplxMath
 */
 /**
  @addtogroup cmplx_mag_squared
  @{
 */
 /**
  @brief         Q31 complex magnitude squared.
  @param[in]     pSrc        points to input vector
  @param[out]    pDst        points to output vector
  @param[in]     numSamples  number of samples in each vector
  @return        none
  @par           Scaling and Overflow Behavior
                   The function implements 1.31 by 1.31 multiplications and finally output is converted into 3.29 format.
                   Input down scaling is not required.
 */
 #if defined(ARM_MATH_MVEI)
 /*
  * Q31 complex magnitude squared, Helium (MVE integer) build.
  * Reads numSamples interleaved (real, imag) pairs from pSrc and writes one
  * real^2 + imag^2 value per pair to pDst in 3.29 format.
  */
 void arm_cmplx_mag_squared_q31(
  const q31_t * pSrc,
        q31_t * pDst,
        uint32_t numSamples)
 {
    int32_t   total = numSamples;   /* signed copy of the sample count */
    uint32_t  remaining;            /* iterations left in the current loop */
    q31x4x2_t cplx;                 /* de-interleaved input: val[0] = reals, val[1] = imags */
    q31x4_t   reSq, imSq;           /* per-lane squares */
    q31x4_t   magSq;                /* per-lane results */
    q31_t     re, im;               /* scalar tail inputs */
    q31_t     reTerm, imTerm;       /* scalar tail squares */
    /* Vector part: 4 complex samples per pass */
    for (remaining = total >> 2; remaining > 0U; remaining--)
    {
        cplx = vld2q(pSrc);
        reSq = vmulhq(cplx.val[0], cplx.val[0]);
        imSq = vmulhq(cplx.val[1], cplx.val[1]);
        /* saturating add followed by a single right shift */
        magSq = vqaddq(reSq, imSq);
        magSq = vshrq(magSq, 1);
        vst1q(pDst, magSq);
        /* 8 q31 inputs consumed, 4 q31 outputs produced */
        pSrc += 8;
        pDst += 4;
    }
    /* Scalar part: whatever is left after the groups of four */
    for (remaining = total & 3; remaining > 0U; remaining--)
    {
      /* C[0] = (A[0] * A[0] + A[1] * A[1]) */
      re = pSrc[0];
      im = pSrc[1];
      pSrc += 2;
      reTerm = (q31_t) (((q63_t) re * re) >> 33);
      imTerm = (q31_t) (((q63_t) im * im) >> 33);
      /* result is stored in 3.29 format */
      *pDst = reTerm + imTerm;
      pDst++;
    }
 }
 #else
 /*
  * Q31 complex magnitude squared, scalar build.
  * Reads numSamples interleaved (real, imag) pairs from pSrc and writes one
  * real^2 + imag^2 value per pair to pDst in 3.29 format. Each square is
  * formed in 64 bits and shifted right by 33 before the two are added.
  */
 void arm_cmplx_mag_squared_q31(
  const q31_t * pSrc,
        q31_t * pDst,
        uint32_t numSamples)
 {
        uint32_t remaining;                            /* Samples / groups left to process */
        q31_t re, im;                                  /* Current input pair */
        q31_t reSq, imSq;                              /* Scaled squares of the pair */
 #if defined (ARM_MATH_LOOPUNROLL)
  /* Unrolled part: four outputs per pass, each input pair read before its output is written */
  for (remaining = numSamples >> 2U; remaining > 0U; remaining--)
  {
    /* C[0] = (A[0] * A[0] + A[1] * A[1]) */
    re = pSrc[0];
    im = pSrc[1];
    reSq = (q31_t) (((q63_t) re * re) >> 33);
    imSq = (q31_t) (((q63_t) im * im) >> 33);
    pDst[0] = reSq + imSq;
    re = pSrc[2];
    im = pSrc[3];
    reSq = (q31_t) (((q63_t) re * re) >> 33);
    imSq = (q31_t) (((q63_t) im * im) >> 33);
    pDst[1] = reSq + imSq;
    re = pSrc[4];
    im = pSrc[5];
    reSq = (q31_t) (((q63_t) re * re) >> 33);
    imSq = (q31_t) (((q63_t) im * im) >> 33);
    pDst[2] = reSq + imSq;
    re = pSrc[6];
    im = pSrc[7];
    reSq = (q31_t) (((q63_t) re * re) >> 33);
    imSq = (q31_t) (((q63_t) im * im) >> 33);
    pDst[3] = reSq + imSq;
    /* step past the four pairs just handled */
    pSrc += 8;
    pDst += 4;
  }
  /* Samples not covered by the groups of four */
  remaining = numSamples % 0x4U;
 #else
  /* No unrolling: every sample goes through the loop below */
  remaining = numSamples;
 #endif /* #if defined (ARM_MATH_LOOPUNROLL) */
  for (; remaining > 0U; remaining--)
  {
    /* C[0] = (A[0] * A[0] + A[1] * A[1]) */
    re = pSrc[0];
    im = pSrc[1];
    pSrc += 2;
    reSq = (q31_t) (((q63_t) re * re) >> 33);
    imSq = (q31_t) (((q63_t) im * im) >> 33);
    /* result is stored in 3.29 format */
    *pDst = reSq + imSq;
    pDst++;
  }
 }
 #endif /* defined(ARM_MATH_MVEI) */
 /**
  @} end of cmplx_mag_squared group
 */
--- a/libraries/cmsis/dsp/Source/ComplexMathFunctions/arm_cmplx_mult_cmplx_f32.c
+++ b/libraries/cmsis/dsp/Source/ComplexMathFunctions/arm_cmplx_mult_cmplx_f32.c
@@ -0,0 +1,255 @@
 /* ----------------------------------------------------------------------
 * Project:      CMSIS DSP Library
 * Title:        arm_cmplx_mult_cmplx_f32.c
 * Description:  Floating-point complex-by-complex multiplication
 *
 * $Date:        18. March 2019
 * $Revision:    V1.6.0
 *
 * Target Processor: Cortex-M cores
 * -------------------------------------------------------------------- */
 /*
 * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
 *
 * SPDX-License-Identifier: Apache-2.0
 *
 * Licensed under the Apache License, Version 2.0 (the License); you may
 * not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an AS IS BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 #include "arm_math.h"
 /**
  @ingroup groupCmplxMath
 */
 /**
  @defgroup CmplxByCmplxMult Complex-by-Complex Multiplication
  Multiplies a complex vector by another complex vector and generates a complex result.
  The data in the complex arrays is stored in an interleaved fashion
  (real, imag, real, imag, ...).
  The parameter <code>numSamples</code> represents the number of complex
  samples processed.  The complex arrays have a total of <code>2*numSamples</code>
  real values.
  The following algorithm is used:
  <pre>
  for (n = 0; n < numSamples; n++) {
      pDst[(2*n)+0] = pSrcA[(2*n)+0] * pSrcB[(2*n)+0] - pSrcA[(2*n)+1] * pSrcB[(2*n)+1];
      pDst[(2*n)+1] = pSrcA[(2*n)+0] * pSrcB[(2*n)+1] + pSrcA[(2*n)+1] * pSrcB[(2*n)+0];
  }
  </pre>
  There are separate functions for floating-point, Q15, and Q31 data types.
 */
 /**
  @addtogroup CmplxByCmplxMult
  @{
 */
 /**
  @brief         Floating-point complex-by-complex multiplication.
  @param[in]     pSrcA       points to first input vector
  @param[in]     pSrcB       points to second input vector
  @param[out]    pDst        points to output vector
  @param[in]     numSamples  number of samples in each vector
  @return        none
 */
 #if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE)
 /*
  * Floating-point complex-by-complex multiplication, Helium (MVE float) build.
  * pSrcA, pSrcB and pDst hold interleaved (real, imag) values; numSamples
  * complex products A[n] * B[n] are written to pDst.
  */
 void arm_cmplx_mult_cmplx_f32(
  const float32_t * pSrcA,
  const float32_t * pSrcB,
        float32_t * pDst,
        uint32_t numSamples)
 {
    uint32_t remaining;               /* iterations left in the current loop */
    uint32_t total = numSamples;      /* number of complex samples */
    float32_t aRe, aIm, bRe, bIm;     /* operands of one scalar product */
    f32x4x2_t inA;                    /* de-interleaved A: val[0] = reals, val[1] = imags */
    f32x4x2_t inB;                    /* de-interleaved B */
    f32x4x2_t prod;                   /* de-interleaved result */
    /* Vector part: 4 complex outputs per pass */
    for (remaining = total >> 2; remaining > 0U; remaining--)
    {
            inA = vld2q(pSrcA);
            inB = vld2q(pSrcB);
            pSrcA += 8;
            pSrcB += 8;
            /* Re{C} = Re{A} * Re{B} - Im{A} * Im{B} */
            prod.val[0] = vmulq(inA.val[0], inB.val[0]);
            prod.val[0] = vfmsq(prod.val[0], inA.val[1], inB.val[1]);
            /* Im{C} = Re{A} * Im{B} + Im{A} * Re{B} */
            prod.val[1] = vmulq(inA.val[0], inB.val[1]);
            prod.val[1] = vfmaq(prod.val[1], inA.val[1], inB.val[0]);
            /* re-interleave and store */
            vst2q(pDst, prod);
            pDst += 8;
    }
    /* Scalar part: samples left over after the groups of four */
    for (remaining = total & 3; remaining > 0U; remaining--)
    {
      /* all four inputs are read before either output is written */
      aRe = pSrcA[0];
      aIm = pSrcA[1];
      pSrcA += 2;
      bRe = pSrcB[0];
      bIm = pSrcB[1];
      pSrcB += 2;
      /* C[2 * i    ] = A[2 * i] * B[2 * i    ] - A[2 * i + 1] * B[2 * i + 1]. */
      pDst[0] = (aRe * bRe) - (aIm * bIm);
      /* C[2 * i + 1] = A[2 * i] * B[2 * i + 1] + A[2 * i + 1] * B[2 * i    ]. */
      pDst[1] = (aRe * bIm) + (aIm * bRe);
      pDst += 2;
    }
 }
 #else
 /*
  * Floating-point complex-by-complex multiplication, NEON / scalar build.
  * pSrcA, pSrcB and pDst hold interleaved (real, imag) values; numSamples
  * complex products A[n] * B[n] are written to pDst. Depending on the build
  * macros, the bulk is handled four samples at a time by NEON intrinsics or
  * by an unrolled scalar loop; the final loop handles what remains.
  */
 void arm_cmplx_mult_cmplx_f32(
  const float32_t * pSrcA,
  const float32_t * pSrcB,
        float32_t * pDst,
        uint32_t numSamples)
 {
    uint32_t remaining;                 /* iterations left in the current loop */
    float32_t aRe, aIm, bRe, bIm;       /* operands of one scalar product */
 #if defined(ARM_MATH_NEON) && !defined(ARM_MATH_AUTOVECTORIZE)
    float32x4x2_t inA, inB;             /* de-interleaved inputs: val[0] = reals, val[1] = imags */
    float32x4x2_t prod;                 /* de-interleaved result */
    /* NEON part: 4 complex outputs per pass */
    for (remaining = numSamples >> 2U; remaining > 0U; remaining--)
    {
        inA = vld2q_f32(pSrcA);
        inB = vld2q_f32(pSrcB);
        pSrcA += 8;
        pSrcB += 8;
        /* Re{C} = Re{A}*Re{B} - Im{A}*Im{B} */
        prod.val[0] = vmulq_f32(inA.val[0], inB.val[0]);
        prod.val[0] = vmlsq_f32(prod.val[0], inA.val[1], inB.val[1]);
        /* Im{C} = Re{A}*Im{B} + Im{A}*Re{B} */
        prod.val[1] = vmulq_f32(inA.val[0], inB.val[1]);
        prod.val[1] = vmlaq_f32(prod.val[1], inA.val[1], inB.val[0]);
        /* re-interleave and store */
        vst2q_f32(pDst, prod);
        pDst += 8;
    }
    /* Samples left over after the groups of four */
    remaining = numSamples & 3;
 #else
 #if defined (ARM_MATH_LOOPUNROLL) && !defined(ARM_MATH_AUTOVECTORIZE)
  /* Unrolled part: four outputs per pass; each sample's inputs are read before its outputs are written */
  for (remaining = numSamples >> 2U; remaining > 0U; remaining--)
  {
    /* C[2 * i    ] = A[2 * i] * B[2 * i    ] - A[2 * i + 1] * B[2 * i + 1]. */
    /* C[2 * i + 1] = A[2 * i] * B[2 * i + 1] + A[2 * i + 1] * B[2 * i    ]. */
    aRe = pSrcA[0];
    aIm = pSrcA[1];
    bRe = pSrcB[0];
    bIm = pSrcB[1];
    pDst[0] = (aRe * bRe) - (aIm * bIm);
    pDst[1] = (aRe * bIm) + (aIm * bRe);
    aRe = pSrcA[2];
    aIm = pSrcA[3];
    bRe = pSrcB[2];
    bIm = pSrcB[3];
    pDst[2] = (aRe * bRe) - (aIm * bIm);
    pDst[3] = (aRe * bIm) + (aIm * bRe);
    aRe = pSrcA[4];
    aIm = pSrcA[5];
    bRe = pSrcB[4];
    bIm = pSrcB[5];
    pDst[4] = (aRe * bRe) - (aIm * bIm);
    pDst[5] = (aRe * bIm) + (aIm * bRe);
    aRe = pSrcA[6];
    aIm = pSrcA[7];
    bRe = pSrcB[6];
    bIm = pSrcB[7];
    pDst[6] = (aRe * bRe) - (aIm * bIm);
    pDst[7] = (aRe * bIm) + (aIm * bRe);
    /* step past the four samples just handled */
    pSrcA += 8;
    pSrcB += 8;
    pDst += 8;
  }
  /* Samples not covered by the groups of four */
  remaining = numSamples % 0x4U;
 #else
  /* No unrolling: every sample goes through the loop below */
  remaining = numSamples;
 #endif /* #if defined (ARM_MATH_LOOPUNROLL) */
 #endif /* #if defined(ARM_MATH_NEON) */
  for (; remaining > 0U; remaining--)
  {
    /* C[2 * i    ] = A[2 * i] * B[2 * i    ] - A[2 * i + 1] * B[2 * i + 1]. */
    /* C[2 * i + 1] = A[2 * i] * B[2 * i + 1] + A[2 * i + 1] * B[2 * i    ]. */
    aRe = pSrcA[0];
    aIm = pSrcA[1];
    pSrcA += 2;
    bRe = pSrcB[0];
    bIm = pSrcB[1];
    pSrcB += 2;
    pDst[0] = (aRe * bRe) - (aIm * bIm);
    pDst[1] = (aRe * bIm) + (aIm * bRe);
    pDst += 2;
  }
 }
 #endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
 /**
  @} end of CmplxByCmplxMult group
 */
--- a/libraries/cmsis/dsp/Source/ComplexMathFunctions/arm_cmplx_mult_cmplx_q15.c
+++ b/libraries/cmsis/dsp/Source/ComplexMathFunctions/arm_cmplx_mult_cmplx_q15.c
@@ -0,0 +1,196 @@
 /* ----------------------------------------------------------------------
 * Project:      CMSIS DSP Library
 * Title:        arm_cmplx_mult_cmplx_q15.c
 * Description:  Q15 complex-by-complex multiplication
 *
 * $Date:        18. March 2019
 * $Revision:    V1.6.0
 *
 * Target Processor: Cortex-M cores
 * -------------------------------------------------------------------- */
 /*
 * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
 *
 * SPDX-License-Identifier: Apache-2.0
 *
 * Licensed under the Apache License, Version 2.0 (the License); you may
 * not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an AS IS BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 #include "arm_math.h"
 /**
  @ingroup groupCmplxMath
 */
 /**
  @addtogroup CmplxByCmplxMult
  @{
 */
 /**
  @brief         Q15 complex-by-complex multiplication.
  @param[in]     pSrcA       points to first input vector
  @param[in]     pSrcB       points to second input vector
  @param[out]    pDst        points to output vector
  @param[in]     numSamples  number of samples in each vector
  @return        none
  @par           Scaling and Overflow Behavior
                   The function implements 1.15 by 1.15 multiplications and finally output is converted into 3.13 format.
 */
 #if defined(ARM_MATH_MVEI)
 /*
  * Q15 complex-by-complex multiplication, Helium (MVE integer) build.
  * pSrcA, pSrcB and pDst hold interleaved (real, imag) values; numSamples
  * complex products are written to pDst in 3.13 format.
  */
 void arm_cmplx_mult_cmplx_q15(
  const q15_t * pSrcA,
  const q15_t * pSrcB,
        q15_t * pDst,
        uint32_t numSamples)
 {
  uint32_t remaining;                           /* iterations left in the current loop */
  uint32_t elemCount = numSamples * CMPLX_DIM;  /* number of q15 values per vector */
  q15_t aRe, aIm, bRe, bIm;                     /* operands of one scalar product */
  q15x8_t inA;                                  /* 8 interleaved q15 values of A */
  q15x8_t inB;                                  /* 8 interleaved q15 values of B */
  q15x8_t prod;                                 /* 8 interleaved q15 results */
  /* Vector part: 8 q15 values per pass, inputs kept interleaved */
  for (remaining = elemCount >> 3; remaining > 0U; remaining--)
  {
      inA = vld1q(pSrcA);
      inB = vld1q(pSrcB);
      /* C[2 * i] = A[2 * i] * B[2 * i] - A[2 * i + 1] * B[2 * i + 1].  */
      prod = vqdmlsdhq_s16(vuninitializedq_s16(), inA, inB);
      /* C[2 * i + 1] = A[2 * i] * B[2 * i + 1] + A[2 * i + 1] * B[2 * i].  */
      prod = vqdmladhxq_s16(prod, inA, inB);
      /* scale the result with a right shift by 2 */
      prod = vshrq(prod, 2);
      vst1q(pDst, prod);
      pSrcA += 8;
      pSrcB += 8;
      pDst += 8;
  }
  /* Scalar part: the leftover values, taken two at a time as complex samples */
  for (remaining = (elemCount & 7) >> 1; remaining > 0U; remaining--)
  {
    /* all four inputs are read before either output is written */
    aRe = pSrcA[0];
    aIm = pSrcA[1];
    pSrcA += 2;
    bRe = pSrcB[0];
    bIm = pSrcB[1];
    pSrcB += 2;
    /* results are stored in 3.13 format */
    /* C[2 * i    ] = A[2 * i] * B[2 * i    ] - A[2 * i + 1] * B[2 * i + 1]. */
    pDst[0] = (q15_t) ( (((q31_t) aRe * bRe) >> 17) - (((q31_t) aIm * bIm) >> 17) );
    /* C[2 * i + 1] = A[2 * i] * B[2 * i + 1] + A[2 * i + 1] * B[2 * i    ]. */
    pDst[1] = (q15_t) ( (((q31_t) aRe * bIm) >> 17) + (((q31_t) aIm * bRe) >> 17) );
    pDst += 2;
  }
 }
 #else
 /*
  * Q15 complex-by-complex multiplication, scalar build.
  * pSrcA, pSrcB and pDst hold interleaved (real, imag) values; numSamples
  * complex products are written to pDst in 3.13 format. Every 32-bit partial
  * product is shifted right by 17 before the two are combined.
  */
 void arm_cmplx_mult_cmplx_q15(
  const q15_t * pSrcA,
  const q15_t * pSrcB,
        q15_t * pDst,
        uint32_t numSamples)
 {
        uint32_t remaining;                            /* Samples / groups left to process */
        q15_t aRe, aIm, bRe, bIm;                      /* Operands of one complex product */
 #if defined (ARM_MATH_LOOPUNROLL)
  /* Unrolled part: four outputs per pass; each sample's inputs are read before its outputs are written */
  for (remaining = numSamples >> 2U; remaining > 0U; remaining--)
  {
    /* C[2 * i    ] = A[2 * i] * B[2 * i    ] - A[2 * i + 1] * B[2 * i + 1]. */
    /* C[2 * i + 1] = A[2 * i] * B[2 * i + 1] + A[2 * i + 1] * B[2 * i    ]. */
    aRe = pSrcA[0];
    aIm = pSrcA[1];
    bRe = pSrcB[0];
    bIm = pSrcB[1];
    pDst[0] = (q15_t) ( (((q31_t) aRe * bRe) >> 17) - (((q31_t) aIm * bIm) >> 17) );
    pDst[1] = (q15_t) ( (((q31_t) aRe * bIm) >> 17) + (((q31_t) aIm * bRe) >> 17) );
    aRe = pSrcA[2];
    aIm = pSrcA[3];
    bRe = pSrcB[2];
    bIm = pSrcB[3];
    pDst[2] = (q15_t) ( (((q31_t) aRe * bRe) >> 17) - (((q31_t) aIm * bIm) >> 17) );
    pDst[3] = (q15_t) ( (((q31_t) aRe * bIm) >> 17) + (((q31_t) aIm * bRe) >> 17) );
    aRe = pSrcA[4];
    aIm = pSrcA[5];
    bRe = pSrcB[4];
    bIm = pSrcB[5];
    pDst[4] = (q15_t) ( (((q31_t) aRe * bRe) >> 17) - (((q31_t) aIm * bIm) >> 17) );
    pDst[5] = (q15_t) ( (((q31_t) aRe * bIm) >> 17) + (((q31_t) aIm * bRe) >> 17) );
    aRe = pSrcA[6];
    aIm = pSrcA[7];
    bRe = pSrcB[6];
    bIm = pSrcB[7];
    pDst[6] = (q15_t) ( (((q31_t) aRe * bRe) >> 17) - (((q31_t) aIm * bIm) >> 17) );
    pDst[7] = (q15_t) ( (((q31_t) aRe * bIm) >> 17) + (((q31_t) aIm * bRe) >> 17) );
    /* step past the four samples just handled */
    pSrcA += 8;
    pSrcB += 8;
    pDst += 8;
  }
  /* Samples not covered by the groups of four */
  remaining = numSamples % 0x4U;
 #else
  /* No unrolling: every sample goes through the loop below */
  remaining = numSamples;
 #endif /* #if defined (ARM_MATH_LOOPUNROLL) */
  for (; remaining > 0U; remaining--)
  {
    /* C[2 * i    ] = A[2 * i] * B[2 * i    ] - A[2 * i + 1] * B[2 * i + 1]. */
    /* C[2 * i + 1] = A[2 * i] * B[2 * i + 1] + A[2 * i + 1] * B[2 * i    ]. */
    aRe = pSrcA[0];
    aIm = pSrcA[1];
    pSrcA += 2;
    bRe = pSrcB[0];
    bIm = pSrcB[1];
    pSrcB += 2;
    /* results are stored in 3.13 format */
    pDst[0] = (q15_t) ( (((q31_t) aRe * bRe) >> 17) - (((q31_t) aIm * bIm) >> 17) );
    pDst[1] = (q15_t) ( (((q31_t) aRe * bIm) >> 17) + (((q31_t) aIm * bRe) >> 17) );
    pDst += 2;
  }
 }
 #endif /* defined(ARM_MATH_MVEI) */
 /**
  @} end of CmplxByCmplxMult group
 */
--- a/libraries/cmsis/dsp/Source/ComplexMathFunctions/arm_cmplx_mult_cmplx_q31.c
+++ b/libraries/cmsis/dsp/Source/ComplexMathFunctions/arm_cmplx_mult_cmplx_q31.c
@@ -0,0 +1,194 @@
 /* ----------------------------------------------------------------------
 * Project:      CMSIS DSP Library
 * Title:        arm_cmplx_mult_cmplx_q31.c
 * Description:  Q31 complex-by-complex multiplication
 *
 * $Date:        18. March 2019
 * $Revision:    V1.6.0
 *
 * Target Processor: Cortex-M cores
 * -------------------------------------------------------------------- */
 /*
 * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
 *
 * SPDX-License-Identifier: Apache-2.0
 *
 * Licensed under the Apache License, Version 2.0 (the License); you may
 * not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an AS IS BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 #include "arm_math.h"
 /**
  @ingroup groupCmplxMath
 */
 /**
  @addtogroup CmplxByCmplxMult
  @{
 */
 /**
  @brief         Q31 complex-by-complex multiplication.
  @param[in]     pSrcA       points to first input vector
  @param[in]     pSrcB       points to second input vector
  @param[out]    pDst        points to output vector
  @param[in]     numSamples  number of samples in each vector
  @return        none
  @par           Scaling and Overflow Behavior
                   The function implements 1.31 by 1.31 multiplications and finally output is converted into 3.29 format.
                   Input down scaling is not required.
 */
 #if defined(ARM_MATH_MVEI)
 /*
  * Q31 complex-by-complex multiplication, Helium (MVE integer) build.
  * pSrcA, pSrcB and pDst hold interleaved (real, imag) values; numSamples
  * complex products are written to pDst in 3.29 format.
  */
 void arm_cmplx_mult_cmplx_q31(
  const q31_t * pSrcA,
  const q31_t * pSrcB,
        q31_t * pDst,
        uint32_t numSamples)
 {
    uint32_t remaining;                           /* iterations left in the current loop */
    uint32_t elemCount = numSamples * CMPLX_DIM;  /* number of q31 values per vector */
    q31x4_t inA;                                  /* 4 interleaved q31 values of A */
    q31x4_t inB;                                  /* 4 interleaved q31 values of B */
    q31x4_t prod;                                 /* 4 interleaved q31 results */
    q31_t aRe, aIm, bRe, bIm;                     /* operands of one scalar product */
    /* Vector part: 2 complex outputs (4 q31 values) per pass */
    for (remaining = elemCount >> 2; remaining > 0U; remaining--)
    {
        inA = vld1q(pSrcA);
        inB = vld1q(pSrcB);
        /* C[2 * i] = A[2 * i] * B[2 * i] - A[2 * i + 1] * B[2 * i + 1].  */
        prod = vqdmlsdhq(vuninitializedq_s32(),inA, inB);
        /* C[2 * i + 1] = A[2 * i] * B[2 * i + 1] + A[2 * i + 1] * B[2 * i].  */
        prod = vqdmladhxq(prod, inA, inB);
        /* scale the result with a right shift by 2 */
        prod = vshrq(prod, 2);
        vst1q(pDst, prod);
        pSrcA += 4;
        pSrcB += 4;
        pDst += 4;
    }
    /* Scalar part: the leftover values, taken two at a time as complex samples */
    for (remaining = (elemCount & 3) >> 1; remaining > 0U; remaining--)
    {
      /* all four inputs are read before either output is written */
      aRe = pSrcA[0];
      aIm = pSrcA[1];
      pSrcA += 2;
      bRe = pSrcB[0];
      bIm = pSrcB[1];
      pSrcB += 2;
      /* results are stored in 3.29 format */
      /* C[2 * i    ] = A[2 * i] * B[2 * i    ] - A[2 * i + 1] * B[2 * i + 1]. */
      pDst[0] = (q31_t) ( (((q63_t) aRe * bRe) >> 33) - (((q63_t) aIm * bIm) >> 33) );
      /* C[2 * i + 1] = A[2 * i] * B[2 * i + 1] + A[2 * i + 1] * B[2 * i    ]. */
      pDst[1] = (q31_t) ( (((q63_t) aRe * bIm) >> 33) + (((q63_t) aIm * bRe) >> 33) );
      pDst += 2;
    }
 }
 #else
 /*
  * Q31 complex-by-complex multiplication, scalar build.
  * pSrcA, pSrcB and pDst hold interleaved (real, imag) values; numSamples
  * complex products are written to pDst in 3.29 format. Every 64-bit partial
  * product is shifted right by 33 before the two are combined.
  */
 void arm_cmplx_mult_cmplx_q31(
  const q31_t * pSrcA,
  const q31_t * pSrcB,
        q31_t * pDst,
        uint32_t numSamples)
 {
        uint32_t remaining;                            /* Samples / groups left to process */
        q31_t aRe, aIm, bRe, bIm;                      /* Operands of one complex product */
 #if defined (ARM_MATH_LOOPUNROLL)
  /* Unrolled part: four outputs per pass; each sample's inputs are read before its outputs are written */
  for (remaining = numSamples >> 2U; remaining > 0U; remaining--)
  {
    /* C[2 * i    ] = A[2 * i] * B[2 * i    ] - A[2 * i + 1] * B[2 * i + 1]. */
    /* C[2 * i + 1] = A[2 * i] * B[2 * i + 1] + A[2 * i + 1] * B[2 * i    ]. */
    aRe = pSrcA[0];
    aIm = pSrcA[1];
    bRe = pSrcB[0];
    bIm = pSrcB[1];
    pDst[0] = (q31_t) ( (((q63_t) aRe * bRe) >> 33) - (((q63_t) aIm * bIm) >> 33) );
    pDst[1] = (q31_t) ( (((q63_t) aRe * bIm) >> 33) + (((q63_t) aIm * bRe) >> 33) );
    aRe = pSrcA[2];
    aIm = pSrcA[3];
    bRe = pSrcB[2];
    bIm = pSrcB[3];
    pDst[2] = (q31_t) ( (((q63_t) aRe * bRe) >> 33) - (((q63_t) aIm * bIm) >> 33) );
    pDst[3] = (q31_t) ( (((q63_t) aRe * bIm) >> 33) + (((q63_t) aIm * bRe) >> 33) );
    aRe = pSrcA[4];
    aIm = pSrcA[5];
    bRe = pSrcB[4];
    bIm = pSrcB[5];
    pDst[4] = (q31_t) ( (((q63_t) aRe * bRe) >> 33) - (((q63_t) aIm * bIm) >> 33) );
    pDst[5] = (q31_t) ( (((q63_t) aRe * bIm) >> 33) + (((q63_t) aIm * bRe) >> 33) );
    aRe = pSrcA[6];
    aIm = pSrcA[7];
    bRe = pSrcB[6];
    bIm = pSrcB[7];
    pDst[6] = (q31_t) ( (((q63_t) aRe * bRe) >> 33) - (((q63_t) aIm * bIm) >> 33) );
    pDst[7] = (q31_t) ( (((q63_t) aRe * bIm) >> 33) + (((q63_t) aIm * bRe) >> 33) );
    /* step past the four samples just handled */
    pSrcA += 8;
    pSrcB += 8;
    pDst += 8;
  }
  /* Samples not covered by the groups of four */
  remaining = numSamples % 0x4U;
 #else
  /* No unrolling: every sample goes through the loop below */
  remaining = numSamples;
 #endif /* #if defined (ARM_MATH_LOOPUNROLL) */
  for (; remaining > 0U; remaining--)
  {
    /* C[2 * i    ] = A[2 * i] * B[2 * i    ] - A[2 * i + 1] * B[2 * i + 1]. */
    /* C[2 * i + 1] = A[2 * i] * B[2 * i + 1] + A[2 * i + 1] * B[2 * i    ]. */
    aRe = pSrcA[0];
    aIm = pSrcA[1];
    pSrcA += 2;
    bRe = pSrcB[0];
    bIm = pSrcB[1];
    pSrcB += 2;
    /* results are stored in 3.29 format */
    pDst[0] = (q31_t) ( (((q63_t) aRe * bRe) >> 33) - (((q63_t) aIm * bIm) >> 33) );
    pDst[1] = (q31_t) ( (((q63_t) aRe * bIm) >> 33) + (((q63_t) aIm * bRe) >> 33) );
    pDst += 2;
  }
 }
 #endif /* defined(ARM_MATH_MVEI) */
 /**
  @} end of CmplxByCmplxMult group
 */
--- a/libraries/cmsis/dsp/Source/ComplexMathFunctions/arm_cmplx_mult_real_f32.c
+++ b/libraries/cmsis/dsp/Source/ComplexMathFunctions/arm_cmplx_mult_real_f32.c
@@ -0,0 +1,224 @@
 /* ----------------------------------------------------------------------
 * Project:      CMSIS DSP Library
 * Title:        arm_cmplx_mult_real_f32.c
 * Description:  Floating-point complex by real multiplication
 *
 * $Date:        18. March 2019
 * $Revision:    V1.6.0
 *
 * Target Processor: Cortex-M cores
 * -------------------------------------------------------------------- */
 /*
 * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
 *
 * SPDX-License-Identifier: Apache-2.0
 *
 * Licensed under the Apache License, Version 2.0 (the License); you may
 * not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an AS IS BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 #include "arm_math.h"
 /**
  @ingroup groupCmplxMath
 */
 /**
  @defgroup CmplxByRealMult Complex-by-Real Multiplication
  Multiplies a complex vector by a real vector and generates a complex result.
  The data in the complex arrays is stored in an interleaved fashion
  (real, imag, real, imag, ...).
  The parameter <code>numSamples</code> represents the number of complex
  samples processed.  The complex arrays have a total of <code>2*numSamples</code>
  real values while the real array has a total of <code>numSamples</code>
  real values.
  The underlying algorithm is:
  <pre>
  for (n = 0; n < numSamples; n++) {
      pCmplxDst[(2*n)+0] = pSrcCmplx[(2*n)+0] * pSrcReal[n];
      pCmplxDst[(2*n)+1] = pSrcCmplx[(2*n)+1] * pSrcReal[n];
  }
  </pre>
  There are separate functions for floating-point, Q15, and Q31 data types.
 */
 /**
  @addtogroup CmplxByRealMult
  @{
 */
 /**
  @brief         Floating-point complex-by-real multiplication.
  @param[in]     pSrcCmplx   points to complex input vector
  @param[in]     pSrcReal    points to real input vector
  @param[out]    pCmplxDst   points to complex output vector
  @param[in]     numSamples  number of complex samples to process (the real vector holds the same number of values)
  @return        none
 */
 #if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE)
 void arm_cmplx_mult_real_f32(
  const float32_t * pSrcCmplx,
  const float32_t * pSrcReal,
        float32_t * pCmplxDst,
        uint32_t numSamples)
 {
    /* Gather offsets {0, 0, 1, 1}: each real input value is fetched twice so that
       it lines up with both scalars (real, imag) of the complex sample it scales. */
    const static uint32_t realPairOffsets[4] = { 0, 0, 1, 1 };
    const uint32_t scalarCount = numSamples * CMPLX_DIM;    /* scalars in each complex vector */
    const uint32x4_t offsetVec = vld1q(realPairOffsets);
    uint32_t remaining;                                     /* loop counter */
    /* Vector loop: one 4-lane vector per pass, i.e. 4 scalars of pSrcCmplx
       (2 complex samples) scaled by 2 values of pSrcReal. */
    for (remaining = scalarCount >> 2; remaining > 0U; remaining--)
    {
        const f32x4_t cmplxLanes = vld1q(pSrcCmplx);
        const f32x4_t realLanes = vldrwq_gather_shifted_offset_f32(pSrcReal, offsetVec);
        vst1q(pCmplxDst, vmulq(cmplxLanes, realLanes));
        pSrcReal += 2;
        pSrcCmplx += 4;
        pCmplxDst += 4;
    }
    /* Scalar tail: whatever complex sample did not fill a whole vector */
    for (remaining = (scalarCount & 3) >> 1; remaining > 0U; remaining--)
    {
        /* C[2 * i    ] = A[2 * i    ] * B[i]. */
        /* C[2 * i + 1] = A[2 * i + 1] * B[i]. */
        const float32_t scale = *pSrcReal++;
        *pCmplxDst++ = *pSrcCmplx++ * scale;
        *pCmplxDst++ = *pSrcCmplx++ * scale;
    }
 }
 #else
 void arm_cmplx_mult_real_f32(
  const float32_t * pSrcCmplx,
  const float32_t * pSrcReal,
        float32_t * pCmplxDst,
        uint32_t numSamples)
 {
        uint32_t blkCnt;                               /* Passes still to run in the current loop */
        float32_t in;                                  /* Real value scaling the current complex sample */
 #if defined(ARM_MATH_NEON) && !defined(ARM_MATH_AUTOVECTORIZE)
    /* NEON path: 4 complex samples per pass */
    for (blkCnt = numSamples >> 2U; blkCnt > 0U; blkCnt--)
    {
        /* De-interleaving load of 8 floats: val[0] and val[1] each receive every second one */
        const float32x4x2_t cplxIn = vld2q_f32(pSrcCmplx);
        /* The 4 real scale factors belonging to those samples */
        const float32x4_t scale = vld1q_f32(pSrcReal);
        float32x4x2_t cplxOut;
        /* Advance the input pointers */
        pSrcCmplx += 8;
        pSrcReal += 4;
        cplxOut.val[0] = vmulq_f32(cplxIn.val[0], scale);
        cplxOut.val[1] = vmulq_f32(cplxIn.val[1], scale);
        /* Interleaving store of the 8 results */
        vst2q_f32(pCmplxDst, cplxOut);
        pCmplxDst += 8;
    }
    /* Samples left for the scalar loop below */
    blkCnt = numSamples & 3;
 #elif defined (ARM_MATH_LOOPUNROLL) && !defined(ARM_MATH_AUTOVECTORIZE)
   /* Unrolled path: 4 complex samples per pass */
   for (blkCnt = numSamples >> 2U; blkCnt > 0U; blkCnt--)
   {
     /* C[2 * i    ] = A[2 * i    ] * B[i]. */
     /* C[2 * i + 1] = A[2 * i + 1] * B[i]. */
     in = pSrcReal[0];
     pCmplxDst[0] = pSrcCmplx[0] * in;
     pCmplxDst[1] = pSrcCmplx[1] * in;
     in = pSrcReal[1];
     pCmplxDst[2] = pSrcCmplx[2] * in;
     pCmplxDst[3] = pSrcCmplx[3] * in;
     in = pSrcReal[2];
     pCmplxDst[4] = pSrcCmplx[4] * in;
     pCmplxDst[5] = pSrcCmplx[5] * in;
     in = pSrcReal[3];
     pCmplxDst[6] = pSrcCmplx[6] * in;
     pCmplxDst[7] = pSrcCmplx[7] * in;
     /* Step past the 4 samples just handled */
     pSrcReal += 4;
     pSrcCmplx += 8;
     pCmplxDst += 8;
   }
   /* Samples left for the scalar loop below */
   blkCnt = numSamples & 0x3U;
 #else
   /* No vector or unrolled path: the scalar loop handles every sample */
   blkCnt = numSamples;
 #endif /* NEON / ARM_MATH_LOOPUNROLL selection */
   /* Scalar loop: one complex sample per pass */
   for (; blkCnt > 0U; blkCnt--)
   {
     /* C[2 * i    ] = A[2 * i    ] * B[i]. */
     /* C[2 * i + 1] = A[2 * i + 1] * B[i]. */
     in = *pSrcReal++;
     *pCmplxDst++ = *pSrcCmplx++ * in;
     *pCmplxDst++ = *pSrcCmplx++ * in;
   }
 }
 #endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
 /**
  @} end of CmplxByRealMult group
 */
--- a/libraries/cmsis/dsp/Source/ComplexMathFunctions/arm_cmplx_mult_real_q15.c
+++ b/libraries/cmsis/dsp/Source/ComplexMathFunctions/arm_cmplx_mult_real_q15.c
@@ -0,0 +1,238 @@
 /* ----------------------------------------------------------------------
 * Project:      CMSIS DSP Library
 * Title:        arm_cmplx_mult_real_q15.c
 * Description:  Q15 complex by real multiplication
 *
 * $Date:        18. March 2019
 * $Revision:    V1.6.0
 *
 * Target Processor: Cortex-M cores
 * -------------------------------------------------------------------- */
 /*
 * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
 *
 * SPDX-License-Identifier: Apache-2.0
 *
 * Licensed under the Apache License, Version 2.0 (the License); you may
 * not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an AS IS BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 #include "arm_math.h"
 /**
  @ingroup groupCmplxMath
 */
 /**
  @addtogroup CmplxByRealMult
  @{
 */
 /**
  @brief         Q15 complex-by-real multiplication.
  @param[in]     pSrcCmplx   points to complex input vector
  @param[in]     pSrcReal    points to real input vector
  @param[out]    pCmplxDst   points to complex output vector
  @param[in]     numSamples  number of complex samples to process (the real vector holds the same number of values)
  @return        none
  @par           Scaling and Overflow Behavior
                   The function uses saturating arithmetic.
                   Results outside of the allowable Q15 range [0x8000 0x7FFF] are saturated.
 */
 #if defined(ARM_MATH_MVEI)
 void arm_cmplx_mult_real_q15(
  const q15_t * pSrcCmplx,
  const q15_t * pSrcReal,
        q15_t * pCmplxDst,
        uint32_t numSamples)
 {
   /* Gather offsets: each real input value is fetched twice so that it lines up
      with both scalars (real, imag) of the complex sample it scales. */
   const static uint16_t realPairOffsets[8] = { 0, 0, 1, 1, 2, 2, 3, 3 };
   const uint32_t scalarCount = numSamples * CMPLX_DIM;   /* scalars in each complex vector */
   const uint16x8_t offsetVec = vld1q(realPairOffsets);
   uint32_t remaining;                                    /* loop counter */
   q31_t product;                                         /* full-width product of two q15 values */
   /* Vector loop: one 8-lane vector per pass, i.e. 8 scalars of pSrcCmplx
      (4 complex samples) scaled by 4 values of pSrcReal. */
   for (remaining = scalarCount >> 3; remaining > 0U; remaining--)
   {
     const q15x8_t cmplxLanes = vld1q(pSrcCmplx);
     const q15x8_t realLanes = vldrhq_gather_shifted_offset_s16(pSrcReal, offsetVec);
     /* Saturating doubling multiply, high half kept */
     vst1q(pCmplxDst, vqdmulhq(cmplxLanes, realLanes));
     pSrcReal += 4;
     pSrcCmplx += 8;
     pCmplxDst += 8;
   }
   /* Scalar tail: complex samples that did not fill a whole vector */
   for (remaining = (scalarCount & 7) >> 1; remaining > 0U; remaining--)
   {
     /* C[2 * i    ] = A[2 * i    ] * B[i]. */
     /* C[2 * i + 1] = A[2 * i + 1] * B[i]. */
     const q15_t scale = *pSrcReal++;
     /* Rescale each product by 15 bits and saturate it to 16 bits before storing */
     product = (q31_t) *pSrcCmplx++ * scale;
     *pCmplxDst++ = (q15_t) __SSAT((product >> 15), 16);
     product = (q31_t) *pSrcCmplx++ * scale;
     *pCmplxDst++ = (q15_t) __SSAT((product >> 15), 16);
   }
 }
 #else
 void arm_cmplx_mult_real_q15(
  const q15_t * pSrcCmplx,
  const q15_t * pSrcReal,
        q15_t * pCmplxDst,
        uint32_t numSamples)
 {
        /* Scales every complex sample of pSrcCmplx by the matching value of pSrcReal.
           With ARM_MATH_LOOPUNROLL, groups of 4 samples are handled by the first loop
           and the remainder by the final loop; otherwise the final loop handles all. */
        uint32_t blkCnt;                               /* Loop counter */
        q15_t in;                                      /* Real value scaling the current complex sample */
 #if defined (ARM_MATH_LOOPUNROLL)
 #if defined (ARM_MATH_DSP)
        q31_t inA1, inA2;                              /* Words from the complex input; both halfwords of each are used */
        q31_t inB1;                                    /* Word from the real input; both halfwords are used */
        q15_t out1, out2, out3, out4;                  /* Saturated results awaiting packing */
        q31_t mul1, mul2, mul3, mul4;                  /* Products before rescaling and saturation */
 #endif
   /* Loop unrolling: Compute 4 outputs at a time */
   blkCnt = numSamples >> 2U;
   while (blkCnt > 0U)
   {
     /* C[2 * i    ] = A[2 * i    ] * B[i]. */
     /* C[2 * i + 1] = A[2 * i + 1] * B[i]. */
 #if defined (ARM_MATH_DSP)
     /* read 2 complex numbers both real and imaginary from complex input buffer */
     /* NOTE(review): read_q15x2_ia is defined elsewhere; presumably it returns two
        adjacent q15 values packed in one word and advances the pointer - confirm */
     inA1 = read_q15x2_ia ((q15_t **) &pSrcCmplx);
     inA2 = read_q15x2_ia ((q15_t **) &pSrcCmplx);
     /* read 2 real values at a time from real input buffer */
     inB1 = read_q15x2_ia ((q15_t **) &pSrcReal);
     /* multiply complex number with real numbers */
 #ifndef ARM_MATH_BIG_ENDIAN
     /* Both halfwords of inA1 are scaled by the low halfword of inB1,
        both halfwords of inA2 by the high halfword of inB1 */
     mul1 = (q31_t) ((q15_t) (inA1)       * (q15_t) (inB1));
     mul2 = (q31_t) ((q15_t) (inA1 >> 16) * (q15_t) (inB1));
     mul3 = (q31_t) ((q15_t) (inA2)       * (q15_t) (inB1 >> 16));
     mul4 = (q31_t) ((q15_t) (inA2 >> 16) * (q15_t) (inB1 >> 16));
 #else
     /* Big-endian build: the halfword of inB1 paired with each input word is swapped,
        so inA1 uses the high halfword of inB1 and inA2 the low halfword */
     mul2 = (q31_t) ((q15_t) (inA1 >> 16) * (q15_t) (inB1 >> 16));
     mul1 = (q31_t) ((q15_t) inA1         * (q15_t) (inB1 >> 16));
     mul4 = (q31_t) ((q15_t) (inA2 >> 16) * (q15_t) inB1);
     mul3 = (q31_t) ((q15_t) inA2         * (q15_t) inB1);
 #endif /* #ifndef ARM_MATH_BIG_ENDIAN */
     /* saturate the result */
     /* each product is shifted right by 15 bits first, then limited to 16 bits */
     out1 = (q15_t) __SSAT(mul1 >> 15U, 16);
     out2 = (q15_t) __SSAT(mul2 >> 15U, 16);
     out3 = (q15_t) __SSAT(mul3 >> 15U, 16);
     out4 = (q15_t) __SSAT(mul4 >> 15U, 16);
     /* pack real and imaginary outputs and store them to destination */
     /* NOTE(review): __PKHBT and write_q15x2_ia are defined elsewhere; presumably the
        pair is packed into one word and stored while advancing pCmplxDst - confirm */
     write_q15x2_ia (&pCmplxDst, __PKHBT(out1, out2, 16));
     write_q15x2_ia (&pCmplxDst, __PKHBT(out3, out4, 16));
     /* Second half of the unrolled pass: same steps for the next two complex samples */
     inA1 = read_q15x2_ia ((q15_t **) &pSrcCmplx);
     inA2 = read_q15x2_ia ((q15_t **) &pSrcCmplx);
     inB1 = read_q15x2_ia ((q15_t **) &pSrcReal);
 #ifndef ARM_MATH_BIG_ENDIAN
     mul1 = (q31_t) ((q15_t) (inA1)       * (q15_t) (inB1));
     mul2 = (q31_t) ((q15_t) (inA1 >> 16) * (q15_t) (inB1));
     mul3 = (q31_t) ((q15_t) (inA2)       * (q15_t) (inB1 >> 16));
     mul4 = (q31_t) ((q15_t) (inA2 >> 16) * (q15_t) (inB1 >> 16));
 #else
     mul2 = (q31_t) ((q15_t) (inA1 >> 16) * (q15_t) (inB1 >> 16));
     mul1 = (q31_t) ((q15_t) inA1         * (q15_t) (inB1 >> 16));
     mul4 = (q31_t) ((q15_t) (inA2 >> 16) * (q15_t) inB1);
     mul3 = (q31_t) ((q15_t) inA2 * (q15_t) inB1);
 #endif /* #ifndef ARM_MATH_BIG_ENDIAN */
     out1 = (q15_t) __SSAT(mul1 >> 15U, 16);
     out2 = (q15_t) __SSAT(mul2 >> 15U, 16);
     out3 = (q15_t) __SSAT(mul3 >> 15U, 16);
     out4 = (q15_t) __SSAT(mul4 >> 15U, 16);
     write_q15x2_ia (&pCmplxDst, __PKHBT(out1, out2, 16));
     write_q15x2_ia (&pCmplxDst, __PKHBT(out3, out4, 16));
 #else
     /* Without ARM_MATH_DSP: four complex samples, one scalar at a time.
        Each product is shifted right by 15 bits and saturated to 16 bits. */
     in = *pSrcReal++;
     *pCmplxDst++ = (q15_t) __SSAT((((q31_t) *pSrcCmplx++ * in) >> 15), 16);
     *pCmplxDst++ = (q15_t) __SSAT((((q31_t) *pSrcCmplx++ * in) >> 15), 16);
     in = *pSrcReal++;
     *pCmplxDst++ = (q15_t) __SSAT((((q31_t) *pSrcCmplx++ * in) >> 15), 16);
     *pCmplxDst++ = (q15_t) __SSAT((((q31_t) *pSrcCmplx++ * in) >> 15), 16);
     in = *pSrcReal++;
     *pCmplxDst++ = (q15_t) __SSAT((((q31_t) *pSrcCmplx++ * in) >> 15), 16);
     *pCmplxDst++ = (q15_t) __SSAT((((q31_t) *pSrcCmplx++ * in) >> 15), 16);
     in = *pSrcReal++;
     *pCmplxDst++ = (q15_t) __SSAT((((q31_t) *pSrcCmplx++ * in) >> 15), 16);
     *pCmplxDst++ = (q15_t) __SSAT((((q31_t) *pSrcCmplx++ * in) >> 15), 16);
 #endif
     /* Decrement loop counter */
     blkCnt--;
   }
   /* Loop unrolling: Compute remaining outputs */
   blkCnt = numSamples % 0x4U;
 #else
   /* Initialize blkCnt with number of samples */
   blkCnt = numSamples;
 #endif /* #if defined (ARM_MATH_LOOPUNROLL) */
   /* One complex sample per pass */
   while (blkCnt > 0U)
   {
     /* C[2 * i    ] = A[2 * i    ] * B[i]. */
     /* C[2 * i + 1] = A[2 * i + 1] * B[i]. */
     in = *pSrcReal++;
     /* store the result in the destination buffer. */
     *pCmplxDst++ = (q15_t) __SSAT((((q31_t) *pSrcCmplx++ * in) >> 15), 16);
     *pCmplxDst++ = (q15_t) __SSAT((((q31_t) *pSrcCmplx++ * in) >> 15), 16);
     /* Decrement loop counter */
     blkCnt--;
   }
 }
 #endif /* defined(ARM_MATH_MVEI) */
 /**
  @} end of CmplxByRealMult group
 */
--- a/libraries/cmsis/dsp/Source/ComplexMathFunctions/arm_cmplx_mult_real_q31.c
+++ b/libraries/cmsis/dsp/Source/ComplexMathFunctions/arm_cmplx_mult_real_q31.c
@@ -0,0 +1,204 @@
 /* ----------------------------------------------------------------------
 * Project:      CMSIS DSP Library
 * Title:        arm_cmplx_mult_real_q31.c
 * Description:  Q31 complex by real multiplication
 *
 * $Date:        18. March 2019
 * $Revision:    V1.6.0
 *
 * Target Processor: Cortex-M cores
 * -------------------------------------------------------------------- */
 /*
 * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
 *
 * SPDX-License-Identifier: Apache-2.0
 *
 * Licensed under the Apache License, Version 2.0 (the License); you may
 * not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an AS IS BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 #include "arm_math.h"
 /**
  @ingroup groupCmplxMath
 */
 /**
  @addtogroup CmplxByRealMult
  @{
 */
 /**
  @brief         Q31 complex-by-real multiplication.
  @param[in]     pSrcCmplx   points to complex input vector
  @param[in]     pSrcReal    points to real input vector
  @param[out]    pCmplxDst   points to complex output vector
  @param[in]     numSamples  number of complex samples to process (the real vector holds the same number of values)
  @return        none
  @par           Scaling and Overflow Behavior
                   The function uses saturating arithmetic.
                   Results outside of the allowable Q31 range[0x80000000 0x7FFFFFFF] are saturated.
 */
 #if defined(ARM_MATH_MVEI)
 void arm_cmplx_mult_real_q31(
  const q31_t * pSrcCmplx,
  const q31_t * pSrcReal,
        q31_t * pCmplxDst,
        uint32_t numSamples)
 {
    /* Gather offsets {0, 0, 1, 1}: each real input value is fetched twice so that
       it lines up with both scalars (real, imag) of the complex sample it scales. */
    const static uint32_t realPairOffsets[4] = { 0, 0, 1, 1 };
    const uint32_t scalarCount = numSamples * CMPLX_DIM;    /* scalars in each complex vector */
    const uint32x4_t offsetVec = vld1q(realPairOffsets);
    uint32_t remaining;                                     /* loop counter */
    q31_t highWord;                                         /* upper 32 bits of a 64-bit product */
    /* Vector loop: one 4-lane vector per pass, i.e. 4 scalars of pSrcCmplx
       (2 complex samples) scaled by 2 values of pSrcReal. */
    for (remaining = scalarCount >> 2; remaining > 0U; remaining--)
    {
        const q31x4_t cmplxLanes = vld1q(pSrcCmplx);
        const q31x4_t realLanes = vldrwq_gather_shifted_offset_s32(pSrcReal, offsetVec);
        /* Saturating doubling multiply, high half kept */
        vst1q(pCmplxDst, vqdmulhq(cmplxLanes, realLanes));
        pSrcReal += 2;
        pSrcCmplx += 4;
        pCmplxDst += 4;
    }
    /* Scalar tail: whatever complex sample did not fill a whole vector */
    for (remaining = (scalarCount & 3) >> 1; remaining > 0U; remaining--)
    {
        /* C[2 * i    ] = A[2 * i    ] * B[i]. */
        /* C[2 * i + 1] = A[2 * i + 1] * B[i]. */
        const q31_t scale = *pSrcReal++;
        /* Keep the upper word of the 64-bit product, saturate it to 31 bits,
           then shift left by one to store the result in 1.31 format */
        highWord = (q31_t) (((q63_t) *pSrcCmplx++ * scale) >> 32);
        *pCmplxDst++ = (__SSAT(highWord, 31) << 1);
        highWord = (q31_t) (((q63_t) *pSrcCmplx++ * scale) >> 32);
        *pCmplxDst++ = (__SSAT(highWord, 31) << 1);
    }
 }
 #else
 void arm_cmplx_mult_real_q31(
  const q31_t * pSrcCmplx,
  const q31_t * pSrcReal,
        q31_t * pCmplxDst,
        uint32_t numSamples)
 {
        uint32_t blkCnt;                               /* Passes still to run in the current loop */
        q31_t in;                                      /* Real value scaling the current complex sample */
 /* CMPLX_MULT_REAL_Q31_SCALE(x, r): one output scalar, the 64-bit product of x and r
    brought back to 1.31 format. Local to this function and removed again at its end. */
 #if defined (ARM_MATH_DSP)
 /* Upper word of the product, saturated to 31 bits, then shifted left by one */
 #define CMPLX_MULT_REAL_Q31_SCALE(x, r) (__SSAT((q31_t) (((q63_t) (x) * (r)) >> 32), 31) << 1)
 #else
 /* Product shifted right by 31 bits, then clipped to the q31 range */
 #define CMPLX_MULT_REAL_Q31_SCALE(x, r) ((q31_t) clip_q63_to_q31(((q63_t) (x) * (r)) >> 31))
 #endif
 #if defined (ARM_MATH_LOOPUNROLL)
   /* Unrolled path: 4 complex samples per pass */
   for (blkCnt = numSamples >> 2U; blkCnt > 0U; blkCnt--)
   {
     /* C[2 * i    ] = A[2 * i    ] * B[i]. */
     /* C[2 * i + 1] = A[2 * i + 1] * B[i]. */
     in = *pSrcReal++;
     *pCmplxDst++ = CMPLX_MULT_REAL_Q31_SCALE(*pSrcCmplx++, in);
     *pCmplxDst++ = CMPLX_MULT_REAL_Q31_SCALE(*pSrcCmplx++, in);
     in = *pSrcReal++;
     *pCmplxDst++ = CMPLX_MULT_REAL_Q31_SCALE(*pSrcCmplx++, in);
     *pCmplxDst++ = CMPLX_MULT_REAL_Q31_SCALE(*pSrcCmplx++, in);
     in = *pSrcReal++;
     *pCmplxDst++ = CMPLX_MULT_REAL_Q31_SCALE(*pSrcCmplx++, in);
     *pCmplxDst++ = CMPLX_MULT_REAL_Q31_SCALE(*pSrcCmplx++, in);
     in = *pSrcReal++;
     *pCmplxDst++ = CMPLX_MULT_REAL_Q31_SCALE(*pSrcCmplx++, in);
     *pCmplxDst++ = CMPLX_MULT_REAL_Q31_SCALE(*pSrcCmplx++, in);
   }
   /* Samples left for the scalar loop below */
   blkCnt = numSamples & 0x3U;
 #else
   /* No unrolling: the scalar loop handles every sample */
   blkCnt = numSamples;
 #endif /* #if defined (ARM_MATH_LOOPUNROLL) */
   /* Scalar loop: one complex sample per pass */
   for (; blkCnt > 0U; blkCnt--)
   {
     /* C[2 * i    ] = A[2 * i    ] * B[i]. */
     /* C[2 * i + 1] = A[2 * i + 1] * B[i]. */
     in = *pSrcReal++;
     *pCmplxDst++ = CMPLX_MULT_REAL_Q31_SCALE(*pSrcCmplx++, in);
     *pCmplxDst++ = CMPLX_MULT_REAL_Q31_SCALE(*pSrcCmplx++, in);
   }
 #undef CMPLX_MULT_REAL_Q31_SCALE
 }
 #endif /* defined(ARM_MATH_MVEI) */
 /**
  @} end of CmplxByRealMult group
 */
--- a/libraries/cmsis/dsp/Source/ControllerFunctions/CMakeLists.txt
+++ b/libraries/cmsis/dsp/Source/ControllerFunctions/CMakeLists.txt
@@ -0,0 +1,41 @@
 # Builds the CMSISDSPController static library from the PID init/reset sources
 # and, depending on the table configuration, the sin/cos sources.
 #
 # configLib, configDsp and interpol are CMake modules pulled in with include(),
 # so they must be reachable through CMAKE_MODULE_PATH. ROOT and DSP are not set
 # in this file and must be provided by the including project.
 #
 # add_library() is called below without any source file (sources are attached
 # afterwards with target_sources()); CMake accepts that from version 3.11 on.
 cmake_minimum_required (VERSION 3.11)
 project(CMSISDSPController)
 include(configLib)
 include(configDsp)
 add_library(CMSISDSPController STATIC)
 configLib(CMSISDSPController ${ROOT})
 configDsp(CMSISDSPController ${ROOT})
 include(interpol)
 interpol(CMSISDSPController)
 # Custom table configuration with every fast table requested
 if (CONFIGTABLE AND ALLFAST)
   target_compile_definitions(CMSISDSPController PUBLIC ARM_ALL_FAST_TABLES)
 endif()
 # Sources that are always part of the library
 target_sources(CMSISDSPController PRIVATE
   arm_pid_init_f32.c
   arm_pid_init_q15.c
   arm_pid_init_q31.c
   arm_pid_reset_f32.c
   arm_pid_reset_q15.c
   arm_pid_reset_q31.c
 )
 # sin/cos sources: built unless CONFIGTABLE is set and neither ALLFAST nor the
 # matching ARM_SIN_COS_* selection is set
 if (NOT CONFIGTABLE OR ALLFAST OR ARM_SIN_COS_F32)
   target_sources(CMSISDSPController PRIVATE arm_sin_cos_f32.c)
 endif()
 if (NOT CONFIGTABLE OR ALLFAST OR ARM_SIN_COS_Q31)
   target_sources(CMSISDSPController PRIVATE arm_sin_cos_q31.c)
 endif()
 ### Includes
 target_include_directories(CMSISDSPController PUBLIC "${DSP}/Include")
--- a/Show More
+++ b/Show More
Author	SHA1	Message	Date
Artery-MCU	716f545aa1	update version to v2.1.5	2024-08-27 09:56:16 +08:00
Artery-MCU	95481a671d	update version to v2.1.4	2024-05-13 13:48:59 +08:00
Artery-MCU	0a93017e07	update version to v2.1.2	2024-01-25 10:08:14 +08:00
Artery-MCU	9d554c6b91	update version to v2.1.1	2023-10-30 11:24:27 +08:00
Artery-MCU	f49a554036	update version to v2.1.0	2023-08-08 19:30:55 +08:00
Artery-MCU	d7bcb64bf8	update version to v2.0.9	2023-02-28 12:59:38 +08:00
Artery-MCU	d4910499d3	update version to v2.0.8	2022-11-22 18:18:07 +08:00
Artery-MCU	d95c5fb9e8	update version to v2.0.7	2022-08-26 14:45:50 +08:00
Artery-MCU	ac9d1046b7	update version to v2.0.6	2022-06-30 17:16:46 +08:00
Artery-MCU	f09f510f6d	update version to v2.0.5	2022-05-23 20:01:46 +08:00
Artery-MCU	ccd0f1e108	update version to v2.0.4	2022-04-11 19:32:28 +08:00
Artery-MCU	07d7347ba4	update version to v2.0.3	2022-03-03 19:38:35 +08:00
Artery-MCU	79c8fdf6b4	update version to v2.0.2	2022-01-21 15:54:53 +08:00
Artery-MCU	47b90bbacd	add LICENSE.	2021-12-14 09:38:26 +00:00