update version to v2.0.8

2026-05-21 09:22:19 +00:00 · 2022-04-11 19:22:17 +08:00
parent eb00682e95
commit a89a26cea4
2631 changed files with 230547 additions and 26018 deletions
--- a/AT32F403A_407_periph_lib_V2.0.8.chm
+++ b/AT32F403A_407_periph_lib_V2.0.8.chm
--- a/document/ReleaseNotes_AT32F403A_407_Firmware_Library.pdf
+++ b/document/ReleaseNotes_AT32F403A_407_Firmware_Library.pdf
--- a/libraries/cmsis/cm4/core_support/arm_helium_utils.h
+++ b/libraries/cmsis/cm4/core_support/arm_helium_utils.h
@@ -180,7 +180,7 @@ __STATIC_INLINE arm_status arm_mat_trans_32bit_generic_mve(
        while (blkCnt > 0U)
        {
            vecIn = vldrwq_gather_shifted_offset_u32(pDataC, vecOffs);
-            vstrwq(pDataDestR, vecIn); 
+            vstrwq(pDataDestR, vecIn);
            pDataDestR += 4;
            pDataC = pDataC + srcCols * 4;
            /*
--- a/libraries/cmsis/cm4/core_support/arm_math.h
+++ b/libraries/cmsis/cm4/core_support/arm_math.h
@@ -29,7 +29,7 @@
   * ------------
   *
   * This user manual describes the CMSIS DSP software library,
-   * a suite of common signal processing functions for use on Cortex-M and Cortex-A processor 
+   * a suite of common signal processing functions for use on Cortex-M and Cortex-A processor
   * based devices.
   *
   * The library is divided into a number of functions each covering a specific category:
@@ -91,8 +91,8 @@
   *
   * The library is now tested on Fast Models building with cmake.
   * Core M0, M7, A5 are tested.
-   * 
+   *
-   * 
+   *
   *
   * Building the Library
   * ------------
@@ -129,12 +129,12 @@
   * - ARM_MATH_NEON:
   *
   * Define macro ARM_MATH_NEON to enable Neon versions of the DSP functions.
-   * It is not enabled by default when Neon is available because performances are 
+   * It is not enabled by default when Neon is available because performances are
   * dependent on the compiler and target architecture.
   *
   * - ARM_MATH_NEON_EXPERIMENTAL:
   *
-   * Define macro ARM_MATH_NEON_EXPERIMENTAL to enable experimental Neon versions of 
+   * Define macro ARM_MATH_NEON_EXPERIMENTAL to enable experimental Neon versions of
   * some DSP functions. Experimental Neon versions currently do not have better
   * performances than the scalar versions.
   *
@@ -309,11 +309,11 @@
 * generated from the scikit-learn object. Some examples are given in
 * DSP/Testing/PatternGeneration/SVM.py
 *
- * If more than 2 classes are needed, the functions in this folder 
+ * If more than 2 classes are needed, the functions in this folder
 * will have to be used, as building blocks, to do multi-class classification.
 *
 * No multi-class classification is provided in this SVM folder.
- * 
+ *
 */
@@ -372,7 +372,7 @@ extern "C"
 /* Included for intrinsics definitions */
-#if defined (_MSC_VER ) 
+#if defined (_MSC_VER )
 #include <stdint.h>
 #define __STATIC_FORCEINLINE static __forceinline
 #define __STATIC_INLINE static __inline
@@ -715,7 +715,7 @@ extern "C"
   * @brief 16-bit float 64-bit vector data type.
   */
  typedef  __ALIGNED(2) float16x4_t f16x4_t;
-#endif 
+#endif
  /**
   * @brief 32-bit floating-point 128-bit vector triplet data type
@@ -774,7 +774,7 @@ extern "C"
   * @brief 16-bit floating-point 64-bit vector quadruplet data type
   */
  typedef float16x4x4_t f16x4x4_t;
-#endif 
+#endif
  /**
   * @brief 32-bit fractional 64-bit vector pair data type in 1.31 format
@@ -839,7 +839,7 @@ extern "C"
      float16x4_t     f;
      int16x4_t       i;
  } any16x4_t;
-#endif 
+#endif
  /**
   * @brief 32-bit status 64-bit vector data type.
@@ -1011,7 +1011,7 @@ __STATIC_FORCEINLINE q31_t read_q7x4_ia (
  memcpy (&val, *pQ7, 4);
 #else
  val =(((*pQ7)[3] & 0x0FF) << 24)  | (((*pQ7)[2] & 0x0FF) << 16)  | (((*pQ7)[1] & 0x0FF) << 8)  | ((*pQ7)[0] & 0x0FF);
-#endif 
+#endif
  *pQ7 += 4;
@@ -1031,7 +1031,7 @@ __STATIC_FORCEINLINE q31_t read_q7x4_da (
  memcpy (&val, *pQ7, 4);
 #else
  val = ((((*pQ7)[3]) & 0x0FF) << 24) | ((((*pQ7)[2]) & 0x0FF) << 16)   | ((((*pQ7)[1]) & 0x0FF) << 8)  | ((*pQ7)[0] & 0x0FF);
-#endif 
+#endif
  *pQ7 -= 4;
  return (val);
@@ -1964,7 +1964,7 @@ __STATIC_INLINE q31_t arm_div_q63_to_q31(q63_t num, q31_t den)
  {
      float32_t coeffs[8][4]; /**< Points to the array of modified coefficients.  The array is of length 32. There is one per stage */
  } arm_biquad_mod_coef_f32;
-#endif 
+#endif
  /**
   * @brief Processing function for the Q15 Biquad cascade filter.
@@ -2073,11 +2073,11 @@ __STATIC_INLINE q31_t arm_div_q63_to_q31(q63_t num, q31_t den)
  void arm_biquad_cascade_df1_mve_init_f32(
      arm_biquad_casd_df1_inst_f32 * S,
      uint8_t numStages,
-      const float32_t * pCoeffs, 
+      const float32_t * pCoeffs,
-      arm_biquad_mod_coef_f32 * pCoeffsMod, 
+      arm_biquad_mod_coef_f32 * pCoeffsMod,
      float32_t * pState);
 #endif
-  
+
  void arm_biquad_cascade_df1_init_f32(
        arm_biquad_casd_df1_inst_f32 * S,
        uint8_t numStages,
@@ -2171,7 +2171,7 @@ __STATIC_INLINE q31_t arm_div_q63_to_q31(q63_t num, q31_t den)
  /**
   * @brief         Compute the logical bitwise NOT of a fixed-point vector.
-   * @param[in]     pSrc       points to input vector 
+   * @param[in]     pSrc       points to input vector
   * @param[out]    pDst       points to output vector
   * @param[in]     blockSize  number of samples in each vector
   * @return        none
@@ -2183,7 +2183,7 @@ __STATIC_INLINE q31_t arm_div_q63_to_q31(q63_t num, q31_t den)
  /**
   * @brief         Compute the logical bitwise NOT of a fixed-point vector.
-   * @param[in]     pSrc       points to input vector 
+   * @param[in]     pSrc       points to input vector
   * @param[out]    pDst       points to output vector
   * @param[in]     blockSize  number of samples in each vector
   * @return        none
@@ -2195,7 +2195,7 @@ __STATIC_INLINE q31_t arm_div_q63_to_q31(q63_t num, q31_t den)
  /**
   * @brief         Compute the logical bitwise NOT of a fixed-point vector.
-   * @param[in]     pSrc       points to input vector 
+   * @param[in]     pSrc       points to input vector
   * @param[out]    pDst       points to output vector
   * @param[in]     blockSize  number of samples in each vector
   * @return        none
@@ -2280,11 +2280,11 @@ __STATIC_INLINE q31_t arm_div_q63_to_q31(q63_t num, q31_t den)
  /**
   * @brief Instance structure for the sorting algorithms.
   */
-  typedef struct            
+  typedef struct
  {
    arm_sort_alg alg;        /**< Sorting algorithm selected */
    arm_sort_dir dir;        /**< Sorting order (direction)  */
-  } arm_sort_instance_f32;  
+  } arm_sort_instance_f32;
  /**
   * @param[in]  S          points to an instance of the sorting structure.
@@ -2293,9 +2293,9 @@ __STATIC_INLINE q31_t arm_div_q63_to_q31(q63_t num, q31_t den)
   * @param[in]  blockSize  number of samples to process.
   */
  void arm_sort_f32(
-    const arm_sort_instance_f32 * S, 
+    const arm_sort_instance_f32 * S,
-          float32_t * pSrc, 
+          float32_t * pSrc,
-          float32_t * pDst, 
+          float32_t * pDst,
          uint32_t blockSize);
  /**
@@ -2304,18 +2304,18 @@ __STATIC_INLINE q31_t arm_div_q63_to_q31(q63_t num, q31_t den)
   * @param[in]      dir          Sorting order.
   */
  void arm_sort_init_f32(
-    arm_sort_instance_f32 * S, 
+    arm_sort_instance_f32 * S,
-    arm_sort_alg alg, 
+    arm_sort_alg alg,
-    arm_sort_dir dir); 
+    arm_sort_dir dir);
  /**
   * @brief Instance structure for the sorting algorithms.
   */
-  typedef struct            
+  typedef struct
  {
    arm_sort_dir dir;        /**< Sorting order (direction)  */
    float32_t * buffer;      /**< Working buffer */
-  } arm_merge_sort_instance_f32;  
+  } arm_merge_sort_instance_f32;
  /**
   * @param[in]      S          points to an instance of the sorting structure.
@@ -2368,7 +2368,7 @@ __STATIC_INLINE q31_t arm_div_q63_to_q31(q63_t num, q31_t den)
   * @param[in]  blockSize  number of samples of output data.
   */
  void arm_spline_f32(
-        arm_spline_instance_f32 * S, 
+        arm_spline_instance_f32 * S,
  const float32_t * xq,
        float32_t * pDst,
        uint32_t blockSize);
@@ -2388,7 +2388,7 @@ __STATIC_INLINE q31_t arm_div_q63_to_q31(q63_t num, q31_t den)
          arm_spline_type type,
    const float32_t * x,
    const float32_t * y,
-          uint32_t n, 
+          uint32_t n,
          float32_t * coeffs,
          float32_t * tempBuffer);
@@ -2401,7 +2401,7 @@ __STATIC_INLINE q31_t arm_div_q63_to_q31(q63_t num, q31_t den)
    uint16_t numCols;     /**< number of columns of the matrix.  */
    float32_t *pData;     /**< points to the data of the matrix. */
  } arm_matrix_instance_f32;
- 
+
 /**
   * @brief Instance structure for the floating-point matrix structure.
   */
@@ -4765,7 +4765,7 @@ arm_status arm_fir_decimate_init_f32(
        uint32_t blockSize);
-#if defined(ARM_MATH_NEON) 
+#if defined(ARM_MATH_NEON)
 void arm_biquad_cascade_df2T_compute_coefs_f32(
  arm_biquad_cascade_df2T_instance_f32 * S,
  uint8_t numStages,
@@ -7934,7 +7934,7 @@ typedef struct
 */
-void arm_svm_linear_init_f32(arm_svm_linear_instance_f32 *S, 
+void arm_svm_linear_init_f32(arm_svm_linear_instance_f32 *S,
  uint32_t nbOfSupportVectors,
  uint32_t vectorDimension,
  float32_t intercept,
@@ -7950,9 +7950,9 @@ void arm_svm_linear_init_f32(arm_svm_linear_instance_f32 *S,
 * @return none.
 *
 */
-  
+
-void arm_svm_linear_predict_f32(const arm_svm_linear_instance_f32 *S, 
+void arm_svm_linear_predict_f32(const arm_svm_linear_instance_f32 *S,
-   const float32_t * in, 
+   const float32_t * in,
   int32_t * pResult);
@@ -7973,7 +7973,7 @@ void arm_svm_linear_predict_f32(const arm_svm_linear_instance_f32 *S,
 */
-void arm_svm_polynomial_init_f32(arm_svm_polynomial_instance_f32 *S, 
+void arm_svm_polynomial_init_f32(arm_svm_polynomial_instance_f32 *S,
  uint32_t nbOfSupportVectors,
  uint32_t vectorDimension,
  float32_t intercept,
@@ -7993,8 +7993,8 @@ void arm_svm_polynomial_init_f32(arm_svm_polynomial_instance_f32 *S,
 * @return none.
 *
 */
-void arm_svm_polynomial_predict_f32(const arm_svm_polynomial_instance_f32 *S, 
+void arm_svm_polynomial_predict_f32(const arm_svm_polynomial_instance_f32 *S,
-   const float32_t * in, 
+   const float32_t * in,
   int32_t * pResult);
@@ -8012,7 +8012,7 @@ void arm_svm_polynomial_predict_f32(const arm_svm_polynomial_instance_f32 *S,
 *
 */
-void arm_svm_rbf_init_f32(arm_svm_rbf_instance_f32 *S, 
+void arm_svm_rbf_init_f32(arm_svm_rbf_instance_f32 *S,
  uint32_t nbOfSupportVectors,
  uint32_t vectorDimension,
  float32_t intercept,
@@ -8030,8 +8030,8 @@ void arm_svm_rbf_init_f32(arm_svm_rbf_instance_f32 *S,
 * @return none.
 *
 */
-void arm_svm_rbf_predict_f32(const arm_svm_rbf_instance_f32 *S, 
+void arm_svm_rbf_predict_f32(const arm_svm_rbf_instance_f32 *S,
-   const float32_t * in, 
+   const float32_t * in,
   int32_t * pResult);
 /**
@@ -8049,7 +8049,7 @@ void arm_svm_rbf_predict_f32(const arm_svm_rbf_instance_f32 *S,
 *
 */
-void arm_svm_sigmoid_init_f32(arm_svm_sigmoid_instance_f32 *S, 
+void arm_svm_sigmoid_init_f32(arm_svm_sigmoid_instance_f32 *S,
  uint32_t nbOfSupportVectors,
  uint32_t vectorDimension,
  float32_t intercept,
@@ -8068,8 +8068,8 @@ void arm_svm_sigmoid_init_f32(arm_svm_sigmoid_instance_f32 *S,
 * @return none.
 *
 */
-void arm_svm_sigmoid_predict_f32(const arm_svm_sigmoid_instance_f32 *S, 
+void arm_svm_sigmoid_predict_f32(const arm_svm_sigmoid_instance_f32 *S,
-   const float32_t * in, 
+   const float32_t * in,
   int32_t * pResult);
@@ -8098,8 +8098,8 @@ typedef struct
 */
-uint32_t arm_gaussian_naive_bayes_predict_f32(const arm_gaussian_naive_bayes_instance_f32 *S, 
+uint32_t arm_gaussian_naive_bayes_predict_f32(const arm_gaussian_naive_bayes_instance_f32 *S,
-   const float32_t * in, 
+   const float32_t * in,
   float32_t *pBuffer);
 /**
@@ -8197,8 +8197,8 @@ float32_t arm_kullback_leibler_f32(const float32_t * pSrcA
 * @return Kullback-Leibler  Divergence D(A || B)
 *
 */
-float64_t arm_kullback_leibler_f64(const float64_t * pSrcA, 
+float64_t arm_kullback_leibler_f64(const float64_t * pSrcA,
-                const float64_t * pSrcB, 
+                const float64_t * pSrcB,
                uint32_t blockSize);
@@ -8925,11 +8925,11 @@ float32_t arm_yule_distance(const uint32_t *pA, const uint32_t *pB, uint32_t num
  #define LOW_OPTIMIZATION_EXIT
  #define IAR_ONLY_LOW_OPTIMIZATION_ENTER
  #define IAR_ONLY_LOW_OPTIMIZATION_EXIT
-       
+
 #elif defined ( _MSC_VER ) || defined(__GNUC_PYTHON__)
      #define LOW_OPTIMIZATION_ENTER
      #define LOW_OPTIMIZATION_EXIT
-      #define IAR_ONLY_LOW_OPTIMIZATION_ENTER 
+      #define IAR_ONLY_LOW_OPTIMIZATION_ENTER
      #define IAR_ONLY_LOW_OPTIMIZATION_EXIT
 #endif
--- a/libraries/cmsis/cm4/core_support/arm_mve_tables.h
+++ b/libraries/cmsis/cm4/core_support/arm_mve_tables.h
@@ -32,10 +32,10 @@
 #include "arm_math.h"
- 
+
 #if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE)
@@ -98,7 +98,7 @@ extern float32_t rearranged_twiddle_stride3_4096_f32[2728];
-#if defined(ARM_MATH_MVEI) 
+#if defined(ARM_MATH_MVEI)
 #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_FFT_ALLOW_TABLES)
@@ -159,7 +159,7 @@ extern q31_t rearranged_twiddle_stride3_4096_q31[2728];
-#if defined(ARM_MATH_MVEI) 
+#if defined(ARM_MATH_MVEI)
 #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_FFT_ALLOW_TABLES)
@@ -220,7 +220,7 @@ extern q15_t rearranged_twiddle_stride3_4096_q15[2728];
-#if defined(ARM_MATH_MVEI) 
+#if defined(ARM_MATH_MVEI)
 #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_FFT_ALLOW_TABLES)
--- a/libraries/cmsis/cm4/core_support/cmsis_armcc.h
+++ b/libraries/cmsis/cm4/core_support/cmsis_armcc.h
@@ -63,9 +63,9 @@
 #ifndef   __STATIC_INLINE
  #define __STATIC_INLINE                        static __inline
 #endif
-#ifndef   __STATIC_FORCEINLINE                 
+#ifndef   __STATIC_FORCEINLINE
  #define __STATIC_FORCEINLINE                   static __forceinline
-#endif           
+#endif
 #ifndef   __NO_RETURN
  #define __NO_RETURN                            __declspec(noreturn)
 #endif
@@ -461,7 +461,7 @@ __STATIC_INLINE void __set_FPSCR(uint32_t fpscr)
 */
 #define __DMB()                           __dmb(0xF)
-                  
+
 /**
  \brief   Reverse byte order (32 bit)
  \details Reverses the byte order in unsigned integer value. For example, 0x12345678 becomes 0x78563412.
--- a/libraries/cmsis/cm4/core_support/cmsis_armclang.h
+++ b/libraries/cmsis/cm4/core_support/cmsis_armclang.h
@@ -597,7 +597,7 @@ __STATIC_FORCEINLINE void __TZ_set_FAULTMASK_NS(uint32_t faultMask)
  Devices without ARMv8-M Main Extensions (i.e. Cortex-M23) lack the non-secure
  Stack Pointer Limit register hence zero is returned always in non-secure
  mode.
-  
+
  \details Returns the current value of the Process Stack Pointer Limit (PSPLIM).
  \return               PSPLIM Register value
 */
@@ -645,7 +645,7 @@ __STATIC_FORCEINLINE uint32_t __TZ_get_PSPLIM_NS(void)
  Devices without ARMv8-M Main Extensions (i.e. Cortex-M23) lack the non-secure
  Stack Pointer Limit register hence the write is silently ignored in non-secure
  mode.
-  
+
  \details Assigns the given value to the Process Stack Pointer Limit (PSPLIM).
  \param [in]    ProcStackPtrLimit  Process Stack Pointer Limit value to set
 */
@@ -1228,7 +1228,7 @@ __STATIC_FORCEINLINE uint32_t __USAT(int32_t val, uint32_t sat)
 #if ((defined (__ARM_ARCH_8M_MAIN__  ) && (__ARM_ARCH_8M_MAIN__   == 1)) || \
     (defined (__ARM_ARCH_8M_BASE__  ) && (__ARM_ARCH_8M_BASE__   == 1)) || \
     (defined (__ARM_ARCH_8_1M_MAIN__) && (__ARM_ARCH_8_1M_MAIN__ == 1))     )
-           
+
 /**
  \brief   Load-Acquire (8 bit)
  \details Executes a LDAB instruction for 8 bit value.
--- a/libraries/cmsis/cm4/core_support/cmsis_armclang_ltm.h
+++ b/libraries/cmsis/cm4/core_support/cmsis_armclang_ltm.h
@@ -595,7 +595,7 @@ __STATIC_FORCEINLINE void __TZ_set_FAULTMASK_NS(uint32_t faultMask)
  Devices without ARMv8-M Main Extensions (i.e. Cortex-M23) lack the non-secure
  Stack Pointer Limit register hence zero is returned always in non-secure
  mode.
-  
+
  \details Returns the current value of the Process Stack Pointer Limit (PSPLIM).
  \return               PSPLIM Register value
 */
@@ -641,7 +641,7 @@ __STATIC_FORCEINLINE uint32_t __TZ_get_PSPLIM_NS(void)
  Devices without ARMv8-M Main Extensions (i.e. Cortex-M23) lack the non-secure
  Stack Pointer Limit register hence the write is silently ignored in non-secure
  mode.
-  
+
  \details Assigns the given value to the Process Stack Pointer Limit (PSPLIM).
  \param [in]    ProcStackPtrLimit  Process Stack Pointer Limit value to set
 */
--- a/libraries/cmsis/cm4/core_support/cmsis_gcc.h
+++ b/libraries/cmsis/cm4/core_support/cmsis_gcc.h
@@ -46,9 +46,9 @@
 #ifndef   __STATIC_INLINE
  #define __STATIC_INLINE                        static inline
 #endif
-#ifndef   __STATIC_FORCEINLINE                 
+#ifndef   __STATIC_FORCEINLINE
  #define __STATIC_FORCEINLINE                   __attribute__((always_inline)) static inline
-#endif                                           
+#endif
 #ifndef   __NO_RETURN
  #define __NO_RETURN                            __attribute__((__noreturn__))
 #endif
@@ -126,23 +126,23 @@
  \details This default implementation initializes all data and additional bss
           sections relying on .copy.table and .zero.table specified properly
           in the used linker script.
-  
+
 */
 __STATIC_FORCEINLINE __NO_RETURN void __cmsis_start(void)
 {
  extern void _start(void) __NO_RETURN;
-  
+
  typedef struct {
    uint32_t const* src;
    uint32_t* dest;
    uint32_t  wlen;
  } __copy_table_t;
-  
+
  typedef struct {
    uint32_t* dest;
    uint32_t  wlen;
  } __zero_table_t;
-  
+
  extern const __copy_table_t __copy_table_start__;
  extern const __copy_table_t __copy_table_end__;
  extern const __zero_table_t __zero_table_start__;
@@ -153,16 +153,16 @@ __STATIC_FORCEINLINE __NO_RETURN void __cmsis_start(void)
      pTable->dest[i] = pTable->src[i];
    }
  }
- 
+
  for (__zero_table_t const* pTable = &__zero_table_start__; pTable < &__zero_table_end__; ++pTable) {
    for(uint32_t i=0u; i<pTable->wlen; ++i) {
      pTable->dest[i] = 0u;
    }
  }
- 
+
  _start();
 }
-  
+
 #define __PROGRAM_START           __cmsis_start
 #endif
@@ -652,7 +652,7 @@ __STATIC_FORCEINLINE void __TZ_set_FAULTMASK_NS(uint32_t faultMask)
  Devices without ARMv8-M Main Extensions (i.e. Cortex-M23) lack the non-secure
  Stack Pointer Limit register hence zero is returned always in non-secure
  mode.
-  
+
  \details Returns the current value of the Process Stack Pointer Limit (PSPLIM).
  \return               PSPLIM Register value
 */
@@ -697,7 +697,7 @@ __STATIC_FORCEINLINE uint32_t __TZ_get_PSPLIM_NS(void)
  Devices without ARMv8-M Main Extensions (i.e. Cortex-M23) lack the non-secure
  Stack Pointer Limit register hence the write is silently ignored in non-secure
  mode.
-  
+
  \details Assigns the given value to the Process Stack Pointer Limit (PSPLIM).
  \param [in]    ProcStackPtrLimit  Process Stack Pointer Limit value to set
 */
@@ -834,7 +834,7 @@ __STATIC_FORCEINLINE uint32_t __get_FPSCR(void)
 {
 #if ((defined (__FPU_PRESENT) && (__FPU_PRESENT == 1U)) && \
     (defined (__FPU_USED   ) && (__FPU_USED    == 1U))     )
-#if __has_builtin(__builtin_arm_get_fpscr) 
+#if __has_builtin(__builtin_arm_get_fpscr)
 // Re-enable using built-in when GCC has been fixed
 // || (__GNUC__ > 7) || (__GNUC__ == 7 && __GNUC_MINOR__ >= 2)
  /* see https://gcc.gnu.org/ml/gcc-patches/2017-04/msg00443.html */
--- a/libraries/cmsis/cm4/core_support/cmsis_iccarm.h
+++ b/libraries/cmsis/cm4/core_support/cmsis_iccarm.h
@@ -8,7 +8,7 @@
 //------------------------------------------------------------------------------
 //
 // Copyright (c) 2017-2019 IAR Systems
-// Copyright (c) 2017-2019 Arm Limited. All rights reserved. 
+// Copyright (c) 2017-2019 Arm Limited. All rights reserved.
 //
 // SPDX-License-Identifier: Apache-2.0
 //
--- a/libraries/cmsis/cm4/core_support/core_cm4.h
+++ b/libraries/cmsis/cm4/core_support/core_cm4.h
@@ -198,7 +198,7 @@
    #define __VTOR_PRESENT             1U
    #warning "__VTOR_PRESENT not defined in device header file; using default!"
  #endif
-  
+
  #ifndef __NVIC_PRIO_BITS
    #define __NVIC_PRIO_BITS          3U
    #warning "__NVIC_PRIO_BITS not defined in device header file; using default!"
--- a/libraries/cmsis/cm4/core_support/mpu_armv7.h
+++ b/libraries/cmsis/cm4/core_support/mpu_armv7.h
@@ -21,13 +21,13 @@
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
- 
+
 #if   defined ( __ICCARM__ )
  #pragma system_include         /* treat file as system include file for MISRA check */
 #elif defined (__clang__)
  #pragma clang system_header    /* treat file as system include file */
 #endif
- 
+
 #ifndef ARM_MPU_ARMV7_H
 #define ARM_MPU_ARMV7_H
@@ -79,12 +79,12 @@
 /**
 * MPU Memory Access Attributes
-* 
+*
 * \param TypeExtField      Type extension field, allows you to configure memory access type, for example strongly ordered, peripheral.
 * \param IsShareable       Region is shareable between multiple bus masters.
 * \param IsCacheable       Region is cacheable, i.e. its value may be kept in cache.
 * \param IsBufferable      Region is bufferable, i.e. using write-back caching. Cacheable but non-bufferable regions use write-through policy.
-*/  
+*/
 #define ARM_MPU_ACCESS_(TypeExtField, IsShareable, IsCacheable, IsBufferable)   \
  ((((TypeExtField) << MPU_RASR_TEX_Pos) & MPU_RASR_TEX_Msk)                  | \
   (((IsShareable)  << MPU_RASR_S_Pos)   & MPU_RASR_S_Msk)                    | \
@@ -93,7 +93,7 @@
 /**
 * MPU Region Attribute and Size Register Value
-* 
+*
 * \param DisableExec       Instruction access disable bit, 1= disable instruction fetches.
 * \param AccessPermission  Data access permissions, allows you to configure read/write access for User and Privileged mode.
 * \param AccessAttributes  Memory access attribution, see \ref ARM_MPU_ACCESS_.
@@ -110,7 +110,7 @@
 /**
 * MPU Region Attribute and Size Register Value
-* 
+*
 * \param DisableExec       Instruction access disable bit, 1= disable instruction fetches.
 * \param AccessPermission  Data access permissions, allows you to configure read/write access for User and Privileged mode.
 * \param TypeExtField      Type extension field, allows you to configure memory access type, for example strongly ordered, peripheral.
@@ -119,7 +119,7 @@
 * \param IsBufferable      Region is bufferable, i.e. using write-back caching. Cacheable but non-bufferable regions use write-through policy.
 * \param SubRegionDisable  Sub-region disable field.
 * \param Size              Region size of the region to be configured, for example 4K, 8K.
-*/                         
+*/
 #define ARM_MPU_RASR(DisableExec, AccessPermission, TypeExtField, IsShareable, IsCacheable, IsBufferable, SubRegionDisable, Size) \
  ARM_MPU_RASR_EX(DisableExec, AccessPermission, ARM_MPU_ACCESS_(TypeExtField, IsShareable, IsCacheable, IsBufferable), SubRegionDisable, Size)
@@ -129,7 +129,7 @@
 *  - Shareable
 *  - Non-cacheable
 *  - Non-bufferable
-*/ 
+*/
 #define ARM_MPU_ACCESS_ORDERED ARM_MPU_ACCESS_(0U, 1U, 0U, 0U)
 /**
@@ -140,7 +140,7 @@
 *  - Bufferable (if shareable) or non-bufferable (if non-shareable)
 *
 * \param IsShareable Configures the device memory as shareable or non-shareable.
-*/ 
+*/
 #define ARM_MPU_ACCESS_DEVICE(IsShareable) ((IsShareable) ? ARM_MPU_ACCESS_(0U, 1U, 0U, 1U) : ARM_MPU_ACCESS_(2U, 0U, 0U, 0U))
 /**
@@ -153,7 +153,7 @@
 * \param OuterCp Configures the outer cache policy.
 * \param InnerCp Configures the inner cache policy.
 * \param IsShareable Configures the memory as shareable or non-shareable.
-*/ 
+*/
 #define ARM_MPU_ACCESS_NORMAL(OuterCp, InnerCp, IsShareable) ARM_MPU_ACCESS_((4U | (OuterCp)), IsShareable, ((InnerCp) >> 1U), ((InnerCp) & 1U))
 /**
@@ -184,7 +184,7 @@ typedef struct {
  uint32_t RBAR; //!< The region base address register value (RBAR)
  uint32_t RASR; //!< The region attribute and size register value (RASR) \ref MPU_RASR
 } ARM_MPU_Region_t;
-    
+
 /** Enable the MPU.
 * \param MPU_Control Default access permissions for unconfigured regions.
 */
@@ -224,7 +224,7 @@ __STATIC_INLINE void ARM_MPU_ClrRegion(uint32_t rnr)
 /** Configure an MPU region.
 * \param rbar Value for RBAR register.
 * \param rasr Value for RASR register.
-*/   
+*/
 __STATIC_INLINE void ARM_MPU_SetRegion(uint32_t rbar, uint32_t rasr)
 {
  MPU->RBAR = rbar;
@@ -235,7 +235,7 @@ __STATIC_INLINE void ARM_MPU_SetRegion(uint32_t rbar, uint32_t rasr)
 * \param rnr Region number to be configured.
 * \param rbar Value for RBAR register.
 * \param rasr Value for RASR register.
-*/   
+*/
 __STATIC_INLINE void ARM_MPU_SetRegionEx(uint32_t rnr, uint32_t rbar, uint32_t rasr)
 {
  MPU->RNR = rnr;
@@ -251,7 +251,7 @@ __STATIC_INLINE void ARM_MPU_SetRegionEx(uint32_t rnr, uint32_t rbar, uint32_t r
 __STATIC_INLINE void ARM_MPU_OrderedMemcpy(volatile uint32_t* dst, const uint32_t* __RESTRICT src, uint32_t len)
 {
  uint32_t i;
-  for (i = 0U; i < len; ++i) 
+  for (i = 0U; i < len; ++i)
  {
    dst[i] = src[i];
  }
@@ -261,7 +261,7 @@ __STATIC_INLINE void ARM_MPU_OrderedMemcpy(volatile uint32_t* dst, const uint32_
 * \param table Pointer to the MPU configuration table.
 * \param cnt Amount of regions to be configured.
 */
-__STATIC_INLINE void ARM_MPU_Load(ARM_MPU_Region_t const* table, uint32_t cnt) 
+__STATIC_INLINE void ARM_MPU_Load(ARM_MPU_Region_t const* table, uint32_t cnt)
 {
  const uint32_t rowWordSize = sizeof(ARM_MPU_Region_t)/4U;
  while (cnt > MPU_TYPE_RALIASES) {
--- a/libraries/cmsis/cm4/core_support/mpu_armv8.h
+++ b/libraries/cmsis/cm4/core_support/mpu_armv8.h
@@ -102,7 +102,7 @@
  (MPU_RLAR_EN_Msk))
 #if defined(MPU_RLAR_PXN_Pos)
-  
+
 /** \brief Region Limit Address Register with PXN value
 * \param LIMIT The limit address bits [31:5] for this memory region. The value is one extended.
 * \param PXN Privileged execute never. Defines whether code can be executed from this privileged region.
@@ -113,7 +113,7 @@
  (((PXN) << MPU_RLAR_PXN_Pos) & MPU_RLAR_PXN_Msk) | \
  (((IDX) << MPU_RLAR_AttrIndx_Pos) & MPU_RLAR_AttrIndx_Msk) | \
  (MPU_RLAR_EN_Msk))
-  
+
 #endif
 /**
@@ -123,7 +123,7 @@ typedef struct {
  uint32_t RBAR;                   /*!< Region Base Address Register value */
  uint32_t RLAR;                   /*!< Region Limit Address Register value */
 } ARM_MPU_Region_t;
-    
+
 /** Enable the MPU.
 * \param MPU_Control Default access permissions for unconfigured regions.
 */
@@ -190,11 +190,11 @@ __STATIC_INLINE void ARM_MPU_SetMemAttrEx(MPU_Type* mpu, uint8_t idx, uint8_t at
  const uint8_t reg = idx / 4U;
  const uint32_t pos = ((idx % 4U) * 8U);
  const uint32_t mask = 0xFFU << pos;
-  
+
  if (reg >= (sizeof(mpu->MAIR) / sizeof(mpu->MAIR[0]))) {
    return; // invalid index
  }
-  
+
  mpu->MAIR[reg] = ((mpu->MAIR[reg] & ~mask) | ((attr << pos) & mask));
 }
@@ -241,7 +241,7 @@ __STATIC_INLINE void ARM_MPU_ClrRegion(uint32_t rnr)
 * \param rnr Region number to be cleared.
 */
 __STATIC_INLINE void ARM_MPU_ClrRegion_NS(uint32_t rnr)
-{  
+{
  ARM_MPU_ClrRegionEx(MPU_NS, rnr);
 }
 #endif
@@ -251,7 +251,7 @@ __STATIC_INLINE void ARM_MPU_ClrRegion_NS(uint32_t rnr)
 * \param rnr Region number to be configured.
 * \param rbar Value for RBAR register.
 * \param rlar Value for RLAR register.
-*/   
+*/
 __STATIC_INLINE void ARM_MPU_SetRegionEx(MPU_Type* mpu, uint32_t rnr, uint32_t rbar, uint32_t rlar)
 {
  mpu->RNR = rnr;
@@ -263,7 +263,7 @@ __STATIC_INLINE void ARM_MPU_SetRegionEx(MPU_Type* mpu, uint32_t rnr, uint32_t r
 * \param rnr Region number to be configured.
 * \param rbar Value for RBAR register.
 * \param rlar Value for RLAR register.
-*/   
+*/
 __STATIC_INLINE void ARM_MPU_SetRegion(uint32_t rnr, uint32_t rbar, uint32_t rlar)
 {
  ARM_MPU_SetRegionEx(MPU, rnr, rbar, rlar);
@@ -274,10 +274,10 @@ __STATIC_INLINE void ARM_MPU_SetRegion(uint32_t rnr, uint32_t rbar, uint32_t rla
 * \param rnr Region number to be configured.
 * \param rbar Value for RBAR register.
 * \param rlar Value for RLAR register.
-*/   
+*/
 __STATIC_INLINE void ARM_MPU_SetRegion_NS(uint32_t rnr, uint32_t rbar, uint32_t rlar)
 {
-  ARM_MPU_SetRegionEx(MPU_NS, rnr, rbar, rlar);  
+  ARM_MPU_SetRegionEx(MPU_NS, rnr, rbar, rlar);
 }
 #endif
@@ -289,7 +289,7 @@ __STATIC_INLINE void ARM_MPU_SetRegion_NS(uint32_t rnr, uint32_t rbar, uint32_t
 __STATIC_INLINE void ARM_MPU_OrderedMemcpy(volatile uint32_t* dst, const uint32_t* __RESTRICT src, uint32_t len)
 {
  uint32_t i;
-  for (i = 0U; i < len; ++i) 
+  for (i = 0U; i < len; ++i)
  {
    dst[i] = src[i];
  }
@@ -301,7 +301,7 @@ __STATIC_INLINE void ARM_MPU_OrderedMemcpy(volatile uint32_t* dst, const uint32_
 * \param table Pointer to the MPU configuration table.
 * \param cnt Amount of regions to be configured.
 */
-__STATIC_INLINE void ARM_MPU_LoadEx(MPU_Type* mpu, uint32_t rnr, ARM_MPU_Region_t const* table, uint32_t cnt) 
+__STATIC_INLINE void ARM_MPU_LoadEx(MPU_Type* mpu, uint32_t rnr, ARM_MPU_Region_t const* table, uint32_t cnt)
 {
  const uint32_t rowWordSize = sizeof(ARM_MPU_Region_t)/4U;
  if (cnt == 1U) {
@@ -310,7 +310,7 @@ __STATIC_INLINE void ARM_MPU_LoadEx(MPU_Type* mpu, uint32_t rnr, ARM_MPU_Region_
  } else {
    uint32_t rnrBase   = rnr & ~(MPU_TYPE_RALIASES-1U);
    uint32_t rnrOffset = rnr % MPU_TYPE_RALIASES;
-    
+
    mpu->RNR = rnrBase;
    while ((rnrOffset + cnt) > MPU_TYPE_RALIASES) {
      uint32_t c = MPU_TYPE_RALIASES - rnrOffset;
@@ -321,7 +321,7 @@ __STATIC_INLINE void ARM_MPU_LoadEx(MPU_Type* mpu, uint32_t rnr, ARM_MPU_Region_
      rnrBase += MPU_TYPE_RALIASES;
      mpu->RNR = rnrBase;
    }
-    
+
    ARM_MPU_OrderedMemcpy(&(mpu->RBAR)+(rnrOffset*2U), &(table->RBAR), cnt*rowWordSize);
  }
 }
@@ -331,7 +331,7 @@ __STATIC_INLINE void ARM_MPU_LoadEx(MPU_Type* mpu, uint32_t rnr, ARM_MPU_Region_
 * \param table Pointer to the MPU configuration table.
 * \param cnt Amount of regions to be configured.
 */
-__STATIC_INLINE void ARM_MPU_Load(uint32_t rnr, ARM_MPU_Region_t const* table, uint32_t cnt) 
+__STATIC_INLINE void ARM_MPU_Load(uint32_t rnr, ARM_MPU_Region_t const* table, uint32_t cnt)
 {
  ARM_MPU_LoadEx(MPU, rnr, table, cnt);
 }
@@ -342,7 +342,7 @@ __STATIC_INLINE void ARM_MPU_Load(uint32_t rnr, ARM_MPU_Region_t const* table, u
 * \param table Pointer to the MPU configuration table.
 * \param cnt Amount of regions to be configured.
 */
-__STATIC_INLINE void ARM_MPU_Load_NS(uint32_t rnr, ARM_MPU_Region_t const* table, uint32_t cnt) 
+__STATIC_INLINE void ARM_MPU_Load_NS(uint32_t rnr, ARM_MPU_Region_t const* table, uint32_t cnt)
 {
  ARM_MPU_LoadEx(MPU_NS, rnr, table, cnt);
 }
--- a/libraries/cmsis/cm4/core_support/pmu_armv8.h
+++ b/libraries/cmsis/cm4/core_support/pmu_armv8.h
@@ -192,23 +192,23 @@ __STATIC_INLINE void ARM_PMU_Set_CNTR_IRQ_Disable(uint32_t mask);
 __STATIC_INLINE void ARM_PMU_CNTR_Increment(uint32_t mask);
-/** 
+/**
  \brief   Enable the PMU
 */
-__STATIC_INLINE void ARM_PMU_Enable(void) 
+__STATIC_INLINE void ARM_PMU_Enable(void)
 {
  PMU->CTRL |= PMU_CTRL_ENABLE_Msk;
 }
-/** 
+/**
  \brief   Disable the PMU
 */
-__STATIC_INLINE void ARM_PMU_Disable(void) 
+__STATIC_INLINE void ARM_PMU_Disable(void)
 {
  PMU->CTRL &= ~PMU_CTRL_ENABLE_Msk;
 }
-/** 
+/**
  \brief   Set event to count for PMU event counter
  \param [in]    num     Event counter (0-30) to configure
  \param [in]    type    Event to count
@@ -218,7 +218,7 @@ __STATIC_INLINE void ARM_PMU_Set_EVTYPER(uint32_t num, uint32_t type)
  PMU->EVTYPER[num] = type;
 }
-/** 
+/**
  \brief  Reset cycle counter
 */
 __STATIC_INLINE void ARM_PMU_CYCCNT_Reset(void)
@@ -226,7 +226,7 @@ __STATIC_INLINE void ARM_PMU_CYCCNT_Reset(void)
  PMU->CTRL |= PMU_CTRL_CYCCNT_RESET_Msk;
 }
-/** 
+/**
  \brief  Reset all event counters
 */
 __STATIC_INLINE void ARM_PMU_EVCNTR_ALL_Reset(void)
@@ -234,8 +234,8 @@ __STATIC_INLINE void ARM_PMU_EVCNTR_ALL_Reset(void)
  PMU->CTRL |= PMU_CTRL_EVENTCNT_RESET_Msk;
 }
-/** 
+/**
-  \brief  Enable counters 
+  \brief  Enable counters
  \param [in]     mask    Counters to enable
  \note   Enables one or more of the following:
          - event counters (0-30)
@@ -246,7 +246,7 @@ __STATIC_INLINE void ARM_PMU_CNTR_Enable(uint32_t mask)
  PMU->CNTENSET = mask;
 }
-/** 
+/**
  \brief  Disable counters
  \param [in]     mask    Counters to disable
  \note   Disables one or more of the following:
@@ -258,7 +258,7 @@ __STATIC_INLINE void ARM_PMU_CNTR_Disable(uint32_t mask)
  PMU->CNTENCLR = mask;
 }
-/** 
+/**
  \brief  Read cycle counter
  \return                 Cycle count
 */
@@ -267,7 +267,7 @@ __STATIC_INLINE uint32_t ARM_PMU_Get_CCNTR(void)
  return PMU->CCNTR;
 }
-/** 
+/**
  \brief   Read event counter
  \param [in]     num     Event counter (0-30) to read
  \return                 Event count
@@ -277,7 +277,7 @@ __STATIC_INLINE uint32_t ARM_PMU_Get_EVCNTR(uint32_t num)
  return PMU->EVCNTR[num];
 }
-/** 
+/**
  \brief   Read counter overflow status
  \return  Counter overflow status bits for the following:
          - event counters (0-30)
@@ -285,10 +285,10 @@ __STATIC_INLINE uint32_t ARM_PMU_Get_EVCNTR(uint32_t num)
 */
 __STATIC_INLINE uint32_t ARM_PMU_Get_CNTR_OVS(void)
 {
-  return PMU->OVSSET;	
+  return PMU->OVSSET;
 }
-/** 
+/**
  \brief   Clear counter overflow status
  \param [in]     mask    Counter overflow status bits to clear
  \note    Clears overflow status bits for one or more of the following:
@@ -300,8 +300,8 @@ __STATIC_INLINE void ARM_PMU_Set_CNTR_OVS(uint32_t mask)
  PMU->OVSCLR = mask;
 }
-/** 
+/**
-  \brief   Enable counter overflow interrupt request 
+  \brief   Enable counter overflow interrupt request
  \param [in]     mask    Counter overflow interrupt request bits to set
  \note    Sets overflow interrupt request bits for one or more of the following:
           - event counters (0-30)
@@ -312,8 +312,8 @@ __STATIC_INLINE void ARM_PMU_Set_CNTR_IRQ_Enable(uint32_t mask)
  PMU->INTENSET = mask;
 }
-/** 
+/**
-  \brief   Disable counter overflow interrupt request 
+  \brief   Disable counter overflow interrupt request
  \param [in]     mask    Counter overflow interrupt request bits to clear
  \note    Clears overflow interrupt request bits for one or more of the following:
           - event counters (0-30)
@@ -324,8 +324,8 @@ __STATIC_INLINE void ARM_PMU_Set_CNTR_IRQ_Disable(uint32_t mask)
  PMU->INTENCLR = mask;
 }
-/** 
+/**
-  \brief   Software increment event counter 
+  \brief   Software increment event counter
  \param [in]     mask    Counters to increment
  \note    Software increment bits for one or more event counters (0-30)
 */
--- a/libraries/cmsis/cm4/device_support/at32f403a_407.h
+++ b/libraries/cmsis/cm4/device_support/at32f403a_407.h
@@ -1,17 +1,17 @@
 /**
  **************************************************************************
  * @file     at32f403a_407.h
-  * @version  v2.0.7
+  * @version  v2.0.8
-  * @date     2022-02-11
+  * @date     2022-04-02
  * @brief    at32f403a_407 header file
  **************************************************************************
  *                       Copyright notice & Disclaimer
  *
-  * The software Board Support Package (BSP) that is made available to 
+  * The software Board Support Package (BSP) that is made available to
-  * download from Artery official website is the copyrighted work of Artery. 
+  * download from Artery official website is the copyrighted work of Artery.
-  * Artery authorizes customers to use, copy, and distribute the BSP 
+  * Artery authorizes customers to use, copy, and distribute the BSP
-  * software and its related documentation for the purpose of design and 
+  * software and its related documentation for the purpose of design and
-  * development in conjunction with Artery microcontrollers. Use of the 
+  * development in conjunction with Artery microcontrollers. Use of the
  * software is governed by this copyright notice and the following disclaimer.
  *
  * THIS SOFTWARE IS PROVIDED ON "AS IS" BASIS WITHOUT WARRANTIES,
@@ -42,7 +42,7 @@ extern "C" {
 /** @addtogroup AT32F403A_407
  * @{
  */
-  
+
 /** @addtogroup Library_configuration_section
  * @{
  */
@@ -81,8 +81,8 @@ extern "C" {
 #ifndef USE_STDPERIPH_DRIVER
 /**
  * @brief comment the line below if you will not use the peripherals drivers.
-  * in this case, these drivers will not be included and the application code will 
+  * in this case, these drivers will not be included and the application code will
-  * be based on direct access to peripherals registers 
+  * be based on direct access to peripherals registers
  */
  #ifdef _RTE_
    #include "RTE_Components.h"
@@ -97,7 +97,7 @@ extern "C" {
  */
 #define __AT32F403A_407_LIBRARY_VERSION_MAJOR    (0x02) /*!< [31:24] major version */
 #define __AT32F403A_407_LIBRARY_VERSION_MIDDLE   (0x00) /*!< [23:16] middle version */
-#define __AT32F403A_407_LIBRARY_VERSION_MINOR    (0x07) /*!< [15:8]  minor version */
+#define __AT32F403A_407_LIBRARY_VERSION_MINOR    (0x08) /*!< [15:8]  minor version */
 #define __AT32F403A_407_LIBRARY_VERSION_RC       (0x00) /*!< [7:0]  release candidate */
 #define __AT32F403A_407_LIBRARY_VERSION          ((__AT32F403A_407_LIBRARY_VERSION_MAJOR << 24)  | \
                                                  (__AT32F403A_407_LIBRARY_VERSION_MIDDLE << 16) | \
@@ -293,7 +293,7 @@ typedef enum IRQn
 /** @addtogroup Exported_types
  * @{
-  */  
+  */
 typedef int32_t  INT32;
 typedef int16_t  INT16;
@@ -335,19 +335,19 @@ typedef __I uint16_t vuc16;   /*!< read only */
 typedef __I uint8_t  vuc8;    /*!< read only */
 /**
-  * @brief flag status 
+  * @brief flag status
  */
-typedef enum {RESET = 0, SET = !RESET} flag_status;  
+typedef enum {RESET = 0, SET = !RESET} flag_status;
 /**
  * @brief confirm state
-  */       
+  */
-typedef enum {FALSE = 0, TRUE = !FALSE} confirm_state;     
+typedef enum {FALSE = 0, TRUE = !FALSE} confirm_state;
 /**
  * @brief error status
-  */  
+  */
-typedef enum {ERROR = 0, SUCCESS = !ERROR} error_status;   
+typedef enum {ERROR = 0, SUCCESS = !ERROR} error_status;
 /**
  * @}
@@ -415,7 +415,7 @@ typedef enum {ERROR = 0, SUCCESS = !ERROR} error_status;
 #define BPR_BASE                         (APB1PERIPH_BASE + 0x6C00)
 #define PWC_BASE                         (APB1PERIPH_BASE + 0x7000)
 #define DAC_BASE                         (APB1PERIPH_BASE + 0x7400)
-/* apb2 bus base address */              
+/* apb2 bus base address */
 #define IOMUX_BASE                       (APB2PERIPH_BASE + 0x0000)
 #define EXINT_BASE                       (APB2PERIPH_BASE + 0x0400)
 #define GPIOA_BASE                       (APB2PERIPH_BASE + 0x0800)
@@ -441,7 +441,7 @@ typedef enum {ERROR = 0, SUCCESS = !ERROR} error_status;
 #define I2S2EXT_BASE                     (APB2PERIPH_BASE + 0x6C00)
 #define I2S3EXT_BASE                     (APB2PERIPH_BASE + 0x7000)
 #define SDIO1_BASE                       (APB2PERIPH_BASE + 0x8000)
-/* ahb bus base address */               
+/* ahb bus base address */
 #define DMA1_BASE                        (AHBPERIPH_BASE + 0x0000)
 #define DMA1_CHANNEL1_BASE               (AHBPERIPH_BASE + 0x0008)
 #define DMA1_CHANNEL2_BASE               (AHBPERIPH_BASE + 0x001C)
@@ -499,7 +499,7 @@ typedef enum {ERROR = 0, SUCCESS = !ERROR} error_status;
 #define BPR_BASE                         (APB1PERIPH_BASE + 0x6C00)
 #define PWC_BASE                         (APB1PERIPH_BASE + 0x7000)
 #define DAC_BASE                         (APB1PERIPH_BASE + 0x7400)
-/* apb2 bus base address */              
+/* apb2 bus base address */
 #define IOMUX_BASE                       (APB2PERIPH_BASE + 0x0000)
 #define EXINT_BASE                       (APB2PERIPH_BASE + 0x0400)
 #define GPIOA_BASE                       (APB2PERIPH_BASE + 0x0800)
@@ -525,7 +525,7 @@ typedef enum {ERROR = 0, SUCCESS = !ERROR} error_status;
 #define I2S2EXT_BASE                     (APB2PERIPH_BASE + 0x6C00)
 #define I2S3EXT_BASE                     (APB2PERIPH_BASE + 0x7000)
 #define SDIO1_BASE                       (APB2PERIPH_BASE + 0x8000)
-/* ahb bus base address */               
+/* ahb bus base address */
 #define DMA1_BASE                        (AHBPERIPH_BASE + 0x0000)
 #define DMA1_CHANNEL1_BASE               (AHBPERIPH_BASE + 0x0008)
 #define DMA1_CHANNEL2_BASE               (AHBPERIPH_BASE + 0x001C)
@@ -566,7 +566,7 @@ typedef enum {ERROR = 0, SUCCESS = !ERROR} error_status;
  * @}
  */
-#include "at32f403a_407_def.h"  
+#include "at32f403a_407_def.h"
 #include "at32f403a_407_conf.h"
 #ifdef __cplusplus
--- a/libraries/cmsis/cm4/device_support/at32f403a_407_conf_template.h
+++ b/libraries/cmsis/cm4/device_support/at32f403a_407_conf_template.h
@@ -1,17 +1,17 @@
 /**
  **************************************************************************
  * @file     at32f403a_407_conf.h
-  * @version  v2.0.7
+  * @version  v2.0.8
-  * @date     2022-02-11
+  * @date     2022-04-02
  * @brief    at32f403a_407 config header file
  **************************************************************************
  *                       Copyright notice & Disclaimer
  *
-  * The software Board Support Package (BSP) that is made available to 
+  * The software Board Support Package (BSP) that is made available to
-  * download from Artery official website is the copyrighted work of Artery. 
+  * download from Artery official website is the copyrighted work of Artery.
-  * Artery authorizes customers to use, copy, and distribute the BSP 
+  * Artery authorizes customers to use, copy, and distribute the BSP
-  * software and its related documentation for the purpose of design and 
+  * software and its related documentation for the purpose of design and
-  * development in conjunction with Artery microcontrollers. Use of the 
+  * development in conjunction with Artery microcontrollers. Use of the
  * software is governed by this copyright notice and the following disclaimer.
  *
  * THIS SOFTWARE IS PROVIDED ON "AS IS" BASIS WITHOUT WARRANTIES,
@@ -31,7 +31,7 @@
 #ifdef __cplusplus
 extern "C" {
 #endif
- 
+
 /**
  * @brief in the following line adjust the value of high speed external crystal (hext)
--- a/libraries/cmsis/cm4/device_support/startup/gcc/linker/AT32F403AxC_FLASH.ld
+++ b/libraries/cmsis/cm4/device_support/startup/gcc/linker/AT32F403AxC_FLASH.ld
@@ -22,7 +22,7 @@
 ENTRY(Reset_Handler)
 /* Highest address of the user mode stack */
-_estack = 0x20017FFF;    /* end of RAM */
+_estack = 0x20018000;    /* end of RAM */
 /* Generate a link error if heap and stack don't fit into RAM */
 _Min_Heap_Size = 0x200;      /* required amount of heap  */
@@ -134,12 +134,12 @@ SECTIONS
  /* User_heap_stack section, used to check that there is enough RAM left */
  ._user_heap_stack :
  {
-    . = ALIGN(4);
+    . = ALIGN(8);
    PROVIDE ( end = . );
    PROVIDE ( _end = . );
    . = . + _Min_Heap_Size;
    . = . + _Min_Stack_Size;
-    . = ALIGN(4);
+    . = ALIGN(8);
  } >RAM
  /* Remove information from the standard libraries */
--- a/libraries/cmsis/cm4/device_support/startup/gcc/linker/AT32F403AxE_FLASH.ld
+++ b/libraries/cmsis/cm4/device_support/startup/gcc/linker/AT32F403AxE_FLASH.ld
@@ -22,7 +22,7 @@
 ENTRY(Reset_Handler)
 /* Highest address of the user mode stack */
-_estack = 0x20017FFF;    /* end of RAM */
+_estack = 0x20018000;    /* end of RAM */
 /* Generate a link error if heap and stack don't fit into RAM */
 _Min_Heap_Size = 0x200;      /* required amount of heap  */
@@ -134,12 +134,12 @@ SECTIONS
  /* User_heap_stack section, used to check that there is enough RAM left */
  ._user_heap_stack :
  {
-    . = ALIGN(4);
+    . = ALIGN(8);
    PROVIDE ( end = . );
    PROVIDE ( _end = . );
    . = . + _Min_Heap_Size;
    . = . + _Min_Stack_Size;
-    . = ALIGN(4);
+    . = ALIGN(8);
  } >RAM
  /* Remove information from the standard libraries */
--- a/libraries/cmsis/cm4/device_support/startup/gcc/linker/AT32F403AxG_FLASH.ld
+++ b/libraries/cmsis/cm4/device_support/startup/gcc/linker/AT32F403AxG_FLASH.ld
@@ -22,7 +22,7 @@
 ENTRY(Reset_Handler)
 /* Highest address of the user mode stack */
-_estack = 0x20017FFF;    /* end of RAM */
+_estack = 0x20018000;    /* end of RAM */
 /* Generate a link error if heap and stack don't fit into RAM */
 _Min_Heap_Size = 0x200;      /* required amount of heap  */
@@ -134,12 +134,12 @@ SECTIONS
  /* User_heap_stack section, used to check that there is enough RAM left */
  ._user_heap_stack :
  {
-    . = ALIGN(4);
+    . = ALIGN(8);
    PROVIDE ( end = . );
    PROVIDE ( _end = . );
    . = . + _Min_Heap_Size;
    . = . + _Min_Stack_Size;
-    . = ALIGN(4);
+    . = ALIGN(8);
  } >RAM
  /* Remove information from the standard libraries */
--- a/libraries/cmsis/cm4/device_support/startup/gcc/linker/AT32F407xC_FLASH.ld
+++ b/libraries/cmsis/cm4/device_support/startup/gcc/linker/AT32F407xC_FLASH.ld
@@ -22,7 +22,7 @@
 ENTRY(Reset_Handler)
 /* Highest address of the user mode stack */
-_estack = 0x20017FFF;    /* end of RAM */
+_estack = 0x20018000;    /* end of RAM */
 /* Generate a link error if heap and stack don't fit into RAM */
 _Min_Heap_Size = 0x200;      /* required amount of heap  */
@@ -134,12 +134,12 @@ SECTIONS
  /* User_heap_stack section, used to check that there is enough RAM left */
  ._user_heap_stack :
  {
-    . = ALIGN(4);
+    . = ALIGN(8);
    PROVIDE ( end = . );
    PROVIDE ( _end = . );
    . = . + _Min_Heap_Size;
    . = . + _Min_Stack_Size;
-    . = ALIGN(4);
+    . = ALIGN(8);
  } >RAM
  /* Remove information from the standard libraries */
--- a/libraries/cmsis/cm4/device_support/startup/gcc/linker/AT32F407xE_FLASH.ld
+++ b/libraries/cmsis/cm4/device_support/startup/gcc/linker/AT32F407xE_FLASH.ld
@@ -22,7 +22,7 @@
 ENTRY(Reset_Handler)
 /* Highest address of the user mode stack */
-_estack = 0x20017FFF;    /* end of RAM */
+_estack = 0x20018000;    /* end of RAM */
 /* Generate a link error if heap and stack don't fit into RAM */
 _Min_Heap_Size = 0x200;      /* required amount of heap  */
@@ -134,12 +134,12 @@ SECTIONS
  /* User_heap_stack section, used to check that there is enough RAM left */
  ._user_heap_stack :
  {
-    . = ALIGN(4);
+    . = ALIGN(8);
    PROVIDE ( end = . );
    PROVIDE ( _end = . );
    . = . + _Min_Heap_Size;
    . = . + _Min_Stack_Size;
-    . = ALIGN(4);
+    . = ALIGN(8);
  } >RAM
  /* Remove information from the standard libraries */
--- a/libraries/cmsis/cm4/device_support/startup/gcc/linker/AT32F407xG_FLASH.ld
+++ b/libraries/cmsis/cm4/device_support/startup/gcc/linker/AT32F407xG_FLASH.ld
@@ -22,7 +22,7 @@
 ENTRY(Reset_Handler)
 /* Highest address of the user mode stack */
-_estack = 0x20017FFF;    /* end of RAM */
+_estack = 0x20018000;    /* end of RAM */
 /* Generate a link error if heap and stack don't fit into RAM */
 _Min_Heap_Size = 0x200;      /* required amount of heap  */
@@ -134,12 +134,12 @@ SECTIONS
  /* User_heap_stack section, used to check that there is enough RAM left */
  ._user_heap_stack :
  {
-    . = ALIGN(4);
+    . = ALIGN(8);
    PROVIDE ( end = . );
    PROVIDE ( _end = . );
    . = . + _Min_Heap_Size;
    . = . + _Min_Stack_Size;
-    . = ALIGN(4);
+    . = ALIGN(8);
  } >RAM
  /* Remove information from the standard libraries */
--- a/libraries/cmsis/cm4/device_support/startup/gcc/startup_at32f403a_407.s
+++ b/libraries/cmsis/cm4/device_support/startup/gcc/startup_at32f403a_407.s
@@ -1,8 +1,8 @@
 /**
  ******************************************************************************
  * @file     startup_at32f403a_407.s
-  * @version  v2.0.7
+  * @version  v2.0.8
-  * @date     2022-02-11
+  * @date     2022-04-02
  * @brief    at32f403a_407xx devices vector table for gcc toolchain.
  *           this module performs:
  *           - set the initial sp
@@ -106,7 +106,7 @@ Infinite_Loop:
 * The minimal vector table for a Cortex M4. Note that the proper constructs
 * must be placed on this to ensure that it ends up at physical address
 * 0x0000.0000.
-* 
+*
 *******************************************************************************/
   .section  .isr_vector,"a",%progbits
  .type  g_pfnVectors, %object
@@ -130,7 +130,7 @@ g_pfnVectors:
  .word  0
  .word  PendSV_Handler
  .word  SysTick_Handler
-  
+
  /* External Interrupts */
  .word  WWDT_IRQHandler                     /* Window Watchdog Timer                   */
  .word  PVM_IRQHandler                      /* PVM through EXINT Line detect           */
@@ -216,20 +216,20 @@ g_pfnVectors:
 /*******************************************************************************
 *
-* Provide weak aliases for each Exception handler to the Default_Handler. 
+* Provide weak aliases for each Exception handler to the Default_Handler.
-* As they are weak aliases, any function with the same name will override 
+* As they are weak aliases, any function with the same name will override
 * this definition.
-* 
+*
 *******************************************************************************/
   .weak      NMI_Handler
   .thumb_set NMI_Handler,Default_Handler
-  
+
   .weak      HardFault_Handler
   .thumb_set HardFault_Handler,Default_Handler
-  
+
   .weak      MemManage_Handler
   .thumb_set MemManage_Handler,Default_Handler
-  
+
   .weak      BusFault_Handler
   .thumb_set BusFault_Handler,Default_Handler
@@ -246,10 +246,10 @@ g_pfnVectors:
   .thumb_set PendSV_Handler,Default_Handler
   .weak      SysTick_Handler
-   .thumb_set SysTick_Handler,Default_Handler              
+   .thumb_set SysTick_Handler,Default_Handler
-  
+
   .weak      WWDT_IRQHandler
-   .thumb_set WWDT_IRQHandler,Default_Handler      
+   .thumb_set WWDT_IRQHandler,Default_Handler
   .weak      PVM_IRQHandler
   .thumb_set PVM_IRQHandler,Default_Handler
@@ -273,7 +273,7 @@ g_pfnVectors:
   .thumb_set EXINT1_IRQHandler,Default_Handler
   .weak      EXINT2_IRQHandler
-   .thumb_set EXINT2_IRQHandler,Default_Handler 
+   .thumb_set EXINT2_IRQHandler,Default_Handler
   .weak      EXINT3_IRQHandler
   .thumb_set EXINT3_IRQHandler,Default_Handler
@@ -291,7 +291,7 @@ g_pfnVectors:
   .thumb_set DMA1_Channel3_IRQHandler,Default_Handler
   .weak      DMA1_Channel4_IRQHandler
-   .thumb_set DMA1_Channel4_IRQHandler,Default_Handler 
+   .thumb_set DMA1_Channel4_IRQHandler,Default_Handler
   .weak      DMA1_Channel5_IRQHandler
   .thumb_set DMA1_Channel5_IRQHandler,Default_Handler
@@ -443,10 +443,10 @@ g_pfnVectors:
   .weak      CAN2_TX_IRQHandler
   .thumb_set CAN2_TX_IRQHandler,Default_Handler
-   .weak      CAN2_RX0_IRQHandler 
+   .weak      CAN2_RX0_IRQHandler
   .thumb_set CAN2_RX0_IRQHandler ,Default_Handler
-   .weak      CAN2_RX1_IRQHandler 
+   .weak      CAN2_RX1_IRQHandler
   .thumb_set CAN2_RX1_IRQHandler ,Default_Handler
   .weak      CAN2_SE_IRQHandler
--- a/libraries/cmsis/cm4/device_support/startup/iar/startup_at32f403a_407.s
+++ b/libraries/cmsis/cm4/device_support/startup/iar/startup_at32f403a_407.s
@@ -1,7 +1,7 @@
 ;**************************************************************************
 ;* @file     startup_at32f403a_407.s
-;* @version  v2.0.7
+;* @version  v2.0.8
-;* @date     2022-02-11
+;* @date     2022-04-02
 ;* @brief    at32f403a_407 startup file for IAR Systems
 ;**************************************************************************
 ;
@@ -64,8 +64,8 @@ __vector_table
        DCD     DMA1_Channel7_IRQHandler            ; DMA1 Channel 7
        DCD     ADC1_2_IRQHandler                   ; ADC1 & ADC2
        DCD     USBFS_H_CAN1_TX_IRQHandler          ; USB High Priority or CAN1 TX
-        DCD     USBFS_L_CAN1_RX0_IRQHandler         ; USB Low  Priority or CAN1 RX0 
+        DCD     USBFS_L_CAN1_RX0_IRQHandler         ; USB Low  Priority or CAN1 RX0
-        DCD     CAN1_RX1_IRQHandler                 ; CAN1 RX1 
+        DCD     CAN1_RX1_IRQHandler                 ; CAN1 RX1
        DCD     CAN1_SE_IRQHandler                  ; CAN1 SE
        DCD     EXINT9_5_IRQHandler                 ; EXINT Line [9:5]
        DCD     TMR1_BRK_TMR9_IRQHandler            ; TMR1 Brake and TMR9
@@ -113,8 +113,8 @@ __vector_table
        DCD     0                                   ; Reserved
        DCD     0                                   ; Reserved
        DCD     CAN2_TX_IRQHandler                  ; CAN2 TX
-        DCD     CAN2_RX0_IRQHandler                 ; CAN2 RX0 
+        DCD     CAN2_RX0_IRQHandler                 ; CAN2 RX0
-        DCD     CAN2_RX1_IRQHandler                 ; CAN2 RX1 
+        DCD     CAN2_RX1_IRQHandler                 ; CAN2 RX1
        DCD     CAN2_SE_IRQHandler                  ; CAN2 SE
        DCD     ACC_IRQHandler                      ; ACC
        DCD     USBFS_MAPH_IRQHandler               ; USB Map HP
@@ -285,15 +285,15 @@ ADC1_2_IRQHandler
 USBFS_H_CAN1_TX_IRQHandler
        B USBFS_H_CAN1_TX_IRQHandler
-        PUBWEAK USBFS_L_CAN1_RX0_IRQHandler 
+        PUBWEAK USBFS_L_CAN1_RX0_IRQHandler
        SECTION .text:CODE:REORDER:NOROOT(1)
-USBFS_L_CAN1_RX0_IRQHandler 
+USBFS_L_CAN1_RX0_IRQHandler
-        B USBFS_L_CAN1_RX0_IRQHandler 
+        B USBFS_L_CAN1_RX0_IRQHandler
-        PUBWEAK CAN1_RX1_IRQHandler 
+        PUBWEAK CAN1_RX1_IRQHandler
        SECTION .text:CODE:REORDER:NOROOT(1)
-CAN1_RX1_IRQHandler 
+CAN1_RX1_IRQHandler
-        B CAN1_RX1_IRQHandler 
+        B CAN1_RX1_IRQHandler
        PUBWEAK CAN1_SE_IRQHandler
        SECTION .text:CODE:REORDER:NOROOT(1)
@@ -510,15 +510,15 @@ SPI4_IRQHandler
 CAN2_TX_IRQHandler
        B CAN2_TX_IRQHandler
-        PUBWEAK CAN2_RX0_IRQHandler 
+        PUBWEAK CAN2_RX0_IRQHandler
        SECTION .text:CODE:REORDER:NOROOT(1)
-CAN2_RX0_IRQHandler 
+CAN2_RX0_IRQHandler
-        B CAN2_RX0_IRQHandler 
+        B CAN2_RX0_IRQHandler
-        PUBWEAK CAN2_RX1_IRQHandler 
+        PUBWEAK CAN2_RX1_IRQHandler
        SECTION .text:CODE:REORDER:NOROOT(1)
-CAN2_RX1_IRQHandler 
+CAN2_RX1_IRQHandler
-        B CAN2_RX1_IRQHandler 
+        B CAN2_RX1_IRQHandler
        PUBWEAK CAN2_SE_IRQHandler
        SECTION .text:CODE:REORDER:NOROOT(1)
--- a/libraries/cmsis/cm4/device_support/startup/mdk/startup_at32f403a_407.s
+++ b/libraries/cmsis/cm4/device_support/startup/mdk/startup_at32f403a_407.s
@@ -1,7 +1,7 @@
 ;**************************************************************************
 ;* @file     startup_at32f403a_407.s
-;* @version  v2.0.7
+;* @version  v2.0.8
-;* @date     2022-02-11
+;* @date     2022-04-02
 ;* @brief    at32f403a_407 startup file for keil
 ;**************************************************************************
 ;
@@ -77,8 +77,8 @@ __Vectors       DCD     __initial_sp                        ; Top of Stack
                DCD     DMA1_Channel7_IRQHandler            ; DMA1 Channel 7
                DCD     ADC1_2_IRQHandler                   ; ADC1 & ADC2
                DCD     USBFS_H_CAN1_TX_IRQHandler          ; USB High Priority or CAN1 TX
-                DCD     USBFS_L_CAN1_RX0_IRQHandler         ; USB Low  Priority or CAN1 RX0 
+                DCD     USBFS_L_CAN1_RX0_IRQHandler         ; USB Low  Priority or CAN1 RX0
-                DCD     CAN1_RX1_IRQHandler                 ; CAN1 RX1 
+                DCD     CAN1_RX1_IRQHandler                 ; CAN1 RX1
                DCD     CAN1_SE_IRQHandler                  ; CAN1 SE
                DCD     EXINT9_5_IRQHandler                 ; EXINT Line [9:5]
                DCD     TMR1_BRK_TMR9_IRQHandler            ; TMR1 Brake and TMR9
@@ -126,8 +126,8 @@ __Vectors       DCD     __initial_sp                        ; Top of Stack
                DCD     0                                   ; Reserved
                DCD     0                                   ; Reserved
                DCD     CAN2_TX_IRQHandler                  ; CAN2 TX
-                DCD     CAN2_RX0_IRQHandler                 ; CAN2 RX0 
+                DCD     CAN2_RX0_IRQHandler                 ; CAN2 RX0
-                DCD     CAN2_RX1_IRQHandler                 ; CAN2 RX1 
+                DCD     CAN2_RX1_IRQHandler                 ; CAN2 RX1
                DCD     CAN2_SE_IRQHandler                  ; CAN2 SE
                DCD     ACC_IRQHandler                      ; ACC
                DCD     USBFS_MAPH_IRQHandler               ; USB Map High
@@ -299,8 +299,8 @@ DMA1_Channel6_IRQHandler
 DMA1_Channel7_IRQHandler
 ADC1_2_IRQHandler
 USBFS_H_CAN1_TX_IRQHandler
-USBFS_L_CAN1_RX0_IRQHandler 
+USBFS_L_CAN1_RX0_IRQHandler
-CAN1_RX1_IRQHandler 
+CAN1_RX1_IRQHandler
 CAN1_SE_IRQHandler
 EXINT9_5_IRQHandler
 TMR1_BRK_TMR9_IRQHandler
@@ -344,8 +344,8 @@ I2C3_EVT_IRQHandler
 I2C3_ERR_IRQHandler
 SPI4_IRQHandler
 CAN2_TX_IRQHandler
-CAN2_RX0_IRQHandler 
+CAN2_RX0_IRQHandler
-CAN2_RX1_IRQHandler 
+CAN2_RX1_IRQHandler
 CAN2_SE_IRQHandler
 ACC_IRQHandler
 USBFS_MAPH_IRQHandler
--- a/libraries/cmsis/cm4/device_support/system_at32f403a_407.c
+++ b/libraries/cmsis/cm4/device_support/system_at32f403a_407.c
@@ -1,17 +1,17 @@
 /**
  **************************************************************************
  * @file     system_at32f403a_407.c
-  * @version  v2.0.7
+  * @version  v2.0.8
-  * @date     2022-02-11
+  * @date     2022-04-02
  * @brief    contains all the functions for cmsis cortex-m4 system source file
  **************************************************************************
  *                       Copyright notice & Disclaimer
  *
-  * The software Board Support Package (BSP) that is made available to 
+  * The software Board Support Package (BSP) that is made available to
-  * download from Artery official website is the copyrighted work of Artery. 
+  * download from Artery official website is the copyrighted work of Artery.
-  * Artery authorizes customers to use, copy, and distribute the BSP 
+  * Artery authorizes customers to use, copy, and distribute the BSP
-  * software and its related documentation for the purpose of design and 
+  * software and its related documentation for the purpose of design and
-  * development in conjunction with Artery microcontrollers. Use of the 
+  * development in conjunction with Artery microcontrollers. Use of the
  * software is governed by this copyright notice and the following disclaimer.
  *
  * THIS SOFTWARE IS PROVIDED ON "AS IS" BASIS WITHOUT WARRANTIES,
@@ -31,7 +31,7 @@
 /** @addtogroup AT32F403A_407_system
  * @{
  */
-    
+
 #include "at32f403a_407.h"
 /** @addtogroup AT32F403A_407_system_private_defines
@@ -81,7 +81,7 @@ void SystemInit (void)
  /* wait sclk switch status */
  while(CRM->cfg_bit.sclksts != CRM_SCLK_HICK);
-  /* reset cfg register, include sclk switch, ahbdiv, apb1div, apb2div, adcdiv, 
+  /* reset cfg register, include sclk switch, ahbdiv, apb1div, apb2div, adcdiv,
     clkout, pllrcs, pllhextdiv, pllmult, usbdiv and pllrange bits */
  CRM->cfg = 0;
@@ -182,7 +182,7 @@ void system_core_clock_update(void)
 /**
  * @}
  */
-  
+
 /**
  * @}
  */
--- a/libraries/cmsis/cm4/device_support/system_at32f403a_407.h
+++ b/libraries/cmsis/cm4/device_support/system_at32f403a_407.h
@@ -1,17 +1,17 @@
 /**
  **************************************************************************
  * @file     system_at32f403a_407.h
-  * @version  v2.0.7
+  * @version  v2.0.8
-  * @date     2022-02-11
+  * @date     2022-04-02
  * @brief    cmsis cortex-m4 system header file.
  **************************************************************************
  *                       Copyright notice & Disclaimer
  *
-  * The software Board Support Package (BSP) that is made available to 
+  * The software Board Support Package (BSP) that is made available to
-  * download from Artery official website is the copyrighted work of Artery. 
+  * download from Artery official website is the copyrighted work of Artery.
-  * Artery authorizes customers to use, copy, and distribute the BSP 
+  * Artery authorizes customers to use, copy, and distribute the BSP
-  * software and its related documentation for the purpose of design and 
+  * software and its related documentation for the purpose of design and
-  * development in conjunction with Artery microcontrollers. Use of the 
+  * development in conjunction with Artery microcontrollers. Use of the
  * software is governed by this copyright notice and the following disclaimer.
  *
  * THIS SOFTWARE IS PROVIDED ON "AS IS" BASIS WITHOUT WARRANTIES,
@@ -39,7 +39,7 @@ extern "C" {
  * @{
  */
-/** @defgroup AT32F403A_407_system_clock_stable_definition 
+/** @defgroup AT32F403A_407_system_clock_stable_definition
  * @{
  */
@@ -50,7 +50,7 @@ extern "C" {
  * @}
  */
-/** @defgroup AT32F403A_407_system_exported_variables 
+/** @defgroup AT32F403A_407_system_exported_variables
  * @{
  */
@@ -60,10 +60,10 @@ extern unsigned int system_core_clock; /*!< system clock frequency (core clock)
  * @}
  */
-/** @defgroup AT32F403A_407_system_exported_functions 
+/** @defgroup AT32F403A_407_system_exported_functions
  * @{
  */
-  
+
 extern void SystemInit(void);
 extern void system_core_clock_update(void);
--- a/libraries/cmsis/dsp/ComputeLibrary/Include/NEMath.h
+++ b/libraries/cmsis/dsp/ComputeLibrary/Include/NEMath.h
@@ -0,0 +1,414 @@
 /*
 * Copyright (c) 2016, 2019 ARM Limited.
 *
 * SPDX-License-Identifier: MIT
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to
 * deal in the Software without restriction, including without limitation the
 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 * sell copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
 #ifndef __ARM_COMPUTE_NEMATH_H__
 #define __ARM_COMPUTE_NEMATH_H__
 #if defined(ARM_MATH_NEON)
 /** Calculate floor of a vector.
 *
 * @param[in] val Input vector value in F32 format.
 *
 * @return The calculated floor vector.
 */
 static inline float32x4_t vfloorq_f32(float32x4_t val);
 /** Calculate inverse square root.
 *
 * @param[in] x Input value.
 *
 * @return The calculated inverse square root.
 */
 static inline float32x2_t vinvsqrt_f32(float32x2_t x);
 /** Calculate inverse square root.
 *
 * @param[in] x Input value.
 *
 * @return The calculated inverse square root.
 */
 static inline float32x4_t vinvsqrtq_f32(float32x4_t x);
 /** Calculate reciprocal.
 *
 * @param[in] x Input value.
 *
 * @return The calculated reciprocal.
 */
 static inline float32x2_t vinv_f32(float32x2_t x);
 /** Calculate reciprocal.
 *
 * @param[in] x Input value.
 *
 * @return The calculated reciprocal.
 */
 static inline float32x4_t vinvq_f32(float32x4_t x);
 /** Perform a 7th degree polynomial approximation using Estrin's method.
 *
 * @param[in] x      Input vector value in F32 format.
 * @param[in] coeffs Polynomial coefficients table. (array of flattened float32x4_t vectors)
 *
 * @return The calculated approximation.
 */
 static inline float32x4_t vtaylor_polyq_f32(float32x4_t x, const float32_t *coeffs);
 /** Calculate exponential
 *
 * @param[in] x Input vector value in F32 format.
 *
 * @return The calculated exponent.
 */
 static inline float32x4_t vexpq_f32(float32x4_t x);
 /** Calculate logarithm
 *
 * @param[in] x Input vector value in F32 format.
 *
 * @return The calculated logarithm.
 */
 static inline float32x4_t vlogq_f32(float32x4_t x);
 /** Calculate hyperbolic tangent.
 *
 * tanh(x) = (e^2x - 1)/(e^2x + 1)
 *
 * @note We clamp x to [-10,10] to avoid overflowing issues.
 *
 * @param[in] val Input vector value in F32 format.
 *
 * @return The calculated Hyperbolic Tangent.
 */
 static inline float32x4_t vtanhq_f32(float32x4_t val);
 /** Calculate n power of a number.
 *
 * pow(x,n) = e^(n*log(x))
 *
 * @param[in] val Input vector value in F32 format.
 * @param[in] n   Powers to raise the input to.
 *
 * @return The calculated power.
 */
 static inline float32x4_t vpowq_f32(float32x4_t val, float32x4_t n);
 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
 /** Calculate hyperbolic tangent.
 *
 * tanh(x) = (e^2x - 1)/(e^2x + 1)
 *
 * @note We clamp x to [-5,5] to avoid overflowing issues.
 *
 * @param[in] val Input vector value in F16 format.
 *
 * @return The calculated Hyperbolic Tangent.
 */
 static inline float16x8_t vtanhq_f16(float16x8_t val);
 /** Calculate reciprocal.
 *
 * @param[in] x Input value.
 *
 * @return The calculated reciprocal.
 */
 static inline float16x4_t vinv_f16(float16x4_t x);
 /** Calculate reciprocal.
 *
 * @param[in] x Input value.
 *
 * @return The calculated reciprocal.
 */
 static inline float16x8_t vinvq_f16(float16x8_t x);
 /** Calculate inverse square root.
 *
 * @param[in] x Input value.
 *
 * @return The calculated inverse square root.
 */
 static inline float16x4_t vinvsqrt_f16(float16x4_t x);
 /** Calculate inverse square root.
 *
 * @param[in] x Input value.
 *
 * @return The calculated inverse square root.
 */
 static inline float16x8_t vinvsqrtq_f16(float16x8_t x);
 /** Calculate exponential
 *
 * @param[in] x Input vector value in F16 format.
 *
 * @return The calculated exponent.
 */
 static inline float16x8_t vexpq_f16(float16x8_t x);
 /** Calculate n power of a number.
 *
 * pow(x,n) = e^(n*log(x))
 *
 * @param[in] val Input vector value in F16 format.
 * @param[in] n   Powers to raise the input to.
 *
 * @return The calculated power.
 */
 static inline float16x8_t vpowq_f16(float16x8_t val, float16x8_t n);
 #endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
 /** Exponent polynomial coefficients */
 extern const float32_t exp_tab[4*8];
 /** Logarithm polynomial coefficients */
 extern const float32_t log_tab[4*8];
 #ifndef DOXYGEN_SKIP_THIS
 /* Lane-wise floor: truncate through an int32 round-trip, then subtract one
  * from every lane whose truncated value ended up greater than the input. */
 inline float32x4_t vfloorq_f32(float32x4_t val)
 {
     static const float32_t ones[4] = {1.f, 1.f, 1.f, 1.f};

     const float32x4_t truncated = vcvtq_f32_s32(vcvtq_s32_f32(val));
     const float32x4_t lowered   = vsubq_f32(truncated, vld1q_f32(ones));

     return vbslq_f32(vcgtq_f32(truncated, val), lowered, truncated);
 }
 /* Two-lane reciprocal square root: vrsqrte_f32 estimate followed by two
  * refinement steps built on vrsqrts_f32. */
 inline float32x2_t vinvsqrt_f32(float32x2_t x)
 {
     float32x2_t estimate = vrsqrte_f32(x);

     for (int step = 0; step < 2; ++step)
     {
         const float32x2_t correction = vrsqrts_f32(vmul_f32(x, estimate), estimate);
         estimate = vmul_f32(correction, estimate);
     }
     return estimate;
 }
 /* Four-lane reciprocal square root: vrsqrteq_f32 estimate followed by two
  * refinement steps built on vrsqrtsq_f32. */
 inline float32x4_t vinvsqrtq_f32(float32x4_t x)
 {
     const float32x4_t estimate = vrsqrteq_f32(x);
     const float32x4_t refined1 = vmulq_f32(vrsqrtsq_f32(vmulq_f32(x, estimate), estimate), estimate);
     const float32x4_t refined2 = vmulq_f32(vrsqrtsq_f32(vmulq_f32(x, refined1), refined1), refined1);

     return refined2;
 }
 /* Two-lane reciprocal: vrecpe_f32 estimate followed by two refinement
  * steps built on vrecps_f32. */
 inline float32x2_t vinv_f32(float32x2_t x)
 {
     float32x2_t estimate = vrecpe_f32(x);

     for (int step = 0; step < 2; ++step)
     {
         estimate = vmul_f32(vrecps_f32(x, estimate), estimate);
     }
     return estimate;
 }
 /* Four-lane reciprocal: vrecpeq_f32 estimate followed by two refinement
  * steps built on vrecpsq_f32. */
 inline float32x4_t vinvq_f32(float32x4_t x)
 {
     const float32x4_t estimate = vrecpeq_f32(x);
     const float32x4_t refined1 = vmulq_f32(vrecpsq_f32(x, estimate), estimate);
     const float32x4_t refined2 = vmulq_f32(vrecpsq_f32(x, refined1), refined1);

     return refined2;
 }
 /* Evaluates a degree-7 polynomial in x. The table holds eight coefficient
  * vectors of four floats each; group k starts at coeffs[4*k]. The four
  * linear terms are combined with x^2 and x^4 as
  *   (lin0 + lin2*x^2) + (lin1 + lin3*x^2) * x^4. */
 inline float32x4_t vtaylor_polyq_f32(float32x4_t x, const float32_t *coeffs)
 {
     const float32x4_t lin0 = vmlaq_f32(vld1q_f32(&coeffs[4*0]), vld1q_f32(&coeffs[4*4]), x);
     const float32x4_t lin2 = vmlaq_f32(vld1q_f32(&coeffs[4*2]), vld1q_f32(&coeffs[4*6]), x);
     const float32x4_t lin1 = vmlaq_f32(vld1q_f32(&coeffs[4*1]), vld1q_f32(&coeffs[4*5]), x);
     const float32x4_t lin3 = vmlaq_f32(vld1q_f32(&coeffs[4*3]), vld1q_f32(&coeffs[4*7]), x);

     const float32x4_t x_pow2 = vmulq_f32(x, x);
     const float32x4_t x_pow4 = vmulq_f32(x_pow2, x_pow2);

     const float32x4_t low_part  = vmlaq_f32(lin0, lin2, x_pow2);
     const float32x4_t high_part = vmlaq_f32(lin1, lin3, x_pow2);

     return vmlaq_f32(low_part, high_part, x_pow4);
 }
 /* Lane-wise exponential: split x into an integer multiple m of ln(2) plus a
  * remainder, approximate exp(remainder) with the exp_tab polynomial, then
  * add m to the exponent field of the result. Lanes with m < -126 yield 0. */
 inline float32x4_t vexpq_f32(float32x4_t x)
 {
     static const float32_t ln2[4]     = {0.6931471805f, 0.6931471805f, 0.6931471805f, 0.6931471805f}; // ln(2)
     static const float32_t inv_ln2[4] = {1.4426950408f, 1.4426950408f, 1.4426950408f, 1.4426950408f}; // 1/ln(2)
     static const float32_t zeros[4]   = {0.f, 0.f, 0.f, 0.f};
     static const int32_t   min_exp[4] = {-126, -126, -126, -126};

     // Range reduction: m = (int)(x / ln(2)), remainder = x - m * ln(2)
     const int32x4_t   m         = vcvtq_s32_f32(vmulq_f32(x, vld1q_f32(inv_ln2)));
     const float32x4_t remainder = vmlsq_f32(x, vcvtq_f32_s32(m), vld1q_f32(ln2));

     // Polynomial approximation of exp(remainder)
     const float32x4_t approx = vtaylor_polyq_f32(remainder, exp_tab);

     // Reconstruct: add (m << 23) to the raw bits, both operations saturating
     const int32x4_t   scaled_bits = vqaddq_s32(vreinterpretq_s32_f32(approx), vqshlq_n_s32(m, 23));
     const float32x4_t scaled      = vreinterpretq_f32_s32(scaled_bits);

     // Replace lanes whose m is below -126 with zero
     return vbslq_f32(vcltq_s32(m, vld1q_s32(min_exp)), vld1q_f32(zeros), scaled);
 }
 /* Lane-wise natural logarithm: peel off the exponent m from the raw bits,
  * approximate the log of what remains with the log_tab polynomial, then
  * add m * ln(2). */
 inline float32x4_t vlogq_f32(float32x4_t x)
 {
     static const int32_t   bias[4] = {127, 127, 127, 127};
     static const float32_t ln2[4]  = {0.6931471805f, 0.6931471805f, 0.6931471805f, 0.6931471805f}; // ln(2)

     // Extract exponent: (bits >> 23) - 127
     const int32x4_t shifted = vreinterpretq_s32_u32(vshrq_n_u32(vreinterpretq_u32_f32(x), 23));
     const int32x4_t m       = vsubq_s32(shifted, vld1q_s32(bias));

     // Subtract (m << 23) from the raw bits to remove the extracted exponent
     const int32x4_t   reduced_bits = vsubq_s32(vreinterpretq_s32_f32(x), vshlq_n_s32(m, 23));
     const float32x4_t reduced      = vreinterpretq_f32_s32(reduced_bits);

     // Polynomial approximation
     const float32x4_t approx = vtaylor_polyq_f32(reduced, log_tab);

     // Reconstruct: approx + m * ln(2)
     return vmlaq_f32(approx, vcvtq_f32_s32(m), vld1q_f32(ln2));
 }
 /* Lane-wise hyperbolic tangent computed as (e^2x - 1) / (e^2x + 1), with the
  * input first clamped to [-10, 10]. The division is done by multiplying with
  * the vinvq_f32 reciprocal of the denominator. */
 inline float32x4_t vtanhq_f32(float32x4_t val)
 {
     static const float32_t one[4]      = {1.f, 1.f, 1.f, 1.f};
     static const float32_t two[4]      = {2.f, 2.f, 2.f, 2.f};
     static const float32_t lo_bound[4] = {-10.f, -10.f, -10.f, -10.f};
     static const float32_t hi_bound[4] = {10.f, 10.f, 10.f, 10.f};

     const float32x4_t clamped = vminq_f32(vmaxq_f32(val, vld1q_f32(lo_bound)), vld1q_f32(hi_bound));
     const float32x4_t e2x     = vexpq_f32(vmulq_f32(vld1q_f32(two), clamped));

     const float32x4_t numerator   = vsubq_f32(e2x, vld1q_f32(one));
     const float32x4_t denominator = vaddq_f32(e2x, vld1q_f32(one));

     return vmulq_f32(numerator, vinvq_f32(denominator));
 }
 /* Lane-wise power val^n, evaluated as exp(n * log(val)). */
 inline float32x4_t vpowq_f32(float32x4_t val, float32x4_t n)
 {
     const float32x4_t log_val  = vlogq_f32(val);
     const float32x4_t exponent = vmulq_f32(n, log_val);

     return vexpq_f32(exponent);
 }
 #endif /* DOXYGEN_SKIP_THIS */
 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
 /** Exponent polynomial coefficients */
 /** Logarithm polynomial coefficients */
 #ifndef DOXYGEN_SKIP_THIS
 /* Lane-wise floor of eight FP16 values: truncate through an int16
  * round-trip, then subtract one from every lane whose truncated value is
  * greater than the input.
  *
  * Declared static because this function has no earlier `static inline`
  * prototype in this header; a plain `inline` definition in C provides no
  * external definition and can fail to link when the call is not inlined. */
 static inline float16x8_t vfloorq_f16(float16x8_t val)
 {
    static const float16_t CONST_1[8] = {1.f,1.f,1.f,1.f,1.f,1.f,1.f,1.f};
    const int16x8_t   z = vcvtq_s16_f16(val);
    const float16x8_t r = vcvtq_f16_s16(z);
    return vbslq_f16(vcgtq_f16(r, val), vsubq_f16(r, vld1q_f16(CONST_1)), r);
 }
 /* Four-lane FP16 reciprocal square root: vrsqrte_f16 estimate followed by
  * two refinement steps built on vrsqrts_f16. */
 inline float16x4_t vinvsqrt_f16(float16x4_t x)
 {
     float16x4_t estimate = vrsqrte_f16(x);

     for (int step = 0; step < 2; ++step)
     {
         const float16x4_t correction = vrsqrts_f16(vmul_f16(x, estimate), estimate);
         estimate = vmul_f16(correction, estimate);
     }
     return estimate;
 }
 /* Eight-lane FP16 reciprocal square root: vrsqrteq_f16 estimate followed by
  * two refinement steps built on vrsqrtsq_f16. */
 inline float16x8_t vinvsqrtq_f16(float16x8_t x)
 {
     const float16x8_t estimate = vrsqrteq_f16(x);
     const float16x8_t refined1 = vmulq_f16(vrsqrtsq_f16(vmulq_f16(x, estimate), estimate), estimate);
     const float16x8_t refined2 = vmulq_f16(vrsqrtsq_f16(vmulq_f16(x, refined1), refined1), refined1);

     return refined2;
 }
 /* Four-lane FP16 reciprocal: vrecpe_f16 estimate followed by two refinement
  * steps built on vrecps_f16. */
 inline float16x4_t vinv_f16(float16x4_t x)
 {
     float16x4_t estimate = vrecpe_f16(x);

     for (int step = 0; step < 2; ++step)
     {
         estimate = vmul_f16(vrecps_f16(x, estimate), estimate);
     }
     return estimate;
 }
 /* Eight-lane FP16 reciprocal: vrecpeq_f16 estimate followed by two
  * refinement steps built on vrecpsq_f16. */
 inline float16x8_t vinvq_f16(float16x8_t x)
 {
     const float16x8_t estimate = vrecpeq_f16(x);
     const float16x8_t refined1 = vmulq_f16(vrecpsq_f16(x, estimate), estimate);
     const float16x8_t refined2 = vmulq_f16(vrecpsq_f16(x, refined1), refined1);

     return refined2;
 }
 /* Lane-wise FP16 hyperbolic tangent computed as (e^2x - 1) / (e^2x + 1).
  *
  * The input is clamped to [-5, 5]. With the previous [-10, 10] bound,
  * e^(2x) reaches e^20, which is far above the largest finite FP16 value
  * (65504); the numerator and denominator then both become infinite and the
  * product with the reciprocal of the denominator is not a number.
  * With the [-5, 5] bound e^(2x) stays at or below e^10 (about 22026), which
  * is representable in FP16.
  *
  * The constant tables are static so they are not rebuilt on every call,
  * matching vtanhq_f32. */
 inline float16x8_t vtanhq_f16(float16x8_t val)
 {
    static const float16_t CONST_1[8]        = {1.f,1.f,1.f,1.f,1.f,1.f,1.f,1.f};
    static const float16_t CONST_2[8]        = {2.f,2.f,2.f,2.f,2.f,2.f,2.f,2.f};
    static const float16_t CONST_MIN_TANH[8] = {-5.f,-5.f,-5.f,-5.f,-5.f,-5.f,-5.f,-5.f};
    static const float16_t CONST_MAX_TANH[8] = {5.f,5.f,5.f,5.f,5.f,5.f,5.f,5.f};
    const float16x8_t x      = vminq_f16(vmaxq_f16(val, vld1q_f16(CONST_MIN_TANH)), vld1q_f16(CONST_MAX_TANH));
    const float16x8_t exp2x  = vexpq_f16(vmulq_f16(vld1q_f16(CONST_2), x));
    const float16x8_t num    = vsubq_f16(exp2x, vld1q_f16(CONST_1));
    const float16x8_t den    = vaddq_f16(exp2x, vld1q_f16(CONST_1));
    const float16x8_t result = vmulq_f16(num, vinvq_f16(den));
    return result;
 }
 /* Evaluates a degree-7 polynomial in x on eight FP16 lanes.
  *
  * coeffs points to a flattened table of eight coefficient vectors, each of
  * eight float16_t values; group k starts at coeffs[8*k]. Each group is
  * loaded with vld1q_f16 before use: the arithmetic intrinsics take vector
  * operands, not pointers.
  *
  * Declared static because this function has no earlier `static inline`
  * prototype in this header; a plain `inline` definition in C provides no
  * external definition and can fail to link when the call is not inlined. */
 static inline float16x8_t vtaylor_polyq_f16(float16x8_t x, const float16_t *coeffs)
 {
    const float16x8_t A   = vaddq_f16(vld1q_f16(&coeffs[8*0]), vmulq_f16(vld1q_f16(&coeffs[8*4]), x));
    const float16x8_t B   = vaddq_f16(vld1q_f16(&coeffs[8*2]), vmulq_f16(vld1q_f16(&coeffs[8*6]), x));
    const float16x8_t C   = vaddq_f16(vld1q_f16(&coeffs[8*1]), vmulq_f16(vld1q_f16(&coeffs[8*5]), x));
    const float16x8_t D   = vaddq_f16(vld1q_f16(&coeffs[8*3]), vmulq_f16(vld1q_f16(&coeffs[8*7]), x));
    const float16x8_t x2  = vmulq_f16(x, x);
    const float16x8_t x4  = vmulq_f16(x2, x2);
    /* (A + B*x^2) + (C + D*x^2) * x^4 */
    const float16x8_t res = vaddq_f16(vaddq_f16(A, vmulq_f16(B, x2)), vmulq_f16(vaddq_f16(C, vmulq_f16(D, x2)), x4));
    return res;
 }
 /* Lane-wise FP16 exponential: widen each half of the input to FP32, apply
  * vexpq_f32, then narrow the two results back into one FP16 vector. */
 inline float16x8_t vexpq_f16(float16x8_t x)
 {
     // TODO (COMPMID-1535) : Revisit FP16 approximations
     const float32x4_t upper_f32 = vcvt_f32_f16(vget_high_f16(x));
     const float32x4_t lower_f32 = vcvt_f32_f16(vget_low_f16(x));

     const float16x4_t lower_exp = vcvt_f16_f32(vexpq_f32(lower_f32));

     return vcvt_high_f16_f32(lower_exp, vexpq_f32(upper_f32));
 }
 /* Lane-wise FP16 natural logarithm: widen each half of the input to FP32,
  * apply vlogq_f32, then narrow the two results back into one FP16 vector.
  *
  * Declared static because this function has no earlier `static inline`
  * prototype in this header; a plain `inline` definition in C provides no
  * external definition and can fail to link when the call is not inlined. */
 static inline float16x8_t vlogq_f16(float16x8_t x)
 {
    // TODO (COMPMID-1535) : Revisit FP16 approximations
    const float32x4_t x_high = vcvt_f32_f16(vget_high_f16(x));
    const float32x4_t x_low  = vcvt_f32_f16(vget_low_f16(x));
    const float16x8_t res = vcvt_high_f16_f32(vcvt_f16_f32(vlogq_f32(x_low)), vlogq_f32(x_high));
    return res;
 }
 /* Lane-wise FP16 power val^n: each half is widened to FP32, evaluated as
  * exp(n * log(val)), and the two halves are narrowed and recombined. */
 inline float16x8_t vpowq_f16(float16x8_t val, float16x8_t n)
 {
     // TODO (giaiod01) - COMPMID-1535
     const float32x4_t n_lo   = vcvt_f32_f16(vget_low_f16(n));
     const float32x4_t n_hi   = vcvt_f32_f16(vget_high_f16(n));
     const float32x4_t val_lo = vcvt_f32_f16(vget_low_f16(val));
     const float32x4_t val_hi = vcvt_f32_f16(vget_high_f16(val));

     const float32x4_t pow_lo = vexpq_f32(vmulq_f32(n_lo, vlogq_f32(val_lo)));
     const float32x4_t pow_hi = vexpq_f32(vmulq_f32(n_hi, vlogq_f32(val_hi)));

     return vcombine_f16(vcvt_f16_f32(pow_lo), vcvt_f16_f32(pow_hi));
 }
 #endif /* DOXYGEN_SKIP_THIS */
 #endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
 #endif
 #endif /* __ARM_COMPUTE_NEMATH_H__ */
--- a/libraries/cmsis/dsp/ComputeLibrary/LICENSE.txt
+++ b/libraries/cmsis/dsp/ComputeLibrary/LICENSE.txt
@@ -0,0 +1,21 @@
 MIT License
 Copyright (c) 2017-2019 ARM Software
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
 in the Software without restriction, including without limitation the rights
 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 copies of the Software, and to permit persons to whom the Software is
 furnished to do so, subject to the following conditions:
 The above copyright notice and this permission notice shall be included in all
 copies or substantial portions of the Software.
 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 SOFTWARE.
--- a/libraries/cmsis/dsp/ComputeLibrary/README.md
+++ b/libraries/cmsis/dsp/ComputeLibrary/README.md
@@ -0,0 +1,19 @@
 README
 ======
 This folder contains two files imported from the ComputeLibrary and slightly modified:
    NEMath.h and arm_cl_tables.c 
 In the original compute library, there are instead two other files:
    NEMath.h and NEMath.inl
 In the original Compute Library, NEMath.inl is included from NEMath.h. In this CMSIS DSP implementation there is no NEMath.inl: its content has been copied into NEMath.h.
 The tables contained in NEMath.inl have been moved to arm_cl_tables.c. Finally, the files are written in C for the CMSIS DSP library, whereas they are written in C++ in the original Compute Library.
 Otherwise, the features and implementations are the same: a few optimized Neon functions.
 These files are covered by a different license: the MIT license.
 Other parts of the CMSIS-DSP are covered with an Apache-2.0 license.
--- a/libraries/cmsis/dsp/ComputeLibrary/Source/arm_cl_tables.c
+++ b/libraries/cmsis/dsp/ComputeLibrary/Source/arm_cl_tables.c
@@ -0,0 +1,55 @@
 /*
 * Copyright (c) 2016, 2019 ARM Limited.
 *
 * SPDX-License-Identifier: MIT
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to
 * deal in the Software without restriction, including without limitation the
 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 * sell copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
 #include "arm_math.h"
 #include "NEMath.h"
 #if defined(ARM_MATH_NEON)
 /**
  * Exponent polynomial coefficients.
  *
  * Eight coefficients, each repeated four times so that a single
  * vld1q_f32(&exp_tab[4*k]) yields coefficient k in every lane.
  * As consumed by vtaylor_polyq_f32 in NEMath.h, rows 0..7 multiply
  * x^0, x^4, x^2, x^6, x^1, x^5, x^3 and x^7 respectively.
  */
 #define EXP_TAB_X4(c) c, c, c, c
 const float32_t exp_tab[4*8] =
 {
     EXP_TAB_X4(1.f),
     EXP_TAB_X4(0.0416598916054f),
     EXP_TAB_X4(0.500000596046f),
     EXP_TAB_X4(0.0014122662833f),
     EXP_TAB_X4(1.00000011921f),
     EXP_TAB_X4(0.00833693705499f),
     EXP_TAB_X4(0.166665703058f),
     EXP_TAB_X4(0.000195780929062f)
 };
 #undef EXP_TAB_X4
 /**
  * Logarithm polynomial coefficients.
  *
  * Eight coefficients, each repeated four times so that a single
  * vld1q_f32(&log_tab[4*k]) yields coefficient k in every lane.
  * As consumed by vtaylor_polyq_f32 in NEMath.h, rows 0..7 multiply
  * x^0, x^4, x^2, x^6, x^1, x^5, x^3 and x^7 respectively.
  */
 #define LOG_TAB_X4(c) c, c, c, c
 const float32_t log_tab[4*8] =
 {
     LOG_TAB_X4(-2.29561495781f),
     LOG_TAB_X4(-2.47071170807f),
     LOG_TAB_X4(-5.68692588806f),
     LOG_TAB_X4(-0.165253549814f),
     LOG_TAB_X4(5.17591238022f),
     LOG_TAB_X4(0.844007015228f),
     LOG_TAB_X4(4.58445882797f),
     LOG_TAB_X4(0.0141278216615f)
 };
 #undef LOG_TAB_X4
 #endif
--- a/libraries/cmsis/dsp/PrivateInclude/arm_sorting.h
+++ b/libraries/cmsis/dsp/PrivateInclude/arm_sorting.h
@@ -0,0 +1,200 @@
 /******************************************************************************
 * @file     arm_sorting.h
  * @version  v2.0.8
  * @date     2022-04-02
 * @brief    Private header file for CMSIS DSP Library
 ******************************************************************************/
 /*
 * Copyright (c) 2010-2019 Arm Limited or its affiliates. All rights reserved.
 *
 * SPDX-License-Identifier: Apache-2.0
 *
 * Licensed under the Apache License, Version 2.0 (the License); you may
 * not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an AS IS BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 #ifndef _ARM_SORTING_H_
 #define _ARM_SORTING_H_
 #include "arm_math.h"
 #ifdef   __cplusplus
 extern "C"
 {
 #endif
  /**
   * @param[in]  S          points to an instance of the sorting structure.
   * @param[in]  pSrc       points to the block of input data.
   * @param[out] pDst       points to the block of output data.
   * @param[in]  blockSize  number of samples to process.
   */
  void arm_bubble_sort_f32(
    const arm_sort_instance_f32 * S,
          float32_t * pSrc,
          float32_t * pDst,
    uint32_t blockSize);
   /**
   * @param[in]  S          points to an instance of the sorting structure.
   * @param[in]  pSrc       points to the block of input data.
   * @param[out] pDst       points to the block of output data.
   * @param[in]  blockSize  number of samples to process.
   */
  void arm_heap_sort_f32(
    const arm_sort_instance_f32 * S,
          float32_t * pSrc,
          float32_t * pDst,
    uint32_t blockSize);
  /**
   * @param[in]  S          points to an instance of the sorting structure.
   * @param[in]  pSrc       points to the block of input data.
   * @param[out] pDst       points to the block of output data.
   * @param[in]  blockSize  number of samples to process.
   */
  void arm_insertion_sort_f32(
    const arm_sort_instance_f32 * S,
          float32_t *pSrc,
          float32_t* pDst,
    uint32_t blockSize);
  /**
   * @param[in]  S          points to an instance of the sorting structure.
   * @param[in]  pSrc       points to the block of input data.
   * @param[out] pDst       points to the block of output data
   * @param[in]  blockSize  number of samples to process.
   */
  void arm_quick_sort_f32(
    const arm_sort_instance_f32 * S,
          float32_t * pSrc,
          float32_t * pDst,
    uint32_t blockSize);
  /**
   * @param[in]  S          points to an instance of the sorting structure.
   * @param[in]  pSrc       points to the block of input data.
   * @param[out] pDst       points to the block of output data
   * @param[in]  blockSize  number of samples to process.
   */
  void arm_selection_sort_f32(
    const arm_sort_instance_f32 * S,
          float32_t * pSrc,
          float32_t * pDst,
    uint32_t blockSize);
  /**
   * @param[in]  S          points to an instance of the sorting structure.
   * @param[in]  pSrc       points to the block of input data.
   * @param[out] pDst       points to the block of output data
   * @param[in]  blockSize  number of samples to process.
   */
  void arm_bitonic_sort_f32(
    const arm_sort_instance_f32 * S,
          float32_t * pSrc,
          float32_t * pDst,
          uint32_t blockSize);
 #if defined(ARM_MATH_NEON)
 /* Exchanges a.val[1] with b.val[0]; a and b are modified in place and must
  * be lvalues with a val[] member of float32x4_t elements. Arguments are
  * parenthesized so that expressions such as *ptr expand correctly. */
 #define vtrn256_128q(a, b)                     \
 do {                                           \
     float32x4_t vtrn128_temp = (a).val[1];     \
     (a).val[1] = (b).val[0];                   \
     (b).val[0] = vtrn128_temp;                 \
 } while (0)
 /* Regroups the 64-bit halves of two float32x4_t lvalues in place:
  * a becomes {low(a), low(b)} and b becomes {high(a), high(b)}.
  * Temporaries carry a macro-specific prefix so they cannot capture caller
  * variables (the previous names ab/cd/ef/gh could), and the arguments are
  * parenthesized. */
 #define vtrn128_64q(a, b)                                  \
 do {                                                       \
     const float32x2_t vtrn64_a_lo = vget_low_f32(a);       \
     const float32x2_t vtrn64_b_lo = vget_low_f32(b);       \
     const float32x2_t vtrn64_a_hi = vget_high_f32(a);      \
     const float32x2_t vtrn64_b_hi = vget_high_f32(b);      \
     (a) = vcombine_f32(vtrn64_a_lo, vtrn64_b_lo);          \
     (b) = vcombine_f32(vtrn64_a_hi, vtrn64_b_hi);          \
 } while (0)
 /* Regroups the 64-bit halves of two two-vector lvalues in place. With a and b
  * viewed as four halves each (val[0] low, val[0] high, val[1] low, val[1] high):
  *   a.val[0] = {a half 0, b half 0}    a.val[1] = {a half 2, b half 2}
  *   b.val[0] = {a half 1, b half 1}    b.val[1] = {a half 3, b half 3}
  * Temporaries carry a macro-specific prefix so they cannot capture caller
  * variables, and the arguments are parenthesized. */
 #define vtrn256_64q(a, b)                                              \
 do {                                                                   \
     const float32x2_t vtrn256_a0 = vget_low_f32((a).val[0]);           \
     const float32x2_t vtrn256_a1 = vget_high_f32((a).val[0]);          \
     const float32x2_t vtrn256_a2 = vget_low_f32((a).val[1]);           \
     const float32x2_t vtrn256_a3 = vget_high_f32((a).val[1]);          \
     const float32x2_t vtrn256_b0 = vget_low_f32((b).val[0]);           \
     const float32x2_t vtrn256_b1 = vget_high_f32((b).val[0]);          \
     const float32x2_t vtrn256_b2 = vget_low_f32((b).val[1]);           \
     const float32x2_t vtrn256_b3 = vget_high_f32((b).val[1]);          \
     (a).val[0] = vcombine_f32(vtrn256_a0, vtrn256_b0);                 \
     (a).val[1] = vcombine_f32(vtrn256_a2, vtrn256_b2);                 \
     (b).val[0] = vcombine_f32(vtrn256_a1, vtrn256_b1);                 \
     (b).val[1] = vcombine_f32(vtrn256_a3, vtrn256_b3);                 \
 } while (0)
 /* Applies vtrnq_f32 to a and b and stores the two result vectors back:
  * a receives val[0] of the result, b receives val[1]. */
 #define vtrn128_32q(a, b)                                 \
 do {                                                      \
     float32x4x2_t vtrn32_tmp = vtrnq_f32((a), (b));       \
     (a) = vtrn32_tmp.val[0];                              \
     (b) = vtrn32_tmp.val[1];                              \
 } while (0)
 /* Applies vtrnq_f32 to the matching vectors of a and b (val[0] with val[0],
  * val[1] with val[1]) and stores the results back in place: a receives the
  * val[0] outputs, b receives the val[1] outputs. Arguments are parenthesized
  * so that expressions such as *ptr expand correctly. */
 #define vtrn256_32q(a, b)                                                   \
 do {                                                                        \
     float32x4x2_t vtrn32_tmp_1 = vtrnq_f32((a).val[0], (b).val[0]);         \
     float32x4x2_t vtrn32_tmp_2 = vtrnq_f32((a).val[1], (b).val[1]);         \
     (a).val[0] = vtrn32_tmp_1.val[0];                                       \
     (a).val[1] = vtrn32_tmp_2.val[0];                                       \
     (b).val[0] = vtrn32_tmp_1.val[1];                                       \
     (b).val[1] = vtrn32_tmp_2.val[1];                                       \
 } while (0)
 /* Lane-wise compare-exchange of two float32x4_t lvalues: afterwards a holds
  * the per-lane minimum and b the per-lane maximum. The original a is saved
  * first because it is overwritten before the maximum is computed. */
 #define vminmaxq(a, b)                        \
 do {                                          \
     float32x4_t minmax_tmp = (a);             \
     (a) = vminq_f32((a), (b));                \
     (b) = vmaxq_f32(minmax_tmp, (b));         \
 } while (0)
 /* Lane-wise compare-exchange of two two-vector lvalues: afterwards a holds
  * the per-lane minima and b the per-lane maxima. The original a is saved
  * first because it is overwritten before the maxima are computed.
  * Arguments are parenthesized so that expressions such as *ptr expand
  * correctly. */
 #define vminmax256q(a, b)                                         \
 do {                                                              \
     float32x4x2_t minmax256_tmp = (a);                            \
     (a).val[0] = vminq_f32((a).val[0], (b).val[0]);               \
     (a).val[1] = vminq_f32((a).val[1], (b).val[1]);               \
     (b).val[0] = vmaxq_f32(minmax256_tmp.val[0], (b).val[0]);     \
     (b).val[1] = vmaxq_f32(minmax256_tmp.val[1], (b).val[1]);     \
 } while (0)
 /* Expression yielding the four lanes of a in reverse order: each 64-bit half
  * is reversed with vrev64_f32 and the halves are swapped by vcombine_f32. */
 #define vrev128q_f32(a)                            \
     vcombine_f32(vrev64_f32(vget_high_f32(a)),     \
                  vrev64_f32(vget_low_f32(a)))
 /* Reverses, in place, the eight lanes held by a two-vector lvalue: each
  * vector is lane-reversed and the two vectors are exchanged. Arguments are
  * parenthesized so that expressions such as *ptr expand correctly. */
 #define vrev256q_f32(a)                                                        \
 do {                                                                           \
     float32x4_t rev_tmp = vcombine_f32(vrev64_f32(vget_high_f32((a).val[0])),  \
                                        vrev64_f32(vget_low_f32((a).val[0])));  \
     (a).val[0] = vcombine_f32(vrev64_f32(vget_high_f32((a).val[1])),           \
                               vrev64_f32(vget_low_f32((a).val[1])));           \
     (a).val[1] = rev_tmp;                                                      \
 } while (0)
 /* Loads four floats from p into the lvalue a, then reverses their lane order
  * with vrev128q_f32. Arguments are parenthesized so that expressions expand
  * correctly. */
 #define vldrev128q_f32(a, p)        \
 do {                                \
     (a) = vld1q_f32(p);             \
     (a) = vrev128q_f32(a);          \
 } while (0)
 #endif /* ARM_MATH_NEON */
 #ifdef   __cplusplus
 }
 #endif
 #endif /* _ARM_SORTING_H_ */
--- a/libraries/cmsis/dsp/PrivateInclude/arm_vec_fft.h
+++ b/libraries/cmsis/dsp/PrivateInclude/arm_vec_fft.h
@@ -0,0 +1,58 @@
 /******************************************************************************
 * @file     arm_vec_fft.h
  * @version  v2.0.8
  * @date     2022-04-02
 * @brief    Private header file for CMSIS DSP Library
 ******************************************************************************/
 /*
 * Copyright (c) 2010-2020 Arm Limited or its affiliates. All rights reserved.
 *
 * SPDX-License-Identifier: Apache-2.0
 *
 * Licensed under the Apache License, Version 2.0 (the License); you may
 * not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an AS IS BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 #ifndef _ARM_VEC_FFT_H_
 #define _ARM_VEC_FFT_H_
 #include "arm_math.h"
 #include "arm_helium_utils.h"
 #ifdef   __cplusplus
 extern "C"
 {
 #endif
 #if (defined(ARM_MATH_MVEF) || defined(ARM_MATH_MVEI) || defined(ARM_MATH_HELIUM)) && !defined(ARM_MATH_AUTOVECTORIZE)
 /* Thin wrappers naming MVE intrinsic combinations used for complex vector
  * arithmetic. Every macro expands to a single expression. */
 #define MVE_CMPLX_ADD_A_ixB(A, B)           vcaddq_rot90(A,B)
 #define MVE_CMPLX_SUB_A_ixB(A,B)            vcaddq_rot270(A,B)
 #define MVE_CMPLX_MULT_FLT_AxB(A,B)         vcmlaq_rot90(vcmulq(A, B), A, B)
 #define MVE_CMPLX_MULT_FLT_Conj_AxB(A,B)    vcmlaq_rot270(vcmulq(A, B), A, B)
 /* The two fixed-point multiply macros previously ended with ';', which made
  * them unusable inside a larger expression or as a function argument. The
  * terminator is now left to the call site, like the other macros here. */
 #define MVE_CMPLX_MULT_FX_AxB(A,B)          vqdmladhxq(vqdmlsdhq((__typeof(A))vuninitializedq_s32(), A, B), A, B)
 #define MVE_CMPLX_MULT_FX_AxConjB(A,B)      vqdmladhq(vqdmlsdhxq((__typeof(A))vuninitializedq_s32(), A, B), A, B)
 #define MVE_CMPLX_ADD_FX_A_ixB(A, B)        vhcaddq_rot90(A,B)
 #define MVE_CMPLX_SUB_FX_A_ixB(A,B)         vhcaddq_rot270(A,B)
 #endif /* (defined(ARM_MATH_MVEF) || defined(ARM_MATH_MVEI) || defined(ARM_MATH_HELIUM)) && !defined(ARM_MATH_AUTOVECTORIZE) */
 #ifdef   __cplusplus
 }
 #endif
 #endif /* _ARM_VEC_FFT_H_ */
--- a/libraries/cmsis/dsp/PrivateInclude/arm_vec_filtering.h
+++ b/libraries/cmsis/dsp/PrivateInclude/arm_vec_filtering.h
--- a/libraries/cmsis/dsp/Source/BasicMathFunctions/BasicMathFunctions.c
+++ b/libraries/cmsis/dsp/Source/BasicMathFunctions/BasicMathFunctions.c
@@ -0,0 +1,75 @@
 /* ----------------------------------------------------------------------
 * Project:      CMSIS DSP Library
 * Title:        BasicMathFunctions.c
 * Description:  Combination of all basic math function source files.
 *
 * $Date:        16. March 2020
 * $Revision:    V1.1.0
 *
 * Target Processor: Cortex-M cores
 * -------------------------------------------------------------------- */
 /*
 * Copyright (C) 2019-2020 ARM Limited or its affiliates. All rights reserved.
 *
 * SPDX-License-Identifier: Apache-2.0
 *
 * Licensed under the Apache License, Version 2.0 (the License); you may
 * not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an AS IS BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 #include "arm_abs_f32.c"
 #include "arm_abs_q15.c"
 #include "arm_abs_q31.c"
 #include "arm_abs_q7.c"
 #include "arm_add_f32.c"
 #include "arm_add_q15.c"
 #include "arm_add_q31.c"
 #include "arm_add_q7.c"
 #include "arm_and_u16.c"
 #include "arm_and_u32.c"
 #include "arm_and_u8.c"
 #include "arm_dot_prod_f32.c"
 #include "arm_dot_prod_q15.c"
 #include "arm_dot_prod_q31.c"
 #include "arm_dot_prod_q7.c"
 #include "arm_mult_f32.c"
 #include "arm_mult_q15.c"
 #include "arm_mult_q31.c"
 #include "arm_mult_q7.c"
 #include "arm_negate_f32.c"
 #include "arm_negate_q15.c"
 #include "arm_negate_q31.c"
 #include "arm_negate_q7.c"
 #include "arm_not_u16.c"
 #include "arm_not_u32.c"
 #include "arm_not_u8.c"
 #include "arm_offset_f32.c"
 #include "arm_offset_q15.c"
 #include "arm_offset_q31.c"
 #include "arm_offset_q7.c"
 #include "arm_or_u16.c"
 #include "arm_or_u32.c"
 #include "arm_or_u8.c"
 #include "arm_scale_f32.c"
 #include "arm_scale_q15.c"
 #include "arm_scale_q31.c"
 #include "arm_scale_q7.c"
 #include "arm_shift_q15.c"
 #include "arm_shift_q31.c"
 #include "arm_shift_q7.c"
 #include "arm_sub_f32.c"
 #include "arm_sub_q15.c"
 #include "arm_sub_q31.c"
 #include "arm_sub_q7.c"
 #include "arm_xor_u16.c"
 #include "arm_xor_u32.c"
 #include "arm_xor_u8.c"
--- a/libraries/cmsis/dsp/Source/BasicMathFunctions/CMakeLists.txt
+++ b/libraries/cmsis/dsp/Source/BasicMathFunctions/CMakeLists.txt
@@ -0,0 +1,19 @@
 # Builds the CMSISDSPBasicMath static library from the C sources in this directory.
 cmake_minimum_required (VERSION 3.6)
 project(CMSISDSPBasicMath)
 # Project-local modules; presumably they define the configLib()/configDsp()
 # commands called below -- confirm against the cmake module directory.
 include(configLib)
 include(configDsp)
 # The pattern requires an underscore in the file name, so the aggregate
 # BasicMathFunctions.c (which #includes the individual sources) is not matched.
 file(GLOB SRC "./*_*.c")
 add_library(CMSISDSPBasicMath STATIC ${SRC})
 configLib(CMSISDSPBasicMath ${ROOT})
 configDsp(CMSISDSPBasicMath ${ROOT})
 ### Includes
 # PUBLIC: the include directory is also propagated to targets linking this library.
 target_include_directories(CMSISDSPBasicMath PUBLIC "${DSP}/Include")
--- a/libraries/cmsis/dsp/Source/BasicMathFunctions/arm_abs_f32.c
+++ b/libraries/cmsis/dsp/Source/BasicMathFunctions/arm_abs_f32.c
@@ -0,0 +1,196 @@
 /* ----------------------------------------------------------------------
 * Project:      CMSIS DSP Library
 * Title:        arm_abs_f32.c
 * Description:  Floating-point vector absolute value
 *
 * $Date:        18. March 2019
 * $Revision:    V1.6.0
 *
 * Target Processor: Cortex-M cores
 * -------------------------------------------------------------------- */
 /*
 * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
 *
 * SPDX-License-Identifier: Apache-2.0
 *
 * Licensed under the Apache License, Version 2.0 (the License); you may
 * not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an AS IS BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 #include "arm_math.h"
 #include <math.h>
 /**
  @ingroup groupMath
 */
 /**
  @defgroup BasicAbs Vector Absolute Value
  Computes the absolute value of a vector on an element-by-element basis.
  <pre>
      pDst[n] = abs(pSrc[n]),   0 <= n < blockSize.
  </pre>
  The functions support in-place computation allowing the source and
  destination pointers to reference the same memory buffer.
  There are separate functions for floating-point, Q7, Q15, and Q31 data types.
 */
 /**
  @addtogroup BasicAbs
  @{
 */
 /**
  @brief         Floating-point vector absolute value.
  @param[in]     pSrc       points to the input vector
  @param[out]    pDst       points to the output vector
  @param[in]     blockSize  number of samples in each vector
  @return        none
 */
 #if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE)
 #include "arm_helium_utils.h"
 /*
  * Helium (MVE) variant: processes four floats per vector iteration, then
  * handles the remaining 1 to 3 samples with a tail-predicated load and store.
  */
 void arm_abs_f32(
   const float32_t * pSrc,
         float32_t * pDst,
         uint32_t blockSize)
 {
     uint32_t blkCnt;                               /* Loop counter */
     f32x4_t vec1;
     f32x4_t res;
     /* Compute 4 outputs at a time */
     blkCnt = blockSize >> 2U;
     while (blkCnt > 0U)
     {
         /* C = |A| */
         /* Calculate absolute values and then store the results in the destination buffer. */
         vec1 = vld1q(pSrc);
         res = vabsq(vec1);
         vst1q(pDst, res);
         /* Increment pointers */
         pSrc += 4;
         pDst += 4;
         /* Decrement the loop counter */
         blkCnt--;
     }
     /* Tail: 1 to 3 samples left */
     blkCnt = blockSize & 0x3;
     if (blkCnt > 0U)
     {
       /* C = |A| */
       mve_pred16_t p0 = vctp32q(blkCnt);
       /* Predicated (zeroing) load: only the active lanes are fetched, so no
          element beyond pSrc[blockSize - 1] is accessed. */
       vec1 = vld1q_z_f32(pSrc, p0);
       vstrwq_p(pDst, vabsq(vec1), p0);
     }
 }
 #else
 /*
  * Neon / plain C variant of the floating-point absolute value.
  * A block loop (Neon or 4x unrolled, depending on the build macros) consumes
  * groups of four samples; the final loop finishes whatever is left.
  */
 void arm_abs_f32(
   const float32_t * pSrc,
         float32_t * pDst,
         uint32_t blockSize)
 {
   uint32_t remaining;                  /* Samples left for the current loop */
 #if defined(ARM_MATH_NEON) && !defined(ARM_MATH_AUTOVECTORIZE)
   f32x4_t lanes;                       /* Four input samples */
   /* Neon: four samples per pass */
   for (remaining = blockSize >> 2U; remaining > 0U; remaining--)
   {
     /* C = |A| */
     lanes = vld1q_f32(pSrc);
     vst1q_f32(pDst, vabsq_f32(lanes));
     pSrc += 4;
     pDst += 4;
   }
   /* Samples that did not fill a whole vector */
   remaining = blockSize & 0x3;
 #elif defined (ARM_MATH_LOOPUNROLL) && !defined(ARM_MATH_AUTOVECTORIZE)
   /* Unrolled: four samples per pass, each read before it is written */
   for (remaining = blockSize >> 2U; remaining > 0U; remaining--)
   {
     /* C = |A| */
     pDst[0] = fabsf(pSrc[0]);
     pDst[1] = fabsf(pSrc[1]);
     pDst[2] = fabsf(pSrc[2]);
     pDst[3] = fabsf(pSrc[3]);
     pSrc += 4;
     pDst += 4;
   }
   /* Samples that did not fill a group of four */
   remaining = blockSize % 0x4U;
 #else
   /* No block loop: every sample goes through the loop below */
   remaining = blockSize;
 #endif /* Neon / loop unrolling selection */
   for (; remaining > 0U; remaining--)
   {
     /* C = |A| */
     *pDst++ = fabsf(*pSrc++);
   }
 }
 #endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
 /**
  @} end of BasicAbs group
 */
--- a/libraries/cmsis/dsp/Source/BasicMathFunctions/arm_abs_q15.c
+++ b/libraries/cmsis/dsp/Source/BasicMathFunctions/arm_abs_q15.c
@@ -0,0 +1,178 @@
 /* ----------------------------------------------------------------------
 * Project:      CMSIS DSP Library
 * Title:        arm_abs_q15.c
 * Description:  Q15 vector absolute value
 *
 * $Date:        18. March 2019
 * $Revision:    V1.6.0
 *
 * Target Processor: Cortex-M cores
 * -------------------------------------------------------------------- */
 /*
 * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
 *
 * SPDX-License-Identifier: Apache-2.0
 *
 * Licensed under the Apache License, Version 2.0 (the License); you may
 * not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an AS IS BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 #include "arm_math.h"
 /**
  @ingroup groupMath
 */
 /**
  @addtogroup BasicAbs
  @{
 */
 /**
  @brief         Q15 vector absolute value.
  @param[in]     pSrc       points to the input vector
  @param[out]    pDst       points to the output vector
  @param[in]     blockSize  number of samples in each vector
  @return        none
  @par           Scaling and Overflow Behavior
                   The function uses saturating arithmetic.
                   The Q15 value -1 (0x8000) will be saturated to the maximum allowable positive value 0x7FFF.
 */
 #if defined(ARM_MATH_MVEI)
 #include "arm_helium_utils.h"
 /*
  * Helium (MVE) variant: processes eight Q15 samples per vector iteration,
  * then handles the remaining 1 to 7 samples with a tail-predicated load and store.
  */
 void arm_abs_q15(
     const q15_t * pSrc,
     q15_t * pDst,
     uint32_t blockSize)
 {
     uint32_t  blkCnt;           /* loop counters */
     q15x8_t vecSrc;
     /* Compute 8 outputs at a time */
     blkCnt = blockSize >> 3;
     while (blkCnt > 0U)
     {
         /*
          * C = |A|
          * Calculate absolute and then store the results in the destination buffer.
          */
         vecSrc = vld1q(pSrc);
         vst1q(pDst, vqabsq(vecSrc));
         /*
          * Decrement the blockSize loop counter
          */
         blkCnt--;
         /*
          * advance vector source and destination pointers
          */
         pSrc += 8;
         pDst += 8;
     }
     /*
      * tail: 1 to 7 samples left
      */
     blkCnt = blockSize & 7;
     if (blkCnt > 0U)
     {
         mve_pred16_t p0 = vctp16q(blkCnt);
         /* Predicated (zeroing) load: only the active lanes are fetched, so no
            element beyond pSrc[blockSize - 1] is accessed. */
         vecSrc = vld1q_z_s16(pSrc, p0);
         vstrhq_p(pDst, vqabsq(vecSrc), p0);
     }
 }
 #else
 /* Saturating negation of one Q15 sample (0x8000 becomes 0x7fff). */
 #if defined (ARM_MATH_DSP)
 #define ARM_ABS_Q15_NEGATE(x)   ((q15_t)__QSUB16(0, (x)))
 #else
 #define ARM_ABS_Q15_NEGATE(x)   (((x) == (q15_t) 0x8000) ? 0x7fff : -(x))
 #endif
 /*
  * Plain C variant of the Q15 absolute value.
  * With ARM_MATH_LOOPUNROLL the first loop consumes groups of four samples;
  * the final loop finishes whatever is left.
  */
 void arm_abs_q15(
   const q15_t * pSrc,
         q15_t * pDst,
         uint32_t blockSize)
 {
   uint32_t remaining;                  /* Samples left for the current loop */
   q15_t sample;                        /* Current input value */
 #if defined (ARM_MATH_LOOPUNROLL)
   /* Unrolled: four samples per pass */
   for (remaining = blockSize >> 2U; remaining > 0U; remaining--)
   {
     /* C = |A|, positive inputs pass through, the rest are negated with saturation */
     sample = *pSrc++;
     *pDst++ = (sample > 0) ? sample : ARM_ABS_Q15_NEGATE(sample);
     sample = *pSrc++;
     *pDst++ = (sample > 0) ? sample : ARM_ABS_Q15_NEGATE(sample);
     sample = *pSrc++;
     *pDst++ = (sample > 0) ? sample : ARM_ABS_Q15_NEGATE(sample);
     sample = *pSrc++;
     *pDst++ = (sample > 0) ? sample : ARM_ABS_Q15_NEGATE(sample);
   }
   /* Samples that did not fill a group of four */
   remaining = blockSize % 0x4U;
 #else
   /* No unrolling: every sample goes through the loop below */
   remaining = blockSize;
 #endif /* #if defined (ARM_MATH_LOOPUNROLL) */
   for (; remaining > 0U; remaining--)
   {
     /* C = |A| */
     sample = *pSrc++;
     *pDst++ = (sample > 0) ? sample : ARM_ABS_Q15_NEGATE(sample);
   }
 }
 #undef ARM_ABS_Q15_NEGATE
 #endif /* defined(ARM_MATH_MVEI) */
 /**
  @} end of BasicAbs group
 */
--- a/libraries/cmsis/dsp/Source/BasicMathFunctions/arm_abs_q31.c
+++ b/libraries/cmsis/dsp/Source/BasicMathFunctions/arm_abs_q31.c
@@ -0,0 +1,208 @@
 /* ----------------------------------------------------------------------
 * Project:      CMSIS DSP Library
 * Title:        arm_abs_q31.c
 * Description:  Q31 vector absolute value
 *
 * $Date:        18. March 2019
 * $Revision:    V1.6.0
 *
 * Target Processor: Cortex-M cores
 * -------------------------------------------------------------------- */
 /*
 * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
 *
 * SPDX-License-Identifier: Apache-2.0
 *
 * Licensed under the Apache License, Version 2.0 (the License); you may
 * not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an AS IS BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 #include "arm_math.h"
 /**
  @ingroup groupMath
 */
 /**
  @addtogroup BasicAbs
  @{
 */
 /**
  @brief         Q31 vector absolute value.
  @param[in]     pSrc       points to the input vector
  @param[out]    pDst       points to the output vector
  @param[in]     blockSize  number of samples in each vector
  @return        none
  @par           Scaling and Overflow Behavior
                   The function uses saturating arithmetic.
                   The Q31 value -1 (0x80000000) will be saturated to the maximum allowable positive value 0x7FFFFFFF.
 */
 #if defined(ARM_MATH_MVEI)
 #include "arm_helium_utils.h"
 /*
  * Helium (MVE) variant: processes four Q31 samples per vector iteration,
  * then handles the remaining 1 to 3 samples with a tail-predicated load and store.
  */
 void arm_abs_q31(
     const q31_t * pSrc,
     q31_t * pDst,
     uint32_t blockSize)
 {
     uint32_t  blkCnt;           /* Loop counters */
     q31x4_t vecSrc;
     /* Compute 4 outputs at a time */
     blkCnt = blockSize >> 2;
     while (blkCnt > 0U)
     {
         /*
          * C = |A|
          * Calculate absolute and then store the results in the destination buffer.
          */
         vecSrc = vld1q(pSrc);
         vst1q(pDst, vqabsq(vecSrc));
         /*
          * Decrement the blockSize loop counter
          */
         blkCnt--;
         /*
          * Advance vector source and destination pointers
          */
         pSrc += 4;
         pDst += 4;
     }
     /*
      * Tail: 1 to 3 samples left
      */
     blkCnt = blockSize & 3;
     if (blkCnt > 0U)
     {
         mve_pred16_t p0 = vctp32q(blkCnt);
         /* Predicated (zeroing) load: only the active lanes are fetched, so no
            element beyond pSrc[blockSize - 1] is accessed. */
         vecSrc = vld1q_z_s32(pSrc, p0);
         vstrwq_p(pDst, vqabsq(vecSrc), p0);
     }
 }
 #else
 /* Saturating negation of one Q31 sample (INT32_MIN becomes INT32_MAX). */
 #if defined (ARM_MATH_DSP)
 #define ARM_ABS_Q31_NEGATE(x)   ((q31_t)__QSUB(0, (x)))
 #else
 #define ARM_ABS_Q31_NEGATE(x)   (((x) == INT32_MIN) ? INT32_MAX : -(x))
 #endif
 /*
  * Neon / plain C variant of the Q31 absolute value.
  * A block loop (Neon or 4x unrolled, depending on the build macros) consumes
  * groups of four samples; the final loop finishes whatever is left.
  */
 void arm_abs_q31(
   const q31_t * pSrc,
         q31_t * pDst,
         uint32_t blockSize)
 {
   uint32_t remaining;                  /* Samples left for the current loop */
   q31_t sample;                        /* Current input value */
 #if defined(ARM_MATH_NEON)
   int32x4_t lanes;                     /* Four input samples */
   /* Neon: four samples per pass, saturating absolute value */
   for (remaining = blockSize >> 2U; remaining > 0U; remaining--)
   {
     /* C = |A| */
     lanes = vld1q_s32(pSrc);
     vst1q_s32(pDst, vqabsq_s32(lanes));
     pSrc += 4;
     pDst += 4;
   }
   /* Samples that did not fill a whole vector */
   remaining = blockSize & 0x3;
 #elif defined (ARM_MATH_LOOPUNROLL)
   /* Unrolled: four samples per pass */
   for (remaining = blockSize >> 2U; remaining > 0U; remaining--)
   {
     /* C = |A|, positive inputs pass through, the rest are negated with saturation */
     sample = *pSrc++;
     *pDst++ = (sample > 0) ? sample : ARM_ABS_Q31_NEGATE(sample);
     sample = *pSrc++;
     *pDst++ = (sample > 0) ? sample : ARM_ABS_Q31_NEGATE(sample);
     sample = *pSrc++;
     *pDst++ = (sample > 0) ? sample : ARM_ABS_Q31_NEGATE(sample);
     sample = *pSrc++;
     *pDst++ = (sample > 0) ? sample : ARM_ABS_Q31_NEGATE(sample);
   }
   /* Samples that did not fill a group of four */
   remaining = blockSize % 0x4U;
 #else
   /* No block loop: every sample goes through the loop below */
   remaining = blockSize;
 #endif /* Neon / loop unrolling selection */
   for (; remaining > 0U; remaining--)
   {
     /* C = |A| */
     sample = *pSrc++;
     *pDst++ = (sample > 0) ? sample : ARM_ABS_Q31_NEGATE(sample);
   }
 }
 #undef ARM_ABS_Q31_NEGATE
 #endif /* #if defined (ARM_MATH_MVEI) */
 /**
  @} end of BasicAbs group
 */
--- a/libraries/cmsis/dsp/Source/BasicMathFunctions/arm_abs_q7.c
+++ b/libraries/cmsis/dsp/Source/BasicMathFunctions/arm_abs_q7.c
@@ -0,0 +1,180 @@
 /* ----------------------------------------------------------------------
 * Project:      CMSIS DSP Library
 * Title:        arm_abs_q7.c
 * Description:  Q7 vector absolute value
 *
 * $Date:        18. March 2019
 * $Revision:    V1.6.0
 *
 * Target Processor: Cortex-M cores
 * -------------------------------------------------------------------- */
 /*
 * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
 *
 * SPDX-License-Identifier: Apache-2.0
 *
 * Licensed under the Apache License, Version 2.0 (the License); you may
 * not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an AS IS BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 #include "arm_math.h"
 /**
  @ingroup groupMath
 */
 /**
  @addtogroup BasicAbs
  @{
 */
 /**
  @brief         Q7 vector absolute value.
  @param[in]     pSrc       points to the input vector
  @param[out]    pDst       points to the output vector
  @param[in]     blockSize  number of samples in each vector
  @return        none
  @par           Conditions for optimum performance
                    Input and output buffers should be 32-bit aligned.
  @par           Scaling and Overflow Behavior
                   The function uses saturating arithmetic.
                   The Q7 value -1 (0x80) will be saturated to the maximum allowable positive value 0x7F.
 */
 #if defined(ARM_MATH_MVEI)
 #include "arm_helium_utils.h"
 /*
  * Helium (MVE) variant: processes sixteen Q7 samples per vector iteration,
  * then handles the remaining 1 to 15 samples with a tail-predicated load and store.
  */
 void arm_abs_q7(
     const q7_t * pSrc,
     q7_t * pDst,
     uint32_t blockSize)
 {
     uint32_t  blkCnt;           /* loop counters */
     q7x16_t vecSrc;
     /* Compute 16 outputs at a time */
     blkCnt = blockSize >> 4;
     while (blkCnt > 0U)
     {
         /*
          * C = |A|
          * Calculate absolute and then store the results in the destination buffer.
          */
         vecSrc = vld1q(pSrc);
         vst1q(pDst, vqabsq(vecSrc));
         /*
          * Decrement the blockSize loop counter
          */
         blkCnt--;
         /*
          * advance vector source and destination pointers
          */
         pSrc += 16;
         pDst += 16;
     }
     /*
      * tail: 1 to 15 samples left
      */
     blkCnt = blockSize & 0xF;
     if (blkCnt > 0U)
     {
         mve_pred16_t p0 = vctp8q(blkCnt);
         /* Predicated (zeroing) load: only the active lanes are fetched, so no
            element beyond pSrc[blockSize - 1] is accessed. */
         vecSrc = vld1q_z_s8(pSrc, p0);
         vstrbq_p(pDst, vqabsq(vecSrc), p0);
     }
 }
 #else
 /* Saturating negation of one Q7 sample (0x80 becomes 0x7f). */
 #if defined (ARM_MATH_DSP)
 #define ARM_ABS_Q7_NEGATE(x)   ((q7_t)__QSUB8(0, (x)))
 #else
 #define ARM_ABS_Q7_NEGATE(x)   (((x) == (q7_t) 0x80) ? (q7_t) 0x7f : -(x))
 #endif
 /*
  * Plain C variant of the Q7 absolute value.
  * With ARM_MATH_LOOPUNROLL the first loop consumes groups of four samples;
  * the final loop finishes whatever is left.
  */
 void arm_abs_q7(
   const q7_t * pSrc,
         q7_t * pDst,
         uint32_t blockSize)
 {
   uint32_t remaining;                  /* Samples left for the current loop */
   q7_t sample;                         /* Current input value */
 #if defined (ARM_MATH_LOOPUNROLL)
   /* Unrolled: four samples per pass */
   for (remaining = blockSize >> 2U; remaining > 0U; remaining--)
   {
     /* C = |A|, positive inputs pass through, the rest are negated with saturation */
     sample = *pSrc++;
     *pDst++ = (sample > 0) ? sample : ARM_ABS_Q7_NEGATE(sample);
     sample = *pSrc++;
     *pDst++ = (sample > 0) ? sample : ARM_ABS_Q7_NEGATE(sample);
     sample = *pSrc++;
     *pDst++ = (sample > 0) ? sample : ARM_ABS_Q7_NEGATE(sample);
     sample = *pSrc++;
     *pDst++ = (sample > 0) ? sample : ARM_ABS_Q7_NEGATE(sample);
   }
   /* Samples that did not fill a group of four */
   remaining = blockSize % 0x4U;
 #else
   /* No unrolling: every sample goes through the loop below */
   remaining = blockSize;
 #endif /* #if defined (ARM_MATH_LOOPUNROLL) */
   for (; remaining > 0U; remaining--)
   {
     /* C = |A| */
     sample = *pSrc++;
     *pDst++ = (sample > 0) ? sample : ARM_ABS_Q7_NEGATE(sample);
   }
 }
 #undef ARM_ABS_Q7_NEGATE
 #endif /* defined(ARM_MATH_MVEI) */
 /**
  @} end of BasicAbs group
 */
--- a/libraries/cmsis/dsp/Source/BasicMathFunctions/arm_add_f32.c
+++ b/libraries/cmsis/dsp/Source/BasicMathFunctions/arm_add_f32.c
@@ -0,0 +1,199 @@
 /* ----------------------------------------------------------------------
 * Project:      CMSIS DSP Library
 * Title:        arm_add_f32.c
 * Description:  Floating-point vector addition
 *
 * $Date:        18. March 2019
 * $Revision:    V1.6.0
 *
 * Target Processor: Cortex-M cores
 * -------------------------------------------------------------------- */
 /*
 * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
 *
 * SPDX-License-Identifier: Apache-2.0
 *
 * Licensed under the Apache License, Version 2.0 (the License); you may
 * not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an AS IS BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 #include "arm_math.h"
 /**
  @ingroup groupMath
 */
 /**
  @defgroup BasicAdd Vector Addition
  Element-by-element addition of two vectors.
  <pre>
      pDst[n] = pSrcA[n] + pSrcB[n],   0 <= n < blockSize.
  </pre>
  There are separate functions for floating-point, Q7, Q15, and Q31 data types.
 */
 /**
  @addtogroup BasicAdd
  @{
 */
 /**
  @brief         Floating-point vector addition.
  @param[in]     pSrcA      points to first input vector
  @param[in]     pSrcB      points to second input vector
  @param[out]    pDst       points to output vector
  @param[in]     blockSize  number of samples in each vector
  @return        none
 */
 #if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE)
 #include "arm_helium_utils.h"
 /*
  * Helium (MVE) variant: adds four floats per vector iteration, then handles
  * the remaining 1 to 3 samples with tail-predicated loads and a predicated store.
  */
 void arm_add_f32(
   const float32_t * pSrcA,
   const float32_t * pSrcB,
         float32_t * pDst,
         uint32_t blockSize)
 {
     uint32_t blkCnt;                               /* Loop counter */
     f32x4_t vec1;
     f32x4_t vec2;
     f32x4_t res;
     /* Compute 4 outputs at a time */
     blkCnt = blockSize >> 2U;
     while (blkCnt > 0U)
     {
         /* C = A + B */
         /* Add and then store the results in the destination buffer. */
         vec1 = vld1q(pSrcA);
         vec2 = vld1q(pSrcB);
         res = vaddq(vec1, vec2);
         vst1q(pDst, res);
         /* Increment pointers */
         pSrcA += 4;
         pSrcB += 4;
         pDst += 4;
         /* Decrement the loop counter */
         blkCnt--;
     }
     /* Tail: 1 to 3 samples left */
     blkCnt = blockSize & 0x3;
     if (blkCnt > 0U)
     {
       /* C = A + B */
       mve_pred16_t p0 = vctp32q(blkCnt);
       /* Predicated (zeroing) loads: only the active lanes are fetched, so no
          element beyond index blockSize - 1 of either source is accessed. */
       vec1 = vld1q_z_f32(pSrcA, p0);
       vec2 = vld1q_z_f32(pSrcB, p0);
       vstrwq_p(pDst, vaddq(vec1,vec2), p0);
     }
 }
 #else
 /*
  * Neon / plain C variant of the floating-point vector addition.
  * A block loop (Neon or 4x unrolled, depending on the build macros) consumes
  * groups of four samples; the final loop finishes whatever is left.
  */
 void arm_add_f32(
   const float32_t * pSrcA,
   const float32_t * pSrcB,
         float32_t * pDst,
         uint32_t blockSize)
 {
   uint32_t remaining;                  /* Samples left for the current loop */
 #if defined(ARM_MATH_NEON) && !defined(ARM_MATH_AUTOVECTORIZE)
   f32x4_t lanesA;                      /* Four samples of the first operand */
   f32x4_t lanesB;                      /* Four samples of the second operand */
   /* Neon: four sums per pass */
   for (remaining = blockSize >> 2U; remaining > 0U; remaining--)
   {
     /* C = A + B */
     lanesA = vld1q_f32(pSrcA);
     lanesB = vld1q_f32(pSrcB);
     vst1q_f32(pDst, vaddq_f32(lanesA, lanesB));
     pSrcA += 4;
     pSrcB += 4;
     pDst += 4;
   }
   /* Samples that did not fill a whole vector */
   remaining = blockSize & 0x3;
 #elif defined (ARM_MATH_LOOPUNROLL) && !defined(ARM_MATH_AUTOVECTORIZE)
   /* Unrolled: four sums per pass, operands read before each result is written */
   for (remaining = blockSize >> 2U; remaining > 0U; remaining--)
   {
     /* C = A + B */
     pDst[0] = pSrcA[0] + pSrcB[0];
     pDst[1] = pSrcA[1] + pSrcB[1];
     pDst[2] = pSrcA[2] + pSrcB[2];
     pDst[3] = pSrcA[3] + pSrcB[3];
     pSrcA += 4;
     pSrcB += 4;
     pDst += 4;
   }
   /* Samples that did not fill a group of four */
   remaining = blockSize % 0x4U;
 #else
   /* No block loop: every sample goes through the loop below */
   remaining = blockSize;
 #endif /* Neon / loop unrolling selection */
   for (; remaining > 0U; remaining--)
   {
     /* C = A + B */
     *pDst++ = (*pSrcA++) + (*pSrcB++);
   }
 }
 #endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
 /**
  @} end of BasicAdd group
 */
--- a/libraries/cmsis/dsp/Source/BasicMathFunctions/arm_add_q15.c
+++ b/libraries/cmsis/dsp/Source/BasicMathFunctions/arm_add_q15.c
@@ -0,0 +1,176 @@
 /* ----------------------------------------------------------------------
 * Project:      CMSIS DSP Library
 * Title:        arm_add_q15.c
 * Description:  Q15 vector addition
 *
 * $Date:        18. March 2019
 * $Revision:    V1.6.0
 *
 * Target Processor: Cortex-M cores
 * -------------------------------------------------------------------- */
 /*
 * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
 *
 * SPDX-License-Identifier: Apache-2.0
 *
 * Licensed under the Apache License, Version 2.0 (the License); you may
 * not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an AS IS BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 #include "arm_math.h"
 /**
  @ingroup groupMath
 */
 /**
  @addtogroup BasicAdd
  @{
 */
 /**
  @brief         Q15 vector addition.
  @param[in]     pSrcA      points to the first input vector
  @param[in]     pSrcB      points to the second input vector
  @param[out]    pDst       points to the output vector
  @param[in]     blockSize  number of samples in each vector
  @return        none
  @par           Scaling and Overflow Behavior
                   The function uses saturating arithmetic.
                   Results outside of the allowable Q15 range [0x8000 0x7FFF] are saturated.
 */
 #if defined(ARM_MATH_MVEI)
 #include "arm_helium_utils.h"
 /*
  * Helium (MVE) variant: adds eight Q15 samples per vector iteration with
  * saturation, then handles the remaining 1 to 7 samples with tail-predicated
  * loads and a predicated store.
  */
 void arm_add_q15(
     const q15_t * pSrcA,
     const q15_t * pSrcB,
     q15_t * pDst,
     uint32_t blockSize)
 {
     uint32_t  blkCnt;           /* loop counters */
     q15x8_t vecA;
     q15x8_t vecB;
     /* Compute 8 outputs at a time */
     blkCnt = blockSize >> 3;
     while (blkCnt > 0U)
     {
         /*
          * C = A + B
          * Add and then store the results in the destination buffer.
          */
         vecA = vld1q(pSrcA);
         vecB = vld1q(pSrcB);
         vst1q(pDst, vqaddq(vecA, vecB));
         /*
          * Decrement the blockSize loop counter
          */
         blkCnt--;
         /*
          * advance vector source and destination pointers
          */
         pSrcA  += 8;
         pSrcB  += 8;
         pDst   += 8;
     }
     /*
      * tail: 1 to 7 samples left
      */
     blkCnt = blockSize & 7;
     if (blkCnt > 0U)
     {
         mve_pred16_t p0 = vctp16q(blkCnt);
         /* Predicated (zeroing) loads: only the active lanes are fetched, so no
            element beyond index blockSize - 1 of either source is accessed. */
         vecA = vld1q_z_s16(pSrcA, p0);
         vecB = vld1q_z_s16(pSrcB, p0);
         vstrhq_p(pDst, vqaddq(vecA, vecB), p0);
     }
 }
 #else
 /*
  * Plain C variant of the saturating Q15 vector addition.
  * With ARM_MATH_LOOPUNROLL the first loop consumes groups of four samples
  * (as two packed pairs when ARM_MATH_DSP is defined); the final loop
  * finishes whatever is left one sample at a time.
  */
 void arm_add_q15(
   const q15_t * pSrcA,
   const q15_t * pSrcB,
         q15_t * pDst,
         uint32_t blockSize)
 {
   uint32_t remaining;                  /* Samples left for the current loop */
 #if defined (ARM_MATH_LOOPUNROLL)
   /* Unrolled: four sums per pass */
   for (remaining = blockSize >> 2U; remaining > 0U; remaining--)
   {
     /* C = A + B */
 #if defined (ARM_MATH_DSP)
     /* Two packed sample pairs from source A, then two from source B */
     q31_t pairA0 = read_q15x2_ia ((q15_t **) &pSrcA);
     q31_t pairA1 = read_q15x2_ia ((q15_t **) &pSrcA);
     q31_t pairB0 = read_q15x2_ia ((q15_t **) &pSrcB);
     q31_t pairB1 = read_q15x2_ia ((q15_t **) &pSrcB);
     /* Saturating add of each pair, stored two samples at a time */
     write_q15x2_ia (&pDst, __QADD16(pairA0, pairB0));
     write_q15x2_ia (&pDst, __QADD16(pairA1, pairB1));
 #else
     /* Widen to 32 bits, add, then saturate back to 16 bits */
     *pDst++ = (q15_t) __SSAT(((q31_t) *pSrcA++ + *pSrcB++), 16);
     *pDst++ = (q15_t) __SSAT(((q31_t) *pSrcA++ + *pSrcB++), 16);
     *pDst++ = (q15_t) __SSAT(((q31_t) *pSrcA++ + *pSrcB++), 16);
     *pDst++ = (q15_t) __SSAT(((q31_t) *pSrcA++ + *pSrcB++), 16);
 #endif /* #if defined (ARM_MATH_DSP) */
   }
   /* Samples that did not fill a group of four */
   remaining = blockSize % 0x4U;
 #else
   /* No unrolling: every sample goes through the loop below */
   remaining = blockSize;
 #endif /* #if defined (ARM_MATH_LOOPUNROLL) */
   for (; remaining > 0U; remaining--)
   {
     /* C = A + B */
 #if defined (ARM_MATH_DSP)
     *pDst++ = (q15_t) __QADD16(*pSrcA++, *pSrcB++);
 #else
     *pDst++ = (q15_t) __SSAT(((q31_t) *pSrcA++ + *pSrcB++), 16);
 #endif
   }
 }
 #endif /* defined(ARM_MATH_MVEI) */
 /**
  @} end of BasicAdd group
 */
--- a/libraries/cmsis/dsp/Source/BasicMathFunctions/arm_add_q31.c
+++ b/libraries/cmsis/dsp/Source/BasicMathFunctions/arm_add_q31.c
@@ -0,0 +1,159 @@
 /* ----------------------------------------------------------------------
 * Project:      CMSIS DSP Library
 * Title:        arm_add_q31.c
 * Description:  Q31 vector addition
 *
 * $Date:        18. March 2019
 * $Revision:    V1.6.0
 *
 * Target Processor: Cortex-M cores
 * -------------------------------------------------------------------- */
 /*
 * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
 *
 * SPDX-License-Identifier: Apache-2.0
 *
 * Licensed under the Apache License, Version 2.0 (the License); you may
 * not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an AS IS BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 #include "arm_math.h"
 /**
  @ingroup groupMath
 */
 /**
  @addtogroup BasicAdd
  @{
 */
 /**
  @brief         Q31 vector addition.
  @param[in]     pSrcA      points to the first input vector
  @param[in]     pSrcB      points to the second input vector
  @param[out]    pDst       points to the output vector
  @param[in]     blockSize  number of samples in each vector
  @return        none
  @par           Scaling and Overflow Behavior
                   The function uses saturating arithmetic.
                   Results outside of the allowable Q31 range [0x80000000 0x7FFFFFFF] are saturated.
 */
 #if defined(ARM_MATH_MVEI)
 #include "arm_helium_utils.h"
 /*
  * Helium (MVE) variant: adds four Q31 samples per vector iteration with
  * saturation, then handles the remaining 1 to 3 samples with tail-predicated
  * loads and a predicated store.
  */
 void arm_add_q31(
   const q31_t * pSrcA,
   const q31_t * pSrcB,
         q31_t * pDst,
         uint32_t blockSize)
 {
     uint32_t blkCnt;            /* Loop counter */
     q31x4_t vecA;
     q31x4_t vecB;
     /* Compute 4 outputs at a time */
     blkCnt = blockSize >> 2;
     while (blkCnt > 0U)
     {
         /*
          * C = A + B
          * Add and then store the results in the destination buffer.
          */
         vecA = vld1q(pSrcA);
         vecB = vld1q(pSrcB);
         vst1q(pDst, vqaddq(vecA, vecB));
         /*
          * Decrement the blockSize loop counter
          */
         blkCnt--;
         /*
          * advance vector source and destination pointers
          */
         pSrcA  += 4;
         pSrcB  += 4;
         pDst   += 4;
     }
     /*
      * tail: 1 to 3 samples left
      */
     blkCnt = blockSize & 3;
     if (blkCnt > 0U)
     {
         mve_pred16_t p0 = vctp32q(blkCnt);
         /* Predicated (zeroing) loads: only the active lanes are fetched, so no
            element beyond index blockSize - 1 of either source is accessed. */
         vecA = vld1q_z_s32(pSrcA, p0);
         vecB = vld1q_z_s32(pSrcB, p0);
         vstrwq_p(pDst, vqaddq(vecA, vecB), p0);
     }
 }
 #else
 /*
  * Plain C variant of the saturating Q31 vector addition.
  * With ARM_MATH_LOOPUNROLL the first loop consumes groups of four samples;
  * the final loop finishes whatever is left.
  */
 void arm_add_q31(
   const q31_t * pSrcA,
   const q31_t * pSrcB,
         q31_t * pDst,
         uint32_t blockSize)
 {
   uint32_t remaining;                  /* Samples left for the current loop */
 #if defined (ARM_MATH_LOOPUNROLL)
   /* Unrolled: four saturating sums per pass */
   for (remaining = blockSize >> 2U; remaining > 0U; remaining--)
   {
     /* C = A + B */
     *pDst++ = __QADD(*pSrcA++, *pSrcB++);
     *pDst++ = __QADD(*pSrcA++, *pSrcB++);
     *pDst++ = __QADD(*pSrcA++, *pSrcB++);
     *pDst++ = __QADD(*pSrcA++, *pSrcB++);
   }
   /* Samples that did not fill a group of four */
   remaining = blockSize % 0x4U;
 #else
   /* No unrolling: every sample goes through the loop below */
   remaining = blockSize;
 #endif /* #if defined (ARM_MATH_LOOPUNROLL) */
   for (; remaining > 0U; remaining--)
   {
     /* C = A + B */
     *pDst++ = __QADD(*pSrcA++, *pSrcB++);
   }
 }
 #endif /* defined(ARM_MATH_MVEI) */
 /**
  @} end of BasicAdd group
 */
--- a/libraries/cmsis/dsp/Source/BasicMathFunctions/arm_add_q7.c
+++ b/libraries/cmsis/dsp/Source/BasicMathFunctions/arm_add_q7.c
@@ -0,0 +1,158 @@
 /* ----------------------------------------------------------------------
 * Project:      CMSIS DSP Library
 * Title:        arm_add_q7.c
 * Description:  Q7 vector addition
 *
 * $Date:        18. March 2019
 * $Revision:    V1.6.0
 *
 * Target Processor: Cortex-M cores
 * -------------------------------------------------------------------- */
 /*
 * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
 *
 * SPDX-License-Identifier: Apache-2.0
 *
 * Licensed under the Apache License, Version 2.0 (the License); you may
 * not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an AS IS BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 #include "arm_math.h"
 /**
  @ingroup groupMath
 */
 /**
  @addtogroup BasicAdd
  @{
 */
 /**
  @brief         Q7 vector addition.
  @param[in]     pSrcA      points to the first input vector
  @param[in]     pSrcB      points to the second input vector
  @param[out]    pDst       points to the output vector
  @param[in]     blockSize  number of samples in each vector
  @return        none
  @par           Scaling and Overflow Behavior
                   The function uses saturating arithmetic.
                   Results outside of the allowable Q7 range [0x80 0x7F] are saturated.
 */
 #if defined(ARM_MATH_MVEI)
 #include "arm_helium_utils.h"
 /*
  * Helium (MVE) variant of the Q7 vector addition.
  * Full vectors of 16 samples are added with vqaddq; the remaining
  * (blockSize & 0xF) samples are handled by one predicated pass.
  */
 void arm_add_q7(
     const q7_t * pSrcA,
     const q7_t * pSrcB,
     q7_t * pDst,
     uint32_t blockSize)
 {
     uint32_t  blkCnt;           /* loop counters */
     q7x16_t vecA;
     q7x16_t vecB;
     /* Compute 16 outputs at a time */
     blkCnt = blockSize >> 4;
     while (blkCnt > 0U)
     {
         /*
          * C = A + B
          * Add and then store the results in the destination buffer.
          */
         vecA = vld1q(pSrcA);
         vecB = vld1q(pSrcB);
         vst1q(pDst, vqaddq(vecA, vecB));
         /*
          * Decrement the blockSize loop counter
          */
         blkCnt--;
         /*
          * advance vector source and destination pointers
          */
         pSrcA  += 16;
         pSrcB  += 16;
         pDst   += 16;
     }
     /*
      * tail
      */
     blkCnt = blockSize & 0xF;
     if (blkCnt > 0U)
     {
         mve_pred16_t p0 = vctp8q(blkCnt);
         /*
          * Load under the same predicate as the store so that no byte
          * beyond the last valid sample of either input is read.
          */
         vecA = vld1q_z(pSrcA, p0);
         vecB = vld1q_z(pSrcB, p0);
         vstrbq_p(pDst, vqaddq(vecA, vecB), p0);
     }
 }
 #else
 /*
  * Generic (non-MVE) Q7 vector addition. Each output is the sum of the two
  * inputs passed through __SSAT(..., 8), or through __QADD8 on packed words
  * when ARM_MATH_DSP is available in the unrolled section.
  */
 void arm_add_q7(
   const q7_t * pSrcA,
   const q7_t * pSrcB,
         q7_t * pDst,
         uint32_t blockSize)
 {
   uint32_t remaining = blockSize;   /* Samples not yet processed */
 #if defined (ARM_MATH_LOOPUNROLL)
   uint32_t quads;                   /* Complete groups of four samples */
   /* Unrolled section: every pass produces four outputs. */
   for (quads = blockSize >> 2U; quads != 0U; quads--)
   {
 #if defined (ARM_MATH_DSP)
     /* Four samples of each input are fetched as one word, added, and stored;
        the helper calls advance the three pointers themselves. */
     write_q7x4_ia (&pDst, __QADD8 (read_q7x4_ia ((q7_t **) &pSrcA), read_q7x4_ia ((q7_t **) &pSrcB)));
 #else
     /* Widen to q15_t before adding so the sum can be clamped to 8 bits. */
     pDst[0] = (q7_t) __SSAT ((q15_t) pSrcA[0] + pSrcB[0], 8);
     pDst[1] = (q7_t) __SSAT ((q15_t) pSrcA[1] + pSrcB[1], 8);
     pDst[2] = (q7_t) __SSAT ((q15_t) pSrcA[2] + pSrcB[2], 8);
     pDst[3] = (q7_t) __SSAT ((q15_t) pSrcA[3] + pSrcB[3], 8);
     pSrcA += 4;
     pSrcB += 4;
     pDst  += 4;
 #endif
   }
   /* The 0 to 3 samples left over are handled by the loop below. */
   remaining = blockSize & 0x3U;
 #endif /* #if defined (ARM_MATH_LOOPUNROLL) */
   /* Sample-by-sample section: C = A + B */
   for (; remaining != 0U; remaining--)
   {
     *pDst++ = (q7_t) __SSAT((q15_t) *pSrcA++ + *pSrcB++, 8);
   }
 }
 #endif /* defined(ARM_MATH_MVEI) */
 /**
  @} end of BasicAdd group
 */
--- a/libraries/cmsis/dsp/Source/BasicMathFunctions/arm_and_u16.c
+++ b/libraries/cmsis/dsp/Source/BasicMathFunctions/arm_and_u16.c
@@ -0,0 +1,137 @@
 /* ----------------------------------------------------------------------
 * Project:      CMSIS DSP Library
 * Title:        arm_and_u16.c
 * Description:  uint16_t bitwise AND
 *
 * $Date:        14 November 2019
 * $Revision:    V1.6.0
 *
 * Target Processor: Cortex-M cores
 * -------------------------------------------------------------------- */
 /*
 * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
 *
 * SPDX-License-Identifier: Apache-2.0
 *
 * Licensed under the Apache License, Version 2.0 (the License); you may
 * not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an AS IS BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 #include "arm_math.h"
 /**
  @ingroup groupMath
 */
 /**
  @defgroup And Vector bitwise AND
  Compute the logical bitwise AND.
  There are separate functions for uint32_t, uint16_t, and uint8_t data types.
 */
 /**
  @addtogroup And
  @{
 */
 /**
  @brief         Compute the logical bitwise AND of two fixed-point vectors.
  @param[in]     pSrcA      points to input vector A
  @param[in]     pSrcB      points to input vector B
  @param[out]    pDst       points to output vector
  @param[in]     blockSize  number of samples in each vector
  @return        none
 */
 /*
  * Element-wise AND of two uint16_t vectors: pDst[i] = pSrcA[i] & pSrcB[i]
  * for i in [0, blockSize). Three code paths are selected at compile time:
  * Helium (MVE), Neon, or a plain scalar loop.
  */
 void arm_and_u16(
     const uint16_t * pSrcA,
     const uint16_t * pSrcB,
           uint16_t * pDst,
           uint32_t blockSize)
 {
     uint32_t blkCnt;      /* Loop counter */
 #if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
     /* Unsigned vectors, matching the uint16_t data and vandq_u16 */
     uint16x8_t vecSrcA, vecSrcB;
     /* Compute 8 outputs at a time */
     blkCnt = blockSize >> 3;
     while (blkCnt > 0U)
     {
         vecSrcA = vld1q(pSrcA);
         vecSrcB = vld1q(pSrcB);
         vst1q(pDst, vandq_u16(vecSrcA, vecSrcB) );
         pSrcA += 8;
         pSrcB += 8;
         pDst  += 8;
         /* Decrement the loop counter */
         blkCnt--;
     }
     /* Tail */
     blkCnt = blockSize & 7;
     if (blkCnt > 0U)
     {
         mve_pred16_t p0 = vctp16q(blkCnt);
         /* Predicated loads: do not read past the last valid sample */
         vecSrcA = vld1q_z(pSrcA, p0);
         vecSrcB = vld1q_z(pSrcB, p0);
         vstrhq_p(pDst, vandq_u16(vecSrcA, vecSrcB), p0);
     }
 #else
 #if defined(ARM_MATH_NEON) && !defined(ARM_MATH_AUTOVECTORIZE)
     uint16x8_t vecA, vecB;
     /* Compute 8 outputs at a time */
     blkCnt = blockSize >> 3U;
     while (blkCnt > 0U)
     {
         vecA = vld1q_u16(pSrcA);
         vecB = vld1q_u16(pSrcB);
         vst1q_u16(pDst, vandq_u16(vecA, vecB) );
         pSrcA += 8;
         pSrcB += 8;
         pDst  += 8;
         /* Decrement the loop counter */
         blkCnt--;
     }
     /* Tail: the scalar loop below finishes the last 0 to 7 samples */
     blkCnt = blockSize & 7;
 #else
     /* Initialize blkCnt with number of samples */
     blkCnt = blockSize;
 #endif
     while (blkCnt > 0U)
     {
         *pDst++ = (*pSrcA++)&(*pSrcB++);
         /* Decrement the loop counter */
         blkCnt--;
     }
 #endif /* if defined(ARM_MATH_MVEI) */
 }
 /**
  @} end of And group
 */
--- a/libraries/cmsis/dsp/Source/BasicMathFunctions/arm_and_u32.c
+++ b/libraries/cmsis/dsp/Source/BasicMathFunctions/arm_and_u32.c
@@ -0,0 +1,129 @@
 /* ----------------------------------------------------------------------
 * Project:      CMSIS DSP Library
 * Title:        arm_and_u32.c
 * Description:  uint32_t bitwise AND
 *
 * $Date:        14 November 2019
 * $Revision:    V1.6.0
 *
 * Target Processor: Cortex-M cores
 * -------------------------------------------------------------------- */
 /*
 * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
 *
 * SPDX-License-Identifier: Apache-2.0
 *
 * Licensed under the Apache License, Version 2.0 (the License); you may
 * not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an AS IS BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 #include "arm_math.h"
 /**
  @ingroup groupMath
 */
 /**
  @addtogroup And
  @{
 */
 /**
  @brief         Compute the logical bitwise AND of two fixed-point vectors.
  @param[in]     pSrcA      points to input vector A
  @param[in]     pSrcB      points to input vector B
  @param[out]    pDst       points to output vector
  @param[in]     blockSize  number of samples in each vector
  @return        none
 */
 /*
  * Element-wise AND of two uint32_t vectors: pDst[i] = pSrcA[i] & pSrcB[i]
  * for i in [0, blockSize). Three code paths are selected at compile time:
  * Helium (MVE), Neon, or a plain scalar loop.
  */
 void arm_and_u32(
     const uint32_t * pSrcA,
     const uint32_t * pSrcB,
           uint32_t * pDst,
           uint32_t blockSize)
 {
     uint32_t blkCnt;      /* Loop counter */
 #if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
     /* Unsigned vectors, matching the uint32_t data and vandq_u32 */
     uint32x4_t vecSrcA, vecSrcB;
     /* Compute 4 outputs at a time */
     blkCnt = blockSize >> 2;
     while (blkCnt > 0U)
     {
         vecSrcA = vld1q(pSrcA);
         vecSrcB = vld1q(pSrcB);
         vst1q(pDst, vandq_u32(vecSrcA, vecSrcB) );
         pSrcA += 4;
         pSrcB += 4;
         pDst  += 4;
         /* Decrement the loop counter */
         blkCnt--;
     }
     /* Tail */
     blkCnt = blockSize & 3;
     if (blkCnt > 0U)
     {
         mve_pred16_t p0 = vctp32q(blkCnt);
         /* Predicated loads: do not read past the last valid sample */
         vecSrcA = vld1q_z(pSrcA, p0);
         vecSrcB = vld1q_z(pSrcB, p0);
         vstrwq_p(pDst, vandq_u32(vecSrcA, vecSrcB), p0);
     }
 #else
 #if defined(ARM_MATH_NEON) && !defined(ARM_MATH_AUTOVECTORIZE)
     uint32x4_t vecA, vecB;
     /* Compute 4 outputs at a time */
     blkCnt = blockSize >> 2U;
     while (blkCnt > 0U)
     {
         vecA = vld1q_u32(pSrcA);
         vecB = vld1q_u32(pSrcB);
         vst1q_u32(pDst, vandq_u32(vecA, vecB) );
         pSrcA += 4;
         pSrcB += 4;
         pDst  += 4;
         /* Decrement the loop counter */
         blkCnt--;
     }
     /* Tail: the scalar loop below finishes the last 0 to 3 samples */
     blkCnt = blockSize & 3;
 #else
     /* Initialize blkCnt with number of samples */
     blkCnt = blockSize;
 #endif
     while (blkCnt > 0U)
     {
         *pDst++ = (*pSrcA++)&(*pSrcB++);
         /* Decrement the loop counter */
         blkCnt--;
     }
 #endif /* if defined(ARM_MATH_MVEI) */
 }
 /**
  @} end of And group
 */
--- a/libraries/cmsis/dsp/Source/BasicMathFunctions/arm_and_u8.c
+++ b/libraries/cmsis/dsp/Source/BasicMathFunctions/arm_and_u8.c
@@ -0,0 +1,130 @@
 /* ----------------------------------------------------------------------
 * Project:      CMSIS DSP Library
 * Title:        arm_and_u8.c
 * Description:  uint8_t bitwise AND
 *
 * $Date:        14 November 2019
 * $Revision:    V1.6.0
 *
 * Target Processor: Cortex-M cores
 * -------------------------------------------------------------------- */
 /*
 * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
 *
 * SPDX-License-Identifier: Apache-2.0
 *
 * Licensed under the Apache License, Version 2.0 (the License); you may
 * not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an AS IS BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 #include "arm_math.h"
 /**
  @ingroup groupMath
 */
 /**
  @addtogroup And
  @{
 */
 /**
  @brief         Compute the logical bitwise AND of two fixed-point vectors.
  @param[in]     pSrcA      points to input vector A
  @param[in]     pSrcB      points to input vector B
  @param[out]    pDst       points to output vector
  @param[in]     blockSize  number of samples in each vector
  @return        none
 */
 /*
  * Element-wise AND of two uint8_t vectors: pDst[i] = pSrcA[i] & pSrcB[i]
  * for i in [0, blockSize). Three code paths are selected at compile time:
  * Helium (MVE), Neon, or a plain scalar loop.
  */
 void arm_and_u8(
     const uint8_t * pSrcA,
     const uint8_t * pSrcB,
           uint8_t * pDst,
           uint32_t blockSize)
 {
     uint32_t blkCnt;      /* Loop counter */
 #if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
     /* Unsigned vectors, matching the uint8_t data and vandq_u8 */
     uint8x16_t vecSrcA, vecSrcB;
     /* Compute 16 outputs at a time */
     blkCnt = blockSize >> 4;
     while (blkCnt > 0U)
     {
         vecSrcA = vld1q(pSrcA);
         vecSrcB = vld1q(pSrcB);
         vst1q(pDst, vandq_u8(vecSrcA, vecSrcB) );
         pSrcA += 16;
         pSrcB += 16;
         pDst  += 16;
         /* Decrement the loop counter */
         blkCnt--;
     }
     /* Tail */
     blkCnt = blockSize & 0xF;
     if (blkCnt > 0U)
     {
         mve_pred16_t p0 = vctp8q(blkCnt);
         /* Predicated loads: do not read past the last valid sample */
         vecSrcA = vld1q_z(pSrcA, p0);
         vecSrcB = vld1q_z(pSrcB, p0);
         vstrbq_p(pDst, vandq_u8(vecSrcA, vecSrcB), p0);
     }
 #else
 #if defined(ARM_MATH_NEON) && !defined(ARM_MATH_AUTOVECTORIZE)
     uint8x16_t vecA, vecB;
     /* Compute 16 outputs at a time */
     blkCnt = blockSize >> 4U;
     while (blkCnt > 0U)
     {
         vecA = vld1q_u8(pSrcA);
         vecB = vld1q_u8(pSrcB);
         vst1q_u8(pDst, vandq_u8(vecA, vecB) );
         pSrcA += 16;
         pSrcB += 16;
         pDst  += 16;
         /* Decrement the loop counter */
         blkCnt--;
     }
     /* Tail: the scalar loop below finishes the last 0 to 15 samples */
     blkCnt = blockSize & 0xF;
 #else
     /* Initialize blkCnt with number of samples */
     blkCnt = blockSize;
 #endif
     while (blkCnt > 0U)
     {
         *pDst++ = (*pSrcA++)&(*pSrcB++);
         /* Decrement the loop counter */
         blkCnt--;
     }
 #endif /* if defined(ARM_MATH_MVEI) */
 }
 /**
  @} end of And group
 */
--- a/libraries/cmsis/dsp/Source/BasicMathFunctions/arm_dot_prod_f32.c
+++ b/libraries/cmsis/dsp/Source/BasicMathFunctions/arm_dot_prod_f32.c
@@ -0,0 +1,226 @@
 /* ----------------------------------------------------------------------
 * Project:      CMSIS DSP Library
 * Title:        arm_dot_prod_f32.c
 * Description:  Floating-point dot product
 *
 * $Date:        18. March 2019
 * $Revision:    V1.6.0
 *
 * Target Processor: Cortex-M cores
 * -------------------------------------------------------------------- */
 /*
 * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
 *
 * SPDX-License-Identifier: Apache-2.0
 *
 * Licensed under the Apache License, Version 2.0 (the License); you may
 * not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an AS IS BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 #include "arm_math.h"
 /**
  @ingroup groupMath
 */
 /**
  @defgroup BasicDotProd Vector Dot Product
  Computes the dot product of two vectors.
  The vectors are multiplied element-by-element and then summed.
  <pre>
      sum = pSrcA[0]*pSrcB[0] + pSrcA[1]*pSrcB[1] + ... + pSrcA[blockSize-1]*pSrcB[blockSize-1]
  </pre>
  There are separate functions for floating-point, Q7, Q15, and Q31 data types.
 */
 /**
  @addtogroup BasicDotProd
  @{
 */
 /**
  @brief         Dot product of floating-point vectors.
  @param[in]     pSrcA      points to the first input vector.
  @param[in]     pSrcB      points to the second input vector.
  @param[in]     blockSize  number of samples in each vector.
  @param[out]    result     output result returned here.
  @return        none
 */
 #if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE)
 #include "arm_helium_utils.h"
 /*
  * Helium (MVE) variant of the floating-point dot product.
  * Products are accumulated lane-wise in a 4-lane vector with vfmaq and
  * reduced to one value by vecAddAcrossF32Mve() at the end.
  */
 void arm_dot_prod_f32(
     const float32_t * pSrcA,
     const float32_t * pSrcB,
     uint32_t    blockSize,
     float32_t * result)
 {
     f32x4_t vecA, vecB;
     f32x4_t vecSum;
     uint32_t blkCnt;
     vecSum = vdupq_n_f32(0.0f);
     /* Compute 4 outputs at a time */
     blkCnt = blockSize >> 2U;
     while (blkCnt > 0U)
     {
         /*
          * C = A[0]* B[0] + A[1]* B[1] + A[2]* B[2] + .....+ A[blockSize-1]* B[blockSize-1]
          * Accumulate the lane-wise products and advance the source pointers.
          */
         vecA = vld1q(pSrcA);
         pSrcA += 4;
         vecB = vld1q(pSrcB);
         pSrcB += 4;
         vecSum = vfmaq(vecSum, vecA, vecB);
         /*
          * Decrement the blockSize loop counter
          */
         blkCnt --;
     }
     /* Tail: the last 1 to 3 samples, if any */
     blkCnt = blockSize & 3;
     if (blkCnt > 0U)
     {
         mve_pred16_t p0 = vctp32q(blkCnt);
         /*
          * Load under the same predicate as the accumulate so that nothing
          * beyond the last valid sample of either input is read.
          */
         vecA = vld1q_z(pSrcA, p0);
         vecB = vld1q_z(pSrcB, p0);
         vecSum = vfmaq_m(vecSum, vecA, vecB, p0);
     }
     /* Reduce the lanes and store result in destination buffer */
     *result = vecAddAcrossF32Mve(vecSum);
 }
 #else
 /*
  * Generic (non-MVE) floating-point dot product:
  *   *result = pSrcA[0]*pSrcB[0] + ... + pSrcA[blockSize-1]*pSrcB[blockSize-1]
  * A Neon path, an unrolled scalar path, or a plain scalar loop is selected
  * at compile time; all of them finish in the shared scalar loop at the end.
  */
 void arm_dot_prod_f32(
   const float32_t * pSrcA,
   const float32_t * pSrcB,
         uint32_t blockSize,
         float32_t * result)
 {
         uint32_t blkCnt;                               /* Loop counter */
         float32_t sum = 0.0f;                          /* Temporary return variable */
 #if defined(ARM_MATH_NEON) && !defined(ARM_MATH_AUTOVECTORIZE)
     f32x4_t vec1;
     f32x4_t vec2;
     f32x4_t accum = vdupq_n_f32(0);
     f32x2_t tmp = vdup_n_f32(0);
     /* Compute 4 outputs at a time */
     blkCnt = blockSize >> 2U;
     while (blkCnt > 0U)
     {
         /* C = A[0]*B[0] + A[1]*B[1] + A[2]*B[2] + ... + A[blockSize-1]*B[blockSize-1] */
         /* Load inside the loop only: a group of four is read exactly when it
            is going to be consumed, so no access is made past the inputs. */
         vec1 = vld1q_f32(pSrcA);
         vec2 = vld1q_f32(pSrcB);
         /* Accumulate the four lane-wise products. */
         accum = vmlaq_f32(accum, vec1, vec2);
         /* Increment pointers */
         pSrcA += 4;
         pSrcB += 4;
         /* Decrement the loop counter */
         blkCnt--;
     }
     /* Collapse the four partial sums into the scalar accumulator. */
 #if __aarch64__
     sum = vpadds_f32(vpadd_f32(vget_low_f32(accum), vget_high_f32(accum)));
 #else
     tmp = vpadd_f32(vget_low_f32(accum), vget_high_f32(accum));
     sum = vget_lane_f32(tmp, 0) + vget_lane_f32(tmp, 1);
 #endif
     /* Tail */
     blkCnt = blockSize & 0x3;
 #else
 #if defined (ARM_MATH_LOOPUNROLL) && !defined(ARM_MATH_AUTOVECTORIZE)
   /* Loop unrolling: Compute 4 outputs at a time */
   blkCnt = blockSize >> 2U;
   /* First part of the processing with loop unrolling. Compute 4 outputs at a time.
      A second loop below computes the remaining 1 to 3 samples. */
   while (blkCnt > 0U)
   {
     /* C = A[0]* B[0] + A[1]* B[1] + A[2]* B[2] + .....+ A[blockSize-1]* B[blockSize-1] */
     /* Calculate dot product and store result in a temporary buffer. */
     sum += (*pSrcA++) * (*pSrcB++);
     sum += (*pSrcA++) * (*pSrcB++);
     sum += (*pSrcA++) * (*pSrcB++);
     sum += (*pSrcA++) * (*pSrcB++);
     /* Decrement loop counter */
     blkCnt--;
   }
   /* Loop unrolling: Compute remaining outputs */
   blkCnt = blockSize % 0x4U;
 #else
   /* Initialize blkCnt with number of samples */
   blkCnt = blockSize;
 #endif /* #if defined (ARM_MATH_LOOPUNROLL) */
 #endif /* #if defined(ARM_MATH_NEON) */
   while (blkCnt > 0U)
   {
     /* C = A[0]* B[0] + A[1]* B[1] + A[2]* B[2] + .....+ A[blockSize-1]* B[blockSize-1] */
     /* Calculate dot product and store result in a temporary buffer. */
     sum += (*pSrcA++) * (*pSrcB++);
     /* Decrement loop counter */
     blkCnt--;
   }
   /* Store result in destination buffer */
   *result = sum;
 }
 #endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
 /**
  @} end of BasicDotProd group
 */
--- a/libraries/cmsis/dsp/Source/BasicMathFunctions/arm_dot_prod_q15.c
+++ b/libraries/cmsis/dsp/Source/BasicMathFunctions/arm_dot_prod_q15.c
@@ -0,0 +1,172 @@
 /* ----------------------------------------------------------------------
 * Project:      CMSIS DSP Library
 * Title:        arm_dot_prod_q15.c
 * Description:  Q15 dot product
 *
 * $Date:        18. March 2019
 * $Revision:    V1.6.0
 *
 * Target Processor: Cortex-M cores
 * -------------------------------------------------------------------- */
 /*
 * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
 *
 * SPDX-License-Identifier: Apache-2.0
 *
 * Licensed under the Apache License, Version 2.0 (the License); you may
 * not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an AS IS BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 #include "arm_math.h"
 /**
  @ingroup groupMath
 */
 /**
  @addtogroup BasicDotProd
  @{
 */
 /**
  @brief         Dot product of Q15 vectors.
  @param[in]     pSrcA      points to the first input vector
  @param[in]     pSrcB      points to the second input vector
  @param[in]     blockSize  number of samples in each vector
  @param[out]    result     output result returned here
  @return        none
  @par           Scaling and Overflow Behavior
                   The intermediate multiplications are in 1.15 x 1.15 = 2.30 format and these
                   results are added to a 64-bit accumulator in 34.30 format.
                   Nonsaturating additions are used and given that there are 33 guard bits in the accumulator
                   there is no risk of overflow.
                   The return result is in 34.30 format.
 */
 #if defined(ARM_MATH_MVEI)
 #include "arm_helium_utils.h"
 /*
  * Helium (MVE) variant of the Q15 dot product.
  * Eight products per step are accumulated into a 64-bit sum with
  * vmlaldavaq; the remaining (blockSize & 7) samples use one predicated step.
  */
 void arm_dot_prod_q15(
     const q15_t * pSrcA,
     const q15_t * pSrcB,
     uint32_t blockSize,
     q63_t * result)
 {
     uint32_t  blkCnt;           /* loop counters */
     q15x8_t vecA;
     q15x8_t vecB;
     q63_t     sum = 0LL;
     /* Compute 8 outputs at a time */
     blkCnt = blockSize >> 3;
     while (blkCnt > 0U)
     {
         /*
          * C = A[0]* B[0] + A[1]* B[1] + A[2]* B[2] + .....+ A[blockSize-1]* B[blockSize-1]
          * Multiply lane-wise and add all products to the running sum.
          */
         vecA = vld1q(pSrcA);
         vecB = vld1q(pSrcB);
         sum = vmlaldavaq(sum, vecA, vecB);
         /*
          * Decrement the blockSize loop counter
          */
         blkCnt--;
         /*
          * advance vector source pointers
          */
         pSrcA += 8;
         pSrcB += 8;
     }
     /*
      * tail
      */
     blkCnt = blockSize & 7;
     if (blkCnt > 0U)
     {
         mve_pred16_t p0 = vctp16q(blkCnt);
         /*
          * Load under the same predicate as the accumulate so that nothing
          * beyond the last valid sample of either input is read.
          */
         vecA = vld1q_z(pSrcA, p0);
         vecB = vld1q_z(pSrcB, p0);
         sum = vmlaldavaq_p(sum, vecA, vecB, p0);
     }
     *result = sum;
 }
 #else
 /*
  * Generic (non-MVE) Q15 dot product. Every product is formed in q31_t,
  * widened to q63_t and added to a 64-bit accumulator that is written to
  * *result unchanged.
  */
 void arm_dot_prod_q15(
   const q15_t * pSrcA,
   const q15_t * pSrcB,
         uint32_t blockSize,
         q63_t * result)
 {
   q63_t acc = 0;                    /* 64-bit running sum of products */
   uint32_t remaining = blockSize;   /* Samples not yet consumed */
 #if defined (ARM_MATH_LOOPUNROLL)
   uint32_t quads;                   /* Complete groups of four samples */
   /* Unrolled section: every pass consumes four samples of each input. */
   for (quads = blockSize >> 2U; quads != 0U; quads--)
   {
 #if defined (ARM_MATH_DSP)
     /* Each __SMLALD call takes one read_q15x2_ia word from each input;
        the read helpers advance the source pointers themselves. */
     acc = __SMLALD(read_q15x2_ia ((q15_t **) &pSrcA), read_q15x2_ia ((q15_t **) &pSrcB), acc);
     acc = __SMLALD(read_q15x2_ia ((q15_t **) &pSrcA), read_q15x2_ia ((q15_t **) &pSrcB), acc);
 #else
     acc += (q63_t)((q31_t) pSrcA[0] * pSrcB[0]);
     acc += (q63_t)((q31_t) pSrcA[1] * pSrcB[1]);
     acc += (q63_t)((q31_t) pSrcA[2] * pSrcB[2]);
     acc += (q63_t)((q31_t) pSrcA[3] * pSrcB[3]);
     pSrcA += 4;
     pSrcB += 4;
 #endif
   }
   /* The 0 to 3 samples left over are handled by the loop below. */
   remaining = blockSize & 0x3U;
 #endif /* #if defined (ARM_MATH_LOOPUNROLL) */
   /* Sample-by-sample section */
   for (; remaining != 0U; remaining--)
   {
     acc += (q63_t)((q31_t) *pSrcA++ * *pSrcB++);
   }
   /* Store result in destination buffer in 34.30 format */
   *result = acc;
 }
 #endif /* defined(ARM_MATH_MVEI) */
 /**
  @} end of BasicDotProd group
 */
--- a/libraries/cmsis/dsp/Source/BasicMathFunctions/arm_dot_prod_q31.c
+++ b/libraries/cmsis/dsp/Source/BasicMathFunctions/arm_dot_prod_q31.c
@@ -0,0 +1,174 @@
 /* ----------------------------------------------------------------------
 * Project:      CMSIS DSP Library
 * Title:        arm_dot_prod_q31.c
 * Description:  Q31 dot product
 *
 * $Date:        18. March 2019
 * $Revision:    V1.6.0
 *
 * Target Processor: Cortex-M cores
 * -------------------------------------------------------------------- */
 /*
 * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
 *
 * SPDX-License-Identifier: Apache-2.0
 *
 * Licensed under the Apache License, Version 2.0 (the License); you may
 * not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an AS IS BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 #include "arm_math.h"
 /**
  @ingroup groupMath
 */
 /**
  @addtogroup BasicDotProd
  @{
 */
 /**
  @brief         Dot product of Q31 vectors.
  @param[in]     pSrcA      points to the first input vector.
  @param[in]     pSrcB      points to the second input vector.
  @param[in]     blockSize  number of samples in each vector.
  @param[out]    result     output result returned here.
  @return        none
  @par           Scaling and Overflow Behavior
                   The intermediate multiplications are in 1.31 x 1.31 = 2.62 format and these
                   are truncated to 2.48 format by discarding the lower 14 bits.
                   The 2.48 result is then added without saturation to a 64-bit accumulator in 16.48 format.
                   There are 15 guard bits in the accumulator and there is no risk of overflow as long as
                   the length of the vectors is less than 2^16 elements.
                   The return result is in 16.48 format.
 */
 #if defined(ARM_MATH_MVEI)
 #include "arm_helium_utils.h"
 /*
  * Helium (MVE) variant of the Q31 dot product.
  * Four products per step are accumulated with vrmlaldavhaq; the remaining
  * (blockSize & 3) samples use one predicated step, and the accumulator is
  * rescaled once with asrl() before being stored.
  */
 void arm_dot_prod_q31(
     const q31_t * pSrcA,
     const q31_t * pSrcB,
     uint32_t blockSize,
     q63_t * result)
 {
     uint32_t  blkCnt;           /* loop counters */
     q31x4_t vecA;
     q31x4_t vecB;
     q63_t     sum = 0LL;
     /* Compute 4 outputs at a time */
     blkCnt = blockSize >> 2;
     while (blkCnt > 0U)
     {
         /*
          * C = A[0]* B[0] + A[1]* B[1] + A[2]* B[2] + .....+ A[blockSize-1]* B[blockSize-1]
          * Multiply lane-wise and add all products to the running sum.
          */
         vecA = vld1q(pSrcA);
         vecB = vld1q(pSrcB);
         sum = vrmlaldavhaq(sum, vecA, vecB);
         /*
          * Decrement the blockSize loop counter
          */
         blkCnt--;
         /*
          * advance vector source pointers
          */
         pSrcA += 4;
         pSrcB += 4;
     }
     /*
      * tail
      */
     blkCnt = blockSize & 3;
     if (blkCnt > 0U)
     {
         mve_pred16_t p0 = vctp32q(blkCnt);
         /*
          * Load under the same predicate as the accumulate so that nothing
          * beyond the last valid sample of either input is read.
          */
         vecA = vld1q_z(pSrcA, p0);
         vecB = vld1q_z(pSrcB, p0);
         sum = vrmlaldavhaq_p(sum, vecA, vecB, p0);
     }
     /*
      * vrmlaldavhaq provides extra intermediate accumulator headroom,
      * limiting the need for intermediate scaling.
      * The scalar variant uses a 2.48 accumulator format by right shifting
      * each product by 14.
      * Here the 16.48 output conversion is performed once, outside the loop,
      * by shifting the accumulator right by 6.
      */
     *result = asrl(sum, (14 - 8));
 }
 #else
 /*
  * Generic (non-MVE) Q31 dot product. Every 64-bit product is shifted right
  * by 14 before being added to the 64-bit accumulator written to *result.
  */
 void arm_dot_prod_q31(
   const q31_t * pSrcA,
   const q31_t * pSrcB,
         uint32_t blockSize,
         q63_t * result)
 {
   q63_t acc = 0;                    /* 64-bit running sum of scaled products */
   uint32_t remaining = blockSize;   /* Samples not yet consumed */
 #if defined (ARM_MATH_LOOPUNROLL)
   uint32_t quads;                   /* Complete groups of four samples */
   /* Unrolled section: every pass consumes four samples of each input. */
   for (quads = blockSize >> 2U; quads != 0U; quads--)
   {
     acc += ((q63_t) pSrcA[0] * pSrcB[0]) >> 14U;
     acc += ((q63_t) pSrcA[1] * pSrcB[1]) >> 14U;
     acc += ((q63_t) pSrcA[2] * pSrcB[2]) >> 14U;
     acc += ((q63_t) pSrcA[3] * pSrcB[3]) >> 14U;
     pSrcA += 4;
     pSrcB += 4;
   }
   /* The 0 to 3 samples left over are handled by the loop below. */
   remaining = blockSize & 0x3U;
 #endif /* #if defined (ARM_MATH_LOOPUNROLL) */
   /* Sample-by-sample section */
   for (; remaining != 0U; remaining--)
   {
     acc += ((q63_t) *pSrcA++ * *pSrcB++) >> 14U;
   }
   /* Store result in destination buffer in 16.48 format */
   *result = acc;
 }
 #endif /* defined(ARM_MATH_MVEI) */
 /**
  @} end of BasicDotProd group
 */
--- a/libraries/cmsis/dsp/Source/BasicMathFunctions/arm_dot_prod_q7.c
+++ b/libraries/cmsis/dsp/Source/BasicMathFunctions/arm_dot_prod_q7.c
@@ -0,0 +1,191 @@
 /* ----------------------------------------------------------------------
 * Project:      CMSIS DSP Library
 * Title:        arm_dot_prod_q7.c
 * Description:  Q7 dot product
 *
 * $Date:        18. March 2019
 * $Revision:    V1.6.0
 *
 * Target Processor: Cortex-M cores
 * -------------------------------------------------------------------- */
 /*
 * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
 *
 * SPDX-License-Identifier: Apache-2.0
 *
 * Licensed under the Apache License, Version 2.0 (the License); you may
 * not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an AS IS BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 #include "arm_math.h"
 /**
  @ingroup groupMath
 */
 /**
  @addtogroup BasicDotProd
  @{
 */
 /**
  @brief         Dot product of Q7 vectors.
  @param[in]     pSrcA      points to the first input vector
  @param[in]     pSrcB      points to the second input vector
  @param[in]     blockSize  number of samples in each vector
  @param[out]    result     output result returned here
  @return        none
  @par           Scaling and Overflow Behavior
                   The intermediate multiplications are in 1.7 x 1.7 = 2.14 format and these
                   results are added to an accumulator in 18.14 format.
                   Nonsaturating additions are used and there is no danger of wrap around as long as
                   the vectors are less than 2^18 elements long.
                   The return result is in 18.14 format.
 */
 #if defined(ARM_MATH_MVEI)
 #include "arm_helium_utils.h"
 /*
 * Q7 dot product, variant compiled when ARM_MATH_MVEI is defined.
 * Works on vectors of 16 samples and accumulates into a 32-bit sum that is
 * written to *result.
 */
 void arm_dot_prod_q7(
    const q7_t * pSrcA,
    const q7_t * pSrcB,
    uint32_t blockSize,
    q31_t * result)
 {
    q7x16_t vecA, vecB;         /* vectors loaded from pSrcA / pSrcB */
    q31_t   acc = 0;            /* running dot product */
    uint32_t vecCnt;            /* full 16-sample vectors still to process */
    uint32_t tailCnt;           /* samples left after the full vectors */
    /* Main loop: one 16-sample multiply-accumulate per pass */
    for (vecCnt = blockSize >> 4; vecCnt > 0U; vecCnt--)
    {
        vecA = vld1q(pSrcA);
        vecB = vld1q(pSrcB);
        acc = vmladavaq(acc, vecA, vecB);
        /* step both inputs to the next vector */
        pSrcA += 16;
        pSrcB += 16;
    }
    /* Tail: fewer than 16 samples remain */
    tailCnt = blockSize & 0xF;
    if (tailCnt > 0U)
    {
        /*
         * The predicate is derived from the leftover count and is passed to
         * the multiply-accumulate only; the two loads are not predicated.
         */
        mve_pred16_t p0 = vctp8q(tailCnt);
        vecA = vld1q(pSrcA);
        vecB = vld1q(pSrcB);
        acc = vmladavaq_p(acc, vecA, vecB, p0);
    }
    *result = acc;
 }
 #else
 /*
 * Q7 dot product, variant compiled when ARM_MATH_MVEI is not defined.
 * Products of corresponding samples are summed into a 32-bit accumulator
 * that is written to *result.
 */
 void arm_dot_prod_q7(
  const q7_t * pSrcA,
  const q7_t * pSrcB,
        uint32_t blockSize,
        q31_t * result)
 {
  q31_t acc = 0;                        /* running dot product */
  uint32_t remaining;                   /* iterations still to run in the current loop */
 #if defined (ARM_MATH_LOOPUNROLL)
 #if defined (ARM_MATH_DSP)
  q31_t packedA, packedB;               /* four q7 samples read in one access */
  q31_t aRot, aDirect, bRot, bDirect;   /* sample pairs widened to q15 */
 #endif
  /* Unrolled section: four samples per pass */
  for (remaining = blockSize >> 2U; remaining > 0U; remaining--)
  {
 #if defined (ARM_MATH_DSP)
    /* Fetch four samples from each input in a single read */
    packedA = read_q7x4_ia ((q7_t **) &pSrcA);
    packedB = read_q7x4_ia ((q7_t **) &pSrcB);
    /* Widen two samples of A to q15 via the rotated word, then the remaining two */
    aRot    = __SXTB16(__ROR(packedA, 8));
    aDirect = __SXTB16(packedA);
    /* Same extraction for B */
    bRot    = __SXTB16(__ROR(packedB, 8));
    bDirect = __SXTB16(packedB);
    /* Two dual multiply-accumulates cover all four samples */
    acc = __SMLAD(aRot, bRot, acc);
    acc = __SMLAD(aDirect, bDirect, acc);
 #else
    /* acc += A[k] * B[k], for four consecutive k */
    acc += (q31_t) ((q15_t) *pSrcA++ * *pSrcB++);
    acc += (q31_t) ((q15_t) *pSrcA++ * *pSrcB++);
    acc += (q31_t) ((q15_t) *pSrcA++ * *pSrcB++);
    acc += (q31_t) ((q15_t) *pSrcA++ * *pSrcB++);
 #endif
  }
  /* Samples left over once the groups of four are done */
  remaining = blockSize % 0x4U;
 #else
  /* No unrolling: every sample goes through the loop below */
  remaining = blockSize;
 #endif /* ARM_MATH_LOOPUNROLL */
  for (; remaining > 0U; remaining--)
  {
    /* acc += A[k] * B[k] */
    acc += (q31_t) ((q15_t) *pSrcA++ * *pSrcB++);
  }
  /* Hand the accumulated value back to the caller */
  *result = acc;
 }
 #endif /* defined(ARM_MATH_MVEI) */
 /**
  @} end of BasicDotProd group
 */
--- a/libraries/cmsis/dsp/Source/BasicMathFunctions/arm_mult_f32.c
+++ b/libraries/cmsis/dsp/Source/BasicMathFunctions/arm_mult_f32.c
@@ -0,0 +1,200 @@
 /* ----------------------------------------------------------------------
 * Project:      CMSIS DSP Library
 * Title:        arm_mult_f32.c
 * Description:  Floating-point vector multiplication
 *
 * $Date:        18. March 2019
 * $Revision:    V1.6.0
 *
 * Target Processor: Cortex-M cores
 * -------------------------------------------------------------------- */
 /*
 * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
 *
 * SPDX-License-Identifier: Apache-2.0
 *
 * Licensed under the Apache License, Version 2.0 (the License); you may
 * not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an AS IS BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 #include "arm_math.h"
 /**
  @ingroup groupMath
 */
 /**
  @defgroup BasicMult Vector Multiplication
  Element-by-element multiplication of two vectors.
  <pre>
      pDst[n] = pSrcA[n] * pSrcB[n],   0 <= n < blockSize.
  </pre>
  There are separate functions for floating-point, Q7, Q15, and Q31 data types.
 */
 /**
  @addtogroup BasicMult
  @{
 */
 /**
  @brief         Floating-point vector multiplication.
  @param[in]     pSrcA      points to the first input vector.
  @param[in]     pSrcB      points to the second input vector.
  @param[out]    pDst       points to the output vector.
  @param[in]     blockSize  number of samples in each vector.
  @return        none
 */
 #if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE)
 #include "arm_helium_utils.h"
 /*
 * Floating-point element-wise multiplication, variant compiled when
 * ARM_MATH_MVEF is defined and ARM_MATH_AUTOVECTORIZE is not.
 * Processes four samples per vector operation: pDst[n] = pSrcA[n] * pSrcB[n].
 */
 void arm_mult_f32(
  const float32_t * pSrcA,
  const float32_t * pSrcB,
        float32_t * pDst,
        uint32_t blockSize)
 {
    f32x4_t vecA, vecB;         /* vectors loaded from pSrcA / pSrcB */
    f32x4_t prod;               /* lane-wise product */
    uint32_t vecCnt;            /* full 4-sample vectors still to process */
    uint32_t tailCnt;           /* samples left after the full vectors */
    /* Main loop: C = A * B on four samples per pass */
    for (vecCnt = blockSize >> 2U; vecCnt > 0U; vecCnt--)
    {
        /* Multiply the inputs and store the products in the destination buffer */
        vecA = vld1q(pSrcA);
        vecB = vld1q(pSrcB);
        prod = vmulq(vecA, vecB);
        vst1q(pDst, prod);
        /* step all three buffers to the next vector */
        pSrcA += 4;
        pSrcB += 4;
        pDst += 4;
    }
    /* Tail: fewer than four samples remain */
    tailCnt = blockSize & 0x3;
    if (tailCnt > 0U)
    {
      /* C = A * B; the predicate is applied to the store only, the loads are not predicated */
      mve_pred16_t p0 = vctp32q(tailCnt);
      vecA = vld1q(pSrcA);
      vecB = vld1q(pSrcB);
      vstrwq_p(pDst, vmulq(vecA,vecB), p0);
    }
 }
 #else
 /*
 * Floating-point element-wise multiplication, variant compiled in the #else
 * branch of the ARM_MATH_MVEF check. Depending on the build macros the bulk
 * of the work is done with NEON intrinsics, with a four-way unrolled loop,
 * or entirely by the final scalar loop: pDst[n] = pSrcA[n] * pSrcB[n].
 */
 void arm_mult_f32(
  const float32_t * pSrcA,
  const float32_t * pSrcB,
        float32_t * pDst,
        uint32_t blockSize)
 {
    uint32_t remaining;         /* iterations still to run in the current loop */
 #if defined(ARM_MATH_NEON) && !defined(ARM_MATH_AUTOVECTORIZE)
    /* NEON section: four products per pass */
    for (remaining = blockSize >> 2U; remaining > 0U; remaining--)
    {
        /* C = A * B: load both inputs, multiply lane-wise, store */
        vst1q_f32(pDst, vmulq_f32(vld1q_f32(pSrcA), vld1q_f32(pSrcB)));
        /* step all three buffers to the next group of four */
        pSrcA += 4;
        pSrcB += 4;
        pDst += 4;
    }
    /* Samples left over once the groups of four are done */
    remaining = blockSize & 0x3;
 #else
 #if defined (ARM_MATH_LOOPUNROLL) && !defined(ARM_MATH_AUTOVECTORIZE)
    /* Unrolled section: four products per pass */
    for (remaining = blockSize >> 2U; remaining > 0U; remaining--)
    {
        /* C = A * B */
        *pDst++ = (*pSrcA++) * (*pSrcB++);
        *pDst++ = (*pSrcA++) * (*pSrcB++);
        *pDst++ = (*pSrcA++) * (*pSrcB++);
        *pDst++ = (*pSrcA++) * (*pSrcB++);
    }
    /* Samples left over once the groups of four are done */
    remaining = blockSize % 0x4U;
 #else
    /* No unrolling: every sample goes through the loop below */
    remaining = blockSize;
 #endif /* ARM_MATH_LOOPUNROLL && !ARM_MATH_AUTOVECTORIZE */
 #endif /* ARM_MATH_NEON && !ARM_MATH_AUTOVECTORIZE */
    for (; remaining > 0U; remaining--)
    {
        /* C = A * B */
        *pDst++ = (*pSrcA++) * (*pSrcB++);
    }
 }
 #endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
 /**
  @} end of BasicMult group
 */
--- a/libraries/cmsis/dsp/Source/BasicMathFunctions/arm_mult_q15.c
+++ b/libraries/cmsis/dsp/Source/BasicMathFunctions/arm_mult_q15.c
@@ -0,0 +1,192 @@
 /* ----------------------------------------------------------------------
 * Project:      CMSIS DSP Library
 * Title:        arm_mult_q15.c
 * Description:  Q15 vector multiplication
 *
 * $Date:        18. March 2019
 * $Revision:    V1.6.0
 *
 * Target Processor: Cortex-M cores
 * -------------------------------------------------------------------- */
 /*
 * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
 *
 * SPDX-License-Identifier: Apache-2.0
 *
 * Licensed under the Apache License, Version 2.0 (the License); you may
 * not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an AS IS BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 #include "arm_math.h"
 /**
  @ingroup groupMath
 */
 /**
  @addtogroup BasicMult
  @{
 */
 /**
  @brief         Q15 vector multiplication
  @param[in]     pSrcA      points to first input vector
  @param[in]     pSrcB      points to second input vector
  @param[out]    pDst       points to output vector
  @param[in]     blockSize  number of samples in each vector
  @return        none
  @par           Scaling and Overflow Behavior
                   The function uses saturating arithmetic.
                   Results outside of the allowable Q15 range [0x8000 0x7FFF] are saturated.
 */
 #if defined(ARM_MATH_MVEI)
 #include "arm_helium_utils.h"
 /*
 * Q15 element-wise multiplication, variant compiled when ARM_MATH_MVEI is
 * defined. Works on vectors of eight samples; each product is formed with
 * vqdmulhq and stored to pDst.
 */
 void arm_mult_q15(
    const q15_t * pSrcA,
    const q15_t * pSrcB,
    q15_t * pDst,
    uint32_t blockSize)
 {
    q15x8_t vecA, vecB;         /* vectors loaded from pSrcA / pSrcB */
    uint32_t vecCnt;            /* full 8-sample vectors still to process */
    uint32_t tailCnt;           /* samples left after the full vectors */
    /* Main loop: C = A * B on eight samples per pass */
    for (vecCnt = blockSize >> 3; vecCnt > 0U; vecCnt--)
    {
        vecA = vld1q(pSrcA);
        vecB = vld1q(pSrcB);
        vst1q(pDst, vqdmulhq(vecA, vecB));
        /* step all three buffers to the next vector */
        pSrcA  += 8;
        pSrcB  += 8;
        pDst   += 8;
    }
    /* Tail: fewer than eight samples remain */
    tailCnt = blockSize & 7;
    if (tailCnt > 0U)
    {
        /* The predicate is applied to the store only; the loads are not predicated */
        mve_pred16_t p0 = vctp16q(tailCnt);
        vecA = vld1q(pSrcA);
        vecB = vld1q(pSrcB);
        vstrhq_p(pDst, vqdmulhq(vecA, vecB), p0);
    }
 }
 #else
 /*
 * Q15 element-wise multiplication, variant compiled when ARM_MATH_MVEI is
 * not defined. Each output is (A * B) >> 15, saturated to 16 bits with
 * __SSAT. With ARM_MATH_LOOPUNROLL the bulk is processed four samples per
 * pass; with ARM_MATH_DSP as well, samples are read and written two at a
 * time as packed 32-bit words.
 */
 void arm_mult_q15(
  const q15_t * pSrcA,
  const q15_t * pSrcB,
        q15_t * pDst,
        uint32_t blockSize)
 {
        uint32_t blkCnt;                               /* Loop counter */
 #if defined (ARM_MATH_LOOPUNROLL)
 #if defined (ARM_MATH_DSP)
  q31_t inA1, inA2, inB1, inB2;                  /* Packed input words, two q15 samples each */
  q15_t out1, out2, out3, out4;                  /* Saturated q15 results */
  q31_t mul1, mul2, mul3, mul4;                  /* Full-precision 32-bit products */
 #endif
  /* Loop unrolling: Compute 4 outputs at a time */
  blkCnt = blockSize >> 2U;
  while (blkCnt > 0U)
  {
    /* C = A * B */
 #if defined (ARM_MATH_DSP)
    /* read 2 samples at a time from sourceA (the helper is given the pointer's address) */
    inA1 = read_q15x2_ia ((q15_t **) &pSrcA);
    /* read 2 samples at a time from sourceB */
    inB1 = read_q15x2_ia ((q15_t **) &pSrcB);
    /* read the next 2 samples from sourceA */
    inA2 = read_q15x2_ia ((q15_t **) &pSrcA);
    /* read the next 2 samples from sourceB */
    inB2 = read_q15x2_ia ((q15_t **) &pSrcB);
    /* multiply: mul1/mul3 use the upper halfwords (>> 16), mul2/mul4 the lower halfwords */
    mul1 = (q31_t) ((q15_t) (inA1 >> 16) * (q15_t) (inB1 >> 16));
    mul2 = (q31_t) ((q15_t) (inA1      ) * (q15_t) (inB1      ));
    mul3 = (q31_t) ((q15_t) (inA2 >> 16) * (q15_t) (inB2 >> 16));
    mul4 = (q31_t) ((q15_t) (inA2      ) * (q15_t) (inB2      ));
    /* scale each product down by 15 bits and saturate to 16 bit */
    out1 = (q15_t) __SSAT(mul1 >> 15, 16);
    out2 = (q15_t) __SSAT(mul2 >> 15, 16);
    out3 = (q15_t) __SSAT(mul3 >> 15, 16);
    out4 = (q15_t) __SSAT(mul4 >> 15, 16);
    /* pack two results per word and store; the operand order passed to __PKHBT depends on endianness */
 #ifndef ARM_MATH_BIG_ENDIAN
    write_q15x2_ia (&pDst, __PKHBT(out2, out1, 16));
    write_q15x2_ia (&pDst, __PKHBT(out4, out3, 16));
 #else
    write_q15x2_ia (&pDst, __PKHBT(out1, out2, 16));
    write_q15x2_ia (&pDst, __PKHBT(out3, out4, 16));
 #endif /* #ifndef ARM_MATH_BIG_ENDIAN */
 #else
    /* Without ARM_MATH_DSP: one sample at a time, four times per pass */
    *pDst++ = (q15_t) __SSAT((((q31_t) (*pSrcA++) * (*pSrcB++)) >> 15), 16);
    *pDst++ = (q15_t) __SSAT((((q31_t) (*pSrcA++) * (*pSrcB++)) >> 15), 16);
    *pDst++ = (q15_t) __SSAT((((q31_t) (*pSrcA++) * (*pSrcB++)) >> 15), 16);
    *pDst++ = (q15_t) __SSAT((((q31_t) (*pSrcA++) * (*pSrcB++)) >> 15), 16);
 #endif
    /* Decrement loop counter */
    blkCnt--;
  }
  /* Loop unrolling: Compute remaining outputs */
  blkCnt = blockSize % 0x4U;
 #else
  /* Initialize blkCnt with number of samples */
  blkCnt = blockSize;
 #endif /* #if defined (ARM_MATH_LOOPUNROLL) */
  while (blkCnt > 0U)
  {
    /* C = A * B */
    /* Multiply inputs, scale by 15 bits, saturate to 16 bit and store result in destination buffer. */
    *pDst++ = (q15_t) __SSAT((((q31_t) (*pSrcA++) * (*pSrcB++)) >> 15), 16);
    /* Decrement loop counter */
    blkCnt--;
  }
 }
 #endif /* defined(ARM_MATH_MVEI) */
 /**
  @} end of BasicMult group
 */
--- a/libraries/cmsis/dsp/Source/BasicMathFunctions/arm_mult_q31.c
+++ b/libraries/cmsis/dsp/Source/BasicMathFunctions/arm_mult_q31.c
@@ -0,0 +1,168 @@
 /* ----------------------------------------------------------------------
 * Project:      CMSIS DSP Library
 * Title:        arm_mult_q31.c
 * Description:  Q31 vector multiplication
 *
 * $Date:        18. March 2019
 * $Revision:    V1.6.0
 *
 * Target Processor: Cortex-M cores
 * -------------------------------------------------------------------- */
 /*
 * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
 *
 * SPDX-License-Identifier: Apache-2.0
 *
 * Licensed under the Apache License, Version 2.0 (the License); you may
 * not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an AS IS BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 #include "arm_math.h"
 /**
  @ingroup groupMath
 */
 /**
  @addtogroup BasicMult
  @{
 */
 /**
  @brief         Q31 vector multiplication.
  @param[in]     pSrcA      points to the first input vector.
  @param[in]     pSrcB      points to the second input vector.
  @param[out]    pDst       points to the output vector.
  @param[in]     blockSize  number of samples in each vector.
  @return        none
  @par           Scaling and Overflow Behavior
                   The function uses saturating arithmetic.
                   Results outside of the allowable Q31 range [0x80000000 0x7FFFFFFF] are saturated.
 */
 #if defined(ARM_MATH_MVEI)
 #include "arm_helium_utils.h"
 /*
 * Q31 element-wise multiplication, variant compiled when ARM_MATH_MVEI is
 * defined. Works on vectors of four samples; each product is formed with
 * vqdmulhq and stored to pDst.
 */
 void arm_mult_q31(
    const q31_t * pSrcA,
    const q31_t * pSrcB,
    q31_t * pDst,
    uint32_t blockSize)
 {
    q31x4_t vecA, vecB;         /* vectors loaded from pSrcA / pSrcB */
    uint32_t vecCnt;            /* full 4-sample vectors still to process */
    uint32_t tailCnt;           /* samples left after the full vectors */
    /* Main loop: C = A * B on four samples per pass */
    for (vecCnt = blockSize >> 2; vecCnt > 0U; vecCnt--)
    {
        vecA = vld1q(pSrcA);
        vecB = vld1q(pSrcB);
        vst1q(pDst, vqdmulhq(vecA, vecB));
        /* step all three buffers to the next vector */
        pSrcA  += 4;
        pSrcB  += 4;
        pDst   += 4;
    }
    /* Tail: fewer than four samples remain */
    tailCnt = blockSize & 3;
    if (tailCnt > 0U)
    {
        /* The predicate is applied to the store only; the loads are not predicated */
        mve_pred16_t p0 = vctp32q(tailCnt);
        vecA = vld1q(pSrcA);
        vecB = vld1q(pSrcB);
        vstrwq_p(pDst, vqdmulhq(vecA, vecB), p0);
    }
 }
 #else
 /*
 * Q31 element-wise multiplication, variant compiled when ARM_MATH_MVEI is
 * not defined.
 *
 * For every sample the 64-bit product is shifted right by 32 bits,
 * saturated with __SSAT(..., 31) and then doubled before being stored to
 * pDst.
 *
 * The doubling is performed on the unsigned representation of the value:
 * left-shifting a negative signed integer is undefined behaviour in C,
 * whereas the unsigned shift is always defined. Converting the result back
 * to q31_t is implementation-defined, and yields the same two's-complement
 * bit pattern the signed shift was relied upon to produce.
 */
 void arm_mult_q31(
  const q31_t * pSrcA,
  const q31_t * pSrcB,
        q31_t * pDst,
        uint32_t blockSize)
 {
        uint32_t blkCnt;                               /* Loop counter */
        q31_t out;                                     /* Temporary output variable */
 #if defined (ARM_MATH_LOOPUNROLL)
  /* Loop unrolling: Compute 4 outputs at a time */
  blkCnt = blockSize >> 2U;
  while (blkCnt > 0U)
  {
    /* C = A * B */
    /* Multiply inputs, saturate, double and store result in destination buffer. */
    out = ((q63_t) *pSrcA++ * *pSrcB++) >> 32;
    out = __SSAT(out, 31);
    *pDst++ = (q31_t) ((uint32_t) out << 1U);
    out = ((q63_t) *pSrcA++ * *pSrcB++) >> 32;
    out = __SSAT(out, 31);
    *pDst++ = (q31_t) ((uint32_t) out << 1U);
    out = ((q63_t) *pSrcA++ * *pSrcB++) >> 32;
    out = __SSAT(out, 31);
    *pDst++ = (q31_t) ((uint32_t) out << 1U);
    out = ((q63_t) *pSrcA++ * *pSrcB++) >> 32;
    out = __SSAT(out, 31);
    *pDst++ = (q31_t) ((uint32_t) out << 1U);
    /* Decrement loop counter */
    blkCnt--;
  }
  /* Loop unrolling: Compute remaining outputs */
  blkCnt = blockSize % 0x4U;
 #else
  /* Initialize blkCnt with number of samples */
  blkCnt = blockSize;
 #endif /* #if defined (ARM_MATH_LOOPUNROLL) */
  while (blkCnt > 0U)
  {
    /* C = A * B */
    /* Multiply inputs, saturate, double and store result in destination buffer. */
    out = ((q63_t) *pSrcA++ * *pSrcB++) >> 32;
    out = __SSAT(out, 31);
    /* Shift as unsigned: "out" may be negative, and << on a negative signed value is undefined */
    *pDst++ = (q31_t) ((uint32_t) out << 1U);
    /* Decrement loop counter */
    blkCnt--;
  }
 }
 #endif /* defined(ARM_MATH_MVEI) */
 /**
  @} end of BasicMult group
 */
--- a/libraries/cmsis/dsp/Source/BasicMathFunctions/arm_mult_q7.c
+++ b/libraries/cmsis/dsp/Source/BasicMathFunctions/arm_mult_q7.c
@@ -0,0 +1,168 @@
 /* ----------------------------------------------------------------------
 * Project:      CMSIS DSP Library
 * Title:        arm_mult_q7.c
 * Description:  Q7 vector multiplication
 *
 * $Date:        18. March 2019
 * $Revision:    V1.6.0
 *
 * Target Processor: Cortex-M cores
 * -------------------------------------------------------------------- */
 /*
 * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
 *
 * SPDX-License-Identifier: Apache-2.0
 *
 * Licensed under the Apache License, Version 2.0 (the License); you may
 * not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an AS IS BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 #include "arm_math.h"
 /**
  @ingroup groupMath
 */
 /**
  @addtogroup BasicMult
  @{
 */
 /**
  @brief         Q7 vector multiplication
  @param[in]     pSrcA      points to the first input vector
  @param[in]     pSrcB      points to the second input vector
  @param[out]    pDst       points to the output vector
  @param[in]     blockSize  number of samples in each vector
  @return        none
  @par           Scaling and Overflow Behavior
                   The function uses saturating arithmetic.
                   Results outside of the allowable Q7 range [0x80 0x7F] are saturated.
 */
 #if defined(ARM_MATH_MVEI)
 #include "arm_helium_utils.h"
 /*
 * Q7 element-wise multiplication, variant compiled when ARM_MATH_MVEI is
 * defined. Works on vectors of 16 samples; each product is formed with
 * vqdmulhq and stored to pDst.
 */
 void arm_mult_q7(
    const q7_t * pSrcA,
    const q7_t * pSrcB,
    q7_t * pDst,
    uint32_t blockSize)
 {
    q7x16_t vecA, vecB;         /* vectors loaded from pSrcA / pSrcB */
    uint32_t vecCnt;            /* full 16-sample vectors still to process */
    uint32_t tailCnt;           /* samples left after the full vectors */
    /* Main loop: C = A * B on 16 samples per pass */
    for (vecCnt = blockSize >> 4; vecCnt > 0U; vecCnt--)
    {
        vecA = vld1q(pSrcA);
        vecB = vld1q(pSrcB);
        vst1q(pDst, vqdmulhq(vecA, vecB));
        /* step all three buffers to the next vector */
        pSrcA  += 16;
        pSrcB  += 16;
        pDst   += 16;
    }
    /* Tail: fewer than 16 samples remain */
    tailCnt = blockSize & 0xF;
    if (tailCnt > 0U)
    {
        /* The predicate is applied to the store only; the loads are not predicated */
        mve_pred16_t p0 = vctp8q(tailCnt);
        vecA = vld1q(pSrcA);
        vecB = vld1q(pSrcB);
        vstrbq_p(pDst, vqdmulhq(vecA, vecB), p0);
    }
 }
 #else
 /*
 * Q7 element-wise multiplication, variant compiled when ARM_MATH_MVEI is
 * not defined. Each output is (A * B) >> 7, saturated to 8 bits with
 * __SSAT. With ARM_MATH_LOOPUNROLL the bulk is processed four samples per
 * pass; with ARM_MATH_DSP as well, the four results are packed and stored
 * with a single write.
 */
 void arm_mult_q7(
  const q7_t * pSrcA,
  const q7_t * pSrcB,
        q7_t * pDst,
        uint32_t blockSize)
 {
  uint32_t remaining;           /* iterations still to run in the current loop */
 #if defined (ARM_MATH_LOOPUNROLL)
 #if defined (ARM_MATH_DSP)
  q7_t r0, r1, r2, r3;          /* four saturated results awaiting the packed store */
 #endif
  /* Unrolled section: four products per pass */
  for (remaining = blockSize >> 2U; remaining > 0U; remaining--)
  {
 #if defined (ARM_MATH_DSP)
    /* C = A * B, kept in temporaries first */
    r0 = (q7_t) __SSAT((((q15_t) (*pSrcA++) * (*pSrcB++)) >> 7), 8);
    r1 = (q7_t) __SSAT((((q15_t) (*pSrcA++) * (*pSrcB++)) >> 7), 8);
    r2 = (q7_t) __SSAT((((q15_t) (*pSrcA++) * (*pSrcB++)) >> 7), 8);
    r3 = (q7_t) __SSAT((((q15_t) (*pSrcA++) * (*pSrcB++)) >> 7), 8);
    /* ...then packed into one word and written in a single store */
    write_q7x4_ia (&pDst, __PACKq7(r0, r1, r2, r3));
 #else
    /* C = A * B, stored one sample at a time */
    *pDst++ = (q7_t) __SSAT((((q15_t) (*pSrcA++) * (*pSrcB++)) >> 7), 8);
    *pDst++ = (q7_t) __SSAT((((q15_t) (*pSrcA++) * (*pSrcB++)) >> 7), 8);
    *pDst++ = (q7_t) __SSAT((((q15_t) (*pSrcA++) * (*pSrcB++)) >> 7), 8);
    *pDst++ = (q7_t) __SSAT((((q15_t) (*pSrcA++) * (*pSrcB++)) >> 7), 8);
 #endif
  }
  /* Samples left over once the groups of four are done */
  remaining = blockSize % 0x4U;
 #else
  /* No unrolling: every sample goes through the loop below */
  remaining = blockSize;
 #endif /* ARM_MATH_LOOPUNROLL */
  for (; remaining > 0U; remaining--)
  {
    /* C = A * B */
    *pDst++ = (q7_t) __SSAT((((q15_t) (*pSrcA++) * (*pSrcB++)) >> 7), 8);
  }
 }
 #endif /* defined(ARM_MATH_MVEI) */
 /**
  @} end of BasicMult group
 */
--- a/libraries/cmsis/dsp/Source/BasicMathFunctions/arm_negate_f32.c
+++ b/libraries/cmsis/dsp/Source/BasicMathFunctions/arm_negate_f32.c
@@ -0,0 +1,192 @@
 /* ----------------------------------------------------------------------
 * Project:      CMSIS DSP Library
 * Title:        arm_negate_f32.c
 * Description:  Negates floating-point vectors
 *
 * $Date:        18. March 2019
 * $Revision:    V1.6.0
 *
 * Target Processor: Cortex-M cores
 * -------------------------------------------------------------------- */
 /*
 * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
 *
 * SPDX-License-Identifier: Apache-2.0
 *
 * Licensed under the Apache License, Version 2.0 (the License); you may
 * not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an AS IS BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 #include "arm_math.h"
 /**
  @ingroup groupMath
 */
 /**
  @defgroup BasicNegate Vector Negate
  Negates the elements of a vector.
  <pre>
      pDst[n] = -pSrc[n],   0 <= n < blockSize.
  </pre>
  The functions support in-place computation allowing the source and
  destination pointers to reference the same memory buffer.
  There are separate functions for floating-point, Q7, Q15, and Q31 data types.
 */
 /**
  @addtogroup BasicNegate
  @{
 */
 /**
  @brief         Negates the elements of a floating-point vector.
  @param[in]     pSrc       points to input vector.
  @param[out]    pDst       points to output vector.
  @param[in]     blockSize  number of samples in each vector.
  @return        none
 */
 #if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE)
 #include "arm_helium_utils.h"
 /*
 * Floating-point vector negation, variant compiled when ARM_MATH_MVEF is
 * defined and ARM_MATH_AUTOVECTORIZE is not.
 * Processes four samples per vector operation: pDst[n] = -pSrc[n].
 */
 void arm_negate_f32(
  const float32_t * pSrc,
        float32_t * pDst,
        uint32_t blockSize)
 {
    f32x4_t vecIn;              /* vector loaded from pSrc */
    f32x4_t vecNeg;             /* negated vector */
    uint32_t vecCnt;            /* full 4-sample vectors still to process */
    uint32_t tailCnt;           /* samples left after the full vectors */
    /* Main loop: C = -A on four samples per pass */
    for (vecCnt = blockSize >> 2U; vecCnt > 0U; vecCnt--)
    {
        /* Negate the input and store the result in the destination buffer */
        vecIn = vld1q(pSrc);
        vecNeg = vnegq(vecIn);
        vst1q(pDst, vecNeg);
        /* step both buffers to the next vector */
        pSrc += 4;
        pDst += 4;
    }
    /* Tail: fewer than four samples remain */
    tailCnt = blockSize & 0x3;
    if (tailCnt > 0U)
    {
      /* C = -A; the predicate is applied to the store only, the load is not predicated */
      mve_pred16_t p0 = vctp32q(tailCnt);
      vecIn = vld1q((float32_t const *) pSrc);
      vstrwq_p(pDst, vnegq(vecIn), p0);
    }
 }
 #else
 /*
 * Floating-point vector negation, variant compiled in the #else branch of
 * the ARM_MATH_MVEF check. Depending on the build macros the bulk of the
 * work is done with NEON intrinsics, with a four-way unrolled loop, or
 * entirely by the final scalar loop: pDst[n] = -pSrc[n].
 */
 void arm_negate_f32(
  const float32_t * pSrc,
        float32_t * pDst,
        uint32_t blockSize)
 {
    uint32_t remaining;         /* iterations still to run in the current loop */
 #if defined(ARM_MATH_NEON_EXPERIMENTAL) && !defined(ARM_MATH_AUTOVECTORIZE)
    /* NEON section: four samples per pass */
    for (remaining = blockSize >> 2U; remaining > 0U; remaining--)
    {
        /* C = -A: load, negate lane-wise, store */
        vst1q_f32(pDst, vnegq_f32(vld1q_f32(pSrc)));
        /* step both buffers to the next group of four */
        pSrc += 4;
        pDst += 4;
    }
    /* Samples left over once the groups of four are done */
    remaining = blockSize & 0x3;
 #else
 #if defined (ARM_MATH_LOOPUNROLL) && !defined(ARM_MATH_AUTOVECTORIZE)
    /* Unrolled section: four samples per pass */
    for (remaining = blockSize >> 2U; remaining > 0U; remaining--)
    {
        /* C = -A */
        *pDst++ = -*pSrc++;
        *pDst++ = -*pSrc++;
        *pDst++ = -*pSrc++;
        *pDst++ = -*pSrc++;
    }
    /* Samples left over once the groups of four are done */
    remaining = blockSize % 0x4U;
 #else
    /* No unrolling: every sample goes through the loop below */
    remaining = blockSize;
 #endif /* ARM_MATH_LOOPUNROLL && !ARM_MATH_AUTOVECTORIZE */
 #endif /* ARM_MATH_NEON_EXPERIMENTAL && !ARM_MATH_AUTOVECTORIZE */
    for (; remaining > 0U; remaining--)
    {
        /* C = -A */
        *pDst++ = -*pSrc++;
    }
 }
 #endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
 /**
  @} end of BasicNegate group
 */
--- a/libraries/cmsis/dsp/Source/BasicMathFunctions/arm_negate_q15.c
+++ b/libraries/cmsis/dsp/Source/BasicMathFunctions/arm_negate_q15.c
@@ -0,0 +1,171 @@
 /* ----------------------------------------------------------------------
 * Project:      CMSIS DSP Library
 * Title:        arm_negate_q15.c
 * Description:  Negates Q15 vectors
 *
 * $Date:        18. March 2019
 * $Revision:    V1.6.0
 *
 * Target Processor: Cortex-M cores
 * -------------------------------------------------------------------- */
 /*
 * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
 *
 * SPDX-License-Identifier: Apache-2.0
 *
 * Licensed under the Apache License, Version 2.0 (the License); you may
 * not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an AS IS BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 #include "arm_math.h"
 /**
  @ingroup groupMath
 */
 /**
  @addtogroup BasicNegate
  @{
 */
 /**
  @brief         Negates the elements of a Q15 vector.
  @param[in]     pSrc       points to the input vector.
  @param[out]    pDst       points to the output vector.
  @param[in]     blockSize  number of samples in each vector.
  @return        none
  @par           Conditions for optimum performance
                    Input and output buffers should be aligned on 32-bit boundaries.
  @par           Scaling and Overflow Behavior
                   The function uses saturating arithmetic.
                   The Q15 value -1 (0x8000) is saturated to the maximum allowable positive value 0x7FFF.
 */
 #if defined(ARM_MATH_MVEI)
 #include "arm_helium_utils.h"
 void arm_negate_q15(
     const q15_t  * pSrc,
     q15_t  * pDst,
     uint32_t blockSize)
 {
     /*
      * Helium (MVE) implementation: pDst[n] = saturate(-pSrc[n]).
      * Full vectors of 8 samples are processed first, then the 1..7
      * remaining samples are handled with a lane predicate.
      */
     uint32_t  blkCnt;           /* loop counter */
     q15x8_t vecSrc;
     /* Compute 8 outputs at a time */
     blkCnt = blockSize >> 3;
     while (blkCnt > 0U)
     {
         /*
          * C = -A
          * Negate (saturating) and then store the results in the destination buffer.
          */
         vecSrc = vld1q(pSrc);
         vst1q(pDst, vqnegq(vecSrc));
         /*
          * Decrement the loop counter
          */
         blkCnt--;
         /*
          * advance vector source and destination pointers
          */
         pSrc += 8;
         pDst += 8;
     }
     /*
      * tail
      */
     blkCnt = blockSize & 7;
     if (blkCnt > 0U)
     {
         mve_pred16_t p0 = vctp16q(blkCnt);
         /*
          * Predicated (zeroing) load: only the first blkCnt lanes are fetched,
          * so nothing past the end of the source buffer is accessed.
          */
         vecSrc = vld1q_z(pSrc, p0);
         vstrhq_p(pDst, vqnegq(vecSrc), p0);
     }
 }
 #else
 void arm_negate_q15(
   const q15_t * pSrc,
         q15_t * pDst,
         uint32_t blockSize)
 {
   /*
    * Scalar / DSP-extension implementation: pDst[n] = saturate(-pSrc[n]).
    * 0x8000 cannot be negated in Q15 and is mapped to 0x7FFF instead.
    */
   uint32_t remaining;                            /* Iterations left in the current loop */
   q15_t sample;                                  /* Current input sample */
 #if defined (ARM_MATH_LOOPUNROLL)
 #if defined (ARM_MATH_DSP)
   q31_t packed;                                  /* Two Q15 samples packed in one word */
 #endif
   /* Unrolled section: four samples per iteration */
   for (remaining = blockSize >> 2U; remaining > 0U; remaining--)
   {
 #if defined (ARM_MATH_DSP)
     /* Two packed samples per saturating SIMD subtraction from zero */
     packed = read_q15x2_ia ((q15_t **) &pSrc);
     write_q15x2_ia (&pDst, __QSUB16(0, packed));
     packed = read_q15x2_ia ((q15_t **) &pSrc);
     write_q15x2_ia (&pDst, __QSUB16(0, packed));
 #else
     sample = pSrc[0];
     pDst[0] = (sample == (q15_t) 0x8000) ? (q15_t) 0x7fff : -sample;
     sample = pSrc[1];
     pDst[1] = (sample == (q15_t) 0x8000) ? (q15_t) 0x7fff : -sample;
     sample = pSrc[2];
     pDst[2] = (sample == (q15_t) 0x8000) ? (q15_t) 0x7fff : -sample;
     sample = pSrc[3];
     pDst[3] = (sample == (q15_t) 0x8000) ? (q15_t) 0x7fff : -sample;
     pSrc += 4;
     pDst += 4;
 #endif
   }
   /* Samples left over after the unrolled section */
   remaining = blockSize % 0x4U;
 #else
   /* No unrolling: the loop below handles every sample */
   remaining = blockSize;
 #endif /* #if defined (ARM_MATH_LOOPUNROLL) */
   /* C = -A, one sample at a time */
   for (; remaining > 0U; remaining--)
   {
     sample = *pSrc++;
     *pDst++ = (sample == (q15_t) 0x8000) ? (q15_t) 0x7fff : -sample;
   }
 }
 #endif /* defined(ARM_MATH_MVEI) */
 /**
  @} end of BasicNegate group
 */
--- a/libraries/cmsis/dsp/Source/BasicMathFunctions/arm_negate_q31.c
+++ b/libraries/cmsis/dsp/Source/BasicMathFunctions/arm_negate_q31.c
@@ -0,0 +1,178 @@
 /* ----------------------------------------------------------------------
 * Project:      CMSIS DSP Library
 * Title:        arm_negate_q31.c
 * Description:  Negates Q31 vectors
 *
 * $Date:        18. March 2019
 * $Revision:    V1.6.0
 *
 * Target Processor: Cortex-M cores
 * -------------------------------------------------------------------- */
 /*
 * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
 *
 * SPDX-License-Identifier: Apache-2.0
 *
 * Licensed under the Apache License, Version 2.0 (the License); you may
 * not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an AS IS BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 #include "arm_math.h"
 /**
  @ingroup groupMath
 */
 /**
  @addtogroup BasicNegate
  @{
 */
 /**
  @brief         Negates the elements of a Q31 vector.
  @param[in]     pSrc       points to the input vector.
  @param[out]    pDst       points to the output vector.
  @param[in]     blockSize   number of samples in each vector.
  @return        none
  @par           Scaling and Overflow Behavior
                   The function uses saturating arithmetic.
                   The Q31 value -1 (0x80000000) is saturated to the maximum allowable positive value 0x7FFFFFFF.
 */
 #if defined(ARM_MATH_MVEI)
 #include "arm_helium_utils.h"
 void arm_negate_q31(
     const q31_t * pSrc,
     q31_t * pDst,
     uint32_t blockSize)
 {
     /*
      * Helium (MVE) implementation: pDst[n] = saturate(-pSrc[n]).
      * Full vectors of 4 samples are processed first, then the 1..3
      * remaining samples are handled with a lane predicate.
      */
     uint32_t  blkCnt;           /* loop counter */
     q31x4_t vecSrc;
     /* Compute 4 outputs at a time */
     blkCnt = blockSize >> 2;
     while (blkCnt > 0U)
     {
         /*
          * C = -A
          * Negate (saturating) and then store the results in the destination buffer.
          */
         vecSrc = vld1q(pSrc);
         vst1q(pDst, vqnegq(vecSrc));
         /*
          * Decrement the loop counter
          */
         blkCnt--;
         /*
          * advance vector source and destination pointers
          */
         pSrc += 4;
         pDst += 4;
     }
     /*
      * tail
      */
     blkCnt = blockSize & 3;
     if (blkCnt > 0U)
     {
         mve_pred16_t p0 = vctp32q(blkCnt);
         /*
          * Predicated (zeroing) load: only the first blkCnt lanes are fetched,
          * so nothing past the end of the source buffer is accessed.
          */
         vecSrc = vld1q_z(pSrc, p0);
         vstrwq_p(pDst, vqnegq(vecSrc), p0);
     }
 }
 #else
 void arm_negate_q31(
   const q31_t * pSrc,
         q31_t * pDst,
         uint32_t blockSize)
 {
   /*
    * Scalar / DSP-extension implementation: pDst[n] = saturate(-pSrc[n]).
    * INT32_MIN cannot be negated in Q31 and is mapped to INT32_MAX instead.
    */
   uint32_t remaining;                            /* Iterations left in the current loop */
   q31_t sample;                                  /* Current input sample */
 #if defined (ARM_MATH_LOOPUNROLL)
   /* Unrolled section: four samples per iteration */
   for (remaining = blockSize >> 2U; remaining > 0U; remaining--)
   {
 #if defined (ARM_MATH_DSP)
     /* Saturating subtraction from zero */
     pDst[0] = __QSUB(0, pSrc[0]);
     pDst[1] = __QSUB(0, pSrc[1]);
     pDst[2] = __QSUB(0, pSrc[2]);
     pDst[3] = __QSUB(0, pSrc[3]);
 #else
     sample = pSrc[0];
     pDst[0] = (sample == INT32_MIN) ? INT32_MAX : -sample;
     sample = pSrc[1];
     pDst[1] = (sample == INT32_MIN) ? INT32_MAX : -sample;
     sample = pSrc[2];
     pDst[2] = (sample == INT32_MIN) ? INT32_MAX : -sample;
     sample = pSrc[3];
     pDst[3] = (sample == INT32_MIN) ? INT32_MAX : -sample;
 #endif
     pSrc += 4;
     pDst += 4;
   }
   /* Samples left over after the unrolled section */
   remaining = blockSize % 0x4U;
 #else
   /* No unrolling: the loop below handles every sample */
   remaining = blockSize;
 #endif /* #if defined (ARM_MATH_LOOPUNROLL) */
   /* C = -A, one sample at a time */
   for (; remaining > 0U; remaining--)
   {
     sample = *pSrc++;
 #if defined (ARM_MATH_DSP)
     *pDst++ = __QSUB(0, sample);
 #else
     *pDst++ = (sample == INT32_MIN) ? INT32_MAX : -sample;
 #endif
   }
 }
 #endif /* defined(ARM_MATH_MVEI) */
 /**
  @} end of BasicNegate group
 */
--- a/libraries/cmsis/dsp/Source/BasicMathFunctions/arm_negate_q7.c
+++ b/libraries/cmsis/dsp/Source/BasicMathFunctions/arm_negate_q7.c
@@ -0,0 +1,171 @@
 /* ----------------------------------------------------------------------
 * Project:      CMSIS DSP Library
 * Title:        arm_negate_q7.c
 * Description:  Negates Q7 vectors
 *
 * $Date:        18. March 2019
 * $Revision:    V1.6.0
 *
 * Target Processor: Cortex-M cores
 * -------------------------------------------------------------------- */
 /*
 * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
 *
 * SPDX-License-Identifier: Apache-2.0
 *
 * Licensed under the Apache License, Version 2.0 (the License); you may
 * not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an AS IS BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 #include "arm_math.h"
 /**
  @ingroup groupMath
 */
 /**
  @addtogroup BasicNegate
  @{
 */
 /**
  @brief         Negates the elements of a Q7 vector.
  @param[in]     pSrc       points to the input vector.
  @param[out]    pDst       points to the output vector.
  @param[in]     blockSize   number of samples in each vector.
  @return        none
  @par           Scaling and Overflow Behavior
                   The function uses saturating arithmetic.
                   The Q7 value -1 (0x80) is saturated to the maximum allowable positive value 0x7F.
 */
 #if defined(ARM_MATH_MVEI)
 #include "arm_helium_utils.h"
 void arm_negate_q7(
     const q7_t   * pSrc,
     q7_t   * pDst,
     uint32_t blockSize)
 {
     /*
      * Helium (MVE) implementation: pDst[n] = saturate(-pSrc[n]).
      * Full vectors of 16 samples are processed first, then the 1..15
      * remaining samples are handled with a lane predicate.
      */
     uint32_t  blkCnt;           /* loop counter */
     q7x16_t vecSrc;
     /* Compute 16 outputs at a time */
     blkCnt = blockSize >> 4;
     while (blkCnt > 0U)
     {
         /*
          * C = -A
          * Negate (saturating) and then store the results in the destination buffer.
          */
         vecSrc = vld1q(pSrc);
         vst1q(pDst, vqnegq(vecSrc));
         /*
          * Decrement the loop counter
          */
         blkCnt--;
         /*
          * advance vector source and destination pointers
          */
         pSrc += 16;
         pDst += 16;
     }
     /*
      * tail
      */
     blkCnt = blockSize & 0xF;
     if (blkCnt > 0U)
     {
         mve_pred16_t p0 = vctp8q(blkCnt);
         /*
          * Predicated (zeroing) load: only the first blkCnt lanes are fetched,
          * so nothing past the end of the source buffer is accessed.
          */
         vecSrc = vld1q_z(pSrc, p0);
         vstrbq_p(pDst, vqnegq(vecSrc), p0);
     }
 }
 #else
 void arm_negate_q7(
   const q7_t * pSrc,
         q7_t * pDst,
         uint32_t blockSize)
 {
   /*
    * Scalar / DSP-extension implementation: pDst[n] = saturate(-pSrc[n]).
    * 0x80 cannot be negated in Q7 and is mapped to 0x7F instead.
    */
   uint32_t remaining;                            /* Iterations left in the current loop */
   q7_t sample;                                   /* Current input sample */
 #if defined (ARM_MATH_LOOPUNROLL)
 #if defined (ARM_MATH_DSP)
   q31_t packed;                                  /* Four Q7 samples packed in one word */
 #endif
   /* Unrolled section: four samples per iteration */
   for (remaining = blockSize >> 2U; remaining > 0U; remaining--)
   {
 #if defined (ARM_MATH_DSP)
     /* Four packed samples per saturating SIMD subtraction from zero */
     packed = read_q7x4_ia ((q7_t **) &pSrc);
     write_q7x4_ia (&pDst, __QSUB8(0, packed));
 #else
     sample = pSrc[0];
     pDst[0] = (sample == (q7_t) 0x80) ? (q7_t) 0x7f : -sample;
     sample = pSrc[1];
     pDst[1] = (sample == (q7_t) 0x80) ? (q7_t) 0x7f : -sample;
     sample = pSrc[2];
     pDst[2] = (sample == (q7_t) 0x80) ? (q7_t) 0x7f : -sample;
     sample = pSrc[3];
     pDst[3] = (sample == (q7_t) 0x80) ? (q7_t) 0x7f : -sample;
     pSrc += 4;
     pDst += 4;
 #endif
   }
   /* Samples left over after the unrolled section */
   remaining = blockSize % 0x4U;
 #else
   /* No unrolling: the loop below handles every sample */
   remaining = blockSize;
 #endif /* #if defined (ARM_MATH_LOOPUNROLL) */
   /* C = -A, one sample at a time */
   for (; remaining > 0U; remaining--)
   {
     sample = *pSrc++;
 #if defined (ARM_MATH_DSP)
     *pDst++ = (q7_t) __QSUB8(0, sample);
 #else
     *pDst++ = (sample == (q7_t) 0x80) ? (q7_t) 0x7f : -sample;
 #endif
   }
 }
 #endif /* defined(ARM_MATH_MVEI) */
 /**
  @} end of BasicNegate group
 */
--- a/libraries/cmsis/dsp/Source/BasicMathFunctions/arm_not_u16.c
+++ b/libraries/cmsis/dsp/Source/BasicMathFunctions/arm_not_u16.c
@@ -0,0 +1,130 @@
 /* ----------------------------------------------------------------------
 * Project:      CMSIS DSP Library
 * Title:        arm_not_u16.c
 * Description:  uint16_t bitwise NOT
 *
 * $Date:        14 November 2019
 * $Revision:    V1.6.0
 *
 * Target Processor: Cortex-M cores
 * -------------------------------------------------------------------- */
 /*
 * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
 *
 * SPDX-License-Identifier: Apache-2.0
 *
 * Licensed under the Apache License, Version 2.0 (the License); you may
 * not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an AS IS BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 #include "arm_math.h"
 /**
  @ingroup groupMath
 */
 /**
  @defgroup Not Vector bitwise NOT
  Compute the logical bitwise NOT.
  There are separate functions for uint32_t, uint16_t, and uint8_t data types.
 */
 /**
  @addtogroup Not
  @{
 */
 /**
  @brief         Compute the logical bitwise NOT of a fixed-point vector.
  @param[in]     pSrc       points to input vector
  @param[out]    pDst       points to output vector
  @param[in]     blockSize  number of samples in each vector
  @return        none
 */
 void arm_not_u16(
     const uint16_t * pSrc,
           uint16_t * pDst,
           uint32_t blockSize)
 {
     /*
      * Writes pDst[n] = ~pSrc[n] for 0 <= n < blockSize.
      * A blockSize of 0 leaves pDst untouched.
      */
     uint32_t blkCnt;      /* Loop counter */
 #if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
     /* Unsigned vector type: matches the uint16_t pointers and vmvnq_u16() */
     uint16x8_t vecSrc;
     /* Compute 8 outputs at a time */
     blkCnt = blockSize >> 3;
     while (blkCnt > 0U)
     {
         vecSrc = vld1q(pSrc);
         vst1q(pDst, vmvnq_u16(vecSrc) );
         pSrc += 8;
         pDst += 8;
         /* Decrement the loop counter */
         blkCnt--;
     }
     /* Tail: 1..7 remaining samples, handled with a lane predicate */
     blkCnt = blockSize & 7;
     if (blkCnt > 0U)
     {
         mve_pred16_t p0 = vctp16q(blkCnt);
         /* Predicated (zeroing) load: lanes past the end of pSrc are not fetched */
         vecSrc = vld1q_z(pSrc, p0);
         vstrhq_p(pDst, vmvnq_u16(vecSrc), p0);
     }
 #else
 #if defined(ARM_MATH_NEON) && !defined(ARM_MATH_AUTOVECTORIZE)
     uint16x8_t inV;
     /* Compute 8 outputs at a time */
     blkCnt = blockSize >> 3U;
     while (blkCnt > 0U)
     {
         inV = vld1q_u16(pSrc);
         vst1q_u16(pDst, vmvnq_u16(inV) );
         pSrc += 8;
         pDst += 8;
         /* Decrement the loop counter */
         blkCnt--;
     }
     /* Tail: leftovers go through the scalar loop below */
     blkCnt = blockSize & 7;
 #else
     /* Initialize blkCnt with number of samples */
     blkCnt = blockSize;
 #endif
     while (blkCnt > 0U)
     {
         /* ~ promotes to int; the cast makes the narrowing back to 16 bits explicit */
         *pDst++ = (uint16_t) ~(*pSrc++);
         /* Decrement the loop counter */
         blkCnt--;
     }
 #endif /* if defined(ARM_MATH_MVEI) */
 }
 /**
  @} end of Not group
 */
--- a/libraries/cmsis/dsp/Source/BasicMathFunctions/arm_not_u32.c
+++ b/libraries/cmsis/dsp/Source/BasicMathFunctions/arm_not_u32.c
@@ -0,0 +1,122 @@
 /* ----------------------------------------------------------------------
 * Project:      CMSIS DSP Library
 * Title:        arm_not_u32.c
 * Description:  uint32_t bitwise NOT
 *
 * $Date:        14 November 2019
 * $Revision:    V1.6.0
 *
 * Target Processor: Cortex-M cores
 * -------------------------------------------------------------------- */
 /*
 * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
 *
 * SPDX-License-Identifier: Apache-2.0
 *
 * Licensed under the Apache License, Version 2.0 (the License); you may
 * not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an AS IS BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 #include "arm_math.h"
 /**
  @ingroup groupMath
 */
 /**
  @addtogroup Not
  @{
 */
 /**
  @brief         Compute the logical bitwise NOT of a fixed-point vector.
  @param[in]     pSrc       points to input vector
  @param[out]    pDst       points to output vector
  @param[in]     blockSize  number of samples in each vector
  @return        none
 */
 void arm_not_u32(
     const uint32_t * pSrc,
           uint32_t * pDst,
           uint32_t blockSize)
 {
     /*
      * Writes pDst[n] = ~pSrc[n] for 0 <= n < blockSize.
      * A blockSize of 0 leaves pDst untouched.
      */
     uint32_t blkCnt;      /* Loop counter */
 #if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
     /* Unsigned vector type: matches the uint32_t pointers and vmvnq_u32() */
     uint32x4_t vecSrc;
     /* Compute 4 outputs at a time */
     blkCnt = blockSize >> 2;
     while (blkCnt > 0U)
     {
         vecSrc = vld1q(pSrc);
         vst1q(pDst, vmvnq_u32(vecSrc) );
         pSrc += 4;
         pDst += 4;
         /* Decrement the loop counter */
         blkCnt--;
     }
     /* Tail: 1..3 remaining samples, handled with a lane predicate */
     blkCnt = blockSize & 3;
     if (blkCnt > 0U)
     {
         mve_pred16_t p0 = vctp32q(blkCnt);
         /* Predicated (zeroing) load: lanes past the end of pSrc are not fetched */
         vecSrc = vld1q_z(pSrc, p0);
         vstrwq_p(pDst, vmvnq_u32(vecSrc), p0);
     }
 #else
 #if defined(ARM_MATH_NEON) && !defined(ARM_MATH_AUTOVECTORIZE)
     uint32x4_t inV;
     /* Compute 4 outputs at a time */
     blkCnt = blockSize >> 2U;
     while (blkCnt > 0U)
     {
         inV = vld1q_u32(pSrc);
         vst1q_u32(pDst, vmvnq_u32(inV) );
         pSrc += 4;
         pDst += 4;
         /* Decrement the loop counter */
         blkCnt--;
     }
     /* Tail: leftovers go through the scalar loop below */
     blkCnt = blockSize & 3;
 #else
     /* Initialize blkCnt with number of samples */
     blkCnt = blockSize;
 #endif
     while (blkCnt > 0U)
     {
         *pDst++ = ~(*pSrc++);
         /* Decrement the loop counter */
         blkCnt--;
     }
 #endif /* if defined(ARM_MATH_MVEI) */
 }
 /**
  @} end of Not group
 */
--- a/libraries/cmsis/dsp/Source/BasicMathFunctions/arm_not_u8.c
+++ b/libraries/cmsis/dsp/Source/BasicMathFunctions/arm_not_u8.c
@@ -0,0 +1,122 @@
 /* ----------------------------------------------------------------------
 * Project:      CMSIS DSP Library
 * Title:        arm_not_u8.c
 * Description:  uint8_t bitwise NOT
 *
 * $Date:        14 November 2019
 * $Revision:    V1.6.0
 *
 * Target Processor: Cortex-M cores
 * -------------------------------------------------------------------- */
 /*
 * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
 *
 * SPDX-License-Identifier: Apache-2.0
 *
 * Licensed under the Apache License, Version 2.0 (the License); you may
 * not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an AS IS BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 #include "arm_math.h"
 /**
  @ingroup groupMath
 */
 /**
  @addtogroup Not
  @{
 */
 /**
  @brief         Compute the logical bitwise NOT of a fixed-point vector.
  @param[in]     pSrc       points to input vector
  @param[out]    pDst       points to output vector
  @param[in]     blockSize  number of samples in each vector
  @return        none
 */
 void arm_not_u8(
     const uint8_t * pSrc,
           uint8_t * pDst,
           uint32_t blockSize)
 {
     /*
      * Writes pDst[n] = ~pSrc[n] for 0 <= n < blockSize.
      * A blockSize of 0 leaves pDst untouched.
      */
     uint32_t blkCnt;      /* Loop counter */
 #if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
     /* Unsigned vector type: matches the uint8_t pointers and vmvnq_u8() */
     uint8x16_t vecSrc;
     /* Compute 16 outputs at a time */
     blkCnt = blockSize >> 4;
     while (blkCnt > 0U)
     {
         vecSrc = vld1q(pSrc);
         vst1q(pDst, vmvnq_u8(vecSrc) );
         pSrc += 16;
         pDst += 16;
         /* Decrement the loop counter */
         blkCnt--;
     }
     /* Tail: 1..15 remaining samples, handled with a lane predicate */
     blkCnt = blockSize & 0xF;
     if (blkCnt > 0U)
     {
         mve_pred16_t p0 = vctp8q(blkCnt);
         /* Predicated (zeroing) load: lanes past the end of pSrc are not fetched */
         vecSrc = vld1q_z(pSrc, p0);
         vstrbq_p(pDst, vmvnq_u8(vecSrc), p0);
     }
 #else
 #if defined(ARM_MATH_NEON) && !defined(ARM_MATH_AUTOVECTORIZE)
     uint8x16_t inV;
     /* Compute 16 outputs at a time */
     blkCnt = blockSize >> 4U;
     while (blkCnt > 0U)
     {
         inV = vld1q_u8(pSrc);
         vst1q_u8(pDst, vmvnq_u8(inV) );
         pSrc += 16;
         pDst += 16;
         /* Decrement the loop counter */
         blkCnt--;
     }
     /* Tail: leftovers go through the scalar loop below */
     blkCnt = blockSize & 0xF;
 #else
     /* Initialize blkCnt with number of samples */
     blkCnt = blockSize;
 #endif
     while (blkCnt > 0U)
     {
         /* ~ promotes to int; the cast makes the narrowing back to 8 bits explicit */
         *pDst++ = (uint8_t) ~(*pSrc++);
         /* Decrement the loop counter */
         blkCnt--;
     }
 #endif /* if defined(ARM_MATH_MVEI) */
 }
 /**
  @} end of Not group
 */
--- a/libraries/cmsis/dsp/Source/BasicMathFunctions/arm_offset_f32.c
+++ b/libraries/cmsis/dsp/Source/BasicMathFunctions/arm_offset_f32.c
@@ -0,0 +1,196 @@
 /* ----------------------------------------------------------------------
 * Project:      CMSIS DSP Library
 * Title:        arm_offset_f32.c
 * Description:  Floating-point vector offset
 *
 * $Date:        18. March 2019
 * $Revision:    V1.6.0
 *
 * Target Processor: Cortex-M cores
 * -------------------------------------------------------------------- */
 /*
 * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
 *
 * SPDX-License-Identifier: Apache-2.0
 *
 * Licensed under the Apache License, Version 2.0 (the License); you may
 * not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an AS IS BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 #include "arm_math.h"
 /**
  @ingroup groupMath
 */
 /**
  @defgroup BasicOffset Vector Offset
  Adds a constant offset to each element of a vector.
  <pre>
      pDst[n] = pSrc[n] + offset,   0 <= n < blockSize.
  </pre>
  The functions support in-place computation allowing the source and
  destination pointers to reference the same memory buffer.
  There are separate functions for floating-point, Q7, Q15, and Q31 data types.
 */
 /**
  @addtogroup BasicOffset
  @{
 */
 /**
  @brief         Adds a constant offset to a floating-point vector.
  @param[in]     pSrc       points to the input vector
  @param[in]     offset     is the offset to be added
  @param[out]    pDst       points to the output vector
  @param[in]     blockSize  number of samples in each vector
  @return        none
 */
 #if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE)
 #include "arm_helium_utils.h"
 void arm_offset_f32(
   const float32_t * pSrc,
         float32_t offset,
         float32_t * pDst,
         uint32_t blockSize)
 {
     /*
      * Helium (MVE) implementation: pDst[n] = pSrc[n] + offset.
      * Full vectors of 4 samples are processed first, then the 1..3
      * remaining samples are handled with a lane predicate.
      */
     uint32_t blkCnt;                               /* Loop counter */
     f32x4_t vec1;
     f32x4_t res;
     /* Compute 4 outputs at a time */
     blkCnt = blockSize >> 2U;
     while (blkCnt > 0U)
     {
         /* C = A + offset */
         /* Add offset and then store the results in the destination buffer. */
         vec1 = vld1q(pSrc);
         res = vaddq(vec1,offset);
         vst1q(pDst, res);
         /* Increment pointers */
         pSrc += 4;
         pDst += 4;
         /* Decrement the loop counter */
         blkCnt--;
     }
     /* Tail */
     blkCnt = blockSize & 0x3;
     if (blkCnt > 0U)
     {
         mve_pred16_t p0 = vctp32q(blkCnt);
         /*
          * Predicated (zeroing) load: only the first blkCnt lanes are fetched,
          * so nothing past the end of the source buffer is accessed.
          */
         vec1 = vld1q_z(pSrc, p0);
         vstrwq_p(pDst, vaddq(vec1, offset), p0);
     }
 }
 #else
 void arm_offset_f32(
   const float32_t * pSrc,
         float32_t offset,
         float32_t * pDst,
         uint32_t blockSize)
 {
   /*
    * Writes pDst[n] = pSrc[n] + offset for 0 <= n < blockSize.
    * Each sample is read before its result is stored.
    */
   uint32_t remaining;                            /* Iterations left in the current loop */
 #if defined(ARM_MATH_NEON_EXPERIMENTAL) && !defined(ARM_MATH_AUTOVECTORIZE)
   /* Neon path: add the broadcast offset to four samples per iteration */
   for (remaining = blockSize >> 2U; remaining > 0U; remaining--)
   {
     vst1q_f32(pDst, vaddq_f32(vld1q_f32(pSrc), vdupq_n_f32(offset)));
     pSrc += 4;
     pDst += 4;
   }
   /* Whatever does not fill a whole vector goes through the scalar loop */
   remaining = blockSize & 0x3;
 #elif defined (ARM_MATH_LOOPUNROLL) && !defined(ARM_MATH_AUTOVECTORIZE)
   /* Unrolled scalar path: four samples per iteration */
   for (remaining = blockSize >> 2U; remaining > 0U; remaining--)
   {
     pDst[0] = pSrc[0] + offset;
     pDst[1] = pSrc[1] + offset;
     pDst[2] = pSrc[2] + offset;
     pDst[3] = pSrc[3] + offset;
     pSrc += 4;
     pDst += 4;
   }
   /* Samples left over after the unrolled loop */
   remaining = blockSize % 0x4U;
 #else
   /* Plain scalar path: every sample is handled by the loop below */
   remaining = blockSize;
 #endif
   /* Scalar loop: C = A + offset, one sample at a time */
   for (; remaining > 0U; remaining--)
   {
     *pDst = (*pSrc) + offset;
     pSrc++;
     pDst++;
   }
 }
 #endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
 /**
  @} end of BasicOffset group
 */
--- a/libraries/cmsis/dsp/Source/BasicMathFunctions/arm_offset_q15.c
+++ b/libraries/cmsis/dsp/Source/BasicMathFunctions/arm_offset_q15.c
@@ -0,0 +1,168 @@
 /* ----------------------------------------------------------------------
 * Project:      CMSIS DSP Library
 * Title:        arm_offset_q15.c
 * Description:  Q15 vector offset
 *
 * $Date:        18. March 2019
 * $Revision:    V1.6.0
 *
 * Target Processor: Cortex-M cores
 * -------------------------------------------------------------------- */
 /*
 * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
 *
 * SPDX-License-Identifier: Apache-2.0
 *
 * Licensed under the Apache License, Version 2.0 (the License); you may
 * not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an AS IS BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 #include "arm_math.h"
 /**
  @ingroup groupMath
 */
 /**
  @addtogroup BasicOffset
  @{
 */
 /**
  @brief         Adds a constant offset to a Q15 vector.
  @param[in]     pSrc       points to the input vector
  @param[in]     offset     is the offset to be added
  @param[out]    pDst       points to the output vector
  @param[in]     blockSize  number of samples in each vector
  @return        none
  @par           Scaling and Overflow Behavior
                   The function uses saturating arithmetic.
                   Results outside of the allowable Q15 range [0x8000 0x7FFF] are saturated.
 */
 #if defined(ARM_MATH_MVEI)
 #include "arm_helium_utils.h"
 void arm_offset_q15(
     const q15_t * pSrc,
     q15_t   offset,
     q15_t * pDst,
     uint32_t blockSize)
 {
     /*
      * Helium (MVE) implementation: pDst[n] = saturate(pSrc[n] + offset).
      * Full vectors of 8 samples are processed first, then the 1..7
      * remaining samples are handled with a lane predicate.
      */
     uint32_t  blkCnt;           /* loop counter */
     q15x8_t vecSrc;
     /* Compute 8 outputs at a time */
     blkCnt = blockSize >> 3;
     while (blkCnt > 0U)
     {
         /*
          * C = A + offset
          * Add offset (saturating) and then store the result in the destination buffer.
          */
         vecSrc = vld1q(pSrc);
         vst1q(pDst, vqaddq(vecSrc, offset));
         /*
          * Decrement the loop counter
          */
         blkCnt--;
         /*
          * advance vector source and destination pointers
          */
         pSrc += 8;
         pDst += 8;
     }
     /*
      * tail
      */
     blkCnt = blockSize & 7;
     if (blkCnt > 0U)
     {
         mve_pred16_t p0 = vctp16q(blkCnt);
         /*
          * Predicated (zeroing) load: only the first blkCnt lanes are fetched,
          * so nothing past the end of the source buffer is accessed.
          */
         vecSrc = vld1q_z(pSrc, p0);
         vstrhq_p(pDst, vqaddq(vecSrc, offset), p0);
     }
 }
 #else
 void arm_offset_q15(
   const q15_t * pSrc,
         q15_t offset,
         q15_t * pDst,
         uint32_t blockSize)
 {
   /*
    * Scalar / DSP-extension implementation:
    * pDst[n] = saturate(pSrc[n] + offset), saturated to 16 bits.
    */
   uint32_t remaining;                            /* Iterations left in the current loop */
 #if defined (ARM_MATH_LOOPUNROLL)
 #if defined (ARM_MATH_DSP)
   /* Offset duplicated into both halfwords so two samples are added per SIMD op */
   q31_t offsetPair = __PKHBT(offset, offset, 16);
 #endif
   /* Unrolled section: four samples per iteration */
   for (remaining = blockSize >> 2U; remaining > 0U; remaining--)
   {
 #if defined (ARM_MATH_DSP)
     /* Two packed samples per saturating SIMD addition */
     write_q15x2_ia (&pDst, __QADD16(read_q15x2_ia ((q15_t **) &pSrc), offsetPair));
     write_q15x2_ia (&pDst, __QADD16(read_q15x2_ia ((q15_t **) &pSrc), offsetPair));
 #else
     /* Widen to 32 bits, add, then saturate back to 16 bits */
     pDst[0] = (q15_t) __SSAT(((q31_t) pSrc[0] + offset), 16);
     pDst[1] = (q15_t) __SSAT(((q31_t) pSrc[1] + offset), 16);
     pDst[2] = (q15_t) __SSAT(((q31_t) pSrc[2] + offset), 16);
     pDst[3] = (q15_t) __SSAT(((q31_t) pSrc[3] + offset), 16);
     pSrc += 4;
     pDst += 4;
 #endif
   }
   /* Samples left over after the unrolled section */
   remaining = blockSize % 0x4U;
 #else
   /* No unrolling: the loop below handles every sample */
   remaining = blockSize;
 #endif /* #if defined (ARM_MATH_LOOPUNROLL) */
   /* C = A + offset, one sample at a time */
   for (; remaining > 0U; remaining--)
   {
 #if defined (ARM_MATH_DSP)
     *pDst++ = (q15_t) __QADD16(*pSrc++, offset);
 #else
     *pDst++ = (q15_t) __SSAT(((q31_t) *pSrc++ + offset), 16);
 #endif
   }
 }
 #endif /* defined(ARM_MATH_MVEI) */
 /**
  @} end of BasicOffset group
 */
--- a/libraries/cmsis/dsp/Source/BasicMathFunctions/arm_offset_q31.c
+++ b/libraries/cmsis/dsp/Source/BasicMathFunctions/arm_offset_q31.c
@@ -0,0 +1,175 @@
 /* ----------------------------------------------------------------------
 * Project:      CMSIS DSP Library
 * Title:        arm_offset_q31.c
 * Description:  Q31 vector offset
 *
 * $Date:        18. March 2019
 * $Revision:    V1.6.0
 *
 * Target Processor: Cortex-M cores
 * -------------------------------------------------------------------- */
 /*
 * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
 *
 * SPDX-License-Identifier: Apache-2.0
 *
 * Licensed under the Apache License, Version 2.0 (the License); you may
 * not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an AS IS BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 #include "arm_math.h"
 /**
  @ingroup groupMath
 */
 /**
  @addtogroup BasicOffset
  @{
 */
 /**
  @brief         Adds a constant offset to a Q31 vector.
  @param[in]     pSrc       points to the input vector
  @param[in]     offset     is the offset to be added
  @param[out]    pDst       points to the output vector
  @param[in]     blockSize  number of samples in each vector
  @return        none
  @par           Scaling and Overflow Behavior
                   The function uses saturating arithmetic.
                   Results outside of the allowable Q31 range [0x80000000 0x7FFFFFFF] are saturated.
 */
 #if defined(ARM_MATH_MVEI)
 #include "arm_helium_utils.h"
 void arm_offset_q31(
     const q31_t * pSrc,
     q31_t   offset,
     q31_t * pDst,
     uint32_t blockSize)
 {
     uint32_t  vecCnt;           /* full 4-lane vectors left to process */
     uint32_t  tailCnt;          /* samples left after the full vectors */
     q31x4_t   vecIn;
     /* Main loop: four outputs per iteration, C = A + offset */
     for (vecCnt = blockSize >> 2; vecCnt > 0U; vecCnt--)
     {
         vecIn = vld1q(pSrc);
         vst1q(pDst, vqaddq(vecIn, offset));
         /* step both pointers past the four samples just handled */
         pSrc += 4;
         pDst += 4;
     }
     /* Tail: 1 to 3 leftover samples, stored under the predicate built from the count */
     tailCnt = blockSize & 3;
     if (tailCnt > 0U)
     {
         mve_pred16_t p0 = vctp32q(tailCnt);
         /* NOTE(review): this load is not predicated, only the store is;
            presumably it reads a whole vector from pSrc - confirm that is acceptable */
         vecIn = vld1q(pSrc);
         vstrwq_p(pDst, vqaddq(vecIn, offset), p0);
     }
 }
 #else
 void arm_offset_q31(
   const q31_t * pSrc,
         q31_t offset,
         q31_t * pDst,
         uint32_t blockSize)
 {
   uint32_t count;                                /* Iterations left in the current loop */
 #if defined (ARM_MATH_LOOPUNROLL)
   /* Unrolled pass: four outputs per iteration, C = A + offset */
   for (count = blockSize >> 2U; count > 0U; count--)
   {
 #if defined (ARM_MATH_DSP)
     pDst[0] = __QADD(pSrc[0], offset);
     pDst[1] = __QADD(pSrc[1], offset);
     pDst[2] = __QADD(pSrc[2], offset);
     pDst[3] = __QADD(pSrc[3], offset);
 #else
     /* Add in 64 bits, then clip the sum to the q31 range */
     pDst[0] = (q31_t) clip_q63_to_q31((q63_t) pSrc[0] + offset);
     pDst[1] = (q31_t) clip_q63_to_q31((q63_t) pSrc[1] + offset);
     pDst[2] = (q31_t) clip_q63_to_q31((q63_t) pSrc[2] + offset);
     pDst[3] = (q31_t) clip_q63_to_q31((q63_t) pSrc[3] + offset);
 #endif
     pSrc += 4;
     pDst += 4;
   }
   /* Samples left over after the groups of four */
   count = blockSize % 0x4U;
 #else
   /* No unrolling: every sample goes through the loop below */
   count = blockSize;
 #endif /* #if defined (ARM_MATH_LOOPUNROLL) */
   /* One output per iteration */
   for (; count > 0U; count--)
   {
 #if defined (ARM_MATH_DSP)
     *pDst++ = __QADD(*pSrc++, offset);
 #else
     *pDst++ = (q31_t) clip_q63_to_q31((q63_t) * pSrc++ + offset);
 #endif
   }
 }
 #endif /* defined(ARM_MATH_MVEI) */
 /**
  @} end of BasicOffset group
 */
--- a/libraries/cmsis/dsp/Source/BasicMathFunctions/arm_offset_q7.c
+++ b/libraries/cmsis/dsp/Source/BasicMathFunctions/arm_offset_q7.c
@@ -0,0 +1,162 @@
 /* ----------------------------------------------------------------------
 * Project:      CMSIS DSP Library
 * Title:        arm_offset_q7.c
 * Description:  Q7 vector offset
 *
 * $Date:        18. March 2019
 * $Revision:    V1.6.0
 *
 * Target Processor: Cortex-M cores
 * -------------------------------------------------------------------- */
 /*
 * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
 *
 * SPDX-License-Identifier: Apache-2.0
 *
 * Licensed under the Apache License, Version 2.0 (the License); you may
 * not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an AS IS BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 #include "arm_math.h"
 /**
  @ingroup groupMath
 */
 /**
  @addtogroup BasicOffset
  @{
 */
 /**
  @brief         Adds a constant offset to a Q7 vector.
  @param[in]     pSrc       points to the input vector
  @param[in]     offset     is the offset to be added
  @param[out]    pDst       points to the output vector
  @param[in]     blockSize  number of samples in each vector
  @return        none
  @par           Scaling and Overflow Behavior
                   The function uses saturating arithmetic.
                   Results outside of the allowable Q7 range [0x80 0x7F] are saturated.
 */
 #if defined(ARM_MATH_MVEI)
 #include "arm_helium_utils.h"
 void arm_offset_q7(
     const q7_t * pSrc,
     q7_t   offset,
     q7_t * pDst,
     uint32_t blockSize)
 {
     uint32_t  vecCnt;           /* full 16-lane vectors left to process */
     uint32_t  tailCnt;          /* samples left after the full vectors */
     q7x16_t   vecIn;
     /* Main loop: sixteen outputs per iteration, C = A + offset */
     for (vecCnt = blockSize >> 4; vecCnt > 0U; vecCnt--)
     {
         vecIn = vld1q(pSrc);
         vst1q(pDst, vqaddq(vecIn, offset));
         /* step both pointers past the sixteen samples just handled */
         pSrc += 16;
         pDst += 16;
     }
     /* Tail: 1 to 15 leftover samples, stored under the predicate built from the count */
     tailCnt = blockSize & 0xF;
     if (tailCnt > 0U)
     {
         mve_pred16_t p0 = vctp8q(tailCnt);
         /* NOTE(review): this load is not predicated, only the store is;
            presumably it reads a whole vector from pSrc - confirm that is acceptable */
         vecIn = vld1q(pSrc);
         vstrbq_p(pDst, vqaddq(vecIn, offset), p0);
     }
 }
 #else
 void arm_offset_q7(
   const q7_t * pSrc,
         q7_t offset,
         q7_t * pDst,
         uint32_t blockSize)
 {
   uint32_t count;                                /* Iterations left in the current loop */
 #if defined (ARM_MATH_LOOPUNROLL)
 #if defined (ARM_MATH_DSP)
   /* Offset replicated into all four bytes so one packed add covers four samples */
   q31_t offsetQuad = __PACKq7(offset, offset, offset, offset);
 #endif
   /* Unrolled pass: four outputs per iteration, C = A + offset */
   for (count = blockSize >> 2U; count > 0U; count--)
   {
 #if defined (ARM_MATH_DSP)
     /* Read four packed samples, add the packed offset, write four results */
     write_q7x4_ia (&pDst, __QADD8(read_q7x4_ia ((q7_t **) &pSrc), offsetQuad));
 #else
     /* Add, then saturate the sum to 8 bits */
     pDst[0] = (q7_t) __SSAT(pSrc[0] + offset, 8);
     pDst[1] = (q7_t) __SSAT(pSrc[1] + offset, 8);
     pDst[2] = (q7_t) __SSAT(pSrc[2] + offset, 8);
     pDst[3] = (q7_t) __SSAT(pSrc[3] + offset, 8);
     pSrc += 4;
     pDst += 4;
 #endif
   }
   /* Samples left over after the groups of four */
   count = blockSize % 0x4U;
 #else
   /* No unrolling: every sample goes through the loop below */
   count = blockSize;
 #endif /* #if defined (ARM_MATH_LOOPUNROLL) */
   /* One output per iteration */
   for (; count > 0U; count--)
   {
     *pDst++ = (q7_t) __SSAT((q15_t) *pSrc++ + offset, 8);
   }
 }
 #endif /* defined(ARM_MATH_MVEI) */
 /**
  @} end of BasicOffset group
 */
--- a/libraries/cmsis/dsp/Source/BasicMathFunctions/arm_or_u16.c
+++ b/libraries/cmsis/dsp/Source/BasicMathFunctions/arm_or_u16.c
@@ -0,0 +1,137 @@
 /* ----------------------------------------------------------------------
 * Project:      CMSIS DSP Library
 * Title:        arm_or_u16.c
 * Description:  uint16_t bitwise inclusive OR
 *
 * $Date:        14 November 2019
 * $Revision:    V1.6.0
 *
 * Target Processor: Cortex-M cores
 * -------------------------------------------------------------------- */
 /*
 * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
 *
 * SPDX-License-Identifier: Apache-2.0
 *
 * Licensed under the Apache License, Version 2.0 (the License); you may
 * not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an AS IS BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 #include "arm_math.h"
 /**
  @ingroup groupMath
 */
 /**
  @defgroup Or Vector bitwise inclusive OR
  Compute the logical bitwise OR.
  There are separate functions for uint32_t, uint16_t, and uint8_t data types.
 */
 /**
  @addtogroup Or
  @{
 */
 /**
  @brief         Compute the bitwise inclusive OR of two uint16_t vectors.
  @param[in]     pSrcA      points to input vector A
  @param[in]     pSrcB      points to input vector B
  @param[out]    pDst       points to output vector
  @param[in]     blockSize  number of samples in each vector
  @return        none
 */
 void arm_or_u16(
     const uint16_t * pSrcA,
     const uint16_t * pSrcB,
           uint16_t * pDst,
           uint32_t blockSize)
 {
     /* Writes pSrcA[n] | pSrcB[n] to pDst[n] for each of the blockSize samples. */
     uint32_t blkCnt;      /* Loop counter */
 #if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
     /* Unsigned 16-bit lanes: the element type of the uint16_t pointers being
        loaded, the operand type of vorrq_u16, and the type the Neon branch uses. */
     uint16x8_t vecSrcA, vecSrcB;
     /* Compute 8 outputs at a time */
     blkCnt = blockSize >> 3;
     while (blkCnt > 0U)
     {
         vecSrcA = vld1q(pSrcA);
         vecSrcB = vld1q(pSrcB);
         vst1q(pDst, vorrq_u16(vecSrcA, vecSrcB) );
         pSrcA += 8;
         pSrcB += 8;
         pDst  += 8;
         /* Decrement the loop counter */
         blkCnt--;
     }
     /* Tail: 1 to 7 leftover samples, stored under the predicate built from the count */
     blkCnt = blockSize & 7;
     if (blkCnt > 0U)
     {
         mve_pred16_t p0 = vctp16q(blkCnt);
         vecSrcA = vld1q(pSrcA);
         vecSrcB = vld1q(pSrcB);
         vstrhq_p(pDst, vorrq_u16(vecSrcA, vecSrcB), p0);
     }
 #else
 #if defined(ARM_MATH_NEON) && !defined(ARM_MATH_AUTOVECTORIZE)
     uint16x8_t vecA, vecB;
     /* Compute 8 outputs at a time */
     blkCnt = blockSize >> 3U;
     while (blkCnt > 0U)
     {
         vecA = vld1q_u16(pSrcA);
         vecB = vld1q_u16(pSrcB);
         vst1q_u16(pDst, vorrq_u16(vecA, vecB) );
         pSrcA += 8;
         pSrcB += 8;
         pDst  += 8;
         /* Decrement the loop counter */
         blkCnt--;
     }
     /* Tail: the scalar loop below finishes the last 0 to 7 samples */
     blkCnt = blockSize & 7;
 #else
     /* Initialize blkCnt with number of samples */
     blkCnt = blockSize;
 #endif
     /* Scalar loop: one output per iteration */
     while (blkCnt > 0U)
     {
         *pDst++ = (*pSrcA++)|(*pSrcB++);
         /* Decrement the loop counter */
         blkCnt--;
     }
 #endif /* if defined(ARM_MATH_MVEI) */
 }
 /**
  @} end of Or group
 */
--- a/libraries/cmsis/dsp/Source/BasicMathFunctions/arm_or_u32.c
+++ b/libraries/cmsis/dsp/Source/BasicMathFunctions/arm_or_u32.c
@@ -0,0 +1,128 @@
 /* ----------------------------------------------------------------------
 * Project:      CMSIS DSP Library
 * Title:        arm_or_u32.c
 * Description:  uint32_t bitwise inclusive OR
 *
 * $Date:        14 November 2019
 * $Revision:    V1.6.0
 *
 * Target Processor: Cortex-M cores
 * -------------------------------------------------------------------- */
 /*
 * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
 *
 * SPDX-License-Identifier: Apache-2.0
 *
 * Licensed under the Apache License, Version 2.0 (the License); you may
 * not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an AS IS BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 #include "arm_math.h"
 /**
  @ingroup groupMath
 */
 /**
  @addtogroup Or
  @{
 */
 /**
  @brief         Compute the bitwise inclusive OR of two uint32_t vectors.
  @param[in]     pSrcA      points to input vector A
  @param[in]     pSrcB      points to input vector B
  @param[out]    pDst       points to output vector
  @param[in]     blockSize  number of samples in each vector
  @return        none
 */
 void arm_or_u32(
     const uint32_t * pSrcA,
     const uint32_t * pSrcB,
           uint32_t * pDst,
           uint32_t blockSize)
 {
     /* Writes pSrcA[n] | pSrcB[n] to pDst[n] for each of the blockSize samples. */
     uint32_t blkCnt;      /* Loop counter */
 #if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
     /* Unsigned 32-bit lanes: the element type of the uint32_t pointers being
        loaded, the operand type of vorrq_u32, and the type the Neon branch uses. */
     uint32x4_t vecSrcA, vecSrcB;
     /* Compute 4 outputs at a time */
     blkCnt = blockSize >> 2;
     while (blkCnt > 0U)
     {
         vecSrcA = vld1q(pSrcA);
         vecSrcB = vld1q(pSrcB);
         vst1q(pDst, vorrq_u32(vecSrcA, vecSrcB) );
         pSrcA += 4;
         pSrcB += 4;
         pDst  += 4;
         /* Decrement the loop counter */
         blkCnt--;
     }
     /* Tail: 1 to 3 leftover samples, stored under the predicate built from the count */
     blkCnt = blockSize & 3;
     if (blkCnt > 0U)
     {
         mve_pred16_t p0 = vctp32q(blkCnt);
         vecSrcA = vld1q(pSrcA);
         vecSrcB = vld1q(pSrcB);
         vstrwq_p(pDst, vorrq_u32(vecSrcA, vecSrcB), p0);
     }
 #else
 #if defined(ARM_MATH_NEON) && !defined(ARM_MATH_AUTOVECTORIZE)
     uint32x4_t vecA, vecB;
     /* Compute 4 outputs at a time */
     blkCnt = blockSize >> 2U;
     while (blkCnt > 0U)
     {
         vecA = vld1q_u32(pSrcA);
         vecB = vld1q_u32(pSrcB);
         vst1q_u32(pDst, vorrq_u32(vecA, vecB) );
         pSrcA += 4;
         pSrcB += 4;
         pDst  += 4;
         /* Decrement the loop counter */
         blkCnt--;
     }
     /* Tail: the scalar loop below finishes the last 0 to 3 samples */
     blkCnt = blockSize & 3;
 #else
     /* Initialize blkCnt with number of samples */
     blkCnt = blockSize;
 #endif
     /* Scalar loop: one output per iteration */
     while (blkCnt > 0U)
     {
         *pDst++ = (*pSrcA++)|(*pSrcB++);
         /* Decrement the loop counter */
         blkCnt--;
     }
 #endif /* if defined(ARM_MATH_MVEI) */
 }
 /**
  @} end of Or group
 */
--- a/libraries/cmsis/dsp/Source/BasicMathFunctions/arm_or_u8.c
+++ b/libraries/cmsis/dsp/Source/BasicMathFunctions/arm_or_u8.c
@@ -0,0 +1,128 @@
 /* ----------------------------------------------------------------------
 * Project:      CMSIS DSP Library
 * Title:        arm_or_u8.c
 * Description:  uint8_t bitwise inclusive OR
 *
 * $Date:        14 November 2019
 * $Revision:    V1.6.0
 *
 * Target Processor: Cortex-M cores
 * -------------------------------------------------------------------- */
 /*
 * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
 *
 * SPDX-License-Identifier: Apache-2.0
 *
 * Licensed under the Apache License, Version 2.0 (the License); you may
 * not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an AS IS BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 #include "arm_math.h"
 /**
  @ingroup groupMath
 */
 /**
  @addtogroup Or
  @{
 */
 /**
  @brief         Compute the bitwise inclusive OR of two uint8_t vectors.
  @param[in]     pSrcA      points to input vector A
  @param[in]     pSrcB      points to input vector B
  @param[out]    pDst       points to output vector
  @param[in]     blockSize  number of samples in each vector
  @return        none
 */
 void arm_or_u8(
     const uint8_t * pSrcA,
     const uint8_t * pSrcB,
           uint8_t * pDst,
           uint32_t blockSize)
 {
     /* Writes pSrcA[n] | pSrcB[n] to pDst[n] for each of the blockSize samples. */
     uint32_t blkCnt;      /* Loop counter */
 #if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
     /* Unsigned 8-bit lanes: the element type of the uint8_t pointers being
        loaded, the operand type of vorrq_u8, and the type the Neon branch uses. */
     uint8x16_t vecSrcA, vecSrcB;
     /* Compute 16 outputs at a time */
     blkCnt = blockSize >> 4;
     while (blkCnt > 0U)
     {
         vecSrcA = vld1q(pSrcA);
         vecSrcB = vld1q(pSrcB);
         vst1q(pDst, vorrq_u8(vecSrcA, vecSrcB) );
         pSrcA += 16;
         pSrcB += 16;
         pDst  += 16;
         /* Decrement the loop counter */
         blkCnt--;
     }
     /* Tail: 1 to 15 leftover samples, stored under the predicate built from the count */
     blkCnt = blockSize & 0xF;
     if (blkCnt > 0U)
     {
         mve_pred16_t p0 = vctp8q(blkCnt);
         vecSrcA = vld1q(pSrcA);
         vecSrcB = vld1q(pSrcB);
         vstrbq_p(pDst, vorrq_u8(vecSrcA, vecSrcB), p0);
     }
 #else
 #if defined(ARM_MATH_NEON) && !defined(ARM_MATH_AUTOVECTORIZE)
     uint8x16_t vecA, vecB;
     /* Compute 16 outputs at a time */
     blkCnt = blockSize >> 4U;
     while (blkCnt > 0U)
     {
         vecA = vld1q_u8(pSrcA);
         vecB = vld1q_u8(pSrcB);
         vst1q_u8(pDst, vorrq_u8(vecA, vecB) );
         pSrcA += 16;
         pSrcB += 16;
         pDst  += 16;
         /* Decrement the loop counter */
         blkCnt--;
     }
     /* Tail: the scalar loop below finishes the last 0 to 15 samples */
     blkCnt = blockSize & 0xF;
 #else
     /* Initialize blkCnt with number of samples */
     blkCnt = blockSize;
 #endif
     /* Scalar loop: one output per iteration */
     while (blkCnt > 0U)
     {
         *pDst++ = (*pSrcA++)|(*pSrcB++);
         /* Decrement the loop counter */
         blkCnt--;
     }
 #endif /* if defined(ARM_MATH_MVEI) */
 }
 /**
  @} end of Or group
 */
--- a/libraries/cmsis/dsp/Source/BasicMathFunctions/arm_scale_f32.c
+++ b/libraries/cmsis/dsp/Source/BasicMathFunctions/arm_scale_f32.c
@@ -0,0 +1,216 @@
 /* ----------------------------------------------------------------------
 * Project:      CMSIS DSP Library
 * Title:        arm_scale_f32.c
 * Description:  Multiplies a floating-point vector by a scalar
 *
 * $Date:        18. March 2019
 * $Revision:    V1.6.0
 *
 * Target Processor: Cortex-M cores
 * -------------------------------------------------------------------- */
 /*
 * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
 *
 * SPDX-License-Identifier: Apache-2.0
 *
 * Licensed under the Apache License, Version 2.0 (the License); you may
 * not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an AS IS BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 #include "arm_math.h"
 /**
  @ingroup groupMath
 */
 /**
  @defgroup BasicScale Vector Scale
  Multiply a vector by a scalar value.  For floating-point data, the algorithm used is:
  <pre>
      pDst[n] = pSrc[n] * scale,   0 <= n < blockSize.
  </pre>
  In the fixed-point Q7, Q15, and Q31 functions, <code>scale</code> is represented by
  a fractional multiplication <code>scaleFract</code> and an arithmetic shift <code>shift</code>.
  The shift allows the gain of the scaling operation to exceed 1.0.
  The algorithm used with fixed-point data is:
  <pre>
      pDst[n] = (pSrc[n] * scaleFract) << shift,   0 <= n < blockSize.
  </pre>
  The overall scale factor applied to the fixed-point data is
  <pre>
      scale = scaleFract * 2^shift.
  </pre>
  The functions support in-place computation allowing the source and destination
  pointers to reference the same memory buffer.
 */
 /**
  @addtogroup BasicScale
  @{
 */
 /**
  @brief         Multiplies a floating-point vector by a scalar.
  @param[in]     pSrc       points to the input vector
  @param[in]     scale      scale factor to be applied
  @param[out]    pDst       points to the output vector
  @param[in]     blockSize  number of samples in each vector
  @return        none
 */
 #if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE)
 #include "arm_helium_utils.h"
 void arm_scale_f32(
   const float32_t * pSrc,
         float32_t scale,
         float32_t * pDst,
         uint32_t blockSize)
 {
     uint32_t vecCnt;                               /* full 4-lane vectors left to process */
     uint32_t tailCnt;                              /* samples left after the full vectors */
     f32x4_t vecIn;
     /* Main loop: four outputs per iteration, C = A * scale */
     for (vecCnt = blockSize >> 2U; vecCnt > 0U; vecCnt--)
     {
         /* Multiply the loaded vector by the scalar and store the products */
         vecIn = vld1q(pSrc);
         vst1q(pDst, vmulq(vecIn, scale));
         /* step both pointers past the four samples just handled */
         pSrc += 4;
         pDst += 4;
     }
     /* Tail: 1 to 3 leftover samples, stored under the predicate built from the count */
     tailCnt = blockSize & 0x3;
     if (tailCnt > 0U)
     {
         mve_pred16_t p0 = vctp32q(tailCnt);
         /* NOTE(review): this load is not predicated, only the store is;
            presumably it reads a whole vector from pSrc - confirm that is acceptable */
         vecIn = vld1q(pSrc);
         vstrwq_p(pDst, vmulq(vecIn, scale), p0);
     }
 }
 #else
 void arm_scale_f32(
   const float32_t *pSrc,
         float32_t scale,
         float32_t *pDst,
         uint32_t blockSize)
 {
   uint32_t count;                                /* Iterations left in the current loop */
 #if defined(ARM_MATH_NEON_EXPERIMENTAL)
     /* Neon pass: four outputs per iteration, C = A * scale */
     for (count = blockSize >> 2U; count > 0U; count--)
     {
         /* Load four samples, multiply by the splatted scale, store the products */
         vst1q_f32(pDst, vmulq_f32(vld1q_f32(pSrc), vdupq_n_f32(scale)));
         pSrc += 4;
         pDst += 4;
     }
     /* Samples left over for the scalar loop below */
     count = blockSize & 0x3;
 #else
 #if defined (ARM_MATH_LOOPUNROLL)
   /* Unrolled pass: four outputs per iteration, C = A * scale */
   for (count = blockSize >> 2U; count > 0U; count--)
   {
     float32_t prod0, prod1, prod2, prod3;
     /* All four products are computed before any result is written */
     prod0 = pSrc[0] * scale;
     prod1 = pSrc[1] * scale;
     prod2 = pSrc[2] * scale;
     prod3 = pSrc[3] * scale;
     pSrc += 4;
     pDst[0] = prod0;
     pDst[1] = prod1;
     pDst[2] = prod2;
     pDst[3] = prod3;
     pDst += 4;
   }
   /* Samples left over after the groups of four */
   count = blockSize % 0x4U;
 #else
   /* No unrolling: every sample goes through the loop below */
   count = blockSize;
 #endif /* #if defined (ARM_MATH_LOOPUNROLL) */
 #endif /* #if defined(ARM_MATH_NEON_EXPERIMENTAL) */
   /* One output per iteration */
   for (; count > 0U; count--)
   {
     *pDst++ = (*pSrc++) * scale;
   }
 }
 #endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
 /**
  @} end of BasicScale group
 */
--- a/libraries/cmsis/dsp/Source/BasicMathFunctions/arm_scale_q15.c
+++ b/libraries/cmsis/dsp/Source/BasicMathFunctions/arm_scale_q15.c
@@ -0,0 +1,201 @@
 /* ----------------------------------------------------------------------
 * Project:      CMSIS DSP Library
 * Title:        arm_scale_q15.c
 * Description:  Multiplies a Q15 vector by a scalar
 *
 * $Date:        18. March 2019
 * $Revision:    V1.6.0
 *
 * Target Processor: Cortex-M cores
 * -------------------------------------------------------------------- */
 /*
 * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
 *
 * SPDX-License-Identifier: Apache-2.0
 *
 * Licensed under the Apache License, Version 2.0 (the License); you may
 * not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an AS IS BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 #include "arm_math.h"
 /**
  @ingroup groupMath
 */
 /**
  @addtogroup BasicScale
  @{
 */
 /**
  @brief         Multiplies a Q15 vector by a scalar.
  @param[in]     pSrc       points to the input vector
  @param[in]     scaleFract fractional portion of the scale value
  @param[in]     shift      number of bits to shift the result by
  @param[out]    pDst       points to the output vector
  @param[in]     blockSize  number of samples in each vector
  @return        none
  @par           Scaling and Overflow Behavior
                   The input data <code>*pSrc</code> and <code>scaleFract</code> are in 1.15 format.
                   These are multiplied to yield a 2.30 intermediate result and this is shifted with saturation to 1.15 format.
 */
 #if defined(ARM_MATH_MVEI)
 #include "arm_helium_utils.h"
 void arm_scale_q15(
     const q15_t * pSrc,
     q15_t   scaleFract,
     int8_t  shift,
     q15_t * pDst,
     uint32_t blockSize)
 {
     uint32_t  vecCnt;           /* full 8-lane vectors left to process */
     uint32_t  tailCnt;          /* samples left after the full vectors */
     q15x8_t   vecIn;
     q15x8_t   vecOut;
     /* Scale factor copied into every lane, built once for both loops */
     q15x8_t   vecScale = vdupq_n_s16(scaleFract);
     /* Shift count handed to vqshlq_r after the high-half multiply */
     int32_t   lshift = shift + 1;
     /* Main loop: eight outputs per iteration, C = A * scale */
     for (vecCnt = blockSize >> 3; vecCnt > 0U; vecCnt--)
     {
         vecIn = vld1q(pSrc);
         vecOut = vmulhq(vecIn, vecScale);
         vecOut = vqshlq_r(vecOut, lshift);
         vst1q(pDst, vecOut);
         /* step both pointers past the eight samples just handled */
         pSrc += 8;
         pDst += 8;
     }
     /* Tail: 1 to 7 leftover samples, stored under the predicate built from the count */
     tailCnt = blockSize & 7;
     if (tailCnt > 0U)
     {
         mve_pred16_t p0 = vctp16q(tailCnt);
         /* NOTE(review): this load is not predicated, only the store is;
            presumably it reads a whole vector from pSrc - confirm that is acceptable */
         vecIn = vld1q(pSrc);
         vecOut = vmulhq(vecIn, vecScale);
         vecOut = vqshlq_r(vecOut, lshift);
         vstrhq_p(pDst, vecOut, p0);
     }
 }
 #else
 void arm_scale_q15(
   const q15_t *pSrc,
         q15_t scaleFract,
         int8_t shift,
         q15_t *pDst,
         uint32_t blockSize)
 {
   uint32_t count;                                /* Iterations left in the current loop */
   int8_t rshift = 15 - shift;                    /* Right shift applied to each product */
 #if defined (ARM_MATH_LOOPUNROLL)
 #if defined (ARM_MATH_DSP)
   q31_t pairA, pairB;                            /* Two packed input samples each */
   q31_t prodAhi, prodAlo, prodBhi, prodBlo;      /* Shifted products per halfword */
   q15_t satAhi, satAlo, satBhi, satBlo;          /* Products saturated to 16 bits */
 #endif
   /* Unrolled pass: four outputs per iteration, C = A * scale */
   for (count = blockSize >> 2U; count > 0U; count--)
   {
 #if defined (ARM_MATH_DSP)
     /* Two packed reads fetch four samples */
     pairA = read_q15x2_ia ((q15_t **) &pSrc);
     pairB = read_q15x2_ia ((q15_t **) &pSrc);
     /* Multiply each halfword by the scale factor and shift the product right */
     prodAhi = ((q31_t) ((q15_t) (pairA >> 16) * scaleFract)) >> rshift;
     prodAlo = ((q31_t) ((q15_t) (pairA      ) * scaleFract)) >> rshift;
     prodBhi = ((q31_t) ((q15_t) (pairB >> 16) * scaleFract)) >> rshift;
     prodBlo = ((q31_t) ((q15_t) (pairB      ) * scaleFract)) >> rshift;
     /* Saturate to 16 bits */
     satAhi = (q15_t) (__SSAT(prodAhi, 16));
     satAlo = (q15_t) (__SSAT(prodAlo, 16));
     satBhi = (q15_t) (__SSAT(prodBhi, 16));
     satBlo = (q15_t) (__SSAT(prodBlo, 16));
     /* Repack with the low-halfword result in the bottom half and write out */
     write_q15x2_ia (&pDst, __PKHBT(satAlo, satAhi, 16));
     write_q15x2_ia (&pDst, __PKHBT(satBlo, satBhi, 16));
 #else
     /* Widen, multiply, shift right, then saturate to 16 bits */
     pDst[0] = (q15_t) (__SSAT(((q31_t) pSrc[0] * scaleFract) >> rshift, 16));
     pDst[1] = (q15_t) (__SSAT(((q31_t) pSrc[1] * scaleFract) >> rshift, 16));
     pDst[2] = (q15_t) (__SSAT(((q31_t) pSrc[2] * scaleFract) >> rshift, 16));
     pDst[3] = (q15_t) (__SSAT(((q31_t) pSrc[3] * scaleFract) >> rshift, 16));
     pSrc += 4;
     pDst += 4;
 #endif
   }
   /* Samples left over after the groups of four */
   count = blockSize % 0x4U;
 #else
   /* No unrolling: every sample goes through the loop below */
   count = blockSize;
 #endif /* #if defined (ARM_MATH_LOOPUNROLL) */
   /* One output per iteration */
   for (; count > 0U; count--)
   {
     *pDst++ = (q15_t) (__SSAT(((q31_t) *pSrc++ * scaleFract) >> rshift, 16));
   }
 }
 #endif /* defined(ARM_MATH_MVEI) */
 /**
  @} end of BasicScale group
 */
--- a/libraries/cmsis/dsp/Source/BasicMathFunctions/arm_scale_q31.c
+++ b/libraries/cmsis/dsp/Source/BasicMathFunctions/arm_scale_q31.c
@@ -0,0 +1,244 @@
 /* ----------------------------------------------------------------------
 * Project:      CMSIS DSP Library
 * Title:        arm_scale_q31.c
 * Description:  Multiplies a Q31 vector by a scalar
 *
 * $Date:        18. March 2019
 * $Revision:    V1.6.0
 *
 * Target Processor: Cortex-M cores
 * -------------------------------------------------------------------- */
 /*
 * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
 *
 * SPDX-License-Identifier: Apache-2.0
 *
 * Licensed under the Apache License, Version 2.0 (the License); you may
 * not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an AS IS BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 #include "arm_math.h"
 /**
  @ingroup groupMath
 */
 /**
  @addtogroup BasicScale
  @{
 */
 /**
  @brief         Multiplies a Q31 vector by a scalar.
  @param[in]     pSrc       points to the input vector
  @param[in]     scaleFract fractional portion of the scale value
  @param[in]     shift      number of bits to shift the result by
  @param[out]    pDst       points to the output vector
  @param[in]     blockSize  number of samples in each vector
  @return        none
  @par           Scaling and Overflow Behavior
                   The input data <code>*pSrc</code> and <code>scaleFract</code> are in 1.31 format.
                   These are multiplied to yield a 2.62 intermediate result and this is shifted with saturation to 1.31 format.
 */
 #if defined(ARM_MATH_MVEI)
 #include "arm_helium_utils.h"
 /*
  * Helium (MVE) integer implementation of the Q31 vector scale.
  * Four samples are processed per vector. vmulhq keeps the high half of each
  * 64-bit product, so the saturating shift is applied with (shift + 1).
  * A positive amount passed to vqshlq_r shifts left with saturation, a
  * negative amount shifts right.
  */
 void arm_scale_q31(
    const q31_t * pSrc,
    q31_t   scaleFract,
    int8_t  shift,
    q31_t * pDst,
    uint32_t blockSize)
 {
    uint32_t  blkCnt;           /* loop counters */
    q31x4_t vecSrc;
    q31x4_t vecDst;
    /* Compute 4 outputs at a time */
    blkCnt = blockSize >> 2;
    while (blkCnt > 0U)
    {
        /*
         * C = A * scale
         * Scale the input and then store the result in the destination buffer.
         */
        vecSrc = vld1q(pSrc);
        vecDst = vmulhq(vecSrc, vdupq_n_s32(scaleFract));
        vecDst = vqshlq_r(vecDst, shift + 1);
        vst1q(pDst, vecDst);
        /*
         * Decrement the blockSize loop counter
         */
        blkCnt--;
        /*
         * advance vector source and destination pointers
         */
        pSrc += 4;
        pDst += 4;
    }
    /*
     * tail: 1 to 3 samples left
     */
    blkCnt = blockSize & 3;
    if (blkCnt > 0U)
    {
        /* Predicate enabling only the first blkCnt 32-bit lanes */
        mve_pred16_t p0 = vctp32q(blkCnt);
        /*
         * Predicated (zeroing) load: only the active lanes are fetched, so the
         * tail never reads beyond pSrc[blockSize - 1]. An unpredicated vld1q
         * here would read a full 16-byte vector past the end of the buffer.
         */
        vecSrc = vld1q_z(pSrc, p0);
        vecDst = vmulhq(vecSrc, vdupq_n_s32(scaleFract));
        vecDst = vqshlq_r(vecDst, shift + 1);
        /* Predicated store: only the active lanes are written */
        vstrwq_p(pDst, vecDst, p0);
    }
 }
 #else
 /*
  * Scalar implementation of the Q31 vector scale.
  * Each sample is multiplied by scaleFract, the upper 32 bits of the 64-bit
  * product are kept, and the result is then shifted by (shift + 1): to the
  * left with saturation when that amount is non-negative, to the right
  * otherwise.
  */
 void arm_scale_q31(
   const q31_t *pSrc,
         q31_t scaleFract,
         int8_t shift,
         q31_t *pDst,
         uint32_t blockSize)
 {
         uint32_t remaining;                            /* Loop iterations still to run */
         q31_t prod, res;                               /* High word of the product, shifted result */
         int8_t postShift = shift + 1;                  /* Shift applied after the multiply */
         int8_t rightShift = (postShift & 0x80);        /* Non-zero when postShift is negative */
 #if defined (ARM_MATH_LOOPUNROLL)
   /* Unrolled section: every pass produces 4 outputs */
   remaining = blockSize >> 2U;
   if (rightShift != 0U)
   {
     /* Negative postShift: C = (A * scale) >> -postShift, stored as is */
     for (; remaining > 0U; remaining--)
     {
       prod = ((q63_t) *pSrc++ * scaleFract) >> 32;
       *pDst++ = prod >> -postShift;
       prod = ((q63_t) *pSrc++ * scaleFract) >> 32;
       *pDst++ = prod >> -postShift;
       prod = ((q63_t) *pSrc++ * scaleFract) >> 32;
       *pDst++ = prod >> -postShift;
       prod = ((q63_t) *pSrc++ * scaleFract) >> 32;
       *pDst++ = prod >> -postShift;
     }
   }
   else
   {
     /* Non-negative postShift: C = (A * scale) << postShift with saturation */
     for (; remaining > 0U; remaining--)
     {
       prod = ((q63_t) *pSrc++ * scaleFract) >> 32;
       res = prod << postShift;
       /* If shifting back does not give the product, bits were lost: clamp
        * to 0x7FFFFFFF, or to its complement when the product is negative */
       if ((res >> postShift) != prod)
       {
         res = 0x7FFFFFFF ^ (prod >> 31);
       }
       *pDst++ = res;
       prod = ((q63_t) *pSrc++ * scaleFract) >> 32;
       res = prod << postShift;
       if ((res >> postShift) != prod)
       {
         res = 0x7FFFFFFF ^ (prod >> 31);
       }
       *pDst++ = res;
       prod = ((q63_t) *pSrc++ * scaleFract) >> 32;
       res = prod << postShift;
       if ((res >> postShift) != prod)
       {
         res = 0x7FFFFFFF ^ (prod >> 31);
       }
       *pDst++ = res;
       prod = ((q63_t) *pSrc++ * scaleFract) >> 32;
       res = prod << postShift;
       if ((res >> postShift) != prod)
       {
         res = 0x7FFFFFFF ^ (prod >> 31);
       }
       *pDst++ = res;
     }
   }
   /* Samples left over after the unrolled section */
   remaining = blockSize & 0x3U;
 #else
   /* No unrolling: handle every sample in the loop below */
   remaining = blockSize;
 #endif /* #if defined (ARM_MATH_LOOPUNROLL) */
   if (rightShift != 0U)
   {
     /* Negative postShift: plain right shift */
     for (; remaining > 0U; remaining--)
     {
       prod = ((q63_t) *pSrc++ * scaleFract) >> 32;
       *pDst++ = prod >> -postShift;
     }
   }
   else
   {
     /* Non-negative postShift: left shift with saturation */
     for (; remaining > 0U; remaining--)
     {
       prod = ((q63_t) *pSrc++ * scaleFract) >> 32;
       res = prod << postShift;
       if ((res >> postShift) != prod)
       {
         res = 0x7FFFFFFF ^ (prod >> 31);
       }
       *pDst++ = res;
     }
   }
 }
 #endif /* defined(ARM_MATH_MVEI) */
 /**
  @} end of BasicScale group
 */
--- a/libraries/cmsis/dsp/Source/BasicMathFunctions/arm_scale_q7.c
+++ b/libraries/cmsis/dsp/Source/BasicMathFunctions/arm_scale_q7.c
@@ -0,0 +1,186 @@
 /* ----------------------------------------------------------------------
 * Project:      CMSIS DSP Library
 * Title:        arm_scale_q7.c
 * Description:  Multiplies a Q7 vector by a scalar
 *
 * $Date:        18. March 2019
 * $Revision:    V1.6.0
 *
 * Target Processor: Cortex-M cores
 * -------------------------------------------------------------------- */
 /*
 * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
 *
 * SPDX-License-Identifier: Apache-2.0
 *
 * Licensed under the Apache License, Version 2.0 (the License); you may
 * not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an AS IS BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 #include "arm_math.h"
 /**
  @ingroup groupMath
 */
 /**
  @addtogroup BasicScale
  @{
 */
 /**
  @brief         Multiplies a Q7 vector by a scalar.
  @param[in]     pSrc       points to the input vector
  @param[in]     scaleFract fractional portion of the scale value
  @param[in]     shift      number of bits to shift the result by
  @param[out]    pDst       points to the output vector
  @param[in]     blockSize  number of samples in each vector
  @return        none
  @par           Scaling and Overflow Behavior
                   The input data <code>*pSrc</code> and <code>scaleFract</code> are in 1.7 format.
                   These are multiplied to yield a 2.14 intermediate result and this is shifted with saturation to 1.7 format.
 */
 #if defined(ARM_MATH_MVEI)
 #include "arm_helium_utils.h"
 /*
  * Helium (MVE) integer implementation of the Q7 vector scale.
  * Sixteen samples are processed per vector. vmulhq keeps the high half of
  * each 16-bit product, so the saturating shift is applied with (shift + 1).
  * A positive amount passed to vqshlq_r shifts left with saturation, a
  * negative amount shifts right.
  */
 void arm_scale_q7(
    const q7_t * pSrc,
    q7_t   scaleFract,
    int8_t  shift,
    q7_t * pDst,
    uint32_t blockSize)
 {
    uint32_t  blkCnt;           /* loop counters */
    q7x16_t vecSrc;
    q7x16_t vecDst;
    /* Compute 16 outputs at a time */
    blkCnt = blockSize >> 4;
    while (blkCnt > 0U)
    {
        /*
         * C = A * scale
         * Scale the input and then store the result in the destination buffer.
         */
        vecSrc = vld1q(pSrc);
        vecDst = vmulhq(vecSrc, vdupq_n_s8(scaleFract));
        vecDst = vqshlq_r(vecDst, shift + 1);
        vst1q(pDst, vecDst);
        /*
         * Decrement the blockSize loop counter
         */
        blkCnt--;
        /*
         * advance vector source and destination pointers
         */
        pSrc += 16;
        pDst += 16;
    }
    /*
     * tail: 1 to 15 samples left
     */
    blkCnt = blockSize & 0xF;
    if (blkCnt > 0U)
    {
        /* Predicate enabling only the first blkCnt 8-bit lanes */
        mve_pred16_t p0 = vctp8q(blkCnt);
        /*
         * Predicated (zeroing) load: only the active lanes are fetched, so the
         * tail never reads beyond pSrc[blockSize - 1]. An unpredicated vld1q
         * here would read a full 16-byte vector past the end of the buffer.
         */
        vecSrc = vld1q_z(pSrc, p0);
        vecDst = vmulhq(vecSrc, vdupq_n_s8(scaleFract));
        vecDst = vqshlq_r(vecDst, shift + 1);
        /* Predicated store: only the active lanes are written */
        vstrbq_p(pDst, vecDst, p0);
    }
 }
 #else
 /*
  * Scalar implementation of the Q7 vector scale.
  * Each sample is multiplied by scaleFract, shifted right by (7 - shift)
  * and saturated to 8 bits before being stored.
  */
 void arm_scale_q7(
   const q7_t * pSrc,
         q7_t scaleFract,
         int8_t shift,
         q7_t * pDst,
         uint32_t blockSize)
 {
         uint32_t remaining;                            /* Loop iterations still to run */
         int8_t downShift = 7 - shift;                  /* Right shift applied to each product */
 #if defined (ARM_MATH_LOOPUNROLL)
 #if defined (ARM_MATH_DSP)
   q7_t src0, src1, src2, src3;                   /* Four inputs read per pass */
   q7_t res0, res1, res2, res3;                   /* Their saturated results */
 #endif
   /* Unrolled section: every pass produces 4 outputs */
   remaining = blockSize >> 2U;
   for (; remaining > 0U; remaining--)
   {
 #if defined (ARM_MATH_DSP)
     /* Fetch all four inputs before anything is written */
     src0 = *pSrc++;
     src1 = *pSrc++;
     src2 = *pSrc++;
     src3 = *pSrc++;
     /* C = A * scale, shifted and saturated to 8 bits */
     res0 = (q7_t) (__SSAT((src0 * scaleFract) >> downShift, 8));
     res1 = (q7_t) (__SSAT((src1 * scaleFract) >> downShift, 8));
     res2 = (q7_t) (__SSAT((src2 * scaleFract) >> downShift, 8));
     res3 = (q7_t) (__SSAT((src3 * scaleFract) >> downShift, 8));
     /* Pack the four bytes and emit them with a single word write */
     write_q7x4_ia (&pDst, __PACKq7(res0, res1, res2, res3));
 #else
     /* C = A * scale, one sample at a time */
     *pDst++ = (q7_t) (__SSAT((((q15_t) *pSrc++ * scaleFract) >> downShift), 8));
     *pDst++ = (q7_t) (__SSAT((((q15_t) *pSrc++ * scaleFract) >> downShift), 8));
     *pDst++ = (q7_t) (__SSAT((((q15_t) *pSrc++ * scaleFract) >> downShift), 8));
     *pDst++ = (q7_t) (__SSAT((((q15_t) *pSrc++ * scaleFract) >> downShift), 8));
 #endif
   }
   /* Samples left over after the unrolled section */
   remaining = blockSize & 0x3U;
 #else
   /* No unrolling: handle every sample in the loop below */
   remaining = blockSize;
 #endif /* #if defined (ARM_MATH_LOOPUNROLL) */
   for (; remaining > 0U; remaining--)
   {
     /* C = A * scale, shifted and saturated to 8 bits */
     *pDst++ = (q7_t) (__SSAT((((q15_t) *pSrc++ * scaleFract) >> downShift), 8));
   }
 }
 #endif /* defined(ARM_MATH_MVEI) */
 /**
  @} end of BasicScale group
 */
--- a/libraries/cmsis/dsp/Source/BasicMathFunctions/arm_shift_q15.c
+++ b/libraries/cmsis/dsp/Source/BasicMathFunctions/arm_shift_q15.c
@@ -0,0 +1,251 @@
 /* ----------------------------------------------------------------------
 * Project:      CMSIS DSP Library
 * Title:        arm_shift_q15.c
 * Description:  Shifts the elements of a Q15 vector by a specified number of bits
 *
 * $Date:        18. March 2019
 * $Revision:    V1.6.0
 *
 * Target Processor: Cortex-M cores
 * -------------------------------------------------------------------- */
 /*
 * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
 *
 * SPDX-License-Identifier: Apache-2.0
 *
 * Licensed under the Apache License, Version 2.0 (the License); you may
 * not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an AS IS BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 #include "arm_math.h"
 /**
  @ingroup groupMath
 */
 /**
  @addtogroup BasicShift
  @{
 */
 /**
  @brief         Shifts the elements of a Q15 vector a specified number of bits
  @param[in]     pSrc       points to the input vector
  @param[in]     shiftBits  number of bits to shift.  A positive value shifts left; a negative value shifts right.
  @param[out]    pDst       points to the output vector
  @param[in]     blockSize  number of samples in each vector
  @return        none
  @par           Scaling and Overflow Behavior
                   The function uses saturating arithmetic.
                   Results outside of the allowable Q15 range [0x8000 0x7FFF] are saturated.
 */
 #if defined(ARM_MATH_MVEI)
 #include "arm_helium_utils.h"
 /*
  * Helium (MVE) integer implementation of the Q15 vector shift.
  * Eight samples are processed per vector. A positive shiftBits passed to
  * vqshlq_r shifts left with saturation, a negative value shifts right.
  */
 void arm_shift_q15(
    const q15_t * pSrc,
    int8_t shiftBits,
    q15_t * pDst,
    uint32_t blockSize)
 {
    uint32_t  blkCnt;           /* loop counters */
    q15x8_t vecSrc;
    q15x8_t vecDst;
    /* Compute 8 outputs at a time */
    blkCnt = blockSize >> 3;
    while (blkCnt > 0U)
    {
        /*
         * C = A (>> or <<) shiftBits
         * Shift the input and then store the result in the destination buffer.
         */
        vecSrc = vld1q(pSrc);
        vecDst = vqshlq_r(vecSrc, shiftBits);
        vst1q(pDst, vecDst);
        /*
         * Decrement the blockSize loop counter
         */
        blkCnt--;
        /*
         * advance vector source and destination pointers
         */
        pSrc += 8;
        pDst += 8;
    }
    /*
     * tail: 1 to 7 samples left
     */
    blkCnt = blockSize & 7;
    if (blkCnt > 0U)
    {
        /* Predicate enabling only the first blkCnt 16-bit lanes */
        mve_pred16_t p0 = vctp16q(blkCnt);
        /*
         * Predicated (zeroing) load: only the active lanes are fetched, so the
         * tail never reads beyond pSrc[blockSize - 1]. An unpredicated vld1q
         * here would read a full 16-byte vector past the end of the buffer.
         */
        vecSrc = vld1q_z(pSrc, p0);
        vecDst = vqshlq_r(vecSrc, shiftBits);
        /* Predicated store: only the active lanes are written */
        vstrhq_p(pDst, vecDst, p0);
    }
 }
 #else
 /*
  * Scalar implementation of the Q15 vector shift.
  * A non-negative shiftBits shifts each sample left and saturates the result
  * to 16 bits; a negative shiftBits shifts each sample right by -shiftBits.
  */
 void arm_shift_q15(
   const q15_t * pSrc,
         int8_t shiftBits,
         q15_t * pDst,
         uint32_t blockSize)
 {
         uint32_t remaining;                            /* Loop iterations still to run */
         uint8_t negShift = (shiftBits & 0x80);         /* Non-zero when shiftBits is negative */
 #if defined (ARM_MATH_LOOPUNROLL)
 #if defined (ARM_MATH_DSP)
   q15_t first, second;                           /* Pair of consecutive input samples */
 #endif
   /* Unrolled section: every pass produces 4 outputs */
   remaining = blockSize >> 2U;
   if (negShift != 0U)
   {
     /* Negative shiftBits: C = A >> -shiftBits */
     for (; remaining > 0U; remaining--)
     {
 #if defined (ARM_MATH_DSP)
       /* First pair: read both samples, then store them as one packed word */
       first = *pSrc++;
       second = *pSrc++;
 #if defined (ARM_MATH_BIG_ENDIAN)
       write_q15x2_ia (&pDst, __PKHBT((second >> -shiftBits),
                                      (first >> -shiftBits), 16));
 #else
       write_q15x2_ia (&pDst, __PKHBT((first >> -shiftBits),
                                      (second >> -shiftBits), 16));
 #endif /* #if defined (ARM_MATH_BIG_ENDIAN) */
       /* Second pair */
       first = *pSrc++;
       second = *pSrc++;
 #if defined (ARM_MATH_BIG_ENDIAN)
       write_q15x2_ia (&pDst, __PKHBT((second >> -shiftBits),
                                      (first >> -shiftBits), 16));
 #else
       write_q15x2_ia (&pDst, __PKHBT((first >> -shiftBits),
                                      (second >> -shiftBits), 16));
 #endif /* #if defined (ARM_MATH_BIG_ENDIAN) */
 #else
       *pDst++ = (*pSrc++ >> -shiftBits);
       *pDst++ = (*pSrc++ >> -shiftBits);
       *pDst++ = (*pSrc++ >> -shiftBits);
       *pDst++ = (*pSrc++ >> -shiftBits);
 #endif
     }
   }
   else
   {
     /* Non-negative shiftBits: C = A << shiftBits, saturated to 16 bits */
     for (; remaining > 0U; remaining--)
     {
 #if defined (ARM_MATH_DSP)
       /* First pair: read both samples, then store them as one packed word */
       first = *pSrc++;
       second = *pSrc++;
 #if defined (ARM_MATH_BIG_ENDIAN)
       write_q15x2_ia (&pDst, __PKHBT(__SSAT((second << shiftBits), 16),
                                      __SSAT((first << shiftBits), 16), 16));
 #else
       write_q15x2_ia (&pDst, __PKHBT(__SSAT((first << shiftBits), 16),
                                      __SSAT((second << shiftBits), 16), 16));
 #endif /* #if defined (ARM_MATH_BIG_ENDIAN) */
       /* Second pair */
       first = *pSrc++;
       second = *pSrc++;
 #if defined (ARM_MATH_BIG_ENDIAN)
       write_q15x2_ia (&pDst, __PKHBT(__SSAT((second << shiftBits), 16),
                                      __SSAT((first << shiftBits), 16), 16));
 #else
       write_q15x2_ia (&pDst, __PKHBT(__SSAT((first << shiftBits), 16),
                                      __SSAT((second << shiftBits), 16), 16));
 #endif /* #if defined (ARM_MATH_BIG_ENDIAN) */
 #else
       *pDst++ = __SSAT(((q31_t) *pSrc++ << shiftBits), 16);
       *pDst++ = __SSAT(((q31_t) *pSrc++ << shiftBits), 16);
       *pDst++ = __SSAT(((q31_t) *pSrc++ << shiftBits), 16);
       *pDst++ = __SSAT(((q31_t) *pSrc++ << shiftBits), 16);
 #endif
     }
   }
   /* Samples left over after the unrolled section */
   remaining = blockSize & 0x3U;
 #else
   /* No unrolling: handle every sample in the loop below */
   remaining = blockSize;
 #endif /* #if defined (ARM_MATH_LOOPUNROLL) */
   if (negShift != 0U)
   {
     /* Negative shiftBits: C = A >> -shiftBits */
     for (; remaining > 0U; remaining--)
     {
       *pDst++ = (*pSrc++ >> -shiftBits);
     }
   }
   else
   {
     /* Non-negative shiftBits: C = A << shiftBits, saturated to 16 bits */
     for (; remaining > 0U; remaining--)
     {
       *pDst++ = __SSAT(((q31_t) *pSrc++ << shiftBits), 16);
     }
   }
 }
 #endif /* defined(ARM_MATH_MVEI) */
 /**
  @} end of BasicShift group
 */
--- a/libraries/cmsis/dsp/Source/BasicMathFunctions/arm_shift_q31.c
+++ b/libraries/cmsis/dsp/Source/BasicMathFunctions/arm_shift_q31.c
@@ -0,0 +1,232 @@
 /* ----------------------------------------------------------------------
 * Project:      CMSIS DSP Library
 * Title:        arm_shift_q31.c
 * Description:  Shifts the elements of a Q31 vector by a specified number of bits
 *
 * $Date:        18. March 2019
 * $Revision:    V1.6.0
 *
 * Target Processor: Cortex-M cores
 * -------------------------------------------------------------------- */
 /*
 * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
 *
 * SPDX-License-Identifier: Apache-2.0
 *
 * Licensed under the Apache License, Version 2.0 (the License); you may
 * not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an AS IS BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 #include "arm_math.h"
 /**
  @ingroup groupMath
 */
 /**
  @defgroup BasicShift Vector Shift
  Shifts the elements of a fixed-point vector by a specified number of bits.
  There are separate functions for Q7, Q15, and Q31 data types.
  The underlying algorithm used is:
  <pre>
      pDst[n] = pSrc[n] << shift,   0 <= n < blockSize.
  </pre>
  If <code>shift</code> is positive then the elements of the vector are shifted to the left.
  If <code>shift</code> is negative then the elements of the vector are shifted to the right.
  The functions support in-place computation allowing the source and destination
  pointers to reference the same memory buffer.
 */
 /**
  @addtogroup BasicShift
  @{
 */
 /**
  @brief         Shifts the elements of a Q31 vector a specified number of bits.
  @param[in]     pSrc       points to the input vector
  @param[in]     shiftBits  number of bits to shift.  A positive value shifts left; a negative value shifts right.
  @param[out]    pDst       points to the output vector
  @param[in]     blockSize  number of samples in the vector
  @return        none
  @par           Scaling and Overflow Behavior
                   The function uses saturating arithmetic.
                   Results outside of the allowable Q31 range [0x80000000 0x7FFFFFFF] are saturated.
 */
 #if defined(ARM_MATH_MVEI)
 #include "arm_helium_utils.h"
 /*
  * Helium (MVE) integer implementation of the Q31 vector shift.
  * Four samples are processed per vector. A positive shiftBits passed to
  * vqshlq_r shifts left with saturation, a negative value shifts right.
  */
 void arm_shift_q31(
    const q31_t * pSrc,
    int8_t shiftBits,
    q31_t * pDst,
    uint32_t blockSize)
 {
    uint32_t  blkCnt;           /* loop counters */
    q31x4_t vecSrc;
    q31x4_t vecDst;
    /* Compute 4 outputs at a time */
    blkCnt = blockSize >> 2;
    while (blkCnt > 0U)
    {
        /*
         * C = A (>> or <<) shiftBits
         * Shift the input and then store the result in the destination buffer.
         */
        vecSrc = vld1q((q31_t const *) pSrc);
        vecDst = vqshlq_r(vecSrc, shiftBits);
        vst1q(pDst, vecDst);
        /*
         * Decrement the blockSize loop counter
         */
        blkCnt--;
        /*
         * advance vector source and destination pointers
         */
        pSrc += 4;
        pDst += 4;
    }
    /*
     * tail: 1 to 3 samples left
     */
    blkCnt = blockSize & 3;
    if (blkCnt > 0U)
    {
        /* Predicate enabling only the first blkCnt 32-bit lanes */
        mve_pred16_t p0 = vctp32q(blkCnt);
        /*
         * Predicated (zeroing) load: only the active lanes are fetched, so the
         * tail never reads beyond pSrc[blockSize - 1]. An unpredicated vld1q
         * here would read a full 16-byte vector past the end of the buffer.
         */
        vecSrc = vld1q_z((q31_t const *) pSrc, p0);
        vecDst = vqshlq_r(vecSrc, shiftBits);
        /* Predicated store: only the active lanes are written */
        vstrwq_p(pDst, vecDst, p0);
    }
 }
 #else
 /*
  * Scalar implementation of the Q31 vector shift.
  * A non-negative shiftBits shifts each sample left with saturation to the
  * Q31 range; a negative shiftBits shifts each sample right by -shiftBits.
  */
 void arm_shift_q31(
   const q31_t * pSrc,
         int8_t shiftBits,
         q31_t * pDst,
         uint32_t blockSize)
 {
         uint32_t remaining;                            /* Loop iterations still to run */
         uint8_t negShift = (shiftBits & 0x80);         /* Non-zero when shiftBits is negative */
 #if defined (ARM_MATH_LOOPUNROLL)
   q31_t sample, shifted;                         /* Input sample and its shifted value */
   /* Unrolled section: every pass produces 4 outputs */
   remaining = blockSize >> 2U;
   if (negShift != 0U)
   {
     /* Negative shiftBits: C = A >> -shiftBits */
     for (; remaining > 0U; remaining--)
     {
       *pDst++ = (*pSrc++ >> -shiftBits);
       *pDst++ = (*pSrc++ >> -shiftBits);
       *pDst++ = (*pSrc++ >> -shiftBits);
       *pDst++ = (*pSrc++ >> -shiftBits);
     }
   }
   else
   {
     /* Non-negative shiftBits: C = A << shiftBits with saturation */
     for (; remaining > 0U; remaining--)
     {
       sample = *pSrc++;
       shifted = sample << shiftBits;
       /* If shifting back does not give the input, bits were lost: clamp
        * to 0x7FFFFFFF, or to its complement when the input is negative */
       if ((shifted >> shiftBits) != sample)
       {
         shifted = 0x7FFFFFFF ^ (sample >> 31);
       }
       *pDst++ = shifted;
       sample = *pSrc++;
       shifted = sample << shiftBits;
       if ((shifted >> shiftBits) != sample)
       {
         shifted = 0x7FFFFFFF ^ (sample >> 31);
       }
       *pDst++ = shifted;
       sample = *pSrc++;
       shifted = sample << shiftBits;
       if ((shifted >> shiftBits) != sample)
       {
         shifted = 0x7FFFFFFF ^ (sample >> 31);
       }
       *pDst++ = shifted;
       sample = *pSrc++;
       shifted = sample << shiftBits;
       if ((shifted >> shiftBits) != sample)
       {
         shifted = 0x7FFFFFFF ^ (sample >> 31);
       }
       *pDst++ = shifted;
     }
   }
   /* Samples left over after the unrolled section */
   remaining = blockSize & 0x3U;
 #else
   /* No unrolling: handle every sample in the loop below */
   remaining = blockSize;
 #endif /* #if defined (ARM_MATH_LOOPUNROLL) */
   if (negShift != 0U)
   {
     /* Negative shiftBits: C = A >> -shiftBits */
     for (; remaining > 0U; remaining--)
     {
       *pDst++ = (*pSrc++ >> -shiftBits);
     }
   }
   else
   {
     /* Non-negative shiftBits: shift in 64 bits, then clip to Q31 */
     for (; remaining > 0U; remaining--)
     {
       *pDst++ = clip_q63_to_q31((q63_t) *pSrc++ << shiftBits);
     }
   }
 }
 #endif /* defined(ARM_MATH_MVEI) */
 /**
  @} end of BasicShift group
 */
--- a/libraries/cmsis/dsp/Source/BasicMathFunctions/arm_shift_q7.c
+++ b/libraries/cmsis/dsp/Source/BasicMathFunctions/arm_shift_q7.c
@@ -0,0 +1,225 @@
 /* ----------------------------------------------------------------------
 * Project:      CMSIS DSP Library
 * Title:        arm_shift_q7.c
 * Description:  Processing function for the Q7 Shifting
 *
 * $Date:        18. March 2019
 * $Revision:    V1.6.0
 *
 * Target Processor: Cortex-M cores
 * -------------------------------------------------------------------- */
 /*
 * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
 *
 * SPDX-License-Identifier: Apache-2.0
 *
 * Licensed under the Apache License, Version 2.0 (the License); you may
 * not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an AS IS BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 #include "arm_math.h"
 /**
  @ingroup groupMath
 */
 /**
  @addtogroup BasicShift
  @{
 */
 /**
  @brief         Shifts the elements of a Q7 vector a specified number of bits
  @param[in]     pSrc       points to the input vector
  @param[in]     shiftBits  number of bits to shift.  A positive value shifts left; a negative value shifts right.
  @param[out]    pDst       points to the output vector
  @param[in]     blockSize  number of samples in each vector
  @return        none
  @par           Conditions for optimum performance
                   Input and output buffers should be aligned by 32-bit
  @par           Scaling and Overflow Behavior
                   The function uses saturating arithmetic.
                   Results outside of the allowable Q7 range [0x80 0x7F] are saturated.
 */
 #if defined(ARM_MATH_MVEI)
 #include "arm_helium_utils.h"
 /*
  * Helium (MVE) integer implementation of the Q7 vector shift.
  * Sixteen samples are processed per vector. A positive shiftBits passed to
  * vqshlq_r shifts left with saturation, a negative value shifts right.
  */
 void arm_shift_q7(
    const q7_t * pSrc,
    int8_t shiftBits,
    q7_t * pDst,
    uint32_t blockSize)
 {
    uint32_t  blkCnt;           /* loop counters */
    q7x16_t vecSrc;
    q7x16_t vecDst;
    /* Compute 16 outputs at a time */
    blkCnt = blockSize >> 4;
    while (blkCnt > 0U)
    {
        /*
         * C = A (>> or <<) shiftBits
         * Shift the input and then store the result in the destination buffer.
         */
        vecSrc = vld1q(pSrc);
        vecDst = vqshlq_r(vecSrc, shiftBits);
        vst1q(pDst, vecDst);
        /*
         * Decrement the blockSize loop counter
         */
        blkCnt--;
        /*
         * advance vector source and destination pointers
         */
        pSrc += 16;
        pDst += 16;
    }
    /*
     * tail: 1 to 15 samples left
     */
    blkCnt = blockSize & 0xF;
    if (blkCnt > 0U)
    {
        /* Predicate enabling only the first blkCnt 8-bit lanes */
        mve_pred16_t p0 = vctp8q(blkCnt);
        /*
         * Predicated (zeroing) load: only the active lanes are fetched, so the
         * tail never reads beyond pSrc[blockSize - 1]. An unpredicated vld1q
         * here would read a full 16-byte vector past the end of the buffer.
         */
        vecSrc = vld1q_z(pSrc, p0);
        vecDst = vqshlq_r(vecSrc, shiftBits);
        /* Predicated store: only the active lanes are written */
        vstrbq_p(pDst, vecDst, p0);
    }
 }
 #else
 /*
  * Scalar implementation of the Q7 vector shift.
  * A non-negative shiftBits shifts each sample left and saturates the result
  * to 8 bits; a negative shiftBits shifts each sample right by -shiftBits.
  */
 void arm_shift_q7(
   const q7_t * pSrc,
         int8_t shiftBits,
         q7_t * pDst,
         uint32_t blockSize)
 {
         uint32_t remaining;                            /* Loop iterations still to run */
         uint8_t negShift = (shiftBits & 0x80);         /* Non-zero when shiftBits is negative */
 #if defined (ARM_MATH_LOOPUNROLL)
 #if defined (ARM_MATH_DSP)
   q7_t src0, src1, src2, src3;                   /* Four inputs read per pass */
 #endif
   /* Unrolled section: every pass produces 4 outputs */
   remaining = blockSize >> 2U;
   if (negShift != 0U)
   {
     /* Negative shiftBits: C = A >> -shiftBits */
     for (; remaining > 0U; remaining--)
     {
 #if defined (ARM_MATH_DSP)
       /* Fetch all four inputs before anything is written */
       src0 = *pSrc++;
       src1 = *pSrc++;
       src2 = *pSrc++;
       src3 = *pSrc++;
       /* Pack the four shifted bytes and emit them with a single word write */
       write_q7x4_ia (&pDst, __PACKq7((src0 >> -shiftBits),
                                      (src1 >> -shiftBits),
                                      (src2 >> -shiftBits),
                                      (src3 >> -shiftBits) ));
 #else
       *pDst++ = (*pSrc++ >> -shiftBits);
       *pDst++ = (*pSrc++ >> -shiftBits);
       *pDst++ = (*pSrc++ >> -shiftBits);
       *pDst++ = (*pSrc++ >> -shiftBits);
 #endif
     }
   }
   else
   {
     /* Non-negative shiftBits: C = A << shiftBits, saturated to 8 bits */
     for (; remaining > 0U; remaining--)
     {
 #if defined (ARM_MATH_DSP)
       /* Fetch all four inputs before anything is written */
       src0 = *pSrc++;
       src1 = *pSrc++;
       src2 = *pSrc++;
       src3 = *pSrc++;
       /* Pack the four saturated bytes and emit them with a single word write */
       write_q7x4_ia (&pDst, __PACKq7(__SSAT((src0 << shiftBits), 8),
                                      __SSAT((src1 << shiftBits), 8),
                                      __SSAT((src2 << shiftBits), 8),
                                      __SSAT((src3 << shiftBits), 8) ));
 #else
       *pDst++ = (q7_t) __SSAT(((q15_t) *pSrc++ << shiftBits), 8);
       *pDst++ = (q7_t) __SSAT(((q15_t) *pSrc++ << shiftBits), 8);
       *pDst++ = (q7_t) __SSAT(((q15_t) *pSrc++ << shiftBits), 8);
       *pDst++ = (q7_t) __SSAT(((q15_t) *pSrc++ << shiftBits), 8);
 #endif
     }
   }
   /* Samples left over after the unrolled section */
   remaining = blockSize & 0x3U;
 #else
   /* No unrolling: handle every sample in the loop below */
   remaining = blockSize;
 #endif /* #if defined (ARM_MATH_LOOPUNROLL) */
   if (negShift != 0U)
   {
     /* Negative shiftBits: C = A >> -shiftBits */
     for (; remaining > 0U; remaining--)
     {
       *pDst++ = (*pSrc++ >> -shiftBits);
     }
   }
   else
   {
     /* Non-negative shiftBits: C = A << shiftBits, saturated to 8 bits */
     for (; remaining > 0U; remaining--)
     {
       *pDst++ = (q7_t) __SSAT(((q15_t) *pSrc++ << shiftBits), 8);
     }
   }
 }
 #endif /* defined(ARM_MATH_MVEI) */
 /**
  @} end of BasicShift group
 */
--- a/libraries/cmsis/dsp/Source/BasicMathFunctions/arm_sub_f32.c
+++ b/libraries/cmsis/dsp/Source/BasicMathFunctions/arm_sub_f32.c
@@ -0,0 +1,202 @@
 /* ----------------------------------------------------------------------
 * Project:      CMSIS DSP Library
 * Title:        arm_sub_f32.c
 * Description:  Floating-point vector subtraction
 *
 * $Date:        18. March 2019
 * $Revision:    V1.6.0
 *
 * Target Processor: Cortex-M cores
 * -------------------------------------------------------------------- */
 /*
 * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
 *
 * SPDX-License-Identifier: Apache-2.0
 *
 * Licensed under the Apache License, Version 2.0 (the License); you may
 * not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an AS IS BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 #include "arm_math.h"
 /**
  @ingroup groupMath
 */
 /**
  @defgroup BasicSub Vector Subtraction
  Element-by-element subtraction of two vectors.
  <pre>
      pDst[n] = pSrcA[n] - pSrcB[n],   0 <= n < blockSize.
  </pre>
  There are separate functions for floating-point, Q7, Q15, and Q31 data types.
 */
 /**
  @addtogroup BasicSub
  @{
 */
 /**
  @brief         Floating-point vector subtraction.
  @param[in]     pSrcA      points to the first input vector
  @param[in]     pSrcB      points to the second input vector
  @param[out]    pDst       points to the output vector
  @param[in]     blockSize  number of samples in each vector
  @return        none
 */
 #if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE)
 #include "arm_helium_utils.h"
 /*
  * Helium (MVE) variant: subtracts four float32 lanes per iteration, then
  * handles the 1..3 leftover samples with a predicated load/subtract/store so
  * that no element beyond blockSize is read from the inputs or written to pDst.
  */
 void arm_sub_f32(
   const float32_t * pSrcA,
   const float32_t * pSrcB,
         float32_t * pDst,
         uint32_t blockSize)
 {
     uint32_t blkCnt;                               /* Loop counter */
     f32x4_t vec1;
     f32x4_t vec2;
     f32x4_t res;
     /* Compute 4 outputs at a time */
     blkCnt = blockSize >> 2U;
     while (blkCnt > 0U)
     {
         /* C = A - B */
         /* Subtract and then store the results in the destination buffer. */
         vec1 = vld1q(pSrcA);
         vec2 = vld1q(pSrcB);
         res = vsubq(vec1, vec2);
         vst1q(pDst, res);
         /* Increment pointers */
         pSrcA += 4;
         pSrcB += 4;
         pDst += 4;
         /* Decrement the loop counter */
         blkCnt--;
     }
     /* Tail: blockSize % 4 samples remain */
     blkCnt = blockSize & 0x3;
     if (blkCnt > 0U)
     {
       /* C = A - B */
       /* Predicate that enables only the first blkCnt lanes */
       mve_pred16_t p0 = vctp32q(blkCnt);
       /* Zeroing predicated loads: inactive lanes are not read from memory,
          so the tail never touches data past the end of the input buffers. */
       vec1 = vld1q_z(pSrcA, p0);
       vec2 = vld1q_z(pSrcB, p0);
       /* Predicated store: only the active lanes are written */
       vstrwq_p(pDst, vsubq(vec1,vec2), p0);
     }
 }
 #else
 /*
  * Neon / portable C variant of the floating-point vector subtraction.
  * An optional fast pass (Neon vectors or a 4x unrolled loop, selected at
  * compile time) handles groups of four samples; a scalar loop finishes
  * whatever is left.
  */
 void arm_sub_f32(
   const float32_t * pSrcA,
   const float32_t * pSrcB,
         float32_t * pDst,
         uint32_t blockSize)
 {
   uint32_t count;   /* Groups of four, then single samples, still to process */
 #if defined(ARM_MATH_NEON) && !defined(ARM_MATH_AUTOVECTORIZE)
   /* Neon pass: C = A - B on four lanes per iteration */
   for (count = blockSize >> 2U; count > 0U; count--)
   {
     vst1q_f32(pDst, vsubq_f32(vld1q_f32(pSrcA), vld1q_f32(pSrcB)));
     /* Step every pointer past the four samples just handled */
     pSrcA += 4;
     pSrcB += 4;
     pDst  += 4;
   }
   /* Samples left over for the scalar loop */
   count = blockSize & 0x3;
 #elif defined (ARM_MATH_LOOPUNROLL) && !defined(ARM_MATH_AUTOVECTORIZE)
   /* Unrolled pass: four independent subtractions per iteration */
   for (count = blockSize >> 2U; count > 0U; count--)
   {
     pDst[0] = pSrcA[0] - pSrcB[0];
     pDst[1] = pSrcA[1] - pSrcB[1];
     pDst[2] = pSrcA[2] - pSrcB[2];
     pDst[3] = pSrcA[3] - pSrcB[3];
     pSrcA += 4;
     pSrcB += 4;
     pDst  += 4;
   }
   /* Samples left over for the scalar loop */
   count = blockSize % 0x4U;
 #else
   /* No fast pass: the scalar loop processes every sample */
   count = blockSize;
 #endif
   /* Scalar loop: C = A - B, one sample at a time */
   for (; count > 0U; count--)
   {
     *pDst = *pSrcA - *pSrcB;
     pSrcA++;
     pSrcB++;
     pDst++;
   }
 }
 #endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
 /**
  @} end of BasicSub group
 */
--- a/libraries/cmsis/dsp/Source/BasicMathFunctions/arm_sub_q15.c
+++ b/libraries/cmsis/dsp/Source/BasicMathFunctions/arm_sub_q15.c
@@ -0,0 +1,178 @@
 /* ----------------------------------------------------------------------
 * Project:      CMSIS DSP Library
 * Title:        arm_sub_q15.c
 * Description:  Q15 vector subtraction
 *
 * $Date:        18. March 2019
 * $Revision:    V1.6.0
 *
 * Target Processor: Cortex-M cores
 * -------------------------------------------------------------------- */
 /*
 * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
 *
 * SPDX-License-Identifier: Apache-2.0
 *
 * Licensed under the Apache License, Version 2.0 (the License); you may
 * not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an AS IS BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 #include "arm_math.h"
 /**
  @ingroup groupMath
 */
 /**
  @addtogroup BasicSub
  @{
 */
 /**
  @brief         Q15 vector subtraction.
  @param[in]     pSrcA      points to the first input vector
  @param[in]     pSrcB      points to the second input vector
  @param[out]    pDst       points to the output vector
  @param[in]     blockSize  number of samples in each vector
  @return        none
  @par           Scaling and Overflow Behavior
                   The function uses saturating arithmetic.
                   Results outside of the allowable Q15 range [0x8000 0x7FFF] are saturated.
 */
 #if defined(ARM_MATH_MVEI)
 #include "arm_helium_utils.h"
 /*
  * Helium (MVE) variant: saturating subtraction on eight Q15 lanes per
  * iteration, followed by a predicated tail for the 1..7 leftover samples.
  * The tail uses predicated loads and a predicated store so that nothing
  * beyond blockSize elements is read or written.
  */
 void arm_sub_q15(
     const q15_t * pSrcA,
     const q15_t * pSrcB,
     q15_t * pDst,
     uint32_t blockSize)
 {
     uint32_t  blkCnt;           /* loop counters */
     q15x8_t vecA;
     q15x8_t vecB;
     /* Compute 8 outputs at a time */
     blkCnt = blockSize >> 3;
     while (blkCnt > 0U)
     {
         /*
          * C = A - B
          * Subtract and then store the results in the destination buffer.
          */
         vecA = vld1q(pSrcA);
         vecB = vld1q(pSrcB);
         vst1q(pDst, vqsubq(vecA, vecB));
         /*
          * Decrement the blockSize loop counter
          */
         blkCnt--;
         /*
          * advance vector source and destination pointers
          */
         pSrcA  += 8;
         pSrcB  += 8;
         pDst   += 8;
     }
     /*
      * tail: blockSize % 8 samples remain
      */
     blkCnt = blockSize & 7;
     if (blkCnt > 0U)
     {
         /* Predicate that enables only the first blkCnt 16-bit lanes */
         mve_pred16_t p0 = vctp16q(blkCnt);
         /*
          * Zeroing predicated loads: inactive lanes are not read from memory,
          * so the tail never touches data past the end of the input buffers.
          */
         vecA = vld1q_z(pSrcA, p0);
         vecB = vld1q_z(pSrcB, p0);
         /* Predicated store: only the active lanes are written */
         vstrhq_p(pDst, vqsubq(vecA, vecB), p0);
     }
 }
 #else
 /*
  * Portable C / DSP-extension variant of the Q15 vector subtraction.
  * With ARM_MATH_LOOPUNROLL a first pass handles four samples per iteration
  * (as two packed pairs when ARM_MATH_DSP is defined); a final loop handles
  * the remaining samples one by one.
  */
 void arm_sub_q15(
   const q15_t * pSrcA,
   const q15_t * pSrcB,
         q15_t * pDst,
         uint32_t blockSize)
 {
   uint32_t count;   /* Groups of four, then single samples, still to process */
 #if defined (ARM_MATH_LOOPUNROLL)
 #if defined (ARM_MATH_DSP)
   q31_t pairA0, pairA1;   /* Two packed pairs of samples from A */
   q31_t pairB0, pairB1;   /* Two packed pairs of samples from B */
 #endif
   /* Unrolled pass: C = A - B on four samples per iteration */
   for (count = blockSize >> 2U; count > 0U; count--)
   {
 #if defined (ARM_MATH_DSP)
     /* Fetch 2 x 2 samples from A (the helper advances the pointer) */
     pairA0 = read_q15x2_ia ((q15_t **) &pSrcA);
     pairA1 = read_q15x2_ia ((q15_t **) &pSrcA);
     /* Fetch 2 x 2 samples from B */
     pairB0 = read_q15x2_ia ((q15_t **) &pSrcB);
     pairB1 = read_q15x2_ia ((q15_t **) &pSrcB);
     /* Subtract each packed pair and write it out */
     write_q15x2_ia (&pDst, __QSUB16(pairA0, pairB0));
     write_q15x2_ia (&pDst, __QSUB16(pairA1, pairB1));
 #else
     /* Widen to q31_t, subtract, then saturate back to 16 bits */
     pDst[0] = (q15_t) __SSAT(((q31_t) pSrcA[0] - pSrcB[0]), 16);
     pDst[1] = (q15_t) __SSAT(((q31_t) pSrcA[1] - pSrcB[1]), 16);
     pDst[2] = (q15_t) __SSAT(((q31_t) pSrcA[2] - pSrcB[2]), 16);
     pDst[3] = (q15_t) __SSAT(((q31_t) pSrcA[3] - pSrcB[3]), 16);
     pSrcA += 4;
     pSrcB += 4;
     pDst  += 4;
 #endif
   }
   /* Samples left over after the unrolled pass */
   count = blockSize % 0x4U;
 #else
   /* No unrolling: the loop below processes every sample */
   count = blockSize;
 #endif /* #if defined (ARM_MATH_LOOPUNROLL) */
   /* Single-sample loop: C = A - B */
   for (; count > 0U; count--)
   {
 #if defined (ARM_MATH_DSP)
     *pDst = (q15_t) __QSUB16(*pSrcA, *pSrcB);
 #else
     *pDst = (q15_t) __SSAT(((q31_t) *pSrcA - *pSrcB), 16);
 #endif
     pSrcA++;
     pSrcB++;
     pDst++;
   }
 }
 #endif /* defined(ARM_MATH_MVEI) */
 /**
  @} end of BasicSub group
 */
--- a/libraries/cmsis/dsp/Source/BasicMathFunctions/arm_sub_q31.c
+++ b/libraries/cmsis/dsp/Source/BasicMathFunctions/arm_sub_q31.c
@@ -0,0 +1,159 @@
 /* ----------------------------------------------------------------------
 * Project:      CMSIS DSP Library
 * Title:        arm_sub_q31.c
 * Description:  Q31 vector subtraction
 *
 * $Date:        18. March 2019
 * $Revision:    V1.6.0
 *
 * Target Processor: Cortex-M cores
 * -------------------------------------------------------------------- */
 /*
 * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
 *
 * SPDX-License-Identifier: Apache-2.0
 *
 * Licensed under the Apache License, Version 2.0 (the License); you may
 * not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an AS IS BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 #include "arm_math.h"
 /**
  @ingroup groupMath
 */
 /**
  @addtogroup BasicSub
  @{
 */
 /**
  @brief         Q31 vector subtraction.
  @param[in]     pSrcA      points to the first input vector
  @param[in]     pSrcB      points to the second input vector
  @param[out]    pDst       points to the output vector
  @param[in]     blockSize  number of samples in each vector
  @return        none
  @par           Scaling and Overflow Behavior
                   The function uses saturating arithmetic.
                   Results outside of the allowable Q31 range [0x80000000 0x7FFFFFFF] are saturated.
 */
 #if defined(ARM_MATH_MVEI)
 #include "arm_helium_utils.h"
 /*
  * Helium (MVE) variant: saturating subtraction on four Q31 lanes per
  * iteration, followed by a predicated tail for the 1..3 leftover samples.
  * The tail uses predicated loads and a predicated store so that nothing
  * beyond blockSize elements is read or written.
  */
 void arm_sub_q31(
   const q31_t * pSrcA,
   const q31_t * pSrcB,
         q31_t * pDst,
         uint32_t blockSize)
 {
     uint32_t blkCnt;            /* Loop counter */
     q31x4_t vecA;
     q31x4_t vecB;
     /* Compute 4 outputs at a time */
     blkCnt = blockSize >> 2;
     while (blkCnt > 0U)
     {
         /*
          * C = A - B
          * Subtract and then store the results in the destination buffer.
          */
         vecA = vld1q(pSrcA);
         vecB = vld1q(pSrcB);
         vst1q(pDst, vqsubq(vecA, vecB));
         /*
          * Decrement the blockSize loop counter
          */
         blkCnt--;
         /*
          * advance vector source and destination pointers
          */
         pSrcA  += 4;
         pSrcB  += 4;
         pDst   += 4;
     }
     /*
      * tail: blockSize % 4 samples remain
      */
     blkCnt = blockSize & 3;
     if (blkCnt > 0U)
     {
         /* Predicate that enables only the first blkCnt 32-bit lanes */
         mve_pred16_t p0 = vctp32q(blkCnt);
         /*
          * Zeroing predicated loads: inactive lanes are not read from memory,
          * so the tail never touches data past the end of the input buffers.
          */
         vecA = vld1q_z(pSrcA, p0);
         vecB = vld1q_z(pSrcB, p0);
         /* Predicated store: only the active lanes are written */
         vstrwq_p(pDst, vqsubq(vecA, vecB), p0);
     }
 }
 #else
 /*
  * Portable C variant of the Q31 vector subtraction.
  * With ARM_MATH_LOOPUNROLL a first pass handles four samples per iteration;
  * a final loop handles the remaining samples one by one. Every element goes
  * through __QSUB.
  */
 void arm_sub_q31(
   const q31_t * pSrcA,
   const q31_t * pSrcB,
         q31_t * pDst,
         uint32_t blockSize)
 {
   uint32_t count;   /* Groups of four, then single samples, still to process */
 #if defined (ARM_MATH_LOOPUNROLL)
   /* Unrolled pass: C = A - B on four samples per iteration */
   for (count = blockSize >> 2U; count > 0U; count--)
   {
     pDst[0] = __QSUB(pSrcA[0], pSrcB[0]);
     pDst[1] = __QSUB(pSrcA[1], pSrcB[1]);
     pDst[2] = __QSUB(pSrcA[2], pSrcB[2]);
     pDst[3] = __QSUB(pSrcA[3], pSrcB[3]);
     pSrcA += 4;
     pSrcB += 4;
     pDst  += 4;
   }
   /* Samples left over after the unrolled pass */
   count = blockSize % 0x4U;
 #else
   /* No unrolling: the loop below processes every sample */
   count = blockSize;
 #endif /* #if defined (ARM_MATH_LOOPUNROLL) */
   /* Single-sample loop: C = A - B */
   for (; count > 0U; count--)
   {
     *pDst = __QSUB(*pSrcA, *pSrcB);
     pSrcA++;
     pSrcB++;
     pDst++;
   }
 }
 #endif /* defined(ARM_MATH_MVEI) */
 /**
  @} end of BasicSub group
 */
--- a/libraries/cmsis/dsp/Source/BasicMathFunctions/arm_sub_q7.c
+++ b/libraries/cmsis/dsp/Source/BasicMathFunctions/arm_sub_q7.c
@@ -0,0 +1,158 @@
 /* ----------------------------------------------------------------------
 * Project:      CMSIS DSP Library
 * Title:        arm_sub_q7.c
 * Description:  Q7 vector subtraction
 *
 * $Date:        18. March 2019
 * $Revision:    V1.6.0
 *
 * Target Processor: Cortex-M cores
 * -------------------------------------------------------------------- */
 /*
 * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
 *
 * SPDX-License-Identifier: Apache-2.0
 *
 * Licensed under the Apache License, Version 2.0 (the License); you may
 * not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an AS IS BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 #include "arm_math.h"
 /**
  @ingroup groupMath
 */
 /**
  @addtogroup BasicSub
  @{
 */
 /**
  @brief         Q7 vector subtraction.
  @param[in]     pSrcA      points to the first input vector
  @param[in]     pSrcB      points to the second input vector
  @param[out]    pDst       points to the output vector
  @param[in]     blockSize  number of samples in each vector
  @return        none
  @par           Scaling and Overflow Behavior
                   The function uses saturating arithmetic.
                   Results outside of the allowable Q7 range [0x80 0x7F] will be saturated.
 */
 #if defined(ARM_MATH_MVEI)
 #include "arm_helium_utils.h"
 /*
  * Helium (MVE) variant: saturating subtraction on sixteen Q7 lanes per
  * iteration, followed by a predicated tail for the 1..15 leftover samples.
  * The tail uses predicated loads and a predicated store so that nothing
  * beyond blockSize elements is read or written.
  */
 void arm_sub_q7(
     const q7_t * pSrcA,
     const q7_t * pSrcB,
     q7_t * pDst,
     uint32_t blockSize)
 {
     uint32_t  blkCnt;           /* loop counters */
     q7x16_t vecA;
     q7x16_t vecB;
     /* Compute 16 outputs at a time */
     blkCnt = blockSize >> 4;
     while (blkCnt > 0U)
     {
         /*
          * C = A - B
          * Subtract and then store the results in the destination buffer.
          */
         vecA = vld1q(pSrcA);
         vecB = vld1q(pSrcB);
         vst1q(pDst, vqsubq(vecA, vecB));
         /*
          * Decrement the blockSize loop counter
          */
         blkCnt--;
         /*
          * advance vector source and destination pointers
          */
         pSrcA  += 16;
         pSrcB  += 16;
         pDst   += 16;
     }
     /*
      * tail: blockSize % 16 samples remain
      */
     blkCnt = blockSize & 0xF;
     if (blkCnt > 0U)
     {
         /* Predicate that enables only the first blkCnt 8-bit lanes */
         mve_pred16_t p0 = vctp8q(blkCnt);
         /*
          * Zeroing predicated loads: inactive lanes are not read from memory,
          * so the tail never touches data past the end of the input buffers.
          */
         vecA = vld1q_z(pSrcA, p0);
         vecB = vld1q_z(pSrcB, p0);
         /* Predicated store: only the active lanes are written */
         vstrbq_p(pDst, vqsubq(vecA, vecB), p0);
     }
 }
 #else
 /*
  * Portable C / DSP-extension variant of the Q7 vector subtraction.
  * With ARM_MATH_LOOPUNROLL a first pass handles four samples per iteration
  * (as one packed word when ARM_MATH_DSP is defined); a final loop handles
  * the remaining samples one by one.
  */
 void arm_sub_q7(
   const q7_t * pSrcA,
   const q7_t * pSrcB,
         q7_t * pDst,
         uint32_t blockSize)
 {
   uint32_t count;   /* Groups of four, then single samples, still to process */
 #if defined (ARM_MATH_LOOPUNROLL)
   /* Unrolled pass: C = A - B on four samples per iteration */
   for (count = blockSize >> 2U; count > 0U; count--)
   {
 #if defined (ARM_MATH_DSP)
     /* Read four packed samples from each source, subtract, write four back
        (the read/write helpers advance the pointers themselves). */
     write_q7x4_ia (&pDst,
                    __QSUB8(read_q7x4_ia ((q7_t **) &pSrcA),
                            read_q7x4_ia ((q7_t **) &pSrcB)));
 #else
     /* Widen, subtract, then saturate back to 8 bits */
     pDst[0] = (q7_t) __SSAT((q15_t) pSrcA[0] - pSrcB[0], 8);
     pDst[1] = (q7_t) __SSAT((q15_t) pSrcA[1] - pSrcB[1], 8);
     pDst[2] = (q7_t) __SSAT((q15_t) pSrcA[2] - pSrcB[2], 8);
     pDst[3] = (q7_t) __SSAT((q15_t) pSrcA[3] - pSrcB[3], 8);
     pSrcA += 4;
     pSrcB += 4;
     pDst  += 4;
 #endif
   }
   /* Samples left over after the unrolled pass */
   count = blockSize % 0x4U;
 #else
   /* No unrolling: the loop below processes every sample */
   count = blockSize;
 #endif /* #if defined (ARM_MATH_LOOPUNROLL) */
   /* Single-sample loop: C = A - B */
   for (; count > 0U; count--)
   {
     *pDst = (q7_t) __SSAT((q15_t) *pSrcA - *pSrcB, 8);
     pSrcA++;
     pSrcB++;
     pDst++;
   }
 }
 #endif /* defined(ARM_MATH_MVEI) */
 /**
  @} end of BasicSub group
 */
--- a/libraries/cmsis/dsp/Source/BasicMathFunctions/arm_xor_u16.c
+++ b/libraries/cmsis/dsp/Source/BasicMathFunctions/arm_xor_u16.c
@@ -0,0 +1,137 @@
 /* ----------------------------------------------------------------------
 * Project:      CMSIS DSP Library
 * Title:        arm_xor_u16.c
 * Description:  uint16_t bitwise exclusive OR
 *
 * $Date:        14 November 2019
 * $Revision:    V1.6.0
 *
 * Target Processor: Cortex-M cores
 * -------------------------------------------------------------------- */
 /*
 * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
 *
 * SPDX-License-Identifier: Apache-2.0
 *
 * Licensed under the Apache License, Version 2.0 (the License); you may
 * not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an AS IS BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 #include "arm_math.h"
 /**
  @ingroup groupMath
 */
 /**
  @defgroup Xor Vector bitwise exclusive OR
  Compute the logical bitwise XOR.
  There are separate functions for uint32_t, uint16_t, and uint8_t data types.
 */
 /**
  @addtogroup Xor
  @{
 */
 /**
  @brief         Compute the logical bitwise XOR of two fixed-point vectors.
  @param[in]     pSrcA      points to input vector A
  @param[in]     pSrcB      points to input vector B
  @param[out]    pDst       points to output vector
  @param[in]     blockSize  number of samples in each vector
  @return        none
 */
 /*
  * Writes pDst[n] = pSrcA[n] ^ pSrcB[n] for 0 <= n < blockSize.
  * Three compile-time variants: Helium (MVE) with a predicated tail, Neon
  * with a scalar tail, and a plain scalar loop.
  */
 void arm_xor_u16(
     const uint16_t * pSrcA,
     const uint16_t * pSrcB,
           uint16_t * pDst,
           uint32_t blockSize)
 {
     uint32_t blkCnt;      /* Loop counter */
 #if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
     /* Unsigned vectors: match the uint16_t data and the veorq_u16 operands */
     uint16x8_t vecSrcA, vecSrcB;
     /* Compute 8 outputs at a time */
     blkCnt = blockSize >> 3;
     while (blkCnt > 0U)
     {
         vecSrcA = vld1q(pSrcA);
         vecSrcB = vld1q(pSrcB);
         vst1q(pDst, veorq_u16(vecSrcA, vecSrcB) );
         pSrcA += 8;
         pSrcB += 8;
         pDst  += 8;
         /* Decrement the loop counter */
         blkCnt--;
     }
     /* Tail: blockSize % 8 samples remain */
     blkCnt = blockSize & 7;
     if (blkCnt > 0U)
     {
         /* Predicate that enables only the first blkCnt 16-bit lanes */
         mve_pred16_t p0 = vctp16q(blkCnt);
         /* Zeroing predicated loads: inactive lanes are not read from memory,
            so the tail never touches data past the end of the input buffers. */
         vecSrcA = vld1q_z(pSrcA, p0);
         vecSrcB = vld1q_z(pSrcB, p0);
         /* Predicated store: only the active lanes are written */
         vstrhq_p(pDst, veorq_u16(vecSrcA, vecSrcB), p0);
     }
 #else
 #if defined(ARM_MATH_NEON) && !defined(ARM_MATH_AUTOVECTORIZE)
     uint16x8_t vecA, vecB;
     /* Compute 8 outputs at a time */
     blkCnt = blockSize >> 3U;
     while (blkCnt > 0U)
     {
         vecA = vld1q_u16(pSrcA);
         vecB = vld1q_u16(pSrcB);
         vst1q_u16(pDst, veorq_u16(vecA, vecB) );
         pSrcA += 8;
         pSrcB += 8;
         pDst  += 8;
         /* Decrement the loop counter */
         blkCnt--;
     }
     /* Tail: leftover samples go through the scalar loop below */
     blkCnt = blockSize & 7;
 #else
     /* Initialize blkCnt with number of samples */
     blkCnt = blockSize;
 #endif
     /* Scalar loop: one XOR per sample */
     while (blkCnt > 0U)
     {
         *pDst++ = (*pSrcA++)^(*pSrcB++);
         /* Decrement the loop counter */
         blkCnt--;
     }
 #endif /* if defined(ARM_MATH_MVEI) */
 }
 /**
  @} end of Xor group
 */
--- a/libraries/cmsis/dsp/Source/BasicMathFunctions/arm_xor_u32.c
+++ b/libraries/cmsis/dsp/Source/BasicMathFunctions/arm_xor_u32.c
@@ -0,0 +1,129 @@
 /* ----------------------------------------------------------------------
 * Project:      CMSIS DSP Library
 * Title:        arm_xor_u32.c
 * Description:  uint32_t bitwise exclusive OR
 *
 * $Date:        14 November 2019
 * $Revision:    V1.6.0
 *
 * Target Processor: Cortex-M cores
 * -------------------------------------------------------------------- */
 /*
 * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
 *
 * SPDX-License-Identifier: Apache-2.0
 *
 * Licensed under the Apache License, Version 2.0 (the License); you may
 * not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an AS IS BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 #include "arm_math.h"
 /**
  @ingroup groupMath
 */
 /**
  @addtogroup Xor
  @{
 */
 /**
  @brief         Compute the logical bitwise XOR of two fixed-point vectors.
  @param[in]     pSrcA      points to input vector A
  @param[in]     pSrcB      points to input vector B
  @param[out]    pDst       points to output vector
  @param[in]     blockSize  number of samples in each vector
  @return        none
 */
 /*
  * Writes pDst[n] = pSrcA[n] ^ pSrcB[n] for 0 <= n < blockSize.
  * Three compile-time variants: Helium (MVE) with a predicated tail, Neon
  * with a scalar tail, and a plain scalar loop.
  */
 void arm_xor_u32(
     const uint32_t * pSrcA,
     const uint32_t * pSrcB,
           uint32_t * pDst,
           uint32_t blockSize)
 {
     uint32_t blkCnt;      /* Loop counter */
 #if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
     /* Unsigned vectors: match the uint32_t data and the veorq_u32 operands */
     uint32x4_t vecSrcA, vecSrcB;
     /* Compute 4 outputs at a time */
     blkCnt = blockSize >> 2;
     while (blkCnt > 0U)
     {
         vecSrcA = vld1q(pSrcA);
         vecSrcB = vld1q(pSrcB);
         vst1q(pDst, veorq_u32(vecSrcA, vecSrcB) );
         pSrcA += 4;
         pSrcB += 4;
         pDst  += 4;
         /* Decrement the loop counter */
         blkCnt--;
     }
     /* Tail: blockSize % 4 samples remain */
     blkCnt = blockSize & 3;
     if (blkCnt > 0U)
     {
         /* Predicate that enables only the first blkCnt 32-bit lanes */
         mve_pred16_t p0 = vctp32q(blkCnt);
         /* Zeroing predicated loads: inactive lanes are not read from memory,
            so the tail never touches data past the end of the input buffers. */
         vecSrcA = vld1q_z(pSrcA, p0);
         vecSrcB = vld1q_z(pSrcB, p0);
         /* Predicated store: only the active lanes are written */
         vstrwq_p(pDst, veorq_u32(vecSrcA, vecSrcB), p0);
     }
 #else
 #if defined(ARM_MATH_NEON) && !defined(ARM_MATH_AUTOVECTORIZE)
     uint32x4_t vecA, vecB;
     /* Compute 4 outputs at a time */
     blkCnt = blockSize >> 2U;
     while (blkCnt > 0U)
     {
         vecA = vld1q_u32(pSrcA);
         vecB = vld1q_u32(pSrcB);
         vst1q_u32(pDst, veorq_u32(vecA, vecB) );
         pSrcA += 4;
         pSrcB += 4;
         pDst  += 4;
         /* Decrement the loop counter */
         blkCnt--;
     }
     /* Tail: leftover samples go through the scalar loop below */
     blkCnt = blockSize & 3;
 #else
     /* Initialize blkCnt with number of samples */
     blkCnt = blockSize;
 #endif
     /* Scalar loop: one XOR per sample */
     while (blkCnt > 0U)
     {
         *pDst++ = (*pSrcA++)^(*pSrcB++);
         /* Decrement the loop counter */
         blkCnt--;
     }
 #endif /* if defined(ARM_MATH_MVEI) */
 }
 /**
  @} end of Xor group
 */
--- a/libraries/cmsis/dsp/Source/BasicMathFunctions/arm_xor_u8.c
+++ b/libraries/cmsis/dsp/Source/BasicMathFunctions/arm_xor_u8.c
@@ -0,0 +1,129 @@
 /* ----------------------------------------------------------------------
 * Project:      CMSIS DSP Library
 * Title:        arm_xor_u8.c
 * Description:  uint8_t bitwise exclusive OR
 *
 * $Date:        14 November 2019
 * $Revision:    V1.6.0
 *
 * Target Processor: Cortex-M cores
 * -------------------------------------------------------------------- */
 /*
 * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
 *
 * SPDX-License-Identifier: Apache-2.0
 *
 * Licensed under the Apache License, Version 2.0 (the License); you may
 * not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an AS IS BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 #include "arm_math.h"
 /**
  @ingroup groupMath
 */
 /**
  @addtogroup Xor
  @{
 */
 /**
  @brief         Compute the logical bitwise XOR of two fixed-point vectors.
  @param[in]     pSrcA      points to input vector A
  @param[in]     pSrcB      points to input vector B
  @param[out]    pDst       points to output vector
  @param[in]     blockSize  number of samples in each vector
  @return        none
 */
 /*
  * Writes pDst[n] = pSrcA[n] ^ pSrcB[n] for 0 <= n < blockSize.
  * Three compile-time variants: Helium (MVE) with a predicated tail, Neon
  * with a scalar tail, and a plain scalar loop.
  */
 void arm_xor_u8(
     const uint8_t * pSrcA,
     const uint8_t * pSrcB,
           uint8_t * pDst,
           uint32_t blockSize)
 {
     uint32_t blkCnt;      /* Loop counter */
 #if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
     /* Unsigned vectors: match the uint8_t data and the veorq_u8 operands */
     uint8x16_t vecSrcA, vecSrcB;
     /* Compute 16 outputs at a time */
     blkCnt = blockSize >> 4;
     while (blkCnt > 0U)
     {
         vecSrcA = vld1q(pSrcA);
         vecSrcB = vld1q(pSrcB);
         vst1q(pDst, veorq_u8(vecSrcA, vecSrcB) );
         pSrcA += 16;
         pSrcB += 16;
         pDst  += 16;
         /* Decrement the loop counter */
         blkCnt--;
     }
     /* Tail: blockSize % 16 samples remain */
     blkCnt = blockSize & 0xF;
     if (blkCnt > 0U)
     {
         /* Predicate that enables only the first blkCnt 8-bit lanes */
         mve_pred16_t p0 = vctp8q(blkCnt);
         /* Zeroing predicated loads: inactive lanes are not read from memory,
            so the tail never touches data past the end of the input buffers. */
         vecSrcA = vld1q_z(pSrcA, p0);
         vecSrcB = vld1q_z(pSrcB, p0);
         /* Predicated store: only the active lanes are written */
         vstrbq_p(pDst, veorq_u8(vecSrcA, vecSrcB), p0);
     }
 #else
 #if defined(ARM_MATH_NEON) && !defined(ARM_MATH_AUTOVECTORIZE)
     uint8x16_t vecA, vecB;
     /* Compute 16 outputs at a time */
     blkCnt = blockSize >> 4U;
     while (blkCnt > 0U)
     {
         vecA = vld1q_u8(pSrcA);
         vecB = vld1q_u8(pSrcB);
         vst1q_u8(pDst, veorq_u8(vecA, vecB) );
         pSrcA += 16;
         pSrcB += 16;
         pDst  += 16;
         /* Decrement the loop counter */
         blkCnt--;
     }
     /* Tail: leftover samples go through the scalar loop below */
     blkCnt = blockSize & 0xF;
 #else
     /* Initialize blkCnt with number of samples */
     blkCnt = blockSize;
 #endif
     /* Scalar loop: one XOR per sample */
     while (blkCnt > 0U)
     {
         *pDst++ = (*pSrcA++)^(*pSrcB++);
         /* Decrement the loop counter */
         blkCnt--;
     }
 #endif /* if defined(ARM_MATH_MVEI) */
 }
 /**
  @} end of Xor group
 */
--- a/libraries/cmsis/dsp/Source/BayesFunctions/BayesFunctions.c
+++ b/libraries/cmsis/dsp/Source/BayesFunctions/BayesFunctions.c
@@ -0,0 +1,29 @@
 /* ----------------------------------------------------------------------
 * Project:      CMSIS DSP Library
 * Title:        BayesFunctions.c
 * Description:  Combination of all bayes function source files.
 *
 * $Date:        16. March 2020
 * $Revision:    V1.0.0
 *
 * Target Processor: Cortex-M cores
 * -------------------------------------------------------------------- */
 /*
 * Copyright (C) 2020 ARM Limited or its affiliates. All rights reserved.
 *
 * SPDX-License-Identifier: Apache-2.0
 *
 * Licensed under the Apache License, Version 2.0 (the License); you may
 * not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an AS IS BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 #include "arm_gaussian_naive_bayes_predict_f32.c"
--- a/libraries/cmsis/dsp/Source/BayesFunctions/CMakeLists.txt
+++ b/libraries/cmsis/dsp/Source/BayesFunctions/CMakeLists.txt
@@ -0,0 +1,19 @@
 # Builds the CMSISDSPBayes static library from the Bayes function sources
 # in this directory.
 cmake_minimum_required (VERSION 3.6)
 project(CMSISDSPBayes)
 # configLib / configDsp are defined outside this file; presumably shared
 # helper modules that apply common build settings - confirm against them.
 include(configLib)
 include(configDsp)
 # Only files with an underscore in their name are collected, so the
 # aggregate BayesFunctions.c (which #includes the individual sources) is not
 # matched by this pattern.
 file(GLOB SRC "./*_*.c")
 add_library(CMSISDSPBayes STATIC ${SRC})
 # ROOT is expected to be set by the including project.
 configLib(CMSISDSPBayes ${ROOT})
 configDsp(CMSISDSPBayes ${ROOT})
 ### Includes
 # DSP is expected to be set by the including project; its Include directory
 # is exported to consumers of the library (PUBLIC).
 target_include_directories(CMSISDSPBayes PUBLIC "${DSP}/Include")
--- a/libraries/cmsis/dsp/Source/BayesFunctions/arm_gaussian_naive_bayes_predict_f32.c
+++ b/libraries/cmsis/dsp/Source/BayesFunctions/arm_gaussian_naive_bayes_predict_f32.c
@@ -0,0 +1,397 @@
 /* ----------------------------------------------------------------------
 * Project:      CMSIS DSP Library
 * Title:        arm_gaussian_naive_bayes_predict_f32
 * Description:  Naive Gaussian Bayesian Estimator
 *
 *
 * Target Processor: Cortex-M and Cortex-A cores
 * -------------------------------------------------------------------- */
 /*
 * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
 *
 * SPDX-License-Identifier: Apache-2.0
 *
 * Licensed under the Apache License, Version 2.0 (the License); you may
 * not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an AS IS BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 #include "arm_math.h"
 #include <limits.h>
 #include <math.h>
 #define PI_F 3.1415926535897932384626433832795f
 #define DPI_F (2.0f*3.1415926535897932384626433832795f)
 /**
 * @addtogroup groupBayes
 * @{
 */
 /**
 * @brief Naive Gaussian Bayesian Estimator
 *
 * @param[in]  *S         points to a naive bayes instance structure
 * @param[in]  *in        points to the elements of the input vector.
 * @param[in]  *pBuffer   points to a buffer of length numberOfClasses
 * @return The predicted class
 *
 * @par If the number of classes is large, the MVE version will consume a lot of
 * stack, since the log priors are stored in an array on the stack.
 *
 */
 #if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE)
 #include "arm_helium_utils.h"
 #include "arm_vec_math.h"
 /*
  * Helium (MVE) implementation.
  *
  * For every class the loop accumulates, over all vector dimensions,
  *     vacc1 = sum( log(2 * PI * (sigma + epsilon)) )
  *     vacc2 = sum( (in - theta)^2 / (sigma + epsilon) )
  * and stores  -0.5 * vacc1 - 0.5 * vacc2 + log(prior)  into pBuffer.
  * The index of the largest pBuffer entry is returned.
  *
  * The log priors are kept in a variable-length array on the stack, so the
  * stack usage grows with S->numberOfClasses.
  */
 uint32_t arm_gaussian_naive_bayes_predict_f32(const arm_gaussian_naive_bayes_instance_f32 *S,
   const float32_t * in,
   float32_t *pBuffer)
 {
    uint32_t         nbClass;
    const float32_t *pTheta = S->theta;
    const float32_t *pSigma = S->sigma;
    float32_t       *buffer = pBuffer;
    const float32_t *pIn;
    float32_t        result;
    f32x4_t          vsigma;
    float32_t        tmp;
    f32x4_t          vacc1, vacc2;
    uint32_t         index;
    float32_t        logclassPriors[S->numberOfClasses];
    float32_t       *pLogPrior = logclassPriors;

    /* Log of every class prior, computed once up front. */
    arm_vlog_f32((float32_t *) S->classPriors, logclassPriors, S->numberOfClasses);

    for (nbClass = 0; nbClass < S->numberOfClasses; nbClass++) {
        uint32_t         blkCnt;

        /* Every class is scored against the same input vector. */
        pIn = in;
        vacc1 = vdupq_n_f32(0);
        vacc2 = vdupq_n_f32(0);

        /* Full vectors: four dimensions per iteration. */
        blkCnt = S->vectorDimension >> 2;
        while (blkCnt > 0U) {
            f32x4_t         vinvSigma, vtmp;

            vsigma = vaddq_n_f32(vld1q(pSigma), S->epsilon);
            vacc1 = vaddq(vacc1, vlogq_f32(vmulq_n_f32(vsigma, 2.0f * PI)));
            vinvSigma = vrecip_medprec_f32(vsigma);
            vtmp = vsubq(vld1q(pIn), vld1q(pTheta));
            /* squaring */
            vtmp = vmulq(vtmp, vtmp);
            vacc2 = vfmaq(vacc2, vtmp, vinvSigma);

            pIn += 4;
            pTheta += 4;
            pSigma += 4;
            blkCnt--;
        }

        /* Tail: 1 to 3 remaining dimensions. */
        blkCnt = S->vectorDimension & 3;
        if (blkCnt > 0U) {
            mve_pred16_t    p0 = vctp32q(blkCnt);
            f32x4_t         vinvSigma, vtmp;

            /*
             * Predicated (zeroing) loads: only the blkCnt active lanes touch
             * memory, so nothing is read past the end of in / theta / sigma.
             * Inactive lanes may still produce inf/NaN below, but the
             * predicated accumulations discard them.
             */
            vsigma = vaddq_n_f32(vld1q_z_f32(pSigma, p0), S->epsilon);
            vacc1 =
                vaddq_m_f32(vacc1, vacc1, vlogq_f32(vmulq_n_f32(vsigma, 2.0f * PI)), p0);
            vinvSigma = vrecip_medprec_f32(vsigma);
            vtmp = vsubq(vld1q_z_f32(pIn, p0), vld1q_z_f32(pTheta, p0));
            /* squaring */
            vtmp = vmulq(vtmp, vtmp);
            vacc2 = vfmaq_m_f32(vacc2, vtmp, vinvSigma, p0);

            /* pIn is reset at the top of the class loop, so it is not advanced here. */
            pTheta += blkCnt;
            pSigma += blkCnt;
        }

        /* Reduce both accumulators across lanes and add the log prior. */
        tmp = -0.5f * vecAddAcrossF32Mve(vacc1);
        tmp -= 0.5f * vecAddAcrossF32Mve(vacc2);
        *buffer = tmp + *pLogPrior++;
        buffer++;
    }

    /* The predicted class is the position of the largest score. */
    arm_max_f32(pBuffer, S->numberOfClasses, &result, &index);
    return (index);
 }
 #else
 #if defined(ARM_MATH_NEON)
 #include "NEMath.h"
 /*
  * Neon implementation.
  *
  * Classes are scored two at a time so that each block of the input vector is
  * loaded once and used for both classes; a trailing odd class is handled by a
  * second, single-class loop. The pointer arithmetic below treats S->theta and
  * S->sigma as consecutive rows of S->vectorDimension values, one row per class.
  *
  * Each score written to pBuffer is
  *     log(prior) - 0.5 * sum(log(2*pi*(sigma + epsilon)))
  *                - 0.5 * sum((in - theta)^2 / (sigma + epsilon))
  * and the function returns the index of the largest score.
  *
  * NOTE(review): vlogq_f32 and vinvq_f32 come from NEMath.h and are presumably
  * vector natural-log and reciprocal routines - confirm their accuracy there.
  */
 uint32_t arm_gaussian_naive_bayes_predict_f32(const arm_gaussian_naive_bayes_instance_f32 *S,
   const float32_t * in,
   float32_t *pBuffer)
 {
    const float32_t *pPrior = S->classPriors;
    /* Parameters of the first class of the current pair ... */
    const float32_t *pTheta = S->theta;
    const float32_t *pSigma = S->sigma;
    /* ... and of the second class, one row further on. */
    const float32_t *pTheta1 = S->theta + S->vectorDimension;
    const float32_t *pSigma1 = S->sigma + S->vectorDimension;
    float32_t *buffer = pBuffer;
    const float32_t *pIn=in;
    float32_t result;
    float32_t sigma,sigma1;
    float32_t tmp,tmp1;
    uint32_t index;
    uint32_t vecBlkCnt;
    uint32_t classBlkCnt;
    float32x4_t epsilonV;
    float32x4_t sigmaV,sigmaV1;
    float32x4_t tmpV,tmpVb,tmpV1;
    float32x2_t tmpV2;
    float32x4_t thetaV,thetaV1;
    float32x4_t inV;
    epsilonV = vdupq_n_f32(S->epsilon);
    /* Number of class pairs. */
    classBlkCnt = S->numberOfClasses >> 1;
    while(classBlkCnt > 0)
    {
        pIn = in;
        /* Scalar scores start from the log prior of each class. */
        tmp = logf(*pPrior++);
        tmp1 = logf(*pPrior++);
        /* Vector accumulators for the two classes. */
        tmpV = vdupq_n_f32(0.0f);
        tmpV1 = vdupq_n_f32(0.0f);
        /* Four dimensions per iteration. */
        vecBlkCnt = S->vectorDimension >> 2;
        while(vecBlkCnt > 0)
        {
           sigmaV = vld1q_f32(pSigma);
           thetaV = vld1q_f32(pTheta);
           sigmaV1 = vld1q_f32(pSigma1);
           thetaV1 = vld1q_f32(pTheta1);
           /* Input block shared by both classes. */
           inV = vld1q_f32(pIn);
           sigmaV = vaddq_f32(sigmaV, epsilonV);
           sigmaV1 = vaddq_f32(sigmaV1, epsilonV);
           /* acc -= 0.5 * log(2*pi*sigma), first class */
           tmpVb = vmulq_n_f32(sigmaV,DPI_F);
           tmpVb = vlogq_f32(tmpVb);
           tmpV = vmlsq_n_f32(tmpV,tmpVb,0.5f);
           /* acc -= 0.5 * log(2*pi*sigma), second class */
           tmpVb = vmulq_n_f32(sigmaV1,DPI_F);
           tmpVb = vlogq_f32(tmpVb);
           tmpV1 = vmlsq_n_f32(tmpV1,tmpVb,0.5f);
           /* acc -= 0.5 * (in - theta)^2 / sigma, first class */
           tmpVb = vsubq_f32(inV,thetaV);
           tmpVb = vmulq_f32(tmpVb,tmpVb);
           tmpVb = vmulq_f32(tmpVb, vinvq_f32(sigmaV));
           tmpV = vmlsq_n_f32(tmpV,tmpVb,0.5f);
           /* acc -= 0.5 * (in - theta)^2 / sigma, second class */
           tmpVb = vsubq_f32(inV,thetaV1);
           tmpVb = vmulq_f32(tmpVb,tmpVb);
           tmpVb = vmulq_f32(tmpVb, vinvq_f32(sigmaV1));
           tmpV1 = vmlsq_n_f32(tmpV1,tmpVb,0.5f);
           pIn += 4;
           pTheta += 4;
           pSigma += 4;
           pTheta1 += 4;
           pSigma1 += 4;
           vecBlkCnt--;
        }
        /* Horizontal sum of the four lanes, folded into the scalar scores. */
        tmpV2 = vpadd_f32(vget_low_f32(tmpV),vget_high_f32(tmpV));
        tmp += vget_lane_f32(tmpV2, 0) + vget_lane_f32(tmpV2, 1);
        tmpV2 = vpadd_f32(vget_low_f32(tmpV1),vget_high_f32(tmpV1));
        tmp1 += vget_lane_f32(tmpV2, 0) + vget_lane_f32(tmpV2, 1);
        /* Scalar tail for the remaining 0 to 3 dimensions. */
        vecBlkCnt = S->vectorDimension & 3;
        while(vecBlkCnt > 0)
        {
           sigma = *pSigma + S->epsilon;
           sigma1 = *pSigma1 + S->epsilon;
           tmp -= 0.5f*logf(2.0f * PI_F * sigma);
           tmp -= 0.5f*(*pIn - *pTheta) * (*pIn - *pTheta) / sigma;
           tmp1 -= 0.5f*logf(2.0f * PI_F * sigma1);
           tmp1 -= 0.5f*(*pIn - *pTheta1) * (*pIn - *pTheta1) / sigma1;
           pIn++;
           pTheta++;
           pSigma++;
           pTheta1++;
           pSigma1++;
           vecBlkCnt--;
        }
        *buffer++ = tmp;
        *buffer++ = tmp1;
        /*
         * The inner loops already moved every pointer forward by one row.
         * Skipping one more row steps over the row consumed by the other
         * pointer, so both land on the next pair of classes.
         */
        pSigma += S->vectorDimension;
        pTheta += S->vectorDimension;
        pSigma1 += S->vectorDimension;
        pTheta1 += S->vectorDimension;
        classBlkCnt--;
    }
    /* Remaining class when numberOfClasses is odd (runs at most once). */
    classBlkCnt = S->numberOfClasses & 1;
    while(classBlkCnt > 0)
    {
        pIn = in;
        tmp = logf(*pPrior++);
        tmpV = vdupq_n_f32(0.0f);
        vecBlkCnt = S->vectorDimension >> 2;
        while(vecBlkCnt > 0)
        {
           sigmaV = vld1q_f32(pSigma);
           thetaV = vld1q_f32(pTheta);
           inV = vld1q_f32(pIn);
           sigmaV = vaddq_f32(sigmaV, epsilonV);
           /* acc -= 0.5 * log(2*pi*sigma) */
           tmpVb = vmulq_n_f32(sigmaV,DPI_F);
           tmpVb = vlogq_f32(tmpVb);
           tmpV = vmlsq_n_f32(tmpV,tmpVb,0.5f);
           /* acc -= 0.5 * (in - theta)^2 / sigma */
           tmpVb = vsubq_f32(inV,thetaV);
           tmpVb = vmulq_f32(tmpVb,tmpVb);
           tmpVb = vmulq_f32(tmpVb, vinvq_f32(sigmaV));
           tmpV = vmlsq_n_f32(tmpV,tmpVb,0.5f);
           pIn += 4;
           pTheta += 4;
           pSigma += 4;
           vecBlkCnt--;
        }
        /* Horizontal sum of the four lanes. */
        tmpV2 = vpadd_f32(vget_low_f32(tmpV),vget_high_f32(tmpV));
        tmp += vget_lane_f32(tmpV2, 0) + vget_lane_f32(tmpV2, 1);
        /* Scalar tail for the remaining 0 to 3 dimensions. */
        vecBlkCnt = S->vectorDimension & 3;
        while(vecBlkCnt > 0)
        {
           sigma = *pSigma + S->epsilon;
           tmp -= 0.5f*logf(2.0f * PI_F * sigma);
           tmp -= 0.5f*(*pIn - *pTheta) * (*pIn - *pTheta) / sigma;
           pIn++;
           pTheta++;
           pSigma++;
           vecBlkCnt--;
        }
        *buffer++ = tmp;
        classBlkCnt--;
    }
    /* The predicted class is the position of the largest score in pBuffer. */
    arm_max_f32(pBuffer,S->numberOfClasses,&result,&index);
    return(index);
 }
 #else
 /**
 * @brief Naive Gaussian Bayesian Estimator (portable scalar implementation)
 *
 * For every class the score
 *   log(prior) - 0.5 * sum(log(2*pi*(sigma + epsilon)))
 *              - 0.5 * sum((in - theta)^2 / (sigma + epsilon))
 * is written to pBuffer, and the index of the largest score is returned.
 *
 * @param[in]  *S         points to a naive bayes instance structure
 * @param[in]  *in        points to the elements of the input vector.
 * @param[in]  *pBuffer   points to a buffer of length numberOfClasses
 * @return The predicted class
 *
 */
 uint32_t arm_gaussian_naive_bayes_predict_f32(const arm_gaussian_naive_bayes_instance_f32 *S,
   const float32_t * in,
   float32_t *pBuffer)
 {
    /* Row of per-dimension parameters belonging to the class being scored. */
    const float32_t *thetaRow = S->theta;
    const float32_t *sigmaRow = S->sigma;
    float32_t logTermSum;
    float32_t sqTermSum;
    float32_t sigmaEps;
    float32_t delta;
    float32_t score;
    float32_t bestScore;
    uint32_t bestClass;
    uint32_t classIdx;
    uint32_t dimIdx;

    for (classIdx = 0; classIdx < S->numberOfClasses; classIdx++)
    {
        logTermSum = 0.0f;
        sqTermSum = 0.0f;

        for (dimIdx = 0; dimIdx < S->vectorDimension; dimIdx++)
        {
            sigmaEps = sigmaRow[dimIdx] + S->epsilon;
            delta = in[dimIdx] - thetaRow[dimIdx];

            logTermSum += logf(2.0f * PI_F * sigmaEps);
            sqTermSum += delta * delta / sigmaEps;
        }

        /* Move on to the parameters of the next class. */
        thetaRow += S->vectorDimension;
        sigmaRow += S->vectorDimension;

        score = -0.5f * logTermSum;
        score -= 0.5f * sqTermSum;
        pBuffer[classIdx] = score + logf(S->classPriors[classIdx]);
    }

    /* The predicted class is the position of the largest score. */
    arm_max_f32(pBuffer, S->numberOfClasses, &bestScore, &bestClass);
    return bestClass;
 }
 #endif
 #endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
 /**
 * @} end of groupBayes group
 */
--- a/libraries/cmsis/dsp/Source/CMakeLists.txt
+++ b/libraries/cmsis/dsp/Source/CMakeLists.txt
@@ -0,0 +1,280 @@
 cmake_minimum_required (VERSION 3.6)
 # Policy CMP0077 (option() honors normal variables) was introduced in
 # CMake 3.13. Setting a policy that the running CMake does not know is a hard
 # error, so guard it to keep the declared 3.6 minimum usable.
 if (POLICY CMP0077)
   cmake_policy(SET CMP0077 NEW)
 endif()
 project(CMSISDSP)
 # DSP Sources
 # ROOT is not set in this file; it must be provided by the including project.
 SET(DSP ${ROOT}/CMSIS/DSP)
 # Make the helper modules (configLib and the modules included by the
 # sub-projects) resolvable through include().
 list(APPEND CMAKE_MODULE_PATH ${DSP}/Source)
 list(APPEND CMAKE_MODULE_PATH ${DSP})
 include(configLib)
 # Acceleration and code-generation switches.
 # NEON, NEONEXPERIMENTAL, HELIUM and MVEF are tested by the CommonTables
 # sub-project to add extra table sources.
 # NOTE(review): the remaining switches are only declared here; presumably they
 # are consumed by configLib/configDsp - confirm.
 option(NEON "Neon acceleration" OFF)
 option(NEONEXPERIMENTAL "Neon experimental acceleration" OFF)
 option(LOOPUNROLL "Loop unrolling" ON)
 option(ROUNDING "Rounding" OFF)
 option(MATRIXCHECK "Matrix Checks" OFF)
 option(HELIUM "Helium acceleration (MVEF and MVEI supported)" OFF)
 option(MVEF "MVEF intrinsics supported" OFF)
 option(MVEI "MVEI intrinsics supported" OFF)
 # Select which parts of the CMSIS-DSP must be compiled.
 # There are some dependencies between the parts but they are not tracked
 # by this cmake. So, enabling some functions may require to enable some
 # other ones.
 # Each option below gates one add_subdirectory() call further down in this file.
 option(BASICMATH            "Basic Math Functions"              ON)
 option(COMPLEXMATH          "Complex Math Functions"            ON)
 option(CONTROLLER           "Controller Functions"              ON)
 option(FASTMATH             "Fast Math Functions"               ON)
 option(FILTERING            "Filtering Functions"               ON)
 option(MATRIX               "Matrix Functions"                  ON)
 option(STATISTICS           "Statistics Functions"              ON)
 option(SUPPORT              "Support Functions"                 ON)
 option(TRANSFORM            "Transform Functions"               ON)
 option(SVM                  "Support Vector Machine Functions"  ON)
 option(BAYES                "Bayesian Estimators"               ON)
 option(DISTANCE             "Distance Functions"                ON)
 # When OFF it is the default behavior : all tables are included.
 option(CONFIGTABLE          "Configuration of table allowed"    OFF)
 # When CONFIGTABLE is ON, select if all interpolation tables must be included
 option(ALLFAST              "All interpolation tables included" OFF)
 # When CONFIGTABLE is ON, select if all FFT tables must be included
 option(ALLFFT               "All fft tables included"           OFF)
 # Features which require inclusion of a data table.
 # Since some tables may be big, the corresponding feature can be
 # disabled.
 # Those options are taken into account only when CONFIGTABLE is ON
 # NOTE(review): the options are only declared in this file; presumably the
 # fft() and interpol() helpers called from the CommonTables sub-project read
 # them - confirm.
 # --- sin / cos / lms-norm features ---
 option(ARM_COS_F32          "cos f32"                           OFF)
 option(ARM_COS_Q31          "cos q31"                           OFF)
 option(ARM_COS_Q15          "cos q15"                           OFF)
 option(ARM_SIN_F32          "sin f32"                           OFF)
 option(ARM_SIN_Q31          "sin q31"                           OFF)
 option(ARM_SIN_Q15          "sin q15"                           OFF)
 option(ARM_SIN_COS_F32      "sin cos f32"                       OFF)
 option(ARM_SIN_COS_Q31      "sin cos q31"                       OFF)
 option(ARM_LMS_NORM_Q31     "lms norm q31"                      OFF)
 option(ARM_LMS_NORM_Q15     "lms norm q15"                      OFF)
 # --- cfft f64, lengths 16 to 4096 ---
 option(CFFT_F64_16          "cfft f64 16"                       OFF)
 option(CFFT_F64_32          "cfft f64 32"                       OFF)
 option(CFFT_F64_64          "cfft f64 64"                       OFF)
 option(CFFT_F64_128         "cfft f64 128"                      OFF)
 option(CFFT_F64_256         "cfft f64 256"                      OFF)
 option(CFFT_F64_512         "cfft f64 512"                      OFF)
 option(CFFT_F64_1024        "cfft f64 1024"                     OFF)
 option(CFFT_F64_2048        "cfft f64 2048"                     OFF)
 option(CFFT_F64_4096        "cfft f64 4096"                     OFF)
 # --- cfft f32, lengths 16 to 4096 ---
 option(CFFT_F32_16          "cfft f32 16"                       OFF)
 option(CFFT_F32_32          "cfft f32 32"                       OFF)
 option(CFFT_F32_64          "cfft f32 64"                       OFF)
 option(CFFT_F32_128         "cfft f32 128"                      OFF)
 option(CFFT_F32_256         "cfft f32 256"                      OFF)
 option(CFFT_F32_512         "cfft f32 512"                      OFF)
 option(CFFT_F32_1024        "cfft f32 1024"                     OFF)
 option(CFFT_F32_2048        "cfft f32 2048"                     OFF)
 option(CFFT_F32_4096        "cfft f32 4096"                     OFF)
 # --- cfft q31, lengths 16 to 4096 ---
 option(CFFT_Q31_16          "cfft q31 16"                       OFF)
 option(CFFT_Q31_32          "cfft q31 32"                       OFF)
 option(CFFT_Q31_64          "cfft q31 64"                       OFF)
 option(CFFT_Q31_128         "cfft q31 128"                      OFF)
 option(CFFT_Q31_256         "cfft q31 256"                      OFF)
 option(CFFT_Q31_512         "cfft q31 512"                      OFF)
 option(CFFT_Q31_1024        "cfft q31 1024"                     OFF)
 option(CFFT_Q31_2048        "cfft q31 2048"                     OFF)
 option(CFFT_Q31_4096        "cfft q31 4096"                     OFF)
 # --- cfft q15, lengths 16 to 4096 ---
 option(CFFT_Q15_16          "cfft q15 16"                       OFF)
 option(CFFT_Q15_32          "cfft q15 32"                       OFF)
 option(CFFT_Q15_64          "cfft q15 64"                       OFF)
 option(CFFT_Q15_128         "cfft q15 128"                      OFF)
 option(CFFT_Q15_256         "cfft q15 256"                      OFF)
 option(CFFT_Q15_512         "cfft q15 512"                      OFF)
 option(CFFT_Q15_1024        "cfft q15 1024"                     OFF)
 option(CFFT_Q15_2048        "cfft q15 2048"                     OFF)
 option(CFFT_Q15_4096        "cfft q15 4096"                     OFF)
 # --- rfft fast f32, lengths 32 to 4096 ---
 option(RFFT_FAST_F32_32     "rfft fast f32 32"                  OFF)
 option(RFFT_FAST_F32_64     "rfft fast f32 64"                  OFF)
 option(RFFT_FAST_F32_128    "rfft fast f32 128"                 OFF)
 option(RFFT_FAST_F32_256    "rfft fast f32 256"                 OFF)
 option(RFFT_FAST_F32_512    "rfft fast f32 512"                 OFF)
 option(RFFT_FAST_F32_1024   "rfft fast f32 1024"                OFF)
 option(RFFT_FAST_F32_2048   "rfft fast f32 2048"                OFF)
 option(RFFT_FAST_F32_4096   "rfft fast f32 4096"                OFF)
 # --- rfft f32, lengths 128 / 512 / 2048 / 8192 only ---
 option(RFFT_F32_128         "rfft f32 128"                      OFF)
 option(RFFT_F32_512         "rfft f32 512"                      OFF)
 option(RFFT_F32_2048        "rfft f32 2048"                     OFF)
 option(RFFT_F32_8192        "rfft f32 8192"                     OFF)
 # --- rfft fast f64, lengths 32 to 4096 ---
 option(RFFT_FAST_F64_32     "rfft fast f64 32"                  OFF)
 option(RFFT_FAST_F64_64     "rfft fast f64 64"                  OFF)
 option(RFFT_FAST_F64_128    "rfft fast f64 128"                 OFF)
 option(RFFT_FAST_F64_256    "rfft fast f64 256"                 OFF)
 option(RFFT_FAST_F64_512    "rfft fast f64 512"                 OFF)
 option(RFFT_FAST_F64_1024   "rfft fast f64 1024"                OFF)
 option(RFFT_FAST_F64_2048   "rfft fast f64 2048"                OFF)
 option(RFFT_FAST_F64_4096   "rfft fast f64 4096"                OFF)
 # --- rfft f64, lengths 128 / 512 / 2048 / 8192 only ---
 option(RFFT_F64_128         "rfft f64 128"                      OFF)
 option(RFFT_F64_512         "rfft f64 512"                      OFF)
 option(RFFT_F64_2048        "rfft f64 2048"                     OFF)
 option(RFFT_F64_8192        "rfft f64 8192"                     OFF)
 # --- rfft q31, lengths 32 to 8192 ---
 option(RFFT_Q31_32          "rfft q31 32"                       OFF)
 option(RFFT_Q31_64          "rfft q31 64"                       OFF)
 option(RFFT_Q31_128         "rfft q31 128"                      OFF)
 option(RFFT_Q31_256         "rfft q31 256"                      OFF)
 option(RFFT_Q31_512         "rfft q31 512"                      OFF)
 option(RFFT_Q31_1024        "rfft q31 1024"                     OFF)
 option(RFFT_Q31_2048        "rfft q31 2048"                     OFF)
 option(RFFT_Q31_4096        "rfft q31 4096"                     OFF)
 option(RFFT_Q31_8192        "rfft q31 8192"                     OFF)
 # --- rfft q15, lengths 32 to 8192 ---
 option(RFFT_Q15_32          "rfft q15 32"                       OFF)
 option(RFFT_Q15_64          "rfft q15 64"                       OFF)
 option(RFFT_Q15_128         "rfft q15 128"                      OFF)
 option(RFFT_Q15_256         "rfft q15 256"                      OFF)
 option(RFFT_Q15_512         "rfft q15 512"                      OFF)
 option(RFFT_Q15_1024        "rfft q15 1024"                     OFF)
 option(RFFT_Q15_2048        "rfft q15 2048"                     OFF)
 option(RFFT_Q15_4096        "rfft q15 4096"                     OFF)
 option(RFFT_Q15_8192        "rfft q15 8192"                     OFF)
 # --- dct4 f32 / q31 / q15, lengths 128 / 512 / 2048 / 8192 ---
 option(DCT4_F32_128          "dct4 f32 128"                     OFF)
 option(DCT4_F32_512          "dct4 f32 512"                     OFF)
 option(DCT4_F32_2048         "dct4 f32 2048"                    OFF)
 option(DCT4_F32_8192         "dct4 f32 8192"                    OFF)
 option(DCT4_Q31_128          "dct4 q31 128"                     OFF)
 option(DCT4_Q31_512          "dct4 q31 512"                     OFF)
 option(DCT4_Q31_2048         "dct4 q31 2048"                    OFF)
 option(DCT4_Q31_8192         "dct4 q31 8192"                    OFF)
 option(DCT4_Q15_128          "dct4 q15 128"                     OFF)
 option(DCT4_Q15_512          "dct4 q15 512"                     OFF)
 option(DCT4_Q15_2048         "dct4 q15 2048"                    OFF)
 option(DCT4_Q15_8192         "dct4 q15 8192"                    OFF)
 ###########################
 #
 # CMSIS DSP
 #
 ###########################
 # CMSISDSP is an INTERFACE library: it has no sources of its own and only
 # forwards the per-component static libraries selected by the options above.
 add_library(CMSISDSP INTERFACE)
 if (BASICMATH)
  add_subdirectory(BasicMathFunctions)
  target_link_libraries(CMSISDSP INTERFACE CMSISDSPBasicMath)
 endif()
 if (COMPLEXMATH)
  add_subdirectory(ComplexMathFunctions)
  target_link_libraries(CMSISDSP INTERFACE CMSISDSPComplexMath)
 endif()
 if (CONTROLLER)
  add_subdirectory(ControllerFunctions)
  # Fast tables inclusion is allowed
  if (CONFIGTABLE)
    target_compile_definitions(CMSISDSPController PUBLIC ARM_FAST_ALLOW_TABLES)
  endif()
  target_link_libraries(CMSISDSP INTERFACE CMSISDSPController)
 endif()
 if (FASTMATH)
  add_subdirectory(FastMathFunctions)
  # Fast tables inclusion is allowed
  if (CONFIGTABLE)
    target_compile_definitions(CMSISDSPFastMath PUBLIC ARM_FAST_ALLOW_TABLES)
  endif()
  target_link_libraries(CMSISDSP INTERFACE CMSISDSPFastMath)
 endif()
 if (FILTERING)
  add_subdirectory(FilteringFunctions)
  # Fast tables inclusion is allowed
  if (CONFIGTABLE)
    target_compile_definitions(CMSISDSPFiltering PUBLIC ARM_FAST_ALLOW_TABLES)
  endif()
  target_link_libraries(CMSISDSP INTERFACE CMSISDSPFiltering)
 endif()
 if (MATRIX)
  add_subdirectory(MatrixFunctions)
  target_link_libraries(CMSISDSP INTERFACE CMSISDSPMatrix)
 endif()
 if (STATISTICS)
  add_subdirectory(StatisticsFunctions)
  target_link_libraries(CMSISDSP INTERFACE CMSISDSPStatistics)
 endif()
 if (SUPPORT)
  add_subdirectory(SupportFunctions)
  target_link_libraries(CMSISDSP INTERFACE CMSISDSPSupport)
 endif()
 if (TRANSFORM)
  add_subdirectory(TransformFunctions)
  # FFT tables inclusion is allowed
  if (CONFIGTABLE)
    target_compile_definitions(CMSISDSPTransform PUBLIC ARM_FFT_ALLOW_TABLES)
  endif()
  target_link_libraries(CMSISDSP INTERFACE CMSISDSPTransform)
 endif()
 # The common tables are built as soon as one component that uses them is enabled.
 # NOTE(review): BAYES is not part of this condition although its Neon and
 # Helium kernels call vlogq_f32 - verify whether they need the common tables
 # when Bayes is the only component enabled.
 if (FILTERING OR CONTROLLER OR FASTMATH OR TRANSFORM OR SVM OR DISTANCE)
  add_subdirectory(CommonTables)
  if (TRANSFORM)
    # FFT tables inclusion is allowed
    if (CONFIGTABLE)
      target_compile_definitions(CMSISDSPCommon PUBLIC ARM_FFT_ALLOW_TABLES)
    endif()
  endif()
  if (FILTERING OR CONTROLLER OR FASTMATH)
    # Select which tables to include
    if (CONFIGTABLE)
      target_compile_definitions(CMSISDSPCommon PUBLIC ARM_FAST_ALLOW_TABLES)
    endif()
  endif()
  target_link_libraries(CMSISDSP INTERFACE CMSISDSPCommon)
  # The Common project adds the ComputeLibrary tables used by SVM and Distance
  # when Neon is ON.
 endif()
 if (SVM)
  add_subdirectory(SVMFunctions)
  target_link_libraries(CMSISDSP INTERFACE CMSISDSPSVM)
 endif()
 if (BAYES)
  add_subdirectory(BayesFunctions)
  target_link_libraries(CMSISDSP INTERFACE CMSISDSPBayes)
 endif()
 if (DISTANCE)
  add_subdirectory(DistanceFunctions)
  target_link_libraries(CMSISDSP INTERFACE CMSISDSPDistance)
 endif()
 ### Includes
 target_include_directories(CMSISDSP INTERFACE "${DSP}/Include")
--- a/libraries/cmsis/dsp/Source/CommonTables/CMakeLists.txt
+++ b/libraries/cmsis/dsp/Source/CommonTables/CMakeLists.txt
@@ -0,0 +1,41 @@
 # Build script for the common-tables static library (CMSISDSPCommon).
 # ROOT, DSP and the CONFIGTABLE / ALLFFT / ALLFAST / NEON / NEONEXPERIMENTAL /
 # HELIUM / MVEF switches are not set in this file; they come from the
 # including project.
 cmake_minimum_required (VERSION 3.6)
 project(CMSISDSPCommon)
 include(configLib)
 include(configDsp)
 add_library(CMSISDSPCommon STATIC arm_common_tables.c)
 configLib(CMSISDSPCommon ${ROOT})
 configDsp(CMSISDSPCommon ${ROOT})
 # With table configuration enabled, ALLFFT / ALLFAST request every FFT /
 # interpolation table through a compile definition.
 if (CONFIGTABLE AND ALLFFT)
    target_compile_definitions(CMSISDSPCommon PUBLIC ARM_ALL_FFT_TABLES) 
 endif()
 if (CONFIGTABLE AND ALLFAST)
    target_compile_definitions(CMSISDSPCommon PUBLIC ARM_ALL_FAST_TABLES) 
 endif()
 # NOTE(review): fft() and interpol() are defined in the included modules;
 # presumably they translate the per-table options into compile definitions on
 # the target - confirm in fft.cmake / interpol.cmake.
 include(fft)
 fft(CMSISDSPCommon)
 include(interpol)
 interpol(CMSISDSPCommon)
 target_sources(CMSISDSPCommon PRIVATE arm_const_structs.c)
 ### Includes
 target_include_directories(CMSISDSPCommon PUBLIC "${DSP}/Include")
 # Extra table sources that are only compiled for the matching acceleration.
 if (NEON OR NEONEXPERIMENTAL)
    target_sources(CMSISDSPCommon PRIVATE "${DSP}/ComputeLibrary/Source/arm_cl_tables.c")
 endif()
 if (HELIUM OR MVEF)
    target_sources(CMSISDSPCommon PRIVATE "${DSP}/Source/CommonTables/arm_mve_tables.c")
 endif()
--- a/libraries/cmsis/dsp/Source/CommonTables/CommonTables.c
+++ b/libraries/cmsis/dsp/Source/CommonTables/CommonTables.c
@@ -0,0 +1,31 @@
 /* ----------------------------------------------------------------------
 * Project:      CMSIS DSP Library
 * Title:        CommonTables.c
 * Description:  Combination of all common table source files.
 *
 * $Date:        08. January 2020
 * $Revision:    V1.1.0
 *
 * Target Processor: Cortex-M cores
 * -------------------------------------------------------------------- */
 /*
 * Copyright (C) 2019-2020 ARM Limited or its affiliates. All rights reserved.
 *
 * SPDX-License-Identifier: Apache-2.0
 *
 * Licensed under the Apache License, Version 2.0 (the License); you may
 * not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an AS IS BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 #include "arm_common_tables.c"
 #include "arm_const_structs.c"
 #include "arm_mve_tables.c"
--- a/libraries/cmsis/dsp/Source/CommonTables/arm_common_tables.c
+++ b/libraries/cmsis/dsp/Source/CommonTables/arm_common_tables.c
--- a/libraries/cmsis/dsp/Source/CommonTables/arm_const_structs.c
+++ b/libraries/cmsis/dsp/Source/CommonTables/arm_const_structs.c
@@ -0,0 +1,663 @@
 /* ----------------------------------------------------------------------
 * Project:      CMSIS DSP Library
 * Title:        arm_const_structs.c
 * Description:  Constant structs that are initialized for user convenience.
 *               For example, some can be given as arguments to the arm_cfft_f32() or arm_rfft_f32() functions.
 *
 * $Date:        27. January 2017
 * $Revision:    V.1.5.1
 *
 * Target Processor: Cortex-M cores
 * -------------------------------------------------------------------- */
 /*
 * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
 *
 * SPDX-License-Identifier: Apache-2.0
 *
 * Licensed under the Apache License, Version 2.0 (the License); you may
 * not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an AS IS BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 #include "arm_math.h"
 #include "arm_const_structs.h"
 /*
 ALLOW TABLE (ARM_FFT_ALLOW_TABLES) is true when config table is enabled and the Transform folder
 is included for compilation.
 */
 #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_FFT_ALLOW_TABLES)
 /*
 Double-precision (f64) complex FFT instances, one per length from 16 to 4096.
 Each initializer lists, in order: the FFT length, the twiddle-coefficient table
 (cast to const float64_t *), the bit-reversal index table and that table's
 length macro.
 A struct is compiled when ARM_DSP_CONFIG_TABLES is not defined, when
 ARM_ALL_FFT_TABLES is defined, or when both of its per-length table macros
 (ARM_TABLE_TWIDDLECOEF_F64_<len> and ARM_TABLE_BITREVIDX_FLT64_<len>) are defined.
 */
 #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F64_16) && defined(ARM_TABLE_BITREVIDX_FLT64_16))
 const arm_cfft_instance_f64 arm_cfft_sR_f64_len16 = {
  16, (const float64_t *)twiddleCoefF64_16, armBitRevIndexTableF64_16, ARMBITREVINDEXTABLEF64_16_TABLE_LENGTH
 };
 #endif
 #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F64_32) && defined(ARM_TABLE_BITREVIDX_FLT64_32))
 const arm_cfft_instance_f64 arm_cfft_sR_f64_len32 = {
  32, (const float64_t *)twiddleCoefF64_32, armBitRevIndexTableF64_32, ARMBITREVINDEXTABLEF64_32_TABLE_LENGTH
 };
 #endif
 #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F64_64) && defined(ARM_TABLE_BITREVIDX_FLT64_64))
 const arm_cfft_instance_f64 arm_cfft_sR_f64_len64 = {
  64, (const float64_t *)twiddleCoefF64_64, armBitRevIndexTableF64_64, ARMBITREVINDEXTABLEF64_64_TABLE_LENGTH
 };
 #endif
 #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F64_128) && defined(ARM_TABLE_BITREVIDX_FLT64_128))
 const arm_cfft_instance_f64 arm_cfft_sR_f64_len128 = {
  128, (const float64_t *)twiddleCoefF64_128, armBitRevIndexTableF64_128, ARMBITREVINDEXTABLEF64_128_TABLE_LENGTH
 };
 #endif
 #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F64_256) && defined(ARM_TABLE_BITREVIDX_FLT64_256))
 const arm_cfft_instance_f64 arm_cfft_sR_f64_len256 = {
  256, (const float64_t *)twiddleCoefF64_256, armBitRevIndexTableF64_256, ARMBITREVINDEXTABLEF64_256_TABLE_LENGTH
 };
 #endif
 #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F64_512) && defined(ARM_TABLE_BITREVIDX_FLT64_512))
 const arm_cfft_instance_f64 arm_cfft_sR_f64_len512 = {
  512, (const float64_t *)twiddleCoefF64_512, armBitRevIndexTableF64_512, ARMBITREVINDEXTABLEF64_512_TABLE_LENGTH
 };
 #endif
 #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F64_1024) && defined(ARM_TABLE_BITREVIDX_FLT64_1024))
 const arm_cfft_instance_f64 arm_cfft_sR_f64_len1024 = {
  1024, (const float64_t *)twiddleCoefF64_1024, armBitRevIndexTableF64_1024, ARMBITREVINDEXTABLEF64_1024_TABLE_LENGTH
 };
 #endif
 #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F64_2048) && defined(ARM_TABLE_BITREVIDX_FLT64_2048))
 const arm_cfft_instance_f64 arm_cfft_sR_f64_len2048 = {
  2048, (const float64_t *)twiddleCoefF64_2048, armBitRevIndexTableF64_2048, ARMBITREVINDEXTABLEF64_2048_TABLE_LENGTH
 };
 #endif
 #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F64_4096) && defined(ARM_TABLE_BITREVIDX_FLT64_4096))
 const arm_cfft_instance_f64 arm_cfft_sR_f64_len4096 = {
  4096, (const float64_t *)twiddleCoefF64_4096, armBitRevIndexTableF64_4096, ARMBITREVINDEXTABLEF64_4096_TABLE_LENGTH
 };
 #endif
 /* Floating-point structs */
 #if !defined(ARM_MATH_MVEF) || defined(ARM_MATH_AUTOVECTORIZE)
 /*
 Those structures cannot be used to initialize the MVE version of the FFT F32 instances.
 So they are not compiled when MVE is defined.
 For the MVE version, the new arm_cfft_init_f32 must be used.
 */
 /*
 Single-precision (f32) complex FFT instances, one per length from 16 to 4096.
 Each initializer lists, in order: the FFT length, the twiddle-coefficient table,
 the bit-reversal index table and that table's length macro.
 A struct is compiled when ARM_DSP_CONFIG_TABLES is not defined, when
 ARM_ALL_FFT_TABLES is defined, or when both of its per-length table macros
 (ARM_TABLE_TWIDDLECOEF_F32_<len> and ARM_TABLE_BITREVIDX_FLT_<len>) are defined.
 */
 #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F32_16) && defined(ARM_TABLE_BITREVIDX_FLT_16))
 const arm_cfft_instance_f32 arm_cfft_sR_f32_len16 = {
  16, twiddleCoef_16, armBitRevIndexTable16, ARMBITREVINDEXTABLE_16_TABLE_LENGTH
 };
 #endif
 #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F32_32) && defined(ARM_TABLE_BITREVIDX_FLT_32))
 const arm_cfft_instance_f32 arm_cfft_sR_f32_len32 = {
  32, twiddleCoef_32, armBitRevIndexTable32, ARMBITREVINDEXTABLE_32_TABLE_LENGTH
 };
 #endif
 #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F32_64) && defined(ARM_TABLE_BITREVIDX_FLT_64))
 const arm_cfft_instance_f32 arm_cfft_sR_f32_len64 = {
  64, twiddleCoef_64, armBitRevIndexTable64, ARMBITREVINDEXTABLE_64_TABLE_LENGTH
 };
 #endif
 #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F32_128) && defined(ARM_TABLE_BITREVIDX_FLT_128))
 const arm_cfft_instance_f32 arm_cfft_sR_f32_len128 = {
  128, twiddleCoef_128, armBitRevIndexTable128, ARMBITREVINDEXTABLE_128_TABLE_LENGTH
 };
 #endif
 #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F32_256) && defined(ARM_TABLE_BITREVIDX_FLT_256))
 const arm_cfft_instance_f32 arm_cfft_sR_f32_len256 = {
  256, twiddleCoef_256, armBitRevIndexTable256, ARMBITREVINDEXTABLE_256_TABLE_LENGTH
 };
 #endif
 #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F32_512) && defined(ARM_TABLE_BITREVIDX_FLT_512))
 const arm_cfft_instance_f32 arm_cfft_sR_f32_len512 = {
  512, twiddleCoef_512, armBitRevIndexTable512, ARMBITREVINDEXTABLE_512_TABLE_LENGTH
 };
 #endif
 #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F32_1024) && defined(ARM_TABLE_BITREVIDX_FLT_1024))
 const arm_cfft_instance_f32 arm_cfft_sR_f32_len1024 = {
  1024, twiddleCoef_1024, armBitRevIndexTable1024, ARMBITREVINDEXTABLE_1024_TABLE_LENGTH
 };
 #endif
 #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F32_2048) && defined(ARM_TABLE_BITREVIDX_FLT_2048))
 const arm_cfft_instance_f32 arm_cfft_sR_f32_len2048 = {
  2048, twiddleCoef_2048, armBitRevIndexTable2048, ARMBITREVINDEXTABLE_2048_TABLE_LENGTH
 };
 #endif
 #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F32_4096) && defined(ARM_TABLE_BITREVIDX_FLT_4096))
 const arm_cfft_instance_f32 arm_cfft_sR_f32_len4096 = {
  4096, twiddleCoef_4096, armBitRevIndexTable4096, ARMBITREVINDEXTABLE_4096_TABLE_LENGTH
 };
 #endif
 #endif /* !defined(ARM_MATH_MVEF) || defined(ARM_MATH_AUTOVECTORIZE) */
 /* Fixed-point structs */
 #if !defined(ARM_MATH_MVEI)
 /*
 Those structures cannot be used to initialize the MVE version of the FFT Q31 and Q15 instances.
 So they are not compiled when MVE is defined.
 For the MVE version, the new arm_cfft_init_q31 and arm_cfft_init_q15 functions must be used.
 */
 /*
 Q31 complex FFT instances, one per length from 16 to 4096.
 Each initializer lists, in order: the FFT length, the Q31 twiddle-coefficient table,
 the fixed-point bit-reversal index table and that table's length macro.
 A struct is compiled when ARM_DSP_CONFIG_TABLES is not defined, when
 ARM_ALL_FFT_TABLES is defined, or when both of its per-length table macros
 (ARM_TABLE_TWIDDLECOEF_Q31_<len> and ARM_TABLE_BITREVIDX_FXT_<len>) are defined.
 */
 #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_Q31_16) && defined(ARM_TABLE_BITREVIDX_FXT_16))
 const arm_cfft_instance_q31 arm_cfft_sR_q31_len16 = {
  16, twiddleCoef_16_q31, armBitRevIndexTable_fixed_16, ARMBITREVINDEXTABLE_FIXED_16_TABLE_LENGTH
 };
 #endif
 #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_Q31_32) && defined(ARM_TABLE_BITREVIDX_FXT_32))
 const arm_cfft_instance_q31 arm_cfft_sR_q31_len32 = {
  32, twiddleCoef_32_q31, armBitRevIndexTable_fixed_32, ARMBITREVINDEXTABLE_FIXED_32_TABLE_LENGTH
 };
 #endif
 #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_Q31_64) && defined(ARM_TABLE_BITREVIDX_FXT_64))
 const arm_cfft_instance_q31 arm_cfft_sR_q31_len64 = {
  64, twiddleCoef_64_q31, armBitRevIndexTable_fixed_64, ARMBITREVINDEXTABLE_FIXED_64_TABLE_LENGTH
 };
 #endif
 #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_Q31_128) && defined(ARM_TABLE_BITREVIDX_FXT_128))
 const arm_cfft_instance_q31 arm_cfft_sR_q31_len128 = {
  128, twiddleCoef_128_q31, armBitRevIndexTable_fixed_128, ARMBITREVINDEXTABLE_FIXED_128_TABLE_LENGTH
 };
 #endif
 #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_Q31_256) && defined(ARM_TABLE_BITREVIDX_FXT_256))
 const arm_cfft_instance_q31 arm_cfft_sR_q31_len256 = {
  256, twiddleCoef_256_q31, armBitRevIndexTable_fixed_256, ARMBITREVINDEXTABLE_FIXED_256_TABLE_LENGTH
 };
 #endif
 #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_Q31_512) && defined(ARM_TABLE_BITREVIDX_FXT_512))
 const arm_cfft_instance_q31 arm_cfft_sR_q31_len512 = {
  512, twiddleCoef_512_q31, armBitRevIndexTable_fixed_512, ARMBITREVINDEXTABLE_FIXED_512_TABLE_LENGTH
 };
 #endif
 #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_Q31_1024) && defined(ARM_TABLE_BITREVIDX_FXT_1024))
 const arm_cfft_instance_q31 arm_cfft_sR_q31_len1024 = {
  1024, twiddleCoef_1024_q31, armBitRevIndexTable_fixed_1024, ARMBITREVINDEXTABLE_FIXED_1024_TABLE_LENGTH
 };
 #endif
 #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_Q31_2048) && defined(ARM_TABLE_BITREVIDX_FXT_2048))
 const arm_cfft_instance_q31 arm_cfft_sR_q31_len2048 = {
  2048, twiddleCoef_2048_q31, armBitRevIndexTable_fixed_2048, ARMBITREVINDEXTABLE_FIXED_2048_TABLE_LENGTH
 };
 #endif
 #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_Q31_4096) && defined(ARM_TABLE_BITREVIDX_FXT_4096))
 const arm_cfft_instance_q31 arm_cfft_sR_q31_len4096 = {
  4096, twiddleCoef_4096_q31, armBitRevIndexTable_fixed_4096, ARMBITREVINDEXTABLE_FIXED_4096_TABLE_LENGTH
 };
 #endif
 /*
 Q15 complex FFT instances, one per length from 16 to 4096.
 Each initializer lists, in order: the FFT length, the Q15 twiddle-coefficient table,
 the fixed-point bit-reversal index table and that table's length macro.
 The bit-reversal tables are the same armBitRevIndexTable_fixed_<len> tables that the
 Q31 instances reference.
 A struct is compiled when ARM_DSP_CONFIG_TABLES is not defined, when
 ARM_ALL_FFT_TABLES is defined, or when both of its per-length table macros
 (ARM_TABLE_TWIDDLECOEF_Q15_<len> and ARM_TABLE_BITREVIDX_FXT_<len>) are defined.
 */
 #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_Q15_16) && defined(ARM_TABLE_BITREVIDX_FXT_16))
 const arm_cfft_instance_q15 arm_cfft_sR_q15_len16 = {
  16, twiddleCoef_16_q15, armBitRevIndexTable_fixed_16, ARMBITREVINDEXTABLE_FIXED_16_TABLE_LENGTH
 };
 #endif
 #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_Q15_32) && defined(ARM_TABLE_BITREVIDX_FXT_32))
 const arm_cfft_instance_q15 arm_cfft_sR_q15_len32 = {
  32, twiddleCoef_32_q15, armBitRevIndexTable_fixed_32, ARMBITREVINDEXTABLE_FIXED_32_TABLE_LENGTH
 };
 #endif
 #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_Q15_64) && defined(ARM_TABLE_BITREVIDX_FXT_64))
 const arm_cfft_instance_q15 arm_cfft_sR_q15_len64 = {
  64, twiddleCoef_64_q15, armBitRevIndexTable_fixed_64, ARMBITREVINDEXTABLE_FIXED_64_TABLE_LENGTH
 };
 #endif
 #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_Q15_128) && defined(ARM_TABLE_BITREVIDX_FXT_128))
 const arm_cfft_instance_q15 arm_cfft_sR_q15_len128 = {
  128, twiddleCoef_128_q15, armBitRevIndexTable_fixed_128, ARMBITREVINDEXTABLE_FIXED_128_TABLE_LENGTH
 };
 #endif
 #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_Q15_256) && defined(ARM_TABLE_BITREVIDX_FXT_256))
 const arm_cfft_instance_q15 arm_cfft_sR_q15_len256 = {
  256, twiddleCoef_256_q15, armBitRevIndexTable_fixed_256, ARMBITREVINDEXTABLE_FIXED_256_TABLE_LENGTH
 };
 #endif
 #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_Q15_512) && defined(ARM_TABLE_BITREVIDX_FXT_512))
 const arm_cfft_instance_q15 arm_cfft_sR_q15_len512 = {
  512, twiddleCoef_512_q15, armBitRevIndexTable_fixed_512, ARMBITREVINDEXTABLE_FIXED_512_TABLE_LENGTH
 };
 #endif
 #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_Q15_1024) && defined(ARM_TABLE_BITREVIDX_FXT_1024))
 const arm_cfft_instance_q15 arm_cfft_sR_q15_len1024 = {
  1024, twiddleCoef_1024_q15, armBitRevIndexTable_fixed_1024, ARMBITREVINDEXTABLE_FIXED_1024_TABLE_LENGTH
 };
 #endif
 #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_Q15_2048) && defined(ARM_TABLE_BITREVIDX_FXT_2048))
 const arm_cfft_instance_q15 arm_cfft_sR_q15_len2048 = {
  2048, twiddleCoef_2048_q15, armBitRevIndexTable_fixed_2048, ARMBITREVINDEXTABLE_FIXED_2048_TABLE_LENGTH
 };
 #endif
 #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_Q15_4096) && defined(ARM_TABLE_BITREVIDX_FXT_4096))
 const arm_cfft_instance_q15 arm_cfft_sR_q15_len4096 = {
  4096, twiddleCoef_4096_q15, armBitRevIndexTable_fixed_4096, ARMBITREVINDEXTABLE_FIXED_4096_TABLE_LENGTH
 };
 #endif
 #endif /* !defined(ARM_MATH_MVEI) */
 /* Structure for real-value inputs */
 /* Double precision structs */
 /*
 Double-precision (f64) fast real FFT instances, one per real length from 32 to 4096.
 Each initializer holds three items: a nested complex FFT initializer built from the
 tables of HALF the real length (e.g. the 32-point real instance uses the 16-point
 tables), the real FFT length, and the twiddleCoefF64_rfft_<len> table for the real length.
 A struct is compiled when ARM_DSP_CONFIG_TABLES is not defined, when
 ARM_ALL_FFT_TABLES is defined, or when its three per-length table macros are defined.
 NOTE(review): the guards test the complex-FFT table macros for the full real length
 (e.g. ..._F64_32) while the initializers reference the half-length tables
 (e.g. twiddleCoefF64_16) - confirm whether the half-length macros should be tested.
 */
 #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F64_32) && defined(ARM_TABLE_BITREVIDX_FLT64_32) && defined(ARM_TABLE_TWIDDLECOEF_RFFT_F64_32))
 const arm_rfft_fast_instance_f64 arm_rfft_fast_sR_f64_len32 = {
  { 16, (const float64_t *)twiddleCoefF64_16, armBitRevIndexTableF64_16, ARMBITREVINDEXTABLEF64_16_TABLE_LENGTH },
  32U,
  (float64_t *)twiddleCoefF64_rfft_32
 };
 #endif
 #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F64_64) && defined(ARM_TABLE_BITREVIDX_FLT64_64) && defined(ARM_TABLE_TWIDDLECOEF_RFFT_F64_64))
 const arm_rfft_fast_instance_f64 arm_rfft_fast_sR_f64_len64 = {
   { 32, (const float64_t *)twiddleCoefF64_32, armBitRevIndexTableF64_32, ARMBITREVINDEXTABLEF64_32_TABLE_LENGTH },
  64U,
  (float64_t *)twiddleCoefF64_rfft_64
 };
 #endif
 #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F64_128) && defined(ARM_TABLE_BITREVIDX_FLT64_128) && defined(ARM_TABLE_TWIDDLECOEF_RFFT_F64_128))
 const arm_rfft_fast_instance_f64 arm_rfft_fast_sR_f64_len128 = {
  { 64, (const float64_t *)twiddleCoefF64_64, armBitRevIndexTableF64_64, ARMBITREVINDEXTABLEF64_64_TABLE_LENGTH },
  128U,
  (float64_t *)twiddleCoefF64_rfft_128
 };
 #endif
 #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F64_256) && defined(ARM_TABLE_BITREVIDX_FLT64_256) && defined(ARM_TABLE_TWIDDLECOEF_RFFT_F64_256))
 const arm_rfft_fast_instance_f64 arm_rfft_fast_sR_f64_len256 = {
  { 128, (const float64_t *)twiddleCoefF64_128, armBitRevIndexTableF64_128, ARMBITREVINDEXTABLEF64_128_TABLE_LENGTH },
  256U,
  (float64_t *)twiddleCoefF64_rfft_256
 };
 #endif
 #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F64_512) && defined(ARM_TABLE_BITREVIDX_FLT64_512) && defined(ARM_TABLE_TWIDDLECOEF_RFFT_F64_512))
 const arm_rfft_fast_instance_f64 arm_rfft_fast_sR_f64_len512 = {
  { 256, (const float64_t *)twiddleCoefF64_256, armBitRevIndexTableF64_256, ARMBITREVINDEXTABLEF64_256_TABLE_LENGTH },
  512U,
  (float64_t *)twiddleCoefF64_rfft_512
 };
 #endif
 #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F64_1024) && defined(ARM_TABLE_BITREVIDX_FLT64_1024) && defined(ARM_TABLE_TWIDDLECOEF_RFFT_F64_1024))
 const arm_rfft_fast_instance_f64 arm_rfft_fast_sR_f64_len1024 = {
  { 512, (const float64_t *)twiddleCoefF64_512, armBitRevIndexTableF64_512, ARMBITREVINDEXTABLEF64_512_TABLE_LENGTH },
  1024U,
  (float64_t *)twiddleCoefF64_rfft_1024
 };
 #endif
 #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F64_2048) && defined(ARM_TABLE_BITREVIDX_FLT64_2048) && defined(ARM_TABLE_TWIDDLECOEF_RFFT_F64_2048))
 const arm_rfft_fast_instance_f64 arm_rfft_fast_sR_f64_len2048 = {
  { 1024, (const float64_t *)twiddleCoefF64_1024, armBitRevIndexTableF64_1024, ARMBITREVINDEXTABLEF64_1024_TABLE_LENGTH },
  2048U,
  (float64_t *)twiddleCoefF64_rfft_2048
 };
 #endif
 #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F64_4096) && defined(ARM_TABLE_BITREVIDX_FLT64_4096) && defined(ARM_TABLE_TWIDDLECOEF_RFFT_F64_4096))
 const arm_rfft_fast_instance_f64 arm_rfft_fast_sR_f64_len4096 = {
  { 2048, (const float64_t *)twiddleCoefF64_2048, armBitRevIndexTableF64_2048, ARMBITREVINDEXTABLEF64_2048_TABLE_LENGTH },
  4096U,
  (float64_t *)twiddleCoefF64_rfft_4096
 };
 #endif
 /* Floating-point structs */
 #if !defined(ARM_MATH_MVEF) || defined(ARM_MATH_AUTOVECTORIZE)
 /*
 Single-precision (f32) fast real FFT instances, one per real length from 32 to 4096.
 Each initializer holds three items: a nested complex FFT initializer built from the
 tables of HALF the real length (e.g. the 32-point real instance uses the 16-point
 tables), the real FFT length, and the twiddleCoef_rfft_<len> table for the real length.
 A struct is compiled when ARM_DSP_CONFIG_TABLES is not defined, when
 ARM_ALL_FFT_TABLES is defined, or when its three per-length table macros are defined.
 NOTE(review): the guards test the complex-FFT table macros for the full real length
 (e.g. ..._F32_32) while the initializers reference the half-length tables
 (e.g. twiddleCoef_16) - confirm whether the half-length macros should be tested.
 */
 #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F32_32) && defined(ARM_TABLE_BITREVIDX_FLT_32) && defined(ARM_TABLE_TWIDDLECOEF_RFFT_F32_32))
 const arm_rfft_fast_instance_f32 arm_rfft_fast_sR_f32_len32 = {
  { 16, twiddleCoef_16, armBitRevIndexTable16, ARMBITREVINDEXTABLE_16_TABLE_LENGTH },
  32U,
  (float32_t *)twiddleCoef_rfft_32
 };
 #endif
 #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F32_64) && defined(ARM_TABLE_BITREVIDX_FLT_64) && defined(ARM_TABLE_TWIDDLECOEF_RFFT_F32_64))
 const arm_rfft_fast_instance_f32 arm_rfft_fast_sR_f32_len64 = {
   { 32, twiddleCoef_32, armBitRevIndexTable32, ARMBITREVINDEXTABLE_32_TABLE_LENGTH },
  64U,
  (float32_t *)twiddleCoef_rfft_64
 };
 #endif
 #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F32_128) && defined(ARM_TABLE_BITREVIDX_FLT_128) && defined(ARM_TABLE_TWIDDLECOEF_RFFT_F32_128))
 const arm_rfft_fast_instance_f32 arm_rfft_fast_sR_f32_len128 = {
  { 64, twiddleCoef_64, armBitRevIndexTable64, ARMBITREVINDEXTABLE_64_TABLE_LENGTH },
  128U,
  (float32_t *)twiddleCoef_rfft_128
 };
 #endif
 #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F32_256) && defined(ARM_TABLE_BITREVIDX_FLT_256) && defined(ARM_TABLE_TWIDDLECOEF_RFFT_F32_256))
 const arm_rfft_fast_instance_f32 arm_rfft_fast_sR_f32_len256 = {
  { 128, twiddleCoef_128, armBitRevIndexTable128, ARMBITREVINDEXTABLE_128_TABLE_LENGTH },
  256U,
  (float32_t *)twiddleCoef_rfft_256
 };
 #endif
 #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F32_512) && defined(ARM_TABLE_BITREVIDX_FLT_512) && defined(ARM_TABLE_TWIDDLECOEF_RFFT_F32_512))
 const arm_rfft_fast_instance_f32 arm_rfft_fast_sR_f32_len512 = {
  { 256, twiddleCoef_256, armBitRevIndexTable256, ARMBITREVINDEXTABLE_256_TABLE_LENGTH },
  512U,
  (float32_t *)twiddleCoef_rfft_512
 };
 #endif
 #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F32_1024) && defined(ARM_TABLE_BITREVIDX_FLT_1024) && defined(ARM_TABLE_TWIDDLECOEF_RFFT_F32_1024))
 const arm_rfft_fast_instance_f32 arm_rfft_fast_sR_f32_len1024 = {
  { 512, twiddleCoef_512, armBitRevIndexTable512, ARMBITREVINDEXTABLE_512_TABLE_LENGTH },
  1024U,
  (float32_t *)twiddleCoef_rfft_1024
 };
 #endif
 #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F32_2048) && defined(ARM_TABLE_BITREVIDX_FLT_2048) && defined(ARM_TABLE_TWIDDLECOEF_RFFT_F32_2048))
 const arm_rfft_fast_instance_f32 arm_rfft_fast_sR_f32_len2048 = {
  { 1024, twiddleCoef_1024, armBitRevIndexTable1024, ARMBITREVINDEXTABLE_1024_TABLE_LENGTH },
  2048U,
  (float32_t *)twiddleCoef_rfft_2048
 };
 #endif
 #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F32_4096) && defined(ARM_TABLE_BITREVIDX_FLT_4096) && defined(ARM_TABLE_TWIDDLECOEF_RFFT_F32_4096))
 const arm_rfft_fast_instance_f32 arm_rfft_fast_sR_f32_len4096 = {
  { 2048, twiddleCoef_2048, armBitRevIndexTable2048, ARMBITREVINDEXTABLE_2048_TABLE_LENGTH },
  4096U,
  (float32_t *)twiddleCoef_rfft_4096
 };
 #endif
 #endif /* #if !defined(ARM_MATH_MVEF) || defined(ARM_MATH_AUTOVECTORIZE) */
 /* Fixed-point structs */
 /* q31_t */
 #if !defined(ARM_MATH_MVEI)
 /*
 Those structures cannot be used to initialize the MVE version of the real FFT Q31 and Q15 instances.
 So they are not compiled when MVE is defined.
 For the MVE version, the arm_rfft_init_q31 and arm_rfft_init_q15 functions must be used.
 */
 /*
 Q31 real FFT instances, one per real length from 32 to 8192.
 Each initializer lists, in order: the real FFT length, the literals 0 and 1, a value
 that halves each time the length doubles (256U for length 32 down to 1U for length 8192),
 the realCoefAQ31 and realCoefBQ31 tables, and the address of the complex FFT instance
 of half the real length.
 NOTE(review): the 0 and 1 literals are presumably the inverse-transform and
 bit-reversal flags, and the halving value presumably a stride into the realCoef
 tables - confirm against the arm_rfft_instance_q31 declaration.
 A struct is compiled when ARM_DSP_CONFIG_TABLES is not defined, when
 ARM_ALL_FFT_TABLES is defined, or when ARM_TABLE_REALCOEF_Q31 and both table macros
 of the half-length complex FFT are defined.
 */
 #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_REALCOEF_Q31) && defined(ARM_TABLE_TWIDDLECOEF_Q31_16) && defined(ARM_TABLE_BITREVIDX_FXT_16))
 const arm_rfft_instance_q31 arm_rfft_sR_q31_len32 = {
  32U,
  0,
  1,
  256U,
  (q31_t*)realCoefAQ31,
  (q31_t*)realCoefBQ31,
  &arm_cfft_sR_q31_len16
 };
 #endif
 #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_REALCOEF_Q31) && defined(ARM_TABLE_TWIDDLECOEF_Q31_32) && defined(ARM_TABLE_BITREVIDX_FXT_32))
 const arm_rfft_instance_q31 arm_rfft_sR_q31_len64 = {
  64U,
  0,
  1,
  128U,
  (q31_t*)realCoefAQ31,
  (q31_t*)realCoefBQ31,
  &arm_cfft_sR_q31_len32
 };
 #endif
 #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_REALCOEF_Q31) && defined(ARM_TABLE_TWIDDLECOEF_Q31_64) && defined(ARM_TABLE_BITREVIDX_FXT_64))
 const arm_rfft_instance_q31 arm_rfft_sR_q31_len128 = {
  128U,
  0,
  1,
  64U,
  (q31_t*)realCoefAQ31,
  (q31_t*)realCoefBQ31,
  &arm_cfft_sR_q31_len64
 };
 #endif
 #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_REALCOEF_Q31) && defined(ARM_TABLE_TWIDDLECOEF_Q31_128) && defined(ARM_TABLE_BITREVIDX_FXT_128))
 const arm_rfft_instance_q31 arm_rfft_sR_q31_len256 = {
  256U,
  0,
  1,
  32U,
  (q31_t*)realCoefAQ31,
  (q31_t*)realCoefBQ31,
  &arm_cfft_sR_q31_len128
 };
 #endif
 #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_REALCOEF_Q31) && defined(ARM_TABLE_TWIDDLECOEF_Q31_256) && defined(ARM_TABLE_BITREVIDX_FXT_256))
 const arm_rfft_instance_q31 arm_rfft_sR_q31_len512 = {
  512U,
  0,
  1,
  16U,
  (q31_t*)realCoefAQ31,
  (q31_t*)realCoefBQ31,
  &arm_cfft_sR_q31_len256
 };
 #endif
 #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_REALCOEF_Q31) && defined(ARM_TABLE_TWIDDLECOEF_Q31_512) && defined(ARM_TABLE_BITREVIDX_FXT_512))
 const arm_rfft_instance_q31 arm_rfft_sR_q31_len1024 = {
  1024U,
  0,
  1,
  8U,
  (q31_t*)realCoefAQ31,
  (q31_t*)realCoefBQ31,
  &arm_cfft_sR_q31_len512
 };
 #endif
 #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_REALCOEF_Q31) && defined(ARM_TABLE_TWIDDLECOEF_Q31_1024) && defined(ARM_TABLE_BITREVIDX_FXT_1024))
 const arm_rfft_instance_q31 arm_rfft_sR_q31_len2048 = {
  2048U,
  0,
  1,
  4U,
  (q31_t*)realCoefAQ31,
  (q31_t*)realCoefBQ31,
  &arm_cfft_sR_q31_len1024
 };
 #endif
 #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_REALCOEF_Q31) && defined(ARM_TABLE_TWIDDLECOEF_Q31_2048) && defined(ARM_TABLE_BITREVIDX_FXT_2048))
 const arm_rfft_instance_q31 arm_rfft_sR_q31_len4096 = {
  4096U,
  0,
  1,
  2U,
  (q31_t*)realCoefAQ31,
  (q31_t*)realCoefBQ31,
  &arm_cfft_sR_q31_len2048
 };
 #endif
 #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_REALCOEF_Q31) && defined(ARM_TABLE_TWIDDLECOEF_Q31_4096) && defined(ARM_TABLE_BITREVIDX_FXT_4096))
 const arm_rfft_instance_q31 arm_rfft_sR_q31_len8192 = {
  8192U,
  0,
  1,
  1U,
  (q31_t*)realCoefAQ31,
  (q31_t*)realCoefBQ31,
  &arm_cfft_sR_q31_len4096
 };
 #endif
 /*
 q15_t: Q15 real FFT instances, one per real length from 32 to 8192.
 Each initializer lists, in order: the real FFT length, the literals 0 and 1, a value
 that halves each time the length doubles (256U for length 32 down to 1U for length 8192),
 the realCoefAQ15 and realCoefBQ15 tables, and the address of the complex FFT instance
 of half the real length.
 NOTE(review): the 0 and 1 literals are presumably the inverse-transform and
 bit-reversal flags, and the halving value presumably a stride into the realCoef
 tables - confirm against the arm_rfft_instance_q15 declaration.
 A struct is compiled when ARM_DSP_CONFIG_TABLES is not defined, when
 ARM_ALL_FFT_TABLES is defined, or when ARM_TABLE_REALCOEF_Q15 and both table macros
 of the half-length complex FFT are defined.
 */
 #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_REALCOEF_Q15) && defined(ARM_TABLE_TWIDDLECOEF_Q15_16) && defined(ARM_TABLE_BITREVIDX_FXT_16))
 const arm_rfft_instance_q15 arm_rfft_sR_q15_len32 = {
  32U,
  0,
  1,
  256U,
  (q15_t*)realCoefAQ15,
  (q15_t*)realCoefBQ15,
  &arm_cfft_sR_q15_len16
 };
 #endif
 #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_REALCOEF_Q15) && defined(ARM_TABLE_TWIDDLECOEF_Q15_32) && defined(ARM_TABLE_BITREVIDX_FXT_32))
 const arm_rfft_instance_q15 arm_rfft_sR_q15_len64 = {
  64U,
  0,
  1,
  128U,
  (q15_t*)realCoefAQ15,
  (q15_t*)realCoefBQ15,
  &arm_cfft_sR_q15_len32
 };
 #endif
 #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_REALCOEF_Q15) && defined(ARM_TABLE_TWIDDLECOEF_Q15_64) && defined(ARM_TABLE_BITREVIDX_FXT_64))
 const arm_rfft_instance_q15 arm_rfft_sR_q15_len128 = {
  128U,
  0,
  1,
  64U,
  (q15_t*)realCoefAQ15,
  (q15_t*)realCoefBQ15,
  &arm_cfft_sR_q15_len64
 };
 #endif
 #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_REALCOEF_Q15) && defined(ARM_TABLE_TWIDDLECOEF_Q15_128) && defined(ARM_TABLE_BITREVIDX_FXT_128))
 const arm_rfft_instance_q15 arm_rfft_sR_q15_len256 = {
  256U,
  0,
  1,
  32U,
  (q15_t*)realCoefAQ15,
  (q15_t*)realCoefBQ15,
  &arm_cfft_sR_q15_len128
 };
 #endif
 #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_REALCOEF_Q15) && defined(ARM_TABLE_TWIDDLECOEF_Q15_256) && defined(ARM_TABLE_BITREVIDX_FXT_256))
 const arm_rfft_instance_q15 arm_rfft_sR_q15_len512 = {
  512U,
  0,
  1,
  16U,
  (q15_t*)realCoefAQ15,
  (q15_t*)realCoefBQ15,
  &arm_cfft_sR_q15_len256
 };
 #endif
 #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_REALCOEF_Q15) && defined(ARM_TABLE_TWIDDLECOEF_Q15_512) && defined(ARM_TABLE_BITREVIDX_FXT_512))
 const arm_rfft_instance_q15 arm_rfft_sR_q15_len1024 = {
  1024U,
  0,
  1,
  8U,
  (q15_t*)realCoefAQ15,
  (q15_t*)realCoefBQ15,
  &arm_cfft_sR_q15_len512
 };
 #endif
 #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_REALCOEF_Q15) && defined(ARM_TABLE_TWIDDLECOEF_Q15_1024) && defined(ARM_TABLE_BITREVIDX_FXT_1024))
 const arm_rfft_instance_q15 arm_rfft_sR_q15_len2048 = {
  2048U,
  0,
  1,
  4U,
  (q15_t*)realCoefAQ15,
  (q15_t*)realCoefBQ15,
  &arm_cfft_sR_q15_len1024
 };
 #endif
 #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_REALCOEF_Q15) && defined(ARM_TABLE_TWIDDLECOEF_Q15_2048) && defined(ARM_TABLE_BITREVIDX_FXT_2048))
 const arm_rfft_instance_q15 arm_rfft_sR_q15_len4096 = {
  4096U,
  0,
  1,
  2U,
  (q15_t*)realCoefAQ15,
  (q15_t*)realCoefBQ15,
  &arm_cfft_sR_q15_len2048
 };
 #endif
 #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_REALCOEF_Q15) && defined(ARM_TABLE_TWIDDLECOEF_Q15_4096) && defined(ARM_TABLE_BITREVIDX_FXT_4096))
 const arm_rfft_instance_q15 arm_rfft_sR_q15_len8192 = {
  8192U,
  0,
  1,
  1U,
  (q15_t*)realCoefAQ15,
  (q15_t*)realCoefBQ15,
  &arm_cfft_sR_q15_len4096
 };
 #endif
 #endif /* !defined(ARM_MATH_MVEI) */
 #endif
--- a/libraries/cmsis/dsp/Source/CommonTables/arm_mve_tables.c
+++ b/libraries/cmsis/dsp/Source/CommonTables/arm_mve_tables.c
--- a/libraries/cmsis/dsp/Source/ComplexMathFunctions/CMakeLists.txt
+++ b/libraries/cmsis/dsp/Source/ComplexMathFunctions/CMakeLists.txt
@@ -0,0 +1,53 @@
 # Build script for the CMSIS-DSP complex math static library (CMSISDSPComplexMath).
 cmake_minimum_required (VERSION 3.6)
 project(CMSISDSPComplexMath)
 include(configLib)
 include(configDsp)
 # NOTE(review): SRC is not referenced again in this file; the sources are
 # listed explicitly with target_sources() below.
 file(GLOB SRC "./*_*.c")
 add_library(CMSISDSPComplexMath STATIC)
 configLib(CMSISDSPComplexMath ${ROOT})
 configDsp(CMSISDSPComplexMath ${ROOT})
 include(interpol)
 # Configure this library's own target. The previous call named
 # CMSISDSPFastMath, a target that this script does not create.
 interpol(CMSISDSPComplexMath)
 if (CONFIGTABLE AND ALLFAST)
    target_compile_definitions(CMSISDSPComplexMath PUBLIC ARM_ALL_FAST_TABLES)
 endif()
 # MVE code is using a table for computing the fast sqrt arm_cmplx_mag_q31
 # There is the possibility of not compiling this function and not including
 # the table.
 if (NOT CONFIGTABLE OR ALLFAST OR ARM_CMPLX_MAG_Q31 OR (NOT HELIUM AND NOT MVEI))
    target_sources(CMSISDSPComplexMath PRIVATE arm_cmplx_mag_q31.c)
 endif()
 if (NOT CONFIGTABLE OR ALLFAST OR ARM_CMPLX_MAG_Q15 OR (NOT HELIUM AND NOT MVEI))
    target_sources(CMSISDSPComplexMath PRIVATE arm_cmplx_mag_q15.c)
 endif()
 # Sources that are always part of the library.
 target_sources(CMSISDSPComplexMath PRIVATE arm_cmplx_conj_f32.c)
 target_sources(CMSISDSPComplexMath PRIVATE arm_cmplx_conj_q15.c)
 target_sources(CMSISDSPComplexMath PRIVATE arm_cmplx_conj_q31.c)
 target_sources(CMSISDSPComplexMath PRIVATE arm_cmplx_dot_prod_f32.c)
 target_sources(CMSISDSPComplexMath PRIVATE arm_cmplx_dot_prod_q15.c)
 target_sources(CMSISDSPComplexMath PRIVATE arm_cmplx_dot_prod_q31.c)
 target_sources(CMSISDSPComplexMath PRIVATE arm_cmplx_mag_f32.c)
 target_sources(CMSISDSPComplexMath PRIVATE arm_cmplx_mag_squared_f32.c)
 target_sources(CMSISDSPComplexMath PRIVATE arm_cmplx_mag_squared_q15.c)
 target_sources(CMSISDSPComplexMath PRIVATE arm_cmplx_mag_squared_q31.c)
 target_sources(CMSISDSPComplexMath PRIVATE arm_cmplx_mult_cmplx_f32.c)
 target_sources(CMSISDSPComplexMath PRIVATE arm_cmplx_mult_cmplx_q15.c)
 target_sources(CMSISDSPComplexMath PRIVATE arm_cmplx_mult_cmplx_q31.c)
 target_sources(CMSISDSPComplexMath PRIVATE arm_cmplx_mult_real_f32.c)
 target_sources(CMSISDSPComplexMath PRIVATE arm_cmplx_mult_real_q15.c)
 target_sources(CMSISDSPComplexMath PRIVATE arm_cmplx_mult_real_q31.c)
 ### Includes
 target_include_directories(CMSISDSPComplexMath PUBLIC "${DSP}/Include")
--- a/libraries/cmsis/dsp/Source/ComplexMathFunctions/ComplexMathFunctions.c
+++ b/libraries/cmsis/dsp/Source/ComplexMathFunctions/ComplexMathFunctions.c
@@ -0,0 +1,46 @@
 /* ----------------------------------------------------------------------
 * Project:      CMSIS DSP Library
 * Title:        ComplexMathFunctions.c
 * Description:  Combination of all complex math function source files.
 *
 * $Date:        18. March 2019
 * $Revision:    V1.0.0
 *
 * Target Processor: Cortex-M cores
 * -------------------------------------------------------------------- */
 /*
 * Copyright (C) 2019 ARM Limited or its affiliates. All rights reserved.
 *
 * SPDX-License-Identifier: Apache-2.0
 *
 * Licensed under the Apache License, Version 2.0 (the License); you may
 * not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an AS IS BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 #include "arm_cmplx_conj_f32.c"
 #include "arm_cmplx_conj_q15.c"
 #include "arm_cmplx_conj_q31.c"
 #include "arm_cmplx_dot_prod_f32.c"
 #include "arm_cmplx_dot_prod_q15.c"
 #include "arm_cmplx_dot_prod_q31.c"
 #include "arm_cmplx_mag_f32.c"
 #include "arm_cmplx_mag_q15.c"
 #include "arm_cmplx_mag_q31.c"
 #include "arm_cmplx_mag_squared_f32.c"
 #include "arm_cmplx_mag_squared_q15.c"
 #include "arm_cmplx_mag_squared_q31.c"
 #include "arm_cmplx_mult_cmplx_f32.c"
 #include "arm_cmplx_mult_cmplx_q15.c"
 #include "arm_cmplx_mult_cmplx_q31.c"
 #include "arm_cmplx_mult_real_f32.c"
 #include "arm_cmplx_mult_real_q15.c"
 #include "arm_cmplx_mult_real_q31.c"
--- a/libraries/cmsis/dsp/Source/ComplexMathFunctions/arm_cmplx_conj_f32.c
+++ b/libraries/cmsis/dsp/Source/ComplexMathFunctions/arm_cmplx_conj_f32.c
@@ -0,0 +1,213 @@
 /* ----------------------------------------------------------------------
 * Project:      CMSIS DSP Library
 * Title:        arm_cmplx_conj_f32.c
 * Description:  Floating-point complex conjugate
 *
 * $Date:        18. March 2019
 * $Revision:    V1.6.0
 *
 * Target Processor: Cortex-M cores
 * -------------------------------------------------------------------- */
 /*
 * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
 *
 * SPDX-License-Identifier: Apache-2.0
 *
 * Licensed under the Apache License, Version 2.0 (the License); you may
 * not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an AS IS BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 #include "arm_math.h"
 /**
  @ingroup groupCmplxMath
 */
 /**
  @defgroup cmplx_conj Complex Conjugate
  Conjugates the elements of a complex data vector.
  The <code>pSrc</code> points to the source data and
  <code>pDst</code> points to the destination data where the result should be written.
  <code>numSamples</code> specifies the number of complex samples
  and the data in each array is stored in an interleaved fashion
  (real, imag, real, imag, ...).
  Each array has a total of <code>2*numSamples</code> values.
  The underlying algorithm is:
  <pre>
  for (n = 0; n < numSamples; n++) {
      pDst[(2*n)  ] =  pSrc[(2*n)  ];    // real part
      pDst[(2*n)+1] = -pSrc[(2*n)+1];    // imag part
  }
  </pre>
  There are separate functions for floating-point, Q15, and Q31 data types.
 */
 /**
  @addtogroup cmplx_conj
  @{
 */
 /**
  @brief         Floating-point complex conjugate.
  @param[in]     pSrc        points to the input vector
  @param[out]    pDst        points to the output vector
  @param[in]     numSamples  number of samples in each vector
  @return        none
 */
 #if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE)
 /* Helium (MVE) implementation: the vector loop handles 4 float values per pass. */
 void arm_cmplx_conj_f32(
    const float32_t * pSrc,
    float32_t * pDst,
    uint32_t numSamples)
 {
    /* Per-lane multipliers: +1 keeps the real lanes, -1 negates the imaginary lanes */
    static const float32_t cmplx_conj_sign[4] = { 1.0f, -1.0f, 1.0f, -1.0f };
    /* numSamples scaled by CMPLX_DIM (presumably 2: real + imaginary value per sample) */
    uint32_t blockSize = numSamples * CMPLX_DIM;
    uint32_t blkCnt;                               /* loop counter */
    f32x4_t vecSrc;
    f32x4_t vecSign;
    /*
     * Load the sign vector with the vector load intrinsic instead of
     * dereferencing the float array through a casted f32x4_t pointer: the
     * cast dropped the const qualifier and relied on vector alignment and
     * aliasing that a plain float32_t array does not guarantee.
     */
    vecSign = vld1q(cmplx_conj_sign);
    /* Vector loop: 4 float values (2 interleaved complex samples) per pass */
    blkCnt = blockSize >> 2U;
    while (blkCnt > 0U)
    {
        vecSrc = vld1q(pSrc);
        vst1q(pDst, vmulq(vecSrc, vecSign));
        /* Advance source and destination past the 4 values just processed */
        pSrc += 4;
        pDst += 4;
        blkCnt--;
    }
    /* Tail: values left over by the vector loop, taken one (real, imag) pair at a time */
    blkCnt = (blockSize & 0x3) >> 1;
    while (blkCnt > 0U)
    {
      /* C[0] + jC[1] = A[0] + j(-1)A[1] */
      *pDst++ =  *pSrc++;
      *pDst++ = -*pSrc++;
      blkCnt--;
    }
 }
 #else
 /* Scalar / Neon implementation of the floating-point complex conjugate. */
 void arm_cmplx_conj_f32(
  const float32_t * pSrc,
        float32_t * pDst,
        uint32_t numSamples)
 {
   uint32_t remaining;                 /* complex samples still to be processed */
 #if defined(ARM_MATH_NEON) && !defined(ARM_MATH_AUTOVECTORIZE)
   float32x4_t zeroVec;
   float32x4x2_t cplx;                 /* val[0]: real parts, val[1]: imaginary parts */
   zeroVec = vdupq_n_f32(0.0f);
   /* Neon loop: four complex samples per pass */
   for (remaining = numSamples >> 2U; remaining > 0U; remaining--)
   {
     /* De-interleave, replace the imaginary lanes by (0 - imag), re-interleave */
     cplx = vld2q_f32(pSrc);
     cplx.val[1] = vsubq_f32(zeroVec, cplx.val[1]);
     vst2q_f32(pDst, cplx);
     pSrc += 8;
     pDst += 8;
   }
   /* Samples left for the scalar loop below */
   remaining = numSamples & 0x3;
 #else
 #if defined (ARM_MATH_LOOPUNROLL) && !defined(ARM_MATH_AUTOVECTORIZE)
   /* Unrolled loop: four complex samples per pass */
   for (remaining = numSamples >> 2U; remaining > 0U; remaining--)
   {
     /* C[0] + jC[1] = A[0] + j(-1)A[1], four times over */
     pDst[0] =  pSrc[0];
     pDst[1] = -pSrc[1];
     pDst[2] =  pSrc[2];
     pDst[3] = -pSrc[3];
     pDst[4] =  pSrc[4];
     pDst[5] = -pSrc[5];
     pDst[6] =  pSrc[6];
     pDst[7] = -pSrc[7];
     pSrc += 8;
     pDst += 8;
   }
   /* Samples left for the scalar loop below */
   remaining = numSamples % 0x4U;
 #else
   /* No unrolling: the scalar loop handles every sample */
   remaining = numSamples;
 #endif /* #if defined (ARM_MATH_LOOPUNROLL) */
 #endif /* #if defined (ARM_MATH_NEON) */
   /* Scalar loop: one complex sample per pass */
   for (; remaining > 0U; remaining--)
   {
     /* Copy the real part, negate the imaginary part */
     pDst[0] =  pSrc[0];
     pDst[1] = -pSrc[1];
     pSrc += 2;
     pDst += 2;
   }
 }
 #endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
 /**
  @} end of cmplx_conj group
 */
--- a/libraries/cmsis/dsp/Source/ComplexMathFunctions/arm_cmplx_conj_q15.c
+++ b/libraries/cmsis/dsp/Source/ComplexMathFunctions/arm_cmplx_conj_q15.c
@@ -0,0 +1,207 @@
 /* ----------------------------------------------------------------------
 * Project:      CMSIS DSP Library
 * Title:        arm_cmplx_conj_q15.c
 * Description:  Q15 complex conjugate
 *
 * $Date:        18. March 2019
 * $Revision:    V1.6.0
 *
 * Target Processor: Cortex-M cores
 * -------------------------------------------------------------------- */
 /*
 * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
 *
 * SPDX-License-Identifier: Apache-2.0
 *
 * Licensed under the Apache License, Version 2.0 (the License); you may
 * not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an AS IS BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 #include "arm_math.h"
 /**
  @ingroup groupCmplxMath
 */
 /**
  @addtogroup cmplx_conj
  @{
 */
 /**
  @brief         Q15 complex conjugate.
  @param[in]     pSrc        points to the input vector
  @param[out]    pDst        points to the output vector
  @param[in]     numSamples  number of samples in each vector
  @return        none
  @par           Scaling and Overflow Behavior
                   The function uses saturating arithmetic.
                   The Q15 value -1 (0x8000) is saturated to the maximum allowable positive value 0x7FFF.
 */
 #if defined(ARM_MATH_MVEI)
 /* Helium (MVE) implementation of the Q15 complex conjugate. */
 void arm_cmplx_conj_q15(
  const q15_t * pSrc,
        q15_t * pDst,
        uint32_t numSamples)
 {
    uint32_t totalValues = numSamples * CMPLX_DIM;   /* numSamples scaled by CMPLX_DIM */
    uint32_t count;                                  /* loop counter */
    q31_t imag;                                      /* widened imaginary part (tail loop) */
    q15x8x2_t cplx;                                  /* de-interleaved vector pair */
    q15x8_t zeroVec;
    zeroVec = vdupq_n_s16(0);
    /* Vector loop: 16 Q15 values per pass (pointers advance by 16) */
    for (count = totalValues >> 4U; count > 0U; count--)
    {
        cplx = vld2q(pSrc);
        /* Saturating (0 - imag) on the second de-interleaved vector */
        cplx.val[1] = vqsubq(zeroVec, cplx.val[1]);
        vst2q(pDst, cplx);
        pSrc += 16;
        pDst += 16;
    }
    /* Tail: values left over by the vector loop, one (real, imag) pair per pass */
    for (count = (totalValues & 0xF) >> 1; count > 0U; count--)
    {
      /* C[0] + jC[1] = A[0] + j(-1)A[1] */
      *pDst++ = *pSrc++;
      imag = *pSrc++;
      /* Negate in 32 bits, then saturate back to 16 bits */
      *pDst++ = __SSAT(-imag, 16);
    }
 }
 #else
 /* Scalar / DSP-extension implementation of the Q15 complex conjugate. */
 void arm_cmplx_conj_q15(
  const q15_t * pSrc,
        q15_t * pDst,
        uint32_t numSamples)
 {
   uint32_t remaining;                 /* complex samples still to be processed */
   q31_t val1;                         /* widened imaginary part, or packed sample 1 */
 #if defined (ARM_MATH_LOOPUNROLL) && defined (ARM_MATH_DSP)
   q31_t val2, val3, val4;             /* packed samples 2..4 */
 #endif
 #if defined (ARM_MATH_LOOPUNROLL)
   /* Unrolled loop: four complex samples per pass */
   for (remaining = numSamples >> 2U; remaining > 0U; remaining--)
   {
     /* C[0] + jC[1] = A[0] + j(-1)A[1] */
 #if defined (ARM_MATH_DSP)
     /* Fetch four samples, each as one packed pair of Q15 values */
     val1 = read_q15x2_ia ((q15_t **) &pSrc);
     val2 = read_q15x2_ia ((q15_t **) &pSrc);
     val3 = read_q15x2_ia ((q15_t **) &pSrc);
     val4 = read_q15x2_ia ((q15_t **) &pSrc);
     /*
      * Per sample: apply the endian-specific exchange intrinsic with a zero
      * first operand, then swap the two halfwords of the result.
      */
 #ifndef ARM_MATH_BIG_ENDIAN
     val1 = __QASX(0, val1);
     val1 = ((uint32_t) val1 << 16) | ((uint32_t) val1 >> 16);
     val2 = __QASX(0, val2);
     val2 = ((uint32_t) val2 << 16) | ((uint32_t) val2 >> 16);
     val3 = __QASX(0, val3);
     val3 = ((uint32_t) val3 << 16) | ((uint32_t) val3 >> 16);
     val4 = __QASX(0, val4);
     val4 = ((uint32_t) val4 << 16) | ((uint32_t) val4 >> 16);
 #else
     val1 = __QSAX(0, val1);
     val1 = ((uint32_t) val1 << 16) | ((uint32_t) val1 >> 16);
     val2 = __QSAX(0, val2);
     val2 = ((uint32_t) val2 << 16) | ((uint32_t) val2 >> 16);
     val3 = __QSAX(0, val3);
     val3 = ((uint32_t) val3 << 16) | ((uint32_t) val3 >> 16);
     val4 = __QSAX(0, val4);
     val4 = ((uint32_t) val4 << 16) | ((uint32_t) val4 >> 16);
 #endif /* #ifndef ARM_MATH_BIG_ENDIAN */
     /* Store the four packed results */
     write_q15x2_ia (&pDst, val1);
     write_q15x2_ia (&pDst, val2);
     write_q15x2_ia (&pDst, val3);
     write_q15x2_ia (&pDst, val4);
 #else
     /* Plain C: copy the real part; negate the imaginary part, mapping 0x8000 to 0x7fff */
     pDst[0] = pSrc[0];
     val1 = pSrc[1];
     pDst[1] = (val1 != (q15_t) 0x8000) ? -val1 : (q15_t) 0x7fff;
     pDst[2] = pSrc[2];
     val1 = pSrc[3];
     pDst[3] = (val1 != (q15_t) 0x8000) ? -val1 : (q15_t) 0x7fff;
     pDst[4] = pSrc[4];
     val1 = pSrc[5];
     pDst[5] = (val1 != (q15_t) 0x8000) ? -val1 : (q15_t) 0x7fff;
     pDst[6] = pSrc[6];
     val1 = pSrc[7];
     pDst[7] = (val1 != (q15_t) 0x8000) ? -val1 : (q15_t) 0x7fff;
     pSrc += 8;
     pDst += 8;
 #endif /* #if defined (ARM_MATH_DSP) */
   }
   /* Samples left for the scalar loop below */
   remaining = numSamples % 0x4U;
 #else
   /* No unrolling: the scalar loop handles every sample */
   remaining = numSamples;
 #endif /* #if defined (ARM_MATH_LOOPUNROLL) */
   /* Scalar loop: one complex sample per pass */
   for (; remaining > 0U; remaining--)
   {
     /* C[0] + jC[1] = A[0] + j(-1)A[1] */
     *pDst++ = *pSrc++;
     val1 = *pSrc++;
 #if defined (ARM_MATH_DSP)
     /* Negate in 32 bits, then saturate back to 16 bits */
     *pDst++ = __SSAT(-val1, 16);
 #else
     *pDst++ = (val1 != (q15_t) 0x8000) ? -val1 : (q15_t) 0x7fff;
 #endif
   }
 }
 #endif /* defined(ARM_MATH_MVEI) */
 /**
  @} end of cmplx_conj group
 */
--- a/libraries/cmsis/dsp/Source/ComplexMathFunctions/arm_cmplx_conj_q31.c
+++ b/libraries/cmsis/dsp/Source/ComplexMathFunctions/arm_cmplx_conj_q31.c
@@ -0,0 +1,193 @@
 /* ----------------------------------------------------------------------
 * Project:      CMSIS DSP Library
 * Title:        arm_cmplx_conj_q31.c
 * Description:  Q31 complex conjugate
 *
 * $Date:        18. March 2019
 * $Revision:    V1.6.0
 *
 * Target Processor: Cortex-M cores
 * -------------------------------------------------------------------- */
 /*
 * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
 *
 * SPDX-License-Identifier: Apache-2.0
 *
 * Licensed under the Apache License, Version 2.0 (the License); you may
 * not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an AS IS BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 #include "arm_math.h"
 /**
  @ingroup groupCmplxMath
 */
 /**
  @addtogroup cmplx_conj
  @{
 */
 /**
  @brief         Q31 complex conjugate.
  @param[in]     pSrc        points to the input vector
  @param[out]    pDst        points to the output vector
  @param[in]     numSamples  number of samples in each vector
  @return        none
  @par           Scaling and Overflow Behavior
                   The function uses saturating arithmetic.
                   The Q31 value -1 (0x80000000) is saturated to the maximum allowable positive value 0x7FFFFFFF.
 */
 #if defined(ARM_MATH_MVEI)
 /* Helium (MVE) implementation of the Q31 complex conjugate. */
 void arm_cmplx_conj_q31(
  const q31_t * pSrc,
        q31_t * pDst,
        uint32_t numSamples)
 {
    uint32_t totalValues = numSamples * CMPLX_DIM;   /* numSamples scaled by CMPLX_DIM */
    uint32_t count;                                  /* loop counter */
    q31x4x2_t cplx;                                  /* de-interleaved vector pair */
    q31_t imag;                                      /* imaginary part (tail loop) */
    q31x4_t zeroVec;
    zeroVec = vdupq_n_s32(0);
    /* Vector loop: 8 Q31 values per pass (pointers advance by 8) */
    for (count = totalValues >> 3U; count > 0U; count--)
    {
        cplx = vld2q(pSrc);
        /* Saturating (0 - imag) on the second de-interleaved vector */
        cplx.val[1] = vqsubq(zeroVec, cplx.val[1]);
        vst2q(pDst, cplx);
        pSrc += 8;
        pDst += 8;
    }
    /* Tail: values left over by the vector loop, one (real, imag) pair per pass */
    for (count = (totalValues & 0x7) >> 1; count > 0U; count--)
    {
      /* C[0] + jC[1] = A[0] + j(-1)A[1] */
      *pDst++ = *pSrc++;
      imag = *pSrc++;
      *pDst++ = __QSUB(0, imag);
    }
 }
 #else
 /* Scalar / DSP-extension implementation of the Q31 complex conjugate. */
 void arm_cmplx_conj_q31(
  const q31_t * pSrc,
        q31_t * pDst,
        uint32_t numSamples)
 {
   uint32_t remaining;                 /* complex samples still to be processed */
   q31_t imag;                         /* imaginary part being negated */
 #if defined (ARM_MATH_LOOPUNROLL)
   /* Unrolled loop: four complex samples per pass */
   for (remaining = numSamples >> 2U; remaining > 0U; remaining--)
   {
     /* C[0] + jC[1] = A[0] + j(-1)A[1]; INT32_MIN maps to INT32_MAX in the plain C path */
     pDst[0] = pSrc[0];
     imag = pSrc[1];
 #if defined (ARM_MATH_DSP)
     pDst[1] = __QSUB(0, imag);
 #else
     pDst[1] = (imag != INT32_MIN) ? -imag : INT32_MAX;
 #endif
     pDst[2] = pSrc[2];
     imag = pSrc[3];
 #if defined (ARM_MATH_DSP)
     pDst[3] = __QSUB(0, imag);
 #else
     pDst[3] = (imag != INT32_MIN) ? -imag : INT32_MAX;
 #endif
     pDst[4] = pSrc[4];
     imag = pSrc[5];
 #if defined (ARM_MATH_DSP)
     pDst[5] = __QSUB(0, imag);
 #else
     pDst[5] = (imag != INT32_MIN) ? -imag : INT32_MAX;
 #endif
     pDst[6] = pSrc[6];
     imag = pSrc[7];
 #if defined (ARM_MATH_DSP)
     pDst[7] = __QSUB(0, imag);
 #else
     pDst[7] = (imag != INT32_MIN) ? -imag : INT32_MAX;
 #endif
     pSrc += 8;
     pDst += 8;
   }
   /* Samples left for the scalar loop below */
   remaining = numSamples % 0x4U;
 #else
   /* No unrolling: the scalar loop handles every sample */
   remaining = numSamples;
 #endif /* #if defined (ARM_MATH_LOOPUNROLL) */
   /* Scalar loop: one complex sample per pass */
   for (; remaining > 0U; remaining--)
   {
     pDst[0] = pSrc[0];
     imag = pSrc[1];
 #if defined (ARM_MATH_DSP)
     pDst[1] = __QSUB(0, imag);
 #else
     pDst[1] = (imag != INT32_MIN) ? -imag : INT32_MAX;
 #endif
     pSrc += 2;
     pDst += 2;
   }
 }
 #endif /* defined(ARM_MATH_MVEI) */
 /**
  @} end of cmplx_conj group
 */
--- a/libraries/cmsis/dsp/Source/ComplexMathFunctions/arm_cmplx_dot_prod_f32.c
+++ b/libraries/cmsis/dsp/Source/ComplexMathFunctions/arm_cmplx_dot_prod_f32.c
@@ -0,0 +1,302 @@
 /* ----------------------------------------------------------------------
 * Project:      CMSIS DSP Library
 * Title:        arm_cmplx_dot_prod_f32.c
 * Description:  Floating-point complex dot product
 *
 * $Date:        18. March 2019
 * $Revision:    V1.6.0
 *
 * Target Processor: Cortex-M cores
 * -------------------------------------------------------------------- */
 /*
 * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
 *
 * SPDX-License-Identifier: Apache-2.0
 *
 * Licensed under the Apache License, Version 2.0 (the License); you may
 * not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an AS IS BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 #include "arm_math.h"
 /**
  @ingroup groupCmplxMath
 */
 /**
  @defgroup cmplx_dot_prod Complex Dot Product
  Computes the dot product of two complex vectors.
  The vectors are multiplied element-by-element and then summed.
  The <code>pSrcA</code> points to the first complex input vector and
  <code>pSrcB</code> points to the second complex input vector.
  <code>numSamples</code> specifies the number of complex samples
  and the data in each array is stored in an interleaved fashion
  (real, imag, real, imag, ...).
  Each array has a total of <code>2*numSamples</code> values.
  The underlying algorithm is:
  <pre>
  realResult = 0;
  imagResult = 0;
  for (n = 0; n < numSamples; n++) {
      realResult += pSrcA[(2*n)+0] * pSrcB[(2*n)+0] - pSrcA[(2*n)+1] * pSrcB[(2*n)+1];
      imagResult += pSrcA[(2*n)+0] * pSrcB[(2*n)+1] + pSrcA[(2*n)+1] * pSrcB[(2*n)+0];
  }
  </pre>
  There are separate functions for floating-point, Q15, and Q31 data types.
 */
 /**
  @addtogroup cmplx_dot_prod
  @{
 */
 /**
  @brief         Floating-point complex dot product.
  @param[in]     pSrcA       points to the first input vector
  @param[in]     pSrcB       points to the second input vector
  @param[in]     numSamples  number of samples in each vector
  @param[out]    realResult  real part of the result returned here
  @param[out]    imagResult  imaginary part of the result returned here
  @return        none
 */
 #if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE)
 /* Helium (MVE) implementation of the floating-point complex dot product. */
 void arm_cmplx_dot_prod_f32(
    const float32_t * pSrcA,
    const float32_t * pSrcB,
    uint32_t numSamples,
    float32_t * realResult,
    float32_t * imagResult)
 {
    uint32_t totalValues = numSamples * CMPLX_DIM;  /* numSamples scaled by CMPLX_DIM */
    uint32_t count;                                 /* loop counter */
    float32_t sumRe, sumIm;                         /* scalar accumulators */
    f32x4_t vecA, vecB;
    f32x4_t acc = vdupq_n_f32(0.0f);                /* vector accumulator */
    float32_t reA, imA, reB, imB;
    /* Vector loop: 4 float values from each input per pass */
    for (count = totalValues >> 2U; count > 0U; count--)
    {
        vecA = vld1q(pSrcA);
        vecB = vld1q(pSrcB);
        /* Two complex multiply-accumulate steps (plain and rot90) into the same accumulator */
        acc = vcmlaq(acc, vecA, vecB);
        acc = vcmlaq_rot90(acc, vecA, vecB);
        pSrcA += 4;
        pSrcB += 4;
    }
    /* Fold the accumulator: even lanes into the real sum, odd lanes into the imaginary sum */
    sumRe = vgetq_lane(acc, 0) + vgetq_lane(acc, 2);
    sumIm = vgetq_lane(acc, 1) + vgetq_lane(acc, 3);
    /* Tail: values left over by the vector loop, one (real, imag) pair per pass */
    for (count = (totalValues & 3) >> 1; count > 0U; count--)
    {
      reA = *pSrcA++;
      imA = *pSrcA++;
      reB = *pSrcB++;
      imB = *pSrcB++;
      /* Re += reA*reB - imA*imB */
      sumRe += reA * reB;
      sumRe -= imA * imB;
      /* Im += reA*imB + imA*reB */
      sumIm += reA * imB;
      sumIm += imA * reB;
    }
    /* Hand both parts of the result back to the caller */
    *realResult = sumRe;
    *imagResult = sumIm;
 }
 #else
 /* Scalar / Neon implementation of the floating-point complex dot product. */
 void arm_cmplx_dot_prod_f32(
  const float32_t * pSrcA,
  const float32_t * pSrcB,
        uint32_t numSamples,
        float32_t * realResult,
        float32_t * imagResult)
 {
   uint32_t remaining;                       /* complex samples still to be processed */
   float32_t sumRe = 0.0f, sumIm = 0.0f;     /* running real / imaginary sums */
   float32_t reA, imA, reB, imB;
 #if defined(ARM_MATH_NEON) && !defined(ARM_MATH_AUTOVECTORIZE)
   float32x4x2_t vecA, vecB;                 /* val[0]: real parts, val[1]: imaginary parts */
   float32x4_t accRe, accIm;
   float32x2_t pairSum = vdup_n_f32(0);
   accRe = vdupq_n_f32(0.0f);
   accIm = vdupq_n_f32(0.0f);
   /* Neon loop: eight complex samples per pass, as two groups of four */
   for (remaining = numSamples >> 3U; remaining > 0U; remaining--)
   {
     /* First group of four samples */
     vecA = vld2q_f32(pSrcA);
     vecB = vld2q_f32(pSrcB);
     pSrcA += 8;
     pSrcB += 8;
     /* Re{C} = Re{A}*Re{B} - Im{A}*Im{B} */
     accRe = vmlaq_f32(accRe, vecA.val[0], vecB.val[0]);
     accRe = vmlsq_f32(accRe, vecA.val[1], vecB.val[1]);
     /* Im{C} = Re{A}*Im{B} + Im{A}*Re{B} */
     accIm = vmlaq_f32(accIm, vecA.val[1], vecB.val[0]);
     accIm = vmlaq_f32(accIm, vecA.val[0], vecB.val[1]);
     /* Second group of four samples */
     vecA = vld2q_f32(pSrcA);
     vecB = vld2q_f32(pSrcB);
     pSrcA += 8;
     pSrcB += 8;
     accRe = vmlaq_f32(accRe, vecA.val[0], vecB.val[0]);
     accRe = vmlsq_f32(accRe, vecA.val[1], vecB.val[1]);
     accIm = vmlaq_f32(accIm, vecA.val[1], vecB.val[0]);
     accIm = vmlaq_f32(accIm, vecA.val[0], vecB.val[1]);
   }
   /* Reduce each four-lane accumulator to a scalar and add it to the running sums */
   pairSum = vpadd_f32(vget_low_f32(accRe), vget_high_f32(accRe));
   sumRe += vget_lane_f32(pairSum, 0) + vget_lane_f32(pairSum, 1);
   pairSum = vpadd_f32(vget_low_f32(accIm), vget_high_f32(accIm));
   sumIm += vget_lane_f32(pairSum, 0) + vget_lane_f32(pairSum, 1);
   /* Samples left for the scalar loop below */
   remaining = numSamples & 0x7;
 #else
 #if defined (ARM_MATH_LOOPUNROLL) && !defined(ARM_MATH_AUTOVECTORIZE)
   /* Unrolled loop: four complex samples per pass */
   for (remaining = numSamples >> 2U; remaining > 0U; remaining--)
   {
     reA = pSrcA[0];
     imA = pSrcA[1];
     reB = pSrcB[0];
     imB = pSrcB[1];
     sumRe += reA * reB;
     sumRe -= imA * imB;
     sumIm += reA * imB;
     sumIm += imA * reB;
     reA = pSrcA[2];
     imA = pSrcA[3];
     reB = pSrcB[2];
     imB = pSrcB[3];
     sumRe += reA * reB;
     sumRe -= imA * imB;
     sumIm += reA * imB;
     sumIm += imA * reB;
     reA = pSrcA[4];
     imA = pSrcA[5];
     reB = pSrcB[4];
     imB = pSrcB[5];
     sumRe += reA * reB;
     sumRe -= imA * imB;
     sumIm += reA * imB;
     sumIm += imA * reB;
     reA = pSrcA[6];
     imA = pSrcA[7];
     reB = pSrcB[6];
     imB = pSrcB[7];
     sumRe += reA * reB;
     sumRe -= imA * imB;
     sumIm += reA * imB;
     sumIm += imA * reB;
     pSrcA += 8;
     pSrcB += 8;
   }
   /* Samples left for the scalar loop below */
   remaining = numSamples % 0x4U;
 #else
   /* No unrolling: the scalar loop handles every sample */
   remaining = numSamples;
 #endif /* #if defined (ARM_MATH_LOOPUNROLL) */
 #endif /* #if defined(ARM_MATH_NEON) */
   /* Scalar loop: one complex sample per pass */
   for (; remaining > 0U; remaining--)
   {
     reA = pSrcA[0];
     imA = pSrcA[1];
     reB = pSrcB[0];
     imB = pSrcB[1];
     /* Re += reA*reB - imA*imB */
     sumRe += reA * reB;
     sumRe -= imA * imB;
     /* Im += reA*imB + imA*reB */
     sumIm += reA * imB;
     sumIm += imA * reB;
     pSrcA += 2;
     pSrcB += 2;
   }
   /* Hand both parts of the result back to the caller */
   *realResult = sumRe;
   *imagResult = sumIm;
 }
 #endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
 /**
  @} end of cmplx_dot_prod group
 */
--- a/libraries/cmsis/dsp/Source/ComplexMathFunctions/arm_cmplx_dot_prod_q15.c
+++ b/libraries/cmsis/dsp/Source/ComplexMathFunctions/arm_cmplx_dot_prod_q15.c
@@ -0,0 +1,234 @@
 /* ----------------------------------------------------------------------
 * Project:      CMSIS DSP Library
 * Title:        arm_cmplx_dot_prod_q15.c
 * Description:  Processing function for the Q15 Complex Dot product
 *
 * $Date:        18. March 2019
 * $Revision:    V1.6.0
 *
 * Target Processor: Cortex-M cores
 * -------------------------------------------------------------------- */
 /*
 * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
 *
 * SPDX-License-Identifier: Apache-2.0
 *
 * Licensed under the Apache License, Version 2.0 (the License); you may
 * not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an AS IS BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 #include "arm_math.h"
 /**
  @ingroup groupCmplxMath
 */
 /**
  @addtogroup cmplx_dot_prod
  @{
 */
 /**
  @brief         Q15 complex dot product.
  @param[in]     pSrcA       points to the first input vector
  @param[in]     pSrcB       points to the second input vector
  @param[in]     numSamples  number of samples in each vector
  @param[out]    realResult  real part of the result returned here
  @param[out]    imagResult  imaginary part of the result returned here
  @return        none
  @par           Scaling and Overflow Behavior
                   The function is implemented using an internal 64-bit accumulator.
                   The intermediate 1.15 by 1.15 multiplications are performed with full precision and yield a 2.30 result.
                   These are accumulated in a 64-bit accumulator with 34.30 precision.
                   As a final step, the accumulators are converted to 8.24 format.
                   The return results <code>realResult</code> and <code>imagResult</code> are in 8.24 format.
 */
 #if defined(ARM_MATH_MVEI)
 /*
  * Helium (MVE) implementation of the Q15 complex dot product.
  * Products are accumulated in two 64-bit accumulators and shifted right by 6
  * before being stored, as in the scalar implementation.
  */
 void arm_cmplx_dot_prod_q15(
  const q15_t * pSrcA,
  const q15_t * pSrcB,
        uint32_t numSamples,
        q31_t * realResult,
        q31_t * imagResult)
 {
  uint32_t blockSize = numSamples * CMPLX_DIM;  /* numSamples scaled by CMPLX_DIM */
  uint32_t blkCnt;                              /* loop counter */
  q15_t a0,b0,c0,d0;
  q63_t accReal = 0LL;
  q63_t accImag = 0LL;
  q15x8_t vecSrcA, vecSrcB;
  /*
   * Vector loop: one 8-lane chunk from each input per pass.
   * blkCnt counts 8-value chunks, so each pass must load and consume exactly
   * one chunk per input. The former software-pipelined loop consumed two
   * chunks per pass against the same counter and preloaded a chunk before
   * checking the length, so it read and accumulated data beyond the end of
   * the inputs.
   */
  blkCnt = blockSize >> 3;
  while (blkCnt > 0U)
  {
      vecSrcA = vld1q(pSrcA);
      vecSrcB = vld1q(pSrcB);
      /* Real part: sum of re*re - im*im over the chunk */
      accReal = vmlsldavaq(accReal, vecSrcA, vecSrcB);
      /* Imaginary part: sum of the cross products re*im + im*re over the chunk */
      accImag = vmlaldavaxq(accImag, vecSrcA, vecSrcB);
      pSrcA += 8;
      pSrcB += 8;
      blkCnt--;
  }
  /* Tail: values left over by the vector loop, one (real, imag) pair per pass */
  blkCnt = (blockSize & 7) >> 1;
  while (blkCnt > 0U)
  {
    a0 = *pSrcA++;
    b0 = *pSrcA++;
    c0 = *pSrcB++;
    d0 = *pSrcB++;
    accReal += (q31_t)a0 * c0;
    accImag += (q31_t)a0 * d0;
    accReal -= (q31_t)b0 * d0;
    accImag += (q31_t)b0 * c0;
    /* Decrement loop counter */
    blkCnt--;
  }
  /* Store real and imaginary result in 8.24 format  */
  /* Convert real data in 34.30 to 8.24 by 6 right shifts */
  *realResult = (q31_t) (accReal >> 6);
  /* Convert imaginary data in 34.30 to 8.24 by 6 right shifts */
  *imagResult = (q31_t) (accImag >> 6);
 }
 #else
 /* Scalar implementation of the Q15 complex dot product. */
 void arm_cmplx_dot_prod_q15(
  const q15_t * pSrcA,
  const q15_t * pSrcB,
        uint32_t numSamples,
        q31_t * realResult,
        q31_t * imagResult)
 {
   uint32_t remaining;                 /* complex samples still to be processed */
   q63_t sumRe = 0, sumIm = 0;         /* 64-bit real / imaginary accumulators */
   q15_t reA, imA, reB, imB;
 #if defined (ARM_MATH_LOOPUNROLL)
   /* Unrolled loop: four complex samples per pass */
   for (remaining = numSamples >> 2U; remaining > 0U; remaining--)
   {
     reA = pSrcA[0];
     imA = pSrcA[1];
     reB = pSrcB[0];
     imB = pSrcB[1];
     sumRe += (q31_t)reA * reB;
     sumRe -= (q31_t)imA * imB;
     sumIm += (q31_t)reA * imB;
     sumIm += (q31_t)imA * reB;
     reA = pSrcA[2];
     imA = pSrcA[3];
     reB = pSrcB[2];
     imB = pSrcB[3];
     sumRe += (q31_t)reA * reB;
     sumRe -= (q31_t)imA * imB;
     sumIm += (q31_t)reA * imB;
     sumIm += (q31_t)imA * reB;
     reA = pSrcA[4];
     imA = pSrcA[5];
     reB = pSrcB[4];
     imB = pSrcB[5];
     sumRe += (q31_t)reA * reB;
     sumRe -= (q31_t)imA * imB;
     sumIm += (q31_t)reA * imB;
     sumIm += (q31_t)imA * reB;
     reA = pSrcA[6];
     imA = pSrcA[7];
     reB = pSrcB[6];
     imB = pSrcB[7];
     sumRe += (q31_t)reA * reB;
     sumRe -= (q31_t)imA * imB;
     sumIm += (q31_t)reA * imB;
     sumIm += (q31_t)imA * reB;
     pSrcA += 8;
     pSrcB += 8;
   }
   /* Samples left for the scalar loop below */
   remaining = numSamples % 0x4U;
 #else
   /* No unrolling: the scalar loop handles every sample */
   remaining = numSamples;
 #endif /* #if defined (ARM_MATH_LOOPUNROLL) */
   /* Scalar loop: one complex sample per pass */
   for (; remaining > 0U; remaining--)
   {
     reA = pSrcA[0];
     imA = pSrcA[1];
     reB = pSrcB[0];
     imB = pSrcB[1];
     /* Re += reA*reB - imA*imB */
     sumRe += (q31_t)reA * reB;
     sumRe -= (q31_t)imA * imB;
     /* Im += reA*imB + imA*reB */
     sumIm += (q31_t)reA * imB;
     sumIm += (q31_t)imA * reB;
     pSrcA += 2;
     pSrcB += 2;
   }
   /* Shift both accumulators right by 6 (34.30 to 8.24) and store the results */
   *realResult = (q31_t) (sumRe >> 6);
   *imagResult = (q31_t) (sumIm >> 6);
 }
 #endif /* defined(ARM_MATH_MVEI) */
 /**
  @} end of cmplx_dot_prod group
 */
--- a/libraries/cmsis/dsp/Source/ComplexMathFunctions/arm_cmplx_dot_prod_q31.c
+++ b/libraries/cmsis/dsp/Source/ComplexMathFunctions/arm_cmplx_dot_prod_q31.c
@@ -0,0 +1,220 @@
 /* ----------------------------------------------------------------------
 * Project:      CMSIS DSP Library
 * Title:        arm_cmplx_dot_prod_q31.c
 * Description:  Q31 complex dot product
 *
 * $Date:        18. March 2019
 * $Revision:    V1.6.0
 *
 * Target Processor: Cortex-M cores
 * -------------------------------------------------------------------- */
 /*
 * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
 *
 * SPDX-License-Identifier: Apache-2.0
 *
 * Licensed under the Apache License, Version 2.0 (the License); you may
 * not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an AS IS BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 #include "arm_math.h"
 /**
  @ingroup groupCmplxMath
 */
 /**
  @addtogroup cmplx_dot_prod
  @{
 */
 /**
  @brief         Q31 complex dot product.
  @param[in]     pSrcA       points to the first input vector
  @param[in]     pSrcB       points to the second input vector
  @param[in]     numSamples  number of samples in each vector
  @param[out]    realResult  real part of the result returned here
  @param[out]    imagResult  imaginary part of the result returned here
  @return        none
  @par           Scaling and Overflow Behavior
                   The function is implemented using an internal 64-bit accumulator.
                   The intermediate 1.31 by 1.31 multiplications are performed with 64-bit precision and then shifted to 16.48 format.
                   The internal real and imaginary accumulators are in 16.48 format and provide 15 guard bits.
                   Additions are nonsaturating and no overflow will occur as long as <code>numSamples</code> is less than 32768.
                   The return results <code>realResult</code> and <code>imagResult</code> are in 16.48 format.
                   Input down scaling is not required.
 */
 #if defined(ARM_MATH_MVEI)
 /*
  * Vector build of the Q31 complex dot product (this variant is selected
  * when ARM_MATH_MVEI is defined).
  *
  * pSrcA, pSrcB : input vectors; the scalar tail below reads them as
  *                interleaved (real, imaginary) q31_t pairs.
  * numSamples   : number of complex samples in each vector.
  * realResult   : receives the accumulated real part.
  * imagResult   : receives the accumulated imaginary part.
  *
  * The main loop consumes four q31_t values from each input per pass;
  * anything left over is handled by the scalar tail loop.
  */
 void arm_cmplx_dot_prod_q31(
  const q31_t * pSrcA,
  const q31_t * pSrcB,
        uint32_t numSamples,
        q63_t * realResult,
        q63_t * imagResult)
 {
    /* Total number of q31_t scalars to process.
     * NOTE(review): CMPLX_DIM is defined outside this file; presumably 2
     * (real + imaginary) - confirm. */
    uint32_t blockSize = numSamples * CMPLX_DIM;  /* loop counters */
    uint32_t blkCnt;
    q31x4_t vecSrcA, vecSrcB;
    q63_t accReal = 0LL;
    q63_t accImag = 0LL;
    q31_t a0,b0,c0,d0;
     /* Compute 2 complex samples at a time */
    blkCnt = blockSize >> 2U;
    while (blkCnt > 0U)
    {
        vecSrcA = vld1q(pSrcA);
        vecSrcB = vld1q(pSrcB);
        /* NOTE(review): presumably the subtracting form accumulates
         * re*re - im*im (real part) and the exchanging form accumulates
         * re*im + im*re (imaginary part) into the 64-bit sums - confirm
         * against the MVE intrinsics reference. */
        accReal = vrmlsldavhaq(accReal, vecSrcA, vecSrcB);
        accImag = vrmlaldavhaxq(accImag, vecSrcA, vecSrcB);
        /*
         * Decrement the blkCnt loop counter
         * Advance vector source and destination pointers
         */
        pSrcA += 4;
        pSrcB += 4;
        blkCnt --;
    }
    /* Shift both accumulators right by 14 - 8 = 6 bits.
     * NOTE(review): presumably the vector accumulations above already scale
     * their result down by 8 bits, so this brings the total to the 14-bit
     * shift applied per product in the scalar tail - confirm. */
    accReal = asrl(accReal, (14 - 8));
    accImag = asrl(accImag, (14 - 8));
    /* Tail */
    /* Scalars not covered by the vector loop, halved to give the number of
     * whole complex samples still to process. */
    blkCnt = (blockSize & 3) >> 1;
    while (blkCnt > 0U)
    {
      /* a0 + j*b0 is the sample from pSrcA, c0 + j*d0 the one from pSrcB */
      a0 = *pSrcA++;
      b0 = *pSrcA++;
      c0 = *pSrcB++;
      d0 = *pSrcB++;
      /* Each 64-bit product is shifted right by 14 before accumulation */
      accReal += ((q63_t)a0 * c0) >> 14;
      accImag += ((q63_t)a0 * d0) >> 14;
      accReal -= ((q63_t)b0 * d0) >> 14;
      accImag += ((q63_t)b0 * c0) >> 14;
      /* Decrement loop counter */
      blkCnt--;
    }
    /* Store real and imaginary result in destination buffer. */
    *realResult = accReal;
    *imagResult = accImag;
 }
 #else
 /*
  * Scalar build of the Q31 complex dot product.
  *
  * Both inputs hold interleaved (real, imaginary) q31_t pairs. For every
  * pair of complex samples the four 64-bit partial products are each
  * shifted right by 14 bits and folded into the running real and imaginary
  * sums, which are written to realResult / imagResult on exit.
  */
 void arm_cmplx_dot_prod_q31(
  const q31_t * pSrcA,
  const q31_t * pSrcB,
        uint32_t numSamples,
        q63_t * realResult,
        q63_t * imagResult)
 {
        q63_t accRe = 0;                               /* Running real sum */
        q63_t accIm = 0;                               /* Running imaginary sum */
        q31_t reA, imA, reB, imB;                      /* Current operands */
        uint32_t left;                                 /* Iterations still to run */
 #if defined (ARM_MATH_LOOPUNROLL)
  /* Unrolled section: four complex samples per pass, read by index */
  for (left = numSamples >> 2U; left > 0U; left--)
  {
    /* 1st complex sample of the group */
    reA = pSrcA[0];
    imA = pSrcA[1];
    reB = pSrcB[0];
    imB = pSrcB[1];
    accRe += ((q63_t)reA * reB) >> 14;
    accIm += ((q63_t)reA * imB) >> 14;
    accRe -= ((q63_t)imA * imB) >> 14;
    accIm += ((q63_t)imA * reB) >> 14;
    /* 2nd complex sample of the group */
    reA = pSrcA[2];
    imA = pSrcA[3];
    reB = pSrcB[2];
    imB = pSrcB[3];
    accRe += ((q63_t)reA * reB) >> 14;
    accIm += ((q63_t)reA * imB) >> 14;
    accRe -= ((q63_t)imA * imB) >> 14;
    accIm += ((q63_t)imA * reB) >> 14;
    /* 3rd complex sample of the group */
    reA = pSrcA[4];
    imA = pSrcA[5];
    reB = pSrcB[4];
    imB = pSrcB[5];
    accRe += ((q63_t)reA * reB) >> 14;
    accIm += ((q63_t)reA * imB) >> 14;
    accRe -= ((q63_t)imA * imB) >> 14;
    accIm += ((q63_t)imA * reB) >> 14;
    /* 4th complex sample of the group */
    reA = pSrcA[6];
    imA = pSrcA[7];
    reB = pSrcB[6];
    imB = pSrcB[7];
    accRe += ((q63_t)reA * reB) >> 14;
    accIm += ((q63_t)reA * imB) >> 14;
    accRe -= ((q63_t)imA * imB) >> 14;
    accIm += ((q63_t)imA * reB) >> 14;
    /* Step both inputs past the eight scalars just consumed */
    pSrcA += 8;
    pSrcB += 8;
  }
  /* Samples that did not fill a complete group of four */
  left = numSamples & 0x3U;
 #else
  /* No unrolling: every sample goes through the loop below */
  left = numSamples;
 #endif /* #if defined (ARM_MATH_LOOPUNROLL) */
  for (; left > 0U; left--)
  {
    reA = *pSrcA++;
    imA = *pSrcA++;
    reB = *pSrcB++;
    imB = *pSrcB++;
    accRe += ((q63_t)reA * reB) >> 14;
    accIm += ((q63_t)reA * imB) >> 14;
    accRe -= ((q63_t)imA * imB) >> 14;
    accIm += ((q63_t)imA * reB) >> 14;
  }
  /* Hand back the real and imaginary sums (16.48 format) */
  *realResult = accRe;
  *imagResult = accIm;
 }
 #endif /* defined(ARM_MATH_MVEI) */
 /**
  @} end of cmplx_dot_prod group
 */
--- a/Show More
+++ b/Show More