update version to v2.1.4

update version to v2.1.3
update version to v2.1.2
2026-05-21 09:22:19 +00:00 · 2023-02-28 10:34:18 +08:00 · 2022-11-22 18:14:35 +08:00 · 2022-08-26 14:37:12 +08:00 · 2022-07-25 16:31:10 +08:00 · 2022-06-30 17:10:00 +08:00
3187 changed files with 402480 additions and 89873 deletions
--- a/AT32F403A_407_periph_lib_V2.1.4.chm
+++ b/AT32F403A_407_periph_lib_V2.1.4.chm
--- a/29
+++ b/29
@@ -0,0 +1,29 @@
+BSD 3-Clause License
+
+Copyright (c) 2021, ArteryTek
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+* Redistributions of source code must retain the above copyright notice, this
+  list of conditions and the following disclaimer.
+
+* Redistributions in binary form must reproduce the above copyright notice,
+  this list of conditions and the following disclaimer in the documentation
+  and/or other materials provided with the distribution.
+
+* Neither the name of the copyright holder nor the names of its
+  contributors may be used to endorse or promote products derived from
+  this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
--- a/document/AT32F403A_407固件库BSP&Pack应用指南.pdf
+++ b/document/AT32F403A_407固件库BSP&Pack应用指南.pdf
--- a/document/ReleaseNotes_AT32F403A_407_Firmware_Library.pdf
+++ b/document/ReleaseNotes_AT32F403A_407_Firmware_Library.pdf
--- a/libraries/cmsis/cm4/core_support/arm_common_tables.h
+++ b/libraries/cmsis/cm4/core_support/arm_common_tables.h
--- a/libraries/cmsis/cm4/core_support/arm_const_structs.h
+++ b/libraries/cmsis/cm4/core_support/arm_const_structs.h
@@ -1,76 +1,76 @@
-/* ----------------------------------------------------------------------
- * Project:      CMSIS DSP Library
- * Title:        arm_const_structs.h
- * Description:  Constant structs that are initialized for user convenience.
- *               For example, some can be given as arguments to the arm_cfft_f32() function.
- *
- * $Date:        27. January 2017
- * $Revision:    V.1.5.1
- *
- * Target Processor: Cortex-M cores
- * -------------------------------------------------------------------- */
-/*
- * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
- *
- * SPDX-License-Identifier: Apache-2.0
- *
- * Licensed under the Apache License, Version 2.0 (the License); you may
- * not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an AS IS BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef _ARM_CONST_STRUCTS_H
-#define _ARM_CONST_STRUCTS_H
-
-#include "arm_math.h"
-#include "arm_common_tables.h"
-
-   extern const arm_cfft_instance_f64 arm_cfft_sR_f64_len16;
-   extern const arm_cfft_instance_f64 arm_cfft_sR_f64_len32;
-   extern const arm_cfft_instance_f64 arm_cfft_sR_f64_len64;
-   extern const arm_cfft_instance_f64 arm_cfft_sR_f64_len128;
-   extern const arm_cfft_instance_f64 arm_cfft_sR_f64_len256;
-   extern const arm_cfft_instance_f64 arm_cfft_sR_f64_len512;
-   extern const arm_cfft_instance_f64 arm_cfft_sR_f64_len1024;
-   extern const arm_cfft_instance_f64 arm_cfft_sR_f64_len2048;
-   extern const arm_cfft_instance_f64 arm_cfft_sR_f64_len4096;
-
-   extern const arm_cfft_instance_f32 arm_cfft_sR_f32_len16;
-   extern const arm_cfft_instance_f32 arm_cfft_sR_f32_len32;
-   extern const arm_cfft_instance_f32 arm_cfft_sR_f32_len64;
-   extern const arm_cfft_instance_f32 arm_cfft_sR_f32_len128;
-   extern const arm_cfft_instance_f32 arm_cfft_sR_f32_len256;
-   extern const arm_cfft_instance_f32 arm_cfft_sR_f32_len512;
-   extern const arm_cfft_instance_f32 arm_cfft_sR_f32_len1024;
-   extern const arm_cfft_instance_f32 arm_cfft_sR_f32_len2048;
-   extern const arm_cfft_instance_f32 arm_cfft_sR_f32_len4096;
-
-   extern const arm_cfft_instance_q31 arm_cfft_sR_q31_len16;
-   extern const arm_cfft_instance_q31 arm_cfft_sR_q31_len32;
-   extern const arm_cfft_instance_q31 arm_cfft_sR_q31_len64;
-   extern const arm_cfft_instance_q31 arm_cfft_sR_q31_len128;
-   extern const arm_cfft_instance_q31 arm_cfft_sR_q31_len256;
-   extern const arm_cfft_instance_q31 arm_cfft_sR_q31_len512;
-   extern const arm_cfft_instance_q31 arm_cfft_sR_q31_len1024;
-   extern const arm_cfft_instance_q31 arm_cfft_sR_q31_len2048;
-   extern const arm_cfft_instance_q31 arm_cfft_sR_q31_len4096;
-
-   extern const arm_cfft_instance_q15 arm_cfft_sR_q15_len16;
-   extern const arm_cfft_instance_q15 arm_cfft_sR_q15_len32;
-   extern const arm_cfft_instance_q15 arm_cfft_sR_q15_len64;
-   extern const arm_cfft_instance_q15 arm_cfft_sR_q15_len128;
-   extern const arm_cfft_instance_q15 arm_cfft_sR_q15_len256;
-   extern const arm_cfft_instance_q15 arm_cfft_sR_q15_len512;
-   extern const arm_cfft_instance_q15 arm_cfft_sR_q15_len1024;
-   extern const arm_cfft_instance_q15 arm_cfft_sR_q15_len2048;
-   extern const arm_cfft_instance_q15 arm_cfft_sR_q15_len4096;
-
-#endif
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS DSP Library
+ * Title:        arm_const_structs.h
+ * Description:  Constant structs that are initialized for user convenience.
+ *               For example, some can be given as arguments to the arm_cfft_f32() function.
+ *
+ * $Date:        27. January 2017
+ * $Revision:    V.1.5.1
+ *
+ * Target Processor: Cortex-M cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef _ARM_CONST_STRUCTS_H
+#define _ARM_CONST_STRUCTS_H
+
+#include "arm_math.h"
+#include "arm_common_tables.h"
+
+   extern const arm_cfft_instance_f64 arm_cfft_sR_f64_len16;
+   extern const arm_cfft_instance_f64 arm_cfft_sR_f64_len32;
+   extern const arm_cfft_instance_f64 arm_cfft_sR_f64_len64;
+   extern const arm_cfft_instance_f64 arm_cfft_sR_f64_len128;
+   extern const arm_cfft_instance_f64 arm_cfft_sR_f64_len256;
+   extern const arm_cfft_instance_f64 arm_cfft_sR_f64_len512;
+   extern const arm_cfft_instance_f64 arm_cfft_sR_f64_len1024;
+   extern const arm_cfft_instance_f64 arm_cfft_sR_f64_len2048;
+   extern const arm_cfft_instance_f64 arm_cfft_sR_f64_len4096;
+
+   extern const arm_cfft_instance_f32 arm_cfft_sR_f32_len16;
+   extern const arm_cfft_instance_f32 arm_cfft_sR_f32_len32;
+   extern const arm_cfft_instance_f32 arm_cfft_sR_f32_len64;
+   extern const arm_cfft_instance_f32 arm_cfft_sR_f32_len128;
+   extern const arm_cfft_instance_f32 arm_cfft_sR_f32_len256;
+   extern const arm_cfft_instance_f32 arm_cfft_sR_f32_len512;
+   extern const arm_cfft_instance_f32 arm_cfft_sR_f32_len1024;
+   extern const arm_cfft_instance_f32 arm_cfft_sR_f32_len2048;
+   extern const arm_cfft_instance_f32 arm_cfft_sR_f32_len4096;
+
+   extern const arm_cfft_instance_q31 arm_cfft_sR_q31_len16;
+   extern const arm_cfft_instance_q31 arm_cfft_sR_q31_len32;
+   extern const arm_cfft_instance_q31 arm_cfft_sR_q31_len64;
+   extern const arm_cfft_instance_q31 arm_cfft_sR_q31_len128;
+   extern const arm_cfft_instance_q31 arm_cfft_sR_q31_len256;
+   extern const arm_cfft_instance_q31 arm_cfft_sR_q31_len512;
+   extern const arm_cfft_instance_q31 arm_cfft_sR_q31_len1024;
+   extern const arm_cfft_instance_q31 arm_cfft_sR_q31_len2048;
+   extern const arm_cfft_instance_q31 arm_cfft_sR_q31_len4096;
+
+   extern const arm_cfft_instance_q15 arm_cfft_sR_q15_len16;
+   extern const arm_cfft_instance_q15 arm_cfft_sR_q15_len32;
+   extern const arm_cfft_instance_q15 arm_cfft_sR_q15_len64;
+   extern const arm_cfft_instance_q15 arm_cfft_sR_q15_len128;
+   extern const arm_cfft_instance_q15 arm_cfft_sR_q15_len256;
+   extern const arm_cfft_instance_q15 arm_cfft_sR_q15_len512;
+   extern const arm_cfft_instance_q15 arm_cfft_sR_q15_len1024;
+   extern const arm_cfft_instance_q15 arm_cfft_sR_q15_len2048;
+   extern const arm_cfft_instance_q15 arm_cfft_sR_q15_len4096;
+
+#endif
--- a/libraries/cmsis/cm4/core_support/arm_helium_utils.h
+++ b/libraries/cmsis/cm4/core_support/arm_helium_utils.h
@@ -1,348 +1,348 @@
-/* ----------------------------------------------------------------------
- * Project:      CMSIS DSP Library
- * Title:        arm_helium_utils.h
- * Description:  Utility functions for Helium development
- *
- * $Date:        09. September 2019
- * $Revision:    V.1.5.1
- *
- * Target Processor: Cortex-M cores
- * -------------------------------------------------------------------- */
-/*
- * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
- *
- * SPDX-License-Identifier: Apache-2.0
- *
- * Licensed under the Apache License, Version 2.0 (the License); you may
- * not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an AS IS BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef _ARM_UTILS_HELIUM_H_
-#define _ARM_UTILS_HELIUM_H_
-
-/***************************************
-
-Definitions available for MVEF and MVEI
-
-***************************************/
-#if defined (ARM_MATH_HELIUM) || defined(ARM_MATH_MVEF) || defined(ARM_MATH_MVEI)
-
-#define INACTIVELANE            0 /* inactive lane content */
-
-
-#endif /* defined (ARM_MATH_HELIUM) || defined(ARM_MATH_MVEF) || defined(ARM_MATH_MVEI) */
-
-/***************************************
-
-Definitions available for MVEF only
-
-***************************************/
-#if defined (ARM_MATH_HELIUM) || defined(ARM_MATH_MVEF)
-
-__STATIC_FORCEINLINE float32_t vecAddAcrossF32Mve(float32x4_t in)
-{
-    float32_t acc;
-
-    acc = vgetq_lane(in, 0) + vgetq_lane(in, 1) +
-          vgetq_lane(in, 2) + vgetq_lane(in, 3);
-
-    return acc;
-}
-
-/* newton initial guess */
-#define INVSQRT_MAGIC_F32           0x5f3759df
-
-#define INVSQRT_NEWTON_MVE_F32(invSqrt, xHalf, xStart)\
-{                                                     \
-    float32x4_t tmp;                                  \
-                                                      \
-    /* tmp = xhalf * x * x */                         \
-    tmp = vmulq(xStart, xStart);                      \
-    tmp = vmulq(tmp, xHalf);                          \
-    /* (1.5f - xhalf * x * x) */                      \
-    tmp = vsubq(vdupq_n_f32(1.5f), tmp);              \
-    /* x = x*(1.5f-xhalf*x*x); */                     \
-    invSqrt = vmulq(tmp, xStart);                     \
-}
-#endif /* defined (ARM_MATH_HELIUM) || defined(ARM_MATH_MVEF) */
-
-/***************************************
-
-Definitions available for MVEI only
-
-***************************************/
-#if defined (ARM_MATH_HELIUM) || defined(ARM_MATH_MVEI)
-
-
-#include "arm_common_tables.h"
-
-/* Following functions are used to transpose matrix in f32 and q31 cases */
-__STATIC_INLINE arm_status arm_mat_trans_32bit_2x2_mve(
-    uint32_t * pDataSrc,
-    uint32_t * pDataDest)
-{
-    static const uint32x4_t vecOffs = { 0, 2, 1, 3 };
-    /*
-     *
-     * | 0   1 |   =>  |  0   2 |
-     * | 2   3 |       |  1   3 |
-     *
-     */
-    uint32x4_t vecIn = vldrwq_u32((uint32_t const *)pDataSrc);
-    vstrwq_scatter_shifted_offset_u32(pDataDest, vecOffs, vecIn);
-
-    return (ARM_MATH_SUCCESS);
-}
-
-__STATIC_INLINE arm_status arm_mat_trans_32bit_3x3_mve(
-    uint32_t * pDataSrc,
-    uint32_t * pDataDest)
-{
-    const uint32x4_t vecOffs1 = { 0, 3, 6, 1};
-    const uint32x4_t vecOffs2 = { 4, 7, 2, 5};
-    /*
-     *
-     *  | 0   1   2 |       | 0   3   6 |  4 x 32 flattened version | 0   3   6   1 |
-     *  | 3   4   5 |   =>  | 1   4   7 |            =>             | 4   7   2   5 |
-     *  | 6   7   8 |       | 2   5   8 |       (row major)         | 8   .   .   . |
-     *
-     */
-    uint32x4_t vecIn1 = vldrwq_u32((uint32_t const *) pDataSrc);
-    uint32x4_t vecIn2 = vldrwq_u32((uint32_t const *) &pDataSrc[4]);
-
-    vstrwq_scatter_shifted_offset_u32(pDataDest, vecOffs1, vecIn1);
-    vstrwq_scatter_shifted_offset_u32(pDataDest, vecOffs2, vecIn2);
-
-    pDataDest[8] = pDataSrc[8];
-
-    return (ARM_MATH_SUCCESS);
-}
-
-__STATIC_INLINE arm_status arm_mat_trans_32bit_4x4_mve(uint32_t * pDataSrc, uint32_t * pDataDest)
-{
-    /*
-     * 4x4 Matrix transposition
-     * is 4 x de-interleave operation
-     *
-     * 0   1   2   3       0   4   8   12
-     * 4   5   6   7       1   5   9   13
-     * 8   9   10  11      2   6   10  14
-     * 12  13  14  15      3   7   11  15
-     */
-
-    uint32x4x4_t vecIn;
-
-    vecIn = vld4q((uint32_t const *) pDataSrc);
-    vstrwq(pDataDest, vecIn.val[0]);
-    pDataDest += 4;
-    vstrwq(pDataDest, vecIn.val[1]);
-    pDataDest += 4;
-    vstrwq(pDataDest, vecIn.val[2]);
-    pDataDest += 4;
-    vstrwq(pDataDest, vecIn.val[3]);
-
-    return (ARM_MATH_SUCCESS);
-}
-
-
-__STATIC_INLINE arm_status arm_mat_trans_32bit_generic_mve(
-    uint16_t    srcRows,
-    uint16_t    srcCols,
-    uint32_t  * pDataSrc,
-    uint32_t  * pDataDest)
-{
-    uint32x4_t vecOffs;
-    uint32_t  i;
-    uint32_t  blkCnt;
-    uint32_t const *pDataC;
-    uint32_t *pDataDestR;
-    uint32x4_t vecIn;
-
-    vecOffs = vidupq_u32((uint32_t)0, 1);
-    vecOffs = vecOffs * srcCols;
-
-    i = srcCols;
-    do
-    {
-        pDataC = (uint32_t const *) pDataSrc;
-        pDataDestR = pDataDest;
-
-        blkCnt = srcRows >> 2;
-        while (blkCnt > 0U)
-        {
-            vecIn = vldrwq_gather_shifted_offset_u32(pDataC, vecOffs);
-            vstrwq(pDataDestR, vecIn); 
-            pDataDestR += 4;
-            pDataC = pDataC + srcCols * 4;
-            /*
-             * Decrement the blockSize loop counter
-             */
-            blkCnt--;
-        }
-
-        /*
-         * tail
-         */
-        blkCnt = srcRows & 3;
-        if (blkCnt > 0U)
-        {
-            mve_pred16_t p0 = vctp32q(blkCnt);
-            vecIn = vldrwq_gather_shifted_offset_u32(pDataC, vecOffs);
-            vstrwq_p(pDataDestR, vecIn, p0);
-        }
-
-        pDataSrc += 1;
-        pDataDest += srcRows;
-    }
-    while (--i);
-
-    return (ARM_MATH_SUCCESS);
-}
-
-#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FAST_TABLES) || defined(ARM_TABLE_FAST_SQRT_Q31_MVE)
-__STATIC_INLINE q31x4_t FAST_VSQRT_Q31(q31x4_t vecIn)
-{
-    q63x2_t         vecTmpLL;
-    q31x4_t         vecTmp0, vecTmp1;
-    q31_t           scale;
-    q63_t           tmp64;
-    q31x4_t         vecNrm, vecDst, vecIdx, vecSignBits;
-
-
-    vecSignBits = vclsq(vecIn);
-    vecSignBits = vbicq(vecSignBits, 1);
-    /*
-     * in = in << no_of_sign_bits;
-     */
-    vecNrm = vshlq(vecIn, vecSignBits);
-    /*
-     * index = in >> 24;
-     */
-    vecIdx = vecNrm >> 24;
-    vecIdx = vecIdx << 1;
-
-    vecTmp0 = vldrwq_gather_shifted_offset_s32(sqrtTable_Q31, vecIdx);
-
-    vecIdx = vecIdx + 1;
-
-    vecTmp1 = vldrwq_gather_shifted_offset_s32(sqrtTable_Q31, vecIdx);
-
-    vecTmp1 = vqrdmulhq(vecTmp1, vecNrm);
-    vecTmp0 = vecTmp0 - vecTmp1;
-    vecTmp1 = vqrdmulhq(vecTmp0, vecTmp0);
-    vecTmp1 = vqrdmulhq(vecNrm, vecTmp1);
-    vecTmp1 = vdupq_n_s32(0x18000000) - vecTmp1;
-    vecTmp0 = vqrdmulhq(vecTmp0, vecTmp1);
-    vecTmpLL = vmullbq_int(vecNrm, vecTmp0);
-
-    /*
-     * scale elements 0, 2
-     */
-    scale = 26 + (vecSignBits[0] >> 1);
-    tmp64 = asrl(vecTmpLL[0], scale);
-    vecDst[0] = (q31_t) tmp64;
-
-    scale = 26 + (vecSignBits[2] >> 1);
-    tmp64 = asrl(vecTmpLL[1], scale);
-    vecDst[2] = (q31_t) tmp64;
-
-    vecTmpLL = vmulltq_int(vecNrm, vecTmp0);
-
-    /*
-     * scale elements 1, 3
-     */
-    scale = 26 + (vecSignBits[1] >> 1);
-    tmp64 = asrl(vecTmpLL[0], scale);
-    vecDst[1] = (q31_t) tmp64;
-
-    scale = 26 + (vecSignBits[3] >> 1);
-    tmp64 = asrl(vecTmpLL[1], scale);
-    vecDst[3] = (q31_t) tmp64;
-    /*
-     * set negative values to 0
-     */
-    vecDst = vdupq_m(vecDst, 0, vcmpltq_n_s32(vecIn, 0));
-
-    return vecDst;
-}
-#endif
-
-#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FAST_TABLES) || defined(ARM_TABLE_FAST_SQRT_Q15_MVE)
-__STATIC_INLINE q15x8_t FAST_VSQRT_Q15(q15x8_t vecIn)
-{
-    q31x4_t         vecTmpLev, vecTmpLodd, vecSignL;
-    q15x8_t         vecTmp0, vecTmp1;
-    q15x8_t         vecNrm, vecDst, vecIdx, vecSignBits;
-
-    vecDst = vuninitializedq_s16();
-
-    vecSignBits = vclsq(vecIn);
-    vecSignBits = vbicq(vecSignBits, 1);
-    /*
-     * in = in << no_of_sign_bits;
-     */
-    vecNrm = vshlq(vecIn, vecSignBits);
-
-    vecIdx = vecNrm >> 8;
-    vecIdx = vecIdx << 1;
-
-    vecTmp0 = vldrhq_gather_shifted_offset_s16(sqrtTable_Q15, vecIdx);
-
-    vecIdx = vecIdx + 1;
-
-    vecTmp1 = vldrhq_gather_shifted_offset_s16(sqrtTable_Q15, vecIdx);
-
-    vecTmp1 = vqrdmulhq(vecTmp1, vecNrm);
-    vecTmp0 = vecTmp0 - vecTmp1;
-    vecTmp1 = vqrdmulhq(vecTmp0, vecTmp0);
-    vecTmp1 = vqrdmulhq(vecNrm, vecTmp1);
-    vecTmp1 = vdupq_n_s16(0x1800) - vecTmp1;
-    vecTmp0 = vqrdmulhq(vecTmp0, vecTmp1);
-
-    vecSignBits = vecSignBits >> 1;
-
-    vecTmpLev = vmullbq_int(vecNrm, vecTmp0);
-    vecTmpLodd = vmulltq_int(vecNrm, vecTmp0);
-
-    vecTmp0 = vecSignBits + 10;
-    /*
-     * negate sign to apply register based vshl
-     */
-    vecTmp0 = -vecTmp0;
-
-    /*
-     * shift even elements
-     */
-    vecSignL = vmovlbq(vecTmp0);
-    vecTmpLev = vshlq(vecTmpLev, vecSignL);
-    /*
-     * shift odd elements
-     */
-    vecSignL = vmovltq(vecTmp0);
-    vecTmpLodd = vshlq(vecTmpLodd, vecSignL);
-    /*
-     * merge and narrow odd and even parts
-     */
-    vecDst = vmovnbq_s32(vecDst, vecTmpLev);
-    vecDst = vmovntq_s32(vecDst, vecTmpLodd);
-    /*
-     * set negative values to 0
-     */
-    vecDst = vdupq_m(vecDst, 0, vcmpltq_n_s16(vecIn, 0));
-
-    return vecDst;
-}
-#endif
-
-#endif /* defined (ARM_MATH_HELIUM) || defined(ARM_MATH_MVEI) */
-
-#endif
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS DSP Library
+ * Title:        arm_helium_utils.h
+ * Description:  Utility functions for Helium development
+ *
+ * $Date:        09. September 2019
+ * $Revision:    V.1.5.1
+ *
+ * Target Processor: Cortex-M cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef _ARM_UTILS_HELIUM_H_
+#define _ARM_UTILS_HELIUM_H_
+
+/***************************************
+
+Definitions available for MVEF and MVEI
+
+***************************************/
+#if defined (ARM_MATH_HELIUM) || defined(ARM_MATH_MVEF) || defined(ARM_MATH_MVEI)
+
+#define INACTIVELANE            0 /* inactive lane content */
+
+
+#endif /* defined (ARM_MATH_HELIUM) || defined(ARM_MATH_MVEF) || defined(ARM_MATH_MVEI) */
+
+/***************************************
+
+Definitions available for MVEF only
+
+***************************************/
+#if defined (ARM_MATH_HELIUM) || defined(ARM_MATH_MVEF)
+
+__STATIC_FORCEINLINE float32_t vecAddAcrossF32Mve(float32x4_t in)
+{
+    float32_t acc;
+
+    acc = vgetq_lane(in, 0) + vgetq_lane(in, 1) +
+          vgetq_lane(in, 2) + vgetq_lane(in, 3);
+
+    return acc;
+}
+
+/* newton initial guess */
+#define INVSQRT_MAGIC_F32           0x5f3759df
+
+#define INVSQRT_NEWTON_MVE_F32(invSqrt, xHalf, xStart)\
+{                                                     \
+    float32x4_t tmp;                                  \
+                                                      \
+    /* tmp = xhalf * x * x */                         \
+    tmp = vmulq(xStart, xStart);                      \
+    tmp = vmulq(tmp, xHalf);                          \
+    /* (1.5f - xhalf * x * x) */                      \
+    tmp = vsubq(vdupq_n_f32(1.5f), tmp);              \
+    /* x = x*(1.5f-xhalf*x*x); */                     \
+    invSqrt = vmulq(tmp, xStart);                     \
+}
+#endif /* defined (ARM_MATH_HELIUM) || defined(ARM_MATH_MVEF) */
+
+/***************************************
+
+Definitions available for MVEI only
+
+***************************************/
+#if defined (ARM_MATH_HELIUM) || defined(ARM_MATH_MVEI)
+
+
+#include "arm_common_tables.h"
+
+/* Following functions are used to transpose matrix in f32 and q31 cases */
+__STATIC_INLINE arm_status arm_mat_trans_32bit_2x2_mve(
+    uint32_t * pDataSrc,
+    uint32_t * pDataDest)
+{
+    static const uint32x4_t vecOffs = { 0, 2, 1, 3 };
+    /*
+     *
+     * | 0   1 |   =>  |  0   2 |
+     * | 2   3 |       |  1   3 |
+     *
+     */
+    uint32x4_t vecIn = vldrwq_u32((uint32_t const *)pDataSrc);
+    vstrwq_scatter_shifted_offset_u32(pDataDest, vecOffs, vecIn);
+
+    return (ARM_MATH_SUCCESS);
+}
+
+__STATIC_INLINE arm_status arm_mat_trans_32bit_3x3_mve(
+    uint32_t * pDataSrc,
+    uint32_t * pDataDest)
+{
+    const uint32x4_t vecOffs1 = { 0, 3, 6, 1};
+    const uint32x4_t vecOffs2 = { 4, 7, 2, 5};
+    /*
+     *
+     *  | 0   1   2 |       | 0   3   6 |  4 x 32 flattened version | 0   3   6   1 |
+     *  | 3   4   5 |   =>  | 1   4   7 |            =>             | 4   7   2   5 |
+     *  | 6   7   8 |       | 2   5   8 |       (row major)         | 8   .   .   . |
+     *
+     */
+    uint32x4_t vecIn1 = vldrwq_u32((uint32_t const *) pDataSrc);
+    uint32x4_t vecIn2 = vldrwq_u32((uint32_t const *) &pDataSrc[4]);
+
+    vstrwq_scatter_shifted_offset_u32(pDataDest, vecOffs1, vecIn1);
+    vstrwq_scatter_shifted_offset_u32(pDataDest, vecOffs2, vecIn2);
+
+    pDataDest[8] = pDataSrc[8];
+
+    return (ARM_MATH_SUCCESS);
+}
+
+__STATIC_INLINE arm_status arm_mat_trans_32bit_4x4_mve(uint32_t * pDataSrc, uint32_t * pDataDest)
+{
+    /*
+     * 4x4 Matrix transposition
+     * is 4 x de-interleave operation
+     *
+     * 0   1   2   3       0   4   8   12
+     * 4   5   6   7       1   5   9   13
+     * 8   9   10  11      2   6   10  14
+     * 12  13  14  15      3   7   11  15
+     */
+
+    uint32x4x4_t vecIn;
+
+    vecIn = vld4q((uint32_t const *) pDataSrc);
+    vstrwq(pDataDest, vecIn.val[0]);
+    pDataDest += 4;
+    vstrwq(pDataDest, vecIn.val[1]);
+    pDataDest += 4;
+    vstrwq(pDataDest, vecIn.val[2]);
+    pDataDest += 4;
+    vstrwq(pDataDest, vecIn.val[3]);
+
+    return (ARM_MATH_SUCCESS);
+}
+
+
+__STATIC_INLINE arm_status arm_mat_trans_32bit_generic_mve(
+    uint16_t    srcRows,
+    uint16_t    srcCols,
+    uint32_t  * pDataSrc,
+    uint32_t  * pDataDest)
+{
+    uint32x4_t vecOffs;
+    uint32_t  i;
+    uint32_t  blkCnt;
+    uint32_t const *pDataC;
+    uint32_t *pDataDestR;
+    uint32x4_t vecIn;
+
+    vecOffs = vidupq_u32((uint32_t)0, 1);
+    vecOffs = vecOffs * srcCols;
+
+    i = srcCols;
+    do
+    {
+        pDataC = (uint32_t const *) pDataSrc;
+        pDataDestR = pDataDest;
+
+        blkCnt = srcRows >> 2;
+        while (blkCnt > 0U)
+        {
+            vecIn = vldrwq_gather_shifted_offset_u32(pDataC, vecOffs);
+            vstrwq(pDataDestR, vecIn);
+            pDataDestR += 4;
+            pDataC = pDataC + srcCols * 4;
+            /*
+             * Decrement the blockSize loop counter
+             */
+            blkCnt--;
+        }
+
+        /*
+         * tail
+         */
+        blkCnt = srcRows & 3;
+        if (blkCnt > 0U)
+        {
+            mve_pred16_t p0 = vctp32q(blkCnt);
+            vecIn = vldrwq_gather_shifted_offset_u32(pDataC, vecOffs);
+            vstrwq_p(pDataDestR, vecIn, p0);
+        }
+
+        pDataSrc += 1;
+        pDataDest += srcRows;
+    }
+    while (--i);
+
+    return (ARM_MATH_SUCCESS);
+}
+
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FAST_TABLES) || defined(ARM_TABLE_FAST_SQRT_Q31_MVE)
+__STATIC_INLINE q31x4_t FAST_VSQRT_Q31(q31x4_t vecIn)
+{
+    q63x2_t         vecTmpLL;
+    q31x4_t         vecTmp0, vecTmp1;
+    q31_t           scale;
+    q63_t           tmp64;
+    q31x4_t         vecNrm, vecDst, vecIdx, vecSignBits;
+
+
+    vecSignBits = vclsq(vecIn);
+    vecSignBits = vbicq(vecSignBits, 1);
+    /*
+     * in = in << no_of_sign_bits;
+     */
+    vecNrm = vshlq(vecIn, vecSignBits);
+    /*
+     * index = in >> 24;
+     */
+    vecIdx = vecNrm >> 24;
+    vecIdx = vecIdx << 1;
+
+    vecTmp0 = vldrwq_gather_shifted_offset_s32(sqrtTable_Q31, vecIdx);
+
+    vecIdx = vecIdx + 1;
+
+    vecTmp1 = vldrwq_gather_shifted_offset_s32(sqrtTable_Q31, vecIdx);
+
+    vecTmp1 = vqrdmulhq(vecTmp1, vecNrm);
+    vecTmp0 = vecTmp0 - vecTmp1;
+    vecTmp1 = vqrdmulhq(vecTmp0, vecTmp0);
+    vecTmp1 = vqrdmulhq(vecNrm, vecTmp1);
+    vecTmp1 = vdupq_n_s32(0x18000000) - vecTmp1;
+    vecTmp0 = vqrdmulhq(vecTmp0, vecTmp1);
+    vecTmpLL = vmullbq_int(vecNrm, vecTmp0);
+
+    /*
+     * scale elements 0, 2
+     */
+    scale = 26 + (vecSignBits[0] >> 1);
+    tmp64 = asrl(vecTmpLL[0], scale);
+    vecDst[0] = (q31_t) tmp64;
+
+    scale = 26 + (vecSignBits[2] >> 1);
+    tmp64 = asrl(vecTmpLL[1], scale);
+    vecDst[2] = (q31_t) tmp64;
+
+    vecTmpLL = vmulltq_int(vecNrm, vecTmp0);
+
+    /*
+     * scale elements 1, 3
+     */
+    scale = 26 + (vecSignBits[1] >> 1);
+    tmp64 = asrl(vecTmpLL[0], scale);
+    vecDst[1] = (q31_t) tmp64;
+
+    scale = 26 + (vecSignBits[3] >> 1);
+    tmp64 = asrl(vecTmpLL[1], scale);
+    vecDst[3] = (q31_t) tmp64;
+    /*
+     * set negative values to 0
+     */
+    vecDst = vdupq_m(vecDst, 0, vcmpltq_n_s32(vecIn, 0));
+
+    return vecDst;
+}
+#endif
+
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FAST_TABLES) || defined(ARM_TABLE_FAST_SQRT_Q15_MVE)
+__STATIC_INLINE q15x8_t FAST_VSQRT_Q15(q15x8_t vecIn)
+{
+    q31x4_t         vecTmpLev, vecTmpLodd, vecSignL;
+    q15x8_t         vecTmp0, vecTmp1;
+    q15x8_t         vecNrm, vecDst, vecIdx, vecSignBits;
+
+    vecDst = vuninitializedq_s16();
+
+    vecSignBits = vclsq(vecIn);
+    vecSignBits = vbicq(vecSignBits, 1);
+    /*
+     * in = in << no_of_sign_bits;
+     */
+    vecNrm = vshlq(vecIn, vecSignBits);
+
+    vecIdx = vecNrm >> 8;
+    vecIdx = vecIdx << 1;
+
+    vecTmp0 = vldrhq_gather_shifted_offset_s16(sqrtTable_Q15, vecIdx);
+
+    vecIdx = vecIdx + 1;
+
+    vecTmp1 = vldrhq_gather_shifted_offset_s16(sqrtTable_Q15, vecIdx);
+
+    vecTmp1 = vqrdmulhq(vecTmp1, vecNrm);
+    vecTmp0 = vecTmp0 - vecTmp1;
+    vecTmp1 = vqrdmulhq(vecTmp0, vecTmp0);
+    vecTmp1 = vqrdmulhq(vecNrm, vecTmp1);
+    vecTmp1 = vdupq_n_s16(0x1800) - vecTmp1;
+    vecTmp0 = vqrdmulhq(vecTmp0, vecTmp1);
+
+    vecSignBits = vecSignBits >> 1;
+
+    vecTmpLev = vmullbq_int(vecNrm, vecTmp0);
+    vecTmpLodd = vmulltq_int(vecNrm, vecTmp0);
+
+    vecTmp0 = vecSignBits + 10;
+    /*
+     * negate sign to apply register based vshl
+     */
+    vecTmp0 = -vecTmp0;
+
+    /*
+     * shift even elements
+     */
+    vecSignL = vmovlbq(vecTmp0);
+    vecTmpLev = vshlq(vecTmpLev, vecSignL);
+    /*
+     * shift odd elements
+     */
+    vecSignL = vmovltq(vecTmp0);
+    vecTmpLodd = vshlq(vecTmpLodd, vecSignL);
+    /*
+     * merge and narrow odd and even parts
+     */
+    vecDst = vmovnbq_s32(vecDst, vecTmpLev);
+    vecDst = vmovntq_s32(vecDst, vecTmpLodd);
+    /*
+     * set negative values to 0
+     */
+    vecDst = vdupq_m(vecDst, 0, vcmpltq_n_s16(vecIn, 0));
+
+    return vecDst;
+}
+#endif
+
+#endif /* defined (ARM_MATH_HELIUM) || defined(ARM_MATH_MVEI) */
+
+#endif
--- a/libraries/cmsis/cm4/core_support/arm_math.h
+++ b/libraries/cmsis/cm4/core_support/arm_math.h
--- a/libraries/cmsis/cm4/core_support/arm_mve_tables.h
+++ b/libraries/cmsis/cm4/core_support/arm_mve_tables.h
@@ -1,235 +1,235 @@
-/* ----------------------------------------------------------------------
- * Project:      CMSIS DSP Library
- * Title:        arm_mve_tables.h
- * Description:  common tables like fft twiddle factors, Bitreverse, reciprocal etc
- *               used for MVE implementation only
- *
- * $Date:        08. January 2020
- * $Revision:    V1.7.0
- *
- * Target Processor: Cortex-M cores
- * -------------------------------------------------------------------- */
-/*
- * Copyright (C) 2010-2020 ARM Limited or its affiliates. All rights reserved.
- *
- * SPDX-License-Identifier: Apache-2.0
- *
- * Licensed under the Apache License, Version 2.0 (the License); you may
- * not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an AS IS BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
- #ifndef _ARM_MVE_TABLES_H
- #define _ARM_MVE_TABLES_H
-
- #include "arm_math.h"
-
- 
-
-
- 
-
-#if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE)
-
-#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_FFT_ALLOW_TABLES)
-
-#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_TWIDDLECOEF_F32_16) || defined(ARM_TABLE_TWIDDLECOEF_F32_32)
-
-extern uint32_t rearranged_twiddle_tab_stride1_arr_16_f32[2];
-extern uint32_t rearranged_twiddle_tab_stride2_arr_16_f32[2];
-extern uint32_t rearranged_twiddle_tab_stride3_arr_16_f32[2];
-extern float32_t rearranged_twiddle_stride1_16_f32[8];
-extern float32_t rearranged_twiddle_stride2_16_f32[8];
-extern float32_t rearranged_twiddle_stride3_16_f32[8];
-#endif
-
-#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_TWIDDLECOEF_F32_64) || defined(ARM_TABLE_TWIDDLECOEF_F32_128)
-
-extern uint32_t rearranged_twiddle_tab_stride1_arr_64_f32[3];
-extern uint32_t rearranged_twiddle_tab_stride2_arr_64_f32[3];
-extern uint32_t rearranged_twiddle_tab_stride3_arr_64_f32[3];
-extern float32_t rearranged_twiddle_stride1_64_f32[40];
-extern float32_t rearranged_twiddle_stride2_64_f32[40];
-extern float32_t rearranged_twiddle_stride3_64_f32[40];
-#endif
-
-#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_TWIDDLECOEF_F32_256) || defined(ARM_TABLE_TWIDDLECOEF_F32_512)
-
-extern uint32_t rearranged_twiddle_tab_stride1_arr_256_f32[4];
-extern uint32_t rearranged_twiddle_tab_stride2_arr_256_f32[4];
-extern uint32_t rearranged_twiddle_tab_stride3_arr_256_f32[4];
-extern float32_t rearranged_twiddle_stride1_256_f32[168];
-extern float32_t rearranged_twiddle_stride2_256_f32[168];
-extern float32_t rearranged_twiddle_stride3_256_f32[168];
-#endif
-
-#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_TWIDDLECOEF_F32_1024) || defined(ARM_TABLE_TWIDDLECOEF_F32_2048)
-
-extern uint32_t rearranged_twiddle_tab_stride1_arr_1024_f32[5];
-extern uint32_t rearranged_twiddle_tab_stride2_arr_1024_f32[5];
-extern uint32_t rearranged_twiddle_tab_stride3_arr_1024_f32[5];
-extern float32_t rearranged_twiddle_stride1_1024_f32[680];
-extern float32_t rearranged_twiddle_stride2_1024_f32[680];
-extern float32_t rearranged_twiddle_stride3_1024_f32[680];
-#endif
-
-#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_TWIDDLECOEF_F32_4096) || defined(ARM_TABLE_TWIDDLECOEF_F32_8192)
-
-extern uint32_t rearranged_twiddle_tab_stride1_arr_4096_f32[6];
-extern uint32_t rearranged_twiddle_tab_stride2_arr_4096_f32[6];
-extern uint32_t rearranged_twiddle_tab_stride3_arr_4096_f32[6];
-extern float32_t rearranged_twiddle_stride1_4096_f32[2728];
-extern float32_t rearranged_twiddle_stride2_4096_f32[2728];
-extern float32_t rearranged_twiddle_stride3_4096_f32[2728];
-#endif
-
-
-#endif /* !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_FFT_ALLOW_TABLES) */
-
-#endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
-
-
-
-#if defined(ARM_MATH_MVEI) 
-
-#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_FFT_ALLOW_TABLES)
-
-#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_TWIDDLECOEF_Q31_16) || defined(ARM_TABLE_TWIDDLECOEF_Q31_32)
-
-extern uint32_t rearranged_twiddle_tab_stride1_arr_16_q31[2];
-extern uint32_t rearranged_twiddle_tab_stride2_arr_16_q31[2];
-extern uint32_t rearranged_twiddle_tab_stride3_arr_16_q31[2];
-extern q31_t rearranged_twiddle_stride1_16_q31[8];
-extern q31_t rearranged_twiddle_stride2_16_q31[8];
-extern q31_t rearranged_twiddle_stride3_16_q31[8];
-#endif
-
-#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_TWIDDLECOEF_Q31_64) || defined(ARM_TABLE_TWIDDLECOEF_Q31_128)
-
-extern uint32_t rearranged_twiddle_tab_stride1_arr_64_q31[3];
-extern uint32_t rearranged_twiddle_tab_stride2_arr_64_q31[3];
-extern uint32_t rearranged_twiddle_tab_stride3_arr_64_q31[3];
-extern q31_t rearranged_twiddle_stride1_64_q31[40];
-extern q31_t rearranged_twiddle_stride2_64_q31[40];
-extern q31_t rearranged_twiddle_stride3_64_q31[40];
-#endif
-
-#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_TWIDDLECOEF_Q31_256) || defined(ARM_TABLE_TWIDDLECOEF_Q31_512)
-
-extern uint32_t rearranged_twiddle_tab_stride1_arr_256_q31[4];
-extern uint32_t rearranged_twiddle_tab_stride2_arr_256_q31[4];
-extern uint32_t rearranged_twiddle_tab_stride3_arr_256_q31[4];
-extern q31_t rearranged_twiddle_stride1_256_q31[168];
-extern q31_t rearranged_twiddle_stride2_256_q31[168];
-extern q31_t rearranged_twiddle_stride3_256_q31[168];
-#endif
-
-#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_TWIDDLECOEF_Q31_1024) || defined(ARM_TABLE_TWIDDLECOEF_Q31_2048)
-
-extern uint32_t rearranged_twiddle_tab_stride1_arr_1024_q31[5];
-extern uint32_t rearranged_twiddle_tab_stride2_arr_1024_q31[5];
-extern uint32_t rearranged_twiddle_tab_stride3_arr_1024_q31[5];
-extern q31_t rearranged_twiddle_stride1_1024_q31[680];
-extern q31_t rearranged_twiddle_stride2_1024_q31[680];
-extern q31_t rearranged_twiddle_stride3_1024_q31[680];
-#endif
-
-#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_TWIDDLECOEF_Q31_4096) || defined(ARM_TABLE_TWIDDLECOEF_Q31_8192)
-
-extern uint32_t rearranged_twiddle_tab_stride1_arr_4096_q31[6];
-extern uint32_t rearranged_twiddle_tab_stride2_arr_4096_q31[6];
-extern uint32_t rearranged_twiddle_tab_stride3_arr_4096_q31[6];
-extern q31_t rearranged_twiddle_stride1_4096_q31[2728];
-extern q31_t rearranged_twiddle_stride2_4096_q31[2728];
-extern q31_t rearranged_twiddle_stride3_4096_q31[2728];
-#endif
-
-
-#endif /* !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_FFT_ALLOW_TABLES) */
-
-#endif /* defined(ARM_MATH_MVEI) */
-
-
-
-#if defined(ARM_MATH_MVEI) 
-
-#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_FFT_ALLOW_TABLES)
-
-#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_TWIDDLECOEF_Q15_16) || defined(ARM_TABLE_TWIDDLECOEF_Q15_32)
-
-extern uint32_t rearranged_twiddle_tab_stride1_arr_16_q15[2];
-extern uint32_t rearranged_twiddle_tab_stride2_arr_16_q15[2];
-extern uint32_t rearranged_twiddle_tab_stride3_arr_16_q15[2];
-extern q15_t rearranged_twiddle_stride1_16_q15[8];
-extern q15_t rearranged_twiddle_stride2_16_q15[8];
-extern q15_t rearranged_twiddle_stride3_16_q15[8];
-#endif
-
-#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_TWIDDLECOEF_Q15_64) || defined(ARM_TABLE_TWIDDLECOEF_Q15_128)
-
-extern uint32_t rearranged_twiddle_tab_stride1_arr_64_q15[3];
-extern uint32_t rearranged_twiddle_tab_stride2_arr_64_q15[3];
-extern uint32_t rearranged_twiddle_tab_stride3_arr_64_q15[3];
-extern q15_t rearranged_twiddle_stride1_64_q15[40];
-extern q15_t rearranged_twiddle_stride2_64_q15[40];
-extern q15_t rearranged_twiddle_stride3_64_q15[40];
-#endif
-
-#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_TWIDDLECOEF_Q15_256) || defined(ARM_TABLE_TWIDDLECOEF_Q15_512)
-
-extern uint32_t rearranged_twiddle_tab_stride1_arr_256_q15[4];
-extern uint32_t rearranged_twiddle_tab_stride2_arr_256_q15[4];
-extern uint32_t rearranged_twiddle_tab_stride3_arr_256_q15[4];
-extern q15_t rearranged_twiddle_stride1_256_q15[168];
-extern q15_t rearranged_twiddle_stride2_256_q15[168];
-extern q15_t rearranged_twiddle_stride3_256_q15[168];
-#endif
-
-#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_TWIDDLECOEF_Q15_1024) || defined(ARM_TABLE_TWIDDLECOEF_Q15_2048)
-
-extern uint32_t rearranged_twiddle_tab_stride1_arr_1024_q15[5];
-extern uint32_t rearranged_twiddle_tab_stride2_arr_1024_q15[5];
-extern uint32_t rearranged_twiddle_tab_stride3_arr_1024_q15[5];
-extern q15_t rearranged_twiddle_stride1_1024_q15[680];
-extern q15_t rearranged_twiddle_stride2_1024_q15[680];
-extern q15_t rearranged_twiddle_stride3_1024_q15[680];
-#endif
-
-#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_TWIDDLECOEF_Q15_4096) || defined(ARM_TABLE_TWIDDLECOEF_Q15_8192)
-
-extern uint32_t rearranged_twiddle_tab_stride1_arr_4096_q15[6];
-extern uint32_t rearranged_twiddle_tab_stride2_arr_4096_q15[6];
-extern uint32_t rearranged_twiddle_tab_stride3_arr_4096_q15[6];
-extern q15_t rearranged_twiddle_stride1_4096_q15[2728];
-extern q15_t rearranged_twiddle_stride2_4096_q15[2728];
-extern q15_t rearranged_twiddle_stride3_4096_q15[2728];
-#endif
-
-
-#endif /* !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_FFT_ALLOW_TABLES) */
-
-#endif /* defined(ARM_MATH_MVEI) */
-
-
-
-#if defined(ARM_MATH_MVEI) 
-
-#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_FFT_ALLOW_TABLES)
-
-
-#endif /* !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_FFT_ALLOW_TABLES) */
-
-#endif /* defined(ARM_MATH_MVEI) */
-
-
-
-#endif /*_ARM_MVE_TABLES_H*/
-
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS DSP Library
+ * Title:        arm_mve_tables.h
+ * Description:  common tables like fft twiddle factors, Bitreverse, reciprocal etc
+ *               used for MVE implementation only
+ *
+ * $Date:        08. January 2020
+ * $Revision:    V1.7.0
+ *
+ * Target Processor: Cortex-M cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2020 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+ #ifndef _ARM_MVE_TABLES_H
+ #define _ARM_MVE_TABLES_H
+
+ #include "arm_math.h"
+
+
+
+
+
+
+#if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE)
+/* float32 table declarations: visible only for the MVE float build and hidden when ARM_MATH_AUTOVECTORIZE is defined */
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_FFT_ALLOW_TABLES)
+/* Each group is enabled by either of two size macros (N or 2N) and declares three uint32_t "tab_strideK_arr" arrays plus three float32_t "strideK" arrays. NOTE(review): presumably per-stage offsets and the rearranged twiddle coefficients - confirm against the table definitions. */
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_TWIDDLECOEF_F32_16) || defined(ARM_TABLE_TWIDDLECOEF_F32_32)
+
+extern uint32_t rearranged_twiddle_tab_stride1_arr_16_f32[2];
+extern uint32_t rearranged_twiddle_tab_stride2_arr_16_f32[2];
+extern uint32_t rearranged_twiddle_tab_stride3_arr_16_f32[2];
+extern float32_t rearranged_twiddle_stride1_16_f32[8];
+extern float32_t rearranged_twiddle_stride2_16_f32[8];
+extern float32_t rearranged_twiddle_stride3_16_f32[8];
+#endif
+
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_TWIDDLECOEF_F32_64) || defined(ARM_TABLE_TWIDDLECOEF_F32_128)
+
+extern uint32_t rearranged_twiddle_tab_stride1_arr_64_f32[3];
+extern uint32_t rearranged_twiddle_tab_stride2_arr_64_f32[3];
+extern uint32_t rearranged_twiddle_tab_stride3_arr_64_f32[3];
+extern float32_t rearranged_twiddle_stride1_64_f32[40];
+extern float32_t rearranged_twiddle_stride2_64_f32[40];
+extern float32_t rearranged_twiddle_stride3_64_f32[40];
+#endif
+
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_TWIDDLECOEF_F32_256) || defined(ARM_TABLE_TWIDDLECOEF_F32_512)
+
+extern uint32_t rearranged_twiddle_tab_stride1_arr_256_f32[4];
+extern uint32_t rearranged_twiddle_tab_stride2_arr_256_f32[4];
+extern uint32_t rearranged_twiddle_tab_stride3_arr_256_f32[4];
+extern float32_t rearranged_twiddle_stride1_256_f32[168];
+extern float32_t rearranged_twiddle_stride2_256_f32[168];
+extern float32_t rearranged_twiddle_stride3_256_f32[168];
+#endif
+
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_TWIDDLECOEF_F32_1024) || defined(ARM_TABLE_TWIDDLECOEF_F32_2048)
+
+extern uint32_t rearranged_twiddle_tab_stride1_arr_1024_f32[5];
+extern uint32_t rearranged_twiddle_tab_stride2_arr_1024_f32[5];
+extern uint32_t rearranged_twiddle_tab_stride3_arr_1024_f32[5];
+extern float32_t rearranged_twiddle_stride1_1024_f32[680];
+extern float32_t rearranged_twiddle_stride2_1024_f32[680];
+extern float32_t rearranged_twiddle_stride3_1024_f32[680];
+#endif
+
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_TWIDDLECOEF_F32_4096) || defined(ARM_TABLE_TWIDDLECOEF_F32_8192)
+
+extern uint32_t rearranged_twiddle_tab_stride1_arr_4096_f32[6];
+extern uint32_t rearranged_twiddle_tab_stride2_arr_4096_f32[6];
+extern uint32_t rearranged_twiddle_tab_stride3_arr_4096_f32[6];
+extern float32_t rearranged_twiddle_stride1_4096_f32[2728];
+extern float32_t rearranged_twiddle_stride2_4096_f32[2728];
+extern float32_t rearranged_twiddle_stride3_4096_f32[2728];
+#endif
+
+
+#endif /* !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_FFT_ALLOW_TABLES) */
+
+#endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
+
+
+
+#if defined(ARM_MATH_MVEI)
+/* q31 table declarations: visible only when ARM_MATH_MVEI is defined */
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_FFT_ALLOW_TABLES)
+/* Each group is enabled by either of two size macros (N or 2N) and declares three uint32_t "tab_strideK_arr" arrays plus three q31_t "strideK" arrays. NOTE(review): presumably per-stage offsets and the rearranged twiddle coefficients - confirm against the table definitions. */
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_TWIDDLECOEF_Q31_16) || defined(ARM_TABLE_TWIDDLECOEF_Q31_32)
+
+extern uint32_t rearranged_twiddle_tab_stride1_arr_16_q31[2];
+extern uint32_t rearranged_twiddle_tab_stride2_arr_16_q31[2];
+extern uint32_t rearranged_twiddle_tab_stride3_arr_16_q31[2];
+extern q31_t rearranged_twiddle_stride1_16_q31[8];
+extern q31_t rearranged_twiddle_stride2_16_q31[8];
+extern q31_t rearranged_twiddle_stride3_16_q31[8];
+#endif
+
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_TWIDDLECOEF_Q31_64) || defined(ARM_TABLE_TWIDDLECOEF_Q31_128)
+
+extern uint32_t rearranged_twiddle_tab_stride1_arr_64_q31[3];
+extern uint32_t rearranged_twiddle_tab_stride2_arr_64_q31[3];
+extern uint32_t rearranged_twiddle_tab_stride3_arr_64_q31[3];
+extern q31_t rearranged_twiddle_stride1_64_q31[40];
+extern q31_t rearranged_twiddle_stride2_64_q31[40];
+extern q31_t rearranged_twiddle_stride3_64_q31[40];
+#endif
+
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_TWIDDLECOEF_Q31_256) || defined(ARM_TABLE_TWIDDLECOEF_Q31_512)
+
+extern uint32_t rearranged_twiddle_tab_stride1_arr_256_q31[4];
+extern uint32_t rearranged_twiddle_tab_stride2_arr_256_q31[4];
+extern uint32_t rearranged_twiddle_tab_stride3_arr_256_q31[4];
+extern q31_t rearranged_twiddle_stride1_256_q31[168];
+extern q31_t rearranged_twiddle_stride2_256_q31[168];
+extern q31_t rearranged_twiddle_stride3_256_q31[168];
+#endif
+
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_TWIDDLECOEF_Q31_1024) || defined(ARM_TABLE_TWIDDLECOEF_Q31_2048)
+
+extern uint32_t rearranged_twiddle_tab_stride1_arr_1024_q31[5];
+extern uint32_t rearranged_twiddle_tab_stride2_arr_1024_q31[5];
+extern uint32_t rearranged_twiddle_tab_stride3_arr_1024_q31[5];
+extern q31_t rearranged_twiddle_stride1_1024_q31[680];
+extern q31_t rearranged_twiddle_stride2_1024_q31[680];
+extern q31_t rearranged_twiddle_stride3_1024_q31[680];
+#endif
+
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_TWIDDLECOEF_Q31_4096) || defined(ARM_TABLE_TWIDDLECOEF_Q31_8192)
+
+extern uint32_t rearranged_twiddle_tab_stride1_arr_4096_q31[6];
+extern uint32_t rearranged_twiddle_tab_stride2_arr_4096_q31[6];
+extern uint32_t rearranged_twiddle_tab_stride3_arr_4096_q31[6];
+extern q31_t rearranged_twiddle_stride1_4096_q31[2728];
+extern q31_t rearranged_twiddle_stride2_4096_q31[2728];
+extern q31_t rearranged_twiddle_stride3_4096_q31[2728];
+#endif
+
+
+#endif /* !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_FFT_ALLOW_TABLES) */
+
+#endif /* defined(ARM_MATH_MVEI) */
+
+
+
+#if defined(ARM_MATH_MVEI)
+/* q15 table declarations: visible only when ARM_MATH_MVEI is defined */
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_FFT_ALLOW_TABLES)
+/* Each group is enabled by either of two size macros (N or 2N) and declares three uint32_t "tab_strideK_arr" arrays plus three q15_t "strideK" arrays. NOTE(review): presumably per-stage offsets and the rearranged twiddle coefficients - confirm against the table definitions. */
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_TWIDDLECOEF_Q15_16) || defined(ARM_TABLE_TWIDDLECOEF_Q15_32)
+
+extern uint32_t rearranged_twiddle_tab_stride1_arr_16_q15[2];
+extern uint32_t rearranged_twiddle_tab_stride2_arr_16_q15[2];
+extern uint32_t rearranged_twiddle_tab_stride3_arr_16_q15[2];
+extern q15_t rearranged_twiddle_stride1_16_q15[8];
+extern q15_t rearranged_twiddle_stride2_16_q15[8];
+extern q15_t rearranged_twiddle_stride3_16_q15[8];
+#endif
+
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_TWIDDLECOEF_Q15_64) || defined(ARM_TABLE_TWIDDLECOEF_Q15_128)
+
+extern uint32_t rearranged_twiddle_tab_stride1_arr_64_q15[3];
+extern uint32_t rearranged_twiddle_tab_stride2_arr_64_q15[3];
+extern uint32_t rearranged_twiddle_tab_stride3_arr_64_q15[3];
+extern q15_t rearranged_twiddle_stride1_64_q15[40];
+extern q15_t rearranged_twiddle_stride2_64_q15[40];
+extern q15_t rearranged_twiddle_stride3_64_q15[40];
+#endif
+
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_TWIDDLECOEF_Q15_256) || defined(ARM_TABLE_TWIDDLECOEF_Q15_512)
+
+extern uint32_t rearranged_twiddle_tab_stride1_arr_256_q15[4];
+extern uint32_t rearranged_twiddle_tab_stride2_arr_256_q15[4];
+extern uint32_t rearranged_twiddle_tab_stride3_arr_256_q15[4];
+extern q15_t rearranged_twiddle_stride1_256_q15[168];
+extern q15_t rearranged_twiddle_stride2_256_q15[168];
+extern q15_t rearranged_twiddle_stride3_256_q15[168];
+#endif
+
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_TWIDDLECOEF_Q15_1024) || defined(ARM_TABLE_TWIDDLECOEF_Q15_2048)
+
+extern uint32_t rearranged_twiddle_tab_stride1_arr_1024_q15[5];
+extern uint32_t rearranged_twiddle_tab_stride2_arr_1024_q15[5];
+extern uint32_t rearranged_twiddle_tab_stride3_arr_1024_q15[5];
+extern q15_t rearranged_twiddle_stride1_1024_q15[680];
+extern q15_t rearranged_twiddle_stride2_1024_q15[680];
+extern q15_t rearranged_twiddle_stride3_1024_q15[680];
+#endif
+
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_TWIDDLECOEF_Q15_4096) || defined(ARM_TABLE_TWIDDLECOEF_Q15_8192)
+
+extern uint32_t rearranged_twiddle_tab_stride1_arr_4096_q15[6];
+extern uint32_t rearranged_twiddle_tab_stride2_arr_4096_q15[6];
+extern uint32_t rearranged_twiddle_tab_stride3_arr_4096_q15[6];
+extern q15_t rearranged_twiddle_stride1_4096_q15[2728];
+extern q15_t rearranged_twiddle_stride2_4096_q15[2728];
+extern q15_t rearranged_twiddle_stride3_4096_q15[2728];
+#endif
+
+
+#endif /* !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_FFT_ALLOW_TABLES) */
+
+#endif /* defined(ARM_MATH_MVEI) */
+
+
+
+#if defined(ARM_MATH_MVEI)
+
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_FFT_ALLOW_TABLES)
+/* this guarded region currently contains no declarations */
+
+#endif /* !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_FFT_ALLOW_TABLES) */
+
+#endif /* defined(ARM_MATH_MVEI) */
+
+
+
+#endif /*_ARM_MVE_TABLES_H*/
+
--- a/libraries/cmsis/cm4/core_support/arm_vec_math.h
+++ b/libraries/cmsis/cm4/core_support/arm_vec_math.h
@@ -1,372 +1,372 @@
-/******************************************************************************
- * @file     arm_vec_math.h
- * @brief    Public header file for CMSIS DSP Library
- * @version  V1.7.0
- * @date     15. October 2019
- ******************************************************************************/
-/*
- * Copyright (c) 2010-2019 Arm Limited or its affiliates. All rights reserved.
- *
- * SPDX-License-Identifier: Apache-2.0
- *
- * Licensed under the Apache License, Version 2.0 (the License); you may
- * not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an AS IS BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef _ARM_VEC_MATH_H
-#define _ARM_VEC_MATH_H
-
-#include "arm_math.h"
-#include "arm_common_tables.h"
-#include "arm_helium_utils.h"
-
-#ifdef   __cplusplus
-extern "C"
-{
-#endif
-
-#if (defined(ARM_MATH_MVEF) || defined(ARM_MATH_HELIUM)) && !defined(ARM_MATH_AUTOVECTORIZE)
-
-#define INV_NEWTON_INIT_F32         0x7EF127EA
-
-static const float32_t __logf_rng_f32=0.693147180f;
-
-
-/* fast inverse approximation (3x newton) */
-__STATIC_INLINE f32x4_t vrecip_medprec_f32(
-    f32x4_t x)
-{
-    q31x4_t         m;
-    f32x4_t         b;
-    any32x4_t       xinv;
-    f32x4_t         ax = vabsq(x);
-
-    xinv.f = ax;
-    m = 0x3F800000 - (xinv.i & 0x7F800000);
-    xinv.i = xinv.i + m;
-    xinv.f = 1.41176471f - 0.47058824f * xinv.f;
-    xinv.i = xinv.i + m;
-
-    b = 2.0f - xinv.f * ax;
-    xinv.f = xinv.f * b;
-
-    b = 2.0f - xinv.f * ax;
-    xinv.f = xinv.f * b;
-
-    b = 2.0f - xinv.f * ax;
-    xinv.f = xinv.f * b;
-
-    xinv.f = vdupq_m(xinv.f, INFINITY, vcmpeqq(x, 0.0f));
-    /*
-     * restore sign
-     */
-    xinv.f = vnegq_m(xinv.f, xinv.f, vcmpltq(x, 0.0f));
-
-    return xinv.f;
-}
-
-/* fast inverse approximation (4x newton) */
-__STATIC_INLINE f32x4_t vrecip_hiprec_f32(
-    f32x4_t x)
-{
-    q31x4_t         m;
-    f32x4_t         b;
-    any32x4_t       xinv;
-    f32x4_t         ax = vabsq(x);
-
-    xinv.f = ax;
-
-    m = 0x3F800000 - (xinv.i & 0x7F800000);
-    xinv.i = xinv.i + m;
-    xinv.f = 1.41176471f - 0.47058824f * xinv.f;
-    xinv.i = xinv.i + m;
-
-    b = 2.0f - xinv.f * ax;
-    xinv.f = xinv.f * b;
-
-    b = 2.0f - xinv.f * ax;
-    xinv.f = xinv.f * b;
-
-    b = 2.0f - xinv.f * ax;
-    xinv.f = xinv.f * b;
-
-    b = 2.0f - xinv.f * ax;
-    xinv.f = xinv.f * b;
-
-    xinv.f = vdupq_m(xinv.f, INFINITY, vcmpeqq(x, 0.0f));
-    /*
-     * restore sign
-     */
-    xinv.f = vnegq_m(xinv.f, xinv.f, vcmpltq(x, 0.0f));
-
-    return xinv.f;
-}
-
-__STATIC_INLINE f32x4_t vdiv_f32(
-    f32x4_t num, f32x4_t den)
-{
-    return vmulq(num, vrecip_hiprec_f32(den));
-}
-
-/**
-  @brief         Single-precision taylor dev.
-  @param[in]     x              f32 quad vector input
-  @param[in]     coeffs         f32 quad vector coeffs
-  @return        destination    f32 quad vector
- */
-
-__STATIC_INLINE f32x4_t vtaylor_polyq_f32(
-        f32x4_t           x,
-        const float32_t * coeffs)
-{
-    f32x4_t         A = vfmasq(vdupq_n_f32(coeffs[4]), x, coeffs[0]);
-    f32x4_t         B = vfmasq(vdupq_n_f32(coeffs[6]), x, coeffs[2]);
-    f32x4_t         C = vfmasq(vdupq_n_f32(coeffs[5]), x, coeffs[1]);
-    f32x4_t         D = vfmasq(vdupq_n_f32(coeffs[7]), x, coeffs[3]);
-    f32x4_t         x2 = vmulq(x, x);
-    f32x4_t         x4 = vmulq(x2, x2);
-    f32x4_t         res = vfmaq(vfmaq_f32(A, B, x2), vfmaq_f32(C, D, x2), x4);
-
-    return res;
-}
-
-__STATIC_INLINE f32x4_t vmant_exp_f32(
-    f32x4_t     x,
-    int32x4_t * e)
-{
-    any32x4_t       r;
-    int32x4_t       n;
-
-    r.f = x;
-    n = r.i >> 23;
-    n = n - 127;
-    r.i = r.i - (n << 23);
-
-    *e = n;
-    return r.f;
-}
-
-
-__STATIC_INLINE f32x4_t vlogq_f32(f32x4_t vecIn)
-{
-    q31x4_t         vecExpUnBiased;
-    f32x4_t         vecTmpFlt0, vecTmpFlt1;
-    f32x4_t         vecAcc0, vecAcc1, vecAcc2, vecAcc3;
-    f32x4_t         vecExpUnBiasedFlt;
-
-    /*
-     * extract exponent
-     */
-    vecTmpFlt1 = vmant_exp_f32(vecIn, &vecExpUnBiased);
-
-    vecTmpFlt0 = vecTmpFlt1 * vecTmpFlt1;
-    /*
-     * a = (__logf_lut_f32[4] * r.f) + (__logf_lut_f32[0]);
-     */
-    vecAcc0 = vdupq_n_f32(__logf_lut_f32[0]);
-    vecAcc0 = vfmaq(vecAcc0, vecTmpFlt1, __logf_lut_f32[4]);
-    /*
-     * b = (__logf_lut_f32[6] * r.f) + (__logf_lut_f32[2]);
-     */
-    vecAcc1 = vdupq_n_f32(__logf_lut_f32[2]);
-    vecAcc1 = vfmaq(vecAcc1, vecTmpFlt1, __logf_lut_f32[6]);
-    /*
-     * c = (__logf_lut_f32[5] * r.f) + (__logf_lut_f32[1]);
-     */
-    vecAcc2 = vdupq_n_f32(__logf_lut_f32[1]);
-    vecAcc2 = vfmaq(vecAcc2, vecTmpFlt1, __logf_lut_f32[5]);
-    /*
-     * d = (__logf_lut_f32[7] * r.f) + (__logf_lut_f32[3]);
-     */
-    vecAcc3 = vdupq_n_f32(__logf_lut_f32[3]);
-    vecAcc3 = vfmaq(vecAcc3, vecTmpFlt1, __logf_lut_f32[7]);
-    /*
-     * a = a + b * xx;
-     */
-    vecAcc0 = vfmaq(vecAcc0, vecAcc1, vecTmpFlt0);
-    /*
-     * c = c + d * xx;
-     */
-    vecAcc2 = vfmaq(vecAcc2, vecAcc3, vecTmpFlt0);
-    /*
-     * xx = xx * xx;
-     */
-    vecTmpFlt0 = vecTmpFlt0 * vecTmpFlt0;
-    vecExpUnBiasedFlt = vcvtq_f32_s32(vecExpUnBiased);
-    /*
-     * r.f = a + c * xx;
-     */
-    vecAcc0 = vfmaq(vecAcc0, vecAcc2, vecTmpFlt0);
-    /*
-     * add exponent
-     * r.f = r.f + ((float32_t) m) * __logf_rng_f32;
-     */
-    vecAcc0 = vfmaq(vecAcc0, vecExpUnBiasedFlt, __logf_rng_f32);
-    // set log0 down to -inf
-    vecAcc0 = vdupq_m(vecAcc0, -INFINITY, vcmpeqq(vecIn, 0.0f));
-    return vecAcc0;
-}
-
-__STATIC_INLINE f32x4_t vexpq_f32(
-    f32x4_t x)
-{
-    // Perform range reduction [-log(2),log(2)]
-    int32x4_t       m = vcvtq_s32_f32(vmulq_n_f32(x, 1.4426950408f));
-    f32x4_t         val = vfmsq_f32(x, vcvtq_f32_s32(m), vdupq_n_f32(0.6931471805f));
-
-    // Polynomial Approximation
-    f32x4_t         poly = vtaylor_polyq_f32(val, exp_tab);
-
-    // Reconstruct
-    poly = (f32x4_t) (vqaddq_s32((q31x4_t) (poly), vqshlq_n_s32(m, 23)));
-
-    poly = vdupq_m(poly, 0.0f, vcmpltq_n_s32(m, -126));
-    return poly;
-}
-
-__STATIC_INLINE f32x4_t arm_vec_exponent_f32(f32x4_t x, int32_t nb)
-{
-    f32x4_t         r = x;
-    nb--;
-    while (nb > 0) {
-        r = vmulq(r, x);
-        nb--;
-    }
-    return (r);
-}
-
-__STATIC_INLINE f32x4_t vrecip_f32(f32x4_t vecIn)
-{
-    f32x4_t     vecSx, vecW, vecTmp;
-    any32x4_t   v;
-
-    vecSx = vabsq(vecIn);
-
-    v.f = vecIn;
-    v.i = vsubq(vdupq_n_s32(INV_NEWTON_INIT_F32), v.i);
-
-    vecW = vmulq(vecSx, v.f);
-
-    // v.f = v.f * (8 + w * (-28 + w * (56 + w * (-70 + w *(56 + w * (-28 + w * (8 - w)))))));
-    vecTmp = vsubq(vdupq_n_f32(8.0f), vecW);
-    vecTmp = vfmasq(vecW, vecTmp, -28.0f);
-    vecTmp = vfmasq(vecW, vecTmp, 56.0f);
-    vecTmp = vfmasq(vecW, vecTmp, -70.0f);
-    vecTmp = vfmasq(vecW, vecTmp, 56.0f);
-    vecTmp = vfmasq(vecW, vecTmp, -28.0f);
-    vecTmp = vfmasq(vecW, vecTmp, 8.0f);
-    v.f = vmulq(v.f,  vecTmp);
-
-    v.f = vdupq_m(v.f, INFINITY, vcmpeqq(vecIn, 0.0f));
-    /*
-     * restore sign
-     */
-    v.f = vnegq_m(v.f, v.f, vcmpltq(vecIn, 0.0f));
-    return v.f;
-}
-
-__STATIC_INLINE f32x4_t vtanhq_f32(
-    f32x4_t val)
-{
-    f32x4_t         x =
-        vminnmq_f32(vmaxnmq_f32(val, vdupq_n_f32(-10.f)), vdupq_n_f32(10.0f));
-    f32x4_t         exp2x = vexpq_f32(vmulq_n_f32(x, 2.f));
-    f32x4_t         num = vsubq_n_f32(exp2x, 1.f);
-    f32x4_t         den = vaddq_n_f32(exp2x, 1.f);
-    f32x4_t         tanh = vmulq_f32(num, vrecip_f32(den));
-    return tanh;
-}
-
-__STATIC_INLINE f32x4_t vpowq_f32(
-    f32x4_t val,
-    f32x4_t n)
-{
-    return vexpq_f32(vmulq_f32(n, vlogq_f32(val)));
-}
-
-#endif /* (defined(ARM_MATH_MVEF) || defined(ARM_MATH_HELIUM)) && !defined(ARM_MATH_AUTOVECTORIZE)*/
-
-#if (defined(ARM_MATH_MVEI) || defined(ARM_MATH_HELIUM))
-#endif /* (defined(ARM_MATH_MVEI) || defined(ARM_MATH_HELIUM)) */
-
-#if (defined(ARM_MATH_NEON) || defined(ARM_MATH_NEON_EXPERIMENTAL)) && !defined(ARM_MATH_AUTOVECTORIZE)
-
-#include "NEMath.h"
-/**
- * @brief Vectorized integer exponentiation
- * @param[in]    x           value
- * @param[in]    nb          integer exponent >= 1
- * @return x^nb
- *
- */
-__STATIC_INLINE  float32x4_t arm_vec_exponent_f32(float32x4_t x, int32_t nb)
-{
-    float32x4_t r = x;
-    nb --;
-    while(nb > 0)
-    {
-        r = vmulq_f32(r , x);
-        nb--;
-    }
-    return(r);
-}
-
-
-__STATIC_INLINE float32x4_t __arm_vec_sqrt_f32_neon(float32x4_t  x)
-{
-    float32x4_t x1 = vmaxq_f32(x, vdupq_n_f32(FLT_MIN));
-    float32x4_t e = vrsqrteq_f32(x1);
-    e = vmulq_f32(vrsqrtsq_f32(vmulq_f32(x1, e), e), e);
-    e = vmulq_f32(vrsqrtsq_f32(vmulq_f32(x1, e), e), e);
-    return vmulq_f32(x, e);
-}
-
-__STATIC_INLINE int16x8_t __arm_vec_sqrt_q15_neon(int16x8_t vec)
-{
-    float32x4_t tempF;
-    int32x4_t tempHI,tempLO;
-
-    tempLO = vmovl_s16(vget_low_s16(vec));
-    tempF = vcvtq_n_f32_s32(tempLO,15);
-    tempF = __arm_vec_sqrt_f32_neon(tempF);
-    tempLO = vcvtq_n_s32_f32(tempF,15);
-
-    tempHI = vmovl_s16(vget_high_s16(vec));
-    tempF = vcvtq_n_f32_s32(tempHI,15);
-    tempF = __arm_vec_sqrt_f32_neon(tempF);
-    tempHI = vcvtq_n_s32_f32(tempF,15);
-
-    return(vcombine_s16(vqmovn_s32(tempLO),vqmovn_s32(tempHI)));
-}
-
-__STATIC_INLINE int32x4_t __arm_vec_sqrt_q31_neon(int32x4_t vec)
-{
-  float32x4_t temp;
-
-  temp = vcvtq_n_f32_s32(vec,31);
-  temp = __arm_vec_sqrt_f32_neon(temp);
-  return(vcvtq_n_s32_f32(temp,31));
-}
-
-#endif /*  (defined(ARM_MATH_NEON) || defined(ARM_MATH_NEON_EXPERIMENTAL)) && !defined(ARM_MATH_AUTOVECTORIZE) */
-
-#ifdef   __cplusplus
-}
-#endif
-
-
-#endif /* _ARM_VEC_MATH_H */
-
-/**
- *
- * End of file.
- */
+/******************************************************************************
+ * @file     arm_vec_math.h
+ * @brief    Public header file for CMSIS DSP Library
+ * @version  V1.7.0
+ * @date     15. October 2019
+ ******************************************************************************/
+/*
+ * Copyright (c) 2010-2019 Arm Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef _ARM_VEC_MATH_H
+#define _ARM_VEC_MATH_H
+
+#include "arm_math.h"
+#include "arm_common_tables.h"
+#include "arm_helium_utils.h"
+
+#ifdef   __cplusplus
+extern "C"
+{
+#endif
+
+#if (defined(ARM_MATH_MVEF) || defined(ARM_MATH_HELIUM)) && !defined(ARM_MATH_AUTOVECTORIZE)
+
+#define INV_NEWTON_INIT_F32         0x7EF127EA  /* integer seed: vrecip_f32 subtracts the input's bit pattern from it to form its first 1/x guess */
+/* factor applied to the unbiased exponent in vlogq_f32; the literal is ln(2) to single precision */
+static const float32_t __logf_rng_f32=0.693147180f;
+
+
+/* fast inverse approximation (3x newton): per-lane 1/x on |x| with the sign restored at the end; lanes where x == 0 return INFINITY */
+__STATIC_INLINE f32x4_t vrecip_medprec_f32(
+    f32x4_t x)
+{
+    q31x4_t         m;
+    f32x4_t         b;
+    any32x4_t       xinv;
+    f32x4_t         ax = vabsq(x);
+    /* m = bits of 1.0f minus the exponent field of |x|: adding m once rescales |x| to exponent 0, adding it again applies the inverse scale to the estimate */
+    xinv.f = ax;
+    m = 0x3F800000 - (xinv.i & 0x7F800000);
+    xinv.i = xinv.i + m;
+    xinv.f = 1.41176471f - 0.47058824f * xinv.f;   /* linear first guess of the reciprocal of the rescaled value */
+    xinv.i = xinv.i + m;
+    /* Newton-Raphson refinement xinv *= (2 - xinv * |x|), repeated three times */
+    b = 2.0f - xinv.f * ax;
+    xinv.f = xinv.f * b;
+
+    b = 2.0f - xinv.f * ax;
+    xinv.f = xinv.f * b;
+
+    b = 2.0f - xinv.f * ax;
+    xinv.f = xinv.f * b;
+    /* zero input: overwrite the lane with INFINITY */
+    xinv.f = vdupq_m(xinv.f, INFINITY, vcmpeqq(x, 0.0f));
+    /*
+     * restore sign (everything above operated on |x|)
+     */
+    xinv.f = vnegq_m(xinv.f, xinv.f, vcmpltq(x, 0.0f));
+
+    return xinv.f;
+}
+
+/* fast inverse approximation (4x newton) */
+__STATIC_INLINE f32x4_t vrecip_hiprec_f32(
+    f32x4_t x)
+{
+    q31x4_t         m;
+    f32x4_t         b;
+    any32x4_t       xinv;
+    f32x4_t         ax = vabsq(x);
+
+    xinv.f = ax;
+
+    m = 0x3F800000 - (xinv.i & 0x7F800000);
+    xinv.i = xinv.i + m;
+    xinv.f = 1.41176471f - 0.47058824f * xinv.f;
+    xinv.i = xinv.i + m;
+
+    b = 2.0f - xinv.f * ax;
+    xinv.f = xinv.f * b;
+
+    b = 2.0f - xinv.f * ax;
+    xinv.f = xinv.f * b;
+
+    b = 2.0f - xinv.f * ax;
+    xinv.f = xinv.f * b;
+
+    b = 2.0f - xinv.f * ax;
+    xinv.f = xinv.f * b;
+
+    xinv.f = vdupq_m(xinv.f, INFINITY, vcmpeqq(x, 0.0f));
+    /*
+     * restore sign
+     */
+    xinv.f = vnegq_m(xinv.f, xinv.f, vcmpltq(x, 0.0f));
+
+    return xinv.f;
+}
+
+__STATIC_INLINE f32x4_t vdiv_f32(
+    f32x4_t num, f32x4_t den)
+{
+    return vmulq(num, vrecip_hiprec_f32(den));
+}
+
+/**
+  @brief         Single-precision Taylor polynomial evaluation (8 coefficients).
+  @param[in]     x              f32 quad vector input
+  @param[in]     coeffs         f32 quad vector coeffs
+  @return        destination    f32 quad vector
+ */
+
+__STATIC_INLINE f32x4_t vtaylor_polyq_f32(
+        f32x4_t           x,
+        const float32_t * coeffs)
+{
+    f32x4_t         A = vfmasq(vdupq_n_f32(coeffs[4]), x, coeffs[0]);
+    f32x4_t         B = vfmasq(vdupq_n_f32(coeffs[6]), x, coeffs[2]);
+    f32x4_t         C = vfmasq(vdupq_n_f32(coeffs[5]), x, coeffs[1]);
+    f32x4_t         D = vfmasq(vdupq_n_f32(coeffs[7]), x, coeffs[3]);
+    f32x4_t         x2 = vmulq(x, x);
+    f32x4_t         x4 = vmulq(x2, x2);
+    f32x4_t         res = vfmaq(vfmaq_f32(A, B, x2), vfmaq_f32(C, D, x2), x4);
+
+    return res;
+}
+
+__STATIC_INLINE f32x4_t vmant_exp_f32(
+    f32x4_t     x,
+    int32x4_t * e)
+{
+    any32x4_t       r;
+    int32x4_t       n;
+
+    r.f = x;
+    n = r.i >> 23;
+    n = n - 127;
+    r.i = r.i - (n << 23);
+
+    *e = n;
+    return r.f;
+}
+
+
+__STATIC_INLINE f32x4_t vlogq_f32(f32x4_t vecIn)
+{
+    q31x4_t         vecExpUnBiased;
+    f32x4_t         vecTmpFlt0, vecTmpFlt1;
+    f32x4_t         vecAcc0, vecAcc1, vecAcc2, vecAcc3;
+    f32x4_t         vecExpUnBiasedFlt;
+
+    /*
+     * extract exponent
+     */
+    vecTmpFlt1 = vmant_exp_f32(vecIn, &vecExpUnBiased);
+
+    vecTmpFlt0 = vecTmpFlt1 * vecTmpFlt1;
+    /*
+     * a = (__logf_lut_f32[4] * r.f) + (__logf_lut_f32[0]);
+     */
+    vecAcc0 = vdupq_n_f32(__logf_lut_f32[0]);
+    vecAcc0 = vfmaq(vecAcc0, vecTmpFlt1, __logf_lut_f32[4]);
+    /*
+     * b = (__logf_lut_f32[6] * r.f) + (__logf_lut_f32[2]);
+     */
+    vecAcc1 = vdupq_n_f32(__logf_lut_f32[2]);
+    vecAcc1 = vfmaq(vecAcc1, vecTmpFlt1, __logf_lut_f32[6]);
+    /*
+     * c = (__logf_lut_f32[5] * r.f) + (__logf_lut_f32[1]);
+     */
+    vecAcc2 = vdupq_n_f32(__logf_lut_f32[1]);
+    vecAcc2 = vfmaq(vecAcc2, vecTmpFlt1, __logf_lut_f32[5]);
+    /*
+     * d = (__logf_lut_f32[7] * r.f) + (__logf_lut_f32[3]);
+     */
+    vecAcc3 = vdupq_n_f32(__logf_lut_f32[3]);
+    vecAcc3 = vfmaq(vecAcc3, vecTmpFlt1, __logf_lut_f32[7]);
+    /*
+     * a = a + b * xx;
+     */
+    vecAcc0 = vfmaq(vecAcc0, vecAcc1, vecTmpFlt0);
+    /*
+     * c = c + d * xx;
+     */
+    vecAcc2 = vfmaq(vecAcc2, vecAcc3, vecTmpFlt0);
+    /*
+     * xx = xx * xx;
+     */
+    vecTmpFlt0 = vecTmpFlt0 * vecTmpFlt0;
+    vecExpUnBiasedFlt = vcvtq_f32_s32(vecExpUnBiased);
+    /*
+     * r.f = a + c * xx;
+     */
+    vecAcc0 = vfmaq(vecAcc0, vecAcc2, vecTmpFlt0);
+    /*
+     * add exponent
+     * r.f = r.f + ((float32_t) m) * __logf_rng_f32;
+     */
+    vecAcc0 = vfmaq(vecAcc0, vecExpUnBiasedFlt, __logf_rng_f32);
+    // set log0 down to -inf
+    vecAcc0 = vdupq_m(vecAcc0, -INFINITY, vcmpeqq(vecIn, 0.0f));
+    return vecAcc0;
+}
+
+__STATIC_INLINE f32x4_t vexpq_f32(
+    f32x4_t x)
+{
+    // Perform range reduction [-log(2),log(2)]
+    int32x4_t       m = vcvtq_s32_f32(vmulq_n_f32(x, 1.4426950408f));
+    f32x4_t         val = vfmsq_f32(x, vcvtq_f32_s32(m), vdupq_n_f32(0.6931471805f));
+
+    // Polynomial Approximation
+    f32x4_t         poly = vtaylor_polyq_f32(val, exp_tab);
+
+    // Reconstruct
+    poly = (f32x4_t) (vqaddq_s32((q31x4_t) (poly), vqshlq_n_s32(m, 23)));
+
+    poly = vdupq_m(poly, 0.0f, vcmpltq_n_s32(m, -126));
+    return poly;
+}
+
+__STATIC_INLINE f32x4_t arm_vec_exponent_f32(f32x4_t x, int32_t nb)
+{
+    f32x4_t         r = x;
+    nb--;
+    while (nb > 0) {
+        r = vmulq(r, x);
+        nb--;
+    }
+    return (r);
+}
+
+__STATIC_INLINE f32x4_t vrecip_f32(f32x4_t vecIn)
+{
+    f32x4_t     vecSx, vecW, vecTmp;
+    any32x4_t   v;
+
+    vecSx = vabsq(vecIn);
+
+    v.f = vecIn;
+    v.i = vsubq(vdupq_n_s32(INV_NEWTON_INIT_F32), v.i);
+
+    vecW = vmulq(vecSx, v.f);
+
+    // v.f = v.f * (8 + w * (-28 + w * (56 + w * (-70 + w *(56 + w * (-28 + w * (8 - w)))))));
+    vecTmp = vsubq(vdupq_n_f32(8.0f), vecW);
+    vecTmp = vfmasq(vecW, vecTmp, -28.0f);
+    vecTmp = vfmasq(vecW, vecTmp, 56.0f);
+    vecTmp = vfmasq(vecW, vecTmp, -70.0f);
+    vecTmp = vfmasq(vecW, vecTmp, 56.0f);
+    vecTmp = vfmasq(vecW, vecTmp, -28.0f);
+    vecTmp = vfmasq(vecW, vecTmp, 8.0f);
+    v.f = vmulq(v.f,  vecTmp);
+
+    v.f = vdupq_m(v.f, INFINITY, vcmpeqq(vecIn, 0.0f));
+    /*
+     * restore sign
+     */
+    v.f = vnegq_m(v.f, v.f, vcmpltq(vecIn, 0.0f));
+    return v.f;
+}
+
+__STATIC_INLINE f32x4_t vtanhq_f32(
+    f32x4_t val)
+{
+    f32x4_t         x =
+        vminnmq_f32(vmaxnmq_f32(val, vdupq_n_f32(-10.f)), vdupq_n_f32(10.0f));
+    f32x4_t         exp2x = vexpq_f32(vmulq_n_f32(x, 2.f));
+    f32x4_t         num = vsubq_n_f32(exp2x, 1.f);
+    f32x4_t         den = vaddq_n_f32(exp2x, 1.f);
+    f32x4_t         tanh = vmulq_f32(num, vrecip_f32(den));
+    return tanh;
+}
+
+__STATIC_INLINE f32x4_t vpowq_f32(
+    f32x4_t val,
+    f32x4_t n)
+{
+    return vexpq_f32(vmulq_f32(n, vlogq_f32(val)));
+}
+
+#endif /* (defined(ARM_MATH_MVEF) || defined(ARM_MATH_HELIUM)) && !defined(ARM_MATH_AUTOVECTORIZE)*/
+
+#if (defined(ARM_MATH_MVEI) || defined(ARM_MATH_HELIUM))
+#endif /* (defined(ARM_MATH_MVEI) || defined(ARM_MATH_HELIUM)) */
+
+#if (defined(ARM_MATH_NEON) || defined(ARM_MATH_NEON_EXPERIMENTAL)) && !defined(ARM_MATH_AUTOVECTORIZE)
+
+#include "NEMath.h"
+/**
+ * @brief Vectorized integer exponentiation
+ * @param[in]    x           value
+ * @param[in]    nb          integer exponent >= 1
+ * @return x^nb
+ *
+ */
+__STATIC_INLINE  float32x4_t arm_vec_exponent_f32(float32x4_t x, int32_t nb)
+{
+    float32x4_t r = x;
+    nb --;
+    while(nb > 0)
+    {
+        r = vmulq_f32(r , x);
+        nb--;
+    }
+    return(r);
+}
+
+
+__STATIC_INLINE float32x4_t __arm_vec_sqrt_f32_neon(float32x4_t  x)
+{
+    float32x4_t x1 = vmaxq_f32(x, vdupq_n_f32(FLT_MIN));
+    float32x4_t e = vrsqrteq_f32(x1);
+    e = vmulq_f32(vrsqrtsq_f32(vmulq_f32(x1, e), e), e);
+    e = vmulq_f32(vrsqrtsq_f32(vmulq_f32(x1, e), e), e);
+    return vmulq_f32(x, e);
+}
+
+__STATIC_INLINE int16x8_t __arm_vec_sqrt_q15_neon(int16x8_t vec)
+{
+    float32x4_t tempF;
+    int32x4_t tempHI,tempLO;
+
+    tempLO = vmovl_s16(vget_low_s16(vec));
+    tempF = vcvtq_n_f32_s32(tempLO,15);
+    tempF = __arm_vec_sqrt_f32_neon(tempF);
+    tempLO = vcvtq_n_s32_f32(tempF,15);
+
+    tempHI = vmovl_s16(vget_high_s16(vec));
+    tempF = vcvtq_n_f32_s32(tempHI,15);
+    tempF = __arm_vec_sqrt_f32_neon(tempF);
+    tempHI = vcvtq_n_s32_f32(tempF,15);
+
+    return(vcombine_s16(vqmovn_s32(tempLO),vqmovn_s32(tempHI)));
+}
+
+__STATIC_INLINE int32x4_t __arm_vec_sqrt_q31_neon(int32x4_t vec)
+{
+  float32x4_t temp;
+
+  temp = vcvtq_n_f32_s32(vec,31);
+  temp = __arm_vec_sqrt_f32_neon(temp);
+  return(vcvtq_n_s32_f32(temp,31));
+}
+
+#endif /*  (defined(ARM_MATH_NEON) || defined(ARM_MATH_NEON_EXPERIMENTAL)) && !defined(ARM_MATH_AUTOVECTORIZE) */
+
+#ifdef   __cplusplus
+}
+#endif
+
+
+#endif /* _ARM_VEC_MATH_H */
+
+/**
+ *
+ * End of file.
+ */
--- a/libraries/cmsis/cm4/core_support/cmsis_armcc.h
+++ b/libraries/cmsis/cm4/core_support/cmsis_armcc.h
--- a/libraries/cmsis/cm4/core_support/cmsis_armclang.h
+++ b/libraries/cmsis/cm4/core_support/cmsis_armclang.h
--- a/libraries/cmsis/cm4/core_support/cmsis_armclang_ltm.h
+++ b/libraries/cmsis/cm4/core_support/cmsis_armclang_ltm.h
--- a/libraries/cmsis/cm4/core_support/cmsis_gcc.h
+++ b/libraries/cmsis/cm4/core_support/cmsis_gcc.h
--- a/libraries/cmsis/cm4/core_support/cmsis_iccarm.h
+++ b/libraries/cmsis/cm4/core_support/cmsis_iccarm.h
@@ -8,7 +8,7 @@
 //------------------------------------------------------------------------------
 //
 // Copyright (c) 2017-2019 IAR Systems
-// Copyright (c) 2017-2019 Arm Limited. All rights reserved. 
+// Copyright (c) 2017-2019 Arm Limited. All rights reserved.
 //
 // SPDX-License-Identifier: Apache-2.0
 //
--- a/libraries/cmsis/cm4/core_support/core_cm4.h
+++ b/libraries/cmsis/cm4/core_support/core_cm4.h
@@ -198,7 +198,7 @@
    #define __VTOR_PRESENT             1U
    #warning "__VTOR_PRESENT not defined in device header file; using default!"
  #endif
-  
+
  #ifndef __NVIC_PRIO_BITS
    #define __NVIC_PRIO_BITS          3U
    #warning "__NVIC_PRIO_BITS not defined in device header file; using default!"
--- a/libraries/cmsis/cm4/core_support/mpu_armv7.h
+++ b/libraries/cmsis/cm4/core_support/mpu_armv7.h
@@ -21,13 +21,13 @@
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
- 
+
 #if   defined ( __ICCARM__ )
  #pragma system_include         /* treat file as system include file for MISRA check */
 #elif defined (__clang__)
  #pragma clang system_header    /* treat file as system include file */
 #endif
- 
+
 #ifndef ARM_MPU_ARMV7_H
 #define ARM_MPU_ARMV7_H

@@ -79,12 +79,12 @@

 /**
 * MPU Memory Access Attributes
-* 
+*
 * \param TypeExtField      Type extension field, allows you to configure memory access type, for example strongly ordered, peripheral.
 * \param IsShareable       Region is shareable between multiple bus masters.
 * \param IsCacheable       Region is cacheable, i.e. its value may be kept in cache.
 * \param IsBufferable      Region is bufferable, i.e. using write-back caching. Cacheable but non-bufferable regions use write-through policy.
-*/  
+*/
 #define ARM_MPU_ACCESS_(TypeExtField, IsShareable, IsCacheable, IsBufferable)   \
  ((((TypeExtField) << MPU_RASR_TEX_Pos) & MPU_RASR_TEX_Msk)                  | \
   (((IsShareable)  << MPU_RASR_S_Pos)   & MPU_RASR_S_Msk)                    | \
@@ -93,7 +93,7 @@

 /**
 * MPU Region Attribute and Size Register Value
-* 
+*
 * \param DisableExec       Instruction access disable bit, 1= disable instruction fetches.
 * \param AccessPermission  Data access permissions, allows you to configure read/write access for User and Privileged mode.
 * \param AccessAttributes  Memory access attribution, see \ref ARM_MPU_ACCESS_.
@@ -110,7 +110,7 @@

 /**
 * MPU Region Attribute and Size Register Value
-* 
+*
 * \param DisableExec       Instruction access disable bit, 1= disable instruction fetches.
 * \param AccessPermission  Data access permissions, allows you to configure read/write access for User and Privileged mode.
 * \param TypeExtField      Type extension field, allows you to configure memory access type, for example strongly ordered, peripheral.
@@ -119,7 +119,7 @@
 * \param IsBufferable      Region is bufferable, i.e. using write-back caching. Cacheable but non-bufferable regions use write-through policy.
 * \param SubRegionDisable  Sub-region disable field.
 * \param Size              Region size of the region to be configured, for example 4K, 8K.
-*/                         
+*/
 #define ARM_MPU_RASR(DisableExec, AccessPermission, TypeExtField, IsShareable, IsCacheable, IsBufferable, SubRegionDisable, Size) \
  ARM_MPU_RASR_EX(DisableExec, AccessPermission, ARM_MPU_ACCESS_(TypeExtField, IsShareable, IsCacheable, IsBufferable), SubRegionDisable, Size)

@@ -129,7 +129,7 @@
 *  - Shareable
 *  - Non-cacheable
 *  - Non-bufferable
-*/ 
+*/
 #define ARM_MPU_ACCESS_ORDERED ARM_MPU_ACCESS_(0U, 1U, 0U, 0U)

 /**
@@ -140,7 +140,7 @@
 *  - Bufferable (if shareable) or non-bufferable (if non-shareable)
 *
 * \param IsShareable Configures the device memory as shareable or non-shareable.
-*/ 
+*/
 #define ARM_MPU_ACCESS_DEVICE(IsShareable) ((IsShareable) ? ARM_MPU_ACCESS_(0U, 1U, 0U, 1U) : ARM_MPU_ACCESS_(2U, 0U, 0U, 0U))

 /**
@@ -153,7 +153,7 @@
 * \param OuterCp Configures the outer cache policy.
 * \param InnerCp Configures the inner cache policy.
 * \param IsShareable Configures the memory as shareable or non-shareable.
-*/ 
+*/
 #define ARM_MPU_ACCESS_NORMAL(OuterCp, InnerCp, IsShareable) ARM_MPU_ACCESS_((4U | (OuterCp)), IsShareable, ((InnerCp) >> 1U), ((InnerCp) & 1U))

 /**
@@ -184,7 +184,7 @@ typedef struct {
  uint32_t RBAR; //!< The region base address register value (RBAR)
  uint32_t RASR; //!< The region attribute and size register value (RASR) \ref MPU_RASR
 } ARM_MPU_Region_t;
-    
+
 /** Enable the MPU.
 * \param MPU_Control Default access permissions for unconfigured regions.
 */
@@ -224,7 +224,7 @@ __STATIC_INLINE void ARM_MPU_ClrRegion(uint32_t rnr)
 /** Configure an MPU region.
 * \param rbar Value for RBAR register.
 * \param rasr Value for RASR register.
-*/   
+*/
 __STATIC_INLINE void ARM_MPU_SetRegion(uint32_t rbar, uint32_t rasr)
 {
  MPU->RBAR = rbar;
@@ -235,7 +235,7 @@ __STATIC_INLINE void ARM_MPU_SetRegion(uint32_t rbar, uint32_t rasr)
 * \param rnr Region number to be configured.
 * \param rbar Value for RBAR register.
 * \param rsar Value for RSAR register.
-*/   
+*/
 __STATIC_INLINE void ARM_MPU_SetRegionEx(uint32_t rnr, uint32_t rbar, uint32_t rasr)
 {
  MPU->RNR = rnr;
@@ -251,7 +251,7 @@ __STATIC_INLINE void ARM_MPU_SetRegionEx(uint32_t rnr, uint32_t rbar, uint32_t r
 __STATIC_INLINE void ARM_MPU_OrderedMemcpy(volatile uint32_t* dst, const uint32_t* __RESTRICT src, uint32_t len)
 {
  uint32_t i;
-  for (i = 0U; i < len; ++i) 
+  for (i = 0U; i < len; ++i)
  {
    dst[i] = src[i];
  }
@@ -261,7 +261,7 @@ __STATIC_INLINE void ARM_MPU_OrderedMemcpy(volatile uint32_t* dst, const uint32_
 * \param table Pointer to the MPU configuration table.
 * \param cnt Amount of regions to be configured.
 */
-__STATIC_INLINE void ARM_MPU_Load(ARM_MPU_Region_t const* table, uint32_t cnt) 
+__STATIC_INLINE void ARM_MPU_Load(ARM_MPU_Region_t const* table, uint32_t cnt)
 {
  const uint32_t rowWordSize = sizeof(ARM_MPU_Region_t)/4U;
  while (cnt > MPU_TYPE_RALIASES) {
--- a/libraries/cmsis/cm4/core_support/mpu_armv8.h
+++ b/libraries/cmsis/cm4/core_support/mpu_armv8.h
@@ -1,352 +1,352 @@
-/******************************************************************************
- * @file     mpu_armv8.h
- * @brief    CMSIS MPU API for Armv8-M and Armv8.1-M MPU
- * @version  V5.1.2
- * @date     10. February 2020
- ******************************************************************************/
-/*
- * Copyright (c) 2017-2020 Arm Limited. All rights reserved.
- *
- * SPDX-License-Identifier: Apache-2.0
- *
- * Licensed under the Apache License, Version 2.0 (the License); you may
- * not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an AS IS BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#if   defined ( __ICCARM__ )
-  #pragma system_include         /* treat file as system include file for MISRA check */
-#elif defined (__clang__)
-  #pragma clang system_header    /* treat file as system include file */
-#endif
-
-#ifndef ARM_MPU_ARMV8_H
-#define ARM_MPU_ARMV8_H
-
-/** \brief Attribute for device memory (outer only) */
-#define ARM_MPU_ATTR_DEVICE                           ( 0U )
-
-/** \brief Attribute for non-cacheable, normal memory */
-#define ARM_MPU_ATTR_NON_CACHEABLE                    ( 4U )
-
-/** \brief Attribute for normal memory (outer and inner)
-* \param NT Non-Transient: Set to 1 for non-transient data.
-* \param WB Write-Back: Set to 1 to use write-back update policy.
-* \param RA Read Allocation: Set to 1 to use cache allocation on read miss.
-* \param WA Write Allocation: Set to 1 to use cache allocation on write miss.
-*/
-#define ARM_MPU_ATTR_MEMORY_(NT, WB, RA, WA) \
-  ((((NT) & 1U) << 3U) | (((WB) & 1U) << 2U) | (((RA) & 1U) << 1U) | ((WA) & 1U))
-
-/** \brief Device memory type non Gathering, non Re-ordering, non Early Write Acknowledgement */
-#define ARM_MPU_ATTR_DEVICE_nGnRnE (0U)
-
-/** \brief Device memory type non Gathering, non Re-ordering, Early Write Acknowledgement */
-#define ARM_MPU_ATTR_DEVICE_nGnRE  (1U)
-
-/** \brief Device memory type non Gathering, Re-ordering, Early Write Acknowledgement */
-#define ARM_MPU_ATTR_DEVICE_nGRE   (2U)
-
-/** \brief Device memory type Gathering, Re-ordering, Early Write Acknowledgement */
-#define ARM_MPU_ATTR_DEVICE_GRE    (3U)
-
-/** \brief Memory Attribute
-* \param O Outer memory attributes
-* \param I O == ARM_MPU_ATTR_DEVICE: Device memory attributes, else: Inner memory attributes
-*/
-#define ARM_MPU_ATTR(O, I) ((((O) & 0xFU) << 4U) | ((((O) & 0xFU) != 0U) ? ((I) & 0xFU) : (((I) & 0x3U) << 2U)))
-
-/** \brief Normal memory non-shareable  */
-#define ARM_MPU_SH_NON   (0U)
-
-/** \brief Normal memory outer shareable  */
-#define ARM_MPU_SH_OUTER (2U)
-
-/** \brief Normal memory inner shareable  */
-#define ARM_MPU_SH_INNER (3U)
-
-/** \brief Memory access permissions
-* \param RO Read-Only: Set to 1 for read-only memory.
-* \param NP Non-Privileged: Set to 1 for non-privileged memory.
-*/
-#define ARM_MPU_AP_(RO, NP) ((((RO) & 1U) << 1U) | ((NP) & 1U))
-
-/** \brief Region Base Address Register value
-* \param BASE The base address bits [31:5] of a memory region. The value is zero extended. Effective address gets 32 byte aligned.
-* \param SH Defines the Shareability domain for this memory region.
-* \param RO Read-Only: Set to 1 for a read-only memory region.
-* \param NP Non-Privileged: Set to 1 for a non-privileged memory region.
-* \oaram XN eXecute Never: Set to 1 for a non-executable memory region.
-*/
-#define ARM_MPU_RBAR(BASE, SH, RO, NP, XN) \
-  (((BASE) & MPU_RBAR_BASE_Msk) | \
-  (((SH) << MPU_RBAR_SH_Pos) & MPU_RBAR_SH_Msk) | \
-  ((ARM_MPU_AP_(RO, NP) << MPU_RBAR_AP_Pos) & MPU_RBAR_AP_Msk) | \
-  (((XN) << MPU_RBAR_XN_Pos) & MPU_RBAR_XN_Msk))
-
-/** \brief Region Limit Address Register value
-* \param LIMIT The limit address bits [31:5] for this memory region. The value is one extended.
-* \param IDX The attribute index to be associated with this memory region.
-*/
-#define ARM_MPU_RLAR(LIMIT, IDX) \
-  (((LIMIT) & MPU_RLAR_LIMIT_Msk) | \
-  (((IDX) << MPU_RLAR_AttrIndx_Pos) & MPU_RLAR_AttrIndx_Msk) | \
-  (MPU_RLAR_EN_Msk))
-
-#if defined(MPU_RLAR_PXN_Pos)
-  
-/** \brief Region Limit Address Register with PXN value
-* \param LIMIT The limit address bits [31:5] for this memory region. The value is one extended.
-* \param PXN Privileged execute never. Defines whether code can be executed from this privileged region.
-* \param IDX The attribute index to be associated with this memory region.
-*/
-#define ARM_MPU_RLAR_PXN(LIMIT, PXN, IDX) \
-  (((LIMIT) & MPU_RLAR_LIMIT_Msk) | \
-  (((PXN) << MPU_RLAR_PXN_Pos) & MPU_RLAR_PXN_Msk) | \
-  (((IDX) << MPU_RLAR_AttrIndx_Pos) & MPU_RLAR_AttrIndx_Msk) | \
-  (MPU_RLAR_EN_Msk))
-  
-#endif
-
-/**
-* Struct for a single MPU Region
-*/
-typedef struct {
-  uint32_t RBAR;                   /*!< Region Base Address Register value */
-  uint32_t RLAR;                   /*!< Region Limit Address Register value */
-} ARM_MPU_Region_t;
-    
-/** Enable the MPU.
-* \param MPU_Control Default access permissions for unconfigured regions.
-*/
-__STATIC_INLINE void ARM_MPU_Enable(uint32_t MPU_Control)
-{
-  __DMB();
-  MPU->CTRL = MPU_Control | MPU_CTRL_ENABLE_Msk;
-#ifdef SCB_SHCSR_MEMFAULTENA_Msk
-  SCB->SHCSR |= SCB_SHCSR_MEMFAULTENA_Msk;
-#endif
-  __DSB();
-  __ISB();
-}
-
-/** Disable the MPU.
-*/
-__STATIC_INLINE void ARM_MPU_Disable(void)
-{
-  __DMB();
-#ifdef SCB_SHCSR_MEMFAULTENA_Msk
-  SCB->SHCSR &= ~SCB_SHCSR_MEMFAULTENA_Msk;
-#endif
-  MPU->CTRL  &= ~MPU_CTRL_ENABLE_Msk;
-  __DSB();
-  __ISB();
-}
-
-#ifdef MPU_NS
-/** Enable the Non-secure MPU.
-* \param MPU_Control Default access permissions for unconfigured regions.
-*/
-__STATIC_INLINE void ARM_MPU_Enable_NS(uint32_t MPU_Control)
-{
-  __DMB();
-  MPU_NS->CTRL = MPU_Control | MPU_CTRL_ENABLE_Msk;
-#ifdef SCB_SHCSR_MEMFAULTENA_Msk
-  SCB_NS->SHCSR |= SCB_SHCSR_MEMFAULTENA_Msk;
-#endif
-  __DSB();
-  __ISB();
-}
-
-/** Disable the Non-secure MPU.
-*/
-__STATIC_INLINE void ARM_MPU_Disable_NS(void)
-{
-  __DMB();
-#ifdef SCB_SHCSR_MEMFAULTENA_Msk
-  SCB_NS->SHCSR &= ~SCB_SHCSR_MEMFAULTENA_Msk;
-#endif
-  MPU_NS->CTRL  &= ~MPU_CTRL_ENABLE_Msk;
-  __DSB();
-  __ISB();
-}
-#endif
-
-/** Set the memory attribute encoding to the given MPU.
-* \param mpu Pointer to the MPU to be configured.
-* \param idx The attribute index to be set [0-7]
-* \param attr The attribute value to be set.
-*/
-__STATIC_INLINE void ARM_MPU_SetMemAttrEx(MPU_Type* mpu, uint8_t idx, uint8_t attr)
-{
-  const uint8_t reg = idx / 4U;
-  const uint32_t pos = ((idx % 4U) * 8U);
-  const uint32_t mask = 0xFFU << pos;
-  
-  if (reg >= (sizeof(mpu->MAIR) / sizeof(mpu->MAIR[0]))) {
-    return; // invalid index
-  }
-  
-  mpu->MAIR[reg] = ((mpu->MAIR[reg] & ~mask) | ((attr << pos) & mask));
-}
-
-/** Set the memory attribute encoding.
-* \param idx The attribute index to be set [0-7]
-* \param attr The attribute value to be set.
-*/
-__STATIC_INLINE void ARM_MPU_SetMemAttr(uint8_t idx, uint8_t attr)
-{
-  ARM_MPU_SetMemAttrEx(MPU, idx, attr);
-}
-
-#ifdef MPU_NS
-/** Set the memory attribute encoding to the Non-secure MPU.
-* \param idx The attribute index to be set [0-7]
-* \param attr The attribute value to be set.
-*/
-__STATIC_INLINE void ARM_MPU_SetMemAttr_NS(uint8_t idx, uint8_t attr)
-{
-  ARM_MPU_SetMemAttrEx(MPU_NS, idx, attr);
-}
-#endif
-
-/** Clear and disable the given MPU region of the given MPU.
-* \param mpu Pointer to MPU to be used.
-* \param rnr Region number to be cleared.
-*/
-__STATIC_INLINE void ARM_MPU_ClrRegionEx(MPU_Type* mpu, uint32_t rnr)
-{
-  mpu->RNR = rnr;
-  mpu->RLAR = 0U;
-}
-
-/** Clear and disable the given MPU region.
-* \param rnr Region number to be cleared.
-*/
-__STATIC_INLINE void ARM_MPU_ClrRegion(uint32_t rnr)
-{
-  ARM_MPU_ClrRegionEx(MPU, rnr);
-}
-
-#ifdef MPU_NS
-/** Clear and disable the given Non-secure MPU region.
-* \param rnr Region number to be cleared.
-*/
-__STATIC_INLINE void ARM_MPU_ClrRegion_NS(uint32_t rnr)
-{  
-  ARM_MPU_ClrRegionEx(MPU_NS, rnr);
-}
-#endif
-
-/** Configure the given MPU region of the given MPU.
-* \param mpu Pointer to MPU to be used.
-* \param rnr Region number to be configured.
-* \param rbar Value for RBAR register.
-* \param rlar Value for RLAR register.
-*/   
-__STATIC_INLINE void ARM_MPU_SetRegionEx(MPU_Type* mpu, uint32_t rnr, uint32_t rbar, uint32_t rlar)
-{
-  mpu->RNR = rnr;
-  mpu->RBAR = rbar;
-  mpu->RLAR = rlar;
-}
-
-/** Configure the given MPU region.
-* \param rnr Region number to be configured.
-* \param rbar Value for RBAR register.
-* \param rlar Value for RLAR register.
-*/   
-__STATIC_INLINE void ARM_MPU_SetRegion(uint32_t rnr, uint32_t rbar, uint32_t rlar)
-{
-  ARM_MPU_SetRegionEx(MPU, rnr, rbar, rlar);
-}
-
-#ifdef MPU_NS
-/** Configure the given Non-secure MPU region.
-* \param rnr Region number to be configured.
-* \param rbar Value for RBAR register.
-* \param rlar Value for RLAR register.
-*/   
-__STATIC_INLINE void ARM_MPU_SetRegion_NS(uint32_t rnr, uint32_t rbar, uint32_t rlar)
-{
-  ARM_MPU_SetRegionEx(MPU_NS, rnr, rbar, rlar);  
-}
-#endif
-
-/** Memcopy with strictly ordered memory access, e.g. for register targets.
-* \param dst Destination data is copied to.
-* \param src Source data is copied from.
-* \param len Amount of data words to be copied.
-*/
-__STATIC_INLINE void ARM_MPU_OrderedMemcpy(volatile uint32_t* dst, const uint32_t* __RESTRICT src, uint32_t len)
-{
-  uint32_t i;
-  for (i = 0U; i < len; ++i) 
-  {
-    dst[i] = src[i];
-  }
-}
-
-/** Load the given number of MPU regions from a table to the given MPU.
-* \param mpu Pointer to the MPU registers to be used.
-* \param rnr First region number to be configured.
-* \param table Pointer to the MPU configuration table.
-* \param cnt Amount of regions to be configured.
-*/
-__STATIC_INLINE void ARM_MPU_LoadEx(MPU_Type* mpu, uint32_t rnr, ARM_MPU_Region_t const* table, uint32_t cnt) 
-{
-  const uint32_t rowWordSize = sizeof(ARM_MPU_Region_t)/4U;
-  if (cnt == 1U) {
-    mpu->RNR = rnr;
-    ARM_MPU_OrderedMemcpy(&(mpu->RBAR), &(table->RBAR), rowWordSize);
-  } else {
-    uint32_t rnrBase   = rnr & ~(MPU_TYPE_RALIASES-1U);
-    uint32_t rnrOffset = rnr % MPU_TYPE_RALIASES;
-    
-    mpu->RNR = rnrBase;
-    while ((rnrOffset + cnt) > MPU_TYPE_RALIASES) {
-      uint32_t c = MPU_TYPE_RALIASES - rnrOffset;
-      ARM_MPU_OrderedMemcpy(&(mpu->RBAR)+(rnrOffset*2U), &(table->RBAR), c*rowWordSize);
-      table += c;
-      cnt -= c;
-      rnrOffset = 0U;
-      rnrBase += MPU_TYPE_RALIASES;
-      mpu->RNR = rnrBase;
-    }
-    
-    ARM_MPU_OrderedMemcpy(&(mpu->RBAR)+(rnrOffset*2U), &(table->RBAR), cnt*rowWordSize);
-  }
-}
-
-/** Load the given number of MPU regions from a table.
-* \param rnr First region number to be configured.
-* \param table Pointer to the MPU configuration table.
-* \param cnt Amount of regions to be configured.
-*/
-__STATIC_INLINE void ARM_MPU_Load(uint32_t rnr, ARM_MPU_Region_t const* table, uint32_t cnt) 
-{
-  ARM_MPU_LoadEx(MPU, rnr, table, cnt);
-}
-
-#ifdef MPU_NS
-/** Load the given number of MPU regions from a table to the Non-secure MPU.
-* \param rnr First region number to be configured.
-* \param table Pointer to the MPU configuration table.
-* \param cnt Amount of regions to be configured.
-*/
-__STATIC_INLINE void ARM_MPU_Load_NS(uint32_t rnr, ARM_MPU_Region_t const* table, uint32_t cnt) 
-{
-  ARM_MPU_LoadEx(MPU_NS, rnr, table, cnt);
-}
-#endif
-
-#endif
-
+/******************************************************************************
+ * @file     mpu_armv8.h
+ * @brief    CMSIS MPU API for Armv8-M and Armv8.1-M MPU
+ * @version  V5.1.2
+ * @date     10. February 2020
+ ******************************************************************************/
+/*
+ * Copyright (c) 2017-2020 Arm Limited. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#if   defined ( __ICCARM__ )
+  #pragma system_include         /* treat file as system include file for MISRA check */
+#elif defined (__clang__)
+  #pragma clang system_header    /* treat file as system include file */
+#endif
+
+#ifndef ARM_MPU_ARMV8_H
+#define ARM_MPU_ARMV8_H
+
+/** \brief Attribute for device memory (outer only) */
+#define ARM_MPU_ATTR_DEVICE                           ( 0U )
+
+/** \brief Attribute for non-cacheable, normal memory */
+#define ARM_MPU_ATTR_NON_CACHEABLE                    ( 4U )
+
+/** \brief Attribute for normal memory (outer and inner)
+* \param NT Non-Transient: Set to 1 for non-transient data.
+* \param WB Write-Back: Set to 1 to use write-back update policy.
+* \param RA Read Allocation: Set to 1 to use cache allocation on read miss.
+* \param WA Write Allocation: Set to 1 to use cache allocation on write miss.
+*/
+#define ARM_MPU_ATTR_MEMORY_(NT, WB, RA, WA) \
+  ((((NT) & 1U) << 3U) | (((WB) & 1U) << 2U) | (((RA) & 1U) << 1U) | ((WA) & 1U))
+
+/** \brief Device memory type non Gathering, non Re-ordering, non Early Write Acknowledgement */
+#define ARM_MPU_ATTR_DEVICE_nGnRnE (0U)
+
+/** \brief Device memory type non Gathering, non Re-ordering, Early Write Acknowledgement */
+#define ARM_MPU_ATTR_DEVICE_nGnRE  (1U)
+
+/** \brief Device memory type non Gathering, Re-ordering, Early Write Acknowledgement */
+#define ARM_MPU_ATTR_DEVICE_nGRE   (2U)
+
+/** \brief Device memory type Gathering, Re-ordering, Early Write Acknowledgement */
+#define ARM_MPU_ATTR_DEVICE_GRE    (3U)
+
+/** \brief Memory Attribute
+* \param O Outer memory attributes
+* \param I O == ARM_MPU_ATTR_DEVICE: Device memory attributes, else: Inner memory attributes
+*/
+#define ARM_MPU_ATTR(O, I) ((((O) & 0xFU) << 4U) | ((((O) & 0xFU) != 0U) ? ((I) & 0xFU) : (((I) & 0x3U) << 2U)))
+
+/** \brief Normal memory non-shareable  */
+#define ARM_MPU_SH_NON   (0U)
+
+/** \brief Normal memory outer shareable  */
+#define ARM_MPU_SH_OUTER (2U)
+
+/** \brief Normal memory inner shareable  */
+#define ARM_MPU_SH_INNER (3U)
+
+/** \brief Memory access permissions
+* \param RO Read-Only: Set to 1 for read-only memory.
+* \param NP Non-Privileged: Set to 1 for non-privileged memory.
+*/
+#define ARM_MPU_AP_(RO, NP) ((((RO) & 1U) << 1U) | ((NP) & 1U))
+
+/** \brief Region Base Address Register value
+* \param BASE The base address bits [31:5] of a memory region. The value is zero extended. Effective address gets 32 byte aligned.
+* \param SH Defines the Shareability domain for this memory region.
+* \param RO Read-Only: Set to 1 for a read-only memory region.
+* \param NP Non-Privileged: Set to 1 for a non-privileged memory region.
+* \param XN eXecute Never: Set to 1 for a non-executable memory region.
+*/
+#define ARM_MPU_RBAR(BASE, SH, RO, NP, XN) \
+  (((BASE) & MPU_RBAR_BASE_Msk) | \
+  (((SH) << MPU_RBAR_SH_Pos) & MPU_RBAR_SH_Msk) | \
+  ((ARM_MPU_AP_(RO, NP) << MPU_RBAR_AP_Pos) & MPU_RBAR_AP_Msk) | \
+  (((XN) << MPU_RBAR_XN_Pos) & MPU_RBAR_XN_Msk))
+
+/** \brief Region Limit Address Register value
+* \param LIMIT The limit address bits [31:5] for this memory region. The value is one extended.
+* \param IDX The attribute index to be associated with this memory region.
+*/
+#define ARM_MPU_RLAR(LIMIT, IDX) \
+  (((LIMIT) & MPU_RLAR_LIMIT_Msk) | \
+  (((IDX) << MPU_RLAR_AttrIndx_Pos) & MPU_RLAR_AttrIndx_Msk) | \
+  (MPU_RLAR_EN_Msk))
+
+#if defined(MPU_RLAR_PXN_Pos)
+
+/** \brief Region Limit Address Register with PXN value
+* \param LIMIT The limit address bits [31:5] for this memory region. The value is one extended.
+* \param PXN Privileged execute never. Defines whether code can be executed from this privileged region.
+* \param IDX The attribute index to be associated with this memory region.
+*/
+#define ARM_MPU_RLAR_PXN(LIMIT, PXN, IDX) \
+  (((LIMIT) & MPU_RLAR_LIMIT_Msk) | \
+  (((PXN) << MPU_RLAR_PXN_Pos) & MPU_RLAR_PXN_Msk) | \
+  (((IDX) << MPU_RLAR_AttrIndx_Pos) & MPU_RLAR_AttrIndx_Msk) | \
+  (MPU_RLAR_EN_Msk))
+
+#endif
+
+/**
+* Struct for a single MPU Region: one RBAR/RLAR value pair. ARM_MPU_LoadEx copies it word by word, RBAR first.
+*/
+typedef struct {
+  uint32_t RBAR;                   /*!< Region Base Address Register value */
+  uint32_t RLAR;                   /*!< Region Limit Address Register value */
+} ARM_MPU_Region_t;
+
+/** Enable the MPU. MPU->CTRL is overwritten (not read-modify-written) with MPU_Control plus the enable bit.
+* \param MPU_Control Default access permissions for unconfigured regions.
+*/
+__STATIC_INLINE void ARM_MPU_Enable(uint32_t MPU_Control)
+{
+  __DMB();                                          /* issued before MPU->CTRL is written */
+  MPU->CTRL = MPU_Control | MPU_CTRL_ENABLE_Msk;    /* caller's control bits plus the enable bit */
+#ifdef SCB_SHCSR_MEMFAULTENA_Msk
+  SCB->SHCSR |= SCB_SHCSR_MEMFAULTENA_Msk;          /* only when the core header defines the MEMFAULTENA mask */
+#endif
+  __DSB();                                          /* barriers issued after the register writes */
+  __ISB();
+}
+
+/** Disable the MPU. Clears MEMFAULTENA (when its mask is defined) first, then only the enable bit of MPU->CTRL.
+*/
+__STATIC_INLINE void ARM_MPU_Disable(void)
+{
+  __DMB();                                          /* issued before the registers are modified */
+#ifdef SCB_SHCSR_MEMFAULTENA_Msk
+  SCB->SHCSR &= ~SCB_SHCSR_MEMFAULTENA_Msk;         /* only when the core header defines the MEMFAULTENA mask */
+#endif
+  MPU->CTRL  &= ~MPU_CTRL_ENABLE_Msk;               /* read-modify-write: the other CTRL bits are kept */
+  __DSB();                                          /* barriers issued after the register writes */
+  __ISB();
+}
+
+#ifdef MPU_NS
+/** Enable the Non-secure MPU. MPU_NS->CTRL is overwritten (not read-modify-written) with MPU_Control plus the enable bit.
+* \param MPU_Control Default access permissions for unconfigured regions.
+*/
+__STATIC_INLINE void ARM_MPU_Enable_NS(uint32_t MPU_Control)
+{
+  __DMB();                                          /* issued before MPU_NS->CTRL is written */
+  MPU_NS->CTRL = MPU_Control | MPU_CTRL_ENABLE_Msk; /* caller's control bits plus the enable bit */
+#ifdef SCB_SHCSR_MEMFAULTENA_Msk
+  SCB_NS->SHCSR |= SCB_SHCSR_MEMFAULTENA_Msk;       /* only when the core header defines the MEMFAULTENA mask */
+#endif
+  __DSB();                                          /* barriers issued after the register writes */
+  __ISB();
+}
+
+/** Disable the Non-secure MPU. Clears MEMFAULTENA in SCB_NS (when its mask is defined) first, then only the enable bit of MPU_NS->CTRL.
+*/
+__STATIC_INLINE void ARM_MPU_Disable_NS(void)
+{
+  __DMB();                                          /* issued before the registers are modified */
+#ifdef SCB_SHCSR_MEMFAULTENA_Msk
+  SCB_NS->SHCSR &= ~SCB_SHCSR_MEMFAULTENA_Msk;      /* only when the core header defines the MEMFAULTENA mask */
+#endif
+  MPU_NS->CTRL  &= ~MPU_CTRL_ENABLE_Msk;            /* read-modify-write: the other CTRL bits are kept */
+  __DSB();                                          /* barriers issued after the register writes */
+  __ISB();
+}
+#endif
+
+/** Set the memory attribute encoding to the given MPU. Only the 8-bit field selected by idx is replaced.
+* \param mpu Pointer to the MPU to be configured.
+* \param idx The attribute index to be set [0-7]. An index whose MAIR register does not exist is silently ignored.
+* \param attr The attribute value to be set.
+*/
+__STATIC_INLINE void ARM_MPU_SetMemAttrEx(MPU_Type* mpu, uint8_t idx, uint8_t attr)
+{
+  const uint8_t reg = idx / 4U;                 /* four 8-bit attribute fields per MAIR register */
+  const uint32_t pos = ((idx % 4U) * 8U);       /* bit offset of the field inside that register */
+  const uint32_t mask = 0xFFU << pos;
+
+  if (reg >= (sizeof(mpu->MAIR) / sizeof(mpu->MAIR[0]))) {
+    return; // invalid index
+  }
+  /* Widen attr before shifting: as uint8_t it promotes to signed int, and 0x80..0xFF << 24 would overflow it (undefined). */
+  mpu->MAIR[reg] = ((mpu->MAIR[reg] & ~mask) | (((uint32_t)attr << pos) & mask));
+}
+
+/** Set the memory attribute encoding. Forwards to ARM_MPU_SetMemAttrEx with the MPU instance.
+* \param idx The attribute index to be set [0-7]
+* \param attr The attribute value to be set.
+*/
+__STATIC_INLINE void ARM_MPU_SetMemAttr(uint8_t idx, uint8_t attr)
+{
+  ARM_MPU_SetMemAttrEx(MPU, idx, attr);
+}
+
+#ifdef MPU_NS
+/** Set the memory attribute encoding to the Non-secure MPU. Forwards to ARM_MPU_SetMemAttrEx with the MPU_NS instance.
+* \param idx The attribute index to be set [0-7]
+* \param attr The attribute value to be set.
+*/
+__STATIC_INLINE void ARM_MPU_SetMemAttr_NS(uint8_t idx, uint8_t attr)
+{
+  ARM_MPU_SetMemAttrEx(MPU_NS, idx, attr);
+}
+#endif
+
+/** Clear and disable the given MPU region of the given MPU. Only RLAR is zeroed; RBAR is not written.
+* \param mpu Pointer to MPU to be used.
+* \param rnr Region number to be cleared.
+*/
+__STATIC_INLINE void ARM_MPU_ClrRegionEx(MPU_Type* mpu, uint32_t rnr)
+{
+  mpu->RNR = rnr;      /* select the region before touching its RLAR */
+  mpu->RLAR = 0U;      /* zero includes the enable bit that ARM_MPU_RLAR places in RLAR */
+}
+
+/** Clear and disable the given MPU region. Forwards to ARM_MPU_ClrRegionEx with the MPU instance.
+* \param rnr Region number to be cleared.
+*/
+__STATIC_INLINE void ARM_MPU_ClrRegion(uint32_t rnr)
+{
+  ARM_MPU_ClrRegionEx(MPU, rnr);
+}
+
+#ifdef MPU_NS
+/** Clear and disable the given Non-secure MPU region. Forwards to ARM_MPU_ClrRegionEx with the MPU_NS instance.
+* \param rnr Region number to be cleared.
+*/
+__STATIC_INLINE void ARM_MPU_ClrRegion_NS(uint32_t rnr)
+{
+  ARM_MPU_ClrRegionEx(MPU_NS, rnr);
+}
+#endif
+
+/** Configure the given MPU region of the given MPU. Registers are written in the order RNR, RBAR, RLAR.
+* \param mpu Pointer to MPU to be used.
+* \param rnr Region number to be configured.
+* \param rbar Value for RBAR register.
+* \param rlar Value for RLAR register.
+*/
+__STATIC_INLINE void ARM_MPU_SetRegionEx(MPU_Type* mpu, uint32_t rnr, uint32_t rbar, uint32_t rlar)
+{
+  mpu->RNR = rnr;      /* select the region first */
+  mpu->RBAR = rbar;
+  mpu->RLAR = rlar;    /* written last; values built with ARM_MPU_RLAR carry the enable bit */
+}
+
+/** Configure the given MPU region. Forwards to ARM_MPU_SetRegionEx with the MPU instance.
+* \param rnr Region number to be configured.
+* \param rbar Value for RBAR register.
+* \param rlar Value for RLAR register.
+*/
+__STATIC_INLINE void ARM_MPU_SetRegion(uint32_t rnr, uint32_t rbar, uint32_t rlar)
+{
+  ARM_MPU_SetRegionEx(MPU, rnr, rbar, rlar);
+}
+
+#ifdef MPU_NS
+/** Configure the given Non-secure MPU region. Forwards to ARM_MPU_SetRegionEx with the MPU_NS instance.
+* \param rnr Region number to be configured.
+* \param rbar Value for RBAR register.
+* \param rlar Value for RLAR register.
+*/
+__STATIC_INLINE void ARM_MPU_SetRegion_NS(uint32_t rnr, uint32_t rbar, uint32_t rlar)
+{
+  ARM_MPU_SetRegionEx(MPU_NS, rnr, rbar, rlar);
+}
+#endif
+
+/** Word-wise copy with strictly ordered memory access (ascending index, volatile destination), e.g. for register targets.
+* \param dst Destination the words are written to.
+* \param src Source the words are read from.
+* \param len Number of 32-bit words to be copied.
+*/
+__STATIC_INLINE void ARM_MPU_OrderedMemcpy(volatile uint32_t* dst, const uint32_t* __RESTRICT src, uint32_t len)
+{
+  uint32_t word = 0U;
+  while (word < len) {
+    dst[word] = src[word];
+    ++word;
+  }
+}
+
+/** Load the given number of MPU regions from a table to the given MPU, in groups of at most MPU_TYPE_RALIASES regions per RNR selection.
+* \param mpu Pointer to the MPU registers to be used.
+* \param rnr First region number to be configured.
+* \param table Pointer to the MPU configuration table.
+* \param cnt Amount of regions to be configured. With cnt == 0 only RNR is written.
+*/
+__STATIC_INLINE void ARM_MPU_LoadEx(MPU_Type* mpu, uint32_t rnr, ARM_MPU_Region_t const* table, uint32_t cnt)
+{
+  const uint32_t rowWordSize = sizeof(ARM_MPU_Region_t)/4U;   /* 32-bit words per table entry */
+  if (cnt == 1U) {
+    mpu->RNR = rnr;                                            /* single region: select it directly */
+    ARM_MPU_OrderedMemcpy(&(mpu->RBAR), &(table->RBAR), rowWordSize);
+  } else {
+    uint32_t rnrBase   = rnr & ~(MPU_TYPE_RALIASES-1U);        /* rnr rounded down to its group start; assumes MPU_TYPE_RALIASES is a power of two */
+    uint32_t rnrOffset = rnr % MPU_TYPE_RALIASES;              /* position of rnr inside that group */
+
+    mpu->RNR = rnrBase;
+    while ((rnrOffset + cnt) > MPU_TYPE_RALIASES) {            /* remaining regions run past the current group */
+      uint32_t c = MPU_TYPE_RALIASES - rnrOffset;              /* regions left in the current group */
+      ARM_MPU_OrderedMemcpy(&(mpu->RBAR)+(rnrOffset*2U), &(table->RBAR), c*rowWordSize); /* two words per region after RBAR; presumably the alias registers - MPU_Type layout not visible here */
+      table += c;
+      cnt -= c;
+      rnrOffset = 0U;                                          /* following groups are filled from their start */
+      rnrBase += MPU_TYPE_RALIASES;
+      mpu->RNR = rnrBase;
+    }
+
+    ARM_MPU_OrderedMemcpy(&(mpu->RBAR)+(rnrOffset*2U), &(table->RBAR), cnt*rowWordSize);
+  }
+}
+
+/** Load the given number of MPU regions from a table. Forwards to ARM_MPU_LoadEx with the MPU instance.
+* \param rnr First region number to be configured.
+* \param table Pointer to the MPU configuration table.
+* \param cnt Amount of regions to be configured.
+*/
+__STATIC_INLINE void ARM_MPU_Load(uint32_t rnr, ARM_MPU_Region_t const* table, uint32_t cnt)
+{
+  ARM_MPU_LoadEx(MPU, rnr, table, cnt);
+}
+
+#ifdef MPU_NS
+/** Load the given number of MPU regions from a table to the Non-secure MPU. Forwards to ARM_MPU_LoadEx with the MPU_NS instance.
+* \param rnr First region number to be configured.
+* \param table Pointer to the MPU configuration table.
+* \param cnt Amount of regions to be configured.
+*/
+__STATIC_INLINE void ARM_MPU_Load_NS(uint32_t rnr, ARM_MPU_Region_t const* table, uint32_t cnt)
+{
+  ARM_MPU_LoadEx(MPU_NS, rnr, table, cnt);
+}
+#endif
+
+#endif
+
--- a/libraries/cmsis/cm4/core_support/pmu_armv8.h
+++ b/libraries/cmsis/cm4/core_support/pmu_armv8.h
@@ -1,337 +1,337 @@
-/******************************************************************************
- * @file     pmu_armv8.h
- * @brief    CMSIS PMU API for Armv8.1-M PMU
- * @version  V1.0.0
- * @date     24. March 2020
- ******************************************************************************/
-/*
- * Copyright (c) 2020 Arm Limited. All rights reserved.
- *
- * SPDX-License-Identifier: Apache-2.0
- *
- * Licensed under the Apache License, Version 2.0 (the License); you may
- * not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an AS IS BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#if   defined ( __ICCARM__ )
-  #pragma system_include         /* treat file as system include file for MISRA check */
-#elif defined (__clang__)
-  #pragma clang system_header    /* treat file as system include file */
-#endif
-
-#ifndef ARM_PMU_ARMV8_H
-#define ARM_PMU_ARMV8_H
-
-/**
- * \brief PMU Events
- * \note  See the Armv8.1-M Architecture Reference Manual for full details on these PMU events.
- * */
-
-#define ARM_PMU_SW_INCR                              0x0000             /*!< Software update to the PMU_SWINC register, architecturally executed and condition code check pass */
-#define ARM_PMU_L1I_CACHE_REFILL                     0x0001             /*!< L1 I-Cache refill */
-#define ARM_PMU_L1D_CACHE_REFILL                     0x0003             /*!< L1 D-Cache refill */
-#define ARM_PMU_L1D_CACHE                            0x0004             /*!< L1 D-Cache access */
-#define ARM_PMU_LD_RETIRED                           0x0006             /*!< Memory-reading instruction architecturally executed and condition code check pass */
-#define ARM_PMU_ST_RETIRED                           0x0007             /*!< Memory-writing instruction architecturally executed and condition code check pass */
-#define ARM_PMU_INST_RETIRED                         0x0008             /*!< Instruction architecturally executed */
-#define ARM_PMU_EXC_TAKEN                            0x0009             /*!< Exception entry */
-#define ARM_PMU_EXC_RETURN                           0x000A             /*!< Exception return instruction architecturally executed and the condition code check pass */
-#define ARM_PMU_PC_WRITE_RETIRED                     0x000C             /*!< Software change to the Program Counter (PC). Instruction is architecturally executed and condition code check pass */
-#define ARM_PMU_BR_IMMED_RETIRED                     0x000D             /*!< Immediate branch architecturally executed */
-#define ARM_PMU_BR_RETURN_RETIRED                    0x000E             /*!< Function return instruction architecturally executed and the condition code check pass */
-#define ARM_PMU_UNALIGNED_LDST_RETIRED               0x000F             /*!< Unaligned memory memory-reading or memory-writing instruction architecturally executed and condition code check pass */
-#define ARM_PMU_BR_MIS_PRED                          0x0010             /*!< Mispredicted or not predicted branch speculatively executed */
-#define ARM_PMU_CPU_CYCLES                           0x0011             /*!< Cycle */
-#define ARM_PMU_BR_PRED                              0x0012             /*!< Predictable branch speculatively executed */
-#define ARM_PMU_MEM_ACCESS                           0x0013             /*!< Data memory access */
-#define ARM_PMU_L1I_CACHE                            0x0014             /*!< Level 1 instruction cache access */
-#define ARM_PMU_L1D_CACHE_WB                         0x0015             /*!< Level 1 data cache write-back */
-#define ARM_PMU_L2D_CACHE                            0x0016             /*!< Level 2 data cache access */
-#define ARM_PMU_L2D_CACHE_REFILL                     0x0017             /*!< Level 2 data cache refill */
-#define ARM_PMU_L2D_CACHE_WB                         0x0018             /*!< Level 2 data cache write-back */
-#define ARM_PMU_BUS_ACCESS                           0x0019             /*!< Bus access */
-#define ARM_PMU_MEMORY_ERROR                         0x001A             /*!< Local memory error */
-#define ARM_PMU_INST_SPEC                            0x001B             /*!< Instruction speculatively executed */
-#define ARM_PMU_BUS_CYCLES                           0x001D             /*!< Bus cycles */
-#define ARM_PMU_CHAIN                                0x001E             /*!< For an odd numbered counter, increment when an overflow occurs on the preceding even-numbered counter on the same PE */
-#define ARM_PMU_L1D_CACHE_ALLOCATE                   0x001F             /*!< Level 1 data cache allocation without refill */
-#define ARM_PMU_L2D_CACHE_ALLOCATE                   0x0020             /*!< Level 2 data cache allocation without refill */
-#define ARM_PMU_BR_RETIRED                           0x0021             /*!< Branch instruction architecturally executed */
-#define ARM_PMU_BR_MIS_PRED_RETIRED                  0x0022             /*!< Mispredicted branch instruction architecturally executed */
-#define ARM_PMU_STALL_FRONTEND                       0x0023             /*!< No operation issued because of the frontend */
-#define ARM_PMU_STALL_BACKEND                        0x0024             /*!< No operation issued because of the backend */
-#define ARM_PMU_L2I_CACHE                            0x0027             /*!< Level 2 instruction cache access */
-#define ARM_PMU_L2I_CACHE_REFILL                     0x0028             /*!< Level 2 instruction cache refill */
-#define ARM_PMU_L3D_CACHE_ALLOCATE                   0x0029             /*!< Level 3 data cache allocation without refill */
-#define ARM_PMU_L3D_CACHE_REFILL                     0x002A             /*!< Level 3 data cache refill */
-#define ARM_PMU_L3D_CACHE                            0x002B             /*!< Level 3 data cache access */
-#define ARM_PMU_L3D_CACHE_WB                         0x002C             /*!< Level 3 data cache write-back */
-#define ARM_PMU_LL_CACHE_RD                          0x0036             /*!< Last level data cache read */
-#define ARM_PMU_LL_CACHE_MISS_RD                     0x0037             /*!< Last level data cache read miss */
-#define ARM_PMU_L1D_CACHE_MISS_RD                    0x0039             /*!< Level 1 data cache read miss */
-#define ARM_PMU_OP_COMPLETE                          0x003A             /*!< Operation retired */
-#define ARM_PMU_OP_SPEC                              0x003B             /*!< Operation speculatively executed */
-#define ARM_PMU_STALL                                0x003C             /*!< Stall cycle for instruction or operation not sent for execution */
-#define ARM_PMU_STALL_OP_BACKEND                     0x003D             /*!< Stall cycle for instruction or operation not sent for execution due to pipeline backend */
-#define ARM_PMU_STALL_OP_FRONTEND                    0x003E             /*!< Stall cycle for instruction or operation not sent for execution due to pipeline frontend */
-#define ARM_PMU_STALL_OP                             0x003F             /*!< Instruction or operation slots not occupied each cycle */
-#define ARM_PMU_L1D_CACHE_RD                         0x0040             /*!< Level 1 data cache read */
-#define ARM_PMU_LE_RETIRED                           0x0100             /*!< Loop end instruction executed */
-#define ARM_PMU_LE_SPEC                              0x0101             /*!< Loop end instruction speculatively executed */
-#define ARM_PMU_BF_RETIRED                           0x0104             /*!< Branch future instruction architecturally executed and condition code check pass */
-#define ARM_PMU_BF_SPEC                              0x0105             /*!< Branch future instruction speculatively executed and condition code check pass */
-#define ARM_PMU_LE_CANCEL                            0x0108             /*!< Loop end instruction not taken */
-#define ARM_PMU_BF_CANCEL                            0x0109             /*!< Branch future instruction not taken */
-#define ARM_PMU_SE_CALL_S                            0x0114             /*!< Call to secure function, resulting in Security state change */
-#define ARM_PMU_SE_CALL_NS                           0x0115             /*!< Call to non-secure function, resulting in Security state change */
-#define ARM_PMU_DWT_CMPMATCH0                        0x0118             /*!< DWT comparator 0 match */
-#define ARM_PMU_DWT_CMPMATCH1                        0x0119             /*!< DWT comparator 1 match */
-#define ARM_PMU_DWT_CMPMATCH2                        0x011A             /*!< DWT comparator 2 match */
-#define ARM_PMU_DWT_CMPMATCH3                        0x011B             /*!< DWT comparator 3 match */
-#define ARM_PMU_MVE_INST_RETIRED                     0x0200             /*!< MVE instruction architecturally executed */
-#define ARM_PMU_MVE_INST_SPEC                        0x0201             /*!< MVE instruction speculatively executed */
-#define ARM_PMU_MVE_FP_RETIRED                       0x0204             /*!< MVE floating-point instruction architecturally executed */
-#define ARM_PMU_MVE_FP_SPEC                          0x0205             /*!< MVE floating-point instruction speculatively executed */
-#define ARM_PMU_MVE_FP_HP_RETIRED                    0x0208             /*!< MVE half-precision floating-point instruction architecturally executed */
-#define ARM_PMU_MVE_FP_HP_SPEC                       0x0209             /*!< MVE half-precision floating-point instruction speculatively executed */
-#define ARM_PMU_MVE_FP_SP_RETIRED                    0x020C             /*!< MVE single-precision floating-point instruction architecturally executed */
-#define ARM_PMU_MVE_FP_SP_SPEC                       0x020D             /*!< MVE single-precision floating-point instruction speculatively executed */
-#define ARM_PMU_MVE_FP_MAC_RETIRED                   0x0214             /*!< MVE floating-point multiply or multiply-accumulate instruction architecturally executed */
-#define ARM_PMU_MVE_FP_MAC_SPEC                      0x0215             /*!< MVE floating-point multiply or multiply-accumulate instruction speculatively executed */
-#define ARM_PMU_MVE_INT_RETIRED                      0x0224             /*!< MVE integer instruction architecturally executed */
-#define ARM_PMU_MVE_INT_SPEC                         0x0225             /*!< MVE integer instruction speculatively executed */
-#define ARM_PMU_MVE_INT_MAC_RETIRED                  0x0228             /*!< MVE multiply or multiply-accumulate instruction architecturally executed */
-#define ARM_PMU_MVE_INT_MAC_SPEC                     0x0229             /*!< MVE multiply or multiply-accumulate instruction speculatively executed */
-#define ARM_PMU_MVE_LDST_RETIRED                     0x0238             /*!< MVE load or store instruction architecturally executed */
-#define ARM_PMU_MVE_LDST_SPEC                        0x0239             /*!< MVE load or store instruction speculatively executed */
-#define ARM_PMU_MVE_LD_RETIRED                       0x023C             /*!< MVE load instruction architecturally executed */
-#define ARM_PMU_MVE_LD_SPEC                          0x023D             /*!< MVE load instruction speculatively executed */
-#define ARM_PMU_MVE_ST_RETIRED                       0x0240             /*!< MVE store instruction architecturally executed */
-#define ARM_PMU_MVE_ST_SPEC                          0x0241             /*!< MVE store instruction speculatively executed */
-#define ARM_PMU_MVE_LDST_CONTIG_RETIRED              0x0244             /*!< MVE contiguous load or store instruction architecturally executed */
-#define ARM_PMU_MVE_LDST_CONTIG_SPEC                 0x0245             /*!< MVE contiguous load or store instruction speculatively executed */
-#define ARM_PMU_MVE_LD_CONTIG_RETIRED                0x0248             /*!< MVE contiguous load instruction architecturally executed */
-#define ARM_PMU_MVE_LD_CONTIG_SPEC                   0x0249             /*!< MVE contiguous load instruction speculatively executed */
-#define ARM_PMU_MVE_ST_CONTIG_RETIRED                0x024C             /*!< MVE contiguous store instruction architecturally executed */
-#define ARM_PMU_MVE_ST_CONTIG_SPEC                   0x024D             /*!< MVE contiguous store instruction speculatively executed */
-#define ARM_PMU_MVE_LDST_NONCONTIG_RETIRED           0x0250             /*!< MVE non-contiguous load or store instruction architecturally executed */
-#define ARM_PMU_MVE_LDST_NONCONTIG_SPEC              0x0251             /*!< MVE non-contiguous load or store instruction speculatively executed */
-#define ARM_PMU_MVE_LD_NONCONTIG_RETIRED             0x0254             /*!< MVE non-contiguous load instruction architecturally executed */
-#define ARM_PMU_MVE_LD_NONCONTIG_SPEC                0x0255             /*!< MVE non-contiguous load instruction speculatively executed */
-#define ARM_PMU_MVE_ST_NONCONTIG_RETIRED             0x0258             /*!< MVE non-contiguous store instruction architecturally executed */
-#define ARM_PMU_MVE_ST_NONCONTIG_SPEC                0x0259             /*!< MVE non-contiguous store instruction speculatively executed */
-#define ARM_PMU_MVE_LDST_MULTI_RETIRED               0x025C             /*!< MVE memory instruction targeting multiple registers architecturally executed */
-#define ARM_PMU_MVE_LDST_MULTI_SPEC                  0x025D             /*!< MVE memory instruction targeting multiple registers speculatively executed */
-#define ARM_PMU_MVE_LD_MULTI_RETIRED                 0x0260             /*!< MVE memory load instruction targeting multiple registers architecturally executed */
-#define ARM_PMU_MVE_LD_MULTI_SPEC                    0x0261             /*!< MVE memory load instruction targeting multiple registers speculatively executed */
-#define ARM_PMU_MVE_ST_MULTI_RETIRED                 0x0261             /*!< MVE memory store instruction targeting multiple registers architecturally executed */
-#define ARM_PMU_MVE_ST_MULTI_SPEC                    0x0265             /*!< MVE memory store instruction targeting multiple registers speculatively executed */
-#define ARM_PMU_MVE_LDST_UNALIGNED_RETIRED           0x028C             /*!< MVE unaligned memory load or store instruction architecturally executed */
-#define ARM_PMU_MVE_LDST_UNALIGNED_SPEC              0x028D             /*!< MVE unaligned memory load or store instruction speculatively executed */
-#define ARM_PMU_MVE_LD_UNALIGNED_RETIRED             0x0290             /*!< MVE unaligned load instruction architecturally executed */
-#define ARM_PMU_MVE_LD_UNALIGNED_SPEC                0x0291             /*!< MVE unaligned load instruction speculatively executed */
-#define ARM_PMU_MVE_ST_UNALIGNED_RETIRED             0x0294             /*!< MVE unaligned store instruction architecturally executed */
-#define ARM_PMU_MVE_ST_UNALIGNED_SPEC                0x0295             /*!< MVE unaligned store instruction speculatively executed */
-#define ARM_PMU_MVE_LDST_UNALIGNED_NONCONTIG_RETIRED 0x0298             /*!< MVE unaligned noncontiguous load or store instruction architecturally executed */
-#define ARM_PMU_MVE_LDST_UNALIGNED_NONCONTIG_SPEC    0x0299             /*!< MVE unaligned noncontiguous load or store instruction speculatively executed */
-#define ARM_PMU_MVE_VREDUCE_RETIRED                  0x02A0             /*!< MVE vector reduction instruction architecturally executed */
-#define ARM_PMU_MVE_VREDUCE_SPEC                     0x02A1             /*!< MVE vector reduction instruction speculatively executed */
-#define ARM_PMU_MVE_VREDUCE_FP_RETIRED               0x02A4             /*!< MVE floating-point vector reduction instruction architecturally executed */
-#define ARM_PMU_MVE_VREDUCE_FP_SPEC                  0x02A5             /*!< MVE floating-point vector reduction instruction speculatively executed */
-#define ARM_PMU_MVE_VREDUCE_INT_RETIRED              0x02A8             /*!< MVE integer vector reduction instruction architecturally executed */
-#define ARM_PMU_MVE_VREDUCE_INT_SPEC                 0x02A9             /*!< MVE integer vector reduction instruction speculatively executed */
-#define ARM_PMU_MVE_PRED                             0x02B8             /*!< Cycles where one or more predicated beats architecturally executed */
-#define ARM_PMU_MVE_STALL                            0x02CC             /*!< Stall cycles caused by an MVE instruction */
-#define ARM_PMU_MVE_STALL_RESOURCE                   0x02CD             /*!< Stall cycles caused by an MVE instruction because of resource conflicts */
-#define ARM_PMU_MVE_STALL_RESOURCE_MEM               0x02CE             /*!< Stall cycles caused by an MVE instruction because of memory resource conflicts */
-#define ARM_PMU_MVE_STALL_RESOURCE_FP                0x02CF             /*!< Stall cycles caused by an MVE instruction because of floating-point resource conflicts */
-#define ARM_PMU_MVE_STALL_RESOURCE_INT               0x02D0             /*!< Stall cycles caused by an MVE instruction because of integer resource conflicts */
-#define ARM_PMU_MVE_STALL_BREAK                      0x02D3             /*!< Stall cycles caused by an MVE chain break */
-#define ARM_PMU_MVE_STALL_DEPENDENCY                 0x02D4             /*!< Stall cycles caused by MVE register dependency */
-#define ARM_PMU_ITCM_ACCESS                          0x4007             /*!< Instruction TCM access */
-#define ARM_PMU_DTCM_ACCESS                          0x4008             /*!< Data TCM access */
-#define ARM_PMU_TRCEXTOUT0                           0x4010             /*!< ETM external output 0 */
-#define ARM_PMU_TRCEXTOUT1                           0x4011             /*!< ETM external output 1 */
-#define ARM_PMU_TRCEXTOUT2                           0x4012             /*!< ETM external output 2 */
-#define ARM_PMU_TRCEXTOUT3                           0x4013             /*!< ETM external output 3 */
-#define ARM_PMU_CTI_TRIGOUT4                         0x4018             /*!< Cross-trigger Interface output trigger 4 */
-#define ARM_PMU_CTI_TRIGOUT5                         0x4019             /*!< Cross-trigger Interface output trigger 5 */
-#define ARM_PMU_CTI_TRIGOUT6                         0x401A             /*!< Cross-trigger Interface output trigger 6 */
-#define ARM_PMU_CTI_TRIGOUT7                         0x401B             /*!< Cross-trigger Interface output trigger 7 */
-
-/** \brief PMU Functions */
-
-__STATIC_INLINE void ARM_PMU_Enable(void);
-__STATIC_INLINE void ARM_PMU_Disable(void);
-
-__STATIC_INLINE void ARM_PMU_Set_EVTYPER(uint32_t num, uint32_t type);
-
-__STATIC_INLINE void ARM_PMU_CYCCNT_Reset(void);
-__STATIC_INLINE void ARM_PMU_EVCNTR_ALL_Reset(void);
-
-__STATIC_INLINE void ARM_PMU_CNTR_Enable(uint32_t mask);
-__STATIC_INLINE void ARM_PMU_CNTR_Disable(uint32_t mask);
-
-__STATIC_INLINE uint32_t ARM_PMU_Get_CCNTR(void);
-__STATIC_INLINE uint32_t ARM_PMU_Get_EVCNTR(uint32_t num);
-
-__STATIC_INLINE uint32_t ARM_PMU_Get_CNTR_OVS(void);
-__STATIC_INLINE void ARM_PMU_Set_CNTR_OVS(uint32_t mask);
-
-__STATIC_INLINE void ARM_PMU_Set_CNTR_IRQ_Enable(uint32_t mask);
-__STATIC_INLINE void ARM_PMU_Set_CNTR_IRQ_Disable(uint32_t mask);
-
-__STATIC_INLINE void ARM_PMU_CNTR_Increment(uint32_t mask);
-
-/** 
-  \brief   Enable the PMU
-*/
-__STATIC_INLINE void ARM_PMU_Enable(void) 
-{
-  PMU->CTRL |= PMU_CTRL_ENABLE_Msk;
-}
-
-/** 
-  \brief   Disable the PMU
-*/
-__STATIC_INLINE void ARM_PMU_Disable(void) 
-{
-  PMU->CTRL &= ~PMU_CTRL_ENABLE_Msk;
-}
-
-/** 
-  \brief   Set event to count for PMU eventer counter
-  \param [in]    num     Event counter (0-30) to configure
-  \param [in]    type    Event to count
-*/
-__STATIC_INLINE void ARM_PMU_Set_EVTYPER(uint32_t num, uint32_t type)
-{
-  PMU->EVTYPER[num] = type;
-}
-
-/** 
-  \brief  Reset cycle counter
-*/
-__STATIC_INLINE void ARM_PMU_CYCCNT_Reset(void)
-{
-  PMU->CTRL |= PMU_CTRL_CYCCNT_RESET_Msk;
-}
-
-/** 
-  \brief  Reset all event counters
-*/
-__STATIC_INLINE void ARM_PMU_EVCNTR_ALL_Reset(void)
-{
-  PMU->CTRL |= PMU_CTRL_EVENTCNT_RESET_Msk;
-}
-
-/** 
-  \brief  Enable counters 
-  \param [in]     mask    Counters to enable
-  \note   Enables one or more of the following:
-          - event counters (0-30)
-          - cycle counter
-*/
-__STATIC_INLINE void ARM_PMU_CNTR_Enable(uint32_t mask)
-{
-  PMU->CNTENSET = mask;
-}
-
-/** 
-  \brief  Disable counters
-  \param [in]     mask    Counters to enable
-  \note   Disables one or more of the following:
-          - event counters (0-30)
-          - cycle counter
-*/
-__STATIC_INLINE void ARM_PMU_CNTR_Disable(uint32_t mask)
-{
-  PMU->CNTENCLR = mask;
-}
-
-/** 
-  \brief  Read cycle counter
-  \return                 Cycle count
-*/
-__STATIC_INLINE uint32_t ARM_PMU_Get_CCNTR(void)
-{
-  return PMU->CCNTR;
-}
-
-/** 
-  \brief   Read event counter
-  \param [in]     num     Event counter (0-30) to read
-  \return                 Event count
-*/
-__STATIC_INLINE uint32_t ARM_PMU_Get_EVCNTR(uint32_t num)
-{
-  return PMU->EVCNTR[num];
-}
-
-/** 
-  \brief   Read counter overflow status
-  \return  Counter overflow status bits for the following:
-          - event counters (0-30)
-          - cycle counter
-*/
-__STATIC_INLINE uint32_t ARM_PMU_Get_CNTR_OVS(void)
-{
-  return PMU->OVSSET;	
-}
-
-/** 
-  \brief   Clear counter overflow status
-  \param [in]     mask    Counter overflow status bits to clear
-  \note    Clears overflow status bits for one or more of the following:
-           - event counters (0-30)
-           - cycle counter
-*/
-__STATIC_INLINE void ARM_PMU_Set_CNTR_OVS(uint32_t mask)
-{
-  PMU->OVSCLR = mask;
-}
-
-/** 
-  \brief   Enable counter overflow interrupt request 
-  \param [in]     mask    Counter overflow interrupt request bits to set
-  \note    Sets overflow interrupt request bits for one or more of the following:
-           - event counters (0-30)
-           - cycle counter
-*/
-__STATIC_INLINE void ARM_PMU_Set_CNTR_IRQ_Enable(uint32_t mask)
-{
-  PMU->INTENSET = mask;
-}
-
-/** 
-  \brief   Disable counter overflow interrupt request 
-  \param [in]     mask    Counter overflow interrupt request bits to clear
-  \note    Clears overflow interrupt request bits for one or more of the following:
-           - event counters (0-30)
-           - cycle counter
-*/
-__STATIC_INLINE void ARM_PMU_Set_CNTR_IRQ_Disable(uint32_t mask)
-{
-  PMU->INTENCLR = mask;
-}
-
-/** 
-  \brief   Software increment event counter 
-  \param [in]     mask    Counters to increment
-  \note    Software increment bits for one or more event counters (0-30)
-*/
-__STATIC_INLINE void ARM_PMU_CNTR_Increment(uint32_t mask)
-{
-  PMU->SWINC = mask;
-}
-
-#endif
+/******************************************************************************
+ * @file     pmu_armv8.h
+ * @brief    CMSIS PMU API for Armv8.1-M PMU
+ * @version  V1.0.0
+ * @date     24. March 2020
+ ******************************************************************************/
+/*
+ * Copyright (c) 2020 Arm Limited. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#if   defined ( __ICCARM__ )
+  #pragma system_include         /* treat file as system include file for MISRA check */
+#elif defined (__clang__)
+  #pragma clang system_header    /* treat file as system include file */
+#endif
+
+#ifndef ARM_PMU_ARMV8_H
+#define ARM_PMU_ARMV8_H
+
+/**
+ * \brief PMU Events
+ * \note  See the Armv8.1-M Architecture Reference Manual for full details on these PMU events.
+ * */
+
+#define ARM_PMU_SW_INCR                              0x0000             /*!< Software update to the PMU_SWINC register, architecturally executed and condition code check pass */
+#define ARM_PMU_L1I_CACHE_REFILL                     0x0001             /*!< L1 I-Cache refill */
+#define ARM_PMU_L1D_CACHE_REFILL                     0x0003             /*!< L1 D-Cache refill */
+#define ARM_PMU_L1D_CACHE                            0x0004             /*!< L1 D-Cache access */
+#define ARM_PMU_LD_RETIRED                           0x0006             /*!< Memory-reading instruction architecturally executed and condition code check pass */
+#define ARM_PMU_ST_RETIRED                           0x0007             /*!< Memory-writing instruction architecturally executed and condition code check pass */
+#define ARM_PMU_INST_RETIRED                         0x0008             /*!< Instruction architecturally executed */
+#define ARM_PMU_EXC_TAKEN                            0x0009             /*!< Exception entry */
+#define ARM_PMU_EXC_RETURN                           0x000A             /*!< Exception return instruction architecturally executed and the condition code check pass */
+#define ARM_PMU_PC_WRITE_RETIRED                     0x000C             /*!< Software change to the Program Counter (PC). Instruction is architecturally executed and condition code check pass */
+#define ARM_PMU_BR_IMMED_RETIRED                     0x000D             /*!< Immediate branch architecturally executed */
+#define ARM_PMU_BR_RETURN_RETIRED                    0x000E             /*!< Function return instruction architecturally executed and the condition code check pass */
+#define ARM_PMU_UNALIGNED_LDST_RETIRED               0x000F             /*!< Unaligned memory-reading or memory-writing instruction architecturally executed and condition code check pass */
+#define ARM_PMU_BR_MIS_PRED                          0x0010             /*!< Mispredicted or not predicted branch speculatively executed */
+#define ARM_PMU_CPU_CYCLES                           0x0011             /*!< Cycle */
+#define ARM_PMU_BR_PRED                              0x0012             /*!< Predictable branch speculatively executed */
+#define ARM_PMU_MEM_ACCESS                           0x0013             /*!< Data memory access */
+#define ARM_PMU_L1I_CACHE                            0x0014             /*!< Level 1 instruction cache access */
+#define ARM_PMU_L1D_CACHE_WB                         0x0015             /*!< Level 1 data cache write-back */
+#define ARM_PMU_L2D_CACHE                            0x0016             /*!< Level 2 data cache access */
+#define ARM_PMU_L2D_CACHE_REFILL                     0x0017             /*!< Level 2 data cache refill */
+#define ARM_PMU_L2D_CACHE_WB                         0x0018             /*!< Level 2 data cache write-back */
+#define ARM_PMU_BUS_ACCESS                           0x0019             /*!< Bus access */
+#define ARM_PMU_MEMORY_ERROR                         0x001A             /*!< Local memory error */
+#define ARM_PMU_INST_SPEC                            0x001B             /*!< Instruction speculatively executed */
+#define ARM_PMU_BUS_CYCLES                           0x001D             /*!< Bus cycles */
+#define ARM_PMU_CHAIN                                0x001E             /*!< For an odd numbered counter, increment when an overflow occurs on the preceding even-numbered counter on the same PE */
+#define ARM_PMU_L1D_CACHE_ALLOCATE                   0x001F             /*!< Level 1 data cache allocation without refill */
+#define ARM_PMU_L2D_CACHE_ALLOCATE                   0x0020             /*!< Level 2 data cache allocation without refill */
+#define ARM_PMU_BR_RETIRED                           0x0021             /*!< Branch instruction architecturally executed */
+#define ARM_PMU_BR_MIS_PRED_RETIRED                  0x0022             /*!< Mispredicted branch instruction architecturally executed */
+#define ARM_PMU_STALL_FRONTEND                       0x0023             /*!< No operation issued because of the frontend */
+#define ARM_PMU_STALL_BACKEND                        0x0024             /*!< No operation issued because of the backend */
+#define ARM_PMU_L2I_CACHE                            0x0027             /*!< Level 2 instruction cache access */
+#define ARM_PMU_L2I_CACHE_REFILL                     0x0028             /*!< Level 2 instruction cache refill */
+#define ARM_PMU_L3D_CACHE_ALLOCATE                   0x0029             /*!< Level 3 data cache allocation without refill */
+#define ARM_PMU_L3D_CACHE_REFILL                     0x002A             /*!< Level 3 data cache refill */
+#define ARM_PMU_L3D_CACHE                            0x002B             /*!< Level 3 data cache access */
+#define ARM_PMU_L3D_CACHE_WB                         0x002C             /*!< Level 3 data cache write-back */
+#define ARM_PMU_LL_CACHE_RD                          0x0036             /*!< Last level data cache read */
+#define ARM_PMU_LL_CACHE_MISS_RD                     0x0037             /*!< Last level data cache read miss */
+#define ARM_PMU_L1D_CACHE_MISS_RD                    0x0039             /*!< Level 1 data cache read miss */
+#define ARM_PMU_OP_COMPLETE                          0x003A             /*!< Operation retired */
+#define ARM_PMU_OP_SPEC                              0x003B             /*!< Operation speculatively executed */
+#define ARM_PMU_STALL                                0x003C             /*!< Stall cycle for instruction or operation not sent for execution */
+#define ARM_PMU_STALL_OP_BACKEND                     0x003D             /*!< Stall cycle for instruction or operation not sent for execution due to pipeline backend */
+#define ARM_PMU_STALL_OP_FRONTEND                    0x003E             /*!< Stall cycle for instruction or operation not sent for execution due to pipeline frontend */
+#define ARM_PMU_STALL_OP                             0x003F             /*!< Instruction or operation slots not occupied each cycle */
+#define ARM_PMU_L1D_CACHE_RD                         0x0040             /*!< Level 1 data cache read */
+#define ARM_PMU_LE_RETIRED                           0x0100             /*!< Loop end instruction executed */
+#define ARM_PMU_LE_SPEC                              0x0101             /*!< Loop end instruction speculatively executed */
+#define ARM_PMU_BF_RETIRED                           0x0104             /*!< Branch future instruction architecturally executed and condition code check pass */
+#define ARM_PMU_BF_SPEC                              0x0105             /*!< Branch future instruction speculatively executed and condition code check pass */
+#define ARM_PMU_LE_CANCEL                            0x0108             /*!< Loop end instruction not taken */
+#define ARM_PMU_BF_CANCEL                            0x0109             /*!< Branch future instruction not taken */
+#define ARM_PMU_SE_CALL_S                            0x0114             /*!< Call to secure function, resulting in Security state change */
+#define ARM_PMU_SE_CALL_NS                           0x0115             /*!< Call to non-secure function, resulting in Security state change */
+#define ARM_PMU_DWT_CMPMATCH0                        0x0118             /*!< DWT comparator 0 match */
+#define ARM_PMU_DWT_CMPMATCH1                        0x0119             /*!< DWT comparator 1 match */
+#define ARM_PMU_DWT_CMPMATCH2                        0x011A             /*!< DWT comparator 2 match */
+#define ARM_PMU_DWT_CMPMATCH3                        0x011B             /*!< DWT comparator 3 match */
+#define ARM_PMU_MVE_INST_RETIRED                     0x0200             /*!< MVE instruction architecturally executed */
+#define ARM_PMU_MVE_INST_SPEC                        0x0201             /*!< MVE instruction speculatively executed */
+#define ARM_PMU_MVE_FP_RETIRED                       0x0204             /*!< MVE floating-point instruction architecturally executed */
+#define ARM_PMU_MVE_FP_SPEC                          0x0205             /*!< MVE floating-point instruction speculatively executed */
+#define ARM_PMU_MVE_FP_HP_RETIRED                    0x0208             /*!< MVE half-precision floating-point instruction architecturally executed */
+#define ARM_PMU_MVE_FP_HP_SPEC                       0x0209             /*!< MVE half-precision floating-point instruction speculatively executed */
+#define ARM_PMU_MVE_FP_SP_RETIRED                    0x020C             /*!< MVE single-precision floating-point instruction architecturally executed */
+#define ARM_PMU_MVE_FP_SP_SPEC                       0x020D             /*!< MVE single-precision floating-point instruction speculatively executed */
+#define ARM_PMU_MVE_FP_MAC_RETIRED                   0x0214             /*!< MVE floating-point multiply or multiply-accumulate instruction architecturally executed */
+#define ARM_PMU_MVE_FP_MAC_SPEC                      0x0215             /*!< MVE floating-point multiply or multiply-accumulate instruction speculatively executed */
+#define ARM_PMU_MVE_INT_RETIRED                      0x0224             /*!< MVE integer instruction architecturally executed */
+#define ARM_PMU_MVE_INT_SPEC                         0x0225             /*!< MVE integer instruction speculatively executed */
+#define ARM_PMU_MVE_INT_MAC_RETIRED                  0x0228             /*!< MVE multiply or multiply-accumulate instruction architecturally executed */
+#define ARM_PMU_MVE_INT_MAC_SPEC                     0x0229             /*!< MVE multiply or multiply-accumulate instruction speculatively executed */
+#define ARM_PMU_MVE_LDST_RETIRED                     0x0238             /*!< MVE load or store instruction architecturally executed */
+#define ARM_PMU_MVE_LDST_SPEC                        0x0239             /*!< MVE load or store instruction speculatively executed */
+#define ARM_PMU_MVE_LD_RETIRED                       0x023C             /*!< MVE load instruction architecturally executed */
+#define ARM_PMU_MVE_LD_SPEC                          0x023D             /*!< MVE load instruction speculatively executed */
+#define ARM_PMU_MVE_ST_RETIRED                       0x0240             /*!< MVE store instruction architecturally executed */
+#define ARM_PMU_MVE_ST_SPEC                          0x0241             /*!< MVE store instruction speculatively executed */
+#define ARM_PMU_MVE_LDST_CONTIG_RETIRED              0x0244             /*!< MVE contiguous load or store instruction architecturally executed */
+#define ARM_PMU_MVE_LDST_CONTIG_SPEC                 0x0245             /*!< MVE contiguous load or store instruction speculatively executed */
+#define ARM_PMU_MVE_LD_CONTIG_RETIRED                0x0248             /*!< MVE contiguous load instruction architecturally executed */
+#define ARM_PMU_MVE_LD_CONTIG_SPEC                   0x0249             /*!< MVE contiguous load instruction speculatively executed */
+#define ARM_PMU_MVE_ST_CONTIG_RETIRED                0x024C             /*!< MVE contiguous store instruction architecturally executed */
+#define ARM_PMU_MVE_ST_CONTIG_SPEC                   0x024D             /*!< MVE contiguous store instruction speculatively executed */
+#define ARM_PMU_MVE_LDST_NONCONTIG_RETIRED           0x0250             /*!< MVE non-contiguous load or store instruction architecturally executed */
+#define ARM_PMU_MVE_LDST_NONCONTIG_SPEC              0x0251             /*!< MVE non-contiguous load or store instruction speculatively executed */
+#define ARM_PMU_MVE_LD_NONCONTIG_RETIRED             0x0254             /*!< MVE non-contiguous load instruction architecturally executed */
+#define ARM_PMU_MVE_LD_NONCONTIG_SPEC                0x0255             /*!< MVE non-contiguous load instruction speculatively executed */
+#define ARM_PMU_MVE_ST_NONCONTIG_RETIRED             0x0258             /*!< MVE non-contiguous store instruction architecturally executed */
+#define ARM_PMU_MVE_ST_NONCONTIG_SPEC                0x0259             /*!< MVE non-contiguous store instruction speculatively executed */
+#define ARM_PMU_MVE_LDST_MULTI_RETIRED               0x025C             /*!< MVE memory instruction targeting multiple registers architecturally executed */
+#define ARM_PMU_MVE_LDST_MULTI_SPEC                  0x025D             /*!< MVE memory instruction targeting multiple registers speculatively executed */
+#define ARM_PMU_MVE_LD_MULTI_RETIRED                 0x0260             /*!< MVE memory load instruction targeting multiple registers architecturally executed */
+#define ARM_PMU_MVE_LD_MULTI_SPEC                    0x0261             /*!< MVE memory load instruction targeting multiple registers speculatively executed */
+#define ARM_PMU_MVE_ST_MULTI_RETIRED                 0x0264             /*!< MVE memory store instruction targeting multiple registers architecturally executed */
+#define ARM_PMU_MVE_ST_MULTI_SPEC                    0x0265             /*!< MVE memory store instruction targeting multiple registers speculatively executed */
+#define ARM_PMU_MVE_LDST_UNALIGNED_RETIRED           0x028C             /*!< MVE unaligned memory load or store instruction architecturally executed */
+#define ARM_PMU_MVE_LDST_UNALIGNED_SPEC              0x028D             /*!< MVE unaligned memory load or store instruction speculatively executed */
+#define ARM_PMU_MVE_LD_UNALIGNED_RETIRED             0x0290             /*!< MVE unaligned load instruction architecturally executed */
+#define ARM_PMU_MVE_LD_UNALIGNED_SPEC                0x0291             /*!< MVE unaligned load instruction speculatively executed */
+#define ARM_PMU_MVE_ST_UNALIGNED_RETIRED             0x0294             /*!< MVE unaligned store instruction architecturally executed */
+#define ARM_PMU_MVE_ST_UNALIGNED_SPEC                0x0295             /*!< MVE unaligned store instruction speculatively executed */
+#define ARM_PMU_MVE_LDST_UNALIGNED_NONCONTIG_RETIRED 0x0298             /*!< MVE unaligned noncontiguous load or store instruction architecturally executed */
+#define ARM_PMU_MVE_LDST_UNALIGNED_NONCONTIG_SPEC    0x0299             /*!< MVE unaligned noncontiguous load or store instruction speculatively executed */
+#define ARM_PMU_MVE_VREDUCE_RETIRED                  0x02A0             /*!< MVE vector reduction instruction architecturally executed */
+#define ARM_PMU_MVE_VREDUCE_SPEC                     0x02A1             /*!< MVE vector reduction instruction speculatively executed */
+#define ARM_PMU_MVE_VREDUCE_FP_RETIRED               0x02A4             /*!< MVE floating-point vector reduction instruction architecturally executed */
+#define ARM_PMU_MVE_VREDUCE_FP_SPEC                  0x02A5             /*!< MVE floating-point vector reduction instruction speculatively executed */
+#define ARM_PMU_MVE_VREDUCE_INT_RETIRED              0x02A8             /*!< MVE integer vector reduction instruction architecturally executed */
+#define ARM_PMU_MVE_VREDUCE_INT_SPEC                 0x02A9             /*!< MVE integer vector reduction instruction speculatively executed */
+#define ARM_PMU_MVE_PRED                             0x02B8             /*!< Cycles where one or more predicated beats architecturally executed */
+#define ARM_PMU_MVE_STALL                            0x02CC             /*!< Stall cycles caused by an MVE instruction */
+#define ARM_PMU_MVE_STALL_RESOURCE                   0x02CD             /*!< Stall cycles caused by an MVE instruction because of resource conflicts */
+#define ARM_PMU_MVE_STALL_RESOURCE_MEM               0x02CE             /*!< Stall cycles caused by an MVE instruction because of memory resource conflicts */
+#define ARM_PMU_MVE_STALL_RESOURCE_FP                0x02CF             /*!< Stall cycles caused by an MVE instruction because of floating-point resource conflicts */
+#define ARM_PMU_MVE_STALL_RESOURCE_INT               0x02D0             /*!< Stall cycles caused by an MVE instruction because of integer resource conflicts */
+#define ARM_PMU_MVE_STALL_BREAK                      0x02D3             /*!< Stall cycles caused by an MVE chain break */
+#define ARM_PMU_MVE_STALL_DEPENDENCY                 0x02D4             /*!< Stall cycles caused by MVE register dependency */
+#define ARM_PMU_ITCM_ACCESS                          0x4007             /*!< Instruction TCM access */
+#define ARM_PMU_DTCM_ACCESS                          0x4008             /*!< Data TCM access */
+#define ARM_PMU_TRCEXTOUT0                           0x4010             /*!< ETM external output 0 */
+#define ARM_PMU_TRCEXTOUT1                           0x4011             /*!< ETM external output 1 */
+#define ARM_PMU_TRCEXTOUT2                           0x4012             /*!< ETM external output 2 */
+#define ARM_PMU_TRCEXTOUT3                           0x4013             /*!< ETM external output 3 */
+#define ARM_PMU_CTI_TRIGOUT4                         0x4018             /*!< Cross-trigger Interface output trigger 4 */
+#define ARM_PMU_CTI_TRIGOUT5                         0x4019             /*!< Cross-trigger Interface output trigger 5 */
+#define ARM_PMU_CTI_TRIGOUT6                         0x401A             /*!< Cross-trigger Interface output trigger 6 */
+#define ARM_PMU_CTI_TRIGOUT7                         0x401B             /*!< Cross-trigger Interface output trigger 7 */
+
+/** \brief PMU Functions */
+
+__STATIC_INLINE void ARM_PMU_Enable(void);
+__STATIC_INLINE void ARM_PMU_Disable(void);
+
+__STATIC_INLINE void ARM_PMU_Set_EVTYPER(uint32_t num, uint32_t type);
+
+__STATIC_INLINE void ARM_PMU_CYCCNT_Reset(void);
+__STATIC_INLINE void ARM_PMU_EVCNTR_ALL_Reset(void);
+
+__STATIC_INLINE void ARM_PMU_CNTR_Enable(uint32_t mask);
+__STATIC_INLINE void ARM_PMU_CNTR_Disable(uint32_t mask);
+
+__STATIC_INLINE uint32_t ARM_PMU_Get_CCNTR(void);
+__STATIC_INLINE uint32_t ARM_PMU_Get_EVCNTR(uint32_t num);
+
+__STATIC_INLINE uint32_t ARM_PMU_Get_CNTR_OVS(void);
+__STATIC_INLINE void ARM_PMU_Set_CNTR_OVS(uint32_t mask);
+
+__STATIC_INLINE void ARM_PMU_Set_CNTR_IRQ_Enable(uint32_t mask);
+__STATIC_INLINE void ARM_PMU_Set_CNTR_IRQ_Disable(uint32_t mask);
+
+__STATIC_INLINE void ARM_PMU_CNTR_Increment(uint32_t mask);
+
+/**
+  \brief   Enable the PMU (sets PMU_CTRL_ENABLE_Msk in PMU->CTRL)
+*/
+__STATIC_INLINE void ARM_PMU_Enable(void)
+{
+  PMU->CTRL |= PMU_CTRL_ENABLE_Msk;
+}
+
+/**
+  \brief   Disable the PMU (clears PMU_CTRL_ENABLE_Msk in PMU->CTRL)
+*/
+__STATIC_INLINE void ARM_PMU_Disable(void)
+{
+  PMU->CTRL &= ~PMU_CTRL_ENABLE_Msk;
+}
+
+/**
+  \brief   Set event to count for PMU event counter
+  \param [in]    num     Event counter (0-30) to configure; used unchecked as the index into PMU->EVTYPER
+  \param [in]    type    Event to count; written as-is to PMU->EVTYPER[num]
+*/
+__STATIC_INLINE void ARM_PMU_Set_EVTYPER(uint32_t num, uint32_t type)
+{
+  PMU->EVTYPER[num] = type;
+}
+
+/**
+  \brief  Reset cycle counter (sets PMU_CTRL_CYCCNT_RESET_Msk in PMU->CTRL)
+*/
+__STATIC_INLINE void ARM_PMU_CYCCNT_Reset(void)
+{
+  PMU->CTRL |= PMU_CTRL_CYCCNT_RESET_Msk;
+}
+
+/**
+  \brief  Reset all event counters (sets PMU_CTRL_EVENTCNT_RESET_Msk in PMU->CTRL)
+*/
+__STATIC_INLINE void ARM_PMU_EVCNTR_ALL_Reset(void)
+{
+  PMU->CTRL |= PMU_CTRL_EVENTCNT_RESET_Msk;
+}
+
+/**
+  \brief  Enable counters
+  \param [in]     mask    Counters to enable (written to PMU->CNTENSET)
+  \note   Enables one or more of the following:
+          - event counters (0-30)
+          - cycle counter
+*/
+__STATIC_INLINE void ARM_PMU_CNTR_Enable(uint32_t mask)
+{
+  PMU->CNTENSET = mask;
+}
+
+/**
+  \brief  Disable counters
+  \param [in]     mask    Counters to disable (written to PMU->CNTENCLR)
+  \note   Disables one or more of the following:
+          - event counters (0-30)
+          - cycle counter
+*/
+__STATIC_INLINE void ARM_PMU_CNTR_Disable(uint32_t mask)
+{
+  PMU->CNTENCLR = mask;
+}
+
+/**
+  \brief  Read cycle counter
+  \return                 Cycle count (value read from PMU->CCNTR)
+*/
+__STATIC_INLINE uint32_t ARM_PMU_Get_CCNTR(void)
+{
+  return PMU->CCNTR;
+}
+
+/**
+  \brief   Read event counter
+  \param [in]     num     Event counter (0-30) to read; used unchecked as the index into PMU->EVCNTR
+  \return                 Event count (full 32-bit value read from PMU->EVCNTR[num], unmasked)
+*/
+__STATIC_INLINE uint32_t ARM_PMU_Get_EVCNTR(uint32_t num)
+{
+  return PMU->EVCNTR[num];
+}
+
+/**
+  \brief   Read counter overflow status (value read from PMU->OVSSET)
+  \return  Counter overflow status bits for the following:
+          - event counters (0-30)
+          - cycle counter
+*/
+__STATIC_INLINE uint32_t ARM_PMU_Get_CNTR_OVS(void)
+{
+  return PMU->OVSSET;
+}
+
+/**
+  \brief   Clear counter overflow status
+  \param [in]     mask    Counter overflow status bits to clear (written to PMU->OVSCLR)
+  \note    Despite the "Set" in its name, this clears overflow status bits for one or more of the following:
+           - event counters (0-30)
+           - cycle counter
+*/
+__STATIC_INLINE void ARM_PMU_Set_CNTR_OVS(uint32_t mask)
+{
+  PMU->OVSCLR = mask;
+}
+
+/**
+  \brief   Enable counter overflow interrupt request
+  \param [in]     mask    Counter overflow interrupt request bits to set (written to PMU->INTENSET)
+  \note    Sets overflow interrupt request bits for one or more of the following:
+           - event counters (0-30)
+           - cycle counter
+*/
+__STATIC_INLINE void ARM_PMU_Set_CNTR_IRQ_Enable(uint32_t mask)
+{
+  PMU->INTENSET = mask;
+}
+
+/**
+  \brief   Disable counter overflow interrupt request
+  \param [in]     mask    Counter overflow interrupt request bits to clear (written to PMU->INTENCLR)
+  \note    Clears overflow interrupt request bits for one or more of the following:
+           - event counters (0-30)
+           - cycle counter
+*/
+__STATIC_INLINE void ARM_PMU_Set_CNTR_IRQ_Disable(uint32_t mask)
+{
+  PMU->INTENCLR = mask;
+}
+
+/**
+  \brief   Software increment event counter
+  \param [in]     mask    Counters to increment (written to PMU->SWINC)
+  \note    Software increment bits for one or more event counters (0-30)
+*/
+__STATIC_INLINE void ARM_PMU_CNTR_Increment(uint32_t mask)
+{
+  PMU->SWINC = mask;
+}
+
+#endif
--- a/libraries/cmsis/cm4/device_support/at32f403a_407.h
+++ b/libraries/cmsis/cm4/device_support/at32f403a_407.h
@@ -1,17 +1,15 @@
 /**
  **************************************************************************
  * @file     at32f403a_407.h
-  * @version  v2.0.4
-  * @date     2021-11-26
  * @brief    at32f403a_407 header file
  **************************************************************************
  *                       Copyright notice & Disclaimer
  *
-  * The software Board Support Package (BSP) that is made available to 
-  * download from Artery official website is the copyrighted work of Artery. 
-  * Artery authorizes customers to use, copy, and distribute the BSP 
-  * software and its related documentation for the purpose of design and 
-  * development in conjunction with Artery microcontrollers. Use of the 
+  * The software Board Support Package (BSP) that is made available to
+  * download from Artery official website is the copyrighted work of Artery.
+  * Artery authorizes customers to use, copy, and distribute the BSP
+  * software and its related documentation for the purpose of design and
+  * development in conjunction with Artery microcontrollers. Use of the
  * software is governed by this copyright notice and the following disclaimer.
  *
  * THIS SOFTWARE IS PROVIDED ON "AS IS" BASIS WITHOUT WARRANTIES,
@@ -42,7 +40,7 @@ extern "C" {
 /** @addtogroup AT32F403A_407
  * @{
  */
-  
+
 /** @addtogroup Library_configuration_section
  * @{
  */
@@ -60,7 +58,7 @@ extern "C" {
    !defined (AT32F407VCT7)  && !defined (AT32F407RET7)  && !defined (AT32F407VET7)  && \
    !defined (AT32F407AVCT7) && !defined (AT32F407AVGT7)

-    #error "Please select first the target at32f4xx device used in your application (in at32f4xx.h file)"
+    #error "Please select first the target device used in your application (in at32f403a_407.h file)"
 #endif

 #if defined (AT32F403AVCT7) || defined (AT32F403ARCT7) || defined (AT32F403ACCT7) || \
@@ -78,11 +76,38 @@ extern "C" {
    #define AT32F407xx
 #endif

+#if defined (AT32F403AVCT7) || defined (AT32F403AVET7) || defined (AT32F403AVGT7)
+
+    #define AT32F403AVx
+#endif
+
+#if defined (AT32F403ARCT7) || defined (AT32F403ARET7) || defined (AT32F403ARGT7)
+
+    #define AT32F403ARx
+#endif
+
+#if defined (AT32F403ACCT7) || defined (AT32F403ACCU7) || defined (AT32F403ACET7) || \
+    defined (AT32F403ACEU7) || defined (AT32F403ACGT7) || defined (AT32F403ACGU7)
+   
+    #define AT32F403ACx
+#endif
+
+#if defined (AT32F407RCT7)  || defined (AT32F407RET7)  || defined (AT32F407RGT7)
+
+    #define AT32F407Rx
+#endif
+
+#if defined (AT32F407VCT7)  || defined (AT32F407VET7)  || defined (AT32F407VGT7)  || \
+    defined (AT32F407AVCT7) || defined (AT32F407AVGT7)
+
+    #define AT32F407Vx
+#endif
+
 #ifndef USE_STDPERIPH_DRIVER
 /**
  * @brief comment the line below if you will not use the peripherals drivers.
-  * in this case, these drivers will not be included and the application code will 
-  * be based on direct access to peripherals registers 
+  * in this case, these drivers will not be included and the application code will
+  * be based on direct access to peripherals registers
  */
  #ifdef _RTE_
    #include "RTE_Components.h"
@@ -96,7 +121,7 @@ extern "C" {
  * @brief at32f403a_407 standard peripheral library version number
  */
 #define __AT32F403A_407_LIBRARY_VERSION_MAJOR    (0x02) /*!< [31:24] major version */
-#define __AT32F403A_407_LIBRARY_VERSION_MIDDLE   (0x00) /*!< [23:16] middle version */
+#define __AT32F403A_407_LIBRARY_VERSION_MIDDLE   (0x01) /*!< [23:16] middle version */
 #define __AT32F403A_407_LIBRARY_VERSION_MINOR    (0x04) /*!< [15:8]  minor version */
 #define __AT32F403A_407_LIBRARY_VERSION_RC       (0x00) /*!< [7:0]  release candidate */
 #define __AT32F403A_407_LIBRARY_VERSION          ((__AT32F403A_407_LIBRARY_VERSION_MAJOR << 24)  | \
@@ -293,7 +318,7 @@ typedef enum IRQn

 /** @addtogroup Exported_types
  * @{
-  */  
+  */

 typedef int32_t  INT32;
 typedef int16_t  INT16;
@@ -335,19 +360,19 @@ typedef __I uint16_t vuc16;   /*!< read only */
 typedef __I uint8_t  vuc8;    /*!< read only */

 /**
-  * @brief flag status 
+  * @brief flag status
  */
-typedef enum {RESET = 0, SET = !RESET} flag_status;  
+typedef enum {RESET = 0, SET = !RESET} flag_status;

 /**
  * @brief confirm state
-  */       
-typedef enum {FALSE = 0, TRUE = !FALSE} confirm_state;     
+  */
+typedef enum {FALSE = 0, TRUE = !FALSE} confirm_state;

 /**
  * @brief error status
-  */  
-typedef enum {ERROR = 0, SUCCESS = !ERROR} error_status;   
+  */
+typedef enum {ERROR = 0, SUCCESS = !ERROR} error_status;

 /**
  * @}
@@ -415,7 +440,7 @@ typedef enum {ERROR = 0, SUCCESS = !ERROR} error_status;
 #define BPR_BASE                         (APB1PERIPH_BASE + 0x6C00)
 #define PWC_BASE                         (APB1PERIPH_BASE + 0x7000)
 #define DAC_BASE                         (APB1PERIPH_BASE + 0x7400)
-/* apb2 bus base address */              
+/* apb2 bus base address */
 #define IOMUX_BASE                       (APB2PERIPH_BASE + 0x0000)
 #define EXINT_BASE                       (APB2PERIPH_BASE + 0x0400)
 #define GPIOA_BASE                       (APB2PERIPH_BASE + 0x0800)
@@ -441,7 +466,7 @@ typedef enum {ERROR = 0, SUCCESS = !ERROR} error_status;
 #define I2S2EXT_BASE                     (APB2PERIPH_BASE + 0x6C00)
 #define I2S3EXT_BASE                     (APB2PERIPH_BASE + 0x7000)
 #define SDIO1_BASE                       (APB2PERIPH_BASE + 0x8000)
-/* ahb bus base address */               
+/* ahb bus base address */
 #define DMA1_BASE                        (AHBPERIPH_BASE + 0x0000)
 #define DMA1_CHANNEL1_BASE               (AHBPERIPH_BASE + 0x0008)
 #define DMA1_CHANNEL2_BASE               (AHBPERIPH_BASE + 0x001C)
@@ -499,7 +524,7 @@ typedef enum {ERROR = 0, SUCCESS = !ERROR} error_status;
 #define BPR_BASE                         (APB1PERIPH_BASE + 0x6C00)
 #define PWC_BASE                         (APB1PERIPH_BASE + 0x7000)
 #define DAC_BASE                         (APB1PERIPH_BASE + 0x7400)
-/* apb2 bus base address */              
+/* apb2 bus base address */
 #define IOMUX_BASE                       (APB2PERIPH_BASE + 0x0000)
 #define EXINT_BASE                       (APB2PERIPH_BASE + 0x0400)
 #define GPIOA_BASE                       (APB2PERIPH_BASE + 0x0800)
@@ -525,7 +550,7 @@ typedef enum {ERROR = 0, SUCCESS = !ERROR} error_status;
 #define I2S2EXT_BASE                     (APB2PERIPH_BASE + 0x6C00)
 #define I2S3EXT_BASE                     (APB2PERIPH_BASE + 0x7000)
 #define SDIO1_BASE                       (APB2PERIPH_BASE + 0x8000)
-/* ahb bus base address */               
+/* ahb bus base address */
 #define DMA1_BASE                        (AHBPERIPH_BASE + 0x0000)
 #define DMA1_CHANNEL1_BASE               (AHBPERIPH_BASE + 0x0008)
 #define DMA1_CHANNEL2_BASE               (AHBPERIPH_BASE + 0x001C)
@@ -548,11 +573,7 @@ typedef enum {ERROR = 0, SUCCESS = !ERROR} error_status;
 #define SDIO2_BASE                       (AHBPERIPH_BASE + 0x3400)
 #define EMAC_BASE                        (AHBPERIPH_BASE + 0x8000)
 #define XMC_BANK1_REG_BASE               (XMC_REG_BASE + 0x0000)
-#define XMC_BANK1E_REG_BASE              (XMC_REG_BASE + 0x0104)
-#define XMC_BANK1E_H_BASE                (XMC_REG_BASE + 0x0220)
 #define XMC_BANK2_REG_BASE               (XMC_REG_BASE + 0x0060)
-#define XMC_BANK3_REG_BASE               (XMC_REG_BASE + 0x0080)
-#define XMC_BANK4_REG_BASE               (XMC_REG_BASE + 0x00A0)
 #define EMAC_MMC_BASE                    (EMAC_BASE + 0x0100)
 #define EMAC_PTP_BASE                    (EMAC_BASE + 0x0700)
 #define EMAC_DMA_BASE                    (EMAC_BASE + 0x1000)
@@ -569,7 +590,8 @@ typedef enum {ERROR = 0, SUCCESS = !ERROR} error_status;
 /**
  * @}
  */
-  
+
+#include "at32f403a_407_def.h"
 #include "at32f403a_407_conf.h"

 #ifdef __cplusplus
--- a/libraries/cmsis/cm4/device_support/at32f403a_407_conf_template.h
+++ b/libraries/cmsis/cm4/device_support/at32f403a_407_conf_template.h
@@ -1,17 +1,15 @@
 /**
  **************************************************************************
  * @file     at32f403a_407_conf.h
-  * @version  v2.0.4
-  * @date     2021-11-26
  * @brief    at32f403a_407 config header file
  **************************************************************************
  *                       Copyright notice & Disclaimer
  *
-  * The software Board Support Package (BSP) that is made available to 
-  * download from Artery official website is the copyrighted work of Artery. 
-  * Artery authorizes customers to use, copy, and distribute the BSP 
-  * software and its related documentation for the purpose of design and 
-  * development in conjunction with Artery microcontrollers. Use of the 
+  * The software Board Support Package (BSP) that is made available to
+  * download from Artery official website is the copyrighted work of Artery.
+  * Artery authorizes customers to use, copy, and distribute the BSP
+  * software and its related documentation for the purpose of design and
+  * development in conjunction with Artery microcontrollers. Use of the
  * software is governed by this copyright notice and the following disclaimer.
  *
  * THIS SOFTWARE IS PROVIDED ON "AS IS" BASIS WITHOUT WARRANTIES,
@@ -31,7 +29,7 @@
 #ifdef __cplusplus
 extern "C" {
 #endif
- 
+

 /**
  * @brief in the following line adjust the value of high speed exernal crystal (hext)
@@ -47,8 +45,9 @@ extern "C" {
  * @brief in the following line adjust the high speed exernal crystal (hext) startup
  * timeout value
  */
-#define HEXT_STARTUP_TIMEOUT     ((uint16_t)0x3000) /*!< time out for hext start up */
-#define HICK_VALUE               ((uint32_t)8000000) /*!< value of the high speed internal clock in hz */
+#define HEXT_STARTUP_TIMEOUT             ((uint16_t)0x3000)  /*!< time out for hext start up */
+#define HICK_VALUE                       ((uint32_t)8000000) /*!< value of the high speed internal clock in hz */
+#define LEXT_VALUE                       ((uint32_t)32768)   /*!< value of the low speed external clock in hz */

 /* module define -------------------------------------------------------------*/
 #define CRM_MODULE_ENABLED
--- a/libraries/cmsis/cm4/device_support/startup/gcc/linker/AT32F403AxC_FLASH.ld
+++ b/libraries/cmsis/cm4/device_support/startup/gcc/linker/AT32F403AxC_FLASH.ld
@@ -22,7 +22,7 @@
 ENTRY(Reset_Handler)

 /* Highest address of the user mode stack */
-_estack = 0x20017FFF;    /* end of RAM */
+_estack = 0x20018000;    /* end of RAM */

 /* Generate a link error if heap and stack don't fit into RAM */
 _Min_Heap_Size = 0x200;      /* required amount of heap  */
@@ -33,6 +33,7 @@ MEMORY
 {
 FLASH (rx)      : ORIGIN = 0x08000000, LENGTH = 256K
 RAM (xrw)       : ORIGIN = 0x20000000, LENGTH = 96K
+SPIM (rx)       : ORIGIN = 0x08400000, LENGTH = 16384K
 }

 /* Define output sections */
@@ -115,6 +116,19 @@ SECTIONS
    _edata = .;        /* define a global symbol at data end */
  } >RAM AT> FLASH

+  _spim_init_base = LOADADDR(.spim);      /* load address of the .spim output section */
+  _spim_init_length = SIZEOF(.spim);      /* size of the .spim output section in bytes */
+  
+  .spim :
+  {
+    . = ALIGN(4);
+    _spim_start = .;        /* define a global symbol at spim start */
+    *(.spim)                /* input sections named exactly .spim */
+    *(.spim*)               /* input sections whose name starts with .spim */
+    . = ALIGN(4);
+    _spim_end = .;         /* define a global symbol at spim end */
+  } >SPIM
+
  /* Uninitialized data section */
  . = ALIGN(4);
  .bss :
@@ -134,12 +148,12 @@ SECTIONS
  /* User_heap_stack section, used to check that there is enough RAM left */
  ._user_heap_stack :
  {
-    . = ALIGN(4);
+    . = ALIGN(8);
    PROVIDE ( end = . );
    PROVIDE ( _end = . );
    . = . + _Min_Heap_Size;
    . = . + _Min_Stack_Size;
-    . = ALIGN(4);
+    . = ALIGN(8);
  } >RAM

  /* Remove information from the standard libraries */
--- a/libraries/cmsis/cm4/device_support/startup/gcc/linker/AT32F403AxE_FLASH.ld
+++ b/libraries/cmsis/cm4/device_support/startup/gcc/linker/AT32F403AxE_FLASH.ld
@@ -22,7 +22,7 @@
 ENTRY(Reset_Handler)

 /* Highest address of the user mode stack */
-_estack = 0x20017FFF;    /* end of RAM */
+_estack = 0x20018000;    /* end of RAM */

 /* Generate a link error if heap and stack don't fit into RAM */
 _Min_Heap_Size = 0x200;      /* required amount of heap  */
@@ -33,6 +33,7 @@ MEMORY
 {
 FLASH (rx)      : ORIGIN = 0x08000000, LENGTH = 512K
 RAM (xrw)       : ORIGIN = 0x20000000, LENGTH = 96K
+SPIM (rx)       : ORIGIN = 0x08400000, LENGTH = 16384K
 }

 /* Define output sections */
@@ -115,6 +116,19 @@ SECTIONS
    _edata = .;        /* define a global symbol at data end */
  } >RAM AT> FLASH

+  _spim_init_base = LOADADDR(.spim);      /* load address of the .spim output section */
+  _spim_init_length = SIZEOF(.spim);      /* size of the .spim output section in bytes */
+  
+  .spim :
+  {
+    . = ALIGN(4);
+    _spim_start = .;        /* define a global symbol at spim start */
+    *(.spim)                /* input sections named exactly .spim */
+    *(.spim*)               /* input sections whose name starts with .spim */
+    . = ALIGN(4);
+    _spim_end = .;         /* define a global symbol at spim end */
+  } >SPIM
+
  /* Uninitialized data section */
  . = ALIGN(4);
  .bss :
@@ -134,12 +148,12 @@ SECTIONS
  /* User_heap_stack section, used to check that there is enough RAM left */
  ._user_heap_stack :
  {
-    . = ALIGN(4);
+    . = ALIGN(8);
    PROVIDE ( end = . );
    PROVIDE ( _end = . );
    . = . + _Min_Heap_Size;
    . = . + _Min_Stack_Size;
-    . = ALIGN(4);
+    . = ALIGN(8);
  } >RAM

  /* Remove information from the standard libraries */
--- a/libraries/cmsis/cm4/device_support/startup/gcc/linker/AT32F403AxG_FLASH.ld
+++ b/libraries/cmsis/cm4/device_support/startup/gcc/linker/AT32F403AxG_FLASH.ld
@@ -22,7 +22,7 @@
 ENTRY(Reset_Handler)

 /* Highest address of the user mode stack */
-_estack = 0x20017FFF;    /* end of RAM */
+_estack = 0x20018000;    /* end of RAM */

 /* Generate a link error if heap and stack don't fit into RAM */
 _Min_Heap_Size = 0x200;      /* required amount of heap  */
@@ -33,6 +33,7 @@ MEMORY
 {
 FLASH (rx)      : ORIGIN = 0x08000000, LENGTH = 1000K
 RAM (xrw)       : ORIGIN = 0x20000000, LENGTH = 96K
+SPIM (rx)       : ORIGIN = 0x08400000, LENGTH = 16384K
 }

 /* Define output sections */
@@ -115,6 +116,19 @@ SECTIONS
    _edata = .;        /* define a global symbol at data end */
  } >RAM AT> FLASH

+  _spim_init_base = LOADADDR(.spim);      /* load address of the .spim output section */
+  _spim_init_length = SIZEOF(.spim);      /* size of the .spim output section in bytes */
+  
+  .spim :
+  {
+    . = ALIGN(4);
+    _spim_start = .;        /* define a global symbol at spim start */
+    *(.spim)                /* input sections named exactly .spim */
+    *(.spim*)               /* input sections whose name starts with .spim */
+    . = ALIGN(4);
+    _spim_end = .;         /* define a global symbol at spim end */
+  } >SPIM
+
  /* Uninitialized data section */
  . = ALIGN(4);
  .bss :
@@ -134,12 +148,12 @@ SECTIONS
  /* User_heap_stack section, used to check that there is enough RAM left */
  ._user_heap_stack :
  {
-    . = ALIGN(4);
+    . = ALIGN(8);
    PROVIDE ( end = . );
    PROVIDE ( _end = . );
    . = . + _Min_Heap_Size;
    . = . + _Min_Stack_Size;
-    . = ALIGN(4);
+    . = ALIGN(8);
  } >RAM

  /* Remove information from the standard libraries */
--- a/libraries/cmsis/cm4/device_support/startup/gcc/linker/AT32F407xC_FLASH.ld
+++ b/libraries/cmsis/cm4/device_support/startup/gcc/linker/AT32F407xC_FLASH.ld
@@ -22,7 +22,7 @@
 ENTRY(Reset_Handler)

 /* Highest address of the user mode stack */
-_estack = 0x20017FFF;    /* end of RAM */
+_estack = 0x20018000;    /* end of RAM */

 /* Generate a link error if heap and stack don't fit into RAM */
 _Min_Heap_Size = 0x200;      /* required amount of heap  */
@@ -33,6 +33,7 @@ MEMORY
 {
 FLASH (rx)      : ORIGIN = 0x08000000, LENGTH = 256K
 RAM (xrw)       : ORIGIN = 0x20000000, LENGTH = 96K
+SPIM (rx)       : ORIGIN = 0x08400000, LENGTH = 16384K
 }

 /* Define output sections */
@@ -115,6 +116,19 @@ SECTIONS
    _edata = .;        /* define a global symbol at data end */
  } >RAM AT> FLASH

+  _spim_init_base = LOADADDR(.spim);      /* load address of the .spim output section */
+  _spim_init_length = SIZEOF(.spim);      /* size of the .spim output section in bytes */
+  
+  .spim :
+  {
+    . = ALIGN(4);
+    _spim_start = .;        /* define a global symbol at spim start */
+    *(.spim)                /* input sections named exactly .spim */
+    *(.spim*)               /* input sections whose name starts with .spim */
+    . = ALIGN(4);
+    _spim_end = .;         /* define a global symbol at spim end */
+  } >SPIM
+
  /* Uninitialized data section */
  . = ALIGN(4);
  .bss :
@@ -134,12 +148,12 @@ SECTIONS
  /* User_heap_stack section, used to check that there is enough RAM left */
  ._user_heap_stack :
  {
-    . = ALIGN(4);
+    . = ALIGN(8);
    PROVIDE ( end = . );
    PROVIDE ( _end = . );
    . = . + _Min_Heap_Size;
    . = . + _Min_Stack_Size;
-    . = ALIGN(4);
+    . = ALIGN(8);
  } >RAM

  /* Remove information from the standard libraries */
--- a/libraries/cmsis/cm4/device_support/startup/gcc/linker/AT32F407xE_FLASH.ld
+++ b/libraries/cmsis/cm4/device_support/startup/gcc/linker/AT32F407xE_FLASH.ld
@@ -22,7 +22,7 @@
 ENTRY(Reset_Handler)

 /* Highest address of the user mode stack */
-_estack = 0x20017FFF;    /* end of RAM */
+_estack = 0x20018000;    /* end of RAM */

 /* Generate a link error if heap and stack don't fit into RAM */
 _Min_Heap_Size = 0x200;      /* required amount of heap  */
@@ -33,6 +33,7 @@ MEMORY
 {
 FLASH (rx)      : ORIGIN = 0x08000000, LENGTH = 512K
 RAM (xrw)       : ORIGIN = 0x20000000, LENGTH = 96K
+SPIM (rx)       : ORIGIN = 0x08400000, LENGTH = 16384K
 }

 /* Define output sections */
@@ -115,6 +116,19 @@ SECTIONS
    _edata = .;        /* define a global symbol at data end */
  } >RAM AT> FLASH

+  _spim_init_base = LOADADDR(.spim);      /* load address of the .spim output section */
+  _spim_init_length = SIZEOF(.spim);      /* size of the .spim output section in bytes */
+  
+  .spim :
+  {
+    . = ALIGN(4);
+    _spim_start = .;        /* define a global symbol at spim start */
+    *(.spim)                /* input sections named exactly .spim */
+    *(.spim*)               /* input sections whose name starts with .spim */
+    . = ALIGN(4);
+    _spim_end = .;         /* define a global symbol at spim end */
+  } >SPIM
+
  /* Uninitialized data section */
  . = ALIGN(4);
  .bss :
@@ -134,12 +148,12 @@ SECTIONS
  /* User_heap_stack section, used to check that there is enough RAM left */
  ._user_heap_stack :
  {
-    . = ALIGN(4);
+    . = ALIGN(8);
    PROVIDE ( end = . );
    PROVIDE ( _end = . );
    . = . + _Min_Heap_Size;
    . = . + _Min_Stack_Size;
-    . = ALIGN(4);
+    . = ALIGN(8);
  } >RAM

  /* Remove information from the standard libraries */
--- a/libraries/cmsis/cm4/device_support/startup/gcc/linker/AT32F407xG_FLASH.ld
+++ b/libraries/cmsis/cm4/device_support/startup/gcc/linker/AT32F407xG_FLASH.ld
@@ -22,7 +22,7 @@
 ENTRY(Reset_Handler)

 /* Highest address of the user mode stack */
-_estack = 0x20017FFF;    /* end of RAM */
+_estack = 0x20018000;    /* end of RAM */

 /* Generate a link error if heap and stack don't fit into RAM */
 _Min_Heap_Size = 0x200;      /* required amount of heap  */
@@ -33,6 +33,7 @@ MEMORY
 {
 FLASH (rx)      : ORIGIN = 0x08000000, LENGTH = 1000K
 RAM (xrw)       : ORIGIN = 0x20000000, LENGTH = 96K
+SPIM (rx)       : ORIGIN = 0x08400000, LENGTH = 16384K
 }

 /* Define output sections */
@@ -115,6 +116,19 @@ SECTIONS
    _edata = .;        /* define a global symbol at data end */
  } >RAM AT> FLASH

+  _spim_init_base = LOADADDR(.spim);      /* load address of the .spim output section */
+  _spim_init_length = SIZEOF(.spim);      /* size of the .spim output section in bytes */
+  
+  .spim :
+  {
+    . = ALIGN(4);
+    _spim_start = .;        /* define a global symbol at spim start */
+    *(.spim)                /* input sections named exactly .spim */
+    *(.spim*)               /* input sections whose name starts with .spim */
+    . = ALIGN(4);
+    _spim_end = .;         /* define a global symbol at spim end */
+  } >SPIM
+
  /* Uninitialized data section */
  . = ALIGN(4);
  .bss :
@@ -134,12 +148,12 @@ SECTIONS
  /* User_heap_stack section, used to check that there is enough RAM left */
  ._user_heap_stack :
  {
-    . = ALIGN(4);
+    . = ALIGN(8);
    PROVIDE ( end = . );
    PROVIDE ( _end = . );
    . = . + _Min_Heap_Size;
    . = . + _Min_Stack_Size;
-    . = ALIGN(4);
+    . = ALIGN(8);
  } >RAM

  /* Remove information from the standard libraries */
--- a/libraries/cmsis/cm4/device_support/startup/gcc/startup_at32f403a_407.s
+++ b/libraries/cmsis/cm4/device_support/startup/gcc/startup_at32f403a_407.s
@@ -1,8 +1,6 @@
 /**
  ******************************************************************************
  * @file     startup_at32f403a_407.s
-  * @version  v2.0.4
-  * @date     2021-11-26
  * @brief    at32f403a_407xx devices vector table for gcc toolchain.
  *           this module performs:
  *           - set the initial sp
@@ -106,7 +104,7 @@ Infinite_Loop:
 * The minimal vector table for a Cortex M3. Note that the proper constructs
 * must be placed on this to ensure that it ends up at physical address
 * 0x0000.0000.
-* 
+*
 *******************************************************************************/
   .section  .isr_vector,"a",%progbits
  .type  g_pfnVectors, %object
@@ -130,7 +128,7 @@ g_pfnVectors:
  .word  0
  .word  PendSV_Handler
  .word  SysTick_Handler
-  
+
  /* External Interrupts */
  .word  WWDT_IRQHandler                     /* Window Watchdog Timer                   */
  .word  PVM_IRQHandler                      /* PVM through EXINT Line detect           */
@@ -216,20 +214,20 @@ g_pfnVectors:

 /*******************************************************************************
 *
-* Provide weak aliases for each Exception handler to the Default_Handler. 
-* As they are weak aliases, any function with the same name will override 
+* Provide weak aliases for each Exception handler to the Default_Handler.
+* As they are weak aliases, any function with the same name will override
 * this definition.
-* 
+*
 *******************************************************************************/
   .weak      NMI_Handler
   .thumb_set NMI_Handler,Default_Handler
-  
+
   .weak      HardFault_Handler
   .thumb_set HardFault_Handler,Default_Handler
-  
+
   .weak      MemManage_Handler
   .thumb_set MemManage_Handler,Default_Handler
-  
+
   .weak      BusFault_Handler
   .thumb_set BusFault_Handler,Default_Handler

@@ -246,10 +244,10 @@ g_pfnVectors:
   .thumb_set PendSV_Handler,Default_Handler

   .weak      SysTick_Handler
-   .thumb_set SysTick_Handler,Default_Handler              
-  
+   .thumb_set SysTick_Handler,Default_Handler
+
   .weak      WWDT_IRQHandler
-   .thumb_set WWDT_IRQHandler,Default_Handler      
+   .thumb_set WWDT_IRQHandler,Default_Handler

   .weak      PVM_IRQHandler
   .thumb_set PVM_IRQHandler,Default_Handler
@@ -273,7 +271,7 @@ g_pfnVectors:
   .thumb_set EXINT1_IRQHandler,Default_Handler

   .weak      EXINT2_IRQHandler
-   .thumb_set EXINT2_IRQHandler,Default_Handler 
+   .thumb_set EXINT2_IRQHandler,Default_Handler

   .weak      EXINT3_IRQHandler
   .thumb_set EXINT3_IRQHandler,Default_Handler
@@ -291,7 +289,7 @@ g_pfnVectors:
   .thumb_set DMA1_Channel3_IRQHandler,Default_Handler

   .weak      DMA1_Channel4_IRQHandler
-   .thumb_set DMA1_Channel4_IRQHandler,Default_Handler 
+   .thumb_set DMA1_Channel4_IRQHandler,Default_Handler

   .weak      DMA1_Channel5_IRQHandler
   .thumb_set DMA1_Channel5_IRQHandler,Default_Handler
@@ -443,10 +441,10 @@ g_pfnVectors:
   .weak      CAN2_TX_IRQHandler
   .thumb_set CAN2_TX_IRQHandler,Default_Handler

-   .weak      CAN2_RX0_IRQHandler 
+   .weak      CAN2_RX0_IRQHandler
   .thumb_set CAN2_RX0_IRQHandler ,Default_Handler

-   .weak      CAN2_RX1_IRQHandler 
+   .weak      CAN2_RX1_IRQHandler
   .thumb_set CAN2_RX1_IRQHandler ,Default_Handler

   .weak      CAN2_SE_IRQHandler
--- a/libraries/cmsis/cm4/device_support/startup/iar/startup_at32f403a_407.s
+++ b/libraries/cmsis/cm4/device_support/startup/iar/startup_at32f403a_407.s
@@ -1,7 +1,5 @@
 ;**************************************************************************
 ;* @file     startup_at32f403a_407.s
-;* @version  v2.0.4
-;* @date     2021-11-26
 ;* @brief    at32f403a_407 startup file for IAR Systems
 ;**************************************************************************
 ;
@@ -64,8 +62,8 @@ __vector_table
        DCD     DMA1_Channel7_IRQHandler            ; DMA1 Channel 7
        DCD     ADC1_2_IRQHandler                   ; ADC1 & ADC2
        DCD     USBFS_H_CAN1_TX_IRQHandler          ; USB High Priority or CAN1 TX
-        DCD     USBFS_L_CAN1_RX0_IRQHandler         ; USB Low  Priority or CAN1 RX0 
-        DCD     CAN1_RX1_IRQHandler                 ; CAN1 RX1 
+        DCD     USBFS_L_CAN1_RX0_IRQHandler         ; USB Low  Priority or CAN1 RX0
+        DCD     CAN1_RX1_IRQHandler                 ; CAN1 RX1
        DCD     CAN1_SE_IRQHandler                  ; CAN1 SE
        DCD     EXINT9_5_IRQHandler                 ; EXINT Line [9:5]
        DCD     TMR1_BRK_TMR9_IRQHandler            ; TMR1 Brake and TMR9
@@ -113,8 +111,8 @@ __vector_table
        DCD     0                                   ; Reserved
        DCD     0                                   ; Reserved
        DCD     CAN2_TX_IRQHandler                  ; CAN2 TX
-        DCD     CAN2_RX0_IRQHandler                 ; CAN2 RX0 
-        DCD     CAN2_RX1_IRQHandler                 ; CAN2 RX1 
+        DCD     CAN2_RX0_IRQHandler                 ; CAN2 RX0
+        DCD     CAN2_RX1_IRQHandler                 ; CAN2 RX1
        DCD     CAN2_SE_IRQHandler                  ; CAN2 SE
        DCD     ACC_IRQHandler                      ; ACC
        DCD     USBFS_MAPH_IRQHandler               ; USB Map HP
@@ -285,15 +283,15 @@ ADC1_2_IRQHandler
 USBFS_H_CAN1_TX_IRQHandler
        B USBFS_H_CAN1_TX_IRQHandler

-        PUBWEAK USBFS_L_CAN1_RX0_IRQHandler 
+        PUBWEAK USBFS_L_CAN1_RX0_IRQHandler
        SECTION .text:CODE:REORDER:NOROOT(1)
-USBFS_L_CAN1_RX0_IRQHandler 
-        B USBFS_L_CAN1_RX0_IRQHandler 
+USBFS_L_CAN1_RX0_IRQHandler
+        B USBFS_L_CAN1_RX0_IRQHandler

-        PUBWEAK CAN1_RX1_IRQHandler 
+        PUBWEAK CAN1_RX1_IRQHandler
        SECTION .text:CODE:REORDER:NOROOT(1)
-CAN1_RX1_IRQHandler 
-        B CAN1_RX1_IRQHandler 
+CAN1_RX1_IRQHandler
+        B CAN1_RX1_IRQHandler

        PUBWEAK CAN1_SE_IRQHandler
        SECTION .text:CODE:REORDER:NOROOT(1)
@@ -510,15 +508,15 @@ SPI4_IRQHandler
 CAN2_TX_IRQHandler
        B CAN2_TX_IRQHandler

-        PUBWEAK CAN2_RX0_IRQHandler 
+        PUBWEAK CAN2_RX0_IRQHandler
        SECTION .text:CODE:REORDER:NOROOT(1)
-CAN2_RX0_IRQHandler 
-        B CAN2_RX0_IRQHandler 
+CAN2_RX0_IRQHandler
+        B CAN2_RX0_IRQHandler

-        PUBWEAK CAN2_RX1_IRQHandler 
+        PUBWEAK CAN2_RX1_IRQHandler
        SECTION .text:CODE:REORDER:NOROOT(1)
-CAN2_RX1_IRQHandler 
-        B CAN2_RX1_IRQHandler 
+CAN2_RX1_IRQHandler
+        B CAN2_RX1_IRQHandler

        PUBWEAK CAN2_SE_IRQHandler
        SECTION .text:CODE:REORDER:NOROOT(1)
--- a/libraries/cmsis/cm4/device_support/startup/mdk/startup_at32f403a_407.s
+++ b/libraries/cmsis/cm4/device_support/startup/mdk/startup_at32f403a_407.s
@@ -1,8 +1,7 @@
 ;**************************************************************************
 ;* @file     startup_at32f403a_407.s
-;* @version  v2.0.4
-;* @date     2021-11-26
 ;* @brief    at32f403a_407 startup file for keil
+;* <<< Use Configuration Wizard in Context Menu >>>  
 ;**************************************************************************
 ;

@@ -77,8 +76,8 @@ __Vectors       DCD     __initial_sp                        ; Top of Stack
                DCD     DMA1_Channel7_IRQHandler            ; DMA1 Channel 7
                DCD     ADC1_2_IRQHandler                   ; ADC1 & ADC2
                DCD     USBFS_H_CAN1_TX_IRQHandler          ; USB High Priority or CAN1 TX
-                DCD     USBFS_L_CAN1_RX0_IRQHandler         ; USB Low  Priority or CAN1 RX0 
-                DCD     CAN1_RX1_IRQHandler                 ; CAN1 RX1 
+                DCD     USBFS_L_CAN1_RX0_IRQHandler         ; USB Low  Priority or CAN1 RX0
+                DCD     CAN1_RX1_IRQHandler                 ; CAN1 RX1
                DCD     CAN1_SE_IRQHandler                  ; CAN1 SE
                DCD     EXINT9_5_IRQHandler                 ; EXINT Line [9:5]
                DCD     TMR1_BRK_TMR9_IRQHandler            ; TMR1 Brake and TMR9
@@ -126,8 +125,8 @@ __Vectors       DCD     __initial_sp                        ; Top of Stack
                DCD     0                                   ; Reserved
                DCD     0                                   ; Reserved
                DCD     CAN2_TX_IRQHandler                  ; CAN2 TX
-                DCD     CAN2_RX0_IRQHandler                 ; CAN2 RX0 
-                DCD     CAN2_RX1_IRQHandler                 ; CAN2 RX1 
+                DCD     CAN2_RX0_IRQHandler                 ; CAN2 RX0
+                DCD     CAN2_RX1_IRQHandler                 ; CAN2 RX1
                DCD     CAN2_SE_IRQHandler                  ; CAN2 SE
                DCD     ACC_IRQHandler                      ; ACC
                DCD     USBFS_MAPH_IRQHandler               ; USB Map High
@@ -299,8 +298,8 @@ DMA1_Channel6_IRQHandler
 DMA1_Channel7_IRQHandler
 ADC1_2_IRQHandler
 USBFS_H_CAN1_TX_IRQHandler
-USBFS_L_CAN1_RX0_IRQHandler 
-CAN1_RX1_IRQHandler 
+USBFS_L_CAN1_RX0_IRQHandler
+CAN1_RX1_IRQHandler
 CAN1_SE_IRQHandler
 EXINT9_5_IRQHandler
 TMR1_BRK_TMR9_IRQHandler
@@ -344,8 +343,8 @@ I2C3_EVT_IRQHandler
 I2C3_ERR_IRQHandler
 SPI4_IRQHandler
 CAN2_TX_IRQHandler
-CAN2_RX0_IRQHandler 
-CAN2_RX1_IRQHandler 
+CAN2_RX0_IRQHandler
+CAN2_RX1_IRQHandler
 CAN2_SE_IRQHandler
 ACC_IRQHandler
 USBFS_MAPH_IRQHandler
--- a/libraries/cmsis/cm4/device_support/system_at32f403a_407.c
+++ b/libraries/cmsis/cm4/device_support/system_at32f403a_407.c
@@ -1,17 +1,15 @@
 /**
  **************************************************************************
  * @file     system_at32f403a_407.c
-  * @version  v2.0.4
-  * @date     2021-11-26
  * @brief    contains all the functions for cmsis cortex-m4 system source file
  **************************************************************************
  *                       Copyright notice & Disclaimer
  *
-  * The software Board Support Package (BSP) that is made available to 
-  * download from Artery official website is the copyrighted work of Artery. 
-  * Artery authorizes customers to use, copy, and distribute the BSP 
-  * software and its related documentation for the purpose of design and 
-  * development in conjunction with Artery microcontrollers. Use of the 
+  * The software Board Support Package (BSP) that is made available to
+  * download from Artery official website is the copyrighted work of Artery.
+  * Artery authorizes customers to use, copy, and distribute the BSP
+  * software and its related documentation for the purpose of design and
+  * development in conjunction with Artery microcontrollers. Use of the
  * software is governed by this copyright notice and the following disclaimer.
  *
  * THIS SOFTWARE IS PROVIDED ON "AS IS" BASIS WITHOUT WARRANTIES,
@@ -31,7 +29,7 @@
 /** @addtogroup AT32F403A_407_system
  * @{
  */
-    
+
 #include "at32f403a_407.h"

 /** @addtogroup AT32F403A_407_system_private_defines
@@ -81,13 +79,13 @@ void SystemInit (void)
  /* wait sclk switch status */
  while(CRM->cfg_bit.sclksts != CRM_SCLK_HICK);

-  /* reset cfg register, include sclk switch, ahbdiv, apb1div, apb2div, adcdiv, 
-     clkout pllrcs, pllhextdiv, pllmult, usbdiv and pllrange bits */
-  CRM->cfg = 0;
-
  /* reset hexten, hextbyps, cfden and pllen bits */
  CRM->ctrl &= ~(0x010D0000U);

+  /* reset cfg register, include sclk switch, ahbdiv, apb1div, apb2div, adcdiv,
+     clkout pllrcs, pllhextdiv, pllmult, usbdiv and pllrange bits */
+  CRM->cfg = 0;
+
  /* reset clkout[3], usbbufs, hickdiv, clkoutdiv */
  CRM->misc1 = 0;

@@ -182,7 +180,7 @@ void system_core_clock_update(void)
 /**
  * @}
  */
-  
+
 /**
  * @}
  */
--- a/libraries/cmsis/cm4/device_support/system_at32f403a_407.h
+++ b/libraries/cmsis/cm4/device_support/system_at32f403a_407.h
@@ -1,17 +1,15 @@
 /**
  **************************************************************************
  * @file     system_at32f403a_407.h
-  * @version  v2.0.4
-  * @date     2021-11-26
  * @brief    cmsis cortex-m4 system header file.
  **************************************************************************
  *                       Copyright notice & Disclaimer
  *
-  * The software Board Support Package (BSP) that is made available to 
-  * download from Artery official website is the copyrighted work of Artery. 
-  * Artery authorizes customers to use, copy, and distribute the BSP 
-  * software and its related documentation for the purpose of design and 
-  * development in conjunction with Artery microcontrollers. Use of the 
+  * The software Board Support Package (BSP) that is made available to
+  * download from Artery official website is the copyrighted work of Artery.
+  * Artery authorizes customers to use, copy, and distribute the BSP
+  * software and its related documentation for the purpose of design and
+  * development in conjunction with Artery microcontrollers. Use of the
  * software is governed by this copyright notice and the following disclaimer.
  *
  * THIS SOFTWARE IS PROVIDED ON "AS IS" BASIS WITHOUT WARRANTIES,
@@ -39,18 +37,19 @@ extern "C" {
  * @{
  */

-/** @defgroup AT32F403A_407_system_clock_stable_definition 
+/** @defgroup AT32F403A_407_system_clock_stable_definition
  * @{
  */

 #define HEXT_STABLE_DELAY                (5000u)
 #define PLL_STABLE_DELAY                 (500u)
+#define SystemCoreClock                  system_core_clock /*!< alias: SystemCoreClock expands to system_core_clock */

 /**
  * @}
  */

-/** @defgroup AT32F403A_407_system_exported_variables 
+/** @defgroup AT32F403A_407_system_exported_variables
  * @{
  */

@@ -60,10 +59,10 @@ extern unsigned int system_core_clock; /*!< system clock frequency (core clock)
  * @}
  */

-/** @defgroup AT32F403A_407_system_exported_functions 
+/** @defgroup AT32F403A_407_system_exported_functions
  * @{
  */
-  
+
 extern void SystemInit(void);
 extern void system_core_clock_update(void);

--- a/libraries/cmsis/dsp/ComputeLibrary/Include/NEMath.h
+++ b/libraries/cmsis/dsp/ComputeLibrary/Include/NEMath.h
@@ -0,0 +1,414 @@
+/*
+ * Copyright (c) 2016, 2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NEMATH_H__
+#define __ARM_COMPUTE_NEMATH_H__
+
+
+#if defined(ARM_MATH_NEON)
+/** Calculate floor of a vector.
+ *
+ * @param[in] val Input vector value in F32 format.
+ *
+ * @return The calculated floor vector.
+ */
+static inline float32x4_t vfloorq_f32(float32x4_t val);
+
+/** Calculate inverse square root.
+ *
+ * @param[in] x Input value.
+ *
+ * @return The calculated inverse square root.
+ */
+static inline float32x2_t vinvsqrt_f32(float32x2_t x);
+
+/** Calculate inverse square root.
+ *
+ * @param[in] x Input value.
+ *
+ * @return The calculated inverse square root.
+ */
+static inline float32x4_t vinvsqrtq_f32(float32x4_t x);
+
+/** Calculate reciprocal.
+ *
+ * @param[in] x Input value.
+ *
+ * @return The calculated reciprocal.
+ */
+static inline float32x2_t vinv_f32(float32x2_t x);
+
+/** Calculate reciprocal.
+ *
+ * @param[in] x Input value.
+ *
+ * @return The calculated reciprocal.
+ */
+static inline float32x4_t vinvq_f32(float32x4_t x);
+
+/** Perform a 7th degree polynomial approximation using Estrin's method.
+ *
+ * @param[in] x      Input vector value in F32 format.
+ * @param[in] coeffs Polynomial coefficients table. (array of flattened float32x4_t vectors)
+ *
+ * @return The calculated approximation.
+ */
+static inline float32x4_t vtaylor_polyq_f32(float32x4_t x, const float32_t *coeffs);
+
+/** Calculate exponential
+ *
+ * @param[in] x Input vector value in F32 format.
+ *
+ * @return The calculated exponential (e^x).
+ */
+static inline float32x4_t vexpq_f32(float32x4_t x);
+
+/** Calculate logarithm
+ *
+ * @param[in] x Input vector value in F32 format.
+ *
+ * @return The calculated logarithm.
+ */
+static inline float32x4_t vlogq_f32(float32x4_t x);
+
+/** Calculate hyperbolic tangent.
+ *
+ * tanh(x) = (e^2x - 1)/(e^2x + 1)
+ *
+ * @note We clamp x to [-10,10] to avoid overflow issues.
+ *
+ * @param[in] val Input vector value in F32 format.
+ *
+ * @return The calculated Hyperbolic Tangent.
+ */
+static inline float32x4_t vtanhq_f32(float32x4_t val);
+
+/** Calculate n power of a number.
+ *
+ * pow(x,n) = e^(n*log(x))
+ *
+ * @param[in] val Input vector value in F32 format.
+ * @param[in] n   Powers to raise the input to.
+ *
+ * @return The calculated power.
+ */
+static inline float32x4_t vpowq_f32(float32x4_t val, float32x4_t n);
+
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+/** Calculate hyperbolic tangent.
+ *
+ * tanh(x) = (e^2x - 1)/(e^2x + 1)
+ *
+ * @note We clamp x to [-5,5] to avoid overflowing issues.
+ *
+ * @param[in] val Input vector value in F16 format.
+ *
+ * @return The calculated Hyperbolic Tangent.
+ */
+static inline float16x8_t vtanhq_f16(float16x8_t val);
+
+/** Calculate reciprocal.
+ *
+ * @param[in] x Input value.
+ *
+ * @return The calculated reciprocal.
+ */
+static inline float16x4_t vinv_f16(float16x4_t x);
+
+/** Calculate reciprocal.
+ *
+ * @param[in] x Input value.
+ *
+ * @return The calculated reciprocal.
+ */
+static inline float16x8_t vinvq_f16(float16x8_t x);
+
+/** Calculate inverse square root.
+ *
+ * @param[in] x Input value.
+ *
+ * @return The calculated inverse square root.
+ */
+static inline float16x4_t vinvsqrt_f16(float16x4_t x);
+
+/** Calculate inverse square root.
+ *
+ * @param[in] x Input value.
+ *
+ * @return The calculated inverse square root.
+ */
+static inline float16x8_t vinvsqrtq_f16(float16x8_t x);
+
+/** Calculate exponential
+ *
+ * @param[in] x Input vector value in F16 format.
+ *
+ * @return The calculated exponential (e^x).
+ */
+static inline float16x8_t vexpq_f16(float16x8_t x);
+
+/** Calculate n power of a number.
+ *
+ * pow(x,n) = e^(n*log(x))
+ *
+ * @param[in] val Input vector value in F16 format.
+ * @param[in] n   Powers to raise the input to.
+ *
+ * @return The calculated power.
+ */
+static inline float16x8_t vpowq_f16(float16x8_t val, float16x8_t n);
+#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
+
+/** Exponent polynomial coefficients */
+extern const float32_t exp_tab[4*8];
+
+
+/** Logarithm polynomial coefficients */
+extern const float32_t log_tab[4*8];
+
+#ifndef DOXYGEN_SKIP_THIS
+inline float32x4_t vfloorq_f32(float32x4_t val)
+{
+    static const float32_t CONST_1[4] = {1.f,1.f,1.f,1.f};
+    /* Round-trip through int32. NOTE(review): the fix-up below assumes the conversion truncates toward zero - confirm. */
+    const int32x4_t   z = vcvtq_s32_f32(val);
+    const float32x4_t r = vcvtq_f32_s32(z);
+    /* Lanes where the round-trip landed above val get r - 1; the others keep r. */
+    return vbslq_f32(vcgtq_f32(r, val), vsubq_f32(r, vld1q_f32(CONST_1)), r);
+}
+
+inline float32x2_t vinvsqrt_f32(float32x2_t x)
+{
+    float32x2_t sqrt_reciprocal = vrsqrte_f32(x); /* starting estimate of 1/sqrt(x); corrected twice below */
+    sqrt_reciprocal             = vmul_f32(vrsqrts_f32(vmul_f32(x, sqrt_reciprocal), sqrt_reciprocal), sqrt_reciprocal); /* refinement step 1 */
+    sqrt_reciprocal             = vmul_f32(vrsqrts_f32(vmul_f32(x, sqrt_reciprocal), sqrt_reciprocal), sqrt_reciprocal); /* refinement step 2 */
+
+    return sqrt_reciprocal;
+}
+
+inline float32x4_t vinvsqrtq_f32(float32x4_t x)
+{
+    float32x4_t sqrt_reciprocal = vrsqrteq_f32(x); /* starting estimate of 1/sqrt(x); corrected twice below */
+    sqrt_reciprocal             = vmulq_f32(vrsqrtsq_f32(vmulq_f32(x, sqrt_reciprocal), sqrt_reciprocal), sqrt_reciprocal); /* refinement step 1 */
+    sqrt_reciprocal             = vmulq_f32(vrsqrtsq_f32(vmulq_f32(x, sqrt_reciprocal), sqrt_reciprocal), sqrt_reciprocal); /* refinement step 2 */
+
+    return sqrt_reciprocal;
+}
+
+inline float32x2_t vinv_f32(float32x2_t x)
+{
+    float32x2_t recip = vrecpe_f32(x);                         /* starting estimate of 1/x */
+    recip             = vmul_f32(vrecps_f32(x, recip), recip); /* refinement step 1 */
+    recip             = vmul_f32(vrecps_f32(x, recip), recip); /* refinement step 2 */
+    return recip;
+}
+
+inline float32x4_t vinvq_f32(float32x4_t x)
+{
+    float32x4_t recip = vrecpeq_f32(x);                          /* starting estimate of 1/x */
+    recip             = vmulq_f32(vrecpsq_f32(x, recip), recip); /* refinement step 1 */
+    recip             = vmulq_f32(vrecpsq_f32(x, recip), recip); /* refinement step 2 */
+    return recip;
+}
+
+inline float32x4_t vtaylor_polyq_f32(float32x4_t x, const float32_t *coeffs)
+{
+    float32x4_t A   = vmlaq_f32(vld1q_f32(&coeffs[4*0]), vld1q_f32(&coeffs[4*4]), x); /* A = c0 + c4*x, where ck is the 4-lane vector at coeffs[4*k] */
+    float32x4_t B   = vmlaq_f32(vld1q_f32(&coeffs[4*2]), vld1q_f32(&coeffs[4*6]), x); /* B = c2 + c6*x */
+    float32x4_t C   = vmlaq_f32(vld1q_f32(&coeffs[4*1]), vld1q_f32(&coeffs[4*5]), x); /* C = c1 + c5*x */
+    float32x4_t D   = vmlaq_f32(vld1q_f32(&coeffs[4*3]), vld1q_f32(&coeffs[4*7]), x); /* D = c3 + c7*x */
+    float32x4_t x2  = vmulq_f32(x, x);
+    float32x4_t x4  = vmulq_f32(x2, x2);
+    float32x4_t res = vmlaq_f32(vmlaq_f32(A, B, x2), vmlaq_f32(C, D, x2), x4); /* (A + B*x^2) + (C + D*x^2)*x^4 */
+    return res;
+}
+
+inline float32x4_t vexpq_f32(float32x4_t x)
+{
+    static const float32_t CONST_LN2[4]          = {0.6931471805f,0.6931471805f,0.6931471805f,0.6931471805f}; // ln(2)
+    static const float32_t CONST_INV_LN2[4]      = {1.4426950408f,1.4426950408f,1.4426950408f,1.4426950408f}; // 1/ln(2)
+    static const float32_t CONST_0[4]            = {0.f,0.f,0.f,0.f};
+    static const int32_t   CONST_NEGATIVE_126[4] = {-126,-126,-126,-126};
+
+    // Range reduction: m = (int)(x * 1/ln(2)), val = x - m * ln(2)
+    int32x4_t   m   = vcvtq_s32_f32(vmulq_f32(x, vld1q_f32(CONST_INV_LN2)));
+    float32x4_t val = vmlsq_f32(x, vcvtq_f32_s32(m), vld1q_f32(CONST_LN2));
+
+    // Polynomial approximation of e^val using the exp_tab coefficients
+    float32x4_t poly = vtaylor_polyq_f32(val, exp_tab);
+
+    // Reconstruct: add (m << 23) to the raw bits of poly (saturating ops), then zero the lanes where m < -126
+    poly = vreinterpretq_f32_s32(vqaddq_s32(vreinterpretq_s32_f32(poly), vqshlq_n_s32(m, 23)));
+    poly = vbslq_f32(vcltq_s32(m, vld1q_s32(CONST_NEGATIVE_126)), vld1q_f32(CONST_0), poly);
+
+    return poly;
+}
+
+inline float32x4_t vlogq_f32(float32x4_t x)
+{
+    static const int32_t   CONST_127[4] = {127,127,127,127};           // 127
+    static const float32_t CONST_LN2[4] = {0.6931471805f,0.6931471805f,0.6931471805f,0.6931471805f}; // ln(2)
+
+    // m = (raw bits of x >> 23) - 127; val = x with (m << 23) removed from its raw bits. NOTE(review): the shift keeps the sign bit, so presumably x > 0 is expected - confirm
+    int32x4_t   m   = vsubq_s32(vreinterpretq_s32_u32(vshrq_n_u32(vreinterpretq_u32_f32(x), 23)), vld1q_s32(CONST_127));
+    float32x4_t val = vreinterpretq_f32_s32(vsubq_s32(vreinterpretq_s32_f32(x), vshlq_n_s32(m, 23)));
+
+    // Polynomial approximation of log(val) using the log_tab coefficients
+    float32x4_t poly = vtaylor_polyq_f32(val, log_tab);
+
+    // Reconstruct: poly + m * ln(2)
+    poly = vmlaq_f32(poly, vcvtq_f32_s32(m), vld1q_f32(CONST_LN2));
+
+    return poly;
+}
+
+/**
+ * Four-lane hyperbolic tangent.
+ *
+ * The input is clamped to [-10, 10] and the result is computed as
+ * (exp(2x) - 1) / (exp(2x) + 1), with the division done through vinvq_f32.
+ *
+ * @param val  input vector
+ * @return     tanh approximation for each lane
+ */
+inline float32x4_t vtanhq_f32(float32x4_t val)
+{
+    static const float32_t ONE_LANES[4]   = {1.f,1.f,1.f,1.f};
+    static const float32_t TWO_LANES[4]   = {2.f,2.f,2.f,2.f};
+    static const float32_t LOWER_LANES[4] = {-10.f,-10.f,-10.f,-10.f};
+    static const float32_t UPPER_LANES[4] = {10.f,10.f,10.f,10.f};
+
+    /* Clamp to [-10, 10] */
+    const float32x4_t floored = vmaxq_f32(val, vld1q_f32(LOWER_LANES));
+    const float32x4_t clamped = vminq_f32(floored, vld1q_f32(UPPER_LANES));
+
+    /* e = exp(2 * clamped) */
+    const float32x4_t e = vexpq_f32(vmulq_f32(vld1q_f32(TWO_LANES), clamped));
+
+    /* (e - 1) * 1/(e + 1) */
+    const float32x4_t numerator   = vsubq_f32(e, vld1q_f32(ONE_LANES));
+    const float32x4_t denominator = vaddq_f32(e, vld1q_f32(ONE_LANES));
+
+    return vmulq_f32(numerator, vinvq_f32(denominator));
+}
+
+/**
+ * Four-lane power function computed as exp(n * log(val)).
+ *
+ * @param val  base vector
+ * @param n    exponent vector
+ * @return     val raised to n for each lane, via vlogq_f32 and vexpq_f32
+ */
+inline float32x4_t vpowq_f32(float32x4_t val, float32x4_t n)
+{
+    const float32x4_t log_base = vlogq_f32(val);
+    const float32x4_t scaled   = vmulq_f32(n, log_base);
+
+    return vexpq_f32(scaled);
+}
+#endif /* DOXYGEN_SKIP_THIS */
+
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+/** Exponent polynomial coefficients */
+/** Logarithm polynomial coefficients */
+#ifndef DOXYGEN_SKIP_THIS
+/**
+ * Eight-lane floor for half-precision values.
+ *
+ * Each lane is converted to int16 and back; where the round-tripped value
+ * is greater than the input, 1 is subtracted from it.
+ *
+ * @param val  input vector
+ * @return     floored value for each lane
+ */
+inline float16x8_t vfloorq_f16(float16x8_t val)
+{
+    static const float16_t ONE_LANES[8] = {1.f,1.f,1.f,1.f,1.f,1.f,1.f,1.f};
+
+    /* Round-trip through a 16-bit integer */
+    const float16x8_t round_trip = vcvtq_f16_s16(vcvtq_s16_f16(val));
+
+    /* Lanes where the round trip landed above the input need to step down by one */
+    const uint16x8_t  overshoot  = vcgtq_f16(round_trip, val);
+    const float16x8_t one_less   = vsubq_f16(round_trip, vld1q_f16(ONE_LANES));
+
+    return vbslq_f16(overshoot, one_less, round_trip);
+}
+/**
+ * Four-lane half-precision reciprocal square root.
+ *
+ * Starts from the vrsqrte_f16 estimate and applies two refinement steps
+ * built on vrsqrts_f16.
+ *
+ * @param x  input vector
+ * @return   approximation of 1/sqrt(x) for each lane
+ */
+inline float16x4_t vinvsqrt_f16(float16x4_t x)
+{
+    float16x4_t estimate = vrsqrte_f16(x);
+
+    for (int step = 0; step < 2; ++step)
+    {
+        const float16x4_t correction = vrsqrts_f16(vmul_f16(x, estimate), estimate);
+        estimate = vmul_f16(correction, estimate);
+    }
+
+    return estimate;
+}
+
+/**
+ * Eight-lane half-precision reciprocal square root.
+ *
+ * Starts from the vrsqrteq_f16 estimate and applies two refinement steps
+ * built on vrsqrtsq_f16.
+ *
+ * @param x  input vector
+ * @return   approximation of 1/sqrt(x) for each lane
+ */
+inline float16x8_t vinvsqrtq_f16(float16x8_t x)
+{
+    float16x8_t estimate = vrsqrteq_f16(x);
+
+    for (int step = 0; step < 2; ++step)
+    {
+        const float16x8_t correction = vrsqrtsq_f16(vmulq_f16(x, estimate), estimate);
+        estimate = vmulq_f16(correction, estimate);
+    }
+
+    return estimate;
+}
+
+/**
+ * Four-lane half-precision reciprocal.
+ *
+ * Starts from the vrecpe_f16 estimate and applies two refinement steps
+ * built on vrecps_f16.
+ *
+ * @param x  input vector
+ * @return   approximation of 1/x for each lane
+ */
+inline float16x4_t vinv_f16(float16x4_t x)
+{
+    float16x4_t estimate = vrecpe_f16(x);
+
+    for (int step = 0; step < 2; ++step)
+    {
+        const float16x4_t correction = vrecps_f16(x, estimate);
+        estimate = vmul_f16(correction, estimate);
+    }
+
+    return estimate;
+}
+
+/**
+ * Eight-lane half-precision reciprocal.
+ *
+ * Starts from the vrecpeq_f16 estimate and applies two refinement steps
+ * built on vrecpsq_f16.
+ *
+ * @param x  input vector
+ * @return   approximation of 1/x for each lane
+ */
+inline float16x8_t vinvq_f16(float16x8_t x)
+{
+    float16x8_t estimate = vrecpeq_f16(x);
+
+    for (int step = 0; step < 2; ++step)
+    {
+        const float16x8_t correction = vrecpsq_f16(x, estimate);
+        estimate = vmulq_f16(correction, estimate);
+    }
+
+    return estimate;
+}
+
+/**
+ * Eight-lane half-precision hyperbolic tangent.
+ *
+ * The input is clamped to [-10, 10] and the result is computed as
+ * (exp(2x) - 1) / (exp(2x) + 1), with the division done through vinvq_f16.
+ *
+ * @param val  input vector
+ * @return     tanh approximation for each lane
+ */
+inline float16x8_t vtanhq_f16(float16x8_t val)
+{
+    const float16_t ONE_LANES[8]   = {1.f,1.f,1.f,1.f,1.f,1.f,1.f,1.f};
+    const float16_t TWO_LANES[8]   = {2.f,2.f,2.f,2.f,2.f,2.f,2.f,2.f};
+    const float16_t LOWER_LANES[8] = {-10.f,-10.f,-10.f,-10.f,-10.f,-10.f,-10.f,-10.f};
+    const float16_t UPPER_LANES[8] = {10.f,10.f,10.f,10.f,10.f,10.f,10.f,10.f};
+
+    /* Clamp to [-10, 10] */
+    const float16x8_t floored = vmaxq_f16(val, vld1q_f16(LOWER_LANES));
+    const float16x8_t clamped = vminq_f16(floored, vld1q_f16(UPPER_LANES));
+
+    /* e = exp(2 * clamped) */
+    const float16x8_t e = vexpq_f16(vmulq_f16(vld1q_f16(TWO_LANES), clamped));
+
+    /* (e - 1) * 1/(e + 1) */
+    const float16x8_t numerator   = vsubq_f16(e, vld1q_f16(ONE_LANES));
+    const float16x8_t denominator = vaddq_f16(e, vld1q_f16(ONE_LANES));
+
+    return vmulq_f16(numerator, vinvq_f16(denominator));
+}
+
+/**
+ * Evaluate a degree-7 polynomial on eight half-precision lanes at once.
+ *
+ * coeffs is read as eight consecutive groups of eight float16 values.
+ * Writing c[k] for the vector loaded from &coeffs[8*k], the value returned is
+ *
+ *   c[0] + c[4]*x + c[2]*x^2 + c[6]*x^3 + c[1]*x^4 + c[5]*x^5 + c[3]*x^6 + c[7]*x^7
+ *
+ * The coefficient groups are loaded explicitly with vld1q_f16: the
+ * arithmetic intrinsics take float16x8_t operands, not pointers.
+ *
+ * @param x       input vector
+ * @param coeffs  pointer to at least 64 float16 values laid out as described above
+ * @return        polynomial value for each lane of x
+ */
+inline float16x8_t vtaylor_polyq_f16(float16x8_t x, const float16_t *coeffs)
+{
+    const float16x8_t A   = vaddq_f16(vld1q_f16(&coeffs[8*0]), vmulq_f16(vld1q_f16(&coeffs[8*4]), x));
+    const float16x8_t B   = vaddq_f16(vld1q_f16(&coeffs[8*2]), vmulq_f16(vld1q_f16(&coeffs[8*6]), x));
+    const float16x8_t C   = vaddq_f16(vld1q_f16(&coeffs[8*1]), vmulq_f16(vld1q_f16(&coeffs[8*5]), x));
+    const float16x8_t D   = vaddq_f16(vld1q_f16(&coeffs[8*3]), vmulq_f16(vld1q_f16(&coeffs[8*7]), x));
+    const float16x8_t x2  = vmulq_f16(x, x);
+    const float16x8_t x4  = vmulq_f16(x2, x2);
+    /* (A + B*x^2) + (C + D*x^2)*x^4 */
+    const float16x8_t res = vaddq_f16(vaddq_f16(A, vmulq_f16(B, x2)), vmulq_f16(vaddq_f16(C, vmulq_f16(D, x2)), x4));
+    return res;
+}
+
+/**
+ * Eight-lane half-precision exponential.
+ *
+ * Both halves of the input are widened to float32, passed through
+ * vexpq_f32 and narrowed back to float16.
+ *
+ * @param x  input vector
+ * @return   approximation of exp(x) for each lane
+ */
+inline float16x8_t vexpq_f16(float16x8_t x)
+{
+    // TODO (COMPMID-1535) : Revisit FP16 approximations
+    const float32x4_t lo_f32 = vcvt_f32_f16(vget_low_f16(x));
+    const float32x4_t hi_f32 = vcvt_f32_f16(vget_high_f16(x));
+
+    const float16x4_t lo_f16 = vcvt_f16_f32(vexpq_f32(lo_f32));
+    const float16x4_t hi_f16 = vcvt_f16_f32(vexpq_f32(hi_f32));
+
+    return vcombine_f16(lo_f16, hi_f16);
+}
+
+/**
+ * Eight-lane half-precision natural logarithm.
+ *
+ * Both halves of the input are widened to float32, passed through
+ * vlogq_f32 and narrowed back to float16.
+ *
+ * @param x  input vector
+ * @return   approximation of ln(x) for each lane
+ */
+inline float16x8_t vlogq_f16(float16x8_t x)
+{
+    // TODO (COMPMID-1535) : Revisit FP16 approximations
+    const float32x4_t lo_f32 = vcvt_f32_f16(vget_low_f16(x));
+    const float32x4_t hi_f32 = vcvt_f32_f16(vget_high_f16(x));
+
+    const float16x4_t lo_f16 = vcvt_f16_f32(vlogq_f32(lo_f32));
+    const float16x4_t hi_f16 = vcvt_f16_f32(vlogq_f32(hi_f32));
+
+    return vcombine_f16(lo_f16, hi_f16);
+}
+
+/**
+ * Eight-lane half-precision power function.
+ *
+ * Each half of the inputs is widened to float32, exp(n * log(val)) is
+ * evaluated with the float32 helpers, and the halves are narrowed and
+ * recombined.
+ *
+ * @param val  base vector
+ * @param n    exponent vector
+ * @return     val raised to n for each lane
+ */
+inline float16x8_t vpowq_f16(float16x8_t val, float16x8_t n)
+{
+    // TODO (giaiod01) - COMPMID-1535
+    const float32x4_t exp_lo  = vcvt_f32_f16(vget_low_f16(n));
+    const float32x4_t exp_hi  = vcvt_f32_f16(vget_high_f16(n));
+    const float32x4_t base_lo = vcvt_f32_f16(vget_low_f16(val));
+    const float32x4_t base_hi = vcvt_f32_f16(vget_high_f16(val));
+
+    /* pow(base, exp) = exp(exp * log(base)) on each half */
+    const float32x4_t pow_lo = vexpq_f32(vmulq_f32(exp_lo, vlogq_f32(base_lo)));
+    const float32x4_t pow_hi = vexpq_f32(vmulq_f32(exp_hi, vlogq_f32(base_hi)));
+
+    return vcombine_f16(vcvt_f16_f32(pow_lo), vcvt_f16_f32(pow_hi));
+}
+#endif /* DOXYGEN_SKIP_THIS */
+#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
+#endif
+#endif /* __ARM_COMPUTE_NEMATH_H__ */
--- a/libraries/cmsis/dsp/ComputeLibrary/LICENSE.txt
+++ b/libraries/cmsis/dsp/ComputeLibrary/LICENSE.txt
@@ -0,0 +1,21 @@
+MIT License
+
+Copyright (c) 2017-2019 ARM Software
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
--- a/libraries/cmsis/dsp/ComputeLibrary/README.md
+++ b/libraries/cmsis/dsp/ComputeLibrary/README.md
@@ -0,0 +1,19 @@
+README
+======
+
+This folder contains two files imported, with slight modifications, from the Compute Library:
+
+    NEMath.h and arm_cl_tables.c 
+
+In the original Compute Library, the corresponding two files are:
+
+    NEMath.h and NEMath.inl
+
+In the Compute Library, NEMath.inl is included from NEMath.h. In this CMSIS-DSP implementation there is no NEMath.inl; its content has been copied into NEMath.h.
+
+The tables contained in NEMath.inl have been moved to arm_cl_tables.c. Finally, the files are written in C for the CMSIS-DSP library, whereas they are written in C++ in the original Compute Library.
+
+Otherwise, the features and implementations are the same: a few optimized Neon functions.
+
+The license covering these files is different: it is an MIT license.
+Other parts of CMSIS-DSP are covered by an Apache-2.0 license.
--- a/libraries/cmsis/dsp/ComputeLibrary/Source/arm_cl_tables.c
+++ b/libraries/cmsis/dsp/ComputeLibrary/Source/arm_cl_tables.c
@@ -0,0 +1,55 @@
+/*
+ * Copyright (c) 2016, 2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_math.h"
+#include "NEMath.h"
+
+#if defined(ARM_MATH_NEON)
+
+/**
+ * Exponent polynomial coefficients.
+ *
+ * The table holds eight coefficients, each written four times in a row so
+ * that a single 4-float vector load returns the coefficient in every lane.
+ * vtaylor_polyq_f32() in NEMath.h loads group k from &coeffs[4*k] and
+ * combines the groups so that they multiply, in table order,
+ * x^0, x^4, x^2, x^6, x^1, x^5, x^3 and x^7.
+ */
+const float32_t exp_tab[4*8] =
+{
+        1.f,1.f,1.f,1.f,
+        0.0416598916054f,0.0416598916054f,0.0416598916054f,0.0416598916054f,
+        0.500000596046f,0.500000596046f,0.500000596046f,0.500000596046f,
+        0.0014122662833f,0.0014122662833f,0.0014122662833f,0.0014122662833f,
+        1.00000011921f,1.00000011921f,1.00000011921f,1.00000011921f,
+        0.00833693705499f,0.00833693705499f,0.00833693705499f,0.00833693705499f,
+        0.166665703058f,0.166665703058f,0.166665703058f,0.166665703058f,
+        0.000195780929062f,0.000195780929062f,0.000195780929062f,0.000195780929062f
+};
+
+/**
+ * Logarithm polynomial coefficients.
+ *
+ * The table holds eight coefficients, each written four times in a row so
+ * that a single 4-float vector load returns the coefficient in every lane.
+ * vtaylor_polyq_f32() in NEMath.h loads group k from &coeffs[4*k] and
+ * combines the groups so that they multiply, in table order,
+ * x^0, x^4, x^2, x^6, x^1, x^5, x^3 and x^7.
+ */
+const float32_t log_tab[4*8] =
+{
+        -2.29561495781f,-2.29561495781f,-2.29561495781f,-2.29561495781f,
+        -2.47071170807f,-2.47071170807f,-2.47071170807f,-2.47071170807f,
+        -5.68692588806f,-5.68692588806f,-5.68692588806f,-5.68692588806f,
+        -0.165253549814f,-0.165253549814f,-0.165253549814f,-0.165253549814f,
+        5.17591238022f,5.17591238022f,5.17591238022f,5.17591238022f,
+        0.844007015228f,0.844007015228f,0.844007015228f,0.844007015228f,
+        4.58445882797f,4.58445882797f,4.58445882797f,4.58445882797f,
+        0.0141278216615f,0.0141278216615f,0.0141278216615f,0.0141278216615f
+};
+
+#endif
--- a/libraries/cmsis/dsp/PrivateInclude/arm_sorting.h
+++ b/libraries/cmsis/dsp/PrivateInclude/arm_sorting.h
@@ -0,0 +1,200 @@
+/******************************************************************************
+ * @file     arm_sorting.h
+ * @brief    Private header file for CMSIS DSP Library
+ * @version  V1.7.0
+ * @date     2019
+ ******************************************************************************/
+/*
+ * Copyright (c) 2010-2019 Arm Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef _ARM_SORTING_H_
+#define _ARM_SORTING_H_
+
+#include "arm_math.h"
+
+#ifdef   __cplusplus
+extern "C"
+{
+#endif
+
+  /**
+   * @brief      Private float32 bubble-sort entry point (declaration only).
+   *             NOTE(review): the algorithm is inferred from the function name;
+   *             the definition is not part of this header.
+   * @param[in]  S          points to an instance of the sorting structure.
+   * @param[in]  pSrc       points to the block of input data.
+   *                        NOTE(review): declared non-const although documented
+   *                        as input; confirm whether the implementation writes
+   *                        through this pointer.
+   * @param[out] pDst       points to the block of output data.
+   * @param[in]  blockSize  number of samples to process.
+   */
+  void arm_bubble_sort_f32(
+    const arm_sort_instance_f32 * S,
+          float32_t * pSrc,
+          float32_t * pDst,
+    uint32_t blockSize);
+
+   /**
+   * @brief      Private float32 heap-sort entry point (declaration only).
+   *             NOTE(review): the algorithm is inferred from the function name;
+   *             the definition is not part of this header.
+   * @param[in]  S          points to an instance of the sorting structure.
+   * @param[in]  pSrc       points to the block of input data.
+   *                        NOTE(review): declared non-const although documented
+   *                        as input; confirm whether the implementation writes
+   *                        through this pointer.
+   * @param[out] pDst       points to the block of output data.
+   * @param[in]  blockSize  number of samples to process.
+   */
+  void arm_heap_sort_f32(
+    const arm_sort_instance_f32 * S,
+          float32_t * pSrc,
+          float32_t * pDst,
+    uint32_t blockSize);
+
+  /**
+   * @brief      Private float32 insertion-sort entry point (declaration only).
+   *             NOTE(review): the algorithm is inferred from the function name;
+   *             the definition is not part of this header.
+   * @param[in]  S          points to an instance of the sorting structure.
+   * @param[in]  pSrc       points to the block of input data.
+   *                        NOTE(review): declared non-const although documented
+   *                        as input; confirm whether the implementation writes
+   *                        through this pointer.
+   * @param[out] pDst       points to the block of output data.
+   * @param[in]  blockSize  number of samples to process.
+   */
+  void arm_insertion_sort_f32(
+    const arm_sort_instance_f32 * S,
+          float32_t *pSrc,
+          float32_t* pDst,
+    uint32_t blockSize);
+
+  /**
+   * @brief      Private float32 quick-sort entry point (declaration only).
+   *             NOTE(review): the algorithm is inferred from the function name;
+   *             the definition is not part of this header.
+   * @param[in]  S          points to an instance of the sorting structure.
+   * @param[in]  pSrc       points to the block of input data.
+   *                        NOTE(review): declared non-const although documented
+   *                        as input; confirm whether the implementation writes
+   *                        through this pointer.
+   * @param[out] pDst       points to the block of output data.
+   * @param[in]  blockSize  number of samples to process.
+   */
+  void arm_quick_sort_f32(
+    const arm_sort_instance_f32 * S,
+          float32_t * pSrc,
+          float32_t * pDst,
+    uint32_t blockSize);
+
+  /**
+   * @brief      Private float32 selection-sort entry point (declaration only).
+   *             NOTE(review): the algorithm is inferred from the function name;
+   *             the definition is not part of this header.
+   * @param[in]  S          points to an instance of the sorting structure.
+   * @param[in]  pSrc       points to the block of input data.
+   *                        NOTE(review): declared non-const although documented
+   *                        as input; confirm whether the implementation writes
+   *                        through this pointer.
+   * @param[out] pDst       points to the block of output data.
+   * @param[in]  blockSize  number of samples to process.
+   */
+  void arm_selection_sort_f32(
+    const arm_sort_instance_f32 * S,
+          float32_t * pSrc,
+          float32_t * pDst,
+    uint32_t blockSize);
+
+  /**
+   * @brief      Private float32 bitonic-sort entry point (declaration only).
+   *             NOTE(review): the algorithm is inferred from the function name;
+   *             the definition is not part of this header.
+   * @param[in]  S          points to an instance of the sorting structure.
+   * @param[in]  pSrc       points to the block of input data.
+   *                        NOTE(review): declared non-const although documented
+   *                        as input; confirm whether the implementation writes
+   *                        through this pointer.
+   * @param[out] pDst       points to the block of output data.
+   * @param[in]  blockSize  number of samples to process.
+   */
+  void arm_bitonic_sort_f32(
+    const arm_sort_instance_f32 * S,
+          float32_t * pSrc,
+          float32_t * pDst,
+          uint32_t blockSize);
+
+#if defined(ARM_MATH_NEON)
+
+/*
+ * Exchange (a).val[1] with (b).val[0].
+ * a and b must be lvalues with a float32x4_t val[] member (e.g. float32x4x2_t).
+ * Arguments are parenthesized so that expressions such as *p expand correctly.
+ */
+#define vtrn256_128q(a, b)                     \
+do {                                           \
+    float32x4_t vtrn128_temp = (a).val[1];     \
+    (a).val[1] = (b).val[0];                   \
+    (b).val[0] = vtrn128_temp;                 \
+} while (0)
+
+/*
+ * Regroup the 64-bit halves of two float32x4_t lvalues:
+ *   a <- { low(a),  low(b)  }
+ *   b <- { high(a), high(b) }
+ * All four halves are read before either argument is written.
+ * Temporaries carry a vtrn64_ prefix so they cannot capture an identifier
+ * passed in by the caller.
+ */
+#define vtrn128_64q(a, b)                                   \
+do {                                                        \
+    float32x2_t vtrn64_a_lo = vget_low_f32(a);              \
+    float32x2_t vtrn64_b_lo = vget_low_f32(b);              \
+    float32x2_t vtrn64_a_hi = vget_high_f32(a);             \
+    float32x2_t vtrn64_b_hi = vget_high_f32(b);             \
+    (a) = vcombine_f32(vtrn64_a_lo, vtrn64_b_lo);           \
+    (b) = vcombine_f32(vtrn64_a_hi, vtrn64_b_hi);           \
+} while (0)
+
+/*
+ * Regroup the 64-bit halves of two lvalues with a float32x4_t val[2] member:
+ *   a.val[0] <- { low(a.val[0]),  low(b.val[0])  }
+ *   a.val[1] <- { low(a.val[1]),  low(b.val[1])  }
+ *   b.val[0] <- { high(a.val[0]), high(b.val[0]) }
+ *   b.val[1] <- { high(a.val[1]), high(b.val[1]) }
+ * All halves are read before anything is written. Arguments are
+ * parenthesized and temporaries carry a vtrn256_64_ prefix so they cannot
+ * capture an identifier passed in by the caller.
+ */
+#define vtrn256_64q(a, b)                                                  \
+do {                                                                       \
+    float32x2_t vtrn256_64_a0 = vget_low_f32((a).val[0]);                  \
+    float32x2_t vtrn256_64_a1 = vget_high_f32((a).val[0]);                 \
+    float32x2_t vtrn256_64_a2 = vget_low_f32((a).val[1]);                  \
+    float32x2_t vtrn256_64_a3 = vget_high_f32((a).val[1]);                 \
+    float32x2_t vtrn256_64_b0 = vget_low_f32((b).val[0]);                  \
+    float32x2_t vtrn256_64_b1 = vget_high_f32((b).val[0]);                 \
+    float32x2_t vtrn256_64_b2 = vget_low_f32((b).val[1]);                  \
+    float32x2_t vtrn256_64_b3 = vget_high_f32((b).val[1]);                 \
+    (a).val[0] = vcombine_f32(vtrn256_64_a0, vtrn256_64_b0);               \
+    (a).val[1] = vcombine_f32(vtrn256_64_a2, vtrn256_64_b2);               \
+    (b).val[0] = vcombine_f32(vtrn256_64_a1, vtrn256_64_b1);               \
+    (b).val[1] = vcombine_f32(vtrn256_64_a3, vtrn256_64_b3);               \
+} while (0)
+
+/*
+ * Apply vtrnq_f32 to two float32x4_t lvalues and write the two result
+ * vectors back: a receives val[0] of the result, b receives val[1].
+ */
+#define vtrn128_32q(a, b)                               \
+do {                                                    \
+	float32x4x2_t vtrn32_tmp = vtrnq_f32((a), (b)); \
+	(a) = vtrn32_tmp.val[0];                        \
+	(b) = vtrn32_tmp.val[1];                        \
+} while (0)
+
+/*
+ * Apply vtrnq_f32 to the matching val[] entries of two lvalues with a
+ * float32x4_t val[2] member and write the results back:
+ * a receives val[0] of each result, b receives val[1].
+ * Arguments are parenthesized before member access.
+ */
+#define vtrn256_32q(a, b)                                               \
+do {                                                                    \
+    float32x4x2_t vtrn32_tmp_1 = vtrnq_f32((a).val[0], (b).val[0]);     \
+    float32x4x2_t vtrn32_tmp_2 = vtrnq_f32((a).val[1], (b).val[1]);     \
+    (a).val[0] = vtrn32_tmp_1.val[0];                                   \
+    (a).val[1] = vtrn32_tmp_2.val[0];                                   \
+    (b).val[0] = vtrn32_tmp_1.val[1];                                   \
+    (b).val[1] = vtrn32_tmp_2.val[1];                                   \
+} while (0)
+
+/*
+ * Lane-wise compare-exchange of two float32x4_t lvalues:
+ * a receives the lane-wise minimum, b the lane-wise maximum.
+ * The original a is saved first because a is overwritten before the maximum
+ * is computed. Note that both arguments are evaluated more than once.
+ */
+#define vminmaxq(a, b)                    \
+	do {                              \
+	float32x4_t minmax_tmp = (a);     \
+	(a) = vminq_f32((a), (b));        \
+	(b) = vmaxq_f32(minmax_tmp, (b)); \
+} while (0)
+
+/*
+ * Lane-wise compare-exchange of two lvalues with a float32x4_t val[2] member:
+ * a receives the lane-wise minima, b the lane-wise maxima.
+ * The original a is saved first because it is overwritten before the maxima
+ * are computed. Arguments are parenthesized before member access; both are
+ * evaluated more than once.
+ */
+#define vminmax256q(a, b)                                       \
+do {                                                            \
+    float32x4x2_t minmax256_tmp = (a);                          \
+    (a).val[0] = vminq_f32((a).val[0], (b).val[0]);             \
+    (a).val[1] = vminq_f32((a).val[1], (b).val[1]);             \
+    (b).val[0] = vmaxq_f32(minmax256_tmp.val[0], (b).val[0]);   \
+    (b).val[1] = vmaxq_f32(minmax256_tmp.val[1], (b).val[1]);   \
+} while (0)
+
+/*
+ * Expression yielding a float32x4_t built from a: the high half (with its
+ * two lanes swapped by vrev64_f32) becomes the low half of the result and
+ * the low half (also swapped) becomes the high half.
+ * The argument is evaluated twice.
+ */
+#define vrev128q_f32(a) \
+        vcombine_f32(vrev64_f32(vget_high_f32(a)), vrev64_f32(vget_low_f32(a)))
+
+/*
+ * In-place reversal of an lvalue with a float32x4_t val[2] member:
+ * each val[] entry has its halves exchanged and the lanes of each half
+ * swapped, and the two entries trade places.
+ * The reversed val[0] is kept in a temporary because val[0] is overwritten
+ * before val[1] is assigned. The argument is parenthesized before member
+ * access and is evaluated several times.
+ */
+#define vrev256q_f32(a)                                                                       \
+do {                                                                                          \
+    float32x4_t rev_tmp = vcombine_f32(vrev64_f32(vget_high_f32((a).val[0])),                 \
+                                       vrev64_f32(vget_low_f32((a).val[0])));                 \
+    (a).val[0] = vcombine_f32(vrev64_f32(vget_high_f32((a).val[1])),                          \
+                              vrev64_f32(vget_low_f32((a).val[1])));                          \
+    (a).val[1] = rev_tmp;                                                                     \
+} while (0)
+
+/*
+ * Load four floats from p into the lvalue a with vld1q_f32, then replace a
+ * with vrev128q_f32(a).
+ */
+#define vldrev128q_f32(a, p) \
+	do {                 \
+	a = vld1q_f32(p);    \
+	a = vrev128q_f32(a); \
+} while (0)
+
+#endif /* ARM_MATH_NEON */
+
+#ifdef   __cplusplus
+}
+#endif
+
+#endif /* _ARM_SORTING_H_ */
--- a/libraries/cmsis/dsp/PrivateInclude/arm_vec_fft.h
+++ b/libraries/cmsis/dsp/PrivateInclude/arm_vec_fft.h
@@ -0,0 +1,58 @@
+/******************************************************************************
+ * @file     arm_vec_fft.h
+ * @brief    Private header file for CMSIS DSP Library
+ * @version  V1.7.0
+ * @date     07. January 2020
+ ******************************************************************************/
+/*
+ * Copyright (c) 2010-2020 Arm Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef _ARM_VEC_FFT_H_
+#define _ARM_VEC_FFT_H_
+
+#include "arm_math.h"
+#include "arm_helium_utils.h"
+
+#ifdef   __cplusplus
+extern "C"
+{
+#endif
+
+#if (defined(ARM_MATH_MVEF) || defined(ARM_MATH_MVEI) || defined(ARM_MATH_HELIUM)) && !defined(ARM_MATH_AUTOVECTORIZE)
+
+/*
+ * Shorthand for the MVE complex-arithmetic intrinsics used by the FFT code.
+ * Every macro expands to a single expression with no trailing semicolon, so
+ * it can be used on the right-hand side of an assignment or nested inside
+ * another call; the caller supplies the terminating ';'.
+ */
+
+/* Floating-point variants */
+#define MVE_CMPLX_ADD_A_ixB(A, B)           vcaddq_rot90(A,B)
+#define MVE_CMPLX_SUB_A_ixB(A,B)            vcaddq_rot270(A,B)
+#define MVE_CMPLX_MULT_FLT_AxB(A,B)         vcmlaq_rot90(vcmulq(A, B), A, B)
+#define MVE_CMPLX_MULT_FLT_Conj_AxB(A,B)    vcmlaq_rot270(vcmulq(A, B), A, B)
+
+/* Fixed-point variants; the accumulator is an uninitialized vector cast to the type of A */
+#define MVE_CMPLX_MULT_FX_AxB(A,B)          vqdmladhxq(vqdmlsdhq((__typeof(A))vuninitializedq_s32(), A, B), A, B)
+#define MVE_CMPLX_MULT_FX_AxConjB(A,B)      vqdmladhq(vqdmlsdhxq((__typeof(A))vuninitializedq_s32(), A, B), A, B)
+
+#define MVE_CMPLX_ADD_FX_A_ixB(A, B)        vhcaddq_rot90(A,B)
+#define MVE_CMPLX_SUB_FX_A_ixB(A,B)         vhcaddq_rot270(A,B)
+
+
+#endif /* (defined(ARM_MATH_MVEF) || defined(ARM_MATH_MVEI) || defined(ARM_MATH_HELIUM)) && !defined(ARM_MATH_AUTOVECTORIZE) */
+
+
+#ifdef   __cplusplus
+}
+#endif
+
+
+#endif /* _ARM_VEC_FFT_H_ */
--- a/libraries/cmsis/dsp/PrivateInclude/arm_vec_filtering.h
+++ b/libraries/cmsis/dsp/PrivateInclude/arm_vec_filtering.h
--- a/libraries/cmsis/dsp/Source/BasicMathFunctions/BasicMathFunctions.c
+++ b/libraries/cmsis/dsp/Source/BasicMathFunctions/BasicMathFunctions.c
@@ -0,0 +1,75 @@
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS DSP Library
+ * Title:        BasicMathFunctions.c
+ * Description:  Combination of all basic math function source files.
+ *
+ * $Date:        16. March 2020
+ * $Revision:    V1.1.0
+ *
+ * Target Processor: Cortex-M cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2019-2020 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "arm_abs_f32.c"
+#include "arm_abs_q15.c"
+#include "arm_abs_q31.c"
+#include "arm_abs_q7.c"
+#include "arm_add_f32.c"
+#include "arm_add_q15.c"
+#include "arm_add_q31.c"
+#include "arm_add_q7.c"
+#include "arm_and_u16.c"
+#include "arm_and_u32.c"
+#include "arm_and_u8.c"
+#include "arm_dot_prod_f32.c"
+#include "arm_dot_prod_q15.c"
+#include "arm_dot_prod_q31.c"
+#include "arm_dot_prod_q7.c"
+#include "arm_mult_f32.c"
+#include "arm_mult_q15.c"
+#include "arm_mult_q31.c"
+#include "arm_mult_q7.c"
+#include "arm_negate_f32.c"
+#include "arm_negate_q15.c"
+#include "arm_negate_q31.c"
+#include "arm_negate_q7.c"
+#include "arm_not_u16.c"
+#include "arm_not_u32.c"
+#include "arm_not_u8.c"
+#include "arm_offset_f32.c"
+#include "arm_offset_q15.c"
+#include "arm_offset_q31.c"
+#include "arm_offset_q7.c"
+#include "arm_or_u16.c"
+#include "arm_or_u32.c"
+#include "arm_or_u8.c"
+#include "arm_scale_f32.c"
+#include "arm_scale_q15.c"
+#include "arm_scale_q31.c"
+#include "arm_scale_q7.c"
+#include "arm_shift_q15.c"
+#include "arm_shift_q31.c"
+#include "arm_shift_q7.c"
+#include "arm_sub_f32.c"
+#include "arm_sub_q15.c"
+#include "arm_sub_q31.c"
+#include "arm_sub_q7.c"
+#include "arm_xor_u16.c"
+#include "arm_xor_u32.c"
+#include "arm_xor_u8.c"
--- a/libraries/cmsis/dsp/Source/BasicMathFunctions/CMakeLists.txt
+++ b/libraries/cmsis/dsp/Source/BasicMathFunctions/CMakeLists.txt
@@ -0,0 +1,19 @@
+# Builds the static library CMSISDSPBasicMath from the sources in this directory.
+cmake_minimum_required (VERSION 3.6)
+
+project(CMSISDSPBasicMath)
+
+# Modules providing the configLib() and configDsp() commands used below.
+include(configLib)
+include(configDsp)
+
+# Only files whose name contains an underscore match this pattern
+# (arm_abs_f32.c, ...). BasicMathFunctions.c, which #includes the individual
+# sources, has no underscore in its name and is therefore not picked up.
+file(GLOB SRC "./*_*.c")
+
+add_library(CMSISDSPBasicMath STATIC ${SRC})
+
+# ROOT is expected to be set by the including project.
+configLib(CMSISDSPBasicMath ${ROOT})
+configDsp(CMSISDSPBasicMath ${ROOT})
+
+### Includes
+# DSP is expected to be set by the including project.
+target_include_directories(CMSISDSPBasicMath PUBLIC "${DSP}/Include")
+
+
+
--- a/libraries/cmsis/dsp/Source/BasicMathFunctions/arm_abs_f32.c
+++ b/libraries/cmsis/dsp/Source/BasicMathFunctions/arm_abs_f32.c
@@ -0,0 +1,196 @@
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS DSP Library
+ * Title:        arm_abs_f32.c
+ * Description:  Floating-point vector absolute value
+ *
+ * $Date:        18. March 2019
+ * $Revision:    V1.6.0
+ *
+ * Target Processor: Cortex-M cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "arm_math.h"
+#include <math.h>
+
+/**
+  @ingroup groupMath
+ */
+
+/**
+  @defgroup BasicAbs Vector Absolute Value
+
+  Computes the absolute value of a vector on an element-by-element basis.
+
+  <pre>
+      pDst[n] = abs(pSrc[n]),   0 <= n < blockSize.
+  </pre>
+
+  The functions support in-place computation allowing the source and
+  destination pointers to reference the same memory buffer.
+  There are separate functions for floating-point, Q7, Q15, and Q31 data types.
+ */
+
+/**
+  @addtogroup BasicAbs
+  @{
+ */
+
+/**
+  @brief         Floating-point vector absolute value.
+  @param[in]     pSrc       points to the input vector
+  @param[out]    pDst       points to the output vector
+  @param[in]     blockSize  number of samples in each vector
+  @return        none
+ */
+
+
+#if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE)
+
+#include "arm_helium_utils.h"
+
+/*
+ * Helium (MVE) implementation.
+ *
+ * Full groups of four samples are processed with plain vector loads and
+ * stores. The remaining 1..3 samples are handled with a tail predicate that
+ * is applied to BOTH the load and the store, so no element beyond
+ * pSrc[blockSize - 1] is read and none beyond pDst[blockSize - 1] is written.
+ */
+void arm_abs_f32(
+  const float32_t * pSrc,
+        float32_t * pDst,
+        uint32_t blockSize)
+{
+    uint32_t blkCnt;                               /* Loop counter */
+    f32x4_t vec1;
+    f32x4_t res;
+
+
+    /* Compute 4 outputs at a time */
+    blkCnt = blockSize >> 2U;
+
+    while (blkCnt > 0U)
+    {
+        /* C = |A| */
+
+        /* Calculate absolute values and then store the results in the destination buffer. */
+        vec1 = vld1q(pSrc);
+        res = vabsq(vec1);
+        vst1q(pDst, res);
+
+        /* Increment pointers */
+        pSrc += 4;
+        pDst += 4;
+
+        /* Decrement the loop counter */
+        blkCnt--;
+    }
+
+    /* Tail */
+    blkCnt = blockSize & 0x3;
+
+
+    if (blkCnt > 0U)
+    {
+      /* C = |A| */
+      mve_pred16_t p0 = vctp32q(blkCnt);
+      /* Predicated load: inactive lanes are not read from memory */
+      vec1 = vld1q_z_f32(pSrc, p0);
+      vstrwq_p(pDst, vabsq(vec1), p0);
+    }
+
+}
+
+#else
+/*
+ * Scalar / NEON implementation.
+ *
+ * Depending on the build configuration the bulk of the data is processed
+ * four samples at a time (NEON vectors or an unrolled scalar loop); whatever
+ * is left, or everything when neither option is enabled, goes through the
+ * one-sample-per-iteration loop at the end.
+ */
+void arm_abs_f32(
+  const float32_t * pSrc,
+        float32_t * pDst,
+        uint32_t blockSize)
+{
+  uint32_t blkCnt;                               /* Remaining iterations of the current loop */
+
+#if defined(ARM_MATH_NEON) && !defined(ARM_MATH_AUTOVECTORIZE)
+
+  /* NEON: C = |A| on four samples per iteration */
+  for (blkCnt = blockSize >> 2U; blkCnt > 0U; blkCnt--)
+  {
+    /* Load, take the absolute value and store in one go */
+    vst1q_f32(pDst, vabsq_f32(vld1q_f32(pSrc)));
+
+    /* Advance to the next group of four */
+    pSrc += 4;
+    pDst += 4;
+  }
+
+  /* Samples left over for the scalar loop */
+  blkCnt = blockSize & 0x3;
+
+#elif defined (ARM_MATH_LOOPUNROLL) && !defined(ARM_MATH_AUTOVECTORIZE)
+
+  /* Unrolled scalar loop: C = |A| on four samples per iteration */
+  for (blkCnt = blockSize >> 2U; blkCnt > 0U; blkCnt--)
+  {
+    pDst[0] = fabsf(pSrc[0]);
+    pDst[1] = fabsf(pSrc[1]);
+    pDst[2] = fabsf(pSrc[2]);
+    pDst[3] = fabsf(pSrc[3]);
+
+    /* Advance to the next group of four */
+    pSrc += 4;
+    pDst += 4;
+  }
+
+  /* Samples left over for the scalar loop */
+  blkCnt = blockSize % 0x4U;
+
+#else
+
+  /* No vector or unrolled path: every sample goes through the scalar loop */
+  blkCnt = blockSize;
+
+#endif /* NEON / ARM_MATH_LOOPUNROLL selection */
+
+  /* Scalar loop: C = |A|, one sample per iteration */
+  for (; blkCnt > 0U; blkCnt--)
+  {
+    *pDst++ = fabsf(*pSrc++);
+  }
+
+}
+#endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
+/**
+  @} end of BasicAbs group
+ */
--- a/libraries/cmsis/dsp/Source/BasicMathFunctions/arm_abs_q15.c
+++ b/libraries/cmsis/dsp/Source/BasicMathFunctions/arm_abs_q15.c
@@ -0,0 +1,178 @@
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS DSP Library
+ * Title:        arm_abs_q15.c
+ * Description:  Q15 vector absolute value
+ *
+ * $Date:        18. March 2019
+ * $Revision:    V1.6.0
+ *
+ * Target Processor: Cortex-M cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "arm_math.h"
+
+/**
+  @ingroup groupMath
+ */
+
+/**
+  @addtogroup BasicAbs
+  @{
+ */
+
+/**
+  @brief         Q15 vector absolute value.
+  @param[in]     pSrc       points to the input vector
+  @param[out]    pDst       points to the output vector
+  @param[in]     blockSize  number of samples in each vector
+  @return        none
+
+  @par           Scaling and Overflow Behavior
+                   The function uses saturating arithmetic.
+                   The Q15 value -1 (0x8000) will be saturated to the maximum allowable positive value 0x7FFF.
+ */
+
+#if defined(ARM_MATH_MVEI)
+
+#include "arm_helium_utils.h"
+
+/*
+ * Helium (MVE) implementation.
+ *
+ * Full groups of eight samples are processed with plain vector loads and
+ * stores. The remaining 1..7 samples are handled with a tail predicate that
+ * is applied to BOTH the load and the store, so no element beyond
+ * pSrc[blockSize - 1] is read and none beyond pDst[blockSize - 1] is written.
+ */
+void arm_abs_q15(
+    const q15_t * pSrc,
+    q15_t * pDst,
+    uint32_t blockSize)
+{
+    uint32_t  blkCnt;           /* loop counters */
+    q15x8_t vecSrc;
+
+    /* Compute 8 outputs at a time */
+    blkCnt = blockSize >> 3;
+    while (blkCnt > 0U)
+    {
+        /*
+         * C = |A|
+         * Calculate absolute and then store the results in the destination buffer.
+         */
+        vecSrc = vld1q(pSrc);
+        vst1q(pDst, vqabsq(vecSrc));
+        /*
+         * Decrement the blockSize loop counter
+         */
+        blkCnt--;
+        /*
+         * advance vector source and destination pointers
+         */
+        pSrc += 8;
+        pDst += 8;
+    }
+    /*
+     * tail
+     */
+    blkCnt = blockSize & 7;
+    if (blkCnt > 0U)
+    {
+        mve_pred16_t p0 = vctp16q(blkCnt);
+        /* Predicated load: inactive lanes are not read from memory */
+        vecSrc = vld1q_z_s16(pSrc, p0);
+        vstrhq_p(pDst, vqabsq(vecSrc), p0);
+    }
+}
+
+#else
+/*
+ * Scalar implementation.
+ *
+ * Positive inputs are copied unchanged. Other inputs are negated with
+ * saturation: through __QSUB16 when ARM_MATH_DSP is defined, otherwise with
+ * an explicit check that maps 0x8000 to 0x7fff.
+ * With ARM_MATH_LOOPUNROLL the bulk of the data is processed four samples
+ * per iteration; the remainder (or everything, without unrolling) goes
+ * through the one-sample loop at the end.
+ */
+void arm_abs_q15(
+  const q15_t * pSrc,
+        q15_t * pDst,
+        uint32_t blockSize)
+{
+  uint32_t blkCnt;                               /* Remaining iterations of the current loop */
+  q15_t in;                                      /* Sample currently being processed */
+
+#if defined (ARM_MATH_LOOPUNROLL)
+
+  /* Unrolled loop: C = |A| on four samples per iteration */
+  for (blkCnt = blockSize >> 2U; blkCnt > 0U; blkCnt--)
+  {
+    /* Sample 0 */
+    in = pSrc[0];
+#if defined (ARM_MATH_DSP)
+    pDst[0] = (in > 0) ? in : (q15_t)__QSUB16(0, in);
+#else
+    pDst[0] = (in > 0) ? in : ((in == (q15_t) 0x8000) ? 0x7fff : -in);
+#endif
+
+    /* Sample 1 */
+    in = pSrc[1];
+#if defined (ARM_MATH_DSP)
+    pDst[1] = (in > 0) ? in : (q15_t)__QSUB16(0, in);
+#else
+    pDst[1] = (in > 0) ? in : ((in == (q15_t) 0x8000) ? 0x7fff : -in);
+#endif
+
+    /* Sample 2 */
+    in = pSrc[2];
+#if defined (ARM_MATH_DSP)
+    pDst[2] = (in > 0) ? in : (q15_t)__QSUB16(0, in);
+#else
+    pDst[2] = (in > 0) ? in : ((in == (q15_t) 0x8000) ? 0x7fff : -in);
+#endif
+
+    /* Sample 3 */
+    in = pSrc[3];
+#if defined (ARM_MATH_DSP)
+    pDst[3] = (in > 0) ? in : (q15_t)__QSUB16(0, in);
+#else
+    pDst[3] = (in > 0) ? in : ((in == (q15_t) 0x8000) ? 0x7fff : -in);
+#endif
+
+    /* Advance to the next group of four */
+    pSrc += 4;
+    pDst += 4;
+  }
+
+  /* Samples left over for the one-sample loop */
+  blkCnt = blockSize % 0x4U;
+
+#else
+
+  /* No unrolling: every sample goes through the one-sample loop */
+  blkCnt = blockSize;
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
+  /* C = |A|, one sample per iteration (0x8000 saturates to 0x7fff) */
+  for (; blkCnt > 0U; blkCnt--)
+  {
+    in = *pSrc++;
+#if defined (ARM_MATH_DSP)
+    *pDst++ = (in > 0) ? in : (q15_t)__QSUB16(0, in);
+#else
+    *pDst++ = (in > 0) ? in : ((in == (q15_t) 0x8000) ? 0x7fff : -in);
+#endif
+  }
+
+}
+#endif /* defined(ARM_MATH_MVEI) */
+
+/**
+  @} end of BasicAbs group
+ */
--- a/libraries/cmsis/dsp/Source/BasicMathFunctions/arm_abs_q31.c
+++ b/libraries/cmsis/dsp/Source/BasicMathFunctions/arm_abs_q31.c
@@ -0,0 +1,208 @@
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS DSP Library
+ * Title:        arm_abs_q31.c
+ * Description:  Q31 vector absolute value
+ *
+ * $Date:        18. March 2019
+ * $Revision:    V1.6.0
+ *
+ * Target Processor: Cortex-M cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "arm_math.h"
+
+/**
+  @ingroup groupMath
+ */
+
+/**
+  @addtogroup BasicAbs
+  @{
+ */
+
+/**
+  @brief         Q31 vector absolute value.
+  @param[in]     pSrc       points to the input vector
+  @param[out]    pDst       points to the output vector
+  @param[in]     blockSize  number of samples in each vector
+  @return        none
+
+  @par           Scaling and Overflow Behavior
+                   The function uses saturating arithmetic.
+                   The Q31 value -1 (0x80000000) will be saturated to the maximum allowable positive value 0x7FFFFFFF.
+ */
+
+#if defined(ARM_MATH_MVEI)
+
+#include "arm_helium_utils.h"
+
+void arm_abs_q31(
+    const q31_t * pSrc,
+    q31_t * pDst,
+    uint32_t blockSize)
+{
+    uint32_t  blkCnt;           /* Loop counter: whole vectors first, then the leftover sample count */
+    q31x4_t vecSrc;             /* Four Q31 input samples */
+
+    /* Number of complete 4-sample vectors */
+    blkCnt = blockSize >> 2;
+
+    while (blkCnt > 0U)
+    {
+        /*
+         * C = |A|
+         * Load four samples, take the saturating absolute value, store four results.
+         */
+        vecSrc = vld1q(pSrc);
+        vst1q(pDst, vqabsq(vecSrc));
+        /*
+         * One vector processed: decrement the loop counter
+         */
+        blkCnt--;
+        /*
+         * Advance source and destination pointers by one vector (4 samples)
+         */
+        pSrc += 4;
+        pDst += 4;
+    }
+    /*
+     * Tail: the 0..3 samples that do not fill a whole vector
+     */
+    blkCnt = blockSize & 3;
+
+    if (blkCnt > 0U)
+    {
+        mve_pred16_t p0 = vctp32q(blkCnt);      /* Tail predicate derived from the leftover count */
+        vecSrc = vld1q(pSrc);                   /* NOTE(review): load is not predicated, so a full vector is read although fewer samples remain - confirm the over-read is acceptable */
+        vstrwq_p(pDst, vqabsq(vecSrc), p0);     /* Predicated store: only the lanes enabled by p0 are written */
+    }
+}
+
+#else
+void arm_abs_q31(
+  const q31_t * pSrc,
+        q31_t * pDst,
+        uint32_t blockSize)
+{
+        uint32_t blkCnt;                               /* Loop counter, reused by the vector/unrolled pass and the per-sample pass */
+        q31_t in;                                      /* Current input sample */
+
+#if defined(ARM_MATH_NEON)
+    int32x4_t vec1;
+    int32x4_t res;
+
+    /* Number of complete 4-sample vectors */
+    blkCnt = blockSize >> 2U;
+
+    while (blkCnt > 0U)
+    {
+        /* C = |A| */
+        /* Load four samples, take the saturating absolute value, store four results. */
+
+        vec1 = vld1q_s32(pSrc);
+        res = vqabsq_s32(vec1);
+        vst1q_s32(pDst, res);
+
+        /* Advance both pointers by one vector (4 samples) */
+        pSrc += 4;
+        pDst += 4;
+
+        /* Decrement the vector loop counter */
+        blkCnt--;
+    }
+
+    /* Tail: the 0..3 leftover samples are handled by the per-sample loop below */
+    blkCnt = blockSize & 0x3;
+
+#else
+#if defined (ARM_MATH_LOOPUNROLL)
+
+  /* Loop unrolling: number of groups of 4 samples */
+  blkCnt = blockSize >> 2U;
+
+  while (blkCnt > 0U)
+  {
+    /* C = |A| */
+
+    /* Absolute value of each input; INT32_MIN (Q31 -1.0) cannot be negated and is stored as 0x7fffffff. */
+    in = *pSrc++;
+#if defined (ARM_MATH_DSP)
+    *pDst++ = (in > 0) ? in : (q31_t)__QSUB(0, in);
+#else
+    *pDst++ = (in > 0) ? in : ((in == INT32_MIN) ? INT32_MAX : -in);
+#endif
+
+    in = *pSrc++;
+#if defined (ARM_MATH_DSP)
+    *pDst++ = (in > 0) ? in : (q31_t)__QSUB(0, in);
+#else
+    *pDst++ = (in > 0) ? in : ((in == INT32_MIN) ? INT32_MAX : -in);
+#endif
+
+    in = *pSrc++;
+#if defined (ARM_MATH_DSP)
+    *pDst++ = (in > 0) ? in : (q31_t)__QSUB(0, in);
+#else
+    *pDst++ = (in > 0) ? in : ((in == INT32_MIN) ? INT32_MAX : -in);
+#endif
+
+    in = *pSrc++;
+#if defined (ARM_MATH_DSP)
+    *pDst++ = (in > 0) ? in : (q31_t)__QSUB(0, in);
+#else
+    *pDst++ = (in > 0) ? in : ((in == INT32_MIN) ? INT32_MAX : -in);
+#endif
+
+    /* One group of four done: decrement loop counter */
+    blkCnt--;
+  }
+
+  /* Loop unrolling: the 0..3 remaining samples go through the loop below */
+  blkCnt = blockSize % 0x4U;
+
+#else
+
+  /* No vector or unrolled pass: every sample goes through the loop below */
+  blkCnt = blockSize;
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+#endif /* #if defined (ARM_MATH_NEON) */
+
+  while (blkCnt > 0U)
+  {
+    /* C = |A| */
+
+    /* Absolute value of one input; INT32_MIN (Q31 -1.0) cannot be negated and is stored as 0x7fffffff. */
+    in = *pSrc++;
+#if defined (ARM_MATH_DSP)
+    *pDst++ = (in > 0) ? in : (q31_t)__QSUB(0, in);
+#else
+    *pDst++ = (in > 0) ? in : ((in == INT32_MIN) ? INT32_MAX : -in);
+#endif
+
+    /* Decrement loop counter */
+    blkCnt--;
+  }
+
+}
+#endif /* #if defined (ARM_MATH_MVEI) */
+/**
+  @} end of BasicAbs group
+ */
--- a/libraries/cmsis/dsp/Source/BasicMathFunctions/arm_abs_q7.c
+++ b/libraries/cmsis/dsp/Source/BasicMathFunctions/arm_abs_q7.c
@@ -0,0 +1,180 @@
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS DSP Library
+ * Title:        arm_abs_q7.c
+ * Description:  Q7 vector absolute value
+ *
+ * $Date:        18. March 2019
+ * $Revision:    V1.6.0
+ *
+ * Target Processor: Cortex-M cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "arm_math.h"
+
+/**
+  @ingroup groupMath
+ */
+
+/**
+  @addtogroup BasicAbs
+  @{
+ */
+
+/**
+  @brief         Q7 vector absolute value.
+  @param[in]     pSrc       points to the input vector
+  @param[out]    pDst       points to the output vector
+  @param[in]     blockSize  number of samples in each vector
+  @return        none
+
+  @par           Conditions for optimum performance
+                   Input and output buffers should be 32-bit aligned
+  @par           Scaling and Overflow Behavior
+                   The function uses saturating arithmetic.
+                   The Q7 value -1 (0x80) will be saturated to the maximum allowable positive value 0x7F.
+ */
+
+#if defined(ARM_MATH_MVEI)
+
+#include "arm_helium_utils.h"
+
+void arm_abs_q7(
+    const q7_t * pSrc,
+    q7_t * pDst,
+    uint32_t blockSize)
+{
+    uint32_t  blkCnt;           /* Loop counter: whole vectors first, then the leftover sample count */
+    q7x16_t vecSrc;             /* Sixteen Q7 input samples */
+
+    /* Number of complete 16-sample vectors */
+    blkCnt = blockSize >> 4;
+    while (blkCnt > 0U)
+    {
+        /*
+         * C = |A|
+         * Load sixteen samples, take the saturating absolute value, store sixteen results.
+         */
+        vecSrc = vld1q(pSrc);
+        vst1q(pDst, vqabsq(vecSrc));
+        /*
+         * One vector processed: decrement the loop counter
+         */
+        blkCnt--;
+        /*
+         * Advance source and destination pointers by one vector (16 samples)
+         */
+        pSrc += 16;
+        pDst += 16;
+    }
+    /*
+     * Tail: the 0..15 samples that do not fill a whole vector
+     */
+    blkCnt = blockSize & 0xF;
+    if (blkCnt > 0U)
+    {
+        mve_pred16_t p0 = vctp8q(blkCnt);       /* Tail predicate derived from the leftover count */
+        vecSrc = vld1q(pSrc);                   /* NOTE(review): load is not predicated, so a full vector is read although fewer samples remain - confirm the over-read is acceptable */
+        vstrbq_p(pDst, vqabsq(vecSrc), p0);     /* Predicated store: only the lanes enabled by p0 are written */
+    }
+}
+
+#else
+void arm_abs_q7(
+  const q7_t * pSrc,
+        q7_t * pDst,
+        uint32_t blockSize)
+{
+        uint32_t blkCnt;                               /* Loop counter, reused by the unrolled pass and the per-sample pass */
+        q7_t in;                                       /* Current input sample */
+
+#if defined (ARM_MATH_LOOPUNROLL)
+
+  /* Loop unrolling: number of groups of 4 samples */
+  blkCnt = blockSize >> 2U;
+
+  while (blkCnt > 0U)
+  {
+    /* C = |A| */
+
+    /* Absolute value of each input; 0x80 (Q7 -1.0) cannot be negated and is stored as 0x7f. */
+    in = *pSrc++;
+#if defined (ARM_MATH_DSP)
+    *pDst++ = (in > 0) ? in : (q7_t)__QSUB8(0, in);
+#else
+    *pDst++ = (in > 0) ? in : ((in == (q7_t) 0x80) ? (q7_t) 0x7f : -in);
+#endif
+
+    in = *pSrc++;
+#if defined (ARM_MATH_DSP)
+    *pDst++ = (in > 0) ? in : (q7_t)__QSUB8(0, in);
+#else
+    *pDst++ = (in > 0) ? in : ((in == (q7_t) 0x80) ? (q7_t) 0x7f : -in);
+#endif
+
+    in = *pSrc++;
+#if defined (ARM_MATH_DSP)
+    *pDst++ = (in > 0) ? in : (q7_t)__QSUB8(0, in);
+#else
+    *pDst++ = (in > 0) ? in : ((in == (q7_t) 0x80) ? (q7_t) 0x7f : -in);
+#endif
+
+    in = *pSrc++;
+#if defined (ARM_MATH_DSP)
+    *pDst++ = (in > 0) ? in : (q7_t)__QSUB8(0, in);
+#else
+    *pDst++ = (in > 0) ? in : ((in == (q7_t) 0x80) ? (q7_t) 0x7f : -in);
+#endif
+
+    /* One group of four done: decrement loop counter */
+    blkCnt--;
+  }
+
+  /* Loop unrolling: the 0..3 remaining samples go through the loop below */
+  blkCnt = blockSize % 0x4U;
+
+#else
+
+  /* No unrolled pass: every sample goes through the loop below */
+  blkCnt = blockSize;
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
+  while (blkCnt > 0U)
+  {
+    /* C = |A| */
+
+    /* Absolute value of one input; 0x80 (Q7 -1.0) cannot be negated and is stored as 0x7f. */
+    in = *pSrc++;
+#if defined (ARM_MATH_DSP)
+    *pDst++ = (in > 0) ? in : (q7_t) __QSUB8(0, in);
+#else
+    *pDst++ = (in > 0) ? in : ((in == (q7_t) 0x80) ? (q7_t) 0x7f : -in);
+#endif
+
+    /* Decrement loop counter */
+    blkCnt--;
+  }
+
+}
+#endif /* defined(ARM_MATH_MVEI) */
+
+/**
+  @} end of BasicAbs group
+ */
--- a/libraries/cmsis/dsp/Source/BasicMathFunctions/arm_add_f32.c
+++ b/libraries/cmsis/dsp/Source/BasicMathFunctions/arm_add_f32.c
@@ -0,0 +1,199 @@
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS DSP Library
+ * Title:        arm_add_f32.c
+ * Description:  Floating-point vector addition
+ *
+ * $Date:        18. March 2019
+ * $Revision:    V1.6.0
+ *
+ * Target Processor: Cortex-M cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "arm_math.h"
+
+/**
+  @ingroup groupMath
+ */
+
+/**
+  @defgroup BasicAdd Vector Addition
+
+  Element-by-element addition of two vectors.
+
+  <pre>
+      pDst[n] = pSrcA[n] + pSrcB[n],   0 <= n < blockSize.
+  </pre>
+
+  There are separate functions for floating-point, Q7, Q15, and Q31 data types.
+ */
+
+/**
+  @addtogroup BasicAdd
+  @{
+ */
+
+/**
+  @brief         Floating-point vector addition.
+  @param[in]     pSrcA      points to first input vector
+  @param[in]     pSrcB      points to second input vector
+  @param[out]    pDst       points to output vector
+  @param[in]     blockSize  number of samples in each vector
+  @return        none
+ */
+
+#if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE)
+
+#include "arm_helium_utils.h"
+
+void arm_add_f32(
+  const float32_t * pSrcA,
+  const float32_t * pSrcB,
+        float32_t * pDst,
+        uint32_t blockSize)
+{
+    uint32_t blkCnt;                               /* Loop counter: whole vectors first, then the leftover sample count */
+
+    f32x4_t vec1;                                  /* Four samples from pSrcA */
+    f32x4_t vec2;                                  /* Four samples from pSrcB */
+    f32x4_t res;                                   /* Lane-wise sum of vec1 and vec2 */
+
+    /* Number of complete 4-sample vectors */
+    blkCnt = blockSize >> 2U;
+
+    while (blkCnt > 0U)
+    {
+        /* C = A + B */
+
+        /* Load four samples from each input, add lane-wise, store four results. */
+        vec1 = vld1q(pSrcA);
+        vec2 = vld1q(pSrcB);
+        res = vaddq(vec1, vec2);
+        vst1q(pDst, res);
+
+        /* Advance all three pointers by one vector (4 samples) */
+        pSrcA += 4;
+        pSrcB += 4;
+        pDst += 4;
+
+        /* Decrement the loop counter */
+        blkCnt--;
+    }
+
+    /* Tail: the 0..3 samples that do not fill a whole vector */
+    blkCnt = blockSize & 0x3;
+
+    if (blkCnt > 0U)
+    {
+      /* C = A + B; NOTE(review): both loads below are unpredicated full-vector reads - confirm the over-read is acceptable */
+      mve_pred16_t p0 = vctp32q(blkCnt);
+      vec1 = vld1q(pSrcA);
+      vec2 = vld1q(pSrcB);
+      vstrwq_p(pDst, vaddq(vec1,vec2), p0);        /* Predicated store: only the lanes enabled by p0 are written */
+    }
+
+}
+
+#else
+void arm_add_f32(
+  const float32_t * pSrcA,
+  const float32_t * pSrcB,
+        float32_t * pDst,
+        uint32_t blockSize)
+{
+        uint32_t blkCnt;                               /* Loop counter, reused by the vector/unrolled pass and the per-sample pass */
+
+#if defined(ARM_MATH_NEON) && !defined(ARM_MATH_AUTOVECTORIZE)
+    f32x4_t vec1;
+    f32x4_t vec2;
+    f32x4_t res;
+
+    /* Number of complete 4-sample vectors */
+    blkCnt = blockSize >> 2U;
+
+    while (blkCnt > 0U)
+    {
+        /* C = A + B */
+
+    	/* Load four samples from each input, add lane-wise, store four results. */
+        vec1 = vld1q_f32(pSrcA);
+        vec2 = vld1q_f32(pSrcB);
+        res = vaddq_f32(vec1, vec2);
+        vst1q_f32(pDst, res);
+
+        /* Advance all three pointers by one vector (4 samples) */
+        pSrcA += 4;
+        pSrcB += 4;
+        pDst += 4;
+
+        /* Decrement the loop counter */
+        blkCnt--;
+    }
+
+    /* Tail: the 0..3 leftover samples are handled by the per-sample loop below */
+    blkCnt = blockSize & 0x3;
+
+#else
+#if defined (ARM_MATH_LOOPUNROLL) && !defined(ARM_MATH_AUTOVECTORIZE)
+
+  /* Loop unrolling: number of groups of 4 samples */
+  blkCnt = blockSize >> 2U;
+
+  while (blkCnt > 0U)
+  {
+    /* C = A + B */
+
+    /* Four independent sample additions per iteration. */
+    *pDst++ = (*pSrcA++) + (*pSrcB++);
+    *pDst++ = (*pSrcA++) + (*pSrcB++);
+    *pDst++ = (*pSrcA++) + (*pSrcB++);
+    *pDst++ = (*pSrcA++) + (*pSrcB++);
+
+    /* One group of four done: decrement loop counter */
+    blkCnt--;
+  }
+
+  /* Loop unrolling: the 0..3 remaining samples go through the loop below */
+  blkCnt = blockSize % 0x4U;
+
+#else
+
+  /* No vector or unrolled pass: every sample goes through the loop below */
+  blkCnt = blockSize;
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) && !defined(ARM_MATH_AUTOVECTORIZE) */
+#endif /* #if defined(ARM_MATH_NEON) && !defined(ARM_MATH_AUTOVECTORIZE) */
+
+  while (blkCnt > 0U)
+  {
+    /* C = A + B */
+
+    /* Add one sample pair and store the sum. */
+    *pDst++ = (*pSrcA++) + (*pSrcB++);
+
+    /* Decrement loop counter */
+    blkCnt--;
+  }
+
+}
+#endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
+
+/**
+  @} end of BasicAdd group
+ */
--- a/libraries/cmsis/dsp/Source/BasicMathFunctions/arm_add_q15.c
+++ b/libraries/cmsis/dsp/Source/BasicMathFunctions/arm_add_q15.c
@@ -0,0 +1,176 @@
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS DSP Library
+ * Title:        arm_add_q15.c
+ * Description:  Q15 vector addition
+ *
+ * $Date:        18. March 2019
+ * $Revision:    V1.6.0
+ *
+ * Target Processor: Cortex-M cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "arm_math.h"
+
+/**
+  @ingroup groupMath
+ */
+
+/**
+  @addtogroup BasicAdd
+  @{
+ */
+
+/**
+  @brief         Q15 vector addition.
+  @param[in]     pSrcA      points to the first input vector
+  @param[in]     pSrcB      points to the second input vector
+  @param[out]    pDst       points to the output vector
+  @param[in]     blockSize  number of samples in each vector
+  @return        none
+
+  @par           Scaling and Overflow Behavior
+                   The function uses saturating arithmetic.
+                   Results outside of the allowable Q15 range [0x8000 0x7FFF] are saturated.
+ */
+
+#if defined(ARM_MATH_MVEI)
+
+#include "arm_helium_utils.h"
+
+void arm_add_q15(
+    const q15_t * pSrcA,
+    const q15_t * pSrcB,
+    q15_t * pDst,
+    uint32_t blockSize)
+{
+    uint32_t  blkCnt;           /* Loop counter: whole vectors first, then the leftover sample count */
+    q15x8_t vecA;               /* Eight Q15 samples from pSrcA */
+    q15x8_t vecB;               /* Eight Q15 samples from pSrcB */
+
+    /* Number of complete 8-sample vectors */
+    blkCnt = blockSize >> 3;
+    while (blkCnt > 0U)
+    {
+        /*
+         * C = A + B
+         * Load eight samples from each input, add with saturation, store eight results.
+         */
+        vecA = vld1q(pSrcA);
+        vecB = vld1q(pSrcB);
+        vst1q(pDst, vqaddq(vecA, vecB));
+        /*
+         * One vector processed: decrement the loop counter
+         */
+        blkCnt--;
+        /*
+         * Advance source and destination pointers by one vector (8 samples)
+         */
+        pSrcA  += 8;
+        pSrcB  += 8;
+        pDst   += 8;
+    }
+    /*
+     * Tail: the 0..7 samples that do not fill a whole vector
+     */
+    blkCnt = blockSize & 7;
+    if (blkCnt > 0U)
+    {
+        mve_pred16_t p0 = vctp16q(blkCnt);          /* Tail predicate derived from the leftover count */
+        vecA = vld1q(pSrcA);                        /* NOTE(review): unpredicated full-vector load although fewer samples remain - confirm the over-read is acceptable */
+        vecB = vld1q(pSrcB);                        /* NOTE(review): same unpredicated load on the second input */
+        vstrhq_p(pDst, vqaddq(vecA, vecB), p0);     /* Predicated store: only the lanes enabled by p0 are written */
+    }
+}
+
+#else
+void arm_add_q15(
+  const q15_t * pSrcA,
+  const q15_t * pSrcB,
+        q15_t * pDst,
+        uint32_t blockSize)
+{
+        uint32_t blkCnt;                               /* Loop counter, reused by the unrolled pass and the per-sample pass */
+
+#if defined (ARM_MATH_LOOPUNROLL)
+
+#if defined (ARM_MATH_DSP)
+  q31_t inA1, inA2;                                    /* Each holds two packed Q15 samples from pSrcA */
+  q31_t inB1, inB2;                                    /* Each holds two packed Q15 samples from pSrcB */
+#endif
+
+  /* Loop unrolling: number of groups of 4 samples */
+  blkCnt = blockSize >> 2U;
+
+  while (blkCnt > 0U)
+  {
+    /* C = A + B */
+
+#if defined (ARM_MATH_DSP)
+    /* Two packed reads (2 samples each) from source A; the helper is handed the pointer's address */
+    inA1 = read_q15x2_ia ((q15_t **) &pSrcA);
+    inA2 = read_q15x2_ia ((q15_t **) &pSrcA);
+    /* Two packed reads (2 samples each) from source B */
+    inB1 = read_q15x2_ia ((q15_t **) &pSrcB);
+    inB2 = read_q15x2_ia ((q15_t **) &pSrcB);
+
+    /* Add each packed pair with __QADD16 and write 2 samples per call */
+    write_q15x2_ia (&pDst, __QADD16(inA1, inB1));
+    write_q15x2_ia (&pDst, __QADD16(inA2, inB2));
+#else
+    *pDst++ = (q15_t) __SSAT(((q31_t) *pSrcA++ + *pSrcB++), 16);
+    *pDst++ = (q15_t) __SSAT(((q31_t) *pSrcA++ + *pSrcB++), 16);
+    *pDst++ = (q15_t) __SSAT(((q31_t) *pSrcA++ + *pSrcB++), 16);
+    *pDst++ = (q15_t) __SSAT(((q31_t) *pSrcA++ + *pSrcB++), 16);
+#endif
+
+    /* One group of four done: decrement loop counter */
+    blkCnt--;
+  }
+
+  /* Loop unrolling: the 0..3 remaining samples go through the loop below */
+  blkCnt = blockSize % 0x4U;
+
+#else
+
+  /* No unrolled pass: every sample goes through the loop below */
+  blkCnt = blockSize;
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
+  while (blkCnt > 0U)
+  {
+    /* C = A + B */
+
+    /* Add one sample pair; the sum is formed in a wider type and limited to 16 bits before the store. */
+#if defined (ARM_MATH_DSP)
+    *pDst++ = (q15_t) __QADD16(*pSrcA++, *pSrcB++);    /* Only the low 16 bits of the result survive the cast */
+#else
+    *pDst++ = (q15_t) __SSAT(((q31_t) *pSrcA++ + *pSrcB++), 16);
+#endif
+
+    /* Decrement loop counter */
+    blkCnt--;
+  }
+
+}
+#endif /* defined(ARM_MATH_MVEI) */
+/**
+  @} end of BasicAdd group
+ */
--- a/libraries/cmsis/dsp/Source/BasicMathFunctions/arm_add_q31.c
+++ b/libraries/cmsis/dsp/Source/BasicMathFunctions/arm_add_q31.c
@@ -0,0 +1,159 @@
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS DSP Library
+ * Title:        arm_add_q31.c
+ * Description:  Q31 vector addition
+ *
+ * $Date:        18. March 2019
+ * $Revision:    V1.6.0
+ *
+ * Target Processor: Cortex-M cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "arm_math.h"
+
+/**
+  @ingroup groupMath
+ */
+
+/**
+  @addtogroup BasicAdd
+  @{
+ */
+
+/**
+  @brief         Q31 vector addition.
+  @param[in]     pSrcA      points to the first input vector
+  @param[in]     pSrcB      points to the second input vector
+  @param[out]    pDst       points to the output vector
+  @param[in]     blockSize  number of samples in each vector
+  @return        none
+
+  @par           Scaling and Overflow Behavior
+                   The function uses saturating arithmetic.
+                   Results outside of the allowable Q31 range [0x80000000 0x7FFFFFFF] are saturated.
+ */
+
+#if defined(ARM_MATH_MVEI)
+
+#include "arm_helium_utils.h"
+
+void arm_add_q31(
+  const q31_t * pSrcA,
+  const q31_t * pSrcB,
+        q31_t * pDst,
+        uint32_t blockSize)
+{
+    uint32_t blkCnt;            /* Loop counter: whole vectors first, then the leftover sample count */
+    q31x4_t vecA;               /* Four Q31 samples from pSrcA */
+    q31x4_t vecB;               /* Four Q31 samples from pSrcB */
+
+    /* Number of complete 4-sample vectors */
+    blkCnt = blockSize >> 2;
+    while (blkCnt > 0U)
+    {
+        /*
+         * C = A + B
+         * Load four samples from each input, add with saturation, store four results.
+         */
+        vecA = vld1q(pSrcA);
+        vecB = vld1q(pSrcB);
+        vst1q(pDst, vqaddq(vecA, vecB));
+        /*
+         * One vector processed: decrement the loop counter
+         */
+        blkCnt--;
+        /*
+         * Advance source and destination pointers by one vector (4 samples)
+         */
+        pSrcA  += 4;
+        pSrcB  += 4;
+        pDst   += 4;
+    }
+    /*
+     * Tail: the 0..3 samples that do not fill a whole vector
+     */
+    blkCnt = blockSize & 3;
+    if (blkCnt > 0U)
+    {
+        mve_pred16_t p0 = vctp32q(blkCnt);          /* Tail predicate derived from the leftover count */
+        vecA = vld1q(pSrcA);                        /* NOTE(review): unpredicated full-vector load although fewer samples remain - confirm the over-read is acceptable */
+        vecB = vld1q(pSrcB);                        /* NOTE(review): same unpredicated load on the second input */
+        vstrwq_p(pDst, vqaddq(vecA, vecB), p0);     /* Predicated store: only the lanes enabled by p0 are written */
+    }
+}
+
+#else
+void arm_add_q31(
+  const q31_t * pSrcA,
+  const q31_t * pSrcB,
+        q31_t * pDst,
+        uint32_t blockSize)
+{
+        uint32_t blkCnt;                               /* Loop counter, reused by the unrolled pass and the per-sample pass */
+
+#if defined (ARM_MATH_LOOPUNROLL)
+
+  /* Loop unrolling: number of groups of 4 samples */
+  blkCnt = blockSize >> 2U;
+
+  while (blkCnt > 0U)
+  {
+    /* C = A + B */
+
+    /* Four independent __QADD sample additions per iteration. */
+    *pDst++ = __QADD(*pSrcA++, *pSrcB++);
+
+    *pDst++ = __QADD(*pSrcA++, *pSrcB++);
+
+    *pDst++ = __QADD(*pSrcA++, *pSrcB++);
+
+    *pDst++ = __QADD(*pSrcA++, *pSrcB++);
+
+    /* One group of four done: decrement loop counter */
+    blkCnt--;
+  }
+
+  /* Loop unrolling: the 0..3 remaining samples go through the loop below */
+  blkCnt = blockSize % 0x4U;
+
+#else
+
+  /* No unrolled pass: every sample goes through the loop below */
+  blkCnt = blockSize;
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
+  while (blkCnt > 0U)
+  {
+    /* C = A + B */
+
+    /* Add one sample pair with __QADD and store the result. */
+    *pDst++ = __QADD(*pSrcA++, *pSrcB++);
+
+    /* Decrement loop counter */
+    blkCnt--;
+  }
+
+}
+
+#endif /* defined(ARM_MATH_MVEI) */
+/**
+  @} end of BasicAdd group
+ */
--- a/libraries/cmsis/dsp/Source/BasicMathFunctions/arm_add_q7.c
+++ b/libraries/cmsis/dsp/Source/BasicMathFunctions/arm_add_q7.c
@@ -0,0 +1,158 @@
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS DSP Library
+ * Title:        arm_add_q7.c
+ * Description:  Q7 vector addition
+ *
+ * $Date:        18. March 2019
+ * $Revision:    V1.6.0
+ *
+ * Target Processor: Cortex-M cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "arm_math.h"
+
+/**
+  @ingroup groupMath
+ */
+
+/**
+  @addtogroup BasicAdd
+  @{
+ */
+
+/**
+  @brief         Q7 vector addition.
+  @param[in]     pSrcA      points to the first input vector
+  @param[in]     pSrcB      points to the second input vector
+  @param[out]    pDst       points to the output vector
+  @param[in]     blockSize  number of samples in each vector
+  @return        none
+
+  @par           Scaling and Overflow Behavior
+                   The function uses saturating arithmetic.
+                   Results outside of the allowable Q7 range [0x80 0x7F] are saturated.
+ */
+
+#if defined(ARM_MATH_MVEI)
+
+#include "arm_helium_utils.h"
+
+void arm_add_q7(
+    const q7_t * pSrcA,
+    const q7_t * pSrcB,
+    q7_t * pDst,
+    uint32_t blockSize)
+{
+    uint32_t  blkCnt;           /* Loop counter: whole vectors first, then the leftover sample count */
+    q7x16_t vecA;               /* Sixteen Q7 samples from pSrcA */
+    q7x16_t vecB;               /* Sixteen Q7 samples from pSrcB */
+
+    /* Number of complete 16-sample vectors */
+    blkCnt = blockSize >> 4;
+    while (blkCnt > 0U)
+    {
+        /*
+         * C = A + B
+         * Load sixteen samples from each input, add with saturation, store sixteen results.
+         */
+        vecA = vld1q(pSrcA);
+        vecB = vld1q(pSrcB);
+        vst1q(pDst, vqaddq(vecA, vecB));
+        /*
+         * One vector processed: decrement the loop counter
+         */
+        blkCnt--;
+        /*
+         * Advance source and destination pointers by one vector (16 samples)
+         */
+        pSrcA  += 16;
+        pSrcB  += 16;
+        pDst   += 16;
+    }
+    /*
+     * Tail: the 0..15 samples that do not fill a whole vector
+     */
+    blkCnt = blockSize & 0xF;
+    if (blkCnt > 0U)
+    {
+        mve_pred16_t p0 = vctp8q(blkCnt);           /* Tail predicate derived from the leftover count */
+        vecA = vld1q(pSrcA);                        /* NOTE(review): unpredicated full-vector load although fewer samples remain - confirm the over-read is acceptable */
+        vecB = vld1q(pSrcB);                        /* NOTE(review): same unpredicated load on the second input */
+        vstrbq_p(pDst, vqaddq(vecA, vecB), p0);     /* Predicated store: only the lanes enabled by p0 are written */
+    }
+}
+#else
+void arm_add_q7(
+  const q7_t * pSrcA,
+  const q7_t * pSrcB,
+        q7_t * pDst,
+        uint32_t blockSize)
+{
+        uint32_t blkCnt;                               /* Loop counter, reused by the unrolled pass and the per-sample pass */
+
+#if defined (ARM_MATH_LOOPUNROLL)
+
+  /* Loop unrolling: number of groups of 4 samples */
+  blkCnt = blockSize >> 2U;
+
+  while (blkCnt > 0U)
+  {
+    /* C = A + B */
+
+#if defined (ARM_MATH_DSP)
+    /* Read 4 packed samples from each input, add them with __QADD8, write 4 packed results. */
+    write_q7x4_ia (&pDst, __QADD8 (read_q7x4_ia ((q7_t **) &pSrcA), read_q7x4_ia ((q7_t **) &pSrcB)));
+#else
+    *pDst++ = (q7_t) __SSAT ((q15_t) *pSrcA++ + *pSrcB++, 8);
+    *pDst++ = (q7_t) __SSAT ((q15_t) *pSrcA++ + *pSrcB++, 8);
+    *pDst++ = (q7_t) __SSAT ((q15_t) *pSrcA++ + *pSrcB++, 8);
+    *pDst++ = (q7_t) __SSAT ((q15_t) *pSrcA++ + *pSrcB++, 8);
+#endif
+
+    /* One group of four done: decrement loop counter */
+    blkCnt--;
+  }
+
+  /* Loop unrolling: the 0..3 remaining samples go through the loop below */
+  blkCnt = blockSize % 0x4U;
+
+#else
+
+  /* No unrolled pass: every sample goes through the loop below */
+  blkCnt = blockSize;
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
+  while (blkCnt > 0U)
+  {
+    /* C = A + B */
+
+    /* Form the sum in a wider type, limit it to 8 bits with __SSAT, then store. */
+    *pDst++ = (q7_t) __SSAT((q15_t) *pSrcA++ + *pSrcB++, 8);
+
+    /* Decrement loop counter */
+    blkCnt--;
+  }
+
+}
+#endif /* defined(ARM_MATH_MVEI) */
+/**
+  @} end of BasicAdd group
+ */
--- a/libraries/cmsis/dsp/Source/BasicMathFunctions/arm_and_u16.c
+++ b/libraries/cmsis/dsp/Source/BasicMathFunctions/arm_and_u16.c
@@ -0,0 +1,137 @@
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS DSP Library
+ * Title:        arm_and_u16.c
+ * Description:  uint16_t bitwise AND
+ *
+ * $Date:        14 November 2019
+ * $Revision:    V1.6.0
+ *
+ * Target Processor: Cortex-M cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "arm_math.h"
+
+/**
+  @ingroup groupMath
+ */
+
+/**
+  @defgroup And Vector bitwise AND
+
+  Compute the logical bitwise AND.
+
+  There are separate functions for uint32_t, uint16_t, and uint8_t data types.
+ */
+
+/**
+  @addtogroup And
+  @{
+ */
+
+/**
+  @brief         Compute the logical bitwise AND of two fixed-point vectors.
+  @param[in]     pSrcA      points to input vector A
+  @param[in]     pSrcB      points to input vector B
+  @param[out]    pDst       points to output vector
+  @param[in]     blockSize  number of samples in each vector
+  @return        none
+ */
+
+void arm_and_u16(
+    const uint16_t * pSrcA,
+    const uint16_t * pSrcB,
+          uint16_t * pDst,
+          uint32_t blockSize)
+{
+    uint32_t blkCnt;      /* Loop counter: whole vectors first, then the leftover sample count */
+
+#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
+    q15x8_t vecSrcA, vecSrcB;   /* NOTE(review): q15 vector type used for data loaded from uint16_t pointers and passed to vandq_u16; presumably uint16x8_t was intended - confirm */
+
+    /* Number of complete 8-sample vectors */
+    blkCnt = blockSize >> 3;
+
+    while (blkCnt > 0U)
+    {
+        vecSrcA = vld1q(pSrcA);
+        vecSrcB = vld1q(pSrcB);
+
+        vst1q(pDst, vandq_u16(vecSrcA, vecSrcB) );
+
+        pSrcA += 8;
+        pSrcB += 8;
+        pDst  += 8;
+
+        /* Decrement the loop counter */
+        blkCnt--;
+    }
+
+    /* Tail: the 0..7 samples that do not fill a whole vector */
+    blkCnt = blockSize & 7;
+
+    if (blkCnt > 0U)
+    {
+        mve_pred16_t p0 = vctp16q(blkCnt);                  /* Tail predicate derived from the leftover count */
+        vecSrcA = vld1q(pSrcA);                             /* NOTE(review): unpredicated full-vector load although fewer samples remain - confirm the over-read is acceptable */
+        vecSrcB = vld1q(pSrcB);                             /* NOTE(review): same unpredicated load on the second input */
+        vstrhq_p(pDst, vandq_u16(vecSrcA, vecSrcB), p0);    /* Predicated store: only the lanes enabled by p0 are written */
+    }
+#else
+#if defined(ARM_MATH_NEON) && !defined(ARM_MATH_AUTOVECTORIZE)
+    uint16x8_t vecA, vecB;
+
+    /* Number of complete 8-sample vectors */
+    blkCnt = blockSize >> 3U;
+
+    while (blkCnt > 0U)
+    {
+        vecA = vld1q_u16(pSrcA);
+        vecB = vld1q_u16(pSrcB);
+
+        vst1q_u16(pDst, vandq_u16(vecA, vecB) );
+
+        pSrcA += 8;
+        pSrcB += 8;
+        pDst  += 8;
+
+        /* Decrement the loop counter */
+        blkCnt--;
+    }
+
+    /* Tail: the 0..7 leftover samples are handled by the per-sample loop below */
+    blkCnt = blockSize & 7;
+#else
+    /* No vector pass: every sample goes through the loop below */
+    blkCnt = blockSize;
+#endif /* #if defined(ARM_MATH_NEON) && !defined(ARM_MATH_AUTOVECTORIZE) */
+
+    while (blkCnt > 0U)
+    {
+        *pDst++ = (*pSrcA++)&(*pSrcB++);    /* Bitwise AND of one sample pair */
+
+        /* Decrement the loop counter */
+        blkCnt--;
+    }
+#endif /* #if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE) */
+}
+
+/**
+  @} end of And group
+ */
--- a/libraries/cmsis/dsp/Source/BasicMathFunctions/arm_and_u32.c
+++ b/libraries/cmsis/dsp/Source/BasicMathFunctions/arm_and_u32.c
@@ -0,0 +1,129 @@
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS DSP Library
+ * Title:        arm_and_u32.c
+ * Description:  uint32_t bitwise AND
+ *
+ * $Date:        14 November 2019
+ * $Revision:    V1.6.0
+ *
+ * Target Processor: Cortex-M cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "arm_math.h"
+
+/**
+  @ingroup groupMath
+ */
+
+/**
+  @addtogroup And
+  @{
+ */
+
+/**
+  @brief         Compute the logical bitwise AND of two fixed-point vectors.
+  @param[in]     pSrcA      points to input vector A
+  @param[in]     pSrcB      points to input vector B
+  @param[out]    pDst       points to output vector
+  @param[in]     blockSize  number of samples in each vector
+  @return        none
+ */
+
+void arm_and_u32(
+    const uint32_t * pSrcA,
+    const uint32_t * pSrcB,
+          uint32_t * pDst,
+          uint32_t blockSize)
+{
+    uint32_t blkCnt;      /* Vectors (SIMD loops) or samples (scalar loop) left */
+
+#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
+    uint32x4_t vecSrcA, vecSrcB;   /* unsigned lanes, as vandq_u32 expects */
+
+    /* Full vectors: 4 outputs per iteration */
+    blkCnt = blockSize >> 2;
+
+    while (blkCnt > 0U)
+    {
+        vecSrcA = vld1q(pSrcA);
+        vecSrcB = vld1q(pSrcB);
+
+        vst1q(pDst, vandq_u32(vecSrcA, vecSrcB) );
+
+        pSrcA += 4;
+        pSrcB += 4;
+        pDst  += 4;
+
+        /* Decrement the loop counter */
+        blkCnt--;
+    }
+
+    /* Tail: 1..3 samples. The loads are predicated like the store, so no */
+    /* element beyond blockSize is read from pSrcA or pSrcB.              */
+    blkCnt = blockSize & 3;
+    if (blkCnt > 0U)
+    {
+        mve_pred16_t p0 = vctp32q(blkCnt);
+        vecSrcA = vld1q_z(pSrcA, p0);
+        vecSrcB = vld1q_z(pSrcB, p0);
+        vstrwq_p(pDst, vandq_u32(vecSrcA, vecSrcB), p0);
+    }
+#else
+#if defined(ARM_MATH_NEON) && !defined(ARM_MATH_AUTOVECTORIZE)
+    uint32x4_t vecA, vecB;
+
+    /* Full vectors: 4 outputs per iteration */
+    blkCnt = blockSize >> 2U;
+
+    while (blkCnt > 0U)
+    {
+        vecA = vld1q_u32(pSrcA);
+        vecB = vld1q_u32(pSrcB);
+
+        vst1q_u32(pDst, vandq_u32(vecA, vecB) );
+
+        pSrcA += 4;
+        pSrcB += 4;
+        pDst  += 4;
+
+        /* Decrement the loop counter */
+        blkCnt--;
+    }
+
+    /* Tail: the 0..3 leftover samples go through the scalar loop below */
+    blkCnt = blockSize & 3;
+#else
+    /* No SIMD: the scalar loop below handles every sample */
+    blkCnt = blockSize;
+#endif
+
+    while (blkCnt > 0U)
+    {
+        *pDst++ = (*pSrcA++)&(*pSrcB++);
+
+        /* Decrement the loop counter */
+        blkCnt--;
+    }
+#endif /* if defined(ARM_MATH_MVEI) */
+}
+
+/**
+  @} end of And group
+ */
--- a/libraries/cmsis/dsp/Source/BasicMathFunctions/arm_and_u8.c
+++ b/libraries/cmsis/dsp/Source/BasicMathFunctions/arm_and_u8.c
@@ -0,0 +1,130 @@
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS DSP Library
+ * Title:        arm_and_u8.c
+ * Description:  uint8_t bitwise AND
+ *
+ * $Date:        14 November 2019
+ * $Revision:    V1.6.0
+ *
+ * Target Processor: Cortex-M cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "arm_math.h"
+
+/**
+  @ingroup groupMath
+ */
+
+
+/**
+  @addtogroup And
+  @{
+ */
+
+/**
+  @brief         Compute the logical bitwise AND of two fixed-point vectors.
+  @param[in]     pSrcA      points to input vector A
+  @param[in]     pSrcB      points to input vector B
+  @param[out]    pDst       points to output vector
+  @param[in]     blockSize  number of samples in each vector
+  @return        none
+ */
+
+void arm_and_u8(
+    const uint8_t * pSrcA,
+    const uint8_t * pSrcB,
+          uint8_t * pDst,
+          uint32_t blockSize)
+{
+    uint32_t blkCnt;      /* Vectors (SIMD loops) or samples (scalar loop) left */
+
+#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
+    uint8x16_t vecSrcA, vecSrcB;   /* unsigned lanes, as vandq_u8 expects */
+
+    /* Full vectors: 16 outputs per iteration */
+    blkCnt = blockSize >> 4;
+
+    while (blkCnt > 0U)
+    {
+        vecSrcA = vld1q(pSrcA);
+        vecSrcB = vld1q(pSrcB);
+
+        vst1q(pDst, vandq_u8(vecSrcA, vecSrcB) );
+
+        pSrcA += 16;
+        pSrcB += 16;
+        pDst  += 16;
+
+        /* Decrement the loop counter */
+        blkCnt--;
+    }
+
+    /* Tail: 1..15 samples. The loads are predicated like the store, so no */
+    /* element beyond blockSize is read from pSrcA or pSrcB.               */
+    blkCnt = blockSize & 0xF;
+    if (blkCnt > 0U)
+    {
+        mve_pred16_t p0 = vctp8q(blkCnt);
+        vecSrcA = vld1q_z(pSrcA, p0);
+        vecSrcB = vld1q_z(pSrcB, p0);
+        vstrbq_p(pDst, vandq_u8(vecSrcA, vecSrcB), p0);
+    }
+#else
+#if defined(ARM_MATH_NEON) && !defined(ARM_MATH_AUTOVECTORIZE)
+    uint8x16_t vecA, vecB;
+
+    /* Full vectors: 16 outputs per iteration */
+    blkCnt = blockSize >> 4U;
+
+    while (blkCnt > 0U)
+    {
+        vecA = vld1q_u8(pSrcA);
+        vecB = vld1q_u8(pSrcB);
+
+        vst1q_u8(pDst, vandq_u8(vecA, vecB) );
+
+        pSrcA += 16;
+        pSrcB += 16;
+        pDst  += 16;
+
+        /* Decrement the loop counter */
+        blkCnt--;
+    }
+
+    /* Tail: the 0..15 leftover samples go through the scalar loop below */
+    blkCnt = blockSize & 0xF;
+#else
+    /* No SIMD: the scalar loop below handles every sample */
+    blkCnt = blockSize;
+#endif
+
+    while (blkCnt > 0U)
+    {
+        *pDst++ = (*pSrcA++)&(*pSrcB++);
+
+        /* Decrement the loop counter */
+        blkCnt--;
+    }
+#endif /* if defined(ARM_MATH_MVEI) */
+}
+
+/**
+  @} end of And group
+ */
--- a/libraries/cmsis/dsp/Source/BasicMathFunctions/arm_dot_prod_f32.c
+++ b/libraries/cmsis/dsp/Source/BasicMathFunctions/arm_dot_prod_f32.c
@@ -0,0 +1,226 @@
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS DSP Library
+ * Title:        arm_dot_prod_f32.c
+ * Description:  Floating-point dot product
+ *
+ * $Date:        18. March 2019
+ * $Revision:    V1.6.0
+ *
+ * Target Processor: Cortex-M cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "arm_math.h"
+
+/**
+  @ingroup groupMath
+ */
+
+/**
+  @defgroup BasicDotProd Vector Dot Product
+
+  Computes the dot product of two vectors.
+  The vectors are multiplied element-by-element and then summed.
+
+  <pre>
+      sum = pSrcA[0]*pSrcB[0] + pSrcA[1]*pSrcB[1] + ... + pSrcA[blockSize-1]*pSrcB[blockSize-1]
+  </pre>
+
+  There are separate functions for floating-point, Q7, Q15, and Q31 data types.
+ */
+
+/**
+  @addtogroup BasicDotProd
+  @{
+ */
+
+/**
+  @brief         Dot product of floating-point vectors.
+  @param[in]     pSrcA      points to the first input vector.
+  @param[in]     pSrcB      points to the second input vector.
+  @param[in]     blockSize  number of samples in each vector.
+  @param[out]    result     output result returned here.
+  @return        none
+ */
+
+#if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE)
+
+#include "arm_helium_utils.h"
+
+
+void arm_dot_prod_f32(
+    const float32_t * pSrcA,
+    const float32_t * pSrcB,
+    uint32_t    blockSize,
+    float32_t * result)
+{
+    f32x4_t vecA, vecB;
+    f32x4_t vecSum;                 /* four per-lane partial sums */
+    uint32_t blkCnt;
+    float32_t sum = 0.0f;
+    vecSum = vdupq_n_f32(0.0f);
+
+    /* Full vectors: 4 products accumulated per iteration */
+    blkCnt = blockSize >> 2U;
+    while (blkCnt > 0U)
+    {
+        /*
+         * sum = A[0]*B[0] + A[1]*B[1] + ... + A[blockSize-1]*B[blockSize-1]
+         * Multiply-accumulate the next 4 element pairs into the lanes of
+         * vecSum and advance both source pointers.
+         */
+        vecA = vld1q(pSrcA);
+        pSrcA += 4;
+
+        vecB = vld1q(pSrcB);
+        pSrcB += 4;
+
+        vecSum = vfmaq(vecSum, vecA, vecB);
+        /*
+         * Decrement the loop counter
+         */
+        blkCnt --;
+    }
+
+    /* Tail: 1..3 leftover element pairs */
+    blkCnt = blockSize & 3;
+    if (blkCnt > 0U)
+    {
+        /* Predicate the loads too, so no element beyond blockSize is read; */
+        /* vfmaq_m then updates only the active lanes of vecSum.            */
+        mve_pred16_t p0 = vctp32q(blkCnt);
+        vecA = vld1q_z(pSrcA, p0);
+        vecB = vld1q_z(pSrcB, p0);
+        vecSum = vfmaq_m(vecSum, vecA, vecB, p0);
+    }
+
+    sum = vecAddAcrossF32Mve(vecSum); /* helper from arm_helium_utils.h; presumably adds the 4 lanes */
+
+    /* Store result in destination buffer */
+    *result = sum;
+
+}
+
+#else
+
+void arm_dot_prod_f32(
+  const float32_t * pSrcA,
+  const float32_t * pSrcB,
+        uint32_t blockSize,
+        float32_t * result)
+{
+        uint32_t blkCnt;                               /* Loop counter */
+        float32_t sum = 0.0f;                          /* Running dot-product value */
+
+#if defined(ARM_MATH_NEON) && !defined(ARM_MATH_AUTOVECTORIZE)
+    f32x4_t vec1;
+    f32x4_t vec2;
+    f32x4_t accum = vdupq_n_f32(0);
+    f32x2_t tmp = vdup_n_f32(0);
+
+    /* Compute 4 outputs at a time */
+    blkCnt = blockSize >> 2U;
+
+    /* Each vector is loaded inside the loop, right before it is used.
+     * Pre-loading the next vector ahead of its iteration would always read
+     * past the end of both inputs (even for blockSize == 0). */
+    while (blkCnt > 0U)
+    {
+        /* C = A[0]*B[0] + A[1]*B[1] + A[2]*B[2] + ... + A[blockSize-1]*B[blockSize-1] */
+        /* Multiply 4 element pairs and accumulate them lane by lane. */
+        vec1 = vld1q_f32(pSrcA);
+        vec2 = vld1q_f32(pSrcB);
+
+        accum = vmlaq_f32(accum, vec1, vec2);
+
+        /* Increment pointers */
+        pSrcA += 4;
+        pSrcB += 4;
+
+        /* Decrement the loop counter */
+        blkCnt--;
+    }
+
+    /* Reduce the four lane sums of accum to the scalar sum */
+#if __aarch64__
+    sum = vpadds_f32(vpadd_f32(vget_low_f32(accum), vget_high_f32(accum)));
+#else
+    tmp = vpadd_f32(vget_low_f32(accum), vget_high_f32(accum));
+    sum = vget_lane_f32(tmp, 0) + vget_lane_f32(tmp, 1);
+
+#endif
+
+    /* Tail: the 0..3 leftover samples go through the scalar loop below */
+    blkCnt = blockSize & 0x3;
+
+#else
+#if defined (ARM_MATH_LOOPUNROLL) && !defined(ARM_MATH_AUTOVECTORIZE)
+
+  /* Loop unrolling: Compute 4 outputs at a time */
+  blkCnt = blockSize >> 2U;
+
+  /* First part of the processing with loop unrolling. Compute 4 outputs at a time.
+   ** a second loop below computes the remaining 1 to 3 samples. */
+  while (blkCnt > 0U)
+  {
+    /* C = A[0]* B[0] + A[1]* B[1] + A[2]* B[2] + .....+ A[blockSize-1]* B[blockSize-1] */
+
+    /* Multiply four element pairs and add each product to the running sum. */
+    sum += (*pSrcA++) * (*pSrcB++);
+
+    sum += (*pSrcA++) * (*pSrcB++);
+
+    sum += (*pSrcA++) * (*pSrcB++);
+
+    sum += (*pSrcA++) * (*pSrcB++);
+
+    /* Decrement loop counter */
+    blkCnt--;
+  }
+
+  /* Loop unrolling: Compute remaining outputs */
+  blkCnt = blockSize % 0x4U;
+
+#else
+
+  /* Initialize blkCnt with number of samples */
+  blkCnt = blockSize;
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+#endif /* #if defined(ARM_MATH_NEON) */
+
+  while (blkCnt > 0U)
+  {
+    /* C = A[0]* B[0] + A[1]* B[1] + A[2]* B[2] + .....+ A[blockSize-1]* B[blockSize-1] */
+
+    /* Multiply one element pair and add the product to the running sum. */
+    sum += (*pSrcA++) * (*pSrcB++);
+
+    /* Decrement loop counter */
+    blkCnt--;
+  }
+
+  /* Store result in destination buffer */
+  *result = sum;
+}
+
+#endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
+/**
+  @} end of BasicDotProd group
+ */
--- a/libraries/cmsis/dsp/Source/BasicMathFunctions/arm_dot_prod_q15.c
+++ b/libraries/cmsis/dsp/Source/BasicMathFunctions/arm_dot_prod_q15.c
@@ -0,0 +1,172 @@
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS DSP Library
+ * Title:        arm_dot_prod_q15.c
+ * Description:  Q15 dot product
+ *
+ * $Date:        18. March 2019
+ * $Revision:    V1.6.0
+ *
+ * Target Processor: Cortex-M cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "arm_math.h"
+
+/**
+  @ingroup groupMath
+ */
+
+/**
+  @addtogroup BasicDotProd
+  @{
+ */
+
+/**
+  @brief         Dot product of Q15 vectors.
+  @param[in]     pSrcA      points to the first input vector
+  @param[in]     pSrcB      points to the second input vector
+  @param[in]     blockSize  number of samples in each vector
+  @param[out]    result     output result returned here
+  @return        none
+
+  @par           Scaling and Overflow Behavior
+                   The intermediate multiplications are in 1.15 x 1.15 = 2.30 format and these
+                   results are added to a 64-bit accumulator in 34.30 format.
+                   Nonsaturating additions are used and given that there are 33 guard bits in the accumulator
+                   there is no risk of overflow.
+                   The return result is in 34.30 format.
+ */
+#if defined(ARM_MATH_MVEI)
+
+#include "arm_helium_utils.h"
+
+void arm_dot_prod_q15(
+    const q15_t * pSrcA,
+    const q15_t * pSrcB,
+    uint32_t blockSize,
+    q63_t * result)
+{
+    uint32_t  blkCnt;           /* loop counter */
+    q15x8_t vecA;
+    q15x8_t vecB;
+    q63_t     sum = 0LL;        /* 64-bit accumulator */
+
+    /* Full vectors: 8 products accumulated per iteration */
+    blkCnt = blockSize >> 3;
+    while (blkCnt > 0U)
+    {
+        /*
+         * sum += A[k]*B[k] for the next 8 element pairs; the products are
+         * added across the vector straight into the 64-bit accumulator.
+         */
+        vecA = vld1q(pSrcA);
+        vecB = vld1q(pSrcB);
+        sum = vmlaldavaq(sum, vecA, vecB);
+        /*
+         * Decrement the loop counter
+         */
+        blkCnt--;
+        /*
+         * advance both source pointers
+         */
+        pSrcA += 8;
+        pSrcB += 8;
+    }
+    /*
+     * tail: 1..7 pairs; loads are predicated so nothing past blockSize is read
+     */
+    blkCnt = blockSize & 7;
+    if (blkCnt > 0U)
+    {
+        mve_pred16_t p0 = vctp16q(blkCnt);
+        vecA = vld1q_z(pSrcA, p0);
+        vecB = vld1q_z(pSrcB, p0);
+        sum = vmlaldavaq_p(sum, vecA, vecB, p0);
+    }
+
+    *result = sum;
+}
+
+#else
+void arm_dot_prod_q15(
+  const q15_t * pSrcA,
+  const q15_t * pSrcB,
+        uint32_t blockSize,
+        q63_t * result)
+{
+  q63_t acc = 0;         /* 64-bit running sum of the products */
+  uint32_t remaining;    /* Passes left in the loop being run */
+
+#if defined (ARM_MATH_LOOPUNROLL)
+
+  /*
+   * Unrolled part: every pass consumes four samples from each input.
+   * The 0 to 3 samples that do not fill a pass are left for the
+   * sample-by-sample loop further down.
+   */
+  for (remaining = blockSize >> 2U; remaining > 0U; remaining--)
+  {
+#if defined (ARM_MATH_DSP)
+    /* Two dual multiply-accumulate calls, each fed through read_q15x2_ia. */
+    acc = __SMLALD(read_q15x2_ia ((q15_t **) &pSrcA), read_q15x2_ia ((q15_t **) &pSrcB), acc);
+    acc = __SMLALD(read_q15x2_ia ((q15_t **) &pSrcA), read_q15x2_ia ((q15_t **) &pSrcB), acc);
+#else
+    /* Plain C: widen to q31_t before multiplying, then add as 64-bit. */
+    acc += (q63_t) ((q31_t) pSrcA[0] * pSrcB[0]);
+    acc += (q63_t) ((q31_t) pSrcA[1] * pSrcB[1]);
+    acc += (q63_t) ((q31_t) pSrcA[2] * pSrcB[2]);
+    acc += (q63_t) ((q31_t) pSrcA[3] * pSrcB[3]);
+
+    /* Step past the four samples just used. */
+    pSrcA += 4;
+    pSrcB += 4;
+#endif
+  }
+
+  /* Leftover count (0 to 3) for the sample-by-sample loop. */
+  remaining = blockSize % 0x4U;
+
+#else
+
+  /* No unrolling: the loop below walks the whole vector. */
+  remaining = blockSize;
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
+  /*
+   * Sample-by-sample loop:
+   * acc += A[k] * B[k] for each sample pair that is still pending.
+   */
+  for (; remaining > 0U; remaining--)
+  {
+    /* Widen both operands so the product is formed in q31_t. */
+    const q31_t a = *pSrcA++;
+    const q31_t b = *pSrcB++;
+
+    acc += (q63_t) (a * b);
+  }
+
+  /* Store result in destination buffer in 34.30 format */
+  *result = acc;
+}
+#endif /* defined(ARM_MATH_MVEI) */
+
+/**
+  @} end of BasicDotProd group
+ */
--- a/libraries/cmsis/dsp/Source/BasicMathFunctions/arm_dot_prod_q31.c
+++ b/libraries/cmsis/dsp/Source/BasicMathFunctions/arm_dot_prod_q31.c
@@ -0,0 +1,174 @@
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS DSP Library
+ * Title:        arm_dot_prod_q31.c
+ * Description:  Q31 dot product
+ *
+ * $Date:        18. March 2019
+ * $Revision:    V1.6.0
+ *
+ * Target Processor: Cortex-M cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "arm_math.h"
+
+/**
+  @ingroup groupMath
+ */
+
+/**
+  @addtogroup BasicDotProd
+  @{
+ */
+
+/**
+  @brief         Dot product of Q31 vectors.
+  @param[in]     pSrcA      points to the first input vector.
+  @param[in]     pSrcB      points to the second input vector.
+  @param[in]     blockSize  number of samples in each vector.
+  @param[out]    result     output result returned here.
+  @return        none
+
+  @par           Scaling and Overflow Behavior
+                   The intermediate multiplications are in 1.31 x 1.31 = 2.62 format and these
+                   are truncated to 2.48 format by discarding the lower 14 bits.
+                   The 2.48 result is then added without saturation to a 64-bit accumulator in 16.48 format.
+                   There are 15 guard bits in the accumulator and there is no risk of overflow as long as
+                   the length of the vectors is less than 2^16 elements.
+                   The return result is in 16.48 format.
+ */
+
+#if defined(ARM_MATH_MVEI)
+
+#include "arm_helium_utils.h"
+
+void arm_dot_prod_q31(
+    const q31_t * pSrcA,
+    const q31_t * pSrcB,
+    uint32_t blockSize,
+    q63_t * result)
+{
+    uint32_t  blkCnt;           /* loop counter */
+    q31x4_t vecA;
+    q31x4_t vecB;
+    q63_t     sum = 0LL;        /* 64-bit accumulator */
+
+    /* Full vectors: 4 products accumulated per iteration */
+    blkCnt = blockSize >> 2;
+    while (blkCnt > 0U)
+    {
+        /*
+         * sum += A[k]*B[k] for the next 4 element pairs, accumulated
+         * across the vector into the 64-bit accumulator.
+         */
+        vecA = vld1q(pSrcA);
+        vecB = vld1q(pSrcB);
+        sum = vrmlaldavhaq(sum, vecA, vecB);
+        /*
+         * Decrement the loop counter
+         */
+        blkCnt--;
+        /*
+         * advance both source pointers
+         */
+        pSrcA += 4;
+        pSrcB += 4;
+    }
+    /*
+     * tail: 1..3 pairs; loads are predicated so nothing past blockSize is read
+     */
+    blkCnt = blockSize & 3;
+    if (blkCnt > 0U)
+    {
+        mve_pred16_t p0 = vctp32q(blkCnt);
+        vecA = vld1q_z(pSrcA, p0);
+        vecB = vld1q_z(pSrcB, p0);
+        sum = vrmlaldavhaq_p(sum, vecA, vecB, p0);
+    }
+
+    /*
+     * vrmlaldavhaq provides extra intermediate accumulator headroom,
+     * limiting the need for intermediate scaling.
+     * The scalar variant uses a 2.48 format by right shifting products by 14;
+     * here the 16.48 output is produced once by shifting the accumulator by 6.
+     */
+    *result = asrl(sum, (14 - 8));
+}
+
+#else
+void arm_dot_prod_q31(
+  const q31_t * pSrcA,
+  const q31_t * pSrcB,
+        uint32_t blockSize,
+        q63_t * result)
+{
+  q63_t acc = 0;         /* 64-bit running sum of the shifted products */
+  uint32_t remaining;    /* Passes left in the loop being run */
+
+#if defined (ARM_MATH_LOOPUNROLL)
+
+  /*
+   * Unrolled part: every pass consumes four samples from each input.
+   * Each 64-bit product is shifted right by 14 before it is added, as
+   * described in the scaling note of the function header.
+   */
+  for (remaining = blockSize >> 2U; remaining > 0U; remaining--)
+  {
+    /* acc += (A[k] * B[k]) >> 14 for the four pairs of this pass */
+    acc += ((q63_t) pSrcA[0] * pSrcB[0]) >> 14U;
+    acc += ((q63_t) pSrcA[1] * pSrcB[1]) >> 14U;
+    acc += ((q63_t) pSrcA[2] * pSrcB[2]) >> 14U;
+    acc += ((q63_t) pSrcA[3] * pSrcB[3]) >> 14U;
+
+    /* Step past the four samples just used. */
+    pSrcA += 4;
+    pSrcB += 4;
+  }
+
+  /* Leftover count (0 to 3) for the sample-by-sample loop. */
+  remaining = blockSize % 0x4U;
+
+#else
+
+  /* No unrolling: the loop below walks the whole vector. */
+  remaining = blockSize;
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
+  /*
+   * Sample-by-sample loop:
+   * acc += (A[k] * B[k]) >> 14 for each sample pair still pending,
+   * exactly as in the unrolled part above.
+   */
+  for (; remaining > 0U; remaining--)
+  {
+    /* Form the product in 64 bits before shifting. */
+    const q63_t product = (q63_t) *pSrcA++ * *pSrcB++;
+
+    acc += product >> 14U;
+  }
+
+  /* Store result in destination buffer in 16.48 format */
+  *result = acc;
+}
+#endif /* defined(ARM_MATH_MVEI) */
+
+/**
+  @} end of BasicDotProd group
+ */
--- a/libraries/cmsis/dsp/Source/BasicMathFunctions/arm_dot_prod_q7.c
+++ b/libraries/cmsis/dsp/Source/BasicMathFunctions/arm_dot_prod_q7.c
@@ -0,0 +1,191 @@
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS DSP Library
+ * Title:        arm_dot_prod_q7.c
+ * Description:  Q7 dot product
+ *
+ * $Date:        18. March 2019
+ * $Revision:    V1.6.0
+ *
+ * Target Processor: Cortex-M cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "arm_math.h"
+
+/**
+  @ingroup groupMath
+ */
+
+/**
+  @addtogroup BasicDotProd
+  @{
+ */
+
+/**
+  @brief         Dot product of Q7 vectors.
+  @param[in]     pSrcA      points to the first input vector
+  @param[in]     pSrcB      points to the second input vector
+  @param[in]     blockSize  number of samples in each vector
+  @param[out]    result     output result returned here
+  @return        none
+
+  @par           Scaling and Overflow Behavior
+                   The intermediate multiplications are in 1.7 x 1.7 = 2.14 format and these
+                   results are added to an accumulator in 18.14 format.
+                   Nonsaturating additions are used and there is no danger of wrap around as long as
+                   the vectors are less than 2^18 elements long.
+                   The return result is in 18.14 format.
+ */
+
+#if defined(ARM_MATH_MVEI)
+
+#include "arm_helium_utils.h"
+
+void arm_dot_prod_q7(
+    const q7_t * pSrcA,
+    const q7_t * pSrcB,
+    uint32_t blockSize,
+    q31_t * result)
+{
+    uint32_t  blkCnt;           /* loop counter */
+    q7x16_t vecA;
+    q7x16_t vecB;
+    q31_t     sum = 0;          /* 32-bit accumulator */
+
+    /* Full vectors: 16 products accumulated per iteration */
+    blkCnt = blockSize >> 4;
+    while (blkCnt > 0U)
+    {
+        /*
+         * sum += A[k]*B[k] for the next 16 element pairs; the products are
+         * added across the vector straight into the 32-bit accumulator.
+         */
+        vecA = vld1q(pSrcA);
+        vecB = vld1q(pSrcB);
+        sum = vmladavaq(sum, vecA, vecB);
+        /*
+         * Decrement the loop counter
+         */
+        blkCnt--;
+        /*
+         * advance both source pointers
+         */
+        pSrcA += 16;
+        pSrcB += 16;
+    }
+    /*
+     * tail: 1..15 pairs; loads are predicated so nothing past blockSize is read
+     */
+    blkCnt = blockSize & 0xF;
+    if (blkCnt > 0U)
+    {
+        mve_pred16_t p0 = vctp8q(blkCnt);
+        vecA = vld1q_z(pSrcA, p0);
+        vecB = vld1q_z(pSrcB, p0);
+        sum = vmladavaq_p(sum, vecA, vecB, p0);
+    }
+
+    *result = sum;
+}
+#else
+void arm_dot_prod_q7(
+  const q7_t * pSrcA,
+  const q7_t * pSrcB,
+        uint32_t blockSize,
+        q31_t * result)
+{
+  q31_t acc = 0;         /* 32-bit running sum of the products */
+  uint32_t remaining;    /* Passes left in the loop being run */
+
+#if defined (ARM_MATH_LOOPUNROLL)
+
+  /*
+   * Unrolled part: every pass consumes four samples from each input.
+   * The 0 to 3 samples that do not fill a pass are left for the
+   * sample-by-sample loop further down.
+   * Products are added without saturation (see the function header).
+   */
+  for (remaining = blockSize >> 2U; remaining > 0U; remaining--)
+  {
+#if defined (ARM_MATH_DSP)
+    /* DSP path: samples are widened pairwise and fed to two __SMLAD calls. */
+    q31_t packedA, packedB;         /* Four q7_t samples read as one word */
+    q31_t pairA1, pairA2;           /* Samples of A, two per word */
+    q31_t pairB1, pairB2;           /* Samples of B, two per word */
+
+    /* Read 4 samples at a time from each source. */
+    packedA = read_q7x4_ia ((q7_t **) &pSrcA);
+    packedB = read_q7x4_ia ((q7_t **) &pSrcB);
+
+    /* Extract two q7_t samples of A to q15_t, then the remaining two. */
+    pairA1 = __SXTB16(__ROR(packedA, 8));
+    pairA2 = __SXTB16(packedA);
+
+    /* Same split for the four samples of B. */
+    pairB1 = __SXTB16(__ROR(packedB, 8));
+    pairB2 = __SXTB16(packedB);
+
+    /* Multiply and accumulate two samples at a time. */
+    acc = __SMLAD(pairA1, pairB1, acc);
+    acc = __SMLAD(pairA2, pairB2, acc);
+#else
+    /* Plain C: widen to q15_t before multiplying. */
+    acc += (q31_t) ((q15_t) pSrcA[0] * pSrcB[0]);
+    acc += (q31_t) ((q15_t) pSrcA[1] * pSrcB[1]);
+    acc += (q31_t) ((q15_t) pSrcA[2] * pSrcB[2]);
+    acc += (q31_t) ((q15_t) pSrcA[3] * pSrcB[3]);
+
+    /* Step past the four samples just used. */
+    pSrcA += 4;
+    pSrcB += 4;
+#endif
+  }
+
+  /* Leftover count (0 to 3) for the sample-by-sample loop. */
+  remaining = blockSize % 0x4U;
+
+#else
+
+  /* No unrolling: the loop below walks the whole vector. */
+  remaining = blockSize;
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
+  /*
+   * Sample-by-sample loop:
+   * acc += A[k] * B[k] for each sample pair that is still pending,
+   * using plain C arithmetic only.
+   */
+  for (; remaining > 0U; remaining--)
+  {
+    /* Widen both operands to q15_t before they are multiplied. */
+    const q15_t a = *pSrcA++;
+    const q15_t b = *pSrcB++;
+
+    acc += (q31_t) (a * b);
+  }
+
+  /* Store result in destination buffer in 18.14 format */
+  *result = acc;
+}
+#endif /* defined(ARM_MATH_MVEI) */
+
+/**
+  @} end of BasicDotProd group
+ */
--- a/libraries/cmsis/dsp/Source/BasicMathFunctions/arm_mult_f32.c
+++ b/libraries/cmsis/dsp/Source/BasicMathFunctions/arm_mult_f32.c
@@ -0,0 +1,200 @@
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS DSP Library
+ * Title:        arm_mult_f32.c
+ * Description:  Floating-point vector multiplication
+ *
+ * $Date:        18. March 2019
+ * $Revision:    V1.6.0
+ *
+ * Target Processor: Cortex-M cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "arm_math.h"
+
+/**
+  @ingroup groupMath
+ */
+
+/**
+  @defgroup BasicMult Vector Multiplication
+
+  Element-by-element multiplication of two vectors.
+
+  <pre>
+      pDst[n] = pSrcA[n] * pSrcB[n],   0 <= n < blockSize.
+  </pre>
+
+  There are separate functions for floating-point, Q7, Q15, and Q31 data types.
+ */
+
+/**
+  @addtogroup BasicMult
+  @{
+ */
+
+/**
+  @brief         Floating-point vector multiplication.
+  @param[in]     pSrcA      points to the first input vector.
+  @param[in]     pSrcB      points to the second input vector.
+  @param[out]    pDst       points to the output vector.
+  @param[in]     blockSize  number of samples in each vector.
+  @return        none
+ */
+
+#if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE)
+
+#include "arm_helium_utils.h"
+
+void arm_mult_f32(
+  const float32_t * pSrcA,
+  const float32_t * pSrcB,
+        float32_t * pDst,
+        uint32_t blockSize)
+{
+    uint32_t blkCnt;                               /* Iterations left in the current phase */
+    /* Vector build of pDst[n] = pSrcA[n] * pSrcB[n]: full 4-element vectors first, then a tail. */
+    f32x4_t vec1;
+    f32x4_t vec2;
+    f32x4_t res;
+
+    /* Main loop: blockSize / 4 iterations, 4 outputs per iteration */
+    blkCnt = blockSize >> 2U;
+    while (blkCnt > 0U)
+    {
+        /* C = A * B */
+
+        /* Multiply the inputs and then store the results in the destination buffer. */
+        vec1 = vld1q(pSrcA);
+        vec2 = vld1q(pSrcB);
+        res = vmulq(vec1, vec2);
+        vst1q(pDst, res);
+
+        /* Advance all three pointers past the 4 elements just processed */
+        pSrcA += 4;
+        pSrcB += 4;
+        pDst += 4;
+
+        /* Decrement the loop counter */
+        blkCnt--;
+    }
+
+    /* Tail: the blockSize % 4 elements not covered by the main loop */
+    blkCnt = blockSize & 0x3;
+    if (blkCnt > 0U)
+    {
+      /* C = A * B; predicate p0 is built from blkCnt and applied to the store only. */
+      mve_pred16_t p0 = vctp32q(blkCnt);
+      vec1 = vld1q(pSrcA);   /* NOTE(review): loads are not predicated - confirm a full-vector read is safe here */
+      vec2 = vld1q(pSrcB);
+      vstrwq_p(pDst, vmulq(vec1,vec2), p0);
+    }
+
+}
+
+#else
+void arm_mult_f32(
+  const float32_t * pSrcA,
+  const float32_t * pSrcB,
+        float32_t * pDst,
+        uint32_t blockSize)
+{
+    uint32_t blkCnt;                               /* Iterations left in the current phase */
+    /* Computes pDst[n] = pSrcA[n] * pSrcB[n]; a build-time bulk phase (if any) runs first, then a scalar loop. */
+#if defined(ARM_MATH_NEON) && !defined(ARM_MATH_AUTOVECTORIZE)
+    f32x4_t vec1;
+    f32x4_t vec2;
+    f32x4_t res;
+
+    /* NEON bulk phase: blockSize / 4 iterations, 4 outputs per iteration */
+    blkCnt = blockSize >> 2U;
+
+    while (blkCnt > 0U)
+    {
+        /* C = A * B */
+
+        /* Multiply the inputs and then store the results in the destination buffer. */
+        vec1 = vld1q_f32(pSrcA);
+        vec2 = vld1q_f32(pSrcB);
+        res = vmulq_f32(vec1, vec2);
+        vst1q_f32(pDst, res);
+
+        /* Advance all three pointers past the 4 elements just processed */
+        pSrcA += 4;
+        pSrcB += 4;
+        pDst += 4;
+
+        /* Decrement the loop counter */
+        blkCnt--;
+    }
+
+    /* Tail: the blockSize % 4 leftover elements go to the scalar loop at the end */
+    blkCnt = blockSize & 0x3;
+
+#else
+#if defined (ARM_MATH_LOOPUNROLL) && !defined(ARM_MATH_AUTOVECTORIZE)
+
+  /* Loop unrolling: blockSize / 4 iterations, 4 outputs per iteration */
+  blkCnt = blockSize >> 2U;
+
+  while (blkCnt > 0U)
+  {
+    /* C = A * B */
+
+    /* Four scalar multiplies per iteration; each statement advances all three pointers by one. */
+    *pDst++ = (*pSrcA++) * (*pSrcB++);
+
+    *pDst++ = (*pSrcA++) * (*pSrcB++);
+
+    *pDst++ = (*pSrcA++) * (*pSrcB++);
+
+    *pDst++ = (*pSrcA++) * (*pSrcB++);
+
+    /* Decrement loop counter */
+    blkCnt--;
+  }
+
+  /* Loop unrolling: the blockSize % 4 leftover elements go to the scalar loop below */
+  blkCnt = blockSize % 0x4U;
+
+#else
+
+  /* No bulk phase in this build: the scalar loop below handles every sample */
+  blkCnt = blockSize;
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+#endif /* #if defined(ARM_MATH_NEON) */
+
+  while (blkCnt > 0U)
+  {
+    /* C = A * B */
+
+    /* Multiply one pair of inputs and store the result in the destination buffer. */
+    *pDst++ = (*pSrcA++) * (*pSrcB++);
+
+    /* Decrement loop counter */
+    blkCnt--;
+  }
+
+}
+#endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
+
+/**
+  @} end of BasicMult group
+ */
--- a/libraries/cmsis/dsp/Source/BasicMathFunctions/arm_mult_q15.c
+++ b/libraries/cmsis/dsp/Source/BasicMathFunctions/arm_mult_q15.c
@@ -0,0 +1,192 @@
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS DSP Library
+ * Title:        arm_mult_q15.c
+ * Description:  Q15 vector multiplication
+ *
+ * $Date:        18. March 2019
+ * $Revision:    V1.6.0
+ *
+ * Target Processor: Cortex-M cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "arm_math.h"
+
+/**
+  @ingroup groupMath
+ */
+
+/**
+  @addtogroup BasicMult
+  @{
+ */
+
+/**
+  @brief         Q15 vector multiplication
+  @param[in]     pSrcA      points to first input vector
+  @param[in]     pSrcB      points to second input vector
+  @param[out]    pDst       points to output vector
+  @param[in]     blockSize  number of samples in each vector
+  @return        none
+
+  @par           Scaling and Overflow Behavior
+                   The function uses saturating arithmetic.
+                   Results outside of the allowable Q15 range [0x8000 0x7FFF] are saturated.
+ */
+#if defined(ARM_MATH_MVEI)
+
+#include "arm_helium_utils.h"
+
+void arm_mult_q15(
+    const q15_t * pSrcA,
+    const q15_t * pSrcB,
+    q15_t * pDst,
+    uint32_t blockSize)
+{
+    uint32_t  blkCnt;           /* iterations left in the current phase */
+    q15x8_t vecA, vecB;
+    /* Vector build of the Q15 element-wise multiply: full 8-element vectors first, then a tail. */
+    /* Main loop: blockSize / 8 iterations, 8 outputs per iteration */
+    blkCnt = blockSize >> 3;
+    while (blkCnt > 0U)
+    {
+        /*
+         * C = A * B
+         * NOTE(review): vqdmulhq is presumably the saturating doubling multiply-high intrinsic - confirm.
+         */
+        vecA = vld1q(pSrcA);
+        vecB = vld1q(pSrcB);
+        vst1q(pDst, vqdmulhq(vecA, vecB));
+        /*
+         * Decrement the loop counter (one full vector done)
+         */
+        blkCnt--;
+        /*
+         * Advance source and destination pointers past the 8 elements just processed
+         */
+        pSrcA  += 8;
+        pSrcB  += 8;
+        pDst   += 8;
+    }
+    /*
+     * Tail: the blockSize % 8 elements not covered by the main loop
+     */
+    blkCnt = blockSize & 7;
+    if (blkCnt > 0U)
+    {
+        mve_pred16_t p0 = vctp16q(blkCnt);   /* predicate built from blkCnt, applied to the store only */
+        vecA = vld1q(pSrcA);                 /* NOTE(review): loads are not predicated - confirm a full-vector read is safe here */
+        vecB = vld1q(pSrcB);
+        vstrhq_p(pDst, vqdmulhq(vecA, vecB), p0);
+    }
+}
+
+#else
+void arm_mult_q15(
+  const q15_t * pSrcA,
+  const q15_t * pSrcB,
+        q15_t * pDst,
+        uint32_t blockSize)
+{
+        uint32_t blkCnt;                               /* Iterations left in the current phase */
+  /* Each output is (A * B) >> 15 saturated to 16 bits; an optional unrolled phase runs before the scalar loop. */
+#if defined (ARM_MATH_LOOPUNROLL)
+
+#if defined (ARM_MATH_DSP)
+  q31_t inA1, inA2, inB1, inB2;                  /* 32-bit words, each holding two packed q15 inputs */
+  q15_t out1, out2, out3, out4;                  /* Saturated 16-bit results */
+  q31_t mul1, mul2, mul3, mul4;                  /* 32-bit products before shift and saturation */
+#endif
+
+  /* Loop unrolling: blockSize / 4 iterations, 4 outputs per iteration */
+  blkCnt = blockSize >> 2U;
+
+  while (blkCnt > 0U)
+  {
+    /* C = A * B */
+
+#if defined (ARM_MATH_DSP)
+    /* read 2 samples at a time from sourceA */
+    inA1 = read_q15x2_ia ((q15_t **) &pSrcA);
+    /* read 2 samples at a time from sourceB */
+    inB1 = read_q15x2_ia ((q15_t **) &pSrcB);
+    /* read the next 2 samples from sourceA */
+    inA2 = read_q15x2_ia ((q15_t **) &pSrcA);
+    /* read the next 2 samples from sourceB */
+    inB2 = read_q15x2_ia ((q15_t **) &pSrcB);
+
+    /* multiply matching halves: upper half-words (>> 16) and lower half-words (cast to q15_t) */
+    mul1 = (q31_t) ((q15_t) (inA1 >> 16) * (q15_t) (inB1 >> 16));
+    mul2 = (q31_t) ((q15_t) (inA1      ) * (q15_t) (inB1      ));
+    mul3 = (q31_t) ((q15_t) (inA2 >> 16) * (q15_t) (inB2 >> 16));
+    mul4 = (q31_t) ((q15_t) (inA2      ) * (q15_t) (inB2      ));
+
+    /* shift each product right by 15 and saturate the result to 16 bits */
+    out1 = (q15_t) __SSAT(mul1 >> 15, 16);
+    out2 = (q15_t) __SSAT(mul2 >> 15, 16);
+    out3 = (q15_t) __SSAT(mul3 >> 15, 16);
+    out4 = (q15_t) __SSAT(mul4 >> 15, 16);
+
+    /* repack two results per 32-bit word and store; operand order depends on endianness */
+#ifndef ARM_MATH_BIG_ENDIAN
+    write_q15x2_ia (&pDst, __PKHBT(out2, out1, 16));
+    write_q15x2_ia (&pDst, __PKHBT(out4, out3, 16));
+#else
+    write_q15x2_ia (&pDst, __PKHBT(out1, out2, 16));
+    write_q15x2_ia (&pDst, __PKHBT(out3, out4, 16));
+#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
+
+#else
+    *pDst++ = (q15_t) __SSAT((((q31_t) (*pSrcA++) * (*pSrcB++)) >> 15), 16);
+    *pDst++ = (q15_t) __SSAT((((q31_t) (*pSrcA++) * (*pSrcB++)) >> 15), 16);
+    *pDst++ = (q15_t) __SSAT((((q31_t) (*pSrcA++) * (*pSrcB++)) >> 15), 16);
+    *pDst++ = (q15_t) __SSAT((((q31_t) (*pSrcA++) * (*pSrcB++)) >> 15), 16);
+#endif
+
+    /* Decrement loop counter */
+    blkCnt--;
+  }
+
+  /* Loop unrolling: the blockSize % 4 leftover elements go to the scalar loop below */
+  blkCnt = blockSize % 0x4U;
+
+#else
+
+  /* No unrolled phase in this build: the scalar loop below handles every sample */
+  blkCnt = blockSize;
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
+  while (blkCnt > 0U)
+  {
+    /* C = A * B */
+
+    /* Widen to q31_t, multiply, shift right by 15, saturate to 16 bits, store. */
+    *pDst++ = (q15_t) __SSAT((((q31_t) (*pSrcA++) * (*pSrcB++)) >> 15), 16);
+
+    /* Decrement loop counter */
+    blkCnt--;
+  }
+
+}
+#endif /* defined(ARM_MATH_MVEI) */
+
+/**
+  @} end of BasicMult group
+ */
--- a/libraries/cmsis/dsp/Source/BasicMathFunctions/arm_mult_q31.c
+++ b/libraries/cmsis/dsp/Source/BasicMathFunctions/arm_mult_q31.c
@@ -0,0 +1,168 @@
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS DSP Library
+ * Title:        arm_mult_q31.c
+ * Description:  Q31 vector multiplication
+ *
+ * $Date:        18. March 2019
+ * $Revision:    V1.6.0
+ *
+ * Target Processor: Cortex-M cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "arm_math.h"
+
+/**
+  @ingroup groupMath
+ */
+
+/**
+  @addtogroup BasicMult
+  @{
+ */
+
+/**
+  @brief         Q31 vector multiplication.
+  @param[in]     pSrcA      points to the first input vector.
+  @param[in]     pSrcB      points to the second input vector.
+  @param[out]    pDst       points to the output vector.
+  @param[in]     blockSize  number of samples in each vector.
+  @return        none
+
+  @par           Scaling and Overflow Behavior
+                   The function uses saturating arithmetic.
+                   Results outside of the allowable Q31 range [0x80000000 0x7FFFFFFF] are saturated.
+ */
+#if defined(ARM_MATH_MVEI)
+
+#include "arm_helium_utils.h"
+
+void arm_mult_q31(
+    const q31_t * pSrcA,
+    const q31_t * pSrcB,
+    q31_t * pDst,
+    uint32_t blockSize)
+{
+    uint32_t  blkCnt;           /* iterations left in the current phase */
+    q31x4_t vecA, vecB;
+    /* Vector build of the Q31 element-wise multiply: full 4-element vectors first, then a tail. */
+    /* Main loop: blockSize / 4 iterations, 4 outputs per iteration */
+    blkCnt = blockSize >> 2;
+    while (blkCnt > 0U)
+    {
+        /*
+         * C = A * B
+         * NOTE(review): vqdmulhq is presumably the saturating doubling multiply-high intrinsic - confirm.
+         */
+        vecA = vld1q(pSrcA);
+        vecB = vld1q(pSrcB);
+        vst1q(pDst, vqdmulhq(vecA, vecB));
+        /*
+         * Decrement the loop counter (one full vector done)
+         */
+        blkCnt--;
+        /*
+         * Advance source and destination pointers past the 4 elements just processed
+         */
+        pSrcA  += 4;
+        pSrcB  += 4;
+        pDst   += 4;
+    }
+    /*
+     * Tail: the blockSize % 4 elements not covered by the main loop
+     */
+    blkCnt = blockSize & 3;
+    if (blkCnt > 0U)
+    {
+        mve_pred16_t p0 = vctp32q(blkCnt);   /* predicate built from blkCnt, applied to the store only */
+        vecA = vld1q(pSrcA);                 /* NOTE(review): loads are not predicated - confirm a full-vector read is safe here */
+        vecB = vld1q(pSrcB);
+        vstrwq_p(pDst, vqdmulhq(vecA, vecB), p0);
+    }
+}
+
+#else
+void arm_mult_q31(
+  const q31_t * pSrcA,
+  const q31_t * pSrcB,
+        q31_t * pDst,
+        uint32_t blockSize)
+{
+        uint32_t blkCnt;                               /* Iterations left in the current phase */
+        q31_t out;                                     /* Upper 32 bits of the 64-bit product, then its saturated form */
+  /* Per element: 64-bit product >> 32, saturate to 31 bits, then << 1 (so bit 0 of every output is 0). */
+#if defined (ARM_MATH_LOOPUNROLL)
+
+  /* Loop unrolling: blockSize / 4 iterations, 4 outputs per iteration */
+  blkCnt = blockSize >> 2U;
+
+  while (blkCnt > 0U)
+  {
+    /* C = A * B */
+
+    /* The same three-step sequence is repeated four times, once per output. */
+    out = ((q63_t) *pSrcA++ * *pSrcB++) >> 32;
+    out = __SSAT(out, 31);
+    *pDst++ = out << 1U;
+
+    out = ((q63_t) *pSrcA++ * *pSrcB++) >> 32;
+    out = __SSAT(out, 31);
+    *pDst++ = out << 1U;
+
+    out = ((q63_t) *pSrcA++ * *pSrcB++) >> 32;
+    out = __SSAT(out, 31);
+    *pDst++ = out << 1U;
+
+    out = ((q63_t) *pSrcA++ * *pSrcB++) >> 32;
+    out = __SSAT(out, 31);
+    *pDst++ = out << 1U;
+
+    /* Decrement loop counter */
+    blkCnt--;
+  }
+
+  /* Loop unrolling: the blockSize % 4 leftover elements go to the scalar loop below */
+  blkCnt = blockSize % 0x4U;
+
+#else
+
+  /* No unrolled phase in this build: the scalar loop below handles every sample */
+  blkCnt = blockSize;
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
+  while (blkCnt > 0U)
+  {
+    /* C = A * B */
+
+    /* Widen to q63_t, multiply, keep the upper 32 bits, saturate to 31 bits, shift left by one, store. */
+    out = ((q63_t) *pSrcA++ * *pSrcB++) >> 32;
+    out = __SSAT(out, 31);
+    *pDst++ = out << 1U;
+
+    /* Decrement loop counter */
+    blkCnt--;
+  }
+
+}
+#endif /* defined(ARM_MATH_MVEI) */
+
+/**
+  @} end of BasicMult group
+ */
--- a/libraries/cmsis/dsp/Source/BasicMathFunctions/arm_mult_q7.c
+++ b/libraries/cmsis/dsp/Source/BasicMathFunctions/arm_mult_q7.c
@@ -0,0 +1,168 @@
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS DSP Library
+ * Title:        arm_mult_q7.c
+ * Description:  Q7 vector multiplication
+ *
+ * $Date:        18. March 2019
+ * $Revision:    V1.6.0
+ *
+ * Target Processor: Cortex-M cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "arm_math.h"
+
+/**
+  @ingroup groupMath
+ */
+
+/**
+  @addtogroup BasicMult
+  @{
+ */
+
+/**
+  @brief         Q7 vector multiplication
+  @param[in]     pSrcA      points to the first input vector
+  @param[in]     pSrcB      points to the second input vector
+  @param[out]    pDst       points to the output vector
+  @param[in]     blockSize  number of samples in each vector
+  @return        none
+
+  @par           Scaling and Overflow Behavior
+                   The function uses saturating arithmetic.
+                   Results outside of the allowable Q7 range [0x80 0x7F] are saturated.
+ */
+#if defined(ARM_MATH_MVEI)
+
+#include "arm_helium_utils.h"
+
+void arm_mult_q7(
+    const q7_t * pSrcA,
+    const q7_t * pSrcB,
+    q7_t * pDst,
+    uint32_t blockSize)
+{
+    uint32_t  blkCnt;           /* iterations left in the current phase */
+    q7x16_t vecA, vecB;
+    /* Vector build of the Q7 element-wise multiply: full 16-element vectors first, then a tail. */
+    /* Main loop: blockSize / 16 iterations, 16 outputs per iteration */
+    blkCnt = blockSize >> 4;
+    while (blkCnt > 0U)
+    {
+        /*
+         * C = A * B
+         * NOTE(review): vqdmulhq is presumably the saturating doubling multiply-high intrinsic - confirm.
+         */
+        vecA = vld1q(pSrcA);
+        vecB = vld1q(pSrcB);
+        vst1q(pDst, vqdmulhq(vecA, vecB));
+        /*
+         * Decrement the loop counter (one full vector done)
+         */
+        blkCnt--;
+        /*
+         * Advance source and destination pointers past the 16 elements just processed
+         */
+        pSrcA  += 16;
+        pSrcB  += 16;
+        pDst   += 16;
+    }
+    /*
+     * Tail: the blockSize % 16 elements not covered by the main loop
+     */
+    blkCnt = blockSize & 0xF;
+    if (blkCnt > 0U)
+    {
+        mve_pred16_t p0 = vctp8q(blkCnt);    /* predicate built from blkCnt, applied to the store only */
+        vecA = vld1q(pSrcA);                 /* NOTE(review): loads are not predicated - confirm a full-vector read is safe here */
+        vecB = vld1q(pSrcB);
+        vstrbq_p(pDst, vqdmulhq(vecA, vecB), p0);
+    }
+}
+
+#else
+void arm_mult_q7(
+  const q7_t * pSrcA,
+  const q7_t * pSrcB,
+        q7_t * pDst,
+        uint32_t blockSize)
+{
+        uint32_t blkCnt;                               /* Iterations left in the current phase */
+  /* Each output is (A * B) >> 7 saturated to 8 bits; an optional unrolled phase runs before the scalar loop. */
+#if defined (ARM_MATH_LOOPUNROLL)
+
+#if defined (ARM_MATH_DSP)
+  q7_t out1, out2, out3, out4;                   /* Four saturated results awaiting one packed store */
+#endif
+
+  /* Loop unrolling: blockSize / 4 iterations, 4 outputs per iteration */
+  blkCnt = blockSize >> 2U;
+
+  while (blkCnt > 0U)
+  {
+    /* C = A * B */
+
+#if defined (ARM_MATH_DSP)
+    /* Multiply inputs, shift right by 7, saturate to 8 bits; keep results in temporaries */
+    out1 = (q7_t) __SSAT((((q15_t) (*pSrcA++) * (*pSrcB++)) >> 7), 8);
+    out2 = (q7_t) __SSAT((((q15_t) (*pSrcA++) * (*pSrcB++)) >> 7), 8);
+    out3 = (q7_t) __SSAT((((q15_t) (*pSrcA++) * (*pSrcB++)) >> 7), 8);
+    out4 = (q7_t) __SSAT((((q15_t) (*pSrcA++) * (*pSrcB++)) >> 7), 8);
+
+    /* Pack the four results and store them in the destination buffer with a single write */
+    write_q7x4_ia (&pDst, __PACKq7(out1, out2, out3, out4));
+#else
+    *pDst++ = (q7_t) __SSAT((((q15_t) (*pSrcA++) * (*pSrcB++)) >> 7), 8);
+    *pDst++ = (q7_t) __SSAT((((q15_t) (*pSrcA++) * (*pSrcB++)) >> 7), 8);
+    *pDst++ = (q7_t) __SSAT((((q15_t) (*pSrcA++) * (*pSrcB++)) >> 7), 8);
+    *pDst++ = (q7_t) __SSAT((((q15_t) (*pSrcA++) * (*pSrcB++)) >> 7), 8);
+#endif
+
+    /* Decrement loop counter */
+    blkCnt--;
+  }
+
+  /* Loop unrolling: the blockSize % 4 leftover elements go to the scalar loop below */
+  blkCnt = blockSize % 0x4U;
+
+#else
+
+  /* No unrolled phase in this build: the scalar loop below handles every sample */
+  blkCnt = blockSize;
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
+  while (blkCnt > 0U)
+  {
+    /* C = A * B */
+
+    /* Widen to q15_t, multiply, shift right by 7, saturate to 8 bits, store. */
+    *pDst++ = (q7_t) __SSAT((((q15_t) (*pSrcA++) * (*pSrcB++)) >> 7), 8);
+
+    /* Decrement loop counter */
+    blkCnt--;
+  }
+
+}
+#endif /* defined(ARM_MATH_MVEI) */
+
+/**
+  @} end of BasicMult group
+ */
--- a/libraries/cmsis/dsp/Source/BasicMathFunctions/arm_negate_f32.c
+++ b/libraries/cmsis/dsp/Source/BasicMathFunctions/arm_negate_f32.c
@@ -0,0 +1,192 @@
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS DSP Library
+ * Title:        arm_negate_f32.c
+ * Description:  Negates floating-point vectors
+ *
+ * $Date:        18. March 2019
+ * $Revision:    V1.6.0
+ *
+ * Target Processor: Cortex-M cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "arm_math.h"
+
+/**
+  @ingroup groupMath
+ */
+
+/**
+  @defgroup BasicNegate Vector Negate
+
+  Negates the elements of a vector.
+
+  <pre>
+      pDst[n] = -pSrc[n],   0 <= n < blockSize.
+  </pre>
+
+  The functions support in-place computation allowing the source and
+  destination pointers to reference the same memory buffer.
+  There are separate functions for floating-point, Q7, Q15, and Q31 data types.
+ */
+
+/**
+  @addtogroup BasicNegate
+  @{
+ */
+
+/**
+  @brief         Negates the elements of a floating-point vector.
+  @param[in]     pSrc       points to input vector.
+  @param[out]    pDst       points to output vector.
+  @param[in]     blockSize  number of samples in each vector.
+  @return        none
+ */
+
+#if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE)
+
+#include "arm_helium_utils.h"
+
+void arm_negate_f32(
+  const float32_t * pSrc,
+        float32_t * pDst,
+        uint32_t blockSize)
+{
+    uint32_t blkCnt;                               /* Iterations left in the current phase */
+    f32x4_t vec1;
+    f32x4_t res;
+
+    /* Vector build of pDst[n] = -pSrc[n]: full 4-element vectors first, then a tail. */
+    /* Main loop: blockSize / 4 iterations, 4 outputs per iteration */
+    blkCnt = blockSize >> 2U;
+    while (blkCnt > 0U)
+    {
+        /* C = -A */
+
+        /* Negate and then store the results in the destination buffer. */
+        vec1 = vld1q(pSrc);
+        res = vnegq(vec1);
+        vst1q(pDst, res);
+
+        /* Advance both pointers past the 4 elements just processed */
+        pSrc += 4;
+        pDst += 4;
+
+        /* Decrement the loop counter */
+        blkCnt--;
+    }
+
+    /* Tail: the blockSize % 4 elements not covered by the main loop */
+    blkCnt = blockSize & 0x3;
+    if (blkCnt > 0U)
+    {
+      /* C = -A; predicate p0 is built from blkCnt and applied to the store only. */
+      mve_pred16_t p0 = vctp32q(blkCnt);
+      vec1 = vld1q((float32_t const *) pSrc);   /* NOTE(review): load is not predicated - confirm a full-vector read is safe here */
+      vstrwq_p(pDst, vnegq(vec1), p0);
+    }
+
+}
+
+#else
+void arm_negate_f32(
+  const float32_t * pSrc,
+        float32_t * pDst,
+        uint32_t blockSize)
+{
+        uint32_t blkCnt;                               /* Iterations left in the current phase */
+  /* Computes pDst[n] = -pSrc[n]; a build-time bulk phase (if any) runs first, then a scalar loop. */
+#if defined(ARM_MATH_NEON_EXPERIMENTAL) && !defined(ARM_MATH_AUTOVECTORIZE)
+    f32x4_t vec1;
+    f32x4_t res;
+
+    /* NEON bulk phase: blockSize / 4 iterations, 4 outputs per iteration */
+    blkCnt = blockSize >> 2U;
+
+    while (blkCnt > 0U)
+    {
+        /* C = -A */
+
+        /* Negate and then store the results in the destination buffer. */
+        vec1 = vld1q_f32(pSrc);
+        res = vnegq_f32(vec1);
+        vst1q_f32(pDst, res);
+
+        /* Advance both pointers past the 4 elements just processed */
+        pSrc += 4;
+        pDst += 4;
+
+        /* Decrement the loop counter */
+        blkCnt--;
+    }
+
+    /* Tail: the blockSize % 4 leftover elements go to the scalar loop at the end */
+    blkCnt = blockSize & 0x3;
+
+#else
+#if defined (ARM_MATH_LOOPUNROLL) && !defined(ARM_MATH_AUTOVECTORIZE)
+
+  /* Loop unrolling: blockSize / 4 iterations, 4 outputs per iteration */
+  blkCnt = blockSize >> 2U;
+
+  while (blkCnt > 0U)
+  {
+    /* C = -A */
+
+    /* Four scalar negations per iteration; each statement advances both pointers by one. */
+    *pDst++ = -*pSrc++;
+
+    *pDst++ = -*pSrc++;
+
+    *pDst++ = -*pSrc++;
+
+    *pDst++ = -*pSrc++;
+
+    /* Decrement loop counter */
+    blkCnt--;
+  }
+
+  /* Loop unrolling: the blockSize % 4 leftover elements go to the scalar loop below */
+  blkCnt = blockSize % 0x4U;
+
+#else
+
+  /* No bulk phase in this build: the scalar loop below handles every sample */
+  blkCnt = blockSize;
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+#endif /* #if defined(ARM_MATH_NEON_EXPERIMENTAL) */
+
+  while (blkCnt > 0U)
+  {
+    /* C = -A */
+
+    /* Negate one input and store the result in the destination buffer. */
+    *pDst++ = -*pSrc++;
+
+    /* Decrement loop counter */
+    blkCnt--;
+  }
+
+}
+#endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
+
+/**
+  @} end of BasicNegate group
+ */
--- a/libraries/cmsis/dsp/Source/BasicMathFunctions/arm_negate_q15.c
+++ b/libraries/cmsis/dsp/Source/BasicMathFunctions/arm_negate_q15.c
@@ -0,0 +1,171 @@
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS DSP Library
+ * Title:        arm_negate_q15.c
+ * Description:  Negates Q15 vectors
+ *
+ * $Date:        18. March 2019
+ * $Revision:    V1.6.0
+ *
+ * Target Processor: Cortex-M cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "arm_math.h"
+
+/**
+  @ingroup groupMath
+ */
+
+/**
+  @addtogroup BasicNegate
+  @{
+ */
+
+/**
+  @brief         Negates the elements of a Q15 vector.
+  @param[in]     pSrc       points to the input vector.
+  @param[out]    pDst       points to the output vector.
+  @param[in]     blockSize  number of samples in each vector.
+  @return        none
+
+  @par           Conditions for optimum performance
+                   Input and output buffers should be 32-bit aligned.
+  @par           Scaling and Overflow Behavior
+                   The function uses saturating arithmetic.
+                   The Q15 value -1 (0x8000) is saturated to the maximum allowable positive value 0x7FFF.
+ */
+#if defined(ARM_MATH_MVEI)
+
+#include "arm_helium_utils.h"
+
+void arm_negate_q15(
+    const q15_t  * pSrc,
+    q15_t  * pDst,
+    uint32_t blockSize)
+{
+    uint32_t  blkCnt;           /* iterations left in the current phase */
+    q15x8_t vecSrc;
+    /* Vector build of the Q15 negate: full 8-element vectors first, then a tail. */
+    /* Main loop: blockSize / 8 iterations, 8 outputs per iteration */
+    blkCnt = blockSize >> 3;
+    while (blkCnt > 0U)
+    {
+        /*
+         * C = -A
+         * NOTE(review): vqnegq is presumably the saturating negate intrinsic - confirm.
+         */
+        vecSrc = vld1q(pSrc);
+        vst1q(pDst, vqnegq(vecSrc));
+        /*
+         * Decrement the loop counter (one full vector done)
+         */
+        blkCnt--;
+        /*
+         * Advance source and destination pointers past the 8 elements just processed
+         */
+        pSrc += 8;
+        pDst += 8;
+    }
+    /*
+     * Tail: the blockSize % 8 elements not covered by the main loop
+     */
+    blkCnt = blockSize & 7;
+    if (blkCnt > 0U)
+    {
+        mve_pred16_t p0 = vctp16q(blkCnt);   /* predicate built from blkCnt, applied to the store only */
+        vecSrc = vld1q(pSrc);                /* NOTE(review): load is not predicated - confirm a full-vector read is safe here */
+        vstrhq_p(pDst, vqnegq(vecSrc), p0);
+    }
+}
+
+#else
+void arm_negate_q15(
+  const q15_t * pSrc,
+        q15_t * pDst,
+        uint32_t blockSize)
+{
+        uint32_t blkCnt;                               /* Iterations left in the current phase */
+        q15_t in;                                      /* Current sample in the scalar paths */
+  /* Negates each sample; the scalar paths map 0x8000 to 0x7fff instead of negating it. */
+#if defined (ARM_MATH_LOOPUNROLL)
+
+#if defined (ARM_MATH_DSP)
+  q31_t in1;                                    /* 32-bit word holding two packed q15 samples */
+#endif
+
+  /* Loop unrolling: blockSize / 4 iterations, 4 outputs per iteration */
+  blkCnt = blockSize >> 2U;
+
+  while (blkCnt > 0U)
+  {
+    /* C = -A */
+
+#if defined (ARM_MATH_DSP)
+    /* Read two packed samples, compute 0 - in1 with __QSUB16, and write both results back at once. */
+    in1 = read_q15x2_ia ((q15_t **) &pSrc);
+    write_q15x2_ia (&pDst, __QSUB16(0, in1));
+    /* Second pair of this iteration */
+    in1 = read_q15x2_ia ((q15_t **) &pSrc);
+    write_q15x2_ia (&pDst, __QSUB16(0, in1));
+#else
+    in = *pSrc++;
+    *pDst++ = (in == (q15_t) 0x8000) ? (q15_t) 0x7fff : -in;
+
+    in = *pSrc++;
+    *pDst++ = (in == (q15_t) 0x8000) ? (q15_t) 0x7fff : -in;
+
+    in = *pSrc++;
+    *pDst++ = (in == (q15_t) 0x8000) ? (q15_t) 0x7fff : -in;
+
+    in = *pSrc++;
+    *pDst++ = (in == (q15_t) 0x8000) ? (q15_t) 0x7fff : -in;
+#endif
+
+    /* Decrement loop counter */
+    blkCnt--;
+  }
+
+  /* Loop unrolling: the blockSize % 4 leftover elements go to the scalar loop below */
+  blkCnt = blockSize % 0x4U;
+
+#else
+
+  /* No unrolled phase in this build: the scalar loop below handles every sample */
+  blkCnt = blockSize;
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
+  while (blkCnt > 0U)
+  {
+    /* C = -A */
+
+    /* Negate and store; 0x8000 is written as 0x7fff rather than negated. */
+    in = *pSrc++;
+    *pDst++ = (in == (q15_t) 0x8000) ? (q15_t) 0x7fff : -in;
+
+    /* Decrement loop counter */
+    blkCnt--;
+  }
+
+}
+#endif /* defined(ARM_MATH_MVEI) */
+
+/**
+  @} end of BasicNegate group
+ */
--- a/libraries/cmsis/dsp/Source/BasicMathFunctions/arm_negate_q31.c
+++ b/libraries/cmsis/dsp/Source/BasicMathFunctions/arm_negate_q31.c
@@ -0,0 +1,178 @@
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS DSP Library
+ * Title:        arm_negate_q31.c
+ * Description:  Negates Q31 vectors
+ *
+ * $Date:        18. March 2019
+ * $Revision:    V1.6.0
+ *
+ * Target Processor: Cortex-M cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "arm_math.h"
+
+/**
+  @ingroup groupMath
+ */
+
+/**
+  @addtogroup BasicNegate
+  @{
+ */
+
+/**
+  @brief         Negates the elements of a Q31 vector.
+  @param[in]     pSrc       points to the input vector.
+  @param[out]    pDst       points to the output vector.
+  @param[in]     blockSize   number of samples in each vector.
+  @return        none
+
+  @par           Scaling and Overflow Behavior
+                   The function uses saturating arithmetic.
+                   The Q31 value -1 (0x80000000) is saturated to the maximum allowable positive value 0x7FFFFFFF.
+ */
+
+#if defined(ARM_MATH_MVEI)
+
+#include "arm_helium_utils.h"
+
+void arm_negate_q31(
+    const q31_t * pSrc,
+    q31_t * pDst,
+    uint32_t blockSize)
+{
+    uint32_t  fullVecs = blockSize >> 2;    /* passes that fill all 4 lanes */
+    uint32_t  tailLen = blockSize & 3;      /* samples left after those passes */
+    q31x4_t lanes;
+
+    /*
+     * Main loop, C = -A:
+     * load 4 samples, negate them with vqnegq and store all 4 results.
+     */
+    for (; fullVecs > 0U; fullVecs--)
+    {
+        lanes = vld1q(pSrc);
+        vst1q(pDst, vqnegq(lanes));
+
+        /* step both pointers past the 4 samples just handled */
+        pSrc += 4;
+        pDst += 4;
+    }
+
+    /*
+     * Tail: 1 to 3 samples remain.
+     * The load is a plain vld1q of a whole vector; only the store takes
+     * the vctp32q(tailLen) predicate.
+     * NOTE(review): the load can therefore read beyond the last source
+     * sample - confirm that is acceptable for every caller.
+     */
+    if (tailLen > 0U)
+    {
+        mve_pred16_t tailPred = vctp32q(tailLen);
+
+        lanes = vld1q(pSrc);
+        /* negate, then store under the predicate */
+        vstrwq_p(pDst, vqnegq(lanes), tailPred);
+    }
+}
+
+#else
+void arm_negate_q31(
+  const q31_t * pSrc,
+        q31_t * pDst,
+        uint32_t blockSize)
+{
+        uint32_t remaining;                            /* Passes still to run */
+        q31_t sample;                                  /* Input sample being negated */
+
+#if defined (ARM_MATH_LOOPUNROLL)
+
+  /*
+   * Unrolled stage: every pass writes 4 outputs (C = -A),
+   * so it runs blockSize / 4 times.
+   */
+  for (remaining = blockSize >> 2U; remaining > 0U; remaining--)
+  {
+#if defined (ARM_MATH_DSP)
+    /* DSP build: each output is __QSUB(0, sample). */
+    sample = *pSrc++;
+    *pDst++ = __QSUB(0, sample);
+
+    sample = *pSrc++;
+    *pDst++ = __QSUB(0, sample);
+
+    sample = *pSrc++;
+    *pDst++ = __QSUB(0, sample);
+
+    sample = *pSrc++;
+    *pDst++ = __QSUB(0, sample);
+#else
+    /* Plain C build: INT32_MIN is stored as INT32_MAX rather than negated. */
+    sample = *pSrc++;
+    *pDst++ = (sample == INT32_MIN) ? INT32_MAX : -sample;
+
+    sample = *pSrc++;
+    *pDst++ = (sample == INT32_MIN) ? INT32_MAX : -sample;
+
+    sample = *pSrc++;
+    *pDst++ = (sample == INT32_MIN) ? INT32_MAX : -sample;
+
+    sample = *pSrc++;
+    *pDst++ = (sample == INT32_MIN) ? INT32_MAX : -sample;
+#endif
+  }
+
+  /* Leftover stage: the final blockSize mod 4 samples. */
+  remaining = blockSize & 0x3U;
+
+#else
+
+  /* No unrolling: the loop below walks the whole block. */
+  remaining = blockSize;
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
+  /* C = -A, one sample per pass. */
+  for (; remaining > 0U; remaining--)
+  {
+    sample = *pSrc++;
+
+#if defined (ARM_MATH_DSP)
+    /* DSP build: the output is __QSUB(0, sample). */
+    *pDst++ = __QSUB(0, sample);
+#else
+    /* INT32_MIN is stored as INT32_MAX; every other value is negated. */
+    if (sample == INT32_MIN)
+    {
+      *pDst++ = INT32_MAX;
+    }
+    else
+    {
+      *pDst++ = -sample;
+    }
+#endif
+  }
+
+}
+#endif /* defined(ARM_MATH_MVEI) */
+
+/**
+  @} end of BasicNegate group
+ */
--- a/libraries/cmsis/dsp/Source/BasicMathFunctions/arm_negate_q7.c
+++ b/libraries/cmsis/dsp/Source/BasicMathFunctions/arm_negate_q7.c
@@ -0,0 +1,171 @@
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS DSP Library
+ * Title:        arm_negate_q7.c
+ * Description:  Negates Q7 vectors
+ *
+ * $Date:        18. March 2019
+ * $Revision:    V1.6.0
+ *
+ * Target Processor: Cortex-M cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "arm_math.h"
+
+/**
+  @ingroup groupMath
+ */
+
+/**
+  @addtogroup BasicNegate
+  @{
+ */
+
+/**
+  @brief         Negates the elements of a Q7 vector.
+  @param[in]     pSrc       points to the input vector.
+  @param[out]    pDst       points to the output vector.
+  @param[in]     blockSize   number of samples in each vector.
+  @return        none
+
+  @par           Scaling and Overflow Behavior
+                   The function uses saturating arithmetic.
+                   The Q7 value -1 (0x80) is saturated to the maximum allowable positive value 0x7F.
+ */
+#if defined(ARM_MATH_MVEI)
+
+#include "arm_helium_utils.h"
+
+void arm_negate_q7(
+    const q7_t   * pSrc,
+    q7_t   * pDst,
+    uint32_t blockSize)
+{
+    uint32_t  fullVecs = blockSize >> 4;    /* passes that fill all 16 lanes */
+    uint32_t  tailLen = blockSize & 0xF;    /* samples left after those passes */
+    q7x16_t lanes;
+
+    /*
+     * Main loop, C = -A:
+     * load 16 samples, negate them with vqnegq and store all 16 results.
+     */
+    for (; fullVecs > 0U; fullVecs--)
+    {
+        lanes = vld1q(pSrc);
+        vst1q(pDst, vqnegq(lanes));
+
+        /* step both pointers past the 16 samples just handled */
+        pSrc += 16;
+        pDst += 16;
+    }
+
+    /*
+     * Tail: 1 to 15 samples remain.
+     * The load is a plain vld1q of a whole vector; only the store takes
+     * the vctp8q(tailLen) predicate.
+     * NOTE(review): the load can therefore read beyond the last source
+     * sample - confirm that is acceptable for every caller.
+     */
+    if (tailLen > 0U)
+    {
+        mve_pred16_t tailPred = vctp8q(tailLen);
+
+        lanes = vld1q(pSrc);
+        /* negate, then store under the predicate */
+        vstrbq_p(pDst, vqnegq(lanes), tailPred);
+    }
+}
+
+#else
+void arm_negate_q7(
+  const q7_t * pSrc,
+        q7_t * pDst,
+        uint32_t blockSize)
+{
+        uint32_t remaining;                            /* Passes still to run */
+        q7_t sample;                                   /* Input sample being negated */
+
+#if defined (ARM_MATH_LOOPUNROLL)
+
+#if defined (ARM_MATH_DSP)
+  q31_t packed;                                 /* One word holding four input samples */
+#endif
+
+  /*
+   * Unrolled stage: every pass writes 4 outputs (C = -A),
+   * so it runs blockSize / 4 times.
+   */
+  for (remaining = blockSize >> 2U; remaining > 0U; remaining--)
+  {
+#if defined (ARM_MATH_DSP)
+    /* All four samples of the pass travel in a single word. */
+    packed = read_q7x4_ia ((q7_t **) &pSrc);
+    write_q7x4_ia (&pDst, __QSUB8(0, packed));
+#else
+    /* One sample at a time; 0x80 is stored as 0x7f rather than negated. */
+    sample = *pSrc++;
+    *pDst++ = (sample == (q7_t) 0x80) ? (q7_t) 0x7f : -sample;
+
+    sample = *pSrc++;
+    *pDst++ = (sample == (q7_t) 0x80) ? (q7_t) 0x7f : -sample;
+
+    sample = *pSrc++;
+    *pDst++ = (sample == (q7_t) 0x80) ? (q7_t) 0x7f : -sample;
+
+    sample = *pSrc++;
+    *pDst++ = (sample == (q7_t) 0x80) ? (q7_t) 0x7f : -sample;
+#endif
+  }
+
+  /* Leftover stage: the final blockSize mod 4 samples. */
+  remaining = blockSize & 0x3U;
+
+#else
+
+  /* No unrolling: the loop below walks the whole block. */
+  remaining = blockSize;
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
+  /* C = -A, one sample per pass. */
+  for (; remaining > 0U; remaining--)
+  {
+    sample = *pSrc++;
+
+#if defined (ARM_MATH_DSP)
+    /* DSP build: the output is the q7_t-cast result of __QSUB8(0, sample). */
+    *pDst++ = (q7_t) __QSUB8(0, sample);
+#else
+    /* 0x80 is stored as 0x7f; every other value is negated. */
+    if (sample == (q7_t) 0x80)
+    {
+      *pDst++ = (q7_t) 0x7f;
+    }
+    else
+    {
+      *pDst++ = -sample;
+    }
+#endif
+  }
+}
+#endif /* defined(ARM_MATH_MVEI) */
+
+/**
+  @} end of BasicNegate group
+ */
--- a/libraries/cmsis/dsp/Source/BasicMathFunctions/arm_not_u16.c
+++ b/libraries/cmsis/dsp/Source/BasicMathFunctions/arm_not_u16.c
@@ -0,0 +1,130 @@
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS DSP Library
+ * Title:        arm_not_u16.c
+ * Description:  uint16_t bitwise NOT
+ *
+ * $Date:        14 November 2019
+ * $Revision:    V1.6.0
+ *
+ * Target Processor: Cortex-M cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "arm_math.h"
+
+/**
+  @ingroup groupMath
+ */
+
+/**
+  @defgroup Not Vector bitwise NOT
+
+  Compute the logical bitwise NOT.
+
+  There are separate functions for uint32_t, uint16_t, and uint8_t data types.
+ */
+
+/**
+  @addtogroup Not
+  @{
+ */
+
+/**
+  @brief         Compute the logical bitwise NOT of a fixed-point vector.
+  @param[in]     pSrc       points to input vector
+  @param[out]    pDst       points to output vector
+  @param[in]     blockSize  number of samples in each vector
+  @return        none
+ */
+
+void arm_not_u16(
+    const uint16_t * pSrc,
+          uint16_t * pDst,
+          uint32_t blockSize)
+{
+    uint32_t blkCnt;      /* Loop counter */
+
+#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
+    uint16x8_t vecSrc;    /* Unsigned lanes, matching pSrc/pDst and vmvnq_u16 */
+
+    /* Compute 8 outputs at a time */
+    blkCnt = blockSize >> 3;
+
+    while (blkCnt > 0U)
+    {
+        vecSrc = vld1q(pSrc);
+
+        vst1q(pDst, vmvnq_u16(vecSrc) );
+
+        pSrc += 8;
+        pDst += 8;
+
+        /* Decrement the loop counter */
+        blkCnt--;
+    }
+
+    /* Tail: 1 to 7 samples remain; only the store below is predicated */
+    blkCnt = blockSize & 7;
+
+    if (blkCnt > 0U)
+    {
+        mve_pred16_t p0 = vctp16q(blkCnt);
+        vecSrc = vld1q(pSrc);
+        vstrhq_p(pDst, vmvnq_u16(vecSrc), p0);
+    }
+#else
+#if defined(ARM_MATH_NEON) && !defined(ARM_MATH_AUTOVECTORIZE)
+    uint16x8_t inV;
+
+    /* Compute 8 outputs at a time */
+    blkCnt = blockSize >> 3U;
+
+    while (blkCnt > 0U)
+    {
+        inV = vld1q_u16(pSrc);
+
+        vst1q_u16(pDst, vmvnq_u16(inV) );
+
+        pSrc += 8;
+        pDst += 8;
+
+        /* Decrement the loop counter */
+        blkCnt--;
+    }
+
+    /* Tail: the scalar loop below finishes the last blockSize mod 8 samples */
+    blkCnt = blockSize & 7;
+#else
+    /* Initialize blkCnt with number of samples */
+    blkCnt = blockSize;
+#endif
+
+    while (blkCnt > 0U)
+    {
+        /* ~ yields a promoted int; the cast narrows it back explicitly */
+        *pDst++ = (uint16_t) ~(*pSrc++);
+        /* Decrement the loop counter */
+        blkCnt--;
+    }
+#endif /* if defined(ARM_MATH_MVEI) */
+}
+
+/**
+  @} end of Not group
+ */
--- a/libraries/cmsis/dsp/Source/BasicMathFunctions/arm_not_u32.c
+++ b/libraries/cmsis/dsp/Source/BasicMathFunctions/arm_not_u32.c
@@ -0,0 +1,122 @@
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS DSP Library
+ * Title:        arm_not_u32.c
+ * Description:  uint32_t bitwise NOT
+ *
+ * $Date:        14 November 2019
+ * $Revision:    V1.6.0
+ *
+ * Target Processor: Cortex-M cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "arm_math.h"
+
+/**
+  @ingroup groupMath
+ */
+
+/**
+  @addtogroup Not
+  @{
+ */
+
+/**
+  @brief         Compute the logical bitwise NOT of a fixed-point vector.
+  @param[in]     pSrc       points to input vector
+  @param[out]    pDst       points to output vector
+  @param[in]     blockSize  number of samples in each vector
+  @return        none
+ */
+
+void arm_not_u32(
+    const uint32_t * pSrc,
+          uint32_t * pDst,
+          uint32_t blockSize)
+{
+    uint32_t blkCnt;      /* Loop counter */
+
+#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
+    uint32x4_t vecSrc;    /* Unsigned lanes, matching pSrc/pDst and vmvnq_u32 */
+
+    /* Compute 4 outputs at a time */
+    blkCnt = blockSize >> 2;
+
+    while (blkCnt > 0U)
+    {
+        vecSrc = vld1q(pSrc);
+
+        vst1q(pDst, vmvnq_u32(vecSrc) );
+
+        pSrc += 4;
+        pDst += 4;
+
+        /* Decrement the loop counter */
+        blkCnt--;
+    }
+
+    /* Tail: 1 to 3 samples remain; only the store below is predicated */
+    blkCnt = blockSize & 3;
+
+    if (blkCnt > 0U)
+    {
+        mve_pred16_t p0 = vctp32q(blkCnt);
+        vecSrc = vld1q(pSrc);
+        vstrwq_p(pDst, vmvnq_u32(vecSrc), p0);
+    }
+#else
+#if defined(ARM_MATH_NEON) && !defined(ARM_MATH_AUTOVECTORIZE)
+    uint32x4_t inV;
+
+    /* Compute 4 outputs at a time */
+    blkCnt = blockSize >> 2U;
+
+    while (blkCnt > 0U)
+    {
+        inV = vld1q_u32(pSrc);
+
+        vst1q_u32(pDst, vmvnq_u32(inV) );
+
+        pSrc += 4;
+        pDst += 4;
+
+        /* Decrement the loop counter */
+        blkCnt--;
+    }
+
+    /* Tail: the scalar loop below finishes the last blockSize mod 4 samples */
+    blkCnt = blockSize & 3;
+#else
+    /* Initialize blkCnt with number of samples */
+    blkCnt = blockSize;
+#endif
+
+    while (blkCnt > 0U)
+    {
+        *pDst++ = ~(*pSrc++);
+
+        /* Decrement the loop counter */
+        blkCnt--;
+    }
+#endif /* if defined(ARM_MATH_MVEI) */
+}
+
+/**
+  @} end of Not group
+ */
--- a/libraries/cmsis/dsp/Source/BasicMathFunctions/arm_not_u8.c
+++ b/libraries/cmsis/dsp/Source/BasicMathFunctions/arm_not_u8.c
@@ -0,0 +1,122 @@
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS DSP Library
+ * Title:        arm_not_u8.c
+ * Description:  uint8_t bitwise NOT
+ *
+ * $Date:        14 November 2019
+ * $Revision:    V1.6.0
+ *
+ * Target Processor: Cortex-M cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "arm_math.h"
+
+/**
+  @ingroup groupMath
+ */
+
+/**
+  @addtogroup Not
+  @{
+ */
+
+/**
+  @brief         Compute the logical bitwise NOT of a fixed-point vector.
+  @param[in]     pSrc       points to input vector
+  @param[out]    pDst       points to output vector
+  @param[in]     blockSize  number of samples in each vector
+  @return        none
+ */
+
+void arm_not_u8(
+    const uint8_t * pSrc,
+          uint8_t * pDst,
+          uint32_t blockSize)
+{
+    uint32_t blkCnt;      /* Loop counter */
+
+#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
+    uint8x16_t vecSrc;    /* Unsigned lanes, matching pSrc/pDst and vmvnq_u8 */
+
+    /* Compute 16 outputs at a time */
+    blkCnt = blockSize >> 4;
+
+    while (blkCnt > 0U)
+    {
+        vecSrc = vld1q(pSrc);
+
+        vst1q(pDst, vmvnq_u8(vecSrc) );
+
+        pSrc += 16;
+        pDst += 16;
+
+        /* Decrement the loop counter */
+        blkCnt--;
+    }
+
+    /* Tail: 1 to 15 samples remain; only the store below is predicated */
+    blkCnt = blockSize & 0xF;
+
+    if (blkCnt > 0U)
+    {
+        mve_pred16_t p0 = vctp8q(blkCnt);
+        vecSrc = vld1q(pSrc);
+        vstrbq_p(pDst, vmvnq_u8(vecSrc), p0);
+    }
+#else
+#if defined(ARM_MATH_NEON) && !defined(ARM_MATH_AUTOVECTORIZE)
+    uint8x16_t inV;
+
+    /* Compute 16 outputs at a time */
+    blkCnt = blockSize >> 4U;
+
+    while (blkCnt > 0U)
+    {
+        inV = vld1q_u8(pSrc);
+
+        vst1q_u8(pDst, vmvnq_u8(inV) );
+
+        pSrc += 16;
+        pDst += 16;
+
+        /* Decrement the loop counter */
+        blkCnt--;
+    }
+
+    /* Tail: the scalar loop below finishes the last blockSize mod 16 samples */
+    blkCnt = blockSize & 0xF;
+#else
+    /* Initialize blkCnt with number of samples */
+    blkCnt = blockSize;
+#endif
+
+    while (blkCnt > 0U)
+    {
+        /* ~ yields a promoted int; the cast narrows it back explicitly */
+        *pDst++ = (uint8_t) ~(*pSrc++);
+        /* Decrement the loop counter */
+        blkCnt--;
+    }
+#endif /* if defined(ARM_MATH_MVEI) */
+}
+
+/**
+  @} end of Not group
+ */
--- a/libraries/cmsis/dsp/Source/BasicMathFunctions/arm_offset_f32.c
+++ b/libraries/cmsis/dsp/Source/BasicMathFunctions/arm_offset_f32.c
@@ -0,0 +1,196 @@
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS DSP Library
+ * Title:        arm_offset_f32.c
+ * Description:  Floating-point vector offset
+ *
+ * $Date:        18. March 2019
+ * $Revision:    V1.6.0
+ *
+ * Target Processor: Cortex-M cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "arm_math.h"
+
+/**
+  @ingroup groupMath
+ */
+
+/**
+  @defgroup BasicOffset Vector Offset
+
+  Adds a constant offset to each element of a vector.
+
+  <pre>
+      pDst[n] = pSrc[n] + offset,   0 <= n < blockSize.
+  </pre>
+
+  The functions support in-place computation allowing the source and
+  destination pointers to reference the same memory buffer.
+  There are separate functions for floating-point, Q7, Q15, and Q31 data types.
+ */
+
+/**
+  @addtogroup BasicOffset
+  @{
+ */
+
+/**
+  @brief         Adds a constant offset to a floating-point vector.
+  @param[in]     pSrc       points to the input vector
+  @param[in]     offset     is the offset to be added
+  @param[out]    pDst       points to the output vector
+  @param[in]     blockSize  number of samples in each vector
+  @return        none
+ */
+
+#if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE)
+
+#include "arm_helium_utils.h"
+
+void arm_offset_f32(
+  const float32_t * pSrc,
+        float32_t offset,
+        float32_t * pDst,
+        uint32_t blockSize)
+{
+    uint32_t fullVecs = blockSize >> 2U;    /* passes that fill all 4 lanes */
+    uint32_t tailLen = blockSize & 0x3;     /* samples left after those passes */
+    f32x4_t lanes;
+    f32x4_t sum;
+
+    /*
+     * Main loop, C = A + offset:
+     * load 4 samples, add the scalar offset with vaddq and store all 4 sums.
+     */
+    for (; fullVecs > 0U; fullVecs--)
+    {
+        lanes = vld1q(pSrc);
+        sum = vaddq(lanes, offset);
+        vst1q(pDst, sum);
+
+        /* step both pointers past the 4 samples just handled */
+        pSrc += 4;
+        pDst += 4;
+    }
+
+    /*
+     * Tail: 1 to 3 samples remain.
+     * The load is a plain vld1q of a whole vector; only the store takes
+     * the vctp32q(tailLen) predicate.
+     * NOTE(review): the load can therefore read beyond the last source
+     * sample - confirm that is acceptable for every caller.
+     */
+    if (tailLen > 0U)
+    {
+        mve_pred16_t tailPred = vctp32q(tailLen);
+
+        lanes = vld1q((float32_t const *) pSrc);
+        sum = vaddq(lanes, offset);
+        vstrwq_p(pDst, sum, tailPred);
+    }
+}
+
+#else
+void arm_offset_f32(
+  const float32_t * pSrc,
+        float32_t offset,
+        float32_t * pDst,
+        uint32_t blockSize)
+{
+        uint32_t remaining;                            /* Passes still to run */
+
+  /*
+   * ARM_MATH_NEON_EXPERIMENTAL selects a 4-wide NEON stage and
+   * ARM_MATH_LOOPUNROLL a 4x unrolled scalar stage (neither when
+   * ARM_MATH_AUTOVECTORIZE is defined); a scalar loop finishes the block.
+   */
+
+#if defined(ARM_MATH_NEON_EXPERIMENTAL) && !defined(ARM_MATH_AUTOVECTORIZE)
+    f32x4_t lanes;
+    f32x4_t sum;
+
+    /*
+     * NEON stage, C = A + offset: every pass loads 4 samples, adds a
+     * vector filled with offset (vdupq_n_f32) and stores the 4 sums.
+     */
+    for (remaining = blockSize >> 2U; remaining > 0U; remaining--)
+    {
+        lanes = vld1q_f32(pSrc);
+        sum = vaddq_f32(lanes, vdupq_n_f32(offset));
+        vst1q_f32(pDst, sum);
+
+        /* step both pointers past the 4 samples just handled */
+        pSrc += 4;
+        pDst += 4;
+    }
+
+    /* Leftover stage: the final blockSize mod 4 samples. */
+    remaining = blockSize & 0x3;
+
+#else
+#if defined (ARM_MATH_LOOPUNROLL) && !defined(ARM_MATH_AUTOVECTORIZE)
+
+  /*
+   * Unrolled stage: every pass writes 4 outputs (C = A + offset),
+   * so it runs blockSize / 4 times.
+   */
+  for (remaining = blockSize >> 2U; remaining > 0U; remaining--)
+  {
+    /* Each sample is read and its sum written before the next one is touched. */
+    pDst[0] = pSrc[0] + offset;
+
+    pDst[1] = pSrc[1] + offset;
+
+    pDst[2] = pSrc[2] + offset;
+
+    pDst[3] = pSrc[3] + offset;
+
+    /* step both pointers past the 4 samples just handled */
+    pSrc += 4;
+    pDst += 4;
+  }
+
+  /* Leftover stage: the final blockSize mod 4 samples. */
+  remaining = blockSize & 0x3U;
+
+#else
+
+  /* No vector or unrolled stage: the loop below walks the whole block. */
+  remaining = blockSize;
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+#endif /* #if defined(ARM_MATH_NEON_EXPERIMENTAL) */
+
+  /* C = A + offset, one sample per pass. */
+  for (; remaining > 0U; remaining--)
+  {
+    *pDst = *pSrc + offset;
+
+    /* move on to the next sample */
+    pSrc++;
+    pDst++;
+  }
+
+}
+#endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
+
+/**
+  @} end of BasicOffset group
+ */
--- a/libraries/cmsis/dsp/Source/BasicMathFunctions/arm_offset_q15.c
+++ b/libraries/cmsis/dsp/Source/BasicMathFunctions/arm_offset_q15.c
@@ -0,0 +1,168 @@
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS DSP Library
+ * Title:        arm_offset_q15.c
+ * Description:  Q15 vector offset
+ *
+ * $Date:        18. March 2019
+ * $Revision:    V1.6.0
+ *
+ * Target Processor: Cortex-M cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "arm_math.h"
+
+/**
+  @ingroup groupMath
+ */
+
+/**
+  @addtogroup BasicOffset
+  @{
+ */
+
+/**
+  @brief         Adds a constant offset to a Q15 vector.
+  @param[in]     pSrc       points to the input vector
+  @param[in]     offset     is the offset to be added
+  @param[out]    pDst       points to the output vector
+  @param[in]     blockSize  number of samples in each vector
+  @return        none
+
+  @par           Scaling and Overflow Behavior
+                   The function uses saturating arithmetic.
+                   Results outside of the allowable Q15 range [0x8000 0x7FFF] are saturated.
+ */
+#if defined(ARM_MATH_MVEI)
+
+#include "arm_helium_utils.h"
+
+void arm_offset_q15(
+    const q15_t * pSrc,
+    q15_t   offset,
+    q15_t * pDst,
+    uint32_t blockSize)
+{
+    uint32_t  fullVecs = blockSize >> 3;    /* passes that fill all 8 lanes */
+    uint32_t  tailLen = blockSize & 7;      /* samples left after those passes */
+    q15x8_t lanes;
+
+    /*
+     * Main loop, C = A + offset:
+     * load 8 samples, add the scalar offset with vqaddq and store all 8 sums.
+     */
+    for (; fullVecs > 0U; fullVecs--)
+    {
+        lanes = vld1q(pSrc);
+        vst1q(pDst, vqaddq(lanes, offset));
+
+        /* step both pointers past the 8 samples just handled */
+        pSrc += 8;
+        pDst += 8;
+    }
+
+    /*
+     * Tail: 1 to 7 samples remain.
+     * The load is a plain vld1q of a whole vector; only the store takes
+     * the vctp16q(tailLen) predicate.
+     * NOTE(review): the load can therefore read beyond the last source
+     * sample - confirm that is acceptable for every caller.
+     */
+    if (tailLen > 0U)
+    {
+        mve_pred16_t tailPred = vctp16q(tailLen);
+
+        lanes = vld1q(pSrc);
+        /* add the offset, then store under the predicate */
+        vstrhq_p(pDst, vqaddq(lanes, offset), tailPred);
+    }
+}
+
+
+#else
+void arm_offset_q15(
+  const q15_t * pSrc,
+        q15_t offset,
+        q15_t * pDst,
+        uint32_t blockSize)
+{
+        uint32_t remaining;                            /* Passes still to run */
+#if !defined (ARM_MATH_DSP)
+        q31_t wide;                                    /* Sum held in q31_t ahead of __SSAT */
+#endif
+
+#if defined (ARM_MATH_LOOPUNROLL)
+
+#if defined (ARM_MATH_DSP)
+  /* offset packed into a 32-bit word so that one __QADD16 call serves two samples */
+  q31_t offsetPair = __PKHBT(offset, offset, 16);
+#endif
+
+  /*
+   * Unrolled stage: every pass writes 4 outputs (C = A + offset),
+   * so it runs blockSize / 4 times.
+   */
+  for (remaining = blockSize >> 2U; remaining > 0U; remaining--)
+  {
+#if defined (ARM_MATH_DSP)
+    /* Two samples travel in each word, so two words cover the pass. */
+    write_q15x2_ia (&pDst, __QADD16(read_q15x2_ia ((q15_t **) &pSrc), offsetPair));
+    write_q15x2_ia (&pDst, __QADD16(read_q15x2_ia ((q15_t **) &pSrc), offsetPair));
+#else
+    /* One sample at a time: widen, add, then __SSAT(..., 16) before storing. */
+    wide = (q31_t) *pSrc++ + offset;
+    *pDst++ = (q15_t) __SSAT(wide, 16);
+
+    wide = (q31_t) *pSrc++ + offset;
+    *pDst++ = (q15_t) __SSAT(wide, 16);
+
+    wide = (q31_t) *pSrc++ + offset;
+    *pDst++ = (q15_t) __SSAT(wide, 16);
+
+    wide = (q31_t) *pSrc++ + offset;
+    *pDst++ = (q15_t) __SSAT(wide, 16);
+#endif
+  }
+
+  /* Leftover stage: the final blockSize mod 4 samples. */
+  remaining = blockSize & 0x3U;
+
+#else
+
+  /* No unrolling: the loop below walks the whole block. */
+  remaining = blockSize;
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
+  /* C = A + offset, one sample per pass. */
+  for (; remaining > 0U; remaining--)
+  {
+#if defined (ARM_MATH_DSP)
+    *pDst++ = (q15_t) __QADD16(*pSrc++, offset);
+#else
+    wide = (q31_t) *pSrc++ + offset;
+    *pDst++ = (q15_t) __SSAT(wide, 16);
+#endif
+  }
+}
+#endif /* defined(ARM_MATH_MVEI) */
+
+/**
+  @} end of BasicOffset group
+ */
--- a/libraries/cmsis/dsp/Source/BasicMathFunctions/arm_offset_q31.c
+++ b/libraries/cmsis/dsp/Source/BasicMathFunctions/arm_offset_q31.c
@@ -0,0 +1,175 @@
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS DSP Library
+ * Title:        arm_offset_q31.c
+ * Description:  Q31 vector offset
+ *
+ * $Date:        18. March 2019
+ * $Revision:    V1.6.0
+ *
+ * Target Processor: Cortex-M cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "arm_math.h"
+
+/**
+  @ingroup groupMath
+ */
+
+/**
+  @addtogroup BasicOffset
+  @{
+ */
+
+/**
+  @brief         Adds a constant offset to a Q31 vector.
+  @param[in]     pSrc       points to the input vector
+  @param[in]     offset     is the offset to be added
+  @param[out]    pDst       points to the output vector
+  @param[in]     blockSize  number of samples in each vector
+  @return        none
+
+  @par           Scaling and Overflow Behavior
+                   The function uses saturating arithmetic.
+                   Results outside of the allowable Q31 range [0x80000000 0x7FFFFFFF] are saturated.
+ */
+
+#if defined(ARM_MATH_MVEI)
+
+#include "arm_helium_utils.h"
+
+void arm_offset_q31(
+    const q31_t * pSrc,
+    q31_t   offset,
+    q31_t * pDst,
+    uint32_t blockSize)
+{
+    uint32_t  fullVecs = blockSize >> 2;    /* passes that fill all 4 lanes */
+    uint32_t  tailLen = blockSize & 3;      /* samples left after those passes */
+    q31x4_t lanes;
+
+    /*
+     * Main loop, C = A + offset:
+     * load 4 samples, add the scalar offset with vqaddq and store all 4 sums.
+     */
+    for (; fullVecs > 0U; fullVecs--)
+    {
+        lanes = vld1q(pSrc);
+        vst1q(pDst, vqaddq(lanes, offset));
+
+        /* step both pointers past the 4 samples just handled */
+        pSrc += 4;
+        pDst += 4;
+    }
+
+    /*
+     * Tail: 1 to 3 samples remain.
+     * The load is a plain vld1q of a whole vector; only the store takes
+     * the vctp32q(tailLen) predicate.
+     * NOTE(review): the load can therefore read beyond the last source
+     * sample - confirm that is acceptable for every caller.
+     */
+    if (tailLen > 0U)
+    {
+        mve_pred16_t tailPred = vctp32q(tailLen);
+
+        lanes = vld1q(pSrc);
+        /* add the offset, then store under the predicate */
+        vstrwq_p(pDst, vqaddq(lanes, offset), tailPred);
+    }
+}
+
+#else
+void arm_offset_q31(
+  const q31_t * pSrc,
+        q31_t offset,
+        q31_t * pDst,
+        uint32_t blockSize)
+{
+        uint32_t remaining;                            /* Passes still to run */
+#if !defined (ARM_MATH_DSP)
+        q63_t wide;                                    /* Sum held in q63_t ahead of clipping */
+#endif
+  /*
+   * C = A + offset for every sample. ARM_MATH_DSP builds add with __QADD;
+   * other builds form the sum in q63_t and pass it to clip_q63_to_q31.
+   */
+
+#if defined (ARM_MATH_LOOPUNROLL)
+
+  /*
+   * Unrolled stage: every pass writes 4 outputs,
+   * so it runs blockSize / 4 times.
+   */
+  for (remaining = blockSize >> 2U; remaining > 0U; remaining--)
+  {
+#if defined (ARM_MATH_DSP)
+    /* DSP build: each output is __QADD(sample, offset). */
+    *pDst++ = __QADD(*pSrc++, offset);
+
+    *pDst++ = __QADD(*pSrc++, offset);
+
+    *pDst++ = __QADD(*pSrc++, offset);
+
+    *pDst++ = __QADD(*pSrc++, offset);
+#else
+    /* Plain C build: widen, add, then clip back to q31_t before storing. */
+    wide = (q63_t) *pSrc++ + offset;
+    *pDst++ = (q31_t) clip_q63_to_q31(wide);
+
+    wide = (q63_t) *pSrc++ + offset;
+    *pDst++ = (q31_t) clip_q63_to_q31(wide);
+
+    wide = (q63_t) *pSrc++ + offset;
+    *pDst++ = (q31_t) clip_q63_to_q31(wide);
+
+    wide = (q63_t) *pSrc++ + offset;
+    *pDst++ = (q31_t) clip_q63_to_q31(wide);
+#endif
+  }
+
+  /* Leftover stage: the final blockSize mod 4 samples. */
+  remaining = blockSize & 0x3U;
+
+#else
+
+  /* No unrolling: the loop below walks the whole block. */
+  remaining = blockSize;
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
+  /* One sample per pass. */
+  for (; remaining > 0U; remaining--)
+  {
+#if defined (ARM_MATH_DSP)
+    /* DSP build: the output is __QADD(sample, offset). */
+    *pDst++ = __QADD(*pSrc++, offset);
+#else
+    /* Plain C build: widen, add, then clip back to q31_t. */
+    wide = (q63_t) *pSrc++ + offset;
+    *pDst++ = (q31_t) clip_q63_to_q31(wide);
+#endif
+  }
+
+}
+#endif /* defined(ARM_MATH_MVEI) */
+
+/**
+  @} end of BasicOffset group
+ */
--- a/libraries/cmsis/dsp/Source/BasicMathFunctions/arm_offset_q7.c
+++ b/libraries/cmsis/dsp/Source/BasicMathFunctions/arm_offset_q7.c
@@ -0,0 +1,162 @@
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS DSP Library
+ * Title:        arm_offset_q7.c
+ * Description:  Q7 vector offset
+ *
+ * $Date:        18. March 2019
+ * $Revision:    V1.6.0
+ *
+ * Target Processor: Cortex-M cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "arm_math.h"
+
+/**
+  @ingroup groupMath
+ */
+
+/**
+  @addtogroup BasicOffset
+  @{
+ */
+
+/**
+  @brief         Adds a constant offset to a Q7 vector.
+  @param[in]     pSrc       points to the input vector
+  @param[in]     offset     is the offset to be added
+  @param[out]    pDst       points to the output vector
+  @param[in]     blockSize  number of samples in each vector
+  @return        none
+
+  @par           Scaling and Overflow Behavior
+                   The function uses saturating arithmetic.
+                   Results outside of the allowable Q7 range [0x80 0x7F] are saturated.
+ */
+#if defined(ARM_MATH_MVEI)
+
+#include "arm_helium_utils.h"
+
+void arm_offset_q7(
+    const q7_t * pSrc,
+    q7_t   offset,
+    q7_t * pDst,
+    uint32_t blockSize)
+{
+    uint32_t  blkCnt;           /* full-vector iterations, then tail sample count */
+    q7x16_t vecSrc;
+
+    /* Main loop: blockSize / 16 iterations, 16 outputs per iteration */
+    blkCnt = blockSize >> 4;
+    while (blkCnt > 0U)
+    {
+        /*
+         * C = A + offset
+         * Load 16 samples, apply vqaddq with the scalar offset, store all 16 results.
+         */
+        vecSrc = vld1q(pSrc);
+        vst1q(pDst, vqaddq(vecSrc, offset));
+        /*
+         * One group of 16 samples done
+         */
+        blkCnt--;
+        /*
+         * Step both pointers past the 16 samples just processed
+         */
+        pSrc += 16;
+        pDst += 16;
+    }
+    /*
+     * Tail: the remaining blockSize & 0xF samples. Only the store is predicated (p0).
+     * NOTE(review): the vld1q load is not predicated - confirm that reading a full vector here is acceptable.
+     */
+    blkCnt = blockSize & 0xF;
+    if (blkCnt > 0U)
+    {
+        mve_pred16_t p0 = vctp8q(blkCnt);
+        vecSrc = vld1q(pSrc);
+        vstrbq_p(pDst, vqaddq(vecSrc, offset), p0);
+    }
+}
+
+#else
+void arm_offset_q7(
+  const q7_t * pSrc,
+        q7_t offset,
+        q7_t * pDst,
+        uint32_t blockSize)
+{
+        uint32_t blkCnt;                               /* Iterations left in the current loop */
+
+#if defined (ARM_MATH_LOOPUNROLL)
+
+#if defined (ARM_MATH_DSP)
+  q31_t offset_packed;                           /* offset replicated into a 32-bit word */
+
+  /* Build the packed word once with __PACKq7 so __QADD8 can add it to 4 samples per call */
+  offset_packed = __PACKq7(offset, offset, offset, offset);
+#endif
+
+  /* Unrolled loop: blockSize / 4 iterations, 4 outputs per iteration */
+  blkCnt = blockSize >> 2U;
+
+  while (blkCnt > 0U)
+  {
+    /* pDst[n] = pSrc[n] + offset, for 4 consecutive samples */
+
+#if defined (ARM_MATH_DSP)
+    /* Read 4 packed samples, add the packed offset with __QADD8, write 4 packed results. */
+    write_q7x4_ia (&pDst, __QADD8(read_q7x4_ia ((q7_t **) &pSrc), offset_packed));
+#else
+    *pDst++ = (q7_t) __SSAT(*pSrc++ + offset, 8);
+    *pDst++ = (q7_t) __SSAT(*pSrc++ + offset, 8);
+    *pDst++ = (q7_t) __SSAT(*pSrc++ + offset, 8);
+    *pDst++ = (q7_t) __SSAT(*pSrc++ + offset, 8);
+#endif
+
+    /* One group of 4 samples done */
+    blkCnt--;
+  }
+
+  /* The remaining blockSize % 4 samples are handled by the loop below */
+  blkCnt = blockSize % 0x4U;
+
+#else
+
+  /* No unrolling: the loop below processes all blockSize samples */
+  blkCnt = blockSize;
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
+  while (blkCnt > 0U)
+  {
+    /* pDst[n] = pSrc[n] + offset, one sample per iteration */
+
+    /* The sum is passed through __SSAT(..., 8) before narrowing to q7_t. */
+    *pDst++ = (q7_t) __SSAT((q15_t) *pSrc++ + offset, 8);
+
+    /* One sample done */
+    blkCnt--;
+  }
+
+}
+#endif /* defined(ARM_MATH_MVEI) */
+
+/**
+  @} end of BasicOffset group
+ */
--- a/libraries/cmsis/dsp/Source/BasicMathFunctions/arm_or_u16.c
+++ b/libraries/cmsis/dsp/Source/BasicMathFunctions/arm_or_u16.c
@@ -0,0 +1,137 @@
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS DSP Library
+ * Title:        arm_or_u16.c
+ * Description:  uint16_t bitwise inclusive OR
+ *
+ * $Date:        14 November 2019
+ * $Revision:    V1.6.0
+ *
+ * Target Processor: Cortex-M cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "arm_math.h"
+
+/**
+  @ingroup groupMath
+ */
+
+/**
+  @defgroup Or Vector bitwise inclusive OR
+
+  Compute the element-wise bitwise inclusive OR of two vectors.
+
+  There are separate functions for uint32_t, uint16_t, and uint8_t data types.
+ */
+
+/**
+  @addtogroup Or
+  @{
+ */
+
+/**
+  @brief         Compute the element-wise bitwise inclusive OR of two uint16_t vectors.
+  @param[in]     pSrcA      points to input vector A
+  @param[in]     pSrcB      points to input vector B
+  @param[out]    pDst       points to output vector
+  @param[in]     blockSize  number of samples in each vector
+  @return        none
+ */
+
+void arm_or_u16(
+    const uint16_t * pSrcA,
+    const uint16_t * pSrcB,
+          uint16_t * pDst,
+          uint32_t blockSize)
+{
+    uint32_t blkCnt;      /* Iterations left in the current loop */
+
+#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
+    q15x8_t vecSrcA, vecSrcB; /* NOTE(review): signed vector type used with uint16_t loads and vorrq_u16 - confirm this builds without lax vector conversions */
+
+    /* Main loop: blockSize / 8 iterations, 8 outputs per iteration */
+    blkCnt = blockSize >> 3;
+
+    while (blkCnt > 0U)
+    {
+        vecSrcA = vld1q(pSrcA);
+        vecSrcB = vld1q(pSrcB);
+
+        vst1q(pDst, vorrq_u16(vecSrcA, vecSrcB) );
+
+        pSrcA += 8;
+        pSrcB += 8;
+        pDst  += 8;
+
+        /* One group of 8 elements done */
+        blkCnt--;
+    }
+
+    /* Tail: remaining blockSize & 7 elements; only the store is predicated (p0), the two loads are full-vector */
+    blkCnt = blockSize & 7;
+
+    if (blkCnt > 0U)
+    {
+        mve_pred16_t p0 = vctp16q(blkCnt);
+        vecSrcA = vld1q(pSrcA);
+        vecSrcB = vld1q(pSrcB);
+        vstrhq_p(pDst, vorrq_u16(vecSrcA, vecSrcB), p0);
+    }
+#else
+#if defined(ARM_MATH_NEON) && !defined(ARM_MATH_AUTOVECTORIZE)
+    uint16x8_t vecA, vecB;
+
+    /* Main loop: blockSize / 8 iterations, 8 outputs per iteration */
+    blkCnt = blockSize >> 3U;
+
+    while (blkCnt > 0U)
+    {
+        vecA = vld1q_u16(pSrcA);
+        vecB = vld1q_u16(pSrcB);
+
+        vst1q_u16(pDst, vorrq_u16(vecA, vecB) );
+
+        pSrcA += 8;
+        pSrcB += 8;
+        pDst  += 8;
+
+        /* One group of 8 elements done */
+        blkCnt--;
+    }
+
+    /* Remaining blockSize & 7 elements are handled by the scalar loop below */
+    blkCnt = blockSize & 7;
+#else
+    /* No vector path: the scalar loop below processes all blockSize elements */
+    blkCnt = blockSize;
+#endif
+
+    while (blkCnt > 0U)
+    {
+        *pDst++ = (*pSrcA++)|(*pSrcB++);
+
+        /* One element done */
+        blkCnt--;
+    }
+#endif /* defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE) */
+}
+
+/**
+  @} end of Or group
+ */
--- a/libraries/cmsis/dsp/Source/BasicMathFunctions/arm_or_u32.c
+++ b/libraries/cmsis/dsp/Source/BasicMathFunctions/arm_or_u32.c
@@ -0,0 +1,128 @@
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS DSP Library
+ * Title:        arm_or_u32.c
+ * Description:  uint32_t bitwise inclusive OR
+ *
+ * $Date:        14 November 2019
+ * $Revision:    V1.6.0
+ *
+ * Target Processor: Cortex-M cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "arm_math.h"
+
+/**
+  @ingroup groupMath
+ */
+
+/**
+  @addtogroup Or
+  @{
+ */
+
+/**
+  @brief         Compute the element-wise bitwise inclusive OR of two uint32_t vectors.
+  @param[in]     pSrcA      points to input vector A
+  @param[in]     pSrcB      points to input vector B
+  @param[out]    pDst       points to output vector
+  @param[in]     blockSize  number of samples in each vector
+  @return        none
+ */
+
+void arm_or_u32(
+    const uint32_t * pSrcA,
+    const uint32_t * pSrcB,
+          uint32_t * pDst,
+          uint32_t blockSize)
+{
+    uint32_t blkCnt;      /* Iterations left in the current loop */
+
+#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
+    q31x4_t vecSrcA, vecSrcB; /* NOTE(review): signed vector type used with uint32_t loads and vorrq_u32 - confirm this builds without lax vector conversions */
+
+    /* Main loop: blockSize / 4 iterations, 4 outputs per iteration */
+    blkCnt = blockSize >> 2;
+
+    while (blkCnt > 0U)
+    {
+        vecSrcA = vld1q(pSrcA);
+        vecSrcB = vld1q(pSrcB);
+
+        vst1q(pDst, vorrq_u32(vecSrcA, vecSrcB) );
+
+        pSrcA += 4;
+        pSrcB += 4;
+        pDst  += 4;
+
+        /* One group of 4 elements done */
+        blkCnt--;
+    }
+
+    /* Tail: remaining blockSize & 3 elements; only the store is predicated (p0), the two loads are full-vector */
+    blkCnt = blockSize & 3;
+
+    if (blkCnt > 0U)
+    {
+        mve_pred16_t p0 = vctp32q(blkCnt);
+        vecSrcA = vld1q(pSrcA);
+        vecSrcB = vld1q(pSrcB);
+        vstrwq_p(pDst, vorrq_u32(vecSrcA, vecSrcB), p0);
+    }
+#else
+#if defined(ARM_MATH_NEON) && !defined(ARM_MATH_AUTOVECTORIZE)
+    uint32x4_t vecA, vecB;
+
+    /* Main loop: blockSize / 4 iterations, 4 outputs per iteration */
+    blkCnt = blockSize >> 2U;
+
+    while (blkCnt > 0U)
+    {
+        vecA = vld1q_u32(pSrcA);
+        vecB = vld1q_u32(pSrcB);
+
+        vst1q_u32(pDst, vorrq_u32(vecA, vecB) );
+
+        pSrcA += 4;
+        pSrcB += 4;
+        pDst  += 4;
+
+        /* One group of 4 elements done */
+        blkCnt--;
+    }
+
+    /* Remaining blockSize & 3 elements are handled by the scalar loop below */
+    blkCnt = blockSize & 3;
+#else
+    /* No vector path: the scalar loop below processes all blockSize elements */
+    blkCnt = blockSize;
+#endif
+
+    while (blkCnt > 0U)
+    {
+        *pDst++ = (*pSrcA++)|(*pSrcB++);
+
+        /* One element done */
+        blkCnt--;
+    }
+#endif /* defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE) */
+}
+/**
+  @} end of Or group
+ */
--- a/libraries/cmsis/dsp/Source/BasicMathFunctions/arm_or_u8.c
+++ b/libraries/cmsis/dsp/Source/BasicMathFunctions/arm_or_u8.c
@@ -0,0 +1,128 @@
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS DSP Library
+ * Title:        arm_or_u8.c
+ * Description:  uint8_t bitwise inclusive OR
+ *
+ * $Date:        14 November 2019
+ * $Revision:    V1.6.0
+ *
+ * Target Processor: Cortex-M cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "arm_math.h"
+
+/**
+  @ingroup groupMath
+ */
+
+/**
+  @addtogroup Or
+  @{
+ */
+
+/**
+  @brief         Compute the element-wise bitwise inclusive OR of two uint8_t vectors.
+  @param[in]     pSrcA      points to input vector A
+  @param[in]     pSrcB      points to input vector B
+  @param[out]    pDst       points to output vector
+  @param[in]     blockSize  number of samples in each vector
+  @return        none
+ */
+
+void arm_or_u8(
+    const uint8_t * pSrcA,
+    const uint8_t * pSrcB,
+          uint8_t * pDst,
+          uint32_t blockSize)
+{
+    uint32_t blkCnt;      /* Iterations left in the current loop */
+
+#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
+    q7x16_t vecSrcA, vecSrcB; /* NOTE(review): signed vector type used with uint8_t loads and vorrq_u8 - confirm this builds without lax vector conversions */
+
+    /* Main loop: blockSize / 16 iterations, 16 outputs per iteration */
+    blkCnt = blockSize >> 4;
+
+    while (blkCnt > 0U)
+    {
+        vecSrcA = vld1q(pSrcA);
+        vecSrcB = vld1q(pSrcB);
+
+        vst1q(pDst, vorrq_u8(vecSrcA, vecSrcB) );
+
+        pSrcA += 16;
+        pSrcB += 16;
+        pDst  += 16;
+
+        /* One group of 16 elements done */
+        blkCnt--;
+    }
+
+    /* Tail: remaining blockSize & 0xF elements; only the store is predicated (p0), the two loads are full-vector */
+    blkCnt = blockSize & 0xF;
+
+    if (blkCnt > 0U)
+    {
+        mve_pred16_t p0 = vctp8q(blkCnt);
+        vecSrcA = vld1q(pSrcA);
+        vecSrcB = vld1q(pSrcB);
+        vstrbq_p(pDst, vorrq_u8(vecSrcA, vecSrcB), p0);
+    }
+#else
+#if defined(ARM_MATH_NEON) && !defined(ARM_MATH_AUTOVECTORIZE)
+    uint8x16_t vecA, vecB;
+
+    /* Main loop: blockSize / 16 iterations, 16 outputs per iteration */
+    blkCnt = blockSize >> 4U;
+
+    while (blkCnt > 0U)
+    {
+        vecA = vld1q_u8(pSrcA);
+        vecB = vld1q_u8(pSrcB);
+
+        vst1q_u8(pDst, vorrq_u8(vecA, vecB) );
+
+        pSrcA += 16;
+        pSrcB += 16;
+        pDst  += 16;
+
+        /* One group of 16 elements done */
+        blkCnt--;
+    }
+
+    /* Remaining blockSize & 0xF elements are handled by the scalar loop below */
+    blkCnt = blockSize & 0xF;
+#else
+    /* No vector path: the scalar loop below processes all blockSize elements */
+    blkCnt = blockSize;
+#endif
+
+    while (blkCnt > 0U)
+    {
+        *pDst++ = (*pSrcA++)|(*pSrcB++);
+
+        /* One element done */
+        blkCnt--;
+    }
+#endif /* defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE) */
+}
+/**
+  @} end of Or group
+ */
--- a/libraries/cmsis/dsp/Source/BasicMathFunctions/arm_scale_f32.c
+++ b/libraries/cmsis/dsp/Source/BasicMathFunctions/arm_scale_f32.c
@@ -0,0 +1,216 @@
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS DSP Library
+ * Title:        arm_scale_f32.c
+ * Description:  Multiplies a floating-point vector by a scalar
+ *
+ * $Date:        18. March 2019
+ * $Revision:    V1.6.0
+ *
+ * Target Processor: Cortex-M cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "arm_math.h"
+
+/**
+  @ingroup groupMath
+ */
+
+/**
+  @defgroup BasicScale Vector Scale
+
+  Multiply a vector by a scalar value.  For floating-point data, the algorithm used is:
+
+  <pre>
+      pDst[n] = pSrc[n] * scale,   0 <= n < blockSize.
+  </pre>
+
+  In the fixed-point Q7, Q15, and Q31 functions, <code>scale</code> is represented by
+  a fractional multiplication <code>scaleFract</code> and an arithmetic shift <code>shift</code>.
+  The shift allows the gain of the scaling operation to exceed 1.0.
+  The algorithm used with fixed-point data is:
+
+  <pre>
+      pDst[n] = (pSrc[n] * scaleFract) << shift,   0 <= n < blockSize.
+  </pre>
+
+  The overall scale factor applied to the fixed-point data is
+  <pre>
+      scale = scaleFract * 2^shift.
+  </pre>
+
+  The functions support in-place computation allowing the source and destination
+  pointers to reference the same memory buffer.
+ */
+
+/**
+  @addtogroup BasicScale
+  @{
+ */
+
+/**
+  @brief         Multiplies a floating-point vector by a scalar.
+  @param[in]     pSrc       points to the input vector
+  @param[in]     scale      scale factor to be applied
+  @param[out]    pDst       points to the output vector
+  @param[in]     blockSize  number of samples in each vector
+  @return        none
+ */
+
+#if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE)
+
+#include "arm_helium_utils.h"
+
+void arm_scale_f32(
+  const float32_t * pSrc,
+        float32_t scale,
+        float32_t * pDst,
+        uint32_t blockSize)
+{
+        uint32_t blkCnt;                               /* Full-vector iterations, then tail sample count */
+
+    f32x4_t vec1;
+    f32x4_t res;
+
+    /* Main loop: blockSize / 4 iterations, 4 outputs per iteration */
+    blkCnt = blockSize >> 2U;
+
+    while (blkCnt > 0U)
+    {
+        /* C = A * scale */
+
+        /* Load 4 samples, multiply by the scalar, and store the 4 results in the destination buffer. */
+        vec1 = vld1q(pSrc);
+        res = vmulq(vec1,scale);
+        vst1q(pDst, res);
+
+        /* Step both pointers past the 4 samples just processed */
+        pSrc += 4;
+        pDst += 4;
+
+        /* One group of 4 samples done */
+        blkCnt--;
+    }
+
+    /* Tail: remaining blockSize & 0x3 samples; only the store is predicated (p0), the load is full-vector */
+    blkCnt = blockSize & 0x3;
+
+    if (blkCnt > 0U)
+    {
+        mve_pred16_t p0 = vctp32q(blkCnt);
+        vec1 = vld1q((float32_t const *) pSrc);
+        vstrwq_p(pDst, vmulq(vec1, scale), p0);
+    }
+
+
+}
+
+#else
+void arm_scale_f32(
+  const float32_t *pSrc,
+        float32_t scale,
+        float32_t *pDst,
+        uint32_t blockSize)
+{
+  uint32_t blkCnt;                               /* Iterations left in the current loop */
+#if defined(ARM_MATH_NEON_EXPERIMENTAL)
+    f32x4_t vec1;
+    f32x4_t res;
+
+    /* NEON loop: blockSize / 4 iterations, 4 outputs per iteration */
+    blkCnt = blockSize >> 2U;
+
+    while (blkCnt > 0U)
+    {
+        /* C = A * scale */
+
+    	/* Load 4 samples, multiply by scale broadcast with vdupq_n_f32, store the 4 results. */
+        vec1 = vld1q_f32(pSrc);
+        res = vmulq_f32(vec1, vdupq_n_f32(scale));
+        vst1q_f32(pDst, res);
+
+        /* Step both pointers past the 4 samples just processed */
+        pSrc += 4;
+        pDst += 4;
+
+        /* One group of 4 samples done */
+        blkCnt--;
+    }
+
+    /* Remaining blockSize & 0x3 samples are handled by the scalar loop at the end */
+    blkCnt = blockSize & 0x3;
+
+#else
+#if defined (ARM_MATH_LOOPUNROLL)
+
+  /* Unrolled loop: blockSize / 4 iterations, 4 outputs per iteration */
+  blkCnt = blockSize >> 2U;
+
+  while (blkCnt > 0U)
+  {
+    float32_t in1, in2, in3, in4;
+
+    /* C = A * scale */
+
+    /* Compute all four products first, then write them out in order. */
+    in1 = (*pSrc++) * scale;
+
+    in2 = (*pSrc++) * scale;
+
+    in3 = (*pSrc++) * scale;
+
+    in4 = (*pSrc++) * scale;
+
+    *pDst++ = in1;
+    *pDst++ = in2;
+    *pDst++ = in3;
+    *pDst++ = in4;
+
+    /* One group of 4 samples done */
+    blkCnt--;
+  }
+
+  /* The remaining blockSize % 4 samples are handled by the loop below */
+  blkCnt = blockSize % 0x4U;
+
+#else
+
+  /* No unrolling: the loop below processes all blockSize samples */
+  blkCnt = blockSize;
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+#endif /* #if defined(ARM_MATH_NEON_EXPERIMENTAL) */
+
+  while (blkCnt > 0U)
+  {
+    /* C = A * scale */
+
+    /* One sample per iteration: multiply and store. */
+    *pDst++ = (*pSrc++) * scale;
+
+    /* One sample done */
+    blkCnt--;
+  }
+
+}
+#endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
+
+/**
+  @} end of BasicScale group
+ */
--- a/libraries/cmsis/dsp/Source/BasicMathFunctions/arm_scale_q15.c
+++ b/libraries/cmsis/dsp/Source/BasicMathFunctions/arm_scale_q15.c
@@ -0,0 +1,201 @@
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS DSP Library
+ * Title:        arm_scale_q15.c
+ * Description:  Multiplies a Q15 vector by a scalar
+ *
+ * $Date:        18. March 2019
+ * $Revision:    V1.6.0
+ *
+ * Target Processor: Cortex-M cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "arm_math.h"
+
+/**
+  @ingroup groupMath
+ */
+
+/**
+  @addtogroup BasicScale
+  @{
+ */
+
+/**
+  @brief         Multiplies a Q15 vector by a scalar.
+  @param[in]     pSrc       points to the input vector
+  @param[in]     scaleFract fractional portion of the scale value
+  @param[in]     shift      number of bits to shift the result by
+  @param[out]    pDst       points to the output vector
+  @param[in]     blockSize  number of samples in each vector
+  @return        none
+
+  @par           Scaling and Overflow Behavior
+                   The input data <code>*pSrc</code> and <code>scaleFract</code> are in 1.15 format.
+                   These are multiplied to yield a 2.30 intermediate result and this is shifted with saturation to 1.15 format.
+ */
+
+#if defined(ARM_MATH_MVEI)
+
+#include "arm_helium_utils.h"
+
+void arm_scale_q15(
+    const q15_t * pSrc,
+    q15_t   scaleFract,
+    int8_t  shift,
+    q15_t * pDst,
+    uint32_t blockSize)
+{
+    uint32_t  blkCnt;           /* full-vector iterations, then tail sample count */
+    q15x8_t vecSrc;
+    q15x8_t vecDst;
+
+
+    /* Main loop: blockSize / 8 iterations, 8 outputs per iteration */
+    blkCnt = blockSize >> 3;
+
+    while (blkCnt > 0U)
+    {
+        /*
+         * C = A * scale
+         * vmulhq by scaleFract broadcast to all lanes, then vqshlq_r by (shift + 1), then store 8 results.
+         */
+        vecSrc = vld1q(pSrc);
+        vecDst = vmulhq(vecSrc, vdupq_n_s16(scaleFract));
+        vecDst = vqshlq_r(vecDst, shift + 1);
+        vst1q(pDst, vecDst);
+        /*
+         * One group of 8 samples done
+         */
+        blkCnt--;
+        /*
+         * Step both pointers past the 8 samples just processed
+         */
+        pSrc += 8;
+        pDst += 8;
+    }
+    /*
+     * Tail: remaining blockSize & 7 samples; only the store is predicated (p0), the load is full-vector
+     */
+    blkCnt = blockSize & 7;
+    if (blkCnt > 0U)
+    {
+        mve_pred16_t p0 = vctp16q(blkCnt);; /* NOTE(review): the second ';' is a stray empty statement */
+        vecSrc = vld1q(pSrc);
+        vecDst = vmulhq(vecSrc, vdupq_n_s16(scaleFract));
+        vecDst = vqshlq_r(vecDst, shift + 1);
+        vstrhq_p(pDst, vecDst, p0);
+    }
+
+}
+
+
+#else
+void arm_scale_q15(
+  const q15_t *pSrc,
+        q15_t scaleFract,
+        int8_t shift,
+        q15_t *pDst,
+        uint32_t blockSize)
+{
+        uint32_t blkCnt;                               /* Iterations left in the current loop */
+        int8_t kShift = 15 - shift;                    /* Right shift applied to every product */
+
+#if defined (ARM_MATH_LOOPUNROLL)
+#if defined (ARM_MATH_DSP)
+  q31_t inA1, inA2;
+  q31_t out1, out2, out3, out4;                  /* Products, then shifted products */
+  q15_t in1, in2, in3, in4;                      /* Saturated 16-bit results (despite the "in" names) */
+#endif
+#endif
+
+#if defined (ARM_MATH_LOOPUNROLL)
+
+  /* Unrolled loop: blockSize / 4 iterations, 4 outputs per iteration */
+  blkCnt = blockSize >> 2U;
+
+  while (blkCnt > 0U)
+  {
+    /* C = A * scale */
+
+#if defined (ARM_MATH_DSP)
+    /* Two packed reads; each 32-bit word carries two q15 samples */
+    inA1 = read_q15x2_ia ((q15_t **) &pSrc);
+    inA2 = read_q15x2_ia ((q15_t **) &pSrc);
+
+    /* Multiply the upper and lower 16-bit halves of each packed word
+     * by scaleFract, keeping each product in a q31_t */
+    out1 = (q31_t) ((q15_t) (inA1 >> 16) * scaleFract);
+    out2 = (q31_t) ((q15_t) (inA1      ) * scaleFract);
+    out3 = (q31_t) ((q15_t) (inA2 >> 16) * scaleFract);
+    out4 = (q31_t) ((q15_t) (inA2      ) * scaleFract);
+
+    /* Shift each product right by kShift (15 - shift) */
+    out1 = out1 >> kShift;
+    out2 = out2 >> kShift;
+    out3 = out3 >> kShift;
+    out4 = out4 >> kShift;
+
+    /* Saturate each shifted product to 16 bits */
+    in1 = (q15_t) (__SSAT(out1, 16));
+    in2 = (q15_t) (__SSAT(out2, 16));
+    in3 = (q15_t) (__SSAT(out3, 16));
+    in4 = (q15_t) (__SSAT(out4, 16));
+
+    /* Pack two results per 32-bit word with __PKHBT and store */
+    write_q15x2_ia (&pDst, __PKHBT(in2, in1, 16));
+    write_q15x2_ia (&pDst, __PKHBT(in4, in3, 16));
+#else
+    *pDst++ = (q15_t) (__SSAT(((q31_t) *pSrc++ * scaleFract) >> kShift, 16));
+    *pDst++ = (q15_t) (__SSAT(((q31_t) *pSrc++ * scaleFract) >> kShift, 16));
+    *pDst++ = (q15_t) (__SSAT(((q31_t) *pSrc++ * scaleFract) >> kShift, 16));
+    *pDst++ = (q15_t) (__SSAT(((q31_t) *pSrc++ * scaleFract) >> kShift, 16));
+#endif
+
+    /* One group of 4 samples done */
+    blkCnt--;
+  }
+
+  /* The remaining blockSize % 4 samples are handled by the loop below */
+  blkCnt = blockSize % 0x4U;
+
+#else
+
+  /* No unrolling: the loop below processes all blockSize samples */
+  blkCnt = blockSize;
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
+  while (blkCnt > 0U)
+  {
+    /* C = A * scale */
+
+    /* Widen to q31_t, multiply, shift right by kShift, saturate to 16 bits, store. */
+    *pDst++ = (q15_t) (__SSAT(((q31_t) *pSrc++ * scaleFract) >> kShift, 16));
+
+    /* One sample done */
+    blkCnt--;
+  }
+
+}
+#endif /* defined(ARM_MATH_MVEI) */
+
+/**
+  @} end of BasicScale group
+ */
--- a/libraries/cmsis/dsp/Source/BasicMathFunctions/arm_scale_q31.c
+++ b/libraries/cmsis/dsp/Source/BasicMathFunctions/arm_scale_q31.c
@@ -0,0 +1,244 @@
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS DSP Library
+ * Title:        arm_scale_q31.c
+ * Description:  Multiplies a Q31 vector by a scalar
+ *
+ * $Date:        18. March 2019
+ * $Revision:    V1.6.0
+ *
+ * Target Processor: Cortex-M cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "arm_math.h"
+
+/**
+  @ingroup groupMath
+ */
+
+/**
+  @addtogroup BasicScale
+  @{
+ */
+
+/**
+  @brief         Multiplies a Q31 vector by a scalar.
+  @param[in]     pSrc       points to the input vector
+  @param[in]     scaleFract fractional portion of the scale value
+  @param[in]     shift      number of bits to shift the result by
+  @param[out]    pDst       points to the output vector
+  @param[in]     blockSize  number of samples in each vector
+  @return        none
+
+  @par           Scaling and Overflow Behavior
+                   The input data <code>*pSrc</code> and <code>scaleFract</code> are in 1.31 format.
+                   These are multiplied to yield a 2.62 intermediate result and this is shifted with saturation to 1.31 format.
+ */
+
+#if defined(ARM_MATH_MVEI)
+
+#include "arm_helium_utils.h"
+
+void arm_scale_q31(
+    const q31_t * pSrc,
+    q31_t   scaleFract,
+    int8_t  shift,
+    q31_t * pDst,
+    uint32_t blockSize)
+{
+    uint32_t  blkCnt;           /* full-vector iterations, then tail sample count */
+    q31x4_t vecSrc;
+    q31x4_t vecDst;
+
+    /* Main loop: blockSize / 4 iterations, 4 outputs per iteration */
+    blkCnt = blockSize >> 2;
+    while (blkCnt > 0U)
+    {
+        /*
+         * C = A * scale
+         * vmulhq by scaleFract broadcast to all lanes, then vqshlq_r by (shift + 1), then store 4 results.
+         */
+        vecSrc = vld1q(pSrc);
+        vecDst = vmulhq(vecSrc, vdupq_n_s32(scaleFract));
+        vecDst = vqshlq_r(vecDst, shift + 1);
+        vst1q(pDst, vecDst);
+        /*
+         * One group of 4 samples done
+         */
+        blkCnt--;
+        /*
+         * Step both pointers past the 4 samples just processed
+         */
+        pSrc += 4;
+        pDst += 4;
+    }
+    /*
+     * Tail: remaining blockSize & 3 samples; only the store is predicated (p0), the load is full-vector
+     */
+    blkCnt = blockSize & 3;
+    if (blkCnt > 0U)
+    {
+        mve_pred16_t p0 = vctp32q(blkCnt);
+        vecSrc = vld1q(pSrc);
+        vecDst = vmulhq(vecSrc, vdupq_n_s32(scaleFract));
+        vecDst = vqshlq_r(vecDst, shift + 1);
+        vstrwq_p(pDst, vecDst, p0);
+    }
+}
+
+#else
+void arm_scale_q31(
+  const q31_t *pSrc,
+        q31_t scaleFract,
+        int8_t shift,
+        q31_t *pDst,
+        uint32_t blockSize)
+{
+        uint32_t blkCnt;                               /* Iterations left in the current loop */
+        q31_t in, out;                                 /* Product high word, and shifted result */
+        int8_t kShift = shift + 1;                     /* Left shift when non-negative, else right shift by -kShift */
+        int8_t sign = (kShift & 0x80);                 /* Nonzero when bit 7 of kShift is set, i.e. kShift is negative */
+
+#if defined (ARM_MATH_LOOPUNROLL)
+
+  /* Unrolled loops: blockSize / 4 iterations, 4 outputs per iteration */
+  blkCnt = blockSize >> 2U;
+
+  if (sign == 0U)
+  {
+    while (blkCnt > 0U)
+    {
+      /* C = A * scale, kShift >= 0: left shift with a saturation check */
+
+      /* NOTE(review): left-shifting a negative 'in' is undefined in ISO C; this relies on compiler behavior. */
+      in = *pSrc++;                                /* read one input from source */
+      in = ((q63_t) in * scaleFract) >> 32;        /* keep the upper 32 bits of the 64-bit product */
+      out = in << kShift;                          /* apply the left shift */
+      if (in != (out >> kShift))                   /* shifting back did not restore 'in': bits were lost */
+        out = 0x7FFFFFFF ^ (in >> 31);
+      *pDst++ = out;                               /* store result to destination */
+
+      in = *pSrc++;
+      in = ((q63_t) in * scaleFract) >> 32;
+      out = in << kShift;
+      if (in != (out >> kShift))
+        out = 0x7FFFFFFF ^ (in >> 31);
+      *pDst++ = out;
+
+      in = *pSrc++;
+      in = ((q63_t) in * scaleFract) >> 32;
+      out = in << kShift;
+      if (in != (out >> kShift))
+        out = 0x7FFFFFFF ^ (in >> 31);
+      *pDst++ = out;
+
+      in = *pSrc++;
+      in = ((q63_t) in * scaleFract) >> 32;
+      out = in << kShift;
+      if (in != (out >> kShift))
+        out = 0x7FFFFFFF ^ (in >> 31);
+      *pDst++ = out;
+
+      /* One group of 4 samples done */
+      blkCnt--;
+    }
+  }
+  else
+  {
+    while (blkCnt > 0U)
+    {
+      /* C = A * scale, kShift < 0: right shift, no saturation check in this branch */
+
+      /* Same multiply as the other branch, followed by a right shift by -kShift. */
+      in = *pSrc++;                                /* read one input from source */
+      in = ((q63_t) in * scaleFract) >> 32;        /* keep the upper 32 bits of the 64-bit product */
+      out = in >> -kShift;                         /* apply the right shift */
+      *pDst++ = out;                               /* store result to destination */
+
+      in = *pSrc++;
+      in = ((q63_t) in * scaleFract) >> 32;
+      out = in >> -kShift;
+      *pDst++ = out;
+
+      in = *pSrc++;
+      in = ((q63_t) in * scaleFract) >> 32;
+      out = in >> -kShift;
+      *pDst++ = out;
+
+      in = *pSrc++;
+      in = ((q63_t) in * scaleFract) >> 32;
+      out = in >> -kShift;
+      *pDst++ = out;
+
+      /* One group of 4 samples done */
+      blkCnt--;
+    }
+  }
+
+  /* The remaining blockSize % 4 samples are handled by the loops below */
+  blkCnt = blockSize % 0x4U;
+
+#else
+
+  /* No unrolling: the loops below process all blockSize samples */
+  blkCnt = blockSize;
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
+  if (sign == 0U)
+  {
+    while (blkCnt > 0U)
+    {
+      /* C = A * scale, kShift >= 0: left shift with a saturation check */
+
+      /* One sample per iteration: multiply, shift left, replace with 0x7FFFFFFF ^ (in >> 31) if bits were lost. */
+      in = *pSrc++;
+      in = ((q63_t) in * scaleFract) >> 32;
+      out = in << kShift;
+      if (in != (out >> kShift))
+          out = 0x7FFFFFFF ^ (in >> 31);
+      *pDst++ = out;
+
+      /* One sample done */
+      blkCnt--;
+    }
+  }
+  else
+  {
+    while (blkCnt > 0U)
+    {
+      /* C = A * scale, kShift < 0: right shift, no saturation check in this branch */
+
+      /* One sample per iteration: multiply, then shift right by -kShift. */
+      in = *pSrc++;
+      in = ((q63_t) in * scaleFract) >> 32;
+      out = in >> -kShift;
+      *pDst++ = out;
+
+      /* One sample done */
+      blkCnt--;
+    }
+  }
+
+}
+#endif /* defined(ARM_MATH_MVEI) */
+
+/**
+  @} end of BasicScale group
+ */
--- a/libraries/cmsis/dsp/Source/BasicMathFunctions/arm_scale_q7.c
+++ b/libraries/cmsis/dsp/Source/BasicMathFunctions/arm_scale_q7.c
@@ -0,0 +1,186 @@
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS DSP Library
+ * Title:        arm_scale_q7.c
+ * Description:  Multiplies a Q7 vector by a scalar
+ *
+ * $Date:        18. March 2019
+ * $Revision:    V1.6.0
+ *
+ * Target Processor: Cortex-M cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "arm_math.h"
+
+/**
+  @ingroup groupMath
+ */
+
+/**
+  @addtogroup BasicScale
+  @{
+ */
+
+/**
+  @brief         Multiplies a Q7 vector by a scalar.
+  @param[in]     pSrc       points to the input vector
+  @param[in]     scaleFract fractional portion of the scale value
+  @param[in]     shift      number of bits to shift the result by
+  @param[out]    pDst       points to the output vector
+  @param[in]     blockSize  number of samples in each vector
+  @return        none
+
+  @par           Scaling and Overflow Behavior
+                   The input data <code>*pSrc</code> and <code>scaleFract</code> are in 1.7 format.
+                   These are multiplied to yield a 2.14 intermediate result and this is shifted with saturation to 1.7 format.
+ */
+
+#if defined(ARM_MATH_MVEI)
+
+#include "arm_helium_utils.h"
+
+
+void arm_scale_q7(
+    const q7_t * pSrc,
+    q7_t   scaleFract,
+    int8_t  shift,
+    q7_t * pDst,
+    uint32_t blockSize)
+{
+    uint32_t  blkCnt;           /* Loop counter: 16-element groups first, then the tail count */
+    q7x16_t vecSrc;
+    q7x16_t vecDst;
+
+
+    /* Compute 16 outputs at a time */
+    blkCnt = blockSize >> 4;
+
+    while (blkCnt > 0U)
+    {
+        /*
+         * C = A * scale
+         * Multiply by the broadcast scaleFract (vmulhq), shift by (shift + 1) with vqshlq_r, then store.
+         */
+        vecSrc = vld1q(pSrc);
+        vecDst = vmulhq(vecSrc, vdupq_n_s8(scaleFract));
+        vecDst = vqshlq_r(vecDst, shift + 1);
+        vst1q(pDst, vecDst);
+        /*
+         * Decrement the loop counter (one iteration per 16 elements)
+         */
+        blkCnt--;
+        /*
+         * Advance the source and destination pointers by one vector (16 elements)
+         */
+        pSrc += 16;
+        pDst += 16;
+    }
+    /*
+     * Tail: process the remaining (blockSize & 0xF) elements; the store is predicated by p0
+     */
+    blkCnt = blockSize & 0xF;
+    if (blkCnt > 0U)
+    {
+        mve_pred16_t p0 = vctp8q(blkCnt);
+        vecSrc = vld1q(pSrc); /* NOTE(review): full-width load, not predicated - confirm reading past the last blkCnt elements is safe */
+        vecDst = vmulhq(vecSrc, vdupq_n_s8(scaleFract));
+        vecDst = vqshlq_r(vecDst, shift + 1);
+        vstrbq_p(pDst, vecDst, p0);
+    }
+
+}
+
+#else
+void arm_scale_q7(
+  const q7_t * pSrc,
+        q7_t scaleFract,
+        int8_t shift,
+        q7_t * pDst,
+        uint32_t blockSize)
+{
+        uint32_t blkCnt;                               /* Loop counter */
+        int8_t kShift = 7 - shift;                     /* Right-shift count applied to each product; NOTE(review): negative (invalid shift count) if shift > 7 - confirm callers keep shift <= 7 */
+
+#if defined (ARM_MATH_LOOPUNROLL)
+
+#if defined (ARM_MATH_DSP)
+  q7_t in1,  in2,  in3,  in4;                    /* Temporary input variables */
+  q7_t out1, out2, out3, out4;                   /* Temporary output variables */
+#endif
+
+  /* Loop unrolling: Compute 4 outputs at a time */
+  blkCnt = blockSize >> 2U;
+
+  while (blkCnt > 0U)
+  {
+    /* C = A * scale */
+
+#if defined (ARM_MATH_DSP)
+    /* Reading 4 inputs from memory */
+    in1 = *pSrc++;
+    in2 = *pSrc++;
+    in3 = *pSrc++;
+    in4 = *pSrc++;
+
+    /* Multiply each input by scaleFract, shift right by kShift and saturate to 8 bits. */
+    out1 = (q7_t) (__SSAT(((in1) * scaleFract) >> kShift, 8));
+    out2 = (q7_t) (__SSAT(((in2) * scaleFract) >> kShift, 8));
+    out3 = (q7_t) (__SSAT(((in3) * scaleFract) >> kShift, 8));
+    out4 = (q7_t) (__SSAT(((in4) * scaleFract) >> kShift, 8));
+
+    /* Pack and store result in destination buffer (in single write) */
+    write_q7x4_ia (&pDst, __PACKq7(out1, out2, out3, out4));
+#else
+    *pDst++ = (q7_t) (__SSAT((((q15_t) *pSrc++ * scaleFract) >> kShift), 8));
+    *pDst++ = (q7_t) (__SSAT((((q15_t) *pSrc++ * scaleFract) >> kShift), 8));
+    *pDst++ = (q7_t) (__SSAT((((q15_t) *pSrc++ * scaleFract) >> kShift), 8));
+    *pDst++ = (q7_t) (__SSAT((((q15_t) *pSrc++ * scaleFract) >> kShift), 8));
+#endif
+
+    /* Decrement loop counter */
+    blkCnt--;
+  }
+
+  /* Loop unrolling: Compute the remaining (blockSize % 4) outputs in the loop below */
+  blkCnt = blockSize % 0x4U;
+
+#else
+
+  /* Initialize blkCnt with number of samples */
+  blkCnt = blockSize;
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
+  while (blkCnt > 0U)
+  {
+    /* C = A * scale */
+
+    /* Multiply input by scaleFract, shift right by kShift, saturate to 8 bits and store in destination buffer. */
+    *pDst++ = (q7_t) (__SSAT((((q15_t) *pSrc++ * scaleFract) >> kShift), 8));
+
+    /* Decrement loop counter */
+    blkCnt--;
+  }
+
+}
+#endif /* defined(ARM_MATH_MVEI) */
+
+/**
+  @} end of BasicScale group
+ */
--- a/libraries/cmsis/dsp/Source/BasicMathFunctions/arm_shift_q15.c
+++ b/libraries/cmsis/dsp/Source/BasicMathFunctions/arm_shift_q15.c
@@ -0,0 +1,251 @@
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS DSP Library
+ * Title:        arm_shift_q15.c
+ * Description:  Shifts the elements of a Q15 vector by a specified number of bits
+ *
+ * $Date:        18. March 2019
+ * $Revision:    V1.6.0
+ *
+ * Target Processor: Cortex-M cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "arm_math.h"
+
+/**
+  @ingroup groupMath
+ */
+
+/**
+  @addtogroup BasicShift
+  @{
+ */
+
+/**
+  @brief         Shifts the elements of a Q15 vector a specified number of bits
+  @param[in]     pSrc       points to the input vector
+  @param[in]     shiftBits  number of bits to shift.  A positive value shifts left; a negative value shifts right.
+  @param[out]    pDst       points to the output vector
+  @param[in]     blockSize  number of samples in each vector
+  @return        none
+
+  @par           Scaling and Overflow Behavior
+                   The function uses saturating arithmetic.
+                   Results outside of the allowable Q15 range [0x8000 0x7FFF] are saturated.
+ */
+
+#if defined(ARM_MATH_MVEI)
+
+#include "arm_helium_utils.h"
+
+void arm_shift_q15(
+    const q15_t * pSrc,
+    int8_t shiftBits,
+    q15_t * pDst,
+    uint32_t blockSize)
+{
+    uint32_t  blkCnt;           /* Loop counter: 8-element groups first, then the tail count */
+    q15x8_t vecSrc;
+    q15x8_t vecDst;
+
+    /* Compute 8 outputs at a time */
+    blkCnt = blockSize >> 3;
+    while (blkCnt > 0U)
+    {
+        /*
+         * C = A (>> or <<) shiftBits
+         * Shift the input by the signed count shiftBits (vqshlq_r) and store the result in the destination buffer.
+         */
+        vecSrc = vld1q(pSrc);
+        vecDst = vqshlq_r(vecSrc, shiftBits);
+        vst1q(pDst, vecDst);
+        /*
+         * Decrement the loop counter (one iteration per 8 elements)
+         */
+        blkCnt--;
+        /*
+         * Advance the source and destination pointers by one vector (8 elements)
+         */
+        pSrc += 8;
+        pDst += 8;
+    }
+    /*
+     * Tail: process the remaining (blockSize & 7) elements; the store is predicated by p0
+     */
+    blkCnt = blockSize & 7;
+    if (blkCnt > 0U)
+    {
+        mve_pred16_t p0 = vctp16q(blkCnt);
+        vecSrc = vld1q(pSrc); /* NOTE(review): full-width load, not predicated - confirm reading past the last blkCnt elements is safe */
+        vecDst = vqshlq_r(vecSrc, shiftBits);
+        vstrhq_p(pDst, vecDst, p0);
+    }
+}
+
+#else
+void arm_shift_q15(
+  const q15_t * pSrc,
+        int8_t shiftBits,
+        q15_t * pDst,
+        uint32_t blockSize)
+{
+        uint32_t blkCnt;                               /* Loop counter */
+        uint8_t sign = (shiftBits & 0x80);             /* Sign bit of shiftBits: non-zero when shiftBits is negative */
+
+#if defined (ARM_MATH_LOOPUNROLL)
+
+#if defined (ARM_MATH_DSP)
+  q15_t in1, in2;                                /* Temporary input variables */
+#endif
+
+  /* Loop unrolling: Compute 4 outputs at a time */
+  blkCnt = blockSize >> 2U;
+
+  /* If the shift value is non-negative (sign bit clear) then do left shift, else right shift */
+  if (sign == 0U)
+  {
+    while (blkCnt > 0U)
+    {
+      /* C = A << shiftBits */
+
+#if defined (ARM_MATH_DSP)
+      /* read 2 samples from source */
+      in1 = *pSrc++;
+      in2 = *pSrc++;
+
+      /* Shift left, saturate each result to 16 bits, pack the pair and store it in the destination buffer. */
+#ifndef ARM_MATH_BIG_ENDIAN
+      write_q15x2_ia (&pDst, __PKHBT(__SSAT((in1 << shiftBits), 16),
+                                     __SSAT((in2 << shiftBits), 16), 16));
+#else
+      write_q15x2_ia (&pDst, __PKHBT(__SSAT((in2 << shiftBits), 16),
+                                      __SSAT((in1 << shiftBits), 16), 16));
+#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
+
+      /* read 2 samples from source */
+      in1 = *pSrc++;
+      in2 = *pSrc++;
+
+#ifndef ARM_MATH_BIG_ENDIAN
+      write_q15x2_ia (&pDst, __PKHBT(__SSAT((in1 << shiftBits), 16),
+                                     __SSAT((in2 << shiftBits), 16), 16));
+#else
+      write_q15x2_ia (&pDst, __PKHBT(__SSAT((in2 << shiftBits), 16),
+                                     __SSAT((in1 << shiftBits), 16), 16));
+#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
+
+#else
+      *pDst++ = __SSAT(((q31_t) *pSrc++ << shiftBits), 16);
+      *pDst++ = __SSAT(((q31_t) *pSrc++ << shiftBits), 16);
+      *pDst++ = __SSAT(((q31_t) *pSrc++ << shiftBits), 16);
+      *pDst++ = __SSAT(((q31_t) *pSrc++ << shiftBits), 16);
+#endif
+
+      /* Decrement loop counter */
+      blkCnt--;
+    }
+  }
+  else
+  {
+    while (blkCnt > 0U)
+    {
+      /* C = A >> -shiftBits (shiftBits is negative here) */
+
+#if defined (ARM_MATH_DSP)
+      /* read 2 samples from source */
+      in1 = *pSrc++;
+      in2 = *pSrc++;
+
+      /* Shift right by -shiftBits (no saturation step), pack the pair and store it in the destination buffer. */
+#ifndef ARM_MATH_BIG_ENDIAN
+      write_q15x2_ia (&pDst, __PKHBT((in1 >> -shiftBits),
+                                     (in2 >> -shiftBits), 16));
+#else
+      write_q15x2_ia (&pDst, __PKHBT((in2 >> -shiftBits),
+                                     (in1 >> -shiftBits), 16));
+#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
+
+      /* read 2 samples from source */
+      in1 = *pSrc++;
+      in2 = *pSrc++;
+
+#ifndef ARM_MATH_BIG_ENDIAN
+      write_q15x2_ia (&pDst, __PKHBT((in1 >> -shiftBits),
+                                     (in2 >> -shiftBits), 16));
+#else
+      write_q15x2_ia (&pDst, __PKHBT((in2 >> -shiftBits),
+                                     (in1 >> -shiftBits), 16));
+#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
+
+#else
+      *pDst++ = (*pSrc++ >> -shiftBits);
+      *pDst++ = (*pSrc++ >> -shiftBits);
+      *pDst++ = (*pSrc++ >> -shiftBits);
+      *pDst++ = (*pSrc++ >> -shiftBits);
+#endif
+
+      /* Decrement loop counter */
+      blkCnt--;
+    }
+  }
+
+  /* Loop unrolling: Compute the remaining (blockSize % 4) outputs in the loops below */
+  blkCnt = blockSize % 0x4U;
+
+#else
+
+  /* Initialize blkCnt with number of samples */
+  blkCnt = blockSize;
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
+  /* If the shift value is non-negative (sign bit clear) then do left shift, else right shift */
+  if (sign == 0U)
+  {
+    while (blkCnt > 0U)
+    {
+      /* C = A << shiftBits */
+
+      /* Widen to q31_t, shift left, saturate to 16 bits and store in destination buffer. */
+      *pDst++ = __SSAT(((q31_t) *pSrc++ << shiftBits), 16);
+
+      /* Decrement loop counter */
+      blkCnt--;
+    }
+  }
+  else
+  {
+    while (blkCnt > 0U)
+    {
+      /* C = A >> -shiftBits (shiftBits is negative here) */
+
+      /* Shift input right and store result in destination buffer. */
+      *pDst++ = (*pSrc++ >> -shiftBits);
+
+      /* Decrement loop counter */
+      blkCnt--;
+    }
+  }
+
+}
+#endif /* defined(ARM_MATH_MVEI) */
+
+/**
+  @} end of BasicShift group
+ */
--- a/libraries/cmsis/dsp/Source/BasicMathFunctions/arm_shift_q31.c
+++ b/libraries/cmsis/dsp/Source/BasicMathFunctions/arm_shift_q31.c
@@ -0,0 +1,232 @@
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS DSP Library
+ * Title:        arm_shift_q31.c
+ * Description:  Shifts the elements of a Q31 vector by a specified number of bits
+ *
+ * $Date:        18. March 2019
+ * $Revision:    V1.6.0
+ *
+ * Target Processor: Cortex-M cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "arm_math.h"
+
+/**
+  @ingroup groupMath
+ */
+/**
+  @defgroup BasicShift Vector Shift
+
+  Shifts the elements of a fixed-point vector by a specified number of bits.
+  There are separate functions for Q7, Q15, and Q31 data types.
+  The underlying algorithm used is:
+
+  <pre>
+      pDst[n] = pSrc[n] << shift,   0 <= n < blockSize.
+  </pre>
+
+  If <code>shift</code> is positive then the elements of the vector are shifted to the left.
+  If <code>shift</code> is negative then the elements of the vector are shifted to the right.
+
+  The functions support in-place computation allowing the source and destination
+  pointers to reference the same memory buffer.
+ */
+
+/**
+  @addtogroup BasicShift
+  @{
+ */
+
+/**
+  @brief         Shifts the elements of a Q31 vector a specified number of bits.
+  @param[in]     pSrc       points to the input vector
+  @param[in]     shiftBits  number of bits to shift.  A positive value shifts left; a negative value shifts right.
+  @param[out]    pDst       points to the output vector
+  @param[in]     blockSize  number of samples in the vector
+  @return        none
+
+  @par           Scaling and Overflow Behavior
+                   The function uses saturating arithmetic.
+                   Results outside of the allowable Q31 range [0x80000000 0x7FFFFFFF] are saturated.
+ */
+
+#if defined(ARM_MATH_MVEI)
+
+#include "arm_helium_utils.h"
+
+void arm_shift_q31(
+    const q31_t * pSrc,
+    int8_t shiftBits,
+    q31_t * pDst,
+    uint32_t blockSize)
+{
+    uint32_t  blkCnt;           /* Loop counter: 4-element groups first, then the tail count */
+    q31x4_t vecSrc;
+    q31x4_t vecDst;
+
+    /* Compute 4 outputs at a time */
+    blkCnt = blockSize >> 2;
+    while (blkCnt > 0U)
+    {
+        /*
+         * C = A (>> or <<) shiftBits
+         * Shift the input by the signed count shiftBits (vqshlq_r) and store the result in the destination buffer.
+         */
+        vecSrc = vld1q((q31_t const *) pSrc);
+        vecDst = vqshlq_r(vecSrc, shiftBits);
+        vst1q(pDst, vecDst);
+        /*
+         * Decrement the loop counter (one iteration per 4 elements)
+         */
+        blkCnt--;
+        /*
+         * Advance the source and destination pointers by one vector (4 elements)
+         */
+        pSrc += 4;
+        pDst += 4;
+    }
+    /*
+     * Tail: process the remaining (blockSize & 3) elements; the store is predicated by p0
+     */
+    blkCnt = blockSize & 3;
+    if (blkCnt > 0U)
+    {
+        mve_pred16_t p0 = vctp32q(blkCnt);
+        vecSrc = vld1q((q31_t const *) pSrc); /* NOTE(review): full-width load, not predicated - confirm reading past the last blkCnt elements is safe */
+        vecDst = vqshlq_r(vecSrc, shiftBits);
+        vstrwq_p(pDst, vecDst, p0);
+    }
+}
+
+
+#else
+void arm_shift_q31(
+  const q31_t * pSrc,
+        int8_t shiftBits,
+        q31_t * pDst,
+        uint32_t blockSize)
+{
+        uint32_t blkCnt;                               /* Loop counter */
+        uint8_t sign = (shiftBits & 0x80);             /* Sign bit of shiftBits: non-zero when shiftBits is negative */
+
+#if defined (ARM_MATH_LOOPUNROLL)
+
+  q31_t in, out;                                 /* Temporary input sample and shifted result */
+
+  /* Loop unrolling: Compute 4 outputs at a time */
+  blkCnt = blockSize >> 2U;
+
+  /* If the shift value is non-negative (sign bit clear) then do left shift, else right shift */
+  if (sign == 0U)
+  {
+    while (blkCnt > 0U)
+    {
+      /* C = A << shiftBits */
+
+      /* Shift left; if shifting back does not give the input again, replace the result with 0x7FFFFFFF ^ (in >> 31). */
+      in = *pSrc++;
+      out = in << shiftBits;
+      if (in != (out >> shiftBits))
+        out = 0x7FFFFFFF ^ (in >> 31);
+      *pDst++ = out;
+
+      in = *pSrc++;
+      out = in << shiftBits;
+      if (in != (out >> shiftBits))
+        out = 0x7FFFFFFF ^ (in >> 31);
+      *pDst++ = out;
+
+      in = *pSrc++;
+      out = in << shiftBits;
+      if (in != (out >> shiftBits))
+        out = 0x7FFFFFFF ^ (in >> 31);
+      *pDst++ = out;
+
+      in = *pSrc++;
+      out = in << shiftBits;
+      if (in != (out >> shiftBits))
+        out = 0x7FFFFFFF ^ (in >> 31);
+      *pDst++ = out;
+
+      /* Decrement loop counter */
+      blkCnt--;
+    }
+  }
+  else
+  {
+    while (blkCnt > 0U)
+    {
+      /* C = A >> -shiftBits (shiftBits is negative here) */
+
+      /* Shift input right and store results in destination buffer. */
+      *pDst++ = (*pSrc++ >> -shiftBits);
+      *pDst++ = (*pSrc++ >> -shiftBits);
+      *pDst++ = (*pSrc++ >> -shiftBits);
+      *pDst++ = (*pSrc++ >> -shiftBits);
+
+      /* Decrement loop counter */
+      blkCnt--;
+    }
+  }
+
+  /* Loop unrolling: Compute the remaining (blockSize % 4) outputs in the loops below */
+  blkCnt = blockSize % 0x4U;
+
+#else
+
+  /* Initialize blkCnt with number of samples */
+  blkCnt = blockSize;
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
+  /* If the shift value is non-negative (sign bit clear) then do left shift, else right shift */
+  if (sign == 0U)
+  {
+    while (blkCnt > 0U)
+    {
+      /* C = A << shiftBits */
+
+      /* Widen to q63_t, shift left, pass through clip_q63_to_q31 and store in destination buffer. */
+      *pDst++ = clip_q63_to_q31((q63_t) *pSrc++ << shiftBits);
+
+      /* Decrement loop counter */
+      blkCnt--;
+    }
+  }
+  else
+  {
+    while (blkCnt > 0U)
+    {
+      /* C = A >> -shiftBits (shiftBits is negative here) */
+
+      /* Shift input right and store result in destination buffer. */
+      *pDst++ = (*pSrc++ >> -shiftBits);
+
+      /* Decrement loop counter */
+      blkCnt--;
+    }
+  }
+
+}
+#endif /* defined(ARM_MATH_MVEI) */
+
+/**
+  @} end of BasicShift group
+ */
--- a/libraries/cmsis/dsp/Source/BasicMathFunctions/arm_shift_q7.c
+++ b/libraries/cmsis/dsp/Source/BasicMathFunctions/arm_shift_q7.c
@@ -0,0 +1,225 @@
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS DSP Library
+ * Title:        arm_shift_q7.c
+ * Description:  Processing function for the Q7 Shifting
+ *
+ * $Date:        18. March 2019
+ * $Revision:    V1.6.0
+ *
+ * Target Processor: Cortex-M cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "arm_math.h"
+
+/**
+  @ingroup groupMath
+ */
+
+/**
+  @addtogroup BasicShift
+  @{
+ */
+
+/**
+  @brief         Shifts the elements of a Q7 vector a specified number of bits
+  @param[in]     pSrc       points to the input vector
+  @param[in]     shiftBits  number of bits to shift.  A positive value shifts left; a negative value shifts right.
+  @param[out]    pDst       points to the output vector
+  @param[in]     blockSize  number of samples in each vector
+  @return        none
+
+  @par           Conditions for optimum performance
+                   Input and output buffers should be aligned by 32-bit
+  @par           Scaling and Overflow Behavior
+                   The function uses saturating arithmetic.
+                   Results outside of the allowable Q7 range [0x80 0x7F] are saturated.
+ */
+
+#if defined(ARM_MATH_MVEI)
+
+#include "arm_helium_utils.h"
+
+void arm_shift_q7(
+    const q7_t * pSrc,
+    int8_t shiftBits,
+    q7_t * pDst,
+    uint32_t blockSize)
+{
+    uint32_t  blkCnt;           /* Loop counter: 16-element groups first, then the tail count */
+    q7x16_t vecSrc;
+    q7x16_t vecDst;
+
+    /* Compute 16 outputs at a time */
+    blkCnt = blockSize >> 4;
+    while (blkCnt > 0U)
+    {
+        /*
+         * C = A (>> or <<) shiftBits
+         * Shift the input by the signed count shiftBits (vqshlq_r) and store the result in the destination buffer.
+         */
+        vecSrc = vld1q(pSrc);
+        vecDst = vqshlq_r(vecSrc, shiftBits);
+        vst1q(pDst, vecDst);
+        /*
+         * Decrement the loop counter (one iteration per 16 elements)
+         */
+        blkCnt--;
+        /*
+         * Advance the source and destination pointers by one vector (16 elements)
+         */
+        pSrc += 16;
+        pDst += 16;
+    }
+    /*
+     * Tail: process the remaining (blockSize & 0xF) elements; the store is predicated by p0
+     */
+    blkCnt = blockSize & 0xF;
+    if (blkCnt > 0U)
+    {
+        mve_pred16_t p0 = vctp8q(blkCnt);
+        vecSrc = vld1q(pSrc); /* NOTE(review): full-width load, not predicated - confirm reading past the last blkCnt elements is safe */
+        vecDst = vqshlq_r(vecSrc, shiftBits);
+        vstrbq_p(pDst, vecDst, p0);
+    }
+}
+
+#else
+void arm_shift_q7(
+  const q7_t * pSrc,
+        int8_t shiftBits,
+        q7_t * pDst,
+        uint32_t blockSize)
+{
+        uint32_t blkCnt;                               /* Loop counter */
+        uint8_t sign = (shiftBits & 0x80);             /* Sign bit of shiftBits: non-zero when shiftBits is negative */
+
+#if defined (ARM_MATH_LOOPUNROLL)
+
+#if defined (ARM_MATH_DSP)
+  q7_t in1,  in2,  in3,  in4;                    /* Temporary input variables */
+#endif
+
+  /* Loop unrolling: Compute 4 outputs at a time */
+  blkCnt = blockSize >> 2U;
+
+  /* If the shift value is non-negative (sign bit clear) then do left shift, else right shift */
+  if (sign == 0U)
+  {
+    while (blkCnt > 0U)
+    {
+      /* C = A << shiftBits */
+
+#if defined (ARM_MATH_DSP)
+      /* Read 4 inputs */
+      in1 = *pSrc++;
+      in2 = *pSrc++;
+      in3 = *pSrc++;
+      in4 = *pSrc++;
+
+    /* Shift left, saturate each result to 8 bits, pack and store in destination buffer (in single write) */
+      write_q7x4_ia (&pDst, __PACKq7(__SSAT((in1 << shiftBits), 8),
+                                     __SSAT((in2 << shiftBits), 8),
+                                     __SSAT((in3 << shiftBits), 8),
+                                     __SSAT((in4 << shiftBits), 8) ));
+#else
+      *pDst++ = (q7_t) __SSAT(((q15_t) *pSrc++ << shiftBits), 8);
+      *pDst++ = (q7_t) __SSAT(((q15_t) *pSrc++ << shiftBits), 8);
+      *pDst++ = (q7_t) __SSAT(((q15_t) *pSrc++ << shiftBits), 8);
+      *pDst++ = (q7_t) __SSAT(((q15_t) *pSrc++ << shiftBits), 8);
+#endif
+
+      /* Decrement loop counter */
+      blkCnt--;
+    }
+  }
+  else
+  {
+    while (blkCnt > 0U)
+    {
+      /* C = A >> -shiftBits (shiftBits is negative here) */
+
+#if defined (ARM_MATH_DSP)
+      /* Read 4 inputs */
+      in1 = *pSrc++;
+      in2 = *pSrc++;
+      in3 = *pSrc++;
+      in4 = *pSrc++;
+
+    /* Shift right by -shiftBits (no saturation step), pack and store in destination buffer (in single write) */
+      write_q7x4_ia (&pDst, __PACKq7((in1 >> -shiftBits),
+                                     (in2 >> -shiftBits),
+                                     (in3 >> -shiftBits),
+                                     (in4 >> -shiftBits) ));
+#else
+      *pDst++ = (*pSrc++ >> -shiftBits);
+      *pDst++ = (*pSrc++ >> -shiftBits);
+      *pDst++ = (*pSrc++ >> -shiftBits);
+      *pDst++ = (*pSrc++ >> -shiftBits);
+#endif
+
+      /* Decrement loop counter */
+      blkCnt--;
+    }
+  }
+
+  /* Loop unrolling: Compute the remaining (blockSize % 4) outputs in the loops below */
+  blkCnt = blockSize % 0x4U;
+
+#else
+
+  /* Initialize blkCnt with number of samples */
+  blkCnt = blockSize;
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
+  /* If the shift value is non-negative (sign bit clear) then do left shift, else right shift */
+  if (sign == 0U)
+  {
+    while (blkCnt > 0U)
+    {
+      /* C = A << shiftBits */
+
+      /* Widen to q15_t, shift left, saturate to 8 bits and store in destination buffer. */
+      *pDst++ = (q7_t) __SSAT(((q15_t) *pSrc++ << shiftBits), 8);
+
+      /* Decrement loop counter */
+      blkCnt--;
+    }
+  }
+  else
+  {
+    while (blkCnt > 0U)
+    {
+      /* C = A >> -shiftBits (shiftBits is negative here) */
+
+      /* Shift input right and store result in destination buffer. */
+      *pDst++ = (*pSrc++ >> -shiftBits);
+
+      /* Decrement loop counter */
+      blkCnt--;
+    }
+  }
+
+}
+#endif /* defined(ARM_MATH_MVEI) */
+
+/**
+  @} end of BasicShift group
+ */
--- a/libraries/cmsis/dsp/Source/BasicMathFunctions/arm_sub_f32.c
+++ b/libraries/cmsis/dsp/Source/BasicMathFunctions/arm_sub_f32.c
@@ -0,0 +1,202 @@
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS DSP Library
+ * Title:        arm_sub_f32.c
+ * Description:  Floating-point vector subtraction
+ *
+ * $Date:        18. March 2019
+ * $Revision:    V1.6.0
+ *
+ * Target Processor: Cortex-M cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "arm_math.h"
+
+/**
+  @ingroup groupMath
+ */
+
+/**
+  @defgroup BasicSub Vector Subtraction
+
+  Element-by-element subtraction of two vectors.
+
+  <pre>
+      pDst[n] = pSrcA[n] - pSrcB[n],   0 <= n < blockSize.
+  </pre>
+
+  There are separate functions for floating-point, Q7, Q15, and Q31 data types.
+ */
+
+/**
+  @addtogroup BasicSub
+  @{
+ */
+
+/**
+  @brief         Floating-point vector subtraction.
+  @param[in]     pSrcA      points to the first input vector
+  @param[in]     pSrcB      points to the second input vector
+  @param[out]    pDst       points to the output vector
+  @param[in]     blockSize  number of samples in each vector
+  @return        none
+ */
+
+#if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE)
+
+#include "arm_helium_utils.h"
+
+void arm_sub_f32(
+  const float32_t * pSrcA,
+  const float32_t * pSrcB,
+        float32_t * pDst,
+        uint32_t blockSize)
+{
+    uint32_t blkCnt;                               /* Loop counter */
+
+    f32x4_t vec1;
+    f32x4_t vec2;
+    f32x4_t res;
+
+    /* Compute 4 outputs at a time */
+    blkCnt = blockSize >> 2U;
+
+    while (blkCnt > 0U)
+    {
+        /* C = A - B */
+
+      /* Subtract and then store the results in the destination buffer. */
+        vec1 = vld1q(pSrcA);
+        vec2 = vld1q(pSrcB);
+        res = vsubq(vec1, vec2);
+        vst1q(pDst, res);
+
+        /* Increment pointers */
+        pSrcA += 4;
+        pSrcB += 4;
+        pDst += 4;
+
+        /* Decrement the loop counter */
+        blkCnt--;
+    }
+
+    /* Tail: remaining (blockSize & 0x3) elements */
+    blkCnt = blockSize & 0x3;
+
+    if (blkCnt > 0U)
+    {
+      /* C = A - B, stored through predicate p0 built from blkCnt */
+      mve_pred16_t p0 = vctp32q(blkCnt);
+      vec1 = vld1q(pSrcA); /* NOTE(review): full-width load, not predicated - confirm reading past the last blkCnt elements is safe */
+      vec2 = vld1q(pSrcB);
+      vstrwq_p(pDst, vsubq(vec1,vec2), p0);
+    }
+
+}
+
+#else
+void arm_sub_f32(
+  const float32_t * pSrcA,
+  const float32_t * pSrcB,
+        float32_t * pDst,
+        uint32_t blockSize)
+{
+        uint32_t blkCnt;                               /* Loop counter */
+
+#if defined(ARM_MATH_NEON) && !defined(ARM_MATH_AUTOVECTORIZE)
+    f32x4_t vec1;
+    f32x4_t vec2;
+    f32x4_t res;
+
+    /* Compute 4 outputs at a time */
+    blkCnt = blockSize >> 2U;
+
+    while (blkCnt > 0U)
+    {
+        /* C = A - B */
+
+        /* Subtract and then store the results in the destination buffer. */
+        vec1 = vld1q_f32(pSrcA);
+        vec2 = vld1q_f32(pSrcB);
+        res = vsubq_f32(vec1, vec2);
+        vst1q_f32(pDst, res);
+
+        /* Increment pointers */
+        pSrcA += 4;
+        pSrcB += 4;
+        pDst += 4;
+
+        /* Decrement the loop counter */
+        blkCnt--;
+    }
+
+    /* Tail: the remaining (blockSize & 0x3) samples are handled by the scalar loop below */
+    blkCnt = blockSize & 0x3;
+
+#else
+#if defined (ARM_MATH_LOOPUNROLL) && !defined(ARM_MATH_AUTOVECTORIZE)
+
+  /* Loop unrolling: Compute 4 outputs at a time */
+  blkCnt = blockSize >> 2U;
+
+  while (blkCnt > 0U)
+  {
+    /* C = A - B */
+
+    /* Subtract and store result in destination buffer. */
+    *pDst++ = (*pSrcA++) - (*pSrcB++);
+
+    *pDst++ = (*pSrcA++) - (*pSrcB++);
+
+    *pDst++ = (*pSrcA++) - (*pSrcB++);
+
+    *pDst++ = (*pSrcA++) - (*pSrcB++);
+
+    /* Decrement loop counter */
+    blkCnt--;
+  }
+
+  /* Loop unrolling: Compute the remaining (blockSize % 4) outputs in the loop below */
+  blkCnt = blockSize % 0x4U;
+
+#else
+
+  /* Initialize blkCnt with number of samples */
+  blkCnt = blockSize;
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) && !defined(ARM_MATH_AUTOVECTORIZE) */
+#endif /* #if defined(ARM_MATH_NEON) && !defined(ARM_MATH_AUTOVECTORIZE) */
+
+  while (blkCnt > 0U)
+  {
+    /* C = A - B */
+
+    /* Subtract and store result in destination buffer. */
+    *pDst++ = (*pSrcA++) - (*pSrcB++);
+
+    /* Decrement loop counter */
+    blkCnt--;
+  }
+
+}
+#endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
+
+/**
+  @} end of BasicSub group
+ */
--- a/libraries/cmsis/dsp/Source/BasicMathFunctions/arm_sub_q15.c
+++ b/libraries/cmsis/dsp/Source/BasicMathFunctions/arm_sub_q15.c
@@ -0,0 +1,178 @@
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS DSP Library
+ * Title:        arm_sub_q15.c
+ * Description:  Q15 vector subtraction
+ *
+ * $Date:        18. March 2019
+ * $Revision:    V1.6.0
+ *
+ * Target Processor: Cortex-M cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "arm_math.h"
+
+/**
+  @ingroup groupMath
+ */
+
+/**
+  @addtogroup BasicSub
+  @{
+ */
+
+/**
+  @brief         Q15 vector subtraction.
+  @param[in]     pSrcA      points to the first input vector
+  @param[in]     pSrcB      points to the second input vector
+  @param[out]    pDst       points to the output vector
+  @param[in]     blockSize  number of samples in each vector
+  @return        none
+
+  @par           Scaling and Overflow Behavior
+                   The function uses saturating arithmetic.
+                   Results outside of the allowable Q15 range [0x8000 0x7FFF] are saturated.
+ */
+
+#if defined(ARM_MATH_MVEI)
+
+#include "arm_helium_utils.h"
+
+void arm_sub_q15(
+    const q15_t * pSrcA,
+    const q15_t * pSrcB,
+    q15_t * pDst,
+    uint32_t blockSize)
+{
+    /* Helium (MVE) implementation: pDst[i] = saturate(pSrcA[i] - pSrcB[i]). */
+    uint32_t  blkCnt;           /* full 8-lane vectors left, then leftover samples */
+    q15x8_t vecA;
+    q15x8_t vecB;
+
+    /* Main loop: 8 outputs per iteration */
+    blkCnt = blockSize >> 3;
+    while (blkCnt > 0U)
+    {
+        /*
+         * C = A - B
+         * Saturating subtract, then store the results in the destination buffer.
+         */
+        vecA = vld1q(pSrcA);
+        vecB = vld1q(pSrcB);
+        vst1q(pDst, vqsubq(vecA, vecB));
+
+        blkCnt--;
+
+        /* Advance source and destination pointers by one vector */
+        pSrcA  += 8;
+        pSrcB  += 8;
+        pDst   += 8;
+    }
+
+    /*
+     * Tail: 1..7 leftover samples, handled under a lane predicate.
+     */
+    blkCnt = blockSize & 7;
+    if (blkCnt > 0U)
+    {
+        mve_pred16_t p0 = vctp16q(blkCnt);
+
+        /*
+         * Predicated (zeroing) loads: only the first blkCnt lanes are
+         * fetched, so the inputs are never read past their last sample.
+         * The store below is predicated with the same mask.
+         */
+        vecA = vld1q_z(pSrcA, p0);
+        vecB = vld1q_z(pSrcB, p0);
+        vstrhq_p(pDst, vqsubq(vecA, vecB), p0);
+    }
+}
+
+
+#else
+void arm_sub_q15(
+  const q15_t * pSrcA,
+  const q15_t * pSrcB,
+        q15_t * pDst,
+        uint32_t blockSize)
+{
+  /* Scalar / DSP-extension path: pDst[i] = saturate16(pSrcA[i] - pSrcB[i]). */
+  uint32_t remaining;                            /* Iterations still to run */
+
+#if defined (ARM_MATH_LOOPUNROLL)
+
+#if defined (ARM_MATH_DSP)
+  q31_t packedA0, packedA1;                      /* Two 2-sample words from source A */
+  q31_t packedB0, packedB1;                      /* Two 2-sample words from source B */
+#endif
+
+  /* Unrolled section: every pass produces 4 outputs */
+  for (remaining = blockSize >> 2U; remaining > 0U; remaining--)
+  {
+#if defined (ARM_MATH_DSP)
+    /* Fetch 4 samples of A, then 4 samples of B, two samples per read */
+    packedA0 = read_q15x2_ia ((q15_t **) &pSrcA);
+    packedA1 = read_q15x2_ia ((q15_t **) &pSrcA);
+    packedB0 = read_q15x2_ia ((q15_t **) &pSrcB);
+    packedB1 = read_q15x2_ia ((q15_t **) &pSrcB);
+
+    /* Subtract and write back, two samples per store */
+    write_q15x2_ia (&pDst, __QSUB16(packedA0, packedB0));
+    write_q15x2_ia (&pDst, __QSUB16(packedA1, packedB1));
+#else
+    /* Widen to q31_t so the difference cannot wrap before saturation */
+    pDst[0] = (q15_t) __SSAT(((q31_t) pSrcA[0] - pSrcB[0]), 16);
+    pDst[1] = (q15_t) __SSAT(((q31_t) pSrcA[1] - pSrcB[1]), 16);
+    pDst[2] = (q15_t) __SSAT(((q31_t) pSrcA[2] - pSrcB[2]), 16);
+    pDst[3] = (q15_t) __SSAT(((q31_t) pSrcA[3] - pSrcB[3]), 16);
+
+    pSrcA += 4;
+    pSrcB += 4;
+    pDst  += 4;
+#endif
+  }
+
+  /* Samples left over after the unrolled section (blockSize mod 4) */
+  remaining = blockSize & 0x3U;
+
+#else
+
+  /* No unrolling: the loop below handles every sample */
+  remaining = blockSize;
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
+  /* One output per pass */
+  for (; remaining > 0U; remaining--)
+  {
+#if defined (ARM_MATH_DSP)
+    *pDst = (q15_t) __QSUB16(*pSrcA, *pSrcB);
+#else
+    *pDst = (q15_t) __SSAT(((q31_t) *pSrcA - *pSrcB), 16);
+#endif
+
+    pSrcA++;
+    pSrcB++;
+    pDst++;
+  }
+
+}
+#endif /* defined(ARM_MATH_MVEI) */
+
+/**
+  @} end of BasicSub group
+ */
--- a/libraries/cmsis/dsp/Source/BasicMathFunctions/arm_sub_q31.c
+++ b/libraries/cmsis/dsp/Source/BasicMathFunctions/arm_sub_q31.c
@@ -0,0 +1,159 @@
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS DSP Library
+ * Title:        arm_sub_q31.c
+ * Description:  Q31 vector subtraction
+ *
+ * $Date:        18. March 2019
+ * $Revision:    V1.6.0
+ *
+ * Target Processor: Cortex-M cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "arm_math.h"
+
+/**
+  @ingroup groupMath
+ */
+
+/**
+  @addtogroup BasicSub
+  @{
+ */
+
+/**
+  @brief         Q31 vector subtraction.
+  @param[in]     pSrcA      points to the first input vector
+  @param[in]     pSrcB      points to the second input vector
+  @param[out]    pDst       points to the output vector
+  @param[in]     blockSize  number of samples in each vector
+  @return        none
+
+  @par           Scaling and Overflow Behavior
+                   The function uses saturating arithmetic.
+                   Results outside of the allowable Q31 range [0x80000000 0x7FFFFFFF] are saturated.
+ */
+
+#if defined(ARM_MATH_MVEI)
+
+#include "arm_helium_utils.h"
+
+void arm_sub_q31(
+  const q31_t * pSrcA,
+  const q31_t * pSrcB,
+        q31_t * pDst,
+        uint32_t blockSize)
+{
+    /* Helium (MVE) implementation: pDst[i] = saturate(pSrcA[i] - pSrcB[i]). */
+    uint32_t blkCnt;            /* full 4-lane vectors left, then leftover samples */
+    q31x4_t vecA;
+    q31x4_t vecB;
+
+    /* Main loop: 4 outputs per iteration */
+    blkCnt = blockSize >> 2;
+    while (blkCnt > 0U)
+    {
+        /*
+         * C = A - B
+         * Saturating subtract, then store the results in the destination buffer.
+         */
+        vecA = vld1q(pSrcA);
+        vecB = vld1q(pSrcB);
+        vst1q(pDst, vqsubq(vecA, vecB));
+
+        blkCnt--;
+
+        /* Advance source and destination pointers by one vector */
+        pSrcA  += 4;
+        pSrcB  += 4;
+        pDst   += 4;
+    }
+
+    /*
+     * Tail: 1..3 leftover samples, handled under a lane predicate.
+     */
+    blkCnt = blockSize & 3;
+    if (blkCnt > 0U)
+    {
+        mve_pred16_t p0 = vctp32q(blkCnt);
+
+        /*
+         * Predicated (zeroing) loads: only the first blkCnt lanes are
+         * fetched, so the inputs are never read past their last sample.
+         * The store below is predicated with the same mask.
+         */
+        vecA = vld1q_z(pSrcA, p0);
+        vecB = vld1q_z(pSrcB, p0);
+        vstrwq_p(pDst, vqsubq(vecA, vecB), p0);
+    }
+}
+
+#else
+void arm_sub_q31(
+  const q31_t * pSrcA,
+  const q31_t * pSrcB,
+        q31_t * pDst,
+        uint32_t blockSize)
+{
+  /* Scalar path: pDst[i] = __QSUB(pSrcA[i], pSrcB[i]) for every sample. */
+  uint32_t remaining;                            /* Iterations still to run */
+
+#if defined (ARM_MATH_LOOPUNROLL)
+
+  /* Unrolled section: every pass produces 4 outputs */
+  for (remaining = blockSize >> 2U; remaining > 0U; remaining--)
+  {
+    pDst[0] = __QSUB(pSrcA[0], pSrcB[0]);
+    pDst[1] = __QSUB(pSrcA[1], pSrcB[1]);
+    pDst[2] = __QSUB(pSrcA[2], pSrcB[2]);
+    pDst[3] = __QSUB(pSrcA[3], pSrcB[3]);
+
+    pSrcA += 4;
+    pSrcB += 4;
+    pDst  += 4;
+  }
+
+  /* Samples left over after the unrolled section (blockSize mod 4) */
+  remaining = blockSize & 0x3U;
+
+#else
+
+  /* No unrolling: the loop below handles every sample */
+  remaining = blockSize;
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
+  /* One output per pass */
+  for (; remaining > 0U; remaining--)
+  {
+    *pDst = __QSUB(*pSrcA, *pSrcB);
+
+    pSrcA++;
+    pSrcB++;
+    pDst++;
+  }
+
+}
+#endif /* defined(ARM_MATH_MVEI) */
+
+/**
+  @} end of BasicSub group
+ */
--- a/libraries/cmsis/dsp/Source/BasicMathFunctions/arm_sub_q7.c
+++ b/libraries/cmsis/dsp/Source/BasicMathFunctions/arm_sub_q7.c
@@ -0,0 +1,158 @@
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS DSP Library
+ * Title:        arm_sub_q7.c
+ * Description:  Q7 vector subtraction
+ *
+ * $Date:        18. March 2019
+ * $Revision:    V1.6.0
+ *
+ * Target Processor: Cortex-M cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "arm_math.h"
+
+/**
+  @ingroup groupMath
+ */
+
+/**
+  @addtogroup BasicSub
+  @{
+ */
+
+/**
+  @brief         Q7 vector subtraction.
+  @param[in]     pSrcA      points to the first input vector
+  @param[in]     pSrcB      points to the second input vector
+  @param[out]    pDst       points to the output vector
+  @param[in]     blockSize  number of samples in each vector
+  @return        none
+
+  @par           Scaling and Overflow Behavior
+                   The function uses saturating arithmetic.
+                   Results outside of the allowable Q7 range [0x80 0x7F] will be saturated.
+ */
+#if defined(ARM_MATH_MVEI)
+
+#include "arm_helium_utils.h"
+
+void arm_sub_q7(
+    const q7_t * pSrcA,
+    const q7_t * pSrcB,
+    q7_t * pDst,
+    uint32_t blockSize)
+{
+    /* Helium (MVE) implementation: pDst[i] = saturate(pSrcA[i] - pSrcB[i]). */
+    uint32_t  blkCnt;           /* full 16-lane vectors left, then leftover samples */
+    q7x16_t vecA;
+    q7x16_t vecB;
+
+    /* Main loop: 16 outputs per iteration */
+    blkCnt = blockSize >> 4;
+    while (blkCnt > 0U)
+    {
+        /*
+         * C = A - B
+         * Saturating subtract, then store the results in the destination buffer.
+         */
+        vecA = vld1q(pSrcA);
+        vecB = vld1q(pSrcB);
+        vst1q(pDst, vqsubq(vecA, vecB));
+
+        blkCnt--;
+
+        /* Advance source and destination pointers by one vector */
+        pSrcA  += 16;
+        pSrcB  += 16;
+        pDst   += 16;
+    }
+
+    /*
+     * Tail: 1..15 leftover samples, handled under a lane predicate.
+     */
+    blkCnt = blockSize & 0xF;
+    if (blkCnt > 0U)
+    {
+        mve_pred16_t p0 = vctp8q(blkCnt);
+
+        /*
+         * Predicated (zeroing) loads: only the first blkCnt lanes are
+         * fetched, so the inputs are never read past their last sample.
+         * The store below is predicated with the same mask.
+         */
+        vecA = vld1q_z(pSrcA, p0);
+        vecB = vld1q_z(pSrcB, p0);
+        vstrbq_p(pDst, vqsubq(vecA, vecB), p0);
+    }
+}
+#else
+void arm_sub_q7(
+  const q7_t * pSrcA,
+  const q7_t * pSrcB,
+        q7_t * pDst,
+        uint32_t blockSize)
+{
+  /* Scalar / DSP-extension path: pDst[i] = saturate8(pSrcA[i] - pSrcB[i]). */
+  uint32_t remaining;                            /* Iterations still to run */
+
+#if defined (ARM_MATH_LOOPUNROLL)
+
+  /* Unrolled section: every pass produces 4 outputs */
+  for (remaining = blockSize >> 2U; remaining > 0U; remaining--)
+  {
+#if defined (ARM_MATH_DSP)
+    /* Four samples per source are read as one word, subtracted and stored */
+    q31_t packedA = read_q7x4_ia ((q7_t **) &pSrcA);
+    q31_t packedB = read_q7x4_ia ((q7_t **) &pSrcB);
+
+    write_q7x4_ia (&pDst, __QSUB8(packedA, packedB));
+#else
+    /* Widen to q15_t so the difference cannot wrap before saturation */
+    pDst[0] = (q7_t) __SSAT((q15_t) pSrcA[0] - pSrcB[0], 8);
+    pDst[1] = (q7_t) __SSAT((q15_t) pSrcA[1] - pSrcB[1], 8);
+    pDst[2] = (q7_t) __SSAT((q15_t) pSrcA[2] - pSrcB[2], 8);
+    pDst[3] = (q7_t) __SSAT((q15_t) pSrcA[3] - pSrcB[3], 8);
+
+    pSrcA += 4;
+    pSrcB += 4;
+    pDst  += 4;
+#endif
+  }
+
+  /* Samples left over after the unrolled section (blockSize mod 4) */
+  remaining = blockSize & 0x3U;
+
+#else
+
+  /* No unrolling: the loop below handles every sample */
+  remaining = blockSize;
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
+  /* One output per pass */
+  for (; remaining > 0U; remaining--)
+  {
+    *pDst = (q7_t) __SSAT((q15_t) *pSrcA - *pSrcB, 8);
+
+    pSrcA++;
+    pSrcB++;
+    pDst++;
+  }
+
+}
+#endif /* defined(ARM_MATH_MVEI) */
+
+/**
+  @} end of BasicSub group
+ */
--- a/libraries/cmsis/dsp/Source/BasicMathFunctions/arm_xor_u16.c
+++ b/libraries/cmsis/dsp/Source/BasicMathFunctions/arm_xor_u16.c
@@ -0,0 +1,137 @@
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS DSP Library
+ * Title:        arm_xor_u16.c
+ * Description:  uint16_t bitwise exclusive OR
+ *
+ * $Date:        14 November 2019
+ * $Revision:    V1.6.0
+ *
+ * Target Processor: Cortex-M cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "arm_math.h"
+
+/**
+  @ingroup groupMath
+ */
+
+/**
+  @defgroup Xor Vector bitwise exclusive OR
+
+  Compute the logical bitwise XOR.
+
+  There are separate functions for uint32_t, uint16_t, and uint8_t data types.
+ */
+
+/**
+  @addtogroup Xor
+  @{
+ */
+
+/**
+  @brief         Compute the logical bitwise XOR of two fixed-point vectors.
+  @param[in]     pSrcA      points to input vector A
+  @param[in]     pSrcB      points to input vector B
+  @param[out]    pDst       points to output vector
+  @param[in]     blockSize  number of samples in each vector
+  @return        none
+ */
+
+void arm_xor_u16(
+    const uint16_t * pSrcA,
+    const uint16_t * pSrcB,
+          uint16_t * pDst,
+          uint32_t blockSize)
+{
+    /* pDst[i] = pSrcA[i] ^ pSrcB[i] for i in [0, blockSize). */
+    uint32_t blkCnt;      /* Loop counter */
+
+#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
+    /* Unsigned vectors: they must match the uint16_t loads and veorq_u16 */
+    uint16x8_t vecSrcA, vecSrcB;
+
+    /* Compute 8 outputs at a time */
+    blkCnt = blockSize >> 3;
+
+    while (blkCnt > 0U)
+    {
+        vecSrcA = vld1q(pSrcA);
+        vecSrcB = vld1q(pSrcB);
+
+        vst1q(pDst, veorq_u16(vecSrcA, vecSrcB) );
+
+        pSrcA += 8;
+        pSrcB += 8;
+        pDst  += 8;
+
+        /* Decrement the loop counter */
+        blkCnt--;
+    }
+
+    /* Tail: 1..7 leftover samples under a lane predicate */
+    blkCnt = blockSize & 7;
+
+    if (blkCnt > 0U)
+    {
+        mve_pred16_t p0 = vctp16q(blkCnt);
+
+        /* Predicated loads: nothing is read past the last input sample */
+        vecSrcA = vld1q_z(pSrcA, p0);
+        vecSrcB = vld1q_z(pSrcB, p0);
+        vstrhq_p(pDst, veorq_u16(vecSrcA, vecSrcB), p0);
+    }
+#else
+#if defined(ARM_MATH_NEON) && !defined(ARM_MATH_AUTOVECTORIZE)
+    uint16x8_t vecA, vecB;
+
+    /* Compute 8 outputs at a time */
+    blkCnt = blockSize >> 3U;
+
+    while (blkCnt > 0U)
+    {
+        vecA = vld1q_u16(pSrcA);
+        vecB = vld1q_u16(pSrcB);
+
+        vst1q_u16(pDst, veorq_u16(vecA, vecB) );
+
+        pSrcA += 8;
+        pSrcB += 8;
+        pDst  += 8;
+
+        /* Decrement the loop counter */
+        blkCnt--;
+    }
+
+    /* Tail: leftover samples go through the scalar loop below */
+    blkCnt = blockSize & 7;
+#else
+    /* Initialize blkCnt with number of samples */
+    blkCnt = blockSize;
+#endif
+
+    /* Scalar loop: one output per iteration */
+    while (blkCnt > 0U)
+    {
+        *pDst++ = (*pSrcA++)^(*pSrcB++);
+
+        /* Decrement the loop counter */
+        blkCnt--;
+    }
+#endif /* if defined(ARM_MATH_MVEI) */
+}
+
+/**
+  @} end of Xor group
+ */
--- a/libraries/cmsis/dsp/Source/BasicMathFunctions/arm_xor_u32.c
+++ b/libraries/cmsis/dsp/Source/BasicMathFunctions/arm_xor_u32.c
@@ -0,0 +1,129 @@
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS DSP Library
+ * Title:        arm_xor_u32.c
+ * Description:  uint32_t bitwise exclusive OR
+ *
+ * $Date:        14 November 2019
+ * $Revision:    V1.6.0
+ *
+ * Target Processor: Cortex-M cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "arm_math.h"
+
+/**
+  @ingroup groupMath
+ */
+
+/**
+  @addtogroup Xor
+  @{
+ */
+
+/**
+  @brief         Compute the logical bitwise XOR of two fixed-point vectors.
+  @param[in]     pSrcA      points to input vector A
+  @param[in]     pSrcB      points to input vector B
+  @param[out]    pDst       points to output vector
+  @param[in]     blockSize  number of samples in each vector
+  @return        none
+ */
+
+void arm_xor_u32(
+    const uint32_t * pSrcA,
+    const uint32_t * pSrcB,
+          uint32_t * pDst,
+          uint32_t blockSize)
+{
+    /* pDst[i] = pSrcA[i] ^ pSrcB[i] for i in [0, blockSize). */
+    uint32_t blkCnt;      /* Loop counter */
+
+#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
+    /* Unsigned vectors: they must match the uint32_t loads and veorq_u32 */
+    uint32x4_t vecSrcA, vecSrcB;
+
+    /* Compute 4 outputs at a time */
+    blkCnt = blockSize >> 2;
+
+    while (blkCnt > 0U)
+    {
+        vecSrcA = vld1q(pSrcA);
+        vecSrcB = vld1q(pSrcB);
+
+        vst1q(pDst, veorq_u32(vecSrcA, vecSrcB) );
+
+        pSrcA += 4;
+        pSrcB += 4;
+        pDst  += 4;
+
+        /* Decrement the loop counter */
+        blkCnt--;
+    }
+
+    /* Tail: 1..3 leftover samples under a lane predicate */
+    blkCnt = blockSize & 3;
+
+    if (blkCnt > 0U)
+    {
+        mve_pred16_t p0 = vctp32q(blkCnt);
+
+        /* Predicated loads: nothing is read past the last input sample */
+        vecSrcA = vld1q_z(pSrcA, p0);
+        vecSrcB = vld1q_z(pSrcB, p0);
+        vstrwq_p(pDst, veorq_u32(vecSrcA, vecSrcB), p0);
+    }
+#else
+#if defined(ARM_MATH_NEON) && !defined(ARM_MATH_AUTOVECTORIZE)
+    uint32x4_t vecA, vecB;
+
+    /* Compute 4 outputs at a time */
+    blkCnt = blockSize >> 2U;
+
+    while (blkCnt > 0U)
+    {
+        vecA = vld1q_u32(pSrcA);
+        vecB = vld1q_u32(pSrcB);
+
+        vst1q_u32(pDst, veorq_u32(vecA, vecB) );
+
+        pSrcA += 4;
+        pSrcB += 4;
+        pDst  += 4;
+
+        /* Decrement the loop counter */
+        blkCnt--;
+    }
+
+    /* Tail: leftover samples go through the scalar loop below */
+    blkCnt = blockSize & 3;
+#else
+    /* Initialize blkCnt with number of samples */
+    blkCnt = blockSize;
+#endif
+
+    /* Scalar loop: one output per iteration */
+    while (blkCnt > 0U)
+    {
+        *pDst++ = (*pSrcA++)^(*pSrcB++);
+
+        /* Decrement the loop counter */
+        blkCnt--;
+    }
+#endif /* if defined(ARM_MATH_MVEI) */
+}
+
+/**
+  @} end of Xor group
+ */
--- a/libraries/cmsis/dsp/Source/BasicMathFunctions/arm_xor_u8.c
+++ b/libraries/cmsis/dsp/Source/BasicMathFunctions/arm_xor_u8.c
@@ -0,0 +1,129 @@
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS DSP Library
+ * Title:        arm_xor_u8.c
+ * Description:  uint8_t bitwise exclusive OR
+ *
+ * $Date:        14 November 2019
+ * $Revision:    V1.6.0
+ *
+ * Target Processor: Cortex-M cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "arm_math.h"
+
+/**
+  @ingroup groupMath
+ */
+
+/**
+  @addtogroup Xor
+  @{
+ */
+
+/**
+  @brief         Compute the logical bitwise XOR of two fixed-point vectors.
+  @param[in]     pSrcA      points to input vector A
+  @param[in]     pSrcB      points to input vector B
+  @param[out]    pDst       points to output vector
+  @param[in]     blockSize  number of samples in each vector
+  @return        none
+ */
+
+void arm_xor_u8(
+    const uint8_t * pSrcA,
+    const uint8_t * pSrcB,
+          uint8_t * pDst,
+          uint32_t blockSize)
+{
+    /* pDst[i] = pSrcA[i] ^ pSrcB[i] for i in [0, blockSize). */
+    uint32_t blkCnt;      /* Loop counter */
+
+#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
+    /* Unsigned vectors: they must match the uint8_t loads and veorq_u8 */
+    uint8x16_t vecSrcA, vecSrcB;
+
+    /* Compute 16 outputs at a time */
+    blkCnt = blockSize >> 4;
+
+    while (blkCnt > 0U)
+    {
+        vecSrcA = vld1q(pSrcA);
+        vecSrcB = vld1q(pSrcB);
+
+        vst1q(pDst, veorq_u8(vecSrcA, vecSrcB) );
+
+        pSrcA += 16;
+        pSrcB += 16;
+        pDst  += 16;
+
+        /* Decrement the loop counter */
+        blkCnt--;
+    }
+
+    /* Tail: 1..15 leftover samples under a lane predicate */
+    blkCnt = blockSize & 0xF;
+
+    if (blkCnt > 0U)
+    {
+        mve_pred16_t p0 = vctp8q(blkCnt);
+
+        /* Predicated loads: nothing is read past the last input sample */
+        vecSrcA = vld1q_z(pSrcA, p0);
+        vecSrcB = vld1q_z(pSrcB, p0);
+        vstrbq_p(pDst, veorq_u8(vecSrcA, vecSrcB), p0);
+    }
+#else
+#if defined(ARM_MATH_NEON) && !defined(ARM_MATH_AUTOVECTORIZE)
+    uint8x16_t vecA, vecB;
+
+    /* Compute 16 outputs at a time */
+    blkCnt = blockSize >> 4U;
+
+    while (blkCnt > 0U)
+    {
+        vecA = vld1q_u8(pSrcA);
+        vecB = vld1q_u8(pSrcB);
+
+        vst1q_u8(pDst, veorq_u8(vecA, vecB) );
+
+        pSrcA += 16;
+        pSrcB += 16;
+        pDst  += 16;
+
+        /* Decrement the loop counter */
+        blkCnt--;
+    }
+
+    /* Tail: leftover samples go through the scalar loop below */
+    blkCnt = blockSize & 0xF;
+#else
+    /* Initialize blkCnt with number of samples */
+    blkCnt = blockSize;
+#endif
+
+    /* Scalar loop: one output per iteration */
+    while (blkCnt > 0U)
+    {
+        *pDst++ = (*pSrcA++)^(*pSrcB++);
+
+        /* Decrement the loop counter */
+        blkCnt--;
+    }
+#endif /* if defined(ARM_MATH_MVEI) */
+}
+
+/**
+  @} end of Xor group
+ */
--- a/libraries/cmsis/dsp/Source/BayesFunctions/BayesFunctions.c
+++ b/libraries/cmsis/dsp/Source/BayesFunctions/BayesFunctions.c
@@ -0,0 +1,29 @@
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS DSP Library
+ * Title:        BayesFunctions.c
+ * Description:  Combination of all bayes function source files.
+ *
+ * $Date:        16. March 2020
+ * $Revision:    V1.0.0
+ *
+ * Target Processor: Cortex-M cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2020 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "arm_gaussian_naive_bayes_predict_f32.c"
--- a/libraries/cmsis/dsp/Source/BayesFunctions/CMakeLists.txt
+++ b/libraries/cmsis/dsp/Source/BayesFunctions/CMakeLists.txt
@@ -0,0 +1,19 @@
+# Build script for the CMSISDSPBayes static library.
+cmake_minimum_required (VERSION 3.6)
+
+project(CMSISDSPBayes)
+
+# Pull in the modules that provide configLib() and configDsp(), called below.
+# NOTE(review): both modules live outside this directory - presumably found
+# via CMAKE_MODULE_PATH set by the parent project; confirm.
+include(configLib)
+include(configDsp)
+
+# Collect the C sources in this directory whose name contains an underscore.
+# The pattern does not match the aggregate file BayesFunctions.c, which
+# itself #includes arm_gaussian_naive_bayes_predict_f32.c.
+file(GLOB SRC "./*_*.c")
+
+add_library(CMSISDSPBayes STATIC ${SRC})
+
+# Apply the shared library / DSP configuration to the target.
+# NOTE(review): ROOT is not defined in this file - assumed to be set by the
+# including project; confirm.
+configLib(CMSISDSPBayes ${ROOT})
+configDsp(CMSISDSPBayes ${ROOT})
+
+### Includes
+# NOTE(review): DSP is likewise expected to be defined by the parent project.
+target_include_directories(CMSISDSPBayes PUBLIC "${DSP}/Include")
+
+
+
--- a/libraries/cmsis/dsp/Source/BayesFunctions/arm_gaussian_naive_bayes_predict_f32.c
+++ b/libraries/cmsis/dsp/Source/BayesFunctions/arm_gaussian_naive_bayes_predict_f32.c
@@ -0,0 +1,397 @@
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS DSP Library
+ * Title:        arm_gaussian_naive_bayes_predict_f32.c
+ * Description:  Naive Gaussian Bayesian Estimator
+ *
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "arm_math.h"
+#include <limits.h>
+#include <math.h>
+
+#define PI_F 3.1415926535897932384626433832795f
+#define DPI_F (2.0f*3.1415926535897932384626433832795f)
+
+/**
+ * @addtogroup groupBayes
+ * @{
+ */
+
+/**
+ * @brief Naive Gaussian Bayesian Estimator
+ *
+ * @param[in]  *S         points to a naive bayes instance structure
+ * @param[in]  *in        points to the elements of the input vector.
+ * @param[in]  *pBuffer   points to a buffer of length numberOfClasses
+ * @return The predicted class
+ *
+ * @par The MVE version computes the log priors directly into pBuffer,
+ * so its stack usage does not grow with the number of classes.
+ *
+ */
+
+#if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE)
+
+#include "arm_helium_utils.h"
+#include "arm_vec_math.h"
+
+uint32_t arm_gaussian_naive_bayes_predict_f32(const arm_gaussian_naive_bayes_instance_f32 *S,
+   const float32_t * in,
+   float32_t *pBuffer)
+{
+    uint32_t         nbClass;
+    /* theta and sigma are walked linearly: vectorDimension entries per class */
+    const float32_t *pTheta = S->theta;
+    const float32_t *pSigma = S->sigma;
+    float32_t      *buffer = pBuffer;
+    const float32_t *pIn;
+    float32_t       result;
+    f32x4_t         vsigma;
+    float32_t       tmp;
+    f32x4_t         vacc1, vacc2;
+    uint32_t        index;
+
+    /*
+     * Seed pBuffer[c] with log(classPriors[c]). Each entry is read back and
+     * overwritten with the final score of its class in the loop below, so no
+     * separate array sized by numberOfClasses is needed on the stack.
+     */
+    arm_vlog_f32((float32_t *) S->classPriors, pBuffer, S->numberOfClasses);
+
+    for (nbClass = 0; nbClass < S->numberOfClasses; nbClass++) {
+        pIn = in;
+
+        /* vacc1 accumulates log(2*pi*sigma), vacc2 accumulates (in - theta)^2 / sigma */
+        vacc1 = vdupq_n_f32(0);
+        vacc2 = vdupq_n_f32(0);
+
+        uint32_t         blkCnt =S->vectorDimension >> 2;
+        while (blkCnt > 0U) {
+            f32x4_t         vinvSigma, vtmp;
+
+            vsigma = vaddq_n_f32(vld1q(pSigma), S->epsilon);
+            vacc1 = vaddq(vacc1, vlogq_f32(vmulq_n_f32(vsigma, 2.0f * PI)));
+
+            vinvSigma = vrecip_medprec_f32(vsigma);
+
+            vtmp = vsubq(vld1q(pIn), vld1q(pTheta));
+            /* squaring */
+            vtmp = vmulq(vtmp, vtmp);
+
+            vacc2 = vfmaq(vacc2, vtmp, vinvSigma);
+
+            pIn += 4;
+            pTheta += 4;
+            pSigma += 4;
+            blkCnt--;
+        }
+
+        /* Tail: 1..3 leftover dimensions; only the first blkCnt lanes are accumulated */
+        blkCnt = S->vectorDimension & 3;
+        if (blkCnt > 0U) {
+            mve_pred16_t    p0 = vctp32q(blkCnt);
+            f32x4_t         vinvSigma, vtmp;
+
+            vsigma = vaddq_n_f32(vld1q(pSigma), S->epsilon);
+            vacc1 =
+                vaddq_m_f32(vacc1, vacc1, vlogq_f32(vmulq_n_f32(vsigma, 2.0f * PI)), p0);
+
+            vinvSigma = vrecip_medprec_f32(vsigma);
+
+            vtmp = vsubq(vld1q(pIn), vld1q(pTheta));
+            /* squaring */
+            vtmp = vmulq(vtmp, vtmp);
+
+            vacc2 = vfmaq_m_f32(vacc2, vtmp, vinvSigma, p0);
+
+            /* Step over the partial vector so the next class starts at its own data */
+            pTheta += blkCnt;
+            pSigma += blkCnt;
+        }
+
+        tmp = -0.5f * vecAddAcrossF32Mve(vacc1);
+        tmp -= 0.5f * vecAddAcrossF32Mve(vacc2);
+
+        /* *buffer still holds log(prior) for this class; add it to the score */
+        *buffer = tmp + *buffer;
+        buffer++;
+    }
+
+    /* The predicted class is the index of the largest score */
+    arm_max_f32(pBuffer, S->numberOfClasses, &result, &index);
+
+    return (index);
+}
+
+#else
+
+#if defined(ARM_MATH_NEON)
+
+#include "NEMath.h"
+
+
+
+uint32_t arm_gaussian_naive_bayes_predict_f32(const arm_gaussian_naive_bayes_instance_f32 *S,
+   const float32_t * in,
+   float32_t *pBuffer)
+{
+
+    const float32_t *pPrior = S->classPriors;
+
+    const float32_t *pTheta = S->theta;
+    const float32_t *pSigma = S->sigma;
+
+    const float32_t *pTheta1 = S->theta + S->vectorDimension;
+    const float32_t *pSigma1 = S->sigma + S->vectorDimension;
+
+    float32_t *buffer = pBuffer;
+    const float32_t *pIn=in;
+
+    float32_t result;
+    float32_t sigma,sigma1;
+    float32_t tmp,tmp1;
+    uint32_t index;
+    uint32_t vecBlkCnt;
+    uint32_t classBlkCnt;
+    float32x4_t epsilonV;
+    float32x4_t sigmaV,sigmaV1;
+    float32x4_t tmpV,tmpVb,tmpV1;
+    float32x2_t tmpV2;
+    float32x4_t thetaV,thetaV1;
+    float32x4_t inV;
+
+    epsilonV = vdupq_n_f32(S->epsilon);
+
+    classBlkCnt = S->numberOfClasses >> 1;
+    while(classBlkCnt > 0)
+    {
+
+
+        pIn = in;
+
+        tmp = logf(*pPrior++);
+        tmp1 = logf(*pPrior++);
+        tmpV = vdupq_n_f32(0.0f);
+        tmpV1 = vdupq_n_f32(0.0f);
+
+        vecBlkCnt = S->vectorDimension >> 2;
+        while(vecBlkCnt > 0)
+        {
+           sigmaV = vld1q_f32(pSigma);
+           thetaV = vld1q_f32(pTheta);
+
+           sigmaV1 = vld1q_f32(pSigma1);
+           thetaV1 = vld1q_f32(pTheta1);
+
+           inV = vld1q_f32(pIn);
+
+           sigmaV = vaddq_f32(sigmaV, epsilonV);
+           sigmaV1 = vaddq_f32(sigmaV1, epsilonV);
+
+           tmpVb = vmulq_n_f32(sigmaV,DPI_F);
+           tmpVb = vlogq_f32(tmpVb);
+           tmpV = vmlsq_n_f32(tmpV,tmpVb,0.5f);
+
+           tmpVb = vmulq_n_f32(sigmaV1,DPI_F);
+           tmpVb = vlogq_f32(tmpVb);
+           tmpV1 = vmlsq_n_f32(tmpV1,tmpVb,0.5f);
+
+           tmpVb = vsubq_f32(inV,thetaV);
+           tmpVb = vmulq_f32(tmpVb,tmpVb);
+           tmpVb = vmulq_f32(tmpVb, vinvq_f32(sigmaV));
+           tmpV = vmlsq_n_f32(tmpV,tmpVb,0.5f);
+
+           tmpVb = vsubq_f32(inV,thetaV1);
+           tmpVb = vmulq_f32(tmpVb,tmpVb);
+           tmpVb = vmulq_f32(tmpVb, vinvq_f32(sigmaV1));
+           tmpV1 = vmlsq_n_f32(tmpV1,tmpVb,0.5f);
+
+           pIn += 4;
+           pTheta += 4;
+           pSigma += 4;
+           pTheta1 += 4;
+           pSigma1 += 4;
+
+           vecBlkCnt--;
+        }
+        tmpV2 = vpadd_f32(vget_low_f32(tmpV),vget_high_f32(tmpV));
+        tmp += vget_lane_f32(tmpV2, 0) + vget_lane_f32(tmpV2, 1);
+
+        tmpV2 = vpadd_f32(vget_low_f32(tmpV1),vget_high_f32(tmpV1));
+        tmp1 += vget_lane_f32(tmpV2, 0) + vget_lane_f32(tmpV2, 1);
+
+        vecBlkCnt = S->vectorDimension & 3;
+        while(vecBlkCnt > 0)
+        {
+           sigma = *pSigma + S->epsilon;
+           sigma1 = *pSigma1 + S->epsilon;
+
+           tmp -= 0.5f*logf(2.0f * PI_F * sigma);
+           tmp -= 0.5f*(*pIn - *pTheta) * (*pIn - *pTheta) / sigma;
+
+           tmp1 -= 0.5f*logf(2.0f * PI_F * sigma1);
+           tmp1 -= 0.5f*(*pIn - *pTheta1) * (*pIn - *pTheta1) / sigma1;
+
+           pIn++;
+           pTheta++;
+           pSigma++;
+           pTheta1++;
+           pSigma1++;
+           vecBlkCnt--;
+        }
+
+        *buffer++ = tmp;
+        *buffer++ = tmp1;
+
+        pSigma += S->vectorDimension;
+        pTheta += S->vectorDimension;
+        pSigma1 += S->vectorDimension;
+        pTheta1 += S->vectorDimension;
+
+        classBlkCnt--;
+    }
+
+    classBlkCnt = S->numberOfClasses & 1;
+
+    while(classBlkCnt > 0)
+    {
+
+
+        pIn = in;
+
+        tmp = logf(*pPrior++);
+        tmpV = vdupq_n_f32(0.0f);
+
+        vecBlkCnt = S->vectorDimension >> 2;
+        while(vecBlkCnt > 0)
+        {
+           sigmaV = vld1q_f32(pSigma);
+           thetaV = vld1q_f32(pTheta);
+           inV = vld1q_f32(pIn);
+
+           sigmaV = vaddq_f32(sigmaV, epsilonV);
+
+           tmpVb = vmulq_n_f32(sigmaV,DPI_F);
+           tmpVb = vlogq_f32(tmpVb);
+           tmpV = vmlsq_n_f32(tmpV,tmpVb,0.5f);
+
+           tmpVb = vsubq_f32(inV,thetaV);
+           tmpVb = vmulq_f32(tmpVb,tmpVb);
+           tmpVb = vmulq_f32(tmpVb, vinvq_f32(sigmaV));
+           tmpV = vmlsq_n_f32(tmpV,tmpVb,0.5f);
+
+           pIn += 4;
+           pTheta += 4;
+           pSigma += 4;
+
+           vecBlkCnt--;
+        }
+        tmpV2 = vpadd_f32(vget_low_f32(tmpV),vget_high_f32(tmpV));
+        tmp += vget_lane_f32(tmpV2, 0) + vget_lane_f32(tmpV2, 1);
+
+        vecBlkCnt = S->vectorDimension & 3;
+        while(vecBlkCnt > 0)
+        {
+           sigma = *pSigma + S->epsilon;
+           tmp -= 0.5f*logf(2.0f * PI_F * sigma);
+           tmp -= 0.5f*(*pIn - *pTheta) * (*pIn - *pTheta) / sigma;
+
+           pIn++;
+           pTheta++;
+           pSigma++;
+           vecBlkCnt--;
+        }
+
+        *buffer++ = tmp;
+
+        classBlkCnt--;
+    }
+
+    arm_max_f32(pBuffer,S->numberOfClasses,&result,&index);
+
+    return(index);
+}
+
+#else
+
+/**
+ * @brief Naive Gaussian Bayesian Estimator (scalar implementation)
+ *
+ * For every class the routine accumulates, over all vector dimensions,
+ *   log(2 * PI * sigma)   and   (in - theta)^2 / sigma
+ * where sigma is the stored sigma value plus S->epsilon. The per-class score
+ *   -0.5 * (sum of logs) - 0.5 * (sum of quadratic terms) + log(class prior)
+ * is written to pBuffer, and the index of the largest score is returned.
+ *
+ * @param[in]  *S         points to a naive bayes instance structure
+ * @param[in]  *in        points to the elements of the input vector
+ *                        (S->vectorDimension values are read for each class).
+ * @param[in]  *pBuffer   points to a buffer of length numberOfClasses; it is
+ *                        overwritten with one score per class.
+ * @return The predicted class (index reported by arm_max_f32 over pBuffer)
+ *
+ */
+uint32_t arm_gaussian_naive_bayes_predict_f32(const arm_gaussian_naive_bayes_instance_f32 *S,
+   const float32_t * in,
+   float32_t *pBuffer)
+{
+    /* These three cursors run once through the instance tables; theta and
+       sigma are laid out class after class, vectorDimension values each. */
+    const float32_t *prior = S->classPriors;
+    const float32_t *mean = S->theta;
+    const float32_t *variance = S->sigma;
+    float32_t *score = pBuffer;
+    float32_t bestScore;
+    uint32_t bestClass;
+    uint32_t classIdx;
+    uint32_t dimIdx;
+
+    for (classIdx = 0; classIdx < S->numberOfClasses; classIdx++)
+    {
+        /* The same input vector is re-read from its start for every class. */
+        const float32_t *sample = in;
+        float32_t logSum = 0.0f;
+        float32_t quadSum = 0.0f;
+        float32_t logLikelihood;
+
+        for (dimIdx = 0; dimIdx < S->vectorDimension; dimIdx++)
+        {
+            /* epsilon is added to every sigma before it is used as divisor */
+            const float32_t sigma = *variance++ + S->epsilon;
+
+            logSum += logf(2.0f * PI_F * sigma);
+            quadSum += (*sample - *mean) * (*sample - *mean) / sigma;
+
+            sample++;
+            mean++;
+        }
+
+        logLikelihood = -0.5f * logSum;
+        logLikelihood -= 0.5f * quadSum;
+
+        *score++ = logLikelihood + logf(*prior++);
+    }
+
+    /* Only the index of the maximum is returned; its value is discarded. */
+    arm_max_f32(pBuffer, S->numberOfClasses, &bestScore, &bestClass);
+
+    return(bestClass);
+}
+
+#endif
+#endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
+
+/**
+ * @} end of groupBayes group
+ */
--- a/libraries/cmsis/dsp/Source/CMakeLists.txt
+++ b/libraries/cmsis/dsp/Source/CMakeLists.txt
@@ -0,0 +1,280 @@
+cmake_minimum_required (VERSION 3.6)
+# CMP0077 (option() honours an existing normal variable) only exists in
+# CMake >= 3.13. Setting a policy unknown to the running CMake is a hard
+# error, so it is guarded to stay usable with the 3.6 minimum declared above.
+if (POLICY CMP0077)
+  cmake_policy(SET CMP0077 NEW)
+endif()
+project(CMSISDSP)
+
+# DSP Sources
+# ROOT is not set in this file; it is expected to be provided by the
+# including project before this script is processed.
+SET(DSP ${ROOT}/CMSIS/DSP)
+
+# Make the helper modules (configLib, configDsp, ...) findable by include().
+list(APPEND CMAKE_MODULE_PATH ${DSP}/Source)
+list(APPEND CMAKE_MODULE_PATH ${DSP})
+
+
+include(configLib)
+
+
+# Build-wide switches. Every option() declares a boolean switch; the last
+# argument is its default value.
+option(NEON "Neon acceleration" OFF)
+option(NEONEXPERIMENTAL "Neon experimental acceleration" OFF)
+option(LOOPUNROLL "Loop unrolling" ON)
+option(ROUNDING "Rounding" OFF)
+option(MATRIXCHECK "Matrix Checks" OFF)
+option(HELIUM "Helium acceleration (MVEF and MVEI supported)" OFF)
+option(MVEF "MVEF intrinsics supported" OFF)
+option(MVEI "MVEI intrinsics supported" OFF)
+
+# Select which parts of the CMSIS-DSP must be compiled.
+# There are some dependencies between the parts, but they are not tracked
+# by this CMake script. Enabling some functions may therefore require
+# enabling others as well.
+option(BASICMATH            "Basic Math Functions"              ON)
+option(COMPLEXMATH          "Complex Math Functions"            ON)
+option(CONTROLLER           "Controller Functions"              ON)
+option(FASTMATH             "Fast Math Functions"               ON)
+option(FILTERING            "Filtering Functions"               ON)
+option(MATRIX               "Matrix Functions"                  ON)
+option(STATISTICS           "Statistics Functions"              ON)
+option(SUPPORT              "Support Functions"                 ON)
+option(TRANSFORM            "Transform Functions"               ON)
+option(SVM                  "Support Vector Machine Functions"  ON)
+option(BAYES                "Bayesian Estimators"               ON)
+option(DISTANCE             "Distance Functions"                ON)
+
+# OFF is the default behavior: all tables are included.
+option(CONFIGTABLE          "Configuration of table allowed"    OFF)
+
+# When CONFIGTABLE is ON, select if all interpolation tables must be included
+option(ALLFAST              "All interpolation tables included" OFF)
+# When CONFIGTABLE is ON, select if all FFT tables must be included
+option(ALLFFT               "All fft tables included"           OFF)
+
+# Features which require inclusion of a data table.
+# Since some tables may be big, the corresponding feature can be
+# disabled.
+# Those options are taken into account only when CONFIGTABLE is ON
+
+# cos / sin / sin-cos features
+option(ARM_COS_F32          "cos f32"                           OFF)
+option(ARM_COS_Q31          "cos q31"                           OFF)
+option(ARM_COS_Q15          "cos q15"                           OFF)
+option(ARM_SIN_F32          "sin f32"                           OFF)
+option(ARM_SIN_Q31          "sin q31"                           OFF)
+option(ARM_SIN_Q15          "sin q15"                           OFF)
+option(ARM_SIN_COS_F32      "sin cos f32"                       OFF)
+option(ARM_SIN_COS_Q31      "sin cos q31"                       OFF)
+
+# LMS norm features
+option(ARM_LMS_NORM_Q31     "lms norm q31"                      OFF)
+option(ARM_LMS_NORM_Q15     "lms norm q15"                      OFF)
+
+# cfft f64, one option per FFT length
+option(CFFT_F64_16          "cfft f64 16"                       OFF)
+option(CFFT_F64_32          "cfft f64 32"                       OFF)
+option(CFFT_F64_64          "cfft f64 64"                       OFF)
+option(CFFT_F64_128         "cfft f64 128"                      OFF)
+option(CFFT_F64_256         "cfft f64 256"                      OFF)
+option(CFFT_F64_512         "cfft f64 512"                      OFF)
+option(CFFT_F64_1024        "cfft f64 1024"                     OFF)
+option(CFFT_F64_2048        "cfft f64 2048"                     OFF)
+option(CFFT_F64_4096        "cfft f64 4096"                     OFF)
+
+# cfft f32, one option per FFT length
+option(CFFT_F32_16          "cfft f32 16"                       OFF)
+option(CFFT_F32_32          "cfft f32 32"                       OFF)
+option(CFFT_F32_64          "cfft f32 64"                       OFF)
+option(CFFT_F32_128         "cfft f32 128"                      OFF)
+option(CFFT_F32_256         "cfft f32 256"                      OFF)
+option(CFFT_F32_512         "cfft f32 512"                      OFF)
+option(CFFT_F32_1024        "cfft f32 1024"                     OFF)
+option(CFFT_F32_2048        "cfft f32 2048"                     OFF)
+option(CFFT_F32_4096        "cfft f32 4096"                     OFF)
+
+# cfft q31, one option per FFT length
+option(CFFT_Q31_16          "cfft q31 16"                       OFF)
+option(CFFT_Q31_32          "cfft q31 32"                       OFF)
+option(CFFT_Q31_64          "cfft q31 64"                       OFF)
+option(CFFT_Q31_128         "cfft q31 128"                      OFF)
+option(CFFT_Q31_256         "cfft q31 256"                      OFF)
+option(CFFT_Q31_512         "cfft q31 512"                      OFF)
+option(CFFT_Q31_1024        "cfft q31 1024"                     OFF)
+option(CFFT_Q31_2048        "cfft q31 2048"                     OFF)
+option(CFFT_Q31_4096        "cfft q31 4096"                     OFF)
+
+# cfft q15, one option per FFT length
+option(CFFT_Q15_16          "cfft q15 16"                       OFF)
+option(CFFT_Q15_32          "cfft q15 32"                       OFF)
+option(CFFT_Q15_64          "cfft q15 64"                       OFF)
+option(CFFT_Q15_128         "cfft q15 128"                      OFF)
+option(CFFT_Q15_256         "cfft q15 256"                      OFF)
+option(CFFT_Q15_512         "cfft q15 512"                      OFF)
+option(CFFT_Q15_1024        "cfft q15 1024"                     OFF)
+option(CFFT_Q15_2048        "cfft q15 2048"                     OFF)
+option(CFFT_Q15_4096        "cfft q15 4096"                     OFF)
+
+# rfft fast f32, one option per FFT length
+option(RFFT_FAST_F32_32     "rfft fast f32 32"                  OFF)
+option(RFFT_FAST_F32_64     "rfft fast f32 64"                  OFF)
+option(RFFT_FAST_F32_128    "rfft fast f32 128"                 OFF)
+option(RFFT_FAST_F32_256    "rfft fast f32 256"                 OFF)
+option(RFFT_FAST_F32_512    "rfft fast f32 512"                 OFF)
+option(RFFT_FAST_F32_1024   "rfft fast f32 1024"                OFF)
+option(RFFT_FAST_F32_2048   "rfft fast f32 2048"                OFF)
+option(RFFT_FAST_F32_4096   "rfft fast f32 4096"                OFF)
+
+
+# rfft f32 (only the lengths listed here are offered)
+option(RFFT_F32_128         "rfft f32 128"                      OFF)
+option(RFFT_F32_512         "rfft f32 512"                      OFF)
+option(RFFT_F32_2048        "rfft f32 2048"                     OFF)
+option(RFFT_F32_8192        "rfft f32 8192"                     OFF)
+
+# rfft fast f64, one option per FFT length
+option(RFFT_FAST_F64_32     "rfft fast f64 32"                  OFF)
+option(RFFT_FAST_F64_64     "rfft fast f64 64"                  OFF)
+option(RFFT_FAST_F64_128    "rfft fast f64 128"                 OFF)
+option(RFFT_FAST_F64_256    "rfft fast f64 256"                 OFF)
+option(RFFT_FAST_F64_512    "rfft fast f64 512"                 OFF)
+option(RFFT_FAST_F64_1024   "rfft fast f64 1024"                OFF)
+option(RFFT_FAST_F64_2048   "rfft fast f64 2048"                OFF)
+option(RFFT_FAST_F64_4096   "rfft fast f64 4096"                OFF)
+
+
+# rfft f64 (only the lengths listed here are offered)
+option(RFFT_F64_128         "rfft f64 128"                      OFF)
+option(RFFT_F64_512         "rfft f64 512"                      OFF)
+option(RFFT_F64_2048        "rfft f64 2048"                     OFF)
+option(RFFT_F64_8192        "rfft f64 8192"                     OFF)
+
+# rfft q31, one option per FFT length
+option(RFFT_Q31_32          "rfft q31 32"                       OFF)
+option(RFFT_Q31_64          "rfft q31 64"                       OFF)
+option(RFFT_Q31_128         "rfft q31 128"                      OFF)
+option(RFFT_Q31_256         "rfft q31 256"                      OFF)
+option(RFFT_Q31_512         "rfft q31 512"                      OFF)
+option(RFFT_Q31_1024        "rfft q31 1024"                     OFF)
+option(RFFT_Q31_2048        "rfft q31 2048"                     OFF)
+option(RFFT_Q31_4096        "rfft q31 4096"                     OFF)
+option(RFFT_Q31_8192        "rfft q31 8192"                     OFF)
+
+# rfft q15, one option per FFT length
+option(RFFT_Q15_32          "rfft q15 32"                       OFF)
+option(RFFT_Q15_64          "rfft q15 64"                       OFF)
+option(RFFT_Q15_128         "rfft q15 128"                      OFF)
+option(RFFT_Q15_256         "rfft q15 256"                      OFF)
+option(RFFT_Q15_512         "rfft q15 512"                      OFF)
+option(RFFT_Q15_1024        "rfft q15 1024"                     OFF)
+option(RFFT_Q15_2048        "rfft q15 2048"                     OFF)
+option(RFFT_Q15_4096        "rfft q15 4096"                     OFF)
+option(RFFT_Q15_8192        "rfft q15 8192"                     OFF)
+
+# dct4 f32
+option(DCT4_F32_128          "dct4 f32 128"                     OFF)
+option(DCT4_F32_512          "dct4 f32 512"                     OFF)
+option(DCT4_F32_2048         "dct4 f32 2048"                    OFF)
+option(DCT4_F32_8192         "dct4 f32 8192"                    OFF)
+
+# dct4 q31
+option(DCT4_Q31_128          "dct4 q31 128"                     OFF)
+option(DCT4_Q31_512          "dct4 q31 512"                     OFF)
+option(DCT4_Q31_2048         "dct4 q31 2048"                    OFF)
+option(DCT4_Q31_8192         "dct4 q31 8192"                    OFF)
+
+# dct4 q15
+option(DCT4_Q15_128          "dct4 q15 128"                     OFF)
+option(DCT4_Q15_512          "dct4 q15 512"                     OFF)
+option(DCT4_Q15_2048         "dct4 q15 2048"                    OFF)
+option(DCT4_Q15_8192         "dct4 q15 8192"                    OFF)
+
+
+###########################
+#
+# CMSIS DSP
+#
+###########################
+
+
+
+# CMSISDSP is an INTERFACE target: it has no sources of its own and only
+# forwards the component libraries linked below plus the include directory.
+add_library(CMSISDSP INTERFACE)
+
+# Pattern used for every component: when its option is ON, process the
+# component's subdirectory and link the component library into CMSISDSP.
+# The component targets (CMSISDSPBasicMath, ...) are presumably created by
+# the CMakeLists.txt of the respective subdirectory.
+if (BASICMATH)
+  add_subdirectory(BasicMathFunctions)
+  target_link_libraries(CMSISDSP INTERFACE CMSISDSPBasicMath)
+endif()
+
+if (COMPLEXMATH)
+  add_subdirectory(ComplexMathFunctions)
+  target_link_libraries(CMSISDSP INTERFACE CMSISDSPComplexMath)
+endif()
+
+if (CONTROLLER)
+  add_subdirectory(ControllerFunctions)
+  # Fast tables inclusion is allowed
+  if (CONFIGTABLE)
+    target_compile_definitions(CMSISDSPController PUBLIC ARM_FAST_ALLOW_TABLES)
+  endif()
+  target_link_libraries(CMSISDSP INTERFACE CMSISDSPController)
+endif()
+
+if (FASTMATH)
+  add_subdirectory(FastMathFunctions)
+  # Fast tables inclusion is allowed
+  if (CONFIGTABLE)
+    target_compile_definitions(CMSISDSPFastMath PUBLIC ARM_FAST_ALLOW_TABLES)
+  endif()
+  target_link_libraries(CMSISDSP INTERFACE CMSISDSPFastMath)
+endif()
+
+if (FILTERING)
+  add_subdirectory(FilteringFunctions)
+  # Fast tables inclusion is allowed
+  if (CONFIGTABLE)
+    target_compile_definitions(CMSISDSPFiltering PUBLIC ARM_FAST_ALLOW_TABLES)
+  endif()
+  target_link_libraries(CMSISDSP INTERFACE CMSISDSPFiltering)
+endif()
+
+if (MATRIX)
+  add_subdirectory(MatrixFunctions)
+  target_link_libraries(CMSISDSP INTERFACE CMSISDSPMatrix)
+endif()
+
+if (STATISTICS)
+  add_subdirectory(StatisticsFunctions)
+  target_link_libraries(CMSISDSP INTERFACE CMSISDSPStatistics)
+endif()
+
+if (SUPPORT)
+  add_subdirectory(SupportFunctions)
+  target_link_libraries(CMSISDSP INTERFACE CMSISDSPSupport)
+endif()
+
+if (TRANSFORM)
+  add_subdirectory(TransformFunctions)
+  # FFT tables inclusion is allowed
+  if (CONFIGTABLE)
+    target_compile_definitions(CMSISDSPTransform PUBLIC ARM_FFT_ALLOW_TABLES)
+  endif()
+  target_link_libraries(CMSISDSP INTERFACE CMSISDSPTransform)
+endif()
+
+# The common tables library is only built when at least one of the listed
+# components is enabled.
+# NOTE(review): BAYES is not part of this condition; confirm that the Bayes
+# functions (including their Neon path) never need CMSISDSPCommon.
+if (FILTERING OR CONTROLLER OR FASTMATH OR TRANSFORM OR SVM OR DISTANCE)
+  add_subdirectory(CommonTables)
+  if (TRANSFORM)
+    # FFT tables inclusion is allowed
+    if (CONFIGTABLE)
+      target_compile_definitions(CMSISDSPCommon PUBLIC ARM_FFT_ALLOW_TABLES)
+    endif()
+  endif()
+  if (FILTERING OR CONTROLLER OR FASTMATH)
+    # Select which tables to include
+    if (CONFIGTABLE)
+      target_compile_definitions(CMSISDSPCommon PUBLIC ARM_FAST_ALLOW_TABLES)
+    endif()
+  endif()
+  target_link_libraries(CMSISDSP INTERFACE CMSISDSPCommon)
+  # The Common project adds the ComputeLibrary tables used by SVM and Distance
+  # when Neon is ON.
+endif()
+
+if (SVM)
+  add_subdirectory(SVMFunctions)
+  target_link_libraries(CMSISDSP INTERFACE CMSISDSPSVM)
+endif()
+
+if (BAYES)
+  add_subdirectory(BayesFunctions)
+  target_link_libraries(CMSISDSP INTERFACE CMSISDSPBayes)
+endif()
+
+if (DISTANCE)
+  add_subdirectory(DistanceFunctions)
+  target_link_libraries(CMSISDSP INTERFACE CMSISDSPDistance)
+endif()
+
+### Includes
+# Propagated to every target that links against CMSISDSP.
+target_include_directories(CMSISDSP INTERFACE "${DSP}/Include")
+
+
+
--- a/libraries/cmsis/dsp/Source/CommonTables/CMakeLists.txt
+++ b/libraries/cmsis/dsp/Source/CommonTables/CMakeLists.txt
@@ -0,0 +1,41 @@
+# Builds the static library CMSISDSPCommon holding the shared lookup tables
+# and the pre-initialized constant structures.
+cmake_minimum_required (VERSION 3.6)
+
+project(CMSISDSPCommon)
+
+# Helper modules; they are looked up through CMAKE_MODULE_PATH and their
+# contents are not visible from this file.
+include(configLib)
+include(configDsp)
+
+add_library(CMSISDSPCommon STATIC arm_common_tables.c)
+
+# Presumably apply the common compiler / DSP settings to the target -
+# confirm in configLib.cmake and configDsp.cmake.
+configLib(CMSISDSPCommon ${ROOT})
+configDsp(CMSISDSPCommon ${ROOT})
+
+# ALLFFT and ALLFAST are only honoured when CONFIGTABLE is ON as well.
+if (CONFIGTABLE AND ALLFFT)
+    target_compile_definitions(CMSISDSPCommon PUBLIC ARM_ALL_FFT_TABLES) 
+endif()
+
+if (CONFIGTABLE AND ALLFAST)
+    target_compile_definitions(CMSISDSPCommon PUBLIC ARM_ALL_FAST_TABLES) 
+endif()
+
+# Presumably translate the per-table options (CFFT_*, RFFT_*, ARM_SIN_*, ...)
+# into compile definitions on the target - confirm in fft.cmake and
+# interpol.cmake.
+include(fft)
+fft(CMSISDSPCommon)
+
+include(interpol)
+interpol(CMSISDSPCommon)
+
+target_sources(CMSISDSPCommon PRIVATE arm_const_structs.c)
+
+
+### Includes
+target_include_directories(CMSISDSPCommon PUBLIC "${DSP}/Include")
+
+# Extra table sources that are only compiled for specific acceleration modes.
+if (NEON OR NEONEXPERIMENTAL)
+    target_sources(CMSISDSPCommon PRIVATE "${DSP}/ComputeLibrary/Source/arm_cl_tables.c")
+endif()
+
+# NOTE(review): MVEI on its own does not add arm_mve_tables.c here; confirm
+# whether an integer-only MVE build needs these tables too.
+if (HELIUM OR MVEF)
+    target_sources(CMSISDSPCommon PRIVATE "${DSP}/Source/CommonTables/arm_mve_tables.c")
+endif()
+
+
--- a/libraries/cmsis/dsp/Source/CommonTables/CommonTables.c
+++ b/libraries/cmsis/dsp/Source/CommonTables/CommonTables.c
@@ -0,0 +1,31 @@
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS DSP Library
+ * Title:        CommonTables.c
+ * Description:  Combination of all common table source files.
+ *
+ * $Date:        08. January 2020
+ * $Revision:    V1.1.0
+ *
+ * Target Processor: Cortex-M cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2019-2020 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "arm_common_tables.c"
+#include "arm_const_structs.c"
+#include "arm_mve_tables.c"
--- a/libraries/cmsis/dsp/Source/CommonTables/arm_common_tables.c
+++ b/libraries/cmsis/dsp/Source/CommonTables/arm_common_tables.c
--- a/libraries/cmsis/dsp/Source/CommonTables/arm_const_structs.c
+++ b/libraries/cmsis/dsp/Source/CommonTables/arm_const_structs.c
@@ -0,0 +1,663 @@
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS DSP Library
+ * Title:        arm_const_structs.c
+ * Description:  Constant structs that are initialized for user convenience.
+ *               For example, some can be given as arguments to the arm_cfft_f32() or arm_rfft_f32() functions.
+ *
+ * $Date:        27. January 2017
+ * $Revision:    V.1.5.1
+ *
+ * Target Processor: Cortex-M cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "arm_math.h"
+#include "arm_const_structs.h"
+
+/*
+ALLOW TABLE is true when config table is enabled and the Transform folder is included
+for compilation.
+*/
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_FFT_ALLOW_TABLES)
+
+/* Double-precision (f64) floating-point CFFT structs */
+/*
+ * One constant instance per FFT length (16 to 4096). Each initializer lists,
+ * in order: the FFT length, the twiddle coefficient table, the bit-reversal
+ * index table and the length of that bit-reversal table.
+ *
+ * An instance is compiled when table configuration is not in use
+ * (ARM_DSP_CONFIG_TABLES undefined), when all FFT tables are requested
+ * (ARM_ALL_FFT_TABLES), or when both tables named in its guard are enabled.
+ */
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F64_16) && defined(ARM_TABLE_BITREVIDX_FLT64_16))
+const arm_cfft_instance_f64 arm_cfft_sR_f64_len16 = {
+  16, (const float64_t *)twiddleCoefF64_16, armBitRevIndexTableF64_16, ARMBITREVINDEXTABLEF64_16_TABLE_LENGTH
+};
+#endif
+
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F64_32) && defined(ARM_TABLE_BITREVIDX_FLT64_32))
+const arm_cfft_instance_f64 arm_cfft_sR_f64_len32 = {
+  32, (const float64_t *)twiddleCoefF64_32, armBitRevIndexTableF64_32, ARMBITREVINDEXTABLEF64_32_TABLE_LENGTH
+};
+#endif
+
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F64_64) && defined(ARM_TABLE_BITREVIDX_FLT64_64))
+const arm_cfft_instance_f64 arm_cfft_sR_f64_len64 = {
+  64, (const float64_t *)twiddleCoefF64_64, armBitRevIndexTableF64_64, ARMBITREVINDEXTABLEF64_64_TABLE_LENGTH
+};
+#endif
+
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F64_128) && defined(ARM_TABLE_BITREVIDX_FLT64_128))
+const arm_cfft_instance_f64 arm_cfft_sR_f64_len128 = {
+  128, (const float64_t *)twiddleCoefF64_128, armBitRevIndexTableF64_128, ARMBITREVINDEXTABLEF64_128_TABLE_LENGTH
+};
+#endif
+
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F64_256) && defined(ARM_TABLE_BITREVIDX_FLT64_256))
+const arm_cfft_instance_f64 arm_cfft_sR_f64_len256 = {
+  256, (const float64_t *)twiddleCoefF64_256, armBitRevIndexTableF64_256, ARMBITREVINDEXTABLEF64_256_TABLE_LENGTH
+};
+#endif
+
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F64_512) && defined(ARM_TABLE_BITREVIDX_FLT64_512))
+const arm_cfft_instance_f64 arm_cfft_sR_f64_len512 = {
+  512, (const float64_t *)twiddleCoefF64_512, armBitRevIndexTableF64_512, ARMBITREVINDEXTABLEF64_512_TABLE_LENGTH
+};
+#endif
+
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F64_1024) && defined(ARM_TABLE_BITREVIDX_FLT64_1024))
+const arm_cfft_instance_f64 arm_cfft_sR_f64_len1024 = {
+  1024, (const float64_t *)twiddleCoefF64_1024, armBitRevIndexTableF64_1024, ARMBITREVINDEXTABLEF64_1024_TABLE_LENGTH
+};
+#endif
+
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F64_2048) && defined(ARM_TABLE_BITREVIDX_FLT64_2048))
+const arm_cfft_instance_f64 arm_cfft_sR_f64_len2048 = {
+  2048, (const float64_t *)twiddleCoefF64_2048, armBitRevIndexTableF64_2048, ARMBITREVINDEXTABLEF64_2048_TABLE_LENGTH
+};
+#endif
+
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F64_4096) && defined(ARM_TABLE_BITREVIDX_FLT64_4096))
+const arm_cfft_instance_f64 arm_cfft_sR_f64_len4096 = {
+  4096, (const float64_t *)twiddleCoefF64_4096, armBitRevIndexTableF64_4096, ARMBITREVINDEXTABLEF64_4096_TABLE_LENGTH
+};
+#endif
+
+/* Single-precision (f32) floating-point CFFT structs */
+#if !defined(ARM_MATH_MVEF) || defined(ARM_MATH_AUTOVECTORIZE)
+
+/*
+
+These structures cannot be used to initialize the MVE version of the FFT F32 instances.
+So they are not compiled when MVE is defined.
+
+For the MVE version, the new arm_cfft_init_f32 must be used.
+
+Each initializer lists, in order: the FFT length, the twiddle coefficient
+table, the bit-reversal index table and the length of that bit-reversal table.
+
+*/
+
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F32_16) && defined(ARM_TABLE_BITREVIDX_FLT_16))
+const arm_cfft_instance_f32 arm_cfft_sR_f32_len16 = {
+  16, twiddleCoef_16, armBitRevIndexTable16, ARMBITREVINDEXTABLE_16_TABLE_LENGTH
+};
+#endif
+
+
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F32_32) && defined(ARM_TABLE_BITREVIDX_FLT_32))
+const arm_cfft_instance_f32 arm_cfft_sR_f32_len32 = {
+  32, twiddleCoef_32, armBitRevIndexTable32, ARMBITREVINDEXTABLE_32_TABLE_LENGTH
+};
+#endif
+
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F32_64) && defined(ARM_TABLE_BITREVIDX_FLT_64))
+const arm_cfft_instance_f32 arm_cfft_sR_f32_len64 = {
+  64, twiddleCoef_64, armBitRevIndexTable64, ARMBITREVINDEXTABLE_64_TABLE_LENGTH
+};
+#endif
+
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F32_128) && defined(ARM_TABLE_BITREVIDX_FLT_128))
+const arm_cfft_instance_f32 arm_cfft_sR_f32_len128 = {
+  128, twiddleCoef_128, armBitRevIndexTable128, ARMBITREVINDEXTABLE_128_TABLE_LENGTH
+};
+#endif
+
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F32_256) && defined(ARM_TABLE_BITREVIDX_FLT_256))
+const arm_cfft_instance_f32 arm_cfft_sR_f32_len256 = {
+  256, twiddleCoef_256, armBitRevIndexTable256, ARMBITREVINDEXTABLE_256_TABLE_LENGTH
+};
+#endif
+
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F32_512) && defined(ARM_TABLE_BITREVIDX_FLT_512))
+const arm_cfft_instance_f32 arm_cfft_sR_f32_len512 = {
+  512, twiddleCoef_512, armBitRevIndexTable512, ARMBITREVINDEXTABLE_512_TABLE_LENGTH
+};
+#endif
+
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F32_1024) && defined(ARM_TABLE_BITREVIDX_FLT_1024))
+const arm_cfft_instance_f32 arm_cfft_sR_f32_len1024 = {
+  1024, twiddleCoef_1024, armBitRevIndexTable1024, ARMBITREVINDEXTABLE_1024_TABLE_LENGTH
+};
+#endif
+
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F32_2048) && defined(ARM_TABLE_BITREVIDX_FLT_2048))
+const arm_cfft_instance_f32 arm_cfft_sR_f32_len2048 = {
+  2048, twiddleCoef_2048, armBitRevIndexTable2048, ARMBITREVINDEXTABLE_2048_TABLE_LENGTH
+};
+#endif
+
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F32_4096) && defined(ARM_TABLE_BITREVIDX_FLT_4096))
+const arm_cfft_instance_f32 arm_cfft_sR_f32_len4096 = {
+  4096, twiddleCoef_4096, armBitRevIndexTable4096, ARMBITREVINDEXTABLE_4096_TABLE_LENGTH
+};
+#endif
+
+#endif /* !defined(ARM_MATH_MVEF) || defined(ARM_MATH_AUTOVECTORIZE) */
+
+/* Fixed-point (Q31 and Q15) CFFT structs */
+
+#if !defined(ARM_MATH_MVEI)
+
+/*
+
+These structures cannot be used to initialize the MVE version of the FFT Q31 / Q15 instances.
+So they are not compiled when MVE (ARM_MATH_MVEI) is defined.
+
+For the MVE version, the matching init function must be used instead
+(presumably arm_cfft_init_q31 / arm_cfft_init_q15; the original comment named
+arm_cfft_init_f32 - confirm).
+
+Each initializer lists, in order: the FFT length, the twiddle coefficient
+table, the bit-reversal index table and the length of that bit-reversal table.
+
+*/
+
+/* Q31 instances */
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_Q31_16) && defined(ARM_TABLE_BITREVIDX_FXT_16))
+const arm_cfft_instance_q31 arm_cfft_sR_q31_len16 = {
+  16, twiddleCoef_16_q31, armBitRevIndexTable_fixed_16, ARMBITREVINDEXTABLE_FIXED_16_TABLE_LENGTH
+};
+#endif
+
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_Q31_32) && defined(ARM_TABLE_BITREVIDX_FXT_32))
+const arm_cfft_instance_q31 arm_cfft_sR_q31_len32 = {
+  32, twiddleCoef_32_q31, armBitRevIndexTable_fixed_32, ARMBITREVINDEXTABLE_FIXED_32_TABLE_LENGTH
+};
+#endif
+
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_Q31_64) && defined(ARM_TABLE_BITREVIDX_FXT_64))
+const arm_cfft_instance_q31 arm_cfft_sR_q31_len64 = {
+  64, twiddleCoef_64_q31, armBitRevIndexTable_fixed_64, ARMBITREVINDEXTABLE_FIXED_64_TABLE_LENGTH
+};
+#endif
+
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_Q31_128) && defined(ARM_TABLE_BITREVIDX_FXT_128))
+const arm_cfft_instance_q31 arm_cfft_sR_q31_len128 = {
+  128, twiddleCoef_128_q31, armBitRevIndexTable_fixed_128, ARMBITREVINDEXTABLE_FIXED_128_TABLE_LENGTH
+};
+#endif
+
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_Q31_256) && defined(ARM_TABLE_BITREVIDX_FXT_256))
+const arm_cfft_instance_q31 arm_cfft_sR_q31_len256 = {
+  256, twiddleCoef_256_q31, armBitRevIndexTable_fixed_256, ARMBITREVINDEXTABLE_FIXED_256_TABLE_LENGTH
+};
+#endif
+
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_Q31_512) && defined(ARM_TABLE_BITREVIDX_FXT_512))
+const arm_cfft_instance_q31 arm_cfft_sR_q31_len512 = {
+  512, twiddleCoef_512_q31, armBitRevIndexTable_fixed_512, ARMBITREVINDEXTABLE_FIXED_512_TABLE_LENGTH
+};
+#endif
+
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_Q31_1024) && defined(ARM_TABLE_BITREVIDX_FXT_1024))
+const arm_cfft_instance_q31 arm_cfft_sR_q31_len1024 = {
+  1024, twiddleCoef_1024_q31, armBitRevIndexTable_fixed_1024, ARMBITREVINDEXTABLE_FIXED_1024_TABLE_LENGTH
+};
+#endif
+
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_Q31_2048) && defined(ARM_TABLE_BITREVIDX_FXT_2048))
+const arm_cfft_instance_q31 arm_cfft_sR_q31_len2048 = {
+  2048, twiddleCoef_2048_q31, armBitRevIndexTable_fixed_2048, ARMBITREVINDEXTABLE_FIXED_2048_TABLE_LENGTH
+};
+#endif
+
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_Q31_4096) && defined(ARM_TABLE_BITREVIDX_FXT_4096))
+const arm_cfft_instance_q31 arm_cfft_sR_q31_len4096 = {
+  4096, twiddleCoef_4096_q31, armBitRevIndexTable_fixed_4096, ARMBITREVINDEXTABLE_FIXED_4096_TABLE_LENGTH
+};
+#endif
+
+
+/* Q15 instances: own twiddle tables, same armBitRevIndexTable_fixed_N
+   bit-reversal tables as the Q31 instances above. */
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_Q15_16) && defined(ARM_TABLE_BITREVIDX_FXT_16))
+const arm_cfft_instance_q15 arm_cfft_sR_q15_len16 = {
+  16, twiddleCoef_16_q15, armBitRevIndexTable_fixed_16, ARMBITREVINDEXTABLE_FIXED_16_TABLE_LENGTH
+};
+#endif
+
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_Q15_32) && defined(ARM_TABLE_BITREVIDX_FXT_32))
+const arm_cfft_instance_q15 arm_cfft_sR_q15_len32 = {
+  32, twiddleCoef_32_q15, armBitRevIndexTable_fixed_32, ARMBITREVINDEXTABLE_FIXED_32_TABLE_LENGTH
+};
+#endif
+
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_Q15_64) && defined(ARM_TABLE_BITREVIDX_FXT_64))
+const arm_cfft_instance_q15 arm_cfft_sR_q15_len64 = {
+  64, twiddleCoef_64_q15, armBitRevIndexTable_fixed_64, ARMBITREVINDEXTABLE_FIXED_64_TABLE_LENGTH
+};
+#endif
+
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_Q15_128) && defined(ARM_TABLE_BITREVIDX_FXT_128))
+const arm_cfft_instance_q15 arm_cfft_sR_q15_len128 = {
+  128, twiddleCoef_128_q15, armBitRevIndexTable_fixed_128, ARMBITREVINDEXTABLE_FIXED_128_TABLE_LENGTH
+};
+#endif
+
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_Q15_256) && defined(ARM_TABLE_BITREVIDX_FXT_256))
+const arm_cfft_instance_q15 arm_cfft_sR_q15_len256 = {
+  256, twiddleCoef_256_q15, armBitRevIndexTable_fixed_256, ARMBITREVINDEXTABLE_FIXED_256_TABLE_LENGTH
+};
+#endif
+
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_Q15_512) && defined(ARM_TABLE_BITREVIDX_FXT_512))
+const arm_cfft_instance_q15 arm_cfft_sR_q15_len512 = {
+  512, twiddleCoef_512_q15, armBitRevIndexTable_fixed_512, ARMBITREVINDEXTABLE_FIXED_512_TABLE_LENGTH
+};
+#endif
+
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_Q15_1024) && defined(ARM_TABLE_BITREVIDX_FXT_1024))
+const arm_cfft_instance_q15 arm_cfft_sR_q15_len1024 = {
+  1024, twiddleCoef_1024_q15, armBitRevIndexTable_fixed_1024, ARMBITREVINDEXTABLE_FIXED_1024_TABLE_LENGTH
+};
+#endif
+
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_Q15_2048) && defined(ARM_TABLE_BITREVIDX_FXT_2048))
+const arm_cfft_instance_q15 arm_cfft_sR_q15_len2048 = {
+  2048, twiddleCoef_2048_q15, armBitRevIndexTable_fixed_2048, ARMBITREVINDEXTABLE_FIXED_2048_TABLE_LENGTH
+};
+#endif
+
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_Q15_4096) && defined(ARM_TABLE_BITREVIDX_FXT_4096))
+const arm_cfft_instance_q15 arm_cfft_sR_q15_len4096 = {
+  4096, twiddleCoef_4096_q15, armBitRevIndexTable_fixed_4096, ARMBITREVINDEXTABLE_FIXED_4096_TABLE_LENGTH
+};
+#endif
+
+#endif /* !defined(ARM_MATH_MVEI) */
+
+/* Structure for real-value inputs */
+/* Double precision structs */
+
+/* Constant fast real-FFT instances for double precision, real lengths 32 to
+   4096. Each instance embeds a complex FFT description of half the real
+   length (length, F64 twiddle table, F64 bit-reversal index table and that
+   index table's length macro), followed by the real length and the real-FFT
+   twiddle table for that real length.
+   When tables are selected individually, each guard tests the twiddle and
+   bit-reversal macros of the half-length complex FFT, because those are the
+   tables the initializer references, plus the real-FFT twiddle macro of the
+   full length. */
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F64_16) && defined(ARM_TABLE_BITREVIDX_FLT64_16) && defined(ARM_TABLE_TWIDDLECOEF_RFFT_F64_32))
+const arm_rfft_fast_instance_f64 arm_rfft_fast_sR_f64_len32 = {
+  { 16, (const float64_t *)twiddleCoefF64_16, armBitRevIndexTableF64_16, ARMBITREVINDEXTABLEF64_16_TABLE_LENGTH },
+  32U,
+  (float64_t *)twiddleCoefF64_rfft_32
+};
+#endif
+
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F64_32) && defined(ARM_TABLE_BITREVIDX_FLT64_32) && defined(ARM_TABLE_TWIDDLECOEF_RFFT_F64_64))
+const arm_rfft_fast_instance_f64 arm_rfft_fast_sR_f64_len64 = {
+  { 32, (const float64_t *)twiddleCoefF64_32, armBitRevIndexTableF64_32, ARMBITREVINDEXTABLEF64_32_TABLE_LENGTH },
+  64U,
+  (float64_t *)twiddleCoefF64_rfft_64
+};
+#endif
+
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F64_64) && defined(ARM_TABLE_BITREVIDX_FLT64_64) && defined(ARM_TABLE_TWIDDLECOEF_RFFT_F64_128))
+const arm_rfft_fast_instance_f64 arm_rfft_fast_sR_f64_len128 = {
+  { 64, (const float64_t *)twiddleCoefF64_64, armBitRevIndexTableF64_64, ARMBITREVINDEXTABLEF64_64_TABLE_LENGTH },
+  128U,
+  (float64_t *)twiddleCoefF64_rfft_128
+};
+#endif
+
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F64_128) && defined(ARM_TABLE_BITREVIDX_FLT64_128) && defined(ARM_TABLE_TWIDDLECOEF_RFFT_F64_256))
+const arm_rfft_fast_instance_f64 arm_rfft_fast_sR_f64_len256 = {
+  { 128, (const float64_t *)twiddleCoefF64_128, armBitRevIndexTableF64_128, ARMBITREVINDEXTABLEF64_128_TABLE_LENGTH },
+  256U,
+  (float64_t *)twiddleCoefF64_rfft_256
+};
+#endif
+
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F64_256) && defined(ARM_TABLE_BITREVIDX_FLT64_256) && defined(ARM_TABLE_TWIDDLECOEF_RFFT_F64_512))
+const arm_rfft_fast_instance_f64 arm_rfft_fast_sR_f64_len512 = {
+  { 256, (const float64_t *)twiddleCoefF64_256, armBitRevIndexTableF64_256, ARMBITREVINDEXTABLEF64_256_TABLE_LENGTH },
+  512U,
+  (float64_t *)twiddleCoefF64_rfft_512
+};
+#endif
+
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F64_512) && defined(ARM_TABLE_BITREVIDX_FLT64_512) && defined(ARM_TABLE_TWIDDLECOEF_RFFT_F64_1024))
+const arm_rfft_fast_instance_f64 arm_rfft_fast_sR_f64_len1024 = {
+  { 512, (const float64_t *)twiddleCoefF64_512, armBitRevIndexTableF64_512, ARMBITREVINDEXTABLEF64_512_TABLE_LENGTH },
+  1024U,
+  (float64_t *)twiddleCoefF64_rfft_1024
+};
+#endif
+
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F64_1024) && defined(ARM_TABLE_BITREVIDX_FLT64_1024) && defined(ARM_TABLE_TWIDDLECOEF_RFFT_F64_2048))
+const arm_rfft_fast_instance_f64 arm_rfft_fast_sR_f64_len2048 = {
+  { 1024, (const float64_t *)twiddleCoefF64_1024, armBitRevIndexTableF64_1024, ARMBITREVINDEXTABLEF64_1024_TABLE_LENGTH },
+  2048U,
+  (float64_t *)twiddleCoefF64_rfft_2048
+};
+#endif
+
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F64_2048) && defined(ARM_TABLE_BITREVIDX_FLT64_2048) && defined(ARM_TABLE_TWIDDLECOEF_RFFT_F64_4096))
+const arm_rfft_fast_instance_f64 arm_rfft_fast_sR_f64_len4096 = {
+  { 2048, (const float64_t *)twiddleCoefF64_2048, armBitRevIndexTableF64_2048, ARMBITREVINDEXTABLEF64_2048_TABLE_LENGTH },
+  4096U,
+  (float64_t *)twiddleCoefF64_rfft_4096
+};
+#endif
+
+/* Floating-point structs */
+
+#if !defined(ARM_MATH_MVEF) || defined(ARM_MATH_AUTOVECTORIZE)
+
+/* Constant fast real-FFT instances for single precision, real lengths 32 to
+   4096. Each instance embeds a complex FFT description of half the real
+   length (length, F32 twiddle table, bit-reversal index table and that index
+   table's length macro), followed by the real length and the real-FFT
+   twiddle table for that real length.
+   When tables are selected individually, each guard tests the twiddle and
+   bit-reversal macros of the half-length complex FFT, because those are the
+   tables the initializer references, plus the real-FFT twiddle macro of the
+   full length. */
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F32_16) && defined(ARM_TABLE_BITREVIDX_FLT_16) && defined(ARM_TABLE_TWIDDLECOEF_RFFT_F32_32))
+const arm_rfft_fast_instance_f32 arm_rfft_fast_sR_f32_len32 = {
+  { 16, twiddleCoef_16, armBitRevIndexTable16, ARMBITREVINDEXTABLE_16_TABLE_LENGTH },
+  32U,
+  (float32_t *)twiddleCoef_rfft_32
+};
+#endif
+
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F32_32) && defined(ARM_TABLE_BITREVIDX_FLT_32) && defined(ARM_TABLE_TWIDDLECOEF_RFFT_F32_64))
+const arm_rfft_fast_instance_f32 arm_rfft_fast_sR_f32_len64 = {
+  { 32, twiddleCoef_32, armBitRevIndexTable32, ARMBITREVINDEXTABLE_32_TABLE_LENGTH },
+  64U,
+  (float32_t *)twiddleCoef_rfft_64
+};
+#endif
+
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F32_64) && defined(ARM_TABLE_BITREVIDX_FLT_64) && defined(ARM_TABLE_TWIDDLECOEF_RFFT_F32_128))
+const arm_rfft_fast_instance_f32 arm_rfft_fast_sR_f32_len128 = {
+  { 64, twiddleCoef_64, armBitRevIndexTable64, ARMBITREVINDEXTABLE_64_TABLE_LENGTH },
+  128U,
+  (float32_t *)twiddleCoef_rfft_128
+};
+#endif
+
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F32_128) && defined(ARM_TABLE_BITREVIDX_FLT_128) && defined(ARM_TABLE_TWIDDLECOEF_RFFT_F32_256))
+const arm_rfft_fast_instance_f32 arm_rfft_fast_sR_f32_len256 = {
+  { 128, twiddleCoef_128, armBitRevIndexTable128, ARMBITREVINDEXTABLE_128_TABLE_LENGTH },
+  256U,
+  (float32_t *)twiddleCoef_rfft_256
+};
+#endif
+
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F32_256) && defined(ARM_TABLE_BITREVIDX_FLT_256) && defined(ARM_TABLE_TWIDDLECOEF_RFFT_F32_512))
+const arm_rfft_fast_instance_f32 arm_rfft_fast_sR_f32_len512 = {
+  { 256, twiddleCoef_256, armBitRevIndexTable256, ARMBITREVINDEXTABLE_256_TABLE_LENGTH },
+  512U,
+  (float32_t *)twiddleCoef_rfft_512
+};
+#endif
+
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F32_512) && defined(ARM_TABLE_BITREVIDX_FLT_512) && defined(ARM_TABLE_TWIDDLECOEF_RFFT_F32_1024))
+const arm_rfft_fast_instance_f32 arm_rfft_fast_sR_f32_len1024 = {
+  { 512, twiddleCoef_512, armBitRevIndexTable512, ARMBITREVINDEXTABLE_512_TABLE_LENGTH },
+  1024U,
+  (float32_t *)twiddleCoef_rfft_1024
+};
+#endif
+
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F32_1024) && defined(ARM_TABLE_BITREVIDX_FLT_1024) && defined(ARM_TABLE_TWIDDLECOEF_RFFT_F32_2048))
+const arm_rfft_fast_instance_f32 arm_rfft_fast_sR_f32_len2048 = {
+  { 1024, twiddleCoef_1024, armBitRevIndexTable1024, ARMBITREVINDEXTABLE_1024_TABLE_LENGTH },
+  2048U,
+  (float32_t *)twiddleCoef_rfft_2048
+};
+#endif
+
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F32_2048) && defined(ARM_TABLE_BITREVIDX_FLT_2048) && defined(ARM_TABLE_TWIDDLECOEF_RFFT_F32_4096))
+const arm_rfft_fast_instance_f32 arm_rfft_fast_sR_f32_len4096 = {
+  { 2048, twiddleCoef_2048, armBitRevIndexTable2048, ARMBITREVINDEXTABLE_2048_TABLE_LENGTH },
+  4096U,
+  (float32_t *)twiddleCoef_rfft_4096
+};
+#endif
+
+#endif /* #if !defined(ARM_MATH_MVEF) || defined(ARM_MATH_AUTOVECTORIZE) */
+
+/* Fixed-point structs */
+/* q31_t */
+
+#if !defined(ARM_MATH_MVEI)
+
+/*
+
+These structures cannot be used to initialize the MVE versions of the fixed-point (Q31 and Q15) RFFT instances,
+so they are not compiled when MVE is defined.
+
+For the MVE version, the corresponding arm_rfft_init_q31 / arm_rfft_init_q15 functions must be used.
+
+
+*/
+
+/* Constant real-FFT instances for Q31 data, real lengths 32 to 8192.
+   Each initializer lists seven values in order: the real length, 0, 1, a
+   step value, the realCoefAQ31 and realCoefBQ31 tables, and a pointer to the
+   complex FFT instance of half the real length.
+   The step value halves each time the length doubles (256 for length 32 down
+   to 1 for length 8192), so length * step is always 8192.
+   NOTE(review): the 0 and 1 are presumably the inverse-transform and
+   bit-reversal flags of arm_rfft_instance_q31 - confirm against arm_math.h.
+   Unless ARM_DSP_CONFIG_TABLES is undefined or ARM_ALL_FFT_TABLES is defined,
+   each guard requires ARM_TABLE_REALCOEF_Q31 plus the twiddle and
+   bit-reversal table macros of the half-length complex FFT. */
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_REALCOEF_Q31) && defined(ARM_TABLE_TWIDDLECOEF_Q31_16) && defined(ARM_TABLE_BITREVIDX_FXT_16))
+const arm_rfft_instance_q31 arm_rfft_sR_q31_len32 = {
+  32U,
+  0,
+  1,
+  256U,
+  (q31_t*)realCoefAQ31,
+  (q31_t*)realCoefBQ31,
+  &arm_cfft_sR_q31_len16
+};
+#endif
+
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_REALCOEF_Q31) && defined(ARM_TABLE_TWIDDLECOEF_Q31_32) && defined(ARM_TABLE_BITREVIDX_FXT_32))
+const arm_rfft_instance_q31 arm_rfft_sR_q31_len64 = {
+  64U,
+  0,
+  1,
+  128U,
+  (q31_t*)realCoefAQ31,
+  (q31_t*)realCoefBQ31,
+  &arm_cfft_sR_q31_len32
+};
+#endif
+
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_REALCOEF_Q31) && defined(ARM_TABLE_TWIDDLECOEF_Q31_64) && defined(ARM_TABLE_BITREVIDX_FXT_64))
+const arm_rfft_instance_q31 arm_rfft_sR_q31_len128 = {
+  128U,
+  0,
+  1,
+  64U,
+  (q31_t*)realCoefAQ31,
+  (q31_t*)realCoefBQ31,
+  &arm_cfft_sR_q31_len64
+};
+#endif
+
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_REALCOEF_Q31) && defined(ARM_TABLE_TWIDDLECOEF_Q31_128) && defined(ARM_TABLE_BITREVIDX_FXT_128))
+const arm_rfft_instance_q31 arm_rfft_sR_q31_len256 = {
+  256U,
+  0,
+  1,
+  32U,
+  (q31_t*)realCoefAQ31,
+  (q31_t*)realCoefBQ31,
+  &arm_cfft_sR_q31_len128
+};
+#endif
+
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_REALCOEF_Q31) && defined(ARM_TABLE_TWIDDLECOEF_Q31_256) && defined(ARM_TABLE_BITREVIDX_FXT_256))
+const arm_rfft_instance_q31 arm_rfft_sR_q31_len512 = {
+  512U,
+  0,
+  1,
+  16U,
+  (q31_t*)realCoefAQ31,
+  (q31_t*)realCoefBQ31,
+  &arm_cfft_sR_q31_len256
+};
+#endif
+
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_REALCOEF_Q31) && defined(ARM_TABLE_TWIDDLECOEF_Q31_512) && defined(ARM_TABLE_BITREVIDX_FXT_512))
+const arm_rfft_instance_q31 arm_rfft_sR_q31_len1024 = {
+  1024U,
+  0,
+  1,
+  8U,
+  (q31_t*)realCoefAQ31,
+  (q31_t*)realCoefBQ31,
+  &arm_cfft_sR_q31_len512
+};
+#endif
+
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_REALCOEF_Q31) && defined(ARM_TABLE_TWIDDLECOEF_Q31_1024) && defined(ARM_TABLE_BITREVIDX_FXT_1024))
+const arm_rfft_instance_q31 arm_rfft_sR_q31_len2048 = {
+  2048U,
+  0,
+  1,
+  4U,
+  (q31_t*)realCoefAQ31,
+  (q31_t*)realCoefBQ31,
+  &arm_cfft_sR_q31_len1024
+};
+#endif
+
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_REALCOEF_Q31) && defined(ARM_TABLE_TWIDDLECOEF_Q31_2048) && defined(ARM_TABLE_BITREVIDX_FXT_2048))
+const arm_rfft_instance_q31 arm_rfft_sR_q31_len4096 = {
+  4096U,
+  0,
+  1,
+  2U,
+  (q31_t*)realCoefAQ31,
+  (q31_t*)realCoefBQ31,
+  &arm_cfft_sR_q31_len2048
+};
+#endif
+
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_REALCOEF_Q31) && defined(ARM_TABLE_TWIDDLECOEF_Q31_4096) && defined(ARM_TABLE_BITREVIDX_FXT_4096))
+const arm_rfft_instance_q31 arm_rfft_sR_q31_len8192 = {
+  8192U,
+  0,
+  1,
+  1U,
+  (q31_t*)realCoefAQ31,
+  (q31_t*)realCoefBQ31,
+  &arm_cfft_sR_q31_len4096
+};
+#endif
+
+
+
+/* q15_t */
+/* Constant real-FFT instances for Q15 data, real lengths 32 to 8192.
+   Each initializer lists seven values in order: the real length, 0, 1, a
+   step value, the realCoefAQ15 and realCoefBQ15 tables, and a pointer to the
+   complex FFT instance of half the real length.
+   The step value halves each time the length doubles (256 for length 32 down
+   to 1 for length 8192), so length * step is always 8192.
+   NOTE(review): the 0 and 1 are presumably the inverse-transform and
+   bit-reversal flags of arm_rfft_instance_q15 - confirm against arm_math.h.
+   Unless ARM_DSP_CONFIG_TABLES is undefined or ARM_ALL_FFT_TABLES is defined,
+   each guard requires ARM_TABLE_REALCOEF_Q15 plus the twiddle and
+   bit-reversal table macros of the half-length complex FFT. */
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_REALCOEF_Q15) && defined(ARM_TABLE_TWIDDLECOEF_Q15_16) && defined(ARM_TABLE_BITREVIDX_FXT_16))
+const arm_rfft_instance_q15 arm_rfft_sR_q15_len32 = {
+  32U,
+  0,
+  1,
+  256U,
+  (q15_t*)realCoefAQ15,
+  (q15_t*)realCoefBQ15,
+  &arm_cfft_sR_q15_len16
+};
+#endif
+
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_REALCOEF_Q15) && defined(ARM_TABLE_TWIDDLECOEF_Q15_32) && defined(ARM_TABLE_BITREVIDX_FXT_32))
+const arm_rfft_instance_q15 arm_rfft_sR_q15_len64 = {
+  64U,
+  0,
+  1,
+  128U,
+  (q15_t*)realCoefAQ15,
+  (q15_t*)realCoefBQ15,
+  &arm_cfft_sR_q15_len32
+};
+#endif
+
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_REALCOEF_Q15) && defined(ARM_TABLE_TWIDDLECOEF_Q15_64) && defined(ARM_TABLE_BITREVIDX_FXT_64))
+const arm_rfft_instance_q15 arm_rfft_sR_q15_len128 = {
+  128U,
+  0,
+  1,
+  64U,
+  (q15_t*)realCoefAQ15,
+  (q15_t*)realCoefBQ15,
+  &arm_cfft_sR_q15_len64
+};
+#endif
+
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_REALCOEF_Q15) && defined(ARM_TABLE_TWIDDLECOEF_Q15_128) && defined(ARM_TABLE_BITREVIDX_FXT_128))
+const arm_rfft_instance_q15 arm_rfft_sR_q15_len256 = {
+  256U,
+  0,
+  1,
+  32U,
+  (q15_t*)realCoefAQ15,
+  (q15_t*)realCoefBQ15,
+  &arm_cfft_sR_q15_len128
+};
+#endif
+
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_REALCOEF_Q15) && defined(ARM_TABLE_TWIDDLECOEF_Q15_256) && defined(ARM_TABLE_BITREVIDX_FXT_256))
+const arm_rfft_instance_q15 arm_rfft_sR_q15_len512 = {
+  512U,
+  0,
+  1,
+  16U,
+  (q15_t*)realCoefAQ15,
+  (q15_t*)realCoefBQ15,
+  &arm_cfft_sR_q15_len256
+};
+#endif
+
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_REALCOEF_Q15) && defined(ARM_TABLE_TWIDDLECOEF_Q15_512) && defined(ARM_TABLE_BITREVIDX_FXT_512))
+const arm_rfft_instance_q15 arm_rfft_sR_q15_len1024 = {
+  1024U,
+  0,
+  1,
+  8U,
+  (q15_t*)realCoefAQ15,
+  (q15_t*)realCoefBQ15,
+  &arm_cfft_sR_q15_len512
+};
+#endif
+
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_REALCOEF_Q15) && defined(ARM_TABLE_TWIDDLECOEF_Q15_1024) && defined(ARM_TABLE_BITREVIDX_FXT_1024))
+const arm_rfft_instance_q15 arm_rfft_sR_q15_len2048 = {
+  2048U,
+  0,
+  1,
+  4U,
+  (q15_t*)realCoefAQ15,
+  (q15_t*)realCoefBQ15,
+  &arm_cfft_sR_q15_len1024
+};
+#endif
+
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_REALCOEF_Q15) && defined(ARM_TABLE_TWIDDLECOEF_Q15_2048) && defined(ARM_TABLE_BITREVIDX_FXT_2048))
+const arm_rfft_instance_q15 arm_rfft_sR_q15_len4096 = {
+  4096U,
+  0,
+  1,
+  2U,
+  (q15_t*)realCoefAQ15,
+  (q15_t*)realCoefBQ15,
+  &arm_cfft_sR_q15_len2048
+};
+#endif
+
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_REALCOEF_Q15) && defined(ARM_TABLE_TWIDDLECOEF_Q15_4096) && defined(ARM_TABLE_BITREVIDX_FXT_4096))
+const arm_rfft_instance_q15 arm_rfft_sR_q15_len8192 = {
+  8192U,
+  0,
+  1,
+  1U,
+  (q15_t*)realCoefAQ15,
+  (q15_t*)realCoefBQ15,
+  &arm_cfft_sR_q15_len4096
+};
+#endif
+
+#endif /* !defined(ARM_MATH_MVEI) */
+
+
+#endif
--- a/libraries/cmsis/dsp/Source/CommonTables/arm_mve_tables.c
+++ b/libraries/cmsis/dsp/Source/CommonTables/arm_mve_tables.c
--- a/libraries/cmsis/dsp/Source/ComplexMathFunctions/CMakeLists.txt
+++ b/libraries/cmsis/dsp/Source/ComplexMathFunctions/CMakeLists.txt
@@ -0,0 +1,53 @@
+cmake_minimum_required (VERSION 3.6)
+
+project(CMSISDSPComplexMath)
+
+include(configLib)
+include(configDsp)
+
+file(GLOB SRC "./*_*.c")
+
+add_library(CMSISDSPComplexMath STATIC)
+
+configLib(CMSISDSPComplexMath ${ROOT})
+configDsp(CMSISDSPComplexMath ${ROOT})
+
+# Run the shared interpol() configuration on this library's own target.
+include(interpol)
+interpol(CMSISDSPComplexMath)
+
+if (CONFIGTABLE AND ALLFAST)
+    target_compile_definitions(CMSISDSPComplexMath PUBLIC ARM_ALL_FAST_TABLES)
+endif()
+
+# The MVE code uses a table to compute the fast sqrt in arm_cmplx_mag_q31.
+# The function can therefore be left out of the build, together with its
+# table, when tables are configured individually and it was not requested.
+if (NOT CONFIGTABLE OR ALLFAST OR ARM_CMPLX_MAG_Q31 OR (NOT HELIUM AND NOT MVEI))
+    target_sources(CMSISDSPComplexMath PRIVATE arm_cmplx_mag_q31.c)
+endif()
+
+# Same rule for the Q15 magnitude function.
+if (NOT CONFIGTABLE OR ALLFAST OR ARM_CMPLX_MAG_Q15 OR (NOT HELIUM AND NOT MVEI))
+    target_sources(CMSISDSPComplexMath PRIVATE arm_cmplx_mag_q15.c)
+endif()
+
+# Sources that are always part of the library.
+target_sources(CMSISDSPComplexMath PRIVATE
+    arm_cmplx_conj_f32.c
+    arm_cmplx_conj_q15.c
+    arm_cmplx_conj_q31.c
+    arm_cmplx_dot_prod_f32.c
+    arm_cmplx_dot_prod_q15.c
+    arm_cmplx_dot_prod_q31.c
+    arm_cmplx_mag_f32.c
+    arm_cmplx_mag_squared_f32.c
+    arm_cmplx_mag_squared_q15.c
+    arm_cmplx_mag_squared_q31.c
+    arm_cmplx_mult_cmplx_f32.c
+    arm_cmplx_mult_cmplx_q15.c
+    arm_cmplx_mult_cmplx_q31.c
+    arm_cmplx_mult_real_f32.c
+    arm_cmplx_mult_real_q15.c
+    arm_cmplx_mult_real_q31.c
+)
+
+### Includes
+target_include_directories(CMSISDSPComplexMath PUBLIC "${DSP}/Include")
--- a/libraries/cmsis/dsp/Source/ComplexMathFunctions/ComplexMathFunctions.c
+++ b/libraries/cmsis/dsp/Source/ComplexMathFunctions/ComplexMathFunctions.c
@@ -0,0 +1,46 @@
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS DSP Library
+ * Title:        ComplexMathFunctions.c
+ * Description:  Combination of all complex math function source files.
+ *
+ * $Date:        18. March 2019
+ * $Revision:    V1.0.0
+ *
+ * Target Processor: Cortex-M cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2019 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "arm_cmplx_conj_f32.c"
+#include "arm_cmplx_conj_q15.c"
+#include "arm_cmplx_conj_q31.c"
+#include "arm_cmplx_dot_prod_f32.c"
+#include "arm_cmplx_dot_prod_q15.c"
+#include "arm_cmplx_dot_prod_q31.c"
+#include "arm_cmplx_mag_f32.c"
+#include "arm_cmplx_mag_q15.c"
+#include "arm_cmplx_mag_q31.c"
+#include "arm_cmplx_mag_squared_f32.c"
+#include "arm_cmplx_mag_squared_q15.c"
+#include "arm_cmplx_mag_squared_q31.c"
+#include "arm_cmplx_mult_cmplx_f32.c"
+#include "arm_cmplx_mult_cmplx_q15.c"
+#include "arm_cmplx_mult_cmplx_q31.c"
+#include "arm_cmplx_mult_real_f32.c"
+#include "arm_cmplx_mult_real_q15.c"
+#include "arm_cmplx_mult_real_q31.c"
--- a/libraries/cmsis/dsp/Source/ComplexMathFunctions/arm_cmplx_conj_f32.c
+++ b/libraries/cmsis/dsp/Source/ComplexMathFunctions/arm_cmplx_conj_f32.c
@@ -0,0 +1,213 @@
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS DSP Library
+ * Title:        arm_cmplx_conj_f32.c
+ * Description:  Floating-point complex conjugate
+ *
+ * $Date:        18. March 2019
+ * $Revision:    V1.6.0
+ *
+ * Target Processor: Cortex-M cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "arm_math.h"
+
+/**
+  @ingroup groupCmplxMath
+ */
+
+/**
+  @defgroup cmplx_conj Complex Conjugate
+
+  Conjugates the elements of a complex data vector.
+
+  The <code>pSrc</code> points to the source data and
+  <code>pDst</code> points to the destination data where the result should be written.
+  <code>numSamples</code> specifies the number of complex samples
+  and the data in each array is stored in an interleaved fashion
+  (real, imag, real, imag, ...).
+  Each array has a total of <code>2*numSamples</code> values.
+
+  The underlying algorithm is:
+  <pre>
+  for (n = 0; n < numSamples; n++) {
+      pDst[(2*n)  ] =  pSrc[(2*n)  ];    // real part
+      pDst[(2*n)+1] = -pSrc[(2*n)+1];    // imag part
+  }
+  </pre>
+
+  There are separate functions for floating-point, Q15, and Q31 data types.
+ */
+
+/**
+  @addtogroup cmplx_conj
+  @{
+ */
+
+/**
+  @brief         Floating-point complex conjugate.
+  @param[in]     pSrc        points to the input vector
+  @param[out]    pDst        points to the output vector
+  @param[in]     numSamples  number of complex samples in each vector
+  @return        none
+ */
+
+#if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE)
+
+/* Helium (MVE) implementation: multiplies the interleaved input by the lane
+   pattern { 1, -1, 1, -1 } four floats at a time, then finishes any
+   remaining complex sample with scalar code. */
+void arm_cmplx_conj_f32(
+    const float32_t * pSrc,
+    float32_t * pDst,
+    uint32_t numSamples)
+{
+    /* Per-lane multipliers: keep the real lanes, negate the imaginary lanes */
+    static const float32_t cmplx_conj_sign[4] = { 1.0f, -1.0f, 1.0f, -1.0f };
+    uint32_t blockSize = numSamples * CMPLX_DIM;   /* total number of float values */
+    uint32_t blkCnt;                               /* loop counter */
+    f32x4_t vecSrc;
+    f32x4_t vecSign;
+
+    /*
+     * Load the sign vector with the vector-load intrinsic instead of
+     * dereferencing a casted pointer, so no vector alignment or aliasing
+     * assumption is made about the float array.
+     */
+    vecSign = vld1q(cmplx_conj_sign);
+
+    /* Compute 4 float values (real, imag, real, imag) at a time */
+    blkCnt = blockSize >> 2U;
+
+    while (blkCnt > 0U)
+    {
+        vecSrc = vld1q(pSrc);
+        vst1q(pDst, vmulq(vecSrc, vecSign));
+        /*
+         * Decrement the blkCnt loop counter
+         * Advance vector source and destination pointers
+         */
+        pSrc += 4;
+        pDst += 4;
+        blkCnt--;
+    }
+
+    /* Tail: float values left after the vector loop, two per complex sample */
+    blkCnt = (blockSize & 0x3) >> 1;
+
+    while (blkCnt > 0U)
+    {
+      /* C[0] + jC[1] = A[0]+ j(-1)A[1] */
+
+      /* Calculate Complex Conjugate and store result in destination buffer. */
+      *pDst++ =  *pSrc++;
+      *pDst++ = -*pSrc++;
+
+      /* Decrement loop counter */
+      blkCnt--;
+    }
+
+}
+
+#else
+/* Generic implementation: an optional NEON or manually unrolled main loop
+   handles four complex samples per pass, and a scalar loop handles whatever
+   is left (or everything when neither option is enabled). */
+void arm_cmplx_conj_f32(
+  const float32_t * pSrc,
+        float32_t * pDst,
+        uint32_t numSamples)
+{
+  uint32_t remaining;                            /* Complex samples still to process */
+
+#if defined(ARM_MATH_NEON) && !defined(ARM_MATH_AUTOVECTORIZE)
+  const float32x4_t vzero = vdupq_n_f32(0.0f);
+  float32x4x2_t cplx;
+
+  /* Vector loop: four complex samples (eight floats) per pass */
+  for (remaining = numSamples >> 2U; remaining > 0U; remaining--)
+  {
+    /* C[0]+jC[1] = A[0]+(-1)*jA[1] : the second lane set becomes (0 - value) */
+    cplx = vld2q_f32(pSrc);
+    cplx.val[1] = vsubq_f32(vzero, cplx.val[1]);
+    vst2q_f32(pDst, cplx);
+
+    pSrc += 8;
+    pDst += 8;
+  }
+
+  /* Samples left for the scalar loop */
+  remaining = numSamples & 0x3;
+
+#elif defined (ARM_MATH_LOOPUNROLL) && !defined(ARM_MATH_AUTOVECTORIZE)
+
+  /* Unrolled loop: four complex samples (eight floats) per pass */
+  for (remaining = numSamples >> 2U; remaining > 0U; remaining--)
+  {
+    /* C[0] + jC[1] = A[0]+ j(-1)A[1] */
+    pDst[0] =  pSrc[0];
+    pDst[1] = -pSrc[1];
+
+    pDst[2] =  pSrc[2];
+    pDst[3] = -pSrc[3];
+
+    pDst[4] =  pSrc[4];
+    pDst[5] = -pSrc[5];
+
+    pDst[6] =  pSrc[6];
+    pDst[7] = -pSrc[7];
+
+    pSrc += 8;
+    pDst += 8;
+  }
+
+  /* Samples left for the scalar loop */
+  remaining = numSamples & 0x3U;
+
+#else
+
+  /* No main loop: the scalar loop processes every sample */
+  remaining = numSamples;
+
+#endif /* NEON / ARM_MATH_LOOPUNROLL selection */
+
+  /* Scalar loop: one complex sample per pass */
+  for (; remaining > 0U; remaining--)
+  {
+    /* C[0] + jC[1] = A[0]+ j(-1)A[1] */
+    pDst[0] =  pSrc[0];
+    pDst[1] = -pSrc[1];
+
+    pSrc += 2;
+    pDst += 2;
+  }
+
+}
+#endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
+
+/**
+  @} end of cmplx_conj group
+ */
--- a/Show More
+++ b/Show More
Author	SHA1	Message	Date
Artery-MCU	3bac77751d	update version to v2.1.4	2023-02-28 10:34:18 +08:00
Artery-MCU	16720f59ce	update version to v2.1.3	2022-11-22 18:14:35 +08:00
Artery-MCU	4403cb6781	update version to v2.1.2	2022-08-26 14:37:12 +08:00
Artery-MCU	eb3198540f	update version to v2.1.1	2022-07-25 16:31:10 +08:00
Artery-MCU	25f12a4ab3	update version to v2.1.0	2022-06-30 17:10:00 +08:00
Artery-MCU	c0f81f4b0d	update version to v2.0.9	2022-05-05 14:44:27 +08:00
Artery-MCU	a89a26cea4	update version to v2.0.8	2022-04-11 19:22:17 +08:00
Artery-MCU	eb00682e95	update version to v2.0.7	2022-03-03 19:28:16 +08:00
Artery-MCU	e1d3f6e2c9	update version to v2.0.6	2022-01-21 15:43:43 +08:00
Artery-MCU	4fd69ebc78	add LICENSE.	2021-12-14 09:37:34 +00:00