changeset 379:24c3a0663d54 prerelease

Added Ne10 headers within include directory
author andrewm
date Sun, 12 Jun 2016 18:16:20 +0100
parents 8db03611ee76
children 9dc5a0ccad25
files examples/audio_in_FFT/render.cpp examples/basic_FFT_phase_vocoder/render.cpp examples/d-box/FIRfilter.h examples/d-box/main.cpp examples/filter_FIR/FIRfilter.h examples/filter_FIR/render.cpp include/ne10/NE10.h include/ne10/NE10_dsp.h include/ne10/NE10_imgproc.h include/ne10/NE10_init.h include/ne10/NE10_macros.h include/ne10/NE10_math.h include/ne10/NE10_physics.h include/ne10/NE10_types.h
diffstat 14 files changed, 2951 insertions(+), 6 deletions(-) [+]
line wrap: on
line diff
--- a/examples/audio_in_FFT/render.cpp	Sun Jun 12 18:10:33 2016 +0100
+++ b/examples/audio_in_FFT/render.cpp	Sun Jun 12 18:16:20 2016 +0100
@@ -36,7 +36,7 @@
 
 #include <Bela.h>
 #include <rtdk.h>
-#include <NE10.h>					// neon library
+#include <ne10/NE10.h>					// neon library
 #include <cmath>
 
 int gFFTSize;
--- a/examples/basic_FFT_phase_vocoder/render.cpp	Sun Jun 12 18:10:33 2016 +0100
+++ b/examples/basic_FFT_phase_vocoder/render.cpp	Sun Jun 12 18:16:20 2016 +0100
@@ -28,7 +28,7 @@
 
 #include <Bela.h>
 #include <rtdk.h>
-#include <NE10.h>					// NEON FFT library
+#include <ne10/NE10.h>					// NEON FFT library
 #include <cmath>
 #include "SampleData.h"
 #include <Midi.h>
--- a/examples/d-box/FIRfilter.h	Sun Jun 12 18:10:33 2016 +0100
+++ b/examples/d-box/FIRfilter.h	Sun Jun 12 18:16:20 2016 +0100
@@ -9,7 +9,7 @@
 #define FIRFILTER_H_
 
 #define ENABLE_NE10_FIR_FLOAT_NEON	// Define needed for Ne10 library
-#include <NE10.h>
+#include <ne10/NE10.h>
 
 //#define FILTER_TAP_NUM 21
 //ne10_float32_t filterTaps[FILTER_TAP_NUM] = {
--- a/examples/d-box/main.cpp	Sun Jun 12 18:10:33 2016 +0100
+++ b/examples/d-box/main.cpp	Sun Jun 12 18:16:20 2016 +0100
@@ -24,7 +24,7 @@
 #include <mntent.h>		// to check if device is mounted
 #include <sys/mount.h>	// mount()
 #include <sys/time.h>	// elapsed time
-#include <NE10.h>		// neon library
+#include <ne10/NE10.h>	// neon library
 
 // thread priority
 #include <pthread.h>
--- a/examples/filter_FIR/FIRfilter.h	Sun Jun 12 18:10:33 2016 +0100
+++ b/examples/filter_FIR/FIRfilter.h	Sun Jun 12 18:16:20 2016 +0100
@@ -9,7 +9,7 @@
 #define FIRFILTER_H_
 
 
-#include <NE10.h>
+#include <ne10/NE10.h>
 
 #define FILTER_TAP_NUM 31
 
--- a/examples/filter_FIR/render.cpp	Sun Jun 12 18:10:33 2016 +0100
+++ b/examples/filter_FIR/render.cpp	Sun Jun 12 18:16:20 2016 +0100
@@ -27,7 +27,7 @@
 
 #include <Bela.h>
 #include <cmath>
-#include <NE10.h>					// neon library
+#include <ne10/NE10.h>					// neon library
 #include "SampleData.h"
 #include "FIRfilter.h"
 
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/include/ne10/NE10.h	Sun Jun 12 18:16:20 2016 +0100
@@ -0,0 +1,183 @@
+/*
+ *  Copyright 2011-15 ARM Limited and Contributors.
+ *  All rights reserved.
+ *
+ *  Redistribution and use in source and binary forms, with or without
+ *  modification, are permitted provided that the following conditions are met:
+ *    * Redistributions of source code must retain the above copyright
+ *      notice, this list of conditions and the following disclaimer.
+ *    * Redistributions in binary form must reproduce the above copyright
+ *      notice, this list of conditions and the following disclaimer in the
+ *      documentation and/or other materials provided with the distribution.
+ *    * Neither the name of ARM Limited nor the
+ *      names of its contributors may be used to endorse or promote products
+ *      derived from this software without specific prior written permission.
+ *
+ *  THIS SOFTWARE IS PROVIDED BY ARM LIMITED AND CONTRIBUTORS "AS IS" AND
+ *  ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ *  WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ *  DISCLAIMED. IN NO EVENT SHALL ARM LIMITED AND CONTRIBUTORS BE LIABLE FOR ANY
+ *  DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ *  (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ *  LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ *  ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ *  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ *  SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/*
+ * NE10 Library : inc/NE10.h
+ */
+
+/**
+   \mainpage Welcome to Ne10 Documentation!
+   *
+   *
+   *\par Introduction
+   *
+   * Ne10 (http://projectne10.github.com/Ne10/) is a library of the most commonly used functions that have been heavily
+   * optimized for ARM-based CPUs with NEON. These functions provide a consistent
+   * well tested behavior that can be easily incorporated into applications enabling
+   * developers to get the most out of the ARM V7/NEON without arduous assembly coding.
+   * Ne10 is usable as a 'drop and go' pre-built library or as a set of modular functions
+   * that can be incorporated in a more modular "pick and mix" form where binary size might
+   * be an issue.
+   *
+   * The following figure illustrates the basic concepts of "What's Ne10"
+   *\image html ne10_library.png "Ne10 Library Description"
+   *
+   *\par Top-Level Overview
+   * When you checkout Ne10, you will notice a number of directories. These directories are as follows:
+   * <pre>
+   * ├── android
+   * │   └── Android reference files
+   * ├── build
+   * │   └── directory for build-related files
+   * ├── common
+   * │   └── directory for common header, table and macro definition files
+   * ├── doc
+   * │   └── directory for documentations
+   * ├── inc
+   * │   └── directory for function header files
+   * ├── modules
+   * │   ├── dsp
+   * │   │   ├── @link groupDSPs dsp module@endlink that provides a set of signal processing functions, such as complex/real FFT/IFFT, FIR and IIR
+   * │   │   └── test
+   * │   │       └──  directory for test files
+   * │   ├── imgproc
+   * │   │   ├── @link groupIMGPROCs imgproc module@endlink that provides a set of image processing functions, such as image resize, image rotate
+   * │   │   └── test
+   * │   │       └──  directory for test files
+   * │   ├── math
+   * │   │   ├── @link groupMaths math module@endlink that provides a set of vector/matrix algebra functions
+   * │   │   └── test
+   * │   │       └──  directory for test files
+   * │   ├── physics
+   * │   │   ├── @link groupPhysics physics module@endlink that provides a set of collision detection functions
+   * │   │   └── test
+   * │   │       └──  directory for test files
+   * ├── samples
+   * │   └── @link groupSamples sample code@endlink
+   * ├── test
+   * │   ├── directory for test framework
+   * ├── tools
+   * │   ├── directory for tools such as Cformatter, doxygen, etc
+   * </pre>
+   *
+   *\par Modules Description
+   * Ne10 has a modular structure, which means that the package includes several shared or static libraries.
+   * Currently, the following modules are available or planned:
+   *
+   * - @link groupMaths Math Functions@endlink
+   * - @link groupDSPs Signal Processing Functions@endlink
+   * - @link groupIMGPROCs Image Processing Functions@endlink
+   * - @link groupPhysics Physics Functions@endlink
+   * - Others
+   *
+   *\par Usage
+   *
+   * The Ne10 library provides both direct and indirect function calls. See the @link groupSamples sample code@endlink for details.
+   *
+   *\par Build
+   *
+   * See CMakeBuilding.txt file in the "doc" folder
+   *
+   *\par Code formatter
+   *
+   * See Formatter.txt file in the "doc" folder
+   *
+   *\par License
+   *
+   * Ne10 is provided free of charge by ARM Limited and Contributors, and is licensed under both the New BSD License
+   * (http://opensource.org/licenses/BSD-3-Clause) and Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0).
+   * See also the LICENSE file in the "doc" directory.
+   */
+
+
+/**
+ * @defgroup groupMaths Math Functions
+ *
+ *
+ * This set of functions provides vector/matrix algebra functions that include
+ * add, sub, multiply, div and so on. Currently, only the float (single precision)
+ * data type is supported.
+ */
+
+/**
+ * @defgroup groupDSPs Signal Processing Functions
+ *
+ *
+ * This set of functions provides some commonly used functions in signal processing,
+ * such as complex/real FFT/IFFT, FIR and IIR. Currently, only the float (single precision)
+ * data type is supported.
+ */
+
+/**
+ * @defgroup groupIMGPROCs Image Processing Functions
+ *
+ *
+ * This set of functions provides some commonly used functions in image processing,
+ * such as image scale, image rotate.
+ */
+
+/**
+ * @defgroup groupPhysics Physics Functions
+ *
+ *
+ * This set of functions provides some APIs used for collision detection,
+ * such as compute AABB, calculate relative velocity and apply contact impulse.
+ */
+
+/**
+ * @defgroup groupSamples Sample Functions
+ *
+ *
+ * This set of functions provides some sample functions.
+ */
+
+
+#ifndef NE10_H
+#define NE10_H
+/* Standard headers are pulled in before the extern "C" block opens. */
+#include <stdio.h>
+#include <stdlib.h>
+#include <math.h>
+#include <string.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include "NE10_types.h"
+#include "NE10_macros.h"
+#include "NE10_init.h"
+#include "NE10_math.h"
+#include "NE10_dsp.h"
+#include "NE10_imgproc.h"
+#include "NE10_physics.h"
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* NE10_H */
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/include/ne10/NE10_dsp.h	Sun Jun 12 18:16:20 2016 +0100
@@ -0,0 +1,411 @@
+/*
+ *  Copyright 2012-15 ARM Limited and Contributors.
+ *  All rights reserved.
+ *
+ *  Redistribution and use in source and binary forms, with or without
+ *  modification, are permitted provided that the following conditions are met:
+ *    * Redistributions of source code must retain the above copyright
+ *      notice, this list of conditions and the following disclaimer.
+ *    * Redistributions in binary form must reproduce the above copyright
+ *      notice, this list of conditions and the following disclaimer in the
+ *      documentation and/or other materials provided with the distribution.
+ *    * Neither the name of ARM Limited nor the
+ *      names of its contributors may be used to endorse or promote products
+ *      derived from this software without specific prior written permission.
+ *
+ *  THIS SOFTWARE IS PROVIDED BY ARM LIMITED AND CONTRIBUTORS "AS IS" AND
+ *  ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ *  WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ *  DISCLAIMED. IN NO EVENT SHALL ARM LIMITED AND CONTRIBUTORS BE LIABLE FOR ANY
+ *  DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ *  (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ *  LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ *  ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ *  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ *  SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/*
+ * NE10 Library : inc/NE10_dsp.h
+ */
+
+
+#include "NE10_types.h"
+
+#ifndef NE10_DSP_H
+#define NE10_DSP_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+///////////////////////////
+// function prototypes:
+///////////////////////////
+
+    /* fft functions*/
+
+    /* function pointers*/
+    extern ne10_fft_cfg_float32_t (*ne10_fft_alloc_c2c_float32) (ne10_int32_t nfft);
+    extern ne10_fft_cfg_int32_t (*ne10_fft_alloc_c2c_int32) (ne10_int32_t nfft);
+
+    extern void (*ne10_fft_c2c_1d_float32) (ne10_fft_cpx_float32_t *fout,
+                                            ne10_fft_cpx_float32_t *fin,
+                                            ne10_fft_cfg_float32_t cfg,
+                                            ne10_int32_t inverse_fft);
+
+    extern void (*ne10_fft_r2c_1d_float32) (ne10_fft_cpx_float32_t *fout,
+                                            ne10_float32_t *fin,
+                                            ne10_fft_r2c_cfg_float32_t cfg);
+
+    extern void (*ne10_fft_c2r_1d_float32) (ne10_float32_t *fout,
+                                            ne10_fft_cpx_float32_t *fin,
+                                            ne10_fft_r2c_cfg_float32_t cfg);
+
+    extern void (*ne10_fft_c2c_1d_int32) (ne10_fft_cpx_int32_t *fout,
+                                          ne10_fft_cpx_int32_t *fin,
+                                          ne10_fft_cfg_int32_t cfg,
+                                          ne10_int32_t inverse_fft,
+                                          ne10_int32_t scaled_flag);
+
+    extern void (*ne10_fft_r2c_1d_int32) (ne10_fft_cpx_int32_t *fout,
+                                          ne10_int32_t *fin,
+                                          ne10_fft_r2c_cfg_int32_t cfg,
+                                          ne10_int32_t scaled_flag);
+
+    extern void (*ne10_fft_c2r_1d_int32) (ne10_int32_t *fout,
+                                          ne10_fft_cpx_int32_t *fin,
+                                          ne10_fft_r2c_cfg_int32_t cfg,
+                                          ne10_int32_t scaled_flag);
+
+    extern void (*ne10_fft_c2c_1d_int16) (ne10_fft_cpx_int16_t *fout,
+                                          ne10_fft_cpx_int16_t *fin,
+                                          ne10_fft_cfg_int16_t cfg,
+                                          ne10_int32_t inverse_fft,
+                                          ne10_int32_t scaled_flag);
+
+    extern void (*ne10_fft_r2c_1d_int16) (ne10_fft_cpx_int16_t *fout,
+                                          ne10_int16_t *fin,
+                                          ne10_fft_r2c_cfg_int16_t cfg,
+                                          ne10_int32_t scaled_flag);
+
+    extern void (*ne10_fft_c2r_1d_int16) (ne10_int16_t *fout,
+                                          ne10_fft_cpx_int16_t *fin,
+                                          ne10_fft_r2c_cfg_int16_t cfg,
+                                          ne10_int32_t scaled_flag);
+
+    /* init functions*/
+    extern ne10_fft_cfg_int16_t ne10_fft_alloc_c2c_int16 (ne10_int32_t nfft);
+
+    /* destroy functions */
+    extern void ne10_fft_destroy_c2c_float32 (ne10_fft_cfg_float32_t);
+    extern void ne10_fft_destroy_c2c_int32 (ne10_fft_cfg_int32_t);
+    extern void ne10_fft_destroy_c2c_int16 (ne10_fft_cfg_int16_t);
+
+    extern void ne10_fft_destroy_r2c_float32 (ne10_fft_r2c_cfg_float32_t);
+    extern void ne10_fft_destroy_r2c_int32 (ne10_fft_r2c_cfg_int32_t);
+    extern void ne10_fft_destroy_r2c_int16 (ne10_fft_r2c_cfg_int16_t);
+
+    extern ne10_fft_r2c_cfg_float32_t ne10_fft_alloc_r2c_float32 (ne10_int32_t nfft);
+    extern ne10_fft_r2c_cfg_int32_t ne10_fft_alloc_r2c_int32 (ne10_int32_t nfft);
+    extern ne10_fft_r2c_cfg_int16_t ne10_fft_alloc_r2c_int16 (ne10_int32_t nfft);
+
+    /* C version*/
+    extern ne10_fft_cfg_float32_t ne10_fft_alloc_c2c_float32_c (ne10_int32_t nfft);
+    extern ne10_fft_cfg_int32_t ne10_fft_alloc_c2c_int32_c (ne10_int32_t nfft);
+
+    extern void ne10_fft_c2c_1d_float32_c (ne10_fft_cpx_float32_t *fout,
+                                           ne10_fft_cpx_float32_t *fin,
+                                           ne10_fft_cfg_float32_t cfg,
+                                           ne10_int32_t inverse_fft);
+
+    extern void ne10_fft_r2c_1d_float32_c (ne10_fft_cpx_float32_t *fout,
+                                           ne10_float32_t *fin,
+                                           ne10_fft_r2c_cfg_float32_t cfg);
+
+    extern void ne10_fft_c2r_1d_float32_c (ne10_float32_t *fout,
+                                           ne10_fft_cpx_float32_t *fin,
+                                           ne10_fft_r2c_cfg_float32_t cfg);
+
+    extern void ne10_fft_c2c_1d_int32_c (ne10_fft_cpx_int32_t *fout,
+                                         ne10_fft_cpx_int32_t *fin,
+                                         ne10_fft_cfg_int32_t cfg,
+                                         ne10_int32_t inverse_fft,
+                                         ne10_int32_t scaled_flag);
+
+    extern void ne10_fft_r2c_1d_int32_c (ne10_fft_cpx_int32_t *fout,
+                                         ne10_int32_t *fin,
+                                         ne10_fft_r2c_cfg_int32_t cfg,
+                                         ne10_int32_t scaled_flag);
+
+    extern void ne10_fft_c2r_1d_int32_c (ne10_int32_t *fout,
+                                         ne10_fft_cpx_int32_t *fin,
+                                         ne10_fft_r2c_cfg_int32_t cfg,
+                                         ne10_int32_t scaled_flag);
+
+    extern void ne10_fft_c2c_1d_int16_c (ne10_fft_cpx_int16_t *fout,
+                                         ne10_fft_cpx_int16_t *fin,
+                                         ne10_fft_cfg_int16_t cfg,
+                                         ne10_int32_t inverse_fft,
+                                         ne10_int32_t scaled_flag);
+
+    extern void ne10_fft_r2c_1d_int16_c (ne10_fft_cpx_int16_t *fout,
+                                         ne10_int16_t *fin,
+                                         ne10_fft_r2c_cfg_int16_t cfg,
+                                         ne10_int32_t scaled_flag);
+
+    extern void ne10_fft_c2r_1d_int16_c (ne10_int16_t *fout,
+                                         ne10_fft_cpx_int16_t *fin,
+                                         ne10_fft_r2c_cfg_int16_t cfg,
+                                         ne10_int32_t scaled_flag);
+
+
+    /* NEON version*/
+    extern ne10_fft_cfg_float32_t ne10_fft_alloc_c2c_float32_neon (ne10_int32_t nfft);
+    extern ne10_fft_cfg_int32_t ne10_fft_alloc_c2c_int32_neon (ne10_int32_t nfft);
+
+    extern void ne10_fft_c2c_1d_float32_neon (ne10_fft_cpx_float32_t *fout,
+            ne10_fft_cpx_float32_t *fin,
+            ne10_fft_cfg_float32_t cfg,
+            ne10_int32_t inverse_fft);
+
+    extern void ne10_fft_r2c_1d_float32_neon (ne10_fft_cpx_float32_t *fout,
+            ne10_float32_t *fin,
+            ne10_fft_r2c_cfg_float32_t cfg);
+
+    extern void ne10_fft_c2r_1d_float32_neon (ne10_float32_t *fout,
+            ne10_fft_cpx_float32_t *fin,
+            ne10_fft_r2c_cfg_float32_t cfg);
+
+    extern void ne10_fft_c2c_1d_int32_neon (ne10_fft_cpx_int32_t *fout,
+                                            ne10_fft_cpx_int32_t *fin,
+                                            ne10_fft_cfg_int32_t cfg,
+                                            ne10_int32_t inverse_fft,
+                                            ne10_int32_t scaled_flag);
+
+    extern void ne10_fft_r2c_1d_int32_neon (ne10_fft_cpx_int32_t *fout,
+                                            ne10_int32_t *fin,
+                                            ne10_fft_r2c_cfg_int32_t cfg,
+                                            ne10_int32_t scaled_flag);
+
+    extern void ne10_fft_c2r_1d_int32_neon (ne10_int32_t *fout,
+                                            ne10_fft_cpx_int32_t *fin,
+                                            ne10_fft_r2c_cfg_int32_t cfg,
+                                            ne10_int32_t scaled_flag);
+
+    extern void ne10_fft_c2c_1d_int16_neon (ne10_fft_cpx_int16_t *fout,
+                                            ne10_fft_cpx_int16_t *fin,
+                                            ne10_fft_cfg_int16_t cfg,
+                                            ne10_int32_t inverse_fft,
+                                            ne10_int32_t scaled_flag);
+
+    extern void ne10_fft_r2c_1d_int16_neon (ne10_fft_cpx_int16_t *fout,
+                                            ne10_int16_t *fin,
+                                            ne10_fft_r2c_cfg_int16_t cfg,
+                                            ne10_int32_t scaled_flag);
+
+    extern void ne10_fft_c2r_1d_int16_neon (ne10_int16_t *fout,
+                                            ne10_fft_cpx_int16_t *fin,
+                                            ne10_fft_r2c_cfg_int16_t cfg,
+                                            ne10_int32_t scaled_flag);
+
+    /* fir functions*/
+
+    /* function pointers*/
+    extern void (*ne10_fir_float) (const ne10_fir_instance_f32_t * S,
+                                   ne10_float32_t * pSrc,
+                                   ne10_float32_t * pDst,
+                                   ne10_uint32_t blockSize);
+
+    extern void (*ne10_fir_decimate_float) (const ne10_fir_decimate_instance_f32_t * S,
+                                            ne10_float32_t * pSrc,
+                                            ne10_float32_t * pDst,
+                                            ne10_uint32_t blockSize);
+
+    extern void (*ne10_fir_interpolate_float) (const ne10_fir_interpolate_instance_f32_t * S,
+            ne10_float32_t * pSrc,
+            ne10_float32_t * pDst,
+            ne10_uint32_t blockSize);
+
+    extern void (*ne10_fir_lattice_float) (const ne10_fir_lattice_instance_f32_t * S,
+                                           ne10_float32_t * pSrc,
+                                           ne10_float32_t * pDst,
+                                           ne10_uint32_t blockSize);
+
+    extern void (*ne10_fir_sparse_float) (ne10_fir_sparse_instance_f32_t * S,
+                                          ne10_float32_t * pSrc,
+                                          ne10_float32_t * pDst,
+                                          ne10_float32_t * pScratchIn,
+                                          ne10_uint32_t blockSize);
+
+
+    /* init functions*/
+    extern ne10_result_t ne10_fir_init_float (ne10_fir_instance_f32_t * S,
+            ne10_uint16_t numTaps,
+            ne10_float32_t * pCoeffs,
+            ne10_float32_t * pState,
+            ne10_uint32_t blockSize);
+
+    extern ne10_result_t ne10_fir_decimate_init_float (ne10_fir_decimate_instance_f32_t * S,
+            ne10_uint16_t numTaps,
+            ne10_uint8_t M,
+            ne10_float32_t * pCoeffs,
+            ne10_float32_t * pState,
+            ne10_uint32_t blockSize);
+
+    extern ne10_result_t ne10_fir_interpolate_init_float (ne10_fir_interpolate_instance_f32_t * S,
+            ne10_uint8_t L,
+            ne10_uint16_t numTaps,
+            ne10_float32_t * pCoeffs,
+            ne10_float32_t * pState,
+            ne10_uint32_t blockSize);
+
+    extern ne10_result_t ne10_fir_lattice_init_float (ne10_fir_lattice_instance_f32_t * S,
+            ne10_uint16_t numStages,
+            ne10_float32_t * pCoeffs,
+            ne10_float32_t * pState);
+
+    extern ne10_result_t ne10_fir_sparse_init_float (ne10_fir_sparse_instance_f32_t * S,
+            ne10_uint16_t numTaps,
+            ne10_float32_t * pCoeffs,
+            ne10_float32_t * pState,
+            ne10_int32_t * pTapDelay,
+            ne10_uint16_t maxDelay,
+            ne10_uint32_t blockSize);
+
+    /* C version*/
+    extern void ne10_fir_float_c (const ne10_fir_instance_f32_t * S,
+                                  ne10_float32_t * pSrc,
+                                  ne10_float32_t * pDst,
+                                  ne10_uint32_t blockSize);
+
+    extern void ne10_fir_decimate_float_c (const ne10_fir_decimate_instance_f32_t * S,
+                                           ne10_float32_t * pSrc,
+                                           ne10_float32_t * pDst,
+                                           ne10_uint32_t blockSize);
+
+    extern void ne10_fir_interpolate_float_c (const ne10_fir_interpolate_instance_f32_t * S,
+            ne10_float32_t * pSrc,
+            ne10_float32_t * pDst,
+            ne10_uint32_t blockSize);
+
+    extern void ne10_fir_lattice_float_c (const ne10_fir_lattice_instance_f32_t * S,
+                                          ne10_float32_t * pSrc,
+                                          ne10_float32_t * pDst,
+                                          ne10_uint32_t blockSize);
+
+    extern void ne10_fir_sparse_float_c (ne10_fir_sparse_instance_f32_t * S,
+                                         ne10_float32_t * pSrc,
+                                         ne10_float32_t * pDst,
+                                         ne10_float32_t * pScratchIn,
+                                         ne10_uint32_t blockSize);
+
+
+    /* NEON version*/
+
+    /**
+     * @addtogroup FIR
+     * @{
+     */
+#ifdef ENABLE_NE10_FIR_FLOAT_NEON
+    extern void ne10_fir_float_neon (const ne10_fir_instance_f32_t * S,
+                                     ne10_float32_t * pSrc,
+                                     ne10_float32_t * pDst,
+                                     ne10_uint32_t blockSize)
+    asm ("ne10_fir_float_neon");
+#endif // ENABLE_NE10_FIR_FLOAT_NEON
+    /** @} */ //end of FIR group
+
+    /**
+     * @addtogroup FIR_Decimate
+     * @{
+     */
+#ifdef ENABLE_NE10_FIR_DECIMATE_FLOAT_NEON
+    extern void ne10_fir_decimate_float_neon (const ne10_fir_decimate_instance_f32_t * S,
+            ne10_float32_t *pSrc,
+            ne10_float32_t *pDst,
+            ne10_uint32_t blockSize) asm ("ne10_fir_decimate_float_neon");
+#endif // ENABLE_NE10_FIR_DECIMATE_FLOAT_NEON
+    /** @} */ //end of FIR_decimate group; kept outside the #ifdef so the @{ above is always closed
+
+    /**
+     * @addtogroup FIR_Interpolate
+     * @{
+     */
+#ifdef ENABLE_NE10_FIR_INTERPOLATE_FLOAT_NEON
+    extern void ne10_fir_interpolate_float_neon (const ne10_fir_interpolate_instance_f32_t * S,
+            ne10_float32_t *pSrc,
+            ne10_float32_t *pDst,
+            ne10_uint32_t blockSize) asm ("ne10_fir_interpolate_float_neon");
+#endif // ENABLE_NE10_FIR_INTERPOLATE_FLOAT_NEON
+    /** @} */ //end of FIR_interpolate group
+
+    /**
+     * @addtogroup FIR_Lattice
+     * @{
+     */
+#ifdef ENABLE_NE10_FIR_LATTICE_FLOAT_NEON
+    extern void ne10_fir_lattice_float_neon (const ne10_fir_lattice_instance_f32_t * S,
+            ne10_float32_t * pSrc,
+            ne10_float32_t * pDst,
+            ne10_uint32_t blockSize) asm ("ne10_fir_lattice_float_neon");
+#endif // ENABLE_NE10_FIR_LATTICE_FLOAT_NEON
+    /** @} */ //end of FIR_Lattice group
+
+    /**
+     * @addtogroup FIR_Sparse
+     * @{
+     */
+#ifdef ENABLE_NE10_FIR_SPARSE_FLOAT_NEON
+    extern void ne10_fir_sparse_float_neon (ne10_fir_sparse_instance_f32_t * S,
+                                            ne10_float32_t * pSrc,
+                                            ne10_float32_t * pDst,
+                                            ne10_float32_t * pScratch,
+                                            ne10_uint32_t blockSize)
+    asm ("ne10_fir_sparse_float_neon");
+#endif // ENABLE_NE10_FIR_SPARSE_FLOAT_NEON
+    /** @} */ //end of FIR_sparse group
+
+
+    /* iir functions*/
+
+    /* function pointers*/
+    extern void (*ne10_iir_lattice_float) (const ne10_iir_lattice_instance_f32_t * S,
+                                           ne10_float32_t * pSrc,
+                                           ne10_float32_t * pDst,
+                                           ne10_uint32_t blockSize);
+
+    /* init functions*/
+    extern ne10_result_t ne10_iir_lattice_init_float (ne10_iir_lattice_instance_f32_t * S,
+            ne10_uint16_t numStages,
+            ne10_float32_t * pkCoeffs,
+            ne10_float32_t * pvCoeffs,
+            ne10_float32_t * pState,
+            ne10_uint32_t blockSize);
+
+
+    /* C version*/
+    extern void ne10_iir_lattice_float_c (const ne10_iir_lattice_instance_f32_t * S,
+                                          ne10_float32_t * pSrc,
+                                          ne10_float32_t * pDst,
+                                          ne10_uint32_t blockSize);
+
+    /* NEON version*/
+
+    /**
+     * @addtogroup IIR_Lattice
+     * @{
+     */
+#ifdef ENABLE_NE10_IIR_LATTICE_FLOAT_NEON
+    extern void ne10_iir_lattice_float_neon (const ne10_iir_lattice_instance_f32_t * S,
+            ne10_float32_t * pSrc,
+            ne10_float32_t * pDst,
+            ne10_uint32_t blockSize) asm ("ne10_iir_lattice_float_neon");
+#endif // ENABLE_NE10_IIR_LATTICE_FLOAT_NEON
+    /** @} */ //end of IIR_Lattice group
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/include/ne10/NE10_imgproc.h	Sun Jun 12 18:16:20 2016 +0100
@@ -0,0 +1,131 @@
+/*
+ *  Copyright 2013-15 ARM Limited and Contributors.
+ *  All rights reserved.
+ *
+ *  Redistribution and use in source and binary forms, with or without
+ *  modification, are permitted provided that the following conditions are met:
+ *    * Redistributions of source code must retain the above copyright
+ *      notice, this list of conditions and the following disclaimer.
+ *    * Redistributions in binary form must reproduce the above copyright
+ *      notice, this list of conditions and the following disclaimer in the
+ *      documentation and/or other materials provided with the distribution.
+ *    * Neither the name of ARM Limited nor the
+ *      names of its contributors may be used to endorse or promote products
+ *      derived from this software without specific prior written permission.
+ *
+ *  THIS SOFTWARE IS PROVIDED BY ARM LIMITED AND CONTRIBUTORS "AS IS" AND
+ *  ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ *  WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ *  DISCLAIMED. IN NO EVENT SHALL ARM LIMITED AND CONTRIBUTORS BE LIABLE FOR ANY
+ *  DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ *  (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ *  LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ *  ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ *  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ *  SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/*
+ * NE10 Library : inc/NE10_imgproc.h
+ */
+
+
+#include "NE10_types.h"
+
+#ifndef NE10_IMGPROC_H
+#define NE10_IMGPROC_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+    ne10_result_t ne10_init_imgproc (ne10_int32_t is_NEON_available);
+
+///////////////////////////
+// function prototypes:
+///////////////////////////
+
+    /* image resize functions*/
+    /* function pointers*/
+    extern void (*ne10_img_resize_bilinear_rgba) (ne10_uint8_t* dst,
+            ne10_uint32_t dst_width,
+            ne10_uint32_t dst_height,
+            ne10_uint8_t* src,
+            ne10_uint32_t src_width,
+            ne10_uint32_t src_height,
+            ne10_uint32_t src_stride);
+    /* C version*/
+    extern void ne10_img_resize_bilinear_rgba_c (ne10_uint8_t* dst,
+            ne10_uint32_t dst_width,
+            ne10_uint32_t dst_height,
+            ne10_uint8_t* src,
+            ne10_uint32_t src_width,
+            ne10_uint32_t src_height,
+            ne10_uint32_t src_stride);
+    /* NEON version*/
+    extern void ne10_img_resize_bilinear_rgba_neon (ne10_uint8_t* dst,
+            ne10_uint32_t dst_width,
+            ne10_uint32_t dst_height,
+            ne10_uint8_t* src,
+            ne10_uint32_t src_width,
+            ne10_uint32_t src_height,
+            ne10_uint32_t src_stride)
+    asm ("ne10_img_resize_bilinear_rgba_neon");
+
+    /* image rotate functions*/
+    /* function pointers*/
+    extern void (*ne10_img_rotate_rgba) (ne10_uint8_t* dst,
+                                         ne10_uint32_t* dst_width,
+                                         ne10_uint32_t* dst_height,
+                                         ne10_uint8_t* src,
+                                         ne10_uint32_t src_width,
+                                         ne10_uint32_t src_height,
+                                         ne10_int32_t angle);
+    /* C version*/
+    extern void ne10_img_rotate_rgba_c (ne10_uint8_t* dst,
+                                        ne10_uint32_t* dst_width,
+                                        ne10_uint32_t* dst_height,
+                                        ne10_uint8_t* src,
+                                        ne10_uint32_t src_width,
+                                        ne10_uint32_t src_height,
+                                        ne10_int32_t angle);
+#ifdef ENABLE_NE10_IMG_ROTATE_RGBA_NEON
+    /* NEON version*/
+    extern void ne10_img_rotate_rgba_neon (ne10_uint8_t* dst,
+                                           ne10_uint32_t* dst_width,
+                                           ne10_uint32_t* dst_height,
+                                           ne10_uint8_t* src,
+                                           ne10_uint32_t src_width,
+                                           ne10_uint32_t src_height,
+                                           ne10_int32_t angle)
+    asm ("ne10_img_rotate_rgba_neon");
+#endif // ENABLE_NE10_IMG_ROTATE_RGBA_NEON
+
+    /* image boxfilter functions */
+    /* function pointers */
+    extern void (*ne10_img_boxfilter_rgba8888) (const ne10_uint8_t *src,
+            ne10_uint8_t *dst,
+            ne10_size_t src_size,
+            ne10_int32_t src_stride,
+            ne10_int32_t dst_stride,
+            ne10_size_t kernel_size);
+    /* C version*/
+    extern void ne10_img_boxfilter_rgba8888_c (const ne10_uint8_t *src,
+            ne10_uint8_t *dst,
+            ne10_size_t src_size,
+            ne10_int32_t src_stride,
+            ne10_int32_t dst_stride,
+            ne10_size_t kernel_size);
+    /* NEON version*/
+    extern void ne10_img_boxfilter_rgba8888_neon (const ne10_uint8_t *src,
+            ne10_uint8_t *dst,
+            ne10_size_t src_size,
+            ne10_int32_t src_stride,
+            ne10_int32_t dst_stride,
+            ne10_size_t kernel_size);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/include/ne10/NE10_init.h	Sun Jun 12 18:16:20 2016 +0100
@@ -0,0 +1,57 @@
+/*
+ *  Copyright 2011-15 ARM Limited and Contributors.
+ *  All rights reserved.
+ *
+ *  Redistribution and use in source and binary forms, with or without
+ *  modification, are permitted provided that the following conditions are met:
+ *    * Redistributions of source code must retain the above copyright
+ *      notice, this list of conditions and the following disclaimer.
+ *    * Redistributions in binary form must reproduce the above copyright
+ *      notice, this list of conditions and the following disclaimer in the
+ *      documentation and/or other materials provided with the distribution.
+ *    * Neither the name of ARM Limited nor the
+ *      names of its contributors may be used to endorse or promote products
+ *      derived from this software without specific prior written permission.
+ *
+ *  THIS SOFTWARE IS PROVIDED BY ARM LIMITED AND CONTRIBUTORS "AS IS" AND
+ *  ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ *  WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ *  DISCLAIMED. IN NO EVENT SHALL ARM LIMITED AND CONTRIBUTORS BE LIABLE FOR ANY
+ *  DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ *  (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ *  LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ *  ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ *  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ *  SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "NE10.h"
+
+#ifndef NE10_init_H
+#define NE10_init_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+    /*!
+        Returns NE10_OK if the running platform supports NEON; otherwise returns NE10_ERR.
+     */
+    extern ne10_result_t ne10_HasNEON(void);
+
+    /*!
+        Initializes all the function pointers.
+     */
+    extern ne10_result_t ne10_init(void);
+
+    /*!
+        ne10_init_math initializes all the math function pointers defined in "NE10_math.h" with pointers to ARM NEON or ARM VFP implementations. NOTE(review): ne10_init_dsp has no description of its own; presumably it is the DSP counterpart -- confirm against the library sources.
+     */
+    extern ne10_result_t ne10_init_math (ne10_int32_t is_NEON_available);
+    extern ne10_result_t ne10_init_dsp (ne10_int32_t is_NEON_available);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* NE10_init_H */
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/include/ne10/NE10_macros.h	Sun Jun 12 18:16:20 2016 +0100
@@ -0,0 +1,95 @@
+/*
+ *  Copyright 2013-15 ARM Limited and Contributors.
+ *  All rights reserved.
+ *
+ *  Redistribution and use in source and binary forms, with or without
+ *  modification, are permitted provided that the following conditions are met:
+ *    * Redistributions of source code must retain the above copyright
+ *      notice, this list of conditions and the following disclaimer.
+ *    * Redistributions in binary form must reproduce the above copyright
+ *      notice, this list of conditions and the following disclaimer in the
+ *      documentation and/or other materials provided with the distribution.
+ *    * Neither the name of ARM Limited nor the
+ *      names of its contributors may be used to endorse or promote products
+ *      derived from this software without specific prior written permission.
+ *
+ *  THIS SOFTWARE IS PROVIDED BY ARM LIMITED AND CONTRIBUTORS "AS IS" AND
+ *  ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ *  WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ *  DISCLAIMED. IN NO EVENT SHALL ARM LIMITED AND CONTRIBUTORS BE LIABLE FOR ANY
+ *  DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ *  (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ *  LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ *  ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ *  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ *  SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/*
+ * NE10 Library : inc/NE10_macros.h
+ */
+
+/** NE10 defines a number of macros for use in its function signatures.
+ *  The macros are defined within this header file.
+ */
+
+#ifndef NE10_MACROS_H
+#define NE10_MACROS_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/////////////////////////////////////////////////////////
+// constant values that are used across the library
+/////////////////////////////////////////////////////////
+
+#define NE10_PI (ne10_float32_t)(3.1415926535897932384626433832795) /* pi literal cast to ne10_float32_t; the expansion has no outer parentheses */
+
+/////////////////////////////////////////////////////////
+// some external macro definitions to be exposed to the users
+/////////////////////////////////////////////////////////
+
+#define NE10_MALLOC malloc /* alias for malloc; this header includes nothing, so <stdlib.h> must be in scope at the point of use */
+#define NE10_FREE(p) /* frees p, then resets it to NULL; p must be a modifiable lvalue and is evaluated twice */ \
+    do { \
+        free(p); \
+        (p) = NULL; \
+    }while(0)
+
+#define NE10_MIN(a,b) ((a)>(b)?(b):(a)) /* smaller of a and b; the selected argument is evaluated twice, so avoid side effects */
+#define NE10_MAX(a,b) ((a)<(b)?(b):(a)) /* larger of a and b; the selected argument is evaluated twice, so avoid side effects */
+
+#define NE10_BYTE_ALIGNMENT(address, alignment) /* rounds address up, in place, to a multiple of alignment; the mask arithmetic is only correct when alignment is a power of two */ \
+    do { \
+        (address) = (((address) + ((alignment) - 1)) & ~ ((alignment) - 1)); \
+    }while (0)
+
+/////////////////////////////////////////////////////////
+// macro definitions for float to fixed point
+/////////////////////////////////////////////////////////
+#define NE10_F2I16_MAX         32767 /* scale factor applied by NE10_F2I16_OP */
+#define NE10_F2I16_SHIFT       15 /* bit position of the rounding term added in NE10_F2I16_SROUND */
+#define NE10_F2I16_SAMPPROD    ne10_int32_t /* type NE10_F2I16_SMUL casts its first operand to before multiplying */
+#define NE10_F2I16_OP(x)       (ne10_int16_t)((x)*NE10_F2I16_MAX + 0.5f) /* scales x by NE10_F2I16_MAX, adds 0.5f, then casts to ne10_int16_t */
+#define NE10_F2I16_SROUND(x)   (ne10_int16_t)((((x)<<1)+(1<<NE10_F2I16_SHIFT))>>16) /* ((x << 1) + (1 << 15)) >> 16, cast to ne10_int16_t */
+#define NE10_F2I16_SMUL(a,b)   ((NE10_F2I16_SAMPPROD)(a)*(b)) /* product of a and b, with a first cast to NE10_F2I16_SAMPPROD */
+#define NE10_F2I16_FIXDIV(c,div) /* divides the .r and .i members of c by div, in place; div is parenthesized so expression arguments divide as a whole */ \
+    do {    ((c).r) = ( ( ((c).r)/(div)) );  \
+        ((c).i) = ( ( ((c).i)/(div)) ); }while (0)
+
+#define NE10_F2I32_MAX         2147483647 /* scale factor applied by NE10_F2I32_OP */
+#define NE10_F2I32_SHIFT       31 /* shift count used by NE10_F2I32_SROUND */
+#define NE10_F2I32_SAMPPROD    ne10_int64_t /* type NE10_F2I32_SMUL casts its first operand to before multiplying */
+#define NE10_F2I32_OP(x)       (ne10_int32_t)((x)*NE10_F2I32_MAX + 0.5f) /* scales x by NE10_F2I32_MAX, adds 0.5f, then casts to ne10_int32_t; NOTE(review): confirm behaviour at full scale if x is single precision */
+#define NE10_F2I32_SROUND(x)   (ne10_int32_t) ((x)>>NE10_F2I32_SHIFT) /* plain right shift by NE10_F2I32_SHIFT; no rounding term is added, unlike NE10_F2I16_SROUND */
+#define NE10_F2I32_SMUL(a,b)    ((NE10_F2I32_SAMPPROD)(a)*(b)) /* product of a and b, with a first cast to NE10_F2I32_SAMPPROD */
+#define NE10_F2I32_FIXDIV(c,div) /* divides the .r and .i members of c by div, in place; div is parenthesized so expression arguments divide as a whole */ \
+    do {    ((c).r) = ( ( ((c).r)/(div)) );  \
+        ((c).i) = ( ( ((c).i)/(div)) ); }while (0)
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/include/ne10/NE10_math.h	Sun Jun 12 18:16:20 2016 +0100
@@ -0,0 +1,1484 @@
+/*
+ *  Copyright 2011-15 ARM Limited and Contributors.
+ *  All rights reserved.
+ *
+ *  Redistribution and use in source and binary forms, with or without
+ *  modification, are permitted provided that the following conditions are met:
+ *    * Redistributions of source code must retain the above copyright
+ *      notice, this list of conditions and the following disclaimer.
+ *    * Redistributions in binary form must reproduce the above copyright
+ *      notice, this list of conditions and the following disclaimer in the
+ *      documentation and/or other materials provided with the distribution.
+ *    * Neither the name of ARM Limited nor the
+ *      names of its contributors may be used to endorse or promote products
+ *      derived from this software without specific prior written permission.
+ *
+ *  THIS SOFTWARE IS PROVIDED BY ARM LIMITED AND CONTRIBUTORS "AS IS" AND
+ *  ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ *  WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ *  DISCLAIMED. IN NO EVENT SHALL ARM LIMITED AND CONTRIBUTORS BE LIABLE FOR ANY
+ *  DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ *  (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ *  LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ *  ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ *  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ *  SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/*
+ * NE10 Library : inc/NE10_math.h
+ */
+
+
+#include "NE10_types.h"
+
+#ifndef NE10_MATH_H
+#define NE10_MATH_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+///////////////////////////
+// function prototypes:
+///////////////////////////
+
+
+// ## Vector-Constant Arithmetic ##
+
+    /**
+     * @ingroup groupMaths
+     */
+
+    /**
+     * @defgroup ADD_VEC Vector Add
+     *
+     * \par
+     * These functions implement the vector add operation for float data type.
+     */
+
+    /**
+     * @addtogroup ADD_VEC
+     * @{
+     */
+
+    /**
+     * Adds a constant scalar value to all the elements of an input array and stores the results in an output array.
+     * This function pointer can be set to any of ne10_addc_float_c, ne10_addc_float_neon or ne10_addc_float_asm, which share its signature.
+     * @param[out] dst   Pointer to the destination array
+     * @param[in]  src   Pointer to the source array
+     * @param[in]  cst   The constant scalar added to the input values
+     * @param[in]  count The number of items in the input array
+     */
+    extern ne10_result_t (*ne10_addc_float) (ne10_float32_t * dst, ne10_float32_t * src, const ne10_float32_t cst, ne10_uint32_t count);
+    extern ne10_result_t ne10_addc_float_c (ne10_float32_t * dst, ne10_float32_t * src, const ne10_float32_t cst, ne10_uint32_t count);
+    extern ne10_result_t ne10_addc_float_neon (ne10_float32_t * dst, ne10_float32_t * src, const ne10_float32_t cst, ne10_uint32_t count) asm ("ne10_addc_float_neon");
+    extern ne10_result_t ne10_addc_float_asm (ne10_float32_t * dst, ne10_float32_t * src, const ne10_float32_t cst, ne10_uint32_t count);
+    /**
+     * Adds a constant 2D vector to all of the vectors in an input array and stores the results in an output array.
+     * This function pointer can be set to any of ne10_addc_vec2f_c, ne10_addc_vec2f_neon or ne10_addc_vec2f_asm, which share its signature.
+     * @param[out] dst   Pointer to the destination array
+     * @param[in]  src   Pointer to the source array
+     * @param[in]  cst   Pointer to the 2D vector added to the input values
+     * @param[in]  count The number of items in the input array
+     */
+    extern ne10_result_t (*ne10_addc_vec2f) (ne10_vec2f_t * dst, ne10_vec2f_t * src, const ne10_vec2f_t * cst, ne10_uint32_t count);
+    extern ne10_result_t ne10_addc_vec2f_c (ne10_vec2f_t * dst, ne10_vec2f_t * src, const ne10_vec2f_t * cst, ne10_uint32_t count);
+    extern ne10_result_t ne10_addc_vec2f_neon (ne10_vec2f_t * dst, ne10_vec2f_t * src, const ne10_vec2f_t * cst, ne10_uint32_t count) asm ("ne10_addc_vec2f_neon");
+    extern ne10_result_t ne10_addc_vec2f_asm (ne10_vec2f_t * dst, ne10_vec2f_t * src, const ne10_vec2f_t * cst, ne10_uint32_t count);
+    /**
+     * Adds a constant 3D vector to all of the vectors in an input array and stores the results in an output array.
+     * This function pointer can be set to any of ne10_addc_vec3f_c, ne10_addc_vec3f_neon or ne10_addc_vec3f_asm, which share its signature.
+     * @param[out] dst   Pointer to the destination array
+     * @param[in]  src   Pointer to the source array
+     * @param[in]  cst   Pointer to the 3D vector added to the input values
+     * @param[in]  count The number of items in the input array
+     */
+    extern ne10_result_t (*ne10_addc_vec3f) (ne10_vec3f_t * dst, ne10_vec3f_t * src, const ne10_vec3f_t * cst, ne10_uint32_t count);
+    extern ne10_result_t ne10_addc_vec3f_c (ne10_vec3f_t * dst, ne10_vec3f_t * src, const ne10_vec3f_t * cst, ne10_uint32_t count);
+    extern ne10_result_t ne10_addc_vec3f_neon (ne10_vec3f_t * dst, ne10_vec3f_t * src, const ne10_vec3f_t * cst, ne10_uint32_t count) asm ("ne10_addc_vec3f_neon");
+    extern ne10_result_t ne10_addc_vec3f_asm (ne10_vec3f_t * dst, ne10_vec3f_t * src, const ne10_vec3f_t * cst, ne10_uint32_t count);
+    /**
+     * Adds a constant 4D vector to all of the vectors in an input array and stores the results in an output array.
+     * This function pointer can be set to any of ne10_addc_vec4f_c, ne10_addc_vec4f_neon or ne10_addc_vec4f_asm, which share its signature.
+     * @param[out] dst   Pointer to the destination array
+     * @param[in]  src   Pointer to the source array
+     * @param[in]  cst   Pointer to the 4D vector added to the input values
+     * @param[in]  count The number of items in the input array
+     */
+    extern ne10_result_t (*ne10_addc_vec4f) (ne10_vec4f_t * dst, ne10_vec4f_t * src, const ne10_vec4f_t * cst, ne10_uint32_t count);
+    extern ne10_result_t ne10_addc_vec4f_c (ne10_vec4f_t * dst, ne10_vec4f_t * src, const ne10_vec4f_t * cst, ne10_uint32_t count);
+    extern ne10_result_t ne10_addc_vec4f_neon (ne10_vec4f_t * dst, ne10_vec4f_t * src, const ne10_vec4f_t * cst, ne10_uint32_t count) asm ("ne10_addc_vec4f_neon");
+    extern ne10_result_t ne10_addc_vec4f_asm (ne10_vec4f_t * dst, ne10_vec4f_t * src, const ne10_vec4f_t * cst, ne10_uint32_t count);
+
+
+    /**
+     * Adds the elements of src1 to the elements of src2 and stores the results in the dst.
+     * This function pointer can be set to any of ne10_add_float_c, ne10_add_float_neon or ne10_add_float_asm, which share its signature.
+     * @param[out] dst   Pointer to the destination array
+     * @param[in]  src1  The first array to use as the input array
+     * @param[in]  src2  The second array to use as the input array
+     * @param[in]  count The number of items in the two input arrays
+     */
+    extern ne10_result_t (*ne10_add_float) (ne10_float32_t * dst, ne10_float32_t * src1, ne10_float32_t * src2, ne10_uint32_t count);
+    extern ne10_result_t ne10_add_float_c (ne10_float32_t * dst, ne10_float32_t * src1, ne10_float32_t * src2, ne10_uint32_t count);
+    extern ne10_result_t ne10_add_float_neon (ne10_float32_t * dst, ne10_float32_t * src1, ne10_float32_t * src2, ne10_uint32_t count) asm ("ne10_add_float_neon");
+    extern ne10_result_t ne10_add_float_asm (ne10_float32_t * dst, ne10_float32_t * src1, ne10_float32_t * src2, ne10_uint32_t count);
+    /**
+     * Vector addition of two 2D vectors.
+     * This function pointer can be set to any of ne10_add_vec2f_c, ne10_add_vec2f_neon or ne10_add_vec2f_asm, which share its signature.
+     * @param[out] dst   Pointer to the destination array
+     * @param[in]  src1   Pointer to the first source array
+     * @param[in]  src2   Pointer to the second source array
+     * @param[in]  count The number of items in the input arrays
+     */
+    extern ne10_result_t (*ne10_add_vec2f) (ne10_vec2f_t * dst, ne10_vec2f_t * src1, ne10_vec2f_t * src2, ne10_uint32_t count);
+    extern ne10_result_t ne10_add_vec2f_c (ne10_vec2f_t * dst, ne10_vec2f_t * src1, ne10_vec2f_t * src2, ne10_uint32_t count);
+    extern ne10_result_t ne10_add_vec2f_neon (ne10_vec2f_t * dst, ne10_vec2f_t * src1, ne10_vec2f_t * src2, ne10_uint32_t count) asm ("ne10_add_vec2f_neon");
+    extern ne10_result_t ne10_add_vec2f_asm (ne10_vec2f_t * dst, ne10_vec2f_t * src1, ne10_vec2f_t * src2, ne10_uint32_t count);
+    /**
+     * Vector addition of two 3D vectors.
+     * This function pointer can be set to any of ne10_add_vec3f_c, ne10_add_vec3f_neon or ne10_add_vec3f_asm, which share its signature.
+     * @param[out] dst   Pointer to the destination array
+     * @param[in]  src1   Pointer to the first source array
+     * @param[in]  src2   Pointer to the second source array
+     * @param[in]  count The number of items in the input arrays
+     */
+    extern ne10_result_t (*ne10_add_vec3f) (ne10_vec3f_t * dst, ne10_vec3f_t * src1, ne10_vec3f_t * src2, ne10_uint32_t count);
+    extern ne10_result_t ne10_add_vec3f_c (ne10_vec3f_t * dst, ne10_vec3f_t * src1, ne10_vec3f_t * src2, ne10_uint32_t count);
+    extern ne10_result_t ne10_add_vec3f_neon (ne10_vec3f_t * dst, ne10_vec3f_t * src1, ne10_vec3f_t * src2, ne10_uint32_t count) asm ("ne10_add_vec3f_neon");
+    extern ne10_result_t ne10_add_vec3f_asm (ne10_vec3f_t * dst, ne10_vec3f_t * src1, ne10_vec3f_t * src2, ne10_uint32_t count);
+    /**
+     * Vector addition of two 4D vectors.
+     * This function pointer can be set to any of ne10_add_vec4f_c, ne10_add_vec4f_neon or ne10_add_vec4f_asm, which share its signature.
+     * @param[out] dst   Pointer to the destination array
+     * @param[in]  src1   Pointer to the first source array
+     * @param[in]  src2   Pointer to the second source array
+     * @param[in]  count The number of items in the input arrays
+     */
+    extern ne10_result_t (*ne10_add_vec4f) (ne10_vec4f_t * dst, ne10_vec4f_t * src1, ne10_vec4f_t * src2, ne10_uint32_t count);
+    extern ne10_result_t ne10_add_vec4f_c (ne10_vec4f_t * dst, ne10_vec4f_t * src1, ne10_vec4f_t * src2, ne10_uint32_t count);
+    extern ne10_result_t ne10_add_vec4f_neon (ne10_vec4f_t * dst, ne10_vec4f_t * src1, ne10_vec4f_t * src2, ne10_uint32_t count) asm ("ne10_add_vec4f_neon");
+    extern ne10_result_t ne10_add_vec4f_asm (ne10_vec4f_t * dst, ne10_vec4f_t * src1, ne10_vec4f_t * src2, ne10_uint32_t count);
+    /** @} */ //end of Vector Add group
+
+    /**
+     * @ingroup groupMaths
+     */
+
+    /**
+     * @defgroup ADD_MAT Matrix Add
+     *
+     * \par
+     * These functions implement the matrix add operation for float data type.
+     */
+
+    /**
+     * @addtogroup ADD_MAT
+     * @{
+     */
+
+    /**
+     * Matrix addition of two 4x4 matrices.
+     * This function pointer can be set to any of ne10_addmat_4x4f_c, ne10_addmat_4x4f_neon or ne10_addmat_4x4f_asm, which share its signature.
+     * @param[out] dst   Pointer to the destination array
+     * @param[in]  src1   Pointer to the first source array
+     * @param[in]  src2   Pointer to the second source array
+     * @param[in]  count The number of items in the input arrays
+     */
+    extern ne10_result_t (*ne10_addmat_4x4f) (ne10_mat4x4f_t * dst, ne10_mat4x4f_t * src1, ne10_mat4x4f_t * src2, ne10_uint32_t count);
+    extern ne10_result_t ne10_addmat_4x4f_c (ne10_mat4x4f_t * dst, ne10_mat4x4f_t * src1, ne10_mat4x4f_t * src2, ne10_uint32_t count);
+    extern ne10_result_t ne10_addmat_4x4f_neon (ne10_mat4x4f_t * dst, ne10_mat4x4f_t * src1, ne10_mat4x4f_t * src2, ne10_uint32_t count);
+    extern ne10_result_t ne10_addmat_4x4f_asm (ne10_mat4x4f_t * dst, ne10_mat4x4f_t * src1, ne10_mat4x4f_t * src2, ne10_uint32_t count);
+    /**
+     * Matrix addition of two 3x3 matrices.
+     * This function pointer can be set to any of ne10_addmat_3x3f_c, ne10_addmat_3x3f_neon or ne10_addmat_3x3f_asm, which share its signature.
+     * @param[out] dst   Pointer to the destination array
+     * @param[in]  src1   Pointer to the first source array
+     * @param[in]  src2   Pointer to the second source array
+     * @param[in]  count The number of items in the input arrays
+     */
+    extern ne10_result_t (*ne10_addmat_3x3f) (ne10_mat3x3f_t * dst, ne10_mat3x3f_t * src1, ne10_mat3x3f_t * src2, ne10_uint32_t count);
+    extern ne10_result_t ne10_addmat_3x3f_c (ne10_mat3x3f_t * dst, ne10_mat3x3f_t * src1, ne10_mat3x3f_t * src2, ne10_uint32_t count);
+    extern ne10_result_t ne10_addmat_3x3f_neon (ne10_mat3x3f_t * dst, ne10_mat3x3f_t * src1, ne10_mat3x3f_t * src2, ne10_uint32_t count);
+    extern ne10_result_t ne10_addmat_3x3f_asm (ne10_mat3x3f_t * dst, ne10_mat3x3f_t * src1, ne10_mat3x3f_t * src2, ne10_uint32_t count);
+    /**
+     * Matrix addition of two 2x2 matrices.
+     * This function pointer can be set to any of ne10_addmat_2x2f_c, ne10_addmat_2x2f_neon or ne10_addmat_2x2f_asm, which share its signature.
+     * @param[out] dst   Pointer to the destination array
+     * @param[in]  src1   Pointer to the first source array
+     * @param[in]  src2   Pointer to the second source array
+     * @param[in]  count The number of items in the input arrays
+     */
+    extern ne10_result_t (*ne10_addmat_2x2f) (ne10_mat2x2f_t * dst, ne10_mat2x2f_t * src1, ne10_mat2x2f_t * src2, ne10_uint32_t count);
+    extern ne10_result_t ne10_addmat_2x2f_c (ne10_mat2x2f_t * dst, ne10_mat2x2f_t * src1, ne10_mat2x2f_t * src2, ne10_uint32_t count);
+    extern ne10_result_t ne10_addmat_2x2f_neon (ne10_mat2x2f_t * dst, ne10_mat2x2f_t * src1, ne10_mat2x2f_t * src2, ne10_uint32_t count);
+    extern ne10_result_t ne10_addmat_2x2f_asm (ne10_mat2x2f_t * dst, ne10_mat2x2f_t * src1, ne10_mat2x2f_t * src2, ne10_uint32_t count);
+    /** @} */ //end of Matrix Add group
+
+
+    /**
+     * @ingroup groupMaths
+     */
+
+    /**
+     * @defgroup SUB_VEC Vector Sub
+     *
+     * \par
+     * These functions implement the vector sub operation for float data type.
+     */
+
+    /**
+     * @addtogroup SUB_VEC
+     * @{
+     */
+
+    /**
+     * Subtracts a constant scalar from all the elements of an input array and stores the results in an output array.
+     * This function pointer can be set to any of ne10_subc_float_c, ne10_subc_float_neon or ne10_subc_float_asm, which share its signature.
+     * @param[out] dst   Pointer to the destination array
+     * @param[in]  src   Pointer to the source array
+     * @param[in]  cst   The constant scalar subtracted from the input values
+     * @param[in]  count The number of items in the input array
+     */
+    extern ne10_result_t (*ne10_subc_float) (ne10_float32_t * dst, ne10_float32_t * src, const ne10_float32_t cst, ne10_uint32_t count);
+    extern ne10_result_t ne10_subc_float_c (ne10_float32_t * dst, ne10_float32_t * src, const ne10_float32_t cst, ne10_uint32_t count); // subtract cst from the element(s)
+    extern ne10_result_t ne10_subc_float_neon (ne10_float32_t * dst, ne10_float32_t * src, const ne10_float32_t cst, ne10_uint32_t count); // subtract cst from the element(s)
+    extern ne10_result_t ne10_subc_float_asm (ne10_float32_t * dst, ne10_float32_t * src, const ne10_float32_t cst, ne10_uint32_t count); // subtract cst from the element(s)
+    /**
+     * Subtracts a constant 2D vector from all of the vectors in an input array and stores the results in an output array.
+     * This function pointer can be set to any of ne10_subc_vec2f_c, ne10_subc_vec2f_neon or ne10_subc_vec2f_asm, which share its signature.
+     * @param[out] dst   Pointer to the destination array
+     * @param[in]  src   Pointer to the source array
+     * @param[in]  cst   Pointer to the 2D vector subtracted from the input values
+     * @param[in]  count The number of items in the input array
+     */
+    extern ne10_result_t (*ne10_subc_vec2f) (ne10_vec2f_t * dst, ne10_vec2f_t * src, const ne10_vec2f_t * cst, ne10_uint32_t count);
+    extern ne10_result_t ne10_subc_vec2f_c (ne10_vec2f_t * dst, ne10_vec2f_t * src, const ne10_vec2f_t * cst, ne10_uint32_t count); // subtract cst from the element(s)
+    extern ne10_result_t ne10_subc_vec2f_neon (ne10_vec2f_t * dst, ne10_vec2f_t * src, const ne10_vec2f_t * cst, ne10_uint32_t count); // subtract cst from the element(s)
+    extern ne10_result_t ne10_subc_vec2f_asm (ne10_vec2f_t * dst, ne10_vec2f_t * src, const ne10_vec2f_t * cst, ne10_uint32_t count); // subtract cst from the element(s)
+    /**
+     * Subtracts a constant 3D vector from all of the vectors in an input array and stores the results in an output array.
+     * This function pointer can be set to any of ne10_subc_vec3f_c, ne10_subc_vec3f_neon or ne10_subc_vec3f_asm, which share its signature.
+     * @param[out] dst   Pointer to the destination array
+     * @param[in]  src   Pointer to the source array
+     * @param[in]  cst   Pointer to the 3D vector subtracted from the input values
+     * @param[in]  count The number of items in the input array
+     */
+    extern ne10_result_t (*ne10_subc_vec3f) (ne10_vec3f_t * dst, ne10_vec3f_t * src, const ne10_vec3f_t * cst, ne10_uint32_t count);
+    extern ne10_result_t ne10_subc_vec3f_c (ne10_vec3f_t * dst, ne10_vec3f_t * src, const ne10_vec3f_t * cst, ne10_uint32_t count); // subtract cst from the element(s)
+    extern ne10_result_t ne10_subc_vec3f_neon (ne10_vec3f_t * dst, ne10_vec3f_t * src, const ne10_vec3f_t * cst, ne10_uint32_t count); // subtract cst from the element(s)
+    extern ne10_result_t ne10_subc_vec3f_asm (ne10_vec3f_t * dst, ne10_vec3f_t * src, const ne10_vec3f_t * cst, ne10_uint32_t count); // subtract cst from the element(s)
+    /**
+     * Subtracts a constant 4D vector from all of the vectors in an input array and stores the results in an output array.
+     * This function pointer can be set to any of ne10_subc_vec4f_c, ne10_subc_vec4f_neon or ne10_subc_vec4f_asm, which share its signature.
+     * @param[out] dst   Pointer to the destination array
+     * @param[in]  src   Pointer to the source array
+     * @param[in]  cst   Pointer to the 4D vector subtracted from the input values
+     * @param[in]  count The number of items in the input array
+     */
+    extern ne10_result_t (*ne10_subc_vec4f) (ne10_vec4f_t * dst, ne10_vec4f_t * src, const ne10_vec4f_t * cst, ne10_uint32_t count);
+    extern ne10_result_t ne10_subc_vec4f_c (ne10_vec4f_t * dst, ne10_vec4f_t * src, const ne10_vec4f_t * cst, ne10_uint32_t count); // subtract cst from the element(s)
+    extern ne10_result_t ne10_subc_vec4f_neon (ne10_vec4f_t * dst, ne10_vec4f_t * src, const ne10_vec4f_t * cst, ne10_uint32_t count); // subtract cst from the element(s)
+    extern ne10_result_t ne10_subc_vec4f_asm (ne10_vec4f_t * dst, ne10_vec4f_t * src, const ne10_vec4f_t * cst, ne10_uint32_t count); // subtract cst from the element(s)
+
+    /**
+     * Subtracts the elements of src2 from the elements of src1 and stores the results in the dst.
+     * This function pointer can be set to any of ne10_sub_float_c, ne10_sub_float_neon or ne10_sub_float_asm, which share its signature.
+     * @param[out] dst   Pointer to the destination array
+     * @param[in]  src1  The first array to use as the input array
+     * @param[in]  src2  The second array to use as the input array
+     * @param[in]  count The number of items in the two input arrays
+     */
+    extern ne10_result_t (*ne10_sub_float) (ne10_float32_t * dst, ne10_float32_t * src1, ne10_float32_t * src2, ne10_uint32_t count);
+    extern ne10_result_t ne10_sub_float_c (ne10_float32_t * dst, ne10_float32_t * src1, ne10_float32_t * src2, ne10_uint32_t count);
+    extern ne10_result_t ne10_sub_float_neon (ne10_float32_t * dst, ne10_float32_t * src1, ne10_float32_t * src2, ne10_uint32_t count) asm ("ne10_sub_float_neon");
+    extern ne10_result_t ne10_sub_float_asm (ne10_float32_t * dst, ne10_float32_t * src1, ne10_float32_t * src2, ne10_uint32_t count);
+    /**
+     * Vector subtraction of two 2D vectors.
+     * This function pointer can be set to any of ne10_sub_vec2f_c, ne10_sub_vec2f_neon or ne10_sub_vec2f_asm, which share its signature.
+     * @param[out] dst   Pointer to the destination array
+     * @param[in]  src1   Pointer to the first source array
+     * @param[in]  src2   Pointer to the second source array
+     * @param[in]  count The number of items in the input arrays
+     */
+    extern ne10_result_t (*ne10_sub_vec2f) (ne10_vec2f_t * dst, ne10_vec2f_t * src1, ne10_vec2f_t * src2, ne10_uint32_t count);
+    extern ne10_result_t ne10_sub_vec2f_c (ne10_vec2f_t * dst, ne10_vec2f_t * src1, ne10_vec2f_t * src2, ne10_uint32_t count);
+    extern ne10_result_t ne10_sub_vec2f_neon (ne10_vec2f_t * dst, ne10_vec2f_t * src1, ne10_vec2f_t * src2, ne10_uint32_t count) asm ("ne10_sub_vec2f_neon");
+    extern ne10_result_t ne10_sub_vec2f_asm (ne10_vec2f_t * dst, ne10_vec2f_t * src1, ne10_vec2f_t * src2, ne10_uint32_t count);
+    /**
+     * Vector subtraction of two 3D vectors.
+     * This function pointer can be set to any of ne10_sub_vec3f_c, ne10_sub_vec3f_neon or ne10_sub_vec3f_asm, which share its signature.
+     * @param[out] dst   Pointer to the destination array
+     * @param[in]  src1   Pointer to the first source array
+     * @param[in]  src2   Pointer to the second source array
+     * @param[in]  count The number of items in the input arrays
+     */
+    extern ne10_result_t (*ne10_sub_vec3f) (ne10_vec3f_t * dst, ne10_vec3f_t * src1, ne10_vec3f_t * src2, ne10_uint32_t count);
+    extern ne10_result_t ne10_sub_vec3f_c (ne10_vec3f_t * dst, ne10_vec3f_t * src1, ne10_vec3f_t * src2, ne10_uint32_t count);
+    extern ne10_result_t ne10_sub_vec3f_neon (ne10_vec3f_t * dst, ne10_vec3f_t * src1, ne10_vec3f_t * src2, ne10_uint32_t count) asm ("ne10_sub_vec3f_neon");
+    extern ne10_result_t ne10_sub_vec3f_asm (ne10_vec3f_t * dst, ne10_vec3f_t * src1, ne10_vec3f_t * src2, ne10_uint32_t count);
+    /**
+     * Vector subtraction of two 4D vectors.
+     * This function pointer can be set to any of ne10_sub_vec4f_c, ne10_sub_vec4f_neon or ne10_sub_vec4f_asm, which share its signature.
+     * @param[out] dst   Pointer to the destination array
+     * @param[in]  src1   Pointer to the first source array
+     * @param[in]  src2   Pointer to the second source array
+     * @param[in]  count The number of items in the input arrays
+     */
+    extern ne10_result_t (*ne10_sub_vec4f) (ne10_vec4f_t * dst, ne10_vec4f_t * src1, ne10_vec4f_t * src2, ne10_uint32_t count);
+    extern ne10_result_t ne10_sub_vec4f_c (ne10_vec4f_t * dst, ne10_vec4f_t * src1, ne10_vec4f_t * src2, ne10_uint32_t count);
+    extern ne10_result_t ne10_sub_vec4f_neon (ne10_vec4f_t * dst, ne10_vec4f_t * src1, ne10_vec4f_t * src2, ne10_uint32_t count) asm ("ne10_sub_vec4f_neon");
+    extern ne10_result_t ne10_sub_vec4f_asm (ne10_vec4f_t * dst, ne10_vec4f_t * src1, ne10_vec4f_t * src2, ne10_uint32_t count);
+    /** @} */ //end of Vector Sub group
+
+    /**
+     * @ingroup groupMaths
+     */
+
+    /**
+     * @defgroup RSBC Vector Rsbc
+     *
+     * \par
+     * These functions implement the vector rsbc operation for float data type.
+     */
+
+    /**
+     * @addtogroup RSBC
+     * @{
+     */
+    /**
+     * Subtracts the elements of an input array from a constant scalar and stores the results in an output array.
+     * This function pointer can be set to any of ne10_rsbc_float_c, ne10_rsbc_float_neon or ne10_rsbc_float_asm, which share its signature.
+     * @param[out] dst   Pointer to the destination array
+     * @param[in]  src   Pointer to the source array
+     * @param[in]  cst   The constant scalar to subtract the input values from
+     * @param[in]  count The number of items in the input array
+     */
+    extern ne10_result_t (*ne10_rsbc_float) (ne10_float32_t * dst, ne10_float32_t *src, const ne10_float32_t cst, ne10_uint32_t count);
+    extern ne10_result_t ne10_rsbc_float_c (ne10_float32_t * dst, ne10_float32_t * src, const ne10_float32_t cst, ne10_uint32_t count); // subtract element(s) from a cst
+    extern ne10_result_t ne10_rsbc_float_neon (ne10_float32_t * dst, ne10_float32_t * src, const ne10_float32_t cst, ne10_uint32_t count); // subtract element(s) from a cst
+    extern ne10_result_t ne10_rsbc_float_asm (ne10_float32_t * dst, ne10_float32_t *src, const ne10_float32_t cst, ne10_uint32_t count); // subtract element(s) from a cst
+    /**
+     * Subtracts the vectors in an input array from a constant 2D vector and stores the results in an output array.
+     * This function pointer can be set to any of ne10_rsbc_vec2f_c, ne10_rsbc_vec2f_neon or ne10_rsbc_vec2f_asm, which share its signature.
+     * @param[out] dst   Pointer to the destination array
+     * @param[in]  src   Pointer to the source array
+     * @param[in]  cst   Pointer to the 2D vector to subtract the input values from
+     * @param[in]  count The number of items in the input array
+     */
+    extern ne10_result_t (*ne10_rsbc_vec2f) (ne10_vec2f_t * dst, ne10_vec2f_t * src, const ne10_vec2f_t * cst, ne10_uint32_t count);
+    extern ne10_result_t ne10_rsbc_vec2f_c (ne10_vec2f_t * dst, ne10_vec2f_t * src, const ne10_vec2f_t * cst, ne10_uint32_t count); // subtract element(s) from a cst
+    extern ne10_result_t ne10_rsbc_vec2f_neon (ne10_vec2f_t * dst, ne10_vec2f_t * src, const ne10_vec2f_t * cst, ne10_uint32_t count); // subtract element(s) from a cst
+    extern ne10_result_t ne10_rsbc_vec2f_asm (ne10_vec2f_t * dst, ne10_vec2f_t *src, const ne10_vec2f_t * cst, ne10_uint32_t count); // subtract element(s) from a cst
+    /**
+     * Subtracts the vectors in an input array from a constant 3D vector and stores the results in an output array.
+     * This function pointer can be set to any of ne10_rsbc_vec3f_c, ne10_rsbc_vec3f_neon or ne10_rsbc_vec3f_asm, which share its signature.
+     * @param[out] dst   Pointer to the destination array
+     * @param[in]  src   Pointer to the source array
+     * @param[in]  cst   Pointer to the 3D vector to subtract the input values from
+     * @param[in]  count The number of items in the input array
+     */
+    extern ne10_result_t (*ne10_rsbc_vec3f) (ne10_vec3f_t * dst, ne10_vec3f_t * src, const ne10_vec3f_t * cst, ne10_uint32_t count);
+    extern ne10_result_t ne10_rsbc_vec3f_c (ne10_vec3f_t * dst, ne10_vec3f_t * src, const ne10_vec3f_t * cst, ne10_uint32_t count); // subtract element(s) from a cst
+    extern ne10_result_t ne10_rsbc_vec3f_neon (ne10_vec3f_t * dst, ne10_vec3f_t * src, const ne10_vec3f_t * cst, ne10_uint32_t count); // subtract element(s) from a cst
+    extern ne10_result_t ne10_rsbc_vec3f_asm (ne10_vec3f_t * dst, ne10_vec3f_t *src, const ne10_vec3f_t * cst, ne10_uint32_t count); // subtract element(s) from a cst
+    /**
+     * Subtracts each 4D vector of an input array from a constant 4D vector (cst - src) and stores the results in an output array.
+     * This function pointer can point to one of ne10_rsbc_vec4f_c, ne10_rsbc_vec4f_neon or ne10_rsbc_vec4f_asm.
+     * @param[out] dst   Pointer to the destination array
+     * @param[in]  src   Pointer to the source array
+     * @param[in]  cst   Pointer to the 4D vector to subtract the input values from
+     * @param[in]  count The number of items in the input array
+     */
+    extern ne10_result_t (*ne10_rsbc_vec4f) (ne10_vec4f_t * dst, ne10_vec4f_t * src, const ne10_vec4f_t * cst, ne10_uint32_t count);
+    extern ne10_result_t ne10_rsbc_vec4f_c (ne10_vec4f_t * dst, ne10_vec4f_t * src, const ne10_vec4f_t * cst, ne10_uint32_t count); // subtract element(s) from a cst
+    extern ne10_result_t ne10_rsbc_vec4f_neon (ne10_vec4f_t * dst, ne10_vec4f_t * src, const ne10_vec4f_t * cst, ne10_uint32_t count); // subtract element(s) from a cst
+    extern ne10_result_t ne10_rsbc_vec4f_asm (ne10_vec4f_t * dst, ne10_vec4f_t *src, const ne10_vec4f_t * cst, ne10_uint32_t count); // subtract element(s) from a cst
+    /** @} */ //end of Vector RSBC group
+
+    /**
+     * @ingroup groupMaths
+     */
+
+    /**
+     * @defgroup SUB_MAT Matrix Sub
+     *
+     * \par
+     * These functions implement the matrix sub operation for float data type.
+     */
+
+    /**
+     * @addtogroup SUB_MAT
+     * @{
+     */
+    /**
+     * Matrix subtraction of two 4x4 matrices.
+     * This function pointer can point to one of ne10_submat_4x4f_c, ne10_submat_4x4f_neon or ne10_submat_4x4f_asm.
+     * @param[out] dst   Pointer to the destination array
+     * @param[in]  src1  Pointer to the first source array
+     * @param[in]  src2  Pointer to the second source array
+     * @param[in]  count The number of items in the input arrays
+     */
+    extern ne10_result_t (*ne10_submat_4x4f) (ne10_mat4x4f_t * dst, ne10_mat4x4f_t * src1, ne10_mat4x4f_t * src2, ne10_uint32_t count);
+    extern ne10_result_t ne10_submat_4x4f_c (ne10_mat4x4f_t * dst, ne10_mat4x4f_t * src1, ne10_mat4x4f_t * src2, ne10_uint32_t count);
+    extern ne10_result_t ne10_submat_4x4f_neon (ne10_mat4x4f_t * dst, ne10_mat4x4f_t * src1, ne10_mat4x4f_t * src2, ne10_uint32_t count);
+    extern ne10_result_t ne10_submat_4x4f_asm (ne10_mat4x4f_t * dst, ne10_mat4x4f_t * src1, ne10_mat4x4f_t * src2, ne10_uint32_t count);
+
+    /**
+     * Matrix subtraction of two 3x3 matrices.
+     * This function pointer can point to one of ne10_submat_3x3f_c, ne10_submat_3x3f_neon or ne10_submat_3x3f_asm.
+     * @param[out] dst   Pointer to the destination array
+     * @param[in]  src1  Pointer to the first source array
+     * @param[in]  src2  Pointer to the second source array
+     * @param[in]  count The number of items in the input arrays
+     */
+    extern ne10_result_t (*ne10_submat_3x3f) (ne10_mat3x3f_t * dst, ne10_mat3x3f_t * src1, ne10_mat3x3f_t * src2, ne10_uint32_t count);
+    extern ne10_result_t ne10_submat_3x3f_c (ne10_mat3x3f_t * dst, ne10_mat3x3f_t * src1, ne10_mat3x3f_t * src2, ne10_uint32_t count);
+    extern ne10_result_t ne10_submat_3x3f_neon (ne10_mat3x3f_t * dst, ne10_mat3x3f_t * src1, ne10_mat3x3f_t * src2, ne10_uint32_t count);
+    extern ne10_result_t ne10_submat_3x3f_asm (ne10_mat3x3f_t * dst, ne10_mat3x3f_t * src1, ne10_mat3x3f_t * src2, ne10_uint32_t count);
+
+    /**
+     * Matrix subtraction of two 2x2 matrices.
+     * This function pointer can point to one of ne10_submat_2x2f_c, ne10_submat_2x2f_neon or ne10_submat_2x2f_asm.
+     * @param[out] dst   Pointer to the destination array
+     * @param[in]  src1  Pointer to the first source array
+     * @param[in]  src2  Pointer to the second source array
+     * @param[in]  count The number of items in the input arrays
+     */
+    extern ne10_result_t (*ne10_submat_2x2f) (ne10_mat2x2f_t * dst, ne10_mat2x2f_t * src1, ne10_mat2x2f_t * src2, ne10_uint32_t count);
+    extern ne10_result_t ne10_submat_2x2f_c (ne10_mat2x2f_t * dst, ne10_mat2x2f_t * src1, ne10_mat2x2f_t * src2, ne10_uint32_t count);
+    extern ne10_result_t ne10_submat_2x2f_neon (ne10_mat2x2f_t * dst, ne10_mat2x2f_t * src1, ne10_mat2x2f_t * src2, ne10_uint32_t count);
+    extern ne10_result_t ne10_submat_2x2f_asm (ne10_mat2x2f_t * dst, ne10_mat2x2f_t * src1, ne10_mat2x2f_t * src2, ne10_uint32_t count);
+    /** @} */ //end of Matrix Sub group
+
+    /**
+     * @ingroup groupMaths
+     */
+
+    /**
+     * @defgroup MUL_VEC Vector Multiply
+     *
+     * \par
+     * These functions implement the vector multiply operation for float data type.
+     */
+
+    /**
+     * @addtogroup MUL_VEC
+     * @{
+     */
+
+    /**
+     * Multiplies the elements of an input array by a constant scalar and stores the results in an output array.
+     * This function pointer can point to one of ne10_mulc_float_c, ne10_mulc_float_neon or ne10_mulc_float_asm.
+     * @param[out] dst   Pointer to the destination array
+     * @param[in]  src   Pointer to the source array
+     * @param[in]  cst   The constant scalar to multiply the input values with
+     * @param[in]  count The number of items in the input array
+     */
+    extern ne10_result_t (*ne10_mulc_float) (ne10_float32_t * dst, ne10_float32_t * src, const ne10_float32_t cst, ne10_uint32_t count);
+    extern ne10_result_t ne10_mulc_float_c (ne10_float32_t * dst, ne10_float32_t * src, const ne10_float32_t cst, ne10_uint32_t count);
+    extern ne10_result_t ne10_mulc_float_neon (ne10_float32_t * dst, ne10_float32_t * src, const ne10_float32_t cst, ne10_uint32_t count);
+    extern ne10_result_t ne10_mulc_float_asm (ne10_float32_t * dst, ne10_float32_t * src, const ne10_float32_t cst, ne10_uint32_t count);
+    /**
+     * Multiplies the components of 2D vectors in an input array by the components of a constant 2D vector and stores the results in an output array.
+     * This function pointer can point to one of ne10_mulc_vec2f_c, ne10_mulc_vec2f_neon or ne10_mulc_vec2f_asm.
+     * @param[out] dst   Pointer to the destination array
+     * @param[in]  src   Pointer to the source array
+     * @param[in]  cst   Pointer to the 2D vector to multiply the input values with
+     * @param[in]  count The number of items in the input array
+     */
+    extern ne10_result_t (*ne10_mulc_vec2f) (ne10_vec2f_t * dst, ne10_vec2f_t * src, const ne10_vec2f_t * cst, ne10_uint32_t count);
+    extern ne10_result_t ne10_mulc_vec2f_c (ne10_vec2f_t * dst, ne10_vec2f_t * src, const ne10_vec2f_t * cst, ne10_uint32_t count);
+    extern ne10_result_t ne10_mulc_vec2f_neon (ne10_vec2f_t * dst, ne10_vec2f_t * src, const ne10_vec2f_t * cst, ne10_uint32_t count);
+    extern ne10_result_t ne10_mulc_vec2f_asm (ne10_vec2f_t * dst, ne10_vec2f_t * src, const ne10_vec2f_t * cst, ne10_uint32_t count);
+    /**
+     * Multiplies the components of 3D vectors in an input array by the components of a constant 3D vector and stores the results in an output array.
+     * This function pointer can point to one of ne10_mulc_vec3f_c, ne10_mulc_vec3f_neon or ne10_mulc_vec3f_asm.
+     * @param[out] dst   Pointer to the destination array
+     * @param[in]  src   Pointer to the source array
+     * @param[in]  cst   Pointer to the 3D vector to multiply the input values with
+     * @param[in]  count The number of items in the input array
+     */
+    extern ne10_result_t (*ne10_mulc_vec3f) (ne10_vec3f_t * dst, ne10_vec3f_t * src, const ne10_vec3f_t * cst, ne10_uint32_t count);
+    extern ne10_result_t ne10_mulc_vec3f_c (ne10_vec3f_t * dst, ne10_vec3f_t * src, const ne10_vec3f_t * cst, ne10_uint32_t count);
+    extern ne10_result_t ne10_mulc_vec3f_neon (ne10_vec3f_t * dst, ne10_vec3f_t * src, const ne10_vec3f_t * cst, ne10_uint32_t count);
+    extern ne10_result_t ne10_mulc_vec3f_asm (ne10_vec3f_t * dst, ne10_vec3f_t * src, const ne10_vec3f_t * cst, ne10_uint32_t count);
+    /**
+     * Multiplies the components of 4D vectors in an input array by the components of a constant 4D vector and stores the results in an output array.
+     * This function pointer can point to one of ne10_mulc_vec4f_c, ne10_mulc_vec4f_neon or ne10_mulc_vec4f_asm.
+     * @param[out] dst   Pointer to the destination array
+     * @param[in]  src   Pointer to the source array
+     * @param[in]  cst   Pointer to the 4D vector to multiply the input values with
+     * @param[in]  count The number of items in the input array
+     */
+    extern ne10_result_t (*ne10_mulc_vec4f) (ne10_vec4f_t * dst, ne10_vec4f_t * src, const ne10_vec4f_t * cst, ne10_uint32_t count);
+    extern ne10_result_t ne10_mulc_vec4f_c (ne10_vec4f_t * dst, ne10_vec4f_t * src, const ne10_vec4f_t * cst, ne10_uint32_t count);
+    extern ne10_result_t ne10_mulc_vec4f_neon (ne10_vec4f_t * dst, ne10_vec4f_t * src, const ne10_vec4f_t * cst, ne10_uint32_t count);
+    extern ne10_result_t ne10_mulc_vec4f_asm (ne10_vec4f_t * dst, ne10_vec4f_t * src, const ne10_vec4f_t * cst, ne10_uint32_t count);
+
+    /**
+     * Multiplies the elements of src1 by the corresponding elements of src2 and stores the results in dst.
+     * This function pointer can point to one of ne10_mul_float_c, ne10_mul_float_neon or ne10_mul_float_asm.
+     * The _neon prototype carries an asm label that sets its assembler-level symbol name.
+     * @param[out] dst   Pointer to the destination array
+     * @param[in]  src1  The first array to use as the input array
+     * @param[in]  src2  The second array to use as the input array
+     * @param[in]  count The number of items in the two input arrays
+     */
+    extern ne10_result_t (*ne10_mul_float) (ne10_float32_t * dst, ne10_float32_t * src1, ne10_float32_t * src2, ne10_uint32_t count);
+    extern ne10_result_t ne10_mul_float_c (ne10_float32_t * dst, ne10_float32_t * src1, ne10_float32_t * src2, ne10_uint32_t count);
+    extern ne10_result_t ne10_mul_float_neon (ne10_float32_t * dst, ne10_float32_t * src1, ne10_float32_t * src2, ne10_uint32_t count) asm ("ne10_mul_float_neon");
+    extern ne10_result_t ne10_mul_float_asm (ne10_float32_t * dst, ne10_float32_t * src1, ne10_float32_t * src2, ne10_uint32_t count);
+    /**
+     * Multiplies the components of a 2D vector with the corresponding components of another.
+     * This function pointer can point to one of ne10_vmul_vec2f_c, ne10_vmul_vec2f_neon or ne10_vmul_vec2f_asm.
+     * The _neon prototype carries an asm label that sets its assembler-level symbol name.
+     * @param[out] dst   Pointer to the destination array
+     * @param[in]  src1  Pointer to the first source array
+     * @param[in]  src2  Pointer to the second source array
+     * @param[in]  count The number of items in the input arrays
+     */
+    extern ne10_result_t (*ne10_vmul_vec2f) (ne10_vec2f_t * dst, ne10_vec2f_t * src1, ne10_vec2f_t * src2, ne10_uint32_t count);
+    extern ne10_result_t ne10_vmul_vec2f_c (ne10_vec2f_t * dst, ne10_vec2f_t * src1, ne10_vec2f_t * src2, ne10_uint32_t count);
+    extern ne10_result_t ne10_vmul_vec2f_neon (ne10_vec2f_t * dst, ne10_vec2f_t * src1, ne10_vec2f_t * src2, ne10_uint32_t count) asm ("ne10_vmul_vec2f_neon");
+    extern ne10_result_t ne10_vmul_vec2f_asm (ne10_vec2f_t * dst, ne10_vec2f_t * src1, ne10_vec2f_t * src2, ne10_uint32_t count);
+    /**
+     * Multiplies the components of a 3D vector with the corresponding components of another.
+     * This function pointer can point to one of ne10_vmul_vec3f_c, ne10_vmul_vec3f_neon or ne10_vmul_vec3f_asm.
+     * The _neon prototype carries an asm label that sets its assembler-level symbol name.
+     * @param[out] dst   Pointer to the destination array
+     * @param[in]  src1  Pointer to the first source array
+     * @param[in]  src2  Pointer to the second source array
+     * @param[in]  count The number of items in the input arrays
+     */
+    extern ne10_result_t (*ne10_vmul_vec3f) (ne10_vec3f_t * dst, ne10_vec3f_t * src1, ne10_vec3f_t * src2, ne10_uint32_t count);
+    extern ne10_result_t ne10_vmul_vec3f_c (ne10_vec3f_t * dst, ne10_vec3f_t * src1, ne10_vec3f_t * src2, ne10_uint32_t count);
+    extern ne10_result_t ne10_vmul_vec3f_neon (ne10_vec3f_t * dst, ne10_vec3f_t * src1, ne10_vec3f_t * src2, ne10_uint32_t count) asm ("ne10_vmul_vec3f_neon");
+    extern ne10_result_t ne10_vmul_vec3f_asm (ne10_vec3f_t * dst, ne10_vec3f_t * src1, ne10_vec3f_t * src2, ne10_uint32_t count);
+    /**
+     * Multiplies the components of a 4D vector with the corresponding components of another.
+     * This function pointer can point to one of ne10_vmul_vec4f_c, ne10_vmul_vec4f_neon or ne10_vmul_vec4f_asm.
+     * The _neon prototype carries an asm label that sets its assembler-level symbol name.
+     * @param[out] dst   Pointer to the destination array
+     * @param[in]  src1  Pointer to the first source array
+     * @param[in]  src2  Pointer to the second source array
+     * @param[in]  count The number of items in the input arrays
+     */
+    extern ne10_result_t (*ne10_vmul_vec4f) (ne10_vec4f_t * dst, ne10_vec4f_t * src1, ne10_vec4f_t * src2, ne10_uint32_t count);
+    extern ne10_result_t ne10_vmul_vec4f_c (ne10_vec4f_t * dst, ne10_vec4f_t * src1, ne10_vec4f_t * src2, ne10_uint32_t count);
+    extern ne10_result_t ne10_vmul_vec4f_neon (ne10_vec4f_t * dst, ne10_vec4f_t * src1, ne10_vec4f_t * src2, ne10_uint32_t count) asm ("ne10_vmul_vec4f_neon");
+    extern ne10_result_t ne10_vmul_vec4f_asm (ne10_vec4f_t * dst, ne10_vec4f_t * src1, ne10_vec4f_t * src2, ne10_uint32_t count);
+    /** @} */ //end of Vector Multiply group
+
+    /**
+     * @ingroup groupMaths
+     */
+
+    /**
+     * @defgroup MLA_VEC Vector Multiply-Accumulator
+     *
+     * \par
+     * These functions implement the vector multiply-accumulator operation for float data type.
+     */
+
+    /**
+     * @addtogroup MLA_VEC
+     * @{
+     */
+
+    /**
+     * Multiplies each entry in the source array (src) by cst, then adds the result to
+     * the corresponding item of the accumulation array (acc), and stores the result in the destination array.
+     * This function pointer can point to one of ne10_mlac_float_c, ne10_mlac_float_neon or ne10_mlac_float_asm.
+     * @param[out] dst   Pointer to the destination array
+     * @param[in]  acc   Pointer to the accumulation array; the corresponding element is added to the result of the multiplication
+     * @param[in]  src   Pointer to the source array
+     * @param[in]  cst   The constant scalar to multiply the input elements with
+     * @param[in]  count The number of items in the input array
+     */
+    extern ne10_result_t (*ne10_mlac_float) (ne10_float32_t * dst, ne10_float32_t * acc, ne10_float32_t * src, const ne10_float32_t cst, ne10_uint32_t count);
+    extern ne10_result_t ne10_mlac_float_c (ne10_float32_t * dst, ne10_float32_t * acc, ne10_float32_t * src, const ne10_float32_t cst, ne10_uint32_t count);
+    extern ne10_result_t ne10_mlac_float_neon (ne10_float32_t * dst, ne10_float32_t * acc, ne10_float32_t * src, const ne10_float32_t cst, ne10_uint32_t count);
+    extern ne10_result_t ne10_mlac_float_asm (ne10_float32_t * dst, ne10_float32_t * acc, ne10_float32_t * src, const ne10_float32_t cst, ne10_uint32_t count);
+    /**
+     * Multiplies each entry in the source array (src) by the 2D vector cst, then adds the result to
+     * the corresponding item of the accumulation array (acc), and stores the result in the destination array.
+     * This function pointer can point to one of ne10_mlac_vec2f_c, ne10_mlac_vec2f_neon or ne10_mlac_vec2f_asm.
+     * @param[out] dst   Pointer to the destination array
+     * @param[in]  acc   Pointer to the accumulation array; the corresponding element is added to the result of the multiplication
+     * @param[in]  src   Pointer to the source array
+     * @param[in]  cst   Pointer to the 2D vector to multiply the input vectors with
+     * @param[in]  count The number of items in the input array
+     */
+    extern ne10_result_t (*ne10_mlac_vec2f) (ne10_vec2f_t * dst, ne10_vec2f_t * acc, ne10_vec2f_t * src, const ne10_vec2f_t * cst, ne10_uint32_t count);
+    extern ne10_result_t ne10_mlac_vec2f_c (ne10_vec2f_t * dst, ne10_vec2f_t * acc, ne10_vec2f_t * src, const ne10_vec2f_t * cst, ne10_uint32_t count);
+    extern ne10_result_t ne10_mlac_vec2f_neon (ne10_vec2f_t * dst, ne10_vec2f_t * acc, ne10_vec2f_t * src, const ne10_vec2f_t * cst, ne10_uint32_t count);
+    extern ne10_result_t ne10_mlac_vec2f_asm (ne10_vec2f_t * dst, ne10_vec2f_t * acc, ne10_vec2f_t * src, const ne10_vec2f_t * cst, ne10_uint32_t count);
+    /**
+     * Multiplies each entry in the source array (src) by the 3D vector cst, then adds the result to
+     * the corresponding item of the accumulation array (acc), and stores the result in the destination array.
+     * This function pointer can point to one of ne10_mlac_vec3f_c, ne10_mlac_vec3f_neon or ne10_mlac_vec3f_asm.
+     * @param[out] dst   Pointer to the destination array
+     * @param[in]  acc   Pointer to the accumulation array; the corresponding element is added to the result of the multiplication
+     * @param[in]  src   Pointer to the source array
+     * @param[in]  cst   Pointer to the 3D vector to multiply the input vectors with
+     * @param[in]  count The number of items in the input array
+     */
+    extern ne10_result_t (*ne10_mlac_vec3f) (ne10_vec3f_t * dst, ne10_vec3f_t * acc, ne10_vec3f_t * src, const ne10_vec3f_t * cst, ne10_uint32_t count);
+    extern ne10_result_t ne10_mlac_vec3f_c (ne10_vec3f_t * dst, ne10_vec3f_t * acc, ne10_vec3f_t * src, const ne10_vec3f_t * cst, ne10_uint32_t count);
+    extern ne10_result_t ne10_mlac_vec3f_neon (ne10_vec3f_t * dst, ne10_vec3f_t * acc, ne10_vec3f_t * src, const ne10_vec3f_t * cst, ne10_uint32_t count);
+    extern ne10_result_t ne10_mlac_vec3f_asm (ne10_vec3f_t * dst, ne10_vec3f_t * acc, ne10_vec3f_t * src, const ne10_vec3f_t * cst, ne10_uint32_t count);
+    /**
+     * Multiplies each entry in the source array (src) by the 4D vector cst, then adds the result to
+     * the corresponding item of the accumulation array (acc), and stores the result in the destination array.
+     * This function pointer can point to one of ne10_mlac_vec4f_c, ne10_mlac_vec4f_neon or ne10_mlac_vec4f_asm.
+     * @param[out] dst   Pointer to the destination array
+     * @param[in]  acc   Pointer to the accumulation array; the corresponding element is added to the result of the multiplication
+     * @param[in]  src   Pointer to the source array
+     * @param[in]  cst   Pointer to the 4D vector to multiply the input vectors with
+     * @param[in]  count The number of items in the input array
+     */
+    extern ne10_result_t (*ne10_mlac_vec4f) (ne10_vec4f_t * dst, ne10_vec4f_t * acc, ne10_vec4f_t * src, const ne10_vec4f_t * cst, ne10_uint32_t count);
+    extern ne10_result_t ne10_mlac_vec4f_c (ne10_vec4f_t * dst, ne10_vec4f_t * acc, ne10_vec4f_t * src, const ne10_vec4f_t * cst, ne10_uint32_t count);
+    extern ne10_result_t ne10_mlac_vec4f_neon (ne10_vec4f_t * dst, ne10_vec4f_t * acc, ne10_vec4f_t * src, const ne10_vec4f_t * cst, ne10_uint32_t count);
+    extern ne10_result_t ne10_mlac_vec4f_asm (ne10_vec4f_t * dst, ne10_vec4f_t * acc, ne10_vec4f_t * src, const ne10_vec4f_t * cst, ne10_uint32_t count);
+
+    /**
+     * Performs a multiply and accumulate operation using the corresponding elements in acc, src1, and src2.
+     * This function pointer can point to one of ne10_mla_float_c, ne10_mla_float_neon or ne10_mla_float_asm.
+     * The _neon prototype carries an asm label that sets its assembler-level symbol name.
+     * @param[out] dst   Pointer to the destination array
+     * @param[in]  acc   Pointer to the accumulation array; these elements are added to the result of the multiplication operation
+     * @param[in]  src1  The first array to use as the input array
+     * @param[in]  src2  The second array to use as the input array
+     * @param[in]  count The number of items in the two input arrays
+     */
+    extern ne10_result_t (*ne10_mla_float) (ne10_float32_t * dst, ne10_float32_t * acc, ne10_float32_t * src1, ne10_float32_t * src2, ne10_uint32_t count);
+    extern ne10_result_t ne10_mla_float_c (ne10_float32_t * dst, ne10_float32_t * acc, ne10_float32_t * src1, ne10_float32_t * src2, ne10_uint32_t count);
+    extern ne10_result_t ne10_mla_float_neon (ne10_float32_t * dst, ne10_float32_t * acc, ne10_float32_t * src1, ne10_float32_t * src2, ne10_uint32_t count) asm ("ne10_mla_float_neon");
+    extern ne10_result_t ne10_mla_float_asm (ne10_float32_t * dst, ne10_float32_t * acc, ne10_float32_t * src1, ne10_float32_t * src2, ne10_uint32_t count);
+    /**
+     * Performs a multiply and accumulate operation on the components of a 2D vector with the corresponding components of another.
+     * This function pointer can point to one of ne10_vmla_vec2f_c, ne10_vmla_vec2f_neon or ne10_vmla_vec2f_asm.
+     * The _neon prototype carries an asm label that sets its assembler-level symbol name.
+     * @param[out] dst   Pointer to the destination array
+     * @param[in]  acc   Pointer to the accumulation array (presumably added to the product, as documented for ne10_mla_float; TODO confirm)
+     * @param[in]  src1  Pointer to the first source array
+     * @param[in]  src2  Pointer to the second source array
+     * @param[in]  count The number of items in the input arrays
+     */
+    extern ne10_result_t (*ne10_vmla_vec2f) (ne10_vec2f_t * dst, ne10_vec2f_t * acc, ne10_vec2f_t * src1, ne10_vec2f_t * src2, ne10_uint32_t count);
+    extern ne10_result_t ne10_vmla_vec2f_c (ne10_vec2f_t * dst, ne10_vec2f_t * acc, ne10_vec2f_t * src1, ne10_vec2f_t * src2, ne10_uint32_t count);
+    extern ne10_result_t ne10_vmla_vec2f_neon (ne10_vec2f_t * dst, ne10_vec2f_t * acc, ne10_vec2f_t * src1, ne10_vec2f_t * src2, ne10_uint32_t count) asm ("ne10_vmla_vec2f_neon");
+    extern ne10_result_t ne10_vmla_vec2f_asm (ne10_vec2f_t * dst, ne10_vec2f_t * acc, ne10_vec2f_t * src1, ne10_vec2f_t * src2, ne10_uint32_t count);
+    /**
+     * Performs a multiply and accumulate operation on the components of a 3D vector with the corresponding components of another.
+     * This function pointer can point to one of ne10_vmla_vec3f_c, ne10_vmla_vec3f_neon or ne10_vmla_vec3f_asm.
+     * The _neon prototype carries an asm label that sets its assembler-level symbol name.
+     * @param[out] dst   Pointer to the destination array
+     * @param[in]  acc   Pointer to the accumulation array (presumably added to the product, as documented for ne10_mla_float; TODO confirm)
+     * @param[in]  src1  Pointer to the first source array
+     * @param[in]  src2  Pointer to the second source array
+     * @param[in]  count The number of items in the input arrays
+     */
+    extern ne10_result_t (*ne10_vmla_vec3f) (ne10_vec3f_t * dst, ne10_vec3f_t * acc, ne10_vec3f_t * src1, ne10_vec3f_t * src2, ne10_uint32_t count);
+    extern ne10_result_t ne10_vmla_vec3f_c (ne10_vec3f_t * dst, ne10_vec3f_t * acc, ne10_vec3f_t * src1, ne10_vec3f_t * src2, ne10_uint32_t count);
+    extern ne10_result_t ne10_vmla_vec3f_neon (ne10_vec3f_t * dst, ne10_vec3f_t * acc, ne10_vec3f_t * src1, ne10_vec3f_t * src2, ne10_uint32_t count) asm ("ne10_vmla_vec3f_neon");
+    extern ne10_result_t ne10_vmla_vec3f_asm (ne10_vec3f_t * dst, ne10_vec3f_t * acc, ne10_vec3f_t * src1, ne10_vec3f_t * src2, ne10_uint32_t count);
+    /**
+     * Performs a multiply and accumulate operation on the components of a 4D vector with the corresponding components of another.
+     * This function pointer can point to one of ne10_vmla_vec4f_c, ne10_vmla_vec4f_neon or ne10_vmla_vec4f_asm.
+     * The _neon prototype carries an asm label that sets its assembler-level symbol name.
+     * @param[out] dst   Pointer to the destination array
+     * @param[in]  acc   Pointer to the accumulation array (presumably added to the product, as documented for ne10_mla_float; TODO confirm)
+     * @param[in]  src1  Pointer to the first source array
+     * @param[in]  src2  Pointer to the second source array
+     * @param[in]  count The number of items in the input arrays
+     */
+    extern ne10_result_t (*ne10_vmla_vec4f) (ne10_vec4f_t * dst, ne10_vec4f_t * acc, ne10_vec4f_t * src1, ne10_vec4f_t * src2, ne10_uint32_t count);
+    extern ne10_result_t ne10_vmla_vec4f_c (ne10_vec4f_t * dst, ne10_vec4f_t * acc, ne10_vec4f_t * src1, ne10_vec4f_t * src2, ne10_uint32_t count);
+    extern ne10_result_t ne10_vmla_vec4f_neon (ne10_vec4f_t * dst, ne10_vec4f_t * acc, ne10_vec4f_t * src1, ne10_vec4f_t * src2, ne10_uint32_t count) asm ("ne10_vmla_vec4f_neon");
+    extern ne10_result_t ne10_vmla_vec4f_asm (ne10_vec4f_t * dst, ne10_vec4f_t * acc, ne10_vec4f_t * src1, ne10_vec4f_t * src2, ne10_uint32_t count);
+    /** @} */ //end of Vector Multiply-Accumulator group
+
+    /**
+     * @ingroup groupMaths
+     */
+
+    /**
+     * @defgroup MUL_MAT Matrix Multiply
+     *
+     * \par
+     * These functions implement the matrix multiply operation for float data type.
+     */
+
+    /**
+     * @addtogroup MUL_MAT
+     * @{
+     */
+
+    /**
+     * Matrix multiplication of two 4x4 matrices.
+     * This function pointer can point to one of ne10_mulmat_4x4f_c, ne10_mulmat_4x4f_neon or ne10_mulmat_4x4f_asm.
+     * The _neon prototype carries an asm label that sets its assembler-level symbol name.
+     * @param[out] dst   Pointer to the destination array
+     * @param[in]  src1  Pointer to the first source array
+     * @param[in]  src2  Pointer to the second source array
+     * @param[in]  count The number of items in the input arrays
+     */
+    extern ne10_result_t (*ne10_mulmat_4x4f) (ne10_mat4x4f_t * dst, ne10_mat4x4f_t * src1, ne10_mat4x4f_t * src2, ne10_uint32_t count);
+    extern ne10_result_t ne10_mulmat_4x4f_c (ne10_mat4x4f_t * dst, ne10_mat4x4f_t * src1, ne10_mat4x4f_t * src2, ne10_uint32_t count);
+    extern ne10_result_t ne10_mulmat_4x4f_neon (ne10_mat4x4f_t * dst, ne10_mat4x4f_t * src1, ne10_mat4x4f_t * src2, ne10_uint32_t count) asm ("ne10_mulmat_4x4f_neon");
+    extern ne10_result_t ne10_mulmat_4x4f_asm (ne10_mat4x4f_t * dst, ne10_mat4x4f_t * src1, ne10_mat4x4f_t * src2, ne10_uint32_t count);
+
+    /**
+     * Matrix multiplication of two 3x3 matrices.
+     * This function pointer can point to one of ne10_mulmat_3x3f_c, ne10_mulmat_3x3f_neon or ne10_mulmat_3x3f_asm.
+     * The _neon prototype carries an asm label that sets its assembler-level symbol name.
+     * @param[out] dst   Pointer to the destination array
+     * @param[in]  src1  Pointer to the first source array
+     * @param[in]  src2  Pointer to the second source array
+     * @param[in]  count The number of items in the input arrays
+     */
+    extern ne10_result_t (*ne10_mulmat_3x3f) (ne10_mat3x3f_t * dst, ne10_mat3x3f_t * src1, ne10_mat3x3f_t * src2, ne10_uint32_t count);
+    extern ne10_result_t ne10_mulmat_3x3f_c (ne10_mat3x3f_t * dst, ne10_mat3x3f_t * src1, ne10_mat3x3f_t * src2, ne10_uint32_t count);
+    extern ne10_result_t ne10_mulmat_3x3f_neon (ne10_mat3x3f_t * dst, ne10_mat3x3f_t * src1, ne10_mat3x3f_t * src2, ne10_uint32_t count) asm ("ne10_mulmat_3x3f_neon");
+    extern ne10_result_t ne10_mulmat_3x3f_asm (ne10_mat3x3f_t * dst, ne10_mat3x3f_t * src1, ne10_mat3x3f_t * src2, ne10_uint32_t count);
+
+    /**
+     * Matrix multiplication of two 2x2 matrices.
+     * This function pointer can point to one of ne10_mulmat_2x2f_c, ne10_mulmat_2x2f_neon or ne10_mulmat_2x2f_asm.
+     * The _neon prototype carries an asm label that sets its assembler-level symbol name.
+     * @param[out] dst   Pointer to the destination array
+     * @param[in]  src1  Pointer to the first source array
+     * @param[in]  src2  Pointer to the second source array
+     * @param[in]  count The number of items in the input arrays
+     */
+    extern ne10_result_t (*ne10_mulmat_2x2f) (ne10_mat2x2f_t * dst, ne10_mat2x2f_t * src1, ne10_mat2x2f_t * src2, ne10_uint32_t count);
+    extern ne10_result_t ne10_mulmat_2x2f_c (ne10_mat2x2f_t * dst, ne10_mat2x2f_t * src1, ne10_mat2x2f_t * src2, ne10_uint32_t count);
+    extern ne10_result_t ne10_mulmat_2x2f_neon (ne10_mat2x2f_t * dst, ne10_mat2x2f_t * src1, ne10_mat2x2f_t * src2, ne10_uint32_t count) asm ("ne10_mulmat_2x2f_neon");
+    extern ne10_result_t ne10_mulmat_2x2f_asm (ne10_mat2x2f_t * dst, ne10_mat2x2f_t * src1, ne10_mat2x2f_t * src2, ne10_uint32_t count);
+    /** @} */ //end of Matrix Multiply group
+
+    /**
+     * @ingroup groupMaths
+     */
+
+    /**
+     * @defgroup MUL_MAT_VEC Matrix Vector Multiply
+     *
+     * \par
+     * These functions implement the matrix vector multiply operation for float data type.
+     */
+
+    /**
+     * @addtogroup MUL_MAT_VEC
+     * @{
+     */
+    /**
+     * Multiplication of a constant 4x4 matrix and the 4D vectors of an input array.
+     * This function pointer can point to one of ne10_mulcmatvec_cm4x4f_v4f_c, ne10_mulcmatvec_cm4x4f_v4f_neon or ne10_mulcmatvec_cm4x4f_v4f_asm.
+     * The _neon prototype carries an asm label that sets its assembler-level symbol name.
+     * @param[out] dst   Pointer to the destination array
+     * @param[in]  cst   Pointer to the matrix to multiply the input values with
+     * @param[in]  src   Pointer to the source array
+     * @param[in]  count The number of items in the input array
+     */
+    extern ne10_result_t (*ne10_mulcmatvec_cm4x4f_v4f) (ne10_vec4f_t * dst, const ne10_mat4x4f_t * cst, ne10_vec4f_t * src, ne10_uint32_t count);
+    extern ne10_result_t ne10_mulcmatvec_cm4x4f_v4f_c (ne10_vec4f_t * dst, const ne10_mat4x4f_t * cst, ne10_vec4f_t * src, ne10_uint32_t count);
+    extern ne10_result_t ne10_mulcmatvec_cm4x4f_v4f_neon (ne10_vec4f_t * dst, const ne10_mat4x4f_t * cst, ne10_vec4f_t * src, ne10_uint32_t count) asm ("ne10_mulcmatvec_cm4x4f_v4f_neon");
+    extern ne10_result_t ne10_mulcmatvec_cm4x4f_v4f_asm (ne10_vec4f_t * dst, const ne10_mat4x4f_t * cst, ne10_vec4f_t * src, ne10_uint32_t count);
+    /**
+     * Multiplication of a constant 3x3 matrix and the 3D vectors of an input array.
+     * This function pointer can point to one of ne10_mulcmatvec_cm3x3f_v3f_c, ne10_mulcmatvec_cm3x3f_v3f_neon or ne10_mulcmatvec_cm3x3f_v3f_asm.
+     * The _neon prototype carries an asm label that sets its assembler-level symbol name.
+     * @param[out] dst   Pointer to the destination array
+     * @param[in]  cst   Pointer to the matrix to multiply the input values with
+     * @param[in]  src   Pointer to the source array
+     * @param[in]  count The number of items in the input array
+     */
+    extern ne10_result_t (*ne10_mulcmatvec_cm3x3f_v3f) (ne10_vec3f_t * dst, const ne10_mat3x3f_t * cst, ne10_vec3f_t * src, ne10_uint32_t count);
+    extern ne10_result_t ne10_mulcmatvec_cm3x3f_v3f_c (ne10_vec3f_t * dst, const ne10_mat3x3f_t * cst, ne10_vec3f_t * src, ne10_uint32_t count);
+    extern ne10_result_t ne10_mulcmatvec_cm3x3f_v3f_neon (ne10_vec3f_t * dst, const ne10_mat3x3f_t * cst, ne10_vec3f_t * src, ne10_uint32_t count) asm ("ne10_mulcmatvec_cm3x3f_v3f_neon");
+    extern ne10_result_t ne10_mulcmatvec_cm3x3f_v3f_asm (ne10_vec3f_t * dst, const ne10_mat3x3f_t * cst, ne10_vec3f_t * src, ne10_uint32_t count);
+    /**
+     * Multiplication of a constant 2x2 matrix and the 2D vectors of an input array.
+     * This function pointer can point to one of ne10_mulcmatvec_cm2x2f_v2f_c, ne10_mulcmatvec_cm2x2f_v2f_neon or ne10_mulcmatvec_cm2x2f_v2f_asm.
+     * The _neon prototype carries an asm label that sets its assembler-level symbol name.
+     * @param[out] dst   Pointer to the destination array
+     * @param[in]  cst   Pointer to the matrix to multiply the input values with
+     * @param[in]  src   Pointer to the source array
+     * @param[in]  count The number of items in the input array
+     */
+    extern ne10_result_t (*ne10_mulcmatvec_cm2x2f_v2f) (ne10_vec2f_t * dst, const ne10_mat2x2f_t * cst, ne10_vec2f_t * src, ne10_uint32_t count);
+    extern ne10_result_t ne10_mulcmatvec_cm2x2f_v2f_c (ne10_vec2f_t * dst, const ne10_mat2x2f_t * cst, ne10_vec2f_t * src, ne10_uint32_t count);
+    extern ne10_result_t ne10_mulcmatvec_cm2x2f_v2f_neon (ne10_vec2f_t * dst, const ne10_mat2x2f_t * cst, ne10_vec2f_t * src, ne10_uint32_t count) asm ("ne10_mulcmatvec_cm2x2f_v2f_neon");
+    extern ne10_result_t ne10_mulcmatvec_cm2x2f_v2f_asm (ne10_vec2f_t * dst, const ne10_mat2x2f_t * cst, ne10_vec2f_t * src, ne10_uint32_t count);
+
+    /** @} */ //end of Matrix Vector Multiply group
+
+    /**
+     * @ingroup groupMaths
+     */
+
+    /**
+     * @defgroup DIV_VEC Vector Div
+     *
+     * \par
+     * These functions implement the vector division operation for float data type.
+     */
+
+    /**
+     * @addtogroup DIV_VEC
+     * @{
+     */
+
+    /**
+     * Divides the elements of an input array by a constant scalar and stores the results in an output array.
+     * This function pointer can point to any one of ne10_divc_float_c, ne10_divc_float_neon or ne10_divc_float_asm.
+     * @param[out] dst   Pointer to the destination array
+     * @param[in]  src   Pointer to the source array
+     * @param[in]  cst   The constant scalar to divide the input values by
+     * @param[in]  count The number of items in the input array
+     */
+    extern ne10_result_t (*ne10_divc_float) (ne10_float32_t * dst, ne10_float32_t * src, const ne10_float32_t cst, ne10_uint32_t count);
+    extern ne10_result_t ne10_divc_float_c (ne10_float32_t * dst, ne10_float32_t * src, const ne10_float32_t cst, ne10_uint32_t count);
+    extern ne10_result_t ne10_divc_float_neon (ne10_float32_t * dst, ne10_float32_t * src, const ne10_float32_t cst, ne10_uint32_t count);
+    extern ne10_result_t ne10_divc_float_asm (ne10_float32_t * dst, ne10_float32_t * src, const ne10_float32_t cst, ne10_uint32_t count);
+    /**
+     * Divides the components of 2D vectors in an input array by the components of a constant 2D vector and stores the results in an output array.
+     * This function pointer can point to any one of ne10_divc_vec2f_c, ne10_divc_vec2f_neon or ne10_divc_vec2f_asm.
+     * @param[out] dst   Pointer to the destination array
+     * @param[in]  src   Pointer to the source array
+     * @param[in]  cst   Pointer to the 2D vector to divide the input values by
+     * @param[in]  count The number of items in the input array
+     */
+    extern ne10_result_t (*ne10_divc_vec2f) (ne10_vec2f_t * dst, ne10_vec2f_t * src, const ne10_vec2f_t * cst, ne10_uint32_t count);
+    extern ne10_result_t ne10_divc_vec2f_c (ne10_vec2f_t * dst, ne10_vec2f_t * src, const ne10_vec2f_t * cst, ne10_uint32_t count);
+    extern ne10_result_t ne10_divc_vec2f_neon (ne10_vec2f_t * dst, ne10_vec2f_t * src, const ne10_vec2f_t * cst, ne10_uint32_t count);
+    extern ne10_result_t ne10_divc_vec2f_asm (ne10_vec2f_t * dst, ne10_vec2f_t * src, const ne10_vec2f_t * cst, ne10_uint32_t count);
+    /**
+     * Divides the components of 3D vectors in an input array by the components of a constant 3D vector and stores the results in an output array.
+     * This function pointer can point to any one of ne10_divc_vec3f_c, ne10_divc_vec3f_neon or ne10_divc_vec3f_asm.
+     * @param[out] dst   Pointer to the destination array
+     * @param[in]  src   Pointer to the source array
+     * @param[in]  cst   Pointer to the 3D vector to divide the input values by
+     * @param[in]  count The number of items in the input array
+     */
+    extern ne10_result_t (*ne10_divc_vec3f) (ne10_vec3f_t * dst, ne10_vec3f_t * src, const ne10_vec3f_t * cst, ne10_uint32_t count);
+    extern ne10_result_t ne10_divc_vec3f_c (ne10_vec3f_t * dst, ne10_vec3f_t * src, const ne10_vec3f_t * cst, ne10_uint32_t count);
+    extern ne10_result_t ne10_divc_vec3f_neon (ne10_vec3f_t * dst, ne10_vec3f_t * src, const ne10_vec3f_t * cst, ne10_uint32_t count);
+    extern ne10_result_t ne10_divc_vec3f_asm (ne10_vec3f_t * dst, ne10_vec3f_t * src, const ne10_vec3f_t * cst, ne10_uint32_t count);
+    /**
+     * Divides the components of 4D vectors in an input array by the components of a constant 4D vector and stores the results in an output array.
+     * This function pointer can point to any one of ne10_divc_vec4f_c, ne10_divc_vec4f_neon or ne10_divc_vec4f_asm.
+     * @param[out] dst   Pointer to the destination array
+     * @param[in]  src   Pointer to the source array
+     * @param[in]  cst   Pointer to the 4D vector to divide the input values by
+     * @param[in]  count The number of items in the input array
+     */
+    extern ne10_result_t (*ne10_divc_vec4f) (ne10_vec4f_t * dst, ne10_vec4f_t * src, const ne10_vec4f_t * cst, ne10_uint32_t count);
+    extern ne10_result_t ne10_divc_vec4f_c (ne10_vec4f_t * dst, ne10_vec4f_t * src, const ne10_vec4f_t * cst, ne10_uint32_t count);
+    extern ne10_result_t ne10_divc_vec4f_neon (ne10_vec4f_t * dst, ne10_vec4f_t * src, const ne10_vec4f_t * cst, ne10_uint32_t count);
+    extern ne10_result_t ne10_divc_vec4f_asm (ne10_vec4f_t * dst, ne10_vec4f_t * src, const ne10_vec4f_t * cst, ne10_uint32_t count);
+    /**
+     * Divides the elements of src1 by the elements of src2 and stores the results in dst.
+     * This function pointer can point to any one of ne10_div_float_c, ne10_div_float_neon or ne10_div_float_asm.
+     * @param[out] dst   Pointer to the destination array
+     * @param[in]  src1  The first array to use as the input array
+     * @param[in]  src2  The second array to use as the input array
+     * @param[in]  count The number of items in the two input arrays
+     */
+    extern ne10_result_t (*ne10_div_float) (ne10_float32_t * dst, ne10_float32_t * src1, ne10_float32_t * src2, ne10_uint32_t count);
+    extern ne10_result_t ne10_div_float_c (ne10_float32_t * dst, ne10_float32_t * src1, ne10_float32_t * src2, ne10_uint32_t count);
+    extern ne10_result_t ne10_div_float_neon (ne10_float32_t * dst, ne10_float32_t * src1, ne10_float32_t * src2, ne10_uint32_t count) asm ("ne10_div_float_neon");
+    extern ne10_result_t ne10_div_float_asm (ne10_float32_t * dst, ne10_float32_t * src1, ne10_float32_t * src2, ne10_uint32_t count);
+    /**
+     * Divides the components of a 2D vector by the corresponding components of another.
+     * This function pointer can point to any one of ne10_vdiv_vec2f_c, ne10_vdiv_vec2f_neon or ne10_vdiv_vec2f_asm.
+     * @param[out] dst   Pointer to the destination array
+     * @param[in]  src1  Pointer to the numerators' source array
+     * @param[in]  src2  Pointer to the denominators' source array
+     * @param[in]  count The number of items in the input arrays
+     */
+    extern ne10_result_t (*ne10_vdiv_vec2f) (ne10_vec2f_t * dst, ne10_vec2f_t * src1, ne10_vec2f_t * src2, ne10_uint32_t count);
+    extern ne10_result_t ne10_vdiv_vec2f_c (ne10_vec2f_t * dst, ne10_vec2f_t * src1, ne10_vec2f_t * src2, ne10_uint32_t count);
+    extern ne10_result_t ne10_vdiv_vec2f_neon (ne10_vec2f_t * dst, ne10_vec2f_t * src1, ne10_vec2f_t * src2, ne10_uint32_t count) asm ("ne10_vdiv_vec2f_neon");
+    extern ne10_result_t ne10_vdiv_vec2f_asm (ne10_vec2f_t * dst, ne10_vec2f_t * src1, ne10_vec2f_t * src2, ne10_uint32_t count);
+    /**
+     * Divides the components of a 3D vector by the corresponding components of another.
+     * This function pointer can point to any one of ne10_vdiv_vec3f_c, ne10_vdiv_vec3f_neon or ne10_vdiv_vec3f_asm.
+     * @param[out] dst   Pointer to the destination array
+     * @param[in]  src1  Pointer to the numerators' source array
+     * @param[in]  src2  Pointer to the denominators' source array
+     * @param[in]  count The number of items in the input arrays
+     */
+    extern ne10_result_t (*ne10_vdiv_vec3f) (ne10_vec3f_t * dst, ne10_vec3f_t * src1, ne10_vec3f_t * src2, ne10_uint32_t count);
+    extern ne10_result_t ne10_vdiv_vec3f_c (ne10_vec3f_t * dst, ne10_vec3f_t * src1, ne10_vec3f_t * src2, ne10_uint32_t count);
+    extern ne10_result_t ne10_vdiv_vec3f_neon (ne10_vec3f_t * dst, ne10_vec3f_t * src1, ne10_vec3f_t * src2, ne10_uint32_t count) asm ("ne10_vdiv_vec3f_neon");
+    extern ne10_result_t ne10_vdiv_vec3f_asm (ne10_vec3f_t * dst, ne10_vec3f_t * src1, ne10_vec3f_t * src2, ne10_uint32_t count);
+    /**
+     * Divides the components of a 4D vector by the corresponding components of another.
+     * This function pointer can point to any one of ne10_vdiv_vec4f_c, ne10_vdiv_vec4f_neon or ne10_vdiv_vec4f_asm.
+     * @param[out] dst   Pointer to the destination array
+     * @param[in]  src1  Pointer to the numerators' source array
+     * @param[in]  src2  Pointer to the denominators' source array
+     * @param[in]  count The number of items in the input arrays
+     */
+    extern ne10_result_t (*ne10_vdiv_vec4f) (ne10_vec4f_t * dst, ne10_vec4f_t * src1, ne10_vec4f_t * src2, ne10_uint32_t count);
+    extern ne10_result_t ne10_vdiv_vec4f_c (ne10_vec4f_t * dst, ne10_vec4f_t * src1, ne10_vec4f_t * src2, ne10_uint32_t count);
+    extern ne10_result_t ne10_vdiv_vec4f_neon (ne10_vec4f_t * dst, ne10_vec4f_t * src1, ne10_vec4f_t * src2, ne10_uint32_t count) asm ("ne10_vdiv_vec4f_neon");
+    extern ne10_result_t ne10_vdiv_vec4f_asm (ne10_vec4f_t * dst, ne10_vec4f_t * src1, ne10_vec4f_t * src2, ne10_uint32_t count);
+    /** @} */ //end of Vector Div group
+
+    /**
+     * @ingroup groupMaths
+     */
+
+    /**
+     * @defgroup DIV_MAT Matrix Div
+     *
+     * \par
+     * These functions implement the matrix division operation for float data type.
+     */
+
+    /**
+     * @addtogroup DIV_MAT
+     * @{
+     */
+
+    /**
+     * Divides the components of a 4x4 matrix by the corresponding components of another.
+     * This function pointer can point to any one of ne10_divmat_4x4f_c, ne10_divmat_4x4f_neon or ne10_divmat_4x4f_asm.
+     * @param[out] dst   Pointer to the destination array
+     * @param[in]  src1  Pointer to the numerators' source array
+     * @param[in]  src2  Pointer to the denominators' source array
+     * @param[in]  count The number of items in the input arrays
+     */
+    extern ne10_result_t (*ne10_divmat_4x4f) (ne10_mat4x4f_t * dst, ne10_mat4x4f_t * src1, ne10_mat4x4f_t * src2, ne10_uint32_t count);
+    extern ne10_result_t ne10_divmat_4x4f_c (ne10_mat4x4f_t * dst, ne10_mat4x4f_t * src1, ne10_mat4x4f_t * src2, ne10_uint32_t count);
+    extern ne10_result_t ne10_divmat_4x4f_neon (ne10_mat4x4f_t * dst, ne10_mat4x4f_t * src1, ne10_mat4x4f_t * src2, ne10_uint32_t count) asm ("ne10_divmat_4x4f_neon");
+    extern ne10_result_t ne10_divmat_4x4f_asm (ne10_mat4x4f_t * dst, ne10_mat4x4f_t * src1, ne10_mat4x4f_t * src2, ne10_uint32_t count);
+    /**
+     * Divides the components of a 3x3 matrix by the corresponding components of another.
+     * This function pointer can point to any one of ne10_divmat_3x3f_c, ne10_divmat_3x3f_neon or ne10_divmat_3x3f_asm.
+     * @param[out] dst   Pointer to the destination array
+     * @param[in]  src1  Pointer to the numerators' source array
+     * @param[in]  src2  Pointer to the denominators' source array
+     * @param[in]  count The number of items in the input arrays
+     */
+    extern ne10_result_t (*ne10_divmat_3x3f) (ne10_mat3x3f_t * dst, ne10_mat3x3f_t * src1, ne10_mat3x3f_t * src2, ne10_uint32_t count);
+    extern ne10_result_t ne10_divmat_3x3f_c (ne10_mat3x3f_t * dst, ne10_mat3x3f_t * src1, ne10_mat3x3f_t * src2, ne10_uint32_t count);
+    extern ne10_result_t ne10_divmat_3x3f_neon (ne10_mat3x3f_t * dst, ne10_mat3x3f_t * src1, ne10_mat3x3f_t * src2, ne10_uint32_t count) asm ("ne10_divmat_3x3f_neon");
+    extern ne10_result_t ne10_divmat_3x3f_asm (ne10_mat3x3f_t * dst, ne10_mat3x3f_t * src1, ne10_mat3x3f_t * src2, ne10_uint32_t count);
+    /**
+     * Divides the components of a 2x2 matrix by the corresponding components of another.
+     * This function pointer can point to any one of ne10_divmat_2x2f_c, ne10_divmat_2x2f_neon or ne10_divmat_2x2f_asm.
+     * @param[out] dst   Pointer to the destination array
+     * @param[in]  src1  Pointer to the numerators' source array
+     * @param[in]  src2  Pointer to the denominators' source array
+     * @param[in]  count The number of items in the input arrays
+     */
+    extern ne10_result_t (*ne10_divmat_2x2f) (ne10_mat2x2f_t * dst, ne10_mat2x2f_t * src1, ne10_mat2x2f_t * src2, ne10_uint32_t count);
+    extern ne10_result_t ne10_divmat_2x2f_c (ne10_mat2x2f_t * dst, ne10_mat2x2f_t * src1, ne10_mat2x2f_t * src2, ne10_uint32_t count);
+    extern ne10_result_t ne10_divmat_2x2f_neon (ne10_mat2x2f_t * dst, ne10_mat2x2f_t * src1, ne10_mat2x2f_t * src2, ne10_uint32_t count) asm ("ne10_divmat_2x2f_neon");
+    extern ne10_result_t ne10_divmat_2x2f_asm (ne10_mat2x2f_t * dst, ne10_mat2x2f_t * src1, ne10_mat2x2f_t * src2, ne10_uint32_t count);
+    /** @} */ //end of Matrix Div group
+
+    /**
+     * @ingroup groupMaths
+     */
+
+    /**
+     * @defgroup SETC_VEC Vector Setc
+     *
+     * \par
+     * These functions implement vector setc operation for float data type.
+     */
+
+    /**
+     * @addtogroup SETC_VEC
+     * @{
+     */
+
+    /**
+     * Sets every element of the destination array to a constant scalar.
+     * This function pointer can point to any one of ne10_setc_float_c, ne10_setc_float_neon or ne10_setc_float_asm.
+     * @param[out] dst   Pointer to the destination array
+     * @param[in]  cst   The constant scalar to set the elements to
+     * @param[in]  count The number of items in the destination array
+     */
+    extern ne10_result_t (*ne10_setc_float) (ne10_float32_t * dst, const ne10_float32_t cst, ne10_uint32_t count);
+    extern ne10_result_t ne10_setc_float_c (ne10_float32_t * dst, const ne10_float32_t cst, ne10_uint32_t count);
+    extern ne10_result_t ne10_setc_float_neon (ne10_float32_t * dst, const ne10_float32_t cst, ne10_uint32_t count);
+    extern ne10_result_t ne10_setc_float_asm (ne10_float32_t * dst, const ne10_float32_t cst, ne10_uint32_t count);
+    /**
+     * Sets every 2D vector in the destination array to the components of a constant 2D vector.
+     * This function pointer can point to any one of ne10_setc_vec2f_c, ne10_setc_vec2f_neon or ne10_setc_vec2f_asm.
+     * @param[out] dst   Pointer to the destination array
+     * @param[in]  cst   Pointer to the 2D vector to set the elements to
+     * @param[in]  count The number of items in the destination array
+     */
+    extern ne10_result_t (*ne10_setc_vec2f) (ne10_vec2f_t * dst, const ne10_vec2f_t * cst, ne10_uint32_t count);
+    extern ne10_result_t ne10_setc_vec2f_c (ne10_vec2f_t * dst, const ne10_vec2f_t * cst, ne10_uint32_t count);
+    extern ne10_result_t ne10_setc_vec2f_neon (ne10_vec2f_t * dst, const ne10_vec2f_t * cst, ne10_uint32_t count);
+    extern ne10_result_t ne10_setc_vec2f_asm (ne10_vec2f_t * dst, const ne10_vec2f_t * cst, ne10_uint32_t count);
+    /**
+     * Sets every 3D vector in the destination array to the components of a constant 3D vector.
+     * This function pointer can point to any one of ne10_setc_vec3f_c, ne10_setc_vec3f_neon or ne10_setc_vec3f_asm.
+     * @param[out] dst   Pointer to the destination array
+     * @param[in]  cst   Pointer to the 3D vector to set the elements to
+     * @param[in]  count The number of items in the destination array
+     */
+    extern ne10_result_t (*ne10_setc_vec3f) (ne10_vec3f_t * dst, const ne10_vec3f_t * cst, ne10_uint32_t count);
+    extern ne10_result_t ne10_setc_vec3f_c (ne10_vec3f_t * dst, const ne10_vec3f_t * cst, ne10_uint32_t count);
+    extern ne10_result_t ne10_setc_vec3f_neon (ne10_vec3f_t * dst, const ne10_vec3f_t * cst, ne10_uint32_t count);
+    extern ne10_result_t ne10_setc_vec3f_asm (ne10_vec3f_t * dst, const ne10_vec3f_t * cst, ne10_uint32_t count);
+    /**
+     * Sets every 4D vector in the destination array to the components of a constant 4D vector.
+     * This function pointer can point to any one of ne10_setc_vec4f_c, ne10_setc_vec4f_neon or ne10_setc_vec4f_asm.
+     * @param[out] dst   Pointer to the destination array
+     * @param[in]  cst   Pointer to the 4D vector to set the elements to
+     * @param[in]  count The number of items in the destination array
+     */
+    extern ne10_result_t (*ne10_setc_vec4f) (ne10_vec4f_t * dst, const ne10_vec4f_t * cst, ne10_uint32_t count);
+    extern ne10_result_t ne10_setc_vec4f_c (ne10_vec4f_t * dst, const ne10_vec4f_t * cst, ne10_uint32_t count);
+    extern ne10_result_t ne10_setc_vec4f_neon (ne10_vec4f_t * dst, const ne10_vec4f_t * cst, ne10_uint32_t count);
+    extern ne10_result_t ne10_setc_vec4f_asm (ne10_vec4f_t * dst, const ne10_vec4f_t * cst, ne10_uint32_t count);
+    /** @} */ //end of Vector Setc group
+
+    /**
+     * @ingroup groupMaths
+     */
+
+    /**
+     * @defgroup LEN_VEC Vector Len
+     *
+     * \par
+     * These functions implement vector len operation for float data type.
+     */
+
+    /**
+     * @addtogroup LEN_VEC
+     * @{
+     */
+    /**
+     * Computes the length of each 2D vector in the input array and stores it in the corresponding element of the output array.
+     * This function pointer can point to any one of ne10_len_vec2f_c, ne10_len_vec2f_neon or ne10_len_vec2f_asm.
+     * @param[out] dst   Pointer to the destination array
+     * @param[in]  src   Pointer to the source array
+     * @param[in]  count The number of items in the input array
+     */
+    extern ne10_result_t (*ne10_len_vec2f) (ne10_float32_t * dst, ne10_vec2f_t * src, ne10_uint32_t count);
+    extern ne10_result_t ne10_len_vec2f_c (ne10_float32_t * dst, ne10_vec2f_t * src, ne10_uint32_t count);
+    extern ne10_result_t ne10_len_vec2f_neon (ne10_float32_t * dst, ne10_vec2f_t * src, ne10_uint32_t count) asm ("ne10_len_vec2f_neon");
+    extern ne10_result_t ne10_len_vec2f_asm (ne10_float32_t * dst, ne10_vec2f_t * src, ne10_uint32_t count);
+    /**
+     * Computes the length of each 3D vector in the input array and stores it in the corresponding element of the output array.
+     * This function pointer can point to any one of ne10_len_vec3f_c, ne10_len_vec3f_neon or ne10_len_vec3f_asm.
+     * @param[out] dst   Pointer to the destination array
+     * @param[in]  src   Pointer to the source array
+     * @param[in]  count The number of items in the input array
+     */
+    extern ne10_result_t (*ne10_len_vec3f) (ne10_float32_t * dst, ne10_vec3f_t * src, ne10_uint32_t count);
+    extern ne10_result_t ne10_len_vec3f_c (ne10_float32_t * dst, ne10_vec3f_t * src, ne10_uint32_t count);
+    extern ne10_result_t ne10_len_vec3f_neon (ne10_float32_t * dst, ne10_vec3f_t * src, ne10_uint32_t count) asm ("ne10_len_vec3f_neon");
+    extern ne10_result_t ne10_len_vec3f_asm (ne10_float32_t * dst, ne10_vec3f_t * src, ne10_uint32_t count);
+    /**
+     * Computes the length of each 4D vector in the input array and stores it in the corresponding element of the output array.
+     * This function pointer can point to any one of ne10_len_vec4f_c, ne10_len_vec4f_neon or ne10_len_vec4f_asm.
+     * @param[out] dst   Pointer to the destination array
+     * @param[in]  src   Pointer to the source array
+     * @param[in]  count The number of items in the input array
+     */
+    extern ne10_result_t (*ne10_len_vec4f) (ne10_float32_t * dst, ne10_vec4f_t * src, ne10_uint32_t count);
+    extern ne10_result_t ne10_len_vec4f_c (ne10_float32_t * dst, ne10_vec4f_t * src, ne10_uint32_t count);
+    extern ne10_result_t ne10_len_vec4f_neon (ne10_float32_t * dst, ne10_vec4f_t * src, ne10_uint32_t count) asm ("ne10_len_vec4f_neon");
+    extern ne10_result_t ne10_len_vec4f_asm (ne10_float32_t * dst, ne10_vec4f_t * src, ne10_uint32_t count);
+    /** @} */ //end of Vector Len group
+
+
+    /**
+     * @ingroup groupMaths
+     */
+
+    /**
+     * @defgroup NORM_VEC Vector Normalize
+     *
+     * \par
+     * These functions implement vector normalize operation for float data type.
+     */
+
+    /**
+     * @addtogroup NORM_VEC
+     * @{
+     */
+    /**
+     * Normalizes 2D vectors of the input array and stores them in the corresponding elements of the output array.
+     * This function pointer can point to any one of ne10_normalize_vec2f_c, ne10_normalize_vec2f_neon or ne10_normalize_vec2f_asm.
+     * @param[out] dst   Pointer to the destination array
+     * @param[in]  src   Pointer to the source array
+     * @param[in]  count The number of items in the input array
+     */
+    extern ne10_result_t (*ne10_normalize_vec2f) (ne10_vec2f_t * dst, ne10_vec2f_t * src, ne10_uint32_t count);
+    extern ne10_result_t ne10_normalize_vec2f_c (ne10_vec2f_t * dst, ne10_vec2f_t * src, ne10_uint32_t count);
+    extern ne10_result_t ne10_normalize_vec2f_neon (ne10_vec2f_t * dst, ne10_vec2f_t * src, ne10_uint32_t count) asm ("ne10_normalize_vec2f_neon");
+    extern ne10_result_t ne10_normalize_vec2f_asm (ne10_vec2f_t * dst, ne10_vec2f_t * src, ne10_uint32_t count);
+    /**
+     * Normalizes 3D vectors of the input array and stores them in the corresponding elements of the output array.
+     * This function pointer can point to any one of ne10_normalize_vec3f_c, ne10_normalize_vec3f_neon or ne10_normalize_vec3f_asm.
+     * @param[out] dst   Pointer to the destination array
+     * @param[in]  src   Pointer to the source array
+     * @param[in]  count The number of items in the input array
+     */
+    extern ne10_result_t (*ne10_normalize_vec3f) (ne10_vec3f_t * dst, ne10_vec3f_t * src, ne10_uint32_t count);
+    extern ne10_result_t ne10_normalize_vec3f_c (ne10_vec3f_t * dst, ne10_vec3f_t * src, ne10_uint32_t count);
+    extern ne10_result_t ne10_normalize_vec3f_neon (ne10_vec3f_t * dst, ne10_vec3f_t * src, ne10_uint32_t count) asm ("ne10_normalize_vec3f_neon");
+    extern ne10_result_t ne10_normalize_vec3f_asm (ne10_vec3f_t * dst, ne10_vec3f_t * src, ne10_uint32_t count);
+    /**
+     * Normalizes 4D vectors of the input array and stores them in the corresponding elements of the output array.
+     * This function pointer can point to any one of ne10_normalize_vec4f_c, ne10_normalize_vec4f_neon or ne10_normalize_vec4f_asm.
+     * @param[out] dst   Pointer to the destination array
+     * @param[in]  src   Pointer to the source array
+     * @param[in]  count The number of items in the input array
+     */
+    extern ne10_result_t (*ne10_normalize_vec4f) (ne10_vec4f_t * dst, ne10_vec4f_t * src, ne10_uint32_t count);
+    extern ne10_result_t ne10_normalize_vec4f_c (ne10_vec4f_t * dst, ne10_vec4f_t * src, ne10_uint32_t count);
+    extern ne10_result_t ne10_normalize_vec4f_neon (ne10_vec4f_t * dst, ne10_vec4f_t * src, ne10_uint32_t count) asm ("ne10_normalize_vec4f_neon");
+    extern ne10_result_t ne10_normalize_vec4f_asm (ne10_vec4f_t * dst, ne10_vec4f_t * src, ne10_uint32_t count);
+    /** @} */ //end of Vector Normalize group
+
+
+    /**
+     * @ingroup groupMaths
+     */
+
+    /**
+     * @defgroup ABS_VEC Vector Abs
+     *
+     * \par
+     * These functions implement vector abs operation for float data type.
+     */
+
+    /**
+     * @addtogroup ABS_VEC
+     * @{
+     */
+
+    /**
+     * Calculates the absolute value of each element in the source array and stores the result in the corresponding entry of the destination array.
+     * This function pointer can point to any one of ne10_abs_float_c, ne10_abs_float_neon or ne10_abs_float_asm.
+     * @param[out] dst   Pointer to the destination array
+     * @param[in]  src   Pointer to the source array
+     * @param[in]  count The number of items in the input array
+     */
+    extern ne10_result_t (*ne10_abs_float) (ne10_float32_t * dst, ne10_float32_t * src, ne10_uint32_t count);
+    extern ne10_result_t ne10_abs_float_c (ne10_float32_t * dst, ne10_float32_t * src, ne10_uint32_t count);
+    extern ne10_result_t ne10_abs_float_neon (ne10_float32_t * dst, ne10_float32_t * src, ne10_uint32_t count) asm ("ne10_abs_float_neon");
+    extern ne10_result_t ne10_abs_float_asm (ne10_float32_t * dst, ne10_float32_t * src, ne10_uint32_t count);
+    /**
+     * Generates a 2D vector from the absolute values of each of the components of an input vector.
+     * This function pointer can point to any one of ne10_abs_vec2f_c, ne10_abs_vec2f_neon or ne10_abs_vec2f_asm.
+     * @param[out] dst   Pointer to the destination array
+     * @param[in]  src   Pointer to the source array
+     * @param[in]  count The number of items in the input array
+     */
+    extern ne10_result_t (*ne10_abs_vec2f) (ne10_vec2f_t * dst, ne10_vec2f_t * src, ne10_uint32_t count);
+    extern ne10_result_t ne10_abs_vec2f_c (ne10_vec2f_t * dst, ne10_vec2f_t * src, ne10_uint32_t count);
+    extern ne10_result_t ne10_abs_vec2f_neon (ne10_vec2f_t * dst, ne10_vec2f_t * src, ne10_uint32_t count) asm ("ne10_abs_vec2f_neon");
+    extern ne10_result_t ne10_abs_vec2f_asm (ne10_vec2f_t * dst, ne10_vec2f_t * src, ne10_uint32_t count);
+    /**
+     * Generates a 3D vector from the absolute values of each of the components of an input vector.
+     * This function pointer can point to any one of ne10_abs_vec3f_c, ne10_abs_vec3f_neon or ne10_abs_vec3f_asm.
+     * @param[out] dst   Pointer to the destination array
+     * @param[in]  src   Pointer to the source array
+     * @param[in]  count The number of items in the input array
+     */
+    extern ne10_result_t (*ne10_abs_vec3f) (ne10_vec3f_t * dst, ne10_vec3f_t * src, ne10_uint32_t count);
+    extern ne10_result_t ne10_abs_vec3f_c (ne10_vec3f_t * dst, ne10_vec3f_t * src, ne10_uint32_t count);
+    extern ne10_result_t ne10_abs_vec3f_neon (ne10_vec3f_t * dst, ne10_vec3f_t * src, ne10_uint32_t count) asm ("ne10_abs_vec3f_neon");
+    extern ne10_result_t ne10_abs_vec3f_asm (ne10_vec3f_t * dst, ne10_vec3f_t * src, ne10_uint32_t count);
+    /**
+     * Generates a 4D vector from the absolute values of each of the components of an input vector.
+     * This function pointer can point to any one of ne10_abs_vec4f_c, ne10_abs_vec4f_neon or ne10_abs_vec4f_asm.
+     * @param[out] dst   Pointer to the destination array
+     * @param[in]  src   Pointer to the source array
+     * @param[in]  count The number of items in the input array
+     */
+    extern ne10_result_t (*ne10_abs_vec4f) (ne10_vec4f_t * dst, ne10_vec4f_t * src, ne10_uint32_t count);
+    extern ne10_result_t ne10_abs_vec4f_c (ne10_vec4f_t * dst, ne10_vec4f_t * src, ne10_uint32_t count);
+    extern ne10_result_t ne10_abs_vec4f_neon (ne10_vec4f_t * dst, ne10_vec4f_t * src, ne10_uint32_t count) asm ("ne10_abs_vec4f_neon");
+    extern ne10_result_t ne10_abs_vec4f_asm (ne10_vec4f_t * dst, ne10_vec4f_t * src, ne10_uint32_t count);
+    /** @} */ //end of Vector Abs group
+
+    /**
+     * @ingroup groupMaths
+     */
+
+    /**
+     * @defgroup DOT_VEC Vector Dot
+     *
+     * \par
+     * These functions implement vector dot operation for float data type.
+     */
+
+    /**
+     * @addtogroup DOT_VEC
+     * @{
+     */
+    /**
+     * Dot product of two 2D vectors.
+     * This function pointer can point to any one of ne10_dot_vec2f_c, ne10_dot_vec2f_neon or ne10_dot_vec2f_asm.
+     * @param[out] dst   Pointer to the destination array
+     * @param[in]  src1   Pointer to the first source array
+     * @param[in]  src2   Pointer to the second source array
+     * @param[in]  count The number of items in the input arrays
+     */
+    extern ne10_result_t (*ne10_dot_vec2f) (ne10_float32_t * dst, ne10_vec2f_t * src1, ne10_vec2f_t * src2, ne10_uint32_t count);
+    extern ne10_result_t ne10_dot_vec2f_c (ne10_float32_t * dst, ne10_vec2f_t * src1, ne10_vec2f_t * src2, ne10_uint32_t count);
+    extern ne10_result_t ne10_dot_vec2f_neon (ne10_float32_t * dst, ne10_vec2f_t * src1, ne10_vec2f_t * src2, ne10_uint32_t count) asm ("ne10_dot_vec2f_neon");
+    extern ne10_result_t ne10_dot_vec2f_asm (ne10_float32_t * dst, ne10_vec2f_t * src1, ne10_vec2f_t * src2, ne10_uint32_t count);
+    /**
+     * Dot product of two 3D vectors.
+     * This function pointer can point to any one of ne10_dot_vec3f_c, ne10_dot_vec3f_neon or ne10_dot_vec3f_asm.
+     * @param[out] dst   Pointer to the destination array
+     * @param[in]  src1   Pointer to the first source array
+     * @param[in]  src2   Pointer to the second source array
+     * @param[in]  count The number of items in the input arrays
+     */
+    extern ne10_result_t (*ne10_dot_vec3f) (ne10_float32_t * dst, ne10_vec3f_t * src1, ne10_vec3f_t * src2, ne10_uint32_t count);
+    extern ne10_result_t ne10_dot_vec3f_c (ne10_float32_t * dst, ne10_vec3f_t * src1, ne10_vec3f_t * src2, ne10_uint32_t count);
+    extern ne10_result_t ne10_dot_vec3f_neon (ne10_float32_t * dst, ne10_vec3f_t * src1, ne10_vec3f_t * src2, ne10_uint32_t count) asm ("ne10_dot_vec3f_neon");
+    extern ne10_result_t ne10_dot_vec3f_asm (ne10_float32_t * dst, ne10_vec3f_t * src1, ne10_vec3f_t * src2, ne10_uint32_t count);
+    /**
+     * Dot product of two 4D vectors.
+     * This function pointer can point to any one of ne10_dot_vec4f_c, ne10_dot_vec4f_neon or ne10_dot_vec4f_asm.
+     * @param[out] dst   Pointer to the destination array
+     * @param[in]  src1   Pointer to the first source array
+     * @param[in]  src2   Pointer to the second source array
+     * @param[in]  count The number of items in the input arrays
+     */
+    extern ne10_result_t (*ne10_dot_vec4f) (ne10_float32_t * dst, ne10_vec4f_t * src1, ne10_vec4f_t * src2, ne10_uint32_t count);
+    extern ne10_result_t ne10_dot_vec4f_c (ne10_float32_t * dst, ne10_vec4f_t * src1, ne10_vec4f_t * src2, ne10_uint32_t count);
+    extern ne10_result_t ne10_dot_vec4f_neon (ne10_float32_t * dst, ne10_vec4f_t * src1, ne10_vec4f_t * src2, ne10_uint32_t count) asm ("ne10_dot_vec4f_neon");
+    extern ne10_result_t ne10_dot_vec4f_asm (ne10_float32_t * dst, ne10_vec4f_t * src1, ne10_vec4f_t * src2, ne10_uint32_t count);
+    /** @} */ //end of Vector Dot group
+
+
+    /**
+     * @ingroup groupMaths
+     */
+
+    /**
+     * @defgroup CROSS_VEC Vector Cross
+     *
+     * \par
+     * These functions implement vector cross operation for float data type.
+     */
+
+    /**
+     * @addtogroup CROSS_VEC
+     * @{
+     */
+
+    /**
+     * Performs a cross product operation on the two input vectors.
+     * This function pointer can point to any one of ne10_cross_vec3f_c, ne10_cross_vec3f_neon or ne10_cross_vec3f_asm.
+     * @param[out] dst   Pointer to the destination array
+     * @param[in]  src1   Pointer to the first source array
+     * @param[in]  src2   Pointer to the second source array
+     * @param[in]  count The number of items in the input arrays
+     */
+    extern ne10_result_t (*ne10_cross_vec3f) (ne10_vec3f_t * dst, ne10_vec3f_t * src1, ne10_vec3f_t * src2, ne10_uint32_t count);
+    extern ne10_result_t ne10_cross_vec3f_c (ne10_vec3f_t * dst, ne10_vec3f_t * src1, ne10_vec3f_t * src2, ne10_uint32_t count);
+    extern ne10_result_t ne10_cross_vec3f_neon (ne10_vec3f_t * dst, ne10_vec3f_t * src1, ne10_vec3f_t * src2, ne10_uint32_t count) asm ("ne10_cross_vec3f_neon");
+    extern ne10_result_t ne10_cross_vec3f_asm (ne10_vec3f_t * dst, ne10_vec3f_t * src1, ne10_vec3f_t * src2, ne10_uint32_t count);
+    /** @} */ //end of Vector Cross group
+
+    /**
+     * @ingroup groupMaths
+     */
+
+    /**
+     * @defgroup DET_MAT Matrix Determinant
+     *
+     * \par
+     * These functions implement matrix determinant operation for float data type.
+     */
+
+    /**
+     * @addtogroup DET_MAT
+     * @{
+     */
+
+    /**
+     * Calculate the determinant of a 4x4 matrix.
+     * This function pointer can point to any one of ne10_detmat_4x4f_c, ne10_detmat_4x4f_neon or ne10_detmat_4x4f_asm.
+     * @param[out] dst   Pointer to the destination array
+     * @param[in]  src   Pointer to the source array
+     * @param[in]  count The number of items in the input array
+     */
+    extern ne10_result_t (*ne10_detmat_4x4f) (ne10_float32_t * dst, ne10_mat4x4f_t * src, ne10_uint32_t count);
+    extern ne10_result_t ne10_detmat_4x4f_c (ne10_float32_t * dst, ne10_mat4x4f_t * src, ne10_uint32_t count);
+    extern ne10_result_t ne10_detmat_4x4f_neon (ne10_float32_t * dst, ne10_mat4x4f_t * src, ne10_uint32_t count) asm ("ne10_detmat_4x4f_neon");
+    extern ne10_result_t ne10_detmat_4x4f_asm (ne10_float32_t * dst, ne10_mat4x4f_t * src, ne10_uint32_t count);
+    /**
+     * Calculate the determinant of a 3x3 matrix.
+     * This function pointer can point to any one of ne10_detmat_3x3f_c, ne10_detmat_3x3f_neon or ne10_detmat_3x3f_asm.
+     * @param[out] dst   Pointer to the destination array
+     * @param[in]  src   Pointer to the source array
+     * @param[in]  count The number of items in the input array
+     */
+    extern ne10_result_t (*ne10_detmat_3x3f) (ne10_float32_t * dst, ne10_mat3x3f_t * src, ne10_uint32_t count);
+    extern ne10_result_t ne10_detmat_3x3f_c (ne10_float32_t * dst, ne10_mat3x3f_t * src, ne10_uint32_t count);
+    extern ne10_result_t ne10_detmat_3x3f_neon (ne10_float32_t * dst, ne10_mat3x3f_t * src, ne10_uint32_t count) asm ("ne10_detmat_3x3f_neon");
+    extern ne10_result_t ne10_detmat_3x3f_asm (ne10_float32_t * dst, ne10_mat3x3f_t * src, ne10_uint32_t count);
+    /**
+     * Calculate the determinant of a 2x2 matrix.
+     * This function pointer can be pointed at one of ne10_detmat_2x2f_c, ne10_detmat_2x2f_neon and ne10_detmat_2x2f_asm.
+     * @param[out] dst   Pointer to the destination array (ne10_float32_t results)
+     * @param[in]  src   Pointer to the source array (ne10_mat2x2f_t matrices)
+     * @param[in]  count The number of items in the input array
+     */
+    extern ne10_result_t (*ne10_detmat_2x2f) (ne10_float32_t * dst, ne10_mat2x2f_t * src, ne10_uint32_t count);
+    extern ne10_result_t ne10_detmat_2x2f_c (ne10_float32_t * dst, ne10_mat2x2f_t * src, ne10_uint32_t count);
+    extern ne10_result_t ne10_detmat_2x2f_neon (ne10_float32_t * dst, ne10_mat2x2f_t * src, ne10_uint32_t count) asm ("ne10_detmat_2x2f_neon");
+    extern ne10_result_t ne10_detmat_2x2f_asm (ne10_float32_t * dst, ne10_mat2x2f_t * src, ne10_uint32_t count);
+    /** @} */ //end of Matrix Determinant group
+
+    /**
+     * @ingroup groupMaths
+     */
+
+    /**
+     * @defgroup INV_MAT Matrix Invertible
+     *
+     * \par
+     * These functions implement the matrix inversion operation for the float data type.
+     */
+
+    /**
+     * @addtogroup INV_MAT
+     * @{
+     */
+    /**
+     * Calculate the inverse of a 4x4 matrix.
+     * This function pointer can be pointed at one of ne10_invmat_4x4f_c, ne10_invmat_4x4f_neon and ne10_invmat_4x4f_asm.
+     * @param[out] dst   Pointer to the destination array
+     * @param[in]  src   Pointer to the source array
+     * @param[in]  count The number of items in the input array
+     */
+    extern ne10_result_t (*ne10_invmat_4x4f) (ne10_mat4x4f_t * dst, ne10_mat4x4f_t * src, ne10_uint32_t count);
+    extern ne10_result_t ne10_invmat_4x4f_c (ne10_mat4x4f_t * dst, ne10_mat4x4f_t * src, ne10_uint32_t count);
+    extern ne10_result_t ne10_invmat_4x4f_neon (ne10_mat4x4f_t * dst, ne10_mat4x4f_t * src, ne10_uint32_t count) asm ("ne10_invmat_4x4f_neon");
+    extern ne10_result_t ne10_invmat_4x4f_asm (ne10_mat4x4f_t * dst, ne10_mat4x4f_t * src, ne10_uint32_t count);
+    /**
+     * Calculate the inverse of a 3x3 matrix.
+     * This function pointer can be pointed at one of ne10_invmat_3x3f_c, ne10_invmat_3x3f_neon and ne10_invmat_3x3f_asm.
+     * @param[out] dst   Pointer to the destination array
+     * @param[in]  src   Pointer to the source array
+     * @param[in]  count The number of items in the input array
+     */
+    extern ne10_result_t (*ne10_invmat_3x3f) (ne10_mat3x3f_t * dst, ne10_mat3x3f_t * src, ne10_uint32_t count);
+    extern ne10_result_t ne10_invmat_3x3f_c (ne10_mat3x3f_t * dst, ne10_mat3x3f_t * src, ne10_uint32_t count);
+    extern ne10_result_t ne10_invmat_3x3f_neon (ne10_mat3x3f_t * dst, ne10_mat3x3f_t * src, ne10_uint32_t count) asm ("ne10_invmat_3x3f_neon");
+    extern ne10_result_t ne10_invmat_3x3f_asm (ne10_mat3x3f_t * dst, ne10_mat3x3f_t * src, ne10_uint32_t count);
+    /**
+     * Calculate the inverse of a 2x2 matrix.
+     * This function pointer can be pointed at one of ne10_invmat_2x2f_c, ne10_invmat_2x2f_neon and ne10_invmat_2x2f_asm.
+     * @param[out] dst   Pointer to the destination array
+     * @param[in]  src   Pointer to the source array
+     * @param[in]  count The number of items in the input array
+     */
+    extern ne10_result_t (*ne10_invmat_2x2f) (ne10_mat2x2f_t * dst, ne10_mat2x2f_t * src, ne10_uint32_t count);
+    extern ne10_result_t ne10_invmat_2x2f_c (ne10_mat2x2f_t * dst, ne10_mat2x2f_t * src, ne10_uint32_t count);
+    extern ne10_result_t ne10_invmat_2x2f_neon (ne10_mat2x2f_t * dst, ne10_mat2x2f_t * src, ne10_uint32_t count) asm ("ne10_invmat_2x2f_neon");
+    extern ne10_result_t ne10_invmat_2x2f_asm (ne10_mat2x2f_t * dst, ne10_mat2x2f_t * src, ne10_uint32_t count);
+    /** @} */ //end of Matrix Invertible group
+
+    /**
+     * @ingroup groupMaths
+     */
+
+    /**
+     * @defgroup TRANS_MAT Matrix Transpose
+     *
+     * \par
+     * These functions implement matrix transpose operation for float data type.
+     */
+
+    /**
+     * @addtogroup TRANS_MAT
+     * @{
+     */
+    /**
+     * Calculate the transpose matrix of a 4x4 matrix.
+     * This function pointer can be pointed at one of ne10_transmat_4x4f_c, ne10_transmat_4x4f_neon and ne10_transmat_4x4f_asm.
+     * @param[out] dst   Pointer to the destination array
+     * @param[in]  src   Pointer to the source array
+     * @param[in]  count The number of items in the input array
+     */
+    extern ne10_result_t (*ne10_transmat_4x4f) (ne10_mat4x4f_t * dst, ne10_mat4x4f_t * src, ne10_uint32_t count);
+    extern ne10_result_t ne10_transmat_4x4f_c (ne10_mat4x4f_t * dst, ne10_mat4x4f_t * src, ne10_uint32_t count);
+    extern ne10_result_t ne10_transmat_4x4f_neon (ne10_mat4x4f_t * dst, ne10_mat4x4f_t * src, ne10_uint32_t count) asm ("ne10_transmat_4x4f_neon");
+    extern ne10_result_t ne10_transmat_4x4f_asm (ne10_mat4x4f_t * dst, ne10_mat4x4f_t * src, ne10_uint32_t count);
+    /**
+     * Calculate the transpose matrix of a 3x3 matrix.
+     * This function pointer can be pointed at one of ne10_transmat_3x3f_c, ne10_transmat_3x3f_neon and ne10_transmat_3x3f_asm.
+     * @param[out] dst   Pointer to the destination array
+     * @param[in]  src   Pointer to the source array
+     * @param[in]  count The number of items in the input array
+     */
+    extern ne10_result_t (*ne10_transmat_3x3f) (ne10_mat3x3f_t * dst, ne10_mat3x3f_t * src, ne10_uint32_t count);
+    extern ne10_result_t ne10_transmat_3x3f_c (ne10_mat3x3f_t * dst, ne10_mat3x3f_t * src, ne10_uint32_t count);
+    extern ne10_result_t ne10_transmat_3x3f_neon (ne10_mat3x3f_t * dst, ne10_mat3x3f_t * src, ne10_uint32_t count) asm ("ne10_transmat_3x3f_neon");
+    extern ne10_result_t ne10_transmat_3x3f_asm (ne10_mat3x3f_t * dst, ne10_mat3x3f_t * src, ne10_uint32_t count);
+    /**
+     * Calculate the transpose matrix of a 2x2 matrix.
+     * This function pointer can be pointed at one of ne10_transmat_2x2f_c, ne10_transmat_2x2f_neon and ne10_trans_mat2x2f_asm (the asm variant's name is spelled differently in its declaration below).
+     * @param[out] dst   Pointer to the destination array
+     * @param[in]  src   Pointer to the source array
+     * @param[in]  count The number of items in the input array
+     */
+    extern ne10_result_t (*ne10_transmat_2x2f) (ne10_mat2x2f_t * dst, ne10_mat2x2f_t * src, ne10_uint32_t count);
+    extern ne10_result_t ne10_transmat_2x2f_c (ne10_mat2x2f_t * dst, ne10_mat2x2f_t * src, ne10_uint32_t count);
+    extern ne10_result_t ne10_transmat_2x2f_neon (ne10_mat2x2f_t * dst, ne10_mat2x2f_t * src, ne10_uint32_t count) asm ("ne10_transmat_2x2f_neon");
+    extern ne10_result_t ne10_trans_mat2x2f_asm (ne10_mat2x2f_t * dst, ne10_mat2x2f_t * src, ne10_uint32_t count);
+    /** @} */ //end of Matrix Transpose group
+
+    /**
+     * @ingroup groupMaths
+     */
+
+    /**
+     * @defgroup IDENTITY_MAT Matrix Identity
+     *
+     * \par
+     * These functions implement matrix identity operation for float data type.
+     */
+
+    /**
+     * @addtogroup IDENTITY_MAT
+     * @{
+     */
+    /**
+     * Set the identity matrix of a 4x4 matrix.
+     * This function pointer can be pointed at one of ne10_identitymat_4x4f_c, ne10_identitymat_4x4f_neon and ne10_identitymat_4x4f_asm.
+     * @param[out] dst   Pointer to the destination array
+     * @param[in]  count The number of items in the dst array
+     */
+    extern ne10_result_t (*ne10_identitymat_4x4f) (ne10_mat4x4f_t * dst, ne10_uint32_t count);
+    extern ne10_result_t ne10_identitymat_4x4f_c (ne10_mat4x4f_t * dst, ne10_uint32_t count);
+    extern ne10_result_t ne10_identitymat_4x4f_neon (ne10_mat4x4f_t * dst, ne10_uint32_t count) asm ("ne10_identitymat_4x4f_neon");
+    extern ne10_result_t ne10_identitymat_4x4f_asm (ne10_mat4x4f_t * dst, ne10_uint32_t count);
+    /**
+     * Set the identity matrix of a 3x3 matrix.
+     * This function pointer can be pointed at one of ne10_identitymat_3x3f_c, ne10_identitymat_3x3f_neon and ne10_identitymat_3x3f_asm.
+     * @param[out] dst   Pointer to the destination array
+     * @param[in]  count The number of items in the dst array
+     */
+    extern ne10_result_t (*ne10_identitymat_3x3f) (ne10_mat3x3f_t * dst, ne10_uint32_t count);
+    extern ne10_result_t ne10_identitymat_3x3f_c (ne10_mat3x3f_t * dst, ne10_uint32_t count);
+    extern ne10_result_t ne10_identitymat_3x3f_neon (ne10_mat3x3f_t * dst, ne10_uint32_t count) asm ("ne10_identitymat_3x3f_neon");
+    extern ne10_result_t ne10_identitymat_3x3f_asm (ne10_mat3x3f_t * dst, ne10_uint32_t count);
+    /**
+     * Set the identity matrix of a 2x2 matrix.
+     * This function pointer can be pointed at one of ne10_identitymat_2x2f_c, ne10_identitymat_2x2f_neon and ne10_identity_mat2x2f_asm (the asm variant's name is spelled differently in its declaration below).
+     * @param[out] dst   Pointer to the destination array
+     * @param[in]  count The number of items in the dst array
+     */
+    extern ne10_result_t (*ne10_identitymat_2x2f) (ne10_mat2x2f_t * dst, ne10_uint32_t count);
+    extern ne10_result_t ne10_identitymat_2x2f_c (ne10_mat2x2f_t * dst, ne10_uint32_t count);
+    extern ne10_result_t ne10_identitymat_2x2f_neon (ne10_mat2x2f_t * dst, ne10_uint32_t count) asm ("ne10_identitymat_2x2f_neon");
+    extern ne10_result_t ne10_identity_mat2x2f_asm (ne10_mat2x2f_t * dst, ne10_uint32_t count);
+    /** @} */ //end of Matrix Identity group
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/include/ne10/NE10_physics.h	Sun Jun 12 18:16:20 2016 +0100
@@ -0,0 +1,131 @@
+/*
+ *  Copyright 2014-15 ARM Limited and Contributors.
+ *  All rights reserved.
+ *
+ *  Redistribution and use in source and binary forms, with or without
+ *  modification, are permitted provided that the following conditions are met:
+ *    * Redistributions of source code must retain the above copyright
+ *      notice, this list of conditions and the following disclaimer.
+ *    * Redistributions in binary form must reproduce the above copyright
+ *      notice, this list of conditions and the following disclaimer in the
+ *      documentation and/or other materials provided with the distribution.
+ *    * Neither the name of ARM Limited nor the
+ *      names of its contributors may be used to endorse or promote products
+ *      derived from this software without specific prior written permission.
+ *
+ *  THIS SOFTWARE IS PROVIDED BY ARM LIMITED AND CONTRIBUTORS "AS IS" AND
+ *  ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ *  WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ *  DISCLAIMED. IN NO EVENT SHALL ARM LIMITED AND CONTRIBUTORS BE LIABLE FOR ANY
+ *  DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ *  (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ *  LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ *  ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ *  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ *  SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/*
+ * NE10 Library : inc/NE10_physics.h
+ */
+
+
+#include "NE10_types.h"
+
+#ifndef NE10_PHYSICS_H
+#define NE10_PHYSICS_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+///////////////////////////
+// function prototypes:
+///////////////////////////
+
+    /* Function pointers; each has the same parameter list as the matching _c prototype below. */
+    extern void (*ne10_physics_compute_aabb_vec2f) (ne10_mat2x2f_t *aabb,
+            ne10_vec2f_t *vertices,
+            ne10_mat2x2f_t *xf,
+            ne10_vec2f_t *radius,
+            ne10_uint32_t vertex_count);
+    extern void (*ne10_physics_relative_v_vec2f) (ne10_vec2f_t *dv,
+            ne10_vec3f_t *v_wa,
+            ne10_vec2f_t *ra,
+            ne10_vec3f_t *v_wb,
+            ne10_vec2f_t *rb,
+            ne10_uint32_t count);
+    extern void (*ne10_physics_apply_impulse_vec2f) (ne10_vec3f_t *v_wa,
+            ne10_vec3f_t *v_wb,
+            ne10_vec2f_t *ra,
+            ne10_vec2f_t *rb,
+            ne10_vec2f_t *ima,
+            ne10_vec2f_t *imb,
+            ne10_vec2f_t *p,
+            ne10_uint32_t count);
+
+    /* Plain C versions; parameter lists match the function pointers declared above. */
+    extern void ne10_physics_compute_aabb_vec2f_c (ne10_mat2x2f_t *aabb,
+            ne10_vec2f_t *vertices,
+            ne10_mat2x2f_t *xf,
+            ne10_vec2f_t *radius,
+            ne10_uint32_t vertex_count);
+    extern void ne10_physics_relative_v_vec2f_c (ne10_vec2f_t *dv,
+            ne10_vec3f_t *v_wa,
+            ne10_vec2f_t *ra,
+            ne10_vec3f_t *v_wb,
+            ne10_vec2f_t *rb,
+            ne10_uint32_t count);
+    extern void ne10_physics_apply_impulse_vec2f_c (ne10_vec3f_t *v_wa,
+            ne10_vec3f_t *v_wb,
+            ne10_vec2f_t *ra,
+            ne10_vec2f_t *rb,
+            ne10_vec2f_t *ima,
+            ne10_vec2f_t *imb,
+            ne10_vec2f_t *p,
+            ne10_uint32_t count);
+
+    /* NEON versions; each prototype is visible only when its ENABLE_NE10_PHYSICS_..._NEON macro is defined. */
+    /**
+     * @addtogroup COLLISION_DETECT
+     * @{
+     */
+#ifdef ENABLE_NE10_PHYSICS_COMPUTE_AABB_VEC2F_NEON
+    extern void ne10_physics_compute_aabb_vec2f_neon (ne10_mat2x2f_t *aabb,
+            ne10_vec2f_t *vertices,
+            ne10_mat2x2f_t *xf,
+            ne10_vec2f_t *radius,
+            ne10_uint32_t vertex_count);   // no asm label here, unlike the two prototypes below
+#endif // ENABLE_NE10_PHYSICS_COMPUTE_AABB_VEC2F_NEON
+
+#ifdef ENABLE_NE10_PHYSICS_RELATIVE_V_VEC2F_NEON
+    extern void ne10_physics_relative_v_vec2f_neon (ne10_vec2f_t *dv,
+            ne10_vec3f_t *v_wa,
+            ne10_vec2f_t *ra,
+            ne10_vec3f_t *v_wb,
+            ne10_vec2f_t *rb,
+            ne10_uint32_t count)
+    asm ("ne10_physics_relative_v_vec2f_neon");   // asm label: pins the symbol name used for this function
+#endif // ENABLE_NE10_PHYSICS_RELATIVE_V_VEC2F_NEON
+
+#ifdef ENABLE_NE10_PHYSICS_APPLY_IMPULSE_VEC2F_NEON
+    extern void ne10_physics_apply_impulse_vec2f_neon (ne10_vec3f_t *v_wa,
+            ne10_vec3f_t *v_wb,
+            ne10_vec2f_t *ra,
+            ne10_vec2f_t *rb,
+            ne10_vec2f_t *ima,
+            ne10_vec2f_t *imb,
+            ne10_vec2f_t *p,
+            ne10_uint32_t count)
+    asm ("ne10_physics_apply_impulse_vec2f_neon");   // asm label: pins the symbol name used for this function
+#endif // ENABLE_NE10_PHYSICS_APPLY_IMPULSE_VEC2F_NEON
+    /**
+     * @} end of COLLISION_DETECT group
+     */
+
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/include/ne10/NE10_types.h	Sun Jun 12 18:16:20 2016 +0100
@@ -0,0 +1,453 @@
+/*
+ *  Copyright 2011-15 ARM Limited and Contributors.
+ *  All rights reserved.
+ *
+ *  Redistribution and use in source and binary forms, with or without
+ *  modification, are permitted provided that the following conditions are met:
+ *    * Redistributions of source code must retain the above copyright
+ *      notice, this list of conditions and the following disclaimer.
+ *    * Redistributions in binary form must reproduce the above copyright
+ *      notice, this list of conditions and the following disclaimer in the
+ *      documentation and/or other materials provided with the distribution.
+ *    * Neither the name of ARM Limited nor the
+ *      names of its contributors may be used to endorse or promote products
+ *      derived from this software without specific prior written permission.
+ *
+ *  THIS SOFTWARE IS PROVIDED BY ARM LIMITED AND CONTRIBUTORS "AS IS" AND
+ *  ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ *  WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ *  DISCLAIMED. IN NO EVENT SHALL ARM LIMITED AND CONTRIBUTORS BE LIABLE FOR ANY
+ *  DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ *  (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ *  LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ *  ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ *  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ *  SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/*
+ * NE10 Library : inc/NE10_types.h
+ */
+
+/** NE10 defines a number of types for use in its function signatures.
+ *  The types are defined within this header file.
+ */
+
+#ifndef NE10_TYPES_H
+#define NE10_TYPES_H
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdint.h>
+#include <math.h>
+#include <string.h>
+#include <assert.h>
+
+/**
+ * @TODO Move the definition of NE10_UNROLL_LEVEL to cmake configuration files.
+ * Macro NE10_UNROLL_LEVEL controls the algorithm used by the FFT functions.
+ * When NE10_UNROLL_LEVEL == 0, complex FFT performs radix-4 x2 per loop.
+ * When NE10_UNROLL_LEVEL == 1, complex FFT performs radix-4 x4 per loop.
+ */
+#if !defined(NE10_UNROLL_LEVEL)   // a value already defined by the build takes precedence
+#if defined(__arm__)
+#define NE10_UNROLL_LEVEL 0
+#elif defined(__aarch64__)
+#define NE10_UNROLL_LEVEL 1
+#else
+#define NE10_UNROLL_LEVEL 0
+#endif
+#endif
+
+/////////////////////////////////////////////////////////
+// constant values that are used across the library
+/////////////////////////////////////////////////////////
+#define NE10_OK 0
+#define NE10_ERR -1
+
+/////////////////////////////////////////////////////////
+// some external definitions to be exposed to the users
+/////////////////////////////////////////////////////////
+
+typedef signed char             ne10_int8_t;
+typedef unsigned char           ne10_uint8_t;
+typedef signed short            ne10_int16_t;      // assumes short is 16 bits on the target - TODO confirm per platform
+typedef unsigned short          ne10_uint16_t;
+typedef signed int              ne10_int32_t;      // assumes int is 32 bits on the target - TODO confirm per platform
+typedef unsigned int            ne10_uint32_t;
+typedef signed long long int    ne10_int64_t;
+typedef unsigned long long int  ne10_uint64_t;
+typedef float                   ne10_float32_t;
+typedef double                  ne10_float64_t;
+typedef int                     ne10_result_t;     // resulting [error-]code; NE10_OK and NE10_ERR are defined above
+
+/**
+ * @brief a 2-tuple of ne10_float32_t values.
+ */
+typedef struct
+{
+    ne10_float32_t x;
+    ne10_float32_t y;
+} ne10_vec2f_t;
+
+/**
+ * @brief a 3-tuple of ne10_float32_t values.
+ */
+typedef struct
+{
+    ne10_float32_t x;
+    ne10_float32_t y;
+    ne10_float32_t z;
+} ne10_vec3f_t;
+
+/**
+ * @brief a 4-tuple of ne10_float32_t values.
+ */
+typedef struct
+{
+    ne10_float32_t x;
+    ne10_float32_t y;
+    ne10_float32_t z;
+    ne10_float32_t w;
+} ne10_vec4f_t;
+
+/////////////////////////////////////////////////////////
+// definitions for matrix
+/////////////////////////////////////////////////////////
+
+typedef struct
+{
+    ne10_float32_t r1;
+    ne10_float32_t r2;
+} __attribute__ ( (packed)) ne10_mat_row2f;   // used as one column (c1, c2) of ne10_mat2x2f_t; r1..r2 index the rows
+
+typedef struct
+{
+    ne10_mat_row2f c1;   // column 1: holds m11, m21 as filled by createColumnMajorMatrix2x2
+    ne10_mat_row2f c2;   // column 2: holds m12, m22
+
+} __attribute__ ( (packed)) ne10_mat2x2f_t;   // a 2x2 matrix, stored column by column
+
+static inline void createColumnMajorMatrix2x2 (ne10_mat2x2f_t * outMat, ne10_float32_t m11, ne10_float32_t m21, ne10_float32_t m12, ne10_float32_t m22)
+{   // Fills *outMat from arguments given column by column; mRC is the element at row R, column C.
+    assert (NULL != outMat);   // outMat is required; the check is compiled out when NDEBUG is defined
+
+    outMat->c1.r1 = m11;   // column 1
+    outMat->c1.r2 = m21;
+    outMat->c2.r1 = m12;   // column 2
+    outMat->c2.r2 = m22;
+}
+
+
+typedef struct
+{
+    ne10_float32_t r1;
+    ne10_float32_t r2;
+    ne10_float32_t r3;
+} __attribute__ ( (packed)) ne10_mat_row3f;   // used as one column (c1..c3) of ne10_mat3x3f_t; r1..r3 index the rows
+
+typedef struct
+{
+    ne10_mat_row3f c1;   // column 1: holds m11, m21, m31 as filled by createColumnMajorMatrix3x3
+    ne10_mat_row3f c2;   // column 2: holds m12, m22, m32
+    ne10_mat_row3f c3;   // column 3: holds m13, m23, m33
+
+} __attribute__ ( (packed)) ne10_mat3x3f_t;   // a 3x3 matrix, stored column by column
+
+static inline void createColumnMajorMatrix3x3 (ne10_mat3x3f_t * outMat, ne10_float32_t m11, ne10_float32_t m21, ne10_float32_t m31,
+        ne10_float32_t m12, ne10_float32_t m22, ne10_float32_t m32,
+        ne10_float32_t m13, ne10_float32_t m23, ne10_float32_t m33)
+{   // Fills *outMat from arguments given column by column; mRC is the element at row R, column C.
+    assert (NULL != outMat);   // outMat is required; the check is compiled out when NDEBUG is defined
+
+    outMat->c1.r1 = m11;   // column 1
+    outMat->c1.r2 = m21;
+    outMat->c1.r3 = m31;
+
+    outMat->c2.r1 = m12;   // column 2
+    outMat->c2.r2 = m22;
+    outMat->c2.r3 = m32;
+
+    outMat->c3.r1 = m13;   // column 3
+    outMat->c3.r2 = m23;
+    outMat->c3.r3 = m33;
+}
+
+
+typedef struct
+{
+    ne10_float32_t r1;
+    ne10_float32_t r2;
+    ne10_float32_t r3;
+    ne10_float32_t r4;
+} __attribute__ ( (packed)) ne10_mat_row4f;   // used as one column (c1..c4) of ne10_mat4x4f_t; r1..r4 index the rows
+
+typedef struct
+{
+    ne10_mat_row4f c1;   // column 1: holds m11, m21, m31, m41 as filled by createColumnMajorMatrix4x4
+    ne10_mat_row4f c2;   // column 2: holds m12, m22, m32, m42
+    ne10_mat_row4f c3;   // column 3: holds m13, m23, m33, m43
+    ne10_mat_row4f c4;   // column 4: holds m14, m24, m34, m44
+
+} __attribute__ ( (packed)) ne10_mat4x4f_t;   // a 4x4 matrix, stored column by column
+
+static inline void createColumnMajorMatrix4x4 (ne10_mat4x4f_t * outMat, ne10_float32_t m11, ne10_float32_t m21, ne10_float32_t m31, ne10_float32_t m41,
+        ne10_float32_t m12, ne10_float32_t m22, ne10_float32_t m32, ne10_float32_t m42,
+        ne10_float32_t m13, ne10_float32_t m23, ne10_float32_t m33, ne10_float32_t m43,
+        ne10_float32_t m14, ne10_float32_t m24, ne10_float32_t m34, ne10_float32_t m44)
+{   // Fills *outMat from arguments given column by column; mRC is the element at row R, column C.
+    assert (NULL != outMat);   // outMat is required; the check is compiled out when NDEBUG is defined
+
+    outMat->c1.r1 = m11;   // column 1
+    outMat->c1.r2 = m21;
+    outMat->c1.r3 = m31;
+    outMat->c1.r4 = m41;
+
+    outMat->c2.r1 = m12;   // column 2
+    outMat->c2.r2 = m22;
+    outMat->c2.r3 = m32;
+    outMat->c2.r4 = m42;
+
+    outMat->c3.r1 = m13;   // column 3
+    outMat->c3.r2 = m23;
+    outMat->c3.r3 = m33;
+    outMat->c3.r4 = m43;
+
+    outMat->c4.r1 = m14;   // column 4
+    outMat->c4.r2 = m24;
+    outMat->c4.r3 = m34;
+    outMat->c4.r4 = m44;
+}
+
+/////////////////////////////////////////////////////////
+// definitions for fft
+/////////////////////////////////////////////////////////
+
+/**
+ * @brief structure for the floating point FFT function.
+ */
+#define NE10_MAXFACTORS             32
+typedef struct
+{
+    ne10_float32_t r;
+    ne10_float32_t i;
+} ne10_fft_cpx_float32_t;
+
+/**
+ * @brief structure for the floating point FFT state
+ *
+ */
+typedef struct
+{
+    ne10_int32_t nfft;
+    ne10_int32_t *factors;
+    ne10_fft_cpx_float32_t *twiddles;
+    ne10_fft_cpx_float32_t *buffer;
+    ne10_fft_cpx_float32_t *last_twiddles;
+    /**
+     *  @brief Flag to control scaling behaviour in forward floating point complex FFT.
+     *  @note If is_forward_scaled is set 0, Ne10 will not scale output of forward floating
+     *  point complex FFT. Otherwise, Ne10 will scale output of forward floating
+     *  point complex FFT.
+     *  @warning
+     *  Only non-power-of-2 FFT is affected by this flag.
+     */
+    ne10_int32_t is_forward_scaled;
+    /**
+     *  @brief Flag to control scaling behaviour in backward floating point complex FFT.
+     *  @note If is_backward_scaled is set 0, Ne10 will not scale output of backward floating
+     *  point complex FFT. Otherwise, Ne10 will scale output of backward floating
+     *  point complex FFT.
+     *  @warning
+     *  Only non-power-of-2 FFT is affected by this flag.
+     */
+    ne10_int32_t is_backward_scaled;
+} ne10_fft_state_float32_t;
+
+/**
+ * @brief Configure for floating point FFT.
+ */
+typedef ne10_fft_state_float32_t* ne10_fft_cfg_float32_t;
+
+typedef struct
+{
+    ne10_fft_cpx_float32_t *buffer;
+#if (NE10_UNROLL_LEVEL == 0)   // NOTE: the member list, and so the struct layout, depends on NE10_UNROLL_LEVEL
+    ne10_int32_t ncfft;
+    ne10_int32_t *factors;
+    ne10_fft_cpx_float32_t *twiddles;
+    ne10_fft_cpx_float32_t *super_twiddles;
+#elif (NE10_UNROLL_LEVEL > 0)   // alternative member set; every file using this struct must see the same NE10_UNROLL_LEVEL
+    ne10_int32_t nfft;
+    ne10_fft_cpx_float32_t *r_twiddles;
+    ne10_int32_t *r_factors;
+    ne10_fft_cpx_float32_t *r_twiddles_backward;
+    ne10_fft_cpx_float32_t *r_twiddles_neon;
+    ne10_fft_cpx_float32_t *r_twiddles_neon_backward;
+    ne10_int32_t *r_factors_neon;
+    ne10_fft_cpx_float32_t *r_super_twiddles_neon;
+#endif
+} ne10_fft_r2c_state_float32_t;   // state for the float real-to-complex (r2c) FFT, judging by the type name
+
+typedef ne10_fft_r2c_state_float32_t* ne10_fft_r2c_cfg_float32_t;
+
+/**
+ * @brief structure for the 16 bits fixed point FFT function.
+ */
+typedef struct
+{
+    ne10_int16_t r;
+    ne10_int16_t i;
+} ne10_fft_cpx_int16_t;
+
+typedef struct
+{
+    ne10_int32_t nfft;
+    ne10_int32_t *factors;
+    ne10_fft_cpx_int16_t *twiddles;
+    ne10_fft_cpx_int16_t *buffer;
+} ne10_fft_state_int16_t;
+
+typedef ne10_fft_state_int16_t* ne10_fft_cfg_int16_t;
+
+typedef struct
+{
+    ne10_int32_t nfft;
+    ne10_int32_t ncfft;
+    ne10_int32_t *factors;
+    ne10_fft_cpx_int16_t *twiddles;
+    ne10_fft_cpx_int16_t *super_twiddles;
+    ne10_fft_cpx_int16_t *buffer;
+} ne10_fft_r2c_state_int16_t;
+
+typedef ne10_fft_r2c_state_int16_t* ne10_fft_r2c_cfg_int16_t;
+
+/**
+ * @brief structure for the 32 bits fixed point FFT function.
+ */
+typedef struct
+{
+    ne10_int32_t r;
+    ne10_int32_t i;
+} ne10_fft_cpx_int32_t;
+
+typedef struct
+{
+    ne10_int32_t nfft;
+    ne10_int32_t *factors;
+    ne10_fft_cpx_int32_t *twiddles;
+    ne10_fft_cpx_int32_t *buffer;
+    ne10_fft_cpx_int32_t *last_twiddles;
+} ne10_fft_state_int32_t;
+
+typedef ne10_fft_state_int32_t* ne10_fft_cfg_int32_t;
+
+typedef struct
+{
+    ne10_int32_t nfft;
+    ne10_int32_t ncfft;
+    ne10_int32_t *factors;
+    ne10_fft_cpx_int32_t *twiddles;
+    ne10_fft_cpx_int32_t *super_twiddles;
+    ne10_fft_cpx_int32_t *buffer;
+} ne10_fft_r2c_state_int32_t;
+
+typedef ne10_fft_r2c_state_int32_t* ne10_fft_r2c_cfg_int32_t;
+
+/////////////////////////////////////////////////////////
+// definitions for fir
+/////////////////////////////////////////////////////////
+
+/**
+ * @brief Instance structure for the floating-point FIR filter.
+ */
+typedef struct
+{
+    ne10_uint16_t numTaps;    /**< Length of the filter. */
+    ne10_float32_t *pState;    /**< Points to the state variable array. The array is of length numTaps+maxBlockSize-1. */
+    ne10_float32_t *pCoeffs;   /**< Points to the coefficient array. The array is of length numTaps. */
+} ne10_fir_instance_f32_t;
+
+/**
+ * @brief Instance structure for the floating point FIR Lattice filter.
+ */
+typedef struct
+{
+    ne10_uint16_t numStages;    /**< Number of stages in the lattice filter. */
+    ne10_float32_t *pState;      /**< Points to the state variable array. The array is of length numStages. */
+    ne10_float32_t *pCoeffs;     /**< Points to the coefficient array. The array is of length numStages. */
+} ne10_fir_lattice_instance_f32_t;
+
+/**
+ * @brief Instance structure for the floating-point FIR Decimation.
+ */
+typedef struct
+{
+    ne10_uint8_t  M;            /**< Decimation Factor. */
+    ne10_uint16_t numTaps;      /**< Length of the filter. */
+    ne10_float32_t    *pCoeffs;      /**< Points to the coefficient array. The array is of length numTaps.*/
+    ne10_float32_t    *pState;       /**< Points to the state variable array. The array is of length numTaps+maxBlockSize-1. */
+} ne10_fir_decimate_instance_f32_t;
+
+/**
+ * @brief Instance structure for the floating-point FIR Interpolation.
+ */
+typedef struct
+{
+    ne10_uint8_t L;             /**< Interpolation Factor. */
+    ne10_uint16_t phaseLength;  /**< Length of each polyphase filter component. */
+    ne10_float32_t *pCoeffs;         /**< Points to the coefficient array. The array is of length numTaps (no numTaps field exists in this struct; presumably L*phaseLength - TODO confirm).*/
+    ne10_float32_t *pState;          /**< Points to the state variable array. The array is of length numTaps+maxBlockSize-1 (numTaps is not a field of this struct - TODO confirm the intended length). */
+} ne10_fir_interpolate_instance_f32_t;
+
+/**
+ * @brief Instance structure for the floating-point FIR Sparse filter.
+ */
+typedef struct
+{
+    ne10_uint16_t numTaps;      /**< Length of the filter. */
+    ne10_uint16_t stateIndex;   /**< Index pointer for the state buffer .*/
+    ne10_float32_t *pState;          /**< Points to the state variable array. The array is of length numTaps+maxBlockSize-1. */
+    ne10_float32_t *pCoeffs;         /**< Points to the coefficient array. The array is of length numTaps.*/
+    ne10_uint16_t  maxDelay;    /**< the largest number of delay line values .*/
+    ne10_int32_t  *pTapDelay;    /**< Pointer to the array containing positions of the non-zero tap values. */
+} ne10_fir_sparse_instance_f32_t;
+
+/**
+ * @brief Instance structure for the floating point IIR Lattice filter.
+ */
+typedef struct
+{
+    ne10_uint16_t numStages;    /**< Number of stages in the lattice filter. */
+    ne10_float32_t *pState;      /**< Points to the state variable array. The array is of length numStages + blockSize -1. */
+    ne10_float32_t *pkCoeffs;    /**< Points to the reflection coefficient array. The array is of length numStages. */
+    ne10_float32_t *pvCoeffs;    /**< Points to the ladder coefficient array. The array is of length numStages+1. */
+} ne10_iir_lattice_instance_f32_t;
+
+/////////////////////////////////////////////////////////
+// definitions for imgproc module
+/////////////////////////////////////////////////////////
+
+/**
+ * @brief Structure for point in image
+ */
+typedef struct
+{
+    ne10_uint32_t x;
+    ne10_uint32_t y;
+} ne10_point_t;
+
+typedef struct
+{
+    ne10_uint32_t x;
+    ne10_uint32_t y;
+} ne10_size_t;
+
+typedef enum
+{
+    UBUNTU_COMMAND_LINE,
+    ANDROID_DEMO,
+    IOS_DEMO
+} ne10_print_target_t;
+
+#endif