00001 /* ---------------------------------------------------------------------- 00002 * Copyright (C) 2010 ARM Limited. All rights reserved. 00003 * 00004 * $Date: 15. July 2011 00005 * $Revision: V1.0.10 00006 * 00007 * Project: CMSIS DSP Library 00008 * Title: arm_mat_mult_q31.c 00009 * 00010 * Description: Q31 matrix multiplication. 00011 * 00012 * Target Processor: Cortex-M4/Cortex-M3/Cortex-M0 00013 * 00014 * Version 1.0.10 2011/7/15 00015 * Big Endian support added and Merged M0 and M3/M4 Source code. 00016 * 00017 * Version 1.0.3 2010/11/29 00018 * Re-organized the CMSIS folders and updated documentation. 00019 * 00020 * Version 1.0.2 2010/11/11 00021 * Documentation updated. 00022 * 00023 * Version 1.0.1 2010/10/05 00024 * Production release and review comments incorporated. 00025 * 00026 * Version 1.0.0 2010/09/20 00027 * Production release and review comments incorporated. 00028 * 00029 * Version 0.0.5 2010/04/26 00030 * incorporated review comments and updated with latest CMSIS layer 00031 * 00032 * Version 0.0.3 2010/03/10 00033 * Initial version 00034 * -------------------------------------------------------------------- */ 00035 00036 #include "arm_math.h" 00037 00073 arm_status arm_mat_mult_q31( 00074 const arm_matrix_instance_q31 * pSrcA, 00075 const arm_matrix_instance_q31 * pSrcB, 00076 arm_matrix_instance_q31 * pDst) 00077 { 00078 q31_t *pIn1 = pSrcA->pData; /* input data matrix pointer A */ 00079 q31_t *pIn2 = pSrcB->pData; /* input data matrix pointer B */ 00080 q31_t *pInA = pSrcA->pData; /* input data matrix pointer A */ 00081 q31_t *pOut = pDst->pData; /* output data matrix pointer */ 00082 q31_t *px; /* Temporary output data matrix pointer */ 00083 q63_t sum; /* Accumulator */ 00084 uint16_t numRowsA = pSrcA->numRows; /* number of rows of input matrix A */ 00085 uint16_t numColsB = pSrcB->numCols; /* number of columns of input matrix B */ 00086 uint16_t numColsA = pSrcA->numCols; /* number of columns of input matrix A */ 00087 00088 #ifndef ARM_MATH_CM0 00089 00090 /* Run the below code for Cortex-M4 and Cortex-M3 */ 00091 00092 uint16_t col, i = 0u, j, row = numRowsA, colCnt; /* loop counters */ 00093 arm_status status; /* status of matrix multiplication */ 00094 00095 00096 #ifdef ARM_MATH_MATRIX_CHECK 00097 00098 00099 /* Check for matrix mismatch condition */ 00100 if((pSrcA->numCols != pSrcB->numRows) || 00101 (pSrcA->numRows != pDst->numRows) || (pSrcB->numCols != pDst->numCols)) 00102 { 00103 /* Set status as ARM_MATH_SIZE_MISMATCH */ 00104 status = ARM_MATH_SIZE_MISMATCH; 00105 } 00106 else 00107 #endif /* #ifdef ARM_MATH_MATRIX_CHECK */ 00108 00109 { 00110 /* The following loop performs the dot-product of each row in pSrcA with each column in pSrcB */ 00111 /* row loop */ 00112 do 00113 { 00114 /* Output pointer is set to starting address of the row being processed */ 00115 px = pOut + i; 00116 00117 /* For every row wise process, the column loop counter is to be initiated */ 00118 col = numColsB; 00119 00120 /* For every row wise process, the pIn2 pointer is set 00121 ** to the starting address of the pSrcB data */ 00122 pIn2 = pSrcB->pData; 00123 00124 j = 0u; 00125 00126 /* column loop */ 00127 do 00128 { 00129 /* Set the variable sum, that acts as accumulator, to zero */ 00130 sum = 0; 00131 00132 /* Initiate the pointer pIn1 to point to the starting address of pInA */ 00133 pIn1 = pInA; 00134 00135 /* Apply loop unrolling and compute 4 MACs simultaneously. */ 00136 colCnt = numColsA >> 2; 00137 00138 00139 /* matrix multiplication */ 00140 while(colCnt > 0u) 00141 { 00142 /* c(m,n) = a(1,1)*b(1,1) + a(1,2) * b(2,1) + .... + a(m,p)*b(p,n) */ 00143 /* Perform the multiply-accumulates */ 00144 sum += (q63_t) * pIn1++ * *pIn2; 00145 pIn2 += numColsB; 00146 00147 sum += (q63_t) * pIn1++ * *pIn2; 00148 pIn2 += numColsB; 00149 00150 sum += (q63_t) * pIn1++ * *pIn2; 00151 pIn2 += numColsB; 00152 00153 sum += (q63_t) * pIn1++ * *pIn2; 00154 pIn2 += numColsB; 00155 00156 /* Decrement the loop counter */ 00157 colCnt--; 00158 } 00159 00160 /* If the columns of pSrcA is not a multiple of 4, compute any remaining output samples here. 00161 ** No loop unrolling is used. */ 00162 colCnt = numColsA % 0x4u; 00163 00164 while(colCnt > 0u) 00165 { 00166 /* c(m,n) = a(1,1)*b(1,1) + a(1,2) * b(2,1) + .... + a(m,p)*b(p,n) */ 00167 /* Perform the multiply-accumulates */ 00168 sum += (q63_t) * pIn1++ * *pIn2; 00169 pIn2 += numColsB; 00170 00171 /* Decrement the loop counter */ 00172 colCnt--; 00173 } 00174 00175 /* Convert the result from 2.62 to 1.31 format and store in destination buffer */ 00176 *px++ = (q31_t) (sum >> 31); 00177 00178 /* Update the pointer pIn2 to point to the starting address of the next column */ 00179 j++; 00180 pIn2 = (pSrcB->pData) + j; 00181 00182 /* Decrement the column loop counter */ 00183 col--; 00184 00185 } while(col > 0u); 00186 00187 #else 00188 00189 /* Run the below code for Cortex-M0 */ 00190 00191 q31_t *pInB = pSrcB->pData; /* input data matrix pointer B */ 00192 uint16_t col, i = 0u, row = numRowsA, colCnt; /* loop counters */ 00193 arm_status status; /* status of matrix multiplication */ 00194 00195 00196 #ifdef ARM_MATH_MATRIX_CHECK 00197 00198 /* Check for matrix mismatch condition */ 00199 if((pSrcA->numCols != pSrcB->numRows) || 00200 (pSrcA->numRows != pDst->numRows) || (pSrcB->numCols != pDst->numCols)) 00201 { 00202 /* Set status as ARM_MATH_SIZE_MISMATCH */ 00203 status = ARM_MATH_SIZE_MISMATCH; 00204 } 00205 else 00206 #endif /* #ifdef ARM_MATH_MATRIX_CHECK */ 00207 00208 { 00209 /* The following loop performs the dot-product of each row in pSrcA with each column in pSrcB */ 00210 /* row loop */ 00211 do 00212 { 00213 /* Output pointer is set to starting address of the row being processed */ 00214 px = pOut + i; 00215 00216 /* For every row wise process, the column loop counter is to be initiated */ 00217 col = numColsB; 00218 00219 /* For every row wise process, the pIn2 pointer is set 00220 ** to the starting address of the pSrcB data */ 00221 pIn2 = pSrcB->pData; 00222 00223 /* column loop */ 00224 do 00225 { 00226 /* Set the variable sum, that acts as accumulator, to zero */ 00227 sum = 0; 00228 00229 /* Initiate the pointer pIn1 to point to the starting address of pInA */ 00230 pIn1 = pInA; 00231 00232 /* Matrix A columns number of MAC operations are to be performed */ 00233 colCnt = numColsA; 00234 00235 /* matrix multiplication */ 00236 while(colCnt > 0u) 00237 { 00238 /* c(m,n) = a(1,1)*b(1,1) + a(1,2) * b(2,1) + .... + a(m,p)*b(p,n) */ 00239 /* Perform the multiply-accumulates */ 00240 sum += (q63_t) * pIn1++ * *pIn2; 00241 pIn2 += numColsB; 00242 00243 /* Decrement the loop counter */ 00244 colCnt--; 00245 } 00246 00247 /* Convert the result from 2.62 to 1.31 format and store in destination buffer */ 00248 *px++ = (q31_t) (sum >> 31); 00249 00250 /* Decrement the column loop counter */ 00251 col--; 00252 00253 /* Update the pointer pIn2 to point to the starting address of the next column */ 00254 pIn2 = pInB + (numColsB - col); 00255 00256 } while(col > 0u); 00257 00258 #endif 00259 00260 /* Update the pointer pInA to point to the starting address of the next row */ 00261 i = i + numColsB; 00262 pInA = pInA + numColsA; 00263 00264 /* Decrement the row loop counter */ 00265 row--; 00266 00267 } while(row > 0u); 00268 00269 /* set status as ARM_MATH_SUCCESS */ 00270 status = ARM_MATH_SUCCESS; 00271 } 00272 /* Return to application */ 00273 return (status); 00274 } 00275