DSP

DSP软件优化小实验

2019-07-13 17:15发布

原文
一直以来都在DM642平台下写程序,而自己所做的大部分工作也和优化相关。为了更加清楚地理解TI CCS编译器的优化规则,做了以下试验:   分别写了7个相同功能的函数,操作相同, 但实现的方式不一样,自然运行的时间也不一样: ----------------------debug mode---------------------------------- combine1(v, &dest)    Time elapse: 0.028967 ms.
combine2(v, &dest)    Time elapse: 0.024751 ms.
combine3(v, &dest)    Time elapse: 0.023541 ms.
combine4(v, &dest)    Time elapse: 0.012635 ms.
combine4p(v, &dest)  Time elapse: 0.012090 ms.
combine5p(v, &dest)  Time elapse: 0.007475 ms.
combine6(v, &dest)    Time elapse: 0.011119 ms.
combine6p(v, &dest)  Time elapse: 0.006823 ms.   combine1 > combine2 > combine3 > combine4 > combine4p > combine6 > combine5p > combine6p   由于没有开优化,在debug模式下运行,这个结果和自己预期的比较一致。从combine1到combine6p, 依次减少了函数的调用和对存储器的访问,并进行了循环展开等等,自然其耗时越来越短。然而当我把-o3优化打开时,结果令我费解: ------------------release mode--------------------------------------- combine1(v, &dest)   Time elapse: 0.011491 ms.
combine2(v, &dest)   Time elapse: 0.009616 ms.
combine3(v, &dest)   Time elapse: 0.009603 ms.
combine4(v, &dest)   Time elapse: 0.003884 ms.
combine4p(v, &dest) Time elapse: 0.004096 ms.
combine5p(v, &dest) Time elapse: 0.005573 ms.
combine6(v, &dest)   Time elapse: 0.005120 ms.
combine6p(v, &dest) Time elapse: 0.004987 ms.
最值得注意的变化是combine4和combine4p. 下降的幅度最大,并且成为了耗时最短的函数。 仔细看一下combine4中的核心循环:  for (i = 0; i < len; i++)
 {
    x = x OPER data[i];
 }
这里并没有任何人为的循环展开。而正是因为如此编译器对它所进行的优化程度甚至超过了人为循环展开的函数。 这使得它一下子成为了最快的函数。而且combine4与combine4p的唯一区别在于一个使用数组,一个用指针。 而在这里也证明了一点:编译器对数组的优化程度大于指针。尽管差别不是太大。 打开profile查看: 0:0x800202c0-0x80020374,combine4,125-143:test.c,  function,  1,1580,1580,1430,1430,7,7,7,7, 0:0x800207b4-0x80020980,combine6,217-239:test.c,  function,  1,2117,2117,2021,2021,12,12,9,9,

0:0x80020344-0x80020354,combine4,138-141:test.c, loop,    124,1102,1102,978,978,5,5,4,4, (循环124次,cache hit 5/(5+4) = 0.56)
0:0x80020880-0x80020898,combine6,227-231:test.c, loop,       60,1041,1041,981,981,6,6,4,4, (循环60次,cache hit 4/10 = 0.4)   看来编译器做循环展开比我们人为地做要好。
另外,combine6p做了很大的循环展开,但并没有像想象的那样有几何级数的提速,原因是循环展开到一定程度,内存的访问等待时间成为瓶颈。
  1 #include 
  2 #include 
  3 #include 
  4 #include 
  5 #include 
  6 #include 
  7 #include "myMath.h"
  8 #include "HKY_testTime.h"
  9 
 10 #define IDENT 0
 11 #define OPER +
 12 #define VEC_LEN 1024
 13 
 14 typedef int data_t;
 15 
 16 typedef struct {
 17     int len;
 18     data_t *data;
 19 }vec_rec, *vec_ptr;
 20 
 21 vec_ptr new_vec(int len);
 22 int get_vec_element(vec_ptr v, int index, data_t *dest);
 23 int vec_length(vec_ptr v);
 24 
 25 void combine1(vec_ptr v, data_t *dest);
 26 
 27 vec_ptr new_vec(int len)
 28 {
 29     vec_ptr result = (vec_ptr)malloc(sizeof(vec_rec));
 30     if (!result)
 31         {
 32             return NULL;
 33         }
 34     
 35     result->len = len;
 36     
 37     if (len > 0)
 38         {
 39             int i;
 40             data_t *data = (data_t *)malloc(len * sizeof(data_t));
 41             if (!data)
 42                 {
 43                     free((void *)result);
 44                     return NULL;
 45                 }
 46             result->data = data;
 47             
 48             for (i = 0; i < len; i++)
 49             {
 50                 data[i] = i;
 51             }
 52         }
 53     else
 54         {
 55             result->data = NULL;
 56         }
 57     return result;
 58 }
 59 
 60 int get_vec_element(vec_ptr v, int index, data_t *dest)
 61 {
 62     if (index < 0 || index >= v->len)
 63         {
 64             return 0;
 65         }
 66     *dest = v->data[index];
 67     return 1;
 68 }
 69 
 70 int vec_length(vec_ptr v)
 71 {
 72     return v->len;
 73 }
 74 
 75 void combine1(vec_ptr v, data_t *dest)
 76 {
 77     int i;
 78     
 79     *dest = IDENT;
 80 
 81     for (i = 0; i < vec_length(v); i++)
 82     {
 83         data_t val;
 84         get_vec_element(v, i, &val);
 85         *dest = *dest OPER val;
 86     }
 87 }
 88 
 89 void combine2(vec_ptr v, data_t *dest)
 90 {
 91     int i;
 92     int len;
 93     
 94     *dest = IDENT;
 95     len = vec_length(v);
 96     
 97     for (i = 0; i < len; i++)
 98     {
 99         data_t val;
100         get_vec_element(v, i, &val);
101         *dest = *dest OPER val;
102     }
103 }
104 
105 void combine3(vec_ptr v, data_t *dest)
106 {
107     int i;
108     int len;
109     data_t x;
110     
111     *dest = IDENT;
112     x = IDENT;
113     
114     len = vec_length(v);
115     
116     for (i = 0; i < len; i++)
117     {
118         data_t val;
119         get_vec_element(v, i, &val);
120         x = x OPER val;
121     }
122     *dest = x;
123 }
124 
125 void combine4(vec_ptr v, data_t *dest)
126 {
127     int i;
128     int len;
129     data_t x;
130     data_t *data;
131     
132     *dest = IDENT;
133     x = IDENT;
134     
135     len = vec_length(v);
136     data = v->data;
137 
138     for (i = 0; i < len; i++)
139     {
140         x = x OPER data[i];
141     }
142     *dest = x;
143 }
144 
145 void combine4p(vec_ptr v, data_t *dest)
146 {
147     int i;
148     int len;
149     data_t x;
150     data_t *data;
151     data_t *dend;
152     
153     *dest = IDENT;
154     x = IDENT;
155     
156     len = vec_length(v);
157     data = v->data;
158     dend = data + len;
159 
160     for (; data < dend; data++)
161     {
162         x = x OPER (*data);
163     }
164     *dest = x;
165 }
166 void combine5(vec_ptr v, data_t *dest)
167 {
168     int i;
169     int len;
170     data_t x;
171     data_t *data;
172     
173     *dest = IDENT;
174     x = IDENT;
175     
176     len = vec_length(v);
177     data = v->data;
178 
179     for (i = 0; i < len; i += 2)
180     {
181         x = x OPER data[i];
182         x = x OPER data[i+1];
183     }
184     *dest = x;
185 }
186 void combine5p(vec_ptr v, data_t *dest)
187 {
188     int i;
189     int len;
190     data_t x;
191     data_t *data;
192     data_t *dend;
193     data_t *dlimit;
194     
195     *dest = IDENT;
196     x = IDENT;
197         
198     len = vec_length(v);
199     data = v->data;
200     dend = data + len;
201     dlimit = dend - 7;
202 
203     for (; data < dlimit; data += 8)
204     {
205         x = x OPER data[0] OPER data[1] OPER data[2] OPER data[3]
206               OPER data[4] OPER data[5] OPER data[6] OPER data[7];
207     }
208     
209     for (; data < dend; data++)
210     {
211         x = x OPER data[0];
212     }
213 
214     *dest = x;
215 }
216 
217 void combine6(vec_ptr v, data_t *dest)
218 {
219     int length = vec_length(v);
220     int limit = length - 1;
221     int i;
222     
223     data_t *data = v->data;
224     data_t x0 = IDENT;
225     data_t x1 = IDENT;
226     
227     for (i = 0; i < limit; i += 2)
228     {
229         x0 = x0 OPER data[i];
230         x1 = x1 OPER data[i+1];
231     }
232     
233     for (; i < length; i++)
234     {
235         x0 = x0 OPER data[i];
236     }
237     
238     *dest = x0 OPER x1;
239 }
240         
241 void combine6p(vec_ptr v, data_t *dest)
242 {
243     int i;
244     int len;
245     data_t x;
246     data_t *data;
247     data_t *dend;
248     data_t *dlimit;
249     
250     *dest = IDENT;
251     x = IDENT;
252         
253     len = vec_length(v);
254     data = v->data;
255     dend = data + len;
256     dlimit = dend - 15;
257 
258     for (; data < dlimit; data += 16)
259     {
260         x = x OPER data[0] OPER data[1] OPER data[2] OPER data[3] OPER 
261                    data[4] OPER data[5] OPER data[6] OPER data[7] OPER
262                    data[8] OPER data[9] OPER data[10] OPER data[11] OPER
263                    data[12] OPER data[13] OPER data[14] OPER data[15];
264     }
265     
266     for (; data < dend; data++)
267     {
268         x = x OPER data[0];
269     }
270 
271     *dest = x;
272 }
273 
/* Expression form: yields X limited to the range [AMIN, AMAX]. */
#define CLIP(X,AMIN,AMAX) (((X)<(AMIN)) ? (AMIN) : ((X)>(AMAX)) ? (AMAX) : (X))

/*
 * Statement form: stores X limited to [AMIN, AMAX] into Y.
 * Wrapped in do/while(0) so it behaves as a single statement (safe inside
 * an unbraced if/else); every argument is parenthesized. Arguments may be
 * evaluated more than once, so avoid side effects in them.
 */
#define CLIP1(Y, X, AMIN, AMAX)             \
    do {                                    \
        if ((X) < (AMIN)) {                 \
            (Y) = (AMIN);                   \
        } else if ((X) > (AMAX)) {          \
            (Y) = (AMAX);                   \
        } else {                            \
            (Y) = (X);                      \
        }                                   \
    } while (0)
279 
#define PI (3.1415926)
/*
 * Round to the nearest integer (a fractional part of exactly 0.5 rounds
 * down). Built on floor() so negative inputs round correctly, and fully
 * parenthesized so it can be used inside larger expressions.
 */
#define ROUND(x) ((int)floor(x) + ((((x) - floor(x)) > 0.5) ? 1 : 0))

/*
 * Print a fixed-point trigonometric table: one line per 1/8 degree over a
 * full circle (360 * 8 entries), each value scaled by 65536 and rounded.
 *
 * NOTE(review): despite the name, the table is computed with sin(), not
 * cos() - confirm which one is intended.
 */
void genCosTable(void)
{
    int i;

    for (i = 0; i < 360 * 8; i++)
    {
        double theta = (double)(i) * PI / (8 * 180);   /* 1/8-degree steps, in radians */
        double scaled = sin(theta) * 65536;
        int rounded = ROUND(scaled);

        printf("temp = %d \n", rounded);
    }
}
298 
299 int test_combine()
300 {
301     vec_ptr v = new_vec(128);
302     data_t dest;
303     HKY_CSL_INIT();
304 
305     CALL_FUN_TIME(combine1(v, &dest));
306     CALL_FUN_TIME(combine2(v, &dest));
307     CALL_FUN_TIME(combine3(v, &dest));
308     CALL_FUN_TIME(combine4(v, &dest));
309     CALL_FUN_TIME(combine4p(v, &dest));
310     CALL_FUN_TIME(combine5(v ,&dest));
311     CALL_FUN_TIME(combine5p(v, &dest));
312     CALL_FUN_TIME(combine6(v, &dest));
313     CALL_FUN_TIME(combine6p(v, &dest));
314     return 0;
315 }
316 
317 
318 #define N 128
319 void mm_ijk(short *c, short *b, short *a)
320 {
321     int i, j, k;
322     short sum;
323     
324     short (*C)[N] = (short (*)[N])c;
325     short (*B)[N] = (short (*)[N])b;
326     short (*A)[N] = (short (*)[N])a;
327 
328     for (i = 0; i < N; i++)
329     {
330         for (j = 0; j < N; j++)
331         {
332             sum = 0;
333             for (k = 0; k < N; k++)
334             {
335                 sum += A[i][k] * B[k][j];
336             }
337             C[i][j] += sum;
338         }
339     }
340 
341 }
342 
/* Program entry point: run the combine* timing benchmark and exit. */
int main()
{
    (void)test_combine();   /* result intentionally ignored */

    return 0;
}
--------------------header files------------------------------------- #ifndef _HKY_TEST_TIME_H_
#define _HKY_TEST_TIME_H_
#include            
#include   
#include
#include
//#define USE_CLOCK_FUNC    //if you want to use clock(), open this TIMER_Config MyConfig = {
  0x00000200, /* ctl */
  0xFFFFFFFF, /* prd */
  0x00000000  /* cnt */
};
TIMER_Handle myhTimer;
double start_time, end_time, cur_time;
/*
 * One-time setup for the timing harness: initialize the CSL, configure the
 * L2 cache mode and enable caching for the two EMIFA regions, then open,
 * configure, zero and start timer 0 (handle stored in myhTimer).
 *
 * Wrapped in do/while(0) so that `HKY_CSL_INIT();` is a single statement.
 */
#define HKY_CSL_INIT()                              \
    do {                                            \
        CSL_init();                                 \
        CACHE_setL2Mode(CACHE_256KCACHE);           \
        CACHE_enableCaching(CACHE_EMIFA_CE00);      \
        CACHE_enableCaching(CACHE_EMIFA_CE01);      \
        myhTimer = TIMER_open(TIMER_DEV0, 0);       \
        TIMER_config(myhTimer, &MyConfig);          \
        TIMER_setCount(myhTimer, 0);                \
        TIMER_start(myhTimer);                      \
    } while (0)
 
/*
 * CALL_FUN_TIME(fun_arg): run the statement/expression `fun_arg` once and
 * print its text followed by the elapsed time in milliseconds.
 *
 * The L2 cache is cleaned before each measurement. Results are left in the
 * globals start_time, end_time and cur_time. Both variants are wrapped in
 * do/while(0) so `CALL_FUN_TIME(f(x));` is a single statement, and the
 * stringified argument is printed through "%s" rather than being used as
 * the format string.
 */
#ifndef USE_CLOCK_FUNC
/*
 * Hardware-timer variant (uses myhTimer).
 * NOTE(review): 1.33e-5 is presumably milliseconds per timer tick for this
 * board's timer clock - confirm.
 */
#define CALL_FUN_TIME(fun_arg)                                          \
    do {                                                                \
        CACHE_clean(CACHE_L2ALL, (void *)0, 0);                         \
        TIMER_setCount(myhTimer, 0);                                    \
        start_time = TIMER_getCount(myhTimer);                          \
        fun_arg;                                                        \
        end_time = TIMER_getCount(myhTimer);                            \
        cur_time = (end_time - start_time) * 1.33e-5;                   \
        printf("%s\tTime elapse: %f ms.\n", #fun_arg, cur_time);        \
    } while (0)
#else
/*
 * clock() variant.
 * NOTE(review): 1.67e-6 is presumably milliseconds per clock() tick on the
 * target - confirm.
 */
#define CALL_FUN_TIME(fun_arg)                                          \
    do {                                                                \
        CACHE_clean(CACHE_L2ALL, (void *)0, 0);                         \
        start_time = clock();                                           \
        fun_arg;                                                        \
        end_time = clock();                                             \
        cur_time = (end_time - start_time) * 1.67e-6;                   \
        printf("%s\tTime elapse: %f ms.\n", #fun_arg, cur_time);        \
    } while (0)

#endif //USE_CLOCK_FUNC

#endif //_HKY_TEST_TIME_H_