原文
一直以来都在DM642平台下写程序,而自己所做的大部分工作也和优化相关。为了更加清楚地理解TI CCS编译器的优化规则,做了以下试验:
分别写了7个相同功能的函数,操作相同, 但实现的方式不一样,自然运行的时间也不一样:
----------------------debug mode----------------------------------
combine1(v, &dest) Time elapse: 0.028967 ms.
combine2(v, &dest) Time elapse: 0.024751 ms.
combine3(v, &dest) Time elapse: 0.023541 ms.
combine4(v, &dest) Time elapse: 0.012635 ms.
combine4p(v, &dest) Time elapse: 0.012090 ms.
combine5p(v, &dest) Time elapse: 0.007475 ms.
combine6(v, &dest) Time elapse: 0.011119 ms.
combine6p(v, &dest) Time elapse: 0.006823 ms.
combine1 > combine2 > combine3 > combine4 > combine4p > combine6 > combine5p > combine6p
由于没有开优化,在debug模式下运行,这个结果和自己预期的比较一致。从combine1到combine6p, 依次减少了函数的调用和对存储器的访问,以及循环的展开等等。自然其耗时越来越短。然而当我把-o3优化打开时,结果令我费解:
------------------release mode---------------------------------------
combine1(v, &dest) Time elapse: 0.011491 ms.
combine2(v, &dest) Time elapse: 0.009616 ms.
combine3(v, &dest) Time elapse: 0.009603 ms.
combine4(v, &dest) Time elapse: 0.003884 ms.
combine4p(v, &dest) Time elapse: 0.004096 ms.
combine5p(v, &dest) Time elapse: 0.005573 ms.
combine6(v, &dest) Time elapse: 0.005120 ms.
combine6p(v, &dest) Time elapse: 0.004987 ms.
最值得注意的变化是combine4和combine4p:它们下降的幅度最大,并且成为了耗时最短的两个函数。
仔细看一下combine4中的核心循环:
for (i = 0; i < len; i++)
{
x = x OPER data[i];
}
这里并没有任何人为的循环展开。而正是因为如此编译器对它所进行的优化程度甚至超过了人为循环展开的函数。
这使得它一下子成为了最快的函数。而且combine4与combine4p的唯一区别在于一个使用数组,一个用指针。
而在这里也证明了一点:
编译器对数组的优化程度大于指针,尽管差别不是太大。
打开profile查看:
0:0x800202c0-0x80020374,
combine4,125-143:test.c, function, 1,1580,1580,1430,1430,7,7,7,7,
0:0x800207b4-0x80020980,
combine6,217-239:test.c, function, 1,2117,2117,2021,2021,12,12,9,9,
0:0x80020344-0x80020354,
combine4,138-141:test.c, loop,
124,1102,1102,978,978,5,5,4,4, (循环124次,cache hit 5/(5+4) = 0.56)
0:0x80020880-0x80020898,
combine6,227-231:test.c, loop,
60,1041,1041,981,981,6,6,4,4, (循环60次,cache hit 4/10 = 0.4)
看来编译器做循环展开比我们人为地做要好。
另外,combine6p做了很大的循环展开,但并没有像想象的那样有几何级数的提速,原因是循环展开到一定程度,内存的访问等待时间成为瓶颈。
1 #include
2 #include
3 #include
4 #include
5 #include
6 #include
7 #include "myMath.h"
8 #include "HKY_testTime.h"
9
/* Identity value and binary operator used by every combine* reduction. */
#define IDENT 0
#define OPER +
#define VEC_LEN 1024

/* Element type held by a vector. */
typedef int data_t;

/* A length paired with a pointer to its element storage. */
typedef struct {
    int len;      /* element count */
    data_t *data; /* element storage */
} vec_rec;

/* Handle type through which all vector routines are called. */
typedef vec_rec *vec_ptr;

/* Vector construction and access. */
vec_ptr new_vec(int len);
int get_vec_element(vec_ptr v, int index, data_t *dest);
int vec_length(vec_ptr v);

/* Baseline reduction. */
void combine1(vec_ptr v, data_t *dest);
26
27 vec_ptr new_vec(int len)
28 {
29 vec_ptr result = (vec_ptr)malloc(sizeof(vec_rec));
30 if (!result)
31 {
32 return NULL;
33 }
34
35 result->len = len;
36
37 if (len > 0)
38 {
39 int i;
40 data_t *data = (data_t *)malloc(len * sizeof(data_t));
41 if (!data)
42 {
43 free((void *)result);
44 return NULL;
45 }
46 result->data = data;
47
48 for (i = 0; i < len; i++)
49 {
50 data[i] = i;
51 }
52 }
53 else
54 {
55 result->data = NULL;
56 }
57 return result;
58 }
59
60 int get_vec_element(vec_ptr v, int index, data_t *dest)
61 {
62 if (index < 0 || index >= v->len)
63 {
64 return 0;
65 }
66 *dest = v->data[index];
67 return 1;
68 }
69
70 int vec_length(vec_ptr v)
71 {
72 return v->len;
73 }
74
75 void combine1(vec_ptr v, data_t *dest)
76 {
77 int i;
78
79 *dest = IDENT;
80
81 for (i = 0; i < vec_length(v); i++)
82 {
83 data_t val;
84 get_vec_element(v, i, &val);
85 *dest = *dest OPER val;
86 }
87 }
88
89 void combine2(vec_ptr v, data_t *dest)
90 {
91 int i;
92 int len;
93
94 *dest = IDENT;
95 len = vec_length(v);
96
97 for (i = 0; i < len; i++)
98 {
99 data_t val;
100 get_vec_element(v, i, &val);
101 *dest = *dest OPER val;
102 }
103 }
104
105 void combine3(vec_ptr v, data_t *dest)
106 {
107 int i;
108 int len;
109 data_t x;
110
111 *dest = IDENT;
112 x = IDENT;
113
114 len = vec_length(v);
115
116 for (i = 0; i < len; i++)
117 {
118 data_t val;
119 get_vec_element(v, i, &val);
120 x = x OPER val;
121 }
122 *dest = x;
123 }
124
125 void combine4(vec_ptr v, data_t *dest)
126 {
127 int i;
128 int len;
129 data_t x;
130 data_t *data;
131
132 *dest = IDENT;
133 x = IDENT;
134
135 len = vec_length(v);
136 data = v->data;
137
138 for (i = 0; i < len; i++)
139 {
140 x = x OPER data[i];
141 }
142 *dest = x;
143 }
144
145 void combine4p(vec_ptr v, data_t *dest)
146 {
147 int i;
148 int len;
149 data_t x;
150 data_t *data;
151 data_t *dend;
152
153 *dest = IDENT;
154 x = IDENT;
155
156 len = vec_length(v);
157 data = v->data;
158 dend = data + len;
159
160 for (; data < dend; data++)
161 {
162 x = x OPER (*data);
163 }
164 *dest = x;
165 }
166 void combine5(vec_ptr v, data_t *dest)
167 {
168 int i;
169 int len;
170 data_t x;
171 data_t *data;
172
173 *dest = IDENT;
174 x = IDENT;
175
176 len = vec_length(v);
177 data = v->data;
178
179 for (i = 0; i < len; i += 2)
180 {
181 x = x OPER data[i];
182 x = x OPER data[i+1];
183 }
184 *dest = x;
185 }
186 void combine5p(vec_ptr v, data_t *dest)
187 {
188 int i;
189 int len;
190 data_t x;
191 data_t *data;
192 data_t *dend;
193 data_t *dlimit;
194
195 *dest = IDENT;
196 x = IDENT;
197
198 len = vec_length(v);
199 data = v->data;
200 dend = data + len;
201 dlimit = dend - 7;
202
203 for (; data < dlimit; data += 8)
204 {
205 x = x OPER data[0] OPER data[1] OPER data[2] OPER data[3]
206 OPER data[4] OPER data[5] OPER data[6] OPER data[7];
207 }
208
209 for (; data < dend; data++)
210 {
211 x = x OPER data[0];
212 }
213
214 *dest = x;
215 }
216
217 void combine6(vec_ptr v, data_t *dest)
218 {
219 int length = vec_length(v);
220 int limit = length - 1;
221 int i;
222
223 data_t *data = v->data;
224 data_t x0 = IDENT;
225 data_t x1 = IDENT;
226
227 for (i = 0; i < limit; i += 2)
228 {
229 x0 = x0 OPER data[i];
230 x1 = x1 OPER data[i+1];
231 }
232
233 for (; i < length; i++)
234 {
235 x0 = x0 OPER data[i];
236 }
237
238 *dest = x0 OPER x1;
239 }
240
241 void combine6p(vec_ptr v, data_t *dest)
242 {
243 int i;
244 int len;
245 data_t x;
246 data_t *data;
247 data_t *dend;
248 data_t *dlimit;
249
250 *dest = IDENT;
251 x = IDENT;
252
253 len = vec_length(v);
254 data = v->data;
255 dend = data + len;
256 dlimit = dend - 15;
257
258 for (; data < dlimit; data += 16)
259 {
260 x = x OPER data[0] OPER data[1] OPER data[2] OPER data[3]
OPER
261 data[4] OPER data[5] OPER data[6] OPER data[7]
OPER
262 data[8] OPER data[9] OPER data[10] OPER data[11]
OPER
263 data[12] OPER data[13] OPER data[14] OPER data[15];
264 }
265
266 for (; data < dend; data++)
267 {
268 x = x OPER data[0];
269 }
270
271 *dest = x;
272 }
273
/*
 * CLIP: expression form, yields X clamped to [AMIN, AMAX].
 * Arguments may be evaluated more than once; do not pass expressions with
 * side effects.
 */
#define CLIP(X,AMIN,AMAX) (((X)<(AMIN)) ? (AMIN) : ((X)>(AMAX)) ? (AMAX) : (X))

/*
 * CLIP1: statement form, stores X clamped to [AMIN, AMAX] into Y.
 * Wrapped in do { } while (0) so it behaves as a single statement (safe
 * inside an unbraced if/else); use it with a trailing semicolon.
 * Arguments may be evaluated more than once.
 */
#define CLIP1(Y, X, AMIN, AMAX)                      \
    do {                                             \
        if ((X) < (AMIN)) { (Y) = (AMIN); }          \
        else if ((X) > (AMAX)) { (Y) = (AMAX); }     \
        else { (Y) = (X); }                          \
    } while (0)
279
280 void genCosTable()
281 {
282 #define PI (3.1415926)
283 #define ROUND(x) ((x) - floor(x)) > 0.5 ?
((int)(x) + 1) : (int)(x)
284 double temp;
285 int temp_d;
286 int i;
287 double theta;
288
289 for (i = 0; i < 360 * 8;
i++)
290 {
291 theta = (double)(i) * PI / (8 * 180);
292 temp = sin(theta) * 65536;
293 temp_d = ROUND(temp);
294 printf("temp = %d /n",
temp_d);
295 }
296
297 }
298
299 int test_combine()
300 {
301 vec_ptr v = new_vec(128);
302 data_t dest;
303 HKY_CSL_INIT();
304
305 CALL_FUN_TIME(combine1(v, &dest));
306 CALL_FUN_TIME(combine2(v, &dest));
307 CALL_FUN_TIME(combine3(v, &dest));
308 CALL_FUN_TIME(combine4(v, &dest));
309 CALL_FUN_TIME(combine4p(v, &dest));
310 CALL_FUN_TIME(combine5(v ,&dest));
311 CALL_FUN_TIME(combine5p(v, &dest));
312 CALL_FUN_TIME(combine6(v, &dest));
313 CALL_FUN_TIME(combine6p(v, &dest));
314 return 0;
315 }
316
317
#define N 128
/*
 * N x N matrix multiply-accumulate in i-j-k loop order.
 *
 * c, b and a each point to N * N shorts in row-major order. For every
 * (row, col) the dot product of row `row` of a with column `col` of b is
 * accumulated in a short and then ADDED to c[row][col]; c is not cleared
 * first.
 */
void mm_ijk(short *c, short *b, short *a)
{
    int row, col, inner;

    for (row = 0; row < N; row++) {
        short *a_row = a + row * N;
        short *c_row = c + row * N;

        for (col = 0; col < N; col++) {
            short acc = 0;

            for (inner = 0; inner < N; inner++) {
                acc += a_row[inner] * b[inner * N + col];
            }
            c_row[col] += acc;
        }
    }
}
342
/* Program entry: run the combine* timing test; always exits with status 0. */
int main()
{
    (void)test_combine();

    return 0;
}
--------------------header files-------------------------------------
#ifndef _HKY_TEST_TIME_H_
#define _HKY_TEST_TIME_H_
#include
#include
#include
#include
// Uncomment to make CALL_FUN_TIME measure with clock() instead of the
// hardware timer.
//#define USE_CLOCK_FUNC

/* Timer settings handed to TIMER_config() by HKY_CSL_INIT(). */
TIMER_Config MyConfig = {
0x00000200, /* ctl */
0xFFFFFFFF, /* prd */
0x00000000 /* cnt */
};
/* Timer handle; assigned from TIMER_open() in HKY_CSL_INIT(). */
TIMER_Handle myhTimer;
/* Scratch values written by CALL_FUN_TIME: the two readings taken around
 * the call, and the elapsed time that gets printed. */
/* NOTE(review): these objects are defined (not just declared) in a header,
 * so including it from more than one source file would presumably produce
 * duplicate definitions - confirm it is only included once. */
double start_time, end_time, cur_time;
/*
 * One-time setup for the timing macros: initialises the CSL, sets the L2
 * mode and enables caching for the two EMIFA ranges named below, then
 * opens, configures (with MyConfig), zeroes and starts the timer behind
 * myhTimer.
 *
 * Wrapped in do { } while (0) so it expands to a single statement; use it
 * with a trailing semicolon.
 */
#define HKY_CSL_INIT() \
    do { \
        CSL_init(); \
        CACHE_setL2Mode(CACHE_256KCACHE); \
        CACHE_enableCaching(CACHE_EMIFA_CE00); \
        CACHE_enableCaching(CACHE_EMIFA_CE01); \
        myhTimer = TIMER_open(TIMER_DEV0, 0); \
        TIMER_config(myhTimer, &MyConfig); \
        TIMER_setCount(myhTimer, 0); \
        TIMER_start(myhTimer); \
    } while (0)
/*
 * CALL_FUN_TIME(fun_arg): evaluate the expression `fun_arg` once and print
 * its source text followed by the elapsed time in milliseconds.
 *
 * Before each measurement CACHE_clean(CACHE_L2ALL, ...) is called. The
 * elapsed value is the difference of two readings (timer count, or clock()
 * when USE_CLOCK_FUNC is defined) multiplied by a fixed factor.
 * NOTE(review): the factors 1.33e-5 and 1.67e-6 presumably convert one
 * timer tick / one clock() unit to milliseconds on the target board -
 * confirm against the actual clock rates.
 *
 * Wrapped in do { } while (0) so it expands to a single statement; use it
 * with a trailing semicolon.
 */
#ifndef USE_CLOCK_FUNC
#define CALL_FUN_TIME(fun_arg) \
    do { \
        CACHE_clean(CACHE_L2ALL, (void *)0, 0); \
        TIMER_setCount(myhTimer, 0); \
        start_time = TIMER_getCount(myhTimer); \
        fun_arg; \
        end_time = TIMER_getCount(myhTimer); \
        cur_time = (end_time - start_time) * 1.33e-5; \
        printf(#fun_arg "\tTime elapse: %f ms.\n", cur_time); \
    } while (0)
#else
#define CALL_FUN_TIME(fun_arg) \
    do { \
        CACHE_clean(CACHE_L2ALL, (void *)0, 0); \
        start_time = clock(); \
        fun_arg; \
        end_time = clock(); \
        cur_time = (end_time - start_time) * 1.67e-6; \
        printf(#fun_arg "\tTime elapse: %f ms.\n", cur_time); \
    } while (0)
#endif //USE_CLOCK_FUNC
#endif //_HKY_TEST_TIME_H_