http://blog.csdn.net/henhen2002/article/details/4562993
1 通用的调整性能的策略
1.1 选择恰当的编译器选项
- 必须要用的选项 -O[2|3]
- 可以使用-mt(要确保写的数据和读的数据在内存空间上没有重合)
- -mh Specify speculative load byte count threshold
- 如果源代码里含有永远不会执行的代码,使用选项-mo Place each function in a separate subsection
- 如果考虑可执行程序的大小,加上-ms[0-3]。(我在C64x+上编译时,加此选项,连接错误,提示找不到__push_rts?原因不明)
- 不要加上-g -gp -ss -ml3 -mu
- -s [-k -al] -o[2|3] -mw (-on2 -o3) --consultant:可以在产生分析信息的同时不影响生成代码的性能
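1.2 确保循环中的次数变量(一般for(i; i<N; i++)中的N)使用int或unsigned int类型,而不是short类型,以避免多余的符号扩展指令(注:原文在"i<"处被截断,后半句依据TI Programmer's Guide的建议补全)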
2 利用优化器的意见
当编译选项中有-s时,在生成的*.asm文件中会有优化器的意见
如"C:/CCStudio_v3.3/C6000/cgtools/bin/cl6x" -g -k -s -on2 -o3 -mt -mw -mv6400+ --mem_model:data=near --consultant -@"Debug.lkf" "lesson_c.c"
其中
/*
 * lesson_c - weighted sum of two 16-bit vectors (baseline version, no
 * compiler hints).
 *
 * For every i in [0, N):
 *     w_sum[i] = (xptr[i] * zptr[0] + yptr[i] * zptr[1]) >> 15
 *
 * xptr, yptr : input vectors, at least N elements each
 * zptr       : two weights; zptr[0] scales xptr, zptr[1] scales yptr
 * w_sum      : output vector, at least N elements
 * N          : element count; the loop body does not run when N <= 0
 *
 * NOTE(review): the >> 15 looks like Q15 fixed-point rescaling - confirm.
 */
void lesson_c(short *xptr, short *yptr, short *zptr, short *w_sum, int N)
{
int i, w_vec1, w_vec2;
short w1,w2;
/* Read both weights once, before the loop. */
w1 = zptr[0];
w2 = zptr[1];
for (i = 0; i < N; i++)
{
/* short * short is promoted to int, so the products are held in int. */
w_vec1 = xptr[i] * w1;
w_vec2 = yptr[i] * w2;
/* Sum the two products, shift right by 15, store as short. */
w_sum[i] = (w_vec1 + w_vec2) >> 15;
}
}
生成的优化器意见为:
;** --------------------------------------------------------------------------*
;** 27 ----------------------- w1 = *zptr;
;** 28 ----------------------- w2 = zptr[1];
;** 29 ----------------------- if ( N <= 0 ) goto g4;
;** --------------------------------------------------------------------------*
;** ----------------------- U$17 = xptr;
;** ----------------------- U$20 = yptr;
;** ----------------------- U$26 = w_sum;
;** 31 ----------------------- L$1 = N;
;** ----------------------- #pragma MUST_ITERATE(1, 1099511627775, 1)
;** ----------------------- #pragma LOOP_FLAGS(4096u)
;** -----------------------g3:
;** 31 ----------------------- *U$26++ = _mpy(*U$17++, w1)+_mpy(*U$20++, w2)>>15;
;** 29 ----------------------- if ( --L$1 ) goto g3;
;** -----------------------g4:
;** ----------------------- return;
从中可以看出,加入了对N是否小于等于0的判断。如果改为:
/*
 * lesson_c - weighted sum of two 16-bit vectors, with a minimum trip-count
 * hint on the loop.
 *
 * For every i in [0, N):
 *     w_sum[i] = (xptr[i] * zptr[0] + yptr[i] * zptr[1]) >> 15
 *
 * xptr, yptr : input vectors, at least N elements each
 * zptr       : two weights; zptr[0] scales xptr, zptr[1] scales yptr
 * w_sum      : output vector, at least N elements
 * N          : element count; MUST_ITERATE(1) promises the compiler that the
 *              loop runs at least once, so the caller must pass N >= 1
 */
void lesson_c(short *xptr, short *yptr, short *zptr, short *w_sum, int N)
{
int i, w_vec1, w_vec2;
short w1,w2;
/* Read both weights once, before the loop. */
w1 = zptr[0];
w2 = zptr[1];
#pragma MUST_ITERATE(1) // the loop iterates at least once
for (i = 0; i < N; i++)
{
/* short * short is promoted to int, so the products are held in int. */
w_vec1 = xptr[i] * w1;
w_vec2 = yptr[i] * w2;
/* Sum the two products, shift right by 15, store as short. */
w_sum[i] = (w_vec1 + w_vec2) >> 15;
}
}
相应的意见为,没有了对N是否为0的判断
;** --------------------------------------------------------------------------*
;** 27 ----------------------- w1 = *zptr;
;** 28 ----------------------- w2 = zptr[1];
;** ----------------------- U$15 = xptr;
;** ----------------------- U$18 = yptr;
;** ----------------------- U$24 = w_sum;
;** 32 ----------------------- L$1 = N;
;** ----------------------- #pragma MUST_ITERATE(1, 4294967295, 1)
;** ----------------------- #pragma LOOP_FLAGS(4096u)
;** -----------------------g2:
;** 32 ----------------------- *U$24++ = _mpy(*U$15++, w1)+_mpy(*U$18++, w2)>>15;
;** 30 ----------------------- if ( --L$1 ) goto g2;
;** ----------------------- return;
3 利用软件流水信息优化循环
/*
 * lesson_c - weighted sum of two 16-bit vectors (baseline version, no
 * aliasing, alignment or trip-count hints).
 *
 * For every i in [0, N):
 *     w_sum[i] = (xptr[i] * zptr[0] + yptr[i] * zptr[1]) >> 15
 *
 * xptr, yptr : input vectors, at least N elements each
 * zptr       : two weights; zptr[0] scales xptr, zptr[1] scales yptr
 * w_sum      : output vector, at least N elements
 * N          : element count; the loop body does not run when N <= 0
 */
void lesson_c(short *xptr, short *yptr, short *zptr, short *w_sum, int N)
{
int i, w_vec1, w_vec2;
short w1,w2;
/* Read both weights once, before the loop. */
w1 = zptr[0];
w2 = zptr[1];
for (i = 0; i < N; i++)
{
/* One 16-bit element is read from each input per iteration. */
w_vec1 = xptr[i] * w1;
w_vec2 = yptr[i] * w2;
/* Sum the two products, shift right by 15, store as short. */
w_sum[i] = (w_vec1 + w_vec2) >> 15;
}
}
的软件流水信息为:
;*----------------------------------------------------------------------------*
;* SOFTWARE PIPELINE INFORMATION
;*
;* Loop source line : 30 ;循环开始的行数
;* Loop opening brace source line : 31
;* Loop closing brace source line : 35
;* Known Minimum Trip Count : 1 ;已知的循环最小次数
;* Known Max Trip Count Factor : 1 ;已知的循环因子:循环次数是循环因子的倍数;如果已知循环因子,便于编译器自动铺开(unroll)代码
;* Loop Carried Dependency Bound(^) : 0 ;内存读写瓶颈,如果有的话,后面的汇编代码注释里相应语句含有^标志
;* Unpartitioned Resource Bound : 2 ;资源瓶颈
;* Partitioned Resource Bound(*) : 2
;* Resource Partition:
;* A-side B-side
;* .L units 0 0
;* .S units 1 0
;* .D units 2* 1
;* .M units 1 1
;* .X cross paths 1 0
;* .T address paths 2* 1
;* Long read paths 0 0
;* Long write paths 0 0
;* Logical ops (.LS) 0 0 (.L or .S unit)
;* Addition ops (.LSD) 1 0 (.L or .S or .D unit)
;* Bound(.L .S .LS) 1 0
;* Bound(.L .S .D .LS .LSD) 2* 1 ;资源使用不平衡(没有完全利用可用的计算能力)
;*
;* Searching for software pipeline schedule at ...
;* ii = 2 Schedule found with 6 iterations in parallel
;*
;* Register Usage Table:
;* +-----------------------------------------------------------------+
;* |AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA|BBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBB|
;* |00000000001111111111222222222233|00000000001111111111222222222233|
;* |01234567890123456789012345678901|01234567890123456789012345678901|
;* |--------------------------------+--------------------------------|
;* 0: | ****** | * ** |
;* 1: | * *** | **** |
;* +-----------------------------------------------------------------+
;*
;* Done
;*
;* Loop will be splooped
;* Collapsed epilog stages : 0
;* Collapsed prolog stages : 0
;* Minimum required memory pad : 0 bytes
;*
;* Minimum safe trip count : 1
;*----------------------------------------------------------------------------*
;* SINGLE SCHEDULED ITERATION ;需要加-mw选项
;*
;* $C$C23:
;* 0 LDH .D2T2 *B6++,B5 ; |32| ;一次装载16bit,浪费带宽
;* 1 LDH .D1T1 *A6++,A5 ; |32|
;* 2 NOP 3
;* 5 MPY .M2 B5,B7,B4 ; |32|
;* 6 MPY .M1 A5,A8,A4 ; |32|
;* 7 NOP 1
;* 8 ADD .L1X B4,A4,A3 ; |32|
;* 9 SHR .S1 A3,15,A3 ; |32|
;* 10 STH .D1T1 A3,*A7++ ; |32|
;* || SPBR $C$C23
;* 11 NOP 1
;* 12 ; BRANCHCC OCCURS {$C$C23} ; |30| ;一次循环需要12个时钟周期
;*----------------------------------------------------------------------------*
修改为下面的代码时:
/*
 * Alignment assertions for the optimizer, built on the TI _nassert intrinsic.
 * WORD_ALIGNED(x)  : states that the low 2 address bits of x are zero
 *                    (4-byte alignment).
 * DWORD_ALIGNED(x) : states that the low 3 address bits of x are zero
 *                    (8-byte alignment); defined here but not used below.
 */
#define WORD_ALIGNED(x) (_nassert(((int)(x) & 0x3) == 0))
#define DWORD_ALIGNED(x) (_nassert(((int)(x) & 0x7) == 0))
/*
 * lesson3_c - weighted sum of two 16-bit vectors, with aliasing, alignment
 * and trip-count hints for the optimizer.
 *
 * For every i in [0, N):
 *     w_sum[i] = (xptr[i] * zptr[0] + yptr[i] * zptr[1]) >> 15
 *
 * xptr, yptr : input vectors, restrict-qualified and asserted 4-byte aligned;
 *              the caller is responsible for both properties
 * zptr       : two weights; zptr[0] scales xptr, zptr[1] scales yptr
 * w_sum      : output vector, at least N elements (no restrict or alignment
 *              hint is given for it)
 * N          : element count; the pragma promises N >= 48 and N a multiple
 *              of 2
 */
void lesson3_c(short * restrict xptr, short * restrict yptr, short *zptr,
short *w_sum, int N)
{
int i, w_vec1, w_vec2;
short w1, w2;
WORD_ALIGNED(xptr); // ensures memory-load bandwidth (allows wider loads)
WORD_ALIGNED(yptr);
/* Read both weights once, before the loop. */
w1 = zptr[0];
w2 = zptr[1];
#pragma MUST_ITERATE(48, , 2); // factor=2, lets the compiler unroll the loop
for (i = 0; i < N; i++)
{
w_vec1 = xptr[i] * w1;
w_vec2 = yptr[i] * w2;
/* Sum the two products, shift right by 15, store as short. */
w_sum[i] = (w_vec1 + w_vec2) >> 15;
}
}
相应的:
;*----------------------------------------------------------------------------*
;* SOFTWARE PIPELINE INFORMATION
;*
;* Loop source line : 59
;* Loop opening brace source line : 60
;* Loop closing brace source line : 64
;* Loop Unroll Multiple : 4x ;循环铺开的次数
;* Known Minimum Trip Count : 12
;* Known Max Trip Count Factor : 1
;* Loop Carried Dependency Bound(^) : 0
;* Unpartitioned Resource Bound : 4
;* Partitioned Resource Bound(*) : 4
;* Resource Partition:
;* A-side B-side
;* .L units 0 0
;* .S units 2 2
;* .D units 4* 4* ;铺开循环保证了资源使用的平衡
;* .M units 2 2
;* .X cross paths 2 2
;* .T address paths 4* 4*
;* Long read paths 0 0
;* Long write paths 0 0
;* Logical ops (.LS) 0 0 (.L or .S unit)
;* Addition ops (.LSD) 2 2 (.L or .S or .D unit)
;* Bound(.L .S .LS) 1 1
;* Bound(.L .S .D .LS .LSD) 3 3
;*
;* Searching for software pipeline schedule at ...
;* ii = 4 Schedule found with 4 iterations in parallel
;*
;* Register Usage Table:
;* +-----------------------------------------------------------------+
;* |AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA|BBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBB|
;* |00000000001111111111222222222233|00000000001111111111222222222233|
;* |01234567890123456789012345678901|01234567890123456789012345678901|
;* |--------------------------------+--------------------------------|
;* 0: | ***** *** | ****** **** |
;* 1: | ******* * | ****** ** * |
;* 2: | ******* **** | * **** ** |
;* 3: | ******* **** | ****** ** |
;* +-----------------------------------------------------------------+
;*
;* Done
;*
;* Loop will be splooped
;* Collapsed epilog stages : 0
;* Collapsed prolog stages : 0
;* Minimum required memory pad : 0 bytes
;*
;* Minimum safe trip count : 1 (after unrolling)
;*----------------------------------------------------------------------------*
;* SINGLE SCHEDULED ITERATION
;*
;* $C$C24:
;* 0 LDW .D1T1 *A7++(8),A9 ; |61| ;每次装载一个字
;* || LDW .D2T2 *B6++(8),B17 ; |61|
;* 1 LDW .D1T1 *A4++(8),A3 ; |61|
;* 2 NOP 1
;* 3 LDW .D2T2 *B16++(8),B4 ; |61|
;* 4 NOP 2
;* 6 MPY2 .M1 A9,A8,A17:A16 ; |61|
;* 7 MPY2 .M1 A3,A8,A19:A18 ; |61|
;* || MPY2 .M2 B17,B7,B5:B4 ; |61|
;* 8 MPY2 .M2 B4,B7,B19:B18 ; |61|
;* 9 NOP 2
;* 11 ADD .L2X B4,A16,B17 ; |61|
;* 12 ADD .L2X B18,A18,B5 ; |61|
;* || SHR .S2 B17,15,B4 ; |61|
;* || ADD .L1X B5,A17,A3 ; |61|
;* 13 SHR .S2 B5,15,B4 ; |61|
;* || ADD .L1X B19,A19,A19 ; |61|
;* || STH .D2T2 B4,*B9++(8) ; |61|
;* || SHR .S1 A3,15,A18 ; |61|
;* 14 STH .D2T2 B4,*B8++(8) ; |61|
;* || SHR .S1 A19,15,A9 ; |61|
;* || STH .D1T1 A18,*A5++(8) ; |61|
;* 15 STH .D1T1 A9,*A6++(8) ; |61|
;* || SPBR $C$C24
;* 16 ; BRANCHCC OCCURS {$C$C24} ; |59| ;4次循环使用16个时钟周期
;*----------------------------------------------------------------------------*
4 consultant advice 和 *.nfo文件
当编译时加上--consultant和-on2 -o3,可以查看相应的consultant advice 和*.nfo文件。
打开profile,运行程序,这时可以查看viewer的consultant如下:
双击想看的数据,可以查看相应的建议:
*.nfo文件编译时便可以生成,内容没有consultant全。下面是lesson3_c.nfo的内容
TMS320C6x C/C++ Optimizer v6.0.8
Build Number 1GKUL-JA0KH827-RSAQQ-TAV-ZAZG_W_Q_Y
======File-level Analysis Summary======
extern void _lesson3_c() is called from 0 sites in this file.
It appears to be inlineable (size = 58 units)
It calls these functions:
======= End file-level Analysis =======
extern void _lesson3_c() is called from 0 sites in this file.
It appears to be inlineable (size = 58 units)
It calls these functions:
ADVICE: In function lesson3_c()
in the 'for' loop with loop variable 'i' at lines 39-44
for the statement w_sum[i] = _mpy(xptr[i], w1)+_mpy(yptr[i], w2)>>15; at line 41
The address of w_sum[i] for the first iteration of the loop is &w_sum[0].
This pointer is aligned to a 16 bit boundary.
Consider adding an assertion just before the loop:
_nassert( ((int)w_sum % 4) == 0 ); /* 32-bit aligned */
or _nassert( ((int)w_sum % 8) == 0 ); /* 64-bit aligned */
to specify that multiple elements of w_sum[i]
may be accessed in parallel.
ADVICE: In function lesson3_c()
in the 'for' loop with loop variable 'i' at lines 39-44
for the statement w_sum[i] = _mpy(xptr[i], w1)+_mpy(yptr[i], w2)>>15; at line 41
The address of yptr[i] for the first iteration of the loop is &yptr[0].
This pointer is aligned to a 32 bit boundary.
Consider adding an assertion just before the loop:
_nassert( ((int)yptr % 8) == 0 ); /* 64-bit aligned */
to specify that multiple elements of yptr[i]
may be accessed in parallel.
ADVICE: In function lesson3_c()
in the 'for' loop with loop variable 'i' at lines 39-44
for the statement w_sum[i] = _mpy(xptr[i], w1)+_mpy(yptr[i], w2)>>15; at line 41
The address of xptr[i] for the first iteration of the loop is &xptr[0].
This pointer is aligned to a 32 bit boundary.
Consider adding an assertion just before the loop:
_nassert( ((int)xptr % 8) == 0 ); /* 64-bit aligned */
to specify that multiple elements of xptr[i]
may be accessed in parallel.
<>
== END OF INFO OUTPUT==
/*
 * WORD_ALIGNED(x): tells the optimizer, via the TI _nassert intrinsic, that
 * the low 2 address bits of x are zero (4-byte alignment).
 */
#define WORD_ALIGNED(x) (_nassert(((int)(x) & 0x3) == 0))
/*
 * lesson3_c - weighted sum of two 16-bit vectors:
 *     w_sum[i] = (xptr[i] * zptr[0] + yptr[i] * zptr[1]) >> 15, i in [0, N)
 *
 * The input parameters carry the restrict keyword, a no-alias declaration:
 * the regions pointed to by xptr and yptr do not overlap.
 *
 * zptr  : two weights; zptr[0] scales xptr, zptr[1] scales yptr
 * w_sum : output vector, at least N elements
 * N     : element count; see the MUST_ITERATE pragma below for the promised
 *         bounds
 */
void lesson3_c(short * restrict xptr, short * restrict yptr, short *zptr,
short *w_sum, int N)
{
int i, w_vec1, w_vec2;
short w1, w2;
/* Assert that xptr and yptr are 4-byte aligned; the programmer is
   responsible for guaranteeing this. */
WORD_ALIGNED(xptr);
WORD_ALIGNED(yptr);
/* Read both weights once, before the loop. */
w1 = zptr[0];
w2 = zptr[1];
/* At least 20 iterations, and the trip count is a multiple of 2. */
#pragma MUST_ITERATE(20, , 2);
for (i = 0; i < N; i++)
{
w_vec1 = xptr[i] * w1;
w_vec2 = yptr[i] * w2;
/* Sum the two products, shift right by 15, store as short. */
w_sum[i] = (w_vec1 + w_vec2) >> 15;
}
}
参考:
1 TMS320C6000 Programmer's Guide.pdf
2 TMS320C6000 Optimizing Compiler User's Guide.pdf