DSP优化,通用的调整性能的策略
2019-07-13 18:12 发布
生成海报
1 通用的调整性能的策略
1.1 选择恰当的编译器选项
必须要用的选项 -o[2|3]
可以使用-mt(要确保写的数据和读的数据在内存空间上没有重合)
-mh Specify speculative load byte count threshold
如果源代码里含有永远不会执行的代码,使用选项-mo Place each function in a separate subsection
如果考虑可执行程序的大小,加上-ms[0-3]。(我在C64x+上编译时,加此选项,连接错误,提示找不到__push_rts?原因不明)
不要加上 -g -gp -ss -ml3 -mu(这些选项会限制优化,降低生成代码的性能)
-s[-k -al] -o[2|3] -mw (-on2 -o3) --consultant 可以在产生分析信息的同时不影响生成代码的性能
1.2 确保循环中的次数变量(一般for(i; i
2 利用优化器的意见
当编译选项中有-s时,在生成的*.asm文件中会有优化器的意见
如"C:/CCStudio_v3.3/C6000/cgtools/bin/cl6x" -g -k -s -on2 -o3 -mt -mw -mv6400+ --mem_model:data=near --consultant -@"Debug.lkf" "lesson_c.c"
其中
/*
 * Weighted sum of two 16-bit vectors.
 *
 * For every i in [0, N):
 *     w_sum[i] = (xptr[i] * zptr[0] + yptr[i] * zptr[1]) >> 15
 * Nothing is written to w_sum when N <= 0.
 */
void lesson_c(short *xptr, short *yptr, short *zptr, short *w_sum, int N)
{
    int   i;
    short w1 = zptr[0];    /* weight applied to xptr[] */
    short w2 = zptr[1];    /* weight applied to yptr[] */

    for (i = 0; i < N; i++) {
        /* Products are formed in int; the sum is then scaled down by 2^15. */
        w_sum[i] = (xptr[i] * w1 + yptr[i] * w2) >> 15;
    }
}
生成的优化器意见为:
;** --------------------------------------------------------------------------*
;** 27 ----------------------- w1 = *zptr;
;** 28 ----------------------- w2 = zptr[1];
;** 29 ----------------------- if ( N <= 0 ) goto g4;
;** --------------------------------------------------------------------------*
;** ----------------------- U$17 = xptr;
;** ----------------------- U$20 = yptr;
;** ----------------------- U$26 = w_sum;
;** 31 ----------------------- L$1 = N;
;** ----------------------- #pragma MUST_ITERATE(1, 1099511627775, 1)
;** ----------------------- #pragma LOOP_FLAGS(4096u)
;** -----------------------g3:
;** 31 ----------------------- *U$26++ = _mpy(*U$17++, w1)+_mpy(*U$20++, w2)>>15;
;** 29 ----------------------- if ( --L$1 ) goto g3;
;** -----------------------g4:
;** ----------------------- return;
从中可以看出,编译器加入了对 N 是否大于 0 的判断(if ( N <= 0 ) goto g4)。如果改为:
/*
 * Weighted sum of two 16-bit vectors, with a trip-count promise.
 *
 * For every i in [0, N):
 *     w_sum[i] = (xptr[i] * zptr[0] + yptr[i] * zptr[1]) >> 15
 *
 * The MUST_ITERATE pragma tells the compiler the loop body runs at least
 * once, so the caller must pass N >= 1.
 */
void lesson_c(short *xptr, short *yptr, short *zptr, short *w_sum, int N)
{
    int i, w_vec1, w_vec2;
    short w1, w2;

    w1 = zptr[0];
    w2 = zptr[1];
    /* The loop is guaranteed to execute at least once. The pragma needs its
     * own line directly in front of the loop it describes. */
#pragma MUST_ITERATE(1)
    for (i = 0; i < N; i++) {
        w_vec1 = xptr[i] * w1;
        w_vec2 = yptr[i] * w2;
        w_sum[i] = (w_vec1 + w_vec2) >> 15;
    }
}
相应的意见如下,其中不再有对 N <= 0 的判断:
;** --------------------------------------------------------------------------*
;** 27 ----------------------- w1 = *zptr;
;** 28 ----------------------- w2 = zptr[1];
;** ----------------------- U$15 = xptr;
;** ----------------------- U$18 = yptr;
;** ----------------------- U$24 = w_sum;
;** 32 ----------------------- L$1 = N;
;** ----------------------- #pragma MUST_ITERATE(1, 4294967295, 1)
;** ----------------------- #pragma LOOP_FLAGS(4096u)
;** -----------------------g2:
;** 32 ----------------------- *U$24++ = _mpy(*U$15++, w1)+_mpy(*U$18++, w2)>>15;
;** 30 ----------------------- if ( --L$1 ) goto g2;
;** ----------------------- return;
3 利用软件流水信息优化循环
/*
 * Baseline kernel used for the software-pipeline discussion.
 *
 * Writes w_sum[i] = (xptr[i] * zptr[0] + yptr[i] * zptr[1]) >> 15 for each
 * i in [0, N); does not touch w_sum when N <= 0.
 */
void lesson_c(short *xptr, short *yptr, short *zptr, short *w_sum, int N)
{
    short w1 = zptr[0];
    short w2 = zptr[1];
    int   i;

    for (i = 0; i < N; i++) {
        int acc = xptr[i] * w1;    /* both products are formed in int */

        acc += yptr[i] * w2;
        w_sum[i] = acc >> 15;
    }
}
的软件流水信息为:
;*----------------------------------------------------------------------------*
;* SOFTWARE PIPELINE INFORMATION
;*
;* Loop source line : 30 ;循环开始的行数
;* Loop opening brace source line : 31
;* Loop closing brace source line : 35
;* Known Minimum Trip Count : 1 ;已知的循环最小次数
;* Known Max Trip Count Factor : 1 ;已知循环的因子 循环次数是循环因子的倍数,如果已知循环因子的话,便于编译器自动铺开(unroll)代码
;* Loop Carried Dependency Bound(^) : 0 ;内存读写瓶颈,如果有的话,后面的汇编代码注释里相应语句含有^标志
;* Unpartitioned Resource Bound : 2 ;资源瓶颈
;* Partitioned Resource Bound(*) : 2
;* Resource Partition:
;* A-side B-side
;* .L units 0 0
;* .S units 1 0
;* .D units 2* 1
;* .M units 1 1
;* .X cross paths 1 0
;* .T address paths 2* 1
;* Long read paths 0 0
;* Long write paths 0 0
;* Logical ops (.LS) 0 0 (.L or .S unit)
;* Addition ops (.LSD) 1 0 (.L or .S or .D unit)
;* Bound(.L .S .LS) 1 0
;* Bound(.L .S .D .LS .LSD) 2* 1 ;资源使用不平衡(没有完全利用可用的计算能力)
;*
;* Searching for software pipeline schedule at ...
;* ii = 2 Schedule found with 6 iterations in parallel
;*
;* Register Usage Table:
;* +-----------------------------------------------------------------+
;* |AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA|BBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBB|
;* |00000000001111111111222222222233|00000000001111111111222222222233|
;* |01234567890123456789012345678901|01234567890123456789012345678901|
;* |--------------------------------+--------------------------------|
;* 0: | ****** | * ** |
;* 1: | * *** | **** |
;* +-----------------------------------------------------------------+
;*
;* Done
;*
;* Loop will be splooped
;* Collapsed epilog stages : 0
;* Collapsed prolog stages : 0
;* Minimum required memory pad : 0 bytes
;*
;* Minimum safe trip count : 1
;*----------------------------------------------------------------------------*
;* SINGLE SCHEDULED ITERATION ;需要加-mw选项
;*
;* $C$C23:
;* 0 LDH .D2T2 *B6++,B5 ; |32| ;一次装载16bit,浪费带宽
;* 1 LDH .D1T1 *A6++,A5 ; |32|
;* 2 NOP 3
;* 5 MPY .M2 B5,B7,B4 ; |32|
;* 6 MPY .M1 A5,A8,A4 ; |32|
;* 7 NOP 1
;* 8 ADD .L1X B4,A4,A3 ; |32|
;* 9 SHR .S1 A3,15,A3 ; |32|
;* 10 STH .D1T1 A3,*A7++ ; |32|
;* || SPBR $C$C23
;* 11 NOP 1
;* 12 ; BRANCHCC OCCURS {$C$C23} ; |30| ;一次循环需要12个时钟周期
;*----------------------------------------------------------------------------*
修改为下面的代码时:
/*
 * Alignment hints for the optimizer.
 *
 * On the TI C6000 compiler these expand to the _nassert intrinsic, which
 * generates no code; it only asserts to the optimizer that the address is
 * 4-byte (WORD) or 8-byte (DWORD) aligned. The caller is responsible for
 * making the assertion true. On any other compiler the macros expand to a
 * no-op so the kernel can still be built and unit-tested on a host.
 */
#if defined(_TMS320C6X) || defined(__TI_COMPILER_VERSION__)
#define WORD_ALIGNED(x)  (_nassert(((int)(x) & 0x3) == 0))
#define DWORD_ALIGNED(x) (_nassert(((int)(x) & 0x7) == 0))
#else
#define WORD_ALIGNED(x)  ((void)0)
#define DWORD_ALIGNED(x) ((void)0)
#endif

/*
 * Weighted sum of two 16-bit vectors, annotated for the optimizer.
 *
 * For every i in [0, N):
 *     w_sum[i] = (xptr[i] * zptr[0] + yptr[i] * zptr[1]) >> 15
 *
 * Caller contract implied by the annotations below:
 *   - xptr and yptr are restrict-qualified: the data they access must not
 *     be accessed through any other pointer inside this function;
 *   - xptr and yptr are 4-byte aligned;
 *   - N is at least 48 and a multiple of 2.
 */
void lesson3_c(short * restrict xptr, short * restrict yptr, short *zptr,
               short *w_sum, int N)
{
    int i, w_vec1, w_vec2;
    short w1, w2;

    /* Word alignment lets the compiler use wider loads (memory bandwidth). */
    WORD_ALIGNED(xptr);
    WORD_ALIGNED(yptr);
    w1 = zptr[0];
    w2 = zptr[1];
    /* Minimum trip count 48, no stated maximum, trip count a multiple of 2,
     * so the compiler is free to unroll the loop. */
#pragma MUST_ITERATE(48, , 2);
    for (i = 0; i < N; i++) {
        w_vec1 = xptr[i] * w1;
        w_vec2 = yptr[i] * w2;
        w_sum[i] = (w_vec1 + w_vec2) >> 15;
    }
}
相应的:
;*----------------------------------------------------------------------------*
;* SOFTWARE PIPELINE INFORMATION
;*
;* Loop source line : 59
;* Loop opening brace source line : 60
;* Loop closing brace source line : 64
;* Loop Unroll Multiple : 4x ;循环铺开的次数
;* Known Minimum Trip Count : 12
;* Known Max Trip Count Factor : 1
;* Loop Carried Dependency Bound(^) : 0
;* Unpartitioned Resource Bound : 4
;* Partitioned Resource Bound(*) : 4
;* Resource Partition:
;* A-side B-side
;* .L units 0 0
;* .S units 2 2
;* .D units 4* 4* ;铺开循环保证了资源使用的平衡
;* .M units 2 2
;* .X cross paths 2 2
;* .T address paths 4* 4*
;* Long read paths 0 0
;* Long write paths 0 0
;* Logical ops (.LS) 0 0 (.L or .S unit)
;* Addition ops (.LSD) 2 2 (.L or .S or .D unit)
;* Bound(.L .S .LS) 1 1
;* Bound(.L .S .D .LS .LSD) 3 3
;*
;* Searching for software pipeline schedule at ...
;* ii = 4 Schedule found with 4 iterations in parallel
;*
;* Register Usage Table:
;* +-----------------------------------------------------------------+
;* |AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA|BBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBB|
;* |00000000001111111111222222222233|00000000001111111111222222222233|
;* |01234567890123456789012345678901|01234567890123456789012345678901|
;* |--------------------------------+--------------------------------|
;* 0: | ***** *** | ****** **** |
;* 1: | ******* * | ****** ** * |
;* 2: | ******* **** | * **** ** |
;* 3: | ******* **** | ****** ** |
;* +-----------------------------------------------------------------+
;*
;* Done
;*
;* Loop will be splooped
;* Collapsed epilog stages : 0
;* Collapsed prolog stages : 0
;* Minimum required memory pad : 0 bytes
;*
;* Minimum safe trip count : 1 (after unrolling)
;*----------------------------------------------------------------------------*
;* SINGLE SCHEDULED ITERATION
;*
;* $C$C24:
;* 0 LDW .D1T1 *A7++(8),A9 ; |61| ;每次装载一个字
;* || LDW .D2T2 *B6++(8),B17 ; |61|
;* 1 LDW .D1T1 *A4++(8),A3 ; |61|
;* 2 NOP 1
;* 3 LDW .D2T2 *B16++(8),B4 ; |61|
;* 4 NOP 2
;* 6 MPY2 .M1 A9,A8,A17:A16 ; |61|
;* 7 MPY2 .M1 A3,A8,A19:A18 ; |61|
;* || MPY2 .M2 B17,B7,B5:B4 ; |61|
;* 8 MPY2 .M2 B4,B7,B19:B18 ; |61|
;* 9 NOP 2
;* 11 ADD .L2X B4,A16,B17 ; |61|
;* 12 ADD .L2X B18,A18,B5 ; |61|
;* || SHR .S2 B17,15,B4 ; |61|
;* || ADD .L1X B5,A17,A3 ; |61|
;* 13 SHR .S2 B5,15,B4 ; |61|
;* || ADD .L1X B19,A19,A19 ; |61|
;* || STH .D2T2 B4,*B9++(8) ; |61|
;* || SHR .S1 A3,15,A18 ; |61|
;* 14 STH .D2T2 B4,*B8++(8) ; |61|
;* || SHR .S1 A19,15,A9 ; |61|
;* || STH .D1T1 A18,*A5++(8) ; |61|
;* 15 STH .D1T1 A9,*A6++(8) ; |61|
;* || SPBR $C$C24
;* 16 ; BRANCHCC OCCURS {$C$C24} ; |59| ;4次循环使用16个时钟周期
;*----------------------------------------------------------------------------*
4 consultant advice 和 *.nfo文件
当编译时加上 --consultant 和 -on2 -o3,可以查看相应的 consultant advice 和 *.nfo 文件。
打开profile,运行程序,这时可以查看viewer的consultant如下:
双击想看的数据,可以查看相应的建议:
*.nfo文件编译时便可以生成,内容没有consultant全。下面是lesson3_c.nfo的内容
TMS320C6x C/C++ Optimizer v6.0.8 Build Number 1GKUL-JA0KH827-RSAQQ-TAV-ZAZG_W_Q_Y
======File-level Analysis Summary======
extern void _lesson3_c() is called from 0 sites in this file. It appears to be inlineable (size = 58 units) It calls these functions:
======= End file-level Analysis =======
extern void _lesson3_c() is called from 0 sites in this file. It appears to be inlineable (size = 58 units) It calls these functions:
ADVICE: In function lesson3_c() in the 'for' loop with loop variable 'i' at lines 39-44 for the statement w_sum[i] = _mpy(xptr[i], w1)+_mpy(yptr[i], w2)>>15; at line 41
The address of w_sum[i] for the first iteration of the loop is &w_sum[0]. This pointer is aligned to a 16 bit boundary.
Consider adding an assertion just before the loop:
_nassert( ((int)w_sum % 4) == 0 ); /* 32-bit aligned */ or _nassert( ((int)w_sum % 8) == 0 ); /* 64-bit aligned */
to specify that multiple elements of w_sum[i] may be accessed in parallel.
ADVICE: In function lesson3_c() in the 'for' loop with loop variable 'i' at lines 39-44 for the statement w_sum[i] = _mpy(xptr[i], w1)+_mpy(yptr[i], w2)>>15; at line 41
The address of yptr[i] for the first iteration of the loop is &yptr[0]. This pointer is aligned to a 32 bit boundary.
Consider adding an assertion just before the loop:
_nassert( ((int)yptr % 8) == 0 ); /* 64-bit aligned */
to specify that multiple elements of yptr[i] may be accessed in parallel.
ADVICE: In function lesson3_c() in the 'for' loop with loop variable 'i' at lines 39-44 for the statement w_sum[i] = _mpy(xptr[i], w1)+_mpy(yptr[i], w2)>>15; at line 41
The address of xptr[i] for the first iteration of the loop is &xptr[0]. This pointer is aligned to a 32 bit boundary.
Consider adding an assertion just before the loop:
_nassert( ((int)xptr % 8) == 0 ); /* 64-bit aligned */
to specify that multiple elements of xptr[i] may be accessed in parallel.
== END OF INFO OUTPUT==
/*
 * Alignment hint for the optimizer.
 *
 * On the TI C6000 compiler this expands to the _nassert intrinsic, which
 * generates no code; it only asserts to the optimizer that the address is
 * 4-byte aligned. On any other compiler it expands to a no-op so the kernel
 * can still be built and unit-tested on a host.
 */
#if defined(_TMS320C6X) || defined(__TI_COMPILER_VERSION__)
#define WORD_ALIGNED(x) (_nassert(((int)(x) & 0x3) == 0))
#else
#define WORD_ALIGNED(x) ((void)0)
#endif
/*
 * Weighted sum of two 16-bit vectors, annotated for the optimizer.
 *
 * For every i in [0, N):
 *     w_sum[i] = (xptr[i] * zptr[0] + yptr[i] * zptr[1]) >> 15
 *
 * The input pointers carry the restrict keyword as an independence
 * declaration: the regions xptr and yptr point to do not overlap.
 *
 * Caller contract implied by the annotations below:
 *   - xptr and yptr are 4-byte aligned;
 *   - N is at least 20 and a multiple of 2.
 */
void lesson3_c(short * restrict xptr, short * restrict yptr, short *zptr,
               short *w_sum, int N)
{
    int i, w_vec1, w_vec2;
    short w1, w2;
    /* Assert that xptr and yptr are 4-byte aligned; the programmer is
     * responsible for guaranteeing this. */
    WORD_ALIGNED(xptr);
    WORD_ALIGNED(yptr);
    w1 = zptr[0];
    w2 = zptr[1];
    /* At least 20 iterations; the trip count is a multiple of 2. */
#pragma MUST_ITERATE(20, , 2);
    for (i = 0; i < N; i++)
    {
        w_vec1 = xptr[i] * w1;
        w_vec2 = yptr[i] * w2;
        w_sum[i] = (w_vec1 + w_vec2) >> 15;
    }
}
参考:
1 TMS320C6000 Programmer's Guide.pdf
2 TMS320C6000 Optimizing Compiler User's Guide.pdf
打开微信“扫一扫”,打开网页后点击屏幕右上角分享按钮