DSP

DSP优化,通用的调整性能的策略

2019-07-13 18:12发布

1 通用的调整性能的策略

1.1 选择恰当的编译器选项

  • 必须要用的选项 -O[2|3]
  • 可以使用-mt(要确保写的数据和读的数据在内存空间上没有重合)
  • -mh  Specify speculative load byte count threshold
  • 如果源代码里含有永远不会执行的代码,使用选项-mo Place each function in a separate subsection
  • 如果考虑可执行程序的大小,加上-ms[0-3]。(我在C64x+上编译时,加此选项,连接错误,提示找不到__push_rts?原因不明)
  • 不要加上-g -gp -ss -ml3 -mu
  • -s[-k -al] -o[2|3] -mw (-on2 -o3) --consultant 可以在产生分析信息的同时不影响生成代码的性能
1.2 确保循环中的次数变量(一般 for (i = 0; i < N; i++) 中的 i 和 N)使用 int 类型,而不是 short(注:原文在“<”之后被截断,本句后半部分为根据 TI 编程指南推测的补全)

2 利用优化器的意见
当编译选项中有-s时,在生成的*.asm文件中会有优化器的意见 如"C:/CCStudio_v3.3/C6000/cgtools/bin/cl6x" -g -k -s -on2 -o3 -mt -mw -mv6400+ --mem_model:data=near --consultant -@"Debug.lkf" "lesson_c.c" 其中
/*
 * lesson_c - element-wise weighted sum of two 16-bit arrays.
 *
 * For every i in [0, N):
 *     w_sum[i] = (xptr[i] * zptr[0] + yptr[i] * zptr[1]) >> 15
 *
 * xptr, yptr: input arrays, read at indices 0..N-1.
 * zptr:       weight array; only zptr[0] and zptr[1] are read.
 * w_sum:      output array, written at indices 0..N-1.
 * N:          iteration count; the loop body does not run when N <= 0.
 *
 * NOTE(review): the ">> 15" looks like Q15 fixed-point rescaling - confirm
 * against the data format the callers actually use.
 */
void lesson_c(short *xptr, short *yptr, short *zptr, short *w_sum, int N)
{
    int i, w_vec1, w_vec2;
    /* Read both weights once, before the loop. */
    short w1,w2;     w1 = zptr[0];
    w2 = zptr[1];
    /* No trip-count hint is given here, so nothing tells the compiler N > 0. */
    for (i = 0; i < N; i++)
    {
        /* The products and their sum are held in int before the shift. */
        w_vec1 =  xptr[i] * w1;
        w_vec2 =  yptr[i] * w2;
        /* The shifted int result is stored into a short element. */
        w_sum[i] = (w_vec1 + w_vec2) >> 15;
    } }
生成的优化器意见为: ;** --------------------------------------------------------------------------*
;** 27    -----------------------    w1 = *zptr;
;** 28    -----------------------    w2 = zptr[1];
;** 29    -----------------------    if ( N <= 0 ) goto g4; ;** --------------------------------------------------------------------------*
;**      -----------------------    U$17 = xptr;
;**      -----------------------    U$20 = yptr;
;**      -----------------------    U$26 = w_sum;
;** 31    -----------------------    L$1 = N;
;**      -----------------------    #pragma MUST_ITERATE(1, 1099511627775, 1)
;**      -----------------------    #pragma LOOP_FLAGS(4096u)
;**    -----------------------g3:
;** 31    -----------------------    *U$26++ = _mpy(*U$17++, w1)+_mpy(*U$20++, w2)>>15;
;** 29    -----------------------    if ( --L$1 ) goto g3;
;**    -----------------------g4:
;**      -----------------------    return; 从中可以看出,加入了对N是否0的判断。如果改为: void lesson_c(short *xptr, short *yptr, short *zptr, short *w_sum, int N)
{
    int i, w_vec1, w_vec2;
    short w1,w2;     w1 = zptr[0];
    w2 = zptr[1];
    #pragma MUST_ITERATE(1)  //至少循环一次
    for (i = 0; i < N; i++)
    {
        w_vec1 =  xptr[i] * w1;
        w_vec2 =  yptr[i] * w2;
        w_sum[i] = (w_vec1 + w_vec2) >> 15;
    } } 相应的意见为,没有了对N是否为0的判断 ;** --------------------------------------------------------------------------*
;** 27    -----------------------    w1 = *zptr;
;** 28    -----------------------    w2 = zptr[1];
;**      -----------------------    U$15 = xptr;
;**      -----------------------    U$18 = yptr;
;**      -----------------------    U$24 = w_sum;
;** 32    -----------------------    L$1 = N;
;**      -----------------------    #pragma MUST_ITERATE(1, 4294967295, 1)
;**      -----------------------    #pragma LOOP_FLAGS(4096u)
;**    -----------------------g2:
;** 32    -----------------------    *U$24++ = _mpy(*U$15++, w1)+_mpy(*U$18++, w2)>>15;
;** 30    -----------------------    if ( --L$1 ) goto g2;
;**      -----------------------    return;    
3 利用软件流水信息优化循环
void lesson_c(short *xptr, short *yptr, short *zptr, short *w_sum, int N)
{
    int i, w_vec1, w_vec2;
    short w1,w2;     w1 = zptr[0];
    w2 = zptr[1];
    for (i = 0; i < N; i++)
    {
        w_vec1 =  xptr[i] * w1;
        w_vec2 =  yptr[i] * w2;
        w_sum[i] = (w_vec1 + w_vec2) >> 15;
    } } 的软件流水信息为: ;*----------------------------------------------------------------------------*
;*   SOFTWARE PIPELINE INFORMATION
;*
;*      Loop source line                 : 30                        ;循环开始的行数
;*      Loop opening brace source line   : 31
;*      Loop closing brace source line   : 35
;*      Known Minimum Trip Count         : 1                    ;已知的循环最小次数
;*      Known Max Trip Count Factor      : 1                   ;已知循环的因子  循环次数是循环因子的倍数,如果已知循环因子的话,便于编译器自动铺开(unroll)代码
;*      Loop Carried Dependency Bound(^) : 0                ;内存读写瓶颈,如果有的话,后面的汇编代码注释里相应语句含有^标志
;*      Unpartitioned Resource Bound     : 2                    ;资源瓶颈
;*      Partitioned Resource Bound(*)    : 2
;*      Resource Partition:
;*                                   A-side   B-side
;*      .L units                     0        0    
;*      .S units                     1        0    
;*      .D units                     2*       1    
;*      .M units                     1        1    
;*      .X cross paths            1        0    
;*      .T address paths         2*       1    
;*      Long read paths          0        0    
;*      Long write paths         0        0    
;*      Logical  ops (.LS)        0        0     (.L or .S unit)
;*      Addition ops (.LSD)      1        0     (.L or .S or .D unit)
;*      Bound(.L .S .LS)          1        0    
;*      Bound(.L .S .D .LS .LSD)     2*       1                 ;资源使用不平衡(没有完全利用可用的计算能力)
;*
;*      Searching for software pipeline schedule at ...
;*         ii = 2  Schedule found with 6 iterations in parallel
;*
;*      Register Usage Table:
;*          +-----------------------------------------------------------------+
;*          |AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA|BBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBB|
;*          |00000000001111111111222222222233|00000000001111111111222222222233|
;*          |01234567890123456789012345678901|01234567890123456789012345678901|
;*          |--------------------------------+--------------------------------|
;*       0: |   ******                       |    * **                        |
;*       1: |   *  ***                       |    ****                        |
;*          +-----------------------------------------------------------------+
;*
;*      Done
;*
;*      Loop will be splooped
;*      Collapsed epilog stages     : 0
;*      Collapsed prolog stages     : 0
;*      Minimum required memory pad : 0 bytes
;*
;*      Minimum safe trip count     : 1
;*----------------------------------------------------------------------------*
;*        SINGLE SCHEDULED ITERATION                    ;需要加-mw选项
;*
;*        $C$C23:
;*   0              LDH     .D2T2   *B6++,B5          ; |32|              ;一次装载16bit,浪费带宽
;*   1              LDH     .D1T1   *A6++,A5          ; |32|
;*   2              NOP             3
;*   5              MPY     .M2     B5,B7,B4          ; |32|
;*   6              MPY     .M1     A5,A8,A4          ; |32|
;*   7              NOP             1
;*   8              ADD     .L1X    B4,A4,A3          ; |32|
;*   9              SHR     .S1     A3,15,A3          ; |32|
;*  10              STH     .D1T1   A3,*A7++          ; |32|
;*     ||           SPBR            $C$C23
;*  11              NOP             1
;*  12              ; BRANCHCC OCCURS {$C$C23}        ; |30|          ;一次循环需要12个时钟周期
;*----------------------------------------------------------------------------*
修改为下面的代码时:
#define WORD_ALIGNED(x)  (_nassert(((int)(x) & 0x3) == 0))
#define DWORD_ALIGNED(x) (_nassert(((int)(x) & 0x7) == 0)) void lesson3_c(short * restrict xptr, short * restrict yptr, short *zptr,
               short *w_sum, int N)
{
    int i, w_vec1, w_vec2;
    short w1, w2;
    WORD_ALIGNED(xptr); //保证内存装载的带宽
    WORD_ALIGNED(yptr);                                  
    w1 = zptr[0];
    w2 = zptr[1];
    #pragma MUST_ITERATE(48, , 2); //factor=2, 可以铺开代码
    for (i = 0; i < N; i++)
    {
        w_vec1 =  xptr[i] * w1;
        w_vec2 =  yptr[i] * w2;
        w_sum[i] = (w_vec1 + w_vec2) >> 15;
    }
} 相应的: ;*----------------------------------------------------------------------------*
;*   SOFTWARE PIPELINE INFORMATION
;*
;*      Loop source line                 : 59
;*      Loop opening brace source line   : 60
;*      Loop closing brace source line   : 64
;*      Loop Unroll Multiple             : 4x                       ;循环铺开的次数
;*      Known Minimum Trip Count         : 12                   
;*      Known Max Trip Count Factor      : 1
;*      Loop Carried Dependency Bound(^) : 0
;*      Unpartitioned Resource Bound     : 4
;*      Partitioned Resource Bound(*)    : 4
;*      Resource Partition:
;*                                A-side   B-side
;*      .L units                     0        0    
;*      .S units                     2        2    
;*      .D units                     4*       4*                     ;铺开循环保证了资源使用的平衡
;*      .M units                     2        2    
;*      .X cross paths               2        2    
;*      .T address paths             4*       4*   
;*      Long read paths              0        0    
;*      Long write paths             0        0    
;*      Logical  ops (.LS)           0        0     (.L or .S unit)
;*      Addition ops (.LSD)          2        2     (.L or .S or .D unit)
;*      Bound(.L .S .LS)             1        1    
;*      Bound(.L .S .D .LS .LSD)     3        3    
;*
;*      Searching for software pipeline schedule at ...
;*         ii = 4  Schedule found with 4 iterations in parallel
;*
;*      Register Usage Table:
;*          +-----------------------------------------------------------------+
;*          |AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA|BBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBB|
;*          |00000000001111111111222222222233|00000000001111111111222222222233|
;*          |01234567890123456789012345678901|01234567890123456789012345678901|
;*          |--------------------------------+--------------------------------|
;*       0: |    *****        ***            |    ******      ****            |
;*       1: |   *******         *            |    ******      ** *            |
;*       2: |   *******      ****            |    * ****      **              |
;*       3: |   *******      ****            |    ******      **              |
;*          +-----------------------------------------------------------------+
;*
;*      Done
;*
;*      Loop will be splooped
;*      Collapsed epilog stages     : 0
;*      Collapsed prolog stages     : 0
;*      Minimum required memory pad : 0 bytes
;*
;*      Minimum safe trip count     : 1 (after unrolling)
;*----------------------------------------------------------------------------*
;*        SINGLE SCHEDULED ITERATION
;*
;*        $C$C24:
;*   0              LDW     .D1T1   *A7++(8),A9       ; |61|             ;每次装载一个字
;*     ||           LDW     .D2T2   *B6++(8),B17      ; |61|
;*   1              LDW     .D1T1   *A4++(8),A3       ; |61|
;*   2              NOP             1
;*   3              LDW     .D2T2   *B16++(8),B4      ; |61|
;*   4              NOP             2
;*   6              MPY2    .M1     A9,A8,A17:A16     ; |61|
;*   7              MPY2    .M1     A3,A8,A19:A18     ; |61|
;*     ||           MPY2    .M2     B17,B7,B5:B4      ; |61|
;*   8              MPY2    .M2     B4,B7,B19:B18     ; |61|
;*   9              NOP             2
;*  11              ADD     .L2X    B4,A16,B17        ; |61|
;*  12              ADD     .L2X    B18,A18,B5        ; |61|
;*     ||           SHR     .S2     B17,15,B4         ; |61|
;*     ||           ADD     .L1X    B5,A17,A3         ; |61|
;*  13              SHR     .S2     B5,15,B4          ; |61|
;*     ||           ADD     .L1X    B19,A19,A19       ; |61|
;*     ||           STH     .D2T2   B4,*B9++(8)       ; |61|
;*     ||           SHR     .S1     A3,15,A18         ; |61|
;*  14              STH     .D2T2   B4,*B8++(8)       ; |61|
;*     ||           SHR     .S1     A19,15,A9         ; |61|
;*     ||           STH     .D1T1   A18,*A5++(8)      ; |61|
;*  15              STH     .D1T1   A9,*A6++(8)       ; |61|
;*     ||           SPBR            $C$C24
;*  16              ; BRANCHCC OCCURS {$C$C24}        ; |59|       ;4次循环使用16个时钟周期
;*----------------------------------------------------------------------------*  
4 consultant advice 和 *.nfo文件
当编译时加上--consultant和-on2 -o3,可以查看相应的consultant advice 和*.nfo文件。 打开profile,运行程序,这时可以查看viewer的consultant如下: image  
双击想看的数据,可以查看相应的建议: image   *.nfo文件编译时便可以生成,内容没有consultant全。下面是lesson3_c.nfo的内容 TMS320C6x C/C++ Optimizer               v6.0.8
Build Number 1GKUL-JA0KH827-RSAQQ-TAV-ZAZG_W_Q_Y         ======File-level Analysis Summary====== extern void _lesson3_c() is called from 0 sites in this file.
    It appears to be inlineable (size = 58 units)
    It calls these functions:
            ======= End file-level Analysis ======= extern void _lesson3_c() is called from 0 sites in this file.
    It appears to be inlineable (size = 58 units)
    It calls these functions:
    ADVICE: In function lesson3_c()
    in the 'for' loop with loop variable 'i' at lines 39-44
    for the statement w_sum[i] = _mpy(xptr[i], w1)+_mpy(yptr[i], w2)>>15; at line 41     The address of w_sum[i] for the first iteration of the loop is &w_sum[0].
    This pointer is aligned to a 16 bit boundary.     Consider adding an assertion just before the loop:         _nassert( ((int)w_sum % 4) == 0 );  /* 32-bit aligned */
       or    _nassert( ((int)w_sum % 8) == 0 );  /* 64-bit aligned */     to specify that multiple elements of w_sum[i]
    may be accessed in parallel. ADVICE: In function lesson3_c()
    in the 'for' loop with loop variable 'i' at lines 39-44
    for the statement w_sum[i] = _mpy(xptr[i], w1)+_mpy(yptr[i], w2)>>15; at line 41     The address of yptr[i] for the first iteration of the loop is &yptr[0].
    This pointer is aligned to a 32 bit boundary.     Consider adding an assertion just before the loop:         _nassert( ((int)yptr % 8) == 0 );  /* 64-bit aligned */     to specify that multiple elements of yptr[i]
    may be accessed in parallel. ADVICE: In function lesson3_c()
    in the 'for' loop with loop variable 'i' at lines 39-44
    for the statement w_sum[i] = _mpy(xptr[i], w1)+_mpy(yptr[i], w2)>>15; at line 41     The address of xptr[i] for the first iteration of the loop is &xptr[0].
    This pointer is aligned to a 32 bit boundary.     Consider adding an assertion just before the loop:         _nassert( ((int)xptr % 8) == 0 );  /* 64-bit aligned */     to specify that multiple elements of xptr[i]
    may be accessed in parallel.
<> == END OF INFO OUTPUT==                                     #define WORD_ALIGNED(x) (_nassert(((int)(x) & 0x3) == 0)) /*输入参数加上关键字restrict,内存独立声明,xptr yptr指向的区域不重合*/ void lesson3_c(short * restrict xptr, short * restrict yptr, short *zptr,                short *w_sum, int N) {     int i, w_vec1, w_vec2;     short w1, w2;            /*保证xptr yptr是四字节对齐的,编程者负责要保证这一点*/                                  WORD_ALIGNED(xptr);     WORD_ALIGNED(yptr);                                                                              w1 = zptr[0];     w2 = zptr[1];       /*最小循环20次,循环的次数是2的倍数*/     #pragma MUST_ITERATE(20, , 2);     for (i = 0; i < N; i++)     {         w_vec1 =  xptr[i] * w1;         w_vec2 =  yptr[i] * w2;         w_sum[i] = (w_vec1 + w_vec2) >> 15;     } }   参考: 1 TMS320C6000 Programmer's Guide.pdf 2 TMS320C6000 Optimizing Compiler User's Guide.pdf