DSP优化，通用的调整性能的策略

DSP

DSP优化，通用的调整性能的策略

2019-07-13 18:09发布生成海报

站内文章 / DSP

11722 0

1 通用的调整性能的策略

1.1 选择恰当的编译器选项

必须要用的选项 -O[2|3]
可以使用-mt（要确保写的数据和读的数据在内存空间上没有重合）
-mh Specify speculative load byte count threshold
如果源代码里含有永远不会执行的代码，使用选项-mo Place each function in a separate subsection
如果考虑可执行程序的大小，加上-ms[0-3]。（我在C64x+上编译时，加此选项，连接错误，提示找不到__push_rts?原因不明）
不要加上 -g -gp -ss -ml3 -mu
-s[-k -al] -o[2|3] -mw (-on2 -o3) --consultant 可以在产生分析信息的同时不影响生成代码的性能
1.2 确保循环中的次数变量（一般for(i; i < N; i++)中的i、N）……【原文自“<”处被截断，本小节其余内容缺失，待补】

2 利用优化器的意见
当编译选项中有-s时，在生成的*.asm文件中会有优化器的意见，如：

"C:/CCStudio_v3.3/C6000/cgtools/bin/cl6x" -g -k -s -on2 -o3 -mt -mw -mv6400+ --mem_model:data=near --consultant -@"Debug.lkf" "lesson_c.c"

其中 lesson_c.c 的代码为：

/*
 * Baseline (untuned) weighted vector sum used as the starting point of the
 * optimization walkthrough.
 *
 * For each i in [0, N):
 *     w_sum[i] = (xptr[i] * zptr[0] + yptr[i] * zptr[1]) >> 15
 *
 * xptr, yptr : input vectors, at least N elements each
 * zptr       : the two weights; zptr[0] scales xptr, zptr[1] scales yptr
 * w_sum      : output vector, at least N elements
 * N          : number of elements to process; nothing is written when N <= 0
 *
 * Both weights are read before the loop, so zptr is dereferenced even when
 * N <= 0.
 */
void lesson_c(short *xptr, short *yptr, short *zptr, short *w_sum, int N)
{
    short w1 = zptr[0];
    short w2 = zptr[1];
    int i;

    for (i = 0; i < N; i++) {
        /* The products are formed in int so the sum is taken at full width
         * before the shift and the narrowing store into w_sum. */
        int w_vec1 = xptr[i] * w1;
        int w_vec2 = yptr[i] * w2;

        w_sum[i] = (w_vec1 + w_vec2) >> 15;
    }
}

生成的优化器意见为：

;** --------------------------------------------------------------------------*
;** 27    -----------------------    w1 = *zptr;
;** 28    -----------------------    w2 = zptr[1];
;** 29    -----------------------    if ( N <= 0 ) goto g4;
;** --------------------------------------------------------------------------*
;**      -----------------------    U$17 = xptr;
;**      -----------------------    U$20 = yptr;
;**      -----------------------    U$26 = w_sum;
;** 31    -----------------------    L$1 = N;
;**      -----------------------    #pragma MUST_ITERATE(1, 1099511627775, 1)
;**      -----------------------    #pragma LOOP_FLAGS(4096u)
;**    -----------------------g3:
;** 31    -----------------------    *U$26++ = _mpy(*U$17++, w1)+_mpy(*U$20++, w2)>>15;
;** 29    -----------------------    if ( --L$1 ) goto g3;
;**    -----------------------g4:
;**      -----------------------    return;

从中可以看出，加入了对N是否小于等于0的判断。如果改为：

void lesson_c(short *xptr, short *yptr, short *zptr, short *w_sum, int N)
{
    int i, w_vec1, w_vec2;
    short w1,w2;     w1 = zptr[0];
    w2 = zptr[1];
    #pragma MUST_ITERATE(1) //至少循环一次
    for (i = 0; i < N; i++)
    {
        w_vec1 = xptr[i] * w1;
        w_vec2 = yptr[i] * w2;
        w_sum[i] = (w_vec1 + w_vec2) >> 15;
    }
}

相应的意见为（没有了对N是否为0的判断）：

;** --------------------------------------------------------------------------*
;** 27    -----------------------    w1 = *zptr;
;** 28    -----------------------    w2 = zptr[1];
;**      -----------------------    U$15 = xptr;
;**      -----------------------    U$18 = yptr;
;**      -----------------------    U$24 = w_sum;
;** 32    -----------------------    L$1 = N;
;**      -----------------------    #pragma MUST_ITERATE(1, 4294967295, 1)
;**      -----------------------    #pragma LOOP_FLAGS(4096u)
;**    -----------------------g2:
;** 32    -----------------------    *U$24++ = _mpy(*U$15++, w1)+_mpy(*U$18++, w2)>>15;
;** 30    -----------------------    if ( --L$1 ) goto g2;
;**      -----------------------    return;
3 利用软件流水信息优化循环
void lesson_c(short *xptr, short *yptr, short *zptr, short *w_sum, int N)
{
    int i, w_vec1, w_vec2;
    short w1,w2;     w1 = zptr[0];
    w2 = zptr[1];
    for (i = 0; i < N; i++)
    {
        w_vec1 = xptr[i] * w1;
        w_vec2 = yptr[i] * w2;
        w_sum[i] = (w_vec1 + w_vec2) >> 15;
    }
}

上面代码的软件流水信息为：

;*----------------------------------------------------------------------------*
;*   SOFTWARE PIPELINE INFORMATION
;*
;*      Loop source line                 : 30                        ;循环开始的行数
;*      Loop opening brace source line   : 31
;*      Loop closing brace source line   : 35
;*      Known Minimum Trip Count         : 1                    ;已知的循环最小次数
;*      Known Max Trip Count Factor      : 1                   ;已知的循环因子：循环次数是循环因子的倍数；如果已知循环因子，便于编译器自动铺开（unroll）代码
;*      Loop Carried Dependency Bound(^) : 0                ;内存读写瓶颈，如果有的话，后面的汇编代码注释里相应语句含有^标志
;*      Unpartitioned Resource Bound     : 2                    ;资源瓶颈
;*      Partitioned Resource Bound(*)    : 2
;*      Resource Partition:
;*                                   A-side   B-side
;*      .L units                     0        0
;*      .S units                     1        0
;*      .D units                     2*       1
;*      .M units                     1        1
;*      .X cross paths            1        0
;*      .T address paths         2*       1
;*      Long read paths          0        0
;*      Long write paths         0        0
;*      Logical ops (.LS)        0        0     (.L or .S unit)
;*      Addition ops (.LSD)      1        0     (.L or .S or .D unit)
;*      Bound(.L .S .LS)          1        0
;*      Bound(.L .S .D .LS .LSD)     2*       1                 ;资源使用不平衡（没有完全利用可用的计算能力）
;*
;*      Searching for software pipeline schedule at ...
;*         ii = 2 Schedule found with 6 iterations in parallel
;*
;*      Register Usage Table:
;*          +-----------------------------------------------------------------+
;*          |AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA|BBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBB|
;*          |00000000001111111111222222222233|00000000001111111111222222222233|
;*          |01234567890123456789012345678901|01234567890123456789012345678901|
;*          |--------------------------------+--------------------------------|
;*       0: |   ******                       |    * **                        |
;*       1: |   * ***                       |    ****                        |
;*          +-----------------------------------------------------------------+
;*
;*      Done
;*
;*      Loop will be splooped
;*      Collapsed epilog stages     : 0
;*      Collapsed prolog stages     : 0
;*      Minimum required memory pad : 0 bytes
;*
;*      Minimum safe trip count     : 1
;*----------------------------------------------------------------------------*
;*        SINGLE SCHEDULED ITERATION                    ;需要加-mw选项
;*
;*        $C$C23:
;*   0              LDH     .D2T2   *B6++,B5          ; |32|              ;一次装载16bit，浪费带宽
;*   1              LDH     .D1T1   *A6++,A5          ; |32|
;*   2              NOP             3
;*   5              MPY     .M2     B5,B7,B4          ; |32|
;*   6              MPY     .M1     A5,A8,A4          ; |32|
;*   7              NOP             1
;*   8              ADD     .L1X    B4,A4,A3          ; |32|
;*   9              SHR     .S1     A3,15,A3          ; |32|
;* 10              STH     .D1T1   A3,*A7++          ; |32|
;*     ||           SPBR            $C$C23
;* 11              NOP             1
;* 12              ; BRANCHCC OCCURS {$C$C23}        ; |30|          ;一次循环需要12个时钟周期
;*----------------------------------------------------------------------------*

修改为下面的代码时：

#define WORD_ALIGNED(x) (_nassert(((int)(x) & 0x3) == 0))
#define DWORD_ALIGNED(x) (_nassert(((int)(x) & 0x7) == 0))

void lesson3_c(short * restrict xptr, short * restrict yptr, short *zptr,
               short *w_sum, int N)
{
    int i, w_vec1, w_vec2;
    short w1, w2;
    WORD_ALIGNED(xptr); //保证内存装载的带宽
    WORD_ALIGNED(yptr);
    w1 = zptr[0];
    w2 = zptr[1];
    #pragma MUST_ITERATE(48, , 2); //factor=2, 可以铺开代码
    for (i = 0; i < N; i++)
    {
        w_vec1 = xptr[i] * w1;
        w_vec2 = yptr[i] * w2;
        w_sum[i] = (w_vec1 + w_vec2) >> 15;
    }
}

相应的软件流水信息为：

;*----------------------------------------------------------------------------*
;*   SOFTWARE PIPELINE INFORMATION
;*
;*      Loop source line                 : 59
;*      Loop opening brace source line   : 60
;*      Loop closing brace source line   : 64
;*      Loop Unroll Multiple             : 4x                       ;循环铺开的次数
;*      Known Minimum Trip Count         : 12
;*      Known Max Trip Count Factor      : 1
;*      Loop Carried Dependency Bound(^) : 0
;*      Unpartitioned Resource Bound     : 4
;*      Partitioned Resource Bound(*)    : 4
;*      Resource Partition:
;*                                A-side   B-side
;*      .L units                     0        0
;*      .S units                     2        2
;*      .D units                     4*       4*                     ;铺开循环保证了资源使用的平衡
;*      .M units                     2        2
;*      .X cross paths               2        2
;*      .T address paths             4*       4*
;*      Long read paths              0        0
;*      Long write paths             0        0
;*      Logical ops (.LS)           0        0     (.L or .S unit)
;*      Addition ops (.LSD)          2        2     (.L or .S or .D unit)
;*      Bound(.L .S .LS)             1        1
;*      Bound(.L .S .D .LS .LSD)     3        3
;*
;*      Searching for software pipeline schedule at ...
;*         ii = 4 Schedule found with 4 iterations in parallel
;*
;*      Register Usage Table:
;*          +-----------------------------------------------------------------+
;*          |AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA|BBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBB|
;*          |00000000001111111111222222222233|00000000001111111111222222222233|
;*          |01234567890123456789012345678901|01234567890123456789012345678901|
;*          |--------------------------------+--------------------------------|
;*       0: |    *****        ***            |    ******      ****            |
;*       1: |   *******         *            |    ******      ** *            |
;*       2: |   *******      ****            |    * ****      **              |
;*       3: |   *******      ****            |    ******      **              |
;*          +-----------------------------------------------------------------+
;*
;*      Done
;*
;*      Loop will be splooped
;*      Collapsed epilog stages     : 0
;*      Collapsed prolog stages     : 0
;*      Minimum required memory pad : 0 bytes
;*
;*      Minimum safe trip count     : 1 (after unrolling)
;*----------------------------------------------------------------------------*
;*        SINGLE SCHEDULED ITERATION
;*
;*        $C$C24:
;*   0              LDW     .D1T1   *A7++(8),A9       ; |61|             ;每次装载一个字
;*     ||           LDW     .D2T2   *B6++(8),B17      ; |61|
;*   1              LDW     .D1T1   *A4++(8),A3       ; |61|
;*   2              NOP             1
;*   3              LDW     .D2T2   *B16++(8),B4      ; |61|
;*   4              NOP             2
;*   6              MPY2    .M1     A9,A8,A17:A16     ; |61|
;*   7              MPY2    .M1     A3,A8,A19:A18     ; |61|
;*     ||           MPY2    .M2     B17,B7,B5:B4      ; |61|
;*   8              MPY2    .M2     B4,B7,B19:B18     ; |61|
;*   9              NOP             2
;* 11              ADD     .L2X    B4,A16,B17        ; |61|
;* 12              ADD     .L2X    B18,A18,B5        ; |61|
;*     ||           SHR     .S2     B17,15,B4         ; |61|
;*     ||           ADD     .L1X    B5,A17,A3         ; |61|
;* 13              SHR     .S2     B5,15,B4          ; |61|
;*     ||           ADD     .L1X    B19,A19,A19       ; |61|
;*     ||           STH     .D2T2   B4,*B9++(8)       ; |61|
;*     ||           SHR     .S1     A3,15,A18         ; |61|
;* 14              STH     .D2T2   B4,*B8++(8)       ; |61|
;*     ||           SHR     .S1     A19,15,A9         ; |61|
;*     ||           STH     .D1T1   A18,*A5++(8)      ; |61|
;* 15              STH     .D1T1   A9,*A6++(8)       ; |61|
;*     ||           SPBR            $C$C24
;* 16              ; BRANCHCC OCCURS {$C$C24}        ; |59|       ;4次循环使用16个时钟周期
;*----------------------------------------------------------------------------*
4 consultant advice 和 *.nfo文件
当编译时加上--consultant和-on2 -o3，可以查看相应的consultant advice和*.nfo文件。打开profile，运行程序，这时可以在viewer中查看consultant，如下：

双击想看的数据，可以查看相应的建议：

*.nfo文件编译时便可以生成，内容没有consultant全。下面是lesson3_c.nfo的内容：

TMS320C6x C/C++ Optimizer               v6.0.8
Build Number 1GKUL-JA0KH827-RSAQQ-TAV-ZAZG_W_Q_Y

        ======File-level Analysis Summary======

extern void _lesson3_c() is called from 0 sites in this file.
    It appears to be inlineable (size = 58 units)
    It calls these functions:
            ======= End file-level Analysis =======

extern void _lesson3_c() is called from 0 sites in this file.
    It appears to be inlineable (size = 58 units)
    It calls these functions:
    ADVICE: In function lesson3_c()
    in the 'for' loop with loop variable 'i' at lines 39-44
    for the statement w_sum[i] = _mpy(xptr[i], w1)+_mpy(yptr[i], w2)>>15; at line 41

    The address of w_sum[i] for the first iteration of the loop is &w_sum[0].
    This pointer is aligned to a 16 bit boundary.

    Consider adding an assertion just before the loop:
        _nassert( ((int)w_sum % 4) == 0 ); /* 32-bit aligned */
       or    _nassert( ((int)w_sum % 8) == 0 ); /* 64-bit aligned */
    to specify that multiple elements of w_sum[i]
    may be accessed in parallel.

    ADVICE: In function lesson3_c()
    in the 'for' loop with loop variable 'i' at lines 39-44
    for the statement w_sum[i] = _mpy(xptr[i], w1)+_mpy(yptr[i], w2)>>15; at line 41

    The address of yptr[i] for the first iteration of the loop is &yptr[0].
    This pointer is aligned to a 32 bit boundary.

    Consider adding an assertion just before the loop:
        _nassert( ((int)yptr % 8) == 0 ); /* 64-bit aligned */
    to specify that multiple elements of yptr[i]
    may be accessed in parallel.

    ADVICE: In function lesson3_c()
    in the 'for' loop with loop variable 'i' at lines 39-44
    for the statement w_sum[i] = _mpy(xptr[i], w1)+_mpy(yptr[i], w2)>>15; at line 41

    The address of xptr[i] for the first iteration of the loop is &xptr[0].
    This pointer is aligned to a 32 bit boundary.

    Consider adding an assertion just before the loop:
        _nassert( ((int)xptr % 8) == 0 ); /* 64-bit aligned */
    to specify that multiple elements of xptr[i]
    may be accessed in parallel.
<> == END OF INFO OUTPUT==
#define WORD_ALIGNED(x) (_nassert(((int)(x) & 0x3) == 0))

/*输入参数加上关键字restrict，内存独立声明，xptr yptr指向的区域不重合*/
void lesson3_c(short * restrict xptr, short * restrict yptr, short *zptr,
               short *w_sum, int N)
{
    int i, w_vec1, w_vec2;
    short w1, w2;

    /*保证xptr yptr是四字节对齐的，编程者负责要保证这一点*/
    WORD_ALIGNED(xptr);
    WORD_ALIGNED(yptr);

    w1 = zptr[0];
    w2 = zptr[1];

    /*最小循环20次，循环的次数是2的倍数*/
    #pragma MUST_ITERATE(20, , 2);
    for (i = 0; i < N; i++)
    {
        w_vec1 = xptr[i] * w1;
        w_vec2 = yptr[i] * w2;
        w_sum[i] = (w_vec1 + w_vec2) >> 15;
    }
}

参考：
1 TMS320C6000 Programmer's Guide.pdf
2 TMS320C6000 Optimizing Compiler User's Guide.pdf

DSP优化，通用的调整性能的策略

1 通用的调整性能的策略

1.1 选择恰当的编译器选项

Ta的文章更多 >>

热门文章

DSP优化，通用的调整性能的策略

1 通用的调整性能的策略

1.1 选择恰当的编译器选项

Ta的文章 更多 >>

热门文章

举报内容

检举类型

检举原因

检举说明(必填)

打开微信“扫一扫”，打开网页后点击屏幕右上角分享按钮

Ta的文章更多 >>