Statistics
| Branch: | Revision:

ffmpeg / libavcodec / arm / mpegvideo_armv5te_s.S @ a2fc0f6a

History | View | Annotate | Download (3.97 KB)

/*
 * Optimization of some functions from mpegvideo.c for armv5te
 * Copyright (c) 2007 Siarhei Siamashka <ssvb@users.sourceforge.net>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "config.h"
#include "asm.S"

/*
 * dequant_t dst, src, mul, add, tmp
 *
 * Dequantize the signed 16-bit value x held in the TOP halfword of \src:
 *      x >  0:  \dst = x * mul16 + \add
 *      x <  0:  \dst = x * mul16 - \add
 *      x == 0:  \tmp = 0, \dst is not written
 * where mul16 is the signed bottom halfword of \mul.
 *
 * Requires ip == 0.  x is fetched with "rsbs ..., ip, ..." instead of a
 * plain movs because the subtraction also updates the V flag, which the
 * gt/lt conditions read.
 */
.macro  dequant_t       dst, src, mul, add, tmp
        rsbs            \tmp, ip, \src, asr #16
        addgt           \tmp, \add, #0
        rsblt           \tmp, \add, #0
        smlatbne        \dst, \src, \mul, \tmp
.endm

/*
 * dequant_b dst, src, mul, add, tmp
 *
 * Same as dequant_t, but for the value x held in the BOTTOM halfword of
 * \src.  The sign/zero test is done on (\src << 16), which has the same
 * sign and zero-ness as x.  When x == 0 only \tmp is written (with 0).
 *
 * Requires ip == 0.
 */
.macro  dequant_b       dst, src, mul, add, tmp
        rsbs            \tmp, ip, \src, asl #16
        addgt           \tmp, \add, #0
        rsblt           \tmp, \add, #0
        smlabbne        \dst, \src, \mul, \tmp
.endm

/*
 * Special optimized version of dct_unquantize_h263_helper_c, it
 * requires the block to be at least 8 bytes aligned, and may process
 * more elements than requested.  But it is guaranteed to never
 * process more than 64 elements provided that count argument is <= 64,
 * so it is safe. This function is optimized for a common distribution
 * of values for nCoeffs (they are mostly multiple of 8 plus one or
 * two extra elements). So this function processes data as 8 elements
 * per loop iteration and contains optional 2 elements processing in
 * the end.
 *
 * Inner loop should take 6 cycles per element on arm926ej-s (Nokia 770)
 *
 * Registers on entry, as used by the code below (the argument names are
 * presumed from the C helper; confirm against the caller's prototype):
 *      r0 = pointer to the 16-bit coefficients, rewritten in place
 *      r1 = multiplier; only its bottom halfword is used (qmul)
 *      r2 = value added to positive and subtracted from negative
 *           coefficients (qadd)
 *      r3 = element count
 *
 * Every non-zero coefficient x becomes x * qmul +/- qadd (low 16 bits
 * stored); zero coefficients stay zero.
 *
 * r4-r9 and lr are saved on the stack and used as scratch; ip is
 * overwritten with 0.  The function makes no calls.
 */
function ff_dct_unquantize_h263_armv5te, export=1
        push            {r4-r9,lr}
        mov             ip, #0                  /* zero operand required by dequant_t/dequant_b */
        subs            r3, r3, #2              /* r3 = count - 2 (kept biased by 2 inside the loop) */
        ble             2f                      /* count <= 2: only the two-element tail runs */
        ldrd            r4, [r0, #0]            /* r4, r5 = elements 0..3 of this group */
1:
        ldrd            r6, [r0, #8]            /* r6, r7 = elements 4..7 of this group */

        /*
         * Top halfwords go to separate registers (r9, lr) first: the
         * bottom-half results are written back into r4/r5 themselves.
         */
        dequant_t       r9, r4, r1, r2, r9
        dequant_t       lr, r5, r1, r2, lr
        dequant_b       r4, r4, r1, r2, r8
        dequant_b       r5, r5, r1, r2, r8

        /* Bottom-half result is stored at the lower address of each pair. */
        strh            r4, [r0], #2
        strh            r9, [r0], #2
        strh            r5, [r0], #2
        strh            lr, [r0], #2

        /* Same again for elements 4..7 held in r6, r7. */
        dequant_t       r9, r6, r1, r2, r9
        dequant_t       lr, r7, r1, r2, lr
        dequant_b       r6, r6, r1, r2, r8
        dequant_b       r7, r7, r1, r2, r8

        strh            r6, [r0], #2
        strh            r9, [r0], #2
        strh            r7, [r0], #2
        strh            lr, [r0], #2

        subs            r3, r3, #8              /* eight elements done */
        ldrgtd          r4, [r0, #0] /* load data early to avoid load/use pipeline stall */
        bgt             1b

        adds            r3, r3, #2              /* remove the bias: r3 = elements still requested */
        pople           {r4-r9,pc}              /* none left: return */
2:
        /*
         * Tail: always processes exactly two elements, whether one or
         * two were still requested.
         */
        ldrsh           r9, [r0, #0]
        ldrsh           lr, [r0, #2]
        mov             r8, r2                  /* r8 = +qadd, replaced by -qadd if the value is negative */
        cmp             r9, #0
        rsblt           r8, r2, #0
        smlabbne        r9, r9, r1, r8          /* skipped for zero, so zero stays zero */
        mov             r8, r2
        cmp             lr, #0
        rsblt           r8, r2, #0
        smlabbne        lr, lr, r1, r8
        strh            r9, [r0], #2
        strh            lr, [r0], #2
        pop             {r4-r9,pc}
        .endfunc