Statistics
| Branch: | Revision:

ffmpeg / libavcodec / arm / h264idct_neon.S @ 5dad039b

History | View | Annotate | Download (6.03 KB)

1
/*
2
 * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
3
 *
4
 * This file is part of FFmpeg.
5
 *
6
 * FFmpeg is free software; you can redistribute it and/or
7
 * modify it under the terms of the GNU Lesser General Public
8
 * License as published by the Free Software Foundation; either
9
 * version 2.1 of the License, or (at your option) any later version.
10
 *
11
 * FFmpeg is distributed in the hope that it will be useful,
12
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
14
 * Lesser General Public License for more details.
15
 *
16
 * You should have received a copy of the GNU Lesser General Public
17
 * License along with FFmpeg; if not, write to the Free Software
18
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
19
 */
20

    
21
#include "asm.S"
22

    
23
        preserve8
24
        .text
25

    
26
/*
 * ff_h264_idct_add_neon
 *
 * Inverse-transform one 4x4 block of 16-bit coefficients and add the
 * result, with unsigned saturation, to a 4x4 block of 8-bit pixels.
 *
 * In:    r0 = destination pixels (4 rows of 4 bytes; each row address is
 *             accessed with a :32 alignment qualifier)
 *        r1 = 16 coefficients, int16 (accessed with a :128 qualifier)
 *        r2 = distance in bytes between destination rows
 * Out:   destination block updated in place
 * Clobb: r0 (left at the row after the last one written), q0-q3, q8, q9
 *        r1 and r2 are not modified; the sibling dispatch loops rely on it.
 */
function ff_h264_idct_add_neon, export=1
        vld1.64         {d0-d3},  [r1,:128]     @ d0..d3 = c0..c3, four coeffs each

        @ ---- first 1-D pass ------------------------------------------
        vswp            d1,  d2                 @ d1 = c2, d2 = c1
        vadd.i16        d4,  d0,  d1            @ z0 = c0 + c2
        vshr.s16        q8,  q1,  #1            @ d16 = c1 >> 1, d17 = c3 >> 1
        vsub.i16        d5,  d0,  d1            @ z1 = c0 - c2
        vadd.i16        d6,  d2,  d17           @ z3 = c1 + (c3 >> 1)
        vsub.i16        d7,  d16, d3            @ z2 = (c1 >> 1) - c3
        vadd.i16        q0,  q2,  q3            @ d0 = z0 + z3, d1 = z1 + z2
        vsub.i16        q1,  q2,  q3            @ d2 = z0 - z3, d3 = z1 - z2

        @ ---- 4x4 transpose of the vectors held in d0, d1, d3, d2 -----
        vtrn.16         d3,  d2
        vtrn.16         d0,  d1
        vtrn.32         d1,  d2
        vtrn.32         d0,  d3                 @ now t0..t3 = d0, d1, d3, d2

        @ ---- second 1-D pass, interleaved with the destination loads --
        vadd.i16        d4,  d0,  d3            @ z0 = t0 + t2
        vld1.32         {d18[0]}, [r0,:32], r2  @ row 0 -> d18 low
        vswp            d1,  d3                 @ d1 = t2, d3 = t1
        vshr.s16        q8,  q1,  #1            @ d16 = t3 >> 1, d17 = t1 >> 1
        vld1.32         {d19[1]}, [r0,:32], r2  @ row 1 -> d19 high
        vsub.i16        d5,  d0,  d1            @ z1 = t0 - t2
        vld1.32         {d18[1]}, [r0,:32], r2  @ row 2 -> d18 high
        vadd.i16        d6,  d16, d3            @ z3 = t1 + (t3 >> 1)
        vld1.32         {d19[0]}, [r0,:32], r2  @ row 3 -> d19 low
        vsub.i16        d7,  d2,  d17           @ t3 - (t1 >> 1), i.e. -z2
        sub             r0,  r0,  r2, lsl #2    @ rewind r0 to row 0
        vadd.i16        q0,  q2,  q3            @ q0 = result rows 0 and 2
        vsub.i16        q1,  q2,  q3            @ q1 = result rows 3 and 1

        @ ---- round, add prediction, saturate to 8 bit ----------------
        vrshr.s16       q1,  q1,  #6            @ (x + 32) >> 6
        vrshr.s16       q0,  q0,  #6

        vaddw.u8        q1,  q1,  d19           @ + pixel rows 3 and 1
        vaddw.u8        q0,  q0,  d18           @ + pixel rows 0 and 2

        vqmovun.s16     d0,  q0                 @ d0 = rows 0 and 2 as bytes
        vqmovun.s16     d1,  q1                 @ d1 = rows 3 and 1 as bytes

        @ ---- store rows in order 0, 1, 2, 3 --------------------------
        vst1.32         {d0[0]},  [r0,:32], r2
        vst1.32         {d1[1]},  [r0,:32], r2
        vst1.32         {d0[1]},  [r0,:32], r2
        vst1.32         {d1[0]},  [r0,:32], r2

        bx              lr
        .endfunc
73

    
74
/*
 * ff_h264_idct_dc_add_neon
 *
 * DC-only variant: take the first 16-bit coefficient of the block, round
 * it with (x + 32) >> 6 and add that single value, with unsigned
 * saturation, to every pixel of a 4x4 block of 8-bit pixels.
 *
 * In:    r0 = destination pixels (4 rows of 4 bytes, :32 qualifier)
 *        r1 = coefficient block; only the first int16 is read
 *        r2 = distance in bytes between destination rows
 * Out:   destination block updated in place
 * Clobb: r0 (left at the row after the last one written), q0-q2
 *        r1 and r2 are not modified.
 */
function ff_h264_idct_dc_add_neon, export=1
        vld1.16         {d2[],d3[]}, [r1,:16]   @ replicate coeff 0 into all 8 lanes of q1
        vrshr.s16       q1,  q1,  #6            @ rounded shift right by 6
        vld1.32         {d0[0]},  [r0,:32], r2  @ pixel row 0
        vld1.32         {d0[1]},  [r0,:32], r2  @ pixel row 1
        vaddw.u8        q2,  q1,  d0            @ rows 0-1 + dc, widened to 16 bit
        vld1.32         {d1[0]},  [r0,:32], r2  @ pixel row 2
        vld1.32         {d1[1]},  [r0,:32], r2  @ pixel row 3
        sub             r0,  r0,  r2, lsl #2    @ rewind r0 to row 0 for the stores
        vaddw.u8        q1,  q1,  d1            @ rows 2-3 + dc
        vqmovun.s16     d0,  q2                 @ saturate to unsigned 8 bit
        vqmovun.s16     d1,  q1
        vst1.32         {d0[0]},  [r0,:32], r2
        vst1.32         {d0[1]},  [r0,:32], r2
        vst1.32         {d1[0]},  [r0,:32], r2
        vst1.32         {d1[1]},  [r0,:32], r2
        bx              lr
        .endfunc
92

    
93
/*
 * ff_h264_idct_add16_neon
 *
 * Walk 16 consecutive 4x4 blocks and, per block, dispatch on a per-block
 * count byte n looked up through scan8:
 *   n == 0                        -> block skipped
 *   n == 1 and first coeff != 0   -> ff_h264_idct_dc_add_neon
 *   otherwise                     -> ff_h264_idct_add_neon
 *
 * In:    r0      = destination base pointer
 *        r1      = table of 16 32-bit offsets, added to r0 per block
 *        r2      = coefficients, 32 bytes (16 x int16) per block
 *        r3      = destination row stride, handed to the callee in r2
 *        [sp]    = byte table indexed by scan8[i]
 *                  (presumably the non-zero coefficient count cache;
 *                  confirm against the C prototype)
 * Clobb: r0-r3, r12, flags, plus the NEON registers used by the callees
 *
 * r12 is the loop counter and stays live across the calls; this is safe
 * only because both callees are the leaf routines in this file, which
 * touch neither r12 nor r1/r2.
 */
function ff_h264_idct_add16_neon, export=1
        push            {r4-r8,lr}
        ldr             r6,  [sp, #24]          @ first stack argument (6 regs pushed)
        mov             r4,  r0                 @ r4 = destination base
        mov             r5,  r1                 @ r5 = offset table cursor
        mov             r1,  r2                 @ r1 = coefficient cursor (callee arg)
        mov             r2,  r3                 @ r2 = stride (callee arg)
        movrel          r7,  scan8              @ r7 = scan8 cursor
        mov             r12, #16                @ blocks remaining
1:      ldrb            r8,  [r7], #1           @ r8 = scan8[i]
        ldr             r0,  [r5], #4           @ r0 = offset of block i
        ldrb            r8,  [r6, r8]           @ r8 = count byte for block i
        subs            r8,  r8,  #1            @ flags describe n - 1
        blt             2f                      @ n == 0: nothing to add
        add             r0,  r0,  r4            @ r0 = destination of block i
        ldrsh           lr,  [r1]               @ lr = first coefficient
        movne           lr,  #0                 @ n != 1: force the full transform
        cmp             lr,  #0
        adrne           lr,  ff_h264_idct_dc_add_neon
        adreq           lr,  ff_h264_idct_add_neon
        blx             lr
2:      add             r1,  r1,  #32           @ next coefficient block
        subs            r12, r12, #1
        bne             1b
        pop             {r4-r8,pc}
        .endfunc
119

    
120
/*
 * ff_h264_idct_add16intra_neon
 *
 * Walk 16 consecutive 4x4 blocks and, per block, dispatch on a per-block
 * count byte n looked up through scan8:
 *   n != 0                        -> ff_h264_idct_add_neon
 *   n == 0 and first coeff != 0   -> ff_h264_idct_dc_add_neon
 *   n == 0 and first coeff == 0   -> block skipped
 *
 * In:    r0      = destination base pointer
 *        r1      = table of 16 32-bit offsets, added to r0 per block
 *        r2      = coefficients, 32 bytes (16 x int16) per block
 *        r3      = destination row stride, handed to the callee in r2
 *        [sp]    = byte table indexed by scan8[i]
 *                  (presumably the non-zero coefficient count cache;
 *                  confirm against the C prototype)
 * Clobb: r0-r3, r12, flags, plus the NEON registers used by the callees
 *
 * r12 is the loop counter and stays live across the calls; this is safe
 * only because both callees are the leaf routines in this file, which
 * touch neither r12 nor r1/r2.
 */
function ff_h264_idct_add16intra_neon, export=1
        push            {r4-r8,lr}
        ldr             r6,  [sp, #24]          @ first stack argument (6 regs pushed)
        mov             r4,  r0                 @ r4 = destination base
        mov             r5,  r1                 @ r5 = offset table cursor
        mov             r1,  r2                 @ r1 = coefficient cursor (callee arg)
        mov             r2,  r3                 @ r2 = stride (callee arg)
        movrel          r7,  scan8              @ r7 = scan8 cursor
        mov             r12, #16                @ blocks remaining
1:      ldrb            r8,  [r7], #1           @ r8 = scan8[i]
        ldr             r0,  [r5], #4           @ r0 = offset of block i
        ldrb            r8,  [r6, r8]           @ r8 = count byte for block i
        cmp             r8,  #0                 @ ne: block has coefficients
        ldrsh           r8,  [r1]               @ r8 = first coefficient (flags kept)
        add             r0,  r0,  r4            @ r0 = destination of block i
        adrne           lr,  ff_h264_idct_add_neon
        adreq           lr,  ff_h264_idct_dc_add_neon
        cmpeq           r8,  #0                 @ n == 0: call only if first coeff != 0
        blxne           lr
        add             r1,  r1,  #32           @ next coefficient block
        subs            r12, r12, #1
        bne             1b
        pop             {r4-r8,pc}
        .endfunc
144

    
145
/*
 * ff_h264_idct_add8_neon
 *
 * Process blocks 16..23 (eight 4x4 blocks): the first four are added to
 * the first destination plane, the last four to the second one. Per
 * block, dispatch on a count byte n looked up through scan8[16 + i]:
 *   n != 0                        -> ff_h264_idct_add_neon
 *   n == 0 and first coeff != 0   -> ff_h264_idct_dc_add_neon
 *   n == 0 and first coeff == 0   -> block skipped
 *
 * In:    r0      = pointer to two consecutive destination base pointers
 *        r1      = table of 32-bit offsets; entries 16..23 are used
 *        r2      = coefficients, 32 bytes (16 x int16) per block;
 *                  blocks 16..23 are used
 *        r3      = destination row stride, handed to the callee in r2
 *        [sp]    = byte table indexed by scan8[16 + i]
 *                  (presumably the non-zero coefficient count cache;
 *                  confirm against the C prototype)
 * Clobb: r0-r3, ip, flags, plus the NEON registers used by the callees
 *
 * Fix: the counter previously ran 8..1 and was tested with
 * "tst ip, #4", which selected the first plane for block 0, the second
 * plane for blocks 1-4 and the first plane again for blocks 5-7. The
 * counter now runs 7..0 so that bit 2 is set for exactly the first four
 * blocks and clear for exactly the last four.
 *
 * ip stays live across the calls; this is safe only because both callees
 * are the leaf routines in this file, which touch neither ip nor r1/r2.
 * r10 is not used; pushing it keeps the number of saved registers even
 * (8 words), so sp stays 8-byte aligned at the calls.
 */
function ff_h264_idct_add8_neon, export=1
        push            {r4-r10,lr}
        ldm             r0,  {r4,r9}            @ r4 = first base, r9 = second base
        add             r5,  r1,  #16*4         @ r5 = &offset[16]
        add             r1,  r2,  #16*32        @ r1 = coefficients of block 16
        mov             r2,  r3                 @ r2 = stride (callee arg)
        ldr             r6,  [sp, #32]          @ first stack argument (8 regs pushed)
        movrel          r7,  scan8+16           @ r7 = &scan8[16]
        mov             ip,  #7                 @ counts 7..0, eight blocks
1:      ldrb            r8,  [r7], #1           @ r8 = scan8[16 + i]
        ldr             r0,  [r5], #4           @ r0 = offset of block 16 + i
        ldrb            r8,  [r6, r8]           @ r8 = count byte for the block
        tst             ip,  #4                 @ bit 2 set while ip = 7..4
        addne           r0,  r0,  r4            @ first four blocks: first plane
        addeq           r0,  r0,  r9            @ last four blocks: second plane
        cmp             r8,  #0                 @ ne: block has coefficients
        ldrsh           r8,  [r1]               @ r8 = first coefficient (flags kept)
        adrne           lr,  ff_h264_idct_add_neon
        adreq           lr,  ff_h264_idct_dc_add_neon
        cmpeq           r8,  #0                 @ n == 0: call only if first coeff != 0
        blxne           lr
        add             r1,  r1,  #32           @ next coefficient block
        subs            ip,  ip,  #1
        bge             1b                      @ loop until the counter goes below 0
        pop             {r4-r10,pc}
        .endfunc
171

    
172
        .section .rodata
/*
 * scan8: 24 one-byte indices, each written as x + y*8, used to index the
 * byte table that the add16/add16intra/add8 routines receive on the stack.
 * Entries 0..15 are read by ff_h264_idct_add16_neon and
 * ff_h264_idct_add16intra_neon; entries 16..23 (scan8+16) are read by
 * ff_h264_idct_add8_neon.
 * NOTE(review): presumably a copy of the decoder's C scan8[] table; the
 * two must be kept in sync. Confirm against the C source.
 */
scan8:  .byte           4+1*8, 5+1*8, 4+2*8, 5+2*8      @ entries  0..3
        .byte           6+1*8, 7+1*8, 6+2*8, 7+2*8      @ entries  4..7
        .byte           4+3*8, 5+3*8, 4+4*8, 5+4*8      @ entries  8..11
        .byte           6+3*8, 7+3*8, 6+4*8, 7+4*8      @ entries 12..15
        .byte           1+1*8, 2+1*8, 1+2*8, 2+2*8      @ entries 16..19
        .byte           1+4*8, 2+4*8, 1+5*8, 2+5*8      @ entries 20..23