Statistics
| Branch: | Revision:

ffmpeg / libavcodec / arm / h264pred_neon.S @ 2912e87a

History | View | Annotate | Download (11.8 KB)

1
/*
2
 * Copyright (c) 2009 Mans Rullgard <mans@mansr.com>
3
 *
4
 * This file is part of Libav.
5
 *
6
 * Libav is free software; you can redistribute it and/or
7
 * modify it under the terms of the GNU Lesser General Public
8
 * License as published by the Free Software Foundation; either
9
 * version 2.1 of the License, or (at your option) any later version.
10
 *
11
 * Libav is distributed in the hope that it will be useful,
12
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
14
 * Lesser General Public License for more details.
15
 *
16
 * You should have received a copy of the GNU Lesser General Public
17
 * License along with Libav; if not, write to the Free Software
18
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
19
 */
20

    
21
#include "asm.S"
22

    
23
        /*
         * ldcol.8  rd, rs, rt, n=8, hi=0
         *
         * Load single bytes into successive lanes of the D register rd.
         * Each load reads one byte from [rs] and then advances rs by the
         * register rt.
         *
         *   n == 8           lanes 0-7 are loaded (eight bytes)
         *   n != 8, hi == 0  lanes 0-3 only
         *   n != 8, hi == 1  lanes 4-7 only
         *
         * Lanes that are not loaded keep whatever rd held before.
         */
        .macro ldcol.8  rd,  rs,  rt,  n=8,  hi=0
.if (\n == 8) || (\hi == 0)
        vld1.8          {\rd[0]}, [\rs], \rt
        vld1.8          {\rd[1]}, [\rs], \rt
        vld1.8          {\rd[2]}, [\rs], \rt
        vld1.8          {\rd[3]}, [\rs], \rt
.endif
.if (\n == 8) || (\hi == 1)
        vld1.8          {\rd[4]}, [\rs], \rt
        vld1.8          {\rd[5]}, [\rs], \rt
        vld1.8          {\rd[6]}, [\rs], \rt
        vld1.8          {\rd[7]}, [\rs], \rt
.endif
        .endm
37

    
38
        /*
         * add16x8  dq, dl, dh, rl, rh
         *
         * Widen-and-add the two 8-byte vectors rl and rh into the eight
         * 16-bit lanes of dq, then fold dl and dh together until every lane
         * of dl holds the same total.  The callers in this file pass the low
         * and high halves of dq as dl and dh, which makes that total the sum
         * of all sixteen input bytes.
         */
        .macro add16x8  dq,  dl,  dh,  rl,  rh
        vaddl.u8        \dq, \rl, \rh
        vadd.i16        \dl, \dl, \dh
        vpadd.i16       \dl, \dl, \dl
        vpadd.i16       \dl, \dl, \dl
        .endm
44

    
45
/*
 * ff_pred16x16_128_dc_neon
 *
 * Fill q0 with the byte value 128 and continue at the shared store loop
 * .L_pred16x16_dc_end, which writes q0 to sixteen rows starting at r0 with
 * a row step of r1.
 */
function ff_pred16x16_128_dc_neon, export=1
        vmov.i8         q0,  #0x80              @ sixteen bytes of 128
        b               .L_pred16x16_dc_end
endfunc
49

    
50
/*
 * ff_pred16x16_top_dc_neon
 *
 *   r0 = address of the first row of the block
 *   r1 = byte offset from one row to the next
 *
 * Sum the sixteen bytes of the row directly above the block, form
 * (sum + 8) >> 4, replicate that byte across q0 and continue at the shared
 * store loop .L_pred16x16_dc_end.
 */
function ff_pred16x16_top_dc_neon, export=1
        sub             r2,  r0,  r1            @ r2 -> row above the block
        vld1.8          {q0},     [r2,:128]     @ sixteen top bytes
        vaddl.u8        q0,  d0,  d1            @ eight 16-bit sums
        vadd.i16        d0,  d0,  d1            @ four partial sums
        vpadd.i16       d0,  d0,  d0
        vpadd.i16       d0,  d0,  d0            @ every lane of d0 = total
        vrshrn.u16      d0,  q0,  #4            @ rounded divide by 16
        vdup.8          q0,  d0[0]
        b               .L_pred16x16_dc_end
endfunc
58

    
59
/*
 * ff_pred16x16_left_dc_neon
 *
 *   r0 = address of the first row of the block
 *   r1 = byte offset from one row to the next
 *
 * Sum the sixteen bytes of the column directly left of the block, form
 * (sum + 8) >> 4, replicate that byte across q0 and continue at the shared
 * store loop .L_pred16x16_dc_end.
 */
function ff_pred16x16_left_dc_neon, export=1
        sub             r2,  r0,  #1            @ r2 -> byte left of row 0
        ldcol.8         d0,  r2,  r1            @ left bytes of rows 0-7
        ldcol.8         d1,  r2,  r1            @ left bytes of rows 8-15
        vaddl.u8        q0,  d0,  d1            @ eight 16-bit sums
        vadd.i16        d0,  d0,  d1            @ four partial sums
        vpadd.i16       d0,  d0,  d0
        vpadd.i16       d0,  d0,  d0            @ every lane of d0 = total
        vrshrn.u16      d0,  q0,  #4            @ rounded divide by 16
        vdup.8          q0,  d0[0]
        b               .L_pred16x16_dc_end
endfunc
68

    
69
/*
 * ff_pred16x16_dc_neon
 *
 *   r0 = address of the first row of the block (row stores are marked
 *        128-bit aligned)
 *   r1 = byte offset from one row to the next
 *
 * Sum the sixteen bytes above the block and the sixteen bytes to its left,
 * form (sum + 16) >> 5 and replicate that byte across q0.
 *
 * .L_pred16x16_dc_end is the store loop shared with the other 16x16 DC
 * entry points in this file: it expects the row pattern in q0 and writes
 * it to sixteen consecutive rows.  r3 is used as the loop counter.
 */
function ff_pred16x16_dc_neon, export=1
        sub             r3,  r0,  r1            @ r3 -> row above the block
        sub             r2,  r0,  #1            @ r2 -> byte left of row 0
        vld1.8          {q0},     [r3,:128]     @ sixteen top bytes
        ldcol.8         d2,  r2,  r1            @ left bytes of rows 0-7
        ldcol.8         d3,  r2,  r1            @ left bytes of rows 8-15
        vaddl.u8        q0,  d0,  d1            @ top:  eight 16-bit sums
        vaddl.u8        q1,  d2,  d3            @ left: eight 16-bit sums
        vadd.i16        q0,  q0,  q1
        vadd.i16        d0,  d0,  d1
        vpadd.i16       d0,  d0,  d0
        vpadd.i16       d0,  d0,  d0            @ every lane of d0 = total
        vrshrn.u16      d0,  q0,  #5            @ rounded divide by 32
        vdup.8          q0,  d0[0]
.L_pred16x16_dc_end:
        mov             r3,  #8                 @ eight passes, two rows each
1:      vst1.8          {q0},     [r0,:128], r1
        vst1.8          {q0},     [r0,:128], r1
        subs            r3,  r3,  #1
        bne             1b
        bx              lr
endfunc
91

    
92
/*
 * ff_pred16x16_hor_neon
 *
 *   r0 = address of the first row of the block (row stores are marked
 *        128-bit aligned)
 *   r1 = byte offset from one row to the next
 *
 * For each of the sixteen rows, replicate the byte immediately left of the
 * row across all sixteen columns.
 */
function ff_pred16x16_hor_neon, export=1
        mov             r3,  #16                @ row counter
        sub             r2,  r0,  #1            @ r2 -> byte left of row 0
1:      vld1.8          {d0[],d1[]},[r2],      r1   @ broadcast the left byte
        vst1.8          {q0},       [r0,:128], r1
        subs            r3,  r3,  #1
        bne             1b
        bx              lr
endfunc
101

    
102
/*
 * ff_pred16x16_vert_neon
 *
 *   r0 = address of the first row of the block (128-bit alignment is
 *        asserted on the load and on every store)
 *   r1 = byte offset from one row to the next
 *
 * Copy the sixteen bytes of the row above the block into each of the
 * sixteen rows of the block.
 */
function ff_pred16x16_vert_neon, export=1
        mov             r3,  #8                 @ eight passes, two rows each
        sub             r0,  r0,  r1            @ step back to the row above
        vld1.8          {q0},     [r0,:128], r1 @ load it, r0 returns to row 0
1:      vst1.8          {q0},     [r0,:128], r1
        vst1.8          {q0},     [r0,:128], r1
        subs            r3,  r3,  #1
        bne             1b
        bx              lr
endfunc
112

    
113
/*
 * ff_pred16x16_plane_neon
 *
 * Plane prediction of a 16x16 block.
 *   r0 = address of the first row of the block (row stores are marked
 *        128-bit aligned)
 *   r1 = byte offset from one row to the next
 *
 * Reads the row above the block at columns -1..6 and 8..15 and the column
 * left of the block at rows -1..6 and 8..15.
 *
 * With H = sum((i+1) * (top[8+i]  - top[6-i])),   i = 0..7
 *      V = sum((i+1) * (left[8+i] - left[6-i])),  i = 0..7
 *      b = (5*H + 32) >> 6,   c = (5*V + 32) >> 6
 *      a = 16 * (top[15] + left[15] + 1) - 7 * (b + c)
 * the byte stored at column x of row y is (a + b*x + c*y) >> 5 saturated
 * to 0..255.  Apart from the 32-bit 5*H / 5*V products, all arithmetic is
 * done in wrapping 16-bit lanes.
 *
 * The weight table p16weight holds 16-bit values and is therefore loaded
 * with 16-bit elements, so the lane contents do not depend on the byte
 * order of the target (same form as ff_pred8x8_plane_neon uses).
 *
 * Clobbers r2, r3, q0-q3, q8 and the condition flags.
 */
function ff_pred16x16_plane_neon, export=1
        sub             r3,  r0,  r1            @ r3 -> row above the block
        add             r2,  r3,  #8            @ r2 -> top[8]
        sub             r3,  r3,  #1            @ r3 -> top[-1] (top-left corner)
        vld1.8          {d0},     [r3]          @ d0 = top[-1..6]
        vld1.8          {d2},     [r2,:64], r1  @ d2 = top[8..15]
        ldcol.8         d1,  r3,  r1            @ d1 = left[-1..6]
        add             r3,  r3,  r1            @ skip row 7
        ldcol.8         d3,  r3,  r1            @ d3 = left[8..15]
        vrev64.8        q0,  q0                 @ d0 = top[6..-1], d1 = left[6..-1]
        vaddl.u8        q8,  d2,  d3            @ q8 = top[8+i] + left[8+i]
        vsubl.u8        q2,  d2,  d0            @ q2 = top[8+i]  - top[6-i]
        vsubl.u8        q3,  d3,  d1            @ q3 = left[8+i] - left[6-i]
        movrel          r3,  p16weight
        vld1.16         {q0},     [r3,:128]     @ q0 = 1..8 as 16-bit lanes
        vmul.s16        q2,  q2,  q0
        vmul.s16        q3,  q3,  q0
        vadd.i16        d4,  d4,  d5
        vadd.i16        d5,  d6,  d7
        vpadd.i16       d4,  d4,  d5
        vpadd.i16       d4,  d4,  d4            @ d4 = H, V, H, V
        vshll.s16       q3,  d4,  #2            @ 4*x in 32 bits
        vaddw.s16       q2,  q3,  d4            @ 5*x in 32 bits
        vrshrn.s32      d4,  q2,  #6            @ d4 = b, c, b, c
        mov             r3,  #0
        vtrn.16         d4,  d5                 @ d4[0] = b, d5[0] = c
        vadd.i16        d2,  d4,  d5            @ b + c
        vshl.i16        d3,  d2,  #3
        vrev64.16       d16, d17                @ d16[0] = top[15] + left[15]
        vsub.i16        d3,  d3,  d2            @ 7 * (b + c)
        vadd.i16        d16, d16, d0            @ lane 0: + 1 (first weight)
        vshl.i16        d2,  d16, #4
        vsub.i16        d2,  d2,  d3            @ d2[0] = a
        vshl.i16        d3,  d4,  #4            @ 16 * b
        vext.16         q0,  q0,  q0,  #7       @ q0 = 8, 1, 2, ..., 7
        vsub.i16        d6,  d5,  d3            @ c - 16*b
        vmov.16         d0[0], r3               @ q0 = 0, 1, 2, ..., 7
        vmul.i16        q0,  q0,  d4[0]         @ q0 = b * x for x = 0..7
        vdup.16         q1,  d2[0]
        vdup.16         q2,  d4[0]
        vdup.16         q3,  d6[0]
        vshl.i16        q2,  q2,  #3            @ q2 = 8*b: columns 0-7 -> 8-15
        vadd.i16        q1,  q1,  q0            @ q1 = a + b*x, row 0, x = 0..7
        vadd.i16        q3,  q3,  q2            @ q3 = c - 8*b: back to x = 0, next row
        mov             r3,  #16                @ row counter
1:
        vqshrun.s16     d0,  q1,  #5            @ columns 0-7
        vadd.i16        q1,  q1,  q2
        vqshrun.s16     d1,  q1,  #5            @ columns 8-15
        vadd.i16        q1,  q1,  q3
        vst1.8          {q0},     [r0,:128], r1
        subs            r3,  r3,  #1
        bne             1b
        bx              lr
endfunc
168

    
169
        /*
         * p16weight: the 16-bit weights 1..8 used by the plane predictors.
         * Placed in read-only data on a 16-byte boundary because it is
         * loaded with a :128 alignment qualifier.
         */
        .section        .rodata
        .p2align        4
p16weight:
        .short          1, 2, 3, 4, 5, 6, 7, 8

        .text
175

    
176
/*
 * ff_pred8x8_hor_neon
 *
 *   r0 = address of the first row of the block (row stores are marked
 *        64-bit aligned)
 *   r1 = byte offset from one row to the next
 *
 * For each of the eight rows, replicate the byte immediately left of the
 * row across all eight columns.
 */
function ff_pred8x8_hor_neon, export=1
        mov             r3,  #8                 @ row counter
        sub             r2,  r0,  #1            @ r2 -> byte left of row 0
1:      vld1.8          {d0[]},   [r2],     r1  @ broadcast the left byte
        vst1.8          {d0},     [r0,:64], r1
        subs            r3,  r3,  #1
        bne             1b
        bx              lr
endfunc
185

    
186
/*
 * ff_pred8x8_vert_neon
 *
 *   r0 = address of the first row of the block (64-bit alignment is
 *        asserted on the load and on every store)
 *   r1 = byte offset from one row to the next
 *
 * Copy the eight bytes of the row above the block into each of the eight
 * rows of the block.
 */
function ff_pred8x8_vert_neon, export=1
        mov             r3,  #4                 @ four passes, two rows each
        sub             r0,  r0,  r1            @ step back to the row above
        vld1.8          {d0},     [r0,:64], r1  @ load it, r0 returns to row 0
1:      vst1.8          {d0},     [r0,:64], r1
        vst1.8          {d0},     [r0,:64], r1
        subs            r3,  r3,  #1
        bne             1b
        bx              lr
endfunc
196

    
197
/*
 * ff_pred8x8_plane_neon
 *
 * Plane prediction of an 8x8 block.
 *   r0 = address of the first row of the block (row stores are marked
 *        64-bit aligned)
 *   r1 = byte offset from one row to the next
 *
 * Reads the row above the block at columns -1..2 and 4..7 and the column
 * left of the block at rows -1..2 and 4..7.
 *
 * With H = sum((i+1) * (top[4+i]  - top[2-i])),   i = 0..3
 *      V = sum((i+1) * (left[4+i] - left[2-i])),  i = 0..3
 *      b = (17*H + 16) >> 5,   c = (17*V + 16) >> 5
 *      a = 16 * (top[7] + left[7] + 1) - 3 * (b + c)
 * the byte stored at column x of row y is (a + b*x + c*y) >> 5 saturated
 * to 0..255.
 *
 * The per-row increment is c itself.  It is duplicated straight from
 * d5[0] rather than being rebuilt as (c - 8*b) + 8*b; both forms give the
 * same 16-bit lanes because the additions wrap.
 *
 * Clobbers r2, r3, q0-q3, q8 and the condition flags.
 */
function ff_pred8x8_plane_neon, export=1
        sub             r3,  r0,  r1            @ r3 -> row above the block
        add             r2,  r3,  #4            @ r2 -> top[4]
        sub             r3,  r3,  #1            @ r3 -> top[-1] (top-left corner)
        vld1.32         {d0[0]},  [r3]          @ d0 low  = top[-1..2]
        vld1.32         {d2[0]},  [r2,:32], r1  @ d2 low  = top[4..7]
        ldcol.8         d0,  r3,  r1,  4,  hi=1 @ d0 high = left[-1..2]
        add             r3,  r3,  r1            @ skip row 3
        ldcol.8         d3,  r3,  r1,  4        @ d3 low  = left[4..7]
        vaddl.u8        q8,  d2,  d3            @ d16 = top[4+i] + left[4+i]
        vrev32.8        d0,  d0                 @ d0 = top[2..-1], left[2..-1]
        vtrn.32         d2,  d3                 @ d2 = top[4..7], left[4..7]
        vsubl.u8        q2,  d2,  d0            @ d4 = top diffs, d5 = left diffs
        movrel          r3,  p16weight
        vld1.16         {q0},     [r3,:128]     @ d0 = 1, 2, 3, 4
        vmul.s16        d4,  d4,  d0
        vmul.s16        d5,  d5,  d0
        vpadd.i16       d4,  d4,  d5
        vpaddl.s16      d4,  d4                 @ d4 = H, V as 32-bit lanes
        vshl.i32        d5,  d4,  #4            @ 16*x
        vadd.s32        d4,  d4,  d5            @ 17*x
        vrshrn.s32      d4,  q2,  #5            @ d4[0] = b, d4[1] = c
        mov             r3,  #0
        vtrn.16         d4,  d5                 @ d4[0] = b, d5[0] = c
        vadd.i16        d2,  d4,  d5            @ b + c
        vshl.i16        d3,  d2,  #2
        vrev64.16       d16, d16                @ d16[0] = top[7] + left[7]
        vsub.i16        d3,  d3,  d2            @ 3 * (b + c)
        vadd.i16        d16, d16, d0            @ lane 0: + 1 (first weight)
        vshl.i16        d2,  d16, #4
        vsub.i16        d2,  d2,  d3            @ d2[0] = a
        vext.16         q0,  q0,  q0,  #7       @ q0 = 8, 1, 2, ..., 7
        vmov.16         d0[0], r3               @ q0 = 0, 1, 2, ..., 7
        vmul.i16        q0,  q0,  d4[0]         @ q0 = b * x for x = 0..7
        vdup.16         q1,  d2[0]
        vdup.16         q3,  d5[0]              @ q3 = c, the per-row step
        vadd.i16        q1,  q1,  q0            @ q1 = a + b*x, row 0
        mov             r3,  #8                 @ row counter
1:
        vqshrun.s16     d0,  q1,  #5
        vadd.i16        q1,  q1,  q3
        vst1.8          {d0},     [r0,:64], r1
        subs            r3,  r3,  #1
        bne             1b
        bx              lr
endfunc
248

    
249
/*
 * ff_pred8x8_128_dc_neon
 *
 * Fill d0 and d1 with the byte value 128 and continue at the shared store
 * loop .L_pred8x8_dc_end, which writes d0 to rows 0-3 and d1 to rows 4-7
 * of the block at r0 (row step r1).
 */
function ff_pred8x8_128_dc_neon, export=1
        vmov.i8         q0,  #0x80              @ d0 = d1 = eight bytes of 128
        b               .L_pred8x8_dc_end
endfunc
253

    
254
/*
 * ff_pred8x8_top_dc_neon
 *
 *   r0 = address of the first row of the block
 *   r1 = byte offset from one row to the next
 *
 * T0 = sum of top[0..3], T1 = sum of top[4..7] (row above the block).
 * Every row receives (T0 + 2) >> 2 in columns 0-3 and (T1 + 2) >> 2 in
 * columns 4-7.  Rows are written by the shared loop .L_pred8x8_dc_end.
 */
function ff_pred8x8_top_dc_neon, export=1
        sub             r2,  r0,  r1            @ r2 -> row above the block
        vld1.8          {d0},     [r2,:64]      @ d0 = top[0..7]
        vpaddl.u8       d0,  d0                 @ four 16-bit pair sums
        vpadd.i16       d0,  d0,  d0            @ d0 = T0, T1, T0, T1
        vrshrn.u16      d0,  q0,  #2            @ bytes 0 and 1 = the two averages
        vdup.8          d1,  d0[1]              @ right average in every byte
        vdup.8          d0,  d0[0]              @ left average in every byte
        vtrn.32         d0,  d1                 @ d0 = d1 = left x4, right x4
        b               .L_pred8x8_dc_end
endfunc
265

    
266
/*
 * ff_pred8x8_left_dc_neon
 *
 *   r0 = address of the first row of the block
 *   r1 = byte offset from one row to the next
 *
 * L0 = sum of the left bytes of rows 0-3, L1 = sum for rows 4-7.
 * Rows 0-3 are filled with (L0 + 2) >> 2 and rows 4-7 with (L1 + 2) >> 2.
 * Rows are written by the shared loop .L_pred8x8_dc_end.
 */
function ff_pred8x8_left_dc_neon, export=1
        sub             r2,  r0,  #1            @ r2 -> byte left of row 0
        ldcol.8         d0,  r2,  r1            @ d0 = left[0..7]
        vpaddl.u8       d0,  d0                 @ four 16-bit pair sums
        vpadd.i16       d0,  d0,  d0            @ d0 = L0, L1, L0, L1
        vrshrn.u16      d0,  q0,  #2            @ bytes 0 and 1 = the two averages
        vdup.8          d1,  d0[1]              @ rows 4-7
        vdup.8          d0,  d0[0]              @ rows 0-3
        b               .L_pred8x8_dc_end
endfunc
276

    
277
/*
 * ff_pred8x8_dc_neon
 *
 *   r0 = address of the first row of the block (row stores are marked
 *        64-bit aligned)
 *   r1 = byte offset from one row to the next
 *
 * T0/T1 = sums of top[0..3] / top[4..7], L0/L1 = sums of the left bytes of
 * rows 0-3 / rows 4-7.  The four 4x4 quadrants are filled with
 *
 *   rows 0-3:  (T0 + L0 + 4) >> 3   |  (T1 + 2) >> 2
 *   rows 4-7:  (L1 + 2) >> 2        |  (T1 + L1 + 4) >> 3
 *
 * .L_pred8x8_dc_end is the store loop shared with the other 8x8 DC entry
 * points in this file: it expects the pattern for rows 0-3 in d0 and the
 * pattern for rows 4-7 in d1.  r2 and r3 are overwritten there.
 */
function ff_pred8x8_dc_neon, export=1
        sub             r3,  r0,  r1            @ r3 -> row above the block
        sub             r2,  r0,  #1            @ r2 -> byte left of row 0
        vld1.8          {d0},     [r3,:64]      @ d0 = top[0..7]
        ldcol.8         d1,  r2,  r1            @ d1 = left[0..7]
        vtrn.32         d0,  d1                 @ d0 = top 0-3, left 0-3 / d1 = top 4-7, left 4-7
        vpaddl.u8       q0,  q0
        vpadd.i16       d0,  d0,  d1            @ d0 = T0, L0, T1, L1
        vpadd.i16       d1,  d0,  d0            @ d1 = T0+L0, T1+L1, T0+L0, T1+L1
        vrshrn.u16      d3,  q0,  #2            @ all eight sums, rounded / 4
        vrshrn.u16      d2,  q0,  #3            @ all eight sums, rounded / 8
        vdup.8          d0,  d2[4]              @ (T0 + L0) / 8
        vdup.8          d4,  d3[2]              @ T1 / 4
        vdup.8          d1,  d3[3]              @ L1 / 4
        vdup.8          d5,  d2[5]              @ (T1 + L1) / 8
        vtrn.32         q0,  q2                 @ interleave left and right quadrants
.L_pred8x8_dc_end:
        add             r2,  r0,  r1,  lsl #2   @ r2 -> row 4
        mov             r3,  #4                 @ four passes, rows n and n+4
1:      vst1.8          {d0},     [r0,:64], r1
        vst1.8          {d1},     [r2,:64], r1
        subs            r3,  r3,  #1
        bne             1b
        bx              lr
endfunc
302

    
303
/*
 * ff_pred8x8_l0t_dc_neon
 *
 *   r0 = address of the first row of the block
 *   r1 = byte offset from one row to the next
 *
 * Uses the eight bytes above the block and only the left bytes of rows
 * 0-3.  T0/T1 = sums of top[0..3] / top[4..7], L0 = sum of the left bytes
 * of rows 0-3.  Quadrants:
 *
 *   rows 0-3:  (T0 + L0 + 4) >> 3   |  (T1 + 2) >> 2
 *   rows 4-7:  (T0 + 2) >> 2        |  (T1 + 2) >> 2
 *
 * Lanes 4-7 of d1 are never loaded; the sums derived from them are not
 * selected by any of the vdup instructions.
 */
function ff_pred8x8_l0t_dc_neon, export=1
        sub             r3,  r0,  r1            @ r3 -> row above the block
        sub             r2,  r0,  #1            @ r2 -> byte left of row 0
        vld1.8          {d0},     [r3,:64]      @ d0 = top[0..7]
        ldcol.8         d1,  r2,  r1,  4        @ d1 lanes 0-3 = left[0..3]
        vtrn.32         d0,  d1                 @ d0 = top 0-3, left 0-3 / d1 = top 4-7, unused
        vpaddl.u8       q0,  q0
        vpadd.i16       d0,  d0,  d1            @ d0 = T0, L0, T1, unused
        vpadd.i16       d1,  d0,  d0            @ d1[0] = T0 + L0
        vrshrn.u16      d3,  q0,  #2            @ rounded / 4
        vrshrn.u16      d2,  q0,  #3            @ rounded / 8
        vdup.8          d0,  d2[4]              @ (T0 + L0) / 8
        vdup.8          d1,  d3[0]              @ T0 / 4
        vdup.8          q2,  d3[2]              @ T1 / 4 for both right quadrants
        vtrn.32         q0,  q2                 @ interleave left and right quadrants
        b               .L_pred8x8_dc_end
endfunc
320

    
321
/*
 * ff_pred8x8_l00_dc_neon
 *
 *   r0 = address of the first row of the block
 *   r1 = byte offset from one row to the next
 *
 * Uses only the left bytes of rows 0-3 (sum L0).  Rows 0-3 are filled with
 * (L0 + 2) >> 2 and rows 4-7 with the constant 128.
 */
function ff_pred8x8_l00_dc_neon, export=1
        vmov.i8         d1,  #0x80              @ rows 4-7: constant 128
        sub             r2,  r0,  #1            @ r2 -> byte left of row 0
        ldcol.8         d0,  r2,  r1,  4        @ d0 lanes 0-3 = left[0..3]
        vpaddl.u8       d0,  d0
        vpadd.i16       d0,  d0,  d0            @ d0[0] = L0
        vrshrn.u16      d0,  q0,  #2            @ byte 0 = rounded L0 / 4
        vdup.8          d0,  d0[0]              @ rows 0-3
        b               .L_pred8x8_dc_end
endfunc
331

    
332
/*
 * ff_pred8x8_0lt_dc_neon
 *
 *   r0 = address of the first row of the block
 *   r1 = byte offset from one row to the next
 *
 * Uses the eight bytes above the block and only the left bytes of rows
 * 4-7.  T0/T1 = sums of top[0..3] / top[4..7], L1 = sum of the left bytes
 * of rows 4-7.  Quadrants:
 *
 *   rows 0-3:  (T0 + 2) >> 2   |  (T1 + 2) >> 2
 *   rows 4-7:  (L1 + 2) >> 2   |  (T1 + L1 + 4) >> 3
 *
 * Lanes 0-3 of d1 are never loaded; the sums derived from them are not
 * selected by any of the vdup instructions.
 */
function ff_pred8x8_0lt_dc_neon, export=1
        sub             r3,  r0,  r1            @ r3 -> row above the block
        add             r2,  r0,  r1,  lsl #2   @ r2 -> row 4
        vld1.8          {d0},     [r3,:64]      @ d0 = top[0..7]
        sub             r2,  r2,  #1            @ r2 -> byte left of row 4
        ldcol.8         d1,  r2,  r1,  4,  hi=1 @ d1 lanes 4-7 = left[4..7]
        vtrn.32         d0,  d1                 @ d0 = top 0-3, unused / d1 = top 4-7, left 4-7
        vpaddl.u8       q0,  q0
        vpadd.i16       d0,  d0,  d1            @ d0 = T0, unused, T1, L1
        vpadd.i16       d1,  d0,  d0            @ d1[1] = T1 + L1
        vrshrn.u16      d2,  q0,  #3            @ rounded / 8
        vrshrn.u16      d3,  q0,  #2            @ rounded / 4
        vdup.8          d0,  d3[0]              @ T0 / 4
        vdup.8          d4,  d3[2]              @ T1 / 4
        vdup.8          d1,  d3[3]              @ L1 / 4
        vdup.8          d5,  d2[5]              @ (T1 + L1) / 8
        vtrn.32         q0,  q2                 @ interleave left and right quadrants
        b               .L_pred8x8_dc_end
endfunc
351

    
352
/*
 * ff_pred8x8_0l0_dc_neon
 *
 *   r0 = address of the first row of the block
 *   r1 = byte offset from one row to the next
 *
 * Uses only the left bytes of rows 4-7 (sum L1).  Rows 0-3 are filled with
 * the constant 128 and rows 4-7 with (L1 + 2) >> 2.
 */
function ff_pred8x8_0l0_dc_neon, export=1
        vmov.i8         d0,  #0x80              @ rows 0-3: constant 128
        add             r2,  r0,  r1,  lsl #2   @ r2 -> row 4
        sub             r2,  r2,  #1            @ r2 -> byte left of row 4
        ldcol.8         d1,  r2,  r1,  4        @ d1 lanes 0-3 = left[4..7]
        vpaddl.u8       d2,  d1
        vpadd.i16       d2,  d2,  d2            @ d2[0] = L1
        vrshrn.u16      d1,  q1,  #2            @ byte 0 = rounded L1 / 4
        vdup.8          d1,  d1[0]              @ rows 4-7
        b               .L_pred8x8_dc_end
endfunc