Revision ef15d71c

View differences:

libavcodec/arm/Makefile
  2    2
  3    3   OBJS-$(CONFIG_VP5_DECODER)             += arm/vp56dsp_init_arm.o
  4    4   OBJS-$(CONFIG_VP6_DECODER)             += arm/vp56dsp_init_arm.o
       5   OBJS-$(CONFIG_VP8_DECODER)             += arm/vp8dsp_init_arm.o
  5    6
  6    7   OBJS-$(CONFIG_H264DSP)                 += arm/h264dsp_init_arm.o
  7    8   OBJS-$(CONFIG_H264PRED)                += arm/h264pred_init_arm.o
......
 54   55   NEON-OBJS-$(CONFIG_VP6_DECODER)        += arm/vp56dsp_neon.o            \
 55   56                                             arm/vp3dsp_neon.o             \
 56   57
      58   NEON-OBJS-$(CONFIG_VP8_DECODER)        += arm/vp8dsp_neon.o
      59
 57   60   OBJS-$(HAVE_NEON)                      += arm/dsputil_init_neon.o       \
 58   61                                             arm/dsputil_neon.o            \
 59   62                                             arm/fmtconvert_neon.o         \
libavcodec/arm/vp8dsp_init_arm.c
1
/**
2
 * This file is part of FFmpeg.
3
 *
4
 * FFmpeg is free software; you can redistribute it and/or
5
 * modify it under the terms of the GNU Lesser General Public
6
 * License as published by the Free Software Foundation; either
7
 * version 2.1 of the License, or (at your option) any later version.
8
 *
9
 * FFmpeg is distributed in the hope that it will be useful,
10
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
12
 * Lesser General Public License for more details.
13
 *
14
 * You should have received a copy of the GNU Lesser General Public
15
 * License along with FFmpeg; if not, write to the Free Software
16
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
17
 */
18

  
19
#include <stdint.h>
20
#include "libavcodec/vp8dsp.h"
21

  
22
void ff_vp8_luma_dc_wht_neon(DCTELEM block[4][4][16], DCTELEM dc[16]);
23
void ff_vp8_luma_dc_wht_dc_neon(DCTELEM block[4][4][16], DCTELEM dc[16]);
24

  
25
void ff_vp8_idct_add_neon(uint8_t *dst, DCTELEM block[16], int stride);
26
void ff_vp8_idct_dc_add_neon(uint8_t *dst, DCTELEM block[16], int stride);
27
void ff_vp8_idct_dc_add4y_neon(uint8_t *dst, DCTELEM block[4][16], int stride);
28
void ff_vp8_idct_dc_add4uv_neon(uint8_t *dst, DCTELEM block[4][16], int stride);
29

  
30
void ff_vp8_v_loop_filter16_neon(uint8_t *dst, int stride,
31
                                 int flim_E, int flim_I, int hev_thresh);
32
void ff_vp8_h_loop_filter16_neon(uint8_t *dst, int stride,
33
                                 int flim_E, int flim_I, int hev_thresh);
34
void ff_vp8_v_loop_filter8uv_neon(uint8_t *dstU, uint8_t *dstV, int stride,
35
                                  int flim_E, int flim_I, int hev_thresh);
36
void ff_vp8_h_loop_filter8uv_neon(uint8_t *dstU, uint8_t *dstV, int stride,
37
                                  int flim_E, int flim_I, int hev_thresh);
38

  
39
void ff_vp8_v_loop_filter16_inner_neon(uint8_t *dst, int stride,
40
                                       int flim_E, int flim_I, int hev_thresh);
41
void ff_vp8_h_loop_filter16_inner_neon(uint8_t *dst, int stride,
42
                                       int flim_E, int flim_I, int hev_thresh);
43
void ff_vp8_v_loop_filter8uv_inner_neon(uint8_t *dstU, uint8_t *dstV,
44
                                        int stride, int flim_E, int flim_I,
45
                                        int hev_thresh);
46
void ff_vp8_h_loop_filter8uv_inner_neon(uint8_t *dstU, uint8_t *dstV,
47
                                        int stride, int flim_E, int flim_I,
48
                                        int hev_thresh);
49

  
50
void ff_vp8_v_loop_filter16_simple_neon(uint8_t *dst, int stride, int flim);
51
void ff_vp8_h_loop_filter16_simple_neon(uint8_t *dst, int stride, int flim);
52

  
53

  
54
#define VP8_MC(n)                                                       \
55
    void ff_put_vp8_##n##_neon(uint8_t *dst, int dststride,             \
56
                               uint8_t *src, int srcstride,             \
57
                               int h, int x, int y)
58

  
59
#define VP8_EPEL(w)                             \
60
    VP8_MC(pixels ## w);                        \
61
    VP8_MC(epel ## w ## _h4);                   \
62
    VP8_MC(epel ## w ## _h6);                   \
63
    VP8_MC(epel ## w ## _v4);                   \
64
    VP8_MC(epel ## w ## _h4v4);                 \
65
    VP8_MC(epel ## w ## _h6v4);                 \
66
    VP8_MC(epel ## w ## _v6);                   \
67
    VP8_MC(epel ## w ## _h4v6);                 \
68
    VP8_MC(epel ## w ## _h6v6)
69

  
70
VP8_EPEL(16);
71
VP8_EPEL(8);
72
VP8_EPEL(4);
73

  
74
VP8_MC(bilin16_h);
75
VP8_MC(bilin16_v);
76
VP8_MC(bilin16_hv);
77
VP8_MC(bilin8_h);
78
VP8_MC(bilin8_v);
79
VP8_MC(bilin8_hv);
80
VP8_MC(bilin4_h);
81
VP8_MC(bilin4_v);
82
VP8_MC(bilin4_hv);
83

  
84
av_cold void ff_vp8dsp_init_arm(VP8DSPContext *dsp)
85
{
86
    if (HAVE_NEON) {
87
        dsp->vp8_luma_dc_wht    = ff_vp8_luma_dc_wht_neon;
88
        dsp->vp8_luma_dc_wht_dc = ff_vp8_luma_dc_wht_dc_neon;
89

  
90
        dsp->vp8_idct_add       = ff_vp8_idct_add_neon;
91
        dsp->vp8_idct_dc_add    = ff_vp8_idct_dc_add_neon;
92
        dsp->vp8_idct_dc_add4y  = ff_vp8_idct_dc_add4y_neon;
93
        dsp->vp8_idct_dc_add4uv = ff_vp8_idct_dc_add4uv_neon;
94

  
95
        dsp->vp8_v_loop_filter16y = ff_vp8_v_loop_filter16_neon;
96
        dsp->vp8_h_loop_filter16y = ff_vp8_h_loop_filter16_neon;
97
        dsp->vp8_v_loop_filter8uv = ff_vp8_v_loop_filter8uv_neon;
98
        dsp->vp8_h_loop_filter8uv = ff_vp8_h_loop_filter8uv_neon;
99

  
100
        dsp->vp8_v_loop_filter16y_inner = ff_vp8_v_loop_filter16_inner_neon;
101
        dsp->vp8_h_loop_filter16y_inner = ff_vp8_h_loop_filter16_inner_neon;
102
        dsp->vp8_v_loop_filter8uv_inner = ff_vp8_v_loop_filter8uv_inner_neon;
103
        dsp->vp8_h_loop_filter8uv_inner = ff_vp8_h_loop_filter8uv_inner_neon;
104

  
105
        dsp->vp8_v_loop_filter_simple = ff_vp8_v_loop_filter16_simple_neon;
106
        dsp->vp8_h_loop_filter_simple = ff_vp8_h_loop_filter16_simple_neon;
107

  
108
        dsp->put_vp8_epel_pixels_tab[0][0][0] = ff_put_vp8_pixels16_neon;
109
        dsp->put_vp8_epel_pixels_tab[0][0][2] = ff_put_vp8_epel16_h6_neon;
110
        dsp->put_vp8_epel_pixels_tab[0][2][0] = ff_put_vp8_epel16_v6_neon;
111
        dsp->put_vp8_epel_pixels_tab[0][2][2] = ff_put_vp8_epel16_h6v6_neon;
112

  
113
        dsp->put_vp8_epel_pixels_tab[1][0][0] = ff_put_vp8_pixels8_neon;
114
        dsp->put_vp8_epel_pixels_tab[1][0][1] = ff_put_vp8_epel8_h4_neon;
115
        dsp->put_vp8_epel_pixels_tab[1][0][2] = ff_put_vp8_epel8_h6_neon;
116
        dsp->put_vp8_epel_pixels_tab[1][1][0] = ff_put_vp8_epel8_v4_neon;
117
        dsp->put_vp8_epel_pixels_tab[1][1][1] = ff_put_vp8_epel8_h4v4_neon;
118
        dsp->put_vp8_epel_pixels_tab[1][1][2] = ff_put_vp8_epel8_h6v4_neon;
119
        dsp->put_vp8_epel_pixels_tab[1][2][0] = ff_put_vp8_epel8_v6_neon;
120
        dsp->put_vp8_epel_pixels_tab[1][2][1] = ff_put_vp8_epel8_h4v6_neon;
121
        dsp->put_vp8_epel_pixels_tab[1][2][2] = ff_put_vp8_epel8_h6v6_neon;
122

  
123
        dsp->put_vp8_epel_pixels_tab[2][0][0] = ff_put_vp8_pixels4_neon;
124
        dsp->put_vp8_epel_pixels_tab[2][0][1] = ff_put_vp8_epel4_h4_neon;
125
        dsp->put_vp8_epel_pixels_tab[2][0][2] = ff_put_vp8_epel4_h6_neon;
126
        dsp->put_vp8_epel_pixels_tab[2][1][0] = ff_put_vp8_epel4_v4_neon;
127
        dsp->put_vp8_epel_pixels_tab[2][1][1] = ff_put_vp8_epel4_h4v4_neon;
128
        dsp->put_vp8_epel_pixels_tab[2][1][2] = ff_put_vp8_epel4_h6v4_neon;
129
        dsp->put_vp8_epel_pixels_tab[2][2][0] = ff_put_vp8_epel4_v6_neon;
130
        dsp->put_vp8_epel_pixels_tab[2][2][1] = ff_put_vp8_epel4_h4v6_neon;
131
        dsp->put_vp8_epel_pixels_tab[2][2][2] = ff_put_vp8_epel4_h6v6_neon;
132

  
133
        dsp->put_vp8_bilinear_pixels_tab[0][0][0] = ff_put_vp8_pixels16_neon;
134
        dsp->put_vp8_bilinear_pixels_tab[0][0][1] = ff_put_vp8_bilin16_h_neon;
135
        dsp->put_vp8_bilinear_pixels_tab[0][0][2] = ff_put_vp8_bilin16_h_neon;
136
        dsp->put_vp8_bilinear_pixels_tab[0][1][0] = ff_put_vp8_bilin16_v_neon;
137
        dsp->put_vp8_bilinear_pixels_tab[0][1][1] = ff_put_vp8_bilin16_hv_neon;
138
        dsp->put_vp8_bilinear_pixels_tab[0][1][2] = ff_put_vp8_bilin16_hv_neon;
139
        dsp->put_vp8_bilinear_pixels_tab[0][2][0] = ff_put_vp8_bilin16_v_neon;
140
        dsp->put_vp8_bilinear_pixels_tab[0][2][1] = ff_put_vp8_bilin16_hv_neon;
141
        dsp->put_vp8_bilinear_pixels_tab[0][2][2] = ff_put_vp8_bilin16_hv_neon;
142

  
143
        dsp->put_vp8_bilinear_pixels_tab[1][0][0] = ff_put_vp8_pixels8_neon;
144
        dsp->put_vp8_bilinear_pixels_tab[1][0][1] = ff_put_vp8_bilin8_h_neon;
145
        dsp->put_vp8_bilinear_pixels_tab[1][0][2] = ff_put_vp8_bilin8_h_neon;
146
        dsp->put_vp8_bilinear_pixels_tab[1][1][0] = ff_put_vp8_bilin8_v_neon;
147
        dsp->put_vp8_bilinear_pixels_tab[1][1][1] = ff_put_vp8_bilin8_hv_neon;
148
        dsp->put_vp8_bilinear_pixels_tab[1][1][2] = ff_put_vp8_bilin8_hv_neon;
149
        dsp->put_vp8_bilinear_pixels_tab[1][2][0] = ff_put_vp8_bilin8_v_neon;
150
        dsp->put_vp8_bilinear_pixels_tab[1][2][1] = ff_put_vp8_bilin8_hv_neon;
151
        dsp->put_vp8_bilinear_pixels_tab[1][2][2] = ff_put_vp8_bilin8_hv_neon;
152

  
153
        dsp->put_vp8_bilinear_pixels_tab[2][0][0] = ff_put_vp8_pixels4_neon;
154
        dsp->put_vp8_bilinear_pixels_tab[2][0][1] = ff_put_vp8_bilin4_h_neon;
155
        dsp->put_vp8_bilinear_pixels_tab[2][0][2] = ff_put_vp8_bilin4_h_neon;
156
        dsp->put_vp8_bilinear_pixels_tab[2][1][0] = ff_put_vp8_bilin4_v_neon;
157
        dsp->put_vp8_bilinear_pixels_tab[2][1][1] = ff_put_vp8_bilin4_hv_neon;
158
        dsp->put_vp8_bilinear_pixels_tab[2][1][2] = ff_put_vp8_bilin4_hv_neon;
159
        dsp->put_vp8_bilinear_pixels_tab[2][2][0] = ff_put_vp8_bilin4_v_neon;
160
        dsp->put_vp8_bilinear_pixels_tab[2][2][1] = ff_put_vp8_bilin4_hv_neon;
161
        dsp->put_vp8_bilinear_pixels_tab[2][2][2] = ff_put_vp8_bilin4_hv_neon;
162
    }
163
}
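
For orientation: the VP8_MC/VP8_EPEL macros above only declare prototypes for the assembly routines, and the table indices follow a [size][vertical][horizontal] pattern — size 0/1/2 selects 16/8/4-pixel-wide blocks, and 0/1/2 on the filter axes selects copy, 4-tap and 6-tap filtering, as can be read off the assignments. A minimal sketch of what that means in practice (the helper name below is hypothetical and not part of this revision):

#include <stdint.h>
#include "libavcodec/vp8dsp.h"

/* VP8_MC(epel8_h6v4) above expands to this prototype: */
void ff_put_vp8_epel8_h6v4_neon(uint8_t *dst, int dststride,
                                uint8_t *src, int srcstride,
                                int h, int x, int y);

/* Hypothetical helper, for illustration only: call the entry that
 * ff_vp8dsp_init_arm() installed at [size=1 (8 px)][v=1 (4-tap)][h=2 (6-tap)]. */
static void put_8px_h6v4(VP8DSPContext *dsp, uint8_t *dst, int dststride,
                         uint8_t *src, int srcstride, int h, int mx, int my)
{
    dsp->put_vp8_epel_pixels_tab[1][1][2](dst, dststride, src, srcstride,
                                          h, mx, my);
}
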
libavcodec/arm/vp8dsp_neon.S
1
/**
2
 * VP8 NEON optimisations
3
 *
4
 * Copyright (c) 2010 Rob Clark <rob@ti.com>
5
 * Copyright (c) 2011 Mans Rullgard <mans@mansr.com>
6
 *
7
 * This file is part of FFmpeg.
8
 *
9
 * FFmpeg is free software; you can redistribute it and/or
10
 * modify it under the terms of the GNU Lesser General Public
11
 * License as published by the Free Software Foundation; either
12
 * version 2.1 of the License, or (at your option) any later version.
13
 *
14
 * FFmpeg is distributed in the hope that it will be useful,
15
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
17
 * Lesser General Public License for more details.
18
 *
19
 * You should have received a copy of the GNU Lesser General Public
20
 * License along with FFmpeg; if not, write to the Free Software
21
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
22
 */
23

  
24
#include "asm.S"
25

  
26
function ff_vp8_luma_dc_wht_neon, export=1
27
        vld1.16         {q0-q1},  [r1,:128]
28
        vmov.i16        q15, #0
29

  
30
        vadd.i16        d4,  d0,  d3
31
        vadd.i16        d6,  d1,  d2
32
        vst1.16         {q15},    [r1,:128]!
33
        vsub.i16        d7,  d1,  d2
34
        vsub.i16        d5,  d0,  d3
35
        vst1.16         {q15},    [r1,:128]
36
        vadd.i16        q0,  q2,  q3
37
        vsub.i16        q1,  q2,  q3
38

  
39
        vmov.i16        q8, #3
40

  
41
        vtrn.32         d0,  d2
42
        vtrn.32         d1,  d3
43
        vtrn.16         d0,  d1
44
        vtrn.16         d2,  d3
45

  
46
        vadd.i16        d0,  d0,  d16
47

  
48
        vadd.i16        d4,  d0,  d3
49
        vadd.i16        d6,  d1,  d2
50
        vsub.i16        d7,  d1,  d2
51
        vsub.i16        d5,  d0,  d3
52
        vadd.i16        q0,  q2,  q3
53
        vsub.i16        q1,  q2,  q3
54

  
55
        vshr.s16        q0,  q0,  #3
56
        vshr.s16        q1,  q1,  #3
57

  
58
        mov             r3,  #32
59
        vst1.16         {d0[0]},  [r0,:16], r3
60
        vst1.16         {d1[0]},  [r0,:16], r3
61
        vst1.16         {d2[0]},  [r0,:16], r3
62
        vst1.16         {d3[0]},  [r0,:16], r3
63
        vst1.16         {d0[1]},  [r0,:16], r3
64
        vst1.16         {d1[1]},  [r0,:16], r3
65
        vst1.16         {d2[1]},  [r0,:16], r3
66
        vst1.16         {d3[1]},  [r0,:16], r3
67
        vst1.16         {d0[2]},  [r0,:16], r3
68
        vst1.16         {d1[2]},  [r0,:16], r3
69
        vst1.16         {d2[2]},  [r0,:16], r3
70
        vst1.16         {d3[2]},  [r0,:16], r3
71
        vst1.16         {d0[3]},  [r0,:16], r3
72
        vst1.16         {d1[3]},  [r0,:16], r3
73
        vst1.16         {d2[3]},  [r0,:16], r3
74
        vst1.16         {d3[3]},  [r0,:16], r3
75

  
76
        bx              lr
77
endfunc
78

  
79
function ff_vp8_luma_dc_wht_dc_neon, export=1
80
        ldrsh           r2,  [r1]
81
        mov             r3,  #0
82
        add             r2,  r2,  #3
83
        strh            r3,  [r1]
84
        asr             r2,  r2,  #3
85
    .rept 16
86
        strh            r2,  [r0], #32
87
    .endr
88
        bx              lr
89
endfunc
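
A scalar rendering of what this routine computes may help when reading the stores above (a sketch, assuming DCTELEM is int16_t as in this FFmpeg version; the 32-byte store stride walks the DC coefficient of each of the 16 luma sub-blocks):

static void vp8_luma_dc_wht_dc_ref(int16_t block[4][4][16], int16_t dc[16])
{
    int i, j, val = (dc[0] + 3) >> 3;   /* ldrsh; add #3; asr #3 */
    dc[0] = 0;                          /* strh r3, [r1] clears the input DC */
    for (i = 0; i < 4; i++)
        for (j = 0; j < 4; j++)
            block[i][j][0] = val;       /* strh r2, [r0], #32 */
}
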
90

  
91
function ff_vp8_idct_add_neon, export=1
92
        vld1.16         {q0-q1},  [r1,:128]
93
        movw            r3,  #20091
94
        movt            r3,  #35468/2
95
        vdup.32         d4,  r3
96

  
97
        vmull.s16       q12, d1,  d4[0]
98
        vmull.s16       q13, d3,  d4[0]
99
        vqdmulh.s16     d20, d1,  d4[1]
100
        vqdmulh.s16     d23, d3,  d4[1]
101
        vshrn.s32       d21, q12, #16
102
        vshrn.s32       d22, q13, #16
103
        vadd.s16        d21, d21, d1
104
        vadd.s16        d22, d22, d3
105

  
106
        vadd.s16        d16, d0,  d2
107
        vsub.s16        d17, d0,  d2
108
        vadd.s16        d18, d21, d23
109
        vsub.s16        d19, d20, d22
110
        vadd.s16        q0,  q8,  q9
111
        vsub.s16        q1,  q8,  q9
112

  
113
        vtrn.32         d0,  d3
114
        vtrn.32         d1,  d2
115
        vtrn.16         d0,  d1
116
        vtrn.16         d3,  d2
117

  
118
        vmov.i16        q15, #0
119
        vmull.s16       q12, d1,  d4[0]
120
        vst1.16         {q15},    [r1,:128]!
121
        vmull.s16       q13, d2,  d4[0]
122
        vst1.16         {q15},    [r1,:128]
123
        vqdmulh.s16     d21, d1,  d4[1]
124
        vqdmulh.s16     d23, d2,  d4[1]
125
        vshrn.s32       d20, q12, #16
126
        vshrn.s32       d22, q13, #16
127
        vadd.i16        d20, d20, d1
128
        vadd.i16        d22, d22, d2
129

  
130
        vadd.i16        d16, d0,  d3
131
        vsub.i16        d17, d0,  d3
132
        vadd.i16        d18, d20, d23
133
        vld1.32         {d20[]},  [r0,:32], r2
134
        vsub.i16        d19, d21, d22
135
        vld1.32         {d22[]},  [r0,:32], r2
136
        vadd.s16        q0,  q8,  q9
137
        vld1.32         {d23[]},  [r0,:32], r2
138
        vsub.s16        q1,  q8,  q9
139
        vld1.32         {d21[]},  [r0,:32], r2
140
        vrshr.s16       q0,  q0,  #3
141
        vtrn.32         q10, q11
142
        vrshr.s16       q1,  q1,  #3
143

  
144
        sub             r0,  r0,  r2,  lsl #2
145

  
146
        vtrn.32         d0,  d3
147
        vtrn.32         d1,  d2
148
        vtrn.16         d0,  d1
149
        vtrn.16         d3,  d2
150

  
151
        vaddw.u8        q0,  q0,  d20
152
        vaddw.u8        q1,  q1,  d21
153
        vqmovun.s16     d0,  q0
154
        vqmovun.s16     d1,  q1
155

  
156
        vst1.32         {d0[0]},  [r0,:32], r2
157
        vst1.32         {d0[1]},  [r0,:32], r2
158
        vst1.32         {d1[1]},  [r0,:32], r2
159
        vst1.32         {d1[0]},  [r0,:32], r2
160

  
161
        bx              lr
162
endfunc
163

  
164
function ff_vp8_idct_dc_add_neon, export=1
165
        mov             r3,  #0
166
        ldrsh           r12, [r1]
167
        strh            r3,  [r1]
168
        vdup.16         q1,  r12
169
        vrshr.s16       q1,  q1,  #3
170
        vld1.32         {d0[]},   [r0,:32], r2
171
        vld1.32         {d1[]},   [r0,:32], r2
172
        vld1.32         {d0[1]},  [r0,:32], r2
173
        vld1.32         {d1[1]},  [r0,:32], r2
174
        vaddw.u8        q2,  q1,  d0
175
        vaddw.u8        q3,  q1,  d1
176
        sub             r0,  r0,  r2, lsl #2
177
        vqmovun.s16     d0,  q2
178
        vqmovun.s16     d1,  q3
179
        vst1.32         {d0[0]},  [r0,:32], r2
180
        vst1.32         {d1[0]},  [r0,:32], r2
181
        vst1.32         {d0[1]},  [r0,:32], r2
182
        vst1.32         {d1[1]},  [r0,:32], r2
183
        bx              lr
184
endfunc
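
For reference while reading the vector code, this is the scalar operation being performed (a sketch: rounding shift of the DC, then a saturated add over the 4x4 block, with the coefficient cleared in place):

static void vp8_idct_dc_add_ref(uint8_t *dst, int16_t block[16], int stride)
{
    int i, j, dc = (block[0] + 4) >> 3;              /* vrshr.s16 #3 (rounding) */
    block[0] = 0;                                    /* strh r3, [r1] */
    for (i = 0; i < 4; i++, dst += stride)
        for (j = 0; j < 4; j++) {
            int v = dst[j] + dc;                     /* vaddw.u8 */
            dst[j] = v < 0 ? 0 : v > 255 ? 255 : v;  /* vqmovun.s16 */
        }
}
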
185

  
186
function ff_vp8_idct_dc_add4uv_neon, export=1
187
        vmov.i16        d0,  #0
188
        mov             r3,  #32
189
        vld1.16         {d16[]},  [r1,:16]
190
        vst1.16         {d0[0]},  [r1,:16], r3
191
        vld1.16         {d17[]},  [r1,:16]
192
        vst1.16         {d0[0]},  [r1,:16], r3
193
        vld1.16         {d18[]},  [r1,:16]
194
        vst1.16         {d0[0]},  [r1,:16], r3
195
        vld1.16         {d19[]},  [r1,:16]
196
        vst1.16         {d0[0]},  [r1,:16], r3
197
        mov             r3,  r0
198
        vrshr.s16       q8,  q8,  #3            @ dc >>= 3
199
        vld1.8          {d0},     [r0,:64], r2
200
        vrshr.s16       q9,  q9,  #3
201
        vld1.8          {d1},     [r0,:64], r2
202
        vaddw.u8        q10, q8,  d0
203
        vld1.8          {d2},     [r0,:64], r2
204
        vaddw.u8        q0,  q8,  d1
205
        vld1.8          {d3},     [r0,:64], r2
206
        vaddw.u8        q11, q8,  d2
207
        vld1.8          {d4},     [r0,:64], r2
208
        vaddw.u8        q1,  q8,  d3
209
        vld1.8          {d5},     [r0,:64], r2
210
        vaddw.u8        q12, q9,  d4
211
        vld1.8          {d6},     [r0,:64], r2
212
        vaddw.u8        q2,  q9,  d5
213
        vld1.8          {d7},     [r0,:64], r2
214
        vaddw.u8        q13, q9,  d6
215
        vqmovun.s16     d20, q10
216
        vaddw.u8        q3,  q9,  d7
217
        vqmovun.s16     d21, q0
218
        vqmovun.s16     d22, q11
219
        vst1.8          {d20},    [r3,:64], r2
220
        vqmovun.s16     d23, q1
221
        vst1.8          {d21},    [r3,:64], r2
222
        vqmovun.s16     d24, q12
223
        vst1.8          {d22},    [r3,:64], r2
224
        vqmovun.s16     d25, q2
225
        vst1.8          {d23},    [r3,:64], r2
226
        vqmovun.s16     d26, q13
227
        vst1.8          {d24},    [r3,:64], r2
228
        vqmovun.s16     d27, q3
229
        vst1.8          {d25},    [r3,:64], r2
230
        vst1.8          {d26},    [r3,:64], r2
231
        vst1.8          {d27},    [r3,:64], r2
232

  
233
        bx              lr
234
endfunc
235

  
236
function ff_vp8_idct_dc_add4y_neon, export=1
237
        vmov.i16        d0,  #0
238
        mov             r3,  #32
239
        vld1.16         {d16[]},  [r1,:16]
240
        vst1.16         {d0[0]},  [r1,:16], r3
241
        vld1.16         {d17[]},  [r1,:16]
242
        vst1.16         {d0[0]},  [r1,:16], r3
243
        vld1.16         {d18[]},  [r1,:16]
244
        vst1.16         {d0[0]},  [r1,:16], r3
245
        vld1.16         {d19[]},  [r1,:16]
246
        vst1.16         {d0[0]},  [r1,:16], r3
247
        vrshr.s16       q8,  q8,  #3            @ dc >>= 3
248
        vld1.8          {q0},     [r0,:128], r2
249
        vrshr.s16       q9,  q9,  #3
250
        vld1.8          {q1},     [r0,:128], r2
251
        vaddw.u8        q10, q8,  d0
252
        vld1.8          {q2},     [r0,:128], r2
253
        vaddw.u8        q0,  q9,  d1
254
        vld1.8          {q3},     [r0,:128], r2
255
        vaddw.u8        q11, q8,  d2
256
        vaddw.u8        q1,  q9,  d3
257
        vaddw.u8        q12, q8,  d4
258
        vaddw.u8        q2,  q9,  d5
259
        vaddw.u8        q13, q8,  d6
260
        vaddw.u8        q3,  q9,  d7
261
        sub             r0,  r0,  r2,  lsl #2
262
        vqmovun.s16     d20, q10
263
        vqmovun.s16     d21, q0
264
        vqmovun.s16     d22, q11
265
        vqmovun.s16     d23, q1
266
        vqmovun.s16     d24, q12
267
        vst1.8          {q10},    [r0,:128], r2
268
        vqmovun.s16     d25, q2
269
        vst1.8          {q11},    [r0,:128], r2
270
        vqmovun.s16     d26, q13
271
        vst1.8          {q12},    [r0,:128], r2
272
        vqmovun.s16     d27, q3
273
        vst1.8          {q13},    [r0,:128], r2
274

  
275
        bx              lr
276
endfunc
277

  
278
@ Register layout:
279
@   P3..Q3 -> q0..q7
280
@   flim_E -> q14
281
@   flim_I -> q15
282
@   hev_thresh -> r12
283
@
284
.macro  vp8_loop_filter, inner=0, simple=0
285
    .if \simple
286
        vabd.u8         q9,  q3,  q4            @ abs(P0-Q0)
287
        vabd.u8         q15, q2,  q5            @ abs(P1-Q1)
288
        vqadd.u8        q9,  q9,  q9            @ abs(P0-Q0) * 2
289
        vshr.u8         q10, q15, #1            @ abs(P1-Q1) / 2
290
        vqadd.u8        q11, q9,  q10           @ (abs(P0-Q0)*2) + (abs(P1-Q1)/2)
291
        vmov.i8         q13, #0x80
292
        vcle.u8         q8,  q11, q14           @ (abs(P0-Q0)*2) + (abs(P1-Q1)/2) <= flim
293
    .else
294
        @ calculate hev and normal_limit:
295
        vabd.u8         q12, q2,  q3            @ abs(P1-P0)
296
        vabd.u8         q13, q5,  q4            @ abs(Q1-Q0)
297
        vabd.u8         q10, q0,  q1            @ abs(P3-P2)
298
        vabd.u8         q11, q1,  q2            @ abs(P2-P1)
299
        vcle.u8         q8,  q12, q15           @ abs(P1-P0) <= flim_I
300
        vcle.u8         q9,  q13, q15           @ abs(Q1-Q0) <= flim_I
301
        vcle.u8         q10, q10, q15           @ abs(P3-P2) <= flim_I
302
        vcle.u8         q11, q11, q15           @ abs(P2-P1) <= flim_I
303
        vand            q8,  q8,  q9
304
        vabd.u8         q9,  q7,  q6            @ abs(Q3-Q2)
305
        vand            q8,  q8,  q11
306
        vabd.u8         q11, q6,  q5            @ abs(Q2-Q1)
307
        vand            q8,  q8,  q10
308
        vcle.u8         q10, q9,  q15           @ abs(Q3-Q2) <= flim_I
309
        vcle.u8         q11, q11, q15           @ abs(Q2-Q1) <= flim_I
310
        vabd.u8         q9,  q3,  q4            @ abs(P0-Q0)
311
        vabd.u8         q15, q2,  q5            @ abs(P1-Q1)
312
        vand            q8,  q8,  q10
313
        vqadd.u8        q9,  q9,  q9            @ abs(P0-Q0) * 2
314
        vand            q8,  q8,  q11
315
        vshr.u8         q10, q15, #1            @ abs(P1-Q1) / 2
316
        vdup.8          q15, r12                @ hev_thresh
317
        vqadd.u8        q11, q9,  q10           @ (abs(P0-Q0)*2) + (abs(P1-Q1)/2)
318
        vcgt.u8         q12, q12, q15           @ abs(P1-P0) > hev_thresh
319
        vcle.u8         q11, q11, q14           @ (abs(P0-Q0)*2) + (abs(P1-Q1)/2) <= flim_E
320
        vcgt.u8         q14, q13, q15           @ abs(Q1-Q0) > hev_thresh
321
        vand            q8,  q8,  q11
322
        vmov.i8         q13, #0x80
323
        vorr            q9,  q12, q14
324
    .endif
325

  
326
        @ at this point:
327
        @   q8: normal_limit
328
        @   q9: hev
329

  
330
        @ convert to signed value:
331
        veor            q3,  q3,  q13           @ PS0 = P0 ^ 0x80
332
        veor            q4,  q4,  q13           @ QS0 = Q0 ^ 0x80
333

  
334
        vmov.i16        q12, #3
335
        vsubl.s8        q10, d8,  d6            @ QS0 - PS0
336
        vsubl.s8        q11, d9,  d7            @   (widened to 16bit)
337
        veor            q2,  q2,  q13           @ PS1 = P1 ^ 0x80
338
        veor            q5,  q5,  q13           @ QS1 = Q1 ^ 0x80
339
        vmul.i16        q10, q10, q12           @ w = 3 * (QS0 - PS0)
340
        vmul.i16        q11, q11, q12
341

  
342
        vqsub.s8        q12, q2,  q5            @ clamp(PS1-QS1)
343
        vmov.i8         q14, #4
344
        vmov.i8         q15, #3
345
    .if \inner
346
        vand            q12, q12, q9            @ if(hev) w += clamp(PS1-QS1)
347
    .endif
348
        vaddw.s8        q10, q10, d24           @ w += clamp(PS1-QS1)
349
        vaddw.s8        q11, q11, d25
350
        vqmovn.s16      d20, q10                @ narrow result back into q10
351
        vqmovn.s16      d21, q11
352
    .if !\inner && !\simple
353
        veor            q1,  q1,  q13           @ PS2 = P2 ^ 0x80
354
        veor            q6,  q6,  q13           @ QS2 = Q2 ^ 0x80
355
    .endif
356
        vand            q10, q10, q8            @ w &= normal_limit
357

  
358
        @ registers used at this point..
359
        @   q0 -> P3  (don't corrupt)
360
        @   q1-q6 -> PS2-QS2
361
        @   q7 -> Q3  (don't corrupt)
362
        @   q9 -> hev
363
        @   q10 -> w
364
        @   q13 -> #0x80
365
        @   q14 -> #4
366
        @   q15 -> #3
367
        @   q8, q11, q12 -> unused
368

  
369
        @ filter_common:   is4tap==1
370
        @   c1 = clamp(w + 4) >> 3;
371
        @   c2 = clamp(w + 3) >> 3;
372
        @   Q0 = s2u(QS0 - c1);
373
        @   P0 = s2u(PS0 + c2);
374

  
375
    .if \simple
376
        vqadd.s8        q11, q10, q14           @ c1 = clamp((w&hev)+4)
377
        vqadd.s8        q12, q10, q15           @ c2 = clamp((w&hev)+3)
378
        vshr.s8         q11, q11, #3            @ c1 >>= 3
379
        vshr.s8         q12, q12, #3            @ c2 >>= 3
380
        vqsub.s8        q4,  q4,  q11           @ QS0 = clamp(QS0-c1)
381
        vqadd.s8        q3,  q3,  q12           @ PS0 = clamp(PS0+c2)
382
        veor            q4,  q4,  q13           @ Q0 = QS0 ^ 0x80
383
        veor            q3,  q3,  q13           @ P0 = PS0 ^ 0x80
384
        veor            q5,  q5,  q13           @ Q1 = QS1 ^ 0x80
385
        veor            q2,  q2,  q13           @ P1 = PS1 ^ 0x80
386
    .elseif \inner
387
        @ the !is4tap case of filter_common, only used for inner blocks
388
        @   c3 = ((c1&~hev) + 1) >> 1;
389
        @   Q1 = s2u(QS1 - c3);
390
        @   P1 = s2u(PS1 + c3);
391
        vqadd.s8        q11, q10, q14           @ c1 = clamp((w&hev)+4)
392
        vqadd.s8        q12, q10, q15           @ c2 = clamp((w&hev)+3)
393
        vshr.s8         q11, q11, #3            @ c1 >>= 3
394
        vshr.s8         q12, q12, #3            @ c2 >>= 3
395
        vqsub.s8        q4,  q4,  q11           @ QS0 = clamp(QS0-c1)
396
        vqadd.s8        q3,  q3,  q12           @ PS0 = clamp(PS0+c2)
397
        vbic            q11, q11, q9            @ c1 & ~hev
398
        veor            q4,  q4,  q13           @ Q0 = QS0 ^ 0x80
399
        vrshr.s8        q11, q11, #1            @ c3 >>= 1
400
        veor            q3,  q3,  q13           @ P0 = PS0 ^ 0x80
401
        vqsub.s8        q5,  q5,  q11           @ QS1 = clamp(QS1-c3)
402
        vqadd.s8        q2,  q2,  q11           @ PS1 = clamp(PS1+c3)
403
        veor            q5,  q5,  q13           @ Q1 = QS1 ^ 0x80
404
        veor            q2,  q2,  q13           @ P1 = PS1 ^ 0x80
405
    .else
406
        vand            q12, q10, q9            @ w & hev
407
        vqadd.s8        q11, q12, q14           @ c1 = clamp((w&hev)+4)
408
        vqadd.s8        q12, q12, q15           @ c2 = clamp((w&hev)+3)
409
        vshr.s8         q11, q11, #3            @ c1 >>= 3
410
        vshr.s8         q12, q12, #3            @ c2 >>= 3
411
        vbic            q10, q10, q9            @ w &= ~hev
412
        vqsub.s8        q4,  q4,  q11           @ QS0 = clamp(QS0-c1)
413
        vqadd.s8        q3,  q3,  q12           @ PS0 = clamp(PS0+c2)
414

  
415
        @ filter_mbedge:
416
        @   a = clamp((27*w + 63) >> 7);
417
        @   Q0 = s2u(QS0 - a);
418
        @   P0 = s2u(PS0 + a);
419
        @   a = clamp((18*w + 63) >> 7);
420
        @   Q1 = s2u(QS1 - a);
421
        @   P1 = s2u(PS1 + a);
422
        @   a = clamp((9*w + 63) >> 7);
423
        @   Q2 = s2u(QS2 - a);
424
        @   P2 = s2u(PS2 + a);
425
        vmov.i16        q9,  #63
426
        vshll.s8        q14, d20, #3
427
        vshll.s8        q15, d21, #3
428
        vaddw.s8        q14, q14, d20
429
        vaddw.s8        q15, q15, d21
430
        vadd.s16        q8,  q9,  q14
431
        vadd.s16        q9,  q9,  q15           @  9*w + 63
432
        vadd.s16        q11, q8,  q14
433
        vadd.s16        q12, q9,  q15           @ 18*w + 63
434
        vadd.s16        q14, q11, q14
435
        vadd.s16        q15, q12, q15           @ 27*w + 63
436
        vqshrn.s16      d16, q8,  #7
437
        vqshrn.s16      d17, q9,  #7            @ clamp(( 9*w + 63)>>7)
438
        vqshrn.s16      d22, q11, #7
439
        vqshrn.s16      d23, q12, #7            @ clamp((18*w + 63)>>7)
440
        vqshrn.s16      d28, q14, #7
441
        vqshrn.s16      d29, q15, #7            @ clamp((27*w + 63)>>7)
442
        vqadd.s8        q1,  q1,  q8            @ PS2 = clamp(PS2+a)
443
        vqsub.s8        q6,  q6,  q8            @ QS2 = clamp(QS2-a)
444
        vqadd.s8        q2,  q2,  q11           @ PS1 = clamp(PS1+a)
445
        vqsub.s8        q5,  q5,  q11           @ QS1 = clamp(QS1-a)
446
        vqadd.s8        q3,  q3,  q14           @ PS0 = clamp(PS0+a)
447
        vqsub.s8        q4,  q4,  q14           @ QS0 = clamp(QS0-a)
448
        veor            q3,  q3,  q13           @ P0 = PS0 ^ 0x80
449
        veor            q4,  q4,  q13           @ Q0 = QS0 ^ 0x80
450
        veor            q2,  q2,  q13           @ P1 = PS1 ^ 0x80
451
        veor            q5,  q5,  q13           @ Q1 = QS1 ^ 0x80
452
        veor            q1,  q1,  q13           @ P2 = PS2 ^ 0x80
453
        veor            q6,  q6,  q13           @ Q2 = QS2 ^ 0x80
454
    .endif
455
.endm
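
The arithmetic in the comments above maps onto the following scalar sketch of the simple-filter path (illustrative only; it omits the w &= normal_limit masking and the widened 16-bit intermediate that the NEON version uses):

static inline int clamp_s8(int v) { return v < -128 ? -128 : v > 127 ? 127 : v; }

/* One pixel position across the edge: p1 p0 | q0 q1 (unsigned bytes). */
static void vp8_simple_filter_px(uint8_t *p1, uint8_t *p0, uint8_t *q0, uint8_t *q1)
{
    int ps1 = *p1 - 0x80, ps0 = *p0 - 0x80;     /* veor ..., q13 (#0x80) */
    int qs0 = *q0 - 0x80, qs1 = *q1 - 0x80;
    int w  = clamp_s8(3 * (qs0 - ps0) + clamp_s8(ps1 - qs1));
    int c1 = clamp_s8(w + 4) >> 3;              /* c1 = clamp(w + 4) >> 3 */
    int c2 = clamp_s8(w + 3) >> 3;              /* c2 = clamp(w + 3) >> 3 */
    *q0 = clamp_s8(qs0 - c1) + 0x80;            /* Q0 = s2u(QS0 - c1) */
    *p0 = clamp_s8(ps0 + c2) + 0x80;            /* P0 = s2u(PS0 + c2) */
}
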
456

  
457
.macro transpose8x16matrix
458
        vtrn.32         q0,   q4
459
        vtrn.32         q1,   q5
460
        vtrn.32         q2,   q6
461
        vtrn.32         q3,   q7
462

  
463
        vtrn.16         q0,   q2
464
        vtrn.16         q1,   q3
465
        vtrn.16         q4,   q6
466
        vtrn.16         q5,   q7
467

  
468
        vtrn.8          q0,   q1
469
        vtrn.8          q2,   q3
470
        vtrn.8          q4,   q5
471
        vtrn.8          q6,   q7
472
.endm
473

  
474
.macro  vp8_v_loop_filter16 name, inner=0, simple=0
475
function ff_vp8_v_loop_filter16\name\()_neon, export=1
476
        vpush           {q4-q7}
477
        sub             r0,  r0,  r1,  lsl #1+!\simple
478

  
479
        @ Load pixels:
480
    .if !\simple
481
        ldr             r12, [sp, #64]          @ hev_thresh
482
        vld1.8          {q0},     [r0,:128], r1 @ P3
483
        vld1.8          {q1},     [r0,:128], r1 @ P2
484
    .endif
485
        vld1.8          {q2},     [r0,:128], r1 @ P1
486
        vld1.8          {q3},     [r0,:128], r1 @ P0
487
        vld1.8          {q4},     [r0,:128], r1 @ Q0
488
        vld1.8          {q5},     [r0,:128], r1 @ Q1
489
    .if !\simple
490
        vld1.8          {q6},     [r0,:128], r1 @ Q2
491
        vld1.8          {q7},     [r0,:128]     @ Q3
492
        vdup.8          q15, r3                 @ flim_I
493
    .endif
494
        vdup.8          q14, r2                 @ flim_E
495

  
496
        vp8_loop_filter inner=\inner, simple=\simple
497

  
498
        @ back up to P2:  dst -= stride * 6
499
        sub             r0,  r0,  r1,  lsl #2
500
    .if !\simple
501
        sub             r0,  r0,  r1,  lsl #1
502

  
503
        @ Store pixels:
504
        vst1.8          {q1},     [r0,:128], r1 @ P2
505
    .endif
506
        vst1.8          {q2},     [r0,:128], r1 @ P1
507
        vst1.8          {q3},     [r0,:128], r1 @ P0
508
        vst1.8          {q4},     [r0,:128], r1 @ Q0
509
        vst1.8          {q5},     [r0,:128], r1 @ Q1
510
    .if !\simple
511
        vst1.8          {q6},     [r0,:128]     @ Q2
512
    .endif
513

  
514
        vpop            {q4-q7}
515
        bx              lr
516
endfunc
517
.endm
518

  
519
vp8_v_loop_filter16
520
vp8_v_loop_filter16 _inner,  inner=1
521
vp8_v_loop_filter16 _simple, simple=1
522

  
523
.macro  vp8_v_loop_filter8uv name, inner=0
524
function ff_vp8_v_loop_filter8uv\name\()_neon, export=1
525
        vpush           {q4-q7}
526
        sub             r0,  r0,  r2,  lsl #2
527
        sub             r1,  r1,  r2,  lsl #2
528
        ldr             r12, [sp, #64]          @ flim_I
529

  
530
        @ Load pixels:
531
        vld1.8          {d0},     [r0,:64], r2  @ P3
532
        vld1.8          {d1},     [r1,:64], r2  @ P3
533
        vld1.8          {d2},     [r0,:64], r2  @ P2
534
        vld1.8          {d3},     [r1,:64], r2  @ P2
535
        vld1.8          {d4},     [r0,:64], r2  @ P1
536
        vld1.8          {d5},     [r1,:64], r2  @ P1
537
        vld1.8          {d6},     [r0,:64], r2  @ P0
538
        vld1.8          {d7},     [r1,:64], r2  @ P0
539
        vld1.8          {d8},     [r0,:64], r2  @ Q0
540
        vld1.8          {d9},     [r1,:64], r2  @ Q0
541
        vld1.8          {d10},    [r0,:64], r2  @ Q1
542
        vld1.8          {d11},    [r1,:64], r2  @ Q1
543
        vld1.8          {d12},    [r0,:64], r2  @ Q2
544
        vld1.8          {d13},    [r1,:64], r2  @ Q2
545
        vld1.8          {d14},    [r0,:64]      @ Q3
546
        vld1.8          {d15},    [r1,:64]      @ Q3
547

  
548
        vdup.8          q14, r3                 @ flim_E
549
        vdup.8          q15, r12                @ flim_I
550
        ldr             r12, [sp, #68]          @ hev_thresh
551

  
552
        vp8_loop_filter inner=\inner
553

  
554
        @ back up to P2:  u,v -= stride * 6
555
        sub             r0,  r0,  r2,  lsl #2
556
        sub             r1,  r1,  r2,  lsl #2
557
        sub             r0,  r0,  r2,  lsl #1
558
        sub             r1,  r1,  r2,  lsl #1
559

  
560
        @ Store pixels:
561
        vst1.8          {d2},     [r0,:64], r2  @ P2
562
        vst1.8          {d3},     [r1,:64], r2  @ P2
563
        vst1.8          {d4},     [r0,:64], r2  @ P1
564
        vst1.8          {d5},     [r1,:64], r2  @ P1
565
        vst1.8          {d6},     [r0,:64], r2  @ P0
566
        vst1.8          {d7},     [r1,:64], r2  @ P0
567
        vst1.8          {d8},     [r0,:64], r2  @ Q0
568
        vst1.8          {d9},     [r1,:64], r2  @ Q0
569
        vst1.8          {d10},    [r0,:64], r2  @ Q1
570
        vst1.8          {d11},    [r1,:64], r2  @ Q1
571
        vst1.8          {d12},    [r0,:64]      @ Q2
572
        vst1.8          {d13},    [r1,:64]      @ Q2
573

  
574
        vpop            {q4-q7}
575
        bx              lr
576
endfunc
577
.endm
578

  
579
vp8_v_loop_filter8uv
580
vp8_v_loop_filter8uv _inner, inner=1
581

  
582
.macro  vp8_h_loop_filter16 name, inner=0, simple=0
583
function ff_vp8_h_loop_filter16\name\()_neon, export=1
584
        vpush           {q4-q7}
585
        sub             r0,  r0,  #4
586
    .if !\simple
587
        ldr             r12, [sp, #64]          @ hev_thresh
588
    .endif
589

  
590
        @ Load pixels:
591
        vld1.8          {d0},     [r0], r1      @ load first 8-line src data
592
        vld1.8          {d2},     [r0], r1
593
        vld1.8          {d4},     [r0], r1
594
        vld1.8          {d6},     [r0], r1
595
        vld1.8          {d8},     [r0], r1
596
        vld1.8          {d10},    [r0], r1
597
        vld1.8          {d12},    [r0], r1
598
        vld1.8          {d14},    [r0], r1
599
        vld1.8          {d1},     [r0], r1      @ load second 8-line src data
600
        vld1.8          {d3},     [r0], r1
601
        vld1.8          {d5},     [r0], r1
602
        vld1.8          {d7},     [r0], r1
603
        vld1.8          {d9},     [r0], r1
604
        vld1.8          {d11},    [r0], r1
605
        vld1.8          {d13},    [r0], r1
606
        vld1.8          {d15},    [r0], r1
607

  
608
        transpose8x16matrix
609

  
610
        vdup.8          q14, r2                 @ flim_E
611
    .if !\simple
612
        vdup.8          q15, r3                 @ flim_I
613
    .endif
614

  
615
        vp8_loop_filter inner=\inner, simple=\simple
616

  
617
        sub             r0,  r0,  r1, lsl #4    @ backup 16 rows
618

  
619
        transpose8x16matrix
620

  
621
        @ Store pixels:
622
        vst1.8          {d0},     [r0],     r1
623
        vst1.8          {d2},     [r0],     r1
624
        vst1.8          {d4},     [r0],     r1
625
        vst1.8          {d6},     [r0],     r1
626
        vst1.8          {d8},     [r0],     r1
627
        vst1.8          {d10},    [r0],     r1
628
        vst1.8          {d12},    [r0],     r1
629
        vst1.8          {d14},    [r0],     r1
630
        vst1.8          {d1},     [r0],     r1
631
        vst1.8          {d3},     [r0],     r1
632
        vst1.8          {d5},     [r0],     r1
633
        vst1.8          {d7},     [r0],     r1
634
        vst1.8          {d9},     [r0],     r1
635
        vst1.8          {d11},    [r0],     r1
636
        vst1.8          {d13},    [r0],     r1
637
        vst1.8          {d15},    [r0]
638

  
639
        vpop            {q4-q7}
640
        bx              lr
641
endfunc
642
.endm
643

  
644
vp8_h_loop_filter16
645
vp8_h_loop_filter16 _inner,  inner=1
646
vp8_h_loop_filter16 _simple, simple=1
647

  
648
.macro  vp8_h_loop_filter8uv name, inner=0
649
function ff_vp8_h_loop_filter8uv\name\()_neon, export=1
650
        vpush           {q4-q7}
651
        sub             r0,  r0,  #4
652
        sub             r1,  r1,  #4
653
        ldr             r12, [sp, #64]          @ flim_I
654

  
655
        @ Load pixels:
656
        vld1.8          {d0},     [r0], r2      @ load u
657
        vld1.8          {d1},     [r1], r2      @ load v
658
        vld1.8          {d2},     [r0], r2
659
        vld1.8          {d3},     [r1], r2
660
        vld1.8          {d4},     [r0], r2
661
        vld1.8          {d5},     [r1], r2
662
        vld1.8          {d6},     [r0], r2
663
        vld1.8          {d7},     [r1], r2
664
        vld1.8          {d8},     [r0], r2
665
        vld1.8          {d9},     [r1], r2
666
        vld1.8          {d10},    [r0], r2
667
        vld1.8          {d11},    [r1], r2
668
        vld1.8          {d12},    [r0], r2
669
        vld1.8          {d13},    [r1], r2
670
        vld1.8          {d14},    [r0], r2
671
        vld1.8          {d15},    [r1], r2
672

  
673
        transpose8x16matrix
674

  
675
        vdup.8          q14, r3                 @ flim_E
676
        vdup.8          q15, r12                @ flim_I
677
        ldr             r12, [sp, #68]          @ hev_thresh
678

  
679
        vp8_loop_filter inner=\inner
680

  
681
        sub             r0,  r0,  r2, lsl #3    @ backup u 8 rows
682
        sub             r1,  r1,  r2, lsl #3    @ backup v 8 rows
683

  
684
        transpose8x16matrix
685

  
686
        @ Store pixels:
687
        vst1.8          {d0},     [r0], r2
688
        vst1.8          {d1},     [r1], r2
689
        vst1.8          {d2},     [r0], r2
690
        vst1.8          {d3},     [r1], r2
691
        vst1.8          {d4},     [r0], r2
692
        vst1.8          {d5},     [r1], r2
693
        vst1.8          {d6},     [r0], r2
694
        vst1.8          {d7},     [r1], r2
695
        vst1.8          {d8},     [r0], r2
696
        vst1.8          {d9},     [r1], r2
697
        vst1.8          {d10},    [r0], r2
698
        vst1.8          {d11},    [r1], r2
699
        vst1.8          {d12},    [r0], r2
700
        vst1.8          {d13},    [r1], r2
701
        vst1.8          {d14},    [r0]
702
        vst1.8          {d15},    [r1]
703

  
704
        vpop            {q4-q7}
705
        bx              lr
706
endfunc
707
.endm
708

  
709
vp8_h_loop_filter8uv
710
vp8_h_loop_filter8uv _inner, inner=1
711

  
712
function ff_put_vp8_pixels16_neon, export=1
713
        ldr             r12, [sp, #0]           @ h
714
1:
715
        subs            r12, r12, #4
716
        vld1.8          {q0},     [r2], r3
717
        vld1.8          {q1},     [r2], r3
718
        vld1.8          {q2},     [r2], r3
719
        vld1.8          {q3},     [r2], r3
720
        vst1.8          {q0},     [r0,:128], r1
721
        vst1.8          {q1},     [r0,:128], r1
722
        vst1.8          {q2},     [r0,:128], r1
723
        vst1.8          {q3},     [r0,:128], r1
724
        bgt             1b
725
        bx              lr
726
endfunc
727

  
728
function ff_put_vp8_pixels8_neon, export=1
729
        ldr             r12, [sp, #0]           @ h
730
1:
731
        subs            r12, r12, #4
732
        vld1.8          {d0},     [r2], r3
733
        vld1.8          {d1},     [r2], r3
734
        vld1.8          {d2},     [r2], r3
735
        vld1.8          {d3},     [r2], r3
736
        vst1.8          {d0},     [r0,:64], r1
737
        vst1.8          {d1},     [r0,:64], r1
738
        vst1.8          {d2},     [r0,:64], r1
739
        vst1.8          {d3},     [r0,:64], r1
740
        bgt             1b
741
        bx              lr
742
endfunc
743

  
744
function ff_put_vp8_pixels4_neon, export=1
745
        ldr             r12, [sp, #0]           @ h
746
        push            {r4-r6,lr}
747
1:
748
        subs            r12, r12, #4
749
        ldr             r4,       [r2], r3
750
        ldr             r5,       [r2], r3
751
        ldr             r6,       [r2], r3
752
        ldr             lr,       [r2], r3
753
        str             r4,       [r0], r1
754
        str             r5,       [r0], r1
755
        str             r6,       [r0], r1
756
        str             lr,       [r0], r1
757
        bgt             1b
758
        pop             {r4-r6,pc}
759
endfunc
760

  
761
/* 4/6-tap 8th-pel MC */
762

  
763
.macro  vp8_epel8_h6    d,   a,   b
764
        vext.8          d27, \a,  \b,  #1
765
        vmovl.u8        q8,  \a
766
        vext.8          d28, \a,  \b,  #2
767
        vmovl.u8        q9,  d27
768
        vext.8          d29, \a,  \b,  #3
769
        vmovl.u8        q10, d28
770
        vext.8          d30, \a,  \b,  #4
771
        vmovl.u8        q11, d29
772
        vext.8          d31, \a,  \b,  #5
773
        vmovl.u8        q12, d30
774
        vmul.u16        q10, q10, d0[2]
775
        vmovl.u8        q13, d31
776
        vmul.u16        q11, q11, d0[3]
777
        vmls.u16        q10, q9,  d0[1]
778
        vmls.u16        q11, q12, d1[0]
779
        vmla.u16        q10, q8,  d0[0]
780
        vmla.u16        q11, q13, d1[1]
781
        vqadd.s16       q11, q10, q11
782
        vqrshrun.s16    \d,  q11, #7
783
.endm
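
In scalar terms, each output byte produced by this macro is the standard VP8 six-tap filter (a sketch, assuming the subpel_filters table holds the coefficient magnitudes, with the alternating signs supplied by the vmla/vmls pattern above; src points at the centre pixel):

static uint8_t epel_h6_px(const uint8_t *src, const uint16_t f[6])
{
    int v = f[0] * src[-2] - f[1] * src[-1] + f[2] * src[0]
          + f[3] * src[ 1] - f[4] * src[ 2] + f[5] * src[3];
    v = (v + 64) >> 7;                          /* vqrshrun.s16 ..., #7 */
    return v < 0 ? 0 : v > 255 ? 255 : v;       /* unsigned saturation */
}
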
784

  
785
.macro  vp8_epel16_h6   d0,  d1,  s0,  s1,  s2,  q0,  q1
786
        vext.8          q14, \q0, \q1, #3
787
        vext.8          q15, \q0, \q1, #4
788
        vmovl.u8        q11, d28
789
        vmovl.u8        q14, d29
790
        vext.8          q3,  \q0, \q1, #2
791
        vmovl.u8        q12, d30
792
        vmovl.u8        q15, d31
793
        vext.8          q8,  \q0, \q1, #1
794
        vmovl.u8        q10, d6
795
        vmovl.u8        q3,  d7
796
        vext.8          q2,  \q0, \q1, #5
797
        vmovl.u8        q13, d4
798
        vmovl.u8        q2,  d5
799
        vmovl.u8        q9,  d16
800
        vmovl.u8        q8,  d17
801
        vmul.u16        q11, q11, d0[3]
802
        vmul.u16        q10, q10, d0[2]
803
        vmul.u16        q3,  q3,  d0[2]
804
        vmul.u16        q14, q14, d0[3]
805
        vmls.u16        q11, q12, d1[0]
806
        vmovl.u8        q12, \s0
807
        vmovl.u8        q1,  \s1
808
        vmls.u16        q10, q9,  d0[1]
809
        vmls.u16        q3,  q8,  d0[1]
810
        vmls.u16        q14, q15, d1[0]
811
        vmla.u16        q10, q12, d0[0]
812
        vmla.u16        q11, q13, d1[1]
813
        vmla.u16        q3,  q1,  d0[0]
814
        vmla.u16        q14, q2,  d1[1]
815
        vqadd.s16       q11, q10, q11
816
        vqadd.s16       q14, q3,  q14
817
        vqrshrun.s16    \d0, q11, #7
818
        vqrshrun.s16    \d1, q14, #7
819
.endm
820

  
821
.macro  vp8_epel8_v6    d0,  s0,  s1,  s2,  s3,  s4,  s5
822
        vmovl.u8        q10, \s2
823
        vmovl.u8        q11, \s3
824
        vmovl.u8        q9,  \s1
825
        vmovl.u8        q12, \s4
826
        vmovl.u8        q8,  \s0
827
        vmovl.u8        q13, \s5
828
        vmul.u16        q10, q10, d0[2]
829
        vmul.u16        q11, q11, d0[3]
830
        vmls.u16        q10, q9,  d0[1]
831
        vmls.u16        q11, q12, d1[0]
832
        vmla.u16        q10, q8,  d0[0]
833
        vmla.u16        q11, q13, d1[1]
834
        vqadd.s16       q11, q10, q11
835
        vqrshrun.s16    \d0, q11, #7
836
.endm
837

  
838
.macro  vp8_epel8_v6_y2 d0, d1, s0, s1, s2, s3, s4, s5, s6
839
        vmovl.u8        q10, \s0
840
        vmovl.u8        q11, \s3
841
        vmovl.u8        q14, \s6
842
        vmovl.u8        q9,  \s1
843
        vmovl.u8        q12, \s4
844
        vmovl.u8        q8,  \s2
845
        vmovl.u8        q13, \s5
846
        vmul.u16        q10, q10, d0[0]
847
        vmul.u16        q15, q11, d0[3]
848
        vmul.u16        q11, q11, d0[2]
849
        vmul.u16        q14, q14, d1[1]
850
        vmls.u16        q10, q9,  d0[1]
851
        vmls.u16        q15, q12, d1[0]
852
        vmls.u16        q11, q8,  d0[1]
853
        vmls.u16        q14, q13, d1[0]
854
        vmla.u16        q10, q8,  d0[2]
855
        vmla.u16        q15, q13, d1[1]
856
        vmla.u16        q11, q9,  d0[0]
857
        vmla.u16        q14, q12, d0[3]
858
        vqadd.s16       q15, q10, q15
859
        vqadd.s16       q14, q11, q14
860
        vqrshrun.s16    \d0, q15, #7
861
        vqrshrun.s16    \d1, q14, #7
862
.endm
863

  
864
.macro  vp8_epel8_h4    d,   a,   b
865
        vext.8          d28, \a,  \b,  #1
866
        vmovl.u8        q9,  \a
867
        vext.8          d29, \a,  \b,  #2
868
        vmovl.u8        q10, d28
869
        vext.8          d30, \a,  \b,  #3
870
        vmovl.u8        q11, d29
871
        vmovl.u8        q12, d30
872
        vmul.u16        q10, q10, d0[2]
873
        vmul.u16        q11, q11, d0[3]
874
        vmls.u16        q10, q9,  d0[1]
875
        vmls.u16        q11, q12, d1[0]
876
        vqadd.s16       q11, q10, q11
877
        vqrshrun.s16    \d,  q11, #7
878
.endm
879

  
880
.macro  vp8_epel8_v4_y2 d0,  d1,  s0,  s1,  s2,  s3,  s4
881
        vmovl.u8        q9,  \s0
882
        vmovl.u8        q10, \s1
883
        vmovl.u8        q11, \s2
884
        vmovl.u8        q12, \s3
885
        vmovl.u8        q13, \s4
886
        vmul.u16        q8,  q10, d0[2]
887
        vmul.u16        q14, q11, d0[3]
888
        vmul.u16        q11, q11, d0[2]
889
        vmul.u16        q15, q12, d0[3]
890
        vmls.u16        q8,  q9,  d0[1]
891
        vmls.u16        q14, q12, d1[0]
892
        vmls.u16        q11, q10, d0[1]
893
        vmls.u16        q15, q13, d1[0]
894
        vqadd.s16       q8,  q8,  q14
895
        vqadd.s16       q11, q11, q15
896
        vqrshrun.s16    \d0, q8,  #7
897
        vqrshrun.s16    \d1, q11, #7
898
.endm
899

  
900
function ff_put_vp8_epel16_v6_neon, export=1
901
        sub             r2,  r2,  r3,  lsl #1
902
        push            {r4,lr}
903
        vpush           {d8-d15}
904

  
905
        ldr             r4,  [sp, #80]          @ my
906
        movrel          lr,  subpel_filters-16
907
        ldr             r12, [sp, #72]          @ h
908
        add             r4,  lr,  r4, lsl #4
909
        vld1.16         {q0},     [r4,:128]
910
1:
911
        vld1.8          {d2-d3},  [r2], r3
912
        vld1.8          {d4-d5},  [r2], r3
913
        vld1.8          {d6-d7},  [r2], r3
914
        vld1.8          {d8-d9},  [r2], r3
915
        vld1.8          {d10-d11},[r2], r3
916
        vld1.8          {d12-d13},[r2], r3
917
        vld1.8          {d14-d15},[r2]
918
        sub             r2,  r2,  r3,  lsl #2
919

  
920
        vp8_epel8_v6_y2 d2,  d4,  d2,  d4,  d6,  d8,  d10, d12, d14
921
        vp8_epel8_v6_y2 d3,  d5,  d3,  d5,  d7,  d9,  d11, d13, d15
922

  
923
        vst1.8          {d2-d3},  [r0,:128], r1
924
        vst1.8          {d4-d5},  [r0,:128], r1
925
        subs            r12, r12, #2
926
        bne             1b
927

  
928
        vpop            {d8-d15}
929
        pop             {r4,pc}
930
endfunc
931

  
932
function ff_put_vp8_epel16_h6_neon, export=1
933
        sub             r2,  r2,  #2
934
        push            {r4,lr}
935

  
936
        ldr             r4,  [sp, #12]          @ mx
937
        movrel          lr,  subpel_filters-16
938
        ldr             r12, [sp, #8]           @ h
939
        add             r4,  lr,  r4, lsl #4
940
        vld1.16         {q0},     [r4,:128]
941
1:
942
        vld1.8          {d2-d4},  [r2], r3
943

  
944
        vp8_epel16_h6   d2,  d3,  d2,  d3,  d4,  q1,  q2
945

  
946
        vst1.8          {d2-d3}, [r0,:128], r1
947
        subs            r12, r12, #1
948
        bne             1b
949

  
950
        pop             {r4,pc}
951
endfunc
952

  
953
function ff_put_vp8_epel16_h6v6_neon, export=1
954
        sub             r2,  r2,  r3,  lsl #1
955
        sub             r2,  r2,  #2
956
        push            {r4,lr}
957
        vpush           {d8-d9}
958

  
959
        @ first pass (horizontal):
960
        ldr             r4,  [sp, #28]          @ mx
961
        movrel          lr,  subpel_filters-16
962
        ldr             r12, [sp, #24]          @ h
963
        add             r4,  lr,  r4, lsl #4
964
        sub             sp,  sp,  #336+16
965
        vld1.16         {q0},     [r4,:128]
966
        add             lr,  sp,  #15
967
        add             r12, r12, #5
968
        bic             lr,  lr,  #15
969
1:
970
        vld1.8          {d2,d3,d4}, [r2], r3
971

  
972
        vp8_epel16_h6   d2,  d3,  d2,  d3,  d4,  q1,  q2
973

  
974
        vst1.8          {d2-d3}, [lr,:128]!
975
        subs            r12, r12, #1
976
        bne             1b
977

  
978
        @ second pass (vertical):
979
        ldr             r4,  [sp, #336+16+32]   @ my
980
        movrel          lr,  subpel_filters-16
981
        ldr             r12, [sp, #336+16+24]   @ h
982
        add             r4,  lr,  r4, lsl #4
983
        add             lr,  sp,  #15
984
        vld1.16         {q0},     [r4,:128]
985
        bic             lr,  lr,  #15
986
2:
987
        vld1.8          {d2-d5},  [lr,:128]!
988
        vld1.8          {d6-d9},  [lr,:128]!
989
        vld1.8          {d28-d31},[lr,:128]
990
        sub             lr,  lr,  #48
991

  
992
        vp8_epel8_v6    d2, d2, d4, d6, d8, d28, d30
993
        vp8_epel8_v6    d3, d3, d5, d7, d9, d29, d31
994

  
995
        vst1.8          {d2-d3}, [r0,:128], r1
996
        subs            r12, r12, #1
997
        bne             2b
998

  
999
        add             sp,  sp,  #336+16
1000
        vpop            {d8-d9}
1001
        pop             {r4,pc}
1002
endfunc
1003

  
1004
function ff_put_vp8_epel8_v6_neon, export=1
1005
        sub             r2,  r2,  r3,  lsl #1
1006
        push            {r4,lr}
1007

  
1008
        ldr             r4,  [sp, #16]          @ my
1009
        movrel          lr,  subpel_filters-16
1010
        ldr             r12, [sp, #8]           @ h
1011
        add             r4,  lr,  r4, lsl #4
1012
        vld1.16         {q0},     [r4,:128]
1013
1:
1014
        vld1.8          {d2},  [r2], r3
1015
        vld1.8          {d3},  [r2], r3
1016
        vld1.8          {d4},  [r2], r3
1017
        vld1.8          {d5},  [r2], r3
1018
        vld1.8          {d6},  [r2], r3
1019
        vld1.8          {d7},  [r2], r3
1020
        vld1.8          {d28}, [r2]
1021

  
1022
        sub             r2,  r2,  r3,  lsl #2
1023

  
1024
        vp8_epel8_v6_y2 d2,  d3,  d2,  d3,  d4,  d5,  d6,  d7,  d28
1025

  
1026
        vst1.8          {d2}, [r0,:64], r1
1027
        vst1.8          {d3}, [r0,:64], r1
1028
        subs            r12, r12, #2
1029
        bne             1b
1030

  
1031
        pop             {r4,pc}
1032
endfunc
1033

  
1034
function ff_put_vp8_epel8_h6_neon, export=1
1035
        sub             r2,  r2,  #2
1036
        push            {r4,lr}
1037

  
1038
        ldr             r4,  [sp, #12]          @ mx
1039
        movrel          lr,  subpel_filters-16
1040
        ldr             r12, [sp, #8]           @ h
1041
        add             r4,  lr,  r4, lsl #4
1042
        vld1.16         {q0},     [r4,:128]
1043
1:
1044
        vld1.8          {d2,d3}, [r2], r3
1045

  
1046
        vp8_epel8_h6    d2,  d2,  d3
1047

  
1048
        vst1.8          {d2}, [r0,:64], r1
1049
        subs            r12, r12, #1
1050
        bne             1b
1051

  
1052
        pop             {r4,pc}
1053
endfunc
1054

  
1055
function ff_put_vp8_epel8_h6v6_neon, export=1
1056
        sub             r2,  r2,  r3,  lsl #1
1057
        sub             r2,  r2,  #2
1058
        push            {r4,lr}
1059

  
1060
        @ first pass (horizontal):
1061
        ldr             r4,  [sp, #12]          @ mx
1062
        movrel          lr,  subpel_filters-16
1063
        ldr             r12, [sp, #8]           @ h
1064
        add             r4,  lr,  r4, lsl #4
1065
        sub             sp,  sp,  #168+16
1066
        vld1.16         {q0},     [r4,:128]
1067
        add             lr,  sp,  #15
1068
        add             r12, r12, #5
1069
        bic             lr,  lr,  #15
1070
1:
1071
        vld1.8          {d2,d3}, [r2], r3
1072

  
1073
        vp8_epel8_h6    d2,  d2,  d3
1074

  
1075
        vst1.8          {d2}, [lr,:64]!
1076
        subs            r12, r12, #1
1077
        bne             1b
1078

  
1079
        @ second pass (vertical):
1080
        ldr             r4,  [sp, #168+16+16]   @ my
1081
        movrel          lr,  subpel_filters-16
1082
        ldr             r12, [sp, #168+16+8]    @ h
1083
        add             r4,  lr,  r4, lsl #4
1084
        add             lr,  sp,  #15
1085
        vld1.16         {q0},     [r4,:128]
1086
        bic             lr,  lr,  #15
1087
2:
1088
        vld1.8          {d2-d5},  [lr,:128]!
1089
        vld1.8          {d6-d7},  [lr,:128]!
1090
        vld1.8          {d30},    [lr,:64]
1091
        sub             lr,  lr,  #32
1092

  
1093
        vp8_epel8_v6_y2 d2,  d3,  d2,  d3,  d4,  d5,  d6,  d7,  d30
1094

  
1095
        vst1.8          {d2}, [r0,:64], r1
1096
        vst1.8          {d3}, [r0,:64], r1
1097
        subs            r12, r12, #2
1098
        bne             2b
1099

  
1100
        add             sp,  sp,  #168+16
1101
        pop             {r4,pc}
1102
endfunc
1103

  
1104
function ff_put_vp8_epel8_v4_neon, export=1
1105
        sub             r2,  r2,  r3
1106
        push            {r4,lr}
1107

  
1108
        ldr             r4,  [sp, #16]          @ my
1109
        movrel          lr,  subpel_filters-16
1110
        ldr             r12, [sp, #8]           @ h
1111
        add             r4,  lr,  r4, lsl #4
1112
        vld1.16         {q0},     [r4,:128]
1113
1:
1114
        vld1.8          {d2},     [r2], r3
1115
        vld1.8          {d3},     [r2], r3
1116
        vld1.8          {d4},     [r2], r3
1117
        vld1.8          {d5},     [r2], r3
1118
        vld1.8          {d6},     [r2]
1119
        sub             r2,  r2,  r3,  lsl #1
1120

  
1121
        vp8_epel8_v4_y2 d2,  d3,  d2,  d3,  d4,  d5,  d6
1122

  
1123
        vst1.8          {d2}, [r0,:64], r1
1124
        vst1.8          {d3}, [r0,:64], r1
1125
        subs            r12, r12, #2
1126
        bne             1b
1127

  
1128
        pop             {r4,pc}
1129
endfunc
1130

  
1131
function ff_put_vp8_epel8_h4_neon, export=1
1132
        sub             r2,  r2,  #1
1133
        push            {r4,lr}
1134

  
1135
        ldr             r4,  [sp, #12]          @ mx
1136
        movrel          lr,  subpel_filters-16
1137
        ldr             r12, [sp, #8]           @ h
1138
        add             r4,  lr,  r4, lsl #4
1139
        vld1.16         {q0},     [r4,:128]
1140
1:
1141
        vld1.8          {d2,d3}, [r2], r3
1142

  
1143
        vp8_epel8_h4    d2,  d2,  d3
1144

  
1145
        vst1.8          {d2}, [r0,:64], r1
1146
        subs            r12, r12, #1
1147
        bne             1b
1148

  
1149
        pop             {r4,pc}
1150
endfunc
1151

  
1152
function ff_put_vp8_epel8_h4v4_neon, export=1
1153
        sub             r2,  r2,  r3
1154
        sub             r2,  r2,  #1
1155
        push            {r4,lr}
1156

  
1157
        @ first pass (horizontal):
1158
        ldr             r4,  [sp, #12]          @ mx
1159
        movrel          lr,  subpel_filters-16
1160
        ldr             r12, [sp, #8]           @ h
1161
        add             r4,  lr,  r4, lsl #4
1162
        sub             sp,  sp,  #168+16
1163
        vld1.16         {q0},     [r4,:128]
1164
        add             lr,  sp,  #15
1165
        add             r12, r12, #3
1166
        bic             lr,  lr,  #15
1167
1:
1168
        vld1.8          {d2,d3}, [r2], r3
1169

  
1170
        vp8_epel8_h4    d2,  d2,  d3
1171

  
1172
        vst1.8          {d2}, [lr,:64]!
1173
        subs            r12, r12, #1
1174
        bne             1b
1175

  
1176
        @ second pass (vertical):
1177
        ldr             r4,  [sp, #168+16+16]   @ my
1178
        movrel          lr,  subpel_filters-16
1179
        ldr             r12, [sp, #168+16+8]    @ h
1180
        add             r4,  lr,  r4, lsl #4
1181
        add             lr,  sp,  #15
1182
        vld1.16         {q0},     [r4,:128]
1183
        bic             lr,  lr,  #15
1184
2:
1185
        vld1.8          {d2-d5},  [lr,:128]!
1186
        vld1.8          {d6},     [lr,:64]
1187
        sub             lr,  lr,  #16
1188

  
1189
        vp8_epel8_v4_y2 d2,  d3,  d2,  d3,  d4,  d5,  d6
1190

  
1191
        vst1.8          {d2},     [r0,:64], r1
1192
        vst1.8          {d3},     [r0,:64], r1
1193
        subs            r12, r12, #2
1194
        bne             2b
1195

  
1196
        add             sp,  sp,  #168+16
1197
        pop             {r4,pc}
1198
endfunc
1199

  
1200
function ff_put_vp8_epel8_h6v4_neon, export=1
1201
        sub             r2,  r2,  r3
1202
        sub             r2,  r2,  #2
1203
        push            {r4,lr}
1204

  
1205
        @ first pass (horizontal):
1206
        ldr             r4,  [sp, #12]          @ mx
1207
        movrel          lr,  subpel_filters-16
1208
        ldr             r12, [sp, #8]           @ h
1209
        add             r4,  lr,  r4, lsl #4
1210
        sub             sp,  sp,  #168+16
1211
        vld1.16         {q0},     [r4,:128]
1212
        add             lr,  sp,  #15
1213
        add             r12, r12, #3
1214
        bic             lr,  lr,  #15
1215
1:
1216
        vld1.8          {d2,d3}, [r2], r3
1217

  
1218
        vp8_epel8_h6    d2,  d2,  d3
1219

  
1220
        vst1.8          {d2}, [lr,:64]!
1221
        subs            r12, r12, #1
1222
        bne             1b
1223

  
1224
        @ second pass (vertical):
1225
        ldr             r4,  [sp, #168+16+16]   @ my
1226
        movrel          lr,  subpel_filters-16
1227
        ldr             r12, [sp, #168+16+8]    @ h
1228
        add             r4,  lr,  r4, lsl #4
1229
        add             lr,  sp,  #15
1230
        vld1.16         {q0},     [r4,:128]
1231
        bic             lr,  lr,  #15
1232
2:
1233
        vld1.8          {d2-d5},  [lr,:128]!
1234
        vld1.8          {d6},     [lr,:64]
1235
        sub             lr,  lr,  #16
1236

  
1237
        vp8_epel8_v4_y2 d2,  d3,  d2,  d3,  d4,  d5,  d6
1238

  
1239
        vst1.8          {d2},     [r0,:64], r1
1240
        vst1.8          {d3},     [r0,:64], r1
1241
        subs            r12, r12, #2
1242
        bne             2b
1243

  
1244
        add             sp,  sp,  #168+16
1245
        pop             {r4,pc}
1246
endfunc
1247

  
1248
function ff_put_vp8_epel8_h4v6_neon, export=1
1249
        sub             r2,  r2,  r3,  lsl #1
1250
        sub             r2,  r2,  #1
1251
        push            {r4,lr}
1252

  
1253
        @ first pass (horizontal):
1254
        ldr             r4,  [sp, #12]          @ mx
1255
        movrel          lr,  subpel_filters-16
1256
        ldr             r12, [sp, #8]           @ h
1257
        add             r4,  lr,  r4, lsl #4
1258
        sub             sp,  sp,  #168+16
1259
        vld1.16         {q0},     [r4,:128]
1260
        add             lr,  sp,  #15
1261
        add             r12, r12, #5
1262
        bic             lr,  lr,  #15
1263
1:
1264
        vld1.8          {d2,d3}, [r2], r3
1265

  
1266
        vp8_epel8_h4    d2,  d2,  d3
1267

  
1268
        vst1.8          {d2}, [lr,:64]!
1269
        subs            r12, r12, #1
1270
        bne             1b
1271

  
1272
        @ second pass (vertical):
1273
        ldr             r4,  [sp, #168+16+16]   @ my
1274
        movrel          lr,  subpel_filters-16
1275
        ldr             r12, [sp, #168+16+8]    @ h
1276
        add             r4,  lr,  r4, lsl #4
1277
        add             lr,  sp,  #15
1278
        vld1.16         {q0},     [r4,:128]
1279
        bic             lr,  lr,  #15
1280
2:
1281
        vld1.8          {d2-d5},  [lr,:128]!
1282
        vld1.8          {d6-d7},  [lr,:128]!
1283
        vld1.8          {d30},    [lr,:64]
1284
        sub             lr,  lr,  #32
1285

  
1286
        vp8_epel8_v6_y2 d2,  d3,  d2,  d3,  d4,  d5,  d6,  d7,  d30
1287

  
1288
        vst1.8          {d2}, [r0,:64], r1
1289
        vst1.8          {d3}, [r0,:64], r1
1290
        subs            r12, r12, #2
1291
        bne             2b
1292

  
1293
        add             sp,  sp,  #168+16
... This diff was truncated because it exceeds the maximum size that can be displayed.
