Statistics
| Branch: | Revision:

ffmpeg / libavcodec / x86 / vp8dsp-init.c @ f2a30bd8

History | View | Annotate | Download (14.2 KB)

/*
 * VP8 DSP functions x86-optimized
 * Copyright (c) 2010 Ronald S. Bultje <rsbultje@gmail.com>
 * Copyright (c) 2010 Jason Garrett-Glaser <darkshikari@gmail.com>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */
22

    
23
#include "libavutil/x86_cpu.h"
24
#include "libavcodec/vp8dsp.h"
25

    
26
#if HAVE_YASM
27

    
28
/*
 * MC functions
 */
31
/*
 * Prototypes of the motion-compensation routines used by the wrappers and
 * the init function in this file. They are only declared here; their
 * definitions live outside this file.
 *
 * Every routine has the same signature:
 *   dst, dststride - destination pointer and its stride
 *   src, srcstride - source pointer and its stride
 *   height         - number of rows to process
 *   mx, my         - NOTE(review): presumably the horizontal / vertical
 *                    sub-pel position selecting the filter; confirm
 *                    against the callers
 *
 * The helper macro exists only to avoid repeating that signature and is
 * undefined again once the list is complete.
 */
#define VP8_MC_PROTO(NAME)                        \
    extern void NAME(uint8_t *dst, int dststride, \
                     uint8_t *src, int srcstride, \
                     int height, int mx, int my)

/* 4-wide subpel filters, MMXEXT */
VP8_MC_PROTO(ff_put_vp8_epel4_h4_mmxext);
VP8_MC_PROTO(ff_put_vp8_epel4_h6_mmxext);
VP8_MC_PROTO(ff_put_vp8_epel4_v4_mmxext);
VP8_MC_PROTO(ff_put_vp8_epel4_v6_mmxext);

/* 8-wide subpel filters, SSE2 */
VP8_MC_PROTO(ff_put_vp8_epel8_h4_sse2);
VP8_MC_PROTO(ff_put_vp8_epel8_h6_sse2);
VP8_MC_PROTO(ff_put_vp8_epel8_v4_sse2);
VP8_MC_PROTO(ff_put_vp8_epel8_v6_sse2);

/* 4- and 8-wide subpel filters, SSSE3 */
VP8_MC_PROTO(ff_put_vp8_epel4_h4_ssse3);
VP8_MC_PROTO(ff_put_vp8_epel4_h6_ssse3);
VP8_MC_PROTO(ff_put_vp8_epel4_v4_ssse3);
VP8_MC_PROTO(ff_put_vp8_epel4_v6_ssse3);
VP8_MC_PROTO(ff_put_vp8_epel8_h4_ssse3);
VP8_MC_PROTO(ff_put_vp8_epel8_h6_ssse3);
VP8_MC_PROTO(ff_put_vp8_epel8_v4_ssse3);
VP8_MC_PROTO(ff_put_vp8_epel8_v6_ssse3);

/* horizontal bilinear filters */
VP8_MC_PROTO(ff_put_vp8_bilinear4_h_mmxext);
VP8_MC_PROTO(ff_put_vp8_bilinear8_h_sse2);
VP8_MC_PROTO(ff_put_vp8_bilinear4_h_ssse3);
VP8_MC_PROTO(ff_put_vp8_bilinear8_h_ssse3);

/* vertical bilinear filters */
VP8_MC_PROTO(ff_put_vp8_bilinear4_v_mmxext);
VP8_MC_PROTO(ff_put_vp8_bilinear8_v_sse2);
VP8_MC_PROTO(ff_put_vp8_bilinear4_v_ssse3);
VP8_MC_PROTO(ff_put_vp8_bilinear8_v_ssse3);

/* unfiltered block routines */
VP8_MC_PROTO(ff_put_vp8_pixels8_mmx);
VP8_MC_PROTO(ff_put_vp8_pixels16_mmx);
VP8_MC_PROTO(ff_put_vp8_pixels16_sse);

#undef VP8_MC_PROTO
118

    
119
/*
 * TAP_W16(OPT, FILTERTYPE, TAPTYPE)
 *
 * Defines the static function
 *     ff_put_vp8_<FILTERTYPE>16_<TAPTYPE>_<OPT>()
 * as two calls to ff_put_vp8_<FILTERTYPE>8_<TAPTYPE>_<OPT>(): the first at
 * dst / src, the second at dst + 8 / src + 8. Strides, height, mx and my
 * are passed through unchanged.
 *
 * The 8-wide function must already be declared where the macro is used.
 */
#define TAP_W16(OPT, FILTERTYPE, TAPTYPE) \
static void ff_put_vp8_ ## FILTERTYPE ## 16_ ## TAPTYPE ## _ ## OPT( \
    uint8_t *dst,  int dststride, uint8_t *src, \
    int srcstride, int height, int mx, int my) \
{ \
    ff_put_vp8_ ## FILTERTYPE ## 8_ ## TAPTYPE ## _ ## OPT( \
        dst,     dststride, src,     srcstride, height, mx, my); \
    ff_put_vp8_ ## FILTERTYPE ## 8_ ## TAPTYPE ## _ ## OPT( \
        dst + 8, dststride, src + 8, srcstride, height, mx, my); \
}
129
/*
 * TAP_W8(OPT, FILTERTYPE, TAPTYPE)
 *
 * Defines the static function
 *     ff_put_vp8_<FILTERTYPE>8_<TAPTYPE>_<OPT>()
 * as two calls to ff_put_vp8_<FILTERTYPE>4_<TAPTYPE>_<OPT>(): the first at
 * dst / src, the second at dst + 4 / src + 4. Strides, height, mx and my
 * are passed through unchanged.
 *
 * The 4-wide function must already be declared where the macro is used.
 */
#define TAP_W8(OPT, FILTERTYPE, TAPTYPE) \
static void ff_put_vp8_ ## FILTERTYPE ## 8_ ## TAPTYPE ## _ ## OPT( \
    uint8_t *dst,  int dststride, uint8_t *src, \
    int srcstride, int height, int mx, int my) \
{ \
    ff_put_vp8_ ## FILTERTYPE ## 4_ ## TAPTYPE ## _ ## OPT( \
        dst,     dststride, src,     srcstride, height, mx, my); \
    ff_put_vp8_ ## FILTERTYPE ## 4_ ## TAPTYPE ## _ ## OPT( \
        dst + 4, dststride, src + 4, srcstride, height, mx, my); \
}
139

    
140
TAP_W8 (mmxext, epel, h4)
141
TAP_W8 (mmxext, epel, h6)
142
TAP_W16(mmxext, epel, h6)
143
TAP_W8 (mmxext, epel, v4)
144
TAP_W8 (mmxext, epel, v6)
145
TAP_W16(mmxext, epel, v6)
146
TAP_W8 (mmxext, bilinear, h)
147
TAP_W16(mmxext, bilinear, h)
148
TAP_W8 (mmxext, bilinear, v)
149
TAP_W16(mmxext, bilinear, v)
150

    
151
TAP_W16(sse2,   epel, h6)
152
TAP_W16(sse2,   epel, v6)
153
TAP_W16(sse2,   bilinear, h)
154
TAP_W16(sse2,   bilinear, v)
155

    
156
TAP_W16(ssse3,  epel, h6)
157
TAP_W16(ssse3,  epel, v6)
158
TAP_W16(ssse3,  bilinear, h)
159
TAP_W16(ssse3,  bilinear, v)
160

    
161
/*
 * HVTAP(OPT, ALIGN, TAPNUMX, TAPNUMY, SIZE, MAXHEIGHT)
 *
 * Defines the static function
 *     ff_put_vp8_epel<SIZE>_h<TAPNUMX>v<TAPNUMY>_<OPT>()
 * which filters in both directions by chaining the one-dimensional
 * routines through a stack buffer:
 *
 *   1. src is moved back by TAPNUMY / 2 - 1 rows and
 *      ff_put_vp8_epel<SIZE>_h<TAPNUMX>_<OPT>() filters
 *      height + TAPNUMY - 1 rows into tmp, using SIZE as the stride.
 *   2. ff_put_vp8_epel<SIZE>_v<TAPNUMY>_<OPT>() filters height rows from
 *      tmp into dst, starting TAPNUMY / 2 - 1 rows into tmp so that it
 *      lines up with the row the caller's src pointed at.
 *
 * tmp is declared with ALIGN alignment and holds
 * SIZE * (MAXHEIGHT + TAPNUMY - 1) bytes. Nothing here checks height, so
 * callers must keep it at or below MAXHEIGHT.
 */
#define HVTAP(OPT, ALIGN, TAPNUMX, TAPNUMY, SIZE, MAXHEIGHT) \
static void ff_put_vp8_epel ## SIZE ## _h ## TAPNUMX ## v ## TAPNUMY ## _ ## OPT( \
    uint8_t *dst, int dststride, uint8_t *src, \
    int srcstride, int height, int mx, int my) \
{ \
    DECLARE_ALIGNED(ALIGN, uint8_t, tmp)[SIZE * (MAXHEIGHT + TAPNUMY - 1)]; \
    uint8_t *tmpptr = tmp + SIZE * (TAPNUMY / 2 - 1); \
    src -= srcstride * (TAPNUMY / 2 - 1); \
    ff_put_vp8_epel ## SIZE ## _h ## TAPNUMX ## _ ## OPT( \
        tmp, SIZE,      src,    srcstride, height + TAPNUMY - 1, mx, my); \
    ff_put_vp8_epel ## SIZE ## _v ## TAPNUMY ## _ ## OPT( \
        dst, dststride, tmpptr, SIZE,      height,               mx, my); \
}
174

    
175
/*
 * HVTAPMMX(x, y): instantiates the MMXEXT h<x>v<y> function for widths 4
 * (MAXHEIGHT 8) and 8 (MAXHEIGHT 16), both with an 8-byte aligned buffer.
 */
#define HVTAPMMX(x, y) \
HVTAP(mmxext, 8, x, y,  4,  8) \
HVTAP(mmxext, 8, x, y,  8, 16)

HVTAPMMX(4, 4)
HVTAPMMX(4, 6)
HVTAPMMX(6, 4)
HVTAPMMX(6, 6)
/* width 16 is instantiated for the 6-tap/6-tap combination only */
HVTAP(mmxext, 8, 6, 6, 16, 16)
184

    
185
/*
 * HVTAPSSE2(x, y, w): instantiates the h<x>v<y> function of width w for
 * both SSE2 and SSSE3, each with a 16-byte aligned buffer and MAXHEIGHT 16.
 */
#define HVTAPSSE2(x, y, w) \
HVTAP(sse2,  16, x, y, w, 16) \
HVTAP(ssse3, 16, x, y, w, 16)

HVTAPSSE2(4, 4, 8)
HVTAPSSE2(4, 6, 8)
HVTAPSSE2(6, 4, 8)
HVTAPSSE2(6, 6, 8)
/* width 16 is instantiated for the 6-tap/6-tap combination only */
HVTAPSSE2(6, 6, 16)
194

    
195
/* SSSE3 width-4 two-dimensional filters: 16-byte aligned buffer, MAXHEIGHT 8. */
HVTAP(ssse3, 16, 4, 4, 4, 8)
HVTAP(ssse3, 16, 4, 6, 4, 8)
HVTAP(ssse3, 16, 6, 4, 4, 8)
HVTAP(ssse3, 16, 6, 6, 4, 8)
199

    
200
/*
 * HVBILIN(OPT, ALIGN, SIZE, MAXHEIGHT)
 *
 * Defines the static function ff_put_vp8_bilinear<SIZE>_hv_<OPT>(), which
 * chains the one-dimensional bilinear routines through a stack buffer:
 *
 *   1. ff_put_vp8_bilinear<SIZE>_h_<OPT>() filters height + 1 rows from
 *      src into tmp, using SIZE as the stride.
 *   2. ff_put_vp8_bilinear<SIZE>_v_<OPT>() filters height rows from tmp
 *      into dst.
 *
 * tmp is declared with ALIGN alignment and has room for MAXHEIGHT + 2 rows
 * of SIZE bytes, one row more than step 1 requests when height equals
 * MAXHEIGHT.
 * NOTE(review): the extra row is presumably slack for the horizontal
 * routine; confirm against its implementation before shrinking the buffer.
 * Nothing here checks height, so callers must keep it at or below MAXHEIGHT.
 */
#define HVBILIN(OPT, ALIGN, SIZE, MAXHEIGHT) \
static void ff_put_vp8_bilinear ## SIZE ## _hv_ ## OPT( \
    uint8_t *dst, int dststride, uint8_t *src, \
    int srcstride, int height, int mx, int my) \
{ \
    DECLARE_ALIGNED(ALIGN, uint8_t, tmp)[SIZE * (MAXHEIGHT + 2)]; \
    ff_put_vp8_bilinear ## SIZE ## _h_ ## OPT( \
        tmp, SIZE,      src, srcstride, height + 1, mx, my); \
    ff_put_vp8_bilinear ## SIZE ## _v_ ## OPT( \
        dst, dststride, tmp, SIZE,      height,     mx, my); \
}
211

    
212
/*
 * Two-dimensional bilinear functions. Arguments are
 * (OPT, ALIGN, SIZE, MAXHEIGHT); every variant uses an 8-byte aligned
 * buffer. SSE2 has no width-4 variant.
 */
HVBILIN(mmxext, 8,  4,  8)
HVBILIN(mmxext, 8,  8, 16)
HVBILIN(mmxext, 8, 16, 16)
HVBILIN(sse2,   8,  8, 16)
HVBILIN(sse2,   8, 16, 16)
HVBILIN(ssse3,  8,  4,  8)
HVBILIN(ssse3,  8,  8, 16)
HVBILIN(ssse3,  8, 16, 16)
220

    
221
/*
 * Prototypes of the inverse-transform routines installed by
 * ff_vp8dsp_init_x86(); they are only declared in this file.
 */
extern void ff_vp8_idct_dc_add_mmx(uint8_t *dst, DCTELEM block[16], int stride);
extern void ff_vp8_idct_dc_add_sse4(uint8_t *dst, DCTELEM block[16], int stride);
extern void ff_vp8_luma_dc_wht_mmxext(DCTELEM block[4][4][16], DCTELEM dc[16]);
extern void ff_vp8_idct_add_mmx(uint8_t *dst, DCTELEM block[16], int stride);
225

    
226
/*
 * Prototypes of the simple loop-filter routines installed by
 * ff_vp8dsp_init_x86(), in a vertical (v) and a horizontal (h) flavour per
 * instruction set; they are only declared in this file.
 */
extern void ff_vp8_v_loop_filter_simple_mmx   (uint8_t *dst, int stride, int flim);
extern void ff_vp8_v_loop_filter_simple_mmxext(uint8_t *dst, int stride, int flim);
extern void ff_vp8_v_loop_filter_simple_sse2  (uint8_t *dst, int stride, int flim);
extern void ff_vp8_h_loop_filter_simple_mmx   (uint8_t *dst, int stride, int flim);
extern void ff_vp8_h_loop_filter_simple_mmxext(uint8_t *dst, int stride, int flim);
extern void ff_vp8_h_loop_filter_simple_sse2  (uint8_t *dst, int stride, int flim);
232
#endif
233

    
234
/*
 * VP8_LUMA_MC_FUNC(IDX, SIZE, OPT)
 *
 * Installs the three 6-tap functions of width SIZE and flavour OPT into
 * c->put_vp8_epel_pixels_tab[IDX]: h6 at [0][2], v6 at [2][0] and h6v6 at
 * [2][2]. Expects a variable named c to be in scope.
 *
 * Wrapped in do { } while (0) so the expansion is a single statement and
 * stays correct in an unbraced if/else. Invoke it with a trailing ';'.
 */
#define VP8_LUMA_MC_FUNC(IDX, SIZE, OPT) \
    do { \
        c->put_vp8_epel_pixels_tab[IDX][0][2] = ff_put_vp8_epel ## SIZE ## _h6_ ## OPT; \
        c->put_vp8_epel_pixels_tab[IDX][2][0] = ff_put_vp8_epel ## SIZE ## _v6_ ## OPT; \
        c->put_vp8_epel_pixels_tab[IDX][2][2] = ff_put_vp8_epel ## SIZE ## _h6v6_ ## OPT; \
    } while (0)
238

    
239
/*
 * VP8_MC_FUNC(IDX, SIZE, OPT)
 *
 * Installs every filtered entry of c->put_vp8_epel_pixels_tab[IDX] for
 * width SIZE and flavour OPT: the five combinations involving a 4-tap
 * filter are set here, and the three 6-tap-only ones through
 * VP8_LUMA_MC_FUNC. Entry [0][0] is not touched. Expects a variable named
 * c to be in scope.
 *
 * Wrapped in do { } while (0) so the expansion is a single statement and
 * stays correct in an unbraced if/else. Invoke it with a trailing ';'.
 */
#define VP8_MC_FUNC(IDX, SIZE, OPT) \
    do { \
        c->put_vp8_epel_pixels_tab[IDX][0][1] = ff_put_vp8_epel ## SIZE ## _h4_ ## OPT; \
        c->put_vp8_epel_pixels_tab[IDX][1][0] = ff_put_vp8_epel ## SIZE ## _v4_ ## OPT; \
        c->put_vp8_epel_pixels_tab[IDX][1][1] = ff_put_vp8_epel ## SIZE ## _h4v4_ ## OPT; \
        c->put_vp8_epel_pixels_tab[IDX][1][2] = ff_put_vp8_epel ## SIZE ## _h6v4_ ## OPT; \
        c->put_vp8_epel_pixels_tab[IDX][2][1] = ff_put_vp8_epel ## SIZE ## _h4v6_ ## OPT; \
        VP8_LUMA_MC_FUNC(IDX, SIZE, OPT); \
    } while (0)
246

    
247
/*
 * VP8_BILINEAR_MC_FUNC(IDX, SIZE, OPT)
 *
 * Installs the bilinear functions of width SIZE and flavour OPT into
 * c->put_vp8_bilinear_pixels_tab[IDX]. With the table indexed [a][b]:
 *   a == 0, b != 0  ->  the _h_ function
 *   a != 0, b == 0  ->  the _v_ function
 *   a != 0, b != 0  ->  the _hv_ function
 * so indices 1 and 2 select the same routine. Entry [0][0] is not touched.
 * Expects a variable named c to be in scope.
 *
 * Wrapped in do { } while (0) so the expansion is a single statement and
 * stays correct in an unbraced if/else. Invoke it with a trailing ';'.
 */
#define VP8_BILINEAR_MC_FUNC(IDX, SIZE, OPT) \
    do { \
        c->put_vp8_bilinear_pixels_tab[IDX][0][1] = ff_put_vp8_bilinear ## SIZE ## _h_ ## OPT; \
        c->put_vp8_bilinear_pixels_tab[IDX][0][2] = ff_put_vp8_bilinear ## SIZE ## _h_ ## OPT; \
        c->put_vp8_bilinear_pixels_tab[IDX][1][0] = ff_put_vp8_bilinear ## SIZE ## _v_ ## OPT; \
        c->put_vp8_bilinear_pixels_tab[IDX][1][1] = ff_put_vp8_bilinear ## SIZE ## _hv_ ## OPT; \
        c->put_vp8_bilinear_pixels_tab[IDX][1][2] = ff_put_vp8_bilinear ## SIZE ## _hv_ ## OPT; \
        c->put_vp8_bilinear_pixels_tab[IDX][2][0] = ff_put_vp8_bilinear ## SIZE ## _v_ ## OPT; \
        c->put_vp8_bilinear_pixels_tab[IDX][2][1] = ff_put_vp8_bilinear ## SIZE ## _hv_ ## OPT; \
        c->put_vp8_bilinear_pixels_tab[IDX][2][2] = ff_put_vp8_bilinear ## SIZE ## _hv_ ## OPT; \
    } while (0)
256

    
257

    
258
/*
 * Install the x86-specific implementations into a VP8 DSP context.
 *
 * The result of mm_support() is stored in mm_flags (not declared in this
 * function) and one block runs per flag that is set. The blocks are
 * independent, so when several flags are set a later block overwrites the
 * pointers an earlier one installed. Fields not assigned by any executed
 * block are left as the caller provided them.
 *
 * Without HAVE_YASM only mm_flags is updated.
 *
 * c: context whose function pointers are filled in.
 */
av_cold void ff_vp8dsp_init_x86(VP8DSPContext *c)
{
    mm_flags = mm_support();

#if HAVE_YASM
    if (mm_flags & FF_MM_MMX) {
        c->vp8_idct_dc_add                  = ff_vp8_idct_dc_add_mmx;
        c->vp8_idct_add                     = ff_vp8_idct_add_mmx;
        /* entry [0][0] of both tables: the unfiltered 16- and 8-wide routines */
        c->put_vp8_epel_pixels_tab[0][0][0]     =
        c->put_vp8_bilinear_pixels_tab[0][0][0] = ff_put_vp8_pixels16_mmx;
        c->put_vp8_epel_pixels_tab[1][0][0]     =
        c->put_vp8_bilinear_pixels_tab[1][0][0] = ff_put_vp8_pixels8_mmx;

        c->vp8_v_loop_filter_simple = ff_vp8_v_loop_filter_simple_mmx;
        c->vp8_h_loop_filter_simple = ff_vp8_h_loop_filter_simple_mmx;
    }

    /* note that 4-tap width=16 functions are missing because w=16
     * is only used for luma, and luma is always a copy or sixtap. */
    if (mm_flags & FF_MM_MMX2) {
        c->vp8_luma_dc_wht = ff_vp8_luma_dc_wht_mmxext;
        VP8_LUMA_MC_FUNC(0, 16, mmxext);
        VP8_MC_FUNC(1, 8, mmxext);
        VP8_MC_FUNC(2, 4, mmxext);
        VP8_BILINEAR_MC_FUNC(0, 16, mmxext);
        VP8_BILINEAR_MC_FUNC(1, 8, mmxext);
        VP8_BILINEAR_MC_FUNC(2, 4, mmxext);

        c->vp8_v_loop_filter_simple = ff_vp8_v_loop_filter_simple_mmxext;
        c->vp8_h_loop_filter_simple = ff_vp8_h_loop_filter_simple_mmxext;
    }

    if (mm_flags & FF_MM_SSE) {
        /* replaces only the 16-wide [0][0] entries set in the MMX block */
        c->put_vp8_epel_pixels_tab[0][0][0]     =
        c->put_vp8_bilinear_pixels_tab[0][0][0] = ff_put_vp8_pixels16_sse;
    }

    if (mm_flags & FF_MM_SSE2) {
        /* no width-4 (index 2) entries are installed for SSE2, so those
         * keep whatever an earlier block stored */
        VP8_LUMA_MC_FUNC(0, 16, sse2);
        VP8_MC_FUNC(1, 8, sse2);
        VP8_BILINEAR_MC_FUNC(0, 16, sse2);
        VP8_BILINEAR_MC_FUNC(1, 8, sse2);

        c->vp8_v_loop_filter_simple = ff_vp8_v_loop_filter_simple_sse2;
        c->vp8_h_loop_filter_simple = ff_vp8_h_loop_filter_simple_sse2;
    }

    if (mm_flags & FF_MM_SSSE3) {
        VP8_LUMA_MC_FUNC(0, 16, ssse3);
        VP8_MC_FUNC(1, 8, ssse3);
        VP8_MC_FUNC(2, 4, ssse3);
        VP8_BILINEAR_MC_FUNC(0, 16, ssse3);
        VP8_BILINEAR_MC_FUNC(1, 8, ssse3);
        VP8_BILINEAR_MC_FUNC(2, 4, ssse3);
    }

    if (mm_flags & FF_MM_SSE4) {
        c->vp8_idct_dc_add                  = ff_vp8_idct_dc_add_sse4;
    }
#endif
}