Statistics
| Branch: | Revision:

ffmpeg / libavcodec / x86 / h264_weight.asm @ 2912e87a

History | View | Annotate | Download (8.32 KB)

1
;*****************************************************************************
2
;* SSE2-optimized weighted prediction code
3
;*****************************************************************************
4
;* Copyright (c) 2004-2005 Michael Niedermayer, Loren Merritt
5
;* Copyright (C) 2010 Eli Friedman <eli.friedman@gmail.com>
6
;*
7
;* This file is part of Libav.
8
;*
9
;* Libav is free software; you can redistribute it and/or
10
;* modify it under the terms of the GNU Lesser General Public
11
;* License as published by the Free Software Foundation; either
12
;* version 2.1 of the License, or (at your option) any later version.
13
;*
14
;* Libav is distributed in the hope that it will be useful,
15
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
16
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
17
;* Lesser General Public License for more details.
18
;*
19
;* You should have received a copy of the GNU Lesser General Public
20
;* License along with Libav; if not, write to the Free Software
21
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
22
;******************************************************************************
23

    
24
%include "x86inc.asm"
25

    
26
SECTION .text
27

    
28
;-----------------------------------------------------------------------------
29
; biweight pred:
30
;
31
; void h264_biweight_16x16_sse2(uint8_t *dst, uint8_t *src, int stride,
32
;                               int log2_denom, int weightd, int weights,
33
;                               int offset);
34
; and
35
; void h264_weight_16x16_sse2(uint8_t *dst, int stride,
36
;                             int log2_denom, int weight,
37
;                             int offset);
38
;-----------------------------------------------------------------------------
39

    
40
; WEIGHT_SETUP: builds the constants shared by the h264_weight_* row loops.
; Register roles follow the h264_weight prototype documented above, assuming
; x86inc maps rN to the Nth argument (confirm against x86inc.asm):
;   r2 = log2_denom, r3 = weight, r4 = offset
; On exit:
;   m3 = low word of r3d, replicated into every 16-bit lane
;   m5 = low word of ((2*r4 + 1) << r2) >> 1, replicated into every 16-bit lane
;   m6 = r2d (kept as the shift count for the later psraw)
;   m7 = 0   (used to zero-extend bytes to words)
; r4 is modified in place.
%macro WEIGHT_SETUP 0
41
    add        r4, r4              ; r4 = 2*r4
42
    inc        r4                  ; r4 = 2*r4 + 1
43
    movd       m3, r3d
44
    movd       m5, r4d
45
    movd       m6, r2d
46
    pslld      m5, m6              ; m5 <<= r2
47
    psrld      m5, 1
48
%if mmsize == 16
49
    ; 128-bit registers: replicate word 0 across the low qword, then
    ; duplicate the low qword into the high qword.
    pshuflw    m3, m3, 0
50
    pshuflw    m5, m5, 0
51
    punpcklqdq m3, m3
52
    punpcklqdq m5, m5
53
%else
54
    ; otherwise a single word shuffle fills the whole register
    pshufw     m3, m3, 0
55
    pshufw     m5, m5, 0
56
%endif
57
    pxor       m7, m7              ; m7 = 0
58
%endmacro
59

    
60
; WEIGHT_OP off_a, off_b
; Loads two chunks of bytes from [r0+off_a] and [r0+off_b], widens them to
; words, and applies  (pixel * m3 + m5) >> m6  to every word, then packs both
; results back to bytes with unsigned saturation.
; Expects m3/m5/m6/m7 as prepared by WEIGHT_SETUP.
; Result: m0 = packed bytes (chunk A in the low half, chunk B in the high half).
; Clobbers m1.
; (movh is an x86inc macro; presumably a half-register-width load - confirm.)
%macro WEIGHT_OP 2
61
    movh          m0, [r0+%1]
62
    movh          m1, [r0+%2]
63
    punpcklbw     m0, m7           ; bytes -> words (m7 is zero)
64
    punpcklbw     m1, m7
65
    pmullw        m0, m3
66
    pmullw        m1, m3
67
    paddsw        m0, m5           ; signed saturating add of the rounding term
68
    paddsw        m1, m5
69
    psraw         m0, m6           ; arithmetic shift right by the count in m6
70
    psraw         m1, m6
71
    packuswb      m0, m1           ; words -> bytes, clamped to 0..255
72
%endmacro
73

    
74
; WEIGHT_FUNC_DBL_MM rows
; Defines h264_weight_16x<rows>_mmx2. Each row is processed as two 8-byte
; halves (two WEIGHT_OP + store pairs).
; Only the rows == 16 instantiation emits the loop; any other instantiation
; sets its own row count in r2 and jumps into the 16x16 function's .nextrow
; loop, so the 16-row version must exist for the others to assemble.
; Register use in the loop: r0 = pointer to the current row, r1 = added to r0
; after each row, r2 = rows remaining.
%macro WEIGHT_FUNC_DBL_MM 1
75
cglobal h264_weight_16x%1_mmx2, 5, 5, 0
76
    WEIGHT_SETUP
77
    mov        r2, %1              ; r2 is free: WEIGHT_SETUP copied it to m6
78
%if %1 == 16
79
.nextrow
80
    WEIGHT_OP 0,  4
81
    mova     [r0  ], m0
82
    WEIGHT_OP 8, 12
83
    mova     [r0+8], m0
84
    add        r0, r1
85
    dec        r2
86
    jnz .nextrow
87
    REP_RET
88
%else
89
    jmp mangle(ff_h264_weight_16x16_mmx2.nextrow)
90
%endif
91
%endmacro
92

    
93
INIT_MMX
94
WEIGHT_FUNC_DBL_MM 16
95
WEIGHT_FUNC_DBL_MM  8
96

    
97
; WEIGHT_FUNC_MM width, rows, num_xmm, suffix
; Defines h264_weight_<width>x<rows>_<suffix>. One full-register store covers
; a whole row: WEIGHT_OP processes the two halves at offsets 0 and mmsize/2.
; Only the rows == 16 instantiation emits the loop; other row counts load
; their own count into r2 and jump into the <width>x16 function's .nextrow
; loop, so the 16-row version must exist for the others to assemble.
;
; The function takes the five arguments of the h264_weight prototype
; documented above and the body touches only r0-r4, so cglobal declares
; 5 arguments / 5 general-purpose registers, matching WEIGHT_FUNC_DBL_MM and
; WEIGHT_FUNC_HALF_MM. (Declaring 7 would make the prologue load two
; arguments that do not exist and reserve registers that are never used.)
;
; Register use in the loop: r0 = pointer to the current row, r1 = added to r0
; after each row, r2 = rows remaining.
%macro WEIGHT_FUNC_MM 4
98
cglobal h264_weight_%1x%2_%4, 5, 5, %3
99
    WEIGHT_SETUP
100
    mov        r2, %2              ; r2 is free: WEIGHT_SETUP copied it to m6
101
%if %2 == 16
102
.nextrow
103
    WEIGHT_OP 0, mmsize/2
104
    mova     [r0], m0
105
    add        r0, r1
106
    dec        r2
107
    jnz .nextrow
108
    REP_RET
109
%else
110
    jmp mangle(ff_h264_weight_%1x16_%4.nextrow)
111
%endif
112
%endmacro
113

    
114
INIT_MMX
115
WEIGHT_FUNC_MM  8, 16,  0, mmx2
116
WEIGHT_FUNC_MM  8,  8,  0, mmx2
117
WEIGHT_FUNC_MM  8,  4,  0, mmx2
118
INIT_XMM
119
WEIGHT_FUNC_MM 16, 16,  8, sse2
120
WEIGHT_FUNC_MM 16,  8,  8, sse2
121

    
122
; WEIGHT_FUNC_HALF_MM width, rows, loop_rows, num_xmm, suffix
; Defines h264_weight_<width>x<rows>_<suffix> for blocks whose row is half a
; SIMD register wide, so each loop iteration handles two rows at once:
; WEIGHT_OP loads the row at r0 and the row at r0+r1, and the packed result
; holds the first row in its low half and the second row in its high half.
; Only the instantiation with rows == mmsize emits the loop; the others jump
; into the <width>x<loop_rows> function's .nextrow loop, so loop_rows must
; name that instantiation.
; Register use in the loop: r0 = pointer to the current row pair, r1 = offset
; from the first row to the second, r2 = row pairs remaining, r3 = 2*r1.
%macro WEIGHT_FUNC_HALF_MM 5
123
cglobal h264_weight_%1x%2_%5, 5, 5, %4
124
    WEIGHT_SETUP
125
    mov        r2, %2/2            ; two rows per iteration
126
    lea        r3, [r1*2]          ; r3 is free: WEIGHT_SETUP copied it to m3
127
%if %2 == mmsize
128
.nextrow
129
    WEIGHT_OP 0, r1
130
    movh     [r0], m0              ; low half -> first row
131
%if mmsize == 16
132
    movhps   [r0+r1], m0           ; high 64 bits -> second row
133
%else
134
    psrlq      m0, 32              ; bring the upper 32 bits down
135
    movh     [r0+r1], m0           ; -> second row
136
%endif
137
    add        r0, r3
138
    dec        r2
139
    jnz .nextrow
140
    REP_RET
141
%else
142
    jmp mangle(ff_h264_weight_%1x%3_%5.nextrow)
143
%endif
144
%endmacro
145

    
146
INIT_MMX
147
WEIGHT_FUNC_HALF_MM 4,  8,  8, 0, mmx2
148
WEIGHT_FUNC_HALF_MM 4,  4,  8, 0, mmx2
149
WEIGHT_FUNC_HALF_MM 4,  2,  8, 0, mmx2
150
INIT_XMM
151
WEIGHT_FUNC_HALF_MM 8, 16, 16, 8, sse2
152
WEIGHT_FUNC_HALF_MM 8,  8, 16, 8, sse2
153
WEIGHT_FUNC_HALF_MM 8,  4, 16, 8, sse2
154

    
155
; BIWEIGHT_SETUP: builds the constants shared by the MMX2/SSE2
; h264_biweight_* row loops.
; Register roles follow the h264_biweight prototype documented above, assuming
; x86inc maps rN to the Nth argument (confirm against x86inc.asm):
;   r3 = log2_denom, r4 = weightd, r5 = weights, r6 = offset
; On exit:
;   m3 = low word of r4d, replicated into every 16-bit lane
;   m4 = low word of r5d, replicated into every 16-bit lane
;   m5 = low word of (((r6 + 1) | 1) << (r3 + 1)) >> 1, replicated likewise
;   m6 = r3 + 1 (kept as the shift count for the later psraw)
;   m7 = 0      (used to zero-extend bytes to words)
; r3 and r6 are modified in place.
%macro BIWEIGHT_SETUP 0
156
    add        r6, 1
157
    or         r6, 1               ; r6 = (r6 + 1) | 1
158
    add        r3, 1               ; r3 = r3 + 1
159
    movd       m3, r4d
160
    movd       m4, r5d
161
    movd       m5, r6d
162
    movd       m6, r3d
163
    pslld      m5, m6              ; m5 <<= r3
164
    psrld      m5, 1
165
%if mmsize == 16
166
    ; 128-bit registers: replicate word 0 across the low qword, then
    ; duplicate the low qword into the high qword.
    pshuflw    m3, m3, 0
167
    pshuflw    m4, m4, 0
168
    pshuflw    m5, m5, 0
169
    punpcklqdq m3, m3
170
    punpcklqdq m4, m4
171
    punpcklqdq m5, m5
172
%else
173
    ; otherwise a single word shuffle fills the whole register
    pshufw     m3, m3, 0
174
    pshufw     m4, m4, 0
175
    pshufw     m5, m5, 0
176
%endif
177
    pxor       m7, m7              ; m7 = 0
178
%endmacro
179

    
180
; BIWEIGHT_STEPA acc, tmp, off
; Loads one chunk of bytes from [r0+off] and one from [r1+off], widens both to
; words, and leaves  [r0]*m3 + [r1]*m4  (signed saturating sum) in m<acc>.
; Expects m3/m4/m7 as prepared by BIWEIGHT_SETUP. Clobbers m<tmp>.
%macro BIWEIGHT_STEPA 3
181
    movh       m%1, [r0+%3]
182
    movh       m%2, [r1+%3]
183
    punpcklbw  m%1, m7             ; bytes -> words (m7 is zero)
184
    punpcklbw  m%2, m7
185
    pmullw     m%1, m3
186
    pmullw     m%2, m4
187
    paddsw     m%1, m%2
188
%endmacro
189

    
190
; BIWEIGHT_STEPB: finishes two word accumulators held in m0 and m1:
; adds m5 (signed saturating), shifts right arithmetically by the count in m6,
; and packs both into m0 as bytes with unsigned saturation (m0's values in the
; low half, m1's in the high half).
; Expects m5/m6 as prepared by BIWEIGHT_SETUP.
%macro BIWEIGHT_STEPB 0
191
    paddsw     m0, m5
192
    paddsw     m1, m5
193
    psraw      m0, m6
194
    psraw      m1, m6
195
    packuswb   m0, m1
196
%endmacro
197

    
198
; BIWEIGHT_FUNC_DBL_MM rows
; Defines h264_biweight_16x<rows>_mmx2. Each row is processed as two 8-byte
; halves; every half is two BIWEIGHT_STEPA calls (4 bytes each) followed by
; BIWEIGHT_STEPB and a store to [r0].
; Only the rows == 16 instantiation emits the loop; any other instantiation
; sets its own row count in r3 and jumps into the 16x16 function's .nextrow
; loop, so the 16-row version must exist for the others to assemble.
; Register use in the loop: r0 = row pointer that is read and written,
; r1 = row pointer that is only read, r2 = added to both after each row,
; r3 = rows remaining.
%macro BIWEIGHT_FUNC_DBL_MM 1
199
cglobal h264_biweight_16x%1_mmx2, 7, 7, 0
200
    BIWEIGHT_SETUP
201
    mov        r3, %1              ; r3 is free: BIWEIGHT_SETUP copied it to m6
202
%if %1 == 16
203
.nextrow
204
    BIWEIGHT_STEPA 0, 1, 0
205
    BIWEIGHT_STEPA 1, 2, 4
206
    BIWEIGHT_STEPB
207
    mova       [r0], m0
208
    BIWEIGHT_STEPA 0, 1, 8
209
    BIWEIGHT_STEPA 1, 2, 12
210
    BIWEIGHT_STEPB
211
    mova     [r0+8], m0
212
    add        r0, r2
213
    add        r1, r2
214
    dec        r3
215
    jnz .nextrow
216
    REP_RET
217
%else
218
    jmp mangle(ff_h264_biweight_16x16_mmx2.nextrow)
219
%endif
220
%endmacro
221

    
222
INIT_MMX
223
BIWEIGHT_FUNC_DBL_MM 16
224
BIWEIGHT_FUNC_DBL_MM  8
225

    
226
; BIWEIGHT_FUNC_MM width, rows, num_xmm, suffix
; Defines h264_biweight_<width>x<rows>_<suffix>. One full-register store
; covers a whole row: the two BIWEIGHT_STEPA calls handle the halves at
; offsets 0 and mmsize/2, and BIWEIGHT_STEPB combines them.
; Only the rows == 16 instantiation emits the loop; other row counts load
; their own count into r3 and jump into the <width>x16 function's .nextrow
; loop, so the 16-row version must exist for the others to assemble.
; Register use in the loop: r0 = row pointer that is read and written,
; r1 = row pointer that is only read, r2 = added to both after each row,
; r3 = rows remaining.
%macro BIWEIGHT_FUNC_MM 4
227
cglobal h264_biweight_%1x%2_%4, 7, 7, %3
228
    BIWEIGHT_SETUP
229
    mov        r3, %2              ; r3 is free: BIWEIGHT_SETUP copied it to m6
230
%if %2 == 16
231
.nextrow
232
    BIWEIGHT_STEPA 0, 1, 0
233
    BIWEIGHT_STEPA 1, 2, mmsize/2
234
    BIWEIGHT_STEPB
235
    mova       [r0], m0
236
    add        r0, r2
237
    add        r1, r2
238
    dec        r3
239
    jnz .nextrow
240
    REP_RET
241
%else
242
    jmp mangle(ff_h264_biweight_%1x16_%4.nextrow)
243
%endif
244
%endmacro
245

    
246
INIT_MMX
247
BIWEIGHT_FUNC_MM  8, 16,  0, mmx2
248
BIWEIGHT_FUNC_MM  8,  8,  0, mmx2
249
BIWEIGHT_FUNC_MM  8,  4,  0, mmx2
250
INIT_XMM
251
BIWEIGHT_FUNC_MM 16, 16,  8, sse2
252
BIWEIGHT_FUNC_MM 16,  8,  8, sse2
253

    
254
; BIWEIGHT_FUNC_HALF_MM width, rows, loop_rows, num_xmm, suffix
; Defines h264_biweight_<width>x<rows>_<suffix> for blocks whose row is half a
; SIMD register wide, so each loop iteration handles two rows at once: the
; first BIWEIGHT_STEPA processes the rows at r0/r1, the second the rows at
; r0+r2/r1+r2, and the packed result holds the first row in its low half and
; the second row in its high half.
; Only the instantiation with rows == mmsize emits the loop; the others jump
; into the <width>x<loop_rows> function's .nextrow loop, so loop_rows must
; name that instantiation.
; Register use in the loop: r0 = row pointer that is read and written,
; r1 = row pointer that is only read, r2 = offset from the first row to the
; second, r3 = row pairs remaining, r4 = 2*r2.
%macro BIWEIGHT_FUNC_HALF_MM 5
255
cglobal h264_biweight_%1x%2_%5, 7, 7, %4
256
    BIWEIGHT_SETUP
257
    mov        r3, %2/2            ; two rows per iteration
258
    lea        r4, [r2*2]          ; r4 is free: BIWEIGHT_SETUP copied it to m3
259
%if %2 == mmsize
260
.nextrow
261
    BIWEIGHT_STEPA 0, 1, 0
262
    BIWEIGHT_STEPA 1, 2, r2
263
    BIWEIGHT_STEPB
264
    movh       [r0], m0            ; low half -> first row
265
%if mmsize == 16
266
    movhps     [r0+r2], m0         ; high 64 bits -> second row
267
%else
268
    psrlq      m0, 32              ; bring the upper 32 bits down
269
    movh       [r0+r2], m0         ; -> second row
270
%endif
271
    add        r0, r4
272
    add        r1, r4
273
    dec        r3
274
    jnz .nextrow
275
    REP_RET
276
%else
277
    jmp mangle(ff_h264_biweight_%1x%3_%5.nextrow)
278
%endif
279
%endmacro
280

    
281
INIT_MMX
282
BIWEIGHT_FUNC_HALF_MM 4,  8,  8, 0, mmx2
283
BIWEIGHT_FUNC_HALF_MM 4,  4,  8, 0, mmx2
284
BIWEIGHT_FUNC_HALF_MM 4,  2,  8, 0, mmx2
285
INIT_XMM
286
BIWEIGHT_FUNC_HALF_MM 8, 16, 16, 8, sse2
287
BIWEIGHT_FUNC_HALF_MM 8,  8, 16, 8, sse2
288
BIWEIGHT_FUNC_HALF_MM 8,  4, 16, 8, sse2
289

    
290
; BIWEIGHT_SSSE3_SETUP: builds the constants for the SSSE3 biweight loops.
; Register roles follow the h264_biweight prototype documented above, assuming
; x86inc maps rN to the Nth argument (confirm against x86inc.asm):
;   r3 = log2_denom, r4 = weightd, r5 = weights, r6 = offset
; On exit:
;   m4 = the byte pair (low byte of r4, low byte of r5) replicated into every
;        16-bit lane, i.e. the multiplier operand for pmaddubsw
;   m5 = low word of (((r6 + 1) | 1) << (r3 + 1)) >> 1, replicated likewise
;   m6 = r3 + 1 (kept as the shift count for the later psraw)
; r3 and r6 are modified in place; m0 is clobbered.
; Only the low byte of each weight survives the byte interleave.
%macro BIWEIGHT_SSSE3_SETUP 0
291
    add        r6, 1
292
    or         r6, 1               ; r6 = (r6 + 1) | 1
293
    add        r3, 1               ; r3 = r3 + 1
294
    movd       m4, r4d
295
    movd       m0, r5d
296
    movd       m5, r6d
297
    movd       m6, r3d
298
    pslld      m5, m6              ; m5 <<= r3
299
    psrld      m5, 1
300
    punpcklbw  m4, m0              ; interleave the bytes of r4d and r5d
301
    pshuflw    m4, m4, 0
302
    pshuflw    m5, m5, 0
303
    punpcklqdq m4, m4
304
    punpcklqdq m5, m5
305
%endmacro
306

    
307
; BIWEIGHT_SSSE3_OP: weights two registers of interleaved byte pairs.
; Inputs: m0 and m2 hold byte pairs interleaved by the caller with punpcklbw;
;         m4/m5/m6 as prepared by BIWEIGHT_SSSE3_SETUP.
; pmaddubsw multiplies each unsigned byte of m0/m2 by the matching signed byte
; of m4 and sums each adjacent pair into one signed word; the result then gets
; m5 added (signed saturating), is shifted right arithmetically by the count
; in m6, and is packed to bytes with unsigned saturation.
; Result: m0 = packed bytes (m0's values in the low half, m2's in the high
; half). Clobbers m2.
%macro BIWEIGHT_SSSE3_OP 0
308
    pmaddubsw  m0, m4
309
    pmaddubsw  m2, m4
310
    paddsw     m0, m5
311
    paddsw     m2, m5
312
    psraw      m0, m6
313
    psraw      m2, m6
314
    packuswb   m0, m2
315
%endmacro
316

    
317
; BIWEIGHT_SSSE3_16 rows
; Defines h264_biweight_16x<rows>_ssse3. Per row, the bytes at r0 and r1 are
; interleaved (first 8 bytes into m0, next 8 into m2), weighted by
; BIWEIGHT_SSSE3_OP, and the 16 result bytes are stored back to [r0].
; Only the rows == 16 instantiation emits the loop; any other instantiation
; sets its own row count in r3 and jumps into the 16x16 function's .nextrow
; loop, so the 16-row version must exist for the others to assemble.
; Register use in the loop: r0 = row pointer that is read and written,
; r1 = row pointer that is only read, r2 = added to both after each row,
; r3 = rows remaining.
%macro BIWEIGHT_SSSE3_16 1
318
cglobal h264_biweight_16x%1_ssse3, 7, 7, 8
319
    BIWEIGHT_SSSE3_SETUP
320
    mov        r3, %1              ; r3 is free: the setup copied it to m6
321

    
322
%if %1 == 16
323
.nextrow
324
    movh       m0, [r0]
325
    movh       m2, [r0+8]
326
    movh       m3, [r1+8]
327
    ; NOTE(review): memory-operand form reads a full register width from
    ; [r1]; non-VEX SSE memory operands normally need 16-byte alignment -
    ; confirm callers guarantee it.
    punpcklbw  m0, [r1]
328
    punpcklbw  m2, m3
329
    BIWEIGHT_SSSE3_OP
330
    mova       [r0], m0
331
    add        r0, r2
332
    add        r1, r2
333
    dec        r3
334
    jnz .nextrow
335
    REP_RET
336
%else
337
    jmp mangle(ff_h264_biweight_16x16_ssse3.nextrow)
338
%endif
339
%endmacro
340

    
341
INIT_XMM
342
BIWEIGHT_SSSE3_16 16
343
BIWEIGHT_SSSE3_16  8
344

    
345
; BIWEIGHT_SSSE3_8 rows
; Defines h264_biweight_8x<rows>_ssse3. Each loop iteration handles two rows:
; the row at r0/r1 is interleaved into m0 and the row at r0+r2/r1+r2 into m2;
; after BIWEIGHT_SSSE3_OP the low 8 bytes of m0 go to the first row and the
; high 8 bytes to the second.
; Only the rows == 16 instantiation emits the loop; any other instantiation
; sets its own pair count in r3 and jumps into the 8x16 function's .nextrow
; loop, so the 16-row version must exist for the others to assemble.
; Register use in the loop: r0 = row pointer that is read and written,
; r1 = row pointer that is only read, r2 = offset from the first row to the
; second, r3 = row pairs remaining, r4 = 2*r2.
%macro BIWEIGHT_SSSE3_8 1
346
cglobal h264_biweight_8x%1_ssse3, 7, 7, 8
347
    BIWEIGHT_SSSE3_SETUP
348
    mov        r3, %1/2            ; two rows per iteration
349
    lea        r4, [r2*2]          ; r4 is free: the setup copied it to m4
350

    
351
%if %1 == 16
352
.nextrow
353
    movh       m0, [r0]
354
    movh       m1, [r1]
355
    movh       m2, [r0+r2]
356
    movh       m3, [r1+r2]
357
    punpcklbw  m0, m1
358
    punpcklbw  m2, m3
359
    BIWEIGHT_SSSE3_OP
360
    movh       [r0], m0            ; low 8 bytes -> first row
361
    movhps     [r0+r2], m0         ; high 8 bytes -> second row
362
    add        r0, r4
363
    add        r1, r4
364
    dec        r3
365
    jnz .nextrow
366
    REP_RET
367
%else
368
    jmp mangle(ff_h264_biweight_8x16_ssse3.nextrow)
369
%endif
370
%endmacro
371

    
372
INIT_XMM
373
BIWEIGHT_SSSE3_8 16
374
BIWEIGHT_SSSE3_8  8
375
BIWEIGHT_SSSE3_8  4