Statistics
| Branch: | Revision:

ffmpeg / libavcodec / x86 / h264_weight_sse2.asm @ 4eca52ed

History | View | Annotate | Download (4.23 KB)

1 98fe09df Jason Garrett-Glaser
;*****************************************************************************
2
;* SSE2-optimized weighted prediction code
3
;*****************************************************************************
4
;* Copyright (c) 2004-2005 Michael Niedermayer, Loren Merritt
5
;* Copyright (C) 2010 Eli Friedman <eli.friedman@gmail.com>
6
;*
7
;* This file is part of FFmpeg.
8
;*
9
;* FFmpeg is free software; you can redistribute it and/or
10
;* modify it under the terms of the GNU Lesser General Public
11
;* License as published by the Free Software Foundation; either
12
;* version 2.1 of the License, or (at your option) any later version.
13
;*
14
;* FFmpeg is distributed in the hope that it will be useful,
15
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
16
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
17
;* Lesser General Public License for more details.
18
;*
19
;* You should have received a copy of the GNU Lesser General Public
20
;* License along with FFmpeg; if not, write to the Free Software
21
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
22
;******************************************************************************
23
24
%include "x86inc.asm"
25
26
SECTION .text
27
INIT_XMM
28
29
;-----------------------------------------------------------------------------
30
; biweight pred:
31
;
32
; void h264_biweight_16x16_sse2(uint8_t *dst, uint8_t *src, int stride,
33
;                               int log2_denom, int weightd, int weights,
34
;                               int offset);
35
;-----------------------------------------------------------------------------
36
37
;-----------------------------------------------------------------------------
; BIWEIGHT_SSE2_SETUP
;
; Moves the scalar weighting arguments into XMM registers for the SSE2 loops.
; In (cglobal's rN = argument N, per the prototype above):
;      r3 = log2_denom, r4 = weightd, r5 = weights, r6 = offset
; Out: m3 = low 16 bits of weightd in every word lane
;      m4 = low 16 bits of weights in every word lane
;      m5 = low 16 bits of ((((offset + 1) | 1) << (log2_denom + 1)) >> 1)
;           in every word lane (the rounding/offset term)
;      m6 = log2_denom + 1 in the low dword, all other bits zero (shift count)
;      m7 = 0 (zero source for byte->word unpacking)
; Clobbers: r3, r6, flags.
;
; The arguments are C ints, so only the 32-bit register views (rNd) are used.
; pslld/psraw take their count from the low 64 bits of m6; movd from a 32-bit
; register zero-extends, so stray upper bits of r3 can never reach the count.
;-----------------------------------------------------------------------------
%macro BIWEIGHT_SSE2_SETUP 0
    add        r6d, 1
    or         r6d, 1           ; r6d = (offset + 1) | 1
    add        r3d, 1           ; r3d = log2_denom + 1
    movd       m3, r4d
    movd       m4, r5d
    movd       m5, r6d
    movd       m6, r3d
    pslld      m5, m6
    psrld      m5, 1            ; m5 = (r6d << r3d) >> 1
    pshuflw    m3, m3, 0        ; broadcast low word to the low four words
    pshuflw    m4, m4, 0
    pshuflw    m5, m5, 0
    punpcklqdq m3, m3           ; ...and copy them into the high four words
    punpcklqdq m4, m4
    punpcklqdq m5, m5
    pxor       m7, m7
%endmacro
55
56
;-----------------------------------------------------------------------------
; BIWEIGHT_SSE2_STEPA dst_idx, tmp_idx, byte_offset
;
; %1 = index of the XMM register that receives the result
; %2 = index of a scratch XMM register (overwritten)
; %3 = offset added to both r0 and r1 when loading
;
; Unpacks the low 8 bytes loaded from [r0+%3] and [r1+%3] to words against m7,
; multiplies them by m3 and m4 respectively (pmullw keeps the low 16 bits of
; each product) and adds the two products with signed saturation into m%1.
; m7 must be zero for the unpack to act as a zero-extension.
;-----------------------------------------------------------------------------
%macro BIWEIGHT_SSE2_STEPA 3
    ; term read through r0, scaled by m3
    movh       m%1, [r0+%3]
    punpcklbw  m%1, m7
    pmullw     m%1, m3

    ; term read through r1, scaled by m4
    movh       m%2, [r1+%3]
    punpcklbw  m%2, m7
    pmullw     m%2, m4

    paddsw     m%1, m%2
%endmacro
65
66
;-----------------------------------------------------------------------------
; BIWEIGHT_SSE2_STEPB
;
; Finishes the two word vectors in m0 and m1: adds m5 with signed saturation,
; shifts right arithmetically by the count in m6, then packs both vectors to
; unsigned-saturated bytes in m0 (m0's words in the low half, m1's in the
; high half).
;-----------------------------------------------------------------------------
%macro BIWEIGHT_SSE2_STEPB 0
    ; first eight words
    paddsw     m0, m5
    psraw      m0, m6

    ; second eight words
    paddsw     m1, m5
    psraw      m1, m6

    packuswb   m0, m1
%endmacro
73
74
;-----------------------------------------------------------------------------
; void h264_biweight_16x16_sse2(uint8_t *dst, uint8_t *src, int stride,
;                               int log2_denom, int weightd, int weights,
;                               int offset);
;
; Processes 16 rows of 16 pixels. For each row the bytes at r0 (dst) and
; r1 (src) are weighted and summed by BIWEIGHT_SSE2_STEPA, rounded, shifted
; and packed to bytes by BIWEIGHT_SSE2_STEPB, and the 16 result bytes are
; stored back to [r0].
;
; The store uses mova, so dst is expected to be aligned for a full XMM store.
; NOTE(review): stride is declared int but r2 is added to the pointers at
; full register width; confirm callers provide a properly extended value on
; x86-64.
;-----------------------------------------------------------------------------
cglobal h264_biweight_16x16_sse2, 7, 7, 8
    BIWEIGHT_SSE2_SETUP
    mov        r3d, 16              ; r3d = rows remaining

.nextrow:
    BIWEIGHT_SSE2_STEPA 0, 1, 0     ; m0 = weighted sum for bytes 0..7
    BIWEIGHT_SSE2_STEPA 1, 2, 8     ; m1 = weighted sum for bytes 8..15
    BIWEIGHT_SSE2_STEPB             ; m0 = 16 packed result bytes
    mova       [r0], m0
    add        r0, r2               ; advance both pointers by one row
    add        r1, r2
    dec        r3d
    jnz .nextrow
    REP_RET
88
89
;-----------------------------------------------------------------------------
; h264_biweight_8x8_sse2: 8x8 variant of the SSE2 biweight routine. It is
; declared with the same seven arguments and uses the same register roles
; (r0 = dst, r1 = src, r2 = stride) as the 16x16 version.
;
; Two 8-pixel rows are handled per iteration (4 iterations): the row at
; offset 0 goes through m0 and the row at offset r2 through m1, and after
; packing the low and high halves of m0 are stored to [r0] and [r0+r2].
;
; NOTE(review): stride is declared int but r2 is used at full register width
; in addressing; confirm callers provide a properly extended value on x86-64.
;-----------------------------------------------------------------------------
cglobal h264_biweight_8x8_sse2, 7, 7, 8
    BIWEIGHT_SSE2_SETUP
    mov        r3d, 4               ; r3d = row pairs remaining
    lea        r4, [r2*2]           ; r4 = two rows' worth of stride

.nextrow:
    BIWEIGHT_SSE2_STEPA 0, 1, 0     ; m0 = weighted sum, first row
    BIWEIGHT_SSE2_STEPA 1, 2, r2    ; m1 = weighted sum, second row
    BIWEIGHT_SSE2_STEPB             ; m0 = row 0 bytes (low) | row 1 bytes (high)
    movh       [r0], m0
    movhps     [r0+r2], m0
    add        r0, r4
    add        r1, r4
    dec        r3d
    jnz .nextrow
    REP_RET
105
106
;-----------------------------------------------------------------------------
; BIWEIGHT_SSSE3_SETUP
;
; Moves the scalar weighting arguments into XMM registers for the SSSE3 loops.
; In (cglobal's rN = argument N):
;      r3 = log2_denom, r4 = weightd, r5 = weights, r6 = offset
; Out: m4 = byte pair { low byte of weightd, low byte of weights } repeated in
;           every word lane (multiplier layout for pmaddubsw)
;      m5 = low 16 bits of ((((offset + 1) | 1) << (log2_denom + 1)) >> 1)
;           in every word lane (the rounding/offset term)
;      m6 = log2_denom + 1 in the low dword, all other bits zero (shift count)
; Clobbers: r3, r6, m0, flags.
;
; The arguments are C ints, so only the 32-bit register views (rNd) are used.
; pslld/psraw take their count from the low 64 bits of m6; movd from a 32-bit
; register zero-extends, so stray upper bits of r3 can never reach the count.
;-----------------------------------------------------------------------------
%macro BIWEIGHT_SSSE3_SETUP 0
    add        r6d, 1
    or         r6d, 1           ; r6d = (offset + 1) | 1
    add        r3d, 1           ; r3d = log2_denom + 1
    movd       m4, r4d
    movd       m0, r5d
    movd       m5, r6d
    movd       m6, r3d
    pslld      m5, m6
    psrld      m5, 1            ; m5 = (r6d << r3d) >> 1
    punpcklbw  m4, m0           ; interleave the two weights' bytes
    pshuflw    m4, m4, 0        ; broadcast low word to the low four words
    pshuflw    m5, m5, 0
    punpcklqdq m4, m4           ; ...and copy them into the high four words
    punpcklqdq m5, m5
%endmacro
122
123
;-----------------------------------------------------------------------------
; BIWEIGHT_SSSE3_OP
;
; Expects m0 and m2 to hold interleaved byte pairs. pmaddubsw multiplies each
; pair by the byte pair in m4 and sums the two products into one word; m5 is
; then added with signed saturation, the words are shifted right
; arithmetically by the count in m6, and both vectors are packed to
; unsigned-saturated bytes in m0 (m0's words low, m2's words high).
;-----------------------------------------------------------------------------
%macro BIWEIGHT_SSSE3_OP 0
    ; first vector
    pmaddubsw  m0, m4
    paddsw     m0, m5
    psraw      m0, m6

    ; second vector
    pmaddubsw  m2, m4
    paddsw     m2, m5
    psraw      m2, m6

    packuswb   m0, m2
%endmacro
132
133
;-----------------------------------------------------------------------------
; h264_biweight_16x16_ssse3: SSSE3 variant of the 16x16 biweight routine. It
; is declared with seven arguments and uses r0/r1 as the two pixel pointers
; and r2 as the row step, like the SSE2 version.
;
; Per row (16 iterations): bytes from [r0] and [r1] are interleaved pairwise
; so that BIWEIGHT_SSSE3_OP can weight and sum each pair with one pmaddubsw;
; the 16 packed result bytes are stored back to [r0].
;
; punpcklbw takes [r1] as a full XMM memory operand and the store uses mova,
; so both pointers are expected to be suitably aligned.
; NOTE(review): stride is declared int (assuming the SSE2 prototype applies)
; but r2 is added to the pointers at full register width; confirm callers
; provide a properly extended value on x86-64.
;-----------------------------------------------------------------------------
cglobal h264_biweight_16x16_ssse3, 7, 7, 8
    BIWEIGHT_SSSE3_SETUP
    mov        r3d, 16              ; r3d = rows remaining

.nextrow:
    movh       m0, [r0]
    movh       m2, [r0+8]
    movh       m3, [r1+8]
    punpcklbw  m0, [r1]             ; m0 = pairs for bytes 0..7
    punpcklbw  m2, m3               ; m2 = pairs for bytes 8..15
    BIWEIGHT_SSSE3_OP               ; m0 = 16 packed result bytes
    mova       [r0], m0
    add        r0, r2               ; advance both pointers by one row
    add        r1, r2
    dec        r3d
    jnz .nextrow
    REP_RET
150
151
;-----------------------------------------------------------------------------
; h264_biweight_8x8_ssse3: SSSE3 variant of the 8x8 biweight routine. It is
; declared with seven arguments and uses r0/r1 as the two pixel pointers and
; r2 as the row step, like the SSE2 version.
;
; Two 8-pixel rows are handled per iteration (4 iterations): each row's bytes
; from [r0] and [r1] are interleaved pairwise, BIWEIGHT_SSSE3_OP weights,
; rounds, shifts and packs them, and the low and high halves of m0 are stored
; to [r0] and [r0+r2].
;
; NOTE(review): stride is declared int (assuming the SSE2 prototype applies)
; but r2 is used at full register width in addressing; confirm callers
; provide a properly extended value on x86-64.
;-----------------------------------------------------------------------------
cglobal h264_biweight_8x8_ssse3, 7, 7, 8
    BIWEIGHT_SSSE3_SETUP
    mov        r3d, 4               ; r3d = row pairs remaining
    lea        r4, [r2*2]           ; r4 = two rows' worth of stride

.nextrow:
    movh       m0, [r0]
    movh       m1, [r1]
    movh       m2, [r0+r2]
    movh       m3, [r1+r2]
    punpcklbw  m0, m1               ; m0 = pairs for the first row
    punpcklbw  m2, m3               ; m2 = pairs for the second row
    BIWEIGHT_SSSE3_OP               ; m0 = row 0 bytes (low) | row 1 bytes (high)
    movh       [r0], m0
    movhps     [r0+r2], m0
    add        r0, r4
    add        r1, r4
    dec        r3d
    jnz .nextrow
    REP_RET