ffmpeg / libavcodec / x86 / vp56dsp.asm @ 888fa31e
History | View | Annotate | Download (4.85 KB)
1 | 89fa3504 | Ronald S. Bultje | ;****************************************************************************** |
---|---|---|---|
2 | ;* MMX/SSE2-optimized functions for the VP6 decoder |
||
3 | ;* Copyright (C) 2009 Sebastien Lucas <sebastien.lucas@gmail.com> |
||
4 | ;* Copyright (C) 2009 Zuxy Meng <zuxy.meng@gmail.com> |
||
5 | ;* |
||
6 | 2912e87a | Mans Rullgard | ;* This file is part of Libav. |
7 | 89fa3504 | Ronald S. Bultje | ;* |
8 | 2912e87a | Mans Rullgard | ;* Libav is free software; you can redistribute it and/or |
9 | 89fa3504 | Ronald S. Bultje | ;* modify it under the terms of the GNU Lesser General Public |
10 | ;* License as published by the Free Software Foundation; either |
||
11 | ;* version 2.1 of the License, or (at your option) any later version. |
||
12 | ;* |
||
13 | 2912e87a | Mans Rullgard | ;* Libav is distributed in the hope that it will be useful, |
14 | 89fa3504 | Ronald S. Bultje | ;* but WITHOUT ANY WARRANTY; without even the implied warranty of |
15 | ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
||
16 | ;* Lesser General Public License for more details. |
||
17 | ;* |
||
18 | ;* You should have received a copy of the GNU Lesser General Public |
||
19 | 2912e87a | Mans Rullgard | ;* License along with Libav; if not, write to the Free Software |
20 | 888fa31e | Diego Biurrun | ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
21 | 89fa3504 | Ronald S. Bultje | ;****************************************************************************** |
22 | |||
23 | %include "x86inc.asm" |
||
24 | %include "x86util.asm" |
||
25 | |||
26 | cextern pw_64 |
||
27 | |||
28 | SECTION .text |
||
29 | |||
;------------------------------------------------------------------------------
; DIAG4_MMX base, off0, off1, off2, off3, dst
;
; 4-tap weighted filter over 8 adjacent bytes, MMX flavour.
;   %1       register holding the base address of the samples
;   %2..%5   displacements of the four taps relative to %1
;   %6       address that receives the 8 filtered bytes
;
; Expected from the code expanding this macro:
;   m7                zero (used to widen bytes to words)
;   m6                rounding term added before the final shift
;   [rsp+8*11..14]    tap weights 0..3, each replicated across 4 words
; Clobbers m0-m5.  Each 8-byte input is split into a low half (m0/m1/m2)
; and a high half (m3/m4/m5) because an MMX register holds only 4 words.
;------------------------------------------------------------------------------
%macro DIAG4_MMX 6
    ; tap 0: m0 = pixels 0-3, m3 = pixels 4-7
    movq        m0, [%1+%2]
    movq        m3, m0
    punpcklbw   m0, m7
    punpckhbw   m3, m7
    pmullw      m0, [rsp+8*11]          ; * weight[0]
    pmullw      m3, [rsp+8*11]

    ; tap 1: m1 / m4, folded into the tap-0 products
    movq        m1, [%1+%3]
    movq        m4, m1
    punpcklbw   m1, m7
    punpckhbw   m4, m7
    pmullw      m1, [rsp+8*12]          ; * weight[1]
    pmullw      m4, [rsp+8*12]
    paddw       m0, m1
    paddw       m3, m4

    ; tap 2: m1 / m4
    movq        m1, [%1+%4]
    movq        m4, m1
    punpcklbw   m1, m7
    punpckhbw   m4, m7
    pmullw      m1, [rsp+8*13]          ; * weight[2]
    pmullw      m4, [rsp+8*13]

    ; tap 3: m2 / m5, folded into the tap-2 products
    movq        m2, [%1+%5]
    movq        m5, m2
    punpcklbw   m2, m7
    punpckhbw   m5, m7
    pmullw      m2, [rsp+8*14]          ; * weight[3]
    pmullw      m5, [rsp+8*14]
    paddw       m1, m2
    paddw       m4, m5

    ; combine both partial sums with signed saturation, round, scale by 1/128
    paddsw      m0, m1
    paddsw      m3, m4
    paddsw      m0, m6                  ; + rounding term
    paddsw      m3, m6
    psraw       m0, 7
    psraw       m3, 7

    ; clamp to 0..255 and write the 8 result bytes
    packuswb    m0, m3
    movq      [%6], m0
%endmacro
||
68 | |||
;------------------------------------------------------------------------------
; DIAG4_SSE2 base, off0, off1, off2, off3, dst
;
; 4-tap weighted filter over 8 adjacent bytes, SSE2 flavour.
;   %1       register holding the base address of the samples
;   %2..%5   displacements of the four taps relative to %1
;   %6       address that receives the 8 filtered bytes
;
; Expected from the code expanding this macro:
;   m7                   zero (used to widen bytes to words)
;   m4, m5, m6, m3       tap weights 0, 1, 2, 3, each broadcast to all words
; Clobbers m0-m2.  The rounding term is read straight from pw_64.
;------------------------------------------------------------------------------
%macro DIAG4_SSE2 6
    ; tap 0
    movq        m0, [%1+%2]
    punpcklbw   m0, m7
    pmullw      m0, m4                  ; * weight[0]

    ; tap 1, folded into tap 0
    movq        m1, [%1+%3]
    punpcklbw   m1, m7
    pmullw      m1, m5                  ; * weight[1]
    paddw       m0, m1

    ; tap 2
    movq        m1, [%1+%4]
    punpcklbw   m1, m7
    pmullw      m1, m6                  ; * weight[2]

    ; tap 3, folded into tap 2
    movq        m2, [%1+%5]
    punpcklbw   m2, m7
    pmullw      m2, m3                  ; * weight[3]
    paddw       m1, m2

    ; combine with signed saturation, round, scale by 1/128
    paddsw      m0, m1
    paddsw      m0, [pw_64]             ; + rounding term
    psraw       m0, 7

    ; clamp to 0..255; only the low 8 bytes are stored
    packuswb    m0, m0
    movq      [%6], m0
%endmacro
||
90 | |||
;------------------------------------------------------------------------------
; SPLAT4REGS_MMX
;
; In:   m3 = four 16-bit weights w0 w1 w2 w3 (lowest word first)
; Out:  [rsp+8*11] = w0 in all 4 words   (also left in m3)
;       [rsp+8*12] = w1 in all 4 words   (also left in m4)
;       [rsp+8*13] = w2 in all 4 words   (also left in m5)
;       [rsp+8*14] = w3 in all 4 words   (also left in m2)
; These stack slots are the ones DIAG4_MMX multiplies by.
;------------------------------------------------------------------------------
%macro SPLAT4REGS_MMX 0
    ; first level: duplicate each word into a pair
    movq        m5, m3
    punpcklwd   m3, m3                  ; w0 w0 w1 w1
    punpckhwd   m5, m5                  ; w2 w2 w3 w3
    movq        m4, m3
    movq        m2, m5

    ; second level: duplicate the wanted pair into the whole register
    punpckldq   m3, m3                  ; w0 x4
    punpckhdq   m4, m4                  ; w1 x4
    punpckldq   m5, m5                  ; w2 x4
    punpckhdq   m2, m2                  ; w3 x4

    ; spill to the weight slots above the temporary rows
    movq    [rsp+8*11], m3
    movq    [rsp+8*12], m4
    movq    [rsp+8*13], m5
    movq    [rsp+8*14], m2
%endmacro
106 | |||
;------------------------------------------------------------------------------
; SPLAT4REGS_SSE2
;
; In:   m3 = four 16-bit weights w0 w1 w2 w3 in its low quadword
; Out:  m4 = w0, m5 = w1, m6 = w2, m3 = w3, each broadcast to all 8 words
; These are the registers DIAG4_SSE2 multiplies by.
;------------------------------------------------------------------------------
%macro SPLAT4REGS_SSE2 0
    ; weight 0
    pshuflw     m4, m3, 0x00            ; low qword = w0 x4
    punpcklqdq  m4, m4                  ; copy low qword to high qword

    ; weight 1
    pshuflw     m5, m3, 0x55
    punpcklqdq  m5, m5

    ; weight 2
    pshuflw     m6, m3, 0xAA
    punpcklqdq  m6, m6

    ; weight 3 -- done last because it overwrites the source register
    pshuflw     m3, m3, 0xFF
    punpcklqdq  m3, m3
%endmacro
||
117 | |||
;------------------------------------------------------------------------------
; vp6_filter_diag4 opt, xmmregs
;
; Emits ff_vp6_filter_diag4_<opt>.  %1 is the function-name suffix (mmx or
; sse2), %2 is forwarded as the last cglobal argument.  DIAG4 and SPLAT4REGS
; must be %defined to the matching flavour before the macro is expanded.
;
; Arguments as loaded by cglobal: r0 = dst, r1 = src, r2 = stride,
; r3 = horizontal weights, r4 = vertical weights.
; Scratch: r5 = saved rsp, r6 = loop counter, r3 reused as row pointer.
;
; Stack layout after alignment (8-byte slots):
;   [rsp+8*0 .. rsp+8*10]   11 horizontally filtered rows of 8 bytes
;   [rsp+8*11 .. rsp+8*14]  splatted weights (MMX flavour only)
;------------------------------------------------------------------------------
%macro vp6_filter_diag4 2
; void ff_vp6_filter_diag4_<opt>(uint8_t *dst, uint8_t *src, int stride,
;                                const int16_t h_weights[4],
;                                const int16_t v_weights[4])
cglobal vp6_filter_diag4_%1, 5, 7, %2
    mov          r5, rsp         ; backup stack pointer
    and         rsp, ~(mmsize-1) ; align stack
%ifidn %1, sse2
    sub         rsp, 8*11        ; 11 temporary rows
%else
    sub         rsp, 8*15        ; 11 temporary rows + 4 weight slots
    movq         m6, [pw_64]     ; rounding term kept in a register for MMX
%endif
%ifdef ARCH_X86_64
    movsxd       r2, r2d         ; stride is a 32-bit int; widen for addressing
%endif

    sub          r1, r2          ; start one row above src

    pxor         m7, m7          ; zero, used by DIAG4 to widen bytes
    movq         m3, [r3]        ; 4 horizontal weights
    SPLAT4REGS

    ; horizontal pass: 11 source rows -> temporary rows on the stack,
    ; taps at byte offsets -1, 0, +1, +2
    mov          r3, rsp
    mov          r6, 11
.nextrow:
    DIAG4        r1, -1, 0, 1, 2, r3
    add          r3, 8
    add          r1, r2
    dec          r6
    jnz .nextrow

    movq         m3, [r4]        ; 4 vertical weights
    SPLAT4REGS

    ; vertical pass: 8 output rows, taps one temporary row (8 bytes) apart,
    ; starting from the second temporary row so that offset -8 is valid
    lea          r3, [rsp+8]
    mov          r6, 8
.nextcol:
    DIAG4        r3, -8, 0, 8, 16, r0
    add          r3, 8
    add          r0, r2
    dec          r6
    jnz .nextcol

    mov         rsp, r5          ; restore stack pointer
    RET
%endmacro
||
164 | |||
; MMX build: 64-bit registers, weights kept on the stack.
INIT_MMX
%define SPLAT4REGS SPLAT4REGS_MMX
%define DIAG4      DIAG4_MMX
vp6_filter_diag4 mmx,  0

; SSE2 build: 128-bit registers, weights kept in m3-m6.
; The second argument is forwarded to cglobal.
INIT_XMM
%define SPLAT4REGS SPLAT4REGS_SSE2
%define DIAG4      DIAG4_SSE2
vp6_filter_diag4 sse2, 8