ffmpeg / libavcodec / x86 / h264_weight.asm @ 98c6053c

1 | a33a2562 | Ronald S. Bultje | ;***************************************************************************** |
2 | ;* SSE2-optimized weighted prediction code |
3 | ;***************************************************************************** |
4 | ;* Copyright (c) 2004-2005 Michael Niedermayer, Loren Merritt |
5 | ;* Copyright (C) 2010 Eli Friedman <eli.friedman@gmail.com> |
6 | ;* |
7 | ;* This file is part of FFmpeg. |
8 | ;* |
9 | ;* FFmpeg is free software; you can redistribute it and/or |
10 | ;* modify it under the terms of the GNU Lesser General Public |
11 | ;* License as published by the Free Software Foundation; either |
12 | ;* version 2.1 of the License, or (at your option) any later version. |
13 | ;* |
14 | ;* FFmpeg is distributed in the hope that it will be useful, |
15 | ;* but WITHOUT ANY WARRANTY; without even the implied warranty of |
16 | ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
17 | ;* Lesser General Public License for more details. |
18 | ;* |
19 | ;* You should have received a copy of the GNU Lesser General Public |
20 | ;* License along with FFmpeg; if not, write to the Free Software |
21 | ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
22 | ;****************************************************************************** |
23 | |||

24 | %include "x86inc.asm" |
25 | |||

26 | SECTION .text |
27 | |||

28 | ;----------------------------------------------------------------------------- |
29 | ; biweight pred: |
||

30 | ; |
31 | ; void h264_biweight_16x16_sse2(uint8_t *dst, uint8_t *src, int stride, |
32 | ; int log2_denom, int weightd, int weights, |
33 | ; int offset); |
34 | ; and |
35 | ; void h264_weight_16x16_sse2(uint8_t *dst, int stride, |
36 | ; int log2_denom, int weight, |
37 | ; int offset); |
38 | ;----------------------------------------------------------------------------- |
39 | |||

;-----------------------------------------------------------------------------
; WEIGHT_SETUP
; Loads the loop-invariant constants of the unidirectional weight functions.
;
; In  (argument registers, per the h264_weight prototype in the file header):
;       r2 = log2_denom, r3 = weight, r4 = offset
; Out:  m3 = low word of weight, replicated into every 16-bit lane
;       m5 = low word of ((2*offset + 1) << log2_denom) >> 1, replicated
;       m6 = log2_denom in the low dword (used as the shift count)
;       m7 = 0 (zero source for byte -> word unpacking)
; Clobbers: r4, flags.
;-----------------------------------------------------------------------------
%macro WEIGHT_SETUP 0
    add         r4, r4
    inc         r4                      ; r4 = 2*offset + 1
    movd        m6, r2d                 ; shift count
    movd        m5, r4d
    movd        m3, r3d
    pslld       m5, m6
    psrld       m5, 1                   ; rounding/offset term
    pxor        m7, m7
%if mmsize == 16
    ; broadcast word 0 across the low quadword, then duplicate that quadword
    pshuflw     m5, m5, 0
    pshuflw     m3, m3, 0
    punpcklqdq  m5, m5
    punpcklqdq  m3, m3
%else
    pshufw      m5, m5, 0
    pshufw      m3, m3, 0
%endif
%endmacro
59 | |||

;-----------------------------------------------------------------------------
; WEIGHT_OP off_a, off_b
; Weights two half-register groups of pixels read from [r0+off_a] and
; [r0+off_b] and leaves them packed as bytes in m0 (off_a group in the low
; half, off_b group in the high half).
;
; Per pixel: clip_uint8((pix * m3 + m5) >> m6), with a signed-saturating add
; and an unsigned-saturating pack.
; Expects m3/m5/m6/m7 as prepared by WEIGHT_SETUP.  Clobbers m0, m1.
;-----------------------------------------------------------------------------
%macro WEIGHT_OP 2
    movh        m0, [r0+%1]
    punpcklbw   m0, m7                  ; bytes -> words (m7 is zero)
    movh        m1, [r0+%2]
    punpcklbw   m1, m7
    pmullw      m0, m3
    pmullw      m1, m3
    paddsw      m0, m5
    paddsw      m1, m5
    psraw       m0, m6
    psraw       m1, m6
    packuswb    m0, m1
%endmacro
73 | |||

;-----------------------------------------------------------------------------
; WEIGHT_FUNC_DBL_MM rows
; Emits h264_weight_16x<rows>_mmx2: each 16-pixel row is handled as two
; 8-byte MMX registers.
;
; Registers: r0 = dst, r1 = stride; r2 is reused as the row counter once
; WEIGHT_SETUP has copied log2_denom into m6.
; Only the 16-row instance contains the loop; the other instance runs the same
; prologue and setup, loads its own row count and jumps into that loop, so it
; also shares the epilogue.
; NOTE(review): stride is declared `int` in the documented prototype but is
; added as a full-width register; confirm the upper half is well defined on
; x86-64.
;-----------------------------------------------------------------------------
%macro WEIGHT_FUNC_DBL_MM 1
cglobal h264_weight_16x%1_mmx2, 5, 5, 0
    WEIGHT_SETUP
    mov         r2, %1                  ; rows remaining
%if %1 == 16
.nextrow:                               ; colon avoids the assembler's orphan-label warning
    WEIGHT_OP    0,  4                  ; pixels 0..7
    mova        [r0  ], m0
    WEIGHT_OP    8, 12                  ; pixels 8..15
    mova        [r0+8], m0
    add         r0, r1
    dec         r2
    jnz .nextrow
    REP_RET
%else
    jmp mangle(ff_h264_weight_16x16_mmx2.nextrow)
%endif
%endmacro

INIT_MMX
WEIGHT_FUNC_DBL_MM 16
WEIGHT_FUNC_DBL_MM  8
96 | |||

;-----------------------------------------------------------------------------
; WEIGHT_FUNC_MM width, rows, xmm_regs, cpu
; Emits h264_weight_<width>x<rows>_<cpu>: one full SIMD register (mmsize
; pixels) per row, processed as two halves by WEIGHT_OP.
;
; Registers: r0 = dst, r1 = stride; r2 is reused as the row counter once
; WEIGHT_SETUP has copied log2_denom into m6.
; Only the 16-row instance contains the loop; shorter instances run the same
; prologue and setup, load their own row count and jump into that loop.
; NOTE(review): cglobal requests 7 args/regs here although the documented
; prototype has 5 arguments and only r0-r4 are referenced; the sibling weight
; macros use 5, 5.  Left unchanged -- confirm whether this is intentional.
;-----------------------------------------------------------------------------
%macro WEIGHT_FUNC_MM 4
cglobal h264_weight_%1x%2_%4, 7, 7, %3
    WEIGHT_SETUP
    mov         r2, %2                  ; rows remaining
%if %2 == 16
.nextrow:                               ; colon avoids the assembler's orphan-label warning
    WEIGHT_OP    0, mmsize/2
    mova        [r0], m0
    add         r0, r1
    dec         r2
    jnz .nextrow
    REP_RET
%else
    jmp mangle(ff_h264_weight_%1x16_%4.nextrow)
%endif
%endmacro

INIT_MMX
WEIGHT_FUNC_MM  8, 16,  0, mmx2
WEIGHT_FUNC_MM  8,  8,  0, mmx2
WEIGHT_FUNC_MM  8,  4,  0, mmx2
INIT_XMM
WEIGHT_FUNC_MM 16, 16,  8, sse2
WEIGHT_FUNC_MM 16,  8,  8, sse2
121 | |||

;-----------------------------------------------------------------------------
; WEIGHT_FUNC_HALF_MM width, rows, loop_rows, xmm_regs, cpu
; Emits h264_weight_<width>x<rows>_<cpu> for blocks that are half a SIMD
; register wide.  Two rows are processed per iteration: WEIGHT_OP packs the
; row at r0 into the low half of m0 and the row at r0+r1 into the high half.
;
; Registers: r0 = dst, r1 = stride, r2 = row pairs remaining, r3 = 2*stride
; (r2/r3 are free once WEIGHT_SETUP has consumed log2_denom and weight).
; The loop is emitted only in the instance whose row count equals mmsize;
; the others jump into the loop of the <width>x<loop_rows> instance.
;-----------------------------------------------------------------------------
%macro WEIGHT_FUNC_HALF_MM 5
cglobal h264_weight_%1x%2_%5, 5, 5, %4
    WEIGHT_SETUP
    mov         r2, %2/2                ; row pairs remaining
    lea         r3, [r1*2]              ; advance two rows per iteration
%if %2 == mmsize
.nextrow:                               ; colon avoids the assembler's orphan-label warning
    WEIGHT_OP    0, r1
    movh        [r0], m0                ; first row = low half
%if mmsize == 16
    movhps      [r0+r1], m0             ; second row = high half
%else
    psrlq       m0, 32                  ; bring the high half down for movh
    movh        [r0+r1], m0
%endif
    add         r0, r3
    dec         r2
    jnz .nextrow
    REP_RET
%else
    jmp mangle(ff_h264_weight_%1x%3_%5.nextrow)
%endif
%endmacro

INIT_MMX
WEIGHT_FUNC_HALF_MM 4,  8,  8, 0, mmx2
WEIGHT_FUNC_HALF_MM 4,  4,  8, 0, mmx2
WEIGHT_FUNC_HALF_MM 4,  2,  8, 0, mmx2
INIT_XMM
WEIGHT_FUNC_HALF_MM 8, 16, 16, 8, sse2
WEIGHT_FUNC_HALF_MM 8,  8, 16, 8, sse2
WEIGHT_FUNC_HALF_MM 8,  4, 16, 8, sse2
154 | |||

;-----------------------------------------------------------------------------
; BIWEIGHT_SETUP
; Loads the loop-invariant constants of the MMX2/SSE2 bi-weight functions.
;
; In  (argument registers, per the h264_biweight prototype in the file header):
;       r3 = log2_denom, r4 = weightd, r5 = weights, r6 = offset
; Out:  m3 = low word of weightd, replicated into every 16-bit lane
;       m4 = low word of weights, replicated into every 16-bit lane
;       m5 = low word of (((offset + 1) | 1) << (log2_denom + 1)) >> 1,
;            replicated into every 16-bit lane
;       m6 = log2_denom + 1 in the low dword (used as the shift count)
;       m7 = 0 (zero source for byte -> word unpacking)
; Clobbers: r3, r6, flags.
;-----------------------------------------------------------------------------
%macro BIWEIGHT_SETUP 0
    add         r6, 1
    or          r6, 1                   ; r6 = (offset + 1) | 1
    add         r3, 1                   ; r3 = log2_denom + 1
    movd        m6, r3d                 ; shift count
    movd        m5, r6d
    movd        m4, r5d
    movd        m3, r4d
    pslld       m5, m6
    psrld       m5, 1                   ; rounding/offset term
    pxor        m7, m7
%if mmsize == 16
    ; broadcast word 0 across the low quadword, then duplicate that quadword
    pshuflw     m5, m5, 0
    pshuflw     m4, m4, 0
    pshuflw     m3, m3, 0
    punpcklqdq  m5, m5
    punpcklqdq  m4, m4
    punpcklqdq  m3, m3
%else
    pshufw      m5, m5, 0
    pshufw      m4, m4, 0
    pshufw      m3, m3, 0
%endif
%endmacro
179 | |||

;-----------------------------------------------------------------------------
; BIWEIGHT_STEPA acc, tmp, off
; Computes the weighted sum of one half-register group of pixels:
;   m<acc> = [r0+off] * m3 + [r1+off] * m4      (16-bit lanes, saturating add)
; r0 addresses dst and r1 src; m3/m4 hold the weights and m7 must be zero
; (see BIWEIGHT_SETUP).  Clobbers m<tmp>.
;-----------------------------------------------------------------------------
%macro BIWEIGHT_STEPA 3
    movh        m%1, [r0+%3]
    punpcklbw   m%1, m7                 ; dst bytes -> words
    movh        m%2, [r1+%3]
    punpcklbw   m%2, m7                 ; src bytes -> words
    pmullw      m%1, m3
    pmullw      m%2, m4
    paddsw      m%1, m%2
%endmacro
189 | |||

;-----------------------------------------------------------------------------
; BIWEIGHT_STEPB
; Finishes two weighted sums held in m0 and m1: adds the rounding/offset term
; m5 (signed saturation), shifts right arithmetically by m6 and packs both
; registers to unsigned bytes in m0 (m0 lanes low, m1 lanes high).
; Clobbers m1.
;-----------------------------------------------------------------------------
%macro BIWEIGHT_STEPB 0
    paddsw      m1, m5
    paddsw      m0, m5
    psraw       m1, m6
    psraw       m0, m6
    packuswb    m0, m1
%endmacro
197 | |||

;-----------------------------------------------------------------------------
; BIWEIGHT_FUNC_DBL_MM rows
; Emits h264_biweight_16x<rows>_mmx2: each 16-pixel row is handled as two
; 8-byte MMX registers, each built from two 4-pixel BIWEIGHT_STEPA groups.
;
; Registers: r0 = dst, r1 = src, r2 = stride; r3 is reused as the row counter
; once BIWEIGHT_SETUP has copied the shift count into m6.
; Only the 16-row instance contains the loop; the other instance runs the same
; prologue and setup, loads its own row count and jumps into that loop.
; NOTE(review): stride is declared `int` in the documented prototype but is
; added as a full-width register; confirm the upper half is well defined on
; x86-64.
;-----------------------------------------------------------------------------
%macro BIWEIGHT_FUNC_DBL_MM 1
cglobal h264_biweight_16x%1_mmx2, 7, 7, 0
    BIWEIGHT_SETUP
    mov         r3, %1                  ; rows remaining
%if %1 == 16
.nextrow:                               ; colon avoids the assembler's orphan-label warning
    BIWEIGHT_STEPA 0, 1, 0              ; m0 = pixels 0..3  (m1 is scratch)
    BIWEIGHT_STEPA 1, 2, 4              ; m1 = pixels 4..7  (m2 is scratch)
    BIWEIGHT_STEPB
    mova        [r0], m0
    BIWEIGHT_STEPA 0, 1, 8              ; pixels 8..11
    BIWEIGHT_STEPA 1, 2, 12             ; pixels 12..15
    BIWEIGHT_STEPB
    mova        [r0+8], m0
    add         r0, r2
    add         r1, r2
    dec         r3
    jnz .nextrow
    REP_RET
%else
    jmp mangle(ff_h264_biweight_16x16_mmx2.nextrow)
%endif
%endmacro

INIT_MMX
BIWEIGHT_FUNC_DBL_MM 16
BIWEIGHT_FUNC_DBL_MM  8
225 | |||

;-----------------------------------------------------------------------------
; BIWEIGHT_FUNC_MM width, rows, xmm_regs, cpu
; Emits h264_biweight_<width>x<rows>_<cpu>: one full SIMD register (mmsize
; pixels) per row, built from two half-register BIWEIGHT_STEPA groups.
;
; Registers: r0 = dst, r1 = src, r2 = stride; r3 is reused as the row counter
; once BIWEIGHT_SETUP has copied the shift count into m6.
; Only the 16-row instance contains the loop; shorter instances run the same
; prologue and setup, load their own row count and jump into that loop.
;-----------------------------------------------------------------------------
%macro BIWEIGHT_FUNC_MM 4
cglobal h264_biweight_%1x%2_%4, 7, 7, %3
    BIWEIGHT_SETUP
    mov         r3, %2                  ; rows remaining
%if %2 == 16
.nextrow:                               ; colon avoids the assembler's orphan-label warning
    BIWEIGHT_STEPA 0, 1, 0              ; low half of the row  (m1 is scratch)
    BIWEIGHT_STEPA 1, 2, mmsize/2       ; high half of the row (m2 is scratch)
    BIWEIGHT_STEPB
    mova        [r0], m0
    add         r0, r2
    add         r1, r2
    dec         r3
    jnz .nextrow
    REP_RET
%else
    jmp mangle(ff_h264_biweight_%1x16_%4.nextrow)
%endif
%endmacro

INIT_MMX
BIWEIGHT_FUNC_MM  8, 16,  0, mmx2
BIWEIGHT_FUNC_MM  8,  8,  0, mmx2
BIWEIGHT_FUNC_MM  8,  4,  0, mmx2
INIT_XMM
BIWEIGHT_FUNC_MM 16, 16,  8, sse2
BIWEIGHT_FUNC_MM 16,  8,  8, sse2
253 | |||

;-----------------------------------------------------------------------------
; BIWEIGHT_FUNC_HALF_MM width, rows, loop_rows, xmm_regs, cpu
; Emits h264_biweight_<width>x<rows>_<cpu> for blocks that are half a SIMD
; register wide.  Two rows are processed per iteration: the row at offset 0
; ends up in the low half of m0 and the row at offset r2 in the high half.
;
; Registers: r0 = dst, r1 = src, r2 = stride, r3 = row pairs remaining,
; r4 = 2*stride (r3/r4 are free once BIWEIGHT_SETUP has consumed log2_denom
; and weightd).
; The loop is emitted only in the instance whose row count equals mmsize;
; the others jump into the loop of the <width>x<loop_rows> instance.
;-----------------------------------------------------------------------------
%macro BIWEIGHT_FUNC_HALF_MM 5
cglobal h264_biweight_%1x%2_%5, 7, 7, %4
    BIWEIGHT_SETUP
    mov         r3, %2/2                ; row pairs remaining
    lea         r4, [r2*2]              ; advance two rows per iteration
%if %2 == mmsize
.nextrow:                               ; colon avoids the assembler's orphan-label warning
    BIWEIGHT_STEPA 0, 1, 0              ; first row  (m1 is scratch)
    BIWEIGHT_STEPA 1, 2, r2             ; second row (m2 is scratch)
    BIWEIGHT_STEPB
    movh        [r0], m0                ; first row = low half
%if mmsize == 16
    movhps      [r0+r2], m0             ; second row = high half
%else
    psrlq       m0, 32                  ; bring the high half down for movh
    movh        [r0+r2], m0
%endif
    add         r0, r4
    add         r1, r4
    dec         r3
    jnz .nextrow
    REP_RET
%else
    jmp mangle(ff_h264_biweight_%1x%3_%5.nextrow)
%endif
%endmacro

INIT_MMX
BIWEIGHT_FUNC_HALF_MM 4,  8,  8, 0, mmx2
BIWEIGHT_FUNC_HALF_MM 4,  4,  8, 0, mmx2
BIWEIGHT_FUNC_HALF_MM 4,  2,  8, 0, mmx2
INIT_XMM
BIWEIGHT_FUNC_HALF_MM 8, 16, 16, 8, sse2
BIWEIGHT_FUNC_HALF_MM 8,  8, 16, 8, sse2
BIWEIGHT_FUNC_HALF_MM 8,  4, 16, 8, sse2
289 | |||

;-----------------------------------------------------------------------------
; BIWEIGHT_SSSE3_SETUP
; Loads the loop-invariant constants of the SSSE3 bi-weight functions.
;
; In  (argument registers, per the h264_biweight prototype in the file header):
;       r3 = log2_denom, r4 = weightd, r5 = weights, r6 = offset
; Out:  m4 = byte pair (weightd, weights) replicated into every 16-bit lane,
;            laid out as the second operand of pmaddubsw
;       m5 = low word of (((offset + 1) | 1) << (log2_denom + 1)) >> 1,
;            replicated into every 16-bit lane
;       m6 = log2_denom + 1 in the low dword (used as the shift count)
; Clobbers: r3, r6, m0, flags.
; Only the low byte of each weight is kept -- presumably callers use this
; path only when both weights fit in a signed byte; confirm against the caller.
;-----------------------------------------------------------------------------
%macro BIWEIGHT_SSSE3_SETUP 0
    add         r6, 1
    or          r6, 1                   ; r6 = (offset + 1) | 1
    add         r3, 1                   ; r3 = log2_denom + 1
    movd        m6, r3d                 ; shift count
    movd        m5, r6d
    movd        m0, r5d
    movd        m4, r4d
    punpcklbw   m4, m0                  ; word 0 = (weightd byte, weights byte)
    pslld       m5, m6
    psrld       m5, 1                   ; rounding/offset term
    ; broadcast word 0 across the low quadword, then duplicate that quadword
    pshuflw     m5, m5, 0
    pshuflw     m4, m4, 0
    punpcklqdq  m5, m5
    punpcklqdq  m4, m4
%endmacro
306 | |||

;-----------------------------------------------------------------------------
; BIWEIGHT_SSSE3_OP
; In:  m0, m2 = byte-interleaved (dst, src) pixel pairs
;      m4/m5/m6 as prepared by BIWEIGHT_SSSE3_SETUP
; Out: m0 = packed unsigned-byte results (m0 lanes low, m2 lanes high)
; pmaddubsw multiplies each unsigned pixel byte by the matching signed weight
; byte and sums the pair into one 16-bit lane, i.e. dst*weightd + src*weights.
; The rounding term is then added, the sum shifted right and packed with
; unsigned saturation.  Clobbers m2.
;-----------------------------------------------------------------------------
%macro BIWEIGHT_SSSE3_OP 0
    pmaddubsw   m2, m4
    pmaddubsw   m0, m4
    paddsw      m2, m5
    paddsw      m0, m5
    psraw       m2, m6
    psraw       m0, m6
    packuswb    m0, m2
%endmacro
316 | |||

;-----------------------------------------------------------------------------
; BIWEIGHT_SSSE3_16 rows
; Emits h264_biweight_16x<rows>_ssse3: one 16-pixel row per iteration.
;
; Registers: r0 = dst, r1 = src, r2 = stride; r3 is reused as the row counter
; once BIWEIGHT_SSSE3_SETUP has copied the shift count into m6.
; Only the 16-row instance contains the loop; the other instance runs the same
; prologue and setup, loads its own row count and jumps into that loop.
; `punpcklbw m0, [r1]` uses a 16-byte memory operand and `mova [r0], m0` is an
; aligned store, so both src and dst rows must be 16-byte aligned.
;-----------------------------------------------------------------------------
%macro BIWEIGHT_SSSE3_16 1
cglobal h264_biweight_16x%1_ssse3, 7, 7, 8
    BIWEIGHT_SSSE3_SETUP
    mov         r3, %1                  ; rows remaining

%if %1 == 16
.nextrow:                               ; colon avoids the assembler's orphan-label warning
    movh        m0, [r0]                ; dst pixels 0..7
    movh        m2, [r0+8]              ; dst pixels 8..15
    movh        m3, [r1+8]              ; src pixels 8..15
    punpcklbw   m0, [r1]                ; interleave dst/src bytes, pixels 0..7
    punpcklbw   m2, m3                  ; interleave dst/src bytes, pixels 8..15
    BIWEIGHT_SSSE3_OP
    mova        [r0], m0
    add         r0, r2
    add         r1, r2
    dec         r3
    jnz .nextrow
    REP_RET
%else
    jmp mangle(ff_h264_biweight_16x16_ssse3.nextrow)
%endif
%endmacro

INIT_XMM
BIWEIGHT_SSSE3_16 16
BIWEIGHT_SSSE3_16  8
344 | |||

;-----------------------------------------------------------------------------
; BIWEIGHT_SSSE3_8 rows
; Emits h264_biweight_8x<rows>_ssse3: two 8-pixel rows per iteration, the first
; row ending up in the low half of m0 and the second in the high half.
;
; Registers: r0 = dst, r1 = src, r2 = stride, r3 = row pairs remaining,
; r4 = 2*stride (r3/r4 are free once BIWEIGHT_SSSE3_SETUP has consumed
; log2_denom and weightd).
; Only the 16-row instance contains the loop; shorter instances run the same
; prologue and setup, load their own pair count and jump into that loop.
;-----------------------------------------------------------------------------
%macro BIWEIGHT_SSSE3_8 1
cglobal h264_biweight_8x%1_ssse3, 7, 7, 8
    BIWEIGHT_SSSE3_SETUP
    mov         r3, %1/2                ; row pairs remaining
    lea         r4, [r2*2]              ; advance two rows per iteration

%if %1 == 16
.nextrow:                               ; colon avoids the assembler's orphan-label warning
    movh        m0, [r0]                ; dst, first row
    movh        m1, [r1]                ; src, first row
    movh        m2, [r0+r2]             ; dst, second row
    movh        m3, [r1+r2]             ; src, second row
    punpcklbw   m0, m1                  ; interleave dst/src bytes
    punpcklbw   m2, m3
    BIWEIGHT_SSSE3_OP
    movh        [r0], m0                ; first row = low half
    movhps      [r0+r2], m0             ; second row = high half
    add         r0, r4
    add         r1, r4
    dec         r3
    jnz .nextrow
    REP_RET
%else
    jmp mangle(ff_h264_biweight_8x16_ssse3.nextrow)
%endif
%endmacro

INIT_XMM
BIWEIGHT_SSSE3_8 16
BIWEIGHT_SSSE3_8  8
BIWEIGHT_SSSE3_8  4