;******************************************************************************
;* libavcodec/x86/dsputil_yasm.asm -- revision e6e98234
;******************************************************************************
27 | 27 |
pb_7: times 8 db 7 |
28 | 28 |
pb_zzzz3333zzzzbbbb: db -1,-1,-1,-1,3,3,3,3,-1,-1,-1,-1,11,11,11,11 |
29 | 29 |
pb_zz11zz55zz99zzdd: db -1,-1,1,1,-1,-1,5,5,-1,-1,9,9,-1,-1,13,13 |
30 |
pb_revwords: db 14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1 |
|
31 |
pd_16384: times 4 dd 16384 |
|
30 | 32 |
|
31 | 33 |
section .text align=16 |
32 | 34 |
|
... | ... | |
202 | 204 |
RET |
203 | 205 |
|
204 | 206 |
|
;-----------------------------------------------------------------------------
; void ff_apply_window_int16(int16_t *output, const int16_t *input,
;                            const int16_t *window, unsigned int len)
;-----------------------------------------------------------------------------
212 |
%macro REVERSE_WORDS_MMXEXT 1-2 |
|
213 |
pshufw %1, %1, 0x1B |
|
214 |
%endmacro |
|
215 |
|
|
216 |
%macro REVERSE_WORDS_SSE2 1-2 |
|
217 |
pshuflw %1, %1, 0x1B |
|
218 |
pshufhw %1, %1, 0x1B |
|
219 |
pshufd %1, %1, 0x4E |
|
220 |
%endmacro |
|
221 |
|
|
222 |
%macro REVERSE_WORDS_SSSE3 2 |
|
223 |
pshufb %1, %2 |
|
224 |
%endmacro |
|
225 |
|
|
226 |
; dst = (dst * src) >> 15 |
|
227 |
; pmulhw cuts off the bottom bit, so we have to lshift by 1 and add it back |
|
228 |
; in from the pmullw result. |
|
229 |
%macro MUL16FIXED_MMXEXT 3 ; dst, src, temp |
|
230 |
mova %3, %1 |
|
231 |
pmulhw %1, %2 |
|
232 |
pmullw %3, %2 |
|
233 |
psrlw %3, 15 |
|
234 |
psllw %1, 1 |
|
235 |
por %1, %3 |
|
236 |
%endmacro |
|
237 |
|
|
238 |
; dst = ((dst * src) + (1<<14)) >> 15 |
|
239 |
%macro MUL16FIXED_SSSE3 3 ; dst, src, unused |
|
240 |
pmulhrsw %1, %2 |
|
241 |
%endmacro |
|
242 |
|
|
243 |
%macro APPLY_WINDOW_INT16 3 ; %1=instruction set, %2=mmxext/sse2 bit exact version, %3=has_ssse3 |
|
244 |
cglobal apply_window_int16_%1, 4,5,6, output, input, window, offset, offset2 |
|
245 |
lea offset2q, [offsetq-mmsize] |
|
246 |
%if %2 |
|
247 |
mova m5, [pd_16384] |
|
248 |
%elifidn %1, ssse3 |
|
249 |
mova m5, [pb_revwords] |
|
250 |
ALIGN 16 |
|
251 |
%endif |
|
252 |
.loop: |
|
253 |
%if %2 |
|
254 |
; This version expands 16-bit to 32-bit, multiplies by the window, |
|
255 |
; adds 16384 for rounding, right shifts 15, then repacks back to words to |
|
256 |
; save to the output. The window is reversed for the second half. |
|
257 |
mova m3, [windowq+offset2q] |
|
258 |
mova m4, [ inputq+offset2q] |
|
259 |
pxor m0, m0 |
|
260 |
punpcklwd m0, m3 |
|
261 |
punpcklwd m1, m4 |
|
262 |
pmaddwd m0, m1 |
|
263 |
paddd m0, m5 |
|
264 |
psrad m0, 15 |
|
265 |
pxor m2, m2 |
|
266 |
punpckhwd m2, m3 |
|
267 |
punpckhwd m1, m4 |
|
268 |
pmaddwd m2, m1 |
|
269 |
paddd m2, m5 |
|
270 |
psrad m2, 15 |
|
271 |
packssdw m0, m2 |
|
272 |
mova [outputq+offset2q], m0 |
|
273 |
REVERSE_WORDS m3 |
|
274 |
mova m4, [ inputq+offsetq] |
|
275 |
pxor m0, m0 |
|
276 |
punpcklwd m0, m3 |
|
277 |
punpcklwd m1, m4 |
|
278 |
pmaddwd m0, m1 |
|
279 |
paddd m0, m5 |
|
280 |
psrad m0, 15 |
|
281 |
pxor m2, m2 |
|
282 |
punpckhwd m2, m3 |
|
283 |
punpckhwd m1, m4 |
|
284 |
pmaddwd m2, m1 |
|
285 |
paddd m2, m5 |
|
286 |
psrad m2, 15 |
|
287 |
packssdw m0, m2 |
|
288 |
mova [outputq+offsetq], m0 |
|
289 |
%elif %3 |
|
290 |
; This version does the 16x16->16 multiplication in-place without expanding |
|
291 |
; to 32-bit. The ssse3 version is bit-identical. |
|
292 |
mova m0, [windowq+offset2q] |
|
293 |
mova m1, [ inputq+offset2q] |
|
294 |
pmulhrsw m1, m0 |
|
295 |
REVERSE_WORDS m0, m5 |
|
296 |
pmulhrsw m0, [ inputq+offsetq ] |
|
297 |
mova [outputq+offset2q], m1 |
|
298 |
mova [outputq+offsetq ], m0 |
|
299 |
%else |
|
300 |
; This version does the 16x16->16 multiplication in-place without expanding |
|
301 |
; to 32-bit. The mmxext and sse2 versions do not use rounding, and |
|
302 |
; therefore are not bit-identical to the C version. |
|
303 |
mova m0, [windowq+offset2q] |
|
304 |
mova m1, [ inputq+offset2q] |
|
305 |
mova m2, [ inputq+offsetq ] |
|
306 |
MUL16FIXED m1, m0, m3 |
|
307 |
REVERSE_WORDS m0 |
|
308 |
MUL16FIXED m2, m0, m3 |
|
309 |
mova [outputq+offset2q], m1 |
|
310 |
mova [outputq+offsetq ], m2 |
|
311 |
%endif |
|
312 |
add offsetd, mmsize |
|
313 |
sub offset2d, mmsize |
|
314 |
jae .loop |
|
315 |
REP_RET |
|
316 |
%endmacro |
|
317 |
|
|
318 |
INIT_MMX |
|
319 |
%define REVERSE_WORDS REVERSE_WORDS_MMXEXT |
|
320 |
%define MUL16FIXED MUL16FIXED_MMXEXT |
|
321 |
APPLY_WINDOW_INT16 mmxext, 0, 0 |
|
322 |
APPLY_WINDOW_INT16 mmxext_ba, 1, 0 |
|
323 |
INIT_XMM |
|
324 |
%define REVERSE_WORDS REVERSE_WORDS_SSE2 |
|
325 |
APPLY_WINDOW_INT16 sse2, 0, 0 |
|
326 |
APPLY_WINDOW_INT16 sse2_ba, 1, 0 |
|
327 |
APPLY_WINDOW_INT16 ssse3_atom, 0, 1 |
|
328 |
%define REVERSE_WORDS REVERSE_WORDS_SSSE3 |
|
329 |
APPLY_WINDOW_INT16 ssse3, 0, 1 |
|
330 |
|
|
205 | 331 |
|
206 | 332 |
; void add_hfyu_median_prediction_mmx2(uint8_t *dst, const uint8_t *top, const uint8_t *diff, int w, int *left, int *left_top) |
207 | 333 |
cglobal add_hfyu_median_prediction_mmx2, 6,6,0, dst, top, diff, w, left, left_top |
Also available in: Unified diff