Revision e6e98234 libavcodec/x86/dsputil_yasm.asm

View differences:

libavcodec/x86/dsputil_yasm.asm
27 27
; Eight bytes, each holding the value 7 (its user is outside this excerpt).
pb_7: times 8 db 7
28 28
; 16-byte index table; the -1 entries have the top bit set.
; NOTE(review): presumably a pshufb mask where "z" marks a zeroed byte --
; the code using it is not visible here, confirm against its user.
pb_zzzz3333zzzzbbbb: db -1,-1,-1,-1,3,3,3,3,-1,-1,-1,-1,11,11,11,11
29 29
; Same layout idea as the table above (user likewise not visible here).
pb_zz11zz55zz99zzdd: db -1,-1,1,1,-1,-1,5,5,-1,-1,9,9,-1,-1,13,13
30
; Byte-index mask that reverses the order of eight 16-bit words while keeping
; the two bytes of each word in order: bytes (14,15) go to the bottom,
; bytes (0,1) to the top. Loaded into m5 by the ssse3 build of
; APPLY_WINDOW_INT16 and consumed by REVERSE_WORDS_SSSE3 (pshufb).
pb_revwords: db 14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1
31
; 16384 (= 1<<14) in each of four dwords: the rounding bias that the
; bit-exact paths of APPLY_WINDOW_INT16 add before shifting right by 15.
pd_16384: times 4 dd 16384
30 32

  
31 33
section .text align=16
32 34

  
......
202 204
    RET
203 205

  
204 206

  
207
;-----------------------------------------------------------------------------
208
; void ff_apply_window_int16(int16_t *output, const int16_t *input,
209
;                            const int16_t *window, unsigned int len)
210
;-----------------------------------------------------------------------------
211

  
212
; REVERSE_WORDS_MMXEXT reg [, ignored]
; Reverses the order of the four 16-bit words of %1 in place.
; The optional second parameter is accepted and ignored so that call sites can
; use the same two-operand form as REVERSE_WORDS_SSSE3.
%macro REVERSE_WORDS_MMXEXT 1-2
213
    ; 0x1B = 00 01 10 11b: destination words 0,1,2,3 take source words 3,2,1,0
    pshufw   %1, %1, 0x1B
214
%endmacro
215

  
216
; REVERSE_WORDS_SSE2 reg [, ignored]
; Reverses the order of the eight 16-bit words of %1 in place using only
; immediate-controlled shuffles. The optional second parameter is ignored.
%macro REVERSE_WORDS_SSE2 1-2
217
    ; reverse the four words of the low 64 bits
    pshuflw  %1, %1, 0x1B
218
    ; reverse the four words of the high 64 bits
    pshufhw  %1, %1, 0x1B
219
    ; 0x4E = 01 00 11 10b: swap the low and high 64-bit halves
    pshufd   %1, %1, 0x4E
220
%endmacro
221

  
222
; REVERSE_WORDS_SSSE3 reg, mask
; Permutes the bytes of %1 in place according to the byte-index mask in %2.
; It reverses words only when %2 holds a word-reversal mask; the one caller
; visible here passes m5, which the ssse3 build loads from pb_revwords.
; Unlike the MMXEXT/SSE2 variants, both parameters are mandatory.
%macro REVERSE_WORDS_SSSE3 2
223
    pshufb  %1, %2
224
%endmacro
225

  
226
; dst = (dst * src) >> 15   (per 16-bit lane; truncating, no rounding)
227
; pmulhw keeps bits 31..16 of each 32-bit product, but the result needs bits
; 30..15: pmulhw cuts off the bottom bit, so we have to lshift by 1 and add it back
228
; in from the pmullw result.
; %3 is overwritten as scratch; %2 is left unmodified.
229
%macro MUL16FIXED_MMXEXT 3 ; dst, src, temp
230
    ; keep a copy of dst for the low-half multiply
    mova    %3, %1
231
    ; %1 = bits 31..16 of dst*src
    pmulhw  %1, %2
232
    ; %3 = bits 15..0 of dst*src
    pmullw  %3, %2
233
    ; isolate product bit 15 as bit 0 of %3
    psrlw   %3, 15
234
    ; move product bits 30..16 up to bits 15..1 (bit 0 becomes 0)
    psllw   %1, 1
235
    ; merge: %1 = bits 30..15 of dst*src
    por     %1, %3
236
%endmacro
237

  
238
; dst = ((dst * src) + (1<<14)) >> 15
; Rounding counterpart of MUL16FIXED_MMXEXT, done by a single pmulhrsw.
; The third parameter exists only so the signature matches MUL16FIXED_MMXEXT.
; NOTE(review): nothing in the visible part of the file references this macro;
; the pmulhrsw path of APPLY_WINDOW_INT16 issues pmulhrsw directly.
239
%macro MUL16FIXED_SSSE3 3 ; dst, src, unused
240
    pmulhrsw   %1, %2
241
%endmacro
242

  
243
; APPLY_WINDOW_INT16 suffix, bit_exact, use_pmulhrsw
; Emits one function apply_window_int16_<suffix> that multiplies a block of
; int16 samples by a window, as 16x16 fixed-point products shifted right by 15.
;   %1  name suffix of the generated function
;   %2  non-zero: widen to 32 bits, add 16384 and shift (rounding path)
;   %3  non-zero (and %2 zero): multiply in place with pmulhrsw (rounding)
;   %2 and %3 both zero: use the MUL16FIXED macro (truncating, no rounding)
; REVERSE_WORDS (and MUL16FIXED for the last path) must be %defined before the
; macro is invoked; mmsize selects the vector width.
;
; Register roles inside the generated function:
;   output/input/window  the three pointer arguments
;   offset   4th argument (called len in the C prototype); used as a byte
;            offset that starts at len and moves up by mmsize per iteration
;   offset2  starts at len-mmsize and moves down by mmsize per iteration
; Each iteration handles one vector below the midpoint and one above it, so
; the loop covers byte offsets 0 .. 2*len-1 of input and output. Only window
; bytes 0 .. len-1 are read; the upper half reuses the same window vector with
; its words reversed.
; NOTE(review): the loop only terminates cleanly if len is a multiple of
; mmsize, and mova presumably requires mmsize-aligned buffers -- confirm both
; against the callers and the x86inc definition of mova.
; NOTE(review): offsetq is used as a 64-bit register before any 32-bit write
; to it, while the C argument is a 32-bit unsigned int. Confirm that cglobal or
; the callers guarantee the upper 32 bits are zero on x86-64.
%macro APPLY_WINDOW_INT16 3 ; %1=instruction set, %2=mmxext/sse2 bit exact version, %3=has_ssse3
243
; NOTE(review): the numbers 4,5,6 presumably follow the x86inc cglobal order
; (arguments, general-purpose registers, vector registers) -- confirm.
cglobal apply_window_int16_%1, 4,5,6, output, input, window, offset, offset2
244
    ; offset2 = byte offset of the last vector below the midpoint
    lea     offset2q, [offsetq-mmsize]
245
%if %2
246
    ; m5 = rounding bias (16384 per dword) for the 32-bit path
    mova          m5, [pd_16384]
247
%elifidn %1, ssse3
248
    ; m5 = pshufb mask for REVERSE_WORDS_SSSE3. Only the build whose suffix is
    ; exactly "ssse3" gets here; ssse3_atom leaves m5 unloaded, which is fine
    ; because REVERSE_WORDS_SSE2 ignores its second operand.
    mova          m5, [pb_revwords]
249
    ALIGN 16
250
%endif
251
.loop:
252
%if %2
253
    ; This version expands 16-bit to 32-bit, multiplies by the window,
254
    ; adds 16384 for rounding, right shifts 15, then repacks back to words to
255
    ; save to the output. The window is reversed for the second half.
    ;
    ; Widening trick: interleaving a zeroed register with the window gives
    ; word pairs (0, w); interleaving m1 with the input gives (stale, x).
    ; pmaddwd then yields 0*stale + w*x per dword, so m1 never needs clearing.
256
    mova          m3, [windowq+offset2q]
257
    mova          m4, [ inputq+offset2q]
258
    pxor          m0, m0
259
    punpcklwd     m0, m3
260
    punpcklwd     m1, m4
261
    pmaddwd       m0, m1
262
    paddd         m0, m5
263
    psrad         m0, 15
264
    pxor          m2, m2
265
    punpckhwd     m2, m3
266
    punpckhwd     m1, m4
267
    pmaddwd       m2, m1
268
    paddd         m2, m5
269
    psrad         m2, 15
270
    ; narrow the low and high dword results back to words (signed saturation)
    packssdw      m0, m2
271
    mova  [outputq+offset2q], m0
272
    ; same window vector, word order reversed, applied to the mirrored input
    REVERSE_WORDS m3
273
    mova          m4, [ inputq+offsetq]
274
    pxor          m0, m0
275
    punpcklwd     m0, m3
276
    punpcklwd     m1, m4
277
    pmaddwd       m0, m1
278
    paddd         m0, m5
279
    psrad         m0, 15
280
    pxor          m2, m2
281
    punpckhwd     m2, m3
282
    punpckhwd     m1, m4
283
    pmaddwd       m2, m1
284
    paddd         m2, m5
285
    psrad         m2, 15
286
    packssdw      m0, m2
287
    mova  [outputq+offsetq], m0
288
%elif %3
289
    ; This version does the 16x16->16 multiplication in-place without expanding
290
    ; to 32-bit. The ssse3 version is bit-identical.
    ; (pmulhrsw rounds the product rather than truncating it.)
291
    mova          m0, [windowq+offset2q]
292
    mova          m1, [ inputq+offset2q]
293
    pmulhrsw      m1, m0
294
    ; m5 is only meaningful to REVERSE_WORDS_SSSE3; other variants ignore it
    REVERSE_WORDS m0, m5
295
    pmulhrsw      m0, [ inputq+offsetq ]
296
    mova  [outputq+offset2q], m1
297
    mova  [outputq+offsetq ], m0
298
%else
299
    ; This version does the 16x16->16 multiplication in-place without expanding
300
    ; to 32-bit. The mmxext and sse2 versions do not use rounding, and
301
    ; therefore are not bit-identical to the C version.
302
    mova          m0, [windowq+offset2q]
303
    mova          m1, [ inputq+offset2q]
304
    mova          m2, [ inputq+offsetq ]
305
    ; m3 is scratch for MUL16FIXED
    MUL16FIXED    m1, m0, m3
306
    REVERSE_WORDS m0
307
    MUL16FIXED    m2, m0, m3
308
    mova  [outputq+offset2q], m1
309
    mova  [outputq+offsetq ], m2
310
%endif
311
    add      offsetd, mmsize
312
    ; the sub borrows (sets CF) once offset2 drops below zero; jae repeats the
    ; loop only while no borrow occurred
    sub     offset2d, mmsize
313
    jae .loop
314
    REP_RET
315
%endmacro
317

  
318
; Instantiate apply_window_int16_* once per instruction-set variant.
; REVERSE_WORDS and MUL16FIXED are plain %defines, so each invocation picks up
; whichever definition is current at that point in the file.
;
; MMX-register builds (NOTE(review): INIT_MMX presumably selects 8-byte
; vectors via x86inc -- confirm).
INIT_MMX
319
%define REVERSE_WORDS REVERSE_WORDS_MMXEXT
320
%define MUL16FIXED MUL16FIXED_MMXEXT
321
; truncating in-place multiply
APPLY_WINDOW_INT16 mmxext,     0, 0
322
; 32-bit rounding path
APPLY_WINDOW_INT16 mmxext_ba,  1, 0
323
; XMM-register builds. MUL16FIXED is not redefined, so the sse2 build keeps
; using MUL16FIXED_MMXEXT, now on xmm registers.
INIT_XMM
324
%define REVERSE_WORDS REVERSE_WORDS_SSE2
325
APPLY_WINDOW_INT16 sse2,       0, 0
326
APPLY_WINDOW_INT16 sse2_ba,    1, 0
327
; pmulhrsw multiply, but still the SSE2 shuffle sequence for the reversal
; because REVERSE_WORDS has not been switched yet.
; NOTE(review): presumably deliberate, to avoid pshufb on Atom -- confirm.
APPLY_WINDOW_INT16 ssse3_atom, 0, 1
328
%define REVERSE_WORDS REVERSE_WORDS_SSSE3
329
; pmulhrsw multiply with pshufb-based reversal (mask preloaded in m5)
APPLY_WINDOW_INT16 ssse3,      0, 1
330

  
205 331

  
206 332
; void add_hfyu_median_prediction_mmx2(uint8_t *dst, const uint8_t *top, const uint8_t *diff, int w, int *left, int *left_top)
207 333
cglobal add_hfyu_median_prediction_mmx2, 6,6,0, dst, top, diff, w, left, left_top

Also available in: Unified diff