ffmpeg / libavcodec / x86 / dsputil_yasm.asm @ b1159ad9

;******************************************************************************
;* MMX optimized DSP utils
;* Copyright (c) 2008 Loren Merritt
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "x86inc.asm"

SECTION_RODATA
pb_f: times 16 db 15
pb_zzzzzzzz77777777: times 8 db -1
pb_7: times 8 db 7
pb_zzzz3333zzzzbbbb: db -1,-1,-1,-1,3,3,3,3,-1,-1,-1,-1,11,11,11,11
pb_zz11zz55zz99zzdd: db -1,-1,1,1,-1,-1,5,5,-1,-1,9,9,-1,-1,13,13

section .text align=16

%macro PSWAPD_SSE 2
    pshufw %1, %2, 0x4e
%endmacro
%macro PSWAPD_3DN1 2
    movq  %1, %2
    psrlq %1, 32
    punpckldq %1, %2
%endmacro

%macro FLOAT_TO_INT16_INTERLEAVE6 1
; void ff_float_to_int16_interleave6_sse(int16_t *dst, const float **src, int len)
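; What this computes, roughly (a scalar sketch, not the exact C reference;
; "clip_int16" below just means clamping to [-32768,32767]):
;     for (i = 0; i < len; i++)
;         for (c = 0; c < 6; c++)
;             dst[6*i + c] = clip_int16(lrintf(src[c][i]));   ; 3DNow! path truncates
; Two samples per channel are handled per iteration: cvtps2pi (or pf2id in the
; 3DNow! builds) does the float->int conversion, packssdw the saturation, and
; src1q..src5q hold the other five channel pointers as offsets from src[0].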
cglobal float_to_int16_interleave6_%1, 2,7,0, dst, src, src1, src2, src3, src4, src5
%ifdef ARCH_X86_64
    %define lend r10d
    mov     lend, r2d
%else
    %define lend dword r2m
%endif
    mov src1q, [srcq+1*gprsize]
    mov src2q, [srcq+2*gprsize]
    mov src3q, [srcq+3*gprsize]
    mov src4q, [srcq+4*gprsize]
    mov src5q, [srcq+5*gprsize]
    mov srcq,  [srcq]
    sub src1q, srcq
    sub src2q, srcq
    sub src3q, srcq
    sub src4q, srcq
    sub src5q, srcq
.loop:
    cvtps2pi   mm0, [srcq]
    cvtps2pi   mm1, [srcq+src1q]
    cvtps2pi   mm2, [srcq+src2q]
    cvtps2pi   mm3, [srcq+src3q]
    cvtps2pi   mm4, [srcq+src4q]
    cvtps2pi   mm5, [srcq+src5q]
    packssdw   mm0, mm3
    packssdw   mm1, mm4
    packssdw   mm2, mm5
    pswapd     mm3, mm0
    punpcklwd  mm0, mm1
    punpckhwd  mm1, mm2
    punpcklwd  mm2, mm3
    pswapd     mm3, mm0
    punpckldq  mm0, mm2
    punpckhdq  mm2, mm1
    punpckldq  mm1, mm3
    movq [dstq   ], mm0
    movq [dstq+16], mm2
    movq [dstq+ 8], mm1
    add srcq, 8
    add dstq, 24
    sub lend, 2
    jg .loop
    emms
    RET
%endmacro ; FLOAT_TO_INT16_INTERLEAVE6

%define pswapd PSWAPD_SSE
FLOAT_TO_INT16_INTERLEAVE6 sse
%define cvtps2pi pf2id
%define pswapd PSWAPD_3DN1
FLOAT_TO_INT16_INTERLEAVE6 3dnow
%undef pswapd
FLOAT_TO_INT16_INTERLEAVE6 3dn2
%undef cvtps2pi



%macro SCALARPRODUCT 1
; int scalarproduct_int16(int16_t *v1, int16_t *v2, int order, int shift)
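; Roughly equivalent scalar code (a sketch; note that the SIMD version applies
; the shift to the packed partial sums just before the final horizontal add):
;     int sum = 0;
;     for (i = 0; i < order; i++)
;         sum += v1[i] * v2[i];
;     return sum >> shift;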
cglobal scalarproduct_int16_%1, 3,3,4, v1, v2, order, shift
    shl orderq, 1
    add v1q, orderq
    add v2q, orderq
    neg orderq
    movd    m3, shiftm
    pxor    m2, m2
.loop:
    movu    m0, [v1q + orderq]
    movu    m1, [v1q + orderq + mmsize]
    pmaddwd m0, [v2q + orderq]
    pmaddwd m1, [v2q + orderq + mmsize]
    paddd   m2, m0
    paddd   m2, m1
    add     orderq, mmsize*2
    jl .loop
%if mmsize == 16
    movhlps m0, m2
    paddd   m2, m0
    psrad   m2, m3
    pshuflw m0, m2, 0x4e
%else
    psrad   m2, m3
    pshufw  m0, m2, 0x4e
%endif
    paddd   m2, m0
    movd   eax, m2
    RET

; int scalarproduct_and_madd_int16(int16_t *v1, int16_t *v2, int16_t *v3, int order, int mul)
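; Roughly equivalent scalar code (a sketch, assuming wrapping 16-bit arithmetic
; for the in-place update, which is what pmullw/paddw give):
;     int sum = 0;
;     for (i = 0; i < order; i++) {
;         sum   += v1[i] * v2[i];      ; uses the old v1[i]
;         v1[i] += mul * v3[i];        ; keeps the low 16 bits
;     }
;     return sum;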
cglobal scalarproduct_and_madd_int16_%1, 4,4,8, v1, v2, v3, order, mul
    shl orderq, 1
    movd    m7, mulm
%if mmsize == 16
    pshuflw m7, m7, 0
    punpcklqdq m7, m7
%else
    pshufw  m7, m7, 0
%endif
    pxor    m6, m6
    add v1q, orderq
    add v2q, orderq
    add v3q, orderq
    neg orderq
.loop:
    movu    m0, [v2q + orderq]
    movu    m1, [v2q + orderq + mmsize]
    mova    m4, [v1q + orderq]
    mova    m5, [v1q + orderq + mmsize]
    movu    m2, [v3q + orderq]
    movu    m3, [v3q + orderq + mmsize]
    pmaddwd m0, m4
    pmaddwd m1, m5
    pmullw  m2, m7
    pmullw  m3, m7
    paddd   m6, m0
    paddd   m6, m1
    paddw   m2, m4
    paddw   m3, m5
    mova    [v1q + orderq], m2
    mova    [v1q + orderq + mmsize], m3
    add     orderq, mmsize*2
    jl .loop
%if mmsize == 16
    movhlps m0, m6
    paddd   m6, m0
    pshuflw m0, m6, 0x4e
%else
    pshufw  m0, m6, 0x4e
%endif
    paddd   m6, m0
    movd   eax, m6
    RET
%endmacro

INIT_MMX
SCALARPRODUCT mmx2
INIT_XMM
SCALARPRODUCT sse2

%macro SCALARPRODUCT_LOOP 1
align 16
.loop%1:
    sub     orderq, mmsize*2
%if %1
    mova    m1, m4
    mova    m4, [v2q + orderq]
    mova    m0, [v2q + orderq + mmsize]
    palignr m1, m0, %1
    palignr m0, m4, %1
    mova    m3, m5
    mova    m5, [v3q + orderq]
    mova    m2, [v3q + orderq + mmsize]
    palignr m3, m2, %1
    palignr m2, m5, %1
%else
    mova    m0, [v2q + orderq]
    mova    m1, [v2q + orderq + mmsize]
    mova    m2, [v3q + orderq]
    mova    m3, [v3q + orderq + mmsize]
%endif
    pmaddwd m0, [v1q + orderq]
    pmaddwd m1, [v1q + orderq + mmsize]
    pmullw  m2, m7
    pmullw  m3, m7
    paddw   m2, [v1q + orderq]
    paddw   m3, [v1q + orderq + mmsize]
    paddd   m6, m0
    paddd   m6, m1
    mova    [v1q + orderq], m2
    mova    [v1q + orderq + mmsize], m3
    jg .loop%1
%if %1
    jmp .end
%endif
%endmacro

; int scalarproduct_and_madd_int16(int16_t *v1, int16_t *v2, int16_t *v3, int order, int mul)
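; Same operation as the macro version above, but v2/v3 may be unaligned: both
; pointers are rounded down to a 16-byte boundary and the misaligned data is
; reassembled from pairs of aligned loads with palignr, shifted by (v2 & 15).
; Only v2's misalignment is measured, so v3 is evidently expected to share it.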
cglobal scalarproduct_and_madd_int16_ssse3, 4,5,8, v1, v2, v3, order, mul
    shl orderq, 1
    movd    m7, mulm
    pshuflw m7, m7, 0
    punpcklqdq m7, m7
    pxor    m6, m6
    mov    r4d, v2d
    and    r4d, 15
    and    v2q, ~15
    and    v3q, ~15
    mova    m4, [v2q + orderq]
    mova    m5, [v3q + orderq]
    ; linear is faster than branch tree or jump table, because the branches taken are cyclic (i.e. predictable)
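    ; The cmp/je chain below dispatches on (v2 & 15) to the matching
    ; SCALARPRODUCT_LOOP specialization; only even offsets are checked, on the
    ; assumption that int16_t pointers are at least 2-byte aligned.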
    cmp    r4d, 0
    je .loop0
    cmp    r4d, 2
    je .loop2
    cmp    r4d, 4
    je .loop4
    cmp    r4d, 6
    je .loop6
    cmp    r4d, 8
    je .loop8
    cmp    r4d, 10
    je .loop10
    cmp    r4d, 12
    je .loop12
SCALARPRODUCT_LOOP 14
SCALARPRODUCT_LOOP 12
SCALARPRODUCT_LOOP 10
SCALARPRODUCT_LOOP 8
SCALARPRODUCT_LOOP 6
SCALARPRODUCT_LOOP 4
SCALARPRODUCT_LOOP 2
SCALARPRODUCT_LOOP 0
.end:
    movhlps m0, m6
    paddd   m6, m0
    pshuflw m0, m6, 0x4e
    paddd   m6, m0
    movd   eax, m6
    RET



; void ff_add_hfyu_median_prediction_mmx2(uint8_t *dst, const uint8_t *top, const uint8_t *diff, int w, int *left, int *left_top)
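; HuffYUV median prediction plus residual, 8 pixels per iteration.  A scalar
; sketch of what is computed (mid_pred() meaning the median of its three
; arguments; all pixel math is modulo 256):
;     l = *left; lt = *left_top;
;     for (i = 0; i < w; i++) {
;         l      = mid_pred(l, top[i], l + top[i] - lt) + diff[i];
;         lt     = top[i];
;         dst[i] = l;
;     }
;     *left = l; *left_top = lt;
; The pmaxub/pminub sequence below selects that median without branches.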
cglobal add_hfyu_median_prediction_mmx2, 6,6,0, dst, top, diff, w, left, left_top
    movq    mm0, [topq]
    movq    mm2, mm0
    movd    mm4, [left_topq]
    psllq   mm2, 8
    movq    mm1, mm0
    por     mm4, mm2
    movd    mm3, [leftq]
    psubb   mm0, mm4 ; t-tl
    add    dstq, wq
    add    topq, wq
    add   diffq, wq
    neg      wq
    jmp .skip
.loop:
    movq    mm4, [topq+wq]
    movq    mm0, mm4
    psllq   mm4, 8
    por     mm4, mm1
    movq    mm1, mm0 ; t
    psubb   mm0, mm4 ; t-tl
.skip:
    movq    mm2, [diffq+wq]
%assign i 0
%rep 8
    movq    mm4, mm0
    paddb   mm4, mm3 ; t-tl+l
    movq    mm5, mm3
    pmaxub  mm3, mm1
    pminub  mm5, mm1
    pminub  mm3, mm4
    pmaxub  mm3, mm5 ; median
    paddb   mm3, mm2 ; +residual
%if i==0
    movq    mm7, mm3
    psllq   mm7, 56
%else
    movq    mm6, mm3
    psrlq   mm7, 8
    psllq   mm6, 56
    por     mm7, mm6
%endif
%if i<7
    psrlq   mm0, 8
    psrlq   mm1, 8
    psrlq   mm2, 8
%endif
%assign i i+1
%endrep
    movq [dstq+wq], mm7
    add      wq, 8
    jl .loop
    movzx   r2d, byte [dstq-1]
    mov [leftq], r2d
    movzx   r2d, byte [topq-1]
    mov [left_topq], r2d
    RET


%macro ADD_HFYU_LEFT_LOOP 1 ; %1 = is_aligned
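; Running left prediction (a byte-wise prefix sum) in log2(mmsize) steps:
; psllw+paddb first sums adjacent byte pairs, then each pshufb/paddb pass
; propagates the partial sums across 4, 8 and (for xmm) 16 bytes using the
; masks loaded into m3/m4/m6.  m0 carries the running total in from the
; previous block, re-broadcast from its last byte through m5 (pb_7 or pb_f).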
    add     srcq, wq
    add     dstq, wq
    neg     wq
%%.loop:
    mova    m1, [srcq+wq]
    mova    m2, m1
    psllw   m1, 8
    paddb   m1, m2
    mova    m2, m1
    pshufb  m1, m3
    paddb   m1, m2
    pshufb  m0, m5
    mova    m2, m1
    pshufb  m1, m4
    paddb   m1, m2
%if mmsize == 16
    mova    m2, m1
    pshufb  m1, m6
    paddb   m1, m2
%endif
    paddb   m0, m1
%if %1
    mova    [dstq+wq], m0
%else
    movq    [dstq+wq], m0
    movhps  [dstq+wq+8], m0
%endif
    add     wq, mmsize
    jl %%.loop
    mov     eax, mmsize-1
    sub     eax, wd
    movd    m1, eax
    pshufb  m0, m1
    movd    eax, m0
    RET
%endmacro

; int ff_add_hfyu_left_prediction(uint8_t *dst, const uint8_t *src, int w, int left)
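; A rough scalar description of what both entry points below compute (a sketch,
; not the C reference; the accumulator wraps, matching the paddb-based sums):
;     acc = left;
;     for (i = 0; i < w; i++) {
;         acc    = (acc + src[i]) & 0xFF;
;         dst[i] = acc;
;     }
;     return acc;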
INIT_MMX
cglobal add_hfyu_left_prediction_ssse3, 3,3,7, dst, src, w, left
.skip_prologue:
    mova    m5, [pb_7 GLOBAL]
    mova    m4, [pb_zzzz3333zzzzbbbb GLOBAL]
    mova    m3, [pb_zz11zz55zz99zzdd GLOBAL]
    movd    m0, leftm
    psllq   m0, 56
    ADD_HFYU_LEFT_LOOP 1

INIT_XMM
cglobal add_hfyu_left_prediction_sse4, 3,3,7, dst, src, w, left
    mova    m5, [pb_f GLOBAL]
    mova    m6, [pb_zzzzzzzz77777777 GLOBAL]
    mova    m4, [pb_zzzz3333zzzzbbbb GLOBAL]
    mova    m3, [pb_zz11zz55zz99zzdd GLOBAL]
    movd    m0, leftm
    pslldq  m0, 15
    test    srcq, 15
    jnz add_hfyu_left_prediction_ssse3.skip_prologue
    test    dstq, 15
    jnz .unaligned
    ADD_HFYU_LEFT_LOOP 1
.unaligned:
    ADD_HFYU_LEFT_LOOP 0
391