ffmpeg / libavcodec / x86 / dsputil_yasm.asm @ a4605efd


;******************************************************************************
;* MMX optimized DSP utils
;* Copyright (c) 2008 Loren Merritt
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "x86inc.asm"

SECTION_RODATA
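; pshufb masks and byte constants for the add_hfyu_left_prediction routines
; below; note that pb_7 sits immediately after pb_zzzzzzzz77777777, so a
; 16-byte load of the latter sees 8 bytes of -1 followed by 8 bytes of 7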
pb_f: times 16 db 15
pb_zzzzzzzz77777777: times 8 db -1
pb_7: times 8 db 7
pb_zzzz3333zzzzbbbb: db -1,-1,-1,-1,3,3,3,3,-1,-1,-1,-1,11,11,11,11
pb_zz11zz55zz99zzdd: db -1,-1,1,1,-1,-1,5,5,-1,-1,9,9,-1,-1,13,13

section .text align=16

%macro PSWAPD_SSE 2
    pshufw %1, %2, 0x4e
%endmacro
%macro PSWAPD_3DN1 2
    movq  %1, %2
    psrlq %1, 32
    punpckldq %1, %2
%endmacro
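; both PSWAPD variants swap the two 32-bit halves of an MMX register: the SSE
; version uses a single pshufw, the plain-3DNow! fallback emulates it with a
; shift and an unpack (3DNow!ext has a native pswapd, used by the 3dn2
; instantiation below)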

%macro FLOAT_TO_INT16_INTERLEAVE6 1
; void ff_float_to_int16_interleave6_sse(int16_t *dst, const float **src, int len)
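; each pass converts two floats from each of the six planar source channels
; (cvtps2pi, or pf2id in the 3DNow! variants), packs them to int16 and
; transposes them into two interleaved 6-channel samples (24 bytes of output);
; src[1..5] are kept as offsets relative to src[0] so only one pointer is
; advanced per iteration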
cglobal float_to_int16_interleave6_%1, 2,7,0, dst, src, src1, src2, src3, src4, src5
%ifdef ARCH_X86_64
    %define lend r10d
    mov     lend, r2d
%else
    %define lend dword r2m
%endif
    mov src1q, [srcq+1*gprsize]
    mov src2q, [srcq+2*gprsize]
    mov src3q, [srcq+3*gprsize]
    mov src4q, [srcq+4*gprsize]
    mov src5q, [srcq+5*gprsize]
    mov srcq,  [srcq]
    sub src1q, srcq
    sub src2q, srcq
    sub src3q, srcq
    sub src4q, srcq
    sub src5q, srcq
.loop:
    cvtps2pi   mm0, [srcq]
    cvtps2pi   mm1, [srcq+src1q]
    cvtps2pi   mm2, [srcq+src2q]
    cvtps2pi   mm3, [srcq+src3q]
    cvtps2pi   mm4, [srcq+src4q]
    cvtps2pi   mm5, [srcq+src5q]
    packssdw   mm0, mm3
    packssdw   mm1, mm4
    packssdw   mm2, mm5
    pswapd     mm3, mm0
    punpcklwd  mm0, mm1
    punpckhwd  mm1, mm2
    punpcklwd  mm2, mm3
    pswapd     mm3, mm0
    punpckldq  mm0, mm2
    punpckhdq  mm2, mm1
    punpckldq  mm1, mm3
    movq [dstq   ], mm0
    movq [dstq+16], mm2
    movq [dstq+ 8], mm1
    add srcq, 8
    add dstq, 24
    sub lend, 2
    jg .loop
    emms
    RET
%endmacro ; FLOAT_TO_INT16_INTERLEAVE6
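; three instantiations follow: sse uses cvtps2pi with pswapd emulated via
; pshufw, 3dnow remaps cvtps2pi to pf2id and uses the shift/unpack emulation,
; and 3dn2 uses pf2id with the native pswapd instruction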

%define pswapd PSWAPD_SSE
FLOAT_TO_INT16_INTERLEAVE6 sse
%define cvtps2pi pf2id
%define pswapd PSWAPD_3DN1
FLOAT_TO_INT16_INTERLEAVE6 3dnow
%undef pswapd
FLOAT_TO_INT16_INTERLEAVE6 3dn2
%undef cvtps2pi



%macro SCALARPRODUCT 1
; int scalarproduct_int16(int16_t *v1, int16_t *v2, int order, int shift)
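; order is a count of int16 elements (hence the shl by 1 below); pmaddwd forms
; pairwise products summed into dwords, the partial sums are reduced
; horizontally after the loop, and the shift is applied to the accumulated
; sums (psrad) rather than to each individual product; the memory operands to
; pmaddwd require the v2 accesses to be 16-byte aligned in the sse2 version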
cglobal scalarproduct_int16_%1, 3,3,4, v1, v2, order, shift
    shl orderq, 1
    add v1q, orderq
    add v2q, orderq
    neg orderq
    movd    m3, shiftm
    pxor    m2, m2
.loop:
    movu    m0, [v1q + orderq]
    movu    m1, [v1q + orderq + mmsize]
    pmaddwd m0, [v2q + orderq]
    pmaddwd m1, [v2q + orderq + mmsize]
    paddd   m2, m0
    paddd   m2, m1
    add     orderq, mmsize*2
    jl .loop
%if mmsize == 16
    movhlps m0, m2
    paddd   m2, m0
    psrad   m2, m3
    pshuflw m0, m2, 0x4e
%else
    psrad   m2, m3
    pshufw  m0, m2, 0x4e
%endif
    paddd   m2, m0
    movd   eax, m2
    RET

; int scalarproduct_and_madd_int16(int16_t *v1, int16_t *v2, int16_t *v3, int order, int mul)
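; computes the same dot product of v1 and v2 while also updating v1 in place:
; v1[i] += mul * v3[i] with 16-bit wraparound arithmetic (pmullw/paddw); m7
; holds mul broadcast to every word lane and m6 accumulates the dot product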
cglobal scalarproduct_and_madd_int16_%1, 3,4,8, v1, v2, v3, order, mul
    shl orderq, 1
    movd    m7, mulm
%if mmsize == 16
    pshuflw m7, m7, 0
    punpcklqdq m7, m7
%else
    pshufw  m7, m7, 0
%endif
    pxor    m6, m6
    add v1q, orderq
    add v2q, orderq
    add v3q, orderq
    neg orderq
.loop:
    movu    m0, [v2q + orderq]
    movu    m1, [v2q + orderq + mmsize]
    mova    m4, [v1q + orderq]
    mova    m5, [v1q + orderq + mmsize]
    movu    m2, [v3q + orderq]
    movu    m3, [v3q + orderq + mmsize]
    pmaddwd m0, m4
    pmaddwd m1, m5
    pmullw  m2, m7
    pmullw  m3, m7
    paddd   m6, m0
    paddd   m6, m1
    paddw   m2, m4
    paddw   m3, m5
    mova    [v1q + orderq], m2
    mova    [v1q + orderq + mmsize], m3
    add     orderq, mmsize*2
    jl .loop
%if mmsize == 16
    movhlps m0, m6
    paddd   m6, m0
    pshuflw m0, m6, 0x4e
%else
    pshufw  m0, m6, 0x4e
%endif
    paddd   m6, m0
    movd   eax, m6
    RET
%endmacro

INIT_MMX
SCALARPRODUCT mmx2
INIT_XMM
SCALARPRODUCT sse2

%macro SCALARPRODUCT_LOOP 1
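; %1 = byte offset of v2/v3 from 16-byte alignment; for a nonzero offset the
; unaligned data is reassembled from aligned loads with palignr, and on x86-64
; the v1 operands are cached in m8/m9 instead of being read from memory twice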
align 16
.loop%1:
    sub     orderq, mmsize*2
%if %1
    mova    m1, m4
    mova    m4, [v2q + orderq]
    mova    m0, [v2q + orderq + mmsize]
    palignr m1, m0, %1
    palignr m0, m4, %1
    mova    m3, m5
    mova    m5, [v3q + orderq]
    mova    m2, [v3q + orderq + mmsize]
    palignr m3, m2, %1
    palignr m2, m5, %1
%else
    mova    m0, [v2q + orderq]
    mova    m1, [v2q + orderq + mmsize]
    mova    m2, [v3q + orderq]
    mova    m3, [v3q + orderq + mmsize]
%endif
    %define t0  [v1q + orderq]
    %define t1  [v1q + orderq + mmsize]
%ifdef ARCH_X86_64
    mova    m8, t0
    mova    m9, t1
    %define t0  m8
    %define t1  m9
%endif
    pmaddwd m0, t0
    pmaddwd m1, t1
    pmullw  m2, m7
    pmullw  m3, m7
    paddw   m2, t0
    paddw   m3, t1
    paddd   m6, m0
    paddd   m6, m1
    mova    [v1q + orderq], m2
    mova    [v1q + orderq + mmsize], m3
    jg .loop%1
%if %1
    jmp .end
%endif
%endmacro

; int scalarproduct_and_madd_int16(int16_t *v1, int16_t *v2, int16_t *v3, int order, int mul)
cglobal scalarproduct_and_madd_int16_ssse3, 4,5,10, v1, v2, v3, order, mul
    shl orderq, 1
    movd    m7, mulm
    pshuflw m7, m7, 0
    punpcklqdq m7, m7
    pxor    m6, m6
    mov    r4d, v2d
    and    r4d, 15
    and    v2q, ~15
    and    v3q, ~15
    mova    m4, [v2q + orderq]
    mova    m5, [v3q + orderq]
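    ; v2 and v3 are rounded down to 16-byte alignment and r4d holds v2's
    ; misalignment (assumed even, i.e. at least 2-byte aligned); the matching
    ; SCALARPRODUCT_LOOP variant is then selected, so v2 and v3 have to share
    ; the same misalignment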
    ; linear is faster than branch tree or jump table, because the branches taken are cyclic (i.e. predictable)
    cmp    r4d, 0
    je .loop0
    cmp    r4d, 2
    je .loop2
    cmp    r4d, 4
    je .loop4
    cmp    r4d, 6
    je .loop6
    cmp    r4d, 8
    je .loop8
    cmp    r4d, 10
    je .loop10
    cmp    r4d, 12
    je .loop12
SCALARPRODUCT_LOOP 14
SCALARPRODUCT_LOOP 12
SCALARPRODUCT_LOOP 10
SCALARPRODUCT_LOOP 8
SCALARPRODUCT_LOOP 6
SCALARPRODUCT_LOOP 4
SCALARPRODUCT_LOOP 2
SCALARPRODUCT_LOOP 0
.end:
    movhlps m0, m6
    paddd   m6, m0
    pshuflw m0, m6, 0x4e
    paddd   m6, m0
    movd   eax, m6
    RET



; void ff_add_hfyu_median_prediction_mmx2(uint8_t *dst, const uint8_t *top, const uint8_t *diff, int w, int *left, int *left_top)
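; dst[i] = median(l, t, l + t - tl) + diff[i], where t and tl are the top and
; top-left neighbours and l is the previously written output byte; since each
; byte depends on the one before it, the 8 bytes of every MMX block are
; produced serially by the %rep 8 block, with pmaxub/pminub forming the median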
cglobal add_hfyu_median_prediction_mmx2, 6,6,0, dst, top, diff, w, left, left_top
    movq    mm0, [topq]
    movq    mm2, mm0
    movd    mm4, [left_topq]
    psllq   mm2, 8
    movq    mm1, mm0
    por     mm4, mm2
    movd    mm3, [leftq]
    psubb   mm0, mm4 ; t-tl
    add    dstq, wq
    add    topq, wq
    add   diffq, wq
    neg      wq
    jmp .skip
.loop:
    movq    mm4, [topq+wq]
    movq    mm0, mm4
    psllq   mm4, 8
    por     mm4, mm1
    movq    mm1, mm0 ; t
    psubb   mm0, mm4 ; t-tl
.skip:
    movq    mm2, [diffq+wq]
%assign i 0
%rep 8
    movq    mm4, mm0
    paddb   mm4, mm3 ; t-tl+l
    movq    mm5, mm3
    pmaxub  mm3, mm1
    pminub  mm5, mm1
    pminub  mm3, mm4
    pmaxub  mm3, mm5 ; median
    paddb   mm3, mm2 ; +residual
%if i==0
    movq    mm7, mm3
    psllq   mm7, 56
%else
    movq    mm6, mm3
    psrlq   mm7, 8
    psllq   mm6, 56
    por     mm7, mm6
%endif
%if i<7
    psrlq   mm0, 8
    psrlq   mm1, 8
    psrlq   mm2, 8
%endif
%assign i i+1
%endrep
    movq [dstq+wq], mm7
    add      wq, 8
    jl .loop
    movzx   r2d, byte [dstq-1]
    mov [leftq], r2d
    movzx   r2d, byte [topq-1]
    mov [left_topq], r2d
    RET


%macro ADD_HFYU_LEFT_LOOP 1 ; %1 = is_aligned
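; computes a running byte-wise prefix sum of src and adds it to the carried
; "left" value: each psllw/pshufb + paddb step doubles the summed span
; (log2(mmsize) steps in total), m5 broadcasts the last byte of the previous
; block into m0, and the last predicted byte is returned in the low byte of
; eax as the new left value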
    add     srcq, wq
    add     dstq, wq
    neg     wq
%%.loop:
    mova    m1, [srcq+wq]
    mova    m2, m1
    psllw   m1, 8
    paddb   m1, m2
    mova    m2, m1
    pshufb  m1, m3
    paddb   m1, m2
    pshufb  m0, m5
    mova    m2, m1
    pshufb  m1, m4
    paddb   m1, m2
%if mmsize == 16
    mova    m2, m1
    pshufb  m1, m6
    paddb   m1, m2
%endif
    paddb   m0, m1
%if %1
    mova    [dstq+wq], m0
%else
    movq    [dstq+wq], m0
    movhps  [dstq+wq+8], m0
%endif
    add     wq, mmsize
    jl %%.loop
    mov     eax, mmsize-1
    sub     eax, wd
    movd    m1, eax
    pshufb  m0, m1
    movd    eax, m0
    RET
%endmacro

; int ff_add_hfyu_left_prediction(uint8_t *dst, const uint8_t *src, int w, int left)
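; the ssse3 version operates on 8-byte MMX registers; the sse4 version uses
; 16-byte XMM registers, jumps back into the ssse3 code (.skip_prologue) when
; src is not 16-byte aligned, and uses an unaligned-store variant of the loop
; when only dst is misaligned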
INIT_MMX
cglobal add_hfyu_left_prediction_ssse3, 3,3,7, dst, src, w, left
.skip_prologue:
    mova    m5, [pb_7 GLOBAL]
    mova    m4, [pb_zzzz3333zzzzbbbb GLOBAL]
    mova    m3, [pb_zz11zz55zz99zzdd GLOBAL]
    movd    m0, leftm
    psllq   m0, 56
    ADD_HFYU_LEFT_LOOP 1

INIT_XMM
cglobal add_hfyu_left_prediction_sse4, 3,3,7, dst, src, w, left
    mova    m5, [pb_f GLOBAL]
    mova    m6, [pb_zzzzzzzz77777777 GLOBAL]
    mova    m4, [pb_zzzz3333zzzzbbbb GLOBAL]
    mova    m3, [pb_zz11zz55zz99zzdd GLOBAL]
    movd    m0, leftm
    pslldq  m0, 15
    test    srcq, 15
    jnz add_hfyu_left_prediction_ssse3.skip_prologue
    test    dstq, 15
    jnz .unaligned
    ADD_HFYU_LEFT_LOOP 1
.unaligned:
    ADD_HFYU_LEFT_LOOP 0