ffmpeg / libavcodec / x86 / dsputil_yasm.asm @ b10fa1bb

;******************************************************************************
;* MMX optimized DSP utils
;* Copyright (c) 2008 Loren Merritt
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "x86inc.asm"

SECTION_RODATA
pb_f: times 16 db 15
pb_zzzzzzzz77777777: times 8 db -1
pb_7: times 8 db 7
pb_zzzz3333zzzzbbbb: db -1,-1,-1,-1,3,3,3,3,-1,-1,-1,-1,11,11,11,11
pb_zz11zz55zz99zzdd: db -1,-1,1,1,-1,-1,5,5,-1,-1,9,9,-1,-1,13,13
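; Shuffle masks for the pshufb-based left-prediction code below.  Each name
; spells out the mask: a hex digit is the index of the source byte copied to
; that position, and 'z' marks a byte that pshufb zeroes (mask byte -1, high
; bit set).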

section .text align=16

%macro PSWAPD_SSE 2
    pshufw %1, %2, 0x4e
%endmacro
%macro PSWAPD_3DN1 2
    movq  %1, %2
    psrlq %1, 32
    punpckldq %1, %2
%endmacro
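; pswapd swaps the two 32-bit halves of an MMX register.  PSWAPD_SSE emulates
; it with a single pshufw (immediate 0x4e selects words 2,3,0,1); PSWAPD_3DN1
; is a three-instruction fallback for 3DNow! CPUs that lack the native pswapd
; used by the 3dn2 build below.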

%macro FLOAT_TO_INT16_INTERLEAVE6 1
; void ff_float_to_int16_interleave6_sse(int16_t *dst, const float **src, int len)
cglobal float_to_int16_interleave6_%1, 2,7,0, dst, src, src1, src2, src3, src4, src5
%ifdef ARCH_X86_64
    %define lend r10d
    mov     lend, r2d
%else
    %define lend dword r2m
%endif
    mov src1q, [srcq+1*gprsize]
    mov src2q, [srcq+2*gprsize]
    mov src3q, [srcq+3*gprsize]
    mov src4q, [srcq+4*gprsize]
    mov src5q, [srcq+5*gprsize]
    mov srcq,  [srcq]
    sub src1q, srcq
    sub src2q, srcq
    sub src3q, srcq
    sub src4q, srcq
    sub src5q, srcq
.loop:
    cvtps2pi   mm0, [srcq]
    cvtps2pi   mm1, [srcq+src1q]
    cvtps2pi   mm2, [srcq+src2q]
    cvtps2pi   mm3, [srcq+src3q]
    cvtps2pi   mm4, [srcq+src4q]
    cvtps2pi   mm5, [srcq+src5q]
    packssdw   mm0, mm3
    packssdw   mm1, mm4
    packssdw   mm2, mm5
    pswapd     mm3, mm0
    punpcklwd  mm0, mm1
    punpckhwd  mm1, mm2
    punpcklwd  mm2, mm3
    pswapd     mm3, mm0
    punpckldq  mm0, mm2
    punpckhdq  mm2, mm1
    punpckldq  mm1, mm3
    movq [dstq   ], mm0
    movq [dstq+16], mm2
    movq [dstq+ 8], mm1
    add srcq, 8
    add dstq, 24
    sub lend, 2
    jg .loop
    emms
    RET
%endmacro ; FLOAT_TO_INT16_INTERLEAVE6

%define pswapd PSWAPD_SSE
FLOAT_TO_INT16_INTERLEAVE6 sse
%define cvtps2pi pf2id
%define pswapd PSWAPD_3DN1
FLOAT_TO_INT16_INTERLEAVE6 3dnow
%undef pswapd
FLOAT_TO_INT16_INTERLEAVE6 3dn2
%undef cvtps2pi
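; Each instantiation above converts two float samples from each of six planar
; channels per iteration, packs them to int16 with saturation, and stores the
; twelve results channel-interleaved; the five extra channel pointers are kept
; as offsets from src[0] so one register walks all six planes.  Rough C
; equivalent (illustrative sketch only; rounding and clipping follow
; cvtps2pi / pf2id and packssdw):
;
;   void float_to_int16_interleave6(int16_t *dst, const float **src, int len)
;   {
;       for (int i = 0; i < len; i++)
;           for (int c = 0; c < 6; c++) {
;               int v = lrintf(src[c][i]);
;               *dst++ = v < -32768 ? -32768 : v > 32767 ? 32767 : v;
;           }
;   }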


%macro SCALARPRODUCT 1
; void add_int16(int16_t * v1, int16_t * v2, int order)
cglobal add_int16_%1, 3,3,2, v1, v2, order
    shl orderq, 1
    add v1q, orderq
    add v2q, orderq
    neg orderq
.loop:
    movu    m0, [v2q + orderq]
    movu    m1, [v2q + orderq + mmsize]
    paddw   m0, [v1q + orderq]
    paddw   m1, [v1q + orderq + mmsize]
    mova    [v1q + orderq], m0
    mova    [v1q + orderq + mmsize], m1
    add     orderq, mmsize*2
    jl .loop
    REP_RET

; void sub_int16(int16_t * v1, int16_t * v2, int order)
cglobal sub_int16_%1, 3,3,4, v1, v2, order
    shl orderq, 1
    add v1q, orderq
    add v2q, orderq
    neg orderq
.loop:
    movu    m2, [v2q + orderq]
    movu    m3, [v2q + orderq + mmsize]
    mova    m0, [v1q + orderq]
    mova    m1, [v1q + orderq + mmsize]
    psubw   m0, m2
    psubw   m1, m3
    mova    [v1q + orderq], m0
    mova    [v1q + orderq + mmsize], m1
    add     orderq, mmsize*2
    jl .loop
    REP_RET

; int scalarproduct_int16_sse2(int16_t * v1, int16_t * v2, int order, int shift)
cglobal scalarproduct_int16_%1, 3,3,4, v1, v2, order, shift
    shl orderq, 1
    add v1q, orderq
    add v2q, orderq
    neg orderq
    movd    m3, shiftm
    pxor    m2, m2
.loop:
    movu    m0, [v1q + orderq]
    movu    m1, [v1q + orderq + mmsize]
    pmaddwd m0, [v2q + orderq]
    pmaddwd m1, [v2q + orderq + mmsize]
    paddd   m2, m0
    paddd   m2, m1
    add     orderq, mmsize*2
    jl .loop
%if mmsize == 16
    movhlps m0, m2
    paddd   m2, m0
    psrad   m2, m3
    pshuflw m0, m2, 0x4e
%else
    psrad   m2, m3
    pshufw  m0, m2, 0x4e
%endif
    paddd   m2, m0
    movd   eax, m2
    RET
%endmacro

INIT_MMX
SCALARPRODUCT mmx2
INIT_XMM
SCALARPRODUCT sse2
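; SCALARPRODUCT emits the three kernels above once per ISA: mmx2 on 8-byte and
; sse2 on 16-byte registers, so each loop iteration handles mmsize int16
; elements of 'order'.  movu is used where the pointer may be unaligned; the
; remaining accesses assume mmsize-byte alignment.  Rough C equivalents
; (illustrative only; the SIMD code applies the shift to partial sums rather
; than to each product):
;
;   void add_int16(int16_t *v1, int16_t *v2, int order) {
;       while (order--) *v1++ += *v2++;
;   }
;   void sub_int16(int16_t *v1, int16_t *v2, int order) {
;       while (order--) *v1++ -= *v2++;
;   }
;   int scalarproduct_int16(int16_t *v1, int16_t *v2, int order, int shift) {
;       int res = 0;
;       while (order--) res += (*v1++ * *v2++) >> shift;
;       return res;
;   }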


; void ff_add_hfyu_median_prediction_mmx2(uint8_t *dst, const uint8_t *top, const uint8_t *diff, int w, int *left, int *left_top)
cglobal add_hfyu_median_prediction_mmx2, 6,6,0, dst, top, diff, w, left, left_top
    movq    mm0, [topq]
    movq    mm2, mm0
    movd    mm4, [left_topq]
    psllq   mm2, 8
    movq    mm1, mm0
    por     mm4, mm2
    movd    mm3, [leftq]
    psubb   mm0, mm4 ; t-tl
    add    dstq, wq
    add    topq, wq
    add   diffq, wq
    neg      wq
    jmp .skip
.loop:
    movq    mm4, [topq+wq]
    movq    mm0, mm4
    psllq   mm4, 8
    por     mm4, mm1
    movq    mm1, mm0 ; t
    psubb   mm0, mm4 ; t-tl
.skip:
    movq    mm2, [diffq+wq]
%assign i 0
%rep 8
    movq    mm4, mm0
    paddb   mm4, mm3 ; t-tl+l
    movq    mm5, mm3
    pmaxub  mm3, mm1
    pminub  mm5, mm1
    pminub  mm3, mm4
    pmaxub  mm3, mm5 ; median
    paddb   mm3, mm2 ; +residual
%if i==0
    movq    mm7, mm3
    psllq   mm7, 56
%else
    movq    mm6, mm3
    psrlq   mm7, 8
    psllq   mm6, 56
    por     mm7, mm6
%endif
%if i<7
    psrlq   mm0, 8
    psrlq   mm1, 8
    psrlq   mm2, 8
%endif
%assign i i+1
%endrep
    movq [dstq+wq], mm7
    add      wq, 8
    jl .loop
    movzx   r2d, byte [dstq-1]
    mov [leftq], r2d
    movzx   r2d, byte [topq-1]
    mov [left_topq], r2d
    RET
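; HuffYUV median prediction: each output byte is the transmitted residual plus
; the median of left, top and (left + top - topleft).  Because every byte's
; "left" is the previous reconstructed byte, the %rep 8 block above steps
; through the 8 bytes of each MMX word serially and reassembles the packed
; result in mm7; the final left / left_top bytes are stored back through the
; int pointers for the caller.  Rough C equivalent (illustrative; mid() is the
; median of its three arguments, arithmetic wraps to 8 bits):
;
;   void add_hfyu_median_prediction(uint8_t *dst, const uint8_t *top,
;                                   const uint8_t *diff, int w,
;                                   int *left, int *left_top)
;   {
;       uint8_t l = *left, tl = *left_top;
;       for (int i = 0; i < w; i++) {
;           l = mid(l, top[i], (uint8_t)(l + top[i] - tl)) + diff[i];
;           tl = top[i];
;           dst[i] = l;
;       }
;       *left = l;
;       *left_top = tl;
;   }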

%macro ADD_HFYU_LEFT_LOOP 1 ; %1 = is_aligned
    add     srcq, wq
    add     dstq, wq
    neg     wq
%%.loop:
    mova    m1, [srcq+wq]
    mova    m2, m1
    psllw   m1, 8
    paddb   m1, m2
    mova    m2, m1
    pshufb  m1, m3
    paddb   m1, m2
    pshufb  m0, m5
    mova    m2, m1
    pshufb  m1, m4
    paddb   m1, m2
%if mmsize == 16
    mova    m2, m1
    pshufb  m1, m6
    paddb   m1, m2
%endif
    paddb   m0, m1
%if %1
    mova    [dstq+wq], m0
%else
    movq    [dstq+wq], m0
    movhps  [dstq+wq+8], m0
%endif
    add     wq, mmsize
    jl %%.loop
    mov     eax, mmsize-1
    sub     eax, wd
    movd    m1, eax
    pshufb  m0, m1
    movd    eax, m0
    RET
%endmacro
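; ADD_HFYU_LEFT_LOOP reconstructs left-predicted bytes: dst[i] = dst[i-1] +
; src[i], with the incoming "left" carried in m0.  Each mmsize-byte block is
; turned into an inclusive prefix sum in log2(mmsize) steps: add the byte one
; position to the left (psllw+paddb), then pshufb-propagate the sums across
; 2-, 4- and, for XMM, 8-byte boundaries; the last byte of the previous block
; is broadcast via pb_7 / pb_f and added as the carry.  The last byte written
; is returned as the new left.  Rough C equivalent (illustrative):
;
;   int add_hfyu_left_prediction(uint8_t *dst, const uint8_t *src, int w, int left)
;   {
;       for (int i = 0; i < w; i++) {
;           left = (left + src[i]) & 0xff;
;           dst[i] = left;
;       }
;       return left;
;   }
;
; The SSE4 entry point below falls back to the SSSE3/MMX version (via
; .skip_prologue) when src is not 16-byte aligned, and picks aligned or
; unaligned stores depending on dst alignment.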

; int ff_add_hfyu_left_prediction(uint8_t *dst, const uint8_t *src, int w, int left)
INIT_MMX
cglobal add_hfyu_left_prediction_ssse3, 3,3,7, dst, src, w, left
.skip_prologue:
    mova    m5, [pb_7 GLOBAL]
    mova    m4, [pb_zzzz3333zzzzbbbb GLOBAL]
    mova    m3, [pb_zz11zz55zz99zzdd GLOBAL]
    movd    m0, leftm
    psllq   m0, 56
    ADD_HFYU_LEFT_LOOP 1

INIT_XMM
cglobal add_hfyu_left_prediction_sse4, 3,3,7, dst, src, w, left
    mova    m5, [pb_f GLOBAL]
    mova    m6, [pb_zzzzzzzz77777777 GLOBAL]
    mova    m4, [pb_zzzz3333zzzzbbbb GLOBAL]
    mova    m3, [pb_zz11zz55zz99zzdd GLOBAL]
    movd    m0, leftm
    pslldq  m0, 15
    test    srcq, 15
    jnz add_hfyu_left_prediction_ssse3.skip_prologue
    test    dstq, 15
    jnz .unaligned
    ADD_HFYU_LEFT_LOOP 1
.unaligned:
    ADD_HFYU_LEFT_LOOP 0