ffmpeg/libavcodec/x86/dsputilenc_yasm.asm @ 888fa31e

;*****************************************************************************
;* MMX optimized DSP utils
;*****************************************************************************
;* Copyright (c) 2000, 2001 Fabrice Bellard
;* Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
;*
;* This file is part of Libav.
;*
;* Libav is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* Libav is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with Libav; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;*****************************************************************************

%include "x86inc.asm"
%include "x86util.asm"

SECTION .text

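; DIFF_PIXELS_1: load mmsize/2 pixels from each of %3 and %4 and leave their
; signed 16-bit differences in %1 (%2 is clobbered as a temporary)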
%macro DIFF_PIXELS_1 4
    movh            %1, %3
    movh            %2, %4
    punpcklbw       %2, %1
    punpcklbw       %1, %1
    psubw           %1, %2
%endmacro

; %1=uint8_t *pix1, %2=uint8_t *pix2, %3=static offset, %4=stride, %5=stride*3
; %6=temporary storage location
; this macro requires $mmsize stack space (aligned) on %6 (except on SSE+x86-64)
%macro DIFF_PIXELS_8 6
    DIFF_PIXELS_1   m0, m7, [%1     +%3], [%2     +%3]
    DIFF_PIXELS_1   m1, m7, [%1+%4  +%3], [%2+%4  +%3]
    DIFF_PIXELS_1   m2, m7, [%1+%4*2+%3], [%2+%4*2+%3]
    add             %1, %5
    add             %2, %5
    DIFF_PIXELS_1   m3, m7, [%1     +%3], [%2     +%3]
    DIFF_PIXELS_1   m4, m7, [%1+%4  +%3], [%2+%4  +%3]
    DIFF_PIXELS_1   m5, m7, [%1+%4*2+%3], [%2+%4*2+%3]
    DIFF_PIXELS_1   m6, m7, [%1+%5  +%3], [%2+%5  +%3]
%ifdef m8
    DIFF_PIXELS_1   m7, m8, [%1+%4*4+%3], [%2+%4*4+%3]
%else
    mova          [%6], m0
    DIFF_PIXELS_1   m7, m0, [%1+%4*4+%3], [%2+%4*4+%3]
    mova            m0, [%6]
%endif
    sub             %1, %5
    sub             %2, %5
%endmacro

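; HADAMARD8: three butterfly stages of paired sums/differences (SUMSUB_BADC),
; i.e. an 8-point Hadamard transform applied across the registers m0-m7,
; up to sign and coefficient order (which is irrelevant here, since only the
; absolute values of the coefficients are summed afterwards)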
%macro HADAMARD8 0
    SUMSUB_BADC       m0, m1, m2, m3
    SUMSUB_BADC       m4, m5, m6, m7
    SUMSUB_BADC       m0, m2, m1, m3
    SUMSUB_BADC       m4, m6, m5, m7
    SUMSUB_BADC       m0, m4, m1, m5
    SUMSUB_BADC       m2, m6, m3, m7
%endmacro

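; ABS1_SUM / ABS2_SUM: take the absolute value of one (resp. two) register(s),
; using the extra arguments as temporaries, and accumulate the result into the
; running total(s) with a saturating unsigned 16-bit add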
%macro ABS1_SUM 3
    ABS1            %1, %2
    paddusw         %3, %1
%endmacro

%macro ABS2_SUM 6
    ABS2            %1, %2, %3, %4
    paddusw         %5, %1
    paddusw         %6, %2
%endmacro

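; ABS_SUM_8x8_64: sum of the absolute values of the words in m0-m7,
; accumulated into m0, using m8/m9 as temporaries (only available when the
; m8+ registers exist, i.e. SSE on x86-64); %1 is unused and only kept for
; signature parity with ABS_SUM_8x8_32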
%macro ABS_SUM_8x8_64 1
    ABS2            m0, m1, m8, m9
    ABS2_SUM        m2, m3, m8, m9, m0, m1
    ABS2_SUM        m4, m5, m8, m9, m0, m1
    ABS2_SUM        m6, m7, m8, m9, m0, m1
    paddusw         m0, m1
%endmacro

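; ABS_SUM_8x8_32: same result with only eight registers: m7 is spilled to the
; stack slot at %1 so it can serve as the ABS1 temporary, the absolute values
; are accumulated into m0 and m1, the saved m7 is reloaded (into m2) and added
; as well, and the two partial sums are combined by the final paddusw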
%macro ABS_SUM_8x8_32 1
    mova          [%1], m7
    ABS1            m0, m7
    ABS1            m1, m7
    ABS1_SUM        m2, m7, m0
    ABS1_SUM        m3, m7, m1
    ABS1_SUM        m4, m7, m0
    ABS1_SUM        m5, m7, m1
    ABS1_SUM        m6, m7, m0
    mova            m2, [%1]
    ABS1_SUM        m2, m7, m1
    paddusw         m0, m1
%endmacro

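; HSUM_*: horizontally add the word lanes of %1 into the general purpose
; register %3, using %2 as a temporary; the reduction is done with shifts
; (MMX), pshufw (MMX2) or movhlps/pshuflw (SSE2), always with saturating adds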
; FIXME: HSUM_* saturates at 64k, while an 8x8 hadamard or dct block can get up to
; about 100k on extreme inputs. But that's very unlikely to occur in natural video,
; and it's even more unlikely to not have any alternative mvs/modes with lower cost.
%macro HSUM_MMX 3
    mova            %2, %1
    psrlq           %1, 32
    paddusw         %1, %2
    mova            %2, %1
    psrlq           %1, 16
    paddusw         %1, %2
    movd            %3, %1
%endmacro

%macro HSUM_MMX2 3
    pshufw          %2, %1, 0xE
    paddusw         %1, %2
    pshufw          %2, %1, 0x1
    paddusw         %1, %2
    movd            %3, %1
%endmacro

%macro HSUM_SSE2 3
    movhlps         %2, %1
    paddusw         %1, %2
    pshuflw         %2, %1, 0xE
    paddusw         %1, %2
    pshuflw         %2, %1, 0x1
    paddusw         %1, %2
    movd            %3, %1
%endmacro

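; STORE4 / LOAD4: spill or reload four registers to/from four consecutive
; mmsize-wide slots starting at address %1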
%macro STORE4 5
    mova [%1+mmsize*0], %2
    mova [%1+mmsize*1], %3
    mova [%1+mmsize*2], %4
    mova [%1+mmsize*3], %5
%endmacro

%macro LOAD4 5
    mova            %2, [%1+mmsize*0]
    mova            %3, [%1+mmsize*1]
    mova            %4, [%1+mmsize*2]
    mova            %5, [%1+mmsize*3]
%endmacro

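; hadamard8_16_wrapper: emit the hadamard8_diff_%1 (one 8x8 block) and
; hadamard8_diff16_%1 (two or four 8x8 blocks, depending on h) entry points
; around the shared hadamard8x8_diff_%1 helper; %2 is the xmm register count
; passed to cglobal, and when no m8+ registers are available roughly
; %3*mmsize bytes of aligned scratch space are reserved on the stack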
%macro hadamard8_16_wrapper 3
cglobal hadamard8_diff_%1, 4, 4, %2
%ifndef m8
    %assign pad %3*mmsize-(4+stack_offset&(mmsize-1))
    SUB            rsp, pad
%endif
    call hadamard8x8_diff_%1
%ifndef m8
    ADD            rsp, pad
%endif
    RET

cglobal hadamard8_diff16_%1, 5, 6, %2
%ifndef m8
    %assign pad %3*mmsize-(4+stack_offset&(mmsize-1))
    SUB            rsp, pad
%endif

    call hadamard8x8_diff_%1
    mov            r5d, eax

    add             r1, 8
    add             r2, 8
    call hadamard8x8_diff_%1
    add            r5d, eax

    cmp            r4d, 16
    jne .done

    lea             r1, [r1+r3*8-8]
    lea             r2, [r2+r3*8-8]
    call hadamard8x8_diff_%1
    add            r5d, eax

    add             r1, 8
    add             r2, 8
    call hadamard8x8_diff_%1
    add            r5d, eax

.done:
    mov            eax, r5d
%ifndef m8
    ADD            rsp, pad
%endif
    RET
%endmacro

%macro HADAMARD8_DIFF_MMX 1
ALIGN 16
; int hadamard8_diff_##cpu(void *s, uint8_t *src1, uint8_t *src2,
;                          int stride, int h)
; r0 = void *s = unused, int h = unused (always 8)
; note how r1, r2 and r3 are not clobbered in this function, so the 16x16
; version can simply call this 2x2 times (and that's why we access
; rsp+gprsize everywhere, which is the rsp of the calling function)
hadamard8x8_diff_%1:
    lea                          r0, [r3*3]

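; overview: compute the 8x8 block of pixel differences, apply the Hadamard
; transform along one dimension, transpose via the stack (in 4x4 quarters,
; since an MMX register only holds four words), apply it along the other
; dimension, and sum the absolute values of all coefficients; the low 16 bits
; of that sum are returned in eax
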
    ; first 4x8 pixels
    DIFF_PIXELS_8                r1, r2,  0, r3, r0, rsp+gprsize+0x60
    HADAMARD8
    mova         [rsp+gprsize+0x60], m7
    TRANSPOSE4x4W                 0,  1,  2,  3,  7
    STORE4              rsp+gprsize, m0, m1, m2, m3
    mova                         m7, [rsp+gprsize+0x60]
    TRANSPOSE4x4W                 4,  5,  6,  7,  0
    STORE4         rsp+gprsize+0x40, m4, m5, m6, m7

    ; second 4x8 pixels
    DIFF_PIXELS_8                r1, r2,  4, r3, r0, rsp+gprsize+0x60
    HADAMARD8
    mova         [rsp+gprsize+0x60], m7
    TRANSPOSE4x4W                 0,  1,  2,  3,  7
    STORE4         rsp+gprsize+0x20, m0, m1, m2, m3
    mova                         m7, [rsp+gprsize+0x60]
    TRANSPOSE4x4W                 4,  5,  6,  7,  0

    LOAD4          rsp+gprsize+0x40, m0, m1, m2, m3
    HADAMARD8
    ABS_SUM_8x8_32 rsp+gprsize+0x60
    mova         [rsp+gprsize+0x60], m0

    LOAD4          rsp+gprsize     , m0, m1, m2, m3
    LOAD4          rsp+gprsize+0x20, m4, m5, m6, m7
    HADAMARD8
    ABS_SUM_8x8_32 rsp+gprsize
    paddusw                      m0, [rsp+gprsize+0x60]

    HSUM                         m0, m1, eax
    and                         rax, 0xFFFF
    ret

hadamard8_16_wrapper %1, 0, 14
%endmacro

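; HADAMARD8_DIFF_SSE2: same SATD, but with eight words per register the whole
; 8x8 block of differences fits in m0-m7, so a single TRANSPOSE8x8W replaces
; the staged 4x4 transposes (on x86-32 it needs two stack slots as scratch)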
%macro HADAMARD8_DIFF_SSE2 2
hadamard8x8_diff_%1:
    lea                          r0, [r3*3]
    DIFF_PIXELS_8                r1, r2,  0, r3, r0, rsp+gprsize
    HADAMARD8
%ifdef ARCH_X86_64
    TRANSPOSE8x8W                 0,  1,  2,  3,  4,  5,  6,  7,  8
%else
    TRANSPOSE8x8W                 0,  1,  2,  3,  4,  5,  6,  7, [rsp+gprsize], [rsp+mmsize+gprsize]
%endif
    HADAMARD8
    ABS_SUM_8x8         rsp+gprsize
    HSUM_SSE2                    m0, m1, eax
    and                         eax, 0xFFFF
    ret

hadamard8_16_wrapper %1, %2, 3
%endmacro

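; instantiate the mmx, mmx2, sse2 and ssse3 versions with the matching
; ABS1/ABS2, HSUM and ABS_SUM_8x8 primitives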
INIT_MMX
%define ABS1 ABS1_MMX
%define HSUM HSUM_MMX
HADAMARD8_DIFF_MMX mmx

%define ABS1 ABS1_MMX2
%define HSUM HSUM_MMX2
HADAMARD8_DIFF_MMX mmx2

INIT_XMM
%define ABS2 ABS2_MMX2
%ifdef ARCH_X86_64
%define ABS_SUM_8x8 ABS_SUM_8x8_64
%else
%define ABS_SUM_8x8 ABS_SUM_8x8_32
%endif
HADAMARD8_DIFF_SSE2 sse2, 10

%define ABS2        ABS2_SSSE3
%define ABS_SUM_8x8 ABS_SUM_8x8_64
HADAMARD8_DIFF_SSE2 ssse3, 9

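; sse16_sse2: sum of squared differences over a block 16 pixels wide; h is
; halved up front because each loop iteration handles two lines, and the
; dword partial sums are accumulated in m7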
INIT_XMM
; sse16_sse2(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
cglobal sse16_sse2, 5, 5, 8
    shr      r4d, 1
    pxor      m0, m0         ; mm0 = 0
    pxor      m7, m7         ; mm7 holds the sum

.next2lines: ; FIXME why are these unaligned movs? pix1[] is aligned
    movu      m1, [r1   ]    ; mm1 = pix1[0][0-15]
    movu      m2, [r2   ]    ; mm2 = pix2[0][0-15]
    movu      m3, [r1+r3]    ; mm3 = pix1[1][0-15]
    movu      m4, [r2+r3]    ; mm4 = pix2[1][0-15]

    ; todo: mm1-mm2, mm3-mm4
    ; algo: subtract mm1 from mm2 with saturation and vice versa
    ;       OR the result to get the absolute difference
    mova      m5, m1
    mova      m6, m3
    psubusb   m1, m2
    psubusb   m3, m4
    psubusb   m2, m5
    psubusb   m4, m6

    por       m2, m1
    por       m4, m3

    ; now convert to 16-bit vectors so we can square them
    mova      m1, m2
    mova      m3, m4

    punpckhbw m2, m0
    punpckhbw m4, m0
    punpcklbw m1, m0         ; mm1 now spread over (mm1,mm2)
    punpcklbw m3, m0         ; mm4 now spread over (mm3,mm4)

    pmaddwd   m2, m2
    pmaddwd   m4, m4
    pmaddwd   m1, m1
    pmaddwd   m3, m3

    lea       r1, [r1+r3*2]  ; pix1 += 2*line_size
    lea       r2, [r2+r3*2]  ; pix2 += 2*line_size

    paddd     m1, m2
    paddd     m3, m4
    paddd     m7, m1
    paddd     m7, m3

    dec       r4
    jnz .next2lines

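    ; horizontally add the four dword partial sums in m7 and return the total in eax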
    mova      m1, m7
    psrldq    m7, 8          ; shift hi qword to lo
    paddd     m7, m1
    mova      m1, m7
    psrldq    m7, 4          ; shift hi dword to lo
    paddd     m7, m1
    movd     eax, m7         ; return value
    RET