ffmpeg / libavcodec / x86 / h264_idct.asm @ ae112918

;*****************************************************************************
;* MMX/SSE2-optimized H.264 iDCT
;*****************************************************************************
;* Copyright (C) 2004-2005 Michael Niedermayer, Loren Merritt
;* Copyright (C) 2003-2008 x264 project
;*
;* Authors: Laurent Aimar <fenrir@via.ecp.fr>
;*          Loren Merritt <lorenm@u.washington.edu>
;*          Holger Lubitz <hal@duncan.ol.sub.de>
;*          Min Chen <chenm001@163.com>
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;*****************************************************************************

%include "x86inc.asm"
%include "x86util.asm"

SECTION_RODATA

; FIXME this table is a duplicate from h264data.h, and will be removed once
; the tables from h264 have been split
scan8_mem: db 4+1*8, 5+1*8, 4+2*8, 5+2*8
           db 6+1*8, 7+1*8, 6+2*8, 7+2*8
           db 4+3*8, 5+3*8, 4+4*8, 5+4*8
           db 6+3*8, 7+3*8, 6+4*8, 7+4*8
           db 1+1*8, 2+1*8
           db 1+2*8, 2+2*8
           db 1+4*8, 2+4*8
           db 1+5*8, 2+5*8
%ifdef PIC
%define scan8 r11
%else
%define scan8 scan8_mem
%endif
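
; scan8[i] is the offset of 4x4 block i inside the decoder's 8-byte-stride
; non_zero_count cache (the nnzc[6*8] argument of the add16/add8 functions
; below): entries 0-15 are the luma blocks, 16-19 chroma U, 20-23 chroma V.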

cextern pw_32

SECTION .text

; %1=uint8_t *dst, %2=int16_t *block, %3=int stride
%macro IDCT4_ADD 3
    ; Load dct coeffs
    movq         m0, [%2]
    movq         m1, [%2+8]
    movq         m2, [%2+16]
    movq         m3, [%2+24]

    IDCT4_1D      0, 1, 2, 3, 4, 5
    mova         m6, [pw_32]
    TRANSPOSE4x4W 0, 1, 2, 3, 4
    paddw        m0, m6
    IDCT4_1D      0, 1, 2, 3, 4, 5
    pxor         m7, m7

    STORE_DIFFx2 m0, m1, m4, m5, m7, 6, %1, %3
    lea          %1, [%1+%3*2]
    STORE_DIFFx2 m2, m3, m4, m5, m7, 6, %1, %3
%endmacro
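
; For reference, IDCT4_1D (from x86util.asm) applies the H.264 4-point
; transform to the four columns held in the named registers; in scalar form
; (cf. ff_h264_idct_add_c):
;     z0 =  d0 + d2            z1 =  d0 - d2
;     z2 = (d1 >> 1) - d3      z3 =  d1 + (d3 >> 1)
;     out0 = z0 + z3    out1 = z1 + z2    out2 = z1 - z2    out3 = z0 - z3
; The pw_32 added to row 0 between the two passes propagates to all outputs,
; so the shift by 6 in STORE_DIFFx2 implements the final (x + 32) >> 6.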

INIT_MMX
; ff_h264_idct_add_mmx(uint8_t *dst, int16_t *block, int stride)
cglobal h264_idct_add_mmx, 3, 3, 0
    IDCT4_ADD    r0, r1, r2
    RET
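
; (x86inc convention: the "3, 3, 0" after the name declares 3 arguments,
; 3 general-purpose registers and 0 XMM registers, so dst/block/stride are
; available as r0/r1/r2 regardless of ABI.)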

%macro IDCT8_1D 2
    mova         m4, m5
    mova         m0, m1
    psraw        m4, 1
    psraw        m1, 1
    paddw        m4, m5
    paddw        m1, m0
    paddw        m4, m7
    paddw        m1, m5
    psubw        m4, m0
    paddw        m1, m3

    psubw        m0, m3
    psubw        m5, m3
    paddw        m0, m7
    psubw        m5, m7
    psraw        m3, 1
    psraw        m7, 1
    psubw        m0, m3
    psubw        m5, m7

    mova         m3, m4
    mova         m7, m1
    psraw        m1, 2
    psraw        m3, 2
    paddw        m3, m0
    psraw        m0, 2
    paddw        m1, m5
    psraw        m5, 2
    psubw        m0, m4
    psubw        m7, m5

    mova         m4, m2
    mova         m5, m6
    psraw        m4, 1
    psraw        m6, 1
    psubw        m4, m5
    paddw        m6, m2

    mova         m2, %1
    mova         m5, %2
    SUMSUB_BA    m5, m2
    SUMSUB_BA    m6, m5
    SUMSUB_BA    m4, m2
    SUMSUB_BA    m7, m6
    SUMSUB_BA    m0, m4
    SUMSUB_BA    m3, m2
    SUMSUB_BA    m1, m5
    SWAP          7, 6, 4, 5, 2, 3, 1, 0 ; 70315246 -> 01234567
%endmacro
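
; Scalar form of the 8-point transform above (cf. ff_h264_idct8_add_c),
; with inputs d0..d7.  Even part:
;     a0 = d0 + d4                  a2 = d0 - d4
;     a4 = (d2 >> 1) - d6           a6 = d2 + (d6 >> 1)
;     b0 = a0 + a6    b2 = a2 + a4    b4 = a2 - a4    b6 = a0 - a6
; Odd part:
;     a1 = -d3 + d5 - d7 - (d7 >> 1)    a3 = d1 + d7 - d3 - (d3 >> 1)
;     a5 = -d1 + d7 + d5 + (d5 >> 1)    a7 = d3 + d5 + d1 + (d1 >> 1)
;     b1 = a1 + (a7 >> 2)    b3 = a3 + (a5 >> 2)
;     b5 = (a3 >> 2) - a5    b7 = a7 - (a1 >> 2)
; The outputs are the sums/differences b_even +/- b_odd formed by the final
; SUMSUB_BA chain, reordered into m0..m7 by the SWAP.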

%macro IDCT8_1D_FULL 1
    mova         m7, [%1+112]
    mova         m6, [%1+ 96]
    mova         m5, [%1+ 80]
    mova         m3, [%1+ 48]
    mova         m2, [%1+ 32]
    mova         m1, [%1+ 16]
    IDCT8_1D   [%1], [%1+ 64]
%endmacro

; %1=int16_t *block, %2=int16_t *dstblock
%macro IDCT8_ADD_MMX_START 2
    IDCT8_1D_FULL %1
    mova       [%1], m7
    TRANSPOSE4x4W 0, 1, 2, 3, 7
    mova         m7, [%1]
    mova    [%2   ], m0
    mova    [%2+16], m1
    mova    [%2+32], m2
    mova    [%2+48], m3
    TRANSPOSE4x4W 4, 5, 6, 7, 3
    mova    [%2+ 8], m4
    mova    [%2+24], m5
    mova    [%2+40], m6
    mova    [%2+56], m7
%endmacro
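
; With only eight 64-bit MMX registers, the 8x8 transform is processed in
; 8x4 halves: IDCT8_ADD_MMX_START runs the column pass on one half and
; transposes it into a 128-byte stack scratch buffer (reserved by the
; callers via the "pad" computation below); IDCT8_ADD_MMX_END then runs the
; row pass on a half and adds the result to dst.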

; %1=uint8_t *dst, %2=int16_t *block, %3=int stride
%macro IDCT8_ADD_MMX_END 3
    IDCT8_1D_FULL %2
    mova    [%2   ], m5
    mova    [%2+16], m6
    mova    [%2+32], m7

    pxor         m7, m7
    STORE_DIFFx2 m0, m1, m5, m6, m7, 6, %1, %3
    lea          %1, [%1+%3*2]
    STORE_DIFFx2 m2, m3, m5, m6, m7, 6, %1, %3
    mova         m0, [%2   ]
    mova         m1, [%2+16]
    mova         m2, [%2+32]
    lea          %1, [%1+%3*2]
    STORE_DIFFx2 m4, m0, m5, m6, m7, 6, %1, %3
    lea          %1, [%1+%3*2]
    STORE_DIFFx2 m1, m2, m5, m6, m7, 6, %1, %3
%endmacro

INIT_MMX
; ff_h264_idct8_add_mmx(uint8_t *dst, int16_t *block, int stride)
cglobal h264_idct8_add_mmx, 3, 4, 0
    %assign pad 128+4-(stack_offset&7)
    SUB         rsp, pad

    add   word [r1], 32
    IDCT8_ADD_MMX_START r1  , rsp
    IDCT8_ADD_MMX_START r1+8, rsp+64
    lea          r3, [r0+4]
    IDCT8_ADD_MMX_END   r0  , rsp,   r2
    IDCT8_ADD_MMX_END   r3  , rsp+8, r2

    ADD         rsp, pad
    RET
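
; The "add word [r1], 32" above folds the (x + 32) >> 6 rounding bias into
; the DC coefficient before the transform; since DC contributes with weight
; +1 to every output, no separate pw_32 addition is needed between passes.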

; %1=uint8_t *dst, %2=int16_t *block, %3=int stride, %4=temp gpr (stride*3)
%macro IDCT8_ADD_SSE 4
    IDCT8_1D_FULL %2
%ifdef ARCH_X86_64
    TRANSPOSE8x8W 0, 1, 2, 3, 4, 5, 6, 7, 8
%else
    TRANSPOSE8x8W 0, 1, 2, 3, 4, 5, 6, 7, [%2], [%2+16]
%endif
    paddw        m0, [pw_32]

%ifndef ARCH_X86_64
    mova    [%2   ], m0
    mova    [%2+16], m4
    IDCT8_1D   [%2], [%2+ 16]
    mova    [%2   ], m6
    mova    [%2+16], m7
%else
    SWAP          0, 8
    SWAP          4, 9
    IDCT8_1D     m8, m9
    SWAP          6, 8
    SWAP          7, 9
%endif

    pxor         m7, m7
    lea          %4, [%3*3]
    STORE_DIFF   m0, m6, m7, [%1     ]
    STORE_DIFF   m1, m6, m7, [%1+%3  ]
    STORE_DIFF   m2, m6, m7, [%1+%3*2]
    STORE_DIFF   m3, m6, m7, [%1+%4  ]
%ifndef ARCH_X86_64
    mova         m0, [%2   ]
    mova         m1, [%2+16]
%else
    SWAP          0, 8
    SWAP          1, 9
%endif
    lea          %1, [%1+%3*4]
    STORE_DIFF   m4, m6, m7, [%1     ]
    STORE_DIFF   m5, m6, m7, [%1+%3  ]
    STORE_DIFF   m0, m6, m7, [%1+%3*2]
    STORE_DIFF   m1, m6, m7, [%1+%4  ]
%endmacro
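
; The SSE2 version keeps the whole 8x8 block in XMM registers: on x86-64 it
; parks two rows in xmm8/xmm9 around the second IDCT8_1D pass, while on
; x86-32 (only 8 XMM registers) it spills them to the block buffer instead.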

INIT_XMM
; ff_h264_idct8_add_sse2(uint8_t *dst, int16_t *block, int stride)
cglobal h264_idct8_add_sse2, 3, 4, 10
    IDCT8_ADD_SSE r0, r1, r2, r3
    RET

%macro DC_ADD_MMX2_INIT 2-3
%if %0 == 2
    movsx        %1, word [%1]
    add          %1, 32
    sar          %1, 6
    movd         m0, %1
    lea          %1, [%2*3]
%else
    add          %3, 32
    sar          %3, 6
    movd         m0, %3
    lea          %3, [%2*3]
%endif
    pshufw       m0, m0, 0
    pxor         m1, m1
    psubw        m1, m0
    packuswb     m0, m0
    packuswb     m1, m1
%endmacro

%macro DC_ADD_MMX2_OP 3-4
    %1           m2, [%2     ]
    %1           m3, [%2+%3  ]
    %1           m4, [%2+%3*2]
    %1           m5, [%2+%4  ]
    paddusb      m2, m0
    paddusb      m3, m0
    paddusb      m4, m0
    paddusb      m5, m0
    psubusb      m2, m1
    psubusb      m3, m1
    psubusb      m4, m1
    psubusb      m5, m1
    %1    [%2     ], m2
    %1    [%2+%3  ], m3
    %1    [%2+%3*2], m4
    %1    [%2+%4  ], m5
%endmacro
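
; DC_ADD_MMX2_INIT computes dc = (block[0] + 32) >> 6 and splits it by sign:
; m0 = packuswb(dc) is +dc clamped to [0,255] (zero for negative dc), and
; m1 = packuswb(-dc) holds the magnitude for negative dc.  Because only
; unsigned saturating byte ops exist, DC_ADD_MMX2_OP then computes
; clip(pixel + dc) as psubusb(paddusb(pixel, m0), m1) over four rows.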

INIT_MMX
; ff_h264_idct_dc_add_mmx2(uint8_t *dst, int16_t *block, int stride)
cglobal h264_idct_dc_add_mmx2, 3, 3, 0
    DC_ADD_MMX2_INIT r1, r2
    DC_ADD_MMX2_OP movh, r0, r2, r1
    RET

; ff_h264_idct8_dc_add_mmx2(uint8_t *dst, int16_t *block, int stride)
cglobal h264_idct8_dc_add_mmx2, 3, 3, 0
    DC_ADD_MMX2_INIT r1, r2
    DC_ADD_MMX2_OP mova, r0, r2, r1
    lea          r0, [r0+r2*4]
    DC_ADD_MMX2_OP mova, r0, r2, r1
    RET

; ff_h264_idct_add16_mmx(uint8_t *dst, const int *block_offset,
;                        DCTELEM *block, int stride, const uint8_t nnzc[6*8])
cglobal h264_idct_add16_mmx, 5, 7, 0
    xor          r5, r5
%ifdef PIC
    lea         r11, [scan8_mem]
%endif
.nextblock
    movzx        r6, byte [scan8+r5]
    movzx        r6, byte [r4+r6]
    test         r6, r6
    jz .skipblock
    mov         r6d, dword [r1+r5*4]
    lea          r6, [r0+r6]
    IDCT4_ADD    r6, r2, r3
.skipblock
    inc          r5
    add          r2, 32
    cmp          r5, 16
    jl .nextblock
    REP_RET
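
; Loop pattern shared by the add16/add8/add4 variants: r5 is the block
; index, nnzc[scan8[r5]] selects blocks with coded coefficients,
; block_offset[r5] locates the block inside dst, and r2 advances 32 bytes
; per 4x4 coefficient block.  REP_RET (from x86inc) emits "rep ret" because
; a plain ret that is also a branch target mispredicts on AMD K8-era CPUs.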

; ff_h264_idct8_add4_mmx(uint8_t *dst, const int *block_offset,
;                        DCTELEM *block, int stride, const uint8_t nnzc[6*8])
cglobal h264_idct8_add4_mmx, 5, 7, 0
    %assign pad 128+4-(stack_offset&7)
    SUB         rsp, pad

    xor          r5, r5
%ifdef PIC
    lea         r11, [scan8_mem]
%endif
.nextblock
    movzx        r6, byte [scan8+r5]
    movzx        r6, byte [r4+r6]
    test         r6, r6
    jz .skipblock
    mov         r6d, dword [r1+r5*4]
    lea          r6, [r0+r6]
    add   word [r2], 32
    IDCT8_ADD_MMX_START r2  , rsp
    IDCT8_ADD_MMX_START r2+8, rsp+64
    IDCT8_ADD_MMX_END   r6  , rsp,   r3
    mov         r6d, dword [r1+r5*4]
    lea          r6, [r0+r6+4]
    IDCT8_ADD_MMX_END   r6  , rsp+8, r3
.skipblock
    add          r5, 4
    add          r2, 128
    cmp          r5, 16
    jl .nextblock
    ADD         rsp, pad
    RET

; ff_h264_idct_add16_mmx2(uint8_t *dst, const int *block_offset,
;                         DCTELEM *block, int stride, const uint8_t nnzc[6*8])
cglobal h264_idct_add16_mmx2, 5, 7, 0
    xor          r5, r5
%ifdef PIC
    lea         r11, [scan8_mem]
%endif
.nextblock
    movzx        r6, byte [scan8+r5]
    movzx        r6, byte [r4+r6]
    test         r6, r6
    jz .skipblock
    cmp          r6, 1
    jnz .no_dc
    movsx        r6, word [r2]
    test         r6, r6
    jz .no_dc
    DC_ADD_MMX2_INIT r2, r3, r6
%ifdef ARCH_X86_64
%define dst_reg  r10
%define dst_regd r10d
%else
%define dst_reg  r1
%define dst_regd r1d
%endif
    mov    dst_regd, dword [r1+r5*4]
    lea     dst_reg, [r0+dst_reg]
    DC_ADD_MMX2_OP movh, dst_reg, r3, r6
%ifndef ARCH_X86_64
    mov          r1, r1m
%endif
    inc          r5
    add          r2, 32
    cmp          r5, 16
    jl .nextblock
    REP_RET
.no_dc
    mov         r6d, dword [r1+r5*4]
    lea          r6, [r0+r6]
    IDCT4_ADD    r6, r2, r3
.skipblock
    inc          r5
    add          r2, 32
    cmp          r5, 16
    jl .nextblock
    REP_RET
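
; The mmx2 version adds a fast path: nnzc == 1 together with a nonzero DC
; coefficient means the block is DC-only, so the much cheaper DC_ADD code
; path is taken instead of the full IDCT4_ADD.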

; ff_h264_idct_add16intra_mmx(uint8_t *dst, const int *block_offset,
;                             DCTELEM *block, int stride, const uint8_t nnzc[6*8])
cglobal h264_idct_add16intra_mmx, 5, 7, 0
    xor          r5, r5
%ifdef PIC
    lea         r11, [scan8_mem]
%endif
.nextblock
    movzx        r6, byte [scan8+r5]
    movzx        r6, byte [r4+r6]
    or          r6w, word [r2]
    test         r6, r6
    jz .skipblock
    mov         r6d, dword [r1+r5*4]
    lea          r6, [r0+r6]
    IDCT4_ADD    r6, r2, r3
.skipblock
    inc          r5
    add          r2, 32
    cmp          r5, 16
    jl .nextblock
    REP_RET
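
; In intra16x16 macroblocks the DC coefficients are coded in a separate DC
; block, so nnzc only counts AC coefficients; the "or r6w, word [r2]" above
; therefore also tests block[0] before a block may be skipped.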

; ff_h264_idct_add16intra_mmx2(uint8_t *dst, const int *block_offset,
;                              DCTELEM *block, int stride, const uint8_t nnzc[6*8])
cglobal h264_idct_add16intra_mmx2, 5, 7, 0
    xor          r5, r5
%ifdef PIC
    lea         r11, [scan8_mem]
%endif
.nextblock
    movzx        r6, byte [scan8+r5]
    movzx        r6, byte [r4+r6]
    test         r6, r6
    jz .try_dc
    mov         r6d, dword [r1+r5*4]
    lea          r6, [r0+r6]
    IDCT4_ADD    r6, r2, r3
    inc          r5
    add          r2, 32
    cmp          r5, 16
    jl .nextblock
    REP_RET
.try_dc
    movsx        r6, word [r2]
    test         r6, r6
    jz .skipblock
    DC_ADD_MMX2_INIT r2, r3, r6
%ifdef ARCH_X86_64
%define dst_reg  r10
%define dst_regd r10d
%else
%define dst_reg  r1
%define dst_regd r1d
%endif
    mov    dst_regd, dword [r1+r5*4]
    lea     dst_reg, [r0+dst_reg]
    DC_ADD_MMX2_OP movh, dst_reg, r3, r6
%ifndef ARCH_X86_64
    mov          r1, r1m
%endif
.skipblock
    inc          r5
    add          r2, 32
    cmp          r5, 16
    jl .nextblock
    REP_RET

; ff_h264_idct8_add4_mmx2(uint8_t *dst, const int *block_offset,
;                         DCTELEM *block, int stride, const uint8_t nnzc[6*8])
cglobal h264_idct8_add4_mmx2, 5, 7, 0
    %assign pad 128+4-(stack_offset&7)
    SUB         rsp, pad

    xor          r5, r5
%ifdef PIC
    lea         r11, [scan8_mem]
%endif
.nextblock
    movzx        r6, byte [scan8+r5]
    movzx        r6, byte [r4+r6]
    test         r6, r6
    jz .skipblock
    cmp          r6, 1
    jnz .no_dc
    movsx        r6, word [r2]
    test         r6, r6
    jz .no_dc
    DC_ADD_MMX2_INIT r2, r3, r6
%ifdef ARCH_X86_64
%define dst_reg  r10
%define dst_regd r10d
%else
%define dst_reg  r1
%define dst_regd r1d
%endif
    mov    dst_regd, dword [r1+r5*4]
    lea     dst_reg, [r0+dst_reg]
    DC_ADD_MMX2_OP mova, dst_reg, r3, r6
    lea     dst_reg, [dst_reg+r3*4]
    DC_ADD_MMX2_OP mova, dst_reg, r3, r6
%ifndef ARCH_X86_64
    mov          r1, r1m
%endif
    add          r5, 4
    add          r2, 128
    cmp          r5, 16
    jl .nextblock

    ADD         rsp, pad
    RET
.no_dc
    mov         r6d, dword [r1+r5*4]
    lea          r6, [r0+r6]
    add   word [r2], 32
    IDCT8_ADD_MMX_START r2  , rsp
    IDCT8_ADD_MMX_START r2+8, rsp+64
    IDCT8_ADD_MMX_END   r6  , rsp,   r3
    mov         r6d, dword [r1+r5*4]
    lea          r6, [r0+r6+4]
    IDCT8_ADD_MMX_END   r6  , rsp+8, r3
.skipblock
    add          r5, 4
    add          r2, 128
    cmp          r5, 16
    jl .nextblock

    ADD         rsp, pad
    RET

INIT_XMM
; ff_h264_idct8_add4_sse2(uint8_t *dst, const int *block_offset,
;                         DCTELEM *block, int stride, const uint8_t nnzc[6*8])
cglobal h264_idct8_add4_sse2, 5, 7, 10
    xor          r5, r5
%ifdef PIC
    lea         r11, [scan8_mem]
%endif
.nextblock
    movzx        r6, byte [scan8+r5]
    movzx        r6, byte [r4+r6]
    test         r6, r6
    jz .skipblock
    cmp          r6, 1
    jnz .no_dc
    movsx        r6, word [r2]
    test         r6, r6
    jz .no_dc
INIT_MMX
    DC_ADD_MMX2_INIT r2, r3, r6
%ifdef ARCH_X86_64
%define dst_reg  r10
%define dst_regd r10d
%else
%define dst_reg  r1
%define dst_regd r1d
%endif
    mov    dst_regd, dword [r1+r5*4]
    lea     dst_reg, [r0+dst_reg]
    DC_ADD_MMX2_OP mova, dst_reg, r3, r6
    lea     dst_reg, [dst_reg+r3*4]
    DC_ADD_MMX2_OP mova, dst_reg, r3, r6
%ifndef ARCH_X86_64
    mov          r1, r1m
%endif
    add          r5, 4
    add          r2, 128
    cmp          r5, 16
    jl .nextblock
    REP_RET
.no_dc
INIT_XMM
    mov    dst_regd, dword [r1+r5*4]
    lea     dst_reg, [r0+dst_reg]
    IDCT8_ADD_SSE dst_reg, r2, r3, r6
%ifndef ARCH_X86_64
    mov          r1, r1m
%endif
.skipblock
    add          r5, 4
    add          r2, 128
    cmp          r5, 16
    jl .nextblock
    REP_RET

INIT_MMX
h264_idct_add8_mmx_plane:
.nextblock
    movzx        r6, byte [scan8+r5]
    movzx        r6, byte [r4+r6]
    or          r6w, word [r2]
    test         r6, r6
    jz .skipblock
%ifdef ARCH_X86_64
    mov         r0d, dword [r1+r5*4]
    add          r0, [r10]
%else
    mov          r0, r1m ; XXX r1m here is actually r0m of the calling func
    mov          r0, [r0]
    add          r0, dword [r1+r5*4]
%endif
    IDCT4_ADD    r0, r2, r3
.skipblock
    inc          r5
    add          r2, 32
    test         r5, 3
    jnz .nextblock
    rep ret

; ff_h264_idct_add8_mmx(uint8_t **dest, const int *block_offset,
;                       DCTELEM *block, int stride, const uint8_t nnzc[6*8])
cglobal h264_idct_add8_mmx, 5, 7, 0
    mov          r5, 16
    add          r2, 512
%ifdef PIC
    lea         r11, [scan8_mem]
%endif
%ifdef ARCH_X86_64
    mov         r10, r0
%endif
    call         h264_idct_add8_mmx_plane
%ifdef ARCH_X86_64
    add         r10, gprsize
%else
    add        r0mp, gprsize
%endif
    call         h264_idct_add8_mmx_plane
    RET
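
; For the chroma (add8) functions, dest is an array of two plane pointers;
; r2 is advanced by 512 = 16*32 bytes past the luma coefficient blocks, the
; plane helper is called once for U and once for V, and the dest pointer
; (r10 on x86-64, the r0 stack slot otherwise) is stepped by gprsize in
; between.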

h264_idct_add8_mmx2_plane:
.nextblock
    movzx        r6, byte [scan8+r5]
    movzx        r6, byte [r4+r6]
    test         r6, r6
    jz .try_dc
%ifdef ARCH_X86_64
    mov         r0d, dword [r1+r5*4]
    add          r0, [r10]
%else
    mov          r0, r1m ; XXX r1m here is actually r0m of the calling func
    mov          r0, [r0]
    add          r0, dword [r1+r5*4]
%endif
    IDCT4_ADD    r0, r2, r3
    inc          r5
    add          r2, 32
    test         r5, 3
    jnz .nextblock
    rep ret
.try_dc
    movsx        r6, word [r2]
    test         r6, r6
    jz .skipblock
    DC_ADD_MMX2_INIT r2, r3, r6
%ifdef ARCH_X86_64
    mov         r0d, dword [r1+r5*4]
    add          r0, [r10]
%else
    mov          r0, r1m ; XXX r1m here is actually r0m of the calling func
    mov          r0, [r0]
    add          r0, dword [r1+r5*4]
%endif
    DC_ADD_MMX2_OP movh, r0, r3, r6
.skipblock
    inc          r5
    add          r2, 32
    test         r5, 3
    jnz .nextblock
    rep ret

; ff_h264_idct_add8_mmx2(uint8_t **dest, const int *block_offset,
;                        DCTELEM *block, int stride, const uint8_t nnzc[6*8])
cglobal h264_idct_add8_mmx2, 5, 7, 0
    mov          r5, 16
    add          r2, 512
%ifdef ARCH_X86_64
    mov         r10, r0
%endif
%ifdef PIC
    lea         r11, [scan8_mem]
%endif
    call h264_idct_add8_mmx2_plane
%ifdef ARCH_X86_64
    add         r10, gprsize
%else
    add        r0mp, gprsize
%endif
    call h264_idct_add8_mmx2_plane
    RET

INIT_MMX
; r0 = uint8_t *dst, r2 = int16_t *block, r3 = int stride, r6 = clobbered
h264_idct_dc_add8_mmx2:
    movd         m0, [r2   ]          ;  0 0 X D
    punpcklwd    m0, [r2+32]          ;  x X d D
    paddsw       m0, [pw_32]
    psraw        m0, 6
    punpcklwd    m0, m0               ;  d d D D
    pxor         m1, m1               ;  0 0 0 0
    psubw        m1, m0               ; -d-d-D-D
    packuswb     m0, m1               ; -d-d-D-D d d D D
    pshufw       m1, m0, 0xFA         ; -d-d-d-d-D-D-D-D
    punpcklwd    m0, m0               ;  d d d d D D D D
    lea          r6, [r3*3]
    DC_ADD_MMX2_OP movq, r0, r3, r6
    ret
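
; This helper applies the DCs of two horizontally adjacent 4x4 blocks at
; once: D = (block[0] + 32) >> 6 fills the left four byte lanes and
; d = (block[32] + 32) >> 6 the right four, so one 8-wide DC_ADD_MMX2_OP
; pass covers an 8x4 pixel area.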

ALIGN 16
INIT_XMM
; r0 = uint8_t *dst (clobbered), r2 = int16_t *block, r3 = int stride
x264_add8x4_idct_sse2:
    movq   m0, [r2+ 0]
    movq   m1, [r2+ 8]
    movq   m2, [r2+16]
    movq   m3, [r2+24]
    movhps m0, [r2+32]
    movhps m1, [r2+40]
    movhps m2, [r2+48]
    movhps m3, [r2+56]
    IDCT4_1D 0,1,2,3,4,5
    TRANSPOSE2x4x4W 0,1,2,3,4
    paddw m0, [pw_32]
    IDCT4_1D 0,1,2,3,4,5
    pxor  m7, m7
    STORE_DIFFx2 m0, m1, m4, m5, m7, 6, r0, r3
    lea   r0, [r0+r3*2]
    STORE_DIFFx2 m2, m3, m4, m5, m7, 6, r0, r3
    ret
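
; x264_add8x4_idct_sse2 transforms two horizontally adjacent 4x4 blocks per
; call: row i of the first block is loaded into the low half of xmm i
; (movq) and row i of the second into the high half (movhps), so one pass
; over four XMM registers covers an 8x4 pixel area.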

%macro add16_sse2_cycle 2
    movzx       r0, word [r4+%2]
    test        r0, r0
    jz .cycle%1end
    mov        r0d, dword [r1+%1*8]
%ifdef ARCH_X86_64
    add         r0, r10
%else
    add         r0, r0m
%endif
    call        x264_add8x4_idct_sse2
.cycle%1end
%if %1 < 7
    add         r2, 64
%endif
%endmacro
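
; The word load from [r4+%2] tests the nnzc bytes of both blocks of a pair
; at once: the %2 constants passed below (0xc, 0x14, 0xe, ...) equal
; scan8[2*%1], the cache offset of the even block, whose neighbour
; scan8[2*%1]+1 is read as the high byte.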

; ff_h264_idct_add16_sse2(uint8_t *dst, const int *block_offset,
;                         DCTELEM *block, int stride, const uint8_t nnzc[6*8])
cglobal h264_idct_add16_sse2, 5, 5, 8
%ifdef ARCH_X86_64
    mov        r10, r0
%endif
    ; unrolling of the loop leads to an average performance gain of 20-25%
    add16_sse2_cycle 0, 0xc
    add16_sse2_cycle 1, 0x14
    add16_sse2_cycle 2, 0xe
    add16_sse2_cycle 3, 0x16
    add16_sse2_cycle 4, 0x1c
    add16_sse2_cycle 5, 0x24
    add16_sse2_cycle 6, 0x1e
    add16_sse2_cycle 7, 0x26
    RET

%macro add16intra_sse2_cycle 2
    movzx       r0, word [r4+%2]
    test        r0, r0
    jz .try%1dc
    mov        r0d, dword [r1+%1*8]
%ifdef ARCH_X86_64
    add         r0, r10
%else
    add         r0, r0m
%endif
    call        x264_add8x4_idct_sse2
    jmp .cycle%1end
.try%1dc
    movsx       r0, word [r2   ]
    or         r0w, word [r2+32]
    jz .cycle%1end
    mov        r0d, dword [r1+%1*8]
%ifdef ARCH_X86_64
    add         r0, r10
%else
    add         r0, r0m
%endif
    call        h264_idct_dc_add8_mmx2
.cycle%1end
%if %1 < 7
    add         r2, 64
%endif
%endmacro

; ff_h264_idct_add16intra_sse2(uint8_t *dst, const int *block_offset,
;                              DCTELEM *block, int stride, const uint8_t nnzc[6*8])
cglobal h264_idct_add16intra_sse2, 5, 7, 8
%ifdef ARCH_X86_64
    mov        r10, r0
%endif
    add16intra_sse2_cycle 0, 0xc
    add16intra_sse2_cycle 1, 0x14
    add16intra_sse2_cycle 2, 0xe
    add16intra_sse2_cycle 3, 0x16
    add16intra_sse2_cycle 4, 0x1c
    add16intra_sse2_cycle 5, 0x24
    add16intra_sse2_cycle 6, 0x1e
    add16intra_sse2_cycle 7, 0x26
    RET

%macro add8_sse2_cycle 2
    movzx       r0, word [r4+%2]
    test        r0, r0
    jz .try%1dc
%ifdef ARCH_X86_64
    mov        r0d, dword [r1+%1*8+64]
    add         r0, [r10]
%else
    mov         r0, r0m
    mov         r0, [r0]
    add         r0, dword [r1+%1*8+64]
%endif
    call        x264_add8x4_idct_sse2
    jmp .cycle%1end
.try%1dc
    movsx       r0, word [r2   ]
    or         r0w, word [r2+32]
    jz .cycle%1end
%ifdef ARCH_X86_64
    mov        r0d, dword [r1+%1*8+64]
    add         r0, [r10]
%else
    mov         r0, r0m
    mov         r0, [r0]
    add         r0, dword [r1+%1*8+64]
%endif
    call        h264_idct_dc_add8_mmx2
.cycle%1end
%if %1 < 3
    add         r2, 64
%endif
%endmacro

; ff_h264_idct_add8_sse2(uint8_t **dest, const int *block_offset,
;                        DCTELEM *block, int stride, const uint8_t nnzc[6*8])
cglobal h264_idct_add8_sse2, 5, 7, 8
    add          r2, 512
%ifdef ARCH_X86_64
    mov         r10, r0
%endif
    add8_sse2_cycle 0, 0x09
    add8_sse2_cycle 1, 0x11
%ifdef ARCH_X86_64
    add         r10, gprsize
%else
    add        r0mp, gprsize
%endif
    add8_sse2_cycle 2, 0x21
    add8_sse2_cycle 3, 0x29
    RET