;*****************************************************************************
;* MMX/SSE2-optimized H.264 iDCT
;*****************************************************************************
;* Copyright (C) 2004-2005 Michael Niedermayer, Loren Merritt
;* Copyright (C) 2003-2008 x264 project
;*
;* Authors: Laurent Aimar <fenrir@via.ecp.fr>
;*          Loren Merritt <lorenm@u.washington.edu>
;*          Holger Lubitz <hal@duncan.ol.sub.de>
;*          Min Chen <chenm001@163.com>
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;*****************************************************************************

%include "x86inc.asm"
%include "x86util.asm"

SECTION_RODATA

; FIXME this table is a duplicate from h264data.h, and will be removed once the tables from h264 have been split
scan8_mem: db 4+1*8, 5+1*8, 4+2*8, 5+2*8
           db 6+1*8, 7+1*8, 6+2*8, 7+2*8
           db 4+3*8, 5+3*8, 4+4*8, 5+4*8
           db 6+3*8, 7+3*8, 6+4*8, 7+4*8
           db 1+1*8, 2+1*8
           db 1+2*8, 2+2*8
           db 1+4*8, 2+4*8
           db 1+5*8, 2+5*8
%ifdef PIC
%define scan8 r11
%else
%define scan8 scan8_mem
%endif
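
; scan8 maps a 4x4-block index (0-15 luma, 16-23 chroma) to its byte
; offset in the decoder's 8-entry-per-row non_zero_count cache; the
; nnzc[6*8] arrays taken by the add16/add8 functions below use the same
; layout, so [r4+scan8[i]] is block i's non-zero-coefficient count.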

cextern pw_32
cextern pw_1

SECTION .text

; %1=uint8_t *dst, %2=int16_t *block, %3=int stride
%macro IDCT4_ADD 3
    ; Load dct coeffs
    movq         m0, [%2]
    movq         m1, [%2+8]
    movq         m2, [%2+16]
    movq         m3, [%2+24]

    IDCT4_1D      0, 1, 2, 3, 4, 5
    mova         m6, [pw_32]
    TRANSPOSE4x4W 0, 1, 2, 3, 4
    paddw        m0, m6
    IDCT4_1D      0, 1, 2, 3, 4, 5
    pxor         m7, m7

    STORE_DIFFx2 m0, m1, m4, m5, m7, 6, %1, %3
    lea          %1, [%1+%3*2]
    STORE_DIFFx2 m2, m3, m4, m5, m7, 6, %1, %3
%endmacro
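
; IDCT4_1D (from x86util.asm) is one pass of the standard H.264 4x4
; integer inverse transform; per row/column with coefficients a,b,c,d:
;   z0 = a + c          z1 = a - c
;   z2 = (b >> 1) - d   z3 = b + (d >> 1)
;   out = { z0+z3, z1+z2, z1-z2, z0-z3 }
; The pw_32 added between the two passes is the bias for the final
; (x + 32) >> 6 rounding done by STORE_DIFFx2.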

INIT_MMX
; ff_h264_idct_add_mmx(uint8_t *dst, int16_t *block, int stride)
cglobal h264_idct_add_mmx, 3, 3, 0
    IDCT4_ADD    r0, r1, r2
    RET

%macro IDCT8_1D 2
    mova         m4, m5
    mova         m0, m1
    psraw        m4, 1
    psraw        m1, 1
    paddw        m4, m5
    paddw        m1, m0
    paddw        m4, m7
    paddw        m1, m5
    psubw        m4, m0
    paddw        m1, m3

    psubw        m0, m3
    psubw        m5, m3
    paddw        m0, m7
    psubw        m5, m7
    psraw        m3, 1
    psraw        m7, 1
    psubw        m0, m3
    psubw        m5, m7

    mova         m3, m4
    mova         m7, m1
    psraw        m1, 2
    psraw        m3, 2
    paddw        m3, m0
    psraw        m0, 2
    paddw        m1, m5
    psraw        m5, 2
    psubw        m0, m4
    psubw        m7, m5

    mova         m4, m2
    mova         m5, m6
    psraw        m4, 1
    psraw        m6, 1
    psubw        m4, m5
    paddw        m6, m2

    mova         m2, %1
    mova         m5, %2
    SUMSUB_BA    m5, m2
    SUMSUB_BA    m6, m5
    SUMSUB_BA    m4, m2
    SUMSUB_BA    m7, m6
    SUMSUB_BA    m0, m4
    SUMSUB_BA    m3, m2
    SUMSUB_BA    m1, m5
    SWAP          7, 6, 4, 5, 2, 3, 1, 0 ; 70315246 -> 01234567
%endmacro
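
; IDCT8_1D computes the odd part of the 8-point transform from
; coefficients 1/3/5/7 (m1/m3/m5/m7), the even part from 2/6 (m2/m6)
; and 0/4 (passed as %1/%2), then merges the two halves with SUMSUB_BA
; butterflies; the final SWAP renames the registers back into natural
; order (hence the 70315246 -> 01234567 comment).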

%macro IDCT8_1D_FULL 1
    mova         m7, [%1+112]
    mova         m6, [%1+ 96]
    mova         m5, [%1+ 80]
    mova         m3, [%1+ 48]
    mova         m2, [%1+ 32]
    mova         m1, [%1+ 16]
    IDCT8_1D   [%1], [%1+ 64]
%endmacro

; %1=int16_t *block, %2=int16_t *dstblock
%macro IDCT8_ADD_MMX_START 2
    IDCT8_1D_FULL %1
    mova       [%1], m7
    TRANSPOSE4x4W 0, 1, 2, 3, 7
    mova         m7, [%1]
    mova    [%2   ], m0
    mova    [%2+16], m1
    mova    [%2+32], m2
    mova    [%2+48], m3
    TRANSPOSE4x4W 4, 5, 6, 7, 3
    mova    [%2+ 8], m4
    mova    [%2+24], m5
    mova    [%2+40], m6
    mova    [%2+56], m7
%endmacro

; %1=uint8_t *dst, %2=int16_t *block, %3=int stride
%macro IDCT8_ADD_MMX_END 3
    IDCT8_1D_FULL %2
    mova    [%2   ], m5
    mova    [%2+16], m6
    mova    [%2+32], m7

    pxor         m7, m7
    STORE_DIFFx2 m0, m1, m5, m6, m7, 6, %1, %3
    lea          %1, [%1+%3*2]
    STORE_DIFFx2 m2, m3, m5, m6, m7, 6, %1, %3
    mova         m0, [%2   ]
    mova         m1, [%2+16]
    mova         m2, [%2+32]
    lea          %1, [%1+%3*2]
    STORE_DIFFx2 m4, m0, m5, m6, m7, 6, %1, %3
    lea          %1, [%1+%3*2]
    STORE_DIFFx2 m1, m2, m5, m6, m7, 6, %1, %3
%endmacro

INIT_MMX
; ff_h264_idct8_add_mmx(uint8_t *dst, int16_t *block, int stride)
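; Only four coefficients fit in an mmx register, so the 8x8 block is
; processed as two 4-column halves: both first-pass halves go through a
; 128-byte scratch area on the stack (pad is sized to keep the scratch
; 8-byte aligned), and the second pass adds the results at dst and
; dst+4. The +32 rounding bias is folded into the DC coefficient up
; front (add word [r1], 32).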
cglobal h264_idct8_add_mmx, 3, 4, 0
    %assign pad 128+4-(stack_offset&7)
    SUB         rsp, pad

    add   word [r1], 32
    IDCT8_ADD_MMX_START r1  , rsp
    IDCT8_ADD_MMX_START r1+8, rsp+64
    lea          r3, [r0+4]
    IDCT8_ADD_MMX_END   r0  , rsp,   r2
    IDCT8_ADD_MMX_END   r3  , rsp+8, r2

    ADD         rsp, pad
    RET

; %1=uint8_t *dst, %2=int16_t *block, %3=int stride
%macro IDCT8_ADD_SSE 4
    IDCT8_1D_FULL %2
%ifdef ARCH_X86_64
    TRANSPOSE8x8W 0, 1, 2, 3, 4, 5, 6, 7, 8
%else
    TRANSPOSE8x8W 0, 1, 2, 3, 4, 5, 6, 7, [%2], [%2+16]
%endif
    paddw        m0, [pw_32]

%ifndef ARCH_X86_64
    mova    [%2   ], m0
    mova    [%2+16], m4
    IDCT8_1D   [%2], [%2+ 16]
    mova    [%2   ], m6
    mova    [%2+16], m7
%else
    SWAP          0, 8
    SWAP          4, 9
    IDCT8_1D     m8, m9
    SWAP          6, 8
    SWAP          7, 9
%endif

    pxor         m7, m7
    lea          %4, [%3*3]
    STORE_DIFF   m0, m6, m7, [%1     ]
    STORE_DIFF   m1, m6, m7, [%1+%3  ]
    STORE_DIFF   m2, m6, m7, [%1+%3*2]
    STORE_DIFF   m3, m6, m7, [%1+%4  ]
%ifndef ARCH_X86_64
    mova         m0, [%2   ]
    mova         m1, [%2+16]
%else
    SWAP          0, 8
    SWAP          1, 9
%endif
    lea          %1, [%1+%3*4]
    STORE_DIFF   m4, m6, m7, [%1     ]
    STORE_DIFF   m5, m6, m7, [%1+%3  ]
    STORE_DIFF   m0, m6, m7, [%1+%3*2]
    STORE_DIFF   m1, m6, m7, [%1+%4  ]
%endmacro

INIT_XMM
; ff_h264_idct8_add_sse2(uint8_t *dst, int16_t *block, int stride)
cglobal h264_idct8_add_sse2, 3, 4, 10
    IDCT8_ADD_SSE r0, r1, r2, r3
    RET

%macro DC_ADD_MMX2_INIT 2-3
%if %0 == 2
    movsx        %1, word [%1]
    add          %1, 32
    sar          %1, 6
    movd         m0, %1d
    lea          %1, [%2*3]
%else
    add          %3, 32
    sar          %3, 6
    movd         m0, %3d
    lea          %3, [%2*3]
%endif
    pshufw       m0, m0, 0
    pxor         m1, m1
    psubw        m1, m0
    packuswb     m0, m0
    packuswb     m1, m1
%endmacro

%macro DC_ADD_MMX2_OP 3-4
    %1           m2, [%2     ]
    %1           m3, [%2+%3  ]
    %1           m4, [%2+%3*2]
    %1           m5, [%2+%4  ]
    paddusb      m2, m0
    paddusb      m3, m0
    paddusb      m4, m0
    paddusb      m5, m0
    psubusb      m2, m1
    psubusb      m3, m1
    psubusb      m4, m1
    psubusb      m5, m1
    %1    [%2     ], m2
    %1    [%2+%3  ], m3
    %1    [%2+%3*2], m4
    %1    [%2+%4  ], m5
%endmacro
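
; The DC value (block[0] + 32) >> 6 computed by DC_ADD_MMX2_INIT may be
; negative, and MMX only has unsigned saturating byte ops, so it is
; split: m0 holds packuswb(dc) = max(dc,0) and m1 holds
; packuswb(-dc) = max(-dc,0) in every byte; paddusb m0 followed by
; psubusb m1 in DC_ADD_MMX2_OP then applies the signed offset with
; correct clamping to [0,255].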

INIT_MMX
; ff_h264_idct_dc_add_mmx2(uint8_t *dst, int16_t *block, int stride)
cglobal h264_idct_dc_add_mmx2, 3, 3, 0
    DC_ADD_MMX2_INIT r1, r2
    DC_ADD_MMX2_OP movh, r0, r2, r1
    RET

; ff_h264_idct8_dc_add_mmx2(uint8_t *dst, int16_t *block, int stride)
cglobal h264_idct8_dc_add_mmx2, 3, 3, 0
    DC_ADD_MMX2_INIT r1, r2
    DC_ADD_MMX2_OP mova, r0, r2, r1
    lea          r0, [r0+r2*4]
    DC_ADD_MMX2_OP mova, r0, r2, r1
    RET

; ff_h264_idct_add16_mmx(uint8_t *dst, const int *block_offset,
;                        DCTELEM *block, int stride, const uint8_t nnzc[6*8])
cglobal h264_idct_add16_mmx, 5, 7, 0
    xor          r5, r5
%ifdef PIC
    lea         r11, [scan8_mem]
%endif
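    ; loop over the 16 luma 4x4 blocks: scan8 turns the block index in
    ; r5 into an nnzc cache offset, and blocks whose non-zero count is
    ; 0 are skipped without touching dst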
.nextblock
    movzx        r6, byte [scan8+r5]
    movzx        r6, byte [r4+r6]
    test         r6, r6
    jz .skipblock
    mov         r6d, dword [r1+r5*4]
    lea          r6, [r0+r6]
    IDCT4_ADD    r6, r2, r3
.skipblock
    inc          r5
    add          r2, 32
    cmp          r5, 16
    jl .nextblock
    REP_RET

; ff_h264_idct8_add4_mmx(uint8_t *dst, const int *block_offset,
;                        DCTELEM *block, int stride, const uint8_t nnzc[6*8])
cglobal h264_idct8_add4_mmx, 5, 7, 0
    %assign pad 128+4-(stack_offset&7)
    SUB         rsp, pad

    xor          r5, r5
%ifdef PIC
    lea         r11, [scan8_mem]
%endif
.nextblock
    movzx        r6, byte [scan8+r5]
    movzx        r6, byte [r4+r6]
    test         r6, r6
    jz .skipblock
    mov         r6d, dword [r1+r5*4]
    lea          r6, [r0+r6]
    add   word [r2], 32
    IDCT8_ADD_MMX_START r2  , rsp
    IDCT8_ADD_MMX_START r2+8, rsp+64
    IDCT8_ADD_MMX_END   r6  , rsp,   r3
    mov         r6d, dword [r1+r5*4]
    lea          r6, [r0+r6+4]
    IDCT8_ADD_MMX_END   r6  , rsp+8, r3
.skipblock
    add          r5, 4
    add          r2, 128
    cmp          r5, 16
    jl .nextblock
    ADD         rsp, pad
    RET

; ff_h264_idct_add16_mmx2(uint8_t *dst, const int *block_offset,
;                         DCTELEM *block, int stride, const uint8_t nnzc[6*8])
cglobal h264_idct_add16_mmx2, 5, 7, 0
    xor          r5, r5
%ifdef PIC
    lea         r11, [scan8_mem]
%endif
.nextblock
    movzx        r6, byte [scan8+r5]
    movzx        r6, byte [r4+r6]
    test         r6, r6
    jz .skipblock
    cmp          r6, 1
    jnz .no_dc
    movsx        r6, word [r2]
    test         r6, r6
    jz .no_dc
    DC_ADD_MMX2_INIT r2, r3, r6
%ifdef ARCH_X86_64
%define dst_reg  r10
%define dst_regd r10d
%else
%define dst_reg  r1
%define dst_regd r1d
%endif
    mov    dst_regd, dword [r1+r5*4]
    lea     dst_reg, [r0+dst_reg]
    DC_ADD_MMX2_OP movh, dst_reg, r3, r6
%ifndef ARCH_X86_64
    mov          r1, r1m
%endif
    inc          r5
    add          r2, 32
    cmp          r5, 16
    jl .nextblock
    REP_RET
.no_dc
    mov         r6d, dword [r1+r5*4]
    lea          r6, [r0+r6]
    IDCT4_ADD    r6, r2, r3
.skipblock
    inc          r5
    add          r2, 32
    cmp          r5, 16
    jl .nextblock
    REP_RET

; ff_h264_idct_add16intra_mmx(uint8_t *dst, const int *block_offset,
;                             DCTELEM *block, int stride, const uint8_t nnzc[6*8])
cglobal h264_idct_add16intra_mmx, 5, 7, 0
    xor          r5, r5
%ifdef PIC
    lea         r11, [scan8_mem]
%endif
.nextblock
    movzx        r6, byte [scan8+r5]
    movzx        r6, byte [r4+r6]
    or          r6w, word [r2]
    test         r6, r6
    jz .skipblock
    mov         r6d, dword [r1+r5*4]
    lea          r6, [r0+r6]
    IDCT4_ADD    r6, r2, r3
.skipblock
    inc          r5
    add          r2, 32
    cmp          r5, 16
    jl .nextblock
    REP_RET

; ff_h264_idct_add16intra_mmx2(uint8_t *dst, const int *block_offset,
;                              DCTELEM *block, int stride, const uint8_t nnzc[6*8])
cglobal h264_idct_add16intra_mmx2, 5, 7, 0
    xor          r5, r5
%ifdef PIC
    lea         r11, [scan8_mem]
%endif
.nextblock
    movzx        r6, byte [scan8+r5]
    movzx        r6, byte [r4+r6]
    test         r6, r6
    jz .try_dc
    mov         r6d, dword [r1+r5*4]
    lea          r6, [r0+r6]
    IDCT4_ADD    r6, r2, r3
    inc          r5
    add          r2, 32
    cmp          r5, 16
    jl .nextblock
    REP_RET
.try_dc
    movsx        r6, word [r2]
    test         r6, r6
    jz .skipblock
    DC_ADD_MMX2_INIT r2, r3, r6
%ifdef ARCH_X86_64
%define dst_reg  r10
%define dst_regd r10d
%else
%define dst_reg  r1
%define dst_regd r1d
%endif
    mov    dst_regd, dword [r1+r5*4]
    lea     dst_reg, [r0+dst_reg]
    DC_ADD_MMX2_OP movh, dst_reg, r3, r6
%ifndef ARCH_X86_64
    mov          r1, r1m
%endif
.skipblock
    inc          r5
    add          r2, 32
    cmp          r5, 16
    jl .nextblock
    REP_RET

; ff_h264_idct8_add4_mmx2(uint8_t *dst, const int *block_offset,
;                         DCTELEM *block, int stride, const uint8_t nnzc[6*8])
cglobal h264_idct8_add4_mmx2, 5, 7, 0
    %assign pad 128+4-(stack_offset&7)
    SUB         rsp, pad

    xor          r5, r5
%ifdef PIC
    lea         r11, [scan8_mem]
%endif
.nextblock
    movzx        r6, byte [scan8+r5]
    movzx        r6, byte [r4+r6]
    test         r6, r6
    jz .skipblock
    cmp          r6, 1
    jnz .no_dc
    movsx        r6, word [r2]
    test         r6, r6
    jz .no_dc
    DC_ADD_MMX2_INIT r2, r3, r6
%ifdef ARCH_X86_64
%define dst_reg  r10
%define dst_regd r10d
%else
%define dst_reg  r1
%define dst_regd r1d
%endif
    mov    dst_regd, dword [r1+r5*4]
    lea     dst_reg, [r0+dst_reg]
    DC_ADD_MMX2_OP mova, dst_reg, r3, r6
    lea     dst_reg, [dst_reg+r3*4]
    DC_ADD_MMX2_OP mova, dst_reg, r3, r6
%ifndef ARCH_X86_64
    mov          r1, r1m
%endif
    add          r5, 4
    add          r2, 128
    cmp          r5, 16
    jl .nextblock

    ADD         rsp, pad
    RET
.no_dc
    mov         r6d, dword [r1+r5*4]
    lea          r6, [r0+r6]
    add   word [r2], 32
    IDCT8_ADD_MMX_START r2  , rsp
    IDCT8_ADD_MMX_START r2+8, rsp+64
    IDCT8_ADD_MMX_END   r6  , rsp,   r3
    mov         r6d, dword [r1+r5*4]
    lea          r6, [r0+r6+4]
    IDCT8_ADD_MMX_END   r6  , rsp+8, r3
.skipblock
    add          r5, 4
    add          r2, 128
    cmp          r5, 16
    jl .nextblock

    ADD         rsp, pad
    RET

INIT_XMM
; ff_h264_idct8_add4_sse2(uint8_t *dst, const int *block_offset,
;                         DCTELEM *block, int stride, const uint8_t nnzc[6*8])
cglobal h264_idct8_add4_sse2, 5, 7, 10
    xor          r5, r5
%ifdef PIC
    lea         r11, [scan8_mem]
%endif
.nextblock
    movzx        r6, byte [scan8+r5]
    movzx        r6, byte [r4+r6]
    test         r6, r6
    jz .skipblock
    cmp          r6, 1
    jnz .no_dc
    movsx        r6, word [r2]
    test         r6, r6
    jz .no_dc
INIT_MMX
    DC_ADD_MMX2_INIT r2, r3, r6
%ifdef ARCH_X86_64
%define dst_reg  r10
%define dst_regd r10d
%else
%define dst_reg  r1
%define dst_regd r1d
%endif
    mov    dst_regd, dword [r1+r5*4]
    lea     dst_reg, [r0+dst_reg]
    DC_ADD_MMX2_OP mova, dst_reg, r3, r6
    lea     dst_reg, [dst_reg+r3*4]
    DC_ADD_MMX2_OP mova, dst_reg, r3, r6
%ifndef ARCH_X86_64
    mov          r1, r1m
%endif
    add          r5, 4
    add          r2, 128
    cmp          r5, 16
    jl .nextblock
    REP_RET
.no_dc
INIT_XMM
    mov    dst_regd, dword [r1+r5*4]
    lea     dst_reg, [r0+dst_reg]
    IDCT8_ADD_SSE dst_reg, r2, r3, r6
%ifndef ARCH_X86_64
    mov          r1, r1m
%endif
.skipblock
    add          r5, 4
    add          r2, 128
    cmp          r5, 16
    jl .nextblock
    REP_RET

INIT_MMX
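; Adds the four 4x4 blocks of one chroma plane (scan8 indices 16-23,
; coefficients at block+512 onwards); the callers below invoke it twice,
; advancing the uint8_t **dest pointer (r10 on x86-64, r0m on x86-32)
; between the U and V planes.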
h264_idct_add8_mmx_plane:
.nextblock
    movzx        r6, byte [scan8+r5]
    movzx        r6, byte [r4+r6]
    or          r6w, word [r2]
    test         r6, r6
    jz .skipblock
%ifdef ARCH_X86_64
    mov         r0d, dword [r1+r5*4]
    add          r0, [r10]
%else
    mov          r0, r1m ; XXX r1m here is actually r0m of the calling func
    mov          r0, [r0]
    add          r0, dword [r1+r5*4]
%endif
    IDCT4_ADD    r0, r2, r3
.skipblock
    inc          r5
    add          r2, 32
    test         r5, 3
    jnz .nextblock
    rep ret

; ff_h264_idct_add8_mmx(uint8_t **dest, const int *block_offset,
;                       DCTELEM *block, int stride, const uint8_t nnzc[6*8])
cglobal h264_idct_add8_mmx, 5, 7, 0
    mov          r5, 16
    add          r2, 512
%ifdef PIC
    lea         r11, [scan8_mem]
%endif
%ifdef ARCH_X86_64
    mov         r10, r0
%endif
    call         h264_idct_add8_mmx_plane
%ifdef ARCH_X86_64
    add         r10, gprsize
%else
    add        r0mp, gprsize
%endif
    call         h264_idct_add8_mmx_plane
    RET

h264_idct_add8_mmx2_plane:
.nextblock
    movzx        r6, byte [scan8+r5]
    movzx        r6, byte [r4+r6]
    test         r6, r6
    jz .try_dc
%ifdef ARCH_X86_64
    mov         r0d, dword [r1+r5*4]
    add          r0, [r10]
%else
    mov          r0, r1m ; XXX r1m here is actually r0m of the calling func
    mov          r0, [r0]
    add          r0, dword [r1+r5*4]
%endif
    IDCT4_ADD    r0, r2, r3
    inc          r5
    add          r2, 32
    test         r5, 3
    jnz .nextblock
    rep ret
.try_dc
    movsx        r6, word [r2]
    test         r6, r6
    jz .skipblock
    DC_ADD_MMX2_INIT r2, r3, r6
%ifdef ARCH_X86_64
    mov         r0d, dword [r1+r5*4]
    add          r0, [r10]
%else
    mov          r0, r1m ; XXX r1m here is actually r0m of the calling func
    mov          r0, [r0]
    add          r0, dword [r1+r5*4]
%endif
    DC_ADD_MMX2_OP movh, r0, r3, r6
.skipblock
    inc          r5
    add          r2, 32
    test         r5, 3
    jnz .nextblock
    rep ret

; ff_h264_idct_add8_mmx2(uint8_t **dest, const int *block_offset,
;                        DCTELEM *block, int stride, const uint8_t nnzc[6*8])
cglobal h264_idct_add8_mmx2, 5, 7, 0
    mov          r5, 16
    add          r2, 512
%ifdef ARCH_X86_64
    mov         r10, r0
%endif
%ifdef PIC
    lea         r11, [scan8_mem]
%endif
    call h264_idct_add8_mmx2_plane
%ifdef ARCH_X86_64
    add         r10, gprsize
%else
    add        r0mp, gprsize
%endif
    call h264_idct_add8_mmx2_plane
    RET

INIT_MMX
; r0 = uint8_t *dst, r2 = int16_t *block, r3 = int stride, r6=clobbered
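; DC-only shortcut for two horizontally adjacent 4x4 blocks at once:
; the DCs at [r2] and [r2+32] are rounded, then spread so the low 4
; bytes of m0/m1 carry the first block's offset and the high 4 bytes
; the second's, and applied to an 8x4 area with the usual
; paddusb/psubusb clamping.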
h264_idct_dc_add8_mmx2:
    movd         m0, [r2   ]          ;  0 0 X D
    punpcklwd    m0, [r2+32]          ;  x X d D
    paddsw       m0, [pw_32]
    psraw        m0, 6
    punpcklwd    m0, m0               ;  d d D D
    pxor         m1, m1               ;  0 0 0 0
    psubw        m1, m0               ; -d-d-D-D
    packuswb     m0, m1               ; -d-d-D-D d d D D
    pshufw       m1, m0, 0xFA         ; -d-d-d-d-D-D-D-D
    punpcklwd    m0, m0               ;  d d d d D D D D
    lea          r6, [r3*3]
    DC_ADD_MMX2_OP movq, r0, r3, r6
    ret

ALIGN 16
INIT_XMM
; r0 = uint8_t *dst (clobbered), r2 = int16_t *block, r3 = int stride
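; Two 4x4 IDCTs at once: the coefficient rows of two consecutive blocks
; are packed into the low/high halves of each xmm register (movq +
; movhps), so a single SSE2 pass covers an 8x4 area; TRANSPOSE2x4x4W
; transposes the two 4x4 halves independently.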
x264_add8x4_idct_sse2:
    movq   m0, [r2+ 0]
    movq   m1, [r2+ 8]
    movq   m2, [r2+16]
    movq   m3, [r2+24]
    movhps m0, [r2+32]
    movhps m1, [r2+40]
    movhps m2, [r2+48]
    movhps m3, [r2+56]
    IDCT4_1D 0,1,2,3,4,5
    TRANSPOSE2x4x4W 0,1,2,3,4
    paddw m0, [pw_32]
    IDCT4_1D 0,1,2,3,4,5
    pxor  m7, m7
    STORE_DIFFx2 m0, m1, m4, m5, m7, 6, r0, r3
    lea   r0, [r0+r3*2]
    STORE_DIFFx2 m2, m3, m4, m5, m7, 6, r0, r3
    ret

%macro add16_sse2_cycle 2
    movzx       r0, word [r4+%2]
    test        r0, r0
    jz .cycle%1end
    mov        r0d, dword [r1+%1*8]
%ifdef ARCH_X86_64
    add         r0, r10
%else
    add         r0, r0m
%endif
    call        x264_add8x4_idct_sse2
.cycle%1end
%if %1 < 7
    add         r2, 64
%endif
%endmacro
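
; The offsets passed below (0xc, 0x14, ...) are scan8[] values for
; blocks 0,2,4,...,14, so each word load in the cycle macro tests the
; nnzc entries of two horizontally adjacent 4x4 blocks at once,
; matching the 8x4 area handled by x264_add8x4_idct_sse2.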

; ff_h264_idct_add16_sse2(uint8_t *dst, const int *block_offset,
;                         DCTELEM *block, int stride, const uint8_t nnzc[6*8])
cglobal h264_idct_add16_sse2, 5, 5, 8
%ifdef ARCH_X86_64
    mov        r10, r0
%endif
    ; unrolling of the loop leads to an average performance gain of
    ; 20-25%
    add16_sse2_cycle 0, 0xc
    add16_sse2_cycle 1, 0x14
    add16_sse2_cycle 2, 0xe
    add16_sse2_cycle 3, 0x16
    add16_sse2_cycle 4, 0x1c
    add16_sse2_cycle 5, 0x24
    add16_sse2_cycle 6, 0x1e
    add16_sse2_cycle 7, 0x26
    RET

%macro add16intra_sse2_cycle 2
    movzx       r0, word [r4+%2]
    test        r0, r0
    jz .try%1dc
    mov        r0d, dword [r1+%1*8]
%ifdef ARCH_X86_64
    add         r0, r10
%else
    add         r0, r0m
%endif
    call        x264_add8x4_idct_sse2
    jmp .cycle%1end
.try%1dc
    movsx       r0, word [r2   ]
    or         r0w, word [r2+32]
    jz .cycle%1end
    mov        r0d, dword [r1+%1*8]
%ifdef ARCH_X86_64
    add         r0, r10
%else
    add         r0, r0m
%endif
    call        h264_idct_dc_add8_mmx2
.cycle%1end
%if %1 < 7
    add         r2, 64
%endif
%endmacro

; ff_h264_idct_add16intra_sse2(uint8_t *dst, const int *block_offset,
;                              DCTELEM *block, int stride, const uint8_t nnzc[6*8])
cglobal h264_idct_add16intra_sse2, 5, 7, 8
%ifdef ARCH_X86_64
    mov        r10, r0
%endif
    add16intra_sse2_cycle 0, 0xc
    add16intra_sse2_cycle 1, 0x14
    add16intra_sse2_cycle 2, 0xe
    add16intra_sse2_cycle 3, 0x16
    add16intra_sse2_cycle 4, 0x1c
    add16intra_sse2_cycle 5, 0x24
    add16intra_sse2_cycle 6, 0x1e
    add16intra_sse2_cycle 7, 0x26
    RET

%macro add8_sse2_cycle 2
    movzx       r0, word [r4+%2]
    test        r0, r0
    jz .try%1dc
%ifdef ARCH_X86_64
    mov        r0d, dword [r1+%1*8+64]
    add         r0, [r10]
%else
    mov         r0, r0m
    mov         r0, [r0]
    add         r0, dword [r1+%1*8+64]
%endif
    call        x264_add8x4_idct_sse2
    jmp .cycle%1end
.try%1dc
    movsx       r0, word [r2   ]
    or         r0w, word [r2+32]
    jz .cycle%1end
%ifdef ARCH_X86_64
    mov        r0d, dword [r1+%1*8+64]
    add         r0, [r10]
%else
    mov         r0, r0m
    mov         r0, [r0]
    add         r0, dword [r1+%1*8+64]
%endif
    call        h264_idct_dc_add8_mmx2
.cycle%1end
%if %1 < 3
    add         r2, 64
%endif
%endmacro

; ff_h264_idct_add8_sse2(uint8_t **dest, const int *block_offset,
;                        DCTELEM *block, int stride, const uint8_t nnzc[6*8])
cglobal h264_idct_add8_sse2, 5, 7, 8
    add          r2, 512
%ifdef ARCH_X86_64
    mov         r10, r0
%endif
    add8_sse2_cycle 0, 0x09
    add8_sse2_cycle 1, 0x11
%ifdef ARCH_X86_64
    add         r10, gprsize
%else
    add        r0mp, gprsize
%endif
    add8_sse2_cycle 2, 0x21
    add8_sse2_cycle 3, 0x29
    RET

;void ff_h264_luma_dc_dequant_idct_mmx(DCTELEM *output, DCTELEM *input, int qmul)

%macro WALSH4_1D 5
    SUMSUB_BADC m%4, m%3, m%2, m%1, m%5
    SUMSUB_BADC m%4, m%2, m%3, m%1, m%5
    SWAP %1, %4, %3
%endmacro
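
; WALSH4_1D is one pass of the 4-point Hadamard transform used for the
; 4x4 luma DC block; applied once per dimension around a transpose it
; inverts the forward DC transform (up to the qmul scaling applied
; afterwards).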

%macro DEQUANT_MMX 3
    mova        m7, [pw_1]
    mova        m4, %1
    punpcklwd   %1, m7
    punpckhwd   m4, m7
    mova        m5, %2
    punpcklwd   %2, m7
    punpckhwd   m5, m7
    movd        m7, t3d
    punpckldq   m7, m7
    pmaddwd     %1, m7
    pmaddwd     %2, m7
    pmaddwd     m4, m7
    pmaddwd     m5, m7
    psrad       %1, %3
    psrad       %2, %3
    psrad       m4, %3
    psrad       m5, %3
    packssdw    %1, m4
    packssdw    %2, m5
%endmacro
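
; t3d arrives as qmul + (128 << 16): interleaving each coefficient with
; a 1 from pw_1 lets one pmaddwd per dword compute coef*qmul + 1*128,
; so the psrad (by 8 in the normal path) produces the rounded
; (coef * qmul + 128) >> 8 of the C reference.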

%macro STORE_WORDS_MMX 5
    movd  t0d, %1
    psrlq  %1, 32
    movd  t1d, %1
    mov [t2+%2*32], t0w
    mov [t2+%4*32], t1w
    shr   t0d, 16
    shr   t1d, 16
    mov [t2+%3*32], t0w
    mov [t2+%5*32], t1w
%endmacro

%macro DEQUANT_STORE_MMX 1
    DEQUANT_MMX m0, m1, %1
    STORE_WORDS_MMX m0,  0,  1,  4,  5
    STORE_WORDS_MMX m1,  2,  3,  6,  7

    DEQUANT_MMX m2, m3, %1
    STORE_WORDS_MMX m2,  8,  9, 12, 13
    STORE_WORDS_MMX m3, 10, 11, 14, 15
%endmacro

%macro STORE_WORDS_SSE 9
    movd  t0d, %1
    psrldq  %1, 4
    movd  t1d, %1
    psrldq  %1, 4
    mov [t2+%2*32], t0w
    mov [t2+%4*32], t1w
    shr   t0d, 16
    shr   t1d, 16
    mov [t2+%3*32], t0w
    mov [t2+%5*32], t1w
    movd  t0d, %1
    psrldq  %1, 4
    movd  t1d, %1
    mov [t2+%6*32], t0w
    mov [t2+%8*32], t1w
    shr   t0d, 16
    shr   t1d, 16
    mov [t2+%7*32], t0w
    mov [t2+%9*32], t1w
%endmacro

%macro DEQUANT_STORE_SSE2 1
    movd      xmm4, t3d
    movq      xmm5, [pw_1]
    pshufd    xmm4, xmm4, 0
    movq2dq   xmm0, m0
    movq2dq   xmm1, m1
    movq2dq   xmm2, m2
    movq2dq   xmm3, m3
    punpcklwd xmm0, xmm5
    punpcklwd xmm1, xmm5
    punpcklwd xmm2, xmm5
    punpcklwd xmm3, xmm5
    pmaddwd   xmm0, xmm4
    pmaddwd   xmm1, xmm4
    pmaddwd   xmm2, xmm4
    pmaddwd   xmm3, xmm4
    psrad     xmm0, %1
    psrad     xmm1, %1
    psrad     xmm2, %1
    psrad     xmm3, %1
    packssdw  xmm0, xmm1
    packssdw  xmm2, xmm3
    STORE_WORDS_SSE xmm0,  0,  1,  4,  5,  2,  3,  6,  7
    STORE_WORDS_SSE xmm2,  8,  9, 12, 13, 10, 11, 14, 15
%endmacro

%macro IDCT_DC_DEQUANT 2
cglobal h264_luma_dc_dequant_idct_%1, 3,4,%2
    movq        m3, [r1+24]
    movq        m2, [r1+16]
    movq        m1, [r1+ 8]
    movq        m0, [r1+ 0]
    WALSH4_1D    0,1,2,3,4
    TRANSPOSE4x4W 0,1,2,3,4
    WALSH4_1D    0,1,2,3,4

; shift, tmp, output, qmul
%ifdef WIN64
    DECLARE_REG_TMP 0,3,1,2
    ; we can't avoid this, because r0 is the shift register (ecx) on win64
    xchg        r0, t2
%elifdef ARCH_X86_64
    DECLARE_REG_TMP 3,1,0,2
%else
    DECLARE_REG_TMP 1,3,0,2
%endif

    cmp        t3d, 32767
    jg .big_qmul
    add        t3d, 128 << 16
%ifidn %1,mmx
    DEQUANT_STORE_MMX 8
%else
    DEQUANT_STORE_SSE2 8
%endif
    RET
.big_qmul:
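    ; pmaddwd needs qmul to fit in a signed 16-bit word, so for
    ; qmul > 32767 the multiplier is pre-shifted right by up to 7 bits
    ; (bsr measures its magnitude) and the final arithmetic shift is
    ; reduced by the same amount, trading the low bits of qmul for range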
    bsr        t0d, t3d
    add        t3d, 128 << 16
    mov        t1d, 7
    cmp        t0d, t1d
    cmovg      t0d, t1d
    inc        t1d
    shr        t3d, t0b
    sub        t1d, t0d
%ifidn %1,mmx
    movd        m6, t1d
    DEQUANT_STORE_MMX m6
%else
    movd      xmm6, t1d
    DEQUANT_STORE_SSE2 xmm6
%endif
    RET
%endmacro

INIT_MMX
IDCT_DC_DEQUANT mmx, 0
IDCT_DC_DEQUANT sse2, 7