;* libavcodec/x86/h264_idct.asm (revision 98c6053c)
;*****************************************************************************
;* MMX/SSE2-optimized H.264 iDCT
;*****************************************************************************
;* Copyright (C) 2004-2005 Michael Niedermayer, Loren Merritt
;* Copyright (C) 2003-2008 x264 project
;*
;* Authors: Laurent Aimar <fenrir@via.ecp.fr>
;*          Loren Merritt <lorenm@u.washington.edu>
;*          Holger Lubitz <hal@duncan.ol.sub.de>
;*          Min Chen <chenm001.163.com>
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;*****************************************************************************
%include "x86inc.asm"
%include "x86util.asm"

SECTION_RODATA

; Maps a 4x4-block index (0..15 luma, 16+ chroma) to its position in the
; 8x6 "scan8" cache layout used by the nnzc[] tables.
; FIXME this table is a duplicate from h264data.h, and will be removed once the tables from h264 have been split
scan8_mem: db 4+1*8, 5+1*8, 4+2*8, 5+2*8
           db 6+1*8, 7+1*8, 6+2*8, 7+2*8
           db 4+3*8, 5+3*8, 4+4*8, 5+4*8
           db 6+3*8, 7+3*8, 6+4*8, 7+4*8
           db 1+1*8, 2+1*8
           db 1+2*8, 2+2*8
           db 1+4*8, 2+4*8
           db 1+5*8, 2+5*8
%ifdef PIC
; In PIC builds the table address is loaded into r11 once per function.
%define scan8 r11
%else
%define scan8 scan8_mem
%endif

cextern pw_32

SECTION .text
; 4x4 iDCT with +32/>>6 rounding, result added to the destination pixels.
; %1=uint8_t *dst, %2=int16_t *block, %3=int stride
; Clobbers m0-m7 and advances %1 by 2 rows.
%macro IDCT4_ADD 3
    ; Load dct coeffs
    movq         m0, [%2]
    movq         m1, [%2+8]
    movq         m2, [%2+16]
    movq         m3, [%2+24]

    IDCT4_1D      0, 1, 2, 3, 4, 5    ; vertical pass
    mova         m6, [pw_32]
    TRANSPOSE4x4W 0, 1, 2, 3, 4
    paddw        m0, m6               ; bias row 0 so the final >>6 rounds
    IDCT4_1D      0, 1, 2, 3, 4, 5    ; horizontal pass
    pxor         m7, m7               ; zero reg for unpacking in STORE_DIFFx2

    STORE_DIFFx2 m0, m1, m4, m5, m7, 6, %1, %3
    lea          %1, [%1+%3*2]
    STORE_DIFFx2 m2, m3, m4, m5, m7, 6, %1, %3
%endmacro
INIT_MMX
; ff_h264_idct_add_mmx(uint8_t *dst, int16_t *block, int stride)
; Thin wrapper: one 4x4 iDCT+add on dst.
cglobal h264_idct_add_mmx, 3, 3, 0
    IDCT4_ADD    r0, r1, r2
    RET
; One 1-D pass of the 8-point H.264 iDCT.
; Odd-half inputs in m1,m3,m5,m7; even-half extras in m2,m6 (see IDCT8_1D_FULL);
; %1/%2 = memory (or, on x86-64 SSE2, register) operands holding the remaining
; two input rows. Outputs end up in m0..m7 in natural order via the final SWAP.
%macro IDCT8_1D 2
    ; odd part: butterflies on rows 1/3/5/7 (with >>1 scaling per the spec)
    mova         m4, m5
    mova         m0, m1
    psraw        m4, 1
    psraw        m1, 1
    paddw        m4, m5
    paddw        m1, m0
    paddw        m4, m7
    paddw        m1, m5
    psubw        m4, m0
    paddw        m1, m3

    psubw        m0, m3
    psubw        m5, m3
    paddw        m0, m7
    psubw        m5, m7
    psraw        m3, 1
    psraw        m7, 1
    psubw        m0, m3
    psubw        m5, m7

    mova         m3, m4
    mova         m7, m1
    psraw        m1, 2
    psraw        m3, 2
    paddw        m3, m0
    psraw        m0, 2
    paddw        m1, m5
    psraw        m5, 2
    psubw        m0, m4
    psubw        m7, m5

    ; even part: rows 2 and 6
    mova         m4, m2
    mova         m5, m6
    psraw        m4, 1
    psraw        m6, 1
    psubw        m4, m5
    paddw        m6, m2

    ; even part: rows 0 and 4, then combine even/odd halves
    mova         m2, %1
    mova         m5, %2
    SUMSUB_BA    m5, m2
    SUMSUB_BA    m6, m5
    SUMSUB_BA    m4, m2
    SUMSUB_BA    m7, m6
    SUMSUB_BA    m0, m4
    SUMSUB_BA    m3, m2
    SUMSUB_BA    m1, m5
    SWAP          7, 6, 4, 5, 2, 3, 1, 0 ; 70315246 -> 01234567
%endmacro
; Load rows 1-3 and 5-7 of an 8x8 block from memory at %1 and run IDCT8_1D,
; passing rows 0 and 4 as memory operands (they are loaded inside IDCT8_1D).
%macro IDCT8_1D_FULL 1
    mova         m7, [%1+112]
    mova         m6, [%1+ 96]
    mova         m5, [%1+ 80]
    mova         m3, [%1+ 48]
    mova         m2, [%1+ 32]
    mova         m1, [%1+ 16]
    IDCT8_1D   [%1], [%1+ 64]
%endmacro
; First (vertical) pass of the MMX 8x8 iDCT on one 4-column half:
; runs IDCT8_1D on %1, transposes the 8x4 result in two 4x4 halves and
; stores it to the temporary buffer %2 (row stride 16 bytes).
; %1=int16_t *block, %2=int16_t *dstblock
%macro IDCT8_ADD_MMX_START 2
    IDCT8_1D_FULL %1
    mova       [%1], m7                ; spill m7: TRANSPOSE4x4W needs a scratch reg
    TRANSPOSE4x4W 0, 1, 2, 3, 7
    mova         m7, [%1]
    mova    [%2   ], m0
    mova    [%2+16], m1
    mova    [%2+32], m2
    mova    [%2+48], m3
    TRANSPOSE4x4W 4, 5, 6, 7, 3
    mova    [%2+ 8], m4
    mova    [%2+24], m5
    mova    [%2+40], m6
    mova    [%2+56], m7
%endmacro
; Second (horizontal) pass of the MMX 8x8 iDCT on one 4-column half:
; runs IDCT8_1D on the temp buffer %2, then adds the >>6-scaled result to
; the destination pixels, 2 rows per STORE_DIFFx2.
; %1=uint8_t *dst, %2=int16_t *block, %3=int stride
%macro IDCT8_ADD_MMX_END 3
    IDCT8_1D_FULL %2
    mova    [%2   ], m5                ; spill rows 5-7; regs are needed as scratch
    mova    [%2+16], m6
    mova    [%2+32], m7

    pxor         m7, m7
    STORE_DIFFx2 m0, m1, m5, m6, m7, 6, %1, %3
    lea          %1, [%1+%3*2]
    STORE_DIFFx2 m2, m3, m5, m6, m7, 6, %1, %3
    mova         m0, [%2   ]           ; reload spilled rows 5-7
    mova         m1, [%2+16]
    mova         m2, [%2+32]
    lea          %1, [%1+%3*2]
    STORE_DIFFx2 m4, m0, m5, m6, m7, 6, %1, %3
    lea          %1, [%1+%3*2]
    STORE_DIFFx2 m1, m2, m5, m6, m7, 6, %1, %3
%endmacro
INIT_MMX
; ff_h264_idct8_add_mmx(uint8_t *dst, int16_t *block, int stride)
; 8x8 iDCT done as two 4-column halves via a 128-byte stack temp buffer.
cglobal h264_idct8_add_mmx, 3, 4, 0
    %assign pad 128+4-(stack_offset&7) ; 128B temp, rounded to keep 8B alignment
    SUB         rsp, pad

    add   word [r1], 32                ; fold the +32 rounding bias into DC
    IDCT8_ADD_MMX_START r1  , rsp
    IDCT8_ADD_MMX_START r1+8, rsp+64
    lea          r3, [r0+4]            ; dst pointer for the right 4 columns
    IDCT8_ADD_MMX_END   r0  , rsp,   r2
    IDCT8_ADD_MMX_END   r3  , rsp+8, r2

    ADD         rsp, pad
    RET
; Full 8x8 iDCT+add in SSE2 registers. On x86-64 the extra xmm8/xmm9 avoid
; the memory spills needed on x86-32.
; %1=uint8_t *dst, %2=int16_t *block, %3=int stride, %4=scratch gpr (gets stride*3)
%macro IDCT8_ADD_SSE 4
    IDCT8_1D_FULL %2
%ifdef ARCH_X86_64
    TRANSPOSE8x8W 0, 1, 2, 3, 4, 5, 6, 7, 8
%else
    TRANSPOSE8x8W 0, 1, 2, 3, 4, 5, 6, 7, [%2], [%2+16]
%endif
    paddw        m0, [pw_32]           ; rounding bias before the 2nd pass

%ifndef ARCH_X86_64
    ; spill rows 0/4 so IDCT8_1D can take them as memory operands
    mova    [%2   ], m0
    mova    [%2+16], m4
    IDCT8_1D   [%2], [%2+ 16]
    mova    [%2   ], m6                ; park rows 6/7 for the second store batch
    mova    [%2+16], m7
%else
    SWAP          0, 8
    SWAP          4, 9
    IDCT8_1D     m8, m9
    SWAP          6, 8
    SWAP          7, 9
%endif

    pxor         m7, m7
    lea          %4, [%3*3]
    STORE_DIFF   m0, m6, m7, [%1     ]
    STORE_DIFF   m1, m6, m7, [%1+%3  ]
    STORE_DIFF   m2, m6, m7, [%1+%3*2]
    STORE_DIFF   m3, m6, m7, [%1+%4  ]
%ifndef ARCH_X86_64
    mova         m0, [%2   ]           ; reload parked rows 6/7
    mova         m1, [%2+16]
%else
    SWAP          0, 8
    SWAP          1, 9
%endif
    lea          %1, [%1+%3*4]
    STORE_DIFF   m4, m6, m7, [%1     ]
    STORE_DIFF   m5, m6, m7, [%1+%3  ]
    STORE_DIFF   m0, m6, m7, [%1+%3*2]
    STORE_DIFF   m1, m6, m7, [%1+%4  ]
%endmacro
INIT_XMM
; ff_h264_idct8_add_sse2(uint8_t *dst, int16_t *block, int stride)
cglobal h264_idct8_add_sse2, 3, 4, 10
    IDCT8_ADD_SSE r0, r1, r2, r3
    RET
; Prepare DC-only add: compute dc=(block[0]+32)>>6, broadcast +dc into m0 and
; -dc into m1 as packed bytes (saturating add/sub implements the clipped add).
; 2-arg form: %1=int16_t *block (clobbered, becomes stride*3), %2=stride.
; 3-arg form: %3 already holds block[0] (clobbered, becomes stride*3).
%macro DC_ADD_MMX2_INIT 2-3
%if %0 == 2
    movsx        %1, word [%1]
    add          %1, 32
    sar          %1, 6
    movd         m0, %1d
    lea          %1, [%2*3]            ; reuse reg as stride*3 for DC_ADD_MMX2_OP
%else
    add          %3, 32
    sar          %3, 6
    movd         m0, %3d
    lea          %3, [%2*3]
%endif
    pshufw       m0, m0, 0             ; broadcast dc to all 4 words
    pxor         m1, m1
    psubw        m1, m0                ; m1 = -dc
    packuswb     m0, m0                ; clamp to bytes: +dc if dc>0 else 0
    packuswb     m1, m1                ; clamp to bytes: -dc if dc<0 else 0
%endmacro
; Apply the broadcast DC (m0=+dc, m1=-dc from DC_ADD_MMX2_INIT) to 4 rows of
; pixels with saturating byte arithmetic.
; %1=load/store op (movh/mova/movq), %2=dst, %3=stride, %4=stride*3
%macro DC_ADD_MMX2_OP 3-4
    %1           m2, [%2     ]
    %1           m3, [%2+%3  ]
    %1           m4, [%2+%3*2]
    %1           m5, [%2+%4  ]
    paddusb      m2, m0
    paddusb      m3, m0
    paddusb      m4, m0
    paddusb      m5, m0
    psubusb      m2, m1
    psubusb      m3, m1
    psubusb      m4, m1
    psubusb      m5, m1
    %1    [%2     ], m2
    %1    [%2+%3  ], m3
    %1    [%2+%3*2], m4
    %1    [%2+%4  ], m5
%endmacro
INIT_MMX
; ff_h264_idct_dc_add_mmx2(uint8_t *dst, int16_t *block, int stride)
; DC-only 4x4 add: r1 is clobbered (becomes stride*3 in DC_ADD_MMX2_INIT).
cglobal h264_idct_dc_add_mmx2, 3, 3, 0
    DC_ADD_MMX2_INIT r1, r2
    DC_ADD_MMX2_OP movh, r0, r2, r1
    RET
; ff_h264_idct8_dc_add_mmx2(uint8_t *dst, int16_t *block, int stride)
; DC-only 8x8 add: two 4-row batches of 8-byte stores.
cglobal h264_idct8_dc_add_mmx2, 3, 3, 0
    DC_ADD_MMX2_INIT r1, r2
    DC_ADD_MMX2_OP mova, r0, r2, r1
    lea          r0, [r0+r2*4]
    DC_ADD_MMX2_OP mova, r0, r2, r1
    RET
; ff_h264_idct_add16_mmx(uint8_t *dst, const int *block_offset,
;             DCTELEM *block, int stride, const uint8_t nnzc[6*8])
; Loop over the 16 luma 4x4 blocks, skipping those with no nonzero coeffs.
cglobal h264_idct_add16_mmx, 5, 7, 0
    xor          r5, r5                ; r5 = block index 0..15
%ifdef PIC
    lea         r11, [scan8_mem]
%endif
.nextblock
    movzx        r6, byte [scan8+r5]
    movzx        r6, byte [r4+r6]      ; r6 = nnzc for this block
    test         r6, r6
    jz .skipblock
    mov         r6d, dword [r1+r5*4]   ; r6 = block_offset[r5]
    lea          r6, [r0+r6]
    IDCT4_ADD    r6, r2, r3
.skipblock
    inc          r5
    add          r2, 32                ; 16 coeffs * 2 bytes per block
    cmp          r5, 16
    jl .nextblock
    REP_RET
; ff_h264_idct8_add4_mmx(uint8_t *dst, const int *block_offset,
;                        DCTELEM *block, int stride, const uint8_t nnzc[6*8])
; Loop over the 4 luma 8x8 blocks (scan8 step 4).
cglobal h264_idct8_add4_mmx, 5, 7, 0
    %assign pad 128+4-(stack_offset&7) ; 128B iDCT temp buffer on the stack
    SUB         rsp, pad

    xor          r5, r5
%ifdef PIC
    lea         r11, [scan8_mem]
%endif
.nextblock
    movzx        r6, byte [scan8+r5]
    movzx        r6, byte [r4+r6]
    test         r6, r6
    jz .skipblock
    mov         r6d, dword [r1+r5*4]
    lea          r6, [r0+r6]
    add   word [r2], 32                ; fold rounding bias into DC
    IDCT8_ADD_MMX_START r2  , rsp
    IDCT8_ADD_MMX_START r2+8, rsp+64
    IDCT8_ADD_MMX_END   r6  , rsp,   r3
    mov         r6d, dword [r1+r5*4]   ; recompute dst (r6 was advanced)
    lea          r6, [r0+r6+4]         ; right 4-column half
    IDCT8_ADD_MMX_END   r6  , rsp+8, r3
.skipblock
    add          r5, 4
    add          r2, 128               ; 64 coeffs * 2 bytes per 8x8 block
    cmp          r5, 16
    jl .nextblock
    ADD         rsp, pad
    RET
; ff_h264_idct_add16_mmx2(uint8_t *dst, const int *block_offset,
;                         DCTELEM *block, int stride, const uint8_t nnzc[6*8])
; Like add16_mmx but with a fast DC-only path when nnzc==1 and block[0]!=0.
cglobal h264_idct_add16_mmx2, 5, 7, 0
    xor          r5, r5
%ifdef PIC
    lea         r11, [scan8_mem]
%endif
.nextblock
    movzx        r6, byte [scan8+r5]
    movzx        r6, byte [r4+r6]
    test         r6, r6
    jz .skipblock
    cmp          r6, 1
    jnz .no_dc
    movsx        r6, word [r2]
    test         r6, r6
    jz .no_dc
    DC_ADD_MMX2_INIT r2, r3, r6
%ifdef ARCH_X86_64
%define dst_reg  r10
%define dst_regd r10d
%else
; x86-32 has too few regs: borrow r1 (restored from its arg slot below)
%define dst_reg  r1
%define dst_regd r1d
%endif
    mov    dst_regd, dword [r1+r5*4]
    lea     dst_reg, [r0+dst_reg]
    DC_ADD_MMX2_OP movh, dst_reg, r3, r6
%ifndef ARCH_X86_64
    mov          r1, r1m
%endif
    inc          r5
    add          r2, 32
    cmp          r5, 16
    jl .nextblock
    REP_RET
.no_dc
    mov         r6d, dword [r1+r5*4]
    lea          r6, [r0+r6]
    IDCT4_ADD    r6, r2, r3
.skipblock
    inc          r5
    add          r2, 32
    cmp          r5, 16
    jl .nextblock
    REP_RET
; ff_h264_idct_add16intra_mmx(uint8_t *dst, const int *block_offset,
;                             DCTELEM *block, int stride, const uint8_t nnzc[6*8])
; Intra variant: a block is processed if it has nonzero AC (nnzc) OR a DC coeff.
cglobal h264_idct_add16intra_mmx, 5, 7, 0
    xor          r5, r5
%ifdef PIC
    lea         r11, [scan8_mem]
%endif
.nextblock
    movzx        r6, byte [scan8+r5]
    movzx        r6, byte [r4+r6]
    or          r6w, word [r2]         ; also nonzero if DC != 0
    test         r6, r6
    jz .skipblock
    mov         r6d, dword [r1+r5*4]
    lea          r6, [r0+r6]
    IDCT4_ADD    r6, r2, r3
.skipblock
    inc          r5
    add          r2, 32
    cmp          r5, 16
    jl .nextblock
    REP_RET
; ff_h264_idct_add16intra_mmx2(uint8_t *dst, const int *block_offset,
;                              DCTELEM *block, int stride, const uint8_t nnzc[6*8])
; Intra variant with DC-only fast path: full iDCT when nnzc!=0, else DC add
; when block[0]!=0, else skip.
cglobal h264_idct_add16intra_mmx2, 5, 7, 0
    xor          r5, r5
%ifdef PIC
    lea         r11, [scan8_mem]
%endif
.nextblock
    movzx        r6, byte [scan8+r5]
    movzx        r6, byte [r4+r6]
    test         r6, r6
    jz .try_dc
    mov         r6d, dword [r1+r5*4]
    lea          r6, [r0+r6]
    IDCT4_ADD    r6, r2, r3
    inc          r5
    add          r2, 32
    cmp          r5, 16
    jl .nextblock
    REP_RET
.try_dc
    movsx        r6, word [r2]
    test         r6, r6
    jz .skipblock
    DC_ADD_MMX2_INIT r2, r3, r6
%ifdef ARCH_X86_64
%define dst_reg  r10
%define dst_regd r10d
%else
; x86-32: borrow r1 as dst pointer, restored from its arg slot below
%define dst_reg  r1
%define dst_regd r1d
%endif
    mov    dst_regd, dword [r1+r5*4]
    lea     dst_reg, [r0+dst_reg]
    DC_ADD_MMX2_OP movh, dst_reg, r3, r6
%ifndef ARCH_X86_64
    mov          r1, r1m
%endif
.skipblock
    inc          r5
    add          r2, 32
    cmp          r5, 16
    jl .nextblock
    REP_RET
; ff_h264_idct8_add4_mmx2(uint8_t *dst, const int *block_offset,
;                         DCTELEM *block, int stride, const uint8_t nnzc[6*8])
; 8x8 loop with DC-only fast path (nnzc==1 and block[0]!=0).
cglobal h264_idct8_add4_mmx2, 5, 7, 0
    %assign pad 128+4-(stack_offset&7) ; 128B iDCT temp buffer on the stack
    SUB         rsp, pad

    xor          r5, r5
%ifdef PIC
    lea         r11, [scan8_mem]
%endif
.nextblock
    movzx        r6, byte [scan8+r5]
    movzx        r6, byte [r4+r6]
    test         r6, r6
    jz .skipblock
    cmp          r6, 1
    jnz .no_dc
    movsx        r6, word [r2]
    test         r6, r6
    jz .no_dc
    DC_ADD_MMX2_INIT r2, r3, r6
%ifdef ARCH_X86_64
%define dst_reg  r10
%define dst_regd r10d
%else
; x86-32: borrow r1 as dst pointer, restored from its arg slot below
%define dst_reg  r1
%define dst_regd r1d
%endif
    mov    dst_regd, dword [r1+r5*4]
    lea     dst_reg, [r0+dst_reg]
    DC_ADD_MMX2_OP mova, dst_reg, r3, r6
    lea     dst_reg, [dst_reg+r3*4]
    DC_ADD_MMX2_OP mova, dst_reg, r3, r6
%ifndef ARCH_X86_64
    mov          r1, r1m
%endif
    add          r5, 4
    add          r2, 128
    cmp          r5, 16
    jl .nextblock

    ADD         rsp, pad
    RET
.no_dc
    mov         r6d, dword [r1+r5*4]
    lea          r6, [r0+r6]
    add   word [r2], 32                ; fold rounding bias into DC
    IDCT8_ADD_MMX_START r2  , rsp
    IDCT8_ADD_MMX_START r2+8, rsp+64
    IDCT8_ADD_MMX_END   r6  , rsp,   r3
    mov         r6d, dword [r1+r5*4]   ; recompute dst (r6 was advanced)
    lea          r6, [r0+r6+4]         ; right 4-column half
    IDCT8_ADD_MMX_END   r6  , rsp+8, r3
.skipblock
    add          r5, 4
    add          r2, 128
    cmp          r5, 16
    jl .nextblock

    ADD         rsp, pad
    RET
INIT_XMM
; ff_h264_idct8_add4_sse2(uint8_t *dst, const int *block_offset,
;                         DCTELEM *block, int stride, const uint8_t nnzc[6*8])
; SSE2 8x8 loop; the DC-only fast path uses the MMX helpers (INIT_MMX/INIT_XMM
; toggles only affect macro expansion, not runtime state).
cglobal h264_idct8_add4_sse2, 5, 7, 10
    xor          r5, r5
%ifdef PIC
    lea         r11, [scan8_mem]
%endif
.nextblock
    movzx        r6, byte [scan8+r5]
    movzx        r6, byte [r4+r6]
    test         r6, r6
    jz .skipblock
    cmp          r6, 1
    jnz .no_dc
    movsx        r6, word [r2]
    test         r6, r6
    jz .no_dc
INIT_MMX
    DC_ADD_MMX2_INIT r2, r3, r6
%ifdef ARCH_X86_64
%define dst_reg  r10
%define dst_regd r10d
%else
; x86-32: borrow r1 as dst pointer, restored from its arg slot below
%define dst_reg  r1
%define dst_regd r1d
%endif
    mov    dst_regd, dword [r1+r5*4]
    lea     dst_reg, [r0+dst_reg]
    DC_ADD_MMX2_OP mova, dst_reg, r3, r6
    lea     dst_reg, [dst_reg+r3*4]
    DC_ADD_MMX2_OP mova, dst_reg, r3, r6
%ifndef ARCH_X86_64
    mov          r1, r1m
%endif
    add          r5, 4
    add          r2, 128
    cmp          r5, 16
    jl .nextblock
    REP_RET
.no_dc
INIT_XMM
    mov    dst_regd, dword [r1+r5*4]
    lea     dst_reg, [r0+dst_reg]
    IDCT8_ADD_SSE dst_reg, r2, r3, r6
%ifndef ARCH_X86_64
    mov          r1, r1m
%endif
.skipblock
    add          r5, 4
    add          r2, 128
    cmp          r5, 16
    jl .nextblock
    REP_RET
INIT_MMX
; Internal helper: process 4 chroma 4x4 blocks of one plane.
; Expects r5 = block index, r2 = coeffs, and (x86-64) r10 = &dest[plane];
; on x86-32 the dest pointer is reloaded from the caller's arg slot.
h264_idct_add8_mmx_plane:
.nextblock
    movzx        r6, byte [scan8+r5]
    movzx        r6, byte [r4+r6]
    or          r6w, word [r2]         ; process if AC (nnzc) or DC nonzero
    test         r6, r6
    jz .skipblock
%ifdef ARCH_X86_64
    mov         r0d, dword [r1+r5*4]
    add          r0, [r10]
%else
    mov          r0, r1m ; XXX r1m here is actually r0m of the calling func
    mov          r0, [r0]
    add          r0, dword [r1+r5*4]
%endif
    IDCT4_ADD    r0, r2, r3
.skipblock
    inc          r5
    add          r2, 32
    test         r5, 3                 ; 4 blocks per plane
    jnz .nextblock
    rep ret
; ff_h264_idct_add8_mmx(uint8_t **dest, const int *block_offset,
;                       DCTELEM *block, int stride, const uint8_t nnzc[6*8])
; Chroma: run the plane helper once per plane (U then V).
cglobal h264_idct_add8_mmx, 5, 7, 0
    mov          r5, 16                ; chroma blocks start at scan8 index 16
    add          r2, 512               ; skip the 16 luma blocks' coeffs
%ifdef PIC
    lea         r11, [scan8_mem]
%endif
%ifdef ARCH_X86_64
    mov         r10, r0
%endif
    call         h264_idct_add8_mmx_plane
%ifdef ARCH_X86_64
    add         r10, gprsize           ; advance to dest[1] (second plane)
%else
    add        r0mp, gprsize
%endif
    call         h264_idct_add8_mmx_plane
    RET
; Internal helper: process 4 chroma 4x4 blocks of one plane, with a DC-only
; fast path. Same calling contract as h264_idct_add8_mmx_plane.
; (Fix: label was missing its trailing colon, unlike its mmx sibling.)
h264_idct_add8_mmx2_plane:
.nextblock
    movzx        r6, byte [scan8+r5]
    movzx        r6, byte [r4+r6]
    test         r6, r6
    jz .try_dc
%ifdef ARCH_X86_64
    mov         r0d, dword [r1+r5*4]
    add          r0, [r10]
%else
    mov          r0, r1m ; XXX r1m here is actually r0m of the calling func
    mov          r0, [r0]
    add          r0, dword [r1+r5*4]
%endif
    IDCT4_ADD    r0, r2, r3
    inc          r5
    add          r2, 32
    test         r5, 3
    jnz .nextblock
    rep ret
.try_dc
    movsx        r6, word [r2]
    test         r6, r6
    jz .skipblock
    DC_ADD_MMX2_INIT r2, r3, r6
%ifdef ARCH_X86_64
    mov         r0d, dword [r1+r5*4]
    add          r0, [r10]
%else
    mov          r0, r1m ; XXX r1m here is actually r0m of the calling func
    mov          r0, [r0]
    add          r0, dword [r1+r5*4]
%endif
    DC_ADD_MMX2_OP movh, r0, r3, r6
.skipblock
    inc          r5
    add          r2, 32
    test         r5, 3                 ; 4 blocks per plane
    jnz .nextblock
    rep ret
; ff_h264_idct_add8_mmx2(uint8_t **dest, const int *block_offset,
;                        DCTELEM *block, int stride, const uint8_t nnzc[6*8])
; Chroma with DC fast path: run the mmx2 plane helper once per plane.
cglobal h264_idct_add8_mmx2, 5, 7, 0
    mov          r5, 16                ; chroma blocks start at scan8 index 16
    add          r2, 512               ; skip the 16 luma blocks' coeffs
%ifdef ARCH_X86_64
    mov         r10, r0
%endif
%ifdef PIC
    lea         r11, [scan8_mem]
%endif
    call h264_idct_add8_mmx2_plane
%ifdef ARCH_X86_64
    add         r10, gprsize           ; advance to dest[1] (second plane)
%else
    add        r0mp, gprsize
%endif
    call h264_idct_add8_mmx2_plane
    RET
INIT_MMX
; Internal helper: DC-only add for two horizontally adjacent 4x4 blocks
; (DC of the second block at [r2+32]), used by the sse2 intra/chroma loops.
; r0 = uint8_t *dst, r2 = int16_t *block, r3 = int stride, r6=clobbered
h264_idct_dc_add8_mmx2:
    movd         m0, [r2   ]          ;  0 0 X D
    punpcklwd    m0, [r2+32]          ;  x X d D
    paddsw       m0, [pw_32]
    psraw        m0, 6
    punpcklwd    m0, m0               ;  d d D D
    pxor         m1, m1               ;  0 0 0 0
    psubw        m1, m0               ; -d-d-D-D
    packuswb     m0, m1               ; -d-d-D-D d d D D
    pshufw       m1, m0, 0xFA         ; -d-d-d-d-D-D-D-D
    punpcklwd    m0, m0               ;  d d d d D D D D
    lea          r6, [r3*3]
    DC_ADD_MMX2_OP movq, r0, r3, r6
    ret
ALIGN 16
INIT_XMM
; Internal helper: iDCT+add for two horizontally adjacent 4x4 blocks, packed
; side by side in the low/high halves of each xmm register.
; r0 = uint8_t *dst (clobbered), r2 = int16_t *block, r3 = int stride
x264_add8x4_idct_sse2:
    ; interleave the two blocks: low qword = block 0, high qword = block 1
    movq   m0, [r2+ 0]
    movq   m1, [r2+ 8]
    movq   m2, [r2+16]
    movq   m3, [r2+24]
    movhps m0, [r2+32]
    movhps m1, [r2+40]
    movhps m2, [r2+48]
    movhps m3, [r2+56]
    IDCT4_1D 0,1,2,3,4,5
    TRANSPOSE2x4x4W 0,1,2,3,4
    paddw m0, [pw_32]                  ; rounding bias before the 2nd pass
    IDCT4_1D 0,1,2,3,4,5
    pxor  m7, m7
    STORE_DIFFx2 m0, m1, m4, m5, m7, 6, r0, r3
    lea   r0, [r0+r3*2]
    STORE_DIFFx2 m2, m3, m4, m5, m7, 6, r0, r3
    ret
; One unrolled iteration of the add16 loop: %1 = iteration index (0..7),
; %2 = byte offset into nnzc[] of this 8x4 pair's first nonzero-count.
; Skips the pair when its nnzc word is zero.
%macro add16_sse2_cycle 2
    movzx       r0, word [r4+%2]
    test        r0, r0
    jz .cycle%1end
    mov        r0d, dword [r1+%1*8]
%ifdef ARCH_X86_64
    add         r0, r10                ; r10 = dst base
%else
    add         r0, r0m
%endif
    call        x264_add8x4_idct_sse2
.cycle%1end
%if %1 < 7
    add         r2, 64                 ; 2 blocks * 32 bytes of coeffs
%endif
%endmacro
; ff_h264_idct_add16_sse2(uint8_t *dst, const int *block_offset,
;                         DCTELEM *block, int stride, const uint8_t nnzc[6*8])
cglobal h264_idct_add16_sse2, 5, 5, 8
%ifdef ARCH_X86_64
    mov        r10, r0
%endif
    ; unrolling of the loop leads to an average performance gain of
    ; 20-25%
    add16_sse2_cycle 0, 0xc
    add16_sse2_cycle 1, 0x14
    add16_sse2_cycle 2, 0xe
    add16_sse2_cycle 3, 0x16
    add16_sse2_cycle 4, 0x1c
    add16_sse2_cycle 5, 0x24
    add16_sse2_cycle 6, 0x1e
    add16_sse2_cycle 7, 0x26
    RET
; One unrolled iteration of the intra add16 loop: full iDCT when the pair's
; nnzc word is nonzero, else DC-only add when either DC coeff is nonzero.
; %1 = iteration index (0..7), %2 = byte offset into nnzc[].
%macro add16intra_sse2_cycle 2
    movzx       r0, word [r4+%2]
    test        r0, r0
    jz .try%1dc
    mov        r0d, dword [r1+%1*8]
%ifdef ARCH_X86_64
    add         r0, r10                ; r10 = dst base
%else
    add         r0, r0m
%endif
    call        x264_add8x4_idct_sse2
    jmp .cycle%1end
.try%1dc
    movsx       r0, word [r2   ]
    or         r0w, word [r2+32]       ; nonzero if either block's DC is set
    jz .cycle%1end
    mov        r0d, dword [r1+%1*8]
%ifdef ARCH_X86_64
    add         r0, r10
%else
    add         r0, r0m
%endif
    call        h264_idct_dc_add8_mmx2
.cycle%1end
%if %1 < 7
    add         r2, 64                 ; 2 blocks * 32 bytes of coeffs
%endif
%endmacro
; ff_h264_idct_add16intra_sse2(uint8_t *dst, const int *block_offset,
;                              DCTELEM *block, int stride, const uint8_t nnzc[6*8])
cglobal h264_idct_add16intra_sse2, 5, 7, 8
%ifdef ARCH_X86_64
    mov        r10, r0
%endif
    add16intra_sse2_cycle 0, 0xc
    add16intra_sse2_cycle 1, 0x14
    add16intra_sse2_cycle 2, 0xe
    add16intra_sse2_cycle 3, 0x16
    add16intra_sse2_cycle 4, 0x1c
    add16intra_sse2_cycle 5, 0x24
    add16intra_sse2_cycle 6, 0x1e
    add16intra_sse2_cycle 7, 0x26
    RET
; One unrolled iteration of the chroma add8 loop. Like add16intra_sse2_cycle
; but dest is a uint8_t** (plane pointer dereferenced) and coeff offsets are
; shifted by 64 bytes into the chroma area.
; %1 = iteration index (0..3), %2 = byte offset into nnzc[].
%macro add8_sse2_cycle 2
    movzx       r0, word [r4+%2]
    test        r0, r0
    jz .try%1dc
%ifdef ARCH_X86_64
    mov        r0d, dword [r1+%1*8+64]
    add         r0, [r10]              ; r10 = &dest[plane]
%else
    mov         r0, r0m
    mov         r0, [r0]
    add         r0, dword [r1+%1*8+64]
%endif
    call        x264_add8x4_idct_sse2
    jmp .cycle%1end
.try%1dc
    movsx       r0, word [r2   ]
    or         r0w, word [r2+32]       ; nonzero if either block's DC is set
    jz .cycle%1end
%ifdef ARCH_X86_64
    mov        r0d, dword [r1+%1*8+64]
    add         r0, [r10]
%else
    mov         r0, r0m
    mov         r0, [r0]
    add         r0, dword [r1+%1*8+64]
%endif
    call        h264_idct_dc_add8_mmx2
.cycle%1end
%if %1 < 3
    add         r2, 64                 ; 2 blocks * 32 bytes of coeffs
%endif
%endmacro
; ff_h264_idct_add8_sse2(uint8_t **dest, const int *block_offset,
;                        DCTELEM *block, int stride, const uint8_t nnzc[6*8])
; Chroma: 2 cycles per plane, advancing the dest pointer between planes.
cglobal h264_idct_add8_sse2, 5, 7, 8
    add          r2, 512               ; skip the 16 luma blocks' coeffs
%ifdef ARCH_X86_64
    mov         r10, r0
%endif
    add8_sse2_cycle 0, 0x09
    add8_sse2_cycle 1, 0x11
%ifdef ARCH_X86_64
    add         r10, gprsize           ; advance to dest[1] (second plane)
%else
    add        r0mp, gprsize
%endif
    add8_sse2_cycle 2, 0x21
    add8_sse2_cycle 3, 0x29
    RET