Revision e27ad118 libavcodec/x86/h264_idct_sse2.asm

View differences:

libavcodec/x86/h264_idct_sse2.asm
1
;*****************************************************************************
2
;* SSE2-optimized H.264 iDCT
3
;*****************************************************************************
4
;* Copyright (C) 2003-2008 x264 project
5
;*
6
;* Authors: Laurent Aimar <fenrir@via.ecp.fr>
7
;*          Loren Merritt <lorenm@u.washington.edu>
8
;*          Holger Lubitz <hal@duncan.ol.sub.de>
9
;*          Min Chen <chenm001.163.com>
10
;*
11
;* This program is free software; you can redistribute it and/or modify
12
;* it under the terms of the GNU General Public License as published by
13
;* the Free Software Foundation; either version 2 of the License, or
14
;* (at your option) any later version.
15
;*
16
;* This program is distributed in the hope that it will be useful,
17
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
18
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
19
;* GNU General Public License for more details.
20
;*
21
;* You should have received a copy of the GNU General Public License
22
;* along with this program; if not, write to the Free Software
23
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
24
;*****************************************************************************
25

  
26
%include "x86inc.asm"
27
%include "x86util.asm"
28

  
29
SECTION_RODATA
30
pw_32: times 8 dw 32
31

  
32
SECTION .text
33

  
34
; IDCT4_1D  %1, %2, %3, %4, %5, %6
; Takes six register *numbers*; each is used as m%N inside the body.
; Both call sites in this file pass 0,1,2,3 as %1..%4 and 4,5 as %5,%6.
; Presumably one 1-D pass of the 4-point H.264 inverse transform (going by
; the macro name and the file header); the arithmetic itself lives in the
; SUMSUB*/SWAP helper macros, which are not defined in this file
; (presumably x86util.asm / x86inc.asm) -- TODO confirm against them.
%macro IDCT4_1D 6
35
    SUMSUB_BA   m%3, m%1                    ; helper applied to the %3/%1 pair
36
    SUMSUBD2_AB m%2, m%4, m%6, m%5          ; helper on the %2/%4 pair; %6 and %5 passed as extra operands (presumably scratch)
37
    SUMSUB_BADC m%2, m%3, m%5, m%1          ; helper applied to the pairs (%2,%3) and (%5,%1)
38
    SWAP %1, %2, %5, %4, %3                 ; renames registers among %1,%2,%5,%4,%3; %6 is not part of the rename
39
%endmacro
40

  
41
;-----------------------------------------------------------------------------
; x264_add8x4_idct_sse2
; Uses three arguments, referenced as r0, r1 and r2:
;   r0 - base address handed to STORE_DIFF; advanced by 2*r2 inside the body
;   r1 - source buffer; 64 bytes are read from it (offsets 0..63)
;   r2 - offset between consecutive destinations written by STORE_DIFF
; Presumably r0 = destination pixels, r1 = two 4x4 blocks of 16-bit
; coefficients, r2 = destination stride in bytes -- TODO confirm with the
; C prototype / callers.
; Flow: load -> IDCT4_1D -> TRANSPOSE2x4x4W -> add pw_32 to m0 -> IDCT4_1D
;       -> four STORE_DIFF invocations.
; Registers touched: m0-m5 and m7, plus r0.
; NOTE(review): xmm7 is callee-saved under the Microsoft x64 ABI and this
; cglobal line declares no xmm count -- confirm how x86inc handles that if
; this is ever assembled for Win64.
;-----------------------------------------------------------------------------
INIT_XMM
42
cglobal x264_add8x4_idct_sse2, 3,3
43
    movq   m0, [r1+ 0]                      ; low halves of m0..m3 <- 8-byte groups at r1+0/8/16/24
44
    movq   m1, [r1+ 8]
45
    movq   m2, [r1+16]
46
    movq   m3, [r1+24]
47
    movhps m0, [r1+32]                      ; high halves of m0..m3 <- 8-byte groups at r1+32/40/48/56
48
    movhps m1, [r1+40]
49
    movhps m2, [r1+48]
50
    movhps m3, [r1+56]
51
    IDCT4_1D 0,1,2,3,4,5                    ; first pass over m0..m3; m4 and m5 passed as the extra registers
52
    TRANSPOSE2x4x4W 0,1,2,3,4               ; macro not defined in this file; presumably transposes each 4x4 word block
53
    paddw m0, [pw_32 GLOBAL]                ; add 32 to every word of m0 (presumably a rounding bias for a later >>6 -- confirm in STORE_DIFF)
54
    IDCT4_1D 0,1,2,3,4,5                    ; second pass
55
    pxor  m7, m7                            ; m7 = 0, passed as third operand to STORE_DIFF
56
    STORE_DIFF  m0, m4, m7, [r0]            ; STORE_DIFF is defined outside this file; m4 is passed as second operand (presumably scratch)
57
    STORE_DIFF  m1, m4, m7, [r0+r2]
58
    lea   r0, [r0+r2*2]                     ; step r0 forward by 2*r2 for the remaining two stores
59
    STORE_DIFF  m2, m4, m7, [r0]
60
    STORE_DIFF  m3, m4, m7, [r0+r2]
61
    RET
1
;*****************************************************************************
2
;* SSE2-optimized H.264 iDCT
3
;*****************************************************************************
4
;* Copyright (C) 2003-2008 x264 project
5
;*
6
;* Authors: Laurent Aimar <fenrir@via.ecp.fr>
7
;*          Loren Merritt <lorenm@u.washington.edu>
8
;*          Holger Lubitz <hal@duncan.ol.sub.de>
9
;*          Min Chen <chenm001.163.com>
10
;*
11
;* This program is free software; you can redistribute it and/or modify
12
;* it under the terms of the GNU General Public License as published by
13
;* the Free Software Foundation; either version 2 of the License, or
14
;* (at your option) any later version.
15
;*
16
;* This program is distributed in the hope that it will be useful,
17
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
18
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
19
;* GNU General Public License for more details.
20
;*
21
;* You should have received a copy of the GNU General Public License
22
;* along with this program; if not, write to the Free Software
23
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
24
;*****************************************************************************
25

  
26
%include "x86inc.asm"
27
%include "x86util.asm"
28

  
29
SECTION_RODATA
30
pw_32: times 8 dw 32
31

  
32
SECTION .text
33

  
34
; IDCT4_1D  %1, %2, %3, %4, %5, %6
; Takes six register *numbers*; each is used as m%N inside the body.
; Both call sites in this file pass 0,1,2,3 as %1..%4 and 4,5 as %5,%6.
; Presumably one 1-D pass of the 4-point H.264 inverse transform (going by
; the macro name and the file header); the arithmetic itself lives in the
; SUMSUB*/SWAP helper macros, which are not defined in this file
; (presumably x86util.asm / x86inc.asm) -- TODO confirm against them.
%macro IDCT4_1D 6
35
    SUMSUB_BA   m%3, m%1                    ; helper applied to the %3/%1 pair
36
    SUMSUBD2_AB m%2, m%4, m%6, m%5          ; helper on the %2/%4 pair; %6 and %5 passed as extra operands (presumably scratch)
37
    SUMSUB_BADC m%2, m%3, m%5, m%1          ; helper applied to the pairs (%2,%3) and (%5,%1)
38
    SWAP %1, %2, %5, %4, %3                 ; renames registers among %1,%2,%5,%4,%3; %6 is not part of the rename
39
%endmacro
40

  
41
;-----------------------------------------------------------------------------
; x264_add8x4_idct_sse2
; Uses three arguments, referenced as r0, r1 and r2:
;   r0 - base address handed to STORE_DIFF; advanced by 2*r2 inside the body
;   r1 - source buffer; 64 bytes are read from it (offsets 0..63)
;   r2 - offset between consecutive destinations written by STORE_DIFF
; Presumably r0 = destination pixels, r1 = two 4x4 blocks of 16-bit
; coefficients, r2 = destination stride in bytes -- TODO confirm with the
; C prototype / callers.
; Flow: load -> IDCT4_1D -> TRANSPOSE2x4x4W -> add pw_32 to m0 -> IDCT4_1D
;       -> four STORE_DIFF invocations.
; Registers touched: m0-m5 and m7, plus r0.
; NOTE(review): xmm7 is callee-saved under the Microsoft x64 ABI and this
; cglobal line declares no xmm count -- confirm how x86inc handles that if
; this is ever assembled for Win64.
;-----------------------------------------------------------------------------
INIT_XMM
42
cglobal x264_add8x4_idct_sse2, 3,3
43
    movq   m0, [r1+ 0]                      ; low halves of m0..m3 <- 8-byte groups at r1+0/8/16/24
44
    movq   m1, [r1+ 8]
45
    movq   m2, [r1+16]
46
    movq   m3, [r1+24]
47
    movhps m0, [r1+32]                      ; high halves of m0..m3 <- 8-byte groups at r1+32/40/48/56
48
    movhps m1, [r1+40]
49
    movhps m2, [r1+48]
50
    movhps m3, [r1+56]
51
    IDCT4_1D 0,1,2,3,4,5                    ; first pass over m0..m3; m4 and m5 passed as the extra registers
52
    TRANSPOSE2x4x4W 0,1,2,3,4               ; macro not defined in this file; presumably transposes each 4x4 word block
53
    paddw m0, [pw_32 GLOBAL]                ; add 32 to every word of m0 (presumably a rounding bias for a later >>6 -- confirm in STORE_DIFF)
54
    IDCT4_1D 0,1,2,3,4,5                    ; second pass
55
    pxor  m7, m7                            ; m7 = 0, passed as third operand to STORE_DIFF
56
    STORE_DIFF  m0, m4, m7, [r0]            ; STORE_DIFF is defined outside this file; m4 is passed as second operand (presumably scratch)
57
    STORE_DIFF  m1, m4, m7, [r0+r2]
58
    lea   r0, [r0+r2*2]                     ; step r0 forward by 2*r2 for the remaining two stores
59
    STORE_DIFF  m2, m4, m7, [r0]
60
    STORE_DIFF  m3, m4, m7, [r0+r2]
61
    RET

Also available in: Unified diff