Revision a6624e21

View differences:

libavcodec/i386/dsputil_h264_template_mmx.c
 /*
- * Copyright (c) 2005 Zoltan Hidvegi <hzoli -a- hzoli -d- com>
+ * Copyright (c) 2005 Zoltan Hidvegi <hzoli -a- hzoli -d- com>,
+ *                    Loren Merritt
  *
  * This library is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
......
 
 /**
  * MMX optimized version of (put|avg)_h264_chroma_mc8.
- * H264_CHROMA_MC8_TMPL must be defined to the desired function name and
- * H264_CHROMA_OP must be defined to empty for put and pavgb/pavgusb for avg.
+ * H264_CHROMA_MC8_TMPL must be defined to the desired function name
+ * H264_CHROMA_OP must be defined to empty for put and pavgb/pavgusb for avg
+ * H264_CHROMA_MC8_MV0 must be defined to a (put|avg)_pixels8 function
  */
 static void H264_CHROMA_MC8_TMPL(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y)
 {
     uint64_t AA __align8;
     uint64_t DD __align8;
-    unsigned long srcos = (long)src & 7;
-    uint64_t sh1 __align8 = srcos * 8;
-    uint64_t sh2 __align8 = 56 - sh1;
     int i;
 
+    if(y==0 && x==0) {
+        /* no filter needed */
+        H264_CHROMA_MC8_MV0(dst, src, stride, h);
+        return;
+    }
+
     assert(x<8 && y<8 && x>=0 && y>=0);
 
-    asm volatile("movd %1, %%mm4\n\t"
-                 "movd %2, %%mm6\n\t"
+    if(y==0)
+    {
+        /* horizontal filter only */
+        asm volatile("movd %0, %%mm5\n\t"
+                     "punpcklwd %%mm5, %%mm5\n\t"
+                     "punpckldq %%mm5, %%mm5\n\t" /* mm5 = B = x */
+                     "movq %1, %%mm4\n\t"
+                     "pxor %%mm7, %%mm7\n\t"
+                     "psubw %%mm5, %%mm4\n\t"     /* mm4 = A = 8-x */
+                     : : "rm" (x), "m" (ff_pw_8));
+
+        for(i=0; i<h; i++) {
+            asm volatile(
+                /* mm0 = src[0..7], mm1 = src[1..8] */
+                "movq %0, %%mm0\n\t"
+                "movq %1, %%mm1\n\t"
+                : : "m" (src[0]), "m" (src[1]));
+
+            asm volatile(
+                /* [mm2,mm3] = A * src[0..7] */
+                "movq %%mm0, %%mm2\n\t"
+                "punpcklbw %%mm7, %%mm2\n\t"
+                "pmullw %%mm4, %%mm2\n\t"
+                "movq %%mm0, %%mm3\n\t"
+                "punpckhbw %%mm7, %%mm3\n\t"
+                "pmullw %%mm4, %%mm3\n\t"
+
+                /* [mm2,mm3] += B * src[1..8] */
+                "movq %%mm1, %%mm0\n\t"
+                "punpcklbw %%mm7, %%mm0\n\t"
+                "pmullw %%mm5, %%mm0\n\t"
+                "punpckhbw %%mm7, %%mm1\n\t"
+                "pmullw %%mm5, %%mm1\n\t"
+                "paddw %%mm0, %%mm2\n\t"
+                "paddw %%mm1, %%mm3\n\t"
+
+                /* dst[0..7] = pack(([mm2,mm3] + 4) >> 3) */
+                "paddw %1, %%mm2\n\t"
+                "paddw %1, %%mm3\n\t"
+                "psrlw $3, %%mm2\n\t"
+                "psrlw $3, %%mm3\n\t"
+                "packuswb %%mm3, %%mm2\n\t"
+                H264_CHROMA_OP(%0, %%mm2)
+                "movq %%mm2, %0\n\t"
+                : "=m" (dst[0]) : "m" (ff_pw_4));
+
+            src += stride;
+            dst += stride;
+        }
+        return;
+    }
+
+    if(x==0)
+    {
+        /* vertical filter only */
+        asm volatile("movd %0, %%mm6\n\t"
+                     "punpcklwd %%mm6, %%mm6\n\t"
+                     "punpckldq %%mm6, %%mm6\n\t" /* mm6 = C = y */
+                     "movq %1, %%mm4\n\t"
+                     "pxor %%mm7, %%mm7\n\t"
+                     "psubw %%mm6, %%mm4\n\t"     /* mm4 = A = 8-y */
+                     : : "rm" (y), "m" (ff_pw_8));
+
+        asm volatile(
+            /* mm0 = src[0..7] */
+            "movq %0, %%mm0\n\t"
+            : : "m" (src[0]));
+
+        for(i=0; i<h; i++) {
+            asm volatile(
+                /* [mm2,mm3] = A * src[0..7] */
+                "movq %%mm0, %%mm2\n\t"
+                "punpcklbw %%mm7, %%mm2\n\t"
+                "pmullw %%mm4, %%mm2\n\t"
+                "movq %%mm0, %%mm3\n\t"
+                "punpckhbw %%mm7, %%mm3\n\t"
+                "pmullw %%mm4, %%mm3\n\t");
+
+            src += stride;
+            asm volatile(
+                /* mm0 = src[0..7] */
+                "movq %0, %%mm0\n\t"
+                : : "m" (src[0]));
+
+            asm volatile(
+                /* [mm2,mm3] += C * src[0..7] */
+                "movq %%mm0, %%mm1\n\t"
+                "punpcklbw %%mm7, %%mm1\n\t"
+                "pmullw %%mm6, %%mm1\n\t"
+                "paddw %%mm1, %%mm2\n\t"
+                "movq %%mm0, %%mm5\n\t"
+                "punpckhbw %%mm7, %%mm5\n\t"
+                "pmullw %%mm6, %%mm5\n\t"
+                "paddw %%mm5, %%mm3\n\t");
+
+            asm volatile(
+                /* dst[0..7] = pack(([mm2,mm3] + 4) >> 3) */
+                "paddw %1, %%mm2\n\t"
+                "paddw %1, %%mm3\n\t"
+                "psrlw $3, %%mm2\n\t"
+                "psrlw $3, %%mm3\n\t"
+                "packuswb %%mm3, %%mm2\n\t"
+                H264_CHROMA_OP(%0, %%mm2)
+                "movq %%mm2, %0\n\t"
+                : "=m" (dst[0]) : "m" (ff_pw_4));
+
+            dst += stride;
+        }
+        return;
+    }
+
+    /* general case, bilinear */
+    asm volatile("movd %2, %%mm4\n\t"
+                 "movd %3, %%mm6\n\t"
                  "punpcklwd %%mm4, %%mm4\n\t"
                  "punpcklwd %%mm6, %%mm6\n\t"
                  "punpckldq %%mm4, %%mm4\n\t" /* mm4 = x words */
......
                  "psllw $3, %%mm6\n\t"
                  "movq %%mm5, %%mm7\n\t"
                  "paddw %%mm6, %%mm7\n\t"
-                 "movq %%mm4, %0\n\t"         /* DD = x * y */
+                 "movq %%mm4, %1\n\t"         /* DD = x * y */
                  "psubw %%mm4, %%mm5\n\t"     /* mm5 = B = 8x - xy */
                  "psubw %%mm4, %%mm6\n\t"     /* mm6 = C = 8y - xy */
-                 "paddw %3, %%mm4\n\t"
+                 "paddw %4, %%mm4\n\t"
                  "psubw %%mm7, %%mm4\n\t"     /* mm4 = A = xy - (8x+8y) + 64 */
                  "pxor %%mm7, %%mm7\n\t"
-                 : "=m" (DD) : "rm" (x), "rm" (y), "m" (ff_pw_64));
+                 "movq %%mm4, %0\n\t"
+                 : "=m" (AA), "=m" (DD) : "rm" (x), "rm" (y), "m" (ff_pw_64));
 
-    asm volatile("movq %%mm4, %0" : "=m" (AA));
-
-    src -= srcos;
     asm volatile(
         /* mm0 = src[0..7], mm1 = src[1..8] */
-        "movq %0, %%mm1\n\t"
-        "movq %1, %%mm0\n\t"
-        "psrlq %2, %%mm1\n\t"
-        "psllq %3, %%mm0\n\t"
-        "movq %%mm0, %%mm4\n\t"
-        "psllq $8, %%mm0\n\t"
-        "por %%mm1, %%mm0\n\t"
-        "psrlq $8, %%mm1\n\t"
-        "por %%mm4, %%mm1\n\t"
-        : : "m" (src[0]), "m" (src[8]), "m" (sh1), "m" (sh2));
+        "movq %0, %%mm0\n\t"
+        "movq %1, %%mm1\n\t"
+        : : "m" (src[0]), "m" (src[1]));
 
     for(i=0; i<h; i++) {
         asm volatile(
......
         src += stride;
         asm volatile(
             /* mm0 = src[0..7], mm1 = src[1..8] */
-            "movq %0, %%mm1\n\t"
-            "movq %1, %%mm0\n\t"
-            "psrlq %2, %%mm1\n\t"
-            "psllq %3, %%mm0\n\t"
-            "movq %%mm0, %%mm4\n\t"
-            "psllq $8, %%mm0\n\t"
-            "por %%mm1, %%mm0\n\t"
-            "psrlq $8, %%mm1\n\t"
-            "por %%mm4, %%mm1\n\t"
-            : : "m" (src[0]), "m" (src[8]), "m" (sh1), "m" (sh2));
+            "movq %0, %%mm0\n\t"
+            "movq %1, %%mm1\n\t"
+            : : "m" (src[0]), "m" (src[1]));
 
         asm volatile(
             /* [mm2,mm3] += C *  src[0..7] */
......
         dst+= stride;
     }
 }
+
+static void H264_CHROMA_MC4_TMPL(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y)
+{
+    uint64_t AA __align8;
+    uint64_t DD __align8;
+    int i;
+
+    /* no special case for mv=(0,0) in 4x*, since it's much less common than in 8x*.
+     * could still save a few cycles, but maybe not worth the complexity. */
+
+    assert(x<8 && y<8 && x>=0 && y>=0);
+
+    asm volatile("movd %2, %%mm4\n\t"
+                 "movd %3, %%mm6\n\t"
+                 "punpcklwd %%mm4, %%mm4\n\t"
+                 "punpcklwd %%mm6, %%mm6\n\t"
+                 "punpckldq %%mm4, %%mm4\n\t" /* mm4 = x words */
+                 "punpckldq %%mm6, %%mm6\n\t" /* mm6 = y words */
+                 "movq %%mm4, %%mm5\n\t"
+                 "pmullw %%mm6, %%mm4\n\t"    /* mm4 = x * y */
+                 "psllw $3, %%mm5\n\t"
+                 "psllw $3, %%mm6\n\t"
+                 "movq %%mm5, %%mm7\n\t"
+                 "paddw %%mm6, %%mm7\n\t"
+                 "movq %%mm4, %1\n\t"         /* DD = x * y */
+                 "psubw %%mm4, %%mm5\n\t"     /* mm5 = B = 8x - xy */
+                 "psubw %%mm4, %%mm6\n\t"     /* mm6 = C = 8y - xy */
+                 "paddw %4, %%mm4\n\t"
+                 "psubw %%mm7, %%mm4\n\t"     /* mm4 = A = xy - (8x+8y) + 64 */
+                 "pxor %%mm7, %%mm7\n\t"
+                 "movq %%mm4, %0\n\t"
+                 : "=m" (AA), "=m" (DD) : "rm" (x), "rm" (y), "m" (ff_pw_64));
+
+    asm volatile(
+        /* mm0 = src[0..3], mm1 = src[1..4] */
+        "movd %0, %%mm0\n\t"
+        "movd %1, %%mm1\n\t"
+        "punpcklbw %%mm7, %%mm0\n\t"
+        "punpcklbw %%mm7, %%mm1\n\t"
+        : : "m" (src[0]), "m" (src[1]));
+
+    for(i=0; i<h; i++) {
+        asm volatile(
+            /* mm2 = A * src[0..3] + B * src[1..4] */
+            "movq %%mm0, %%mm2\n\t"
+            "pmullw %0, %%mm2\n\t"
+            "pmullw %%mm5, %%mm1\n\t"
+            "paddw %%mm1, %%mm2\n\t"
+            : : "m" (AA));
+
+        src += stride;
+        asm volatile(
+            /* mm0 = src[0..3], mm1 = src[1..4] */
+            "movd %0, %%mm0\n\t"
+            "movd %1, %%mm1\n\t"
+            "punpcklbw %%mm7, %%mm0\n\t"
+            "punpcklbw %%mm7, %%mm1\n\t"
+            : : "m" (src[0]), "m" (src[1]));
+
+        asm volatile(
+            /* mm2 += C * src[0..3] + D * src[1..4] */
+            "movq %%mm0, %%mm3\n\t"
+            "movq %%mm1, %%mm4\n\t"
+            "pmullw %%mm6, %%mm3\n\t"
+            "pmullw %0, %%mm4\n\t"
+            "paddw %%mm3, %%mm2\n\t"
+            "paddw %%mm4, %%mm2\n\t"
+            : : "m" (DD));
+
+        asm volatile(
+            /* dst[0..3] = pack((mm2 + 32) >> 6) */
+            "paddw %1, %%mm2\n\t"
+            "psrlw $6, %%mm2\n\t"
+            "packuswb %%mm7, %%mm2\n\t"
+            H264_CHROMA_OP4(%0, %%mm2, %%mm3)
+            "movd %%mm2, %0\n\t"
+            : "=m" (dst[0]) : "m" (ff_pw_32));
+        dst += stride;
+    }
+}
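
For orientation, the following scalar sketch shows what both templates compute; it is not part of the revision, and the function name and explicit width parameter W are illustrative. The weights match the register comments above: A = (8-x)(8-y) = xy - (8x+8y) + 64, B = x(8-y) = 8x - xy, C = (8-x)y = 8y - xy, D = xy. In the y==0 (or x==0) fast paths C and D (or B and D) are zero, the remaining weights share a factor of 8, and the rounding reduces from (+ 32) >> 6 to (+ 4) >> 3, which is why those branches add ff_pw_4 and shift by 3.

#include <stdint.h>

/* Scalar reference for the "put" case; W is 8 or 4 for the two templates. */
static void put_h264_chroma_mc_c(uint8_t *dst, const uint8_t *src,
                                 int stride, int W, int h, int x, int y)
{
    const int A = (8 - x) * (8 - y);
    const int B = x * (8 - y);
    const int C = (8 - x) * y;
    const int D = x * y;
    int i, j;

    for (i = 0; i < h; i++) {
        for (j = 0; j < W; j++)
            dst[j] = (A * src[j]          + B * src[j + 1] +
                      C * src[j + stride] + D * src[j + stride + 1] + 32) >> 6;
        dst += stride;
        src += stride;
    }
}

When x and y are both zero, only A is nonzero (A = 64) and the loop degenerates to a plain copy; the new H264_CHROMA_MC8_MV0 hook lets the 8-wide template dispatch that case to an existing (put|avg)_pixels8 function instead.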
libavcodec/i386/dsputil_mmx.c
 static const uint64_t ff_pw_3  attribute_used __attribute__ ((aligned(8))) = 0x0003000300030003ULL;
 static const uint64_t ff_pw_4  attribute_used __attribute__ ((aligned(8))) = 0x0004000400040004ULL;
 static const uint64_t ff_pw_5  attribute_used __attribute__ ((aligned(8))) = 0x0005000500050005ULL;
+static const uint64_t ff_pw_8  attribute_used __attribute__ ((aligned(8))) = 0x0008000800080008ULL;
 static const uint64_t ff_pw_16 attribute_used __attribute__ ((aligned(8))) = 0x0010001000100010ULL;
 static const uint64_t ff_pw_32 attribute_used __attribute__ ((aligned(8))) = 0x0020002000200020ULL;
 static const uint64_t ff_pw_64 attribute_used __attribute__ ((aligned(8))) = 0x0040004000400040ULL;
......
         c->h263_v_loop_filter= h263_v_loop_filter_mmx;
         c->h263_h_loop_filter= h263_h_loop_filter_mmx;
         c->put_h264_chroma_pixels_tab[0]= put_h264_chroma_mc8_mmx;
+        c->put_h264_chroma_pixels_tab[1]= put_h264_chroma_mc4_mmx;
 
         if (mm_flags & MM_MMXEXT) {
             c->put_pixels_tab[0][1] = put_pixels16_x2_mmx2;
......
 #undef dspfunc
 
             c->avg_h264_chroma_pixels_tab[0]= avg_h264_chroma_mc8_mmx2;
+            c->avg_h264_chroma_pixels_tab[1]= avg_h264_chroma_mc4_mmx2;
             c->h264_v_loop_filter_luma= h264_v_loop_filter_luma_mmx2;
             c->h264_h_loop_filter_luma= h264_h_loop_filter_luma_mmx2;
             c->h264_v_loop_filter_chroma= h264_v_loop_filter_chroma_mmx2;
......
             dspfunc(avg_h264_qpel, 2, 4);
 
             c->avg_h264_chroma_pixels_tab[0]= avg_h264_chroma_mc8_3dnow;
+            c->avg_h264_chroma_pixels_tab[1]= avg_h264_chroma_mc4_3dnow;
         }
     }
 
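
These assignments fill the dsputil function-pointer tables: index 0 of (put|avg)_h264_chroma_pixels_tab holds the 8-pixel-wide kernel and the newly registered index 1 holds the 4-pixel-wide one. A minimal sketch of how a caller might dispatch through such a table is shown below; the wrapper name and the typedef name are illustrative, only the pointer signature is taken from the functions registered here.

#include <stdint.h>

/* Pointer type matching the registered functions (illustrative typedef name). */
typedef void (*chroma_mc_fn)(uint8_t *dst /*align 8*/, uint8_t *src /*align 1*/,
                             int stride, int h, int x, int y);

/* Hypothetical wrapper: pick the 8-wide or 4-wide kernel from a two-entry table.
 * mx/my are the fractional chroma MV components, each in the range [0,8). */
static void chroma_mc(chroma_mc_fn *tab, uint8_t *dst, uint8_t *src,
                      int stride, int h, int mx, int my, int is_4x_wide)
{
    tab[is_4x_wide ? 1 : 0](dst, src, stride, h, mx, my);
}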
libavcodec/i386/h264dsp_mmx.c
 
 
 #define H264_CHROMA_OP(S,D)
+#define H264_CHROMA_OP4(S,D,T)
 #define H264_CHROMA_MC8_TMPL put_h264_chroma_mc8_mmx
+#define H264_CHROMA_MC4_TMPL put_h264_chroma_mc4_mmx
+#define H264_CHROMA_MC8_MV0 put_pixels8_mmx
 #include "dsputil_h264_template_mmx.c"
 #undef H264_CHROMA_OP
+#undef H264_CHROMA_OP4
 #undef H264_CHROMA_MC8_TMPL
+#undef H264_CHROMA_MC4_TMPL
+#undef H264_CHROMA_MC8_MV0
 
 #define H264_CHROMA_OP(S,D) "pavgb " #S ", " #D " \n\t"
+#define H264_CHROMA_OP4(S,D,T) "movd  " #S ", " #T " \n\t"\
+                               "pavgb " #T ", " #D " \n\t"
 #define H264_CHROMA_MC8_TMPL avg_h264_chroma_mc8_mmx2
+#define H264_CHROMA_MC4_TMPL avg_h264_chroma_mc4_mmx2
+#define H264_CHROMA_MC8_MV0 avg_pixels8_mmx2
 #include "dsputil_h264_template_mmx.c"
 #undef H264_CHROMA_OP
+#undef H264_CHROMA_OP4
 #undef H264_CHROMA_MC8_TMPL
+#undef H264_CHROMA_MC4_TMPL
+#undef H264_CHROMA_MC8_MV0
 
 #define H264_CHROMA_OP(S,D) "pavgusb " #S ", " #D " \n\t"
+#define H264_CHROMA_OP4(S,D,T) "movd " #S ", " #T " \n\t"\
+                               "pavgusb " #T ", " #D " \n\t"
 #define H264_CHROMA_MC8_TMPL avg_h264_chroma_mc8_3dnow
+#define H264_CHROMA_MC4_TMPL avg_h264_chroma_mc4_3dnow
+#define H264_CHROMA_MC8_MV0 avg_pixels8_3dnow
 #include "dsputil_h264_template_mmx.c"
 #undef H264_CHROMA_OP
+#undef H264_CHROMA_OP4
 #undef H264_CHROMA_MC8_TMPL
+#undef H264_CHROMA_MC4_TMPL
+#undef H264_CHROMA_MC8_MV0
 
 /***********************************/
 /* weighted prediction */
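
Each block above configures the template (the store operation, the function names, the mv=(0,0) helper), includes it once, then undefines everything for the next variant. The only difference between the put and avg instantiations is H264_CHROMA_OP/H264_CHROMA_OP4: empty for put, and a rounded average against the bytes already in dst for avg, since pavgb (MMX2) and pavgusb (3DNow!) both compute (a + b + 1) >> 1 per byte. A small self-contained check of that rounding behaviour (illustrative code, not part of the revision):

#include <stdint.h>
#include <stdio.h>

/* Same rounding as pavgb/pavgusb: byte-wise (a + b + 1) >> 1. */
static uint8_t avg_round(uint8_t a, uint8_t b)
{
    return (uint8_t)((a + b + 1) >> 1);
}

int main(void)
{
    /* prints "1 255 15" */
    printf("%d %d %d\n", avg_round(0, 1), avg_round(254, 255), avg_round(10, 20));
    return 0;
}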
