Revision a6624e21 libavcodec/i386/dsputil_h264_template_mmx.c

View differences:

libavcodec/i386/dsputil_h264_template_mmx.c
@@ -1,5 +1,6 @@
 /*
- * Copyright (c) 2005 Zoltan Hidvegi <hzoli -a- hzoli -d- com>
+ * Copyright (c) 2005 Zoltan Hidvegi <hzoli -a- hzoli -d- com>,
+ *                    Loren Merritt
  *
  * This library is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
@@ -18,22 +19,138 @@
 
 /**
  * MMX optimized version of (put|avg)_h264_chroma_mc8.
- * H264_CHROMA_MC8_TMPL must be defined to the desired function name and
- * H264_CHROMA_OP must be defined to empty for put and pavgb/pavgusb for avg.
+ * H264_CHROMA_MC8_TMPL must be defined to the desired function name
+ * H264_CHROMA_OP must be defined to empty for put and pavgb/pavgusb for avg
+ * H264_CHROMA_MC8_MV0 must be defined to a (put|avg)_pixels8 function
  */
 static void H264_CHROMA_MC8_TMPL(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y)
 {
     uint64_t AA __align8;
     uint64_t DD __align8;
-    unsigned long srcos = (long)src & 7;
-    uint64_t sh1 __align8 = srcos * 8;
-    uint64_t sh2 __align8 = 56 - sh1;
     int i;
 
+    if(y==0 && x==0) {
+        /* no filter needed */
+        H264_CHROMA_MC8_MV0(dst, src, stride, h);
+        return;
+    }
+
     assert(x<8 && y<8 && x>=0 && y>=0);
 
-    asm volatile("movd %1, %%mm4\n\t"
-                 "movd %2, %%mm6\n\t"
+    if(y==0)
+    {
+        /* horizontal filter only */
+        asm volatile("movd %0, %%mm5\n\t"
+                     "punpcklwd %%mm5, %%mm5\n\t"
+                     "punpckldq %%mm5, %%mm5\n\t" /* mm5 = B = x */
+                     "movq %1, %%mm4\n\t"
+                     "pxor %%mm7, %%mm7\n\t"
+                     "psubw %%mm5, %%mm4\n\t"     /* mm4 = A = 8-x */
+                     : : "rm" (x), "m" (ff_pw_8));
+
+        for(i=0; i<h; i++) {
+            asm volatile(
+                /* mm0 = src[0..7], mm1 = src[1..8] */
+                "movq %0, %%mm0\n\t"
+                "movq %1, %%mm1\n\t"
+                : : "m" (src[0]), "m" (src[1]));
+
+            asm volatile(
+                /* [mm2,mm3] = A * src[0..7] */
+                "movq %%mm0, %%mm2\n\t"
+                "punpcklbw %%mm7, %%mm2\n\t"
+                "pmullw %%mm4, %%mm2\n\t"
+                "movq %%mm0, %%mm3\n\t"
+                "punpckhbw %%mm7, %%mm3\n\t"
+                "pmullw %%mm4, %%mm3\n\t"
+
+                /* [mm2,mm3] += B * src[1..8] */
+                "movq %%mm1, %%mm0\n\t"
+                "punpcklbw %%mm7, %%mm0\n\t"
+                "pmullw %%mm5, %%mm0\n\t"
+                "punpckhbw %%mm7, %%mm1\n\t"
+                "pmullw %%mm5, %%mm1\n\t"
+                "paddw %%mm0, %%mm2\n\t"
+                "paddw %%mm1, %%mm3\n\t"
+
+                /* dst[0..7] = pack(([mm2,mm3] + 32) >> 6) */
+                "paddw %1, %%mm2\n\t"
+                "paddw %1, %%mm3\n\t"
+                "psrlw $3, %%mm2\n\t"
+                "psrlw $3, %%mm3\n\t"
+                "packuswb %%mm3, %%mm2\n\t"
+                H264_CHROMA_OP(%0, %%mm2)
+                "movq %%mm2, %0\n\t"
+                : "=m" (dst[0]) : "m" (ff_pw_4));
+
+            src += stride;
+            dst += stride;
+        }
+        return;
+    }
+
+    if(x==0)
+    {
+        /* vertical filter only */
+        asm volatile("movd %0, %%mm6\n\t"
+                     "punpcklwd %%mm6, %%mm6\n\t"
+                     "punpckldq %%mm6, %%mm6\n\t" /* mm6 = C = y */
+                     "movq %1, %%mm4\n\t"
+                     "pxor %%mm7, %%mm7\n\t"
+                     "psubw %%mm6, %%mm4\n\t"     /* mm4 = A = 8-y */
+                     : : "rm" (y), "m" (ff_pw_8));
+
+        asm volatile(
+            /* mm0 = src[0..7] */
+            "movq %0, %%mm0\n\t"
+            : : "m" (src[0]));
+
+        for(i=0; i<h; i++) {
+            asm volatile(
+                /* [mm2,mm3] = A * src[0..7] */
+                "movq %mm0, %mm2\n\t"
+                "punpcklbw %mm7, %mm2\n\t"
+                "pmullw %mm4, %mm2\n\t"
+                "movq %mm0, %mm3\n\t"
+                "punpckhbw %mm7, %mm3\n\t"
+                "pmullw %mm4, %mm3\n\t");
+
+            src += stride;
+            asm volatile(
+                /* mm0 = src[0..7] */
+                "movq %0, %%mm0\n\t"
+                : : "m" (src[0]));
+
+            asm volatile(
+                /* [mm2,mm3] += C * src[0..7] */
+                "movq %mm0, %mm1\n\t"
+                "punpcklbw %mm7, %mm1\n\t"
+                "pmullw %mm6, %mm1\n\t"
+                "paddw %mm1, %mm2\n\t"
+                "movq %mm0, %mm5\n\t"
+                "punpckhbw %mm7, %mm5\n\t"
+                "pmullw %mm6, %mm5\n\t"
+                "paddw %mm5, %mm3\n\t");
+
+            asm volatile(
+                /* dst[0..7] = pack(([mm2,mm3] + 32) >> 6) */
+                "paddw %1, %%mm2\n\t"
+                "paddw %1, %%mm3\n\t"
+                "psrlw $3, %%mm2\n\t"
+                "psrlw $3, %%mm3\n\t"
+                "packuswb %%mm3, %%mm2\n\t"
+                H264_CHROMA_OP(%0, %%mm2)
+                "movq %%mm2, %0\n\t"
+                : "=m" (dst[0]) : "m" (ff_pw_4));
+
+            dst += stride;
+        }
+        return;
+    }
+
+    /* general case, bilinear */
+    asm volatile("movd %2, %%mm4\n\t"
+                 "movd %3, %%mm6\n\t"
                  "punpcklwd %%mm4, %%mm4\n\t"
                  "punpcklwd %%mm6, %%mm6\n\t"
                  "punpckldq %%mm4, %%mm4\n\t" /* mm4 = x words */
@@ -44,29 +161,20 @@
                  "psllw $3, %%mm6\n\t"
                  "movq %%mm5, %%mm7\n\t"
                  "paddw %%mm6, %%mm7\n\t"
-                 "movq %%mm4, %0\n\t"         /* DD = x * y */
+                 "movq %%mm4, %1\n\t"         /* DD = x * y */
                  "psubw %%mm4, %%mm5\n\t"     /* mm5 = B = 8x - xy */
                  "psubw %%mm4, %%mm6\n\t"     /* mm6 = C = 8y - xy */
-                 "paddw %3, %%mm4\n\t"
+                 "paddw %4, %%mm4\n\t"
                  "psubw %%mm7, %%mm4\n\t"     /* mm4 = A = xy - (8x+8y) + 64 */
                  "pxor %%mm7, %%mm7\n\t"
-                 : "=m" (DD) : "rm" (x), "rm" (y), "m" (ff_pw_64));
+                 "movq %%mm4, %0\n\t"
+                 : "=m" (AA), "=m" (DD) : "rm" (x), "rm" (y), "m" (ff_pw_64));
 
-    asm volatile("movq %%mm4, %0" : "=m" (AA));
-
-    src -= srcos;
     asm volatile(
         /* mm0 = src[0..7], mm1 = src[1..8] */
-        "movq %0, %%mm1\n\t"
-        "movq %1, %%mm0\n\t"
-        "psrlq %2, %%mm1\n\t"
-        "psllq %3, %%mm0\n\t"
-        "movq %%mm0, %%mm4\n\t"
-        "psllq $8, %%mm0\n\t"
-        "por %%mm1, %%mm0\n\t"
-        "psrlq $8, %%mm1\n\t"
-        "por %%mm4, %%mm1\n\t"
-        : : "m" (src[0]), "m" (src[8]), "m" (sh1), "m" (sh2));
+        "movq %0, %%mm0\n\t"
+        "movq %1, %%mm1\n\t"
+        : : "m" (src[0]), "m" (src[1]));
 
     for(i=0; i<h; i++) {
         asm volatile(
@@ -91,16 +199,9 @@
         src += stride;
         asm volatile(
             /* mm0 = src[0..7], mm1 = src[1..8] */
-            "movq %0, %%mm1\n\t"
-            "movq %1, %%mm0\n\t"
-            "psrlq %2, %%mm1\n\t"
-            "psllq %3, %%mm0\n\t"
-            "movq %%mm0, %%mm4\n\t"
-            "psllq $8, %%mm0\n\t"
-            "por %%mm1, %%mm0\n\t"
-            "psrlq $8, %%mm1\n\t"
-            "por %%mm4, %%mm1\n\t"
-            : : "m" (src[0]), "m" (src[8]), "m" (sh1), "m" (sh2));
+            "movq %0, %%mm0\n\t"
+            "movq %1, %%mm1\n\t"
+            : : "m" (src[0]), "m" (src[1]));
 
         asm volatile(
             /* [mm2,mm3] += C *  src[0..7] */
@@ -138,3 +239,83 @@
         dst+= stride;
     }
 }
+
+static void H264_CHROMA_MC4_TMPL(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y)
+{
+    uint64_t AA __align8;
+    uint64_t DD __align8;
+    int i;
+
+    /* no special case for mv=(0,0) in 4x*, since it's much less common than in 8x*.
+     * could still save a few cycles, but maybe not worth the complexity. */
+
+    assert(x<8 && y<8 && x>=0 && y>=0);
+
+    asm volatile("movd %2, %%mm4\n\t"
+                 "movd %3, %%mm6\n\t"
+                 "punpcklwd %%mm4, %%mm4\n\t"
+                 "punpcklwd %%mm6, %%mm6\n\t"
+                 "punpckldq %%mm4, %%mm4\n\t" /* mm4 = x words */
+                 "punpckldq %%mm6, %%mm6\n\t" /* mm6 = y words */
+                 "movq %%mm4, %%mm5\n\t"
+                 "pmullw %%mm6, %%mm4\n\t"    /* mm4 = x * y */
+                 "psllw $3, %%mm5\n\t"
+                 "psllw $3, %%mm6\n\t"
+                 "movq %%mm5, %%mm7\n\t"
+                 "paddw %%mm6, %%mm7\n\t"
+                 "movq %%mm4, %1\n\t"         /* DD = x * y */
+                 "psubw %%mm4, %%mm5\n\t"     /* mm5 = B = 8x - xy */
+                 "psubw %%mm4, %%mm6\n\t"     /* mm6 = C = 8y - xy */
+                 "paddw %4, %%mm4\n\t"
+                 "psubw %%mm7, %%mm4\n\t"     /* mm4 = A = xy - (8x+8y) + 64 */
+                 "pxor %%mm7, %%mm7\n\t"
+                 "movq %%mm4, %0\n\t"
+                 : "=m" (AA), "=m" (DD) : "rm" (x), "rm" (y), "m" (ff_pw_64));
+
+    asm volatile(
+        /* mm0 = src[0..3], mm1 = src[1..4] */
+        "movd %0, %%mm0\n\t"
+        "movd %1, %%mm1\n\t"
+        "punpcklbw %%mm7, %%mm0\n\t"
+        "punpcklbw %%mm7, %%mm1\n\t"
+        : : "m" (src[0]), "m" (src[1]));
+
+    for(i=0; i<h; i++) {
+        asm volatile(
+            /* mm2 = A * src[0..3] + B * src[1..4] */
+            "movq %%mm0, %%mm2\n\t"
+            "pmullw %0, %%mm2\n\t"
+            "pmullw %%mm5, %%mm1\n\t"
+            "paddw %%mm1, %%mm2\n\t"
+            : : "m" (AA));
+
+        src += stride;
+        asm volatile(
+            /* mm0 = src[0..3], mm1 = src[1..4] */
+            "movd %0, %%mm0\n\t"
+            "movd %1, %%mm1\n\t"
+            "punpcklbw %%mm7, %%mm0\n\t"
+            "punpcklbw %%mm7, %%mm1\n\t"
+            : : "m" (src[0]), "m" (src[1]));
+
+        asm volatile(
+            /* mm2 += C * src[0..3] + D * src[1..4] */
+            "movq %%mm0, %%mm3\n\t"
+            "movq %%mm1, %%mm4\n\t"
+            "pmullw %%mm6, %%mm3\n\t"
+            "pmullw %0, %%mm4\n\t"
+            "paddw %%mm3, %%mm2\n\t"
+            "paddw %%mm4, %%mm2\n\t"
+            : : "m" (DD));
+
+        asm volatile(
+            /* dst[0..3] = pack((mm2 + 32) >> 6) */
+            "paddw %1, %%mm2\n\t"
+            "psrlw $6, %%mm2\n\t"
+            "packuswb %%mm7, %%mm2\n\t"
+            H264_CHROMA_OP4(%0, %%mm2, %%mm3)
+            "movd %%mm2, %0\n\t"
+            : "=m" (dst[0]) : "m" (ff_pw_32));
+        dst += stride;
+    }
+}
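
Note on how this template is consumed (not part of the revision above): as the updated header comment says, the including file is expected to define H264_CHROMA_MC8_TMPL, H264_CHROMA_OP and H264_CHROMA_MC8_MV0 before #include-ing this file, and the MC4 code added here uses an H264_CHROMA_MC4_TMPL / H264_CHROMA_OP4 pair in the same way. The sketch below shows a minimal "put" instantiation; the wrapper names and the mv=(0,0) helper are chosen purely for illustration and are not taken from this revision.

    /* Hypothetical "put" instantiation of the template. H264_CHROMA_OP and
     * H264_CHROMA_OP4 expand to nothing for put; an "avg" build would make
     * them expand to a pavgb/pavgusb blend instead. */
    #define H264_CHROMA_OP(S,D)
    #define H264_CHROMA_OP4(S,D,T)
    #define H264_CHROMA_MC8_TMPL put_h264_chroma_mc8_mmx   /* illustrative name */
    #define H264_CHROMA_MC4_TMPL put_h264_chroma_mc4_mmx   /* illustrative name */
    #define H264_CHROMA_MC8_MV0  put_pixels8_mmx           /* assumed mv=(0,0) helper */
    #include "dsputil_h264_template_mmx.c"
    #undef H264_CHROMA_OP
    #undef H264_CHROMA_OP4
    #undef H264_CHROMA_MC8_TMPL
    #undef H264_CHROMA_MC4_TMPL
    #undef H264_CHROMA_MC8_MV0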

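For reference, the weights built by the setup code (A = xy - (8x+8y) + 64, B = 8x - xy, C = 8y - xy, D = xy, per the asm comments) are the standard H.264 eighth-pel chroma interpolation weights A = (8-x)(8-y), B = x(8-y), C = (8-x)y, D = xy; they always sum to 64 (e.g. x=2, y=3 gives A=30, B=10, C=18, D=6), which is why the bilinear paths round with +32 and a shift of 6. A plain-C sketch of what one call computes in the 8-wide general case follows; the function and variable names are illustrative, not from the codebase.

    #include <stdint.h>

    /* Non-SIMD reference for the general bilinear 8-wide chroma MC case. */
    static void chroma_mc8_ref(uint8_t *dst, const uint8_t *src,
                               int stride, int h, int x, int y)
    {
        const int A = (8 - x) * (8 - y);  /* == xy - (8x + 8y) + 64 */
        const int B = x * (8 - y);        /* == 8x - xy */
        const int C = (8 - x) * y;        /* == 8y - xy */
        const int D = x * y;              /* the value the MMX code stores in DD */
        int i, j;

        for (i = 0; i < h; i++) {
            for (j = 0; j < 8; j++)
                dst[j] = (A * src[j]          + B * src[j + 1] +
                          C * src[j + stride] + D * src[j + stride + 1] + 32) >> 6;
            src += stride;
            dst += stride;
        }
    }

When y==0 or x==0 two of the weights vanish and the remaining pair sums to 8, which is why the horizontal-only and vertical-only fast paths in the diff round with +4 (ff_pw_4) and a 3-bit shift instead.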