ffmpeg / libavcodec / i386 / dsputil_h264_template_mmx.c @ 5509bffa
/*
 * Copyright (c) 2005 Zoltan Hidvegi <hzoli -a- hzoli -d- com>,
 *                    Loren Merritt
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

/**
 * MMX optimized version of (put|avg)_h264_chroma_mc8.
 * H264_CHROMA_MC8_TMPL must be defined to the desired function name
 * H264_CHROMA_OP must be defined to empty for put and pavgb/pavgusb for avg
 * H264_CHROMA_MC8_MV0 must be defined to a (put|avg)_pixels8 function
 */
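
/* For reference, all three MMX paths below implement the H.264 chroma
 * bilinear interpolation.  A minimal scalar sketch of the general x/y case
 * follows (illustrative only, kept out of the build; the function name is
 * hypothetical, not part of this template):
 */
#if 0
static void ref_h264_chroma_mc8(uint8_t *dst, uint8_t *src, int stride,
                                int h, int x, int y)
{
    /* bilinear weights; A+B+C+D == 64, so the sum is renormalized
     * by adding 32 and shifting right by 6 */
    const int A = (8-x)*(8-y);
    const int B =    x *(8-y);
    const int C = (8-x)*   y;
    const int D =    x *   y;
    int i, j;

    for(i=0; i<h; i++) {
        for(j=0; j<8; j++)
            dst[j] = (A*src[j]        + B*src[j+1]
                    + C*src[j+stride] + D*src[j+stride+1] + 32) >> 6;
        dst += stride;
        src += stride;
    }
}
#endif
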
static void H264_CHROMA_MC8_TMPL(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y)
{
    uint64_t AA __align8;
    uint64_t DD __align8;
    int i;

    if(y==0 && x==0) {
        /* no filter needed */
        H264_CHROMA_MC8_MV0(dst, src, stride, h);
        return;
    }

    assert(x<8 && y<8 && x>=0 && y>=0);

    if(y==0)
    {
        /* horizontal filter only */
        asm volatile("movd %0, %%mm5\n\t"
                     "punpcklwd %%mm5, %%mm5\n\t"
                     "punpckldq %%mm5, %%mm5\n\t" /* mm5 = B = x */
                     "movq %1, %%mm4\n\t"
                     "pxor %%mm7, %%mm7\n\t"
                     "psubw %%mm5, %%mm4\n\t"     /* mm4 = A = 8-x */
                     : : "rm" (x), "m" (ff_pw_8));

        for(i=0; i<h; i++) {
            asm volatile(
                /* mm0 = src[0..7], mm1 = src[1..8] */
                "movq %0, %%mm0\n\t"
                "movq %1, %%mm1\n\t"
                : : "m" (src[0]), "m" (src[1]));

            asm volatile(
                /* [mm2,mm3] = A * src[0..7] */
                "movq %%mm0, %%mm2\n\t"
                "punpcklbw %%mm7, %%mm2\n\t"
                "pmullw %%mm4, %%mm2\n\t"
                "movq %%mm0, %%mm3\n\t"
                "punpckhbw %%mm7, %%mm3\n\t"
                "pmullw %%mm4, %%mm3\n\t"

                /* [mm2,mm3] += B * src[1..8] */
                "movq %%mm1, %%mm0\n\t"
                "punpcklbw %%mm7, %%mm0\n\t"
                "pmullw %%mm5, %%mm0\n\t"
                "punpckhbw %%mm7, %%mm1\n\t"
                "pmullw %%mm5, %%mm1\n\t"
                "paddw %%mm0, %%mm2\n\t"
                "paddw %%mm1, %%mm3\n\t"

                /* dst[0..7] = pack(([mm2,mm3] + 4) >> 3); A+B == 8 */
                "paddw %1, %%mm2\n\t"
                "paddw %1, %%mm3\n\t"
                "psrlw $3, %%mm2\n\t"
                "psrlw $3, %%mm3\n\t"
                "packuswb %%mm3, %%mm2\n\t"
                H264_CHROMA_OP(%0, %%mm2)
                "movq %%mm2, %0\n\t"
                : "=m" (dst[0]) : "m" (ff_pw_4));

            src += stride;
            dst += stride;
        }
        return;
    }

    if(x==0)
    {
        /* vertical filter only */
        asm volatile("movd %0, %%mm6\n\t"
                     "punpcklwd %%mm6, %%mm6\n\t"
                     "punpckldq %%mm6, %%mm6\n\t" /* mm6 = C = y */
                     "movq %1, %%mm4\n\t"
                     "pxor %%mm7, %%mm7\n\t"
                     "psubw %%mm6, %%mm4\n\t"     /* mm4 = A = 8-y */
                     : : "rm" (y), "m" (ff_pw_8));

        asm volatile(
            /* mm0 = src[0..7] */
            "movq %0, %%mm0\n\t"
            : : "m" (src[0]));

        for(i=0; i<h; i++) {
            asm volatile(
                /* [mm2,mm3] = A * src[0..7] */
                "movq %%mm0, %%mm2\n\t"
                "punpcklbw %%mm7, %%mm2\n\t"
                "pmullw %%mm4, %%mm2\n\t"
                "movq %%mm0, %%mm3\n\t"
                "punpckhbw %%mm7, %%mm3\n\t"
                "pmullw %%mm4, %%mm3\n\t");

            src += stride;
            asm volatile(
                /* mm0 = src[0..7] */
                "movq %0, %%mm0\n\t"
                : : "m" (src[0]));

            asm volatile(
                /* [mm2,mm3] += C * src[0..7] */
                "movq %%mm0, %%mm1\n\t"
                "punpcklbw %%mm7, %%mm1\n\t"
                "pmullw %%mm6, %%mm1\n\t"
                "paddw %%mm1, %%mm2\n\t"
                "movq %%mm0, %%mm5\n\t"
                "punpckhbw %%mm7, %%mm5\n\t"
                "pmullw %%mm6, %%mm5\n\t"
                "paddw %%mm5, %%mm3\n\t");

            asm volatile(
                /* dst[0..7] = pack(([mm2,mm3] + 4) >> 3); A+C == 8 */
                "paddw %1, %%mm2\n\t"
                "paddw %1, %%mm3\n\t"
                "psrlw $3, %%mm2\n\t"
                "psrlw $3, %%mm3\n\t"
                "packuswb %%mm3, %%mm2\n\t"
                H264_CHROMA_OP(%0, %%mm2)
                "movq %%mm2, %0\n\t"
                : "=m" (dst[0]) : "m" (ff_pw_4));

            dst += stride;
        }
        return;
    }

    /* general case, bilinear */
    asm volatile("movd %2, %%mm4\n\t"
                 "movd %3, %%mm6\n\t"
                 "punpcklwd %%mm4, %%mm4\n\t"
                 "punpcklwd %%mm6, %%mm6\n\t"
                 "punpckldq %%mm4, %%mm4\n\t" /* mm4 = x words */
                 "punpckldq %%mm6, %%mm6\n\t" /* mm6 = y words */
                 "movq %%mm4, %%mm5\n\t"
                 "pmullw %%mm6, %%mm4\n\t"    /* mm4 = x * y */
                 "psllw $3, %%mm5\n\t"
                 "psllw $3, %%mm6\n\t"
                 "movq %%mm5, %%mm7\n\t"
                 "paddw %%mm6, %%mm7\n\t"
                 "movq %%mm4, %1\n\t"         /* DD = x * y */
                 "psubw %%mm4, %%mm5\n\t"     /* mm5 = B = 8x - xy */
                 "psubw %%mm4, %%mm6\n\t"     /* mm6 = C = 8y - xy */
                 "paddw %4, %%mm4\n\t"
                 "psubw %%mm7, %%mm4\n\t"     /* mm4 = A = xy - (8x+8y) + 64 */
                 "pxor %%mm7, %%mm7\n\t"
                 "movq %%mm4, %0\n\t"
                 : "=m" (AA), "=m" (DD) : "rm" (x), "rm" (y), "m" (ff_pw_64));

    asm volatile(
        /* mm0 = src[0..7], mm1 = src[1..8] */
        "movq %0, %%mm0\n\t"
        "movq %1, %%mm1\n\t"
        : : "m" (src[0]), "m" (src[1]));

    for(i=0; i<h; i++) {
        asm volatile(
            /* [mm2,mm3] = A * src[0..7] */
            "movq %%mm0, %%mm2\n\t"
            "punpcklbw %%mm7, %%mm2\n\t"
            "pmullw %0, %%mm2\n\t"
            "movq %%mm0, %%mm3\n\t"
            "punpckhbw %%mm7, %%mm3\n\t"
            "pmullw %0, %%mm3\n\t"

            /* [mm2,mm3] += B * src[1..8] */
            "movq %%mm1, %%mm0\n\t"
            "punpcklbw %%mm7, %%mm0\n\t"
            "pmullw %%mm5, %%mm0\n\t"
            "punpckhbw %%mm7, %%mm1\n\t"
            "pmullw %%mm5, %%mm1\n\t"
            "paddw %%mm0, %%mm2\n\t"
            "paddw %%mm1, %%mm3\n\t"
            : : "m" (AA));

        src += stride;
        asm volatile(
            /* mm0 = src[0..7], mm1 = src[1..8] */
            "movq %0, %%mm0\n\t"
            "movq %1, %%mm1\n\t"
            : : "m" (src[0]), "m" (src[1]));

        asm volatile(
            /* [mm2,mm3] += C * src[0..7] */
            "movq %%mm0, %%mm4\n\t"
            "punpcklbw %%mm7, %%mm4\n\t"
            "pmullw %%mm6, %%mm4\n\t"
            "paddw %%mm4, %%mm2\n\t"
            "movq %%mm0, %%mm4\n\t"
            "punpckhbw %%mm7, %%mm4\n\t"
            "pmullw %%mm6, %%mm4\n\t"
            "paddw %%mm4, %%mm3\n\t");

        asm volatile(
            /* [mm2,mm3] += D * src[1..8] */
            "movq %%mm1, %%mm4\n\t"
            "punpcklbw %%mm7, %%mm4\n\t"
            "pmullw %0, %%mm4\n\t"
            "paddw %%mm4, %%mm2\n\t"
            "movq %%mm1, %%mm4\n\t"
            "punpckhbw %%mm7, %%mm4\n\t"
            "pmullw %0, %%mm4\n\t"
            "paddw %%mm4, %%mm3\n\t"
            : : "m" (DD));

        asm volatile(
            /* dst[0..7] = pack(([mm2,mm3] + 32) >> 6) */
            "paddw %1, %%mm2\n\t"
            "paddw %1, %%mm3\n\t"
            "psrlw $6, %%mm2\n\t"
            "psrlw $6, %%mm3\n\t"
            "packuswb %%mm3, %%mm2\n\t"
            H264_CHROMA_OP(%0, %%mm2)
            "movq %%mm2, %0\n\t"
            : "=m" (dst[0]) : "m" (ff_pw_32));
        dst += stride;
    }
}

static void H264_CHROMA_MC4_TMPL(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y)
{
    uint64_t AA __align8;
    uint64_t DD __align8;
    int i;

    /* no special case for mv=(0,0) in 4x*, since it's much less common than in 8x*.
     * could still save a few cycles, but maybe not worth the complexity. */

    assert(x<8 && y<8 && x>=0 && y>=0);

    asm volatile("movd %2, %%mm4\n\t"
                 "movd %3, %%mm6\n\t"
                 "punpcklwd %%mm4, %%mm4\n\t"
                 "punpcklwd %%mm6, %%mm6\n\t"
                 "punpckldq %%mm4, %%mm4\n\t" /* mm4 = x words */
                 "punpckldq %%mm6, %%mm6\n\t" /* mm6 = y words */
                 "movq %%mm4, %%mm5\n\t"
                 "pmullw %%mm6, %%mm4\n\t"    /* mm4 = x * y */
                 "psllw $3, %%mm5\n\t"
                 "psllw $3, %%mm6\n\t"
                 "movq %%mm5, %%mm7\n\t"
                 "paddw %%mm6, %%mm7\n\t"
                 "movq %%mm4, %1\n\t"         /* DD = x * y */
                 "psubw %%mm4, %%mm5\n\t"     /* mm5 = B = 8x - xy */
                 "psubw %%mm4, %%mm6\n\t"     /* mm6 = C = 8y - xy */
                 "paddw %4, %%mm4\n\t"
                 "psubw %%mm7, %%mm4\n\t"     /* mm4 = A = xy - (8x+8y) + 64 */
                 "pxor %%mm7, %%mm7\n\t"
                 "movq %%mm4, %0\n\t"
                 : "=m" (AA), "=m" (DD) : "rm" (x), "rm" (y), "m" (ff_pw_64));

    asm volatile(
        /* mm0 = src[0..3], mm1 = src[1..4] */
        "movd %0, %%mm0\n\t"
        "movd %1, %%mm1\n\t"
        "punpcklbw %%mm7, %%mm0\n\t"
        "punpcklbw %%mm7, %%mm1\n\t"
        : : "m" (src[0]), "m" (src[1]));

    for(i=0; i<h; i++) {
        asm volatile(
            /* mm2 = A * src[0..3] + B * src[1..4] */
            "movq %%mm0, %%mm2\n\t"
            "pmullw %0, %%mm2\n\t"
            "pmullw %%mm5, %%mm1\n\t"
            "paddw %%mm1, %%mm2\n\t"
            : : "m" (AA));

        src += stride;
        asm volatile(
            /* mm0 = src[0..3], mm1 = src[1..4] */
            "movd %0, %%mm0\n\t"
            "movd %1, %%mm1\n\t"
            "punpcklbw %%mm7, %%mm0\n\t"
            "punpcklbw %%mm7, %%mm1\n\t"
            : : "m" (src[0]), "m" (src[1]));

        asm volatile(
            /* mm2 += C * src[0..3] + D * src[1..4] */
            "movq %%mm0, %%mm3\n\t"
            "movq %%mm1, %%mm4\n\t"
            "pmullw %%mm6, %%mm3\n\t"
            "pmullw %0, %%mm4\n\t"
            "paddw %%mm3, %%mm2\n\t"
            "paddw %%mm4, %%mm2\n\t"
            : : "m" (DD));

        asm volatile(
            /* dst[0..3] = pack((mm2 + 32) >> 6) */
            "paddw %1, %%mm2\n\t"
            "psrlw $6, %%mm2\n\t"
            "packuswb %%mm7, %%mm2\n\t"
            H264_CHROMA_OP4(%0, %%mm2, %%mm3)
            "movd %%mm2, %0\n\t"
            : "=m" (dst[0]) : "m" (ff_pw_32));
        dst += stride;
    }
}
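
/* This file is a template: it is #included once per desired variant, with the
 * macros defined by the includer as described in the header comment.  A sketch
 * of what an instantiation might look like -- illustrative only; the exact
 * function and pixel-helper names below are assumptions, not the real
 * includer's definitions (see dsputil_mmx.c in the FFmpeg tree):
 */
#if 0
/* put variants: H264_CHROMA_OP/OP4 expand to nothing, result is stored */
#define H264_CHROMA_OP(S, D)
#define H264_CHROMA_OP4(S, D, T)
#define H264_CHROMA_MC8_TMPL put_h264_chroma_mc8_mmx
#define H264_CHROMA_MC4_TMPL put_h264_chroma_mc4_mmx
#define H264_CHROMA_MC8_MV0  put_pixels8_mmx
#include "dsputil_h264_template_mmx.c"
#undef H264_CHROMA_OP
#undef H264_CHROMA_OP4
#undef H264_CHROMA_MC8_TMPL
#undef H264_CHROMA_MC4_TMPL
#undef H264_CHROMA_MC8_MV0

/* avg variants: blend the filtered result into dst with pavgb, as the
 * header comment prescribes; OP4 needs a scratch register to load dst */
#define H264_CHROMA_OP(S, D)     "pavgb " #S ", " #D "\n\t"
#define H264_CHROMA_OP4(S, D, T) "movd "  #S ", " #T "\n\t"\
                                 "pavgb " #T ", " #D "\n\t"
#define H264_CHROMA_MC8_TMPL avg_h264_chroma_mc8_mmx2
#define H264_CHROMA_MC4_TMPL avg_h264_chroma_mc4_mmx2
#define H264_CHROMA_MC8_MV0  avg_pixels8_mmx2
#include "dsputil_h264_template_mmx.c"
#endif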