ffmpeg / libavcodec / i386 / motion_est_mmx.c @ 8f2ab833

/*
 * MMX optimized motion estimation
 * Copyright (c) 2001 Fabrice Bellard.
 * Copyright (c) 2002-2004 Michael Niedermayer
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 *
 * mostly by Michael Niedermayer <michaelni@gmx.at>
 */
#include "../dsputil.h"
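
/* Rounding constants 0, 1 and 2, replicated across the four 16-bit
 * lanes of an MMX register; round_tab[1] rounds the /2 halfpel
 * averages, round_tab[2] the /4 xy2 averages. */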
static const __attribute__ ((aligned(8))) uint64_t round_tab[3]={
0x0000000000000000ULL,
0x0001000100010001ULL,
0x0002000200020002ULL,
};
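
/* All-ones byte pattern used by sad8_4_mmx2 to compensate for the
 * double upward rounding of chained pavgb instructions. */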
static __attribute__ ((aligned(8), unused)) uint64_t bone= 0x0101010101010101LL;
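
/* SAD of an 8-pixel-wide block, plain MMX.  |a-b| is built from the two
 * saturated differences psubusb(a,b) | psubusb(b,a), the bytes are then
 * widened against the zero register %%mm7 and accumulated as 16-bit
 * words in %%mm6.  The loop walks a negative offset in %%eax up towards
 * zero, handling two rows per iteration. */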
static inline void sad8_1_mmx(uint8_t *blk1, uint8_t *blk2, int stride, int h)
{
    int len= -(stride*h);
    asm volatile(
        ".balign 16                        \n\t"
        "1:                                \n\t"
        "movq (%1, %%eax), %%mm0        \n\t"
        "movq (%2, %%eax), %%mm2        \n\t"
        "movq (%2, %%eax), %%mm4        \n\t"
        "addl %3, %%eax                        \n\t"
        "psubusb %%mm0, %%mm2                \n\t"
        "psubusb %%mm4, %%mm0                \n\t"
        "movq (%1, %%eax), %%mm1        \n\t"
        "movq (%2, %%eax), %%mm3        \n\t"
        "movq (%2, %%eax), %%mm5        \n\t"
        "psubusb %%mm1, %%mm3                \n\t"
        "psubusb %%mm5, %%mm1                \n\t"
        "por %%mm2, %%mm0                \n\t"
        "por %%mm1, %%mm3                \n\t"
        "movq %%mm0, %%mm1                \n\t"
        "movq %%mm3, %%mm2                \n\t"
        "punpcklbw %%mm7, %%mm0                \n\t"
        "punpckhbw %%mm7, %%mm1                \n\t"
        "punpcklbw %%mm7, %%mm3                \n\t"
        "punpckhbw %%mm7, %%mm2                \n\t"
        "paddw %%mm1, %%mm0                \n\t"
        "paddw %%mm3, %%mm2                \n\t"
        "paddw %%mm2, %%mm0                \n\t"
        "paddw %%mm0, %%mm6                \n\t"
        "addl %3, %%eax                        \n\t"
        " js 1b                                \n\t"
        : "+a" (len)
        : "r" (blk1 - len), "r" (blk2 - len), "r" (stride)
    );
}
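
/* Same 8-wide SAD using the MMX2 psadbw instruction, which sums the
 * absolute byte differences of a whole quadword in a single step. */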
static inline void sad8_1_mmx2(uint8_t *blk1, uint8_t *blk2, int stride, int h)
{
    int len= -(stride*h);
    asm volatile(
        ".balign 16                        \n\t"
        "1:                                \n\t"
        "movq (%1, %%eax), %%mm0        \n\t"
        "movq (%2, %%eax), %%mm2        \n\t"
        "psadbw %%mm2, %%mm0                \n\t"
        "addl %3, %%eax                        \n\t"
        "movq (%1, %%eax), %%mm1        \n\t"
        "movq (%2, %%eax), %%mm3        \n\t"
        "psadbw %%mm1, %%mm3                \n\t"
        "paddw %%mm3, %%mm0                \n\t"
        "paddw %%mm0, %%mm6                \n\t"
        "addl %3, %%eax                        \n\t"
        " js 1b                                \n\t"
        : "+a" (len)
        : "r" (blk1 - len), "r" (blk2 - len), "r" (stride)
    );
}
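
/* SAD against the pavgb average of two source rows, used for the x2
 * (blk1, blk1+1) and y2 (blk1, blk1+stride) halfpel cases.  pavgb
 * rounds up, so the result may differ slightly from the C reference. */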
static inline void sad8_2_mmx2(uint8_t *blk1a, uint8_t *blk1b, uint8_t *blk2, int stride, int h)
{
    int len= -(stride*h);
    asm volatile(
        ".balign 16                        \n\t"
        "1:                                \n\t"
        "movq (%1, %%eax), %%mm0        \n\t"
        "movq (%2, %%eax), %%mm2        \n\t"
        "pavgb %%mm2, %%mm0                \n\t"
        "movq (%3, %%eax), %%mm2        \n\t"
        "psadbw %%mm2, %%mm0                \n\t"
        "addl %4, %%eax                        \n\t"
        "movq (%1, %%eax), %%mm1        \n\t"
        "movq (%2, %%eax), %%mm3        \n\t"
        "pavgb %%mm1, %%mm3                \n\t"
        "movq (%3, %%eax), %%mm1        \n\t"
        "psadbw %%mm1, %%mm3                \n\t"
        "paddw %%mm3, %%mm0                \n\t"
        "paddw %%mm0, %%mm6                \n\t"
        "addl %4, %%eax                        \n\t"
        " js 1b                                \n\t"
        : "+a" (len)
        : "r" (blk1a - len), "r" (blk1b - len), "r" (blk2 - len), "r" (stride)
    );
}
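
/* xy2 (halfpel in both directions) SAD for MMX2: two pavgb passes
 * approximate (a+b+c+d+2)/4; "bone" is subtracted between them to
 * roughly cancel the upward rounding bias of chaining two averages. */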
static inline void sad8_4_mmx2(uint8_t *blk1, uint8_t *blk2, int stride, int h)
{ //FIXME reuse src
    int len= -(stride*h);
    asm volatile(
        ".balign 16                        \n\t"
        "movq "MANGLE(bone)", %%mm5        \n\t"
        "1:                                \n\t"
        "movq (%1, %%eax), %%mm0        \n\t"
        "movq (%2, %%eax), %%mm2        \n\t"
        "movq 1(%1, %%eax), %%mm1        \n\t"
        "movq 1(%2, %%eax), %%mm3        \n\t"
        "pavgb %%mm2, %%mm0                \n\t"
        "pavgb %%mm1, %%mm3                \n\t"
        "psubusb %%mm5, %%mm3                \n\t"
        "pavgb %%mm3, %%mm0                \n\t"
        "movq (%3, %%eax), %%mm2        \n\t"
        "psadbw %%mm2, %%mm0                \n\t"
        "addl %4, %%eax                        \n\t"
        "movq (%1, %%eax), %%mm1        \n\t"
        "movq (%2, %%eax), %%mm3        \n\t"
        "movq 1(%1, %%eax), %%mm2        \n\t"
        "movq 1(%2, %%eax), %%mm4        \n\t"
        "pavgb %%mm3, %%mm1                \n\t"
        "pavgb %%mm4, %%mm2                \n\t"
        "psubusb %%mm5, %%mm2                \n\t"
        "pavgb %%mm1, %%mm2                \n\t"
        "movq (%3, %%eax), %%mm1        \n\t"
        "psadbw %%mm1, %%mm2                \n\t"
        "paddw %%mm2, %%mm0                \n\t"
        "paddw %%mm0, %%mm6                \n\t"
        "addl %4, %%eax                        \n\t"
        " js 1b                                \n\t"
        : "+a" (len)
        : "r" (blk1 - len), "r" (blk1 - len + stride), "r" (blk2 - len), "r" (stride)
    );
}
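
/* Plain-MMX version of the two-row average SAD: the average is formed
 * in 16-bit precision with the rounding constant preloaded in %%mm5,
 * which keeps the result bit-exact with the C reference. */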
static inline void sad8_2_mmx(uint8_t *blk1a, uint8_t *blk1b, uint8_t *blk2, int stride, int h)
{
    int len= -(stride*h);
    asm volatile(
        ".balign 16                        \n\t"
        "1:                                \n\t"
        "movq (%1, %%eax), %%mm0        \n\t"
        "movq (%2, %%eax), %%mm1        \n\t"
        "movq (%1, %%eax), %%mm2        \n\t"
        "movq (%2, %%eax), %%mm3        \n\t"
        "punpcklbw %%mm7, %%mm0                \n\t"
        "punpcklbw %%mm7, %%mm1                \n\t"
        "punpckhbw %%mm7, %%mm2                \n\t"
        "punpckhbw %%mm7, %%mm3                \n\t"
        "paddw %%mm0, %%mm1                \n\t"
        "paddw %%mm2, %%mm3                \n\t"
        "movq (%3, %%eax), %%mm4        \n\t"
        "movq (%3, %%eax), %%mm2        \n\t"
        "paddw %%mm5, %%mm1                \n\t"
        "paddw %%mm5, %%mm3                \n\t"
        "psrlw $1, %%mm1                \n\t"
        "psrlw $1, %%mm3                \n\t"
        "packuswb %%mm3, %%mm1                \n\t"
        "psubusb %%mm1, %%mm4                \n\t"
        "psubusb %%mm2, %%mm1                \n\t"
        "por %%mm4, %%mm1                \n\t"
        "movq %%mm1, %%mm0                \n\t"
        "punpcklbw %%mm7, %%mm0                \n\t"
        "punpckhbw %%mm7, %%mm1                \n\t"
        "paddw %%mm1, %%mm0                \n\t"
        "paddw %%mm0, %%mm6                \n\t"
        "addl %4, %%eax                        \n\t"
        " js 1b                                \n\t"
        : "+a" (len)
        : "r" (blk1a - len), "r" (blk1b - len), "r" (blk2 - len), "r" (stride)
    );
}
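
/* Plain-MMX xy2 SAD: the four neighbouring pixels are summed as 16-bit
 * words, round_tab[2] (preloaded in %%mm5) is added and the sum is
 * shifted right by 2 before taking the absolute difference. */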
static inline void sad8_4_mmx(uint8_t *blk1, uint8_t *blk2, int stride, int h)
{
    int len= -(stride*h);
    asm volatile(
        ".balign 16                        \n\t"
        "1:                                \n\t"
        "movq (%1, %%eax), %%mm0        \n\t"
        "movq (%2, %%eax), %%mm1        \n\t"
        "movq %%mm0, %%mm4                \n\t"
        "movq %%mm1, %%mm2                \n\t"
        "punpcklbw %%mm7, %%mm0                \n\t"
        "punpcklbw %%mm7, %%mm1                \n\t"
        "punpckhbw %%mm7, %%mm4                \n\t"
        "punpckhbw %%mm7, %%mm2                \n\t"
        "paddw %%mm1, %%mm0                \n\t"
        "paddw %%mm2, %%mm4                \n\t"
        "movq 1(%1, %%eax), %%mm2        \n\t"
        "movq 1(%2, %%eax), %%mm3        \n\t"
        "movq %%mm2, %%mm1                \n\t"
        "punpcklbw %%mm7, %%mm2                \n\t"
        "punpckhbw %%mm7, %%mm1                \n\t"
        "paddw %%mm0, %%mm2                \n\t"
        "paddw %%mm4, %%mm1                \n\t"
        "movq %%mm3, %%mm4                \n\t"
        "punpcklbw %%mm7, %%mm3                \n\t"
        "punpckhbw %%mm7, %%mm4                \n\t"
        "paddw %%mm3, %%mm2                \n\t"
        "paddw %%mm4, %%mm1                \n\t"
        "movq (%3, %%eax), %%mm3        \n\t"
        "movq (%3, %%eax), %%mm4        \n\t"
        "paddw %%mm5, %%mm2                \n\t"
        "paddw %%mm5, %%mm1                \n\t"
        "psrlw $2, %%mm2                \n\t"
        "psrlw $2, %%mm1                \n\t"
        "packuswb %%mm1, %%mm2                \n\t"
        "psubusb %%mm2, %%mm3                \n\t"
        "psubusb %%mm4, %%mm2                \n\t"
        "por %%mm3, %%mm2                \n\t"
        "movq %%mm2, %%mm0                \n\t"
        "punpcklbw %%mm7, %%mm0                \n\t"
        "punpckhbw %%mm7, %%mm2                \n\t"
        "paddw %%mm2, %%mm0                \n\t"
        "paddw %%mm0, %%mm6                \n\t"
        "addl %4, %%eax                        \n\t"
        " js 1b                                \n\t"
        : "+a" (len)
        : "r" (blk1 - len), "r" (blk1 - len + stride), "r" (blk2 - len), "r" (stride)
    );
}
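
/* Fold the four 16-bit partial sums in %%mm6 into a single scalar. */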
static inline int sum_mmx(void)
{
    int ret;
    asm volatile(
        "movq %%mm6, %%mm0                \n\t"
        "psrlq $32, %%mm6                \n\t"
        "paddw %%mm0, %%mm6                \n\t"
        "movq %%mm6, %%mm0                \n\t"
        "psrlq $16, %%mm6                \n\t"
        "paddw %%mm0, %%mm6                \n\t"
        "movd %%mm6, %0                        \n\t"
        : "=r" (ret)
    );
    return ret&0xFFFF;
}
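
/* With psadbw each iteration already produces a single 16-bit sum, so
 * the accumulated total can be read straight from the low word. */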
static inline int sum_mmx2(void)
{
    int ret;
    asm volatile(
        "movd %%mm6, %0                        \n\t"
        : "=r" (ret)
    );
    return ret;
}
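
/* Instantiate the public SAD entry points for one CPU flavour: 8x8 and
 * 16x16 blocks, each in plain, x2 (horizontal halfpel), y2 (vertical
 * halfpel) and xy2 variants.  %%mm7 is cleared for the unpacking steps,
 * %%mm6 is the accumulator, and the halfpel variants preload the
 * appropriate rounding constant into %%mm5. */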
#define PIX_SAD(suf)\
static int sad8_ ## suf(void *v, uint8_t *blk2, uint8_t *blk1, int stride, int h)\
{\
    assert(h==8);\
    asm volatile("pxor %%mm7, %%mm7                \n\t"\
                 "pxor %%mm6, %%mm6                \n\t":);\
\
    sad8_1_ ## suf(blk1, blk2, stride, 8);\
\
    return sum_ ## suf();\
}\
static int sad8_x2_ ## suf(void *v, uint8_t *blk2, uint8_t *blk1, int stride, int h)\
{\
    assert(h==8);\
    asm volatile("pxor %%mm7, %%mm7                \n\t"\
                 "pxor %%mm6, %%mm6                \n\t"\
                 "movq %0, %%mm5                \n\t"\
                 :: "m"(round_tab[1]) \
                 );\
\
    sad8_2_ ## suf(blk1, blk1+1, blk2, stride, 8);\
\
    return sum_ ## suf();\
}\
\
static int sad8_y2_ ## suf(void *v, uint8_t *blk2, uint8_t *blk1, int stride, int h)\
{\
    assert(h==8);\
    asm volatile("pxor %%mm7, %%mm7                \n\t"\
                 "pxor %%mm6, %%mm6                \n\t"\
                 "movq %0, %%mm5                \n\t"\
                 :: "m"(round_tab[1]) \
                 );\
\
    sad8_2_ ## suf(blk1, blk1+stride, blk2, stride, 8);\
\
    return sum_ ## suf();\
}\
\
static int sad8_xy2_ ## suf(void *v, uint8_t *blk2, uint8_t *blk1, int stride, int h)\
{\
    assert(h==8);\
    asm volatile("pxor %%mm7, %%mm7                \n\t"\
                 "pxor %%mm6, %%mm6                \n\t"\
                 "movq %0, %%mm5                \n\t"\
                 :: "m"(round_tab[2]) \
                 );\
\
    sad8_4_ ## suf(blk1, blk2, stride, 8);\
\
    return sum_ ## suf();\
}\
\
static int sad16_ ## suf(void *v, uint8_t *blk2, uint8_t *blk1, int stride, int h)\
{\
    asm volatile("pxor %%mm7, %%mm7                \n\t"\
                 "pxor %%mm6, %%mm6                \n\t":);\
\
    sad8_1_ ## suf(blk1  , blk2  , stride, h);\
    sad8_1_ ## suf(blk1+8, blk2+8, stride, h);\
\
    return sum_ ## suf();\
}\
static int sad16_x2_ ## suf(void *v, uint8_t *blk2, uint8_t *blk1, int stride, int h)\
{\
    asm volatile("pxor %%mm7, %%mm7                \n\t"\
                 "pxor %%mm6, %%mm6                \n\t"\
                 "movq %0, %%mm5                \n\t"\
                 :: "m"(round_tab[1]) \
                 );\
\
    sad8_2_ ## suf(blk1  , blk1+1, blk2  , stride, h);\
    sad8_2_ ## suf(blk1+8, blk1+9, blk2+8, stride, h);\
\
    return sum_ ## suf();\
}\
static int sad16_y2_ ## suf(void *v, uint8_t *blk2, uint8_t *blk1, int stride, int h)\
{\
    asm volatile("pxor %%mm7, %%mm7                \n\t"\
                 "pxor %%mm6, %%mm6                \n\t"\
                 "movq %0, %%mm5                \n\t"\
                 :: "m"(round_tab[1]) \
                 );\
\
    sad8_2_ ## suf(blk1  , blk1+stride,  blk2  , stride, h);\
    sad8_2_ ## suf(blk1+8, blk1+stride+8,blk2+8, stride, h);\
\
    return sum_ ## suf();\
}\
static int sad16_xy2_ ## suf(void *v, uint8_t *blk2, uint8_t *blk1, int stride, int h)\
{\
    asm volatile("pxor %%mm7, %%mm7                \n\t"\
                 "pxor %%mm6, %%mm6                \n\t"\
                 "movq %0, %%mm5                \n\t"\
                 :: "m"(round_tab[2]) \
                 );\
\
    sad8_4_ ## suf(blk1  , blk2  , stride, h);\
    sad8_4_ ## suf(blk1+8, blk2+8, stride, h);\
\
    return sum_ ## suf();\
}\

PIX_SAD(mmx)
PIX_SAD(mmx2)
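
/* Install the fastest available implementations into the DSPContext.
 * The MMX2 halfpel variants rely on pavgb rounding, so they are skipped
 * when CODEC_FLAG_BITEXACT demands results identical to the C code. */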
void dsputil_init_pix_mmx(DSPContext* c, AVCodecContext *avctx)
{
    if (mm_flags & MM_MMX) {
        c->pix_abs[0][0] = sad16_mmx;
        c->pix_abs[0][1] = sad16_x2_mmx;
        c->pix_abs[0][2] = sad16_y2_mmx;
        c->pix_abs[0][3] = sad16_xy2_mmx;
        c->pix_abs[1][0] = sad8_mmx;
        c->pix_abs[1][1] = sad8_x2_mmx;
        c->pix_abs[1][2] = sad8_y2_mmx;
        c->pix_abs[1][3] = sad8_xy2_mmx;

        c->sad[0]= sad16_mmx;
        c->sad[1]= sad8_mmx;
    }
    if (mm_flags & MM_MMXEXT) {
        c->pix_abs[0][0] = sad16_mmx2;
        c->pix_abs[1][0] = sad8_mmx2;

        c->sad[0]= sad16_mmx2;
        c->sad[1]= sad8_mmx2;

        if(!(avctx->flags & CODEC_FLAG_BITEXACT)){
            c->pix_abs[0][1] = sad16_x2_mmx2;
            c->pix_abs[0][2] = sad16_y2_mmx2;
            c->pix_abs[0][3] = sad16_xy2_mmx2;
            c->pix_abs[1][1] = sad8_x2_mmx2;
            c->pix_abs[1][2] = sad8_y2_mmx2;
            c->pix_abs[1][3] = sad8_xy2_mmx2;
        }
    }
}