/*
 * MMX optimized motion estimation
 * Copyright (c) 2001 Fabrice Bellard.
 * Copyright (c) 2002-2004 Michael Niedermayer
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 *
 * mostly by Michael Niedermayer <michaelni@gmx.at>
 */
22
#include "../dsputil.h"
23
#include "mmx.h"
24

    
25
/* Rounding constants for the halfpel-averaging SAD variants, replicated
 * into all four 16-bit lanes of an MMX register.  round_tab[1] (add 1,
 * shift right by 1) is loaded into %mm5 for the x2/y2 cases,
 * round_tab[2] (add 2, shift right by 2) for the xy2 case; see PIX_SAD. */
static const __attribute__ ((aligned(8))) uint64_t round_tab[3]={
0x0000000000000000ULL,
0x0001000100010001ULL,
0x0002000200020002ULL,
};
30

    
31
/* 0x01 in every byte.  Subtracted (saturating) from one intermediate
 * average before the final pavgb in sad8_4_mmx2 — NOTE(review): presumably
 * to compensate for pavgb's round-up bias when chaining two averages;
 * confirm against the bit-exact C reference.  attribute_used keeps the
 * symbol alive since it is referenced only via MANGLE() in inline asm. */
static attribute_used __attribute__ ((aligned(8))) uint64_t bone= 0x0101010101010101LL;
32

    
33
/*
 * Accumulate the SAD of an 8-pixel-wide, h-row block (blk1 vs blk2)
 * using plain MMX (no psadbw).  |a-b| is built per byte as
 * sat(a-b) | sat(b-a) with psubusb/por, then widened to words and
 * added into the running accumulator %mm6.  Caller contract (see
 * PIX_SAD): %mm6 cleared, %mm7 = 0 (used as the unpack zero).
 * Two rows are processed per iteration, so h must be even.
 */
static inline void sad8_1_mmx(uint8_t *blk1, uint8_t *blk2, int stride, int h)
{
    /* negative byte count; pointers are pre-biased by -len so the
       index register counts up toward 0 and `js` ends the loop */
    long len= -(stride*h);
    asm volatile(
        ".balign 16                     \n\t"
        "1:                             \n\t"
        "movq (%1, %%"REG_a"), %%mm0    \n\t" /* row of blk1 */
        "movq (%2, %%"REG_a"), %%mm2    \n\t" /* row of blk2 */
        "movq (%2, %%"REG_a"), %%mm4    \n\t" /* second copy of blk2 row */
        "add %3, %%"REG_a"              \n\t"
        "psubusb %%mm0, %%mm2           \n\t" /* sat(blk2 - blk1) */
        "psubusb %%mm4, %%mm0           \n\t" /* sat(blk1 - blk2) */
        "movq (%1, %%"REG_a"), %%mm1    \n\t" /* next row, same scheme */
        "movq (%2, %%"REG_a"), %%mm3    \n\t"
        "movq (%2, %%"REG_a"), %%mm5    \n\t"
        "psubusb %%mm1, %%mm3           \n\t"
        "psubusb %%mm5, %%mm1           \n\t"
        "por %%mm2, %%mm0               \n\t" /* |blk1 - blk2|, row 0 */
        "por %%mm1, %%mm3               \n\t" /* |blk1 - blk2|, row 1 */
        "movq %%mm0, %%mm1              \n\t"
        "movq %%mm3, %%mm2              \n\t"
        "punpcklbw %%mm7, %%mm0         \n\t" /* widen bytes to words (mm7 == 0) */
        "punpckhbw %%mm7, %%mm1         \n\t"
        "punpcklbw %%mm7, %%mm3         \n\t"
        "punpckhbw %%mm7, %%mm2         \n\t"
        "paddw %%mm1, %%mm0             \n\t"
        "paddw %%mm3, %%mm2             \n\t"
        "paddw %%mm2, %%mm0             \n\t"
        "paddw %%mm0, %%mm6             \n\t" /* accumulate partial sums */
        "add %3, %%"REG_a"              \n\t"
        " js 1b                         \n\t" /* loop while index < 0 */
        : "+a" (len)
        : "r" (blk1 - len), "r" (blk2 - len), "r" ((long)stride)
    );
}
68

    
69
/*
 * Accumulate the SAD of an 8-pixel-wide, h-row block using MMX2's
 * psadbw, which computes the whole per-row sum in one instruction.
 * Partial sums are added into %mm6; the caller clears %mm6 first and
 * reads the result with sum_mmx2().  Two rows per iteration, so h
 * must be even.
 */
static inline void sad8_1_mmx2(uint8_t *blk1, uint8_t *blk2, int stride, int h)
{
    /* negative byte count; see sad8_1_mmx for the loop idiom */
    long len= -(stride*h);
    asm volatile(
        ".balign 16                     \n\t"
        "1:                             \n\t"
        "movq (%1, %%"REG_a"), %%mm0    \n\t"
        "movq (%2, %%"REG_a"), %%mm2    \n\t"
        "psadbw %%mm2, %%mm0            \n\t" /* sum |blk1 - blk2| for this row */
        "add %3, %%"REG_a"              \n\t"
        "movq (%1, %%"REG_a"), %%mm1    \n\t"
        "movq (%2, %%"REG_a"), %%mm3    \n\t"
        "psadbw %%mm1, %%mm3            \n\t" /* next row */
        "paddw %%mm3, %%mm0             \n\t"
        "paddw %%mm0, %%mm6             \n\t" /* accumulate */
        "add %3, %%"REG_a"              \n\t"
        " js 1b                         \n\t"
        : "+a" (len)
        : "r" (blk1 - len), "r" (blk2 - len), "r" ((long)stride)
    );
}
90

    
91
/*
 * MMX2 SAD against the average of two predictors: compares blk2 with
 * pavgb(blk1a, blk1b).  Used for horizontal (blk1b = blk1a+1) and
 * vertical (blk1b = blk1a+stride) halfpel positions; pavgb rounds up,
 * which is why these variants are gated on !CODEC_FLAG_BITEXACT in
 * dsputil_init_pix_mmx.  Accumulates into %mm6; two rows per
 * iteration, so h must be even.
 */
static inline void sad8_2_mmx2(uint8_t *blk1a, uint8_t *blk1b, uint8_t *blk2, int stride, int h)
{
    long len= -(stride*h);
    asm volatile(
        ".balign 16                     \n\t"
        "1:                             \n\t"
        "movq (%1, %%"REG_a"), %%mm0    \n\t"
        "movq (%2, %%"REG_a"), %%mm2    \n\t"
        "pavgb %%mm2, %%mm0             \n\t" /* avg(blk1a, blk1b), rounded up */
        "movq (%3, %%"REG_a"), %%mm2    \n\t"
        "psadbw %%mm2, %%mm0            \n\t" /* SAD vs blk2 */
        "add %4, %%"REG_a"              \n\t"
        "movq (%1, %%"REG_a"), %%mm1    \n\t"
        "movq (%2, %%"REG_a"), %%mm3    \n\t"
        "pavgb %%mm1, %%mm3             \n\t" /* next row */
        "movq (%3, %%"REG_a"), %%mm1    \n\t"
        "psadbw %%mm1, %%mm3            \n\t"
        "paddw %%mm3, %%mm0             \n\t"
        "paddw %%mm0, %%mm6             \n\t" /* accumulate */
        "add %4, %%"REG_a"              \n\t"
        " js 1b                         \n\t"
        : "+a" (len)
        : "r" (blk1a - len), "r" (blk1b -len), "r" (blk2 - len), "r" ((long)stride)
    );
}
116

    
117
/*
 * MMX2 SAD for the xy2 (diagonal halfpel) position: compares blk2 with
 * an approximation of the 4-point average of blk1, blk1+1, blk1+stride
 * and blk1+stride+1 (%1 = row, %2 = row+stride, offsets 0 and 1).
 * The average is built from chained pavgb's; `bone` (0x01 per byte) is
 * subtracted between them — NOTE(review): presumably to counter pavgb's
 * double round-up, making the result closer to the bit-exact reference;
 * still not exact, hence the CODEC_FLAG_BITEXACT gate in the init code.
 * Accumulates into %mm6; two rows per iteration, so h must be even.
 */
static inline void sad8_4_mmx2(uint8_t *blk1, uint8_t *blk2, int stride, int h)
{ //FIXME reuse src
    long len= -(stride*h);
    asm volatile(
        ".balign 16                     \n\t"
        "movq "MANGLE(bone)", %%mm5     \n\t" /* 0x01 in every byte */
        "1:                             \n\t"
        "movq (%1, %%"REG_a"), %%mm0    \n\t" /* row[0..7]          */
        "movq (%2, %%"REG_a"), %%mm2    \n\t" /* (row+stride)[0..7] */
        "movq 1(%1, %%"REG_a"), %%mm1   \n\t" /* row[1..8]          */
        "movq 1(%2, %%"REG_a"), %%mm3   \n\t" /* (row+stride)[1..8] */
        "pavgb %%mm2, %%mm0             \n\t" /* vertical averages */
        "pavgb %%mm1, %%mm3             \n\t"
        "psubusb %%mm5, %%mm3           \n\t" /* rounding correction */
        "pavgb %%mm3, %%mm0             \n\t" /* horizontal average */
        "movq (%3, %%"REG_a"), %%mm2    \n\t"
        "psadbw %%mm2, %%mm0            \n\t" /* SAD vs blk2 */
        "add %4, %%"REG_a"              \n\t"
        "movq (%1, %%"REG_a"), %%mm1    \n\t" /* next row, same scheme */
        "movq (%2, %%"REG_a"), %%mm3    \n\t"
        "movq 1(%1, %%"REG_a"), %%mm2   \n\t"
        "movq 1(%2, %%"REG_a"), %%mm4   \n\t"
        "pavgb %%mm3, %%mm1             \n\t"
        "pavgb %%mm4, %%mm2             \n\t"
        "psubusb %%mm5, %%mm2           \n\t"
        "pavgb %%mm1, %%mm2             \n\t"
        "movq (%3, %%"REG_a"), %%mm1    \n\t"
        "psadbw %%mm1, %%mm2            \n\t"
        "paddw %%mm2, %%mm0             \n\t"
        "paddw %%mm0, %%mm6             \n\t" /* accumulate */
        "add %4, %%"REG_a"              \n\t"
        " js 1b                         \n\t"
        : "+a" (len)
        : "r" (blk1 - len), "r" (blk1 - len + stride), "r" (blk2 - len), "r" ((long)stride)
    );
}
153

    
154
/*
 * Plain-MMX SAD against the average of two predictors: compares blk2
 * with (blk1a + blk1b + round) >> 1, computed exactly in 16-bit
 * precision (no pavgb on plain MMX).  Caller contract (see PIX_SAD):
 * %mm6 cleared, %mm7 = 0, %mm5 = round_tab[1] (0x0001 per word).
 * One row per iteration.
 */
static inline void sad8_2_mmx(uint8_t *blk1a, uint8_t *blk1b, uint8_t *blk2, int stride, int h)
{
    long len= -(stride*h);
    asm volatile(
        ".balign 16                     \n\t"
        "1:                             \n\t"
        "movq (%1, %%"REG_a"), %%mm0    \n\t" /* blk1a row */
        "movq (%2, %%"REG_a"), %%mm1    \n\t" /* blk1b row */
        "movq (%1, %%"REG_a"), %%mm2    \n\t" /* copies for the high halves */
        "movq (%2, %%"REG_a"), %%mm3    \n\t"
        "punpcklbw %%mm7, %%mm0         \n\t" /* widen to words (mm7 == 0) */
        "punpcklbw %%mm7, %%mm1         \n\t"
        "punpckhbw %%mm7, %%mm2         \n\t"
        "punpckhbw %%mm7, %%mm3         \n\t"
        "paddw %%mm0, %%mm1             \n\t" /* blk1a + blk1b, low */
        "paddw %%mm2, %%mm3             \n\t" /* blk1a + blk1b, high */
        "movq (%3, %%"REG_a"), %%mm4    \n\t" /* blk2 row, two copies */
        "movq (%3, %%"REG_a"), %%mm2    \n\t"
        "paddw %%mm5, %%mm1             \n\t" /* + rounding constant */
        "paddw %%mm5, %%mm3             \n\t"
        "psrlw $1, %%mm1                \n\t" /* >> 1: rounded average */
        "psrlw $1, %%mm3                \n\t"
        "packuswb %%mm3, %%mm1          \n\t" /* back to bytes */
        "psubusb %%mm1, %%mm4           \n\t" /* |avg - blk2| via sat-sub/por */
        "psubusb %%mm2, %%mm1           \n\t"
        "por %%mm4, %%mm1               \n\t"
        "movq %%mm1, %%mm0              \n\t"
        "punpcklbw %%mm7, %%mm0         \n\t"
        "punpckhbw %%mm7, %%mm1         \n\t"
        "paddw %%mm1, %%mm0             \n\t"
        "paddw %%mm0, %%mm6             \n\t" /* accumulate */
        "add %4, %%"REG_a"              \n\t"
        " js 1b                         \n\t"
        : "+a" (len)
        : "r" (blk1a - len), "r" (blk1b -len), "r" (blk2 - len), "r" ((long)stride)
    );
}
191

    
192
/*
 * Plain-MMX SAD for the xy2 (diagonal halfpel) position: compares blk2
 * with the exact 4-point average (p00 + p01 + p10 + p11 + round) >> 2,
 * where %1 = blk1 row, %2 = blk1 row + stride, with byte offsets 0/1.
 * Everything is done in 16-bit precision, so this matches the bit-exact
 * reference (unlike the pavgb-based mmx2 variant).  Caller contract:
 * %mm6 cleared, %mm7 = 0, %mm5 = round_tab[2] (0x0002 per word).
 * One row per iteration.
 */
static inline void sad8_4_mmx(uint8_t *blk1, uint8_t *blk2, int stride, int h)
{
    long len= -(stride*h);
    asm volatile(
        ".balign 16                     \n\t"
        "1:                             \n\t"
        "movq (%1, %%"REG_a"), %%mm0    \n\t" /* row[0..7]          */
        "movq (%2, %%"REG_a"), %%mm1    \n\t" /* (row+stride)[0..7] */
        "movq %%mm0, %%mm4              \n\t"
        "movq %%mm1, %%mm2              \n\t"
        "punpcklbw %%mm7, %%mm0         \n\t" /* widen to words (mm7 == 0) */
        "punpcklbw %%mm7, %%mm1         \n\t"
        "punpckhbw %%mm7, %%mm4         \n\t"
        "punpckhbw %%mm7, %%mm2         \n\t"
        "paddw %%mm1, %%mm0             \n\t" /* p00 + p10, low  */
        "paddw %%mm2, %%mm4             \n\t" /* p00 + p10, high */
        "movq 1(%1, %%"REG_a"), %%mm2   \n\t" /* row[1..8]          */
        "movq 1(%2, %%"REG_a"), %%mm3   \n\t" /* (row+stride)[1..8] */
        "movq %%mm2, %%mm1              \n\t"
        "punpcklbw %%mm7, %%mm2         \n\t"
        "punpckhbw %%mm7, %%mm1         \n\t"
        "paddw %%mm0, %%mm2             \n\t" /* + p01 */
        "paddw %%mm4, %%mm1             \n\t"
        "movq %%mm3, %%mm4              \n\t"
        "punpcklbw %%mm7, %%mm3         \n\t"
        "punpckhbw %%mm7, %%mm4         \n\t"
        "paddw %%mm3, %%mm2             \n\t" /* + p11: 4-point sum */
        "paddw %%mm4, %%mm1             \n\t"
        "movq (%3, %%"REG_a"), %%mm3    \n\t" /* blk2 row, two copies */
        "movq (%3, %%"REG_a"), %%mm4    \n\t"
        "paddw %%mm5, %%mm2             \n\t" /* + rounding constant */
        "paddw %%mm5, %%mm1             \n\t"
        "psrlw $2, %%mm2                \n\t" /* >> 2: rounded 4-point average */
        "psrlw $2, %%mm1                \n\t"
        "packuswb %%mm1, %%mm2          \n\t" /* back to bytes */
        "psubusb %%mm2, %%mm3           \n\t" /* |avg - blk2| via sat-sub/por */
        "psubusb %%mm4, %%mm2           \n\t"
        "por %%mm3, %%mm2               \n\t"
        "movq %%mm2, %%mm0              \n\t"
        "punpcklbw %%mm7, %%mm0         \n\t"
        "punpckhbw %%mm7, %%mm2         \n\t"
        "paddw %%mm2, %%mm0             \n\t"
        "paddw %%mm0, %%mm6             \n\t" /* accumulate */
        "add %4, %%"REG_a"              \n\t"
        " js 1b                         \n\t"
        : "+a" (len)
        : "r" (blk1 - len), "r" (blk1 -len + stride), "r" (blk2 - len), "r" ((long)stride)
    );
}
241

    
242
/*
 * Finish a plain-MMX SAD: horizontally add the four 16-bit partial
 * sums accumulated in %mm6 and return the total.  Only the low 16
 * bits of the lane-wise paddw result are valid, hence the final mask.
 */
static inline int sum_mmx(void)
{
    int ret;
    asm volatile(
        "movq %%mm6, %%mm0              \n\t"
        "psrlq $32, %%mm6               \n\t"
        "paddw %%mm0, %%mm6             \n\t" /* fold high dword onto low */
        "movq %%mm6, %%mm0              \n\t"
        "psrlq $16, %%mm6               \n\t"
        "paddw %%mm0, %%mm6             \n\t" /* fold remaining two words */
        "movd %%mm6, %0                 \n\t"
        : "=r" (ret)
    );
    return ret&0xFFFF;
}
257

    
258
/*
 * Finish an MMX2 SAD: the psadbw-based helpers leave the whole sum in
 * the low 32 bits of %mm6, so a single movd retrieves it.
 */
static inline int sum_mmx2(void)
{
    int ret;
    asm volatile(
        "movd %%mm6, %0                 \n\t"
        : "=r" (ret)
    );
    return ret;
}
267

    
268

    
269
/*
 * Expand the full set of SAD entry points for one instruction-set
 * suffix ("mmx" or "mmx2").  Each generated function clears the
 * accumulator %mm6 and the zero constant %mm7, loads the appropriate
 * rounding constant into %mm5 where halfpel averaging is involved
 * (round_tab[1] for x2/y2, round_tab[2] for xy2), runs the matching
 * sad8_* helper(s) and collects the result with sum_<suf>().
 *
 * Variants per width (8 and 16 pixels): plain, x2 (horizontal
 * halfpel, second predictor blk1+1), y2 (vertical, blk1+stride),
 * xy2 (diagonal).  Note the argument order of the DSPContext
 * callback: blk2 comes before blk1; the 8-wide versions require h==8.
 */
#define PIX_SAD(suf)\
static int sad8_ ## suf(void *v, uint8_t *blk2, uint8_t *blk1, int stride, int h)\
{\
    assert(h==8);\
    asm volatile("pxor %%mm7, %%mm7     \n\t"\
                 "pxor %%mm6, %%mm6     \n\t":);\
\
    sad8_1_ ## suf(blk1, blk2, stride, 8);\
\
    return sum_ ## suf();\
}\
/* 8-wide, horizontal halfpel */\
static int sad8_x2_ ## suf(void *v, uint8_t *blk2, uint8_t *blk1, int stride, int h)\
{\
    assert(h==8);\
    asm volatile("pxor %%mm7, %%mm7     \n\t"\
                 "pxor %%mm6, %%mm6     \n\t"\
                 "movq %0, %%mm5        \n\t"\
                 :: "m"(round_tab[1]) \
                 );\
\
    sad8_2_ ## suf(blk1, blk1+1, blk2, stride, 8);\
\
    return sum_ ## suf();\
}\
\
/* 8-wide, vertical halfpel */\
static int sad8_y2_ ## suf(void *v, uint8_t *blk2, uint8_t *blk1, int stride, int h)\
{\
    assert(h==8);\
    asm volatile("pxor %%mm7, %%mm7     \n\t"\
                 "pxor %%mm6, %%mm6     \n\t"\
                 "movq %0, %%mm5        \n\t"\
                 :: "m"(round_tab[1]) \
                 );\
\
    sad8_2_ ## suf(blk1, blk1+stride, blk2, stride, 8);\
\
    return sum_ ## suf();\
}\
\
/* 8-wide, diagonal halfpel */\
static int sad8_xy2_ ## suf(void *v, uint8_t *blk2, uint8_t *blk1, int stride, int h)\
{\
    assert(h==8);\
    asm volatile("pxor %%mm7, %%mm7     \n\t"\
                 "pxor %%mm6, %%mm6     \n\t"\
                 "movq %0, %%mm5        \n\t"\
                 :: "m"(round_tab[2]) \
                 );\
\
    sad8_4_ ## suf(blk1, blk2, stride, 8);\
\
    return sum_ ## suf();\
}\
\
/* 16-wide versions run the 8-wide helper on each half of the block */\
static int sad16_ ## suf(void *v, uint8_t *blk2, uint8_t *blk1, int stride, int h)\
{\
    asm volatile("pxor %%mm7, %%mm7     \n\t"\
                 "pxor %%mm6, %%mm6     \n\t":);\
\
    sad8_1_ ## suf(blk1  , blk2  , stride, h);\
    sad8_1_ ## suf(blk1+8, blk2+8, stride, h);\
\
    return sum_ ## suf();\
}\
static int sad16_x2_ ## suf(void *v, uint8_t *blk2, uint8_t *blk1, int stride, int h)\
{\
    asm volatile("pxor %%mm7, %%mm7     \n\t"\
                 "pxor %%mm6, %%mm6     \n\t"\
                 "movq %0, %%mm5        \n\t"\
                 :: "m"(round_tab[1]) \
                 );\
\
    sad8_2_ ## suf(blk1  , blk1+1, blk2  , stride, h);\
    sad8_2_ ## suf(blk1+8, blk1+9, blk2+8, stride, h);\
\
    return sum_ ## suf();\
}\
static int sad16_y2_ ## suf(void *v, uint8_t *blk2, uint8_t *blk1, int stride, int h)\
{\
    asm volatile("pxor %%mm7, %%mm7     \n\t"\
                 "pxor %%mm6, %%mm6     \n\t"\
                 "movq %0, %%mm5        \n\t"\
                 :: "m"(round_tab[1]) \
                 );\
\
    sad8_2_ ## suf(blk1  , blk1+stride,  blk2  , stride, h);\
    sad8_2_ ## suf(blk1+8, blk1+stride+8,blk2+8, stride, h);\
\
    return sum_ ## suf();\
}\
static int sad16_xy2_ ## suf(void *v, uint8_t *blk2, uint8_t *blk1, int stride, int h)\
{\
    asm volatile("pxor %%mm7, %%mm7     \n\t"\
                 "pxor %%mm6, %%mm6     \n\t"\
                 "movq %0, %%mm5        \n\t"\
                 :: "m"(round_tab[2]) \
                 );\
\
    sad8_4_ ## suf(blk1  , blk2  , stride, h);\
    sad8_4_ ## suf(blk1+8, blk2+8, stride, h);\
\
    return sum_ ## suf();\
}
371

    
372
/* Instantiate the complete SAD function set for plain MMX and MMX2. */
PIX_SAD(mmx)
PIX_SAD(mmx2)
374

    
375
/*
 * Install the MMX/MMX2 SAD implementations into the DSPContext
 * function tables, based on the CPU capabilities in mm_flags.
 * pix_abs[0][*] are the 16-wide variants, pix_abs[1][*] the 8-wide
 * ones; index 0-3 selects fullpel / x2 / y2 / xy2.  The MMX block
 * runs first so that the MMX2 block can override selected entries.
 */
void dsputil_init_pix_mmx(DSPContext* c, AVCodecContext *avctx)
{
    if (mm_flags & MM_MMX) {
        c->sad[0]= sad16_mmx;
        c->sad[1]= sad8_mmx;

        c->pix_abs[0][0] = sad16_mmx;
        c->pix_abs[1][0] = sad8_mmx;
        c->pix_abs[0][1] = sad16_x2_mmx;
        c->pix_abs[1][1] = sad8_x2_mmx;
        c->pix_abs[0][2] = sad16_y2_mmx;
        c->pix_abs[1][2] = sad8_y2_mmx;
        c->pix_abs[0][3] = sad16_xy2_mmx;
        c->pix_abs[1][3] = sad8_xy2_mmx;
    }
    if (mm_flags & MM_MMXEXT) {
        /* fullpel SAD has identical results, so always take the
           faster psadbw versions */
        c->sad[0]= sad16_mmx2;
        c->sad[1]= sad8_mmx2;

        c->pix_abs[0][0] = sad16_mmx2;
        c->pix_abs[1][0] = sad8_mmx2;

        /* the pavgb-based halfpel variants are excluded when bit-exact
           output is requested (see the guard below) */
        if(!(avctx->flags & CODEC_FLAG_BITEXACT)){
            c->pix_abs[0][1] = sad16_x2_mmx2;
            c->pix_abs[1][1] = sad8_x2_mmx2;
            c->pix_abs[0][2] = sad16_y2_mmx2;
            c->pix_abs[1][2] = sad8_y2_mmx2;
            c->pix_abs[0][3] = sad16_xy2_mmx2;
            c->pix_abs[1][3] = sad8_xy2_mmx2;
        }
    }
}