ffmpeg / libavcodec / i386 / motion_est_mmx.c @ 07787186

/*
 * MMX optimized motion estimation
 * Copyright (c) 2001 Fabrice Bellard.
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 *
 * mostly by Michael Niedermayer <michaelni@gmx.at>
 */
#include "../dsputil.h"

static const __attribute__ ((aligned(8))) UINT64 round_tab[3]={
0x0000000000000000,
0x0001000100010001,
0x0002000200020002,
};
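
/*
 * Word-sized rounding biases for the plain-MMX averaging paths:
 * round_tab[1] is added before the >>1 of the half-pel average and
 * round_tab[2] before the >>2 of the 4-point (xy2) average. The MMX2
 * kernels round inside pavgb instead.
 */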

static __attribute__ ((aligned(8))) uint64_t bone= 0x0101010101010101LL;

static inline void sad8_mmx(UINT8 *blk1, UINT8 *blk2, int stride, int h)
{
    int len= -(stride<<h);
    asm volatile(
        ".balign 16                        \n\t"
        "1:                                \n\t"
        "movq (%1, %%eax), %%mm0        \n\t"
        "movq (%2, %%eax), %%mm2        \n\t"
        "movq (%2, %%eax), %%mm4        \n\t"
        "addl %3, %%eax                        \n\t"
        "psubusb %%mm0, %%mm2                \n\t"
        "psubusb %%mm4, %%mm0                \n\t"
        "movq (%1, %%eax), %%mm1        \n\t"
        "movq (%2, %%eax), %%mm3        \n\t"
        "movq (%2, %%eax), %%mm5        \n\t"
        "psubusb %%mm1, %%mm3                \n\t"
        "psubusb %%mm5, %%mm1                \n\t"
        "por %%mm2, %%mm0                \n\t"
        "por %%mm1, %%mm3                \n\t"
        "movq %%mm0, %%mm1                \n\t"
        "movq %%mm3, %%mm2                \n\t"
        "punpcklbw %%mm7, %%mm0                \n\t"
        "punpckhbw %%mm7, %%mm1                \n\t"
        "punpcklbw %%mm7, %%mm3                \n\t"
        "punpckhbw %%mm7, %%mm2                \n\t"
        "paddw %%mm1, %%mm0                \n\t"
        "paddw %%mm3, %%mm2                \n\t"
        "paddw %%mm2, %%mm0                \n\t"
        "paddw %%mm0, %%mm6                \n\t"
        "addl %3, %%eax                        \n\t"
        " js 1b                                \n\t"
        : "+a" (len)
        : "r" (blk1 - len), "r" (blk2 - len), "r" (stride)
    );
}
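
/*
 * A plain-C reference for what the sad8_* kernels accumulate (illustrative
 * sketch, not part of the original dispatch; assumes h is the log2 row
 * count used by the callers below: h=3 for 8 rows, h=4 for 16). The MMX
 * versions keep the running total in %mm6 instead of returning it.
 */
static inline int sad8_c_ref(UINT8 *blk1, UINT8 *blk2, int stride, int h)
{
    int sum= 0;
    int x, y;

    for(y= 0; y < (1<<h); y++){
        for(x= 0; x < 8; x++){
            int d= blk1[x] - blk2[x];
            sum += d < 0 ? -d : d;
        }
        blk1 += stride;
        blk2 += stride;
    }
    return sum;
}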

static inline void sad8_mmx2(UINT8 *blk1, UINT8 *blk2, int stride, int h)
{
    int len= -(stride<<h);
    asm volatile(
        ".balign 16                        \n\t"
        "1:                                \n\t"
        "movq (%1, %%eax), %%mm0        \n\t"
        "movq (%2, %%eax), %%mm2        \n\t"
        "psadbw %%mm2, %%mm0                \n\t"
        "addl %3, %%eax                        \n\t"
        "movq (%1, %%eax), %%mm1        \n\t"
        "movq (%2, %%eax), %%mm3        \n\t"
        "psadbw %%mm1, %%mm3                \n\t"
        "paddw %%mm3, %%mm0                \n\t"
        "paddw %%mm0, %%mm6                \n\t"
        "addl %3, %%eax                        \n\t"
        " js 1b                                \n\t"
        : "+a" (len)
        : "r" (blk1 - len), "r" (blk2 - len), "r" (stride)
    );
}
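
/*
 * psadbw (MMX2) computes the sum of absolute differences of eight byte
 * pairs in a single instruction, replacing the whole unpack/psubusb/por
 * sequence of the plain-MMX kernel above.
 */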

static inline void sad8_2_mmx2(UINT8 *blk1a, UINT8 *blk1b, UINT8 *blk2, int stride, int h)
{
    int len= -(stride<<h);
    asm volatile(
        ".balign 16                        \n\t"
        "1:                                \n\t"
        "movq (%1, %%eax), %%mm0        \n\t"
        "movq (%2, %%eax), %%mm2        \n\t"
        "pavgb %%mm2, %%mm0                \n\t"
        "movq (%3, %%eax), %%mm2        \n\t"
        "psadbw %%mm2, %%mm0                \n\t"
        "addl %4, %%eax                        \n\t"
        "movq (%1, %%eax), %%mm1        \n\t"
        "movq (%2, %%eax), %%mm3        \n\t"
        "pavgb %%mm1, %%mm3                \n\t"
        "movq (%3, %%eax), %%mm1        \n\t"
        "psadbw %%mm1, %%mm3                \n\t"
        "paddw %%mm3, %%mm0                \n\t"
        "paddw %%mm0, %%mm6                \n\t"
        "addl %4, %%eax                        \n\t"
        " js 1b                                \n\t"
        : "+a" (len)
        : "r" (blk1a - len), "r" (blk1b - len), "r" (blk2 - len), "r" (stride)
    );
}
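
/*
 * Half-pel SAD (MMX2): blk1a/blk1b are the two neighbouring reference
 * blocks (offset by one pixel for x2, one row for y2); pavgb forms their
 * rounded average, which is then SAD'd against blk2 with psadbw.
 */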

static inline void sad8_4_mmx2(UINT8 *blk1, UINT8 *blk2, int stride, int h)
{ //FIXME reuse src
    int len= -(stride<<h);
    asm volatile(
        ".balign 16                        \n\t"
        "movq "MANGLE(bone)", %%mm5        \n\t"
        "1:                                \n\t"
        "movq (%1, %%eax), %%mm0        \n\t"
        "movq (%2, %%eax), %%mm2        \n\t"
        "movq 1(%1, %%eax), %%mm1        \n\t"
        "movq 1(%2, %%eax), %%mm3        \n\t"
        "pavgb %%mm2, %%mm0                \n\t"
        "pavgb %%mm1, %%mm3                \n\t"
        "psubusb %%mm5, %%mm3                \n\t"
        "pavgb %%mm3, %%mm0                \n\t"
        "movq (%3, %%eax), %%mm2        \n\t"
        "psadbw %%mm2, %%mm0                \n\t"
        "addl %4, %%eax                        \n\t"
        "movq (%1, %%eax), %%mm1        \n\t"
        "movq (%2, %%eax), %%mm3        \n\t"
        "movq 1(%1, %%eax), %%mm2        \n\t"
        "movq 1(%2, %%eax), %%mm4        \n\t"
        "pavgb %%mm3, %%mm1                \n\t"
        "pavgb %%mm4, %%mm2                \n\t"
        "psubusb %%mm5, %%mm2                \n\t"
        "pavgb %%mm1, %%mm2                \n\t"
        "movq (%3, %%eax), %%mm1        \n\t"
        "psadbw %%mm1, %%mm2                \n\t"
        "paddw %%mm2, %%mm0                \n\t"
        "paddw %%mm0, %%mm6                \n\t"
        "addl %4, %%eax                        \n\t"
        " js 1b                                \n\t"
        : "+a" (len)
        : "r" (blk1 - len), "r" (blk1 - len + stride), "r" (blk2 - len), "r" (stride)
    );
}
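
/*
 * pavgb rounds towards +infinity, so chaining two averages would bias the
 * 4-point (xy2) interpolation upwards; subtracting bone (0x01 in every
 * byte) from one intermediate before the final pavgb compensates for most
 * of that, approximating (a+b+c+d+2)>>2.
 */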

static inline void sad8_2_mmx(UINT8 *blk1a, UINT8 *blk1b, UINT8 *blk2, int stride, int h)
{
    int len= -(stride<<h);
    asm volatile(
        ".balign 16                        \n\t"
        "1:                                \n\t"
        "movq (%1, %%eax), %%mm0        \n\t"
        "movq (%2, %%eax), %%mm1        \n\t"
        "movq (%1, %%eax), %%mm2        \n\t"
        "movq (%2, %%eax), %%mm3        \n\t"
        "punpcklbw %%mm7, %%mm0                \n\t"
        "punpcklbw %%mm7, %%mm1                \n\t"
        "punpckhbw %%mm7, %%mm2                \n\t"
        "punpckhbw %%mm7, %%mm3                \n\t"
        "paddw %%mm0, %%mm1                \n\t"
        "paddw %%mm2, %%mm3                \n\t"
        "movq (%3, %%eax), %%mm4        \n\t"
        "movq (%3, %%eax), %%mm2        \n\t"
        "paddw %%mm5, %%mm1                \n\t"
        "paddw %%mm5, %%mm3                \n\t"
        "psrlw $1, %%mm1                \n\t"
        "psrlw $1, %%mm3                \n\t"
        "packuswb %%mm3, %%mm1                \n\t"
        "psubusb %%mm1, %%mm4                \n\t"
        "psubusb %%mm2, %%mm1                \n\t"
        "por %%mm4, %%mm1                \n\t"
        "movq %%mm1, %%mm0                \n\t"
        "punpcklbw %%mm7, %%mm0                \n\t"
        "punpckhbw %%mm7, %%mm1                \n\t"
        "paddw %%mm1, %%mm0                \n\t"
        "paddw %%mm0, %%mm6                \n\t"
        "addl %4, %%eax                        \n\t"
        " js 1b                                \n\t"
        : "+a" (len)
        : "r" (blk1a - len), "r" (blk1b - len), "r" (blk2 - len), "r" (stride)
    );
}
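
/*
 * Plain-MMX version of the half-pel case: the average is computed at word
 * precision with the round_tab bias preloaded in %mm5, repacked to bytes,
 * and |avg - blk2| is formed with the unsigned-saturation trick
 * (x psubusb y) por (y psubusb x).
 */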

static inline void sad8_4_mmx(UINT8 *blk1, UINT8 *blk2, int stride, int h)
{
    int len= -(stride<<h);
    asm volatile(
        ".balign 16                        \n\t"
        "1:                                \n\t"
        "movq (%1, %%eax), %%mm0        \n\t"
        "movq (%2, %%eax), %%mm1        \n\t"
        "movq %%mm0, %%mm4                \n\t"
        "movq %%mm1, %%mm2                \n\t"
        "punpcklbw %%mm7, %%mm0                \n\t"
        "punpcklbw %%mm7, %%mm1                \n\t"
        "punpckhbw %%mm7, %%mm4                \n\t"
        "punpckhbw %%mm7, %%mm2                \n\t"
        "paddw %%mm1, %%mm0                \n\t"
        "paddw %%mm2, %%mm4                \n\t"
        "movq 1(%1, %%eax), %%mm2        \n\t"
        "movq 1(%2, %%eax), %%mm3        \n\t"
        "movq %%mm2, %%mm1                \n\t"
        "punpcklbw %%mm7, %%mm2                \n\t"
        "punpckhbw %%mm7, %%mm1                \n\t"
        "paddw %%mm0, %%mm2                \n\t"
        "paddw %%mm4, %%mm1                \n\t"
        "movq %%mm3, %%mm4                \n\t"
        "punpcklbw %%mm7, %%mm3                \n\t"
        "punpckhbw %%mm7, %%mm4                \n\t"
        "paddw %%mm3, %%mm2                \n\t"
        "paddw %%mm4, %%mm1                \n\t"
        "movq (%3, %%eax), %%mm3        \n\t"
        "movq (%3, %%eax), %%mm4        \n\t"
        "paddw %%mm5, %%mm2                \n\t"
        "paddw %%mm5, %%mm1                \n\t"
        "psrlw $2, %%mm2                \n\t"
        "psrlw $2, %%mm1                \n\t"
        "packuswb %%mm1, %%mm2                \n\t"
        "psubusb %%mm2, %%mm3                \n\t"
        "psubusb %%mm4, %%mm2                \n\t"
        "por %%mm3, %%mm2                \n\t"
        "movq %%mm2, %%mm0                \n\t"
        "punpcklbw %%mm7, %%mm0                \n\t"
        "punpckhbw %%mm7, %%mm2                \n\t"
        "paddw %%mm2, %%mm0                \n\t"
        "paddw %%mm0, %%mm6                \n\t"
        "addl %4, %%eax                        \n\t"
        " js 1b                                \n\t"
        : "+a" (len)
        : "r" (blk1 - len), "r" (blk1 - len + stride), "r" (blk2 - len), "r" (stride)
    );
}
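
/*
 * Plain-MMX xy2: all four neighbours are widened to words and summed, so
 * the (sum + 2) >> 2 rounding here is exact, unlike the chained-pavgb
 * approximation in sad8_4_mmx2() above.
 */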

static inline int sum_mmx()
{
    int ret;
    asm volatile(
        "movq %%mm6, %%mm0                \n\t"
        "psrlq $32, %%mm6                \n\t"
        "paddw %%mm0, %%mm6                \n\t"
        "movq %%mm6, %%mm0                \n\t"
        "psrlq $16, %%mm6                \n\t"
        "paddw %%mm0, %%mm6                \n\t"
        "movd %%mm6, %0                        \n\t"
        : "=r" (ret)
    );
    return ret&0xFFFF;
}

static inline int sum_mmx2()
{
    int ret;
    asm volatile(
        "movd %%mm6, %0                        \n\t"
        : "=r" (ret)
    );
    return ret;
}
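
/*
 * The plain-MMX kernels leave four 16-bit partial sums in %mm6, which
 * sum_mmx() folds together (high dword onto low, then high word onto low)
 * before masking to 16 bits; with psadbw the total already sits in the
 * low word of %mm6, so sum_mmx2() just reads it out.
 */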

#define PIX_SAD(suf)\
int pix_abs8x8_ ## suf(UINT8 *blk2, UINT8 *blk1, int stride)\
{\
    asm volatile("pxor %%mm7, %%mm7                \n\t"\
                 "pxor %%mm6, %%mm6                \n\t":);\
\
    sad8_ ## suf(blk1, blk2, stride, 3);\
\
    return sum_ ## suf();\
}\
\
int pix_abs8x8_x2_ ## suf(UINT8 *blk2, UINT8 *blk1, int stride)\
{\
    asm volatile("pxor %%mm7, %%mm7                \n\t"\
                 "pxor %%mm6, %%mm6                \n\t"\
                 "movq %0, %%mm5                \n\t"\
                 :: "m"(round_tab[1]) \
                 );\
\
    sad8_2_ ## suf(blk1, blk1+1, blk2, stride, 3);\
\
    return sum_ ## suf();\
}\
\
int pix_abs8x8_y2_ ## suf(UINT8 *blk2, UINT8 *blk1, int stride)\
{\
    asm volatile("pxor %%mm7, %%mm7                \n\t"\
                 "pxor %%mm6, %%mm6                \n\t"\
                 "movq %0, %%mm5                \n\t"\
                 :: "m"(round_tab[1]) \
                 );\
\
    sad8_2_ ## suf(blk1, blk1+stride, blk2, stride, 3);\
\
    return sum_ ## suf();\
}\
\
int pix_abs8x8_xy2_ ## suf(UINT8 *blk2, UINT8 *blk1, int stride)\
{\
    asm volatile("pxor %%mm7, %%mm7                \n\t"\
                 "pxor %%mm6, %%mm6                \n\t"\
                 "movq %0, %%mm5                \n\t"\
                 :: "m"(round_tab[2]) \
                 );\
\
    sad8_4_ ## suf(blk1, blk2, stride, 3);\
\
    return sum_ ## suf();\
}\
\
int pix_abs16x16_ ## suf(UINT8 *blk2, UINT8 *blk1, int stride)\
{\
    asm volatile("pxor %%mm7, %%mm7                \n\t"\
                 "pxor %%mm6, %%mm6                \n\t":);\
\
    sad8_ ## suf(blk1  , blk2  , stride, 4);\
    sad8_ ## suf(blk1+8, blk2+8, stride, 4);\
\
    return sum_ ## suf();\
}\
int pix_abs16x16_x2_ ## suf(UINT8 *blk2, UINT8 *blk1, int stride)\
{\
    asm volatile("pxor %%mm7, %%mm7                \n\t"\
                 "pxor %%mm6, %%mm6                \n\t"\
                 "movq %0, %%mm5                \n\t"\
                 :: "m"(round_tab[1]) \
                 );\
\
    sad8_2_ ## suf(blk1  , blk1+1, blk2  , stride, 4);\
    sad8_2_ ## suf(blk1+8, blk1+9, blk2+8, stride, 4);\
\
    return sum_ ## suf();\
}\
int pix_abs16x16_y2_ ## suf(UINT8 *blk2, UINT8 *blk1, int stride)\
{\
    asm volatile("pxor %%mm7, %%mm7                \n\t"\
                 "pxor %%mm6, %%mm6                \n\t"\
                 "movq %0, %%mm5                \n\t"\
                 :: "m"(round_tab[1]) \
                 );\
\
    sad8_2_ ## suf(blk1  , blk1+stride,   blk2  , stride, 4);\
    sad8_2_ ## suf(blk1+8, blk1+stride+8, blk2+8, stride, 4);\
\
    return sum_ ## suf();\
}\
int pix_abs16x16_xy2_ ## suf(UINT8 *blk2, UINT8 *blk1, int stride)\
{\
    asm volatile("pxor %%mm7, %%mm7                \n\t"\
                 "pxor %%mm6, %%mm6                \n\t"\
                 "movq %0, %%mm5                \n\t"\
                 :: "m"(round_tab[2]) \
                 );\
\
    sad8_4_ ## suf(blk1  , blk2  , stride, 4);\
    sad8_4_ ## suf(blk1+8, blk2+8, stride, 4);\
\
    return sum_ ## suf();\
}\
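
/*
 * Instantiate the full pix_abs* entry points for each CPU flavour. Each
 * wrapper clears the %mm6 accumulator (and %mm7, which the plain-MMX
 * kernels use as a zero register for the byte->word unpacks) before
 * running the inner loops, and reads the result back with sum_*().
 */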
PIX_SAD(mmx)
PIX_SAD(mmx2)
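
/*
 * The PIX_SAD(mmx) and PIX_SAD(mmx2) instantiations above are picked up by
 * the CPU-detection code (dsputil_mmx.c in this tree); a sketch of the
 * intended selection, assuming mm_flags-style detection:
 *
 *     if (mm_flags & MM_MMXEXT)
 *         pix_abs16x16 = pix_abs16x16_mmx2;
 *     else
 *         pix_abs16x16 = pix_abs16x16_mmx;
 */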