ffmpeg / libavcodec / x86 / dsputilenc_mmx.c @ 2912e87a
/*
 * MMX optimized DSP utils
 * Copyright (c) 2000, 2001 Fabrice Bellard
 * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
 *
 * This file is part of Libav.
 *
 * Libav is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * Libav is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with Libav; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 *
 * MMX optimization by Nick Kurshev <nickols_k@mail.ru>
 */

#include "libavutil/cpu.h"
#include "libavutil/x86_cpu.h"
#include "libavcodec/dsputil.h"
#include "libavcodec/mpegvideo.h"
#include "libavcodec/mathops.h"
#include "dsputil_mmx.h"

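/* A rough scalar sketch of what get_pixels_mmx()/get_pixels_sse2() below
 * compute (added for orientation only, not part of the original file):
 *
 *     for (i = 0; i < 8; i++)
 *         for (j = 0; j < 8; j++)
 *             block[i * 8 + j] = pixels[i * line_size + j];
 *
 * i.e. an 8x8 block of unsigned bytes is widened to 16-bit DCTELEMs.
 * The MMX loop handles two rows per iteration; the SSE2 version handles
 * four rows per load group. */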
static void get_pixels_mmx(DCTELEM *block, const uint8_t *pixels, int line_size)
{
    __asm__ volatile(
        "mov $-128, %%"REG_a"           \n\t"
        "pxor %%mm7, %%mm7              \n\t"
        ".p2align 4                     \n\t"
        "1:                             \n\t"
        "movq (%0), %%mm0               \n\t"
        "movq (%0, %2), %%mm2           \n\t"
        "movq %%mm0, %%mm1              \n\t"
        "movq %%mm2, %%mm3              \n\t"
        "punpcklbw %%mm7, %%mm0         \n\t"
        "punpckhbw %%mm7, %%mm1         \n\t"
        "punpcklbw %%mm7, %%mm2         \n\t"
        "punpckhbw %%mm7, %%mm3         \n\t"
        "movq %%mm0, (%1, %%"REG_a")    \n\t"
        "movq %%mm1, 8(%1, %%"REG_a")   \n\t"
        "movq %%mm2, 16(%1, %%"REG_a")  \n\t"
        "movq %%mm3, 24(%1, %%"REG_a")  \n\t"
        "add %3, %0                     \n\t"
        "add $32, %%"REG_a"             \n\t"
        "js 1b                          \n\t"
        : "+r" (pixels)
        : "r" (block+64), "r" ((x86_reg)line_size), "r" ((x86_reg)line_size*2)
        : "%"REG_a
    );
}

static void get_pixels_sse2(DCTELEM *block, const uint8_t *pixels, int line_size)
{
    __asm__ volatile(
        "pxor %%xmm4,      %%xmm4         \n\t"
        "movq (%0),        %%xmm0         \n\t"
        "movq (%0, %2),    %%xmm1         \n\t"
        "movq (%0, %2,2),  %%xmm2         \n\t"
        "movq (%0, %3),    %%xmm3         \n\t"
        "lea (%0,%2,4), %0                \n\t"
        "punpcklbw %%xmm4, %%xmm0         \n\t"
        "punpcklbw %%xmm4, %%xmm1         \n\t"
        "punpcklbw %%xmm4, %%xmm2         \n\t"
        "punpcklbw %%xmm4, %%xmm3         \n\t"
        "movdqa %%xmm0,      (%1)         \n\t"
        "movdqa %%xmm1,    16(%1)         \n\t"
        "movdqa %%xmm2,    32(%1)         \n\t"
        "movdqa %%xmm3,    48(%1)         \n\t"
        "movq (%0),        %%xmm0         \n\t"
        "movq (%0, %2),    %%xmm1         \n\t"
        "movq (%0, %2,2),  %%xmm2         \n\t"
        "movq (%0, %3),    %%xmm3         \n\t"
        "punpcklbw %%xmm4, %%xmm0         \n\t"
        "punpcklbw %%xmm4, %%xmm1         \n\t"
        "punpcklbw %%xmm4, %%xmm2         \n\t"
        "punpcklbw %%xmm4, %%xmm3         \n\t"
        "movdqa %%xmm0,    64(%1)         \n\t"
        "movdqa %%xmm1,    80(%1)         \n\t"
        "movdqa %%xmm2,    96(%1)         \n\t"
        "movdqa %%xmm3,   112(%1)         \n\t"
        : "+r" (pixels)
        : "r" (block), "r" ((x86_reg)line_size), "r" ((x86_reg)line_size*3)
    );
}

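/* Rough scalar sketch of diff_pixels_mmx() below (orientation only, not
 * part of the original file):
 *
 *     for (i = 0; i < 8; i++)
 *         for (j = 0; j < 8; j++)
 *             block[i * 8 + j] = s1[i * stride + j] - s2[i * stride + j];
 *
 * i.e. the byte-wise difference of two 8x8 blocks, widened to 16-bit
 * DCTELEMs, one row of 8 pixels per iteration. */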
static inline void diff_pixels_mmx(DCTELEM *block, const uint8_t *s1, const uint8_t *s2, int stride)
{
    __asm__ volatile(
        "pxor %%mm7, %%mm7              \n\t"
        "mov $-128, %%"REG_a"           \n\t"
        ".p2align 4                     \n\t"
        "1:                             \n\t"
        "movq (%0), %%mm0               \n\t"
        "movq (%1), %%mm2               \n\t"
        "movq %%mm0, %%mm1              \n\t"
        "movq %%mm2, %%mm3              \n\t"
        "punpcklbw %%mm7, %%mm0         \n\t"
        "punpckhbw %%mm7, %%mm1         \n\t"
        "punpcklbw %%mm7, %%mm2         \n\t"
        "punpckhbw %%mm7, %%mm3         \n\t"
        "psubw %%mm2, %%mm0             \n\t"
        "psubw %%mm3, %%mm1             \n\t"
        "movq %%mm0, (%2, %%"REG_a")    \n\t"
        "movq %%mm1, 8(%2, %%"REG_a")   \n\t"
        "add %3, %0                     \n\t"
        "add %3, %1                     \n\t"
        "add $16, %%"REG_a"             \n\t"
        "jnz 1b                         \n\t"
        : "+r" (s1), "+r" (s2)
        : "r" (block+64), "r" ((x86_reg)stride)
        : "%"REG_a
    );
}

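/* pix_sum16_mmx() sums the 256 pixel values of a 16x16 block. The result,
 * at most 16*16*255 = 65280, fits in 16 bits, which is why only the low
 * word of the accumulator is returned ("andl $0xFFFF"). (Explanatory note,
 * not part of the original file.) */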
static int pix_sum16_mmx(uint8_t * pix, int line_size){
    const int h=16;
    int sum;
    x86_reg index= -line_size*h;

    __asm__ volatile(
                "pxor %%mm7, %%mm7              \n\t"
                "pxor %%mm6, %%mm6              \n\t"
                "1:                             \n\t"
                "movq (%2, %1), %%mm0           \n\t"
                "movq (%2, %1), %%mm1           \n\t"
                "movq 8(%2, %1), %%mm2          \n\t"
                "movq 8(%2, %1), %%mm3          \n\t"
                "punpcklbw %%mm7, %%mm0         \n\t"
                "punpckhbw %%mm7, %%mm1         \n\t"
                "punpcklbw %%mm7, %%mm2         \n\t"
                "punpckhbw %%mm7, %%mm3         \n\t"
                "paddw %%mm0, %%mm1             \n\t"
                "paddw %%mm2, %%mm3             \n\t"
                "paddw %%mm1, %%mm3             \n\t"
                "paddw %%mm3, %%mm6             \n\t"
                "add %3, %1                     \n\t"
                " js 1b                         \n\t"
                "movq %%mm6, %%mm5              \n\t"
                "psrlq $32, %%mm6               \n\t"
                "paddw %%mm5, %%mm6             \n\t"
                "movq %%mm6, %%mm5              \n\t"
                "psrlq $16, %%mm6               \n\t"
                "paddw %%mm5, %%mm6             \n\t"
                "movd %%mm6, %0                 \n\t"
                "andl $0xFFFF, %0               \n\t"
                : "=&r" (sum), "+r" (index)
                : "r" (pix - index), "r" ((x86_reg)line_size)
        );

        return sum;
}

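/* pix_norm1_mmx() returns the sum of the squared pixel values of a 16x16
 * block, accumulated in 32 bits (16 rows, 16 pixels per iteration).
 * (Explanatory note, not part of the original file.) */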
static int pix_norm1_mmx(uint8_t *pix, int line_size) {
    int tmp;
  __asm__ volatile (
      "movl $16,%%ecx\n"
      "pxor %%mm0,%%mm0\n"
      "pxor %%mm7,%%mm7\n"
      "1:\n"
      "movq (%0),%%mm2\n"       /* mm2 = pix[0-7] */
      "movq 8(%0),%%mm3\n"      /* mm3 = pix[8-15] */

      "movq %%mm2,%%mm1\n"      /* mm1 = mm2 = pix[0-7] */

      "punpckhbw %%mm0,%%mm1\n" /* mm1 = [pix4-7] */
      "punpcklbw %%mm0,%%mm2\n" /* mm2 = [pix0-3] */

      "movq %%mm3,%%mm4\n"      /* mm4 = mm3 = pix[8-15] */
      "punpckhbw %%mm0,%%mm3\n" /* mm3 = [pix12-15] */
      "punpcklbw %%mm0,%%mm4\n" /* mm4 = [pix8-11] */

      "pmaddwd %%mm1,%%mm1\n"   /* mm1 = (pix4^2+pix5^2,pix6^2+pix7^2) */
      "pmaddwd %%mm2,%%mm2\n"   /* mm2 = (pix0^2+pix1^2,pix2^2+pix3^2) */

      "pmaddwd %%mm3,%%mm3\n"
      "pmaddwd %%mm4,%%mm4\n"

      "paddd %%mm1,%%mm2\n"     /* mm2 = (pix0^2+pix1^2+pix4^2+pix5^2,
                                          pix2^2+pix3^2+pix6^2+pix7^2) */
      "paddd %%mm3,%%mm4\n"
      "paddd %%mm2,%%mm7\n"

      "add %2, %0\n"
      "paddd %%mm4,%%mm7\n"
      "dec %%ecx\n"
      "jnz 1b\n"

      "movq %%mm7,%%mm1\n"
      "psrlq $32, %%mm7\n"      /* shift hi dword to lo */
      "paddd %%mm7,%%mm1\n"
      "movd %%mm1,%1\n"
      : "+r" (pix), "=r"(tmp) : "r" ((x86_reg)line_size) : "%ecx" );
    return tmp;
}

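/* sse8_mmx()/sse16_mmx() below compute the sum of squared errors between
 * two blocks, i.e. roughly
 *
 *     sum += (pix1[x] - pix2[x]) * (pix1[x] - pix2[x]);
 *
 * over an 8- or 16-pixel-wide block of height h. The per-byte absolute
 * difference is obtained with the saturating-subtract-both-ways-and-OR
 * trick described in the inline comments. (Explanatory sketch, not part
 * of the original file.) */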
static int sse8_mmx(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {
    int tmp;
  __asm__ volatile (
      "movl %4,%%ecx\n"
      "shr $1,%%ecx\n"
      "pxor %%mm0,%%mm0\n"      /* mm0 = 0 */
      "pxor %%mm7,%%mm7\n"      /* mm7 holds the sum */
      "1:\n"
      "movq (%0),%%mm1\n"       /* mm1 = pix1[0][0-7] */
      "movq (%1),%%mm2\n"       /* mm2 = pix2[0][0-7] */
      "movq (%0,%3),%%mm3\n"    /* mm3 = pix1[1][0-7] */
      "movq (%1,%3),%%mm4\n"    /* mm4 = pix2[1][0-7] */

      /* todo: mm1-mm2, mm3-mm4 */
      /* algo: subtract mm1 from mm2 with saturation and vice versa */
      /*       OR the results to get absolute difference */
      "movq %%mm1,%%mm5\n"
      "movq %%mm3,%%mm6\n"
      "psubusb %%mm2,%%mm1\n"
      "psubusb %%mm4,%%mm3\n"
      "psubusb %%mm5,%%mm2\n"
      "psubusb %%mm6,%%mm4\n"

      "por %%mm1,%%mm2\n"
      "por %%mm3,%%mm4\n"

      /* now convert to 16-bit vectors so we can square them */
      "movq %%mm2,%%mm1\n"
      "movq %%mm4,%%mm3\n"

      "punpckhbw %%mm0,%%mm2\n"
      "punpckhbw %%mm0,%%mm4\n"
      "punpcklbw %%mm0,%%mm1\n" /* mm1 now spread over (mm1,mm2) */
      "punpcklbw %%mm0,%%mm3\n" /* mm4 now spread over (mm3,mm4) */

      "pmaddwd %%mm2,%%mm2\n"
      "pmaddwd %%mm4,%%mm4\n"
      "pmaddwd %%mm1,%%mm1\n"
      "pmaddwd %%mm3,%%mm3\n"

      "lea (%0,%3,2), %0\n"     /* pix1 += 2*line_size */
      "lea (%1,%3,2), %1\n"     /* pix2 += 2*line_size */

      "paddd %%mm2,%%mm1\n"
      "paddd %%mm4,%%mm3\n"
      "paddd %%mm1,%%mm7\n"
      "paddd %%mm3,%%mm7\n"

      "decl %%ecx\n"
      "jnz 1b\n"

      "movq %%mm7,%%mm1\n"
      "psrlq $32, %%mm7\n"      /* shift hi dword to lo */
      "paddd %%mm7,%%mm1\n"
      "movd %%mm1,%2\n"
      : "+r" (pix1), "+r" (pix2), "=r"(tmp)
      : "r" ((x86_reg)line_size) , "m" (h)
      : "%ecx");
    return tmp;
}

static int sse16_mmx(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {
    int tmp;
  __asm__ volatile (
      "movl %4,%%ecx\n"
      "pxor %%mm0,%%mm0\n"      /* mm0 = 0 */
      "pxor %%mm7,%%mm7\n"      /* mm7 holds the sum */
      "1:\n"
      "movq (%0),%%mm1\n"       /* mm1 = pix1[0-7] */
      "movq (%1),%%mm2\n"       /* mm2 = pix2[0-7] */
      "movq 8(%0),%%mm3\n"      /* mm3 = pix1[8-15] */
      "movq 8(%1),%%mm4\n"      /* mm4 = pix2[8-15] */

      /* todo: mm1-mm2, mm3-mm4 */
      /* algo: subtract mm1 from mm2 with saturation and vice versa */
      /*       OR the results to get absolute difference */
      "movq %%mm1,%%mm5\n"
      "movq %%mm3,%%mm6\n"
      "psubusb %%mm2,%%mm1\n"
      "psubusb %%mm4,%%mm3\n"
      "psubusb %%mm5,%%mm2\n"
      "psubusb %%mm6,%%mm4\n"

      "por %%mm1,%%mm2\n"
      "por %%mm3,%%mm4\n"

      /* now convert to 16-bit vectors so we can square them */
      "movq %%mm2,%%mm1\n"
      "movq %%mm4,%%mm3\n"

      "punpckhbw %%mm0,%%mm2\n"
      "punpckhbw %%mm0,%%mm4\n"
      "punpcklbw %%mm0,%%mm1\n" /* mm1 now spread over (mm1,mm2) */
      "punpcklbw %%mm0,%%mm3\n" /* mm4 now spread over (mm3,mm4) */

      "pmaddwd %%mm2,%%mm2\n"
      "pmaddwd %%mm4,%%mm4\n"
      "pmaddwd %%mm1,%%mm1\n"
      "pmaddwd %%mm3,%%mm3\n"

      "add %3,%0\n"
      "add %3,%1\n"

      "paddd %%mm2,%%mm1\n"
      "paddd %%mm4,%%mm3\n"
      "paddd %%mm1,%%mm7\n"
      "paddd %%mm3,%%mm7\n"

      "decl %%ecx\n"
      "jnz 1b\n"

      "movq %%mm7,%%mm1\n"
      "psrlq $32, %%mm7\n"      /* shift hi dword to lo */
      "paddd %%mm7,%%mm1\n"
      "movd %%mm1,%2\n"
      : "+r" (pix1), "+r" (pix2), "=r"(tmp)
      : "r" ((x86_reg)line_size) , "m" (h)
      : "%ecx");
    return tmp;
}

int ff_sse16_sse2(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h);

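/* hf_noise8_mmx()/hf_noise16_mmx() below estimate the amount of
 * high-frequency content in a block: per row they take horizontal
 * pixel differences, then accumulate the absolute values of the
 * row-to-row changes of those differences. nsse8/nsse16 use this as a
 * "noise" measure. (Rough description added for readability, not part
 * of the original file.) */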
static int hf_noise8_mmx(uint8_t * pix1, int line_size, int h) {
    int tmp;
  __asm__ volatile (
      "movl %3,%%ecx\n"
      "pxor %%mm7,%%mm7\n"
      "pxor %%mm6,%%mm6\n"

      "movq (%0),%%mm0\n"
      "movq %%mm0, %%mm1\n"
      "psllq $8, %%mm0\n"
      "psrlq $8, %%mm1\n"
      "psrlq $8, %%mm0\n"
      "movq %%mm0, %%mm2\n"
      "movq %%mm1, %%mm3\n"
      "punpcklbw %%mm7,%%mm0\n"
      "punpcklbw %%mm7,%%mm1\n"
      "punpckhbw %%mm7,%%mm2\n"
      "punpckhbw %%mm7,%%mm3\n"
      "psubw %%mm1, %%mm0\n"
      "psubw %%mm3, %%mm2\n"

      "add %2,%0\n"

      "movq (%0),%%mm4\n"
      "movq %%mm4, %%mm1\n"
      "psllq $8, %%mm4\n"
      "psrlq $8, %%mm1\n"
      "psrlq $8, %%mm4\n"
      "movq %%mm4, %%mm5\n"
      "movq %%mm1, %%mm3\n"
      "punpcklbw %%mm7,%%mm4\n"
      "punpcklbw %%mm7,%%mm1\n"
      "punpckhbw %%mm7,%%mm5\n"
      "punpckhbw %%mm7,%%mm3\n"
      "psubw %%mm1, %%mm4\n"
      "psubw %%mm3, %%mm5\n"
      "psubw %%mm4, %%mm0\n"
      "psubw %%mm5, %%mm2\n"
      "pxor %%mm3, %%mm3\n"
      "pxor %%mm1, %%mm1\n"
      "pcmpgtw %%mm0, %%mm3\n\t"
      "pcmpgtw %%mm2, %%mm1\n\t"
      "pxor %%mm3, %%mm0\n"
      "pxor %%mm1, %%mm2\n"
      "psubw %%mm3, %%mm0\n"
      "psubw %%mm1, %%mm2\n"
      "paddw %%mm0, %%mm2\n"
      "paddw %%mm2, %%mm6\n"

      "add %2,%0\n"
      "1:\n"

      "movq (%0),%%mm0\n"
      "movq %%mm0, %%mm1\n"
      "psllq $8, %%mm0\n"
      "psrlq $8, %%mm1\n"
      "psrlq $8, %%mm0\n"
      "movq %%mm0, %%mm2\n"
      "movq %%mm1, %%mm3\n"
      "punpcklbw %%mm7,%%mm0\n"
      "punpcklbw %%mm7,%%mm1\n"
      "punpckhbw %%mm7,%%mm2\n"
      "punpckhbw %%mm7,%%mm3\n"
      "psubw %%mm1, %%mm0\n"
      "psubw %%mm3, %%mm2\n"
      "psubw %%mm0, %%mm4\n"
      "psubw %%mm2, %%mm5\n"
      "pxor %%mm3, %%mm3\n"
      "pxor %%mm1, %%mm1\n"
      "pcmpgtw %%mm4, %%mm3\n\t"
      "pcmpgtw %%mm5, %%mm1\n\t"
      "pxor %%mm3, %%mm4\n"
      "pxor %%mm1, %%mm5\n"
      "psubw %%mm3, %%mm4\n"
      "psubw %%mm1, %%mm5\n"
      "paddw %%mm4, %%mm5\n"
      "paddw %%mm5, %%mm6\n"

      "add %2,%0\n"

      "movq (%0),%%mm4\n"
      "movq %%mm4, %%mm1\n"
      "psllq $8, %%mm4\n"
      "psrlq $8, %%mm1\n"
      "psrlq $8, %%mm4\n"
      "movq %%mm4, %%mm5\n"
      "movq %%mm1, %%mm3\n"
      "punpcklbw %%mm7,%%mm4\n"
      "punpcklbw %%mm7,%%mm1\n"
      "punpckhbw %%mm7,%%mm5\n"
      "punpckhbw %%mm7,%%mm3\n"
      "psubw %%mm1, %%mm4\n"
      "psubw %%mm3, %%mm5\n"
      "psubw %%mm4, %%mm0\n"
      "psubw %%mm5, %%mm2\n"
      "pxor %%mm3, %%mm3\n"
      "pxor %%mm1, %%mm1\n"
      "pcmpgtw %%mm0, %%mm3\n\t"
      "pcmpgtw %%mm2, %%mm1\n\t"
      "pxor %%mm3, %%mm0\n"
      "pxor %%mm1, %%mm2\n"
      "psubw %%mm3, %%mm0\n"
      "psubw %%mm1, %%mm2\n"
      "paddw %%mm0, %%mm2\n"
      "paddw %%mm2, %%mm6\n"

      "add %2,%0\n"
      "subl $2, %%ecx\n"
      " jnz 1b\n"

      "movq %%mm6, %%mm0\n"
      "punpcklwd %%mm7,%%mm0\n"
      "punpckhwd %%mm7,%%mm6\n"
      "paddd %%mm0, %%mm6\n"

      "movq %%mm6,%%mm0\n"
      "psrlq $32, %%mm6\n"
      "paddd %%mm6,%%mm0\n"
      "movd %%mm0,%1\n"
      : "+r" (pix1), "=r"(tmp)
      : "r" ((x86_reg)line_size) , "g" (h-2)
      : "%ecx");
      return tmp;
}

static int hf_noise16_mmx(uint8_t * pix1, int line_size, int h) {
    int tmp;
    uint8_t * pix= pix1;
  __asm__ volatile (
      "movl %3,%%ecx\n"
      "pxor %%mm7,%%mm7\n"
      "pxor %%mm6,%%mm6\n"

      "movq (%0),%%mm0\n"
      "movq 1(%0),%%mm1\n"
      "movq %%mm0, %%mm2\n"
      "movq %%mm1, %%mm3\n"
      "punpcklbw %%mm7,%%mm0\n"
      "punpcklbw %%mm7,%%mm1\n"
      "punpckhbw %%mm7,%%mm2\n"
      "punpckhbw %%mm7,%%mm3\n"
      "psubw %%mm1, %%mm0\n"
      "psubw %%mm3, %%mm2\n"

      "add %2,%0\n"

      "movq (%0),%%mm4\n"
      "movq 1(%0),%%mm1\n"
      "movq %%mm4, %%mm5\n"
      "movq %%mm1, %%mm3\n"
      "punpcklbw %%mm7,%%mm4\n"
      "punpcklbw %%mm7,%%mm1\n"
      "punpckhbw %%mm7,%%mm5\n"
      "punpckhbw %%mm7,%%mm3\n"
      "psubw %%mm1, %%mm4\n"
      "psubw %%mm3, %%mm5\n"
      "psubw %%mm4, %%mm0\n"
      "psubw %%mm5, %%mm2\n"
      "pxor %%mm3, %%mm3\n"
      "pxor %%mm1, %%mm1\n"
      "pcmpgtw %%mm0, %%mm3\n\t"
      "pcmpgtw %%mm2, %%mm1\n\t"
      "pxor %%mm3, %%mm0\n"
      "pxor %%mm1, %%mm2\n"
      "psubw %%mm3, %%mm0\n"
      "psubw %%mm1, %%mm2\n"
      "paddw %%mm0, %%mm2\n"
      "paddw %%mm2, %%mm6\n"

      "add %2,%0\n"
      "1:\n"

      "movq (%0),%%mm0\n"
      "movq 1(%0),%%mm1\n"
      "movq %%mm0, %%mm2\n"
      "movq %%mm1, %%mm3\n"
      "punpcklbw %%mm7,%%mm0\n"
      "punpcklbw %%mm7,%%mm1\n"
      "punpckhbw %%mm7,%%mm2\n"
      "punpckhbw %%mm7,%%mm3\n"
      "psubw %%mm1, %%mm0\n"
      "psubw %%mm3, %%mm2\n"
      "psubw %%mm0, %%mm4\n"
      "psubw %%mm2, %%mm5\n"
      "pxor %%mm3, %%mm3\n"
      "pxor %%mm1, %%mm1\n"
      "pcmpgtw %%mm4, %%mm3\n\t"
      "pcmpgtw %%mm5, %%mm1\n\t"
      "pxor %%mm3, %%mm4\n"
      "pxor %%mm1, %%mm5\n"
      "psubw %%mm3, %%mm4\n"
      "psubw %%mm1, %%mm5\n"
      "paddw %%mm4, %%mm5\n"
      "paddw %%mm5, %%mm6\n"

      "add %2,%0\n"

      "movq (%0),%%mm4\n"
      "movq 1(%0),%%mm1\n"
      "movq %%mm4, %%mm5\n"
      "movq %%mm1, %%mm3\n"
      "punpcklbw %%mm7,%%mm4\n"
      "punpcklbw %%mm7,%%mm1\n"
      "punpckhbw %%mm7,%%mm5\n"
      "punpckhbw %%mm7,%%mm3\n"
      "psubw %%mm1, %%mm4\n"
      "psubw %%mm3, %%mm5\n"
      "psubw %%mm4, %%mm0\n"
      "psubw %%mm5, %%mm2\n"
      "pxor %%mm3, %%mm3\n"
      "pxor %%mm1, %%mm1\n"
      "pcmpgtw %%mm0, %%mm3\n\t"
      "pcmpgtw %%mm2, %%mm1\n\t"
      "pxor %%mm3, %%mm0\n"
      "pxor %%mm1, %%mm2\n"
      "psubw %%mm3, %%mm0\n"
      "psubw %%mm1, %%mm2\n"
      "paddw %%mm0, %%mm2\n"
      "paddw %%mm2, %%mm6\n"

      "add %2,%0\n"
      "subl $2, %%ecx\n"
      " jnz 1b\n"

      "movq %%mm6, %%mm0\n"
      "punpcklwd %%mm7,%%mm0\n"
      "punpckhwd %%mm7,%%mm6\n"
      "paddd %%mm0, %%mm6\n"

      "movq %%mm6,%%mm0\n"
      "psrlq $32, %%mm6\n"
      "paddd %%mm6,%%mm0\n"
      "movd %%mm0,%1\n"
      : "+r" (pix1), "=r"(tmp)
      : "r" ((x86_reg)line_size) , "g" (h-2)
      : "%ecx");
      return tmp + hf_noise8_mmx(pix+8, line_size, h);
}

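/* nsse16_mmx()/nsse8_mmx() below implement the "noise preserving" SSE
 * comparison: the plain sum of squared errors plus the absolute difference
 * of the two blocks' high-frequency measures, weighted by
 * avctx->nsse_weight (8 if no context is available). (Explanatory note,
 * not part of the original file.) */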
static int nsse16_mmx(void *p, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {
    MpegEncContext *c = p;
    int score1, score2;

    if(c) score1 = c->dsp.sse[0](c, pix1, pix2, line_size, h);
    else  score1 = sse16_mmx(c, pix1, pix2, line_size, h);
    score2= hf_noise16_mmx(pix1, line_size, h) - hf_noise16_mmx(pix2, line_size, h);

    if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight;
    else  return score1 + FFABS(score2)*8;
}

static int nsse8_mmx(void *p, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {
    MpegEncContext *c = p;
    int score1= sse8_mmx(c, pix1, pix2, line_size, h);
    int score2= hf_noise8_mmx(pix1, line_size, h) - hf_noise8_mmx(pix2, line_size, h);

    if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight;
    else  return score1 + FFABS(score2)*8;
}

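/* The vsad_intra16 functions below sum |pix[x] - pix[x + line_size]| over
 * a 16-pixel-wide block, i.e. a SAD between vertically adjacent lines of
 * the same block. The MMX2 variant uses psadbw to do the byte-wise
 * absolute-difference sum directly. (Explanatory note, not part of the
 * original file.) */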
static int vsad_intra16_mmx(void *v, uint8_t * pix, uint8_t * dummy, int line_size, int h) {
    int tmp;

    assert( (((int)pix) & 7) == 0);
    assert((line_size &7) ==0);

#define SUM(in0, in1, out0, out1) \
      "movq (%0), %%mm2\n"\
      "movq 8(%0), %%mm3\n"\
      "add %2,%0\n"\
      "movq %%mm2, " #out0 "\n"\
      "movq %%mm3, " #out1 "\n"\
      "psubusb " #in0 ", %%mm2\n"\
      "psubusb " #in1 ", %%mm3\n"\
      "psubusb " #out0 ", " #in0 "\n"\
      "psubusb " #out1 ", " #in1 "\n"\
      "por %%mm2, " #in0 "\n"\
      "por %%mm3, " #in1 "\n"\
      "movq " #in0 ", %%mm2\n"\
      "movq " #in1 ", %%mm3\n"\
      "punpcklbw %%mm7, " #in0 "\n"\
      "punpcklbw %%mm7, " #in1 "\n"\
      "punpckhbw %%mm7, %%mm2\n"\
      "punpckhbw %%mm7, %%mm3\n"\
      "paddw " #in1 ", " #in0 "\n"\
      "paddw %%mm3, %%mm2\n"\
      "paddw %%mm2, " #in0 "\n"\
      "paddw " #in0 ", %%mm6\n"


  __asm__ volatile (
      "movl %3,%%ecx\n"
      "pxor %%mm6,%%mm6\n"
      "pxor %%mm7,%%mm7\n"
      "movq (%0),%%mm0\n"
      "movq 8(%0),%%mm1\n"
      "add %2,%0\n"
      "jmp 2f\n"
      "1:\n"

      SUM(%%mm4, %%mm5, %%mm0, %%mm1)
      "2:\n"
      SUM(%%mm0, %%mm1, %%mm4, %%mm5)

      "subl $2, %%ecx\n"
      "jnz 1b\n"

      "movq %%mm6,%%mm0\n"
      "psrlq $32, %%mm6\n"
      "paddw %%mm6,%%mm0\n"
      "movq %%mm0,%%mm6\n"
      "psrlq $16, %%mm0\n"
      "paddw %%mm6,%%mm0\n"
      "movd %%mm0,%1\n"
      : "+r" (pix), "=r"(tmp)
      : "r" ((x86_reg)line_size) , "m" (h)
      : "%ecx");
    return tmp & 0xFFFF;
}
#undef SUM

static int vsad_intra16_mmx2(void *v, uint8_t * pix, uint8_t * dummy, int line_size, int h) {
    int tmp;

    assert( (((int)pix) & 7) == 0);
    assert((line_size &7) ==0);

#define SUM(in0, in1, out0, out1) \
      "movq (%0), " #out0 "\n"\
      "movq 8(%0), " #out1 "\n"\
      "add %2,%0\n"\
      "psadbw " #out0 ", " #in0 "\n"\
      "psadbw " #out1 ", " #in1 "\n"\
      "paddw " #in1 ", " #in0 "\n"\
      "paddw " #in0 ", %%mm6\n"

  __asm__ volatile (
      "movl %3,%%ecx\n"
      "pxor %%mm6,%%mm6\n"
      "pxor %%mm7,%%mm7\n"
      "movq (%0),%%mm0\n"
      "movq 8(%0),%%mm1\n"
      "add %2,%0\n"
      "jmp 2f\n"
      "1:\n"

      SUM(%%mm4, %%mm5, %%mm0, %%mm1)
      "2:\n"
      SUM(%%mm0, %%mm1, %%mm4, %%mm5)

      "subl $2, %%ecx\n"
      "jnz 1b\n"

      "movd %%mm6,%1\n"
      : "+r" (pix), "=r"(tmp)
      : "r" ((x86_reg)line_size) , "m" (h)
      : "%ecx");
    return tmp;
}
#undef SUM

static int vsad16_mmx(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {
    int tmp;

    assert( (((int)pix1) & 7) == 0);
    assert( (((int)pix2) & 7) == 0);
    assert((line_size &7) ==0);

#define SUM(in0, in1, out0, out1) \
      "movq (%0),%%mm2\n"\
      "movq (%1)," #out0 "\n"\
      "movq 8(%0),%%mm3\n"\
      "movq 8(%1)," #out1 "\n"\
      "add %3,%0\n"\
      "add %3,%1\n"\
      "psubb " #out0 ", %%mm2\n"\
      "psubb " #out1 ", %%mm3\n"\
      "pxor %%mm7, %%mm2\n"\
      "pxor %%mm7, %%mm3\n"\
      "movq %%mm2, " #out0 "\n"\
      "movq %%mm3, " #out1 "\n"\
      "psubusb " #in0 ", %%mm2\n"\
      "psubusb " #in1 ", %%mm3\n"\
      "psubusb " #out0 ", " #in0 "\n"\
      "psubusb " #out1 ", " #in1 "\n"\
      "por %%mm2, " #in0 "\n"\
      "por %%mm3, " #in1 "\n"\
      "movq " #in0 ", %%mm2\n"\
      "movq " #in1 ", %%mm3\n"\
      "punpcklbw %%mm7, " #in0 "\n"\
      "punpcklbw %%mm7, " #in1 "\n"\
      "punpckhbw %%mm7, %%mm2\n"\
      "punpckhbw %%mm7, %%mm3\n"\
      "paddw " #in1 ", " #in0 "\n"\
      "paddw %%mm3, %%mm2\n"\
      "paddw %%mm2, " #in0 "\n"\
      "paddw " #in0 ", %%mm6\n"


  __asm__ volatile (
      "movl %4,%%ecx\n"
      "pxor %%mm6,%%mm6\n"
      "pcmpeqw %%mm7,%%mm7\n"
      "psllw $15, %%mm7\n"
      "packsswb %%mm7, %%mm7\n"
      "movq (%0),%%mm0\n"
      "movq (%1),%%mm2\n"
      "movq 8(%0),%%mm1\n"
      "movq 8(%1),%%mm3\n"
      "add %3,%0\n"
      "add %3,%1\n"
      "psubb %%mm2, %%mm0\n"
      "psubb %%mm3, %%mm1\n"
      "pxor %%mm7, %%mm0\n"
      "pxor %%mm7, %%mm1\n"
      "jmp 2f\n"
      "1:\n"

      SUM(%%mm4, %%mm5, %%mm0, %%mm1)
      "2:\n"
      SUM(%%mm0, %%mm1, %%mm4, %%mm5)

      "subl $2, %%ecx\n"
      "jnz 1b\n"

      "movq %%mm6,%%mm0\n"
      "psrlq $32, %%mm6\n"
      "paddw %%mm6,%%mm0\n"
      "movq %%mm0,%%mm6\n"
      "psrlq $16, %%mm0\n"
      "paddw %%mm6,%%mm0\n"
      "movd %%mm0,%2\n"
      : "+r" (pix1), "+r" (pix2), "=r"(tmp)
      : "r" ((x86_reg)line_size) , "m" (h)
      : "%ecx");
    return tmp & 0x7FFF;
}
#undef SUM

static int vsad16_mmx2(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {
    int tmp;

    assert( (((int)pix1) & 7) == 0);
    assert( (((int)pix2) & 7) == 0);
    assert((line_size &7) ==0);

#define SUM(in0, in1, out0, out1) \
      "movq (%0)," #out0 "\n"\
      "movq (%1),%%mm2\n"\
      "movq 8(%0)," #out1 "\n"\
      "movq 8(%1),%%mm3\n"\
      "add %3,%0\n"\
      "add %3,%1\n"\
      "psubb %%mm2, " #out0 "\n"\
      "psubb %%mm3, " #out1 "\n"\
      "pxor %%mm7, " #out0 "\n"\
      "pxor %%mm7, " #out1 "\n"\
      "psadbw " #out0 ", " #in0 "\n"\
      "psadbw " #out1 ", " #in1 "\n"\
      "paddw " #in1 ", " #in0 "\n"\
      "paddw " #in0 ", %%mm6\n"

  __asm__ volatile (
      "movl %4,%%ecx\n"
      "pxor %%mm6,%%mm6\n"
      "pcmpeqw %%mm7,%%mm7\n"
      "psllw $15, %%mm7\n"
      "packsswb %%mm7, %%mm7\n"
      "movq (%0),%%mm0\n"
      "movq (%1),%%mm2\n"
      "movq 8(%0),%%mm1\n"
      "movq 8(%1),%%mm3\n"
      "add %3,%0\n"
      "add %3,%1\n"
      "psubb %%mm2, %%mm0\n"
      "psubb %%mm3, %%mm1\n"
      "pxor %%mm7, %%mm0\n"
      "pxor %%mm7, %%mm1\n"
      "jmp 2f\n"
      "1:\n"

      SUM(%%mm4, %%mm5, %%mm0, %%mm1)
      "2:\n"
      SUM(%%mm0, %%mm1, %%mm4, %%mm5)

      "subl $2, %%ecx\n"
      "jnz 1b\n"

      "movd %%mm6,%2\n"
      : "+r" (pix1), "+r" (pix2), "=r"(tmp)
      : "r" ((x86_reg)line_size) , "m" (h)
      : "%ecx");
    return tmp;
}
#undef SUM

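/* diff_bytes_mmx() computes dst[i] = src1[i] - src2[i] for 0 <= i < w,
 * 16 bytes per MMX iteration, with the remaining tail handled by the
 * scalar loop at the end. (Explanatory note, not part of the original
 * file.) */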
static void diff_bytes_mmx(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
    x86_reg i=0;
    __asm__ volatile(
        "1:                             \n\t"
        "movq  (%2, %0), %%mm0          \n\t"
        "movq  (%1, %0), %%mm1          \n\t"
        "psubb %%mm0, %%mm1             \n\t"
        "movq %%mm1, (%3, %0)           \n\t"
        "movq 8(%2, %0), %%mm0          \n\t"
        "movq 8(%1, %0), %%mm1          \n\t"
        "psubb %%mm0, %%mm1             \n\t"
        "movq %%mm1, 8(%3, %0)          \n\t"
        "add $16, %0                    \n\t"
        "cmp %4, %0                     \n\t"
        " jb 1b                         \n\t"
        : "+r" (i)
        : "r"(src1), "r"(src2), "r"(dst), "r"((x86_reg)w-15)
    );
    for(; i<w; i++)
        dst[i+0] = src1[i+0]-src2[i+0];
}

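/* sub_hfyu_median_prediction_mmx2() subtracts the HuffYUV median predictor
 * from src2: for each byte, pred = mid_pred(L, T, L + T - LT), where L is
 * the pixel to the left, T the one above and LT the one above-left, and
 * dst = src2 - pred. Element 0 and the carried left/left_top values are
 * handled in C after the asm loop. (Explanatory note, not part of the
 * original file.) */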
static void sub_hfyu_median_prediction_mmx2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int w, int *left, int *left_top){
    x86_reg i=0;
    uint8_t l, lt;

    __asm__ volatile(
        "1:                             \n\t"
        "movq  -1(%1, %0), %%mm0        \n\t" // LT
        "movq  (%1, %0), %%mm1          \n\t" // T
        "movq  -1(%2, %0), %%mm2        \n\t" // L
        "movq  (%2, %0), %%mm3          \n\t" // X
        "movq %%mm2, %%mm4              \n\t" // L
        "psubb %%mm0, %%mm2             \n\t"
        "paddb %%mm1, %%mm2             \n\t" // L + T - LT
        "movq %%mm4, %%mm5              \n\t" // L
        "pmaxub %%mm1, %%mm4            \n\t" // max(T, L)
        "pminub %%mm5, %%mm1            \n\t" // min(T, L)
        "pminub %%mm2, %%mm4            \n\t"
        "pmaxub %%mm1, %%mm4            \n\t"
        "psubb %%mm4, %%mm3             \n\t" // dst - pred
        "movq %%mm3, (%3, %0)           \n\t"
        "add $8, %0                     \n\t"
        "cmp %4, %0                     \n\t"
        " jb 1b                         \n\t"
        : "+r" (i)
        : "r"(src1), "r"(src2), "r"(dst), "r"((x86_reg)w)
    );

    l= *left;
    lt= *left_top;

    dst[0]= src2[0] - mid_pred(l, src1[0], (l + src1[0] - lt)&0xFF);

    *left_top= src1[w-1];
    *left    = src2[w-1];
}

#define MMABS_MMX(a,z)\
    "pxor " #z ", " #z "              \n\t"\
    "pcmpgtw " #a ", " #z "           \n\t"\
    "pxor " #z ", " #a "              \n\t"\
    "psubw " #z ", " #a "             \n\t"

#define MMABS_MMX2(a,z)\
    "pxor " #z ", " #z "              \n\t"\
    "psubw " #a ", " #z "             \n\t"\
    "pmaxsw " #z ", " #a "            \n\t"

#define MMABS_SSSE3(a,z)\
    "pabsw " #a ", " #a "             \n\t"

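/* The MMABS_* variants above all compute a per-word absolute value:
 * MMABS_MMX uses the identity |x| = (x ^ m) - m with m = (x < 0 ? -1 : 0)
 * obtained from pcmpgtw, MMABS_MMX2 uses max(x, -x) via pmaxsw, and the
 * SSSE3 version is a single pabsw. (Explanatory note, not part of the
 * original file.) */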
#define MMABS_SUM(a,z, sum)\
    MMABS(a,z)\
    "paddusw " #a ", " #sum "         \n\t"

/* FIXME: HSUM_* saturates at 64k, while an 8x8 hadamard or dct block can get up to
 * about 100k on extreme inputs. But that's very unlikely to occur in natural video,
 * and it's even more unlikely to not have any alternative mvs/modes with lower cost. */
#define HSUM_MMX(a, t, dst)\
    "movq "#a", "#t"                  \n\t"\
    "psrlq $32, "#a"                  \n\t"\
    "paddusw "#t", "#a"               \n\t"\
    "movq "#a", "#t"                  \n\t"\
    "psrlq $16, "#a"                  \n\t"\
    "paddusw "#t", "#a"               \n\t"\
    "movd "#a", "#dst"                \n\t"\

#define HSUM_MMX2(a, t, dst)\
    "pshufw $0x0E, "#a", "#t"         \n\t"\
    "paddusw "#t", "#a"               \n\t"\
    "pshufw $0x01, "#a", "#t"         \n\t"\
    "paddusw "#t", "#a"               \n\t"\
    "movd "#a", "#dst"                \n\t"\

#define HSUM_SSE2(a, t, dst)\
    "movhlps "#a", "#t"               \n\t"\
    "paddusw "#t", "#a"               \n\t"\
    "pshuflw $0x0E, "#a", "#t"        \n\t"\
    "paddusw "#t", "#a"               \n\t"\
    "pshuflw $0x01, "#a", "#t"        \n\t"\
    "paddusw "#t", "#a"               \n\t"\
    "movd "#a", "#dst"                \n\t"\

#define hadamard_func(cpu) \
int ff_hadamard8_diff_##cpu  (void *s, uint8_t *src1, uint8_t *src2, \
                              int stride, int h); \
int ff_hadamard8_diff16_##cpu(void *s, uint8_t *src1, uint8_t *src2, \
                              int stride, int h);

hadamard_func(mmx)
hadamard_func(mmx2)
hadamard_func(sse2)
hadamard_func(ssse3)

#define DCT_SAD4(m,mm,o)\
    "mov"#m" "#o"+ 0(%1), "#mm"2      \n\t"\
    "mov"#m" "#o"+16(%1), "#mm"3      \n\t"\
    "mov"#m" "#o"+32(%1), "#mm"4      \n\t"\
    "mov"#m" "#o"+48(%1), "#mm"5      \n\t"\
    MMABS_SUM(mm##2, mm##6, mm##0)\
    MMABS_SUM(mm##3, mm##7, mm##1)\
    MMABS_SUM(mm##4, mm##6, mm##0)\
    MMABS_SUM(mm##5, mm##7, mm##1)\

#define DCT_SAD_MMX\
    "pxor %%mm0, %%mm0                \n\t"\
    "pxor %%mm1, %%mm1                \n\t"\
    DCT_SAD4(q, %%mm, 0)\
    DCT_SAD4(q, %%mm, 8)\
    DCT_SAD4(q, %%mm, 64)\
    DCT_SAD4(q, %%mm, 72)\
    "paddusw %%mm1, %%mm0             \n\t"\
    HSUM(%%mm0, %%mm1, %0)

#define DCT_SAD_SSE2\
    "pxor %%xmm0, %%xmm0              \n\t"\
    "pxor %%xmm1, %%xmm1              \n\t"\
    DCT_SAD4(dqa, %%xmm, 0)\
    DCT_SAD4(dqa, %%xmm, 64)\
    "paddusw %%xmm1, %%xmm0           \n\t"\
    HSUM(%%xmm0, %%xmm1, %0)

#define DCT_SAD_FUNC(cpu) \
static int sum_abs_dctelem_##cpu(DCTELEM *block){\
    int sum;\
    __asm__ volatile(\
        DCT_SAD\
        :"=r"(sum)\
        :"r"(block)\
    );\
    return sum&0xFFFF;\
}

#define DCT_SAD       DCT_SAD_MMX
#define HSUM(a,t,dst) HSUM_MMX(a,t,dst)
#define MMABS(a,z)    MMABS_MMX(a,z)
DCT_SAD_FUNC(mmx)
#undef MMABS
#undef HSUM

#define HSUM(a,t,dst) HSUM_MMX2(a,t,dst)
#define MMABS(a,z)    MMABS_MMX2(a,z)
DCT_SAD_FUNC(mmx2)
#undef HSUM
#undef DCT_SAD

#define DCT_SAD       DCT_SAD_SSE2
#define HSUM(a,t,dst) HSUM_SSE2(a,t,dst)
DCT_SAD_FUNC(sse2)
#undef MMABS

#if HAVE_SSSE3
#define MMABS(a,z)    MMABS_SSSE3(a,z)
DCT_SAD_FUNC(ssse3)
#undef MMABS
#endif
#undef HSUM
#undef DCT_SAD

static int ssd_int8_vs_int16_mmx(const int8_t *pix1, const int16_t *pix2, int size){
    int sum;
    x86_reg i=size;
    __asm__ volatile(
        "pxor %%mm4, %%mm4 \n"
        "1: \n"
        "sub $8, %0 \n"
        "movq (%2,%0), %%mm2 \n"
        "movq (%3,%0,2), %%mm0 \n"
        "movq 8(%3,%0,2), %%mm1 \n"
        "punpckhbw %%mm2, %%mm3 \n"
        "punpcklbw %%mm2, %%mm2 \n"
        "psraw $8, %%mm3 \n"
        "psraw $8, %%mm2 \n"
        "psubw %%mm3, %%mm1 \n"
        "psubw %%mm2, %%mm0 \n"
        "pmaddwd %%mm1, %%mm1 \n"
        "pmaddwd %%mm0, %%mm0 \n"
        "paddd %%mm1, %%mm4 \n"
        "paddd %%mm0, %%mm4 \n"
        "jg 1b \n"
        "movq %%mm4, %%mm3 \n"
        "psrlq $32, %%mm3 \n"
        "paddd %%mm3, %%mm4 \n"
        "movd %%mm4, %1 \n"
        :"+r"(i), "=r"(sum)
        :"r"(pix1), "r"(pix2)
    );
    return sum;
}

#define PHADDD(a, t)\
    "movq "#a", "#t"                  \n\t"\
    "psrlq $32, "#a"                  \n\t"\
    "paddd "#t", "#a"                 \n\t"
/*
   pmulhw: dst[0-15]=(src[0-15]*dst[0-15])[16-31]
   pmulhrw: dst[0-15]=(src[0-15]*dst[0-15] + 0x8000)[16-31]
   pmulhrsw: dst[0-15]=(src[0-15]*dst[0-15] + 0x4000)[15-30]
 */
#define PMULHRW(x, y, s, o)\
    "pmulhw " #s ", "#x "            \n\t"\
    "pmulhw " #s ", "#y "            \n\t"\
    "paddw " #o ", "#x "             \n\t"\
    "paddw " #o ", "#y "             \n\t"\
    "psraw $1, "#x "                 \n\t"\
    "psraw $1, "#y "                 \n\t"
#define DEF(x) x ## _mmx
#define SET_RND MOVQ_WONE
#define SCALE_OFFSET 1

#include "dsputil_mmx_qns_template.c"

#undef DEF
#undef SET_RND
#undef SCALE_OFFSET
#undef PMULHRW

#define DEF(x) x ## _3dnow
#define SET_RND(x)
#define SCALE_OFFSET 0
#define PMULHRW(x, y, s, o)\
    "pmulhrw " #s ", "#x "           \n\t"\
    "pmulhrw " #s ", "#y "           \n\t"

#include "dsputil_mmx_qns_template.c"

#undef DEF
#undef SET_RND
#undef SCALE_OFFSET
#undef PMULHRW

#if HAVE_SSSE3
#undef PHADDD
#define DEF(x) x ## _ssse3
#define SET_RND(x)
#define SCALE_OFFSET -1
#define PHADDD(a, t)\
    "pshufw $0x0E, "#a", "#t"         \n\t"\
    "paddd "#t", "#a"                 \n\t" /* faster than phaddd on core2 */
#define PMULHRW(x, y, s, o)\
    "pmulhrsw " #s ", "#x "          \n\t"\
    "pmulhrsw " #s ", "#y "          \n\t"

#include "dsputil_mmx_qns_template.c"

#undef DEF
#undef SET_RND
#undef SCALE_OFFSET
#undef PMULHRW
#undef PHADDD
#endif //HAVE_SSSE3

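/* dsputilenc_init_mmx() fills in the encoder-side DSPContext function
 * pointers, picking the MMX, MMX2, SSE2, SSSE3 or 3DNow! versions above
 * according to the CPU flags reported by av_get_cpu_flags() and to codec
 * flags such as CODEC_FLAG_BITEXACT. (Explanatory note, not part of the
 * original file.) */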
void dsputilenc_init_mmx(DSPContext* c, AVCodecContext *avctx)
{
    int mm_flags = av_get_cpu_flags();

    if (mm_flags & AV_CPU_FLAG_MMX) {
        const int dct_algo = avctx->dct_algo;
        if(dct_algo==FF_DCT_AUTO || dct_algo==FF_DCT_MMX){
            if(mm_flags & AV_CPU_FLAG_SSE2){
                c->fdct = ff_fdct_sse2;
            }else if(mm_flags & AV_CPU_FLAG_MMX2){
                c->fdct = ff_fdct_mmx2;
            }else{
                c->fdct = ff_fdct_mmx;
            }
        }

        c->get_pixels = get_pixels_mmx;
        c->diff_pixels = diff_pixels_mmx;
        c->pix_sum = pix_sum16_mmx;

        c->diff_bytes= diff_bytes_mmx;
        c->sum_abs_dctelem= sum_abs_dctelem_mmx;

#if HAVE_YASM
        c->hadamard8_diff[0]= ff_hadamard8_diff16_mmx;
        c->hadamard8_diff[1]= ff_hadamard8_diff_mmx;
#endif

        c->pix_norm1 = pix_norm1_mmx;
        c->sse[0] = (HAVE_YASM && mm_flags & AV_CPU_FLAG_SSE2) ? ff_sse16_sse2 : sse16_mmx;
        c->sse[1] = sse8_mmx;
        c->vsad[4]= vsad_intra16_mmx;

        c->nsse[0] = nsse16_mmx;
        c->nsse[1] = nsse8_mmx;
        if(!(avctx->flags & CODEC_FLAG_BITEXACT)){
            c->vsad[0] = vsad16_mmx;
        }

        if(!(avctx->flags & CODEC_FLAG_BITEXACT)){
            c->try_8x8basis= try_8x8basis_mmx;
        }
        c->add_8x8basis= add_8x8basis_mmx;

        c->ssd_int8_vs_int16 = ssd_int8_vs_int16_mmx;


        if (mm_flags & AV_CPU_FLAG_MMX2) {
            c->sum_abs_dctelem= sum_abs_dctelem_mmx2;
#if HAVE_YASM
            c->hadamard8_diff[0]= ff_hadamard8_diff16_mmx2;
            c->hadamard8_diff[1]= ff_hadamard8_diff_mmx2;
#endif
            c->vsad[4]= vsad_intra16_mmx2;

            if(!(avctx->flags & CODEC_FLAG_BITEXACT)){
                c->vsad[0] = vsad16_mmx2;
            }

            c->sub_hfyu_median_prediction= sub_hfyu_median_prediction_mmx2;
        }

        if(mm_flags & AV_CPU_FLAG_SSE2){
            c->get_pixels = get_pixels_sse2;
            c->sum_abs_dctelem= sum_abs_dctelem_sse2;
#if HAVE_YASM && HAVE_ALIGNED_STACK
            c->hadamard8_diff[0]= ff_hadamard8_diff16_sse2;
            c->hadamard8_diff[1]= ff_hadamard8_diff_sse2;
#endif
        }

#if HAVE_SSSE3
        if(mm_flags & AV_CPU_FLAG_SSSE3){
            if(!(avctx->flags & CODEC_FLAG_BITEXACT)){
                c->try_8x8basis= try_8x8basis_ssse3;
            }
            c->add_8x8basis= add_8x8basis_ssse3;
            c->sum_abs_dctelem= sum_abs_dctelem_ssse3;
#if HAVE_YASM && HAVE_ALIGNED_STACK
            c->hadamard8_diff[0]= ff_hadamard8_diff16_ssse3;
            c->hadamard8_diff[1]= ff_hadamard8_diff_ssse3;
#endif
        }
#endif

        if(mm_flags & AV_CPU_FLAG_3DNOW){
            if(!(avctx->flags & CODEC_FLAG_BITEXACT)){
                c->try_8x8basis= try_8x8basis_3dnow;
            }
            c->add_8x8basis= add_8x8basis_3dnow;
        }
    }

    dsputil_init_pix_mmx(c, avctx);
}