/*
 * MMX optimized DSP utils
 * Copyright (c) 2000, 2001 Fabrice Bellard.
 * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 *
 * MMX optimization by Nick Kurshev <nickols_k@mail.ru>
 */

#include "libavutil/x86_cpu.h"
#include "libavcodec/dsputil.h"
#include "libavcodec/mpegvideo.h"
#include "dsputil_mmx.h"


static void get_pixels_mmx(DCTELEM *block, const uint8_t *pixels, int line_size)
{
    asm volatile(
        "mov $-128, %%"REG_a"           \n\t"
        "pxor %%mm7, %%mm7              \n\t"
        ASMALIGN(4)
        "1:                             \n\t"
        "movq (%0), %%mm0               \n\t"
        "movq (%0, %2), %%mm2           \n\t"
        "movq %%mm0, %%mm1              \n\t"
        "movq %%mm2, %%mm3              \n\t"
        "punpcklbw %%mm7, %%mm0         \n\t"
        "punpckhbw %%mm7, %%mm1         \n\t"
        "punpcklbw %%mm7, %%mm2         \n\t"
        "punpckhbw %%mm7, %%mm3         \n\t"
        "movq %%mm0, (%1, %%"REG_a")    \n\t"
        "movq %%mm1, 8(%1, %%"REG_a")   \n\t"
        "movq %%mm2, 16(%1, %%"REG_a")  \n\t"
        "movq %%mm3, 24(%1, %%"REG_a")  \n\t"
        "add %3, %0                     \n\t"
        "add $32, %%"REG_a"             \n\t"
        "js 1b                          \n\t"
        : "+r" (pixels)
        : "r" (block+64), "r" ((x86_reg)line_size), "r" ((x86_reg)line_size*2)
        : "%"REG_a
    );
}

static void get_pixels_sse2(DCTELEM *block, const uint8_t *pixels, int line_size)
{
    asm volatile(
        "pxor %%xmm7,      %%xmm7         \n\t"
        "movq (%0),        %%xmm0         \n\t"
        "movq (%0, %2),    %%xmm1         \n\t"
        "movq (%0, %2,2),  %%xmm2         \n\t"
        "movq (%0, %3),    %%xmm3         \n\t"
        "lea (%0,%2,4), %0                \n\t"
        "punpcklbw %%xmm7, %%xmm0         \n\t"
        "punpcklbw %%xmm7, %%xmm1         \n\t"
        "punpcklbw %%xmm7, %%xmm2         \n\t"
        "punpcklbw %%xmm7, %%xmm3         \n\t"
        "movdqa %%xmm0,      (%1)         \n\t"
        "movdqa %%xmm1,    16(%1)         \n\t"
        "movdqa %%xmm2,    32(%1)         \n\t"
        "movdqa %%xmm3,    48(%1)         \n\t"
        "movq (%0),        %%xmm0         \n\t"
        "movq (%0, %2),    %%xmm1         \n\t"
        "movq (%0, %2,2),  %%xmm2         \n\t"
        "movq (%0, %3),    %%xmm3         \n\t"
        "punpcklbw %%xmm7, %%xmm0         \n\t"
        "punpcklbw %%xmm7, %%xmm1         \n\t"
        "punpcklbw %%xmm7, %%xmm2         \n\t"
        "punpcklbw %%xmm7, %%xmm3         \n\t"
        "movdqa %%xmm0,    64(%1)         \n\t"
        "movdqa %%xmm1,    80(%1)         \n\t"
        "movdqa %%xmm2,    96(%1)         \n\t"
        "movdqa %%xmm3,   112(%1)         \n\t"
        : "+r" (pixels)
        : "r" (block), "r" ((x86_reg)line_size), "r" ((x86_reg)line_size*3)
    );
}
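
/* Editorial note (not part of the original file): both get_pixels versions
 * read an 8x8 block of unsigned bytes and widen it to 16-bit DCTELEMs in a
 * 64-entry destination array. A minimal plain-C sketch of the same operation,
 * with a hypothetical name, would be:
 *
 *     static void get_pixels_c_sketch(DCTELEM *block, const uint8_t *pixels,
 *                                     int line_size)
 *     {
 *         int i, j;
 *         for (i = 0; i < 8; i++) {
 *             for (j = 0; j < 8; j++)
 *                 block[8 * i + j] = pixels[j];
 *             pixels += line_size;
 *         }
 *     }
 */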

static inline void diff_pixels_mmx(DCTELEM *block, const uint8_t *s1, const uint8_t *s2, int stride)
{
    asm volatile(
        "pxor %%mm7, %%mm7              \n\t"
        "mov $-128, %%"REG_a"           \n\t"
        ASMALIGN(4)
        "1:                             \n\t"
        "movq (%0), %%mm0               \n\t"
        "movq (%1), %%mm2               \n\t"
        "movq %%mm0, %%mm1              \n\t"
        "movq %%mm2, %%mm3              \n\t"
        "punpcklbw %%mm7, %%mm0         \n\t"
        "punpckhbw %%mm7, %%mm1         \n\t"
        "punpcklbw %%mm7, %%mm2         \n\t"
        "punpckhbw %%mm7, %%mm3         \n\t"
        "psubw %%mm2, %%mm0             \n\t"
        "psubw %%mm3, %%mm1             \n\t"
        "movq %%mm0, (%2, %%"REG_a")    \n\t"
        "movq %%mm1, 8(%2, %%"REG_a")   \n\t"
        "add %3, %0                     \n\t"
        "add %3, %1                     \n\t"
        "add $16, %%"REG_a"             \n\t"
        "jnz 1b                         \n\t"
        : "+r" (s1), "+r" (s2)
        : "r" (block+64), "r" ((x86_reg)stride)
        : "%"REG_a
    );
}
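
/* Editorial note (not part of the original file): diff_pixels computes the
 * element-wise difference of two 8x8 byte blocks into 16-bit DCTELEMs.
 * A plain-C sketch (hypothetical name) for reference:
 *
 *     static void diff_pixels_c_sketch(DCTELEM *block, const uint8_t *s1,
 *                                      const uint8_t *s2, int stride)
 *     {
 *         int i, j;
 *         for (i = 0; i < 8; i++) {
 *             for (j = 0; j < 8; j++)
 *                 block[8 * i + j] = s1[j] - s2[j];
 *             s1 += stride;
 *             s2 += stride;
 *         }
 *     }
 */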

static int pix_sum16_mmx(uint8_t * pix, int line_size){
    const int h=16;
    int sum;
    x86_reg index= -line_size*h;

    asm volatile(
                "pxor %%mm7, %%mm7              \n\t"
                "pxor %%mm6, %%mm6              \n\t"
                "1:                             \n\t"
                "movq (%2, %1), %%mm0           \n\t"
                "movq (%2, %1), %%mm1           \n\t"
                "movq 8(%2, %1), %%mm2          \n\t"
                "movq 8(%2, %1), %%mm3          \n\t"
                "punpcklbw %%mm7, %%mm0         \n\t"
                "punpckhbw %%mm7, %%mm1         \n\t"
                "punpcklbw %%mm7, %%mm2         \n\t"
                "punpckhbw %%mm7, %%mm3         \n\t"
                "paddw %%mm0, %%mm1             \n\t"
                "paddw %%mm2, %%mm3             \n\t"
                "paddw %%mm1, %%mm3             \n\t"
                "paddw %%mm3, %%mm6             \n\t"
                "add %3, %1                     \n\t"
                " js 1b                         \n\t"
                "movq %%mm6, %%mm5              \n\t"
                "psrlq $32, %%mm6               \n\t"
                "paddw %%mm5, %%mm6             \n\t"
                "movq %%mm6, %%mm5              \n\t"
                "psrlq $16, %%mm6               \n\t"
                "paddw %%mm5, %%mm6             \n\t"
                "movd %%mm6, %0                 \n\t"
                "andl $0xFFFF, %0               \n\t"
                : "=&r" (sum), "+r" (index)
                : "r" (pix - index), "r" ((x86_reg)line_size)
        );

        return sum;
}
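
/* Editorial note (not part of the original file): pix_sum16 adds up all 256
 * pixels of a 16x16 block; the result fits in 16 bits, hence the final
 * "andl $0xFFFF". Equivalent plain-C sketch (hypothetical name):
 *
 *     static int pix_sum16_c_sketch(uint8_t *pix, int line_size)
 *     {
 *         int i, j, sum = 0;
 *         for (i = 0; i < 16; i++) {
 *             for (j = 0; j < 16; j++)
 *                 sum += pix[j];
 *             pix += line_size;
 *         }
 *         return sum;
 *     }
 */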

static int pix_norm1_mmx(uint8_t *pix, int line_size) {
    int tmp;
  asm volatile (
      "movl $16,%%ecx\n"
      "pxor %%mm0,%%mm0\n"
      "pxor %%mm7,%%mm7\n"
      "1:\n"
      "movq (%0),%%mm2\n"       /* mm2 = pix[0-7] */
      "movq 8(%0),%%mm3\n"      /* mm3 = pix[8-15] */

      "movq %%mm2,%%mm1\n"      /* mm1 = mm2 = pix[0-7] */

      "punpckhbw %%mm0,%%mm1\n" /* mm1 = [pix4-7] */
      "punpcklbw %%mm0,%%mm2\n" /* mm2 = [pix0-3] */

      "movq %%mm3,%%mm4\n"      /* mm4 = mm3 = pix[8-15] */
      "punpckhbw %%mm0,%%mm3\n" /* mm3 = [pix12-15] */
      "punpcklbw %%mm0,%%mm4\n" /* mm4 = [pix8-11] */

      "pmaddwd %%mm1,%%mm1\n"   /* mm1 = (pix4^2+pix5^2,pix6^2+pix7^2) */
      "pmaddwd %%mm2,%%mm2\n"   /* mm2 = (pix0^2+pix1^2,pix2^2+pix3^2) */

      "pmaddwd %%mm3,%%mm3\n"
      "pmaddwd %%mm4,%%mm4\n"

      "paddd %%mm1,%%mm2\n"     /* mm2 = (pix0^2+pix1^2+pix4^2+pix5^2,
                                          pix2^2+pix3^2+pix6^2+pix7^2) */
      "paddd %%mm3,%%mm4\n"
      "paddd %%mm2,%%mm7\n"

      "add %2, %0\n"
      "paddd %%mm4,%%mm7\n"
      "dec %%ecx\n"
      "jnz 1b\n"

      "movq %%mm7,%%mm1\n"
      "psrlq $32, %%mm7\n"      /* shift hi dword to lo */
      "paddd %%mm7,%%mm1\n"
      "movd %%mm1,%1\n"
      : "+r" (pix), "=r"(tmp) : "r" ((x86_reg)line_size) : "%ecx" );
    return tmp;
}
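
/* Editorial note (not part of the original file): pix_norm1 returns the sum
 * of squared pixel values over a 16x16 block. Plain-C sketch (hypothetical
 * name):
 *
 *     static int pix_norm1_c_sketch(uint8_t *pix, int line_size)
 *     {
 *         int i, j, s = 0;
 *         for (i = 0; i < 16; i++) {
 *             for (j = 0; j < 16; j++)
 *                 s += pix[j] * pix[j];
 *             pix += line_size;
 *         }
 *         return s;
 *     }
 */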

static int sse8_mmx(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {
    int tmp;
  asm volatile (
      "movl %4,%%ecx\n"
      "shr $1,%%ecx\n"
      "pxor %%mm0,%%mm0\n"      /* mm0 = 0 */
      "pxor %%mm7,%%mm7\n"      /* mm7 holds the sum */
      "1:\n"
      "movq (%0),%%mm1\n"       /* mm1 = pix1[0][0-7] */
      "movq (%1),%%mm2\n"       /* mm2 = pix2[0][0-7] */
      "movq (%0,%3),%%mm3\n"    /* mm3 = pix1[1][0-7] */
      "movq (%1,%3),%%mm4\n"    /* mm4 = pix2[1][0-7] */

      /* todo: mm1-mm2, mm3-mm4 */
      /* algo: subtract mm1 from mm2 with saturation and vice versa */
      /*       OR the results to get absolute difference */
      "movq %%mm1,%%mm5\n"
      "movq %%mm3,%%mm6\n"
      "psubusb %%mm2,%%mm1\n"
      "psubusb %%mm4,%%mm3\n"
      "psubusb %%mm5,%%mm2\n"
      "psubusb %%mm6,%%mm4\n"

      "por %%mm1,%%mm2\n"
      "por %%mm3,%%mm4\n"

      /* now convert to 16-bit vectors so we can square them */
      "movq %%mm2,%%mm1\n"
      "movq %%mm4,%%mm3\n"

      "punpckhbw %%mm0,%%mm2\n"
      "punpckhbw %%mm0,%%mm4\n"
      "punpcklbw %%mm0,%%mm1\n" /* mm1 now spread over (mm1,mm2) */
      "punpcklbw %%mm0,%%mm3\n" /* mm4 now spread over (mm3,mm4) */

      "pmaddwd %%mm2,%%mm2\n"
      "pmaddwd %%mm4,%%mm4\n"
      "pmaddwd %%mm1,%%mm1\n"
      "pmaddwd %%mm3,%%mm3\n"

      "lea (%0,%3,2), %0\n"     /* pix1 += 2*line_size */
      "lea (%1,%3,2), %1\n"     /* pix2 += 2*line_size */

      "paddd %%mm2,%%mm1\n"
      "paddd %%mm4,%%mm3\n"
      "paddd %%mm1,%%mm7\n"
      "paddd %%mm3,%%mm7\n"

      "decl %%ecx\n"
      "jnz 1b\n"

      "movq %%mm7,%%mm1\n"
      "psrlq $32, %%mm7\n"      /* shift hi dword to lo */
      "paddd %%mm7,%%mm1\n"
      "movd %%mm1,%2\n"
      : "+r" (pix1), "+r" (pix2), "=r"(tmp)
      : "r" ((x86_reg)line_size) , "m" (h)
      : "%ecx");
    return tmp;
}

static int sse16_mmx(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {
    int tmp;
  asm volatile (
      "movl %4,%%ecx\n"
      "pxor %%mm0,%%mm0\n"      /* mm0 = 0 */
      "pxor %%mm7,%%mm7\n"      /* mm7 holds the sum */
      "1:\n"
      "movq (%0),%%mm1\n"       /* mm1 = pix1[0-7] */
      "movq (%1),%%mm2\n"       /* mm2 = pix2[0-7] */
      "movq 8(%0),%%mm3\n"      /* mm3 = pix1[8-15] */
      "movq 8(%1),%%mm4\n"      /* mm4 = pix2[8-15] */

      /* todo: mm1-mm2, mm3-mm4 */
      /* algo: subtract mm1 from mm2 with saturation and vice versa */
      /*       OR the results to get absolute difference */
      "movq %%mm1,%%mm5\n"
      "movq %%mm3,%%mm6\n"
      "psubusb %%mm2,%%mm1\n"
      "psubusb %%mm4,%%mm3\n"
      "psubusb %%mm5,%%mm2\n"
      "psubusb %%mm6,%%mm4\n"

      "por %%mm1,%%mm2\n"
      "por %%mm3,%%mm4\n"

      /* now convert to 16-bit vectors so we can square them */
      "movq %%mm2,%%mm1\n"
      "movq %%mm4,%%mm3\n"

      "punpckhbw %%mm0,%%mm2\n"
      "punpckhbw %%mm0,%%mm4\n"
      "punpcklbw %%mm0,%%mm1\n" /* mm1 now spread over (mm1,mm2) */
      "punpcklbw %%mm0,%%mm3\n" /* mm4 now spread over (mm3,mm4) */

      "pmaddwd %%mm2,%%mm2\n"
      "pmaddwd %%mm4,%%mm4\n"
      "pmaddwd %%mm1,%%mm1\n"
      "pmaddwd %%mm3,%%mm3\n"

      "add %3,%0\n"
      "add %3,%1\n"

      "paddd %%mm2,%%mm1\n"
      "paddd %%mm4,%%mm3\n"
      "paddd %%mm1,%%mm7\n"
      "paddd %%mm3,%%mm7\n"

      "decl %%ecx\n"
      "jnz 1b\n"

      "movq %%mm7,%%mm1\n"
      "psrlq $32, %%mm7\n"      /* shift hi dword to lo */
      "paddd %%mm7,%%mm1\n"
      "movd %%mm1,%2\n"
      : "+r" (pix1), "+r" (pix2), "=r"(tmp)
      : "r" ((x86_reg)line_size) , "m" (h)
      : "%ecx");
    return tmp;
}

static int sse16_sse2(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {
    int tmp;
  asm volatile (
      "shr $1,%2\n"
      "pxor %%xmm0,%%xmm0\n"    /* mm0 = 0 */
      "pxor %%xmm7,%%xmm7\n"    /* mm7 holds the sum */
      "1:\n"
      "movdqu (%0),%%xmm1\n"    /* mm1 = pix1[0][0-15] */
      "movdqu (%1),%%xmm2\n"    /* mm2 = pix2[0][0-15] */
      "movdqu (%0,%4),%%xmm3\n" /* mm3 = pix1[1][0-15] */
      "movdqu (%1,%4),%%xmm4\n" /* mm4 = pix2[1][0-15] */

      /* todo: mm1-mm2, mm3-mm4 */
      /* algo: subtract mm1 from mm2 with saturation and vice versa */
      /*       OR the results to get absolute difference */
      "movdqa %%xmm1,%%xmm5\n"
      "movdqa %%xmm3,%%xmm6\n"
      "psubusb %%xmm2,%%xmm1\n"
      "psubusb %%xmm4,%%xmm3\n"
      "psubusb %%xmm5,%%xmm2\n"
      "psubusb %%xmm6,%%xmm4\n"

      "por %%xmm1,%%xmm2\n"
      "por %%xmm3,%%xmm4\n"

      /* now convert to 16-bit vectors so we can square them */
      "movdqa %%xmm2,%%xmm1\n"
      "movdqa %%xmm4,%%xmm3\n"

      "punpckhbw %%xmm0,%%xmm2\n"
      "punpckhbw %%xmm0,%%xmm4\n"
      "punpcklbw %%xmm0,%%xmm1\n"  /* mm1 now spread over (mm1,mm2) */
      "punpcklbw %%xmm0,%%xmm3\n"  /* mm4 now spread over (mm3,mm4) */

      "pmaddwd %%xmm2,%%xmm2\n"
      "pmaddwd %%xmm4,%%xmm4\n"
      "pmaddwd %%xmm1,%%xmm1\n"
      "pmaddwd %%xmm3,%%xmm3\n"

      "lea (%0,%4,2), %0\n"        /* pix1 += 2*line_size */
      "lea (%1,%4,2), %1\n"        /* pix2 += 2*line_size */

      "paddd %%xmm2,%%xmm1\n"
      "paddd %%xmm4,%%xmm3\n"
      "paddd %%xmm1,%%xmm7\n"
      "paddd %%xmm3,%%xmm7\n"

      "decl %2\n"
      "jnz 1b\n"

      "movdqa %%xmm7,%%xmm1\n"
      "psrldq $8, %%xmm7\n"        /* shift hi qword to lo */
      "paddd %%xmm1,%%xmm7\n"
      "movdqa %%xmm7,%%xmm1\n"
      "psrldq $4, %%xmm7\n"        /* shift hi dword to lo */
      "paddd %%xmm1,%%xmm7\n"
      "movd %%xmm7,%3\n"
      : "+r" (pix1), "+r" (pix2), "+r"(h), "=r"(tmp)
      : "r" ((x86_reg)line_size));
    return tmp;
}
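
/* Editorial note (not part of the original file): all three sse* functions
 * compute the sum of squared differences between two blocks of width 8 or 16
 * and height h. Plain-C sketch (hypothetical name, width as a parameter):
 *
 *     static int sse_c_sketch(uint8_t *pix1, uint8_t *pix2, int line_size,
 *                             int w, int h)
 *     {
 *         int x, y, d, sum = 0;
 *         for (y = 0; y < h; y++) {
 *             for (x = 0; x < w; x++) {
 *                 d = pix1[x] - pix2[x];
 *                 sum += d * d;
 *             }
 *             pix1 += line_size;
 *             pix2 += line_size;
 *         }
 *         return sum;
 *     }
 */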

static int hf_noise8_mmx(uint8_t * pix1, int line_size, int h) {
    int tmp;
  asm volatile (
      "movl %3,%%ecx\n"
      "pxor %%mm7,%%mm7\n"
      "pxor %%mm6,%%mm6\n"

      "movq (%0),%%mm0\n"
      "movq %%mm0, %%mm1\n"
      "psllq $8, %%mm0\n"
      "psrlq $8, %%mm1\n"
      "psrlq $8, %%mm0\n"
      "movq %%mm0, %%mm2\n"
      "movq %%mm1, %%mm3\n"
      "punpcklbw %%mm7,%%mm0\n"
      "punpcklbw %%mm7,%%mm1\n"
      "punpckhbw %%mm7,%%mm2\n"
      "punpckhbw %%mm7,%%mm3\n"
      "psubw %%mm1, %%mm0\n"
      "psubw %%mm3, %%mm2\n"

      "add %2,%0\n"

      "movq (%0),%%mm4\n"
      "movq %%mm4, %%mm1\n"
      "psllq $8, %%mm4\n"
      "psrlq $8, %%mm1\n"
      "psrlq $8, %%mm4\n"
      "movq %%mm4, %%mm5\n"
      "movq %%mm1, %%mm3\n"
      "punpcklbw %%mm7,%%mm4\n"
      "punpcklbw %%mm7,%%mm1\n"
      "punpckhbw %%mm7,%%mm5\n"
      "punpckhbw %%mm7,%%mm3\n"
      "psubw %%mm1, %%mm4\n"
      "psubw %%mm3, %%mm5\n"
      "psubw %%mm4, %%mm0\n"
      "psubw %%mm5, %%mm2\n"
      "pxor %%mm3, %%mm3\n"
      "pxor %%mm1, %%mm1\n"
      "pcmpgtw %%mm0, %%mm3\n\t"
      "pcmpgtw %%mm2, %%mm1\n\t"
      "pxor %%mm3, %%mm0\n"
      "pxor %%mm1, %%mm2\n"
      "psubw %%mm3, %%mm0\n"
      "psubw %%mm1, %%mm2\n"
      "paddw %%mm0, %%mm2\n"
      "paddw %%mm2, %%mm6\n"

      "add %2,%0\n"
      "1:\n"

      "movq (%0),%%mm0\n"
      "movq %%mm0, %%mm1\n"
      "psllq $8, %%mm0\n"
      "psrlq $8, %%mm1\n"
      "psrlq $8, %%mm0\n"
      "movq %%mm0, %%mm2\n"
      "movq %%mm1, %%mm3\n"
      "punpcklbw %%mm7,%%mm0\n"
      "punpcklbw %%mm7,%%mm1\n"
      "punpckhbw %%mm7,%%mm2\n"
      "punpckhbw %%mm7,%%mm3\n"
      "psubw %%mm1, %%mm0\n"
      "psubw %%mm3, %%mm2\n"
      "psubw %%mm0, %%mm4\n"
      "psubw %%mm2, %%mm5\n"
      "pxor %%mm3, %%mm3\n"
      "pxor %%mm1, %%mm1\n"
      "pcmpgtw %%mm4, %%mm3\n\t"
      "pcmpgtw %%mm5, %%mm1\n\t"
      "pxor %%mm3, %%mm4\n"
      "pxor %%mm1, %%mm5\n"
      "psubw %%mm3, %%mm4\n"
      "psubw %%mm1, %%mm5\n"
      "paddw %%mm4, %%mm5\n"
      "paddw %%mm5, %%mm6\n"

      "add %2,%0\n"

      "movq (%0),%%mm4\n"
      "movq %%mm4, %%mm1\n"
      "psllq $8, %%mm4\n"
      "psrlq $8, %%mm1\n"
      "psrlq $8, %%mm4\n"
      "movq %%mm4, %%mm5\n"
      "movq %%mm1, %%mm3\n"
      "punpcklbw %%mm7,%%mm4\n"
      "punpcklbw %%mm7,%%mm1\n"
      "punpckhbw %%mm7,%%mm5\n"
      "punpckhbw %%mm7,%%mm3\n"
      "psubw %%mm1, %%mm4\n"
      "psubw %%mm3, %%mm5\n"
      "psubw %%mm4, %%mm0\n"
      "psubw %%mm5, %%mm2\n"
      "pxor %%mm3, %%mm3\n"
      "pxor %%mm1, %%mm1\n"
      "pcmpgtw %%mm0, %%mm3\n\t"
      "pcmpgtw %%mm2, %%mm1\n\t"
      "pxor %%mm3, %%mm0\n"
      "pxor %%mm1, %%mm2\n"
      "psubw %%mm3, %%mm0\n"
      "psubw %%mm1, %%mm2\n"
      "paddw %%mm0, %%mm2\n"
      "paddw %%mm2, %%mm6\n"

      "add %2,%0\n"
      "subl $2, %%ecx\n"
      " jnz 1b\n"

      "movq %%mm6, %%mm0\n"
      "punpcklwd %%mm7,%%mm0\n"
      "punpckhwd %%mm7,%%mm6\n"
      "paddd %%mm0, %%mm6\n"

      "movq %%mm6,%%mm0\n"
      "psrlq $32, %%mm6\n"
      "paddd %%mm6,%%mm0\n"
      "movd %%mm0,%1\n"
      : "+r" (pix1), "=r"(tmp)
      : "r" ((x86_reg)line_size) , "g" (h-2)
      : "%ecx");
      return tmp;
}
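
/* Editorial note (not part of the original file): hf_noise8/hf_noise16
 * accumulate the absolute differences between the horizontal gradients of
 * vertically adjacent rows, i.e. a rough measure of high-frequency "noise".
 * Plain-C sketch of the idea for a block of width 8 (hypothetical name):
 *
 *     static int hf_noise8_c_sketch(uint8_t *pix, int line_size, int h)
 *     {
 *         int x, y, sum = 0;
 *         for (y = 1; y < h; y++) {
 *             for (x = 0; x < 7; x++) {
 *                 int d0 = pix[x + 1]             - pix[x];
 *                 int d1 = pix[x + 1 + line_size] - pix[x + line_size];
 *                 sum += FFABS(d0 - d1);
 *             }
 *             pix += line_size;
 *         }
 *         return sum;
 *     }
 */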

static int hf_noise16_mmx(uint8_t * pix1, int line_size, int h) {
    int tmp;
    uint8_t * pix= pix1;
  asm volatile (
      "movl %3,%%ecx\n"
      "pxor %%mm7,%%mm7\n"
      "pxor %%mm6,%%mm6\n"

      "movq (%0),%%mm0\n"
      "movq 1(%0),%%mm1\n"
      "movq %%mm0, %%mm2\n"
      "movq %%mm1, %%mm3\n"
      "punpcklbw %%mm7,%%mm0\n"
      "punpcklbw %%mm7,%%mm1\n"
      "punpckhbw %%mm7,%%mm2\n"
      "punpckhbw %%mm7,%%mm3\n"
      "psubw %%mm1, %%mm0\n"
      "psubw %%mm3, %%mm2\n"

      "add %2,%0\n"

      "movq (%0),%%mm4\n"
      "movq 1(%0),%%mm1\n"
      "movq %%mm4, %%mm5\n"
      "movq %%mm1, %%mm3\n"
      "punpcklbw %%mm7,%%mm4\n"
      "punpcklbw %%mm7,%%mm1\n"
      "punpckhbw %%mm7,%%mm5\n"
      "punpckhbw %%mm7,%%mm3\n"
      "psubw %%mm1, %%mm4\n"
      "psubw %%mm3, %%mm5\n"
      "psubw %%mm4, %%mm0\n"
      "psubw %%mm5, %%mm2\n"
      "pxor %%mm3, %%mm3\n"
      "pxor %%mm1, %%mm1\n"
      "pcmpgtw %%mm0, %%mm3\n\t"
      "pcmpgtw %%mm2, %%mm1\n\t"
      "pxor %%mm3, %%mm0\n"
      "pxor %%mm1, %%mm2\n"
      "psubw %%mm3, %%mm0\n"
      "psubw %%mm1, %%mm2\n"
      "paddw %%mm0, %%mm2\n"
      "paddw %%mm2, %%mm6\n"

      "add %2,%0\n"
      "1:\n"

      "movq (%0),%%mm0\n"
      "movq 1(%0),%%mm1\n"
      "movq %%mm0, %%mm2\n"
      "movq %%mm1, %%mm3\n"
      "punpcklbw %%mm7,%%mm0\n"
      "punpcklbw %%mm7,%%mm1\n"
      "punpckhbw %%mm7,%%mm2\n"
      "punpckhbw %%mm7,%%mm3\n"
      "psubw %%mm1, %%mm0\n"
      "psubw %%mm3, %%mm2\n"
      "psubw %%mm0, %%mm4\n"
      "psubw %%mm2, %%mm5\n"
      "pxor %%mm3, %%mm3\n"
      "pxor %%mm1, %%mm1\n"
      "pcmpgtw %%mm4, %%mm3\n\t"
      "pcmpgtw %%mm5, %%mm1\n\t"
      "pxor %%mm3, %%mm4\n"
      "pxor %%mm1, %%mm5\n"
      "psubw %%mm3, %%mm4\n"
      "psubw %%mm1, %%mm5\n"
      "paddw %%mm4, %%mm5\n"
      "paddw %%mm5, %%mm6\n"

      "add %2,%0\n"

      "movq (%0),%%mm4\n"
      "movq 1(%0),%%mm1\n"
      "movq %%mm4, %%mm5\n"
      "movq %%mm1, %%mm3\n"
      "punpcklbw %%mm7,%%mm4\n"
      "punpcklbw %%mm7,%%mm1\n"
      "punpckhbw %%mm7,%%mm5\n"
      "punpckhbw %%mm7,%%mm3\n"
      "psubw %%mm1, %%mm4\n"
      "psubw %%mm3, %%mm5\n"
      "psubw %%mm4, %%mm0\n"
      "psubw %%mm5, %%mm2\n"
      "pxor %%mm3, %%mm3\n"
      "pxor %%mm1, %%mm1\n"
      "pcmpgtw %%mm0, %%mm3\n\t"
      "pcmpgtw %%mm2, %%mm1\n\t"
      "pxor %%mm3, %%mm0\n"
      "pxor %%mm1, %%mm2\n"
      "psubw %%mm3, %%mm0\n"
      "psubw %%mm1, %%mm2\n"
      "paddw %%mm0, %%mm2\n"
      "paddw %%mm2, %%mm6\n"

      "add %2,%0\n"
      "subl $2, %%ecx\n"
      " jnz 1b\n"

      "movq %%mm6, %%mm0\n"
      "punpcklwd %%mm7,%%mm0\n"
      "punpckhwd %%mm7,%%mm6\n"
      "paddd %%mm0, %%mm6\n"

      "movq %%mm6,%%mm0\n"
      "psrlq $32, %%mm6\n"
      "paddd %%mm6,%%mm0\n"
      "movd %%mm0,%1\n"
      : "+r" (pix1), "=r"(tmp)
      : "r" ((x86_reg)line_size) , "g" (h-2)
      : "%ecx");
      return tmp + hf_noise8_mmx(pix+8, line_size, h);
}

static int nsse16_mmx(void *p, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {
    MpegEncContext *c = p;
    int score1, score2;

    if(c) score1 = c->dsp.sse[0](c, pix1, pix2, line_size, h);
    else  score1 = sse16_mmx(c, pix1, pix2, line_size, h);
    score2= hf_noise16_mmx(pix1, line_size, h) - hf_noise16_mmx(pix2, line_size, h);

    if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight;
    else  return score1 + FFABS(score2)*8;
}

static int nsse8_mmx(void *p, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {
    MpegEncContext *c = p;
    int score1= sse8_mmx(c, pix1, pix2, line_size, h);
    int score2= hf_noise8_mmx(pix1, line_size, h) - hf_noise8_mmx(pix2, line_size, h);

    if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight;
    else  return score1 + FFABS(score2)*8;
}
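
/* Editorial note (not part of the original file): the "noise preserving" SSE
 * comparison above combines plain SSE with the difference in high-frequency
 * noise between the two blocks, roughly:
 *
 *     score = sse(pix1, pix2)
 *           + nsse_weight * |hf_noise(pix1) - hf_noise(pix2)|
 *
 * with nsse_weight defaulting to 8 when no encoder context is available.
 */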

static int vsad_intra16_mmx(void *v, uint8_t * pix, uint8_t * dummy, int line_size, int h) {
    int tmp;

    assert( (((int)pix) & 7) == 0);
    assert((line_size &7) ==0);

#define SUM(in0, in1, out0, out1) \
      "movq (%0), %%mm2\n"\
      "movq 8(%0), %%mm3\n"\
      "add %2,%0\n"\
      "movq %%mm2, " #out0 "\n"\
      "movq %%mm3, " #out1 "\n"\
      "psubusb " #in0 ", %%mm2\n"\
      "psubusb " #in1 ", %%mm3\n"\
      "psubusb " #out0 ", " #in0 "\n"\
      "psubusb " #out1 ", " #in1 "\n"\
      "por %%mm2, " #in0 "\n"\
      "por %%mm3, " #in1 "\n"\
      "movq " #in0 ", %%mm2\n"\
      "movq " #in1 ", %%mm3\n"\
      "punpcklbw %%mm7, " #in0 "\n"\
      "punpcklbw %%mm7, " #in1 "\n"\
      "punpckhbw %%mm7, %%mm2\n"\
      "punpckhbw %%mm7, %%mm3\n"\
      "paddw " #in1 ", " #in0 "\n"\
      "paddw %%mm3, %%mm2\n"\
      "paddw %%mm2, " #in0 "\n"\
      "paddw " #in0 ", %%mm6\n"


  asm volatile (
      "movl %3,%%ecx\n"
      "pxor %%mm6,%%mm6\n"
      "pxor %%mm7,%%mm7\n"
      "movq (%0),%%mm0\n"
      "movq 8(%0),%%mm1\n"
      "add %2,%0\n"
      "jmp 2f\n"
      "1:\n"

      SUM(%%mm4, %%mm5, %%mm0, %%mm1)
      "2:\n"
      SUM(%%mm0, %%mm1, %%mm4, %%mm5)

      "subl $2, %%ecx\n"
      "jnz 1b\n"

      "movq %%mm6,%%mm0\n"
      "psrlq $32, %%mm6\n"
      "paddw %%mm6,%%mm0\n"
      "movq %%mm0,%%mm6\n"
      "psrlq $16, %%mm0\n"
      "paddw %%mm6,%%mm0\n"
      "movd %%mm0,%1\n"
      : "+r" (pix), "=r"(tmp)
      : "r" ((x86_reg)line_size) , "m" (h)
      : "%ecx");
    return tmp & 0xFFFF;
}
#undef SUM
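
/* Editorial note (not part of the original file): vsad_intra sums the
 * absolute differences between vertically adjacent pixels of a single block.
 * Plain-C sketch for width 16 (hypothetical name):
 *
 *     static int vsad_intra16_c_sketch(uint8_t *pix, int line_size, int h)
 *     {
 *         int x, y, sum = 0;
 *         for (y = 1; y < h; y++) {
 *             for (x = 0; x < 16; x++)
 *                 sum += FFABS(pix[x] - pix[x + line_size]);
 *             pix += line_size;
 *         }
 *         return sum;
 *     }
 */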

static int vsad_intra16_mmx2(void *v, uint8_t * pix, uint8_t * dummy, int line_size, int h) {
    int tmp;

    assert( (((int)pix) & 7) == 0);
    assert((line_size &7) ==0);

#define SUM(in0, in1, out0, out1) \
      "movq (%0), " #out0 "\n"\
      "movq 8(%0), " #out1 "\n"\
      "add %2,%0\n"\
      "psadbw " #out0 ", " #in0 "\n"\
      "psadbw " #out1 ", " #in1 "\n"\
      "paddw " #in1 ", " #in0 "\n"\
      "paddw " #in0 ", %%mm6\n"

  asm volatile (
      "movl %3,%%ecx\n"
      "pxor %%mm6,%%mm6\n"
      "pxor %%mm7,%%mm7\n"
      "movq (%0),%%mm0\n"
      "movq 8(%0),%%mm1\n"
      "add %2,%0\n"
      "jmp 2f\n"
      "1:\n"

      SUM(%%mm4, %%mm5, %%mm0, %%mm1)
      "2:\n"
      SUM(%%mm0, %%mm1, %%mm4, %%mm5)

      "subl $2, %%ecx\n"
      "jnz 1b\n"

      "movd %%mm6,%1\n"
      : "+r" (pix), "=r"(tmp)
      : "r" ((x86_reg)line_size) , "m" (h)
      : "%ecx");
    return tmp;
}
#undef SUM

static int vsad16_mmx(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {
    int tmp;

    assert( (((int)pix1) & 7) == 0);
    assert( (((int)pix2) & 7) == 0);
    assert((line_size &7) ==0);

#define SUM(in0, in1, out0, out1) \
      "movq (%0),%%mm2\n"\
      "movq (%1)," #out0 "\n"\
      "movq 8(%0),%%mm3\n"\
      "movq 8(%1)," #out1 "\n"\
      "add %3,%0\n"\
      "add %3,%1\n"\
      "psubb " #out0 ", %%mm2\n"\
      "psubb " #out1 ", %%mm3\n"\
      "pxor %%mm7, %%mm2\n"\
      "pxor %%mm7, %%mm3\n"\
      "movq %%mm2, " #out0 "\n"\
      "movq %%mm3, " #out1 "\n"\
      "psubusb " #in0 ", %%mm2\n"\
      "psubusb " #in1 ", %%mm3\n"\
      "psubusb " #out0 ", " #in0 "\n"\
      "psubusb " #out1 ", " #in1 "\n"\
      "por %%mm2, " #in0 "\n"\
      "por %%mm3, " #in1 "\n"\
      "movq " #in0 ", %%mm2\n"\
      "movq " #in1 ", %%mm3\n"\
      "punpcklbw %%mm7, " #in0 "\n"\
      "punpcklbw %%mm7, " #in1 "\n"\
      "punpckhbw %%mm7, %%mm2\n"\
      "punpckhbw %%mm7, %%mm3\n"\
      "paddw " #in1 ", " #in0 "\n"\
      "paddw %%mm3, %%mm2\n"\
      "paddw %%mm2, " #in0 "\n"\
      "paddw " #in0 ", %%mm6\n"


  asm volatile (
      "movl %4,%%ecx\n"
      "pxor %%mm6,%%mm6\n"
      "pcmpeqw %%mm7,%%mm7\n"
      "psllw $15, %%mm7\n"
      "packsswb %%mm7, %%mm7\n"
      "movq (%0),%%mm0\n"
      "movq (%1),%%mm2\n"
      "movq 8(%0),%%mm1\n"
      "movq 8(%1),%%mm3\n"
      "add %3,%0\n"
      "add %3,%1\n"
      "psubb %%mm2, %%mm0\n"
      "psubb %%mm3, %%mm1\n"
      "pxor %%mm7, %%mm0\n"
      "pxor %%mm7, %%mm1\n"
      "jmp 2f\n"
      "1:\n"

      SUM(%%mm4, %%mm5, %%mm0, %%mm1)
      "2:\n"
      SUM(%%mm0, %%mm1, %%mm4, %%mm5)

      "subl $2, %%ecx\n"
      "jnz 1b\n"

      "movq %%mm6,%%mm0\n"
      "psrlq $32, %%mm6\n"
      "paddw %%mm6,%%mm0\n"
      "movq %%mm0,%%mm6\n"
      "psrlq $16, %%mm0\n"
      "paddw %%mm6,%%mm0\n"
      "movd %%mm0,%2\n"
      : "+r" (pix1), "+r" (pix2), "=r"(tmp)
      : "r" ((x86_reg)line_size) , "m" (h)
      : "%ecx");
    return tmp & 0x7FFF;
}
#undef SUM
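
/* Editorial note (not part of the original file): vsad16 sums the absolute
 * vertical differences of the residual pix1-pix2, i.e. it compares the
 * row-to-row change of the prediction error. Plain-C sketch (hypothetical
 * name):
 *
 *     static int vsad16_c_sketch(uint8_t *s1, uint8_t *s2, int stride, int h)
 *     {
 *         int x, y, sum = 0;
 *         for (y = 1; y < h; y++) {
 *             for (x = 0; x < 16; x++)
 *                 sum += FFABS((s1[x] - s2[x]) -
 *                              (s1[x + stride] - s2[x + stride]));
 *             s1 += stride;
 *             s2 += stride;
 *         }
 *         return sum;
 *     }
 */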

static int vsad16_mmx2(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {
    int tmp;

    assert( (((int)pix1) & 7) == 0);
    assert( (((int)pix2) & 7) == 0);
    assert((line_size &7) ==0);

#define SUM(in0, in1, out0, out1) \
      "movq (%0)," #out0 "\n"\
      "movq (%1),%%mm2\n"\
      "movq 8(%0)," #out1 "\n"\
      "movq 8(%1),%%mm3\n"\
      "add %3,%0\n"\
      "add %3,%1\n"\
      "psubb %%mm2, " #out0 "\n"\
      "psubb %%mm3, " #out1 "\n"\
      "pxor %%mm7, " #out0 "\n"\
      "pxor %%mm7, " #out1 "\n"\
      "psadbw " #out0 ", " #in0 "\n"\
      "psadbw " #out1 ", " #in1 "\n"\
      "paddw " #in1 ", " #in0 "\n"\
      "paddw " #in0 ", %%mm6\n"

  asm volatile (
      "movl %4,%%ecx\n"
      "pxor %%mm6,%%mm6\n"
      "pcmpeqw %%mm7,%%mm7\n"
      "psllw $15, %%mm7\n"
      "packsswb %%mm7, %%mm7\n"
      "movq (%0),%%mm0\n"
      "movq (%1),%%mm2\n"
      "movq 8(%0),%%mm1\n"
      "movq 8(%1),%%mm3\n"
      "add %3,%0\n"
      "add %3,%1\n"
      "psubb %%mm2, %%mm0\n"
      "psubb %%mm3, %%mm1\n"
      "pxor %%mm7, %%mm0\n"
      "pxor %%mm7, %%mm1\n"
      "jmp 2f\n"
      "1:\n"

      SUM(%%mm4, %%mm5, %%mm0, %%mm1)
      "2:\n"
      SUM(%%mm0, %%mm1, %%mm4, %%mm5)

      "subl $2, %%ecx\n"
      "jnz 1b\n"

      "movd %%mm6,%2\n"
      : "+r" (pix1), "+r" (pix2), "=r"(tmp)
      : "r" ((x86_reg)line_size) , "m" (h)
      : "%ecx");
    return tmp;
}
#undef SUM

static void diff_bytes_mmx(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
    x86_reg i=0;
    asm volatile(
        "1:                             \n\t"
        "movq  (%2, %0), %%mm0          \n\t"
        "movq  (%1, %0), %%mm1          \n\t"
        "psubb %%mm0, %%mm1             \n\t"
        "movq %%mm1, (%3, %0)           \n\t"
        "movq 8(%2, %0), %%mm0          \n\t"
        "movq 8(%1, %0), %%mm1          \n\t"
        "psubb %%mm0, %%mm1             \n\t"
        "movq %%mm1, 8(%3, %0)          \n\t"
        "add $16, %0                    \n\t"
        "cmp %4, %0                     \n\t"
        " jb 1b                         \n\t"
        : "+r" (i)
        : "r"(src1), "r"(src2), "r"(dst), "r"((x86_reg)w-15)
    );
    for(; i<w; i++)
        dst[i+0] = src1[i+0]-src2[i+0];
}

static void sub_hfyu_median_prediction_mmx2(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w, int *left, int *left_top){
    x86_reg i=0;
    uint8_t l, lt;

    asm volatile(
        "1:                             \n\t"
        "movq  -1(%1, %0), %%mm0        \n\t" // LT
        "movq  (%1, %0), %%mm1          \n\t" // T
        "movq  -1(%2, %0), %%mm2        \n\t" // L
        "movq  (%2, %0), %%mm3          \n\t" // X
        "movq %%mm2, %%mm4              \n\t" // L
        "psubb %%mm0, %%mm2             \n\t"
        "paddb %%mm1, %%mm2             \n\t" // L + T - LT
        "movq %%mm4, %%mm5              \n\t" // L
        "pmaxub %%mm1, %%mm4            \n\t" // max(T, L)
        "pminub %%mm5, %%mm1            \n\t" // min(T, L)
        "pminub %%mm2, %%mm4            \n\t"
        "pmaxub %%mm1, %%mm4            \n\t"
        "psubb %%mm4, %%mm3             \n\t" // dst - pred
        "movq %%mm3, (%3, %0)           \n\t"
        "add $8, %0                     \n\t"
        "cmp %4, %0                     \n\t"
        " jb 1b                         \n\t"
        : "+r" (i)
        : "r"(src1), "r"(src2), "r"(dst), "r"((x86_reg)w)
    );

    l= *left;
    lt= *left_top;

    dst[0]= src2[0] - mid_pred(l, src1[0], (l + src1[0] - lt)&0xFF);

    *left_top= src1[w-1];
    *left    = src2[w-1];
}
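
/* Editorial note (not part of the original file): the median predictor used
 * above is the standard HuffYUV/LOCO-I one: predict each byte from its left
 * (L), top (T) and top-left (LT) neighbours as the median of L, T and L+T-LT,
 * and store the difference. Plain-C sketch for one element (hypothetical
 * name):
 *
 *     static uint8_t median_residual_sketch(uint8_t x, uint8_t l, uint8_t t,
 *                                           uint8_t lt)
 *     {
 *         return x - mid_pred(l, t, (uint8_t)(l + t - lt));
 *     }
 */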

#define DIFF_PIXELS_1(m,a,t,p1,p2)\
    "mov"#m" "#p1", "#a"              \n\t"\
    "mov"#m" "#p2", "#t"              \n\t"\
    "punpcklbw "#a", "#t"             \n\t"\
    "punpcklbw "#a", "#a"             \n\t"\
    "psubw     "#t", "#a"             \n\t"\

#define DIFF_PIXELS_8(m0,m1,mm,p1,p2,stride,temp) {\
    uint8_t *p1b=p1, *p2b=p2;\
    asm volatile(\
        DIFF_PIXELS_1(m0, mm##0, mm##7, (%1), (%2))\
        DIFF_PIXELS_1(m0, mm##1, mm##7, (%1,%3), (%2,%3))\
        DIFF_PIXELS_1(m0, mm##2, mm##7, (%1,%3,2), (%2,%3,2))\
        "add %4, %1                   \n\t"\
        "add %4, %2                   \n\t"\
        DIFF_PIXELS_1(m0, mm##3, mm##7, (%1), (%2))\
        DIFF_PIXELS_1(m0, mm##4, mm##7, (%1,%3), (%2,%3))\
        DIFF_PIXELS_1(m0, mm##5, mm##7, (%1,%3,2), (%2,%3,2))\
        DIFF_PIXELS_1(m0, mm##6, mm##7, (%1,%4), (%2,%4))\
        "mov"#m1" "#mm"0, %0          \n\t"\
        DIFF_PIXELS_1(m0, mm##7, mm##0, (%1,%3,4), (%2,%3,4))\
        "mov"#m1" %0, "#mm"0          \n\t"\
        : "+m"(temp), "+r"(p1b), "+r"(p2b)\
        : "r"((x86_reg)stride), "r"((x86_reg)stride*3)\
    );\
}
    //the "+m"(temp) is needed as gcc 2.95 sometimes fails to compile "=m"(temp)

#define DIFF_PIXELS_4x8(p1,p2,stride,temp) DIFF_PIXELS_8(d, q,   %%mm,  p1, p2, stride, temp)
#define DIFF_PIXELS_8x8(p1,p2,stride,temp) DIFF_PIXELS_8(q, dqa, %%xmm, p1, p2, stride, temp)

#define LBUTTERFLY2(a1,b1,a2,b2)\
    "paddw " #b1 ", " #a1 "           \n\t"\
    "paddw " #b2 ", " #a2 "           \n\t"\
    "paddw " #b1 ", " #b1 "           \n\t"\
    "paddw " #b2 ", " #b2 "           \n\t"\
    "psubw " #a1 ", " #b1 "           \n\t"\
    "psubw " #a2 ", " #b2 "           \n\t"

#define HADAMARD8(m0, m1, m2, m3, m4, m5, m6, m7)\
        LBUTTERFLY2(m0, m1, m2, m3)\
        LBUTTERFLY2(m4, m5, m6, m7)\
        LBUTTERFLY2(m0, m2, m1, m3)\
        LBUTTERFLY2(m4, m6, m5, m7)\
        LBUTTERFLY2(m0, m4, m1, m5)\
        LBUTTERFLY2(m2, m6, m3, m7)\

#define HADAMARD48 HADAMARD8(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm6, %%mm7)
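
/* Editorial note (not part of the original file): LBUTTERFLY2 performs two
 * in-place butterflies on word vectors, (a, b) -> (a + b, b - a); chaining
 * three rounds of such butterflies, as HADAMARD8 does, yields an
 * (unnormalized) 8-point Walsh-Hadamard transform. Scalar sketch of one
 * butterfly, matching the paddw/paddw/psubw sequence above:
 *
 *     a1 += b1;    // a1 = a + b
 *     b1 += b1;    // b1 = 2*b
 *     b1 -= a1;    // b1 = 2*b - (a + b) = b - a
 */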

#define MMABS_MMX(a,z)\
    "pxor " #z ", " #z "              \n\t"\
    "pcmpgtw " #a ", " #z "           \n\t"\
    "pxor " #z ", " #a "              \n\t"\
    "psubw " #z ", " #a "             \n\t"

#define MMABS_MMX2(a,z)\
    "pxor " #z ", " #z "              \n\t"\
    "psubw " #a ", " #z "             \n\t"\
    "pmaxsw " #z ", " #a "            \n\t"

#define MMABS_SSSE3(a,z)\
    "pabsw " #a ", " #a "             \n\t"

#define MMABS_SUM(a,z, sum)\
    MMABS(a,z)\
    "paddusw " #a ", " #sum "         \n\t"

#define MMABS_SUM_8x8_NOSPILL\
    MMABS(%%xmm0, %%xmm8)\
    MMABS(%%xmm1, %%xmm9)\
    MMABS_SUM(%%xmm2, %%xmm8, %%xmm0)\
    MMABS_SUM(%%xmm3, %%xmm9, %%xmm1)\
    MMABS_SUM(%%xmm4, %%xmm8, %%xmm0)\
    MMABS_SUM(%%xmm5, %%xmm9, %%xmm1)\
    MMABS_SUM(%%xmm6, %%xmm8, %%xmm0)\
    MMABS_SUM(%%xmm7, %%xmm9, %%xmm1)\
    "paddusw %%xmm1, %%xmm0           \n\t"

#ifdef ARCH_X86_64
#define MMABS_SUM_8x8_SSE2 MMABS_SUM_8x8_NOSPILL
#else
#define MMABS_SUM_8x8_SSE2\
    "movdqa %%xmm7, (%1)              \n\t"\
    MMABS(%%xmm0, %%xmm7)\
    MMABS(%%xmm1, %%xmm7)\
    MMABS_SUM(%%xmm2, %%xmm7, %%xmm0)\
    MMABS_SUM(%%xmm3, %%xmm7, %%xmm1)\
    MMABS_SUM(%%xmm4, %%xmm7, %%xmm0)\
    MMABS_SUM(%%xmm5, %%xmm7, %%xmm1)\
    MMABS_SUM(%%xmm6, %%xmm7, %%xmm0)\
    "movdqa (%1), %%xmm2              \n\t"\
    MMABS_SUM(%%xmm2, %%xmm7, %%xmm1)\
    "paddusw %%xmm1, %%xmm0           \n\t"
#endif
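
/* Editorial note (not part of the original file): MMABS_MMX implements abs()
 * on packed words with the classic sign-mask trick: build a mask
 * z = (a < 0) ? 0xFFFF : 0 with pcmpgtw, then a = (a ^ z) - z, which negates
 * exactly the negative lanes. MMABS_MMX2 instead computes max(a, -a) with
 * pmaxsw, and SSSE3 has a dedicated pabsw instruction.
 */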

/* FIXME: HSUM_* saturates at 64k, while an 8x8 hadamard or dct block can get up to
 * about 100k on extreme inputs. But that's very unlikely to occur in natural video,
 * and it's even more unlikely to not have any alternative mvs/modes with lower cost. */
#define HSUM_MMX(a, t, dst)\
    "movq "#a", "#t"                  \n\t"\
    "psrlq $32, "#a"                  \n\t"\
    "paddusw "#t", "#a"               \n\t"\
    "movq "#a", "#t"                  \n\t"\
    "psrlq $16, "#a"                  \n\t"\
    "paddusw "#t", "#a"               \n\t"\
    "movd "#a", "#dst"                \n\t"\

#define HSUM_MMX2(a, t, dst)\
    "pshufw $0x0E, "#a", "#t"         \n\t"\
    "paddusw "#t", "#a"               \n\t"\
    "pshufw $0x01, "#a", "#t"         \n\t"\
    "paddusw "#t", "#a"               \n\t"\
    "movd "#a", "#dst"                \n\t"\

#define HSUM_SSE2(a, t, dst)\
    "movhlps "#a", "#t"               \n\t"\
    "paddusw "#t", "#a"               \n\t"\
    "pshuflw $0x0E, "#a", "#t"        \n\t"\
    "paddusw "#t", "#a"               \n\t"\
    "pshuflw $0x01, "#a", "#t"        \n\t"\
    "paddusw "#t", "#a"               \n\t"\
    "movd "#a", "#dst"                \n\t"\

#define HADAMARD8_DIFF_MMX(cpu) \
static int hadamard8_diff_##cpu(void *s, uint8_t *src1, uint8_t *src2, int stride, int h){\
    DECLARE_ALIGNED_8(uint64_t, temp[13]);\
    int sum;\
\
    assert(h==8);\
\
    DIFF_PIXELS_4x8(src1, src2, stride, temp[0]);\
\
    asm volatile(\
        HADAMARD48\
\
        "movq %%mm7, 96(%1)             \n\t"\
\
        TRANSPOSE4(%%mm0, %%mm1, %%mm2, %%mm3, %%mm7)\
        STORE4(8,  0(%1), %%mm0, %%mm3, %%mm7, %%mm2)\
\
        "movq 96(%1), %%mm7             \n\t"\
        TRANSPOSE4(%%mm4, %%mm5, %%mm6, %%mm7, %%mm0)\
        STORE4(8, 64(%1), %%mm4, %%mm7, %%mm0, %%mm6)\
\
        : "=r" (sum)\
        : "r"(temp)\
    );\
\
    DIFF_PIXELS_4x8(src1+4, src2+4, stride, temp[4]);\
\
    asm volatile(\
        HADAMARD48\
\
        "movq %%mm7, 96(%1)             \n\t"\
\
        TRANSPOSE4(%%mm0, %%mm1, %%mm2, %%mm3, %%mm7)\
        STORE4(8, 32(%1), %%mm0, %%mm3, %%mm7, %%mm2)\
\
        "movq 96(%1), %%mm7             \n\t"\
        TRANSPOSE4(%%mm4, %%mm5, %%mm6, %%mm7, %%mm0)\
        "movq %%mm7, %%mm5              \n\t"/*FIXME remove*/\
        "movq %%mm6, %%mm7              \n\t"\
        "movq %%mm0, %%mm6              \n\t"\
\
        LOAD4(8, 64(%1), %%mm0, %%mm1, %%mm2, %%mm3)\
\
        HADAMARD48\
        "movq %%mm7, 64(%1)             \n\t"\
        MMABS(%%mm0, %%mm7)\
        MMABS(%%mm1, %%mm7)\
        MMABS_SUM(%%mm2, %%mm7, %%mm0)\
        MMABS_SUM(%%mm3, %%mm7, %%mm1)\
        MMABS_SUM(%%mm4, %%mm7, %%mm0)\
        MMABS_SUM(%%mm5, %%mm7, %%mm1)\
        MMABS_SUM(%%mm6, %%mm7, %%mm0)\
        "movq 64(%1), %%mm2             \n\t"\
        MMABS_SUM(%%mm2, %%mm7, %%mm1)\
        "paddusw %%mm1, %%mm0           \n\t"\
        "movq %%mm0, 64(%1)             \n\t"\
\
        LOAD4(8,  0(%1), %%mm0, %%mm1, %%mm2, %%mm3)\
        LOAD4(8, 32(%1), %%mm4, %%mm5, %%mm6, %%mm7)\
\
        HADAMARD48\
        "movq %%mm7, (%1)               \n\t"\
        MMABS(%%mm0, %%mm7)\
        MMABS(%%mm1, %%mm7)\
        MMABS_SUM(%%mm2, %%mm7, %%mm0)\
        MMABS_SUM(%%mm3, %%mm7, %%mm1)\
        MMABS_SUM(%%mm4, %%mm7, %%mm0)\
        MMABS_SUM(%%mm5, %%mm7, %%mm1)\
        MMABS_SUM(%%mm6, %%mm7, %%mm0)\
        "movq (%1), %%mm2               \n\t"\
        MMABS_SUM(%%mm2, %%mm7, %%mm1)\
        "paddusw 64(%1), %%mm0          \n\t"\
        "paddusw %%mm1, %%mm0           \n\t"\
\
        HSUM(%%mm0, %%mm1, %0)\
\
        : "=r" (sum)\
        : "r"(temp)\
    );\
    return sum&0xFFFF;\
}\
WRAPPER8_16_SQ(hadamard8_diff_##cpu, hadamard8_diff16_##cpu)

#define HADAMARD8_DIFF_SSE2(cpu) \
static int hadamard8_diff_##cpu(void *s, uint8_t *src1, uint8_t *src2, int stride, int h){\
    DECLARE_ALIGNED_16(uint64_t, temp[4]);\
    int sum;\
\
    assert(h==8);\
\
    DIFF_PIXELS_8x8(src1, src2, stride, temp[0]);\
\
    asm volatile(\
        HADAMARD8(%%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm6, %%xmm7)\
        TRANSPOSE8(%%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm6, %%xmm7, (%1))\
        HADAMARD8(%%xmm0, %%xmm5, %%xmm7, %%xmm3, %%xmm6, %%xmm4, %%xmm2, %%xmm1)\
        MMABS_SUM_8x8\
        HSUM_SSE2(%%xmm0, %%xmm1, %0)\
        : "=r" (sum)\
        : "r"(temp)\
    );\
    return sum&0xFFFF;\
}\
WRAPPER8_16_SQ(hadamard8_diff_##cpu, hadamard8_diff16_##cpu)

#define MMABS(a,z)         MMABS_MMX(a,z)
#define HSUM(a,t,dst)      HSUM_MMX(a,t,dst)
HADAMARD8_DIFF_MMX(mmx)
#undef MMABS
#undef HSUM

#define MMABS(a,z)         MMABS_MMX2(a,z)
#define MMABS_SUM_8x8      MMABS_SUM_8x8_SSE2
#define HSUM(a,t,dst)      HSUM_MMX2(a,t,dst)
HADAMARD8_DIFF_MMX(mmx2)
HADAMARD8_DIFF_SSE2(sse2)
#undef MMABS
#undef MMABS_SUM_8x8
#undef HSUM

#ifdef HAVE_SSSE3
#define MMABS(a,z)         MMABS_SSSE3(a,z)
#define MMABS_SUM_8x8      MMABS_SUM_8x8_NOSPILL
HADAMARD8_DIFF_SSE2(ssse3)
#undef MMABS
#undef MMABS_SUM_8x8
#endif

#define DCT_SAD4(m,mm,o)\
    "mov"#m" "#o"+ 0(%1), "#mm"2      \n\t"\
    "mov"#m" "#o"+16(%1), "#mm"3      \n\t"\
    "mov"#m" "#o"+32(%1), "#mm"4      \n\t"\
    "mov"#m" "#o"+48(%1), "#mm"5      \n\t"\
    MMABS_SUM(mm##2, mm##6, mm##0)\
    MMABS_SUM(mm##3, mm##7, mm##1)\
    MMABS_SUM(mm##4, mm##6, mm##0)\
    MMABS_SUM(mm##5, mm##7, mm##1)\

#define DCT_SAD_MMX\
    "pxor %%mm0, %%mm0                \n\t"\
    "pxor %%mm1, %%mm1                \n\t"\
    DCT_SAD4(q, %%mm, 0)\
    DCT_SAD4(q, %%mm, 8)\
    DCT_SAD4(q, %%mm, 64)\
    DCT_SAD4(q, %%mm, 72)\
    "paddusw %%mm1, %%mm0             \n\t"\
    HSUM(%%mm0, %%mm1, %0)

#define DCT_SAD_SSE2\
    "pxor %%xmm0, %%xmm0              \n\t"\
    "pxor %%xmm1, %%xmm1              \n\t"\
    DCT_SAD4(dqa, %%xmm, 0)\
    DCT_SAD4(dqa, %%xmm, 64)\
    "paddusw %%xmm1, %%xmm0           \n\t"\
    HSUM(%%xmm0, %%xmm1, %0)

#define DCT_SAD_FUNC(cpu) \
static int sum_abs_dctelem_##cpu(DCTELEM *block){\
    int sum;\
    asm volatile(\
        DCT_SAD\
        :"=r"(sum)\
        :"r"(block)\
    );\
    return sum&0xFFFF;\
}

#define DCT_SAD       DCT_SAD_MMX
#define HSUM(a,t,dst) HSUM_MMX(a,t,dst)
#define MMABS(a,z)    MMABS_MMX(a,z)
DCT_SAD_FUNC(mmx)
#undef MMABS
#undef HSUM

#define HSUM(a,t,dst) HSUM_MMX2(a,t,dst)
#define MMABS(a,z)    MMABS_MMX2(a,z)
DCT_SAD_FUNC(mmx2)
#undef HSUM
#undef DCT_SAD

#define DCT_SAD       DCT_SAD_SSE2
#define HSUM(a,t,dst) HSUM_SSE2(a,t,dst)
DCT_SAD_FUNC(sse2)
#undef MMABS

#ifdef HAVE_SSSE3
#define MMABS(a,z)    MMABS_SSSE3(a,z)
DCT_SAD_FUNC(ssse3)
#undef MMABS
#endif
#undef HSUM
#undef DCT_SAD

static int ssd_int8_vs_int16_mmx(const int8_t *pix1, const int16_t *pix2, int size){
    int sum;
    x86_reg i=size;
    asm volatile(
        "pxor %%mm4, %%mm4 \n"
        "1: \n"
        "sub $8, %0 \n"
        "movq (%2,%0), %%mm2 \n"
        "movq (%3,%0,2), %%mm0 \n"
        "movq 8(%3,%0,2), %%mm1 \n"
        "punpckhbw %%mm2, %%mm3 \n"
        "punpcklbw %%mm2, %%mm2 \n"
        "psraw $8, %%mm3 \n"
        "psraw $8, %%mm2 \n"
        "psubw %%mm3, %%mm1 \n"
        "psubw %%mm2, %%mm0 \n"
        "pmaddwd %%mm1, %%mm1 \n"
        "pmaddwd %%mm0, %%mm0 \n"
        "paddd %%mm1, %%mm4 \n"
        "paddd %%mm0, %%mm4 \n"
        "jg 1b \n"
        "movq %%mm4, %%mm3 \n"
        "psrlq $32, %%mm3 \n"
        "paddd %%mm3, %%mm4 \n"
        "movd %%mm4, %1 \n"
        :"+r"(i), "=r"(sum)
        :"r"(pix1), "r"(pix2)
    );
    return sum;
}
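
/* Editorial note (not part of the original file): ssd_int8_vs_int16 returns
 * the sum of squared differences between a signed 8-bit and a signed 16-bit
 * array of the same length. Plain-C sketch (hypothetical name):
 *
 *     static int ssd_int8_vs_int16_c_sketch(const int8_t *pix1,
 *                                           const int16_t *pix2, int size)
 *     {
 *         int i, d, sum = 0;
 *         for (i = 0; i < size; i++) {
 *             d = pix1[i] - pix2[i];
 *             sum += d * d;
 *         }
 *         return sum;
 *     }
 */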

#define PHADDD(a, t)\
    "movq "#a", "#t"                  \n\t"\
    "psrlq $32, "#a"                  \n\t"\
    "paddd "#t", "#a"                 \n\t"
/*
   pmulhw: dst[0-15]=(src[0-15]*dst[0-15])[16-31]
   pmulhrw: dst[0-15]=(src[0-15]*dst[0-15] + 0x8000)[16-31]
   pmulhrsw: dst[0-15]=(src[0-15]*dst[0-15] + 0x4000)[15-30]
 */
#define PMULHRW(x, y, s, o)\
    "pmulhw " #s ", "#x "            \n\t"\
    "pmulhw " #s ", "#y "            \n\t"\
    "paddw " #o ", "#x "             \n\t"\
    "paddw " #o ", "#y "             \n\t"\
    "psraw $1, "#x "                 \n\t"\
    "psraw $1, "#y "                 \n\t"
#define DEF(x) x ## _mmx
#define SET_RND MOVQ_WONE
#define SCALE_OFFSET 1

#include "dsputil_mmx_qns.h"

#undef DEF
#undef SET_RND
#undef SCALE_OFFSET
#undef PMULHRW

#define DEF(x) x ## _3dnow
#define SET_RND(x)
#define SCALE_OFFSET 0
#define PMULHRW(x, y, s, o)\
    "pmulhrw " #s ", "#x "           \n\t"\
    "pmulhrw " #s ", "#y "           \n\t"

#include "dsputil_mmx_qns.h"

#undef DEF
#undef SET_RND
#undef SCALE_OFFSET
#undef PMULHRW

#ifdef HAVE_SSSE3
#undef PHADDD
#define DEF(x) x ## _ssse3
#define SET_RND(x)
#define SCALE_OFFSET -1
#define PHADDD(a, t)\
    "pshufw $0x0E, "#a", "#t"         \n\t"\
    "paddd "#t", "#a"                 \n\t" /* faster than phaddd on core2 */
#define PMULHRW(x, y, s, o)\
    "pmulhrsw " #s ", "#x "          \n\t"\
    "pmulhrsw " #s ", "#y "          \n\t"

#include "dsputil_mmx_qns.h"

#undef DEF
#undef SET_RND
#undef SCALE_OFFSET
#undef PMULHRW
#undef PHADDD
#endif //HAVE_SSSE3


/* FLAC specific */
void ff_flac_compute_autocorr_sse2(const int32_t *data, int len, int lag,
                                   double *autoc);


void dsputilenc_init_mmx(DSPContext* c, AVCodecContext *avctx)
{
    if (mm_flags & MM_MMX) {
        const int dct_algo = avctx->dct_algo;
        if(dct_algo==FF_DCT_AUTO || dct_algo==FF_DCT_MMX){
            if(mm_flags & MM_SSE2){
                c->fdct = ff_fdct_sse2;
            }else if(mm_flags & MM_MMXEXT){
                c->fdct = ff_fdct_mmx2;
            }else{
                c->fdct = ff_fdct_mmx;
            }
        }

        c->get_pixels = get_pixels_mmx;
        c->diff_pixels = diff_pixels_mmx;
        c->pix_sum = pix_sum16_mmx;

        c->diff_bytes= diff_bytes_mmx;
        c->sum_abs_dctelem= sum_abs_dctelem_mmx;

        c->hadamard8_diff[0]= hadamard8_diff16_mmx;
        c->hadamard8_diff[1]= hadamard8_diff_mmx;

        c->pix_norm1 = pix_norm1_mmx;
        c->sse[0] = (mm_flags & MM_SSE2) ? sse16_sse2 : sse16_mmx;
        c->sse[1] = sse8_mmx;
        c->vsad[4]= vsad_intra16_mmx;

        c->nsse[0] = nsse16_mmx;
        c->nsse[1] = nsse8_mmx;
        if(!(avctx->flags & CODEC_FLAG_BITEXACT)){
            c->vsad[0] = vsad16_mmx;
        }

        if(!(avctx->flags & CODEC_FLAG_BITEXACT)){
            c->try_8x8basis= try_8x8basis_mmx;
        }
        c->add_8x8basis= add_8x8basis_mmx;

        c->ssd_int8_vs_int16 = ssd_int8_vs_int16_mmx;


        if (mm_flags & MM_MMXEXT) {
            c->sum_abs_dctelem= sum_abs_dctelem_mmx2;
            c->hadamard8_diff[0]= hadamard8_diff16_mmx2;
            c->hadamard8_diff[1]= hadamard8_diff_mmx2;
            c->vsad[4]= vsad_intra16_mmx2;

            if(!(avctx->flags & CODEC_FLAG_BITEXACT)){
                c->vsad[0] = vsad16_mmx2;
            }

            c->sub_hfyu_median_prediction= sub_hfyu_median_prediction_mmx2;
        }

        if(mm_flags & MM_SSE2){
            c->get_pixels = get_pixels_sse2;
            c->sum_abs_dctelem= sum_abs_dctelem_sse2;
            c->hadamard8_diff[0]= hadamard8_diff16_sse2;
            c->hadamard8_diff[1]= hadamard8_diff_sse2;
            if (ENABLE_FLAC_ENCODER)
                c->flac_compute_autocorr = ff_flac_compute_autocorr_sse2;
        }

#ifdef HAVE_SSSE3
        if(mm_flags & MM_SSSE3){
            if(!(avctx->flags & CODEC_FLAG_BITEXACT)){
                c->try_8x8basis= try_8x8basis_ssse3;
            }
            c->add_8x8basis= add_8x8basis_ssse3;
            c->sum_abs_dctelem= sum_abs_dctelem_ssse3;
            c->hadamard8_diff[0]= hadamard8_diff16_ssse3;
            c->hadamard8_diff[1]= hadamard8_diff_ssse3;
        }
#endif

        if(mm_flags & MM_3DNOW){
            if(!(avctx->flags & CODEC_FLAG_BITEXACT)){
                c->try_8x8basis= try_8x8basis_3dnow;
            }
            c->add_8x8basis= add_8x8basis_3dnow;
        }
    }

    dsputil_init_pix_mmx(c, avctx);
}