ffmpeg / libavcodec / x86 / dsputilenc_mmx.c @ f49747e9
/*
 * MMX optimized DSP utils
 * Copyright (c) 2000, 2001 Fabrice Bellard
 * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 *
 * MMX optimization by Nick Kurshev <nickols_k@mail.ru>
 */

#include "libavutil/x86_cpu.h"
#include "libavcodec/dsputil.h"
#include "libavcodec/mpegvideo.h"
#include "libavcodec/mathops.h"
#include "dsputil_mmx.h"


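/* Read an 8x8 block of 8-bit pixels and store it as 64 16-bit DCTELEMs,
 * two rows per loop iteration. */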
static void get_pixels_mmx(DCTELEM *block, const uint8_t *pixels, int line_size)
{
    __asm__ volatile(
        "mov $-128, %%"REG_a"           \n\t"
        "pxor %%mm7, %%mm7              \n\t"
        ASMALIGN(4)
        "1:                             \n\t"
        "movq (%0), %%mm0               \n\t"
        "movq (%0, %2), %%mm2           \n\t"
        "movq %%mm0, %%mm1              \n\t"
        "movq %%mm2, %%mm3              \n\t"
        "punpcklbw %%mm7, %%mm0         \n\t"
        "punpckhbw %%mm7, %%mm1         \n\t"
        "punpcklbw %%mm7, %%mm2         \n\t"
        "punpckhbw %%mm7, %%mm3         \n\t"
        "movq %%mm0, (%1, %%"REG_a")    \n\t"
        "movq %%mm1, 8(%1, %%"REG_a")   \n\t"
        "movq %%mm2, 16(%1, %%"REG_a")  \n\t"
        "movq %%mm3, 24(%1, %%"REG_a")  \n\t"
        "add %3, %0                     \n\t"
        "add $32, %%"REG_a"             \n\t"
        "js 1b                          \n\t"
        : "+r" (pixels)
        : "r" (block+64), "r" ((x86_reg)line_size), "r" ((x86_reg)line_size*2)
        : "%"REG_a
    );
}

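/* SSE2 version of get_pixels: one full row per aligned 16-byte store,
 * fully unrolled. */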
static void get_pixels_sse2(DCTELEM *block, const uint8_t *pixels, int line_size)
{
    __asm__ volatile(
        "pxor %%xmm7,      %%xmm7         \n\t"
        "movq (%0),        %%xmm0         \n\t"
        "movq (%0, %2),    %%xmm1         \n\t"
        "movq (%0, %2,2),  %%xmm2         \n\t"
        "movq (%0, %3),    %%xmm3         \n\t"
        "lea (%0,%2,4), %0                \n\t"
        "punpcklbw %%xmm7, %%xmm0         \n\t"
        "punpcklbw %%xmm7, %%xmm1         \n\t"
        "punpcklbw %%xmm7, %%xmm2         \n\t"
        "punpcklbw %%xmm7, %%xmm3         \n\t"
        "movdqa %%xmm0,      (%1)         \n\t"
        "movdqa %%xmm1,    16(%1)         \n\t"
        "movdqa %%xmm2,    32(%1)         \n\t"
        "movdqa %%xmm3,    48(%1)         \n\t"
        "movq (%0),        %%xmm0         \n\t"
        "movq (%0, %2),    %%xmm1         \n\t"
        "movq (%0, %2,2),  %%xmm2         \n\t"
        "movq (%0, %3),    %%xmm3         \n\t"
        "punpcklbw %%xmm7, %%xmm0         \n\t"
        "punpcklbw %%xmm7, %%xmm1         \n\t"
        "punpcklbw %%xmm7, %%xmm2         \n\t"
        "punpcklbw %%xmm7, %%xmm3         \n\t"
        "movdqa %%xmm0,    64(%1)         \n\t"
        "movdqa %%xmm1,    80(%1)         \n\t"
        "movdqa %%xmm2,    96(%1)         \n\t"
        "movdqa %%xmm3,   112(%1)         \n\t"
        : "+r" (pixels)
        : "r" (block), "r" ((x86_reg)line_size), "r" ((x86_reg)line_size*3)
    );
}

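/* block[i] = s1[i] - s2[i] for an 8x8 block, widened to 16 bits. */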
static inline void diff_pixels_mmx(DCTELEM *block, const uint8_t *s1, const uint8_t *s2, int stride)
95
{
96
    __asm__ volatile(
97
        "pxor %%mm7, %%mm7              \n\t"
98
        "mov $-128, %%"REG_a"           \n\t"
99
        ASMALIGN(4)
100
        "1:                             \n\t"
101
        "movq (%0), %%mm0               \n\t"
102
        "movq (%1), %%mm2               \n\t"
103
        "movq %%mm0, %%mm1              \n\t"
104
        "movq %%mm2, %%mm3              \n\t"
105
        "punpcklbw %%mm7, %%mm0         \n\t"
106
        "punpckhbw %%mm7, %%mm1         \n\t"
107
        "punpcklbw %%mm7, %%mm2         \n\t"
108
        "punpckhbw %%mm7, %%mm3         \n\t"
109
        "psubw %%mm2, %%mm0             \n\t"
110
        "psubw %%mm3, %%mm1             \n\t"
111
        "movq %%mm0, (%2, %%"REG_a")    \n\t"
112
        "movq %%mm1, 8(%2, %%"REG_a")   \n\t"
113
        "add %3, %0                     \n\t"
114
        "add %3, %1                     \n\t"
115
        "add $16, %%"REG_a"             \n\t"
116
        "jnz 1b                         \n\t"
117
        : "+r" (s1), "+r" (s2)
118
        : "r" (block+64), "r" ((x86_reg)stride)
119
        : "%"REG_a
120
    );
121
}
122

    
123
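/* Sum of all pixels of a 16x16 block. */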
static int pix_sum16_mmx(uint8_t * pix, int line_size){
    const int h=16;
    int sum;
    x86_reg index= -line_size*h;

    __asm__ volatile(
                "pxor %%mm7, %%mm7              \n\t"
                "pxor %%mm6, %%mm6              \n\t"
                "1:                             \n\t"
                "movq (%2, %1), %%mm0           \n\t"
                "movq (%2, %1), %%mm1           \n\t"
                "movq 8(%2, %1), %%mm2          \n\t"
                "movq 8(%2, %1), %%mm3          \n\t"
                "punpcklbw %%mm7, %%mm0         \n\t"
                "punpckhbw %%mm7, %%mm1         \n\t"
                "punpcklbw %%mm7, %%mm2         \n\t"
                "punpckhbw %%mm7, %%mm3         \n\t"
                "paddw %%mm0, %%mm1             \n\t"
                "paddw %%mm2, %%mm3             \n\t"
                "paddw %%mm1, %%mm3             \n\t"
                "paddw %%mm3, %%mm6             \n\t"
                "add %3, %1                     \n\t"
                " js 1b                         \n\t"
                "movq %%mm6, %%mm5              \n\t"
                "psrlq $32, %%mm6               \n\t"
                "paddw %%mm5, %%mm6             \n\t"
                "movq %%mm6, %%mm5              \n\t"
                "psrlq $16, %%mm6               \n\t"
                "paddw %%mm5, %%mm6             \n\t"
                "movd %%mm6, %0                 \n\t"
                "andl $0xFFFF, %0               \n\t"
                : "=&r" (sum), "+r" (index)
                : "r" (pix - index), "r" ((x86_reg)line_size)
        );

        return sum;
}

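/* Sum of squared pixel values of a 16x16 block. */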
static int pix_norm1_mmx(uint8_t *pix, int line_size) {
    int tmp;
  __asm__ volatile (
      "movl $16,%%ecx\n"
      "pxor %%mm0,%%mm0\n"
      "pxor %%mm7,%%mm7\n"
      "1:\n"
      "movq (%0),%%mm2\n"       /* mm2 = pix[0-7] */
      "movq 8(%0),%%mm3\n"      /* mm3 = pix[8-15] */

      "movq %%mm2,%%mm1\n"      /* mm1 = mm2 = pix[0-7] */

      "punpckhbw %%mm0,%%mm1\n" /* mm1 = [pix4-7] */
      "punpcklbw %%mm0,%%mm2\n" /* mm2 = [pix0-3] */

      "movq %%mm3,%%mm4\n"      /* mm4 = mm3 = pix[8-15] */
      "punpckhbw %%mm0,%%mm3\n" /* mm3 = [pix12-15] */
      "punpcklbw %%mm0,%%mm4\n" /* mm4 = [pix8-11] */

      "pmaddwd %%mm1,%%mm1\n"   /* mm1 = (pix0^2+pix1^2,pix2^2+pix3^2) */
      "pmaddwd %%mm2,%%mm2\n"   /* mm2 = (pix4^2+pix5^2,pix6^2+pix7^2) */

      "pmaddwd %%mm3,%%mm3\n"
      "pmaddwd %%mm4,%%mm4\n"

      "paddd %%mm1,%%mm2\n"     /* mm2 = (pix0^2+pix1^2+pix4^2+pix5^2,
                                          pix2^2+pix3^2+pix6^2+pix7^2) */
      "paddd %%mm3,%%mm4\n"
      "paddd %%mm2,%%mm7\n"

      "add %2, %0\n"
      "paddd %%mm4,%%mm7\n"
      "dec %%ecx\n"
      "jnz 1b\n"

      "movq %%mm7,%%mm1\n"
      "psrlq $32, %%mm7\n"      /* shift hi dword to lo */
      "paddd %%mm7,%%mm1\n"
      "movd %%mm1,%1\n"
      : "+r" (pix), "=r"(tmp) : "r" ((x86_reg)line_size) : "%ecx" );
    return tmp;
}

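/* Sum of squared errors between two 8-pixel-wide blocks over h rows,
 * two rows per loop iteration. */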
static int sse8_mmx(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {
    int tmp;
  __asm__ volatile (
      "movl %4,%%ecx\n"
      "shr $1,%%ecx\n"
      "pxor %%mm0,%%mm0\n"      /* mm0 = 0 */
      "pxor %%mm7,%%mm7\n"      /* mm7 holds the sum */
      "1:\n"
      "movq (%0),%%mm1\n"       /* mm1 = pix1[0][0-7] */
      "movq (%1),%%mm2\n"       /* mm2 = pix2[0][0-7] */
      "movq (%0,%3),%%mm3\n"    /* mm3 = pix1[1][0-7] */
      "movq (%1,%3),%%mm4\n"    /* mm4 = pix2[1][0-7] */

      /* todo: mm1-mm2, mm3-mm4 */
      /* algo: subtract mm1 from mm2 with saturation and vice versa */
      /*       OR the results to get absolute difference */
      "movq %%mm1,%%mm5\n"
      "movq %%mm3,%%mm6\n"
      "psubusb %%mm2,%%mm1\n"
      "psubusb %%mm4,%%mm3\n"
      "psubusb %%mm5,%%mm2\n"
      "psubusb %%mm6,%%mm4\n"

      "por %%mm1,%%mm2\n"
      "por %%mm3,%%mm4\n"

      /* now convert to 16-bit vectors so we can square them */
      "movq %%mm2,%%mm1\n"
      "movq %%mm4,%%mm3\n"

      "punpckhbw %%mm0,%%mm2\n"
      "punpckhbw %%mm0,%%mm4\n"
      "punpcklbw %%mm0,%%mm1\n" /* mm1 now spread over (mm1,mm2) */
      "punpcklbw %%mm0,%%mm3\n" /* mm4 now spread over (mm3,mm4) */

      "pmaddwd %%mm2,%%mm2\n"
      "pmaddwd %%mm4,%%mm4\n"
      "pmaddwd %%mm1,%%mm1\n"
      "pmaddwd %%mm3,%%mm3\n"

      "lea (%0,%3,2), %0\n"     /* pix1 += 2*line_size */
      "lea (%1,%3,2), %1\n"     /* pix2 += 2*line_size */

      "paddd %%mm2,%%mm1\n"
      "paddd %%mm4,%%mm3\n"
      "paddd %%mm1,%%mm7\n"
      "paddd %%mm3,%%mm7\n"

      "decl %%ecx\n"
      "jnz 1b\n"

      "movq %%mm7,%%mm1\n"
      "psrlq $32, %%mm7\n"      /* shift hi dword to lo */
      "paddd %%mm7,%%mm1\n"
      "movd %%mm1,%2\n"
      : "+r" (pix1), "+r" (pix2), "=r"(tmp)
      : "r" ((x86_reg)line_size) , "m" (h)
      : "%ecx");
    return tmp;
}

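/* Same as sse8_mmx() but for 16-pixel-wide blocks, one row per iteration. */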
static int sse16_mmx(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {
    int tmp;
  __asm__ volatile (
      "movl %4,%%ecx\n"
      "pxor %%mm0,%%mm0\n"      /* mm0 = 0 */
      "pxor %%mm7,%%mm7\n"      /* mm7 holds the sum */
      "1:\n"
      "movq (%0),%%mm1\n"       /* mm1 = pix1[0-7] */
      "movq (%1),%%mm2\n"       /* mm2 = pix2[0-7] */
      "movq 8(%0),%%mm3\n"      /* mm3 = pix1[8-15] */
      "movq 8(%1),%%mm4\n"      /* mm4 = pix2[8-15] */

      /* todo: mm1-mm2, mm3-mm4 */
      /* algo: subtract mm1 from mm2 with saturation and vice versa */
      /*       OR the results to get absolute difference */
      "movq %%mm1,%%mm5\n"
      "movq %%mm3,%%mm6\n"
      "psubusb %%mm2,%%mm1\n"
      "psubusb %%mm4,%%mm3\n"
      "psubusb %%mm5,%%mm2\n"
      "psubusb %%mm6,%%mm4\n"

      "por %%mm1,%%mm2\n"
      "por %%mm3,%%mm4\n"

      /* now convert to 16-bit vectors so we can square them */
      "movq %%mm2,%%mm1\n"
      "movq %%mm4,%%mm3\n"

      "punpckhbw %%mm0,%%mm2\n"
      "punpckhbw %%mm0,%%mm4\n"
      "punpcklbw %%mm0,%%mm1\n" /* mm1 now spread over (mm1,mm2) */
      "punpcklbw %%mm0,%%mm3\n" /* mm4 now spread over (mm3,mm4) */

      "pmaddwd %%mm2,%%mm2\n"
      "pmaddwd %%mm4,%%mm4\n"
      "pmaddwd %%mm1,%%mm1\n"
      "pmaddwd %%mm3,%%mm3\n"

      "add %3,%0\n"
      "add %3,%1\n"

      "paddd %%mm2,%%mm1\n"
      "paddd %%mm4,%%mm3\n"
      "paddd %%mm1,%%mm7\n"
      "paddd %%mm3,%%mm7\n"

      "decl %%ecx\n"
      "jnz 1b\n"

      "movq %%mm7,%%mm1\n"
      "psrlq $32, %%mm7\n"      /* shift hi dword to lo */
      "paddd %%mm7,%%mm1\n"
      "movd %%mm1,%2\n"
      : "+r" (pix1), "+r" (pix2), "=r"(tmp)
      : "r" ((x86_reg)line_size) , "m" (h)
      : "%ecx");
    return tmp;
}

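/* SSE2 version of sse16: two full 16-pixel rows per iteration using
 * unaligned 128-bit loads. */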
static int sse16_sse2(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {
    int tmp;
  __asm__ volatile (
      "shr $1,%2\n"
      "pxor %%xmm0,%%xmm0\n"    /* mm0 = 0 */
      "pxor %%xmm7,%%xmm7\n"    /* mm7 holds the sum */
      "1:\n"
      "movdqu (%0),%%xmm1\n"    /* mm1 = pix1[0][0-15] */
      "movdqu (%1),%%xmm2\n"    /* mm2 = pix2[0][0-15] */
      "movdqu (%0,%4),%%xmm3\n" /* mm3 = pix1[1][0-15] */
      "movdqu (%1,%4),%%xmm4\n" /* mm4 = pix2[1][0-15] */

      /* todo: mm1-mm2, mm3-mm4 */
      /* algo: subtract mm1 from mm2 with saturation and vice versa */
      /*       OR the results to get absolute difference */
      "movdqa %%xmm1,%%xmm5\n"
      "movdqa %%xmm3,%%xmm6\n"
      "psubusb %%xmm2,%%xmm1\n"
      "psubusb %%xmm4,%%xmm3\n"
      "psubusb %%xmm5,%%xmm2\n"
      "psubusb %%xmm6,%%xmm4\n"

      "por %%xmm1,%%xmm2\n"
      "por %%xmm3,%%xmm4\n"

      /* now convert to 16-bit vectors so we can square them */
      "movdqa %%xmm2,%%xmm1\n"
      "movdqa %%xmm4,%%xmm3\n"

      "punpckhbw %%xmm0,%%xmm2\n"
      "punpckhbw %%xmm0,%%xmm4\n"
      "punpcklbw %%xmm0,%%xmm1\n"  /* mm1 now spread over (mm1,mm2) */
      "punpcklbw %%xmm0,%%xmm3\n"  /* mm4 now spread over (mm3,mm4) */

      "pmaddwd %%xmm2,%%xmm2\n"
      "pmaddwd %%xmm4,%%xmm4\n"
      "pmaddwd %%xmm1,%%xmm1\n"
      "pmaddwd %%xmm3,%%xmm3\n"

      "lea (%0,%4,2), %0\n"        /* pix1 += 2*line_size */
      "lea (%1,%4,2), %1\n"        /* pix2 += 2*line_size */

      "paddd %%xmm2,%%xmm1\n"
      "paddd %%xmm4,%%xmm3\n"
      "paddd %%xmm1,%%xmm7\n"
      "paddd %%xmm3,%%xmm7\n"

      "decl %2\n"
      "jnz 1b\n"

      "movdqa %%xmm7,%%xmm1\n"
      "psrldq $8, %%xmm7\n"        /* shift hi qword to lo */
      "paddd %%xmm1,%%xmm7\n"
      "movdqa %%xmm7,%%xmm1\n"
      "psrldq $4, %%xmm7\n"        /* shift hi dword to lo */
      "paddd %%xmm1,%%xmm7\n"
      "movd %%xmm7,%3\n"
      : "+r" (pix1), "+r" (pix2), "+r"(h), "=r"(tmp)
      : "r" ((x86_reg)line_size));
    return tmp;
}

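/* Rough high-frequency noise measure for an 8-pixel-wide block: sums the
 * absolute change of horizontal pixel differences between consecutive rows.
 * Used by the nsse* comparison functions below. */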
static int hf_noise8_mmx(uint8_t * pix1, int line_size, int h) {
    int tmp;
  __asm__ volatile (
      "movl %3,%%ecx\n"
      "pxor %%mm7,%%mm7\n"
      "pxor %%mm6,%%mm6\n"

      "movq (%0),%%mm0\n"
      "movq %%mm0, %%mm1\n"
      "psllq $8, %%mm0\n"
      "psrlq $8, %%mm1\n"
      "psrlq $8, %%mm0\n"
      "movq %%mm0, %%mm2\n"
      "movq %%mm1, %%mm3\n"
      "punpcklbw %%mm7,%%mm0\n"
      "punpcklbw %%mm7,%%mm1\n"
      "punpckhbw %%mm7,%%mm2\n"
      "punpckhbw %%mm7,%%mm3\n"
      "psubw %%mm1, %%mm0\n"
      "psubw %%mm3, %%mm2\n"

      "add %2,%0\n"

      "movq (%0),%%mm4\n"
      "movq %%mm4, %%mm1\n"
      "psllq $8, %%mm4\n"
      "psrlq $8, %%mm1\n"
      "psrlq $8, %%mm4\n"
      "movq %%mm4, %%mm5\n"
      "movq %%mm1, %%mm3\n"
      "punpcklbw %%mm7,%%mm4\n"
      "punpcklbw %%mm7,%%mm1\n"
      "punpckhbw %%mm7,%%mm5\n"
      "punpckhbw %%mm7,%%mm3\n"
      "psubw %%mm1, %%mm4\n"
      "psubw %%mm3, %%mm5\n"
      "psubw %%mm4, %%mm0\n"
      "psubw %%mm5, %%mm2\n"
      "pxor %%mm3, %%mm3\n"
      "pxor %%mm1, %%mm1\n"
      "pcmpgtw %%mm0, %%mm3\n\t"
      "pcmpgtw %%mm2, %%mm1\n\t"
      "pxor %%mm3, %%mm0\n"
      "pxor %%mm1, %%mm2\n"
      "psubw %%mm3, %%mm0\n"
      "psubw %%mm1, %%mm2\n"
      "paddw %%mm0, %%mm2\n"
      "paddw %%mm2, %%mm6\n"

      "add %2,%0\n"
      "1:\n"

      "movq (%0),%%mm0\n"
      "movq %%mm0, %%mm1\n"
      "psllq $8, %%mm0\n"
      "psrlq $8, %%mm1\n"
      "psrlq $8, %%mm0\n"
      "movq %%mm0, %%mm2\n"
      "movq %%mm1, %%mm3\n"
      "punpcklbw %%mm7,%%mm0\n"
      "punpcklbw %%mm7,%%mm1\n"
      "punpckhbw %%mm7,%%mm2\n"
      "punpckhbw %%mm7,%%mm3\n"
      "psubw %%mm1, %%mm0\n"
      "psubw %%mm3, %%mm2\n"
      "psubw %%mm0, %%mm4\n"
      "psubw %%mm2, %%mm5\n"
      "pxor %%mm3, %%mm3\n"
      "pxor %%mm1, %%mm1\n"
      "pcmpgtw %%mm4, %%mm3\n\t"
      "pcmpgtw %%mm5, %%mm1\n\t"
      "pxor %%mm3, %%mm4\n"
      "pxor %%mm1, %%mm5\n"
      "psubw %%mm3, %%mm4\n"
      "psubw %%mm1, %%mm5\n"
      "paddw %%mm4, %%mm5\n"
      "paddw %%mm5, %%mm6\n"

      "add %2,%0\n"

      "movq (%0),%%mm4\n"
      "movq %%mm4, %%mm1\n"
      "psllq $8, %%mm4\n"
      "psrlq $8, %%mm1\n"
      "psrlq $8, %%mm4\n"
      "movq %%mm4, %%mm5\n"
      "movq %%mm1, %%mm3\n"
      "punpcklbw %%mm7,%%mm4\n"
      "punpcklbw %%mm7,%%mm1\n"
      "punpckhbw %%mm7,%%mm5\n"
      "punpckhbw %%mm7,%%mm3\n"
      "psubw %%mm1, %%mm4\n"
      "psubw %%mm3, %%mm5\n"
      "psubw %%mm4, %%mm0\n"
      "psubw %%mm5, %%mm2\n"
      "pxor %%mm3, %%mm3\n"
      "pxor %%mm1, %%mm1\n"
      "pcmpgtw %%mm0, %%mm3\n\t"
      "pcmpgtw %%mm2, %%mm1\n\t"
      "pxor %%mm3, %%mm0\n"
      "pxor %%mm1, %%mm2\n"
      "psubw %%mm3, %%mm0\n"
      "psubw %%mm1, %%mm2\n"
      "paddw %%mm0, %%mm2\n"
      "paddw %%mm2, %%mm6\n"

      "add %2,%0\n"
      "subl $2, %%ecx\n"
      " jnz 1b\n"

      "movq %%mm6, %%mm0\n"
      "punpcklwd %%mm7,%%mm0\n"
      "punpckhwd %%mm7,%%mm6\n"
      "paddd %%mm0, %%mm6\n"

      "movq %%mm6,%%mm0\n"
      "psrlq $32, %%mm6\n"
      "paddd %%mm6,%%mm0\n"
      "movd %%mm0,%1\n"
      : "+r" (pix1), "=r"(tmp)
      : "r" ((x86_reg)line_size) , "g" (h-2)
      : "%ecx");
      return tmp;
}

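/* 16-pixel-wide variant: handles the left half inline (using loads at
 * offset 1 for the horizontal differences) and adds hf_noise8_mmx() on
 * the right half. */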
static int hf_noise16_mmx(uint8_t * pix1, int line_size, int h) {
    int tmp;
    uint8_t * pix= pix1;
  __asm__ volatile (
      "movl %3,%%ecx\n"
      "pxor %%mm7,%%mm7\n"
      "pxor %%mm6,%%mm6\n"

      "movq (%0),%%mm0\n"
      "movq 1(%0),%%mm1\n"
      "movq %%mm0, %%mm2\n"
      "movq %%mm1, %%mm3\n"
      "punpcklbw %%mm7,%%mm0\n"
      "punpcklbw %%mm7,%%mm1\n"
      "punpckhbw %%mm7,%%mm2\n"
      "punpckhbw %%mm7,%%mm3\n"
      "psubw %%mm1, %%mm0\n"
      "psubw %%mm3, %%mm2\n"

      "add %2,%0\n"

      "movq (%0),%%mm4\n"
      "movq 1(%0),%%mm1\n"
      "movq %%mm4, %%mm5\n"
      "movq %%mm1, %%mm3\n"
      "punpcklbw %%mm7,%%mm4\n"
      "punpcklbw %%mm7,%%mm1\n"
      "punpckhbw %%mm7,%%mm5\n"
      "punpckhbw %%mm7,%%mm3\n"
      "psubw %%mm1, %%mm4\n"
      "psubw %%mm3, %%mm5\n"
      "psubw %%mm4, %%mm0\n"
      "psubw %%mm5, %%mm2\n"
      "pxor %%mm3, %%mm3\n"
      "pxor %%mm1, %%mm1\n"
      "pcmpgtw %%mm0, %%mm3\n\t"
      "pcmpgtw %%mm2, %%mm1\n\t"
      "pxor %%mm3, %%mm0\n"
      "pxor %%mm1, %%mm2\n"
      "psubw %%mm3, %%mm0\n"
      "psubw %%mm1, %%mm2\n"
      "paddw %%mm0, %%mm2\n"
      "paddw %%mm2, %%mm6\n"

      "add %2,%0\n"
      "1:\n"

      "movq (%0),%%mm0\n"
      "movq 1(%0),%%mm1\n"
      "movq %%mm0, %%mm2\n"
      "movq %%mm1, %%mm3\n"
      "punpcklbw %%mm7,%%mm0\n"
      "punpcklbw %%mm7,%%mm1\n"
      "punpckhbw %%mm7,%%mm2\n"
      "punpckhbw %%mm7,%%mm3\n"
      "psubw %%mm1, %%mm0\n"
      "psubw %%mm3, %%mm2\n"
      "psubw %%mm0, %%mm4\n"
      "psubw %%mm2, %%mm5\n"
      "pxor %%mm3, %%mm3\n"
      "pxor %%mm1, %%mm1\n"
      "pcmpgtw %%mm4, %%mm3\n\t"
      "pcmpgtw %%mm5, %%mm1\n\t"
      "pxor %%mm3, %%mm4\n"
      "pxor %%mm1, %%mm5\n"
      "psubw %%mm3, %%mm4\n"
      "psubw %%mm1, %%mm5\n"
      "paddw %%mm4, %%mm5\n"
      "paddw %%mm5, %%mm6\n"

      "add %2,%0\n"

      "movq (%0),%%mm4\n"
      "movq 1(%0),%%mm1\n"
      "movq %%mm4, %%mm5\n"
      "movq %%mm1, %%mm3\n"
      "punpcklbw %%mm7,%%mm4\n"
      "punpcklbw %%mm7,%%mm1\n"
      "punpckhbw %%mm7,%%mm5\n"
      "punpckhbw %%mm7,%%mm3\n"
      "psubw %%mm1, %%mm4\n"
      "psubw %%mm3, %%mm5\n"
      "psubw %%mm4, %%mm0\n"
      "psubw %%mm5, %%mm2\n"
      "pxor %%mm3, %%mm3\n"
      "pxor %%mm1, %%mm1\n"
      "pcmpgtw %%mm0, %%mm3\n\t"
      "pcmpgtw %%mm2, %%mm1\n\t"
      "pxor %%mm3, %%mm0\n"
      "pxor %%mm1, %%mm2\n"
      "psubw %%mm3, %%mm0\n"
      "psubw %%mm1, %%mm2\n"
      "paddw %%mm0, %%mm2\n"
      "paddw %%mm2, %%mm6\n"

      "add %2,%0\n"
      "subl $2, %%ecx\n"
      " jnz 1b\n"

      "movq %%mm6, %%mm0\n"
      "punpcklwd %%mm7,%%mm0\n"
      "punpckhwd %%mm7,%%mm6\n"
      "paddd %%mm0, %%mm6\n"

      "movq %%mm6,%%mm0\n"
      "psrlq $32, %%mm6\n"
      "paddd %%mm6,%%mm0\n"
      "movd %%mm0,%1\n"
      : "+r" (pix1), "=r"(tmp)
      : "r" ((x86_reg)line_size) , "g" (h-2)
      : "%ecx");
      return tmp + hf_noise8_mmx(pix+8, line_size, h);
}

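/* Noise-preserving SSE: plain SSE plus a penalty proportional to the
 * difference in high-frequency noise between the two blocks, weighted by
 * AVCodecContext.nsse_weight (or 8 if no context is given). */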
static int nsse16_mmx(void *p, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {
    MpegEncContext *c = p;
    int score1, score2;

    if(c) score1 = c->dsp.sse[0](c, pix1, pix2, line_size, h);
    else  score1 = sse16_mmx(c, pix1, pix2, line_size, h);
    score2= hf_noise16_mmx(pix1, line_size, h) - hf_noise16_mmx(pix2, line_size, h);

    if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight;
    else  return score1 + FFABS(score2)*8;
}

static int nsse8_mmx(void *p, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {
    MpegEncContext *c = p;
    int score1= sse8_mmx(c, pix1, pix2, line_size, h);
    int score2= hf_noise8_mmx(pix1, line_size, h) - hf_noise8_mmx(pix2, line_size, h);

    if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight;
    else  return score1 + FFABS(score2)*8;
}

648
    int tmp;
649

    
650
    assert( (((int)pix) & 7) == 0);
651
    assert((line_size &7) ==0);
652

    
653
#define SUM(in0, in1, out0, out1) \
654
      "movq (%0), %%mm2\n"\
655
      "movq 8(%0), %%mm3\n"\
656
      "add %2,%0\n"\
657
      "movq %%mm2, " #out0 "\n"\
658
      "movq %%mm3, " #out1 "\n"\
659
      "psubusb " #in0 ", %%mm2\n"\
660
      "psubusb " #in1 ", %%mm3\n"\
661
      "psubusb " #out0 ", " #in0 "\n"\
662
      "psubusb " #out1 ", " #in1 "\n"\
663
      "por %%mm2, " #in0 "\n"\
664
      "por %%mm3, " #in1 "\n"\
665
      "movq " #in0 ", %%mm2\n"\
666
      "movq " #in1 ", %%mm3\n"\
667
      "punpcklbw %%mm7, " #in0 "\n"\
668
      "punpcklbw %%mm7, " #in1 "\n"\
669
      "punpckhbw %%mm7, %%mm2\n"\
670
      "punpckhbw %%mm7, %%mm3\n"\
671
      "paddw " #in1 ", " #in0 "\n"\
672
      "paddw %%mm3, %%mm2\n"\
673
      "paddw %%mm2, " #in0 "\n"\
674
      "paddw " #in0 ", %%mm6\n"
675

    
676

    
677
  __asm__ volatile (
678
      "movl %3,%%ecx\n"
679
      "pxor %%mm6,%%mm6\n"
680
      "pxor %%mm7,%%mm7\n"
681
      "movq (%0),%%mm0\n"
682
      "movq 8(%0),%%mm1\n"
683
      "add %2,%0\n"
684
      "jmp 2f\n"
685
      "1:\n"
686

    
687
      SUM(%%mm4, %%mm5, %%mm0, %%mm1)
688
      "2:\n"
689
      SUM(%%mm0, %%mm1, %%mm4, %%mm5)
690

    
691
      "subl $2, %%ecx\n"
692
      "jnz 1b\n"
693

    
694
      "movq %%mm6,%%mm0\n"
695
      "psrlq $32, %%mm6\n"
696
      "paddw %%mm6,%%mm0\n"
697
      "movq %%mm0,%%mm6\n"
698
      "psrlq $16, %%mm0\n"
699
      "paddw %%mm6,%%mm0\n"
700
      "movd %%mm0,%1\n"
701
      : "+r" (pix), "=r"(tmp)
702
      : "r" ((x86_reg)line_size) , "m" (h)
703
      : "%ecx");
704
    return tmp & 0xFFFF;
705
}
706
#undef SUM
707

    
708
static int vsad_intra16_mmx2(void *v, uint8_t * pix, uint8_t * dummy, int line_size, int h) {
709
    int tmp;
710

    
711
    assert( (((int)pix) & 7) == 0);
712
    assert((line_size &7) ==0);
713

    
714
#define SUM(in0, in1, out0, out1) \
715
      "movq (%0), " #out0 "\n"\
716
      "movq 8(%0), " #out1 "\n"\
717
      "add %2,%0\n"\
718
      "psadbw " #out0 ", " #in0 "\n"\
719
      "psadbw " #out1 ", " #in1 "\n"\
720
      "paddw " #in1 ", " #in0 "\n"\
721
      "paddw " #in0 ", %%mm6\n"
722

    
723
  __asm__ volatile (
724
      "movl %3,%%ecx\n"
725
      "pxor %%mm6,%%mm6\n"
726
      "pxor %%mm7,%%mm7\n"
727
      "movq (%0),%%mm0\n"
728
      "movq 8(%0),%%mm1\n"
729
      "add %2,%0\n"
730
      "jmp 2f\n"
731
      "1:\n"
732

    
733
      SUM(%%mm4, %%mm5, %%mm0, %%mm1)
734
      "2:\n"
735
      SUM(%%mm0, %%mm1, %%mm4, %%mm5)
736

    
737
      "subl $2, %%ecx\n"
738
      "jnz 1b\n"
739

    
740
      "movd %%mm6,%1\n"
741
      : "+r" (pix), "=r"(tmp)
742
      : "r" ((x86_reg)line_size) , "m" (h)
743
      : "%ecx");
744
    return tmp;
745
}
746
#undef SUM
747

    
748
static int vsad16_mmx(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {
749
    int tmp;
750

    
751
    assert( (((int)pix1) & 7) == 0);
752
    assert( (((int)pix2) & 7) == 0);
753
    assert((line_size &7) ==0);
754

    
755
#define SUM(in0, in1, out0, out1) \
756
      "movq (%0),%%mm2\n"\
757
      "movq (%1)," #out0 "\n"\
758
      "movq 8(%0),%%mm3\n"\
759
      "movq 8(%1)," #out1 "\n"\
760
      "add %3,%0\n"\
761
      "add %3,%1\n"\
762
      "psubb " #out0 ", %%mm2\n"\
763
      "psubb " #out1 ", %%mm3\n"\
764
      "pxor %%mm7, %%mm2\n"\
765
      "pxor %%mm7, %%mm3\n"\
766
      "movq %%mm2, " #out0 "\n"\
767
      "movq %%mm3, " #out1 "\n"\
768
      "psubusb " #in0 ", %%mm2\n"\
769
      "psubusb " #in1 ", %%mm3\n"\
770
      "psubusb " #out0 ", " #in0 "\n"\
771
      "psubusb " #out1 ", " #in1 "\n"\
772
      "por %%mm2, " #in0 "\n"\
773
      "por %%mm3, " #in1 "\n"\
774
      "movq " #in0 ", %%mm2\n"\
775
      "movq " #in1 ", %%mm3\n"\
776
      "punpcklbw %%mm7, " #in0 "\n"\
777
      "punpcklbw %%mm7, " #in1 "\n"\
778
      "punpckhbw %%mm7, %%mm2\n"\
779
      "punpckhbw %%mm7, %%mm3\n"\
780
      "paddw " #in1 ", " #in0 "\n"\
781
      "paddw %%mm3, %%mm2\n"\
782
      "paddw %%mm2, " #in0 "\n"\
783
      "paddw " #in0 ", %%mm6\n"
784

    
785

    
786
  __asm__ volatile (
787
      "movl %4,%%ecx\n"
788
      "pxor %%mm6,%%mm6\n"
789
      "pcmpeqw %%mm7,%%mm7\n"
790
      "psllw $15, %%mm7\n"
791
      "packsswb %%mm7, %%mm7\n"
792
      "movq (%0),%%mm0\n"
793
      "movq (%1),%%mm2\n"
794
      "movq 8(%0),%%mm1\n"
795
      "movq 8(%1),%%mm3\n"
796
      "add %3,%0\n"
797
      "add %3,%1\n"
798
      "psubb %%mm2, %%mm0\n"
799
      "psubb %%mm3, %%mm1\n"
800
      "pxor %%mm7, %%mm0\n"
801
      "pxor %%mm7, %%mm1\n"
802
      "jmp 2f\n"
803
      "1:\n"
804

    
805
      SUM(%%mm4, %%mm5, %%mm0, %%mm1)
806
      "2:\n"
807
      SUM(%%mm0, %%mm1, %%mm4, %%mm5)
808

    
809
      "subl $2, %%ecx\n"
810
      "jnz 1b\n"
811

    
812
      "movq %%mm6,%%mm0\n"
813
      "psrlq $32, %%mm6\n"
814
      "paddw %%mm6,%%mm0\n"
815
      "movq %%mm0,%%mm6\n"
816
      "psrlq $16, %%mm0\n"
817
      "paddw %%mm6,%%mm0\n"
818
      "movd %%mm0,%2\n"
819
      : "+r" (pix1), "+r" (pix2), "=r"(tmp)
820
      : "r" ((x86_reg)line_size) , "m" (h)
821
      : "%ecx");
822
    return tmp & 0x7FFF;
823
}
824
#undef SUM
825

    
826
static int vsad16_mmx2(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {
827
    int tmp;
828

    
829
    assert( (((int)pix1) & 7) == 0);
830
    assert( (((int)pix2) & 7) == 0);
831
    assert((line_size &7) ==0);
832

    
833
#define SUM(in0, in1, out0, out1) \
834
      "movq (%0)," #out0 "\n"\
835
      "movq (%1),%%mm2\n"\
836
      "movq 8(%0)," #out1 "\n"\
837
      "movq 8(%1),%%mm3\n"\
838
      "add %3,%0\n"\
839
      "add %3,%1\n"\
840
      "psubb %%mm2, " #out0 "\n"\
841
      "psubb %%mm3, " #out1 "\n"\
842
      "pxor %%mm7, " #out0 "\n"\
843
      "pxor %%mm7, " #out1 "\n"\
844
      "psadbw " #out0 ", " #in0 "\n"\
845
      "psadbw " #out1 ", " #in1 "\n"\
846
      "paddw " #in1 ", " #in0 "\n"\
847
      "paddw " #in0 ", %%mm6\n"
848

    
849
  __asm__ volatile (
850
      "movl %4,%%ecx\n"
851
      "pxor %%mm6,%%mm6\n"
852
      "pcmpeqw %%mm7,%%mm7\n"
853
      "psllw $15, %%mm7\n"
854
      "packsswb %%mm7, %%mm7\n"
855
      "movq (%0),%%mm0\n"
856
      "movq (%1),%%mm2\n"
857
      "movq 8(%0),%%mm1\n"
858
      "movq 8(%1),%%mm3\n"
859
      "add %3,%0\n"
860
      "add %3,%1\n"
861
      "psubb %%mm2, %%mm0\n"
862
      "psubb %%mm3, %%mm1\n"
863
      "pxor %%mm7, %%mm0\n"
864
      "pxor %%mm7, %%mm1\n"
865
      "jmp 2f\n"
866
      "1:\n"
867

    
868
      SUM(%%mm4, %%mm5, %%mm0, %%mm1)
869
      "2:\n"
870
      SUM(%%mm0, %%mm1, %%mm4, %%mm5)
871

    
872
      "subl $2, %%ecx\n"
873
      "jnz 1b\n"
874

    
875
      "movd %%mm6,%2\n"
876
      : "+r" (pix1), "+r" (pix2), "=r"(tmp)
877
      : "r" ((x86_reg)line_size) , "m" (h)
878
      : "%ecx");
879
    return tmp;
880
}
881
#undef SUM
882

    
883
static void diff_bytes_mmx(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
884
    x86_reg i=0;
885
    __asm__ volatile(
886
        "1:                             \n\t"
887
        "movq  (%2, %0), %%mm0          \n\t"
888
        "movq  (%1, %0), %%mm1          \n\t"
889
        "psubb %%mm0, %%mm1             \n\t"
890
        "movq %%mm1, (%3, %0)           \n\t"
891
        "movq 8(%2, %0), %%mm0          \n\t"
892
        "movq 8(%1, %0), %%mm1          \n\t"
893
        "psubb %%mm0, %%mm1             \n\t"
894
        "movq %%mm1, 8(%3, %0)          \n\t"
895
        "add $16, %0                    \n\t"
896
        "cmp %4, %0                     \n\t"
897
        " jb 1b                         \n\t"
898
        : "+r" (i)
899
        : "r"(src1), "r"(src2), "r"(dst), "r"((x86_reg)w-15)
900
    );
901
    for(; i<w; i++)
902
        dst[i+0] = src1[i+0]-src2[i+0];
903
}
904

    
905
static void sub_hfyu_median_prediction_mmx2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int w, int *left, int *left_top){
906
    x86_reg i=0;
907
    uint8_t l, lt;
908

    
909
    __asm__ volatile(
910
        "1:                             \n\t"
911
        "movq  -1(%1, %0), %%mm0        \n\t" // LT
912
        "movq  (%1, %0), %%mm1          \n\t" // T
913
        "movq  -1(%2, %0), %%mm2        \n\t" // L
914
        "movq  (%2, %0), %%mm3          \n\t" // X
915
        "movq %%mm2, %%mm4              \n\t" // L
916
        "psubb %%mm0, %%mm2             \n\t"
917
        "paddb %%mm1, %%mm2             \n\t" // L + T - LT
918
        "movq %%mm4, %%mm5              \n\t" // L
919
        "pmaxub %%mm1, %%mm4            \n\t" // max(T, L)
920
        "pminub %%mm5, %%mm1            \n\t" // min(T, L)
921
        "pminub %%mm2, %%mm4            \n\t"
922
        "pmaxub %%mm1, %%mm4            \n\t"
923
        "psubb %%mm4, %%mm3             \n\t" // dst - pred
924
        "movq %%mm3, (%3, %0)           \n\t"
925
        "add $8, %0                     \n\t"
926
        "cmp %4, %0                     \n\t"
927
        " jb 1b                         \n\t"
928
        : "+r" (i)
929
        : "r"(src1), "r"(src2), "r"(dst), "r"((x86_reg)w)
930
    );
931

    
932
    l= *left;
933
    lt= *left_top;
934

    
935
    dst[0]= src2[0] - mid_pred(l, src1[0], (l + src1[0] - lt)&0xFF);
936

    
937
    *left_top= src1[w-1];
938
    *left    = src2[w-1];
939
}
940

    
941
#define DIFF_PIXELS_1(m,a,t,p1,p2)\
942
    "mov"#m" "#p1", "#a"              \n\t"\
943
    "mov"#m" "#p2", "#t"              \n\t"\
944
    "punpcklbw "#a", "#t"             \n\t"\
945
    "punpcklbw "#a", "#a"             \n\t"\
946
    "psubw     "#t", "#a"             \n\t"\
947

    
948
#define DIFF_PIXELS_8(m0,m1,mm,p1,p2,stride,temp) {\
949
    uint8_t *p1b=p1, *p2b=p2;\
950
    __asm__ volatile(\
951
        DIFF_PIXELS_1(m0, mm##0, mm##7, (%1), (%2))\
952
        DIFF_PIXELS_1(m0, mm##1, mm##7, (%1,%3), (%2,%3))\
953
        DIFF_PIXELS_1(m0, mm##2, mm##7, (%1,%3,2), (%2,%3,2))\
954
        "add %4, %1                   \n\t"\
955
        "add %4, %2                   \n\t"\
956
        DIFF_PIXELS_1(m0, mm##3, mm##7, (%1), (%2))\
957
        DIFF_PIXELS_1(m0, mm##4, mm##7, (%1,%3), (%2,%3))\
958
        DIFF_PIXELS_1(m0, mm##5, mm##7, (%1,%3,2), (%2,%3,2))\
959
        DIFF_PIXELS_1(m0, mm##6, mm##7, (%1,%4), (%2,%4))\
960
        "mov"#m1" "#mm"0, %0          \n\t"\
961
        DIFF_PIXELS_1(m0, mm##7, mm##0, (%1,%3,4), (%2,%3,4))\
962
        "mov"#m1" %0, "#mm"0          \n\t"\
963
        : "+m"(temp), "+r"(p1b), "+r"(p2b)\
964
        : "r"((x86_reg)stride), "r"((x86_reg)stride*3)\
965
    );\
966
}
967
    //the "+m"(temp) is needed as gcc 2.95 sometimes fails to compile "=m"(temp)
968

    
969
#define DIFF_PIXELS_4x8(p1,p2,stride,temp) DIFF_PIXELS_8(d, q,   %%mm,  p1, p2, stride, temp)
970
#define DIFF_PIXELS_8x8(p1,p2,stride,temp) DIFF_PIXELS_8(q, dqa, %%xmm, p1, p2, stride, temp)
971

    
972
#define LBUTTERFLY2(a1,b1,a2,b2)\
973
    "paddw " #b1 ", " #a1 "           \n\t"\
974
    "paddw " #b2 ", " #a2 "           \n\t"\
975
    "paddw " #b1 ", " #b1 "           \n\t"\
976
    "paddw " #b2 ", " #b2 "           \n\t"\
977
    "psubw " #a1 ", " #b1 "           \n\t"\
978
    "psubw " #a2 ", " #b2 "           \n\t"
979

    
980
#define HADAMARD8(m0, m1, m2, m3, m4, m5, m6, m7)\
981
        LBUTTERFLY2(m0, m1, m2, m3)\
982
        LBUTTERFLY2(m4, m5, m6, m7)\
983
        LBUTTERFLY2(m0, m2, m1, m3)\
984
        LBUTTERFLY2(m4, m6, m5, m7)\
985
        LBUTTERFLY2(m0, m4, m1, m5)\
986
        LBUTTERFLY2(m2, m6, m3, m7)\
987

    
988
#define HADAMARD48 HADAMARD8(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm6, %%mm7)
989

    
990
#define MMABS_MMX(a,z)\
991
    "pxor " #z ", " #z "              \n\t"\
992
    "pcmpgtw " #a ", " #z "           \n\t"\
993
    "pxor " #z ", " #a "              \n\t"\
994
    "psubw " #z ", " #a "             \n\t"
995

    
996
#define MMABS_MMX2(a,z)\
997
    "pxor " #z ", " #z "              \n\t"\
998
    "psubw " #a ", " #z "             \n\t"\
999
    "pmaxsw " #z ", " #a "            \n\t"
1000

    
1001
#define MMABS_SSSE3(a,z)\
1002
    "pabsw " #a ", " #a "             \n\t"
1003

    
1004
#define MMABS_SUM(a,z, sum)\
1005
    MMABS(a,z)\
1006
    "paddusw " #a ", " #sum "         \n\t"
1007

    
1008
#define MMABS_SUM_8x8_NOSPILL\
1009
    MMABS(%%xmm0, %%xmm8)\
1010
    MMABS(%%xmm1, %%xmm9)\
1011
    MMABS_SUM(%%xmm2, %%xmm8, %%xmm0)\
1012
    MMABS_SUM(%%xmm3, %%xmm9, %%xmm1)\
1013
    MMABS_SUM(%%xmm4, %%xmm8, %%xmm0)\
1014
    MMABS_SUM(%%xmm5, %%xmm9, %%xmm1)\
1015
    MMABS_SUM(%%xmm6, %%xmm8, %%xmm0)\
1016
    MMABS_SUM(%%xmm7, %%xmm9, %%xmm1)\
1017
    "paddusw %%xmm1, %%xmm0           \n\t"
1018

    
1019
#if ARCH_X86_64
1020
#define MMABS_SUM_8x8_SSE2 MMABS_SUM_8x8_NOSPILL
1021
#else
1022
#define MMABS_SUM_8x8_SSE2\
1023
    "movdqa %%xmm7, (%1)              \n\t"\
1024
    MMABS(%%xmm0, %%xmm7)\
1025
    MMABS(%%xmm1, %%xmm7)\
1026
    MMABS_SUM(%%xmm2, %%xmm7, %%xmm0)\
1027
    MMABS_SUM(%%xmm3, %%xmm7, %%xmm1)\
1028
    MMABS_SUM(%%xmm4, %%xmm7, %%xmm0)\
1029
    MMABS_SUM(%%xmm5, %%xmm7, %%xmm1)\
1030
    MMABS_SUM(%%xmm6, %%xmm7, %%xmm0)\
1031
    "movdqa (%1), %%xmm2              \n\t"\
1032
    MMABS_SUM(%%xmm2, %%xmm7, %%xmm1)\
1033
    "paddusw %%xmm1, %%xmm0           \n\t"
1034
#endif
1035

    
1036
/* FIXME: HSUM_* saturates at 64k, while an 8x8 hadamard or dct block can get up to
1037
 * about 100k on extreme inputs. But that's very unlikely to occur in natural video,
1038
 * and it's even more unlikely to not have any alternative mvs/modes with lower cost. */
1039
#define HSUM_MMX(a, t, dst)\
1040
    "movq "#a", "#t"                  \n\t"\
1041
    "psrlq $32, "#a"                  \n\t"\
1042
    "paddusw "#t", "#a"               \n\t"\
1043
    "movq "#a", "#t"                  \n\t"\
1044
    "psrlq $16, "#a"                  \n\t"\
1045
    "paddusw "#t", "#a"               \n\t"\
1046
    "movd "#a", "#dst"                \n\t"\
1047

    
1048
#define HSUM_MMX2(a, t, dst)\
1049
    "pshufw $0x0E, "#a", "#t"         \n\t"\
1050
    "paddusw "#t", "#a"               \n\t"\
1051
    "pshufw $0x01, "#a", "#t"         \n\t"\
1052
    "paddusw "#t", "#a"               \n\t"\
1053
    "movd "#a", "#dst"                \n\t"\
1054

    
1055
#define HSUM_SSE2(a, t, dst)\
1056
    "movhlps "#a", "#t"               \n\t"\
1057
    "paddusw "#t", "#a"               \n\t"\
1058
    "pshuflw $0x0E, "#a", "#t"        \n\t"\
1059
    "paddusw "#t", "#a"               \n\t"\
1060
    "pshuflw $0x01, "#a", "#t"        \n\t"\
1061
    "paddusw "#t", "#a"               \n\t"\
1062
    "movd "#a", "#dst"                \n\t"\
1063

    
1064
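/* hadamard8_diff (SATD): difference of two 8x8 blocks, 8x8 Hadamard
 * transform, then sum of the absolute transformed coefficients
 * (saturated to 16 bits). */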
#define HADAMARD8_DIFF_MMX(cpu) \
static int hadamard8_diff_##cpu(void *s, uint8_t *src1, uint8_t *src2, int stride, int h){\
    DECLARE_ALIGNED(8, uint64_t, temp)[13];\
    int sum;\
\
    assert(h==8);\
\
    DIFF_PIXELS_4x8(src1, src2, stride, temp[0]);\
\
    __asm__ volatile(\
        HADAMARD48\
\
        "movq %%mm7, 96(%1)             \n\t"\
\
        TRANSPOSE4(%%mm0, %%mm1, %%mm2, %%mm3, %%mm7)\
        STORE4(8,  0(%1), %%mm0, %%mm3, %%mm7, %%mm2)\
\
        "movq 96(%1), %%mm7             \n\t"\
        TRANSPOSE4(%%mm4, %%mm5, %%mm6, %%mm7, %%mm0)\
        STORE4(8, 64(%1), %%mm4, %%mm7, %%mm0, %%mm6)\
\
        : "=r" (sum)\
        : "r"(temp)\
    );\
\
    DIFF_PIXELS_4x8(src1+4, src2+4, stride, temp[4]);\
\
    __asm__ volatile(\
        HADAMARD48\
\
        "movq %%mm7, 96(%1)             \n\t"\
\
        TRANSPOSE4(%%mm0, %%mm1, %%mm2, %%mm3, %%mm7)\
        STORE4(8, 32(%1), %%mm0, %%mm3, %%mm7, %%mm2)\
\
        "movq 96(%1), %%mm7             \n\t"\
        TRANSPOSE4(%%mm4, %%mm5, %%mm6, %%mm7, %%mm0)\
        "movq %%mm7, %%mm5              \n\t"/*FIXME remove*/\
        "movq %%mm6, %%mm7              \n\t"\
        "movq %%mm0, %%mm6              \n\t"\
\
        LOAD4(8, 64(%1), %%mm0, %%mm1, %%mm2, %%mm3)\
\
        HADAMARD48\
        "movq %%mm7, 64(%1)             \n\t"\
        MMABS(%%mm0, %%mm7)\
        MMABS(%%mm1, %%mm7)\
        MMABS_SUM(%%mm2, %%mm7, %%mm0)\
        MMABS_SUM(%%mm3, %%mm7, %%mm1)\
        MMABS_SUM(%%mm4, %%mm7, %%mm0)\
        MMABS_SUM(%%mm5, %%mm7, %%mm1)\
        MMABS_SUM(%%mm6, %%mm7, %%mm0)\
        "movq 64(%1), %%mm2             \n\t"\
        MMABS_SUM(%%mm2, %%mm7, %%mm1)\
        "paddusw %%mm1, %%mm0           \n\t"\
        "movq %%mm0, 64(%1)             \n\t"\
\
        LOAD4(8,  0(%1), %%mm0, %%mm1, %%mm2, %%mm3)\
        LOAD4(8, 32(%1), %%mm4, %%mm5, %%mm6, %%mm7)\
\
        HADAMARD48\
        "movq %%mm7, (%1)               \n\t"\
        MMABS(%%mm0, %%mm7)\
        MMABS(%%mm1, %%mm7)\
        MMABS_SUM(%%mm2, %%mm7, %%mm0)\
        MMABS_SUM(%%mm3, %%mm7, %%mm1)\
        MMABS_SUM(%%mm4, %%mm7, %%mm0)\
        MMABS_SUM(%%mm5, %%mm7, %%mm1)\
        MMABS_SUM(%%mm6, %%mm7, %%mm0)\
        "movq (%1), %%mm2               \n\t"\
        MMABS_SUM(%%mm2, %%mm7, %%mm1)\
        "paddusw 64(%1), %%mm0          \n\t"\
        "paddusw %%mm1, %%mm0           \n\t"\
\
        HSUM(%%mm0, %%mm1, %0)\
\
        : "=r" (sum)\
        : "r"(temp)\
    );\
    return sum&0xFFFF;\
}\
WRAPPER8_16_SQ(hadamard8_diff_##cpu, hadamard8_diff16_##cpu)

#define HADAMARD8_DIFF_SSE2(cpu) \
static int hadamard8_diff_##cpu(void *s, uint8_t *src1, uint8_t *src2, int stride, int h){\
    DECLARE_ALIGNED(16, uint64_t, temp)[4];\
    int sum;\
\
    assert(h==8);\
\
    DIFF_PIXELS_8x8(src1, src2, stride, temp[0]);\
\
    __asm__ volatile(\
        HADAMARD8(%%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm6, %%xmm7)\
        TRANSPOSE8(%%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm6, %%xmm7, (%1))\
        HADAMARD8(%%xmm0, %%xmm5, %%xmm7, %%xmm3, %%xmm6, %%xmm4, %%xmm2, %%xmm1)\
        MMABS_SUM_8x8\
        HSUM_SSE2(%%xmm0, %%xmm1, %0)\
        : "=r" (sum)\
        : "r"(temp)\
    );\
    return sum&0xFFFF;\
}\
WRAPPER8_16_SQ(hadamard8_diff_##cpu, hadamard8_diff16_##cpu)

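/* Instantiate hadamard8_diff_{mmx,mmx2,sse2,ssse3} and their 16x16 wrappers
 * with the matching MMABS and HSUM flavours. */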
#define MMABS(a,z)         MMABS_MMX(a,z)
#define HSUM(a,t,dst)      HSUM_MMX(a,t,dst)
HADAMARD8_DIFF_MMX(mmx)
#undef MMABS
#undef HSUM

#define MMABS(a,z)         MMABS_MMX2(a,z)
#define MMABS_SUM_8x8      MMABS_SUM_8x8_SSE2
#define HSUM(a,t,dst)      HSUM_MMX2(a,t,dst)
HADAMARD8_DIFF_MMX(mmx2)
HADAMARD8_DIFF_SSE2(sse2)
#undef MMABS
#undef MMABS_SUM_8x8
#undef HSUM

#if HAVE_SSSE3
#define MMABS(a,z)         MMABS_SSSE3(a,z)
#define MMABS_SUM_8x8      MMABS_SUM_8x8_NOSPILL
HADAMARD8_DIFF_SSE2(ssse3)
#undef MMABS
#undef MMABS_SUM_8x8
#endif

1193
    "mov"#m" "#o"+ 0(%1), "#mm"2      \n\t"\
1194
    "mov"#m" "#o"+16(%1), "#mm"3      \n\t"\
1195
    "mov"#m" "#o"+32(%1), "#mm"4      \n\t"\
1196
    "mov"#m" "#o"+48(%1), "#mm"5      \n\t"\
1197
    MMABS_SUM(mm##2, mm##6, mm##0)\
1198
    MMABS_SUM(mm##3, mm##7, mm##1)\
1199
    MMABS_SUM(mm##4, mm##6, mm##0)\
1200
    MMABS_SUM(mm##5, mm##7, mm##1)\
1201

    
1202
#define DCT_SAD_MMX\
1203
    "pxor %%mm0, %%mm0                \n\t"\
1204
    "pxor %%mm1, %%mm1                \n\t"\
1205
    DCT_SAD4(q, %%mm, 0)\
1206
    DCT_SAD4(q, %%mm, 8)\
1207
    DCT_SAD4(q, %%mm, 64)\
1208
    DCT_SAD4(q, %%mm, 72)\
1209
    "paddusw %%mm1, %%mm0             \n\t"\
1210
    HSUM(%%mm0, %%mm1, %0)
1211

    
1212
#define DCT_SAD_SSE2\
1213
    "pxor %%xmm0, %%xmm0              \n\t"\
1214
    "pxor %%xmm1, %%xmm1              \n\t"\
1215
    DCT_SAD4(dqa, %%xmm, 0)\
1216
    DCT_SAD4(dqa, %%xmm, 64)\
1217
    "paddusw %%xmm1, %%xmm0           \n\t"\
1218
    HSUM(%%xmm0, %%xmm1, %0)
1219

    
1220
#define DCT_SAD_FUNC(cpu) \
1221
static int sum_abs_dctelem_##cpu(DCTELEM *block){\
1222
    int sum;\
1223
    __asm__ volatile(\
1224
        DCT_SAD\
1225
        :"=r"(sum)\
1226
        :"r"(block)\
1227
    );\
1228
    return sum&0xFFFF;\
1229
}
1230

    
1231
#define DCT_SAD       DCT_SAD_MMX
1232
#define HSUM(a,t,dst) HSUM_MMX(a,t,dst)
1233
#define MMABS(a,z)    MMABS_MMX(a,z)
1234
DCT_SAD_FUNC(mmx)
1235
#undef MMABS
1236
#undef HSUM
1237

    
1238
#define HSUM(a,t,dst) HSUM_MMX2(a,t,dst)
1239
#define MMABS(a,z)    MMABS_MMX2(a,z)
1240
DCT_SAD_FUNC(mmx2)
1241
#undef HSUM
1242
#undef DCT_SAD
1243

    
1244
#define DCT_SAD       DCT_SAD_SSE2
1245
#define HSUM(a,t,dst) HSUM_SSE2(a,t,dst)
1246
DCT_SAD_FUNC(sse2)
1247
#undef MMABS
1248

    
1249
#if HAVE_SSSE3
1250
#define MMABS(a,z)    MMABS_SSSE3(a,z)
1251
DCT_SAD_FUNC(ssse3)
1252
#undef MMABS
1253
#endif
1254
#undef HSUM
1255
#undef DCT_SAD
1256

    
1257
static int ssd_int8_vs_int16_mmx(const int8_t *pix1, const int16_t *pix2, int size){
1258
    int sum;
1259
    x86_reg i=size;
1260
    __asm__ volatile(
1261
        "pxor %%mm4, %%mm4 \n"
1262
        "1: \n"
1263
        "sub $8, %0 \n"
1264
        "movq (%2,%0), %%mm2 \n"
1265
        "movq (%3,%0,2), %%mm0 \n"
1266
        "movq 8(%3,%0,2), %%mm1 \n"
1267
        "punpckhbw %%mm2, %%mm3 \n"
1268
        "punpcklbw %%mm2, %%mm2 \n"
1269
        "psraw $8, %%mm3 \n"
1270
        "psraw $8, %%mm2 \n"
1271
        "psubw %%mm3, %%mm1 \n"
1272
        "psubw %%mm2, %%mm0 \n"
1273
        "pmaddwd %%mm1, %%mm1 \n"
1274
        "pmaddwd %%mm0, %%mm0 \n"
1275
        "paddd %%mm1, %%mm4 \n"
1276
        "paddd %%mm0, %%mm4 \n"
1277
        "jg 1b \n"
1278
        "movq %%mm4, %%mm3 \n"
1279
        "psrlq $32, %%mm3 \n"
1280
        "paddd %%mm3, %%mm4 \n"
1281
        "movd %%mm4, %1 \n"
1282
        :"+r"(i), "=r"(sum)
1283
        :"r"(pix1), "r"(pix2)
1284
    );
1285
    return sum;
1286
}
1287

    
1288
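/* Multiply/round helpers for the try_8x8basis/add_8x8basis (QNS) functions;
 * dsputil_mmx_qns_template.c is included three times below to generate the
 * MMX, 3DNow! and SSSE3 versions. */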
#define PHADDD(a, t)\
    "movq "#a", "#t"                  \n\t"\
    "psrlq $32, "#a"                  \n\t"\
    "paddd "#t", "#a"                 \n\t"
/*
   pmulhw: dst[0-15]=(src[0-15]*dst[0-15])[16-31]
   pmulhrw: dst[0-15]=(src[0-15]*dst[0-15] + 0x8000)[16-31]
   pmulhrsw: dst[0-15]=(src[0-15]*dst[0-15] + 0x4000)[15-30]
 */
#define PMULHRW(x, y, s, o)\
    "pmulhw " #s ", "#x "            \n\t"\
    "pmulhw " #s ", "#y "            \n\t"\
    "paddw " #o ", "#x "             \n\t"\
    "paddw " #o ", "#y "             \n\t"\
    "psraw $1, "#x "                 \n\t"\
    "psraw $1, "#y "                 \n\t"
#define DEF(x) x ## _mmx
#define SET_RND MOVQ_WONE
#define SCALE_OFFSET 1

#include "dsputil_mmx_qns_template.c"

#undef DEF
#undef SET_RND
#undef SCALE_OFFSET
#undef PMULHRW

#define DEF(x) x ## _3dnow
#define SET_RND(x)
#define SCALE_OFFSET 0
#define PMULHRW(x, y, s, o)\
    "pmulhrw " #s ", "#x "           \n\t"\
    "pmulhrw " #s ", "#y "           \n\t"

#include "dsputil_mmx_qns_template.c"

#undef DEF
#undef SET_RND
#undef SCALE_OFFSET
#undef PMULHRW

#if HAVE_SSSE3
#undef PHADDD
#define DEF(x) x ## _ssse3
#define SET_RND(x)
#define SCALE_OFFSET -1
#define PHADDD(a, t)\
    "pshufw $0x0E, "#a", "#t"         \n\t"\
    "paddd "#t", "#a"                 \n\t" /* faster than phaddd on core2 */
#define PMULHRW(x, y, s, o)\
    "pmulhrsw " #s ", "#x "          \n\t"\
    "pmulhrsw " #s ", "#y "          \n\t"

#include "dsputil_mmx_qns_template.c"

#undef DEF
#undef SET_RND
#undef SCALE_OFFSET
#undef PMULHRW
#undef PHADDD
#endif //HAVE_SSSE3


1352
{
1353
    if (mm_flags & FF_MM_MMX) {
1354
        const int dct_algo = avctx->dct_algo;
1355
        if(dct_algo==FF_DCT_AUTO || dct_algo==FF_DCT_MMX){
1356
            if(mm_flags & FF_MM_SSE2){
1357
                c->fdct = ff_fdct_sse2;
1358
            }else if(mm_flags & FF_MM_MMX2){
1359
                c->fdct = ff_fdct_mmx2;
1360
            }else{
1361
                c->fdct = ff_fdct_mmx;
1362
            }
1363
        }
1364

    
1365
        c->get_pixels = get_pixels_mmx;
1366
        c->diff_pixels = diff_pixels_mmx;
1367
        c->pix_sum = pix_sum16_mmx;
1368

    
1369
        c->diff_bytes= diff_bytes_mmx;
1370
        c->sum_abs_dctelem= sum_abs_dctelem_mmx;
1371

    
1372
        c->hadamard8_diff[0]= hadamard8_diff16_mmx;
1373
        c->hadamard8_diff[1]= hadamard8_diff_mmx;
1374

    
1375
        c->pix_norm1 = pix_norm1_mmx;
1376
        c->sse[0] = (mm_flags & FF_MM_SSE2) ? sse16_sse2 : sse16_mmx;
1377
          c->sse[1] = sse8_mmx;
1378
        c->vsad[4]= vsad_intra16_mmx;
1379

    
1380
        c->nsse[0] = nsse16_mmx;
1381
        c->nsse[1] = nsse8_mmx;
1382
        if(!(avctx->flags & CODEC_FLAG_BITEXACT)){
1383
            c->vsad[0] = vsad16_mmx;
1384
        }
1385

    
1386
        if(!(avctx->flags & CODEC_FLAG_BITEXACT)){
1387
            c->try_8x8basis= try_8x8basis_mmx;
1388
        }
1389
        c->add_8x8basis= add_8x8basis_mmx;
1390

    
1391
        c->ssd_int8_vs_int16 = ssd_int8_vs_int16_mmx;
1392

    
1393

    
1394
        if (mm_flags & FF_MM_MMX2) {
1395
            c->sum_abs_dctelem= sum_abs_dctelem_mmx2;
1396
            c->hadamard8_diff[0]= hadamard8_diff16_mmx2;
1397
            c->hadamard8_diff[1]= hadamard8_diff_mmx2;
1398
            c->vsad[4]= vsad_intra16_mmx2;
1399

    
1400
            if(!(avctx->flags & CODEC_FLAG_BITEXACT)){
1401
                c->vsad[0] = vsad16_mmx2;
1402
            }
1403

    
1404
            c->sub_hfyu_median_prediction= sub_hfyu_median_prediction_mmx2;
1405
        }
1406

    
1407
        if(mm_flags & FF_MM_SSE2){
1408
            c->get_pixels = get_pixels_sse2;
1409
            c->sum_abs_dctelem= sum_abs_dctelem_sse2;
1410
            c->hadamard8_diff[0]= hadamard8_diff16_sse2;
1411
            c->hadamard8_diff[1]= hadamard8_diff_sse2;
1412
#if CONFIG_LPC
1413
            c->lpc_compute_autocorr = ff_lpc_compute_autocorr_sse2;
1414
#endif
1415
        }
1416

    
1417
#if HAVE_SSSE3
1418
        if(mm_flags & FF_MM_SSSE3){
1419
            if(!(avctx->flags & CODEC_FLAG_BITEXACT)){
1420
                c->try_8x8basis= try_8x8basis_ssse3;
1421
            }
1422
            c->add_8x8basis= add_8x8basis_ssse3;
1423
            c->sum_abs_dctelem= sum_abs_dctelem_ssse3;
1424
            c->hadamard8_diff[0]= hadamard8_diff16_ssse3;
1425
            c->hadamard8_diff[1]= hadamard8_diff_ssse3;
1426
        }
1427
#endif
1428

    
1429
        if(mm_flags & FF_MM_3DNOW){
1430
            if(!(avctx->flags & CODEC_FLAG_BITEXACT)){
1431
                c->try_8x8basis= try_8x8basis_3dnow;
1432
            }
1433
            c->add_8x8basis= add_8x8basis_3dnow;
1434
        }
1435
    }
1436

    
1437
    dsputil_init_pix_mmx(c, avctx);
1438
}