Statistics
| Branch: | Revision:

ffmpeg / libavcodec / i386 / dsputilenc_mmx.c @ 40d0e665

History | View | Annotate | Download (41.9 KB)

1
/*
2
 * MMX optimized DSP utils
3
 * Copyright (c) 2000, 2001 Fabrice Bellard.
4
 * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
5
 *
6
 * This file is part of FFmpeg.
7
 *
8
 * FFmpeg is free software; you can redistribute it and/or
9
 * modify it under the terms of the GNU Lesser General Public
10
 * License as published by the Free Software Foundation; either
11
 * version 2.1 of the License, or (at your option) any later version.
12
 *
13
 * FFmpeg is distributed in the hope that it will be useful,
14
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
16
 * Lesser General Public License for more details.
17
 *
18
 * You should have received a copy of the GNU Lesser General Public
19
 * License along with FFmpeg; if not, write to the Free Software
20
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
21
 *
22
 * MMX optimization by Nick Kurshev <nickols_k@mail.ru>
23
 */
24

    
25
#include "dsputil.h"
26
#include "dsputil_mmx.h"
27
#include "mpegvideo.h"
28
#include "x86_cpu.h"
29

    
30

    
31
/* Convert one 8x8 block of unsigned bytes (rows line_size apart) into 64
 * 16-bit DCTELEMs stored contiguously in block[]. Two source rows (32 output
 * bytes) are produced per loop iteration. */
static void get_pixels_mmx(DCTELEM *block, const uint8_t *pixels, int line_size)
{
    asm volatile(
        "mov $-128, %%"REG_a"           \n\t" /* index -128..0; %1 is block+64, so writes span the 128-byte block */
        "pxor %%mm7, %%mm7              \n\t" /* mm7 = 0, used to zero-extend bytes to words */
        ASMALIGN(4)
        "1:                             \n\t"
        "movq (%0), %%mm0               \n\t" /* row 0, pixels 0-7 */
        "movq (%0, %2), %%mm2           \n\t" /* row 1, pixels 0-7 */
        "movq %%mm0, %%mm1              \n\t"
        "movq %%mm2, %%mm3              \n\t"
        "punpcklbw %%mm7, %%mm0         \n\t" /* low 4 bytes -> 4 words */
        "punpckhbw %%mm7, %%mm1         \n\t" /* high 4 bytes -> 4 words */
        "punpcklbw %%mm7, %%mm2         \n\t"
        "punpckhbw %%mm7, %%mm3         \n\t"
        "movq %%mm0, (%1, %%"REG_a")    \n\t"
        "movq %%mm1, 8(%1, %%"REG_a")   \n\t"
        "movq %%mm2, 16(%1, %%"REG_a")  \n\t"
        "movq %%mm3, 24(%1, %%"REG_a")  \n\t"
        "add %3, %0                     \n\t" /* pixels += 2*line_size */
        "add $32, %%"REG_a"             \n\t"
        "js 1b                          \n\t" /* loop while index < 0 */
        : "+r" (pixels)
        : "r" (block+64), "r" ((x86_reg)line_size), "r" ((x86_reg)line_size*2)
        : "%"REG_a
    );
}
58

    
59
/* Compute the element-wise difference s1 - s2 of two 8x8 byte blocks
 * (rows stride apart) and store it as 64 16-bit DCTELEMs in block[].
 * One row (16 output bytes) is produced per loop iteration. */
static inline void diff_pixels_mmx(DCTELEM *block, const uint8_t *s1, const uint8_t *s2, int stride)
{
    asm volatile(
        "pxor %%mm7, %%mm7              \n\t" /* mm7 = 0 for byte->word unpack */
        "mov $-128, %%"REG_a"           \n\t" /* index -128..0; %2 is block+64 */
        ASMALIGN(4)
        "1:                             \n\t"
        "movq (%0), %%mm0               \n\t" /* s1 row */
        "movq (%1), %%mm2               \n\t" /* s2 row */
        "movq %%mm0, %%mm1              \n\t"
        "movq %%mm2, %%mm3              \n\t"
        "punpcklbw %%mm7, %%mm0         \n\t"
        "punpckhbw %%mm7, %%mm1         \n\t"
        "punpcklbw %%mm7, %%mm2         \n\t"
        "punpckhbw %%mm7, %%mm3         \n\t"
        "psubw %%mm2, %%mm0             \n\t" /* s1 - s2, low words */
        "psubw %%mm3, %%mm1             \n\t" /* s1 - s2, high words */
        "movq %%mm0, (%2, %%"REG_a")    \n\t"
        "movq %%mm1, 8(%2, %%"REG_a")   \n\t"
        "add %3, %0                     \n\t"
        "add %3, %1                     \n\t"
        "add $16, %%"REG_a"             \n\t"
        "jnz 1b                         \n\t"
        : "+r" (s1), "+r" (s2)
        : "r" (block+64), "r" ((x86_reg)stride)
        : "%"REG_a
    );
}
87

    
88
/* Sum all 256 pixels of a 16x16 byte block (rows line_size apart).
 * Accumulates 16-bit partial sums in mm6, then reduces the four words
 * horizontally; the final "andl $0xFFFF" keeps only the low word. */
static int pix_sum16_mmx(uint8_t * pix, int line_size){
    const int h=16;
    int sum;
    x86_reg index= -line_size*h;  /* negative row index counting up to 0 */

    asm volatile(
                "pxor %%mm7, %%mm7              \n\t" /* mm7 = 0 for unpack */
                "pxor %%mm6, %%mm6              \n\t" /* mm6 = word accumulator */
                "1:                             \n\t"
                "movq (%2, %1), %%mm0           \n\t" /* pixels 0-7 (two copies: */
                "movq (%2, %1), %%mm1           \n\t" /*  low/high unpacked below) */
                "movq 8(%2, %1), %%mm2          \n\t" /* pixels 8-15 */
                "movq 8(%2, %1), %%mm3          \n\t"
                "punpcklbw %%mm7, %%mm0         \n\t"
                "punpckhbw %%mm7, %%mm1         \n\t"
                "punpcklbw %%mm7, %%mm2         \n\t"
                "punpckhbw %%mm7, %%mm3         \n\t"
                "paddw %%mm0, %%mm1             \n\t"
                "paddw %%mm2, %%mm3             \n\t"
                "paddw %%mm1, %%mm3             \n\t"
                "paddw %%mm3, %%mm6             \n\t"
                "add %3, %1                     \n\t" /* next row; loop while index < 0 */
                " js 1b                         \n\t"
                "movq %%mm6, %%mm5              \n\t" /* horizontal reduce: 4 words -> 1 */
                "psrlq $32, %%mm6               \n\t"
                "paddw %%mm5, %%mm6             \n\t"
                "movq %%mm6, %%mm5              \n\t"
                "psrlq $16, %%mm6               \n\t"
                "paddw %%mm5, %%mm6             \n\t"
                "movd %%mm6, %0                 \n\t"
                "andl $0xFFFF, %0               \n\t" /* keep low 16 bits of the sum */
                : "=&r" (sum), "+r" (index)
                : "r" (pix - index), "r" ((x86_reg)line_size)
        );

        return sum;
}
125

    
126
/* Sum of squares of all pixels of a 16x16 byte block (rows line_size apart):
 * returns sum(pix[i]^2). Uses pmaddwd to square-and-pair words, accumulating
 * 32-bit partial sums in mm7. */
static int pix_norm1_mmx(uint8_t *pix, int line_size) {
    int tmp;
  asm volatile (
      "movl $16,%%ecx\n"        /* 16 rows */
      "pxor %%mm0,%%mm0\n"      /* mm0 = 0 for unpack */
      "pxor %%mm7,%%mm7\n"      /* mm7 = dword accumulator */
      "1:\n"
      "movq (%0),%%mm2\n"       /* mm2 = pix[0-7] */
      "movq 8(%0),%%mm3\n"      /* mm3 = pix[8-15] */

      "movq %%mm2,%%mm1\n"      /* mm1 = mm2 = pix[0-7] */

      "punpckhbw %%mm0,%%mm1\n" /* mm1 = [pix4-7] */
      "punpcklbw %%mm0,%%mm2\n" /* mm2 = [pix0-3] */

      "movq %%mm3,%%mm4\n"      /* mm4 = mm3 = pix[8-15] */
      "punpckhbw %%mm0,%%mm3\n" /* mm3 = [pix12-15] */
      "punpcklbw %%mm0,%%mm4\n" /* mm4 = [pix8-11] */

      "pmaddwd %%mm1,%%mm1\n"   /* mm1 = (pix4^2+pix5^2,pix6^2+pix7^2) */
      "pmaddwd %%mm2,%%mm2\n"   /* mm2 = (pix0^2+pix1^2,pix2^2+pix3^2) */

      "pmaddwd %%mm3,%%mm3\n"
      "pmaddwd %%mm4,%%mm4\n"

      "paddd %%mm1,%%mm2\n"     /* mm2 = (pix0^2+pix1^2+pix4^2+pix5^2,
                                          pix2^2+pix3^2+pix6^2+pix7^2) */
      "paddd %%mm3,%%mm4\n"
      "paddd %%mm2,%%mm7\n"

      "add %2, %0\n"            /* next row */
      "paddd %%mm4,%%mm7\n"
      "dec %%ecx\n"
      "jnz 1b\n"

      "movq %%mm7,%%mm1\n"
      "psrlq $32, %%mm7\n"      /* shift hi dword to lo */
      "paddd %%mm7,%%mm1\n"
      "movd %%mm1,%1\n"
      : "+r" (pix), "=r"(tmp) : "r" ((x86_reg)line_size) : "%ecx" );
    return tmp;
}
168

    
169
/* Sum of squared differences of two 8-pixel-wide blocks of height h
 * (rows line_size apart). Processes two rows per iteration, so h is
 * assumed even (ecx = h/2). The void *v context parameter is unused. */
static int sse8_mmx(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {
    int tmp;
  asm volatile (
      "movl %4,%%ecx\n"
      "shr $1,%%ecx\n"          /* two rows per iteration */
      "pxor %%mm0,%%mm0\n"      /* mm0 = 0 */
      "pxor %%mm7,%%mm7\n"      /* mm7 holds the sum */
      "1:\n"
      "movq (%0),%%mm1\n"       /* mm1 = pix1[0][0-7] */
      "movq (%1),%%mm2\n"       /* mm2 = pix2[0][0-7] */
      "movq (%0,%3),%%mm3\n"    /* mm3 = pix1[1][0-7] */
      "movq (%1,%3),%%mm4\n"    /* mm4 = pix2[1][0-7] */

      /* todo: mm1-mm2, mm3-mm4 */
      /* algo: subtract mm1 from mm2 with saturation and vice versa */
      /*       OR the results to get absolute difference */
      "movq %%mm1,%%mm5\n"
      "movq %%mm3,%%mm6\n"
      "psubusb %%mm2,%%mm1\n"
      "psubusb %%mm4,%%mm3\n"
      "psubusb %%mm5,%%mm2\n"
      "psubusb %%mm6,%%mm4\n"

      "por %%mm1,%%mm2\n"
      "por %%mm3,%%mm4\n"

      /* now convert to 16-bit vectors so we can square them */
      "movq %%mm2,%%mm1\n"
      "movq %%mm4,%%mm3\n"

      "punpckhbw %%mm0,%%mm2\n"
      "punpckhbw %%mm0,%%mm4\n"
      "punpcklbw %%mm0,%%mm1\n" /* mm1 now spread over (mm1,mm2) */
      "punpcklbw %%mm0,%%mm3\n" /* mm4 now spread over (mm3,mm4) */

      "pmaddwd %%mm2,%%mm2\n"   /* square and pair-sum the differences */
      "pmaddwd %%mm4,%%mm4\n"
      "pmaddwd %%mm1,%%mm1\n"
      "pmaddwd %%mm3,%%mm3\n"

      "lea (%0,%3,2), %0\n"     /* pix1 += 2*line_size */
      "lea (%1,%3,2), %1\n"     /* pix2 += 2*line_size */

      "paddd %%mm2,%%mm1\n"
      "paddd %%mm4,%%mm3\n"
      "paddd %%mm1,%%mm7\n"
      "paddd %%mm3,%%mm7\n"

      "decl %%ecx\n"
      "jnz 1b\n"

      "movq %%mm7,%%mm1\n"
      "psrlq $32, %%mm7\n"      /* shift hi dword to lo */
      "paddd %%mm7,%%mm1\n"
      "movd %%mm1,%2\n"
      : "+r" (pix1), "+r" (pix2), "=r"(tmp)
      : "r" ((x86_reg)line_size) , "m" (h)
      : "%ecx");
    return tmp;
}
229

    
230
/* Sum of squared differences of two 16-pixel-wide blocks of height h
 * (rows line_size apart). One row of 16 pixels per iteration.
 * The void *v context parameter is unused. */
static int sse16_mmx(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {
    int tmp;
  asm volatile (
      "movl %4,%%ecx\n"         /* row counter */
      "pxor %%mm0,%%mm0\n"      /* mm0 = 0 */
      "pxor %%mm7,%%mm7\n"      /* mm7 holds the sum */
      "1:\n"
      "movq (%0),%%mm1\n"       /* mm1 = pix1[0-7] */
      "movq (%1),%%mm2\n"       /* mm2 = pix2[0-7] */
      "movq 8(%0),%%mm3\n"      /* mm3 = pix1[8-15] */
      "movq 8(%1),%%mm4\n"      /* mm4 = pix2[8-15] */

      /* todo: mm1-mm2, mm3-mm4 */
      /* algo: subtract mm1 from mm2 with saturation and vice versa */
      /*       OR the results to get absolute difference */
      "movq %%mm1,%%mm5\n"
      "movq %%mm3,%%mm6\n"
      "psubusb %%mm2,%%mm1\n"
      "psubusb %%mm4,%%mm3\n"
      "psubusb %%mm5,%%mm2\n"
      "psubusb %%mm6,%%mm4\n"

      "por %%mm1,%%mm2\n"
      "por %%mm3,%%mm4\n"

      /* now convert to 16-bit vectors so we can square them */
      "movq %%mm2,%%mm1\n"
      "movq %%mm4,%%mm3\n"

      "punpckhbw %%mm0,%%mm2\n"
      "punpckhbw %%mm0,%%mm4\n"
      "punpcklbw %%mm0,%%mm1\n" /* mm1 now spread over (mm1,mm2) */
      "punpcklbw %%mm0,%%mm3\n" /* mm4 now spread over (mm3,mm4) */

      "pmaddwd %%mm2,%%mm2\n"   /* square and pair-sum the differences */
      "pmaddwd %%mm4,%%mm4\n"
      "pmaddwd %%mm1,%%mm1\n"
      "pmaddwd %%mm3,%%mm3\n"

      "add %3,%0\n"             /* next row */
      "add %3,%1\n"

      "paddd %%mm2,%%mm1\n"
      "paddd %%mm4,%%mm3\n"
      "paddd %%mm1,%%mm7\n"
      "paddd %%mm3,%%mm7\n"

      "decl %%ecx\n"
      "jnz 1b\n"

      "movq %%mm7,%%mm1\n"
      "psrlq $32, %%mm7\n"      /* shift hi dword to lo */
      "paddd %%mm7,%%mm1\n"
      "movd %%mm1,%2\n"
      : "+r" (pix1), "+r" (pix2), "=r"(tmp)
      : "r" ((x86_reg)line_size) , "m" (h)
      : "%ecx");
    return tmp;
}
289

    
290
/* SSE2 version of the 16-wide sum of squared differences. Processes two
 * rows (two unaligned 16-byte loads per block) per iteration; h is assumed
 * even since it is halved into the loop counter. The void *v parameter is
 * unused. NOTE(review): "+r"(h) means h is clobbered inside the asm, which
 * is fine because it is a by-value parameter. */
static int sse16_sse2(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {
    int tmp;
  asm volatile (
      "shr $1,%2\n"             /* two rows per iteration */
      "pxor %%xmm0,%%xmm0\n"    /* mm0 = 0 */
      "pxor %%xmm7,%%xmm7\n"    /* mm7 holds the sum */
      "1:\n"
      "movdqu (%0),%%xmm1\n"    /* mm1 = pix1[0][0-15] */
      "movdqu (%1),%%xmm2\n"    /* mm2 = pix2[0][0-15] */
      "movdqu (%0,%4),%%xmm3\n" /* mm3 = pix1[1][0-15] */
      "movdqu (%1,%4),%%xmm4\n" /* mm4 = pix2[1][0-15] */

      /* todo: mm1-mm2, mm3-mm4 */
      /* algo: subtract mm1 from mm2 with saturation and vice versa */
      /*       OR the results to get absolute difference */
      "movdqa %%xmm1,%%xmm5\n"
      "movdqa %%xmm3,%%xmm6\n"
      "psubusb %%xmm2,%%xmm1\n"
      "psubusb %%xmm4,%%xmm3\n"
      "psubusb %%xmm5,%%xmm2\n"
      "psubusb %%xmm6,%%xmm4\n"

      "por %%xmm1,%%xmm2\n"
      "por %%xmm3,%%xmm4\n"

      /* now convert to 16-bit vectors so we can square them */
      "movdqa %%xmm2,%%xmm1\n"
      "movdqa %%xmm4,%%xmm3\n"

      "punpckhbw %%xmm0,%%xmm2\n"
      "punpckhbw %%xmm0,%%xmm4\n"
      "punpcklbw %%xmm0,%%xmm1\n"  /* mm1 now spread over (mm1,mm2) */
      "punpcklbw %%xmm0,%%xmm3\n"  /* mm4 now spread over (mm3,mm4) */

      "pmaddwd %%xmm2,%%xmm2\n"    /* square and pair-sum the differences */
      "pmaddwd %%xmm4,%%xmm4\n"
      "pmaddwd %%xmm1,%%xmm1\n"
      "pmaddwd %%xmm3,%%xmm3\n"

      "lea (%0,%4,2), %0\n"        /* pix1 += 2*line_size */
      "lea (%1,%4,2), %1\n"        /* pix2 += 2*line_size */

      "paddd %%xmm2,%%xmm1\n"
      "paddd %%xmm4,%%xmm3\n"
      "paddd %%xmm1,%%xmm7\n"
      "paddd %%xmm3,%%xmm7\n"

      "decl %2\n"
      "jnz 1b\n"

      "movdqa %%xmm7,%%xmm1\n"
      "psrldq $8, %%xmm7\n"        /* shift hi qword to lo */
      "paddd %%xmm1,%%xmm7\n"
      "movdqa %%xmm7,%%xmm1\n"
      "psrldq $4, %%xmm7\n"        /* shift hi dword to lo */
      "paddd %%xmm1,%%xmm7\n"
      "movd %%xmm7,%3\n"
      : "+r" (pix1), "+r" (pix2), "+r"(h), "=r"(tmp)
      : "r" ((x86_reg)line_size));
    return tmp;
}
351

    
352
/* High-frequency noise measure of an 8-wide block: sums, over the block,
 * the absolute value of the second difference (horizontal gradient change
 * between adjacent rows). The prologue handles the first two rows; the
 * main loop then processes two rows per iteration for h-2 rows.
 * Absolute value is formed with the pcmpgtw/pxor/psubw sign trick. */
static int hf_noise8_mmx(uint8_t * pix1, int line_size, int h) {
    int tmp;
  asm volatile (
      "movl %3,%%ecx\n"         /* ecx = h-2 (see input operand) */
      "pxor %%mm7,%%mm7\n"      /* mm7 = 0 for unpack */
      "pxor %%mm6,%%mm6\n"      /* mm6 = word accumulator */

      /* prologue row 0: horizontal differences into mm0/mm2 */
      "movq (%0),%%mm0\n"
      "movq %%mm0, %%mm1\n"
      "psllq $8, %%mm0\n"       /* align pixel i against pixel i+1 */
      "psrlq $8, %%mm1\n"
      "psrlq $8, %%mm0\n"
      "movq %%mm0, %%mm2\n"
      "movq %%mm1, %%mm3\n"
      "punpcklbw %%mm7,%%mm0\n"
      "punpcklbw %%mm7,%%mm1\n"
      "punpckhbw %%mm7,%%mm2\n"
      "punpckhbw %%mm7,%%mm3\n"
      "psubw %%mm1, %%mm0\n"    /* first-order horizontal differences */
      "psubw %%mm3, %%mm2\n"

      "add %2,%0\n"

      /* prologue row 1: same, then second difference vs row 0 */
      "movq (%0),%%mm4\n"
      "movq %%mm4, %%mm1\n"
      "psllq $8, %%mm4\n"
      "psrlq $8, %%mm1\n"
      "psrlq $8, %%mm4\n"
      "movq %%mm4, %%mm5\n"
      "movq %%mm1, %%mm3\n"
      "punpcklbw %%mm7,%%mm4\n"
      "punpcklbw %%mm7,%%mm1\n"
      "punpckhbw %%mm7,%%mm5\n"
      "punpckhbw %%mm7,%%mm3\n"
      "psubw %%mm1, %%mm4\n"
      "psubw %%mm3, %%mm5\n"
      "psubw %%mm4, %%mm0\n"    /* difference of row gradients */
      "psubw %%mm5, %%mm2\n"
      "pxor %%mm3, %%mm3\n"
      "pxor %%mm1, %%mm1\n"
      "pcmpgtw %%mm0, %%mm3\n\t" /* abs(): mask of negative words, */
      "pcmpgtw %%mm2, %%mm1\n\t" /* then xor + subtract the mask    */
      "pxor %%mm3, %%mm0\n"
      "pxor %%mm1, %%mm2\n"
      "psubw %%mm3, %%mm0\n"
      "psubw %%mm1, %%mm2\n"
      "paddw %%mm0, %%mm2\n"
      "paddw %%mm2, %%mm6\n"    /* accumulate */

      "add %2,%0\n"
      "1:\n"

      /* even row of the pair: new gradients vs previous (mm4/mm5) */
      "movq (%0),%%mm0\n"
      "movq %%mm0, %%mm1\n"
      "psllq $8, %%mm0\n"
      "psrlq $8, %%mm1\n"
      "psrlq $8, %%mm0\n"
      "movq %%mm0, %%mm2\n"
      "movq %%mm1, %%mm3\n"
      "punpcklbw %%mm7,%%mm0\n"
      "punpcklbw %%mm7,%%mm1\n"
      "punpckhbw %%mm7,%%mm2\n"
      "punpckhbw %%mm7,%%mm3\n"
      "psubw %%mm1, %%mm0\n"
      "psubw %%mm3, %%mm2\n"
      "psubw %%mm0, %%mm4\n"
      "psubw %%mm2, %%mm5\n"
      "pxor %%mm3, %%mm3\n"
      "pxor %%mm1, %%mm1\n"
      "pcmpgtw %%mm4, %%mm3\n\t"
      "pcmpgtw %%mm5, %%mm1\n\t"
      "pxor %%mm3, %%mm4\n"
      "pxor %%mm1, %%mm5\n"
      "psubw %%mm3, %%mm4\n"
      "psubw %%mm1, %%mm5\n"
      "paddw %%mm4, %%mm5\n"
      "paddw %%mm5, %%mm6\n"

      "add %2,%0\n"

      /* odd row of the pair: gradients vs previous (mm0/mm2) */
      "movq (%0),%%mm4\n"
      "movq %%mm4, %%mm1\n"
      "psllq $8, %%mm4\n"
      "psrlq $8, %%mm1\n"
      "psrlq $8, %%mm4\n"
      "movq %%mm4, %%mm5\n"
      "movq %%mm1, %%mm3\n"
      "punpcklbw %%mm7,%%mm4\n"
      "punpcklbw %%mm7,%%mm1\n"
      "punpckhbw %%mm7,%%mm5\n"
      "punpckhbw %%mm7,%%mm3\n"
      "psubw %%mm1, %%mm4\n"
      "psubw %%mm3, %%mm5\n"
      "psubw %%mm4, %%mm0\n"
      "psubw %%mm5, %%mm2\n"
      "pxor %%mm3, %%mm3\n"
      "pxor %%mm1, %%mm1\n"
      "pcmpgtw %%mm0, %%mm3\n\t"
      "pcmpgtw %%mm2, %%mm1\n\t"
      "pxor %%mm3, %%mm0\n"
      "pxor %%mm1, %%mm2\n"
      "psubw %%mm3, %%mm0\n"
      "psubw %%mm1, %%mm2\n"
      "paddw %%mm0, %%mm2\n"
      "paddw %%mm2, %%mm6\n"

      "add %2,%0\n"
      "subl $2, %%ecx\n"
      " jnz 1b\n"

      /* widen word accumulator to dwords and reduce horizontally */
      "movq %%mm6, %%mm0\n"
      "punpcklwd %%mm7,%%mm0\n"
      "punpckhwd %%mm7,%%mm6\n"
      "paddd %%mm0, %%mm6\n"

      "movq %%mm6,%%mm0\n"
      "psrlq $32, %%mm6\n"
      "paddd %%mm6,%%mm0\n"
      "movd %%mm0,%1\n"
      : "+r" (pix1), "=r"(tmp)
      : "r" ((x86_reg)line_size) , "g" (h-2)
      : "%ecx");
      return tmp;
}
476

    
477
/* High-frequency noise measure of a 16-wide block. Same structure as
 * hf_noise8_mmx, but horizontal differences use an unaligned load at
 * offset 1 instead of shifts (so all 8 lanes are valid), and the right
 * half of the block is handled by a tail call to hf_noise8_mmx(pix+8). */
static int hf_noise16_mmx(uint8_t * pix1, int line_size, int h) {
    int tmp;
    uint8_t * pix= pix1;  /* saved: pix1 is advanced by the asm */
  asm volatile (
      "movl %3,%%ecx\n"         /* ecx = h-2 */
      "pxor %%mm7,%%mm7\n"      /* mm7 = 0 for unpack */
      "pxor %%mm6,%%mm6\n"      /* mm6 = word accumulator */

      /* prologue row 0: horizontal differences pix[i] - pix[i+1] */
      "movq (%0),%%mm0\n"
      "movq 1(%0),%%mm1\n"
      "movq %%mm0, %%mm2\n"
      "movq %%mm1, %%mm3\n"
      "punpcklbw %%mm7,%%mm0\n"
      "punpcklbw %%mm7,%%mm1\n"
      "punpckhbw %%mm7,%%mm2\n"
      "punpckhbw %%mm7,%%mm3\n"
      "psubw %%mm1, %%mm0\n"
      "psubw %%mm3, %%mm2\n"

      "add %2,%0\n"

      /* prologue row 1: second difference vs row 0, abs, accumulate */
      "movq (%0),%%mm4\n"
      "movq 1(%0),%%mm1\n"
      "movq %%mm4, %%mm5\n"
      "movq %%mm1, %%mm3\n"
      "punpcklbw %%mm7,%%mm4\n"
      "punpcklbw %%mm7,%%mm1\n"
      "punpckhbw %%mm7,%%mm5\n"
      "punpckhbw %%mm7,%%mm3\n"
      "psubw %%mm1, %%mm4\n"
      "psubw %%mm3, %%mm5\n"
      "psubw %%mm4, %%mm0\n"
      "psubw %%mm5, %%mm2\n"
      "pxor %%mm3, %%mm3\n"
      "pxor %%mm1, %%mm1\n"
      "pcmpgtw %%mm0, %%mm3\n\t" /* abs() via sign-mask xor/subtract */
      "pcmpgtw %%mm2, %%mm1\n\t"
      "pxor %%mm3, %%mm0\n"
      "pxor %%mm1, %%mm2\n"
      "psubw %%mm3, %%mm0\n"
      "psubw %%mm1, %%mm2\n"
      "paddw %%mm0, %%mm2\n"
      "paddw %%mm2, %%mm6\n"

      "add %2,%0\n"
      "1:\n"

      /* even row of the pair */
      "movq (%0),%%mm0\n"
      "movq 1(%0),%%mm1\n"
      "movq %%mm0, %%mm2\n"
      "movq %%mm1, %%mm3\n"
      "punpcklbw %%mm7,%%mm0\n"
      "punpcklbw %%mm7,%%mm1\n"
      "punpckhbw %%mm7,%%mm2\n"
      "punpckhbw %%mm7,%%mm3\n"
      "psubw %%mm1, %%mm0\n"
      "psubw %%mm3, %%mm2\n"
      "psubw %%mm0, %%mm4\n"
      "psubw %%mm2, %%mm5\n"
      "pxor %%mm3, %%mm3\n"
      "pxor %%mm1, %%mm1\n"
      "pcmpgtw %%mm4, %%mm3\n\t"
      "pcmpgtw %%mm5, %%mm1\n\t"
      "pxor %%mm3, %%mm4\n"
      "pxor %%mm1, %%mm5\n"
      "psubw %%mm3, %%mm4\n"
      "psubw %%mm1, %%mm5\n"
      "paddw %%mm4, %%mm5\n"
      "paddw %%mm5, %%mm6\n"

      "add %2,%0\n"

      /* odd row of the pair */
      "movq (%0),%%mm4\n"
      "movq 1(%0),%%mm1\n"
      "movq %%mm4, %%mm5\n"
      "movq %%mm1, %%mm3\n"
      "punpcklbw %%mm7,%%mm4\n"
      "punpcklbw %%mm7,%%mm1\n"
      "punpckhbw %%mm7,%%mm5\n"
      "punpckhbw %%mm7,%%mm3\n"
      "psubw %%mm1, %%mm4\n"
      "psubw %%mm3, %%mm5\n"
      "psubw %%mm4, %%mm0\n"
      "psubw %%mm5, %%mm2\n"
      "pxor %%mm3, %%mm3\n"
      "pxor %%mm1, %%mm1\n"
      "pcmpgtw %%mm0, %%mm3\n\t"
      "pcmpgtw %%mm2, %%mm1\n\t"
      "pxor %%mm3, %%mm0\n"
      "pxor %%mm1, %%mm2\n"
      "psubw %%mm3, %%mm0\n"
      "psubw %%mm1, %%mm2\n"
      "paddw %%mm0, %%mm2\n"
      "paddw %%mm2, %%mm6\n"

      "add %2,%0\n"
      "subl $2, %%ecx\n"
      " jnz 1b\n"

      /* widen word accumulator to dwords and reduce horizontally */
      "movq %%mm6, %%mm0\n"
      "punpcklwd %%mm7,%%mm0\n"
      "punpckhwd %%mm7,%%mm6\n"
      "paddd %%mm0, %%mm6\n"

      "movq %%mm6,%%mm0\n"
      "psrlq $32, %%mm6\n"
      "paddd %%mm6,%%mm0\n"
      "movd %%mm0,%1\n"
      : "+r" (pix1), "=r"(tmp)
      : "r" ((x86_reg)line_size) , "g" (h-2)
      : "%ecx");
      /* right 8 columns are measured by the 8-wide version */
      return tmp + hf_noise8_mmx(pix+8, line_size, h);
}
590

    
591
static int nsse16_mmx(void *p, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {
592
    MpegEncContext *c = p;
593
    int score1, score2;
594

    
595
    if(c) score1 = c->dsp.sse[0](c, pix1, pix2, line_size, h);
596
    else  score1 = sse16_mmx(c, pix1, pix2, line_size, h);
597
    score2= hf_noise16_mmx(pix1, line_size, h) - hf_noise16_mmx(pix2, line_size, h);
598

    
599
    if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight;
600
    else  return score1 + FFABS(score2)*8;
601
}
602

    
603
static int nsse8_mmx(void *p, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {
604
    MpegEncContext *c = p;
605
    int score1= sse8_mmx(c, pix1, pix2, line_size, h);
606
    int score2= hf_noise8_mmx(pix1, line_size, h) - hf_noise8_mmx(pix2, line_size, h);
607

    
608
    if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight;
609
    else  return score1 + FFABS(score2)*8;
610
}
611

    
612
static int vsad_intra16_mmx(void *v, uint8_t * pix, uint8_t * dummy, int line_size, int h) {
613
    int tmp;
614

    
615
    assert( (((int)pix) & 7) == 0);
616
    assert((line_size &7) ==0);
617

    
618
#define SUM(in0, in1, out0, out1) \
619
      "movq (%0), %%mm2\n"\
620
      "movq 8(%0), %%mm3\n"\
621
      "add %2,%0\n"\
622
      "movq %%mm2, " #out0 "\n"\
623
      "movq %%mm3, " #out1 "\n"\
624
      "psubusb " #in0 ", %%mm2\n"\
625
      "psubusb " #in1 ", %%mm3\n"\
626
      "psubusb " #out0 ", " #in0 "\n"\
627
      "psubusb " #out1 ", " #in1 "\n"\
628
      "por %%mm2, " #in0 "\n"\
629
      "por %%mm3, " #in1 "\n"\
630
      "movq " #in0 ", %%mm2\n"\
631
      "movq " #in1 ", %%mm3\n"\
632
      "punpcklbw %%mm7, " #in0 "\n"\
633
      "punpcklbw %%mm7, " #in1 "\n"\
634
      "punpckhbw %%mm7, %%mm2\n"\
635
      "punpckhbw %%mm7, %%mm3\n"\
636
      "paddw " #in1 ", " #in0 "\n"\
637
      "paddw %%mm3, %%mm2\n"\
638
      "paddw %%mm2, " #in0 "\n"\
639
      "paddw " #in0 ", %%mm6\n"
640

    
641

    
642
  asm volatile (
643
      "movl %3,%%ecx\n"
644
      "pxor %%mm6,%%mm6\n"
645
      "pxor %%mm7,%%mm7\n"
646
      "movq (%0),%%mm0\n"
647
      "movq 8(%0),%%mm1\n"
648
      "add %2,%0\n"
649
      "subl $2, %%ecx\n"
650
      SUM(%%mm0, %%mm1, %%mm4, %%mm5)
651
      "1:\n"
652

    
653
      SUM(%%mm4, %%mm5, %%mm0, %%mm1)
654

    
655
      SUM(%%mm0, %%mm1, %%mm4, %%mm5)
656

    
657
      "subl $2, %%ecx\n"
658
      "jnz 1b\n"
659

    
660
      "movq %%mm6,%%mm0\n"
661
      "psrlq $32, %%mm6\n"
662
      "paddw %%mm6,%%mm0\n"
663
      "movq %%mm0,%%mm6\n"
664
      "psrlq $16, %%mm0\n"
665
      "paddw %%mm6,%%mm0\n"
666
      "movd %%mm0,%1\n"
667
      : "+r" (pix), "=r"(tmp)
668
      : "r" ((x86_reg)line_size) , "m" (h)
669
      : "%ecx");
670
    return tmp & 0xFFFF;
671
}
672
#undef SUM
673

    
674
static int vsad_intra16_mmx2(void *v, uint8_t * pix, uint8_t * dummy, int line_size, int h) {
675
    int tmp;
676

    
677
    assert( (((int)pix) & 7) == 0);
678
    assert((line_size &7) ==0);
679

    
680
#define SUM(in0, in1, out0, out1) \
681
      "movq (%0), " #out0 "\n"\
682
      "movq 8(%0), " #out1 "\n"\
683
      "add %2,%0\n"\
684
      "psadbw " #out0 ", " #in0 "\n"\
685
      "psadbw " #out1 ", " #in1 "\n"\
686
      "paddw " #in1 ", " #in0 "\n"\
687
      "paddw " #in0 ", %%mm6\n"
688

    
689
  asm volatile (
690
      "movl %3,%%ecx\n"
691
      "pxor %%mm6,%%mm6\n"
692
      "pxor %%mm7,%%mm7\n"
693
      "movq (%0),%%mm0\n"
694
      "movq 8(%0),%%mm1\n"
695
      "add %2,%0\n"
696
      "subl $2, %%ecx\n"
697
      SUM(%%mm0, %%mm1, %%mm4, %%mm5)
698
      "1:\n"
699

    
700
      SUM(%%mm4, %%mm5, %%mm0, %%mm1)
701

    
702
      SUM(%%mm0, %%mm1, %%mm4, %%mm5)
703

    
704
      "subl $2, %%ecx\n"
705
      "jnz 1b\n"
706

    
707
      "movd %%mm6,%1\n"
708
      : "+r" (pix), "=r"(tmp)
709
      : "r" ((x86_reg)line_size) , "m" (h)
710
      : "%ecx");
711
    return tmp;
712
}
713
#undef SUM
714

    
715
static int vsad16_mmx(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {
716
    int tmp;
717

    
718
    assert( (((int)pix1) & 7) == 0);
719
    assert( (((int)pix2) & 7) == 0);
720
    assert((line_size &7) ==0);
721

    
722
#define SUM(in0, in1, out0, out1) \
723
      "movq (%0),%%mm2\n"\
724
      "movq (%1)," #out0 "\n"\
725
      "movq 8(%0),%%mm3\n"\
726
      "movq 8(%1)," #out1 "\n"\
727
      "add %3,%0\n"\
728
      "add %3,%1\n"\
729
      "psubb " #out0 ", %%mm2\n"\
730
      "psubb " #out1 ", %%mm3\n"\
731
      "pxor %%mm7, %%mm2\n"\
732
      "pxor %%mm7, %%mm3\n"\
733
      "movq %%mm2, " #out0 "\n"\
734
      "movq %%mm3, " #out1 "\n"\
735
      "psubusb " #in0 ", %%mm2\n"\
736
      "psubusb " #in1 ", %%mm3\n"\
737
      "psubusb " #out0 ", " #in0 "\n"\
738
      "psubusb " #out1 ", " #in1 "\n"\
739
      "por %%mm2, " #in0 "\n"\
740
      "por %%mm3, " #in1 "\n"\
741
      "movq " #in0 ", %%mm2\n"\
742
      "movq " #in1 ", %%mm3\n"\
743
      "punpcklbw %%mm7, " #in0 "\n"\
744
      "punpcklbw %%mm7, " #in1 "\n"\
745
      "punpckhbw %%mm7, %%mm2\n"\
746
      "punpckhbw %%mm7, %%mm3\n"\
747
      "paddw " #in1 ", " #in0 "\n"\
748
      "paddw %%mm3, %%mm2\n"\
749
      "paddw %%mm2, " #in0 "\n"\
750
      "paddw " #in0 ", %%mm6\n"
751

    
752

    
753
  asm volatile (
754
      "movl %4,%%ecx\n"
755
      "pxor %%mm6,%%mm6\n"
756
      "pcmpeqw %%mm7,%%mm7\n"
757
      "psllw $15, %%mm7\n"
758
      "packsswb %%mm7, %%mm7\n"
759
      "movq (%0),%%mm0\n"
760
      "movq (%1),%%mm2\n"
761
      "movq 8(%0),%%mm1\n"
762
      "movq 8(%1),%%mm3\n"
763
      "add %3,%0\n"
764
      "add %3,%1\n"
765
      "subl $2, %%ecx\n"
766
      "psubb %%mm2, %%mm0\n"
767
      "psubb %%mm3, %%mm1\n"
768
      "pxor %%mm7, %%mm0\n"
769
      "pxor %%mm7, %%mm1\n"
770
      SUM(%%mm0, %%mm1, %%mm4, %%mm5)
771
      "1:\n"
772

    
773
      SUM(%%mm4, %%mm5, %%mm0, %%mm1)
774

    
775
      SUM(%%mm0, %%mm1, %%mm4, %%mm5)
776

    
777
      "subl $2, %%ecx\n"
778
      "jnz 1b\n"
779

    
780
      "movq %%mm6,%%mm0\n"
781
      "psrlq $32, %%mm6\n"
782
      "paddw %%mm6,%%mm0\n"
783
      "movq %%mm0,%%mm6\n"
784
      "psrlq $16, %%mm0\n"
785
      "paddw %%mm6,%%mm0\n"
786
      "movd %%mm0,%2\n"
787
      : "+r" (pix1), "+r" (pix2), "=r"(tmp)
788
      : "r" ((x86_reg)line_size) , "m" (h)
789
      : "%ecx");
790
    return tmp & 0x7FFF;
791
}
792
#undef SUM
793

    
794
static int vsad16_mmx2(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {
795
    int tmp;
796

    
797
    assert( (((int)pix1) & 7) == 0);
798
    assert( (((int)pix2) & 7) == 0);
799
    assert((line_size &7) ==0);
800

    
801
#define SUM(in0, in1, out0, out1) \
802
      "movq (%0)," #out0 "\n"\
803
      "movq (%1),%%mm2\n"\
804
      "movq 8(%0)," #out1 "\n"\
805
      "movq 8(%1),%%mm3\n"\
806
      "add %3,%0\n"\
807
      "add %3,%1\n"\
808
      "psubb %%mm2, " #out0 "\n"\
809
      "psubb %%mm3, " #out1 "\n"\
810
      "pxor %%mm7, " #out0 "\n"\
811
      "pxor %%mm7, " #out1 "\n"\
812
      "psadbw " #out0 ", " #in0 "\n"\
813
      "psadbw " #out1 ", " #in1 "\n"\
814
      "paddw " #in1 ", " #in0 "\n"\
815
      "paddw " #in0 ", %%mm6\n"
816

    
817
  asm volatile (
818
      "movl %4,%%ecx\n"
819
      "pxor %%mm6,%%mm6\n"
820
      "pcmpeqw %%mm7,%%mm7\n"
821
      "psllw $15, %%mm7\n"
822
      "packsswb %%mm7, %%mm7\n"
823
      "movq (%0),%%mm0\n"
824
      "movq (%1),%%mm2\n"
825
      "movq 8(%0),%%mm1\n"
826
      "movq 8(%1),%%mm3\n"
827
      "add %3,%0\n"
828
      "add %3,%1\n"
829
      "subl $2, %%ecx\n"
830
      "psubb %%mm2, %%mm0\n"
831
      "psubb %%mm3, %%mm1\n"
832
      "pxor %%mm7, %%mm0\n"
833
      "pxor %%mm7, %%mm1\n"
834
      SUM(%%mm0, %%mm1, %%mm4, %%mm5)
835
      "1:\n"
836

    
837
      SUM(%%mm4, %%mm5, %%mm0, %%mm1)
838

    
839
      SUM(%%mm0, %%mm1, %%mm4, %%mm5)
840

    
841
      "subl $2, %%ecx\n"
842
      "jnz 1b\n"
843

    
844
      "movd %%mm6,%2\n"
845
      : "+r" (pix1), "+r" (pix2), "=r"(tmp)
846
      : "r" ((x86_reg)line_size) , "m" (h)
847
      : "%ecx");
848
    return tmp;
849
}
850
#undef SUM
851

    
852
/* dst[i] = src1[i] - src2[i] for i in [0, w). The MMX loop handles 16
 * bytes per iteration up to w-15; the scalar tail finishes the rest. */
static void diff_bytes_mmx(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
    x86_reg i=0;
    asm volatile(
        "1:                             \n\t"
        "movq  (%2, %0), %%mm0          \n\t" /* src2 */
        "movq  (%1, %0), %%mm1          \n\t" /* src1 */
        "psubb %%mm0, %%mm1             \n\t"
        "movq %%mm1, (%3, %0)           \n\t"
        "movq 8(%2, %0), %%mm0          \n\t"
        "movq 8(%1, %0), %%mm1          \n\t"
        "psubb %%mm0, %%mm1             \n\t"
        "movq %%mm1, 8(%3, %0)          \n\t"
        "add $16, %0                    \n\t"
        "cmp %4, %0                     \n\t" /* stop before overrunning w */
        " jb 1b                         \n\t"
        : "+r" (i)
        : "r"(src1), "r"(src2), "r"(dst), "r"((x86_reg)w-15)
    );
    /* scalar tail for the remaining (w & 15 or so) bytes */
    for(; i<w; i++)
        dst[i+0] = src1[i+0]-src2[i+0];
}
873

    
874
/* HuffYUV median-prediction residual: dst[i] = src2[i] - median(L, T,
 * L+T-LT) where T = src1[i], LT = src1[i-1], L = src2[i-1]. The MMX2 loop
 * (pmaxub/pminub) computes the median as clip(L+T-LT, min(L,T), max(L,T))
 * for 8 pixels at a time; dst[0] is redone in C with the caller-supplied
 * *left / *left_top seeds, which are updated for the next call. */
static void sub_hfyu_median_prediction_mmx2(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w, int *left, int *left_top){
    x86_reg i=0;
    uint8_t l, lt;

    asm volatile(
        "1:                             \n\t"
        "movq  -1(%1, %0), %%mm0        \n\t" // LT
        "movq  (%1, %0), %%mm1          \n\t" // T
        "movq  -1(%2, %0), %%mm2        \n\t" // L
        "movq  (%2, %0), %%mm3          \n\t" // X
        "movq %%mm2, %%mm4              \n\t" // L
        "psubb %%mm0, %%mm2             \n\t"
        "paddb %%mm1, %%mm2             \n\t" // L + T - LT
        "movq %%mm4, %%mm5              \n\t" // L
        "pmaxub %%mm1, %%mm4            \n\t" // max(T, L)
        "pminub %%mm5, %%mm1            \n\t" // min(T, L)
        "pminub %%mm2, %%mm4            \n\t" // clip L+T-LT into [min,max]
        "pmaxub %%mm1, %%mm4            \n\t"
        "psubb %%mm4, %%mm3             \n\t" // dst - pred
        "movq %%mm3, (%3, %0)           \n\t"
        "add $8, %0                     \n\t"
        "cmp %4, %0                     \n\t"
        " jb 1b                         \n\t"
        : "+r" (i)
        : "r"(src1), "r"(src2), "r"(dst), "r"((x86_reg)w)
    );

    l= *left;
    lt= *left_top;

    /* first pixel uses the seeds from the previous call, not src[-1] */
    dst[0]= src2[0] - mid_pred(l, src1[0], (l + src1[0] - lt)&0xFF);

    *left_top= src1[w-1];
    *left    = src2[w-1];
}
909

    
910
/* DIFF_PIXELS_1(m,a,t,p1,p2): one row of p1 - p2, expanded from bytes to
 * signed words in register a; m selects the load width (d = 4 pixels,
 * q = 8 pixels).  Trick: after the unpacks each word of a is (p1<<8)|p1
 * and each word of t is (p1<<8)|p2, so a - t = p1 - p2 without needing a
 * zeroed register. */
#define DIFF_PIXELS_1(m,a,t,p1,p2)\
    "mov"#m" "#p1", "#a"              \n\t"\
    "mov"#m" "#p2", "#t"              \n\t"\
    "punpcklbw "#a", "#t"             \n\t"\
    "punpcklbw "#a", "#a"             \n\t"\
    "psubw     "#t", "#a"             \n\t"\

/* DIFF_PIXELS_8: load 8 rows of p1 - p2 word differences into registers
 * 0..7 (m0 = load width, m1 = spill width, mm = register prefix).
 * Register 0 is spilled to 'temp' for the last row because all eight
 * registers are live by then. */
#define DIFF_PIXELS_8(m0,m1,mm,p1,p2,stride,temp) {\
    uint8_t *p1b=p1, *p2b=p2;\
    asm volatile(\
        DIFF_PIXELS_1(m0, mm##0, mm##7, (%1), (%2))\
        DIFF_PIXELS_1(m0, mm##1, mm##7, (%1,%3), (%2,%3))\
        DIFF_PIXELS_1(m0, mm##2, mm##7, (%1,%3,2), (%2,%3,2))\
        "add %4, %1                   \n\t"\
        "add %4, %2                   \n\t"\
        DIFF_PIXELS_1(m0, mm##3, mm##7, (%1), (%2))\
        DIFF_PIXELS_1(m0, mm##4, mm##7, (%1,%3), (%2,%3))\
        DIFF_PIXELS_1(m0, mm##5, mm##7, (%1,%3,2), (%2,%3,2))\
        DIFF_PIXELS_1(m0, mm##6, mm##7, (%1,%4), (%2,%4))\
        "mov"#m1" "#mm"0, %0          \n\t"\
        DIFF_PIXELS_1(m0, mm##7, mm##0, (%1,%3,4), (%2,%3,4))\
        "mov"#m1" %0, "#mm"0          \n\t"\
        : "+m"(temp), "+r"(p1b), "+r"(p2b)\
        : "r"((x86_reg)stride), "r"((x86_reg)stride*3)\
    );\
}
    //the "+m"(temp) is needed as gcc 2.95 sometimes fails to compile "=m"(temp)

/* 4-pixel-wide (MMX) and 8-pixel-wide (SSE2) instantiations. */
#define DIFF_PIXELS_4x8(p1,p2,stride,temp) DIFF_PIXELS_8(d, q,   %%mm,  p1, p2, stride, temp)
#define DIFF_PIXELS_8x8(p1,p2,stride,temp) DIFF_PIXELS_8(q, dqa, %%xmm, p1, p2, stride, temp)
940

    
941
/* LBUTTERFLY2: two butterflies at once, (a,b) -> (a+b, b-a), computed
 * without a temporary: b += b then b -= (a+b) yields old_b - old_a. */
#define LBUTTERFLY2(a1,b1,a2,b2)\
    "paddw " #b1 ", " #a1 "           \n\t"\
    "paddw " #b2 ", " #a2 "           \n\t"\
    "paddw " #b1 ", " #b1 "           \n\t"\
    "paddw " #b2 ", " #b2 "           \n\t"\
    "psubw " #a1 ", " #b1 "           \n\t"\
    "psubw " #a2 ", " #b2 "           \n\t"

/* HADAMARD8: three butterfly stages over the eight registers = an
 * 8-point Walsh-Hadamard transform, one 16-bit lane at a time. */
#define HADAMARD8(m0, m1, m2, m3, m4, m5, m6, m7)\
        LBUTTERFLY2(m0, m1, m2, m3)\
        LBUTTERFLY2(m4, m5, m6, m7)\
        LBUTTERFLY2(m0, m2, m1, m3)\
        LBUTTERFLY2(m4, m6, m5, m7)\
        LBUTTERFLY2(m0, m4, m1, m5)\
        LBUTTERFLY2(m2, m6, m3, m7)\

/* Shorthand for the MMX register set. */
#define HADAMARD48 HADAMARD8(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm6, %%mm7)
958

    
959
/* MMABS(a,z): per-16-bit-lane absolute value of a, using z as scratch.
 * MMX: build a sign mask with pcmpgtw, then xor + subtract. */
#define MMABS_MMX(a,z)\
    "pxor " #z ", " #z "              \n\t"\
    "pcmpgtw " #a ", " #z "           \n\t"\
    "pxor " #z ", " #a "              \n\t"\
    "psubw " #z ", " #a "             \n\t"

/* MMX2: |a| = max(a, -a). */
#define MMABS_MMX2(a,z)\
    "pxor " #z ", " #z "              \n\t"\
    "psubw " #a ", " #z "             \n\t"\
    "pmaxsw " #z ", " #a "            \n\t"

/* SSSE3: single instruction; the scratch argument z is unused. */
#define MMABS_SSSE3(a,z)\
    "pabsw " #a ", " #a "             \n\t"

/* sum += |a|, with unsigned saturating add. */
#define MMABS_SUM(a,z, sum)\
    MMABS(a,z)\
    "paddusw " #a ", " #sum "         \n\t"
976

    
977
/* Accumulate sum(|xmm0..xmm7|) into xmm0 (word lanes, saturating).
 * This variant uses xmm8/xmm9 as scratch, which only exist on x86-64;
 * with SSSE3's pabsw the scratch arguments are never emitted, so the
 * same macro is also safe on 32-bit when MMABS is MMABS_SSSE3. */
#define MMABS_SUM_8x8_NOSPILL\
    MMABS(%%xmm0, %%xmm8)\
    MMABS(%%xmm1, %%xmm9)\
    MMABS_SUM(%%xmm2, %%xmm8, %%xmm0)\
    MMABS_SUM(%%xmm3, %%xmm9, %%xmm1)\
    MMABS_SUM(%%xmm4, %%xmm8, %%xmm0)\
    MMABS_SUM(%%xmm5, %%xmm9, %%xmm1)\
    MMABS_SUM(%%xmm6, %%xmm8, %%xmm0)\
    MMABS_SUM(%%xmm7, %%xmm9, %%xmm1)\
    "paddusw %%xmm1, %%xmm0           \n\t"

#ifdef ARCH_X86_64
#define MMABS_SUM_8x8_SSE2 MMABS_SUM_8x8_NOSPILL
#else
/* 32-bit x86 has no xmm8/9: spill xmm7 to memory at (%1) and reuse it
 * as the scratch register instead. */
#define MMABS_SUM_8x8_SSE2\
    "movdqa %%xmm7, (%1)              \n\t"\
    MMABS(%%xmm0, %%xmm7)\
    MMABS(%%xmm1, %%xmm7)\
    MMABS_SUM(%%xmm2, %%xmm7, %%xmm0)\
    MMABS_SUM(%%xmm3, %%xmm7, %%xmm1)\
    MMABS_SUM(%%xmm4, %%xmm7, %%xmm0)\
    MMABS_SUM(%%xmm5, %%xmm7, %%xmm1)\
    MMABS_SUM(%%xmm6, %%xmm7, %%xmm0)\
    "movdqa (%1), %%xmm2              \n\t"\
    MMABS_SUM(%%xmm2, %%xmm7, %%xmm1)\
    "paddusw %%xmm1, %%xmm0           \n\t"
#endif
1004

    
1005
/* LOAD4/STORE4: move four consecutive quadwords between registers
 * a..d and memory at offset o from the pointer in %1. */
#define LOAD4(o, a, b, c, d)\
    "movq "#o"(%1),    "#a"           \n\t"\
    "movq "#o"+8(%1),  "#b"           \n\t"\
    "movq "#o"+16(%1), "#c"           \n\t"\
    "movq "#o"+24(%1), "#d"           \n\t"\

#define STORE4(o, a, b, c, d)\
    "movq "#a", "#o"(%1)              \n\t"\
    "movq "#b", "#o"+8(%1)            \n\t"\
    "movq "#c", "#o"+16(%1)           \n\t"\
    "movq "#d", "#o"+24(%1)           \n\t"\
1016

    
1017
/* FIXME: HSUM_* saturates at 64k, while an 8x8 hadamard or dct block can get up to
 * about 100k on extreme inputs. But that's very unlikely to occur in natural video,
 * and it's even more unlikely to not have any alternative mvs/modes with lower cost. */
/* HSUM(a, t, dst): horizontal unsigned-saturating sum of the 16-bit
 * lanes of a, result delivered to dst (a 32-bit GPR); t is scratch.
 * Note the final movd also copies the adjacent word lane, hence the
 * &0xFFFF masking at the call sites. */
#define HSUM_MMX(a, t, dst)\
    "movq "#a", "#t"                  \n\t"\
    "psrlq $32, "#a"                  \n\t"\
    "paddusw "#t", "#a"               \n\t"\
    "movq "#a", "#t"                  \n\t"\
    "psrlq $16, "#a"                  \n\t"\
    "paddusw "#t", "#a"               \n\t"\
    "movd "#a", "#dst"                \n\t"\

/* MMX2 variant: pshufw replaces the copy+shift pairs. */
#define HSUM_MMX2(a, t, dst)\
    "pshufw $0x0E, "#a", "#t"         \n\t"\
    "paddusw "#t", "#a"               \n\t"\
    "pshufw $0x01, "#a", "#t"         \n\t"\
    "paddusw "#t", "#a"               \n\t"\
    "movd "#a", "#dst"                \n\t"\

/* SSE2 variant: reduces all eight word lanes of an xmm register. */
#define HSUM_SSE2(a, t, dst)\
    "movhlps "#a", "#t"               \n\t"\
    "paddusw "#t", "#a"               \n\t"\
    "pshuflw $0x0E, "#a", "#t"        \n\t"\
    "paddusw "#t", "#a"               \n\t"\
    "pshuflw $0x01, "#a", "#t"        \n\t"\
    "paddusw "#t", "#a"               \n\t"\
    "movd "#a", "#dst"                \n\t"\
1044

    
1045
/* Build hadamard8_diff_<cpu>: sum of absolute values of the 2-D 8x8
 * Hadamard transform of (src1 - src2) -- a SATD-style comparison metric.
 * The MMX version works on two 4x8 halves: each half is transformed
 * vertically, transposed in 4x4 quarters through the scratch buffer
 * 'temp', then the horizontal pass plus |.| accumulation runs over the
 * stored quarters.  The result is masked to 16 bits because HSUM's movd
 * also copies the neighbouring word lane.
 * s is unused (DSPContext callback signature); h must be 8 (asserted). */
#define HADAMARD8_DIFF_MMX(cpu) \
static int hadamard8_diff_##cpu(void *s, uint8_t *src1, uint8_t *src2, int stride, int h){\
    DECLARE_ALIGNED_8(uint64_t, temp[13]);\
    int sum;\
\
    assert(h==8);\
\
    DIFF_PIXELS_4x8(src1, src2, stride, temp[0]);\
\
    /* vertical transform + transpose of the left 4x8 half */\
    asm volatile(\
        HADAMARD48\
\
        "movq %%mm7, 96(%1)             \n\t"\
\
        TRANSPOSE4(%%mm0, %%mm1, %%mm2, %%mm3, %%mm7)\
        STORE4(0 , %%mm0, %%mm3, %%mm7, %%mm2)\
\
        "movq 96(%1), %%mm7             \n\t"\
        TRANSPOSE4(%%mm4, %%mm5, %%mm6, %%mm7, %%mm0)\
        STORE4(64, %%mm4, %%mm7, %%mm0, %%mm6)\
\
        : "=r" (sum)\
        : "r"(temp)\
    );\
\
    DIFF_PIXELS_4x8(src1+4, src2+4, stride, temp[4]);\
\
    /* right half: vertical transform + transpose, then the horizontal\
     * pass and absolute-value accumulation over both stored halves */\
    asm volatile(\
        HADAMARD48\
\
        "movq %%mm7, 96(%1)             \n\t"\
\
        TRANSPOSE4(%%mm0, %%mm1, %%mm2, %%mm3, %%mm7)\
        STORE4(32, %%mm0, %%mm3, %%mm7, %%mm2)\
\
        "movq 96(%1), %%mm7             \n\t"\
        TRANSPOSE4(%%mm4, %%mm5, %%mm6, %%mm7, %%mm0)\
        "movq %%mm7, %%mm5              \n\t"/*FIXME remove*/\
        "movq %%mm6, %%mm7              \n\t"\
        "movq %%mm0, %%mm6              \n\t"\
\
        LOAD4(64, %%mm0, %%mm1, %%mm2, %%mm3)\
\
        HADAMARD48\
        "movq %%mm7, 64(%1)             \n\t"\
        MMABS(%%mm0, %%mm7)\
        MMABS(%%mm1, %%mm7)\
        MMABS_SUM(%%mm2, %%mm7, %%mm0)\
        MMABS_SUM(%%mm3, %%mm7, %%mm1)\
        MMABS_SUM(%%mm4, %%mm7, %%mm0)\
        MMABS_SUM(%%mm5, %%mm7, %%mm1)\
        MMABS_SUM(%%mm6, %%mm7, %%mm0)\
        "movq 64(%1), %%mm2             \n\t"\
        MMABS_SUM(%%mm2, %%mm7, %%mm1)\
        "paddusw %%mm1, %%mm0           \n\t"\
        "movq %%mm0, 64(%1)             \n\t"\
\
        LOAD4(0 , %%mm0, %%mm1, %%mm2, %%mm3)\
        LOAD4(32, %%mm4, %%mm5, %%mm6, %%mm7)\
\
        HADAMARD48\
        "movq %%mm7, (%1)               \n\t"\
        MMABS(%%mm0, %%mm7)\
        MMABS(%%mm1, %%mm7)\
        MMABS_SUM(%%mm2, %%mm7, %%mm0)\
        MMABS_SUM(%%mm3, %%mm7, %%mm1)\
        MMABS_SUM(%%mm4, %%mm7, %%mm0)\
        MMABS_SUM(%%mm5, %%mm7, %%mm1)\
        MMABS_SUM(%%mm6, %%mm7, %%mm0)\
        "movq (%1), %%mm2               \n\t"\
        MMABS_SUM(%%mm2, %%mm7, %%mm1)\
        "paddusw 64(%1), %%mm0          \n\t"\
        "paddusw %%mm1, %%mm0           \n\t"\
\
        HSUM(%%mm0, %%mm1, %0)\
\
        : "=r" (sum)\
        : "r"(temp)\
    );\
    return sum&0xFFFF;\
}\
WRAPPER8_16_SQ(hadamard8_diff_##cpu, hadamard8_diff16_##cpu)
1127

    
1128
/* Same metric as HADAMARD8_DIFF_MMX but the whole 8x8 block is processed
 * at once in xmm registers.  The shuffled register order in the second
 * HADAMARD8 apparently compensates for the row permutation left by
 * TRANSPOSE8 (NOTE(review): inferred from the operand order -- confirm
 * against TRANSPOSE8's definition in dsputil_mmx.h). */
#define HADAMARD8_DIFF_SSE2(cpu) \
static int hadamard8_diff_##cpu(void *s, uint8_t *src1, uint8_t *src2, int stride, int h){\
    DECLARE_ALIGNED_16(uint64_t, temp[4]);\
    int sum;\
\
    assert(h==8);\
\
    DIFF_PIXELS_8x8(src1, src2, stride, temp[0]);\
\
    asm volatile(\
        HADAMARD8(%%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm6, %%xmm7)\
        TRANSPOSE8(%%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm6, %%xmm7, (%1))\
        HADAMARD8(%%xmm0, %%xmm5, %%xmm7, %%xmm3, %%xmm6, %%xmm4, %%xmm2, %%xmm1)\
        MMABS_SUM_8x8\
        HSUM_SSE2(%%xmm0, %%xmm1, %0)\
        : "=r" (sum)\
        : "r"(temp)\
    );\
    return sum&0xFFFF;\
}\
WRAPPER8_16_SQ(hadamard8_diff_##cpu, hadamard8_diff16_##cpu)
1149

    
1150
/* Instantiate hadamard8_diff{,16}_mmx with the plain MMX primitives. */
#define MMABS(a,z)         MMABS_MMX(a,z)
#define HSUM(a,t,dst)      HSUM_MMX(a,t,dst)
HADAMARD8_DIFF_MMX(mmx)
#undef MMABS
#undef HSUM

/* MMX2 and SSE2 variants: MMX2 abs/hsum; the SSE2 build picks the
 * spill strategy appropriate for the target (see MMABS_SUM_8x8_SSE2). */
#define MMABS(a,z)         MMABS_MMX2(a,z)
#define MMABS_SUM_8x8      MMABS_SUM_8x8_SSE2
#define HSUM(a,t,dst)      HSUM_MMX2(a,t,dst)
HADAMARD8_DIFF_MMX(mmx2)
HADAMARD8_DIFF_SSE2(sse2)
#undef MMABS
#undef MMABS_SUM_8x8
#undef HSUM

#ifdef HAVE_SSSE3
/* SSSE3: pabsw needs no scratch register, so the no-spill variant is used. */
#define MMABS(a,z)         MMABS_SSSE3(a,z)
#define MMABS_SUM_8x8      MMABS_SUM_8x8_NOSPILL
HADAMARD8_DIFF_SSE2(ssse3)
#undef MMABS
#undef MMABS_SUM_8x8
#endif
1172

    
1173
/* DCT_SAD4: accumulate |coeff| for four rows at offsets o, o+16, o+32,
 * o+48 into registers 0 and 1 (saturating word adds). */
#define DCT_SAD4(m,mm,o)\
    "mov"#m" "#o"+ 0(%1), "#mm"2      \n\t"\
    "mov"#m" "#o"+16(%1), "#mm"3      \n\t"\
    "mov"#m" "#o"+32(%1), "#mm"4      \n\t"\
    "mov"#m" "#o"+48(%1), "#mm"5      \n\t"\
    MMABS_SUM(mm##2, mm##6, mm##0)\
    MMABS_SUM(mm##3, mm##7, mm##1)\
    MMABS_SUM(mm##4, mm##6, mm##0)\
    MMABS_SUM(mm##5, mm##7, mm##1)\

/* Sum |block[i]| over all 64 DCTELEMs (128 bytes): the four DCT_SAD4
 * calls at offsets 0, 8, 64 and 72 cover every 8-byte row. */
#define DCT_SAD_MMX\
    "pxor %%mm0, %%mm0                \n\t"\
    "pxor %%mm1, %%mm1                \n\t"\
    DCT_SAD4(q, %%mm, 0)\
    DCT_SAD4(q, %%mm, 8)\
    DCT_SAD4(q, %%mm, 64)\
    DCT_SAD4(q, %%mm, 72)\
    "paddusw %%mm1, %%mm0             \n\t"\
    HSUM(%%mm0, %%mm1, %0)

/* SSE2: 16-byte rows, so two DCT_SAD4 calls cover the whole block. */
#define DCT_SAD_SSE2\
    "pxor %%xmm0, %%xmm0              \n\t"\
    "pxor %%xmm1, %%xmm1              \n\t"\
    DCT_SAD4(dqa, %%xmm, 0)\
    DCT_SAD4(dqa, %%xmm, 64)\
    "paddusw %%xmm1, %%xmm0           \n\t"\
    HSUM(%%xmm0, %%xmm1, %0)
1200

    
1201
/* Build sum_abs_dctelem_<cpu>(block): sum of absolute values of the 64
 * coefficients in block, using whichever DCT_SAD/HSUM/MMABS primitives
 * are defined at instantiation time.  Masked to 16 bits because HSUM's
 * movd also copies the neighbouring word lane (the saturating sum itself
 * never exceeds 0xFFFF). */
#define DCT_SAD_FUNC(cpu) \
static int sum_abs_dctelem_##cpu(DCTELEM *block){\
    int sum;\
    asm volatile(\
        DCT_SAD\
        :"=r"(sum)\
        :"r"(block)\
    );\
    return sum&0xFFFF;\
}
1211

    
1212
/* Instantiate sum_abs_dctelem_{mmx,mmx2,sse2,ssse3}. */
#define DCT_SAD       DCT_SAD_MMX
#define HSUM(a,t,dst) HSUM_MMX(a,t,dst)
#define MMABS(a,z)    MMABS_MMX(a,z)
DCT_SAD_FUNC(mmx)
#undef MMABS
#undef HSUM

#define HSUM(a,t,dst) HSUM_MMX2(a,t,dst)
#define MMABS(a,z)    MMABS_MMX2(a,z)
DCT_SAD_FUNC(mmx2)
#undef HSUM
#undef DCT_SAD

/* SSE2 and SSSE3 share the SSE2 loop body; only MMABS differs. */
#define DCT_SAD       DCT_SAD_SSE2
#define HSUM(a,t,dst) HSUM_SSE2(a,t,dst)
DCT_SAD_FUNC(sse2)
#undef MMABS

#ifdef HAVE_SSSE3
#define MMABS(a,z)    MMABS_SSSE3(a,z)
DCT_SAD_FUNC(ssse3)
#undef MMABS
#endif
#undef HSUM
#undef DCT_SAD
1237

    
1238
/**
 * Sum of squared differences between an int8_t array and an int16_t
 * array of 'size' elements, 8 elements per iteration (MMX).
 * NOTE(review): assumes size is a positive multiple of 8 -- the loop
 * counts down in steps of 8 and always runs at least once (TODO confirm
 * with callers).
 */
static int ssd_int8_vs_int16_mmx(const int8_t *pix1, const int16_t *pix2, int size){
    int sum;
    x86_reg i=size;
    asm volatile(
        "pxor %%mm4, %%mm4 \n"      // mm4 = running dword accumulator
        "1: \n"
        "sub $8, %0 \n"             // sets the flags tested by jg below; the
                                    // intervening MMX ops do not touch EFLAGS
        "movq (%2,%0), %%mm2 \n"    // 8 int8 samples
        "movq (%3,%0,2), %%mm0 \n"  // first 4 int16 samples
        "movq 8(%3,%0,2), %%mm1 \n" // next 4 int16 samples
        "punpckhbw %%mm2, %%mm3 \n" // high 4 bytes -> high byte of each word
                                    // (mm3's stale low bytes are shifted out below)
        "punpcklbw %%mm2, %%mm2 \n" // low 4 bytes duplicated into words
        "psraw $8, %%mm3 \n"        // arithmetic shift = sign extension to int16
        "psraw $8, %%mm2 \n"
        "psubw %%mm3, %%mm1 \n"     // pix2 - pix1
        "psubw %%mm2, %%mm0 \n"
        "pmaddwd %%mm1, %%mm1 \n"   // square and pairwise-add into dwords
        "pmaddwd %%mm0, %%mm0 \n"
        "paddd %%mm1, %%mm4 \n"
        "paddd %%mm0, %%mm4 \n"
        "jg 1b \n"
        "movq %%mm4, %%mm3 \n"      // fold the two dword sums together
        "psrlq $32, %%mm3 \n"
        "paddd %%mm3, %%mm4 \n"
        "movd %%mm4, %1 \n"
        :"+r"(i), "=r"(sum)
        :"r"(pix1), "r"(pix2)
    );
    return sum;
}
1268

    
1269
/* PHADDD(a, t): horizontal add of the two dwords in a (result in the low
 * dword); t is scratch. */
#define PHADDD(a, t)\
    "movq "#a", "#t"                  \n\t"\
    "psrlq $32, "#a"                  \n\t"\
    "paddd "#t", "#a"                 \n\t"
/*
   pmulhw: dst[0-15]=(src[0-15]*dst[0-15])[16-31]
   pmulhrw: dst[0-15]=(src[0-15]*dst[0-15] + 0x8000)[16-31]
   pmulhrsw: dst[0-15]=(src[0-15]*dst[0-15] + 0x4000)[15-30]
 */
/* PMULHRW(x, y, s, o): rounding high multiply of x and y by scale s.
 * MMX fallback: pmulhw truncates, so add the rounding constant o
 * (MOVQ_WONE here) and shift right once; SCALE_OFFSET=1 compensates for
 * the extra shift inside dsputil_mmx_qns.h. */
#define PMULHRW(x, y, s, o)\
    "pmulhw " #s ", "#x "            \n\t"\
    "pmulhw " #s ", "#y "            \n\t"\
    "paddw " #o ", "#x "             \n\t"\
    "paddw " #o ", "#y "             \n\t"\
    "psraw $1, "#x "                 \n\t"\
    "psraw $1, "#y "                 \n\t"
#define DEF(x) x ## _mmx
#define SET_RND MOVQ_WONE
#define SCALE_OFFSET 1

/* Template instantiation: produces the _mmx versions of the 8x8 basis
 * functions (try_8x8basis_mmx / add_8x8basis_mmx, wired up in
 * dsputilenc_init_mmx below in this file). */
#include "dsputil_mmx_qns.h"

#undef DEF
#undef SET_RND
#undef SCALE_OFFSET
#undef PMULHRW

/* 3DNow!: pmulhrw rounds in hardware, no correction needed. */
#define DEF(x) x ## _3dnow
#define SET_RND(x)
#define SCALE_OFFSET 0
#define PMULHRW(x, y, s, o)\
    "pmulhrw " #s ", "#x "           \n\t"\
    "pmulhrw " #s ", "#y "           \n\t"

#include "dsputil_mmx_qns.h"

#undef DEF
#undef SET_RND
#undef SCALE_OFFSET
#undef PMULHRW

#ifdef HAVE_SSSE3
#undef PHADDD
/* SSSE3: pmulhrsw rounds but keeps one more fraction bit, hence
 * SCALE_OFFSET = -1; PHADDD is redefined with pshufw. */
#define DEF(x) x ## _ssse3
#define SET_RND(x)
#define SCALE_OFFSET -1
#define PHADDD(a, t)\
    "pshufw $0x0E, "#a", "#t"         \n\t"\
    "paddd "#t", "#a"                 \n\t" /* faster than phaddd on core2 */
#define PMULHRW(x, y, s, o)\
    "pmulhrsw " #s ", "#x "          \n\t"\
    "pmulhrsw " #s ", "#y "          \n\t"

#include "dsputil_mmx_qns.h"

#undef DEF
#undef SET_RND
#undef SCALE_OFFSET
#undef PMULHRW
#undef PHADDD
#endif //HAVE_SSSE3
1330

    
1331

    
1332
/* FLAC specific */
/* SSE2 autocorrelation for the FLAC encoder; prototype only -- the
 * implementation lives in a separate translation unit. */
void ff_flac_compute_autocorr_sse2(const int32_t *data, int len, int lag,
                                   double *autoc);
1335

    
1336

    
1337
/**
 * Wire up the x86-optimized encoder DSP function pointers.
 * Plain MMX implementations are installed first; the MMX2, SSE2, SSSE3
 * and 3DNow! blocks then override individual entries, so the order of
 * the blocks is significant (e.g. 3DNow! wins over SSSE3 for the
 * 8x8basis functions when both flags are set).
 */
void dsputilenc_init_mmx(DSPContext* c, AVCodecContext *avctx)
{
    if (mm_flags & MM_MMX) {
        const int dct_algo = avctx->dct_algo;
        const int bitexact = avctx->flags & CODEC_FLAG_BITEXACT;

        /* pick the best forward DCT this CPU supports */
        if (dct_algo == FF_DCT_AUTO || dct_algo == FF_DCT_MMX)
            c->fdct = (mm_flags & MM_SSE2)   ? ff_fdct_sse2 :
                      (mm_flags & MM_MMXEXT) ? ff_fdct_mmx2 :
                                               ff_fdct_mmx;

        /* baseline MMX implementations */
        c->get_pixels        = get_pixels_mmx;
        c->diff_pixels       = diff_pixels_mmx;
        c->pix_sum           = pix_sum16_mmx;
        c->diff_bytes        = diff_bytes_mmx;
        c->sum_abs_dctelem   = sum_abs_dctelem_mmx;
        c->hadamard8_diff[0] = hadamard8_diff16_mmx;
        c->hadamard8_diff[1] = hadamard8_diff_mmx;
        c->pix_norm1         = pix_norm1_mmx;
        c->sse[0]            = (mm_flags & MM_SSE2) ? sse16_sse2 : sse16_mmx;
        c->sse[1]            = sse8_mmx;
        c->vsad[4]           = vsad_intra16_mmx;
        c->nsse[0]           = nsse16_mmx;
        c->nsse[1]           = nsse8_mmx;
        if (!bitexact) {
            /* these are approximate and therefore not bit-exact */
            c->vsad[0]       = vsad16_mmx;
            c->try_8x8basis  = try_8x8basis_mmx;
        }
        c->add_8x8basis      = add_8x8basis_mmx;
        c->ssd_int8_vs_int16 = ssd_int8_vs_int16_mmx;

        if (mm_flags & MM_MMXEXT) {
            c->sum_abs_dctelem   = sum_abs_dctelem_mmx2;
            c->hadamard8_diff[0] = hadamard8_diff16_mmx2;
            c->hadamard8_diff[1] = hadamard8_diff_mmx2;
            c->vsad[4]           = vsad_intra16_mmx2;
            if (!bitexact)
                c->vsad[0] = vsad16_mmx2;
            c->sub_hfyu_median_prediction = sub_hfyu_median_prediction_mmx2;
        }

        if (mm_flags & MM_SSE2) {
            c->sum_abs_dctelem   = sum_abs_dctelem_sse2;
            c->hadamard8_diff[0] = hadamard8_diff16_sse2;
            c->hadamard8_diff[1] = hadamard8_diff_sse2;
            if (ENABLE_FLAC_ENCODER)
                c->flac_compute_autocorr = ff_flac_compute_autocorr_sse2;
        }

#ifdef HAVE_SSSE3
        if (mm_flags & MM_SSSE3) {
            if (!bitexact)
                c->try_8x8basis = try_8x8basis_ssse3;
            c->add_8x8basis      = add_8x8basis_ssse3;
            c->sum_abs_dctelem   = sum_abs_dctelem_ssse3;
            c->hadamard8_diff[0] = hadamard8_diff16_ssse3;
            c->hadamard8_diff[1] = hadamard8_diff_ssse3;
        }
#endif

        /* must stay after the SSSE3 block: 3DNow! overrides the 8x8basis
         * functions when both capabilities are present */
        if (mm_flags & MM_3DNOW) {
            if (!bitexact)
                c->try_8x8basis = try_8x8basis_3dnow;
            c->add_8x8basis = add_8x8basis_3dnow;
        }
    }

    dsputil_init_pix_mmx(c, avctx);
}