ffmpeg / libavcodec / i386 / dsputilenc_mmx.c @ c4ff7c53

/*
 * MMX optimized DSP utils
 * Copyright (c) 2000, 2001 Fabrice Bellard.
 * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 *
 * MMX optimization by Nick Kurshev <nickols_k@mail.ru>
 */

#include "libavutil/x86_cpu.h"
#include "libavcodec/dsputil.h"
#include "libavcodec/mpegvideo.h"
#include "dsputil_mmx.h"
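
/* Widen an 8x8 block of unsigned bytes into 16-bit DCTELEMs. mm7 stays zero
 * and supplies the high halves for punpck{l,h}bw; each loop iteration
 * converts two input rows into 32 bytes of output. */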
static void get_pixels_mmx(DCTELEM *block, const uint8_t *pixels, int line_size)
{
    __asm__ volatile(
        "mov $-128, %%"REG_a"           \n\t"
        "pxor %%mm7, %%mm7              \n\t"
        ASMALIGN(4)
        "1:                             \n\t"
        "movq (%0), %%mm0               \n\t"
        "movq (%0, %2), %%mm2           \n\t"
        "movq %%mm0, %%mm1              \n\t"
        "movq %%mm2, %%mm3              \n\t"
        "punpcklbw %%mm7, %%mm0         \n\t"
        "punpckhbw %%mm7, %%mm1         \n\t"
        "punpcklbw %%mm7, %%mm2         \n\t"
        "punpckhbw %%mm7, %%mm3         \n\t"
        "movq %%mm0, (%1, %%"REG_a")    \n\t"
        "movq %%mm1, 8(%1, %%"REG_a")   \n\t"
        "movq %%mm2, 16(%1, %%"REG_a")  \n\t"
        "movq %%mm3, 24(%1, %%"REG_a")  \n\t"
        "add %3, %0                     \n\t"
        "add $32, %%"REG_a"             \n\t"
        "js 1b                          \n\t"
        : "+r" (pixels)
        : "r" (block+64), "r" ((x86_reg)line_size), "r" ((x86_reg)line_size*2)
        : "%"REG_a
    );
}
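
/* For reference, a scalar sketch of the same operation (hypothetical helper,
 * not part of this file):
 *
 *     static void get_pixels_ref(DCTELEM *block, const uint8_t *pixels,
 *                                int line_size)
 *     {
 *         int i, j;
 *         for (i = 0; i < 8; i++) {
 *             for (j = 0; j < 8; j++)
 *                 block[8 * i + j] = pixels[j];
 *             pixels += line_size;
 *         }
 *     }
 */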
58
59 f76543c9 Baptiste Coudurier
static void get_pixels_sse2(DCTELEM *block, const uint8_t *pixels, int line_size)
60
{
61 be449fca Diego Pettenò
    __asm__ volatile(
62 f76543c9 Baptiste Coudurier
        "pxor %%xmm7,      %%xmm7         \n\t"
63
        "movq (%0),        %%xmm0         \n\t"
64
        "movq (%0, %2),    %%xmm1         \n\t"
65
        "movq (%0, %2,2),  %%xmm2         \n\t"
66
        "movq (%0, %3),    %%xmm3         \n\t"
67
        "lea (%0,%2,4), %0                \n\t"
68
        "punpcklbw %%xmm7, %%xmm0         \n\t"
69
        "punpcklbw %%xmm7, %%xmm1         \n\t"
70
        "punpcklbw %%xmm7, %%xmm2         \n\t"
71
        "punpcklbw %%xmm7, %%xmm3         \n\t"
72
        "movdqa %%xmm0,      (%1)         \n\t"
73
        "movdqa %%xmm1,    16(%1)         \n\t"
74
        "movdqa %%xmm2,    32(%1)         \n\t"
75
        "movdqa %%xmm3,    48(%1)         \n\t"
76
        "movq (%0),        %%xmm0         \n\t"
77
        "movq (%0, %2),    %%xmm1         \n\t"
78
        "movq (%0, %2,2),  %%xmm2         \n\t"
79
        "movq (%0, %3),    %%xmm3         \n\t"
80
        "punpcklbw %%xmm7, %%xmm0         \n\t"
81
        "punpcklbw %%xmm7, %%xmm1         \n\t"
82
        "punpcklbw %%xmm7, %%xmm2         \n\t"
83
        "punpcklbw %%xmm7, %%xmm3         \n\t"
84
        "movdqa %%xmm0,    64(%1)         \n\t"
85
        "movdqa %%xmm1,    80(%1)         \n\t"
86
        "movdqa %%xmm2,    96(%1)         \n\t"
87
        "movdqa %%xmm3,   112(%1)         \n\t"
88
        : "+r" (pixels)
89
        : "r" (block), "r" ((x86_reg)line_size), "r" ((x86_reg)line_size*3)
90
    );
91
}
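
/* diff_pixels: widen the difference s1 - s2 of two 8x8 byte blocks into
 * 16-bit DCTELEMs, i.e. block[8*y + x] = s1[x] - s2[x], row by row. */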
static inline void diff_pixels_mmx(DCTELEM *block, const uint8_t *s1, const uint8_t *s2, int stride)
{
    __asm__ volatile(
        "pxor %%mm7, %%mm7              \n\t"
        "mov $-128, %%"REG_a"           \n\t"
        ASMALIGN(4)
        "1:                             \n\t"
        "movq (%0), %%mm0               \n\t"
        "movq (%1), %%mm2               \n\t"
        "movq %%mm0, %%mm1              \n\t"
        "movq %%mm2, %%mm3              \n\t"
        "punpcklbw %%mm7, %%mm0         \n\t"
        "punpckhbw %%mm7, %%mm1         \n\t"
        "punpcklbw %%mm7, %%mm2         \n\t"
        "punpckhbw %%mm7, %%mm3         \n\t"
        "psubw %%mm2, %%mm0             \n\t"
        "psubw %%mm3, %%mm1             \n\t"
        "movq %%mm0, (%2, %%"REG_a")    \n\t"
        "movq %%mm1, 8(%2, %%"REG_a")   \n\t"
        "add %3, %0                     \n\t"
        "add %3, %1                     \n\t"
        "add $16, %%"REG_a"             \n\t"
        "jnz 1b                         \n\t"
        : "+r" (s1), "+r" (s2)
        : "r" (block+64), "r" ((x86_reg)stride)
        : "%"REG_a
    );
}
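
/* Sum all 256 pixels of a 16x16 block. The loop runs a negative byte index
 * up to zero, so the loop counter doubles as the address offset. */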
static int pix_sum16_mmx(uint8_t * pix, int line_size){
    const int h=16;
    int sum;
    x86_reg index= -line_size*h;

    __asm__ volatile(
                "pxor %%mm7, %%mm7              \n\t"
                "pxor %%mm6, %%mm6              \n\t"
                "1:                             \n\t"
                "movq (%2, %1), %%mm0           \n\t"
                "movq (%2, %1), %%mm1           \n\t"
                "movq 8(%2, %1), %%mm2          \n\t"
                "movq 8(%2, %1), %%mm3          \n\t"
                "punpcklbw %%mm7, %%mm0         \n\t"
                "punpckhbw %%mm7, %%mm1         \n\t"
                "punpcklbw %%mm7, %%mm2         \n\t"
                "punpckhbw %%mm7, %%mm3         \n\t"
                "paddw %%mm0, %%mm1             \n\t"
                "paddw %%mm2, %%mm3             \n\t"
                "paddw %%mm1, %%mm3             \n\t"
                "paddw %%mm3, %%mm6             \n\t"
                "add %3, %1                     \n\t"
                " js 1b                         \n\t"
                "movq %%mm6, %%mm5              \n\t"
                "psrlq $32, %%mm6               \n\t"
                "paddw %%mm5, %%mm6             \n\t"
                "movq %%mm6, %%mm5              \n\t"
                "psrlq $16, %%mm6               \n\t"
                "paddw %%mm5, %%mm6             \n\t"
                "movd %%mm6, %0                 \n\t"
                "andl $0xFFFF, %0               \n\t"
                : "=&r" (sum), "+r" (index)
                : "r" (pix - index), "r" ((x86_reg)line_size)
        );

        return sum;
}
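
/* Scalar equivalent, for clarity (hypothetical):
 *
 *     int sum = 0;
 *     for (int y = 0; y < 16; y++)
 *         for (int x = 0; x < 16; x++)
 *             sum += pix[y * line_size + x];
 *     return sum;
 */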

static int pix_norm1_mmx(uint8_t *pix, int line_size) {
    int tmp;
  __asm__ volatile (
      "movl $16,%%ecx\n"
      "pxor %%mm0,%%mm0\n"
      "pxor %%mm7,%%mm7\n"
      "1:\n"
      "movq (%0),%%mm2\n"       /* mm2 = pix[0-7] */
      "movq 8(%0),%%mm3\n"      /* mm3 = pix[8-15] */

      "movq %%mm2,%%mm1\n"      /* mm1 = mm2 = pix[0-7] */

      "punpckhbw %%mm0,%%mm1\n" /* mm1 = [pix4-7] */
      "punpcklbw %%mm0,%%mm2\n" /* mm2 = [pix0-3] */

      "movq %%mm3,%%mm4\n"      /* mm4 = mm3 = pix[8-15] */
      "punpckhbw %%mm0,%%mm3\n" /* mm3 = [pix12-15] */
      "punpcklbw %%mm0,%%mm4\n" /* mm4 = [pix8-11] */

      "pmaddwd %%mm1,%%mm1\n"   /* mm1 = (pix4^2+pix5^2,pix6^2+pix7^2) */
      "pmaddwd %%mm2,%%mm2\n"   /* mm2 = (pix0^2+pix1^2,pix2^2+pix3^2) */

      "pmaddwd %%mm3,%%mm3\n"
      "pmaddwd %%mm4,%%mm4\n"

      "paddd %%mm1,%%mm2\n"     /* mm2 = (pix0^2+pix1^2+pix4^2+pix5^2,
                                          pix2^2+pix3^2+pix6^2+pix7^2) */
      "paddd %%mm3,%%mm4\n"
      "paddd %%mm2,%%mm7\n"

      "add %2, %0\n"
      "paddd %%mm4,%%mm7\n"
      "dec %%ecx\n"
      "jnz 1b\n"

      "movq %%mm7,%%mm1\n"
      "psrlq $32, %%mm7\n"      /* shift hi dword to lo */
      "paddd %%mm7,%%mm1\n"
      "movd %%mm1,%1\n"
      : "+r" (pix), "=r"(tmp) : "r" ((x86_reg)line_size) : "%ecx" );
    return tmp;
}
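
/* sse8/sse16: sum of squared differences between two blocks. The absolute
 * difference |a-b| is formed branch-free by OR-ing the two saturating
 * subtractions psubusb(a,b) and psubusb(b,a); the bytes are then widened
 * to words and squared/summed with pmaddwd. */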
static int sse8_mmx(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {
    int tmp;
  __asm__ volatile (
      "movl %4,%%ecx\n"
      "shr $1,%%ecx\n"
      "pxor %%mm0,%%mm0\n"      /* mm0 = 0 */
      "pxor %%mm7,%%mm7\n"      /* mm7 holds the sum */
      "1:\n"
      "movq (%0),%%mm1\n"       /* mm1 = pix1[0][0-7] */
      "movq (%1),%%mm2\n"       /* mm2 = pix2[0][0-7] */
      "movq (%0,%3),%%mm3\n"    /* mm3 = pix1[1][0-7] */
      "movq (%1,%3),%%mm4\n"    /* mm4 = pix2[1][0-7] */

      /* todo: mm1-mm2, mm3-mm4 */
      /* algo: subtract mm1 from mm2 with saturation and vice versa */
      /*       OR the results to get absolute difference */
      "movq %%mm1,%%mm5\n"
      "movq %%mm3,%%mm6\n"
      "psubusb %%mm2,%%mm1\n"
      "psubusb %%mm4,%%mm3\n"
      "psubusb %%mm5,%%mm2\n"
      "psubusb %%mm6,%%mm4\n"

      "por %%mm1,%%mm2\n"
      "por %%mm3,%%mm4\n"

      /* now convert to 16-bit vectors so we can square them */
      "movq %%mm2,%%mm1\n"
      "movq %%mm4,%%mm3\n"

      "punpckhbw %%mm0,%%mm2\n"
      "punpckhbw %%mm0,%%mm4\n"
      "punpcklbw %%mm0,%%mm1\n" /* mm1 now spread over (mm1,mm2) */
      "punpcklbw %%mm0,%%mm3\n" /* mm4 now spread over (mm3,mm4) */

      "pmaddwd %%mm2,%%mm2\n"
      "pmaddwd %%mm4,%%mm4\n"
      "pmaddwd %%mm1,%%mm1\n"
      "pmaddwd %%mm3,%%mm3\n"

      "lea (%0,%3,2), %0\n"     /* pix1 += 2*line_size */
      "lea (%1,%3,2), %1\n"     /* pix2 += 2*line_size */

      "paddd %%mm2,%%mm1\n"
      "paddd %%mm4,%%mm3\n"
      "paddd %%mm1,%%mm7\n"
      "paddd %%mm3,%%mm7\n"

      "decl %%ecx\n"
      "jnz 1b\n"

      "movq %%mm7,%%mm1\n"
      "psrlq $32, %%mm7\n"      /* shift hi dword to lo */
      "paddd %%mm7,%%mm1\n"
      "movd %%mm1,%2\n"
      : "+r" (pix1), "+r" (pix2), "=r"(tmp)
      : "r" ((x86_reg)line_size) , "m" (h)
      : "%ecx");
    return tmp;
}

static int sse16_mmx(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {
    int tmp;
  __asm__ volatile (
      "movl %4,%%ecx\n"
      "pxor %%mm0,%%mm0\n"      /* mm0 = 0 */
      "pxor %%mm7,%%mm7\n"      /* mm7 holds the sum */
      "1:\n"
      "movq (%0),%%mm1\n"       /* mm1 = pix1[0-7] */
      "movq (%1),%%mm2\n"       /* mm2 = pix2[0-7] */
      "movq 8(%0),%%mm3\n"      /* mm3 = pix1[8-15] */
      "movq 8(%1),%%mm4\n"      /* mm4 = pix2[8-15] */

      /* todo: mm1-mm2, mm3-mm4 */
      /* algo: subtract mm1 from mm2 with saturation and vice versa */
      /*       OR the results to get absolute difference */
      "movq %%mm1,%%mm5\n"
      "movq %%mm3,%%mm6\n"
      "psubusb %%mm2,%%mm1\n"
      "psubusb %%mm4,%%mm3\n"
      "psubusb %%mm5,%%mm2\n"
      "psubusb %%mm6,%%mm4\n"

      "por %%mm1,%%mm2\n"
      "por %%mm3,%%mm4\n"

      /* now convert to 16-bit vectors so we can square them */
      "movq %%mm2,%%mm1\n"
      "movq %%mm4,%%mm3\n"

      "punpckhbw %%mm0,%%mm2\n"
      "punpckhbw %%mm0,%%mm4\n"
      "punpcklbw %%mm0,%%mm1\n" /* mm1 now spread over (mm1,mm2) */
      "punpcklbw %%mm0,%%mm3\n" /* mm4 now spread over (mm3,mm4) */

      "pmaddwd %%mm2,%%mm2\n"
      "pmaddwd %%mm4,%%mm4\n"
      "pmaddwd %%mm1,%%mm1\n"
      "pmaddwd %%mm3,%%mm3\n"

      "add %3,%0\n"
      "add %3,%1\n"

      "paddd %%mm2,%%mm1\n"
      "paddd %%mm4,%%mm3\n"
      "paddd %%mm1,%%mm7\n"
      "paddd %%mm3,%%mm7\n"

      "decl %%ecx\n"
      "jnz 1b\n"

      "movq %%mm7,%%mm1\n"
      "psrlq $32, %%mm7\n"      /* shift hi dword to lo */
      "paddd %%mm7,%%mm1\n"
      "movd %%mm1,%2\n"
      : "+r" (pix1), "+r" (pix2), "=r"(tmp)
      : "r" ((x86_reg)line_size) , "m" (h)
      : "%ecx");
    return tmp;
}

static int sse16_sse2(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {
    int tmp;
  __asm__ volatile (
      "shr $1,%2\n"
      "pxor %%xmm0,%%xmm0\n"    /* mm0 = 0 */
      "pxor %%xmm7,%%xmm7\n"    /* mm7 holds the sum */
      "1:\n"
      "movdqu (%0),%%xmm1\n"    /* mm1 = pix1[0][0-15] */
      "movdqu (%1),%%xmm2\n"    /* mm2 = pix2[0][0-15] */
      "movdqu (%0,%4),%%xmm3\n" /* mm3 = pix1[1][0-15] */
      "movdqu (%1,%4),%%xmm4\n" /* mm4 = pix2[1][0-15] */

      /* todo: mm1-mm2, mm3-mm4 */
      /* algo: subtract mm1 from mm2 with saturation and vice versa */
      /*       OR the results to get absolute difference */
      "movdqa %%xmm1,%%xmm5\n"
      "movdqa %%xmm3,%%xmm6\n"
      "psubusb %%xmm2,%%xmm1\n"
      "psubusb %%xmm4,%%xmm3\n"
      "psubusb %%xmm5,%%xmm2\n"
      "psubusb %%xmm6,%%xmm4\n"

      "por %%xmm1,%%xmm2\n"
      "por %%xmm3,%%xmm4\n"

      /* now convert to 16-bit vectors so we can square them */
      "movdqa %%xmm2,%%xmm1\n"
      "movdqa %%xmm4,%%xmm3\n"

      "punpckhbw %%xmm0,%%xmm2\n"
      "punpckhbw %%xmm0,%%xmm4\n"
      "punpcklbw %%xmm0,%%xmm1\n"  /* mm1 now spread over (mm1,mm2) */
      "punpcklbw %%xmm0,%%xmm3\n"  /* mm4 now spread over (mm3,mm4) */

      "pmaddwd %%xmm2,%%xmm2\n"
      "pmaddwd %%xmm4,%%xmm4\n"
      "pmaddwd %%xmm1,%%xmm1\n"
      "pmaddwd %%xmm3,%%xmm3\n"

      "lea (%0,%4,2), %0\n"        /* pix1 += 2*line_size */
      "lea (%1,%4,2), %1\n"        /* pix2 += 2*line_size */

      "paddd %%xmm2,%%xmm1\n"
      "paddd %%xmm4,%%xmm3\n"
      "paddd %%xmm1,%%xmm7\n"
      "paddd %%xmm3,%%xmm7\n"

      "decl %2\n"
      "jnz 1b\n"

      "movdqa %%xmm7,%%xmm1\n"
      "psrldq $8, %%xmm7\n"        /* shift hi qword to lo */
      "paddd %%xmm1,%%xmm7\n"
      "movdqa %%xmm7,%%xmm1\n"
      "psrldq $4, %%xmm7\n"        /* shift hi dword to lo */
      "paddd %%xmm1,%%xmm7\n"
      "movd %%xmm7,%3\n"
      : "+r" (pix1), "+r" (pix2), "+r"(h), "=r"(tmp)
      : "r" ((x86_reg)line_size));
    return tmp;
}
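
/* hf_noise8/hf_noise16 estimate high-frequency content: each row is reduced
 * to horizontal first differences, the difference vectors of consecutive rows
 * are subtracted, and the absolute values of those second-order differences
 * are accumulated. nsse below uses the noise mismatch between the two blocks
 * to bias the plain SSE score. */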
static int hf_noise8_mmx(uint8_t * pix1, int line_size, int h) {
    int tmp;
  __asm__ volatile (
      "movl %3,%%ecx\n"
      "pxor %%mm7,%%mm7\n"
      "pxor %%mm6,%%mm6\n"

      "movq (%0),%%mm0\n"
      "movq %%mm0, %%mm1\n"
      "psllq $8, %%mm0\n"
      "psrlq $8, %%mm1\n"
      "psrlq $8, %%mm0\n"
      "movq %%mm0, %%mm2\n"
      "movq %%mm1, %%mm3\n"
      "punpcklbw %%mm7,%%mm0\n"
      "punpcklbw %%mm7,%%mm1\n"
      "punpckhbw %%mm7,%%mm2\n"
      "punpckhbw %%mm7,%%mm3\n"
      "psubw %%mm1, %%mm0\n"
      "psubw %%mm3, %%mm2\n"

      "add %2,%0\n"

      "movq (%0),%%mm4\n"
      "movq %%mm4, %%mm1\n"
      "psllq $8, %%mm4\n"
      "psrlq $8, %%mm1\n"
      "psrlq $8, %%mm4\n"
      "movq %%mm4, %%mm5\n"
      "movq %%mm1, %%mm3\n"
      "punpcklbw %%mm7,%%mm4\n"
      "punpcklbw %%mm7,%%mm1\n"
      "punpckhbw %%mm7,%%mm5\n"
      "punpckhbw %%mm7,%%mm3\n"
      "psubw %%mm1, %%mm4\n"
      "psubw %%mm3, %%mm5\n"
      "psubw %%mm4, %%mm0\n"
      "psubw %%mm5, %%mm2\n"
      "pxor %%mm3, %%mm3\n"
      "pxor %%mm1, %%mm1\n"
      "pcmpgtw %%mm0, %%mm3\n\t"
      "pcmpgtw %%mm2, %%mm1\n\t"
      "pxor %%mm3, %%mm0\n"
      "pxor %%mm1, %%mm2\n"
      "psubw %%mm3, %%mm0\n"
      "psubw %%mm1, %%mm2\n"
      "paddw %%mm0, %%mm2\n"
      "paddw %%mm2, %%mm6\n"

      "add %2,%0\n"
      "1:\n"

      "movq (%0),%%mm0\n"
      "movq %%mm0, %%mm1\n"
      "psllq $8, %%mm0\n"
      "psrlq $8, %%mm1\n"
      "psrlq $8, %%mm0\n"
      "movq %%mm0, %%mm2\n"
      "movq %%mm1, %%mm3\n"
      "punpcklbw %%mm7,%%mm0\n"
      "punpcklbw %%mm7,%%mm1\n"
      "punpckhbw %%mm7,%%mm2\n"
      "punpckhbw %%mm7,%%mm3\n"
      "psubw %%mm1, %%mm0\n"
      "psubw %%mm3, %%mm2\n"
      "psubw %%mm0, %%mm4\n"
      "psubw %%mm2, %%mm5\n"
      "pxor %%mm3, %%mm3\n"
      "pxor %%mm1, %%mm1\n"
      "pcmpgtw %%mm4, %%mm3\n\t"
      "pcmpgtw %%mm5, %%mm1\n\t"
      "pxor %%mm3, %%mm4\n"
      "pxor %%mm1, %%mm5\n"
      "psubw %%mm3, %%mm4\n"
      "psubw %%mm1, %%mm5\n"
      "paddw %%mm4, %%mm5\n"
      "paddw %%mm5, %%mm6\n"

      "add %2,%0\n"

      "movq (%0),%%mm4\n"
      "movq %%mm4, %%mm1\n"
      "psllq $8, %%mm4\n"
      "psrlq $8, %%mm1\n"
      "psrlq $8, %%mm4\n"
      "movq %%mm4, %%mm5\n"
      "movq %%mm1, %%mm3\n"
      "punpcklbw %%mm7,%%mm4\n"
      "punpcklbw %%mm7,%%mm1\n"
      "punpckhbw %%mm7,%%mm5\n"
      "punpckhbw %%mm7,%%mm3\n"
      "psubw %%mm1, %%mm4\n"
      "psubw %%mm3, %%mm5\n"
      "psubw %%mm4, %%mm0\n"
      "psubw %%mm5, %%mm2\n"
      "pxor %%mm3, %%mm3\n"
      "pxor %%mm1, %%mm1\n"
      "pcmpgtw %%mm0, %%mm3\n\t"
      "pcmpgtw %%mm2, %%mm1\n\t"
      "pxor %%mm3, %%mm0\n"
      "pxor %%mm1, %%mm2\n"
      "psubw %%mm3, %%mm0\n"
      "psubw %%mm1, %%mm2\n"
      "paddw %%mm0, %%mm2\n"
      "paddw %%mm2, %%mm6\n"

      "add %2,%0\n"
      "subl $2, %%ecx\n"
      " jnz 1b\n"

      "movq %%mm6, %%mm0\n"
      "punpcklwd %%mm7,%%mm0\n"
      "punpckhwd %%mm7,%%mm6\n"
      "paddd %%mm0, %%mm6\n"

      "movq %%mm6,%%mm0\n"
      "psrlq $32, %%mm6\n"
      "paddd %%mm6,%%mm0\n"
      "movd %%mm0,%1\n"
      : "+r" (pix1), "=r"(tmp)
      : "r" ((x86_reg)line_size) , "g" (h-2)
      : "%ecx");
      return tmp;
}

static int hf_noise16_mmx(uint8_t * pix1, int line_size, int h) {
    int tmp;
    uint8_t * pix= pix1;
  __asm__ volatile (
      "movl %3,%%ecx\n"
      "pxor %%mm7,%%mm7\n"
      "pxor %%mm6,%%mm6\n"

      "movq (%0),%%mm0\n"
      "movq 1(%0),%%mm1\n"
      "movq %%mm0, %%mm2\n"
      "movq %%mm1, %%mm3\n"
      "punpcklbw %%mm7,%%mm0\n"
      "punpcklbw %%mm7,%%mm1\n"
      "punpckhbw %%mm7,%%mm2\n"
      "punpckhbw %%mm7,%%mm3\n"
      "psubw %%mm1, %%mm0\n"
      "psubw %%mm3, %%mm2\n"

      "add %2,%0\n"

      "movq (%0),%%mm4\n"
      "movq 1(%0),%%mm1\n"
      "movq %%mm4, %%mm5\n"
      "movq %%mm1, %%mm3\n"
      "punpcklbw %%mm7,%%mm4\n"
      "punpcklbw %%mm7,%%mm1\n"
      "punpckhbw %%mm7,%%mm5\n"
      "punpckhbw %%mm7,%%mm3\n"
      "psubw %%mm1, %%mm4\n"
      "psubw %%mm3, %%mm5\n"
      "psubw %%mm4, %%mm0\n"
      "psubw %%mm5, %%mm2\n"
      "pxor %%mm3, %%mm3\n"
      "pxor %%mm1, %%mm1\n"
      "pcmpgtw %%mm0, %%mm3\n\t"
      "pcmpgtw %%mm2, %%mm1\n\t"
      "pxor %%mm3, %%mm0\n"
      "pxor %%mm1, %%mm2\n"
      "psubw %%mm3, %%mm0\n"
      "psubw %%mm1, %%mm2\n"
      "paddw %%mm0, %%mm2\n"
      "paddw %%mm2, %%mm6\n"

      "add %2,%0\n"
      "1:\n"

      "movq (%0),%%mm0\n"
      "movq 1(%0),%%mm1\n"
      "movq %%mm0, %%mm2\n"
      "movq %%mm1, %%mm3\n"
      "punpcklbw %%mm7,%%mm0\n"
      "punpcklbw %%mm7,%%mm1\n"
      "punpckhbw %%mm7,%%mm2\n"
      "punpckhbw %%mm7,%%mm3\n"
      "psubw %%mm1, %%mm0\n"
      "psubw %%mm3, %%mm2\n"
      "psubw %%mm0, %%mm4\n"
      "psubw %%mm2, %%mm5\n"
      "pxor %%mm3, %%mm3\n"
      "pxor %%mm1, %%mm1\n"
      "pcmpgtw %%mm4, %%mm3\n\t"
      "pcmpgtw %%mm5, %%mm1\n\t"
      "pxor %%mm3, %%mm4\n"
      "pxor %%mm1, %%mm5\n"
      "psubw %%mm3, %%mm4\n"
      "psubw %%mm1, %%mm5\n"
      "paddw %%mm4, %%mm5\n"
      "paddw %%mm5, %%mm6\n"

      "add %2,%0\n"

      "movq (%0),%%mm4\n"
      "movq 1(%0),%%mm1\n"
      "movq %%mm4, %%mm5\n"
      "movq %%mm1, %%mm3\n"
      "punpcklbw %%mm7,%%mm4\n"
      "punpcklbw %%mm7,%%mm1\n"
      "punpckhbw %%mm7,%%mm5\n"
      "punpckhbw %%mm7,%%mm3\n"
      "psubw %%mm1, %%mm4\n"
      "psubw %%mm3, %%mm5\n"
      "psubw %%mm4, %%mm0\n"
      "psubw %%mm5, %%mm2\n"
      "pxor %%mm3, %%mm3\n"
      "pxor %%mm1, %%mm1\n"
      "pcmpgtw %%mm0, %%mm3\n\t"
      "pcmpgtw %%mm2, %%mm1\n\t"
      "pxor %%mm3, %%mm0\n"
      "pxor %%mm1, %%mm2\n"
      "psubw %%mm3, %%mm0\n"
      "psubw %%mm1, %%mm2\n"
      "paddw %%mm0, %%mm2\n"
      "paddw %%mm2, %%mm6\n"

      "add %2,%0\n"
      "subl $2, %%ecx\n"
      " jnz 1b\n"

      "movq %%mm6, %%mm0\n"
      "punpcklwd %%mm7,%%mm0\n"
      "punpckhwd %%mm7,%%mm6\n"
      "paddd %%mm0, %%mm6\n"

      "movq %%mm6,%%mm0\n"
      "psrlq $32, %%mm6\n"
      "paddd %%mm6,%%mm0\n"
      "movd %%mm0,%1\n"
      : "+r" (pix1), "=r"(tmp)
      : "r" ((x86_reg)line_size) , "g" (h-2)
      : "%ecx");
      return tmp + hf_noise8_mmx(pix+8, line_size, h);
}

static int nsse16_mmx(void *p, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {
    MpegEncContext *c = p;
    int score1, score2;

    if(c) score1 = c->dsp.sse[0](c, pix1, pix2, line_size, h);
    else  score1 = sse16_mmx(c, pix1, pix2, line_size, h);
    score2= hf_noise16_mmx(pix1, line_size, h) - hf_noise16_mmx(pix2, line_size, h);

    if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight;
    else  return score1 + FFABS(score2)*8;
}

static int nsse8_mmx(void *p, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {
    MpegEncContext *c = p;
    int score1= sse8_mmx(c, pix1, pix2, line_size, h);
    int score2= hf_noise8_mmx(pix1, line_size, h) - hf_noise8_mmx(pix2, line_size, h);

    if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight;
    else  return score1 + FFABS(score2)*8;
}
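
/* vsad_intra16 sums the absolute differences between successive rows of one
 * block; vsad16 does the same on the residual pix1 - pix2, XOR-ing the signed
 * byte difference with 0x80 to map it to a biased unsigned value first. The
 * MMX2 variants replace the unpack/add ladder with psadbw. */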
static int vsad_intra16_mmx(void *v, uint8_t * pix, uint8_t * dummy, int line_size, int h) {
    int tmp;

    assert( (((int)pix) & 7) == 0);
    assert((line_size &7) ==0);

#define SUM(in0, in1, out0, out1) \
      "movq (%0), %%mm2\n"\
      "movq 8(%0), %%mm3\n"\
      "add %2,%0\n"\
      "movq %%mm2, " #out0 "\n"\
      "movq %%mm3, " #out1 "\n"\
      "psubusb " #in0 ", %%mm2\n"\
      "psubusb " #in1 ", %%mm3\n"\
      "psubusb " #out0 ", " #in0 "\n"\
      "psubusb " #out1 ", " #in1 "\n"\
      "por %%mm2, " #in0 "\n"\
      "por %%mm3, " #in1 "\n"\
      "movq " #in0 ", %%mm2\n"\
      "movq " #in1 ", %%mm3\n"\
      "punpcklbw %%mm7, " #in0 "\n"\
      "punpcklbw %%mm7, " #in1 "\n"\
      "punpckhbw %%mm7, %%mm2\n"\
      "punpckhbw %%mm7, %%mm3\n"\
      "paddw " #in1 ", " #in0 "\n"\
      "paddw %%mm3, %%mm2\n"\
      "paddw %%mm2, " #in0 "\n"\
      "paddw " #in0 ", %%mm6\n"

  __asm__ volatile (
      "movl %3,%%ecx\n"
      "pxor %%mm6,%%mm6\n"
      "pxor %%mm7,%%mm7\n"
      "movq (%0),%%mm0\n"
      "movq 8(%0),%%mm1\n"
      "add %2,%0\n"
      "jmp 2f\n"
      "1:\n"

      SUM(%%mm4, %%mm5, %%mm0, %%mm1)
      "2:\n"
      SUM(%%mm0, %%mm1, %%mm4, %%mm5)

      "subl $2, %%ecx\n"
      "jnz 1b\n"

      "movq %%mm6,%%mm0\n"
      "psrlq $32, %%mm6\n"
      "paddw %%mm6,%%mm0\n"
      "movq %%mm0,%%mm6\n"
      "psrlq $16, %%mm0\n"
      "paddw %%mm6,%%mm0\n"
      "movd %%mm0,%1\n"
      : "+r" (pix), "=r"(tmp)
      : "r" ((x86_reg)line_size) , "m" (h)
      : "%ecx");
    return tmp & 0xFFFF;
}
#undef SUM

static int vsad_intra16_mmx2(void *v, uint8_t * pix, uint8_t * dummy, int line_size, int h) {
    int tmp;

    assert( (((int)pix) & 7) == 0);
    assert((line_size &7) ==0);

#define SUM(in0, in1, out0, out1) \
      "movq (%0), " #out0 "\n"\
      "movq 8(%0), " #out1 "\n"\
      "add %2,%0\n"\
      "psadbw " #out0 ", " #in0 "\n"\
      "psadbw " #out1 ", " #in1 "\n"\
      "paddw " #in1 ", " #in0 "\n"\
      "paddw " #in0 ", %%mm6\n"

  __asm__ volatile (
      "movl %3,%%ecx\n"
      "pxor %%mm6,%%mm6\n"
      "pxor %%mm7,%%mm7\n"
      "movq (%0),%%mm0\n"
      "movq 8(%0),%%mm1\n"
      "add %2,%0\n"
      "jmp 2f\n"
      "1:\n"

      SUM(%%mm4, %%mm5, %%mm0, %%mm1)
      "2:\n"
      SUM(%%mm0, %%mm1, %%mm4, %%mm5)

      "subl $2, %%ecx\n"
      "jnz 1b\n"

      "movd %%mm6,%1\n"
      : "+r" (pix), "=r"(tmp)
      : "r" ((x86_reg)line_size) , "m" (h)
      : "%ecx");
    return tmp;
}
#undef SUM

static int vsad16_mmx(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {
    int tmp;

    assert( (((int)pix1) & 7) == 0);
    assert( (((int)pix2) & 7) == 0);
    assert((line_size &7) ==0);

#define SUM(in0, in1, out0, out1) \
      "movq (%0),%%mm2\n"\
      "movq (%1)," #out0 "\n"\
      "movq 8(%0),%%mm3\n"\
      "movq 8(%1)," #out1 "\n"\
      "add %3,%0\n"\
      "add %3,%1\n"\
      "psubb " #out0 ", %%mm2\n"\
      "psubb " #out1 ", %%mm3\n"\
      "pxor %%mm7, %%mm2\n"\
      "pxor %%mm7, %%mm3\n"\
      "movq %%mm2, " #out0 "\n"\
      "movq %%mm3, " #out1 "\n"\
      "psubusb " #in0 ", %%mm2\n"\
      "psubusb " #in1 ", %%mm3\n"\
      "psubusb " #out0 ", " #in0 "\n"\
      "psubusb " #out1 ", " #in1 "\n"\
      "por %%mm2, " #in0 "\n"\
      "por %%mm3, " #in1 "\n"\
      "movq " #in0 ", %%mm2\n"\
      "movq " #in1 ", %%mm3\n"\
      "punpcklbw %%mm7, " #in0 "\n"\
      "punpcklbw %%mm7, " #in1 "\n"\
      "punpckhbw %%mm7, %%mm2\n"\
      "punpckhbw %%mm7, %%mm3\n"\
      "paddw " #in1 ", " #in0 "\n"\
      "paddw %%mm3, %%mm2\n"\
      "paddw %%mm2, " #in0 "\n"\
      "paddw " #in0 ", %%mm6\n"

  __asm__ volatile (
      "movl %4,%%ecx\n"
      "pxor %%mm6,%%mm6\n"
      "pcmpeqw %%mm7,%%mm7\n"
      "psllw $15, %%mm7\n"
      "packsswb %%mm7, %%mm7\n"
      "movq (%0),%%mm0\n"
      "movq (%1),%%mm2\n"
      "movq 8(%0),%%mm1\n"
      "movq 8(%1),%%mm3\n"
      "add %3,%0\n"
      "add %3,%1\n"
      "psubb %%mm2, %%mm0\n"
      "psubb %%mm3, %%mm1\n"
      "pxor %%mm7, %%mm0\n"
      "pxor %%mm7, %%mm1\n"
      "jmp 2f\n"
      "1:\n"

      SUM(%%mm4, %%mm5, %%mm0, %%mm1)
      "2:\n"
      SUM(%%mm0, %%mm1, %%mm4, %%mm5)

      "subl $2, %%ecx\n"
      "jnz 1b\n"

      "movq %%mm6,%%mm0\n"
      "psrlq $32, %%mm6\n"
      "paddw %%mm6,%%mm0\n"
      "movq %%mm0,%%mm6\n"
      "psrlq $16, %%mm0\n"
      "paddw %%mm6,%%mm0\n"
      "movd %%mm0,%2\n"
      : "+r" (pix1), "+r" (pix2), "=r"(tmp)
      : "r" ((x86_reg)line_size) , "m" (h)
      : "%ecx");
    return tmp & 0x7FFF;
}
#undef SUM

static int vsad16_mmx2(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {
    int tmp;

    assert( (((int)pix1) & 7) == 0);
    assert( (((int)pix2) & 7) == 0);
    assert((line_size &7) ==0);

#define SUM(in0, in1, out0, out1) \
      "movq (%0)," #out0 "\n"\
      "movq (%1),%%mm2\n"\
      "movq 8(%0)," #out1 "\n"\
      "movq 8(%1),%%mm3\n"\
      "add %3,%0\n"\
      "add %3,%1\n"\
      "psubb %%mm2, " #out0 "\n"\
      "psubb %%mm3, " #out1 "\n"\
      "pxor %%mm7, " #out0 "\n"\
      "pxor %%mm7, " #out1 "\n"\
      "psadbw " #out0 ", " #in0 "\n"\
      "psadbw " #out1 ", " #in1 "\n"\
      "paddw " #in1 ", " #in0 "\n"\
      "paddw " #in0 ", %%mm6\n"

  __asm__ volatile (
      "movl %4,%%ecx\n"
      "pxor %%mm6,%%mm6\n"
      "pcmpeqw %%mm7,%%mm7\n"
      "psllw $15, %%mm7\n"
      "packsswb %%mm7, %%mm7\n"
      "movq (%0),%%mm0\n"
      "movq (%1),%%mm2\n"
      "movq 8(%0),%%mm1\n"
      "movq 8(%1),%%mm3\n"
      "add %3,%0\n"
      "add %3,%1\n"
      "psubb %%mm2, %%mm0\n"
      "psubb %%mm3, %%mm1\n"
      "pxor %%mm7, %%mm0\n"
      "pxor %%mm7, %%mm1\n"
      "jmp 2f\n"
      "1:\n"

      SUM(%%mm4, %%mm5, %%mm0, %%mm1)
      "2:\n"
      SUM(%%mm0, %%mm1, %%mm4, %%mm5)

      "subl $2, %%ecx\n"
      "jnz 1b\n"

      "movd %%mm6,%2\n"
      : "+r" (pix1), "+r" (pix2), "=r"(tmp)
      : "r" ((x86_reg)line_size) , "m" (h)
      : "%ecx");
    return tmp;
}
#undef SUM

static void diff_bytes_mmx(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
    x86_reg i=0;
    __asm__ volatile(
        "1:                             \n\t"
        "movq  (%2, %0), %%mm0          \n\t"
        "movq  (%1, %0), %%mm1          \n\t"
        "psubb %%mm0, %%mm1             \n\t"
        "movq %%mm1, (%3, %0)           \n\t"
        "movq 8(%2, %0), %%mm0          \n\t"
        "movq 8(%1, %0), %%mm1          \n\t"
        "psubb %%mm0, %%mm1             \n\t"
        "movq %%mm1, 8(%3, %0)          \n\t"
        "add $16, %0                    \n\t"
        "cmp %4, %0                     \n\t"
        " jb 1b                         \n\t"
        : "+r" (i)
        : "r"(src1), "r"(src2), "r"(dst), "r"((x86_reg)w-15)
    );
    for(; i<w; i++)
        dst[i+0] = src1[i+0]-src2[i+0];
}

static void sub_hfyu_median_prediction_mmx2(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w, int *left, int *left_top){
    x86_reg i=0;
    uint8_t l, lt;

    __asm__ volatile(
        "1:                             \n\t"
        "movq  -1(%1, %0), %%mm0        \n\t" // LT
        "movq  (%1, %0), %%mm1          \n\t" // T
        "movq  -1(%2, %0), %%mm2        \n\t" // L
        "movq  (%2, %0), %%mm3          \n\t" // X
        "movq %%mm2, %%mm4              \n\t" // L
        "psubb %%mm0, %%mm2             \n\t"
        "paddb %%mm1, %%mm2             \n\t" // L + T - LT
        "movq %%mm4, %%mm5              \n\t" // L
        "pmaxub %%mm1, %%mm4            \n\t" // max(T, L)
        "pminub %%mm5, %%mm1            \n\t" // min(T, L)
        "pminub %%mm2, %%mm4            \n\t"
        "pmaxub %%mm1, %%mm4            \n\t"
        "psubb %%mm4, %%mm3             \n\t" // dst - pred
        "movq %%mm3, (%3, %0)           \n\t"
        "add $8, %0                     \n\t"
        "cmp %4, %0                     \n\t"
        " jb 1b                         \n\t"
        : "+r" (i)
        : "r"(src1), "r"(src2), "r"(dst), "r"((x86_reg)w)
    );

    l= *left;
    lt= *left_top;

    dst[0]= src2[0] - mid_pred(l, src1[0], (l + src1[0] - lt)&0xFF);

    *left_top= src1[w-1];
    *left    = src2[w-1];
}
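
/* Scalar view of the median predictor, for clarity (hypothetical sketch):
 *
 *     for (i = 0; i < w; i++) {
 *         int L = src2[i-1], T = src1[i], LT = src1[i-1];
 *         dst[i] = src2[i] - mid_pred(L, T, L + T - LT);
 *     }
 *
 * The asm realizes mid_pred (the median of its three arguments) via the
 * pmin/pmax sequence instead of a function call. */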

#define DIFF_PIXELS_1(m,a,t,p1,p2)\
    "mov"#m" "#p1", "#a"              \n\t"\
    "mov"#m" "#p2", "#t"              \n\t"\
    "punpcklbw "#a", "#t"             \n\t"\
    "punpcklbw "#a", "#a"             \n\t"\
    "psubw     "#t", "#a"             \n\t"\

#define DIFF_PIXELS_8(m0,m1,mm,p1,p2,stride,temp) {\
    uint8_t *p1b=p1, *p2b=p2;\
    __asm__ volatile(\
        DIFF_PIXELS_1(m0, mm##0, mm##7, (%1), (%2))\
        DIFF_PIXELS_1(m0, mm##1, mm##7, (%1,%3), (%2,%3))\
        DIFF_PIXELS_1(m0, mm##2, mm##7, (%1,%3,2), (%2,%3,2))\
        "add %4, %1                   \n\t"\
        "add %4, %2                   \n\t"\
        DIFF_PIXELS_1(m0, mm##3, mm##7, (%1), (%2))\
        DIFF_PIXELS_1(m0, mm##4, mm##7, (%1,%3), (%2,%3))\
        DIFF_PIXELS_1(m0, mm##5, mm##7, (%1,%3,2), (%2,%3,2))\
        DIFF_PIXELS_1(m0, mm##6, mm##7, (%1,%4), (%2,%4))\
        "mov"#m1" "#mm"0, %0          \n\t"\
        DIFF_PIXELS_1(m0, mm##7, mm##0, (%1,%3,4), (%2,%3,4))\
        "mov"#m1" %0, "#mm"0          \n\t"\
        : "+m"(temp), "+r"(p1b), "+r"(p2b)\
        : "r"((x86_reg)stride), "r"((x86_reg)stride*3)\
    );\
}
    //the "+m"(temp) is needed as gcc 2.95 sometimes fails to compile "=m"(temp)

#define DIFF_PIXELS_4x8(p1,p2,stride,temp) DIFF_PIXELS_8(d, q,   %%mm,  p1, p2, stride, temp)
#define DIFF_PIXELS_8x8(p1,p2,stride,temp) DIFF_PIXELS_8(q, dqa, %%xmm, p1, p2, stride, temp)

#define LBUTTERFLY2(a1,b1,a2,b2)\
    "paddw " #b1 ", " #a1 "           \n\t"\
    "paddw " #b2 ", " #a2 "           \n\t"\
    "paddw " #b1 ", " #b1 "           \n\t"\
    "paddw " #b2 ", " #b2 "           \n\t"\
    "psubw " #a1 ", " #b1 "           \n\t"\
    "psubw " #a2 ", " #b2 "           \n\t"

#define HADAMARD8(m0, m1, m2, m3, m4, m5, m6, m7)\
        LBUTTERFLY2(m0, m1, m2, m3)\
        LBUTTERFLY2(m4, m5, m6, m7)\
        LBUTTERFLY2(m0, m2, m1, m3)\
        LBUTTERFLY2(m4, m6, m5, m7)\
        LBUTTERFLY2(m0, m4, m1, m5)\
        LBUTTERFLY2(m2, m6, m3, m7)\

#define HADAMARD48 HADAMARD8(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm6, %%mm7)
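
/* LBUTTERFLY2 performs two butterflies at once: a becomes a+b and b becomes
 * b-a (computed as 2b - (a+b)). Three rounds of butterflies over eight
 * registers, as composed by HADAMARD8, give an 8-point Hadamard transform. */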

#define MMABS_MMX(a,z)\
    "pxor " #z ", " #z "              \n\t"\
    "pcmpgtw " #a ", " #z "           \n\t"\
    "pxor " #z ", " #a "              \n\t"\
    "psubw " #z ", " #a "             \n\t"

#define MMABS_MMX2(a,z)\
    "pxor " #z ", " #z "              \n\t"\
    "psubw " #a ", " #z "             \n\t"\
    "pmaxsw " #z ", " #a "            \n\t"

#define MMABS_SSSE3(a,z)\
    "pabsw " #a ", " #a "             \n\t"

#define MMABS_SUM(a,z, sum)\
    MMABS(a,z)\
    "paddusw " #a ", " #sum "         \n\t"

#define MMABS_SUM_8x8_NOSPILL\
    MMABS(%%xmm0, %%xmm8)\
    MMABS(%%xmm1, %%xmm9)\
    MMABS_SUM(%%xmm2, %%xmm8, %%xmm0)\
    MMABS_SUM(%%xmm3, %%xmm9, %%xmm1)\
    MMABS_SUM(%%xmm4, %%xmm8, %%xmm0)\
    MMABS_SUM(%%xmm5, %%xmm9, %%xmm1)\
    MMABS_SUM(%%xmm6, %%xmm8, %%xmm0)\
    MMABS_SUM(%%xmm7, %%xmm9, %%xmm1)\
    "paddusw %%xmm1, %%xmm0           \n\t"

#ifdef ARCH_X86_64
#define MMABS_SUM_8x8_SSE2 MMABS_SUM_8x8_NOSPILL
#else
#define MMABS_SUM_8x8_SSE2\
    "movdqa %%xmm7, (%1)              \n\t"\
    MMABS(%%xmm0, %%xmm7)\
    MMABS(%%xmm1, %%xmm7)\
    MMABS_SUM(%%xmm2, %%xmm7, %%xmm0)\
    MMABS_SUM(%%xmm3, %%xmm7, %%xmm1)\
    MMABS_SUM(%%xmm4, %%xmm7, %%xmm0)\
    MMABS_SUM(%%xmm5, %%xmm7, %%xmm1)\
    MMABS_SUM(%%xmm6, %%xmm7, %%xmm0)\
    "movdqa (%1), %%xmm2              \n\t"\
    MMABS_SUM(%%xmm2, %%xmm7, %%xmm1)\
    "paddusw %%xmm1, %%xmm0           \n\t"
#endif

/* FIXME: HSUM_* saturates at 64k, while an 8x8 hadamard or dct block can get up to
 * about 100k on extreme inputs. But that's very unlikely to occur in natural video,
 * and it's even more unlikely to not have any alternative mvs/modes with lower cost. */
#define HSUM_MMX(a, t, dst)\
    "movq "#a", "#t"                  \n\t"\
    "psrlq $32, "#a"                  \n\t"\
    "paddusw "#t", "#a"               \n\t"\
    "movq "#a", "#t"                  \n\t"\
    "psrlq $16, "#a"                  \n\t"\
    "paddusw "#t", "#a"               \n\t"\
    "movd "#a", "#dst"                \n\t"\

#define HSUM_MMX2(a, t, dst)\
    "pshufw $0x0E, "#a", "#t"         \n\t"\
    "paddusw "#t", "#a"               \n\t"\
    "pshufw $0x01, "#a", "#t"         \n\t"\
    "paddusw "#t", "#a"               \n\t"\
    "movd "#a", "#dst"                \n\t"\

#define HSUM_SSE2(a, t, dst)\
    "movhlps "#a", "#t"               \n\t"\
    "paddusw "#t", "#a"               \n\t"\
    "pshuflw $0x0E, "#a", "#t"        \n\t"\
    "paddusw "#t", "#a"               \n\t"\
    "pshuflw $0x01, "#a", "#t"        \n\t"\
    "paddusw "#t", "#a"               \n\t"\
    "movd "#a", "#dst"                \n\t"\

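/* hadamard8_diff (SATD): difference the two 8x8 blocks, apply a horizontal
 * and, after a transpose, vertical 8-point Hadamard transform, then sum the
 * absolute values of the transform coefficients. */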
#define HADAMARD8_DIFF_MMX(cpu) \
static int hadamard8_diff_##cpu(void *s, uint8_t *src1, uint8_t *src2, int stride, int h){\
    DECLARE_ALIGNED_8(uint64_t, temp[13]);\
    int sum;\
\
    assert(h==8);\
\
    DIFF_PIXELS_4x8(src1, src2, stride, temp[0]);\
\
    __asm__ volatile(\
        HADAMARD48\
\
        "movq %%mm7, 96(%1)             \n\t"\
\
        TRANSPOSE4(%%mm0, %%mm1, %%mm2, %%mm3, %%mm7)\
        STORE4(8,  0(%1), %%mm0, %%mm3, %%mm7, %%mm2)\
\
        "movq 96(%1), %%mm7             \n\t"\
        TRANSPOSE4(%%mm4, %%mm5, %%mm6, %%mm7, %%mm0)\
        STORE4(8, 64(%1), %%mm4, %%mm7, %%mm0, %%mm6)\
\
        : "=r" (sum)\
        : "r"(temp)\
    );\
\
    DIFF_PIXELS_4x8(src1+4, src2+4, stride, temp[4]);\
\
    __asm__ volatile(\
        HADAMARD48\
\
        "movq %%mm7, 96(%1)             \n\t"\
\
        TRANSPOSE4(%%mm0, %%mm1, %%mm2, %%mm3, %%mm7)\
        STORE4(8, 32(%1), %%mm0, %%mm3, %%mm7, %%mm2)\
\
        "movq 96(%1), %%mm7             \n\t"\
        TRANSPOSE4(%%mm4, %%mm5, %%mm6, %%mm7, %%mm0)\
        "movq %%mm7, %%mm5              \n\t"/*FIXME remove*/\
        "movq %%mm6, %%mm7              \n\t"\
        "movq %%mm0, %%mm6              \n\t"\
\
        LOAD4(8, 64(%1), %%mm0, %%mm1, %%mm2, %%mm3)\
\
        HADAMARD48\
        "movq %%mm7, 64(%1)             \n\t"\
        MMABS(%%mm0, %%mm7)\
        MMABS(%%mm1, %%mm7)\
        MMABS_SUM(%%mm2, %%mm7, %%mm0)\
        MMABS_SUM(%%mm3, %%mm7, %%mm1)\
        MMABS_SUM(%%mm4, %%mm7, %%mm0)\
        MMABS_SUM(%%mm5, %%mm7, %%mm1)\
        MMABS_SUM(%%mm6, %%mm7, %%mm0)\
        "movq 64(%1), %%mm2             \n\t"\
        MMABS_SUM(%%mm2, %%mm7, %%mm1)\
        "paddusw %%mm1, %%mm0           \n\t"\
        "movq %%mm0, 64(%1)             \n\t"\
\
        LOAD4(8,  0(%1), %%mm0, %%mm1, %%mm2, %%mm3)\
        LOAD4(8, 32(%1), %%mm4, %%mm5, %%mm6, %%mm7)\
\
        HADAMARD48\
        "movq %%mm7, (%1)               \n\t"\
        MMABS(%%mm0, %%mm7)\
        MMABS(%%mm1, %%mm7)\
        MMABS_SUM(%%mm2, %%mm7, %%mm0)\
        MMABS_SUM(%%mm3, %%mm7, %%mm1)\
        MMABS_SUM(%%mm4, %%mm7, %%mm0)\
        MMABS_SUM(%%mm5, %%mm7, %%mm1)\
        MMABS_SUM(%%mm6, %%mm7, %%mm0)\
        "movq (%1), %%mm2               \n\t"\
        MMABS_SUM(%%mm2, %%mm7, %%mm1)\
        "paddusw 64(%1), %%mm0          \n\t"\
        "paddusw %%mm1, %%mm0           \n\t"\
\
        HSUM(%%mm0, %%mm1, %0)\
\
        : "=r" (sum)\
        : "r"(temp)\
    );\
    return sum&0xFFFF;\
}\
WRAPPER8_16_SQ(hadamard8_diff_##cpu, hadamard8_diff16_##cpu)

#define HADAMARD8_DIFF_SSE2(cpu) \
static int hadamard8_diff_##cpu(void *s, uint8_t *src1, uint8_t *src2, int stride, int h){\
    DECLARE_ALIGNED_16(uint64_t, temp[4]);\
    int sum;\
\
    assert(h==8);\
\
    DIFF_PIXELS_8x8(src1, src2, stride, temp[0]);\
\
    __asm__ volatile(\
        HADAMARD8(%%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm6, %%xmm7)\
        TRANSPOSE8(%%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm6, %%xmm7, (%1))\
        HADAMARD8(%%xmm0, %%xmm5, %%xmm7, %%xmm3, %%xmm6, %%xmm4, %%xmm2, %%xmm1)\
        MMABS_SUM_8x8\
        HSUM_SSE2(%%xmm0, %%xmm1, %0)\
        : "=r" (sum)\
        : "r"(temp)\
    );\
    return sum&0xFFFF;\
}\
WRAPPER8_16_SQ(hadamard8_diff_##cpu, hadamard8_diff16_##cpu)

#define MMABS(a,z)         MMABS_MMX(a,z)
#define HSUM(a,t,dst)      HSUM_MMX(a,t,dst)
HADAMARD8_DIFF_MMX(mmx)
#undef MMABS
#undef HSUM

#define MMABS(a,z)         MMABS_MMX2(a,z)
#define MMABS_SUM_8x8      MMABS_SUM_8x8_SSE2
#define HSUM(a,t,dst)      HSUM_MMX2(a,t,dst)
HADAMARD8_DIFF_MMX(mmx2)
HADAMARD8_DIFF_SSE2(sse2)
#undef MMABS
#undef MMABS_SUM_8x8
#undef HSUM

#ifdef HAVE_SSSE3
#define MMABS(a,z)         MMABS_SSSE3(a,z)
#define MMABS_SUM_8x8      MMABS_SUM_8x8_NOSPILL
HADAMARD8_DIFF_SSE2(ssse3)
#undef MMABS
#undef MMABS_SUM_8x8
#endif

#define DCT_SAD4(m,mm,o)\
    "mov"#m" "#o"+ 0(%1), "#mm"2      \n\t"\
    "mov"#m" "#o"+16(%1), "#mm"3      \n\t"\
    "mov"#m" "#o"+32(%1), "#mm"4      \n\t"\
    "mov"#m" "#o"+48(%1), "#mm"5      \n\t"\
    MMABS_SUM(mm##2, mm##6, mm##0)\
    MMABS_SUM(mm##3, mm##7, mm##1)\
    MMABS_SUM(mm##4, mm##6, mm##0)\
    MMABS_SUM(mm##5, mm##7, mm##1)\

#define DCT_SAD_MMX\
    "pxor %%mm0, %%mm0                \n\t"\
    "pxor %%mm1, %%mm1                \n\t"\
    DCT_SAD4(q, %%mm, 0)\
    DCT_SAD4(q, %%mm, 8)\
    DCT_SAD4(q, %%mm, 64)\
    DCT_SAD4(q, %%mm, 72)\
    "paddusw %%mm1, %%mm0             \n\t"\
    HSUM(%%mm0, %%mm1, %0)

#define DCT_SAD_SSE2\
    "pxor %%xmm0, %%xmm0              \n\t"\
    "pxor %%xmm1, %%xmm1              \n\t"\
    DCT_SAD4(dqa, %%xmm, 0)\
    DCT_SAD4(dqa, %%xmm, 64)\
    "paddusw %%xmm1, %%xmm0           \n\t"\
    HSUM(%%xmm0, %%xmm1, %0)

#define DCT_SAD_FUNC(cpu) \
static int sum_abs_dctelem_##cpu(DCTELEM *block){\
    int sum;\
    __asm__ volatile(\
        DCT_SAD\
        :"=r"(sum)\
        :"r"(block)\
    );\
    return sum&0xFFFF;\
}

#define DCT_SAD       DCT_SAD_MMX
#define HSUM(a,t,dst) HSUM_MMX(a,t,dst)
#define MMABS(a,z)    MMABS_MMX(a,z)
DCT_SAD_FUNC(mmx)
#undef MMABS
#undef HSUM

#define HSUM(a,t,dst) HSUM_MMX2(a,t,dst)
#define MMABS(a,z)    MMABS_MMX2(a,z)
DCT_SAD_FUNC(mmx2)
#undef HSUM
#undef DCT_SAD

#define DCT_SAD       DCT_SAD_SSE2
#define HSUM(a,t,dst) HSUM_SSE2(a,t,dst)
DCT_SAD_FUNC(sse2)
#undef MMABS

#ifdef HAVE_SSSE3
#define MMABS(a,z)    MMABS_SSSE3(a,z)
DCT_SAD_FUNC(ssse3)
#undef MMABS
#endif
#undef HSUM
#undef DCT_SAD

static int ssd_int8_vs_int16_mmx(const int8_t *pix1, const int16_t *pix2, int size){
    int sum;
    x86_reg i=size;
    __asm__ volatile(
        "pxor %%mm4, %%mm4 \n"
        "1: \n"
        "sub $8, %0 \n"
        "movq (%2,%0), %%mm2 \n"
        "movq (%3,%0,2), %%mm0 \n"
        "movq 8(%3,%0,2), %%mm1 \n"
        "punpckhbw %%mm2, %%mm3 \n"
        "punpcklbw %%mm2, %%mm2 \n"
        "psraw $8, %%mm3 \n"
        "psraw $8, %%mm2 \n"
        "psubw %%mm3, %%mm1 \n"
        "psubw %%mm2, %%mm0 \n"
        "pmaddwd %%mm1, %%mm1 \n"
        "pmaddwd %%mm0, %%mm0 \n"
        "paddd %%mm1, %%mm4 \n"
        "paddd %%mm0, %%mm4 \n"
        "jg 1b \n"
        "movq %%mm4, %%mm3 \n"
        "psrlq $32, %%mm3 \n"
        "paddd %%mm3, %%mm4 \n"
        "movd %%mm4, %1 \n"
        :"+r"(i), "=r"(sum)
        :"r"(pix1), "r"(pix2)
    );
    return sum;
}
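
/* Scalar equivalent, for clarity (hypothetical):
 *
 *     int sum = 0;
 *     for (int i = 0; i < size; i++) {
 *         int d = pix2[i] - pix1[i];   // pix1 sign-extended from int8_t
 *         sum += d * d;
 *     }
 *     return sum;
 */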

#define PHADDD(a, t)\
    "movq "#a", "#t"                  \n\t"\
    "psrlq $32, "#a"                  \n\t"\
    "paddd "#t", "#a"                 \n\t"
/*
   pmulhw: dst[0-15]=(src[0-15]*dst[0-15])[16-31]
   pmulhrw: dst[0-15]=(src[0-15]*dst[0-15] + 0x8000)[16-31]
   pmulhrsw: dst[0-15]=(src[0-15]*dst[0-15] + 0x4000)[15-30]
 */
#define PMULHRW(x, y, s, o)\
    "pmulhw " #s ", "#x "            \n\t"\
    "pmulhw " #s ", "#y "            \n\t"\
    "paddw " #o ", "#x "             \n\t"\
    "paddw " #o ", "#y "             \n\t"\
    "psraw $1, "#x "                 \n\t"\
    "psraw $1, "#y "                 \n\t"
#define DEF(x) x ## _mmx
#define SET_RND MOVQ_WONE
#define SCALE_OFFSET 1

#include "dsputil_mmx_qns_template.c"

#undef DEF
#undef SET_RND
#undef SCALE_OFFSET
#undef PMULHRW

#define DEF(x) x ## _3dnow
#define SET_RND(x)
#define SCALE_OFFSET 0
#define PMULHRW(x, y, s, o)\
    "pmulhrw " #s ", "#x "           \n\t"\
    "pmulhrw " #s ", "#y "           \n\t"

#include "dsputil_mmx_qns_template.c"

#undef DEF
#undef SET_RND
#undef SCALE_OFFSET
#undef PMULHRW

#ifdef HAVE_SSSE3
#undef PHADDD
#define DEF(x) x ## _ssse3
#define SET_RND(x)
#define SCALE_OFFSET -1
#define PHADDD(a, t)\
    "pshufw $0x0E, "#a", "#t"         \n\t"\
    "paddd "#t", "#a"                 \n\t" /* faster than phaddd on core2 */
#define PMULHRW(x, y, s, o)\
    "pmulhrsw " #s ", "#x "          \n\t"\
    "pmulhrsw " #s ", "#y "          \n\t"

#include "dsputil_mmx_qns_template.c"

#undef DEF
#undef SET_RND
#undef SCALE_OFFSET
#undef PMULHRW
#undef PHADDD
#endif //HAVE_SSSE3
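
/* dsputil_mmx_qns_template.c is included three times above with DEF, SET_RND,
 * SCALE_OFFSET and PMULHRW redefined, producing the _mmx, _3dnow and _ssse3
 * variants of try_8x8basis and add_8x8basis wired up in the init code below. */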

/* FLAC specific */
void ff_flac_compute_autocorr_sse2(const int32_t *data, int len, int lag,
                                   double *autoc);

void dsputilenc_init_mmx(DSPContext* c, AVCodecContext *avctx)
{
    if (mm_flags & FF_MM_MMX) {
        const int dct_algo = avctx->dct_algo;
        if(dct_algo==FF_DCT_AUTO || dct_algo==FF_DCT_MMX){
            if(mm_flags & FF_MM_SSE2){
                c->fdct = ff_fdct_sse2;
            }else if(mm_flags & FF_MM_MMXEXT){
                c->fdct = ff_fdct_mmx2;
            }else{
                c->fdct = ff_fdct_mmx;
            }
        }

        c->get_pixels = get_pixels_mmx;
        c->diff_pixels = diff_pixels_mmx;
        c->pix_sum = pix_sum16_mmx;

        c->diff_bytes= diff_bytes_mmx;
        c->sum_abs_dctelem= sum_abs_dctelem_mmx;

        c->hadamard8_diff[0]= hadamard8_diff16_mmx;
        c->hadamard8_diff[1]= hadamard8_diff_mmx;

        c->pix_norm1 = pix_norm1_mmx;
        c->sse[0] = (mm_flags & FF_MM_SSE2) ? sse16_sse2 : sse16_mmx;
        c->sse[1] = sse8_mmx;
        c->vsad[4]= vsad_intra16_mmx;

        c->nsse[0] = nsse16_mmx;
        c->nsse[1] = nsse8_mmx;
        if(!(avctx->flags & CODEC_FLAG_BITEXACT)){
            c->vsad[0] = vsad16_mmx;
        }

        if(!(avctx->flags & CODEC_FLAG_BITEXACT)){
            c->try_8x8basis= try_8x8basis_mmx;
        }
        c->add_8x8basis= add_8x8basis_mmx;

        c->ssd_int8_vs_int16 = ssd_int8_vs_int16_mmx;

        if (mm_flags & FF_MM_MMXEXT) {
            c->sum_abs_dctelem= sum_abs_dctelem_mmx2;
            c->hadamard8_diff[0]= hadamard8_diff16_mmx2;
            c->hadamard8_diff[1]= hadamard8_diff_mmx2;
            c->vsad[4]= vsad_intra16_mmx2;

            if(!(avctx->flags & CODEC_FLAG_BITEXACT)){
                c->vsad[0] = vsad16_mmx2;
            }

            c->sub_hfyu_median_prediction= sub_hfyu_median_prediction_mmx2;
        }

        if(mm_flags & FF_MM_SSE2){
            c->get_pixels = get_pixels_sse2;
            c->sum_abs_dctelem= sum_abs_dctelem_sse2;
            c->hadamard8_diff[0]= hadamard8_diff16_sse2;
            c->hadamard8_diff[1]= hadamard8_diff_sse2;
            if (ENABLE_FLAC_ENCODER)
                c->flac_compute_autocorr = ff_flac_compute_autocorr_sse2;
        }

#ifdef HAVE_SSSE3
        if(mm_flags & FF_MM_SSSE3){
            if(!(avctx->flags & CODEC_FLAG_BITEXACT)){
                c->try_8x8basis= try_8x8basis_ssse3;
            }
            c->add_8x8basis= add_8x8basis_ssse3;
            c->sum_abs_dctelem= sum_abs_dctelem_ssse3;
            c->hadamard8_diff[0]= hadamard8_diff16_ssse3;
            c->hadamard8_diff[1]= hadamard8_diff_ssse3;
        }
#endif

        if(mm_flags & FF_MM_3DNOW){
            if(!(avctx->flags & CODEC_FLAG_BITEXACT)){
                c->try_8x8basis= try_8x8basis_3dnow;
            }
            c->add_8x8basis= add_8x8basis_3dnow;
        }
    }

    dsputil_init_pix_mmx(c, avctx);
}