ffmpeg / libavcodec / x86 / h264dsp_mmx.c @ a33a2562
/*
 * Copyright (c) 2004-2005 Michael Niedermayer, Loren Merritt
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/x86_cpu.h"
#include "libavcodec/h264dsp.h"
#include "dsputil_mmx.h"

DECLARE_ALIGNED(8, static const uint64_t, ff_pb_3_1  ) = 0x0103010301030103ULL;
DECLARE_ALIGNED(8, static const uint64_t, ff_pb_7_3  ) = 0x0307030703070307ULL;

/***********************************/
/* IDCT */

#define SUMSUB_BADC( a, b, c, d ) \
    "paddw "#b", "#a" \n\t"\
    "paddw "#d", "#c" \n\t"\
    "paddw "#b", "#b" \n\t"\
    "paddw "#d", "#d" \n\t"\
    "psubw "#a", "#b" \n\t"\
    "psubw "#c", "#d" \n\t"

#define SUMSUBD2_AB( a, b, t ) \
    "movq  "#b", "#t" \n\t"\
    "psraw  $1 , "#b" \n\t"\
    "paddw "#a", "#b" \n\t"\
    "psraw  $1 , "#a" \n\t"\
    "psubw "#t", "#a" \n\t"

#define IDCT4_1D( s02, s13, d02, d13, t ) \
    SUMSUB_BA  ( s02, d02 )\
    SUMSUBD2_AB( s13, d13, t )\
    SUMSUB_BADC( d13, s02, s13, d02 )

#define STORE_DIFF_4P( p, t, z ) \
    "psraw      $6,     "#p" \n\t"\
    "movd       (%0),   "#t" \n\t"\
    "punpcklbw "#z",    "#t" \n\t"\
    "paddsw    "#t",    "#p" \n\t"\
    "packuswb  "#z",    "#p" \n\t"\
    "movd      "#p",    (%0) \n\t"

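/* Add the 4x4 inverse transform of block to the 4x4 area at dst: two
 * IDCT4_1D passes with a transpose in between, with the +32 rounding and
 * >>6 scaling folded in before the clamped add. */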
static void ff_h264_idct_add_mmx(uint8_t *dst, int16_t *block, int stride)
{
    /* Load dct coeffs */
    __asm__ volatile(
        "movq   (%0), %%mm0 \n\t"
        "movq  8(%0), %%mm1 \n\t"
        "movq 16(%0), %%mm2 \n\t"
        "movq 24(%0), %%mm3 \n\t"
    :: "r"(block) );

    __asm__ volatile(
        /* mm1=s02+s13  mm2=s02-s13  mm4=d02+d13  mm0=d02-d13 */
        IDCT4_1D( %%mm2, %%mm1, %%mm0, %%mm3, %%mm4 )

        "movq      %0,    %%mm6 \n\t"
        /* in: 1,4,0,2  out: 1,2,3,0 */
        TRANSPOSE4( %%mm3, %%mm1, %%mm0, %%mm2, %%mm4 )

        "paddw     %%mm6, %%mm3 \n\t"

        /* mm2=s02+s13  mm3=s02-s13  mm4=d02+d13  mm1=d02-d13 */
        IDCT4_1D( %%mm4, %%mm2, %%mm3, %%mm0, %%mm1 )

        "pxor %%mm7, %%mm7    \n\t"
    :: "m"(ff_pw_32));

    __asm__ volatile(
    STORE_DIFF_4P( %%mm0, %%mm1, %%mm7)
        "add %1, %0             \n\t"
    STORE_DIFF_4P( %%mm2, %%mm1, %%mm7)
        "add %1, %0             \n\t"
    STORE_DIFF_4P( %%mm3, %%mm1, %%mm7)
        "add %1, %0             \n\t"
    STORE_DIFF_4P( %%mm4, %%mm1, %%mm7)
        : "+r"(dst)
        : "r" ((x86_reg)stride)
    );
}

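/* One 1D pass of the 8x8 IDCT over four columns of block: each MMX register
 * holds four 16-bit coefficients of one row, and the transformed rows are
 * left in mm0..mm7 for the caller. */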
static inline void h264_idct8_1d(int16_t *block)
{
    __asm__ volatile(
        "movq 112(%0), %%mm7  \n\t"
        "movq  80(%0), %%mm0  \n\t"
        "movq  48(%0), %%mm3  \n\t"
        "movq  16(%0), %%mm5  \n\t"

        "movq   %%mm0, %%mm4  \n\t"
        "movq   %%mm5, %%mm1  \n\t"
        "psraw  $1,    %%mm4  \n\t"
        "psraw  $1,    %%mm1  \n\t"
        "paddw  %%mm0, %%mm4  \n\t"
        "paddw  %%mm5, %%mm1  \n\t"
        "paddw  %%mm7, %%mm4  \n\t"
        "paddw  %%mm0, %%mm1  \n\t"
        "psubw  %%mm5, %%mm4  \n\t"
        "paddw  %%mm3, %%mm1  \n\t"

        "psubw  %%mm3, %%mm5  \n\t"
        "psubw  %%mm3, %%mm0  \n\t"
        "paddw  %%mm7, %%mm5  \n\t"
        "psubw  %%mm7, %%mm0  \n\t"
        "psraw  $1,    %%mm3  \n\t"
        "psraw  $1,    %%mm7  \n\t"
        "psubw  %%mm3, %%mm5  \n\t"
        "psubw  %%mm7, %%mm0  \n\t"

        "movq   %%mm4, %%mm3  \n\t"
        "movq   %%mm1, %%mm7  \n\t"
        "psraw  $2,    %%mm1  \n\t"
        "psraw  $2,    %%mm3  \n\t"
        "paddw  %%mm5, %%mm3  \n\t"
        "psraw  $2,    %%mm5  \n\t"
        "paddw  %%mm0, %%mm1  \n\t"
        "psraw  $2,    %%mm0  \n\t"
        "psubw  %%mm4, %%mm5  \n\t"
        "psubw  %%mm0, %%mm7  \n\t"

        "movq  32(%0), %%mm2  \n\t"
        "movq  96(%0), %%mm6  \n\t"
        "movq   %%mm2, %%mm4  \n\t"
        "movq   %%mm6, %%mm0  \n\t"
        "psraw  $1,    %%mm4  \n\t"
        "psraw  $1,    %%mm6  \n\t"
        "psubw  %%mm0, %%mm4  \n\t"
        "paddw  %%mm2, %%mm6  \n\t"

        "movq    (%0), %%mm2  \n\t"
        "movq  64(%0), %%mm0  \n\t"
        SUMSUB_BA( %%mm0, %%mm2 )
        SUMSUB_BA( %%mm6, %%mm0 )
        SUMSUB_BA( %%mm4, %%mm2 )
        SUMSUB_BA( %%mm7, %%mm6 )
        SUMSUB_BA( %%mm5, %%mm4 )
        SUMSUB_BA( %%mm3, %%mm2 )
        SUMSUB_BA( %%mm1, %%mm0 )
        :: "r"(block)
    );
}

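/* 8x8 IDCT and add: the block is transformed four columns at a time via the
 * aligned temporary b2 (first pass plus transpose, then second pass and >>6),
 * and the result is added to dst with ff_add_pixels_clamped_mmx(). */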
static void ff_h264_idct8_add_mmx(uint8_t *dst, int16_t *block, int stride)
{
    int i;
    DECLARE_ALIGNED(8, int16_t, b2)[64];

    block[0] += 32;

    for(i=0; i<2; i++){
        DECLARE_ALIGNED(8, uint64_t, tmp);

        h264_idct8_1d(block+4*i);

        __asm__ volatile(
            "movq   %%mm7,    %0   \n\t"
            TRANSPOSE4( %%mm0, %%mm2, %%mm4, %%mm6, %%mm7 )
            "movq   %%mm0,  8(%1)  \n\t"
            "movq   %%mm6, 24(%1)  \n\t"
            "movq   %%mm7, 40(%1)  \n\t"
            "movq   %%mm4, 56(%1)  \n\t"
            "movq    %0,    %%mm7  \n\t"
            TRANSPOSE4( %%mm7, %%mm5, %%mm3, %%mm1, %%mm0 )
            "movq   %%mm7,   (%1)  \n\t"
            "movq   %%mm1, 16(%1)  \n\t"
            "movq   %%mm0, 32(%1)  \n\t"
            "movq   %%mm3, 48(%1)  \n\t"
            : "=m"(tmp)
            : "r"(b2+32*i)
            : "memory"
        );
    }

    for(i=0; i<2; i++){
        h264_idct8_1d(b2+4*i);

        __asm__ volatile(
            "psraw     $6, %%mm7  \n\t"
            "psraw     $6, %%mm6  \n\t"
            "psraw     $6, %%mm5  \n\t"
            "psraw     $6, %%mm4  \n\t"
            "psraw     $6, %%mm3  \n\t"
            "psraw     $6, %%mm2  \n\t"
            "psraw     $6, %%mm1  \n\t"
            "psraw     $6, %%mm0  \n\t"

            "movq   %%mm7,    (%0)  \n\t"
            "movq   %%mm5,  16(%0)  \n\t"
            "movq   %%mm3,  32(%0)  \n\t"
            "movq   %%mm1,  48(%0)  \n\t"
            "movq   %%mm0,  64(%0)  \n\t"
            "movq   %%mm2,  80(%0)  \n\t"
            "movq   %%mm4,  96(%0)  \n\t"
            "movq   %%mm6, 112(%0)  \n\t"
            :: "r"(b2+4*i)
            : "memory"
        );
    }

    ff_add_pixels_clamped_mmx(b2, dst, stride);
}

#define STORE_DIFF_8P( p, d, t, z )\
        "movq       "#d", "#t" \n"\
        "psraw       $6,  "#p" \n"\
        "punpcklbw  "#z", "#t" \n"\
        "paddsw     "#t", "#p" \n"\
        "packuswb   "#p", "#p" \n"\
        "movq       "#p", "#d" \n"

#define H264_IDCT8_1D_SSE2(a,b,c,d,e,f,g,h)\
        "movdqa     "#c", "#a" \n"\
        "movdqa     "#g", "#e" \n"\
        "psraw       $1,  "#c" \n"\
        "psraw       $1,  "#g" \n"\
        "psubw      "#e", "#c" \n"\
        "paddw      "#a", "#g" \n"\
        "movdqa     "#b", "#e" \n"\
        "psraw       $1,  "#e" \n"\
        "paddw      "#b", "#e" \n"\
        "paddw      "#d", "#e" \n"\
        "paddw      "#f", "#e" \n"\
        "movdqa     "#f", "#a" \n"\
        "psraw       $1,  "#a" \n"\
        "paddw      "#f", "#a" \n"\
        "paddw      "#h", "#a" \n"\
        "psubw      "#b", "#a" \n"\
        "psubw      "#d", "#b" \n"\
        "psubw      "#d", "#f" \n"\
        "paddw      "#h", "#b" \n"\
        "psubw      "#h", "#f" \n"\
        "psraw       $1,  "#d" \n"\
        "psraw       $1,  "#h" \n"\
        "psubw      "#d", "#b" \n"\
        "psubw      "#h", "#f" \n"\
        "movdqa     "#e", "#d" \n"\
        "movdqa     "#a", "#h" \n"\
        "psraw       $2,  "#d" \n"\
        "psraw       $2,  "#h" \n"\
        "paddw      "#f", "#d" \n"\
        "paddw      "#b", "#h" \n"\
        "psraw       $2,  "#f" \n"\
        "psraw       $2,  "#b" \n"\
        "psubw      "#f", "#e" \n"\
        "psubw      "#a", "#b" \n"\
        "movdqa 0x00(%1), "#a" \n"\
        "movdqa 0x40(%1), "#f" \n"\
        SUMSUB_BA(f, a)\
        SUMSUB_BA(g, f)\
        SUMSUB_BA(c, a)\
        SUMSUB_BA(e, g)\
        SUMSUB_BA(b, c)\
        SUMSUB_BA(h, a)\
        SUMSUB_BA(d, f)

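/* SSE2 8x8 IDCT and add: both 1D passes work on whole 8-coefficient rows
 * held in xmm registers, using the block buffer as scratch, with the usual
 * +32/>>6 rounding before the clamped add to dst. */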
static void ff_h264_idct8_add_sse2(uint8_t *dst, int16_t *block, int stride)
{
    __asm__ volatile(
        "movdqa   0x10(%1), %%xmm1 \n"
        "movdqa   0x20(%1), %%xmm2 \n"
        "movdqa   0x30(%1), %%xmm3 \n"
        "movdqa   0x50(%1), %%xmm5 \n"
        "movdqa   0x60(%1), %%xmm6 \n"
        "movdqa   0x70(%1), %%xmm7 \n"
        H264_IDCT8_1D_SSE2(%%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm6, %%xmm7)
        TRANSPOSE8(%%xmm4, %%xmm1, %%xmm7, %%xmm3, %%xmm5, %%xmm0, %%xmm2, %%xmm6, (%1))
        "paddw          %4, %%xmm4 \n"
        "movdqa     %%xmm4, 0x00(%1) \n"
        "movdqa     %%xmm2, 0x40(%1) \n"
        H264_IDCT8_1D_SSE2(%%xmm4, %%xmm0, %%xmm6, %%xmm3, %%xmm2, %%xmm5, %%xmm7, %%xmm1)
        "movdqa     %%xmm6, 0x60(%1) \n"
        "movdqa     %%xmm7, 0x70(%1) \n"
        "pxor       %%xmm7, %%xmm7 \n"
        STORE_DIFF_8P(%%xmm2, (%0),      %%xmm6, %%xmm7)
        STORE_DIFF_8P(%%xmm0, (%0,%2),   %%xmm6, %%xmm7)
        STORE_DIFF_8P(%%xmm1, (%0,%2,2), %%xmm6, %%xmm7)
        STORE_DIFF_8P(%%xmm3, (%0,%3),   %%xmm6, %%xmm7)
        "lea     (%0,%2,4), %0 \n"
        STORE_DIFF_8P(%%xmm5, (%0),      %%xmm6, %%xmm7)
        STORE_DIFF_8P(%%xmm4, (%0,%2),   %%xmm6, %%xmm7)
        "movdqa   0x60(%1), %%xmm0 \n"
        "movdqa   0x70(%1), %%xmm1 \n"
        STORE_DIFF_8P(%%xmm0, (%0,%2,2), %%xmm6, %%xmm7)
        STORE_DIFF_8P(%%xmm1, (%0,%3),   %%xmm6, %%xmm7)
        :"+r"(dst)
        :"r"(block), "r"((x86_reg)stride), "r"((x86_reg)3L*stride), "m"(ff_pw_32)
    );
}

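/* DC-only shortcut: dc = (block[0] + 32) >> 6 is broadcast to bytes and
 * added to the destination with unsigned saturation (4x4 here, 8x8 below). */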
static void ff_h264_idct_dc_add_mmx2(uint8_t *dst, int16_t *block, int stride)
{
    int dc = (block[0] + 32) >> 6;
    __asm__ volatile(
        "movd          %0, %%mm0 \n\t"
        "pshufw $0, %%mm0, %%mm0 \n\t"
        "pxor       %%mm1, %%mm1 \n\t"
        "psubw      %%mm0, %%mm1 \n\t"
        "packuswb   %%mm0, %%mm0 \n\t"
        "packuswb   %%mm1, %%mm1 \n\t"
        ::"r"(dc)
    );
    __asm__ volatile(
        "movd          %0, %%mm2 \n\t"
        "movd          %1, %%mm3 \n\t"
        "movd          %2, %%mm4 \n\t"
        "movd          %3, %%mm5 \n\t"
        "paddusb    %%mm0, %%mm2 \n\t"
        "paddusb    %%mm0, %%mm3 \n\t"
        "paddusb    %%mm0, %%mm4 \n\t"
        "paddusb    %%mm0, %%mm5 \n\t"
        "psubusb    %%mm1, %%mm2 \n\t"
        "psubusb    %%mm1, %%mm3 \n\t"
        "psubusb    %%mm1, %%mm4 \n\t"
        "psubusb    %%mm1, %%mm5 \n\t"
        "movd       %%mm2, %0    \n\t"
        "movd       %%mm3, %1    \n\t"
        "movd       %%mm4, %2    \n\t"
        "movd       %%mm5, %3    \n\t"
        :"+m"(*(uint32_t*)(dst+0*stride)),
         "+m"(*(uint32_t*)(dst+1*stride)),
         "+m"(*(uint32_t*)(dst+2*stride)),
         "+m"(*(uint32_t*)(dst+3*stride))
    );
}

static void ff_h264_idct8_dc_add_mmx2(uint8_t *dst, int16_t *block, int stride)
{
    int dc = (block[0] + 32) >> 6;
    int y;
    __asm__ volatile(
        "movd          %0, %%mm0 \n\t"
        "pshufw $0, %%mm0, %%mm0 \n\t"
        "pxor       %%mm1, %%mm1 \n\t"
        "psubw      %%mm0, %%mm1 \n\t"
        "packuswb   %%mm0, %%mm0 \n\t"
        "packuswb   %%mm1, %%mm1 \n\t"
        ::"r"(dc)
    );
    for(y=2; y--; dst += 4*stride){
    __asm__ volatile(
        "movq          %0, %%mm2 \n\t"
        "movq          %1, %%mm3 \n\t"
        "movq          %2, %%mm4 \n\t"
        "movq          %3, %%mm5 \n\t"
        "paddusb    %%mm0, %%mm2 \n\t"
        "paddusb    %%mm0, %%mm3 \n\t"
        "paddusb    %%mm0, %%mm4 \n\t"
        "paddusb    %%mm0, %%mm5 \n\t"
        "psubusb    %%mm1, %%mm2 \n\t"
        "psubusb    %%mm1, %%mm3 \n\t"
        "psubusb    %%mm1, %%mm4 \n\t"
        "psubusb    %%mm1, %%mm5 \n\t"
        "movq       %%mm2, %0    \n\t"
        "movq       %%mm3, %1    \n\t"
        "movq       %%mm4, %2    \n\t"
        "movq       %%mm5, %3    \n\t"
        :"+m"(*(uint64_t*)(dst+0*stride)),
         "+m"(*(uint64_t*)(dst+1*stride)),
         "+m"(*(uint64_t*)(dst+2*stride)),
         "+m"(*(uint64_t*)(dst+3*stride))
    );
    }
}

//FIXME this table is a duplicate from h264data.h, and will be removed once the tables from h264 have been split
static const uint8_t scan8[16 + 2*4]={
 4+1*8, 5+1*8, 4+2*8, 5+2*8,
 6+1*8, 7+1*8, 6+2*8, 7+2*8,
 4+3*8, 5+3*8, 4+4*8, 5+4*8,
 6+3*8, 7+3*8, 6+4*8, 7+4*8,
 1+1*8, 2+1*8,
 1+2*8, 2+2*8,
 1+4*8, 2+4*8,
 1+5*8, 2+5*8,
};

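/* The add16/add16intra/add8/idct8_add4 wrappers walk the blocks of a
 * macroblock in scan8 order and, per block, choose between the full IDCT+add
 * and the DC-only path based on the non-zero coefficient count. */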
static void ff_h264_idct_add16_mmx(uint8_t *dst, const int *block_offset, DCTELEM *block, int stride, const uint8_t nnzc[6*8]){
    int i;
    for(i=0; i<16; i++){
        if(nnzc[ scan8[i] ])
            ff_h264_idct_add_mmx(dst + block_offset[i], block + i*16, stride);
    }
}

static void ff_h264_idct8_add4_mmx(uint8_t *dst, const int *block_offset, DCTELEM *block, int stride, const uint8_t nnzc[6*8]){
    int i;
    for(i=0; i<16; i+=4){
        if(nnzc[ scan8[i] ])
            ff_h264_idct8_add_mmx(dst + block_offset[i], block + i*16, stride);
    }
}


static void ff_h264_idct_add16_mmx2(uint8_t *dst, const int *block_offset, DCTELEM *block, int stride, const uint8_t nnzc[6*8]){
    int i;
    for(i=0; i<16; i++){
        int nnz = nnzc[ scan8[i] ];
        if(nnz){
            if(nnz==1 && block[i*16]) ff_h264_idct_dc_add_mmx2(dst + block_offset[i], block + i*16, stride);
            else                      ff_h264_idct_add_mmx    (dst + block_offset[i], block + i*16, stride);
        }
    }
}

static void ff_h264_idct_add16intra_mmx(uint8_t *dst, const int *block_offset, DCTELEM *block, int stride, const uint8_t nnzc[6*8]){
    int i;
    for(i=0; i<16; i++){
        if(nnzc[ scan8[i] ] || block[i*16])
            ff_h264_idct_add_mmx(dst + block_offset[i], block + i*16, stride);
    }
}

static void ff_h264_idct_add16intra_mmx2(uint8_t *dst, const int *block_offset, DCTELEM *block, int stride, const uint8_t nnzc[6*8]){
    int i;
    for(i=0; i<16; i++){
        if(nnzc[ scan8[i] ]) ff_h264_idct_add_mmx    (dst + block_offset[i], block + i*16, stride);
        else if(block[i*16]) ff_h264_idct_dc_add_mmx2(dst + block_offset[i], block + i*16, stride);
    }
}

static void ff_h264_idct8_add4_mmx2(uint8_t *dst, const int *block_offset, DCTELEM *block, int stride, const uint8_t nnzc[6*8]){
    int i;
    for(i=0; i<16; i+=4){
        int nnz = nnzc[ scan8[i] ];
        if(nnz){
            if(nnz==1 && block[i*16]) ff_h264_idct8_dc_add_mmx2(dst + block_offset[i], block + i*16, stride);
            else                      ff_h264_idct8_add_mmx    (dst + block_offset[i], block + i*16, stride);
        }
    }
}

static void ff_h264_idct8_add4_sse2(uint8_t *dst, const int *block_offset, DCTELEM *block, int stride, const uint8_t nnzc[6*8]){
    int i;
    for(i=0; i<16; i+=4){
        int nnz = nnzc[ scan8[i] ];
        if(nnz){
            if(nnz==1 && block[i*16]) ff_h264_idct8_dc_add_mmx2(dst + block_offset[i], block + i*16, stride);
            else                      ff_h264_idct8_add_sse2   (dst + block_offset[i], block + i*16, stride);
        }
    }
}

static void ff_h264_idct_add8_mmx(uint8_t **dest, const int *block_offset, DCTELEM *block, int stride, const uint8_t nnzc[6*8]){
    int i;
    for(i=16; i<16+8; i++){
        if(nnzc[ scan8[i] ] || block[i*16])
            ff_h264_idct_add_mmx    (dest[(i&4)>>2] + block_offset[i], block + i*16, stride);
    }
}

static void ff_h264_idct_add8_mmx2(uint8_t **dest, const int *block_offset, DCTELEM *block, int stride, const uint8_t nnzc[6*8]){
    int i;
    for(i=16; i<16+8; i++){
        if(nnzc[ scan8[i] ])
            ff_h264_idct_add_mmx    (dest[(i&4)>>2] + block_offset[i], block + i*16, stride);
        else if(block[i*16])
            ff_h264_idct_dc_add_mmx2(dest[(i&4)>>2] + block_offset[i], block + i*16, stride);
    }
}

#if CONFIG_GPL && HAVE_YASM
static void ff_h264_idct_dc_add8_mmx2(uint8_t *dst, int16_t *block, int stride)
{
    __asm__ volatile(
        "movd             %0, %%mm0 \n\t"   //  0 0 X D
        "punpcklwd        %1, %%mm0 \n\t"   //  x X d D
        "paddsw           %2, %%mm0 \n\t"
        "psraw            $6, %%mm0 \n\t"
        "punpcklwd     %%mm0, %%mm0 \n\t"   //  d d D D
        "pxor          %%mm1, %%mm1 \n\t"   //  0 0 0 0
        "psubw         %%mm0, %%mm1 \n\t"   // -d-d-D-D
        "packuswb      %%mm1, %%mm0 \n\t"   // -d-d-D-D d d D D
        "pshufw $0xFA, %%mm0, %%mm1 \n\t"   // -d-d-d-d-D-D-D-D
        "punpcklwd     %%mm0, %%mm0 \n\t"   //  d d d d D D D D
        ::"m"(block[ 0]),
          "m"(block[16]),
          "m"(ff_pw_32)
    );
    __asm__ volatile(
        "movq          %0, %%mm2 \n\t"
        "movq          %1, %%mm3 \n\t"
        "movq          %2, %%mm4 \n\t"
        "movq          %3, %%mm5 \n\t"
        "paddusb    %%mm0, %%mm2 \n\t"
        "paddusb    %%mm0, %%mm3 \n\t"
        "paddusb    %%mm0, %%mm4 \n\t"
        "paddusb    %%mm0, %%mm5 \n\t"
        "psubusb    %%mm1, %%mm2 \n\t"
        "psubusb    %%mm1, %%mm3 \n\t"
        "psubusb    %%mm1, %%mm4 \n\t"
        "psubusb    %%mm1, %%mm5 \n\t"
        "movq       %%mm2, %0    \n\t"
        "movq       %%mm3, %1    \n\t"
        "movq       %%mm4, %2    \n\t"
        "movq       %%mm5, %3    \n\t"
        :"+m"(*(uint64_t*)(dst+0*stride)),
         "+m"(*(uint64_t*)(dst+1*stride)),
         "+m"(*(uint64_t*)(dst+2*stride)),
         "+m"(*(uint64_t*)(dst+3*stride))
    );
}

extern void ff_x264_add8x4_idct_sse2(uint8_t *dst, int16_t *block, int stride);

static void ff_h264_idct_add16_sse2(uint8_t *dst, const int *block_offset, DCTELEM *block, int stride, const uint8_t nnzc[6*8]){
    int i;
    for(i=0; i<16; i+=2)
        if(nnzc[ scan8[i+0] ]|nnzc[ scan8[i+1] ])
            ff_x264_add8x4_idct_sse2 (dst + block_offset[i], block + i*16, stride);
}

static void ff_h264_idct_add16intra_sse2(uint8_t *dst, const int *block_offset, DCTELEM *block, int stride, const uint8_t nnzc[6*8]){
    int i;
    for(i=0; i<16; i+=2){
        if(nnzc[ scan8[i+0] ]|nnzc[ scan8[i+1] ])
            ff_x264_add8x4_idct_sse2 (dst + block_offset[i], block + i*16, stride);
        else if(block[i*16]|block[i*16+16])
            ff_h264_idct_dc_add8_mmx2(dst + block_offset[i], block + i*16, stride);
    }
}

static void ff_h264_idct_add8_sse2(uint8_t **dest, const int *block_offset, DCTELEM *block, int stride, const uint8_t nnzc[6*8]){
    int i;
    for(i=16; i<16+8; i+=2){
        if(nnzc[ scan8[i+0] ]|nnzc[ scan8[i+1] ])
            ff_x264_add8x4_idct_sse2 (dest[(i&4)>>2] + block_offset[i], block + i*16, stride);
        else if(block[i*16]|block[i*16+16])
            ff_h264_idct_dc_add8_mmx2(dest[(i&4)>>2] + block_offset[i], block + i*16, stride);
    }
}
#endif

/***********************************/
/* deblocking */

// out: o = |x-y|>a
// clobbers: t
#define DIFF_GT_MMX(x,y,a,o,t)\
    "movq     "#y", "#t"  \n\t"\
    "movq     "#x", "#o"  \n\t"\
    "psubusb  "#x", "#t"  \n\t"\
    "psubusb  "#y", "#o"  \n\t"\
    "por      "#t", "#o"  \n\t"\
    "psubusb  "#a", "#o"  \n\t"

// out: o = |x-y|>a
// clobbers: t
#define DIFF_GT2_MMX(x,y,a,o,t)\
    "movq     "#y", "#t"  \n\t"\
    "movq     "#x", "#o"  \n\t"\
    "psubusb  "#x", "#t"  \n\t"\
    "psubusb  "#y", "#o"  \n\t"\
    "psubusb  "#a", "#t"  \n\t"\
    "psubusb  "#a", "#o"  \n\t"\
    "pcmpeqb  "#t", "#o"  \n\t"\

// in: mm0=p1 mm1=p0 mm2=q0 mm3=q1
// out: mm5=beta-1, mm7=mask
// clobbers: mm4,mm6
#define H264_DEBLOCK_MASK(alpha1, beta1) \
    "pshufw $0, "#alpha1", %%mm4 \n\t"\
    "pshufw $0, "#beta1 ", %%mm5 \n\t"\
    "packuswb  %%mm4, %%mm4      \n\t"\
    "packuswb  %%mm5, %%mm5      \n\t"\
    DIFF_GT_MMX(%%mm1, %%mm2, %%mm4, %%mm7, %%mm6) /* |p0-q0| > alpha-1 */\
    DIFF_GT_MMX(%%mm0, %%mm1, %%mm5, %%mm4, %%mm6) /* |p1-p0| > beta-1 */\
    "por       %%mm4, %%mm7      \n\t"\
    DIFF_GT_MMX(%%mm3, %%mm2, %%mm5, %%mm4, %%mm6) /* |q1-q0| > beta-1 */\
    "por       %%mm4, %%mm7      \n\t"\
    "pxor      %%mm6, %%mm6      \n\t"\
    "pcmpeqb   %%mm6, %%mm7      \n\t"

// in: mm0=p1 mm1=p0 mm2=q0 mm3=q1 mm7=(tc&mask)
// out: mm1=p0' mm2=q0'
// clobbers: mm0,3-6
#define H264_DEBLOCK_P0_Q0(pb_01, pb_3f)\
        "movq    %%mm1              , %%mm5 \n\t"\
        "pxor    %%mm2              , %%mm5 \n\t" /* p0^q0*/\
        "pand    "#pb_01"           , %%mm5 \n\t" /* (p0^q0)&1*/\
        "pcmpeqb %%mm4              , %%mm4 \n\t"\
        "pxor    %%mm4              , %%mm3 \n\t"\
        "pavgb   %%mm0              , %%mm3 \n\t" /* (p1 - q1 + 256)>>1*/\
        "pavgb   "MANGLE(ff_pb_3)"  , %%mm3 \n\t" /*(((p1 - q1 + 256)>>1)+4)>>1 = 64+2+(p1-q1)>>2*/\
        "pxor    %%mm1              , %%mm4 \n\t"\
        "pavgb   %%mm2              , %%mm4 \n\t" /* (q0 - p0 + 256)>>1*/\
        "pavgb   %%mm5              , %%mm3 \n\t"\
        "paddusb %%mm4              , %%mm3 \n\t" /* d+128+33*/\
        "movq    "MANGLE(ff_pb_A1)" , %%mm6 \n\t"\
        "psubusb %%mm3              , %%mm6 \n\t"\
        "psubusb "MANGLE(ff_pb_A1)" , %%mm3 \n\t"\
        "pminub  %%mm7              , %%mm6 \n\t"\
        "pminub  %%mm7              , %%mm3 \n\t"\
        "psubusb %%mm6              , %%mm1 \n\t"\
        "psubusb %%mm3              , %%mm2 \n\t"\
        "paddusb %%mm3              , %%mm1 \n\t"\
        "paddusb %%mm6              , %%mm2 \n\t"

// in: mm0=p1 mm1=p0 mm2=q0 mm3=q1 mm7=(tc&mask) %8=ff_bone
// out: (q1addr) = av_clip( (q2+((p0+q0+1)>>1))>>1, q1-tc0, q1+tc0 )
// clobbers: q2, tmp, tc0
#define H264_DEBLOCK_Q1(p1, q2, q2addr, q1addr, tc0, tmp)\
        "movq     %%mm1,  "#tmp"   \n\t"\
        "pavgb    %%mm2,  "#tmp"   \n\t"\
        "pavgb    "#tmp", "#q2"    \n\t" /* avg(p2,avg(p0,q0)) */\
        "pxor   "q2addr", "#tmp"   \n\t"\
        "pand     %9,     "#tmp"   \n\t" /* (p2^avg(p0,q0))&1 */\
        "psubusb  "#tmp", "#q2"    \n\t" /* (p2+((p0+q0+1)>>1))>>1 */\
        "movq     "#p1",  "#tmp"   \n\t"\
        "psubusb  "#tc0", "#tmp"   \n\t"\
        "paddusb  "#p1",  "#tc0"   \n\t"\
        "pmaxub   "#tmp", "#q2"    \n\t"\
        "pminub   "#tc0", "#q2"    \n\t"\
        "movq     "#q2",  "q1addr" \n\t"

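/* Core luma deblocking of one 8-pixel edge segment: builds the alpha/beta
 * comparison mask, conditionally filters p1 and q1, then p0/q0, all clipped
 * by the per-block tc0 values. The _v/_h wrappers below call this directly
 * or through a transpose. */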
static inline void h264_loop_filter_luma_mmx2(uint8_t *pix, int stride, int alpha1, int beta1, int8_t *tc0)
{
    DECLARE_ALIGNED(8, uint64_t, tmp0)[2];

    __asm__ volatile(
        "movq    (%2,%4), %%mm0    \n\t" //p1
        "movq    (%2,%4,2), %%mm1  \n\t" //p0
        "movq    (%3),    %%mm2    \n\t" //q0
        "movq    (%3,%4), %%mm3    \n\t" //q1
        H264_DEBLOCK_MASK(%7, %8)

        "movd      %6,    %%mm4    \n\t"
        "punpcklbw %%mm4, %%mm4    \n\t"
        "punpcklwd %%mm4, %%mm4    \n\t"
        "pcmpeqb   %%mm3, %%mm3    \n\t"
        "movq      %%mm4, %%mm6    \n\t"
        "pcmpgtb   %%mm3, %%mm4    \n\t"
        "movq      %%mm6, %1       \n\t"
        "pand      %%mm4, %%mm7    \n\t"
        "movq      %%mm7, %0       \n\t"

        /* filter p1 */
        "movq     (%2),   %%mm3    \n\t" //p2
        DIFF_GT2_MMX(%%mm1, %%mm3, %%mm5, %%mm6, %%mm4) // |p2-p0|>beta-1
        "pand     %%mm7,  %%mm6    \n\t" // mask & |p2-p0|<beta
        "pand     %1,     %%mm7    \n\t" // mask & tc0
        "movq     %%mm7,  %%mm4    \n\t"
        "psubb    %%mm6,  %%mm7    \n\t"
        "pand     %%mm4,  %%mm6    \n\t" // mask & |p2-p0|<beta & tc0
        H264_DEBLOCK_Q1(%%mm0, %%mm3, "(%2)", "(%2,%4)", %%mm6, %%mm4)

        /* filter q1 */
        "movq    (%3,%4,2), %%mm4  \n\t" //q2
        DIFF_GT2_MMX(%%mm2, %%mm4, %%mm5, %%mm6, %%mm3) // |q2-q0|>beta-1
        "pand     %0,     %%mm6    \n\t"
        "movq     %1,     %%mm5    \n\t" // could be merged with the pand below, but that is slower
        "pand     %%mm6,  %%mm5    \n\t"
        "psubb    %%mm6,  %%mm7    \n\t"
        "movq    (%3,%4), %%mm3    \n\t"
        H264_DEBLOCK_Q1(%%mm3, %%mm4, "(%3,%4,2)", "(%3,%4)", %%mm5, %%mm6)

        /* filter p0, q0 */
        H264_DEBLOCK_P0_Q0(%9, unused)
        "movq      %%mm1, (%2,%4,2) \n\t"
        "movq      %%mm2, (%3)      \n\t"

        : "=m"(tmp0[0]), "=m"(tmp0[1])
        : "r"(pix-3*stride), "r"(pix), "r"((x86_reg)stride),
          "m"(*tmp0/*unused*/), "m"(*(uint32_t*)tc0), "m"(alpha1), "m"(beta1),
          "m"(ff_bone)
    );
}

static void h264_v_loop_filter_luma_mmx2(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
{
    if((tc0[0] & tc0[1]) >= 0)
        h264_loop_filter_luma_mmx2(pix, stride, alpha-1, beta-1, tc0);
    if((tc0[2] & tc0[3]) >= 0)
        h264_loop_filter_luma_mmx2(pix+8, stride, alpha-1, beta-1, tc0+2);
}
static void h264_h_loop_filter_luma_mmx2(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
{
    //FIXME: could cut some load/stores by merging transpose with filter
    // also, it only needs to transpose 6x8
    DECLARE_ALIGNED(8, uint8_t, trans)[8*8];
    int i;
    for(i=0; i<2; i++, pix+=8*stride, tc0+=2) {
        if((tc0[0] & tc0[1]) < 0)
            continue;
        transpose4x4(trans,       pix-4,          8, stride);
        transpose4x4(trans  +4*8, pix,            8, stride);
        transpose4x4(trans+4,     pix-4+4*stride, 8, stride);
        transpose4x4(trans+4+4*8, pix  +4*stride, 8, stride);
        h264_loop_filter_luma_mmx2(trans+4*8, 8, alpha-1, beta-1, tc0);
        transpose4x4(pix-2,          trans  +2*8, stride, 8);
        transpose4x4(pix-2+4*stride, trans+4+2*8, stride, 8);
    }
}

static inline void h264_loop_filter_chroma_mmx2(uint8_t *pix, int stride, int alpha1, int beta1, int8_t *tc0)
{
    __asm__ volatile(
        "movq    (%0),    %%mm0     \n\t" //p1
        "movq    (%0,%2), %%mm1     \n\t" //p0
        "movq    (%1),    %%mm2     \n\t" //q0
        "movq    (%1,%2), %%mm3     \n\t" //q1
        H264_DEBLOCK_MASK(%4, %5)
        "movd      %3,    %%mm6     \n\t"
        "punpcklbw %%mm6, %%mm6     \n\t"
        "pand      %%mm6, %%mm7     \n\t" // mm7 = tc&mask
        H264_DEBLOCK_P0_Q0(%6, %7)
        "movq      %%mm1, (%0,%2)   \n\t"
        "movq      %%mm2, (%1)      \n\t"

        :: "r"(pix-2*stride), "r"(pix), "r"((x86_reg)stride),
           "r"(*(uint32_t*)tc0),
           "m"(alpha1), "m"(beta1), "m"(ff_bone), "m"(ff_pb_3F)
    );
}

static void h264_v_loop_filter_chroma_mmx2(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
{
    h264_loop_filter_chroma_mmx2(pix, stride, alpha-1, beta-1, tc0);
}

static void h264_h_loop_filter_chroma_mmx2(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
{
    //FIXME: could cut some load/stores by merging transpose with filter
    DECLARE_ALIGNED(8, uint8_t, trans)[8*4];
    transpose4x4(trans, pix-2, 8, stride);
    transpose4x4(trans+4, pix-2+4*stride, 8, stride);
    h264_loop_filter_chroma_mmx2(trans+2*8, 8, alpha-1, beta-1, tc0);
    transpose4x4(pix-2, trans, stride, 8);
    transpose4x4(pix-2+4*stride, trans+4, stride, 8);
}

// p0 = (p0 + q1 + 2*p1 + 2) >> 2
#define H264_FILTER_CHROMA4(p0, p1, q1, one) \
    "movq    "#p0", %%mm4  \n\t"\
    "pxor    "#q1", %%mm4  \n\t"\
    "pand   "#one", %%mm4  \n\t" /* mm4 = (p0^q1)&1 */\
    "pavgb   "#q1", "#p0"  \n\t"\
    "psubusb %%mm4, "#p0"  \n\t"\
    "pavgb   "#p1", "#p0"  \n\t" /* dst = avg(p1, avg(p0,q1) - ((p0^q1)&1)) */\

static inline void h264_loop_filter_chroma_intra_mmx2(uint8_t *pix, int stride, int alpha1, int beta1)
{
    __asm__ volatile(
        "movq    (%0),    %%mm0     \n\t"
        "movq    (%0,%2), %%mm1     \n\t"
        "movq    (%1),    %%mm2     \n\t"
        "movq    (%1,%2), %%mm3     \n\t"
        H264_DEBLOCK_MASK(%3, %4)
        "movq    %%mm1,   %%mm5     \n\t"
        "movq    %%mm2,   %%mm6     \n\t"
        H264_FILTER_CHROMA4(%%mm1, %%mm0, %%mm3, %5) //p0'
        H264_FILTER_CHROMA4(%%mm2, %%mm3, %%mm0, %5) //q0'
        "psubb   %%mm5,   %%mm1     \n\t"
        "psubb   %%mm6,   %%mm2     \n\t"
        "pand    %%mm7,   %%mm1     \n\t"
        "pand    %%mm7,   %%mm2     \n\t"
        "paddb   %%mm5,   %%mm1     \n\t"
        "paddb   %%mm6,   %%mm2     \n\t"
        "movq    %%mm1,   (%0,%2)   \n\t"
        "movq    %%mm2,   (%1)      \n\t"
        :: "r"(pix-2*stride), "r"(pix), "r"((x86_reg)stride),
           "m"(alpha1), "m"(beta1), "m"(ff_bone)
    );
}

static void h264_v_loop_filter_chroma_intra_mmx2(uint8_t *pix, int stride, int alpha, int beta)
{
    h264_loop_filter_chroma_intra_mmx2(pix, stride, alpha-1, beta-1);
}

static void h264_h_loop_filter_chroma_intra_mmx2(uint8_t *pix, int stride, int alpha, int beta)
{
    //FIXME: could cut some load/stores by merging transpose with filter
    DECLARE_ALIGNED(8, uint8_t, trans)[8*4];
    transpose4x4(trans, pix-2, 8, stride);
    transpose4x4(trans+4, pix-2+4*stride, 8, stride);
    h264_loop_filter_chroma_intra_mmx2(trans+2*8, 8, alpha-1, beta-1);
    transpose4x4(pix-2, trans, stride, 8);
    transpose4x4(pix-2+4*stride, trans+4, stride, 8);
}

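/* Compute the deblocking boundary strengths bS for the edges of one
 * macroblock from the non-zero-coefficient flags, reference indices and
 * motion vector differences of the neighbouring 4x4 blocks. */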
static void h264_loop_filter_strength_mmx2( int16_t bS[2][4][4], uint8_t nnz[40], int8_t ref[2][40], int16_t mv[2][40][2],
                                            int bidir, int edges, int step, int mask_mv0, int mask_mv1, int field ) {
    int dir;
    __asm__ volatile(
        "movq %0, %%mm7 \n"
        "movq %1, %%mm6 \n"
        ::"m"(ff_pb_1), "m"(ff_pb_3)
    );
    if(field)
        __asm__ volatile(
            "movq %0, %%mm6 \n"
            ::"m"(ff_pb_3_1)
        );
    __asm__ volatile(
        "movq  %%mm6, %%mm5 \n"
        "paddb %%mm5, %%mm5 \n"
    :);

    // could do a special case for dir==0 && edges==1, but it only reduces the
    // average filter time by 1.2%
    for( dir=1; dir>=0; dir-- ) {
        const x86_reg d_idx = dir ? -8 : -1;
        const int mask_mv = dir ? mask_mv1 : mask_mv0;
        DECLARE_ALIGNED(8, const uint64_t, mask_dir) = dir ? 0 : 0xffffffffffffffffULL;
        int b_idx, edge;
        for( b_idx=12, edge=0; edge<edges; edge+=step, b_idx+=8*step ) {
            __asm__ volatile(
                "pand %0, %%mm0 \n\t"
                ::"m"(mask_dir)
            );
            if(!(mask_mv & edge)) {
                if(bidir) {
                    __asm__ volatile(
                        "movd         (%1,%0), %%mm2 \n"
                        "punpckldq  40(%1,%0), %%mm2 \n" // { ref0[bn], ref1[bn] }
                        "pshufw $0x44,   (%1), %%mm0 \n" // { ref0[b], ref0[b] }
                        "pshufw $0x44, 40(%1), %%mm1 \n" // { ref1[b], ref1[b] }
                        "pshufw $0x4E, %%mm2, %%mm3 \n"
                        "psubb         %%mm2, %%mm0 \n" // { ref0[b]!=ref0[bn], ref0[b]!=ref1[bn] }
                        "psubb         %%mm3, %%mm1 \n" // { ref1[b]!=ref1[bn], ref1[b]!=ref0[bn] }
                        "1: \n"
                        "por           %%mm1, %%mm0 \n"
                        "movq      (%2,%0,4), %%mm1 \n"
                        "movq     8(%2,%0,4), %%mm2 \n"
                        "movq          %%mm1, %%mm3 \n"
                        "movq          %%mm2, %%mm4 \n"
                        "psubw          (%2), %%mm1 \n"
                        "psubw         8(%2), %%mm2 \n"
                        "psubw       160(%2), %%mm3 \n"
                        "psubw       168(%2), %%mm4 \n"
                        "packsswb      %%mm2, %%mm1 \n"
                        "packsswb      %%mm4, %%mm3 \n"
                        "paddb         %%mm6, %%mm1 \n"
                        "paddb         %%mm6, %%mm3 \n"
                        "psubusb       %%mm5, %%mm1 \n" // abs(mv[b] - mv[bn]) >= limit
                        "psubusb       %%mm5, %%mm3 \n"
                        "packsswb      %%mm3, %%mm1 \n"
                        "add $40, %0 \n"
                        "cmp $40, %0 \n"
                        "jl 1b \n"
                        "sub $80, %0 \n"
                        "pshufw $0x4E, %%mm1, %%mm1 \n"
                        "por           %%mm1, %%mm0 \n"
                        "pshufw $0x4E, %%mm0, %%mm1 \n"
                        "pminub        %%mm1, %%mm0 \n"
                        ::"r"(d_idx),
                          "r"(ref[0]+b_idx),
                          "r"(mv[0]+b_idx)
                    );
                } else {
                    __asm__ volatile(
                        "movd        (%1), %%mm0 \n"
                        "psubb    (%1,%0), %%mm0 \n" // ref[b] != ref[bn]
                        "movq        (%2), %%mm1 \n"
                        "movq       8(%2), %%mm2 \n"
                        "psubw  (%2,%0,4), %%mm1 \n"
                        "psubw 8(%2,%0,4), %%mm2 \n"
                        "packsswb   %%mm2, %%mm1 \n"
                        "paddb      %%mm6, %%mm1 \n"
                        "psubusb    %%mm5, %%mm1 \n" // abs(mv[b] - mv[bn]) >= limit
                        "packsswb   %%mm1, %%mm1 \n"
                        "por        %%mm1, %%mm0 \n"
                        ::"r"(d_idx),
                          "r"(ref[0]+b_idx),
                          "r"(mv[0]+b_idx)
                    );
                }
            }
            __asm__ volatile(
                "movd %0, %%mm1 \n"
                "por  %1, %%mm1 \n" // nnz[b] || nnz[bn]
                ::"m"(nnz[b_idx]),
                  "m"(nnz[b_idx+d_idx])
            );
            __asm__ volatile(
                "pminub    %%mm7, %%mm1 \n"
                "pminub    %%mm7, %%mm0 \n"
                "psllw        $1, %%mm1 \n"
                "pxor      %%mm2, %%mm2 \n"
                "pmaxub    %%mm0, %%mm1 \n"
                "punpcklbw %%mm2, %%mm1 \n"
                "movq      %%mm1, %0    \n"
                :"=m"(*bS[dir][edge])
                ::"memory"
            );
        }
        edges = 4;
        step = 1;
    }
    __asm__ volatile(
        "movq   (%0), %%mm0 \n\t"
        "movq  8(%0), %%mm1 \n\t"
        "movq 16(%0), %%mm2 \n\t"
        "movq 24(%0), %%mm3 \n\t"
        TRANSPOSE4(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4)
        "movq %%mm0,   (%0) \n\t"
        "movq %%mm3,  8(%0) \n\t"
        "movq %%mm4, 16(%0) \n\t"
        "movq %%mm2, 24(%0) \n\t"
        ::"r"(bS[0])
        :"memory"
    );
}

/***********************************/
/* weighted prediction */

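/* These macros only declare prototypes; the weight/biweight functions
 * themselves come from external (yasm) assembly and are hooked up in
 * ff_h264dsp_init_x86() below. */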
#define H264_WEIGHT(W, H, OPT) \
void ff_h264_weight_ ## W ## x ## H ## _ ## OPT(uint8_t *dst, \
    int stride, int log2_denom, int weight, int offset);

#define H264_BIWEIGHT(W, H, OPT) \
void ff_h264_biweight_ ## W ## x ## H ## _ ## OPT(uint8_t *dst, \
    uint8_t *src, int stride, int log2_denom, int weightd, \
    int weights, int offset);

#define H264_BIWEIGHT_MMX(W,H) \
H264_WEIGHT  (W, H, mmx2) \
H264_BIWEIGHT(W, H, mmx2)

#define H264_BIWEIGHT_MMX_SSE(W,H) \
H264_BIWEIGHT_MMX(W, H) \
H264_WEIGHT      (W, H, sse2) \
H264_BIWEIGHT    (W, H, sse2) \
H264_BIWEIGHT    (W, H, ssse3)

H264_BIWEIGHT_MMX_SSE(16, 16)
H264_BIWEIGHT_MMX_SSE(16,  8)
H264_BIWEIGHT_MMX_SSE( 8, 16)
H264_BIWEIGHT_MMX_SSE( 8,  8)
H264_BIWEIGHT_MMX_SSE( 8,  4)
H264_BIWEIGHT_MMX    ( 4,  8)
H264_BIWEIGHT_MMX    ( 4,  4)
H264_BIWEIGHT_MMX    ( 4,  2)


    
952
void ff_x264_deblock_v_luma_sse2(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0);
953
void ff_x264_deblock_h_luma_sse2(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0);
954
void ff_x264_deblock_h_luma_intra_mmxext(uint8_t *pix, int stride, int alpha, int beta);
955
void ff_x264_deblock_v_luma_intra_sse2(uint8_t *pix, int stride, int alpha, int beta);
956
void ff_x264_deblock_h_luma_intra_sse2(uint8_t *pix, int stride, int alpha, int beta);
957

    
958
#if HAVE_YASM && ARCH_X86_32
959
void ff_x264_deblock_v8_luma_intra_mmxext(uint8_t *pix, int stride, int alpha, int beta);
960
static void ff_x264_deblock_v_luma_intra_mmxext(uint8_t *pix, int stride, int alpha, int beta)
961
{
962
    ff_x264_deblock_v8_luma_intra_mmxext(pix+0, stride, alpha, beta);
963
    ff_x264_deblock_v8_luma_intra_mmxext(pix+8, stride, alpha, beta);
964
}
965
#endif
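/* Fill in the H264DSPContext function pointers according to the available
 * CPU features (MMX, MMX2, SSE2, SSSE3). */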
void ff_h264dsp_init_x86(H264DSPContext *c)
{
    int mm_flags = mm_support();

    if (mm_flags & FF_MM_MMX) {
        c->h264_idct_dc_add=
        c->h264_idct_add= ff_h264_idct_add_mmx;
        c->h264_idct8_dc_add=
        c->h264_idct8_add= ff_h264_idct8_add_mmx;

        c->h264_idct_add16     = ff_h264_idct_add16_mmx;
        c->h264_idct8_add4     = ff_h264_idct8_add4_mmx;
        c->h264_idct_add8      = ff_h264_idct_add8_mmx;
        c->h264_idct_add16intra= ff_h264_idct_add16intra_mmx;

        if (mm_flags & FF_MM_MMX2) {
            c->h264_idct_dc_add= ff_h264_idct_dc_add_mmx2;
            c->h264_idct8_dc_add= ff_h264_idct8_dc_add_mmx2;
            c->h264_idct_add16     = ff_h264_idct_add16_mmx2;
            c->h264_idct8_add4     = ff_h264_idct8_add4_mmx2;
            c->h264_idct_add8      = ff_h264_idct_add8_mmx2;
            c->h264_idct_add16intra= ff_h264_idct_add16intra_mmx2;

            c->h264_v_loop_filter_luma= h264_v_loop_filter_luma_mmx2;
            c->h264_h_loop_filter_luma= h264_h_loop_filter_luma_mmx2;
            c->h264_v_loop_filter_chroma= h264_v_loop_filter_chroma_mmx2;
            c->h264_h_loop_filter_chroma= h264_h_loop_filter_chroma_mmx2;
            c->h264_v_loop_filter_chroma_intra= h264_v_loop_filter_chroma_intra_mmx2;
            c->h264_h_loop_filter_chroma_intra= h264_h_loop_filter_chroma_intra_mmx2;
            c->h264_loop_filter_strength= h264_loop_filter_strength_mmx2;
        }
        if(mm_flags & FF_MM_SSE2){
            c->h264_idct8_add = ff_h264_idct8_add_sse2;
            c->h264_idct8_add4= ff_h264_idct8_add4_sse2;
        }

#if HAVE_YASM
        if (mm_flags & FF_MM_MMX2){
#if ARCH_X86_32
            c->h264_v_loop_filter_luma_intra = ff_x264_deblock_v_luma_intra_mmxext;
            c->h264_h_loop_filter_luma_intra = ff_x264_deblock_h_luma_intra_mmxext;
#endif
            c->weight_h264_pixels_tab[0]= ff_h264_weight_16x16_mmx2;
            c->weight_h264_pixels_tab[1]= ff_h264_weight_16x8_mmx2;
            c->weight_h264_pixels_tab[2]= ff_h264_weight_8x16_mmx2;
            c->weight_h264_pixels_tab[3]= ff_h264_weight_8x8_mmx2;
            c->weight_h264_pixels_tab[4]= ff_h264_weight_8x4_mmx2;
            c->weight_h264_pixels_tab[5]= ff_h264_weight_4x8_mmx2;
            c->weight_h264_pixels_tab[6]= ff_h264_weight_4x4_mmx2;
            c->weight_h264_pixels_tab[7]= ff_h264_weight_4x2_mmx2;

            c->biweight_h264_pixels_tab[0]= ff_h264_biweight_16x16_mmx2;
            c->biweight_h264_pixels_tab[1]= ff_h264_biweight_16x8_mmx2;
            c->biweight_h264_pixels_tab[2]= ff_h264_biweight_8x16_mmx2;
            c->biweight_h264_pixels_tab[3]= ff_h264_biweight_8x8_mmx2;
            c->biweight_h264_pixels_tab[4]= ff_h264_biweight_8x4_mmx2;
            c->biweight_h264_pixels_tab[5]= ff_h264_biweight_4x8_mmx2;
            c->biweight_h264_pixels_tab[6]= ff_h264_biweight_4x4_mmx2;
            c->biweight_h264_pixels_tab[7]= ff_h264_biweight_4x2_mmx2;

            if( mm_flags&FF_MM_SSE2 ){
                c->weight_h264_pixels_tab[0]= ff_h264_weight_16x16_sse2;
                c->weight_h264_pixels_tab[1]= ff_h264_weight_16x8_sse2;
                c->weight_h264_pixels_tab[2]= ff_h264_weight_8x16_sse2;
                c->weight_h264_pixels_tab[3]= ff_h264_weight_8x8_sse2;
                c->weight_h264_pixels_tab[4]= ff_h264_weight_8x4_sse2;

                c->biweight_h264_pixels_tab[0]= ff_h264_biweight_16x16_sse2;
                c->biweight_h264_pixels_tab[1]= ff_h264_biweight_16x8_sse2;
                c->biweight_h264_pixels_tab[2]= ff_h264_biweight_8x16_sse2;
                c->biweight_h264_pixels_tab[3]= ff_h264_biweight_8x8_sse2;
                c->biweight_h264_pixels_tab[4]= ff_h264_biweight_8x4_sse2;

#if ARCH_X86_64 || !defined(__ICC) || __ICC > 1110
                c->h264_v_loop_filter_luma = ff_x264_deblock_v_luma_sse2;
                c->h264_h_loop_filter_luma = ff_x264_deblock_h_luma_sse2;
                c->h264_v_loop_filter_luma_intra = ff_x264_deblock_v_luma_intra_sse2;
                c->h264_h_loop_filter_luma_intra = ff_x264_deblock_h_luma_intra_sse2;
#endif
#if CONFIG_GPL
                c->h264_idct_add16 = ff_h264_idct_add16_sse2;
                c->h264_idct_add8  = ff_h264_idct_add8_sse2;
                c->h264_idct_add16intra = ff_h264_idct_add16intra_sse2;
#endif
            }
            if ( mm_flags&FF_MM_SSSE3 ){
                c->biweight_h264_pixels_tab[0]= ff_h264_biweight_16x16_ssse3;
                c->biweight_h264_pixels_tab[1]= ff_h264_biweight_16x8_ssse3;
                c->biweight_h264_pixels_tab[2]= ff_h264_biweight_8x16_ssse3;
                c->biweight_h264_pixels_tab[3]= ff_h264_biweight_8x8_ssse3;
                c->biweight_h264_pixels_tab[4]= ff_h264_biweight_8x4_ssse3;
            }
        }
#endif
    }
}