Statistics
| Branch: | Revision:

ffmpeg / libavcodec / i386 / dsputil_mmx_rnd.h @ be449fca

History | View | Annotate | Download (22.8 KB)

1
/*
 * DSP utils mmx functions are compiled twice for rnd/no_rnd
 * Copyright (c) 2000, 2001 Fabrice Bellard.
 * Copyright (c) 2003-2004 Michael Niedermayer <michaelni@gmx.at>
 *
 * MMX optimization by Nick Kurshev <nickols_k@mail.ru>
 * mostly rewritten by Michael Niedermayer <michaelni@gmx.at>
 * and improved by Zdenek Kabelac <kabi@users.sf.net>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */
26

    
27
/* This header intentionally has no multiple inclusion guards. It is meant to
 * be included multiple times and generates different code depending on the
 * value of certain #defines. */
30

    
31
// put_pixels
32
static void DEF(put, pixels8_x2)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
33
{
34
    MOVQ_BFE(mm6);
35
    __asm__ volatile(
36
        "lea    (%3, %3), %%"REG_a"     \n\t"
37
        ASMALIGN(3)
38
        "1:                             \n\t"
39
        "movq   (%1), %%mm0             \n\t"
40
        "movq   1(%1), %%mm1            \n\t"
41
        "movq   (%1, %3), %%mm2         \n\t"
42
        "movq   1(%1, %3), %%mm3        \n\t"
43
        PAVGBP(%%mm0, %%mm1, %%mm4,   %%mm2, %%mm3, %%mm5)
44
        "movq   %%mm4, (%2)             \n\t"
45
        "movq   %%mm5, (%2, %3)         \n\t"
46
        "add    %%"REG_a", %1           \n\t"
47
        "add    %%"REG_a", %2           \n\t"
48
        "movq   (%1), %%mm0             \n\t"
49
        "movq   1(%1), %%mm1            \n\t"
50
        "movq   (%1, %3), %%mm2         \n\t"
51
        "movq   1(%1, %3), %%mm3        \n\t"
52
        PAVGBP(%%mm0, %%mm1, %%mm4,   %%mm2, %%mm3, %%mm5)
53
        "movq   %%mm4, (%2)             \n\t"
54
        "movq   %%mm5, (%2, %3)         \n\t"
55
        "add    %%"REG_a", %1           \n\t"
56
        "add    %%"REG_a", %2           \n\t"
57
        "subl   $4, %0                  \n\t"
58
        "jnz    1b                      \n\t"
59
        :"+g"(h), "+S"(pixels), "+D"(block)
60
        :"r"((x86_reg)line_size)
61
        :REG_a, "memory");
62
}
63

    
64
/**
 * Combine two 8-byte-wide sources with PAVGB/PAVGBP and store the result in
 * dst. (The macros come from the including file; presumably a byte-wise
 * average in the rnd or no_rnd flavour -- confirm there.)
 *
 * @param dst        destination; 8 bytes written per row
 * @param src1       first source, advanced by src1Stride per row
 * @param src2       second source, advanced by a fixed 8 bytes per row
 * @param dstStride  byte stride of dst
 * @param src1Stride byte stride of src1
 * @param h          row count. An odd h is handled by one extra row before
 *                   the main loop; the loop then produces four rows per
 *                   iteration and only ends when the counter hits exactly
 *                   zero, so the count left after the optional odd row must
 *                   be a non-zero multiple of 4.
 */
static void av_unused DEF(put, pixels8_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h)
{
    MOVQ_BFE(mm6);
    __asm__ volatile(
        // %0 = h, %1 = src1, %2 = src2, %3 = dst, %4 = src1Stride, %5 = dstStride
        "testl $1, %0                   \n\t"
        " jz 1f                         \n\t"
        // odd h: emit a single row first
        "movq   (%1), %%mm0             \n\t"
        "movq   (%2), %%mm1             \n\t"
        "add    %4, %1                  \n\t"
        "add    $8, %2                  \n\t"
        PAVGB(%%mm0, %%mm1, %%mm4, %%mm6)
        "movq   %%mm4, (%3)             \n\t"
        "add    %5, %3                  \n\t"
        "decl   %0                      \n\t"
        ASMALIGN(3)
        "1:                             \n\t"
        // rows 0 and 1 of this iteration
        "movq   (%1), %%mm0             \n\t"
        "movq   (%2), %%mm1             \n\t"
        "add    %4, %1                  \n\t"
        "movq   (%1), %%mm2             \n\t"
        "movq   8(%2), %%mm3            \n\t"
        "add    %4, %1                  \n\t"
        PAVGBP(%%mm0, %%mm1, %%mm4,   %%mm2, %%mm3, %%mm5)
        "movq   %%mm4, (%3)             \n\t"
        "add    %5, %3                  \n\t"
        "movq   %%mm5, (%3)             \n\t"
        "add    %5, %3                  \n\t"
        // rows 2 and 3 of this iteration
        "movq   (%1), %%mm0             \n\t"
        "movq   16(%2), %%mm1           \n\t"
        "add    %4, %1                  \n\t"
        "movq   (%1), %%mm2             \n\t"
        "movq   24(%2), %%mm3           \n\t"
        "add    %4, %1                  \n\t"
        "add    $32, %2                 \n\t" // four 8-byte src2 rows consumed
        PAVGBP(%%mm0, %%mm1, %%mm4,   %%mm2, %%mm3, %%mm5)
        "movq   %%mm4, (%3)             \n\t"
        "add    %5, %3                  \n\t"
        "movq   %%mm5, (%3)             \n\t"
        "add    %5, %3                  \n\t"
        "subl   $4, %0                  \n\t"
        "jnz    1b                      \n\t"
        // Under PIC the counter is kept in memory instead of the b register.
#ifdef PIC //Note "+bm" and "+mb" are buggy too (with gcc 3.2.2 at least) and cannot be used
        :"+m"(h), "+a"(src1), "+c"(src2), "+d"(dst)
#else
        :"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst)
#endif
        :"S"((x86_reg)src1Stride), "D"((x86_reg)dstStride)
        :"memory");
}
113

    
114
static void DEF(put, pixels16_x2)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
115
{
116
    MOVQ_BFE(mm6);
117
    __asm__ volatile(
118
        "lea        (%3, %3), %%"REG_a" \n\t"
119
        ASMALIGN(3)
120
        "1:                             \n\t"
121
        "movq   (%1), %%mm0             \n\t"
122
        "movq   1(%1), %%mm1            \n\t"
123
        "movq   (%1, %3), %%mm2         \n\t"
124
        "movq   1(%1, %3), %%mm3        \n\t"
125
        PAVGBP(%%mm0, %%mm1, %%mm4,   %%mm2, %%mm3, %%mm5)
126
        "movq   %%mm4, (%2)             \n\t"
127
        "movq   %%mm5, (%2, %3)         \n\t"
128
        "movq   8(%1), %%mm0            \n\t"
129
        "movq   9(%1), %%mm1            \n\t"
130
        "movq   8(%1, %3), %%mm2        \n\t"
131
        "movq   9(%1, %3), %%mm3        \n\t"
132
        PAVGBP(%%mm0, %%mm1, %%mm4,   %%mm2, %%mm3, %%mm5)
133
        "movq   %%mm4, 8(%2)            \n\t"
134
        "movq   %%mm5, 8(%2, %3)        \n\t"
135
        "add    %%"REG_a", %1           \n\t"
136
        "add    %%"REG_a", %2           \n\t"
137
        "movq   (%1), %%mm0             \n\t"
138
        "movq   1(%1), %%mm1            \n\t"
139
        "movq   (%1, %3), %%mm2         \n\t"
140
        "movq   1(%1, %3), %%mm3        \n\t"
141
        PAVGBP(%%mm0, %%mm1, %%mm4,   %%mm2, %%mm3, %%mm5)
142
        "movq   %%mm4, (%2)             \n\t"
143
        "movq   %%mm5, (%2, %3)         \n\t"
144
        "movq   8(%1), %%mm0            \n\t"
145
        "movq   9(%1), %%mm1            \n\t"
146
        "movq   8(%1, %3), %%mm2        \n\t"
147
        "movq   9(%1, %3), %%mm3        \n\t"
148
        PAVGBP(%%mm0, %%mm1, %%mm4,   %%mm2, %%mm3, %%mm5)
149
        "movq   %%mm4, 8(%2)            \n\t"
150
        "movq   %%mm5, 8(%2, %3)        \n\t"
151
        "add    %%"REG_a", %1           \n\t"
152
        "add    %%"REG_a", %2           \n\t"
153
        "subl   $4, %0                  \n\t"
154
        "jnz    1b                      \n\t"
155
        :"+g"(h), "+S"(pixels), "+D"(block)
156
        :"r"((x86_reg)line_size)
157
        :REG_a, "memory");
158
}
159

    
160
/**
 * Combine two 16-byte-wide sources with PAVGBP and store the result in dst.
 * (The macros come from the including file; presumably a byte-wise average
 * in the rnd or no_rnd flavour -- confirm there.)
 *
 * @param dst        destination; 16 bytes written per row
 * @param src1       first source, advanced by src1Stride per row
 * @param src2       second source, advanced by a fixed 16 bytes per row
 * @param dstStride  byte stride of dst
 * @param src1Stride byte stride of src1
 * @param h          row count. An odd h is handled by one extra row before
 *                   the main loop; the loop then produces two rows per
 *                   iteration and only ends when the counter hits exactly
 *                   zero, so h must be at least 2.
 */
static void av_unused DEF(put, pixels16_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h)
{
    MOVQ_BFE(mm6);
    __asm__ volatile(
        // %0 = h, %1 = src1, %2 = src2, %3 = dst, %4 = src1Stride, %5 = dstStride
        "testl $1, %0                   \n\t"
        " jz 1f                         \n\t"
        // odd h: emit a single row first
        "movq   (%1), %%mm0             \n\t"
        "movq   (%2), %%mm1             \n\t"
        "movq   8(%1), %%mm2            \n\t"
        "movq   8(%2), %%mm3            \n\t"
        "add    %4, %1                  \n\t"
        "add    $16, %2                 \n\t"
        PAVGBP(%%mm0, %%mm1, %%mm4,   %%mm2, %%mm3, %%mm5)
        "movq   %%mm4, (%3)             \n\t"
        "movq   %%mm5, 8(%3)            \n\t"
        "add    %5, %3                  \n\t"
        "decl   %0                      \n\t"
        ASMALIGN(3)
        "1:                             \n\t"
        // first row of this iteration (left half in mm0/mm1, right in mm2/mm3)
        "movq   (%1), %%mm0             \n\t"
        "movq   (%2), %%mm1             \n\t"
        "movq   8(%1), %%mm2            \n\t"
        "movq   8(%2), %%mm3            \n\t"
        "add    %4, %1                  \n\t"
        PAVGBP(%%mm0, %%mm1, %%mm4,   %%mm2, %%mm3, %%mm5)
        "movq   %%mm4, (%3)             \n\t"
        "movq   %%mm5, 8(%3)            \n\t"
        "add    %5, %3                  \n\t"
        // second row of this iteration; src2 is read at +16/+24
        "movq   (%1), %%mm0             \n\t"
        "movq   16(%2), %%mm1           \n\t"
        "movq   8(%1), %%mm2            \n\t"
        "movq   24(%2), %%mm3           \n\t"
        "add    %4, %1                  \n\t"
        PAVGBP(%%mm0, %%mm1, %%mm4,   %%mm2, %%mm3, %%mm5)
        "movq   %%mm4, (%3)             \n\t"
        "movq   %%mm5, 8(%3)            \n\t"
        "add    %5, %3                  \n\t"
        "add    $32, %2                 \n\t" // two 16-byte src2 rows consumed
        "subl   $2, %0                  \n\t"
        "jnz    1b                      \n\t"
        // Under PIC the counter is kept in memory instead of the b register.
#ifdef PIC //Note "+bm" and "+mb" are buggy too (with gcc 3.2.2 at least) and cannot be used
        :"+m"(h), "+a"(src1), "+c"(src2), "+d"(dst)
#else
        :"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst)
#endif
        :"S"((x86_reg)src1Stride), "D"((x86_reg)dstStride)
        :"memory");
}
208

    
209
static void DEF(put, pixels8_y2)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
210
{
211
    MOVQ_BFE(mm6);
212
    __asm__ volatile(
213
        "lea (%3, %3), %%"REG_a"        \n\t"
214
        "movq (%1), %%mm0               \n\t"
215
        ASMALIGN(3)
216
        "1:                             \n\t"
217
        "movq   (%1, %3), %%mm1         \n\t"
218
        "movq   (%1, %%"REG_a"),%%mm2   \n\t"
219
        PAVGBP(%%mm1, %%mm0, %%mm4,   %%mm2, %%mm1, %%mm5)
220
        "movq   %%mm4, (%2)             \n\t"
221
        "movq   %%mm5, (%2, %3)         \n\t"
222
        "add    %%"REG_a", %1           \n\t"
223
        "add    %%"REG_a", %2           \n\t"
224
        "movq   (%1, %3), %%mm1         \n\t"
225
        "movq   (%1, %%"REG_a"),%%mm0   \n\t"
226
        PAVGBP(%%mm1, %%mm2, %%mm4,   %%mm0, %%mm1, %%mm5)
227
        "movq   %%mm4, (%2)             \n\t"
228
        "movq   %%mm5, (%2, %3)         \n\t"
229
        "add    %%"REG_a", %1           \n\t"
230
        "add    %%"REG_a", %2           \n\t"
231
        "subl   $4, %0                  \n\t"
232
        "jnz    1b                      \n\t"
233
        :"+g"(h), "+S"(pixels), "+D"(block)
234
        :"r"((x86_reg)line_size)
235
        :REG_a, "memory");
236
}
237

    
238
static void DEF(put, pixels8_xy2)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
239
{
240
    MOVQ_ZERO(mm7);
241
    SET_RND(mm6); // =2 for rnd  and  =1 for no_rnd version
242
    __asm__ volatile(
243
        "movq   (%1), %%mm0             \n\t"
244
        "movq   1(%1), %%mm4            \n\t"
245
        "movq   %%mm0, %%mm1            \n\t"
246
        "movq   %%mm4, %%mm5            \n\t"
247
        "punpcklbw %%mm7, %%mm0         \n\t"
248
        "punpcklbw %%mm7, %%mm4         \n\t"
249
        "punpckhbw %%mm7, %%mm1         \n\t"
250
        "punpckhbw %%mm7, %%mm5         \n\t"
251
        "paddusw %%mm0, %%mm4           \n\t"
252
        "paddusw %%mm1, %%mm5           \n\t"
253
        "xor    %%"REG_a", %%"REG_a"    \n\t"
254
        "add    %3, %1                  \n\t"
255
        ASMALIGN(3)
256
        "1:                             \n\t"
257
        "movq   (%1, %%"REG_a"), %%mm0  \n\t"
258
        "movq   1(%1, %%"REG_a"), %%mm2 \n\t"
259
        "movq   %%mm0, %%mm1            \n\t"
260
        "movq   %%mm2, %%mm3            \n\t"
261
        "punpcklbw %%mm7, %%mm0         \n\t"
262
        "punpcklbw %%mm7, %%mm2         \n\t"
263
        "punpckhbw %%mm7, %%mm1         \n\t"
264
        "punpckhbw %%mm7, %%mm3         \n\t"
265
        "paddusw %%mm2, %%mm0           \n\t"
266
        "paddusw %%mm3, %%mm1           \n\t"
267
        "paddusw %%mm6, %%mm4           \n\t"
268
        "paddusw %%mm6, %%mm5           \n\t"
269
        "paddusw %%mm0, %%mm4           \n\t"
270
        "paddusw %%mm1, %%mm5           \n\t"
271
        "psrlw  $2, %%mm4               \n\t"
272
        "psrlw  $2, %%mm5               \n\t"
273
        "packuswb  %%mm5, %%mm4         \n\t"
274
        "movq   %%mm4, (%2, %%"REG_a")  \n\t"
275
        "add    %3, %%"REG_a"           \n\t"
276

    
277
        "movq   (%1, %%"REG_a"), %%mm2  \n\t" // 0 <-> 2   1 <-> 3
278
        "movq   1(%1, %%"REG_a"), %%mm4 \n\t"
279
        "movq   %%mm2, %%mm3            \n\t"
280
        "movq   %%mm4, %%mm5            \n\t"
281
        "punpcklbw %%mm7, %%mm2         \n\t"
282
        "punpcklbw %%mm7, %%mm4         \n\t"
283
        "punpckhbw %%mm7, %%mm3         \n\t"
284
        "punpckhbw %%mm7, %%mm5         \n\t"
285
        "paddusw %%mm2, %%mm4           \n\t"
286
        "paddusw %%mm3, %%mm5           \n\t"
287
        "paddusw %%mm6, %%mm0           \n\t"
288
        "paddusw %%mm6, %%mm1           \n\t"
289
        "paddusw %%mm4, %%mm0           \n\t"
290
        "paddusw %%mm5, %%mm1           \n\t"
291
        "psrlw  $2, %%mm0               \n\t"
292
        "psrlw  $2, %%mm1               \n\t"
293
        "packuswb  %%mm1, %%mm0         \n\t"
294
        "movq   %%mm0, (%2, %%"REG_a")  \n\t"
295
        "add    %3, %%"REG_a"           \n\t"
296

    
297
        "subl   $2, %0                  \n\t"
298
        "jnz    1b                      \n\t"
299
        :"+g"(h), "+S"(pixels)
300
        :"D"(block), "r"((x86_reg)line_size)
301
        :REG_a, "memory");
302
}
303

    
304
// avg_pixels
305
/**
 * Blend a 4-byte-wide source block into the destination: each destination
 * row is replaced by PAVGB(block row, pixels row).
 * (PAVGB and MOVQ_BFE are supplied by the including file; presumably a
 * byte-wise average in the rnd or no_rnd flavour -- confirm there.)
 *
 * @param block     destination; 4 bytes are read and rewritten per row
 * @param pixels    source; 4 bytes are read per row
 * @param line_size byte stride used for both source and destination
 * @param h         row count; the do/while body runs before the first test,
 *                  so h must be at least 1
 */
static void av_unused DEF(avg, pixels4)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
{
    MOVQ_BFE(mm6);
    JUMPALIGN();
    do {
        __asm__ volatile(
             "movd  %0, %%mm0           \n\t" // current destination bytes
             "movd  %1, %%mm1           \n\t" // source bytes
             PAVGB(%%mm0, %%mm1, %%mm2, %%mm6)
             "movd  %%mm2, %0           \n\t"
             :"+m"(*block)
             :"m"(*pixels)
             :"memory");
        pixels += line_size;
        block += line_size;
    }
    while (--h);
}
323

    
324
// in case more speed is needed - unrolling would certainly help
325
static void DEF(avg, pixels8)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
326
{
327
    MOVQ_BFE(mm6);
328
    JUMPALIGN();
329
    do {
330
        __asm__ volatile(
331
             "movq  %0, %%mm0           \n\t"
332
             "movq  %1, %%mm1           \n\t"
333
             PAVGB(%%mm0, %%mm1, %%mm2, %%mm6)
334
             "movq  %%mm2, %0           \n\t"
335
             :"+m"(*block)
336
             :"m"(*pixels)
337
             :"memory");
338
        pixels += line_size;
339
        block += line_size;
340
    }
341
    while (--h);
342
}
343

    
344
static void DEF(avg, pixels16)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
345
{
346
    MOVQ_BFE(mm6);
347
    JUMPALIGN();
348
    do {
349
        __asm__ volatile(
350
             "movq  %0, %%mm0           \n\t"
351
             "movq  %1, %%mm1           \n\t"
352
             PAVGB(%%mm0, %%mm1, %%mm2, %%mm6)
353
             "movq  %%mm2, %0           \n\t"
354
             "movq  8%0, %%mm0          \n\t"
355
             "movq  8%1, %%mm1          \n\t"
356
             PAVGB(%%mm0, %%mm1, %%mm2, %%mm6)
357
             "movq  %%mm2, 8%0          \n\t"
358
             :"+m"(*block)
359
             :"m"(*pixels)
360
             :"memory");
361
        pixels += line_size;
362
        block += line_size;
363
    }
364
    while (--h);
365
}
366

    
367
static void DEF(avg, pixels8_x2)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
368
{
369
    MOVQ_BFE(mm6);
370
    JUMPALIGN();
371
    do {
372
        __asm__ volatile(
373
            "movq  %1, %%mm0            \n\t"
374
            "movq  1%1, %%mm1           \n\t"
375
            "movq  %0, %%mm3            \n\t"
376
            PAVGB(%%mm0, %%mm1, %%mm2, %%mm6)
377
            PAVGB(%%mm3, %%mm2, %%mm0, %%mm6)
378
            "movq  %%mm0, %0            \n\t"
379
            :"+m"(*block)
380
            :"m"(*pixels)
381
            :"memory");
382
        pixels += line_size;
383
        block += line_size;
384
    } while (--h);
385
}
386

    
387
static av_unused void DEF(avg, pixels8_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h)
388
{
389
    MOVQ_BFE(mm6);
390
    JUMPALIGN();
391
    do {
392
        __asm__ volatile(
393
            "movq  %1, %%mm0            \n\t"
394
            "movq  %2, %%mm1            \n\t"
395
            "movq  %0, %%mm3            \n\t"
396
            PAVGB(%%mm0, %%mm1, %%mm2, %%mm6)
397
            PAVGB(%%mm3, %%mm2, %%mm0, %%mm6)
398
            "movq  %%mm0, %0            \n\t"
399
            :"+m"(*dst)
400
            :"m"(*src1), "m"(*src2)
401
            :"memory");
402
        dst += dstStride;
403
        src1 += src1Stride;
404
        src2 += 8;
405
    } while (--h);
406
}
407

    
408
static void DEF(avg, pixels16_x2)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
409
{
410
    MOVQ_BFE(mm6);
411
    JUMPALIGN();
412
    do {
413
        __asm__ volatile(
414
            "movq  %1, %%mm0            \n\t"
415
            "movq  1%1, %%mm1           \n\t"
416
            "movq  %0, %%mm3            \n\t"
417
            PAVGB(%%mm0, %%mm1, %%mm2, %%mm6)
418
            PAVGB(%%mm3, %%mm2, %%mm0, %%mm6)
419
            "movq  %%mm0, %0            \n\t"
420
            "movq  8%1, %%mm0           \n\t"
421
            "movq  9%1, %%mm1           \n\t"
422
            "movq  8%0, %%mm3           \n\t"
423
            PAVGB(%%mm0, %%mm1, %%mm2, %%mm6)
424
            PAVGB(%%mm3, %%mm2, %%mm0, %%mm6)
425
            "movq  %%mm0, 8%0           \n\t"
426
            :"+m"(*block)
427
            :"m"(*pixels)
428
            :"memory");
429
        pixels += line_size;
430
        block += line_size;
431
    } while (--h);
432
}
433

    
434
static av_unused void DEF(avg, pixels16_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h)
435
{
436
    MOVQ_BFE(mm6);
437
    JUMPALIGN();
438
    do {
439
        __asm__ volatile(
440
            "movq  %1, %%mm0            \n\t"
441
            "movq  %2, %%mm1            \n\t"
442
            "movq  %0, %%mm3            \n\t"
443
            PAVGB(%%mm0, %%mm1, %%mm2, %%mm6)
444
            PAVGB(%%mm3, %%mm2, %%mm0, %%mm6)
445
            "movq  %%mm0, %0            \n\t"
446
            "movq  8%1, %%mm0           \n\t"
447
            "movq  8%2, %%mm1           \n\t"
448
            "movq  8%0, %%mm3           \n\t"
449
            PAVGB(%%mm0, %%mm1, %%mm2, %%mm6)
450
            PAVGB(%%mm3, %%mm2, %%mm0, %%mm6)
451
            "movq  %%mm0, 8%0           \n\t"
452
            :"+m"(*dst)
453
            :"m"(*src1), "m"(*src2)
454
            :"memory");
455
        dst += dstStride;
456
        src1 += src1Stride;
457
        src2 += 16;
458
    } while (--h);
459
}
460

    
461
static void DEF(avg, pixels8_y2)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
462
{
463
    MOVQ_BFE(mm6);
464
    __asm__ volatile(
465
        "lea    (%3, %3), %%"REG_a"     \n\t"
466
        "movq   (%1), %%mm0             \n\t"
467
        ASMALIGN(3)
468
        "1:                             \n\t"
469
        "movq   (%1, %3), %%mm1         \n\t"
470
        "movq   (%1, %%"REG_a"), %%mm2  \n\t"
471
        PAVGBP(%%mm1, %%mm0, %%mm4,   %%mm2, %%mm1, %%mm5)
472
        "movq   (%2), %%mm3             \n\t"
473
        PAVGB(%%mm3, %%mm4, %%mm0, %%mm6)
474
        "movq   (%2, %3), %%mm3         \n\t"
475
        PAVGB(%%mm3, %%mm5, %%mm1, %%mm6)
476
        "movq   %%mm0, (%2)             \n\t"
477
        "movq   %%mm1, (%2, %3)         \n\t"
478
        "add    %%"REG_a", %1           \n\t"
479
        "add    %%"REG_a", %2           \n\t"
480

    
481
        "movq   (%1, %3), %%mm1         \n\t"
482
        "movq   (%1, %%"REG_a"), %%mm0  \n\t"
483
        PAVGBP(%%mm1, %%mm2, %%mm4,   %%mm0, %%mm1, %%mm5)
484
        "movq   (%2), %%mm3             \n\t"
485
        PAVGB(%%mm3, %%mm4, %%mm2, %%mm6)
486
        "movq   (%2, %3), %%mm3         \n\t"
487
        PAVGB(%%mm3, %%mm5, %%mm1, %%mm6)
488
        "movq   %%mm2, (%2)             \n\t"
489
        "movq   %%mm1, (%2, %3)         \n\t"
490
        "add    %%"REG_a", %1           \n\t"
491
        "add    %%"REG_a", %2           \n\t"
492

    
493
        "subl   $4, %0                  \n\t"
494
        "jnz    1b                      \n\t"
495
        :"+g"(h), "+S"(pixels), "+D"(block)
496
        :"r"((x86_reg)line_size)
497
        :REG_a, "memory");
498
}
499

    
500
// this routine is 'slightly' suboptimal but mostly unused
501
static void DEF(avg, pixels8_xy2)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
502
{
503
    MOVQ_ZERO(mm7);
504
    SET_RND(mm6); // =2 for rnd  and  =1 for no_rnd version
505
    __asm__ volatile(
506
        "movq   (%1), %%mm0             \n\t"
507
        "movq   1(%1), %%mm4            \n\t"
508
        "movq   %%mm0, %%mm1            \n\t"
509
        "movq   %%mm4, %%mm5            \n\t"
510
        "punpcklbw %%mm7, %%mm0         \n\t"
511
        "punpcklbw %%mm7, %%mm4         \n\t"
512
        "punpckhbw %%mm7, %%mm1         \n\t"
513
        "punpckhbw %%mm7, %%mm5         \n\t"
514
        "paddusw %%mm0, %%mm4           \n\t"
515
        "paddusw %%mm1, %%mm5           \n\t"
516
        "xor    %%"REG_a", %%"REG_a"    \n\t"
517
        "add    %3, %1                  \n\t"
518
        ASMALIGN(3)
519
        "1:                             \n\t"
520
        "movq   (%1, %%"REG_a"), %%mm0  \n\t"
521
        "movq   1(%1, %%"REG_a"), %%mm2 \n\t"
522
        "movq   %%mm0, %%mm1            \n\t"
523
        "movq   %%mm2, %%mm3            \n\t"
524
        "punpcklbw %%mm7, %%mm0         \n\t"
525
        "punpcklbw %%mm7, %%mm2         \n\t"
526
        "punpckhbw %%mm7, %%mm1         \n\t"
527
        "punpckhbw %%mm7, %%mm3         \n\t"
528
        "paddusw %%mm2, %%mm0           \n\t"
529
        "paddusw %%mm3, %%mm1           \n\t"
530
        "paddusw %%mm6, %%mm4           \n\t"
531
        "paddusw %%mm6, %%mm5           \n\t"
532
        "paddusw %%mm0, %%mm4           \n\t"
533
        "paddusw %%mm1, %%mm5           \n\t"
534
        "psrlw  $2, %%mm4               \n\t"
535
        "psrlw  $2, %%mm5               \n\t"
536
                "movq   (%2, %%"REG_a"), %%mm3  \n\t"
537
        "packuswb  %%mm5, %%mm4         \n\t"
538
                "pcmpeqd %%mm2, %%mm2   \n\t"
539
                "paddb %%mm2, %%mm2     \n\t"
540
                PAVGB(%%mm3, %%mm4, %%mm5, %%mm2)
541
                "movq   %%mm5, (%2, %%"REG_a")  \n\t"
542
        "add    %3, %%"REG_a"                \n\t"
543

    
544
        "movq   (%1, %%"REG_a"), %%mm2  \n\t" // 0 <-> 2   1 <-> 3
545
        "movq   1(%1, %%"REG_a"), %%mm4 \n\t"
546
        "movq   %%mm2, %%mm3            \n\t"
547
        "movq   %%mm4, %%mm5            \n\t"
548
        "punpcklbw %%mm7, %%mm2         \n\t"
549
        "punpcklbw %%mm7, %%mm4         \n\t"
550
        "punpckhbw %%mm7, %%mm3         \n\t"
551
        "punpckhbw %%mm7, %%mm5         \n\t"
552
        "paddusw %%mm2, %%mm4           \n\t"
553
        "paddusw %%mm3, %%mm5           \n\t"
554
        "paddusw %%mm6, %%mm0           \n\t"
555
        "paddusw %%mm6, %%mm1           \n\t"
556
        "paddusw %%mm4, %%mm0           \n\t"
557
        "paddusw %%mm5, %%mm1           \n\t"
558
        "psrlw  $2, %%mm0               \n\t"
559
        "psrlw  $2, %%mm1               \n\t"
560
                "movq   (%2, %%"REG_a"), %%mm3  \n\t"
561
        "packuswb  %%mm1, %%mm0         \n\t"
562
                "pcmpeqd %%mm2, %%mm2   \n\t"
563
                "paddb %%mm2, %%mm2     \n\t"
564
                PAVGB(%%mm3, %%mm0, %%mm1, %%mm2)
565
                "movq   %%mm1, (%2, %%"REG_a")  \n\t"
566
        "add    %3, %%"REG_a"           \n\t"
567

    
568
        "subl   $2, %0                  \n\t"
569
        "jnz    1b                      \n\t"
570
        :"+g"(h), "+S"(pixels)
571
        :"D"(block), "r"((x86_reg)line_size)
572
        :REG_a, "memory");
573
}
574

    
575
//FIXME optimize
576
static void DEF(put, pixels16_y2)(uint8_t *block, const uint8_t *pixels, int line_size, int h){
577
    DEF(put, pixels8_y2)(block  , pixels  , line_size, h);
578
    DEF(put, pixels8_y2)(block+8, pixels+8, line_size, h);
579
}
580

    
581
static void DEF(put, pixels16_xy2)(uint8_t *block, const uint8_t *pixels, int line_size, int h){
582
    DEF(put, pixels8_xy2)(block  , pixels  , line_size, h);
583
    DEF(put, pixels8_xy2)(block+8, pixels+8, line_size, h);
584
}
585

    
586
static void DEF(avg, pixels16_y2)(uint8_t *block, const uint8_t *pixels, int line_size, int h){
587
    DEF(avg, pixels8_y2)(block  , pixels  , line_size, h);
588
    DEF(avg, pixels8_y2)(block+8, pixels+8, line_size, h);
589
}
590

    
591
static void DEF(avg, pixels16_xy2)(uint8_t *block, const uint8_t *pixels, int line_size, int h){
592
    DEF(avg, pixels8_xy2)(block  , pixels  , line_size, h);
593
    DEF(avg, pixels8_xy2)(block+8, pixels+8, line_size, h);
594
}