Statistics
| Branch: | Revision:

ffmpeg / libavcodec / i386 / dsputil_mmx_rnd.h @ 5509bffa

History | View | Annotate | Download (22.8 KB)

1
/*
2
 * DSP utils mmx functions are compiled twice for rnd/no_rnd
3
 * Copyright (c) 2000, 2001 Fabrice Bellard.
4
 * Copyright (c) 2003-2004 Michael Niedermayer <michaelni@gmx.at>
5
 *
6
 * This library is free software; you can redistribute it and/or
7
 * modify it under the terms of the GNU Lesser General Public
8
 * License as published by the Free Software Foundation; either
9
 * version 2 of the License, or (at your option) any later version.
10
 *
11
 * This library is distributed in the hope that it will be useful,
12
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
14
 * Lesser General Public License for more details.
15
 *
16
 * You should have received a copy of the GNU Lesser General Public
17
 * License along with this library; if not, write to the Free Software
18
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
19
 *
20
 * MMX optimization by Nick Kurshev <nickols_k@mail.ru>
21
 * mostly rewritten by Michael Niedermayer <michaelni@gmx.at>
22
 * and improved by Zdenek Kabelac <kabi@users.sf.net>
23
 */
24

    
25
// put_pixels
26
/*
 * "put" for an 8-byte-wide block using the horizontal neighbour: every output
 * row is produced by PAVGBP from the 8 source bytes at the row start and the
 * 8 bytes starting one byte further right, and is stored to block.
 *
 * PAVGBP and MOVQ_BFE are macros defined outside this file; presumably
 * PAVGBP is a pairwise byte average (rounding chosen by the rnd/no_rnd
 * build) that relies on the constant MOVQ_BFE puts in mm6 -- TODO confirm.
 *
 * block     - destination; advanced by line_size per row
 * pixels    - source; advanced by line_size per row (reads 9 bytes per row)
 * line_size - byte stride shared by source and destination
 * h         - row count; 4 rows are handled per loop iteration and the loop
 *             only ends when the counter is exactly 0, so h must be a
 *             positive multiple of 4
 */
static void DEF(put, pixels8_x2)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
27
{
28
    MOVQ_BFE(mm6);
29
    __asm __volatile(
30
        /* REG_a = 2 * line_size: the step for one pair of rows */
        "lea    (%3, %3), %%"REG_a"     \n\t"
31
        ".balign 8                      \n\t"
32
        "1:                             \n\t"
33
        /* rows 0 and 1: load each row at offset 0 and offset 1 */
        "movq   (%1), %%mm0             \n\t"
34
        "movq   1(%1), %%mm1            \n\t"
35
        "movq   (%1, %3), %%mm2         \n\t"
36
        "movq   1(%1, %3), %%mm3        \n\t"
37
        PAVGBP(%%mm0, %%mm1, %%mm4,   %%mm2, %%mm3, %%mm5)
38
        /* results are in mm4 (first row) and mm5 (second row) */
        "movq   %%mm4, (%2)             \n\t"
39
        "movq   %%mm5, (%2, %3)         \n\t"
40
        "add    %%"REG_a", %1           \n\t"
41
        "add    %%"REG_a", %2           \n\t"
42
        /* rows 2 and 3: same sequence, unrolled */
        "movq   (%1), %%mm0             \n\t"
43
        "movq   1(%1), %%mm1            \n\t"
44
        "movq   (%1, %3), %%mm2         \n\t"
45
        "movq   1(%1, %3), %%mm3        \n\t"
46
        PAVGBP(%%mm0, %%mm1, %%mm4,   %%mm2, %%mm3, %%mm5)
47
        "movq   %%mm4, (%2)             \n\t"
48
        "movq   %%mm5, (%2, %3)         \n\t"
49
        "add    %%"REG_a", %1           \n\t"
50
        "add    %%"REG_a", %2           \n\t"
51
        /* four rows done; loop until the counter hits exactly 0 */
        "subl   $4, %0                  \n\t"
52
        "jnz    1b                      \n\t"
53
        /* %0 = h, %1 = pixels, %2 = block (all modified), %3 = line_size */
        :"+g"(h), "+S"(pixels), "+D"(block)
54
        :"r"((long)line_size)
55
        :REG_a, "memory");
56
}
57

    
58
/*
 * "put" for an 8-byte-wide block built from two sources: each output row is
 * PAVGB/PAVGBP of one 8-byte row of src1 and one 8-byte row of src2.
 *
 * PAVGB, PAVGBP and MOVQ_BFE are macros defined outside this file;
 * presumably a byte average with build-dependent rounding -- TODO confirm.
 *
 * dst        - destination; advanced by dstStride per row
 * src1       - first source; advanced by src1Stride per row
 * src2       - second source; rows are read back to back, 8 bytes per row
 * dstStride  - destination stride in bytes
 * src1Stride - stride of src1 in bytes
 * h          - row count. An odd h has one row handled up front; the main
 *              loop then does 4 rows per iteration and only stops when the
 *              counter is exactly 0.
 *              NOTE(review): the loop body is entered unconditionally, so
 *              the count left after the optional odd row has to be a
 *              positive multiple of 4 (h == 1 would not terminate correctly).
 */
static void attribute_unused DEF(put, pixels8_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h)
59
{
60
    MOVQ_BFE(mm6);
61
    __asm __volatile(
62
        /* even h: skip the single-row prologue */
        "testl $1, %0                   \n\t"
63
        " jz 1f                         \n\t"
64
        /* odd h: handle one row so the remaining count is even */
        "movq   (%1), %%mm0             \n\t"
65
        "movq   (%2), %%mm1             \n\t"
66
        "add    %4, %1                  \n\t"
67
        "add    $8, %2                  \n\t"
68
        PAVGB(%%mm0, %%mm1, %%mm4, %%mm6)
69
        "movq   %%mm4, (%3)             \n\t"
70
        "add    %5, %3                  \n\t"
71
        "decl   %0                      \n\t"
72
        ".balign 8                      \n\t"
73
        "1:                             \n\t"
74
        /* rows 0 and 1: src1 steps by its stride, src2 rows sit at +0 and +8 */
        "movq   (%1), %%mm0             \n\t"
75
        "movq   (%2), %%mm1             \n\t"
76
        "add    %4, %1                  \n\t"
77
        "movq   (%1), %%mm2             \n\t"
78
        "movq   8(%2), %%mm3            \n\t"
79
        "add    %4, %1                  \n\t"
80
        PAVGBP(%%mm0, %%mm1, %%mm4,   %%mm2, %%mm3, %%mm5)
81
        "movq   %%mm4, (%3)             \n\t"
82
        "add    %5, %3                  \n\t"
83
        "movq   %%mm5, (%3)             \n\t"
84
        "add    %5, %3                  \n\t"
85
        /* rows 2 and 3: src2 rows sit at +16 and +24 */
        "movq   (%1), %%mm0             \n\t"
86
        "movq   16(%2), %%mm1           \n\t"
87
        "add    %4, %1                  \n\t"
88
        "movq   (%1), %%mm2             \n\t"
89
        "movq   24(%2), %%mm3           \n\t"
90
        "add    %4, %1                  \n\t"
91
        /* src2 consumed 4 rows of 8 bytes */
        "add    $32, %2                 \n\t"
92
        PAVGBP(%%mm0, %%mm1, %%mm4,   %%mm2, %%mm3, %%mm5)
93
        "movq   %%mm4, (%3)             \n\t"
94
        "add    %5, %3                  \n\t"
95
        "movq   %%mm5, (%3)             \n\t"
96
        "add    %5, %3                  \n\t"
97
        "subl   $4, %0                  \n\t"
98
        "jnz    1b                      \n\t"
99
        /*
         * %0 = h, %1 = src1, %2 = src2, %3 = dst, %4 = src1Stride,
         * %5 = dstStride. With PIC the counter lives in memory ("+m")
         * instead of the "b" register; presumably because that register is
         * reserved in PIC builds -- TODO confirm.
         */
#ifdef PIC //Note "+bm" and "+mb" are buggy too (with gcc 3.2.2 at least) and can't be used
100
        :"+m"(h), "+a"(src1), "+c"(src2), "+d"(dst)
101
#else
102
        :"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst)
103
#endif
104
        :"S"((long)src1Stride), "D"((long)dstStride)
105
        :"memory");
106
}
107

    
108
/*
 * "put" for a 16-byte-wide block using the horizontal neighbour: every row
 * is handled as two 8-byte halves (byte offsets 0 and 8), each produced by
 * PAVGBP from the source bytes and the bytes one position further right.
 *
 * PAVGBP and MOVQ_BFE are macros defined outside this file; presumably a
 * pairwise byte average with build-dependent rounding that uses the mm6
 * constant -- TODO confirm.
 *
 * block     - destination; advanced by line_size per row
 * pixels    - source; advanced by line_size per row (reads 17 bytes per row)
 * line_size - byte stride shared by source and destination
 * h         - row count; 4 rows per loop iteration, loop ends only when the
 *             counter is exactly 0, so h must be a positive multiple of 4
 */
static void DEF(put, pixels16_x2)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
109
{
110
    MOVQ_BFE(mm6);
111
    __asm __volatile(
112
        /* REG_a = 2 * line_size: the step for one pair of rows */
        "lea        (%3, %3), %%"REG_a" \n\t"
113
        ".balign 8                      \n\t"
114
        "1:                             \n\t"
115
        /* rows 0 and 1, left half (bytes 0..7) */
        "movq   (%1), %%mm0             \n\t"
116
        "movq   1(%1), %%mm1            \n\t"
117
        "movq   (%1, %3), %%mm2         \n\t"
118
        "movq   1(%1, %3), %%mm3        \n\t"
119
        PAVGBP(%%mm0, %%mm1, %%mm4,   %%mm2, %%mm3, %%mm5)
120
        "movq   %%mm4, (%2)             \n\t"
121
        "movq   %%mm5, (%2, %3)         \n\t"
122
        /* rows 0 and 1, right half (bytes 8..15) */
        "movq   8(%1), %%mm0            \n\t"
123
        "movq   9(%1), %%mm1            \n\t"
124
        "movq   8(%1, %3), %%mm2        \n\t"
125
        "movq   9(%1, %3), %%mm3        \n\t"
126
        PAVGBP(%%mm0, %%mm1, %%mm4,   %%mm2, %%mm3, %%mm5)
127
        "movq   %%mm4, 8(%2)            \n\t"
128
        "movq   %%mm5, 8(%2, %3)        \n\t"
129
        "add    %%"REG_a", %1           \n\t"
130
        "add    %%"REG_a", %2           \n\t"
131
        /* rows 2 and 3, left half */
        "movq   (%1), %%mm0             \n\t"
132
        "movq   1(%1), %%mm1            \n\t"
133
        "movq   (%1, %3), %%mm2         \n\t"
134
        "movq   1(%1, %3), %%mm3        \n\t"
135
        PAVGBP(%%mm0, %%mm1, %%mm4,   %%mm2, %%mm3, %%mm5)
136
        "movq   %%mm4, (%2)             \n\t"
137
        "movq   %%mm5, (%2, %3)         \n\t"
138
        /* rows 2 and 3, right half */
        "movq   8(%1), %%mm0            \n\t"
139
        "movq   9(%1), %%mm1            \n\t"
140
        "movq   8(%1, %3), %%mm2        \n\t"
141
        "movq   9(%1, %3), %%mm3        \n\t"
142
        PAVGBP(%%mm0, %%mm1, %%mm4,   %%mm2, %%mm3, %%mm5)
143
        "movq   %%mm4, 8(%2)            \n\t"
144
        "movq   %%mm5, 8(%2, %3)        \n\t"
145
        "add    %%"REG_a", %1           \n\t"
146
        "add    %%"REG_a", %2           \n\t"
147
        /* four rows done; loop until the counter hits exactly 0 */
        "subl   $4, %0                  \n\t"
148
        "jnz    1b                      \n\t"
149
        /* %0 = h, %1 = pixels, %2 = block (all modified), %3 = line_size */
        :"+g"(h), "+S"(pixels), "+D"(block)
150
        :"r"((long)line_size)
151
        :REG_a, "memory");
152
}
153

    
154
/*
 * "put" for a 16-byte-wide block built from two sources: each output row is
 * PAVGBP of one 16-byte row of src1 and one 16-byte row of src2, processed
 * as two 8-byte halves.
 *
 * PAVGBP and MOVQ_BFE are macros defined outside this file; presumably a
 * pairwise byte average with build-dependent rounding -- TODO confirm.
 *
 * dst        - destination; advanced by dstStride per row
 * src1       - first source; advanced by src1Stride per row
 * src2       - second source; rows are read back to back, 16 bytes per row
 * dstStride  - destination stride in bytes
 * src1Stride - stride of src1 in bytes
 * h          - row count. An odd h has one row handled up front; the main
 *              loop then does 2 rows per iteration and only stops when the
 *              counter is exactly 0.
 *              NOTE(review): the loop body is entered unconditionally, so
 *              the count left after the optional odd row has to be a
 *              positive even number (h == 1 would not terminate correctly).
 */
static void attribute_unused DEF(put, pixels16_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h)
155
{
156
    MOVQ_BFE(mm6);
157
    __asm __volatile(
158
        /* even h: skip the single-row prologue */
        "testl $1, %0                   \n\t"
159
        " jz 1f                         \n\t"
160
        /* odd h: handle one full 16-byte row first */
        "movq   (%1), %%mm0             \n\t"
161
        "movq   (%2), %%mm1             \n\t"
162
        "movq   8(%1), %%mm2            \n\t"
163
        "movq   8(%2), %%mm3            \n\t"
164
        "add    %4, %1                  \n\t"
165
        "add    $16, %2                 \n\t"
166
        PAVGBP(%%mm0, %%mm1, %%mm4,   %%mm2, %%mm3, %%mm5)
167
        "movq   %%mm4, (%3)             \n\t"
168
        "movq   %%mm5, 8(%3)            \n\t"
169
        "add    %5, %3                  \n\t"
170
        "decl   %0                      \n\t"
171
        ".balign 8                      \n\t"
172
        "1:                             \n\t"
173
        /* row 0: src2 bytes at +0 and +8 */
        "movq   (%1), %%mm0             \n\t"
174
        "movq   (%2), %%mm1             \n\t"
175
        "movq   8(%1), %%mm2            \n\t"
176
        "movq   8(%2), %%mm3            \n\t"
177
        "add    %4, %1                  \n\t"
178
        PAVGBP(%%mm0, %%mm1, %%mm4,   %%mm2, %%mm3, %%mm5)
179
        "movq   %%mm4, (%3)             \n\t"
180
        "movq   %%mm5, 8(%3)            \n\t"
181
        "add    %5, %3                  \n\t"
182
        /* row 1: src2 bytes at +16 and +24 */
        "movq   (%1), %%mm0             \n\t"
183
        "movq   16(%2), %%mm1           \n\t"
184
        "movq   8(%1), %%mm2            \n\t"
185
        "movq   24(%2), %%mm3           \n\t"
186
        "add    %4, %1                  \n\t"
187
        PAVGBP(%%mm0, %%mm1, %%mm4,   %%mm2, %%mm3, %%mm5)
188
        "movq   %%mm4, (%3)             \n\t"
189
        "movq   %%mm5, 8(%3)            \n\t"
190
        "add    %5, %3                  \n\t"
191
        /* src2 consumed 2 rows of 16 bytes */
        "add    $32, %2                 \n\t"
192
        "subl   $2, %0                  \n\t"
193
        "jnz    1b                      \n\t"
194
        /*
         * %0 = h, %1 = src1, %2 = src2, %3 = dst, %4 = src1Stride,
         * %5 = dstStride. With PIC the counter lives in memory ("+m")
         * instead of the "b" register; presumably because that register is
         * reserved in PIC builds -- TODO confirm.
         */
#ifdef PIC //Note "+bm" and "+mb" are buggy too (with gcc 3.2.2 at least) and can't be used
195
        :"+m"(h), "+a"(src1), "+c"(src2), "+d"(dst)
196
#else
197
        :"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst)
198
#endif
199
        :"S"((long)src1Stride), "D"((long)dstStride)
200
        :"memory");
201
}
202

    
203
/*
 * "put" for an 8-byte-wide block using the vertical neighbour: every output
 * row is produced by PAVGBP from a source row and the source row directly
 * below it (line_size bytes further on).
 *
 * Each source row is loaded only once: the row that closes one pair is kept
 * in a register (alternating between mm0 and mm2) and reused as the upper
 * row of the next pair. In total h + 1 source rows are read.
 *
 * PAVGBP and MOVQ_BFE are macros defined outside this file; presumably a
 * pairwise byte average with build-dependent rounding that uses the mm6
 * constant -- TODO confirm.
 *
 * block     - destination; advanced by line_size per row
 * pixels    - source; advanced by line_size per row
 * line_size - byte stride shared by source and destination
 * h         - row count; 4 rows per loop iteration, loop ends only when the
 *             counter is exactly 0, so h must be a positive multiple of 4
 */
static void DEF(put, pixels8_y2)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
204
{
205
    MOVQ_BFE(mm6);
206
    __asm __volatile(
207
        /* REG_a = 2 * line_size */
        "lea (%3, %3), %%"REG_a"        \n\t"
208
        /* mm0 = first source row, carried into the loop */
        "movq (%1), %%mm0               \n\t"
209
        ".balign 8                      \n\t"
210
        "1:                             \n\t"
211
        /* mm1 = row + 1, mm2 = row + 2 */
        "movq   (%1, %3), %%mm1         \n\t"
212
        "movq   (%1, %%"REG_a"),%%mm2   \n\t"
213
        /* mm4 from (row+1, row), mm5 from (row+2, row+1) */
        PAVGBP(%%mm1, %%mm0, %%mm4,   %%mm2, %%mm1, %%mm5)
214
        "movq   %%mm4, (%2)             \n\t"
215
        "movq   %%mm5, (%2, %3)         \n\t"
216
        "add    %%"REG_a", %1           \n\t"
217
        "add    %%"REG_a", %2           \n\t"
218
        /* second half: mm2 is now the upper row, mm0 receives the new last row */
        "movq   (%1, %3), %%mm1         \n\t"
219
        "movq   (%1, %%"REG_a"),%%mm0   \n\t"
220
        PAVGBP(%%mm1, %%mm2, %%mm4,   %%mm0, %%mm1, %%mm5)
221
        "movq   %%mm4, (%2)             \n\t"
222
        "movq   %%mm5, (%2, %3)         \n\t"
223
        "add    %%"REG_a", %1           \n\t"
224
        "add    %%"REG_a", %2           \n\t"
225
        /* four rows done; loop until the counter hits exactly 0 */
        "subl   $4, %0                  \n\t"
226
        "jnz    1b                      \n\t"
227
        /* %0 = h, %1 = pixels, %2 = block (all modified), %3 = line_size */
        :"+g"(h), "+S"(pixels), "+D"(block)
228
        :"r"((long)line_size)
229
        :REG_a, "memory");
230
}
231

    
232
/*
 * "put" for an 8-byte-wide block using the right, lower and lower-right
 * neighbours. For every output byte the code widens the four source bytes
 * to 16-bit words, adds them together with the constant in mm6 and shifts
 * the sum right by 2 before packing back to bytes:
 *     out = (p[x] + p[x+1] + below[x] + below[x+1] + mm6) >> 2
 *
 * The horizontal sum p[x] + p[x+1] of a row is computed once and reused for
 * the output row above and the output row below it; the sums alternate
 * between mm4/mm5 and mm0/mm1 (low four words / high four words).
 *
 * MOVQ_ZERO and SET_RND are macros defined outside this file; the unpack
 * instructions below interleave with mm7, so mm7 is presumably all zero
 * after MOVQ_ZERO -- TODO confirm.
 *
 * block     - destination; only indexed, never modified (input operand)
 * pixels    - source; moved one row down before the loop
 * line_size - byte stride shared by source and destination
 * h         - row count; 2 rows per loop iteration, loop ends only when the
 *             counter is exactly 0, so h must be a positive even number
 */
static void DEF(put, pixels8_xy2)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
233
{
234
    MOVQ_ZERO(mm7);
235
    SET_RND(mm6); // =2 for rnd  and  =1 for no_rnd version
236
    __asm __volatile(
237
        /* prologue: mm4/mm5 = word sums p[x] + p[x+1] of the first row */
        "movq   (%1), %%mm0             \n\t"
238
        "movq   1(%1), %%mm4            \n\t"
239
        "movq   %%mm0, %%mm1            \n\t"
240
        "movq   %%mm4, %%mm5            \n\t"
241
        "punpcklbw %%mm7, %%mm0         \n\t"
242
        "punpcklbw %%mm7, %%mm4         \n\t"
243
        "punpckhbw %%mm7, %%mm1         \n\t"
244
        "punpckhbw %%mm7, %%mm5         \n\t"
245
        "paddusw %%mm0, %%mm4           \n\t"
246
        "paddusw %%mm1, %%mm5           \n\t"
247
        /* REG_a = running byte offset into both source and destination */
        "xor    %%"REG_a", %%"REG_a"    \n\t"
248
        /* source now points one row down, so (%1, REG_a) is the lower row */
        "add    %3, %1                  \n\t"
249
        ".balign 8                      \n\t"
250
        "1:                             \n\t"
251
        /* mm0/mm1 = word sums of the lower row */
        "movq   (%1, %%"REG_a"), %%mm0  \n\t"
252
        "movq   1(%1, %%"REG_a"), %%mm2 \n\t"
253
        "movq   %%mm0, %%mm1            \n\t"
254
        "movq   %%mm2, %%mm3            \n\t"
255
        "punpcklbw %%mm7, %%mm0         \n\t"
256
        "punpcklbw %%mm7, %%mm2         \n\t"
257
        "punpckhbw %%mm7, %%mm1         \n\t"
258
        "punpckhbw %%mm7, %%mm3         \n\t"
259
        "paddusw %%mm2, %%mm0           \n\t"
260
        "paddusw %%mm3, %%mm1           \n\t"
261
        /* upper sums + mm6 + lower sums, then >> 2 and pack to bytes */
        "paddusw %%mm6, %%mm4           \n\t"
262
        "paddusw %%mm6, %%mm5           \n\t"
263
        "paddusw %%mm0, %%mm4           \n\t"
264
        "paddusw %%mm1, %%mm5           \n\t"
265
        "psrlw  $2, %%mm4               \n\t"
266
        "psrlw  $2, %%mm5               \n\t"
267
        "packuswb  %%mm5, %%mm4         \n\t"
268
        "movq   %%mm4, (%2, %%"REG_a")  \n\t"
269
        "add    %3, %%"REG_a"           \n\t"
270

    
271
        "movq   (%1, %%"REG_a"), %%mm2  \n\t" // 0 <-> 2   1 <-> 3
272
        /* same as above with roles swapped: mm0/mm1 are the upper sums now */
        "movq   1(%1, %%"REG_a"), %%mm4 \n\t"
273
        "movq   %%mm2, %%mm3            \n\t"
274
        "movq   %%mm4, %%mm5            \n\t"
275
        "punpcklbw %%mm7, %%mm2         \n\t"
276
        "punpcklbw %%mm7, %%mm4         \n\t"
277
        "punpckhbw %%mm7, %%mm3         \n\t"
278
        "punpckhbw %%mm7, %%mm5         \n\t"
279
        "paddusw %%mm2, %%mm4           \n\t"
280
        "paddusw %%mm3, %%mm5           \n\t"
281
        "paddusw %%mm6, %%mm0           \n\t"
282
        "paddusw %%mm6, %%mm1           \n\t"
283
        "paddusw %%mm4, %%mm0           \n\t"
284
        "paddusw %%mm5, %%mm1           \n\t"
285
        "psrlw  $2, %%mm0               \n\t"
286
        "psrlw  $2, %%mm1               \n\t"
287
        "packuswb  %%mm1, %%mm0         \n\t"
288
        "movq   %%mm0, (%2, %%"REG_a")  \n\t"
289
        "add    %3, %%"REG_a"           \n\t"
290

    
291
        /* two rows done; loop until the counter hits exactly 0 */
        "subl   $2, %0                  \n\t"
292
        "jnz    1b                      \n\t"
293
        /* %0 = h, %1 = pixels (modified), %2 = block, %3 = line_size */
        :"+g"(h), "+S"(pixels)
294
        :"D"(block), "r"((long)line_size)
295
        :REG_a, "memory");
296
}
297

    
298
// avg_pixels
299
/*
 * "avg" for a 4-byte-wide block: each row of block is replaced by PAVGB of
 * the 4 bytes already in block and the 4 bytes at pixels.
 *
 * PAVGB, MOVQ_BFE and JUMPALIGN are macros defined outside this file;
 * presumably PAVGB is a byte average with build-dependent rounding that uses
 * the mm6 constant -- TODO confirm.
 *
 * block     - destination, read and written; advanced by line_size per row
 * pixels    - source; advanced by line_size per row
 * line_size - byte stride shared by source and destination
 * h         - row count; the body runs before the counter is tested, so h
 *             must be at least 1
 */
static void attribute_unused DEF(avg, pixels4)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
300
{
301
    MOVQ_BFE(mm6);
302
    JUMPALIGN();
303
    do {
304
        __asm __volatile(
305
             /* movd moves 32 bits: 4 bytes of destination and source */
             "movd  %0, %%mm0           \n\t"
306
             "movd  %1, %%mm1           \n\t"
307
             PAVGB(%%mm0, %%mm1, %%mm2, %%mm6)
308
             /* result (mm2) goes back to the destination row */
             "movd  %%mm2, %0           \n\t"
309
             :"+m"(*block)
310
             :"m"(*pixels)
311
             :"memory");
312
        pixels += line_size;
313
        block += line_size;
314
    }
315
    while (--h);
316
}
317

    
318
// in case more speed is needed - unrolling would certainly help
319
/*
 * "avg" for an 8-byte-wide block: each row of block is replaced by PAVGB of
 * the 8 bytes already in block and the 8 bytes at pixels.
 *
 * PAVGB, MOVQ_BFE and JUMPALIGN are macros defined outside this file;
 * presumably PAVGB is a byte average with build-dependent rounding that uses
 * the mm6 constant -- TODO confirm.
 *
 * block     - destination, read and written; advanced by line_size per row
 * pixels    - source; advanced by line_size per row
 * line_size - byte stride shared by source and destination
 * h         - row count; the body runs before the counter is tested, so h
 *             must be at least 1
 */
static void DEF(avg, pixels8)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
320
{
321
    MOVQ_BFE(mm6);
322
    JUMPALIGN();
323
    do {
324
        __asm __volatile(
325
             /* mm0 = current destination row, mm1 = source row */
             "movq  %0, %%mm0           \n\t"
326
             "movq  %1, %%mm1           \n\t"
327
             PAVGB(%%mm0, %%mm1, %%mm2, %%mm6)
328
             /* result (mm2) goes back to the destination row */
             "movq  %%mm2, %0           \n\t"
329
             :"+m"(*block)
330
             :"m"(*pixels)
331
             :"memory");
332
        pixels += line_size;
333
        block += line_size;
334
    }
335
    while (--h);
336
}
337

    
338
/*
 * "avg" for a 16-byte-wide block: each row of block is replaced by PAVGB of
 * the bytes already in block and the bytes at pixels, handled as two 8-byte
 * halves.
 *
 * PAVGB, MOVQ_BFE and JUMPALIGN are macros defined outside this file;
 * presumably PAVGB is a byte average with build-dependent rounding that uses
 * the mm6 constant -- TODO confirm.
 *
 * block     - destination, read and written; advanced by line_size per row
 * pixels    - source; advanced by line_size per row
 * line_size - byte stride shared by source and destination
 * h         - row count; the body runs before the counter is tested, so h
 *             must be at least 1
 */
static void DEF(avg, pixels16)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
339
{
340
    MOVQ_BFE(mm6);
341
    JUMPALIGN();
342
    do {
343
        __asm __volatile(
344
             /* left half: bytes 0..7 */
             "movq  %0, %%mm0           \n\t"
345
             "movq  %1, %%mm1           \n\t"
346
             PAVGB(%%mm0, %%mm1, %%mm2, %%mm6)
347
             "movq  %%mm2, %0           \n\t"
348
             /*
              * right half: the literal 8 is glued in front of the memory
              * operand text to address 8 bytes further on.
              * NOTE(review): this depends on how the compiler prints the
              * operand (it must stay valid with a leading displacement) --
              * confirm for the compilers in use.
              */
             "movq  8%0, %%mm0          \n\t"
349
             "movq  8%1, %%mm1          \n\t"
350
             PAVGB(%%mm0, %%mm1, %%mm2, %%mm6)
351
             "movq  %%mm2, 8%0          \n\t"
352
             :"+m"(*block)
353
             :"m"(*pixels)
354
             :"memory");
355
        pixels += line_size;
356
        block += line_size;
357
    }
358
    while (--h);
359
}
360

    
361
/*
 * "avg" for an 8-byte-wide block using the horizontal neighbour: per row,
 * PAVGB first combines the 8 source bytes with the 8 bytes one position
 * further right, then PAVGB combines that result with the bytes already in
 * block, and the outcome is written back to block.
 *
 * PAVGB, MOVQ_BFE and JUMPALIGN are macros defined outside this file;
 * presumably PAVGB is a byte average with build-dependent rounding that uses
 * the mm6 constant -- TODO confirm.
 *
 * block     - destination, read and written; advanced by line_size per row
 * pixels    - source; advanced by line_size per row (reads 9 bytes per row)
 * line_size - byte stride shared by source and destination
 * h         - row count; the body runs before the counter is tested, so h
 *             must be at least 1
 */
static void DEF(avg, pixels8_x2)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
362
{
363
    MOVQ_BFE(mm6);
364
    JUMPALIGN();
365
    do {
366
        __asm __volatile(
367
            "movq  %1, %%mm0            \n\t"
368
            /* literal 1 in front of the operand: same row, one byte right */
            "movq  1%1, %%mm1           \n\t"
369
            /* mm3 = current destination row */
            "movq  %0, %%mm3            \n\t"
370
            /* mm2 = source combined with its right neighbour */
            PAVGB(%%mm0, %%mm1, %%mm2, %%mm6)
371
            /* mm0 = destination combined with mm2 */
            PAVGB(%%mm3, %%mm2, %%mm0, %%mm6)
372
            "movq  %%mm0, %0            \n\t"
373
            :"+m"(*block)
374
            :"m"(*pixels)
375
            :"memory");
376
        pixels += line_size;
377
        block += line_size;
378
    } while (--h);
379
}
380

    
381
/*
 * "avg" for an 8-byte-wide block built from two sources: per row, PAVGB
 * first combines 8 bytes of src1 with 8 bytes of src2, then PAVGB combines
 * that result with the bytes already in dst, and the outcome is written
 * back to dst.
 *
 * PAVGB, MOVQ_BFE and JUMPALIGN are macros defined outside this file;
 * presumably PAVGB is a byte average with build-dependent rounding that uses
 * the mm6 constant -- TODO confirm.
 *
 * dst        - destination, read and written; advanced by dstStride per row
 * src1       - first source; advanced by src1Stride per row
 * src2       - second source; advanced by 8 bytes per row
 * dstStride  - destination stride in bytes
 * src1Stride - stride of src1 in bytes
 * h          - row count; the body runs before the counter is tested, so h
 *              must be at least 1
 */
static __attribute__((unused)) void DEF(avg, pixels8_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h)
382
{
383
    MOVQ_BFE(mm6);
384
    JUMPALIGN();
385
    do {
386
        __asm __volatile(
387
            "movq  %1, %%mm0            \n\t"
388
            "movq  %2, %%mm1            \n\t"
389
            /* mm3 = current destination row */
            "movq  %0, %%mm3            \n\t"
390
            /* mm2 = src1 combined with src2 */
            PAVGB(%%mm0, %%mm1, %%mm2, %%mm6)
391
            /* mm0 = destination combined with mm2 */
            PAVGB(%%mm3, %%mm2, %%mm0, %%mm6)
392
            "movq  %%mm0, %0            \n\t"
393
            :"+m"(*dst)
394
            :"m"(*src1), "m"(*src2)
395
            :"memory");
396
        dst += dstStride;
397
        src1 += src1Stride;
398
        /* src2 rows are stored back to back, 8 bytes each */
        src2 += 8;
399
    } while (--h);
400
}
401

    
402
/*
 * "avg" for a 16-byte-wide block using the horizontal neighbour, handled as
 * two 8-byte halves per row: PAVGB combines the source bytes with the bytes
 * one position further right, then PAVGB combines that result with the
 * bytes already in block, and the outcome is written back to block.
 *
 * PAVGB, MOVQ_BFE and JUMPALIGN are macros defined outside this file;
 * presumably PAVGB is a byte average with build-dependent rounding that uses
 * the mm6 constant -- TODO confirm.
 *
 * block     - destination, read and written; advanced by line_size per row
 * pixels    - source; advanced by line_size per row (reads 17 bytes per row)
 * line_size - byte stride shared by source and destination
 * h         - row count; the body runs before the counter is tested, so h
 *             must be at least 1
 */
static void DEF(avg, pixels16_x2)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
403
{
404
    MOVQ_BFE(mm6);
405
    JUMPALIGN();
406
    do {
407
        __asm __volatile(
408
            /* left half: source bytes 0..7 and 1..8, destination bytes 0..7 */
            "movq  %1, %%mm0            \n\t"
409
            "movq  1%1, %%mm1           \n\t"
410
            "movq  %0, %%mm3            \n\t"
411
            PAVGB(%%mm0, %%mm1, %%mm2, %%mm6)
412
            PAVGB(%%mm3, %%mm2, %%mm0, %%mm6)
413
            "movq  %%mm0, %0            \n\t"
414
            /* right half: source bytes 8..15 and 9..16, destination bytes 8..15 */
            "movq  8%1, %%mm0           \n\t"
415
            "movq  9%1, %%mm1           \n\t"
416
            "movq  8%0, %%mm3           \n\t"
417
            PAVGB(%%mm0, %%mm1, %%mm2, %%mm6)
418
            PAVGB(%%mm3, %%mm2, %%mm0, %%mm6)
419
            "movq  %%mm0, 8%0           \n\t"
420
            :"+m"(*block)
421
            :"m"(*pixels)
422
            :"memory");
423
        pixels += line_size;
424
        block += line_size;
425
    } while (--h);
426
}
427

    
428
/*
 * "avg" for a 16-byte-wide block built from two sources, handled as two
 * 8-byte halves per row: PAVGB combines src1 with src2, then PAVGB combines
 * that result with the bytes already in dst, and the outcome is written
 * back to dst.
 *
 * PAVGB, MOVQ_BFE and JUMPALIGN are macros defined outside this file;
 * presumably PAVGB is a byte average with build-dependent rounding that uses
 * the mm6 constant -- TODO confirm.
 *
 * dst        - destination, read and written; advanced by dstStride per row
 * src1       - first source; advanced by src1Stride per row
 * src2       - second source; advanced by 16 bytes per row
 * dstStride  - destination stride in bytes
 * src1Stride - stride of src1 in bytes
 * h          - row count; the body runs before the counter is tested, so h
 *              must be at least 1
 */
static __attribute__((unused)) void DEF(avg, pixels16_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h)
429
{
430
    MOVQ_BFE(mm6);
431
    JUMPALIGN();
432
    do {
433
        __asm __volatile(
434
            /* left half: bytes 0..7 of src1, src2 and dst */
            "movq  %1, %%mm0            \n\t"
435
            "movq  %2, %%mm1            \n\t"
436
            "movq  %0, %%mm3            \n\t"
437
            PAVGB(%%mm0, %%mm1, %%mm2, %%mm6)
438
            PAVGB(%%mm3, %%mm2, %%mm0, %%mm6)
439
            "movq  %%mm0, %0            \n\t"
440
            /* right half: bytes 8..15 of src1, src2 and dst */
            "movq  8%1, %%mm0           \n\t"
441
            "movq  8%2, %%mm1           \n\t"
442
            "movq  8%0, %%mm3           \n\t"
443
            PAVGB(%%mm0, %%mm1, %%mm2, %%mm6)
444
            PAVGB(%%mm3, %%mm2, %%mm0, %%mm6)
445
            "movq  %%mm0, 8%0           \n\t"
446
            :"+m"(*dst)
447
            :"m"(*src1), "m"(*src2)
448
            :"memory");
449
        dst += dstStride;
450
        src1 += src1Stride;
451
        /* src2 rows are stored back to back, 16 bytes each */
        src2 += 16;
452
    } while (--h);
453
}
454

    
455
/*
 * "avg" for an 8-byte-wide block using the vertical neighbour: PAVGBP
 * combines each source row with the source row directly below it, then
 * PAVGB combines that result with the bytes already in block, and the
 * outcome is written back to block.
 *
 * Each source row is loaded only once: the row that closes one pair stays in
 * a register (alternating between mm0 and mm2) and is reused as the upper
 * row of the next pair. In total h + 1 source rows are read.
 *
 * PAVGB, PAVGBP and MOVQ_BFE are macros defined outside this file;
 * presumably byte averages with build-dependent rounding that use the mm6
 * constant -- TODO confirm.
 *
 * block     - destination, read and written; advanced by line_size per row
 * pixels    - source; advanced by line_size per row
 * line_size - byte stride shared by source and destination
 * h         - row count; 4 rows per loop iteration, loop ends only when the
 *             counter is exactly 0, so h must be a positive multiple of 4
 */
static void DEF(avg, pixels8_y2)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
456
{
457
    MOVQ_BFE(mm6);
458
    __asm __volatile(
459
        /* REG_a = 2 * line_size */
        "lea    (%3, %3), %%"REG_a"     \n\t"
460
        /* mm0 = first source row, carried into the loop */
        "movq   (%1), %%mm0             \n\t"
461
        ".balign 8                      \n\t"
462
        "1:                             \n\t"
463
        /* mm1 = row + 1, mm2 = row + 2 */
        "movq   (%1, %3), %%mm1         \n\t"
464
        "movq   (%1, %%"REG_a"), %%mm2  \n\t"
465
        PAVGBP(%%mm1, %%mm0, %%mm4,   %%mm2, %%mm1, %%mm5)
466
        /* merge with the existing destination rows; mm0 is free to be overwritten here */
        "movq   (%2), %%mm3             \n\t"
467
        PAVGB(%%mm3, %%mm4, %%mm0, %%mm6)
468
        "movq   (%2, %3), %%mm3         \n\t"
469
        PAVGB(%%mm3, %%mm5, %%mm1, %%mm6)
470
        "movq   %%mm0, (%2)             \n\t"
471
        "movq   %%mm1, (%2, %3)         \n\t"
472
        "add    %%"REG_a", %1           \n\t"
473
        "add    %%"REG_a", %2           \n\t"
474

    
475
        /* second half: mm2 is the upper row, mm0 receives the new last row */
        "movq   (%1, %3), %%mm1         \n\t"
476
        "movq   (%1, %%"REG_a"), %%mm0  \n\t"
477
        PAVGBP(%%mm1, %%mm2, %%mm4,   %%mm0, %%mm1, %%mm5)
478
        /* merge with destination; results land in mm2 and mm1 so mm0 survives for the next iteration */
        "movq   (%2), %%mm3             \n\t"
479
        PAVGB(%%mm3, %%mm4, %%mm2, %%mm6)
480
        "movq   (%2, %3), %%mm3         \n\t"
481
        PAVGB(%%mm3, %%mm5, %%mm1, %%mm6)
482
        "movq   %%mm2, (%2)             \n\t"
483
        "movq   %%mm1, (%2, %3)         \n\t"
484
        "add    %%"REG_a", %1           \n\t"
485
        "add    %%"REG_a", %2           \n\t"
486

    
487
        /* four rows done; loop until the counter hits exactly 0 */
        "subl   $4, %0                  \n\t"
488
        "jnz    1b                      \n\t"
489
        /* %0 = h, %1 = pixels, %2 = block (all modified), %3 = line_size */
        :"+g"(h), "+S"(pixels), "+D"(block)
490
        :"r"((long)line_size)
491
        :REG_a, "memory");
492
}
493

    
494
// this routine is 'slightly' suboptimal but mostly unused
495
/*
 * "avg" for an 8-byte-wide block using the right, lower and lower-right
 * neighbours. The four source bytes are widened to 16-bit words, summed
 * together with the constant in mm6 and shifted right by 2:
 *     t = (p[x] + p[x+1] + below[x] + below[x+1] + mm6) >> 2
 * t is packed back to bytes and then combined with the bytes already in
 * block through PAVGB before being stored.
 *
 * mm6 holds the SET_RND constant here, so the constant PAVGB needs is
 * rebuilt in mm2 each time: pcmpeqd sets every bit, paddb doubles each byte,
 * leaving 0xFE in every byte. Other routines in this file pass mm6 (set by
 * MOVQ_BFE) in that position.
 *
 * MOVQ_ZERO, SET_RND and PAVGB are macros defined outside this file; the
 * unpack instructions interleave with mm7, so mm7 is presumably all zero
 * after MOVQ_ZERO -- TODO confirm.
 *
 * block     - destination, read and written; only indexed, never modified
 * pixels    - source; moved one row down before the loop
 * line_size - byte stride shared by source and destination
 * h         - row count; 2 rows per loop iteration, loop ends only when the
 *             counter is exactly 0, so h must be a positive even number
 */
static void DEF(avg, pixels8_xy2)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
496
{
497
    MOVQ_ZERO(mm7);
498
    SET_RND(mm6); // =2 for rnd  and  =1 for no_rnd version
499
    __asm __volatile(
500
        /* prologue: mm4/mm5 = word sums p[x] + p[x+1] of the first row */
        "movq   (%1), %%mm0             \n\t"
501
        "movq   1(%1), %%mm4            \n\t"
502
        "movq   %%mm0, %%mm1            \n\t"
503
        "movq   %%mm4, %%mm5            \n\t"
504
        "punpcklbw %%mm7, %%mm0         \n\t"
505
        "punpcklbw %%mm7, %%mm4         \n\t"
506
        "punpckhbw %%mm7, %%mm1         \n\t"
507
        "punpckhbw %%mm7, %%mm5         \n\t"
508
        "paddusw %%mm0, %%mm4           \n\t"
509
        "paddusw %%mm1, %%mm5           \n\t"
510
        /* REG_a = running byte offset into both source and destination */
        "xor    %%"REG_a", %%"REG_a"    \n\t"
511
        /* source now points one row down, so (%1, REG_a) is the lower row */
        "add    %3, %1                  \n\t"
512
        ".balign 8                      \n\t"
513
        "1:                             \n\t"
514
        /* mm0/mm1 = word sums of the lower row */
        "movq   (%1, %%"REG_a"), %%mm0  \n\t"
515
        "movq   1(%1, %%"REG_a"), %%mm2 \n\t"
516
        "movq   %%mm0, %%mm1            \n\t"
517
        "movq   %%mm2, %%mm3            \n\t"
518
        "punpcklbw %%mm7, %%mm0         \n\t"
519
        "punpcklbw %%mm7, %%mm2         \n\t"
520
        "punpckhbw %%mm7, %%mm1         \n\t"
521
        "punpckhbw %%mm7, %%mm3         \n\t"
522
        "paddusw %%mm2, %%mm0           \n\t"
523
        "paddusw %%mm3, %%mm1           \n\t"
524
        /* upper sums + mm6 + lower sums, then >> 2 */
        "paddusw %%mm6, %%mm4           \n\t"
525
        "paddusw %%mm6, %%mm5           \n\t"
526
        "paddusw %%mm0, %%mm4           \n\t"
527
        "paddusw %%mm1, %%mm5           \n\t"
528
        "psrlw  $2, %%mm4               \n\t"
529
        "psrlw  $2, %%mm5               \n\t"
530
                /* mm3 = current destination row */
                "movq   (%2, %%"REG_a"), %%mm3  \n\t"
531
        "packuswb  %%mm5, %%mm4         \n\t"
532
                /* mm2 = 0xFE in every byte (all ones, then each byte doubled) */
                "pcmpeqd %%mm2, %%mm2   \n\t"
533
                "paddb %%mm2, %%mm2     \n\t"
534
                PAVGB(%%mm3, %%mm4, %%mm5, %%mm2)
535
                "movq   %%mm5, (%2, %%"REG_a")  \n\t"
536
        "add    %3, %%"REG_a"                \n\t"
537

    
538
        "movq   (%1, %%"REG_a"), %%mm2  \n\t" // 0 <-> 2   1 <-> 3
539
        /* same as above with roles swapped: mm0/mm1 are the upper sums now */
        "movq   1(%1, %%"REG_a"), %%mm4 \n\t"
540
        "movq   %%mm2, %%mm3            \n\t"
541
        "movq   %%mm4, %%mm5            \n\t"
542
        "punpcklbw %%mm7, %%mm2         \n\t"
543
        "punpcklbw %%mm7, %%mm4         \n\t"
544
        "punpckhbw %%mm7, %%mm3         \n\t"
545
        "punpckhbw %%mm7, %%mm5         \n\t"
546
        "paddusw %%mm2, %%mm4           \n\t"
547
        "paddusw %%mm3, %%mm5           \n\t"
548
        "paddusw %%mm6, %%mm0           \n\t"
549
        "paddusw %%mm6, %%mm1           \n\t"
550
        "paddusw %%mm4, %%mm0           \n\t"
551
        "paddusw %%mm5, %%mm1           \n\t"
552
        "psrlw  $2, %%mm0               \n\t"
553
        "psrlw  $2, %%mm1               \n\t"
554
                /* mm3 = current destination row */
                "movq   (%2, %%"REG_a"), %%mm3  \n\t"
555
        "packuswb  %%mm1, %%mm0         \n\t"
556
                /* rebuild the 0xFE constant; mm2 was used as scratch above */
                "pcmpeqd %%mm2, %%mm2   \n\t"
557
                "paddb %%mm2, %%mm2     \n\t"
558
                PAVGB(%%mm3, %%mm0, %%mm1, %%mm2)
559
                "movq   %%mm1, (%2, %%"REG_a")  \n\t"
560
        "add    %3, %%"REG_a"           \n\t"
561

    
562
        /* two rows done; loop until the counter hits exactly 0 */
        "subl   $2, %0                  \n\t"
563
        "jnz    1b                      \n\t"
564
        /* %0 = h, %1 = pixels (modified), %2 = block, %3 = line_size */
        :"+g"(h), "+S"(pixels)
565
        :"D"(block), "r"((long)line_size)
566
        :REG_a, "memory");
567
}
568

    
569
//FIXME optimize
570
static void DEF(put, pixels16_y2)(uint8_t *block, const uint8_t *pixels, int line_size, int h){
571
    DEF(put, pixels8_y2)(block  , pixels  , line_size, h);
572
    DEF(put, pixels8_y2)(block+8, pixels+8, line_size, h);
573
}
574

    
575
static void DEF(put, pixels16_xy2)(uint8_t *block, const uint8_t *pixels, int line_size, int h){
576
    DEF(put, pixels8_xy2)(block  , pixels  , line_size, h);
577
    DEF(put, pixels8_xy2)(block+8, pixels+8, line_size, h);
578
}
579

    
580
static void DEF(avg, pixels16_y2)(uint8_t *block, const uint8_t *pixels, int line_size, int h){
581
    DEF(avg, pixels8_y2)(block  , pixels  , line_size, h);
582
    DEF(avg, pixels8_y2)(block+8, pixels+8, line_size, h);
583
}
584

    
585
static void DEF(avg, pixels16_xy2)(uint8_t *block, const uint8_t *pixels, int line_size, int h){
586
    DEF(avg, pixels8_xy2)(block  , pixels  , line_size, h);
587
    DEF(avg, pixels8_xy2)(block+8, pixels+8, line_size, h);
588
}
589

    
590