Statistics
| Branch: | Revision:

ffmpeg / libavcodec / i386 / dsputil_mmx_rnd.h @ 3178ee4c

History | View | Annotate | Download (21.6 KB)

1
/*
2
 * DSP utils mmx functions are compiled twice for rnd/no_rnd
3
 * Copyright (c) 2000, 2001 Fabrice Bellard.
4
 *
5
 * This library is free software; you can redistribute it and/or
6
 * modify it under the terms of the GNU Lesser General Public
7
 * License as published by the Free Software Foundation; either
8
 * version 2 of the License, or (at your option) any later version.
9
 *
10
 * This library is distributed in the hope that it will be useful,
11
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
13
 * Lesser General Public License for more details.
14
 *
15
 * You should have received a copy of the GNU Lesser General Public
16
 * License along with this library; if not, write to the Free Software
17
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
18
 *
19
 * MMX optimization by Nick Kurshev <nickols_k@mail.ru>
20
 * mostly rewritten by Michael Niedermayer <michaelni@gmx.at>
21
 * and improved by Zdenek Kabelac <kabi@users.sf.net>
22
 */
23

    
24
// put_pixels
25
// Horizontal two-tap "put" for an 8-byte-wide block.
// For each row the 8 bytes at pixels and the 8 bytes at pixels+1 are combined
// by PAVGBP and the result is stored to block.
// PAVGBP and MOVQ_BFE are defined outside this chunk; presumably PAVGBP is a
// packed byte average of two register pairs and mm6 is its constant - confirm.
//   block     - destination; 8 bytes are written per row
//   pixels    - source; 9 bytes are read per row (offsets 0..8)
//   line_size - row stride in bytes, used for both source and destination
//   h         - row count; the loop handles 4 rows per pass and exits only
//               when the counter reaches exactly zero, so h must be a
//               positive multiple of 4
// 32-bit x86 only: pointers are advanced with addl and eax is clobbered.
static void DEF(put, pixels8_x2)(UINT8 *block, const UINT8 *pixels, int line_size, int h)
26
{
27
    MOVQ_BFE(mm6);
28
    __asm __volatile(
// eax = 2 * line_size, the distance covered by one two-row group
29
        "lea        (%3, %3), %%eax                \n\t"
30
        ".balign 8                        \n\t"
31
        "1:                                \n\t"
// rows 0 and 1 of this pass
32
        "movq        (%1), %%mm0                \n\t"
33
        "movq        1(%1), %%mm1                \n\t"
34
        "movq        (%1, %3), %%mm2                \n\t"
35
        "movq        1(%1, %3), %%mm3        \n\t"
36
        PAVGBP(%%mm0, %%mm1, %%mm4,   %%mm2, %%mm3, %%mm5)
37
        "movq        %%mm4, (%2)                \n\t"
38
        "movq        %%mm5, (%2, %3)                \n\t"
39
        "addl        %%eax, %1                \n\t"
40
        "addl        %%eax, %2                \n\t"
// rows 2 and 3 of this pass (same sequence, unrolled)
41
        "movq        (%1), %%mm0                \n\t"
42
        "movq        1(%1), %%mm1                \n\t"
43
        "movq        (%1, %3), %%mm2                \n\t"
44
        "movq        1(%1, %3), %%mm3        \n\t"
45
        PAVGBP(%%mm0, %%mm1, %%mm4,   %%mm2, %%mm3, %%mm5)
46
        "movq        %%mm4, (%2)                \n\t"
47
        "movq        %%mm5, (%2, %3)                \n\t"
48
        "addl        %%eax, %1                \n\t"
49
        "addl        %%eax, %2                \n\t"
50
        "subl        $4, %0                        \n\t"
51
        "jnz        1b                        \n\t"
52
        :"+g"(h), "+S"(pixels), "+D"(block)
53
        :"r"(line_size)
54
        :"eax", "memory");
55
}
56

    
57
// Two-source "put" for an 8-byte-wide block: each output row is PAVGBP of one
// row from src1 and one row from src2 (PAVGBP and MOVQ_BFE are defined outside
// this chunk; presumably a packed byte average using the mm6 constant - confirm).
//   dst        - destination, advanced by dstStride after every row
//   src1       - first source, advanced by src1Stride after every row
//   src2       - second source, read as consecutive 8-byte rows
//                (offsets 0, 8, 16, 24, then the pointer moves on by 32)
//   dstStride  - destination row stride in bytes
//   src1Stride - src1 row stride in bytes
//   h          - row count; 4 rows per pass and the loop exits only at exactly
//                zero, so h must be a positive multiple of 4
// 32-bit x86 only: pointers are advanced with addl.
static void DEF(put, pixels8_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h)
58
{
59
    MOVQ_BFE(mm6);
60
    __asm __volatile(
61
        ".balign 8                        \n\t"
62
        "1:                                \n\t"
// rows 0 and 1: mm0/mm2 come from src1, mm1/mm3 from src2
63
        "movq        (%1), %%mm0                \n\t"
64
        "movq        (%2), %%mm1                \n\t"
65
        "addl        %4, %1                        \n\t"
66
        "movq        (%1), %%mm2                \n\t"
67
        "movq        8(%2), %%mm3                \n\t"
68
        "addl        %4, %1                        \n\t"
69
        PAVGBP(%%mm0, %%mm1, %%mm4,   %%mm2, %%mm3, %%mm5)
70
        "movq        %%mm4, (%3)                \n\t"
71
        "addl        %5, %3                        \n\t"
72
        "movq        %%mm5, (%3)                \n\t"
73
        "addl        %5, %3                        \n\t"
// rows 2 and 3
74
        "movq        (%1), %%mm0                \n\t"
75
        "movq        16(%2), %%mm1                \n\t"
76
        "addl        %4, %1                        \n\t"
77
        "movq        (%1), %%mm2                \n\t"
78
        "movq        24(%2), %%mm3                \n\t"
79
        "addl        %4, %1                        \n\t"
// src2 has supplied four 8-byte rows in this pass
80
        "addl        $32, %2                        \n\t"
81
        PAVGBP(%%mm0, %%mm1, %%mm4,   %%mm2, %%mm3, %%mm5)
82
        "movq        %%mm4, (%3)                \n\t"
83
        "addl        %5, %3                        \n\t"
84
        "movq        %%mm5, (%3)                \n\t"
85
        "addl        %5, %3                        \n\t"
86
        "subl        $4, %0                        \n\t"
87
        "jnz        1b                        \n\t"
88
        :"+g"(h), "+r"(src1), "+r"(src2), "+r"(dst)
89
        :"r"(src1Stride), "r"(dstStride)
90
        :"memory");
91
}
92

    
93
// Horizontal two-tap "put" for a 16-byte-wide block.
// Same scheme as the 8-byte version applied to two 8-byte halves per row:
// bytes at offset k and k+1 of the source row are combined by PAVGBP
// (defined outside this chunk; presumably a packed byte average using the
// mm6 constant set by MOVQ_BFE - confirm) and stored to block.
//   block     - destination; 16 bytes are written per row
//   pixels    - source; 17 bytes are read per row (offsets 0..16)
//   line_size - row stride in bytes, used for both source and destination
//   h         - row count; 4 rows per pass and the loop exits only at exactly
//               zero, so h must be a positive multiple of 4
// 32-bit x86 only: pointers are advanced with addl and eax is clobbered.
static void DEF(put, pixels16_x2)(UINT8 *block, const UINT8 *pixels, int line_size, int h)
94
{
95
    MOVQ_BFE(mm6);
96
    __asm __volatile(
// eax = 2 * line_size
97
        "lea        (%3, %3), %%eax                \n\t"
98
        ".balign 8                        \n\t"
99
        "1:                                \n\t"
// rows 0 and 1, left 8 bytes
100
        "movq        (%1), %%mm0                \n\t"
101
        "movq        1(%1), %%mm1                \n\t"
102
        "movq        (%1, %3), %%mm2                \n\t"
103
        "movq        1(%1, %3), %%mm3        \n\t"
104
        PAVGBP(%%mm0, %%mm1, %%mm4,   %%mm2, %%mm3, %%mm5)
105
        "movq        %%mm4, (%2)                \n\t"
106
        "movq        %%mm5, (%2, %3)                \n\t"
// rows 0 and 1, right 8 bytes
107
        "movq        8(%1), %%mm0                \n\t"
108
        "movq        9(%1), %%mm1                \n\t"
109
        "movq        8(%1, %3), %%mm2        \n\t"
110
        "movq        9(%1, %3), %%mm3        \n\t"
111
        PAVGBP(%%mm0, %%mm1, %%mm4,   %%mm2, %%mm3, %%mm5)
112
        "movq        %%mm4, 8(%2)                \n\t"
113
        "movq        %%mm5, 8(%2, %3)        \n\t"
114
        "addl        %%eax, %1                \n\t"
115
        "addl        %%eax, %2                \n\t"
// rows 2 and 3, left 8 bytes
116
        "movq        (%1), %%mm0                \n\t"
117
        "movq        1(%1), %%mm1                \n\t"
118
        "movq        (%1, %3), %%mm2                \n\t"
119
        "movq        1(%1, %3), %%mm3        \n\t"
120
        PAVGBP(%%mm0, %%mm1, %%mm4,   %%mm2, %%mm3, %%mm5)
121
        "movq        %%mm4, (%2)                \n\t"
122
        "movq        %%mm5, (%2, %3)                \n\t"
// rows 2 and 3, right 8 bytes
123
        "movq        8(%1), %%mm0                \n\t"
124
        "movq        9(%1), %%mm1                \n\t"
125
        "movq        8(%1, %3), %%mm2        \n\t"
126
        "movq        9(%1, %3), %%mm3        \n\t"
127
        PAVGBP(%%mm0, %%mm1, %%mm4,   %%mm2, %%mm3, %%mm5)
128
        "movq        %%mm4, 8(%2)                \n\t"
129
        "movq        %%mm5, 8(%2, %3)        \n\t"
130
        "addl        %%eax, %1                \n\t"
131
        "addl        %%eax, %2                \n\t"
132
        "subl        $4, %0                        \n\t"
133
        "jnz        1b                        \n\t"
134
        :"+g"(h), "+S"(pixels), "+D"(block)
135
        :"r"(line_size)
136
        :"eax", "memory");
137
}
138

    
139
// Two-source "put" for a 16-byte-wide block: each output row is PAVGBP of one
// 16-byte row from src1 and one from src2, handled as two 8-byte halves
// (PAVGBP and MOVQ_BFE are defined outside this chunk; presumably a packed
// byte average using the mm6 constant - confirm).
//   dst        - destination, advanced by dstStride after every row
//   src1       - first source, advanced by src1Stride after every row
//   src2       - second source, read as consecutive 16-byte rows
//                (offsets 0/8 and 16/24, then the pointer moves on by 32)
//   dstStride  - destination row stride in bytes
//   src1Stride - src1 row stride in bytes
//   h          - row count; 2 rows per pass and the loop exits only at exactly
//                zero, so h must be a positive even number
// 32-bit x86 only: pointers are advanced with addl.
static void DEF(put, pixels16_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h)
140
{
141
    MOVQ_BFE(mm6);
142
    __asm __volatile(
143
        ".balign 8                        \n\t"
144
        "1:                                \n\t"
// first row: left half in mm0/mm1, right half in mm2/mm3
145
        "movq        (%1), %%mm0                \n\t"
146
        "movq        (%2), %%mm1                \n\t"
147
        "movq        8(%1), %%mm2                \n\t"
148
        "movq        8(%2), %%mm3                \n\t"
149
        "addl        %4, %1                        \n\t"
150
        PAVGBP(%%mm0, %%mm1, %%mm4,   %%mm2, %%mm3, %%mm5)
151
        "movq        %%mm4, (%3)                \n\t"
152
        "movq        %%mm5, 8(%3)                \n\t"
153
        "addl        %5, %3                        \n\t"
// second row: src2 is still at its old position, hence offsets 16 and 24
154
        "movq        (%1), %%mm0                \n\t"
155
        "movq        16(%2), %%mm1                \n\t"
156
        "movq        8(%1), %%mm2                \n\t"
157
        "movq        24(%2), %%mm3                \n\t"
158
        "addl        %4, %1                        \n\t"
159
        PAVGBP(%%mm0, %%mm1, %%mm4,   %%mm2, %%mm3, %%mm5)
160
        "movq        %%mm4, (%3)                \n\t"
161
        "movq        %%mm5, 8(%3)                \n\t"
162
        "addl        %5, %3                        \n\t"
163
        "addl        $32, %2                        \n\t"
164
        "subl        $2, %0                        \n\t"
165
        "jnz        1b                        \n\t"
166
        :"+g"(h), "+r"(src1), "+r"(src2), "+r"(dst)
167
        :"r"(src1Stride), "r"(dstStride)
168
        :"memory");
169
}
170

    
171
// Vertical two-tap "put" for an 8-byte-wide block: each output row is PAVGBP
// of a source row and the source row below it (PAVGBP and MOVQ_BFE are defined
// outside this chunk; presumably a packed byte average using the mm6
// constant - confirm).
//   block     - destination; 8 bytes are written per row
//   pixels    - source; h + 1 rows of 8 bytes are read
//   line_size - row stride in bytes, used for both source and destination
//   h         - row count; 4 rows per pass and the loop exits only at exactly
//               zero, so h must be a positive multiple of 4
// 32-bit x86 only: pointers are advanced with addl and eax is clobbered.
static void DEF(put, pixels8_y2)(UINT8 *block, const UINT8 *pixels, int line_size, int h)
172
{
173
    MOVQ_BFE(mm6);
174
    __asm __volatile(
// eax = 2 * line_size; mm0 is primed with the first source row
175
        "lea (%3, %3), %%eax                \n\t"
176
        "movq (%1), %%mm0                \n\t"
177
        ".balign 8                        \n\t"
178
        "1:                                \n\t"
// mm0 = row r, mm1 = row r+1, mm2 = row r+2 -> outputs r and r+1
179
        "movq        (%1, %3), %%mm1                \n\t"
180
        "movq        (%1, %%eax),%%mm2        \n\t"
181
        PAVGBP(%%mm1, %%mm0, %%mm4,   %%mm2, %%mm1, %%mm5)
182
        "movq        %%mm4, (%2)                \n\t"
183
        "movq        %%mm5, (%2, %3)                \n\t"
184
        "addl        %%eax, %1                \n\t"
185
        "addl        %%eax, %2                \n\t"
// roles of mm0 and mm2 swap, so the last row loaded (mm0) carries over
// into the next pass without being reloaded
186
        "movq        (%1, %3), %%mm1                \n\t"
187
        "movq        (%1, %%eax),%%mm0        \n\t"
188
        PAVGBP(%%mm1, %%mm2, %%mm4,   %%mm0, %%mm1, %%mm5)
189
        "movq        %%mm4, (%2)                \n\t"
190
        "movq        %%mm5, (%2, %3)                \n\t"
191
        "addl        %%eax, %1                \n\t"
192
        "addl        %%eax, %2                \n\t"
193
        "subl        $4, %0                        \n\t"
194
        "jnz        1b                        \n\t"
195
        :"+g"(h), "+S"(pixels), "+D"(block)
196
        :"r"(line_size)
197
        :"eax", "memory");
198
}
199

    
200
// Four-tap (2x2 neighbourhood) "put" for an 8-byte-wide block.
// Bytes are widened to 16-bit words against mm7; for every row the sum
// "row[x] + row[x+1]" is formed, and each output row is
// (sum of this row + sum of the next row + mm6) >> 2, packed back to bytes.
// MOVQ_ZERO and SET_RND are defined outside this chunk; presumably mm7 is
// zeroed and mm6 holds the rounding word described in the comment below.
//   block     - destination; 8 bytes are written per row (not advanced; eax
//               is used as the running row offset)
//   pixels    - source; h + 1 rows of 9 bytes are read
//   line_size - row stride in bytes, used for both source and destination
//   h         - row count; 2 rows per pass and the loop exits only at exactly
//               zero, so h must be a positive even number
// 32-bit x86 only: pointers are advanced with addl and eax is clobbered.
static void DEF(put, pixels8_xy2)(UINT8 *block, const UINT8 *pixels, int line_size, int h)
201
{
202
    MOVQ_ZERO(mm7);
203
    SET_RND(mm6); // =2 for rnd  and  =1 for no_rnd version
204
    __asm __volatile(
// prologue: horizontal pair sums of the first row -> mm4 (low 4) / mm5 (high 4)
205
        "movq        (%1), %%mm0                \n\t"
206
        "movq        1(%1), %%mm4                \n\t"
207
        "movq        %%mm0, %%mm1                \n\t"
208
        "movq        %%mm4, %%mm5                \n\t"
209
        "punpcklbw %%mm7, %%mm0                \n\t"
210
        "punpcklbw %%mm7, %%mm4                \n\t"
211
        "punpckhbw %%mm7, %%mm1                \n\t"
212
        "punpckhbw %%mm7, %%mm5                \n\t"
213
        "paddusw %%mm0, %%mm4                \n\t"
214
        "paddusw %%mm1, %%mm5                \n\t"
// eax = running row offset; pixels is moved down one row so that
// (%1, eax) addresses the row below the one being written at (%2, eax)
215
        "xorl        %%eax, %%eax                \n\t"
216
        "addl        %3, %1                        \n\t"
217
        ".balign 8                      \n\t"
218
        "1:                                \n\t"
// next row's pair sums -> mm0/mm1; combined with the previous sums in mm4/mm5
219
        "movq        (%1, %%eax), %%mm0        \n\t"
220
        "movq        1(%1, %%eax), %%mm2        \n\t"
221
        "movq        %%mm0, %%mm1                \n\t"
222
        "movq        %%mm2, %%mm3                \n\t"
223
        "punpcklbw %%mm7, %%mm0                \n\t"
224
        "punpcklbw %%mm7, %%mm2                \n\t"
225
        "punpckhbw %%mm7, %%mm1                \n\t"
226
        "punpckhbw %%mm7, %%mm3                \n\t"
227
        "paddusw %%mm2, %%mm0                 \n\t"
228
        "paddusw %%mm3, %%mm1                \n\t"
229
        "paddusw %%mm6, %%mm4                \n\t"
230
        "paddusw %%mm6, %%mm5                \n\t"
231
        "paddusw %%mm0, %%mm4                \n\t"
232
        "paddusw %%mm1, %%mm5                \n\t"
233
        "psrlw        $2, %%mm4                \n\t"
234
        "psrlw        $2, %%mm5                \n\t"
235
        "packuswb  %%mm5, %%mm4                \n\t"
236
        "movq        %%mm4, (%2, %%eax)        \n\t"
237
        "addl        %3, %%eax                \n\t"
238

    
// second row of the pass: same steps with the register roles exchanged, so
// the sums left in mm4/mm5 feed the next pass
239
        "movq        (%1, %%eax), %%mm2        \n\t" // 0 <-> 2   1 <-> 3
240
        "movq        1(%1, %%eax), %%mm4        \n\t"
241
        "movq        %%mm2, %%mm3                \n\t"
242
        "movq        %%mm4, %%mm5                \n\t"
243
        "punpcklbw %%mm7, %%mm2                \n\t"
244
        "punpcklbw %%mm7, %%mm4                \n\t"
245
        "punpckhbw %%mm7, %%mm3                \n\t"
246
        "punpckhbw %%mm7, %%mm5                \n\t"
247
        "paddusw %%mm2, %%mm4                 \n\t"
248
        "paddusw %%mm3, %%mm5                \n\t"
249
        "paddusw %%mm6, %%mm0                \n\t"
250
        "paddusw %%mm6, %%mm1                \n\t"
251
        "paddusw %%mm4, %%mm0                \n\t"
252
        "paddusw %%mm5, %%mm1                \n\t"
253
        "psrlw        $2, %%mm0                \n\t"
254
        "psrlw        $2, %%mm1                \n\t"
255
        "packuswb  %%mm1, %%mm0                \n\t"
256
        "movq        %%mm0, (%2, %%eax)        \n\t"
257
        "addl        %3, %%eax                \n\t"
258

    
259
        "subl        $2, %0                        \n\t"
260
        "jnz        1b                        \n\t"
261
        :"+g"(h), "+S"(pixels)
262
        :"D"(block), "r"(line_size)
263
        :"eax", "memory");
264
}
265

    
266
// Four-source "put" for an 8-byte-wide block. Each output byte is
// (src1[x] + src2[x] + src2[x + 64] + src2[x + 136] + mm6) >> 2, computed in
// 16-bit words (bytes are widened against mm7) and packed back to bytes.
// MOVQ_ZERO and SET_RND are defined outside this chunk; presumably mm7 is
// zeroed and mm6 holds the rounding word described in the comment below.
// NOTE(review): the fixed offsets 64 and 136 presumably select three
// temporary planes laid out back to back in src2 - confirm against the caller.
//   dst    - destination, advanced by stride after every row
//   src1   - first source, advanced by stride after every row
//   src2   - packed source, advanced by 8 bytes after every row
//   stride - row stride in bytes shared by dst and src1
//   h      - row count; one row per pass, the loop exits only at exactly
//            zero, so h must be at least 1
// 32-bit x86 only: pointers are advanced with addl.
static void DEF(put, pixels8_l4)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int stride, int h)
267
{
268
    MOVQ_ZERO(mm7);
269
    SET_RND(mm6); // =2 for rnd  and  =1 for no_rnd version
270
    __asm __volatile(
271
        ".balign 8                      \n\t"
272
        "1:                                \n\t"
// low 4 bytes of the row -> mm3
273
        "movq        (%1), %%mm0                \n\t"
274
        "movq        (%2), %%mm1                \n\t"
275
        "movq        64(%2), %%mm2                \n\t"
276
        "movq        136(%2), %%mm3                \n\t"
277
        "punpcklbw %%mm7, %%mm0                \n\t"
278
        "punpcklbw %%mm7, %%mm1                \n\t"
279
        "punpcklbw %%mm7, %%mm2                \n\t"
280
        "punpcklbw %%mm7, %%mm3                \n\t"
281
        "paddusw %%mm6, %%mm0                \n\t"
282
        "paddusw %%mm0, %%mm1                \n\t"
283
        "paddusw %%mm2, %%mm3                \n\t"
284
        "paddusw %%mm1, %%mm3                \n\t"
285
        "psrlw        $2, %%mm3                \n\t"
// high 4 bytes of the row -> mm4 (the same four loads are repeated)
286
        "movq        (%1), %%mm0                \n\t"
287
        "movq        (%2), %%mm1                \n\t"
288
        "movq        64(%2), %%mm2                \n\t"
289
        "movq        136(%2), %%mm4                \n\t"
290
        "punpckhbw %%mm7, %%mm0                \n\t"
291
        "punpckhbw %%mm7, %%mm1                \n\t"
292
        "punpckhbw %%mm7, %%mm2                \n\t"
293
        "punpckhbw %%mm7, %%mm4                \n\t"
294
        "paddusw %%mm6, %%mm0                \n\t"
295
        "paddusw %%mm0, %%mm1                \n\t"
296
        "paddusw %%mm2, %%mm4                \n\t"
297
        "paddusw %%mm1, %%mm4                \n\t"
298
        "psrlw        $2, %%mm4                \n\t"
299
        "packuswb  %%mm4, %%mm3                \n\t"
300
        "movq        %%mm3, (%0)                \n\t"
301
        "addl        %4, %0                        \n\t"
302
        "addl        %4, %1                        \n\t"
303
        "addl        $8, %2                        \n\t" 
304
        "decl        %3                        \n\t"
305
        "jnz        1b                        \n\t"
306
        :"+r"(dst), "+r"(src1), "+r"(src2), "+r"(h)
307
        :"r"(stride)
308
        :"memory");
309
}
310

    
311
// Four-source "put" for a 16-byte-wide block. Each output byte is
// (src1[x] + src2[x] + src2[x + 256] + src2[x + 528] + mm6) >> 2, computed in
// 16-bit words (bytes are widened against mm7) and packed back to bytes.
// The row is processed as two 8-byte halves, each split into low/high 4 bytes.
// MOVQ_ZERO and SET_RND are defined outside this chunk; presumably mm7 is
// zeroed and mm6 holds the rounding word described in the comment below.
// NOTE(review): the fixed offsets 256 and 528 presumably select three
// temporary planes laid out back to back in src2 - confirm against the caller.
//   dst    - destination, advanced by stride after every row
//   src1   - first source, advanced by stride after every row
//   src2   - packed source, advanced by 16 bytes after every row
//   stride - row stride in bytes shared by dst and src1
//   h      - row count; one row per pass, the loop exits only at exactly
//            zero, so h must be at least 1
// 32-bit x86 only: pointers are advanced with addl.
static void DEF(put, pixels16_l4)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int stride, int h)
312
{
313
    MOVQ_ZERO(mm7);
314
    SET_RND(mm6); // =2 for rnd  and  =1 for no_rnd version
315
    __asm __volatile(
316
        ".balign 8                      \n\t"
317
        "1:                                \n\t"
// left half, low 4 bytes -> mm3
318
        "movq        (%1), %%mm0                \n\t"
319
        "movq        (%2), %%mm1                \n\t"
320
        "movq        256(%2), %%mm2                \n\t"
321
        "movq        528(%2), %%mm3                \n\t"
322
        "punpcklbw %%mm7, %%mm0                \n\t"
323
        "punpcklbw %%mm7, %%mm1                \n\t"
324
        "punpcklbw %%mm7, %%mm2                \n\t"
325
        "punpcklbw %%mm7, %%mm3                \n\t"
326
        "paddusw %%mm6, %%mm0                \n\t"
327
        "paddusw %%mm0, %%mm1                \n\t"
328
        "paddusw %%mm2, %%mm3                \n\t"
329
        "paddusw %%mm1, %%mm3                \n\t"
330
        "psrlw        $2, %%mm3                \n\t"
// left half, high 4 bytes -> mm4
331
        "movq        (%1), %%mm0                \n\t"
332
        "movq        (%2), %%mm1                \n\t"
333
        "movq        256(%2), %%mm2                \n\t"
334
        "movq        528(%2), %%mm4                \n\t"
335
        "punpckhbw %%mm7, %%mm0                \n\t"
336
        "punpckhbw %%mm7, %%mm1                \n\t"
337
        "punpckhbw %%mm7, %%mm2                \n\t"
338
        "punpckhbw %%mm7, %%mm4                \n\t"
339
        "paddusw %%mm6, %%mm0                \n\t"
340
        "paddusw %%mm0, %%mm1                \n\t"
341
        "paddusw %%mm2, %%mm4                \n\t"
342
        "paddusw %%mm1, %%mm4                \n\t"
343
        "psrlw        $2, %%mm4                \n\t"
344
        "packuswb  %%mm4, %%mm3                \n\t"
345
        "movq        %%mm3, (%0)                \n\t"
// right half, low 4 bytes -> mm3 (every offset is 8 larger)
346
        "movq        8(%1), %%mm0                \n\t"
347
        "movq        8(%2), %%mm1                \n\t"
348
        "movq        264(%2), %%mm2                \n\t"
349
        "movq        536(%2), %%mm3                \n\t"
350
        "punpcklbw %%mm7, %%mm0                \n\t"
351
        "punpcklbw %%mm7, %%mm1                \n\t"
352
        "punpcklbw %%mm7, %%mm2                \n\t"
353
        "punpcklbw %%mm7, %%mm3                \n\t"
354
        "paddusw %%mm6, %%mm0                \n\t"
355
        "paddusw %%mm0, %%mm1                \n\t"
356
        "paddusw %%mm2, %%mm3                \n\t"
357
        "paddusw %%mm1, %%mm3                \n\t"
358
        "psrlw        $2, %%mm3                \n\t"
// right half, high 4 bytes -> mm4
359
        "movq        8(%1), %%mm0                \n\t"
360
        "movq        8(%2), %%mm1                \n\t"
361
        "movq        264(%2), %%mm2                \n\t"
362
        "movq        536(%2), %%mm4                \n\t"
363
        "punpckhbw %%mm7, %%mm0                \n\t"
364
        "punpckhbw %%mm7, %%mm1                \n\t"
365
        "punpckhbw %%mm7, %%mm2                \n\t"
366
        "punpckhbw %%mm7, %%mm4                \n\t"
367
        "paddusw %%mm6, %%mm0                \n\t"
368
        "paddusw %%mm0, %%mm1                \n\t"
369
        "paddusw %%mm2, %%mm4                \n\t"
370
        "paddusw %%mm1, %%mm4                \n\t"
371
        "psrlw        $2, %%mm4                \n\t"
372
        "packuswb  %%mm4, %%mm3                \n\t"
373
        "movq        %%mm3, 8(%0)                \n\t"
374
        "addl        %4, %0                        \n\t"
375
        "addl        %4, %1                        \n\t"
376
        "addl        $16, %2                        \n\t" 
377
        "decl        %3                        \n\t"
378
        "jnz        1b                        \n\t"
379
        :"+r"(dst), "+r"(src1), "+r"(src2), "+r"(h)
380
        :"r"(stride)
381
        :"memory");
382
}
383

    
384
// avg_pixels
385
// in case more speed is needed - unrolling would certainly help
386
// "avg" for an 8-byte-wide block: every row of block is replaced by PAVGB of
// the existing block row and the matching pixels row (PAVGB, MOVQ_BFE and
// JUMPALIGN are defined outside this chunk; presumably a packed byte average
// using the mm6 constant - confirm).
//   block     - destination, read and rewritten 8 bytes per row
//   pixels    - source, 8 bytes read per row
//   line_size - row stride in bytes, used for both pointers
//   h         - row count; the do/while body always runs once and stops when
//               --h reaches zero, so h must be at least 1
static void DEF(avg, pixels8)(UINT8 *block, const UINT8 *pixels, int line_size, int h)
387
{
388
    MOVQ_BFE(mm6);
389
    JUMPALIGN();
390
    do {
391
        __asm __volatile(
// the operands name single bytes, but movq moves 8 bytes starting there
392
             "movq  %0, %%mm0                \n\t"
393
             "movq  %1, %%mm1                \n\t"
394
             PAVGB(%%mm0, %%mm1, %%mm2, %%mm6)
395
             "movq  %%mm2, %0                \n\t"
396
             :"+m"(*block)
397
             :"m"(*pixels)
398
             :"memory");
399
        pixels += line_size;
400
        block += line_size;
401
    }
402
    while (--h);
403
}
404

    
405
// "avg" for a 16-byte-wide block: every row of block is replaced by PAVGB of
// the existing block row and the matching pixels row, done as two 8-byte
// halves (PAVGB, MOVQ_BFE and JUMPALIGN are defined outside this chunk;
// presumably a packed byte average using the mm6 constant - confirm).
//   block     - destination, read and rewritten 16 bytes per row
//   pixels    - source, 16 bytes read per row
//   line_size - row stride in bytes, used for both pointers
//   h         - row count; the do/while body always runs once and stops when
//               --h reaches zero, so h must be at least 1
// NOTE(review): "8%0" / "8%1" paste a literal 8 in front of the memory
// operand text to address the second half; this relies on the compiler
// printing the operand without a displacement of its own - confirm this holds
// for the compilers in use.
static void DEF(avg, pixels16)(UINT8 *block, const UINT8 *pixels, int line_size, int h)
406
{
407
    MOVQ_BFE(mm6);
408
    JUMPALIGN();
409
    do {
410
        __asm __volatile(
// left 8 bytes
411
             "movq  %0, %%mm0                \n\t"
412
             "movq  %1, %%mm1                \n\t"
413
             PAVGB(%%mm0, %%mm1, %%mm2, %%mm6)
414
             "movq  %%mm2, %0                \n\t"
// right 8 bytes
415
             "movq  8%0, %%mm0                \n\t"
416
             "movq  8%1, %%mm1                \n\t"
417
             PAVGB(%%mm0, %%mm1, %%mm2, %%mm6)
418
             "movq  %%mm2, 8%0                \n\t"
419
             :"+m"(*block)
420
             :"m"(*pixels)
421
             :"memory");
422
        pixels += line_size;
423
        block += line_size;
424
    }
425
    while (--h);
426
}
427

    
428
// Horizontal two-tap "avg" for an 8-byte-wide block: PAVGB first combines the
// bytes at pixels and pixels+1, then a second PAVGB combines that result with
// the existing block row, which is overwritten (PAVGB, MOVQ_BFE and JUMPALIGN
// are defined outside this chunk; presumably a packed byte average using the
// mm6 constant - confirm).
//   block     - destination, read and rewritten 8 bytes per row
//   pixels    - source, 9 bytes read per row (offsets 0..8)
//   line_size - row stride in bytes, used for both pointers
//   h         - row count; the do/while body always runs once and stops when
//               --h reaches zero, so h must be at least 1
// NOTE(review): "1%1" pastes a literal 1 in front of the memory operand text;
// this relies on the operand being printed without its own displacement.
static void DEF(avg, pixels8_x2)(UINT8 *block, const UINT8 *pixels, int line_size, int h)
429
{
430
    MOVQ_BFE(mm6);
431
    JUMPALIGN();
432
    do {
433
        __asm __volatile(
434
            "movq  %1, %%mm0                \n\t"
435
            "movq  1%1, %%mm1                \n\t"
436
            "movq  %0, %%mm3                \n\t"
// mm2 = combination of the two source taps, then mm0 = mm2 combined with dst
437
            PAVGB(%%mm0, %%mm1, %%mm2, %%mm6)
438
            PAVGB(%%mm3, %%mm2, %%mm0, %%mm6)
439
            "movq  %%mm0, %0                \n\t"
440
            :"+m"(*block)
441
            :"m"(*pixels)
442
            :"memory");
443
        pixels += line_size;
444
        block += line_size;
445
    } while (--h);
446
}
447

    
448
// Two-source "avg" for an 8-byte-wide block: PAVGB first combines a src1 row
// with a src2 row, then a second PAVGB combines that result with the existing
// dst row, which is overwritten (PAVGB, MOVQ_BFE and JUMPALIGN are defined
// outside this chunk; presumably a packed byte average using the mm6
// constant - confirm).
//   dst        - destination, read and rewritten 8 bytes per row
//   src1       - first source, advanced by src1Stride per row
//   src2       - second source, advanced by 8 bytes per row
//   dstStride  - destination row stride in bytes
//   src1Stride - src1 row stride in bytes
//   h          - row count; the do/while body always runs once and stops when
//                --h reaches zero, so h must be at least 1
static void DEF(avg, pixels8_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h)
449
{
450
    MOVQ_BFE(mm6);
451
    JUMPALIGN();
452
    do {
453
        __asm __volatile(
454
            "movq  %1, %%mm0                \n\t"
455
            "movq  %2, %%mm1                \n\t"
456
            "movq  %0, %%mm3                \n\t"
// mm2 = src1 combined with src2, then mm0 = mm2 combined with dst
457
            PAVGB(%%mm0, %%mm1, %%mm2, %%mm6)
458
            PAVGB(%%mm3, %%mm2, %%mm0, %%mm6)
459
            "movq  %%mm0, %0                \n\t"
460
            :"+m"(*dst)
461
            :"m"(*src1), "m"(*src2)
462
            :"memory");
463
        dst += dstStride;
464
        src1 += src1Stride;
465
        src2 += 8;
466
    } while (--h);
467
}
468

    
469
// Horizontal two-tap "avg" for a 16-byte-wide block, done as two 8-byte
// halves: PAVGB first combines the bytes at offset k and k+1 of the pixels
// row, then a second PAVGB combines that result with the existing block row,
// which is overwritten (PAVGB, MOVQ_BFE and JUMPALIGN are defined outside
// this chunk; presumably a packed byte average using the mm6 constant -
// confirm).
//   block     - destination, read and rewritten 16 bytes per row
//   pixels    - source, 17 bytes read per row (offsets 0..16)
//   line_size - row stride in bytes, used for both pointers
//   h         - row count; the do/while body always runs once and stops when
//               --h reaches zero, so h must be at least 1
// NOTE(review): "1%1", "8%1", "9%1" and "8%0" paste a literal displacement in
// front of the memory operand text; this relies on the operand being printed
// without its own displacement.
static void DEF(avg, pixels16_x2)(UINT8 *block, const UINT8 *pixels, int line_size, int h)
470
{
471
    MOVQ_BFE(mm6);
472
    JUMPALIGN();
473
    do {
474
        __asm __volatile(
// left 8 bytes
475
            "movq  %1, %%mm0                \n\t"
476
            "movq  1%1, %%mm1                \n\t"
477
            "movq  %0, %%mm3                \n\t"
478
            PAVGB(%%mm0, %%mm1, %%mm2, %%mm6)
479
            PAVGB(%%mm3, %%mm2, %%mm0, %%mm6)
480
            "movq  %%mm0, %0                \n\t"
// right 8 bytes
481
            "movq  8%1, %%mm0                \n\t"
482
            "movq  9%1, %%mm1                \n\t"
483
            "movq  8%0, %%mm3                \n\t"
484
            PAVGB(%%mm0, %%mm1, %%mm2, %%mm6)
485
            PAVGB(%%mm3, %%mm2, %%mm0, %%mm6)
486
            "movq  %%mm0, 8%0                \n\t"
487
            :"+m"(*block)
488
            :"m"(*pixels)
489
            :"memory");
490
        pixels += line_size;
491
        block += line_size;
492
    } while (--h);
493
}
494

    
495
// Two-source "avg" for a 16-byte-wide block, done as two 8-byte halves:
// PAVGB first combines a src1 row with a src2 row, then a second PAVGB
// combines that result with the existing dst row, which is overwritten
// (PAVGB, MOVQ_BFE and JUMPALIGN are defined outside this chunk; presumably a
// packed byte average using the mm6 constant - confirm).
//   dst        - destination, read and rewritten 16 bytes per row
//   src1       - first source, advanced by src1Stride per row
//   src2       - second source, advanced by 16 bytes per row
//   dstStride  - destination row stride in bytes
//   src1Stride - src1 row stride in bytes
//   h          - row count; the do/while body always runs once and stops when
//                --h reaches zero, so h must be at least 1
// NOTE(review): "8%0", "8%1" and "8%2" paste a literal 8 in front of the
// memory operand text; this relies on the operand being printed without its
// own displacement.
static void DEF(avg, pixels16_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h)
496
{
497
    MOVQ_BFE(mm6);
498
    JUMPALIGN();
499
    do {
500
        __asm __volatile(
// left 8 bytes
501
            "movq  %1, %%mm0                \n\t"
502
            "movq  %2, %%mm1                \n\t"
503
            "movq  %0, %%mm3                \n\t"
504
            PAVGB(%%mm0, %%mm1, %%mm2, %%mm6)
505
            PAVGB(%%mm3, %%mm2, %%mm0, %%mm6)
506
            "movq  %%mm0, %0                \n\t"
// right 8 bytes
507
            "movq  8%1, %%mm0                \n\t"
508
            "movq  8%2, %%mm1                \n\t"
509
            "movq  8%0, %%mm3                \n\t"
510
            PAVGB(%%mm0, %%mm1, %%mm2, %%mm6)
511
            PAVGB(%%mm3, %%mm2, %%mm0, %%mm6)
512
            "movq  %%mm0, 8%0                \n\t"
513
            :"+m"(*dst)
514
            :"m"(*src1), "m"(*src2)
515
            :"memory");
516
        dst += dstStride;
517
        src1 += src1Stride;
518
        src2 += 16;
519
    } while (--h);
520
}
521

    
522
// Vertical two-tap "avg" for an 8-byte-wide block: PAVGBP combines each
// source row with the row below it, then PAVGB combines that result with the
// existing block row, which is overwritten (PAVGB, PAVGBP and MOVQ_BFE are
// defined outside this chunk; presumably packed byte averages using the mm6
// constant - confirm).
//   block     - destination, read and rewritten 8 bytes per row
//   pixels    - source; h + 1 rows of 8 bytes are read
//   line_size - row stride in bytes, used for both source and destination
//   h         - row count; 4 rows per pass and the loop exits only at exactly
//               zero, so h must be a positive multiple of 4
// 32-bit x86 only: pointers are advanced with addl and eax is clobbered.
static void DEF(avg, pixels8_y2)(UINT8 *block, const UINT8 *pixels, int line_size, int h)
523
{
524
    MOVQ_BFE(mm6);
525
    __asm __volatile(
// eax = 2 * line_size; mm0 is primed with the first source row
526
        "lea        (%3, %3), %%eax                \n\t"
527
        "movq        (%1), %%mm0                \n\t"
528
        ".balign 8                        \n\t"
529
        "1:                                \n\t"
// mm0 = row r, mm1 = row r+1, mm2 = row r+2 -> vertical results in mm4/mm5
530
        "movq        (%1, %3), %%mm1                \n\t"
531
        "movq        (%1, %%eax), %%mm2        \n\t"
532
        PAVGBP(%%mm1, %%mm0, %%mm4,   %%mm2, %%mm1, %%mm5)
// blend with the two destination rows; mm0/mm1 are free to reuse as outputs
533
        "movq        (%2), %%mm3                \n\t"
534
        PAVGB(%%mm3, %%mm4, %%mm0, %%mm6)
535
        "movq        (%2, %3), %%mm3                \n\t"
536
        PAVGB(%%mm3, %%mm5, %%mm1, %%mm6)
537
        "movq        %%mm0, (%2)                \n\t"
538
        "movq        %%mm1, (%2, %3)                \n\t"
539
        "addl        %%eax, %1                \n\t"
540
        "addl        %%eax, %2                \n\t"
541

    
// second half of the pass: mm2 holds the carried row, the newest row goes to
// mm0 so it is already in place for the next pass; mm2 is reused as an output
542
        "movq        (%1, %3), %%mm1                \n\t"
543
        "movq        (%1, %%eax), %%mm0        \n\t"
544
        PAVGBP(%%mm1, %%mm2, %%mm4,   %%mm0, %%mm1, %%mm5)
545
        "movq        (%2), %%mm3                \n\t"
546
        PAVGB(%%mm3, %%mm4, %%mm2, %%mm6)
547
        "movq        (%2, %3), %%mm3                \n\t"
548
        PAVGB(%%mm3, %%mm5, %%mm1, %%mm6)
549
        "movq        %%mm2, (%2)                \n\t"
550
        "movq        %%mm1, (%2, %3)                \n\t"
551
        "addl        %%eax, %1                \n\t"
552
        "addl        %%eax, %2                \n\t"
553

    
554
        "subl        $4, %0                        \n\t"
555
        "jnz        1b                        \n\t"
556
        :"+g"(h), "+S"(pixels), "+D"(block)
557
        :"r"(line_size)
558
        :"eax", "memory");
559
}
560

    
561
// this routine is 'slightly' suboptimal but mostly unused
562
// Four-tap (2x2 neighbourhood) "avg" for an 8-byte-wide block.
// Bytes are widened to 16-bit words against mm7; for every row the sum
// "row[x] + row[x+1]" is formed, and each intermediate row is
// (sum of this row + sum of the next row + mm6) >> 2, packed back to bytes.
// That intermediate row is then combined with the existing block row through
// PAVGB and the result overwrites the block row.
// MOVQ_ZERO, SET_RND and PAVGB are defined outside this chunk; presumably mm7
// is zeroed, mm6 holds the rounding word described in the comment below and
// PAVGB is a packed byte average - confirm.
//   block     - destination, read and rewritten 8 bytes per row (not
//               advanced; eax is used as the running row offset)
//   pixels    - source; h + 1 rows of 9 bytes are read
//   line_size - row stride in bytes, used for both source and destination
//   h         - row count; 2 rows per pass and the loop exits only at exactly
//               zero, so h must be a positive even number
// 32-bit x86 only: pointers are advanced with addl and eax is clobbered.
static void DEF(avg, pixels8_xy2)(UINT8 *block, const UINT8 *pixels, int line_size, int h)
563
{
564
    MOVQ_ZERO(mm7);
565
    SET_RND(mm6); // =2 for rnd  and  =1 for no_rnd version
566
    __asm __volatile(
// prologue: horizontal pair sums of the first row -> mm4 (low 4) / mm5 (high 4)
567
        "movq        (%1), %%mm0                \n\t"
568
        "movq        1(%1), %%mm4                \n\t"
569
        "movq        %%mm0, %%mm1                \n\t"
570
        "movq        %%mm4, %%mm5                \n\t"
571
        "punpcklbw %%mm7, %%mm0                \n\t"
572
        "punpcklbw %%mm7, %%mm4                \n\t"
573
        "punpckhbw %%mm7, %%mm1                \n\t"
574
        "punpckhbw %%mm7, %%mm5                \n\t"
575
        "paddusw %%mm0, %%mm4                \n\t"
576
        "paddusw %%mm1, %%mm5                \n\t"
// eax = running row offset; pixels is moved down one row so that
// (%1, eax) addresses the row below the one being written at (%2, eax)
577
        "xorl        %%eax, %%eax                \n\t"
578
        "addl        %3, %1                        \n\t"
579
        ".balign 8                        \n\t"
580
        "1:                                \n\t"
// next row's pair sums -> mm0/mm1; combined with the previous sums in mm4/mm5
581
        "movq        (%1, %%eax), %%mm0        \n\t"
582
        "movq        1(%1, %%eax), %%mm2        \n\t"
583
        "movq        %%mm0, %%mm1                \n\t"
584
        "movq        %%mm2, %%mm3                \n\t"
585
        "punpcklbw %%mm7, %%mm0                \n\t"
586
        "punpcklbw %%mm7, %%mm2                \n\t"
587
        "punpckhbw %%mm7, %%mm1                \n\t"
588
        "punpckhbw %%mm7, %%mm3                \n\t"
589
        "paddusw %%mm2, %%mm0                 \n\t"
590
        "paddusw %%mm3, %%mm1                \n\t"
591
        "paddusw %%mm6, %%mm4                \n\t"
592
        "paddusw %%mm6, %%mm5                \n\t"
593
        "paddusw %%mm0, %%mm4                \n\t"
594
        "paddusw %%mm1, %%mm5                \n\t"
595
        "psrlw        $2, %%mm4                \n\t"
596
        "psrlw        $2, %%mm5                \n\t"
597
                "movq        (%2, %%eax), %%mm3        \n\t"
598
        "packuswb  %%mm5, %%mm4                \n\t"
// mm6 holds the rounding word here, so PAVGB's last argument is rebuilt in
// mm2: all ones, then doubled bytewise, which leaves 0xFE in every byte
599
                "pcmpeqd %%mm2, %%mm2        \n\t"
600
                "paddb %%mm2, %%mm2        \n\t"
601
                PAVGB(%%mm3, %%mm4, %%mm5, %%mm2)
602
                "movq        %%mm5, (%2, %%eax)        \n\t"
603
        "addl        %3, %%eax                \n\t"
604

    
// second row of the pass: same steps with the register roles exchanged, so
// the sums left in mm4/mm5 feed the next pass
605
        "movq        (%1, %%eax), %%mm2        \n\t" // 0 <-> 2   1 <-> 3
606
        "movq        1(%1, %%eax), %%mm4        \n\t"
607
        "movq        %%mm2, %%mm3                \n\t"
608
        "movq        %%mm4, %%mm5                \n\t"
609
        "punpcklbw %%mm7, %%mm2                \n\t"
610
        "punpcklbw %%mm7, %%mm4                \n\t"
611
        "punpckhbw %%mm7, %%mm3                \n\t"
612
        "punpckhbw %%mm7, %%mm5                \n\t"
613
        "paddusw %%mm2, %%mm4                 \n\t"
614
        "paddusw %%mm3, %%mm5                \n\t"
615
        "paddusw %%mm6, %%mm0                \n\t"
616
        "paddusw %%mm6, %%mm1                \n\t"
617
        "paddusw %%mm4, %%mm0                \n\t"
618
        "paddusw %%mm5, %%mm1                \n\t"
619
        "psrlw        $2, %%mm0                \n\t"
620
        "psrlw        $2, %%mm1                \n\t"
621
                "movq        (%2, %%eax), %%mm3        \n\t"
622
        "packuswb  %%mm1, %%mm0                \n\t"
// same 0xFE-per-byte constant rebuilt in mm2 for the second PAVGB
623
                "pcmpeqd %%mm2, %%mm2        \n\t"
624
                "paddb %%mm2, %%mm2        \n\t"
625
                PAVGB(%%mm3, %%mm0, %%mm1, %%mm2)
626
                "movq        %%mm1, (%2, %%eax)        \n\t"
627
        "addl        %3, %%eax                \n\t"
628

    
629
        "subl        $2, %0                        \n\t"
630
        "jnz        1b                        \n\t"
631
        :"+g"(h), "+S"(pixels)
632
        :"D"(block), "r"(line_size)
633
        :"eax", "memory");
634
}
635

    
636
static void DEF(avg, pixels8_l4)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int stride, int h)
637
{
638
    MOVQ_ZERO(mm7);
639
    SET_RND(mm6); // =2 for rnd  and  =1 for no_rnd version
640
    MOVQ_BFE(mm5);
641
    __asm __volatile(
642
        ".balign 8                      \n\t"
643
        "1:                                \n\t"
644
        "movq        (%1), %%mm0                \n\t"
645
        "movq        (%2), %%mm1                \n\t"
646
        "movq        64(%2), %%mm2                \n\t"
647
        "movq        136(%2), %%mm3                \n\t"
648
        "punpcklbw %%mm7, %%mm0                \n\t"
649
        "punpcklbw %%mm7, %%mm1                \n\t"
650
        "punpcklbw %%mm7, %%mm2                \n\t"
651
        "punpcklbw %%mm7, %%mm3                \n\t"
652
        "paddusw %%mm6, %%mm0                \n\t"
653
        "paddusw %%mm0, %%mm1                \n\t"
654
        "paddusw %%mm2, %%mm3                \n\t"
655
        "paddusw %%mm1, %%mm3                \n\t"
656
        "psrlw        $2, %%mm3                \n\t"
657
        "movq        (%1), %%mm0                \n\t"
658
        "movq        (%2), %%mm1                \n\t"
659
        "movq        64(%2), %%mm2                \n\t"
660
        "movq        136(%2), %%mm4                \n\t"
661
        "punpckhbw %%mm7, %%mm0                \n\t"
662
        "punpckhbw %%mm7, %%mm1                \n\t"
663
        "punpckhbw %%mm7, %%mm2                \n\t"
664
        "punpckhbw %%mm7, %%mm4                \n\t"
665
        "paddusw %%mm6, %%mm0                \n\t"
666
        "paddusw %%mm0, %%mm1                \n\t"
667
        "paddusw %%mm2, %%mm4                \n\t"
668
        "paddusw %%mm1, %%mm4                \n\t"
669
        "psrlw        $2, %%mm4                \n\t"
670
        "packuswb  %%mm4, %%mm3                \n\t"
671
        "movq        (%0), %%mm4                \n\t"
672
        PAVGB(%%mm3, %%mm4, %%mm0, %%mm5)
673
        "movq        %%mm0, (%0)                \n\t"
674
        "addl        %4, %0                        \n\t"
675
        "addl        %4, %1                        \n\t"
676
        "addl        $8, %2                        \n\t" 
677
        "decl        %3                        \n\t"
678
        "jnz        1b                        \n\t"
679
        :"+r"(dst), "+r"(src1), "+r"(src2), "+r"(h)
680
        :"r"(stride)
681
        :"memory");
682
}
683

    
684
static void DEF(avg, pixels16_l4)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int stride, int h)
685
{
686
    MOVQ_ZERO(mm7);
687
    SET_RND(mm6); // =2 for rnd  and  =1 for no_rnd version
688
    MOVQ_BFE(mm5);
689
    __asm __volatile(
690
        ".balign 8                      \n\t"
691
        "1:                                \n\t"
692
        "movq        (%1), %%mm0                \n\t"
693
        "movq        (%2), %%mm1                \n\t"
694
        "movq        256(%2), %%mm2                \n\t"
695
        "movq        528(%2), %%mm3                \n\t"
696
        "punpcklbw %%mm7, %%mm0                \n\t"
697
        "punpcklbw %%mm7, %%mm1                \n\t"
698
        "punpcklbw %%mm7, %%mm2                \n\t"
699
        "punpcklbw %%mm7, %%mm3                \n\t"
700
        "paddusw %%mm6, %%mm0                \n\t"
701
        "paddusw %%mm0, %%mm1                \n\t"
702
        "paddusw %%mm2, %%mm3                \n\t"
703
        "paddusw %%mm1, %%mm3                \n\t"
704
        "psrlw        $2, %%mm3                \n\t"
705
        "movq        (%1), %%mm0                \n\t"
706
        "movq        (%2), %%mm1                \n\t"
707
        "movq        256(%2), %%mm2                \n\t"
708
        "movq        528(%2), %%mm4                \n\t"
709
        "punpckhbw %%mm7, %%mm0                \n\t"
710
        "punpckhbw %%mm7, %%mm1                \n\t"
711
        "punpckhbw %%mm7, %%mm2                \n\t"
712
        "punpckhbw %%mm7, %%mm4                \n\t"
713
        "paddusw %%mm6, %%mm0                \n\t"
714
        "paddusw %%mm0, %%mm1                \n\t"
715
        "paddusw %%mm2, %%mm4                \n\t"
716
        "paddusw %%mm1, %%mm4                \n\t"
717
        "psrlw        $2, %%mm4                \n\t"
718
        "packuswb  %%mm4, %%mm3                \n\t"
719
        "movq        (%0), %%mm4                \n\t"
720
        PAVGB(%%mm3, %%mm4, %%mm0, %%mm5)
721
        "movq        %%mm0, (%0)                \n\t"
722
        "movq        8(%1), %%mm0                \n\t"
723
        "movq        8(%2), %%mm1                \n\t"
724
        "movq        264(%2), %%mm2                \n\t"
725
        "movq        536(%2), %%mm3                \n\t"
726
        "punpcklbw %%mm7, %%mm0                \n\t"
727
        "punpcklbw %%mm7, %%mm1                \n\t"
728
        "punpcklbw %%mm7, %%mm2                \n\t"
729
        "punpcklbw %%mm7, %%mm3                \n\t"
730
        "paddusw %%mm6, %%mm0                \n\t"
731
        "paddusw %%mm0, %%mm1                \n\t"
732
        "paddusw %%mm2, %%mm3                \n\t"
733
        "paddusw %%mm1, %%mm3                \n\t"
734
        "psrlw        $2, %%mm3                \n\t"
735
        "movq        8(%1), %%mm0                \n\t"
736
        "movq        8(%2), %%mm1                \n\t"
737
        "movq        264(%2), %%mm2                \n\t"
738
        "movq        536(%2), %%mm4                \n\t"
739
        "punpckhbw %%mm7, %%mm0                \n\t"
740
        "punpckhbw %%mm7, %%mm1                \n\t"
741
        "punpckhbw %%mm7, %%mm2                \n\t"
742
        "punpckhbw %%mm7, %%mm4                \n\t"
743
        "paddusw %%mm6, %%mm0                \n\t"
744
        "paddusw %%mm0, %%mm1                \n\t"
745
        "paddusw %%mm2, %%mm4                \n\t"
746
        "paddusw %%mm1, %%mm4                \n\t"
747
        "psrlw        $2, %%mm4                \n\t"
748
        "packuswb  %%mm4, %%mm3                \n\t"
749
        "movq        8(%0), %%mm4                \n\t"
750
        PAVGB(%%mm3, %%mm4, %%mm0, %%mm5)
751
        "movq        %%mm0, 8(%0)                \n\t"
752
        "addl        %4, %0                        \n\t"
753
        "addl        %4, %1                        \n\t"
754
        "addl        $16, %2                        \n\t" 
755
        "decl        %3                        \n\t"
756
        "jnz        1b                        \n\t"
757
        :"+r"(dst), "+r"(src1), "+r"(src2), "+r"(h)
758
        :"r"(stride)
759
        :"memory");
760
}
761

    
762

    
763
//FIXME optimize
764
/*
 * 16-pixel-wide put/y2: delegates to the 8-pixel-wide routine once for the
 * left half (offset 0) and once for the right half (offset 8), forwarding
 * line_size and h unchanged.
 */
static void DEF(put, pixels16_y2)(UINT8 *block, const UINT8 *pixels, int line_size, int h)
{
    int col;

    for (col = 0; col < 16; col += 8) {
        DEF(put, pixels8_y2)(block + col, pixels + col, line_size, h);
    }
}
768

    
769
/*
 * 16-pixel-wide put/xy2: delegates to the 8-pixel-wide routine once for the
 * left half (offset 0) and once for the right half (offset 8), forwarding
 * line_size and h unchanged.
 */
static void DEF(put, pixels16_xy2)(UINT8 *block, const UINT8 *pixels, int line_size, int h)
{
    int col;

    for (col = 0; col < 16; col += 8) {
        DEF(put, pixels8_xy2)(block + col, pixels + col, line_size, h);
    }
}
773

    
774
/*
 * 16-pixel-wide avg/y2: delegates to the 8-pixel-wide routine once for the
 * left half (offset 0) and once for the right half (offset 8), forwarding
 * line_size and h unchanged.
 */
static void DEF(avg, pixels16_y2)(UINT8 *block, const UINT8 *pixels, int line_size, int h)
{
    int col;

    for (col = 0; col < 16; col += 8) {
        DEF(avg, pixels8_y2)(block + col, pixels + col, line_size, h);
    }
}
778

    
779
/*
 * 16-pixel-wide avg/xy2: delegates to the 8-pixel-wide routine once for the
 * left half (offset 0) and once for the right half (offset 8), forwarding
 * line_size and h unchanged.
 */
static void DEF(avg, pixels16_xy2)(UINT8 *block, const UINT8 *pixels, int line_size, int h)
{
    int col;

    for (col = 0; col < 16; col += 8) {
        DEF(avg, pixels8_xy2)(block + col, pixels + col, line_size, h);
    }
}
783

    
784