/*
 * dsputil_vis.c
 * Copyright (C) 2003 David S. Miller <davem@redhat.com>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

/* The *no_round* functions have been added by James A. Morrison, 2003,2004.
   The vis code from libmpeg2 was adapted for ffmpeg by James A. Morrison.
 */

#include "config.h"

#include <inttypes.h>

#include "libavcodec/dsputil.h"

#include "vis.h"

extern void ff_simple_idct_put_vis(uint8_t *dest, int line_size, DCTELEM *data);
extern void ff_simple_idct_add_vis(uint8_t *dest, int line_size, DCTELEM *data);
extern void ff_simple_idct_vis(DCTELEM *data);

/* The trick used in some of this file is the formula from the MMX
 * motion comp code, which is:
 *
 * (x+y+1)>>1 == (x|y)-((x^y)>>1)
 *
 * This allows us to average 8 bytes at a time in a 64-bit FPU reg.
 * We avoid overflows by masking before we do the shift, and we
 * implement the shift by multiplying by 1/2 using mul8x16.  So in
 * VIS this is (assume 'x' is in f0, 'y' is in f2, a repeating mask
 * of '0xfe' is in f4, a repeating mask of '0x7f' is in f6, and
 * the value 0x80808080 is in f8):
 *
 *      fxor            f0,   f2, f10
 *      fand            f10,  f4, f10
 *      fmul8x16        f8,  f10, f10
 *      fand            f10,  f6, f10
 *      for             f0,   f2, f12
 *      fpsub16         f12, f10, f10
 */
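
/*
 * A scalar sanity check of the identity above (illustrative sketch, not part
 * of the original source; the helper name is hypothetical and the block is
 * left uncompiled).  It mirrors, one byte at a time, what the
 * fxor/fand/fmul8x16/for/fpsub16 sequence computes on eight bytes at once:
 * the 0xfe mask clears the low bit before the halving, which in the packed
 * case keeps a bit from leaking into the neighbouring byte.
 */
#if 0
static uint8_t rnd_avg_scalar(uint8_t x, uint8_t y)
{
        /* (x + y + 1) >> 1, computed without widening past 8 bits */
        return (x | y) - (((x ^ y) & 0xfe) >> 1);
}
#endif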

#define ATTR_ALIGN(alignd) __attribute__ ((aligned(alignd)))

#define DUP4(x) {x, x, x, x}
#define DUP8(x) {x, x, x, x, x, x, x, x}
static const int16_t constants1[] ATTR_ALIGN(8) = DUP4 (1);
static const int16_t constants2[] ATTR_ALIGN(8) = DUP4 (2);
static const int16_t constants3[] ATTR_ALIGN(8) = DUP4 (3);
static const int16_t constants6[] ATTR_ALIGN(8) = DUP4 (6);
static const int8_t constants_fe[] ATTR_ALIGN(8) = DUP8 (0xfe);
static const int8_t constants_7f[] ATTR_ALIGN(8) = DUP8 (0x7f);
static const int8_t constants128[] ATTR_ALIGN(8) = DUP8 (128);
static const int16_t constants256_512[] ATTR_ALIGN(8) =
        {256, 512, 256, 512};
static const int16_t constants256_1024[] ATTR_ALIGN(8) =
        {256, 1024, 256, 1024};

#define REF_0           0
#define REF_0_1         1
#define REF_2           2
#define REF_2_1         3
#define REF_4           4
#define REF_4_1         5
#define REF_6           6
#define REF_6_1         7
#define REF_S0          8
#define REF_S0_1        9
#define REF_S2          10
#define REF_S2_1        11
#define REF_S4          12
#define REF_S4_1        13
#define REF_S6          14
#define REF_S6_1        15
#define DST_0           16
#define DST_1           17
#define DST_2           18
#define DST_3           19
#define CONST_1         20
#define CONST_2         20
#define CONST_3         20
#define CONST_6         20
#define MASK_fe         20
#define CONST_128       22
#define CONST_256       22
#define CONST_512       22
#define CONST_1024      22
#define TMP0            24
#define TMP1            25
#define TMP2            26
#define TMP3            27
#define TMP4            28
#define TMP5            29
#define ZERO            30
#define MASK_7f         30

#define TMP6            32
#define TMP8            34
#define TMP10           36
#define TMP12           38
#define TMP14           40
#define TMP16           42
#define TMP18           44
#define TMP20           46
#define TMP22           48
#define TMP24           50
#define TMP26           52
#define TMP28           54
#define TMP30           56
#define TMP32           58

static void MC_put_o_16_vis (uint8_t * dest, const uint8_t * _ref,
                             const int stride, int height)
{
        uint8_t *ref = (uint8_t *) _ref;

        ref = vis_alignaddr(ref);
        do {    /* 5 cycles */
                vis_ld64(ref[0], TMP0);

                vis_ld64_2(ref, 8, TMP2);

                vis_ld64_2(ref, 16, TMP4);
                ref += stride;

                vis_faligndata(TMP0, TMP2, REF_0);
                vis_st64(REF_0, dest[0]);

                vis_faligndata(TMP2, TMP4, REF_2);
                vis_st64_2(REF_2, dest, 8);
                dest += stride;
        } while (--height);
}

static void MC_put_o_8_vis (uint8_t * dest, const uint8_t * _ref,
                            const int stride, int height)
{
        uint8_t *ref = (uint8_t *) _ref;

        ref = vis_alignaddr(ref);
        do {    /* 4 cycles */
                vis_ld64(ref[0], TMP0);

                vis_ld64(ref[8], TMP2);
                ref += stride;

                /* stall */

                vis_faligndata(TMP0, TMP2, REF_0);
                vis_st64(REF_0, dest[0]);
                dest += stride;
        } while (--height);
}


static void MC_avg_o_16_vis (uint8_t * dest, const uint8_t * _ref,
                             const int stride, int height)
{
        uint8_t *ref = (uint8_t *) _ref;
        int stride_8 = stride + 8;

        ref = vis_alignaddr(ref);

        vis_ld64(ref[0], TMP0);

        vis_ld64(ref[8], TMP2);

        vis_ld64(ref[16], TMP4);

        vis_ld64(dest[0], DST_0);

        vis_ld64(dest[8], DST_2);

        vis_ld64(constants_fe[0], MASK_fe);
        vis_faligndata(TMP0, TMP2, REF_0);

        vis_ld64(constants_7f[0], MASK_7f);
        vis_faligndata(TMP2, TMP4, REF_2);

        vis_ld64(constants128[0], CONST_128);

        ref += stride;
        height = (height >> 1) - 1;

        do {    /* 24 cycles */
                vis_ld64(ref[0], TMP0);
                vis_xor(DST_0, REF_0, TMP6);

                vis_ld64_2(ref, 8, TMP2);
                vis_and(TMP6, MASK_fe, TMP6);

                vis_ld64_2(ref, 16, TMP4);
                ref += stride;
                vis_mul8x16(CONST_128, TMP6, TMP6);
                vis_xor(DST_2, REF_2, TMP8);

                vis_and(TMP8, MASK_fe, TMP8);

                vis_or(DST_0, REF_0, TMP10);
                vis_ld64_2(dest, stride, DST_0);
                vis_mul8x16(CONST_128, TMP8, TMP8);

                vis_or(DST_2, REF_2, TMP12);
                vis_ld64_2(dest, stride_8, DST_2);

                vis_ld64(ref[0], TMP14);
                vis_and(TMP6, MASK_7f, TMP6);

                vis_and(TMP8, MASK_7f, TMP8);

                vis_psub16(TMP10, TMP6, TMP6);
                vis_st64(TMP6, dest[0]);

                vis_psub16(TMP12, TMP8, TMP8);
                vis_st64_2(TMP8, dest, 8);

                dest += stride;
                vis_ld64_2(ref, 8, TMP16);
                vis_faligndata(TMP0, TMP2, REF_0);

                vis_ld64_2(ref, 16, TMP18);
                vis_faligndata(TMP2, TMP4, REF_2);
                ref += stride;

                vis_xor(DST_0, REF_0, TMP20);

                vis_and(TMP20, MASK_fe, TMP20);

                vis_xor(DST_2, REF_2, TMP22);
                vis_mul8x16(CONST_128, TMP20, TMP20);

                vis_and(TMP22, MASK_fe, TMP22);

                vis_or(DST_0, REF_0, TMP24);
                vis_mul8x16(CONST_128, TMP22, TMP22);

                vis_or(DST_2, REF_2, TMP26);

                vis_ld64_2(dest, stride, DST_0);
                vis_faligndata(TMP14, TMP16, REF_0);

                vis_ld64_2(dest, stride_8, DST_2);
                vis_faligndata(TMP16, TMP18, REF_2);

                vis_and(TMP20, MASK_7f, TMP20);

                vis_and(TMP22, MASK_7f, TMP22);

                vis_psub16(TMP24, TMP20, TMP20);
                vis_st64(TMP20, dest[0]);

                vis_psub16(TMP26, TMP22, TMP22);
                vis_st64_2(TMP22, dest, 8);
                dest += stride;
        } while (--height);

        vis_ld64(ref[0], TMP0);
        vis_xor(DST_0, REF_0, TMP6);

        vis_ld64_2(ref, 8, TMP2);
        vis_and(TMP6, MASK_fe, TMP6);

        vis_ld64_2(ref, 16, TMP4);
        vis_mul8x16(CONST_128, TMP6, TMP6);
        vis_xor(DST_2, REF_2, TMP8);

        vis_and(TMP8, MASK_fe, TMP8);

        vis_or(DST_0, REF_0, TMP10);
        vis_ld64_2(dest, stride, DST_0);
        vis_mul8x16(CONST_128, TMP8, TMP8);

        vis_or(DST_2, REF_2, TMP12);
        vis_ld64_2(dest, stride_8, DST_2);

        vis_ld64(ref[0], TMP14);
        vis_and(TMP6, MASK_7f, TMP6);

        vis_and(TMP8, MASK_7f, TMP8);

        vis_psub16(TMP10, TMP6, TMP6);
        vis_st64(TMP6, dest[0]);

        vis_psub16(TMP12, TMP8, TMP8);
        vis_st64_2(TMP8, dest, 8);

        dest += stride;
        vis_faligndata(TMP0, TMP2, REF_0);

        vis_faligndata(TMP2, TMP4, REF_2);

        vis_xor(DST_0, REF_0, TMP20);

        vis_and(TMP20, MASK_fe, TMP20);

        vis_xor(DST_2, REF_2, TMP22);
        vis_mul8x16(CONST_128, TMP20, TMP20);

        vis_and(TMP22, MASK_fe, TMP22);

        vis_or(DST_0, REF_0, TMP24);
        vis_mul8x16(CONST_128, TMP22, TMP22);

        vis_or(DST_2, REF_2, TMP26);

        vis_and(TMP20, MASK_7f, TMP20);

        vis_and(TMP22, MASK_7f, TMP22);

        vis_psub16(TMP24, TMP20, TMP20);
        vis_st64(TMP20, dest[0]);

        vis_psub16(TMP26, TMP22, TMP22);
        vis_st64_2(TMP22, dest, 8);
}

static void MC_avg_o_8_vis (uint8_t * dest, const uint8_t * _ref,
                            const int stride, int height)
{
        uint8_t *ref = (uint8_t *) _ref;

        ref = vis_alignaddr(ref);

        vis_ld64(ref[0], TMP0);

        vis_ld64(ref[8], TMP2);

        vis_ld64(dest[0], DST_0);

        vis_ld64(constants_fe[0], MASK_fe);

        vis_ld64(constants_7f[0], MASK_7f);
        vis_faligndata(TMP0, TMP2, REF_0);

        vis_ld64(constants128[0], CONST_128);

        ref += stride;
        height = (height >> 1) - 1;

        do {    /* 12 cycles */
                vis_ld64(ref[0], TMP0);
                vis_xor(DST_0, REF_0, TMP4);

                vis_ld64(ref[8], TMP2);
                vis_and(TMP4, MASK_fe, TMP4);

                vis_or(DST_0, REF_0, TMP6);
                vis_ld64_2(dest, stride, DST_0);
                ref += stride;
                vis_mul8x16(CONST_128, TMP4, TMP4);

                vis_ld64(ref[0], TMP12);
                vis_faligndata(TMP0, TMP2, REF_0);

                vis_ld64(ref[8], TMP2);
                vis_xor(DST_0, REF_0, TMP0);
                ref += stride;

                vis_and(TMP0, MASK_fe, TMP0);

                vis_and(TMP4, MASK_7f, TMP4);

                vis_psub16(TMP6, TMP4, TMP4);
                vis_st64(TMP4, dest[0]);
                dest += stride;
                vis_mul8x16(CONST_128, TMP0, TMP0);

                vis_or(DST_0, REF_0, TMP6);
                vis_ld64_2(dest, stride, DST_0);

                vis_faligndata(TMP12, TMP2, REF_0);

                vis_and(TMP0, MASK_7f, TMP0);

                vis_psub16(TMP6, TMP0, TMP4);
                vis_st64(TMP4, dest[0]);
                dest += stride;
        } while (--height);

        vis_ld64(ref[0], TMP0);
        vis_xor(DST_0, REF_0, TMP4);

        vis_ld64(ref[8], TMP2);
        vis_and(TMP4, MASK_fe, TMP4);

        vis_or(DST_0, REF_0, TMP6);
        vis_ld64_2(dest, stride, DST_0);
        vis_mul8x16(CONST_128, TMP4, TMP4);

        vis_faligndata(TMP0, TMP2, REF_0);

        vis_xor(DST_0, REF_0, TMP0);

        vis_and(TMP0, MASK_fe, TMP0);

        vis_and(TMP4, MASK_7f, TMP4);

        vis_psub16(TMP6, TMP4, TMP4);
        vis_st64(TMP4, dest[0]);
        dest += stride;
        vis_mul8x16(CONST_128, TMP0, TMP0);

        vis_or(DST_0, REF_0, TMP6);

        vis_and(TMP0, MASK_7f, TMP0);

        vis_psub16(TMP6, TMP0, TMP4);
        vis_st64(TMP4, dest[0]);
}

static void MC_put_x_16_vis (uint8_t * dest, const uint8_t * _ref,
                             const int stride, int height)
{
        uint8_t *ref = (uint8_t *) _ref;
        unsigned long off = (unsigned long) ref & 0x7;
        unsigned long off_plus_1 = off + 1;

        ref = vis_alignaddr(ref);

        vis_ld64(ref[0],    TMP0);

        vis_ld64_2(ref, 8,  TMP2);

        vis_ld64_2(ref, 16, TMP4);

        vis_ld64(constants_fe[0], MASK_fe);

        vis_ld64(constants_7f[0], MASK_7f);
        vis_faligndata(TMP0, TMP2, REF_0);

        vis_ld64(constants128[0], CONST_128);
        vis_faligndata(TMP2, TMP4, REF_4);

        if (off != 0x7) {
                vis_alignaddr_g0((void *)off_plus_1);
                vis_faligndata(TMP0, TMP2, REF_2);
                vis_faligndata(TMP2, TMP4, REF_6);
        } else {
                vis_src1(TMP2, REF_2);
                vis_src1(TMP4, REF_6);
        }

        ref += stride;
        height = (height >> 1) - 1;

        do {    /* 34 cycles */
                vis_ld64(ref[0],    TMP0);
                vis_xor(REF_0, REF_2, TMP6);

                vis_ld64_2(ref, 8,  TMP2);
                vis_xor(REF_4, REF_6, TMP8);

                vis_ld64_2(ref, 16, TMP4);
                vis_and(TMP6, MASK_fe, TMP6);
                ref += stride;

                vis_ld64(ref[0],    TMP14);
                vis_mul8x16(CONST_128, TMP6, TMP6);
                vis_and(TMP8, MASK_fe, TMP8);

                vis_ld64_2(ref, 8,  TMP16);
                vis_mul8x16(CONST_128, TMP8, TMP8);
                vis_or(REF_0, REF_2, TMP10);

                vis_ld64_2(ref, 16, TMP18);
                ref += stride;
                vis_or(REF_4, REF_6, TMP12);

                vis_alignaddr_g0((void *)off);

                vis_faligndata(TMP0, TMP2, REF_0);

                vis_faligndata(TMP2, TMP4, REF_4);

                if (off != 0x7) {
                        vis_alignaddr_g0((void *)off_plus_1);
                        vis_faligndata(TMP0, TMP2, REF_2);
                        vis_faligndata(TMP2, TMP4, REF_6);
                } else {
                        vis_src1(TMP2, REF_2);
                        vis_src1(TMP4, REF_6);
                }

                vis_and(TMP6, MASK_7f, TMP6);

                vis_and(TMP8, MASK_7f, TMP8);

                vis_psub16(TMP10, TMP6, TMP6);
                vis_st64(TMP6, dest[0]);

                vis_psub16(TMP12, TMP8, TMP8);
                vis_st64_2(TMP8, dest, 8);
                dest += stride;

                vis_xor(REF_0, REF_2, TMP6);

                vis_xor(REF_4, REF_6, TMP8);

                vis_and(TMP6, MASK_fe, TMP6);

                vis_mul8x16(CONST_128, TMP6, TMP6);
                vis_and(TMP8, MASK_fe, TMP8);

                vis_mul8x16(CONST_128, TMP8, TMP8);
                vis_or(REF_0, REF_2, TMP10);

                vis_or(REF_4, REF_6, TMP12);

                vis_alignaddr_g0((void *)off);

                vis_faligndata(TMP14, TMP16, REF_0);

                vis_faligndata(TMP16, TMP18, REF_4);

                if (off != 0x7) {
                        vis_alignaddr_g0((void *)off_plus_1);
                        vis_faligndata(TMP14, TMP16, REF_2);
                        vis_faligndata(TMP16, TMP18, REF_6);
                } else {
                        vis_src1(TMP16, REF_2);
                        vis_src1(TMP18, REF_6);
                }

                vis_and(TMP6, MASK_7f, TMP6);

                vis_and(TMP8, MASK_7f, TMP8);

                vis_psub16(TMP10, TMP6, TMP6);
                vis_st64(TMP6, dest[0]);

                vis_psub16(TMP12, TMP8, TMP8);
                vis_st64_2(TMP8, dest, 8);
                dest += stride;
        } while (--height);

        vis_ld64(ref[0],    TMP0);
        vis_xor(REF_0, REF_2, TMP6);

        vis_ld64_2(ref, 8,  TMP2);
        vis_xor(REF_4, REF_6, TMP8);

        vis_ld64_2(ref, 16, TMP4);
        vis_and(TMP6, MASK_fe, TMP6);

        vis_mul8x16(CONST_128, TMP6, TMP6);
        vis_and(TMP8, MASK_fe, TMP8);

        vis_mul8x16(CONST_128, TMP8, TMP8);
        vis_or(REF_0, REF_2, TMP10);

        vis_or(REF_4, REF_6, TMP12);

        vis_alignaddr_g0((void *)off);

        vis_faligndata(TMP0, TMP2, REF_0);

        vis_faligndata(TMP2, TMP4, REF_4);

        if (off != 0x7) {
                vis_alignaddr_g0((void *)off_plus_1);
                vis_faligndata(TMP0, TMP2, REF_2);
                vis_faligndata(TMP2, TMP4, REF_6);
        } else {
                vis_src1(TMP2, REF_2);
                vis_src1(TMP4, REF_6);
        }

        vis_and(TMP6, MASK_7f, TMP6);

        vis_and(TMP8, MASK_7f, TMP8);

        vis_psub16(TMP10, TMP6, TMP6);
        vis_st64(TMP6, dest[0]);

        vis_psub16(TMP12, TMP8, TMP8);
        vis_st64_2(TMP8, dest, 8);
        dest += stride;

        vis_xor(REF_0, REF_2, TMP6);

        vis_xor(REF_4, REF_6, TMP8);

        vis_and(TMP6, MASK_fe, TMP6);

        vis_mul8x16(CONST_128, TMP6, TMP6);
        vis_and(TMP8, MASK_fe, TMP8);

        vis_mul8x16(CONST_128, TMP8, TMP8);
        vis_or(REF_0, REF_2, TMP10);

        vis_or(REF_4, REF_6, TMP12);

        vis_and(TMP6, MASK_7f, TMP6);

        vis_and(TMP8, MASK_7f, TMP8);

        vis_psub16(TMP10, TMP6, TMP6);
        vis_st64(TMP6, dest[0]);

        vis_psub16(TMP12, TMP8, TMP8);
        vis_st64_2(TMP8, dest, 8);
}

static void MC_put_x_8_vis (uint8_t * dest, const uint8_t * _ref,
                            const int stride, int height)
{
        uint8_t *ref = (uint8_t *) _ref;
        unsigned long off = (unsigned long) ref & 0x7;
        unsigned long off_plus_1 = off + 1;

        ref = vis_alignaddr(ref);

        vis_ld64(ref[0], TMP0);

        vis_ld64(ref[8], TMP2);

        vis_ld64(constants_fe[0], MASK_fe);

        vis_ld64(constants_7f[0], MASK_7f);

        vis_ld64(constants128[0], CONST_128);
        vis_faligndata(TMP0, TMP2, REF_0);

        if (off != 0x7) {
                vis_alignaddr_g0((void *)off_plus_1);
                vis_faligndata(TMP0, TMP2, REF_2);
        } else {
                vis_src1(TMP2, REF_2);
        }

        ref += stride;
        height = (height >> 1) - 1;

        do {    /* 20 cycles */
                vis_ld64(ref[0], TMP0);
                vis_xor(REF_0, REF_2, TMP4);

                vis_ld64_2(ref, 8, TMP2);
                vis_and(TMP4, MASK_fe, TMP4);
                ref += stride;

                vis_ld64(ref[0], TMP8);
                vis_or(REF_0, REF_2, TMP6);
                vis_mul8x16(CONST_128, TMP4, TMP4);

                vis_alignaddr_g0((void *)off);

                vis_ld64_2(ref, 8, TMP10);
                ref += stride;
                vis_faligndata(TMP0, TMP2, REF_0);

                if (off != 0x7) {
                        vis_alignaddr_g0((void *)off_plus_1);
                        vis_faligndata(TMP0, TMP2, REF_2);
                } else {
                        vis_src1(TMP2, REF_2);
                }

                vis_and(TMP4, MASK_7f, TMP4);

                vis_psub16(TMP6, TMP4, DST_0);
                vis_st64(DST_0, dest[0]);
                dest += stride;

                vis_xor(REF_0, REF_2, TMP12);

                vis_and(TMP12, MASK_fe, TMP12);

                vis_or(REF_0, REF_2, TMP14);
                vis_mul8x16(CONST_128, TMP12, TMP12);

                vis_alignaddr_g0((void *)off);
                vis_faligndata(TMP8, TMP10, REF_0);
                if (off != 0x7) {
                        vis_alignaddr_g0((void *)off_plus_1);
                        vis_faligndata(TMP8, TMP10, REF_2);
                } else {
                        vis_src1(TMP10, REF_2);
                }

                vis_and(TMP12, MASK_7f, TMP12);

                vis_psub16(TMP14, TMP12, DST_0);
                vis_st64(DST_0, dest[0]);
                dest += stride;
        } while (--height);

        vis_ld64(ref[0], TMP0);
        vis_xor(REF_0, REF_2, TMP4);

        vis_ld64_2(ref, 8, TMP2);
        vis_and(TMP4, MASK_fe, TMP4);

        vis_or(REF_0, REF_2, TMP6);
        vis_mul8x16(CONST_128, TMP4, TMP4);

        vis_alignaddr_g0((void *)off);

        vis_faligndata(TMP0, TMP2, REF_0);

        if (off != 0x7) {
                vis_alignaddr_g0((void *)off_plus_1);
                vis_faligndata(TMP0, TMP2, REF_2);
        } else {
                vis_src1(TMP2, REF_2);
        }

        vis_and(TMP4, MASK_7f, TMP4);

        vis_psub16(TMP6, TMP4, DST_0);
        vis_st64(DST_0, dest[0]);
        dest += stride;

        vis_xor(REF_0, REF_2, TMP12);

        vis_and(TMP12, MASK_fe, TMP12);

        vis_or(REF_0, REF_2, TMP14);
        vis_mul8x16(CONST_128, TMP12, TMP12);

        vis_and(TMP12, MASK_7f, TMP12);

        vis_psub16(TMP14, TMP12, DST_0);
        vis_st64(DST_0, dest[0]);
        dest += stride;
}

static void MC_avg_x_16_vis (uint8_t * dest, const uint8_t * _ref,
                             const int stride, int height)
{
        uint8_t *ref = (uint8_t *) _ref;
        unsigned long off = (unsigned long) ref & 0x7;
        unsigned long off_plus_1 = off + 1;

        vis_set_gsr(5 << VIS_GSR_SCALEFACT_SHIFT);

        vis_ld64(constants3[0], CONST_3);
        vis_fzero(ZERO);
        vis_ld64(constants256_512[0], CONST_256);

        ref = vis_alignaddr(ref);
        do {    /* 26 cycles */
                vis_ld64(ref[0], TMP0);

                vis_ld64(ref[8], TMP2);

                vis_alignaddr_g0((void *)off);

                vis_ld64(ref[16], TMP4);

                vis_ld64(dest[0], DST_0);
                vis_faligndata(TMP0, TMP2, REF_0);

                vis_ld64(dest[8], DST_2);
                vis_faligndata(TMP2, TMP4, REF_4);

                if (off != 0x7) {
                        vis_alignaddr_g0((void *)off_plus_1);
                        vis_faligndata(TMP0, TMP2, REF_2);
                        vis_faligndata(TMP2, TMP4, REF_6);
                } else {
                        vis_src1(TMP2, REF_2);
                        vis_src1(TMP4, REF_6);
                }

                vis_mul8x16au(REF_0,   CONST_256, TMP0);

                vis_pmerge(ZERO,     REF_2,     TMP4);
                vis_mul8x16au(REF_0_1, CONST_256, TMP2);

                vis_pmerge(ZERO, REF_2_1, TMP6);

                vis_padd16(TMP0, TMP4, TMP0);

                vis_mul8x16al(DST_0,   CONST_512, TMP4);
                vis_padd16(TMP2, TMP6, TMP2);

                vis_mul8x16al(DST_1,   CONST_512, TMP6);

                vis_mul8x16au(REF_6,   CONST_256, TMP12);

                vis_padd16(TMP0, TMP4, TMP0);
                vis_mul8x16au(REF_6_1, CONST_256, TMP14);

                vis_padd16(TMP2, TMP6, TMP2);
                vis_mul8x16au(REF_4,   CONST_256, TMP16);

                vis_padd16(TMP0, CONST_3, TMP8);
                vis_mul8x16au(REF_4_1, CONST_256, TMP18);

                vis_padd16(TMP2, CONST_3, TMP10);
                vis_pack16(TMP8, DST_0);

                vis_pack16(TMP10, DST_1);
                vis_padd16(TMP16, TMP12, TMP0);

                vis_st64(DST_0, dest[0]);
                vis_mul8x16al(DST_2,   CONST_512, TMP4);
                vis_padd16(TMP18, TMP14, TMP2);

                vis_mul8x16al(DST_3,   CONST_512, TMP6);
                vis_padd16(TMP0, CONST_3, TMP0);

                vis_padd16(TMP2, CONST_3, TMP2);

                vis_padd16(TMP0, TMP4, TMP0);

                vis_padd16(TMP2, TMP6, TMP2);
                vis_pack16(TMP0, DST_2);

                vis_pack16(TMP2, DST_3);
                vis_st64(DST_2, dest[8]);

                ref += stride;
                dest += stride;
        } while (--height);
}

static void MC_avg_x_8_vis (uint8_t * dest, const uint8_t * _ref,
                            const int stride, int height)
{
        uint8_t *ref = (uint8_t *) _ref;
        unsigned long off = (unsigned long) ref & 0x7;
        unsigned long off_plus_1 = off + 1;
        int stride_times_2 = stride << 1;

        vis_set_gsr(5 << VIS_GSR_SCALEFACT_SHIFT);

        vis_ld64(constants3[0], CONST_3);
        vis_fzero(ZERO);
        vis_ld64(constants256_512[0], CONST_256);

        ref = vis_alignaddr(ref);
        height >>= 2;
        do {    /* 47 cycles */
                vis_ld64(ref[0],   TMP0);

                vis_ld64_2(ref, 8, TMP2);
                ref += stride;

                vis_alignaddr_g0((void *)off);

                vis_ld64(ref[0],   TMP4);
                vis_faligndata(TMP0, TMP2, REF_0);

                vis_ld64_2(ref, 8, TMP6);
                ref += stride;

                vis_ld64(ref[0],   TMP8);

                vis_ld64_2(ref, 8, TMP10);
                ref += stride;
                vis_faligndata(TMP4, TMP6, REF_4);

                vis_ld64(ref[0],   TMP12);

                vis_ld64_2(ref, 8, TMP14);
                ref += stride;
                vis_faligndata(TMP8, TMP10, REF_S0);

                vis_faligndata(TMP12, TMP14, REF_S4);

                if (off != 0x7) {
                        vis_alignaddr_g0((void *)off_plus_1);

                        vis_ld64(dest[0], DST_0);
                        vis_faligndata(TMP0, TMP2, REF_2);

                        vis_ld64_2(dest, stride, DST_2);
                        vis_faligndata(TMP4, TMP6, REF_6);

                        vis_faligndata(TMP8, TMP10, REF_S2);

                        vis_faligndata(TMP12, TMP14, REF_S6);
                } else {
                        vis_ld64(dest[0], DST_0);
                        vis_src1(TMP2, REF_2);

                        vis_ld64_2(dest, stride, DST_2);
                        vis_src1(TMP6, REF_6);

                        vis_src1(TMP10, REF_S2);

                        vis_src1(TMP14, REF_S6);
                }

                vis_pmerge(ZERO,     REF_0,     TMP0);
                vis_mul8x16au(REF_0_1, CONST_256, TMP2);

                vis_pmerge(ZERO,     REF_2,     TMP4);
                vis_mul8x16au(REF_2_1, CONST_256, TMP6);

                vis_padd16(TMP0, CONST_3, TMP0);
                vis_mul8x16al(DST_0,   CONST_512, TMP16);

                vis_padd16(TMP2, CONST_3, TMP2);
                vis_mul8x16al(DST_1,   CONST_512, TMP18);

                vis_padd16(TMP0, TMP4, TMP0);
                vis_mul8x16au(REF_4, CONST_256, TMP8);

                vis_padd16(TMP2, TMP6, TMP2);
                vis_mul8x16au(REF_4_1, CONST_256, TMP10);

                vis_padd16(TMP0, TMP16, TMP0);
                vis_mul8x16au(REF_6, CONST_256, TMP12);

                vis_padd16(TMP2, TMP18, TMP2);
                vis_mul8x16au(REF_6_1, CONST_256, TMP14);

                vis_padd16(TMP8, CONST_3, TMP8);
                vis_mul8x16al(DST_2, CONST_512, TMP16);

                vis_padd16(TMP8, TMP12, TMP8);
                vis_mul8x16al(DST_3, CONST_512, TMP18);

                vis_padd16(TMP10, TMP14, TMP10);
                vis_pack16(TMP0, DST_0);

                vis_pack16(TMP2, DST_1);
                vis_st64(DST_0, dest[0]);
                dest += stride;
                vis_padd16(TMP10, CONST_3, TMP10);

                vis_ld64_2(dest, stride, DST_0);
                vis_padd16(TMP8, TMP16, TMP8);

                vis_ld64_2(dest, stride_times_2, TMP4/*DST_2*/);
                vis_padd16(TMP10, TMP18, TMP10);
                vis_pack16(TMP8, DST_2);

                vis_pack16(TMP10, DST_3);
                vis_st64(DST_2, dest[0]);
                dest += stride;

                vis_mul8x16au(REF_S0_1, CONST_256, TMP2);
                vis_pmerge(ZERO,     REF_S0,     TMP0);

                vis_pmerge(ZERO,     REF_S2,     TMP24);
                vis_mul8x16au(REF_S2_1, CONST_256, TMP6);

                vis_padd16(TMP0, CONST_3, TMP0);
                vis_mul8x16au(REF_S4, CONST_256, TMP8);

                vis_padd16(TMP2, CONST_3, TMP2);
                vis_mul8x16au(REF_S4_1, CONST_256, TMP10);

                vis_padd16(TMP0, TMP24, TMP0);
                vis_mul8x16au(REF_S6, CONST_256, TMP12);

                vis_padd16(TMP2, TMP6, TMP2);
                vis_mul8x16au(REF_S6_1, CONST_256, TMP14);

                vis_padd16(TMP8, CONST_3, TMP8);
                vis_mul8x16al(DST_0,   CONST_512, TMP16);

                vis_padd16(TMP10, CONST_3, TMP10);
                vis_mul8x16al(DST_1,   CONST_512, TMP18);

                vis_padd16(TMP8, TMP12, TMP8);
                vis_mul8x16al(TMP4/*DST_2*/, CONST_512, TMP20);

                vis_mul8x16al(TMP5/*DST_3*/, CONST_512, TMP22);
                vis_padd16(TMP0, TMP16, TMP0);

                vis_padd16(TMP2, TMP18, TMP2);
                vis_pack16(TMP0, DST_0);

                vis_padd16(TMP10, TMP14, TMP10);
                vis_pack16(TMP2, DST_1);
                vis_st64(DST_0, dest[0]);
                dest += stride;

                vis_padd16(TMP8, TMP20, TMP8);

                vis_padd16(TMP10, TMP22, TMP10);
                vis_pack16(TMP8, DST_2);

                vis_pack16(TMP10, DST_3);
                vis_st64(DST_2, dest[0]);
                dest += stride;
        } while (--height);
}

static void MC_put_y_16_vis (uint8_t * dest, const uint8_t * _ref,
                             const int stride, int height)
{
        uint8_t *ref = (uint8_t *) _ref;

        ref = vis_alignaddr(ref);
        vis_ld64(ref[0], TMP0);

        vis_ld64_2(ref, 8, TMP2);

        vis_ld64_2(ref, 16, TMP4);
        ref += stride;

        vis_ld64(ref[0], TMP6);
        vis_faligndata(TMP0, TMP2, REF_0);

        vis_ld64_2(ref, 8, TMP8);
        vis_faligndata(TMP2, TMP4, REF_4);

        vis_ld64_2(ref, 16, TMP10);
        ref += stride;

        vis_ld64(constants_fe[0], MASK_fe);
        vis_faligndata(TMP6, TMP8, REF_2);

        vis_ld64(constants_7f[0], MASK_7f);
        vis_faligndata(TMP8, TMP10, REF_6);

        vis_ld64(constants128[0], CONST_128);
        height = (height >> 1) - 1;
        do {    /* 24 cycles */
                vis_ld64(ref[0], TMP0);
                vis_xor(REF_0, REF_2, TMP12);

                vis_ld64_2(ref, 8, TMP2);
                vis_xor(REF_4, REF_6, TMP16);

                vis_ld64_2(ref, 16, TMP4);
                ref += stride;
                vis_or(REF_0, REF_2, TMP14);

                vis_ld64(ref[0], TMP6);
                vis_or(REF_4, REF_6, TMP18);

                vis_ld64_2(ref, 8, TMP8);
                vis_faligndata(TMP0, TMP2, REF_0);

                vis_ld64_2(ref, 16, TMP10);
                ref += stride;
                vis_faligndata(TMP2, TMP4, REF_4);

                vis_and(TMP12, MASK_fe, TMP12);

                vis_and(TMP16, MASK_fe, TMP16);
                vis_mul8x16(CONST_128, TMP12, TMP12);

                vis_mul8x16(CONST_128, TMP16, TMP16);
                vis_xor(REF_0, REF_2, TMP0);

                vis_xor(REF_4, REF_6, TMP2);

                vis_or(REF_0, REF_2, TMP20);

                vis_and(TMP12, MASK_7f, TMP12);

                vis_and(TMP16, MASK_7f, TMP16);

                vis_psub16(TMP14, TMP12, TMP12);
                vis_st64(TMP12, dest[0]);

                vis_psub16(TMP18, TMP16, TMP16);
                vis_st64_2(TMP16, dest, 8);
                dest += stride;

                vis_or(REF_4, REF_6, TMP18);

                vis_and(TMP0, MASK_fe, TMP0);

                vis_and(TMP2, MASK_fe, TMP2);
                vis_mul8x16(CONST_128, TMP0, TMP0);

                vis_faligndata(TMP6, TMP8, REF_2);
                vis_mul8x16(CONST_128, TMP2, TMP2);

                vis_faligndata(TMP8, TMP10, REF_6);

                vis_and(TMP0, MASK_7f, TMP0);

                vis_and(TMP2, MASK_7f, TMP2);

                vis_psub16(TMP20, TMP0, TMP0);
                vis_st64(TMP0, dest[0]);

                vis_psub16(TMP18, TMP2, TMP2);
                vis_st64_2(TMP2, dest, 8);
                dest += stride;
        } while (--height);

        vis_ld64(ref[0], TMP0);
        vis_xor(REF_0, REF_2, TMP12);

        vis_ld64_2(ref, 8, TMP2);
        vis_xor(REF_4, REF_6, TMP16);

        vis_ld64_2(ref, 16, TMP4);
        vis_or(REF_0, REF_2, TMP14);

        vis_or(REF_4, REF_6, TMP18);

        vis_faligndata(TMP0, TMP2, REF_0);

        vis_faligndata(TMP2, TMP4, REF_4);

        vis_and(TMP12, MASK_fe, TMP12);

        vis_and(TMP16, MASK_fe, TMP16);
        vis_mul8x16(CONST_128, TMP12, TMP12);

        vis_mul8x16(CONST_128, TMP16, TMP16);
        vis_xor(REF_0, REF_2, TMP0);

        vis_xor(REF_4, REF_6, TMP2);

        vis_or(REF_0, REF_2, TMP20);

        vis_and(TMP12, MASK_7f, TMP12);

        vis_and(TMP16, MASK_7f, TMP16);

        vis_psub16(TMP14, TMP12, TMP12);
        vis_st64(TMP12, dest[0]);

        vis_psub16(TMP18, TMP16, TMP16);
        vis_st64_2(TMP16, dest, 8);
        dest += stride;

        vis_or(REF_4, REF_6, TMP18);

        vis_and(TMP0, MASK_fe, TMP0);

        vis_and(TMP2, MASK_fe, TMP2);
        vis_mul8x16(CONST_128, TMP0, TMP0);

        vis_mul8x16(CONST_128, TMP2, TMP2);

        vis_and(TMP0, MASK_7f, TMP0);

        vis_and(TMP2, MASK_7f, TMP2);

        vis_psub16(TMP20, TMP0, TMP0);
        vis_st64(TMP0, dest[0]);

        vis_psub16(TMP18, TMP2, TMP2);
        vis_st64_2(TMP2, dest, 8);
}

static void MC_put_y_8_vis (uint8_t * dest, const uint8_t * _ref,
                            const int stride, int height)
{
        uint8_t *ref = (uint8_t *) _ref;

        ref = vis_alignaddr(ref);
        vis_ld64(ref[0], TMP0);

        vis_ld64_2(ref, 8, TMP2);
        ref += stride;

        vis_ld64(ref[0], TMP4);

        vis_ld64_2(ref, 8, TMP6);
        ref += stride;

        vis_ld64(constants_fe[0], MASK_fe);
        vis_faligndata(TMP0, TMP2, REF_0);

        vis_ld64(constants_7f[0], MASK_7f);
        vis_faligndata(TMP4, TMP6, REF_2);

        vis_ld64(constants128[0], CONST_128);
        height = (height >> 1) - 1;
        do {    /* 12 cycles */
                vis_ld64(ref[0], TMP0);
                vis_xor(REF_0, REF_2, TMP4);

                vis_ld64_2(ref, 8, TMP2);
                ref += stride;
                vis_and(TMP4, MASK_fe, TMP4);

                vis_or(REF_0, REF_2, TMP6);
                vis_mul8x16(CONST_128, TMP4, TMP4);

                vis_faligndata(TMP0, TMP2, REF_0);
                vis_ld64(ref[0], TMP0);

                vis_ld64_2(ref, 8, TMP2);
                ref += stride;
                vis_xor(REF_0, REF_2, TMP12);

                vis_and(TMP4, MASK_7f, TMP4);

                vis_and(TMP12, MASK_fe, TMP12);

                vis_mul8x16(CONST_128, TMP12, TMP12);
                vis_or(REF_0, REF_2, TMP14);

                vis_psub16(TMP6, TMP4, DST_0);
                vis_st64(DST_0, dest[0]);
                dest += stride;

                vis_faligndata(TMP0, TMP2, REF_2);

                vis_and(TMP12, MASK_7f, TMP12);

                vis_psub16(TMP14, TMP12, DST_0);
                vis_st64(DST_0, dest[0]);
                dest += stride;
        } while (--height);

        vis_ld64(ref[0], TMP0);
        vis_xor(REF_0, REF_2, TMP4);

        vis_ld64_2(ref, 8, TMP2);
        vis_and(TMP4, MASK_fe, TMP4);

        vis_or(REF_0, REF_2, TMP6);
        vis_mul8x16(CONST_128, TMP4, TMP4);

        vis_faligndata(TMP0, TMP2, REF_0);

        vis_xor(REF_0, REF_2, TMP12);

        vis_and(TMP4, MASK_7f, TMP4);

        vis_and(TMP12, MASK_fe, TMP12);

        vis_mul8x16(CONST_128, TMP12, TMP12);
        vis_or(REF_0, REF_2, TMP14);

        vis_psub16(TMP6, TMP4, DST_0);
        vis_st64(DST_0, dest[0]);
1240
        dest += stride;
1241 44f54ceb Michael Niedermayer
1242 bb270c08 Diego Biurrun
        vis_and(TMP12, MASK_7f, TMP12);
1243 44f54ceb Michael Niedermayer
1244 bb270c08 Diego Biurrun
        vis_psub16(TMP14, TMP12, DST_0);
1245
        vis_st64(DST_0, dest[0]);
1246 44f54ceb Michael Niedermayer
}

static void MC_avg_y_16_vis (uint8_t * dest, const uint8_t * _ref,
                             const int stride, int height)
{
        uint8_t *ref = (uint8_t *) _ref;
        int stride_8 = stride + 8;
        int stride_16 = stride + 16;

        vis_set_gsr(5 << VIS_GSR_SCALEFACT_SHIFT);

        ref = vis_alignaddr(ref);

        vis_ld64(ref[ 0], TMP0);
        vis_fzero(ZERO);

        vis_ld64(ref[ 8], TMP2);

        vis_ld64(ref[16], TMP4);

        vis_ld64(constants3[0], CONST_3);
        vis_faligndata(TMP0, TMP2, REF_2);

        vis_ld64(constants256_512[0], CONST_256);
        vis_faligndata(TMP2, TMP4, REF_6);
        height >>= 1;

        do {    /* 31 cycles */
                vis_ld64_2(ref, stride, TMP0);
                vis_pmerge(ZERO,       REF_2,     TMP12);
                vis_mul8x16au(REF_2_1, CONST_256, TMP14);

                vis_ld64_2(ref, stride_8, TMP2);
                vis_pmerge(ZERO,       REF_6,     TMP16);
                vis_mul8x16au(REF_6_1, CONST_256, TMP18);

                vis_ld64_2(ref, stride_16, TMP4);
                ref += stride;

                vis_ld64(dest[0], DST_0);
                vis_faligndata(TMP0, TMP2, REF_0);

                vis_ld64_2(dest, 8, DST_2);
                vis_faligndata(TMP2, TMP4, REF_4);

                vis_ld64_2(ref, stride, TMP6);
                vis_pmerge(ZERO,     REF_0,     TMP0);
                vis_mul8x16au(REF_0_1, CONST_256, TMP2);

                vis_ld64_2(ref, stride_8, TMP8);
                vis_pmerge(ZERO,     REF_4,     TMP4);

                vis_ld64_2(ref, stride_16, TMP10);
                ref += stride;

                vis_ld64_2(dest, stride, REF_S0/*DST_4*/);
                vis_faligndata(TMP6, TMP8, REF_2);
                vis_mul8x16au(REF_4_1, CONST_256, TMP6);

                vis_ld64_2(dest, stride_8, REF_S2/*DST_6*/);
                vis_faligndata(TMP8, TMP10, REF_6);
                vis_mul8x16al(DST_0,   CONST_512, TMP20);

                vis_padd16(TMP0, CONST_3, TMP0);
                vis_mul8x16al(DST_1,   CONST_512, TMP22);

                vis_padd16(TMP2, CONST_3, TMP2);
                vis_mul8x16al(DST_2,   CONST_512, TMP24);

                vis_padd16(TMP4, CONST_3, TMP4);
                vis_mul8x16al(DST_3,   CONST_512, TMP26);

                vis_padd16(TMP6, CONST_3, TMP6);

                vis_padd16(TMP12, TMP20, TMP12);
                vis_mul8x16al(REF_S0,   CONST_512, TMP20);

                vis_padd16(TMP14, TMP22, TMP14);
                vis_mul8x16al(REF_S0_1, CONST_512, TMP22);

                vis_padd16(TMP16, TMP24, TMP16);
                vis_mul8x16al(REF_S2,   CONST_512, TMP24);

                vis_padd16(TMP18, TMP26, TMP18);
                vis_mul8x16al(REF_S2_1, CONST_512, TMP26);

                vis_padd16(TMP12, TMP0, TMP12);
                vis_mul8x16au(REF_2,   CONST_256, TMP28);

                vis_padd16(TMP14, TMP2, TMP14);
                vis_mul8x16au(REF_2_1, CONST_256, TMP30);

                vis_padd16(TMP16, TMP4, TMP16);
                vis_mul8x16au(REF_6,   CONST_256, REF_S4);

                vis_padd16(TMP18, TMP6, TMP18);
                vis_mul8x16au(REF_6_1, CONST_256, REF_S6);

                vis_pack16(TMP12, DST_0);
                vis_padd16(TMP28, TMP0, TMP12);

                vis_pack16(TMP14, DST_1);
                vis_st64(DST_0, dest[0]);
                vis_padd16(TMP30, TMP2, TMP14);

                vis_pack16(TMP16, DST_2);
                vis_padd16(REF_S4, TMP4, TMP16);

                vis_pack16(TMP18, DST_3);
                vis_st64_2(DST_2, dest, 8);
                dest += stride;
                vis_padd16(REF_S6, TMP6, TMP18);

                vis_padd16(TMP12, TMP20, TMP12);

                vis_padd16(TMP14, TMP22, TMP14);
                vis_pack16(TMP12, DST_0);

                vis_padd16(TMP16, TMP24, TMP16);
                vis_pack16(TMP14, DST_1);
                vis_st64(DST_0, dest[0]);

                vis_padd16(TMP18, TMP26, TMP18);
                vis_pack16(TMP16, DST_2);

                vis_pack16(TMP18, DST_3);
                vis_st64_2(DST_2, dest, 8);
                dest += stride;
        } while (--height);
}

static void MC_avg_y_8_vis (uint8_t * dest, const uint8_t * _ref,
                            const int stride, int height)
{
        uint8_t *ref = (uint8_t *) _ref;
        int stride_8 = stride + 8;

        vis_set_gsr(5 << VIS_GSR_SCALEFACT_SHIFT);

        ref = vis_alignaddr(ref);

        vis_ld64(ref[ 0], TMP0);
        vis_fzero(ZERO);

        vis_ld64(ref[ 8], TMP2);

        vis_ld64(constants3[0], CONST_3);
        vis_faligndata(TMP0, TMP2, REF_2);

        vis_ld64(constants256_512[0], CONST_256);

        height >>= 1;
        do {    /* 20 cycles */
                vis_ld64_2(ref, stride, TMP0);
                vis_pmerge(ZERO,       REF_2,     TMP8);
                vis_mul8x16au(REF_2_1, CONST_256, TMP10);

                vis_ld64_2(ref, stride_8, TMP2);
                ref += stride;

                vis_ld64(dest[0], DST_0);

                vis_ld64_2(dest, stride, DST_2);
                vis_faligndata(TMP0, TMP2, REF_0);

                vis_ld64_2(ref, stride, TMP4);
                vis_mul8x16al(DST_0,   CONST_512, TMP16);
                vis_pmerge(ZERO,       REF_0,     TMP12);

                vis_ld64_2(ref, stride_8, TMP6);
                ref += stride;
                vis_mul8x16al(DST_1,   CONST_512, TMP18);
                vis_pmerge(ZERO,       REF_0_1,   TMP14);

                vis_padd16(TMP12, CONST_3, TMP12);
                vis_mul8x16al(DST_2,   CONST_512, TMP24);

                vis_padd16(TMP14, CONST_3, TMP14);
                vis_mul8x16al(DST_3,   CONST_512, TMP26);

                vis_faligndata(TMP4, TMP6, REF_2);

                vis_padd16(TMP8, TMP12, TMP8);

                vis_padd16(TMP10, TMP14, TMP10);
                vis_mul8x16au(REF_2,   CONST_256, TMP20);

                vis_padd16(TMP8, TMP16, TMP0);
                vis_mul8x16au(REF_2_1, CONST_256, TMP22);

                vis_padd16(TMP10, TMP18, TMP2);
                vis_pack16(TMP0, DST_0);

                vis_pack16(TMP2, DST_1);
                vis_st64(DST_0, dest[0]);
                dest += stride;
                vis_padd16(TMP12, TMP20, TMP12);

                vis_padd16(TMP14, TMP22, TMP14);

                vis_padd16(TMP12, TMP24, TMP0);

                vis_padd16(TMP14, TMP26, TMP2);
                vis_pack16(TMP0, DST_2);

                vis_pack16(TMP2, DST_3);
                vis_st64(DST_2, dest[0]);
                dest += stride;
        } while (--height);
}
1456
1457
static void MC_put_xy_16_vis (uint8_t * dest, const uint8_t * _ref,
1458 bb270c08 Diego Biurrun
                              const int stride, int height)
1459 44f54ceb Michael Niedermayer
{
1460 bb270c08 Diego Biurrun
        uint8_t *ref = (uint8_t *) _ref;
1461
        unsigned long off = (unsigned long) ref & 0x7;
1462
        unsigned long off_plus_1 = off + 1;
1463
        int stride_8 = stride + 8;
1464
        int stride_16 = stride + 16;
1465 44f54ceb Michael Niedermayer
1466 bb270c08 Diego Biurrun
        vis_set_gsr(5 << VIS_GSR_SCALEFACT_SHIFT);
1467 44f54ceb Michael Niedermayer
1468 bb270c08 Diego Biurrun
        ref = vis_alignaddr(ref);
1469 44f54ceb Michael Niedermayer
1470 bb270c08 Diego Biurrun
        vis_ld64(ref[ 0], TMP0);
1471
        vis_fzero(ZERO);
1472 44f54ceb Michael Niedermayer
1473 bb270c08 Diego Biurrun
        vis_ld64(ref[ 8], TMP2);
1474 44f54ceb Michael Niedermayer
1475 bb270c08 Diego Biurrun
        vis_ld64(ref[16], TMP4);
1476 44f54ceb Michael Niedermayer
1477 bb270c08 Diego Biurrun
        vis_ld64(constants2[0], CONST_2);
1478
        vis_faligndata(TMP0, TMP2, REF_S0);
1479 44f54ceb Michael Niedermayer
1480 bb270c08 Diego Biurrun
        vis_ld64(constants256_512[0], CONST_256);
1481
        vis_faligndata(TMP2, TMP4, REF_S4);
1482 44f54ceb Michael Niedermayer
1483 bb270c08 Diego Biurrun
        if (off != 0x7) {
1484
                vis_alignaddr_g0((void *)off_plus_1);
1485
                vis_faligndata(TMP0, TMP2, REF_S2);
1486
                vis_faligndata(TMP2, TMP4, REF_S6);
1487
        } else {
1488
                vis_src1(TMP2, REF_S2);
1489
                vis_src1(TMP4, REF_S6);
1490
        }
1491 44f54ceb Michael Niedermayer
1492 bb270c08 Diego Biurrun
        height >>= 1;
1493
        do {
1494
                vis_ld64_2(ref, stride, TMP0);
1495
                vis_mul8x16au(REF_S0, CONST_256, TMP12);
1496
                vis_pmerge(ZERO,      REF_S0_1,  TMP14);
1497 44f54ceb Michael Niedermayer
1498 bb270c08 Diego Biurrun
                vis_alignaddr_g0((void *)off);
1499 44f54ceb Michael Niedermayer
1500 bb270c08 Diego Biurrun
                vis_ld64_2(ref, stride_8, TMP2);
1501
                vis_mul8x16au(REF_S2, CONST_256, TMP16);
1502
                vis_pmerge(ZERO,      REF_S2_1,  TMP18);
1503 44f54ceb Michael Niedermayer
1504 bb270c08 Diego Biurrun
                vis_ld64_2(ref, stride_16, TMP4);
1505
                ref += stride;
1506
                vis_mul8x16au(REF_S4, CONST_256, TMP20);
1507
                vis_pmerge(ZERO,      REF_S4_1,  TMP22);
1508 44f54ceb Michael Niedermayer
1509 bb270c08 Diego Biurrun
                vis_ld64_2(ref, stride, TMP6);
1510
                vis_mul8x16au(REF_S6, CONST_256, TMP24);
1511
                vis_pmerge(ZERO,      REF_S6_1,  TMP26);
1512 44f54ceb Michael Niedermayer
1513 bb270c08 Diego Biurrun
                vis_ld64_2(ref, stride_8, TMP8);
1514
                vis_faligndata(TMP0, TMP2, REF_0);
1515 44f54ceb Michael Niedermayer
1516 bb270c08 Diego Biurrun
                vis_ld64_2(ref, stride_16, TMP10);
1517
                ref += stride;
1518
                vis_faligndata(TMP2, TMP4, REF_4);
1519 44f54ceb Michael Niedermayer
1520 bb270c08 Diego Biurrun
                vis_faligndata(TMP6, TMP8, REF_S0);
1521 44f54ceb Michael Niedermayer
1522 bb270c08 Diego Biurrun
                vis_faligndata(TMP8, TMP10, REF_S4);
1523 44f54ceb Michael Niedermayer
1524 bb270c08 Diego Biurrun
                if (off != 0x7) {
1525
                        vis_alignaddr_g0((void *)off_plus_1);
1526
                        vis_faligndata(TMP0, TMP2, REF_2);
1527
                        vis_faligndata(TMP2, TMP4, REF_6);
1528
                        vis_faligndata(TMP6, TMP8, REF_S2);
1529
                        vis_faligndata(TMP8, TMP10, REF_S6);
1530
                } else {
1531
                        vis_src1(TMP2, REF_2);
1532
                        vis_src1(TMP4, REF_6);
1533
                        vis_src1(TMP8, REF_S2);
1534
                        vis_src1(TMP10, REF_S6);
1535
                }
1536 44f54ceb Michael Niedermayer
1537 bb270c08 Diego Biurrun
                vis_mul8x16au(REF_0, CONST_256, TMP0);
1538
                vis_pmerge(ZERO,      REF_0_1,  TMP2);
1539 44f54ceb Michael Niedermayer
1540 bb270c08 Diego Biurrun
                vis_mul8x16au(REF_2, CONST_256, TMP4);
1541
                vis_pmerge(ZERO,      REF_2_1,  TMP6);
1542 44f54ceb Michael Niedermayer
1543 bb270c08 Diego Biurrun
                vis_padd16(TMP0, CONST_2, TMP8);
1544
                vis_mul8x16au(REF_4, CONST_256, TMP0);
1545 44f54ceb Michael Niedermayer
1546 bb270c08 Diego Biurrun
                vis_padd16(TMP2, CONST_2, TMP10);
1547
                vis_mul8x16au(REF_4_1, CONST_256, TMP2);
1548 44f54ceb Michael Niedermayer
1549 bb270c08 Diego Biurrun
                vis_padd16(TMP8, TMP4, TMP8);
1550
                vis_mul8x16au(REF_6, CONST_256, TMP4);
1551 44f54ceb Michael Niedermayer
1552 bb270c08 Diego Biurrun
                vis_padd16(TMP10, TMP6, TMP10);
1553
                vis_mul8x16au(REF_6_1, CONST_256, TMP6);
1554 44f54ceb Michael Niedermayer
1555 bb270c08 Diego Biurrun
                vis_padd16(TMP12, TMP8, TMP12);
1556 44f54ceb Michael Niedermayer
1557 bb270c08 Diego Biurrun
                vis_padd16(TMP14, TMP10, TMP14);
1558 44f54ceb Michael Niedermayer
1559 bb270c08 Diego Biurrun
                vis_padd16(TMP12, TMP16, TMP12);
1560 44f54ceb Michael Niedermayer
1561 bb270c08 Diego Biurrun
                vis_padd16(TMP14, TMP18, TMP14);
1562
                vis_pack16(TMP12, DST_0);
1563 44f54ceb Michael Niedermayer
1564 bb270c08 Diego Biurrun
                vis_pack16(TMP14, DST_1);
1565
                vis_st64(DST_0, dest[0]);
1566
                vis_padd16(TMP0, CONST_2, TMP12);
1567 44f54ceb Michael Niedermayer
1568 bb270c08 Diego Biurrun
                vis_mul8x16au(REF_S0, CONST_256, TMP0);
1569
                vis_padd16(TMP2, CONST_2, TMP14);
1570 44f54ceb Michael Niedermayer
1571 bb270c08 Diego Biurrun
                vis_mul8x16au(REF_S0_1, CONST_256, TMP2);
1572
                vis_padd16(TMP12, TMP4, TMP12);
1573 44f54ceb Michael Niedermayer
1574 bb270c08 Diego Biurrun
                vis_mul8x16au(REF_S2, CONST_256, TMP4);
1575
                vis_padd16(TMP14, TMP6, TMP14);
1576 44f54ceb Michael Niedermayer
1577 bb270c08 Diego Biurrun
                vis_mul8x16au(REF_S2_1, CONST_256, TMP6);
1578
                vis_padd16(TMP20, TMP12, TMP20);
1579 44f54ceb Michael Niedermayer
1580 bb270c08 Diego Biurrun
                vis_padd16(TMP22, TMP14, TMP22);
1581 44f54ceb Michael Niedermayer
1582 bb270c08 Diego Biurrun
                vis_padd16(TMP20, TMP24, TMP20);
1583 44f54ceb Michael Niedermayer
1584 bb270c08 Diego Biurrun
                vis_padd16(TMP22, TMP26, TMP22);
1585
                vis_pack16(TMP20, DST_2);
1586 44f54ceb Michael Niedermayer
1587 bb270c08 Diego Biurrun
                vis_pack16(TMP22, DST_3);
1588
                vis_st64_2(DST_2, dest, 8);
1589
                dest += stride;
1590
                vis_padd16(TMP0, TMP4, TMP24);
1591 44f54ceb Michael Niedermayer
1592 bb270c08 Diego Biurrun
                vis_mul8x16au(REF_S4, CONST_256, TMP0);
1593
                vis_padd16(TMP2, TMP6, TMP26);
1594 44f54ceb Michael Niedermayer
1595 bb270c08 Diego Biurrun
                vis_mul8x16au(REF_S4_1, CONST_256, TMP2);
1596
                vis_padd16(TMP24, TMP8, TMP24);
1597 44f54ceb Michael Niedermayer
1598 bb270c08 Diego Biurrun
                vis_padd16(TMP26, TMP10, TMP26);
1599
                vis_pack16(TMP24, DST_0);
1600 44f54ceb Michael Niedermayer
1601 bb270c08 Diego Biurrun
                vis_pack16(TMP26, DST_1);
1602
                vis_st64(DST_0, dest[0]);
1603
                vis_pmerge(ZERO, REF_S6, TMP4);
1604 44f54ceb Michael Niedermayer
1605 bb270c08 Diego Biurrun
                vis_pmerge(ZERO,      REF_S6_1,  TMP6);
1606 44f54ceb Michael Niedermayer
1607 bb270c08 Diego Biurrun
                vis_padd16(TMP0, TMP4, TMP0);
1608 44f54ceb Michael Niedermayer
1609 bb270c08 Diego Biurrun
                vis_padd16(TMP2, TMP6, TMP2);
1610 44f54ceb Michael Niedermayer
1611 bb270c08 Diego Biurrun
                vis_padd16(TMP0, TMP12, TMP0);
1612 44f54ceb Michael Niedermayer
1613 bb270c08 Diego Biurrun
                vis_padd16(TMP2, TMP14, TMP2);
1614
                vis_pack16(TMP0, DST_2);
1615 44f54ceb Michael Niedermayer
1616 bb270c08 Diego Biurrun
                vis_pack16(TMP2, DST_3);
1617
                vis_st64_2(DST_2, dest, 8);
1618
                dest += stride;
1619
        } while (--height);
1620 44f54ceb Michael Niedermayer
}

static void MC_put_xy_8_vis (uint8_t * dest, const uint8_t * _ref,
                             const int stride, int height)
{
        uint8_t *ref = (uint8_t *) _ref;
        unsigned long off = (unsigned long) ref & 0x7;
        unsigned long off_plus_1 = off + 1;
        int stride_8 = stride + 8;

        vis_set_gsr(5 << VIS_GSR_SCALEFACT_SHIFT);

        ref = vis_alignaddr(ref);

        vis_ld64(ref[ 0], TMP0);
        vis_fzero(ZERO);

        vis_ld64(ref[ 8], TMP2);

        vis_ld64(constants2[0], CONST_2);

        vis_ld64(constants256_512[0], CONST_256);
        vis_faligndata(TMP0, TMP2, REF_S0);

        if (off != 0x7) {
                vis_alignaddr_g0((void *)off_plus_1);
                vis_faligndata(TMP0, TMP2, REF_S2);
        } else {
                vis_src1(TMP2, REF_S2);
        }

        height >>= 1;
        do {    /* 26 cycles */
                vis_ld64_2(ref, stride, TMP0);
                vis_mul8x16au(REF_S0,   CONST_256, TMP8);
                vis_pmerge(ZERO,        REF_S2,    TMP12);

                vis_alignaddr_g0((void *)off);

                vis_ld64_2(ref, stride_8, TMP2);
                ref += stride;
                vis_mul8x16au(REF_S0_1, CONST_256, TMP10);
                vis_pmerge(ZERO,        REF_S2_1,  TMP14);

                vis_ld64_2(ref, stride, TMP4);

                vis_ld64_2(ref, stride_8, TMP6);
                ref += stride;
                vis_faligndata(TMP0, TMP2, REF_S4);

                vis_pmerge(ZERO, REF_S4, TMP18);

                vis_pmerge(ZERO, REF_S4_1, TMP20);

                vis_faligndata(TMP4, TMP6, REF_S0);

                if (off != 0x7) {
                        vis_alignaddr_g0((void *)off_plus_1);
                        vis_faligndata(TMP0, TMP2, REF_S6);
                        vis_faligndata(TMP4, TMP6, REF_S2);
                } else {
                        vis_src1(TMP2, REF_S6);
                        vis_src1(TMP6, REF_S2);
                }

                vis_padd16(TMP18, CONST_2, TMP18);
                vis_mul8x16au(REF_S6,   CONST_256, TMP22);

                vis_padd16(TMP20, CONST_2, TMP20);
                vis_mul8x16au(REF_S6_1, CONST_256, TMP24);

                vis_mul8x16au(REF_S0,   CONST_256, TMP26);
                vis_pmerge(ZERO, REF_S0_1, TMP28);

                vis_mul8x16au(REF_S2,   CONST_256, TMP30);
                vis_padd16(TMP18, TMP22, TMP18);

                vis_mul8x16au(REF_S2_1, CONST_256, TMP32);
                vis_padd16(TMP20, TMP24, TMP20);

                vis_padd16(TMP8,  TMP18, TMP8);

                vis_padd16(TMP10, TMP20, TMP10);

                vis_padd16(TMP8,  TMP12, TMP8);

                vis_padd16(TMP10, TMP14, TMP10);
                vis_pack16(TMP8,  DST_0);

                vis_pack16(TMP10, DST_1);
                vis_st64(DST_0, dest[0]);
                dest += stride;
                vis_padd16(TMP18, TMP26, TMP18);

                vis_padd16(TMP20, TMP28, TMP20);

                vis_padd16(TMP18, TMP30, TMP18);

                vis_padd16(TMP20, TMP32, TMP20);
                vis_pack16(TMP18, DST_2);

                vis_pack16(TMP20, DST_3);
                vis_st64(DST_2, dest[0]);
                dest += stride;
        } while (--height);
}

static void MC_avg_xy_16_vis (uint8_t * dest, const uint8_t * _ref,
                              const int stride, int height)
{
        uint8_t *ref = (uint8_t *) _ref;
        unsigned long off = (unsigned long) ref & 0x7;
        unsigned long off_plus_1 = off + 1;
        int stride_8 = stride + 8;
        int stride_16 = stride + 16;

        vis_set_gsr(4 << VIS_GSR_SCALEFACT_SHIFT);

        ref = vis_alignaddr(ref);

        vis_ld64(ref[ 0], TMP0);
        vis_fzero(ZERO);

        vis_ld64(ref[ 8], TMP2);

        vis_ld64(ref[16], TMP4);

        vis_ld64(constants6[0], CONST_6);
        vis_faligndata(TMP0, TMP2, REF_S0);

        vis_ld64(constants256_1024[0], CONST_256);
        vis_faligndata(TMP2, TMP4, REF_S4);

        if (off != 0x7) {
                vis_alignaddr_g0((void *)off_plus_1);
                vis_faligndata(TMP0, TMP2, REF_S2);
                vis_faligndata(TMP2, TMP4, REF_S6);
        } else {
                vis_src1(TMP2, REF_S2);
                vis_src1(TMP4, REF_S6);
        }

        height >>= 1;
        do {    /* 55 cycles */
                vis_ld64_2(ref, stride, TMP0);
                vis_mul8x16au(REF_S0, CONST_256, TMP12);
                vis_pmerge(ZERO,      REF_S0_1,  TMP14);

                vis_alignaddr_g0((void *)off);

                vis_ld64_2(ref, stride_8, TMP2);
                vis_mul8x16au(REF_S2, CONST_256, TMP16);
                vis_pmerge(ZERO,      REF_S2_1,  TMP18);

                vis_ld64_2(ref, stride_16, TMP4);
                ref += stride;
                vis_mul8x16au(REF_S4, CONST_256, TMP20);
                vis_pmerge(ZERO,      REF_S4_1,  TMP22);

                vis_ld64_2(ref, stride, TMP6);
                vis_mul8x16au(REF_S6, CONST_256, TMP24);
                vis_pmerge(ZERO,      REF_S6_1,  TMP26);

                vis_ld64_2(ref, stride_8, TMP8);
                vis_faligndata(TMP0, TMP2, REF_0);

                vis_ld64_2(ref, stride_16, TMP10);
                ref += stride;
                vis_faligndata(TMP2, TMP4, REF_4);

                vis_ld64(dest[0], DST_0);
                vis_faligndata(TMP6, TMP8, REF_S0);

                vis_ld64_2(dest, 8, DST_2);
                vis_faligndata(TMP8, TMP10, REF_S4);

                if (off != 0x7) {
                        vis_alignaddr_g0((void *)off_plus_1);
                        vis_faligndata(TMP0, TMP2, REF_2);
                        vis_faligndata(TMP2, TMP4, REF_6);
                        vis_faligndata(TMP6, TMP8, REF_S2);
                        vis_faligndata(TMP8, TMP10, REF_S6);
                } else {
                        vis_src1(TMP2, REF_2);
                        vis_src1(TMP4, REF_6);
                        vis_src1(TMP8, REF_S2);
                        vis_src1(TMP10, REF_S6);
                }

                vis_mul8x16al(DST_0,   CONST_1024, TMP30);
                vis_pmerge(ZERO, REF_0, TMP0);

                vis_mul8x16al(DST_1,   CONST_1024, TMP32);
                vis_pmerge(ZERO,      REF_0_1,  TMP2);

                vis_mul8x16au(REF_2, CONST_256, TMP4);
                vis_pmerge(ZERO,      REF_2_1,  TMP6);

                vis_mul8x16al(DST_2,   CONST_1024, REF_0);
                vis_padd16(TMP0, CONST_6, TMP0);

                vis_mul8x16al(DST_3,   CONST_1024, REF_2);
                vis_padd16(TMP2, CONST_6, TMP2);

                vis_padd16(TMP0, TMP4, TMP0);
                vis_mul8x16au(REF_4, CONST_256, TMP4);

                vis_padd16(TMP2, TMP6, TMP2);
                vis_mul8x16au(REF_4_1, CONST_256, TMP6);

                vis_padd16(TMP12, TMP0, TMP12);
                vis_mul8x16au(REF_6, CONST_256, TMP8);

                vis_padd16(TMP14, TMP2, TMP14);
                vis_mul8x16au(REF_6_1, CONST_256, TMP10);

                vis_padd16(TMP12, TMP16, TMP12);
                vis_mul8x16au(REF_S0, CONST_256, REF_4);

                vis_padd16(TMP14, TMP18, TMP14);
                vis_mul8x16au(REF_S0_1, CONST_256, REF_6);

                vis_padd16(TMP12, TMP30, TMP12);

                vis_padd16(TMP14, TMP32, TMP14);
                vis_pack16(TMP12, DST_0);

                vis_pack16(TMP14, DST_1);
                vis_st64(DST_0, dest[0]);
                vis_padd16(TMP4, CONST_6, TMP4);

                vis_ld64_2(dest, stride, DST_0);
                vis_padd16(TMP6, CONST_6, TMP6);
                vis_mul8x16au(REF_S2, CONST_256, TMP12);

                vis_padd16(TMP4, TMP8, TMP4);
                vis_mul8x16au(REF_S2_1, CONST_256,  TMP14);

                vis_padd16(TMP6, TMP10, TMP6);

                vis_padd16(TMP20, TMP4, TMP20);

                vis_padd16(TMP22, TMP6, TMP22);

                vis_padd16(TMP20, TMP24, TMP20);

                vis_padd16(TMP22, TMP26, TMP22);

                vis_padd16(TMP20, REF_0, TMP20);
                vis_mul8x16au(REF_S4, CONST_256, REF_0);

                vis_padd16(TMP22, REF_2, TMP22);
                vis_pack16(TMP20, DST_2);

                vis_pack16(TMP22, DST_3);
                vis_st64_2(DST_2, dest, 8);
                dest += stride;

                vis_ld64_2(dest, 8, DST_2);
                vis_mul8x16al(DST_0,   CONST_1024, TMP30);
                vis_pmerge(ZERO,      REF_S4_1,  REF_2);

                vis_mul8x16al(DST_1,   CONST_1024, TMP32);
                vis_padd16(REF_4, TMP0, TMP8);

                vis_mul8x16au(REF_S6, CONST_256, REF_4);
                vis_padd16(REF_6, TMP2, TMP10);

                vis_mul8x16au(REF_S6_1, CONST_256, REF_6);
                vis_padd16(TMP8, TMP12, TMP8);

                vis_padd16(TMP10, TMP14, TMP10);

                vis_padd16(TMP8, TMP30, TMP8);

                vis_padd16(TMP10, TMP32, TMP10);
                vis_pack16(TMP8, DST_0);

                vis_pack16(TMP10, DST_1);
                vis_st64(DST_0, dest[0]);

                vis_padd16(REF_0, TMP4, REF_0);

                vis_mul8x16al(DST_2,   CONST_1024, TMP30);
                vis_padd16(REF_2, TMP6, REF_2);

                vis_mul8x16al(DST_3,   CONST_1024, TMP32);
                vis_padd16(REF_0, REF_4, REF_0);

                vis_padd16(REF_2, REF_6, REF_2);

                vis_padd16(REF_0, TMP30, REF_0);

                /* stall */

                vis_padd16(REF_2, TMP32, REF_2);
                vis_pack16(REF_0, DST_2);

                vis_pack16(REF_2, DST_3);
                vis_st64_2(DST_2, dest, 8);
                dest += stride;
        } while (--height);
}

static void MC_avg_xy_8_vis (uint8_t * dest, const uint8_t * _ref,
                             const int stride, int height)
{
        uint8_t *ref = (uint8_t *) _ref;
        unsigned long off = (unsigned long) ref & 0x7;
        unsigned long off_plus_1 = off + 1;
        int stride_8 = stride + 8;

        vis_set_gsr(4 << VIS_GSR_SCALEFACT_SHIFT);

        ref = vis_alignaddr(ref);

        vis_ld64(ref[0], TMP0);
        vis_fzero(ZERO);

        vis_ld64_2(ref, 8, TMP2);

        vis_ld64(constants6[0], CONST_6);

        vis_ld64(constants256_1024[0], CONST_256);
        vis_faligndata(TMP0, TMP2, REF_S0);

        if (off != 0x7) {
                vis_alignaddr_g0((void *)off_plus_1);
                vis_faligndata(TMP0, TMP2, REF_S2);
        } else {
                vis_src1(TMP2, REF_S2);
        }

        height >>= 1;
        do {    /* 31 cycles */
                vis_ld64_2(ref, stride, TMP0);
                vis_mul8x16au(REF_S0, CONST_256, TMP8);
                vis_pmerge(ZERO,      REF_S0_1,  TMP10);

                vis_ld64_2(ref, stride_8, TMP2);
                ref += stride;
                vis_mul8x16au(REF_S2, CONST_256, TMP12);
                vis_pmerge(ZERO,      REF_S2_1,  TMP14);

                vis_alignaddr_g0((void *)off);

                vis_ld64_2(ref, stride, TMP4);
                vis_faligndata(TMP0, TMP2, REF_S4);

                vis_ld64_2(ref, stride_8, TMP6);
                ref += stride;

                vis_ld64(dest[0], DST_0);
                vis_faligndata(TMP4, TMP6, REF_S0);

                vis_ld64_2(dest, stride, DST_2);

                if (off != 0x7) {
                        vis_alignaddr_g0((void *)off_plus_1);
                        vis_faligndata(TMP0, TMP2, REF_S6);
                        vis_faligndata(TMP4, TMP6, REF_S2);
                } else {
                        vis_src1(TMP2, REF_S6);
                        vis_src1(TMP6, REF_S2);
                }

                vis_mul8x16al(DST_0,   CONST_1024, TMP30);
                vis_pmerge(ZERO, REF_S4, TMP22);

                vis_mul8x16al(DST_1,   CONST_1024, TMP32);
                vis_pmerge(ZERO,      REF_S4_1,  TMP24);

                vis_mul8x16au(REF_S6, CONST_256, TMP26);
                vis_pmerge(ZERO,      REF_S6_1,  TMP28);

                vis_mul8x16au(REF_S0, CONST_256, REF_S4);
                vis_padd16(TMP22, CONST_6, TMP22);

                vis_mul8x16au(REF_S0_1, CONST_256, REF_S6);
                vis_padd16(TMP24, CONST_6, TMP24);

                vis_mul8x16al(DST_2,   CONST_1024, REF_0);
                vis_padd16(TMP22, TMP26, TMP22);

                vis_mul8x16al(DST_3,   CONST_1024, REF_2);
                vis_padd16(TMP24, TMP28, TMP24);

                vis_mul8x16au(REF_S2, CONST_256, TMP26);
                vis_padd16(TMP8, TMP22, TMP8);

                vis_mul8x16au(REF_S2_1, CONST_256, TMP28);
                vis_padd16(TMP10, TMP24, TMP10);

                vis_padd16(TMP8, TMP12, TMP8);

                vis_padd16(TMP10, TMP14, TMP10);

                vis_padd16(TMP8, TMP30, TMP8);

                vis_padd16(TMP10, TMP32, TMP10);
                vis_pack16(TMP8, DST_0);

                vis_pack16(TMP10, DST_1);
                vis_st64(DST_0, dest[0]);
                dest += stride;

                vis_padd16(REF_S4, TMP22, TMP12);

                vis_padd16(REF_S6, TMP24, TMP14);

                vis_padd16(TMP12, TMP26, TMP12);

                vis_padd16(TMP14, TMP28, TMP14);

                vis_padd16(TMP12, REF_0, TMP12);

                vis_padd16(TMP14, REF_2, TMP14);
                vis_pack16(TMP12, DST_2);

                vis_pack16(TMP14, DST_3);
                vis_st64(DST_2, dest[0]);
                dest += stride;
        } while (--height);
}

/* End of rounding code */

/* Start of no rounding code */
/* The trick used in some of this file is the formula from the MMX
 * motion comp code, which is:
 *
 * (x+y)>>1 == (x&y)+((x^y)>>1)
 *
 * This allows us to average 8 bytes at a time in a 64-bit FPU reg.
 * We avoid overflows by masking before we do the shift, and we
 * implement the shift by multiplying by 1/2 using mul8x16.  So in
 * VIS this is (assume 'x' is in f0, 'y' is in f2, a repeating mask
 * of '0xfe' is in f4, a repeating mask of '0x7f' is in f6, and
 * the value 0x80808080 is in f8):
 *
 *      fxor            f0,   f2, f10
 *      fand            f10,  f4, f10
 *      fmul8x16        f8,  f10, f10
 *      fand            f10,  f6, f10
 *      fand            f0,   f2, f12
 *      fpadd16         f12, f10, f10
 */

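/* A minimal scalar sketch (an illustrative addition, not part of the
 * original VIS code): it checks the identity above one byte at a time.
 * The helper name is hypothetical; the functions below compute the same
 * no-rounding average on eight bytes per 64-bit register. */
#if 0
static uint8_t avg_no_round_byte(uint8_t x, uint8_t y)
{
        /* x + y == 2*(x & y) + (x ^ y), so the truncating average
           (x + y) >> 1 equals (x & y) + ((x ^ y) >> 1), computed
           without ever needing a 9-bit intermediate. */
        return (x & y) + ((x ^ y) >> 1);
}
#endif
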
static void MC_put_no_round_o_16_vis (uint8_t * dest, const uint8_t * _ref,
                                      const int stride, int height)
{
        uint8_t *ref = (uint8_t *) _ref;

        ref = vis_alignaddr(ref);
        do {    /* 5 cycles */
                vis_ld64(ref[0], TMP0);

                vis_ld64_2(ref, 8, TMP2);

                vis_ld64_2(ref, 16, TMP4);
                ref += stride;

                vis_faligndata(TMP0, TMP2, REF_0);
                vis_st64(REF_0, dest[0]);

                vis_faligndata(TMP2, TMP4, REF_2);
                vis_st64_2(REF_2, dest, 8);
                dest += stride;
        } while (--height);
}

static void MC_put_no_round_o_8_vis (uint8_t * dest, const uint8_t * _ref,
                            const int stride, int height)
{
        uint8_t *ref = (uint8_t *) _ref;

        ref = vis_alignaddr(ref);
        do {    /* 4 cycles */
                vis_ld64(ref[0], TMP0);

                vis_ld64(ref[8], TMP2);
                ref += stride;

                /* stall */

                vis_faligndata(TMP0, TMP2, REF_0);
                vis_st64(REF_0, dest[0]);
                dest += stride;
        } while (--height);
}


static void MC_avg_no_round_o_16_vis (uint8_t * dest, const uint8_t * _ref,
                             const int stride, int height)
{
        uint8_t *ref = (uint8_t *) _ref;
        int stride_8 = stride + 8;

        ref = vis_alignaddr(ref);

        vis_ld64(ref[0], TMP0);

        vis_ld64(ref[8], TMP2);

        vis_ld64(ref[16], TMP4);

        vis_ld64(dest[0], DST_0);

        vis_ld64(dest[8], DST_2);

        vis_ld64(constants_fe[0], MASK_fe);
        vis_faligndata(TMP0, TMP2, REF_0);

        vis_ld64(constants_7f[0], MASK_7f);
        vis_faligndata(TMP2, TMP4, REF_2);

        vis_ld64(constants128[0], CONST_128);

        ref += stride;
        height = (height >> 1) - 1;

        do {    /* 24 cycles */
                vis_ld64(ref[0], TMP0);
                vis_xor(DST_0, REF_0, TMP6);

                vis_ld64_2(ref, 8, TMP2);
                vis_and(TMP6, MASK_fe, TMP6);

                vis_ld64_2(ref, 16, TMP4);
                ref += stride;
                vis_mul8x16(CONST_128, TMP6, TMP6);
                vis_xor(DST_2, REF_2, TMP8);

                vis_and(TMP8, MASK_fe, TMP8);

                vis_and(DST_0, REF_0, TMP10);
                vis_ld64_2(dest, stride, DST_0);
                vis_mul8x16(CONST_128, TMP8, TMP8);

                vis_and(DST_2, REF_2, TMP12);
                vis_ld64_2(dest, stride_8, DST_2);

                vis_ld64(ref[0], TMP14);
                vis_and(TMP6, MASK_7f, TMP6);

                vis_and(TMP8, MASK_7f, TMP8);

                vis_padd16(TMP10, TMP6, TMP6);
                vis_st64(TMP6, dest[0]);

                vis_padd16(TMP12, TMP8, TMP8);
                vis_st64_2(TMP8, dest, 8);

                dest += stride;
                vis_ld64_2(ref, 8, TMP16);
                vis_faligndata(TMP0, TMP2, REF_0);

                vis_ld64_2(ref, 16, TMP18);
                vis_faligndata(TMP2, TMP4, REF_2);
                ref += stride;

                vis_xor(DST_0, REF_0, TMP20);

                vis_and(TMP20, MASK_fe, TMP20);

                vis_xor(DST_2, REF_2, TMP22);
                vis_mul8x16(CONST_128, TMP20, TMP20);

                vis_and(TMP22, MASK_fe, TMP22);

                vis_and(DST_0, REF_0, TMP24);
                vis_mul8x16(CONST_128, TMP22, TMP22);

                vis_and(DST_2, REF_2, TMP26);

                vis_ld64_2(dest, stride, DST_0);
                vis_faligndata(TMP14, TMP16, REF_0);

                vis_ld64_2(dest, stride_8, DST_2);
                vis_faligndata(TMP16, TMP18, REF_2);

                vis_and(TMP20, MASK_7f, TMP20);

                vis_and(TMP22, MASK_7f, TMP22);

                vis_padd16(TMP24, TMP20, TMP20);
                vis_st64(TMP20, dest[0]);

                vis_padd16(TMP26, TMP22, TMP22);
                vis_st64_2(TMP22, dest, 8);
                dest += stride;
        } while (--height);

        vis_ld64(ref[0], TMP0);
        vis_xor(DST_0, REF_0, TMP6);

        vis_ld64_2(ref, 8, TMP2);
        vis_and(TMP6, MASK_fe, TMP6);

        vis_ld64_2(ref, 16, TMP4);
        vis_mul8x16(CONST_128, TMP6, TMP6);
        vis_xor(DST_2, REF_2, TMP8);

        vis_and(TMP8, MASK_fe, TMP8);

        vis_and(DST_0, REF_0, TMP10);
        vis_ld64_2(dest, stride, DST_0);
        vis_mul8x16(CONST_128, TMP8, TMP8);

        vis_and(DST_2, REF_2, TMP12);
        vis_ld64_2(dest, stride_8, DST_2);

        vis_ld64(ref[0], TMP14);
        vis_and(TMP6, MASK_7f, TMP6);

        vis_and(TMP8, MASK_7f, TMP8);

        vis_padd16(TMP10, TMP6, TMP6);
        vis_st64(TMP6, dest[0]);

        vis_padd16(TMP12, TMP8, TMP8);
        vis_st64_2(TMP8, dest, 8);

        dest += stride;
        vis_faligndata(TMP0, TMP2, REF_0);

        vis_faligndata(TMP2, TMP4, REF_2);

        vis_xor(DST_0, REF_0, TMP20);

        vis_and(TMP20, MASK_fe, TMP20);

        vis_xor(DST_2, REF_2, TMP22);
        vis_mul8x16(CONST_128, TMP20, TMP20);

        vis_and(TMP22, MASK_fe, TMP22);

        vis_and(DST_0, REF_0, TMP24);
        vis_mul8x16(CONST_128, TMP22, TMP22);

        vis_and(DST_2, REF_2, TMP26);

        vis_and(TMP20, MASK_7f, TMP20);

        vis_and(TMP22, MASK_7f, TMP22);

        vis_padd16(TMP24, TMP20, TMP20);
        vis_st64(TMP20, dest[0]);

        vis_padd16(TMP26, TMP22, TMP22);
        vis_st64_2(TMP22, dest, 8);
}
2272
2273
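/* The MC_avg_no_round_* functions use the truncating counterpart of the
 * rounding formula described at the top of this file:
 *
 *      (x+y)>>1 == (x&y)+((x^y)>>1)
 *
 * A scalar sketch of what each byte lane below computes (illustrative
 * only, not compiled; the helper name is hypothetical):
 *
 *      static uint8_t avg_no_round(uint8_t x, uint8_t y)
 *      {
 *              uint8_t t = (x ^ y) & 0xfe;  // drop the low bit, as MASK_fe does
 *              return (x & y) + (t >> 1);   // mul8x16 by CONST_128 performs t>>1
 *      }
 */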
static void MC_avg_no_round_o_8_vis (uint8_t * dest, const uint8_t * _ref,
                            const int stride, int height)
{
        uint8_t *ref = (uint8_t *) _ref;

        ref = vis_alignaddr(ref);

        vis_ld64(ref[0], TMP0);

        vis_ld64(ref[8], TMP2);

        vis_ld64(dest[0], DST_0);

        vis_ld64(constants_fe[0], MASK_fe);

        vis_ld64(constants_7f[0], MASK_7f);
        vis_faligndata(TMP0, TMP2, REF_0);

        vis_ld64(constants128[0], CONST_128);

        ref += stride;
        height = (height >> 1) - 1;

        do {    /* 12 cycles */
                vis_ld64(ref[0], TMP0);
                vis_xor(DST_0, REF_0, TMP4);

                vis_ld64(ref[8], TMP2);
                vis_and(TMP4, MASK_fe, TMP4);

                vis_and(DST_0, REF_0, TMP6);
                vis_ld64_2(dest, stride, DST_0);
                ref += stride;
                vis_mul8x16(CONST_128, TMP4, TMP4);

                vis_ld64(ref[0], TMP12);
                vis_faligndata(TMP0, TMP2, REF_0);

                vis_ld64(ref[8], TMP2);
                vis_xor(DST_0, REF_0, TMP0);
                ref += stride;

                vis_and(TMP0, MASK_fe, TMP0);

                vis_and(TMP4, MASK_7f, TMP4);

                vis_padd16(TMP6, TMP4, TMP4);
                vis_st64(TMP4, dest[0]);
                dest += stride;
                vis_mul8x16(CONST_128, TMP0, TMP0);

                vis_and(DST_0, REF_0, TMP6);
                vis_ld64_2(dest, stride, DST_0);

                vis_faligndata(TMP12, TMP2, REF_0);

                vis_and(TMP0, MASK_7f, TMP0);

                vis_padd16(TMP6, TMP0, TMP4);
                vis_st64(TMP4, dest[0]);
                dest += stride;
        } while (--height);

        vis_ld64(ref[0], TMP0);
        vis_xor(DST_0, REF_0, TMP4);

        vis_ld64(ref[8], TMP2);
        vis_and(TMP4, MASK_fe, TMP4);

        vis_and(DST_0, REF_0, TMP6);
        vis_ld64_2(dest, stride, DST_0);
        vis_mul8x16(CONST_128, TMP4, TMP4);

        vis_faligndata(TMP0, TMP2, REF_0);

        vis_xor(DST_0, REF_0, TMP0);

        vis_and(TMP0, MASK_fe, TMP0);

        vis_and(TMP4, MASK_7f, TMP4);

        vis_padd16(TMP6, TMP4, TMP4);
        vis_st64(TMP4, dest[0]);
        dest += stride;
        vis_mul8x16(CONST_128, TMP0, TMP0);

        vis_and(DST_0, REF_0, TMP6);

        vis_and(TMP0, MASK_7f, TMP0);

        vis_padd16(TMP6, TMP0, TMP4);
        vis_st64(TMP4, dest[0]);
}

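/* The MC_*_x_* functions compute the horizontal half-pel prediction:
 * each output byte averages two horizontally adjacent reference bytes.
 * The byte-shifted view of the reference (REF_2/REF_6) comes from a
 * second faligndata pass at offset off+1; when off == 7 that shift is a
 * full 8-byte word, so vis_src1 simply copies the following word
 * instead.  Scalar sketch of one 16-pixel row, without rounding
 * (illustrative only):
 *
 *      for (int i = 0; i < 16; i++)
 *              dest[i] = (ref[i] + ref[i + 1]) >> 1;
 */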
static void MC_put_no_round_x_16_vis (uint8_t * dest, const uint8_t * _ref,
                             const int stride, int height)
{
        uint8_t *ref = (uint8_t *) _ref;
        unsigned long off = (unsigned long) ref & 0x7;
        unsigned long off_plus_1 = off + 1;

        ref = vis_alignaddr(ref);

        vis_ld64(ref[0],    TMP0);

        vis_ld64_2(ref, 8,  TMP2);

        vis_ld64_2(ref, 16, TMP4);

        vis_ld64(constants_fe[0], MASK_fe);

        vis_ld64(constants_7f[0], MASK_7f);
        vis_faligndata(TMP0, TMP2, REF_0);

        vis_ld64(constants128[0], CONST_128);
        vis_faligndata(TMP2, TMP4, REF_4);

        if (off != 0x7) {
                vis_alignaddr_g0((void *)off_plus_1);
                vis_faligndata(TMP0, TMP2, REF_2);
                vis_faligndata(TMP2, TMP4, REF_6);
        } else {
                vis_src1(TMP2, REF_2);
                vis_src1(TMP4, REF_6);
        }

        ref += stride;
        height = (height >> 1) - 1;

        do {    /* 34 cycles */
                vis_ld64(ref[0],    TMP0);
                vis_xor(REF_0, REF_2, TMP6);

                vis_ld64_2(ref, 8,  TMP2);
                vis_xor(REF_4, REF_6, TMP8);

                vis_ld64_2(ref, 16, TMP4);
                vis_and(TMP6, MASK_fe, TMP6);
                ref += stride;

                vis_ld64(ref[0],    TMP14);
                vis_mul8x16(CONST_128, TMP6, TMP6);
                vis_and(TMP8, MASK_fe, TMP8);

                vis_ld64_2(ref, 8,  TMP16);
                vis_mul8x16(CONST_128, TMP8, TMP8);
                vis_and(REF_0, REF_2, TMP10);

                vis_ld64_2(ref, 16, TMP18);
                ref += stride;
                vis_and(REF_4, REF_6, TMP12);

                vis_alignaddr_g0((void *)off);

                vis_faligndata(TMP0, TMP2, REF_0);

                vis_faligndata(TMP2, TMP4, REF_4);

                if (off != 0x7) {
                        vis_alignaddr_g0((void *)off_plus_1);
                        vis_faligndata(TMP0, TMP2, REF_2);
                        vis_faligndata(TMP2, TMP4, REF_6);
                } else {
                        vis_src1(TMP2, REF_2);
                        vis_src1(TMP4, REF_6);
                }

                vis_and(TMP6, MASK_7f, TMP6);

                vis_and(TMP8, MASK_7f, TMP8);

                vis_padd16(TMP10, TMP6, TMP6);
                vis_st64(TMP6, dest[0]);

                vis_padd16(TMP12, TMP8, TMP8);
                vis_st64_2(TMP8, dest, 8);
                dest += stride;

                vis_xor(REF_0, REF_2, TMP6);

                vis_xor(REF_4, REF_6, TMP8);

                vis_and(TMP6, MASK_fe, TMP6);

                vis_mul8x16(CONST_128, TMP6, TMP6);
                vis_and(TMP8, MASK_fe, TMP8);

                vis_mul8x16(CONST_128, TMP8, TMP8);
                vis_and(REF_0, REF_2, TMP10);

                vis_and(REF_4, REF_6, TMP12);

                vis_alignaddr_g0((void *)off);

                vis_faligndata(TMP14, TMP16, REF_0);

                vis_faligndata(TMP16, TMP18, REF_4);

                if (off != 0x7) {
                        vis_alignaddr_g0((void *)off_plus_1);
                        vis_faligndata(TMP14, TMP16, REF_2);
                        vis_faligndata(TMP16, TMP18, REF_6);
                } else {
                        vis_src1(TMP16, REF_2);
                        vis_src1(TMP18, REF_6);
                }

                vis_and(TMP6, MASK_7f, TMP6);

                vis_and(TMP8, MASK_7f, TMP8);

                vis_padd16(TMP10, TMP6, TMP6);
                vis_st64(TMP6, dest[0]);

                vis_padd16(TMP12, TMP8, TMP8);
                vis_st64_2(TMP8, dest, 8);
                dest += stride;
        } while (--height);

        vis_ld64(ref[0],    TMP0);
        vis_xor(REF_0, REF_2, TMP6);

        vis_ld64_2(ref, 8,  TMP2);
        vis_xor(REF_4, REF_6, TMP8);

        vis_ld64_2(ref, 16, TMP4);
        vis_and(TMP6, MASK_fe, TMP6);

        vis_mul8x16(CONST_128, TMP6, TMP6);
        vis_and(TMP8, MASK_fe, TMP8);

        vis_mul8x16(CONST_128, TMP8, TMP8);
        vis_and(REF_0, REF_2, TMP10);

        vis_and(REF_4, REF_6, TMP12);

        vis_alignaddr_g0((void *)off);

        vis_faligndata(TMP0, TMP2, REF_0);

        vis_faligndata(TMP2, TMP4, REF_4);

        if (off != 0x7) {
                vis_alignaddr_g0((void *)off_plus_1);
                vis_faligndata(TMP0, TMP2, REF_2);
                vis_faligndata(TMP2, TMP4, REF_6);
        } else {
                vis_src1(TMP2, REF_2);
                vis_src1(TMP4, REF_6);
        }

        vis_and(TMP6, MASK_7f, TMP6);

        vis_and(TMP8, MASK_7f, TMP8);

        vis_padd16(TMP10, TMP6, TMP6);
        vis_st64(TMP6, dest[0]);

        vis_padd16(TMP12, TMP8, TMP8);
        vis_st64_2(TMP8, dest, 8);
        dest += stride;

        vis_xor(REF_0, REF_2, TMP6);

        vis_xor(REF_4, REF_6, TMP8);

        vis_and(TMP6, MASK_fe, TMP6);

        vis_mul8x16(CONST_128, TMP6, TMP6);
        vis_and(TMP8, MASK_fe, TMP8);

        vis_mul8x16(CONST_128, TMP8, TMP8);
        vis_and(REF_0, REF_2, TMP10);

        vis_and(REF_4, REF_6, TMP12);

        vis_and(TMP6, MASK_7f, TMP6);

        vis_and(TMP8, MASK_7f, TMP8);

        vis_padd16(TMP10, TMP6, TMP6);
        vis_st64(TMP6, dest[0]);

        vis_padd16(TMP12, TMP8, TMP8);
        vis_st64_2(TMP8, dest, 8);
}

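/* 8-pixel-wide variant of the function above.  Like the other VIS loops
 * in this file, it is software-pipelined: one row is prepared before the
 * loop, the do/while body handles two rows per iteration (hence
 * height = (height >> 1) - 1), and the final row pair is finished after
 * the loop, so loads for the next rows overlap the arithmetic of the
 * current ones.  Shape of the schedule (illustrative sketch only):
 *
 *      prepare row 0;
 *      do {
 *              load rows n+1, n+2; compute and store rows n, n+1;
 *      } while (--height);
 *      compute and store the last two rows;
 */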
static void MC_put_no_round_x_8_vis (uint8_t * dest, const uint8_t * _ref,
                            const int stride, int height)
{
        uint8_t *ref = (uint8_t *) _ref;
        unsigned long off = (unsigned long) ref & 0x7;
        unsigned long off_plus_1 = off + 1;

        ref = vis_alignaddr(ref);

        vis_ld64(ref[0], TMP0);

        vis_ld64(ref[8], TMP2);

        vis_ld64(constants_fe[0], MASK_fe);

        vis_ld64(constants_7f[0], MASK_7f);

        vis_ld64(constants128[0], CONST_128);
        vis_faligndata(TMP0, TMP2, REF_0);

        if (off != 0x7) {
                vis_alignaddr_g0((void *)off_plus_1);
                vis_faligndata(TMP0, TMP2, REF_2);
        } else {
                vis_src1(TMP2, REF_2);
        }

        ref += stride;
        height = (height >> 1) - 1;

        do {    /* 20 cycles */
                vis_ld64(ref[0], TMP0);
                vis_xor(REF_0, REF_2, TMP4);

                vis_ld64_2(ref, 8, TMP2);
                vis_and(TMP4, MASK_fe, TMP4);
                ref += stride;

                vis_ld64(ref[0], TMP8);
                vis_and(REF_0, REF_2, TMP6);
                vis_mul8x16(CONST_128, TMP4, TMP4);

                vis_alignaddr_g0((void *)off);