Revision 44f54ceb

View differences:

libavcodec/Makefile
@@ -161,6 +161,11 @@
 OBJS+= sh4/idct_sh4.o sh4/dsputil_sh4.o sh4/dsputil_align.o
 endif
 
+ifeq ($(TARGET_ARCH_SPARC),yes)
+OBJS+=sparc/dsputil_vis.o
+CFLAGS+= -mcpu=ultrasparc -mtune=ultrasparc
+endif
+
 
 SRCS := $(OBJS:.o=.c) $(ASM_OBJS:.o=.S)
 OBJS := $(OBJS) $(ASM_OBJS)
libavcodec/dsputil.c
@@ -3286,6 +3286,9 @@
 #ifdef HAVE_MLIB
     dsputil_init_mlib(c, avctx);
 #endif
+#ifdef ARCH_SPARC
+   dsputil_init_vis(c,avctx);
+#endif
 #ifdef ARCH_ALPHA
     dsputil_init_alpha(c, avctx);
 #endif
libavcodec/dsputil.h
@@ -422,6 +422,12 @@
 
 void dsputil_init_mlib(DSPContext* c, AVCodecContext *avctx);
 
+#elif defined(ARCH_SPARC)
+
+/* SPARC/VIS IDCT needs 8-byte aligned DCT blocks */
+#define __align8 __attribute__ ((aligned (8)))
+void dsputil_init_vis(DSPContext* c, AVCodecContext *avctx);
+
 #elif defined(ARCH_ALPHA)
 
 #define __align8 __attribute__ ((aligned (8)))
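
As a hypothetical usage sketch (not part of this change), a DCT coefficient block handed to the VIS IDCT would be declared through the new macro so that it meets the 8-byte alignment requirement noted above:

	int16_t __align8 block[64];	/* one 8x8 block of DCT coefficients, 8-byte aligned */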
libavcodec/sparc/dsputil_vis.c (new file)
/*
 * motion_comp_vis.c
 * Copyright (C) 2003 David S. Miller <davem@redhat.com>
 *
 * This file is part of mpeg2dec, a free MPEG-2 video stream decoder.
 * See http://libmpeg2.sourceforge.net/ for updates.
 *
 * mpeg2dec is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * mpeg2dec is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 */

/* The *no_round* functions have been added by James A. Morrison, 2003.
   The vis code from libmpeg2 was adapted for ffmpeg by James A. Morrison.
   Note: This code is GPL'd and may only be distributed in a GPL'd package.
 */

#include "config.h"

#ifdef ARCH_SPARC

#include <inttypes.h>

#include "../dsputil.h"

#include "vis.h"

/* The trick used in some of this file is the formula from the MMX
 * motion comp code, which is:
 *
 * (x+y+1)>>1 == (x|y)-((x^y)>>1)
 *
 * This allows us to average 8 bytes at a time in a 64-bit FPU reg.
 * We avoid overflows by masking before we do the shift, and we
 * implement the shift by multiplying by 1/2 using mul8x16.  So in
 * VIS this is (assume 'x' is in f0, 'y' is in f2, a repeating mask
 * of '0xfe' is in f4, a repeating mask of '0x7f' is in f6, and
 * the value 0x80808080 is in f8):
 *
 *	fxor		f0, f2, f10
 *	fand		f10, f4, f10
 *	fmul8x16	f8, f10, f10
 *	fand		f10, f6, f10
 *	for		f0, f2, f12
 *	fpsub16		f12, f10, f10
 */
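
The identity above, written as a minimal scalar C sketch (an illustration for this page, not part of the committed file; the helper name is made up), with each step annotated with the VIS instructions it maps to in the sequence quoted in the comment:

	static inline uint64_t avg8_round_up(uint64_t x, uint64_t y)
	{
		uint64_t t = (x ^ y) & 0xfefefefefefefefeULL;	/* fxor, fand with the 0xfe mask          */
		t = (t >> 1) & 0x7f7f7f7f7f7f7f7fULL;		/* the ">>1" (fmul8x16 by 1/2), fand 0x7f  */
		return (x | y) - t;				/* for, fpsub16                            */
	}

Clearing each byte's low bit with the 0xfe mask before the shift keeps bits from leaking across byte lanes, so all eight bytes packed in one 64-bit word are averaged, with rounding up, in a handful of cheap operations.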

#define ATTR_ALIGN(alignd) __attribute__ ((aligned(alignd)))

#define DUP4(x) {x, x, x, x}
#define DUP8(x) {x, x, x, x, x, x, x, x}
static const int16_t constants1[] ATTR_ALIGN(8) = DUP4 (1);
static const int16_t constants2[] ATTR_ALIGN(8) = DUP4 (2);
static const int16_t constants3[] ATTR_ALIGN(8) = DUP4 (3);
static const int16_t constants6[] ATTR_ALIGN(8) = DUP4 (6);
static const int8_t constants_fe[] ATTR_ALIGN(8) = DUP8 (0xfe);
static const int8_t constants_7f[] ATTR_ALIGN(8) = DUP8 (0x7f);
static const int8_t constants128[] ATTR_ALIGN(8) = DUP8 (128);
static const int16_t constants256_512[] ATTR_ALIGN(8) =
	{256, 512, 256, 512};
static const int16_t constants256_1024[] ATTR_ALIGN(8) =
	{256, 1024, 256, 1024};

#define REF_0		0
#define REF_0_1		1
#define REF_2		2
#define REF_2_1		3
#define REF_4		4
#define REF_4_1		5
#define REF_6		6
#define REF_6_1		7
#define REF_S0		8
#define REF_S0_1	9
#define REF_S2		10
#define REF_S2_1	11
#define REF_S4		12
#define REF_S4_1	13
#define REF_S6		14
#define REF_S6_1	15
#define DST_0		16
#define DST_1		17
#define DST_2		18
#define DST_3		19
#define CONST_1		20
#define CONST_2		20
#define CONST_3		20
#define CONST_6		20
#define MASK_fe		20
#define CONST_128	22
#define CONST_256	22
#define CONST_512	22
#define CONST_1024	22
#define TMP0		24
#define TMP1		25
#define TMP2		26
#define TMP3		27
#define TMP4		28
#define TMP5		29
#define ZERO		30
#define MASK_7f		30

#define TMP6		32
#define TMP8		34
#define TMP10		36
#define TMP12		38
#define TMP14		40
#define TMP16		42
#define TMP18		44
#define TMP20		46
#define TMP22		48
#define TMP24		50
#define TMP26		52
#define TMP28		54
#define TMP30		56
#define TMP32		58

static void MC_put_o_16_vis (uint8_t * dest, const uint8_t * _ref,
			     const int stride, int height)
{
	uint8_t *ref = (uint8_t *) _ref;

	ref = vis_alignaddr(ref);
	do {	/* 5 cycles */
		vis_ld64(ref[0], TMP0);

		vis_ld64_2(ref, 8, TMP2);

		vis_ld64_2(ref, 16, TMP4);
		ref += stride;

		vis_faligndata(TMP0, TMP2, REF_0);
		vis_st64(REF_0, dest[0]);

		vis_faligndata(TMP2, TMP4, REF_2);
		vis_st64_2(REF_2, dest, 8);
		dest += stride;
	} while (--height);
}

static void MC_put_o_8_vis (uint8_t * dest, const uint8_t * _ref,
			    const int stride, int height)
{
	uint8_t *ref = (uint8_t *) _ref;

	ref = vis_alignaddr(ref);
	do {	/* 4 cycles */
		vis_ld64(ref[0], TMP0);

		vis_ld64(ref[8], TMP2);
		ref += stride;

		/* stall */

		vis_faligndata(TMP0, TMP2, REF_0);
		vis_st64(REF_0, dest[0]);
		dest += stride;
	} while (--height);
}
169

  
170

  
171
static void MC_avg_o_16_vis (uint8_t * dest, const uint8_t * _ref,
172
			     const int stride, int height)
173
{
174
	uint8_t *ref = (uint8_t *) _ref;
175
	int stride_8 = stride + 8;
176

  
177
	ref = vis_alignaddr(ref);
178

  
179
	vis_ld64(ref[0], TMP0);
180

  
181
	vis_ld64(ref[8], TMP2);
182

  
183
	vis_ld64(ref[16], TMP4);
184

  
185
	vis_ld64(dest[0], DST_0);
186

  
187
	vis_ld64(dest[8], DST_2);
188

  
189
	vis_ld64(constants_fe[0], MASK_fe);
190
	vis_faligndata(TMP0, TMP2, REF_0);
191

  
192
	vis_ld64(constants_7f[0], MASK_7f);
193
	vis_faligndata(TMP2, TMP4, REF_2);
194

  
195
	vis_ld64(constants128[0], CONST_128);
196

  
197
	ref += stride;
198
	height = (height >> 1) - 1;
199

  
200
	do {	/* 24 cycles */
201
		vis_ld64(ref[0], TMP0);
202
		vis_xor(DST_0, REF_0, TMP6);
203

  
204
		vis_ld64_2(ref, 8, TMP2);
205
		vis_and(TMP6, MASK_fe, TMP6);
206

  
207
		vis_ld64_2(ref, 16, TMP4);
208
		ref += stride;
209
		vis_mul8x16(CONST_128, TMP6, TMP6);
210
		vis_xor(DST_2, REF_2, TMP8);
211

  
212
		vis_and(TMP8, MASK_fe, TMP8);
213

  
214
		vis_or(DST_0, REF_0, TMP10);
215
		vis_ld64_2(dest, stride, DST_0);
216
		vis_mul8x16(CONST_128, TMP8, TMP8);
217

  
218
		vis_or(DST_2, REF_2, TMP12);
219
		vis_ld64_2(dest, stride_8, DST_2);
220

  
221
		vis_ld64(ref[0], TMP14);
222
		vis_and(TMP6, MASK_7f, TMP6);
223

  
224
		vis_and(TMP8, MASK_7f, TMP8);
225

  
226
		vis_psub16(TMP10, TMP6, TMP6);
227
		vis_st64(TMP6, dest[0]);
228

  
229
		vis_psub16(TMP12, TMP8, TMP8);
230
		vis_st64_2(TMP8, dest, 8);
231

  
232
		dest += stride;
233
		vis_ld64_2(ref, 8, TMP16);
234
		vis_faligndata(TMP0, TMP2, REF_0);
235

  
236
		vis_ld64_2(ref, 16, TMP18);
237
		vis_faligndata(TMP2, TMP4, REF_2);
238
		ref += stride;
239

  
240
		vis_xor(DST_0, REF_0, TMP20);
241

  
242
		vis_and(TMP20, MASK_fe, TMP20);
243

  
244
		vis_xor(DST_2, REF_2, TMP22);
245
		vis_mul8x16(CONST_128, TMP20, TMP20);
246

  
247
		vis_and(TMP22, MASK_fe, TMP22);
248

  
249
		vis_or(DST_0, REF_0, TMP24);
250
		vis_mul8x16(CONST_128, TMP22, TMP22);
251

  
252
		vis_or(DST_2, REF_2, TMP26);
253

  
254
		vis_ld64_2(dest, stride, DST_0);
255
		vis_faligndata(TMP14, TMP16, REF_0);
256

  
257
		vis_ld64_2(dest, stride_8, DST_2);
258
		vis_faligndata(TMP16, TMP18, REF_2);
259

  
260
		vis_and(TMP20, MASK_7f, TMP20);
261

  
262
		vis_and(TMP22, MASK_7f, TMP22);
263

  
264
		vis_psub16(TMP24, TMP20, TMP20);
265
		vis_st64(TMP20, dest[0]);
266

  
267
		vis_psub16(TMP26, TMP22, TMP22);
268
		vis_st64_2(TMP22, dest, 8);
269
		dest += stride;
270
	} while (--height);
271

  
272
	vis_ld64(ref[0], TMP0);
273
	vis_xor(DST_0, REF_0, TMP6);
274

  
275
	vis_ld64_2(ref, 8, TMP2);
276
	vis_and(TMP6, MASK_fe, TMP6);
277

  
278
	vis_ld64_2(ref, 16, TMP4);
279
	vis_mul8x16(CONST_128, TMP6, TMP6);
280
	vis_xor(DST_2, REF_2, TMP8);
281

  
282
	vis_and(TMP8, MASK_fe, TMP8);
283

  
284
	vis_or(DST_0, REF_0, TMP10);
285
	vis_ld64_2(dest, stride, DST_0);
286
	vis_mul8x16(CONST_128, TMP8, TMP8);
287

  
288
	vis_or(DST_2, REF_2, TMP12);
289
	vis_ld64_2(dest, stride_8, DST_2);
290

  
291
	vis_ld64(ref[0], TMP14);
292
	vis_and(TMP6, MASK_7f, TMP6);
293

  
294
	vis_and(TMP8, MASK_7f, TMP8);
295

  
296
	vis_psub16(TMP10, TMP6, TMP6);
297
	vis_st64(TMP6, dest[0]);
298

  
299
	vis_psub16(TMP12, TMP8, TMP8);
300
	vis_st64_2(TMP8, dest, 8);
301

  
302
	dest += stride;
303
	vis_faligndata(TMP0, TMP2, REF_0);
304

  
305
	vis_faligndata(TMP2, TMP4, REF_2);
306

  
307
	vis_xor(DST_0, REF_0, TMP20);
308

  
309
	vis_and(TMP20, MASK_fe, TMP20);
310

  
311
	vis_xor(DST_2, REF_2, TMP22);
312
	vis_mul8x16(CONST_128, TMP20, TMP20);
313

  
314
	vis_and(TMP22, MASK_fe, TMP22);
315

  
316
	vis_or(DST_0, REF_0, TMP24);
317
	vis_mul8x16(CONST_128, TMP22, TMP22);
318

  
319
	vis_or(DST_2, REF_2, TMP26);
320

  
321
	vis_and(TMP20, MASK_7f, TMP20);
322

  
323
	vis_and(TMP22, MASK_7f, TMP22);
324

  
325
	vis_psub16(TMP24, TMP20, TMP20);
326
	vis_st64(TMP20, dest[0]);
327

  
328
	vis_psub16(TMP26, TMP22, TMP22);
329
	vis_st64_2(TMP22, dest, 8);
330
}
331

  
332
static void MC_avg_o_8_vis (uint8_t * dest, const uint8_t * _ref,
333
			    const int stride, int height)
334
{
335
	uint8_t *ref = (uint8_t *) _ref;
336

  
337
	ref = vis_alignaddr(ref);
338

  
339
	vis_ld64(ref[0], TMP0);
340

  
341
	vis_ld64(ref[8], TMP2);
342

  
343
	vis_ld64(dest[0], DST_0);
344

  
345
	vis_ld64(constants_fe[0], MASK_fe);
346

  
347
	vis_ld64(constants_7f[0], MASK_7f);
348
	vis_faligndata(TMP0, TMP2, REF_0);
349

  
350
	vis_ld64(constants128[0], CONST_128);
351

  
352
	ref += stride;
353
	height = (height >> 1) - 1;
354

  
355
	do {	/* 12 cycles */
356
		vis_ld64(ref[0], TMP0);
357
		vis_xor(DST_0, REF_0, TMP4);
358

  
359
		vis_ld64(ref[8], TMP2);
360
		vis_and(TMP4, MASK_fe, TMP4);
361

  
362
		vis_or(DST_0, REF_0, TMP6);
363
		vis_ld64_2(dest, stride, DST_0);
364
		ref += stride;
365
		vis_mul8x16(CONST_128, TMP4, TMP4);
366

  
367
		vis_ld64(ref[0], TMP12);
368
		vis_faligndata(TMP0, TMP2, REF_0);
369

  
370
		vis_ld64(ref[8], TMP2);
371
		vis_xor(DST_0, REF_0, TMP0);
372
		ref += stride;
373

  
374
		vis_and(TMP0, MASK_fe, TMP0);
375

  
376
		vis_and(TMP4, MASK_7f, TMP4);
377

  
378
		vis_psub16(TMP6, TMP4, TMP4);
379
		vis_st64(TMP4, dest[0]);
380
		dest += stride;
381
		vis_mul8x16(CONST_128, TMP0, TMP0);
382

  
383
		vis_or(DST_0, REF_0, TMP6);
384
		vis_ld64_2(dest, stride, DST_0);
385

  
386
		vis_faligndata(TMP12, TMP2, REF_0);
387

  
388
		vis_and(TMP0, MASK_7f, TMP0);
389

  
390
		vis_psub16(TMP6, TMP0, TMP4);
391
		vis_st64(TMP4, dest[0]);
392
		dest += stride;
393
	} while (--height);
394

  
395
	vis_ld64(ref[0], TMP0);
396
	vis_xor(DST_0, REF_0, TMP4);
397

  
398
	vis_ld64(ref[8], TMP2);
399
	vis_and(TMP4, MASK_fe, TMP4);
400

  
401
	vis_or(DST_0, REF_0, TMP6);
402
	vis_ld64_2(dest, stride, DST_0);
403
	vis_mul8x16(CONST_128, TMP4, TMP4);
404

  
405
	vis_faligndata(TMP0, TMP2, REF_0);
406

  
407
	vis_xor(DST_0, REF_0, TMP0);
408

  
409
	vis_and(TMP0, MASK_fe, TMP0);
410

  
411
	vis_and(TMP4, MASK_7f, TMP4);
412

  
413
	vis_psub16(TMP6, TMP4, TMP4);
414
	vis_st64(TMP4, dest[0]);
415
	dest += stride;
416
	vis_mul8x16(CONST_128, TMP0, TMP0);
417

  
418
	vis_or(DST_0, REF_0, TMP6);
419

  
420
	vis_and(TMP0, MASK_7f, TMP0);
421

  
422
	vis_psub16(TMP6, TMP0, TMP4);
423
	vis_st64(TMP4, dest[0]);
424
}
425

  
426
static void MC_put_x_16_vis (uint8_t * dest, const uint8_t * _ref,
427
			     const int stride, int height)
428
{
429
	uint8_t *ref = (uint8_t *) _ref;
430
	unsigned long off = (unsigned long) ref & 0x7;
431
	unsigned long off_plus_1 = off + 1;
432

  
433
	ref = vis_alignaddr(ref);
434

  
435
	vis_ld64(ref[0],    TMP0);
436

  
437
	vis_ld64_2(ref, 8,  TMP2);
438

  
439
	vis_ld64_2(ref, 16, TMP4);
440

  
441
	vis_ld64(constants_fe[0], MASK_fe);
442

  
443
	vis_ld64(constants_7f[0], MASK_7f);
444
	vis_faligndata(TMP0, TMP2, REF_0);
445

  
446
	vis_ld64(constants128[0], CONST_128);
447
	vis_faligndata(TMP2, TMP4, REF_4);
448

  
449
	if (off != 0x7) {
450
		vis_alignaddr_g0((void *)off_plus_1);
451
		vis_faligndata(TMP0, TMP2, REF_2);
452
		vis_faligndata(TMP2, TMP4, REF_6);
453
	} else {
454
		vis_src1(TMP2, REF_2);
455
		vis_src1(TMP4, REF_6);
456
	}
457

  
458
	ref += stride;
459
	height = (height >> 1) - 1;
460

  
461
	do {	/* 34 cycles */
462
		vis_ld64(ref[0],    TMP0);
463
		vis_xor(REF_0, REF_2, TMP6);
464

  
465
		vis_ld64_2(ref, 8,  TMP2);
466
		vis_xor(REF_4, REF_6, TMP8);
467

  
468
		vis_ld64_2(ref, 16, TMP4);
469
		vis_and(TMP6, MASK_fe, TMP6);
470
		ref += stride;
471

  
472
		vis_ld64(ref[0],    TMP14);
473
		vis_mul8x16(CONST_128, TMP6, TMP6);
474
		vis_and(TMP8, MASK_fe, TMP8);
475

  
476
		vis_ld64_2(ref, 8,  TMP16);
477
		vis_mul8x16(CONST_128, TMP8, TMP8);
478
		vis_or(REF_0, REF_2, TMP10);
479

  
480
		vis_ld64_2(ref, 16, TMP18);
481
		ref += stride;
482
		vis_or(REF_4, REF_6, TMP12);
483

  
484
		vis_alignaddr_g0((void *)off);
485

  
486
		vis_faligndata(TMP0, TMP2, REF_0);
487

  
488
		vis_faligndata(TMP2, TMP4, REF_4);
489

  
490
		if (off != 0x7) {
491
			vis_alignaddr_g0((void *)off_plus_1);
492
			vis_faligndata(TMP0, TMP2, REF_2);
493
			vis_faligndata(TMP2, TMP4, REF_6);
494
		} else {
495
			vis_src1(TMP2, REF_2);
496
			vis_src1(TMP4, REF_6);
497
		}
498

  
499
		vis_and(TMP6, MASK_7f, TMP6);
500

  
501
		vis_and(TMP8, MASK_7f, TMP8);
502

  
503
		vis_psub16(TMP10, TMP6, TMP6);
504
		vis_st64(TMP6, dest[0]);
505

  
506
		vis_psub16(TMP12, TMP8, TMP8);
507
		vis_st64_2(TMP8, dest, 8);
508
		dest += stride;
509

  
510
		vis_xor(REF_0, REF_2, TMP6);
511

  
512
		vis_xor(REF_4, REF_6, TMP8);
513

  
514
		vis_and(TMP6, MASK_fe, TMP6);
515

  
516
		vis_mul8x16(CONST_128, TMP6, TMP6);
517
		vis_and(TMP8, MASK_fe, TMP8);
518

  
519
		vis_mul8x16(CONST_128, TMP8, TMP8);
520
		vis_or(REF_0, REF_2, TMP10);
521

  
522
		vis_or(REF_4, REF_6, TMP12);
523

  
524
		vis_alignaddr_g0((void *)off);
525

  
526
		vis_faligndata(TMP14, TMP16, REF_0);
527

  
528
		vis_faligndata(TMP16, TMP18, REF_4);
529

  
530
		if (off != 0x7) {
531
			vis_alignaddr_g0((void *)off_plus_1);
532
			vis_faligndata(TMP14, TMP16, REF_2);
533
			vis_faligndata(TMP16, TMP18, REF_6);
534
		} else {
535
			vis_src1(TMP16, REF_2);
536
			vis_src1(TMP18, REF_6);
537
		}
538

  
539
		vis_and(TMP6, MASK_7f, TMP6);
540

  
541
		vis_and(TMP8, MASK_7f, TMP8);
542

  
543
		vis_psub16(TMP10, TMP6, TMP6);
544
		vis_st64(TMP6, dest[0]);
545

  
546
		vis_psub16(TMP12, TMP8, TMP8);
547
		vis_st64_2(TMP8, dest, 8);
548
		dest += stride;
549
	} while (--height);
550

  
551
	vis_ld64(ref[0],    TMP0);
552
	vis_xor(REF_0, REF_2, TMP6);
553

  
554
	vis_ld64_2(ref, 8,  TMP2);
555
	vis_xor(REF_4, REF_6, TMP8);
556

  
557
	vis_ld64_2(ref, 16, TMP4);
558
	vis_and(TMP6, MASK_fe, TMP6);
559

  
560
	vis_mul8x16(CONST_128, TMP6, TMP6);
561
	vis_and(TMP8, MASK_fe, TMP8);
562

  
563
	vis_mul8x16(CONST_128, TMP8, TMP8);
564
	vis_or(REF_0, REF_2, TMP10);
565

  
566
	vis_or(REF_4, REF_6, TMP12);
567

  
568
	vis_alignaddr_g0((void *)off);
569

  
570
	vis_faligndata(TMP0, TMP2, REF_0);
571

  
572
	vis_faligndata(TMP2, TMP4, REF_4);
573

  
574
	if (off != 0x7) {
575
		vis_alignaddr_g0((void *)off_plus_1);
576
		vis_faligndata(TMP0, TMP2, REF_2);
577
		vis_faligndata(TMP2, TMP4, REF_6);
578
	} else {
579
		vis_src1(TMP2, REF_2);
580
		vis_src1(TMP4, REF_6);
581
	}
582

  
583
	vis_and(TMP6, MASK_7f, TMP6);
584

  
585
	vis_and(TMP8, MASK_7f, TMP8);
586

  
587
	vis_psub16(TMP10, TMP6, TMP6);
588
	vis_st64(TMP6, dest[0]);
589

  
590
	vis_psub16(TMP12, TMP8, TMP8);
591
	vis_st64_2(TMP8, dest, 8);
592
	dest += stride;
593

  
594
	vis_xor(REF_0, REF_2, TMP6);
595

  
596
	vis_xor(REF_4, REF_6, TMP8);
597

  
598
	vis_and(TMP6, MASK_fe, TMP6);
599

  
600
	vis_mul8x16(CONST_128, TMP6, TMP6);
601
	vis_and(TMP8, MASK_fe, TMP8);
602

  
603
	vis_mul8x16(CONST_128, TMP8, TMP8);
604
	vis_or(REF_0, REF_2, TMP10);
605

  
606
	vis_or(REF_4, REF_6, TMP12);
607

  
608
	vis_and(TMP6, MASK_7f, TMP6);
609

  
610
	vis_and(TMP8, MASK_7f, TMP8);
611

  
612
	vis_psub16(TMP10, TMP6, TMP6);
613
	vis_st64(TMP6, dest[0]);
614

  
615
	vis_psub16(TMP12, TMP8, TMP8);
616
	vis_st64_2(TMP8, dest, 8);
617
}
618

  
619
static void MC_put_x_8_vis (uint8_t * dest, const uint8_t * _ref,
620
			    const int stride, int height)
621
{
622
	uint8_t *ref = (uint8_t *) _ref;
623
	unsigned long off = (unsigned long) ref & 0x7;
624
	unsigned long off_plus_1 = off + 1;
625

  
626
	ref = vis_alignaddr(ref);
627

  
628
	vis_ld64(ref[0], TMP0);
629

  
630
	vis_ld64(ref[8], TMP2);
631

  
632
	vis_ld64(constants_fe[0], MASK_fe);
633

  
634
	vis_ld64(constants_7f[0], MASK_7f);
635

  
636
	vis_ld64(constants128[0], CONST_128);
637
	vis_faligndata(TMP0, TMP2, REF_0);
638

  
639
	if (off != 0x7) {
640
		vis_alignaddr_g0((void *)off_plus_1);
641
		vis_faligndata(TMP0, TMP2, REF_2);
642
	} else {
643
		vis_src1(TMP2, REF_2);
644
	}
645

  
646
	ref += stride;
647
	height = (height >> 1) - 1;
648

  
649
	do {	/* 20 cycles */
650
		vis_ld64(ref[0], TMP0);
651
		vis_xor(REF_0, REF_2, TMP4);
652

  
653
		vis_ld64_2(ref, 8, TMP2);
654
		vis_and(TMP4, MASK_fe, TMP4);
655
		ref += stride;
656

  
657
		vis_ld64(ref[0], TMP8);
658
		vis_or(REF_0, REF_2, TMP6);
659
		vis_mul8x16(CONST_128, TMP4, TMP4);
660

  
661
		vis_alignaddr_g0((void *)off);
662

  
663
		vis_ld64_2(ref, 8, TMP10);
664
		ref += stride;
665
		vis_faligndata(TMP0, TMP2, REF_0);
666

  
667
		if (off != 0x7) {
668
			vis_alignaddr_g0((void *)off_plus_1);
669
			vis_faligndata(TMP0, TMP2, REF_2);
670
		} else {
671
			vis_src1(TMP2, REF_2);
672
		}
673

  
674
		vis_and(TMP4, MASK_7f, TMP4);
675

  
676
		vis_psub16(TMP6, TMP4, DST_0);
677
		vis_st64(DST_0, dest[0]);
678
		dest += stride;
679

  
680
		vis_xor(REF_0, REF_2, TMP12);
681

  
682
		vis_and(TMP12, MASK_fe, TMP12);
683

  
684
		vis_or(REF_0, REF_2, TMP14);
685
		vis_mul8x16(CONST_128, TMP12, TMP12);
686

  
687
		vis_alignaddr_g0((void *)off);
688
		vis_faligndata(TMP8, TMP10, REF_0);
689
		if (off != 0x7) {
690
			vis_alignaddr_g0((void *)off_plus_1);
691
			vis_faligndata(TMP8, TMP10, REF_2);
692
		} else {
693
			vis_src1(TMP10, REF_2);
694
		}
695

  
696
		vis_and(TMP12, MASK_7f, TMP12);
697

  
698
		vis_psub16(TMP14, TMP12, DST_0);
699
		vis_st64(DST_0, dest[0]);
700
		dest += stride;
701
	} while (--height);
702

  
703
	vis_ld64(ref[0], TMP0);
704
	vis_xor(REF_0, REF_2, TMP4);
705

  
706
	vis_ld64_2(ref, 8, TMP2);
707
	vis_and(TMP4, MASK_fe, TMP4);
708

  
709
	vis_or(REF_0, REF_2, TMP6);
710
	vis_mul8x16(CONST_128, TMP4, TMP4);
711

  
712
	vis_alignaddr_g0((void *)off);
713

  
714
	vis_faligndata(TMP0, TMP2, REF_0);
715

  
716
	if (off != 0x7) {
717
		vis_alignaddr_g0((void *)off_plus_1);
718
		vis_faligndata(TMP0, TMP2, REF_2);
719
	} else {
720
		vis_src1(TMP2, REF_2);
721
	}
722

  
723
	vis_and(TMP4, MASK_7f, TMP4);
724

  
725
	vis_psub16(TMP6, TMP4, DST_0);
726
	vis_st64(DST_0, dest[0]);
727
	dest += stride;
728

  
729
	vis_xor(REF_0, REF_2, TMP12);
730

  
731
	vis_and(TMP12, MASK_fe, TMP12);
732

  
733
	vis_or(REF_0, REF_2, TMP14);
734
	vis_mul8x16(CONST_128, TMP12, TMP12);
735

  
736
	vis_and(TMP12, MASK_7f, TMP12);
737

  
738
	vis_psub16(TMP14, TMP12, DST_0);
739
	vis_st64(DST_0, dest[0]);
740
	dest += stride;
741
}
742

  
743
static void MC_avg_x_16_vis (uint8_t * dest, const uint8_t * _ref,
744
			     const int stride, int height)
745
{
746
	uint8_t *ref = (uint8_t *) _ref;
747
	unsigned long off = (unsigned long) ref & 0x7;
748
	unsigned long off_plus_1 = off + 1;
749

  
750
	vis_set_gsr(5 << VIS_GSR_SCALEFACT_SHIFT);
751

  
752
	vis_ld64(constants3[0], CONST_3);
753
	vis_fzero(ZERO);
754
	vis_ld64(constants256_512[0], CONST_256);
755

  
756
	ref = vis_alignaddr(ref);
757
	do {	/* 26 cycles */
758
		vis_ld64(ref[0], TMP0);
759

  
760
		vis_ld64(ref[8], TMP2);
761

  
762
		vis_alignaddr_g0((void *)off);
763

  
764
		vis_ld64(ref[16], TMP4);
765

  
766
		vis_ld64(dest[0], DST_0);
767
		vis_faligndata(TMP0, TMP2, REF_0);
768

  
769
		vis_ld64(dest[8], DST_2);
770
		vis_faligndata(TMP2, TMP4, REF_4);
771

  
772
		if (off != 0x7) {
773
			vis_alignaddr_g0((void *)off_plus_1);
774
			vis_faligndata(TMP0, TMP2, REF_2);
775
			vis_faligndata(TMP2, TMP4, REF_6);
776
		} else {
777
			vis_src1(TMP2, REF_2);
778
			vis_src1(TMP4, REF_6);
779
		}
780

  
781
		vis_mul8x16au(REF_0,   CONST_256, TMP0);
782

  
783
		vis_pmerge(ZERO,     REF_2,     TMP4);
784
		vis_mul8x16au(REF_0_1, CONST_256, TMP2);
785

  
786
		vis_pmerge(ZERO, REF_2_1, TMP6);
787

  
788
		vis_padd16(TMP0, TMP4, TMP0);
789

  
790
		vis_mul8x16al(DST_0,   CONST_512, TMP4);
791
		vis_padd16(TMP2, TMP6, TMP2);
792

  
793
		vis_mul8x16al(DST_1,   CONST_512, TMP6);
794

  
795
		vis_mul8x16au(REF_6,   CONST_256, TMP12);
796

  
797
		vis_padd16(TMP0, TMP4, TMP0);
798
		vis_mul8x16au(REF_6_1, CONST_256, TMP14);
799

  
800
		vis_padd16(TMP2, TMP6, TMP2);
801
		vis_mul8x16au(REF_4,   CONST_256, TMP16);
802

  
803
		vis_padd16(TMP0, CONST_3, TMP8);
804
		vis_mul8x16au(REF_4_1, CONST_256, TMP18);
805

  
806
		vis_padd16(TMP2, CONST_3, TMP10);
807
		vis_pack16(TMP8, DST_0);
808

  
809
		vis_pack16(TMP10, DST_1);
810
		vis_padd16(TMP16, TMP12, TMP0);
811

  
812
		vis_st64(DST_0, dest[0]);
813
		vis_mul8x16al(DST_2,   CONST_512, TMP4);
814
		vis_padd16(TMP18, TMP14, TMP2);
815

  
816
		vis_mul8x16al(DST_3,   CONST_512, TMP6);
817
		vis_padd16(TMP0, CONST_3, TMP0);
818

  
819
		vis_padd16(TMP2, CONST_3, TMP2);
820

  
821
		vis_padd16(TMP0, TMP4, TMP0);
822

  
823
		vis_padd16(TMP2, TMP6, TMP2);
824
		vis_pack16(TMP0, DST_2);
825

  
826
		vis_pack16(TMP2, DST_3);
827
		vis_st64(DST_2, dest[8]);
828

  
829
		ref += stride;
830
		dest += stride;
831
	} while (--height);
832
}
833

  
834
static void MC_avg_x_8_vis (uint8_t * dest, const uint8_t * _ref,
835
			    const int stride, int height)
836
{
837
	uint8_t *ref = (uint8_t *) _ref;
838
	unsigned long off = (unsigned long) ref & 0x7;
839
	unsigned long off_plus_1 = off + 1;
840
	int stride_times_2 = stride << 1;
841

  
842
	vis_set_gsr(5 << VIS_GSR_SCALEFACT_SHIFT);
843

  
844
	vis_ld64(constants3[0], CONST_3);
845
	vis_fzero(ZERO);
846
	vis_ld64(constants256_512[0], CONST_256);
847

  
848
	ref = vis_alignaddr(ref);
849
	height >>= 2;
850
	do {	/* 47 cycles */
851
		vis_ld64(ref[0],   TMP0);
852

  
853
		vis_ld64_2(ref, 8, TMP2);
854
		ref += stride;
855

  
856
		vis_alignaddr_g0((void *)off);
857

  
858
		vis_ld64(ref[0],   TMP4);
859
		vis_faligndata(TMP0, TMP2, REF_0);
860

  
861
		vis_ld64_2(ref, 8, TMP6);
862
		ref += stride;
863

  
864
		vis_ld64(ref[0],   TMP8);
865

  
866
		vis_ld64_2(ref, 8, TMP10);
867
		ref += stride;
868
		vis_faligndata(TMP4, TMP6, REF_4);
869

  
870
		vis_ld64(ref[0],   TMP12);
871

  
872
		vis_ld64_2(ref, 8, TMP14);
873
		ref += stride;
874
		vis_faligndata(TMP8, TMP10, REF_S0);
875

  
876
		vis_faligndata(TMP12, TMP14, REF_S4);
877

  
878
		if (off != 0x7) {
879
			vis_alignaddr_g0((void *)off_plus_1);
880

  
881
			vis_ld64(dest[0], DST_0);
882
			vis_faligndata(TMP0, TMP2, REF_2);
883

  
884
			vis_ld64_2(dest, stride, DST_2);
885
			vis_faligndata(TMP4, TMP6, REF_6);
886

  
887
			vis_faligndata(TMP8, TMP10, REF_S2);
888

  
889
			vis_faligndata(TMP12, TMP14, REF_S6);
890
		} else {
891
			vis_ld64(dest[0], DST_0);
892
			vis_src1(TMP2, REF_2);
893

  
894
			vis_ld64_2(dest, stride, DST_2);
895
			vis_src1(TMP6, REF_6);
896

  
897
			vis_src1(TMP10, REF_S2);
898

  
899
			vis_src1(TMP14, REF_S6);
900
		}
901

  
902
		vis_pmerge(ZERO,     REF_0,     TMP0);
903
		vis_mul8x16au(REF_0_1, CONST_256, TMP2);
904

  
905
		vis_pmerge(ZERO,     REF_2,     TMP4);
906
		vis_mul8x16au(REF_2_1, CONST_256, TMP6);
907

  
908
		vis_padd16(TMP0, CONST_3, TMP0);
909
		vis_mul8x16al(DST_0,   CONST_512, TMP16);
910

  
911
		vis_padd16(TMP2, CONST_3, TMP2);
912
		vis_mul8x16al(DST_1,   CONST_512, TMP18);
913

  
914
		vis_padd16(TMP0, TMP4, TMP0);
915
		vis_mul8x16au(REF_4, CONST_256, TMP8);
916

  
917
		vis_padd16(TMP2, TMP6, TMP2);
918
		vis_mul8x16au(REF_4_1, CONST_256, TMP10);
919

  
920
		vis_padd16(TMP0, TMP16, TMP0);
921
		vis_mul8x16au(REF_6, CONST_256, TMP12);
922

  
923
		vis_padd16(TMP2, TMP18, TMP2);
924
		vis_mul8x16au(REF_6_1, CONST_256, TMP14);
925

  
926
		vis_padd16(TMP8, CONST_3, TMP8);
927
		vis_mul8x16al(DST_2, CONST_512, TMP16);
928

  
929
		vis_padd16(TMP8, TMP12, TMP8);
930
		vis_mul8x16al(DST_3, CONST_512, TMP18);
931

  
932
		vis_padd16(TMP10, TMP14, TMP10);
933
		vis_pack16(TMP0, DST_0);
934

  
935
		vis_pack16(TMP2, DST_1);
936
		vis_st64(DST_0, dest[0]);
937
		dest += stride;
938
		vis_padd16(TMP10, CONST_3, TMP10);
939

  
940
		vis_ld64_2(dest, stride, DST_0);
941
		vis_padd16(TMP8, TMP16, TMP8);
942

  
943
		vis_ld64_2(dest, stride_times_2, TMP4/*DST_2*/);
944
		vis_padd16(TMP10, TMP18, TMP10);
945
		vis_pack16(TMP8, DST_2);
946

  
947
		vis_pack16(TMP10, DST_3);
948
		vis_st64(DST_2, dest[0]);
949
		dest += stride;
950

  
951
		vis_mul8x16au(REF_S0_1, CONST_256, TMP2);
952
		vis_pmerge(ZERO,     REF_S0,     TMP0);
953

  
954
		vis_pmerge(ZERO,     REF_S2,     TMP24);
955
		vis_mul8x16au(REF_S2_1, CONST_256, TMP6);
956

  
957
		vis_padd16(TMP0, CONST_3, TMP0);
958
		vis_mul8x16au(REF_S4, CONST_256, TMP8);
959

  
960
		vis_padd16(TMP2, CONST_3, TMP2);
961
		vis_mul8x16au(REF_S4_1, CONST_256, TMP10);
962

  
963
		vis_padd16(TMP0, TMP24, TMP0);
964
		vis_mul8x16au(REF_S6, CONST_256, TMP12);
965

  
966
		vis_padd16(TMP2, TMP6, TMP2);
967
		vis_mul8x16au(REF_S6_1, CONST_256, TMP14);
968

  
969
		vis_padd16(TMP8, CONST_3, TMP8);
970
		vis_mul8x16al(DST_0,   CONST_512, TMP16);
971

  
972
		vis_padd16(TMP10, CONST_3, TMP10);
973
		vis_mul8x16al(DST_1,   CONST_512, TMP18);
974

  
975
		vis_padd16(TMP8, TMP12, TMP8);
976
		vis_mul8x16al(TMP4/*DST_2*/, CONST_512, TMP20);
977

  
978
		vis_mul8x16al(TMP5/*DST_3*/, CONST_512, TMP22);
979
		vis_padd16(TMP0, TMP16, TMP0);
980

  
981
		vis_padd16(TMP2, TMP18, TMP2);
982
		vis_pack16(TMP0, DST_0);
983

  
984
		vis_padd16(TMP10, TMP14, TMP10);
985
		vis_pack16(TMP2, DST_1);
986
		vis_st64(DST_0, dest[0]);
987
		dest += stride;
988

  
989
		vis_padd16(TMP8, TMP20, TMP8);
990

  
991
		vis_padd16(TMP10, TMP22, TMP10);
992
		vis_pack16(TMP8, DST_2);
993

  
994
		vis_pack16(TMP10, DST_3);
995
		vis_st64(DST_2, dest[0]);
996
		dest += stride;
997
	} while (--height);
998
}
999

  
1000
static void MC_put_y_16_vis (uint8_t * dest, const uint8_t * _ref,
1001
			     const int stride, int height)
1002
{
1003
	uint8_t *ref = (uint8_t *) _ref;
1004

  
1005
	ref = vis_alignaddr(ref);
1006
	vis_ld64(ref[0], TMP0);
1007

  
1008
	vis_ld64_2(ref, 8, TMP2);
1009

  
1010
	vis_ld64_2(ref, 16, TMP4);
1011
	ref += stride;
1012

  
1013
	vis_ld64(ref[0], TMP6);
1014
	vis_faligndata(TMP0, TMP2, REF_0);
1015

  
1016
	vis_ld64_2(ref, 8, TMP8);
1017
	vis_faligndata(TMP2, TMP4, REF_4);
1018

  
1019
	vis_ld64_2(ref, 16, TMP10);
1020
	ref += stride;
1021

  
1022
	vis_ld64(constants_fe[0], MASK_fe);
1023
	vis_faligndata(TMP6, TMP8, REF_2);
1024

  
1025
	vis_ld64(constants_7f[0], MASK_7f);
1026
	vis_faligndata(TMP8, TMP10, REF_6);
1027

  
1028
	vis_ld64(constants128[0], CONST_128);
1029
	height = (height >> 1) - 1;
1030
	do {	/* 24 cycles */
1031
		vis_ld64(ref[0], TMP0);
1032
		vis_xor(REF_0, REF_2, TMP12);
1033

  
1034
		vis_ld64_2(ref, 8, TMP2);
1035
		vis_xor(REF_4, REF_6, TMP16);
1036

  
1037
		vis_ld64_2(ref, 16, TMP4);
1038
		ref += stride;
1039
		vis_or(REF_0, REF_2, TMP14);
1040

  
1041
		vis_ld64(ref[0], TMP6);
1042
		vis_or(REF_4, REF_6, TMP18);
1043

  
1044
		vis_ld64_2(ref, 8, TMP8);
1045
		vis_faligndata(TMP0, TMP2, REF_0);
1046

  
1047
		vis_ld64_2(ref, 16, TMP10);
1048
		ref += stride;
1049
		vis_faligndata(TMP2, TMP4, REF_4);
1050

  
1051
		vis_and(TMP12, MASK_fe, TMP12);
1052

  
1053
		vis_and(TMP16, MASK_fe, TMP16);
1054
		vis_mul8x16(CONST_128, TMP12, TMP12);
1055

  
1056
		vis_mul8x16(CONST_128, TMP16, TMP16);
1057
		vis_xor(REF_0, REF_2, TMP0);
1058

  
1059
		vis_xor(REF_4, REF_6, TMP2);
1060

  
1061
		vis_or(REF_0, REF_2, TMP20);
1062

  
1063
		vis_and(TMP12, MASK_7f, TMP12);
1064

  
1065
		vis_and(TMP16, MASK_7f, TMP16);
1066

  
1067
		vis_psub16(TMP14, TMP12, TMP12);
1068
		vis_st64(TMP12, dest[0]);
1069

  
1070
		vis_psub16(TMP18, TMP16, TMP16);
1071
		vis_st64_2(TMP16, dest, 8);
1072
		dest += stride;
1073

  
1074
		vis_or(REF_4, REF_6, TMP18);
1075

  
1076
		vis_and(TMP0, MASK_fe, TMP0);
1077

  
1078
		vis_and(TMP2, MASK_fe, TMP2);
1079
		vis_mul8x16(CONST_128, TMP0, TMP0);
1080

  
1081
		vis_faligndata(TMP6, TMP8, REF_2);
1082
		vis_mul8x16(CONST_128, TMP2, TMP2);
1083

  
1084
		vis_faligndata(TMP8, TMP10, REF_6);
1085

  
1086
		vis_and(TMP0, MASK_7f, TMP0);
1087

  
1088
		vis_and(TMP2, MASK_7f, TMP2);
1089

  
1090
		vis_psub16(TMP20, TMP0, TMP0);
1091
		vis_st64(TMP0, dest[0]);
1092

  
1093
		vis_psub16(TMP18, TMP2, TMP2);
1094
		vis_st64_2(TMP2, dest, 8);
1095
		dest += stride;
1096
	} while (--height);
1097

  
1098
	vis_ld64(ref[0], TMP0);
1099
	vis_xor(REF_0, REF_2, TMP12);
1100

  
1101
	vis_ld64_2(ref, 8, TMP2);
1102
	vis_xor(REF_4, REF_6, TMP16);
1103

  
1104
	vis_ld64_2(ref, 16, TMP4);
1105
	vis_or(REF_0, REF_2, TMP14);
1106

  
1107
	vis_or(REF_4, REF_6, TMP18);
1108

  
1109
	vis_faligndata(TMP0, TMP2, REF_0);
1110

  
1111
	vis_faligndata(TMP2, TMP4, REF_4);
1112

  
1113
	vis_and(TMP12, MASK_fe, TMP12);
1114

  
1115
	vis_and(TMP16, MASK_fe, TMP16);
1116
	vis_mul8x16(CONST_128, TMP12, TMP12);
1117

  
1118
	vis_mul8x16(CONST_128, TMP16, TMP16);
1119
	vis_xor(REF_0, REF_2, TMP0);
1120

  
1121
	vis_xor(REF_4, REF_6, TMP2);
1122

  
1123
	vis_or(REF_0, REF_2, TMP20);
1124

  
1125
	vis_and(TMP12, MASK_7f, TMP12);
1126

  
1127
	vis_and(TMP16, MASK_7f, TMP16);
1128

  
1129
	vis_psub16(TMP14, TMP12, TMP12);
1130
	vis_st64(TMP12, dest[0]);
1131

  
1132
	vis_psub16(TMP18, TMP16, TMP16);
1133
	vis_st64_2(TMP16, dest, 8);
1134
	dest += stride;
1135

  
1136
	vis_or(REF_4, REF_6, TMP18);
1137

  
1138
	vis_and(TMP0, MASK_fe, TMP0);
1139

  
1140
	vis_and(TMP2, MASK_fe, TMP2);
1141
	vis_mul8x16(CONST_128, TMP0, TMP0);
1142

  
1143
	vis_mul8x16(CONST_128, TMP2, TMP2);
1144

  
1145
	vis_and(TMP0, MASK_7f, TMP0);
1146

  
1147
	vis_and(TMP2, MASK_7f, TMP2);
1148

  
1149
	vis_psub16(TMP20, TMP0, TMP0);
1150
	vis_st64(TMP0, dest[0]);
1151

  
1152
	vis_psub16(TMP18, TMP2, TMP2);
1153
	vis_st64_2(TMP2, dest, 8);
1154
}
1155

  
1156
static void MC_put_y_8_vis (uint8_t * dest, const uint8_t * _ref,
1157
			    const int stride, int height)
1158
{
1159
	uint8_t *ref = (uint8_t *) _ref;
1160

  
1161
	ref = vis_alignaddr(ref);
1162
	vis_ld64(ref[0], TMP0);
1163

  
1164
	vis_ld64_2(ref, 8, TMP2);
1165
	ref += stride;
1166

  
1167
	vis_ld64(ref[0], TMP4);
1168

  
1169
	vis_ld64_2(ref, 8, TMP6);
1170
	ref += stride;
1171

  
1172
	vis_ld64(constants_fe[0], MASK_fe);
1173
	vis_faligndata(TMP0, TMP2, REF_0);
1174

  
1175
	vis_ld64(constants_7f[0], MASK_7f);
1176
	vis_faligndata(TMP4, TMP6, REF_2);
1177

  
1178
	vis_ld64(constants128[0], CONST_128);
1179
	height = (height >> 1) - 1;
1180
	do {	/* 12 cycles */
1181
		vis_ld64(ref[0], TMP0);
1182
		vis_xor(REF_0, REF_2, TMP4);
1183

  
1184
		vis_ld64_2(ref, 8, TMP2);
1185
		ref += stride;
1186
		vis_and(TMP4, MASK_fe, TMP4);
1187

  
1188
		vis_or(REF_0, REF_2, TMP6);
1189
		vis_mul8x16(CONST_128, TMP4, TMP4);
1190

  
1191
		vis_faligndata(TMP0, TMP2, REF_0);
1192
		vis_ld64(ref[0], TMP0);
1193

  
1194
		vis_ld64_2(ref, 8, TMP2);
1195
		ref += stride;
1196
		vis_xor(REF_0, REF_2, TMP12);
1197

  
1198
		vis_and(TMP4, MASK_7f, TMP4);
1199

  
1200
		vis_and(TMP12, MASK_fe, TMP12);
1201

  
1202
		vis_mul8x16(CONST_128, TMP12, TMP12);
1203
		vis_or(REF_0, REF_2, TMP14);
1204

  
1205
		vis_psub16(TMP6, TMP4, DST_0);
1206
		vis_st64(DST_0, dest[0]);
1207
		dest += stride;
1208

  
1209
		vis_faligndata(TMP0, TMP2, REF_2);
1210

  
1211
		vis_and(TMP12, MASK_7f, TMP12);
1212

  
1213
		vis_psub16(TMP14, TMP12, DST_0);
1214
		vis_st64(DST_0, dest[0]);
1215
		dest += stride;
1216
	} while (--height);
1217

  
1218
	vis_ld64(ref[0], TMP0);
1219
	vis_xor(REF_0, REF_2, TMP4);
1220

  
1221
	vis_ld64_2(ref, 8, TMP2);
1222
	vis_and(TMP4, MASK_fe, TMP4);
1223

  
1224
	vis_or(REF_0, REF_2, TMP6);
1225
	vis_mul8x16(CONST_128, TMP4, TMP4);
1226

  
1227
	vis_faligndata(TMP0, TMP2, REF_0);
1228

  
1229
	vis_xor(REF_0, REF_2, TMP12);
1230

  
1231
	vis_and(TMP4, MASK_7f, TMP4);
1232

  
1233
	vis_and(TMP12, MASK_fe, TMP12);
1234

  
1235
	vis_mul8x16(CONST_128, TMP12, TMP12);
1236
	vis_or(REF_0, REF_2, TMP14);
1237

  
1238
	vis_psub16(TMP6, TMP4, DST_0);
1239
	vis_st64(DST_0, dest[0]);
1240
	dest += stride;
1241

  
1242
	vis_and(TMP12, MASK_7f, TMP12);
1243

  
1244
	vis_psub16(TMP14, TMP12, DST_0);
1245
	vis_st64(DST_0, dest[0]);
1246
}
1247

  
1248
static void MC_avg_y_16_vis (uint8_t * dest, const uint8_t * _ref,
1249
			     const int stride, int height)
1250
{
1251
	uint8_t *ref = (uint8_t *) _ref;
1252
	int stride_8 = stride + 8;
1253
	int stride_16 = stride + 16;
1254

  
1255
	vis_set_gsr(5 << VIS_GSR_SCALEFACT_SHIFT);
1256

  
1257
	ref = vis_alignaddr(ref);
1258

  
1259
	vis_ld64(ref[ 0], TMP0);
1260
	vis_fzero(ZERO);
1261

  
1262
	vis_ld64(ref[ 8], TMP2);
1263

  
1264
	vis_ld64(ref[16], TMP4);
1265

  
1266
	vis_ld64(constants3[0], CONST_3);
1267
	vis_faligndata(TMP0, TMP2, REF_2);
1268

  
1269
	vis_ld64(constants256_512[0], CONST_256);
1270
	vis_faligndata(TMP2, TMP4, REF_6);
1271
	height >>= 1;
1272

  
1273
	do {	/* 31 cycles */
1274
		vis_ld64_2(ref, stride, TMP0);
1275
		vis_pmerge(ZERO,       REF_2,     TMP12);
1276
		vis_mul8x16au(REF_2_1, CONST_256, TMP14);
1277

  
1278
		vis_ld64_2(ref, stride_8, TMP2);
1279
		vis_pmerge(ZERO,       REF_6,     TMP16);
1280
		vis_mul8x16au(REF_6_1, CONST_256, TMP18);
1281

  
1282
		vis_ld64_2(ref, stride_16, TMP4);
1283
		ref += stride;
1284

  
1285
		vis_ld64(dest[0], DST_0);
1286
		vis_faligndata(TMP0, TMP2, REF_0);
1287

  
1288
		vis_ld64_2(dest, 8, DST_2);
1289
		vis_faligndata(TMP2, TMP4, REF_4);
1290

  
1291
		vis_ld64_2(ref, stride, TMP6);
1292
		vis_pmerge(ZERO,     REF_0,     TMP0);
1293
		vis_mul8x16au(REF_0_1, CONST_256, TMP2);
1294

  
1295
		vis_ld64_2(ref, stride_8, TMP8);
1296
		vis_pmerge(ZERO,     REF_4,     TMP4);
1297

  
1298
		vis_ld64_2(ref, stride_16, TMP10);
1299
		ref += stride;
1300

  
1301
		vis_ld64_2(dest, stride, REF_S0/*DST_4*/);
1302
		vis_faligndata(TMP6, TMP8, REF_2);
1303
		vis_mul8x16au(REF_4_1, CONST_256, TMP6);
1304

  
1305
		vis_ld64_2(dest, stride_8, REF_S2/*DST_6*/);
1306
		vis_faligndata(TMP8, TMP10, REF_6);
1307
		vis_mul8x16al(DST_0,   CONST_512, TMP20);
1308

  
1309
		vis_padd16(TMP0, CONST_3, TMP0);
1310
		vis_mul8x16al(DST_1,   CONST_512, TMP22);
1311

  
1312
		vis_padd16(TMP2, CONST_3, TMP2);
1313
		vis_mul8x16al(DST_2,   CONST_512, TMP24);
1314

  
1315
		vis_padd16(TMP4, CONST_3, TMP4);
1316
		vis_mul8x16al(DST_3,   CONST_512, TMP26);
1317

  
1318
		vis_padd16(TMP6, CONST_3, TMP6);
1319

  
1320
		vis_padd16(TMP12, TMP20, TMP12);
1321
		vis_mul8x16al(REF_S0,   CONST_512, TMP20);
1322

  
1323
		vis_padd16(TMP14, TMP22, TMP14);
1324
		vis_mul8x16al(REF_S0_1, CONST_512, TMP22);
1325

  
1326
		vis_padd16(TMP16, TMP24, TMP16);
1327
		vis_mul8x16al(REF_S2,   CONST_512, TMP24);
1328

  
1329
		vis_padd16(TMP18, TMP26, TMP18);
1330
		vis_mul8x16al(REF_S2_1, CONST_512, TMP26);
1331

  
1332
		vis_padd16(TMP12, TMP0, TMP12);
1333
		vis_mul8x16au(REF_2,   CONST_256, TMP28);
1334

  
1335
		vis_padd16(TMP14, TMP2, TMP14);
1336
		vis_mul8x16au(REF_2_1, CONST_256, TMP30);
1337

  
1338
		vis_padd16(TMP16, TMP4, TMP16);
1339
		vis_mul8x16au(REF_6,   CONST_256, REF_S4);
1340

  
1341
		vis_padd16(TMP18, TMP6, TMP18);
1342
		vis_mul8x16au(REF_6_1, CONST_256, REF_S6);
1343

  
1344
		vis_pack16(TMP12, DST_0);
1345
		vis_padd16(TMP28, TMP0, TMP12);
1346

  
1347
		vis_pack16(TMP14, DST_1);
1348
		vis_st64(DST_0, dest[0]);
1349
		vis_padd16(TMP30, TMP2, TMP14);
1350

  
1351
		vis_pack16(TMP16, DST_2);
1352
		vis_padd16(REF_S4, TMP4, TMP16);
1353

  
1354
		vis_pack16(TMP18, DST_3);
1355
		vis_st64_2(DST_2, dest, 8);
1356
		dest += stride;
1357
		vis_padd16(REF_S6, TMP6, TMP18);
1358

  
1359
		vis_padd16(TMP12, TMP20, TMP12);
1360

  
1361
		vis_padd16(TMP14, TMP22, TMP14);
1362
		vis_pack16(TMP12, DST_0);
1363

  
1364
		vis_padd16(TMP16, TMP24, TMP16);
1365
		vis_pack16(TMP14, DST_1);
1366
		vis_st64(DST_0, dest[0]);
1367

  
1368
		vis_padd16(TMP18, TMP26, TMP18);
1369
		vis_pack16(TMP16, DST_2);
1370

  
1371
		vis_pack16(TMP18, DST_3);
1372
		vis_st64_2(DST_2, dest, 8);
1373
		dest += stride;
1374
	} while (--height);
1375
}
1376

  
1377
static void MC_avg_y_8_vis (uint8_t * dest, const uint8_t * _ref,
1378
			    const int stride, int height)
1379
{
1380
	uint8_t *ref = (uint8_t *) _ref;
1381
	int stride_8 = stride + 8;
1382

  
1383
	vis_set_gsr(5 << VIS_GSR_SCALEFACT_SHIFT);
1384

  
1385
	ref = vis_alignaddr(ref);
1386

  
1387
	vis_ld64(ref[ 0], TMP0);
1388
	vis_fzero(ZERO);
1389

  
1390
	vis_ld64(ref[ 8], TMP2);
1391

  
1392
	vis_ld64(constants3[0], CONST_3);
1393
	vis_faligndata(TMP0, TMP2, REF_2);
1394

  
1395
	vis_ld64(constants256_512[0], CONST_256);
1396

  
1397
	height >>= 1;
1398
	do {	/* 20 cycles */
1399
		vis_ld64_2(ref, stride, TMP0);
1400
		vis_pmerge(ZERO,       REF_2,     TMP8);
1401
		vis_mul8x16au(REF_2_1, CONST_256, TMP10);
1402

  
1403
		vis_ld64_2(ref, stride_8, TMP2);
1404
		ref += stride;
1405

  
1406
		vis_ld64(dest[0], DST_0);
1407

  
1408
		vis_ld64_2(dest, stride, DST_2);
1409
		vis_faligndata(TMP0, TMP2, REF_0);
1410

  
1411
		vis_ld64_2(ref, stride, TMP4);
1412
		vis_mul8x16al(DST_0,   CONST_512, TMP16);
1413
		vis_pmerge(ZERO,       REF_0,     TMP12);
1414

  
1415
		vis_ld64_2(ref, stride_8, TMP6);
1416
		ref += stride;
1417
		vis_mul8x16al(DST_1,   CONST_512, TMP18);
1418
		vis_pmerge(ZERO,       REF_0_1,   TMP14);
1419

  
1420
		vis_padd16(TMP12, CONST_3, TMP12);
1421
		vis_mul8x16al(DST_2,   CONST_512, TMP24);
1422

  
1423
		vis_padd16(TMP14, CONST_3, TMP14);
1424
		vis_mul8x16al(DST_3,   CONST_512, TMP26);
1425

  
1426
		vis_faligndata(TMP4, TMP6, REF_2);
1427

  
1428
		vis_padd16(TMP8, TMP12, TMP8);
1429

  
1430
		vis_padd16(TMP10, TMP14, TMP10);
1431
		vis_mul8x16au(REF_2,   CONST_256, TMP20);
1432

  
1433
		vis_padd16(TMP8, TMP16, TMP0);
1434
		vis_mul8x16au(REF_2_1, CONST_256, TMP22);
1435

  
1436
		vis_padd16(TMP10, TMP18, TMP2);
1437
		vis_pack16(TMP0, DST_0);
1438

  
... This diff was truncated because it exceeds the maximum size that can be displayed.
