Revision 20646267


libavcodec/libpostproc/postprocess.c
@@ -29,10 +29,10 @@
 isVertMinMaxOk		Ec	Ec			Ec
 doVertLowPass		E		e	e	Ec
 doVertDefFilter		Ec	Ec	e	e	Ec
-isHorizDC		Ec	Ec
-isHorizMinMaxOk		a	E
-doHorizLowPass		E		e	e
-doHorizDefFilter	Ec	Ec	e	e
+isHorizDC		Ec	Ec			Ec
+isHorizMinMaxOk		a	E			Ec
+doHorizLowPass		E		e	e	Ec
+doHorizDefFilter	Ec	Ec	e	e	Ec
 do_a_deblock		Ec	E	Ec	E
 deRing			E		e	e*	Ecp
 Vertical RKAlgo1	E		a	a
@@ -43,7 +43,7 @@
 CubicIpolDeinterlace	a		e	e*
 LinBlendDeinterlace	e		E	E*
 MedianDeinterlace#	E	Ec	Ec
-TempDeNoiser#		E		e	e
+TempDeNoiser#		E		e	e	Ec
 
 * I don't have a 3DNow CPU -> it's untested, but no one said it doesn't work, so it seems to work
 # more or less self-invented filters, so the exactness isn't too meaningful
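A note on how one row of this table maps onto several implementations: the filters live in postprocess_template.c, which postprocess.c includes once per instruction set, renaming every function with an ISA suffix (hence the RENAME(horizClassify) calls visible in the last hunk below). A minimal sketch of that pattern, assuming the usual guard macros (the exact macro set in postprocess.c may differ):

#undef  RENAME
#define RENAME(a) a ## _MMX2
#include "postprocess_template.c"   /* emits horizClassify_MMX2, doHorizLowPass_MMX2, ... */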
libavcodec/libpostproc/postprocess_altivec_template.c
@@ -73,7 +73,9 @@
   vector signed short v2QP;
   vector unsigned short v4QP;
   vector unsigned short v_dcThreshold;
-  int two_vectors = ((((unsigned long)src2 % 16) > 8) || (stride % 16)) ? 1 : 0;
+  const int properStride = (stride % 16);
+  const int srcAlign = ((unsigned long)src2 % 16);
+  const int two_vectors = ((srcAlign > 8) || properStride) ? 1 : 0;
   const vector signed int zero = vec_splat_s32(0);
   const vector signed short mask = vec_splat_s16(1);
   vector signed int v_numEq = vec_splat_s32(0);
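The rewritten predicates make the load strategy explicit: srcAlign is src2's offset within its 16-byte line, properStride is nonzero when stride is not a multiple of 16, and two_vectors records whether a line's pixels can spill into a second vector. This matters because vec_ld silently truncates its effective address to a 16-byte boundary, so reading 16 bytes from an arbitrary address takes two aligned loads stitched together with a permute, which is exactly what LOAD_LINE does with perm##i below. The idiom in isolation, as an editorial sketch (not code from this revision):

/* Unaligned 16-byte load on AltiVec. */
static inline vector unsigned char load_16_unaligned(const unsigned char *p) {
  vector unsigned char msq  = vec_ld(0, p);    /* aligned vector containing p             */
  vector unsigned char lsq  = vec_ld(15, p);   /* next vector, without reading past p+15  */
  vector unsigned char mask = vec_lvsl(0, p);  /* shift amount = p % 16                   */
  return vec_perm(msq, lsq, mask);             /* the 16 bytes starting at p              */
}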
@@ -90,6 +92,8 @@
 
   src2 += stride * 4;
 
+  vector signed short v_srcAss0, v_srcAss1, v_srcAss2, v_srcAss3, v_srcAss4, v_srcAss5, v_srcAss6, v_srcAss7;
+
 #define LOAD_LINE(i)							\
   register int j##i = i * stride;					\
   vector unsigned char perm##i = vec_lvsl(j##i, src2);			\
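The eight v_srcAss variables are hoisted out of LOAD_LINE because the loads are about to be wrapped in an aligned/unaligned dispatch: a variable declared inside the macro would be scoped to whichever branch expanded it, invisible to the ITER code that follows. A hypothetical illustration of the pitfall (names invented for this note):

#define LOAD_SCOPED(i) vector signed short v##i = vec_splat_s16(0)
void example(int aligned) {
  if (aligned) { LOAD_SCOPED(0); }   /* this v0 dies at the closing brace */
  else         { LOAD_SCOPED(0); }   /* as does this one                  */
  /* v0 is not visible here; hoisting the declarations and turning the
     macros into assignments avoids this. */
}
#undef LOAD_SCOPED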
@@ -99,139 +103,41 @@
     v_srcA2##i = vec_ld(j##i + 16, src2);				\
   const vector unsigned char v_srcA##i =				\
     vec_perm(v_srcA1##i, v_srcA2##i, perm##i);				\
-  vector signed short v_srcAss##i =					\
+  v_srcAss##i =                                                         \
     (vector signed short)vec_mergeh((vector signed char)zero,		\
 				    (vector signed char)v_srcA##i)
 
-  LOAD_LINE(0);
-  LOAD_LINE(1);
-  LOAD_LINE(2);
-  LOAD_LINE(3);
-  LOAD_LINE(4);
-  LOAD_LINE(5);
-  LOAD_LINE(6);
-  LOAD_LINE(7);
-#undef LOAD_LINE
-
-#define ITER(i, j)							\
-  const vector signed short v_diff##i =					\
-    vec_sub(v_srcAss##i, v_srcAss##j);					\
-  const vector signed short v_sum##i =					\
-    vec_add(v_diff##i, v_dcOffset);					\
-  const vector signed short v_comp##i =					\
-    (vector signed short)vec_cmplt((vector unsigned short)v_sum##i,	\
-				   v_dcThreshold);			\
-  const vector signed short v_part##i = vec_and(mask, v_comp##i);	\
-  v_numEq = vec_sum4s(v_part##i, v_numEq);
-
-  ITER(0, 1);
-  ITER(1, 2);
-  ITER(2, 3);
-  ITER(3, 4);
-  ITER(4, 5);
-  ITER(5, 6);
-  ITER(6, 7);
-#undef ITER
-
-  v_numEq = vec_sums(v_numEq, zero);
-
-  v_numEq = vec_splat(v_numEq, 3);
-  vec_ste(v_numEq, 0, &numEq);
-
-  if (numEq > c->ppMode.flatnessThreshold)
-    {
-      const vector unsigned char mmoP1 = (const vector unsigned char)
-	AVV(0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f,
-	    0x00, 0x01, 0x12, 0x13, 0x08, 0x09, 0x1A, 0x1B);
-      const vector unsigned char mmoP2 = (const vector unsigned char)
-	AVV(0x04, 0x05, 0x16, 0x17, 0x0C, 0x0D, 0x1E, 0x1F,
-	    0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f);
-      const vector unsigned char mmoP = (const vector unsigned char)
-	vec_lvsl(8, (unsigned char*)0);
-
-      vector signed short mmoL1 = vec_perm(v_srcAss0, v_srcAss2, mmoP1);
-      vector signed short mmoL2 = vec_perm(v_srcAss4, v_srcAss6, mmoP2);
-      vector signed short mmoL = vec_perm(mmoL1, mmoL2, mmoP);
-      vector signed short mmoR1 = vec_perm(v_srcAss5, v_srcAss7, mmoP1);
-      vector signed short mmoR2 = vec_perm(v_srcAss1, v_srcAss3, mmoP2);
-      vector signed short mmoR = vec_perm(mmoR1, mmoR2, mmoP);
-      vector signed short mmoDiff = vec_sub(mmoL, mmoR);
-      vector unsigned short mmoSum = (vector unsigned short)vec_add(mmoDiff, v2QP);
-
-      if (vec_any_gt(mmoSum, v4QP))
-	return 0;
-      else
-	return 1;
-    }
-  else return 2;
-}
-
-/* this is the same as vertClassify_altivec,
-   with an added 8x8 transpose after the loading,
-   and w/o the stride*4 offset */
-static inline int horizClassify_altivec(uint8_t src[], int stride, PPContext *c) {
-  /*
-    this code makes no assumption on src or stride.
-    One could remove the recomputation of the perm
-    vector by assuming (stride % 16) == 0, unfortunately
-    this is not always true.
-  */
-  register int y;
-  short __attribute__ ((aligned(16))) data[8];
-  int numEq;
-  uint8_t *src2 = src;
-  vector signed short v_dcOffset;
-  vector signed short v2QP;
-  vector unsigned short v4QP;
-  vector unsigned short v_dcThreshold;
-  int two_vectors = ((((unsigned long)src2 % 16) > 8) || (stride % 16)) ? 1 : 0;
-  const vector signed int zero = vec_splat_s32(0);
-  const vector signed short mask = vec_splat_s16(1);
-  vector signed int v_numEq = vec_splat_s32(0);
-
-  data[0] = ((c->nonBQP*c->ppMode.baseDcDiff)>>8) + 1;
-  data[1] = data[0] * 2 + 1;
-  data[2] = c->QP * 2;
-  data[3] = c->QP * 4;
-  vector signed short v_data = vec_ld(0, data);
-  v_dcOffset = vec_splat(v_data, 0);
-  v_dcThreshold = (vector unsigned short)vec_splat(v_data, 1);
-  v2QP = vec_splat(v_data, 2);
-  v4QP = (vector unsigned short)vec_splat(v_data, 3);
-
-  //  src2 += stride * 4;
-
-#define LOAD_LINE(i)							\
-  register int j##i = i * stride;					\
-  vector unsigned char perm##i = vec_lvsl(j##i, src2);			\
-  const vector unsigned char v_srcA1##i = vec_ld(j##i, src2);		\
-  vector unsigned char v_srcA2##i;					\
-  if (two_vectors)							\
-    v_srcA2##i = vec_ld(j##i + 16, src2);				\
-  const vector unsigned char v_srcA##i =				\
-    vec_perm(v_srcA1##i, v_srcA2##i, perm##i);				\
-  vector signed short v_srcAss##i =					\
+#define LOAD_LINE_ALIGNED(i)                                            \
+  register int j##i = i * stride;                                       \
+  const vector unsigned char v_srcA##i = vec_ld(j##i, src2);            \
+  v_srcAss##i =                                                         \
     (vector signed short)vec_mergeh((vector signed char)zero,		\
 				    (vector signed char)v_srcA##i)
 
-  LOAD_LINE(0);
-  LOAD_LINE(1);
-  LOAD_LINE(2);
-  LOAD_LINE(3);
-  LOAD_LINE(4);
-  LOAD_LINE(5);
-  LOAD_LINE(6);
-  LOAD_LINE(7);
+    // special casing the aligned case is worthwhile, as all calls from
+    // the (transposed) horizontal deblocks will be aligned, in addition
+    // to the naturally aligned vertical deblocks.
+    if (properStride && srcAlign) {
+      LOAD_LINE_ALIGNED(0);
+      LOAD_LINE_ALIGNED(1);
+      LOAD_LINE_ALIGNED(2);
+      LOAD_LINE_ALIGNED(3);
+      LOAD_LINE_ALIGNED(4);
+      LOAD_LINE_ALIGNED(5);
+      LOAD_LINE_ALIGNED(6);
+      LOAD_LINE_ALIGNED(7);
+    } else {
+      LOAD_LINE(0);
+      LOAD_LINE(1);
+      LOAD_LINE(2);
+      LOAD_LINE(3);
+      LOAD_LINE(4);
+      LOAD_LINE(5);
+      LOAD_LINE(6);
+      LOAD_LINE(7);
+    }
 #undef LOAD_LINE
-
-  ALTIVEC_TRANSPOSE_8x8_SHORT(v_srcAss0,
-			      v_srcAss1,
-			      v_srcAss2,
-			      v_srcAss3,
-			      v_srcAss4,
-			      v_srcAss5,
-			      v_srcAss6,
-			      v_srcAss7);
+#undef LOAD_LINE_ALIGNED
 
 #define ITER(i, j)							\
   const vector signed short v_diff##i =					\
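For reference, what the ITER macro vectorizes: each step takes the difference of two vertically adjacent lines, and the unsigned compare against v_dcThreshold folds the absolute-value test into one operation; since data[1] (the threshold) is 2*data[0]+1, the test (unsigned)(diff + dcOffset) < dcThreshold accepts exactly |diff| <= dcOffset. A scalar paraphrase, simplified to a single pixel column (editorial sketch, not code from this revision; the vector code runs 8 columns at once):

static int count_flat_pairs(const uint8_t *col, int stride,
                            int dcOffset, int dcThreshold) {
  int numEq = 0;
  for (int y = 0; y < 7; y++) {
    int diff = col[y * stride] - col[(y + 1) * stride];
    numEq += (unsigned)(diff + dcOffset) < (unsigned)dcThreshold;
  }
  return numEq;  /* the caller compares this against c->ppMode.flatnessThreshold */
}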
@@ -286,7 +192,6 @@
   else return 2;
 }
 
-
 static inline void doVertLowPass_altivec(uint8_t *src, int stride, PPContext *c) {
   /*
     this code makes no assumption on src or stride.
@@ -298,37 +203,65 @@
   */
   uint8_t *src2 = src;
   const vector signed int zero = vec_splat_s32(0);
+  const int properStride = (stride % 16);
+  const int srcAlign = ((unsigned long)src2 % 16);
   short __attribute__ ((aligned(16))) qp[8];
   qp[0] = c->QP;
   vector signed short vqp = vec_ld(0, qp);
   vqp = vec_splat(vqp, 0);
 
+  src2 += stride*3;
+
+  vector signed short vb0, vb1, vb2, vb3, vb4, vb5, vb6, vb7, vb8, vb9;
+  vector unsigned char vbA0, vbA1, vbA2, vbA3, vbA4, vbA5, vbA6, vbA7, vbA8, vbA9;
+  vector unsigned char vbB0, vbB1, vbB2, vbB3, vbB4, vbB5, vbB6, vbB7, vbB8, vbB9;
+  vector unsigned char vbT0, vbT1, vbT2, vbT3, vbT4, vbT5, vbT6, vbT7, vbT8, vbT9;
+
 #define LOAD_LINE(i)                                                    \
   const vector unsigned char perml##i =					\
     vec_lvsl(i * stride, src2);						\
-  const vector unsigned char vbA##i =					\
-    vec_ld(i * stride, src2);						\
-  const vector unsigned char vbB##i =					\
-    vec_ld(i * stride + 16, src2);					\
-  const vector unsigned char vbT##i =					\
-    vec_perm(vbA##i, vbB##i, perml##i);					\
-  const vector signed short vb##i =					\
+  vbA##i = vec_ld(i * stride, src2);                                    \
+  vbB##i = vec_ld(i * stride + 16, src2);                               \
+  vbT##i = vec_perm(vbA##i, vbB##i, perml##i);                          \
+  vb##i =                                                               \
     (vector signed short)vec_mergeh((vector unsigned char)zero,		\
 				    (vector unsigned char)vbT##i)
-
-  src2 += stride*3;
 
-  LOAD_LINE(0);
-  LOAD_LINE(1);
-  LOAD_LINE(2);
-  LOAD_LINE(3);
-  LOAD_LINE(4);
-  LOAD_LINE(5);
-  LOAD_LINE(6);
-  LOAD_LINE(7);
-  LOAD_LINE(8);
-  LOAD_LINE(9);
+#define LOAD_LINE_ALIGNED(i)                                            \
+  register int j##i = i * stride;                                       \
+  vbT##i = vec_ld(j##i, src2);                                          \
+  vb##i =                                                               \
+    (vector signed short)vec_mergeh((vector signed char)zero,		\
+				    (vector signed char)vbT##i)
+
+    // special casing the aligned case is worthwhile, as all calls from
+    // the (transposed) horizontal deblocks will be aligned, in addition
+    // to the naturally aligned vertical deblocks.
+    if (properStride && srcAlign) {
+      LOAD_LINE_ALIGNED(0);
+      LOAD_LINE_ALIGNED(1);
+      LOAD_LINE_ALIGNED(2);
+      LOAD_LINE_ALIGNED(3);
+      LOAD_LINE_ALIGNED(4);
+      LOAD_LINE_ALIGNED(5);
+      LOAD_LINE_ALIGNED(6);
+      LOAD_LINE_ALIGNED(7);
+      LOAD_LINE_ALIGNED(8);
+      LOAD_LINE_ALIGNED(9);
+    } else {
+      LOAD_LINE(0);
+      LOAD_LINE(1);
+      LOAD_LINE(2);
+      LOAD_LINE(3);
+      LOAD_LINE(4);
+      LOAD_LINE(5);
+      LOAD_LINE(6);
+      LOAD_LINE(7);
+      LOAD_LINE(8);
+      LOAD_LINE(9);
+    }
 #undef LOAD_LINE
+#undef LOAD_LINE_ALIGNED
 
   const vector unsigned short v_1 = vec_splat_u16(1);
   const vector unsigned short v_2 = vec_splat_u16(2);
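Both load macros end in the same widening step: merging a zero vector with the pixel vector interleaves a zero byte in front of each pixel byte, which on big-endian AltiVec turns the 8 leading unsigned bytes into 8 sign-safe 16-bit values. In isolation (editorial sketch, not code from this revision):

/* u8 -> s16 widening via merge-with-zero; each short becomes 0x00pp. */
static inline vector signed short widen_high(vector unsigned char pixels) {
  const vector unsigned char zero = vec_splat_u8(0);
  return (vector signed short)vec_mergeh(zero, pixels);
}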
@@ -413,16 +346,37 @@
   vec_st(svA##i, i * stride, src2);				\
   vec_st(svB##i, i * stride + 16, src2)
 
-  PACK_AND_STORE(1);
-  PACK_AND_STORE(2);
-  PACK_AND_STORE(3);
-  PACK_AND_STORE(4);
-  PACK_AND_STORE(5);
-  PACK_AND_STORE(6);
-  PACK_AND_STORE(7);
-  PACK_AND_STORE(8);
-
+#define PACK_AND_STORE_ALIGNED(i)				\
+  const vector unsigned char vf##i =				\
+    vec_packsu(vr##i, (vector signed short)zero);		\
+  const vector unsigned char vg##i =				\
+    vec_perm(vf##i, vbT##i, permHH);				\
+  vec_st(vg##i, i * stride, src2)
+
+  // special casing the aligned case is worthwhile, as all calls from
+  // the (transposed) horizontal deblocks will be aligned, in addition
+  // to the naturally aligned vertical deblocks.
+  if (properStride && srcAlign) {
+    PACK_AND_STORE_ALIGNED(1);
+    PACK_AND_STORE_ALIGNED(2);
+    PACK_AND_STORE_ALIGNED(3);
+    PACK_AND_STORE_ALIGNED(4);
+    PACK_AND_STORE_ALIGNED(5);
+    PACK_AND_STORE_ALIGNED(6);
+    PACK_AND_STORE_ALIGNED(7);
+    PACK_AND_STORE_ALIGNED(8);
+  } else {
+    PACK_AND_STORE(1);
+    PACK_AND_STORE(2);
+    PACK_AND_STORE(3);
+    PACK_AND_STORE(4);
+    PACK_AND_STORE(5);
+    PACK_AND_STORE(6);
+    PACK_AND_STORE(7);
+    PACK_AND_STORE(8);
+  }
 #undef PACK_AND_STORE
+#undef PACK_AND_STORE_ALIGNED
 }
 
 
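PACK_AND_STORE_ALIGNED is the inverse of the widening above: vec_packsu saturates the filtered 16-bit values back into unsigned bytes, and the vec_perm with permHH splices the 8 new bytes together with the untouched half of the original line so a single aligned vec_st can write it back. The narrowing step alone (editorial sketch; the real macro also re-merges the original line via permHH):

/* s16 -> u8 with saturation; dst must be 16-byte aligned. */
static inline void pack_store(vector signed short result, unsigned char *dst) {
  const vector signed short zero = vec_splat_s16(0);
  vector unsigned char bytes = vec_packsu(result, zero);  /* clamps to 0..255 */
  vec_st(bytes, 0, dst);
}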
@@ -1043,3 +997,200 @@
   PACK_AND_STORE(tempBlured, 7);
 #undef PACK_AND_STORE
 }
+
+static inline void transpose_16x8_char_toPackedAlign_altivec(unsigned char* dst, unsigned char* src, int stride) {
+  const vector unsigned char zero = vec_splat_u8(0);
+
+#define LOAD_DOUBLE_LINE(i, j)						\
+  vector unsigned char perm1##i = vec_lvsl(i * stride, src);		\
+  vector unsigned char perm2##i = vec_lvsl(j * stride, src);		\
+  vector unsigned char srcA##i = vec_ld(i * stride, src);		\
+  vector unsigned char srcB##i = vec_ld(i * stride + 16, src);          \
+  vector unsigned char srcC##i = vec_ld(j * stride, src);		\
+  vector unsigned char srcD##i = vec_ld(j * stride+ 16, src);           \
+  vector unsigned char src##i = vec_perm(srcA##i, srcB##i, perm1##i);	\
+  vector unsigned char src##j = vec_perm(srcC##i, srcD##i, perm2##i)
+
+  LOAD_DOUBLE_LINE(0, 1);
+  LOAD_DOUBLE_LINE(2, 3);
+  LOAD_DOUBLE_LINE(4, 5);
+  LOAD_DOUBLE_LINE(6, 7);
+#undef LOAD_DOUBLE_LINE
+
+  vector unsigned char tempA = vec_mergeh(src0, zero);
+  vector unsigned char tempB = vec_mergel(src0, zero);
+  vector unsigned char tempC = vec_mergeh(src1, zero);
+  vector unsigned char tempD = vec_mergel(src1, zero);
+  vector unsigned char tempE = vec_mergeh(src2, zero);
+  vector unsigned char tempF = vec_mergel(src2, zero);
+  vector unsigned char tempG = vec_mergeh(src3, zero);
+  vector unsigned char tempH = vec_mergel(src3, zero);
+  vector unsigned char tempI = vec_mergeh(src4, zero);
+  vector unsigned char tempJ = vec_mergel(src4, zero);
+  vector unsigned char tempK = vec_mergeh(src5, zero);
+  vector unsigned char tempL = vec_mergel(src5, zero);
+  vector unsigned char tempM = vec_mergeh(src6, zero);
+  vector unsigned char tempN = vec_mergel(src6, zero);
+  vector unsigned char tempO = vec_mergeh(src7, zero);
+  vector unsigned char tempP = vec_mergel(src7, zero);
+
+  vector unsigned char temp0 = vec_mergeh(tempA, tempI);
+  vector unsigned char temp1 = vec_mergel(tempA, tempI);
+  vector unsigned char temp2 = vec_mergeh(tempB, tempJ);
+  vector unsigned char temp3 = vec_mergel(tempB, tempJ);
+  vector unsigned char temp4 = vec_mergeh(tempC, tempK);
+  vector unsigned char temp5 = vec_mergel(tempC, tempK);
+  vector unsigned char temp6 = vec_mergeh(tempD, tempL);
+  vector unsigned char temp7 = vec_mergel(tempD, tempL);
+  vector unsigned char temp8 = vec_mergeh(tempE, tempM);
+  vector unsigned char temp9 = vec_mergel(tempE, tempM);
+  vector unsigned char temp10 = vec_mergeh(tempF, tempN);
+  vector unsigned char temp11 = vec_mergel(tempF, tempN);
+  vector unsigned char temp12 = vec_mergeh(tempG, tempO);
+  vector unsigned char temp13 = vec_mergel(tempG, tempO);
+  vector unsigned char temp14 = vec_mergeh(tempH, tempP);
+  vector unsigned char temp15 = vec_mergel(tempH, tempP);
+
+  tempA = vec_mergeh(temp0, temp8);
+  tempB = vec_mergel(temp0, temp8);
+  tempC = vec_mergeh(temp1, temp9);
+  tempD = vec_mergel(temp1, temp9);
+  tempE = vec_mergeh(temp2, temp10);
+  tempF = vec_mergel(temp2, temp10);
+  tempG = vec_mergeh(temp3, temp11);
+  tempH = vec_mergel(temp3, temp11);
+  tempI = vec_mergeh(temp4, temp12);
+  tempJ = vec_mergel(temp4, temp12);
+  tempK = vec_mergeh(temp5, temp13);
+  tempL = vec_mergel(temp5, temp13);
+  tempM = vec_mergeh(temp6, temp14);
+  tempN = vec_mergel(temp6, temp14);
+  tempO = vec_mergeh(temp7, temp15);
+  tempP = vec_mergel(temp7, temp15);
+
+  temp0 = vec_mergeh(tempA, tempI);
+  temp1 = vec_mergel(tempA, tempI);
+  temp2 = vec_mergeh(tempB, tempJ);
+  temp3 = vec_mergel(tempB, tempJ);
+  temp4 = vec_mergeh(tempC, tempK);
+  temp5 = vec_mergel(tempC, tempK);
+  temp6 = vec_mergeh(tempD, tempL);
+  temp7 = vec_mergel(tempD, tempL);
+  temp8 = vec_mergeh(tempE, tempM);
+  temp9 = vec_mergel(tempE, tempM);
+  temp10 = vec_mergeh(tempF, tempN);
+  temp11 = vec_mergel(tempF, tempN);
+  temp12 = vec_mergeh(tempG, tempO);
+  temp13 = vec_mergel(tempG, tempO);
+  temp14 = vec_mergeh(tempH, tempP);
+  temp15 = vec_mergel(tempH, tempP);
+
+  vec_st(temp0, 0, dst);
+  vec_st(temp1, 16, dst);
+  vec_st(temp2, 32, dst);
+  vec_st(temp3, 48, dst);
+  vec_st(temp4, 64, dst);
+  vec_st(temp5, 80, dst);
+  vec_st(temp6, 96, dst);
+  vec_st(temp7, 112, dst);
+  vec_st(temp8, 128, dst);
+  vec_st(temp9, 144, dst);
+  vec_st(temp10, 160, dst);
+  vec_st(temp11, 176, dst);
+  vec_st(temp12, 192, dst);
+  vec_st(temp13, 208, dst);
+  vec_st(temp14, 224, dst);
+  vec_st(temp15, 240, dst);
+}
+
+static inline void transpose_8x16_char_fromPackedAlign_altivec(unsigned char* dst, unsigned char* src, int stride) {
+  const vector unsigned char zero = vec_splat_u8(0);
+  const vector unsigned char magic_perm = (const vector unsigned char)
+    AVV(0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
+	0x18, 0x19, 0x1A, 0x1B, 0x1C, 0x1D, 0x1E, 0x1F);
+
+#define LOAD_DOUBLE_LINE(i, j)			    		\
+  vector unsigned char src##i = vec_ld(i * 16, src);		\
+  vector unsigned char src##j = vec_ld(j * 16, src)
+
+  LOAD_DOUBLE_LINE(0, 1);
+  LOAD_DOUBLE_LINE(2, 3);
+  LOAD_DOUBLE_LINE(4, 5);
+  LOAD_DOUBLE_LINE(6, 7);
+  LOAD_DOUBLE_LINE(8, 9);
+  LOAD_DOUBLE_LINE(10, 11);
+  LOAD_DOUBLE_LINE(12, 13);
+  LOAD_DOUBLE_LINE(14, 15);
+#undef LOAD_DOUBLE_LINE
+
+  vector unsigned char tempA = vec_mergeh(src0, src8);
+  vector unsigned char tempB;
+  vector unsigned char tempC = vec_mergeh(src1, src9);
+  vector unsigned char tempD;
+  vector unsigned char tempE = vec_mergeh(src2, src10);
+  vector unsigned char tempG = vec_mergeh(src3, src11);
+  vector unsigned char tempI = vec_mergeh(src4, src12);
+  vector unsigned char tempJ;
+  vector unsigned char tempK = vec_mergeh(src5, src13);
+  vector unsigned char tempL;
+  vector unsigned char tempM = vec_mergeh(src6, src14);
+  vector unsigned char tempO = vec_mergeh(src7, src15);
+
+  vector unsigned char temp0 = vec_mergeh(tempA, tempI);
+  vector unsigned char temp1 = vec_mergel(tempA, tempI);
+  vector unsigned char temp2;
+  vector unsigned char temp3;
+  vector unsigned char temp4 = vec_mergeh(tempC, tempK);
+  vector unsigned char temp5 = vec_mergel(tempC, tempK);
+  vector unsigned char temp6;
+  vector unsigned char temp7;
+  vector unsigned char temp8 = vec_mergeh(tempE, tempM);
+  vector unsigned char temp9 = vec_mergel(tempE, tempM);
+  vector unsigned char temp12 = vec_mergeh(tempG, tempO);
+  vector unsigned char temp13 = vec_mergel(tempG, tempO);
+
+  tempA = vec_mergeh(temp0, temp8);
+  tempB = vec_mergel(temp0, temp8);
+  tempC = vec_mergeh(temp1, temp9);
+  tempD = vec_mergel(temp1, temp9);
+  tempI = vec_mergeh(temp4, temp12);
+  tempJ = vec_mergel(temp4, temp12);
+  tempK = vec_mergeh(temp5, temp13);
+  tempL = vec_mergel(temp5, temp13);
+
+  temp0 = vec_mergeh(tempA, tempI);
+  temp1 = vec_mergel(tempA, tempI);
+  temp2 = vec_mergeh(tempB, tempJ);
+  temp3 = vec_mergel(tempB, tempJ);
+  temp4 = vec_mergeh(tempC, tempK);
+  temp5 = vec_mergel(tempC, tempK);
+  temp6 = vec_mergeh(tempD, tempL);
+  temp7 = vec_mergel(tempD, tempL);
+
+  const vector signed char neg1 = vec_splat_s8(-1);
+#define STORE_DOUBLE_LINE(i, j)						\
+  vector unsigned char dstA##i = vec_ld(i * stride, dst);		\
+  vector unsigned char dstB##i = vec_ld(i * stride + 16, dst);		\
+  vector unsigned char dstA##j = vec_ld(j * stride, dst);		\
+  vector unsigned char dstB##j = vec_ld(j * stride+ 16, dst);		\
+  vector unsigned char align##i = vec_lvsr(i * stride, dst);		\
+  vector unsigned char align##j = vec_lvsr(j * stride, dst);		\
+  vector unsigned char mask##i = vec_perm(zero, (vector unsigned char)neg1, align##i);	\
+  vector unsigned char mask##j = vec_perm(zero, (vector unsigned char)neg1, align##j);	\
+  vector unsigned char dstR##i = vec_perm(temp##i, temp##i, align##i);	\
+  vector unsigned char dstR##j = vec_perm(temp##j, temp##j, align##j);	\
+  vector unsigned char dstAF##i = vec_sel(dstA##i, dstR##i, mask##i);	\
+  vector unsigned char dstBF##i = vec_sel(dstR##i, dstB##i, mask##i);	\
+  vector unsigned char dstAF##j = vec_sel(dstA##j, dstR##j, mask##j);	\
+  vector unsigned char dstBF##j = vec_sel(dstR##j, dstB##j, mask##j);	\
+  vec_st(dstAF##i, i * stride, dst);					\
+  vec_st(dstBF##i, i * stride + 16, dst);				\
+  vec_st(dstAF##j, j * stride, dst);					\
+  vec_st(dstBF##j, j * stride + 16, dst)
+
+  STORE_DOUBLE_LINE(0,1);
+  STORE_DOUBLE_LINE(2,3);
+  STORE_DOUBLE_LINE(4,5);
+  STORE_DOUBLE_LINE(6,7);
+}
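STORE_DOUBLE_LINE above is the classic AltiVec read-modify-write store: vec_st can only hit 16-byte boundaries, so the macro loads the two aligned vectors covering the destination, rotates the data into position with a vec_lvsr permute, builds a 0x00/0xFF edge mask, and vec_sels the new bytes in so the neighbouring memory is preserved. Condensed to a single hypothetical 16-byte store (editorial sketch, not code from this revision):

static inline void store_16_unaligned(vector unsigned char v, unsigned char *p) {
  vector unsigned char lo    = vec_ld(0, p);       /* aligned vectors covering p..p+15   */
  vector unsigned char hi    = vec_ld(16, p);
  vector unsigned char align = vec_lvsr(0, p);
  vector unsigned char mask  = vec_perm(vec_splat_u8(0),
                                        (vector unsigned char)vec_splat_s8(-1),
                                        align);    /* 0x00 then 0xFF, split at p % 16    */
  vector unsigned char r     = vec_perm(v, v, align);  /* rotate data into place         */
  vec_st(vec_sel(lo, r, mask), 0, p);              /* keep the bytes before p            */
  vec_st(vec_sel(r, hi, mask), 16, p);             /* keep the bytes after p+15          */
}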
libavcodec/libpostproc/postprocess_template.c
@@ -3684,12 +3684,27 @@
 					horizX1Filter(dstBlock-4, stride, QP);
 				else if(mode & H_DEBLOCK)
 				{
+#ifdef HAVE_ALTIVEC
+					unsigned char __attribute__ ((aligned(16))) tempBlock[272];
+					transpose_16x8_char_toPackedAlign_altivec(tempBlock, dstBlock - (4 + 1), stride);
+
+					const int t=vertClassify_altivec(tempBlock-48, 16, &c);
+					if(t==1) {
+						doVertLowPass_altivec(tempBlock-48, 16, &c);
+						transpose_8x16_char_fromPackedAlign_altivec(dstBlock - (4 + 1), tempBlock, stride);
+					}
+					else if(t==2) {
+						doVertDefFilter_altivec(tempBlock-48, 16, &c);
+						transpose_8x16_char_fromPackedAlign_altivec(dstBlock - (4 + 1), tempBlock, stride);
+					}
+#else
 					const int t= RENAME(horizClassify)(dstBlock-4, stride, &c);
 
 					if(t==1)
 						RENAME(doHorizLowPass)(dstBlock-4, stride, &c);
 					else if(t==2)
 						RENAME(doHorizDefFilter)(dstBlock-4, stride, &c);
+#endif
 				}else if(mode & H_A_DEBLOCK){
 					RENAME(do_a_deblock)(dstBlock-8, 1, stride, &c);
 				}
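What this hunk changes conceptually: instead of a dedicated horizClassify_altivec (deleted above), horizontal deblocking now transposes the pixel neighbourhood into the 16-byte-aligned tempBlock, runs the vertical AltiVec classifier and filters on it with a constant stride of 16 (which is why the aligned fast paths added above always trigger for these calls), and transposes the result back. The tempBlock-48 base (three 16-byte rows back) lines the packed buffer up with the internal src2 += stride*3 / src2 += stride*4 offsets the vertical routines apply before loading. The shape of the trick, with hypothetical scalar stand-ins for the transpose and filter routines:

/* Editorial sketch: horizontal filtering via transposition. */
void deblock_horiz(uint8_t *block, int stride) {
  uint8_t tmp[8 * 8];                    /* aligned scratch with a fixed stride */
  transpose8x8(tmp, 8, block, stride);   /* columns become rows                 */
  deblock_vert(tmp, 8);                  /* reuse the vertical filter as-is     */
  transpose8x8(block, stride, tmp, 8);   /* write the rows back as columns      */
}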
