Revision 3ca96802 libavcodec/ppc/h264_template_altivec.c

View differences:

libavcodec/ppc/h264_template_altivec.c
27 27
                          ((8 - x) * (y)),
28 28
                          ((x) * (y))};
29 29
    register int i;
30
    vector unsigned char fperm;
31
    const vector signed int vABCD = vec_ld(0, ABCD);
32
    const vector signed short vA = vec_splat((vector signed short)vABCD, 1);
33
    const vector signed short vB = vec_splat((vector signed short)vABCD, 3);
34
    const vector signed short vC = vec_splat((vector signed short)vABCD, 5);
35
    const vector signed short vD = vec_splat((vector signed short)vABCD, 7);
36
    const vector signed int vzero = vec_splat_s32(0);
37
    const vector signed short v32ss = vec_sl(vec_splat_s16(1),vec_splat_u16(5));
38
    const vector unsigned short v6us = vec_splat_u16(6);
30
    vec_u8_t fperm;
31
    const vec_s32_t vABCD = vec_ld(0, ABCD);
32
    const vec_s16_t vA = vec_splat((vec_s16_t)vABCD, 1);
33
    const vec_s16_t vB = vec_splat((vec_s16_t)vABCD, 3);
34
    const vec_s16_t vC = vec_splat((vec_s16_t)vABCD, 5);
35
    const vec_s16_t vD = vec_splat((vec_s16_t)vABCD, 7);
36
    LOAD_ZERO;
37
    const vec_s16_t v32ss = vec_sl(vec_splat_s16(1),vec_splat_u16(5));
38
    const vec_u16_t v6us = vec_splat_u16(6);
39 39
    register int loadSecond = (((unsigned long)src) % 16) <= 7 ? 0 : 1;
40 40
    register int reallyBadAlign = (((unsigned long)src) % 16) == 15 ? 1 : 0;
41 41

  
42
    vector unsigned char vsrcAuc, vsrcBuc, vsrcperm0, vsrcperm1;
43
    vector unsigned char vsrc0uc, vsrc1uc;
44
    vector signed short vsrc0ssH, vsrc1ssH;
45
    vector unsigned char vsrcCuc, vsrc2uc, vsrc3uc;
46
    vector signed short vsrc2ssH, vsrc3ssH, psum;
47
    vector unsigned char vdst, ppsum, vfdst, fsum;
42
    vec_u8_t vsrcAuc, vsrcBuc, vsrcperm0, vsrcperm1;
43
    vec_u8_t vsrc0uc, vsrc1uc;
44
    vec_s16_t vsrc0ssH, vsrc1ssH;
45
    vec_u8_t vsrcCuc, vsrc2uc, vsrc3uc;
46
    vec_s16_t vsrc2ssH, vsrc3ssH, psum;
47
    vec_u8_t vdst, ppsum, vfdst, fsum;
48 48

  
49 49
  POWERPC_PERF_START_COUNT(PREFIX_h264_chroma_mc8_num, 1);
50 50

  
51 51
    if (((unsigned long)dst) % 16 == 0) {
52
      fperm = (vector unsigned char)AVV(0x10, 0x11, 0x12, 0x13,
53
                                        0x14, 0x15, 0x16, 0x17,
54
                                        0x08, 0x09, 0x0A, 0x0B,
55
                                        0x0C, 0x0D, 0x0E, 0x0F);
52
      fperm = (vec_u8_t)AVV(0x10, 0x11, 0x12, 0x13,
53
                            0x14, 0x15, 0x16, 0x17,
54
                            0x08, 0x09, 0x0A, 0x0B,
55
                            0x0C, 0x0D, 0x0E, 0x0F);
56 56
    } else {
57
      fperm = (vector unsigned char)AVV(0x00, 0x01, 0x02, 0x03,
58
                                        0x04, 0x05, 0x06, 0x07,
59
                                        0x18, 0x19, 0x1A, 0x1B,
60
                                        0x1C, 0x1D, 0x1E, 0x1F);
57
      fperm = (vec_u8_t)AVV(0x00, 0x01, 0x02, 0x03,
58
                            0x04, 0x05, 0x06, 0x07,
59
                            0x18, 0x19, 0x1A, 0x1B,
60
                            0x1C, 0x1D, 0x1E, 0x1F);
61 61
    }
62 62

  
63 63
    vsrcAuc = vec_ld(0, src);
......
73 73
    else
74 74
      vsrc1uc = vec_perm(vsrcAuc, vsrcBuc, vsrcperm1);
75 75

  
76
    vsrc0ssH = (vector signed short)vec_mergeh((vector unsigned char)vzero,
77
                                               (vector unsigned char)vsrc0uc);
78
    vsrc1ssH = (vector signed short)vec_mergeh((vector unsigned char)vzero,
79
                                               (vector unsigned char)vsrc1uc);
76
    vsrc0ssH = (vec_s16_t)vec_mergeh(zero_u8v,(vec_u8_t)vsrc0uc);
77
    vsrc1ssH = (vec_s16_t)vec_mergeh(zero_u8v,(vec_u8_t)vsrc1uc);
80 78

  
81 79
    if (!loadSecond) {// -> !reallyBadAlign
82 80
      for (i = 0 ; i < h ; i++) {
......
87 85
        vsrc2uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm0);
88 86
        vsrc3uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm1);
89 87

  
90
        vsrc2ssH = (vector signed short)vec_mergeh((vector unsigned char)vzero,
91
                                                (vector unsigned char)vsrc2uc);
92
        vsrc3ssH = (vector signed short)vec_mergeh((vector unsigned char)vzero,
93
                                                (vector unsigned char)vsrc3uc);
88
        vsrc2ssH = (vec_s16_t)vec_mergeh(zero_u8v,(vec_u8_t)vsrc2uc);
89
        vsrc3ssH = (vec_s16_t)vec_mergeh(zero_u8v,(vec_u8_t)vsrc3uc);
94 90

  
95 91
        psum = vec_mladd(vA, vsrc0ssH, vec_splat_s16(0));
96 92
        psum = vec_mladd(vB, vsrc1ssH, psum);
......
100 96
        psum = vec_sra(psum, v6us);
101 97

  
102 98
        vdst = vec_ld(0, dst);
103
        ppsum = (vector unsigned char)vec_packsu(psum, psum);
99
        ppsum = (vec_u8_t)vec_packsu(psum, psum);
104 100
        vfdst = vec_perm(vdst, ppsum, fperm);
105 101

  
106 102
        OP_U8_ALTIVEC(fsum, vfdst, vdst);
......
114 110
        src += stride;
115 111
      }
116 112
    } else {
117
        vector unsigned char vsrcDuc;
113
        vec_u8_t vsrcDuc;
118 114
      for (i = 0 ; i < h ; i++) {
119 115
        vsrcCuc = vec_ld(stride + 0, src);
120 116
        vsrcDuc = vec_ld(stride + 16, src);
......
125 121
        else
126 122
          vsrc3uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm1);
127 123

  
128
        vsrc2ssH = (vector signed short)vec_mergeh((vector unsigned char)vzero,
129
                                                (vector unsigned char)vsrc2uc);
130
        vsrc3ssH = (vector signed short)vec_mergeh((vector unsigned char)vzero,
131
                                                (vector unsigned char)vsrc3uc);
124
        vsrc2ssH = (vec_s16_t)vec_mergeh(zero_u8v,(vec_u8_t)vsrc2uc);
125
        vsrc3ssH = (vec_s16_t)vec_mergeh(zero_u8v,(vec_u8_t)vsrc3uc);
132 126

  
133 127
        psum = vec_mladd(vA, vsrc0ssH, vec_splat_s16(0));
134 128
        psum = vec_mladd(vB, vsrc1ssH, psum);
......
138 132
        psum = vec_sr(psum, v6us);
139 133

  
140 134
        vdst = vec_ld(0, dst);
141
        ppsum = (vector unsigned char)vec_pack(psum, psum);
135
        ppsum = (vec_u8_t)vec_pack(psum, psum);
142 136
        vfdst = vec_perm(vdst, ppsum, fperm);
143 137

  
144 138
        OP_U8_ALTIVEC(fsum, vfdst, vdst);
......
160 154
  POWERPC_PERF_DECLARE(PREFIX_h264_qpel16_h_lowpass_num, 1);
161 155
  register int i;
162 156

  
163
  const vector signed int vzero = vec_splat_s32(0);
164
  const vector unsigned char permM2 = vec_lvsl(-2, src);
165
  const vector unsigned char permM1 = vec_lvsl(-1, src);
166
  const vector unsigned char permP0 = vec_lvsl(+0, src);
167
  const vector unsigned char permP1 = vec_lvsl(+1, src);
168
  const vector unsigned char permP2 = vec_lvsl(+2, src);
169
  const vector unsigned char permP3 = vec_lvsl(+3, src);
170
  const vector signed short v5ss = vec_splat_s16(5);
171
  const vector unsigned short v5us = vec_splat_u16(5);
172
  const vector signed short v20ss = vec_sl(vec_splat_s16(5),vec_splat_u16(2));
173
  const vector signed short v16ss = vec_sl(vec_splat_s16(1),vec_splat_u16(4));
174
  const vector unsigned char dstperm = vec_lvsr(0, dst);
175
  const vector unsigned char neg1 =
176
                                (const vector unsigned char) vec_splat_s8(-1);
177

  
178
  const vector unsigned char dstmask =
179
                                vec_perm((const vector unsigned char)vzero,
180
                                                               neg1, dstperm);
181

  
182
  vector unsigned char srcM2, srcM1, srcP0, srcP1, srcP2, srcP3;
157
  LOAD_ZERO;
158
  const vec_u8_t permM2 = vec_lvsl(-2, src);
159
  const vec_u8_t permM1 = vec_lvsl(-1, src);
160
  const vec_u8_t permP0 = vec_lvsl(+0, src);
161
  const vec_u8_t permP1 = vec_lvsl(+1, src);
162
  const vec_u8_t permP2 = vec_lvsl(+2, src);
163
  const vec_u8_t permP3 = vec_lvsl(+3, src);
164
  const vec_s16_t v5ss = vec_splat_s16(5);
165
  const vec_u16_t v5us = vec_splat_u16(5);
166
  const vec_s16_t v20ss = vec_sl(vec_splat_s16(5),vec_splat_u16(2));
167
  const vec_s16_t v16ss = vec_sl(vec_splat_s16(1),vec_splat_u16(4));
168
  const vec_u8_t dstperm = vec_lvsr(0, dst);
169
  const vec_u8_t neg1 = (const vec_u8_t) vec_splat_s8(-1);
170
  const vec_u8_t dstmask = vec_perm(zero_u8v, neg1, dstperm);
171

  
172
  vec_u8_t srcM2, srcM1, srcP0, srcP1, srcP2, srcP3;
183 173

  
184 174
  register int align = ((((unsigned long)src) - 2) % 16);
185 175

  
186
  vector signed short srcP0A, srcP0B, srcP1A, srcP1B,
176
  vec_s16_t srcP0A, srcP0B, srcP1A, srcP1B,
187 177
                      srcP2A, srcP2B, srcP3A, srcP3B,
188 178
                      srcM1A, srcM1B, srcM2A, srcM2B,
189 179
                      sum1A, sum1B, sum2A, sum2B, sum3A, sum3B,
190 180
                      pp1A, pp1B, pp2A, pp2B, pp3A, pp3B,
191 181
                      psumA, psumB, sumA, sumB;
192 182

  
193
  vector unsigned char sum, dst1, dst2, vdst, fsum,
194
                       rsum, fdst1, fdst2;
183
  vec_u8_t sum, dst1, dst2, vdst, fsum, rsum, fdst1, fdst2;
195 184

  
196 185
  POWERPC_PERF_START_COUNT(PREFIX_h264_qpel16_h_lowpass_num, 1);
197 186

  
198 187
  for (i = 0 ; i < 16 ; i ++) {
199
    vector unsigned char srcR1 = vec_ld(-2, src);
200
    vector unsigned char srcR2 = vec_ld(14, src);
188
    vec_u8_t srcR1 = vec_ld(-2, src);
189
    vec_u8_t srcR2 = vec_ld(14, src);
201 190

  
202 191
    switch (align) {
203 192
    default: {
......
217 206
      srcP3 = srcR2;
218 207
    } break;
219 208
    case 12: {
220
      vector unsigned char srcR3 = vec_ld(30, src);
209
      vec_u8_t srcR3 = vec_ld(30, src);
221 210
      srcM2 = vec_perm(srcR1, srcR2, permM2);
222 211
      srcM1 = vec_perm(srcR1, srcR2, permM1);
223 212
      srcP0 = vec_perm(srcR1, srcR2, permP0);
......
226 215
      srcP3 = vec_perm(srcR2, srcR3, permP3);
227 216
    } break;
228 217
    case 13: {
229
      vector unsigned char srcR3 = vec_ld(30, src);
218
      vec_u8_t srcR3 = vec_ld(30, src);
230 219
      srcM2 = vec_perm(srcR1, srcR2, permM2);
231 220
      srcM1 = vec_perm(srcR1, srcR2, permM1);
232 221
      srcP0 = vec_perm(srcR1, srcR2, permP0);
......
235 224
      srcP3 = vec_perm(srcR2, srcR3, permP3);
236 225
    } break;
237 226
    case 14: {
238
      vector unsigned char srcR3 = vec_ld(30, src);
227
      vec_u8_t srcR3 = vec_ld(30, src);
239 228
      srcM2 = vec_perm(srcR1, srcR2, permM2);
240 229
      srcM1 = vec_perm(srcR1, srcR2, permM1);
241 230
      srcP0 = srcR2;
......
244 233
      srcP3 = vec_perm(srcR2, srcR3, permP3);
245 234
    } break;
246 235
    case 15: {
247
      vector unsigned char srcR3 = vec_ld(30, src);
236
      vec_u8_t srcR3 = vec_ld(30, src);
248 237
      srcM2 = vec_perm(srcR1, srcR2, permM2);
249 238
      srcM1 = srcR2;
250 239
      srcP0 = vec_perm(srcR2, srcR3, permP0);
......
254 243
    } break;
255 244
    }
256 245

  
257
    srcP0A = (vector signed short)
258
                vec_mergeh((vector unsigned char)vzero, srcP0);
259
    srcP0B = (vector signed short)
260
                vec_mergel((vector unsigned char)vzero, srcP0);
261
    srcP1A = (vector signed short)
262
                vec_mergeh((vector unsigned char)vzero, srcP1);
263
    srcP1B = (vector signed short)
264
                vec_mergel((vector unsigned char)vzero, srcP1);
265

  
266
    srcP2A = (vector signed short)
267
                vec_mergeh((vector unsigned char)vzero, srcP2);
268
    srcP2B = (vector signed short)
269
                vec_mergel((vector unsigned char)vzero, srcP2);
270
    srcP3A = (vector signed short)
271
                vec_mergeh((vector unsigned char)vzero, srcP3);
272
    srcP3B = (vector signed short)
273
                vec_mergel((vector unsigned char)vzero, srcP3);
274

  
275
    srcM1A = (vector signed short)
276
                vec_mergeh((vector unsigned char)vzero, srcM1);
277
    srcM1B = (vector signed short)
278
                vec_mergel((vector unsigned char)vzero, srcM1);
279
    srcM2A = (vector signed short)
280
                vec_mergeh((vector unsigned char)vzero, srcM2);
281
    srcM2B = (vector signed short)
282
                vec_mergel((vector unsigned char)vzero, srcM2);
246
    srcP0A = (vec_s16_t) vec_mergeh(zero_u8v, srcP0);
247
    srcP0B = (vec_s16_t) vec_mergel(zero_u8v, srcP0);
248
    srcP1A = (vec_s16_t) vec_mergeh(zero_u8v, srcP1);
249
    srcP1B = (vec_s16_t) vec_mergel(zero_u8v, srcP1);
250

  
251
    srcP2A = (vec_s16_t) vec_mergeh(zero_u8v, srcP2);
252
    srcP2B = (vec_s16_t) vec_mergel(zero_u8v, srcP2);
253
    srcP3A = (vec_s16_t) vec_mergeh(zero_u8v, srcP3);
254
    srcP3B = (vec_s16_t) vec_mergel(zero_u8v, srcP3);
255

  
256
    srcM1A = (vec_s16_t) vec_mergeh(zero_u8v, srcM1);
257
    srcM1B = (vec_s16_t) vec_mergel(zero_u8v, srcM1);
258
    srcM2A = (vec_s16_t) vec_mergeh(zero_u8v, srcM2);
259
    srcM2B = (vec_s16_t) vec_mergel(zero_u8v, srcM2);
283 260

  
284 261
    sum1A = vec_adds(srcP0A, srcP1A);
285 262
    sum1B = vec_adds(srcP0B, srcP1B);
......
291 268
    pp1A = vec_mladd(sum1A, v20ss, v16ss);
292 269
    pp1B = vec_mladd(sum1B, v20ss, v16ss);
293 270

  
294
    pp2A = vec_mladd(sum2A, v5ss, (vector signed short)vzero);
295
    pp2B = vec_mladd(sum2B, v5ss, (vector signed short)vzero);
271
    pp2A = vec_mladd(sum2A, v5ss, zero_s16v);
272
    pp2B = vec_mladd(sum2B, v5ss, zero_s16v);
296 273

  
297 274
    pp3A = vec_add(sum3A, pp1A);
298 275
    pp3B = vec_add(sum3B, pp1B);
......
330 307

  
331 308
  register int i;
332 309

  
333
  const vector signed int vzero = vec_splat_s32(0);
334
  const vector unsigned char perm = vec_lvsl(0, src);
335
  const vector signed short v20ss = vec_sl(vec_splat_s16(5),vec_splat_u16(2));
336
  const vector unsigned short v5us = vec_splat_u16(5);
337
  const vector signed short v5ss = vec_splat_s16(5);
338
  const vector signed short v16ss = vec_sl(vec_splat_s16(1),vec_splat_u16(4));
339
  const vector unsigned char dstperm = vec_lvsr(0, dst);
340
  const vector unsigned char neg1 = (const vector unsigned char)vec_splat_s8(-1);
341
  const vector unsigned char dstmask = vec_perm((const vector unsigned char)vzero, neg1, dstperm);
310
  LOAD_ZERO;
311
  const vec_u8_t perm = vec_lvsl(0, src);
312
  const vec_s16_t v20ss = vec_sl(vec_splat_s16(5),vec_splat_u16(2));
313
  const vec_u16_t v5us = vec_splat_u16(5);
314
  const vec_s16_t v5ss = vec_splat_s16(5);
315
  const vec_s16_t v16ss = vec_sl(vec_splat_s16(1),vec_splat_u16(4));
316
  const vec_u8_t dstperm = vec_lvsr(0, dst);
317
  const vec_u8_t neg1 = (const vec_u8_t)vec_splat_s8(-1);
318
  const vec_u8_t dstmask = vec_perm(zero_u8v, neg1, dstperm);
342 319

  
343 320
  uint8_t *srcbis = src - (srcStride * 2);
344 321

  
345
  const vector unsigned char srcM2a = vec_ld(0, srcbis);
346
  const vector unsigned char srcM2b = vec_ld(16, srcbis);
347
  const vector unsigned char srcM2 = vec_perm(srcM2a, srcM2b, perm);
322
  const vec_u8_t srcM2a = vec_ld(0, srcbis);
323
  const vec_u8_t srcM2b = vec_ld(16, srcbis);
324
  const vec_u8_t srcM2 = vec_perm(srcM2a, srcM2b, perm);
348 325
//  srcbis += srcStride;
349
  const vector unsigned char srcM1a = vec_ld(0, srcbis += srcStride);
350
  const vector unsigned char srcM1b = vec_ld(16, srcbis);
351
  const vector unsigned char srcM1 = vec_perm(srcM1a, srcM1b, perm);
326
  const vec_u8_t srcM1a = vec_ld(0, srcbis += srcStride);
327
  const vec_u8_t srcM1b = vec_ld(16, srcbis);
328
  const vec_u8_t srcM1 = vec_perm(srcM1a, srcM1b, perm);
352 329
//  srcbis += srcStride;
353
  const vector unsigned char srcP0a = vec_ld(0, srcbis += srcStride);
354
  const vector unsigned char srcP0b = vec_ld(16, srcbis);
355
  const vector unsigned char srcP0 = vec_perm(srcP0a, srcP0b, perm);
330
  const vec_u8_t srcP0a = vec_ld(0, srcbis += srcStride);
331
  const vec_u8_t srcP0b = vec_ld(16, srcbis);
332
  const vec_u8_t srcP0 = vec_perm(srcP0a, srcP0b, perm);
356 333
//  srcbis += srcStride;
357
  const vector unsigned char srcP1a = vec_ld(0, srcbis += srcStride);
358
  const vector unsigned char srcP1b = vec_ld(16, srcbis);
359
  const vector unsigned char srcP1 = vec_perm(srcP1a, srcP1b, perm);
334
  const vec_u8_t srcP1a = vec_ld(0, srcbis += srcStride);
335
  const vec_u8_t srcP1b = vec_ld(16, srcbis);
336
  const vec_u8_t srcP1 = vec_perm(srcP1a, srcP1b, perm);
360 337
//  srcbis += srcStride;
361
  const vector unsigned char srcP2a = vec_ld(0, srcbis += srcStride);
362
  const vector unsigned char srcP2b = vec_ld(16, srcbis);
363
  const vector unsigned char srcP2 = vec_perm(srcP2a, srcP2b, perm);
338
  const vec_u8_t srcP2a = vec_ld(0, srcbis += srcStride);
339
  const vec_u8_t srcP2b = vec_ld(16, srcbis);
340
  const vec_u8_t srcP2 = vec_perm(srcP2a, srcP2b, perm);
364 341
//  srcbis += srcStride;
365 342

  
366
  vector signed short srcM2ssA = (vector signed short)
367
                                vec_mergeh((vector unsigned char)vzero, srcM2);
368
  vector signed short srcM2ssB = (vector signed short)
369
                                vec_mergel((vector unsigned char)vzero, srcM2);
370
  vector signed short srcM1ssA = (vector signed short)
371
                                vec_mergeh((vector unsigned char)vzero, srcM1);
372
  vector signed short srcM1ssB = (vector signed short)
373
                                vec_mergel((vector unsigned char)vzero, srcM1);
374
  vector signed short srcP0ssA = (vector signed short)
375
                                vec_mergeh((vector unsigned char)vzero, srcP0);
376
  vector signed short srcP0ssB = (vector signed short)
377
                                vec_mergel((vector unsigned char)vzero, srcP0);
378
  vector signed short srcP1ssA = (vector signed short)
379
                                vec_mergeh((vector unsigned char)vzero, srcP1);
380
  vector signed short srcP1ssB = (vector signed short)
381
                                vec_mergel((vector unsigned char)vzero, srcP1);
382
  vector signed short srcP2ssA = (vector signed short)
383
                                vec_mergeh((vector unsigned char)vzero, srcP2);
384
  vector signed short srcP2ssB = (vector signed short)
385
                                vec_mergel((vector unsigned char)vzero, srcP2);
386

  
387
  vector signed short pp1A, pp1B, pp2A, pp2B, pp3A, pp3B,
343
  vec_s16_t srcM2ssA = (vec_s16_t) vec_mergeh(zero_u8v, srcM2);
344
  vec_s16_t srcM2ssB = (vec_s16_t) vec_mergel(zero_u8v, srcM2);
345
  vec_s16_t srcM1ssA = (vec_s16_t) vec_mergeh(zero_u8v, srcM1);
346
  vec_s16_t srcM1ssB = (vec_s16_t) vec_mergel(zero_u8v, srcM1);
347
  vec_s16_t srcP0ssA = (vec_s16_t) vec_mergeh(zero_u8v, srcP0);
348
  vec_s16_t srcP0ssB = (vec_s16_t) vec_mergel(zero_u8v, srcP0);
349
  vec_s16_t srcP1ssA = (vec_s16_t) vec_mergeh(zero_u8v, srcP1);
350
  vec_s16_t srcP1ssB = (vec_s16_t) vec_mergel(zero_u8v, srcP1);
351
  vec_s16_t srcP2ssA = (vec_s16_t) vec_mergeh(zero_u8v, srcP2);
352
  vec_s16_t srcP2ssB = (vec_s16_t) vec_mergel(zero_u8v, srcP2);
353

  
354
  vec_s16_t pp1A, pp1B, pp2A, pp2B, pp3A, pp3B,
388 355
                      psumA, psumB, sumA, sumB,
389 356
                      srcP3ssA, srcP3ssB,
390 357
                      sum1A, sum1B, sum2A, sum2B, sum3A, sum3B;
391 358

  
392
  vector unsigned char sum, dst1, dst2, vdst, fsum, rsum, fdst1, fdst2,
393
                       srcP3a, srcP3b, srcP3;
359
  vec_u8_t sum, dst1, dst2, vdst, fsum, rsum, fdst1, fdst2, srcP3a, srcP3b, srcP3;
394 360

  
395 361
  POWERPC_PERF_START_COUNT(PREFIX_h264_qpel16_v_lowpass_num, 1);
396 362

  
......
398 364
    srcP3a = vec_ld(0, srcbis += srcStride);
399 365
    srcP3b = vec_ld(16, srcbis);
400 366
    srcP3 = vec_perm(srcP3a, srcP3b, perm);
401
    srcP3ssA = (vector signed short)
402
                                vec_mergeh((vector unsigned char)vzero, srcP3);
403
    srcP3ssB = (vector signed short)
404
                                vec_mergel((vector unsigned char)vzero, srcP3);
367
    srcP3ssA = (vec_s16_t) vec_mergeh(zero_u8v, srcP3);
368
    srcP3ssB = (vec_s16_t) vec_mergel(zero_u8v, srcP3);
405 369
//    srcbis += srcStride;
406 370

  
407 371
    sum1A = vec_adds(srcP0ssA, srcP1ssA);
......
425 389
    pp1A = vec_mladd(sum1A, v20ss, v16ss);
426 390
    pp1B = vec_mladd(sum1B, v20ss, v16ss);
427 391

  
428
    pp2A = vec_mladd(sum2A, v5ss, (vector signed short)vzero);
429
    pp2B = vec_mladd(sum2B, v5ss, (vector signed short)vzero);
392
    pp2A = vec_mladd(sum2A, v5ss, zero_s16v);
393
    pp2B = vec_mladd(sum2B, v5ss, zero_s16v);
430 394

  
431 395
    pp3A = vec_add(sum3A, pp1A);
432 396
    pp3B = vec_add(sum3B, pp1B);
......
461 425
static void PREFIX_h264_qpel16_hv_lowpass_altivec(uint8_t * dst, int16_t * tmp, uint8_t * src, int dstStride, int tmpStride, int srcStride) {
462 426
  POWERPC_PERF_DECLARE(PREFIX_h264_qpel16_hv_lowpass_num, 1);
463 427
  register int i;
464
  const vector signed int vzero = vec_splat_s32(0);
465
  const vector unsigned char permM2 = vec_lvsl(-2, src);
466
  const vector unsigned char permM1 = vec_lvsl(-1, src);
467
  const vector unsigned char permP0 = vec_lvsl(+0, src);
468
  const vector unsigned char permP1 = vec_lvsl(+1, src);
469
  const vector unsigned char permP2 = vec_lvsl(+2, src);
470
  const vector unsigned char permP3 = vec_lvsl(+3, src);
471
  const vector signed short v20ss = vec_sl(vec_splat_s16(5),vec_splat_u16(2));
472
  const vector unsigned int v10ui = vec_splat_u32(10);
473
  const vector signed short v5ss = vec_splat_s16(5);
474
  const vector signed short v1ss = vec_splat_s16(1);
475
  const vector signed int v512si = vec_sl(vec_splat_s32(1),vec_splat_u32(9));
476
  const vector unsigned int v16ui = vec_sl(vec_splat_u32(1),vec_splat_u32(4));
428
  LOAD_ZERO;
429
  const vec_u8_t permM2 = vec_lvsl(-2, src);
430
  const vec_u8_t permM1 = vec_lvsl(-1, src);
431
  const vec_u8_t permP0 = vec_lvsl(+0, src);
432
  const vec_u8_t permP1 = vec_lvsl(+1, src);
433
  const vec_u8_t permP2 = vec_lvsl(+2, src);
434
  const vec_u8_t permP3 = vec_lvsl(+3, src);
435
  const vec_s16_t v20ss = vec_sl(vec_splat_s16(5),vec_splat_u16(2));
436
  const vec_u32_t v10ui = vec_splat_u32(10);
437
  const vec_s16_t v5ss = vec_splat_s16(5);
438
  const vec_s16_t v1ss = vec_splat_s16(1);
439
  const vec_s32_t v512si = vec_sl(vec_splat_s32(1),vec_splat_u32(9));
440
  const vec_u32_t v16ui = vec_sl(vec_splat_u32(1),vec_splat_u32(4));
477 441

  
478 442
  register int align = ((((unsigned long)src) - 2) % 16);
479 443

  
480
  const vector unsigned char neg1 = (const vector unsigned char)
481
                                                        vec_splat_s8(-1);
444
  const vec_u8_t neg1 = (const vec_u8_t) vec_splat_s8(-1);
482 445

  
483
  vector signed short srcP0A, srcP0B, srcP1A, srcP1B,
446
  vec_s16_t srcP0A, srcP0B, srcP1A, srcP1B,
484 447
                      srcP2A, srcP2B, srcP3A, srcP3B,
485 448
                      srcM1A, srcM1B, srcM2A, srcM2B,
486 449
                      sum1A, sum1B, sum2A, sum2B, sum3A, sum3B,
487 450
                      pp1A, pp1B, pp2A, pp2B, psumA, psumB;
488 451

  
489
  const vector unsigned char dstperm = vec_lvsr(0, dst);
452
  const vec_u8_t dstperm = vec_lvsr(0, dst);
490 453

  
491
  const vector unsigned char dstmask = vec_perm((const vector unsigned char)vzero, neg1, dstperm);
454
  const vec_u8_t dstmask = vec_perm(zero_u8v, neg1, dstperm);
492 455

  
493
  const vector unsigned char mperm = (const vector unsigned char)
456
  const vec_u8_t mperm = (const vec_u8_t)
494 457
    AVV(0x00, 0x08, 0x01, 0x09, 0x02, 0x0A, 0x03, 0x0B,
495 458
        0x04, 0x0C, 0x05, 0x0D, 0x06, 0x0E, 0x07, 0x0F);
496 459
  int16_t *tmpbis = tmp;
497 460

  
498
  vector signed short tmpM1ssA, tmpM1ssB, tmpM2ssA, tmpM2ssB,
461
  vec_s16_t tmpM1ssA, tmpM1ssB, tmpM2ssA, tmpM2ssB,
499 462
                      tmpP0ssA, tmpP0ssB, tmpP1ssA, tmpP1ssB,
500 463
                      tmpP2ssA, tmpP2ssB;
501 464

  
502
  vector signed int pp1Ae, pp1Ao, pp1Be, pp1Bo, pp2Ae, pp2Ao, pp2Be, pp2Bo,
465
  vec_s32_t pp1Ae, pp1Ao, pp1Be, pp1Bo, pp2Ae, pp2Ao, pp2Be, pp2Bo,
503 466
                    pp3Ae, pp3Ao, pp3Be, pp3Bo, pp1cAe, pp1cAo, pp1cBe, pp1cBo,
504 467
                    pp32Ae, pp32Ao, pp32Be, pp32Bo, sumAe, sumAo, sumBe, sumBo,
505 468
                    ssumAe, ssumAo, ssumBe, ssumBo;
506
  vector unsigned char fsum, sumv, sum, dst1, dst2, vdst,
507
                       rsum, fdst1, fdst2;
508
  vector signed short ssume, ssumo;
469
  vec_u8_t fsum, sumv, sum, dst1, dst2, vdst, rsum, fdst1, fdst2;
470
  vec_s16_t ssume, ssumo;
509 471

  
510 472
  POWERPC_PERF_START_COUNT(PREFIX_h264_qpel16_hv_lowpass_num, 1);
511 473
  src -= (2 * srcStride);
512 474
  for (i = 0 ; i < 21 ; i ++) {
513
    vector unsigned char srcM2, srcM1, srcP0, srcP1, srcP2, srcP3;
514
    vector unsigned char srcR1 = vec_ld(-2, src);
515
    vector unsigned char srcR2 = vec_ld(14, src);
475
    vec_u8_t srcM2, srcM1, srcP0, srcP1, srcP2, srcP3;
476
    vec_u8_t srcR1 = vec_ld(-2, src);
477
    vec_u8_t srcR2 = vec_ld(14, src);
516 478

  
517 479
    switch (align) {
518 480
    default: {
......
532 494
      srcP3 = srcR2;
533 495
    } break;
534 496
    case 12: {
535
      vector unsigned char srcR3 = vec_ld(30, src);
497
      vec_u8_t srcR3 = vec_ld(30, src);
536 498
      srcM2 = vec_perm(srcR1, srcR2, permM2);
537 499
      srcM1 = vec_perm(srcR1, srcR2, permM1);
538 500
      srcP0 = vec_perm(srcR1, srcR2, permP0);
......
541 503
      srcP3 = vec_perm(srcR2, srcR3, permP3);
542 504
    } break;
543 505
    case 13: {
544
      vector unsigned char srcR3 = vec_ld(30, src);
506
      vec_u8_t srcR3 = vec_ld(30, src);
545 507
      srcM2 = vec_perm(srcR1, srcR2, permM2);
546 508
      srcM1 = vec_perm(srcR1, srcR2, permM1);
547 509
      srcP0 = vec_perm(srcR1, srcR2, permP0);
......
550 512
      srcP3 = vec_perm(srcR2, srcR3, permP3);
551 513
    } break;
552 514
    case 14: {
553
      vector unsigned char srcR3 = vec_ld(30, src);
515
      vec_u8_t srcR3 = vec_ld(30, src);
554 516
      srcM2 = vec_perm(srcR1, srcR2, permM2);
555 517
      srcM1 = vec_perm(srcR1, srcR2, permM1);
556 518
      srcP0 = srcR2;
......
559 521
      srcP3 = vec_perm(srcR2, srcR3, permP3);
560 522
    } break;
561 523
    case 15: {
562
      vector unsigned char srcR3 = vec_ld(30, src);
524
      vec_u8_t srcR3 = vec_ld(30, src);
563 525
      srcM2 = vec_perm(srcR1, srcR2, permM2);
564 526
      srcM1 = srcR2;
565 527
      srcP0 = vec_perm(srcR2, srcR3, permP0);
......
569 531
    } break;
570 532
    }
571 533

  
572
    srcP0A = (vector signed short)
573
                            vec_mergeh((vector unsigned char)vzero, srcP0);
574
    srcP0B = (vector signed short)
575
                            vec_mergel((vector unsigned char)vzero, srcP0);
576
    srcP1A = (vector signed short)
577
                            vec_mergeh((vector unsigned char)vzero, srcP1);
578
    srcP1B = (vector signed short)
579
                            vec_mergel((vector unsigned char)vzero, srcP1);
580

  
581
    srcP2A = (vector signed short)
582
                            vec_mergeh((vector unsigned char)vzero, srcP2);
583
    srcP2B = (vector signed short)
584
                            vec_mergel((vector unsigned char)vzero, srcP2);
585
    srcP3A = (vector signed short)
586
                            vec_mergeh((vector unsigned char)vzero, srcP3);
587
    srcP3B = (vector signed short)
588
                            vec_mergel((vector unsigned char)vzero, srcP3);
589

  
590
    srcM1A = (vector signed short)
591
                            vec_mergeh((vector unsigned char)vzero, srcM1);
592
    srcM1B = (vector signed short)
593
                            vec_mergel((vector unsigned char)vzero, srcM1);
594
    srcM2A = (vector signed short)
595
                            vec_mergeh((vector unsigned char)vzero, srcM2);
596
    srcM2B = (vector signed short)
597
                            vec_mergel((vector unsigned char)vzero, srcM2);
534
    srcP0A = (vec_s16_t) vec_mergeh(zero_u8v, srcP0);
535
    srcP0B = (vec_s16_t) vec_mergel(zero_u8v, srcP0);
536
    srcP1A = (vec_s16_t) vec_mergeh(zero_u8v, srcP1);
537
    srcP1B = (vec_s16_t) vec_mergel(zero_u8v, srcP1);
538

  
539
    srcP2A = (vec_s16_t) vec_mergeh(zero_u8v, srcP2);
540
    srcP2B = (vec_s16_t) vec_mergel(zero_u8v, srcP2);
541
    srcP3A = (vec_s16_t) vec_mergeh(zero_u8v, srcP3);
542
    srcP3B = (vec_s16_t) vec_mergel(zero_u8v, srcP3);
543

  
544
    srcM1A = (vec_s16_t) vec_mergeh(zero_u8v, srcM1);
545
    srcM1B = (vec_s16_t) vec_mergel(zero_u8v, srcM1);
546
    srcM2A = (vec_s16_t) vec_mergeh(zero_u8v, srcM2);
547
    srcM2B = (vec_s16_t) vec_mergel(zero_u8v, srcM2);
598 548

  
599 549
    sum1A = vec_adds(srcP0A, srcP1A);
600 550
    sum1B = vec_adds(srcP0B, srcP1B);
......
606 556
    pp1A = vec_mladd(sum1A, v20ss, sum3A);
607 557
    pp1B = vec_mladd(sum1B, v20ss, sum3B);
608 558

  
609
    pp2A = vec_mladd(sum2A, v5ss, (vector signed short)vzero);
610
    pp2B = vec_mladd(sum2B, v5ss, (vector signed short)vzero);
559
    pp2A = vec_mladd(sum2A, v5ss, zero_s16v);
560
    pp2B = vec_mladd(sum2B, v5ss, zero_s16v);
611 561

  
612 562
    psumA = vec_sub(pp1A, pp2A);
613 563
    psumB = vec_sub(pp1B, pp2B);
......
636 586
  tmpbis += tmpStride;
637 587

  
638 588
  for (i = 0 ; i < 16 ; i++) {
639
    const vector signed short tmpP3ssA = vec_ld(0, tmpbis);
640
    const vector signed short tmpP3ssB = vec_ld(16, tmpbis);
589
    const vec_s16_t tmpP3ssA = vec_ld(0, tmpbis);
590
    const vec_s16_t tmpP3ssB = vec_ld(16, tmpbis);
641 591

  
642
    const vector signed short sum1A = vec_adds(tmpP0ssA, tmpP1ssA);
643
    const vector signed short sum1B = vec_adds(tmpP0ssB, tmpP1ssB);
644
    const vector signed short sum2A = vec_adds(tmpM1ssA, tmpP2ssA);
645
    const vector signed short sum2B = vec_adds(tmpM1ssB, tmpP2ssB);
646
    const vector signed short sum3A = vec_adds(tmpM2ssA, tmpP3ssA);
647
    const vector signed short sum3B = vec_adds(tmpM2ssB, tmpP3ssB);
592
    const vec_s16_t sum1A = vec_adds(tmpP0ssA, tmpP1ssA);
593
    const vec_s16_t sum1B = vec_adds(tmpP0ssB, tmpP1ssB);
594
    const vec_s16_t sum2A = vec_adds(tmpM1ssA, tmpP2ssA);
595
    const vec_s16_t sum2B = vec_adds(tmpM1ssB, tmpP2ssB);
596
    const vec_s16_t sum3A = vec_adds(tmpM2ssA, tmpP3ssA);
597
    const vec_s16_t sum3B = vec_adds(tmpM2ssB, tmpP3ssB);
648 598

  
649 599
    tmpbis += tmpStride;
650 600

  
......
669 619
    pp2Be = vec_mule(sum2B, v5ss);
670 620
    pp2Bo = vec_mulo(sum2B, v5ss);
671 621

  
672
    pp3Ae = vec_sra((vector signed int)sum3A, v16ui);
622
    pp3Ae = vec_sra((vec_s32_t)sum3A, v16ui);
673 623
    pp3Ao = vec_mulo(sum3A, v1ss);
674
    pp3Be = vec_sra((vector signed int)sum3B, v16ui);
624
    pp3Be = vec_sra((vec_s32_t)sum3B, v16ui);
675 625
    pp3Bo = vec_mulo(sum3B, v1ss);
676 626

  
677 627
    pp1cAe = vec_add(pp1Ae, v512si);

Also available in: Unified diff