Revision e8772eec libavcodec/ppc/dsputil_h264_template_altivec.c

View differences:

libavcodec/ppc/dsputil_h264_template_altivec.c
19 19
/* this code assumes that stride % 16 == 0 */
20 20
void PREFIX_h264_chroma_mc8_altivec(uint8_t * dst, uint8_t * src, int stride, int h, int x, int y) {
21 21
  POWERPC_PERF_DECLARE(PREFIX_h264_chroma_mc8_num, 1);
22
  POWERPC_PERF_START_COUNT(PREFIX_h264_chroma_mc8_num, 1);
23
    signed int ABCD[4] __attribute__((aligned(16)));
22
    signed int ABCD[4] __attribute__((aligned(16))) =
23
                        {((8 - x) * (8 - y)),
24
                          ((x) * (8 - y)),
25
                          ((8 - x) * (y)),
26
                          ((x) * (y))};
24 27
    register int i;
25
    ABCD[0] = ((8 - x) * (8 - y));
26
    ABCD[1] = ((x) * (8 - y));
27
    ABCD[2] = ((8 - x) * (y));
28
    ABCD[3] = ((x) * (y));
28
    vector unsigned char fperm;
29 29
    const vector signed int vABCD = vec_ld(0, ABCD);
30 30
    const vector signed short vA = vec_splat((vector signed short)vABCD, 1);
31 31
    const vector signed short vB = vec_splat((vector signed short)vABCD, 3);
......
34 34
    const vector signed int vzero = vec_splat_s32(0);
35 35
    const vector signed short v32ss = vec_sl(vec_splat_s16(1),vec_splat_u16(5));
36 36
    const vector unsigned short v6us = vec_splat_u16(6);
37
    register int loadSecond = (((unsigned long)src) % 16) <= 7 ? 0 : 1;
38
    register int reallyBadAlign = (((unsigned long)src) % 16) == 15 ? 1 : 0;
37 39

  
38
    vector unsigned char fperm;
40
    vector unsigned char vsrcAuc, vsrcBuc, vsrcperm0, vsrcperm1;
41
    vector unsigned char vsrc0uc, vsrc1uc;
42
    vector signed short vsrc0ssH, vsrc1ssH;
43
    vector unsigned char vsrcCuc, vsrc2uc, vsrc3uc;
44
    vector signed short vsrc2ssH, vsrc3ssH, psum;
45
    vector unsigned char vdst, ppsum, vfdst, fsum;
46

  
47
  POWERPC_PERF_START_COUNT(PREFIX_h264_chroma_mc8_num, 1);
39 48

  
40 49
    if (((unsigned long)dst) % 16 == 0) {
41
      fperm = (vector unsigned char)AVV(0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17,
42
                                        0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F);
50
      fperm = (vector unsigned char)AVV(0x10, 0x11, 0x12, 0x13,
51
                                        0x14, 0x15, 0x16, 0x17,
52
                                        0x08, 0x09, 0x0A, 0x0B,
53
                                        0x0C, 0x0D, 0x0E, 0x0F);
43 54
    } else {
44
      fperm = (vector unsigned char)AVV(0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
45
                                        0x18, 0x19, 0x1A, 0x1B, 0x1C, 0x1D, 0x1E, 0x1F);
55
      fperm = (vector unsigned char)AVV(0x00, 0x01, 0x02, 0x03,
56
                                        0x04, 0x05, 0x06, 0x07,
57
                                        0x18, 0x19, 0x1A, 0x1B,
58
                                        0x1C, 0x1D, 0x1E, 0x1F);
46 59
    }
47 60

  
48
    register int loadSecond = (((unsigned long)src) % 16) <= 7 ? 0 : 1;
49
    register int reallyBadAlign = (((unsigned long)src) % 16) == 15 ? 1 : 0;
50

  
51
    vector unsigned char vsrcAuc;
52
    vector unsigned char vsrcBuc;
53
    vector unsigned char vsrcperm0;
54
    vector unsigned char vsrcperm1;
55 61
    vsrcAuc = vec_ld(0, src);
62

  
56 63
    if (loadSecond)
57 64
      vsrcBuc = vec_ld(16, src);
58 65
    vsrcperm0 = vec_lvsl(0, src);
59 66
    vsrcperm1 = vec_lvsl(1, src);
60 67

  
61
    vector unsigned char vsrc0uc;
62
    vector unsigned char vsrc1uc;
63 68
    vsrc0uc = vec_perm(vsrcAuc, vsrcBuc, vsrcperm0);
64 69
    if (reallyBadAlign)
65 70
      vsrc1uc = vsrcBuc;
66 71
    else
67 72
      vsrc1uc = vec_perm(vsrcAuc, vsrcBuc, vsrcperm1);
68 73

  
69
    vector signed short vsrc0ssH = (vector signed short)vec_mergeh((vector unsigned char)vzero, (vector unsigned char)vsrc0uc);
70
    vector signed short vsrc1ssH = (vector signed short)vec_mergeh((vector unsigned char)vzero, (vector unsigned char)vsrc1uc);
74
    vsrc0ssH = (vector signed short)vec_mergeh((vector unsigned char)vzero,
75
                                               (vector unsigned char)vsrc0uc);
76
    vsrc1ssH = (vector signed short)vec_mergeh((vector unsigned char)vzero,
77
                                               (vector unsigned char)vsrc1uc);
71 78

  
72 79
    if (!loadSecond) {// -> !reallyBadAlign
73 80
      for (i = 0 ; i < h ; i++) {
74
        vector unsigned char vsrcCuc;
81

  
82

  
75 83
        vsrcCuc = vec_ld(stride + 0, src);
76 84

  
77
        vector unsigned char vsrc2uc;
78
        vector unsigned char vsrc3uc;
79 85
        vsrc2uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm0);
80 86
        vsrc3uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm1);
81 87

  
82
        vector signed short vsrc2ssH = (vector signed short)vec_mergeh((vector unsigned char)vzero, (vector unsigned char)vsrc2uc);
83
        vector signed short vsrc3ssH = (vector signed short)vec_mergeh((vector unsigned char)vzero, (vector unsigned char)vsrc3uc);
84

  
85
        vector signed short psum;
88
        vsrc2ssH = (vector signed short)vec_mergeh((vector unsigned char)vzero,
89
                                                (vector unsigned char)vsrc2uc);
90
        vsrc3ssH = (vector signed short)vec_mergeh((vector unsigned char)vzero,
91
                                                (vector unsigned char)vsrc3uc);
86 92

  
87 93
        psum = vec_mladd(vA, vsrc0ssH, vec_splat_s16(0));
88 94
        psum = vec_mladd(vB, vsrc1ssH, psum);
......
91 97
        psum = vec_add(v32ss, psum);
92 98
        psum = vec_sra(psum, v6us);
93 99

  
94
        vector unsigned char vdst = vec_ld(0, dst);
95
        vector unsigned char ppsum = (vector unsigned char)vec_packsu(psum, psum);
96

  
97
        vector unsigned char vfdst = vec_perm(vdst, ppsum, fperm);
98
        vector unsigned char fsum;
100
        vdst = vec_ld(0, dst);
101
        ppsum = (vector unsigned char)vec_packsu(psum, psum);
102
        vfdst = vec_perm(vdst, ppsum, fperm);
99 103

  
100 104
        OP_U8_ALTIVEC(fsum, vfdst, vdst);
101 105

  
......
108 112
        src += stride;
109 113
      }
110 114
    } else {
111
      for (i = 0 ; i < h ; i++) {
112
        vector unsigned char vsrcCuc;
113 115
        vector unsigned char vsrcDuc;
116
      for (i = 0 ; i < h ; i++) {
114 117
        vsrcCuc = vec_ld(stride + 0, src);
115 118
        vsrcDuc = vec_ld(stride + 16, src);
116 119

  
117
        vector unsigned char vsrc2uc;
118
        vector unsigned char vsrc3uc;
119 120
        vsrc2uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm0);
120 121
        if (reallyBadAlign)
121 122
          vsrc3uc = vsrcDuc;
122 123
        else
123 124
          vsrc3uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm1);
124 125

  
125
        vector signed short vsrc2ssH = (vector signed short)vec_mergeh((vector unsigned char)vzero, (vector unsigned char)vsrc2uc);
126
        vector signed short vsrc3ssH = (vector signed short)vec_mergeh((vector unsigned char)vzero, (vector unsigned char)vsrc3uc);
127

  
128
        vector signed short psum;
126
        vsrc2ssH = (vector signed short)vec_mergeh((vector unsigned char)vzero,
127
                                                (vector unsigned char)vsrc2uc);
128
        vsrc3ssH = (vector signed short)vec_mergeh((vector unsigned char)vzero,
129
                                                (vector unsigned char)vsrc3uc);
129 130

  
130 131
        psum = vec_mladd(vA, vsrc0ssH, vec_splat_s16(0));
131 132
        psum = vec_mladd(vB, vsrc1ssH, psum);
......
134 135
        psum = vec_add(v32ss, psum);
135 136
        psum = vec_sr(psum, v6us);
136 137

  
137
        vector unsigned char vdst = vec_ld(0, dst);
138
        vector unsigned char ppsum = (vector unsigned char)vec_pack(psum, psum);
139

  
140
        vector unsigned char vfdst = vec_perm(vdst, ppsum, fperm);
141
        vector unsigned char fsum;
138
        vdst = vec_ld(0, dst);
139
        ppsum = (vector unsigned char)vec_pack(psum, psum);
140
        vfdst = vec_perm(vdst, ppsum, fperm);
142 141

  
143 142
        OP_U8_ALTIVEC(fsum, vfdst, vdst);
144 143

  
......
157 156
/* this code assumes stride % 16 == 0 */
158 157
static void PREFIX_h264_qpel16_h_lowpass_altivec(uint8_t * dst, uint8_t * src, int dstStride, int srcStride) {
159 158
  POWERPC_PERF_DECLARE(PREFIX_h264_qpel16_h_lowpass_num, 1);
160
  POWERPC_PERF_START_COUNT(PREFIX_h264_qpel16_h_lowpass_num, 1);
161 159
  register int i;
162 160

  
163 161
  const vector signed int vzero = vec_splat_s32(0);
......
172 170
  const vector signed short v20ss = vec_sl(vec_splat_s16(5),vec_splat_u16(2));
173 171
  const vector signed short v16ss = vec_sl(vec_splat_s16(1),vec_splat_u16(4));
174 172
  const vector unsigned char dstperm = vec_lvsr(0, dst);
175
  const vector unsigned char neg1 = (const vector unsigned char)vec_splat_s8(-1);
176
  const vector unsigned char dstmask = vec_perm((const vector unsigned char)vzero, neg1, dstperm);
173
  const vector unsigned char neg1 =
174
                                (const vector unsigned char) vec_splat_s8(-1);
175

  
176
  const vector unsigned char dstmask =
177
                                vec_perm((const vector unsigned char)vzero,
178
                                                               neg1, dstperm);
179

  
180
  vector unsigned char srcM2, srcM1, srcP0, srcP1, srcP2, srcP3;
177 181

  
178 182
  register int align = ((((unsigned long)src) - 2) % 16);
179 183

  
184
  vector signed short srcP0A, srcP0B, srcP1A, srcP1B,
185
                      srcP2A, srcP2B, srcP3A, srcP3B,
186
                      srcM1A, srcM1B, srcM2A, srcM2B,
187
                      sum1A, sum1B, sum2A, sum2B, sum3A, sum3B,
188
                      pp1A, pp1B, pp2A, pp2B, pp3A, pp3B,
189
                      psumA, psumB, sumA, sumB;
190

  
191
  vector unsigned char sum, dst1, dst2, vdst, fsum,
192
                       rsum, fdst1, fdst2;
193

  
194
  POWERPC_PERF_START_COUNT(PREFIX_h264_qpel16_h_lowpass_num, 1);
195

  
180 196
  for (i = 0 ; i < 16 ; i ++) {
181
    vector unsigned char srcM2, srcM1, srcP0, srcP1, srcP2, srcP3;
182 197
    vector unsigned char srcR1 = vec_ld(-2, src);
183 198
    vector unsigned char srcR2 = vec_ld(14, src);
184 199

  
......
237 252
    } break;
238 253
    }
239 254

  
240
    const vector signed short srcP0A = (vector signed short)vec_mergeh((vector unsigned char)vzero, srcP0);
241
    const vector signed short srcP0B = (vector signed short)vec_mergel((vector unsigned char)vzero, srcP0);
242
    const vector signed short srcP1A = (vector signed short)vec_mergeh((vector unsigned char)vzero, srcP1);
243
    const vector signed short srcP1B = (vector signed short)vec_mergel((vector unsigned char)vzero, srcP1);
255
    srcP0A = vec_mergeh((vector unsigned char)vzero, srcP0);
256
    srcP0B = vec_mergel((vector unsigned char)vzero, srcP0);
257
    srcP1A = vec_mergeh((vector unsigned char)vzero, srcP1);
258
    srcP1B = vec_mergel((vector unsigned char)vzero, srcP1);
244 259

  
245
    const vector signed short srcP2A = (vector signed short)vec_mergeh((vector unsigned char)vzero, srcP2);
246
    const vector signed short srcP2B = (vector signed short)vec_mergel((vector unsigned char)vzero, srcP2);
247
    const vector signed short srcP3A = (vector signed short)vec_mergeh((vector unsigned char)vzero, srcP3);
248
    const vector signed short srcP3B = (vector signed short)vec_mergel((vector unsigned char)vzero, srcP3);
260
    srcP2A = vec_mergeh((vector unsigned char)vzero, srcP2);
261
    srcP2B = vec_mergel((vector unsigned char)vzero, srcP2);
262
    srcP3A = vec_mergeh((vector unsigned char)vzero, srcP3);
263
    srcP3B = vec_mergel((vector unsigned char)vzero, srcP3);
249 264

  
250
    const vector signed short srcM1A = (vector signed short)vec_mergeh((vector unsigned char)vzero, srcM1);
251
    const vector signed short srcM1B = (vector signed short)vec_mergel((vector unsigned char)vzero, srcM1);
252
    const vector signed short srcM2A = (vector signed short)vec_mergeh((vector unsigned char)vzero, srcM2);
253
    const vector signed short srcM2B = (vector signed short)vec_mergel((vector unsigned char)vzero, srcM2);
265
    srcM1A = vec_mergeh((vector unsigned char)vzero, srcM1);
266
    srcM1B = vec_mergel((vector unsigned char)vzero, srcM1);
267
    srcM2A = vec_mergeh((vector unsigned char)vzero, srcM2);
268
    srcM2B = vec_mergel((vector unsigned char)vzero, srcM2);
254 269

  
255
    const vector signed short sum1A = vec_adds(srcP0A, srcP1A);
256
    const vector signed short sum1B = vec_adds(srcP0B, srcP1B);
257
    const vector signed short sum2A = vec_adds(srcM1A, srcP2A);
258
    const vector signed short sum2B = vec_adds(srcM1B, srcP2B);
259
    const vector signed short sum3A = vec_adds(srcM2A, srcP3A);
260
    const vector signed short sum3B = vec_adds(srcM2B, srcP3B);
270
    sum1A = vec_adds(srcP0A, srcP1A);
271
    sum1B = vec_adds(srcP0B, srcP1B);
272
    sum2A = vec_adds(srcM1A, srcP2A);
273
    sum2B = vec_adds(srcM1B, srcP2B);
274
    sum3A = vec_adds(srcM2A, srcP3A);
275
    sum3B = vec_adds(srcM2B, srcP3B);
261 276

  
262
    const vector signed short pp1A = vec_mladd(sum1A, v20ss, v16ss);
263
    const vector signed short pp1B = vec_mladd(sum1B, v20ss, v16ss);
277
    pp1A = vec_mladd(sum1A, v20ss, v16ss);
278
    pp1B = vec_mladd(sum1B, v20ss, v16ss);
264 279

  
265
    const vector signed short pp2A = vec_mladd(sum2A, v5ss, (vector signed short)vzero);
266
    const vector signed short pp2B = vec_mladd(sum2B, v5ss, (vector signed short)vzero);
280
    pp2A = vec_mladd(sum2A, v5ss, (vector signed short)vzero);
281
    pp2B = vec_mladd(sum2B, v5ss, (vector signed short)vzero);
267 282

  
268
    const vector signed short pp3A = vec_add(sum3A, pp1A);
269
    const vector signed short pp3B = vec_add(sum3B, pp1B);
283
    pp3A = vec_add(sum3A, pp1A);
284
    pp3B = vec_add(sum3B, pp1B);
270 285

  
271
    const vector signed short psumA = vec_sub(pp3A, pp2A);
272
    const vector signed short psumB = vec_sub(pp3B, pp2B);
286
    psumA = vec_sub(pp3A, pp2A);
287
    psumB = vec_sub(pp3B, pp2B);
273 288

  
274
    const vector signed short sumA = vec_sra(psumA, v5us);
275
    const vector signed short sumB = vec_sra(psumB, v5us);
289
    sumA = vec_sra(psumA, v5us);
290
    sumB = vec_sra(psumB, v5us);
276 291

  
277
    const vector unsigned char sum = vec_packsu(sumA, sumB);
292
    sum = vec_packsu(sumA, sumB);
278 293

  
279
    const vector unsigned char dst1 = vec_ld(0, dst);
280
    const vector unsigned char dst2 = vec_ld(16, dst);
281
    const vector unsigned char vdst = vec_perm(dst1, dst2, vec_lvsl(0, dst));
294
    dst1 = vec_ld(0, dst);
295
    dst2 = vec_ld(16, dst);
296
    vdst = vec_perm(dst1, dst2, vec_lvsl(0, dst));
282 297

  
283
    vector unsigned char fsum;
284 298
    OP_U8_ALTIVEC(fsum, sum, vdst);
285 299

  
286
    const vector unsigned char rsum = vec_perm(fsum, fsum, dstperm);
287
    const vector unsigned char fdst1 = vec_sel(dst1, rsum, dstmask);
288
    const vector unsigned char fdst2 = vec_sel(rsum, dst2, dstmask);
300
    rsum = vec_perm(fsum, fsum, dstperm);
301
    fdst1 = vec_sel(dst1, rsum, dstmask);
302
    fdst2 = vec_sel(rsum, dst2, dstmask);
289 303

  
290 304
    vec_st(fdst1, 0, dst);
291 305
    vec_st(fdst2, 16, dst);
......
299 313
/* this code assumes stride % 16 == 0 */
300 314
static void PREFIX_h264_qpel16_v_lowpass_altivec(uint8_t * dst, uint8_t * src, int dstStride, int srcStride) {
301 315
  POWERPC_PERF_DECLARE(PREFIX_h264_qpel16_v_lowpass_num, 1);
302
  POWERPC_PERF_START_COUNT(PREFIX_h264_qpel16_v_lowpass_num, 1);
303 316

  
304 317
  register int i;
305 318

  
......
318 331
  const vector unsigned char srcM2a = vec_ld(0, srcbis);
319 332
  const vector unsigned char srcM2b = vec_ld(16, srcbis);
320 333
  const vector unsigned char srcM2 = vec_perm(srcM2a, srcM2b, perm);
321
  srcbis += srcStride;
322
  const vector unsigned char srcM1a = vec_ld(0, srcbis);
334
//  srcbis += srcStride;
335
  const vector unsigned char srcM1a = vec_ld(0, srcbis += srcStride);
323 336
  const vector unsigned char srcM1b = vec_ld(16, srcbis);
324 337
  const vector unsigned char srcM1 = vec_perm(srcM1a, srcM1b, perm);
325
  srcbis += srcStride;
326
  const vector unsigned char srcP0a = vec_ld(0, srcbis);
338
//  srcbis += srcStride;
339
  const vector unsigned char srcP0a = vec_ld(0, srcbis += srcStride);
327 340
  const vector unsigned char srcP0b = vec_ld(16, srcbis);
328 341
  const vector unsigned char srcP0 = vec_perm(srcP0a, srcP0b, perm);
329
  srcbis += srcStride;
330
  const vector unsigned char srcP1a = vec_ld(0, srcbis);
342
//  srcbis += srcStride;
343
  const vector unsigned char srcP1a = vec_ld(0, srcbis += srcStride);
331 344
  const vector unsigned char srcP1b = vec_ld(16, srcbis);
332 345
  const vector unsigned char srcP1 = vec_perm(srcP1a, srcP1b, perm);
333
  srcbis += srcStride;
334
  const vector unsigned char srcP2a = vec_ld(0, srcbis);
346
//  srcbis += srcStride;
347
  const vector unsigned char srcP2a = vec_ld(0, srcbis += srcStride);
335 348
  const vector unsigned char srcP2b = vec_ld(16, srcbis);
336 349
  const vector unsigned char srcP2 = vec_perm(srcP2a, srcP2b, perm);
337
  srcbis += srcStride;
338

  
339
  vector signed short srcM2ssA = (vector signed short)vec_mergeh((vector unsigned char)vzero, srcM2);
340
  vector signed short srcM2ssB = (vector signed short)vec_mergel((vector unsigned char)vzero, srcM2);
341
  vector signed short srcM1ssA = (vector signed short)vec_mergeh((vector unsigned char)vzero, srcM1);
342
  vector signed short srcM1ssB = (vector signed short)vec_mergel((vector unsigned char)vzero, srcM1);
343
  vector signed short srcP0ssA = (vector signed short)vec_mergeh((vector unsigned char)vzero, srcP0);
344
  vector signed short srcP0ssB = (vector signed short)vec_mergel((vector unsigned char)vzero, srcP0);
345
  vector signed short srcP1ssA = (vector signed short)vec_mergeh((vector unsigned char)vzero, srcP1);
346
  vector signed short srcP1ssB = (vector signed short)vec_mergel((vector unsigned char)vzero, srcP1);
347
  vector signed short srcP2ssA = (vector signed short)vec_mergeh((vector unsigned char)vzero, srcP2);
348
  vector signed short srcP2ssB = (vector signed short)vec_mergel((vector unsigned char)vzero, srcP2);
350
//  srcbis += srcStride;
351

  
352
  vector signed short srcM2ssA = (vector signed short)
353
                                vec_mergeh((vector unsigned char)vzero, srcM2);
354
  vector signed short srcM2ssB = (vector signed short)
355
                                vec_mergel((vector unsigned char)vzero, srcM2);
356
  vector signed short srcM1ssA = (vector signed short)
357
                                vec_mergeh((vector unsigned char)vzero, srcM1);
358
  vector signed short srcM1ssB = (vector signed short)
359
                                vec_mergel((vector unsigned char)vzero, srcM1);
360
  vector signed short srcP0ssA = (vector signed short)
361
                                vec_mergeh((vector unsigned char)vzero, srcP0);
362
  vector signed short srcP0ssB = (vector signed short)
363
                                vec_mergel((vector unsigned char)vzero, srcP0);
364
  vector signed short srcP1ssA = (vector signed short)
365
                                vec_mergeh((vector unsigned char)vzero, srcP1);
366
  vector signed short srcP1ssB = (vector signed short)
367
                                vec_mergel((vector unsigned char)vzero, srcP1);
368
  vector signed short srcP2ssA = (vector signed short)
369
                                vec_mergeh((vector unsigned char)vzero, srcP2);
370
  vector signed short srcP2ssB = (vector signed short)
371
                                vec_mergel((vector unsigned char)vzero, srcP2);
372

  
373
  vector signed short pp1A, pp1B, pp2A, pp2B, pp3A, pp3B,
374
                      psumA, psumB, sumA, sumB,
375
                      srcP3ssA, srcP3ssB,
376
                      sum1A, sum1B, sum2A, sum2B, sum3A, sum3B;
377

  
378
  vector unsigned char sum, dst1, dst2, vdst, fsum, rsum, fdst1, fdst2,
379
                       srcP3a, srcP3b, srcP3;
380

  
381
  POWERPC_PERF_START_COUNT(PREFIX_h264_qpel16_v_lowpass_num, 1);
349 382

  
350 383
  for (i = 0 ; i < 16 ; i++) {
351
    const vector unsigned char srcP3a = vec_ld(0, srcbis);
352
    const vector unsigned char srcP3b = vec_ld(16, srcbis);
353
    const vector unsigned char srcP3 = vec_perm(srcP3a, srcP3b, perm);
354
    const vector signed short srcP3ssA = (vector signed short)vec_mergeh((vector unsigned char)vzero, srcP3);
355
    const vector signed short srcP3ssB = (vector signed short)vec_mergel((vector unsigned char)vzero, srcP3);
356
    srcbis += srcStride;
357

  
358
    const vector signed short sum1A = vec_adds(srcP0ssA, srcP1ssA);
359
    const vector signed short sum1B = vec_adds(srcP0ssB, srcP1ssB);
360
    const vector signed short sum2A = vec_adds(srcM1ssA, srcP2ssA);
361
    const vector signed short sum2B = vec_adds(srcM1ssB, srcP2ssB);
362
    const vector signed short sum3A = vec_adds(srcM2ssA, srcP3ssA);
363
    const vector signed short sum3B = vec_adds(srcM2ssB, srcP3ssB);
384
    srcP3a = vec_ld(0, srcbis += srcStride);
385
    srcP3b = vec_ld(16, srcbis);
386
    srcP3 = vec_perm(srcP3a, srcP3b, perm);
387
    srcP3ssA = (vector signed short)
388
                                vec_mergeh((vector unsigned char)vzero, srcP3);
389
    srcP3ssB = (vector signed short)
390
                                vec_mergel((vector unsigned char)vzero, srcP3);
391
//    srcbis += srcStride;
392

  
393
    sum1A = vec_adds(srcP0ssA, srcP1ssA);
394
    sum1B = vec_adds(srcP0ssB, srcP1ssB);
395
    sum2A = vec_adds(srcM1ssA, srcP2ssA);
396
    sum2B = vec_adds(srcM1ssB, srcP2ssB);
397
    sum3A = vec_adds(srcM2ssA, srcP3ssA);
398
    sum3B = vec_adds(srcM2ssB, srcP3ssB);
364 399

  
365 400
    srcM2ssA = srcM1ssA;
366 401
    srcM2ssB = srcM1ssB;
......
373 408
    srcP2ssA = srcP3ssA;
374 409
    srcP2ssB = srcP3ssB;
375 410

  
376
    const vector signed short pp1A = vec_mladd(sum1A, v20ss, v16ss);
377
    const vector signed short pp1B = vec_mladd(sum1B, v20ss, v16ss);
411
    pp1A = vec_mladd(sum1A, v20ss, v16ss);
412
    pp1B = vec_mladd(sum1B, v20ss, v16ss);
378 413

  
379
    const vector signed short pp2A = vec_mladd(sum2A, v5ss, (vector signed short)vzero);
380
    const vector signed short pp2B = vec_mladd(sum2B, v5ss, (vector signed short)vzero);
414
    pp2A = vec_mladd(sum2A, v5ss, (vector signed short)vzero);
415
    pp2B = vec_mladd(sum2B, v5ss, (vector signed short)vzero);
381 416

  
382
    const vector signed short pp3A = vec_add(sum3A, pp1A);
383
    const vector signed short pp3B = vec_add(sum3B, pp1B);
417
    pp3A = vec_add(sum3A, pp1A);
418
    pp3B = vec_add(sum3B, pp1B);
384 419

  
385
    const vector signed short psumA = vec_sub(pp3A, pp2A);
386
    const vector signed short psumB = vec_sub(pp3B, pp2B);
420
    psumA = vec_sub(pp3A, pp2A);
421
    psumB = vec_sub(pp3B, pp2B);
387 422

  
388
    const vector signed short sumA = vec_sra(psumA, v5us);
389
    const vector signed short sumB = vec_sra(psumB, v5us);
423
    sumA = vec_sra(psumA, v5us);
424
    sumB = vec_sra(psumB, v5us);
390 425

  
391
    const vector unsigned char sum = vec_packsu(sumA, sumB);
426
    sum = vec_packsu(sumA, sumB);
392 427

  
393
    const vector unsigned char dst1 = vec_ld(0, dst);
394
    const vector unsigned char dst2 = vec_ld(16, dst);
395
    const vector unsigned char vdst = vec_perm(dst1, dst2, vec_lvsl(0, dst));
428
    dst1 = vec_ld(0, dst);
429
    dst2 = vec_ld(16, dst);
430
    vdst = vec_perm(dst1, dst2, vec_lvsl(0, dst));
396 431

  
397
    vector unsigned char fsum;
398 432
    OP_U8_ALTIVEC(fsum, sum, vdst);
399 433

  
400
    const vector unsigned char rsum = vec_perm(fsum, fsum, dstperm);
401
    const vector unsigned char fdst1 = vec_sel(dst1, rsum, dstmask);
402
    const vector unsigned char fdst2 = vec_sel(rsum, dst2, dstmask);
434
    rsum = vec_perm(fsum, fsum, dstperm);
435
    fdst1 = vec_sel(dst1, rsum, dstmask);
436
    fdst2 = vec_sel(rsum, dst2, dstmask);
403 437

  
404 438
    vec_st(fdst1, 0, dst);
405 439
    vec_st(fdst2, 16, dst);
......
412 446
/* this code assumes stride % 16 == 0 *and* tmp is properly aligned */
413 447
static void PREFIX_h264_qpel16_hv_lowpass_altivec(uint8_t * dst, int16_t * tmp, uint8_t * src, int dstStride, int tmpStride, int srcStride) {
414 448
  POWERPC_PERF_DECLARE(PREFIX_h264_qpel16_hv_lowpass_num, 1);
415
  POWERPC_PERF_START_COUNT(PREFIX_h264_qpel16_hv_lowpass_num, 1);
416 449
  register int i;
417 450
  const vector signed int vzero = vec_splat_s32(0);
418 451
  const vector unsigned char permM2 = vec_lvsl(-2, src);
......
430 463

  
431 464
  register int align = ((((unsigned long)src) - 2) % 16);
432 465

  
433
  src -= (2 * srcStride);
466
  const vector unsigned char neg1 = (const vector unsigned char)
467
                                                        vec_splat_s8(-1);
468

  
469
  vector signed short srcP0A, srcP0B, srcP1A, srcP1B,
470
                      srcP2A, srcP2B, srcP3A, srcP3B,
471
                      srcM1A, srcM1B, srcM2A, srcM2B,
472
                      sum1A, sum1B, sum2A, sum2B, sum3A, sum3B,
473
                      pp1A, pp1B, pp2A, pp2B, psumA, psumB;
474

  
475
  const vector unsigned char dstperm = vec_lvsr(0, dst);
476

  
477
  const vector unsigned char dstmask = vec_perm((const vector unsigned char)vzero, neg1, dstperm);
478

  
479
  const vector unsigned char mperm = (const vector unsigned char)
480
    AVV(0x00, 0x08, 0x01, 0x09, 0x02, 0x0A, 0x03, 0x0B,
481
        0x04, 0x0C, 0x05, 0x0D, 0x06, 0x0E, 0x07, 0x0F);
482
  int16_t *tmpbis = tmp;
483

  
484
  vector signed short tmpM1ssA, tmpM1ssB, tmpM2ssA, tmpM2ssB,
485
                      tmpP0ssA, tmpP0ssB, tmpP1ssA, tmpP1ssB,
486
                      tmpP2ssA, tmpP2ssB;
434 487

  
488
  vector signed int pp1Ae, pp1Ao, pp1Be, pp1Bo, pp2Ae, pp2Ao, pp2Be, pp2Bo,
489
                    pp3Ae, pp3Ao, pp3Be, pp3Bo, pp1cAe, pp1cAo, pp1cBe, pp1cBo,
490
                    pp32Ae, pp32Ao, pp32Be, pp32Bo, sumAe, sumAo, sumBe, sumBo,
491
                    ssumAe, ssumAo, ssumBe, ssumBo;
492
  vector unsigned char fsum, sumv, sum, dst1, dst2, vdst,
493
                       rsum, fdst1, fdst2;
494
  vector signed short ssume, ssumo;
495

  
496
  POWERPC_PERF_START_COUNT(PREFIX_h264_qpel16_hv_lowpass_num, 1);
497
  src -= (2 * srcStride);
435 498
  for (i = 0 ; i < 21 ; i ++) {
436 499
    vector unsigned char srcM2, srcM1, srcP0, srcP1, srcP2, srcP3;
437 500
    vector unsigned char srcR1 = vec_ld(-2, src);
......
492 555
    } break;
493 556
    }
494 557

  
495
    const vector signed short srcP0A = (vector signed short)vec_mergeh((vector unsigned char)vzero, srcP0);
496
    const vector signed short srcP0B = (vector signed short)vec_mergel((vector unsigned char)vzero, srcP0);
497
    const vector signed short srcP1A = (vector signed short)vec_mergeh((vector unsigned char)vzero, srcP1);
498
    const vector signed short srcP1B = (vector signed short)vec_mergel((vector unsigned char)vzero, srcP1);
499

  
500
    const vector signed short srcP2A = (vector signed short)vec_mergeh((vector unsigned char)vzero, srcP2);
501
    const vector signed short srcP2B = (vector signed short)vec_mergel((vector unsigned char)vzero, srcP2);
502
    const vector signed short srcP3A = (vector signed short)vec_mergeh((vector unsigned char)vzero, srcP3);
503
    const vector signed short srcP3B = (vector signed short)vec_mergel((vector unsigned char)vzero, srcP3);
504

  
505
    const vector signed short srcM1A = (vector signed short)vec_mergeh((vector unsigned char)vzero, srcM1);
506
    const vector signed short srcM1B = (vector signed short)vec_mergel((vector unsigned char)vzero, srcM1);
507
    const vector signed short srcM2A = (vector signed short)vec_mergeh((vector unsigned char)vzero, srcM2);
508
    const vector signed short srcM2B = (vector signed short)vec_mergel((vector unsigned char)vzero, srcM2);
509

  
510
    const vector signed short sum1A = vec_adds(srcP0A, srcP1A);
511
    const vector signed short sum1B = vec_adds(srcP0B, srcP1B);
512
    const vector signed short sum2A = vec_adds(srcM1A, srcP2A);
513
    const vector signed short sum2B = vec_adds(srcM1B, srcP2B);
514
    const vector signed short sum3A = vec_adds(srcM2A, srcP3A);
515
    const vector signed short sum3B = vec_adds(srcM2B, srcP3B);
516

  
517
    const vector signed short pp1A = vec_mladd(sum1A, v20ss, sum3A);
518
    const vector signed short pp1B = vec_mladd(sum1B, v20ss, sum3B);
519

  
520
    const vector signed short pp2A = vec_mladd(sum2A, v5ss, (vector signed short)vzero);
521
    const vector signed short pp2B = vec_mladd(sum2B, v5ss, (vector signed short)vzero);
522

  
523
    const vector signed short psumA = vec_sub(pp1A, pp2A);
524
    const vector signed short psumB = vec_sub(pp1B, pp2B);
558
    srcP0A = (vector signed short)
559
                            vec_mergeh((vector unsigned char)vzero, srcP0);
560
    srcP0B = (vector signed short)
561
                            vec_mergel((vector unsigned char)vzero, srcP0);
562
    srcP1A = (vector signed short)
563
                            vec_mergeh((vector unsigned char)vzero, srcP1);
564
    srcP1B = (vector signed short)
565
                            vec_mergel((vector unsigned char)vzero, srcP1);
566

  
567
    srcP2A = (vector signed short)
568
                            vec_mergeh((vector unsigned char)vzero, srcP2);
569
    srcP2B = (vector signed short)
570
                            vec_mergel((vector unsigned char)vzero, srcP2);
571
    srcP3A = (vector signed short)
572
                            vec_mergeh((vector unsigned char)vzero, srcP3);
573
    srcP3B = (vector signed short)
574
                            vec_mergel((vector unsigned char)vzero, srcP3);
575

  
576
    srcM1A = (vector signed short)
577
                            vec_mergeh((vector unsigned char)vzero, srcM1);
578
    srcM1B = (vector signed short)
579
                            vec_mergel((vector unsigned char)vzero, srcM1);
580
    srcM2A = (vector signed short)
581
                            vec_mergeh((vector unsigned char)vzero, srcM2);
582
    srcM2B = (vector signed short)
583
                            vec_mergel((vector unsigned char)vzero, srcM2);
584

  
585
    sum1A = vec_adds(srcP0A, srcP1A);
586
    sum1B = vec_adds(srcP0B, srcP1B);
587
    sum2A = vec_adds(srcM1A, srcP2A);
588
    sum2B = vec_adds(srcM1B, srcP2B);
589
    sum3A = vec_adds(srcM2A, srcP3A);
590
    sum3B = vec_adds(srcM2B, srcP3B);
591

  
592
    pp1A = vec_mladd(sum1A, v20ss, sum3A);
593
    pp1B = vec_mladd(sum1B, v20ss, sum3B);
594

  
595
    pp2A = vec_mladd(sum2A, v5ss, (vector signed short)vzero);
596
    pp2B = vec_mladd(sum2B, v5ss, (vector signed short)vzero);
597

  
598
    psumA = vec_sub(pp1A, pp2A);
599
    psumB = vec_sub(pp1B, pp2B);
525 600

  
526 601
    vec_st(psumA, 0, tmp);
527 602
    vec_st(psumB, 16, tmp);
......
530 605
    tmp += tmpStride; /* int16_t*, and stride is 16, so it's OK here */
531 606
  }
532 607

  
533
  const vector unsigned char dstperm = vec_lvsr(0, dst);
534
  const vector unsigned char neg1 = (const vector unsigned char)vec_splat_s8(-1);
535
  const vector unsigned char dstmask = vec_perm((const vector unsigned char)vzero, neg1, dstperm);
536
  const vector unsigned char mperm = (const vector unsigned char)
537
    AVV(0x00, 0x08, 0x01, 0x09, 0x02, 0x0A, 0x03, 0x0B,
538
        0x04, 0x0C, 0x05, 0x0D, 0x06, 0x0E, 0x07, 0x0F);
539

  
540
  int16_t *tmpbis = tmp - (tmpStride * 21);
541

  
542
  vector signed short tmpM2ssA = vec_ld(0, tmpbis);
543
  vector signed short tmpM2ssB = vec_ld(16, tmpbis);
608
  tmpM2ssA = vec_ld(0, tmpbis);
609
  tmpM2ssB = vec_ld(16, tmpbis);
544 610
  tmpbis += tmpStride;
545
  vector signed short tmpM1ssA = vec_ld(0, tmpbis);
546
  vector signed short tmpM1ssB = vec_ld(16, tmpbis);
611
  tmpM1ssA = vec_ld(0, tmpbis);
612
  tmpM1ssB = vec_ld(16, tmpbis);
547 613
  tmpbis += tmpStride;
548
  vector signed short tmpP0ssA = vec_ld(0, tmpbis);
549
  vector signed short tmpP0ssB = vec_ld(16, tmpbis);
614
  tmpP0ssA = vec_ld(0, tmpbis);
615
  tmpP0ssB = vec_ld(16, tmpbis);
550 616
  tmpbis += tmpStride;
551
  vector signed short tmpP1ssA = vec_ld(0, tmpbis);
552
  vector signed short tmpP1ssB = vec_ld(16, tmpbis);
617
  tmpP1ssA = vec_ld(0, tmpbis);
618
  tmpP1ssB = vec_ld(16, tmpbis);
553 619
  tmpbis += tmpStride;
554
  vector signed short tmpP2ssA = vec_ld(0, tmpbis);
555
  vector signed short tmpP2ssB = vec_ld(16, tmpbis);
620
  tmpP2ssA = vec_ld(0, tmpbis);
621
  tmpP2ssB = vec_ld(16, tmpbis);
556 622
  tmpbis += tmpStride;
557 623

  
558 624
  for (i = 0 ; i < 16 ; i++) {
559 625
    const vector signed short tmpP3ssA = vec_ld(0, tmpbis);
560 626
    const vector signed short tmpP3ssB = vec_ld(16, tmpbis);
561
    tmpbis += tmpStride;
562 627

  
563 628
    const vector signed short sum1A = vec_adds(tmpP0ssA, tmpP1ssA);
564 629
    const vector signed short sum1B = vec_adds(tmpP0ssB, tmpP1ssB);
......
567 632
    const vector signed short sum3A = vec_adds(tmpM2ssA, tmpP3ssA);
568 633
    const vector signed short sum3B = vec_adds(tmpM2ssB, tmpP3ssB);
569 634

  
635
    tmpbis += tmpStride;
636

  
570 637
    tmpM2ssA = tmpM1ssA;
571 638
    tmpM2ssB = tmpM1ssB;
572 639
    tmpM1ssA = tmpP0ssA;
......
578 645
    tmpP2ssA = tmpP3ssA;
579 646
    tmpP2ssB = tmpP3ssB;
580 647

  
581
    const vector signed int pp1Ae = vec_mule(sum1A, v20ss);
582
    const vector signed int pp1Ao = vec_mulo(sum1A, v20ss);
583
    const vector signed int pp1Be = vec_mule(sum1B, v20ss);
584
    const vector signed int pp1Bo = vec_mulo(sum1B, v20ss);
648
    pp1Ae = vec_mule(sum1A, v20ss);
649
    pp1Ao = vec_mulo(sum1A, v20ss);
650
    pp1Be = vec_mule(sum1B, v20ss);
651
    pp1Bo = vec_mulo(sum1B, v20ss);
585 652

  
586
    const vector signed int pp2Ae = vec_mule(sum2A, v5ss);
587
    const vector signed int pp2Ao = vec_mulo(sum2A, v5ss);
588
    const vector signed int pp2Be = vec_mule(sum2B, v5ss);
589
    const vector signed int pp2Bo = vec_mulo(sum2B, v5ss);
653
    pp2Ae = vec_mule(sum2A, v5ss);
654
    pp2Ao = vec_mulo(sum2A, v5ss);
655
    pp2Be = vec_mule(sum2B, v5ss);
656
    pp2Bo = vec_mulo(sum2B, v5ss);
590 657

  
591
    const vector signed int pp3Ae = vec_sra((vector signed int)sum3A, v16ui);
592
    const vector signed int pp3Ao = vec_mulo(sum3A, v1ss);
593
    const vector signed int pp3Be = vec_sra((vector signed int)sum3B, v16ui);
594
    const vector signed int pp3Bo = vec_mulo(sum3B, v1ss);
658
    pp3Ae = vec_sra((vector signed int)sum3A, v16ui);
659
    pp3Ao = vec_mulo(sum3A, v1ss);
660
    pp3Be = vec_sra((vector signed int)sum3B, v16ui);
661
    pp3Bo = vec_mulo(sum3B, v1ss);
595 662

  
596
    const vector signed int pp1cAe = vec_add(pp1Ae, v512si);
597
    const vector signed int pp1cAo = vec_add(pp1Ao, v512si);
598
    const vector signed int pp1cBe = vec_add(pp1Be, v512si);
599
    const vector signed int pp1cBo = vec_add(pp1Bo, v512si);
663
    pp1cAe = vec_add(pp1Ae, v512si);
664
    pp1cAo = vec_add(pp1Ao, v512si);
665
    pp1cBe = vec_add(pp1Be, v512si);
666
    pp1cBo = vec_add(pp1Bo, v512si);
600 667

  
601
    const vector signed int pp32Ae = vec_sub(pp3Ae, pp2Ae);
602
    const vector signed int pp32Ao = vec_sub(pp3Ao, pp2Ao);
603
    const vector signed int pp32Be = vec_sub(pp3Be, pp2Be);
604
    const vector signed int pp32Bo = vec_sub(pp3Bo, pp2Bo);
668
    pp32Ae = vec_sub(pp3Ae, pp2Ae);
669
    pp32Ao = vec_sub(pp3Ao, pp2Ao);
670
    pp32Be = vec_sub(pp3Be, pp2Be);
671
    pp32Bo = vec_sub(pp3Bo, pp2Bo);
605 672

  
606
    const vector signed int sumAe = vec_add(pp1cAe, pp32Ae);
607
    const vector signed int sumAo = vec_add(pp1cAo, pp32Ao);
608
    const vector signed int sumBe = vec_add(pp1cBe, pp32Be);
609
    const vector signed int sumBo = vec_add(pp1cBo, pp32Bo);
673
    sumAe = vec_add(pp1cAe, pp32Ae);
674
    sumAo = vec_add(pp1cAo, pp32Ao);
675
    sumBe = vec_add(pp1cBe, pp32Be);
676
    sumBo = vec_add(pp1cBo, pp32Bo);
610 677

  
611
    const vector signed int ssumAe = vec_sra(sumAe, v10ui);
612
    const vector signed int ssumAo = vec_sra(sumAo, v10ui);
613
    const vector signed int ssumBe = vec_sra(sumBe, v10ui);
614
    const vector signed int ssumBo = vec_sra(sumBo, v10ui);
678
    ssumAe = vec_sra(sumAe, v10ui);
679
    ssumAo = vec_sra(sumAo, v10ui);
680
    ssumBe = vec_sra(sumBe, v10ui);
681
    ssumBo = vec_sra(sumBo, v10ui);
615 682

  
616
    const vector signed short ssume = vec_packs(ssumAe, ssumBe);
617
    const vector signed short ssumo = vec_packs(ssumAo, ssumBo);
683
    ssume = vec_packs(ssumAe, ssumBe);
684
    ssumo = vec_packs(ssumAo, ssumBo);
618 685

  
619
    const vector unsigned char sumv = vec_packsu(ssume, ssumo);
620
    const vector unsigned char sum = vec_perm(sumv, sumv, mperm);
686
    sumv = vec_packsu(ssume, ssumo);
687
    sum = vec_perm(sumv, sumv, mperm);
621 688

  
622
    const vector unsigned char dst1 = vec_ld(0, dst);
623
    const vector unsigned char dst2 = vec_ld(16, dst);
624
    const vector unsigned char vdst = vec_perm(dst1, dst2, vec_lvsl(0, dst));
689
    dst1 = vec_ld(0, dst);
690
    dst2 = vec_ld(16, dst);
691
    vdst = vec_perm(dst1, dst2, vec_lvsl(0, dst));
625 692

  
626
    vector unsigned char fsum;
627 693
    OP_U8_ALTIVEC(fsum, sum, vdst);
628 694

  
629
    const vector unsigned char rsum = vec_perm(fsum, fsum, dstperm);
630
    const vector unsigned char fdst1 = vec_sel(dst1, rsum, dstmask);
631
    const vector unsigned char fdst2 = vec_sel(rsum, dst2, dstmask);
695
    rsum = vec_perm(fsum, fsum, dstperm);
696
    fdst1 = vec_sel(dst1, rsum, dstmask);
697
    fdst2 = vec_sel(rsum, dst2, dstmask);
632 698

  
633 699
    vec_st(fdst1, 0, dst);
634 700
    vec_st(fdst2, 16, dst);

Also available in: Unified diff