Statistics
| Branch: | Revision:

ffmpeg / libavcodec / ppc / dsputil_h264_template_altivec.c @ 47261113

History | View | Annotate | Download (26.9 KB)

1 a6a12a8a Romain Dolbeau
/*
2
 * Copyright (c) 2004 Romain Dolbeau <romain@dolbeau.org>
3
 *
4
 * This library is free software; you can redistribute it and/or
5
 * modify it under the terms of the GNU Lesser General Public
6
 * License as published by the Free Software Foundation; either
7
 * version 2 of the License, or (at your option) any later version.
8
 *
9
 * This library is distributed in the hope that it will be useful,
10
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
12
 * Lesser General Public License for more details.
13
 *
14
 * You should have received a copy of the GNU Lesser General Public
15
 * License along with this library; if not, write to the Free Software
16 5509bffa Diego Biurrun
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
17 a6a12a8a Romain Dolbeau
 */
18
19
/* this code assume that stride % 16 == 0 */
20
void PREFIX_h264_chroma_mc8_altivec(uint8_t * dst, uint8_t * src, int stride, int h, int x, int y) {
21
  POWERPC_PERF_DECLARE(PREFIX_h264_chroma_mc8_num, 1);
22 e8772eec Luca Barbato
    signed int ABCD[4] __attribute__((aligned(16))) =
23
                        {((8 - x) * (8 - y)),
24
                          ((x) * (8 - y)),
25
                          ((8 - x) * (y)),
26
                          ((x) * (y))};
27 a6a12a8a Romain Dolbeau
    register int i;
28 e8772eec Luca Barbato
    vector unsigned char fperm;
29 a6a12a8a Romain Dolbeau
    const vector signed int vABCD = vec_ld(0, ABCD);
30
    const vector signed short vA = vec_splat((vector signed short)vABCD, 1);
31
    const vector signed short vB = vec_splat((vector signed short)vABCD, 3);
32
    const vector signed short vC = vec_splat((vector signed short)vABCD, 5);
33
    const vector signed short vD = vec_splat((vector signed short)vABCD, 7);
34
    const vector signed int vzero = vec_splat_s32(0);
35 5cb9fda4 Luca Barbato
    const vector signed short v32ss = vec_sl(vec_splat_s16(1),vec_splat_u16(5));
36 a6a12a8a Romain Dolbeau
    const vector unsigned short v6us = vec_splat_u16(6);
37 e8772eec Luca Barbato
    register int loadSecond = (((unsigned long)src) % 16) <= 7 ? 0 : 1;
38
    register int reallyBadAlign = (((unsigned long)src) % 16) == 15 ? 1 : 0;
39 a6a12a8a Romain Dolbeau
40 e8772eec Luca Barbato
    vector unsigned char vsrcAuc, vsrcBuc, vsrcperm0, vsrcperm1;
41
    vector unsigned char vsrc0uc, vsrc1uc;
42
    vector signed short vsrc0ssH, vsrc1ssH;
43
    vector unsigned char vsrcCuc, vsrc2uc, vsrc3uc;
44
    vector signed short vsrc2ssH, vsrc3ssH, psum;
45
    vector unsigned char vdst, ppsum, vfdst, fsum;
46
47
  POWERPC_PERF_START_COUNT(PREFIX_h264_chroma_mc8_num, 1);
48 a6a12a8a Romain Dolbeau
49
    if (((unsigned long)dst) % 16 == 0) {
50 e8772eec Luca Barbato
      fperm = (vector unsigned char)AVV(0x10, 0x11, 0x12, 0x13,
51
                                        0x14, 0x15, 0x16, 0x17,
52
                                        0x08, 0x09, 0x0A, 0x0B,
53
                                        0x0C, 0x0D, 0x0E, 0x0F);
54 a6a12a8a Romain Dolbeau
    } else {
55 e8772eec Luca Barbato
      fperm = (vector unsigned char)AVV(0x00, 0x01, 0x02, 0x03,
56
                                        0x04, 0x05, 0x06, 0x07,
57
                                        0x18, 0x19, 0x1A, 0x1B,
58
                                        0x1C, 0x1D, 0x1E, 0x1F);
59 a6a12a8a Romain Dolbeau
    }
60
61
    vsrcAuc = vec_ld(0, src);
62 e8772eec Luca Barbato
63 a6a12a8a Romain Dolbeau
    if (loadSecond)
64
      vsrcBuc = vec_ld(16, src);
65
    vsrcperm0 = vec_lvsl(0, src);
66
    vsrcperm1 = vec_lvsl(1, src);
67 115329f1 Diego Biurrun
68 a6a12a8a Romain Dolbeau
    vsrc0uc = vec_perm(vsrcAuc, vsrcBuc, vsrcperm0);
69
    if (reallyBadAlign)
70
      vsrc1uc = vsrcBuc;
71
    else
72
      vsrc1uc = vec_perm(vsrcAuc, vsrcBuc, vsrcperm1);
73 115329f1 Diego Biurrun
74 e8772eec Luca Barbato
    vsrc0ssH = (vector signed short)vec_mergeh((vector unsigned char)vzero,
75
                                               (vector unsigned char)vsrc0uc);
76
    vsrc1ssH = (vector signed short)vec_mergeh((vector unsigned char)vzero,
77
                                               (vector unsigned char)vsrc1uc);
78 a6a12a8a Romain Dolbeau
79
    if (!loadSecond) {// -> !reallyBadAlign
80
      for (i = 0 ; i < h ; i++) {
81 e8772eec Luca Barbato
82
83 a6a12a8a Romain Dolbeau
        vsrcCuc = vec_ld(stride + 0, src);
84 115329f1 Diego Biurrun
85 a6a12a8a Romain Dolbeau
        vsrc2uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm0);
86
        vsrc3uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm1);
87 115329f1 Diego Biurrun
88 e8772eec Luca Barbato
        vsrc2ssH = (vector signed short)vec_mergeh((vector unsigned char)vzero,
89
                                                (vector unsigned char)vsrc2uc);
90
        vsrc3ssH = (vector signed short)vec_mergeh((vector unsigned char)vzero,
91
                                                (vector unsigned char)vsrc3uc);
92 115329f1 Diego Biurrun
93 a6a12a8a Romain Dolbeau
        psum = vec_mladd(vA, vsrc0ssH, vec_splat_s16(0));
94
        psum = vec_mladd(vB, vsrc1ssH, psum);
95
        psum = vec_mladd(vC, vsrc2ssH, psum);
96
        psum = vec_mladd(vD, vsrc3ssH, psum);
97
        psum = vec_add(v32ss, psum);
98
        psum = vec_sra(psum, v6us);
99 115329f1 Diego Biurrun
100 e8772eec Luca Barbato
        vdst = vec_ld(0, dst);
101
        ppsum = (vector unsigned char)vec_packsu(psum, psum);
102
        vfdst = vec_perm(vdst, ppsum, fperm);
103 115329f1 Diego Biurrun
104 a6a12a8a Romain Dolbeau
        OP_U8_ALTIVEC(fsum, vfdst, vdst);
105
106
        vec_st(fsum, 0, dst);
107 115329f1 Diego Biurrun
108 a6a12a8a Romain Dolbeau
        vsrc0ssH = vsrc2ssH;
109
        vsrc1ssH = vsrc3ssH;
110 115329f1 Diego Biurrun
111 a6a12a8a Romain Dolbeau
        dst += stride;
112
        src += stride;
113
      }
114
    } else {
115
        vector unsigned char vsrcDuc;
116 e8772eec Luca Barbato
      for (i = 0 ; i < h ; i++) {
117 a6a12a8a Romain Dolbeau
        vsrcCuc = vec_ld(stride + 0, src);
118
        vsrcDuc = vec_ld(stride + 16, src);
119 115329f1 Diego Biurrun
120 a6a12a8a Romain Dolbeau
        vsrc2uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm0);
121
        if (reallyBadAlign)
122
          vsrc3uc = vsrcDuc;
123
        else
124
          vsrc3uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm1);
125 115329f1 Diego Biurrun
126 e8772eec Luca Barbato
        vsrc2ssH = (vector signed short)vec_mergeh((vector unsigned char)vzero,
127
                                                (vector unsigned char)vsrc2uc);
128
        vsrc3ssH = (vector signed short)vec_mergeh((vector unsigned char)vzero,
129
                                                (vector unsigned char)vsrc3uc);
130 115329f1 Diego Biurrun
131 a6a12a8a Romain Dolbeau
        psum = vec_mladd(vA, vsrc0ssH, vec_splat_s16(0));
132
        psum = vec_mladd(vB, vsrc1ssH, psum);
133
        psum = vec_mladd(vC, vsrc2ssH, psum);
134
        psum = vec_mladd(vD, vsrc3ssH, psum);
135
        psum = vec_add(v32ss, psum);
136
        psum = vec_sr(psum, v6us);
137 115329f1 Diego Biurrun
138 e8772eec Luca Barbato
        vdst = vec_ld(0, dst);
139
        ppsum = (vector unsigned char)vec_pack(psum, psum);
140
        vfdst = vec_perm(vdst, ppsum, fperm);
141 115329f1 Diego Biurrun
142 a6a12a8a Romain Dolbeau
        OP_U8_ALTIVEC(fsum, vfdst, vdst);
143
144
        vec_st(fsum, 0, dst);
145 115329f1 Diego Biurrun
146 a6a12a8a Romain Dolbeau
        vsrc0ssH = vsrc2ssH;
147
        vsrc1ssH = vsrc3ssH;
148 115329f1 Diego Biurrun
149 a6a12a8a Romain Dolbeau
        dst += stride;
150
        src += stride;
151
      }
152
    }
153
    POWERPC_PERF_STOP_COUNT(PREFIX_h264_chroma_mc8_num, 1);
154
}
155
156
/* this code assume stride % 16 == 0 */
157
static void PREFIX_h264_qpel16_h_lowpass_altivec(uint8_t * dst, uint8_t * src, int dstStride, int srcStride) {
158
  POWERPC_PERF_DECLARE(PREFIX_h264_qpel16_h_lowpass_num, 1);
159
  register int i;
160 115329f1 Diego Biurrun
161 a6a12a8a Romain Dolbeau
  const vector signed int vzero = vec_splat_s32(0);
162
  const vector unsigned char permM2 = vec_lvsl(-2, src);
163
  const vector unsigned char permM1 = vec_lvsl(-1, src);
164
  const vector unsigned char permP0 = vec_lvsl(+0, src);
165
  const vector unsigned char permP1 = vec_lvsl(+1, src);
166
  const vector unsigned char permP2 = vec_lvsl(+2, src);
167
  const vector unsigned char permP3 = vec_lvsl(+3, src);
168
  const vector signed short v5ss = vec_splat_s16(5);
169 5cb9fda4 Luca Barbato
  const vector unsigned short v5us = vec_splat_u16(5);
170
  const vector signed short v20ss = vec_sl(vec_splat_s16(5),vec_splat_u16(2));
171
  const vector signed short v16ss = vec_sl(vec_splat_s16(1),vec_splat_u16(4));
172 a6a12a8a Romain Dolbeau
  const vector unsigned char dstperm = vec_lvsr(0, dst);
173 e8772eec Luca Barbato
  const vector unsigned char neg1 =
174
                                (const vector unsigned char) vec_splat_s8(-1);
175
176
  const vector unsigned char dstmask =
177
                                vec_perm((const vector unsigned char)vzero,
178
                                                               neg1, dstperm);
179
180
  vector unsigned char srcM2, srcM1, srcP0, srcP1, srcP2, srcP3;
181 a6a12a8a Romain Dolbeau
182
  register int align = ((((unsigned long)src) - 2) % 16);
183
184 e8772eec Luca Barbato
  vector signed short srcP0A, srcP0B, srcP1A, srcP1B,
185
                      srcP2A, srcP2B, srcP3A, srcP3B,
186
                      srcM1A, srcM1B, srcM2A, srcM2B,
187
                      sum1A, sum1B, sum2A, sum2B, sum3A, sum3B,
188
                      pp1A, pp1B, pp2A, pp2B, pp3A, pp3B,
189
                      psumA, psumB, sumA, sumB;
190
191
  vector unsigned char sum, dst1, dst2, vdst, fsum,
192
                       rsum, fdst1, fdst2;
193
194
  POWERPC_PERF_START_COUNT(PREFIX_h264_qpel16_h_lowpass_num, 1);
195
196 a6a12a8a Romain Dolbeau
  for (i = 0 ; i < 16 ; i ++) {
197
    vector unsigned char srcR1 = vec_ld(-2, src);
198
    vector unsigned char srcR2 = vec_ld(14, src);
199
200
    switch (align) {
201
    default: {
202
      srcM2 = vec_perm(srcR1, srcR2, permM2);
203
      srcM1 = vec_perm(srcR1, srcR2, permM1);
204
      srcP0 = vec_perm(srcR1, srcR2, permP0);
205
      srcP1 = vec_perm(srcR1, srcR2, permP1);
206
      srcP2 = vec_perm(srcR1, srcR2, permP2);
207
      srcP3 = vec_perm(srcR1, srcR2, permP3);
208
    } break;
209
    case 11: {
210
      srcM2 = vec_perm(srcR1, srcR2, permM2);
211
      srcM1 = vec_perm(srcR1, srcR2, permM1);
212
      srcP0 = vec_perm(srcR1, srcR2, permP0);
213
      srcP1 = vec_perm(srcR1, srcR2, permP1);
214
      srcP2 = vec_perm(srcR1, srcR2, permP2);
215
      srcP3 = srcR2;
216
    } break;
217
    case 12: {
218
      vector unsigned char srcR3 = vec_ld(30, src);
219
      srcM2 = vec_perm(srcR1, srcR2, permM2);
220
      srcM1 = vec_perm(srcR1, srcR2, permM1);
221
      srcP0 = vec_perm(srcR1, srcR2, permP0);
222
      srcP1 = vec_perm(srcR1, srcR2, permP1);
223
      srcP2 = srcR2;
224
      srcP3 = vec_perm(srcR2, srcR3, permP3);
225
    } break;
226
    case 13: {
227
      vector unsigned char srcR3 = vec_ld(30, src);
228
      srcM2 = vec_perm(srcR1, srcR2, permM2);
229
      srcM1 = vec_perm(srcR1, srcR2, permM1);
230
      srcP0 = vec_perm(srcR1, srcR2, permP0);
231
      srcP1 = srcR2;
232
      srcP2 = vec_perm(srcR2, srcR3, permP2);
233
      srcP3 = vec_perm(srcR2, srcR3, permP3);
234
    } break;
235
    case 14: {
236
      vector unsigned char srcR3 = vec_ld(30, src);
237
      srcM2 = vec_perm(srcR1, srcR2, permM2);
238
      srcM1 = vec_perm(srcR1, srcR2, permM1);
239
      srcP0 = srcR2;
240
      srcP1 = vec_perm(srcR2, srcR3, permP1);
241
      srcP2 = vec_perm(srcR2, srcR3, permP2);
242
      srcP3 = vec_perm(srcR2, srcR3, permP3);
243
    } break;
244
    case 15: {
245
      vector unsigned char srcR3 = vec_ld(30, src);
246
      srcM2 = vec_perm(srcR1, srcR2, permM2);
247
      srcM1 = srcR2;
248
      srcP0 = vec_perm(srcR2, srcR3, permP0);
249
      srcP1 = vec_perm(srcR2, srcR3, permP1);
250
      srcP2 = vec_perm(srcR2, srcR3, permP2);
251
      srcP3 = vec_perm(srcR2, srcR3, permP3);
252
    } break;
253
    }
254
255 47261113 Luca Barbato
    srcP0A = (vector signed short)
256
                vec_mergeh((vector unsigned char)vzero, srcP0);
257
    srcP0B = (vector signed short)
258
                vec_mergel((vector unsigned char)vzero, srcP0);
259
    srcP1A = (vector signed short)
260
                vec_mergeh((vector unsigned char)vzero, srcP1);
261
    srcP1B = (vector signed short)
262
                vec_mergel((vector unsigned char)vzero, srcP1);
263
264
    srcP2A = (vector signed short)
265
                vec_mergeh((vector unsigned char)vzero, srcP2);
266
    srcP2B = (vector signed short)
267
                vec_mergel((vector unsigned char)vzero, srcP2);
268
    srcP3A = (vector signed short)
269
                vec_mergeh((vector unsigned char)vzero, srcP3);
270
    srcP3B = (vector signed short)
271
                vec_mergel((vector unsigned char)vzero, srcP3);
272
273
    srcM1A = (vector signed short)
274
                vec_mergeh((vector unsigned char)vzero, srcM1);
275
    srcM1B = (vector signed short)
276
                vec_mergel((vector unsigned char)vzero, srcM1);
277
    srcM2A = (vector signed short)
278
                vec_mergeh((vector unsigned char)vzero, srcM2);
279
    srcM2B = (vector signed short)
280
                vec_mergel((vector unsigned char)vzero, srcM2);
281 a6a12a8a Romain Dolbeau
282 e8772eec Luca Barbato
    sum1A = vec_adds(srcP0A, srcP1A);
283
    sum1B = vec_adds(srcP0B, srcP1B);
284
    sum2A = vec_adds(srcM1A, srcP2A);
285
    sum2B = vec_adds(srcM1B, srcP2B);
286
    sum3A = vec_adds(srcM2A, srcP3A);
287
    sum3B = vec_adds(srcM2B, srcP3B);
288 115329f1 Diego Biurrun
289 e8772eec Luca Barbato
    pp1A = vec_mladd(sum1A, v20ss, v16ss);
290
    pp1B = vec_mladd(sum1B, v20ss, v16ss);
291 a6a12a8a Romain Dolbeau
292 e8772eec Luca Barbato
    pp2A = vec_mladd(sum2A, v5ss, (vector signed short)vzero);
293
    pp2B = vec_mladd(sum2B, v5ss, (vector signed short)vzero);
294 115329f1 Diego Biurrun
295 e8772eec Luca Barbato
    pp3A = vec_add(sum3A, pp1A);
296
    pp3B = vec_add(sum3B, pp1B);
297 a6a12a8a Romain Dolbeau
298 e8772eec Luca Barbato
    psumA = vec_sub(pp3A, pp2A);
299
    psumB = vec_sub(pp3B, pp2B);
300 a6a12a8a Romain Dolbeau
301 e8772eec Luca Barbato
    sumA = vec_sra(psumA, v5us);
302
    sumB = vec_sra(psumB, v5us);
303 a6a12a8a Romain Dolbeau
304 e8772eec Luca Barbato
    sum = vec_packsu(sumA, sumB);
305 a6a12a8a Romain Dolbeau
306 e8772eec Luca Barbato
    dst1 = vec_ld(0, dst);
307
    dst2 = vec_ld(16, dst);
308
    vdst = vec_perm(dst1, dst2, vec_lvsl(0, dst));
309 a6a12a8a Romain Dolbeau
310
    OP_U8_ALTIVEC(fsum, sum, vdst);
311
312 e8772eec Luca Barbato
    rsum = vec_perm(fsum, fsum, dstperm);
313
    fdst1 = vec_sel(dst1, rsum, dstmask);
314
    fdst2 = vec_sel(rsum, dst2, dstmask);
315 a6a12a8a Romain Dolbeau
316
    vec_st(fdst1, 0, dst);
317
    vec_st(fdst2, 16, dst);
318
319
    src += srcStride;
320
    dst += dstStride;
321
  }
322
POWERPC_PERF_STOP_COUNT(PREFIX_h264_qpel16_h_lowpass_num, 1);
323
}
324
325
/* this code assume stride % 16 == 0 */
326
static void PREFIX_h264_qpel16_v_lowpass_altivec(uint8_t * dst, uint8_t * src, int dstStride, int srcStride) {
327
  POWERPC_PERF_DECLARE(PREFIX_h264_qpel16_v_lowpass_num, 1);
328 115329f1 Diego Biurrun
329 a6a12a8a Romain Dolbeau
  register int i;
330
331
  const vector signed int vzero = vec_splat_s32(0);
332
  const vector unsigned char perm = vec_lvsl(0, src);
333 5cb9fda4 Luca Barbato
  const vector signed short v20ss = vec_sl(vec_splat_s16(5),vec_splat_u16(2));
334 a6a12a8a Romain Dolbeau
  const vector unsigned short v5us = vec_splat_u16(5);
335
  const vector signed short v5ss = vec_splat_s16(5);
336 5cb9fda4 Luca Barbato
  const vector signed short v16ss = vec_sl(vec_splat_s16(1),vec_splat_u16(4));
337 a6a12a8a Romain Dolbeau
  const vector unsigned char dstperm = vec_lvsr(0, dst);
338
  const vector unsigned char neg1 = (const vector unsigned char)vec_splat_s8(-1);
339
  const vector unsigned char dstmask = vec_perm((const vector unsigned char)vzero, neg1, dstperm);
340 115329f1 Diego Biurrun
341 a6a12a8a Romain Dolbeau
  uint8_t *srcbis = src - (srcStride * 2);
342
343
  const vector unsigned char srcM2a = vec_ld(0, srcbis);
344
  const vector unsigned char srcM2b = vec_ld(16, srcbis);
345
  const vector unsigned char srcM2 = vec_perm(srcM2a, srcM2b, perm);
346 e8772eec Luca Barbato
//  srcbis += srcStride;
347
  const vector unsigned char srcM1a = vec_ld(0, srcbis += srcStride);
348 a6a12a8a Romain Dolbeau
  const vector unsigned char srcM1b = vec_ld(16, srcbis);
349
  const vector unsigned char srcM1 = vec_perm(srcM1a, srcM1b, perm);
350 e8772eec Luca Barbato
//  srcbis += srcStride;
351
  const vector unsigned char srcP0a = vec_ld(0, srcbis += srcStride);
352 a6a12a8a Romain Dolbeau
  const vector unsigned char srcP0b = vec_ld(16, srcbis);
353
  const vector unsigned char srcP0 = vec_perm(srcP0a, srcP0b, perm);
354 e8772eec Luca Barbato
//  srcbis += srcStride;
355
  const vector unsigned char srcP1a = vec_ld(0, srcbis += srcStride);
356 a6a12a8a Romain Dolbeau
  const vector unsigned char srcP1b = vec_ld(16, srcbis);
357
  const vector unsigned char srcP1 = vec_perm(srcP1a, srcP1b, perm);
358 e8772eec Luca Barbato
//  srcbis += srcStride;
359
  const vector unsigned char srcP2a = vec_ld(0, srcbis += srcStride);
360 a6a12a8a Romain Dolbeau
  const vector unsigned char srcP2b = vec_ld(16, srcbis);
361
  const vector unsigned char srcP2 = vec_perm(srcP2a, srcP2b, perm);
362 e8772eec Luca Barbato
//  srcbis += srcStride;
363
364
  vector signed short srcM2ssA = (vector signed short)
365
                                vec_mergeh((vector unsigned char)vzero, srcM2);
366
  vector signed short srcM2ssB = (vector signed short)
367
                                vec_mergel((vector unsigned char)vzero, srcM2);
368
  vector signed short srcM1ssA = (vector signed short)
369
                                vec_mergeh((vector unsigned char)vzero, srcM1);
370
  vector signed short srcM1ssB = (vector signed short)
371
                                vec_mergel((vector unsigned char)vzero, srcM1);
372
  vector signed short srcP0ssA = (vector signed short)
373
                                vec_mergeh((vector unsigned char)vzero, srcP0);
374
  vector signed short srcP0ssB = (vector signed short)
375
                                vec_mergel((vector unsigned char)vzero, srcP0);
376
  vector signed short srcP1ssA = (vector signed short)
377
                                vec_mergeh((vector unsigned char)vzero, srcP1);
378
  vector signed short srcP1ssB = (vector signed short)
379
                                vec_mergel((vector unsigned char)vzero, srcP1);
380
  vector signed short srcP2ssA = (vector signed short)
381
                                vec_mergeh((vector unsigned char)vzero, srcP2);
382
  vector signed short srcP2ssB = (vector signed short)
383
                                vec_mergel((vector unsigned char)vzero, srcP2);
384
385
  vector signed short pp1A, pp1B, pp2A, pp2B, pp3A, pp3B,
386
                      psumA, psumB, sumA, sumB,
387
                      srcP3ssA, srcP3ssB,
388
                      sum1A, sum1B, sum2A, sum2B, sum3A, sum3B;
389
390
  vector unsigned char sum, dst1, dst2, vdst, fsum, rsum, fdst1, fdst2,
391
                       srcP3a, srcP3b, srcP3;
392
393
  POWERPC_PERF_START_COUNT(PREFIX_h264_qpel16_v_lowpass_num, 1);
394 a6a12a8a Romain Dolbeau
395
  for (i = 0 ; i < 16 ; i++) {
396 e8772eec Luca Barbato
    srcP3a = vec_ld(0, srcbis += srcStride);
397
    srcP3b = vec_ld(16, srcbis);
398
    srcP3 = vec_perm(srcP3a, srcP3b, perm);
399
    srcP3ssA = (vector signed short)
400
                                vec_mergeh((vector unsigned char)vzero, srcP3);
401
    srcP3ssB = (vector signed short)
402
                                vec_mergel((vector unsigned char)vzero, srcP3);
403
//    srcbis += srcStride;
404
405
    sum1A = vec_adds(srcP0ssA, srcP1ssA);
406
    sum1B = vec_adds(srcP0ssB, srcP1ssB);
407
    sum2A = vec_adds(srcM1ssA, srcP2ssA);
408
    sum2B = vec_adds(srcM1ssB, srcP2ssB);
409
    sum3A = vec_adds(srcM2ssA, srcP3ssA);
410
    sum3B = vec_adds(srcM2ssB, srcP3ssB);
411 a6a12a8a Romain Dolbeau
412
    srcM2ssA = srcM1ssA;
413
    srcM2ssB = srcM1ssB;
414
    srcM1ssA = srcP0ssA;
415
    srcM1ssB = srcP0ssB;
416
    srcP0ssA = srcP1ssA;
417
    srcP0ssB = srcP1ssB;
418
    srcP1ssA = srcP2ssA;
419
    srcP1ssB = srcP2ssB;
420
    srcP2ssA = srcP3ssA;
421
    srcP2ssB = srcP3ssB;
422 115329f1 Diego Biurrun
423 e8772eec Luca Barbato
    pp1A = vec_mladd(sum1A, v20ss, v16ss);
424
    pp1B = vec_mladd(sum1B, v20ss, v16ss);
425 a6a12a8a Romain Dolbeau
426 e8772eec Luca Barbato
    pp2A = vec_mladd(sum2A, v5ss, (vector signed short)vzero);
427
    pp2B = vec_mladd(sum2B, v5ss, (vector signed short)vzero);
428 115329f1 Diego Biurrun
429 e8772eec Luca Barbato
    pp3A = vec_add(sum3A, pp1A);
430
    pp3B = vec_add(sum3B, pp1B);
431 a6a12a8a Romain Dolbeau
432 e8772eec Luca Barbato
    psumA = vec_sub(pp3A, pp2A);
433
    psumB = vec_sub(pp3B, pp2B);
434 a6a12a8a Romain Dolbeau
435 e8772eec Luca Barbato
    sumA = vec_sra(psumA, v5us);
436
    sumB = vec_sra(psumB, v5us);
437 a6a12a8a Romain Dolbeau
438 e8772eec Luca Barbato
    sum = vec_packsu(sumA, sumB);
439 a6a12a8a Romain Dolbeau
440 e8772eec Luca Barbato
    dst1 = vec_ld(0, dst);
441
    dst2 = vec_ld(16, dst);
442
    vdst = vec_perm(dst1, dst2, vec_lvsl(0, dst));
443 a6a12a8a Romain Dolbeau
444
    OP_U8_ALTIVEC(fsum, sum, vdst);
445
446 e8772eec Luca Barbato
    rsum = vec_perm(fsum, fsum, dstperm);
447
    fdst1 = vec_sel(dst1, rsum, dstmask);
448
    fdst2 = vec_sel(rsum, dst2, dstmask);
449 a6a12a8a Romain Dolbeau
450
    vec_st(fdst1, 0, dst);
451
    vec_st(fdst2, 16, dst);
452
453
    dst += dstStride;
454
  }
455
  POWERPC_PERF_STOP_COUNT(PREFIX_h264_qpel16_v_lowpass_num, 1);
456
}
457
458
/* this code assume stride % 16 == 0 *and* tmp is properly aligned */
459
static void PREFIX_h264_qpel16_hv_lowpass_altivec(uint8_t * dst, int16_t * tmp, uint8_t * src, int dstStride, int tmpStride, int srcStride) {
460
  POWERPC_PERF_DECLARE(PREFIX_h264_qpel16_hv_lowpass_num, 1);
461
  register int i;
462
  const vector signed int vzero = vec_splat_s32(0);
463
  const vector unsigned char permM2 = vec_lvsl(-2, src);
464
  const vector unsigned char permM1 = vec_lvsl(-1, src);
465
  const vector unsigned char permP0 = vec_lvsl(+0, src);
466
  const vector unsigned char permP1 = vec_lvsl(+1, src);
467
  const vector unsigned char permP2 = vec_lvsl(+2, src);
468
  const vector unsigned char permP3 = vec_lvsl(+3, src);
469 5cb9fda4 Luca Barbato
  const vector signed short v20ss = vec_sl(vec_splat_s16(5),vec_splat_u16(2));
470 a6a12a8a Romain Dolbeau
  const vector unsigned int v10ui = vec_splat_u32(10);
471
  const vector signed short v5ss = vec_splat_s16(5);
472
  const vector signed short v1ss = vec_splat_s16(1);
473 5cb9fda4 Luca Barbato
  const vector signed int v512si = vec_sl(vec_splat_s32(1),vec_splat_u32(9));
474 11c19637 Likai Liu
  const vector unsigned int v16ui = vec_sl(vec_splat_u32(1),vec_splat_u32(4));
475 a6a12a8a Romain Dolbeau
476
  register int align = ((((unsigned long)src) - 2) % 16);
477
478 e8772eec Luca Barbato
  const vector unsigned char neg1 = (const vector unsigned char)
479
                                                        vec_splat_s8(-1);
480
481
  vector signed short srcP0A, srcP0B, srcP1A, srcP1B,
482
                      srcP2A, srcP2B, srcP3A, srcP3B,
483
                      srcM1A, srcM1B, srcM2A, srcM2B,
484
                      sum1A, sum1B, sum2A, sum2B, sum3A, sum3B,
485
                      pp1A, pp1B, pp2A, pp2B, psumA, psumB;
486
487
  const vector unsigned char dstperm = vec_lvsr(0, dst);
488
489
  const vector unsigned char dstmask = vec_perm((const vector unsigned char)vzero, neg1, dstperm);
490
491
  const vector unsigned char mperm = (const vector unsigned char)
492
    AVV(0x00, 0x08, 0x01, 0x09, 0x02, 0x0A, 0x03, 0x0B,
493
        0x04, 0x0C, 0x05, 0x0D, 0x06, 0x0E, 0x07, 0x0F);
494
  int16_t *tmpbis = tmp;
495
496
  vector signed short tmpM1ssA, tmpM1ssB, tmpM2ssA, tmpM2ssB,
497
                      tmpP0ssA, tmpP0ssB, tmpP1ssA, tmpP1ssB,
498
                      tmpP2ssA, tmpP2ssB;
499 a6a12a8a Romain Dolbeau
500 e8772eec Luca Barbato
  vector signed int pp1Ae, pp1Ao, pp1Be, pp1Bo, pp2Ae, pp2Ao, pp2Be, pp2Bo,
501
                    pp3Ae, pp3Ao, pp3Be, pp3Bo, pp1cAe, pp1cAo, pp1cBe, pp1cBo,
502
                    pp32Ae, pp32Ao, pp32Be, pp32Bo, sumAe, sumAo, sumBe, sumBo,
503
                    ssumAe, ssumAo, ssumBe, ssumBo;
504
  vector unsigned char fsum, sumv, sum, dst1, dst2, vdst,
505
                       rsum, fdst1, fdst2;
506
  vector signed short ssume, ssumo;
507
508
  POWERPC_PERF_START_COUNT(PREFIX_h264_qpel16_hv_lowpass_num, 1);
509
  src -= (2 * srcStride);
510 a6a12a8a Romain Dolbeau
  for (i = 0 ; i < 21 ; i ++) {
511
    vector unsigned char srcM2, srcM1, srcP0, srcP1, srcP2, srcP3;
512
    vector unsigned char srcR1 = vec_ld(-2, src);
513
    vector unsigned char srcR2 = vec_ld(14, src);
514
515
    switch (align) {
516
    default: {
517
      srcM2 = vec_perm(srcR1, srcR2, permM2);
518
      srcM1 = vec_perm(srcR1, srcR2, permM1);
519
      srcP0 = vec_perm(srcR1, srcR2, permP0);
520
      srcP1 = vec_perm(srcR1, srcR2, permP1);
521
      srcP2 = vec_perm(srcR1, srcR2, permP2);
522
      srcP3 = vec_perm(srcR1, srcR2, permP3);
523
    } break;
524
    case 11: {
525
      srcM2 = vec_perm(srcR1, srcR2, permM2);
526
      srcM1 = vec_perm(srcR1, srcR2, permM1);
527
      srcP0 = vec_perm(srcR1, srcR2, permP0);
528
      srcP1 = vec_perm(srcR1, srcR2, permP1);
529
      srcP2 = vec_perm(srcR1, srcR2, permP2);
530
      srcP3 = srcR2;
531
    } break;
532
    case 12: {
533
      vector unsigned char srcR3 = vec_ld(30, src);
534
      srcM2 = vec_perm(srcR1, srcR2, permM2);
535
      srcM1 = vec_perm(srcR1, srcR2, permM1);
536
      srcP0 = vec_perm(srcR1, srcR2, permP0);
537
      srcP1 = vec_perm(srcR1, srcR2, permP1);
538
      srcP2 = srcR2;
539
      srcP3 = vec_perm(srcR2, srcR3, permP3);
540
    } break;
541
    case 13: {
542
      vector unsigned char srcR3 = vec_ld(30, src);
543
      srcM2 = vec_perm(srcR1, srcR2, permM2);
544
      srcM1 = vec_perm(srcR1, srcR2, permM1);
545
      srcP0 = vec_perm(srcR1, srcR2, permP0);
546
      srcP1 = srcR2;
547
      srcP2 = vec_perm(srcR2, srcR3, permP2);
548
      srcP3 = vec_perm(srcR2, srcR3, permP3);
549
    } break;
550
    case 14: {
551
      vector unsigned char srcR3 = vec_ld(30, src);
552
      srcM2 = vec_perm(srcR1, srcR2, permM2);
553
      srcM1 = vec_perm(srcR1, srcR2, permM1);
554
      srcP0 = srcR2;
555
      srcP1 = vec_perm(srcR2, srcR3, permP1);
556
      srcP2 = vec_perm(srcR2, srcR3, permP2);
557
      srcP3 = vec_perm(srcR2, srcR3, permP3);
558
    } break;
559
    case 15: {
560
      vector unsigned char srcR3 = vec_ld(30, src);
561
      srcM2 = vec_perm(srcR1, srcR2, permM2);
562
      srcM1 = srcR2;
563
      srcP0 = vec_perm(srcR2, srcR3, permP0);
564
      srcP1 = vec_perm(srcR2, srcR3, permP1);
565
      srcP2 = vec_perm(srcR2, srcR3, permP2);
566
      srcP3 = vec_perm(srcR2, srcR3, permP3);
567
    } break;
568
    }
569
570 e8772eec Luca Barbato
    srcP0A = (vector signed short)
571
                            vec_mergeh((vector unsigned char)vzero, srcP0);
572
    srcP0B = (vector signed short)
573
                            vec_mergel((vector unsigned char)vzero, srcP0);
574
    srcP1A = (vector signed short)
575
                            vec_mergeh((vector unsigned char)vzero, srcP1);
576
    srcP1B = (vector signed short)
577
                            vec_mergel((vector unsigned char)vzero, srcP1);
578
579
    srcP2A = (vector signed short)
580
                            vec_mergeh((vector unsigned char)vzero, srcP2);
581
    srcP2B = (vector signed short)
582
                            vec_mergel((vector unsigned char)vzero, srcP2);
583
    srcP3A = (vector signed short)
584
                            vec_mergeh((vector unsigned char)vzero, srcP3);
585
    srcP3B = (vector signed short)
586
                            vec_mergel((vector unsigned char)vzero, srcP3);
587
588
    srcM1A = (vector signed short)
589
                            vec_mergeh((vector unsigned char)vzero, srcM1);
590
    srcM1B = (vector signed short)
591
                            vec_mergel((vector unsigned char)vzero, srcM1);
592
    srcM2A = (vector signed short)
593
                            vec_mergeh((vector unsigned char)vzero, srcM2);
594
    srcM2B = (vector signed short)
595
                            vec_mergel((vector unsigned char)vzero, srcM2);
596
597
    sum1A = vec_adds(srcP0A, srcP1A);
598
    sum1B = vec_adds(srcP0B, srcP1B);
599
    sum2A = vec_adds(srcM1A, srcP2A);
600
    sum2B = vec_adds(srcM1B, srcP2B);
601
    sum3A = vec_adds(srcM2A, srcP3A);
602
    sum3B = vec_adds(srcM2B, srcP3B);
603
604
    pp1A = vec_mladd(sum1A, v20ss, sum3A);
605
    pp1B = vec_mladd(sum1B, v20ss, sum3B);
606
607
    pp2A = vec_mladd(sum2A, v5ss, (vector signed short)vzero);
608
    pp2B = vec_mladd(sum2B, v5ss, (vector signed short)vzero);
609
610
    psumA = vec_sub(pp1A, pp2A);
611
    psumB = vec_sub(pp1B, pp2B);
612 a6a12a8a Romain Dolbeau
613
    vec_st(psumA, 0, tmp);
614
    vec_st(psumB, 16, tmp);
615 115329f1 Diego Biurrun
616 a6a12a8a Romain Dolbeau
    src += srcStride;
617
    tmp += tmpStride; /* int16_t*, and stride is 16, so it's OK here */
618
  }
619 115329f1 Diego Biurrun
620 e8772eec Luca Barbato
  tmpM2ssA = vec_ld(0, tmpbis);
621
  tmpM2ssB = vec_ld(16, tmpbis);
622 a6a12a8a Romain Dolbeau
  tmpbis += tmpStride;
623 e8772eec Luca Barbato
  tmpM1ssA = vec_ld(0, tmpbis);
624
  tmpM1ssB = vec_ld(16, tmpbis);
625 a6a12a8a Romain Dolbeau
  tmpbis += tmpStride;
626 e8772eec Luca Barbato
  tmpP0ssA = vec_ld(0, tmpbis);
627
  tmpP0ssB = vec_ld(16, tmpbis);
628 a6a12a8a Romain Dolbeau
  tmpbis += tmpStride;
629 e8772eec Luca Barbato
  tmpP1ssA = vec_ld(0, tmpbis);
630
  tmpP1ssB = vec_ld(16, tmpbis);
631 a6a12a8a Romain Dolbeau
  tmpbis += tmpStride;
632 e8772eec Luca Barbato
  tmpP2ssA = vec_ld(0, tmpbis);
633
  tmpP2ssB = vec_ld(16, tmpbis);
634 a6a12a8a Romain Dolbeau
  tmpbis += tmpStride;
635
636
  for (i = 0 ; i < 16 ; i++) {
637
    const vector signed short tmpP3ssA = vec_ld(0, tmpbis);
638
    const vector signed short tmpP3ssB = vec_ld(16, tmpbis);
639
640
    const vector signed short sum1A = vec_adds(tmpP0ssA, tmpP1ssA);
641
    const vector signed short sum1B = vec_adds(tmpP0ssB, tmpP1ssB);
642
    const vector signed short sum2A = vec_adds(tmpM1ssA, tmpP2ssA);
643
    const vector signed short sum2B = vec_adds(tmpM1ssB, tmpP2ssB);
644
    const vector signed short sum3A = vec_adds(tmpM2ssA, tmpP3ssA);
645
    const vector signed short sum3B = vec_adds(tmpM2ssB, tmpP3ssB);
646
647 e8772eec Luca Barbato
    tmpbis += tmpStride;
648
649 a6a12a8a Romain Dolbeau
    tmpM2ssA = tmpM1ssA;
650
    tmpM2ssB = tmpM1ssB;
651
    tmpM1ssA = tmpP0ssA;
652
    tmpM1ssB = tmpP0ssB;
653
    tmpP0ssA = tmpP1ssA;
654
    tmpP0ssB = tmpP1ssB;
655
    tmpP1ssA = tmpP2ssA;
656
    tmpP1ssB = tmpP2ssB;
657
    tmpP2ssA = tmpP3ssA;
658
    tmpP2ssB = tmpP3ssB;
659
660 e8772eec Luca Barbato
    pp1Ae = vec_mule(sum1A, v20ss);
661
    pp1Ao = vec_mulo(sum1A, v20ss);
662
    pp1Be = vec_mule(sum1B, v20ss);
663
    pp1Bo = vec_mulo(sum1B, v20ss);
664 a6a12a8a Romain Dolbeau
665 e8772eec Luca Barbato
    pp2Ae = vec_mule(sum2A, v5ss);
666
    pp2Ao = vec_mulo(sum2A, v5ss);
667
    pp2Be = vec_mule(sum2B, v5ss);
668
    pp2Bo = vec_mulo(sum2B, v5ss);
669 a6a12a8a Romain Dolbeau
670 e8772eec Luca Barbato
    pp3Ae = vec_sra((vector signed int)sum3A, v16ui);
671
    pp3Ao = vec_mulo(sum3A, v1ss);
672
    pp3Be = vec_sra((vector signed int)sum3B, v16ui);
673
    pp3Bo = vec_mulo(sum3B, v1ss);
674 a6a12a8a Romain Dolbeau
675 e8772eec Luca Barbato
    pp1cAe = vec_add(pp1Ae, v512si);
676
    pp1cAo = vec_add(pp1Ao, v512si);
677
    pp1cBe = vec_add(pp1Be, v512si);
678
    pp1cBo = vec_add(pp1Bo, v512si);
679 a6a12a8a Romain Dolbeau
680 e8772eec Luca Barbato
    pp32Ae = vec_sub(pp3Ae, pp2Ae);
681
    pp32Ao = vec_sub(pp3Ao, pp2Ao);
682
    pp32Be = vec_sub(pp3Be, pp2Be);
683
    pp32Bo = vec_sub(pp3Bo, pp2Bo);
684 a6a12a8a Romain Dolbeau
685 e8772eec Luca Barbato
    sumAe = vec_add(pp1cAe, pp32Ae);
686
    sumAo = vec_add(pp1cAo, pp32Ao);
687
    sumBe = vec_add(pp1cBe, pp32Be);
688
    sumBo = vec_add(pp1cBo, pp32Bo);
689 115329f1 Diego Biurrun
690 e8772eec Luca Barbato
    ssumAe = vec_sra(sumAe, v10ui);
691
    ssumAo = vec_sra(sumAo, v10ui);
692
    ssumBe = vec_sra(sumBe, v10ui);
693
    ssumBo = vec_sra(sumBo, v10ui);
694 a6a12a8a Romain Dolbeau
695 e8772eec Luca Barbato
    ssume = vec_packs(ssumAe, ssumBe);
696
    ssumo = vec_packs(ssumAo, ssumBo);
697 a6a12a8a Romain Dolbeau
698 e8772eec Luca Barbato
    sumv = vec_packsu(ssume, ssumo);
699
    sum = vec_perm(sumv, sumv, mperm);
700 a6a12a8a Romain Dolbeau
701 e8772eec Luca Barbato
    dst1 = vec_ld(0, dst);
702
    dst2 = vec_ld(16, dst);
703
    vdst = vec_perm(dst1, dst2, vec_lvsl(0, dst));
704 a6a12a8a Romain Dolbeau
705
    OP_U8_ALTIVEC(fsum, sum, vdst);
706
707 e8772eec Luca Barbato
    rsum = vec_perm(fsum, fsum, dstperm);
708
    fdst1 = vec_sel(dst1, rsum, dstmask);
709
    fdst2 = vec_sel(rsum, dst2, dstmask);
710 a6a12a8a Romain Dolbeau
711
    vec_st(fdst1, 0, dst);
712
    vec_st(fdst2, 16, dst);
713
714
    dst += dstStride;
715
  }
716
  POWERPC_PERF_STOP_COUNT(PREFIX_h264_qpel16_hv_lowpass_num, 1);
717
}