/*
 * Copyright (c) 2004 Romain Dolbeau <romain@dolbeau.org>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

/* this code assumes that stride % 16 == 0 */
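/* 8-wide chroma MC: each output sample is the bilinear blend
 *   dst[i] = (A*src[i] + B*src[i+1] + C*src[i+stride] + D*src[i+stride+1] + 32) >> 6
 * with A = (8-x)*(8-y), B = x*(8-y), C = (8-x)*y, D = x*y (the ABCD table below).
 * OP_U8_ALTIVEC (put or avg) is supplied by the file that includes this template. */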
void PREFIX_h264_chroma_mc8_altivec(uint8_t * dst, uint8_t * src, int stride, int h, int x, int y) {
  POWERPC_PERF_DECLARE(PREFIX_h264_chroma_mc8_num, 1);
    DECLARE_ALIGNED_16(signed int, ABCD[4]) =
                        {((8 - x) * (8 - y)),
                          ((x) * (8 - y)),
                          ((8 - x) * (y)),
                          ((x) * (y))};
    register int i;
    vec_u8_t fperm;
    const vec_s32_t vABCD = vec_ld(0, ABCD);
    const vec_s16_t vA = vec_splat((vec_s16_t)vABCD, 1);
    const vec_s16_t vB = vec_splat((vec_s16_t)vABCD, 3);
    const vec_s16_t vC = vec_splat((vec_s16_t)vABCD, 5);
    const vec_s16_t vD = vec_splat((vec_s16_t)vABCD, 7);
    LOAD_ZERO;
    const vec_s16_t v32ss = vec_sl(vec_splat_s16(1),vec_splat_u16(5));
    const vec_u16_t v6us = vec_splat_u16(6);
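    /* src need not be aligned: loadSecond is set when the 9 bytes needed per
     * row (8 samples plus one extra for the x interpolation) spill into the
     * next 16-byte block; reallyBadAlign flags src % 16 == 15, where
     * vec_lvsl(1, src) wraps to 0 and src + 1 is exactly the second block. */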
    register int loadSecond = (((unsigned long)src) % 16) <= 7 ? 0 : 1;
    register int reallyBadAlign = (((unsigned long)src) % 16) == 15 ? 1 : 0;

    vec_u8_t vsrcAuc, vsrcBuc, vsrcperm0, vsrcperm1;
    vec_u8_t vsrc0uc, vsrc1uc;
    vec_s16_t vsrc0ssH, vsrc1ssH;
    vec_u8_t vsrcCuc, vsrc2uc, vsrc3uc;
    vec_s16_t vsrc2ssH, vsrc3ssH, psum;
    vec_u8_t vdst, ppsum, vfdst, fsum;

  POWERPC_PERF_START_COUNT(PREFIX_h264_chroma_mc8_num, 1);

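    /* fperm merges the 8 result bytes into the correct half of the aligned
     * 16-byte block covering dst, so the store below can be a plain vec_st. */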
    if (((unsigned long)dst) % 16 == 0) {
      fperm = (vec_u8_t)AVV(0x10, 0x11, 0x12, 0x13,
                            0x14, 0x15, 0x16, 0x17,
                            0x08, 0x09, 0x0A, 0x0B,
                            0x0C, 0x0D, 0x0E, 0x0F);
    } else {
      fperm = (vec_u8_t)AVV(0x00, 0x01, 0x02, 0x03,
                            0x04, 0x05, 0x06, 0x07,
                            0x18, 0x19, 0x1A, 0x1B,
                            0x1C, 0x1D, 0x1E, 0x1F);
    }

    vsrcAuc = vec_ld(0, src);

    if (loadSecond)
      vsrcBuc = vec_ld(16, src);
    vsrcperm0 = vec_lvsl(0, src);
    vsrcperm1 = vec_lvsl(1, src);

    vsrc0uc = vec_perm(vsrcAuc, vsrcBuc, vsrcperm0);
    if (reallyBadAlign)
      vsrc1uc = vsrcBuc;
    else
      vsrc1uc = vec_perm(vsrcAuc, vsrcBuc, vsrcperm1);

    vsrc0ssH = (vec_s16_t)vec_mergeh(zero_u8v,(vec_u8_t)vsrc0uc);
    vsrc1ssH = (vec_s16_t)vec_mergeh(zero_u8v,(vec_u8_t)vsrc1uc);

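    /* Two variants of the per-row loop: when the 9 source bytes fit in one
     * aligned 16-byte block a single vec_ld per row is enough; otherwise two
     * loads per row are needed (with the reallyBadAlign special case). */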
    if (!loadSecond) {// -> !reallyBadAlign
      for (i = 0 ; i < h ; i++) {

        vsrcCuc = vec_ld(stride + 0, src);

        vsrc2uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm0);
        vsrc3uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm1);

        vsrc2ssH = (vec_s16_t)vec_mergeh(zero_u8v,(vec_u8_t)vsrc2uc);
        vsrc3ssH = (vec_s16_t)vec_mergeh(zero_u8v,(vec_u8_t)vsrc3uc);

        psum = vec_mladd(vA, vsrc0ssH, vec_splat_s16(0));
        psum = vec_mladd(vB, vsrc1ssH, psum);
        psum = vec_mladd(vC, vsrc2ssH, psum);
        psum = vec_mladd(vD, vsrc3ssH, psum);
        psum = vec_add(v32ss, psum);
        psum = vec_sra(psum, v6us);

        vdst = vec_ld(0, dst);
        ppsum = (vec_u8_t)vec_packsu(psum, psum);
        vfdst = vec_perm(vdst, ppsum, fperm);

        OP_U8_ALTIVEC(fsum, vfdst, vdst);

        vec_st(fsum, 0, dst);

        vsrc0ssH = vsrc2ssH;
        vsrc1ssH = vsrc3ssH;

        dst += stride;
        src += stride;
      }
    } else {
        vec_u8_t vsrcDuc;
      for (i = 0 ; i < h ; i++) {
        vsrcCuc = vec_ld(stride + 0, src);
        vsrcDuc = vec_ld(stride + 16, src);

        vsrc2uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm0);
        if (reallyBadAlign)
          vsrc3uc = vsrcDuc;
        else
          vsrc3uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm1);

        vsrc2ssH = (vec_s16_t)vec_mergeh(zero_u8v,(vec_u8_t)vsrc2uc);
        vsrc3ssH = (vec_s16_t)vec_mergeh(zero_u8v,(vec_u8_t)vsrc3uc);

        psum = vec_mladd(vA, vsrc0ssH, vec_splat_s16(0));
        psum = vec_mladd(vB, vsrc1ssH, psum);
        psum = vec_mladd(vC, vsrc2ssH, psum);
        psum = vec_mladd(vD, vsrc3ssH, psum);
        psum = vec_add(v32ss, psum);
        psum = vec_sr(psum, v6us);

        vdst = vec_ld(0, dst);
        ppsum = (vec_u8_t)vec_pack(psum, psum);
        vfdst = vec_perm(vdst, ppsum, fperm);

        OP_U8_ALTIVEC(fsum, vfdst, vdst);

        vec_st(fsum, 0, dst);

        vsrc0ssH = vsrc2ssH;
        vsrc1ssH = vsrc3ssH;

        dst += stride;
        src += stride;
      }
    }
    POWERPC_PERF_STOP_COUNT(PREFIX_h264_chroma_mc8_num, 1);
}

/* this code assumes stride % 16 == 0 */
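/* Horizontal 6-tap luma lowpass for a 16x16 block:
 *   dst[i] = clip(((src[i] + src[i+1]) * 20 - (src[i-1] + src[i+2]) * 5
 *                  + (src[i-2] + src[i+3]) + 16) >> 5)
 * Each row is processed as two halves of eight 16-bit values (the A/B variables). */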
static void PREFIX_h264_qpel16_h_lowpass_altivec(uint8_t * dst, uint8_t * src, int dstStride, int srcStride) {
  POWERPC_PERF_DECLARE(PREFIX_h264_qpel16_h_lowpass_num, 1);
  register int i;

  LOAD_ZERO;
  const vec_u8_t permM2 = vec_lvsl(-2, src);
  const vec_u8_t permM1 = vec_lvsl(-1, src);
  const vec_u8_t permP0 = vec_lvsl(+0, src);
  const vec_u8_t permP1 = vec_lvsl(+1, src);
  const vec_u8_t permP2 = vec_lvsl(+2, src);
  const vec_u8_t permP3 = vec_lvsl(+3, src);
  const vec_s16_t v5ss = vec_splat_s16(5);
  const vec_u16_t v5us = vec_splat_u16(5);
  const vec_s16_t v20ss = vec_sl(vec_splat_s16(5),vec_splat_u16(2));
  const vec_s16_t v16ss = vec_sl(vec_splat_s16(1),vec_splat_u16(4));
  const vec_u8_t dstperm = vec_lvsr(0, dst);
  const vec_u8_t neg1 = (const vec_u8_t) vec_splat_s8(-1);
  const vec_u8_t dstmask = vec_perm(zero_u8v, neg1, dstperm);

  vec_u8_t srcM2, srcM1, srcP0, srcP1, srcP2, srcP3;

  register int align = ((((unsigned long)src) - 2) % 16);

  vec_s16_t srcP0A, srcP0B, srcP1A, srcP1B,
            srcP2A, srcP2B, srcP3A, srcP3B,
            srcM1A, srcM1B, srcM2A, srcM2B,
            sum1A, sum1B, sum2A, sum2B, sum3A, sum3B,
            pp1A, pp1B, pp2A, pp2B, pp3A, pp3B,
            psumA, psumB, sumA, sumB;

  vec_u8_t sum, dst1, dst2, vdst, fsum, rsum, fdst1, fdst2;

  POWERPC_PERF_START_COUNT(PREFIX_h264_qpel16_h_lowpass_num, 1);

  for (i = 0 ; i < 16 ; i ++) {
    vec_u8_t srcR1 = vec_ld(-2, src);
    vec_u8_t srcR2 = vec_ld(14, src);

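    /* Depending on the alignment of src - 2, the 21 bytes needed for this row
     * may all come from srcR1/srcR2, or the rightmost taps may need a third
     * load (srcR3); the cases where a vec_lvsl offset wraps to 0 take the
     * aligned second vector directly instead of a vec_perm. */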
    switch (align) {
    default: {
      srcM2 = vec_perm(srcR1, srcR2, permM2);
      srcM1 = vec_perm(srcR1, srcR2, permM1);
      srcP0 = vec_perm(srcR1, srcR2, permP0);
      srcP1 = vec_perm(srcR1, srcR2, permP1);
      srcP2 = vec_perm(srcR1, srcR2, permP2);
      srcP3 = vec_perm(srcR1, srcR2, permP3);
    } break;
    case 11: {
      srcM2 = vec_perm(srcR1, srcR2, permM2);
      srcM1 = vec_perm(srcR1, srcR2, permM1);
      srcP0 = vec_perm(srcR1, srcR2, permP0);
      srcP1 = vec_perm(srcR1, srcR2, permP1);
      srcP2 = vec_perm(srcR1, srcR2, permP2);
      srcP3 = srcR2;
    } break;
    case 12: {
      vec_u8_t srcR3 = vec_ld(30, src);
      srcM2 = vec_perm(srcR1, srcR2, permM2);
      srcM1 = vec_perm(srcR1, srcR2, permM1);
      srcP0 = vec_perm(srcR1, srcR2, permP0);
      srcP1 = vec_perm(srcR1, srcR2, permP1);
      srcP2 = srcR2;
      srcP3 = vec_perm(srcR2, srcR3, permP3);
    } break;
    case 13: {
      vec_u8_t srcR3 = vec_ld(30, src);
      srcM2 = vec_perm(srcR1, srcR2, permM2);
      srcM1 = vec_perm(srcR1, srcR2, permM1);
      srcP0 = vec_perm(srcR1, srcR2, permP0);
      srcP1 = srcR2;
      srcP2 = vec_perm(srcR2, srcR3, permP2);
      srcP3 = vec_perm(srcR2, srcR3, permP3);
    } break;
    case 14: {
      vec_u8_t srcR3 = vec_ld(30, src);
      srcM2 = vec_perm(srcR1, srcR2, permM2);
      srcM1 = vec_perm(srcR1, srcR2, permM1);
      srcP0 = srcR2;
      srcP1 = vec_perm(srcR2, srcR3, permP1);
      srcP2 = vec_perm(srcR2, srcR3, permP2);
      srcP3 = vec_perm(srcR2, srcR3, permP3);
    } break;
    case 15: {
      vec_u8_t srcR3 = vec_ld(30, src);
      srcM2 = vec_perm(srcR1, srcR2, permM2);
      srcM1 = srcR2;
      srcP0 = vec_perm(srcR2, srcR3, permP0);
      srcP1 = vec_perm(srcR2, srcR3, permP1);
      srcP2 = vec_perm(srcR2, srcR3, permP2);
      srcP3 = vec_perm(srcR2, srcR3, permP3);
    } break;
    }

    srcP0A = (vec_s16_t) vec_mergeh(zero_u8v, srcP0);
    srcP0B = (vec_s16_t) vec_mergel(zero_u8v, srcP0);
    srcP1A = (vec_s16_t) vec_mergeh(zero_u8v, srcP1);
    srcP1B = (vec_s16_t) vec_mergel(zero_u8v, srcP1);

    srcP2A = (vec_s16_t) vec_mergeh(zero_u8v, srcP2);
    srcP2B = (vec_s16_t) vec_mergel(zero_u8v, srcP2);
    srcP3A = (vec_s16_t) vec_mergeh(zero_u8v, srcP3);
    srcP3B = (vec_s16_t) vec_mergel(zero_u8v, srcP3);

    srcM1A = (vec_s16_t) vec_mergeh(zero_u8v, srcM1);
    srcM1B = (vec_s16_t) vec_mergel(zero_u8v, srcM1);
    srcM2A = (vec_s16_t) vec_mergeh(zero_u8v, srcM2);
    srcM2B = (vec_s16_t) vec_mergel(zero_u8v, srcM2);

    sum1A = vec_adds(srcP0A, srcP1A);
    sum1B = vec_adds(srcP0B, srcP1B);
    sum2A = vec_adds(srcM1A, srcP2A);
    sum2B = vec_adds(srcM1B, srcP2B);
    sum3A = vec_adds(srcM2A, srcP3A);
    sum3B = vec_adds(srcM2B, srcP3B);

    pp1A = vec_mladd(sum1A, v20ss, v16ss);
    pp1B = vec_mladd(sum1B, v20ss, v16ss);

    pp2A = vec_mladd(sum2A, v5ss, zero_s16v);
    pp2B = vec_mladd(sum2B, v5ss, zero_s16v);

    pp3A = vec_add(sum3A, pp1A);
    pp3B = vec_add(sum3B, pp1B);

    psumA = vec_sub(pp3A, pp2A);
    psumB = vec_sub(pp3B, pp2B);

    sumA = vec_sra(psumA, v5us);
    sumB = vec_sra(psumB, v5us);

    sum = vec_packsu(sumA, sumB);

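    /* dst may be unaligned: load the two 16-byte blocks covering it, rotate
     * the result with the vec_lvsr permutation, blend it in with dstmask and
     * store both blocks back. */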
    dst1 = vec_ld(0, dst);
    dst2 = vec_ld(16, dst);
    vdst = vec_perm(dst1, dst2, vec_lvsl(0, dst));

    OP_U8_ALTIVEC(fsum, sum, vdst);

    rsum = vec_perm(fsum, fsum, dstperm);
    fdst1 = vec_sel(dst1, rsum, dstmask);
    fdst2 = vec_sel(rsum, dst2, dstmask);

    vec_st(fdst1, 0, dst);
    vec_st(fdst2, 16, dst);

    src += srcStride;
    dst += dstStride;
  }
  POWERPC_PERF_STOP_COUNT(PREFIX_h264_qpel16_h_lowpass_num, 1);
}

/* this code assumes stride % 16 == 0 */
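/* Vertical 6-tap luma lowpass for a 16x16 block:
 *   dst[i] = clip(((srcP0[i] + srcP1[i]) * 20 - (srcM1[i] + srcP2[i]) * 5
 *                  + (srcM2[i] + srcP3[i]) + 16) >> 5)
 * where srcM2..srcP3 are the six rows centred on the current output row.
 * The first five rows are loaded up front and kept in a sliding window. */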
static void PREFIX_h264_qpel16_v_lowpass_altivec(uint8_t * dst, uint8_t * src, int dstStride, int srcStride) {
  POWERPC_PERF_DECLARE(PREFIX_h264_qpel16_v_lowpass_num, 1);

  register int i;

  LOAD_ZERO;
  const vec_u8_t perm = vec_lvsl(0, src);
  const vec_s16_t v20ss = vec_sl(vec_splat_s16(5),vec_splat_u16(2));
  const vec_u16_t v5us = vec_splat_u16(5);
  const vec_s16_t v5ss = vec_splat_s16(5);
  const vec_s16_t v16ss = vec_sl(vec_splat_s16(1),vec_splat_u16(4));
  const vec_u8_t dstperm = vec_lvsr(0, dst);
  const vec_u8_t neg1 = (const vec_u8_t)vec_splat_s8(-1);
  const vec_u8_t dstmask = vec_perm(zero_u8v, neg1, dstperm);

  uint8_t *srcbis = src - (srcStride * 2);

  const vec_u8_t srcM2a = vec_ld(0, srcbis);
  const vec_u8_t srcM2b = vec_ld(16, srcbis);
  const vec_u8_t srcM2 = vec_perm(srcM2a, srcM2b, perm);
//  srcbis += srcStride;
  const vec_u8_t srcM1a = vec_ld(0, srcbis += srcStride);
  const vec_u8_t srcM1b = vec_ld(16, srcbis);
  const vec_u8_t srcM1 = vec_perm(srcM1a, srcM1b, perm);
//  srcbis += srcStride;
  const vec_u8_t srcP0a = vec_ld(0, srcbis += srcStride);
  const vec_u8_t srcP0b = vec_ld(16, srcbis);
  const vec_u8_t srcP0 = vec_perm(srcP0a, srcP0b, perm);
//  srcbis += srcStride;
  const vec_u8_t srcP1a = vec_ld(0, srcbis += srcStride);
  const vec_u8_t srcP1b = vec_ld(16, srcbis);
  const vec_u8_t srcP1 = vec_perm(srcP1a, srcP1b, perm);
//  srcbis += srcStride;
  const vec_u8_t srcP2a = vec_ld(0, srcbis += srcStride);
  const vec_u8_t srcP2b = vec_ld(16, srcbis);
  const vec_u8_t srcP2 = vec_perm(srcP2a, srcP2b, perm);
//  srcbis += srcStride;

  vec_s16_t srcM2ssA = (vec_s16_t) vec_mergeh(zero_u8v, srcM2);
  vec_s16_t srcM2ssB = (vec_s16_t) vec_mergel(zero_u8v, srcM2);
  vec_s16_t srcM1ssA = (vec_s16_t) vec_mergeh(zero_u8v, srcM1);
  vec_s16_t srcM1ssB = (vec_s16_t) vec_mergel(zero_u8v, srcM1);
  vec_s16_t srcP0ssA = (vec_s16_t) vec_mergeh(zero_u8v, srcP0);
  vec_s16_t srcP0ssB = (vec_s16_t) vec_mergel(zero_u8v, srcP0);
  vec_s16_t srcP1ssA = (vec_s16_t) vec_mergeh(zero_u8v, srcP1);
  vec_s16_t srcP1ssB = (vec_s16_t) vec_mergel(zero_u8v, srcP1);
  vec_s16_t srcP2ssA = (vec_s16_t) vec_mergeh(zero_u8v, srcP2);
  vec_s16_t srcP2ssB = (vec_s16_t) vec_mergel(zero_u8v, srcP2);

  vec_s16_t pp1A, pp1B, pp2A, pp2B, pp3A, pp3B,
            psumA, psumB, sumA, sumB,
            srcP3ssA, srcP3ssB,
            sum1A, sum1B, sum2A, sum2B, sum3A, sum3B;

  vec_u8_t sum, dst1, dst2, vdst, fsum, rsum, fdst1, fdst2, srcP3a, srcP3b, srcP3;

  POWERPC_PERF_START_COUNT(PREFIX_h264_qpel16_v_lowpass_num, 1);

  for (i = 0 ; i < 16 ; i++) {
    srcP3a = vec_ld(0, srcbis += srcStride);
    srcP3b = vec_ld(16, srcbis);
    srcP3 = vec_perm(srcP3a, srcP3b, perm);
    srcP3ssA = (vec_s16_t) vec_mergeh(zero_u8v, srcP3);
    srcP3ssB = (vec_s16_t) vec_mergel(zero_u8v, srcP3);
//    srcbis += srcStride;

    sum1A = vec_adds(srcP0ssA, srcP1ssA);
    sum1B = vec_adds(srcP0ssB, srcP1ssB);
    sum2A = vec_adds(srcM1ssA, srcP2ssA);
    sum2B = vec_adds(srcM1ssB, srcP2ssB);
    sum3A = vec_adds(srcM2ssA, srcP3ssA);
    sum3B = vec_adds(srcM2ssB, srcP3ssB);

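    /* slide the six-row window down by one row for the next iteration */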
    srcM2ssA = srcM1ssA;
    srcM2ssB = srcM1ssB;
    srcM1ssA = srcP0ssA;
    srcM1ssB = srcP0ssB;
    srcP0ssA = srcP1ssA;
    srcP0ssB = srcP1ssB;
    srcP1ssA = srcP2ssA;
    srcP1ssB = srcP2ssB;
    srcP2ssA = srcP3ssA;
    srcP2ssB = srcP3ssB;

    pp1A = vec_mladd(sum1A, v20ss, v16ss);
    pp1B = vec_mladd(sum1B, v20ss, v16ss);

    pp2A = vec_mladd(sum2A, v5ss, zero_s16v);
    pp2B = vec_mladd(sum2B, v5ss, zero_s16v);

    pp3A = vec_add(sum3A, pp1A);
    pp3B = vec_add(sum3B, pp1B);

    psumA = vec_sub(pp3A, pp2A);
    psumB = vec_sub(pp3B, pp2B);

    sumA = vec_sra(psumA, v5us);
    sumB = vec_sra(psumB, v5us);

    sum = vec_packsu(sumA, sumB);

    dst1 = vec_ld(0, dst);
    dst2 = vec_ld(16, dst);
    vdst = vec_perm(dst1, dst2, vec_lvsl(0, dst));

    OP_U8_ALTIVEC(fsum, sum, vdst);

    rsum = vec_perm(fsum, fsum, dstperm);
    fdst1 = vec_sel(dst1, rsum, dstmask);
    fdst2 = vec_sel(rsum, dst2, dstmask);

    vec_st(fdst1, 0, dst);
    vec_st(fdst2, 16, dst);

    dst += dstStride;
  }
  POWERPC_PERF_STOP_COUNT(PREFIX_h264_qpel16_v_lowpass_num, 1);
}

/* this code assumes stride % 16 == 0 *and* tmp is properly aligned */
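/* Combined horizontal + vertical 6-tap luma lowpass for a 16x16 block.
 * First pass: 21 rows (16 plus the 5-row margin needed by the vertical filter)
 * of the horizontal filter, without rounding or shifting, stored as int16_t in
 * tmp.  Second pass: the vertical filter on those intermediates, with
 *   dst[i] = clip((vfilter(tmp) + 512) >> 10)
 * computed in 32-bit precision. */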
static void PREFIX_h264_qpel16_hv_lowpass_altivec(uint8_t * dst, int16_t * tmp, uint8_t * src, int dstStride, int tmpStride, int srcStride) {
  POWERPC_PERF_DECLARE(PREFIX_h264_qpel16_hv_lowpass_num, 1);
  register int i;
  LOAD_ZERO;
  const vec_u8_t permM2 = vec_lvsl(-2, src);
  const vec_u8_t permM1 = vec_lvsl(-1, src);
  const vec_u8_t permP0 = vec_lvsl(+0, src);
  const vec_u8_t permP1 = vec_lvsl(+1, src);
  const vec_u8_t permP2 = vec_lvsl(+2, src);
  const vec_u8_t permP3 = vec_lvsl(+3, src);
  const vec_s16_t v20ss = vec_sl(vec_splat_s16(5),vec_splat_u16(2));
  const vec_u32_t v10ui = vec_splat_u32(10);
  const vec_s16_t v5ss = vec_splat_s16(5);
  const vec_s16_t v1ss = vec_splat_s16(1);
  const vec_s32_t v512si = vec_sl(vec_splat_s32(1),vec_splat_u32(9));
  const vec_u32_t v16ui = vec_sl(vec_splat_u32(1),vec_splat_u32(4));

  register int align = ((((unsigned long)src) - 2) % 16);

  const vec_u8_t neg1 = (const vec_u8_t) vec_splat_s8(-1);

  vec_s16_t srcP0A, srcP0B, srcP1A, srcP1B,
            srcP2A, srcP2B, srcP3A, srcP3B,
            srcM1A, srcM1B, srcM2A, srcM2B,
            sum1A, sum1B, sum2A, sum2B, sum3A, sum3B,
            pp1A, pp1B, pp2A, pp2B, psumA, psumB;

  const vec_u8_t dstperm = vec_lvsr(0, dst);

  const vec_u8_t dstmask = vec_perm(zero_u8v, neg1, dstperm);

  const vec_u8_t mperm = (const vec_u8_t)
    AVV(0x00, 0x08, 0x01, 0x09, 0x02, 0x0A, 0x03, 0x0B,
        0x04, 0x0C, 0x05, 0x0D, 0x06, 0x0E, 0x07, 0x0F);
  int16_t *tmpbis = tmp;

  vec_s16_t tmpM1ssA, tmpM1ssB, tmpM2ssA, tmpM2ssB,
            tmpP0ssA, tmpP0ssB, tmpP1ssA, tmpP1ssB,
            tmpP2ssA, tmpP2ssB;

  vec_s32_t pp1Ae, pp1Ao, pp1Be, pp1Bo, pp2Ae, pp2Ao, pp2Be, pp2Bo,
            pp3Ae, pp3Ao, pp3Be, pp3Bo, pp1cAe, pp1cAo, pp1cBe, pp1cBo,
            pp32Ae, pp32Ao, pp32Be, pp32Bo, sumAe, sumAo, sumBe, sumBo,
            ssumAe, ssumAo, ssumBe, ssumBo;
  vec_u8_t fsum, sumv, sum, dst1, dst2, vdst, rsum, fdst1, fdst2;
  vec_s16_t ssume, ssumo;

  POWERPC_PERF_START_COUNT(PREFIX_h264_qpel16_hv_lowpass_num, 1);
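  /* first pass: horizontal filter over rows -2..18 of the block, intermediate
   * results kept at full 16-bit precision in tmp */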
  src -= (2 * srcStride);
  for (i = 0 ; i < 21 ; i ++) {
    vec_u8_t srcM2, srcM1, srcP0, srcP1, srcP2, srcP3;
    vec_u8_t srcR1 = vec_ld(-2, src);
    vec_u8_t srcR2 = vec_ld(14, src);

    switch (align) {
    default: {
      srcM2 = vec_perm(srcR1, srcR2, permM2);
      srcM1 = vec_perm(srcR1, srcR2, permM1);
      srcP0 = vec_perm(srcR1, srcR2, permP0);
      srcP1 = vec_perm(srcR1, srcR2, permP1);
      srcP2 = vec_perm(srcR1, srcR2, permP2);
      srcP3 = vec_perm(srcR1, srcR2, permP3);
    } break;
    case 11: {
      srcM2 = vec_perm(srcR1, srcR2, permM2);
      srcM1 = vec_perm(srcR1, srcR2, permM1);
      srcP0 = vec_perm(srcR1, srcR2, permP0);
      srcP1 = vec_perm(srcR1, srcR2, permP1);
      srcP2 = vec_perm(srcR1, srcR2, permP2);
      srcP3 = srcR2;
    } break;
    case 12: {
      vec_u8_t srcR3 = vec_ld(30, src);
      srcM2 = vec_perm(srcR1, srcR2, permM2);
      srcM1 = vec_perm(srcR1, srcR2, permM1);
      srcP0 = vec_perm(srcR1, srcR2, permP0);
      srcP1 = vec_perm(srcR1, srcR2, permP1);
      srcP2 = srcR2;
      srcP3 = vec_perm(srcR2, srcR3, permP3);
    } break;
    case 13: {
      vec_u8_t srcR3 = vec_ld(30, src);
      srcM2 = vec_perm(srcR1, srcR2, permM2);
      srcM1 = vec_perm(srcR1, srcR2, permM1);
      srcP0 = vec_perm(srcR1, srcR2, permP0);
      srcP1 = srcR2;
      srcP2 = vec_perm(srcR2, srcR3, permP2);
      srcP3 = vec_perm(srcR2, srcR3, permP3);
    } break;
    case 14: {
      vec_u8_t srcR3 = vec_ld(30, src);
      srcM2 = vec_perm(srcR1, srcR2, permM2);
      srcM1 = vec_perm(srcR1, srcR2, permM1);
      srcP0 = srcR2;
      srcP1 = vec_perm(srcR2, srcR3, permP1);
      srcP2 = vec_perm(srcR2, srcR3, permP2);
      srcP3 = vec_perm(srcR2, srcR3, permP3);
    } break;
    case 15: {
      vec_u8_t srcR3 = vec_ld(30, src);
      srcM2 = vec_perm(srcR1, srcR2, permM2);
      srcM1 = srcR2;
      srcP0 = vec_perm(srcR2, srcR3, permP0);
      srcP1 = vec_perm(srcR2, srcR3, permP1);
      srcP2 = vec_perm(srcR2, srcR3, permP2);
      srcP3 = vec_perm(srcR2, srcR3, permP3);
    } break;
    }

    srcP0A = (vec_s16_t) vec_mergeh(zero_u8v, srcP0);
    srcP0B = (vec_s16_t) vec_mergel(zero_u8v, srcP0);
    srcP1A = (vec_s16_t) vec_mergeh(zero_u8v, srcP1);
    srcP1B = (vec_s16_t) vec_mergel(zero_u8v, srcP1);

    srcP2A = (vec_s16_t) vec_mergeh(zero_u8v, srcP2);
    srcP2B = (vec_s16_t) vec_mergel(zero_u8v, srcP2);
    srcP3A = (vec_s16_t) vec_mergeh(zero_u8v, srcP3);
    srcP3B = (vec_s16_t) vec_mergel(zero_u8v, srcP3);

    srcM1A = (vec_s16_t) vec_mergeh(zero_u8v, srcM1);
    srcM1B = (vec_s16_t) vec_mergel(zero_u8v, srcM1);
    srcM2A = (vec_s16_t) vec_mergeh(zero_u8v, srcM2);
    srcM2B = (vec_s16_t) vec_mergel(zero_u8v, srcM2);

    sum1A = vec_adds(srcP0A, srcP1A);
    sum1B = vec_adds(srcP0B, srcP1B);
    sum2A = vec_adds(srcM1A, srcP2A);
    sum2B = vec_adds(srcM1B, srcP2B);
    sum3A = vec_adds(srcM2A, srcP3A);
    sum3B = vec_adds(srcM2B, srcP3B);

    pp1A = vec_mladd(sum1A, v20ss, sum3A);
    pp1B = vec_mladd(sum1B, v20ss, sum3B);

    pp2A = vec_mladd(sum2A, v5ss, zero_s16v);
    pp2B = vec_mladd(sum2B, v5ss, zero_s16v);

    psumA = vec_sub(pp1A, pp2A);
    psumB = vec_sub(pp1B, pp2B);

    vec_st(psumA, 0, tmp);
    vec_st(psumB, 16, tmp);

    src += srcStride;
    tmp += tmpStride; /* int16_t*, and stride is 16, so it's OK here */
  }

  tmpM2ssA = vec_ld(0, tmpbis);
  tmpM2ssB = vec_ld(16, tmpbis);
  tmpbis += tmpStride;
  tmpM1ssA = vec_ld(0, tmpbis);
  tmpM1ssB = vec_ld(16, tmpbis);
  tmpbis += tmpStride;
  tmpP0ssA = vec_ld(0, tmpbis);
  tmpP0ssB = vec_ld(16, tmpbis);
  tmpbis += tmpStride;
  tmpP1ssA = vec_ld(0, tmpbis);
  tmpP1ssB = vec_ld(16, tmpbis);
  tmpbis += tmpStride;
  tmpP2ssA = vec_ld(0, tmpbis);
  tmpP2ssB = vec_ld(16, tmpbis);
  tmpbis += tmpStride;

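  /* second pass: vertical filter on the 16-bit intermediates.  The products no
   * longer fit in 16 bits, so even and odd elements are split with
   * vec_mule/vec_mulo and accumulated in 32-bit lanes; for the 1x tap the even
   * elements are sign-extended by an arithmetic shift of the vector viewed as
   * 32-bit words (on big-endian PowerPC the even s16 sits in the high half).
   * Results are rounded (+512), shifted (>>10), packed with saturation and
   * re-interleaved with mperm. */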
  for (i = 0 ; i < 16 ; i++) {
    const vec_s16_t tmpP3ssA = vec_ld(0, tmpbis);
    const vec_s16_t tmpP3ssB = vec_ld(16, tmpbis);

    const vec_s16_t sum1A = vec_adds(tmpP0ssA, tmpP1ssA);
    const vec_s16_t sum1B = vec_adds(tmpP0ssB, tmpP1ssB);
    const vec_s16_t sum2A = vec_adds(tmpM1ssA, tmpP2ssA);
    const vec_s16_t sum2B = vec_adds(tmpM1ssB, tmpP2ssB);
    const vec_s16_t sum3A = vec_adds(tmpM2ssA, tmpP3ssA);
    const vec_s16_t sum3B = vec_adds(tmpM2ssB, tmpP3ssB);

    tmpbis += tmpStride;

    tmpM2ssA = tmpM1ssA;
    tmpM2ssB = tmpM1ssB;
    tmpM1ssA = tmpP0ssA;
    tmpM1ssB = tmpP0ssB;
    tmpP0ssA = tmpP1ssA;
    tmpP0ssB = tmpP1ssB;
    tmpP1ssA = tmpP2ssA;
    tmpP1ssB = tmpP2ssB;
    tmpP2ssA = tmpP3ssA;
    tmpP2ssB = tmpP3ssB;

    pp1Ae = vec_mule(sum1A, v20ss);
    pp1Ao = vec_mulo(sum1A, v20ss);
    pp1Be = vec_mule(sum1B, v20ss);
    pp1Bo = vec_mulo(sum1B, v20ss);

    pp2Ae = vec_mule(sum2A, v5ss);
    pp2Ao = vec_mulo(sum2A, v5ss);
    pp2Be = vec_mule(sum2B, v5ss);
    pp2Bo = vec_mulo(sum2B, v5ss);

    pp3Ae = vec_sra((vec_s32_t)sum3A, v16ui);
    pp3Ao = vec_mulo(sum3A, v1ss);
    pp3Be = vec_sra((vec_s32_t)sum3B, v16ui);
    pp3Bo = vec_mulo(sum3B, v1ss);

    pp1cAe = vec_add(pp1Ae, v512si);
    pp1cAo = vec_add(pp1Ao, v512si);
    pp1cBe = vec_add(pp1Be, v512si);
    pp1cBo = vec_add(pp1Bo, v512si);

    pp32Ae = vec_sub(pp3Ae, pp2Ae);
    pp32Ao = vec_sub(pp3Ao, pp2Ao);
    pp32Be = vec_sub(pp3Be, pp2Be);
    pp32Bo = vec_sub(pp3Bo, pp2Bo);

    sumAe = vec_add(pp1cAe, pp32Ae);
    sumAo = vec_add(pp1cAo, pp32Ao);
    sumBe = vec_add(pp1cBe, pp32Be);
    sumBo = vec_add(pp1cBo, pp32Bo);

    ssumAe = vec_sra(sumAe, v10ui);
    ssumAo = vec_sra(sumAo, v10ui);
    ssumBe = vec_sra(sumBe, v10ui);
    ssumBo = vec_sra(sumBo, v10ui);

    ssume = vec_packs(ssumAe, ssumBe);
    ssumo = vec_packs(ssumAo, ssumBo);

    sumv = vec_packsu(ssume, ssumo);
    sum = vec_perm(sumv, sumv, mperm);

    dst1 = vec_ld(0, dst);
    dst2 = vec_ld(16, dst);
    vdst = vec_perm(dst1, dst2, vec_lvsl(0, dst));

    OP_U8_ALTIVEC(fsum, sum, vdst);

    rsum = vec_perm(fsum, fsum, dstperm);
    fdst1 = vec_sel(dst1, rsum, dstmask);
    fdst2 = vec_sel(rsum, dst2, dstmask);

    vec_st(fdst1, 0, dst);
    vec_st(fdst2, 16, dst);

    dst += dstStride;
  }
  POWERPC_PERF_STOP_COUNT(PREFIX_h264_qpel16_hv_lowpass_num, 1);
}