Revision 3ca96802

View differences:
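
This revision converts the H.264 AltiVec code from the spelled-out vector types (vector unsigned char, vector signed short, ...) to the shorter vec_u8_t / vec_s16_t style typedefs, and replaces the per-function vzero constant with the shared LOAD_ZERO / zero_u8v / zero_s16v zero-vector macros. For context, here is a minimal sketch of the definitions these names presumably come from; FFmpeg keeps them in a shared PPC header, but the exact file and spellings are an assumption, not part of this diff:

    /* Sketch of the assumed shared AltiVec type helpers (types_altivec.h-style).
     * Not shown in this revision; listed here only so the +lines below read. */
    #define vec_u8_t  vector unsigned char
    #define vec_s8_t  vector signed char
    #define vec_u16_t vector unsigned short
    #define vec_s16_t vector signed short
    #define vec_u32_t vector unsigned int
    #define vec_s32_t vector signed int

    /* One local zero vector per function, reusable at any element width. */
    #define LOAD_ZERO const vec_u8_t zerov = vec_splat_u8(0)
    #define zero_u8v  (vec_u8_t)  zerov
    #define zero_s16v (vec_s16_t) zerov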

libavcodec/ppc/h264_altivec.c
                           ((8 - x) * (y)),
                           ((x) * (y))};
     register int i;
-    vector unsigned char fperm;
-    const vector signed int vABCD = vec_ld(0, ABCD);
-    const vector signed short vA = vec_splat((vector signed short)vABCD, 1);
-    const vector signed short vB = vec_splat((vector signed short)vABCD, 3);
-    const vector signed short vC = vec_splat((vector signed short)vABCD, 5);
-    const vector signed short vD = vec_splat((vector signed short)vABCD, 7);
-    const vector signed int vzero = vec_splat_s32(0);
-    const vector signed short v28ss = vec_sub(vec_sl(vec_splat_s16(1),vec_splat_u16(5)),vec_splat_s16(4));
-    const vector unsigned short v6us = vec_splat_u16(6);
+    vec_u8_t fperm;
+    const vec_s32_t vABCD = vec_ld(0, ABCD);
+    const vec_s16_t vA = vec_splat((vec_s16_t)vABCD, 1);
+    const vec_s16_t vB = vec_splat((vec_s16_t)vABCD, 3);
+    const vec_s16_t vC = vec_splat((vec_s16_t)vABCD, 5);
+    const vec_s16_t vD = vec_splat((vec_s16_t)vABCD, 7);
+    LOAD_ZERO;
+    const vec_s16_t v28ss = vec_sub(vec_sl(vec_splat_s16(1),vec_splat_u16(5)),vec_splat_s16(4));
+    const vec_u16_t v6us = vec_splat_u16(6);
     register int loadSecond = (((unsigned long)src) % 16) <= 7 ? 0 : 1;
     register int reallyBadAlign = (((unsigned long)src) % 16) == 15 ? 1 : 0;

-    vector unsigned char vsrcAuc, vsrcBuc, vsrcperm0, vsrcperm1;
-    vector unsigned char vsrc0uc, vsrc1uc;
-    vector signed short vsrc0ssH, vsrc1ssH;
-    vector unsigned char vsrcCuc, vsrc2uc, vsrc3uc;
-    vector signed short vsrc2ssH, vsrc3ssH, psum;
-    vector unsigned char vdst, ppsum, fsum;
+    vec_u8_t vsrcAuc, vsrcBuc, vsrcperm0, vsrcperm1;
+    vec_u8_t vsrc0uc, vsrc1uc;
+    vec_s16_t vsrc0ssH, vsrc1ssH;
+    vec_u8_t vsrcCuc, vsrc2uc, vsrc3uc;
+    vec_s16_t vsrc2ssH, vsrc3ssH, psum;
+    vec_u8_t vdst, ppsum, fsum;

     if (((unsigned long)dst) % 16 == 0) {
-      fperm = (vector unsigned char)AVV(0x10, 0x11, 0x12, 0x13,
-                                        0x14, 0x15, 0x16, 0x17,
-                                        0x08, 0x09, 0x0A, 0x0B,
-                                        0x0C, 0x0D, 0x0E, 0x0F);
+      fperm = (vec_u8_t)AVV(0x10, 0x11, 0x12, 0x13,
+                            0x14, 0x15, 0x16, 0x17,
+                            0x08, 0x09, 0x0A, 0x0B,
+                            0x0C, 0x0D, 0x0E, 0x0F);
     } else {
-      fperm = (vector unsigned char)AVV(0x00, 0x01, 0x02, 0x03,
-                                        0x04, 0x05, 0x06, 0x07,
-                                        0x18, 0x19, 0x1A, 0x1B,
-                                        0x1C, 0x1D, 0x1E, 0x1F);
+      fperm = (vec_u8_t)AVV(0x00, 0x01, 0x02, 0x03,
+                            0x04, 0x05, 0x06, 0x07,
+                            0x18, 0x19, 0x1A, 0x1B,
+                            0x1C, 0x1D, 0x1E, 0x1F);
     }

     vsrcAuc = vec_ld(0, src);
......
     else
       vsrc1uc = vec_perm(vsrcAuc, vsrcBuc, vsrcperm1);

-    vsrc0ssH = (vector signed short)vec_mergeh((vector unsigned char)vzero,
-                                               (vector unsigned char)vsrc0uc);
-    vsrc1ssH = (vector signed short)vec_mergeh((vector unsigned char)vzero,
-                                               (vector unsigned char)vsrc1uc);
+    vsrc0ssH = (vec_s16_t)vec_mergeh(zero_u8v, (vec_u8_t)vsrc0uc);
+    vsrc1ssH = (vec_s16_t)vec_mergeh(zero_u8v, (vec_u8_t)vsrc1uc);

     if (!loadSecond) {// -> !reallyBadAlign
       for (i = 0 ; i < h ; i++) {
......
         vsrc2uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm0);
         vsrc3uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm1);

-        vsrc2ssH = (vector signed short)vec_mergeh((vector unsigned char)vzero,
-                                                (vector unsigned char)vsrc2uc);
-        vsrc3ssH = (vector signed short)vec_mergeh((vector unsigned char)vzero,
-                                                (vector unsigned char)vsrc3uc);
+        vsrc2ssH = (vec_s16_t)vec_mergeh(zero_u8v, (vec_u8_t)vsrc2uc);
+        vsrc3ssH = (vec_s16_t)vec_mergeh(zero_u8v, (vec_u8_t)vsrc3uc);

         psum = vec_mladd(vA, vsrc0ssH, vec_splat_s16(0));
         psum = vec_mladd(vB, vsrc1ssH, psum);
......
         psum = vec_sra(psum, v6us);

         vdst = vec_ld(0, dst);
-        ppsum = (vector unsigned char)vec_packsu(psum, psum);
+        ppsum = (vec_u8_t)vec_packsu(psum, psum);
         fsum = vec_perm(vdst, ppsum, fperm);

         vec_st(fsum, 0, dst);
......
         src += stride;
       }
     } else {
-        vector unsigned char vsrcDuc;
+        vec_u8_t vsrcDuc;
       for (i = 0 ; i < h ; i++) {
         vsrcCuc = vec_ld(stride + 0, src);
         vsrcDuc = vec_ld(stride + 16, src);
......
         else
           vsrc3uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm1);

-        vsrc2ssH = (vector signed short)vec_mergeh((vector unsigned char)vzero,
-                                                (vector unsigned char)vsrc2uc);
-        vsrc3ssH = (vector signed short)vec_mergeh((vector unsigned char)vzero,
-                                                (vector unsigned char)vsrc3uc);
+        vsrc2ssH = (vec_s16_t)vec_mergeh(zero_u8v, (vec_u8_t)vsrc2uc);
+        vsrc3ssH = (vec_s16_t)vec_mergeh(zero_u8v, (vec_u8_t)vsrc3uc);

         psum = vec_mladd(vA, vsrc0ssH, vec_splat_s16(0));
         psum = vec_mladd(vB, vsrc1ssH, psum);
......
         psum = vec_sr(psum, v6us);

         vdst = vec_ld(0, dst);
-        ppsum = (vector unsigned char)vec_pack(psum, psum);
+        ppsum = (vec_u8_t)vec_pack(psum, psum);
         fsum = vec_perm(vdst, ppsum, fperm);

         vec_st(fsum, 0, dst);
......
                                     int src_stride1, int h)
 {
     int i;
-    vector unsigned char a, b, d, tmp1, tmp2, mask, mask_, edges, align;
+    vec_u8_t a, b, d, tmp1, tmp2, mask, mask_, edges, align;

     mask_ = vec_lvsl(0, src2);

......
                                     int src_stride1, int h)
 {
     int i;
-    vector unsigned char a, b, d, tmp1, tmp2, mask, mask_, edges, align;
+    vec_u8_t a, b, d, tmp1, tmp2, mask, mask_, edges, align;

     mask_ = vec_lvsl(0, src2);

......
     const vec_u16_t twov = vec_splat_u16(2);
     const vec_u16_t sixv = vec_splat_u16(6);

-    const vec_u8_t sel = (vec_u8_t) AVV(0,0,0,0,0,0,0,0,
-                                        -1,-1,-1,-1,-1,-1,-1,-1);
+    const vec_u8_t sel = (vec_u8_t) AVV(0,0,0,0,0,0,0,0,-1,-1,-1,-1,-1,-1,-1,-1);
     LOAD_ZERO;

     dct[0] += 32; // rounding for the >>6 at the end
......
 }

 #define transpose4x16(r0, r1, r2, r3) {      \
-    register vector unsigned char r4;        \
-    register vector unsigned char r5;        \
-    register vector unsigned char r6;        \
-    register vector unsigned char r7;        \
+    register vec_u8_t r4;                    \
+    register vec_u8_t r5;                    \
+    register vec_u8_t r6;                    \
+    register vec_u8_t r7;                    \
                                              \
     r4 = vec_mergeh(r0, r2);  /*0, 2 set 0*/ \
     r5 = vec_mergel(r0, r2);  /*0, 2 set 1*/ \
......
 }

 static inline void write16x4(uint8_t *dst, int dst_stride,
-                             register vector unsigned char r0, register vector unsigned char r1,
-                             register vector unsigned char r2, register vector unsigned char r3) {
+                             register vec_u8_t r0, register vec_u8_t r1,
+                             register vec_u8_t r2, register vec_u8_t r3) {
     DECLARE_ALIGNED_16(unsigned char, result[64]);
     uint32_t *src_int = (uint32_t *)result, *dst_int = (uint32_t *)dst;
     int int_dst_stride = dst_stride/4;
......
     \todo FIXME: see if we can't spare some vec_lvsl() by them factorizing
     out of unaligned_load() */
 #define readAndTranspose16x6(src, src_stride, r8, r9, r10, r11, r12, r13) {\
-    register vector unsigned char r0  = unaligned_load(0,             src);\
-    register vector unsigned char r1  = unaligned_load(   src_stride, src);\
-    register vector unsigned char r2  = unaligned_load(2* src_stride, src);\
-    register vector unsigned char r3  = unaligned_load(3* src_stride, src);\
-    register vector unsigned char r4  = unaligned_load(4* src_stride, src);\
-    register vector unsigned char r5  = unaligned_load(5* src_stride, src);\
-    register vector unsigned char r6  = unaligned_load(6* src_stride, src);\
-    register vector unsigned char r7  = unaligned_load(7* src_stride, src);\
-    register vector unsigned char r14 = unaligned_load(14*src_stride, src);\
-    register vector unsigned char r15 = unaligned_load(15*src_stride, src);\
+    register vec_u8_t r0  = unaligned_load(0,             src);            \
+    register vec_u8_t r1  = unaligned_load(   src_stride, src);            \
+    register vec_u8_t r2  = unaligned_load(2* src_stride, src);            \
+    register vec_u8_t r3  = unaligned_load(3* src_stride, src);            \
+    register vec_u8_t r4  = unaligned_load(4* src_stride, src);            \
+    register vec_u8_t r5  = unaligned_load(5* src_stride, src);            \
+    register vec_u8_t r6  = unaligned_load(6* src_stride, src);            \
+    register vec_u8_t r7  = unaligned_load(7* src_stride, src);            \
+    register vec_u8_t r14 = unaligned_load(14*src_stride, src);            \
+    register vec_u8_t r15 = unaligned_load(15*src_stride, src);            \
                                                                            \
     r8  = unaligned_load( 8*src_stride, src);                              \
     r9  = unaligned_load( 9*src_stride, src);                              \
......
 }

 // out: o = |x-y| < a
-static inline vector unsigned char diff_lt_altivec ( register vector unsigned char x,
-                                                     register vector unsigned char y,
-                                                     register vector unsigned char a) {
-
-    register vector unsigned char diff = vec_subs(x, y);
-    register vector unsigned char diffneg = vec_subs(y, x);
-    register vector unsigned char o = vec_or(diff, diffneg); /* |x-y| */
-    o = (vector unsigned char)vec_cmplt(o, a);
+static inline vec_u8_t diff_lt_altivec ( register vec_u8_t x,
+                                         register vec_u8_t y,
+                                         register vec_u8_t a) {
+
+    register vec_u8_t diff = vec_subs(x, y);
+    register vec_u8_t diffneg = vec_subs(y, x);
+    register vec_u8_t o = vec_or(diff, diffneg); /* |x-y| */
+    o = (vec_u8_t)vec_cmplt(o, a);
     return o;
 }

-static inline vector unsigned char h264_deblock_mask ( register vector unsigned char p0,
-                                                       register vector unsigned char p1,
-                                                       register vector unsigned char q0,
-                                                       register vector unsigned char q1,
-                                                       register vector unsigned char alpha,
-                                                       register vector unsigned char beta) {
+static inline vec_u8_t h264_deblock_mask ( register vec_u8_t p0,
+                                           register vec_u8_t p1,
+                                           register vec_u8_t q0,
+                                           register vec_u8_t q1,
+                                           register vec_u8_t alpha,
+                                           register vec_u8_t beta) {

-    register vector unsigned char mask;
-    register vector unsigned char tempmask;
+    register vec_u8_t mask;
+    register vec_u8_t tempmask;

     mask = diff_lt_altivec(p0, q0, alpha);
     tempmask = diff_lt_altivec(p1, p0, beta);
......
 }

 // out: newp1 = clip((p2 + ((p0 + q0 + 1) >> 1)) >> 1, p1-tc0, p1+tc0)
-static inline vector unsigned char h264_deblock_q1(register vector unsigned char p0,
-                                                   register vector unsigned char p1,
-                                                   register vector unsigned char p2,
-                                                   register vector unsigned char q0,
-                                                   register vector unsigned char tc0) {
-
-    register vector unsigned char average = vec_avg(p0, q0);
-    register vector unsigned char temp;
-    register vector unsigned char uncliped;
-    register vector unsigned char ones;
-    register vector unsigned char max;
-    register vector unsigned char min;
-    register vector unsigned char newp1;
+static inline vec_u8_t h264_deblock_q1(register vec_u8_t p0,
+                                       register vec_u8_t p1,
+                                       register vec_u8_t p2,
+                                       register vec_u8_t q0,
+                                       register vec_u8_t tc0) {
+
+    register vec_u8_t average = vec_avg(p0, q0);
+    register vec_u8_t temp;
+    register vec_u8_t uncliped;
+    register vec_u8_t ones;
+    register vec_u8_t max;
+    register vec_u8_t min;
+    register vec_u8_t newp1;

     temp = vec_xor(average, p2);
     average = vec_avg(average, p2);     /*avg(p2, avg(p0, q0)) */
......

 #define h264_deblock_p0_q0(p0, p1, q0, q1, tc0masked) {                                           \
                                                                                                   \
-    const vector unsigned char A0v = vec_sl(vec_splat_u8(10), vec_splat_u8(4));                   \
+    const vec_u8_t A0v = vec_sl(vec_splat_u8(10), vec_splat_u8(4));                               \
                                                                                                   \
-    register vector unsigned char pq0bit = vec_xor(p0,q0);                                        \
-    register vector unsigned char q1minus;                                                        \
-    register vector unsigned char p0minus;                                                        \
-    register vector unsigned char stage1;                                                         \
-    register vector unsigned char stage2;                                                         \
-    register vector unsigned char vec160;                                                         \
-    register vector unsigned char delta;                                                          \
-    register vector unsigned char deltaneg;                                                       \
+    register vec_u8_t pq0bit = vec_xor(p0,q0);                                                    \
+    register vec_u8_t q1minus;                                                                    \
+    register vec_u8_t p0minus;                                                                    \
+    register vec_u8_t stage1;                                                                     \
+    register vec_u8_t stage2;                                                                     \
+    register vec_u8_t vec160;                                                                     \
+    register vec_u8_t delta;                                                                      \
+    register vec_u8_t deltaneg;                                                                   \
                                                                                                   \
     q1minus = vec_nor(q1, q1);                 /* 255 - q1 */                                     \
     stage1 = vec_avg(p1, q1minus);             /* (p1 - q1 + 256)>>1 */                           \
......

 #define h264_loop_filter_luma_altivec(p2, p1, p0, q0, q1, q2, alpha, beta, tc0) {            \
     DECLARE_ALIGNED_16(unsigned char, temp[16]);                                             \
-    register vector unsigned char alphavec;                                                  \
-    register vector unsigned char betavec;                                                   \
-    register vector unsigned char mask;                                                      \
-    register vector unsigned char p1mask;                                                    \
-    register vector unsigned char q1mask;                                                    \
+    register vec_u8_t alphavec;                                                              \
+    register vec_u8_t betavec;                                                               \
+    register vec_u8_t mask;                                                                  \
+    register vec_u8_t p1mask;                                                                \
+    register vec_u8_t q1mask;                                                                \
     register vector signed   char tc0vec;                                                    \
-    register vector unsigned char finaltc0;                                                  \
-    register vector unsigned char tc0masked;                                                 \
-    register vector unsigned char newp1;                                                     \
-    register vector unsigned char newq1;                                                     \
+    register vec_u8_t finaltc0;                                                              \
+    register vec_u8_t tc0masked;                                                             \
+    register vec_u8_t newp1;                                                                 \
+    register vec_u8_t newq1;                                                                 \
                                                                                              \
     temp[0] = alpha;                                                                         \
     temp[1] = beta;                                                                          \
......
     tc0vec = vec_mergeh(tc0vec, tc0vec);                                                     \
     tc0vec = vec_mergeh(tc0vec, tc0vec);                                                     \
     mask = vec_and(mask, vec_cmpgt(tc0vec, vec_splat_s8(-1)));  /* if tc0[i] >= 0 */         \
-    finaltc0 = vec_and((vector unsigned char)tc0vec, mask);     /* tc = tc0 */               \
+    finaltc0 = vec_and((vec_u8_t)tc0vec, mask);     /* tc = tc0 */                           \
                                                                                              \
     p1mask = diff_lt_altivec(p2, p0, betavec);                                               \
     p1mask = vec_and(p1mask, mask);                             /* if( |p2 - p0| < beta) */  \
-    tc0masked = vec_and(p1mask, (vector unsigned char)tc0vec);                               \
+    tc0masked = vec_and(p1mask, (vec_u8_t)tc0vec);                                           \
     finaltc0 = vec_sub(finaltc0, p1mask);                       /* tc++ */                   \
     newp1 = h264_deblock_q1(p0, p1, p2, q0, tc0masked);                                      \
     /*end if*/                                                                               \
                                                                                              \
     q1mask = diff_lt_altivec(q2, q0, betavec);                                               \
     q1mask = vec_and(q1mask, mask);                             /* if ( |q2 - q0| < beta ) */\
-    tc0masked = vec_and(q1mask, (vector unsigned char)tc0vec);                               \
+    tc0masked = vec_and(q1mask, (vec_u8_t)tc0vec);                                           \
     finaltc0 = vec_sub(finaltc0, q1mask);                       /* tc++ */                   \
     newq1 = h264_deblock_q1(p0, q1, q2, q0, tc0masked);                                      \
     /*end if*/                                                                               \
......
 static void h264_v_loop_filter_luma_altivec(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0) {

     if((tc0[0] & tc0[1] & tc0[2] & tc0[3]) >= 0) {
-        register vector unsigned char p2 = vec_ld(-3*stride, pix);
-        register vector unsigned char p1 = vec_ld(-2*stride, pix);
-        register vector unsigned char p0 = vec_ld(-1*stride, pix);
-        register vector unsigned char q0 = vec_ld(0, pix);
-        register vector unsigned char q1 = vec_ld(stride, pix);
-        register vector unsigned char q2 = vec_ld(2*stride, pix);
+        register vec_u8_t p2 = vec_ld(-3*stride, pix);
+        register vec_u8_t p1 = vec_ld(-2*stride, pix);
+        register vec_u8_t p0 = vec_ld(-1*stride, pix);
+        register vec_u8_t q0 = vec_ld(0, pix);
+        register vec_u8_t q1 = vec_ld(stride, pix);
+        register vec_u8_t q2 = vec_ld(2*stride, pix);
         h264_loop_filter_luma_altivec(p2, p1, p0, q0, q1, q2, alpha, beta, tc0);
         vec_st(p1, -2*stride, pix);
         vec_st(p0, -1*stride, pix);
......

 static void h264_h_loop_filter_luma_altivec(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0) {

-    register vector unsigned char line0, line1, line2, line3, line4, line5;
+    register vec_u8_t line0, line1, line2, line3, line4, line5;
     if((tc0[0] & tc0[1] & tc0[2] & tc0[3]) < 0)
         return;
     readAndTranspose16x6(pix-3, stride, line0, line1, line2, line3, line4, line5);
libavcodec/ppc/h264_template_altivec.c
                           ((8 - x) * (y)),
                           ((x) * (y))};
     register int i;
-    vector unsigned char fperm;
-    const vector signed int vABCD = vec_ld(0, ABCD);
-    const vector signed short vA = vec_splat((vector signed short)vABCD, 1);
-    const vector signed short vB = vec_splat((vector signed short)vABCD, 3);
-    const vector signed short vC = vec_splat((vector signed short)vABCD, 5);
-    const vector signed short vD = vec_splat((vector signed short)vABCD, 7);
-    const vector signed int vzero = vec_splat_s32(0);
-    const vector signed short v32ss = vec_sl(vec_splat_s16(1),vec_splat_u16(5));
-    const vector unsigned short v6us = vec_splat_u16(6);
+    vec_u8_t fperm;
+    const vec_s32_t vABCD = vec_ld(0, ABCD);
+    const vec_s16_t vA = vec_splat((vec_s16_t)vABCD, 1);
+    const vec_s16_t vB = vec_splat((vec_s16_t)vABCD, 3);
+    const vec_s16_t vC = vec_splat((vec_s16_t)vABCD, 5);
+    const vec_s16_t vD = vec_splat((vec_s16_t)vABCD, 7);
+    LOAD_ZERO;
+    const vec_s16_t v32ss = vec_sl(vec_splat_s16(1),vec_splat_u16(5));
+    const vec_u16_t v6us = vec_splat_u16(6);
     register int loadSecond = (((unsigned long)src) % 16) <= 7 ? 0 : 1;
     register int reallyBadAlign = (((unsigned long)src) % 16) == 15 ? 1 : 0;

-    vector unsigned char vsrcAuc, vsrcBuc, vsrcperm0, vsrcperm1;
-    vector unsigned char vsrc0uc, vsrc1uc;
-    vector signed short vsrc0ssH, vsrc1ssH;
-    vector unsigned char vsrcCuc, vsrc2uc, vsrc3uc;
-    vector signed short vsrc2ssH, vsrc3ssH, psum;
-    vector unsigned char vdst, ppsum, vfdst, fsum;
+    vec_u8_t vsrcAuc, vsrcBuc, vsrcperm0, vsrcperm1;
+    vec_u8_t vsrc0uc, vsrc1uc;
+    vec_s16_t vsrc0ssH, vsrc1ssH;
+    vec_u8_t vsrcCuc, vsrc2uc, vsrc3uc;
+    vec_s16_t vsrc2ssH, vsrc3ssH, psum;
+    vec_u8_t vdst, ppsum, vfdst, fsum;

   POWERPC_PERF_START_COUNT(PREFIX_h264_chroma_mc8_num, 1);

     if (((unsigned long)dst) % 16 == 0) {
-      fperm = (vector unsigned char)AVV(0x10, 0x11, 0x12, 0x13,
-                                        0x14, 0x15, 0x16, 0x17,
-                                        0x08, 0x09, 0x0A, 0x0B,
-                                        0x0C, 0x0D, 0x0E, 0x0F);
+      fperm = (vec_u8_t)AVV(0x10, 0x11, 0x12, 0x13,
+                            0x14, 0x15, 0x16, 0x17,
+                            0x08, 0x09, 0x0A, 0x0B,
+                            0x0C, 0x0D, 0x0E, 0x0F);
     } else {
-      fperm = (vector unsigned char)AVV(0x00, 0x01, 0x02, 0x03,
-                                        0x04, 0x05, 0x06, 0x07,
-                                        0x18, 0x19, 0x1A, 0x1B,
-                                        0x1C, 0x1D, 0x1E, 0x1F);
+      fperm = (vec_u8_t)AVV(0x00, 0x01, 0x02, 0x03,
+                            0x04, 0x05, 0x06, 0x07,
+                            0x18, 0x19, 0x1A, 0x1B,
+                            0x1C, 0x1D, 0x1E, 0x1F);
     }

     vsrcAuc = vec_ld(0, src);
......
     else
       vsrc1uc = vec_perm(vsrcAuc, vsrcBuc, vsrcperm1);

-    vsrc0ssH = (vector signed short)vec_mergeh((vector unsigned char)vzero,
-                                               (vector unsigned char)vsrc0uc);
-    vsrc1ssH = (vector signed short)vec_mergeh((vector unsigned char)vzero,
-                                               (vector unsigned char)vsrc1uc);
+    vsrc0ssH = (vec_s16_t)vec_mergeh(zero_u8v,(vec_u8_t)vsrc0uc);
+    vsrc1ssH = (vec_s16_t)vec_mergeh(zero_u8v,(vec_u8_t)vsrc1uc);

     if (!loadSecond) {// -> !reallyBadAlign
       for (i = 0 ; i < h ; i++) {
......
         vsrc2uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm0);
         vsrc3uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm1);

-        vsrc2ssH = (vector signed short)vec_mergeh((vector unsigned char)vzero,
-                                                (vector unsigned char)vsrc2uc);
-        vsrc3ssH = (vector signed short)vec_mergeh((vector unsigned char)vzero,
-                                                (vector unsigned char)vsrc3uc);
+        vsrc2ssH = (vec_s16_t)vec_mergeh(zero_u8v,(vec_u8_t)vsrc2uc);
+        vsrc3ssH = (vec_s16_t)vec_mergeh(zero_u8v,(vec_u8_t)vsrc3uc);

         psum = vec_mladd(vA, vsrc0ssH, vec_splat_s16(0));
         psum = vec_mladd(vB, vsrc1ssH, psum);
......
         psum = vec_sra(psum, v6us);

         vdst = vec_ld(0, dst);
-        ppsum = (vector unsigned char)vec_packsu(psum, psum);
+        ppsum = (vec_u8_t)vec_packsu(psum, psum);
         vfdst = vec_perm(vdst, ppsum, fperm);

         OP_U8_ALTIVEC(fsum, vfdst, vdst);
......
         src += stride;
       }
     } else {
-        vector unsigned char vsrcDuc;
+        vec_u8_t vsrcDuc;
       for (i = 0 ; i < h ; i++) {
         vsrcCuc = vec_ld(stride + 0, src);
         vsrcDuc = vec_ld(stride + 16, src);
......
         else
           vsrc3uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm1);

-        vsrc2ssH = (vector signed short)vec_mergeh((vector unsigned char)vzero,
-                                                (vector unsigned char)vsrc2uc);
-        vsrc3ssH = (vector signed short)vec_mergeh((vector unsigned char)vzero,
-                                                (vector unsigned char)vsrc3uc);
+        vsrc2ssH = (vec_s16_t)vec_mergeh(zero_u8v,(vec_u8_t)vsrc2uc);
+        vsrc3ssH = (vec_s16_t)vec_mergeh(zero_u8v,(vec_u8_t)vsrc3uc);

         psum = vec_mladd(vA, vsrc0ssH, vec_splat_s16(0));
         psum = vec_mladd(vB, vsrc1ssH, psum);
......
         psum = vec_sr(psum, v6us);

         vdst = vec_ld(0, dst);
-        ppsum = (vector unsigned char)vec_pack(psum, psum);
+        ppsum = (vec_u8_t)vec_pack(psum, psum);
         vfdst = vec_perm(vdst, ppsum, fperm);

         OP_U8_ALTIVEC(fsum, vfdst, vdst);
......
   POWERPC_PERF_DECLARE(PREFIX_h264_qpel16_h_lowpass_num, 1);
   register int i;

-  const vector signed int vzero = vec_splat_s32(0);
-  const vector unsigned char permM2 = vec_lvsl(-2, src);
-  const vector unsigned char permM1 = vec_lvsl(-1, src);
-  const vector unsigned char permP0 = vec_lvsl(+0, src);
-  const vector unsigned char permP1 = vec_lvsl(+1, src);
-  const vector unsigned char permP2 = vec_lvsl(+2, src);
-  const vector unsigned char permP3 = vec_lvsl(+3, src);
-  const vector signed short v5ss = vec_splat_s16(5);
-  const vector unsigned short v5us = vec_splat_u16(5);
-  const vector signed short v20ss = vec_sl(vec_splat_s16(5),vec_splat_u16(2));
-  const vector signed short v16ss = vec_sl(vec_splat_s16(1),vec_splat_u16(4));
-  const vector unsigned char dstperm = vec_lvsr(0, dst);
-  const vector unsigned char neg1 =
-                                (const vector unsigned char) vec_splat_s8(-1);
-
-  const vector unsigned char dstmask =
-                                vec_perm((const vector unsigned char)vzero,
-                                                               neg1, dstperm);
-
-  vector unsigned char srcM2, srcM1, srcP0, srcP1, srcP2, srcP3;
+  LOAD_ZERO;
+  const vec_u8_t permM2 = vec_lvsl(-2, src);
+  const vec_u8_t permM1 = vec_lvsl(-1, src);
+  const vec_u8_t permP0 = vec_lvsl(+0, src);
+  const vec_u8_t permP1 = vec_lvsl(+1, src);
+  const vec_u8_t permP2 = vec_lvsl(+2, src);
+  const vec_u8_t permP3 = vec_lvsl(+3, src);
+  const vec_s16_t v5ss = vec_splat_s16(5);
+  const vec_u16_t v5us = vec_splat_u16(5);
+  const vec_s16_t v20ss = vec_sl(vec_splat_s16(5),vec_splat_u16(2));
+  const vec_s16_t v16ss = vec_sl(vec_splat_s16(1),vec_splat_u16(4));
+  const vec_u8_t dstperm = vec_lvsr(0, dst);
+  const vec_u8_t neg1 = (const vec_u8_t) vec_splat_s8(-1);
+  const vec_u8_t dstmask = vec_perm(zero_u8v, neg1, dstperm);
+
+  vec_u8_t srcM2, srcM1, srcP0, srcP1, srcP2, srcP3;

   register int align = ((((unsigned long)src) - 2) % 16);

-  vector signed short srcP0A, srcP0B, srcP1A, srcP1B,
+  vec_s16_t srcP0A, srcP0B, srcP1A, srcP1B,
                       srcP2A, srcP2B, srcP3A, srcP3B,
                       srcM1A, srcM1B, srcM2A, srcM2B,
                       sum1A, sum1B, sum2A, sum2B, sum3A, sum3B,
                       pp1A, pp1B, pp2A, pp2B, pp3A, pp3B,
                       psumA, psumB, sumA, sumB;

-  vector unsigned char sum, dst1, dst2, vdst, fsum,
-                       rsum, fdst1, fdst2;
+  vec_u8_t sum, dst1, dst2, vdst, fsum, rsum, fdst1, fdst2;

   POWERPC_PERF_START_COUNT(PREFIX_h264_qpel16_h_lowpass_num, 1);

   for (i = 0 ; i < 16 ; i ++) {
-    vector unsigned char srcR1 = vec_ld(-2, src);
-    vector unsigned char srcR2 = vec_ld(14, src);
+    vec_u8_t srcR1 = vec_ld(-2, src);
+    vec_u8_t srcR2 = vec_ld(14, src);

     switch (align) {
     default: {
......
       srcP3 = srcR2;
     } break;
     case 12: {
-      vector unsigned char srcR3 = vec_ld(30, src);
+      vec_u8_t srcR3 = vec_ld(30, src);
       srcM2 = vec_perm(srcR1, srcR2, permM2);
       srcM1 = vec_perm(srcR1, srcR2, permM1);
       srcP0 = vec_perm(srcR1, srcR2, permP0);
......
       srcP3 = vec_perm(srcR2, srcR3, permP3);
     } break;
     case 13: {
-      vector unsigned char srcR3 = vec_ld(30, src);
+      vec_u8_t srcR3 = vec_ld(30, src);
       srcM2 = vec_perm(srcR1, srcR2, permM2);
       srcM1 = vec_perm(srcR1, srcR2, permM1);
       srcP0 = vec_perm(srcR1, srcR2, permP0);
......
       srcP3 = vec_perm(srcR2, srcR3, permP3);
     } break;
     case 14: {
-      vector unsigned char srcR3 = vec_ld(30, src);
+      vec_u8_t srcR3 = vec_ld(30, src);
       srcM2 = vec_perm(srcR1, srcR2, permM2);
       srcM1 = vec_perm(srcR1, srcR2, permM1);
       srcP0 = srcR2;
......
       srcP3 = vec_perm(srcR2, srcR3, permP3);
     } break;
     case 15: {
-      vector unsigned char srcR3 = vec_ld(30, src);
+      vec_u8_t srcR3 = vec_ld(30, src);
       srcM2 = vec_perm(srcR1, srcR2, permM2);
       srcM1 = srcR2;
       srcP0 = vec_perm(srcR2, srcR3, permP0);
......
     } break;
     }

-    srcP0A = (vector signed short)
-                vec_mergeh((vector unsigned char)vzero, srcP0);
-    srcP0B = (vector signed short)
-                vec_mergel((vector unsigned char)vzero, srcP0);
-    srcP1A = (vector signed short)
-                vec_mergeh((vector unsigned char)vzero, srcP1);
-    srcP1B = (vector signed short)
-                vec_mergel((vector unsigned char)vzero, srcP1);
-
-    srcP2A = (vector signed short)
-                vec_mergeh((vector unsigned char)vzero, srcP2);
-    srcP2B = (vector signed short)
-                vec_mergel((vector unsigned char)vzero, srcP2);
-    srcP3A = (vector signed short)
-                vec_mergeh((vector unsigned char)vzero, srcP3);
-    srcP3B = (vector signed short)
-                vec_mergel((vector unsigned char)vzero, srcP3);
-
-    srcM1A = (vector signed short)
-                vec_mergeh((vector unsigned char)vzero, srcM1);
-    srcM1B = (vector signed short)
-                vec_mergel((vector unsigned char)vzero, srcM1);
-    srcM2A = (vector signed short)
-                vec_mergeh((vector unsigned char)vzero, srcM2);
-    srcM2B = (vector signed short)
-                vec_mergel((vector unsigned char)vzero, srcM2);
+    srcP0A = (vec_s16_t) vec_mergeh(zero_u8v, srcP0);
+    srcP0B = (vec_s16_t) vec_mergel(zero_u8v, srcP0);
+    srcP1A = (vec_s16_t) vec_mergeh(zero_u8v, srcP1);
+    srcP1B = (vec_s16_t) vec_mergel(zero_u8v, srcP1);
+
+    srcP2A = (vec_s16_t) vec_mergeh(zero_u8v, srcP2);
+    srcP2B = (vec_s16_t) vec_mergel(zero_u8v, srcP2);
+    srcP3A = (vec_s16_t) vec_mergeh(zero_u8v, srcP3);
+    srcP3B = (vec_s16_t) vec_mergel(zero_u8v, srcP3);
+
+    srcM1A = (vec_s16_t) vec_mergeh(zero_u8v, srcM1);
+    srcM1B = (vec_s16_t) vec_mergel(zero_u8v, srcM1);
+    srcM2A = (vec_s16_t) vec_mergeh(zero_u8v, srcM2);
+    srcM2B = (vec_s16_t) vec_mergel(zero_u8v, srcM2);

     sum1A = vec_adds(srcP0A, srcP1A);
     sum1B = vec_adds(srcP0B, srcP1B);
......
     pp1A = vec_mladd(sum1A, v20ss, v16ss);
     pp1B = vec_mladd(sum1B, v20ss, v16ss);

-    pp2A = vec_mladd(sum2A, v5ss, (vector signed short)vzero);
-    pp2B = vec_mladd(sum2B, v5ss, (vector signed short)vzero);
+    pp2A = vec_mladd(sum2A, v5ss, zero_s16v);
+    pp2B = vec_mladd(sum2B, v5ss, zero_s16v);

     pp3A = vec_add(sum3A, pp1A);
     pp3B = vec_add(sum3B, pp1B);
......

   register int i;

-  const vector signed int vzero = vec_splat_s32(0);
-  const vector unsigned char perm = vec_lvsl(0, src);
-  const vector signed short v20ss = vec_sl(vec_splat_s16(5),vec_splat_u16(2));
-  const vector unsigned short v5us = vec_splat_u16(5);
-  const vector signed short v5ss = vec_splat_s16(5);
-  const vector signed short v16ss = vec_sl(vec_splat_s16(1),vec_splat_u16(4));
-  const vector unsigned char dstperm = vec_lvsr(0, dst);
-  const vector unsigned char neg1 = (const vector unsigned char)vec_splat_s8(-1);
-  const vector unsigned char dstmask = vec_perm((const vector unsigned char)vzero, neg1, dstperm);
+  LOAD_ZERO;
+  const vec_u8_t perm = vec_lvsl(0, src);
+  const vec_s16_t v20ss = vec_sl(vec_splat_s16(5),vec_splat_u16(2));
+  const vec_u16_t v5us = vec_splat_u16(5);
+  const vec_s16_t v5ss = vec_splat_s16(5);
+  const vec_s16_t v16ss = vec_sl(vec_splat_s16(1),vec_splat_u16(4));
+  const vec_u8_t dstperm = vec_lvsr(0, dst);
+  const vec_u8_t neg1 = (const vec_u8_t)vec_splat_s8(-1);
+  const vec_u8_t dstmask = vec_perm(zero_u8v, neg1, dstperm);

   uint8_t *srcbis = src - (srcStride * 2);

-  const vector unsigned char srcM2a = vec_ld(0, srcbis);
-  const vector unsigned char srcM2b = vec_ld(16, srcbis);
-  const vector unsigned char srcM2 = vec_perm(srcM2a, srcM2b, perm);
+  const vec_u8_t srcM2a = vec_ld(0, srcbis);
+  const vec_u8_t srcM2b = vec_ld(16, srcbis);
+  const vec_u8_t srcM2 = vec_perm(srcM2a, srcM2b, perm);
 //  srcbis += srcStride;
-  const vector unsigned char srcM1a = vec_ld(0, srcbis += srcStride);
-  const vector unsigned char srcM1b = vec_ld(16, srcbis);
-  const vector unsigned char srcM1 = vec_perm(srcM1a, srcM1b, perm);
+  const vec_u8_t srcM1a = vec_ld(0, srcbis += srcStride);
+  const vec_u8_t srcM1b = vec_ld(16, srcbis);
+  const vec_u8_t srcM1 = vec_perm(srcM1a, srcM1b, perm);
 //  srcbis += srcStride;
-  const vector unsigned char srcP0a = vec_ld(0, srcbis += srcStride);
-  const vector unsigned char srcP0b = vec_ld(16, srcbis);
-  const vector unsigned char srcP0 = vec_perm(srcP0a, srcP0b, perm);
+  const vec_u8_t srcP0a = vec_ld(0, srcbis += srcStride);
+  const vec_u8_t srcP0b = vec_ld(16, srcbis);
+  const vec_u8_t srcP0 = vec_perm(srcP0a, srcP0b, perm);
 //  srcbis += srcStride;
-  const vector unsigned char srcP1a = vec_ld(0, srcbis += srcStride);
-  const vector unsigned char srcP1b = vec_ld(16, srcbis);
-  const vector unsigned char srcP1 = vec_perm(srcP1a, srcP1b, perm);
+  const vec_u8_t srcP1a = vec_ld(0, srcbis += srcStride);
+  const vec_u8_t srcP1b = vec_ld(16, srcbis);
+  const vec_u8_t srcP1 = vec_perm(srcP1a, srcP1b, perm);
 //  srcbis += srcStride;
-  const vector unsigned char srcP2a = vec_ld(0, srcbis += srcStride);
-  const vector unsigned char srcP2b = vec_ld(16, srcbis);
-  const vector unsigned char srcP2 = vec_perm(srcP2a, srcP2b, perm);
+  const vec_u8_t srcP2a = vec_ld(0, srcbis += srcStride);
+  const vec_u8_t srcP2b = vec_ld(16, srcbis);
+  const vec_u8_t srcP2 = vec_perm(srcP2a, srcP2b, perm);
 //  srcbis += srcStride;

-  vector signed short srcM2ssA = (vector signed short)
-                                vec_mergeh((vector unsigned char)vzero, srcM2);
-  vector signed short srcM2ssB = (vector signed short)
-                                vec_mergel((vector unsigned char)vzero, srcM2);
-  vector signed short srcM1ssA = (vector signed short)
-                                vec_mergeh((vector unsigned char)vzero, srcM1);
-  vector signed short srcM1ssB = (vector signed short)
-                                vec_mergel((vector unsigned char)vzero, srcM1);
-  vector signed short srcP0ssA = (vector signed short)
-                                vec_mergeh((vector unsigned char)vzero, srcP0);
-  vector signed short srcP0ssB = (vector signed short)
-                                vec_mergel((vector unsigned char)vzero, srcP0);
-  vector signed short srcP1ssA = (vector signed short)
-                                vec_mergeh((vector unsigned char)vzero, srcP1);
-  vector signed short srcP1ssB = (vector signed short)
-                                vec_mergel((vector unsigned char)vzero, srcP1);
-  vector signed short srcP2ssA = (vector signed short)
-                                vec_mergeh((vector unsigned char)vzero, srcP2);
-  vector signed short srcP2ssB = (vector signed short)
-                                vec_mergel((vector unsigned char)vzero, srcP2);
-
-  vector signed short pp1A, pp1B, pp2A, pp2B, pp3A, pp3B,
+  vec_s16_t srcM2ssA = (vec_s16_t) vec_mergeh(zero_u8v, srcM2);
+  vec_s16_t srcM2ssB = (vec_s16_t) vec_mergel(zero_u8v, srcM2);
+  vec_s16_t srcM1ssA = (vec_s16_t) vec_mergeh(zero_u8v, srcM1);
+  vec_s16_t srcM1ssB = (vec_s16_t) vec_mergel(zero_u8v, srcM1);
+  vec_s16_t srcP0ssA = (vec_s16_t) vec_mergeh(zero_u8v, srcP0);
+  vec_s16_t srcP0ssB = (vec_s16_t) vec_mergel(zero_u8v, srcP0);
+  vec_s16_t srcP1ssA = (vec_s16_t) vec_mergeh(zero_u8v, srcP1);
+  vec_s16_t srcP1ssB = (vec_s16_t) vec_mergel(zero_u8v, srcP1);
+  vec_s16_t srcP2ssA = (vec_s16_t) vec_mergeh(zero_u8v, srcP2);
+  vec_s16_t srcP2ssB = (vec_s16_t) vec_mergel(zero_u8v, srcP2);
+
+  vec_s16_t pp1A, pp1B, pp2A, pp2B, pp3A, pp3B,
                       psumA, psumB, sumA, sumB,
                       srcP3ssA, srcP3ssB,
                       sum1A, sum1B, sum2A, sum2B, sum3A, sum3B;

-  vector unsigned char sum, dst1, dst2, vdst, fsum, rsum, fdst1, fdst2,
-                       srcP3a, srcP3b, srcP3;
+  vec_u8_t sum, dst1, dst2, vdst, fsum, rsum, fdst1, fdst2, srcP3a, srcP3b, srcP3;

   POWERPC_PERF_START_COUNT(PREFIX_h264_qpel16_v_lowpass_num, 1);

......
     srcP3a = vec_ld(0, srcbis += srcStride);
     srcP3b = vec_ld(16, srcbis);
     srcP3 = vec_perm(srcP3a, srcP3b, perm);
-    srcP3ssA = (vector signed short)
-                                vec_mergeh((vector unsigned char)vzero, srcP3);
-    srcP3ssB = (vector signed short)
-                                vec_mergel((vector unsigned char)vzero, srcP3);
+    srcP3ssA = (vec_s16_t) vec_mergeh(zero_u8v, srcP3);
+    srcP3ssB = (vec_s16_t) vec_mergel(zero_u8v, srcP3);
 //    srcbis += srcStride;

     sum1A = vec_adds(srcP0ssA, srcP1ssA);
......
     pp1A = vec_mladd(sum1A, v20ss, v16ss);
     pp1B = vec_mladd(sum1B, v20ss, v16ss);

-    pp2A = vec_mladd(sum2A, v5ss, (vector signed short)vzero);
-    pp2B = vec_mladd(sum2B, v5ss, (vector signed short)vzero);
+    pp2A = vec_mladd(sum2A, v5ss, zero_s16v);
+    pp2B = vec_mladd(sum2B, v5ss, zero_s16v);

     pp3A = vec_add(sum3A, pp1A);
     pp3B = vec_add(sum3B, pp1B);
......
 static void PREFIX_h264_qpel16_hv_lowpass_altivec(uint8_t * dst, int16_t * tmp, uint8_t * src, int dstStride, int tmpStride, int srcStride) {
   POWERPC_PERF_DECLARE(PREFIX_h264_qpel16_hv_lowpass_num, 1);
   register int i;
-  const vector signed int vzero = vec_splat_s32(0);
-  const vector unsigned char permM2 = vec_lvsl(-2, src);
-  const vector unsigned char permM1 = vec_lvsl(-1, src);
-  const vector unsigned char permP0 = vec_lvsl(+0, src);
-  const vector unsigned char permP1 = vec_lvsl(+1, src);
-  const vector unsigned char permP2 = vec_lvsl(+2, src);
-  const vector unsigned char permP3 = vec_lvsl(+3, src);
-  const vector signed short v20ss = vec_sl(vec_splat_s16(5),vec_splat_u16(2));
-  const vector unsigned int v10ui = vec_splat_u32(10);
-  const vector signed short v5ss = vec_splat_s16(5);
-  const vector signed short v1ss = vec_splat_s16(1);
-  const vector signed int v512si = vec_sl(vec_splat_s32(1),vec_splat_u32(9));
-  const vector unsigned int v16ui = vec_sl(vec_splat_u32(1),vec_splat_u32(4));
+  LOAD_ZERO;
+  const vec_u8_t permM2 = vec_lvsl(-2, src);
+  const vec_u8_t permM1 = vec_lvsl(-1, src);
+  const vec_u8_t permP0 = vec_lvsl(+0, src);
+  const vec_u8_t permP1 = vec_lvsl(+1, src);
+  const vec_u8_t permP2 = vec_lvsl(+2, src);
+  const vec_u8_t permP3 = vec_lvsl(+3, src);
+  const vec_s16_t v20ss = vec_sl(vec_splat_s16(5),vec_splat_u16(2));
+  const vec_u32_t v10ui = vec_splat_u32(10);
+  const vec_s16_t v5ss = vec_splat_s16(5);
+  const vec_s16_t v1ss = vec_splat_s16(1);
+  const vec_s32_t v512si = vec_sl(vec_splat_s32(1),vec_splat_u32(9));
+  const vec_u32_t v16ui = vec_sl(vec_splat_u32(1),vec_splat_u32(4));

   register int align = ((((unsigned long)src) - 2) % 16);

-  const vector unsigned char neg1 = (const vector unsigned char)
-                                                        vec_splat_s8(-1);
+  const vec_u8_t neg1 = (const vec_u8_t) vec_splat_s8(-1);

-  vector signed short srcP0A, srcP0B, srcP1A, srcP1B,
+  vec_s16_t srcP0A, srcP0B, srcP1A, srcP1B,
                       srcP2A, srcP2B, srcP3A, srcP3B,
                       srcM1A, srcM1B, srcM2A, srcM2B,
                       sum1A, sum1B, sum2A, sum2B, sum3A, sum3B,
                       pp1A, pp1B, pp2A, pp2B, psumA, psumB;

-  const vector unsigned char dstperm = vec_lvsr(0, dst);
+  const vec_u8_t dstperm = vec_lvsr(0, dst);

-  const vector unsigned char dstmask = vec_perm((const vector unsigned char)vzero, neg1, dstperm);
+  const vec_u8_t dstmask = vec_perm(zero_u8v, neg1, dstperm);

-  const vector unsigned char mperm = (const vector unsigned char)
+  const vec_u8_t mperm = (const vec_u8_t)
     AVV(0x00, 0x08, 0x01, 0x09, 0x02, 0x0A, 0x03, 0x0B,
         0x04, 0x0C, 0x05, 0x0D, 0x06, 0x0E, 0x07, 0x0F);
   int16_t *tmpbis = tmp;

-  vector signed short tmpM1ssA, tmpM1ssB, tmpM2ssA, tmpM2ssB,
+  vec_s16_t tmpM1ssA, tmpM1ssB, tmpM2ssA, tmpM2ssB,
                       tmpP0ssA, tmpP0ssB, tmpP1ssA, tmpP1ssB,
                       tmpP2ssA, tmpP2ssB;

-  vector signed int pp1Ae, pp1Ao, pp1Be, pp1Bo, pp2Ae, pp2Ao, pp2Be, pp2Bo,
+  vec_s32_t pp1Ae, pp1Ao, pp1Be, pp1Bo, pp2Ae, pp2Ao, pp2Be, pp2Bo,
                     pp3Ae, pp3Ao, pp3Be, pp3Bo, pp1cAe, pp1cAo, pp1cBe, pp1cBo,
                     pp32Ae, pp32Ao, pp32Be, pp32Bo, sumAe, sumAo, sumBe, sumBo,
                     ssumAe, ssumAo, ssumBe, ssumBo;
-  vector unsigned char fsum, sumv, sum, dst1, dst2, vdst,
-                       rsum, fdst1, fdst2;
-  vector signed short ssume, ssumo;
+  vec_u8_t fsum, sumv, sum, dst1, dst2, vdst, rsum, fdst1, fdst2;
+  vec_s16_t ssume, ssumo;

   POWERPC_PERF_START_COUNT(PREFIX_h264_qpel16_hv_lowpass_num, 1);
   src -= (2 * srcStride);
   for (i = 0 ; i < 21 ; i ++) {
-    vector unsigned char srcM2, srcM1, srcP0, srcP1, srcP2, srcP3;
-    vector unsigned char srcR1 = vec_ld(-2, src);
-    vector unsigned char srcR2 = vec_ld(14, src);
+    vec_u8_t srcM2, srcM1, srcP0, srcP1, srcP2, srcP3;
+    vec_u8_t srcR1 = vec_ld(-2, src);
+    vec_u8_t srcR2 = vec_ld(14, src);

     switch (align) {
     default: {
......
       srcP3 = srcR2;
     } break;
     case 12: {
-      vector unsigned char srcR3 = vec_ld(30, src);
+      vec_u8_t srcR3 = vec_ld(30, src);
       srcM2 = vec_perm(srcR1, srcR2, permM2);
       srcM1 = vec_perm(srcR1, srcR2, permM1);
       srcP0 = vec_perm(srcR1, srcR2, permP0);
......
       srcP3 = vec_perm(srcR2, srcR3, permP3);
     } break;
     case 13: {
-      vector unsigned char srcR3 = vec_ld(30, src);
+      vec_u8_t srcR3 = vec_ld(30, src);
       srcM2 = vec_perm(srcR1, srcR2, permM2);
       srcM1 = vec_perm(srcR1, srcR2, permM1);
       srcP0 = vec_perm(srcR1, srcR2, permP0);
......
       srcP3 = vec_perm(srcR2, srcR3, permP3);
     } break;
     case 14: {
-      vector unsigned char srcR3 = vec_ld(30, src);
+      vec_u8_t srcR3 = vec_ld(30, src);
       srcM2 = vec_perm(srcR1, srcR2, permM2);
       srcM1 = vec_perm(srcR1, srcR2, permM1);
       srcP0 = srcR2;
......
       srcP3 = vec_perm(srcR2, srcR3, permP3);
     } break;
     case 15: {
-      vector unsigned char srcR3 = vec_ld(30, src);
+      vec_u8_t srcR3 = vec_ld(30, src);
       srcM2 = vec_perm(srcR1, srcR2, permM2);
       srcM1 = srcR2;
       srcP0 = vec_perm(srcR2, srcR3, permP0);
......
     } break;
     }

-    srcP0A = (vector signed short)
-                            vec_mergeh((vector unsigned char)vzero, srcP0);
-    srcP0B = (vector signed short)
-                            vec_mergel((vector unsigned char)vzero, srcP0);
-    srcP1A = (vector signed short)
-                            vec_mergeh((vector unsigned char)vzero, srcP1);
-    srcP1B = (vector signed short)
-                            vec_mergel((vector unsigned char)vzero, srcP1);
-
-    srcP2A = (vector signed short)
-                            vec_mergeh((vector unsigned char)vzero, srcP2);
-    srcP2B = (vector signed short)
-                            vec_mergel((vector unsigned char)vzero, srcP2);
-    srcP3A = (vector signed short)
-                            vec_mergeh((vector unsigned char)vzero, srcP3);
-    srcP3B = (vector signed short)
-                            vec_mergel((vector unsigned char)vzero, srcP3);
-
-    srcM1A = (vector signed short)
-                            vec_mergeh((vector unsigned char)vzero, srcM1);
-    srcM1B = (vector signed short)
-                            vec_mergel((vector unsigned char)vzero, srcM1);
-    srcM2A = (vector signed short)
-                            vec_mergeh((vector unsigned char)vzero, srcM2);
-    srcM2B = (vector signed short)
-                            vec_mergel((vector unsigned char)vzero, srcM2);
+    srcP0A = (vec_s16_t) vec_mergeh(zero_u8v, srcP0);
+    srcP0B = (vec_s16_t) vec_mergel(zero_u8v, srcP0);
+    srcP1A = (vec_s16_t) vec_mergeh(zero_u8v, srcP1);
+    srcP1B = (vec_s16_t) vec_mergel(zero_u8v, srcP1);
+
+    srcP2A = (vec_s16_t) vec_mergeh(zero_u8v, srcP2);
+    srcP2B = (vec_s16_t) vec_mergel(zero_u8v, srcP2);
+    srcP3A = (vec_s16_t) vec_mergeh(zero_u8v, srcP3);
+    srcP3B = (vec_s16_t) vec_mergel(zero_u8v, srcP3);
+
+    srcM1A = (vec_s16_t) vec_mergeh(zero_u8v, srcM1);
+    srcM1B = (vec_s16_t) vec_mergel(zero_u8v, srcM1);
+    srcM2A = (vec_s16_t) vec_mergeh(zero_u8v, srcM2);
+    srcM2B = (vec_s16_t) vec_mergel(zero_u8v, srcM2);

     sum1A = vec_adds(srcP0A, srcP1A);
     sum1B = vec_adds(srcP0B, srcP1B);
......
     pp1A = vec_mladd(sum1A, v20ss, sum3A);
     pp1B = vec_mladd(sum1B, v20ss, sum3B);

-    pp2A = vec_mladd(sum2A, v5ss, (vector signed short)vzero);
-    pp2B = vec_mladd(sum2B, v5ss, (vector signed short)vzero);
+    pp2A = vec_mladd(sum2A, v5ss, zero_s16v);
+    pp2B = vec_mladd(sum2B, v5ss, zero_s16v);

     psumA = vec_sub(pp1A, pp2A);
     psumB = vec_sub(pp1B, pp2B);
......
   tmpbis += tmpStride;

   for (i = 0 ; i < 16 ; i++) {
-    const vector signed short tmpP3ssA = vec_ld(0, tmpbis);
-    const vector signed short tmpP3ssB = vec_ld(16, tmpbis);
+    const vec_s16_t tmpP3ssA = vec_ld(0, tmpbis);
+    const vec_s16_t tmpP3ssB = vec_ld(16, tmpbis);

-    const vector signed short sum1A = vec_adds(tmpP0ssA, tmpP1ssA);
-    const vector signed short sum1B = vec_adds(tmpP0ssB, tmpP1ssB);
-    const vector signed short sum2A = vec_adds(tmpM1ssA, tmpP2ssA);
-    const vector signed short sum2B = vec_adds(tmpM1ssB, tmpP2ssB);
-    const vector signed short sum3A = vec_adds(tmpM2ssA, tmpP3ssA);
-    const vector signed short sum3B = vec_adds(tmpM2ssB, tmpP3ssB);
+    const vec_s16_t sum1A = vec_adds(tmpP0ssA, tmpP1ssA);
+    const vec_s16_t sum1B = vec_adds(tmpP0ssB, tmpP1ssB);
+    const vec_s16_t sum2A = vec_adds(tmpM1ssA, tmpP2ssA);
+    const vec_s16_t sum2B = vec_adds(tmpM1ssB, tmpP2ssB);
+    const vec_s16_t sum3A = vec_adds(tmpM2ssA, tmpP3ssA);
+    const vec_s16_t sum3B = vec_adds(tmpM2ssB, tmpP3ssB);

     tmpbis += tmpStride;

......
     pp2Be = vec_mule(sum2B, v5ss);
     pp2Bo = vec_mulo(sum2B, v5ss);

-    pp3Ae = vec_sra((vector signed int)sum3A, v16ui);
+    pp3Ae = vec_sra((vec_s32_t)sum3A, v16ui);
     pp3Ao = vec_mulo(sum3A, v1ss);
-    pp3Be = vec_sra((vector signed int)sum3B, v16ui);
+    pp3Be = vec_sra((vec_s32_t)sum3B, v16ui);
     pp3Bo = vec_mulo(sum3B, v1ss);

     pp1cAe = vec_add(pp1Ae, v512si);

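The pattern this revision rewrites throughout both files is the zero-extension of packed bytes into 16-bit lanes. A minimal sketch of the new idiom, assuming the macro definitions sketched above (widen_high_u8 is a hypothetical name, not part of the revision):

    #include <altivec.h>

    /* Hypothetical helper: zero-extend the high eight bytes of a u8 vector
     * into eight signed 16-bit lanes (big-endian lane order), i.e. the
     * revision's (vec_s16_t) vec_mergeh(zero_u8v, src) idiom. */
    static inline vec_s16_t widen_high_u8(vec_u8_t src)
    {
        LOAD_ZERO;  /* declares the local zero vector used by zero_u8v */
        return (vec_s16_t) vec_mergeh(zero_u8v, src);
    }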