Revision 3ca96802 libavcodec/ppc/h264_altivec.c

View differences:

libavcodec/ppc/h264_altivec.c
186 186
                          ((8 - x) * (y)),
187 187
                          ((x) * (y))};
188 188
    register int i;
189
    vector unsigned char fperm;
190
    const vector signed int vABCD = vec_ld(0, ABCD);
191
    const vector signed short vA = vec_splat((vector signed short)vABCD, 1);
192
    const vector signed short vB = vec_splat((vector signed short)vABCD, 3);
193
    const vector signed short vC = vec_splat((vector signed short)vABCD, 5);
194
    const vector signed short vD = vec_splat((vector signed short)vABCD, 7);
195
    const vector signed int vzero = vec_splat_s32(0);
196
    const vector signed short v28ss = vec_sub(vec_sl(vec_splat_s16(1),vec_splat_u16(5)),vec_splat_s16(4));
197
    const vector unsigned short v6us = vec_splat_u16(6);
189
    vec_u8_t fperm;
190
    const vec_s32_t vABCD = vec_ld(0, ABCD);
191
    const vec_s16_t vA = vec_splat((vec_s16_t)vABCD, 1);
192
    const vec_s16_t vB = vec_splat((vec_s16_t)vABCD, 3);
193
    const vec_s16_t vC = vec_splat((vec_s16_t)vABCD, 5);
194
    const vec_s16_t vD = vec_splat((vec_s16_t)vABCD, 7);
195
    LOAD_ZERO;
196
    const vec_s16_t v28ss = vec_sub(vec_sl(vec_splat_s16(1),vec_splat_u16(5)),vec_splat_s16(4));
197
    const vec_u16_t v6us = vec_splat_u16(6);
198 198
    register int loadSecond = (((unsigned long)src) % 16) <= 7 ? 0 : 1;
199 199
    register int reallyBadAlign = (((unsigned long)src) % 16) == 15 ? 1 : 0;
200 200

  
201
    vector unsigned char vsrcAuc, vsrcBuc, vsrcperm0, vsrcperm1;
202
    vector unsigned char vsrc0uc, vsrc1uc;
203
    vector signed short vsrc0ssH, vsrc1ssH;
204
    vector unsigned char vsrcCuc, vsrc2uc, vsrc3uc;
205
    vector signed short vsrc2ssH, vsrc3ssH, psum;
206
    vector unsigned char vdst, ppsum, fsum;
201
    vec_u8_t vsrcAuc, vsrcBuc, vsrcperm0, vsrcperm1;
202
    vec_u8_t vsrc0uc, vsrc1uc;
203
    vec_s16_t vsrc0ssH, vsrc1ssH;
204
    vec_u8_t vsrcCuc, vsrc2uc, vsrc3uc;
205
    vec_s16_t vsrc2ssH, vsrc3ssH, psum;
206
    vec_u8_t vdst, ppsum, fsum;
207 207

  
208 208
    if (((unsigned long)dst) % 16 == 0) {
209
      fperm = (vector unsigned char)AVV(0x10, 0x11, 0x12, 0x13,
210
                                        0x14, 0x15, 0x16, 0x17,
211
                                        0x08, 0x09, 0x0A, 0x0B,
212
                                        0x0C, 0x0D, 0x0E, 0x0F);
209
      fperm = (vec_u8_t)AVV(0x10, 0x11, 0x12, 0x13,
210
                            0x14, 0x15, 0x16, 0x17,
211
                            0x08, 0x09, 0x0A, 0x0B,
212
                            0x0C, 0x0D, 0x0E, 0x0F);
213 213
    } else {
214
      fperm = (vector unsigned char)AVV(0x00, 0x01, 0x02, 0x03,
215
                                        0x04, 0x05, 0x06, 0x07,
216
                                        0x18, 0x19, 0x1A, 0x1B,
217
                                        0x1C, 0x1D, 0x1E, 0x1F);
214
      fperm = (vec_u8_t)AVV(0x00, 0x01, 0x02, 0x03,
215
                            0x04, 0x05, 0x06, 0x07,
216
                            0x18, 0x19, 0x1A, 0x1B,
217
                            0x1C, 0x1D, 0x1E, 0x1F);
218 218
    }
219 219

  
220 220
    vsrcAuc = vec_ld(0, src);
......
230 230
    else
231 231
      vsrc1uc = vec_perm(vsrcAuc, vsrcBuc, vsrcperm1);
232 232

  
233
    vsrc0ssH = (vector signed short)vec_mergeh((vector unsigned char)vzero,
234
                                               (vector unsigned char)vsrc0uc);
235
    vsrc1ssH = (vector signed short)vec_mergeh((vector unsigned char)vzero,
236
                                               (vector unsigned char)vsrc1uc);
233
    vsrc0ssH = (vec_s16_t)vec_mergeh(zero_u8v, (vec_u8_t)vsrc0uc);
234
    vsrc1ssH = (vec_s16_t)vec_mergeh(zero_u8v, (vec_u8_t)vsrc1uc);
237 235

  
238 236
    if (!loadSecond) {// -> !reallyBadAlign
239 237
      for (i = 0 ; i < h ; i++) {
......
244 242
        vsrc2uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm0);
245 243
        vsrc3uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm1);
246 244

  
247
        vsrc2ssH = (vector signed short)vec_mergeh((vector unsigned char)vzero,
248
                                                (vector unsigned char)vsrc2uc);
249
        vsrc3ssH = (vector signed short)vec_mergeh((vector unsigned char)vzero,
250
                                                (vector unsigned char)vsrc3uc);
245
        vsrc2ssH = (vec_s16_t)vec_mergeh(zero_u8v, (vec_u8_t)vsrc2uc);
246
        vsrc3ssH = (vec_s16_t)vec_mergeh(zero_u8v, (vec_u8_t)vsrc3uc);
251 247

  
252 248
        psum = vec_mladd(vA, vsrc0ssH, vec_splat_s16(0));
253 249
        psum = vec_mladd(vB, vsrc1ssH, psum);
......
257 253
        psum = vec_sra(psum, v6us);
258 254

  
259 255
        vdst = vec_ld(0, dst);
260
        ppsum = (vector unsigned char)vec_packsu(psum, psum);
256
        ppsum = (vec_u8_t)vec_packsu(psum, psum);
261 257
        fsum = vec_perm(vdst, ppsum, fperm);
262 258

  
263 259
        vec_st(fsum, 0, dst);
......
269 265
        src += stride;
270 266
      }
271 267
    } else {
272
        vector unsigned char vsrcDuc;
268
        vec_u8_t vsrcDuc;
273 269
      for (i = 0 ; i < h ; i++) {
274 270
        vsrcCuc = vec_ld(stride + 0, src);
275 271
        vsrcDuc = vec_ld(stride + 16, src);
......
280 276
        else
281 277
          vsrc3uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm1);
282 278

  
283
        vsrc2ssH = (vector signed short)vec_mergeh((vector unsigned char)vzero,
284
                                                (vector unsigned char)vsrc2uc);
285
        vsrc3ssH = (vector signed short)vec_mergeh((vector unsigned char)vzero,
286
                                                (vector unsigned char)vsrc3uc);
279
        vsrc2ssH = (vec_s16_t)vec_mergeh(zero_u8v, (vec_u8_t)vsrc2uc);
280
        vsrc3ssH = (vec_s16_t)vec_mergeh(zero_u8v, (vec_u8_t)vsrc3uc);
287 281

  
288 282
        psum = vec_mladd(vA, vsrc0ssH, vec_splat_s16(0));
289 283
        psum = vec_mladd(vB, vsrc1ssH, psum);
......
293 287
        psum = vec_sr(psum, v6us);
294 288

  
295 289
        vdst = vec_ld(0, dst);
296
        ppsum = (vector unsigned char)vec_pack(psum, psum);
290
        ppsum = (vec_u8_t)vec_pack(psum, psum);
297 291
        fsum = vec_perm(vdst, ppsum, fperm);
298 292

  
299 293
        vec_st(fsum, 0, dst);
......
312 306
                                    int src_stride1, int h)
313 307
{
314 308
    int i;
315
    vector unsigned char a, b, d, tmp1, tmp2, mask, mask_, edges, align;
309
    vec_u8_t a, b, d, tmp1, tmp2, mask, mask_, edges, align;
316 310

  
317 311
    mask_ = vec_lvsl(0, src2);
318 312

  
......
354 348
                                    int src_stride1, int h)
355 349
{
356 350
    int i;
357
    vector unsigned char a, b, d, tmp1, tmp2, mask, mask_, edges, align;
351
    vec_u8_t a, b, d, tmp1, tmp2, mask, mask_, edges, align;
358 352

  
359 353
    mask_ = vec_lvsl(0, src2);
360 354

  
......
567 561
    const vec_u16_t twov = vec_splat_u16(2);
568 562
    const vec_u16_t sixv = vec_splat_u16(6);
569 563

  
570
    const vec_u8_t sel = (vec_u8_t) AVV(0,0,0,0,0,0,0,0,
571
                                        -1,-1,-1,-1,-1,-1,-1,-1);
564
    const vec_u8_t sel = (vec_u8_t) AVV(0,0,0,0,0,0,0,0,-1,-1,-1,-1,-1,-1,-1,-1);
572 565
    LOAD_ZERO;
573 566

  
574 567
    dct[0] += 32; // rounding for the >>6 at the end
......
601 594
}
602 595

  
603 596
#define transpose4x16(r0, r1, r2, r3) {      \
604
    register vector unsigned char r4;        \
605
    register vector unsigned char r5;        \
606
    register vector unsigned char r6;        \
607
    register vector unsigned char r7;        \
597
    register vec_u8_t r4;                    \
598
    register vec_u8_t r5;                    \
599
    register vec_u8_t r6;                    \
600
    register vec_u8_t r7;                    \
608 601
                                             \
609 602
    r4 = vec_mergeh(r0, r2);  /*0, 2 set 0*/ \
610 603
    r5 = vec_mergel(r0, r2);  /*0, 2 set 1*/ \
......
618 611
}
619 612

  
620 613
static inline void write16x4(uint8_t *dst, int dst_stride,
621
                             register vector unsigned char r0, register vector unsigned char r1,
622
                             register vector unsigned char r2, register vector unsigned char r3) {
614
                             register vec_u8_t r0, register vec_u8_t r1,
615
                             register vec_u8_t r2, register vec_u8_t r3) {
623 616
    DECLARE_ALIGNED_16(unsigned char, result[64]);
624 617
    uint32_t *src_int = (uint32_t *)result, *dst_int = (uint32_t *)dst;
625 618
    int int_dst_stride = dst_stride/4;
......
651 644
    \todo FIXME: see if we can't spare some vec_lvsl() by factorizing them
652 645
    out of unaligned_load() */
653 646
#define readAndTranspose16x6(src, src_stride, r8, r9, r10, r11, r12, r13) {\
654
    register vector unsigned char r0  = unaligned_load(0,             src);\
655
    register vector unsigned char r1  = unaligned_load(   src_stride, src);\
656
    register vector unsigned char r2  = unaligned_load(2* src_stride, src);\
657
    register vector unsigned char r3  = unaligned_load(3* src_stride, src);\
658
    register vector unsigned char r4  = unaligned_load(4* src_stride, src);\
659
    register vector unsigned char r5  = unaligned_load(5* src_stride, src);\
660
    register vector unsigned char r6  = unaligned_load(6* src_stride, src);\
661
    register vector unsigned char r7  = unaligned_load(7* src_stride, src);\
662
    register vector unsigned char r14 = unaligned_load(14*src_stride, src);\
663
    register vector unsigned char r15 = unaligned_load(15*src_stride, src);\
647
    register vec_u8_t r0  = unaligned_load(0,             src);            \
648
    register vec_u8_t r1  = unaligned_load(   src_stride, src);            \
649
    register vec_u8_t r2  = unaligned_load(2* src_stride, src);            \
650
    register vec_u8_t r3  = unaligned_load(3* src_stride, src);            \
651
    register vec_u8_t r4  = unaligned_load(4* src_stride, src);            \
652
    register vec_u8_t r5  = unaligned_load(5* src_stride, src);            \
653
    register vec_u8_t r6  = unaligned_load(6* src_stride, src);            \
654
    register vec_u8_t r7  = unaligned_load(7* src_stride, src);            \
655
    register vec_u8_t r14 = unaligned_load(14*src_stride, src);            \
656
    register vec_u8_t r15 = unaligned_load(15*src_stride, src);            \
664 657
                                                                           \
665 658
    r8  = unaligned_load( 8*src_stride, src);                              \
666 659
    r9  = unaligned_load( 9*src_stride, src);                              \
......
710 703
}
711 704

  
712 705
// out: o = |x-y| < a
713
static inline vector unsigned char diff_lt_altivec ( register vector unsigned char x,
714
                                                     register vector unsigned char y,
715
                                                     register vector unsigned char a) {
716

  
717
    register vector unsigned char diff = vec_subs(x, y);
718
    register vector unsigned char diffneg = vec_subs(y, x);
719
    register vector unsigned char o = vec_or(diff, diffneg); /* |x-y| */
720
    o = (vector unsigned char)vec_cmplt(o, a);
706
static inline vec_u8_t diff_lt_altivec ( register vec_u8_t x,
707
                                         register vec_u8_t y,
708
                                         register vec_u8_t a) {
709

  
710
    register vec_u8_t diff = vec_subs(x, y);
711
    register vec_u8_t diffneg = vec_subs(y, x);
712
    register vec_u8_t o = vec_or(diff, diffneg); /* |x-y| */
713
    o = (vec_u8_t)vec_cmplt(o, a);
721 714
    return o;
722 715
}
723 716

  
724
static inline vector unsigned char h264_deblock_mask ( register vector unsigned char p0,
725
                                                       register vector unsigned char p1,
726
                                                       register vector unsigned char q0,
727
                                                       register vector unsigned char q1,
728
                                                       register vector unsigned char alpha,
729
                                                       register vector unsigned char beta) {
717
static inline vec_u8_t h264_deblock_mask ( register vec_u8_t p0,
718
                                           register vec_u8_t p1,
719
                                           register vec_u8_t q0,
720
                                           register vec_u8_t q1,
721
                                           register vec_u8_t alpha,
722
                                           register vec_u8_t beta) {
730 723

  
731
    register vector unsigned char mask;
732
    register vector unsigned char tempmask;
724
    register vec_u8_t mask;
725
    register vec_u8_t tempmask;
733 726

  
734 727
    mask = diff_lt_altivec(p0, q0, alpha);
735 728
    tempmask = diff_lt_altivec(p1, p0, beta);
......
741 734
}
742 735

  
743 736
// out: newp1 = clip((p2 + ((p0 + q0 + 1) >> 1)) >> 1, p1-tc0, p1+tc0)
744
static inline vector unsigned char h264_deblock_q1(register vector unsigned char p0,
745
                                                   register vector unsigned char p1,
746
                                                   register vector unsigned char p2,
747
                                                   register vector unsigned char q0,
748
                                                   register vector unsigned char tc0) {
749

  
750
    register vector unsigned char average = vec_avg(p0, q0);
751
    register vector unsigned char temp;
752
    register vector unsigned char uncliped;
753
    register vector unsigned char ones;
754
    register vector unsigned char max;
755
    register vector unsigned char min;
756
    register vector unsigned char newp1;
737
static inline vec_u8_t h264_deblock_q1(register vec_u8_t p0,
738
                                       register vec_u8_t p1,
739
                                       register vec_u8_t p2,
740
                                       register vec_u8_t q0,
741
                                       register vec_u8_t tc0) {
742

  
743
    register vec_u8_t average = vec_avg(p0, q0);
744
    register vec_u8_t temp;
745
    register vec_u8_t uncliped;
746
    register vec_u8_t ones;
747
    register vec_u8_t max;
748
    register vec_u8_t min;
749
    register vec_u8_t newp1;
757 750

  
758 751
    temp = vec_xor(average, p2);
759 752
    average = vec_avg(average, p2);     /*avg(p2, avg(p0, q0)) */
......
769 762

  
770 763
#define h264_deblock_p0_q0(p0, p1, q0, q1, tc0masked) {                                           \
771 764
                                                                                                  \
772
    const vector unsigned char A0v = vec_sl(vec_splat_u8(10), vec_splat_u8(4));                   \
765
    const vec_u8_t A0v = vec_sl(vec_splat_u8(10), vec_splat_u8(4));                               \
773 766
                                                                                                  \
774
    register vector unsigned char pq0bit = vec_xor(p0,q0);                                        \
775
    register vector unsigned char q1minus;                                                        \
776
    register vector unsigned char p0minus;                                                        \
777
    register vector unsigned char stage1;                                                         \
778
    register vector unsigned char stage2;                                                         \
779
    register vector unsigned char vec160;                                                         \
780
    register vector unsigned char delta;                                                          \
781
    register vector unsigned char deltaneg;                                                       \
767
    register vec_u8_t pq0bit = vec_xor(p0,q0);                                                    \
768
    register vec_u8_t q1minus;                                                                    \
769
    register vec_u8_t p0minus;                                                                    \
770
    register vec_u8_t stage1;                                                                     \
771
    register vec_u8_t stage2;                                                                     \
772
    register vec_u8_t vec160;                                                                     \
773
    register vec_u8_t delta;                                                                      \
774
    register vec_u8_t deltaneg;                                                                   \
782 775
                                                                                                  \
783 776
    q1minus = vec_nor(q1, q1);                 /* 255 - q1 */                                     \
784 777
    stage1 = vec_avg(p1, q1minus);             /* (p1 - q1 + 256)>>1 */                           \
......
801 794

  
802 795
#define h264_loop_filter_luma_altivec(p2, p1, p0, q0, q1, q2, alpha, beta, tc0) {            \
803 796
    DECLARE_ALIGNED_16(unsigned char, temp[16]);                                             \
804
    register vector unsigned char alphavec;                                                  \
805
    register vector unsigned char betavec;                                                   \
806
    register vector unsigned char mask;                                                      \
807
    register vector unsigned char p1mask;                                                    \
808
    register vector unsigned char q1mask;                                                    \
797
    register vec_u8_t alphavec;                                                              \
798
    register vec_u8_t betavec;                                                               \
799
    register vec_u8_t mask;                                                                  \
800
    register vec_u8_t p1mask;                                                                \
801
    register vec_u8_t q1mask;                                                                \
809 802
    register vector signed   char tc0vec;                                                    \
810
    register vector unsigned char finaltc0;                                                  \
811
    register vector unsigned char tc0masked;                                                 \
812
    register vector unsigned char newp1;                                                     \
813
    register vector unsigned char newq1;                                                     \
803
    register vec_u8_t finaltc0;                                                              \
804
    register vec_u8_t tc0masked;                                                             \
805
    register vec_u8_t newp1;                                                                 \
806
    register vec_u8_t newq1;                                                                 \
814 807
                                                                                             \
815 808
    temp[0] = alpha;                                                                         \
816 809
    temp[1] = beta;                                                                          \
......
824 817
    tc0vec = vec_mergeh(tc0vec, tc0vec);                                                     \
825 818
    tc0vec = vec_mergeh(tc0vec, tc0vec);                                                     \
826 819
    mask = vec_and(mask, vec_cmpgt(tc0vec, vec_splat_s8(-1)));  /* if tc0[i] >= 0 */         \
827
    finaltc0 = vec_and((vector unsigned char)tc0vec, mask);     /* tc = tc0 */               \
820
    finaltc0 = vec_and((vec_u8_t)tc0vec, mask);     /* tc = tc0 */                           \
828 821
                                                                                             \
829 822
    p1mask = diff_lt_altivec(p2, p0, betavec);                                               \
830 823
    p1mask = vec_and(p1mask, mask);                             /* if( |p2 - p0| < beta) */  \
831
    tc0masked = vec_and(p1mask, (vector unsigned char)tc0vec);                               \
824
    tc0masked = vec_and(p1mask, (vec_u8_t)tc0vec);                                           \
832 825
    finaltc0 = vec_sub(finaltc0, p1mask);                       /* tc++ */                   \
833 826
    newp1 = h264_deblock_q1(p0, p1, p2, q0, tc0masked);                                      \
834 827
    /*end if*/                                                                               \
835 828
                                                                                             \
836 829
    q1mask = diff_lt_altivec(q2, q0, betavec);                                               \
837 830
    q1mask = vec_and(q1mask, mask);                             /* if ( |q2 - q0| < beta ) */\
838
    tc0masked = vec_and(q1mask, (vector unsigned char)tc0vec);                               \
831
    tc0masked = vec_and(q1mask, (vec_u8_t)tc0vec);                                           \
839 832
    finaltc0 = vec_sub(finaltc0, q1mask);                       /* tc++ */                   \
840 833
    newq1 = h264_deblock_q1(p0, q1, q2, q0, tc0masked);                                      \
841 834
    /*end if*/                                                                               \
......
848 841
static void h264_v_loop_filter_luma_altivec(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0) {
849 842

  
850 843
    if((tc0[0] & tc0[1] & tc0[2] & tc0[3]) >= 0) {
851
        register vector unsigned char p2 = vec_ld(-3*stride, pix);
852
        register vector unsigned char p1 = vec_ld(-2*stride, pix);
853
        register vector unsigned char p0 = vec_ld(-1*stride, pix);
854
        register vector unsigned char q0 = vec_ld(0, pix);
855
        register vector unsigned char q1 = vec_ld(stride, pix);
856
        register vector unsigned char q2 = vec_ld(2*stride, pix);
844
        register vec_u8_t p2 = vec_ld(-3*stride, pix);
845
        register vec_u8_t p1 = vec_ld(-2*stride, pix);
846
        register vec_u8_t p0 = vec_ld(-1*stride, pix);
847
        register vec_u8_t q0 = vec_ld(0, pix);
848
        register vec_u8_t q1 = vec_ld(stride, pix);
849
        register vec_u8_t q2 = vec_ld(2*stride, pix);
857 850
        h264_loop_filter_luma_altivec(p2, p1, p0, q0, q1, q2, alpha, beta, tc0);
858 851
        vec_st(p1, -2*stride, pix);
859 852
        vec_st(p0, -1*stride, pix);
......
864 857

  
865 858
static void h264_h_loop_filter_luma_altivec(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0) {
866 859

  
867
    register vector unsigned char line0, line1, line2, line3, line4, line5;
860
    register vec_u8_t line0, line1, line2, line3, line4, line5;
868 861
    if((tc0[0] & tc0[1] & tc0[2] & tc0[3]) < 0)
869 862
        return;
870 863
    readAndTranspose16x6(pix-3, stride, line0, line1, line2, line3, line4, line5);

Also available in: Unified diff