Revision 830bf1f2

View differences:

libavcodec/ppc/h264_altivec.c
@@ -182 +182 @@
 void put_no_rnd_h264_chroma_mc8_altivec(uint8_t * dst, uint8_t * src, int stride, int h, int x, int y) {
    DECLARE_ALIGNED_16(signed int, ABCD[4]) =
                         {((8 - x) * (8 - y)),
-                          ((x) * (8 - y)),
-                          ((8 - x) * (y)),
-                          ((x) * (y))};
+                             ((x) * (8 - y)),
+                         ((8 - x) * (y)),
+                             ((x) * (y))};
     register int i;
     vec_u8_t fperm;
     const vec_s32_t vABCD = vec_ld(0, ABCD);
@@ -195 +195 @@
     LOAD_ZERO;
     const vec_s16_t v28ss = vec_sub(vec_sl(vec_splat_s16(1),vec_splat_u16(5)),vec_splat_s16(4));
     const vec_u16_t v6us = vec_splat_u16(6);
-    register int loadSecond = (((unsigned long)src) % 16) <= 7 ? 0 : 1;
+    register int loadSecond     = (((unsigned long)src) % 16) <= 7 ? 0 : 1;
     register int reallyBadAlign = (((unsigned long)src) % 16) == 15 ? 1 : 0;

     vec_u8_t vsrcAuc, vsrcBuc, vsrcperm0, vsrcperm1;
@@ -206 +206 @@
     vec_u8_t vdst, ppsum, fsum;

     if (((unsigned long)dst) % 16 == 0) {
-      fperm = (vec_u8_t)AVV(0x10, 0x11, 0x12, 0x13,
-                            0x14, 0x15, 0x16, 0x17,
-                            0x08, 0x09, 0x0A, 0x0B,
-                            0x0C, 0x0D, 0x0E, 0x0F);
+        fperm = (vec_u8_t)AVV(0x10, 0x11, 0x12, 0x13,
+                              0x14, 0x15, 0x16, 0x17,
+                              0x08, 0x09, 0x0A, 0x0B,
+                              0x0C, 0x0D, 0x0E, 0x0F);
     } else {
-      fperm = (vec_u8_t)AVV(0x00, 0x01, 0x02, 0x03,
-                            0x04, 0x05, 0x06, 0x07,
-                            0x18, 0x19, 0x1A, 0x1B,
-                            0x1C, 0x1D, 0x1E, 0x1F);
+        fperm = (vec_u8_t)AVV(0x00, 0x01, 0x02, 0x03,
+                              0x04, 0x05, 0x06, 0x07,
+                              0x18, 0x19, 0x1A, 0x1B,
+                              0x1C, 0x1D, 0x1E, 0x1F);
     }

     vsrcAuc = vec_ld(0, src);

     if (loadSecond)
-      vsrcBuc = vec_ld(16, src);
+        vsrcBuc = vec_ld(16, src);
     vsrcperm0 = vec_lvsl(0, src);
     vsrcperm1 = vec_lvsl(1, src);

     vsrc0uc = vec_perm(vsrcAuc, vsrcBuc, vsrcperm0);
     if (reallyBadAlign)
-      vsrc1uc = vsrcBuc;
+        vsrc1uc = vsrcBuc;
     else
-      vsrc1uc = vec_perm(vsrcAuc, vsrcBuc, vsrcperm1);
+        vsrc1uc = vec_perm(vsrcAuc, vsrcBuc, vsrcperm1);

     vsrc0ssH = (vec_s16_t)vec_mergeh(zero_u8v, (vec_u8_t)vsrc0uc);
     vsrc1ssH = (vec_s16_t)vec_mergeh(zero_u8v, (vec_u8_t)vsrc1uc);

     if (!loadSecond) {// -> !reallyBadAlign
-      for (i = 0 ; i < h ; i++) {
+        for (i = 0 ; i < h ; i++) {


-        vsrcCuc = vec_ld(stride + 0, src);
+            vsrcCuc = vec_ld(stride + 0, src);

-        vsrc2uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm0);
-        vsrc3uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm1);
+            vsrc2uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm0);
+            vsrc3uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm1);

-        vsrc2ssH = (vec_s16_t)vec_mergeh(zero_u8v, (vec_u8_t)vsrc2uc);
-        vsrc3ssH = (vec_s16_t)vec_mergeh(zero_u8v, (vec_u8_t)vsrc3uc);
+            vsrc2ssH = (vec_s16_t)vec_mergeh(zero_u8v, (vec_u8_t)vsrc2uc);
+            vsrc3ssH = (vec_s16_t)vec_mergeh(zero_u8v, (vec_u8_t)vsrc3uc);

-        psum = vec_mladd(vA, vsrc0ssH, vec_splat_s16(0));
-        psum = vec_mladd(vB, vsrc1ssH, psum);
-        psum = vec_mladd(vC, vsrc2ssH, psum);
-        psum = vec_mladd(vD, vsrc3ssH, psum);
-        psum = vec_add(v28ss, psum);
-        psum = vec_sra(psum, v6us);
+            psum = vec_mladd(vA, vsrc0ssH, vec_splat_s16(0));
+            psum = vec_mladd(vB, vsrc1ssH, psum);
+            psum = vec_mladd(vC, vsrc2ssH, psum);
+            psum = vec_mladd(vD, vsrc3ssH, psum);
+            psum = vec_add(v28ss, psum);
+            psum = vec_sra(psum, v6us);

-        vdst = vec_ld(0, dst);
-        ppsum = (vec_u8_t)vec_packsu(psum, psum);
-        fsum = vec_perm(vdst, ppsum, fperm);
+            vdst = vec_ld(0, dst);
+            ppsum = (vec_u8_t)vec_packsu(psum, psum);
+            fsum = vec_perm(vdst, ppsum, fperm);

-        vec_st(fsum, 0, dst);
+            vec_st(fsum, 0, dst);

-        vsrc0ssH = vsrc2ssH;
-        vsrc1ssH = vsrc3ssH;
+            vsrc0ssH = vsrc2ssH;
+            vsrc1ssH = vsrc3ssH;

-        dst += stride;
-        src += stride;
-      }
+            dst += stride;
+            src += stride;
+        }
     } else {
         vec_u8_t vsrcDuc;
-      for (i = 0 ; i < h ; i++) {
-        vsrcCuc = vec_ld(stride + 0, src);
-        vsrcDuc = vec_ld(stride + 16, src);
-
-        vsrc2uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm0);
-        if (reallyBadAlign)
-          vsrc3uc = vsrcDuc;
-        else
-          vsrc3uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm1);
-
-        vsrc2ssH = (vec_s16_t)vec_mergeh(zero_u8v, (vec_u8_t)vsrc2uc);
-        vsrc3ssH = (vec_s16_t)vec_mergeh(zero_u8v, (vec_u8_t)vsrc3uc);
-
-        psum = vec_mladd(vA, vsrc0ssH, vec_splat_s16(0));
-        psum = vec_mladd(vB, vsrc1ssH, psum);
-        psum = vec_mladd(vC, vsrc2ssH, psum);
-        psum = vec_mladd(vD, vsrc3ssH, psum);
-        psum = vec_add(v28ss, psum);
-        psum = vec_sr(psum, v6us);
-
-        vdst = vec_ld(0, dst);
-        ppsum = (vec_u8_t)vec_pack(psum, psum);
-        fsum = vec_perm(vdst, ppsum, fperm);
-
-        vec_st(fsum, 0, dst);
-
-        vsrc0ssH = vsrc2ssH;
-        vsrc1ssH = vsrc3ssH;
-
-        dst += stride;
-        src += stride;
-      }
+        for (i = 0 ; i < h ; i++) {
+            vsrcCuc = vec_ld(stride + 0, src);
+            vsrcDuc = vec_ld(stride + 16, src);
+
+            vsrc2uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm0);
+            if (reallyBadAlign)
+                vsrc3uc = vsrcDuc;
+            else
+                vsrc3uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm1);
+
+            vsrc2ssH = (vec_s16_t)vec_mergeh(zero_u8v, (vec_u8_t)vsrc2uc);
+            vsrc3ssH = (vec_s16_t)vec_mergeh(zero_u8v, (vec_u8_t)vsrc3uc);
+
+            psum = vec_mladd(vA, vsrc0ssH, vec_splat_s16(0));
+            psum = vec_mladd(vB, vsrc1ssH, psum);
+            psum = vec_mladd(vC, vsrc2ssH, psum);
+            psum = vec_mladd(vD, vsrc3ssH, psum);
+            psum = vec_add(v28ss, psum);
+            psum = vec_sr(psum, v6us);
+
+            vdst = vec_ld(0, dst);
+            ppsum = (vec_u8_t)vec_pack(psum, psum);
+            fsum = vec_perm(vdst, ppsum, fperm);
+
+            vec_st(fsum, 0, dst);
+
+            vsrc0ssH = vsrc2ssH;
+            vsrc1ssH = vsrc3ssH;
+
+            dst += stride;
+            src += stride;
+        }
     }
 }

......
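For orientation only (not part of this revision): the routine above is the usual H.264 bilinear chroma interpolation, with the weights taken from the ABCD[] table and a bias of 28 before the shift, which is exactly the v28ss constant (32 - 4, the "no rounding" variant). A scalar sketch, where the _ref name is purely illustrative:

#include <stdint.h>

/* Per-pixel computation performed by put_no_rnd_h264_chroma_mc8_altivec. */
static void put_no_rnd_h264_chroma_mc8_ref(uint8_t *dst, uint8_t *src,
                                           int stride, int h, int x, int y)
{
    const int A = (8 - x) * (8 - y);
    const int B = (    x) * (8 - y);
    const int C = (8 - x) * (    y);
    const int D = (    x) * (    y);

    for (int i = 0; i < h; i++) {
        for (int j = 0; j < 8; j++)
            dst[j] = (A * src[j]          + B * src[j + 1] +
                      C * src[j + stride] + D * src[j + stride + 1] + 28) >> 6;
        dst += stride;
        src += stride;
    }
}

The AltiVec version computes the eight pixels of a row at once with vec_mladd and deals with a possibly misaligned src through vec_lvsl/vec_perm, which is what the loadSecond and reallyBadAlign cases are about.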
@@ -398 +398 @@
  * IDCT transform:
  ****************************************************************************/

-#define VEC_1D_DCT(vb0,vb1,vb2,vb3,va0,va1,va2,va3)              \
-   /* 1st stage */                                               \
-   vz0 = vec_add(vb0,vb2);       /* temp[0] = Y[0] + Y[2] */     \
-   vz1 = vec_sub(vb0,vb2);       /* temp[1] = Y[0] - Y[2] */     \
-   vz2 = vec_sra(vb1,vec_splat_u16(1));                          \
-   vz2 = vec_sub(vz2,vb3);       /* temp[2] = Y[1].1/2 - Y[3] */ \
-   vz3 = vec_sra(vb3,vec_splat_u16(1));                          \
-   vz3 = vec_add(vb1,vz3);       /* temp[3] = Y[1] + Y[3].1/2 */ \
-   /* 2nd stage: output */                                       \
-   va0 = vec_add(vz0,vz3);       /* x[0] = temp[0] + temp[3] */  \
-   va1 = vec_add(vz1,vz2);       /* x[1] = temp[1] + temp[2] */  \
-   va2 = vec_sub(vz1,vz2);       /* x[2] = temp[1] - temp[2] */  \
-   va3 = vec_sub(vz0,vz3)        /* x[3] = temp[0] - temp[3] */
+#define VEC_1D_DCT(vb0,vb1,vb2,vb3,va0,va1,va2,va3)               \
+    /* 1st stage */                                               \
+    vz0 = vec_add(vb0,vb2);       /* temp[0] = Y[0] + Y[2] */     \
+    vz1 = vec_sub(vb0,vb2);       /* temp[1] = Y[0] - Y[2] */     \
+    vz2 = vec_sra(vb1,vec_splat_u16(1));                          \
+    vz2 = vec_sub(vz2,vb3);       /* temp[2] = Y[1].1/2 - Y[3] */ \
+    vz3 = vec_sra(vb3,vec_splat_u16(1));                          \
+    vz3 = vec_add(vb1,vz3);       /* temp[3] = Y[1] + Y[3].1/2 */ \
+    /* 2nd stage: output */                                       \
+    va0 = vec_add(vz0,vz3);       /* x[0] = temp[0] + temp[3] */  \
+    va1 = vec_add(vz1,vz2);       /* x[1] = temp[1] + temp[2] */  \
+    va2 = vec_sub(vz1,vz2);       /* x[2] = temp[1] - temp[2] */  \
+    va3 = vec_sub(vz0,vz3)        /* x[3] = temp[0] - temp[3] */

 #define VEC_TRANSPOSE_4(a0,a1,a2,a3,b0,b1,b2,b3) \
     b0 = vec_mergeh( a0, a0 ); \
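The VEC_1D_DCT macro above is one 4-point pass of the H.264 inverse transform butterfly. Written out in scalar form, following the comments in the macro (idct4_1d_ref is an illustrative name, not from the source):

#include <stdint.h>

/* One 4-point pass of the H.264 inverse transform. */
static void idct4_1d_ref(const int16_t b[4], int16_t a[4])
{
    const int z0 = b[0] + b[2];            /* temp[0] = Y[0] + Y[2]    */
    const int z1 = b[0] - b[2];            /* temp[1] = Y[0] - Y[2]    */
    const int z2 = (b[1] >> 1) - b[3];     /* temp[2] = Y[1]/2 - Y[3]  */
    const int z3 = b[1] + (b[3] >> 1);     /* temp[3] = Y[1] + Y[3]/2  */

    a[0] = z0 + z3;                        /* x[0] = temp[0] + temp[3] */
    a[1] = z1 + z2;                        /* x[1] = temp[1] + temp[2] */
    a[2] = z1 - z2;                        /* x[2] = temp[1] - temp[2] */
    a[3] = z0 - z3;                        /* x[3] = temp[0] - temp[3] */
}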
......
@@ -820 +820 @@
     finaltc0 = vec_and((vec_u8_t)tc0vec, mask);     /* tc = tc0 */                           \
                                                                                              \
     p1mask = diff_lt_altivec(p2, p0, betavec);                                               \
-    p1mask = vec_and(p1mask, mask);                             /* if( |p2 - p0| < beta) */  \
+    p1mask = vec_and(p1mask, mask);                             /* if ( |p2 - p0| < beta) */ \
     tc0masked = vec_and(p1mask, (vec_u8_t)tc0vec);                                           \
     finaltc0 = vec_sub(finaltc0, p1mask);                       /* tc++ */                   \
     newp1 = h264_deblock_q1(p0, p1, p2, q0, tc0masked);                                      \
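A hedged scalar sketch of the masked p1 update above: h264_deblock_q1 is not shown in this hunk, so the formula below follows the standard H.264 deblocking equations rather than this file, and the clip3/filter_p1_ref names are illustrative. Note that the vector code gets the "tc++" by subtracting the all-ones p1mask (0xFF, i.e. -1 per lane) from finaltc0.

#include <stdlib.h>

static int clip3(int v, int lo, int hi) { return v < lo ? lo : v > hi ? hi : v; }

/* When |p2 - p0| < beta, p1 is filtered as well and the clipping
 * threshold used for p0/q0 grows by one. */
static void filter_p1_ref(int p0, int p1, int p2, int q0,
                          int beta, int tc0, int *newp1, int *tc)
{
    *newp1 = p1;
    *tc    = tc0;
    if (abs(p2 - p0) < beta) {                          /* if ( |p2 - p0| < beta) */
        *newp1 = p1 + clip3((p2 + ((p0 + q0 + 1) >> 1) - 2 * p1) >> 1, -tc0, tc0);
        *tc   += 1;                                     /* tc++ */
    }
}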
......
@@ -840 +840 @@

 static void h264_v_loop_filter_luma_altivec(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0) {

-    if((tc0[0] & tc0[1] & tc0[2] & tc0[3]) >= 0) {
+    if ((tc0[0] & tc0[1] & tc0[2] & tc0[3]) >= 0) {
         register vec_u8_t p2 = vec_ld(-3*stride, pix);
         register vec_u8_t p1 = vec_ld(-2*stride, pix);
         register vec_u8_t p0 = vec_ld(-1*stride, pix);
......
@@ -858 +858 @@
 static void h264_h_loop_filter_luma_altivec(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0) {

     register vec_u8_t line0, line1, line2, line3, line4, line5;
-    if((tc0[0] & tc0[1] & tc0[2] & tc0[3]) < 0)
+    if ((tc0[0] & tc0[1] & tc0[2] & tc0[3]) < 0)
         return;
     readAndTranspose16x6(pix-3, stride, line0, line1, line2, line3, line4, line5);
     h264_loop_filter_luma_altivec(line0, line1, line2, line3, line4, line5, alpha, beta, tc0);
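Both loop-filter wrappers above share the same early-out: tc0[i] is -1 for a 4-pixel segment that needs no filtering, and the bitwise AND of four signed bytes is negative only when all four are negative. Spelled out as an illustrative helper (not in the source):

#include <stdint.h>

/* Returns non-zero when at least one of the four edge segments has a
 * non-negative tc0, i.e. when the edge needs any filtering at all. */
static int edge_needs_filtering(const int8_t tc0[4])
{
    return (tc0[0] & tc0[1] & tc0[2] & tc0[3]) >= 0;
}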
......
@@ -868 +868 @@

 void dsputil_h264_init_ppc(DSPContext* c, AVCodecContext *avctx) {

-  if (has_altivec()) {
-    c->put_h264_chroma_pixels_tab[0] = put_h264_chroma_mc8_altivec;
-    c->put_no_rnd_h264_chroma_pixels_tab[0] = put_no_rnd_h264_chroma_mc8_altivec;
-    c->avg_h264_chroma_pixels_tab[0] = avg_h264_chroma_mc8_altivec;
-    c->h264_idct_add = ff_h264_idct_add_altivec;
-    c->h264_idct8_add = ff_h264_idct8_add_altivec;
-    c->h264_v_loop_filter_luma= h264_v_loop_filter_luma_altivec;
-    c->h264_h_loop_filter_luma= h264_h_loop_filter_luma_altivec;
+    if (has_altivec()) {
+        c->put_h264_chroma_pixels_tab[0] = put_h264_chroma_mc8_altivec;
+        c->put_no_rnd_h264_chroma_pixels_tab[0] = put_no_rnd_h264_chroma_mc8_altivec;
+        c->avg_h264_chroma_pixels_tab[0] = avg_h264_chroma_mc8_altivec;
+        c->h264_idct_add = ff_h264_idct_add_altivec;
+        c->h264_idct8_add = ff_h264_idct8_add_altivec;
+        c->h264_v_loop_filter_luma= h264_v_loop_filter_luma_altivec;
+        c->h264_h_loop_filter_luma= h264_h_loop_filter_luma_altivec;

 #define dspfunc(PFX, IDX, NUM) \
-    c->PFX ## _pixels_tab[IDX][ 0] = PFX ## NUM ## _mc00_altivec; \
-    c->PFX ## _pixels_tab[IDX][ 1] = PFX ## NUM ## _mc10_altivec; \
-    c->PFX ## _pixels_tab[IDX][ 2] = PFX ## NUM ## _mc20_altivec; \
-    c->PFX ## _pixels_tab[IDX][ 3] = PFX ## NUM ## _mc30_altivec; \
-    c->PFX ## _pixels_tab[IDX][ 4] = PFX ## NUM ## _mc01_altivec; \
-    c->PFX ## _pixels_tab[IDX][ 5] = PFX ## NUM ## _mc11_altivec; \
-    c->PFX ## _pixels_tab[IDX][ 6] = PFX ## NUM ## _mc21_altivec; \
-    c->PFX ## _pixels_tab[IDX][ 7] = PFX ## NUM ## _mc31_altivec; \
-    c->PFX ## _pixels_tab[IDX][ 8] = PFX ## NUM ## _mc02_altivec; \
-    c->PFX ## _pixels_tab[IDX][ 9] = PFX ## NUM ## _mc12_altivec; \
-    c->PFX ## _pixels_tab[IDX][10] = PFX ## NUM ## _mc22_altivec; \
-    c->PFX ## _pixels_tab[IDX][11] = PFX ## NUM ## _mc32_altivec; \
-    c->PFX ## _pixels_tab[IDX][12] = PFX ## NUM ## _mc03_altivec; \
-    c->PFX ## _pixels_tab[IDX][13] = PFX ## NUM ## _mc13_altivec; \
-    c->PFX ## _pixels_tab[IDX][14] = PFX ## NUM ## _mc23_altivec; \
-    c->PFX ## _pixels_tab[IDX][15] = PFX ## NUM ## _mc33_altivec
-
-    dspfunc(put_h264_qpel, 0, 16);
-    dspfunc(avg_h264_qpel, 0, 16);
+        c->PFX ## _pixels_tab[IDX][ 0] = PFX ## NUM ## _mc00_altivec; \
+        c->PFX ## _pixels_tab[IDX][ 1] = PFX ## NUM ## _mc10_altivec; \
+        c->PFX ## _pixels_tab[IDX][ 2] = PFX ## NUM ## _mc20_altivec; \
+        c->PFX ## _pixels_tab[IDX][ 3] = PFX ## NUM ## _mc30_altivec; \
+        c->PFX ## _pixels_tab[IDX][ 4] = PFX ## NUM ## _mc01_altivec; \
+        c->PFX ## _pixels_tab[IDX][ 5] = PFX ## NUM ## _mc11_altivec; \
+        c->PFX ## _pixels_tab[IDX][ 6] = PFX ## NUM ## _mc21_altivec; \
+        c->PFX ## _pixels_tab[IDX][ 7] = PFX ## NUM ## _mc31_altivec; \
+        c->PFX ## _pixels_tab[IDX][ 8] = PFX ## NUM ## _mc02_altivec; \
+        c->PFX ## _pixels_tab[IDX][ 9] = PFX ## NUM ## _mc12_altivec; \
+        c->PFX ## _pixels_tab[IDX][10] = PFX ## NUM ## _mc22_altivec; \
+        c->PFX ## _pixels_tab[IDX][11] = PFX ## NUM ## _mc32_altivec; \
+        c->PFX ## _pixels_tab[IDX][12] = PFX ## NUM ## _mc03_altivec; \
+        c->PFX ## _pixels_tab[IDX][13] = PFX ## NUM ## _mc13_altivec; \
+        c->PFX ## _pixels_tab[IDX][14] = PFX ## NUM ## _mc23_altivec; \
+        c->PFX ## _pixels_tab[IDX][15] = PFX ## NUM ## _mc33_altivec
+
+        dspfunc(put_h264_qpel, 0, 16);
+        dspfunc(avg_h264_qpel, 0, 16);
 #undef dspfunc
-  }
+    }
 }
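For reference, dspfunc is plain token pasting: dspfunc(put_h264_qpel, 0, 16) above expands to sixteen assignments of this shape (only the first, second and last are shown here):

/* Expansion of dspfunc(put_h264_qpel, 0, 16): */
c->put_h264_qpel_pixels_tab[0][ 0] = put_h264_qpel16_mc00_altivec;
c->put_h264_qpel_pixels_tab[0][ 1] = put_h264_qpel16_mc10_altivec;
/* ... */
c->put_h264_qpel_pixels_tab[0][15] = put_h264_qpel16_mc33_altivec;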
