Revision 838cc9c8: libpostproc/postprocess_altivec_template.c

--- a/libpostproc/postprocess_altivec_template.c
+++ b/libpostproc/postprocess_altivec_template.c
@@ -103,7 +103,6 @@
 
     src2 += stride * 4;
 
-
 #define LOAD_LINE(i)                                                    \
     {                                                                   \
     vector unsigned char perm##i = vec_lvsl(j##i, src2);                \
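For context (not part of the revision): LOAD_LINE is built on the classic AltiVec unaligned-load idiom, needed because vec_ld() can only fetch from 16-byte-aligned addresses. A minimal sketch of that idiom, assuming <altivec.h>; the helper name is illustrative, not from the file:

    #include <altivec.h>

    /* Illustrative helper: vec_lvsl() builds the shuffle pattern for the
     * misalignment of p, and vec_perm() splices the wanted 16 bytes out
     * of the two aligned loads that straddle p. */
    static inline vector unsigned char load_unaligned(const unsigned char *p)
    {
        vector unsigned char perm = vec_lvsl(0, p);  /* permute pattern   */
        vector unsigned char hi   = vec_ld(0, p);    /* aligned load <= p */
        vector unsigned char lo   = vec_ld(16, p);   /* next 16 bytes     */
        return vec_perm(hi, lo, perm);               /* bytes at p..p+15  */
    }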
@@ -158,23 +157,23 @@
                                        v_dcThreshold);                  \
     const vector signed short v_part##i = vec_and(mask, v_comp##i);
 
-{
-    ITER(0, 1)
-    ITER(1, 2)
-    ITER(2, 3)
-    ITER(3, 4)
-    ITER(4, 5)
-    ITER(5, 6)
-    ITER(6, 7)
-
-    v_numEq = vec_sum4s(v_part0, v_numEq);
-    v_numEq = vec_sum4s(v_part1, v_numEq);
-    v_numEq = vec_sum4s(v_part2, v_numEq);
-    v_numEq = vec_sum4s(v_part3, v_numEq);
-    v_numEq = vec_sum4s(v_part4, v_numEq);
-    v_numEq = vec_sum4s(v_part5, v_numEq);
-    v_numEq = vec_sum4s(v_part6, v_numEq);
-}
+    {
+        ITER(0, 1)
+        ITER(1, 2)
+        ITER(2, 3)
+        ITER(3, 4)
+        ITER(4, 5)
+        ITER(5, 6)
+        ITER(6, 7)
+
+        v_numEq = vec_sum4s(v_part0, v_numEq);
+        v_numEq = vec_sum4s(v_part1, v_numEq);
+        v_numEq = vec_sum4s(v_part2, v_numEq);
+        v_numEq = vec_sum4s(v_part3, v_numEq);
+        v_numEq = vec_sum4s(v_part4, v_numEq);
+        v_numEq = vec_sum4s(v_part5, v_numEq);
+        v_numEq = vec_sum4s(v_part6, v_numEq);
+    }
 
 #undef ITER
 
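For context (not part of the revision): the ITER/vec_sum4s block counts, per pixel column, how many of the seven vertical neighbour pairs are "flat", i.e. differ by less than the DC threshold; vec_sum4s() folds the per-pair comparison masks into the v_numEq accumulator. A scalar sketch of the same test for one column; the names are illustrative, and the strict |diff| < dcThreshold comparison is an assumption based on the v_dcThreshold operand visible above:

    /* Count the vertical neighbour pairs in one column whose difference
     * is below the DC threshold; the AltiVec code does 16 columns at once. */
    static int count_flat_pairs(const unsigned char src[8][16], int x,
                                int dcThreshold)
    {
        int numEq = 0;
        for (int j = 0; j < 7; j++) {
            int diff = src[j][x] - src[j + 1][x];
            if (diff < 0)
                diff = -diff;
            if (diff < dcThreshold)  /* assumed comparison */
                numEq++;
        }
        return numEq;
    }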
@@ -286,73 +285,73 @@
           LOAD_LINE(7);
           LOAD_LINE(8);
           LOAD_LINE(9);
-      }
+    }
 #undef LOAD_LINE
 #undef LOAD_LINE_ALIGNED
-{
-    const vector unsigned short v_2 = vec_splat_u16(2);
-    const vector unsigned short v_4 = vec_splat_u16(4);
-
-    const vector signed short v_diff01 = vec_sub(vb0, vb1);
-    const vector unsigned short v_cmp01 =
-        (const vector unsigned short) vec_cmplt(vec_abs(v_diff01), vqp);
-    const vector signed short v_first = vec_sel(vb1, vb0, v_cmp01);
-    const vector signed short v_diff89 = vec_sub(vb8, vb9);
-    const vector unsigned short v_cmp89 =
-        (const vector unsigned short) vec_cmplt(vec_abs(v_diff89), vqp);
-    const vector signed short v_last = vec_sel(vb8, vb9, v_cmp89);
-
-    const vector signed short temp01 = vec_mladd(v_first, (vector signed short)v_4, vb1);
-    const vector signed short temp02 = vec_add(vb2, vb3);
-    const vector signed short temp03 = vec_add(temp01, (vector signed short)v_4);
-    const vector signed short v_sumsB0 = vec_add(temp02, temp03);
-
-    const vector signed short temp11 = vec_sub(v_sumsB0, v_first);
-    const vector signed short v_sumsB1 = vec_add(temp11, vb4);
-
-    const vector signed short temp21 = vec_sub(v_sumsB1, v_first);
-    const vector signed short v_sumsB2 = vec_add(temp21, vb5);
-
-    const vector signed short temp31 = vec_sub(v_sumsB2, v_first);
-    const vector signed short v_sumsB3 = vec_add(temp31, vb6);
-
-    const vector signed short temp41 = vec_sub(v_sumsB3, v_first);
-    const vector signed short v_sumsB4 = vec_add(temp41, vb7);
-
-    const vector signed short temp51 = vec_sub(v_sumsB4, vb1);
-    const vector signed short v_sumsB5 = vec_add(temp51, vb8);
-
-    const vector signed short temp61 = vec_sub(v_sumsB5, vb2);
-    const vector signed short v_sumsB6 = vec_add(temp61, v_last);
-
-    const vector signed short temp71 = vec_sub(v_sumsB6, vb3);
-    const vector signed short v_sumsB7 = vec_add(temp71, v_last);
-
-    const vector signed short temp81 = vec_sub(v_sumsB7, vb4);
-    const vector signed short v_sumsB8 = vec_add(temp81, v_last);
-
-    const vector signed short temp91 = vec_sub(v_sumsB8, vb5);
-    const vector signed short v_sumsB9 = vec_add(temp91, v_last);
-
-#define COMPUTE_VR(i, j, k)                                             \
-    const vector signed short temps1##i =                               \
-        vec_add(v_sumsB##i, v_sumsB##k);                                \
-    const vector signed short temps2##i =                               \
-        vec_mladd(vb##j, (vector signed short)v_2, temps1##i);          \
-    const vector signed short  vr##j = vec_sra(temps2##i, v_4)
-
-    COMPUTE_VR(0, 1, 2);
-    COMPUTE_VR(1, 2, 3);
-    COMPUTE_VR(2, 3, 4);
-    COMPUTE_VR(3, 4, 5);
-    COMPUTE_VR(4, 5, 6);
-    COMPUTE_VR(5, 6, 7);
-    COMPUTE_VR(6, 7, 8);
-    COMPUTE_VR(7, 8, 9);
-
-    const vector signed char neg1 = vec_splat_s8(-1);
-    const vector unsigned char permHH = (const vector unsigned char)AVV(0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
-                                                                        0x18, 0x19, 0x1A, 0x1B, 0x1C, 0x1D, 0x1E, 0x1F);
+    {
+        const vector unsigned short v_2 = vec_splat_u16(2);
+        const vector unsigned short v_4 = vec_splat_u16(4);
+
+        const vector signed short v_diff01 = vec_sub(vb0, vb1);
+        const vector unsigned short v_cmp01 =
+            (const vector unsigned short) vec_cmplt(vec_abs(v_diff01), vqp);
+        const vector signed short v_first = vec_sel(vb1, vb0, v_cmp01);
+        const vector signed short v_diff89 = vec_sub(vb8, vb9);
+        const vector unsigned short v_cmp89 =
+            (const vector unsigned short) vec_cmplt(vec_abs(v_diff89), vqp);
+        const vector signed short v_last = vec_sel(vb8, vb9, v_cmp89);
+
+        const vector signed short temp01 = vec_mladd(v_first, (vector signed short)v_4, vb1);
+        const vector signed short temp02 = vec_add(vb2, vb3);
+        const vector signed short temp03 = vec_add(temp01, (vector signed short)v_4);
+        const vector signed short v_sumsB0 = vec_add(temp02, temp03);
+
+        const vector signed short temp11 = vec_sub(v_sumsB0, v_first);
+        const vector signed short v_sumsB1 = vec_add(temp11, vb4);
+
+        const vector signed short temp21 = vec_sub(v_sumsB1, v_first);
+        const vector signed short v_sumsB2 = vec_add(temp21, vb5);
+
+        const vector signed short temp31 = vec_sub(v_sumsB2, v_first);
+        const vector signed short v_sumsB3 = vec_add(temp31, vb6);
+
+        const vector signed short temp41 = vec_sub(v_sumsB3, v_first);
+        const vector signed short v_sumsB4 = vec_add(temp41, vb7);
+
+        const vector signed short temp51 = vec_sub(v_sumsB4, vb1);
+        const vector signed short v_sumsB5 = vec_add(temp51, vb8);
+
+        const vector signed short temp61 = vec_sub(v_sumsB5, vb2);
+        const vector signed short v_sumsB6 = vec_add(temp61, v_last);
+
+        const vector signed short temp71 = vec_sub(v_sumsB6, vb3);
+        const vector signed short v_sumsB7 = vec_add(temp71, v_last);
+
+        const vector signed short temp81 = vec_sub(v_sumsB7, vb4);
+        const vector signed short v_sumsB8 = vec_add(temp81, v_last);
+
+        const vector signed short temp91 = vec_sub(v_sumsB8, vb5);
+        const vector signed short v_sumsB9 = vec_add(temp91, v_last);
+
+    #define COMPUTE_VR(i, j, k)                                             \
+        const vector signed short temps1##i =                               \
+            vec_add(v_sumsB##i, v_sumsB##k);                                \
+        const vector signed short temps2##i =                               \
+            vec_mladd(vb##j, (vector signed short)v_2, temps1##i);          \
+        const vector signed short  vr##j = vec_sra(temps2##i, v_4)
+
+        COMPUTE_VR(0, 1, 2);
+        COMPUTE_VR(1, 2, 3);
+        COMPUTE_VR(2, 3, 4);
+        COMPUTE_VR(3, 4, 5);
+        COMPUTE_VR(4, 5, 6);
+        COMPUTE_VR(5, 6, 7);
+        COMPUTE_VR(6, 7, 8);
+        COMPUTE_VR(7, 8, 9);
+
+        const vector signed char neg1 = vec_splat_s8(-1);
+        const vector unsigned char permHH = (const vector unsigned char)AVV(0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
+                                                                            0x18, 0x19, 0x1A, 0x1B, 0x1C, 0x1D, 0x1E, 0x1F);
 
 #define PACK_AND_STORE(i)                                       \
 {    const vector unsigned char perms##i =                       \
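For context (not part of the revision): the v_sumsB0..v_sumsB9 chain in the hunk above is a running sum; each value reuses the previous one, subtracting the sample that leaves the averaging window and adding the one that enters, with v_first/v_last replicating the block edges. COMPUTE_VR(i, j, k) then emits (v_sumsB##i + v_sumsB##k + 2*vb##j) >> 4. A scalar sketch of the same arithmetic; the function and parameter names are illustrative:

    /* Scalar restatement of the sliding-window sums and COMPUTE_VR:
     * out[i] corresponds to vr(i+1). */
    static void vert_lowpass_sketch(const short b[10], short out[8],
                                    short first, short last)
    {
        short sums[10];
        sums[0] = 4 * first + b[1] + b[2] + b[3] + 4;  /* temp01..temp03 */
        sums[1] = sums[0] - first + b[4];
        sums[2] = sums[1] - first + b[5];
        sums[3] = sums[2] - first + b[6];
        sums[4] = sums[3] - first + b[7];
        sums[5] = sums[4] - b[1] + b[8];
        sums[6] = sums[5] - b[2] + last;
        sums[7] = sums[6] - b[3] + last;
        sums[8] = sums[7] - b[4] + last;
        sums[9] = sums[8] - b[5] + last;

        for (int i = 0; i < 8; i++)  /* COMPUTE_VR(i, i+1, i+2) */
            out[i] = (short)((sums[i] + sums[i + 2] + 2 * b[i + 1]) >> 4);
    }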
@@ -379,31 +378,31 @@
         vec_perm(vf##i, vbT##i, permHH);                        \
     vec_st(vg##i, i * stride, src2);}
 
-    /* Special-casing the aligned case is worthwhile, as all calls from
-     * the (transposed) horizontable deblocks will be aligned, in addition
-     * to the naturally aligned vertical deblocks. */
-    if (properStride && srcAlign) {
-        PACK_AND_STORE_ALIGNED(1)
-        PACK_AND_STORE_ALIGNED(2)
-        PACK_AND_STORE_ALIGNED(3)
-        PACK_AND_STORE_ALIGNED(4)
-        PACK_AND_STORE_ALIGNED(5)
-        PACK_AND_STORE_ALIGNED(6)
-        PACK_AND_STORE_ALIGNED(7)
-        PACK_AND_STORE_ALIGNED(8)
-    } else {
-        PACK_AND_STORE(1)
-        PACK_AND_STORE(2)
-        PACK_AND_STORE(3)
-        PACK_AND_STORE(4)
-        PACK_AND_STORE(5)
-        PACK_AND_STORE(6)
-        PACK_AND_STORE(7)
-        PACK_AND_STORE(8)
+        /* Special-casing the aligned case is worthwhile, as all calls from
+         * the (transposed) horizontable deblocks will be aligned, in addition
+         * to the naturally aligned vertical deblocks. */
+        if (properStride && srcAlign) {
+            PACK_AND_STORE_ALIGNED(1)
+            PACK_AND_STORE_ALIGNED(2)
+            PACK_AND_STORE_ALIGNED(3)
+            PACK_AND_STORE_ALIGNED(4)
+            PACK_AND_STORE_ALIGNED(5)
+            PACK_AND_STORE_ALIGNED(6)
+            PACK_AND_STORE_ALIGNED(7)
+            PACK_AND_STORE_ALIGNED(8)
+        } else {
+            PACK_AND_STORE(1)
+            PACK_AND_STORE(2)
+            PACK_AND_STORE(3)
+            PACK_AND_STORE(4)
+            PACK_AND_STORE(5)
+            PACK_AND_STORE(6)
+            PACK_AND_STORE(7)
+            PACK_AND_STORE(8)
+        }
+    #undef PACK_AND_STORE
+    #undef PACK_AND_STORE_ALIGNED
     }
-#undef PACK_AND_STORE
-#undef PACK_AND_STORE_ALIGNED
-}
 }
 
 
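For context (not part of the revision): the aligned special case pays off because vec_st() can only store to 16-byte-aligned addresses, so the unaligned PACK_AND_STORE path must merge the new bytes into the existing vectors before storing. A simplified sketch of that trade-off; the helper is hypothetical, and the scalar fallback merely stands in for the perms##i/neg1 select sequence in the real macro:

    #include <stdint.h>
    #include <altivec.h>

    /* Store 16 bytes: one aligned store on the fast path, a
     * read-modify-write fallback otherwise. */
    static void store16(unsigned char *dst, vector unsigned char v)
    {
        if (((uintptr_t)dst & 15) == 0) {
            vec_st(v, 0, dst);            /* fast path: single aligned store */
        } else {
            unsigned char tmp[16] __attribute__((aligned(16)));
            vec_st(v, 0, tmp);            /* spill the vector... */
            for (int i = 0; i < 16; i++)  /* ...and copy it byte by byte */
                dst[i] = tmp[i];
        }
    }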
