Statistics
| Branch: | Revision:

ffmpeg / libavcodec / ppc / dsputil_altivec.c @ 9c76bd48

History | View | Annotate | Download (16.7 KB)

1
/*
2
 * Copyright (c) 2002 Brian Foley
3
 * Copyright (c) 2002 Dieter Shirley
4
 *
5
 * This library is free software; you can redistribute it and/or
6
 * modify it under the terms of the GNU Lesser General Public
7
 * License as published by the Free Software Foundation; either
8
 * version 2 of the License, or (at your option) any later version.
9
 *
10
 * This library is distributed in the hope that it will be useful,
11
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
13
 * Lesser General Public License for more details.
14
 *
15
 * You should have received a copy of the GNU Lesser General Public
16
 * License along with this library; if not, write to the Free Software
17
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
18
 */
19
 
20
#include "../dsputil.h"
21
#include "dsputil_altivec.h"
22

    
23
#if CONFIG_DARWIN
24
#include <sys/sysctl.h>
25
#endif
26

    
27
/*
 * Sum of absolute differences between the 16x16 block at pix1 and the
 * 16x16 block at pix2 averaged with its right-hand neighbour, i.e. every
 * reference pixel is vec_avg(pix2[x], pix2[x + 1]) (vec_avg rounds up).
 *
 * pix1, pix2: first pixel of each block; neither has to be 16-byte aligned.
 *             17 bytes per row are consumed from pix2.
 * line_size:  byte distance between the starts of two consecutive rows.
 *
 * Returns the sum accumulated over all 16 rows.
 */
int pix_abs16x16_x2_altivec(uint8_t *pix1, uint8_t *pix2, int line_size)
{
    int s, i;
    vector unsigned char zero;
    vector unsigned char pix1v, pix2v, pix2iv, avgv, t5;
    vector unsigned int sad;
    vector signed int sumdiffs;

    s = 0;
    zero = vec_splat_u8(0);
    sad = vec_splat_u32(0);

    for (i = 0; i < 16; i++) {
        /*
           Read unaligned pixels into our vectors. The vectors are as follows:
           pix1v: pix1[0]-pix1[15]
           pix2v: pix2[0]-pix2[15]        pix2iv: pix2[1]-pix2[16]

           vec_ld() ignores the low four bits of the effective address, so
           the load at the first wanted byte fetches the aligned quadword
           holding it and the load 15 bytes further the quadword holding the
           last wanted byte. vec_perm() with the vec_lvsl() pattern then
           extracts the 16 wanted bytes. When the address is already aligned
           both loads hit the same quadword, so nothing past the last wanted
           byte's quadword is touched.
        */
        pix1v = vec_perm(vec_ld(0, pix1), vec_ld(15, pix1), vec_lvsl(0, pix1));
        pix2v = vec_perm(vec_ld(0, pix2), vec_ld(15, pix2), vec_lvsl(0, pix2));
        pix2iv = vec_perm(vec_ld(1, pix2), vec_ld(16, pix2), vec_lvsl(1, pix2));

        /* Calculate the average vector */
        avgv = vec_avg(pix2v, pix2iv);

        /* Calculate a sum of abs differences vector: max - min == |a - b| */
        t5 = vec_sub(vec_max(pix1v, avgv), vec_min(pix1v, avgv));

        /* Add each 4 pixel group together and put 4 results into sad */
        sad = vec_sum4s(t5, sad);

        pix1 += line_size;
        pix2 += line_size;
    }

    /*
       Sum up the four partial sums. vec_sums() leaves the total in
       element 3, so splat that element before vec_ste() stores whichever
       element corresponds to the address of s.
    */
    sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero);
    sumdiffs = vec_splat(sumdiffs, 3);
    vec_ste(sumdiffs, 0, &s);

    return s;
}
72

    
73
/*
 * Sum of absolute differences between the 16x16 block at pix1 and the
 * 16x16 block at pix2 averaged with the row below it, i.e. every
 * reference pixel is vec_avg(pix2[x], pix2[x + line_size]) (vec_avg
 * rounds up).
 *
 * pix1, pix2: first pixel of each block; neither has to be 16-byte aligned.
 *             17 rows are consumed from pix2.
 * line_size:  byte distance between the starts of two consecutive rows.
 *
 * Returns the sum accumulated over all 16 rows.
 */
int pix_abs16x16_y2_altivec(uint8_t *pix1, uint8_t *pix2, int line_size)
{
    int s, i;
    vector unsigned char zero;
    vector unsigned char pix1v, pix2v, pix3v, avgv, t5;
    vector unsigned int sad;
    vector signed int sumdiffs;
    uint8_t *pix3 = pix2 + line_size;

    s = 0;
    zero = vec_splat_u8(0);
    sad = vec_splat_u32(0);

    /*
       Due to the fact that pix3 = pix2 + line_size, the pix3 of one
       iteration becomes pix2 in the next iteration. We can use this
       fact to avoid a potentially expensive unaligned read, each
       time around the loop.
       Read unaligned pixels into our vectors. The vectors are as follows:
       pix2v: pix2[0]-pix2[15]

       vec_ld() ignores the low four bits of the effective address, so
       offsets 0 and 15 fetch the aligned quadwords holding the first and
       the last wanted byte; vec_perm() with the vec_lvsl() pattern
       extracts the 16 wanted bytes. For an aligned address both loads hit
       the same quadword, so nothing past the row is read.
    */
    pix2v = vec_perm(vec_ld(0, pix2), vec_ld(15, pix2), vec_lvsl(0, pix2));

    for (i = 0; i < 16; i++) {
        /*
           Read unaligned pixels into our vectors. The vectors are as follows:
           pix1v: pix1[0]-pix1[15]
           pix3v: pix3[0]-pix3[15]
        */
        pix1v = vec_perm(vec_ld(0, pix1), vec_ld(15, pix1), vec_lvsl(0, pix1));
        pix3v = vec_perm(vec_ld(0, pix3), vec_ld(15, pix3), vec_lvsl(0, pix3));

        /* Calculate the average vector */
        avgv = vec_avg(pix2v, pix3v);

        /* Calculate a sum of abs differences vector: max - min == |a - b| */
        t5 = vec_sub(vec_max(pix1v, avgv), vec_min(pix1v, avgv));

        /* Add each 4 pixel group together and put 4 results into sad */
        sad = vec_sum4s(t5, sad);

        pix1 += line_size;
        /* This iteration's lower row is the next iteration's upper row */
        pix2v = pix3v;
        pix3 += line_size;
    }

    /*
       Sum up the four partial sums. vec_sums() leaves the total in
       element 3, so splat that element before vec_ste() stores whichever
       element corresponds to the address of s.
    */
    sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero);
    sumdiffs = vec_splat(sumdiffs, 3);
    vec_ste(sumdiffs, 0, &s);

    return s;
}
131

    
132
/*
 * Sum of absolute differences between the 16x16 block at pix1 and the
 * 16x16 block at pix2 averaged over each pixel's 2x2 neighbourhood:
 *   (pix2[x] + pix2[x + 1] + pix3[x] + pix3[x + 1] + 2) >> 2
 * where pix3 is the row below pix2.
 *
 * pix1, pix2: first pixel of each block; neither has to be 16-byte aligned.
 *             17 rows of 17 bytes are consumed from pix2.
 * line_size:  byte distance between the starts of two consecutive rows.
 *
 * Returns the sum accumulated over all 16 rows.
 */
int pix_abs16x16_xy2_altivec(uint8_t *pix1, uint8_t *pix2, int line_size)
{
    int s, i;
    uint8_t *pix3 = pix2 + line_size;
    vector unsigned char avgv, t5, zero;
    vector unsigned char pix1v, pix2v, pix3v, pix2iv, pix3iv;
    vector unsigned short pix2lv, pix2hv, pix2ilv, pix2ihv;
    vector unsigned short pix3lv, pix3hv, pix3ilv, pix3ihv;
    vector unsigned short avghv, avglv, two;
    vector unsigned short t1, t2, t3, t4;
    vector unsigned int sad;
    vector signed int sumdiffs;

    zero = vec_splat_u8(0);
    two = vec_splat_u16(2);
    sad = vec_splat_u32(0);

    s = 0;

    /*
       Due to the fact that pix3 = pix2 + line_size, the pix3 of one
       iteration becomes pix2 in the next iteration. We can use this
       fact to avoid a potentially expensive unaligned read, as well
       as some splitting, and vector addition each time around the loop.
       Read unaligned pixels into our vectors. The vectors are as follows:
       pix2v: pix2[0]-pix2[15]        pix2iv: pix2[1]-pix2[16]
       Split the pixel vectors into shorts

       vec_ld() ignores the low four bits of the effective address, so
       loading at the first wanted byte and 15 bytes further fetches the
       aligned quadwords holding the first and the last wanted byte;
       vec_perm() with the vec_lvsl() pattern extracts the 16 wanted
       bytes. For an aligned address both loads hit the same quadword.
    */
    pix2v = vec_perm(vec_ld(0, pix2), vec_ld(15, pix2), vec_lvsl(0, pix2));
    pix2iv = vec_perm(vec_ld(1, pix2), vec_ld(16, pix2), vec_lvsl(1, pix2));

    pix2hv = (vector unsigned short) vec_mergeh(zero, pix2v);
    pix2lv = (vector unsigned short) vec_mergel(zero, pix2v);
    pix2ihv = (vector unsigned short) vec_mergeh(zero, pix2iv);
    pix2ilv = (vector unsigned short) vec_mergel(zero, pix2iv);
    /* Horizontal pair sums of the upper row, as 16-bit values */
    t1 = vec_add(pix2hv, pix2ihv);
    t2 = vec_add(pix2lv, pix2ilv);

    for (i = 0; i < 16; i++) {
        /*
           Read unaligned pixels into our vectors. The vectors are as follows:
           pix1v: pix1[0]-pix1[15]
           pix3v: pix3[0]-pix3[15]        pix3iv: pix3[1]-pix3[16]
        */
        pix1v = vec_perm(vec_ld(0, pix1), vec_ld(15, pix1), vec_lvsl(0, pix1));
        pix3v = vec_perm(vec_ld(0, pix3), vec_ld(15, pix3), vec_lvsl(0, pix3));
        pix3iv = vec_perm(vec_ld(1, pix3), vec_ld(16, pix3), vec_lvsl(1, pix3));

        /*
          Note that Altivec does have vec_avg, but this works on vector pairs
          and rounds up. We could do avg(avg(a,b),avg(c,d)), but the rounding
          would mean that, for example, avg(3,0,0,1) = 2, when it should be 1.
          Instead, we have to split the pixel vectors into vectors of shorts,
          and do the averaging by hand.
        */

        /* Split the pixel vectors into shorts */
        pix3hv = (vector unsigned short) vec_mergeh(zero, pix3v);
        pix3lv = (vector unsigned short) vec_mergel(zero, pix3v);
        pix3ihv = (vector unsigned short) vec_mergeh(zero, pix3iv);
        pix3ilv = (vector unsigned short) vec_mergel(zero, pix3iv);

        /* Horizontal pair sums of the lower row */
        t3 = vec_add(pix3hv, pix3ihv);
        t4 = vec_add(pix3lv, pix3ilv);

        /* (upper pair + lower pair + 2) >> 2 */
        avghv = vec_sr(vec_add(vec_add(t1, t3), two), two);
        avglv = vec_sr(vec_add(vec_add(t2, t4), two), two);

        /* Pack the shorts back into a result */
        avgv = vec_pack(avghv, avglv);

        /* Calculate a sum of abs differences vector: max - min == |a - b| */
        t5 = vec_sub(vec_max(pix1v, avgv), vec_min(pix1v, avgv));

        /* Add each 4 pixel group together and put 4 results into sad */
        sad = vec_sum4s(t5, sad);

        pix1 += line_size;
        pix3 += line_size;
        /* Transfer the calculated values for pix3 into pix2 */
        t1 = t3;
        t2 = t4;
    }

    /*
       Sum up the four partial sums. vec_sums() leaves the total in
       element 3, so splat that element before vec_ste() stores whichever
       element corresponds to the address of s.
    */
    sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero);
    sumdiffs = vec_splat(sumdiffs, 3);
    vec_ste(sumdiffs, 0, &s);

    return s;
}
231

    
232
/*
 * Sum of absolute differences between the 16x16 blocks at pix1 and pix2.
 *
 * pix1, pix2: first pixel of each block; neither has to be 16-byte aligned.
 * line_size:  byte distance between the starts of two consecutive rows.
 *
 * Returns the sum accumulated over all 16 rows.
 */
int pix_abs16x16_altivec(uint8_t *pix1, uint8_t *pix2, int line_size)
{
    int i, s;
    vector unsigned char perm1, perm2;
    vector unsigned char t1, t2, t3, t4, t5;
    vector unsigned int sad, zero;
    vector signed int sumdiffs;

    zero = vec_splat_u32(0);
    sad = vec_splat_u32(0);

    for (i = 0; i < 16; i++) {
        /*
           Read potentially unaligned pixels into t1 and t2.
           vec_ld() ignores the low four bits of the effective address, so
           offsets 0 and 15 fetch the aligned quadwords holding the first
           and the last wanted byte; vec_perm() with the vec_lvsl() pattern
           extracts the 16 wanted bytes. For an aligned address both loads
           hit the same quadword, so nothing past the row is read.
        */
        perm1 = vec_lvsl(0, pix1);
        perm2 = vec_lvsl(0, pix2);
        t1 = vec_perm(vec_ld(0, pix1), vec_ld(15, pix1), perm1);
        t2 = vec_perm(vec_ld(0, pix2), vec_ld(15, pix2), perm2);

        /* Calculate a sum of abs differences vector: max - min == |a - b| */
        t3 = vec_max(t1, t2);
        t4 = vec_min(t1, t2);
        t5 = vec_sub(t3, t4);

        /* Add each 4 pixel group together and put 4 results into sad */
        sad = vec_sum4s(t5, sad);

        pix1 += line_size;
        pix2 += line_size;
    }

    /*
       Sum up the four partial sums. vec_sums() leaves the total in
       element 3, so splat that element before vec_ste() stores whichever
       element corresponds to the address of s.
    */
    sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero);
    sumdiffs = vec_splat(sumdiffs, 3);
    vec_ste(sumdiffs, 0, &s);

    return s;
}
272

    
273
/*
 * Sum of absolute differences between the 8x8 blocks at pix1 and pix2.
 *
 * pix1, pix2: first pixel of each block; neither has to be 16-byte aligned.
 * line_size:  byte distance between the starts of two consecutive rows.
 *
 * Returns the sum accumulated over all 8 rows.
 */
int pix_abs8x8_altivec(uint8_t *pix1, uint8_t *pix2, int line_size)
{
    int i, s;
    vector unsigned char perm1, perm2, permclear;
    vector unsigned char t1, t2, t3, t4, t5;
    vector unsigned int sad, zero;
    vector signed int sumdiffs;

    zero = vec_splat_u32(0);
    sad = vec_splat_u32(0);
    /* Byte mask keeping the first 8 lanes and clearing the last 8 */
    permclear = (vector unsigned char) (255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0);

    for (i = 0; i < 8; i++) {
        /*
           Read potentially unaligned pixels into t1 and t2.
           Only the first 8 bytes of each permuted vector are wanted, so
           the last 8 lanes are masked out. The 0s don't change the sum.

           vec_ld() ignores the low four bits of the effective address, so
           offsets 0 and 7 fetch the aligned quadwords holding the first
           and the last wanted byte. When all 8 bytes sit in one quadword
           both loads return it, so no later quadword is touched; the
           lanes that are then not row data are exactly the masked ones.
        */
        perm1 = vec_lvsl(0, pix1);
        perm2 = vec_lvsl(0, pix2);
        t1 = vec_and(vec_perm(vec_ld(0, pix1), vec_ld(7, pix1), perm1), permclear);
        t2 = vec_and(vec_perm(vec_ld(0, pix2), vec_ld(7, pix2), perm2), permclear);

        /* Calculate a sum of abs differences vector: max - min == |a - b| */
        t3 = vec_max(t1, t2);
        t4 = vec_min(t1, t2);
        t5 = vec_sub(t3, t4);

        /* Add each 4 pixel group together and put 4 results into sad */
        sad = vec_sum4s(t5, sad);

        pix1 += line_size;
        pix2 += line_size;
    }

    /*
       Sum up the four partial sums. vec_sums() leaves the total in
       element 3, so splat that element before vec_ste() stores whichever
       element corresponds to the address of s.
    */
    sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero);
    sumdiffs = vec_splat(sumdiffs, 3);
    vec_ste(sumdiffs, 0, &s);

    return s;
}
315

    
316
/*
 * Sum of the squared pixel values of the 16x16 block at pix.
 *
 * pix:       first pixel of the block; does not have to be 16-byte aligned.
 * line_size: byte distance between the starts of two consecutive rows.
 *
 * Returns the sum accumulated over all 16 rows.
 */
int pix_norm1_altivec(uint8_t *pix, int line_size)
{
    int s, i;
    vector unsigned char zero;
    vector unsigned char pixv;
    vector unsigned int sv;
    vector signed int sum;

    zero = vec_splat_u8(0);
    sv = vec_splat_u32(0);

    s = 0;
    for (i = 0; i < 16; i++) {
        /*
           Read in the potentially unaligned pixels.
           vec_ld() ignores the low four bits of the effective address, so
           offsets 0 and 15 fetch the aligned quadwords holding the first
           and the last wanted byte; vec_perm() with the vec_lvsl() pattern
           extracts the 16 wanted bytes. For an aligned address both loads
           hit the same quadword, so nothing past the row is read.
        */
        pixv = vec_perm(vec_ld(0, pix), vec_ld(15, pix), vec_lvsl(0, pix));

        /* Square the values, and add them to our sum */
        sv = vec_msum(pixv, pixv, sv);

        pix += line_size;
    }

    /*
       Sum up the four partial sums. vec_sums() leaves the total in
       element 3, so splat that element before vec_ste() stores whichever
       element corresponds to the address of s.
    */
    sum = vec_sums((vector signed int) sv, (vector signed int) zero);
    sum = vec_splat(sum, 3);
    vec_ste(sum, 0, &s);

    return s;
}
345

    
346

    
347
/*
 * Sum of the squared differences between the 16x16 blocks at pix1 and pix2.
 *
 * pix1, pix2: first pixel of each block; neither has to be 16-byte aligned.
 * line_size:  byte distance between the starts of two consecutive rows.
 *
 * Returns the sum accumulated over all 16 rows.
 */
int pix_norm_altivec(uint8_t *pix1, uint8_t *pix2, int line_size)
{
    int s, i;
    vector unsigned char zero;
    vector unsigned char pix1v, pix2v, t5;
    vector unsigned int sv;
    vector signed int sum;

    zero = vec_splat_u8(0);
    sv = vec_splat_u32(0);
    s = 0;
    for (i = 0; i < 16; i++) {
        /*
           Read in the potentially unaligned pixels.
           vec_ld() ignores the low four bits of the effective address, so
           offsets 0 and 15 fetch the aligned quadwords holding the first
           and the last wanted byte; vec_perm() with the vec_lvsl() pattern
           extracts the 16 wanted bytes. For an aligned address both loads
           hit the same quadword, so nothing past the row is read.
        */
        pix1v = vec_perm(vec_ld(0, pix1), vec_ld(15, pix1), vec_lvsl(0, pix1));
        pix2v = vec_perm(vec_ld(0, pix2), vec_ld(15, pix2), vec_lvsl(0, pix2));

        /*
           Since we want to use unsigned chars, we can take advantage
           of the fact that abs(a-b)^2 = (a-b)^2.
        */

        /* Calculate the absolute differences: max - min == |a - b| */
        t5 = vec_sub(vec_max(pix1v, pix2v), vec_min(pix1v, pix2v));

        /* Square the values and add them to our sum */
        sv = vec_msum(t5, t5, sv);

        pix1 += line_size;
        pix2 += line_size;
    }

    /*
       Sum up the four partial sums. vec_sums() leaves the total in
       element 3, so splat that element before vec_ste() stores whichever
       element corresponds to the address of s.
    */
    sum = vec_sums((vector signed int) sv, (vector signed int) zero);
    sum = vec_splat(sum, 3);
    vec_ste(sum, 0, &s);

    return s;
}
386

    
387

    
388
/*
 * Sum of the pixel values of the 16x16 block at pix.
 *
 * pix:       first pixel of the block; does not have to be 16-byte aligned.
 * line_size: byte distance between the starts of two consecutive rows.
 *
 * Returns the sum accumulated over all 16 rows.
 */
int pix_sum_altivec(UINT8 * pix, int line_size)
{
    vector unsigned char perm;
    vector unsigned char t1;
    vector unsigned int sad, zero;
    vector signed int sumdiffs;

    int s, i;

    zero = vec_splat_u32(0);
    sad = vec_splat_u32(0);

    for (i = 0; i < 16; i++) {
        /*
           Read the potentially unaligned 16 pixels into t1.
           vec_ld() ignores the low four bits of the effective address, so
           offsets 0 and 15 fetch the aligned quadwords holding the first
           and the last wanted byte; vec_perm() with the vec_lvsl() pattern
           extracts the 16 wanted bytes. For an aligned address both loads
           hit the same quadword, so nothing past the row is read.
        */
        perm = vec_lvsl(0, pix);
        t1 = vec_perm(vec_ld(0, pix), vec_ld(15, pix), perm);

        /* Add each 4 pixel group together and put 4 results into sad */
        sad = vec_sum4s(t1, sad);

        pix += line_size;
    }

    /*
       Sum up the four partial sums. vec_sums() leaves the total in
       element 3, so splat that element before vec_ste() stores whichever
       element corresponds to the address of s.
    */
    sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero);
    sumdiffs = vec_splat(sumdiffs, 3);
    vec_ste(sumdiffs, 0, &s);

    return s;
}
420

    
421
/*
 * Copy an 8x8 block of byte pixels into block, widening every pixel to one
 * 16-bit vector element (8 elements, i.e. one 16-byte vector, per row).
 *
 * block:     destination; written with vec_st(), so it is assumed to be
 *            16-byte aligned. 8 * 16 bytes are written.
 * pixels:    first pixel of the source block; does not have to be aligned.
 * line_size: byte distance between the starts of two consecutive source rows.
 */
void get_pixels_altivec(DCTELEM *restrict block, const UINT8 *pixels, int line_size)
{
    int i;
    vector unsigned char perm, bytes;
    vector unsigned char zero = vec_splat_u8(0);
    vector signed short shorts;

    for (i = 0; i < 8; i++)
    {
        // Read potentially unaligned pixels.
        // Only the first 8 bytes of the permuted vector are used below.
        // vec_ld() ignores the low four bits of the effective address, so
        // offsets 0 and 7 fetch the aligned quadwords holding the first and
        // the last wanted byte; when all 8 bytes sit in one quadword both
        // loads return it and no later quadword is touched.
        perm = vec_lvsl(0, pixels);
        bytes = vec_perm(vec_ld(0, pixels), vec_ld(7, pixels), perm);

        // convert the bytes into shorts: interleave a zero byte in front of
        // each of the first 8 pixel bytes
        shorts = (vector signed short)vec_mergeh(zero, bytes);

        // save the data to the block, we assume the block is 16-byte aligned
        vec_st(shorts, i*16, (vector signed short*)block);

        pixels += line_size;
    }
}
446

    
447
/*
 * Store the element-wise difference s1 - s2 of two 8x8 blocks of byte
 * pixels into block, one 16-bit vector element per pixel (8 elements,
 * i.e. one 16-byte vector, per row).
 *
 * block:  destination; written with vec_st(), so it is assumed to be
 *         16-byte aligned. It is advanced by 8 elements per row.
 * s1, s2: first pixel of each source block; neither has to be aligned.
 * stride: byte distance between the starts of two consecutive rows,
 *         used for both sources.
 */
void diff_pixels_altivec(DCTELEM *restrict block, const UINT8 *s1,
        const UINT8 *s2, int stride)
{
    int i;
    vector unsigned char perm, bytes;
    vector unsigned char zero = vec_splat_u8(0);
    vector signed short shorts1, shorts2;

    // 4 iterations of two rows each: the loop body is manually unrolled.
    for (i = 0; i < 4; i++)
    {
        // Read potentially unaligned pixels.
        // Only the first 8 bytes of the permuted vector are used below.
        // vec_ld() ignores the low four bits of the effective address, so
        // offsets 0 and 7 fetch the aligned quadwords holding the first and
        // the last wanted byte; when all 8 bytes sit in one quadword both
        // loads return it and no later quadword is touched.
        perm = vec_lvsl(0, s1);
        bytes = vec_perm(vec_ld(0, s1), vec_ld(7, s1), perm);

        // convert the bytes into shorts
        shorts1 = (vector signed short)vec_mergeh(zero, bytes);

        // Do the same for the second block of pixels
        perm = vec_lvsl(0, s2);
        bytes = vec_perm(vec_ld(0, s2), vec_ld(7, s2), perm);

        // convert the bytes into shorts
        shorts2 = (vector signed short)vec_mergeh(zero, bytes);

        // Do the subtraction
        shorts1 = vec_sub(shorts1, shorts2);

        // save the data to the block, we assume the block is 16-byte aligned
        vec_st(shorts1, 0, (vector signed short*)block);

        s1 += stride;
        s2 += stride;
        block += 8;


        // The code below is a copy of the code above... This is a manual
        // unroll.

        // Read potentially unaligned pixels (same scheme as above)
        perm = vec_lvsl(0, s1);
        bytes = vec_perm(vec_ld(0, s1), vec_ld(7, s1), perm);

        // convert the bytes into shorts
        shorts1 = (vector signed short)vec_mergeh(zero, bytes);

        // Do the same for the second block of pixels
        perm = vec_lvsl(0, s2);
        bytes = vec_perm(vec_ld(0, s2), vec_ld(7, s2), perm);

        // convert the bytes into shorts
        shorts2 = (vector signed short)vec_mergeh(zero, bytes);

        // Do the subtraction
        shorts1 = vec_sub(shorts1, shorts2);

        // save the data to the block, we assume the block is 16-byte aligned
        vec_st(shorts1, 0, (vector signed short*)block);

        s1 += stride;
        s2 += stride;
        block += 8;
    }
}
518

    
519

    
520
/*
 * Runtime check for a vector unit.
 *
 * When CONFIG_DARWIN is set, the hw.vectorunit sysctl is queried and the
 * result is 1 if the query succeeds with a non-zero value, 0 otherwise.
 * Without CONFIG_DARWIN no detection is attempted and 0 is returned.
 */
int has_altivec(void)
{
#if CONFIG_DARWIN
    int mib[2] = {CTL_HW, HW_VECTORUNIT};
    int vector_unit = 0;
    size_t vector_unit_size = sizeof(vector_unit);

    /* A failed query falls through to the "not available" answer below */
    if (sysctl(mib, 2, &vector_unit, &vector_unit_size, NULL, 0) == 0)
        return vector_unit != 0;
#endif
    return 0;
}
534