/*
 * Copyright (c) 2002 Brian Foley
 * Copyright (c) 2002 Dieter Shirley
 * Copyright (c) 2003 Romain Dolbeau <romain@dolbeau.org>
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 */

#include "../dsputil.h"

#include "gcc_fixes.h"

#include "dsputil_altivec.h"

#ifdef CONFIG_DARWIN
#include <sys/sysctl.h>
#else /* CONFIG_DARWIN */
#include <signal.h>
#include <setjmp.h>

static sigjmp_buf jmpbuf;
static volatile sig_atomic_t canjump = 0;
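
/* SIGILL handler used by the AltiVec probe in has_altivec() at the bottom of
   this file: if the illegal-instruction trap arrives before sigsetjmp() has
   armed jmpbuf (canjump == 0), re-raise the signal with the default handler;
   otherwise jump back so the probe can report that AltiVec is unavailable. */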
static void sigill_handler (int sig)
{
    if (!canjump) {
        signal (sig, SIG_DFL);
        raise (sig);
    }

    canjump = 0;
    siglongjmp (jmpbuf, 1);
}
#endif /* CONFIG_DARWIN */

int pix_abs16x16_x2_altivec(uint8_t *pix1, uint8_t *pix2, int line_size)
{
    int i;
    int s __attribute__((aligned(16)));
    const vector unsigned char zero = (const vector unsigned char)vec_splat_u8(0);
    vector unsigned char *tv;
    vector unsigned char pix1v, pix2v, pix2iv, avgv, t5;
    vector unsigned int sad;
    vector signed int sumdiffs;

    s = 0;
    sad = (vector unsigned int)vec_splat_u32(0);
    for(i=0;i<16;i++) {
        /*
           Read unaligned pixels into our vectors. The vectors are as follows:
           pix1v: pix1[0]-pix1[15]
           pix2v: pix2[0]-pix2[15]        pix2iv: pix2[1]-pix2[16]
        */
        tv = (vector unsigned char *) pix1;
        pix1v = vec_perm(tv[0], tv[1], vec_lvsl(0, pix1));

        tv = (vector unsigned char *) &pix2[0];
        pix2v = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix2[0]));

        tv = (vector unsigned char *) &pix2[1];
        pix2iv = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix2[1]));

        /* Calculate the average vector */
        avgv = vec_avg(pix2v, pix2iv);

        /* Calculate a sum of abs differences vector */
        t5 = vec_sub(vec_max(pix1v, avgv), vec_min(pix1v, avgv));

        /* Add each 4 pixel group together and put 4 results into sad */
        sad = vec_sum4s(t5, sad);

        pix1 += line_size;
        pix2 += line_size;
    }
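
    /*
       vec_sum4s() above left four 32-bit partial sums in sad. vec_sums() adds
       them (plus zero) into element 3 of its result, vec_splat() copies that
       total into every element, and vec_ste() stores a single 32-bit element
       to the 16-byte-aligned scalar s. The same reduction ends each of the
       SAD/SSE routines in this file.
    */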
    /* Sum up the four partial sums, and put the result into s */
    sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero);
    sumdiffs = vec_splat(sumdiffs, 3);
    vec_ste(sumdiffs, 0, &s);

    return s;
}
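
/* For reference: ignoring the vectorization, the function above computes the
   SAD of pix1 against the horizontal half-pel average of pix2, roughly (a
   scalar sketch, not code taken from the C fallback):

       for (i = 0; i < 16; i++) {
           for (j = 0; j < 16; j++)
               s += abs(pix1[j] - ((pix2[j] + pix2[j + 1] + 1) >> 1));
           pix1 += line_size;
           pix2 += line_size;
       }

   vec_avg() rounds up, matching the (+1) >> 1 rounding in the sketch. */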

int pix_abs16x16_y2_altivec(uint8_t *pix1, uint8_t *pix2, int line_size)
{
    int i;
    int s __attribute__((aligned(16)));
    const vector unsigned char zero = (const vector unsigned char)vec_splat_u8(0);
    vector unsigned char *tv;
    vector unsigned char pix1v, pix2v, pix3v, avgv, t5;
    vector unsigned int sad;
    vector signed int sumdiffs;
    uint8_t *pix3 = pix2 + line_size;

    s = 0;
    sad = (vector unsigned int)vec_splat_u32(0);

    /*
       Due to the fact that pix3 = pix2 + line_size, the pix3 of one
       iteration becomes pix2 in the next iteration. We can use this
       fact to avoid a potentially expensive unaligned read, each
       time around the loop.
       Read unaligned pixels into our vectors. The vectors are as follows:
       pix2v: pix2[0]-pix2[15]
       Split the pixel vectors into shorts
    */
    tv = (vector unsigned char *) &pix2[0];
    pix2v = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix2[0]));

    for(i=0;i<16;i++) {
        /*
           Read unaligned pixels into our vectors. The vectors are as follows:
           pix1v: pix1[0]-pix1[15]
           pix3v: pix3[0]-pix3[15]
        */
        tv = (vector unsigned char *) pix1;
        pix1v = vec_perm(tv[0], tv[1], vec_lvsl(0, pix1));

        tv = (vector unsigned char *) &pix3[0];
        pix3v = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix3[0]));

        /* Calculate the average vector */
        avgv = vec_avg(pix2v, pix3v);

        /* Calculate a sum of abs differences vector */
        t5 = vec_sub(vec_max(pix1v, avgv), vec_min(pix1v, avgv));

        /* Add each 4 pixel group together and put 4 results into sad */
        sad = vec_sum4s(t5, sad);

        pix1 += line_size;
        pix2v = pix3v;
        pix3 += line_size;
    }

    /* Sum up the four partial sums, and put the result into s */
    sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero);
    sumdiffs = vec_splat(sumdiffs, 3);
    vec_ste(sumdiffs, 0, &s);
    return s;
}

int pix_abs16x16_xy2_altivec(uint8_t *pix1, uint8_t *pix2, int line_size)
{
    int i;
    int s __attribute__((aligned(16)));
    uint8_t *pix3 = pix2 + line_size;
    const vector unsigned char zero = (const vector unsigned char)vec_splat_u8(0);
    const vector unsigned short two = (const vector unsigned short)vec_splat_u16(2);
    vector unsigned char *tv, avgv, t5;
    vector unsigned char pix1v, pix2v, pix3v, pix2iv, pix3iv;
    vector unsigned short pix2lv, pix2hv, pix2ilv, pix2ihv;
    vector unsigned short pix3lv, pix3hv, pix3ilv, pix3ihv;
    vector unsigned short avghv, avglv;
    vector unsigned short t1, t2, t3, t4;
    vector unsigned int sad;
    vector signed int sumdiffs;

    sad = (vector unsigned int)vec_splat_u32(0);

    s = 0;

    /*
       Due to the fact that pix3 = pix2 + line_size, the pix3 of one
       iteration becomes pix2 in the next iteration. We can use this
       fact to avoid a potentially expensive unaligned read, as well
       as some splitting, and vector addition each time around the loop.
       Read unaligned pixels into our vectors. The vectors are as follows:
       pix2v: pix2[0]-pix2[15]        pix2iv: pix2[1]-pix2[16]
       Split the pixel vectors into shorts
    */
    tv = (vector unsigned char *) &pix2[0];
    pix2v = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix2[0]));

    tv = (vector unsigned char *) &pix2[1];
    pix2iv = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix2[1]));

    pix2hv = (vector unsigned short) vec_mergeh(zero, pix2v);
    pix2lv = (vector unsigned short) vec_mergel(zero, pix2v);
    pix2ihv = (vector unsigned short) vec_mergeh(zero, pix2iv);
    pix2ilv = (vector unsigned short) vec_mergel(zero, pix2iv);
    t1 = vec_add(pix2hv, pix2ihv);
    t2 = vec_add(pix2lv, pix2ilv);

    for(i=0;i<16;i++) {
        /*
           Read unaligned pixels into our vectors. The vectors are as follows:
           pix1v: pix1[0]-pix1[15]
           pix3v: pix3[0]-pix3[15]        pix3iv: pix3[1]-pix3[16]
        */
        tv = (vector unsigned char *) pix1;
        pix1v = vec_perm(tv[0], tv[1], vec_lvsl(0, pix1));

        tv = (vector unsigned char *) &pix3[0];
        pix3v = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix3[0]));

        tv = (vector unsigned char *) &pix3[1];
        pix3iv = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix3[1]));

        /*
          Note that Altivec does have vec_avg, but this works on vector pairs
          and rounds up. We could do avg(avg(a,b),avg(c,d)), but the rounding
          would mean that, for example, avg(3,0,0,1) = 2, when it should be 1.
          Instead, we have to split the pixel vectors into vectors of shorts,
          and do the averaging by hand.
        */
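
        /*
          Worked example of that rounding error: vec_avg rounds up, so
          avg(avg(3,0), avg(0,1)) = avg(2,1) = 2, while the correctly
          rounded 4-way average is (3+0+0+1+2) >> 2 = 1, which is what
          the short-vector arithmetic below computes.
        */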

        /* Split the pixel vectors into shorts */
        pix3hv = (vector unsigned short) vec_mergeh(zero, pix3v);
        pix3lv = (vector unsigned short) vec_mergel(zero, pix3v);
        pix3ihv = (vector unsigned short) vec_mergeh(zero, pix3iv);
        pix3ilv = (vector unsigned short) vec_mergel(zero, pix3iv);

        /* Do the averaging on them */
        t3 = vec_add(pix3hv, pix3ihv);
        t4 = vec_add(pix3lv, pix3ilv);

        avghv = vec_sr(vec_add(vec_add(t1, t3), two), two);
        avglv = vec_sr(vec_add(vec_add(t2, t4), two), two);

        /* Pack the shorts back into a result */
        avgv = vec_pack(avghv, avglv);

        /* Calculate a sum of abs differences vector */
        t5 = vec_sub(vec_max(pix1v, avgv), vec_min(pix1v, avgv));

        /* Add each 4 pixel group together and put 4 results into sad */
        sad = vec_sum4s(t5, sad);

        pix1 += line_size;
        pix3 += line_size;
        /* Transfer the calculated values for pix3 into pix2 */
        t1 = t3;
        t2 = t4;
    }
    /* Sum up the four partial sums, and put the result into s */
    sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero);
    sumdiffs = vec_splat(sumdiffs, 3);
    vec_ste(sumdiffs, 0, &s);

    return s;
}

int pix_abs16x16_altivec(uint8_t *pix1, uint8_t *pix2, int line_size)
{
    int i;
    int s __attribute__((aligned(16)));
    const vector unsigned int zero = (const vector unsigned int)vec_splat_u32(0);
    vector unsigned char perm1, perm2, *pix1v, *pix2v;
    vector unsigned char t1, t2, t3,t4, t5;
    vector unsigned int sad;
    vector signed int sumdiffs;

    sad = (vector unsigned int)vec_splat_u32(0);

    for(i=0;i<16;i++) {
        /* Read potentially unaligned pixels into t1 and t2 */
        perm1 = vec_lvsl(0, pix1);
        pix1v = (vector unsigned char *) pix1;
        perm2 = vec_lvsl(0, pix2);
        pix2v = (vector unsigned char *) pix2;
        t1 = vec_perm(pix1v[0], pix1v[1], perm1);
        t2 = vec_perm(pix2v[0], pix2v[1], perm2);

        /* Calculate a sum of abs differences vector */
        t3 = vec_max(t1, t2);
        t4 = vec_min(t1, t2);
        t5 = vec_sub(t3, t4);

        /* Add each 4 pixel group together and put 4 results into sad */
        sad = vec_sum4s(t5, sad);

        pix1 += line_size;
        pix2 += line_size;
    }

    /* Sum up the four partial sums, and put the result into s */
    sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero);
    sumdiffs = vec_splat(sumdiffs, 3);
    vec_ste(sumdiffs, 0, &s);

    return s;
}

int pix_abs8x8_altivec(uint8_t *pix1, uint8_t *pix2, int line_size)
{
    int i;
    int s __attribute__((aligned(16)));
    const vector unsigned int zero = (const vector unsigned int)vec_splat_u32(0);
    vector unsigned char perm1, perm2, permclear, *pix1v, *pix2v;
    vector unsigned char t1, t2, t3,t4, t5;
    vector unsigned int sad;
    vector signed int sumdiffs;

    sad = (vector unsigned int)vec_splat_u32(0);

    permclear = (vector unsigned char)AVV(255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0);

    for(i=0;i<8;i++) {
        /* Read potentially unaligned pixels into t1 and t2
           Since we're reading 16 pixels, and actually only want 8,
           mask out the last 8 pixels. The 0s don't change the sum. */
        perm1 = vec_lvsl(0, pix1);
        pix1v = (vector unsigned char *) pix1;
        perm2 = vec_lvsl(0, pix2);
        pix2v = (vector unsigned char *) pix2;
        t1 = vec_and(vec_perm(pix1v[0], pix1v[1], perm1), permclear);
        t2 = vec_and(vec_perm(pix2v[0], pix2v[1], perm2), permclear);

        /* Calculate a sum of abs differences vector */
        t3 = vec_max(t1, t2);
        t4 = vec_min(t1, t2);
        t5 = vec_sub(t3, t4);

        /* Add each 4 pixel group together and put 4 results into sad */
        sad = vec_sum4s(t5, sad);

        pix1 += line_size;
        pix2 += line_size;
    }

    /* Sum up the four partial sums, and put the result into s */
    sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero);
    sumdiffs = vec_splat(sumdiffs, 3);
    vec_ste(sumdiffs, 0, &s);

    return s;
}

int pix_norm1_altivec(uint8_t *pix, int line_size)
{
    int i;
    int s __attribute__((aligned(16)));
    const vector unsigned int zero = (const vector unsigned int)vec_splat_u32(0);
    vector unsigned char *tv;
    vector unsigned char pixv;
    vector unsigned int sv;
    vector signed int sum;

    sv = (vector unsigned int)vec_splat_u32(0);

    s = 0;
    for (i = 0; i < 16; i++) {
        /* Read in the potentially unaligned pixels */
        tv = (vector unsigned char *) pix;
        pixv = vec_perm(tv[0], tv[1], vec_lvsl(0, pix));

        /* Square the values, and add them to our sum */
        sv = vec_msum(pixv, pixv, sv);

        pix += line_size;
    }
    /* Sum up the four partial sums, and put the result into s */
    sum = vec_sums((vector signed int) sv, (vector signed int) zero);
    sum = vec_splat(sum, 3);
    vec_ste(sum, 0, &s);

    return s;
}

/**
 * Sum of Squared Errors for an 8x8 block.
 * AltiVec-enhanced.
 * It's the pix_abs8x8_altivec code above w/ squaring added.
 */
int sse8_altivec(void *v, uint8_t *pix1, uint8_t *pix2, int line_size)
{
    int i;
    int s __attribute__((aligned(16)));
    const vector unsigned int zero = (const vector unsigned int)vec_splat_u32(0);
    vector unsigned char perm1, perm2, permclear, *pix1v, *pix2v;
    vector unsigned char t1, t2, t3,t4, t5;
    vector unsigned int sum;
    vector signed int sumsqr;

    sum = (vector unsigned int)vec_splat_u32(0);

    permclear = (vector unsigned char)AVV(255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0);

    for(i=0;i<8;i++) {
        /* Read potentially unaligned pixels into t1 and t2
           Since we're reading 16 pixels, and actually only want 8,
           mask out the last 8 pixels. The 0s don't change the sum. */
        perm1 = vec_lvsl(0, pix1);
        pix1v = (vector unsigned char *) pix1;
        perm2 = vec_lvsl(0, pix2);
        pix2v = (vector unsigned char *) pix2;
        t1 = vec_and(vec_perm(pix1v[0], pix1v[1], perm1), permclear);
        t2 = vec_and(vec_perm(pix2v[0], pix2v[1], perm2), permclear);

        /*
          Since we want to use unsigned chars, we can take advantage
          of the fact that abs(a-b)^2 = (a-b)^2.
        */
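
        /*
          vec_msum() below multiplies each byte of t5 by itself and adds the
          four products of each 4-byte group into the corresponding 32-bit
          element of sum, so the squaring and the first stage of the summation
          happen in a single instruction.
        */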

        /* Calculate abs differences vector */
        t3 = vec_max(t1, t2);
        t4 = vec_min(t1, t2);
        t5 = vec_sub(t3, t4);

        /* Square the values and add them to our sum */
        sum = vec_msum(t5, t5, sum);

        pix1 += line_size;
        pix2 += line_size;
    }

    /* Sum up the four partial sums, and put the result into s */
    sumsqr = vec_sums((vector signed int) sum, (vector signed int) zero);
    sumsqr = vec_splat(sumsqr, 3);
    vec_ste(sumsqr, 0, &s);

    return s;
}

/**
 * Sum of Squared Errors for a 16x16 block.
 * AltiVec-enhanced.
 * It's the pix_abs16x16_altivec code above w/ squaring added.
 */
int sse16_altivec(void *v, uint8_t *pix1, uint8_t *pix2, int line_size)
{
    int i;
    int s __attribute__((aligned(16)));
    const vector unsigned int zero = (const vector unsigned int)vec_splat_u32(0);
    vector unsigned char perm1, perm2, *pix1v, *pix2v;
    vector unsigned char t1, t2, t3,t4, t5;
    vector unsigned int sum;
    vector signed int sumsqr;

    sum = (vector unsigned int)vec_splat_u32(0);

    for(i=0;i<16;i++) {
        /* Read potentially unaligned pixels into t1 and t2 */
        perm1 = vec_lvsl(0, pix1);
        pix1v = (vector unsigned char *) pix1;
        perm2 = vec_lvsl(0, pix2);
        pix2v = (vector unsigned char *) pix2;
        t1 = vec_perm(pix1v[0], pix1v[1], perm1);
        t2 = vec_perm(pix2v[0], pix2v[1], perm2);

        /*
          Since we want to use unsigned chars, we can take advantage
          of the fact that abs(a-b)^2 = (a-b)^2.
        */

        /* Calculate abs differences vector */
        t3 = vec_max(t1, t2);
        t4 = vec_min(t1, t2);
        t5 = vec_sub(t3, t4);

        /* Square the values and add them to our sum */
        sum = vec_msum(t5, t5, sum);

        pix1 += line_size;
        pix2 += line_size;
    }

    /* Sum up the four partial sums, and put the result into s */
    sumsqr = vec_sums((vector signed int) sum, (vector signed int) zero);
    sumsqr = vec_splat(sumsqr, 3);
    vec_ste(sumsqr, 0, &s);

    return s;
}

int pix_sum_altivec(uint8_t * pix, int line_size)
{
    const vector unsigned int zero = (const vector unsigned int)vec_splat_u32(0);
    vector unsigned char perm, *pixv;
    vector unsigned char t1;
    vector unsigned int sad;
    vector signed int sumdiffs;

    int i;
    int s __attribute__((aligned(16)));

    sad = (vector unsigned int)vec_splat_u32(0);

    for (i = 0; i < 16; i++) {
        /* Read the potentially unaligned 16 pixels into t1 */
        perm = vec_lvsl(0, pix);
        pixv = (vector unsigned char *) pix;
        t1 = vec_perm(pixv[0], pixv[1], perm);

        /* Add each 4 pixel group together and put 4 results into sad */
        sad = vec_sum4s(t1, sad);

        pix += line_size;
    }

    /* Sum up the four partial sums, and put the result into s */
    sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero);
    sumdiffs = vec_splat(sumdiffs, 3);
    vec_ste(sumdiffs, 0, &s);

    return s;
}

void get_pixels_altivec(DCTELEM *restrict block, const uint8_t *pixels, int line_size)
{
    int i;
    vector unsigned char perm, bytes, *pixv;
    const vector unsigned char zero = (const vector unsigned char)vec_splat_u8(0);
    vector signed short shorts;

    for(i=0;i<8;i++)
    {
        // Read potentially unaligned pixels.
        // We're reading 16 pixels, and actually only want 8,
        // but we simply ignore the extras.
        perm = vec_lvsl(0, pixels);
        pixv = (vector unsigned char *) pixels;
        bytes = vec_perm(pixv[0], pixv[1], perm);

        // convert the bytes into shorts
        shorts = (vector signed short)vec_mergeh(zero, bytes);
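        // vec_mergeh(zero, bytes) interleaves a zero byte ahead of each of
        // the first 8 pixel bytes, which on big-endian AltiVec zero-extends
        // them into eight 16-bit values.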

        // save the data to the block, we assume the block is 16-byte aligned
        vec_st(shorts, i*16, (vector signed short*)block);

        pixels += line_size;
    }
}

void diff_pixels_altivec(DCTELEM *restrict block, const uint8_t *s1,
        const uint8_t *s2, int stride)
{
    int i;
    vector unsigned char perm, bytes, *pixv;
    const vector unsigned char zero = (const vector unsigned char)vec_splat_u8(0);
    vector signed short shorts1, shorts2;

    for(i=0;i<4;i++)
    {
        // Read potentially unaligned pixels
        // We're reading 16 pixels, and actually only want 8,
        // but we simply ignore the extras.
        perm = vec_lvsl(0, s1);
        pixv = (vector unsigned char *) s1;
        bytes = vec_perm(pixv[0], pixv[1], perm);

        // convert the bytes into shorts
        shorts1 = (vector signed short)vec_mergeh(zero, bytes);

        // Do the same for the second block of pixels
        perm = vec_lvsl(0, s2);
        pixv = (vector unsigned char *) s2;
        bytes = vec_perm(pixv[0], pixv[1], perm);

        // convert the bytes into shorts
        shorts2 = (vector signed short)vec_mergeh(zero, bytes);

        // Do the subtraction
        shorts1 = vec_sub(shorts1, shorts2);

        // save the data to the block, we assume the block is 16-byte aligned
        vec_st(shorts1, 0, (vector signed short*)block);

        s1 += stride;
        s2 += stride;
        block += 8;


        // The code below is a copy of the code above... This is a manual
        // unroll.

        // Read potentially unaligned pixels
        // We're reading 16 pixels, and actually only want 8,
        // but we simply ignore the extras.
        perm = vec_lvsl(0, s1);
        pixv = (vector unsigned char *) s1;
        bytes = vec_perm(pixv[0], pixv[1], perm);

        // convert the bytes into shorts
        shorts1 = (vector signed short)vec_mergeh(zero, bytes);

        // Do the same for the second block of pixels
        perm = vec_lvsl(0, s2);
        pixv = (vector unsigned char *) s2;
        bytes = vec_perm(pixv[0], pixv[1], perm);

        // convert the bytes into shorts
        shorts2 = (vector signed short)vec_mergeh(zero, bytes);

        // Do the subtraction
        shorts1 = vec_sub(shorts1, shorts2);

        // save the data to the block, we assume the block is 16-byte aligned
        vec_st(shorts1, 0, (vector signed short*)block);

        s1 += stride;
        s2 += stride;
        block += 8;
    }
}

int sad16x16_altivec(void *s, uint8_t *a, uint8_t *b, int stride) {
  return pix_abs16x16_altivec(a,b,stride);
}

int sad8x8_altivec(void *s, uint8_t *a, uint8_t *b, int stride) {
  return pix_abs8x8_altivec(a,b,stride);
}

void add_bytes_altivec(uint8_t *dst, uint8_t *src, int w) {
#ifdef ALTIVEC_USE_REFERENCE_C_CODE
    int i;
    for(i=0; i+7<w; i++){
        dst[i+0] += src[i+0];
        dst[i+1] += src[i+1];
        dst[i+2] += src[i+2];
        dst[i+3] += src[i+3];
        dst[i+4] += src[i+4];
        dst[i+5] += src[i+5];
        dst[i+6] += src[i+6];
        dst[i+7] += src[i+7];
    }
    for(; i<w; i++)
        dst[i+0] += src[i+0];
#else /* ALTIVEC_USE_REFERENCE_C_CODE */
    register int i;
    register vector unsigned char vdst, vsrc;

    /* dst and src are 16 bytes-aligned (guaranteed) */
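    /* Note that vec_ld()/vec_st() simply ignore the low 4 bits of the
       effective address, so if that alignment guarantee were violated the
       loop would silently operate on the wrong bytes rather than fault. */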
    for(i = 0 ; (i + 15) < w ; i++)
    {
      vdst = vec_ld(i << 4, (unsigned char*)dst);
      vsrc = vec_ld(i << 4, (unsigned char*)src);
      vdst = vec_add(vsrc, vdst);
      vec_st(vdst, i << 4, (unsigned char*)dst);
    }
    /* if w is not a multiple of 16 */
    for (; (i < w) ; i++)
    {
      dst[i] = src[i];
    }
#endif /* ALTIVEC_USE_REFERENCE_C_CODE */
}

/* next one assumes that ((line_size % 16) == 0) */
void put_pixels16_altivec(uint8_t *block, const uint8_t *pixels, int line_size, int h)
{
POWERPC_PERF_DECLARE(altivec_put_pixels16_num, 1);
#ifdef ALTIVEC_USE_REFERENCE_C_CODE
    int i;

POWERPC_PERF_START_COUNT(altivec_put_pixels16_num, 1);

    for(i=0; i<h; i++) {
      *((uint32_t*)(block )) = (((const struct unaligned_32 *) (pixels))->l);
      *((uint32_t*)(block+4)) = (((const struct unaligned_32 *) (pixels+4))->l);
      *((uint32_t*)(block+8)) = (((const struct unaligned_32 *) (pixels+8))->l);
      *((uint32_t*)(block+12)) = (((const struct unaligned_32 *) (pixels+12))->l);
      pixels+=line_size;
      block +=line_size;
    }

POWERPC_PERF_STOP_COUNT(altivec_put_pixels16_num, 1);

#else /* ALTIVEC_USE_REFERENCE_C_CODE */
    register vector unsigned char pixelsv1, pixelsv2;
    register vector unsigned char pixelsv1B, pixelsv2B;
    register vector unsigned char pixelsv1C, pixelsv2C;
    register vector unsigned char pixelsv1D, pixelsv2D;

    register vector unsigned char perm = vec_lvsl(0, pixels);
    int i;
    register int line_size_2 = line_size << 1;
    register int line_size_3 = line_size + line_size_2;
    register int line_size_4 = line_size << 2;

POWERPC_PERF_START_COUNT(altivec_put_pixels16_num, 1);
// hand-unrolling the loop by 4 gains about 15%
// minimum execution time goes from 74 to 60 cycles
// it's faster than -funroll-loops, but using
// -funroll-loops w/ this is bad - 74 cycles again.
// all this is on a 7450, tuning for the 7450
#if 0
    for(i=0; i<h; i++) {
      pixelsv1 = vec_ld(0, (unsigned char*)pixels);
      pixelsv2 = vec_ld(16, (unsigned char*)pixels);
      vec_st(vec_perm(pixelsv1, pixelsv2, perm),
             0, (unsigned char*)block);
      pixels+=line_size;
      block +=line_size;
    }
#else
    for(i=0; i<h; i+=4) {
      pixelsv1 = vec_ld(0, (unsigned char*)pixels);
      pixelsv2 = vec_ld(16, (unsigned char*)pixels);
      pixelsv1B = vec_ld(line_size, (unsigned char*)pixels);
      pixelsv2B = vec_ld(16 + line_size, (unsigned char*)pixels);
      pixelsv1C = vec_ld(line_size_2, (unsigned char*)pixels);
      pixelsv2C = vec_ld(16 + line_size_2, (unsigned char*)pixels);
      pixelsv1D = vec_ld(line_size_3, (unsigned char*)pixels);
      pixelsv2D = vec_ld(16 + line_size_3, (unsigned char*)pixels);
      vec_st(vec_perm(pixelsv1, pixelsv2, perm),
             0, (unsigned char*)block);
      vec_st(vec_perm(pixelsv1B, pixelsv2B, perm),
             line_size, (unsigned char*)block);
      vec_st(vec_perm(pixelsv1C, pixelsv2C, perm),
             line_size_2, (unsigned char*)block);
      vec_st(vec_perm(pixelsv1D, pixelsv2D, perm),
             line_size_3, (unsigned char*)block);
      pixels+=line_size_4;
      block +=line_size_4;
    }
#endif
POWERPC_PERF_STOP_COUNT(altivec_put_pixels16_num, 1);

#endif /* ALTIVEC_USE_REFERENCE_C_CODE */
}

/* next one assumes that ((line_size % 16) == 0) */
#define op_avg(a,b)  a = ( ((a)|(b)) - ((((a)^(b))&0xFEFEFEFEUL)>>1) )
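/* op_avg() is the usual bit trick for a byte-wise average that rounds up:
   since a + b == (a|b) + (a&b), ((a+b+1)>>1) == (a|b) - ((a^b)>>1). The
   0xFEFEFEFE mask clears the bits shifted across byte boundaries, so the
   four bytes packed in the 32-bit word are averaged independently. */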
void avg_pixels16_altivec(uint8_t *block, const uint8_t *pixels, int line_size, int h)
{
POWERPC_PERF_DECLARE(altivec_avg_pixels16_num, 1);
#ifdef ALTIVEC_USE_REFERENCE_C_CODE
    int i;

POWERPC_PERF_START_COUNT(altivec_avg_pixels16_num, 1);

    for(i=0; i<h; i++) {
      op_avg(*((uint32_t*)(block)),(((const struct unaligned_32 *)(pixels))->l));
      op_avg(*((uint32_t*)(block+4)),(((const struct unaligned_32 *)(pixels+4))->l));
      op_avg(*((uint32_t*)(block+8)),(((const struct unaligned_32 *)(pixels+8))->l));
      op_avg(*((uint32_t*)(block+12)),(((const struct unaligned_32 *)(pixels+12))->l));
      pixels+=line_size;
      block +=line_size;
    }

POWERPC_PERF_STOP_COUNT(altivec_avg_pixels16_num, 1);

#else /* ALTIVEC_USE_REFERENCE_C_CODE */
    register vector unsigned char pixelsv1, pixelsv2, pixelsv, blockv;
    register vector unsigned char perm = vec_lvsl(0, pixels);
    int i;

POWERPC_PERF_START_COUNT(altivec_avg_pixels16_num, 1);

    for(i=0; i<h; i++) {
      pixelsv1 = vec_ld(0, (unsigned char*)pixels);
      pixelsv2 = vec_ld(16, (unsigned char*)pixels);
      blockv = vec_ld(0, block);
      pixelsv = vec_perm(pixelsv1, pixelsv2, perm);
      blockv = vec_avg(blockv,pixelsv);
      vec_st(blockv, 0, (unsigned char*)block);
      pixels+=line_size;
      block +=line_size;
    }

POWERPC_PERF_STOP_COUNT(altivec_avg_pixels16_num, 1);

#endif /* ALTIVEC_USE_REFERENCE_C_CODE */
}

/* next one assumes that ((line_size % 8) == 0) */
void avg_pixels8_altivec(uint8_t * block, const uint8_t * pixels, int line_size, int h)
{
POWERPC_PERF_DECLARE(altivec_avg_pixels8_num, 1);
#ifdef ALTIVEC_USE_REFERENCE_C_CODE
    int i;
POWERPC_PERF_START_COUNT(altivec_avg_pixels8_num, 1);
    for (i = 0; i < h; i++) {
        *((uint32_t *) (block)) =
            (((*((uint32_t *) (block))) |
              ((((const struct unaligned_32 *) (pixels))->l))) -
             ((((*((uint32_t *) (block))) ^
                ((((const struct unaligned_32 *) (pixels))->
                  l))) & 0xFEFEFEFEUL) >> 1));
        *((uint32_t *) (block + 4)) =
            (((*((uint32_t *) (block + 4))) |
              ((((const struct unaligned_32 *) (pixels + 4))->l))) -
             ((((*((uint32_t *) (block + 4))) ^
                ((((const struct unaligned_32 *) (pixels +
                                                  4))->
                  l))) & 0xFEFEFEFEUL) >> 1));
        pixels += line_size;
        block += line_size;
    }
POWERPC_PERF_STOP_COUNT(altivec_avg_pixels8_num, 1);

#else /* ALTIVEC_USE_REFERENCE_C_CODE */
    register vector unsigned char pixelsv1, pixelsv2, pixelsv, blockv;
    int i;

POWERPC_PERF_START_COUNT(altivec_avg_pixels8_num, 1);

   for (i = 0; i < h; i++) {
     /*
       block is 8 bytes-aligned, so we're either in the
       left block (16 bytes-aligned) or in the right block (not)
     */
     int rightside = ((unsigned long)block & 0x0000000F);

     blockv = vec_ld(0, block);
     pixelsv1 = vec_ld(0, (unsigned char*)pixels);
     pixelsv2 = vec_ld(16, (unsigned char*)pixels);
     pixelsv = vec_perm(pixelsv1, pixelsv2, vec_lvsl(0, pixels));

     if (rightside)
     {
       pixelsv = vec_perm(blockv, pixelsv, vcprm(0,1,s0,s1));
     }
     else
     {
       pixelsv = vec_perm(blockv, pixelsv, vcprm(s0,s1,2,3));
     }

     blockv = vec_avg(blockv, pixelsv);

     vec_st(blockv, 0, block);

     pixels += line_size;
     block += line_size;
   }

POWERPC_PERF_STOP_COUNT(altivec_avg_pixels8_num, 1);

#endif /* ALTIVEC_USE_REFERENCE_C_CODE */
}

/* next one assumes that ((line_size % 8) == 0) */
void put_pixels8_xy2_altivec(uint8_t *block, const uint8_t *pixels, int line_size, int h)
{
POWERPC_PERF_DECLARE(altivec_put_pixels8_xy2_num, 1);
#ifdef ALTIVEC_USE_REFERENCE_C_CODE
    int j;
POWERPC_PERF_START_COUNT(altivec_put_pixels8_xy2_num, 1);
    for (j = 0; j < 2; j++) {
      int i;
      const uint32_t a = (((const struct unaligned_32 *) (pixels))->l);
      const uint32_t b =
        (((const struct unaligned_32 *) (pixels + 1))->l);
      uint32_t l0 =
        (a & 0x03030303UL) + (b & 0x03030303UL) + 0x02020202UL;
      uint32_t h0 =
        ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2);
      uint32_t l1, h1;
      pixels += line_size;
      for (i = 0; i < h; i += 2) {
        uint32_t a = (((const struct unaligned_32 *) (pixels))->l);
        uint32_t b = (((const struct unaligned_32 *) (pixels + 1))->l);
        l1 = (a & 0x03030303UL) + (b & 0x03030303UL);
        h1 = ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2);
        *((uint32_t *) block) =
          h0 + h1 + (((l0 + l1) >> 2) & 0x0F0F0F0FUL);
        pixels += line_size;
        block += line_size;
        a = (((const struct unaligned_32 *) (pixels))->l);
        b = (((const struct unaligned_32 *) (pixels + 1))->l);
        l0 = (a & 0x03030303UL) + (b & 0x03030303UL) + 0x02020202UL;
        h0 = ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2);
        *((uint32_t *) block) =
          h0 + h1 + (((l0 + l1) >> 2) & 0x0F0F0F0FUL);
        pixels += line_size;
        block += line_size;
      } pixels += 4 - line_size * (h + 1);
      block += 4 - line_size * h;
    }

POWERPC_PERF_STOP_COUNT(altivec_put_pixels8_xy2_num, 1);

#else /* ALTIVEC_USE_REFERENCE_C_CODE */
   register int i;
   register vector unsigned char
     pixelsv1, pixelsv2,
     pixelsavg;
   register vector unsigned char
     blockv, temp1, temp2;
   register vector unsigned short
     pixelssum1, pixelssum2, temp3;
   register const vector unsigned char vczero = (const vector unsigned char)vec_splat_u8(0);
   register const vector unsigned short vctwo = (const vector unsigned short)vec_splat_u16(2);

   temp1 = vec_ld(0, pixels);
   temp2 = vec_ld(16, pixels);
   pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(0, pixels));
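   /* Special case for the wrap-around: when pixels is 15 (mod 16), pixels+1
      is 16-byte aligned, so vec_lvsl(1, pixels) would yield an identity
      permute selecting temp1 (the aligned block ending at pixels) instead of
      the bytes starting at pixels+1; the data we want is then exactly temp2. */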
   if ((((unsigned long)pixels) & 0x0000000F) ==  0x0000000F)
   {
     pixelsv2 = temp2;
   }
   else
   {
     pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(1, pixels));
   }
   pixelsv1 = vec_mergeh(vczero, pixelsv1);
   pixelsv2 = vec_mergeh(vczero, pixelsv2);
   pixelssum1 = vec_add((vector unsigned short)pixelsv1,
                        (vector unsigned short)pixelsv2);
   pixelssum1 = vec_add(pixelssum1, vctwo);

POWERPC_PERF_START_COUNT(altivec_put_pixels8_xy2_num, 1);
   for (i = 0; i < h ; i++) {
     int rightside = ((unsigned long)block & 0x0000000F);
     blockv = vec_ld(0, block);

     temp1 = vec_ld(line_size, pixels);
     temp2 = vec_ld(line_size + 16, pixels);
     pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(line_size, pixels));
     if (((((unsigned long)pixels) + line_size) & 0x0000000F) ==  0x0000000F)
     {
       pixelsv2 = temp2;
     }
     else
     {
       pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(line_size + 1, pixels));
     }

     pixelsv1 = vec_mergeh(vczero, pixelsv1);
     pixelsv2 = vec_mergeh(vczero, pixelsv2);
     pixelssum2 = vec_add((vector unsigned short)pixelsv1,
                          (vector unsigned short)pixelsv2);
     temp3 = vec_add(pixelssum1, pixelssum2);
     temp3 = vec_sra(temp3, vctwo);
     pixelssum1 = vec_add(pixelssum2, vctwo);
     pixelsavg = vec_packsu(temp3, (vector unsigned short) vczero);

     if (rightside)
     {
       blockv = vec_perm(blockv, pixelsavg, vcprm(0, 1, s0, s1));
     }
     else
     {
       blockv = vec_perm(blockv, pixelsavg, vcprm(s0, s1, 2, 3));
     }

     vec_st(blockv, 0, block);

     block += line_size;
     pixels += line_size;
   }

POWERPC_PERF_STOP_COUNT(altivec_put_pixels8_xy2_num, 1);
#endif /* ALTIVEC_USE_REFERENCE_C_CODE */
}
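
/* The no-rounding variant below is identical to put_pixels8_xy2_altivec()
   except that the bias added before the >>2 is 1 (vcone) rather than 2
   (vctwo), in both the prologue and the loop. */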

/* next one assumes that ((line_size % 8) == 0) */
void put_no_rnd_pixels8_xy2_altivec(uint8_t *block, const uint8_t *pixels, int line_size, int h)
{
POWERPC_PERF_DECLARE(altivec_put_no_rnd_pixels8_xy2_num, 1);
#ifdef ALTIVEC_USE_REFERENCE_C_CODE
    int j;
POWERPC_PERF_START_COUNT(altivec_put_no_rnd_pixels8_xy2_num, 1);
    for (j = 0; j < 2; j++) {
      int i;
      const uint32_t a = (((const struct unaligned_32 *) (pixels))->l);
      const uint32_t b =
        (((const struct unaligned_32 *) (pixels + 1))->l);
      uint32_t l0 =
        (a & 0x03030303UL) + (b & 0x03030303UL) + 0x01010101UL;
      uint32_t h0 =
        ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2);
      uint32_t l1, h1;
      pixels += line_size;
      for (i = 0; i < h; i += 2) {
        uint32_t a = (((const struct unaligned_32 *) (pixels))->l);
        uint32_t b = (((const struct unaligned_32 *) (pixels + 1))->l);
        l1 = (a & 0x03030303UL) + (b & 0x03030303UL);
        h1 = ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2);
        *((uint32_t *) block) =
          h0 + h1 + (((l0 + l1) >> 2) & 0x0F0F0F0FUL);
        pixels += line_size;
        block += line_size;
        a = (((const struct unaligned_32 *) (pixels))->l);
        b = (((const struct unaligned_32 *) (pixels + 1))->l);
        l0 = (a & 0x03030303UL) + (b & 0x03030303UL) + 0x01010101UL;
        h0 = ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2);
        *((uint32_t *) block) =
          h0 + h1 + (((l0 + l1) >> 2) & 0x0F0F0F0FUL);
        pixels += line_size;
        block += line_size;
      } pixels += 4 - line_size * (h + 1);
      block += 4 - line_size * h;
    }

POWERPC_PERF_STOP_COUNT(altivec_put_no_rnd_pixels8_xy2_num, 1);

#else /* ALTIVEC_USE_REFERENCE_C_CODE */
   register int i;
   register vector unsigned char
     pixelsv1, pixelsv2,
     pixelsavg;
   register vector unsigned char
     blockv, temp1, temp2;
   register vector unsigned short
     pixelssum1, pixelssum2, temp3;
   register const vector unsigned char vczero = (const vector unsigned char)vec_splat_u8(0);
   register const vector unsigned short vcone = (const vector unsigned short)vec_splat_u16(1);
   register const vector unsigned short vctwo = (const vector unsigned short)vec_splat_u16(2);

   temp1 = vec_ld(0, pixels);
   temp2 = vec_ld(16, pixels);
   pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(0, pixels));
   if ((((unsigned long)pixels) & 0x0000000F) ==  0x0000000F)
   {
     pixelsv2 = temp2;
   }
   else
   {
     pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(1, pixels));
   }
   pixelsv1 = vec_mergeh(vczero, pixelsv1);
   pixelsv2 = vec_mergeh(vczero, pixelsv2);
   pixelssum1 = vec_add((vector unsigned short)pixelsv1,
                        (vector unsigned short)pixelsv2);
   pixelssum1 = vec_add(pixelssum1, vcone);

POWERPC_PERF_START_COUNT(altivec_put_no_rnd_pixels8_xy2_num, 1);
   for (i = 0; i < h ; i++) {
     int rightside = ((unsigned long)block & 0x0000000F);
     blockv = vec_ld(0, block);

     temp1 = vec_ld(line_size, pixels);
     temp2 = vec_ld(line_size + 16, pixels);
     pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(line_size, pixels));
     if (((((unsigned long)pixels) + line_size) & 0x0000000F) ==  0x0000000F)
     {
       pixelsv2 = temp2;
     }
     else
     {
       pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(line_size + 1, pixels));
     }

     pixelsv1 = vec_mergeh(vczero, pixelsv1);
     pixelsv2 = vec_mergeh(vczero, pixelsv2);
     pixelssum2 = vec_add((vector unsigned short)pixelsv1,
                          (vector unsigned short)pixelsv2);
     temp3 = vec_add(pixelssum1, pixelssum2);
     temp3 = vec_sra(temp3, vctwo);
     pixelssum1 = vec_add(pixelssum2, vcone);
     pixelsavg = vec_packsu(temp3, (vector unsigned short) vczero);

     if (rightside)
     {
       blockv = vec_perm(blockv, pixelsavg, vcprm(0, 1, s0, s1));
     }
     else
     {
       blockv = vec_perm(blockv, pixelsavg, vcprm(s0, s1, 2, 3));
     }

     vec_st(blockv, 0, block);

     block += line_size;
     pixels += line_size;
   }

POWERPC_PERF_STOP_COUNT(altivec_put_no_rnd_pixels8_xy2_num, 1);
#endif /* ALTIVEC_USE_REFERENCE_C_CODE */
}

/* next one assumes that ((line_size % 16) == 0) */
void put_pixels16_xy2_altivec(uint8_t * block, const uint8_t * pixels, int line_size, int h)
{
POWERPC_PERF_DECLARE(altivec_put_pixels16_xy2_num, 1);
#ifdef ALTIVEC_USE_REFERENCE_C_CODE
    int j;
POWERPC_PERF_START_COUNT(altivec_put_pixels16_xy2_num, 1);
      for (j = 0; j < 4; j++) {
      int i;
      const uint32_t a = (((const struct unaligned_32 *) (pixels))->l);
      const uint32_t b =
        (((const struct unaligned_32 *) (pixels + 1))->l);
      uint32_t l0 =
        (a & 0x03030303UL) + (b & 0x03030303UL) + 0x02020202UL;
      uint32_t h0 =
        ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2);
      uint32_t l1, h1;
      pixels += line_size;
      for (i = 0; i < h; i += 2) {
        uint32_t a = (((const struct unaligned_32 *) (pixels))->l);
        uint32_t b = (((const struct unaligned_32 *) (pixels + 1))->l);
        l1 = (a & 0x03030303UL) + (b & 0x03030303UL);
        h1 = ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2);
        *((uint32_t *) block) =
          h0 + h1 + (((l0 + l1) >> 2) & 0x0F0F0F0FUL);
        pixels += line_size;
        block += line_size;
        a = (((const struct unaligned_32 *) (pixels))->l);
        b = (((const struct unaligned_32 *) (pixels + 1))->l);
        l0 = (a & 0x03030303UL) + (b & 0x03030303UL) + 0x02020202UL;
        h0 = ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2);
        *((uint32_t *) block) =
          h0 + h1 + (((l0 + l1) >> 2) & 0x0F0F0F0FUL);
        pixels += line_size;
        block += line_size;
      } pixels += 4 - line_size * (h + 1);
      block += 4 - line_size * h;
    }

POWERPC_PERF_STOP_COUNT(altivec_put_pixels16_xy2_num, 1);

#else /* ALTIVEC_USE_REFERENCE_C_CODE */
   register int i;
   register vector unsigned char
     pixelsv1, pixelsv2, pixelsv3, pixelsv4;
   register vector unsigned char
     blockv, temp1, temp2;
   register vector unsigned short
     pixelssum1, pixelssum2, temp3,
     pixelssum3, pixelssum4, temp4;
   register const vector unsigned char vczero = (const vector unsigned char)vec_splat_u8(0);
   register const vector unsigned short vctwo = (const vector unsigned short)vec_splat_u16(2);

POWERPC_PERF_START_COUNT(altivec_put_pixels16_xy2_num, 1);

   temp1 = vec_ld(0, pixels);
   temp2 = vec_ld(16, pixels);
   pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(0, pixels));
   if ((((unsigned long)pixels) & 0x0000000F) ==  0x0000000F)
   {
     pixelsv2 = temp2;
   }
   else
   {
     pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(1, pixels));
   }
   pixelsv3 = vec_mergel(vczero, pixelsv1);
   pixelsv4 = vec_mergel(vczero, pixelsv2);
   pixelsv1 = vec_mergeh(vczero, pixelsv1);
   pixelsv2 = vec_mergeh(vczero, pixelsv2);
   pixelssum3 = vec_add((vector unsigned short)pixelsv3,
                        (vector unsigned short)pixelsv4);
   pixelssum3 = vec_add(pixelssum3, vctwo);
   pixelssum1 = vec_add((vector unsigned short)pixelsv1,
                        (vector unsigned short)pixelsv2);
   pixelssum1 = vec_add(pixelssum1, vctwo);

   for (i = 0; i < h ; i++) {
     blockv = vec_ld(0, block);

     temp1 = vec_ld(line_size, pixels);
     temp2 = vec_ld(line_size + 16, pixels);
     pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(line_size, pixels));
     if (((((unsigned long)pixels) + line_size) & 0x0000000F) ==  0x0000000F)
     {
       pixelsv2 = temp2;
     }
     else
     {
       pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(line_size + 1, pixels));
     }

     pixelsv3 = vec_mergel(vczero, pixelsv1);
     pixelsv4 = vec_mergel(vczero, pixelsv2);
     pixelsv1 = vec_mergeh(vczero, pixelsv1);
     pixelsv2 = vec_mergeh(vczero, pixelsv2);

     pixelssum4 = vec_add((vector unsigned short)pixelsv3,
                          (vector unsigned short)pixelsv4);
     pixelssum2 = vec_add((vector unsigned short)pixelsv1,
                          (vector unsigned short)pixelsv2);
     temp4 = vec_add(pixelssum3, pixelssum4);
     temp4 = vec_sra(temp4, vctwo);
     temp3 = vec_add(pixelssum1, pixelssum2);
     temp3 = vec_sra(temp3, vctwo);

     pixelssum3 = vec_add(pixelssum4, vctwo);
     pixelssum1 = vec_add(pixelssum2, vctwo);

     blockv = vec_packsu(temp3, temp4);

     vec_st(blockv, 0, block);

     block += line_size;
     pixels += line_size;
   }

POWERPC_PERF_STOP_COUNT(altivec_put_pixels16_xy2_num, 1);
#endif /* ALTIVEC_USE_REFERENCE_C_CODE */
}

/* next one assumes that ((line_size % 16) == 0) */
void put_no_rnd_pixels16_xy2_altivec(uint8_t * block, const uint8_t * pixels, int line_size, int h)
{
POWERPC_PERF_DECLARE(altivec_put_no_rnd_pixels16_xy2_num, 1);
#ifdef ALTIVEC_USE_REFERENCE_C_CODE
    int j;
POWERPC_PERF_START_COUNT(altivec_put_no_rnd_pixels16_xy2_num, 1);
      for (j = 0; j < 4; j++) {
      int i;
      const uint32_t a = (((const struct unaligned_32 *) (pixels))->l);
      const uint32_t b =
        (((const struct unaligned_32 *) (pixels + 1))->l);
      uint32_t l0 =
        (a & 0x03030303UL) + (b & 0x03030303UL) + 0x01010101UL;
      uint32_t h0 =
        ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2);
      uint32_t l1, h1;
      pixels += line_size;
      for (i = 0; i < h; i += 2) {
        uint32_t a = (((const struct unaligned_32 *) (pixels))->l);
        uint32_t b = (((const struct unaligned_32 *) (pixels + 1))->l);
        l1 = (a & 0x03030303UL) + (b & 0x03030303UL);
        h1 = ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2);
        *((uint32_t *) block) =
          h0 + h1 + (((l0 + l1) >> 2) & 0x0F0F0F0FUL);
        pixels += line_size;
        block += line_size;
        a = (((const struct unaligned_32 *) (pixels))->l);
        b = (((const struct unaligned_32 *) (pixels + 1))->l);
        l0 = (a & 0x03030303UL) + (b & 0x03030303UL) + 0x01010101UL;
        h0 = ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2);
        *((uint32_t *) block) =
          h0 + h1 + (((l0 + l1) >> 2) & 0x0F0F0F0FUL);
        pixels += line_size;
        block += line_size;
      } pixels += 4 - line_size * (h + 1);
      block += 4 - line_size * h;
    }

POWERPC_PERF_STOP_COUNT(altivec_put_no_rnd_pixels16_xy2_num, 1);

#else /* ALTIVEC_USE_REFERENCE_C_CODE */
   register int i;
   register vector unsigned char
     pixelsv1, pixelsv2, pixelsv3, pixelsv4;
   register vector unsigned char
     blockv, temp1, temp2;
   register vector unsigned short
     pixelssum1, pixelssum2, temp3,
     pixelssum3, pixelssum4, temp4;
   register const vector unsigned char vczero = (const vector unsigned char)vec_splat_u8(0);
   register const vector unsigned short vcone = (const vector unsigned short)vec_splat_u16(1);
   register const vector unsigned short vctwo = (const vector unsigned short)vec_splat_u16(2);

POWERPC_PERF_START_COUNT(altivec_put_no_rnd_pixels16_xy2_num, 1);

   temp1 = vec_ld(0, pixels);
   temp2 = vec_ld(16, pixels);
   pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(0, pixels));
   if ((((unsigned long)pixels) & 0x0000000F) ==  0x0000000F)
   {
     pixelsv2 = temp2;
   }
   else
   {
     pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(1, pixels));
   }
   pixelsv3 = vec_mergel(vczero, pixelsv1);
   pixelsv4 = vec_mergel(vczero, pixelsv2);
   pixelsv1 = vec_mergeh(vczero, pixelsv1);
   pixelsv2 = vec_mergeh(vczero, pixelsv2);
   pixelssum3 = vec_add((vector unsigned short)pixelsv3,
                        (vector unsigned short)pixelsv4);
   pixelssum3 = vec_add(pixelssum3, vcone);
   pixelssum1 = vec_add((vector unsigned short)pixelsv1,
                        (vector unsigned short)pixelsv2);
   pixelssum1 = vec_add(pixelssum1, vcone);

   for (i = 0; i < h ; i++) {
     blockv = vec_ld(0, block);

     temp1 = vec_ld(line_size, pixels);
     temp2 = vec_ld(line_size + 16, pixels);
     pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(line_size, pixels));
     if (((((unsigned long)pixels) + line_size) & 0x0000000F) ==  0x0000000F)
     {
       pixelsv2 = temp2;
     }
     else
     {
       pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(line_size + 1, pixels));
     }

     pixelsv3 = vec_mergel(vczero, pixelsv1);
     pixelsv4 = vec_mergel(vczero, pixelsv2);
     pixelsv1 = vec_mergeh(vczero, pixelsv1);
     pixelsv2 = vec_mergeh(vczero, pixelsv2);

     pixelssum4 = vec_add((vector unsigned short)pixelsv3,
                          (vector unsigned short)pixelsv4);
     pixelssum2 = vec_add((vector unsigned short)pixelsv1,
                          (vector unsigned short)pixelsv2);
     temp4 = vec_add(pixelssum3, pixelssum4);
     temp4 = vec_sra(temp4, vctwo);
     temp3 = vec_add(pixelssum1, pixelssum2);
     temp3 = vec_sra(temp3, vctwo);

     pixelssum3 = vec_add(pixelssum4, vcone);
     pixelssum1 = vec_add(pixelssum2, vcone);

     blockv = vec_packsu(temp3, temp4);

     vec_st(blockv, 0, block);

     block += line_size;
     pixels += line_size;
   }

POWERPC_PERF_STOP_COUNT(altivec_put_no_rnd_pixels16_xy2_num, 1);
#endif /* ALTIVEC_USE_REFERENCE_C_CODE */
}

int has_altivec(void)
{
#ifdef CONFIG_DARWIN
    int sels[2] = {CTL_HW, HW_VECTORUNIT};
    int has_vu = 0;
    size_t len = sizeof(has_vu);
    int err;

    err = sysctl(sels, 2, &has_vu, &len, NULL, 0);

    if (err == 0) return (has_vu != 0);
#else /* CONFIG_DARWIN */
/* no Darwin, do it the brute-force way */
/* this is borrowed from the libmpeg2 library */
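/* The asm below writes VRSAVE (SPR 256) and executes a "vand", the simplest
   AltiVec instruction: on a CPU without AltiVec this raises SIGILL, which
   sigill_handler() turns into a siglongjmp back to the sigsetjmp below, and
   the function falls through to return 0. */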
    {
      signal (SIGILL, sigill_handler);
      if (sigsetjmp (jmpbuf, 1)) {
        signal (SIGILL, SIG_DFL);
      } else {
        canjump = 1;

        asm volatile ("mtspr 256, %0\n\t"
                      "vand %%v0, %%v0, %%v0"
                      :
                      : "r" (-1));

        signal (SIGILL, SIG_DFL);
        return 1;
      }
    }
#endif /* CONFIG_DARWIN */
    return 0;
}