/*
 * Copyright (c) 2002 Brian Foley
 * Copyright (c) 2002 Dieter Shirley
 * Copyright (c) 2003 Romain Dolbeau <romain@dolbeau.org>
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 */

#include "../dsputil.h"

#include "gcc_fixes.h"

#include "dsputil_altivec.h"

#ifdef CONFIG_DARWIN
#include <sys/sysctl.h>
#else /* CONFIG_DARWIN */
#include <signal.h>
#include <setjmp.h>

static sigjmp_buf jmpbuf;
static volatile sig_atomic_t canjump = 0;

static void sigill_handler (int sig)
{
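    /* If SIGILL arrives before has_altivec() has armed the jump buffer,
       restore the default handler and re-raise instead of jumping through
       an uninitialized sigjmp_buf. */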
    if (!canjump) {
        signal (sig, SIG_DFL);
        raise (sig);
    }

    canjump = 0;
    siglongjmp (jmpbuf, 1);
}
#endif /* CONFIG_DARWIN */

int sad16_x2_altivec(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int i;
    int s __attribute__((aligned(16)));
    const_vector unsigned char zero = (const_vector unsigned char)vec_splat_u8(0);
    vector unsigned char *tv;
    vector unsigned char pix1v, pix2v, pix2iv, avgv, t5;
    vector unsigned int sad;
    vector signed int sumdiffs;

    s = 0;
    sad = (vector unsigned int)vec_splat_u32(0);
    for(i=0;i<h;i++) {
        /*
           Read unaligned pixels into our vectors. The vectors are as follows:
           pix1v: pix1[0]-pix1[15]
           pix2v: pix2[0]-pix2[15]        pix2iv: pix2[1]-pix2[16]
        */
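        /* Unaligned-load idiom used throughout this file: dereferencing a
           vector pointer ignores the low 4 address bits, so tv[0] and tv[1]
           are the two aligned quadwords straddling the data, and vec_lvsl
           returns the permute mask that lets vec_perm extract the 16 bytes
           starting at the unaligned address. */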
        tv = (vector unsigned char *) pix1;
        pix1v = vec_perm(tv[0], tv[1], vec_lvsl(0, pix1));

        tv = (vector unsigned char *) &pix2[0];
        pix2v = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix2[0]));

        tv = (vector unsigned char *) &pix2[1];
        pix2iv = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix2[1]));

        /* Calculate the average vector */
        avgv = vec_avg(pix2v, pix2iv);

        /* Calculate a sum of abs differences vector */
        t5 = vec_sub(vec_max(pix1v, avgv), vec_min(pix1v, avgv));

        /* Add each 4 pixel group together and put 4 results into sad */
        sad = vec_sum4s(t5, sad);

        pix1 += line_size;
        pix2 += line_size;
    }
    /* Sum up the four partial sums, and put the result into s */
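    /* vec_sums adds the four 32-bit partial sums (plus the zero vector) and
       leaves the total in element 3; splatting that element across the vector
       lets vec_ste store it through &s regardless of which element the
       16-byte-aligned address selects. */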
    sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero);
    sumdiffs = vec_splat(sumdiffs, 3);
    vec_ste(sumdiffs, 0, &s);

    return s;
}

int sad16_y2_altivec(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int i;
    int s __attribute__((aligned(16)));
    const_vector unsigned char zero = (const_vector unsigned char)vec_splat_u8(0);
    vector unsigned char *tv;
    vector unsigned char pix1v, pix2v, pix3v, avgv, t5;
    vector unsigned int sad;
    vector signed int sumdiffs;
    uint8_t *pix3 = pix2 + line_size;

    s = 0;
    sad = (vector unsigned int)vec_splat_u32(0);

    /*
       Due to the fact that pix3 = pix2 + line_size, the pix3 of one
       iteration becomes pix2 in the next iteration. We can use this
       fact to avoid a potentially expensive unaligned read, each
       time around the loop.
       Read unaligned pixels into our vectors. The vectors are as follows:
       pix2v: pix2[0]-pix2[15]
       Split the pixel vectors into shorts
    */
    tv = (vector unsigned char *) &pix2[0];
    pix2v = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix2[0]));

    for(i=0;i<h;i++) {
        /*
           Read unaligned pixels into our vectors. The vectors are as follows:
           pix1v: pix1[0]-pix1[15]
           pix3v: pix3[0]-pix3[15]
        */
        tv = (vector unsigned char *) pix1;
        pix1v = vec_perm(tv[0], tv[1], vec_lvsl(0, pix1));

        tv = (vector unsigned char *) &pix3[0];
        pix3v = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix3[0]));

        /* Calculate the average vector */
        avgv = vec_avg(pix2v, pix3v);

        /* Calculate a sum of abs differences vector */
        t5 = vec_sub(vec_max(pix1v, avgv), vec_min(pix1v, avgv));

        /* Add each 4 pixel group together and put 4 results into sad */
        sad = vec_sum4s(t5, sad);

        pix1 += line_size;
        pix2v = pix3v;
        pix3 += line_size;

    }

    /* Sum up the four partial sums, and put the result into s */
    sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero);
    sumdiffs = vec_splat(sumdiffs, 3);
    vec_ste(sumdiffs, 0, &s);
    return s;
}

int sad16_xy2_altivec(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int i;
    int s __attribute__((aligned(16)));
    uint8_t *pix3 = pix2 + line_size;
    const_vector unsigned char zero = (const_vector unsigned char)vec_splat_u8(0);
    const_vector unsigned short two = (const_vector unsigned short)vec_splat_u16(2);
    vector unsigned char *tv, avgv, t5;
    vector unsigned char pix1v, pix2v, pix3v, pix2iv, pix3iv;
    vector unsigned short pix2lv, pix2hv, pix2ilv, pix2ihv;
    vector unsigned short pix3lv, pix3hv, pix3ilv, pix3ihv;
    vector unsigned short avghv, avglv;
    vector unsigned short t1, t2, t3, t4;
    vector unsigned int sad;
    vector signed int sumdiffs;

    sad = (vector unsigned int)vec_splat_u32(0);

    s = 0;

    /*
       Due to the fact that pix3 = pix2 + line_size, the pix3 of one
       iteration becomes pix2 in the next iteration. We can use this
       fact to avoid a potentially expensive unaligned read, as well
       as some splitting, and vector addition each time around the loop.
       Read unaligned pixels into our vectors. The vectors are as follows:
       pix2v: pix2[0]-pix2[15]        pix2iv: pix2[1]-pix2[16]
       Split the pixel vectors into shorts
    */
    tv = (vector unsigned char *) &pix2[0];
    pix2v = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix2[0]));

    tv = (vector unsigned char *) &pix2[1];
    pix2iv = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix2[1]));

    pix2hv = (vector unsigned short) vec_mergeh(zero, pix2v);
    pix2lv = (vector unsigned short) vec_mergel(zero, pix2v);
    pix2ihv = (vector unsigned short) vec_mergeh(zero, pix2iv);
    pix2ilv = (vector unsigned short) vec_mergel(zero, pix2iv);
    t1 = vec_add(pix2hv, pix2ihv);
    t2 = vec_add(pix2lv, pix2ilv);

    for(i=0;i<h;i++) {
        /*
           Read unaligned pixels into our vectors. The vectors are as follows:
           pix1v: pix1[0]-pix1[15]
           pix3v: pix3[0]-pix3[15]        pix3iv: pix3[1]-pix3[16]
        */
        tv = (vector unsigned char *) pix1;
        pix1v = vec_perm(tv[0], tv[1], vec_lvsl(0, pix1));

        tv = (vector unsigned char *) &pix3[0];
        pix3v = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix3[0]));

        tv = (vector unsigned char *) &pix3[1];
        pix3iv = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix3[1]));

        /*
          Note that Altivec does have vec_avg, but this works on vector pairs
          and rounds up. We could do avg(avg(a,b),avg(c,d)), but the rounding
          would mean that, for example, avg(3,0,0,1) = 2, when it should be 1.
          Instead, we have to split the pixel vectors into vectors of shorts,
          and do the averaging by hand.
        */

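        /* Worked example with pixels 3,0 (this row) and 0,1 (row below):
           nested vec_avg gives vec_avg(vec_avg(3,0), vec_avg(0,1)) =
           vec_avg(2,1) = 2, while the correctly rounded (3+0+0+1+2)>>2 = 1.
           The short arithmetic below computes exactly (a+b+c+d+2)>>2. */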
        /* Split the pixel vectors into shorts */
        pix3hv = (vector unsigned short) vec_mergeh(zero, pix3v);
        pix3lv = (vector unsigned short) vec_mergel(zero, pix3v);
        pix3ihv = (vector unsigned short) vec_mergeh(zero, pix3iv);
        pix3ilv = (vector unsigned short) vec_mergel(zero, pix3iv);

        /* Do the averaging on them */
        t3 = vec_add(pix3hv, pix3ihv);
        t4 = vec_add(pix3lv, pix3ilv);

        avghv = vec_sr(vec_add(vec_add(t1, t3), two), two);
        avglv = vec_sr(vec_add(vec_add(t2, t4), two), two);

        /* Pack the shorts back into a result */
        avgv = vec_pack(avghv, avglv);

        /* Calculate a sum of abs differences vector */
        t5 = vec_sub(vec_max(pix1v, avgv), vec_min(pix1v, avgv));

        /* Add each 4 pixel group together and put 4 results into sad */
        sad = vec_sum4s(t5, sad);

        pix1 += line_size;
        pix3 += line_size;
        /* Transfer the calculated values for pix3 into pix2 */
        t1 = t3;
        t2 = t4;
    }
    /* Sum up the four partial sums, and put the result into s */
    sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero);
    sumdiffs = vec_splat(sumdiffs, 3);
    vec_ste(sumdiffs, 0, &s);

    return s;
}

int sad16_altivec(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int i;
    int s __attribute__((aligned(16)));
    const_vector unsigned int zero = (const_vector unsigned int)vec_splat_u32(0);
    vector unsigned char perm1, perm2, *pix1v, *pix2v;
    vector unsigned char t1, t2, t3,t4, t5;
    vector unsigned int sad;
    vector signed int sumdiffs;

    sad = (vector unsigned int)vec_splat_u32(0);


    for(i=0;i<h;i++) {
        /* Read potentially unaligned pixels into t1 and t2 */
        perm1 = vec_lvsl(0, pix1);
        pix1v = (vector unsigned char *) pix1;
        perm2 = vec_lvsl(0, pix2);
        pix2v = (vector unsigned char *) pix2;
        t1 = vec_perm(pix1v[0], pix1v[1], perm1);
        t2 = vec_perm(pix2v[0], pix2v[1], perm2);

        /* Calculate a sum of abs differences vector */
        t3 = vec_max(t1, t2);
        t4 = vec_min(t1, t2);
        t5 = vec_sub(t3, t4);

        /* Add each 4 pixel group together and put 4 results into sad */
        sad = vec_sum4s(t5, sad);

        pix1 += line_size;
        pix2 += line_size;
    }

    /* Sum up the four partial sums, and put the result into s */
    sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero);
    sumdiffs = vec_splat(sumdiffs, 3);
    vec_ste(sumdiffs, 0, &s);

    return s;
}

int sad8_altivec(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int i;
    int s __attribute__((aligned(16)));
    const_vector unsigned int zero = (const_vector unsigned int)vec_splat_u32(0);
    vector unsigned char perm1, perm2, permclear, *pix1v, *pix2v;
    vector unsigned char t1, t2, t3,t4, t5;
    vector unsigned int sad;
    vector signed int sumdiffs;

    sad = (vector unsigned int)vec_splat_u32(0);

    permclear = (vector unsigned char)AVV(255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0);

    for(i=0;i<h;i++) {
        /* Read potentially unaligned pixels into t1 and t2
           Since we're reading 16 pixels, and actually only want 8,
           mask out the last 8 pixels. The 0s don't change the sum. */
        perm1 = vec_lvsl(0, pix1);
        pix1v = (vector unsigned char *) pix1;
        perm2 = vec_lvsl(0, pix2);
        pix2v = (vector unsigned char *) pix2;
        t1 = vec_and(vec_perm(pix1v[0], pix1v[1], perm1), permclear);
        t2 = vec_and(vec_perm(pix2v[0], pix2v[1], perm2), permclear);

        /* Calculate a sum of abs differences vector */
        t3 = vec_max(t1, t2);
        t4 = vec_min(t1, t2);
        t5 = vec_sub(t3, t4);

        /* Add each 4 pixel group together and put 4 results into sad */
        sad = vec_sum4s(t5, sad);

        pix1 += line_size;
        pix2 += line_size;
    }

    /* Sum up the four partial sums, and put the result into s */
    sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero);
    sumdiffs = vec_splat(sumdiffs, 3);
    vec_ste(sumdiffs, 0, &s);

    return s;
}

int pix_norm1_altivec(uint8_t *pix, int line_size)
{
    int i;
    int s __attribute__((aligned(16)));
    const_vector unsigned int zero = (const_vector unsigned int)vec_splat_u32(0);
    vector unsigned char *tv;
    vector unsigned char pixv;
    vector unsigned int sv;
    vector signed int sum;

    sv = (vector unsigned int)vec_splat_u32(0);

    s = 0;
    for (i = 0; i < 16; i++) {
        /* Read in the potentially unaligned pixels */
        tv = (vector unsigned char *) pix;
        pixv = vec_perm(tv[0], tv[1], vec_lvsl(0, pix));

        /* Square the values, and add them to our sum */
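        /* vec_msum multiplies each unsigned byte by itself and adds each
           group of four products into the corresponding 32-bit word of sv. */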
        sv = vec_msum(pixv, pixv, sv);

        pix += line_size;
    }
    /* Sum up the four partial sums, and put the result into s */
    sum = vec_sums((vector signed int) sv, (vector signed int) zero);
    sum = vec_splat(sum, 3);
    vec_ste(sum, 0, &s);

    return s;
}

/**
 * Sum of Squared Errors for an 8x8 block.
 * AltiVec-enhanced.
 * It's the sad8_altivec code above w/ squaring added.
 */
int sse8_altivec(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int i;
    int s __attribute__((aligned(16)));
    const_vector unsigned int zero = (const_vector unsigned int)vec_splat_u32(0);
    vector unsigned char perm1, perm2, permclear, *pix1v, *pix2v;
    vector unsigned char t1, t2, t3,t4, t5;
    vector unsigned int sum;
    vector signed int sumsqr;

    sum = (vector unsigned int)vec_splat_u32(0);

    permclear = (vector unsigned char)AVV(255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0);


    for(i=0;i<h;i++) {
        /* Read potentially unaligned pixels into t1 and t2
           Since we're reading 16 pixels, and actually only want 8,
           mask out the last 8 pixels. The 0s don't change the sum. */
        perm1 = vec_lvsl(0, pix1);
        pix1v = (vector unsigned char *) pix1;
        perm2 = vec_lvsl(0, pix2);
        pix2v = (vector unsigned char *) pix2;
        t1 = vec_and(vec_perm(pix1v[0], pix1v[1], perm1), permclear);
        t2 = vec_and(vec_perm(pix2v[0], pix2v[1], perm2), permclear);

        /*
          Since we want to use unsigned chars, we can take advantage
          of the fact that abs(a-b)^2 = (a-b)^2.
        */

        /* Calculate abs differences vector */
        t3 = vec_max(t1, t2);
        t4 = vec_min(t1, t2);
        t5 = vec_sub(t3, t4);

        /* Square the values and add them to our sum */
        sum = vec_msum(t5, t5, sum);

        pix1 += line_size;
        pix2 += line_size;
    }

    /* Sum up the four partial sums, and put the result into s */
    sumsqr = vec_sums((vector signed int) sum, (vector signed int) zero);
    sumsqr = vec_splat(sumsqr, 3);
    vec_ste(sumsqr, 0, &s);

    return s;
}

/**
 * Sum of Squared Errors for a 16x16 block.
 * AltiVec-enhanced.
 * It's the sad16_altivec code above w/ squaring added.
 */
int sse16_altivec(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int i;
    int s __attribute__((aligned(16)));
    const_vector unsigned int zero = (const_vector unsigned int)vec_splat_u32(0);
    vector unsigned char perm1, perm2, *pix1v, *pix2v;
    vector unsigned char t1, t2, t3,t4, t5;
    vector unsigned int sum;
    vector signed int sumsqr;

    sum = (vector unsigned int)vec_splat_u32(0);

    for(i=0;i<h;i++) {
        /* Read potentially unaligned pixels into t1 and t2 */
        perm1 = vec_lvsl(0, pix1);
        pix1v = (vector unsigned char *) pix1;
        perm2 = vec_lvsl(0, pix2);
        pix2v = (vector unsigned char *) pix2;
        t1 = vec_perm(pix1v[0], pix1v[1], perm1);
        t2 = vec_perm(pix2v[0], pix2v[1], perm2);

        /*
          Since we want to use unsigned chars, we can take advantage
          of the fact that abs(a-b)^2 = (a-b)^2.
        */

        /* Calculate abs differences vector */
        t3 = vec_max(t1, t2);
        t4 = vec_min(t1, t2);
        t5 = vec_sub(t3, t4);

        /* Square the values and add them to our sum */
        sum = vec_msum(t5, t5, sum);

        pix1 += line_size;
        pix2 += line_size;
    }

    /* Sum up the four partial sums, and put the result into s */
    sumsqr = vec_sums((vector signed int) sum, (vector signed int) zero);
    sumsqr = vec_splat(sumsqr, 3);
    vec_ste(sumsqr, 0, &s);

    return s;
}

int pix_sum_altivec(uint8_t * pix, int line_size)
{
    const_vector unsigned int zero = (const_vector unsigned int)vec_splat_u32(0);
    vector unsigned char perm, *pixv;
    vector unsigned char t1;
    vector unsigned int sad;
    vector signed int sumdiffs;

    int i;
    int s __attribute__((aligned(16)));

    sad = (vector unsigned int)vec_splat_u32(0);

    for (i = 0; i < 16; i++) {
        /* Read the potentially unaligned 16 pixels into t1 */
        perm = vec_lvsl(0, pix);
        pixv = (vector unsigned char *) pix;
        t1 = vec_perm(pixv[0], pixv[1], perm);

        /* Add each 4 pixel group together and put 4 results into sad */
        sad = vec_sum4s(t1, sad);

        pix += line_size;
    }

    /* Sum up the four partial sums, and put the result into s */
    sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero);
    sumdiffs = vec_splat(sumdiffs, 3);
    vec_ste(sumdiffs, 0, &s);

    return s;
}

void get_pixels_altivec(DCTELEM *restrict block, const uint8_t *pixels, int line_size)
{
    int i;
    vector unsigned char perm, bytes, *pixv;
    const_vector unsigned char zero = (const_vector unsigned char)vec_splat_u8(0);
    vector signed short shorts;

    for(i=0;i<8;i++)
    {
        // Read potentially unaligned pixels.
        // We're reading 16 pixels, and actually only want 8,
        // but we simply ignore the extras.
        perm = vec_lvsl(0, pixels);
        pixv = (vector unsigned char *) pixels;
        bytes = vec_perm(pixv[0], pixv[1], perm);

        // convert the bytes into shorts
        shorts = (vector signed short)vec_mergeh(zero, bytes);

        // save the data to the block, we assume the block is 16-byte aligned
        vec_st(shorts, i*16, (vector signed short*)block);

        pixels += line_size;
    }
}

void diff_pixels_altivec(DCTELEM *restrict block, const uint8_t *s1,
        const uint8_t *s2, int stride)
{
    int i;
    vector unsigned char perm, bytes, *pixv;
    const_vector unsigned char zero = (const_vector unsigned char)vec_splat_u8(0);
    vector signed short shorts1, shorts2;

    for(i=0;i<4;i++)
    {
        // Read potentially unaligned pixels
        // We're reading 16 pixels, and actually only want 8,
        // but we simply ignore the extras.
        perm = vec_lvsl(0, s1);
        pixv = (vector unsigned char *) s1;
        bytes = vec_perm(pixv[0], pixv[1], perm);

        // convert the bytes into shorts
        shorts1 = (vector signed short)vec_mergeh(zero, bytes);

        // Do the same for the second block of pixels
        perm = vec_lvsl(0, s2);
        pixv = (vector unsigned char *) s2;
        bytes = vec_perm(pixv[0], pixv[1], perm);

        // convert the bytes into shorts
        shorts2 = (vector signed short)vec_mergeh(zero, bytes);

        // Do the subtraction
        shorts1 = vec_sub(shorts1, shorts2);

        // save the data to the block, we assume the block is 16-byte aligned
        vec_st(shorts1, 0, (vector signed short*)block);

        s1 += stride;
        s2 += stride;
        block += 8;


        // The code below is a copy of the code above... This is a manual
        // unroll.

        // Read potentially unaligned pixels
        // We're reading 16 pixels, and actually only want 8,
        // but we simply ignore the extras.
        perm = vec_lvsl(0, s1);
        pixv = (vector unsigned char *) s1;
        bytes = vec_perm(pixv[0], pixv[1], perm);

        // convert the bytes into shorts
        shorts1 = (vector signed short)vec_mergeh(zero, bytes);

        // Do the same for the second block of pixels
        perm = vec_lvsl(0, s2);
        pixv = (vector unsigned char *) s2;
        bytes = vec_perm(pixv[0], pixv[1], perm);

        // convert the bytes into shorts
        shorts2 = (vector signed short)vec_mergeh(zero, bytes);

        // Do the subtraction
        shorts1 = vec_sub(shorts1, shorts2);

        // save the data to the block, we assume the block is 16-byte aligned
        vec_st(shorts1, 0, (vector signed short*)block);

        s1 += stride;
        s2 += stride;
        block += 8;
    }
}

void add_bytes_altivec(uint8_t *dst, uint8_t *src, int w) {
#ifdef ALTIVEC_USE_REFERENCE_C_CODE
    int i;
    for(i=0; i+7<w; i+=8){
        dst[i+0] += src[i+0];
        dst[i+1] += src[i+1];
        dst[i+2] += src[i+2];
        dst[i+3] += src[i+3];
        dst[i+4] += src[i+4];
        dst[i+5] += src[i+5];
        dst[i+6] += src[i+6];
        dst[i+7] += src[i+7];
    }
    for(; i<w; i++)
        dst[i+0] += src[i+0];
#else /* ALTIVEC_USE_REFERENCE_C_CODE */
    register int i;
    register vector unsigned char vdst, vsrc;

    /* dst and src are 16-byte aligned (guaranteed) */
    for(i = 0 ; (i + 15) < w ; i+=16)
    {
      vdst = vec_ld(i, (unsigned char*)dst);
      vsrc = vec_ld(i, (unsigned char*)src);
      vdst = vec_add(vsrc, vdst);
      vec_st(vdst, i, (unsigned char*)dst);
    }
    /* if w is not a multiple of 16 */
    for (; (i < w) ; i++)
    {
      dst[i] += src[i];
    }
#endif /* ALTIVEC_USE_REFERENCE_C_CODE */
}

/* next one assumes that ((line_size % 16) == 0) */
void put_pixels16_altivec(uint8_t *block, const uint8_t *pixels, int line_size, int h)
{
POWERPC_PERF_DECLARE(altivec_put_pixels16_num, 1);
#ifdef ALTIVEC_USE_REFERENCE_C_CODE
    int i;

POWERPC_PERF_START_COUNT(altivec_put_pixels16_num, 1);

    for(i=0; i<h; i++) {
      *((uint32_t*)(block)) = LD32(pixels);
      *((uint32_t*)(block+4)) = LD32(pixels+4);
      *((uint32_t*)(block+8)) = LD32(pixels+8);
      *((uint32_t*)(block+12)) = LD32(pixels+12);
      pixels+=line_size;
      block +=line_size;
    }

POWERPC_PERF_STOP_COUNT(altivec_put_pixels16_num, 1);

#else /* ALTIVEC_USE_REFERENCE_C_CODE */
    register vector unsigned char pixelsv1, pixelsv2;
    register vector unsigned char pixelsv1B, pixelsv2B;
    register vector unsigned char pixelsv1C, pixelsv2C;
    register vector unsigned char pixelsv1D, pixelsv2D;

    register vector unsigned char perm = vec_lvsl(0, pixels);
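    /* line_size is a multiple of 16, so every row shares the alignment of
       pixels and this permute mask, computed once, is valid for all the
       loads in the loop below. */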
    int i;
    register int line_size_2 = line_size << 1;
    register int line_size_3 = line_size + line_size_2;
    register int line_size_4 = line_size << 2;

POWERPC_PERF_START_COUNT(altivec_put_pixels16_num, 1);
// hand-unrolling the loop by 4 gains about 15%
// minimum execution time goes from 74 to 60 cycles
// it's faster than -funroll-loops, but using
// -funroll-loops w/ this is bad - 74 cycles again.
// all this is on a 7450, tuning for the 7450
#if 0
    for(i=0; i<h; i++) {
      pixelsv1 = vec_ld(0, (unsigned char*)pixels);
      pixelsv2 = vec_ld(16, (unsigned char*)pixels);
      vec_st(vec_perm(pixelsv1, pixelsv2, perm),
             0, (unsigned char*)block);
      pixels+=line_size;
      block +=line_size;
    }
#else
    for(i=0; i<h; i+=4) {
      pixelsv1 = vec_ld(0, (unsigned char*)pixels);
      pixelsv2 = vec_ld(16, (unsigned char*)pixels);
      pixelsv1B = vec_ld(line_size, (unsigned char*)pixels);
      pixelsv2B = vec_ld(16 + line_size, (unsigned char*)pixels);
      pixelsv1C = vec_ld(line_size_2, (unsigned char*)pixels);
      pixelsv2C = vec_ld(16 + line_size_2, (unsigned char*)pixels);
      pixelsv1D = vec_ld(line_size_3, (unsigned char*)pixels);
      pixelsv2D = vec_ld(16 + line_size_3, (unsigned char*)pixels);
      vec_st(vec_perm(pixelsv1, pixelsv2, perm),
             0, (unsigned char*)block);
      vec_st(vec_perm(pixelsv1B, pixelsv2B, perm),
             line_size, (unsigned char*)block);
      vec_st(vec_perm(pixelsv1C, pixelsv2C, perm),
             line_size_2, (unsigned char*)block);
      vec_st(vec_perm(pixelsv1D, pixelsv2D, perm),
             line_size_3, (unsigned char*)block);
      pixels+=line_size_4;
      block +=line_size_4;
    }
#endif
POWERPC_PERF_STOP_COUNT(altivec_put_pixels16_num, 1);

#endif /* ALTIVEC_USE_REFERENCE_C_CODE */
}

/* next one assumes that ((line_size % 16) == 0) */
#define op_avg(a,b)  a = ( ((a)|(b)) - ((((a)^(b))&0xFEFEFEFEUL)>>1) )
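/* op_avg is a SWAR rounded-up byte average: a+b = (a^b) + 2*(a&b), so
   (a|b) - ((a^b)>>1) = (a&b) + ceil((a^b)/2) = (a+b+1)>>1 in every byte.
   The 0xFEFEFEFE mask clears each byte's low bit before the shift so no bit
   crosses into the byte below. */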
void avg_pixels16_altivec(uint8_t *block, const uint8_t *pixels, int line_size, int h)
{
POWERPC_PERF_DECLARE(altivec_avg_pixels16_num, 1);
#ifdef ALTIVEC_USE_REFERENCE_C_CODE
    int i;

POWERPC_PERF_START_COUNT(altivec_avg_pixels16_num, 1);

    for(i=0; i<h; i++) {
      op_avg(*((uint32_t*)(block)),LD32(pixels));
      op_avg(*((uint32_t*)(block+4)),LD32(pixels+4));
      op_avg(*((uint32_t*)(block+8)),LD32(pixels+8));
      op_avg(*((uint32_t*)(block+12)),LD32(pixels+12));
      pixels+=line_size;
      block +=line_size;
    }

POWERPC_PERF_STOP_COUNT(altivec_avg_pixels16_num, 1);

#else /* ALTIVEC_USE_REFERENCE_C_CODE */
    register vector unsigned char pixelsv1, pixelsv2, pixelsv, blockv;
    register vector unsigned char perm = vec_lvsl(0, pixels);
    int i;

POWERPC_PERF_START_COUNT(altivec_avg_pixels16_num, 1);

    for(i=0; i<h; i++) {
      pixelsv1 = vec_ld(0, (unsigned char*)pixels);
      pixelsv2 = vec_ld(16, (unsigned char*)pixels);
      blockv = vec_ld(0, block);
      pixelsv = vec_perm(pixelsv1, pixelsv2, perm);
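      /* vec_avg computes (a + b + 1) >> 1 per byte, the same rounded
         average as the scalar op_avg macro above. */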
      blockv = vec_avg(blockv,pixelsv);
      vec_st(blockv, 0, (unsigned char*)block);
      pixels+=line_size;
      block +=line_size;
    }

POWERPC_PERF_STOP_COUNT(altivec_avg_pixels16_num, 1);

#endif /* ALTIVEC_USE_REFERENCE_C_CODE */
}

/* next one assumes that ((line_size % 8) == 0) */
void avg_pixels8_altivec(uint8_t * block, const uint8_t * pixels, int line_size, int h)
{
POWERPC_PERF_DECLARE(altivec_avg_pixels8_num, 1);
#ifdef ALTIVEC_USE_REFERENCE_C_CODE
    int i;
POWERPC_PERF_START_COUNT(altivec_avg_pixels8_num, 1);
    for (i = 0; i < h; i++) {
        *((uint32_t *) (block)) =
            (((*((uint32_t *) (block))) |
              ((((const struct unaligned_32 *) (pixels))->l))) -
             ((((*((uint32_t *) (block))) ^
                ((((const struct unaligned_32 *) (pixels))->
                  l))) & 0xFEFEFEFEUL) >> 1));
        *((uint32_t *) (block + 4)) =
            (((*((uint32_t *) (block + 4))) |
              ((((const struct unaligned_32 *) (pixels + 4))->l))) -
             ((((*((uint32_t *) (block + 4))) ^
                ((((const struct unaligned_32 *) (pixels +
                                                  4))->
                  l))) & 0xFEFEFEFEUL) >> 1));
        pixels += line_size;
        block += line_size;
    }
POWERPC_PERF_STOP_COUNT(altivec_avg_pixels8_num, 1);

#else /* ALTIVEC_USE_REFERENCE_C_CODE */
    register vector unsigned char pixelsv1, pixelsv2, pixelsv, blockv;
    int i;

POWERPC_PERF_START_COUNT(altivec_avg_pixels8_num, 1);

   for (i = 0; i < h; i++) {
     /*
       block is 8 bytes-aligned, so we're either in the
       left block (16 bytes-aligned) or in the right block (not)
     */
     int rightside = ((unsigned long)block & 0x0000000F);

     blockv = vec_ld(0, block);
     pixelsv1 = vec_ld(0, (unsigned char*)pixels);
     pixelsv2 = vec_ld(16, (unsigned char*)pixels);
     pixelsv = vec_perm(pixelsv1, pixelsv2, vec_lvsl(0, pixels));

     if (rightside)
     {
       pixelsv = vec_perm(blockv, pixelsv, vcprm(0,1,s0,s1));
     }
     else
     {
       pixelsv = vec_perm(blockv, pixelsv, vcprm(s0,s1,2,3));
     }

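     /* vcprm (word-permute helper from the PPC headers; sN selects word N of
        the second operand) keeps the half of the 16-byte line that does not
        belong to this 8-byte block as a copy of blockv itself.  Since
        vec_avg(x,x) == x, that half is stored back unchanged below. */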
     blockv = vec_avg(blockv, pixelsv);

     vec_st(blockv, 0, block);

     pixels += line_size;
     block += line_size;
   }

POWERPC_PERF_STOP_COUNT(altivec_avg_pixels8_num, 1);

#endif /* ALTIVEC_USE_REFERENCE_C_CODE */
}

/* next one assumes that ((line_size % 8) == 0) */
void put_pixels8_xy2_altivec(uint8_t *block, const uint8_t *pixels, int line_size, int h)
{
POWERPC_PERF_DECLARE(altivec_put_pixels8_xy2_num, 1);
#ifdef ALTIVEC_USE_REFERENCE_C_CODE
    int j;
POWERPC_PERF_START_COUNT(altivec_put_pixels8_xy2_num, 1);
    for (j = 0; j < 2; j++) {
      int i;
      const uint32_t a = (((const struct unaligned_32 *) (pixels))->l);
      const uint32_t b =
        (((const struct unaligned_32 *) (pixels + 1))->l);
      uint32_t l0 =
        (a & 0x03030303UL) + (b & 0x03030303UL) + 0x02020202UL;
      uint32_t h0 =
        ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2);
      uint32_t l1, h1;
      pixels += line_size;
      for (i = 0; i < h; i += 2) {
        uint32_t a = (((const struct unaligned_32 *) (pixels))->l);
        uint32_t b = (((const struct unaligned_32 *) (pixels + 1))->l);
        l1 = (a & 0x03030303UL) + (b & 0x03030303UL);
        h1 = ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2);
        *((uint32_t *) block) =
          h0 + h1 + (((l0 + l1) >> 2) & 0x0F0F0F0FUL);
        pixels += line_size;
        block += line_size;
        a = (((const struct unaligned_32 *) (pixels))->l);
        b = (((const struct unaligned_32 *) (pixels + 1))->l);
        l0 = (a & 0x03030303UL) + (b & 0x03030303UL) + 0x02020202UL;
        h0 = ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2);
        *((uint32_t *) block) =
          h0 + h1 + (((l0 + l1) >> 2) & 0x0F0F0F0FUL);
        pixels += line_size;
        block += line_size;
      } pixels += 4 - line_size * (h + 1);
      block += 4 - line_size * h;
    }

POWERPC_PERF_STOP_COUNT(altivec_put_pixels8_xy2_num, 1);

#else /* ALTIVEC_USE_REFERENCE_C_CODE */
   register int i;
   register vector unsigned char
     pixelsv1, pixelsv2,
     pixelsavg;
   register vector unsigned char
     blockv, temp1, temp2;
   register vector unsigned short
     pixelssum1, pixelssum2, temp3;
   register const_vector unsigned char vczero = (const_vector unsigned char)vec_splat_u8(0);
   register const_vector unsigned short vctwo = (const_vector unsigned short)vec_splat_u16(2);

   temp1 = vec_ld(0, pixels);
   temp2 = vec_ld(16, pixels);
   pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(0, pixels));
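   /* vec_lvsl(1, pixels) is derived from (pixels + 1) & 0xF.  When pixels
      ends at offset 15 that wraps to 0 and the permute would simply return
      temp1 (the wrong quadword); in that one case the 16 bytes starting at
      pixels + 1 are exactly temp2. */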
   if ((((unsigned long)pixels) & 0x0000000F) ==  0x0000000F)
   {
     pixelsv2 = temp2;
   }
   else
   {
     pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(1, pixels));
   }
   pixelsv1 = vec_mergeh(vczero, pixelsv1);
   pixelsv2 = vec_mergeh(vczero, pixelsv2);
   pixelssum1 = vec_add((vector unsigned short)pixelsv1,
                        (vector unsigned short)pixelsv2);
   pixelssum1 = vec_add(pixelssum1, vctwo);

POWERPC_PERF_START_COUNT(altivec_put_pixels8_xy2_num, 1);
   for (i = 0; i < h ; i++) {
     int rightside = ((unsigned long)block & 0x0000000F);
     blockv = vec_ld(0, block);

     temp1 = vec_ld(line_size, pixels);
     temp2 = vec_ld(line_size + 16, pixels);
     pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(line_size, pixels));
     if (((((unsigned long)pixels) + line_size) & 0x0000000F) ==  0x0000000F)
     {
       pixelsv2 = temp2;
     }
     else
     {
       pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(line_size + 1, pixels));
     }

     pixelsv1 = vec_mergeh(vczero, pixelsv1);
     pixelsv2 = vec_mergeh(vczero, pixelsv2);
     pixelssum2 = vec_add((vector unsigned short)pixelsv1,
                          (vector unsigned short)pixelsv2);
     temp3 = vec_add(pixelssum1, pixelssum2);
     temp3 = vec_sra(temp3, vctwo);
     pixelssum1 = vec_add(pixelssum2, vctwo);
     pixelsavg = vec_packsu(temp3, (vector unsigned short) vczero);

     if (rightside)
     {
       blockv = vec_perm(blockv, pixelsavg, vcprm(0, 1, s0, s1));
     }
     else
     {
       blockv = vec_perm(blockv, pixelsavg, vcprm(s0, s1, 2, 3));
     }

     vec_st(blockv, 0, block);

     block += line_size;
     pixels += line_size;
   }

POWERPC_PERF_STOP_COUNT(altivec_put_pixels8_xy2_num, 1);
#endif /* ALTIVEC_USE_REFERENCE_C_CODE */
}

/* next one assumes that ((line_size % 8) == 0) */
void put_no_rnd_pixels8_xy2_altivec(uint8_t *block, const uint8_t *pixels, int line_size, int h)
{
POWERPC_PERF_DECLARE(altivec_put_no_rnd_pixels8_xy2_num, 1);
#ifdef ALTIVEC_USE_REFERENCE_C_CODE
    int j;
POWERPC_PERF_START_COUNT(altivec_put_no_rnd_pixels8_xy2_num, 1);
    for (j = 0; j < 2; j++) {
      int i;
      const uint32_t a = (((const struct unaligned_32 *) (pixels))->l);
      const uint32_t b =
        (((const struct unaligned_32 *) (pixels + 1))->l);
      uint32_t l0 =
        (a & 0x03030303UL) + (b & 0x03030303UL) + 0x01010101UL;
      uint32_t h0 =
        ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2);
      uint32_t l1, h1;
      pixels += line_size;
      for (i = 0; i < h; i += 2) {
        uint32_t a = (((const struct unaligned_32 *) (pixels))->l);
        uint32_t b = (((const struct unaligned_32 *) (pixels + 1))->l);
        l1 = (a & 0x03030303UL) + (b & 0x03030303UL);
        h1 = ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2);
        *((uint32_t *) block) =
          h0 + h1 + (((l0 + l1) >> 2) & 0x0F0F0F0FUL);
        pixels += line_size;
        block += line_size;
        a = (((const struct unaligned_32 *) (pixels))->l);
        b = (((const struct unaligned_32 *) (pixels + 1))->l);
        l0 = (a & 0x03030303UL) + (b & 0x03030303UL) + 0x01010101UL;
        h0 = ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2);
        *((uint32_t *) block) =
          h0 + h1 + (((l0 + l1) >> 2) & 0x0F0F0F0FUL);
        pixels += line_size;
        block += line_size;
      } pixels += 4 - line_size * (h + 1);
      block += 4 - line_size * h;
    }

POWERPC_PERF_STOP_COUNT(altivec_put_no_rnd_pixels8_xy2_num, 1);

#else /* ALTIVEC_USE_REFERENCE_C_CODE */
   register int i;
   register vector unsigned char
     pixelsv1, pixelsv2,
     pixelsavg;
   register vector unsigned char
     blockv, temp1, temp2;
   register vector unsigned short
     pixelssum1, pixelssum2, temp3;
   register const_vector unsigned char vczero = (const_vector unsigned char)vec_splat_u8(0);
   register const_vector unsigned short vcone = (const_vector unsigned short)vec_splat_u16(1);
   register const_vector unsigned short vctwo = (const_vector unsigned short)vec_splat_u16(2);

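   /* Same scheme as put_pixels8_xy2_altivec above, but the carried bias is
      vcone (1) instead of vctwo (2), so the final shift computes the
      "no rounding" average (a + b + c + d + 1) >> 2. */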
   temp1 = vec_ld(0, pixels);
   temp2 = vec_ld(16, pixels);
   pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(0, pixels));
   if ((((unsigned long)pixels) & 0x0000000F) ==  0x0000000F)
   {
     pixelsv2 = temp2;
   }
   else
   {
     pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(1, pixels));
   }
   pixelsv1 = vec_mergeh(vczero, pixelsv1);
   pixelsv2 = vec_mergeh(vczero, pixelsv2);
   pixelssum1 = vec_add((vector unsigned short)pixelsv1,
                        (vector unsigned short)pixelsv2);
   pixelssum1 = vec_add(pixelssum1, vcone);

POWERPC_PERF_START_COUNT(altivec_put_no_rnd_pixels8_xy2_num, 1);
   for (i = 0; i < h ; i++) {
     int rightside = ((unsigned long)block & 0x0000000F);
     blockv = vec_ld(0, block);

     temp1 = vec_ld(line_size, pixels);
     temp2 = vec_ld(line_size + 16, pixels);
     pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(line_size, pixels));
     if (((((unsigned long)pixels) + line_size) & 0x0000000F) ==  0x0000000F)
     {
       pixelsv2 = temp2;
     }
     else
     {
       pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(line_size + 1, pixels));
     }

     pixelsv1 = vec_mergeh(vczero, pixelsv1);
     pixelsv2 = vec_mergeh(vczero, pixelsv2);
     pixelssum2 = vec_add((vector unsigned short)pixelsv1,
                          (vector unsigned short)pixelsv2);
     temp3 = vec_add(pixelssum1, pixelssum2);
     temp3 = vec_sra(temp3, vctwo);
     pixelssum1 = vec_add(pixelssum2, vcone);
     pixelsavg = vec_packsu(temp3, (vector unsigned short) vczero);

     if (rightside)
     {
       blockv = vec_perm(blockv, pixelsavg, vcprm(0, 1, s0, s1));
     }
     else
     {
       blockv = vec_perm(blockv, pixelsavg, vcprm(s0, s1, 2, 3));
     }

     vec_st(blockv, 0, block);

     block += line_size;
     pixels += line_size;
   }

POWERPC_PERF_STOP_COUNT(altivec_put_no_rnd_pixels8_xy2_num, 1);
#endif /* ALTIVEC_USE_REFERENCE_C_CODE */
}

/* next one assumes that ((line_size % 16) == 0) */
void put_pixels16_xy2_altivec(uint8_t * block, const uint8_t * pixels, int line_size, int h)
{
POWERPC_PERF_DECLARE(altivec_put_pixels16_xy2_num, 1);
#ifdef ALTIVEC_USE_REFERENCE_C_CODE
    int j;
POWERPC_PERF_START_COUNT(altivec_put_pixels16_xy2_num, 1);
      for (j = 0; j < 4; j++) {
      int i;
      const uint32_t a = (((const struct unaligned_32 *) (pixels))->l);
      const uint32_t b =
        (((const struct unaligned_32 *) (pixels + 1))->l);
      uint32_t l0 =
        (a & 0x03030303UL) + (b & 0x03030303UL) + 0x02020202UL;
      uint32_t h0 =
        ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2);
      uint32_t l1, h1;
      pixels += line_size;
      for (i = 0; i < h; i += 2) {
        uint32_t a = (((const struct unaligned_32 *) (pixels))->l);
        uint32_t b = (((const struct unaligned_32 *) (pixels + 1))->l);
        l1 = (a & 0x03030303UL) + (b & 0x03030303UL);
        h1 = ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2);
        *((uint32_t *) block) =
          h0 + h1 + (((l0 + l1) >> 2) & 0x0F0F0F0FUL);
        pixels += line_size;
        block += line_size;
        a = (((const struct unaligned_32 *) (pixels))->l);
        b = (((const struct unaligned_32 *) (pixels + 1))->l);
        l0 = (a & 0x03030303UL) + (b & 0x03030303UL) + 0x02020202UL;
        h0 = ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2);
        *((uint32_t *) block) =
          h0 + h1 + (((l0 + l1) >> 2) & 0x0F0F0F0FUL);
        pixels += line_size;
        block += line_size;
      } pixels += 4 - line_size * (h + 1);
      block += 4 - line_size * h;
    }

POWERPC_PERF_STOP_COUNT(altivec_put_pixels16_xy2_num, 1);

#else /* ALTIVEC_USE_REFERENCE_C_CODE */
   register int i;
   register vector unsigned char
     pixelsv1, pixelsv2, pixelsv3, pixelsv4;
   register vector unsigned char
     blockv, temp1, temp2;
   register vector unsigned short
     pixelssum1, pixelssum2, temp3,
     pixelssum3, pixelssum4, temp4;
   register const_vector unsigned char vczero = (const_vector unsigned char)vec_splat_u8(0);
   register const_vector unsigned short vctwo = (const_vector unsigned short)vec_splat_u16(2);

POWERPC_PERF_START_COUNT(altivec_put_pixels16_xy2_num, 1);

   temp1 = vec_ld(0, pixels);
   temp2 = vec_ld(16, pixels);
   pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(0, pixels));
   if ((((unsigned long)pixels) & 0x0000000F) ==  0x0000000F)
   {
     pixelsv2 = temp2;
   }
   else
   {
     pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(1, pixels));
   }
   pixelsv3 = vec_mergel(vczero, pixelsv1);
   pixelsv4 = vec_mergel(vczero, pixelsv2);
   pixelsv1 = vec_mergeh(vczero, pixelsv1);
   pixelsv2 = vec_mergeh(vczero, pixelsv2);
   pixelssum3 = vec_add((vector unsigned short)pixelsv3,
                        (vector unsigned short)pixelsv4);
   pixelssum3 = vec_add(pixelssum3, vctwo);
   pixelssum1 = vec_add((vector unsigned short)pixelsv1,
                        (vector unsigned short)pixelsv2);
   pixelssum1 = vec_add(pixelssum1, vctwo);

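   /* pixelssum1 (low 8 pixels) and pixelssum3 (high 8 pixels) now hold
      pixels[x] + pixels[x+1] + 2 for the current row.  Each loop iteration
      adds the same sum for the row below and shifts right by 2, so every
      output byte is (p[x] + p[x+1] + p_below[x] + p_below[x+1] + 2) >> 2,
      and the new row's sum is carried into the next iteration. */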
   for (i = 0; i < h ; i++) {
     blockv = vec_ld(0, block);

     temp1 = vec_ld(line_size, pixels);
     temp2 = vec_ld(line_size + 16, pixels);
     pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(line_size, pixels));
     if (((((unsigned long)pixels) + line_size) & 0x0000000F) ==  0x0000000F)
     {
       pixelsv2 = temp2;
     }
     else
     {
       pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(line_size + 1, pixels));
     }

     pixelsv3 = vec_mergel(vczero, pixelsv1);
     pixelsv4 = vec_mergel(vczero, pixelsv2);
     pixelsv1 = vec_mergeh(vczero, pixelsv1);
     pixelsv2 = vec_mergeh(vczero, pixelsv2);

     pixelssum4 = vec_add((vector unsigned short)pixelsv3,
                          (vector unsigned short)pixelsv4);
     pixelssum2 = vec_add((vector unsigned short)pixelsv1,
                          (vector unsigned short)pixelsv2);
     temp4 = vec_add(pixelssum3, pixelssum4);
     temp4 = vec_sra(temp4, vctwo);
     temp3 = vec_add(pixelssum1, pixelssum2);
     temp3 = vec_sra(temp3, vctwo);

     pixelssum3 = vec_add(pixelssum4, vctwo);
     pixelssum1 = vec_add(pixelssum2, vctwo);

     blockv = vec_packsu(temp3, temp4);

     vec_st(blockv, 0, block);

     block += line_size;
     pixels += line_size;
   }

POWERPC_PERF_STOP_COUNT(altivec_put_pixels16_xy2_num, 1);
#endif /* ALTIVEC_USE_REFERENCE_C_CODE */
}

/* next one assumes that ((line_size % 16) == 0) */
void put_no_rnd_pixels16_xy2_altivec(uint8_t * block, const uint8_t * pixels, int line_size, int h)
{
POWERPC_PERF_DECLARE(altivec_put_no_rnd_pixels16_xy2_num, 1);
#ifdef ALTIVEC_USE_REFERENCE_C_CODE
    int j;
POWERPC_PERF_START_COUNT(altivec_put_no_rnd_pixels16_xy2_num, 1);
      for (j = 0; j < 4; j++) {
      int i;
      const uint32_t a = (((const struct unaligned_32 *) (pixels))->l);
      const uint32_t b =
        (((const struct unaligned_32 *) (pixels + 1))->l);
      uint32_t l0 =
        (a & 0x03030303UL) + (b & 0x03030303UL) + 0x01010101UL;
      uint32_t h0 =
        ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2);
      uint32_t l1, h1;
      pixels += line_size;
      for (i = 0; i < h; i += 2) {
        uint32_t a = (((const struct unaligned_32 *) (pixels))->l);
        uint32_t b = (((const struct unaligned_32 *) (pixels + 1))->l);
        l1 = (a & 0x03030303UL) + (b & 0x03030303UL);
        h1 = ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2);
        *((uint32_t *) block) =
          h0 + h1 + (((l0 + l1) >> 2) & 0x0F0F0F0FUL);
        pixels += line_size;
        block += line_size;
        a = (((const struct unaligned_32 *) (pixels))->l);
        b = (((const struct unaligned_32 *) (pixels + 1))->l);
        l0 = (a & 0x03030303UL) + (b & 0x03030303UL) + 0x01010101UL;
        h0 = ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2);
        *((uint32_t *) block) =
          h0 + h1 + (((l0 + l1) >> 2) & 0x0F0F0F0FUL);
        pixels += line_size;
        block += line_size;
      } pixels += 4 - line_size * (h + 1);
      block += 4 - line_size * h;
    }

POWERPC_PERF_STOP_COUNT(altivec_put_no_rnd_pixels16_xy2_num, 1);

#else /* ALTIVEC_USE_REFERENCE_C_CODE */
   register int i;
   register vector unsigned char
     pixelsv1, pixelsv2, pixelsv3, pixelsv4;
   register vector unsigned char
     blockv, temp1, temp2;
   register vector unsigned short
     pixelssum1, pixelssum2, temp3,
     pixelssum3, pixelssum4, temp4;
   register const_vector unsigned char vczero = (const_vector unsigned char)vec_splat_u8(0);
   register const_vector unsigned short vcone = (const_vector unsigned short)vec_splat_u16(1);
   register const_vector unsigned short vctwo = (const_vector unsigned short)vec_splat_u16(2);

POWERPC_PERF_START_COUNT(altivec_put_no_rnd_pixels16_xy2_num, 1);

   temp1 = vec_ld(0, pixels);
   temp2 = vec_ld(16, pixels);
   pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(0, pixels));
   if ((((unsigned long)pixels) & 0x0000000F) ==  0x0000000F)
   {
     pixelsv2 = temp2;
   }
   else
   {
     pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(1, pixels));
   }
   pixelsv3 = vec_mergel(vczero, pixelsv1);
   pixelsv4 = vec_mergel(vczero, pixelsv2);
   pixelsv1 = vec_mergeh(vczero, pixelsv1);
   pixelsv2 = vec_mergeh(vczero, pixelsv2);
   pixelssum3 = vec_add((vector unsigned short)pixelsv3,
                        (vector unsigned short)pixelsv4);
   pixelssum3 = vec_add(pixelssum3, vcone);
   pixelssum1 = vec_add((vector unsigned short)pixelsv1,
                        (vector unsigned short)pixelsv2);
   pixelssum1 = vec_add(pixelssum1, vcone);

   for (i = 0; i < h ; i++) {
     blockv = vec_ld(0, block);

     temp1 = vec_ld(line_size, pixels);
     temp2 = vec_ld(line_size + 16, pixels);
     pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(line_size, pixels));
     if (((((unsigned long)pixels) + line_size) & 0x0000000F) ==  0x0000000F)
     {
       pixelsv2 = temp2;
     }
     else
     {
       pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(line_size + 1, pixels));
     }

     pixelsv3 = vec_mergel(vczero, pixelsv1);
     pixelsv4 = vec_mergel(vczero, pixelsv2);
     pixelsv1 = vec_mergeh(vczero, pixelsv1);
     pixelsv2 = vec_mergeh(vczero, pixelsv2);

     pixelssum4 = vec_add((vector unsigned short)pixelsv3,
                          (vector unsigned short)pixelsv4);
     pixelssum2 = vec_add((vector unsigned short)pixelsv1,
                          (vector unsigned short)pixelsv2);
     temp4 = vec_add(pixelssum3, pixelssum4);
     temp4 = vec_sra(temp4, vctwo);
     temp3 = vec_add(pixelssum1, pixelssum2);
     temp3 = vec_sra(temp3, vctwo);

     pixelssum3 = vec_add(pixelssum4, vcone);
     pixelssum1 = vec_add(pixelssum2, vcone);

     blockv = vec_packsu(temp3, temp4);

     vec_st(blockv, 0, block);

     block += line_size;
     pixels += line_size;
   }

POWERPC_PERF_STOP_COUNT(altivec_put_no_rnd_pixels16_xy2_num, 1);
#endif /* ALTIVEC_USE_REFERENCE_C_CODE */
}

int has_altivec(void)
{
#ifdef CONFIG_DARWIN
    int sels[2] = {CTL_HW, HW_VECTORUNIT};
    int has_vu = 0;
    size_t len = sizeof(has_vu);
    int err;

    err = sysctl(sels, 2, &has_vu, &len, NULL, 0);

    if (err == 0) return (has_vu != 0);
#else /* CONFIG_DARWIN */
/* no Darwin, do it the brute-force way */
/* this is borrowed from the libmpeg2 library */
    {
      signal (SIGILL, sigill_handler);
      if (sigsetjmp (jmpbuf, 1)) {
        signal (SIGILL, SIG_DFL);
      } else {
        canjump = 1;

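        /* Try to execute an AltiVec instruction: "mtspr 256, %0" writes the
           VRSAVE special-purpose register and vand is a vector no-op.  On a
           CPU without AltiVec this raises SIGILL, and sigill_handler() turns
           that into a siglongjmp back to the sigsetjmp branch above, so we
           fall through and return 0. */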
        asm volatile ("mtspr 256, %0\n\t"
                      "vand %%v0, %%v0, %%v0"
                      :
                      : "r" (-1));

        signal (SIGILL, SIG_DFL);
        return 1;
      }
    }
#endif /* CONFIG_DARWIN */
    return 0;
}