ffmpeg / libavcodec / ppc / dsputil_altivec.c @ 8dbcc9f2

History | View | Annotate | Download (45 KB)

/*
 * Copyright (c) 2002 Brian Foley
 * Copyright (c) 2002 Dieter Shirley
 * Copyright (c) 2003 Romain Dolbeau <romain@dolbeau.org>
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 */

#include "../dsputil.h"

#include "gcc_fixes.h"

#include "dsputil_altivec.h"

#ifdef CONFIG_DARWIN
#include <sys/sysctl.h>
#else /* CONFIG_DARWIN */
#include <signal.h>
#include <setjmp.h>

static sigjmp_buf jmpbuf;
static volatile sig_atomic_t canjump = 0;

static void sigill_handler (int sig)
{
    if (!canjump) {
        signal (sig, SIG_DFL);
        raise (sig);
    }

    canjump = 0;
    siglongjmp (jmpbuf, 1);
}
#endif /* CONFIG_DARWIN */

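/*
 * Two idioms recur throughout this file.
 *
 * Unaligned loads: AltiVec loads are always 16-byte aligned, so a possibly
 * misaligned group of 16 pixels is fetched by loading the two aligned
 * vectors that straddle it and combining them with
 * vec_perm(tv[0], tv[1], vec_lvsl(0, ptr)); vec_lvsl() builds the shift
 * permute from the low four bits of the address.
 *
 * Reductions: vec_sum4s() adds each group of four bytes into one of four
 * 32-bit partial sums, vec_sums() folds those into element 3, vec_splat()
 * copies that element across the vector, and vec_ste() stores a single
 * element into the 16-byte-aligned int s.
 */
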
int pix_abs16x16_x2_altivec(uint8_t *pix1, uint8_t *pix2, int line_size)
{
    int i;
    int s __attribute__((aligned(16)));
    const vector unsigned char zero = (const vector unsigned char)vec_splat_u8(0);
    vector unsigned char *tv;
    vector unsigned char pix1v, pix2v, pix2iv, avgv, t5;
    vector unsigned int sad;
    vector signed int sumdiffs;

    s = 0;
    sad = (vector unsigned int)vec_splat_u32(0);
    for(i=0;i<16;i++) {
        /*
           Read unaligned pixels into our vectors. The vectors are as follows:
           pix1v: pix1[0]-pix1[15]
           pix2v: pix2[0]-pix2[15]        pix2iv: pix2[1]-pix2[16]
        */
        tv = (vector unsigned char *) pix1;
        pix1v = vec_perm(tv[0], tv[1], vec_lvsl(0, pix1));

        tv = (vector unsigned char *) &pix2[0];
        pix2v = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix2[0]));

        tv = (vector unsigned char *) &pix2[1];
        pix2iv = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix2[1]));

        /* Calculate the average vector */
        avgv = vec_avg(pix2v, pix2iv);

        /* Calculate a sum of abs differences vector */
        t5 = vec_sub(vec_max(pix1v, avgv), vec_min(pix1v, avgv));

        /* Add each 4 pixel group together and put 4 results into sad */
        sad = vec_sum4s(t5, sad);

        pix1 += line_size;
        pix2 += line_size;
    }
    /* Sum up the four partial sums, and put the result into s */
    sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero);
    sumdiffs = vec_splat(sumdiffs, 3);
    vec_ste(sumdiffs, 0, &s);

    return s;
}

int pix_abs16x16_y2_altivec(uint8_t *pix1, uint8_t *pix2, int line_size)
{
    int i;
    int s __attribute__((aligned(16)));
    const vector unsigned char zero = (const vector unsigned char)vec_splat_u8(0);
    vector unsigned char *tv;
    vector unsigned char pix1v, pix2v, pix3v, avgv, t5;
    vector unsigned int sad;
    vector signed int sumdiffs;
    uint8_t *pix3 = pix2 + line_size;

    s = 0;
    sad = (vector unsigned int)vec_splat_u32(0);

    /*
       Because pix3 = pix2 + line_size, the pix3 of one iteration becomes
       pix2 in the next iteration. We can use this fact to avoid a
       potentially expensive unaligned read each time around the loop.
       Read unaligned pixels into our vectors. The vectors are as follows:
       pix2v: pix2[0]-pix2[15]
    */
    tv = (vector unsigned char *) &pix2[0];
    pix2v = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix2[0]));

    for(i=0;i<16;i++) {
        /*
           Read unaligned pixels into our vectors. The vectors are as follows:
           pix1v: pix1[0]-pix1[15]
           pix3v: pix3[0]-pix3[15]
        */
        tv = (vector unsigned char *) pix1;
        pix1v = vec_perm(tv[0], tv[1], vec_lvsl(0, pix1));

        tv = (vector unsigned char *) &pix3[0];
        pix3v = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix3[0]));

        /* Calculate the average vector */
        avgv = vec_avg(pix2v, pix3v);

        /* Calculate a sum of abs differences vector */
        t5 = vec_sub(vec_max(pix1v, avgv), vec_min(pix1v, avgv));

        /* Add each 4 pixel group together and put 4 results into sad */
        sad = vec_sum4s(t5, sad);

        pix1 += line_size;
        pix2v = pix3v;
        pix3 += line_size;
    }

    /* Sum up the four partial sums, and put the result into s */
    sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero);
    sumdiffs = vec_splat(sumdiffs, 3);
    vec_ste(sumdiffs, 0, &s);
    return s;
}

int pix_abs16x16_xy2_altivec(uint8_t *pix1, uint8_t *pix2, int line_size)
{
    int i;
    int s __attribute__((aligned(16)));
    uint8_t *pix3 = pix2 + line_size;
    const vector unsigned char zero = (const vector unsigned char)vec_splat_u8(0);
    const vector unsigned short two = (const vector unsigned short)vec_splat_u16(2);
    vector unsigned char *tv, avgv, t5;
    vector unsigned char pix1v, pix2v, pix3v, pix2iv, pix3iv;
    vector unsigned short pix2lv, pix2hv, pix2ilv, pix2ihv;
    vector unsigned short pix3lv, pix3hv, pix3ilv, pix3ihv;
    vector unsigned short avghv, avglv;
    vector unsigned short t1, t2, t3, t4;
    vector unsigned int sad;
    vector signed int sumdiffs;

    sad = (vector unsigned int)vec_splat_u32(0);

    s = 0;

    /*
       Because pix3 = pix2 + line_size, the pix3 of one iteration becomes
       pix2 in the next iteration. We can use this fact to avoid a
       potentially expensive unaligned read, as well as some splitting and
       vector addition, each time around the loop.
       Read unaligned pixels into our vectors. The vectors are as follows:
       pix2v: pix2[0]-pix2[15]        pix2iv: pix2[1]-pix2[16]
       Split the pixel vectors into shorts
    */
    tv = (vector unsigned char *) &pix2[0];
    pix2v = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix2[0]));

    tv = (vector unsigned char *) &pix2[1];
    pix2iv = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix2[1]));

    pix2hv = (vector unsigned short) vec_mergeh(zero, pix2v);
    pix2lv = (vector unsigned short) vec_mergel(zero, pix2v);
    pix2ihv = (vector unsigned short) vec_mergeh(zero, pix2iv);
    pix2ilv = (vector unsigned short) vec_mergel(zero, pix2iv);
    t1 = vec_add(pix2hv, pix2ihv);
    t2 = vec_add(pix2lv, pix2ilv);

    for(i=0;i<16;i++) {
        /*
           Read unaligned pixels into our vectors. The vectors are as follows:
           pix1v: pix1[0]-pix1[15]
           pix3v: pix3[0]-pix3[15]        pix3iv: pix3[1]-pix3[16]
        */
        tv = (vector unsigned char *) pix1;
        pix1v = vec_perm(tv[0], tv[1], vec_lvsl(0, pix1));

        tv = (vector unsigned char *) &pix3[0];
        pix3v = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix3[0]));

        tv = (vector unsigned char *) &pix3[1];
        pix3iv = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix3[1]));

        /*
          Note that AltiVec does have vec_avg, but this works on vector pairs
          and rounds up. We could do avg(avg(a,b), avg(c,d)), but the rounding
          would mean that, for example, avg(3,0,0,1) = 2, when it should be 1.
          Instead, we have to split the pixel vectors into vectors of shorts
          and do the averaging by hand.
        */

        /* Split the pixel vectors into shorts */
        pix3hv = (vector unsigned short) vec_mergeh(zero, pix3v);
        pix3lv = (vector unsigned short) vec_mergel(zero, pix3v);
        pix3ihv = (vector unsigned short) vec_mergeh(zero, pix3iv);
        pix3ilv = (vector unsigned short) vec_mergel(zero, pix3iv);

        /* Do the averaging on them */
        t3 = vec_add(pix3hv, pix3ihv);
        t4 = vec_add(pix3lv, pix3ilv);

        avghv = vec_sr(vec_add(vec_add(t1, t3), two), two);
        avglv = vec_sr(vec_add(vec_add(t2, t4), two), two);

        /* Pack the shorts back into a result */
        avgv = vec_pack(avghv, avglv);

        /* Calculate a sum of abs differences vector */
        t5 = vec_sub(vec_max(pix1v, avgv), vec_min(pix1v, avgv));

        /* Add each 4 pixel group together and put 4 results into sad */
        sad = vec_sum4s(t5, sad);

        pix1 += line_size;
        pix3 += line_size;
        /* Transfer the calculated values for pix3 into pix2 */
        t1 = t3;
        t2 = t4;
    }
    /* Sum up the four partial sums, and put the result into s */
    sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero);
    sumdiffs = vec_splat(sumdiffs, 3);
    vec_ste(sumdiffs, 0, &s);

    return s;
}

int pix_abs16x16_altivec(uint8_t *pix1, uint8_t *pix2, int line_size)
{
    int i;
    int s __attribute__((aligned(16)));
    const vector unsigned int zero = (const vector unsigned int)vec_splat_u32(0);
    vector unsigned char perm1, perm2, *pix1v, *pix2v;
    vector unsigned char t1, t2, t3, t4, t5;
    vector unsigned int sad;
    vector signed int sumdiffs;

    sad = (vector unsigned int)vec_splat_u32(0);


    for(i=0;i<16;i++) {
        /* Read potentially unaligned pixels into t1 and t2 */
        perm1 = vec_lvsl(0, pix1);
        pix1v = (vector unsigned char *) pix1;
        perm2 = vec_lvsl(0, pix2);
        pix2v = (vector unsigned char *) pix2;
        t1 = vec_perm(pix1v[0], pix1v[1], perm1);
        t2 = vec_perm(pix2v[0], pix2v[1], perm2);

        /* Calculate a sum of abs differences vector */
        t3 = vec_max(t1, t2);
        t4 = vec_min(t1, t2);
        t5 = vec_sub(t3, t4);

        /* Add each 4 pixel group together and put 4 results into sad */
        sad = vec_sum4s(t5, sad);

        pix1 += line_size;
        pix2 += line_size;
    }

    /* Sum up the four partial sums, and put the result into s */
    sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero);
    sumdiffs = vec_splat(sumdiffs, 3);
    vec_ste(sumdiffs, 0, &s);

    return s;
}

int pix_abs8x8_altivec(uint8_t *pix1, uint8_t *pix2, int line_size)
{
    int i;
    int s __attribute__((aligned(16)));
    const vector unsigned int zero = (const vector unsigned int)vec_splat_u32(0);
    vector unsigned char perm1, perm2, permclear, *pix1v, *pix2v;
    vector unsigned char t1, t2, t3, t4, t5;
    vector unsigned int sad;
    vector signed int sumdiffs;

    sad = (vector unsigned int)vec_splat_u32(0);

    permclear = (vector unsigned char)AVV(255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0);

    for(i=0;i<8;i++) {
        /* Read potentially unaligned pixels into t1 and t2
           Since we're reading 16 pixels, and actually only want 8,
           mask out the last 8 pixels. The 0s don't change the sum. */
        perm1 = vec_lvsl(0, pix1);
        pix1v = (vector unsigned char *) pix1;
        perm2 = vec_lvsl(0, pix2);
        pix2v = (vector unsigned char *) pix2;
        t1 = vec_and(vec_perm(pix1v[0], pix1v[1], perm1), permclear);
        t2 = vec_and(vec_perm(pix2v[0], pix2v[1], perm2), permclear);

        /* Calculate a sum of abs differences vector */
        t3 = vec_max(t1, t2);
        t4 = vec_min(t1, t2);
        t5 = vec_sub(t3, t4);

        /* Add each 4 pixel group together and put 4 results into sad */
        sad = vec_sum4s(t5, sad);

        pix1 += line_size;
        pix2 += line_size;
    }

    /* Sum up the four partial sums, and put the result into s */
    sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero);
    sumdiffs = vec_splat(sumdiffs, 3);
    vec_ste(sumdiffs, 0, &s);

    return s;
}

int pix_norm1_altivec(uint8_t *pix, int line_size)
{
    int i;
    int s __attribute__((aligned(16)));
    const vector unsigned int zero = (const vector unsigned int)vec_splat_u32(0);
    vector unsigned char *tv;
    vector unsigned char pixv;
    vector unsigned int sv;
    vector signed int sum;

    sv = (vector unsigned int)vec_splat_u32(0);

    s = 0;
    for (i = 0; i < 16; i++) {
        /* Read in the potentially unaligned pixels */
        tv = (vector unsigned char *) pix;
        pixv = vec_perm(tv[0], tv[1], vec_lvsl(0, pix));

        /* Square the values, and add them to our sum */
        sv = vec_msum(pixv, pixv, sv);

        pix += line_size;
    }
    /* Sum up the four partial sums, and put the result into s */
    sum = vec_sums((vector signed int) sv, (vector signed int) zero);
    sum = vec_splat(sum, 3);
    vec_ste(sum, 0, &s);

    return s;
}
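
/*
 * pix_norm1 above and the sse8/sse16 functions below rely on vec_msum():
 * it multiplies corresponding unsigned bytes of its first two operands and
 * adds each group of four products into the matching 32-bit word of the
 * accumulator, so a single vec_sums() at the end yields the sum of squares.
 */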

/**
 * Sum of Squared Errors for an 8x8 block.
 * AltiVec-enhanced.
 * It's the pix_abs8x8_altivec code above w/ squaring added.
 */
int sse8_altivec(void *v, uint8_t *pix1, uint8_t *pix2, int line_size)
{
    int i;
    int s __attribute__((aligned(16)));
    const vector unsigned int zero = (const vector unsigned int)vec_splat_u32(0);
    vector unsigned char perm1, perm2, permclear, *pix1v, *pix2v;
    vector unsigned char t1, t2, t3, t4, t5;
    vector unsigned int sum;
    vector signed int sumsqr;

    sum = (vector unsigned int)vec_splat_u32(0);

    permclear = (vector unsigned char)AVV(255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0);


    for(i=0;i<8;i++) {
        /* Read potentially unaligned pixels into t1 and t2
           Since we're reading 16 pixels, and actually only want 8,
           mask out the last 8 pixels. The 0s don't change the sum. */
        perm1 = vec_lvsl(0, pix1);
        pix1v = (vector unsigned char *) pix1;
        perm2 = vec_lvsl(0, pix2);
        pix2v = (vector unsigned char *) pix2;
        t1 = vec_and(vec_perm(pix1v[0], pix1v[1], perm1), permclear);
        t2 = vec_and(vec_perm(pix2v[0], pix2v[1], perm2), permclear);

        /*
          Since we want to use unsigned chars, we can take advantage
          of the fact that abs(a-b)^2 = (a-b)^2.
        */

        /* Calculate abs differences vector */
        t3 = vec_max(t1, t2);
        t4 = vec_min(t1, t2);
        t5 = vec_sub(t3, t4);

        /* Square the values and add them to our sum */
        sum = vec_msum(t5, t5, sum);

        pix1 += line_size;
        pix2 += line_size;
    }

    /* Sum up the four partial sums, and put the result into s */
    sumsqr = vec_sums((vector signed int) sum, (vector signed int) zero);
    sumsqr = vec_splat(sumsqr, 3);
    vec_ste(sumsqr, 0, &s);

    return s;
}

/**
 * Sum of Squared Errors for a 16x16 block.
 * AltiVec-enhanced.
 * It's the pix_abs16x16_altivec code above w/ squaring added.
 */
int sse16_altivec(void *v, uint8_t *pix1, uint8_t *pix2, int line_size)
{
    int i;
    int s __attribute__((aligned(16)));
    const vector unsigned int zero = (const vector unsigned int)vec_splat_u32(0);
    vector unsigned char perm1, perm2, *pix1v, *pix2v;
    vector unsigned char t1, t2, t3, t4, t5;
    vector unsigned int sum;
    vector signed int sumsqr;

    sum = (vector unsigned int)vec_splat_u32(0);

    for(i=0;i<16;i++) {
        /* Read potentially unaligned pixels into t1 and t2 */
        perm1 = vec_lvsl(0, pix1);
        pix1v = (vector unsigned char *) pix1;
        perm2 = vec_lvsl(0, pix2);
        pix2v = (vector unsigned char *) pix2;
        t1 = vec_perm(pix1v[0], pix1v[1], perm1);
        t2 = vec_perm(pix2v[0], pix2v[1], perm2);

        /*
          Since we want to use unsigned chars, we can take advantage
          of the fact that abs(a-b)^2 = (a-b)^2.
        */

        /* Calculate abs differences vector */
        t3 = vec_max(t1, t2);
        t4 = vec_min(t1, t2);
        t5 = vec_sub(t3, t4);

        /* Square the values and add them to our sum */
        sum = vec_msum(t5, t5, sum);

        pix1 += line_size;
        pix2 += line_size;
    }

    /* Sum up the four partial sums, and put the result into s */
    sumsqr = vec_sums((vector signed int) sum, (vector signed int) zero);
    sumsqr = vec_splat(sumsqr, 3);
    vec_ste(sumsqr, 0, &s);

    return s;
}

int pix_sum_altivec(uint8_t * pix, int line_size)
{
    const vector unsigned int zero = (const vector unsigned int)vec_splat_u32(0);
    vector unsigned char perm, *pixv;
    vector unsigned char t1;
    vector unsigned int sad;
    vector signed int sumdiffs;

    int i;
    int s __attribute__((aligned(16)));

    sad = (vector unsigned int)vec_splat_u32(0);

    for (i = 0; i < 16; i++) {
        /* Read the potentially unaligned 16 pixels into t1 */
        perm = vec_lvsl(0, pix);
        pixv = (vector unsigned char *) pix;
        t1 = vec_perm(pixv[0], pixv[1], perm);

        /* Add each 4 pixel group together and put 4 results into sad */
        sad = vec_sum4s(t1, sad);

        pix += line_size;
    }

    /* Sum up the four partial sums, and put the result into s */
    sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero);
    sumdiffs = vec_splat(sumdiffs, 3);
    vec_ste(sumdiffs, 0, &s);

    return s;
}

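/*
 * The two functions below produce 16-bit DCT input; each output row is
 * eight shorts, i.e. exactly 16 bytes, so a single aligned vec_st() per
 * row suffices (the block is assumed to be 16-byte aligned, and DCTELEM is
 * assumed to be the 16-bit coefficient type from dsputil.h).
 * vec_mergeh(zero, bytes) zero-extends the first eight pixels to shorts.
 */
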
void get_pixels_altivec(DCTELEM *restrict block, const uint8_t *pixels, int line_size)
{
    int i;
    vector unsigned char perm, bytes, *pixv;
    const vector unsigned char zero = (const vector unsigned char)vec_splat_u8(0);
    vector signed short shorts;

    for(i=0;i<8;i++)
    {
        // Read potentially unaligned pixels.
        // We're reading 16 pixels, and actually only want 8,
        // but we simply ignore the extras.
        perm = vec_lvsl(0, pixels);
        pixv = (vector unsigned char *) pixels;
        bytes = vec_perm(pixv[0], pixv[1], perm);

        // convert the bytes into shorts
        shorts = (vector signed short)vec_mergeh(zero, bytes);

        // save the data to the block, we assume the block is 16-byte aligned
        vec_st(shorts, i*16, (vector signed short*)block);

        pixels += line_size;
    }
}

void diff_pixels_altivec(DCTELEM *restrict block, const uint8_t *s1,
        const uint8_t *s2, int stride)
{
    int i;
    vector unsigned char perm, bytes, *pixv;
    const vector unsigned char zero = (const vector unsigned char)vec_splat_u8(0);
    vector signed short shorts1, shorts2;

    for(i=0;i<4;i++)
    {
        // Read potentially unaligned pixels
        // We're reading 16 pixels, and actually only want 8,
        // but we simply ignore the extras.
        perm = vec_lvsl(0, s1);
        pixv = (vector unsigned char *) s1;
        bytes = vec_perm(pixv[0], pixv[1], perm);

        // convert the bytes into shorts
        shorts1 = (vector signed short)vec_mergeh(zero, bytes);

        // Do the same for the second block of pixels
        perm = vec_lvsl(0, s2);
        pixv = (vector unsigned char *) s2;
        bytes = vec_perm(pixv[0], pixv[1], perm);

        // convert the bytes into shorts
        shorts2 = (vector signed short)vec_mergeh(zero, bytes);

        // Do the subtraction
        shorts1 = vec_sub(shorts1, shorts2);

        // save the data to the block, we assume the block is 16-byte aligned
        vec_st(shorts1, 0, (vector signed short*)block);

        s1 += stride;
        s2 += stride;
        block += 8;


        // The code below is a copy of the code above... This is a manual
        // unroll.

        // Read potentially unaligned pixels
        // We're reading 16 pixels, and actually only want 8,
        // but we simply ignore the extras.
        perm = vec_lvsl(0, s1);
        pixv = (vector unsigned char *) s1;
        bytes = vec_perm(pixv[0], pixv[1], perm);

        // convert the bytes into shorts
        shorts1 = (vector signed short)vec_mergeh(zero, bytes);

        // Do the same for the second block of pixels
        perm = vec_lvsl(0, s2);
        pixv = (vector unsigned char *) s2;
        bytes = vec_perm(pixv[0], pixv[1], perm);

        // convert the bytes into shorts
        shorts2 = (vector signed short)vec_mergeh(zero, bytes);

        // Do the subtraction
        shorts1 = vec_sub(shorts1, shorts2);

        // save the data to the block, we assume the block is 16-byte aligned
        vec_st(shorts1, 0, (vector signed short*)block);

        s1 += stride;
        s2 += stride;
        block += 8;
    }
}

int sad16x16_altivec(void *s, uint8_t *a, uint8_t *b, int stride) {
  return pix_abs16x16_altivec(a,b,stride);
}

int sad8x8_altivec(void *s, uint8_t *a, uint8_t *b, int stride) {
  return pix_abs8x8_altivec(a,b,stride);
}

void add_bytes_altivec(uint8_t *dst, uint8_t *src, int w) {
#ifdef ALTIVEC_USE_REFERENCE_C_CODE
    int i;
    for(i=0; i+7<w; i+=8){
        dst[i+0] += src[i+0];
        dst[i+1] += src[i+1];
        dst[i+2] += src[i+2];
        dst[i+3] += src[i+3];
        dst[i+4] += src[i+4];
        dst[i+5] += src[i+5];
        dst[i+6] += src[i+6];
        dst[i+7] += src[i+7];
    }
    for(; i<w; i++)
        dst[i+0] += src[i+0];
#else /* ALTIVEC_USE_REFERENCE_C_CODE */
    register int i;
    register vector unsigned char vdst, vsrc;

    /* dst and src are 16-byte aligned (guaranteed) */
    for(i = 0 ; (i + 15) < w ; i += 16)
    {
      vdst = vec_ld(i, (unsigned char*)dst);
      vsrc = vec_ld(i, (unsigned char*)src);
      vdst = vec_add(vsrc, vdst);
      vec_st(vdst, i, (unsigned char*)dst);
    }
    /* if w is not a multiple of 16 */
    for (; (i < w) ; i++)
    {
      dst[i] += src[i];
    }
#endif /* ALTIVEC_USE_REFERENCE_C_CODE */
}

/* next one assumes that ((line_size % 16) == 0) */
void put_pixels16_altivec(uint8_t *block, const uint8_t *pixels, int line_size, int h)
{
POWERPC_PERF_DECLARE(altivec_put_pixels16_num, 1);
#ifdef ALTIVEC_USE_REFERENCE_C_CODE
    int i;

POWERPC_PERF_START_COUNT(altivec_put_pixels16_num, 1);

    for(i=0; i<h; i++) {
      *((uint32_t*)(block )) = (((const struct unaligned_32 *) (pixels))->l);
      *((uint32_t*)(block+4)) = (((const struct unaligned_32 *) (pixels+4))->l);
      *((uint32_t*)(block+8)) = (((const struct unaligned_32 *) (pixels+8))->l);
      *((uint32_t*)(block+12)) = (((const struct unaligned_32 *) (pixels+12))->l);
      pixels+=line_size;
      block +=line_size;
    }

POWERPC_PERF_STOP_COUNT(altivec_put_pixels16_num, 1);

#else /* ALTIVEC_USE_REFERENCE_C_CODE */
    register vector unsigned char pixelsv1, pixelsv2;
    register vector unsigned char pixelsv1B, pixelsv2B;
    register vector unsigned char pixelsv1C, pixelsv2C;
    register vector unsigned char pixelsv1D, pixelsv2D;

    register vector unsigned char perm = vec_lvsl(0, pixels);
    int i;
    register int line_size_2 = line_size << 1;
    register int line_size_3 = line_size + line_size_2;
    register int line_size_4 = line_size << 2;

POWERPC_PERF_START_COUNT(altivec_put_pixels16_num, 1);
// hand-unrolling the loop by 4 gains about 15%
// minimum execution time goes from 74 to 60 cycles
// it's faster than -funroll-loops, but using
// -funroll-loops w/ this is bad - 74 cycles again.
// all this is on a 7450, tuning for the 7450
#if 0
    for(i=0; i<h; i++) {
      pixelsv1 = vec_ld(0, (unsigned char*)pixels);
      pixelsv2 = vec_ld(16, (unsigned char*)pixels);
      vec_st(vec_perm(pixelsv1, pixelsv2, perm),
             0, (unsigned char*)block);
      pixels+=line_size;
      block +=line_size;
    }
#else
    for(i=0; i<h; i+=4) {
      pixelsv1 = vec_ld(0, (unsigned char*)pixels);
      pixelsv2 = vec_ld(16, (unsigned char*)pixels);
      pixelsv1B = vec_ld(line_size, (unsigned char*)pixels);
      pixelsv2B = vec_ld(16 + line_size, (unsigned char*)pixels);
      pixelsv1C = vec_ld(line_size_2, (unsigned char*)pixels);
      pixelsv2C = vec_ld(16 + line_size_2, (unsigned char*)pixels);
      pixelsv1D = vec_ld(line_size_3, (unsigned char*)pixels);
      pixelsv2D = vec_ld(16 + line_size_3, (unsigned char*)pixels);
      vec_st(vec_perm(pixelsv1, pixelsv2, perm),
             0, (unsigned char*)block);
      vec_st(vec_perm(pixelsv1B, pixelsv2B, perm),
             line_size, (unsigned char*)block);
      vec_st(vec_perm(pixelsv1C, pixelsv2C, perm),
             line_size_2, (unsigned char*)block);
      vec_st(vec_perm(pixelsv1D, pixelsv2D, perm),
             line_size_3, (unsigned char*)block);
      pixels+=line_size_4;
      block +=line_size_4;
    }
#endif
POWERPC_PERF_STOP_COUNT(altivec_put_pixels16_num, 1);

#endif /* ALTIVEC_USE_REFERENCE_C_CODE */
}

/* next one assumes that ((line_size % 16) == 0) */
#define op_avg(a,b)  a = ( ((a)|(b)) - ((((a)^(b))&0xFEFEFEFEUL)>>1) )
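/*
 * op_avg() is the scalar counterpart of vec_avg(): for each of the four
 * packed bytes it computes the rounded-up average (a + b + 1) >> 1 as
 * (a|b) - (((a^b) & 0xFEFEFEFE) >> 1); the 0xFE mask keeps bits from
 * leaking across byte boundaries during the word-wide shift.
 */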
void avg_pixels16_altivec(uint8_t *block, const uint8_t *pixels, int line_size, int h)
{
POWERPC_PERF_DECLARE(altivec_avg_pixels16_num, 1);
#ifdef ALTIVEC_USE_REFERENCE_C_CODE
    int i;

POWERPC_PERF_START_COUNT(altivec_avg_pixels16_num, 1);

    for(i=0; i<h; i++) {
      op_avg(*((uint32_t*)(block)),(((const struct unaligned_32 *)(pixels))->l));
      op_avg(*((uint32_t*)(block+4)),(((const struct unaligned_32 *)(pixels+4))->l));
      op_avg(*((uint32_t*)(block+8)),(((const struct unaligned_32 *)(pixels+8))->l));
      op_avg(*((uint32_t*)(block+12)),(((const struct unaligned_32 *)(pixels+12))->l));
      pixels+=line_size;
      block +=line_size;
    }

POWERPC_PERF_STOP_COUNT(altivec_avg_pixels16_num, 1);

#else /* ALTIVEC_USE_REFERENCE_C_CODE */
    register vector unsigned char pixelsv1, pixelsv2, pixelsv, blockv;
    register vector unsigned char perm = vec_lvsl(0, pixels);
    int i;

POWERPC_PERF_START_COUNT(altivec_avg_pixels16_num, 1);

    for(i=0; i<h; i++) {
      pixelsv1 = vec_ld(0, (unsigned char*)pixels);
      pixelsv2 = vec_ld(16, (unsigned char*)pixels);
      blockv = vec_ld(0, block);
      pixelsv = vec_perm(pixelsv1, pixelsv2, perm);
      blockv = vec_avg(blockv,pixelsv);
      vec_st(blockv, 0, (unsigned char*)block);
      pixels+=line_size;
      block +=line_size;
    }

POWERPC_PERF_STOP_COUNT(altivec_avg_pixels16_num, 1);

#endif /* ALTIVEC_USE_REFERENCE_C_CODE */
}

/* next one assumes that ((line_size % 8) == 0) */
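/*
 * The 8-pixel-wide functions below only produce eight new bytes per row,
 * while AltiVec stores always write 16 bytes. With line_size a multiple of
 * 8, the destination is either the left (16-byte aligned) or the right
 * half of an aligned 16-byte line; `rightside` tests which one, and
 * vec_perm() with a vcprm() constant merges the new bytes into the proper
 * half of the existing block contents before the full vector is stored
 * back. vcprm() is assumed to be the word-permute-constant helper from the
 * PPC support headers included above: plain indices select words of the
 * first operand, s-prefixed ones words of the second.
 */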
void avg_pixels8_altivec(uint8_t * block, const uint8_t * pixels, int line_size, int h)
{
POWERPC_PERF_DECLARE(altivec_avg_pixels8_num, 1);
#ifdef ALTIVEC_USE_REFERENCE_C_CODE
    int i;
POWERPC_PERF_START_COUNT(altivec_avg_pixels8_num, 1);
    for (i = 0; i < h; i++) {
        *((uint32_t *) (block)) =
            (((*((uint32_t *) (block))) |
              ((((const struct unaligned_32 *) (pixels))->l))) -
             ((((*((uint32_t *) (block))) ^
                ((((const struct unaligned_32 *) (pixels))->
                  l))) & 0xFEFEFEFEUL) >> 1));
        *((uint32_t *) (block + 4)) =
            (((*((uint32_t *) (block + 4))) |
              ((((const struct unaligned_32 *) (pixels + 4))->l))) -
             ((((*((uint32_t *) (block + 4))) ^
                ((((const struct unaligned_32 *) (pixels +
                                                  4))->
                  l))) & 0xFEFEFEFEUL) >> 1));
        pixels += line_size;
        block += line_size;
    }
POWERPC_PERF_STOP_COUNT(altivec_avg_pixels8_num, 1);

#else /* ALTIVEC_USE_REFERENCE_C_CODE */
    register vector unsigned char pixelsv1, pixelsv2, pixelsv, blockv;
    int i;

POWERPC_PERF_START_COUNT(altivec_avg_pixels8_num, 1);

   for (i = 0; i < h; i++) {
     /*
       block is 8-byte aligned, so we're either in the
       left block (16-byte aligned) or in the right block (not)
     */
     int rightside = ((unsigned long)block & 0x0000000F);

     blockv = vec_ld(0, block);
     pixelsv1 = vec_ld(0, (unsigned char*)pixels);
     pixelsv2 = vec_ld(16, (unsigned char*)pixels);
     pixelsv = vec_perm(pixelsv1, pixelsv2, vec_lvsl(0, pixels));

     if (rightside)
     {
       pixelsv = vec_perm(blockv, pixelsv, vcprm(0,1,s0,s1));
     }
     else
     {
       pixelsv = vec_perm(blockv, pixelsv, vcprm(s0,s1,2,3));
     }

     blockv = vec_avg(blockv, pixelsv);

     vec_st(blockv, 0, block);

     pixels += line_size;
     block += line_size;
   }

POWERPC_PERF_STOP_COUNT(altivec_avg_pixels8_num, 1);

#endif /* ALTIVEC_USE_REFERENCE_C_CODE */
}

/* next one assumes that ((line_size % 8) == 0) */
void put_pixels8_xy2_altivec(uint8_t *block, const uint8_t *pixels, int line_size, int h)
{
POWERPC_PERF_DECLARE(altivec_put_pixels8_xy2_num, 1);
#ifdef ALTIVEC_USE_REFERENCE_C_CODE
    int j;
POWERPC_PERF_START_COUNT(altivec_put_pixels8_xy2_num, 1);
    for (j = 0; j < 2; j++) {
      int i;
      const uint32_t a = (((const struct unaligned_32 *) (pixels))->l);
      const uint32_t b =
        (((const struct unaligned_32 *) (pixels + 1))->l);
      uint32_t l0 =
        (a & 0x03030303UL) + (b & 0x03030303UL) + 0x02020202UL;
      uint32_t h0 =
        ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2);
      uint32_t l1, h1;
      pixels += line_size;
      for (i = 0; i < h; i += 2) {
        uint32_t a = (((const struct unaligned_32 *) (pixels))->l);
        uint32_t b = (((const struct unaligned_32 *) (pixels + 1))->l);
        l1 = (a & 0x03030303UL) + (b & 0x03030303UL);
        h1 = ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2);
        *((uint32_t *) block) =
          h0 + h1 + (((l0 + l1) >> 2) & 0x0F0F0F0FUL);
        pixels += line_size;
        block += line_size;
        a = (((const struct unaligned_32 *) (pixels))->l);
        b = (((const struct unaligned_32 *) (pixels + 1))->l);
        l0 = (a & 0x03030303UL) + (b & 0x03030303UL) + 0x02020202UL;
        h0 = ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2);
        *((uint32_t *) block) =
          h0 + h1 + (((l0 + l1) >> 2) & 0x0F0F0F0FUL);
        pixels += line_size;
        block += line_size;
      } pixels += 4 - line_size * (h + 1);
      block += 4 - line_size * h;
    }

POWERPC_PERF_STOP_COUNT(altivec_put_pixels8_xy2_num, 1);

#else /* ALTIVEC_USE_REFERENCE_C_CODE */
   register int i;
   register vector unsigned char
     pixelsv1, pixelsv2,
     pixelsavg;
   register vector unsigned char
     blockv, temp1, temp2;
   register vector unsigned short
     pixelssum1, pixelssum2, temp3;
   register const vector unsigned char vczero = (const vector unsigned char)vec_splat_u8(0);
   register const vector unsigned short vctwo = (const vector unsigned short)vec_splat_u16(2);

   temp1 = vec_ld(0, pixels);
   temp2 = vec_ld(16, pixels);
   pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(0, pixels));
   if ((((unsigned long)pixels) & 0x0000000F) ==  0x0000000F)
   {
     pixelsv2 = temp2;
   }
   else
   {
     pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(1, pixels));
   }
   pixelsv1 = vec_mergeh(vczero, pixelsv1);
   pixelsv2 = vec_mergeh(vczero, pixelsv2);
   pixelssum1 = vec_add((vector unsigned short)pixelsv1,
                        (vector unsigned short)pixelsv2);
   pixelssum1 = vec_add(pixelssum1, vctwo);

POWERPC_PERF_START_COUNT(altivec_put_pixels8_xy2_num, 1);
   for (i = 0; i < h ; i++) {
     int rightside = ((unsigned long)block & 0x0000000F);
     blockv = vec_ld(0, block);

     temp1 = vec_ld(line_size, pixels);
     temp2 = vec_ld(line_size + 16, pixels);
     pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(line_size, pixels));
     if (((((unsigned long)pixels) + line_size) & 0x0000000F) ==  0x0000000F)
     {
       pixelsv2 = temp2;
     }
     else
     {
       pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(line_size + 1, pixels));
     }

     pixelsv1 = vec_mergeh(vczero, pixelsv1);
     pixelsv2 = vec_mergeh(vczero, pixelsv2);
     pixelssum2 = vec_add((vector unsigned short)pixelsv1,
                          (vector unsigned short)pixelsv2);
     temp3 = vec_add(pixelssum1, pixelssum2);
     temp3 = vec_sra(temp3, vctwo);
     pixelssum1 = vec_add(pixelssum2, vctwo);
     pixelsavg = vec_packsu(temp3, (vector unsigned short) vczero);

     if (rightside)
     {
       blockv = vec_perm(blockv, pixelsavg, vcprm(0, 1, s0, s1));
     }
     else
     {
       blockv = vec_perm(blockv, pixelsavg, vcprm(s0, s1, 2, 3));
     }

     vec_st(blockv, 0, block);

     block += line_size;
     pixels += line_size;
   }

POWERPC_PERF_STOP_COUNT(altivec_put_pixels8_xy2_num, 1);
#endif /* ALTIVEC_USE_REFERENCE_C_CODE */
}

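/*
 * The put_no_rnd_*_xy2 variants below are identical to their rounding
 * counterparts except for the bias added before the >>2: 1 (vcone /
 * 0x01010101) instead of 2 (vctwo / 0x02020202), i.e. (A+B+C+D+1)>>2
 * rather than (A+B+C+D+2)>>2 for the four neighbouring pixels.
 */
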
/* next one assumes that ((line_size % 8) == 0) */
void put_no_rnd_pixels8_xy2_altivec(uint8_t *block, const uint8_t *pixels, int line_size, int h)
{
POWERPC_PERF_DECLARE(altivec_put_no_rnd_pixels8_xy2_num, 1);
#ifdef ALTIVEC_USE_REFERENCE_C_CODE
    int j;
POWERPC_PERF_START_COUNT(altivec_put_no_rnd_pixels8_xy2_num, 1);
    for (j = 0; j < 2; j++) {
      int i;
      const uint32_t a = (((const struct unaligned_32 *) (pixels))->l);
      const uint32_t b =
        (((const struct unaligned_32 *) (pixels + 1))->l);
      uint32_t l0 =
        (a & 0x03030303UL) + (b & 0x03030303UL) + 0x01010101UL;
      uint32_t h0 =
        ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2);
      uint32_t l1, h1;
      pixels += line_size;
      for (i = 0; i < h; i += 2) {
        uint32_t a = (((const struct unaligned_32 *) (pixels))->l);
        uint32_t b = (((const struct unaligned_32 *) (pixels + 1))->l);
        l1 = (a & 0x03030303UL) + (b & 0x03030303UL);
        h1 = ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2);
        *((uint32_t *) block) =
          h0 + h1 + (((l0 + l1) >> 2) & 0x0F0F0F0FUL);
        pixels += line_size;
        block += line_size;
        a = (((const struct unaligned_32 *) (pixels))->l);
        b = (((const struct unaligned_32 *) (pixels + 1))->l);
        l0 = (a & 0x03030303UL) + (b & 0x03030303UL) + 0x01010101UL;
        h0 = ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2);
        *((uint32_t *) block) =
          h0 + h1 + (((l0 + l1) >> 2) & 0x0F0F0F0FUL);
        pixels += line_size;
        block += line_size;
      } pixels += 4 - line_size * (h + 1);
      block += 4 - line_size * h;
    }

POWERPC_PERF_STOP_COUNT(altivec_put_no_rnd_pixels8_xy2_num, 1);

#else /* ALTIVEC_USE_REFERENCE_C_CODE */
   register int i;
   register vector unsigned char
     pixelsv1, pixelsv2,
     pixelsavg;
   register vector unsigned char
     blockv, temp1, temp2;
   register vector unsigned short
     pixelssum1, pixelssum2, temp3;
   register const vector unsigned char vczero = (const vector unsigned char)vec_splat_u8(0);
   register const vector unsigned short vcone = (const vector unsigned short)vec_splat_u16(1);
   register const vector unsigned short vctwo = (const vector unsigned short)vec_splat_u16(2);

   temp1 = vec_ld(0, pixels);
   temp2 = vec_ld(16, pixels);
   pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(0, pixels));
   if ((((unsigned long)pixels) & 0x0000000F) ==  0x0000000F)
   {
     pixelsv2 = temp2;
   }
   else
   {
     pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(1, pixels));
   }
   pixelsv1 = vec_mergeh(vczero, pixelsv1);
   pixelsv2 = vec_mergeh(vczero, pixelsv2);
   pixelssum1 = vec_add((vector unsigned short)pixelsv1,
                        (vector unsigned short)pixelsv2);
   pixelssum1 = vec_add(pixelssum1, vcone);

POWERPC_PERF_START_COUNT(altivec_put_no_rnd_pixels8_xy2_num, 1);
   for (i = 0; i < h ; i++) {
     int rightside = ((unsigned long)block & 0x0000000F);
     blockv = vec_ld(0, block);

     temp1 = vec_ld(line_size, pixels);
     temp2 = vec_ld(line_size + 16, pixels);
     pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(line_size, pixels));
     if (((((unsigned long)pixels) + line_size) & 0x0000000F) ==  0x0000000F)
     {
       pixelsv2 = temp2;
     }
     else
     {
       pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(line_size + 1, pixels));
     }

     pixelsv1 = vec_mergeh(vczero, pixelsv1);
     pixelsv2 = vec_mergeh(vczero, pixelsv2);
     pixelssum2 = vec_add((vector unsigned short)pixelsv1,
                          (vector unsigned short)pixelsv2);
     temp3 = vec_add(pixelssum1, pixelssum2);
     temp3 = vec_sra(temp3, vctwo);
     pixelssum1 = vec_add(pixelssum2, vcone);
     pixelsavg = vec_packsu(temp3, (vector unsigned short) vczero);

     if (rightside)
     {
       blockv = vec_perm(blockv, pixelsavg, vcprm(0, 1, s0, s1));
     }
     else
     {
       blockv = vec_perm(blockv, pixelsavg, vcprm(s0, s1, 2, 3));
     }

     vec_st(blockv, 0, block);

     block += line_size;
     pixels += line_size;
   }

POWERPC_PERF_STOP_COUNT(altivec_put_no_rnd_pixels8_xy2_num, 1);
#endif /* ALTIVEC_USE_REFERENCE_C_CODE */
}

/* next one assumes that ((line_size % 16) == 0) */
void put_pixels16_xy2_altivec(uint8_t * block, const uint8_t * pixels, int line_size, int h)
{
POWERPC_PERF_DECLARE(altivec_put_pixels16_xy2_num, 1);
#ifdef ALTIVEC_USE_REFERENCE_C_CODE
    int j;
POWERPC_PERF_START_COUNT(altivec_put_pixels16_xy2_num, 1);
    for (j = 0; j < 4; j++) {
      int i;
      const uint32_t a = (((const struct unaligned_32 *) (pixels))->l);
      const uint32_t b =
        (((const struct unaligned_32 *) (pixels + 1))->l);
      uint32_t l0 =
        (a & 0x03030303UL) + (b & 0x03030303UL) + 0x02020202UL;
      uint32_t h0 =
        ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2);
      uint32_t l1, h1;
      pixels += line_size;
      for (i = 0; i < h; i += 2) {
        uint32_t a = (((const struct unaligned_32 *) (pixels))->l);
        uint32_t b = (((const struct unaligned_32 *) (pixels + 1))->l);
        l1 = (a & 0x03030303UL) + (b & 0x03030303UL);
        h1 = ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2);
        *((uint32_t *) block) =
          h0 + h1 + (((l0 + l1) >> 2) & 0x0F0F0F0FUL);
        pixels += line_size;
        block += line_size;
        a = (((const struct unaligned_32 *) (pixels))->l);
        b = (((const struct unaligned_32 *) (pixels + 1))->l);
        l0 = (a & 0x03030303UL) + (b & 0x03030303UL) + 0x02020202UL;
        h0 = ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2);
        *((uint32_t *) block) =
          h0 + h1 + (((l0 + l1) >> 2) & 0x0F0F0F0FUL);
        pixels += line_size;
        block += line_size;
      } pixels += 4 - line_size * (h + 1);
      block += 4 - line_size * h;
    }

POWERPC_PERF_STOP_COUNT(altivec_put_pixels16_xy2_num, 1);

#else /* ALTIVEC_USE_REFERENCE_C_CODE */
   register int i;
   register vector unsigned char
     pixelsv1, pixelsv2, pixelsv3, pixelsv4;
   register vector unsigned char
     blockv, temp1, temp2;
   register vector unsigned short
     pixelssum1, pixelssum2, temp3,
     pixelssum3, pixelssum4, temp4;
   register const vector unsigned char vczero = (const vector unsigned char)vec_splat_u8(0);
   register const vector unsigned short vctwo = (const vector unsigned short)vec_splat_u16(2);

POWERPC_PERF_START_COUNT(altivec_put_pixels16_xy2_num, 1);

   temp1 = vec_ld(0, pixels);
   temp2 = vec_ld(16, pixels);
   pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(0, pixels));
   if ((((unsigned long)pixels) & 0x0000000F) ==  0x0000000F)
   {
     pixelsv2 = temp2;
   }
   else
   {
     pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(1, pixels));
   }
   pixelsv3 = vec_mergel(vczero, pixelsv1);
   pixelsv4 = vec_mergel(vczero, pixelsv2);
   pixelsv1 = vec_mergeh(vczero, pixelsv1);
   pixelsv2 = vec_mergeh(vczero, pixelsv2);
   pixelssum3 = vec_add((vector unsigned short)pixelsv3,
                        (vector unsigned short)pixelsv4);
   pixelssum3 = vec_add(pixelssum3, vctwo);
   pixelssum1 = vec_add((vector unsigned short)pixelsv1,
                        (vector unsigned short)pixelsv2);
   pixelssum1 = vec_add(pixelssum1, vctwo);

   for (i = 0; i < h ; i++) {
     blockv = vec_ld(0, block);

     temp1 = vec_ld(line_size, pixels);
     temp2 = vec_ld(line_size + 16, pixels);
     pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(line_size, pixels));
     if (((((unsigned long)pixels) + line_size) & 0x0000000F) ==  0x0000000F)
     {
       pixelsv2 = temp2;
     }
     else
     {
       pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(line_size + 1, pixels));
     }

     pixelsv3 = vec_mergel(vczero, pixelsv1);
     pixelsv4 = vec_mergel(vczero, pixelsv2);
     pixelsv1 = vec_mergeh(vczero, pixelsv1);
     pixelsv2 = vec_mergeh(vczero, pixelsv2);

     pixelssum4 = vec_add((vector unsigned short)pixelsv3,
                          (vector unsigned short)pixelsv4);
     pixelssum2 = vec_add((vector unsigned short)pixelsv1,
                          (vector unsigned short)pixelsv2);
     temp4 = vec_add(pixelssum3, pixelssum4);
     temp4 = vec_sra(temp4, vctwo);
     temp3 = vec_add(pixelssum1, pixelssum2);
     temp3 = vec_sra(temp3, vctwo);

     pixelssum3 = vec_add(pixelssum4, vctwo);
     pixelssum1 = vec_add(pixelssum2, vctwo);

     blockv = vec_packsu(temp3, temp4);

     vec_st(blockv, 0, block);

     block += line_size;
     pixels += line_size;
   }

POWERPC_PERF_STOP_COUNT(altivec_put_pixels16_xy2_num, 1);
#endif /* ALTIVEC_USE_REFERENCE_C_CODE */
}

/* next one assumes that ((line_size % 16) == 0) */
void put_no_rnd_pixels16_xy2_altivec(uint8_t * block, const uint8_t * pixels, int line_size, int h)
{
POWERPC_PERF_DECLARE(altivec_put_no_rnd_pixels16_xy2_num, 1);
#ifdef ALTIVEC_USE_REFERENCE_C_CODE
    int j;
POWERPC_PERF_START_COUNT(altivec_put_no_rnd_pixels16_xy2_num, 1);
    for (j = 0; j < 4; j++) {
      int i;
      const uint32_t a = (((const struct unaligned_32 *) (pixels))->l);
      const uint32_t b =
        (((const struct unaligned_32 *) (pixels + 1))->l);
      uint32_t l0 =
        (a & 0x03030303UL) + (b & 0x03030303UL) + 0x01010101UL;
      uint32_t h0 =
        ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2);
      uint32_t l1, h1;
      pixels += line_size;
      for (i = 0; i < h; i += 2) {
        uint32_t a = (((const struct unaligned_32 *) (pixels))->l);
        uint32_t b = (((const struct unaligned_32 *) (pixels + 1))->l);
        l1 = (a & 0x03030303UL) + (b & 0x03030303UL);
        h1 = ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2);
        *((uint32_t *) block) =
          h0 + h1 + (((l0 + l1) >> 2) & 0x0F0F0F0FUL);
        pixels += line_size;
        block += line_size;
        a = (((const struct unaligned_32 *) (pixels))->l);
        b = (((const struct unaligned_32 *) (pixels + 1))->l);
        l0 = (a & 0x03030303UL) + (b & 0x03030303UL) + 0x01010101UL;
        h0 = ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2);
        *((uint32_t *) block) =
          h0 + h1 + (((l0 + l1) >> 2) & 0x0F0F0F0FUL);
        pixels += line_size;
        block += line_size;
      } pixels += 4 - line_size * (h + 1);
      block += 4 - line_size * h;
    }

POWERPC_PERF_STOP_COUNT(altivec_put_no_rnd_pixels16_xy2_num, 1);

#else /* ALTIVEC_USE_REFERENCE_C_CODE */
   register int i;
   register vector unsigned char
     pixelsv1, pixelsv2, pixelsv3, pixelsv4;
   register vector unsigned char
     blockv, temp1, temp2;
   register vector unsigned short
     pixelssum1, pixelssum2, temp3,
     pixelssum3, pixelssum4, temp4;
   register const vector unsigned char vczero = (const vector unsigned char)vec_splat_u8(0);
   register const vector unsigned short vcone = (const vector unsigned short)vec_splat_u16(1);
   register const vector unsigned short vctwo = (const vector unsigned short)vec_splat_u16(2);

POWERPC_PERF_START_COUNT(altivec_put_no_rnd_pixels16_xy2_num, 1);

   temp1 = vec_ld(0, pixels);
   temp2 = vec_ld(16, pixels);
   pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(0, pixels));
   if ((((unsigned long)pixels) & 0x0000000F) ==  0x0000000F)
   {
     pixelsv2 = temp2;
   }
   else
   {
     pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(1, pixels));
   }
   pixelsv3 = vec_mergel(vczero, pixelsv1);
   pixelsv4 = vec_mergel(vczero, pixelsv2);
   pixelsv1 = vec_mergeh(vczero, pixelsv1);
   pixelsv2 = vec_mergeh(vczero, pixelsv2);
   pixelssum3 = vec_add((vector unsigned short)pixelsv3,
                        (vector unsigned short)pixelsv4);
   pixelssum3 = vec_add(pixelssum3, vcone);
   pixelssum1 = vec_add((vector unsigned short)pixelsv1,
                        (vector unsigned short)pixelsv2);
   pixelssum1 = vec_add(pixelssum1, vcone);

   for (i = 0; i < h ; i++) {
     blockv = vec_ld(0, block);

     temp1 = vec_ld(line_size, pixels);
     temp2 = vec_ld(line_size + 16, pixels);
     pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(line_size, pixels));
     if (((((unsigned long)pixels) + line_size) & 0x0000000F) ==  0x0000000F)
     {
       pixelsv2 = temp2;
     }
     else
     {
       pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(line_size + 1, pixels));
     }

     pixelsv3 = vec_mergel(vczero, pixelsv1);
     pixelsv4 = vec_mergel(vczero, pixelsv2);
     pixelsv1 = vec_mergeh(vczero, pixelsv1);
     pixelsv2 = vec_mergeh(vczero, pixelsv2);

     pixelssum4 = vec_add((vector unsigned short)pixelsv3,
                          (vector unsigned short)pixelsv4);
     pixelssum2 = vec_add((vector unsigned short)pixelsv1,
                          (vector unsigned short)pixelsv2);
     temp4 = vec_add(pixelssum3, pixelssum4);
     temp4 = vec_sra(temp4, vctwo);
     temp3 = vec_add(pixelssum1, pixelssum2);
     temp3 = vec_sra(temp3, vctwo);

     pixelssum3 = vec_add(pixelssum4, vcone);
     pixelssum1 = vec_add(pixelssum2, vcone);

     blockv = vec_packsu(temp3, temp4);

     vec_st(blockv, 0, block);

     block += line_size;
     pixels += line_size;
   }

POWERPC_PERF_STOP_COUNT(altivec_put_no_rnd_pixels16_xy2_num, 1);
#endif /* ALTIVEC_USE_REFERENCE_C_CODE */
}

int has_altivec(void)
{
#ifdef CONFIG_DARWIN
    int sels[2] = {CTL_HW, HW_VECTORUNIT};
    int has_vu = 0;
    size_t len = sizeof(has_vu);
    int err;

    err = sysctl(sels, 2, &has_vu, &len, NULL, 0);

    if (err == 0) return (has_vu != 0);
#else /* CONFIG_DARWIN */
/* not Darwin, so do it the brute-force way */
/* this is borrowed from the libmpeg2 library */
    {
      signal (SIGILL, sigill_handler);
      if (sigsetjmp (jmpbuf, 1)) {
        signal (SIGILL, SIG_DFL);
      } else {
        canjump = 1;
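        /*
         * mtspr 256 writes VRSAVE (marking every vector register as live)
         * and vand is a plain AltiVec instruction: on a CPU without
         * AltiVec the sequence raises SIGILL, sigill_handler() longjmps
         * back to the sigsetjmp above, and has_altivec() falls through to
         * return 0.
         */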

        asm volatile ("mtspr 256, %0\n\t"
                      "vand %%v0, %%v0, %%v0"
                      :
                      : "r" (-1));

        signal (SIGILL, SIG_DFL);
        return 1;
      }
    }
#endif /* CONFIG_DARWIN */
    return 0;
}