Revision 9c76bd48

View differences:

libavcodec/dsputil.c
144 144
}
145 145

  
146 146

  
147
/*
 * Accumulate sq[pix1[k] - pix2[k]] over a block of 16 rows by 16 pixels
 * and return the total.
 *
 * pix1, pix2: first pixel of each block to compare.
 * line_size:  distance in bytes between the starts of consecutive rows
 *             (used below to step from the end of one row to the next).
 *
 * NOTE(review): squareTbl is defined elsewhere; presumably
 * squareTbl[256 + d] == d * d, which would make the result the sum of
 * squared differences -- confirm against the table's initialisation.
 */
static int pix_norm_c(UINT8 * pix1, UINT8 * pix2, int line_size)
148
{
149
    int s, i, j;
150
    /* Bias the table pointer by 256 so the (possibly negative) pixel
       difference can be used directly as an index. */
    UINT32 *sq = squareTbl + 256;
151

  
152
    s = 0;
153
    /* One iteration per row of the block. */
    for (i = 0; i < 16; i++) {
154
        /* Two passes of 8 hand-unrolled pixels cover the 16 pixels of a row. */
        for (j = 0; j < 16; j += 8) {
155
            s += sq[pix1[0] - pix2[0]];
156
            s += sq[pix1[1] - pix2[1]];
157
            s += sq[pix1[2] - pix2[2]];
158
            s += sq[pix1[3] - pix2[3]];
159
            s += sq[pix1[4] - pix2[4]];
160
            s += sq[pix1[5] - pix2[5]];
161
            s += sq[pix1[6] - pix2[6]];
162
            s += sq[pix1[7] - pix2[7]];
163
            pix1 += 8;
164
            pix2 += 8;
165
        }
166
        /* 16 bytes were consumed above; skip the rest of the line to reach
           the start of the next row. */
        pix1 += line_size - 16;
167
        pix2 += line_size - 16;
168
    }
169
    return s;
170
}
171

  
147 172
static void get_pixels_c(DCTELEM *restrict block, const UINT8 *pixels, int line_size)
148 173
{
149 174
    int i;
......
1404 1429
    c->clear_blocks = clear_blocks_c;
1405 1430
    c->pix_sum = pix_sum_c;
1406 1431
    c->pix_norm1 = pix_norm1_c;
1432
    c->pix_norm = pix_norm_c;
1407 1433

  
1408 1434
    /* TODO [0] 16  [1] 8 */
1409 1435
    c->pix_abs16x16     = pix_abs16x16_c;
libavcodec/dsputil.h
98 98
    void (*clear_blocks)(DCTELEM *blocks/*align 16*/);
99 99
    int (*pix_sum)(UINT8 * pix, int line_size);
100 100
    int (*pix_norm1)(UINT8 * pix, int line_size);
101
    int (*pix_norm)(UINT8 * pix1, UINT8 * pix2, int line_size);
101 102

  
102 103
    /* maybe create an array for 16/8 functions */
103 104
    op_pixels_func put_pixels_tab[2][4];
libavcodec/motion_est.c
63 63
    return s;
64 64
}
65 65

  
66
/*
 * File-local helper: accumulate sq[pix1[k] - pix2[k]] over a block of
 * 16 rows by 16 pixels and return the total.
 *
 * pix1, pix2: first pixel of each block to compare.
 * line_size:  distance in bytes between the starts of consecutive rows.
 *
 * NOTE(review): squareTbl is defined elsewhere; presumably
 * squareTbl[256 + d] == d * d -- confirm.
 */
static int pix_norm(UINT8 * pix1, UINT8 * pix2, int line_size)
67
{
68
    int s, i, j;
69
    /* Bias the table pointer by 256 so the (possibly negative) pixel
       difference can be used directly as an index. */
    UINT32 *sq = squareTbl + 256;
70

  
71
    s = 0;
72
    /* One iteration per row of the block. */
    for (i = 0; i < 16; i++) {
73
	/* Two passes of 8 hand-unrolled pixels cover the 16 pixels of a row. */
	for (j = 0; j < 16; j += 8) {
74
	    s += sq[pix1[0] - pix2[0]];
75
	    s += sq[pix1[1] - pix2[1]];
76
	    s += sq[pix1[2] - pix2[2]];
77
	    s += sq[pix1[3] - pix2[3]];
78
	    s += sq[pix1[4] - pix2[4]];
79
	    s += sq[pix1[5] - pix2[5]];
80
	    s += sq[pix1[6] - pix2[6]];
81
	    s += sq[pix1[7] - pix2[7]];
82
	    pix1 += 8;
83
	    pix2 += 8;
84
	}
85
	/* 16 bytes were consumed above; skip to the start of the next row. */
	pix1 += line_size - 16;
86
	pix2 += line_size - 16;
87
    }
88
    return s;
89
}
90

  
91 66
static inline void no_motion_search(MpegEncContext * s,
92 67
				    int *mx_ptr, int *my_ptr)
93 68
{
......
1137 1112
    
1138 1113
    varc = (s->dsp.pix_norm1(pix, s->linesize) - (((unsigned)(sum*sum))>>8) + 500 + 128)>>8;
1139 1114
    // FIXME: MMX OPTIMIZE
1140
    vard = (pix_norm(pix, ppix, s->linesize)+128)>>8;
1115
    vard = (s->dsp.pix_norm(pix, ppix, s->linesize)+128)>>8;
1141 1116

  
1142 1117
//printf("%d %d %d %X %X %X\n", s->mb_width, mb_x, mb_y,(int)s, (int)s->mb_var, (int)s->mc_mb_var); fflush(stdout);
1143 1118
    s->mb_var   [s->mb_width * mb_y + mb_x] = varc;
libavcodec/ppc/dsputil_altivec.c
137 137
    vector unsigned char pix1v, pix2v, pix3v, pix2iv, pix3iv;
138 138
    vector unsigned short pix2lv, pix2hv, pix2ilv, pix2ihv;
139 139
    vector unsigned short pix3lv, pix3hv, pix3ilv, pix3ihv;
140
    vector unsigned short avghv, avglv, two, shift_mask;
140
    vector unsigned short avghv, avglv, two;
141 141
    vector unsigned short t1, t2, t3, t4;
142 142
    vector unsigned int sad;
143 143
    vector signed int sumdiffs;
144 144

  
145
    shift_mask = (vector unsigned short) (0x3fff, 0x3fff, 0x3fff, 0x3fff,
146
                                          0x3fff, 0x3fff, 0x3fff, 0x3fff);
147 145
    zero = vec_splat_u8(0);
148 146
    two = vec_splat_u16(2);
149 147
    sad = vec_splat_u32(0);
......
205 203
        t3 = vec_add(pix3hv, pix3ihv);
206 204
        t4 = vec_add(pix3lv, pix3ilv);
207 205

  
208
        avghv = vec_add(vec_add(t1, t3), two);
209
        avghv= vec_and(vec_srl(avghv, two), shift_mask);
210

  
211
        avglv = vec_add(vec_add(t2, t4), two);
212
        avglv = vec_and(vec_srl(avglv, two), shift_mask);
206
        avghv = vec_sr(vec_add(vec_add(t1, t3), two), two);
207
        avglv = vec_sr(vec_add(vec_add(t2, t4), two), two);
213 208

  
214 209
        /* Pack the shorts back into a result */
215 210
        avgv = vec_pack(avghv, avglv);
......
323 318
    int s, i;
324 319
    vector unsigned char *tv, zero;
325 320
    vector unsigned char pixv;
326
    vector unsigned short pixlv, pixhv, zeros;
327 321
    vector unsigned int sv;
328 322
    vector signed int sum;
329
    vector unsigned char perm_stoint_h = (vector unsigned char)
330
        (16, 16, 0, 1, 16, 16, 2, 3, 16, 16, 4, 5, 16, 16, 6, 7);
331
    
332
    vector unsigned char perm_stoint_l = (vector unsigned char)
333
        (16, 16, 8, 9, 16, 16, 10, 11, 16, 16, 12, 13, 16, 16, 14, 15);
334 323
        
335 324
    zero = vec_splat_u8(0);
336
    zeros = vec_splat_u16(0);
337 325
    sv = vec_splat_u32(0);
338 326
    
339 327
    s = 0;
......
342 330
        tv = (vector unsigned char *) pix;
343 331
        pixv = vec_perm(tv[0], tv[1], vec_lvsl(0, pix));
344 332

  
345
        /* Split them into two vectors of shorts */
346
        pixhv = (vector unsigned short) vec_mergeh(zero, pixv);
347
        pixlv = (vector unsigned short) vec_mergel(zero, pixv);
348

  
349
        
350
        /* Square the values and add them to our sum */
351
        sv = vec_msum(pixhv, pixhv, sv);
352
        sv = vec_msum(pixlv, pixlv, sv);
333
        /* Square the values, and add them to our sum */
334
        sv = vec_msum(pixv, pixv, sv);
353 335

  
354 336
        pix += line_size;
355 337
    }
......
361 343
    return s;
362 344
}
363 345

  
346

  
347
/*
 * AltiVec version of the 16x16 block norm: for each of 16 rows, load one
 * 16-byte vector from pix1 and one from pix2, take the per-byte absolute
 * difference, square it and accumulate; return the accumulated total.
 *
 * pix1, pix2: first byte of each block; need not be 16-byte aligned
 *             (the loads below realign with vec_lvsl/vec_perm).
 * line_size:  number of bytes added to each pointer to reach the next row.
 *
 * NOTE(review): each load dereferences tv[1] as well as tv[0], so memory
 * past the 16 bytes actually needed is read -- confirm callers' buffers
 * tolerate that.
 */
int pix_norm_altivec(uint8_t *pix1, uint8_t *pix2, int line_size)
348
{
349
    int s, i;
350
    vector unsigned char *tv, zero;
351
    vector unsigned char pix1v, pix2v, t5;
352
    vector unsigned int sv;
353
    vector signed int sum;
354

  
355
    zero = vec_splat_u8(0);
356
    /* sv holds the running partial sums (one per 32-bit element). */
    sv = vec_splat_u32(0);
357
    s = 0;
358
    for (i = 0; i < 16; i++) {
359
        /* Read in the potentially unaligned pixels */
360
        tv = (vector unsigned char *) pix1;
361
        pix1v = vec_perm(tv[0], tv[1], vec_lvsl(0, pix1));
362

  
363
        tv = (vector unsigned char *) pix2;
364
        pix2v = vec_perm(tv[0], tv[1], vec_lvsl(0, pix2));
365

  
366
        /*
367
           Since we want to use unsigned chars, we can take advantage
368
           of the fact that abs(a-b)^2 = (a-b)^2.
369
        */
370
        
371
        /* Per-byte absolute difference: max - min never goes negative, so
           the unsigned subtraction cannot wrap around */
372
        t5 = vec_sub(vec_max(pix1v, pix2v), vec_min(pix1v, pix2v));
373

  
374
        /* Square the values and add them to our sum */
375
        sv = vec_msum(t5, t5, sv);
376
        
377
        pix1 += line_size;
378
        pix2 += line_size;
379
    }
380
    /* Sum up the four partial sums, and put the result into s */
381
    sum = vec_sums((vector signed int) sv, (vector signed int) zero);
382
    /* Replicate element 3 into every element before the single-element
       store below */
    sum = vec_splat(sum, 3);
383
    vec_ste(sum, 0, &s);
384
    return s;
385
}
386

  
387

  
364 388
int pix_sum_altivec(UINT8 * pix, int line_size)
365 389
{
366 390

  
libavcodec/ppc/dsputil_altivec.h
23 23
extern int pix_abs16x16_altivec(uint8_t *pix1, uint8_t *pix2, int line_size);
24 24
extern int pix_abs8x8_altivec(uint8_t *pix1, uint8_t *pix2, int line_size);
25 25
extern int pix_norm1_altivec(uint8_t *pix, int line_size);
26
extern int pix_norm_altivec(uint8_t *pix1, uint8_t *pix2, int line_size);
26 27
extern int pix_sum_altivec(UINT8 * pix, int line_size);
27 28
extern void diff_pixels_altivec(DCTELEM* block, const UINT8* s1, const UINT8* s2, int stride);
28 29
extern void get_pixels_altivec(DCTELEM* block, const UINT8 * pixels, int line_size);
libavcodec/ppc/dsputil_ppc.c
34 34
        // Altivec specific optimisations
35 35
        c->pix_abs16x16_x2 = pix_abs16x16_x2_altivec;
36 36
        c->pix_abs16x16_y2 = pix_abs16x16_y2_altivec;
37
	c->pix_abs16x16_xy2 = pix_abs16x16_xy2_altivec;
38
	c->pix_abs16x16 = pix_abs16x16_altivec;
37
        c->pix_abs16x16_xy2 = pix_abs16x16_xy2_altivec;
38
        c->pix_abs16x16 = pix_abs16x16_altivec;
39 39
        c->pix_abs8x8 = pix_abs8x8_altivec;
40 40
        c->pix_norm1 = pix_norm1_altivec;
41
        c->pix_norm = pix_norm_altivec;
41 42
        c->pix_sum = pix_sum_altivec;
42 43
        c->diff_pixels = diff_pixels_altivec;
43 44
        c->get_pixels = get_pixels_altivec;

Also available in: Unified diff