Revision 2b092f7a libavcodec/h264pred_internal.h

View differences:

libavcodec/h264pred_internal.h
@@ -28 +28 @@
 #include "mathops.h"
 #include "dsputil.h"
 
-static void pred4x4_vertical_c(uint8_t *src, const uint8_t *topright, int stride){
-    const uint32_t a= ((uint32_t*)(src-stride))[0];
-    ((uint32_t*)(src+0*stride))[0]= a;
-    ((uint32_t*)(src+1*stride))[0]= a;
-    ((uint32_t*)(src+2*stride))[0]= a;
-    ((uint32_t*)(src+3*stride))[0]= a;
-}
-
-static void pred4x4_horizontal_c(uint8_t *src, const uint8_t *topright, int stride){
-    ((uint32_t*)(src+0*stride))[0]= src[-1+0*stride]*0x01010101;
-    ((uint32_t*)(src+1*stride))[0]= src[-1+1*stride]*0x01010101;
-    ((uint32_t*)(src+2*stride))[0]= src[-1+2*stride]*0x01010101;
-    ((uint32_t*)(src+3*stride))[0]= src[-1+3*stride]*0x01010101;
-}
-
-static void pred4x4_dc_c(uint8_t *src, const uint8_t *topright, int stride){
+#define BIT_DEPTH 8
+
+#define pixel uint8_t
+#define pixel4 uint32_t
+#define dctcoef DCTELEM
+
+#define INIT_CLIP uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
+#define CLIP(a) cm[a]
+#define FUNC(a) a
+#define FUNCC(a) a ## _c
+#define PIXEL_SPLAT_X4(x) ((x)*0x01010101U)
+#define AV_WN4P  AV_WN32
+#define AV_WN4PA AV_WN32A
+
+static void FUNCC(pred4x4_vertical)(uint8_t *p_src, const uint8_t *topright, int p_stride){
+    pixel *src = (pixel*)p_src;
+    int stride = p_stride>>(sizeof(pixel)-1);
+    const pixel4 a= ((pixel4*)(src-stride))[0];
+    ((pixel4*)(src+0*stride))[0]= a;
+    ((pixel4*)(src+1*stride))[0]= a;
+    ((pixel4*)(src+2*stride))[0]= a;
+    ((pixel4*)(src+3*stride))[0]= a;
+}
+
+static void FUNCC(pred4x4_horizontal)(uint8_t *p_src, const uint8_t *topright, int p_stride){
+    pixel *src = (pixel*)p_src;
+    int stride = p_stride>>(sizeof(pixel)-1);
+    ((pixel4*)(src+0*stride))[0]= PIXEL_SPLAT_X4(src[-1+0*stride]);
+    ((pixel4*)(src+1*stride))[0]= PIXEL_SPLAT_X4(src[-1+1*stride]);
+    ((pixel4*)(src+2*stride))[0]= PIXEL_SPLAT_X4(src[-1+2*stride]);
+    ((pixel4*)(src+3*stride))[0]= PIXEL_SPLAT_X4(src[-1+3*stride]);
+}
+
+static void FUNCC(pred4x4_dc)(uint8_t *p_src, const uint8_t *topright, int p_stride){
+    pixel *src = (pixel*)p_src;
+    int stride = p_stride>>(sizeof(pixel)-1);
     const int dc= (  src[-stride] + src[1-stride] + src[2-stride] + src[3-stride]
                    + src[-1+0*stride] + src[-1+1*stride] + src[-1+2*stride] + src[-1+3*stride] + 4) >>3;
 
-    ((uint32_t*)(src+0*stride))[0]=
-    ((uint32_t*)(src+1*stride))[0]=
-    ((uint32_t*)(src+2*stride))[0]=
-    ((uint32_t*)(src+3*stride))[0]= dc* 0x01010101;
+    ((pixel4*)(src+0*stride))[0]=
+    ((pixel4*)(src+1*stride))[0]=
+    ((pixel4*)(src+2*stride))[0]=
+    ((pixel4*)(src+3*stride))[0]= PIXEL_SPLAT_X4(dc);
 }
 
-static void pred4x4_left_dc_c(uint8_t *src, const uint8_t *topright, int stride){
+static void FUNCC(pred4x4_left_dc)(uint8_t *p_src, const uint8_t *topright, int p_stride){
+    pixel *src = (pixel*)p_src;
+    int stride = p_stride>>(sizeof(pixel)-1);
     const int dc= (  src[-1+0*stride] + src[-1+1*stride] + src[-1+2*stride] + src[-1+3*stride] + 2) >>2;
 
-    ((uint32_t*)(src+0*stride))[0]=
-    ((uint32_t*)(src+1*stride))[0]=
-    ((uint32_t*)(src+2*stride))[0]=
-    ((uint32_t*)(src+3*stride))[0]= dc* 0x01010101;
+    ((pixel4*)(src+0*stride))[0]=
+    ((pixel4*)(src+1*stride))[0]=
+    ((pixel4*)(src+2*stride))[0]=
+    ((pixel4*)(src+3*stride))[0]= PIXEL_SPLAT_X4(dc);
 }
 
-static void pred4x4_top_dc_c(uint8_t *src, const uint8_t *topright, int stride){
+static void FUNCC(pred4x4_top_dc)(uint8_t *p_src, const uint8_t *topright, int p_stride){
+    pixel *src = (pixel*)p_src;
+    int stride = p_stride>>(sizeof(pixel)-1);
     const int dc= (  src[-stride] + src[1-stride] + src[2-stride] + src[3-stride] + 2) >>2;
 
-    ((uint32_t*)(src+0*stride))[0]=
-    ((uint32_t*)(src+1*stride))[0]=
-    ((uint32_t*)(src+2*stride))[0]=
-    ((uint32_t*)(src+3*stride))[0]= dc* 0x01010101;
+    ((pixel4*)(src+0*stride))[0]=
+    ((pixel4*)(src+1*stride))[0]=
+    ((pixel4*)(src+2*stride))[0]=
+    ((pixel4*)(src+3*stride))[0]= PIXEL_SPLAT_X4(dc);
 }
 
-static void pred4x4_128_dc_c(uint8_t *src, const uint8_t *topright, int stride){
-    ((uint32_t*)(src+0*stride))[0]=
-    ((uint32_t*)(src+1*stride))[0]=
-    ((uint32_t*)(src+2*stride))[0]=
-    ((uint32_t*)(src+3*stride))[0]= 128U*0x01010101U;
+static void FUNCC(pred4x4_128_dc)(uint8_t *p_src, const uint8_t *topright, int p_stride){
+    pixel *src = (pixel*)p_src;
+    int stride = p_stride>>(sizeof(pixel)-1);
+    ((pixel4*)(src+0*stride))[0]=
+    ((pixel4*)(src+1*stride))[0]=
+    ((pixel4*)(src+2*stride))[0]=
+    ((pixel4*)(src+3*stride))[0]= PIXEL_SPLAT_X4(1<<(BIT_DEPTH-1));
 }
 
-static void pred4x4_127_dc_c(uint8_t *src, const uint8_t *topright, int stride){
-    ((uint32_t*)(src+0*stride))[0]=
-    ((uint32_t*)(src+1*stride))[0]=
-    ((uint32_t*)(src+2*stride))[0]=
-    ((uint32_t*)(src+3*stride))[0]= 127U*0x01010101U;
+static void FUNCC(pred4x4_127_dc)(uint8_t *p_src, const uint8_t *topright, int p_stride){
+    pixel *src = (pixel*)p_src;
+    int stride = p_stride>>(sizeof(pixel)-1);
+    ((pixel4*)(src+0*stride))[0]=
+    ((pixel4*)(src+1*stride))[0]=
+    ((pixel4*)(src+2*stride))[0]=
+    ((pixel4*)(src+3*stride))[0]= PIXEL_SPLAT_X4((1<<(BIT_DEPTH-1))-1);
 }
 
-static void pred4x4_129_dc_c(uint8_t *src, const uint8_t *topright, int stride){
-    ((uint32_t*)(src+0*stride))[0]=
-    ((uint32_t*)(src+1*stride))[0]=
-    ((uint32_t*)(src+2*stride))[0]=
-    ((uint32_t*)(src+3*stride))[0]= 129U*0x01010101U;
+static void FUNCC(pred4x4_129_dc)(uint8_t *p_src, const uint8_t *topright, int p_stride){
+    pixel *src = (pixel*)p_src;
+    int stride = p_stride>>(sizeof(pixel)-1);
+    ((pixel4*)(src+0*stride))[0]=
+    ((pixel4*)(src+1*stride))[0]=
+    ((pixel4*)(src+2*stride))[0]=
+    ((pixel4*)(src+3*stride))[0]= PIXEL_SPLAT_X4((1<<(BIT_DEPTH-1))+1);
 }
 
 
@@ -117 +147 @@
     const int av_unused t2= src[ 2-1*stride];\
     const int av_unused t3= src[ 3-1*stride];\
 
-static void pred4x4_vertical_vp8_c(uint8_t *src, const uint8_t *topright, int stride){
+static void FUNCC(pred4x4_vertical_vp8)(uint8_t *p_src, const uint8_t *p_topright, int p_stride){
+    pixel *src = (pixel*)p_src;
+    const pixel *topright = (const pixel*)p_topright;
+    int stride = p_stride>>(sizeof(pixel)-1);
     const int lt= src[-1-1*stride];
     LOAD_TOP_EDGE
     LOAD_TOP_RIGHT_EDGE
-    uint32_t v = PACK_4U8((lt + 2*t0 + t1 + 2) >> 2,
+    pixel4 v = PACK_4U8((lt + 2*t0 + t1 + 2) >> 2,
                             (t0 + 2*t1 + t2 + 2) >> 2,
                             (t1 + 2*t2 + t3 + 2) >> 2,
                             (t2 + 2*t3 + t4 + 2) >> 2);
 
-    AV_WN32A(src+0*stride, v);
-    AV_WN32A(src+1*stride, v);
-    AV_WN32A(src+2*stride, v);
-    AV_WN32A(src+3*stride, v);
+    AV_WN4PA(src+0*stride, v);
+    AV_WN4PA(src+1*stride, v);
+    AV_WN4PA(src+2*stride, v);
+    AV_WN4PA(src+3*stride, v);
 }
 
-static void pred4x4_horizontal_vp8_c(uint8_t *src, const uint8_t *topright, int stride){
+static void FUNCC(pred4x4_horizontal_vp8)(uint8_t *p_src, const uint8_t *topright, int p_stride){
+    pixel *src = (pixel*)p_src;
+    int stride = p_stride>>(sizeof(pixel)-1);
     const int lt= src[-1-1*stride];
     LOAD_LEFT_EDGE
 
-    AV_WN32A(src+0*stride, ((lt + 2*l0 + l1 + 2) >> 2)*0x01010101);
-    AV_WN32A(src+1*stride, ((l0 + 2*l1 + l2 + 2) >> 2)*0x01010101);
-    AV_WN32A(src+2*stride, ((l1 + 2*l2 + l3 + 2) >> 2)*0x01010101);
-    AV_WN32A(src+3*stride, ((l2 + 2*l3 + l3 + 2) >> 2)*0x01010101);
+    AV_WN4PA(src+0*stride, PIXEL_SPLAT_X4((lt + 2*l0 + l1 + 2) >> 2));
+    AV_WN4PA(src+1*stride, PIXEL_SPLAT_X4((l0 + 2*l1 + l2 + 2) >> 2));
+    AV_WN4PA(src+2*stride, PIXEL_SPLAT_X4((l1 + 2*l2 + l3 + 2) >> 2));
+    AV_WN4PA(src+3*stride, PIXEL_SPLAT_X4((l2 + 2*l3 + l3 + 2) >> 2));
 }
 
-static void pred4x4_down_right_c(uint8_t *src, const uint8_t *topright, int stride){
+static void FUNCC(pred4x4_down_right)(uint8_t *p_src, const uint8_t *topright, int p_stride){
+    pixel *src = (pixel*)p_src;
+    int stride = p_stride>>(sizeof(pixel)-1);
     const int lt= src[-1-1*stride];
     LOAD_TOP_EDGE
     LOAD_LEFT_EDGE
@@ -165 +202 @@
     src[3+0*stride]=(t1 + 2*t2 + t3 + 2)>>2;
 }
 
-static void pred4x4_down_left_c(uint8_t *src, const uint8_t *topright, int stride){
+static void FUNCC(pred4x4_down_left)(uint8_t *p_src, const uint8_t *p_topright, int p_stride){
+    pixel *src = (pixel*)p_src;
+    const pixel *topright = (const pixel*)p_topright;
+    int stride = p_stride>>(sizeof(pixel)-1);
     LOAD_TOP_EDGE
     LOAD_TOP_RIGHT_EDGE
 //    LOAD_LEFT_EDGE
@@ -188 +228 @@
     src[3+3*stride]=(t6 + 3*t7 + 2)>>2;
 }
 
-static void pred4x4_down_left_svq3_c(uint8_t *src, const uint8_t *topright, int stride){
+static void FUNCC(pred4x4_down_left_svq3)(uint8_t *p_src, const uint8_t *topright, int p_stride){
+    pixel *src = (pixel*)p_src;
+    int stride = p_stride>>(sizeof(pixel)-1);
     LOAD_TOP_EDGE
     LOAD_LEFT_EDGE
     const av_unused int unu0= t0;
@@ -212 +254 @@
     src[3+3*stride]=(l3 + t3)>>1;
 }
 
-static void pred4x4_down_left_rv40_c(uint8_t *src, const uint8_t *topright, int stride){
+static void FUNCC(pred4x4_down_left_rv40)(uint8_t *p_src, const uint8_t *p_topright, int p_stride){
+    pixel *src = (pixel*)p_src;
+    const pixel *topright = (const pixel*)p_topright;
+    int stride = p_stride>>(sizeof(pixel)-1);
     LOAD_TOP_EDGE
     LOAD_TOP_RIGHT_EDGE
     LOAD_LEFT_EDGE
@@ -236 +281 @@
     src[3+3*stride]=(t6 + t7 + 1 + l6 + l7 + 1)>>2;
 }
 
-static void pred4x4_down_left_rv40_nodown_c(uint8_t *src, const uint8_t *topright, int stride){
+static void FUNCC(pred4x4_down_left_rv40_nodown)(uint8_t *p_src, const uint8_t *p_topright, int p_stride){
+    pixel *src = (pixel*)p_src;
+    const pixel *topright = (const pixel*)p_topright;
+    int stride = p_stride>>(sizeof(pixel)-1);
     LOAD_TOP_EDGE
     LOAD_TOP_RIGHT_EDGE
     LOAD_LEFT_EDGE
@@ -259 +307 @@
     src[3+3*stride]=(t6 + t7 + 1 + 2*l3 + 1)>>2;
 }
 
-static void pred4x4_vertical_right_c(uint8_t *src, const uint8_t *topright, int stride){
+static void FUNCC(pred4x4_vertical_right)(uint8_t *p_src, const uint8_t *topright, int p_stride){
+    pixel *src = (pixel*)p_src;
+    int stride = p_stride>>(sizeof(pixel)-1);
     const int lt= src[-1-1*stride];
     LOAD_TOP_EDGE
     LOAD_LEFT_EDGE
@@ -282 +332 @@
     src[0+3*stride]=(l0 + 2*l1 + l2 + 2)>>2;
 }
 
-static void pred4x4_vertical_left_c(uint8_t *src, const uint8_t *topright, int stride){
+static void FUNCC(pred4x4_vertical_left)(uint8_t *p_src, const uint8_t *p_topright, int p_stride){
+    pixel *src = (pixel*)p_src;
+    const pixel *topright = (const pixel*)p_topright;
+    int stride = p_stride>>(sizeof(pixel)-1);
     LOAD_TOP_EDGE
     LOAD_TOP_RIGHT_EDGE
 
@@ -304 +357 @@
     src[3+3*stride]=(t4 + 2*t5 + t6 + 2)>>2;
 }
 
-static void pred4x4_vertical_left_rv40(uint8_t *src, const uint8_t *topright, int stride,
+static void FUNCC(pred4x4_vertical_left_rv40_internal)(uint8_t *p_src, const uint8_t *p_topright, int p_stride,
                                       const int l0, const int l1, const int l2, const int l3, const int l4){
+    pixel *src = (pixel*)p_src;
+    const pixel *topright = (const pixel*)p_topright;
+    int stride = p_stride>>(sizeof(pixel)-1);
     LOAD_TOP_EDGE
     LOAD_TOP_RIGHT_EDGE
 
@@ -327 +383 @@
     src[3+3*stride]=(t4 + 2*t5 + t6 + 2)>>2;
 }
 
-static void pred4x4_vertical_left_rv40_c(uint8_t *src, const uint8_t *topright, int stride){
+static void FUNCC(pred4x4_vertical_left_rv40)(uint8_t *p_src, const uint8_t *topright, int p_stride){
+    pixel *src = (pixel*)p_src;
+    int stride = p_stride>>(sizeof(pixel)-1);
     LOAD_LEFT_EDGE
     LOAD_DOWN_LEFT_EDGE
 
-    pred4x4_vertical_left_rv40(src, topright, stride, l0, l1, l2, l3, l4);
+    FUNCC(pred4x4_vertical_left_rv40_internal)(p_src, topright, p_stride, l0, l1, l2, l3, l4);
 }
 
-static void pred4x4_vertical_left_rv40_nodown_c(uint8_t *src, const uint8_t *topright, int stride){
+static void FUNCC(pred4x4_vertical_left_rv40_nodown)(uint8_t *p_src, const uint8_t *topright, int p_stride){
+    pixel *src = (pixel*)p_src;
+    int stride = p_stride>>(sizeof(pixel)-1);
     LOAD_LEFT_EDGE
 
-    pred4x4_vertical_left_rv40(src, topright, stride, l0, l1, l2, l3, l3);
+    FUNCC(pred4x4_vertical_left_rv40_internal)(p_src, topright, p_stride, l0, l1, l2, l3, l3);
 }
 
-static void pred4x4_vertical_left_vp8_c(uint8_t *src, const uint8_t *topright, int stride){
+static void FUNCC(pred4x4_vertical_left_vp8)(uint8_t *p_src, const uint8_t *p_topright, int p_stride){
+    pixel *src = (pixel*)p_src;
+    const pixel *topright = (const pixel*)p_topright;
+    int stride = p_stride>>(sizeof(pixel)-1);
     LOAD_TOP_EDGE
     LOAD_TOP_RIGHT_EDGE
 
@@ -362 +425 @@
     src[3+3*stride]=(t5 + 2*t6 + t7 + 2)>>2;
 }
 
-static void pred4x4_horizontal_up_c(uint8_t *src, const uint8_t *topright, int stride){
+static void FUNCC(pred4x4_horizontal_up)(uint8_t *p_src, const uint8_t *topright, int p_stride){
+    pixel *src = (pixel*)p_src;
+    int stride = p_stride>>(sizeof(pixel)-1);
     LOAD_LEFT_EDGE
 
     src[0+0*stride]=(l0 + l1 + 1)>>1;
@@ -383 +448 @@
     src[3+3*stride]=l3;
 }
 
-static void pred4x4_horizontal_up_rv40_c(uint8_t *src, const uint8_t *topright, int stride){
+static void FUNCC(pred4x4_horizontal_up_rv40)(uint8_t *p_src, const uint8_t *p_topright, int p_stride){
+    pixel *src = (pixel*)p_src;
+    const pixel *topright = (const pixel*)p_topright;
+    int stride = p_stride>>(sizeof(pixel)-1);
     LOAD_LEFT_EDGE
     LOAD_DOWN_LEFT_EDGE
     LOAD_TOP_EDGE
@@ -407 +475 @@
     src[3+3*stride]=(l4 + 2*l5 + l6 + 2)>>2;
 }
 
-static void pred4x4_horizontal_up_rv40_nodown_c(uint8_t *src, const uint8_t *topright, int stride){
+static void FUNCC(pred4x4_horizontal_up_rv40_nodown)(uint8_t *p_src, const uint8_t *p_topright, int p_stride){
+    pixel *src = (pixel*)p_src;
+    const pixel *topright = (const pixel*)p_topright;
+    int stride = p_stride>>(sizeof(pixel)-1);
     LOAD_LEFT_EDGE
     LOAD_TOP_EDGE
     LOAD_TOP_RIGHT_EDGE
@@ -430 +501 @@
     src[3+3*stride]=l3;
 }
 
-static void pred4x4_horizontal_down_c(uint8_t *src, const uint8_t *topright, int stride){
+static void FUNCC(pred4x4_horizontal_down)(uint8_t *p_src, const uint8_t *topright, int p_stride){
+    pixel *src = (pixel*)p_src;
+    int stride = p_stride>>(sizeof(pixel)-1);
     const int lt= src[-1-1*stride];
     LOAD_TOP_EDGE
     LOAD_LEFT_EDGE
@@ -453 +526 @@
     src[1+3*stride]=(l1 + 2*l2 + l3 + 2)>>2;
 }
 
-static void pred4x4_tm_vp8_c(uint8_t *src, const uint8_t *topright, int stride){
+static void FUNCC(pred4x4_tm_vp8)(uint8_t *p_src, const uint8_t *topright, int p_stride){
+    pixel *src = (pixel*)p_src;
+    int stride = p_stride>>(sizeof(pixel)-1);
     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP - src[-1-stride];
-    uint8_t *top = src-stride;
+    pixel *top = src-stride;
     int y;
 
     for (y = 0; y < 4; y++) {
@@ -468 +543 @@
     }
 }
 
-static void pred16x16_vertical_c(uint8_t *src, int stride){
+static void FUNCC(pred16x16_vertical)(uint8_t *p_src, int p_stride){
     int i;
-    const uint32_t a= ((uint32_t*)(src-stride))[0];
-    const uint32_t b= ((uint32_t*)(src-stride))[1];
-    const uint32_t c= ((uint32_t*)(src-stride))[2];
-    const uint32_t d= ((uint32_t*)(src-stride))[3];
+    pixel *src = (pixel*)p_src;
+    int stride = p_stride>>(sizeof(pixel)-1);
+    const pixel4 a = ((pixel4*)(src-stride))[0];
+    const pixel4 b = ((pixel4*)(src-stride))[1];
+    const pixel4 c = ((pixel4*)(src-stride))[2];
+    const pixel4 d = ((pixel4*)(src-stride))[3];
 
     for(i=0; i<16; i++){
-        ((uint32_t*)(src+i*stride))[0]= a;
-        ((uint32_t*)(src+i*stride))[1]= b;
-        ((uint32_t*)(src+i*stride))[2]= c;
-        ((uint32_t*)(src+i*stride))[3]= d;
+        ((pixel4*)(src+i*stride))[0] = a;
+        ((pixel4*)(src+i*stride))[1] = b;
+        ((pixel4*)(src+i*stride))[2] = c;
+        ((pixel4*)(src+i*stride))[3] = d;
     }
 }
 
-static void pred16x16_horizontal_c(uint8_t *src, int stride){
+static void FUNCC(pred16x16_horizontal)(uint8_t *p_src, int stride){
     int i;
+    pixel *src = (pixel*)p_src;
+    stride >>= sizeof(pixel)-1;
 
     for(i=0; i<16; i++){
-        ((uint32_t*)(src+i*stride))[0]=
-        ((uint32_t*)(src+i*stride))[1]=
-        ((uint32_t*)(src+i*stride))[2]=
-        ((uint32_t*)(src+i*stride))[3]= src[-1+i*stride]*0x01010101;
+        ((pixel4*)(src+i*stride))[0] =
+        ((pixel4*)(src+i*stride))[1] =
+        ((pixel4*)(src+i*stride))[2] =
+        ((pixel4*)(src+i*stride))[3] = PIXEL_SPLAT_X4(src[-1+i*stride]);
     }
 }
 
-static void pred16x16_dc_c(uint8_t *src, int stride){
+#define PREDICT_16x16_DC(v)\
+    for(i=0; i<16; i++){\
+        AV_WN4P(src+ 0, v);\
+        AV_WN4P(src+ 4, v);\
+        AV_WN4P(src+ 8, v);\
+        AV_WN4P(src+12, v);\
+        src += stride;\
+    }
+
+static void FUNCC(pred16x16_dc)(uint8_t *p_src, int stride){
     int i, dc=0;
+    pixel *src = (pixel*)p_src;
+    pixel4 dcsplat;
+    stride >>= sizeof(pixel)-1;
 
     for(i=0;i<16; i++){
         dc+= src[-1+i*stride];
@@ -505 +596 @@
         dc+= src[i-stride];
     }
 
-    dc= 0x01010101*((dc + 16)>>5);
-
-    for(i=0; i<16; i++){
-        ((uint32_t*)(src+i*stride))[0]=
-        ((uint32_t*)(src+i*stride))[1]=
-        ((uint32_t*)(src+i*stride))[2]=
-        ((uint32_t*)(src+i*stride))[3]= dc;
-    }
+    dcsplat = PIXEL_SPLAT_X4((dc+16)>>5);
+    PREDICT_16x16_DC(dcsplat);
 }
 
-static void pred16x16_left_dc_c(uint8_t *src, int stride){
+static void FUNCC(pred16x16_left_dc)(uint8_t *p_src, int stride){
     int i, dc=0;
+    pixel *src = (pixel*)p_src;
+    pixel4 dcsplat;
+    stride >>= sizeof(pixel)-1;
 
     for(i=0;i<16; i++){
         dc+= src[-1+i*stride];
     }
 
-    dc= 0x01010101*((dc + 8)>>4);
-
-    for(i=0; i<16; i++){
-        ((uint32_t*)(src+i*stride))[0]=
-        ((uint32_t*)(src+i*stride))[1]=
-        ((uint32_t*)(src+i*stride))[2]=
-        ((uint32_t*)(src+i*stride))[3]= dc;
-    }
+    dcsplat = PIXEL_SPLAT_X4((dc+8)>>4);
+    PREDICT_16x16_DC(dcsplat);
 }
 
-static void pred16x16_top_dc_c(uint8_t *src, int stride){
+static void FUNCC(pred16x16_top_dc)(uint8_t *p_src, int stride){
     int i, dc=0;
+    pixel *src = (pixel*)p_src;
+    pixel4 dcsplat;
+    stride >>= sizeof(pixel)-1;
 
     for(i=0;i<16; i++){
         dc+= src[i-stride];
     }
-    dc= 0x01010101*((dc + 8)>>4);
 
-    for(i=0; i<16; i++){
-        ((uint32_t*)(src+i*stride))[0]=
-        ((uint32_t*)(src+i*stride))[1]=
-        ((uint32_t*)(src+i*stride))[2]=
-        ((uint32_t*)(src+i*stride))[3]= dc;
-    }
+    dcsplat = PIXEL_SPLAT_X4((dc+8)>>4);
+    PREDICT_16x16_DC(dcsplat);
 }
 
-static void pred16x16_128_dc_c(uint8_t *src, int stride){
-    int i;
-
-    for(i=0; i<16; i++){
-        ((uint32_t*)(src+i*stride))[0]=
-        ((uint32_t*)(src+i*stride))[1]=
-        ((uint32_t*)(src+i*stride))[2]=
-        ((uint32_t*)(src+i*stride))[3]= 0x01010101U*128U;
-    }
+#define PRED16x16_X(n, v) \
+static void FUNCC(pred16x16_##n##_dc)(uint8_t *p_src, int stride){\
+    int i;\
+    pixel *src = (pixel*)p_src;\
+    stride >>= sizeof(pixel)-1;\
+    PREDICT_16x16_DC(PIXEL_SPLAT_X4(v));\
 }
 
-static void pred16x16_127_dc_c(uint8_t *src, int stride){
-    int i;
-
-    for(i=0; i<16; i++){
-        ((uint32_t*)(src+i*stride))[0]=
-        ((uint32_t*)(src+i*stride))[1]=
-        ((uint32_t*)(src+i*stride))[2]=
-        ((uint32_t*)(src+i*stride))[3]= 0x01010101U*127U;
-    }
-}
-
-static void pred16x16_129_dc_c(uint8_t *src, int stride){
-    int i;
-
-    for(i=0; i<16; i++){
-        ((uint32_t*)(src+i*stride))[0]=
-        ((uint32_t*)(src+i*stride))[1]=
-        ((uint32_t*)(src+i*stride))[2]=
-        ((uint32_t*)(src+i*stride))[3]= 0x01010101U*129U;
-    }
-}
+PRED16x16_X(127, (1<<(BIT_DEPTH-1))-1);
+PRED16x16_X(128, (1<<(BIT_DEPTH-1))+0);
+PRED16x16_X(129, (1<<(BIT_DEPTH-1))+1);
 
-static inline void pred16x16_plane_compat_c(uint8_t *src, int stride, const int svq3, const int rv40){
+static inline void FUNCC(pred16x16_plane_compat)(uint8_t *p_src, int p_stride, const int svq3, const int rv40){
   int i, j, k;
   int a;
-  uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
-  const uint8_t * const src0 = src+7-stride;
-  const uint8_t *src1 = src+8*stride-1;
-  const uint8_t *src2 = src1-2*stride;      // == src+6*stride-1;
+  INIT_CLIP
+  pixel *src = (pixel*)p_src;
+  int stride = p_stride>>(sizeof(pixel)-1);
+  const pixel * const src0 = src +7-stride;
+  const pixel *       src1 = src +8*stride-1;
+  const pixel *       src2 = src1-2*stride;    // == src+6*stride-1;
   int H = src0[1] - src0[-1];
   int V = src1[0] - src2[ 0];
   for(k=2; k<=8; ++k) {
@@ -614 +675 @@
     int b = a;
     a += V;
     for(i=-16; i<0; i+=4) {
-      src[16+i] = cm[ (b    ) >> 5 ];
-      src[17+i] = cm[ (b+  H) >> 5 ];
-      src[18+i] = cm[ (b+2*H) >> 5 ];
-      src[19+i] = cm[ (b+3*H) >> 5 ];
+      src[16+i] = CLIP((b    ) >> 5);
+      src[17+i] = CLIP((b+  H) >> 5);
+      src[18+i] = CLIP((b+2*H) >> 5);
+      src[19+i] = CLIP((b+3*H) >> 5);
       b += 4*H;
     }
     src += stride;
   }
 }
 
-static void pred16x16_plane_c(uint8_t *src, int stride){
-    pred16x16_plane_compat_c(src, stride, 0, 0);
+static void FUNCC(pred16x16_plane)(uint8_t *src, int stride){
+    FUNCC(pred16x16_plane_compat)(src, stride, 0, 0);
 }
 
-static void pred16x16_plane_svq3_c(uint8_t *src, int stride){
-    pred16x16_plane_compat_c(src, stride, 1, 0);
+static void FUNCC(pred16x16_plane_svq3)(uint8_t *src, int stride){
+    FUNCC(pred16x16_plane_compat)(src, stride, 1, 0);
 }
 
-static void pred16x16_plane_rv40_c(uint8_t *src, int stride){
-    pred16x16_plane_compat_c(src, stride, 0, 1);
+static void FUNCC(pred16x16_plane_rv40)(uint8_t *src, int stride){
+    FUNCC(pred16x16_plane_compat)(src, stride, 0, 1);
 }
 
-static void pred16x16_tm_vp8_c(uint8_t *src, int stride){
+static void FUNCC(pred16x16_tm_vp8)(uint8_t *src, int stride){
     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP - src[-1-stride];
     uint8_t *top = src-stride;
     int y;
@@ -663 +724 @@
     }
 }
 
-static void pred8x8_vertical_c(uint8_t *src, int stride){
+static void FUNCC(pred8x8_vertical)(uint8_t *p_src, int p_stride){
     int i;
-    const uint32_t a= ((uint32_t*)(src-stride))[0];
-    const uint32_t b= ((uint32_t*)(src-stride))[1];
+    pixel *src = (pixel*)p_src;
+    int stride = p_stride>>(sizeof(pixel)-1);
+    const pixel4 a= ((pixel4*)(src-stride))[0];
+    const pixel4 b= ((pixel4*)(src-stride))[1];
 
     for(i=0; i<8; i++){
-        ((uint32_t*)(src+i*stride))[0]= a;
-        ((uint32_t*)(src+i*stride))[1]= b;
+        ((pixel4*)(src+i*stride))[0]= a;
+        ((pixel4*)(src+i*stride))[1]= b;
     }
 }
 
-static void pred8x8_horizontal_c(uint8_t *src, int stride){
+static void FUNCC(pred8x8_horizontal)(uint8_t *p_src, int stride){
     int i;
+    pixel *src = (pixel*)p_src;
+    stride >>= sizeof(pixel)-1;
 
     for(i=0; i<8; i++){
-        ((uint32_t*)(src+i*stride))[0]=
-        ((uint32_t*)(src+i*stride))[1]= src[-1+i*stride]*0x01010101;
+        ((pixel4*)(src+i*stride))[0]=
+        ((pixel4*)(src+i*stride))[1]= PIXEL_SPLAT_X4(src[-1+i*stride]);
     }
 }
 
-static void pred8x8_128_dc_c(uint8_t *src, int stride){
-    int i;
-
-    for(i=0; i<8; i++){
-        ((uint32_t*)(src+i*stride))[0]=
-        ((uint32_t*)(src+i*stride))[1]= 0x01010101U*128U;
-    }
+#define PRED8x8_X(n, v)\
+static void FUNCC(pred8x8_##n##_dc)(uint8_t *p_src, int stride){\
+    int i;\
+    pixel *src = (pixel*)p_src;\
+    stride >>= sizeof(pixel)-1;\
+    for(i=0; i<8; i++){\
+        ((pixel4*)(src+i*stride))[0]=\
+        ((pixel4*)(src+i*stride))[1]= PIXEL_SPLAT_X4(v);\
+    }\
 }
 
-static void pred8x8_127_dc_c(uint8_t *src, int stride){
-    int i;
+PRED8x8_X(127, (1<<(BIT_DEPTH-1))-1);
+PRED8x8_X(128, (1<<(BIT_DEPTH-1))+0);
+PRED8x8_X(129, (1<<(BIT_DEPTH-1))+1);
 
-    for(i=0; i<8; i++){
-        ((uint32_t*)(src+i*stride))[0]=
-        ((uint32_t*)(src+i*stride))[1]= 0x01010101U*127U;
-    }
-}
-static void pred8x8_129_dc_c(uint8_t *src, int stride){
-    int i;
-
-    for(i=0; i<8; i++){
-        ((uint32_t*)(src+i*stride))[0]=
-        ((uint32_t*)(src+i*stride))[1]= 0x01010101U*129U;
-    }
-}
-
-static void pred8x8_left_dc_c(uint8_t *src, int stride){
+static void FUNCC(pred8x8_left_dc)(uint8_t *p_src, int stride){
     int i;
     int dc0, dc2;
+    pixel4 dc0splat, dc2splat;
+    pixel *src = (pixel*)p_src;
+    stride >>= sizeof(pixel)-1;
 
     dc0=dc2=0;
     for(i=0;i<4; i++){
         dc0+= src[-1+i*stride];
         dc2+= src[-1+(i+4)*stride];
     }
-    dc0= 0x01010101*((dc0 + 2)>>2);
-    dc2= 0x01010101*((dc2 + 2)>>2);
+    dc0splat = PIXEL_SPLAT_X4((dc0 + 2)>>2);
+    dc2splat = PIXEL_SPLAT_X4((dc2 + 2)>>2);
 
     for(i=0; i<4; i++){
-        ((uint32_t*)(src+i*stride))[0]=
-        ((uint32_t*)(src+i*stride))[1]= dc0;
+        ((pixel4*)(src+i*stride))[0]=
+        ((pixel4*)(src+i*stride))[1]= dc0splat;
     }
     for(i=4; i<8; i++){
-        ((uint32_t*)(src+i*stride))[0]=
-        ((uint32_t*)(src+i*stride))[1]= dc2;
+        ((pixel4*)(src+i*stride))[0]=
+        ((pixel4*)(src+i*stride))[1]= dc2splat;
     }
 }
 
-static void pred8x8_left_dc_rv40_c(uint8_t *src, int stride){
+static void FUNCC(pred8x8_left_dc_rv40)(uint8_t *p_src, int stride){
     int i;
     int dc0;
+    pixel4 dc0splat;
+    pixel *src = (pixel*)p_src;
+    stride >>= sizeof(pixel)-1;
 
     dc0=0;
     for(i=0;i<8; i++)
         dc0+= src[-1+i*stride];
-    dc0= 0x01010101*((dc0 + 4)>>3);
+    dc0splat = PIXEL_SPLAT_X4((dc0 + 4)>>3);
 
     for(i=0; i<8; i++){
-        ((uint32_t*)(src+i*stride))[0]=
-        ((uint32_t*)(src+i*stride))[1]= dc0;
+        ((pixel4*)(src+i*stride))[0]=
+        ((pixel4*)(src+i*stride))[1]= dc0splat;
     }
 }
 
-static void pred8x8_top_dc_c(uint8_t *src, int stride){
+static void FUNCC(pred8x8_top_dc)(uint8_t *p_src, int stride){
     int i;
     int dc0, dc1;
+    pixel4 dc0splat, dc1splat;
+    pixel *src = (pixel*)p_src;
+    stride >>= sizeof(pixel)-1;
 
     dc0=dc1=0;
     for(i=0;i<4; i++){
         dc0+= src[i-stride];
         dc1+= src[4+i-stride];
     }
-    dc0= 0x01010101*((dc0 + 2)>>2);
-    dc1= 0x01010101*((dc1 + 2)>>2);
+    dc0splat = PIXEL_SPLAT_X4((dc0 + 2)>>2);
+    dc1splat = PIXEL_SPLAT_X4((dc1 + 2)>>2);
 
     for(i=0; i<4; i++){
-        ((uint32_t*)(src+i*stride))[0]= dc0;
-        ((uint32_t*)(src+i*stride))[1]= dc1;
+        ((pixel4*)(src+i*stride))[0]= dc0splat;
+        ((pixel4*)(src+i*stride))[1]= dc1splat;
     }
     for(i=4; i<8; i++){
-        ((uint32_t*)(src+i*stride))[0]= dc0;
-        ((uint32_t*)(src+i*stride))[1]= dc1;
+        ((pixel4*)(src+i*stride))[0]= dc0splat;
+        ((pixel4*)(src+i*stride))[1]= dc1splat;
     }
 }
 
-static void pred8x8_top_dc_rv40_c(uint8_t *src, int stride){
+static void FUNCC(pred8x8_top_dc_rv40)(uint8_t *p_src, int stride){
     int i;
     int dc0;
+    pixel4 dc0splat;
+    pixel *src = (pixel*)p_src;
+    stride >>= sizeof(pixel)-1;
 
     dc0=0;
     for(i=0;i<8; i++)
         dc0+= src[i-stride];
-    dc0= 0x01010101*((dc0 + 4)>>3);
+    dc0splat = PIXEL_SPLAT_X4((dc0 + 4)>>3);
 
     for(i=0; i<8; i++){
-        ((uint32_t*)(src+i*stride))[0]=
-        ((uint32_t*)(src+i*stride))[1]= dc0;
+        ((pixel4*)(src+i*stride))[0]=
+        ((pixel4*)(src+i*stride))[1]= dc0splat;
     }
 }
 
 
-static void pred8x8_dc_c(uint8_t *src, int stride){
+static void FUNCC(pred8x8_dc)(uint8_t *p_src, int stride){
     int i;
-    int dc0, dc1, dc2, dc3;
+    int dc0, dc1, dc2;
+    pixel4 dc0splat, dc1splat, dc2splat, dc3splat;
+    pixel *src = (pixel*)p_src;
+    stride >>= sizeof(pixel)-1;
 
     dc0=dc1=dc2=0;
     for(i=0;i<4; i++){
@@ -794 +863 @@
         dc1+= src[4+i-stride];
         dc2+= src[-1+(i+4)*stride];
     }
-    dc3= 0x01010101*((dc1 + dc2 + 4)>>3);
-    dc0= 0x01010101*((dc0 + 4)>>3);
-    dc1= 0x01010101*((dc1 + 2)>>2);
-    dc2= 0x01010101*((dc2 + 2)>>2);
+    dc0splat = PIXEL_SPLAT_X4((dc0 + 4)>>3);
+    dc1splat = PIXEL_SPLAT_X4((dc1 + 2)>>2);
+    dc2splat = PIXEL_SPLAT_X4((dc2 + 2)>>2);
+    dc3splat = PIXEL_SPLAT_X4((dc1 + dc2 + 4)>>3);
 
     for(i=0; i<4; i++){
-        ((uint32_t*)(src+i*stride))[0]= dc0;
-        ((uint32_t*)(src+i*stride))[1]= dc1;
+        ((pixel4*)(src+i*stride))[0]= dc0splat;
+        ((pixel4*)(src+i*stride))[1]= dc1splat;
     }
     for(i=4; i<8; i++){
-        ((uint32_t*)(src+i*stride))[0]= dc2;
-        ((uint32_t*)(src+i*stride))[1]= dc3;
+        ((pixel4*)(src+i*stride))[0]= dc2splat;
+        ((pixel4*)(src+i*stride))[1]= dc3splat;
     }
 }
 
 //the following 4 function should not be optimized!
-static void pred8x8_mad_cow_dc_l0t(uint8_t *src, int stride){
-    pred8x8_top_dc_c(src, stride);
-    pred4x4_dc_c(src, NULL, stride);
+static void FUNC(pred8x8_mad_cow_dc_l0t)(uint8_t *src, int stride){
+    FUNCC(pred8x8_top_dc)(src, stride);
+    FUNCC(pred4x4_dc)(src, NULL, stride);
 }
 
-static void pred8x8_mad_cow_dc_0lt(uint8_t *src, int stride){
-    pred8x8_dc_c(src, stride);
-    pred4x4_top_dc_c(src, NULL, stride);
+static void FUNC(pred8x8_mad_cow_dc_0lt)(uint8_t *src, int stride){
+    FUNCC(pred8x8_dc)(src, stride);
+    FUNCC(pred4x4_top_dc)(src, NULL, stride);
 }
 
-static void pred8x8_mad_cow_dc_l00(uint8_t *src, int stride){
-    pred8x8_left_dc_c(src, stride);
-    pred4x4_128_dc_c(src + 4*stride    , NULL, stride);
-    pred4x4_128_dc_c(src + 4*stride + 4, NULL, stride);
+static void FUNC(pred8x8_mad_cow_dc_l00)(uint8_t *src, int stride){
+    FUNCC(pred8x8_left_dc)(src, stride);
+    FUNCC(pred4x4_128_dc)(src + 4*stride                  , NULL, stride);
+    FUNCC(pred4x4_128_dc)(src + 4*stride + 4*sizeof(pixel), NULL, stride);
 }
 
-static void pred8x8_mad_cow_dc_0l0(uint8_t *src, int stride){
-    pred8x8_left_dc_c(src, stride);
-    pred4x4_128_dc_c(src    , NULL, stride);
-    pred4x4_128_dc_c(src + 4, NULL, stride);
+static void FUNC(pred8x8_mad_cow_dc_0l0)(uint8_t *src, int stride){
+    FUNCC(pred8x8_left_dc)(src, stride);
+    FUNCC(pred4x4_128_dc)(src                  , NULL, stride);
+    FUNCC(pred4x4_128_dc)(src + 4*sizeof(pixel), NULL, stride);
 }
 
-
-static void pred8x8_dc_rv40_c(uint8_t *src, int stride){
+static void FUNCC(pred8x8_dc_rv40)(uint8_t *p_src, int stride){
     int i;
     int dc0=0;
+    pixel4 dc0splat;
+    pixel *src = (pixel*)p_src;
+    stride >>= sizeof(pixel)-1;
 
     for(i=0;i<4; i++){
         dc0+= src[-1+i*stride] + src[i-stride];
         dc0+= src[4+i-stride];
         dc0+= src[-1+(i+4)*stride];
     }
-    dc0= 0x01010101*((dc0 + 8)>>4);
+    dc0splat = PIXEL_SPLAT_X4((dc0 + 8)>>4);
 
     for(i=0; i<4; i++){
-        ((uint32_t*)(src+i*stride))[0]= dc0;
-        ((uint32_t*)(src+i*stride))[1]= dc0;
+        ((pixel4*)(src+i*stride))[0]= dc0splat;
+        ((pixel4*)(src+i*stride))[1]= dc0splat;
     }
     for(i=4; i<8; i++){
-        ((uint32_t*)(src+i*stride))[0]= dc0;
-        ((uint32_t*)(src+i*stride))[1]= dc0;
+        ((pixel4*)(src+i*stride))[0]= dc0splat;
+        ((pixel4*)(src+i*stride))[1]= dc0splat;
     }
 }
 
-static void pred8x8_plane_c(uint8_t *src, int stride){
+static void FUNCC(pred8x8_plane)(uint8_t *p_src, int p_stride){
   int j, k;
   int a;
-  uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
-  const uint8_t * const src0 = src+3-stride;
-  const uint8_t *src1 = src+4*stride-1;
-  const uint8_t *src2 = src1-2*stride;      // == src+2*stride-1;
+  INIT_CLIP
+  pixel *src = (pixel*)p_src;
+  int stride = p_stride>>(sizeof(pixel)-1);
+  const pixel * const src0 = src +3-stride;
+  const pixel *       src1 = src +4*stride-1;
+  const pixel *       src2 = src1-2*stride;    // == src+2*stride-1;
   int H = src0[1] - src0[-1];
   int V = src1[0] - src2[ 0];
   for(k=2; k<=4; ++k) {
@@ -875 +948 @@
   for(j=8; j>0; --j) {
     int b = a;
     a += V;
-    src[0] = cm[ (b    ) >> 5 ];
-    src[1] = cm[ (b+  H) >> 5 ];
-    src[2] = cm[ (b+2*H) >> 5 ];
-    src[3] = cm[ (b+3*H) >> 5 ];
-    src[4] = cm[ (b+4*H) >> 5 ];
-    src[5] = cm[ (b+5*H) >> 5 ];
-    src[6] = cm[ (b+6*H) >> 5 ];
-    src[7] = cm[ (b+7*H) >> 5 ];
+    src[0] = CLIP((b    ) >> 5);
+    src[1] = CLIP((b+  H) >> 5);
+    src[2] = CLIP((b+2*H) >> 5);
+    src[3] = CLIP((b+3*H) >> 5);
+    src[4] = CLIP((b+4*H) >> 5);
+    src[5] = CLIP((b+5*H) >> 5);
+    src[6] = CLIP((b+6*H) >> 5);
+    src[7] = CLIP((b+7*H) >> 5);
     src += stride;
   }
 }
 
-static void pred8x8_tm_vp8_c(uint8_t *src, int stride){
+static void FUNCC(pred8x8_tm_vp8)(uint8_t *p_src, int p_stride){
+    pixel *src = (pixel*)p_src;
+    int stride = p_stride>>(sizeof(pixel)-1);
     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP - src[-1-stride];
-    uint8_t *top = src-stride;
+    pixel *top = src-stride;
     int y;
 
     for (y = 0; y < 8; y++) {
939 1014
#define PREDICT_8x8_DC(v) \
940 1015
    int y; \
941 1016
    for( y = 0; y < 8; y++ ) { \
942
        ((uint32_t*)src)[0] = \
943
        ((uint32_t*)src)[1] = v; \
1017
        ((pixel4*)src)[0] = \
1018
        ((pixel4*)src)[1] = v; \
944 1019
        src += stride; \
945 1020
    }
946 1021

  
947
static void pred8x8l_128_dc_c(uint8_t *src, int has_topleft, int has_topright, int stride)
1022
static void FUNCC(pred8x8l_128_dc)(uint8_t *p_src, int has_topleft, int has_topright, int p_stride)
948 1023
{
949
    PREDICT_8x8_DC(0x80808080);
1024
    pixel *src = (pixel*)p_src;
1025
    int stride = p_stride>>(sizeof(pixel)-1);
1026

  
1027
    PREDICT_8x8_DC(PIXEL_SPLAT_X4(1<<(BIT_DEPTH-1)));
950 1028
}
951
static void pred8x8l_left_dc_c(uint8_t *src, int has_topleft, int has_topright, int stride)
1029
static void FUNCC(pred8x8l_left_dc)(uint8_t *p_src, int has_topleft, int has_topright, int p_stride)
952 1030
{
1031
    pixel *src = (pixel*)p_src;
1032
    int stride = p_stride>>(sizeof(pixel)-1);
1033

  
953 1034
    PREDICT_8x8_LOAD_LEFT;
954
    const uint32_t dc = ((l0+l1+l2+l3+l4+l5+l6+l7+4) >> 3) * 0x01010101;
1035
    const pixel4 dc = PIXEL_SPLAT_X4((l0+l1+l2+l3+l4+l5+l6+l7+4) >> 3);
955 1036
    PREDICT_8x8_DC(dc);
956 1037
}
957
static void pred8x8l_top_dc_c(uint8_t *src, int has_topleft, int has_topright, int stride)
1038
static void FUNCC(pred8x8l_top_dc)(uint8_t *p_src, int has_topleft, int has_topright, int p_stride)
958 1039
{
1040
    pixel *src = (pixel*)p_src;
1041
    int stride = p_stride>>(sizeof(pixel)-1);
1042

  
959 1043
    PREDICT_8x8_LOAD_TOP;
960
    const uint32_t dc = ((t0+t1+t2+t3+t4+t5+t6+t7+4) >> 3) * 0x01010101;
1044
    const pixel4 dc = PIXEL_SPLAT_X4((t0+t1+t2+t3+t4+t5+t6+t7+4) >> 3);
961 1045
    PREDICT_8x8_DC(dc);
962 1046
}
963
static void pred8x8l_dc_c(uint8_t *src, int has_topleft, int has_topright, int stride)
1047
static void FUNCC(pred8x8l_dc)(uint8_t *p_src, int has_topleft, int has_topright, int p_stride)
964 1048
{
1049
    pixel *src = (pixel*)p_src;
1050
    int stride = p_stride>>(sizeof(pixel)-1);
1051

  
965 1052
    PREDICT_8x8_LOAD_LEFT;
966 1053
    PREDICT_8x8_LOAD_TOP;
967
    const uint32_t dc = ((l0+l1+l2+l3+l4+l5+l6+l7
968
                         +t0+t1+t2+t3+t4+t5+t6+t7+8) >> 4) * 0x01010101;
1054
    const pixel4 dc = PIXEL_SPLAT_X4((l0+l1+l2+l3+l4+l5+l6+l7
1055
                                     +t0+t1+t2+t3+t4+t5+t6+t7+8) >> 4);
969 1056
    PREDICT_8x8_DC(dc);
970 1057
}
971
static void pred8x8l_horizontal_c(uint8_t *src, int has_topleft, int has_topright, int stride)
1058
static void FUNCC(pred8x8l_horizontal)(uint8_t *p_src, int has_topleft, int has_topright, int p_stride)
972 1059
{
1060
    pixel *src = (pixel*)p_src;
1061
    int stride = p_stride>>(sizeof(pixel)-1);
1062

  
973 1063
    PREDICT_8x8_LOAD_LEFT;
974
#define ROW(y) ((uint32_t*)(src+y*stride))[0] =\
975
               ((uint32_t*)(src+y*stride))[1] = 0x01010101 * l##y
1064
#define ROW(y) ((pixel4*)(src+y*stride))[0] =\
1065
               ((pixel4*)(src+y*stride))[1] = PIXEL_SPLAT_X4(l##y)
976 1066
    ROW(0); ROW(1); ROW(2); ROW(3); ROW(4); ROW(5); ROW(6); ROW(7);
977 1067
#undef ROW
978 1068
}
979
static void pred8x8l_vertical_c(uint8_t *src, int has_topleft, int has_topright, int stride)
1069
static void FUNCC(pred8x8l_vertical)(uint8_t *p_src, int has_topleft, int has_topright, int p_stride)
980 1070
{
981 1071
    int y;
1072
    pixel *src = (pixel*)p_src;
1073
    int stride = p_stride>>(sizeof(pixel)-1);
1074

  
982 1075
    PREDICT_8x8_LOAD_TOP;
983 1076
    src[0] = t0;
984 1077
    src[1] = t1;
@@ -988 +1081 @@
     src[5] = t5;
     src[6] = t6;
     src[7] = t7;
-    for( y = 1; y < 8; y++ )
-        *(uint64_t*)(src+y*stride) = *(uint64_t*)src;
+    for( y = 1; y < 8; y++ ) {
+        ((pixel4*)(src+y*stride))[0] = ((pixel4*)src)[0];
+        ((pixel4*)(src+y*stride))[1] = ((pixel4*)src)[1];
+    }
 }
-static void pred8x8l_down_left_c(uint8_t *src, int has_topleft, int has_topright, int stride)
+static void FUNCC(pred8x8l_down_left)(uint8_t *p_src, int has_topleft, int has_topright, int p_stride)
 {
+    pixel *src = (pixel*)p_src;
+    int stride = p_stride>>(sizeof(pixel)-1);
     PREDICT_8x8_LOAD_TOP;
     PREDICT_8x8_LOAD_TOPRIGHT;
     SRC(0,0)= (t0 + 2*t1 + t2 + 2) >> 2;
@@ -1011 +1108 @@
     SRC(6,7)=SRC(7,6)= (t13 + 2*t14 + t15 + 2) >> 2;
     SRC(7,7)= (t14 + 3*t15 + 2) >> 2;
 }
-static void pred8x8l_down_right_c(uint8_t *src, int has_topleft, int has_topright, int stride)
+static void FUNCC(pred8x8l_down_right)(uint8_t *p_src, int has_topleft, int has_topright, int p_stride)
 {
+    pixel *src = (pixel*)p_src;
+    int stride = p_stride>>(sizeof(pixel)-1);
     PREDICT_8x8_LOAD_TOP;
     PREDICT_8x8_LOAD_LEFT;
     PREDICT_8x8_LOAD_TOPLEFT;
@@ -1031 +1130 @@
     SRC(5,0)=SRC(6,1)=SRC(7,2)= (t3 + 2*t4 + t5 + 2) >> 2;
     SRC(6,0)=SRC(7,1)= (t4 + 2*t5 + t6 + 2) >> 2;
     SRC(7,0)= (t5 + 2*t6 + t7 + 2) >> 2;
-
 }
-static void pred8x8l_vertical_right_c(uint8_t *src, int has_topleft, int has_topright, int stride)
+static void FUNCC(pred8x8l_vertical_right)(uint8_t *p_src, int has_topleft, int has_topright, int p_stride)
 {
+    pixel *src = (pixel*)p_src;
+    int stride = p_stride>>(sizeof(pixel)-1);
     PREDICT_8x8_LOAD_TOP;
     PREDICT_8x8_LOAD_LEFT;
     PREDICT_8x8_LOAD_TOPLEFT;
@@ -1061 +1161 @@
     SRC(7,1)= (t5 + 2*t6 + t7 + 2) >> 2;
     SRC(7,0)= (t6 + t7 + 1) >> 1;
 }
-static void pred8x8l_horizontal_down_c(uint8_t *src, int has_topleft, int has_topright, int stride)
+static void FUNCC(pred8x8l_horizontal_down)(uint8_t *p_src, int has_topleft, int has_topright, int p_stride)
 {
+    pixel *src = (pixel*)p_src;
+    int stride = p_stride>>(sizeof(pixel)-1);
     PREDICT_8x8_LOAD_TOP;
     PREDICT_8x8_LOAD_LEFT;
     PREDICT_8x8_LOAD_TOPLEFT;
@@ -1089 +1191 @@
     SRC(6,0)= (t5 + 2*t4 + t3 + 2) >> 2;
     SRC(7,0)= (t6 + 2*t5 + t4 + 2) >> 2;
 }
-static void pred8x8l_vertical_left_c(uint8_t *src, int has_topleft, int has_topright, int stride)
+static void FUNCC(pred8x8l_vertical_left)(uint8_t *p_src, int has_topleft, int has_topright, int p_stride)
 {
+    pixel *src = (pixel*)p_src;
+    int stride = p_stride>>(sizeof(pixel)-1);
     PREDICT_8x8_LOAD_TOP;
     PREDICT_8x8_LOAD_TOPRIGHT;
     SRC(0,0)= (t0 + t1 + 1) >> 1;
@@ -1116 +1220 @@
     SRC(7,6)= (t10 + t11 + 1) >> 1;
     SRC(7,7)= (t10 + 2*t11 + t12 + 2) >> 2;
 }
-static void pred8x8l_horizontal_up_c(uint8_t *src, int has_topleft, int has_topright, int stride)
+static void FUNCC(pred8x8l_horizontal_up)(uint8_t *p_src, int has_topleft, int has_topright, int p_stride)
 {
+    pixel *src = (pixel*)p_src;
+    int stride = p_stride>>(sizeof(pixel)-1);
     PREDICT_8x8_LOAD_LEFT;
     SRC(0,0)= (l0 + l1 + 1) >> 1;
     SRC(1,0)= (l0 + 2*l1 + l2 + 2) >> 2;
@@ -1148 +1254 @@
 #undef PL
 #undef SRC
 
-static void pred4x4_vertical_add_c(uint8_t *pix, const DCTELEM *block, int stride){
+static void FUNCC(pred4x4_vertical_add)(uint8_t *p_pix, const DCTELEM *p_block, int stride){
     int i;
+    pixel *pix = (pixel*)p_pix;
+    const dctcoef *block = (const dctcoef*)p_block;
+    stride >>= sizeof(pixel)-1;
     pix -= stride;
     for(i=0; i<4; i++){
-        uint8_t v = pix[0];
+        pixel v = pix[0];
         pix[1*stride]= v += block[0];
         pix[2*stride]= v += block[4];
         pix[3*stride]= v += block[8];
@@ -1162 +1271 @@
     }
 }
 
-static void pred4x4_horizontal_add_c(uint8_t *pix, const DCTELEM *block, int stride){
+static void FUNCC(pred4x4_horizontal_add)(uint8_t *p_pix, const DCTELEM *p_block, int stride){
     int i;
+    pixel *pix = (pixel*)p_pix;
+    const dctcoef *block = (const dctcoef*)p_block;
+    stride >>= sizeof(pixel)-1;
     for(i=0; i<4; i++){
-        uint8_t v = pix[-1];
+        pixel v = pix[-1];
         pix[0]= v += block[0];
         pix[1]= v += block[1];
         pix[2]= v += block[2];
@@ -1175 +1287 @@
     }
 }
 
-static void pred8x8l_vertical_add_c(uint8_t *pix, const DCTELEM *block, int stride){
+static void FUNCC(pred8x8l_vertical_add)(uint8_t *p_pix, const DCTELEM *p_block, int stride){
     int i;
+    pixel *pix = (pixel*)p_pix;
+    const dctcoef *block = (const dctcoef*)p_block;
+    stride >>= sizeof(pixel)-1;
     pix -= stride;
     for(i=0; i<8; i++){
-        uint8_t v = pix[0];
+        pixel v = pix[0];
         pix[1*stride]= v += block[0];
         pix[2*stride]= v += block[8];
         pix[3*stride]= v += block[16];
@@ -1193 +1308 @@
     }
 }
 
-static void pred8x8l_horizontal_add_c(uint8_t *pix, const DCTELEM *block, int stride){
+static void FUNCC(pred8x8l_horizontal_add)(uint8_t *p_pix, const DCTELEM *p_block, int stride){
     int i;
+    pixel *pix = (pixel*)p_pix;
+    const dctcoef *block = (const dctcoef*)p_block;
+    stride >>= sizeof(pixel)-1;
     for(i=0; i<8; i++){
-        uint8_t v = pix[-1];
+        pixel v = pix[-1];
         pix[0]= v += block[0];
         pix[1]= v += block[1];
         pix[2]= v += block[2];
@@ -1210 +1328 @@
     }
 }
 
-static void pred16x16_vertical_add_c(uint8_t *pix, const int *block_offset, const DCTELEM *block, int stride){
+static void FUNCC(pred16x16_vertical_add)(uint8_t *pix, const int *block_offset, const DCTELEM *block, int stride){
     int i;
     for(i=0; i<16; i++)
-        pred4x4_vertical_add_c(pix + block_offset[i], block + i*16, stride);
+        FUNCC(pred4x4_vertical_add)(pix + block_offset[i], block + i*16*sizeof(pixel), stride);
 }
 
-static void pred16x16_horizontal_add_c(uint8_t *pix, const int *block_offset, const DCTELEM *block, int stride){
+static void FUNCC(pred16x16_horizontal_add)(uint8_t *pix, const int *block_offset, const DCTELEM *block, int stride){
     int i;
     for(i=0; i<16; i++)
-        pred4x4_horizontal_add_c(pix + block_offset[i], block + i*16, stride);
+        FUNCC(pred4x4_horizontal_add)(pix + block_offset[i], block + i*16*sizeof(pixel), stride);
 }
 
-static void pred8x8_vertical_add_c(uint8_t *pix, const int *block_offset, const DCTELEM *block, int stride){
+static void FUNCC(pred8x8_vertical_add)(uint8_t *pix, const int *block_offset, const DCTELEM *block, int stride){
     int i;
     for(i=0; i<4; i++)
-        pred4x4_vertical_add_c(pix + block_offset[i], block + i*16, stride);
+        FUNCC(pred4x4_vertical_add)(pix + block_offset[i], block + i*16*sizeof(pixel), stride);
 }
 
-static void pred8x8_horizontal_add_c(uint8_t *pix, const int *block_offset, const DCTELEM *block, int stride){
+static void FUNCC(pred8x8_horizontal_add)(uint8_t *pix, const int *block_offset, const DCTELEM *block, int stride){
     int i;
     for(i=0; i<4; i++)
-        pred4x4_horizontal_add_c(pix + block_offset[i], block + i*16, stride);
+        FUNCC(pred4x4_horizontal_add)(pix + block_offset[i], block + i*16*sizeof(pixel), stride);
 }