Statistics
| Branch: | Revision:

ffmpeg / libavcodec / h264pred_internal.h @ 8dbe5856

History | View | Annotate | Download (44.2 KB)

1
/*
2
 * H.26L/H.264/AVC/JVT/14496-10/... encoder/decoder
3
 * Copyright (c) 2003-2011 Michael Niedermayer <michaelni@gmx.at>
4
 *
5
 * This file is part of FFmpeg.
6
 *
7
 * FFmpeg is free software; you can redistribute it and/or
8
 * modify it under the terms of the GNU Lesser General Public
9
 * License as published by the Free Software Foundation; either
10
 * version 2.1 of the License, or (at your option) any later version.
11
 *
12
 * FFmpeg is distributed in the hope that it will be useful,
13
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
15
 * Lesser General Public License for more details.
16
 *
17
 * You should have received a copy of the GNU Lesser General Public
18
 * License along with FFmpeg; if not, write to the Free Software
19
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
20
 */
21

    
22
/**
23
 * @file
24
 * H.264 / AVC / MPEG4 part10 prediction functions.
25
 * @author Michael Niedermayer <michaelni@gmx.at>
26
 */
27

    
28
#include "mathops.h"
29
#include "h264_high_depth.h"
30

    
31
static void FUNCC(pred4x4_vertical)(uint8_t *p_src, const uint8_t *topright, int p_stride){
32
    pixel *src = (pixel*)p_src;
33
    int stride = p_stride>>(sizeof(pixel)-1);
34
    const pixel4 a= ((pixel4*)(src-stride))[0];
35
    ((pixel4*)(src+0*stride))[0]= a;
36
    ((pixel4*)(src+1*stride))[0]= a;
37
    ((pixel4*)(src+2*stride))[0]= a;
38
    ((pixel4*)(src+3*stride))[0]= a;
39
}
40

    
41
static void FUNCC(pred4x4_horizontal)(uint8_t *p_src, const uint8_t *topright, int p_stride){
42
    pixel *src = (pixel*)p_src;
43
    int stride = p_stride>>(sizeof(pixel)-1);
44
    ((pixel4*)(src+0*stride))[0]= PIXEL_SPLAT_X4(src[-1+0*stride]);
45
    ((pixel4*)(src+1*stride))[0]= PIXEL_SPLAT_X4(src[-1+1*stride]);
46
    ((pixel4*)(src+2*stride))[0]= PIXEL_SPLAT_X4(src[-1+2*stride]);
47
    ((pixel4*)(src+3*stride))[0]= PIXEL_SPLAT_X4(src[-1+3*stride]);
48
}
49

    
50
static void FUNCC(pred4x4_dc)(uint8_t *p_src, const uint8_t *topright, int p_stride){
51
    pixel *src = (pixel*)p_src;
52
    int stride = p_stride>>(sizeof(pixel)-1);
53
    const int dc= (  src[-stride] + src[1-stride] + src[2-stride] + src[3-stride]
54
                   + src[-1+0*stride] + src[-1+1*stride] + src[-1+2*stride] + src[-1+3*stride] + 4) >>3;
55

    
56
    ((pixel4*)(src+0*stride))[0]=
57
    ((pixel4*)(src+1*stride))[0]=
58
    ((pixel4*)(src+2*stride))[0]=
59
    ((pixel4*)(src+3*stride))[0]= PIXEL_SPLAT_X4(dc);
60
}
61

    
62
static void FUNCC(pred4x4_left_dc)(uint8_t *p_src, const uint8_t *topright, int p_stride){
63
    pixel *src = (pixel*)p_src;
64
    int stride = p_stride>>(sizeof(pixel)-1);
65
    const int dc= (  src[-1+0*stride] + src[-1+1*stride] + src[-1+2*stride] + src[-1+3*stride] + 2) >>2;
66

    
67
    ((pixel4*)(src+0*stride))[0]=
68
    ((pixel4*)(src+1*stride))[0]=
69
    ((pixel4*)(src+2*stride))[0]=
70
    ((pixel4*)(src+3*stride))[0]= PIXEL_SPLAT_X4(dc);
71
}
72

    
73
static void FUNCC(pred4x4_top_dc)(uint8_t *p_src, const uint8_t *topright, int p_stride){
74
    pixel *src = (pixel*)p_src;
75
    int stride = p_stride>>(sizeof(pixel)-1);
76
    const int dc= (  src[-stride] + src[1-stride] + src[2-stride] + src[3-stride] + 2) >>2;
77

    
78
    ((pixel4*)(src+0*stride))[0]=
79
    ((pixel4*)(src+1*stride))[0]=
80
    ((pixel4*)(src+2*stride))[0]=
81
    ((pixel4*)(src+3*stride))[0]= PIXEL_SPLAT_X4(dc);
82
}
83

    
84
static void FUNCC(pred4x4_128_dc)(uint8_t *p_src, const uint8_t *topright, int p_stride){
85
    pixel *src = (pixel*)p_src;
86
    int stride = p_stride>>(sizeof(pixel)-1);
87
    ((pixel4*)(src+0*stride))[0]=
88
    ((pixel4*)(src+1*stride))[0]=
89
    ((pixel4*)(src+2*stride))[0]=
90
    ((pixel4*)(src+3*stride))[0]= PIXEL_SPLAT_X4(1<<(BIT_DEPTH-1));
91
}
92

    
93
static void FUNCC(pred4x4_127_dc)(uint8_t *p_src, const uint8_t *topright, int p_stride){
94
    pixel *src = (pixel*)p_src;
95
    int stride = p_stride>>(sizeof(pixel)-1);
96
    ((pixel4*)(src+0*stride))[0]=
97
    ((pixel4*)(src+1*stride))[0]=
98
    ((pixel4*)(src+2*stride))[0]=
99
    ((pixel4*)(src+3*stride))[0]= PIXEL_SPLAT_X4((1<<(BIT_DEPTH-1))-1);
100
}
101

    
102
static void FUNCC(pred4x4_129_dc)(uint8_t *p_src, const uint8_t *topright, int p_stride){
103
    pixel *src = (pixel*)p_src;
104
    int stride = p_stride>>(sizeof(pixel)-1);
105
    ((pixel4*)(src+0*stride))[0]=
106
    ((pixel4*)(src+1*stride))[0]=
107
    ((pixel4*)(src+2*stride))[0]=
108
    ((pixel4*)(src+3*stride))[0]= PIXEL_SPLAT_X4((1<<(BIT_DEPTH-1))+1);
109
}
110

    
111

    
112
/*
 * Edge-loading helpers. Each macro expands to four declarations in the
 * caller's scope and must be used where declarations are allowed, without
 * a trailing semicolon. They read the caller's locals named `src`, `stride`
 * and (for the top-right edge) `topright`.
 *
 * Every macro ends on its last declaration: there is deliberately no line
 * continuation after it, so the line following a macro can never be pulled
 * into the macro body.
 */

/* t4..t7 = topright[0..3] */
#define LOAD_TOP_RIGHT_EDGE\
    const int av_unused t4= topright[0];\
    const int av_unused t5= topright[1];\
    const int av_unused t6= topright[2];\
    const int av_unused t7= topright[3];

/* l4..l7 = the samples left of rows 4..7 (counted from src's row) */
#define LOAD_DOWN_LEFT_EDGE\
    const int av_unused l4= src[-1+4*stride];\
    const int av_unused l5= src[-1+5*stride];\
    const int av_unused l6= src[-1+6*stride];\
    const int av_unused l7= src[-1+7*stride];

/* l0..l3 = the samples left of rows 0..3 */
#define LOAD_LEFT_EDGE\
    const int av_unused l0= src[-1+0*stride];\
    const int av_unused l1= src[-1+1*stride];\
    const int av_unused l2= src[-1+2*stride];\
    const int av_unused l3= src[-1+3*stride];

/* t0..t3 = the four samples in the row above src */
#define LOAD_TOP_EDGE\
    const int av_unused t0= src[ 0-1*stride];\
    const int av_unused t1= src[ 1-1*stride];\
    const int av_unused t2= src[ 2-1*stride];\
    const int av_unused t3= src[ 3-1*stride];
135

    
136
static void FUNCC(pred4x4_vertical_vp8)(uint8_t *p_src, const uint8_t *p_topright, int p_stride){
137
    pixel *src = (pixel*)p_src;
138
    const pixel *topright = (const pixel*)p_topright;
139
    int stride = p_stride>>(sizeof(pixel)-1);
140
    const int lt= src[-1-1*stride];
141
    LOAD_TOP_EDGE
142
    LOAD_TOP_RIGHT_EDGE
143
    pixel4 v = PACK_4U8((lt + 2*t0 + t1 + 2) >> 2,
144
                            (t0 + 2*t1 + t2 + 2) >> 2,
145
                            (t1 + 2*t2 + t3 + 2) >> 2,
146
                            (t2 + 2*t3 + t4 + 2) >> 2);
147

    
148
    AV_WN4PA(src+0*stride, v);
149
    AV_WN4PA(src+1*stride, v);
150
    AV_WN4PA(src+2*stride, v);
151
    AV_WN4PA(src+3*stride, v);
152
}
153

    
154
static void FUNCC(pred4x4_horizontal_vp8)(uint8_t *p_src, const uint8_t *topright, int p_stride){
155
    pixel *src = (pixel*)p_src;
156
    int stride = p_stride>>(sizeof(pixel)-1);
157
    const int lt= src[-1-1*stride];
158
    LOAD_LEFT_EDGE
159

    
160
    AV_WN4PA(src+0*stride, PIXEL_SPLAT_X4((lt + 2*l0 + l1 + 2) >> 2));
161
    AV_WN4PA(src+1*stride, PIXEL_SPLAT_X4((l0 + 2*l1 + l2 + 2) >> 2));
162
    AV_WN4PA(src+2*stride, PIXEL_SPLAT_X4((l1 + 2*l2 + l3 + 2) >> 2));
163
    AV_WN4PA(src+3*stride, PIXEL_SPLAT_X4((l2 + 2*l3 + l3 + 2) >> 2));
164
}
165

    
166
static void FUNCC(pred4x4_down_right)(uint8_t *p_src, const uint8_t *topright, int p_stride){
167
    pixel *src = (pixel*)p_src;
168
    int stride = p_stride>>(sizeof(pixel)-1);
169
    const int lt= src[-1-1*stride];
170
    LOAD_TOP_EDGE
171
    LOAD_LEFT_EDGE
172

    
173
    src[0+3*stride]=(l3 + 2*l2 + l1 + 2)>>2;
174
    src[0+2*stride]=
175
    src[1+3*stride]=(l2 + 2*l1 + l0 + 2)>>2;
176
    src[0+1*stride]=
177
    src[1+2*stride]=
178
    src[2+3*stride]=(l1 + 2*l0 + lt + 2)>>2;
179
    src[0+0*stride]=
180
    src[1+1*stride]=
181
    src[2+2*stride]=
182
    src[3+3*stride]=(l0 + 2*lt + t0 + 2)>>2;
183
    src[1+0*stride]=
184
    src[2+1*stride]=
185
    src[3+2*stride]=(lt + 2*t0 + t1 + 2)>>2;
186
    src[2+0*stride]=
187
    src[3+1*stride]=(t0 + 2*t1 + t2 + 2)>>2;
188
    src[3+0*stride]=(t1 + 2*t2 + t3 + 2)>>2;
189
}
190

    
191
static void FUNCC(pred4x4_down_left)(uint8_t *p_src, const uint8_t *p_topright, int p_stride){
192
    pixel *src = (pixel*)p_src;
193
    const pixel *topright = (const pixel*)p_topright;
194
    int stride = p_stride>>(sizeof(pixel)-1);
195
    LOAD_TOP_EDGE
196
    LOAD_TOP_RIGHT_EDGE
197
//    LOAD_LEFT_EDGE
198

    
199
    src[0+0*stride]=(t0 + t2 + 2*t1 + 2)>>2;
200
    src[1+0*stride]=
201
    src[0+1*stride]=(t1 + t3 + 2*t2 + 2)>>2;
202
    src[2+0*stride]=
203
    src[1+1*stride]=
204
    src[0+2*stride]=(t2 + t4 + 2*t3 + 2)>>2;
205
    src[3+0*stride]=
206
    src[2+1*stride]=
207
    src[1+2*stride]=
208
    src[0+3*stride]=(t3 + t5 + 2*t4 + 2)>>2;
209
    src[3+1*stride]=
210
    src[2+2*stride]=
211
    src[1+3*stride]=(t4 + t6 + 2*t5 + 2)>>2;
212
    src[3+2*stride]=
213
    src[2+3*stride]=(t5 + t7 + 2*t6 + 2)>>2;
214
    src[3+3*stride]=(t6 + 3*t7 + 2)>>2;
215
}
216

    
217
static void FUNCC(pred4x4_down_left_svq3)(uint8_t *p_src, const uint8_t *topright, int p_stride){
218
    pixel *src = (pixel*)p_src;
219
    int stride = p_stride>>(sizeof(pixel)-1);
220
    LOAD_TOP_EDGE
221
    LOAD_LEFT_EDGE
222
    const av_unused int unu0= t0;
223
    const av_unused int unu1= l0;
224

    
225
    src[0+0*stride]=(l1 + t1)>>1;
226
    src[1+0*stride]=
227
    src[0+1*stride]=(l2 + t2)>>1;
228
    src[2+0*stride]=
229
    src[1+1*stride]=
230
    src[0+2*stride]=
231
    src[3+0*stride]=
232
    src[2+1*stride]=
233
    src[1+2*stride]=
234
    src[0+3*stride]=
235
    src[3+1*stride]=
236
    src[2+2*stride]=
237
    src[1+3*stride]=
238
    src[3+2*stride]=
239
    src[2+3*stride]=
240
    src[3+3*stride]=(l3 + t3)>>1;
241
}
242

    
243
static void FUNCC(pred4x4_down_left_rv40)(uint8_t *p_src, const uint8_t *p_topright, int p_stride){
244
    pixel *src = (pixel*)p_src;
245
    const pixel *topright = (const pixel*)p_topright;
246
    int stride = p_stride>>(sizeof(pixel)-1);
247
    LOAD_TOP_EDGE
248
    LOAD_TOP_RIGHT_EDGE
249
    LOAD_LEFT_EDGE
250
    LOAD_DOWN_LEFT_EDGE
251

    
252
    src[0+0*stride]=(t0 + t2 + 2*t1 + 2 + l0 + l2 + 2*l1 + 2)>>3;
253
    src[1+0*stride]=
254
    src[0+1*stride]=(t1 + t3 + 2*t2 + 2 + l1 + l3 + 2*l2 + 2)>>3;
255
    src[2+0*stride]=
256
    src[1+1*stride]=
257
    src[0+2*stride]=(t2 + t4 + 2*t3 + 2 + l2 + l4 + 2*l3 + 2)>>3;
258
    src[3+0*stride]=
259
    src[2+1*stride]=
260
    src[1+2*stride]=
261
    src[0+3*stride]=(t3 + t5 + 2*t4 + 2 + l3 + l5 + 2*l4 + 2)>>3;
262
    src[3+1*stride]=
263
    src[2+2*stride]=
264
    src[1+3*stride]=(t4 + t6 + 2*t5 + 2 + l4 + l6 + 2*l5 + 2)>>3;
265
    src[3+2*stride]=
266
    src[2+3*stride]=(t5 + t7 + 2*t6 + 2 + l5 + l7 + 2*l6 + 2)>>3;
267
    src[3+3*stride]=(t6 + t7 + 1 + l6 + l7 + 1)>>2;
268
}
269

    
270
static void FUNCC(pred4x4_down_left_rv40_nodown)(uint8_t *p_src, const uint8_t *p_topright, int p_stride){
271
    pixel *src = (pixel*)p_src;
272
    const pixel *topright = (const pixel*)p_topright;
273
    int stride = p_stride>>(sizeof(pixel)-1);
274
    LOAD_TOP_EDGE
275
    LOAD_TOP_RIGHT_EDGE
276
    LOAD_LEFT_EDGE
277

    
278
    src[0+0*stride]=(t0 + t2 + 2*t1 + 2 + l0 + l2 + 2*l1 + 2)>>3;
279
    src[1+0*stride]=
280
    src[0+1*stride]=(t1 + t3 + 2*t2 + 2 + l1 + l3 + 2*l2 + 2)>>3;
281
    src[2+0*stride]=
282
    src[1+1*stride]=
283
    src[0+2*stride]=(t2 + t4 + 2*t3 + 2 + l2 + 3*l3 + 2)>>3;
284
    src[3+0*stride]=
285
    src[2+1*stride]=
286
    src[1+2*stride]=
287
    src[0+3*stride]=(t3 + t5 + 2*t4 + 2 + l3*4 + 2)>>3;
288
    src[3+1*stride]=
289
    src[2+2*stride]=
290
    src[1+3*stride]=(t4 + t6 + 2*t5 + 2 + l3*4 + 2)>>3;
291
    src[3+2*stride]=
292
    src[2+3*stride]=(t5 + t7 + 2*t6 + 2 + l3*4 + 2)>>3;
293
    src[3+3*stride]=(t6 + t7 + 1 + 2*l3 + 1)>>2;
294
}
295

    
296
static void FUNCC(pred4x4_vertical_right)(uint8_t *p_src, const uint8_t *topright, int p_stride){
297
    pixel *src = (pixel*)p_src;
298
    int stride = p_stride>>(sizeof(pixel)-1);
299
    const int lt= src[-1-1*stride];
300
    LOAD_TOP_EDGE
301
    LOAD_LEFT_EDGE
302

    
303
    src[0+0*stride]=
304
    src[1+2*stride]=(lt + t0 + 1)>>1;
305
    src[1+0*stride]=
306
    src[2+2*stride]=(t0 + t1 + 1)>>1;
307
    src[2+0*stride]=
308
    src[3+2*stride]=(t1 + t2 + 1)>>1;
309
    src[3+0*stride]=(t2 + t3 + 1)>>1;
310
    src[0+1*stride]=
311
    src[1+3*stride]=(l0 + 2*lt + t0 + 2)>>2;
312
    src[1+1*stride]=
313
    src[2+3*stride]=(lt + 2*t0 + t1 + 2)>>2;
314
    src[2+1*stride]=
315
    src[3+3*stride]=(t0 + 2*t1 + t2 + 2)>>2;
316
    src[3+1*stride]=(t1 + 2*t2 + t3 + 2)>>2;
317
    src[0+2*stride]=(lt + 2*l0 + l1 + 2)>>2;
318
    src[0+3*stride]=(l0 + 2*l1 + l2 + 2)>>2;
319
}
320

    
321
static void FUNCC(pred4x4_vertical_left)(uint8_t *p_src, const uint8_t *p_topright, int p_stride){
322
    pixel *src = (pixel*)p_src;
323
    const pixel *topright = (const pixel*)p_topright;
324
    int stride = p_stride>>(sizeof(pixel)-1);
325
    LOAD_TOP_EDGE
326
    LOAD_TOP_RIGHT_EDGE
327

    
328
    src[0+0*stride]=(t0 + t1 + 1)>>1;
329
    src[1+0*stride]=
330
    src[0+2*stride]=(t1 + t2 + 1)>>1;
331
    src[2+0*stride]=
332
    src[1+2*stride]=(t2 + t3 + 1)>>1;
333
    src[3+0*stride]=
334
    src[2+2*stride]=(t3 + t4+ 1)>>1;
335
    src[3+2*stride]=(t4 + t5+ 1)>>1;
336
    src[0+1*stride]=(t0 + 2*t1 + t2 + 2)>>2;
337
    src[1+1*stride]=
338
    src[0+3*stride]=(t1 + 2*t2 + t3 + 2)>>2;
339
    src[2+1*stride]=
340
    src[1+3*stride]=(t2 + 2*t3 + t4 + 2)>>2;
341
    src[3+1*stride]=
342
    src[2+3*stride]=(t3 + 2*t4 + t5 + 2)>>2;
343
    src[3+3*stride]=(t4 + 2*t5 + t6 + 2)>>2;
344
}
345

    
346
static void FUNCC(pred4x4_vertical_left_rv40_internal)(uint8_t *p_src, const uint8_t *p_topright, int p_stride,
347
                                      const int l0, const int l1, const int l2, const int l3, const int l4){
348
    pixel *src = (pixel*)p_src;
349
    const pixel *topright = (const pixel*)p_topright;
350
    int stride = p_stride>>(sizeof(pixel)-1);
351
    LOAD_TOP_EDGE
352
    LOAD_TOP_RIGHT_EDGE
353

    
354
    src[0+0*stride]=(2*t0 + 2*t1 + l1 + 2*l2 + l3 + 4)>>3;
355
    src[1+0*stride]=
356
    src[0+2*stride]=(t1 + t2 + 1)>>1;
357
    src[2+0*stride]=
358
    src[1+2*stride]=(t2 + t3 + 1)>>1;
359
    src[3+0*stride]=
360
    src[2+2*stride]=(t3 + t4+ 1)>>1;
361
    src[3+2*stride]=(t4 + t5+ 1)>>1;
362
    src[0+1*stride]=(t0 + 2*t1 + t2 + l2 + 2*l3 + l4 + 4)>>3;
363
    src[1+1*stride]=
364
    src[0+3*stride]=(t1 + 2*t2 + t3 + 2)>>2;
365
    src[2+1*stride]=
366
    src[1+3*stride]=(t2 + 2*t3 + t4 + 2)>>2;
367
    src[3+1*stride]=
368
    src[2+3*stride]=(t3 + 2*t4 + t5 + 2)>>2;
369
    src[3+3*stride]=(t4 + 2*t5 + t6 + 2)>>2;
370
}
371

    
372
static void FUNCC(pred4x4_vertical_left_rv40)(uint8_t *p_src, const uint8_t *topright, int p_stride){
373
    pixel *src = (pixel*)p_src;
374
    int stride = p_stride>>(sizeof(pixel)-1);
375
    LOAD_LEFT_EDGE
376
    LOAD_DOWN_LEFT_EDGE
377

    
378
    FUNCC(pred4x4_vertical_left_rv40_internal)(p_src, topright, p_stride, l0, l1, l2, l3, l4);
379
}
380

    
381
static void FUNCC(pred4x4_vertical_left_rv40_nodown)(uint8_t *p_src, const uint8_t *topright, int p_stride){
382
    pixel *src = (pixel*)p_src;
383
    int stride = p_stride>>(sizeof(pixel)-1);
384
    LOAD_LEFT_EDGE
385

    
386
    FUNCC(pred4x4_vertical_left_rv40_internal)(p_src, topright, p_stride, l0, l1, l2, l3, l3);
387
}
388

    
389
static void FUNCC(pred4x4_vertical_left_vp8)(uint8_t *p_src, const uint8_t *p_topright, int p_stride){
390
    pixel *src = (pixel*)p_src;
391
    const pixel *topright = (const pixel*)p_topright;
392
    int stride = p_stride>>(sizeof(pixel)-1);
393
    LOAD_TOP_EDGE
394
    LOAD_TOP_RIGHT_EDGE
395

    
396
    src[0+0*stride]=(t0 + t1 + 1)>>1;
397
    src[1+0*stride]=
398
    src[0+2*stride]=(t1 + t2 + 1)>>1;
399
    src[2+0*stride]=
400
    src[1+2*stride]=(t2 + t3 + 1)>>1;
401
    src[3+0*stride]=
402
    src[2+2*stride]=(t3 + t4 + 1)>>1;
403
    src[0+1*stride]=(t0 + 2*t1 + t2 + 2)>>2;
404
    src[1+1*stride]=
405
    src[0+3*stride]=(t1 + 2*t2 + t3 + 2)>>2;
406
    src[2+1*stride]=
407
    src[1+3*stride]=(t2 + 2*t3 + t4 + 2)>>2;
408
    src[3+1*stride]=
409
    src[2+3*stride]=(t3 + 2*t4 + t5 + 2)>>2;
410
    src[3+2*stride]=(t4 + 2*t5 + t6 + 2)>>2;
411
    src[3+3*stride]=(t5 + 2*t6 + t7 + 2)>>2;
412
}
413

    
414
static void FUNCC(pred4x4_horizontal_up)(uint8_t *p_src, const uint8_t *topright, int p_stride){
415
    pixel *src = (pixel*)p_src;
416
    int stride = p_stride>>(sizeof(pixel)-1);
417
    LOAD_LEFT_EDGE
418

    
419
    src[0+0*stride]=(l0 + l1 + 1)>>1;
420
    src[1+0*stride]=(l0 + 2*l1 + l2 + 2)>>2;
421
    src[2+0*stride]=
422
    src[0+1*stride]=(l1 + l2 + 1)>>1;
423
    src[3+0*stride]=
424
    src[1+1*stride]=(l1 + 2*l2 + l3 + 2)>>2;
425
    src[2+1*stride]=
426
    src[0+2*stride]=(l2 + l3 + 1)>>1;
427
    src[3+1*stride]=
428
    src[1+2*stride]=(l2 + 2*l3 + l3 + 2)>>2;
429
    src[3+2*stride]=
430
    src[1+3*stride]=
431
    src[0+3*stride]=
432
    src[2+2*stride]=
433
    src[2+3*stride]=
434
    src[3+3*stride]=l3;
435
}
436

    
437
static void FUNCC(pred4x4_horizontal_up_rv40)(uint8_t *p_src, const uint8_t *p_topright, int p_stride){
438
    pixel *src = (pixel*)p_src;
439
    const pixel *topright = (const pixel*)p_topright;
440
    int stride = p_stride>>(sizeof(pixel)-1);
441
    LOAD_LEFT_EDGE
442
    LOAD_DOWN_LEFT_EDGE
443
    LOAD_TOP_EDGE
444
    LOAD_TOP_RIGHT_EDGE
445

    
446
    src[0+0*stride]=(t1 + 2*t2 + t3 + 2*l0 + 2*l1 + 4)>>3;
447
    src[1+0*stride]=(t2 + 2*t3 + t4 + l0 + 2*l1 + l2 + 4)>>3;
448
    src[2+0*stride]=
449
    src[0+1*stride]=(t3 + 2*t4 + t5 + 2*l1 + 2*l2 + 4)>>3;
450
    src[3+0*stride]=
451
    src[1+1*stride]=(t4 + 2*t5 + t6 + l1 + 2*l2 + l3 + 4)>>3;
452
    src[2+1*stride]=
453
    src[0+2*stride]=(t5 + 2*t6 + t7 + 2*l2 + 2*l3 + 4)>>3;
454
    src[3+1*stride]=
455
    src[1+2*stride]=(t6 + 3*t7 + l2 + 3*l3 + 4)>>3;
456
    src[3+2*stride]=
457
    src[1+3*stride]=(l3 + 2*l4 + l5 + 2)>>2;
458
    src[0+3*stride]=
459
    src[2+2*stride]=(t6 + t7 + l3 + l4 + 2)>>2;
460
    src[2+3*stride]=(l4 + l5 + 1)>>1;
461
    src[3+3*stride]=(l4 + 2*l5 + l6 + 2)>>2;
462
}
463

    
464
static void FUNCC(pred4x4_horizontal_up_rv40_nodown)(uint8_t *p_src, const uint8_t *p_topright, int p_stride){
465
    pixel *src = (pixel*)p_src;
466
    const pixel *topright = (const pixel*)p_topright;
467
    int stride = p_stride>>(sizeof(pixel)-1);
468
    LOAD_LEFT_EDGE
469
    LOAD_TOP_EDGE
470
    LOAD_TOP_RIGHT_EDGE
471

    
472
    src[0+0*stride]=(t1 + 2*t2 + t3 + 2*l0 + 2*l1 + 4)>>3;
473
    src[1+0*stride]=(t2 + 2*t3 + t4 + l0 + 2*l1 + l2 + 4)>>3;
474
    src[2+0*stride]=
475
    src[0+1*stride]=(t3 + 2*t4 + t5 + 2*l1 + 2*l2 + 4)>>3;
476
    src[3+0*stride]=
477
    src[1+1*stride]=(t4 + 2*t5 + t6 + l1 + 2*l2 + l3 + 4)>>3;
478
    src[2+1*stride]=
479
    src[0+2*stride]=(t5 + 2*t6 + t7 + 2*l2 + 2*l3 + 4)>>3;
480
    src[3+1*stride]=
481
    src[1+2*stride]=(t6 + 3*t7 + l2 + 3*l3 + 4)>>3;
482
    src[3+2*stride]=
483
    src[1+3*stride]=l3;
484
    src[0+3*stride]=
485
    src[2+2*stride]=(t6 + t7 + 2*l3 + 2)>>2;
486
    src[2+3*stride]=
487
    src[3+3*stride]=l3;
488
}
489

    
490
static void FUNCC(pred4x4_horizontal_down)(uint8_t *p_src, const uint8_t *topright, int p_stride){
491
    pixel *src = (pixel*)p_src;
492
    int stride = p_stride>>(sizeof(pixel)-1);
493
    const int lt= src[-1-1*stride];
494
    LOAD_TOP_EDGE
495
    LOAD_LEFT_EDGE
496

    
497
    src[0+0*stride]=
498
    src[2+1*stride]=(lt + l0 + 1)>>1;
499
    src[1+0*stride]=
500
    src[3+1*stride]=(l0 + 2*lt + t0 + 2)>>2;
501
    src[2+0*stride]=(lt + 2*t0 + t1 + 2)>>2;
502
    src[3+0*stride]=(t0 + 2*t1 + t2 + 2)>>2;
503
    src[0+1*stride]=
504
    src[2+2*stride]=(l0 + l1 + 1)>>1;
505
    src[1+1*stride]=
506
    src[3+2*stride]=(lt + 2*l0 + l1 + 2)>>2;
507
    src[0+2*stride]=
508
    src[2+3*stride]=(l1 + l2+ 1)>>1;
509
    src[1+2*stride]=
510
    src[3+3*stride]=(l0 + 2*l1 + l2 + 2)>>2;
511
    src[0+3*stride]=(l2 + l3 + 1)>>1;
512
    src[1+3*stride]=(l1 + 2*l2 + l3 + 2)>>2;
513
}
514

    
515
static void FUNCC(pred4x4_tm_vp8)(uint8_t *p_src, const uint8_t *topright, int p_stride){
516
    pixel *src = (pixel*)p_src;
517
    int stride = p_stride>>(sizeof(pixel)-1);
518
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP - src[-1-stride];
519
    pixel *top = src-stride;
520
    int y;
521

    
522
    for (y = 0; y < 4; y++) {
523
        uint8_t *cm_in = cm + src[-1];
524
        src[0] = cm_in[top[0]];
525
        src[1] = cm_in[top[1]];
526
        src[2] = cm_in[top[2]];
527
        src[3] = cm_in[top[3]];
528
        src += stride;
529
    }
530
}
531

    
532
static void FUNCC(pred16x16_vertical)(uint8_t *p_src, int p_stride){
533
    int i;
534
    pixel *src = (pixel*)p_src;
535
    int stride = p_stride>>(sizeof(pixel)-1);
536
    const pixel4 a = ((pixel4*)(src-stride))[0];
537
    const pixel4 b = ((pixel4*)(src-stride))[1];
538
    const pixel4 c = ((pixel4*)(src-stride))[2];
539
    const pixel4 d = ((pixel4*)(src-stride))[3];
540

    
541
    for(i=0; i<16; i++){
542
        ((pixel4*)(src+i*stride))[0] = a;
543
        ((pixel4*)(src+i*stride))[1] = b;
544
        ((pixel4*)(src+i*stride))[2] = c;
545
        ((pixel4*)(src+i*stride))[3] = d;
546
    }
547
}
548

    
549
static void FUNCC(pred16x16_horizontal)(uint8_t *p_src, int stride){
550
    int i;
551
    pixel *src = (pixel*)p_src;
552
    stride >>= sizeof(pixel)-1;
553

    
554
    for(i=0; i<16; i++){
555
        ((pixel4*)(src+i*stride))[0] =
556
        ((pixel4*)(src+i*stride))[1] =
557
        ((pixel4*)(src+i*stride))[2] =
558
        ((pixel4*)(src+i*stride))[3] = PIXEL_SPLAT_X4(src[-1+i*stride]);
559
    }
560
}
561

    
562
/*
 * Fill 16 rows of 16 samples with the packed value v, written as four
 * 4-sample stores per row.
 *
 * Uses the caller's locals `i` (loop counter), `src` (advanced by `stride`
 * after each row, so it ends 16 rows further on) and `stride`.
 *
 * Wrapped in do { } while(0) so an invocation followed by ';' is exactly
 * one statement, and safe in an unbraced if/else. The argument is
 * parenthesized at each use.
 */
#define PREDICT_16x16_DC(v)\
    do {\
        for(i=0; i<16; i++){\
            AV_WN4P(src+ 0, (v));\
            AV_WN4P(src+ 4, (v));\
            AV_WN4P(src+ 8, (v));\
            AV_WN4P(src+12, (v));\
            src += stride;\
        }\
    } while(0)
570

    
571
static void FUNCC(pred16x16_dc)(uint8_t *p_src, int stride){
572
    int i, dc=0;
573
    pixel *src = (pixel*)p_src;
574
    pixel4 dcsplat;
575
    stride >>= sizeof(pixel)-1;
576

    
577
    for(i=0;i<16; i++){
578
        dc+= src[-1+i*stride];
579
    }
580

    
581
    for(i=0;i<16; i++){
582
        dc+= src[i-stride];
583
    }
584

    
585
    dcsplat = PIXEL_SPLAT_X4((dc+16)>>5);
586
    PREDICT_16x16_DC(dcsplat);
587
}
588

    
589
static void FUNCC(pred16x16_left_dc)(uint8_t *p_src, int stride){
590
    int i, dc=0;
591
    pixel *src = (pixel*)p_src;
592
    pixel4 dcsplat;
593
    stride >>= sizeof(pixel)-1;
594

    
595
    for(i=0;i<16; i++){
596
        dc+= src[-1+i*stride];
597
    }
598

    
599
    dcsplat = PIXEL_SPLAT_X4((dc+8)>>4);
600
    PREDICT_16x16_DC(dcsplat);
601
}
602

    
603
static void FUNCC(pred16x16_top_dc)(uint8_t *p_src, int stride){
604
    int i, dc=0;
605
    pixel *src = (pixel*)p_src;
606
    pixel4 dcsplat;
607
    stride >>= sizeof(pixel)-1;
608

    
609
    for(i=0;i<16; i++){
610
        dc+= src[i-stride];
611
    }
612

    
613
    dcsplat = PIXEL_SPLAT_X4((dc+8)>>4);
614
    PREDICT_16x16_DC(dcsplat);
615
}
616

    
617
#define PRED16x16_X(n, v) \
618
static void FUNCC(pred16x16_##n##_dc)(uint8_t *p_src, int stride){\
619
    int i;\
620
    pixel *src = (pixel*)p_src;\
621
    stride >>= sizeof(pixel)-1;\
622
    PREDICT_16x16_DC(PIXEL_SPLAT_X4(v));\
623
}
624

    
625
PRED16x16_X(127, (1<<(BIT_DEPTH-1))-1);
626
PRED16x16_X(128, (1<<(BIT_DEPTH-1))+0);
627
PRED16x16_X(129, (1<<(BIT_DEPTH-1))+1);
628

    
629
static inline void FUNCC(pred16x16_plane_compat)(uint8_t *p_src, int p_stride, const int svq3, const int rv40){
630
  int i, j, k;
631
  int a;
632
  INIT_CLIP
633
  pixel *src = (pixel*)p_src;
634
  int stride = p_stride>>(sizeof(pixel)-1);
635
  const pixel * const src0 = src +7-stride;
636
  const pixel *       src1 = src +8*stride-1;
637
  const pixel *       src2 = src1-2*stride;    // == src+6*stride-1;
638
  int H = src0[1] - src0[-1];
639
  int V = src1[0] - src2[ 0];
640
  for(k=2; k<=8; ++k) {
641
    src1 += stride; src2 -= stride;
642
    H += k*(src0[k] - src0[-k]);
643
    V += k*(src1[0] - src2[ 0]);
644
  }
645
  if(svq3){
646
    H = ( 5*(H/4) ) / 16;
647
    V = ( 5*(V/4) ) / 16;
648

    
649
    /* required for 100% accuracy */
650
    i = H; H = V; V = i;
651
  }else if(rv40){
652
    H = ( H + (H>>2) ) >> 4;
653
    V = ( V + (V>>2) ) >> 4;
654
  }else{
655
    H = ( 5*H+32 ) >> 6;
656
    V = ( 5*V+32 ) >> 6;
657
  }
658

    
659
  a = 16*(src1[0] + src2[16] + 1) - 7*(V+H);
660
  for(j=16; j>0; --j) {
661
    int b = a;
662
    a += V;
663
    for(i=-16; i<0; i+=4) {
664
      src[16+i] = CLIP((b    ) >> 5);
665
      src[17+i] = CLIP((b+  H) >> 5);
666
      src[18+i] = CLIP((b+2*H) >> 5);
667
      src[19+i] = CLIP((b+3*H) >> 5);
668
      b += 4*H;
669
    }
670
    src += stride;
671
  }
672
}
673

    
674
static void FUNCC(pred16x16_plane)(uint8_t *src, int stride){
675
    FUNCC(pred16x16_plane_compat)(src, stride, 0, 0);
676
}
677

    
678
static void FUNCC(pred16x16_plane_svq3)(uint8_t *src, int stride){
679
    FUNCC(pred16x16_plane_compat)(src, stride, 1, 0);
680
}
681

    
682
static void FUNCC(pred16x16_plane_rv40)(uint8_t *src, int stride){
683
    FUNCC(pred16x16_plane_compat)(src, stride, 0, 1);
684
}
685

    
686
static void FUNCC(pred16x16_tm_vp8)(uint8_t *src, int stride){
687
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP - src[-1-stride];
688
    uint8_t *top = src-stride;
689
    int y;
690

    
691
    for (y = 0; y < 16; y++) {
692
        uint8_t *cm_in = cm + src[-1];
693
        src[0]  = cm_in[top[0]];
694
        src[1]  = cm_in[top[1]];
695
        src[2]  = cm_in[top[2]];
696
        src[3]  = cm_in[top[3]];
697
        src[4]  = cm_in[top[4]];
698
        src[5]  = cm_in[top[5]];
699
        src[6]  = cm_in[top[6]];
700
        src[7]  = cm_in[top[7]];
701
        src[8]  = cm_in[top[8]];
702
        src[9]  = cm_in[top[9]];
703
        src[10] = cm_in[top[10]];
704
        src[11] = cm_in[top[11]];
705
        src[12] = cm_in[top[12]];
706
        src[13] = cm_in[top[13]];
707
        src[14] = cm_in[top[14]];
708
        src[15] = cm_in[top[15]];
709
        src += stride;
710
    }
711
}
712

    
713
static void FUNCC(pred8x8_vertical)(uint8_t *p_src, int p_stride){
714
    int i;
715
    pixel *src = (pixel*)p_src;
716
    int stride = p_stride>>(sizeof(pixel)-1);
717
    const pixel4 a= ((pixel4*)(src-stride))[0];
718
    const pixel4 b= ((pixel4*)(src-stride))[1];
719

    
720
    for(i=0; i<8; i++){
721
        ((pixel4*)(src+i*stride))[0]= a;
722
        ((pixel4*)(src+i*stride))[1]= b;
723
    }
724
}
725

    
726
static void FUNCC(pred8x8_horizontal)(uint8_t *p_src, int stride){
727
    int i;
728
    pixel *src = (pixel*)p_src;
729
    stride >>= sizeof(pixel)-1;
730

    
731
    for(i=0; i<8; i++){
732
        ((pixel4*)(src+i*stride))[0]=
733
        ((pixel4*)(src+i*stride))[1]= PIXEL_SPLAT_X4(src[-1+i*stride]);
734
    }
735
}
736

    
737
#define PRED8x8_X(n, v)\
738
static void FUNCC(pred8x8_##n##_dc)(uint8_t *p_src, int stride){\
739
    int i;\
740
    pixel *src = (pixel*)p_src;\
741
    stride >>= sizeof(pixel)-1;\
742
    for(i=0; i<8; i++){\
743
        ((pixel4*)(src+i*stride))[0]=\
744
        ((pixel4*)(src+i*stride))[1]= PIXEL_SPLAT_X4(v);\
745
    }\
746
}
747

    
748
PRED8x8_X(127, (1<<(BIT_DEPTH-1))-1);
749
PRED8x8_X(128, (1<<(BIT_DEPTH-1))+0);
750
PRED8x8_X(129, (1<<(BIT_DEPTH-1))+1);
751

    
752
static void FUNCC(pred8x8_left_dc)(uint8_t *p_src, int stride){
753
    int i;
754
    int dc0, dc2;
755
    pixel4 dc0splat, dc2splat;
756
    pixel *src = (pixel*)p_src;
757
    stride >>= sizeof(pixel)-1;
758

    
759
    dc0=dc2=0;
760
    for(i=0;i<4; i++){
761
        dc0+= src[-1+i*stride];
762
        dc2+= src[-1+(i+4)*stride];
763
    }
764
    dc0splat = PIXEL_SPLAT_X4((dc0 + 2)>>2);
765
    dc2splat = PIXEL_SPLAT_X4((dc2 + 2)>>2);
766

    
767
    for(i=0; i<4; i++){
768
        ((pixel4*)(src+i*stride))[0]=
769
        ((pixel4*)(src+i*stride))[1]= dc0splat;
770
    }
771
    for(i=4; i<8; i++){
772
        ((pixel4*)(src+i*stride))[0]=
773
        ((pixel4*)(src+i*stride))[1]= dc2splat;
774
    }
775
}
776

    
777
static void FUNCC(pred8x8_left_dc_rv40)(uint8_t *p_src, int stride){
778
    int i;
779
    int dc0;
780
    pixel4 dc0splat;
781
    pixel *src = (pixel*)p_src;
782
    stride >>= sizeof(pixel)-1;
783

    
784
    dc0=0;
785
    for(i=0;i<8; i++)
786
        dc0+= src[-1+i*stride];
787
    dc0splat = PIXEL_SPLAT_X4((dc0 + 4)>>3);
788

    
789
    for(i=0; i<8; i++){
790
        ((pixel4*)(src+i*stride))[0]=
791
        ((pixel4*)(src+i*stride))[1]= dc0splat;
792
    }
793
}
794

    
795
static void FUNCC(pred8x8_top_dc)(uint8_t *p_src, int stride){
796
    int i;
797
    int dc0, dc1;
798
    pixel4 dc0splat, dc1splat;
799
    pixel *src = (pixel*)p_src;
800
    stride >>= sizeof(pixel)-1;
801

    
802
    dc0=dc1=0;
803
    for(i=0;i<4; i++){
804
        dc0+= src[i-stride];
805
        dc1+= src[4+i-stride];
806
    }
807
    dc0splat = PIXEL_SPLAT_X4((dc0 + 2)>>2);
808
    dc1splat = PIXEL_SPLAT_X4((dc1 + 2)>>2);
809

    
810
    for(i=0; i<4; i++){
811
        ((pixel4*)(src+i*stride))[0]= dc0splat;
812
        ((pixel4*)(src+i*stride))[1]= dc1splat;
813
    }
814
    for(i=4; i<8; i++){
815
        ((pixel4*)(src+i*stride))[0]= dc0splat;
816
        ((pixel4*)(src+i*stride))[1]= dc1splat;
817
    }
818
}
819

    
820
static void FUNCC(pred8x8_top_dc_rv40)(uint8_t *p_src, int stride){
821
    int i;
822
    int dc0;
823
    pixel4 dc0splat;
824
    pixel *src = (pixel*)p_src;
825
    stride >>= sizeof(pixel)-1;
826

    
827
    dc0=0;
828
    for(i=0;i<8; i++)
829
        dc0+= src[i-stride];
830
    dc0splat = PIXEL_SPLAT_X4((dc0 + 4)>>3);
831

    
832
    for(i=0; i<8; i++){
833
        ((pixel4*)(src+i*stride))[0]=
834
        ((pixel4*)(src+i*stride))[1]= dc0splat;
835
    }
836
}
837

    
838

    
839
static void FUNCC(pred8x8_dc)(uint8_t *p_src, int stride){
840
    int i;
841
    int dc0, dc1, dc2;
842
    pixel4 dc0splat, dc1splat, dc2splat, dc3splat;
843
    pixel *src = (pixel*)p_src;
844
    stride >>= sizeof(pixel)-1;
845

    
846
    dc0=dc1=dc2=0;
847
    for(i=0;i<4; i++){
848
        dc0+= src[-1+i*stride] + src[i-stride];
849
        dc1+= src[4+i-stride];
850
        dc2+= src[-1+(i+4)*stride];
851
    }
852
    dc0splat = PIXEL_SPLAT_X4((dc0 + 4)>>3);
853
    dc1splat = PIXEL_SPLAT_X4((dc1 + 2)>>2);
854
    dc2splat = PIXEL_SPLAT_X4((dc2 + 2)>>2);
855
    dc3splat = PIXEL_SPLAT_X4((dc1 + dc2 + 4)>>3);
856

    
857
    for(i=0; i<4; i++){
858
        ((pixel4*)(src+i*stride))[0]= dc0splat;
859
        ((pixel4*)(src+i*stride))[1]= dc1splat;
860
    }
861
    for(i=4; i<8; i++){
862
        ((pixel4*)(src+i*stride))[0]= dc2splat;
863
        ((pixel4*)(src+i*stride))[1]= dc3splat;
864
    }
865
}
866

    
867
//the following 4 functions should not be optimized!
868
static void FUNC(pred8x8_mad_cow_dc_l0t)(uint8_t *src, int stride){
869
    FUNCC(pred8x8_top_dc)(src, stride);
870
    FUNCC(pred4x4_dc)(src, NULL, stride);
871
}
872

    
873
static void FUNC(pred8x8_mad_cow_dc_0lt)(uint8_t *src, int stride){
874
    FUNCC(pred8x8_dc)(src, stride);
875
    FUNCC(pred4x4_top_dc)(src, NULL, stride);
876
}
877

    
878
static void FUNC(pred8x8_mad_cow_dc_l00)(uint8_t *src, int stride){
879
    FUNCC(pred8x8_left_dc)(src, stride);
880
    FUNCC(pred4x4_128_dc)(src + 4*stride                  , NULL, stride);
881
    FUNCC(pred4x4_128_dc)(src + 4*stride + 4*sizeof(pixel), NULL, stride);
882
}
883

    
884
static void FUNC(pred8x8_mad_cow_dc_0l0)(uint8_t *src, int stride){
885
    FUNCC(pred8x8_left_dc)(src, stride);
886
    FUNCC(pred4x4_128_dc)(src                  , NULL, stride);
887
    FUNCC(pred4x4_128_dc)(src + 4*sizeof(pixel), NULL, stride);
888
}
889

    
890
static void FUNCC(pred8x8_dc_rv40)(uint8_t *p_src, int stride){
891
    int i;
892
    int dc0=0;
893
    pixel4 dc0splat;
894
    pixel *src = (pixel*)p_src;
895
    stride >>= sizeof(pixel)-1;
896

    
897
    for(i=0;i<4; i++){
898
        dc0+= src[-1+i*stride] + src[i-stride];
899
        dc0+= src[4+i-stride];
900
        dc0+= src[-1+(i+4)*stride];
901
    }
902
    dc0splat = PIXEL_SPLAT_X4((dc0 + 8)>>4);
903

    
904
    for(i=0; i<4; i++){
905
        ((pixel4*)(src+i*stride))[0]= dc0splat;
906
        ((pixel4*)(src+i*stride))[1]= dc0splat;
907
    }
908
    for(i=4; i<8; i++){
909
        ((pixel4*)(src+i*stride))[0]= dc0splat;
910
        ((pixel4*)(src+i*stride))[1]= dc0splat;
911
    }
912
}
913

    
914
static void FUNCC(pred8x8_plane)(uint8_t *p_src, int p_stride){
915
  int j, k;
916
  int a;
917
  INIT_CLIP
918
  pixel *src = (pixel*)p_src;
919
  int stride = p_stride>>(sizeof(pixel)-1);
920
  const pixel * const src0 = src +3-stride;
921
  const pixel *       src1 = src +4*stride-1;
922
  const pixel *       src2 = src1-2*stride;    // == src+2*stride-1;
923
  int H = src0[1] - src0[-1];
924
  int V = src1[0] - src2[ 0];
925
  for(k=2; k<=4; ++k) {
926
    src1 += stride; src2 -= stride;
927
    H += k*(src0[k] - src0[-k]);
928
    V += k*(src1[0] - src2[ 0]);
929
  }
930
  H = ( 17*H+16 ) >> 5;
931
  V = ( 17*V+16 ) >> 5;
932

    
933
  a = 16*(src1[0] + src2[8]+1) - 3*(V+H);
934
  for(j=8; j>0; --j) {
935
    int b = a;
936
    a += V;
937
    src[0] = CLIP((b    ) >> 5);
938
    src[1] = CLIP((b+  H) >> 5);
939
    src[2] = CLIP((b+2*H) >> 5);
940
    src[3] = CLIP((b+3*H) >> 5);
941
    src[4] = CLIP((b+4*H) >> 5);
942
    src[5] = CLIP((b+5*H) >> 5);
943
    src[6] = CLIP((b+6*H) >> 5);
944
    src[7] = CLIP((b+7*H) >> 5);
945
    src += stride;
946
  }
947
}
948

    
949
static void FUNCC(pred8x8_tm_vp8)(uint8_t *p_src, int p_stride){
950
    pixel *src = (pixel*)p_src;
951
    int stride = p_stride>>(sizeof(pixel)-1);
952
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP - src[-1-stride];
953
    pixel *top = src-stride;
954
    int y;
955

    
956
    for (y = 0; y < 8; y++) {
957
        uint8_t *cm_in = cm + src[-1];
958
        src[0] = cm_in[top[0]];
959
        src[1] = cm_in[top[1]];
960
        src[2] = cm_in[top[2]];
961
        src[3] = cm_in[top[3]];
962
        src[4] = cm_in[top[4]];
963
        src[5] = cm_in[top[5]];
964
        src[6] = cm_in[top[6]];
965
        src[7] = cm_in[top[7]];
966
        src += stride;
967
    }
968
}
969

    
970
/*
 * Helper macros for the pred8x8l_* predictors. They expect these names in
 * the enclosing scope: src (pixel pointer to the block's top-left sample),
 * stride (in pixels), has_topleft and has_topright.
 */

/* Sample at column x, row y relative to the block's top-left sample. */
#define SRC(x,y) src[(x)+(y)*stride]

/* Declare l<y>: left-edge sample of row y filtered with (1,2,1)/4. */
#define PL(y) \
    const int l##y = (SRC(-1,y-1) + 2*SRC(-1,y) + SRC(-1,y+1) + 2) >> 2;

/*
 * Declare l0..l7, the filtered left edge. Without a top-left neighbour l0
 * reuses SRC(-1,0) in its place; l7 has no sample below it and weights
 * SRC(-1,7) three times instead.
 */
#define PREDICT_8x8_LOAD_LEFT                                   \
    const int l0 = ((has_topleft ? SRC(-1,-1) : SRC(-1,0))      \
                     + 2*SRC(-1,0) + SRC(-1,1) + 2) >> 2;       \
    PL(1) PL(2) PL(3) PL(4) PL(5) PL(6)                         \
    const int l7 av_unused = (SRC(-1,6) + 3*SRC(-1,7) + 2) >> 2

/* Declare t<x>: top-edge sample of column x filtered with (1,2,1)/4. */
#define PT(x) \
    const int t##x = (SRC(x-1,-1) + 2*SRC(x,-1) + SRC(x+1,-1) + 2) >> 2;

/*
 * Declare t0..t7, the filtered top edge. A missing top-left neighbour is
 * replaced by SRC(0,-1), a missing top-right neighbour by SRC(7,-1).
 */
#define PREDICT_8x8_LOAD_TOP                                    \
    const int t0 = ((has_topleft ? SRC(-1,-1) : SRC(0,-1))      \
                     + 2*SRC(0,-1) + SRC(1,-1) + 2) >> 2;       \
    PT(1) PT(2) PT(3) PT(4) PT(5) PT(6)                         \
    const int t7 av_unused = ((has_topright ? SRC(8,-1) : SRC(7,-1)) \
                     + 2*SRC(7,-1) + SRC(6,-1) + 2) >> 2

/* Assign (not declare) t<x> using the same (1,2,1)/4 filter as PT. */
#define PTR(x) \
    t##x = (SRC(x-1,-1) + 2*SRC(x,-1) + SRC(x+1,-1) + 2) >> 2;

/*
 * Declare t8..t15, the filtered top-right edge. When has_topright is zero
 * all eight take the unfiltered value of SRC(7,-1).
 */
#define PREDICT_8x8_LOAD_TOPRIGHT                               \
    int t8, t9, t10, t11, t12, t13, t14, t15;                   \
    if(has_topright) {                                          \
        PTR(8) PTR(9) PTR(10) PTR(11) PTR(12) PTR(13) PTR(14)   \
        t15 = (SRC(14,-1) + 3*SRC(15,-1) + 2) >> 2;             \
    } else t8=t9=t10=t11=t12=t13=t14=t15= SRC(7,-1);

/* Declare lt: the top-left corner filtered with its two edge neighbours. */
#define PREDICT_8x8_LOAD_TOPLEFT \
    const int lt = (SRC(-1,0) + 2*SRC(-1,-1) + SRC(0,-1) + 2) >> 2

/*
 * Fill all 8 rows with the pixel4 value v (two pixel4 stores per row).
 * Declares y and advances src by 8 rows as a side effect.
 */
#define PREDICT_8x8_DC(v)                                       \
    int y;                                                      \
    for( y = 0; y < 8; y++ ) {                                  \
        ((pixel4*)src)[0] =                                     \
        ((pixel4*)src)[1] = v;                                  \
        src += stride;                                          \
    }
1007

    
1008
static void FUNCC(pred8x8l_128_dc)(uint8_t *p_src, int has_topleft, int has_topright, int p_stride)
1009
{
1010
    pixel *src = (pixel*)p_src;
1011
    int stride = p_stride>>(sizeof(pixel)-1);
1012

    
1013
    PREDICT_8x8_DC(PIXEL_SPLAT_X4(1<<(BIT_DEPTH-1)));
1014
}
1015
static void FUNCC(pred8x8l_left_dc)(uint8_t *p_src, int has_topleft, int has_topright, int p_stride)
1016
{
1017
    pixel *src = (pixel*)p_src;
1018
    int stride = p_stride>>(sizeof(pixel)-1);
1019

    
1020
    PREDICT_8x8_LOAD_LEFT;
1021
    const pixel4 dc = PIXEL_SPLAT_X4((l0+l1+l2+l3+l4+l5+l6+l7+4) >> 3);
1022
    PREDICT_8x8_DC(dc);
1023
}
1024
static void FUNCC(pred8x8l_top_dc)(uint8_t *p_src, int has_topleft, int has_topright, int p_stride)
1025
{
1026
    pixel *src = (pixel*)p_src;
1027
    int stride = p_stride>>(sizeof(pixel)-1);
1028

    
1029
    PREDICT_8x8_LOAD_TOP;
1030
    const pixel4 dc = PIXEL_SPLAT_X4((t0+t1+t2+t3+t4+t5+t6+t7+4) >> 3);
1031
    PREDICT_8x8_DC(dc);
1032
}
1033
static void FUNCC(pred8x8l_dc)(uint8_t *p_src, int has_topleft, int has_topright, int p_stride)
1034
{
1035
    pixel *src = (pixel*)p_src;
1036
    int stride = p_stride>>(sizeof(pixel)-1);
1037

    
1038
    PREDICT_8x8_LOAD_LEFT;
1039
    PREDICT_8x8_LOAD_TOP;
1040
    const pixel4 dc = PIXEL_SPLAT_X4((l0+l1+l2+l3+l4+l5+l6+l7
1041
                                     +t0+t1+t2+t3+t4+t5+t6+t7+8) >> 4);
1042
    PREDICT_8x8_DC(dc);
1043
}
1044
static void FUNCC(pred8x8l_horizontal)(uint8_t *p_src, int has_topleft, int has_topright, int p_stride)
1045
{
1046
    pixel *src = (pixel*)p_src;
1047
    int stride = p_stride>>(sizeof(pixel)-1);
1048

    
1049
    PREDICT_8x8_LOAD_LEFT;
1050
#define ROW(y) ((pixel4*)(src+y*stride))[0] =\
1051
               ((pixel4*)(src+y*stride))[1] = PIXEL_SPLAT_X4(l##y)
1052
    ROW(0); ROW(1); ROW(2); ROW(3); ROW(4); ROW(5); ROW(6); ROW(7);
1053
#undef ROW
1054
}
1055
static void FUNCC(pred8x8l_vertical)(uint8_t *p_src, int has_topleft, int has_topright, int p_stride)
1056
{
1057
    int y;
1058
    pixel *src = (pixel*)p_src;
1059
    int stride = p_stride>>(sizeof(pixel)-1);
1060

    
1061
    PREDICT_8x8_LOAD_TOP;
1062
    src[0] = t0;
1063
    src[1] = t1;
1064
    src[2] = t2;
1065
    src[3] = t3;
1066
    src[4] = t4;
1067
    src[5] = t5;
1068
    src[6] = t6;
1069
    src[7] = t7;
1070
    for( y = 1; y < 8; y++ ) {
1071
        ((pixel4*)(src+y*stride))[0] = ((pixel4*)src)[0];
1072
        ((pixel4*)(src+y*stride))[1] = ((pixel4*)src)[1];
1073
    }
1074
}
1075
static void FUNCC(pred8x8l_down_left)(uint8_t *p_src, int has_topleft, int has_topright, int p_stride)
1076
{
1077
    pixel *src = (pixel*)p_src;
1078
    int stride = p_stride>>(sizeof(pixel)-1);
1079
    PREDICT_8x8_LOAD_TOP;
1080
    PREDICT_8x8_LOAD_TOPRIGHT;
1081
    SRC(0,0)= (t0 + 2*t1 + t2 + 2) >> 2;
1082
    SRC(0,1)=SRC(1,0)= (t1 + 2*t2 + t3 + 2) >> 2;
1083
    SRC(0,2)=SRC(1,1)=SRC(2,0)= (t2 + 2*t3 + t4 + 2) >> 2;
1084
    SRC(0,3)=SRC(1,2)=SRC(2,1)=SRC(3,0)= (t3 + 2*t4 + t5 + 2) >> 2;
1085
    SRC(0,4)=SRC(1,3)=SRC(2,2)=SRC(3,1)=SRC(4,0)= (t4 + 2*t5 + t6 + 2) >> 2;
1086
    SRC(0,5)=SRC(1,4)=SRC(2,3)=SRC(3,2)=SRC(4,1)=SRC(5,0)= (t5 + 2*t6 + t7 + 2) >> 2;
1087
    SRC(0,6)=SRC(1,5)=SRC(2,4)=SRC(3,3)=SRC(4,2)=SRC(5,1)=SRC(6,0)= (t6 + 2*t7 + t8 + 2) >> 2;
1088
    SRC(0,7)=SRC(1,6)=SRC(2,5)=SRC(3,4)=SRC(4,3)=SRC(5,2)=SRC(6,1)=SRC(7,0)= (t7 + 2*t8 + t9 + 2) >> 2;
1089
    SRC(1,7)=SRC(2,6)=SRC(3,5)=SRC(4,4)=SRC(5,3)=SRC(6,2)=SRC(7,1)= (t8 + 2*t9 + t10 + 2) >> 2;
1090
    SRC(2,7)=SRC(3,6)=SRC(4,5)=SRC(5,4)=SRC(6,3)=SRC(7,2)= (t9 + 2*t10 + t11 + 2) >> 2;
1091
    SRC(3,7)=SRC(4,6)=SRC(5,5)=SRC(6,4)=SRC(7,3)= (t10 + 2*t11 + t12 + 2) >> 2;
1092
    SRC(4,7)=SRC(5,6)=SRC(6,5)=SRC(7,4)= (t11 + 2*t12 + t13 + 2) >> 2;
1093
    SRC(5,7)=SRC(6,6)=SRC(7,5)= (t12 + 2*t13 + t14 + 2) >> 2;
1094
    SRC(6,7)=SRC(7,6)= (t13 + 2*t14 + t15 + 2) >> 2;
1095
    SRC(7,7)= (t14 + 3*t15 + 2) >> 2;
1096
}
1097
static void FUNCC(pred8x8l_down_right)(uint8_t *p_src, int has_topleft, int has_topright, int p_stride)
1098
{
1099
    pixel *src = (pixel*)p_src;
1100
    int stride = p_stride>>(sizeof(pixel)-1);
1101
    PREDICT_8x8_LOAD_TOP;
1102
    PREDICT_8x8_LOAD_LEFT;
1103
    PREDICT_8x8_LOAD_TOPLEFT;
1104
    SRC(0,7)= (l7 + 2*l6 + l5 + 2) >> 2;
1105
    SRC(0,6)=SRC(1,7)= (l6 + 2*l5 + l4 + 2) >> 2;
1106
    SRC(0,5)=SRC(1,6)=SRC(2,7)= (l5 + 2*l4 + l3 + 2) >> 2;
1107
    SRC(0,4)=SRC(1,5)=SRC(2,6)=SRC(3,7)= (l4 + 2*l3 + l2 + 2) >> 2;
1108
    SRC(0,3)=SRC(1,4)=SRC(2,5)=SRC(3,6)=SRC(4,7)= (l3 + 2*l2 + l1 + 2) >> 2;
1109
    SRC(0,2)=SRC(1,3)=SRC(2,4)=SRC(3,5)=SRC(4,6)=SRC(5,7)= (l2 + 2*l1 + l0 + 2) >> 2;
1110
    SRC(0,1)=SRC(1,2)=SRC(2,3)=SRC(3,4)=SRC(4,5)=SRC(5,6)=SRC(6,7)= (l1 + 2*l0 + lt + 2) >> 2;
1111
    SRC(0,0)=SRC(1,1)=SRC(2,2)=SRC(3,3)=SRC(4,4)=SRC(5,5)=SRC(6,6)=SRC(7,7)= (l0 + 2*lt + t0 + 2) >> 2;
1112
    SRC(1,0)=SRC(2,1)=SRC(3,2)=SRC(4,3)=SRC(5,4)=SRC(6,5)=SRC(7,6)= (lt + 2*t0 + t1 + 2) >> 2;
1113
    SRC(2,0)=SRC(3,1)=SRC(4,2)=SRC(5,3)=SRC(6,4)=SRC(7,5)= (t0 + 2*t1 + t2 + 2) >> 2;
1114
    SRC(3,0)=SRC(4,1)=SRC(5,2)=SRC(6,3)=SRC(7,4)= (t1 + 2*t2 + t3 + 2) >> 2;
1115
    SRC(4,0)=SRC(5,1)=SRC(6,2)=SRC(7,3)= (t2 + 2*t3 + t4 + 2) >> 2;
1116
    SRC(5,0)=SRC(6,1)=SRC(7,2)= (t3 + 2*t4 + t5 + 2) >> 2;
1117
    SRC(6,0)=SRC(7,1)= (t4 + 2*t5 + t6 + 2) >> 2;
1118
    SRC(7,0)= (t5 + 2*t6 + t7 + 2) >> 2;
1119
}
1120
static void FUNCC(pred8x8l_vertical_right)(uint8_t *p_src, int has_topleft, int has_topright, int p_stride)
1121
{
1122
    pixel *src = (pixel*)p_src;
1123
    int stride = p_stride>>(sizeof(pixel)-1);
1124
    PREDICT_8x8_LOAD_TOP;
1125
    PREDICT_8x8_LOAD_LEFT;
1126
    PREDICT_8x8_LOAD_TOPLEFT;
1127
    SRC(0,6)= (l5 + 2*l4 + l3 + 2) >> 2;
1128
    SRC(0,7)= (l6 + 2*l5 + l4 + 2) >> 2;
1129
    SRC(0,4)=SRC(1,6)= (l3 + 2*l2 + l1 + 2) >> 2;
1130
    SRC(0,5)=SRC(1,7)= (l4 + 2*l3 + l2 + 2) >> 2;
1131
    SRC(0,2)=SRC(1,4)=SRC(2,6)= (l1 + 2*l0 + lt + 2) >> 2;
1132
    SRC(0,3)=SRC(1,5)=SRC(2,7)= (l2 + 2*l1 + l0 + 2) >> 2;
1133
    SRC(0,1)=SRC(1,3)=SRC(2,5)=SRC(3,7)= (l0 + 2*lt + t0 + 2) >> 2;
1134
    SRC(0,0)=SRC(1,2)=SRC(2,4)=SRC(3,6)= (lt + t0 + 1) >> 1;
1135
    SRC(1,1)=SRC(2,3)=SRC(3,5)=SRC(4,7)= (lt + 2*t0 + t1 + 2) >> 2;
1136
    SRC(1,0)=SRC(2,2)=SRC(3,4)=SRC(4,6)= (t0 + t1 + 1) >> 1;
1137
    SRC(2,1)=SRC(3,3)=SRC(4,5)=SRC(5,7)= (t0 + 2*t1 + t2 + 2) >> 2;
1138
    SRC(2,0)=SRC(3,2)=SRC(4,4)=SRC(5,6)= (t1 + t2 + 1) >> 1;
1139
    SRC(3,1)=SRC(4,3)=SRC(5,5)=SRC(6,7)= (t1 + 2*t2 + t3 + 2) >> 2;
1140
    SRC(3,0)=SRC(4,2)=SRC(5,4)=SRC(6,6)= (t2 + t3 + 1) >> 1;
1141
    SRC(4,1)=SRC(5,3)=SRC(6,5)=SRC(7,7)= (t2 + 2*t3 + t4 + 2) >> 2;
1142
    SRC(4,0)=SRC(5,2)=SRC(6,4)=SRC(7,6)= (t3 + t4 + 1) >> 1;
1143
    SRC(5,1)=SRC(6,3)=SRC(7,5)= (t3 + 2*t4 + t5 + 2) >> 2;
1144
    SRC(5,0)=SRC(6,2)=SRC(7,4)= (t4 + t5 + 1) >> 1;
1145
    SRC(6,1)=SRC(7,3)= (t4 + 2*t5 + t6 + 2) >> 2;
1146
    SRC(6,0)=SRC(7,2)= (t5 + t6 + 1) >> 1;
1147
    SRC(7,1)= (t5 + 2*t6 + t7 + 2) >> 2;
1148
    SRC(7,0)= (t6 + t7 + 1) >> 1;
1149
}
1150
static void FUNCC(pred8x8l_horizontal_down)(uint8_t *p_src, int has_topleft, int has_topright, int p_stride)
1151
{
1152
    pixel *src = (pixel*)p_src;
1153
    int stride = p_stride>>(sizeof(pixel)-1);
1154
    PREDICT_8x8_LOAD_TOP;
1155
    PREDICT_8x8_LOAD_LEFT;
1156
    PREDICT_8x8_LOAD_TOPLEFT;
1157
    SRC(0,7)= (l6 + l7 + 1) >> 1;
1158
    SRC(1,7)= (l5 + 2*l6 + l7 + 2) >> 2;
1159
    SRC(0,6)=SRC(2,7)= (l5 + l6 + 1) >> 1;
1160
    SRC(1,6)=SRC(3,7)= (l4 + 2*l5 + l6 + 2) >> 2;
1161
    SRC(0,5)=SRC(2,6)=SRC(4,7)= (l4 + l5 + 1) >> 1;
1162
    SRC(1,5)=SRC(3,6)=SRC(5,7)= (l3 + 2*l4 + l5 + 2) >> 2;
1163
    SRC(0,4)=SRC(2,5)=SRC(4,6)=SRC(6,7)= (l3 + l4 + 1) >> 1;
1164
    SRC(1,4)=SRC(3,5)=SRC(5,6)=SRC(7,7)= (l2 + 2*l3 + l4 + 2) >> 2;
1165
    SRC(0,3)=SRC(2,4)=SRC(4,5)=SRC(6,6)= (l2 + l3 + 1) >> 1;
1166
    SRC(1,3)=SRC(3,4)=SRC(5,5)=SRC(7,6)= (l1 + 2*l2 + l3 + 2) >> 2;
1167
    SRC(0,2)=SRC(2,3)=SRC(4,4)=SRC(6,5)= (l1 + l2 + 1) >> 1;
1168
    SRC(1,2)=SRC(3,3)=SRC(5,4)=SRC(7,5)= (l0 + 2*l1 + l2 + 2) >> 2;
1169
    SRC(0,1)=SRC(2,2)=SRC(4,3)=SRC(6,4)= (l0 + l1 + 1) >> 1;
1170
    SRC(1,1)=SRC(3,2)=SRC(5,3)=SRC(7,4)= (lt + 2*l0 + l1 + 2) >> 2;
1171
    SRC(0,0)=SRC(2,1)=SRC(4,2)=SRC(6,3)= (lt + l0 + 1) >> 1;
1172
    SRC(1,0)=SRC(3,1)=SRC(5,2)=SRC(7,3)= (l0 + 2*lt + t0 + 2) >> 2;
1173
    SRC(2,0)=SRC(4,1)=SRC(6,2)= (t1 + 2*t0 + lt + 2) >> 2;
1174
    SRC(3,0)=SRC(5,1)=SRC(7,2)= (t2 + 2*t1 + t0 + 2) >> 2;
1175
    SRC(4,0)=SRC(6,1)= (t3 + 2*t2 + t1 + 2) >> 2;
1176
    SRC(5,0)=SRC(7,1)= (t4 + 2*t3 + t2 + 2) >> 2;
1177
    SRC(6,0)= (t5 + 2*t4 + t3 + 2) >> 2;
1178
    SRC(7,0)= (t6 + 2*t5 + t4 + 2) >> 2;
1179
}
1180
static void FUNCC(pred8x8l_vertical_left)(uint8_t *p_src, int has_topleft, int has_topright, int p_stride)
1181
{
1182
    pixel *src = (pixel*)p_src;
1183
    int stride = p_stride>>(sizeof(pixel)-1);
1184
    PREDICT_8x8_LOAD_TOP;
1185
    PREDICT_8x8_LOAD_TOPRIGHT;
1186
    SRC(0,0)= (t0 + t1 + 1) >> 1;
1187
    SRC(0,1)= (t0 + 2*t1 + t2 + 2) >> 2;
1188
    SRC(0,2)=SRC(1,0)= (t1 + t2 + 1) >> 1;
1189
    SRC(0,3)=SRC(1,1)= (t1 + 2*t2 + t3 + 2) >> 2;
1190
    SRC(0,4)=SRC(1,2)=SRC(2,0)= (t2 + t3 + 1) >> 1;
1191
    SRC(0,5)=SRC(1,3)=SRC(2,1)= (t2 + 2*t3 + t4 + 2) >> 2;
1192
    SRC(0,6)=SRC(1,4)=SRC(2,2)=SRC(3,0)= (t3 + t4 + 1) >> 1;
1193
    SRC(0,7)=SRC(1,5)=SRC(2,3)=SRC(3,1)= (t3 + 2*t4 + t5 + 2) >> 2;
1194
    SRC(1,6)=SRC(2,4)=SRC(3,2)=SRC(4,0)= (t4 + t5 + 1) >> 1;
1195
    SRC(1,7)=SRC(2,5)=SRC(3,3)=SRC(4,1)= (t4 + 2*t5 + t6 + 2) >> 2;
1196
    SRC(2,6)=SRC(3,4)=SRC(4,2)=SRC(5,0)= (t5 + t6 + 1) >> 1;
1197
    SRC(2,7)=SRC(3,5)=SRC(4,3)=SRC(5,1)= (t5 + 2*t6 + t7 + 2) >> 2;
1198
    SRC(3,6)=SRC(4,4)=SRC(5,2)=SRC(6,0)= (t6 + t7 + 1) >> 1;
1199
    SRC(3,7)=SRC(4,5)=SRC(5,3)=SRC(6,1)= (t6 + 2*t7 + t8 + 2) >> 2;
1200
    SRC(4,6)=SRC(5,4)=SRC(6,2)=SRC(7,0)= (t7 + t8 + 1) >> 1;
1201
    SRC(4,7)=SRC(5,5)=SRC(6,3)=SRC(7,1)= (t7 + 2*t8 + t9 + 2) >> 2;
1202
    SRC(5,6)=SRC(6,4)=SRC(7,2)= (t8 + t9 + 1) >> 1;
1203
    SRC(5,7)=SRC(6,5)=SRC(7,3)= (t8 + 2*t9 + t10 + 2) >> 2;
1204
    SRC(6,6)=SRC(7,4)= (t9 + t10 + 1) >> 1;
1205
    SRC(6,7)=SRC(7,5)= (t9 + 2*t10 + t11 + 2) >> 2;
1206
    SRC(7,6)= (t10 + t11 + 1) >> 1;
1207
    SRC(7,7)= (t10 + 2*t11 + t12 + 2) >> 2;
1208
}
1209
static void FUNCC(pred8x8l_horizontal_up)(uint8_t *p_src, int has_topleft, int has_topright, int p_stride)
1210
{
1211
    pixel *src = (pixel*)p_src;
1212
    int stride = p_stride>>(sizeof(pixel)-1);
1213
    PREDICT_8x8_LOAD_LEFT;
1214
    SRC(0,0)= (l0 + l1 + 1) >> 1;
1215
    SRC(1,0)= (l0 + 2*l1 + l2 + 2) >> 2;
1216
    SRC(0,1)=SRC(2,0)= (l1 + l2 + 1) >> 1;
1217
    SRC(1,1)=SRC(3,0)= (l1 + 2*l2 + l3 + 2) >> 2;
1218
    SRC(0,2)=SRC(2,1)=SRC(4,0)= (l2 + l3 + 1) >> 1;
1219
    SRC(1,2)=SRC(3,1)=SRC(5,0)= (l2 + 2*l3 + l4 + 2) >> 2;
1220
    SRC(0,3)=SRC(2,2)=SRC(4,1)=SRC(6,0)= (l3 + l4 + 1) >> 1;
1221
    SRC(1,3)=SRC(3,2)=SRC(5,1)=SRC(7,0)= (l3 + 2*l4 + l5 + 2) >> 2;
1222
    SRC(0,4)=SRC(2,3)=SRC(4,2)=SRC(6,1)= (l4 + l5 + 1) >> 1;
1223
    SRC(1,4)=SRC(3,3)=SRC(5,2)=SRC(7,1)= (l4 + 2*l5 + l6 + 2) >> 2;
1224
    SRC(0,5)=SRC(2,4)=SRC(4,3)=SRC(6,2)= (l5 + l6 + 1) >> 1;
1225
    SRC(1,5)=SRC(3,4)=SRC(5,3)=SRC(7,2)= (l5 + 2*l6 + l7 + 2) >> 2;
1226
    SRC(0,6)=SRC(2,5)=SRC(4,4)=SRC(6,3)= (l6 + l7 + 1) >> 1;
1227
    SRC(1,6)=SRC(3,5)=SRC(5,4)=SRC(7,3)= (l6 + 3*l7 + 2) >> 2;
1228
    SRC(0,7)=SRC(1,7)=SRC(2,6)=SRC(2,7)=SRC(3,6)=
1229
    SRC(3,7)=SRC(4,5)=SRC(4,6)=SRC(4,7)=SRC(5,5)=
1230
    SRC(5,6)=SRC(5,7)=SRC(6,4)=SRC(6,5)=SRC(6,6)=
1231
    SRC(6,7)=SRC(7,4)=SRC(7,5)=SRC(7,6)=SRC(7,7)= l7;
1232
}
1233
/* Drop the 8x8 prediction helper macros so they do not leak further. */
#undef SRC
#undef PL
#undef PT
#undef PTR
#undef PREDICT_8x8_DC
#undef PREDICT_8x8_LOAD_TOPRIGHT
#undef PREDICT_8x8_LOAD_TOPLEFT
#undef PREDICT_8x8_LOAD_TOP
#undef PREDICT_8x8_LOAD_LEFT
1242

    
1243
static void FUNCC(pred4x4_vertical_add)(uint8_t *p_pix, const DCTELEM *p_block, int stride){
1244
    int i;
1245
    pixel *pix = (pixel*)p_pix;
1246
    const dctcoef *block = (const dctcoef*)p_block;
1247
    stride >>= sizeof(pixel)-1;
1248
    pix -= stride;
1249
    for(i=0; i<4; i++){
1250
        pixel v = pix[0];
1251
        pix[1*stride]= v += block[0];
1252
        pix[2*stride]= v += block[4];
1253
        pix[3*stride]= v += block[8];
1254
        pix[4*stride]= v +  block[12];
1255
        pix++;
1256
        block++;
1257
    }
1258
}
1259

    
1260
static void FUNCC(pred4x4_horizontal_add)(uint8_t *p_pix, const DCTELEM *p_block, int stride){
1261
    int i;
1262
    pixel *pix = (pixel*)p_pix;
1263
    const dctcoef *block = (const dctcoef*)p_block;
1264
    stride >>= sizeof(pixel)-1;
1265
    for(i=0; i<4; i++){
1266
        pixel v = pix[-1];
1267
        pix[0]= v += block[0];
1268
        pix[1]= v += block[1];
1269
        pix[2]= v += block[2];
1270
        pix[3]= v +  block[3];
1271
        pix+= stride;
1272
        block+= 4;
1273
    }
1274
}
1275

    
1276
static void FUNCC(pred8x8l_vertical_add)(uint8_t *p_pix, const DCTELEM *p_block, int stride){
1277
    int i;
1278
    pixel *pix = (pixel*)p_pix;
1279
    const dctcoef *block = (const dctcoef*)p_block;
1280
    stride >>= sizeof(pixel)-1;
1281
    pix -= stride;
1282
    for(i=0; i<8; i++){
1283
        pixel v = pix[0];
1284
        pix[1*stride]= v += block[0];
1285
        pix[2*stride]= v += block[8];
1286
        pix[3*stride]= v += block[16];
1287
        pix[4*stride]= v += block[24];
1288
        pix[5*stride]= v += block[32];
1289
        pix[6*stride]= v += block[40];
1290
        pix[7*stride]= v += block[48];
1291
        pix[8*stride]= v +  block[56];
1292
        pix++;
1293
        block++;
1294
    }
1295
}
1296

    
1297
static void FUNCC(pred8x8l_horizontal_add)(uint8_t *p_pix, const DCTELEM *p_block, int stride){
1298
    int i;
1299
    pixel *pix = (pixel*)p_pix;
1300
    const dctcoef *block = (const dctcoef*)p_block;
1301
    stride >>= sizeof(pixel)-1;
1302
    for(i=0; i<8; i++){
1303
        pixel v = pix[-1];
1304
        pix[0]= v += block[0];
1305
        pix[1]= v += block[1];
1306
        pix[2]= v += block[2];
1307
        pix[3]= v += block[3];
1308
        pix[4]= v += block[4];
1309
        pix[5]= v += block[5];
1310
        pix[6]= v += block[6];
1311
        pix[7]= v +  block[7];
1312
        pix+= stride;
1313
        block+= 8;
1314
    }
1315
}
1316

    
1317
static void FUNCC(pred16x16_vertical_add)(uint8_t *pix, const int *block_offset, const DCTELEM *block, int stride){
1318
    int i;
1319
    for(i=0; i<16; i++)
1320
        FUNCC(pred4x4_vertical_add)(pix + block_offset[i], block + i*16*sizeof(pixel), stride);
1321
}
1322

    
1323
static void FUNCC(pred16x16_horizontal_add)(uint8_t *pix, const int *block_offset, const DCTELEM *block, int stride){
1324
    int i;
1325
    for(i=0; i<16; i++)
1326
        FUNCC(pred4x4_horizontal_add)(pix + block_offset[i], block + i*16*sizeof(pixel), stride);
1327
}
1328

    
1329
static void FUNCC(pred8x8_vertical_add)(uint8_t *pix, const int *block_offset, const DCTELEM *block, int stride){
1330
    int i;
1331
    for(i=0; i<4; i++)
1332
        FUNCC(pred4x4_vertical_add)(pix + block_offset[i], block + i*16*sizeof(pixel), stride);
1333
}
1334

    
1335
static void FUNCC(pred8x8_horizontal_add)(uint8_t *pix, const int *block_offset, const DCTELEM *block, int stride){
1336
    int i;
1337
    for(i=0; i<4; i++)
1338
        FUNCC(pred4x4_horizontal_add)(pix + block_offset[i], block + i*16*sizeof(pixel), stride);
1339
}