Statistics
| Branch: | Revision:

ffmpeg / libavcodec / h264pred_internal.h @ 2b092f7a

History | View | Annotate | Download (44.5 KB)

1
/*
2
 * H.26L/H.264/AVC/JVT/14496-10/... encoder/decoder
3
 * Copyright (c) 2003-2011 Michael Niedermayer <michaelni@gmx.at>
4
 *
5
 * This file is part of FFmpeg.
6
 *
7
 * FFmpeg is free software; you can redistribute it and/or
8
 * modify it under the terms of the GNU Lesser General Public
9
 * License as published by the Free Software Foundation; either
10
 * version 2.1 of the License, or (at your option) any later version.
11
 *
12
 * FFmpeg is distributed in the hope that it will be useful,
13
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
15
 * Lesser General Public License for more details.
16
 *
17
 * You should have received a copy of the GNU Lesser General Public
18
 * License along with FFmpeg; if not, write to the Free Software
19
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
20
 */
21

    
22
/**
23
 * @file
24
 * H.264 / AVC / MPEG4 part10 prediction functions.
25
 * @author Michael Niedermayer <michaelni@gmx.at>
26
 */
27

    
28
#include "mathops.h"
29
#include "dsputil.h"
30

    
31
/* Sample bit depth; the fixed-value DC predictors below derive their fill
 * values from 1<<(BIT_DEPTH-1). */
#define BIT_DEPTH 8
32

    
33
/* pixel:   storage type of one sample; sizeof(pixel)-1 is used as the shift
 *          that converts the caller-supplied stride into pixel units.
 * pixel4:  integer type used to read/write four packed pixels at once.
 * dctcoef: coefficient type (not referenced in this part of the file). */
#define pixel uint8_t
34
#define pixel4 uint32_t
35
#define dctcoef DCTELEM
36

    
37
/* INIT_CLIP declares a local `cm` pointing MAX_NEG_CROP entries into
 * ff_cropTbl; it already contains its own ';' and is used as a declaration. */
#define INIT_CLIP uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
38
/* CLIP(a) looks `a` up through `cm`, so INIT_CLIP must be in scope. */
#define CLIP(a) cm[a]
39
/* FUNC(a) leaves the name unchanged; FUNCC(a) appends the suffix _c. */
#define FUNC(a) a
40
#define FUNCC(a) a ## _c
41
/* Replicate one 8-bit value into all four bytes of a 32-bit word. */
#define PIXEL_SPLAT_X4(x) ((x)*0x01010101U)
42
/* Four-pixel store helpers, mapped to AV_WN32 / AV_WN32A
 * (presumably the unaligned and aligned 32-bit writes -- confirm in intreadwrite.h). */
#define AV_WN4P  AV_WN32
43
#define AV_WN4PA AV_WN32A
44

    
45
static void FUNCC(pred4x4_vertical)(uint8_t *p_src, const uint8_t *topright, int p_stride){
46
    pixel *src = (pixel*)p_src;
47
    int stride = p_stride>>(sizeof(pixel)-1);
48
    const pixel4 a= ((pixel4*)(src-stride))[0];
49
    ((pixel4*)(src+0*stride))[0]= a;
50
    ((pixel4*)(src+1*stride))[0]= a;
51
    ((pixel4*)(src+2*stride))[0]= a;
52
    ((pixel4*)(src+3*stride))[0]= a;
53
}
54

    
55
static void FUNCC(pred4x4_horizontal)(uint8_t *p_src, const uint8_t *topright, int p_stride){
56
    pixel *src = (pixel*)p_src;
57
    int stride = p_stride>>(sizeof(pixel)-1);
58
    ((pixel4*)(src+0*stride))[0]= PIXEL_SPLAT_X4(src[-1+0*stride]);
59
    ((pixel4*)(src+1*stride))[0]= PIXEL_SPLAT_X4(src[-1+1*stride]);
60
    ((pixel4*)(src+2*stride))[0]= PIXEL_SPLAT_X4(src[-1+2*stride]);
61
    ((pixel4*)(src+3*stride))[0]= PIXEL_SPLAT_X4(src[-1+3*stride]);
62
}
63

    
64
static void FUNCC(pred4x4_dc)(uint8_t *p_src, const uint8_t *topright, int p_stride){
65
    pixel *src = (pixel*)p_src;
66
    int stride = p_stride>>(sizeof(pixel)-1);
67
    const int dc= (  src[-stride] + src[1-stride] + src[2-stride] + src[3-stride]
68
                   + src[-1+0*stride] + src[-1+1*stride] + src[-1+2*stride] + src[-1+3*stride] + 4) >>3;
69

    
70
    ((pixel4*)(src+0*stride))[0]=
71
    ((pixel4*)(src+1*stride))[0]=
72
    ((pixel4*)(src+2*stride))[0]=
73
    ((pixel4*)(src+3*stride))[0]= PIXEL_SPLAT_X4(dc);
74
}
75

    
76
static void FUNCC(pred4x4_left_dc)(uint8_t *p_src, const uint8_t *topright, int p_stride){
77
    pixel *src = (pixel*)p_src;
78
    int stride = p_stride>>(sizeof(pixel)-1);
79
    const int dc= (  src[-1+0*stride] + src[-1+1*stride] + src[-1+2*stride] + src[-1+3*stride] + 2) >>2;
80

    
81
    ((pixel4*)(src+0*stride))[0]=
82
    ((pixel4*)(src+1*stride))[0]=
83
    ((pixel4*)(src+2*stride))[0]=
84
    ((pixel4*)(src+3*stride))[0]= PIXEL_SPLAT_X4(dc);
85
}
86

    
87
static void FUNCC(pred4x4_top_dc)(uint8_t *p_src, const uint8_t *topright, int p_stride){
88
    pixel *src = (pixel*)p_src;
89
    int stride = p_stride>>(sizeof(pixel)-1);
90
    const int dc= (  src[-stride] + src[1-stride] + src[2-stride] + src[3-stride] + 2) >>2;
91

    
92
    ((pixel4*)(src+0*stride))[0]=
93
    ((pixel4*)(src+1*stride))[0]=
94
    ((pixel4*)(src+2*stride))[0]=
95
    ((pixel4*)(src+3*stride))[0]= PIXEL_SPLAT_X4(dc);
96
}
97

    
98
static void FUNCC(pred4x4_128_dc)(uint8_t *p_src, const uint8_t *topright, int p_stride){
99
    pixel *src = (pixel*)p_src;
100
    int stride = p_stride>>(sizeof(pixel)-1);
101
    ((pixel4*)(src+0*stride))[0]=
102
    ((pixel4*)(src+1*stride))[0]=
103
    ((pixel4*)(src+2*stride))[0]=
104
    ((pixel4*)(src+3*stride))[0]= PIXEL_SPLAT_X4(1<<(BIT_DEPTH-1));
105
}
106

    
107
static void FUNCC(pred4x4_127_dc)(uint8_t *p_src, const uint8_t *topright, int p_stride){
108
    pixel *src = (pixel*)p_src;
109
    int stride = p_stride>>(sizeof(pixel)-1);
110
    ((pixel4*)(src+0*stride))[0]=
111
    ((pixel4*)(src+1*stride))[0]=
112
    ((pixel4*)(src+2*stride))[0]=
113
    ((pixel4*)(src+3*stride))[0]= PIXEL_SPLAT_X4((1<<(BIT_DEPTH-1))-1);
114
}
115

    
116
static void FUNCC(pred4x4_129_dc)(uint8_t *p_src, const uint8_t *topright, int p_stride){
117
    pixel *src = (pixel*)p_src;
118
    int stride = p_stride>>(sizeof(pixel)-1);
119
    ((pixel4*)(src+0*stride))[0]=
120
    ((pixel4*)(src+1*stride))[0]=
121
    ((pixel4*)(src+2*stride))[0]=
122
    ((pixel4*)(src+3*stride))[0]= PIXEL_SPLAT_X4((1<<(BIT_DEPTH-1))+1);
123
}
124

    
125

    
126
/*
 * Declare t4..t7 from topright[0..3]. Requires a `topright` pointer in the
 * enclosing scope. The last line deliberately carries no line continuation,
 * so the macro does not depend on a blank line following it.
 */
#define LOAD_TOP_RIGHT_EDGE\
    const int av_unused t4= topright[0];\
    const int av_unused t5= topright[1];\
    const int av_unused t6= topright[2];\
    const int av_unused t7= topright[3];
131

    
132
/*
 * Declare l4..l7 from the column left of rows 4..7 (src[-1 + k*stride]).
 * Requires `src` and `stride` in the enclosing scope. The last line
 * deliberately carries no line continuation, so the macro does not depend
 * on a blank line following it.
 */
#define LOAD_DOWN_LEFT_EDGE\
    const int av_unused l4= src[-1+4*stride];\
    const int av_unused l5= src[-1+5*stride];\
    const int av_unused l6= src[-1+6*stride];\
    const int av_unused l7= src[-1+7*stride];
137

    
138
/*
 * Declare l0..l3 from the column left of rows 0..3 (src[-1 + k*stride]).
 * Requires `src` and `stride` in the enclosing scope. The last line
 * deliberately carries no line continuation, so the macro does not depend
 * on a blank line following it.
 */
#define LOAD_LEFT_EDGE\
    const int av_unused l0= src[-1+0*stride];\
    const int av_unused l1= src[-1+1*stride];\
    const int av_unused l2= src[-1+2*stride];\
    const int av_unused l3= src[-1+3*stride];
143

    
144
/*
 * Declare t0..t3 from the row above the block (src[k - stride]).
 * Requires `src` and `stride` in the enclosing scope. The last line
 * deliberately carries no line continuation, so the macro does not depend
 * on a blank line following it.
 */
#define LOAD_TOP_EDGE\
    const int av_unused t0= src[ 0-1*stride];\
    const int av_unused t1= src[ 1-1*stride];\
    const int av_unused t2= src[ 2-1*stride];\
    const int av_unused t3= src[ 3-1*stride];
149

    
150
static void FUNCC(pred4x4_vertical_vp8)(uint8_t *p_src, const uint8_t *p_topright, int p_stride){
151
    pixel *src = (pixel*)p_src;
152
    const pixel *topright = (const pixel*)p_topright;
153
    int stride = p_stride>>(sizeof(pixel)-1);
154
    const int lt= src[-1-1*stride];
155
    LOAD_TOP_EDGE
156
    LOAD_TOP_RIGHT_EDGE
157
    pixel4 v = PACK_4U8((lt + 2*t0 + t1 + 2) >> 2,
158
                            (t0 + 2*t1 + t2 + 2) >> 2,
159
                            (t1 + 2*t2 + t3 + 2) >> 2,
160
                            (t2 + 2*t3 + t4 + 2) >> 2);
161

    
162
    AV_WN4PA(src+0*stride, v);
163
    AV_WN4PA(src+1*stride, v);
164
    AV_WN4PA(src+2*stride, v);
165
    AV_WN4PA(src+3*stride, v);
166
}
167

    
168
static void FUNCC(pred4x4_horizontal_vp8)(uint8_t *p_src, const uint8_t *topright, int p_stride){
169
    pixel *src = (pixel*)p_src;
170
    int stride = p_stride>>(sizeof(pixel)-1);
171
    const int lt= src[-1-1*stride];
172
    LOAD_LEFT_EDGE
173

    
174
    AV_WN4PA(src+0*stride, PIXEL_SPLAT_X4((lt + 2*l0 + l1 + 2) >> 2));
175
    AV_WN4PA(src+1*stride, PIXEL_SPLAT_X4((l0 + 2*l1 + l2 + 2) >> 2));
176
    AV_WN4PA(src+2*stride, PIXEL_SPLAT_X4((l1 + 2*l2 + l3 + 2) >> 2));
177
    AV_WN4PA(src+3*stride, PIXEL_SPLAT_X4((l2 + 2*l3 + l3 + 2) >> 2));
178
}
179

    
180
static void FUNCC(pred4x4_down_right)(uint8_t *p_src, const uint8_t *topright, int p_stride){
181
    pixel *src = (pixel*)p_src;
182
    int stride = p_stride>>(sizeof(pixel)-1);
183
    const int lt= src[-1-1*stride];
184
    LOAD_TOP_EDGE
185
    LOAD_LEFT_EDGE
186

    
187
    src[0+3*stride]=(l3 + 2*l2 + l1 + 2)>>2;
188
    src[0+2*stride]=
189
    src[1+3*stride]=(l2 + 2*l1 + l0 + 2)>>2;
190
    src[0+1*stride]=
191
    src[1+2*stride]=
192
    src[2+3*stride]=(l1 + 2*l0 + lt + 2)>>2;
193
    src[0+0*stride]=
194
    src[1+1*stride]=
195
    src[2+2*stride]=
196
    src[3+3*stride]=(l0 + 2*lt + t0 + 2)>>2;
197
    src[1+0*stride]=
198
    src[2+1*stride]=
199
    src[3+2*stride]=(lt + 2*t0 + t1 + 2)>>2;
200
    src[2+0*stride]=
201
    src[3+1*stride]=(t0 + 2*t1 + t2 + 2)>>2;
202
    src[3+0*stride]=(t1 + 2*t2 + t3 + 2)>>2;
203
}
204

    
205
static void FUNCC(pred4x4_down_left)(uint8_t *p_src, const uint8_t *p_topright, int p_stride){
206
    pixel *src = (pixel*)p_src;
207
    const pixel *topright = (const pixel*)p_topright;
208
    int stride = p_stride>>(sizeof(pixel)-1);
209
    LOAD_TOP_EDGE
210
    LOAD_TOP_RIGHT_EDGE
211
//    LOAD_LEFT_EDGE
212

    
213
    src[0+0*stride]=(t0 + t2 + 2*t1 + 2)>>2;
214
    src[1+0*stride]=
215
    src[0+1*stride]=(t1 + t3 + 2*t2 + 2)>>2;
216
    src[2+0*stride]=
217
    src[1+1*stride]=
218
    src[0+2*stride]=(t2 + t4 + 2*t3 + 2)>>2;
219
    src[3+0*stride]=
220
    src[2+1*stride]=
221
    src[1+2*stride]=
222
    src[0+3*stride]=(t3 + t5 + 2*t4 + 2)>>2;
223
    src[3+1*stride]=
224
    src[2+2*stride]=
225
    src[1+3*stride]=(t4 + t6 + 2*t5 + 2)>>2;
226
    src[3+2*stride]=
227
    src[2+3*stride]=(t5 + t7 + 2*t6 + 2)>>2;
228
    src[3+3*stride]=(t6 + 3*t7 + 2)>>2;
229
}
230

    
231
static void FUNCC(pred4x4_down_left_svq3)(uint8_t *p_src, const uint8_t *topright, int p_stride){
232
    pixel *src = (pixel*)p_src;
233
    int stride = p_stride>>(sizeof(pixel)-1);
234
    LOAD_TOP_EDGE
235
    LOAD_LEFT_EDGE
236
    const av_unused int unu0= t0;
237
    const av_unused int unu1= l0;
238

    
239
    src[0+0*stride]=(l1 + t1)>>1;
240
    src[1+0*stride]=
241
    src[0+1*stride]=(l2 + t2)>>1;
242
    src[2+0*stride]=
243
    src[1+1*stride]=
244
    src[0+2*stride]=
245
    src[3+0*stride]=
246
    src[2+1*stride]=
247
    src[1+2*stride]=
248
    src[0+3*stride]=
249
    src[3+1*stride]=
250
    src[2+2*stride]=
251
    src[1+3*stride]=
252
    src[3+2*stride]=
253
    src[2+3*stride]=
254
    src[3+3*stride]=(l3 + t3)>>1;
255
}
256

    
257
static void FUNCC(pred4x4_down_left_rv40)(uint8_t *p_src, const uint8_t *p_topright, int p_stride){
258
    pixel *src = (pixel*)p_src;
259
    const pixel *topright = (const pixel*)p_topright;
260
    int stride = p_stride>>(sizeof(pixel)-1);
261
    LOAD_TOP_EDGE
262
    LOAD_TOP_RIGHT_EDGE
263
    LOAD_LEFT_EDGE
264
    LOAD_DOWN_LEFT_EDGE
265

    
266
    src[0+0*stride]=(t0 + t2 + 2*t1 + 2 + l0 + l2 + 2*l1 + 2)>>3;
267
    src[1+0*stride]=
268
    src[0+1*stride]=(t1 + t3 + 2*t2 + 2 + l1 + l3 + 2*l2 + 2)>>3;
269
    src[2+0*stride]=
270
    src[1+1*stride]=
271
    src[0+2*stride]=(t2 + t4 + 2*t3 + 2 + l2 + l4 + 2*l3 + 2)>>3;
272
    src[3+0*stride]=
273
    src[2+1*stride]=
274
    src[1+2*stride]=
275
    src[0+3*stride]=(t3 + t5 + 2*t4 + 2 + l3 + l5 + 2*l4 + 2)>>3;
276
    src[3+1*stride]=
277
    src[2+2*stride]=
278
    src[1+3*stride]=(t4 + t6 + 2*t5 + 2 + l4 + l6 + 2*l5 + 2)>>3;
279
    src[3+2*stride]=
280
    src[2+3*stride]=(t5 + t7 + 2*t6 + 2 + l5 + l7 + 2*l6 + 2)>>3;
281
    src[3+3*stride]=(t6 + t7 + 1 + l6 + l7 + 1)>>2;
282
}
283

    
284
static void FUNCC(pred4x4_down_left_rv40_nodown)(uint8_t *p_src, const uint8_t *p_topright, int p_stride){
285
    pixel *src = (pixel*)p_src;
286
    const pixel *topright = (const pixel*)p_topright;
287
    int stride = p_stride>>(sizeof(pixel)-1);
288
    LOAD_TOP_EDGE
289
    LOAD_TOP_RIGHT_EDGE
290
    LOAD_LEFT_EDGE
291

    
292
    src[0+0*stride]=(t0 + t2 + 2*t1 + 2 + l0 + l2 + 2*l1 + 2)>>3;
293
    src[1+0*stride]=
294
    src[0+1*stride]=(t1 + t3 + 2*t2 + 2 + l1 + l3 + 2*l2 + 2)>>3;
295
    src[2+0*stride]=
296
    src[1+1*stride]=
297
    src[0+2*stride]=(t2 + t4 + 2*t3 + 2 + l2 + 3*l3 + 2)>>3;
298
    src[3+0*stride]=
299
    src[2+1*stride]=
300
    src[1+2*stride]=
301
    src[0+3*stride]=(t3 + t5 + 2*t4 + 2 + l3*4 + 2)>>3;
302
    src[3+1*stride]=
303
    src[2+2*stride]=
304
    src[1+3*stride]=(t4 + t6 + 2*t5 + 2 + l3*4 + 2)>>3;
305
    src[3+2*stride]=
306
    src[2+3*stride]=(t5 + t7 + 2*t6 + 2 + l3*4 + 2)>>3;
307
    src[3+3*stride]=(t6 + t7 + 1 + 2*l3 + 1)>>2;
308
}
309

    
310
static void FUNCC(pred4x4_vertical_right)(uint8_t *p_src, const uint8_t *topright, int p_stride){
311
    pixel *src = (pixel*)p_src;
312
    int stride = p_stride>>(sizeof(pixel)-1);
313
    const int lt= src[-1-1*stride];
314
    LOAD_TOP_EDGE
315
    LOAD_LEFT_EDGE
316

    
317
    src[0+0*stride]=
318
    src[1+2*stride]=(lt + t0 + 1)>>1;
319
    src[1+0*stride]=
320
    src[2+2*stride]=(t0 + t1 + 1)>>1;
321
    src[2+0*stride]=
322
    src[3+2*stride]=(t1 + t2 + 1)>>1;
323
    src[3+0*stride]=(t2 + t3 + 1)>>1;
324
    src[0+1*stride]=
325
    src[1+3*stride]=(l0 + 2*lt + t0 + 2)>>2;
326
    src[1+1*stride]=
327
    src[2+3*stride]=(lt + 2*t0 + t1 + 2)>>2;
328
    src[2+1*stride]=
329
    src[3+3*stride]=(t0 + 2*t1 + t2 + 2)>>2;
330
    src[3+1*stride]=(t1 + 2*t2 + t3 + 2)>>2;
331
    src[0+2*stride]=(lt + 2*l0 + l1 + 2)>>2;
332
    src[0+3*stride]=(l0 + 2*l1 + l2 + 2)>>2;
333
}
334

    
335
static void FUNCC(pred4x4_vertical_left)(uint8_t *p_src, const uint8_t *p_topright, int p_stride){
336
    pixel *src = (pixel*)p_src;
337
    const pixel *topright = (const pixel*)p_topright;
338
    int stride = p_stride>>(sizeof(pixel)-1);
339
    LOAD_TOP_EDGE
340
    LOAD_TOP_RIGHT_EDGE
341

    
342
    src[0+0*stride]=(t0 + t1 + 1)>>1;
343
    src[1+0*stride]=
344
    src[0+2*stride]=(t1 + t2 + 1)>>1;
345
    src[2+0*stride]=
346
    src[1+2*stride]=(t2 + t3 + 1)>>1;
347
    src[3+0*stride]=
348
    src[2+2*stride]=(t3 + t4+ 1)>>1;
349
    src[3+2*stride]=(t4 + t5+ 1)>>1;
350
    src[0+1*stride]=(t0 + 2*t1 + t2 + 2)>>2;
351
    src[1+1*stride]=
352
    src[0+3*stride]=(t1 + 2*t2 + t3 + 2)>>2;
353
    src[2+1*stride]=
354
    src[1+3*stride]=(t2 + 2*t3 + t4 + 2)>>2;
355
    src[3+1*stride]=
356
    src[2+3*stride]=(t3 + 2*t4 + t5 + 2)>>2;
357
    src[3+3*stride]=(t4 + 2*t5 + t6 + 2)>>2;
358
}
359

    
360
static void FUNCC(pred4x4_vertical_left_rv40_internal)(uint8_t *p_src, const uint8_t *p_topright, int p_stride,
361
                                      const int l0, const int l1, const int l2, const int l3, const int l4){
362
    pixel *src = (pixel*)p_src;
363
    const pixel *topright = (const pixel*)p_topright;
364
    int stride = p_stride>>(sizeof(pixel)-1);
365
    LOAD_TOP_EDGE
366
    LOAD_TOP_RIGHT_EDGE
367

    
368
    src[0+0*stride]=(2*t0 + 2*t1 + l1 + 2*l2 + l3 + 4)>>3;
369
    src[1+0*stride]=
370
    src[0+2*stride]=(t1 + t2 + 1)>>1;
371
    src[2+0*stride]=
372
    src[1+2*stride]=(t2 + t3 + 1)>>1;
373
    src[3+0*stride]=
374
    src[2+2*stride]=(t3 + t4+ 1)>>1;
375
    src[3+2*stride]=(t4 + t5+ 1)>>1;
376
    src[0+1*stride]=(t0 + 2*t1 + t2 + l2 + 2*l3 + l4 + 4)>>3;
377
    src[1+1*stride]=
378
    src[0+3*stride]=(t1 + 2*t2 + t3 + 2)>>2;
379
    src[2+1*stride]=
380
    src[1+3*stride]=(t2 + 2*t3 + t4 + 2)>>2;
381
    src[3+1*stride]=
382
    src[2+3*stride]=(t3 + 2*t4 + t5 + 2)>>2;
383
    src[3+3*stride]=(t4 + 2*t5 + t6 + 2)>>2;
384
}
385

    
386
static void FUNCC(pred4x4_vertical_left_rv40)(uint8_t *p_src, const uint8_t *topright, int p_stride){
387
    pixel *src = (pixel*)p_src;
388
    int stride = p_stride>>(sizeof(pixel)-1);
389
    LOAD_LEFT_EDGE
390
    LOAD_DOWN_LEFT_EDGE
391

    
392
    FUNCC(pred4x4_vertical_left_rv40_internal)(p_src, topright, p_stride, l0, l1, l2, l3, l4);
393
}
394

    
395
static void FUNCC(pred4x4_vertical_left_rv40_nodown)(uint8_t *p_src, const uint8_t *topright, int p_stride){
396
    pixel *src = (pixel*)p_src;
397
    int stride = p_stride>>(sizeof(pixel)-1);
398
    LOAD_LEFT_EDGE
399

    
400
    FUNCC(pred4x4_vertical_left_rv40_internal)(p_src, topright, p_stride, l0, l1, l2, l3, l3);
401
}
402

    
403
static void FUNCC(pred4x4_vertical_left_vp8)(uint8_t *p_src, const uint8_t *p_topright, int p_stride){
404
    pixel *src = (pixel*)p_src;
405
    const pixel *topright = (const pixel*)p_topright;
406
    int stride = p_stride>>(sizeof(pixel)-1);
407
    LOAD_TOP_EDGE
408
    LOAD_TOP_RIGHT_EDGE
409

    
410
    src[0+0*stride]=(t0 + t1 + 1)>>1;
411
    src[1+0*stride]=
412
    src[0+2*stride]=(t1 + t2 + 1)>>1;
413
    src[2+0*stride]=
414
    src[1+2*stride]=(t2 + t3 + 1)>>1;
415
    src[3+0*stride]=
416
    src[2+2*stride]=(t3 + t4 + 1)>>1;
417
    src[0+1*stride]=(t0 + 2*t1 + t2 + 2)>>2;
418
    src[1+1*stride]=
419
    src[0+3*stride]=(t1 + 2*t2 + t3 + 2)>>2;
420
    src[2+1*stride]=
421
    src[1+3*stride]=(t2 + 2*t3 + t4 + 2)>>2;
422
    src[3+1*stride]=
423
    src[2+3*stride]=(t3 + 2*t4 + t5 + 2)>>2;
424
    src[3+2*stride]=(t4 + 2*t5 + t6 + 2)>>2;
425
    src[3+3*stride]=(t5 + 2*t6 + t7 + 2)>>2;
426
}
427

    
428
static void FUNCC(pred4x4_horizontal_up)(uint8_t *p_src, const uint8_t *topright, int p_stride){
429
    pixel *src = (pixel*)p_src;
430
    int stride = p_stride>>(sizeof(pixel)-1);
431
    LOAD_LEFT_EDGE
432

    
433
    src[0+0*stride]=(l0 + l1 + 1)>>1;
434
    src[1+0*stride]=(l0 + 2*l1 + l2 + 2)>>2;
435
    src[2+0*stride]=
436
    src[0+1*stride]=(l1 + l2 + 1)>>1;
437
    src[3+0*stride]=
438
    src[1+1*stride]=(l1 + 2*l2 + l3 + 2)>>2;
439
    src[2+1*stride]=
440
    src[0+2*stride]=(l2 + l3 + 1)>>1;
441
    src[3+1*stride]=
442
    src[1+2*stride]=(l2 + 2*l3 + l3 + 2)>>2;
443
    src[3+2*stride]=
444
    src[1+3*stride]=
445
    src[0+3*stride]=
446
    src[2+2*stride]=
447
    src[2+3*stride]=
448
    src[3+3*stride]=l3;
449
}
450

    
451
static void FUNCC(pred4x4_horizontal_up_rv40)(uint8_t *p_src, const uint8_t *p_topright, int p_stride){
452
    pixel *src = (pixel*)p_src;
453
    const pixel *topright = (const pixel*)p_topright;
454
    int stride = p_stride>>(sizeof(pixel)-1);
455
    LOAD_LEFT_EDGE
456
    LOAD_DOWN_LEFT_EDGE
457
    LOAD_TOP_EDGE
458
    LOAD_TOP_RIGHT_EDGE
459

    
460
    src[0+0*stride]=(t1 + 2*t2 + t3 + 2*l0 + 2*l1 + 4)>>3;
461
    src[1+0*stride]=(t2 + 2*t3 + t4 + l0 + 2*l1 + l2 + 4)>>3;
462
    src[2+0*stride]=
463
    src[0+1*stride]=(t3 + 2*t4 + t5 + 2*l1 + 2*l2 + 4)>>3;
464
    src[3+0*stride]=
465
    src[1+1*stride]=(t4 + 2*t5 + t6 + l1 + 2*l2 + l3 + 4)>>3;
466
    src[2+1*stride]=
467
    src[0+2*stride]=(t5 + 2*t6 + t7 + 2*l2 + 2*l3 + 4)>>3;
468
    src[3+1*stride]=
469
    src[1+2*stride]=(t6 + 3*t7 + l2 + 3*l3 + 4)>>3;
470
    src[3+2*stride]=
471
    src[1+3*stride]=(l3 + 2*l4 + l5 + 2)>>2;
472
    src[0+3*stride]=
473
    src[2+2*stride]=(t6 + t7 + l3 + l4 + 2)>>2;
474
    src[2+3*stride]=(l4 + l5 + 1)>>1;
475
    src[3+3*stride]=(l4 + 2*l5 + l6 + 2)>>2;
476
}
477

    
478
static void FUNCC(pred4x4_horizontal_up_rv40_nodown)(uint8_t *p_src, const uint8_t *p_topright, int p_stride){
479
    pixel *src = (pixel*)p_src;
480
    const pixel *topright = (const pixel*)p_topright;
481
    int stride = p_stride>>(sizeof(pixel)-1);
482
    LOAD_LEFT_EDGE
483
    LOAD_TOP_EDGE
484
    LOAD_TOP_RIGHT_EDGE
485

    
486
    src[0+0*stride]=(t1 + 2*t2 + t3 + 2*l0 + 2*l1 + 4)>>3;
487
    src[1+0*stride]=(t2 + 2*t3 + t4 + l0 + 2*l1 + l2 + 4)>>3;
488
    src[2+0*stride]=
489
    src[0+1*stride]=(t3 + 2*t4 + t5 + 2*l1 + 2*l2 + 4)>>3;
490
    src[3+0*stride]=
491
    src[1+1*stride]=(t4 + 2*t5 + t6 + l1 + 2*l2 + l3 + 4)>>3;
492
    src[2+1*stride]=
493
    src[0+2*stride]=(t5 + 2*t6 + t7 + 2*l2 + 2*l3 + 4)>>3;
494
    src[3+1*stride]=
495
    src[1+2*stride]=(t6 + 3*t7 + l2 + 3*l3 + 4)>>3;
496
    src[3+2*stride]=
497
    src[1+3*stride]=l3;
498
    src[0+3*stride]=
499
    src[2+2*stride]=(t6 + t7 + 2*l3 + 2)>>2;
500
    src[2+3*stride]=
501
    src[3+3*stride]=l3;
502
}
503

    
504
static void FUNCC(pred4x4_horizontal_down)(uint8_t *p_src, const uint8_t *topright, int p_stride){
505
    pixel *src = (pixel*)p_src;
506
    int stride = p_stride>>(sizeof(pixel)-1);
507
    const int lt= src[-1-1*stride];
508
    LOAD_TOP_EDGE
509
    LOAD_LEFT_EDGE
510

    
511
    src[0+0*stride]=
512
    src[2+1*stride]=(lt + l0 + 1)>>1;
513
    src[1+0*stride]=
514
    src[3+1*stride]=(l0 + 2*lt + t0 + 2)>>2;
515
    src[2+0*stride]=(lt + 2*t0 + t1 + 2)>>2;
516
    src[3+0*stride]=(t0 + 2*t1 + t2 + 2)>>2;
517
    src[0+1*stride]=
518
    src[2+2*stride]=(l0 + l1 + 1)>>1;
519
    src[1+1*stride]=
520
    src[3+2*stride]=(lt + 2*l0 + l1 + 2)>>2;
521
    src[0+2*stride]=
522
    src[2+3*stride]=(l1 + l2+ 1)>>1;
523
    src[1+2*stride]=
524
    src[3+3*stride]=(l0 + 2*l1 + l2 + 2)>>2;
525
    src[0+3*stride]=(l2 + l3 + 1)>>1;
526
    src[1+3*stride]=(l1 + 2*l2 + l3 + 2)>>2;
527
}
528

    
529
static void FUNCC(pred4x4_tm_vp8)(uint8_t *p_src, const uint8_t *topright, int p_stride){
530
    pixel *src = (pixel*)p_src;
531
    int stride = p_stride>>(sizeof(pixel)-1);
532
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP - src[-1-stride];
533
    pixel *top = src-stride;
534
    int y;
535

    
536
    for (y = 0; y < 4; y++) {
537
        uint8_t *cm_in = cm + src[-1];
538
        src[0] = cm_in[top[0]];
539
        src[1] = cm_in[top[1]];
540
        src[2] = cm_in[top[2]];
541
        src[3] = cm_in[top[3]];
542
        src += stride;
543
    }
544
}
545

    
546
static void FUNCC(pred16x16_vertical)(uint8_t *p_src, int p_stride){
547
    int i;
548
    pixel *src = (pixel*)p_src;
549
    int stride = p_stride>>(sizeof(pixel)-1);
550
    const pixel4 a = ((pixel4*)(src-stride))[0];
551
    const pixel4 b = ((pixel4*)(src-stride))[1];
552
    const pixel4 c = ((pixel4*)(src-stride))[2];
553
    const pixel4 d = ((pixel4*)(src-stride))[3];
554

    
555
    for(i=0; i<16; i++){
556
        ((pixel4*)(src+i*stride))[0] = a;
557
        ((pixel4*)(src+i*stride))[1] = b;
558
        ((pixel4*)(src+i*stride))[2] = c;
559
        ((pixel4*)(src+i*stride))[3] = d;
560
    }
561
}
562

    
563
static void FUNCC(pred16x16_horizontal)(uint8_t *p_src, int stride){
564
    int i;
565
    pixel *src = (pixel*)p_src;
566
    stride >>= sizeof(pixel)-1;
567

    
568
    for(i=0; i<16; i++){
569
        ((pixel4*)(src+i*stride))[0] =
570
        ((pixel4*)(src+i*stride))[1] =
571
        ((pixel4*)(src+i*stride))[2] =
572
        ((pixel4*)(src+i*stride))[3] = PIXEL_SPLAT_X4(src[-1+i*stride]);
573
    }
574
}
575

    
576
/*
 * Fill 16 rows of 16 pixels with the packed four-pixel value v.
 * Uses `i`, `src` and `stride` from the enclosing scope; `src` is advanced
 * by 16 rows as a side effect, and v is evaluated once per store.
 * Wrapped in do { } while (0) so the expansion is a single statement and
 * stays correct inside an unbraced if/else; invoke it with a trailing ';'.
 */
#define PREDICT_16x16_DC(v)\
    do {\
        for(i=0; i<16; i++){\
            AV_WN4P(src+ 0, v);\
            AV_WN4P(src+ 4, v);\
            AV_WN4P(src+ 8, v);\
            AV_WN4P(src+12, v);\
            src += stride;\
        }\
    } while (0)
584

    
585
static void FUNCC(pred16x16_dc)(uint8_t *p_src, int stride){
586
    int i, dc=0;
587
    pixel *src = (pixel*)p_src;
588
    pixel4 dcsplat;
589
    stride >>= sizeof(pixel)-1;
590

    
591
    for(i=0;i<16; i++){
592
        dc+= src[-1+i*stride];
593
    }
594

    
595
    for(i=0;i<16; i++){
596
        dc+= src[i-stride];
597
    }
598

    
599
    dcsplat = PIXEL_SPLAT_X4((dc+16)>>5);
600
    PREDICT_16x16_DC(dcsplat);
601
}
602

    
603
static void FUNCC(pred16x16_left_dc)(uint8_t *p_src, int stride){
604
    int i, dc=0;
605
    pixel *src = (pixel*)p_src;
606
    pixel4 dcsplat;
607
    stride >>= sizeof(pixel)-1;
608

    
609
    for(i=0;i<16; i++){
610
        dc+= src[-1+i*stride];
611
    }
612

    
613
    dcsplat = PIXEL_SPLAT_X4((dc+8)>>4);
614
    PREDICT_16x16_DC(dcsplat);
615
}
616

    
617
static void FUNCC(pred16x16_top_dc)(uint8_t *p_src, int stride){
618
    int i, dc=0;
619
    pixel *src = (pixel*)p_src;
620
    pixel4 dcsplat;
621
    stride >>= sizeof(pixel)-1;
622

    
623
    for(i=0;i<16; i++){
624
        dc+= src[i-stride];
625
    }
626

    
627
    dcsplat = PIXEL_SPLAT_X4((dc+8)>>4);
628
    PREDICT_16x16_DC(dcsplat);
629
}
630

    
631
#define PRED16x16_X(n, v) \
632
static void FUNCC(pred16x16_##n##_dc)(uint8_t *p_src, int stride){\
633
    int i;\
634
    pixel *src = (pixel*)p_src;\
635
    stride >>= sizeof(pixel)-1;\
636
    PREDICT_16x16_DC(PIXEL_SPLAT_X4(v));\
637
}
638

    
639
PRED16x16_X(127, (1<<(BIT_DEPTH-1))-1);
640
PRED16x16_X(128, (1<<(BIT_DEPTH-1))+0);
641
PRED16x16_X(129, (1<<(BIT_DEPTH-1))+1);
642

    
643
static inline void FUNCC(pred16x16_plane_compat)(uint8_t *p_src, int p_stride, const int svq3, const int rv40){
644
  int i, j, k;
645
  int a;
646
  INIT_CLIP
647
  pixel *src = (pixel*)p_src;
648
  int stride = p_stride>>(sizeof(pixel)-1);
649
  const pixel * const src0 = src +7-stride;
650
  const pixel *       src1 = src +8*stride-1;
651
  const pixel *       src2 = src1-2*stride;    // == src+6*stride-1;
652
  int H = src0[1] - src0[-1];
653
  int V = src1[0] - src2[ 0];
654
  for(k=2; k<=8; ++k) {
655
    src1 += stride; src2 -= stride;
656
    H += k*(src0[k] - src0[-k]);
657
    V += k*(src1[0] - src2[ 0]);
658
  }
659
  if(svq3){
660
    H = ( 5*(H/4) ) / 16;
661
    V = ( 5*(V/4) ) / 16;
662

    
663
    /* required for 100% accuracy */
664
    i = H; H = V; V = i;
665
  }else if(rv40){
666
    H = ( H + (H>>2) ) >> 4;
667
    V = ( V + (V>>2) ) >> 4;
668
  }else{
669
    H = ( 5*H+32 ) >> 6;
670
    V = ( 5*V+32 ) >> 6;
671
  }
672

    
673
  a = 16*(src1[0] + src2[16] + 1) - 7*(V+H);
674
  for(j=16; j>0; --j) {
675
    int b = a;
676
    a += V;
677
    for(i=-16; i<0; i+=4) {
678
      src[16+i] = CLIP((b    ) >> 5);
679
      src[17+i] = CLIP((b+  H) >> 5);
680
      src[18+i] = CLIP((b+2*H) >> 5);
681
      src[19+i] = CLIP((b+3*H) >> 5);
682
      b += 4*H;
683
    }
684
    src += stride;
685
  }
686
}
687

    
688
static void FUNCC(pred16x16_plane)(uint8_t *src, int stride){
689
    FUNCC(pred16x16_plane_compat)(src, stride, 0, 0);
690
}
691

    
692
static void FUNCC(pred16x16_plane_svq3)(uint8_t *src, int stride){
693
    FUNCC(pred16x16_plane_compat)(src, stride, 1, 0);
694
}
695

    
696
static void FUNCC(pred16x16_plane_rv40)(uint8_t *src, int stride){
697
    FUNCC(pred16x16_plane_compat)(src, stride, 0, 1);
698
}
699

    
700
static void FUNCC(pred16x16_tm_vp8)(uint8_t *src, int stride){
701
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP - src[-1-stride];
702
    uint8_t *top = src-stride;
703
    int y;
704

    
705
    for (y = 0; y < 16; y++) {
706
        uint8_t *cm_in = cm + src[-1];
707
        src[0]  = cm_in[top[0]];
708
        src[1]  = cm_in[top[1]];
709
        src[2]  = cm_in[top[2]];
710
        src[3]  = cm_in[top[3]];
711
        src[4]  = cm_in[top[4]];
712
        src[5]  = cm_in[top[5]];
713
        src[6]  = cm_in[top[6]];
714
        src[7]  = cm_in[top[7]];
715
        src[8]  = cm_in[top[8]];
716
        src[9]  = cm_in[top[9]];
717
        src[10] = cm_in[top[10]];
718
        src[11] = cm_in[top[11]];
719
        src[12] = cm_in[top[12]];
720
        src[13] = cm_in[top[13]];
721
        src[14] = cm_in[top[14]];
722
        src[15] = cm_in[top[15]];
723
        src += stride;
724
    }
725
}
726

    
727
static void FUNCC(pred8x8_vertical)(uint8_t *p_src, int p_stride){
728
    int i;
729
    pixel *src = (pixel*)p_src;
730
    int stride = p_stride>>(sizeof(pixel)-1);
731
    const pixel4 a= ((pixel4*)(src-stride))[0];
732
    const pixel4 b= ((pixel4*)(src-stride))[1];
733

    
734
    for(i=0; i<8; i++){
735
        ((pixel4*)(src+i*stride))[0]= a;
736
        ((pixel4*)(src+i*stride))[1]= b;
737
    }
738
}
739

    
740
static void FUNCC(pred8x8_horizontal)(uint8_t *p_src, int stride){
741
    int i;
742
    pixel *src = (pixel*)p_src;
743
    stride >>= sizeof(pixel)-1;
744

    
745
    for(i=0; i<8; i++){
746
        ((pixel4*)(src+i*stride))[0]=
747
        ((pixel4*)(src+i*stride))[1]= PIXEL_SPLAT_X4(src[-1+i*stride]);
748
    }
749
}
750

    
751
#define PRED8x8_X(n, v)\
752
static void FUNCC(pred8x8_##n##_dc)(uint8_t *p_src, int stride){\
753
    int i;\
754
    pixel *src = (pixel*)p_src;\
755
    stride >>= sizeof(pixel)-1;\
756
    for(i=0; i<8; i++){\
757
        ((pixel4*)(src+i*stride))[0]=\
758
        ((pixel4*)(src+i*stride))[1]= PIXEL_SPLAT_X4(v);\
759
    }\
760
}
761

    
762
PRED8x8_X(127, (1<<(BIT_DEPTH-1))-1);
763
PRED8x8_X(128, (1<<(BIT_DEPTH-1))+0);
764
PRED8x8_X(129, (1<<(BIT_DEPTH-1))+1);
765

    
766
static void FUNCC(pred8x8_left_dc)(uint8_t *p_src, int stride){
767
    int i;
768
    int dc0, dc2;
769
    pixel4 dc0splat, dc2splat;
770
    pixel *src = (pixel*)p_src;
771
    stride >>= sizeof(pixel)-1;
772

    
773
    dc0=dc2=0;
774
    for(i=0;i<4; i++){
775
        dc0+= src[-1+i*stride];
776
        dc2+= src[-1+(i+4)*stride];
777
    }
778
    dc0splat = PIXEL_SPLAT_X4((dc0 + 2)>>2);
779
    dc2splat = PIXEL_SPLAT_X4((dc2 + 2)>>2);
780

    
781
    for(i=0; i<4; i++){
782
        ((pixel4*)(src+i*stride))[0]=
783
        ((pixel4*)(src+i*stride))[1]= dc0splat;
784
    }
785
    for(i=4; i<8; i++){
786
        ((pixel4*)(src+i*stride))[0]=
787
        ((pixel4*)(src+i*stride))[1]= dc2splat;
788
    }
789
}
790

    
791
static void FUNCC(pred8x8_left_dc_rv40)(uint8_t *p_src, int stride){
792
    int i;
793
    int dc0;
794
    pixel4 dc0splat;
795
    pixel *src = (pixel*)p_src;
796
    stride >>= sizeof(pixel)-1;
797

    
798
    dc0=0;
799
    for(i=0;i<8; i++)
800
        dc0+= src[-1+i*stride];
801
    dc0splat = PIXEL_SPLAT_X4((dc0 + 4)>>3);
802

    
803
    for(i=0; i<8; i++){
804
        ((pixel4*)(src+i*stride))[0]=
805
        ((pixel4*)(src+i*stride))[1]= dc0splat;
806
    }
807
}
808

    
809
static void FUNCC(pred8x8_top_dc)(uint8_t *p_src, int stride){
810
    int i;
811
    int dc0, dc1;
812
    pixel4 dc0splat, dc1splat;
813
    pixel *src = (pixel*)p_src;
814
    stride >>= sizeof(pixel)-1;
815

    
816
    dc0=dc1=0;
817
    for(i=0;i<4; i++){
818
        dc0+= src[i-stride];
819
        dc1+= src[4+i-stride];
820
    }
821
    dc0splat = PIXEL_SPLAT_X4((dc0 + 2)>>2);
822
    dc1splat = PIXEL_SPLAT_X4((dc1 + 2)>>2);
823

    
824
    for(i=0; i<4; i++){
825
        ((pixel4*)(src+i*stride))[0]= dc0splat;
826
        ((pixel4*)(src+i*stride))[1]= dc1splat;
827
    }
828
    for(i=4; i<8; i++){
829
        ((pixel4*)(src+i*stride))[0]= dc0splat;
830
        ((pixel4*)(src+i*stride))[1]= dc1splat;
831
    }
832
}
833

    
834
static void FUNCC(pred8x8_top_dc_rv40)(uint8_t *p_src, int stride){
835
    int i;
836
    int dc0;
837
    pixel4 dc0splat;
838
    pixel *src = (pixel*)p_src;
839
    stride >>= sizeof(pixel)-1;
840

    
841
    dc0=0;
842
    for(i=0;i<8; i++)
843
        dc0+= src[i-stride];
844
    dc0splat = PIXEL_SPLAT_X4((dc0 + 4)>>3);
845

    
846
    for(i=0; i<8; i++){
847
        ((pixel4*)(src+i*stride))[0]=
848
        ((pixel4*)(src+i*stride))[1]= dc0splat;
849
    }
850
}
851

    
852

    
853
static void FUNCC(pred8x8_dc)(uint8_t *p_src, int stride){
854
    int i;
855
    int dc0, dc1, dc2;
856
    pixel4 dc0splat, dc1splat, dc2splat, dc3splat;
857
    pixel *src = (pixel*)p_src;
858
    stride >>= sizeof(pixel)-1;
859

    
860
    dc0=dc1=dc2=0;
861
    for(i=0;i<4; i++){
862
        dc0+= src[-1+i*stride] + src[i-stride];
863
        dc1+= src[4+i-stride];
864
        dc2+= src[-1+(i+4)*stride];
865
    }
866
    dc0splat = PIXEL_SPLAT_X4((dc0 + 4)>>3);
867
    dc1splat = PIXEL_SPLAT_X4((dc1 + 2)>>2);
868
    dc2splat = PIXEL_SPLAT_X4((dc2 + 2)>>2);
869
    dc3splat = PIXEL_SPLAT_X4((dc1 + dc2 + 4)>>3);
870

    
871
    for(i=0; i<4; i++){
872
        ((pixel4*)(src+i*stride))[0]= dc0splat;
873
        ((pixel4*)(src+i*stride))[1]= dc1splat;
874
    }
875
    for(i=4; i<8; i++){
876
        ((pixel4*)(src+i*stride))[0]= dc2splat;
877
        ((pixel4*)(src+i*stride))[1]= dc3splat;
878
    }
879
}
880

    
881
//the following 4 functions should not be optimized!
882
static void FUNC(pred8x8_mad_cow_dc_l0t)(uint8_t *src, int stride){
883
    FUNCC(pred8x8_top_dc)(src, stride);
884
    FUNCC(pred4x4_dc)(src, NULL, stride);
885
}
886

    
887
static void FUNC(pred8x8_mad_cow_dc_0lt)(uint8_t *src, int stride){
888
    FUNCC(pred8x8_dc)(src, stride);
889
    FUNCC(pred4x4_top_dc)(src, NULL, stride);
890
}
891

    
892
static void FUNC(pred8x8_mad_cow_dc_l00)(uint8_t *src, int stride){
893
    FUNCC(pred8x8_left_dc)(src, stride);
894
    FUNCC(pred4x4_128_dc)(src + 4*stride                  , NULL, stride);
895
    FUNCC(pred4x4_128_dc)(src + 4*stride + 4*sizeof(pixel), NULL, stride);
896
}
897

    
898
static void FUNC(pred8x8_mad_cow_dc_0l0)(uint8_t *src, int stride){
899
    FUNCC(pred8x8_left_dc)(src, stride);
900
    FUNCC(pred4x4_128_dc)(src                  , NULL, stride);
901
    FUNCC(pred4x4_128_dc)(src + 4*sizeof(pixel), NULL, stride);
902
}
903

    
904
static void FUNCC(pred8x8_dc_rv40)(uint8_t *p_src, int stride){
905
    int i;
906
    int dc0=0;
907
    pixel4 dc0splat;
908
    pixel *src = (pixel*)p_src;
909
    stride >>= sizeof(pixel)-1;
910

    
911
    for(i=0;i<4; i++){
912
        dc0+= src[-1+i*stride] + src[i-stride];
913
        dc0+= src[4+i-stride];
914
        dc0+= src[-1+(i+4)*stride];
915
    }
916
    dc0splat = PIXEL_SPLAT_X4((dc0 + 8)>>4);
917

    
918
    for(i=0; i<4; i++){
919
        ((pixel4*)(src+i*stride))[0]= dc0splat;
920
        ((pixel4*)(src+i*stride))[1]= dc0splat;
921
    }
922
    for(i=4; i<8; i++){
923
        ((pixel4*)(src+i*stride))[0]= dc0splat;
924
        ((pixel4*)(src+i*stride))[1]= dc0splat;
925
    }
926
}
927

    
928
static void FUNCC(pred8x8_plane)(uint8_t *p_src, int p_stride){
929
  int j, k;
930
  int a;
931
  INIT_CLIP
932
  pixel *src = (pixel*)p_src;
933
  int stride = p_stride>>(sizeof(pixel)-1);
934
  const pixel * const src0 = src +3-stride;
935
  const pixel *       src1 = src +4*stride-1;
936
  const pixel *       src2 = src1-2*stride;    // == src+2*stride-1;
937
  int H = src0[1] - src0[-1];
938
  int V = src1[0] - src2[ 0];
939
  for(k=2; k<=4; ++k) {
940
    src1 += stride; src2 -= stride;
941
    H += k*(src0[k] - src0[-k]);
942
    V += k*(src1[0] - src2[ 0]);
943
  }
944
  H = ( 17*H+16 ) >> 5;
945
  V = ( 17*V+16 ) >> 5;
946

    
947
  a = 16*(src1[0] + src2[8]+1) - 3*(V+H);
948
  for(j=8; j>0; --j) {
949
    int b = a;
950
    a += V;
951
    src[0] = CLIP((b    ) >> 5);
952
    src[1] = CLIP((b+  H) >> 5);
953
    src[2] = CLIP((b+2*H) >> 5);
954
    src[3] = CLIP((b+3*H) >> 5);
955
    src[4] = CLIP((b+4*H) >> 5);
956
    src[5] = CLIP((b+5*H) >> 5);
957
    src[6] = CLIP((b+6*H) >> 5);
958
    src[7] = CLIP((b+7*H) >> 5);
959
    src += stride;
960
  }
961
}
962

    
963
static void FUNCC(pred8x8_tm_vp8)(uint8_t *p_src, int p_stride){
964
    pixel *src = (pixel*)p_src;
965
    int stride = p_stride>>(sizeof(pixel)-1);
966
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP - src[-1-stride];
967
    pixel *top = src-stride;
968
    int y;
969

    
970
    for (y = 0; y < 8; y++) {
971
        uint8_t *cm_in = cm + src[-1];
972
        src[0] = cm_in[top[0]];
973
        src[1] = cm_in[top[1]];
974
        src[2] = cm_in[top[2]];
975
        src[3] = cm_in[top[3]];
976
        src[4] = cm_in[top[4]];
977
        src[5] = cm_in[top[5]];
978
        src[6] = cm_in[top[6]];
979
        src[7] = cm_in[top[7]];
980
        src += stride;
981
    }
982
}
983

    
984
/*
 * Helper macros for the pred8x8l_* functions below. They all expand in a
 * scope that provides the locals `src` (pixel pointer to the block origin)
 * and `stride` (in pixels); the LOAD macros also read `has_topleft` /
 * `has_topright`.
 */

/* Sample at column x, row y relative to the block origin. */
#define SRC(x,y) src[(x)+(y)*stride]

/* Declares l<y>: (1,2,1)-filtered left neighbour of row y (rows 1..6). */
#define PL(y) \
    const int l##y = (SRC(-1,y-1) + 2*SRC(-1,y) + SRC(-1,y+1) + 2) >> 2;

/* Declares l0..l7. l0 substitutes SRC(-1,0) for the corner sample when
 * has_topleft is 0; l7 weights the last sample three times. */
#define PREDICT_8x8_LOAD_LEFT \
    const int l0 = ((has_topleft ? SRC(-1,-1) : SRC(-1,0)) \
                    + 2*SRC(-1,0) + SRC(-1,1) + 2) >> 2; \
    PL(1) PL(2) PL(3) PL(4) PL(5) PL(6) \
    const int l7 av_unused = (SRC(-1,6) + 3*SRC(-1,7) + 2) >> 2

/* Declares t<x>: (1,2,1)-filtered top neighbour of column x (columns 1..6). */
#define PT(x) \
    const int t##x = (SRC(x-1,-1) + 2*SRC(x,-1) + SRC(x+1,-1) + 2) >> 2;

/* Declares t0..t7. t0 substitutes SRC(0,-1) for the corner sample when
 * has_topleft is 0; t7 substitutes SRC(7,-1) for SRC(8,-1) when
 * has_topright is 0. */
#define PREDICT_8x8_LOAD_TOP \
    const int t0 = ((has_topleft ? SRC(-1,-1) : SRC(0,-1)) \
                    + 2*SRC(0,-1) + SRC(1,-1) + 2) >> 2; \
    PT(1) PT(2) PT(3) PT(4) PT(5) PT(6) \
    const int t7 av_unused = ((has_topright ? SRC(8,-1) : SRC(7,-1)) \
                    + 2*SRC(7,-1) + SRC(6,-1) + 2) >> 2

/* Assigns the already declared t<x> for a top-right column (8..14). */
#define PTR(x) \
    t##x = (SRC(x-1,-1) + 2*SRC(x,-1) + SRC(x+1,-1) + 2) >> 2;

/* Declares t8..t15: filtered top-right samples when has_topright is set,
 * otherwise all equal to the unfiltered SRC(7,-1). */
#define PREDICT_8x8_LOAD_TOPRIGHT \
    int t8, t9, t10, t11, t12, t13, t14, t15; \
    if(has_topright) { \
        PTR(8) PTR(9) PTR(10) PTR(11) PTR(12) PTR(13) PTR(14) \
        t15 = (SRC(14,-1) + 3*SRC(15,-1) + 2) >> 2; \
    } else t8=t9=t10=t11=t12=t13=t14=t15= SRC(7,-1);

/* Declares lt: (1,2,1)-filtered corner sample. */
#define PREDICT_8x8_LOAD_TOPLEFT \
    const int lt = (SRC(-1,0) + 2*SRC(-1,-1) + SRC(0,-1) + 2) >> 2

/* Fills the 8x8 block with the pixel4 value v; declares `y` and advances
 * `src` by eight rows. */
#define PREDICT_8x8_DC(v) \
    int y; \
    for( y = 0; y < 8; y++ ) { \
        ((pixel4*)src)[0] = \
        ((pixel4*)src)[1] = (v); \
        src += stride; \
    }
1021

    
1022
static void FUNCC(pred8x8l_128_dc)(uint8_t *p_src, int has_topleft, int has_topright, int p_stride)
1023
{
1024
    pixel *src = (pixel*)p_src;
1025
    int stride = p_stride>>(sizeof(pixel)-1);
1026

    
1027
    PREDICT_8x8_DC(PIXEL_SPLAT_X4(1<<(BIT_DEPTH-1)));
1028
}
1029
static void FUNCC(pred8x8l_left_dc)(uint8_t *p_src, int has_topleft, int has_topright, int p_stride)
1030
{
1031
    pixel *src = (pixel*)p_src;
1032
    int stride = p_stride>>(sizeof(pixel)-1);
1033

    
1034
    PREDICT_8x8_LOAD_LEFT;
1035
    const pixel4 dc = PIXEL_SPLAT_X4((l0+l1+l2+l3+l4+l5+l6+l7+4) >> 3);
1036
    PREDICT_8x8_DC(dc);
1037
}
1038
static void FUNCC(pred8x8l_top_dc)(uint8_t *p_src, int has_topleft, int has_topright, int p_stride)
1039
{
1040
    pixel *src = (pixel*)p_src;
1041
    int stride = p_stride>>(sizeof(pixel)-1);
1042

    
1043
    PREDICT_8x8_LOAD_TOP;
1044
    const pixel4 dc = PIXEL_SPLAT_X4((t0+t1+t2+t3+t4+t5+t6+t7+4) >> 3);
1045
    PREDICT_8x8_DC(dc);
1046
}
1047
static void FUNCC(pred8x8l_dc)(uint8_t *p_src, int has_topleft, int has_topright, int p_stride)
1048
{
1049
    pixel *src = (pixel*)p_src;
1050
    int stride = p_stride>>(sizeof(pixel)-1);
1051

    
1052
    PREDICT_8x8_LOAD_LEFT;
1053
    PREDICT_8x8_LOAD_TOP;
1054
    const pixel4 dc = PIXEL_SPLAT_X4((l0+l1+l2+l3+l4+l5+l6+l7
1055
                                     +t0+t1+t2+t3+t4+t5+t6+t7+8) >> 4);
1056
    PREDICT_8x8_DC(dc);
1057
}
1058
static void FUNCC(pred8x8l_horizontal)(uint8_t *p_src, int has_topleft, int has_topright, int p_stride)
1059
{
1060
    pixel *src = (pixel*)p_src;
1061
    int stride = p_stride>>(sizeof(pixel)-1);
1062

    
1063
    PREDICT_8x8_LOAD_LEFT;
1064
#define ROW(y) ((pixel4*)(src+y*stride))[0] =\
1065
               ((pixel4*)(src+y*stride))[1] = PIXEL_SPLAT_X4(l##y)
1066
    ROW(0); ROW(1); ROW(2); ROW(3); ROW(4); ROW(5); ROW(6); ROW(7);
1067
#undef ROW
1068
}
1069
static void FUNCC(pred8x8l_vertical)(uint8_t *p_src, int has_topleft, int has_topright, int p_stride)
1070
{
1071
    int y;
1072
    pixel *src = (pixel*)p_src;
1073
    int stride = p_stride>>(sizeof(pixel)-1);
1074

    
1075
    PREDICT_8x8_LOAD_TOP;
1076
    src[0] = t0;
1077
    src[1] = t1;
1078
    src[2] = t2;
1079
    src[3] = t3;
1080
    src[4] = t4;
1081
    src[5] = t5;
1082
    src[6] = t6;
1083
    src[7] = t7;
1084
    for( y = 1; y < 8; y++ ) {
1085
        ((pixel4*)(src+y*stride))[0] = ((pixel4*)src)[0];
1086
        ((pixel4*)(src+y*stride))[1] = ((pixel4*)src)[1];
1087
    }
1088
}
1089
static void FUNCC(pred8x8l_down_left)(uint8_t *p_src, int has_topleft, int has_topright, int p_stride)
1090
{
1091
    pixel *src = (pixel*)p_src;
1092
    int stride = p_stride>>(sizeof(pixel)-1);
1093
    PREDICT_8x8_LOAD_TOP;
1094
    PREDICT_8x8_LOAD_TOPRIGHT;
1095
    SRC(0,0)= (t0 + 2*t1 + t2 + 2) >> 2;
1096
    SRC(0,1)=SRC(1,0)= (t1 + 2*t2 + t3 + 2) >> 2;
1097
    SRC(0,2)=SRC(1,1)=SRC(2,0)= (t2 + 2*t3 + t4 + 2) >> 2;
1098
    SRC(0,3)=SRC(1,2)=SRC(2,1)=SRC(3,0)= (t3 + 2*t4 + t5 + 2) >> 2;
1099
    SRC(0,4)=SRC(1,3)=SRC(2,2)=SRC(3,1)=SRC(4,0)= (t4 + 2*t5 + t6 + 2) >> 2;
1100
    SRC(0,5)=SRC(1,4)=SRC(2,3)=SRC(3,2)=SRC(4,1)=SRC(5,0)= (t5 + 2*t6 + t7 + 2) >> 2;
1101
    SRC(0,6)=SRC(1,5)=SRC(2,4)=SRC(3,3)=SRC(4,2)=SRC(5,1)=SRC(6,0)= (t6 + 2*t7 + t8 + 2) >> 2;
1102
    SRC(0,7)=SRC(1,6)=SRC(2,5)=SRC(3,4)=SRC(4,3)=SRC(5,2)=SRC(6,1)=SRC(7,0)= (t7 + 2*t8 + t9 + 2) >> 2;
1103
    SRC(1,7)=SRC(2,6)=SRC(3,5)=SRC(4,4)=SRC(5,3)=SRC(6,2)=SRC(7,1)= (t8 + 2*t9 + t10 + 2) >> 2;
1104
    SRC(2,7)=SRC(3,6)=SRC(4,5)=SRC(5,4)=SRC(6,3)=SRC(7,2)= (t9 + 2*t10 + t11 + 2) >> 2;
1105
    SRC(3,7)=SRC(4,6)=SRC(5,5)=SRC(6,4)=SRC(7,3)= (t10 + 2*t11 + t12 + 2) >> 2;
1106
    SRC(4,7)=SRC(5,6)=SRC(6,5)=SRC(7,4)= (t11 + 2*t12 + t13 + 2) >> 2;
1107
    SRC(5,7)=SRC(6,6)=SRC(7,5)= (t12 + 2*t13 + t14 + 2) >> 2;
1108
    SRC(6,7)=SRC(7,6)= (t13 + 2*t14 + t15 + 2) >> 2;
1109
    SRC(7,7)= (t14 + 3*t15 + 2) >> 2;
1110
}
1111
static void FUNCC(pred8x8l_down_right)(uint8_t *p_src, int has_topleft, int has_topright, int p_stride)
1112
{
1113
    pixel *src = (pixel*)p_src;
1114
    int stride = p_stride>>(sizeof(pixel)-1);
1115
    PREDICT_8x8_LOAD_TOP;
1116
    PREDICT_8x8_LOAD_LEFT;
1117
    PREDICT_8x8_LOAD_TOPLEFT;
1118
    SRC(0,7)= (l7 + 2*l6 + l5 + 2) >> 2;
1119
    SRC(0,6)=SRC(1,7)= (l6 + 2*l5 + l4 + 2) >> 2;
1120
    SRC(0,5)=SRC(1,6)=SRC(2,7)= (l5 + 2*l4 + l3 + 2) >> 2;
1121
    SRC(0,4)=SRC(1,5)=SRC(2,6)=SRC(3,7)= (l4 + 2*l3 + l2 + 2) >> 2;
1122
    SRC(0,3)=SRC(1,4)=SRC(2,5)=SRC(3,6)=SRC(4,7)= (l3 + 2*l2 + l1 + 2) >> 2;
1123
    SRC(0,2)=SRC(1,3)=SRC(2,4)=SRC(3,5)=SRC(4,6)=SRC(5,7)= (l2 + 2*l1 + l0 + 2) >> 2;
1124
    SRC(0,1)=SRC(1,2)=SRC(2,3)=SRC(3,4)=SRC(4,5)=SRC(5,6)=SRC(6,7)= (l1 + 2*l0 + lt + 2) >> 2;
1125
    SRC(0,0)=SRC(1,1)=SRC(2,2)=SRC(3,3)=SRC(4,4)=SRC(5,5)=SRC(6,6)=SRC(7,7)= (l0 + 2*lt + t0 + 2) >> 2;
1126
    SRC(1,0)=SRC(2,1)=SRC(3,2)=SRC(4,3)=SRC(5,4)=SRC(6,5)=SRC(7,6)= (lt + 2*t0 + t1 + 2) >> 2;
1127
    SRC(2,0)=SRC(3,1)=SRC(4,2)=SRC(5,3)=SRC(6,4)=SRC(7,5)= (t0 + 2*t1 + t2 + 2) >> 2;
1128
    SRC(3,0)=SRC(4,1)=SRC(5,2)=SRC(6,3)=SRC(7,4)= (t1 + 2*t2 + t3 + 2) >> 2;
1129
    SRC(4,0)=SRC(5,1)=SRC(6,2)=SRC(7,3)= (t2 + 2*t3 + t4 + 2) >> 2;
1130
    SRC(5,0)=SRC(6,1)=SRC(7,2)= (t3 + 2*t4 + t5 + 2) >> 2;
1131
    SRC(6,0)=SRC(7,1)= (t4 + 2*t5 + t6 + 2) >> 2;
1132
    SRC(7,0)= (t5 + 2*t6 + t7 + 2) >> 2;
1133
}
1134
static void FUNCC(pred8x8l_vertical_right)(uint8_t *p_src, int has_topleft, int has_topright, int p_stride)
1135
{
1136
    pixel *src = (pixel*)p_src;
1137
    int stride = p_stride>>(sizeof(pixel)-1);
1138
    PREDICT_8x8_LOAD_TOP;
1139
    PREDICT_8x8_LOAD_LEFT;
1140
    PREDICT_8x8_LOAD_TOPLEFT;
1141
    SRC(0,6)= (l5 + 2*l4 + l3 + 2) >> 2;
1142
    SRC(0,7)= (l6 + 2*l5 + l4 + 2) >> 2;
1143
    SRC(0,4)=SRC(1,6)= (l3 + 2*l2 + l1 + 2) >> 2;
1144
    SRC(0,5)=SRC(1,7)= (l4 + 2*l3 + l2 + 2) >> 2;
1145
    SRC(0,2)=SRC(1,4)=SRC(2,6)= (l1 + 2*l0 + lt + 2) >> 2;
1146
    SRC(0,3)=SRC(1,5)=SRC(2,7)= (l2 + 2*l1 + l0 + 2) >> 2;
1147
    SRC(0,1)=SRC(1,3)=SRC(2,5)=SRC(3,7)= (l0 + 2*lt + t0 + 2) >> 2;
1148
    SRC(0,0)=SRC(1,2)=SRC(2,4)=SRC(3,6)= (lt + t0 + 1) >> 1;
1149
    SRC(1,1)=SRC(2,3)=SRC(3,5)=SRC(4,7)= (lt + 2*t0 + t1 + 2) >> 2;
1150
    SRC(1,0)=SRC(2,2)=SRC(3,4)=SRC(4,6)= (t0 + t1 + 1) >> 1;
1151
    SRC(2,1)=SRC(3,3)=SRC(4,5)=SRC(5,7)= (t0 + 2*t1 + t2 + 2) >> 2;
1152
    SRC(2,0)=SRC(3,2)=SRC(4,4)=SRC(5,6)= (t1 + t2 + 1) >> 1;
1153
    SRC(3,1)=SRC(4,3)=SRC(5,5)=SRC(6,7)= (t1 + 2*t2 + t3 + 2) >> 2;
1154
    SRC(3,0)=SRC(4,2)=SRC(5,4)=SRC(6,6)= (t2 + t3 + 1) >> 1;
1155
    SRC(4,1)=SRC(5,3)=SRC(6,5)=SRC(7,7)= (t2 + 2*t3 + t4 + 2) >> 2;
1156
    SRC(4,0)=SRC(5,2)=SRC(6,4)=SRC(7,6)= (t3 + t4 + 1) >> 1;
1157
    SRC(5,1)=SRC(6,3)=SRC(7,5)= (t3 + 2*t4 + t5 + 2) >> 2;
1158
    SRC(5,0)=SRC(6,2)=SRC(7,4)= (t4 + t5 + 1) >> 1;
1159
    SRC(6,1)=SRC(7,3)= (t4 + 2*t5 + t6 + 2) >> 2;
1160
    SRC(6,0)=SRC(7,2)= (t5 + t6 + 1) >> 1;
1161
    SRC(7,1)= (t5 + 2*t6 + t7 + 2) >> 2;
1162
    SRC(7,0)= (t6 + t7 + 1) >> 1;
1163
}
1164
static void FUNCC(pred8x8l_horizontal_down)(uint8_t *p_src, int has_topleft, int has_topright, int p_stride)
1165
{
1166
    pixel *src = (pixel*)p_src;
1167
    int stride = p_stride>>(sizeof(pixel)-1);
1168
    PREDICT_8x8_LOAD_TOP;
1169
    PREDICT_8x8_LOAD_LEFT;
1170
    PREDICT_8x8_LOAD_TOPLEFT;
1171
    SRC(0,7)= (l6 + l7 + 1) >> 1;
1172
    SRC(1,7)= (l5 + 2*l6 + l7 + 2) >> 2;
1173
    SRC(0,6)=SRC(2,7)= (l5 + l6 + 1) >> 1;
1174
    SRC(1,6)=SRC(3,7)= (l4 + 2*l5 + l6 + 2) >> 2;
1175
    SRC(0,5)=SRC(2,6)=SRC(4,7)= (l4 + l5 + 1) >> 1;
1176
    SRC(1,5)=SRC(3,6)=SRC(5,7)= (l3 + 2*l4 + l5 + 2) >> 2;
1177
    SRC(0,4)=SRC(2,5)=SRC(4,6)=SRC(6,7)= (l3 + l4 + 1) >> 1;
1178
    SRC(1,4)=SRC(3,5)=SRC(5,6)=SRC(7,7)= (l2 + 2*l3 + l4 + 2) >> 2;
1179
    SRC(0,3)=SRC(2,4)=SRC(4,5)=SRC(6,6)= (l2 + l3 + 1) >> 1;
1180
    SRC(1,3)=SRC(3,4)=SRC(5,5)=SRC(7,6)= (l1 + 2*l2 + l3 + 2) >> 2;
1181
    SRC(0,2)=SRC(2,3)=SRC(4,4)=SRC(6,5)= (l1 + l2 + 1) >> 1;
1182
    SRC(1,2)=SRC(3,3)=SRC(5,4)=SRC(7,5)= (l0 + 2*l1 + l2 + 2) >> 2;
1183
    SRC(0,1)=SRC(2,2)=SRC(4,3)=SRC(6,4)= (l0 + l1 + 1) >> 1;
1184
    SRC(1,1)=SRC(3,2)=SRC(5,3)=SRC(7,4)= (lt + 2*l0 + l1 + 2) >> 2;
1185
    SRC(0,0)=SRC(2,1)=SRC(4,2)=SRC(6,3)= (lt + l0 + 1) >> 1;
1186
    SRC(1,0)=SRC(3,1)=SRC(5,2)=SRC(7,3)= (l0 + 2*lt + t0 + 2) >> 2;
1187
    SRC(2,0)=SRC(4,1)=SRC(6,2)= (t1 + 2*t0 + lt + 2) >> 2;
1188
    SRC(3,0)=SRC(5,1)=SRC(7,2)= (t2 + 2*t1 + t0 + 2) >> 2;
1189
    SRC(4,0)=SRC(6,1)= (t3 + 2*t2 + t1 + 2) >> 2;
1190
    SRC(5,0)=SRC(7,1)= (t4 + 2*t3 + t2 + 2) >> 2;
1191
    SRC(6,0)= (t5 + 2*t4 + t3 + 2) >> 2;
1192
    SRC(7,0)= (t6 + 2*t5 + t4 + 2) >> 2;
1193
}
1194
static void FUNCC(pred8x8l_vertical_left)(uint8_t *p_src, int has_topleft, int has_topright, int p_stride)
1195
{
1196
    pixel *src = (pixel*)p_src;
1197
    int stride = p_stride>>(sizeof(pixel)-1);
1198
    PREDICT_8x8_LOAD_TOP;
1199
    PREDICT_8x8_LOAD_TOPRIGHT;
1200
    SRC(0,0)= (t0 + t1 + 1) >> 1;
1201
    SRC(0,1)= (t0 + 2*t1 + t2 + 2) >> 2;
1202
    SRC(0,2)=SRC(1,0)= (t1 + t2 + 1) >> 1;
1203
    SRC(0,3)=SRC(1,1)= (t1 + 2*t2 + t3 + 2) >> 2;
1204
    SRC(0,4)=SRC(1,2)=SRC(2,0)= (t2 + t3 + 1) >> 1;
1205
    SRC(0,5)=SRC(1,3)=SRC(2,1)= (t2 + 2*t3 + t4 + 2) >> 2;
1206
    SRC(0,6)=SRC(1,4)=SRC(2,2)=SRC(3,0)= (t3 + t4 + 1) >> 1;
1207
    SRC(0,7)=SRC(1,5)=SRC(2,3)=SRC(3,1)= (t3 + 2*t4 + t5 + 2) >> 2;
1208
    SRC(1,6)=SRC(2,4)=SRC(3,2)=SRC(4,0)= (t4 + t5 + 1) >> 1;
1209
    SRC(1,7)=SRC(2,5)=SRC(3,3)=SRC(4,1)= (t4 + 2*t5 + t6 + 2) >> 2;
1210
    SRC(2,6)=SRC(3,4)=SRC(4,2)=SRC(5,0)= (t5 + t6 + 1) >> 1;
1211
    SRC(2,7)=SRC(3,5)=SRC(4,3)=SRC(5,1)= (t5 + 2*t6 + t7 + 2) >> 2;
1212
    SRC(3,6)=SRC(4,4)=SRC(5,2)=SRC(6,0)= (t6 + t7 + 1) >> 1;
1213
    SRC(3,7)=SRC(4,5)=SRC(5,3)=SRC(6,1)= (t6 + 2*t7 + t8 + 2) >> 2;
1214
    SRC(4,6)=SRC(5,4)=SRC(6,2)=SRC(7,0)= (t7 + t8 + 1) >> 1;
1215
    SRC(4,7)=SRC(5,5)=SRC(6,3)=SRC(7,1)= (t7 + 2*t8 + t9 + 2) >> 2;
1216
    SRC(5,6)=SRC(6,4)=SRC(7,2)= (t8 + t9 + 1) >> 1;
1217
    SRC(5,7)=SRC(6,5)=SRC(7,3)= (t8 + 2*t9 + t10 + 2) >> 2;
1218
    SRC(6,6)=SRC(7,4)= (t9 + t10 + 1) >> 1;
1219
    SRC(6,7)=SRC(7,5)= (t9 + 2*t10 + t11 + 2) >> 2;
1220
    SRC(7,6)= (t10 + t11 + 1) >> 1;
1221
    SRC(7,7)= (t10 + 2*t11 + t12 + 2) >> 2;
1222
}
1223
static void FUNCC(pred8x8l_horizontal_up)(uint8_t *p_src, int has_topleft, int has_topright, int p_stride)
1224
{
1225
    pixel *src = (pixel*)p_src;
1226
    int stride = p_stride>>(sizeof(pixel)-1);
1227
    PREDICT_8x8_LOAD_LEFT;
1228
    SRC(0,0)= (l0 + l1 + 1) >> 1;
1229
    SRC(1,0)= (l0 + 2*l1 + l2 + 2) >> 2;
1230
    SRC(0,1)=SRC(2,0)= (l1 + l2 + 1) >> 1;
1231
    SRC(1,1)=SRC(3,0)= (l1 + 2*l2 + l3 + 2) >> 2;
1232
    SRC(0,2)=SRC(2,1)=SRC(4,0)= (l2 + l3 + 1) >> 1;
1233
    SRC(1,2)=SRC(3,1)=SRC(5,0)= (l2 + 2*l3 + l4 + 2) >> 2;
1234
    SRC(0,3)=SRC(2,2)=SRC(4,1)=SRC(6,0)= (l3 + l4 + 1) >> 1;
1235
    SRC(1,3)=SRC(3,2)=SRC(5,1)=SRC(7,0)= (l3 + 2*l4 + l5 + 2) >> 2;
1236
    SRC(0,4)=SRC(2,3)=SRC(4,2)=SRC(6,1)= (l4 + l5 + 1) >> 1;
1237
    SRC(1,4)=SRC(3,3)=SRC(5,2)=SRC(7,1)= (l4 + 2*l5 + l6 + 2) >> 2;
1238
    SRC(0,5)=SRC(2,4)=SRC(4,3)=SRC(6,2)= (l5 + l6 + 1) >> 1;
1239
    SRC(1,5)=SRC(3,4)=SRC(5,3)=SRC(7,2)= (l5 + 2*l6 + l7 + 2) >> 2;
1240
    SRC(0,6)=SRC(2,5)=SRC(4,4)=SRC(6,3)= (l6 + l7 + 1) >> 1;
1241
    SRC(1,6)=SRC(3,5)=SRC(5,4)=SRC(7,3)= (l6 + 3*l7 + 2) >> 2;
1242
    SRC(0,7)=SRC(1,7)=SRC(2,6)=SRC(2,7)=SRC(3,6)=
1243
    SRC(3,7)=SRC(4,5)=SRC(4,6)=SRC(4,7)=SRC(5,5)=
1244
    SRC(5,6)=SRC(5,7)=SRC(6,4)=SRC(6,5)=SRC(6,6)=
1245
    SRC(6,7)=SRC(7,4)=SRC(7,5)=SRC(7,6)=SRC(7,7)= l7;
1246
}
1247
/* The pred8x8l helper macros are no longer needed past this point. */
#undef SRC
#undef PL
#undef PT
#undef PTR
#undef PREDICT_8x8_DC
#undef PREDICT_8x8_LOAD_LEFT
#undef PREDICT_8x8_LOAD_TOP
#undef PREDICT_8x8_LOAD_TOPLEFT
#undef PREDICT_8x8_LOAD_TOPRIGHT
1256

    
1257
static void FUNCC(pred4x4_vertical_add)(uint8_t *p_pix, const DCTELEM *p_block, int stride){
1258
    int i;
1259
    pixel *pix = (pixel*)p_pix;
1260
    const dctcoef *block = (const dctcoef*)p_block;
1261
    stride >>= sizeof(pixel)-1;
1262
    pix -= stride;
1263
    for(i=0; i<4; i++){
1264
        pixel v = pix[0];
1265
        pix[1*stride]= v += block[0];
1266
        pix[2*stride]= v += block[4];
1267
        pix[3*stride]= v += block[8];
1268
        pix[4*stride]= v +  block[12];
1269
        pix++;
1270
        block++;
1271
    }
1272
}
1273

    
1274
static void FUNCC(pred4x4_horizontal_add)(uint8_t *p_pix, const DCTELEM *p_block, int stride){
1275
    int i;
1276
    pixel *pix = (pixel*)p_pix;
1277
    const dctcoef *block = (const dctcoef*)p_block;
1278
    stride >>= sizeof(pixel)-1;
1279
    for(i=0; i<4; i++){
1280
        pixel v = pix[-1];
1281
        pix[0]= v += block[0];
1282
        pix[1]= v += block[1];
1283
        pix[2]= v += block[2];
1284
        pix[3]= v +  block[3];
1285
        pix+= stride;
1286
        block+= 4;
1287
    }
1288
}
1289

    
1290
static void FUNCC(pred8x8l_vertical_add)(uint8_t *p_pix, const DCTELEM *p_block, int stride){
1291
    int i;
1292
    pixel *pix = (pixel*)p_pix;
1293
    const dctcoef *block = (const dctcoef*)p_block;
1294
    stride >>= sizeof(pixel)-1;
1295
    pix -= stride;
1296
    for(i=0; i<8; i++){
1297
        pixel v = pix[0];
1298
        pix[1*stride]= v += block[0];
1299
        pix[2*stride]= v += block[8];
1300
        pix[3*stride]= v += block[16];
1301
        pix[4*stride]= v += block[24];
1302
        pix[5*stride]= v += block[32];
1303
        pix[6*stride]= v += block[40];
1304
        pix[7*stride]= v += block[48];
1305
        pix[8*stride]= v +  block[56];
1306
        pix++;
1307
        block++;
1308
    }
1309
}
1310

    
1311
static void FUNCC(pred8x8l_horizontal_add)(uint8_t *p_pix, const DCTELEM *p_block, int stride){
1312
    int i;
1313
    pixel *pix = (pixel*)p_pix;
1314
    const dctcoef *block = (const dctcoef*)p_block;
1315
    stride >>= sizeof(pixel)-1;
1316
    for(i=0; i<8; i++){
1317
        pixel v = pix[-1];
1318
        pix[0]= v += block[0];
1319
        pix[1]= v += block[1];
1320
        pix[2]= v += block[2];
1321
        pix[3]= v += block[3];
1322
        pix[4]= v += block[4];
1323
        pix[5]= v += block[5];
1324
        pix[6]= v += block[6];
1325
        pix[7]= v +  block[7];
1326
        pix+= stride;
1327
        block+= 8;
1328
    }
1329
}
1330

    
1331
static void FUNCC(pred16x16_vertical_add)(uint8_t *pix, const int *block_offset, const DCTELEM *block, int stride){
1332
    int i;
1333
    for(i=0; i<16; i++)
1334
        FUNCC(pred4x4_vertical_add)(pix + block_offset[i], block + i*16*sizeof(pixel), stride);
1335
}
1336

    
1337
static void FUNCC(pred16x16_horizontal_add)(uint8_t *pix, const int *block_offset, const DCTELEM *block, int stride){
1338
    int i;
1339
    for(i=0; i<16; i++)
1340
        FUNCC(pred4x4_horizontal_add)(pix + block_offset[i], block + i*16*sizeof(pixel), stride);
1341
}
1342

    
1343
static void FUNCC(pred8x8_vertical_add)(uint8_t *pix, const int *block_offset, const DCTELEM *block, int stride){
1344
    int i;
1345
    for(i=0; i<4; i++)
1346
        FUNCC(pred4x4_vertical_add)(pix + block_offset[i], block + i*16*sizeof(pixel), stride);
1347
}
1348

    
1349
static void FUNCC(pred8x8_horizontal_add)(uint8_t *pix, const int *block_offset, const DCTELEM *block, int stride){
1350
    int i;
1351
    for(i=0; i<4; i++)
1352
        FUNCC(pred4x4_horizontal_add)(pix + block_offset[i], block + i*16*sizeof(pixel), stride);
1353
}