Statistics
| Branch: | Revision:

ffmpeg / libavcodec / h264pred_template.c @ d2bf4289

History | View | Annotate | Download (33 KB)

1
/*
2
 * H.26L/H.264/AVC/JVT/14496-10/... encoder/decoder
3
 * Copyright (c) 2003-2011 Michael Niedermayer <michaelni@gmx.at>
4
 *
5
 * This file is part of Libav.
6
 *
7
 * Libav is free software; you can redistribute it and/or
8
 * modify it under the terms of the GNU Lesser General Public
9
 * License as published by the Free Software Foundation; either
10
 * version 2.1 of the License, or (at your option) any later version.
11
 *
12
 * Libav is distributed in the hope that it will be useful,
13
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
15
 * Lesser General Public License for more details.
16
 *
17
 * You should have received a copy of the GNU Lesser General Public
18
 * License along with Libav; if not, write to the Free Software
19
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
20
 */
21

    
22
/**
23
 * @file
24
 * H.264 / AVC / MPEG4 part10 prediction functions.
25
 * @author Michael Niedermayer <michaelni@gmx.at>
26
 */
27

    
28
#include "mathops.h"
29
#include "high_bit_depth.h"
30

    
31
/**
 * Fill a 4x4 block by repeating the four pixels found one stride before
 * src in every row of the block.
 *
 * @param _src     start of the block, reinterpreted as pixel*
 * @param topright not read by this function
 * @param _stride  row pitch; divided by sizeof(pixel) to index in pixels
 */
static void FUNCC(pred4x4_vertical)(uint8_t *_src, const uint8_t *topright, int _stride){
    pixel *src = (pixel*)_src;
    int stride = _stride/sizeof(pixel);
    const pixel4 above = AV_RN4PA(src-stride);
    int row;

    for(row=0; row<4; row++)
        AV_WN4PA(src+row*stride, above);
}
41

    
42
/**
 * Fill each row of a 4x4 block with the pixel stored immediately before
 * that row (index -1 relative to the row start).
 *
 * @param _src     start of the block, reinterpreted as pixel*
 * @param topright not read by this function
 * @param _stride  row pitch; divided by sizeof(pixel) to index in pixels
 */
static void FUNCC(pred4x4_horizontal)(uint8_t *_src, const uint8_t *topright, int _stride){
    pixel *src = (pixel*)_src;
    int stride = _stride/sizeof(pixel);
    int row;

    for(row=0; row<4; row++){
        pixel *line = src + row*stride;
        AV_WN4PA(line, PIXEL_SPLAT_X4(line[-1]));
    }
}
50

    
51
/**
 * Fill a 4x4 block with the rounded mean of its eight neighbours: the four
 * pixels one stride above and the four pixels one column to the left.
 *
 * @param _src     start of the block, reinterpreted as pixel*
 * @param topright not read by this function
 * @param _stride  row pitch; divided by sizeof(pixel) to index in pixels
 */
static void FUNCC(pred4x4_dc)(uint8_t *_src, const uint8_t *topright, int _stride){
    pixel *src = (pixel*)_src;
    int stride = _stride/sizeof(pixel);
    int sum = 4;                /* rounding term for the >>3 below */
    int k;
    pixel4 fill;

    for(k=0; k<4; k++)
        sum += src[k-stride] + src[-1+k*stride];
    sum >>= 3;
    fill = PIXEL_SPLAT_X4(sum);

    for(k=0; k<4; k++)
        AV_WN4PA(src+k*stride, fill);
}
63

    
64
/**
 * Fill a 4x4 block with the rounded mean of the four pixels in the column
 * immediately to its left.
 *
 * @param _src     start of the block, reinterpreted as pixel*
 * @param topright not read by this function
 * @param _stride  row pitch; divided by sizeof(pixel) to index in pixels
 */
static void FUNCC(pred4x4_left_dc)(uint8_t *_src, const uint8_t *topright, int _stride){
    pixel *src = (pixel*)_src;
    int stride = _stride/sizeof(pixel);
    int sum = 2;                /* rounding term for the >>2 below */
    int k;
    pixel4 fill;

    for(k=0; k<4; k++)
        sum += src[-1+k*stride];
    sum >>= 2;
    fill = PIXEL_SPLAT_X4(sum);

    for(k=0; k<4; k++)
        AV_WN4PA(src+k*stride, fill);
}
75

    
76
/**
 * Fill a 4x4 block with the rounded mean of the four pixels one stride
 * above it.
 *
 * @param _src     start of the block, reinterpreted as pixel*
 * @param topright not read by this function
 * @param _stride  row pitch; divided by sizeof(pixel) to index in pixels
 */
static void FUNCC(pred4x4_top_dc)(uint8_t *_src, const uint8_t *topright, int _stride){
    pixel *src = (pixel*)_src;
    int stride = _stride/sizeof(pixel);
    int sum = 2;                /* rounding term for the >>2 below */
    int k;
    pixel4 fill;

    for(k=0; k<4; k++)
        sum += src[k-stride];
    sum >>= 2;
    fill = PIXEL_SPLAT_X4(sum);

    for(k=0; k<4; k++)
        AV_WN4PA(src+k*stride, fill);
}
87

    
88
/**
 * Fill a 4x4 block with the constant 1<<(BIT_DEPTH-1); no neighbouring
 * pixels are read.
 *
 * @param _src     start of the block, reinterpreted as pixel*
 * @param topright not read by this function
 * @param _stride  row pitch; divided by sizeof(pixel) to index in pixels
 */
static void FUNCC(pred4x4_128_dc)(uint8_t *_src, const uint8_t *topright, int _stride){
    pixel *src = (pixel*)_src;
    int stride = _stride/sizeof(pixel);
    const pixel4 fill = PIXEL_SPLAT_X4(1<<(BIT_DEPTH-1));
    int row;

    for(row=0; row<4; row++)
        AV_WN4PA(src+row*stride, fill);
}
98

    
99
/**
 * Fill a 4x4 block with the constant (1<<(BIT_DEPTH-1))-1; no neighbouring
 * pixels are read.
 *
 * @param _src     start of the block, reinterpreted as pixel*
 * @param topright not read by this function
 * @param _stride  row pitch; divided by sizeof(pixel) to index in pixels
 */
static void FUNCC(pred4x4_127_dc)(uint8_t *_src, const uint8_t *topright, int _stride){
    pixel *src = (pixel*)_src;
    int stride = _stride/sizeof(pixel);
    const pixel4 fill = PIXEL_SPLAT_X4((1<<(BIT_DEPTH-1))-1);
    int row;

    for(row=0; row<4; row++)
        AV_WN4PA(src+row*stride, fill);
}
109

    
110
/**
 * Fill a 4x4 block with the constant (1<<(BIT_DEPTH-1))+1; no neighbouring
 * pixels are read.
 *
 * @param _src     start of the block, reinterpreted as pixel*
 * @param topright not read by this function
 * @param _stride  row pitch; divided by sizeof(pixel) to index in pixels
 */
static void FUNCC(pred4x4_129_dc)(uint8_t *_src, const uint8_t *topright, int _stride){
    pixel *src = (pixel*)_src;
    int stride = _stride/sizeof(pixel);
    const pixel4 fill = PIXEL_SPLAT_X4((1<<(BIT_DEPTH-1))+1);
    int row;

    for(row=0; row<4; row++)
        AV_WN4PA(src+row*stride, fill);
}
120

    
121

    
122
/*
 * Neighbour-loading helpers for the 4x4 predictors.  Each macro expands to
 * four `const int` declarations and expects `src` (pixel*) and `stride`
 * (in pixels) to be in scope; LOAD_TOP_RIGHT_EDGE expects `topright`
 * (pixel*) instead.  Every local is tagged av_unused so a predictor may use
 * only some of them.
 */

/* t4..t7: the four pixels starting at topright. */
#define LOAD_TOP_RIGHT_EDGE\
    const int av_unused t4= topright[0];\
    const int av_unused t5= topright[1];\
    const int av_unused t6= topright[2];\
    const int av_unused t7= topright[3];

/* l4..l7: the left-column pixels of rows 4..7 relative to src. */
#define LOAD_DOWN_LEFT_EDGE\
    const int av_unused l4= src[-1+4*stride];\
    const int av_unused l5= src[-1+5*stride];\
    const int av_unused l6= src[-1+6*stride];\
    const int av_unused l7= src[-1+7*stride];

/* l0..l3: the left-column pixels of rows 0..3 relative to src. */
#define LOAD_LEFT_EDGE\
    const int av_unused l0= src[-1+0*stride];\
    const int av_unused l1= src[-1+1*stride];\
    const int av_unused l2= src[-1+2*stride];\
    const int av_unused l3= src[-1+3*stride];

/* t0..t3: the four pixels one stride above src. */
#define LOAD_TOP_EDGE\
    const int av_unused t0= src[ 0-1*stride];\
    const int av_unused t1= src[ 1-1*stride];\
    const int av_unused t2= src[ 2-1*stride];\
    const int av_unused t3= src[ 3-1*stride];
145

    
146
/**
 * 4x4 prediction where every pixel on a diagonal x-y == const receives the
 * same (a + 2*b + c + 2)>>2 filtered value, taken from three consecutive
 * neighbours along the left column, the top-left corner and the top row.
 *
 * @param _src     start of the block, reinterpreted as pixel*
 * @param topright not read by this function
 * @param _stride  row pitch; divided by sizeof(pixel) to index in pixels
 */
static void FUNCC(pred4x4_down_right)(uint8_t *_src, const uint8_t *topright, int _stride){
    pixel *src = (pixel*)_src;
    int stride = _stride/sizeof(pixel);
    const int lt= src[-1-1*stride];
    LOAD_TOP_EDGE
    LOAD_LEFT_EDGE
    /* neighbours ordered from bottom-left, through the corner, to top-right */
    const int edge[9] = { l3, l2, l1, l0, lt, t0, t1, t2, t3 };
    int x, y;

    /* all inputs were captured above, so the write order is irrelevant */
    for(y=0; y<4; y++){
        for(x=0; x<4; x++){
            const int *e = edge + (x - y + 3);
            src[x+y*stride] = (e[0] + 2*e[1] + e[2] + 2)>>2;
        }
    }
}
170

    
171
/**
 * 4x4 prediction where every pixel on an anti-diagonal x+y == const
 * receives the same (a + 2*b + c + 2)>>2 filtered value built from the top
 * row (t0..t3) and the four pixels at _topright (t4..t7).
 *
 * @param _src      start of the block, reinterpreted as pixel*
 * @param _topright start of the four top-right pixels, reinterpreted as pixel*
 * @param _stride   row pitch; divided by sizeof(pixel) to index in pixels
 */
static void FUNCC(pred4x4_down_left)(uint8_t *_src, const uint8_t *_topright, int _stride){
    pixel *src = (pixel*)_src;
    const pixel *topright = (const pixel*)_topright;
    int stride = _stride/sizeof(pixel);
    LOAD_TOP_EDGE
    LOAD_TOP_RIGHT_EDGE
    /* t7 is repeated so the last diagonal evaluates to (t6 + 3*t7 + 2)>>2 */
    const int edge[9] = { t0, t1, t2, t3, t4, t5, t6, t7, t7 };
    int x, y;

    for(y=0; y<4; y++){
        for(x=0; x<4; x++){
            const int *e = edge + (x + y);
            src[x+y*stride] = (e[0] + 2*e[1] + e[2] + 2)>>2;
        }
    }
}
196

    
197
/**
 * 4x4 prediction in which the value of pixel (x,y) depends only on 2*x-y.
 * The ten distinct values alternate between two-tap averages and
 * three-tap (a + 2*b + c + 2)>>2 filters of the left/corner/top neighbours.
 *
 * @param _src     start of the block, reinterpreted as pixel*
 * @param topright not read by this function
 * @param _stride  row pitch; divided by sizeof(pixel) to index in pixels
 */
static void FUNCC(pred4x4_vertical_right)(uint8_t *_src, const uint8_t *topright, int _stride){
    pixel *src = (pixel*)_src;
    int stride = _stride/sizeof(pixel);
    const int lt= src[-1-1*stride];
    LOAD_TOP_EDGE
    LOAD_LEFT_EDGE
    /* indexed by 2*x - y + 3 */
    const int val[10] = {
        (l0 + 2*l1 + l2 + 2)>>2,
        (lt + 2*l0 + l1 + 2)>>2,
        (l0 + 2*lt + t0 + 2)>>2,
        (lt + t0 + 1)>>1,
        (lt + 2*t0 + t1 + 2)>>2,
        (t0 + t1 + 1)>>1,
        (t0 + 2*t1 + t2 + 2)>>2,
        (t1 + t2 + 1)>>1,
        (t1 + 2*t2 + t3 + 2)>>2,
        (t2 + t3 + 1)>>1,
    };
    int x, y;

    for(y=0; y<4; y++)
        for(x=0; x<4; x++)
            src[x+y*stride] = val[2*x - y + 3];
}
221

    
222
/**
 * 4x4 prediction built from the top row (t0..t3) and the pixels at
 * _topright (t4..t6 are used).  Even rows hold two-tap averages, odd rows
 * hold three-tap (a + 2*b + c + 2)>>2 filters; each pair of rows starts one
 * sample further to the right than the pair above.
 *
 * @param _src      start of the block, reinterpreted as pixel*
 * @param _topright start of the top-right pixels, reinterpreted as pixel*
 * @param _stride   row pitch; divided by sizeof(pixel) to index in pixels
 */
static void FUNCC(pred4x4_vertical_left)(uint8_t *_src, const uint8_t *_topright, int _stride){
    pixel *src = (pixel*)_src;
    const pixel *topright = (const pixel*)_topright;
    int stride = _stride/sizeof(pixel);
    LOAD_TOP_EDGE
    LOAD_TOP_RIGHT_EDGE
    const int edge[7] = { t0, t1, t2, t3, t4, t5, t6 };
    int x, y;

    for(y=0; y<4; y++){
        const int *e = edge + (y>>1);
        for(x=0; x<4; x++, e++){
            if(y&1)
                src[x+y*stride] = (e[0] + 2*e[1] + e[2] + 2)>>2;
            else
                src[x+y*stride] = (e[0] + e[1] + 1)>>1;
        }
    }
}
246

    
247
/**
 * 4x4 prediction from the left column only (l0..l3).  The value of pixel
 * (x,y) depends only on x+2*y; positions past the available left samples
 * are filled with l3.
 *
 * @param _src     start of the block, reinterpreted as pixel*
 * @param topright not read by this function
 * @param _stride  row pitch; divided by sizeof(pixel) to index in pixels
 */
static void FUNCC(pred4x4_horizontal_up)(uint8_t *_src, const uint8_t *topright, int _stride){
    pixel *src = (pixel*)_src;
    int stride = _stride/sizeof(pixel);
    LOAD_LEFT_EDGE
    /* indexed by x + 2*y */
    const int val[10] = {
        (l0 + l1 + 1)>>1,
        (l0 + 2*l1 + l2 + 2)>>2,
        (l1 + l2 + 1)>>1,
        (l1 + 2*l2 + l3 + 2)>>2,
        (l2 + l3 + 1)>>1,
        (l2 + 2*l3 + l3 + 2)>>2,
        l3, l3, l3, l3,
    };
    int x, y;

    for(y=0; y<4; y++)
        for(x=0; x<4; x++)
            src[x+y*stride] = val[x + 2*y];
}
269

    
270
/**
 * 4x4 prediction in which the value of pixel (x,y) depends only on
 * x - 2*y.  The ten distinct values are two-tap averages and three-tap
 * (a + 2*b + c + 2)>>2 filters of the left/corner/top neighbours.
 *
 * @param _src     start of the block, reinterpreted as pixel*
 * @param topright not read by this function
 * @param _stride  row pitch; divided by sizeof(pixel) to index in pixels
 */
static void FUNCC(pred4x4_horizontal_down)(uint8_t *_src, const uint8_t *topright, int _stride){
    pixel *src = (pixel*)_src;
    int stride = _stride/sizeof(pixel);
    const int lt= src[-1-1*stride];
    LOAD_TOP_EDGE
    LOAD_LEFT_EDGE
    /* indexed by x + 2*(3 - y) */
    const int val[10] = {
        (l2 + l3 + 1)>>1,
        (l1 + 2*l2 + l3 + 2)>>2,
        (l1 + l2+ 1)>>1,
        (l0 + 2*l1 + l2 + 2)>>2,
        (l0 + l1 + 1)>>1,
        (lt + 2*l0 + l1 + 2)>>2,
        (lt + l0 + 1)>>1,
        (l0 + 2*lt + t0 + 2)>>2,
        (lt + 2*t0 + t1 + 2)>>2,
        (t0 + 2*t1 + t2 + 2)>>2,
    };
    int x, y;

    for(y=0; y<4; y++)
        for(x=0; x<4; x++)
            src[x+y*stride] = val[x + 2*(3 - y)];
}
294

    
295
/**
 * Fill a 16x16 block by repeating the sixteen pixels found one stride
 * before src (read as four pixel4 groups) in each of the 16 rows.
 *
 * @param _src    start of the block, reinterpreted as pixel*
 * @param _stride row pitch; divided by sizeof(pixel) to index in pixels
 */
static void FUNCC(pred16x16_vertical)(uint8_t *_src, int _stride){
    int row;
    pixel *src = (pixel*)_src;
    int stride = _stride/sizeof(pixel);
    const pixel4 q0 = AV_RN4PA(((pixel4*)(src-stride))+0);
    const pixel4 q1 = AV_RN4PA(((pixel4*)(src-stride))+1);
    const pixel4 q2 = AV_RN4PA(((pixel4*)(src-stride))+2);
    const pixel4 q3 = AV_RN4PA(((pixel4*)(src-stride))+3);

    for(row=0; row<16; row++, src+=stride){
        pixel4 *line = (pixel4*)src;
        AV_WN4PA(line+0, q0);
        AV_WN4PA(line+1, q1);
        AV_WN4PA(line+2, q2);
        AV_WN4PA(line+3, q3);
    }
}
311

    
312
/**
 * Fill each of the 16 rows of a 16x16 block with the pixel stored
 * immediately before that row (index -1 relative to the row start).
 *
 * @param _src   start of the block, reinterpreted as pixel*
 * @param stride row pitch; divided by sizeof(pixel) to index in pixels
 */
static void FUNCC(pred16x16_horizontal)(uint8_t *_src, int stride){
    int row;
    pixel *src = (pixel*)_src;
    stride /= sizeof(pixel);

    for(row=0; row<16; row++, src+=stride){
        const pixel4 fill = PIXEL_SPLAT_X4(src[-1]);
        pixel4 *line = (pixel4*)src;

        AV_WN4PA(line+0, fill);
        AV_WN4PA(line+1, fill);
        AV_WN4PA(line+2, fill);
        AV_WN4PA(line+3, fill);
    }
}
326

    
327
/*
 * Write the pixel4 value v across 16 rows of 16 pixels each.
 * Expects `i` (int), `src` (pixel*) and `stride` (in pixels) in scope;
 * src is advanced by 16 rows as a side effect.
 */
#define PREDICT_16x16_DC(v)\
    for(i=0; i<16; i++, src += stride){\
        AV_WN4PA(src+ 0, v);\
        AV_WN4PA(src+ 4, v);\
        AV_WN4PA(src+ 8, v);\
        AV_WN4PA(src+12, v);\
    }
335

    
336
/**
 * Fill a 16x16 block with the rounded mean of its 32 neighbours: the
 * sixteen pixels in the column to the left and the sixteen pixels one
 * stride above.
 *
 * @param _src   start of the block, reinterpreted as pixel*
 * @param stride row pitch; divided by sizeof(pixel) to index in pixels
 */
static void FUNCC(pred16x16_dc)(uint8_t *_src, int stride){
    int i, sum=0;
    pixel *src = (pixel*)_src;
    pixel4 fill;
    stride /= sizeof(pixel);

    /* left column and top row accumulated in a single pass */
    for(i=0; i<16; i++)
        sum += src[-1+i*stride] + src[i-stride];

    fill = PIXEL_SPLAT_X4((sum+16)>>5);
    PREDICT_16x16_DC(fill);
}
353

    
354
/**
 * Fill a 16x16 block with the rounded mean of the sixteen pixels in the
 * column immediately to its left.
 *
 * @param _src   start of the block, reinterpreted as pixel*
 * @param stride row pitch; divided by sizeof(pixel) to index in pixels
 */
static void FUNCC(pred16x16_left_dc)(uint8_t *_src, int stride){
    int i, sum=0;
    pixel *src = (pixel*)_src;
    pixel4 fill;
    stride /= sizeof(pixel);

    for(i=0; i<16; i++)
        sum += src[-1+i*stride];

    fill = PIXEL_SPLAT_X4((sum+8)>>4);
    PREDICT_16x16_DC(fill);
}
367

    
368
/**
 * Fill a 16x16 block with the rounded mean of the sixteen pixels one
 * stride above it.
 *
 * @param _src   start of the block, reinterpreted as pixel*
 * @param stride row pitch; divided by sizeof(pixel) to index in pixels
 */
static void FUNCC(pred16x16_top_dc)(uint8_t *_src, int stride){
    int i, sum=0;
    pixel *src = (pixel*)_src;
    pixel4 fill;
    stride /= sizeof(pixel);

    for(i=0; i<16; i++)
        sum += src[i-stride];

    fill = PIXEL_SPLAT_X4((sum+8)>>4);
    PREDICT_16x16_DC(fill);
}
381

    
382
#define PRED16x16_X(n, v) \
383
static void FUNCC(pred16x16_##n##_dc)(uint8_t *_src, int stride){\
384
    int i;\
385
    pixel *src = (pixel*)_src;\
386
    stride /= sizeof(pixel);\
387
    PREDICT_16x16_DC(PIXEL_SPLAT_X4(v));\
388
}
389

    
390
PRED16x16_X(127, (1<<(BIT_DEPTH-1))-1);
391
PRED16x16_X(128, (1<<(BIT_DEPTH-1))+0);
392
PRED16x16_X(129, (1<<(BIT_DEPTH-1))+1);
393

    
394
/**
 * 16x16 plane prediction: derive a horizontal gradient H from the row above
 * the block and a vertical gradient V from the column to its left, scale
 * them, then write a clipped linear ramp over the block.
 *
 * @param _src    start of the block, reinterpreted as pixel*
 * @param _stride row pitch; divided by sizeof(pixel) to index in pixels
 * @param svq3    non-zero selects the division-based gradient scaling and
 *                swaps H and V afterwards
 * @param rv40    non-zero (with svq3 == 0) selects the shift-based scaling
 *                without a rounding term
 */
static inline void FUNCC(pred16x16_plane_compat)(uint8_t *_src, int _stride, const int svq3, const int rv40){
  int x, row, k;
  int base;
  INIT_CLIP
  pixel *src = (pixel*)_src;
  int stride = _stride/sizeof(pixel);
  const pixel * const top   = src +7-stride;       /* centre of the row above */
  const pixel *       below = src +8*stride-1;     /* left column, walking down */
  const pixel *       above = below-2*stride;      /* left column, walking up   */
  int H = top[1]   - top[-1];
  int V = below[0] - above[0];

  /* weighted sums of differences of samples mirrored about the centres */
  for(k=2; k<=8; ++k) {
    below += stride;
    above -= stride;
    H += k*(top[k]   - top[-k]);
    V += k*(below[0] - above[0]);
  }

  if(svq3){
    int swap;

    H = ( 5*(H/4) ) / 16;
    V = ( 5*(V/4) ) / 16;

    /* required for 100% accuracy */
    swap = H; H = V; V = swap;
  }else if(rv40){
    H = ( H + (H>>2) ) >> 4;
    V = ( V + (V>>2) ) >> 4;
  }else{
    H = ( 5*H+32 ) >> 6;
    V = ( 5*V+32 ) >> 6;
  }

  /* after the loop: below[0] is src[15*stride-1], above[16] is src[15-stride] */
  base = 16*(below[0] + above[16] + 1) - 7*(V+H);
  for(row=16; row>0; --row) {
    int acc = base;
    base += V;
    for(x=0; x<16; x+=4) {
      src[x  ] = CLIP((acc    ) >> 5);
      src[x+1] = CLIP((acc+  H) >> 5);
      src[x+2] = CLIP((acc+2*H) >> 5);
      src[x+3] = CLIP((acc+3*H) >> 5);
      acc += 4*H;
    }
    src += stride;
  }
}
438

    
439
/**
 * 16x16 plane prediction using the default gradient scaling, i.e.
 * pred16x16_plane_compat() with both svq3 and rv40 set to 0.
 *
 * @param src    start of the block
 * @param stride row pitch, forwarded unchanged
 */
static void FUNCC(pred16x16_plane)(uint8_t *src, int stride){
    const int svq3 = 0, rv40 = 0;

    FUNCC(pred16x16_plane_compat)(src, stride, svq3, rv40);
}
442

    
443
/**
 * Fill an 8x8 block by repeating the eight pixels found one stride before
 * src (read as two pixel4 groups) in each of the 8 rows.
 *
 * @param _src    start of the block, reinterpreted as pixel*
 * @param _stride row pitch; divided by sizeof(pixel) to index in pixels
 */
static void FUNCC(pred8x8_vertical)(uint8_t *_src, int _stride){
    int row;
    pixel *src = (pixel*)_src;
    int stride = _stride/sizeof(pixel);
    const pixel4 q0 = AV_RN4PA(((pixel4*)(src-stride))+0);
    const pixel4 q1 = AV_RN4PA(((pixel4*)(src-stride))+1);

    for(row=0; row<8; row++, src+=stride){
        pixel4 *line = (pixel4*)src;
        AV_WN4PA(line+0, q0);
        AV_WN4PA(line+1, q1);
    }
}
455

    
456
/**
 * Fill each of the 8 rows of an 8x8 block with the pixel stored
 * immediately before that row (index -1 relative to the row start).
 *
 * @param _src   start of the block, reinterpreted as pixel*
 * @param stride row pitch; divided by sizeof(pixel) to index in pixels
 */
static void FUNCC(pred8x8_horizontal)(uint8_t *_src, int stride){
    int row;
    pixel *src = (pixel*)_src;
    stride /= sizeof(pixel);

    for(row=0; row<8; row++, src+=stride){
        const pixel4 fill = PIXEL_SPLAT_X4(src[-1]);
        pixel4 *line = (pixel4*)src;
        AV_WN4PA(line+0, fill);
        AV_WN4PA(line+1, fill);
    }
}
467

    
468
#define PRED8x8_X(n, v)\
469
static void FUNCC(pred8x8_##n##_dc)(uint8_t *_src, int stride){\
470
    int i;\
471
    const pixel4 a = PIXEL_SPLAT_X4(v);\
472
    pixel *src = (pixel*)_src;\
473
    stride /= sizeof(pixel);\
474
    for(i=0; i<8; i++){\
475
        AV_WN4PA(((pixel4*)(src+i*stride))+0, a);\
476
        AV_WN4PA(((pixel4*)(src+i*stride))+1, a);\
477
    }\
478
}
479

    
480
PRED8x8_X(127, (1<<(BIT_DEPTH-1))-1);
481
PRED8x8_X(128, (1<<(BIT_DEPTH-1))+0);
482
PRED8x8_X(129, (1<<(BIT_DEPTH-1))+1);
483

    
484
/**
 * 8x8 prediction from the left column only: rows 0..3 are filled with the
 * rounded mean of left pixels 0..3, rows 4..7 with the rounded mean of
 * left pixels 4..7.
 *
 * @param _src   start of the block, reinterpreted as pixel*
 * @param stride row pitch; divided by sizeof(pixel) to index in pixels
 */
static void FUNCC(pred8x8_left_dc)(uint8_t *_src, int stride){
    int i;
    int upper = 0, lower = 0;
    pixel4 upper_fill, lower_fill;
    pixel *src = (pixel*)_src;
    stride /= sizeof(pixel);

    for(i=0; i<4; i++){
        upper += src[-1+i*stride];
        lower += src[-1+(i+4)*stride];
    }
    upper_fill = PIXEL_SPLAT_X4((upper + 2)>>2);
    lower_fill = PIXEL_SPLAT_X4((lower + 2)>>2);

    for(i=0; i<8; i++){
        pixel4 *line = (pixel4*)(src+i*stride);
        const pixel4 fill = i<4 ? upper_fill : lower_fill;
        AV_WN4PA(line+0, fill);
        AV_WN4PA(line+1, fill);
    }
}
508

    
509
/**
 * 8x8 prediction from the top row only: columns 0..3 of every row are
 * filled with the rounded mean of top pixels 0..3, columns 4..7 with the
 * rounded mean of top pixels 4..7.
 *
 * @param _src   start of the block, reinterpreted as pixel*
 * @param stride row pitch; divided by sizeof(pixel) to index in pixels
 */
static void FUNCC(pred8x8_top_dc)(uint8_t *_src, int stride){
    int i;
    int left_sum = 0, right_sum = 0;
    pixel4 left_fill, right_fill;
    pixel *src = (pixel*)_src;
    stride /= sizeof(pixel);

    for(i=0; i<4; i++){
        left_sum  += src[i-stride];
        right_sum += src[4+i-stride];
    }
    left_fill  = PIXEL_SPLAT_X4((left_sum + 2)>>2);
    right_fill = PIXEL_SPLAT_X4((right_sum + 2)>>2);

    /* all eight rows receive the same pair of values */
    for(i=0; i<8; i++){
        pixel4 *line = (pixel4*)(src+i*stride);
        AV_WN4PA(line+0, left_fill);
        AV_WN4PA(line+1, right_fill);
    }
}
533

    
534
/**
 * 8x8 prediction computed separately for each 4x4 quadrant:
 *  - top-left:     mean of left pixels 0..3 and top pixels 0..3
 *  - top-right:    mean of top pixels 4..7
 *  - bottom-left:  mean of left pixels 4..7
 *  - bottom-right: mean of top pixels 4..7 and left pixels 4..7
 *
 * @param _src   start of the block, reinterpreted as pixel*
 * @param stride row pitch; divided by sizeof(pixel) to index in pixels
 */
static void FUNCC(pred8x8_dc)(uint8_t *_src, int stride){
    int i;
    int corner = 0, top_right = 0, bottom_left = 0;
    pixel4 fill_tl, fill_tr, fill_bl, fill_br;
    pixel *src = (pixel*)_src;
    stride /= sizeof(pixel);

    for(i=0; i<4; i++){
        corner      += src[-1+i*stride] + src[i-stride];
        top_right   += src[4+i-stride];
        bottom_left += src[-1+(i+4)*stride];
    }
    fill_tl = PIXEL_SPLAT_X4((corner + 4)>>3);
    fill_tr = PIXEL_SPLAT_X4((top_right + 2)>>2);
    fill_bl = PIXEL_SPLAT_X4((bottom_left + 2)>>2);
    fill_br = PIXEL_SPLAT_X4((top_right + bottom_left + 4)>>3);

    for(i=0; i<8; i++){
        pixel4 *line = (pixel4*)(src+i*stride);
        AV_WN4PA(line+0, i<4 ? fill_tl : fill_bl);
        AV_WN4PA(line+1, i<4 ? fill_tr : fill_br);
    }
}
561

    
562
// The following 4 functions should not be optimized!
563
/**
 * Run pred8x8_top_dc() over the whole 8x8 block, then overwrite its
 * top-left 4x4 quadrant with pred4x4_dc().
 *
 * @param src    start of the block
 * @param stride row pitch, forwarded unchanged to both predictors
 */
static void FUNC(pred8x8_mad_cow_dc_l0t)(uint8_t *src, int stride){
    FUNCC(pred8x8_top_dc)(src, stride);

    /* second pass: top-left quadrant only */
    FUNCC(pred4x4_dc)(src, NULL, stride);
}
567

    
568
/**
 * Run pred8x8_dc() over the whole 8x8 block, then overwrite its top-left
 * 4x4 quadrant with pred4x4_top_dc().
 *
 * @param src    start of the block
 * @param stride row pitch, forwarded unchanged to both predictors
 */
static void FUNC(pred8x8_mad_cow_dc_0lt)(uint8_t *src, int stride){
    FUNCC(pred8x8_dc)(src, stride);

    /* second pass: top-left quadrant only */
    FUNCC(pred4x4_top_dc)(src, NULL, stride);
}
572

    
573
/**
 * Run pred8x8_left_dc() over the whole 8x8 block, then overwrite both 4x4
 * quadrants of its lower half (rows 4..7) with pred4x4_128_dc().
 *
 * @param src    start of the block (byte pointer)
 * @param stride row pitch, used here directly as a byte offset
 */
static void FUNC(pred8x8_mad_cow_dc_l00)(uint8_t *src, int stride){
    uint8_t *lower_half = src + 4*stride;

    FUNCC(pred8x8_left_dc)(src, stride);
    FUNCC(pred4x4_128_dc)(lower_half                  , NULL, stride);
    FUNCC(pred4x4_128_dc)(lower_half + 4*sizeof(pixel), NULL, stride);
}
578

    
579
/**
 * Run pred8x8_left_dc() over the whole 8x8 block, then overwrite both 4x4
 * quadrants of its upper half (rows 0..3) with pred4x4_128_dc().
 *
 * @param src    start of the block (byte pointer)
 * @param stride row pitch, forwarded unchanged to the predictors
 */
static void FUNC(pred8x8_mad_cow_dc_0l0)(uint8_t *src, int stride){
    uint8_t *right_quadrant = src + 4*sizeof(pixel);

    FUNCC(pred8x8_left_dc)(src, stride);
    FUNCC(pred4x4_128_dc)(src           , NULL, stride);
    FUNCC(pred4x4_128_dc)(right_quadrant, NULL, stride);
}
584

    
585
/**
 * 8x8 plane prediction: derive a horizontal gradient H from the row above
 * the block and a vertical gradient V from the column to its left, scale
 * both as (17*g+16)>>5, then write a clipped linear ramp over the block.
 *
 * @param _src    start of the block, reinterpreted as pixel*
 * @param _stride row pitch; divided by sizeof(pixel) to index in pixels
 */
static void FUNCC(pred8x8_plane)(uint8_t *_src, int _stride){
  int x, row, k;
  int base;
  INIT_CLIP
  pixel *src = (pixel*)_src;
  int stride = _stride/sizeof(pixel);
  const pixel * const top   = src +3-stride;       /* centre of the row above */
  const pixel *       below = src +4*stride-1;     /* left column, walking down */
  const pixel *       above = below-2*stride;      /* left column, walking up   */
  int H = top[1]   - top[-1];
  int V = below[0] - above[0];

  /* weighted sums of differences of samples mirrored about the centres */
  for(k=2; k<=4; ++k) {
    below += stride;
    above -= stride;
    H += k*(top[k]   - top[-k]);
    V += k*(below[0] - above[0]);
  }
  H = ( 17*H+16 ) >> 5;
  V = ( 17*V+16 ) >> 5;

  /* after the loop: below[0] is src[7*stride-1], above[8] is src[7-stride] */
  base = 16*(below[0] + above[8]+1) - 3*(V+H);
  for(row=8; row>0; --row) {
    const int acc = base;
    base += V;
    for(x=0; x<8; x++)
      src[x] = CLIP((acc + x*H) >> 5);
    src += stride;
  }
}
619

    
620
/*
 * Helpers for the pred8x8l_* functions.  All of them expect `src` (pixel*)
 * and `stride` (in pixels) in scope; the LOAD macros additionally read the
 * has_topleft / has_topright flags of the enclosing function.
 */

/* Pixel at column x, row y relative to src (negative values reach neighbours). */
#define SRC(x,y) src[(x)+(y)*stride]

/* l<y>: three-tap (1,2,1) filtered left-column sample for row y. */
#define PL(y) \
    const int l##y = (SRC(-1,y-1) + 2*SRC(-1,y) + SRC(-1,y+1) + 2) >> 2;

/*
 * Declare l0..l7, the filtered left column.  l0 uses the top-left corner
 * only when has_topleft is set (otherwise SRC(-1,0) stands in for it);
 * l7 has no sample below it and weights SRC(-1,7) three times.
 */
#define PREDICT_8x8_LOAD_LEFT \
    const int l0 = ((has_topleft ? SRC(-1,-1) : SRC(-1,0)) \
                     + 2*SRC(-1,0) + SRC(-1,1) + 2) >> 2; \
    PL(1) \
    PL(2) \
    PL(3) \
    PL(4) \
    PL(5) \
    PL(6) \
    const int l7 av_unused = (SRC(-1,6) + 3*SRC(-1,7) + 2) >> 2

/* t<x>: three-tap (1,2,1) filtered top-row sample for column x. */
#define PT(x) \
    const int t##x = (SRC(x-1,-1) + 2*SRC(x,-1) + SRC(x+1,-1) + 2) >> 2;

/*
 * Declare t0..t7, the filtered top row.  t0 uses the top-left corner only
 * when has_topleft is set; t7 uses SRC(8,-1) only when has_topright is set
 * (otherwise SRC(7,-1) stands in for the missing sample).
 */
#define PREDICT_8x8_LOAD_TOP \
    const int t0 = ((has_topleft ? SRC(-1,-1) : SRC(0,-1)) \
                     + 2*SRC(0,-1) + SRC(1,-1) + 2) >> 2; \
    PT(1) \
    PT(2) \
    PT(3) \
    PT(4) \
    PT(5) \
    PT(6) \
    const int t7 av_unused = ((has_topright ? SRC(8,-1) : SRC(7,-1)) \
                     + 2*SRC(7,-1) + SRC(6,-1) + 2) >> 2

/* Same filter as PT(), but assigning to an already declared t<x>. */
#define PTR(x) \
    t##x = (SRC(x-1,-1) + 2*SRC(x,-1) + SRC(x+1,-1) + 2) >> 2;

/*
 * Declare and set t8..t15.  With has_topright they are the filtered
 * samples of columns 8..15 of the row above; otherwise all of them are set
 * to the unfiltered SRC(7,-1).
 */
#define PREDICT_8x8_LOAD_TOPRIGHT \
    int t8, t9, t10, t11, t12, t13, t14, t15; \
    if(has_topright) { \
        PTR(8) PTR(9) PTR(10) PTR(11) PTR(12) PTR(13) PTR(14) \
        t15 = (SRC(14,-1) + 3*SRC(15,-1) + 2) >> 2; \
    } else t8=t9=t10=t11=t12=t13=t14=t15= SRC(7,-1);

/* Declare lt, the filtered top-left corner sample. */
#define PREDICT_8x8_LOAD_TOPLEFT \
    const int lt = (SRC(-1,0) + 2*SRC(-1,-1) + SRC(0,-1) + 2) >> 2

/*
 * Declare `y` and write the pixel4 value v across 8 rows of 8 pixels;
 * src is advanced by 8 rows as a side effect.
 */
#define PREDICT_8x8_DC(v) \
    int y; \
    for( y = 0; y < 8; y++, src += stride ) { \
        AV_WN4PA(((pixel4*)src)+0, v); \
        AV_WN4PA(((pixel4*)src)+1, v); \
    }
657

    
658
/**
 * Fill an 8x8 block with the constant 1<<(BIT_DEPTH-1); no neighbouring
 * pixels are read.
 *
 * @param _src         start of the block, reinterpreted as pixel*
 * @param has_topleft  not used by this function
 * @param has_topright not used by this function
 * @param _stride      row pitch; divided by sizeof(pixel) to index in pixels
 */
static void FUNCC(pred8x8l_128_dc)(uint8_t *_src, int has_topleft, int has_topright, int _stride)
{
    pixel *src = (pixel*)_src;
    int stride = _stride/sizeof(pixel);
    const pixel4 fill = PIXEL_SPLAT_X4(1<<(BIT_DEPTH-1));

    PREDICT_8x8_DC(fill);
}
665
/**
 * Fill an 8x8 block with the rounded mean of the eight filtered
 * left-column samples l0..l7.
 *
 * @param _src         start of the block, reinterpreted as pixel*
 * @param has_topleft  non-zero if the top-left corner may be used when
 *                     filtering l0
 * @param has_topright not used by this function
 * @param _stride      row pitch; divided by sizeof(pixel) to index in pixels
 */
static void FUNCC(pred8x8l_left_dc)(uint8_t *_src, int has_topleft, int has_topright, int _stride)
{
    pixel *src = (pixel*)_src;
    int stride = _stride/sizeof(pixel);

    PREDICT_8x8_LOAD_LEFT;
    const int left_sum = l0+l1+l2+l3+l4+l5+l6+l7;
    const pixel4 fill = PIXEL_SPLAT_X4((left_sum+4) >> 3);
    PREDICT_8x8_DC(fill);
}
674
/**
 * Fill an 8x8 block with the rounded mean of the eight filtered top-row
 * samples t0..t7.
 *
 * @param _src         start of the block, reinterpreted as pixel*
 * @param has_topleft  non-zero if the top-left corner may be used when
 *                     filtering t0
 * @param has_topright non-zero if SRC(8,-1) may be used when filtering t7
 * @param _stride      row pitch; divided by sizeof(pixel) to index in pixels
 */
static void FUNCC(pred8x8l_top_dc)(uint8_t *_src, int has_topleft, int has_topright, int _stride)
{
    pixel *src = (pixel*)_src;
    int stride = _stride/sizeof(pixel);

    PREDICT_8x8_LOAD_TOP;
    const int top_sum = t0+t1+t2+t3+t4+t5+t6+t7;
    const pixel4 fill = PIXEL_SPLAT_X4((top_sum+4) >> 3);
    PREDICT_8x8_DC(fill);
}
683
/**
 * Fill an 8x8 block with the rounded mean of the sixteen filtered
 * neighbours l0..l7 (left column) and t0..t7 (top row).
 *
 * @param _src         start of the block, reinterpreted as pixel*
 * @param has_topleft  non-zero if the top-left corner may be used when
 *                     filtering l0 and t0
 * @param has_topright non-zero if SRC(8,-1) may be used when filtering t7
 * @param _stride      row pitch; divided by sizeof(pixel) to index in pixels
 */
static void FUNCC(pred8x8l_dc)(uint8_t *_src, int has_topleft, int has_topright, int _stride)
{
    pixel *src = (pixel*)_src;
    int stride = _stride/sizeof(pixel);

    PREDICT_8x8_LOAD_LEFT;
    PREDICT_8x8_LOAD_TOP;
    const int left_sum = l0+l1+l2+l3+l4+l5+l6+l7;
    const int top_sum  = t0+t1+t2+t3+t4+t5+t6+t7;
    const pixel4 fill = PIXEL_SPLAT_X4((left_sum+top_sum+8) >> 4);
    PREDICT_8x8_DC(fill);
}
694
/**
 * Fill row y of an 8x8 block with the filtered left-column sample l<y>.
 *
 * @param _src         start of the block, reinterpreted as pixel*
 * @param has_topleft  non-zero if the top-left corner may be used when
 *                     filtering l0
 * @param has_topright not used by this function
 * @param _stride      row pitch; divided by sizeof(pixel) to index in pixels
 */
static void FUNCC(pred8x8l_horizontal)(uint8_t *_src, int has_topleft, int has_topright, int _stride)
{
    pixel *src = (pixel*)_src;
    int stride = _stride/sizeof(pixel);
    pixel4 fill;
    int row;

    /* every left sample is captured before the first row is written */
    PREDICT_8x8_LOAD_LEFT;
    const int left[8] = { l0, l1, l2, l3, l4, l5, l6, l7 };

    for(row=0; row<8; row++){
        fill = PIXEL_SPLAT_X4(left[row]);
        AV_WN4PA(src+row*stride, fill);
        AV_WN4PA(src+row*stride+4, fill);
    }
}
707
/**
 * Write the filtered top-row samples t0..t7 into row 0 of an 8x8 block,
 * then copy that row into rows 1..7.
 *
 * @param _src         start of the block, reinterpreted as pixel*
 * @param has_topleft  non-zero if the top-left corner may be used when
 *                     filtering t0
 * @param has_topright non-zero if SRC(8,-1) may be used when filtering t7
 * @param _stride      row pitch; divided by sizeof(pixel) to index in pixels
 */
static void FUNCC(pred8x8l_vertical)(uint8_t *_src, int has_topleft, int has_topright, int _stride)
{
    int row;
    pixel *src = (pixel*)_src;
    int stride = _stride/sizeof(pixel);
    pixel4 left_half, right_half;

    PREDICT_8x8_LOAD_TOP;
    src[0] = t0;  src[1] = t1;  src[2] = t2;  src[3] = t3;
    src[4] = t4;  src[5] = t5;  src[6] = t6;  src[7] = t7;

    /* read row 0 back as two pixel4 groups and replicate it downwards */
    left_half  = AV_RN4PA(((pixel4*)src)+0);
    right_half = AV_RN4PA(((pixel4*)src)+1);
    for( row = 1; row < 8; row++ ) {
        pixel4 *line = (pixel4*)(src+row*stride);
        AV_WN4PA(line+0, left_half);
        AV_WN4PA(line+1, right_half);
    }
}
730
/**
 * 8x8 prediction where every pixel on an anti-diagonal x+y == const
 * receives the same (a + 2*b + c + 2)>>2 value built from three consecutive
 * filtered top samples t0..t15.
 *
 * @param _src         start of the block, reinterpreted as pixel*
 * @param has_topleft  non-zero if the top-left corner may be used when
 *                     filtering t0
 * @param has_topright non-zero if the row above extends to columns 8..15;
 *                     otherwise t8..t15 are all SRC(7,-1)
 * @param _stride      row pitch; divided by sizeof(pixel) to index in pixels
 */
static void FUNCC(pred8x8l_down_left)(uint8_t *_src, int has_topleft, int has_topright, int _stride)
{
    pixel *src = (pixel*)_src;
    int stride = _stride/sizeof(pixel);
    PREDICT_8x8_LOAD_TOP;
    PREDICT_8x8_LOAD_TOPRIGHT;
    {
        /* t15 is repeated so the last diagonal evaluates to (t14 + 3*t15 + 2) >> 2 */
        const int edge[17] = { t0, t1, t2,  t3,  t4,  t5,  t6,  t7,
                               t8, t9, t10, t11, t12, t13, t14, t15, t15 };
        int x, y;

        for(y=0; y<8; y++){
            for(x=0; x<8; x++){
                const int *e = edge + (x + y);
                SRC(x,y)= (e[0] + 2*e[1] + e[2] + 2) >> 2;
            }
        }
    }
}
752
/**
 * 8x8 prediction where every pixel on a diagonal x-y == const receives the
 * same (a + 2*b + c + 2)>>2 value built from three consecutive filtered
 * neighbours along the left column (l7..l0), the corner (lt) and the top
 * row (t0..t7).
 *
 * @param _src         start of the block, reinterpreted as pixel*
 * @param has_topleft  non-zero if the top-left corner may be used when
 *                     filtering l0 and t0
 * @param has_topright non-zero if SRC(8,-1) may be used when filtering t7
 * @param _stride      row pitch; divided by sizeof(pixel) to index in pixels
 */
static void FUNCC(pred8x8l_down_right)(uint8_t *_src, int has_topleft, int has_topright, int _stride)
{
    pixel *src = (pixel*)_src;
    int stride = _stride/sizeof(pixel);
    PREDICT_8x8_LOAD_TOP;
    PREDICT_8x8_LOAD_LEFT;
    PREDICT_8x8_LOAD_TOPLEFT;
    /* neighbours ordered from bottom-left, through the corner, to top-right */
    const int edge[17] = { l7, l6, l5, l4, l3, l2, l1, l0, lt,
                           t0, t1, t2, t3, t4, t5, t6, t7 };
    int x, y;

    for(y=0; y<8; y++){
        for(x=0; x<8; x++){
            const int *e = edge + (x - y + 7);
            SRC(x,y)= (e[0] + 2*e[1] + e[2] + 2) >> 2;
        }
    }
}
775
/**
 * 8x8 prediction in which the value of pixel (x,y) depends only on 2*x-y.
 * The 22 distinct values are two-tap averages and three-tap
 * (a + 2*b + c + 2)>>2 filters of the filtered left/corner/top neighbours.
 *
 * @param _src         start of the block, reinterpreted as pixel*
 * @param has_topleft  non-zero if the top-left corner may be used when
 *                     filtering l0 and t0
 * @param has_topright non-zero if SRC(8,-1) may be used when filtering t7
 * @param _stride      row pitch; divided by sizeof(pixel) to index in pixels
 */
static void FUNCC(pred8x8l_vertical_right)(uint8_t *_src, int has_topleft, int has_topright, int _stride)
{
    pixel *src = (pixel*)_src;
    int stride = _stride/sizeof(pixel);
    PREDICT_8x8_LOAD_TOP;
    PREDICT_8x8_LOAD_LEFT;
    PREDICT_8x8_LOAD_TOPLEFT;
    /* indexed by 2*x - y + 7 */
    const int val[22] = {
        (l6 + 2*l5 + l4 + 2) >> 2,
        (l5 + 2*l4 + l3 + 2) >> 2,
        (l4 + 2*l3 + l2 + 2) >> 2,
        (l3 + 2*l2 + l1 + 2) >> 2,
        (l2 + 2*l1 + l0 + 2) >> 2,
        (l1 + 2*l0 + lt + 2) >> 2,
        (l0 + 2*lt + t0 + 2) >> 2,
        (lt + t0 + 1) >> 1,
        (lt + 2*t0 + t1 + 2) >> 2,
        (t0 + t1 + 1) >> 1,
        (t0 + 2*t1 + t2 + 2) >> 2,
        (t1 + t2 + 1) >> 1,
        (t1 + 2*t2 + t3 + 2) >> 2,
        (t2 + t3 + 1) >> 1,
        (t2 + 2*t3 + t4 + 2) >> 2,
        (t3 + t4 + 1) >> 1,
        (t3 + 2*t4 + t5 + 2) >> 2,
        (t4 + t5 + 1) >> 1,
        (t4 + 2*t5 + t6 + 2) >> 2,
        (t5 + t6 + 1) >> 1,
        (t5 + 2*t6 + t7 + 2) >> 2,
        (t6 + t7 + 1) >> 1,
    };
    int x, y;

    for(y=0; y<8; y++)
        for(x=0; x<8; x++)
            SRC(x,y)= val[2*x - y + 7];
}
805
/**
 * 8x8 prediction in which the value of pixel (x,y) depends only on
 * x - 2*y.  The 22 distinct values are two-tap averages and three-tap
 * (a + 2*b + c + 2)>>2 filters of the filtered left/corner/top neighbours.
 *
 * @param _src         start of the block, reinterpreted as pixel*
 * @param has_topleft  non-zero if the top-left corner may be used when
 *                     filtering l0 and t0
 * @param has_topright non-zero if SRC(8,-1) may be used when filtering t7
 *                     (t7 itself is not used by this predictor)
 * @param _stride      row pitch; divided by sizeof(pixel) to index in pixels
 */
static void FUNCC(pred8x8l_horizontal_down)(uint8_t *_src, int has_topleft, int has_topright, int _stride)
{
    pixel *src = (pixel*)_src;
    int stride = _stride/sizeof(pixel);
    PREDICT_8x8_LOAD_TOP;
    PREDICT_8x8_LOAD_LEFT;
    PREDICT_8x8_LOAD_TOPLEFT;
    /* indexed by x + 2*(7 - y) */
    const int val[22] = {
        (l6 + l7 + 1) >> 1,
        (l5 + 2*l6 + l7 + 2) >> 2,
        (l5 + l6 + 1) >> 1,
        (l4 + 2*l5 + l6 + 2) >> 2,
        (l4 + l5 + 1) >> 1,
        (l3 + 2*l4 + l5 + 2) >> 2,
        (l3 + l4 + 1) >> 1,
        (l2 + 2*l3 + l4 + 2) >> 2,
        (l2 + l3 + 1) >> 1,
        (l1 + 2*l2 + l3 + 2) >> 2,
        (l1 + l2 + 1) >> 1,
        (l0 + 2*l1 + l2 + 2) >> 2,
        (l0 + l1 + 1) >> 1,
        (lt + 2*l0 + l1 + 2) >> 2,
        (lt + l0 + 1) >> 1,
        (l0 + 2*lt + t0 + 2) >> 2,
        (t1 + 2*t0 + lt + 2) >> 2,
        (t2 + 2*t1 + t0 + 2) >> 2,
        (t3 + 2*t2 + t1 + 2) >> 2,
        (t4 + 2*t3 + t2 + 2) >> 2,
        (t5 + 2*t4 + t3 + 2) >> 2,
        (t6 + 2*t5 + t4 + 2) >> 2,
    };
    int x, y;

    for(y=0; y<8; y++)
        for(x=0; x<8; x++)
            SRC(x,y)= val[x + 2*(7 - y)];
}
835
static void FUNCC(pred8x8l_vertical_left)(uint8_t *_src, int has_topleft, int has_topright, int _stride)
836
{
837
    pixel *src = (pixel*)_src;
838
    int stride = _stride/sizeof(pixel);
839
    PREDICT_8x8_LOAD_TOP;
840
    PREDICT_8x8_LOAD_TOPRIGHT;
841
    SRC(0,0)= (t0 + t1 + 1) >> 1;
842
    SRC(0,1)= (t0 + 2*t1 + t2 + 2) >> 2;
843
    SRC(0,2)=SRC(1,0)= (t1 + t2 + 1) >> 1;
844
    SRC(0,3)=SRC(1,1)= (t1 + 2*t2 + t3 + 2) >> 2;
845
    SRC(0,4)=SRC(1,2)=SRC(2,0)= (t2 + t3 + 1) >> 1;
846
    SRC(0,5)=SRC(1,3)=SRC(2,1)= (t2 + 2*t3 + t4 + 2) >> 2;
847
    SRC(0,6)=SRC(1,4)=SRC(2,2)=SRC(3,0)= (t3 + t4 + 1) >> 1;
848
    SRC(0,7)=SRC(1,5)=SRC(2,3)=SRC(3,1)= (t3 + 2*t4 + t5 + 2) >> 2;
849
    SRC(1,6)=SRC(2,4)=SRC(3,2)=SRC(4,0)= (t4 + t5 + 1) >> 1;
850
    SRC(1,7)=SRC(2,5)=SRC(3,3)=SRC(4,1)= (t4 + 2*t5 + t6 + 2) >> 2;
851
    SRC(2,6)=SRC(3,4)=SRC(4,2)=SRC(5,0)= (t5 + t6 + 1) >> 1;
852
    SRC(2,7)=SRC(3,5)=SRC(4,3)=SRC(5,1)= (t5 + 2*t6 + t7 + 2) >> 2;
853
    SRC(3,6)=SRC(4,4)=SRC(5,2)=SRC(6,0)= (t6 + t7 + 1) >> 1;
854
    SRC(3,7)=SRC(4,5)=SRC(5,3)=SRC(6,1)= (t6 + 2*t7 + t8 + 2) >> 2;
855
    SRC(4,6)=SRC(5,4)=SRC(6,2)=SRC(7,0)= (t7 + t8 + 1) >> 1;
856
    SRC(4,7)=SRC(5,5)=SRC(6,3)=SRC(7,1)= (t7 + 2*t8 + t9 + 2) >> 2;
857
    SRC(5,6)=SRC(6,4)=SRC(7,2)= (t8 + t9 + 1) >> 1;
858
    SRC(5,7)=SRC(6,5)=SRC(7,3)= (t8 + 2*t9 + t10 + 2) >> 2;
859
    SRC(6,6)=SRC(7,4)= (t9 + t10 + 1) >> 1;
860
    SRC(6,7)=SRC(7,5)= (t9 + 2*t10 + t11 + 2) >> 2;
861
    SRC(7,6)= (t10 + t11 + 1) >> 1;
862
    SRC(7,7)= (t10 + 2*t11 + t12 + 2) >> 2;
863
}
864
static void FUNCC(pred8x8l_horizontal_up)(uint8_t *_src, int has_topleft, int has_topright, int _stride)
865
{
866
    pixel *src = (pixel*)_src;
867
    int stride = _stride/sizeof(pixel);
868
    PREDICT_8x8_LOAD_LEFT;
869
    SRC(0,0)= (l0 + l1 + 1) >> 1;
870
    SRC(1,0)= (l0 + 2*l1 + l2 + 2) >> 2;
871
    SRC(0,1)=SRC(2,0)= (l1 + l2 + 1) >> 1;
872
    SRC(1,1)=SRC(3,0)= (l1 + 2*l2 + l3 + 2) >> 2;
873
    SRC(0,2)=SRC(2,1)=SRC(4,0)= (l2 + l3 + 1) >> 1;
874
    SRC(1,2)=SRC(3,1)=SRC(5,0)= (l2 + 2*l3 + l4 + 2) >> 2;
875
    SRC(0,3)=SRC(2,2)=SRC(4,1)=SRC(6,0)= (l3 + l4 + 1) >> 1;
876
    SRC(1,3)=SRC(3,2)=SRC(5,1)=SRC(7,0)= (l3 + 2*l4 + l5 + 2) >> 2;
877
    SRC(0,4)=SRC(2,3)=SRC(4,2)=SRC(6,1)= (l4 + l5 + 1) >> 1;
878
    SRC(1,4)=SRC(3,3)=SRC(5,2)=SRC(7,1)= (l4 + 2*l5 + l6 + 2) >> 2;
879
    SRC(0,5)=SRC(2,4)=SRC(4,3)=SRC(6,2)= (l5 + l6 + 1) >> 1;
880
    SRC(1,5)=SRC(3,4)=SRC(5,3)=SRC(7,2)= (l5 + 2*l6 + l7 + 2) >> 2;
881
    SRC(0,6)=SRC(2,5)=SRC(4,4)=SRC(6,3)= (l6 + l7 + 1) >> 1;
882
    SRC(1,6)=SRC(3,5)=SRC(5,4)=SRC(7,3)= (l6 + 3*l7 + 2) >> 2;
883
    SRC(0,7)=SRC(1,7)=SRC(2,6)=SRC(2,7)=SRC(3,6)=
884
    SRC(3,7)=SRC(4,5)=SRC(4,6)=SRC(4,7)=SRC(5,5)=
885
    SRC(5,6)=SRC(5,7)=SRC(6,4)=SRC(6,5)=SRC(6,6)=
886
    SRC(6,7)=SRC(7,4)=SRC(7,5)=SRC(7,6)=SRC(7,7)= l7;
887
}
888
/* The 8x8 luma prediction helper macros are not needed past this point.
 * Undefine them so they do not leak into the code that follows; presumably
 * this also lets the template be included again with different pixel
 * settings -- confirm against the including file. */
#undef PREDICT_8x8_LOAD_LEFT
#undef PREDICT_8x8_LOAD_TOP
#undef PREDICT_8x8_LOAD_TOPLEFT
#undef PREDICT_8x8_LOAD_TOPRIGHT
#undef PREDICT_8x8_DC
#undef PTR
#undef PT
#undef PL
#undef SRC
897

    
898
static void FUNCC(pred4x4_vertical_add)(uint8_t *_pix, const DCTELEM *_block, int stride){
899
    int i;
900
    pixel *pix = (pixel*)_pix;
901
    const dctcoef *block = (const dctcoef*)_block;
902
    stride /= sizeof(pixel);
903
    pix -= stride;
904
    for(i=0; i<4; i++){
905
        pixel v = pix[0];
906
        pix[1*stride]= v += block[0];
907
        pix[2*stride]= v += block[4];
908
        pix[3*stride]= v += block[8];
909
        pix[4*stride]= v +  block[12];
910
        pix++;
911
        block++;
912
    }
913
}
914

    
915
static void FUNCC(pred4x4_horizontal_add)(uint8_t *_pix, const DCTELEM *_block, int stride){
916
    int i;
917
    pixel *pix = (pixel*)_pix;
918
    const dctcoef *block = (const dctcoef*)_block;
919
    stride /= sizeof(pixel);
920
    for(i=0; i<4; i++){
921
        pixel v = pix[-1];
922
        pix[0]= v += block[0];
923
        pix[1]= v += block[1];
924
        pix[2]= v += block[2];
925
        pix[3]= v +  block[3];
926
        pix+= stride;
927
        block+= 4;
928
    }
929
}
930

    
931
static void FUNCC(pred8x8l_vertical_add)(uint8_t *_pix, const DCTELEM *_block, int stride){
932
    int i;
933
    pixel *pix = (pixel*)_pix;
934
    const dctcoef *block = (const dctcoef*)_block;
935
    stride /= sizeof(pixel);
936
    pix -= stride;
937
    for(i=0; i<8; i++){
938
        pixel v = pix[0];
939
        pix[1*stride]= v += block[0];
940
        pix[2*stride]= v += block[8];
941
        pix[3*stride]= v += block[16];
942
        pix[4*stride]= v += block[24];
943
        pix[5*stride]= v += block[32];
944
        pix[6*stride]= v += block[40];
945
        pix[7*stride]= v += block[48];
946
        pix[8*stride]= v +  block[56];
947
        pix++;
948
        block++;
949
    }
950
}
951

    
952
static void FUNCC(pred8x8l_horizontal_add)(uint8_t *_pix, const DCTELEM *_block, int stride){
953
    int i;
954
    pixel *pix = (pixel*)_pix;
955
    const dctcoef *block = (const dctcoef*)_block;
956
    stride /= sizeof(pixel);
957
    for(i=0; i<8; i++){
958
        pixel v = pix[-1];
959
        pix[0]= v += block[0];
960
        pix[1]= v += block[1];
961
        pix[2]= v += block[2];
962
        pix[3]= v += block[3];
963
        pix[4]= v += block[4];
964
        pix[5]= v += block[5];
965
        pix[6]= v += block[6];
966
        pix[7]= v +  block[7];
967
        pix+= stride;
968
        block+= 8;
969
    }
970
}
971

    
972
static void FUNCC(pred16x16_vertical_add)(uint8_t *pix, const int *block_offset, const DCTELEM *block, int stride){
973
    int i;
974
    for(i=0; i<16; i++)
975
        FUNCC(pred4x4_vertical_add)(pix + block_offset[i], block + i*16*sizeof(pixel), stride);
976
}
977

    
978
static void FUNCC(pred16x16_horizontal_add)(uint8_t *pix, const int *block_offset, const DCTELEM *block, int stride){
979
    int i;
980
    for(i=0; i<16; i++)
981
        FUNCC(pred4x4_horizontal_add)(pix + block_offset[i], block + i*16*sizeof(pixel), stride);
982
}
983

    
984
static void FUNCC(pred8x8_vertical_add)(uint8_t *pix, const int *block_offset, const DCTELEM *block, int stride){
985
    int i;
986
    for(i=0; i<4; i++)
987
        FUNCC(pred4x4_vertical_add)(pix + block_offset[i], block + i*16*sizeof(pixel), stride);
988
}
989

    
990
static void FUNCC(pred8x8_horizontal_add)(uint8_t *pix, const int *block_offset, const DCTELEM *block, int stride){
991
    int i;
992
    for(i=0; i<4; i++)
993
        FUNCC(pred4x4_horizontal_add)(pix + block_offset[i], block + i*16*sizeof(pixel), stride);
994
}