Statistics
| Branch: | Revision:

ffmpeg / libavcodec / h264.c @ e69364b7

History | View | Annotate | Download (325 KB)

1
/*
2
 * H.26L/H.264/AVC/JVT/14496-10/... encoder/decoder
3
 * Copyright (c) 2003 Michael Niedermayer <michaelni@gmx.at>
4
 *
5
 * This file is part of FFmpeg.
6
 *
7
 * FFmpeg is free software; you can redistribute it and/or
8
 * modify it under the terms of the GNU Lesser General Public
9
 * License as published by the Free Software Foundation; either
10
 * version 2.1 of the License, or (at your option) any later version.
11
 *
12
 * FFmpeg is distributed in the hope that it will be useful,
13
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
15
 * Lesser General Public License for more details.
16
 *
17
 * You should have received a copy of the GNU Lesser General Public
18
 * License along with FFmpeg; if not, write to the Free Software
19
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
20
 *
21
 */
22

    
23
/**
24
 * @file h264.c
25
 * H.264 / AVC / MPEG4 part10 codec.
26
 * @author Michael Niedermayer <michaelni@gmx.at>
27
 */
28

    
29
#include "common.h"
30
#include "dsputil.h"
31
#include "avcodec.h"
32
#include "mpegvideo.h"
33
#include "h264data.h"
34
#include "golomb.h"
35

    
36
#include "cabac.h"
37

    
38
//#undef NDEBUG
39
#include <assert.h>
40

    
41
/* Poison names: these MpegEncContext fields must not be used directly by the
 * h264 code (mb_intra is not initialized here — see mb_type instead), so they
 * are renamed to something that will not compile if referenced by accident. */
#define interlaced_dct interlaced_dct_is_a_bad_name
#define mb_intra mb_intra_isnt_initalized_see_mb_type

/* scan8 indices of the luma/chroma DC blocks in the coefficient cache */
#define LUMA_DC_BLOCK_INDEX   25
#define CHROMA_DC_BLOCK_INDEX 26

/* Bit counts used when building the CAVLC VLC lookup tables below. */
#define CHROMA_DC_COEFF_TOKEN_VLC_BITS 8
#define COEFF_TOKEN_VLC_BITS           8
#define TOTAL_ZEROS_VLC_BITS           9
#define CHROMA_DC_TOTAL_ZEROS_VLC_BITS 3
#define RUN_VLC_BITS                   3
#define RUN7_VLC_BITS                  6

/* Maximum number of parameter sets kept in sps_buffer/pps_buffer. */
#define MAX_SPS_COUNT 32
#define MAX_PPS_COUNT 256

/* Maximum number of memory management control operations per slice header. */
#define MAX_MMCO_COUNT 66

/* Compiling in interlaced support reduces the speed
 * of progressive decoding by about 2%. */
#define ALLOW_INTERLACE

/* With ALLOW_INTERLACE the MBAFF flags read the live decoder state; without
 * it they become compile-time zeros so the interlaced paths fold away. */
#ifdef ALLOW_INTERLACE
#define MB_MBAFF h->mb_mbaff
#define MB_FIELD h->mb_field_decoding_flag
#define FRAME_MBAFF h->mb_aff_frame
#else
#define MB_MBAFF 0
#define MB_FIELD 0
#define FRAME_MBAFF 0
#undef  IS_INTERLACED
#define IS_INTERLACED(mb_type) 0
#endif
74

    
75
/**
 * Sequence parameter set.
 * Mirrors the seq_parameter_set_rbsp() syntax of the H.264 bitstream;
 * the ///< notes give the corresponding syntax element names.
 */
typedef struct SPS{
    int profile_idc;
    int level_idc;
    int transform_bypass;              ///< qpprime_y_zero_transform_bypass_flag
    int log2_max_frame_num;            ///< log2_max_frame_num_minus4 + 4
    int poc_type;                      ///< pic_order_cnt_type
    int log2_max_poc_lsb;              ///< log2_max_pic_order_cnt_lsb_minus4
    int delta_pic_order_always_zero_flag;
    int offset_for_non_ref_pic;
    int offset_for_top_to_bottom_field;
    int poc_cycle_length;              ///< num_ref_frames_in_pic_order_cnt_cycle
    int ref_frame_count;               ///< num_ref_frames
    int gaps_in_frame_num_allowed_flag;
    int mb_width;                      ///< frame_width_in_mbs_minus1 + 1
    int mb_height;                     ///< frame_height_in_mbs_minus1 + 1
    int frame_mbs_only_flag;
    int mb_aff;                        ///< mb_adaptive_frame_field_flag
    int direct_8x8_inference_flag;
    int crop;                   ///< frame_cropping_flag
    int crop_left;              ///< frame_cropping_rect_left_offset
    int crop_right;             ///< frame_cropping_rect_right_offset
    int crop_top;               ///< frame_cropping_rect_top_offset
    int crop_bottom;            ///< frame_cropping_rect_bottom_offset
    int vui_parameters_present_flag;
    AVRational sar;                    ///< sample aspect ratio (from VUI)
    int timing_info_present_flag;
    uint32_t num_units_in_tick;
    uint32_t time_scale;
    int fixed_frame_rate_flag;
    short offset_for_ref_frame[256]; //FIXME dyn aloc?
    int bitstream_restriction_flag;
    int num_reorder_frames;
    int scaling_matrix_present;
    uint8_t scaling_matrix4[6][16];    ///< 4x4 quant scaling lists
    uint8_t scaling_matrix8[2][64];    ///< 8x8 quant scaling lists
}SPS;
115

    
116
/**
 * Picture parameter set.
 * Mirrors the pic_parameter_set_rbsp() syntax of the H.264 bitstream;
 * the ///< notes give the corresponding syntax element names.
 */
typedef struct PPS{
    unsigned int sps_id;        ///< id of the SPS this PPS refers to
    int cabac;                  ///< entropy_coding_mode_flag
    int pic_order_present;      ///< pic_order_present_flag
    int slice_group_count;      ///< num_slice_groups_minus1 + 1
    int mb_slice_group_map_type;
    unsigned int ref_count[2];  ///< num_ref_idx_l0/1_active_minus1 + 1
    int weighted_pred;          ///< weighted_pred_flag
    int weighted_bipred_idc;
    int init_qp;                ///< pic_init_qp_minus26 + 26
    int init_qs;                ///< pic_init_qs_minus26 + 26
    int chroma_qp_index_offset;
    int deblocking_filter_parameters_present; ///< deblocking_filter_parameters_present_flag
    int constrained_intra_pred; ///< constrained_intra_pred_flag
    int redundant_pic_cnt_present; ///< redundant_pic_cnt_present_flag
    int transform_8x8_mode;     ///< transform_8x8_mode_flag
    uint8_t scaling_matrix4[6][16]; ///< 4x4 quant scaling lists
    uint8_t scaling_matrix8[2][64]; ///< 8x8 quant scaling lists
}PPS;
138

    
139
/**
 * Memory management control operation opcode
 * (memory_management_control_operation values from the slice header).
 */
typedef enum MMCOOpcode{
    MMCO_END=0,         ///< end of MMCO list
    MMCO_SHORT2UNUSED,  ///< mark a short-term reference as unused
    MMCO_LONG2UNUSED,   ///< mark a long-term reference as unused
    MMCO_SHORT2LONG,    ///< convert a short-term reference to long-term
    MMCO_SET_MAX_LONG,  ///< set the maximum long-term frame index
    MMCO_RESET,         ///< mark all references as unused
    MMCO_LONG,          ///< store current picture as a long-term reference
} MMCOOpcode;
151

    
152
/**
 * Memory management control operation: one parsed entry of the
 * dec_ref_pic_marking() list, interpreted according to its opcode.
 */
typedef struct MMCO{
    MMCOOpcode opcode;
    int short_frame_num;  ///< short-term picture this operation targets (opcode-dependent)
    int long_index;       ///< long-term index argument (opcode-dependent)
} MMCO;
160

    
161
/**
 * H264Context — complete per-decoder state for the H.264 decoder.
 * Embeds MpegEncContext as its first member so it can be passed to the
 * shared mpegvideo infrastructure.
 */
typedef struct H264Context{
    MpegEncContext s;
    int nal_ref_idc;       ///< nal_ref_idc of the current NAL unit
    int nal_unit_type;
    uint8_t *rbsp_buffer;  ///< scratch buffer for emulation-prevention removal
    unsigned int rbsp_buffer_size;

    /**
      * Used to parse AVC variant of h264
      */
    int is_avc; ///< this flag is != 0 if codec is avc1
    int got_avcC; ///< flag used to parse avcC data only once
    int nal_length_size; ///< Number of bytes used for nal length (1, 2 or 4)

    int chroma_qp; //QPc

    int prev_mb_skipped;
    int next_mb_skipped;

    //prediction stuff
    int chroma_pred_mode;
    int intra16x16_pred_mode;

    int top_mb_xy;         ///< mb index of the neighbor above
    int left_mb_xy[2];     ///< mb indices of the left neighbor(s) (2 for MBAFF)

    int8_t intra4x4_pred_mode_cache[5*8];
    int8_t (*intra4x4_pred_mode)[8];
    void (*pred4x4  [9+3])(uint8_t *src, uint8_t *topright, int stride);//FIXME move to dsp?
    void (*pred8x8l [9+3])(uint8_t *src, int topleft, int topright, int stride);
    void (*pred8x8  [4+3])(uint8_t *src, int stride);
    void (*pred16x16[4+3])(uint8_t *src, int stride);
    unsigned int topleft_samples_available;
    unsigned int top_samples_available;
    unsigned int topright_samples_available;
    unsigned int left_samples_available;
    uint8_t (*top_borders[2])[16+2*8];
    uint8_t left_border[2*(17+2*9)];

    /**
     * non zero coeff count cache.
     * is 64 if not available.
     */
    DECLARE_ALIGNED_8(uint8_t, non_zero_count_cache[6*8]);
    uint8_t (*non_zero_count)[16];

    /**
     * Motion vector cache.
     */
    DECLARE_ALIGNED_8(int16_t, mv_cache[2][5*8][2]);
    DECLARE_ALIGNED_8(int8_t, ref_cache[2][5*8]);
#define LIST_NOT_USED -1 //FIXME rename?
#define PART_NOT_AVAILABLE -2

    /**
     * is 1 if the specific list MV&references are set to 0,0,-2.
     */
    int mv_cache_clean[2];

    /**
     * number of neighbors (top and/or left) that used 8x8 dct
     */
    int neighbor_transform_size;

    /**
     * block_offset[ 0..23] for frame macroblocks
     * block_offset[24..47] for field macroblocks
     */
    int block_offset[2*(16+8)];

    uint32_t *mb2b_xy; //FIXME are these 4 a good idea?
    uint32_t *mb2b8_xy;
    int b_stride; //FIXME use s->b4_stride
    int b8_stride;

    int mb_linesize;   ///< may be equal to s->linesize or s->linesize*2, for mbaff
    int mb_uvlinesize;

    int emu_edge_width;
    int emu_edge_height;

    int halfpel_flag;      ///< SVQ3-specific
    int thirdpel_flag;     ///< SVQ3-specific

    int unknown_svq3_flag;
    int next_slice_index;

    SPS sps_buffer[MAX_SPS_COUNT];
    SPS sps; ///< current sps

    PPS pps_buffer[MAX_PPS_COUNT];
    /**
     * current pps
     */
    PPS pps; //FIXME move to Picture perhaps? (->no) do we need that?

    uint32_t dequant4_buffer[6][52][16];
    uint32_t dequant8_buffer[2][52][64];
    uint32_t (*dequant4_coeff[6])[16];
    uint32_t (*dequant8_coeff[2])[64];
    int dequant_coeff_pps;     ///< reinit tables when pps changes

    int slice_num;
    uint8_t *slice_table_base;
    uint8_t *slice_table;      ///< slice_table_base + 2*mb_stride + 1
    int slice_type;
    int slice_type_fixed;

    //interlacing specific flags
    int mb_aff_frame;
    int mb_field_decoding_flag;
    int mb_mbaff;              ///< mb_aff_frame && mb_field_decoding_flag

    unsigned int sub_mb_type[4];

    //POC stuff
    int poc_lsb;
    int poc_msb;
    int delta_poc_bottom;
    int delta_poc[2];
    int frame_num;
    int prev_poc_msb;             ///< poc_msb of the last reference pic for POC type 0
    int prev_poc_lsb;             ///< poc_lsb of the last reference pic for POC type 0
    int frame_num_offset;         ///< for POC type 2
    int prev_frame_num_offset;    ///< for POC type 2
    int prev_frame_num;           ///< frame_num of the last pic for POC type 1/2

    /**
     * frame_num for frames or 2*frame_num for field pics.
     */
    int curr_pic_num;

    /**
     * max_frame_num or 2*max_frame_num for field pics.
     */
    int max_pic_num;

    //Weighted pred stuff
    int use_weight;
    int use_weight_chroma;
    int luma_log2_weight_denom;
    int chroma_log2_weight_denom;
    int luma_weight[2][48];
    int luma_offset[2][48];
    int chroma_weight[2][48][2];
    int chroma_offset[2][48][2];
    int implicit_weight[48][48];

    //deblock
    int deblocking_filter;         ///< disable_deblocking_filter_idc with 1<->0
    int slice_alpha_c0_offset;
    int slice_beta_offset;

    int redundant_pic_count;

    //direct mode prediction
    int direct_spatial_mv_pred;
    int dist_scale_factor[16];
    int dist_scale_factor_field[32];
    int map_col_to_list0[2][16];
    int map_col_to_list0_field[2][32];

    /**
     * num_ref_idx_l0/1_active_minus1 + 1
     */
    unsigned int ref_count[2];   ///< counts frames or fields, depending on current mb mode
    unsigned int list_count;
    Picture *short_ref[32];
    Picture *long_ref[32];
    Picture default_ref_list[2][32];
    Picture ref_list[2][48];     ///< 0..15: frame refs, 16..47: mbaff field refs
    Picture *delayed_pic[18]; //FIXME size?
    Picture *delayed_output_pic;

    /**
     * memory management control operations buffer.
     */
    MMCO mmco[MAX_MMCO_COUNT];
    int mmco_index;

    int long_ref_count;  ///< number of actual long term references
    int short_ref_count; ///< number of actual short term references

    //data partitioning
    GetBitContext intra_gb;
    GetBitContext inter_gb;
    GetBitContext *intra_gb_ptr;
    GetBitContext *inter_gb_ptr;

    DECLARE_ALIGNED_8(DCTELEM, mb[16*24]);
    DCTELEM mb_padding[256];        ///< as mb is addressed by scantable[i] and scantable is uint8_t we can either check that i is not to large or ensure that there is some unused stuff after mb

    /**
     * Cabac
     */
    CABACContext cabac;
    uint8_t      cabac_state[460];
    int          cabac_init_idc;

    /* 0x100 -> non null luma_dc, 0x80/0x40 -> non null chroma_dc (cb/cr), 0x?0 -> chroma_cbp(0,1,2), 0x0? luma_cbp */
    uint16_t     *cbp_table;
    int cbp;
    int top_cbp;
    int left_cbp;
    /* chroma_pred_mode for i4x4 or i16x16, else 0 */
    uint8_t     *chroma_pred_mode_table;
    int         last_qscale_diff;
    int16_t     (*mvd_table[2])[2];
    DECLARE_ALIGNED_8(int16_t, mvd_cache[2][5*8][2]);
    uint8_t     *direct_table;
    uint8_t     direct_cache[5*8];

    //scan orders; the *_q0 pointers keep the unpermuted tables for qp==0
    uint8_t zigzag_scan[16];
    uint8_t zigzag_scan8x8[64];
    uint8_t zigzag_scan8x8_cavlc[64];
    uint8_t field_scan[16];
    uint8_t field_scan8x8[64];
    uint8_t field_scan8x8_cavlc[64];
    const uint8_t *zigzag_scan_q0;
    const uint8_t *zigzag_scan8x8_q0;
    const uint8_t *zigzag_scan8x8_cavlc_q0;
    const uint8_t *field_scan_q0;
    const uint8_t *field_scan8x8_q0;
    const uint8_t *field_scan8x8_cavlc_q0;

    int x264_build; ///< x264 encoder version parsed from SEI, for bug workarounds
}H264Context;
390

    
391
/* CAVLC lookup tables, built once at init time. */
static VLC coeff_token_vlc[4];
static VLC chroma_dc_coeff_token_vlc;

static VLC total_zeros_vlc[15];
static VLC chroma_dc_total_zeros_vlc[3];

static VLC run_vlc[6];
static VLC run7_vlc;

/* Forward declarations (bodies defined later in this file). */
static void svq3_luma_dc_dequant_idct_c(DCTELEM *block, int qp);
static void svq3_add_idct_c(uint8_t *dst, DCTELEM *block, int stride, int qp, int dc);
static void filter_mb( H264Context *h, int mb_x, int mb_y, uint8_t *img_y, uint8_t *img_cb, uint8_t *img_cr, unsigned int linesize, unsigned int uvlinesize);
static void filter_mb_fast( H264Context *h, int mb_x, int mb_y, uint8_t *img_y, uint8_t *img_cb, uint8_t *img_cr, unsigned int linesize, unsigned int uvlinesize);
404

    
405
/**
 * Pack two 16-bit values into a single 32-bit word in host byte order,
 * so the pair can be stored and loaded with one uint32_t access.
 */
static av_always_inline uint32_t pack16to32(int first, int second){
#ifdef WORDS_BIGENDIAN
    return (second & 0xFFFF) + (first << 16);
#else
    return (first & 0xFFFF) + (second << 16);
#endif
}
412

    
413
/* Lookup table for qp % 6, valid for 0 <= qp < 52. */
const uint8_t ff_rem6[52]={
0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3,
};
416

    
417
/* Lookup table for qp / 6, valid for 0 <= qp < 52. */
const uint8_t ff_div6[52]={
0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 6, 6, 6, 6, 6, 6, 7, 7, 7, 7, 7, 7, 8, 8, 8, 8,
};
420

    
421

    
422
/**
 * fill a rectangle.
 * Writes `val` into every cell of a w x h rectangle of elements of `size`
 * bytes, using the widest aligned stores available.  Note: the preprocessor
 * conditional below spans branch boundaries — on 64-bit targets the w==8 and
 * w==16 cases use 64-bit stores, otherwise 32-bit stores; keep the #if/#else
 * structure intact when editing.
 * @param h height of the rectangle, should be a constant
 * @param w width of the rectangle, should be a constant
 * @param size the size of val (1 or 4), should be a constant
 */
static av_always_inline void fill_rectangle(void *vp, int w, int h, int stride, uint32_t val, int size){
    uint8_t *p= (uint8_t*)vp;
    assert(size==1 || size==4);
    assert(w<=4);

    /* from here on, w and stride are measured in bytes */
    w      *= size;
    stride *= size;

    assert((((long)vp)&(FFMIN(w, STRIDE_ALIGN)-1)) == 0);
    assert((stride&(w-1))==0);
    if(w==2){
        /* size==1: replicate the byte into both halves of the 16-bit word */
        const uint16_t v= size==4 ? val : val*0x0101;
        *(uint16_t*)(p + 0*stride)= v;
        if(h==1) return;
        *(uint16_t*)(p + 1*stride)= v;
        if(h==2) return;
        *(uint16_t*)(p + 2*stride)=
        *(uint16_t*)(p + 3*stride)= v;
    }else if(w==4){
        /* size==1: splat the byte across all four bytes of the word */
        const uint32_t v= size==4 ? val : val*0x01010101;
        *(uint32_t*)(p + 0*stride)= v;
        if(h==1) return;
        *(uint32_t*)(p + 1*stride)= v;
        if(h==2) return;
        *(uint32_t*)(p + 2*stride)=
        *(uint32_t*)(p + 3*stride)= v;
    }else if(w==8){
    //gcc can't optimize 64bit math on x86_32
#if defined(ARCH_X86_64) || (defined(MP_WORDSIZE) && MP_WORDSIZE >= 64)
        /* duplicate the 32-bit value into both halves of a 64-bit word */
        const uint64_t v= val*0x0100000001ULL;
        *(uint64_t*)(p + 0*stride)= v;
        if(h==1) return;
        *(uint64_t*)(p + 1*stride)= v;
        if(h==2) return;
        *(uint64_t*)(p + 2*stride)=
        *(uint64_t*)(p + 3*stride)= v;
    }else if(w==16){
        const uint64_t v= val*0x0100000001ULL;
        *(uint64_t*)(p + 0+0*stride)=
        *(uint64_t*)(p + 8+0*stride)=
        *(uint64_t*)(p + 0+1*stride)=
        *(uint64_t*)(p + 8+1*stride)= v;
        if(h==2) return;
        *(uint64_t*)(p + 0+2*stride)=
        *(uint64_t*)(p + 8+2*stride)=
        *(uint64_t*)(p + 0+3*stride)=
        *(uint64_t*)(p + 8+3*stride)= v;
#else
        /* 32-bit fallback for w==8 */
        *(uint32_t*)(p + 0+0*stride)=
        *(uint32_t*)(p + 4+0*stride)= val;
        if(h==1) return;
        *(uint32_t*)(p + 0+1*stride)=
        *(uint32_t*)(p + 4+1*stride)= val;
        if(h==2) return;
        *(uint32_t*)(p + 0+2*stride)=
        *(uint32_t*)(p + 4+2*stride)=
        *(uint32_t*)(p + 0+3*stride)=
        *(uint32_t*)(p + 4+3*stride)= val;
    }else if(w==16){
        /* 32-bit fallback for w==16 */
        *(uint32_t*)(p + 0+0*stride)=
        *(uint32_t*)(p + 4+0*stride)=
        *(uint32_t*)(p + 8+0*stride)=
        *(uint32_t*)(p +12+0*stride)=
        *(uint32_t*)(p + 0+1*stride)=
        *(uint32_t*)(p + 4+1*stride)=
        *(uint32_t*)(p + 8+1*stride)=
        *(uint32_t*)(p +12+1*stride)= val;
        if(h==2) return;
        *(uint32_t*)(p + 0+2*stride)=
        *(uint32_t*)(p + 4+2*stride)=
        *(uint32_t*)(p + 8+2*stride)=
        *(uint32_t*)(p +12+2*stride)=
        *(uint32_t*)(p + 0+3*stride)=
        *(uint32_t*)(p + 4+3*stride)=
        *(uint32_t*)(p + 8+3*stride)=
        *(uint32_t*)(p +12+3*stride)= val;
#endif
    }else
        assert(0);
    assert(h==4);
}
509

    
510
static void fill_caches(H264Context *h, int mb_type, int for_deblock){
511
    MpegEncContext * const s = &h->s;
512
    const int mb_xy= s->mb_x + s->mb_y*s->mb_stride;
513
    int topleft_xy, top_xy, topright_xy, left_xy[2];
514
    int topleft_type, top_type, topright_type, left_type[2];
515
    int left_block[8];
516
    int i;
517

    
518
    //FIXME deblocking could skip the intra and nnz parts.
519
    if(for_deblock && (h->slice_num == 1 || h->slice_table[mb_xy] == h->slice_table[mb_xy-s->mb_stride]) && !FRAME_MBAFF)
520
        return;
521

    
522
    //wow what a mess, why didn't they simplify the interlacing&intra stuff, i can't imagine that these complex rules are worth it
523

    
524
    top_xy     = mb_xy  - s->mb_stride;
525
    topleft_xy = top_xy - 1;
526
    topright_xy= top_xy + 1;
527
    left_xy[1] = left_xy[0] = mb_xy-1;
528
    left_block[0]= 0;
529
    left_block[1]= 1;
530
    left_block[2]= 2;
531
    left_block[3]= 3;
532
    left_block[4]= 7;
533
    left_block[5]= 10;
534
    left_block[6]= 8;
535
    left_block[7]= 11;
536
    if(FRAME_MBAFF){
537
        const int pair_xy          = s->mb_x     + (s->mb_y & ~1)*s->mb_stride;
538
        const int top_pair_xy      = pair_xy     - s->mb_stride;
539
        const int topleft_pair_xy  = top_pair_xy - 1;
540
        const int topright_pair_xy = top_pair_xy + 1;
541
        const int topleft_mb_frame_flag  = !IS_INTERLACED(s->current_picture.mb_type[topleft_pair_xy]);
542
        const int top_mb_frame_flag      = !IS_INTERLACED(s->current_picture.mb_type[top_pair_xy]);
543
        const int topright_mb_frame_flag = !IS_INTERLACED(s->current_picture.mb_type[topright_pair_xy]);
544
        const int left_mb_frame_flag = !IS_INTERLACED(s->current_picture.mb_type[pair_xy-1]);
545
        const int curr_mb_frame_flag = !IS_INTERLACED(mb_type);
546
        const int bottom = (s->mb_y & 1);
547
        tprintf(s->avctx, "fill_caches: curr_mb_frame_flag:%d, left_mb_frame_flag:%d, topleft_mb_frame_flag:%d, top_mb_frame_flag:%d, topright_mb_frame_flag:%d\n", curr_mb_frame_flag, left_mb_frame_flag, topleft_mb_frame_flag, top_mb_frame_flag, topright_mb_frame_flag);
548
        if (bottom
549
                ? !curr_mb_frame_flag // bottom macroblock
550
                : (!curr_mb_frame_flag && !top_mb_frame_flag) // top macroblock
551
                ) {
552
            top_xy -= s->mb_stride;
553
        }
554
        if (bottom
555
                ? !curr_mb_frame_flag // bottom macroblock
556
                : (!curr_mb_frame_flag && !topleft_mb_frame_flag) // top macroblock
557
                ) {
558
            topleft_xy -= s->mb_stride;
559
        }
560
        if (bottom
561
                ? !curr_mb_frame_flag // bottom macroblock
562
                : (!curr_mb_frame_flag && !topright_mb_frame_flag) // top macroblock
563
                ) {
564
            topright_xy -= s->mb_stride;
565
        }
566
        if (left_mb_frame_flag != curr_mb_frame_flag) {
567
            left_xy[1] = left_xy[0] = pair_xy - 1;
568
            if (curr_mb_frame_flag) {
569
                if (bottom) {
570
                    left_block[0]= 2;
571
                    left_block[1]= 2;
572
                    left_block[2]= 3;
573
                    left_block[3]= 3;
574
                    left_block[4]= 8;
575
                    left_block[5]= 11;
576
                    left_block[6]= 8;
577
                    left_block[7]= 11;
578
                } else {
579
                    left_block[0]= 0;
580
                    left_block[1]= 0;
581
                    left_block[2]= 1;
582
                    left_block[3]= 1;
583
                    left_block[4]= 7;
584
                    left_block[5]= 10;
585
                    left_block[6]= 7;
586
                    left_block[7]= 10;
587
                }
588
            } else {
589
                left_xy[1] += s->mb_stride;
590
                //left_block[0]= 0;
591
                left_block[1]= 2;
592
                left_block[2]= 0;
593
                left_block[3]= 2;
594
                //left_block[4]= 7;
595
                left_block[5]= 10;
596
                left_block[6]= 7;
597
                left_block[7]= 10;
598
            }
599
        }
600
    }
601

    
602
    h->top_mb_xy = top_xy;
603
    h->left_mb_xy[0] = left_xy[0];
604
    h->left_mb_xy[1] = left_xy[1];
605
    if(for_deblock){
606
        topleft_type = 0;
607
        topright_type = 0;
608
        top_type     = h->slice_table[top_xy     ] < 255 ? s->current_picture.mb_type[top_xy]     : 0;
609
        left_type[0] = h->slice_table[left_xy[0] ] < 255 ? s->current_picture.mb_type[left_xy[0]] : 0;
610
        left_type[1] = h->slice_table[left_xy[1] ] < 255 ? s->current_picture.mb_type[left_xy[1]] : 0;
611

    
612
        if(FRAME_MBAFF && !IS_INTRA(mb_type)){
613
            int list;
614
            int v = *(uint16_t*)&h->non_zero_count[mb_xy][14];
615
            for(i=0; i<16; i++)
616
                h->non_zero_count_cache[scan8[i]] = (v>>i)&1;
617
            for(list=0; list<h->list_count; list++){
618
                if(USES_LIST(mb_type,list)){
619
                    uint32_t *src = (uint32_t*)s->current_picture.motion_val[list][h->mb2b_xy[mb_xy]];
620
                    uint32_t *dst = (uint32_t*)h->mv_cache[list][scan8[0]];
621
                    int8_t *ref = &s->current_picture.ref_index[list][h->mb2b8_xy[mb_xy]];
622
                    for(i=0; i<4; i++, dst+=8, src+=h->b_stride){
623
                        dst[0] = src[0];
624
                        dst[1] = src[1];
625
                        dst[2] = src[2];
626
                        dst[3] = src[3];
627
                    }
628
                    *(uint32_t*)&h->ref_cache[list][scan8[ 0]] =
629
                    *(uint32_t*)&h->ref_cache[list][scan8[ 2]] = pack16to32(ref[0],ref[1])*0x0101;
630
                    ref += h->b8_stride;
631
                    *(uint32_t*)&h->ref_cache[list][scan8[ 8]] =
632
                    *(uint32_t*)&h->ref_cache[list][scan8[10]] = pack16to32(ref[0],ref[1])*0x0101;
633
                }else{
634
                    fill_rectangle(&h-> mv_cache[list][scan8[ 0]], 4, 4, 8, 0, 4);
635
                    fill_rectangle(&h->ref_cache[list][scan8[ 0]], 4, 4, 8, (uint8_t)LIST_NOT_USED, 1);
636
                }
637
            }
638
        }
639
    }else{
640
        topleft_type = h->slice_table[topleft_xy ] == h->slice_num ? s->current_picture.mb_type[topleft_xy] : 0;
641
        top_type     = h->slice_table[top_xy     ] == h->slice_num ? s->current_picture.mb_type[top_xy]     : 0;
642
        topright_type= h->slice_table[topright_xy] == h->slice_num ? s->current_picture.mb_type[topright_xy]: 0;
643
        left_type[0] = h->slice_table[left_xy[0] ] == h->slice_num ? s->current_picture.mb_type[left_xy[0]] : 0;
644
        left_type[1] = h->slice_table[left_xy[1] ] == h->slice_num ? s->current_picture.mb_type[left_xy[1]] : 0;
645
    }
646

    
647
    if(IS_INTRA(mb_type)){
648
        h->topleft_samples_available=
649
        h->top_samples_available=
650
        h->left_samples_available= 0xFFFF;
651
        h->topright_samples_available= 0xEEEA;
652

    
653
        if(!IS_INTRA(top_type) && (top_type==0 || h->pps.constrained_intra_pred)){
654
            h->topleft_samples_available= 0xB3FF;
655
            h->top_samples_available= 0x33FF;
656
            h->topright_samples_available= 0x26EA;
657
        }
658
        for(i=0; i<2; i++){
659
            if(!IS_INTRA(left_type[i]) && (left_type[i]==0 || h->pps.constrained_intra_pred)){
660
                h->topleft_samples_available&= 0xDF5F;
661
                h->left_samples_available&= 0x5F5F;
662
            }
663
        }
664

    
665
        if(!IS_INTRA(topleft_type) && (topleft_type==0 || h->pps.constrained_intra_pred))
666
            h->topleft_samples_available&= 0x7FFF;
667

    
668
        if(!IS_INTRA(topright_type) && (topright_type==0 || h->pps.constrained_intra_pred))
669
            h->topright_samples_available&= 0xFBFF;
670

    
671
        if(IS_INTRA4x4(mb_type)){
672
            if(IS_INTRA4x4(top_type)){
673
                h->intra4x4_pred_mode_cache[4+8*0]= h->intra4x4_pred_mode[top_xy][4];
674
                h->intra4x4_pred_mode_cache[5+8*0]= h->intra4x4_pred_mode[top_xy][5];
675
                h->intra4x4_pred_mode_cache[6+8*0]= h->intra4x4_pred_mode[top_xy][6];
676
                h->intra4x4_pred_mode_cache[7+8*0]= h->intra4x4_pred_mode[top_xy][3];
677
            }else{
678
                int pred;
679
                if(!top_type || (IS_INTER(top_type) && h->pps.constrained_intra_pred))
680
                    pred= -1;
681
                else{
682
                    pred= 2;
683
                }
684
                h->intra4x4_pred_mode_cache[4+8*0]=
685
                h->intra4x4_pred_mode_cache[5+8*0]=
686
                h->intra4x4_pred_mode_cache[6+8*0]=
687
                h->intra4x4_pred_mode_cache[7+8*0]= pred;
688
            }
689
            for(i=0; i<2; i++){
690
                if(IS_INTRA4x4(left_type[i])){
691
                    h->intra4x4_pred_mode_cache[3+8*1 + 2*8*i]= h->intra4x4_pred_mode[left_xy[i]][left_block[0+2*i]];
692
                    h->intra4x4_pred_mode_cache[3+8*2 + 2*8*i]= h->intra4x4_pred_mode[left_xy[i]][left_block[1+2*i]];
693
                }else{
694
                    int pred;
695
                    if(!left_type[i] || (IS_INTER(left_type[i]) && h->pps.constrained_intra_pred))
696
                        pred= -1;
697
                    else{
698
                        pred= 2;
699
                    }
700
                    h->intra4x4_pred_mode_cache[3+8*1 + 2*8*i]=
701
                    h->intra4x4_pred_mode_cache[3+8*2 + 2*8*i]= pred;
702
                }
703
            }
704
        }
705
    }
706

    
707

    
708
/*
709
0 . T T. T T T T
710
1 L . .L . . . .
711
2 L . .L . . . .
712
3 . T TL . . . .
713
4 L . .L . . . .
714
5 L . .. . . . .
715
*/
716
//FIXME constraint_intra_pred & partitioning & nnz (lets hope this is just a typo in the spec)
717
    if(top_type){
718
        h->non_zero_count_cache[4+8*0]= h->non_zero_count[top_xy][4];
719
        h->non_zero_count_cache[5+8*0]= h->non_zero_count[top_xy][5];
720
        h->non_zero_count_cache[6+8*0]= h->non_zero_count[top_xy][6];
721
        h->non_zero_count_cache[7+8*0]= h->non_zero_count[top_xy][3];
722

    
723
        h->non_zero_count_cache[1+8*0]= h->non_zero_count[top_xy][9];
724
        h->non_zero_count_cache[2+8*0]= h->non_zero_count[top_xy][8];
725

    
726
        h->non_zero_count_cache[1+8*3]= h->non_zero_count[top_xy][12];
727
        h->non_zero_count_cache[2+8*3]= h->non_zero_count[top_xy][11];
728

    
729
    }else{
730
        h->non_zero_count_cache[4+8*0]=
731
        h->non_zero_count_cache[5+8*0]=
732
        h->non_zero_count_cache[6+8*0]=
733
        h->non_zero_count_cache[7+8*0]=
734

    
735
        h->non_zero_count_cache[1+8*0]=
736
        h->non_zero_count_cache[2+8*0]=
737

    
738
        h->non_zero_count_cache[1+8*3]=
739
        h->non_zero_count_cache[2+8*3]= h->pps.cabac && !IS_INTRA(mb_type) ? 0 : 64;
740

    
741
    }
742

    
743
    for (i=0; i<2; i++) {
744
        if(left_type[i]){
745
            h->non_zero_count_cache[3+8*1 + 2*8*i]= h->non_zero_count[left_xy[i]][left_block[0+2*i]];
746
            h->non_zero_count_cache[3+8*2 + 2*8*i]= h->non_zero_count[left_xy[i]][left_block[1+2*i]];
747
            h->non_zero_count_cache[0+8*1 +   8*i]= h->non_zero_count[left_xy[i]][left_block[4+2*i]];
748
            h->non_zero_count_cache[0+8*4 +   8*i]= h->non_zero_count[left_xy[i]][left_block[5+2*i]];
749
        }else{
750
            h->non_zero_count_cache[3+8*1 + 2*8*i]=
751
            h->non_zero_count_cache[3+8*2 + 2*8*i]=
752
            h->non_zero_count_cache[0+8*1 +   8*i]=
753
            h->non_zero_count_cache[0+8*4 +   8*i]= h->pps.cabac && !IS_INTRA(mb_type) ? 0 : 64;
754
        }
755
    }
756

    
757
    if( h->pps.cabac ) {
758
        // top_cbp
759
        if(top_type) {
760
            h->top_cbp = h->cbp_table[top_xy];
761
        } else if(IS_INTRA(mb_type)) {
762
            h->top_cbp = 0x1C0;
763
        } else {
764
            h->top_cbp = 0;
765
        }
766
        // left_cbp
767
        if (left_type[0]) {
768
            h->left_cbp = h->cbp_table[left_xy[0]] & 0x1f0;
769
        } else if(IS_INTRA(mb_type)) {
770
            h->left_cbp = 0x1C0;
771
        } else {
772
            h->left_cbp = 0;
773
        }
774
        if (left_type[0]) {
775
            h->left_cbp |= ((h->cbp_table[left_xy[0]]>>((left_block[0]&(~1))+1))&0x1) << 1;
776
        }
777
        if (left_type[1]) {
778
            h->left_cbp |= ((h->cbp_table[left_xy[1]]>>((left_block[2]&(~1))+1))&0x1) << 3;
779
        }
780
    }
781

    
782
#if 1
783
    if(IS_INTER(mb_type) || IS_DIRECT(mb_type)){
784
        int list;
785
        for(list=0; list<h->list_count; list++){
786
            if(!USES_LIST(mb_type, list) && !IS_DIRECT(mb_type) && !h->deblocking_filter){
787
                /*if(!h->mv_cache_clean[list]){
788
                    memset(h->mv_cache [list],  0, 8*5*2*sizeof(int16_t)); //FIXME clean only input? clean at all?
789
                    memset(h->ref_cache[list], PART_NOT_AVAILABLE, 8*5*sizeof(int8_t));
790
                    h->mv_cache_clean[list]= 1;
791
                }*/
792
                continue;
793
            }
794
            h->mv_cache_clean[list]= 0;
795

    
796
            if(USES_LIST(top_type, list)){
797
                const int b_xy= h->mb2b_xy[top_xy] + 3*h->b_stride;
798
                const int b8_xy= h->mb2b8_xy[top_xy] + h->b8_stride;
799
                *(uint32_t*)h->mv_cache[list][scan8[0] + 0 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + 0];
800
                *(uint32_t*)h->mv_cache[list][scan8[0] + 1 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + 1];
801
                *(uint32_t*)h->mv_cache[list][scan8[0] + 2 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + 2];
802
                *(uint32_t*)h->mv_cache[list][scan8[0] + 3 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + 3];
803
                h->ref_cache[list][scan8[0] + 0 - 1*8]=
804
                h->ref_cache[list][scan8[0] + 1 - 1*8]= s->current_picture.ref_index[list][b8_xy + 0];
805
                h->ref_cache[list][scan8[0] + 2 - 1*8]=
806
                h->ref_cache[list][scan8[0] + 3 - 1*8]= s->current_picture.ref_index[list][b8_xy + 1];
807
            }else{
808
                *(uint32_t*)h->mv_cache [list][scan8[0] + 0 - 1*8]=
809
                *(uint32_t*)h->mv_cache [list][scan8[0] + 1 - 1*8]=
810
                *(uint32_t*)h->mv_cache [list][scan8[0] + 2 - 1*8]=
811
                *(uint32_t*)h->mv_cache [list][scan8[0] + 3 - 1*8]= 0;
812
                *(uint32_t*)&h->ref_cache[list][scan8[0] + 0 - 1*8]= ((top_type ? LIST_NOT_USED : PART_NOT_AVAILABLE)&0xFF)*0x01010101;
813
            }
814

    
815
            for(i=0; i<2; i++){
816
                int cache_idx = scan8[0] - 1 + i*2*8;
817
                if(USES_LIST(left_type[i], list)){
818
                    const int b_xy= h->mb2b_xy[left_xy[i]] + 3;
819
                    const int b8_xy= h->mb2b8_xy[left_xy[i]] + 1;
820
                    *(uint32_t*)h->mv_cache[list][cache_idx  ]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + h->b_stride*left_block[0+i*2]];
821
                    *(uint32_t*)h->mv_cache[list][cache_idx+8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + h->b_stride*left_block[1+i*2]];
822
                    h->ref_cache[list][cache_idx  ]= s->current_picture.ref_index[list][b8_xy + h->b8_stride*(left_block[0+i*2]>>1)];
823
                    h->ref_cache[list][cache_idx+8]= s->current_picture.ref_index[list][b8_xy + h->b8_stride*(left_block[1+i*2]>>1)];
824
                }else{
825
                    *(uint32_t*)h->mv_cache [list][cache_idx  ]=
826
                    *(uint32_t*)h->mv_cache [list][cache_idx+8]= 0;
827
                    h->ref_cache[list][cache_idx  ]=
828
                    h->ref_cache[list][cache_idx+8]= left_type[i] ? LIST_NOT_USED : PART_NOT_AVAILABLE;
829
                }
830
            }
831

    
832
            if((for_deblock || (IS_DIRECT(mb_type) && !h->direct_spatial_mv_pred)) && !FRAME_MBAFF)
833
                continue;
834

    
835
            if(USES_LIST(topleft_type, list)){
836
                const int b_xy = h->mb2b_xy[topleft_xy] + 3 + 3*h->b_stride;
837
                const int b8_xy= h->mb2b8_xy[topleft_xy] + 1 + h->b8_stride;
838
                *(uint32_t*)h->mv_cache[list][scan8[0] - 1 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy];
839
                h->ref_cache[list][scan8[0] - 1 - 1*8]= s->current_picture.ref_index[list][b8_xy];
840
            }else{
841
                *(uint32_t*)h->mv_cache[list][scan8[0] - 1 - 1*8]= 0;
842
                h->ref_cache[list][scan8[0] - 1 - 1*8]= topleft_type ? LIST_NOT_USED : PART_NOT_AVAILABLE;
843
            }
844

    
845
            if(USES_LIST(topright_type, list)){
846
                const int b_xy= h->mb2b_xy[topright_xy] + 3*h->b_stride;
847
                const int b8_xy= h->mb2b8_xy[topright_xy] + h->b8_stride;
848
                *(uint32_t*)h->mv_cache[list][scan8[0] + 4 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy];
849
                h->ref_cache[list][scan8[0] + 4 - 1*8]= s->current_picture.ref_index[list][b8_xy];
850
            }else{
851
                *(uint32_t*)h->mv_cache [list][scan8[0] + 4 - 1*8]= 0;
852
                h->ref_cache[list][scan8[0] + 4 - 1*8]= topright_type ? LIST_NOT_USED : PART_NOT_AVAILABLE;
853
            }
854

    
855
            if((IS_SKIP(mb_type) || IS_DIRECT(mb_type)) && !FRAME_MBAFF)
856
                continue;
857

    
858
            h->ref_cache[list][scan8[5 ]+1] =
859
            h->ref_cache[list][scan8[7 ]+1] =
860
            h->ref_cache[list][scan8[13]+1] =  //FIXME remove past 3 (init somewhere else)
861
            h->ref_cache[list][scan8[4 ]] =
862
            h->ref_cache[list][scan8[12]] = PART_NOT_AVAILABLE;
863
            *(uint32_t*)h->mv_cache [list][scan8[5 ]+1]=
864
            *(uint32_t*)h->mv_cache [list][scan8[7 ]+1]=
865
            *(uint32_t*)h->mv_cache [list][scan8[13]+1]= //FIXME remove past 3 (init somewhere else)
866
            *(uint32_t*)h->mv_cache [list][scan8[4 ]]=
867
            *(uint32_t*)h->mv_cache [list][scan8[12]]= 0;
868

    
869
            if( h->pps.cabac ) {
870
                /* XXX beurk, Load mvd */
871
                if(USES_LIST(top_type, list)){
872
                    const int b_xy= h->mb2b_xy[top_xy] + 3*h->b_stride;
873
                    *(uint32_t*)h->mvd_cache[list][scan8[0] + 0 - 1*8]= *(uint32_t*)h->mvd_table[list][b_xy + 0];
874
                    *(uint32_t*)h->mvd_cache[list][scan8[0] + 1 - 1*8]= *(uint32_t*)h->mvd_table[list][b_xy + 1];
875
                    *(uint32_t*)h->mvd_cache[list][scan8[0] + 2 - 1*8]= *(uint32_t*)h->mvd_table[list][b_xy + 2];
876
                    *(uint32_t*)h->mvd_cache[list][scan8[0] + 3 - 1*8]= *(uint32_t*)h->mvd_table[list][b_xy + 3];
877
                }else{
878
                    *(uint32_t*)h->mvd_cache [list][scan8[0] + 0 - 1*8]=
879
                    *(uint32_t*)h->mvd_cache [list][scan8[0] + 1 - 1*8]=
880
                    *(uint32_t*)h->mvd_cache [list][scan8[0] + 2 - 1*8]=
881
                    *(uint32_t*)h->mvd_cache [list][scan8[0] + 3 - 1*8]= 0;
882
                }
883
                if(USES_LIST(left_type[0], list)){
884
                    const int b_xy= h->mb2b_xy[left_xy[0]] + 3;
885
                    *(uint32_t*)h->mvd_cache[list][scan8[0] - 1 + 0*8]= *(uint32_t*)h->mvd_table[list][b_xy + h->b_stride*left_block[0]];
886
                    *(uint32_t*)h->mvd_cache[list][scan8[0] - 1 + 1*8]= *(uint32_t*)h->mvd_table[list][b_xy + h->b_stride*left_block[1]];
887
                }else{
888
                    *(uint32_t*)h->mvd_cache [list][scan8[0] - 1 + 0*8]=
889
                    *(uint32_t*)h->mvd_cache [list][scan8[0] - 1 + 1*8]= 0;
890
                }
891
                if(USES_LIST(left_type[1], list)){
892
                    const int b_xy= h->mb2b_xy[left_xy[1]] + 3;
893
                    *(uint32_t*)h->mvd_cache[list][scan8[0] - 1 + 2*8]= *(uint32_t*)h->mvd_table[list][b_xy + h->b_stride*left_block[2]];
894
                    *(uint32_t*)h->mvd_cache[list][scan8[0] - 1 + 3*8]= *(uint32_t*)h->mvd_table[list][b_xy + h->b_stride*left_block[3]];
895
                }else{
896
                    *(uint32_t*)h->mvd_cache [list][scan8[0] - 1 + 2*8]=
897
                    *(uint32_t*)h->mvd_cache [list][scan8[0] - 1 + 3*8]= 0;
898
                }
899
                *(uint32_t*)h->mvd_cache [list][scan8[5 ]+1]=
900
                *(uint32_t*)h->mvd_cache [list][scan8[7 ]+1]=
901
                *(uint32_t*)h->mvd_cache [list][scan8[13]+1]= //FIXME remove past 3 (init somewhere else)
902
                *(uint32_t*)h->mvd_cache [list][scan8[4 ]]=
903
                *(uint32_t*)h->mvd_cache [list][scan8[12]]= 0;
904

    
905
                if(h->slice_type == B_TYPE){
906
                    fill_rectangle(&h->direct_cache[scan8[0]], 4, 4, 8, 0, 1);
907

    
908
                    if(IS_DIRECT(top_type)){
909
                        *(uint32_t*)&h->direct_cache[scan8[0] - 1*8]= 0x01010101;
910
                    }else if(IS_8X8(top_type)){
911
                        int b8_xy = h->mb2b8_xy[top_xy] + h->b8_stride;
912
                        h->direct_cache[scan8[0] + 0 - 1*8]= h->direct_table[b8_xy];
913
                        h->direct_cache[scan8[0] + 2 - 1*8]= h->direct_table[b8_xy + 1];
914
                    }else{
915
                        *(uint32_t*)&h->direct_cache[scan8[0] - 1*8]= 0;
916
                    }
917

    
918
                    if(IS_DIRECT(left_type[0]))
919
                        h->direct_cache[scan8[0] - 1 + 0*8]= 1;
920
                    else if(IS_8X8(left_type[0]))
921
                        h->direct_cache[scan8[0] - 1 + 0*8]= h->direct_table[h->mb2b8_xy[left_xy[0]] + 1 + h->b8_stride*(left_block[0]>>1)];
922
                    else
923
                        h->direct_cache[scan8[0] - 1 + 0*8]= 0;
924

    
925
                    if(IS_DIRECT(left_type[1]))
926
                        h->direct_cache[scan8[0] - 1 + 2*8]= 1;
927
                    else if(IS_8X8(left_type[1]))
928
                        h->direct_cache[scan8[0] - 1 + 2*8]= h->direct_table[h->mb2b8_xy[left_xy[1]] + 1 + h->b8_stride*(left_block[2]>>1)];
929
                    else
930
                        h->direct_cache[scan8[0] - 1 + 2*8]= 0;
931
                }
932
            }
933

    
934
            if(FRAME_MBAFF){
935
#define MAP_MVS\
936
                    MAP_F2F(scan8[0] - 1 - 1*8, topleft_type)\
937
                    MAP_F2F(scan8[0] + 0 - 1*8, top_type)\
938
                    MAP_F2F(scan8[0] + 1 - 1*8, top_type)\
939
                    MAP_F2F(scan8[0] + 2 - 1*8, top_type)\
940
                    MAP_F2F(scan8[0] + 3 - 1*8, top_type)\
941
                    MAP_F2F(scan8[0] + 4 - 1*8, topright_type)\
942
                    MAP_F2F(scan8[0] - 1 + 0*8, left_type[0])\
943
                    MAP_F2F(scan8[0] - 1 + 1*8, left_type[0])\
944
                    MAP_F2F(scan8[0] - 1 + 2*8, left_type[1])\
945
                    MAP_F2F(scan8[0] - 1 + 3*8, left_type[1])
946
                if(MB_FIELD){
947
#define MAP_F2F(idx, mb_type)\
948
                    if(!IS_INTERLACED(mb_type) && h->ref_cache[list][idx] >= 0){\
949
                        h->ref_cache[list][idx] <<= 1;\
950
                        h->mv_cache[list][idx][1] /= 2;\
951
                        h->mvd_cache[list][idx][1] /= 2;\
952
                    }
953
                    MAP_MVS
954
#undef MAP_F2F
955
                }else{
956
#define MAP_F2F(idx, mb_type)\
957
                    if(IS_INTERLACED(mb_type) && h->ref_cache[list][idx] >= 0){\
958
                        h->ref_cache[list][idx] >>= 1;\
959
                        h->mv_cache[list][idx][1] <<= 1;\
960
                        h->mvd_cache[list][idx][1] <<= 1;\
961
                    }
962
                    MAP_MVS
963
#undef MAP_F2F
964
                }
965
            }
966
        }
967
    }
968
#endif
969

    
970
    h->neighbor_transform_size= !!IS_8x8DCT(top_type) + !!IS_8x8DCT(left_type[0]);
971
}
972

    
973
/**
 * Copies the intra4x4 prediction modes of the current MB from the
 * per-MB cache back into the frame-wide intra4x4_pred_mode table.
 * Only the 7 entries needed as top/left context by later MBs are stored.
 */
static inline void write_back_intra_pred_mode(H264Context *h){
    MpegEncContext * const s = &h->s;
    const int mb_xy= s->mb_x + s->mb_y*s->mb_stride;
    /* cache positions (right column + bottom row of the 4x4 grid) that
     * neighbouring macroblocks will read as their left/top context */
    static const uint8_t cache_pos[7]= {
        7+8*1, 7+8*2, 7+8*3, 7+8*4, 4+8*4, 5+8*4, 6+8*4
    };
    int i;

    for(i=0; i<7; i++)
        h->intra4x4_pred_mode[mb_xy][i]= h->intra4x4_pred_mode_cache[cache_pos[i]];
}
985

    
986
/**
987
 * checks if the top & left blocks are available if needed & changes the dc mode so it only uses the available blocks.
988
 */
989
static inline int check_intra4x4_pred_mode(H264Context *h){
    MpegEncContext * const s = &h->s;
    /* remap tables: -1 = invalid mode for a missing neighbour (error),
     * 0 = keep the mode as is, otherwise the replacement DC mode */
    static const int8_t top [12]= {-1, 0,LEFT_DC_PRED,-1,-1,-1,-1,-1, 0};
    static const int8_t left[12]= { 0,-1, TOP_DC_PRED, 0,-1,-1,-1, 0,-1,DC_128_PRED};
    int i;

    /* top row of 4x4 blocks: only checked when the top samples are missing */
    if(!(h->top_samples_available&0x8000)){
        for(i=0; i<4; i++){
            const int remapped= top[ h->intra4x4_pred_mode_cache[scan8[0] + i] ];
            if(remapped<0){
                av_log(h->s.avctx, AV_LOG_ERROR, "top block unavailable for requested intra4x4 mode %d at %d %d\n", remapped, s->mb_x, s->mb_y);
                return -1;
            }
            if(remapped)
                h->intra4x4_pred_mode_cache[scan8[0] + i]= remapped;
        }
    }

    /* left column of 4x4 blocks: only checked when the left samples are missing */
    if(!(h->left_samples_available&0x8000)){
        for(i=0; i<4; i++){
            const int remapped= left[ h->intra4x4_pred_mode_cache[scan8[0] + 8*i] ];
            if(remapped<0){
                av_log(h->s.avctx, AV_LOG_ERROR, "left block unavailable for requested intra4x4 mode %d at %d %d\n", remapped, s->mb_x, s->mb_y);
                return -1;
            }
            if(remapped)
                h->intra4x4_pred_mode_cache[scan8[0] + 8*i]= remapped;
        }
    }

    return 0;
} //FIXME cleanup like next
1021

    
1022
/**
1023
 * checks if the top & left blocks are available if needed & changes the dc mode so it only uses the available blocks.
1024
 */
1025
static inline int check_intra_pred_mode(H264Context *h, int mode){
    MpegEncContext * const s = &h->s;
    /* -1 entries are modes that need the missing neighbour and cannot be fixed up */
    static const int8_t top_fallback [7]= {LEFT_DC_PRED8x8, 1,-1,-1};
    static const int8_t left_fallback[7]= { TOP_DC_PRED8x8,-1, 2,-1,DC_128_PRED8x8};
    int remapped = mode;

    /* unsigned compare also rejects negative modes */
    if(mode > 6U) {
        av_log(h->s.avctx, AV_LOG_ERROR, "out of range intra chroma pred mode at %d %d\n", s->mb_x, s->mb_y);
        return -1;
    }

    if(!(h->top_samples_available&0x8000)){
        remapped= top_fallback[ remapped ];
        if(remapped<0){
            av_log(h->s.avctx, AV_LOG_ERROR, "top block unavailable for requested intra mode at %d %d\n", s->mb_x, s->mb_y);
            return -1;
        }
    }

    if(!(h->left_samples_available&0x8000)){
        remapped= left_fallback[ remapped ];
        if(remapped<0){
            av_log(h->s.avctx, AV_LOG_ERROR, "left block unavailable for requested intra mode at %d %d\n", s->mb_x, s->mb_y);
            return -1;
        }
    }

    return remapped;
}
1053

    
1054
/**
1055
 * gets the predicted intra4x4 prediction mode.
1056
 */
1057
static inline int pred_intra_mode(H264Context *h, int n){
    const int index8= scan8[n];
    const int left= h->intra4x4_pred_mode_cache[index8 - 1];
    const int top = h->intra4x4_pred_mode_cache[index8 - 8];
    const int min= FFMIN(left, top);

    tprintf(h->s.avctx, "mode:%d %d min:%d\n", left ,top, min);

    /* a negative neighbour mode marks it unavailable -> fall back to DC */
    return min < 0 ? DC_PRED : min;
}
1068

    
1069
static inline void write_back_non_zero_count(H264Context *h){
    MpegEncContext * const s = &h->s;
    const int mb_xy= s->mb_x + s->mb_y*s->mb_stride;
    /* mapping from non_zero_count[mb_xy][0..12] to the cache positions that
     * neighbouring MBs will later read as top/left context */
    static const uint8_t cache_pos[13]= {
        7+8*1, 7+8*2, 7+8*3, 7+8*4, 4+8*4, 5+8*4, 6+8*4, /* luma edge */
        2+8*1, 2+8*2, 1+8*2,                             /* first chroma plane */
        2+8*4, 2+8*5, 1+8*5                              /* second chroma plane */
    };
    int i;

    for(i=0; i<13; i++)
        h->non_zero_count[mb_xy][i]= h->non_zero_count_cache[cache_pos[i]];

    if(FRAME_MBAFF){
        // store all luma nnzs, for deblocking
        int v = 0;
        for(i=0; i<16; i++)
            v += (!!h->non_zero_count_cache[scan8[i]]) << i;
        *(uint16_t*)&h->non_zero_count[mb_xy][14] = v;
    }
}
1097

    
1098
/**
1099
 * gets the predicted number of non zero coefficients.
1100
 * @param n block index
1101
 */
1102
static inline int pred_non_zero_count(H264Context *h, int n){
    const int index8= scan8[n];
    const int left= h->non_zero_count_cache[index8 - 1];
    const int top = h->non_zero_count_cache[index8 - 8];
    int pred= left + top;

    /* 64 is the "unavailable" marker; when neither neighbour carries it,
     * predict with the rounded average of the two counts */
    if(pred<64)
        pred= (pred+1)>>1;

    tprintf(h->s.avctx, "pred_nnz L%X T%X n%d s%d P%X\n", left, top, n, scan8[n], pred&31);

    return pred&31;
}
1114

    
1115
/**
 * Fetches the top-right ("C") motion vector predictor for block i.
 * Sets *C to the MV to use and returns the matching reference index;
 * when the top-right block is unavailable it falls back to the
 * top-left neighbour (i - 8 - 1) instead.
 */
static inline int fetch_diagonal_mv(H264Context *h, const int16_t **C, int i, int list, int part_width){
    const int topright_ref= h->ref_cache[list][ i - 8 + part_width ];
    MpegEncContext *s = &h->s;

    /* there is no consistent mapping of mvs to neighboring locations that will
     * make mbaff happy, so we can't move all this logic to fill_caches */
    if(FRAME_MBAFF){
        const uint32_t *mb_types = s->current_picture_ptr->mb_type;
        const int16_t *mv;
        /* scan8[0]-2 is a scratch cache slot used to return a remapped MV */
        *(uint32_t*)h->mv_cache[list][scan8[0]-2] = 0;
        *C = h->mv_cache[list][scan8[0]-2];

        if(!MB_FIELD
           && (s->mb_y&1) && i < scan8[0]+8 && topright_ref != PART_NOT_AVAILABLE){
            int topright_xy = s->mb_x + (s->mb_y-1)*s->mb_stride + (i == scan8[0]+3);
            if(IS_INTERLACED(mb_types[topright_xy])){
/* Re-reads the neighbour MV straight from the picture's motion_val at 4x4
 * position (X4,Y4), scaling the vertical component with MV_OP and the
 * reference index with REF_OP to convert between frame and field units.
 * NOTE: the macro RETURNS from fetch_diagonal_mv on both of its paths. */
#define SET_DIAG_MV(MV_OP, REF_OP, X4, Y4)\
                const int x4 = X4, y4 = Y4;\
                const int mb_type = mb_types[(x4>>2)+(y4>>2)*s->mb_stride];\
                if(!USES_LIST(mb_type,list) && !IS_8X8(mb_type))\
                    return LIST_NOT_USED;\
                mv = s->current_picture_ptr->motion_val[list][x4 + y4*h->b_stride];\
                h->mv_cache[list][scan8[0]-2][0] = mv[0];\
                h->mv_cache[list][scan8[0]-2][1] = mv[1] MV_OP;\
                return s->current_picture_ptr->ref_index[list][(x4>>1) + (y4>>1)*h->b8_stride] REF_OP;

                /* frame MB, field top-right neighbour: halve ref, double mv */
                SET_DIAG_MV(*2, >>1, s->mb_x*4+(i&7)-4+part_width, s->mb_y*4-1);
            }
        }
        /* top-right unavailable: try the left column neighbour instead
         * (only for blocks whose C predictor degenerates to the left MB) */
        if(topright_ref == PART_NOT_AVAILABLE
           && ((s->mb_y&1) || i >= scan8[0]+8) && (i&7)==4
           && h->ref_cache[list][scan8[0]-1] != PART_NOT_AVAILABLE){
            if(!MB_FIELD
               && IS_INTERLACED(mb_types[h->left_mb_xy[0]])){
                SET_DIAG_MV(*2, >>1, s->mb_x*4-1, (s->mb_y|1)*4+(s->mb_y&1)*2+(i>>4)-1);
            }
            if(MB_FIELD
               && !IS_INTERLACED(mb_types[h->left_mb_xy[0]])
               && i >= scan8[0]+8){
                // leftshift will turn LIST_NOT_USED into PART_NOT_AVAILABLE, but that's ok.
                SET_DIAG_MV(>>1, <<1, s->mb_x*4-1, (s->mb_y&~1)*4 - 1 + ((i-scan8[0])>>3)*2);
            }
        }
#undef SET_DIAG_MV
    }

    if(topright_ref != PART_NOT_AVAILABLE){
        *C= h->mv_cache[list][ i - 8 + part_width ];
        return topright_ref;
    }else{
        tprintf(s->avctx, "topright MV not available\n");

        /* fall back to the top-left neighbour */
        *C= h->mv_cache[list][ i - 8 - 1 ];
        return h->ref_cache[list][ i - 8 - 1 ];
    }
}
1171

    
1172
/**
1173
 * gets the predicted MV.
1174
 * @param n the block index
1175
 * @param part_width the width of the partition (4, 8,16) -> (1, 2, 4)
1176
 * @param mx the x component of the predicted motion vector
1177
 * @param my the y component of the predicted motion vector
1178
 */
1179
static inline void pred_motion(H264Context * const h, int n, int part_width, int list, int ref, int * const mx, int * const my){
    const int index8= scan8[n];
    const int top_ref=  h->ref_cache[list][ index8 - 8 ];
    const int left_ref= h->ref_cache[list][ index8 - 1 ];
    const int16_t * const A= h->mv_cache[list][ index8 - 1 ]; /* left neighbour */
    const int16_t * const B= h->mv_cache[list][ index8 - 8 ]; /* top neighbour */
    const int16_t * C;                                        /* diagonal neighbour */
    int diagonal_ref, match_count;

    assert(part_width==1 || part_width==2 || part_width==4);

    diagonal_ref= fetch_diagonal_mv(h, &C, index8, list, part_width);
    /* how many of the three neighbours use the same reference picture */
    match_count= (diagonal_ref==ref) + (top_ref==ref) + (left_ref==ref);
    tprintf(h->s.avctx, "pred_motion match_count=%d\n", match_count);

    switch(match_count){
    case 1:
        /* exactly one neighbour has the same reference: copy its MV */
        if(left_ref==ref){
            *mx= A[0];
            *my= A[1];
        }else if(top_ref==ref){
            *mx= B[0];
            *my= B[1];
        }else{
            *mx= C[0];
            *my= C[1];
        }
        break;
    case 0:
        if(top_ref == PART_NOT_AVAILABLE && diagonal_ref == PART_NOT_AVAILABLE && left_ref != PART_NOT_AVAILABLE){
            /* only the left neighbour exists: use it directly */
            *mx= A[0];
            *my= A[1];
            break;
        }
        /* fallthrough: otherwise use the median, same as the common case */
    default: /* match_count > 1 (most common) */
        *mx= mid_pred(A[0], B[0], C[0]);
        *my= mid_pred(A[1], B[1], C[1]);
        break;
    }

    tprintf(h->s.avctx, "pred_motion (%2d %2d %2d) (%2d %2d %2d) (%2d %2d %2d) -> (%2d %2d %2d) at %2d %2d %d list %d\n", top_ref, B[0], B[1], diagonal_ref, C[0], C[1], left_ref, A[0], A[1], ref, *mx, *my, h->s.mb_x, h->s.mb_y, n, list);
}
1227

    
1228
/**
1229
 * gets the directionally predicted 16x8 MV.
1230
 * @param n the block index
1231
 * @param mx the x component of the predicted motion vector
1232
 * @param my the y component of the predicted motion vector
1233
 */
1234
static inline void pred_16x8_motion(H264Context * const h, int n, int list, int ref, int * const mx, int * const my){
    /* upper partition (n==0) predicts from the top neighbour,
     * lower partition predicts from the left neighbour */
    const int cache_idx = n==0 ? scan8[0] - 8 : scan8[8] - 1;
    const int neighbor_ref       = h->ref_cache[list][cache_idx];
    const int16_t * const neighbor_mv = h->mv_cache[list][cache_idx];

    tprintf(h->s.avctx, "pred_16x8: (%2d %2d %2d) at %2d %2d %d list %d\n", neighbor_ref, neighbor_mv[0], neighbor_mv[1], h->s.mb_x, h->s.mb_y, n, list);

    if(neighbor_ref == ref){
        *mx= neighbor_mv[0];
        *my= neighbor_mv[1];
        return;
    }

    //RARE
    pred_motion(h, n, 4, list, ref, mx, my);
}
1262

    
1263
/**
1264
 * gets the directionally predicted 8x16 MV.
1265
 * @param n the block index
1266
 * @param mx the x component of the predicted motion vector
1267
 * @param my the y component of the predicted motion vector
1268
 */
1269
static inline void pred_8x16_motion(H264Context * const h, int n, int list, int ref, int * const mx, int * const my){
    if(n==0){
        /* left partition: predict from the left neighbour */
        const int left_ref=       h->ref_cache[list][ scan8[0] - 1 ];
        const int16_t * const A=  h->mv_cache[list][ scan8[0] - 1 ];

        tprintf(h->s.avctx, "pred_8x16: (%2d %2d %2d) at %2d %2d %d list %d\n", left_ref, A[0], A[1], h->s.mb_x, h->s.mb_y, n, list);

        if(left_ref == ref){
            *mx= A[0];
            *my= A[1];
            return;
        }
    }else{
        /* right partition: predict from the top-right diagonal neighbour */
        const int16_t * C;
        const int diagonal_ref= fetch_diagonal_mv(h, &C, scan8[4], list, 2);

        tprintf(h->s.avctx, "pred_8x16: (%2d %2d %2d) at %2d %2d %d list %d\n", diagonal_ref, C[0], C[1], h->s.mb_x, h->s.mb_y, n, list);

        if(diagonal_ref == ref){
            *mx= C[0];
            *my= C[1];
            return;
        }
    }

    //RARE
    pred_motion(h, n, 2, list, ref, mx, my);
}
1299

    
1300
static inline void pred_pskip_motion(H264Context * const h, int * const mx, int * const my){
    const int top_ref = h->ref_cache[0][ scan8[0] - 8 ];
    const int left_ref= h->ref_cache[0][ scan8[0] - 1 ];
    int use_zero_mv;

    tprintf(h->s.avctx, "pred_pskip: (%d) (%d) at %2d %2d\n", top_ref, left_ref, h->s.mb_x, h->s.mb_y);

    /* the skip MV is zero when a neighbour is missing or when a neighbour
     * references picture 0 with a zero MV */
    use_zero_mv = top_ref == PART_NOT_AVAILABLE || left_ref == PART_NOT_AVAILABLE;
    use_zero_mv |= top_ref == 0  && *(uint32_t*)h->mv_cache[0][ scan8[0] - 8 ] == 0;
    use_zero_mv |= left_ref == 0 && *(uint32_t*)h->mv_cache[0][ scan8[0] - 1 ] == 0;

    if(use_zero_mv){
        *mx = *my = 0;
        return;
    }

    /* otherwise fall back to the generic 16x16 predictor for ref 0 */
    pred_motion(h, 0, 4, 0, 0, mx, my);
}
1318

    
1319
static inline void direct_dist_scale_factor(H264Context * const h){
    const int poc  = h->s.current_picture_ptr->poc;
    const int poc1 = h->ref_list[1][0].poc;
    int i;

    /* per-reference temporal-direct scale factors from the POC distances */
    for(i=0; i<h->ref_count[0]; i++){
        const int poc0 = h->ref_list[0][i].poc;
        const int td = av_clip(poc1 - poc0, -128, 127);
        if(td == 0 /* FIXME || pic0 is a long-term ref */){
            h->dist_scale_factor[i] = 256;
            continue;
        }
        {
            const int tb = av_clip(poc - poc0, -128, 127);
            const int tx = (16384 + (FFABS(td) >> 1)) / td;
            h->dist_scale_factor[i] = av_clip((tb*tx + 32) >> 6, -1024, 1023);
        }
    }

    if(FRAME_MBAFF){
        /* duplicate each frame factor for the two field references */
        for(i=0; i<h->ref_count[0]; i++){
            h->dist_scale_factor_field[2*i  ] =
            h->dist_scale_factor_field[2*i+1] = h->dist_scale_factor[i];
        }
    }
}
1341
static inline void direct_ref_list_init(H264Context * const h){
    MpegEncContext * const s = &h->s;
    Picture * const ref1 = &h->ref_list[1][0];
    Picture * const cur = s->current_picture_ptr;
    int list, i, j;

    /* NOTE(review): these zeroings are immediately overwritten by the copy
     * loop below — presumably h->ref_count[] is already 0 for the unused
     * lists of I/P slices; confirm before removing either part. */
    if(cur->pict_type == I_TYPE)
        cur->ref_count[0] = 0;
    if(cur->pict_type != B_TYPE)
        cur->ref_count[1] = 0;

    /* record the current reference lists' sizes and POCs on the picture */
    for(list=0; list<2; list++){
        const int count = h->ref_count[list];
        cur->ref_count[list] = count;
        for(j=0; j<count; j++)
            cur->ref_poc[list][j] = h->ref_list[list][j].poc;
    }

    /* the col->list0 mapping is only needed for temporal direct B slices */
    if(cur->pict_type != B_TYPE || h->direct_spatial_mv_pred)
        return;

    for(list=0; list<2; list++){
        for(i=0; i<ref1->ref_count[list]; i++){
            const int poc = ref1->ref_poc[list][i];
            h->map_col_to_list0[list][i] = 0; /* bogus; fills in for missing frames */
            for(j=0; j<h->ref_count[list]; j++)
                if(h->ref_list[list][j].poc == poc){
                    h->map_col_to_list0[list][i] = j;
                    break;
                }
        }
    }

    if(FRAME_MBAFF){
        /* expand the frame mapping to the doubled field reference indices */
        for(list=0; list<2; list++){
            for(i=0; i<ref1->ref_count[list]; i++){
                const int frame_idx = h->map_col_to_list0[list][i];
                h->map_col_to_list0_field[list][2*i  ] = 2*frame_idx;
                h->map_col_to_list0_field[list][2*i+1] = 2*frame_idx+1;
            }
        }
    }
}
1378

    
1379
/**
 * Predicts motion for a B-direct macroblock (spatial or temporal mode).
 * Fills h->mv_cache / h->ref_cache / h->sub_mb_type for the current MB from
 * the co-located MB of the first list-1 reference, and rewrites *mb_type to
 * the effective partitioning.
 * @param mb_type in: the parsed mb_type (may have MB_TYPE_8x8 for B_8x8);
 *                out: the derived type with DIRECT2/partition flags set.
 */
static inline void pred_direct_motion(H264Context * const h, int *mb_type){
    MpegEncContext * const s = &h->s;
    const int mb_xy =   s->mb_x +   s->mb_y*s->mb_stride;
    const int b8_xy = 2*s->mb_x + 2*s->mb_y*h->b8_stride;
    const int b4_xy = 4*s->mb_x + 4*s->mb_y*h->b_stride;
    /* co-located data comes from the first picture in reference list 1 */
    const int mb_type_col = h->ref_list[1][0].mb_type[mb_xy];
    const int16_t (*l1mv0)[2] = (const int16_t (*)[2]) &h->ref_list[1][0].motion_val[0][b4_xy];
    const int16_t (*l1mv1)[2] = (const int16_t (*)[2]) &h->ref_list[1][0].motion_val[1][b4_xy];
    const int8_t *l1ref0 = &h->ref_list[1][0].ref_index[0][b8_xy];
    const int8_t *l1ref1 = &h->ref_list[1][0].ref_index[1][b8_xy];
    const int is_b8x8 = IS_8X8(*mb_type);
    unsigned int sub_mb_type;
    int i8, i4;

#define MB_TYPE_16x16_OR_INTRA (MB_TYPE_16x16|MB_TYPE_INTRA4x4|MB_TYPE_INTRA16x16|MB_TYPE_INTRA_PCM)
    /* choose block sizes from the co-located MB type */
    if(IS_8X8(mb_type_col) && !h->sps.direct_8x8_inference_flag){
        /* FIXME save sub mb types from previous frames (or derive from MVs)
         * so we know exactly what block size to use */
        sub_mb_type = MB_TYPE_8x8|MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2; /* B_SUB_4x4 */
        *mb_type =    MB_TYPE_8x8|MB_TYPE_L0L1;
    }else if(!is_b8x8 && (mb_type_col & MB_TYPE_16x16_OR_INTRA)){
        sub_mb_type = MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2; /* B_SUB_8x8 */
        *mb_type =    MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2; /* B_16x16 */
    }else{
        sub_mb_type = MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2; /* B_SUB_8x8 */
        *mb_type =    MB_TYPE_8x8|MB_TYPE_L0L1;
    }
    if(!is_b8x8)
        *mb_type |= MB_TYPE_DIRECT2;
    if(MB_FIELD)
        *mb_type |= MB_TYPE_INTERLACED;

    tprintf(s->avctx, "mb_type = %08x, sub_mb_type = %08x, is_b8x8 = %d, mb_type_col = %08x\n", *mb_type, sub_mb_type, is_b8x8, mb_type_col);

    if(h->direct_spatial_mv_pred){
        int ref[2];
        int mv[2][2];
        int list;

        /* FIXME interlacing + spatial direct uses wrong colocated block positions */

        /* ref = min(neighbors): left (A), top (B), top-right (C, falling
         * back to top-left when C is unavailable, i.e. -2) */
        for(list=0; list<2; list++){
            int refa = h->ref_cache[list][scan8[0] - 1];
            int refb = h->ref_cache[list][scan8[0] - 8];
            int refc = h->ref_cache[list][scan8[0] - 8 + 4];
            if(refc == -2)
                refc = h->ref_cache[list][scan8[0] - 8 - 1];
            ref[list] = refa;
            if(ref[list] < 0 || (refb < ref[list] && refb >= 0))
                ref[list] = refb;
            if(ref[list] < 0 || (refc < ref[list] && refc >= 0))
                ref[list] = refc;
            if(ref[list] < 0)
                ref[list] = -1;
        }

        if(ref[0] < 0 && ref[1] < 0){
            /* no usable neighbour references: predict zero MV, ref 0 both lists */
            ref[0] = ref[1] = 0;
            mv[0][0] = mv[0][1] =
            mv[1][0] = mv[1][1] = 0;
        }else{
            for(list=0; list<2; list++){
                if(ref[list] >= 0)
                    pred_motion(h, 0, 4, list, ref[list], &mv[list][0], &mv[list][1]);
                else
                    mv[list][0] = mv[list][1] = 0;
            }
        }

        /* drop the unused prediction direction from the MB/sub-MB types */
        if(ref[1] < 0){
            *mb_type &= ~MB_TYPE_P0L1;
            sub_mb_type &= ~MB_TYPE_P0L1;
        }else if(ref[0] < 0){
            *mb_type &= ~MB_TYPE_P0L0;
            sub_mb_type &= ~MB_TYPE_P0L0;
        }

        if(IS_16X16(*mb_type)){
            int a=0, b=0;

            fill_rectangle(&h->ref_cache[0][scan8[0]], 4, 4, 8, (uint8_t)ref[0], 1);
            fill_rectangle(&h->ref_cache[1][scan8[0]], 4, 4, 8, (uint8_t)ref[1], 1);
            /* colZeroFlag test: co-located block is inter, references index 0
             * and has a near-zero MV -> use zero MVs (a,b stay 0).
             * The x264_build check works around a pre-build-34 x264 bug. */
            if(!IS_INTRA(mb_type_col)
               && (   (l1ref0[0] == 0 && FFABS(l1mv0[0][0]) <= 1 && FFABS(l1mv0[0][1]) <= 1)
                   || (l1ref0[0]  < 0 && l1ref1[0] == 0 && FFABS(l1mv1[0][0]) <= 1 && FFABS(l1mv1[0][1]) <= 1
                       && (h->x264_build>33 || !h->x264_build)))){
                if(ref[0] > 0)
                    a= pack16to32(mv[0][0],mv[0][1]);
                if(ref[1] > 0)
                    b= pack16to32(mv[1][0],mv[1][1]);
            }else{
                a= pack16to32(mv[0][0],mv[0][1]);
                b= pack16to32(mv[1][0],mv[1][1]);
            }
            fill_rectangle(&h->mv_cache[0][scan8[0]], 4, 4, 8, a, 4);
            fill_rectangle(&h->mv_cache[1][scan8[0]], 4, 4, 8, b, 4);
        }else{
            for(i8=0; i8<4; i8++){
                const int x8 = i8&1;
                const int y8 = i8>>1;

                /* in B_8x8, only the partitions coded as direct get filled */
                if(is_b8x8 && !IS_DIRECT(h->sub_mb_type[i8]))
                    continue;
                h->sub_mb_type[i8] = sub_mb_type;

                fill_rectangle(&h->mv_cache[0][scan8[i8*4]], 2, 2, 8, pack16to32(mv[0][0],mv[0][1]), 4);
                fill_rectangle(&h->mv_cache[1][scan8[i8*4]], 2, 2, 8, pack16to32(mv[1][0],mv[1][1]), 4);
                fill_rectangle(&h->ref_cache[0][scan8[i8*4]], 2, 2, 8, (uint8_t)ref[0], 1);
                fill_rectangle(&h->ref_cache[1][scan8[i8*4]], 2, 2, 8, (uint8_t)ref[1], 1);

                /* col_zero_flag */
                if(!IS_INTRA(mb_type_col) && (   l1ref0[x8 + y8*h->b8_stride] == 0
                                              || (l1ref0[x8 + y8*h->b8_stride] < 0 && l1ref1[x8 + y8*h->b8_stride] == 0
                                                  && (h->x264_build>33 || !h->x264_build)))){
                    const int16_t (*l1mv)[2]= l1ref0[x8 + y8*h->b8_stride] == 0 ? l1mv0 : l1mv1;
                    if(IS_SUB_8X8(sub_mb_type)){
                        /* one co-located MV per 8x8 (direct_8x8_inference) */
                        const int16_t *mv_col = l1mv[x8*3 + y8*3*h->b_stride];
                        if(FFABS(mv_col[0]) <= 1 && FFABS(mv_col[1]) <= 1){
                            if(ref[0] == 0)
                                fill_rectangle(&h->mv_cache[0][scan8[i8*4]], 2, 2, 8, 0, 4);
                            if(ref[1] == 0)
                                fill_rectangle(&h->mv_cache[1][scan8[i8*4]], 2, 2, 8, 0, 4);
                        }
                    }else
                    for(i4=0; i4<4; i4++){
                        const int16_t *mv_col = l1mv[x8*2 + (i4&1) + (y8*2 + (i4>>1))*h->b_stride];
                        if(FFABS(mv_col[0]) <= 1 && FFABS(mv_col[1]) <= 1){
                            if(ref[0] == 0)
                                *(uint32_t*)h->mv_cache[0][scan8[i8*4+i4]] = 0;
                            if(ref[1] == 0)
                                *(uint32_t*)h->mv_cache[1][scan8[i8*4+i4]] = 0;
                        }
                    }
                }
            }
        }
    }else{ /* direct temporal mv pred */
        const int *map_col_to_list0[2] = {h->map_col_to_list0[0], h->map_col_to_list0[1]};
        const int *dist_scale_factor = h->dist_scale_factor;

        if(FRAME_MBAFF){
            if(IS_INTERLACED(*mb_type)){
                map_col_to_list0[0] = h->map_col_to_list0_field[0];
                map_col_to_list0[1] = h->map_col_to_list0_field[1];
                dist_scale_factor = h->dist_scale_factor_field;
            }
            /* frame/field mismatch with the co-located MB needs rescaling */
            if(IS_INTERLACED(*mb_type) != IS_INTERLACED(mb_type_col)){
                /* FIXME assumes direct_8x8_inference == 1 */
                const int pair_xy = s->mb_x + (s->mb_y&~1)*s->mb_stride;
                int mb_types_col[2];
                int y_shift;

                *mb_type = MB_TYPE_8x8|MB_TYPE_L0L1
                         | (is_b8x8 ? 0 : MB_TYPE_DIRECT2)
                         | (*mb_type & MB_TYPE_INTERLACED);
                sub_mb_type = MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2|MB_TYPE_16x16;

                if(IS_INTERLACED(*mb_type)){
                    /* frame to field scaling */
                    mb_types_col[0] = h->ref_list[1][0].mb_type[pair_xy];
                    mb_types_col[1] = h->ref_list[1][0].mb_type[pair_xy+s->mb_stride];
                    if(s->mb_y&1){
                        /* bottom field: step back to the top of the MB pair */
                        l1ref0 -= 2*h->b8_stride;
                        l1ref1 -= 2*h->b8_stride;
                        l1mv0 -= 4*h->b_stride;
                        l1mv1 -= 4*h->b_stride;
                    }
                    y_shift = 0;

                    if(   (mb_types_col[0] & MB_TYPE_16x16_OR_INTRA)
                       && (mb_types_col[1] & MB_TYPE_16x16_OR_INTRA)
                       && !is_b8x8)
                        *mb_type |= MB_TYPE_16x8;
                    else
                        *mb_type |= MB_TYPE_8x8;
                }else{
                    /* field to frame scaling */
                    /* col_mb_y = (mb_y&~1) + (topAbsDiffPOC < bottomAbsDiffPOC ? 0 : 1)
                     * but in MBAFF, top and bottom POC are equal */
                    int dy = (s->mb_y&1) ? 1 : 2;
                    mb_types_col[0] =
                    mb_types_col[1] = h->ref_list[1][0].mb_type[pair_xy+s->mb_stride];
                    l1ref0 += dy*h->b8_stride;
                    l1ref1 += dy*h->b8_stride;
                    l1mv0 += 2*dy*h->b_stride;
                    l1mv1 += 2*dy*h->b_stride;
                    y_shift = 2;

                    if((mb_types_col[0] & (MB_TYPE_16x16_OR_INTRA|MB_TYPE_16x8))
                       && !is_b8x8)
                        *mb_type |= MB_TYPE_16x16;
                    else
                        *mb_type |= MB_TYPE_8x8;
                }

                for(i8=0; i8<4; i8++){
                    const int x8 = i8&1;
                    const int y8 = i8>>1;
                    int ref0, scale;
                    const int16_t (*l1mv)[2]= l1mv0;

                    if(is_b8x8 && !IS_DIRECT(h->sub_mb_type[i8]))
                        continue;
                    h->sub_mb_type[i8] = sub_mb_type;

                    /* temporal direct always uses list-1 ref 0 */
                    fill_rectangle(&h->ref_cache[1][scan8[i8*4]], 2, 2, 8, 0, 1);
                    if(IS_INTRA(mb_types_col[y8])){
                        fill_rectangle(&h->ref_cache[0][scan8[i8*4]], 2, 2, 8, 0, 1);
                        fill_rectangle(&h-> mv_cache[0][scan8[i8*4]], 2, 2, 8, 0, 4);
                        fill_rectangle(&h-> mv_cache[1][scan8[i8*4]], 2, 2, 8, 0, 4);
                        continue;
                    }

                    ref0 = l1ref0[x8 + (y8*2>>y_shift)*h->b8_stride];
                    if(ref0 >= 0)
                        ref0 = map_col_to_list0[0][ref0*2>>y_shift];
                    else{
                        ref0 = map_col_to_list0[1][l1ref1[x8 + (y8*2>>y_shift)*h->b8_stride]*2>>y_shift];
                        l1mv= l1mv1;
                    }
                    scale = dist_scale_factor[ref0];
                    fill_rectangle(&h->ref_cache[0][scan8[i8*4]], 2, 2, 8, ref0, 1);

                    {
                        const int16_t *mv_col = l1mv[x8*3 + (y8*6>>y_shift)*h->b_stride];
                        int my_col = (mv_col[1]<<y_shift)/2; /* rescale vertical MV between frame/field units */
                        int mx = (scale * mv_col[0] + 128) >> 8;
                        int my = (scale * my_col + 128) >> 8;
                        fill_rectangle(&h->mv_cache[0][scan8[i8*4]], 2, 2, 8, pack16to32(mx,my), 4);
                        fill_rectangle(&h->mv_cache[1][scan8[i8*4]], 2, 2, 8, pack16to32(mx-mv_col[0],my-my_col), 4);
                    }
                }
                return;
            }
        }

        /* one-to-one mv scaling */

        if(IS_16X16(*mb_type)){
            int ref, mv0, mv1;

            fill_rectangle(&h->ref_cache[1][scan8[0]], 4, 4, 8, 0, 1);
            if(IS_INTRA(mb_type_col)){
                ref=mv0=mv1=0;
            }else{
                const int ref0 = l1ref0[0] >= 0 ? map_col_to_list0[0][l1ref0[0]]
                                                : map_col_to_list0[1][l1ref1[0]];
                const int scale = dist_scale_factor[ref0];
                const int16_t *mv_col = l1ref0[0] >= 0 ? l1mv0[0] : l1mv1[0];
                int mv_l0[2];
                /* mvL0 = (DistScaleFactor * mvCol + 128) >> 8; mvL1 = mvL0 - mvCol */
                mv_l0[0] = (scale * mv_col[0] + 128) >> 8;
                mv_l0[1] = (scale * mv_col[1] + 128) >> 8;
                ref= ref0;
                mv0= pack16to32(mv_l0[0],mv_l0[1]);
                mv1= pack16to32(mv_l0[0]-mv_col[0],mv_l0[1]-mv_col[1]);
            }
            fill_rectangle(&h->ref_cache[0][scan8[0]], 4, 4, 8, ref, 1);
            fill_rectangle(&h-> mv_cache[0][scan8[0]], 4, 4, 8, mv0, 4);
            fill_rectangle(&h-> mv_cache[1][scan8[0]], 4, 4, 8, mv1, 4);
        }else{
            for(i8=0; i8<4; i8++){
                const int x8 = i8&1;
                const int y8 = i8>>1;
                int ref0, scale;
                const int16_t (*l1mv)[2]= l1mv0;

                if(is_b8x8 && !IS_DIRECT(h->sub_mb_type[i8]))
                    continue;
                h->sub_mb_type[i8] = sub_mb_type;
                fill_rectangle(&h->ref_cache[1][scan8[i8*4]], 2, 2, 8, 0, 1);
                if(IS_INTRA(mb_type_col)){
                    fill_rectangle(&h->ref_cache[0][scan8[i8*4]], 2, 2, 8, 0, 1);
                    fill_rectangle(&h-> mv_cache[0][scan8[i8*4]], 2, 2, 8, 0, 4);
                    fill_rectangle(&h-> mv_cache[1][scan8[i8*4]], 2, 2, 8, 0, 4);
                    continue;
                }

                ref0 = l1ref0[x8 + y8*h->b8_stride];
                if(ref0 >= 0)
                    ref0 = map_col_to_list0[0][ref0];
                else{
                    ref0 = map_col_to_list0[1][l1ref1[x8 + y8*h->b8_stride]];
                    l1mv= l1mv1;
                }
                scale = dist_scale_factor[ref0];

                fill_rectangle(&h->ref_cache[0][scan8[i8*4]], 2, 2, 8, ref0, 1);
                if(IS_SUB_8X8(sub_mb_type)){
                    const int16_t *mv_col = l1mv[x8*3 + y8*3*h->b_stride];
                    int mx = (scale * mv_col[0] + 128) >> 8;
                    int my = (scale * mv_col[1] + 128) >> 8;
                    fill_rectangle(&h->mv_cache[0][scan8[i8*4]], 2, 2, 8, pack16to32(mx,my), 4);
                    fill_rectangle(&h->mv_cache[1][scan8[i8*4]], 2, 2, 8, pack16to32(mx-mv_col[0],my-mv_col[1]), 4);
                }else
                for(i4=0; i4<4; i4++){
                    const int16_t *mv_col = l1mv[x8*2 + (i4&1) + (y8*2 + (i4>>1))*h->b_stride];
                    int16_t *mv_l0 = h->mv_cache[0][scan8[i8*4+i4]];
                    mv_l0[0] = (scale * mv_col[0] + 128) >> 8;
                    mv_l0[1] = (scale * mv_col[1] + 128) >> 8;
                    *(uint32_t*)h->mv_cache[1][scan8[i8*4+i4]] =
                        pack16to32(mv_l0[0]-mv_col[0],mv_l0[1]-mv_col[1]);
                }
            }
        }
    }
}
1686

    
1687
/**
 * Writes the per-MB motion caches (mv_cache/ref_cache/mvd_cache/sub_mb_type)
 * back into the frame-wide tables of the current picture.
 */
static inline void write_back_motion(H264Context *h, int mb_type){
    MpegEncContext * const s = &h->s;
    const int b_xy = 4*s->mb_x + 4*s->mb_y*h->b_stride;   /* top-left 4x4 block index */
    const int b8_xy= 2*s->mb_x + 2*s->mb_y*h->b8_stride;  /* top-left 8x8 block index */
    int list;

    /* mark list-0 references unused so deblocking/direct pred see a clean state */
    if(!USES_LIST(mb_type, 0))
        fill_rectangle(&s->current_picture.ref_index[0][b8_xy], 2, 2, h->b8_stride, (uint8_t)LIST_NOT_USED, 1);

    for(list=0; list<h->list_count; list++){
        int y;
        if(!USES_LIST(mb_type, list))
            continue;

        /* copy 4 MVs (8 bytes) per row from the cache to the picture */
        for(y=0; y<4; y++){
            *(uint64_t*)s->current_picture.motion_val[list][b_xy + 0 + y*h->b_stride]= *(uint64_t*)h->mv_cache[list][scan8[0]+0 + 8*y];
            *(uint64_t*)s->current_picture.motion_val[list][b_xy + 2 + y*h->b_stride]= *(uint64_t*)h->mv_cache[list][scan8[0]+2 + 8*y];
        }
        if( h->pps.cabac ) {
            /* CABAC needs the MV differences of neighbours for context modelling */
            if(IS_SKIP(mb_type))
                fill_rectangle(h->mvd_table[list][b_xy], 4, 4, h->b_stride, 0, 4);
            else
            for(y=0; y<4; y++){
                *(uint64_t*)h->mvd_table[list][b_xy + 0 + y*h->b_stride]= *(uint64_t*)h->mvd_cache[list][scan8[0]+0 + 8*y];
                *(uint64_t*)h->mvd_table[list][b_xy + 2 + y*h->b_stride]= *(uint64_t*)h->mvd_cache[list][scan8[0]+2 + 8*y];
            }
        }

        {
            /* one reference index per 8x8 block */
            int8_t *ref_index = &s->current_picture.ref_index[list][b8_xy];
            ref_index[0+0*h->b8_stride]= h->ref_cache[list][scan8[0]];
            ref_index[1+0*h->b8_stride]= h->ref_cache[list][scan8[4]];
            ref_index[0+1*h->b8_stride]= h->ref_cache[list][scan8[8]];
            ref_index[1+1*h->b8_stride]= h->ref_cache[list][scan8[12]];
        }
    }

    /* CABAC B-slices: remember which 8x8 partitions were direct-coded
     * (entry [0] of the MB is implicit; only 1..3 are stored) */
    if(h->slice_type == B_TYPE && h->pps.cabac){
        if(IS_8X8(mb_type)){
            uint8_t *direct_table = &h->direct_table[b8_xy];
            direct_table[1+0*h->b8_stride] = IS_DIRECT(h->sub_mb_type[1]) ? 1 : 0;
            direct_table[0+1*h->b8_stride] = IS_DIRECT(h->sub_mb_type[2]) ? 1 : 0;
            direct_table[1+1*h->b8_stride] = IS_DIRECT(h->sub_mb_type[3]) ? 1 : 0;
        }
    }
}
1733

    
1734
/**
1735
 * Decodes a network abstraction layer unit.
1736
 * @param consumed is the number of bytes used as input
1737
 * @param length is the length of the array
1738
 * @param dst_length is the number of decoded bytes FIXME here or a decode rbsp tailing?
1739
 * @returns decoded bytes, might be src+1 if no escapes
1740
 */
1741
static uint8_t *decode_nal(H264Context *h, uint8_t *src, int *dst_length, int *consumed, int length){
1742
    int i, si, di;
1743
    uint8_t *dst;
1744

    
1745
//    src[0]&0x80;                //forbidden bit
1746
    h->nal_ref_idc= src[0]>>5;
1747
    h->nal_unit_type= src[0]&0x1F;
1748

    
1749
    src++; length--;
1750
#if 0
1751
    for(i=0; i<length; i++)
1752
        printf("%2X ", src[i]);
1753
#endif
1754
    for(i=0; i+1<length; i+=2){
1755
        if(src[i]) continue;
1756
        if(i>0 && src[i-1]==0) i--;
1757
        if(i+2<length && src[i+1]==0 && src[i+2]<=3){
1758
            if(src[i+2]!=3){
1759
                /* startcode, so we must be past the end */
1760
                length=i;
1761
            }
1762
            break;
1763
        }
1764
    }
1765

    
1766
    if(i>=length-1){ //no escaped 0
1767
        *dst_length= length;
1768
        *consumed= length+1; //+1 for the header
1769
        return src;
1770
    }
1771

    
1772
    h->rbsp_buffer= av_fast_realloc(h->rbsp_buffer, &h->rbsp_buffer_size, length);
1773
    dst= h->rbsp_buffer;
1774

    
1775
    if (dst == NULL){
1776
        return NULL;
1777
    }
1778

    
1779
//printf("decoding esc\n");
1780
    si=di=0;
1781
    while(si<length){
1782
        //remove escapes (very rare 1:2^22)
1783
        if(si+2<length && src[si]==0 && src[si+1]==0 && src[si+2]<=3){
1784
            if(src[si+2]==3){ //escape
1785
                dst[di++]= 0;
1786
                dst[di++]= 0;
1787
                si+=3;
1788
                continue;
1789
            }else //next start code
1790
                break;
1791
        }
1792

    
1793
        dst[di++]= src[si++];
1794
    }
1795

    
1796
    *dst_length= di;
1797
    *consumed= si + 1;//+1 for the header
1798
//FIXME store exact number of bits in the getbitcontext (its needed for decoding)
1799
    return dst;
1800
}
1801

    
1802
/**
1803
 * identifies the exact end of the bitstream
1804
 * @return the length of the trailing, or 0 if damaged
1805
 */
1806
static int decode_rbsp_trailing(H264Context *h, uint8_t *src){
1807
    int v= *src;
1808
    int r;
1809

    
1810
    tprintf(h->s.avctx, "rbsp trailing %X\n", v);
1811

    
1812
    for(r=1; r<9; r++){
1813
        if(v&1) return r;
1814
        v>>=1;
1815
    }
1816
    return 0;
1817
}
1818

    
1819
/**
1820
 * idct tranforms the 16 dc values and dequantize them.
1821
 * @param qp quantization parameter
1822
 */
1823
static void h264_luma_dc_dequant_idct_c(DCTELEM *block, int qp, int qmul){
1824
#define stride 16
1825
    int i;
1826
    int temp[16]; //FIXME check if this is a good idea
1827
    static const int x_offset[4]={0, 1*stride, 4* stride,  5*stride};
1828
    static const int y_offset[4]={0, 2*stride, 8* stride, 10*stride};
1829

    
1830
//memset(block, 64, 2*256);
1831
//return;
1832
    for(i=0; i<4; i++){
1833
        const int offset= y_offset[i];
1834
        const int z0= block[offset+stride*0] + block[offset+stride*4];
1835
        const int z1= block[offset+stride*0] - block[offset+stride*4];
1836
        const int z2= block[offset+stride*1] - block[offset+stride*5];
1837
        const int z3= block[offset+stride*1] + block[offset+stride*5];
1838

    
1839
        temp[4*i+0]= z0+z3;
1840
        temp[4*i+1]= z1+z2;
1841
        temp[4*i+2]= z1-z2;
1842
        temp[4*i+3]= z0-z3;
1843
    }
1844

    
1845
    for(i=0; i<4; i++){
1846
        const int offset= x_offset[i];
1847
        const int z0= temp[4*0+i] + temp[4*2+i];
1848
        const int z1= temp[4*0+i] - temp[4*2+i];
1849
        const int z2= temp[4*1+i] - temp[4*3+i];
1850
        const int z3= temp[4*1+i] + temp[4*3+i];
1851

    
1852
        block[stride*0 +offset]= ((((z0 + z3)*qmul + 128 ) >> 8)); //FIXME think about merging this into decode_resdual
1853
        block[stride*2 +offset]= ((((z1 + z2)*qmul + 128 ) >> 8));
1854
        block[stride*8 +offset]= ((((z1 - z2)*qmul + 128 ) >> 8));
1855
        block[stride*10+offset]= ((((z0 - z3)*qmul + 128 ) >> 8));
1856
    }
1857
}
1858

    
1859
#if 0
1860
/**
1861
 * dct tranforms the 16 dc values.
1862
 * @param qp quantization parameter ??? FIXME
1863
 */
1864
static void h264_luma_dc_dct_c(DCTELEM *block/*, int qp*/){
1865
//    const int qmul= dequant_coeff[qp][0];
1866
    int i;
1867
    int temp[16]; //FIXME check if this is a good idea
1868
    static const int x_offset[4]={0, 1*stride, 4* stride,  5*stride};
1869
    static const int y_offset[4]={0, 2*stride, 8* stride, 10*stride};
1870

1871
    for(i=0; i<4; i++){
1872
        const int offset= y_offset[i];
1873
        const int z0= block[offset+stride*0] + block[offset+stride*4];
1874
        const int z1= block[offset+stride*0] - block[offset+stride*4];
1875
        const int z2= block[offset+stride*1] - block[offset+stride*5];
1876
        const int z3= block[offset+stride*1] + block[offset+stride*5];
1877

1878
        temp[4*i+0]= z0+z3;
1879
        temp[4*i+1]= z1+z2;
1880
        temp[4*i+2]= z1-z2;
1881
        temp[4*i+3]= z0-z3;
1882
    }
1883

1884
    for(i=0; i<4; i++){
1885
        const int offset= x_offset[i];
1886
        const int z0= temp[4*0+i] + temp[4*2+i];
1887
        const int z1= temp[4*0+i] - temp[4*2+i];
1888
        const int z2= temp[4*1+i] - temp[4*3+i];
1889
        const int z3= temp[4*1+i] + temp[4*3+i];
1890

1891
        block[stride*0 +offset]= (z0 + z3)>>1;
1892
        block[stride*2 +offset]= (z1 + z2)>>1;
1893
        block[stride*8 +offset]= (z1 - z2)>>1;
1894
        block[stride*10+offset]= (z0 - z3)>>1;
1895
    }
1896
}
1897
#endif
1898

    
1899
#undef xStride
1900
#undef stride
1901

    
1902
/* 2x2 inverse Hadamard transform of the chroma DC values plus
 * dequantization by qmul (with >>7 scaling). */
static void chroma_dc_dequant_idct_c(DCTELEM *block, int qp, int qmul){
    const int stride= 16*2;
    const int xStride= 16;
    const int p00= block[stride*0 + xStride*0];
    const int p01= block[stride*0 + xStride*1];
    const int p10= block[stride*1 + xStride*0];
    const int p11= block[stride*1 + xStride*1];

    /* row butterflies */
    const int sum0= p00 + p01;
    const int dif0= p00 - p01;
    const int sum1= p10 + p11;
    const int dif1= p10 - p11;

    /* column butterflies + dequant */
    block[stride*0 + xStride*0]= ((sum0 + sum1)*qmul) >> 7;
    block[stride*0 + xStride*1]= ((dif0 + dif1)*qmul) >> 7;
    block[stride*1 + xStride*0]= ((sum0 - sum1)*qmul) >> 7;
    block[stride*1 + xStride*1]= ((dif0 - dif1)*qmul) >> 7;
}
1922

    
1923
#if 0
1924
static void chroma_dc_dct_c(DCTELEM *block){
1925
    const int stride= 16*2;
1926
    const int xStride= 16;
1927
    int a,b,c,d,e;
1928

1929
    a= block[stride*0 + xStride*0];
1930
    b= block[stride*0 + xStride*1];
1931
    c= block[stride*1 + xStride*0];
1932
    d= block[stride*1 + xStride*1];
1933

1934
    e= a-b;
1935
    a= a+b;
1936
    b= c-d;
1937
    c= c+d;
1938

1939
    block[stride*0 + xStride*0]= (a+c);
1940
    block[stride*0 + xStride*1]= (e+b);
1941
    block[stride*1 + xStride*0]= (a-c);
1942
    block[stride*1 + xStride*1]= (e-b);
1943
}
1944
#endif
1945

    
1946
/**
 * gets the chroma qp.
 */
static inline int get_chroma_qp(int chroma_qp_index_offset, int qscale){

    /* luma QP plus the PPS offset, clipped to [0,51], mapped through the
     * standard luma->chroma QP table */
    return chroma_qp[av_clip(qscale + chroma_qp_index_offset, 0, 51)];
}
1953

    
1954
//FIXME need to check that this doesnt overflow signed 32 bit for low qp, i am not sure, it's very close
//FIXME check that gcc inlines this (and optimizes intra & separate_dc stuff away)
/**
 * Quantizes a 4x4 block of coefficients in scan order.
 * @param scantable zigzag/field scan order to walk the block in
 * @param intra selects the larger intra rounding bias (1/3 vs 1/6)
 * @param separate_dc quantize block[0] with DC-specific shifts first
 * @return index (in scan order) of the last non-zero coefficient, or -1
 */
static inline int quantize_c(DCTELEM *block, uint8_t *scantable, int qscale, int intra, int separate_dc){
    int i;
    const int * const quant_table= quant_coeff[qscale];
    const int bias= intra ? (1<<QUANT_SHIFT)/3 : (1<<QUANT_SHIFT)/6;
    /* threshold trick: a single unsigned compare decides "level is zero
     * after quantization" for both signs at once */
    const unsigned int threshold1= (1<<QUANT_SHIFT) - bias - 1;
    const unsigned int threshold2= (threshold1<<1);
    int last_non_zero;

    if(separate_dc){
        if(qscale<=18){
            //avoid overflows
            /* low QP: use a table 18 entries up with QUANT_SHIFT-2 to keep
             * the DC product in 32 bits */
            const int dc_bias= intra ? (1<<(QUANT_SHIFT-2))/3 : (1<<(QUANT_SHIFT-2))/6;
            const unsigned int dc_threshold1= (1<<(QUANT_SHIFT-2)) - dc_bias - 1;
            const unsigned int dc_threshold2= (dc_threshold1<<1);

            int level= block[0]*quant_coeff[qscale+18][0];
            if(((unsigned)(level+dc_threshold1))>dc_threshold2){
                if(level>0){
                    level= (dc_bias + level)>>(QUANT_SHIFT-2);
                    block[0]= level;
                }else{
                    level= (dc_bias - level)>>(QUANT_SHIFT-2);
                    block[0]= -level;
                }
//                last_non_zero = i;
            }else{
                block[0]=0;
            }
        }else{
            /* DC uses one extra shift because the DC transform is unscaled */
            const int dc_bias= intra ? (1<<(QUANT_SHIFT+1))/3 : (1<<(QUANT_SHIFT+1))/6;
            const unsigned int dc_threshold1= (1<<(QUANT_SHIFT+1)) - dc_bias - 1;
            const unsigned int dc_threshold2= (dc_threshold1<<1);

            int level= block[0]*quant_table[0];
            if(((unsigned)(level+dc_threshold1))>dc_threshold2){
                if(level>0){
                    level= (dc_bias + level)>>(QUANT_SHIFT+1);
                    block[0]= level;
                }else{
                    level= (dc_bias - level)>>(QUANT_SHIFT+1);
                    block[0]= -level;
                }
//                last_non_zero = i;
            }else{
                block[0]=0;
            }
        }
        last_non_zero= 0;
        i=1; /* AC loop starts after the already-handled DC */
    }else{
        last_non_zero= -1;
        i=0;
    }

    /* AC coefficients in scan order */
    for(; i<16; i++){
        const int j= scantable[i];
        int level= block[j]*quant_table[j];

//        if(   bias+level >= (1<<(QMAT_SHIFT - 3))
//           || bias-level >= (1<<(QMAT_SHIFT - 3))){
        if(((unsigned)(level+threshold1))>threshold2){
            if(level>0){
                level= (bias + level)>>QUANT_SHIFT;
                block[j]= level;
            }else{
                level= (bias - level)>>QUANT_SHIFT;
                block[j]= -level;
            }
            last_non_zero = i;
        }else{
            block[j]=0;
        }
    }

    return last_non_zero;
}
2032

    
2033
/* Intra 4x4 vertical prediction: replicate the row above into all four rows. */
static void pred4x4_vertical_c(uint8_t *src, uint8_t *topright, int stride){
    int x, y;
    for(y=0; y<4; y++)
        for(x=0; x<4; x++)
            src[x + y*stride]= src[x - stride];
}
2040

    
2041
/* Intra 4x4 horizontal prediction: fill each row with its left neighbour. */
static void pred4x4_horizontal_c(uint8_t *src, uint8_t *topright, int stride){
    int x, y;
    for(y=0; y<4; y++){
        const uint8_t left= src[-1 + y*stride];
        for(x=0; x<4; x++)
            src[x + y*stride]= left;
    }
}
2047

    
2048
/* Intra 4x4 DC prediction: average of the 4 top and 4 left neighbours. */
static void pred4x4_dc_c(uint8_t *src, uint8_t *topright, int stride){
    int x, y, sum= 4; /* +4 for rounding before >>3 */
    uint8_t dc;

    for(x=0; x<4; x++)
        sum += src[x - stride] + src[-1 + x*stride];
    dc= (uint8_t)(sum >> 3);

    for(y=0; y<4; y++)
        for(x=0; x<4; x++)
            src[x + y*stride]= dc;
}
2057

    
2058
/* Intra 4x4 DC prediction using only the left neighbours (top unavailable). */
static void pred4x4_left_dc_c(uint8_t *src, uint8_t *topright, int stride){
    int x, y;
    const int sum= src[-1+0*stride] + src[-1+1*stride]
                 + src[-1+2*stride] + src[-1+3*stride];
    const uint8_t dc= (uint8_t)((sum + 2) >> 2);

    for(y=0; y<4; y++)
        for(x=0; x<4; x++)
            src[x + y*stride]= dc;
}
2066

    
2067
/* Intra 4x4 DC prediction using only the top neighbours (left unavailable). */
static void pred4x4_top_dc_c(uint8_t *src, uint8_t *topright, int stride){
    int x, y;
    const int sum= src[-stride] + src[1-stride] + src[2-stride] + src[3-stride];
    const uint8_t dc= (uint8_t)((sum + 2) >> 2);

    for(y=0; y<4; y++)
        for(x=0; x<4; x++)
            src[x + y*stride]= dc;
}
2075

    
2076
/* Intra 4x4 DC prediction with no neighbours available: mid-grey (128). */
static void pred4x4_128_dc_c(uint8_t *src, uint8_t *topright, int stride){
    int x, y;
    for(y=0; y<4; y++)
        for(x=0; x<4; x++)
            src[x + y*stride]= 128;
}
2082

    
2083

    
2084
/* Load the four top-right neighbour samples into t4..t7.
 * `topright` must already point at valid (possibly replicated) samples. */
#define LOAD_TOP_RIGHT_EDGE\
    const int t4= topright[0];\
    const int t5= topright[1];\
    const int t6= topright[2];\
    const int t7= topright[3];\

/* Load the four left neighbour samples into l0..l3. */
#define LOAD_LEFT_EDGE\
    const int l0= src[-1+0*stride];\
    const int l1= src[-1+1*stride];\
    const int l2= src[-1+2*stride];\
    const int l3= src[-1+3*stride];\

/* Load the four top neighbour samples into t0..t3. */
#define LOAD_TOP_EDGE\
    const int t0= src[ 0-1*stride];\
    const int t1= src[ 1-1*stride];\
    const int t2= src[ 2-1*stride];\
    const int t3= src[ 3-1*stride];\

    
2102
/* Intra 4x4 diagonal down-right prediction: every down-right diagonal is a
 * single 3-tap filtered value taken from the left/top-left/top border. */
static void pred4x4_down_right_c(uint8_t *src, uint8_t *topright, int stride){
    const int lt= src[-1-1*stride];
    const int t0= src[ 0-1*stride];
    const int t1= src[ 1-1*stride];
    const int t2= src[ 2-1*stride];
    const int t3= src[ 3-1*stride];
    const int l0= src[-1+0*stride];
    const int l1= src[-1+1*stride];
    const int l2= src[-1+2*stride];
    const int l3= src[-1+3*stride];
    int diag[7];
    int x, y;

    /* 7 filtered values, one per diagonal (index 3 = main diagonal) */
    diag[0]= (l3 + 2*l2 + l1 + 2)>>2;
    diag[1]= (l2 + 2*l1 + l0 + 2)>>2;
    diag[2]= (l1 + 2*l0 + lt + 2)>>2;
    diag[3]= (l0 + 2*lt + t0 + 2)>>2;
    diag[4]= (lt + 2*t0 + t1 + 2)>>2;
    diag[5]= (t0 + 2*t1 + t2 + 2)>>2;
    diag[6]= (t1 + 2*t2 + t3 + 2)>>2;

    for(y=0; y<4; y++)
        for(x=0; x<4; x++)
            src[x + y*stride]= diag[3 + x - y];
}
2124

    
2125
/* 4x4 diagonal down-left prediction: anti-diagonals are filled from the
 * top and top-right edges with the (a + 2b + c + 2) >> 2 filter; the last
 * pixel repeats t7 as padding. */
static void pred4x4_down_left_c(uint8_t *src, uint8_t *topright, int stride){
    LOAD_TOP_EDGE
    LOAD_TOP_RIGHT_EDGE
//    LOAD_LEFT_EDGE

    src[0+0*stride]=(t0 + t2 + 2*t1 + 2)>>2;
    src[1+0*stride]=
    src[0+1*stride]=(t1 + t3 + 2*t2 + 2)>>2;
    src[2+0*stride]=
    src[1+1*stride]=
    src[0+2*stride]=(t2 + t4 + 2*t3 + 2)>>2;
    src[3+0*stride]=
    src[2+1*stride]=
    src[1+2*stride]=
    src[0+3*stride]=(t3 + t5 + 2*t4 + 2)>>2;
    src[3+1*stride]=
    src[2+2*stride]=
    src[1+3*stride]=(t4 + t6 + 2*t5 + 2)>>2;
    src[3+2*stride]=
    src[2+3*stride]=(t5 + t7 + 2*t6 + 2)>>2;
    src[3+3*stride]=(t6 + 3*t7 + 2)>>2;  /* t8 does not exist; t7 is repeated */
}
2147

    
2148
/* 4x4 vertical-right prediction: even rows use 2-tap half-pel averages of
 * the top edge, odd rows use 3-tap filtered values; the left column feeds
 * the bottom-left corner pixels. l3 is loaded but unused by this mode. */
static void pred4x4_vertical_right_c(uint8_t *src, uint8_t *topright, int stride){
    const int lt= src[-1-1*stride];
    LOAD_TOP_EDGE
    LOAD_LEFT_EDGE
    const __attribute__((unused)) int unu= l3;

    src[0+0*stride]=
    src[1+2*stride]=(lt + t0 + 1)>>1;
    src[1+0*stride]=
    src[2+2*stride]=(t0 + t1 + 1)>>1;
    src[2+0*stride]=
    src[3+2*stride]=(t1 + t2 + 1)>>1;
    src[3+0*stride]=(t2 + t3 + 1)>>1;
    src[0+1*stride]=
    src[1+3*stride]=(l0 + 2*lt + t0 + 2)>>2;
    src[1+1*stride]=
    src[2+3*stride]=(lt + 2*t0 + t1 + 2)>>2;
    src[2+1*stride]=
    src[3+3*stride]=(t0 + 2*t1 + t2 + 2)>>2;
    src[3+1*stride]=(t1 + 2*t2 + t3 + 2)>>2;
    src[0+2*stride]=(lt + 2*l0 + l1 + 2)>>2;
    src[0+3*stride]=(l0 + 2*l1 + l2 + 2)>>2;
}
2171

    
2172
/* 4x4 vertical-left prediction: even rows use 2-tap averages, odd rows
 * 3-tap filtered values, of the top and top-right edges. t7 is loaded by
 * the macro but unused by this mode. */
static void pred4x4_vertical_left_c(uint8_t *src, uint8_t *topright, int stride){
    LOAD_TOP_EDGE
    LOAD_TOP_RIGHT_EDGE
    const __attribute__((unused)) int unu= t7;

    src[0+0*stride]=(t0 + t1 + 1)>>1;
    src[1+0*stride]=
    src[0+2*stride]=(t1 + t2 + 1)>>1;
    src[2+0*stride]=
    src[1+2*stride]=(t2 + t3 + 1)>>1;
    src[3+0*stride]=
    src[2+2*stride]=(t3 + t4+ 1)>>1;
    src[3+2*stride]=(t4 + t5+ 1)>>1;
    src[0+1*stride]=(t0 + 2*t1 + t2 + 2)>>2;
    src[1+1*stride]=
    src[0+3*stride]=(t1 + 2*t2 + t3 + 2)>>2;
    src[2+1*stride]=
    src[1+3*stride]=(t2 + 2*t3 + t4 + 2)>>2;
    src[3+1*stride]=
    src[2+3*stride]=(t3 + 2*t4 + t5 + 2)>>2;
    src[3+3*stride]=(t4 + 2*t5 + t6 + 2)>>2;
}
2194

    
2195
/* 4x4 horizontal-up prediction: built only from the left column; samples
 * below the block do not exist, so l3 is repeated as padding for the
 * lower-right region. */
static void pred4x4_horizontal_up_c(uint8_t *src, uint8_t *topright, int stride){
    LOAD_LEFT_EDGE

    src[0+0*stride]=(l0 + l1 + 1)>>1;
    src[1+0*stride]=(l0 + 2*l1 + l2 + 2)>>2;
    src[2+0*stride]=
    src[0+1*stride]=(l1 + l2 + 1)>>1;
    src[3+0*stride]=
    src[1+1*stride]=(l1 + 2*l2 + l3 + 2)>>2;
    src[2+1*stride]=
    src[0+2*stride]=(l2 + l3 + 1)>>1;
    src[3+1*stride]=
    src[1+2*stride]=(l2 + 2*l3 + l3 + 2)>>2;  /* l4 does not exist; l3 repeated */
    src[3+2*stride]=
    src[1+3*stride]=
    src[0+3*stride]=
    src[2+2*stride]=
    src[2+3*stride]=
    src[3+3*stride]=l3;
}
2215

    
2216
/* 4x4 horizontal-down prediction: even columns use 2-tap averages of the
 * left edge, odd columns 3-tap filtered values; the top row feeds the
 * upper-right corner. t3 is loaded by the macro but unused by this mode. */
static void pred4x4_horizontal_down_c(uint8_t *src, uint8_t *topright, int stride){
    const int lt= src[-1-1*stride];
    LOAD_TOP_EDGE
    LOAD_LEFT_EDGE
    const __attribute__((unused)) int unu= t3;

    src[0+0*stride]=
    src[2+1*stride]=(lt + l0 + 1)>>1;
    src[1+0*stride]=
    src[3+1*stride]=(l0 + 2*lt + t0 + 2)>>2;
    src[2+0*stride]=(lt + 2*t0 + t1 + 2)>>2;
    src[3+0*stride]=(t0 + 2*t1 + t2 + 2)>>2;
    src[0+1*stride]=
    src[2+2*stride]=(l0 + l1 + 1)>>1;
    src[1+1*stride]=
    src[3+2*stride]=(lt + 2*l0 + l1 + 2)>>2;
    src[0+2*stride]=
    src[2+3*stride]=(l1 + l2+ 1)>>1;
    src[1+2*stride]=
    src[3+3*stride]=(l0 + 2*l1 + l2 + 2)>>2;
    src[0+3*stride]=(l2 + l3 + 1)>>1;
    src[1+3*stride]=(l1 + 2*l2 + l3 + 2)>>2;
}
2239

    
2240
void ff_pred16x16_vertical_c(uint8_t *src, int stride){
    /* Vertical prediction: replicate the row above over all 16 rows.
     * The top row is read once as four 32-bit words. */
    const uint32_t c0= ((uint32_t*)(src-stride))[0];
    const uint32_t c1= ((uint32_t*)(src-stride))[1];
    const uint32_t c2= ((uint32_t*)(src-stride))[2];
    const uint32_t c3= ((uint32_t*)(src-stride))[3];
    int y;

    for(y=0; y<16; y++, src+=stride){
        uint32_t *row= (uint32_t*)src;
        row[0]= c0;
        row[1]= c1;
        row[2]= c2;
        row[3]= c3;
    }
}
2254

    
2255
void ff_pred16x16_horizontal_c(uint8_t *src, int stride){
    int y;

    /* Horizontal prediction: each of the 16 rows is filled with its
     * immediate left neighbour, replicated into four 32-bit words. */
    for(y=0; y<16; y++, src+=stride){
        uint32_t *row= (uint32_t*)src;
        row[0]= row[1]= row[2]= row[3]= src[-1]*0x01010101U;
    }
}
2265

    
2266
void ff_pred16x16_dc_c(uint8_t *src, int stride){
    int i, sum=0;
    uint32_t fill;

    /* Full DC prediction: mean of the 16 left and 16 top neighbours,
     * rounded to nearest, replicated over the whole 16x16 block. */
    for(i=0; i<16; i++)
        sum += src[-1+i*stride] + src[i-stride];

    fill= 0x01010101U*(uint32_t)((sum + 16)>>5);

    for(i=0; i<16; i++, src+=stride){
        uint32_t *row= (uint32_t*)src;
        row[0]= row[1]= row[2]= row[3]= fill;
    }
}
2286

    
2287
static void pred16x16_left_dc_c(uint8_t *src, int stride){
    int i, sum=0;
    uint32_t fill;

    /* DC prediction from the left edge only: mean of the 16 left
     * neighbours, rounded, replicated over the whole 16x16 block. */
    for(i=0; i<16; i++)
        sum += src[-1+i*stride];

    fill= 0x01010101U*(uint32_t)((sum + 8)>>4);

    for(i=0; i<16; i++, src+=stride){
        uint32_t *row= (uint32_t*)src;
        row[0]= row[1]= row[2]= row[3]= fill;
    }
}
2303

    
2304
static void pred16x16_top_dc_c(uint8_t *src, int stride){
    int i, sum=0;
    uint32_t fill;

    /* DC prediction from the top edge only: mean of the 16 top
     * neighbours, rounded, replicated over the whole 16x16 block. */
    for(i=0; i<16; i++)
        sum += src[i-stride];

    fill= 0x01010101U*(uint32_t)((sum + 8)>>4);

    for(i=0; i<16; i++, src+=stride){
        uint32_t *row= (uint32_t*)src;
        row[0]= row[1]= row[2]= row[3]= fill;
    }
}
2319

    
2320
void ff_pred16x16_128_dc_c(uint8_t *src, int stride){
    int y;

    /* No neighbours available: fill the 16x16 block with mid-grey (128). */
    for(y=0; y<16; y++, src+=stride){
        uint32_t *row= (uint32_t*)src;
        row[0]= row[1]= row[2]= row[3]= 0x01010101U*128U;
    }
}
2330

    
2331
/* 16x16 plane (gradient) prediction shared between H.264 and SVQ3.
 * Fits H (horizontal) and V (vertical) gradients from the border pixels,
 * then fills the block with a clipped linear ramp. `svq3` selects SVQ3's
 * alternative gradient rounding (which also swaps H and V). */
static inline void pred16x16_plane_compat_c(uint8_t *src, int stride, const int svq3){
  int i, j, k;
  int a;
  uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;  /* clip-to-[0,255] lookup table */
  const uint8_t * const src0 = src+7-stride;
  const uint8_t *src1 = src+8*stride-1;
  const uint8_t *src2 = src1-2*stride;      // == src+6*stride-1;
  int H = src0[1] - src0[-1];
  int V = src1[0] - src2[ 0];
  /* weighted sums of border differences, weights 1..8 */
  for(k=2; k<=8; ++k) {
    src1 += stride; src2 -= stride;
    H += k*(src0[k] - src0[-k]);
    V += k*(src1[0] - src2[ 0]);
  }
  if(svq3){
    H = ( 5*(H/4) ) / 16;
    V = ( 5*(V/4) ) / 16;

    /* required for 100% accuracy */
    i = H; H = V; V = i;
  }else{
    H = ( 5*H+32 ) >> 6;
    V = ( 5*V+32 ) >> 6;
  }

  /* a = 32 * (plane value at the top-left corner), in 1/32 pel units */
  a = 16*(src1[0] + src2[16] + 1) - 7*(V+H);
  for(j=16; j>0; --j) {
    int b = a;
    a += V;
    for(i=-16; i<0; i+=4) {
      src[16+i] = cm[ (b    ) >> 5 ];
      src[17+i] = cm[ (b+  H) >> 5 ];
      src[18+i] = cm[ (b+2*H) >> 5 ];
      src[19+i] = cm[ (b+3*H) >> 5 ];
      b += 4*H;
    }
    src += stride;
  }
}
2370

    
2371
/* Standard H.264 16x16 plane prediction (non-SVQ3 rounding). */
void ff_pred16x16_plane_c(uint8_t *src, int stride){
    pred16x16_plane_compat_c(src, stride, 0);
}
2374

    
2375
void ff_pred8x8_vertical_c(uint8_t *src, int stride){
    int y;
    /* Vertical prediction: replicate the row above over all 8 rows. */
    const uint32_t left_half = ((uint32_t*)(src-stride))[0];
    const uint32_t right_half= ((uint32_t*)(src-stride))[1];

    for(y=0; y<8; y++, src+=stride){
        ((uint32_t*)src)[0]= left_half;
        ((uint32_t*)src)[1]= right_half;
    }
}
2385

    
2386
void ff_pred8x8_horizontal_c(uint8_t *src, int stride){
    int y;

    /* Horizontal prediction: each row is filled with its left neighbour. */
    for(y=0; y<8; y++, src+=stride){
        uint32_t *row= (uint32_t*)src;
        row[0]= row[1]= src[-1]*0x01010101U;
    }
}
2394

    
2395
void ff_pred8x8_128_dc_c(uint8_t *src, int stride){
    int y;

    /* No neighbours available: fill the 8x8 block with mid-grey (128). */
    for(y=0; y<8; y++, src+=stride){
        uint32_t *row= (uint32_t*)src;
        row[0]= row[1]= 0x01010101U*128U;
    }
}
2403

    
2404
static void pred8x8_left_dc_c(uint8_t *src, int stride){
    int i;
    int sum_top=0, sum_bot=0;
    uint32_t fill_top, fill_bot;

    /* Chroma DC from the left edge only: the upper and lower 4-row
     * halves each use the rounded mean of their own four left pixels. */
    for(i=0; i<4; i++){
        sum_top += src[-1+ i   *stride];
        sum_bot += src[-1+(i+4)*stride];
    }
    fill_top= 0x01010101U*(uint32_t)((sum_top + 2)>>2);
    fill_bot= 0x01010101U*(uint32_t)((sum_bot + 2)>>2);

    for(i=0; i<4; i++){
        uint32_t *top_row= (uint32_t*)(src+ i   *stride);
        uint32_t *bot_row= (uint32_t*)(src+(i+4)*stride);
        top_row[0]= top_row[1]= fill_top;
        bot_row[0]= bot_row[1]= fill_bot;
    }
}
2425

    
2426
static void pred8x8_top_dc_c(uint8_t *src, int stride){
    int i;
    int sum_l=0, sum_r=0;
    uint32_t fill_l, fill_r;

    /* Chroma DC from the top edge only: the left and right 4-column
     * halves each use the rounded mean of their own four top pixels;
     * every row gets the same pair of fill words. */
    for(i=0; i<4; i++){
        sum_l += src[  i-stride];
        sum_r += src[4+i-stride];
    }
    fill_l= 0x01010101U*(uint32_t)((sum_l + 2)>>2);
    fill_r= 0x01010101U*(uint32_t)((sum_r + 2)>>2);

    for(i=0; i<8; i++){
        uint32_t *row= (uint32_t*)(src+i*stride);
        row[0]= fill_l;
        row[1]= fill_r;
    }
}
2447

    
2448

    
2449
void ff_pred8x8_dc_c(uint8_t *src, int stride){
    int i;
    int sum0=0, sum1=0, sum2=0;
    uint32_t dc0, dc1, dc2, dc3;

    /* Chroma DC prediction per 4x4 quadrant:
     *   top-left:     mean of 4 left + 4 top pixels,
     *   top-right:    mean of its 4 top pixels,
     *   bottom-left:  mean of its 4 left pixels,
     *   bottom-right: mean of top-right's top and bottom-left's left. */
    for(i=0; i<4; i++){
        sum0 += src[-1+i*stride] + src[i-stride]; /* TL: left + top */
        sum1 += src[4+i-stride];                  /* TR: top only   */
        sum2 += src[-1+(i+4)*stride];             /* BL: left only  */
    }
    dc3= 0x01010101U*(uint32_t)((sum1 + sum2 + 4)>>3);
    dc0= 0x01010101U*(uint32_t)((sum0 + 4)>>3);
    dc1= 0x01010101U*(uint32_t)((sum1 + 2)>>2);
    dc2= 0x01010101U*(uint32_t)((sum2 + 2)>>2);

    for(i=0; i<4; i++){
        uint32_t *top_row= (uint32_t*)(src+ i   *stride);
        uint32_t *bot_row= (uint32_t*)(src+(i+4)*stride);
        top_row[0]= dc0;
        top_row[1]= dc1;
        bot_row[0]= dc2;
        bot_row[1]= dc3;
    }
}
2473

    
2474
/* 8x8 chroma plane (gradient) prediction: fit H/V gradients from the
 * border pixels with weights 1..4, then fill the block with a clipped
 * linear ramp (values kept in 1/32 pel fixed point). */
void ff_pred8x8_plane_c(uint8_t *src, int stride){
  int j, k;
  int a;
  uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;  /* clip-to-[0,255] lookup table */
  const uint8_t * const src0 = src+3-stride;
  const uint8_t *src1 = src+4*stride-1;
  const uint8_t *src2 = src1-2*stride;      // == src+2*stride-1;
  int H = src0[1] - src0[-1];
  int V = src1[0] - src2[ 0];
  for(k=2; k<=4; ++k) {
    src1 += stride; src2 -= stride;
    H += k*(src0[k] - src0[-k]);
    V += k*(src1[0] - src2[ 0]);
  }
  H = ( 17*H+16 ) >> 5;
  V = ( 17*V+16 ) >> 5;

  /* a = 32 * (plane value at the top-left corner) */
  a = 16*(src1[0] + src2[8]+1) - 3*(V+H);
  for(j=8; j>0; --j) {
    int b = a;
    a += V;
    src[0] = cm[ (b    ) >> 5 ];
    src[1] = cm[ (b+  H) >> 5 ];
    src[2] = cm[ (b+2*H) >> 5 ];
    src[3] = cm[ (b+3*H) >> 5 ];
    src[4] = cm[ (b+4*H) >> 5 ];
    src[5] = cm[ (b+5*H) >> 5 ];
    src[6] = cm[ (b+6*H) >> 5 ];
    src[7] = cm[ (b+7*H) >> 5 ];
    src += stride;
  }
}
2506

    
2507
/* Helpers for the 8x8 luma prediction modes below. Unlike the 4x4 modes,
 * H.264 low-pass filters the reference samples for 8x8 intra prediction,
 * so these macros load *filtered* edge pixels into local const ints. */
#define SRC(x,y) src[(x)+(y)*stride]
#define PL(y) \
    const int l##y = (SRC(-1,y-1) + 2*SRC(-1,y) + SRC(-1,y+1) + 2) >> 2;
/* Filtered left edge l0..l7; l0 falls back to SRC(-1,0) when the
 * top-left sample is unavailable, l7 repeats SRC(-1,7) as padding. */
#define PREDICT_8x8_LOAD_LEFT \
    const int l0 = ((has_topleft ? SRC(-1,-1) : SRC(-1,0)) \
                     + 2*SRC(-1,0) + SRC(-1,1) + 2) >> 2; \
    PL(1) PL(2) PL(3) PL(4) PL(5) PL(6) \
    const int l7 attribute_unused = (SRC(-1,6) + 3*SRC(-1,7) + 2) >> 2

#define PT(x) \
    const int t##x = (SRC(x-1,-1) + 2*SRC(x,-1) + SRC(x+1,-1) + 2) >> 2;
/* Filtered top edge t0..t7, with top-left/top-right availability fallbacks. */
#define PREDICT_8x8_LOAD_TOP \
    const int t0 = ((has_topleft ? SRC(-1,-1) : SRC(0,-1)) \
                     + 2*SRC(0,-1) + SRC(1,-1) + 2) >> 2; \
    PT(1) PT(2) PT(3) PT(4) PT(5) PT(6) \
    const int t7 attribute_unused = ((has_topright ? SRC(8,-1) : SRC(7,-1)) \
                     + 2*SRC(7,-1) + SRC(6,-1) + 2) >> 2

#define PTR(x) \
    t##x = (SRC(x-1,-1) + 2*SRC(x,-1) + SRC(x+1,-1) + 2) >> 2;
/* Filtered top-right edge t8..t15; replicates SRC(7,-1) when unavailable. */
#define PREDICT_8x8_LOAD_TOPRIGHT \
    int t8, t9, t10, t11, t12, t13, t14, t15; \
    if(has_topright) { \
        PTR(8) PTR(9) PTR(10) PTR(11) PTR(12) PTR(13) PTR(14) \
        t15 = (SRC(14,-1) + 3*SRC(15,-1) + 2) >> 2; \
    } else t8=t9=t10=t11=t12=t13=t14=t15= SRC(7,-1);

/* Filtered top-left corner sample. */
#define PREDICT_8x8_LOAD_TOPLEFT \
    const int lt = (SRC(-1,0) + 2*SRC(-1,-1) + SRC(0,-1) + 2) >> 2

/* Fill the whole 8x8 block with the 32-bit fill pattern v. */
#define PREDICT_8x8_DC(v) \
    int y; \
    for( y = 0; y < 8; y++ ) { \
        ((uint32_t*)src)[0] = \
        ((uint32_t*)src)[1] = v; \
        src += stride; \
    }
2544

    
2545
/* 8x8 luma DC prediction with no neighbours: fill with mid-grey (128). */
static void pred8x8l_128_dc_c(uint8_t *src, int has_topleft, int has_topright, int stride)
{
    PREDICT_8x8_DC(0x80808080);
}
2549
/* 8x8 luma DC prediction from the filtered left edge only. */
static void pred8x8l_left_dc_c(uint8_t *src, int has_topleft, int has_topright, int stride)
{
    PREDICT_8x8_LOAD_LEFT;
    const uint32_t dc = ((l0+l1+l2+l3+l4+l5+l6+l7+4) >> 3) * 0x01010101;
    PREDICT_8x8_DC(dc);
}
2555
/* 8x8 luma DC prediction from the filtered top edge only. */
static void pred8x8l_top_dc_c(uint8_t *src, int has_topleft, int has_topright, int stride)
{
    PREDICT_8x8_LOAD_TOP;
    const uint32_t dc = ((t0+t1+t2+t3+t4+t5+t6+t7+4) >> 3) * 0x01010101;
    PREDICT_8x8_DC(dc);
}
2561
/* 8x8 luma DC prediction from both filtered edges (left + top, 16 samples). */
static void pred8x8l_dc_c(uint8_t *src, int has_topleft, int has_topright, int stride)
{
    PREDICT_8x8_LOAD_LEFT;
    PREDICT_8x8_LOAD_TOP;
    const uint32_t dc = ((l0+l1+l2+l3+l4+l5+l6+l7
                         +t0+t1+t2+t3+t4+t5+t6+t7+8) >> 4) * 0x01010101;
    PREDICT_8x8_DC(dc);
}
2569
/* 8x8 luma horizontal prediction: each row replicates its filtered left pixel. */
static void pred8x8l_horizontal_c(uint8_t *src, int has_topleft, int has_topright, int stride)
{
    PREDICT_8x8_LOAD_LEFT;
#define ROW(y) ((uint32_t*)(src+y*stride))[0] =\
               ((uint32_t*)(src+y*stride))[1] = 0x01010101 * l##y
    ROW(0); ROW(1); ROW(2); ROW(3); ROW(4); ROW(5); ROW(6); ROW(7);
#undef ROW
}
2577
/* 8x8 luma vertical prediction: write the filtered top edge into row 0,
 * then copy that row (as one 64-bit word) into the remaining 7 rows. */
static void pred8x8l_vertical_c(uint8_t *src, int has_topleft, int has_topright, int stride)
{
    int y;
    PREDICT_8x8_LOAD_TOP;
    src[0] = t0;
    src[1] = t1;
    src[2] = t2;
    src[3] = t3;
    src[4] = t4;
    src[5] = t5;
    src[6] = t6;
    src[7] = t7;
    for( y = 1; y < 8; y++ )
        *(uint64_t*)(src+y*stride) = *(uint64_t*)src;
}
2592
/* 8x8 luma diagonal down-left prediction: anti-diagonals filled from the
 * filtered top and top-right edges with the (a + 2b + c + 2) >> 2 filter. */
static void pred8x8l_down_left_c(uint8_t *src, int has_topleft, int has_topright, int stride)
{
    PREDICT_8x8_LOAD_TOP;
    PREDICT_8x8_LOAD_TOPRIGHT;
    SRC(0,0)= (t0 + 2*t1 + t2 + 2) >> 2;
    SRC(0,1)=SRC(1,0)= (t1 + 2*t2 + t3 + 2) >> 2;
    SRC(0,2)=SRC(1,1)=SRC(2,0)= (t2 + 2*t3 + t4 + 2) >> 2;
    SRC(0,3)=SRC(1,2)=SRC(2,1)=SRC(3,0)= (t3 + 2*t4 + t5 + 2) >> 2;
    SRC(0,4)=SRC(1,3)=SRC(2,2)=SRC(3,1)=SRC(4,0)= (t4 + 2*t5 + t6 + 2) >> 2;
    SRC(0,5)=SRC(1,4)=SRC(2,3)=SRC(3,2)=SRC(4,1)=SRC(5,0)= (t5 + 2*t6 + t7 + 2) >> 2;
    SRC(0,6)=SRC(1,5)=SRC(2,4)=SRC(3,3)=SRC(4,2)=SRC(5,1)=SRC(6,0)= (t6 + 2*t7 + t8 + 2) >> 2;
    SRC(0,7)=SRC(1,6)=SRC(2,5)=SRC(3,4)=SRC(4,3)=SRC(5,2)=SRC(6,1)=SRC(7,0)= (t7 + 2*t8 + t9 + 2) >> 2;
    SRC(1,7)=SRC(2,6)=SRC(3,5)=SRC(4,4)=SRC(5,3)=SRC(6,2)=SRC(7,1)= (t8 + 2*t9 + t10 + 2) >> 2;
    SRC(2,7)=SRC(3,6)=SRC(4,5)=SRC(5,4)=SRC(6,3)=SRC(7,2)= (t9 + 2*t10 + t11 + 2) >> 2;
    SRC(3,7)=SRC(4,6)=SRC(5,5)=SRC(6,4)=SRC(7,3)= (t10 + 2*t11 + t12 + 2) >> 2;
    SRC(4,7)=SRC(5,6)=SRC(6,5)=SRC(7,4)= (t11 + 2*t12 + t13 + 2) >> 2;
    SRC(5,7)=SRC(6,6)=SRC(7,5)= (t12 + 2*t13 + t14 + 2) >> 2;
    SRC(6,7)=SRC(7,6)= (t13 + 2*t14 + t15 + 2) >> 2;
    SRC(7,7)= (t14 + 3*t15 + 2) >> 2;  /* t16 does not exist; t15 repeated */
}
2612
/* 8x8 luma diagonal down-right prediction: each down-right diagonal gets
 * one 3-tap filtered value from the filtered left edge, top-left corner
 * and top edge. */
static void pred8x8l_down_right_c(uint8_t *src, int has_topleft, int has_topright, int stride)
{
    PREDICT_8x8_LOAD_TOP;
    PREDICT_8x8_LOAD_LEFT;
    PREDICT_8x8_LOAD_TOPLEFT;
    SRC(0,7)= (l7 + 2*l6 + l5 + 2) >> 2;
    SRC(0,6)=SRC(1,7)= (l6 + 2*l5 + l4 + 2) >> 2;
    SRC(0,5)=SRC(1,6)=SRC(2,7)= (l5 + 2*l4 + l3 + 2) >> 2;
    SRC(0,4)=SRC(1,5)=SRC(2,6)=SRC(3,7)= (l4 + 2*l3 + l2 + 2) >> 2;
    SRC(0,3)=SRC(1,4)=SRC(2,5)=SRC(3,6)=SRC(4,7)= (l3 + 2*l2 + l1 + 2) >> 2;
    SRC(0,2)=SRC(1,3)=SRC(2,4)=SRC(3,5)=SRC(4,6)=SRC(5,7)= (l2 + 2*l1 + l0 + 2) >> 2;
    SRC(0,1)=SRC(1,2)=SRC(2,3)=SRC(3,4)=SRC(4,5)=SRC(5,6)=SRC(6,7)= (l1 + 2*l0 + lt + 2) >> 2;
    SRC(0,0)=SRC(1,1)=SRC(2,2)=SRC(3,3)=SRC(4,4)=SRC(5,5)=SRC(6,6)=SRC(7,7)= (l0 + 2*lt + t0 + 2) >> 2;
    SRC(1,0)=SRC(2,1)=SRC(3,2)=SRC(4,3)=SRC(5,4)=SRC(6,5)=SRC(7,6)= (lt + 2*t0 + t1 + 2) >> 2;
    SRC(2,0)=SRC(3,1)=SRC(4,2)=SRC(5,3)=SRC(6,4)=SRC(7,5)= (t0 + 2*t1 + t2 + 2) >> 2;
    SRC(3,0)=SRC(4,1)=SRC(5,2)=SRC(6,3)=SRC(7,4)= (t1 + 2*t2 + t3 + 2) >> 2;
    SRC(4,0)=SRC(5,1)=SRC(6,2)=SRC(7,3)= (t2 + 2*t3 + t4 + 2) >> 2;
    SRC(5,0)=SRC(6,1)=SRC(7,2)= (t3 + 2*t4 + t5 + 2) >> 2;
    SRC(6,0)=SRC(7,1)= (t4 + 2*t5 + t6 + 2) >> 2;
    SRC(7,0)= (t5 + 2*t6 + t7 + 2) >> 2;
}
2634
/* 8x8 luma vertical-right prediction: even rows use 2-tap half-pel
 * averages of the top edge, odd rows 3-tap filtered values; the left
 * column feeds the bottom-left pixels. */
static void pred8x8l_vertical_right_c(uint8_t *src, int has_topleft, int has_topright, int stride)
{
    PREDICT_8x8_LOAD_TOP;
    PREDICT_8x8_LOAD_LEFT;
    PREDICT_8x8_LOAD_TOPLEFT;
    SRC(0,6)= (l5 + 2*l4 + l3 + 2) >> 2;
    SRC(0,7)= (l6 + 2*l5 + l4 + 2) >> 2;
    SRC(0,4)=SRC(1,6)= (l3 + 2*l2 + l1 + 2) >> 2;
    SRC(0,5)=SRC(1,7)= (l4 + 2*l3 + l2 + 2) >> 2;
    SRC(0,2)=SRC(1,4)=SRC(2,6)= (l1 + 2*l0 + lt + 2) >> 2;
    SRC(0,3)=SRC(1,5)=SRC(2,7)= (l2 + 2*l1 + l0 + 2) >> 2;
    SRC(0,1)=SRC(1,3)=SRC(2,5)=SRC(3,7)= (l0 + 2*lt + t0 + 2) >> 2;
    SRC(0,0)=SRC(1,2)=SRC(2,4)=SRC(3,6)= (lt + t0 + 1) >> 1;
    SRC(1,1)=SRC(2,3)=SRC(3,5)=SRC(4,7)= (lt + 2*t0 + t1 + 2) >> 2;
    SRC(1,0)=SRC(2,2)=SRC(3,4)=SRC(4,6)= (t0 + t1 + 1) >> 1;
    SRC(2,1)=SRC(3,3)=SRC(4,5)=SRC(5,7)= (t0 + 2*t1 + t2 + 2) >> 2;
    SRC(2,0)=SRC(3,2)=SRC(4,4)=SRC(5,6)= (t1 + t2 + 1) >> 1;
    SRC(3,1)=SRC(4,3)=SRC(5,5)=SRC(6,7)= (t1 + 2*t2 + t3 + 2) >> 2;
    SRC(3,0)=SRC(4,2)=SRC(5,4)=SRC(6,6)= (t2 + t3 + 1) >> 1;
    SRC(4,1)=SRC(5,3)=SRC(6,5)=SRC(7,7)= (t2 + 2*t3 + t4 + 2) >> 2;
    SRC(4,0)=SRC(5,2)=SRC(6,4)=SRC(7,6)= (t3 + t4 + 1) >> 1;
    SRC(5,1)=SRC(6,3)=SRC(7,5)= (t3 + 2*t4 + t5 + 2) >> 2;
    SRC(5,0)=SRC(6,2)=SRC(7,4)= (t4 + t5 + 1) >> 1;
    SRC(6,1)=SRC(7,3)= (t4 + 2*t5 + t6 + 2) >> 2;
    SRC(6,0)=SRC(7,2)= (t5 + t6 + 1) >> 1;
    SRC(7,1)= (t5 + 2*t6 + t7 + 2) >> 2;
    SRC(7,0)= (t6 + t7 + 1) >> 1;
}
2662
/* 8x8 luma horizontal-down prediction: even columns use 2-tap averages of
 * the filtered left edge, odd columns 3-tap filtered values; the top edge
 * feeds the upper-right region. */
static void pred8x8l_horizontal_down_c(uint8_t *src, int has_topleft, int has_topright, int stride)
{
    PREDICT_8x8_LOAD_TOP;
    PREDICT_8x8_LOAD_LEFT;
    PREDICT_8x8_LOAD_TOPLEFT;
    SRC(0,7)= (l6 + l7 + 1) >> 1;
    SRC(1,7)= (l5 + 2*l6 + l7 + 2) >> 2;
    SRC(0,6)=SRC(2,7)= (l5 + l6 + 1) >> 1;
    SRC(1,6)=SRC(3,7)= (l4 + 2*l5 + l6 + 2) >> 2;
    SRC(0,5)=SRC(2,6)=SRC(4,7)= (l4 + l5 + 1) >> 1;
    SRC(1,5)=SRC(3,6)=SRC(5,7)= (l3 + 2*l4 + l5 + 2) >> 2;
    SRC(0,4)=SRC(2,5)=SRC(4,6)=SRC(6,7)= (l3 + l4 + 1) >> 1;
    SRC(1,4)=SRC(3,5)=SRC(5,6)=SRC(7,7)= (l2 + 2*l3 + l4 + 2) >> 2;
    SRC(0,3)=SRC(2,4)=SRC(4,5)=SRC(6,6)= (l2 + l3 + 1) >> 1;
    SRC(1,3)=SRC(3,4)=SRC(5,5)=SRC(7,6)= (l1 + 2*l2 + l3 + 2) >> 2;
    SRC(0,2)=SRC(2,3)=SRC(4,4)=SRC(6,5)= (l1 + l2 + 1) >> 1;
    SRC(1,2)=SRC(3,3)=SRC(5,4)=SRC(7,5)= (l0 + 2*l1 + l2 + 2) >> 2;
    SRC(0,1)=SRC(2,2)=SRC(4,3)=SRC(6,4)= (l0 + l1 + 1) >> 1;
    SRC(1,1)=SRC(3,2)=SRC(5,3)=SRC(7,4)= (lt + 2*l0 + l1 + 2) >> 2;
    SRC(0,0)=SRC(2,1)=SRC(4,2)=SRC(6,3)= (lt + l0 + 1) >> 1;
    SRC(1,0)=SRC(3,1)=SRC(5,2)=SRC(7,3)= (l0 + 2*lt + t0 + 2) >> 2;
    SRC(2,0)=SRC(4,1)=SRC(6,2)= (t1 + 2*t0 + lt + 2) >> 2;
    SRC(3,0)=SRC(5,1)=SRC(7,2)= (t2 + 2*t1 + t0 + 2) >> 2;
    SRC(4,0)=SRC(6,1)= (t3 + 2*t2 + t1 + 2) >> 2;
    SRC(5,0)=SRC(7,1)= (t4 + 2*t3 + t2 + 2) >> 2;
    SRC(6,0)= (t5 + 2*t4 + t3 + 2) >> 2;
    SRC(7,0)= (t6 + 2*t5 + t4 + 2) >> 2;
}
2690
/* 8x8 luma vertical-left prediction: even rows use 2-tap half-pel
 * averages, odd rows 3-tap filtered values, of the filtered top and
 * top-right edges. */
static void pred8x8l_vertical_left_c(uint8_t *src, int has_topleft, int has_topright, int stride)
{
    PREDICT_8x8_LOAD_TOP;
    PREDICT_8x8_LOAD_TOPRIGHT;
    SRC(0,0)= (t0 + t1 + 1) >> 1;
    SRC(0,1)= (t0 + 2*t1 + t2 + 2) >> 2;
    SRC(0,2)=SRC(1,0)= (t1 + t2 + 1) >> 1;
    SRC(0,3)=SRC(1,1)= (t1 + 2*t2 + t3 + 2) >> 2;
    SRC(0,4)=SRC(1,2)=SRC(2,0)= (t2 + t3 + 1) >> 1;
    SRC(0,5)=SRC(1,3)=SRC(2,1)= (t2 + 2*t3 + t4 + 2) >> 2;
    SRC(0,6)=SRC(1,4)=SRC(2,2)=SRC(3,0)= (t3 + t4 + 1) >> 1;
    SRC(0,7)=SRC(1,5)=SRC(2,3)=SRC(3,1)= (t3 + 2*t4 + t5 + 2) >> 2;
    SRC(1,6)=SRC(2,4)=SRC(3,2)=SRC(4,0)= (t4 + t5 + 1) >> 1;
    SRC(1,7)=SRC(2,5)=SRC(3,3)=SRC(4,1)= (t4 + 2*t5 + t6 + 2) >> 2;
    SRC(2,6)=SRC(3,4)=SRC(4,2)=SRC(5,0)= (t5 + t6 + 1) >> 1;
    SRC(2,7)=SRC(3,5)=SRC(4,3)=SRC(5,1)= (t5 + 2*t6 + t7 + 2) >> 2;
    SRC(3,6)=SRC(4,4)=SRC(5,2)=SRC(6,0)= (t6 + t7 + 1) >> 1;
    SRC(3,7)=SRC(4,5)=SRC(5,3)=SRC(6,1)= (t6 + 2*t7 + t8 + 2) >> 2;
    SRC(4,6)=SRC(5,4)=SRC(6,2)=SRC(7,0)= (t7 + t8 + 1) >> 1;
    SRC(4,7)=SRC(5,5)=SRC(6,3)=SRC(7,1)= (t7 + 2*t8 + t9 + 2) >> 2;
    SRC(5,6)=SRC(6,4)=SRC(7,2)= (t8 + t9 + 1) >> 1;
    SRC(5,7)=SRC(6,5)=SRC(7,3)= (t8 + 2*t9 + t10 + 2) >> 2;
    SRC(6,6)=SRC(7,4)= (t9 + t10 + 1) >> 1;
    SRC(6,7)=SRC(7,5)= (t9 + 2*t10 + t11 + 2) >> 2;
    SRC(7,6)= (t10 + t11 + 1) >> 1;
    SRC(7,7)= (t10 + 2*t11 + t12 + 2) >> 2;
}
2717
/* 8x8 luma horizontal-up prediction: built only from the filtered left
 * edge; samples below the block do not exist, so l7 is repeated over the
 * lower-right region. */
static void pred8x8l_horizontal_up_c(uint8_t *src, int has_topleft, int has_topright, int stride)
{
    PREDICT_8x8_LOAD_LEFT;
    SRC(0,0)= (l0 + l1 + 1) >> 1;
    SRC(1,0)= (l0 + 2*l1 + l2 + 2) >> 2;
    SRC(0,1)=SRC(2,0)= (l1 + l2 + 1) >> 1;
    SRC(1,1)=SRC(3,0)= (l1 + 2*l2 + l3 + 2) >> 2;
    SRC(0,2)=SRC(2,1)=SRC(4,0)= (l2 + l3 + 1) >> 1;
    SRC(1,2)=SRC(3,1)=SRC(5,0)= (l2 + 2*l3 + l4 + 2) >> 2;
    SRC(0,3)=SRC(2,2)=SRC(4,1)=SRC(6,0)= (l3 + l4 + 1) >> 1;
    SRC(1,3)=SRC(3,2)=SRC(5,1)=SRC(7,0)= (l3 + 2*l4 + l5 + 2) >> 2;
    SRC(0,4)=SRC(2,3)=SRC(4,2)=SRC(6,1)= (l4 + l5 + 1) >> 1;
    SRC(1,4)=SRC(3,3)=SRC(5,2)=SRC(7,1)= (l4 + 2*l5 + l6 + 2) >> 2;
    SRC(0,5)=SRC(2,4)=SRC(4,3)=SRC(6,2)= (l5 + l6 + 1) >> 1;
    SRC(1,5)=SRC(3,4)=SRC(5,3)=SRC(7,2)= (l5 + 2*l6 + l7 + 2) >> 2;
    SRC(0,6)=SRC(2,5)=SRC(4,4)=SRC(6,3)= (l6 + l7 + 1) >> 1;
    SRC(1,6)=SRC(3,5)=SRC(5,4)=SRC(7,3)= (l6 + 3*l7 + 2) >> 2;  /* l8 does not exist */
    SRC(0,7)=SRC(1,7)=SRC(2,6)=SRC(2,7)=SRC(3,6)=
    SRC(3,7)=SRC(4,5)=SRC(4,6)=SRC(4,7)=SRC(5,5)=
    SRC(5,6)=SRC(5,7)=SRC(6,4)=SRC(6,5)=SRC(6,6)=
    SRC(6,7)=SRC(7,4)=SRC(7,5)=SRC(7,6)=SRC(7,7)= l7;
}
2739
/* The 8x8 luma prediction helpers end here; drop their scratch macros so
 * they cannot leak into the rest of the file. */
#undef PREDICT_8x8_LOAD_LEFT
#undef PREDICT_8x8_LOAD_TOP
#undef PREDICT_8x8_LOAD_TOPLEFT
#undef PREDICT_8x8_LOAD_TOPRIGHT
#undef PREDICT_8x8_DC
#undef PTR
#undef PT
#undef PL
#undef SRC
2748

    
2749
/* Motion compensation for one partition from one reference list.
 * Reads the motion vector for block n from mv_cache, does the quarter-pel
 * luma and eighth-pel chroma interpolation into dest_y/cb/cr, and falls
 * back to ff_emulated_edge_mc when the reference area crosses the edge of
 * the padded picture. `square`/`delta` describe whether a second qpel op
 * is needed for a non-square partition and where it lands. */
static inline void mc_dir_part(H264Context *h, Picture *pic, int n, int square, int chroma_height, int delta, int list,
                           uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
                           int src_x_offset, int src_y_offset,
                           qpel_mc_func *qpix_op, h264_chroma_mc_func chroma_op){
    MpegEncContext * const s = &h->s;
    const int mx= h->mv_cache[list][ scan8[n] ][0] + src_x_offset*8;  /* luma mv in 1/4 pel */
    int my=       h->mv_cache[list][ scan8[n] ][1] + src_y_offset*8;
    const int luma_xy= (mx&3) + ((my&3)<<2);  /* fractional part selects the qpel filter */
    uint8_t * src_y = pic->data[0] + (mx>>2) + (my>>2)*h->mb_linesize;
    uint8_t * src_cb, * src_cr;
    int extra_width= h->emu_edge_width;
    int extra_height= h->emu_edge_height;
    int emu=0;
    const int full_mx= mx>>2;
    const int full_my= my>>2;
    const int pic_width  = 16*s->mb_width;
    const int pic_height = 16*s->mb_height >> MB_MBAFF;

    if(!pic->data[0]) //FIXME this is unacceptable, some senseable error concealment must be done for missing reference frames
        return;

    /* sub-pel filtering reads 3 extra pixels outside the block */
    if(mx&7) extra_width -= 3;
    if(my&7) extra_height -= 3;

    if(   full_mx < 0-extra_width
       || full_my < 0-extra_height
       || full_mx + 16/*FIXME*/ > pic_width + extra_width
       || full_my + 16/*FIXME*/ > pic_height + extra_height){
        ff_emulated_edge_mc(s->edge_emu_buffer, src_y - 2 - 2*h->mb_linesize, h->mb_linesize, 16+5, 16+5/*FIXME*/, full_mx-2, full_my-2, pic_width, pic_height);
            src_y= s->edge_emu_buffer + 2 + 2*h->mb_linesize;
        emu=1;
    }

    qpix_op[luma_xy](dest_y, src_y, h->mb_linesize); //FIXME try variable height perhaps?
    if(!square){
        qpix_op[luma_xy](dest_y + delta, src_y + delta, h->mb_linesize);
    }

    if(s->flags&CODEC_FLAG_GRAY) return;  /* luma-only decode requested */

    if(MB_MBAFF){
        // chroma offset when predicting from a field of opposite parity
        my += 2 * ((s->mb_y & 1) - (h->ref_cache[list][scan8[n]] & 1));
        emu |= (my>>3) < 0 || (my>>3) + 8 >= (pic_height>>1);
    }
    src_cb= pic->data[1] + (mx>>3) + (my>>3)*h->mb_uvlinesize;
    src_cr= pic->data[2] + (mx>>3) + (my>>3)*h->mb_uvlinesize;

    if(emu){
        ff_emulated_edge_mc(s->edge_emu_buffer, src_cb, h->mb_uvlinesize, 9, 9/*FIXME*/, (mx>>3), (my>>3), pic_width>>1, pic_height>>1);
            src_cb= s->edge_emu_buffer;
    }
    chroma_op(dest_cb, src_cb, h->mb_uvlinesize, chroma_height, mx&7, my&7);

    if(emu){
        ff_emulated_edge_mc(s->edge_emu_buffer, src_cr, h->mb_uvlinesize, 9, 9/*FIXME*/, (mx>>3), (my>>3), pic_width>>1, pic_height>>1);
            src_cr= s->edge_emu_buffer;
    }
    chroma_op(dest_cr, src_cr, h->mb_uvlinesize, chroma_height, mx&7, my&7);
}
2809

    
2810
/* Unweighted motion compensation for one partition: run mc_dir_part for
 * list 0 with the `put` functions and, for bi-prediction, again for
 * list 1 with the `avg` functions so the second prediction is averaged
 * into the first. */
static inline void mc_part_std(H264Context *h, int n, int square, int chroma_height, int delta,
                           uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
                           int x_offset, int y_offset,
                           qpel_mc_func *qpix_put, h264_chroma_mc_func chroma_put,
                           qpel_mc_func *qpix_avg, h264_chroma_mc_func chroma_avg,
                           int list0, int list1){
    MpegEncContext * const s = &h->s;
    qpel_mc_func *qpix_op=  qpix_put;
    h264_chroma_mc_func chroma_op= chroma_put;

    /* advance dest pointers to the partition, then make the offsets
     * absolute in picture coordinates (chroma units) */
    dest_y  += 2*x_offset + 2*y_offset*h->  mb_linesize;
    dest_cb +=   x_offset +   y_offset*h->mb_uvlinesize;
    dest_cr +=   x_offset +   y_offset*h->mb_uvlinesize;
    x_offset += 8*s->mb_x;
    y_offset += 8*(s->mb_y >> MB_MBAFF);

    if(list0){
        Picture *ref= &h->ref_list[0][ h->ref_cache[0][ scan8[n] ] ];
        mc_dir_part(h, ref, n, square, chroma_height, delta, 0,
                           dest_y, dest_cb, dest_cr, x_offset, y_offset,
                           qpix_op, chroma_op);

        /* second direction averages into the first */
        qpix_op=  qpix_avg;
        chroma_op= chroma_avg;
    }

    if(list1){
        Picture *ref= &h->ref_list[1][ h->ref_cache[1][ scan8[n] ] ];
        mc_dir_part(h, ref, n, square, chroma_height, delta, 1,
                           dest_y, dest_cb, dest_cr, x_offset, y_offset,
                           qpix_op, chroma_op);
    }
}
2843

    
2844
/* Weighted motion compensation for one partition. For bi-prediction both
 * references are interpolated (the second into a scratchpad) and combined
 * with either implicit weights (use_weight == 2) or explicit per-ref
 * weights/offsets; for uni-prediction the single prediction is scaled in
 * place with the explicit weight tables. */
static inline void mc_part_weighted(H264Context *h, int n, int square, int chroma_height, int delta,
                           uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
                           int x_offset, int y_offset,
                           qpel_mc_func *qpix_put, h264_chroma_mc_func chroma_put,
                           h264_weight_func luma_weight_op, h264_weight_func chroma_weight_op,
                           h264_biweight_func luma_weight_avg, h264_biweight_func chroma_weight_avg,
                           int list0, int list1){
    MpegEncContext * const s = &h->s;

    /* advance dest pointers to the partition, then make the offsets
     * absolute in picture coordinates (chroma units) */
    dest_y  += 2*x_offset + 2*y_offset*h->  mb_linesize;
    dest_cb +=   x_offset +   y_offset*h->mb_uvlinesize;
    dest_cr +=   x_offset +   y_offset*h->mb_uvlinesize;
    x_offset += 8*s->mb_x;
    y_offset += 8*(s->mb_y >> MB_MBAFF);

    if(list0 && list1){
        /* don't optimize for luma-only case, since B-frames usually
         * use implicit weights => chroma too. */
        uint8_t *tmp_cb = s->obmc_scratchpad;
        uint8_t *tmp_cr = s->obmc_scratchpad + 8;
        uint8_t *tmp_y  = s->obmc_scratchpad + 8*h->mb_uvlinesize;
        int refn0 = h->ref_cache[0][ scan8[n] ];
        int refn1 = h->ref_cache[1][ scan8[n] ];

        mc_dir_part(h, &h->ref_list[0][refn0], n, square, chroma_height, delta, 0,
                    dest_y, dest_cb, dest_cr,
                    x_offset, y_offset, qpix_put, chroma_put);
        mc_dir_part(h, &h->ref_list[1][refn1], n, square, chroma_height, delta, 1,
                    tmp_y, tmp_cb, tmp_cr,
                    x_offset, y_offset, qpix_put, chroma_put);

        if(h->use_weight == 2){
            /* implicit weighting: weights sum to 64, log2 denom 5 */
            int weight0 = h->implicit_weight[refn0][refn1];
            int weight1 = 64 - weight0;
            luma_weight_avg(  dest_y,  tmp_y,  h->  mb_linesize, 5, weight0, weight1, 0);
            chroma_weight_avg(dest_cb, tmp_cb, h->mb_uvlinesize, 5, weight0, weight1, 0);
            chroma_weight_avg(dest_cr, tmp_cr, h->mb_uvlinesize, 5, weight0, weight1, 0);
        }else{
            /* explicit weighting from the slice-header weight tables */
            luma_weight_avg(dest_y, tmp_y, h->mb_linesize, h->luma_log2_weight_denom,
                            h->luma_weight[0][refn0], h->luma_weight[1][refn1],
                            h->luma_offset[0][refn0] + h->luma_offset[1][refn1]);
            chroma_weight_avg(dest_cb, tmp_cb, h->mb_uvlinesize, h->chroma_log2_weight_denom,
                            h->chroma_weight[0][refn0][0], h->chroma_weight[1][refn1][0],
                            h->chroma_offset[0][refn0][0] + h->chroma_offset[1][refn1][0]);
            chroma_weight_avg(dest_cr, tmp_cr, h->mb_uvlinesize, h->chroma_log2_weight_denom,
                            h->chroma_weight[0][refn0][1], h->chroma_weight[1][refn1][1],
                            h->chroma_offset[0][refn0][1] + h->chroma_offset[1][refn1][1]);
        }
    }else{
        int list = list1 ? 1 : 0;
        int refn = h->ref_cache[list][ scan8[n] ];
        Picture *ref= &h->ref_list[list][refn];
        mc_dir_part(h, ref, n, square, chroma_height, delta, list,
                    dest_y, dest_cb, dest_cr, x_offset, y_offset,
                    qpix_put, chroma_put);

        luma_weight_op(dest_y, h->mb_linesize, h->luma_log2_weight_denom,
                       h->luma_weight[list][refn], h->luma_offset[list][refn]);
        if(h->use_weight_chroma){
            chroma_weight_op(dest_cb, h->mb_uvlinesize, h->chroma_log2_weight_denom,
                             h->chroma_weight[list][refn][0], h->chroma_offset[list][refn][0]);
            chroma_weight_op(dest_cr, h->mb_uvlinesize, h->chroma_log2_weight_denom,
                             h->chroma_weight[list][refn][1], h->chroma_offset[list][refn][1]);
        }
    }
}
2910

    
2911
static inline void mc_part(H264Context *h, int n, int square, int chroma_height, int delta,
2912
                           uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
2913
                           int x_offset, int y_offset,
2914
                           qpel_mc_func *qpix_put, h264_chroma_mc_func chroma_put,
2915
                           qpel_mc_func *qpix_avg, h264_chroma_mc_func chroma_avg,
2916
                           h264_weight_func *weight_op, h264_biweight_func *weight_avg,
2917
                           int list0, int list1){
2918
    if((h->use_weight==2 && list0 && list1
2919
        && (h->implicit_weight[ h->ref_cache[0][scan8[n]] ][ h->ref_cache[1][scan8[n]] ] != 32))
2920
       || h->use_weight==1)
2921
        mc_part_weighted(h, n, square, chroma_height, delta, dest_y, dest_cb, dest_cr,
2922
                         x_offset, y_offset, qpix_put, chroma_put,
2923
                         weight_op[0], weight_op[3], weight_avg[0], weight_avg[3], list0, list1);
2924
    else
2925
        mc_part_std(h, n, square, chroma_height, delta, dest_y, dest_cb, dest_cr,
2926
                    x_offset, y_offset, qpix_put, chroma_put, qpix_avg, chroma_avg, list0, list1);
2927
}
2928

    
2929
static inline void prefetch_motion(H264Context *h, int list){
    /* Prefetch reference pixels at the position predicted by the MV of
     * block 0, roughly 4 macroblocks ahead; tuned for 64-byte cache lines. */
    MpegEncContext * const s = &h->s;
    const int ref_idx = h->ref_cache[list][scan8[0]];
    int mx, my, off;
    uint8_t **planes;

    if(ref_idx < 0)
        return; /* no reference in this list for the current macroblock */

    mx = (h->mv_cache[list][scan8[0]][0]>>2) + 16*s->mb_x + 8;
    my = (h->mv_cache[list][scan8[0]][1]>>2) + 16*s->mb_y;
    planes = h->ref_list[list][ref_idx].data;

    /* luma: 4 lines */
    off = mx + (my + (s->mb_x&3)*4)*h->mb_linesize + 64;
    s->dsp.prefetch(planes[0]+off, s->linesize, 4);
    /* chroma: 2 lines, both planes via the cb->cr pointer-distance stride */
    off = (mx>>1) + ((my>>1) + (s->mb_x&7))*s->uvlinesize + 64;
    s->dsp.prefetch(planes[1]+off, planes[2]-planes[1], 2);
}
2944

    
2945
static void hl_motion(H264Context *h, uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
2946
                      qpel_mc_func (*qpix_put)[16], h264_chroma_mc_func (*chroma_put),
2947
                      qpel_mc_func (*qpix_avg)[16], h264_chroma_mc_func (*chroma_avg),
2948
                      h264_weight_func *weight_op, h264_biweight_func *weight_avg){
2949
    MpegEncContext * const s = &h->s;
2950
    const int mb_xy= s->mb_x + s->mb_y*s->mb_stride;
2951
    const int mb_type= s->current_picture.mb_type[mb_xy];
2952

    
2953
    assert(IS_INTER(mb_type));
2954

    
2955
    prefetch_motion(h, 0);
2956

    
2957
    if(IS_16X16(mb_type)){
2958
        mc_part(h, 0, 1, 8, 0, dest_y, dest_cb, dest_cr, 0, 0,
2959
                qpix_put[0], chroma_put[0], qpix_avg[0], chroma_avg[0],
2960
                &weight_op[0], &weight_avg[0],
2961
                IS_DIR(mb_type, 0, 0), IS_DIR(mb_type, 0, 1));
2962
    }else if(IS_16X8(mb_type)){
2963
        mc_part(h, 0, 0, 4, 8, dest_y, dest_cb, dest_cr, 0, 0,
2964
                qpix_put[1], chroma_put[0], qpix_avg[1], chroma_avg[0],
2965
                &weight_op[1], &weight_avg[1],
2966
                IS_DIR(mb_type, 0, 0), IS_DIR(mb_type, 0, 1));
2967
        mc_part(h, 8, 0, 4, 8, dest_y, dest_cb, dest_cr, 0, 4,
2968
                qpix_put[1], chroma_put[0], qpix_avg[1], chroma_avg[0],
2969
                &weight_op[1], &weight_avg[1],
2970
                IS_DIR(mb_type, 1, 0), IS_DIR(mb_type, 1, 1));
2971
    }else if(IS_8X16(mb_type)){
2972
        mc_part(h, 0, 0, 8, 8*h->mb_linesize, dest_y, dest_cb, dest_cr, 0, 0,
2973
                qpix_put[1], chroma_put[1], qpix_avg[1], chroma_avg[1],
2974
                &weight_op[2], &weight_avg[2],
2975
                IS_DIR(mb_type, 0, 0), IS_DIR(mb_type, 0, 1));
2976
        mc_part(h, 4, 0, 8, 8*h->mb_linesize, dest_y, dest_cb, dest_cr, 4, 0,
2977
                qpix_put[1], chroma_put[1], qpix_avg[1], chroma_avg[1],
2978
                &weight_op[2], &weight_avg[2],
2979
                IS_DIR(mb_type, 1, 0), IS_DIR(mb_type, 1, 1));
2980
    }else{
2981
        int i;
2982

    
2983
        assert(IS_8X8(mb_type));
2984

    
2985
        for(i=0; i<4; i++){
2986
            const int sub_mb_type= h->sub_mb_type[i];
2987
            const int n= 4*i;
2988
            int x_offset= (i&1)<<2;
2989
            int y_offset= (i&2)<<1;
2990

    
2991
            if(IS_SUB_8X8(sub_mb_type)){
2992
                mc_part(h, n, 1, 4, 0, dest_y, dest_cb, dest_cr, x_offset, y_offset,
2993
                    qpix_put[1], chroma_put[1], qpix_avg[1], chroma_avg[1],
2994
                    &weight_op[3], &weight_avg[3],
2995
                    IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
2996
            }else if(IS_SUB_8X4(sub_mb_type)){
2997
                mc_part(h, n  , 0, 2, 4, dest_y, dest_cb, dest_cr, x_offset, y_offset,
2998
                    qpix_put[2], chroma_put[1], qpix_avg[2], chroma_avg[1],
2999
                    &weight_op[4], &weight_avg[4],
3000
                    IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
3001
                mc_part(h, n+2, 0, 2, 4, dest_y, dest_cb, dest_cr, x_offset, y_offset+2,
3002
                    qpix_put[2], chroma_put[1], qpix_avg[2], chroma_avg[1],
3003
                    &weight_op[4], &weight_avg[4],
3004
                    IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
3005
            }else if(IS_SUB_4X8(sub_mb_type)){
3006
                mc_part(h, n  , 0, 4, 4*h->mb_linesize, dest_y, dest_cb, dest_cr, x_offset, y_offset,
3007
                    qpix_put[2], chroma_put[2], qpix_avg[2], chroma_avg[2],
3008
                    &weight_op[5], &weight_avg[5],
3009
                    IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
3010
                mc_part(h, n+1, 0, 4, 4*h->mb_linesize, dest_y, dest_cb, dest_cr, x_offset+2, y_offset,
3011
                    qpix_put[2], chroma_put[2], qpix_avg[2], chroma_avg[2],
3012
                    &weight_op[5], &weight_avg[5],
3013
                    IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
3014
            }else{
3015
                int j;
3016
                assert(IS_SUB_4X4(sub_mb_type));
3017
                for(j=0; j<4; j++){
3018
                    int sub_x_offset= x_offset + 2*(j&1);
3019
                    int sub_y_offset= y_offset +   (j&2);
3020
                    mc_part(h, n+j, 1, 2, 0, dest_y, dest_cb, dest_cr, sub_x_offset, sub_y_offset,
3021
                        qpix_put[2], chroma_put[2], qpix_avg[2], chroma_avg[2],
3022
                        &weight_op[6], &weight_avg[6],
3023
                        IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
3024
                }
3025
            }
3026
        }
3027
    }
3028

    
3029
    prefetch_motion(h, 1);
3030
}
3031

    
3032
static void decode_init_vlc(void){
3033
    static int done = 0;
3034

    
3035
    if (!done) {
3036
        int i;
3037
        done = 1;
3038

    
3039
        init_vlc(&chroma_dc_coeff_token_vlc, CHROMA_DC_COEFF_TOKEN_VLC_BITS, 4*5,
3040
                 &chroma_dc_coeff_token_len [0], 1, 1,
3041
                 &chroma_dc_coeff_token_bits[0], 1, 1, 1);
3042

    
3043
        for(i=0; i<4; i++){
3044
            init_vlc(&coeff_token_vlc[i], COEFF_TOKEN_VLC_BITS, 4*17,
3045
                     &coeff_token_len [i][0], 1, 1,
3046
                     &coeff_token_bits[i][0], 1, 1, 1);
3047
        }
3048

    
3049
        for(i=0; i<3; i++){
3050
            init_vlc(&chroma_dc_total_zeros_vlc[i], CHROMA_DC_TOTAL_ZEROS_VLC_BITS, 4,
3051
                     &chroma_dc_total_zeros_len [i][0], 1, 1,
3052
                     &chroma_dc_total_zeros_bits[i][0], 1, 1, 1);
3053
        }
3054
        for(i=0; i<15; i++){
3055
            init_vlc(&total_zeros_vlc[i], TOTAL_ZEROS_VLC_BITS, 16,
3056
                     &total_zeros_len [i][0], 1, 1,
3057
                     &total_zeros_bits[i][0], 1, 1, 1);
3058
        }
3059

    
3060
        for(i=0; i<6; i++){
3061
            init_vlc(&run_vlc[i], RUN_VLC_BITS, 7,
3062
                     &run_len [i][0], 1, 1,
3063
                     &run_bits[i][0], 1, 1, 1);
3064
        }
3065
        init_vlc(&run7_vlc, RUN7_VLC_BITS, 16,
3066
                 &run_len [6][0], 1, 1,
3067
                 &run_bits[6][0], 1, 1, 1);
3068
    }
3069
}
3070

    
3071
/**
3072
 * Sets the intra prediction function pointers.
3073
 */
3074
static void init_pred_ptrs(H264Context *h){
3075
//    MpegEncContext * const s = &h->s;
3076

    
3077
    h->pred4x4[VERT_PRED           ]= pred4x4_vertical_c;
3078
    h->pred4x4[HOR_PRED            ]= pred4x4_horizontal_c;
3079
    h->pred4x4[DC_PRED             ]= pred4x4_dc_c;
3080
    h->pred4x4[DIAG_DOWN_LEFT_PRED ]= pred4x4_down_left_c;
3081
    h->pred4x4[DIAG_DOWN_RIGHT_PRED]= pred4x4_down_right_c;
3082
    h->pred4x4[VERT_RIGHT_PRED     ]= pred4x4_vertical_right_c;
3083
    h->pred4x4[HOR_DOWN_PRED       ]= pred4x4_horizontal_down_c;
3084
    h->pred4x4[VERT_LEFT_PRED      ]= pred4x4_vertical_left_c;
3085
    h->pred4x4[HOR_UP_PRED         ]= pred4x4_horizontal_up_c;
3086
    h->pred4x4[LEFT_DC_PRED        ]= pred4x4_left_dc_c;
3087
    h->pred4x4[TOP_DC_PRED         ]= pred4x4_top_dc_c;
3088
    h->pred4x4[DC_128_PRED         ]= pred4x4_128_dc_c;
3089

    
3090
    h->pred8x8l[VERT_PRED           ]= pred8x8l_vertical_c;
3091
    h->pred8x8l[HOR_PRED            ]= pred8x8l_horizontal_c;
3092
    h->pred8x8l[DC_PRED             ]= pred8x8l_dc_c;
3093
    h->pred8x8l[DIAG_DOWN_LEFT_PRED ]= pred8x8l_down_left_c;
3094
    h->pred8x8l[DIAG_DOWN_RIGHT_PRED]= pred8x8l_down_right_c;
3095
    h->pred8x8l[VERT_RIGHT_PRED     ]= pred8x8l_vertical_right_c;
3096
    h->pred8x8l[HOR_DOWN_PRED       ]= pred8x8l_horizontal_down_c;
3097
    h->pred8x8l[VERT_LEFT_PRED      ]= pred8x8l_vertical_left_c;
3098
    h->pred8x8l[HOR_UP_PRED         ]= pred8x8l_horizontal_up_c;
3099
    h->pred8x8l[LEFT_DC_PRED        ]= pred8x8l_left_dc_c;
3100
    h->pred8x8l[TOP_DC_PRED         ]= pred8x8l_top_dc_c;
3101
    h->pred8x8l[DC_128_PRED         ]= pred8x8l_128_dc_c;
3102

    
3103
    h->pred8x8[DC_PRED8x8     ]= ff_pred8x8_dc_c;
3104
    h->pred8x8[VERT_PRED8x8   ]= ff_pred8x8_vertical_c;
3105
    h->pred8x8[HOR_PRED8x8    ]= ff_pred8x8_horizontal_c;
3106
    h->pred8x8[PLANE_PRED8x8  ]= ff_pred8x8_plane_c;
3107
    h->pred8x8[LEFT_DC_PRED8x8]= pred8x8_left_dc_c;
3108
    h->pred8x8[TOP_DC_PRED8x8 ]= pred8x8_top_dc_c;
3109
    h->pred8x8[DC_128_PRED8x8 ]= ff_pred8x8_128_dc_c;
3110

    
3111
    h->pred16x16[DC_PRED8x8     ]= ff_pred16x16_dc_c;
3112
    h->pred16x16[VERT_PRED8x8   ]= ff_pred16x16_vertical_c;
3113
    h->pred16x16[HOR_PRED8x8    ]= ff_pred16x16_horizontal_c;
3114
    h->pred16x16[PLANE_PRED8x8  ]= ff_pred16x16_plane_c;
3115
    h->pred16x16[LEFT_DC_PRED8x8]= pred16x16_left_dc_c;
3116
    h->pred16x16[TOP_DC_PRED8x8 ]= pred16x16_top_dc_c;
3117
    h->pred16x16[DC_128_PRED8x8 ]= ff_pred16x16_128_dc_c;
3118
}
3119

    
3120
static void free_tables(H264Context *h){
    /* Release every table allocated by alloc_tables(); av_freep() NULLs the
     * pointers, so this is safe to call on a partially allocated context. */
    av_freep(&h->intra4x4_pred_mode);
    av_freep(&h->chroma_pred_mode_table);
    av_freep(&h->cbp_table);
    av_freep(&h->mvd_table[0]);
    av_freep(&h->mvd_table[1]);
    av_freep(&h->direct_table);
    av_freep(&h->non_zero_count);
    av_freep(&h->slice_table_base);
    av_freep(&h->top_borders[1]);
    av_freep(&h->top_borders[0]);
    h->slice_table= NULL; /* pointed into slice_table_base, now dangling */

    /* macroblock -> motion-vector-grid index maps */
    av_freep(&h->mb2b_xy);
    av_freep(&h->mb2b8_xy);

    /* bipred scratch buffer, lazily allocated in frame_start() */
    av_freep(&h->s.obmc_scratchpad);
}
3138

    
3139
static void init_dequant8_coeff_table(H264Context *h){
    /* Precompute 8x8 dequantization coefficients for all 52 QP values and
     * both scaling matrices (intra/inter). If the two matrices are equal,
     * the second table aliases the first instead of being recomputed. */
    int mtx, qp, coef;
    const int transpose = (h->s.dsp.h264_idct8_add != ff_h264_idct8_add_c); //FIXME ugly

    h->dequant8_coeff[0] = h->dequant8_buffer[0];
    h->dequant8_coeff[1] = h->dequant8_buffer[1];

    for(mtx=0; mtx<2; mtx++ ){
        if(mtx && !memcmp(h->pps.scaling_matrix8[0], h->pps.scaling_matrix8[1], 64*sizeof(uint8_t))){
            h->dequant8_coeff[1] = h->dequant8_buffer[0]; /* identical -> share table */
            break;
        }

        for(qp=0; qp<52; qp++){
            int shift = ff_div6[qp];
            int idx   = ff_rem6[qp];
            for(coef=0; coef<64; coef++)
                h->dequant8_coeff[mtx][qp][transpose ? (coef>>3)|((coef&7)<<3) : coef] =
                    ((uint32_t)dequant8_coeff_init[idx][ dequant8_coeff_init_scan[((coef>>1)&12) | (coef&3)] ] *
                    h->pps.scaling_matrix8[mtx][coef]) << shift;
        }
    }
}
3161

    
3162
static void init_dequant4_coeff_table(H264Context *h){
    /* Precompute 4x4 dequantization coefficients for all 52 QP values and
     * the six scaling matrices; a matrix equal to an earlier one shares
     * that earlier table instead of being recomputed. */
    int mtx, prev, qp, coef;
    const int transpose = (h->s.dsp.h264_idct_add != ff_h264_idct_add_c); //FIXME ugly

    for(mtx=0; mtx<6; mtx++ ){
        h->dequant4_coeff[mtx] = h->dequant4_buffer[mtx];
        for(prev=0; prev<mtx; prev++){
            if(!memcmp(h->pps.scaling_matrix4[prev], h->pps.scaling_matrix4[mtx], 16*sizeof(uint8_t))){
                h->dequant4_coeff[mtx] = h->dequant4_buffer[prev]; /* duplicate matrix */
                break;
            }
        }
        if(prev<mtx)
            continue; /* shared an earlier table, nothing to compute */

        for(qp=0; qp<52; qp++){
            int shift = ff_div6[qp] + 2;
            int idx   = ff_rem6[qp];
            for(coef=0; coef<16; coef++)
                h->dequant4_coeff[mtx][qp][transpose ? (coef>>2)|((coef<<2)&0xF) : coef] =
                    ((uint32_t)dequant4_coeff_init[idx][(coef&1) + ((coef>>2)&1)] *
                    h->pps.scaling_matrix4[mtx][coef]) << shift;
        }
    }
}
3186

    
3187
static void init_dequant_tables(H264Context *h){
    /* Build all dequant tables; with lossless transform bypass the QP==0
     * entries are forced to the identity scale (1<<6). */
    int mtx, coef;

    init_dequant4_coeff_table(h);
    if(h->pps.transform_8x8_mode)
        init_dequant8_coeff_table(h);

    if(h->sps.transform_bypass){
        for(mtx=0; mtx<6; mtx++)
            for(coef=0; coef<16; coef++)
                h->dequant4_coeff[mtx][0][coef] = 1<<6;
        if(h->pps.transform_8x8_mode)
            for(mtx=0; mtx<2; mtx++)
                for(coef=0; coef<64; coef++)
                    h->dequant8_coeff[mtx][0][coef] = 1<<6;
    }
}
3202

    
3203

    
3204
/**
3205
 * allocates tables.
3206
 * needs width/height
3207
 */
3208
static int alloc_tables(H264Context *h){
3209
    MpegEncContext * const s = &h->s;
3210
    const int big_mb_num= s->mb_stride * (s->mb_height+1);
3211
    int x,y;
3212

    
3213
    CHECKED_ALLOCZ(h->intra4x4_pred_mode, big_mb_num * 8  * sizeof(uint8_t))
3214

    
3215
    CHECKED_ALLOCZ(h->non_zero_count    , big_mb_num * 16 * sizeof(uint8_t))
3216
    CHECKED_ALLOCZ(h->slice_table_base  , (big_mb_num+s->mb_stride) * sizeof(uint8_t))
3217
    CHECKED_ALLOCZ(h->top_borders[0]    , s->mb_width * (16+8+8) * sizeof(uint8_t))
3218
    CHECKED_ALLOCZ(h->top_borders[1]    , s->mb_width * (16+8+8) * sizeof(uint8_t))
3219
    CHECKED_ALLOCZ(h->cbp_table, big_mb_num * sizeof(uint16_t))
3220

    
3221
    if( h->pps.cabac ) {
3222
        CHECKED_ALLOCZ(h->chroma_pred_mode_table, big_mb_num * sizeof(uint8_t))
3223
        CHECKED_ALLOCZ(h->mvd_table[0], 32*big_mb_num * sizeof(uint16_t));
3224
        CHECKED_ALLOCZ(h->mvd_table[1], 32*big_mb_num * sizeof(uint16_t));
3225
        CHECKED_ALLOCZ(h->direct_table, 32*big_mb_num * sizeof(uint8_t));
3226
    }
3227

    
3228
    memset(h->slice_table_base, -1, (big_mb_num+s->mb_stride)  * sizeof(uint8_t));
3229
    h->slice_table= h->slice_table_base + s->mb_stride*2 + 1;
3230

    
3231
    CHECKED_ALLOCZ(h->mb2b_xy  , big_mb_num * sizeof(uint32_t));
3232
    CHECKED_ALLOCZ(h->mb2b8_xy , big_mb_num * sizeof(uint32_t));
3233
    for(y=0; y<s->mb_height; y++){
3234
        for(x=0; x<s->mb_width; x++){
3235
            const int mb_xy= x + y*s->mb_stride;
3236
            const int b_xy = 4*x + 4*y*h->b_stride;
3237
            const int b8_xy= 2*x + 2*y*h->b8_stride;
3238

    
3239
            h->mb2b_xy [mb_xy]= b_xy;
3240
            h->mb2b8_xy[mb_xy]= b8_xy;
3241
        }
3242
    }
3243

    
3244
    s->obmc_scratchpad = NULL;
3245

    
3246
    if(!h->dequant4_coeff[0])
3247
        init_dequant_tables(h);
3248

    
3249
    return 0;
3250
fail:
3251
    free_tables(h);
3252
    return -1;
3253
}
3254

    
3255
static void common_init(H264Context *h){
    /* Initialization shared by decoder (and encoder) paths: copies the
     * frame geometry from the AVCodecContext and sets flat default
     * scaling matrices (all 16 == no scaling). */
    MpegEncContext * const s = &h->s;

    s->width  = s->avctx->width;
    s->height = s->avctx->height;
    s->codec_id= s->avctx->codec->id;

    init_pred_ptrs(h);

    h->dequant_coeff_pps= -1; /* no PPS cached for the dequant tables yet */
    s->unrestricted_mv=1;
    s->decode=1; //FIXME

    memset(h->pps.scaling_matrix4, 16, 6*16*sizeof(uint8_t));
    memset(h->pps.scaling_matrix8, 16, 2*64*sizeof(uint8_t));
}
3271

    
3272
static int decode_init(AVCodecContext *avctx){
    /* AVCodec.init callback: set up the H.264 decoder context. */
    H264Context *h= avctx->priv_data;
    MpegEncContext * const s = &h->s;

    MPV_decode_defaults(s);

    s->avctx = avctx;
    common_init(h);

    s->out_format = FMT_H264;
    s->workaround_bugs= avctx->workaround_bugs;

    // set defaults
//    s->decode_mb= ff_h263_decode_mb;
    s->low_delay= 1;
    avctx->pix_fmt= PIX_FMT_YUV420P;

    decode_init_vlc();

    /* An extradata blob starting with the byte 1 is an avcC configuration
     * record (AVC-in-MP4 style) rather than raw Annex-B NALs. */
    h->is_avc = (avctx->extradata_size > 0 && avctx->extradata &&
                 *(char *)avctx->extradata == 1);
    if(h->is_avc)
        h->got_avcC = 0; /* avcC not parsed yet */

    return 0;
}
3301

    
3302
static int frame_start(H264Context *h){
    /* Per-frame setup: start the MPV frame, precompute block offsets for
     * frame and field (doubled-stride) layouts, and lazily allocate the
     * bipred scratch buffer.
     * Returns 0 on success, -1 if MPV_frame_start() fails. */
    MpegEncContext * const s = &h->s;
    int i;

    if(MPV_frame_start(s, s->avctx) < 0)
        return -1;
    ff_er_frame_start(s);

    assert(s->linesize && s->uvlinesize);

    /* offsets 0..23 use the frame stride, 24..47 the doubled field stride */
    for(i=0; i<16; i++){
        const int d= scan8[i] - scan8[0];
        h->block_offset[i]   = 4*(d&7) + 4*s->linesize*(d>>3);
        h->block_offset[24+i]= 4*(d&7) + 8*s->linesize*(d>>3);
    }
    for(i=0; i<4; i++){
        const int d= scan8[i] - scan8[0];
        h->block_offset[16+i]=
        h->block_offset[20+i]= 4*(d&7) + 4*s->uvlinesize*(d>>3);
        h->block_offset[24+16+i]=
        h->block_offset[24+20+i]= 4*(d&7) + 8*s->uvlinesize*(d>>3);
    }

    /* can't be in alloc_tables because linesize isn't known there.
     * FIXME: redo bipred weight to not require extra buffer? */
    if(!s->obmc_scratchpad)
        s->obmc_scratchpad = av_malloc(16*2*s->linesize + 8*2*s->uvlinesize);

    /* some macroblocks will be accessed before they're available */
    if(FRAME_MBAFF)
        memset(h->slice_table, -1, (s->mb_height*s->mb_stride-1) * sizeof(uint8_t));

//    s->decode= (s->flags&CODEC_FLAG_PSNR) || !s->encoding || s->current_picture.reference /*|| h->contains_intra*/ || 1;
    return 0;
}
3335

    
3336
static inline void backup_mb_border(H264Context *h, uint8_t *src_y, uint8_t *src_cb, uint8_t *src_cr, int linesize, int uvlinesize){
    /* Save the right column and bottom row of the just-decoded macroblock
     * so the deblocking filter of the neighbours can still read the
     * unfiltered pixels. */
    MpegEncContext * const s = &h->s;
    uint8_t *top = h->top_borders[0][s->mb_x];
    int i;

    src_y  -=   linesize;
    src_cb -= uvlinesize;
    src_cr -= uvlinesize;

    /* left border: pixel above-right plus the MB's rightmost luma column */
    h->left_border[0]= top[15];
    for(i=1; i<17; i++){
        h->left_border[i]= src_y[15+i*  linesize];
    }

    /* top border: the MB's bottom luma row, 8 bytes at a time */
    *(uint64_t*)(top+0)= *(uint64_t*)(src_y +  16*linesize);
    *(uint64_t*)(top+8)= *(uint64_t*)(src_y +8+16*linesize);

    if(!(s->flags&CODEC_FLAG_GRAY)){
        /* same for both chroma planes (stored at offsets 16 and 24) */
        h->left_border[17  ]= top[16+7];
        h->left_border[17+9]= top[24+7];
        for(i=1; i<9; i++){
            h->left_border[i+17  ]= src_cb[7+i*uvlinesize];
            h->left_border[i+17+9]= src_cr[7+i*uvlinesize];
        }
        *(uint64_t*)(top+16)= *(uint64_t*)(src_cb+8*uvlinesize);
        *(uint64_t*)(top+24)= *(uint64_t*)(src_cr+8*uvlinesize);
    }
}
3365

    
3366
static inline void xchg_mb_border(H264Context *h, uint8_t *src_y, uint8_t *src_cb, uint8_t *src_cr, int linesize, int uvlinesize, int xchg){
    /* Swap (xchg!=0) or restore (xchg==0) the saved unfiltered border pixels
     * around the current macroblock, so intra prediction sees unfiltered
     * neighbours while the picture keeps the filtered ones. */
    MpegEncContext * const s = &h->s;
    int temp8, i;
    uint64_t temp64;
    int deblock_left = (s->mb_x > 0);
    int deblock_top  = (s->mb_y > 0);

    /* step back to the top-left neighbour pixel */
    src_y  -=   linesize + 1;
    src_cb -= uvlinesize + 1;
    src_cr -= uvlinesize + 1;

/* conditional swap: 'b' always receives the old 'a' (restore direction) */
#define XCHG(a,b,t,xchg)\
t= a;\
if(xchg)\
    a= b;\
b= t;

    if(deblock_left){
        for(i = !deblock_top; i<17; i++){
            XCHG(h->left_border[i     ], src_y [i*  linesize], temp8, xchg);
        }
    }

    if(deblock_top){
        XCHG(*(uint64_t*)(h->top_borders[0][s->mb_x]+0), *(uint64_t*)(src_y +1), temp64, xchg);
        XCHG(*(uint64_t*)(h->top_borders[0][s->mb_x]+8), *(uint64_t*)(src_y +9), temp64, 1);
        if(s->mb_x+1 < s->mb_width){
            XCHG(*(uint64_t*)(h->top_borders[0][s->mb_x+1]), *(uint64_t*)(src_y +17), temp64, 1);
        }
    }

    if(!(s->flags&CODEC_FLAG_GRAY)){
        if(deblock_left){
            for(i = !deblock_top; i<9; i++){
                XCHG(h->left_border[i+17  ], src_cb[i*uvlinesize], temp8, xchg);
                XCHG(h->left_border[i+17+9], src_cr[i*uvlinesize], temp8, xchg);
            }
        }
        if(deblock_top){
            XCHG(*(uint64_t*)(h->top_borders[0][s->mb_x]+16), *(uint64_t*)(src_cb+1), temp64, 1);
            XCHG(*(uint64_t*)(h->top_borders[0][s->mb_x]+24), *(uint64_t*)(src_cr+1), temp64, 1);
        }
    }
}
3410

    
3411
static inline void backup_pair_border(H264Context *h, uint8_t *src_y, uint8_t *src_cb, uint8_t *src_cr, int linesize, int uvlinesize){
    /* MBAFF variant of backup_mb_border(): saves borders for a whole
     * macroblock pair. */
    MpegEncContext * const s = &h->s;
    uint8_t *top0 = h->top_borders[0][s->mb_x];
    uint8_t *top1 = h->top_borders[1][s->mb_x];
    int i;

    src_y  -= 2 *   linesize;
    src_cb -= 2 * uvlinesize;
    src_cr -= 2 * uvlinesize;

    // There are two lines saved, the line above the top macroblock of a pair,
    // and the line above the bottom macroblock
    h->left_border[0]= top0[15];
    h->left_border[1]= top1[15];
    for(i=2; i<34; i++){
        h->left_border[i]= src_y[15+i*  linesize];
    }

    /* two bottom luma rows of the pair, 8 bytes at a time */
    *(uint64_t*)(top0+0)= *(uint64_t*)(src_y +  32*linesize);
    *(uint64_t*)(top0+8)= *(uint64_t*)(src_y +8+32*linesize);
    *(uint64_t*)(top1+0)= *(uint64_t*)(src_y +  33*linesize);
    *(uint64_t*)(top1+8)= *(uint64_t*)(src_y +8+33*linesize);

    if(!(s->flags&CODEC_FLAG_GRAY)){
        /* chroma planes live at offsets 16 (cb) and 24 (cr) */
        h->left_border[34     ]= top0[16+7];
        h->left_border[34+   1]= top1[16+7];
        h->left_border[34+18  ]= top0[24+7];
        h->left_border[34+18+1]= top1[24+7];
        for(i=2; i<18; i++){
            h->left_border[i+34   ]= src_cb[7+i*uvlinesize];
            h->left_border[i+34+18]= src_cr[7+i*uvlinesize];
        }
        *(uint64_t*)(top0+16)= *(uint64_t*)(src_cb+16*uvlinesize);
        *(uint64_t*)(top0+24)= *(uint64_t*)(src_cr+16*uvlinesize);
        *(uint64_t*)(top1+16)= *(uint64_t*)(src_cb+17*uvlinesize);
        *(uint64_t*)(top1+24)= *(uint64_t*)(src_cr+17*uvlinesize);
    }
}
3447

    
3448
static inline void xchg_pair_border(H264Context *h, uint8_t *src_y, uint8_t *src_cb, uint8_t *src_cr, int linesize, int uvlinesize, int xchg){
    /* MBAFF variant of xchg_mb_border(): swaps/restores the saved borders
     * for a whole macroblock pair (two saved top lines per plane). */
    MpegEncContext * const s = &h->s;
    int temp8, i;
    uint64_t temp64;
    int deblock_left = (s->mb_x > 0);
    int deblock_top  = (s->mb_y > 1);

    tprintf(s->avctx, "xchg_pair_border: src_y:%p src_cb:%p src_cr:%p ls:%d uvls:%d\n", src_y, src_cb, src_cr, linesize, uvlinesize);

    src_y  -= 2 *   linesize + 1;
    src_cb -= 2 * uvlinesize + 1;
    src_cr -= 2 * uvlinesize + 1;

/* conditional swap; identical to the definition in xchg_mb_border() */
#define XCHG(a,b,t,xchg)\
t= a;\
if(xchg)\
    a= b;\
b= t;

    if(deblock_left){
        for(i = (!deblock_top)<<1; i<34; i++){
            XCHG(h->left_border[i     ], src_y [i*  linesize], temp8, xchg);
        }
    }

    if(deblock_top){
        XCHG(*(uint64_t*)(h->top_borders[0][s->mb_x]+0), *(uint64_t*)(src_y +1), temp64, xchg);
        XCHG(*(uint64_t*)(h->top_borders[0][s->mb_x]+8), *(uint64_t*)(src_y +9), temp64, 1);
        XCHG(*(uint64_t*)(h->top_borders[1][s->mb_x]+0), *(uint64_t*)(src_y +1 +linesize), temp64, xchg);
        XCHG(*(uint64_t*)(h->top_borders[1][s->mb_x]+8), *(uint64_t*)(src_y +9 +linesize), temp64, 1);
        if(s->mb_x+1 < s->mb_width){
            XCHG(*(uint64_t*)(h->top_borders[0][s->mb_x+1]), *(uint64_t*)(src_y +17), temp64, 1);
            XCHG(*(uint64_t*)(h->top_borders[1][s->mb_x+1]), *(uint64_t*)(src_y +17 +linesize), temp64, 1);
        }
    }

    if(!(s->flags&CODEC_FLAG_GRAY)){
        if(deblock_left){
            for(i = (!deblock_top) << 1; i<18; i++){
                XCHG(h->left_border[i+34   ], src_cb[i*uvlinesize], temp8, xchg);
                XCHG(h->left_border[i+34+18], src_cr[i*uvlinesize], temp8, xchg);
            }
        }
        if(deblock_top){
            XCHG(*(uint64_t*)(h->top_borders[0][s->mb_x]+16), *(uint64_t*)(src_cb+1), temp64, 1);
            XCHG(*(uint64_t*)(h->top_borders[0][s->mb_x]+24), *(uint64_t*)(src_cr+1), temp64, 1);
            XCHG(*(uint64_t*)(h->top_borders[1][s->mb_x]+16), *(uint64_t*)(src_cb+1 +uvlinesize), temp64, 1);
            XCHG(*(uint64_t*)(h->top_borders[1][s->mb_x]+24), *(uint64_t*)(src_cr+1 +uvlinesize), temp64, 1);
        }
    }
}
3499

    
3500
static void av_always_inline hl_decode_mb_internal(H264Context *h, int simple){
3501
    MpegEncContext * const s = &h->s;
3502
    const int mb_x= s->mb_x;
3503
    const int mb_y= s->mb_y;
3504
    const int mb_xy= mb_x + mb_y*s->mb_stride;
3505
    const int mb_type= s->current_picture.mb_type[mb_xy];
3506
    uint8_t  *dest_y, *dest_cb, *dest_cr;
3507
    int linesize, uvlinesize /*dct_offset*/;
3508
    int i;
3509
    int *block_offset = &h->block_offset[0];
3510
    const unsigned int bottom = mb_y & 1;
3511
    const int transform_bypass = (s->qscale == 0 && h->sps.transform_bypass), is_h264 = (simple || s->codec_id == CODEC_ID_H264);
3512
    void (*idct_add)(uint8_t *dst, DCTELEM *block, int stride);
3513
    void (*idct_dc_add)(uint8_t *dst, DCTELEM *block, int stride);
3514

    
3515
    dest_y  = s->current_picture.data[0] + (mb_y * 16* s->linesize  ) + mb_x * 16;
3516
    dest_cb = s->current_picture.data[1] + (mb_y * 8 * s->uvlinesize) + mb_x * 8;
3517
    dest_cr = s->current_picture.data[2] + (mb_y * 8 * s->uvlinesize) + mb_x * 8;
3518

    
3519
    s->dsp.prefetch(dest_y + (s->mb_x&3)*4*s->linesize + 64, s->linesize, 4);
3520
    s->dsp.prefetch(dest_cb + (s->mb_x&7)*s->uvlinesize + 64, dest_cr - dest_cb, 2);
3521

    
3522
    if (!simple && MB_FIELD) {
3523
        linesize   = h->mb_linesize   = s->linesize * 2;
3524
        uvlinesize = h->mb_uvlinesize = s->uvlinesize * 2;
3525
        block_offset = &h->block_offset[24];
3526
        if(mb_y&1){ //FIXME move out of this func?
3527
            dest_y -= s->linesize*15;
3528
            dest_cb-= s->uvlinesize*7;
3529
            dest_cr-= s->uvlinesize*7;
3530
        }
3531
        if(FRAME_MBAFF) {
3532
            int list;
3533
            for(list=0; list<h->list_count; list++){
3534
                if(!USES_LIST(mb_type, list))
3535
                    continue;
3536
                if(IS_16X16(mb_type)){
3537
                    int8_t *ref = &h->ref_cache[list][scan8[0]];
3538
                    fill_rectangle(ref, 4, 4, 8, 16+*ref^(s->mb_y&1), 1);
3539
                }else{
3540
                    for(i=0; i<16; i+=4){
3541
                        //FIXME can refs be smaller than 8x8 when !direct_8x8_inference ?
3542
                        int ref = h->ref_cache[list][scan8[i]];
3543
                        if(ref >= 0)
3544
                            fill_rectangle(&h->ref_cache[list][scan8[i]], 2, 2, 8, 16+ref^(s->mb_y&1), 1);
3545
                    }
3546
                }
3547
            }
3548
        }
3549
    } else {
3550
        linesize   = h->mb_linesize   = s->linesize;
3551
        uvlinesize = h->mb_uvlinesize = s->uvlinesize;
3552
//        dct_offset = s->linesize * 16;
3553
    }
3554

    
3555
    if(transform_bypass){
3556
        idct_dc_add =
3557
        idct_add = IS_8x8DCT(mb_type) ? s->dsp.add_pixels8 : s->dsp.add_pixels4;
3558
    }else if(IS_8x8DCT(mb_type)){
3559
        idct_dc_add = s->dsp.h264_idct8_dc_add;
3560
        idct_add = s->dsp.h264_idct8_add;
3561
    }else{
3562
        idct_dc_add = s->dsp.h264_idct_dc_add;
3563
        idct_add = s->dsp.h264_idct_add;
3564
    }
3565

    
3566
    if(!simple && FRAME_MBAFF && h->deblocking_filter && IS_INTRA(mb_type)
3567
       && (!bottom || !IS_INTRA(s->current_picture.mb_type[mb_xy-s->mb_stride]))){
3568
        int mbt_y = mb_y&~1;
3569
        uint8_t *top_y  = s->current_picture.data[0] + (mbt_y * 16* s->linesize  ) + mb_x * 16;
3570
        uint8_t *top_cb = s->current_picture.data[1] + (mbt_y * 8 * s->uvlinesize) + mb_x * 8;
3571
        uint8_t *top_cr = s->current_picture.data[2] + (mbt_y * 8 * s->uvlinesize) + mb_x * 8;
3572
        xchg_pair_border(h, top_y, top_cb, top_cr, s->linesize, s->uvlinesize, 1);
3573
    }
3574

    
3575
    if (!simple && IS_INTRA_PCM(mb_type)) {
3576
        unsigned int x, y;
3577

    
3578
        // The pixels are stored in h->mb array in the same order as levels,
3579
        // copy them in output in the correct order.
3580
        for(i=0; i<16; i++) {
3581
            for (y=0; y<4; y++) {
3582
                for (x=0; x<4; x++) {
3583
                    *(dest_y + block_offset[i] + y*linesize + x) = h->mb[i*16+y*4+x];
3584
                }
3585
            }
3586
        }
3587
        for(i=16; i<16+4; i++) {
3588
            for (y=0; y<4; y++) {
3589
                for (x=0; x<4; x++) {
3590
                    *(dest_cb + block_offset[i] + y*uvlinesize + x) = h->mb[i*16+y*4+x];
3591
                }
3592
            }
3593
        }
3594
        for(i=20; i<20+4; i++) {
3595
            for (y=0; y<4; y++) {
3596
                for (x=0; x<4; x++) {
3597
                    *(dest_cr + block_offset[i] + y*uvlinesize + x) = h->mb[i*16+y*4+x];
3598
                }
3599
            }
3600
        }
3601
    } else {
3602
        if(IS_INTRA(mb_type)){
3603
            if(h->deblocking_filter && (simple || !FRAME_MBAFF))
3604
                xchg_mb_border(h, dest_y, dest_cb, dest_cr, linesize, uvlinesize, 1);
3605

    
3606
            if(simple || !(s->flags&CODEC_FLAG_GRAY)){
3607
                h->pred8x8[ h->chroma_pred_mode ](dest_cb, uvlinesize);
3608
                h->pred8x8[ h->chroma_pred_mode ](dest_cr, uvlinesize);
3609
            }
3610

    
3611
            if(IS_INTRA4x4(mb_type)){
3612
                if(simple || !s->encoding){
3613
                    if(IS_8x8DCT(mb_type)){
3614
                        for(i=0; i<16; i+=4){
3615
                            uint8_t * const ptr= dest_y + block_offset[i];
3616
                            const int dir= h->intra4x4_pred_mode_cache[ scan8[i] ];
3617
                            const int nnz = h->non_zero_count_cache[ scan8[i] ];
3618
                            h->pred8x8l[ dir ](ptr, (h->topleft_samples_available<<i)&0x8000,
3619
                                                   (h->topright_samples_available<<i)&0x4000, linesize);
3620
                            if(nnz){
3621
                                if(nnz == 1 && h->mb[i*16])
3622
                                    idct_dc_add(ptr, h->mb + i*16, linesize);
3623
                                else
3624
                                    idct_add(ptr, h->mb + i*16, linesize);
3625
                            }
3626
                        }
3627
                    }else
3628
                    for(i=0; i<16; i++){
3629
                        uint8_t * const ptr= dest_y + block_offset[i];
3630
                        uint8_t *topright;
3631
                        const int dir= h->intra4x4_pred_mode_cache[ scan8[i] ];
3632
                        int nnz, tr;
3633

    
3634
                        if(dir == DIAG_DOWN_LEFT_PRED || dir == VERT_LEFT_PRED){
3635
                            const int topright_avail= (h->topright_samples_available<<i)&0x8000;
3636
                            assert(mb_y || linesize <= block_offset[i]);
3637
                            if(!topright_avail){
3638
                                tr= ptr[3 - linesize]*0x01010101;
3639
                                topright= (uint8_t*) &tr;
3640
                            }else
3641
                                topright= ptr + 4 - linesize;
3642
                        }else
3643
                            topright= NULL;
3644

    
3645
                        h->pred4x4[ dir ](ptr, topright, linesize);
3646
                        nnz = h->non_zero_count_cache[ scan8[i] ];
3647
                        if(nnz){
3648
                            if(is_h264){
3649
                                if(nnz == 1 && h->mb[i*16])
3650
                                    idct_dc_add(ptr, h->mb + i*16, linesize);
3651
                                else
3652
                                    idct_add(ptr, h->mb + i*16, linesize);
3653
                            }else
3654
                                svq3_add_idct_c(ptr, h->mb + i*16, linesize, s->qscale, 0);
3655
                        }
3656
                    }
3657
                }
3658
            }else{
3659
                h->pred16x16[ h->intra16x16_pred_mode ](dest_y , linesize);
3660
                if(is_h264){
3661
                    if(!transform_bypass)
3662
                        h264_luma_dc_dequant_idct_c(h->mb, s->qscale, h->dequant4_coeff[IS_INTRA(mb_type) ? 0:3][s->qscale][0]);
3663
                }else
3664
                    svq3_luma_dc_dequant_idct_c(h->mb, s->qscale);
3665
            }
3666
            if(h->deblocking_filter && (simple || !FRAME_MBAFF))
3667
                xchg_mb_border(h, dest_y, dest_cb, dest_cr, linesize, uvlinesize, 0);
3668
        }else if(is_h264){
3669
            hl_motion(h, dest_y, dest_cb, dest_cr,
3670
                      s->me.qpel_put, s->dsp.put_h264_chroma_pixels_tab,
3671
                      s->me.qpel_avg, s->dsp.avg_h264_chroma_pixels_tab,
3672
                      s->dsp.weight_h264_pixels_tab, s->dsp.biweight_h264_pixels_tab);
3673
        }
3674

    
3675

    
3676
        if(!IS_INTRA4x4(mb_type)){
3677
            if(is_h264){
3678
                if(IS_INTRA16x16(mb_type)){
3679
                    for(i=0; i<16; i++){
3680
                        if(h->non_zero_count_cache[ scan8[i] ])
3681
                            idct_add(dest_y + block_offset[i], h->mb + i*16, linesize);
3682
                        else if(h->mb[i*16])
3683
                            idct_dc_add(dest_y + block_offset[i], h->mb + i*16, linesize);
3684
                    }
3685
                }else{
3686
                    const int di = IS_8x8DCT(mb_type) ? 4 : 1;
3687
                    for(i=0; i<16; i+=di){
3688
                        int nnz = h->non_zero_count_cache[ scan8[i] ];
3689
                        if(nnz){
3690
                            if(nnz==1 && h->mb[i*16])
3691
                                idct_dc_add(dest_y + block_offset[i], h->mb + i*16, linesize);
3692
                            else
3693
                                idct_add(dest_y + block_offset[i], h->mb + i*16, linesize);
3694
                        }
3695
                    }
3696
                }
3697
            }else{
3698
                for(i=0; i<16; i++){
3699
                    if(h->non_zero_count_cache[ scan8[i] ] || h->mb[i*16]){ //FIXME benchmark weird rule, & below
3700
                        uint8_t * const ptr= dest_y + block_offset[i];
3701
                        svq3_add_idct_c(ptr, h->mb + i*16, linesize, s->qscale, IS_INTRA(mb_type) ? 1 : 0);
3702
                    }
3703
                }
3704
            }
3705
        }
3706

    
3707
        if(simple || !(s->flags&CODEC_FLAG_GRAY)){
3708
            uint8_t *dest[2] = {dest_cb, dest_cr};
3709
            if(transform_bypass){
3710
                idct_add = idct_dc_add = s->dsp.add_pixels4;
3711
            }else{
3712
                idct_add = s->dsp.h264_idct_add;
3713
                idct_dc_add = s->dsp.h264_idct_dc_add;
3714
                chroma_dc_dequant_idct_c(h->mb + 16*16, h->chroma_qp, h->dequant4_coeff[IS_INTRA(mb_type) ? 1:4][h->chroma_qp][0]);
3715
                chroma_dc_dequant_idct_c(h->mb + 16*16+4*16, h->chroma_qp, h->dequant4_coeff[IS_INTRA(mb_type) ? 2:5][h->chroma_qp][0]);
3716
            }
3717
            if(is_h264){
3718
                for(i=16; i<16+8; i++){
3719
                    if(h->non_zero_count_cache[ scan8[i] ])
3720
                        idct_add(dest[(i&4)>>2] + block_offset[i], h->mb + i*16, uvlinesize);
3721
                    else if(h->mb[i*16])
3722
                        idct_dc_add(dest[(i&4)>>2] + block_offset[i], h->mb + i*16, uvlinesize);
3723
                }
3724
            }else{
3725
                for(i=16; i<16+8; i++){
3726
                    if(h->non_zero_count_cache[ scan8[i] ] || h->mb[i*16]){
3727
                        uint8_t * const ptr= dest[(i&4)>>2] + block_offset[i];
3728
                        svq3_add_idct_c(ptr, h->mb + i*16, uvlinesize, chroma_qp[s->qscale + 12] - 12, 2);
3729
                    }
3730
                }
3731
            }
3732
        }
3733
    }
3734
    if(h->deblocking_filter) {
3735
        if (!simple && FRAME_MBAFF) {
3736
            //FIXME try deblocking one mb at a time?
3737
            // the reduction in load/storing mvs and such might outweigh the extra backup/xchg_border
3738
            const int mb_y = s->mb_y - 1;
3739
            uint8_t  *pair_dest_y, *pair_dest_cb, *pair_dest_cr;
3740
            const int mb_xy= mb_x + mb_y*s->mb_stride;
3741
            const int mb_type_top   = s->current_picture.mb_type[mb_xy];
3742
            const int mb_type_bottom= s->current_picture.mb_type[mb_xy+s->mb_stride];
3743
            if (!bottom) return;
3744
            pair_dest_y  = s->current_picture.data[0] + (mb_y * 16* s->linesize  ) + mb_x * 16;
3745
            pair_dest_cb = s->current_picture.data[1] + (mb_y * 8 * s->uvlinesize) + mb_x * 8;
3746
            pair_dest_cr = s->current_picture.data[2] + (mb_y * 8 * s->uvlinesize) + mb_x * 8;
3747

    
3748
            if(IS_INTRA(mb_type_top | mb_type_bottom))
3749
                xchg_pair_border(h, pair_dest_y, pair_dest_cb, pair_dest_cr, s->linesize, s->uvlinesize, 0);
3750

    
3751
            backup_pair_border(h, pair_dest_y, pair_dest_cb, pair_dest_cr, s->linesize, s->uvlinesize);
3752
            // deblock a pair
3753
            // top
3754
            s->mb_y--;
3755
            tprintf(h->s.avctx, "call mbaff filter_mb mb_x:%d mb_y:%d pair_dest_y = %p, dest_y = %p\n", mb_x, mb_y, pair_dest_y, dest_y);
3756
            fill_caches(h, mb_type_top, 1); //FIXME don't fill stuff which isn't used by filter_mb
3757
            h->chroma_qp = get_chroma_qp(h->pps.chroma_qp_index_offset, s->current_picture.qscale_table[mb_xy]);
3758
            filter_mb(h, mb_x, mb_y, pair_dest_y, pair_dest_cb, pair_dest_cr, linesize, uvlinesize);
3759
            // bottom
3760
            s->mb_y++;
3761
            tprintf(h->s.avctx, "call mbaff filter_mb\n");
3762
            fill_caches(h, mb_type_bottom, 1); //FIXME don't fill stuff which isn't used by filter_mb
3763
            h->chroma_qp = get_chroma_qp(h->pps.chroma_qp_index_offset, s->current_picture.qscale_table[mb_xy+s->mb_stride]);
3764
            filter_mb(h, mb_x, mb_y+1, dest_y, dest_cb, dest_cr, linesize, uvlinesize);
3765
        } else {
3766
            tprintf(h->s.avctx, "call filter_mb\n");
3767
            backup_mb_border(h, dest_y, dest_cb, dest_cr, linesize, uvlinesize);
3768
            fill_caches(h, mb_type, 1); //FIXME don't fill stuff which isn't used by filter_mb
3769
            filter_mb_fast(h, mb_x, mb_y, dest_y, dest_cb, dest_cr, linesize, uvlinesize);
3770
        }
3771
    }
3772
}
3773

    
3774
/**
 * Process a macroblock; this case avoids checks for expensive uncommon cases.
 *
 * simple=1 is a compile-time constant here, so the "!simple" branches in
 * hl_decode_mb_internal (MBAFF, field decoding, intra PCM, gray-only decode)
 * can be dropped entirely by the compiler for this fast path.
 */
static void hl_decode_mb_simple(H264Context *h){
    hl_decode_mb_internal(h, 1);
}
3780

    
3781
/**
 * Process a macroblock; this handles edge cases, such as interlacing.
 *
 * av_noinline keeps this cold, fully-featured path (simple=0) out of the
 * caller so the common hl_decode_mb_simple path stays compact.
 */
static void av_noinline hl_decode_mb_complex(H264Context *h){
    hl_decode_mb_internal(h, 0);
}
3787

    
3788
/**
 * Decode the current macroblock, dispatching to the simple (fast) or
 * complex (fully featured) implementation depending on the active features.
 */
static void hl_decode_mb(H264Context *h){
    MpegEncContext * const s = &h->s;
    const int mb_xy  = s->mb_x + s->mb_y * s->mb_stride;
    const int mb_type= s->current_picture.mb_type[mb_xy];

    if(!s->decode)
        return;

    /* Any uncommon feature forces the slow path. */
    if(   FRAME_MBAFF || MB_FIELD || IS_INTRA_PCM(mb_type)
       || s->codec_id != CODEC_ID_H264
       || (s->flags & CODEC_FLAG_GRAY)
       || s->encoding)
        hl_decode_mb_complex(h);
    else
        hl_decode_mb_simple(h);
}
3803

    
3804
/**
3805
 * fills the default_ref_list.
3806
 */
3807
static int fill_default_ref_list(H264Context *h){
3808
    MpegEncContext * const s = &h->s;
3809
    int i;
3810
    int smallest_poc_greater_than_current = -1;
3811
    Picture sorted_short_ref[32];
3812

    
3813
    if(h->slice_type==B_TYPE){
3814
        int out_i;
3815
        int limit= INT_MIN;
3816

    
3817
        /* sort frame according to poc in B slice */
3818
        for(out_i=0; out_i<h->short_ref_count; out_i++){
3819
            int best_i=INT_MIN;
3820
            int best_poc=INT_MAX;
3821

    
3822
            for(i=0; i<h->short_ref_count; i++){
3823
                const int poc= h->short_ref[i]->poc;
3824
                if(poc > limit && poc < best_poc){
3825
                    best_poc= poc;
3826
                    best_i= i;
3827
                }
3828
            }
3829

    
3830
            assert(best_i != INT_MIN);
3831

    
3832
            limit= best_poc;
3833
            sorted_short_ref[out_i]= *h->short_ref[best_i];
3834
            tprintf(h->s.avctx, "sorted poc: %d->%d poc:%d fn:%d\n", best_i, out_i, sorted_short_ref[out_i].poc, sorted_short_ref[out_i].frame_num);
3835
            if (-1 == smallest_poc_greater_than_current) {
3836
                if (h->short_ref[best_i]->poc >= s->current_picture_ptr->poc) {
3837
                    smallest_poc_greater_than_current = out_i;
3838
                }
3839
            }
3840
        }
3841
    }
3842

    
3843
    if(s->picture_structure == PICT_FRAME){
3844
        if(h->slice_type==B_TYPE){
3845
            int list;
3846
            tprintf(h->s.avctx, "current poc: %d, smallest_poc_greater_than_current: %d\n", s->current_picture_ptr->poc, smallest_poc_greater_than_current);
3847

    
3848
            // find the largest poc
3849
            for(list=0; list<2; list++){
3850
                int index = 0;
3851
                int j= -99;
3852
                int step= list ? -1 : 1;
3853

    
3854
                for(i=0; i<h->short_ref_count && index < h->ref_count[list]; i++, j+=step) {
3855
                    while(j<0 || j>= h->short_ref_count){
3856
                        if(j != -99 && step == (list ? -1 : 1))
3857
                            return -1;
3858
                        step = -step;
3859
                        j= smallest_poc_greater_than_current + (step>>1);
3860
                    }
3861
                    if(sorted_short_ref[j].reference != 3) continue;
3862
                    h->default_ref_list[list][index  ]= sorted_short_ref[j];
3863
                    h->default_ref_list[list][index++].pic_id= sorted_short_ref[j].frame_num;
3864
                }
3865

    
3866
                for(i = 0; i < 16 && index < h->ref_count[ list ]; i++){
3867
                    if(h->long_ref[i] == NULL) continue;
3868
                    if(h->long_ref[i]->reference != 3) continue;
3869

    
3870
                    h->default_ref_list[ list ][index  ]= *h->long_ref[i];
3871
                    h->default_ref_list[ list ][index++].pic_id= i;;
3872
                }
3873

    
3874
                if(list && (smallest_poc_greater_than_current<=0 || smallest_poc_greater_than_current>=h->short_ref_count) && (1 < index)){
3875
                    // swap the two first elements of L1 when
3876
                    // L0 and L1 are identical
3877
                    Picture temp= h->default_ref_list[1][0];
3878
                    h->default_ref_list[1][0] = h->default_ref_list[1][1];
3879
                    h->default_ref_list[1][1] = temp;
3880
                }
3881

    
3882
                if(index < h->ref_count[ list ])
3883
                    memset(&h->default_ref_list[list][index], 0, sizeof(Picture)*(h->ref_count[ list ] - index));
3884
            }
3885
        }else{
3886
            int index=0;
3887
            for(i=0; i<h->short_ref_count; i++){
3888
                if(h->short_ref[i]->reference != 3) continue; //FIXME refernce field shit
3889
                h->default_ref_list[0][index  ]= *h->short_ref[i];
3890
                h->default_ref_list[0][index++].pic_id= h->short_ref[i]->frame_num;
3891
            }
3892
            for(i = 0; i < 16; i++){
3893
                if(h->long_ref[i] == NULL) continue;
3894
                if(h->long_ref[i]->reference != 3) continue;
3895
                h->default_ref_list[0][index  ]= *h->long_ref[i];
3896
                h->default_ref_list[0][index++].pic_id= i;;
3897
            }
3898
            if(index < h->ref_count[0])
3899
                memset(&h->default_ref_list[0][index], 0, sizeof(Picture)*(h->ref_count[0] - index));
3900
        }
3901
    }else{ //FIELD
3902
        if(h->slice_type==B_TYPE){
3903
        }else{
3904
            //FIXME second field balh
3905
        }
3906
    }
3907
#ifdef TRACE
3908
    for (i=0; i<h->ref_count[0]; i++) {
3909
        tprintf(h->s.avctx, "List0: %s fn:%d 0x%p\n", (h->default_ref_list[0][i].long_ref ? "LT" : "ST"), h->default_ref_list[0][i].pic_id, h->default_ref_list[0][i].data[0]);
3910
    }
3911
    if(h->slice_type==B_TYPE){
3912
        for (i=0; i<h->ref_count[1]; i++) {
3913
            tprintf(h->s.avctx, "List1: %s fn:%d 0x%p\n", (h->default_ref_list[1][i].long_ref ? "LT" : "ST"), h->default_ref_list[1][i].pic_id, h->default_ref_list[0][i].data[0]);
3914
        }
3915
    }
3916
#endif
3917
    return 0;
3918
}
3919

    
3920
static void print_short_term(H264Context *h);
3921
static void print_long_term(H264Context *h);
3922

    
3923
/**
 * Parse ref_pic_list_reordering() from the slice header and apply it.
 *
 * Starts from the default reference lists; when the reordering flag is set,
 * consumes (reordering_of_pic_nums_idc, value) pairs until idc==3 and moves
 * the selected picture to the current list position, shifting the rest down.
 * Missing pictures are zero-filled and later replaced by the current picture.
 *
 * @return 0 on success, -1 on an invalid bitstream
 */
static int decode_ref_pic_list_reordering(H264Context *h){
    MpegEncContext * const s = &h->s;
    int list, index;

    print_short_term(h);
    print_long_term(h);
    if(h->slice_type==I_TYPE || h->slice_type==SI_TYPE) return 0; //FIXME move before func

    for(list=0; list<h->list_count; list++){
        memcpy(h->ref_list[list], h->default_ref_list[list], sizeof(Picture)*h->ref_count[list]);

        if(get_bits1(&s->gb)){  // ref_pic_list_reordering_flag_lX
            int pred= h->curr_pic_num;

            for(index=0; ; index++){
                unsigned int reordering_of_pic_nums_idc= get_ue_golomb(&s->gb);
                unsigned int pic_id;
                int i;
                Picture *ref = NULL;

                if(reordering_of_pic_nums_idc==3)   // end of reordering commands
                    break;

                if(index >= h->ref_count[list]){
                    av_log(h->s.avctx, AV_LOG_ERROR, "reference count overflow\n");
                    return -1;
                }

                if(reordering_of_pic_nums_idc<3){
                    if(reordering_of_pic_nums_idc<2){
                        /* idc 0/1: short term, addressed by a frame_num delta */
                        const unsigned int abs_diff_pic_num= get_ue_golomb(&s->gb) + 1;

                        if(abs_diff_pic_num >= h->max_pic_num){
                            av_log(h->s.avctx, AV_LOG_ERROR, "abs_diff_pic_num overflow\n");
                            return -1;
                        }

                        if(reordering_of_pic_nums_idc == 0) pred-= abs_diff_pic_num;
                        else                                pred+= abs_diff_pic_num;
                        pred &= h->max_pic_num - 1;   // modular frame_num arithmetic

                        for(i= h->short_ref_count-1; i>=0; i--){
                            ref = h->short_ref[i];
                            assert(ref->reference == 3);
                            assert(!ref->long_ref);
                            if(ref->data[0] != NULL && ref->frame_num == pred && ref->long_ref == 0) // ignore non existing pictures by testing data[0] pointer
                                break;
                        }
                        if(i>=0)
                            ref->pic_id= ref->frame_num;
                    }else{
                        /* idc 2: long term, addressed directly by index */
                        pic_id= get_ue_golomb(&s->gb); //long_term_pic_idx
                        if(pic_id>31){
                            av_log(h->s.avctx, AV_LOG_ERROR, "long_term_pic_idx overflow\n");
                            return -1;
                        }
                        ref = h->long_ref[pic_id];
                        if(ref){
                            ref->pic_id= pic_id;
                            assert(ref->reference == 3);
                            assert(ref->long_ref);
                            i=0;   // found
                        }else{
                            i=-1;  // missing
                        }
                    }

                    if (i < 0) {
                        av_log(h->s.avctx, AV_LOG_ERROR, "reference picture missing during reorder\n");
                        memset(&h->ref_list[list][index], 0, sizeof(Picture)); //FIXME
                    } else {
                        /* find where (if anywhere) ref already sits past index,
                         * shift the range [index, i) down by one, insert ref */
                        for(i=index; i+1<h->ref_count[list]; i++){
                            if(ref->long_ref == h->ref_list[list][i].long_ref && ref->pic_id == h->ref_list[list][i].pic_id)
                                break;
                        }
                        for(; i > index; i--){
                            h->ref_list[list][i]= h->ref_list[list][i-1];
                        }
                        h->ref_list[list][index]= *ref;
                    }
                }else{
                    av_log(h->s.avctx, AV_LOG_ERROR, "illegal reordering_of_pic_nums_idc\n");
                    return -1;
                }
            }
        }
    }
    /* replace any zero-filled (missing) entries with the current picture */
    for(list=0; list<h->list_count; list++){
        for(index= 0; index < h->ref_count[list]; index++){
            if(!h->ref_list[list][index].data[0])
                h->ref_list[list][index]= s->current_picture;
        }
    }

    if(h->slice_type==B_TYPE && !h->direct_spatial_mv_pred)
        direct_dist_scale_factor(h);
    direct_ref_list_init(h);
    return 0;
}
4022

    
4023
/**
 * Generate per-field reference entries for MBAFF decoding.
 *
 * For every frame reference i in ref_list[list], two field views are stored
 * at indices 16+2*i (top) and 16+2*i+1 (bottom): same picture with doubled
 * linesizes, the bottom field additionally offset by one original line.
 * Weight/offset tables are duplicated to the matching field indices.
 */
static void fill_mbaff_ref_list(H264Context *h){
    int list, i, j;
    for(list=0; list<2; list++){ //FIXME try list_count
        for(i=0; i<h->ref_count[list]; i++){
            Picture *frame = &h->ref_list[list][i];
            Picture *field = &h->ref_list[list][16+2*i];
            field[0] = *frame;
            // doubling the linesize makes the frame buffer read as one field
            for(j=0; j<3; j++)
                field[0].linesize[j] <<= 1;
            field[1] = field[0];
            // bottom field starts one (original) line below the top field
            for(j=0; j<3; j++)
                field[1].data[j] += frame->linesize[j];

            h->luma_weight[list][16+2*i] = h->luma_weight[list][16+2*i+1] = h->luma_weight[list][i];
            h->luma_offset[list][16+2*i] = h->luma_offset[list][16+2*i+1] = h->luma_offset[list][i];
            for(j=0; j<2; j++){
                h->chroma_weight[list][16+2*i][j] = h->chroma_weight[list][16+2*i+1][j] = h->chroma_weight[list][i][j];
                h->chroma_offset[list][16+2*i][j] = h->chroma_offset[list][16+2*i+1][j] = h->chroma_offset[list][i][j];
            }
        }
    }
    /* duplicate implicit weights both along ref0 (columns) and ref1 (rows) */
    for(j=0; j<h->ref_count[1]; j++){
        for(i=0; i<h->ref_count[0]; i++)
            h->implicit_weight[j][16+2*i] = h->implicit_weight[j][16+2*i+1] = h->implicit_weight[j][i];
        memcpy(h->implicit_weight[16+2*j],   h->implicit_weight[j], sizeof(*h->implicit_weight));
        memcpy(h->implicit_weight[16+2*j+1], h->implicit_weight[j], sizeof(*h->implicit_weight));
    }
}
4051

    
4052
/**
 * Parse pred_weight_table() from the slice header (explicit weighted
 * prediction): log2 weight denominators followed by optional per-reference
 * luma and chroma weight/offset pairs for list0 and (B slices only) list1.
 * h->use_weight / h->use_weight_chroma are set to 1 iff any value differs
 * from the default identity weight.
 *
 * @return 0 (this parser has no failure paths)
 */
static int pred_weight_table(H264Context *h){
    MpegEncContext * const s = &h->s;
    int list, i;
    int luma_def, chroma_def;

    h->use_weight= 0;
    h->use_weight_chroma= 0;
    h->luma_log2_weight_denom= get_ue_golomb(&s->gb);
    h->chroma_log2_weight_denom= get_ue_golomb(&s->gb);
    // identity weight at the chosen denominator
    luma_def = 1<<h->luma_log2_weight_denom;
    chroma_def = 1<<h->chroma_log2_weight_denom;

    for(list=0; list<2; list++){
        for(i=0; i<h->ref_count[list]; i++){
            int luma_weight_flag, chroma_weight_flag;

            luma_weight_flag= get_bits1(&s->gb);
            if(luma_weight_flag){
                h->luma_weight[list][i]= get_se_golomb(&s->gb);
                h->luma_offset[list][i]= get_se_golomb(&s->gb);
                if(   h->luma_weight[list][i] != luma_def
                   || h->luma_offset[list][i] != 0)
                    h->use_weight= 1;
            }else{
                // flag absent: defaults (identity weight, zero offset)
                h->luma_weight[list][i]= luma_def;
                h->luma_offset[list][i]= 0;
            }

            chroma_weight_flag= get_bits1(&s->gb);
            if(chroma_weight_flag){
                int j;
                for(j=0; j<2; j++){   // j: Cb / Cr
                    h->chroma_weight[list][i][j]= get_se_golomb(&s->gb);
                    h->chroma_offset[list][i][j]= get_se_golomb(&s->gb);
                    if(   h->chroma_weight[list][i][j] != chroma_def
                       || h->chroma_offset[list][i][j] != 0)
                        h->use_weight_chroma= 1;
                }
            }else{
                int j;
                for(j=0; j<2; j++){
                    h->chroma_weight[list][i][j]= chroma_def;
                    h->chroma_offset[list][i][j]= 0;
                }
            }
        }
        // list1 weights are only present for B slices
        if(h->slice_type != B_TYPE) break;
    }
    h->use_weight= h->use_weight || h->use_weight_chroma;
    return 0;
}
4103

    
4104
/**
 * Derive implicit bi-prediction weights from the POC distances between the
 * current picture and each (list0, list1) reference pair.
 *
 * If there is exactly one reference per list and the current picture lies
 * temporally midway between them, weighting is disabled entirely.
 * Otherwise use_weight is set to 2 (implicit mode; explicit parsing sets 1)
 * and h->implicit_weight[ref0][ref1] is filled with the list1 weight
 * (list0 weight is 64 minus it, at denominator 2^5 = 32/32 neutral).
 */
static void implicit_weight_table(H264Context *h){
    MpegEncContext * const s = &h->s;
    int ref0, ref1;
    int cur_poc = s->current_picture_ptr->poc;

    if(   h->ref_count[0] == 1 && h->ref_count[1] == 1
       && h->ref_list[0][0].poc + h->ref_list[1][0].poc == 2*cur_poc){
        // symmetric single-pair case: implicit weighting is a no-op
        h->use_weight= 0;
        h->use_weight_chroma= 0;
        return;
    }

    h->use_weight= 2;
    h->use_weight_chroma= 2;
    h->luma_log2_weight_denom= 5;
    h->chroma_log2_weight_denom= 5;

    for(ref0=0; ref0 < h->ref_count[0]; ref0++){
        int poc0 = h->ref_list[0][ref0].poc;
        for(ref1=0; ref1 < h->ref_count[1]; ref1++){
            int poc1 = h->ref_list[1][ref1].poc;
            int td = av_clip(poc1 - poc0, -128, 127);   // temporal distance between refs
            if(td){
                int tb = av_clip(cur_poc - poc0, -128, 127);   // distance cur <-> ref0
                int tx = (16384 + (FFABS(td) >> 1)) / td;      // fixed-point 1/td
                int dist_scale_factor = av_clip((tb*tx + 32) >> 6, -1024, 1023) >> 2;
                // out-of-range scale factors fall back to the neutral weight
                if(dist_scale_factor < -64 || dist_scale_factor > 128)
                    h->implicit_weight[ref0][ref1] = 32;
                else
                    h->implicit_weight[ref0][ref1] = 64 - dist_scale_factor;
            }else
                h->implicit_weight[ref0][ref1] = 32;   // zero distance: equal weights
        }
    }
}
4139

    
4140
/**
 * Clear a picture's reference flag, unless it is still queued for output
 * (as delayed_output_pic or in the delayed_pic list), in which case the
 * flag is kept at 1 so the buffer is not recycled before display.
 */
static inline void unreference_pic(H264Context *h, Picture *pic){
    int i;

    if(pic == h->delayed_output_pic){
        pic->reference= 1;
        return;
    }
    pic->reference= 0;
    for(i= 0; h->delayed_pic[i]; i++){
        if(h->delayed_pic[i] == pic){
            pic->reference= 1;
            return;
        }
    }
}
4153

    
4154
/**
4155
 * instantaneous decoder refresh.
4156
 */
4157
static void idr(H264Context *h){
4158
    int i;
4159

    
4160
    for(i=0; i<16; i++){
4161
        if (h->long_ref[i] != NULL) {
4162
            unreference_pic(h, h->long_ref[i]);
4163
            h->long_ref[i]= NULL;
4164
        }
4165
    }
4166
    h->long_ref_count=0;
4167

    
4168
    for(i=0; i<h->short_ref_count; i++){
4169
        unreference_pic(h, h->short_ref[i]);
4170
        h->short_ref[i]= NULL;
4171
    }
4172
    h->short_ref_count=0;
4173
}
4174

    
4175
/* forget old pics after a seek */
4176
static void flush_dpb(AVCodecContext *avctx){
4177
    H264Context *h= avctx->priv_data;
4178
    int i;
4179
    for(i=0; i<16; i++) {
4180
        if(h->delayed_pic[i])
4181
            h->delayed_pic[i]->reference= 0;
4182
        h->delayed_pic[i]= NULL;
4183
    }
4184
    if(h->delayed_output_pic)
4185
        h->delayed_output_pic->reference= 0;
4186
    h->delayed_output_pic= NULL;
4187
    idr(h);
4188
    if(h->s.current_picture_ptr)
4189
        h->s.current_picture_ptr->reference= 0;
4190
}
4191

    
4192
/**
4193
 *
4194
 * @return the removed picture or NULL if an error occurs
4195
 */
4196
static Picture * remove_short(H264Context *h, int frame_num){
4197
    MpegEncContext * const s = &h->s;
4198
    int i;
4199

    
4200
    if(s->avctx->debug&FF_DEBUG_MMCO)
4201
        av_log(h->s.avctx, AV_LOG_DEBUG, "remove short %d count %d\n", frame_num, h->short_ref_count);
4202

    
4203
    for(i=0; i<h->short_ref_count; i++){
4204
        Picture *pic= h->short_ref[i];
4205
        if(s->avctx->debug&FF_DEBUG_MMCO)
4206
            av_log(h->s.avctx, AV_LOG_DEBUG, "%d %d %p\n", i, pic->frame_num, pic);
4207
        if(pic->frame_num == frame_num){
4208
            h->short_ref[i]= NULL;
4209
            memmove(&h->short_ref[i], &h->short_ref[i+1], (h->short_ref_count - i - 1)*sizeof(Picture*));
4210
            h->short_ref_count--;
4211
            return pic;
4212
        }
4213
    }
4214
    return NULL;
4215
}
4216

    
4217
/**
4218
 *
4219
 * @return the removed picture or NULL if an error occurs
4220
 */
4221
static Picture * remove_long(H264Context *h, int i){
4222
    Picture *pic;
4223

    
4224
    pic= h->long_ref[i];
4225
    h->long_ref[i]= NULL;
4226
    if(pic) h->long_ref_count--;
4227

    
4228
    return pic;
4229
}
4230

    
4231
/**
4232
 * print short term list
4233
 */
4234
static void print_short_term(H264Context *h) {
4235
    uint32_t i;
4236
    if(h->s.avctx->debug&FF_DEBUG_MMCO) {
4237
        av_log(h->s.avctx, AV_LOG_DEBUG, "short term list:\n");
4238
        for(i=0; i<h->short_ref_count; i++){
4239
            Picture *pic= h->short_ref[i];
4240
            av_log(h->s.avctx, AV_LOG_DEBUG, "%d fn:%d poc:%d %p\n", i, pic->frame_num, pic->poc, pic->data[0]);
4241
        }
4242
    }
4243
}
4244

    
4245
/**
4246
 * print long term list
4247
 */
4248
static void print_long_term(H264Context *h) {
4249
    uint32_t i;
4250
    if(h->s.avctx->debug&FF_DEBUG_MMCO) {
4251
        av_log(h->s.avctx, AV_LOG_DEBUG, "long term list:\n");
4252
        for(i = 0; i < 16; i++){
4253
            Picture *pic= h->long_ref[i];
4254
            if (pic) {
4255
                av_log(h->s.avctx, AV_LOG_DEBUG, "%d fn:%d poc:%d %p\n", i, pic->frame_num, pic->poc, pic->data[0]);
4256
            }
4257
        }
4258
    }
4259
}
4260

    
4261
/**
4262
 * Executes the reference picture marking (memory management control operations).
4263
 */
4264
static int execute_ref_pic_marking(H264Context *h, MMCO *mmco, int mmco_count){
4265
    MpegEncContext * const s = &h->s;
4266
    int i, j;
4267
    int current_is_long=0;
4268
    Picture *pic;
4269

    
4270
    if((s->avctx->debug&FF_DEBUG_MMCO) && mmco_count==0)
4271
        av_log(h->s.avctx, AV_LOG_DEBUG, "no mmco here\n");
4272

    
4273
    for(i=0; i<mmco_count; i++){
4274
        if(s->avctx->debug&FF_DEBUG_MMCO)
4275
            av_log(h->s.avctx, AV_LOG_DEBUG, "mmco:%d %d %d\n", h->mmco[i].opcode, h->mmco[i].short_frame_num, h->mmco[i].long_index);
4276

    
4277
        switch(mmco[i].opcode){
4278
        case MMCO_SHORT2UNUSED:
4279