Statistics
| Branch: | Revision:

ffmpeg / libavcodec / h264.c @ 6de06295

History | View | Annotate | Download (321 KB)

1
/*
2
 * H.26L/H.264/AVC/JVT/14496-10/... encoder/decoder
3
 * Copyright (c) 2003 Michael Niedermayer <michaelni@gmx.at>
4
 *
5
 * This file is part of FFmpeg.
6
 *
7
 * FFmpeg is free software; you can redistribute it and/or
8
 * modify it under the terms of the GNU Lesser General Public
9
 * License as published by the Free Software Foundation; either
10
 * version 2.1 of the License, or (at your option) any later version.
11
 *
12
 * FFmpeg is distributed in the hope that it will be useful,
13
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
15
 * Lesser General Public License for more details.
16
 *
17
 * You should have received a copy of the GNU Lesser General Public
18
 * License along with FFmpeg; if not, write to the Free Software
19
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
20
 *
21
 */
22

    
23
/**
24
 * @file h264.c
25
 * H.264 / AVC / MPEG4 part10 codec.
26
 * @author Michael Niedermayer <michaelni@gmx.at>
27
 */
28

    
29
#include "common.h"
30
#include "dsputil.h"
31
#include "avcodec.h"
32
#include "mpegvideo.h"
33
#include "h264data.h"
34
#include "golomb.h"
35

    
36
#include "cabac.h"
37

    
38
//#undef NDEBUG
39
#include <assert.h>
40

    
41
/* Poison two MpegEncContext field names whose values are not maintained
 * by this decoder; any accidental use becomes a compile error. */
#define interlaced_dct interlaced_dct_is_a_bad_name
#define mb_intra mb_intra_isnt_initalized_see_mb_type

/* Block indices of the luma and chroma DC coefficient blocks
 * (blocks 0..15 are luma 4x4, 16..23 chroma AC). */
#define LUMA_DC_BLOCK_INDEX   25
#define CHROMA_DC_BLOCK_INDEX 26

/* Table sizes (in bits) for the CAVLC VLC lookup tables declared below. */
#define CHROMA_DC_COEFF_TOKEN_VLC_BITS 8
#define COEFF_TOKEN_VLC_BITS           8
#define TOTAL_ZEROS_VLC_BITS           9
#define CHROMA_DC_TOTAL_ZEROS_VLC_BITS 3
#define RUN_VLC_BITS                   3
#define RUN7_VLC_BITS                  6

/* Maximum numbers of parameter sets the decoder keeps buffered. */
#define MAX_SPS_COUNT 32
#define MAX_PPS_COUNT 256

/* Upper bound on memory management control operations per slice header. */
#define MAX_MMCO_COUNT 66

/* Compiling in interlaced support reduces the speed
 * of progressive decoding by about 2%. */
#define ALLOW_INTERLACE

/* With ALLOW_INTERLACE the MBAFF/field flags come from the context;
 * otherwise they collapse to constants so the interlaced code paths
 * are removed at compile time. */
#ifdef ALLOW_INTERLACE
#define MB_MBAFF h->mb_mbaff
#define MB_FIELD h->mb_field_decoding_flag
#define FRAME_MBAFF h->mb_aff_frame
#else
#define MB_MBAFF 0
#define MB_FIELD 0
#define FRAME_MBAFF 0
#undef  IS_INTERLACED
#define IS_INTERLACED(mb_type) 0
#endif
74

    
75
/**
 * Sequence parameter set.
 * Field names follow the corresponding H.264 bitstream syntax elements;
 * where the stored value differs from the raw element, the comment
 * gives the mapping.
 */
typedef struct SPS{
    int profile_idc;
    int level_idc;
    int transform_bypass;              ///< qpprime_y_zero_transform_bypass_flag
    int log2_max_frame_num;            ///< log2_max_frame_num_minus4 + 4
    int poc_type;                      ///< pic_order_cnt_type
    int log2_max_poc_lsb;              ///< log2_max_pic_order_cnt_lsb_minus4
    int delta_pic_order_always_zero_flag;
    int offset_for_non_ref_pic;        ///< POC type 1 parameter
    int offset_for_top_to_bottom_field; ///< POC type 1 parameter
    int poc_cycle_length;              ///< num_ref_frames_in_pic_order_cnt_cycle
    int ref_frame_count;               ///< num_ref_frames
    int gaps_in_frame_num_allowed_flag;
    int mb_width;                      ///< frame_width_in_mbs_minus1 + 1
    int mb_height;                     ///< frame_height_in_mbs_minus1 + 1
    int frame_mbs_only_flag;
    int mb_aff;                        ///< mb_adaptive_frame_field_flag
    int direct_8x8_inference_flag;
    int crop;                   ///< frame_cropping_flag
    int crop_left;              ///< frame_cropping_rect_left_offset
    int crop_right;             ///< frame_cropping_rect_right_offset
    int crop_top;               ///< frame_cropping_rect_top_offset
    int crop_bottom;            ///< frame_cropping_rect_bottom_offset
    int vui_parameters_present_flag;
    AVRational sar;             ///< sample aspect ratio from the VUI
    int timing_info_present_flag;
    uint32_t num_units_in_tick;
    uint32_t time_scale;
    int fixed_frame_rate_flag;
    short offset_for_ref_frame[256]; //FIXME dyn aloc?
    int bitstream_restriction_flag;
    int num_reorder_frames;
    int scaling_matrix_present;
    uint8_t scaling_matrix4[6][16];  ///< 4x4 scaling lists (zigzag order)
    uint8_t scaling_matrix8[2][64];  ///< 8x8 scaling lists (zigzag order)
}SPS;
115

    
116
/**
 * Picture parameter set.
 * Field names follow the corresponding H.264 bitstream syntax elements.
 */
typedef struct PPS{
    int sps_id;                 ///< seq_parameter_set_id this PPS refers to
    int cabac;                  ///< entropy_coding_mode_flag
    int pic_order_present;      ///< pic_order_present_flag
    int slice_group_count;      ///< num_slice_groups_minus1 + 1
    int mb_slice_group_map_type;
    int ref_count[2];           ///< num_ref_idx_l0/1_active_minus1 + 1
    int weighted_pred;          ///< weighted_pred_flag
    int weighted_bipred_idc;
    int init_qp;                ///< pic_init_qp_minus26 + 26
    int init_qs;                ///< pic_init_qs_minus26 + 26
    int chroma_qp_index_offset;
    int deblocking_filter_parameters_present; ///< deblocking_filter_parameters_present_flag
    int constrained_intra_pred; ///< constrained_intra_pred_flag
    int redundant_pic_cnt_present; ///< redundant_pic_cnt_present_flag
    int transform_8x8_mode;     ///< transform_8x8_mode_flag
    uint8_t scaling_matrix4[6][16];  ///< 4x4 scaling lists (zigzag order)
    uint8_t scaling_matrix8[2][64];  ///< 8x8 scaling lists (zigzag order)
}PPS;
138

    
139
/**
 * Memory management control operation opcode.
 * The enumerator values follow the memory_management_control_operation
 * codes of the H.264 specification (0 terminates the MMCO list).
 */
typedef enum MMCOOpcode{
    MMCO_END=0,          ///< end of the MMCO list
    MMCO_SHORT2UNUSED,   ///< mark a short-term reference as unused
    MMCO_LONG2UNUSED,    ///< mark a long-term reference as unused
    MMCO_SHORT2LONG,     ///< turn a short-term reference into a long-term one
    MMCO_SET_MAX_LONG,   ///< set the maximum long-term frame index
    MMCO_RESET,          ///< mark all reference pictures unused
    MMCO_LONG,           ///< mark the current picture as a long-term reference
} MMCOOpcode;
151

    
152
/**
 * Memory management control operation.
 * Which of the two operand fields is meaningful depends on the opcode.
 */
typedef struct MMCO{
    MMCOOpcode opcode;
    int short_frame_num; ///< frame_num of the targeted short-term reference
    int long_index;      ///< long-term frame index operand
} MMCO;
160

    
161
/**
 * H264Context — complete per-decoder state of the H.264 decoder,
 * embedding the shared MpegEncContext as its first member.
 */
typedef struct H264Context{
    MpegEncContext s;
    int nal_ref_idc;     ///< nal_ref_idc of the current NAL unit
    int nal_unit_type;   ///< nal_unit_type of the current NAL unit
    uint8_t *rbsp_buffer;           // scratch buffer, presumably for RBSP unescaping — confirm at use site
    unsigned int rbsp_buffer_size;  ///< allocated size of rbsp_buffer

    /**
      * Used to parse AVC variant of h264
      */
    int is_avc; ///< this flag is != 0 if codec is avc1
    int got_avcC; ///< flag used to parse avcC data only once
    int nal_length_size; ///< Number of bytes used for nal length (1, 2 or 4)

    int chroma_qp; //QPc

    int prev_mb_skipped;
    int next_mb_skipped;

    //prediction stuff
    int chroma_pred_mode;
    int intra16x16_pred_mode;

    // mb_xy indices of the neighbors of the current macroblock
    int top_mb_xy;
    int left_mb_xy[2];

    // per-MB caches laid out on an 8-wide grid (see scan8[] usage below)
    int8_t intra4x4_pred_mode_cache[5*8];
    int8_t (*intra4x4_pred_mode)[8];
    // intra prediction function pointers, indexed by prediction mode
    void (*pred4x4  [9+3])(uint8_t *src, uint8_t *topright, int stride);//FIXME move to dsp?
    void (*pred8x8l [9+3])(uint8_t *src, int topleft, int topright, int stride);
    void (*pred8x8  [4+3])(uint8_t *src, int stride);
    void (*pred16x16[4+3])(uint8_t *src, int stride);
    // bitmasks of neighboring samples available for intra prediction
    unsigned int topleft_samples_available;
    unsigned int top_samples_available;
    unsigned int topright_samples_available;
    unsigned int left_samples_available;
    uint8_t (*top_borders[2])[16+2*8];
    uint8_t left_border[2*(17+2*9)];

    /**
     * non zero coeff count cache.
     * is 64 if not available.
     */
    DECLARE_ALIGNED_8(uint8_t, non_zero_count_cache[6*8]);
    uint8_t (*non_zero_count)[16];

    /**
     * Motion vector cache.
     */
    DECLARE_ALIGNED_8(int16_t, mv_cache[2][5*8][2]);
    DECLARE_ALIGNED_8(int8_t, ref_cache[2][5*8]);
// special ref_cache values (must stay negative, real refs are >= 0)
#define LIST_NOT_USED -1 //FIXME rename?
#define PART_NOT_AVAILABLE -2

    /**
     * is 1 if the specific list MV&references are set to 0,0,-2.
     */
    int mv_cache_clean[2];

    /**
     * number of neighbors (top and/or left) that used 8x8 dct
     */
    int neighbor_transform_size;

    /**
     * block_offset[ 0..23] for frame macroblocks
     * block_offset[24..47] for field macroblocks
     */
    int block_offset[2*(16+8)];

    // macroblock index -> motion vector / ref index grid conversion tables
    uint32_t *mb2b_xy; //FIXME are these 4 a good idea?
    uint32_t *mb2b8_xy;
    int b_stride; //FIXME use s->b4_stride
    int b8_stride;

    int mb_linesize;   ///< may be equal to s->linesize or s->linesize*2, for mbaff
    int mb_uvlinesize;

    int emu_edge_width;
    int emu_edge_height;

    // SVQ3-specific state (this decoder also backs the SVQ3 codec)
    int halfpel_flag;
    int thirdpel_flag;

    int unknown_svq3_flag;
    int next_slice_index;

    SPS sps_buffer[MAX_SPS_COUNT];
    SPS sps; ///< current sps

    PPS pps_buffer[MAX_PPS_COUNT];
    /**
     * current pps
     */
    PPS pps; //FIXME move to Picture perhaps? (->no) do we need that?

    // dequantization tables: [block type][qp][coefficient]
    uint32_t dequant4_buffer[6][52][16];
    uint32_t dequant8_buffer[2][52][64];
    uint32_t (*dequant4_coeff[6])[16];
    uint32_t (*dequant8_coeff[2])[64];
    int dequant_coeff_pps;     ///< reinit tables when pps changes

    int slice_num;
    uint8_t *slice_table_base;
    uint8_t *slice_table;      ///< slice_table_base + 2*mb_stride + 1
    int slice_type;
    int slice_type_fixed;

    //interlacing specific flags
    int mb_aff_frame;
    int mb_field_decoding_flag;
    int mb_mbaff;              ///< mb_aff_frame && mb_field_decoding_flag

    int sub_mb_type[4];

    //POC stuff
    int poc_lsb;
    int poc_msb;
    int delta_poc_bottom;
    int delta_poc[2];
    int frame_num;
    int prev_poc_msb;             ///< poc_msb of the last reference pic for POC type 0
    int prev_poc_lsb;             ///< poc_lsb of the last reference pic for POC type 0
    int frame_num_offset;         ///< for POC type 2
    int prev_frame_num_offset;    ///< for POC type 2
    int prev_frame_num;           ///< frame_num of the last pic for POC type 1/2

    /**
     * frame_num for frames or 2*frame_num for field pics.
     */
    int curr_pic_num;

    /**
     * max_frame_num or 2*max_frame_num for field pics.
     */
    int max_pic_num;

    //Weighted pred stuff
    int use_weight;
    int use_weight_chroma;
    int luma_log2_weight_denom;
    int chroma_log2_weight_denom;
    // explicit weights/offsets per list and reference (48 = mbaff field refs)
    int luma_weight[2][48];
    int luma_offset[2][48];
    int chroma_weight[2][48][2];
    int chroma_offset[2][48][2];
    int implicit_weight[48][48];

    //deblock
    int deblocking_filter;         ///< disable_deblocking_filter_idc with 1<->0
    int slice_alpha_c0_offset;
    int slice_beta_offset;

    int redundant_pic_count;

    //direct mode prediction tables
    int direct_spatial_mv_pred;
    int dist_scale_factor[16];
    int dist_scale_factor_field[32];
    int map_col_to_list0[2][16];
    int map_col_to_list0_field[2][32];

    /**
     * num_ref_idx_l0/1_active_minus1 + 1
     */
    int ref_count[2];            ///< counts frames or fields, depending on current mb mode
    Picture *short_ref[32];
    Picture *long_ref[32];
    Picture default_ref_list[2][32];
    Picture ref_list[2][48];     ///< 0..15: frame refs, 16..47: mbaff field refs
    Picture *delayed_pic[16]; //FIXME size?
    Picture *delayed_output_pic;

    /**
     * memory management control operations buffer.
     */
    MMCO mmco[MAX_MMCO_COUNT];
    int mmco_index;

    int long_ref_count;  ///< number of actual long term references
    int short_ref_count; ///< number of actual short term references

    //data partitioning
    GetBitContext intra_gb;
    GetBitContext inter_gb;
    GetBitContext *intra_gb_ptr;
    GetBitContext *inter_gb_ptr;

    // residual coefficients of the current macroblock (16 luma + 8 chroma blocks)
    DECLARE_ALIGNED_8(DCTELEM, mb[16*24]);

    /**
     * Cabac
     */
    CABACContext cabac;
    uint8_t      cabac_state[460];
    int          cabac_init_idc;

    /* 0x100 -> non null luma_dc, 0x80/0x40 -> non null chroma_dc (cb/cr), 0x?0 -> chroma_cbp(0,1,2), 0x0? luma_cbp */
    uint16_t     *cbp_table;
    int cbp;
    int top_cbp;
    int left_cbp;
    /* chroma_pred_mode for i4x4 or i16x16, else 0 */
    uint8_t     *chroma_pred_mode_table;
    int         last_qscale_diff;
    int16_t     (*mvd_table[2])[2];
    DECLARE_ALIGNED_8(int16_t, mvd_cache[2][5*8][2]);
    uint8_t     *direct_table;
    uint8_t     direct_cache[5*8];

    // coefficient scan orders; *_q0 are the unpermuted (flat matrix) variants
    uint8_t zigzag_scan[16];
    uint8_t zigzag_scan8x8[64];
    uint8_t zigzag_scan8x8_cavlc[64];
    uint8_t field_scan[16];
    uint8_t field_scan8x8[64];
    uint8_t field_scan8x8_cavlc[64];
    const uint8_t *zigzag_scan_q0;
    const uint8_t *zigzag_scan8x8_q0;
    const uint8_t *zigzag_scan8x8_cavlc_q0;
    const uint8_t *field_scan_q0;
    const uint8_t *field_scan8x8_q0;
    const uint8_t *field_scan8x8_cavlc_q0;

    int x264_build;  ///< x264 encoder version parsed from SEI, for bug workarounds — confirm at use site
}H264Context;
388

    
389
/* CAVLC VLC lookup tables; filled in elsewhere in this file. */
static VLC coeff_token_vlc[4];
static VLC chroma_dc_coeff_token_vlc;

static VLC total_zeros_vlc[15];
static VLC chroma_dc_total_zeros_vlc[3];

static VLC run_vlc[6];
static VLC run7_vlc;

/* Forward declarations (definitions appear later in the file). */
static void svq3_luma_dc_dequant_idct_c(DCTELEM *block, int qp);
static void svq3_add_idct_c(uint8_t *dst, DCTELEM *block, int stride, int qp, int dc);
static void filter_mb( H264Context *h, int mb_x, int mb_y, uint8_t *img_y, uint8_t *img_cb, uint8_t *img_cr, unsigned int linesize, unsigned int uvlinesize);
static void filter_mb_fast( H264Context *h, int mb_x, int mb_y, uint8_t *img_y, uint8_t *img_cb, uint8_t *img_cr, unsigned int linesize, unsigned int uvlinesize);
402

    
403
/**
 * Pack two 16-bit values into one 32-bit word in memory order:
 * a lands in the first int16_t of the pair, b in the second,
 * regardless of host endianness.
 *
 * The shifted operand is cast to uint32_t first: callers pack negative
 * values (e.g. ref_cache entries of LIST_NOT_USED/PART_NOT_AVAILABLE),
 * and left-shifting a negative int is undefined behavior in C. The
 * unsigned shift yields the same bit pattern with defined semantics.
 */
static av_always_inline uint32_t pack16to32(int a, int b){
#ifdef WORDS_BIGENDIAN
   return (b&0xFFFF) + ((uint32_t)a<<16);
#else
   return (a&0xFFFF) + ((uint32_t)b<<16);
#endif
}
410

    
411
/* Lookup table: ff_rem6[q] == q % 6 for 0 <= q < 52 (the QP range),
 * avoiding a division in the dequant QP decomposition. */
const uint8_t ff_rem6[52]={
0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3,
};
414

    
415
/* Lookup table: ff_div6[q] == q / 6 for 0 <= q < 52 (the QP range),
 * companion to ff_rem6[] above. */
const uint8_t ff_div6[52]={
0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 6, 6, 6, 6, 6, 6, 7, 7, 7, 7, 7, 7, 8, 8, 8, 8,
};
418

    
419

    
420
/**
 * fill a rectangle.
 * Stores val over a w x h block of elements starting at vp. All paths
 * write whole 16/32/64-bit words, so the arguments should be constants
 * to let the compiler drop the dead branches.
 * @param h height of the rectangle, should be a constant
 * @param w width of the rectangle, should be a constant
 * @param size the size of val (1 or 4), should be a constant
 */
static av_always_inline void fill_rectangle(void *vp, int w, int h, int stride, uint32_t val, int size){
    uint8_t *p= (uint8_t*)vp;
    assert(size==1 || size==4);
    assert(w<=4);

    /* from here on w and stride are measured in bytes */
    w      *= size;
    stride *= size;

    /* destination and stride must be aligned to the (byte) write width */
    assert((((long)vp)&(FFMIN(w, STRIDE_ALIGN)-1)) == 0);
    assert((stride&(w-1))==0);
    /* NOTE(review): the stores below type-pun the uint8_t* destination to
     * wider integer types; this relies on compiler tolerance of strict-
     * aliasing violations rather than standard C. */
    if(w==2){
        /* w==2 bytes implies size==1, so replicate the byte */
        const uint16_t v= size==4 ? val : val*0x0101;
        *(uint16_t*)(p + 0*stride)= v;
        if(h==1) return;
        *(uint16_t*)(p + 1*stride)= v;
        if(h==2) return;
        *(uint16_t*)(p + 2*stride)=
        *(uint16_t*)(p + 3*stride)= v;
    }else if(w==4){
        const uint32_t v= size==4 ? val : val*0x01010101;
        *(uint32_t*)(p + 0*stride)= v;
        if(h==1) return;
        *(uint32_t*)(p + 1*stride)= v;
        if(h==2) return;
        *(uint32_t*)(p + 2*stride)=
        *(uint32_t*)(p + 3*stride)= v;
    }else if(w==8){
    /* w==8 or w==16 bytes is only reachable with size==4 (w<=4 before
     * scaling), so val is already a full 32-bit pattern and merely gets
     * duplicated into both halves of a 64-bit word.
     * Note the unusual preprocessor layout: the #if/#else below spans
     * BOTH the w==8 and w==16 branches, selecting 64-bit stores on
     * 64-bit hosts and pairs/quads of 32-bit stores otherwise. */
    //gcc can't optimize 64bit math on x86_32
#if defined(ARCH_X86_64) || (defined(MP_WORDSIZE) && MP_WORDSIZE >= 64)
        const uint64_t v= val*0x0100000001ULL;
        *(uint64_t*)(p + 0*stride)= v;
        if(h==1) return;
        *(uint64_t*)(p + 1*stride)= v;
        if(h==2) return;
        *(uint64_t*)(p + 2*stride)=
        *(uint64_t*)(p + 3*stride)= v;
    }else if(w==16){
        /* NOTE(review): no h==1 early-out here — the w==16 path is
         * evidently only used with h>=2. */
        const uint64_t v= val*0x0100000001ULL;
        *(uint64_t*)(p + 0+0*stride)=
        *(uint64_t*)(p + 8+0*stride)=
        *(uint64_t*)(p + 0+1*stride)=
        *(uint64_t*)(p + 8+1*stride)= v;
        if(h==2) return;
        *(uint64_t*)(p + 0+2*stride)=
        *(uint64_t*)(p + 8+2*stride)=
        *(uint64_t*)(p + 0+3*stride)=
        *(uint64_t*)(p + 8+3*stride)= v;
#else
        *(uint32_t*)(p + 0+0*stride)=
        *(uint32_t*)(p + 4+0*stride)= val;
        if(h==1) return;
        *(uint32_t*)(p + 0+1*stride)=
        *(uint32_t*)(p + 4+1*stride)= val;
        if(h==2) return;
        *(uint32_t*)(p + 0+2*stride)=
        *(uint32_t*)(p + 4+2*stride)=
        *(uint32_t*)(p + 0+3*stride)=
        *(uint32_t*)(p + 4+3*stride)= val;
    }else if(w==16){
        *(uint32_t*)(p + 0+0*stride)=
        *(uint32_t*)(p + 4+0*stride)=
        *(uint32_t*)(p + 8+0*stride)=
        *(uint32_t*)(p +12+0*stride)=
        *(uint32_t*)(p + 0+1*stride)=
        *(uint32_t*)(p + 4+1*stride)=
        *(uint32_t*)(p + 8+1*stride)=
        *(uint32_t*)(p +12+1*stride)= val;
        if(h==2) return;
        *(uint32_t*)(p + 0+2*stride)=
        *(uint32_t*)(p + 4+2*stride)=
        *(uint32_t*)(p + 8+2*stride)=
        *(uint32_t*)(p +12+2*stride)=
        *(uint32_t*)(p + 0+3*stride)=
        *(uint32_t*)(p + 4+3*stride)=
        *(uint32_t*)(p + 8+3*stride)=
        *(uint32_t*)(p +12+3*stride)= val;
#endif
    }else
        assert(0);
    /* only h==4 rectangles fall through all the early returns */
    assert(h==4);
}
507

    
508
static void fill_caches(H264Context *h, int mb_type, int for_deblock){
509
    MpegEncContext * const s = &h->s;
510
    const int mb_xy= s->mb_x + s->mb_y*s->mb_stride;
511
    int topleft_xy, top_xy, topright_xy, left_xy[2];
512
    int topleft_type, top_type, topright_type, left_type[2];
513
    int left_block[8];
514
    int i;
515

    
516
    //FIXME deblocking could skip the intra and nnz parts.
517
    if(for_deblock && (h->slice_num == 1 || h->slice_table[mb_xy] == h->slice_table[mb_xy-s->mb_stride]) && !FRAME_MBAFF)
518
        return;
519

    
520
    //wow what a mess, why didn't they simplify the interlacing&intra stuff, i can't imagine that these complex rules are worth it
521

    
522
    top_xy     = mb_xy  - s->mb_stride;
523
    topleft_xy = top_xy - 1;
524
    topright_xy= top_xy + 1;
525
    left_xy[1] = left_xy[0] = mb_xy-1;
526
    left_block[0]= 0;
527
    left_block[1]= 1;
528
    left_block[2]= 2;
529
    left_block[3]= 3;
530
    left_block[4]= 7;
531
    left_block[5]= 10;
532
    left_block[6]= 8;
533
    left_block[7]= 11;
534
    if(FRAME_MBAFF){
535
        const int pair_xy          = s->mb_x     + (s->mb_y & ~1)*s->mb_stride;
536
        const int top_pair_xy      = pair_xy     - s->mb_stride;
537
        const int topleft_pair_xy  = top_pair_xy - 1;
538
        const int topright_pair_xy = top_pair_xy + 1;
539
        const int topleft_mb_frame_flag  = !IS_INTERLACED(s->current_picture.mb_type[topleft_pair_xy]);
540
        const int top_mb_frame_flag      = !IS_INTERLACED(s->current_picture.mb_type[top_pair_xy]);
541
        const int topright_mb_frame_flag = !IS_INTERLACED(s->current_picture.mb_type[topright_pair_xy]);
542
        const int left_mb_frame_flag = !IS_INTERLACED(s->current_picture.mb_type[pair_xy-1]);
543
        const int curr_mb_frame_flag = !IS_INTERLACED(mb_type);
544
        const int bottom = (s->mb_y & 1);
545
        tprintf("fill_caches: curr_mb_frame_flag:%d, left_mb_frame_flag:%d, topleft_mb_frame_flag:%d, top_mb_frame_flag:%d, topright_mb_frame_flag:%d\n", curr_mb_frame_flag, left_mb_frame_flag, topleft_mb_frame_flag, top_mb_frame_flag, topright_mb_frame_flag);
546
        if (bottom
547
                ? !curr_mb_frame_flag // bottom macroblock
548
                : (!curr_mb_frame_flag && !top_mb_frame_flag) // top macroblock
549
                ) {
550
            top_xy -= s->mb_stride;
551
        }
552
        if (bottom
553
                ? !curr_mb_frame_flag // bottom macroblock
554
                : (!curr_mb_frame_flag && !topleft_mb_frame_flag) // top macroblock
555
                ) {
556
            topleft_xy -= s->mb_stride;
557
        }
558
        if (bottom
559
                ? !curr_mb_frame_flag // bottom macroblock
560
                : (!curr_mb_frame_flag && !topright_mb_frame_flag) // top macroblock
561
                ) {
562
            topright_xy -= s->mb_stride;
563
        }
564
        if (left_mb_frame_flag != curr_mb_frame_flag) {
565
            left_xy[1] = left_xy[0] = pair_xy - 1;
566
            if (curr_mb_frame_flag) {
567
                if (bottom) {
568
                    left_block[0]= 2;
569
                    left_block[1]= 2;
570
                    left_block[2]= 3;
571
                    left_block[3]= 3;
572
                    left_block[4]= 8;
573
                    left_block[5]= 11;
574
                    left_block[6]= 8;
575
                    left_block[7]= 11;
576
                } else {
577
                    left_block[0]= 0;
578
                    left_block[1]= 0;
579
                    left_block[2]= 1;
580
                    left_block[3]= 1;
581
                    left_block[4]= 7;
582
                    left_block[5]= 10;
583
                    left_block[6]= 7;
584
                    left_block[7]= 10;
585
                }
586
            } else {
587
                left_xy[1] += s->mb_stride;
588
                //left_block[0]= 0;
589
                left_block[1]= 2;
590
                left_block[2]= 0;
591
                left_block[3]= 2;
592
                //left_block[4]= 7;
593
                left_block[5]= 10;
594
                left_block[6]= 7;
595
                left_block[7]= 10;
596
            }
597
        }
598
    }
599

    
600
    h->top_mb_xy = top_xy;
601
    h->left_mb_xy[0] = left_xy[0];
602
    h->left_mb_xy[1] = left_xy[1];
603
    if(for_deblock){
604
        topleft_type = 0;
605
        topright_type = 0;
606
        top_type     = h->slice_table[top_xy     ] < 255 ? s->current_picture.mb_type[top_xy]     : 0;
607
        left_type[0] = h->slice_table[left_xy[0] ] < 255 ? s->current_picture.mb_type[left_xy[0]] : 0;
608
        left_type[1] = h->slice_table[left_xy[1] ] < 255 ? s->current_picture.mb_type[left_xy[1]] : 0;
609

    
610
        if(FRAME_MBAFF && !IS_INTRA(mb_type)){
611
            int list;
612
            int v = *(uint16_t*)&h->non_zero_count[mb_xy][14];
613
            for(i=0; i<16; i++)
614
                h->non_zero_count_cache[scan8[i]] = (v>>i)&1;
615
            for(list=0; list<1+(h->slice_type==B_TYPE); list++){
616
                if(USES_LIST(mb_type,list)){
617
                    uint32_t *src = (uint32_t*)s->current_picture.motion_val[list][h->mb2b_xy[mb_xy]];
618
                    uint32_t *dst = (uint32_t*)h->mv_cache[list][scan8[0]];
619
                    int8_t *ref = &s->current_picture.ref_index[list][h->mb2b8_xy[mb_xy]];
620
                    for(i=0; i<4; i++, dst+=8, src+=h->b_stride){
621
                        dst[0] = src[0];
622
                        dst[1] = src[1];
623
                        dst[2] = src[2];
624
                        dst[3] = src[3];
625
                    }
626
                    *(uint32_t*)&h->ref_cache[list][scan8[ 0]] =
627
                    *(uint32_t*)&h->ref_cache[list][scan8[ 2]] = pack16to32(ref[0],ref[1])*0x0101;
628
                    ref += h->b8_stride;
629
                    *(uint32_t*)&h->ref_cache[list][scan8[ 8]] =
630
                    *(uint32_t*)&h->ref_cache[list][scan8[10]] = pack16to32(ref[0],ref[1])*0x0101;
631
                }else{
632
                    fill_rectangle(&h-> mv_cache[list][scan8[ 0]], 4, 4, 8, 0, 4);
633
                    fill_rectangle(&h->ref_cache[list][scan8[ 0]], 4, 4, 8, (uint8_t)LIST_NOT_USED, 1);
634
                }
635
            }
636
        }
637
    }else{
638
        topleft_type = h->slice_table[topleft_xy ] == h->slice_num ? s->current_picture.mb_type[topleft_xy] : 0;
639
        top_type     = h->slice_table[top_xy     ] == h->slice_num ? s->current_picture.mb_type[top_xy]     : 0;
640
        topright_type= h->slice_table[topright_xy] == h->slice_num ? s->current_picture.mb_type[topright_xy]: 0;
641
        left_type[0] = h->slice_table[left_xy[0] ] == h->slice_num ? s->current_picture.mb_type[left_xy[0]] : 0;
642
        left_type[1] = h->slice_table[left_xy[1] ] == h->slice_num ? s->current_picture.mb_type[left_xy[1]] : 0;
643
    }
644

    
645
    if(IS_INTRA(mb_type)){
646
        h->topleft_samples_available=
647
        h->top_samples_available=
648
        h->left_samples_available= 0xFFFF;
649
        h->topright_samples_available= 0xEEEA;
650

    
651
        if(!IS_INTRA(top_type) && (top_type==0 || h->pps.constrained_intra_pred)){
652
            h->topleft_samples_available= 0xB3FF;
653
            h->top_samples_available= 0x33FF;
654
            h->topright_samples_available= 0x26EA;
655
        }
656
        for(i=0; i<2; i++){
657
            if(!IS_INTRA(left_type[i]) && (left_type[i]==0 || h->pps.constrained_intra_pred)){
658
                h->topleft_samples_available&= 0xDF5F;
659
                h->left_samples_available&= 0x5F5F;
660
            }
661
        }
662

    
663
        if(!IS_INTRA(topleft_type) && (topleft_type==0 || h->pps.constrained_intra_pred))
664
            h->topleft_samples_available&= 0x7FFF;
665

    
666
        if(!IS_INTRA(topright_type) && (topright_type==0 || h->pps.constrained_intra_pred))
667
            h->topright_samples_available&= 0xFBFF;
668

    
669
        if(IS_INTRA4x4(mb_type)){
670
            if(IS_INTRA4x4(top_type)){
671
                h->intra4x4_pred_mode_cache[4+8*0]= h->intra4x4_pred_mode[top_xy][4];
672
                h->intra4x4_pred_mode_cache[5+8*0]= h->intra4x4_pred_mode[top_xy][5];
673
                h->intra4x4_pred_mode_cache[6+8*0]= h->intra4x4_pred_mode[top_xy][6];
674
                h->intra4x4_pred_mode_cache[7+8*0]= h->intra4x4_pred_mode[top_xy][3];
675
            }else{
676
                int pred;
677
                if(!top_type || (IS_INTER(top_type) && h->pps.constrained_intra_pred))
678
                    pred= -1;
679
                else{
680
                    pred= 2;
681
                }
682
                h->intra4x4_pred_mode_cache[4+8*0]=
683
                h->intra4x4_pred_mode_cache[5+8*0]=
684
                h->intra4x4_pred_mode_cache[6+8*0]=
685
                h->intra4x4_pred_mode_cache[7+8*0]= pred;
686
            }
687
            for(i=0; i<2; i++){
688
                if(IS_INTRA4x4(left_type[i])){
689
                    h->intra4x4_pred_mode_cache[3+8*1 + 2*8*i]= h->intra4x4_pred_mode[left_xy[i]][left_block[0+2*i]];
690
                    h->intra4x4_pred_mode_cache[3+8*2 + 2*8*i]= h->intra4x4_pred_mode[left_xy[i]][left_block[1+2*i]];
691
                }else{
692
                    int pred;
693
                    if(!left_type[i] || (IS_INTER(left_type[i]) && h->pps.constrained_intra_pred))
694
                        pred= -1;
695
                    else{
696
                        pred= 2;
697
                    }
698
                    h->intra4x4_pred_mode_cache[3+8*1 + 2*8*i]=
699
                    h->intra4x4_pred_mode_cache[3+8*2 + 2*8*i]= pred;
700
                }
701
            }
702
        }
703
    }
704

    
705

    
706
/*
707
0 . T T. T T T T
708
1 L . .L . . . .
709
2 L . .L . . . .
710
3 . T TL . . . .
711
4 L . .L . . . .
712
5 L . .. . . . .
713
*/
714
//FIXME constraint_intra_pred & partitioning & nnz (lets hope this is just a typo in the spec)
715
    if(top_type){
716
        h->non_zero_count_cache[4+8*0]= h->non_zero_count[top_xy][4];
717
        h->non_zero_count_cache[5+8*0]= h->non_zero_count[top_xy][5];
718
        h->non_zero_count_cache[6+8*0]= h->non_zero_count[top_xy][6];
719
        h->non_zero_count_cache[7+8*0]= h->non_zero_count[top_xy][3];
720

    
721
        h->non_zero_count_cache[1+8*0]= h->non_zero_count[top_xy][9];
722
        h->non_zero_count_cache[2+8*0]= h->non_zero_count[top_xy][8];
723

    
724
        h->non_zero_count_cache[1+8*3]= h->non_zero_count[top_xy][12];
725
        h->non_zero_count_cache[2+8*3]= h->non_zero_count[top_xy][11];
726

    
727
    }else{
728
        h->non_zero_count_cache[4+8*0]=
729
        h->non_zero_count_cache[5+8*0]=
730
        h->non_zero_count_cache[6+8*0]=
731
        h->non_zero_count_cache[7+8*0]=
732

    
733
        h->non_zero_count_cache[1+8*0]=
734
        h->non_zero_count_cache[2+8*0]=
735

    
736
        h->non_zero_count_cache[1+8*3]=
737
        h->non_zero_count_cache[2+8*3]= h->pps.cabac && !IS_INTRA(mb_type) ? 0 : 64;
738

    
739
    }
740

    
741
    for (i=0; i<2; i++) {
742
        if(left_type[i]){
743
            h->non_zero_count_cache[3+8*1 + 2*8*i]= h->non_zero_count[left_xy[i]][left_block[0+2*i]];
744
            h->non_zero_count_cache[3+8*2 + 2*8*i]= h->non_zero_count[left_xy[i]][left_block[1+2*i]];
745
            h->non_zero_count_cache[0+8*1 +   8*i]= h->non_zero_count[left_xy[i]][left_block[4+2*i]];
746
            h->non_zero_count_cache[0+8*4 +   8*i]= h->non_zero_count[left_xy[i]][left_block[5+2*i]];
747
        }else{
748
            h->non_zero_count_cache[3+8*1 + 2*8*i]=
749
            h->non_zero_count_cache[3+8*2 + 2*8*i]=
750
            h->non_zero_count_cache[0+8*1 +   8*i]=
751
            h->non_zero_count_cache[0+8*4 +   8*i]= h->pps.cabac && !IS_INTRA(mb_type) ? 0 : 64;
752
        }
753
    }
754

    
755
    if( h->pps.cabac ) {
756
        // top_cbp
757
        if(top_type) {
758
            h->top_cbp = h->cbp_table[top_xy];
759
        } else if(IS_INTRA(mb_type)) {
760
            h->top_cbp = 0x1C0;
761
        } else {
762
            h->top_cbp = 0;
763
        }
764
        // left_cbp
765
        if (left_type[0]) {
766
            h->left_cbp = h->cbp_table[left_xy[0]] & 0x1f0;
767
        } else if(IS_INTRA(mb_type)) {
768
            h->left_cbp = 0x1C0;
769
        } else {
770
            h->left_cbp = 0;
771
        }
772
        if (left_type[0]) {
773
            h->left_cbp |= ((h->cbp_table[left_xy[0]]>>((left_block[0]&(~1))+1))&0x1) << 1;
774
        }
775
        if (left_type[1]) {
776
            h->left_cbp |= ((h->cbp_table[left_xy[1]]>>((left_block[2]&(~1))+1))&0x1) << 3;
777
        }
778
    }
779

    
780
#if 1
781
    if(IS_INTER(mb_type) || IS_DIRECT(mb_type)){
782
        int list;
783
        for(list=0; list<1+(h->slice_type==B_TYPE); list++){
784
            if(!USES_LIST(mb_type, list) && !IS_DIRECT(mb_type) && !h->deblocking_filter){
785
                /*if(!h->mv_cache_clean[list]){
786
                    memset(h->mv_cache [list],  0, 8*5*2*sizeof(int16_t)); //FIXME clean only input? clean at all?
787
                    memset(h->ref_cache[list], PART_NOT_AVAILABLE, 8*5*sizeof(int8_t));
788
                    h->mv_cache_clean[list]= 1;
789
                }*/
790
                continue;
791
            }
792
            h->mv_cache_clean[list]= 0;
793

    
794
            if(USES_LIST(top_type, list)){
795
                const int b_xy= h->mb2b_xy[top_xy] + 3*h->b_stride;
796
                const int b8_xy= h->mb2b8_xy[top_xy] + h->b8_stride;
797
                *(uint32_t*)h->mv_cache[list][scan8[0] + 0 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + 0];
798
                *(uint32_t*)h->mv_cache[list][scan8[0] + 1 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + 1];
799
                *(uint32_t*)h->mv_cache[list][scan8[0] + 2 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + 2];
800
                *(uint32_t*)h->mv_cache[list][scan8[0] + 3 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + 3];
801
                h->ref_cache[list][scan8[0] + 0 - 1*8]=
802
                h->ref_cache[list][scan8[0] + 1 - 1*8]= s->current_picture.ref_index[list][b8_xy + 0];
803
                h->ref_cache[list][scan8[0] + 2 - 1*8]=
804
                h->ref_cache[list][scan8[0] + 3 - 1*8]= s->current_picture.ref_index[list][b8_xy + 1];
805
            }else{
806
                *(uint32_t*)h->mv_cache [list][scan8[0] + 0 - 1*8]=
807
                *(uint32_t*)h->mv_cache [list][scan8[0] + 1 - 1*8]=
808
                *(uint32_t*)h->mv_cache [list][scan8[0] + 2 - 1*8]=
809
                *(uint32_t*)h->mv_cache [list][scan8[0] + 3 - 1*8]= 0;
810
                *(uint32_t*)&h->ref_cache[list][scan8[0] + 0 - 1*8]= ((top_type ? LIST_NOT_USED : PART_NOT_AVAILABLE)&0xFF)*0x01010101;
811
            }
812

    
813
            //FIXME unify cleanup or sth
814
            if(USES_LIST(left_type[0], list)){
815
                const int b_xy= h->mb2b_xy[left_xy[0]] + 3;
816
                const int b8_xy= h->mb2b8_xy[left_xy[0]] + 1;
817
                *(uint32_t*)h->mv_cache[list][scan8[0] - 1 + 0*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + h->b_stride*left_block[0]];
818
                *(uint32_t*)h->mv_cache[list][scan8[0] - 1 + 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + h->b_stride*left_block[1]];
819
                h->ref_cache[list][scan8[0] - 1 + 0*8]= s->current_picture.ref_index[list][b8_xy + h->b8_stride*(left_block[0]>>1)];
820
                h->ref_cache[list][scan8[0] - 1 + 1*8]= s->current_picture.ref_index[list][b8_xy + h->b8_stride*(left_block[1]>>1)];
821
            }else{
822
                *(uint32_t*)h->mv_cache [list][scan8[0] - 1 + 0*8]=
823
                *(uint32_t*)h->mv_cache [list][scan8[0] - 1 + 1*8]= 0;
824
                h->ref_cache[list][scan8[0] - 1 + 0*8]=
825
                h->ref_cache[list][scan8[0] - 1 + 1*8]= left_type[0] ? LIST_NOT_USED : PART_NOT_AVAILABLE;
826
            }
827

    
828
            if(USES_LIST(left_type[1], list)){
829
                const int b_xy= h->mb2b_xy[left_xy[1]] + 3;
830
                const int b8_xy= h->mb2b8_xy[left_xy[1]] + 1;
831
                *(uint32_t*)h->mv_cache[list][scan8[0] - 1 + 2*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + h->b_stride*left_block[2]];
832
                *(uint32_t*)h->mv_cache[list][scan8[0] - 1 + 3*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + h->b_stride*left_block[3]];
833
                h->ref_cache[list][scan8[0] - 1 + 2*8]= s->current_picture.ref_index[list][b8_xy + h->b8_stride*(left_block[2]>>1)];
834
                h->ref_cache[list][scan8[0] - 1 + 3*8]= s->current_picture.ref_index[list][b8_xy + h->b8_stride*(left_block[3]>>1)];
835
            }else{
836
                *(uint32_t*)h->mv_cache [list][scan8[0] - 1 + 2*8]=
837
                *(uint32_t*)h->mv_cache [list][scan8[0] - 1 + 3*8]= 0;
838
                h->ref_cache[list][scan8[0] - 1 + 2*8]=
839
                h->ref_cache[list][scan8[0] - 1 + 3*8]= left_type[0] ? LIST_NOT_USED : PART_NOT_AVAILABLE;
840
                assert((!left_type[0]) == (!left_type[1]));
841
            }
842

    
843
            if((for_deblock || (IS_DIRECT(mb_type) && !h->direct_spatial_mv_pred)) && !FRAME_MBAFF)
844
                continue;
845

    
846
            if(USES_LIST(topleft_type, list)){
847
                const int b_xy = h->mb2b_xy[topleft_xy] + 3 + 3*h->b_stride;
848
                const int b8_xy= h->mb2b8_xy[topleft_xy] + 1 + h->b8_stride;
849
                *(uint32_t*)h->mv_cache[list][scan8[0] - 1 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy];
850
                h->ref_cache[list][scan8[0] - 1 - 1*8]= s->current_picture.ref_index[list][b8_xy];
851
            }else{
852
                *(uint32_t*)h->mv_cache[list][scan8[0] - 1 - 1*8]= 0;
853
                h->ref_cache[list][scan8[0] - 1 - 1*8]= topleft_type ? LIST_NOT_USED : PART_NOT_AVAILABLE;
854
            }
855

    
856
            if(USES_LIST(topright_type, list)){
857
                const int b_xy= h->mb2b_xy[topright_xy] + 3*h->b_stride;
858
                const int b8_xy= h->mb2b8_xy[topright_xy] + h->b8_stride;
859
                *(uint32_t*)h->mv_cache[list][scan8[0] + 4 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy];
860
                h->ref_cache[list][scan8[0] + 4 - 1*8]= s->current_picture.ref_index[list][b8_xy];
861
            }else{
862
                *(uint32_t*)h->mv_cache [list][scan8[0] + 4 - 1*8]= 0;
863
                h->ref_cache[list][scan8[0] + 4 - 1*8]= topright_type ? LIST_NOT_USED : PART_NOT_AVAILABLE;
864
            }
865

    
866
            if((IS_SKIP(mb_type) || IS_DIRECT(mb_type)) && !FRAME_MBAFF)
867
                continue;
868

    
869
            h->ref_cache[list][scan8[5 ]+1] =
870
            h->ref_cache[list][scan8[7 ]+1] =
871
            h->ref_cache[list][scan8[13]+1] =  //FIXME remove past 3 (init somewhere else)
872
            h->ref_cache[list][scan8[4 ]] =
873
            h->ref_cache[list][scan8[12]] = PART_NOT_AVAILABLE;
874
            *(uint32_t*)h->mv_cache [list][scan8[5 ]+1]=
875
            *(uint32_t*)h->mv_cache [list][scan8[7 ]+1]=
876
            *(uint32_t*)h->mv_cache [list][scan8[13]+1]= //FIXME remove past 3 (init somewhere else)
877
            *(uint32_t*)h->mv_cache [list][scan8[4 ]]=
878
            *(uint32_t*)h->mv_cache [list][scan8[12]]= 0;
879

    
880
            if( h->pps.cabac ) {
881
                /* XXX beurk, Load mvd */
882
                if(USES_LIST(top_type, list)){
883
                    const int b_xy= h->mb2b_xy[top_xy] + 3*h->b_stride;
884
                    *(uint32_t*)h->mvd_cache[list][scan8[0] + 0 - 1*8]= *(uint32_t*)h->mvd_table[list][b_xy + 0];
885
                    *(uint32_t*)h->mvd_cache[list][scan8[0] + 1 - 1*8]= *(uint32_t*)h->mvd_table[list][b_xy + 1];
886
                    *(uint32_t*)h->mvd_cache[list][scan8[0] + 2 - 1*8]= *(uint32_t*)h->mvd_table[list][b_xy + 2];
887
                    *(uint32_t*)h->mvd_cache[list][scan8[0] + 3 - 1*8]= *(uint32_t*)h->mvd_table[list][b_xy + 3];
888
                }else{
889
                    *(uint32_t*)h->mvd_cache [list][scan8[0] + 0 - 1*8]=
890
                    *(uint32_t*)h->mvd_cache [list][scan8[0] + 1 - 1*8]=
891
                    *(uint32_t*)h->mvd_cache [list][scan8[0] + 2 - 1*8]=
892
                    *(uint32_t*)h->mvd_cache [list][scan8[0] + 3 - 1*8]= 0;
893
                }
894
                if(USES_LIST(left_type[0], list)){
895
                    const int b_xy= h->mb2b_xy[left_xy[0]] + 3;
896
                    *(uint32_t*)h->mvd_cache[list][scan8[0] - 1 + 0*8]= *(uint32_t*)h->mvd_table[list][b_xy + h->b_stride*left_block[0]];
897
                    *(uint32_t*)h->mvd_cache[list][scan8[0] - 1 + 1*8]= *(uint32_t*)h->mvd_table[list][b_xy + h->b_stride*left_block[1]];
898
                }else{
899
                    *(uint32_t*)h->mvd_cache [list][scan8[0] - 1 + 0*8]=
900
                    *(uint32_t*)h->mvd_cache [list][scan8[0] - 1 + 1*8]= 0;
901
                }
902
                if(USES_LIST(left_type[1], list)){
903
                    const int b_xy= h->mb2b_xy[left_xy[1]] + 3;
904
                    *(uint32_t*)h->mvd_cache[list][scan8[0] - 1 + 2*8]= *(uint32_t*)h->mvd_table[list][b_xy + h->b_stride*left_block[2]];
905
                    *(uint32_t*)h->mvd_cache[list][scan8[0] - 1 + 3*8]= *(uint32_t*)h->mvd_table[list][b_xy + h->b_stride*left_block[3]];
906
                }else{
907
                    *(uint32_t*)h->mvd_cache [list][scan8[0] - 1 + 2*8]=
908
                    *(uint32_t*)h->mvd_cache [list][scan8[0] - 1 + 3*8]= 0;
909
                }
910
                *(uint32_t*)h->mvd_cache [list][scan8[5 ]+1]=
911
                *(uint32_t*)h->mvd_cache [list][scan8[7 ]+1]=
912
                *(uint32_t*)h->mvd_cache [list][scan8[13]+1]= //FIXME remove past 3 (init somewhere else)
913
                *(uint32_t*)h->mvd_cache [list][scan8[4 ]]=
914
                *(uint32_t*)h->mvd_cache [list][scan8[12]]= 0;
915

    
916
                if(h->slice_type == B_TYPE){
917
                    fill_rectangle(&h->direct_cache[scan8[0]], 4, 4, 8, 0, 1);
918

    
919
                    if(IS_DIRECT(top_type)){
920
                        *(uint32_t*)&h->direct_cache[scan8[0] - 1*8]= 0x01010101;
921
                    }else if(IS_8X8(top_type)){
922
                        int b8_xy = h->mb2b8_xy[top_xy] + h->b8_stride;
923
                        h->direct_cache[scan8[0] + 0 - 1*8]= h->direct_table[b8_xy];
924
                        h->direct_cache[scan8[0] + 2 - 1*8]= h->direct_table[b8_xy + 1];
925
                    }else{
926
                        *(uint32_t*)&h->direct_cache[scan8[0] - 1*8]= 0;
927
                    }
928

    
929
                    if(IS_DIRECT(left_type[0]))
930
                        h->direct_cache[scan8[0] - 1 + 0*8]= 1;
931
                    else if(IS_8X8(left_type[0]))
932
                        h->direct_cache[scan8[0] - 1 + 0*8]= h->direct_table[h->mb2b8_xy[left_xy[0]] + 1 + h->b8_stride*(left_block[0]>>1)];
933
                    else
934
                        h->direct_cache[scan8[0] - 1 + 0*8]= 0;
935

    
936
                    if(IS_DIRECT(left_type[1]))
937
                        h->direct_cache[scan8[0] - 1 + 2*8]= 1;
938
                    else if(IS_8X8(left_type[1]))
939
                        h->direct_cache[scan8[0] - 1 + 2*8]= h->direct_table[h->mb2b8_xy[left_xy[1]] + 1 + h->b8_stride*(left_block[2]>>1)];
940
                    else
941
                        h->direct_cache[scan8[0] - 1 + 2*8]= 0;
942
                }
943
            }
944

    
945
            if(FRAME_MBAFF){
946
#define MAP_MVS\
947
                    MAP_F2F(scan8[0] - 1 - 1*8, topleft_type)\
948
                    MAP_F2F(scan8[0] + 0 - 1*8, top_type)\
949
                    MAP_F2F(scan8[0] + 1 - 1*8, top_type)\
950
                    MAP_F2F(scan8[0] + 2 - 1*8, top_type)\
951
                    MAP_F2F(scan8[0] + 3 - 1*8, top_type)\
952
                    MAP_F2F(scan8[0] + 4 - 1*8, topright_type)\
953
                    MAP_F2F(scan8[0] - 1 + 0*8, left_type[0])\
954
                    MAP_F2F(scan8[0] - 1 + 1*8, left_type[0])\
955
                    MAP_F2F(scan8[0] - 1 + 2*8, left_type[1])\
956
                    MAP_F2F(scan8[0] - 1 + 3*8, left_type[1])
957
                if(MB_FIELD){
958
#define MAP_F2F(idx, mb_type)\
959
                    if(!IS_INTERLACED(mb_type) && h->ref_cache[list][idx] >= 0){\
960
                        h->ref_cache[list][idx] <<= 1;\
961
                        h->mv_cache[list][idx][1] /= 2;\
962
                        h->mvd_cache[list][idx][1] /= 2;\
963
                    }
964
                    MAP_MVS
965
#undef MAP_F2F
966
                }else{
967
#define MAP_F2F(idx, mb_type)\
968
                    if(IS_INTERLACED(mb_type) && h->ref_cache[list][idx] >= 0){\
969
                        h->ref_cache[list][idx] >>= 1;\
970
                        h->mv_cache[list][idx][1] <<= 1;\
971
                        h->mvd_cache[list][idx][1] <<= 1;\
972
                    }
973
                    MAP_MVS
974
#undef MAP_F2F
975
                }
976
            }
977
        }
978
    }
979
#endif
980

    
981
    h->neighbor_transform_size= !!IS_8x8DCT(top_type) + !!IS_8x8DCT(left_type[0]);
982
}
983

    
984
static inline void write_back_intra_pred_mode(H264Context *h){
    MpegEncContext * const s = &h->s;
    const int mb_xy= s->mb_x + s->mb_y*s->mb_stride;
    /* cache positions of the macroblock's right column (top to bottom)
     * followed by its bottom row, in the 8-wide prediction-mode cache */
    static const uint8_t cache_idx[7]= {
        7+8*1, 7+8*2, 7+8*3, 7+8*4, 4+8*4, 5+8*4, 6+8*4
    };
    int i;

    /* Copy the intra4x4 prediction modes along the right/bottom edge back
     * into the frame-wide table so the macroblocks to the right and below
     * can use them as predictors. */
    for(i=0; i<7; i++)
        h->intra4x4_pred_mode[mb_xy][i]= h->intra4x4_pred_mode_cache[cache_idx[i]];
}
996

    
997
/**
998
 * checks if the top & left blocks are available if needed & changes the dc mode so it only uses the available blocks.
999
 */
1000
static inline int check_intra4x4_pred_mode(H264Context *h){
1001
    MpegEncContext * const s = &h->s;
1002
    static const int8_t top [12]= {-1, 0,LEFT_DC_PRED,-1,-1,-1,-1,-1, 0};
1003
    static const int8_t left[12]= { 0,-1, TOP_DC_PRED, 0,-1,-1,-1, 0,-1,DC_128_PRED};
1004
    int i;
1005

    
1006
    if(!(h->top_samples_available&0x8000)){
1007
        for(i=0; i<4; i++){
1008
            int status= top[ h->intra4x4_pred_mode_cache[scan8[0] + i] ];
1009
            if(status<0){
1010
                av_log(h->s.avctx, AV_LOG_ERROR, "top block unavailable for requested intra4x4 mode %d at %d %d\n", status, s->mb_x, s->mb_y);
1011
                return -1;
1012
            } else if(status){
1013
                h->intra4x4_pred_mode_cache[scan8[0] + i]= status;
1014
            }
1015
        }
1016
    }
1017

    
1018
    if(!(h->left_samples_available&0x8000)){
1019
        for(i=0; i<4; i++){
1020
            int status= left[ h->intra4x4_pred_mode_cache[scan8[0] + 8*i] ];
1021
            if(status<0){
1022
                av_log(h->s.avctx, AV_LOG_ERROR, "left block unavailable for requested intra4x4 mode %d at %d %d\n", status, s->mb_x, s->mb_y);
1023
                return -1;
1024
            } else if(status){
1025
                h->intra4x4_pred_mode_cache[scan8[0] + 8*i]= status;
1026
            }
1027
        }
1028
    }
1029

    
1030
    return 0;
1031
} //FIXME cleanup like next
1032

    
1033
/**
1034
 * checks if the top & left blocks are available if needed & changes the dc mode so it only uses the available blocks.
1035
 */
1036
static inline int check_intra_pred_mode(H264Context *h, int mode){
1037
    MpegEncContext * const s = &h->s;
1038
    static const int8_t top [7]= {LEFT_DC_PRED8x8, 1,-1,-1};
1039
    static const int8_t left[7]= { TOP_DC_PRED8x8,-1, 2,-1,DC_128_PRED8x8};
1040

    
1041
    if(mode < 0 || mode > 6) {
1042
        av_log(h->s.avctx, AV_LOG_ERROR, "out of range intra chroma pred mode at %d %d\n", s->mb_x, s->mb_y);
1043
        return -1;
1044
    }
1045

    
1046
    if(!(h->top_samples_available&0x8000)){
1047
        mode= top[ mode ];
1048
        if(mode<0){
1049
            av_log(h->s.avctx, AV_LOG_ERROR, "top block unavailable for requested intra mode at %d %d\n", s->mb_x, s->mb_y);
1050
            return -1;
1051
        }
1052
    }
1053

    
1054
    if(!(h->left_samples_available&0x8000)){
1055
        mode= left[ mode ];
1056
        if(mode<0){
1057
            av_log(h->s.avctx, AV_LOG_ERROR, "left block unavailable for requested intra mode at %d %d\n", s->mb_x, s->mb_y);
1058
            return -1;
1059
        }
1060
    }
1061

    
1062
    return mode;
1063
}
1064

    
1065
/**
1066
 * gets the predicted intra4x4 prediction mode.
1067
 */
1068
static inline int pred_intra_mode(H264Context *h, int n){
1069
    const int index8= scan8[n];
1070
    const int left= h->intra4x4_pred_mode_cache[index8 - 1];
1071
    const int top = h->intra4x4_pred_mode_cache[index8 - 8];
1072
    const int min= FFMIN(left, top);
1073

    
1074
    tprintf("mode:%d %d min:%d\n", left ,top, min);
1075

    
1076
    if(min<0) return DC_PRED;
1077
    else      return min;
1078
}
1079

    
1080
static inline void write_back_non_zero_count(H264Context *h){
    MpegEncContext * const s = &h->s;
    const int mb_xy= s->mb_x + s->mb_y*s->mb_stride;
    /* maps non_zero_count[mb_xy][i] to its slot in the 8-wide cache:
     * entries 0-6 are the luma right column + bottom row,
     * 7-9 and 10-12 the two chroma planes' edges */
    static const uint8_t cache_pos[13]= {
        7+8*1, 7+8*2, 7+8*3, 7+8*4, 4+8*4, 5+8*4, 6+8*4,
        2+8*1, 2+8*2, 1+8*2,
        2+8*4, 2+8*5, 1+8*5
    };
    int i;

    /* write the edge non-zero-coefficient counts back to the frame-wide
     * table, where the right/below neighbours will read them */
    for(i=0; i<13; i++)
        h->non_zero_count[mb_xy][i]= h->non_zero_count_cache[cache_pos[i]];

    if(FRAME_MBAFF){
        // store all luma nnzs, for deblocking
        int v = 0;
        for(i=0; i<16; i++)
            v += (!!h->non_zero_count_cache[scan8[i]]) << i;
        *(uint16_t*)&h->non_zero_count[mb_xy][14] = v;
    }
}
1108

    
1109
/**
1110
 * gets the predicted number of non zero coefficients.
1111
 * @param n block index
1112
 */
1113
static inline int pred_non_zero_count(H264Context *h, int n){
1114
    const int index8= scan8[n];
1115
    const int left= h->non_zero_count_cache[index8 - 1];
1116
    const int top = h->non_zero_count_cache[index8 - 8];
1117
    int i= left + top;
1118

    
1119
    if(i<64) i= (i+1)>>1;
1120

    
1121
    tprintf("pred_nnz L%X T%X n%d s%d P%X\n", left, top, n, scan8[n], i&31);
1122

    
1123
    return i&31;
1124
}
1125

    
1126
/**
 * Fetches the reference index and MV of the top-right (diagonal) neighbour
 * of block i, falling back to the top-left neighbour when top-right is
 * unavailable.
 * @param C receives a pointer to the chosen neighbour's motion vector
 * @param i cache index of the current block (scan8-based)
 * @param list reference list (0 or 1)
 * @param part_width partition width in 4x4 units
 * @return the reference index of the chosen neighbour
 */
static inline int fetch_diagonal_mv(H264Context *h, const int16_t **C, int i, int list, int part_width){
    const int topright_ref= h->ref_cache[list][ i - 8 + part_width ];

    /* there is no consistent mapping of mvs to neighboring locations that will
     * make mbaff happy, so we can't move all this logic to fill_caches */
    if(FRAME_MBAFF){
        MpegEncContext *s = &h->s;
        const uint32_t *mb_types = s->current_picture_ptr->mb_type;
        const int16_t *mv;
        /* spare cache slot [scan8[0]-2] holds the field/frame-adjusted MV */
        *(uint32_t*)h->mv_cache[list][scan8[0]-2] = 0;
        *C = h->mv_cache[list][scan8[0]-2];

        if(!MB_FIELD
           && (s->mb_y&1) && i < scan8[0]+8 && topright_ref != PART_NOT_AVAILABLE){
            int topright_xy = s->mb_x + (s->mb_y-1)*s->mb_stride + (i == scan8[0]+3);
            if(IS_INTERLACED(mb_types[topright_xy])){
/* Loads the MV at 4x4 position (X4,Y4) into the spare slot, scaling its
 * vertical component and reference index between field and frame units
 * via MV_OP / REF_OP, then returns the adjusted reference. */
#define SET_DIAG_MV(MV_OP, REF_OP, X4, Y4)\
                const int x4 = X4, y4 = Y4;\
                const int mb_type = mb_types[(x4>>2)+(y4>>2)*s->mb_stride];\
                if(!USES_LIST(mb_type,list) && !IS_8X8(mb_type))\
                    return LIST_NOT_USED;\
                mv = s->current_picture_ptr->motion_val[list][x4 + y4*h->b_stride];\
                h->mv_cache[list][scan8[0]-2][0] = mv[0];\
                h->mv_cache[list][scan8[0]-2][1] = mv[1] MV_OP;\
                return s->current_picture_ptr->ref_index[list][(x4>>1) + (y4>>1)*h->b8_stride] REF_OP;

                SET_DIAG_MV(*2, >>1, s->mb_x*4+(i&7)-4+part_width, s->mb_y*4-1);
            }
        }
        if(topright_ref == PART_NOT_AVAILABLE
           && ((s->mb_y&1) || i >= scan8[0]+8) && (i&7)==4
           && h->ref_cache[list][scan8[0]-1] != PART_NOT_AVAILABLE){
            if(!MB_FIELD
               && IS_INTERLACED(mb_types[h->left_mb_xy[0]])){
                SET_DIAG_MV(*2, >>1, s->mb_x*4-1, (s->mb_y|1)*4+(s->mb_y&1)*2+(i>>4)-1);
            }
            if(MB_FIELD
               && !IS_INTERLACED(mb_types[h->left_mb_xy[0]])
               && i >= scan8[0]+8){
                // leftshift will turn LIST_NOT_USED into PART_NOT_AVAILABLE, but that's ok.
                SET_DIAG_MV(>>1, <<1, s->mb_x*4-1, (s->mb_y&~1)*4 - 1 + ((i-scan8[0])>>3)*2);
            }
        }
#undef SET_DIAG_MV
    }

    if(topright_ref != PART_NOT_AVAILABLE){
        *C= h->mv_cache[list][ i - 8 + part_width ];
        return topright_ref;
    }else{
        tprintf("topright MV not available\n");

        /* fall back to the top-left neighbour (cache offset -8-1) */
        *C= h->mv_cache[list][ i - 8 - 1 ];
        return h->ref_cache[list][ i - 8 - 1 ];
    }
}
1182

    
1183
/**
 * gets the predicted MV.
 * @param n the block index
 * @param part_width the width of the partition (4, 8,16) -> (1, 2, 4)
 * @param mx the x component of the predicted motion vector
 * @param my the y component of the predicted motion vector
 */
static inline void pred_motion(H264Context * const h, int n, int part_width, int list, int ref, int * const mx, int * const my){
    const int index8= scan8[n];
    const int top_ref=      h->ref_cache[list][ index8 - 8 ];
    const int left_ref=     h->ref_cache[list][ index8 - 1 ];
    const int16_t * const A= h->mv_cache[list][ index8 - 1 ];
    const int16_t * const B= h->mv_cache[list][ index8 - 8 ];
    const int16_t * C;
    int diagonal_ref, match_count;

    assert(part_width==1 || part_width==2 || part_width==4);

/* mv_cache
  B . . A T T T T
  U . . L . . , .
  U . . L . . . .
  U . . L . . , .
  . . . L . . . .
*/

    diagonal_ref= fetch_diagonal_mv(h, &C, index8, list, part_width);
    // count how many of the three neighbours use the same reference picture
    match_count= (diagonal_ref==ref) + (top_ref==ref) + (left_ref==ref);
    tprintf("pred_motion match_count=%d\n", match_count);
    if(match_count > 1){ //most common
        // two or more matches: component-wise median of the three neighbours
        *mx= mid_pred(A[0], B[0], C[0]);
        *my= mid_pred(A[1], B[1], C[1]);
    }else if(match_count==1){
        // exactly one neighbour shares the reference: copy its MV directly
        if(left_ref==ref){
            *mx= A[0];
            *my= A[1];
        }else if(top_ref==ref){
            *mx= B[0];
            *my= B[1];
        }else{
            *mx= C[0];
            *my= C[1];
        }
    }else{
        // no neighbour matches: use left if it is the only available one,
        // otherwise fall back to the median
        if(top_ref == PART_NOT_AVAILABLE && diagonal_ref == PART_NOT_AVAILABLE && left_ref != PART_NOT_AVAILABLE){
            *mx= A[0];
            *my= A[1];
        }else{
            *mx= mid_pred(A[0], B[0], C[0]);
            *my= mid_pred(A[1], B[1], C[1]);
        }
    }

    tprintf("pred_motion (%2d %2d %2d) (%2d %2d %2d) (%2d %2d %2d) -> (%2d %2d %2d) at %2d %2d %d list %d\n", top_ref, B[0], B[1],                    diagonal_ref, C[0], C[1], left_ref, A[0], A[1], ref, *mx, *my, h->s.mb_x, h->s.mb_y, n, list);
}
1238

    
1239
/**
1240
 * gets the directionally predicted 16x8 MV.
1241
 * @param n the block index
1242
 * @param mx the x component of the predicted motion vector
1243
 * @param my the y component of the predicted motion vector
1244
 */
1245
static inline void pred_16x8_motion(H264Context * const h, int n, int list, int ref, int * const mx, int * const my){
1246
    if(n==0){
1247
        const int top_ref=      h->ref_cache[list][ scan8[0] - 8 ];
1248
        const int16_t * const B= h->mv_cache[list][ scan8[0] - 8 ];
1249

    
1250
        tprintf("pred_16x8: (%2d %2d %2d) at %2d %2d %d list %d\n", top_ref, B[0], B[1], h->s.mb_x, h->s.mb_y, n, list);
1251

    
1252
        if(top_ref == ref){
1253
            *mx= B[0];
1254
            *my= B[1];
1255
            return;
1256
        }
1257
    }else{
1258
        const int left_ref=     h->ref_cache[list][ scan8[8] - 1 ];
1259
        const int16_t * const A= h->mv_cache[list][ scan8[8] - 1 ];
1260

    
1261
        tprintf("pred_16x8: (%2d %2d %2d) at %2d %2d %d list %d\n", left_ref, A[0], A[1], h->s.mb_x, h->s.mb_y, n, list);
1262

    
1263
        if(left_ref == ref){
1264
            *mx= A[0];
1265
            *my= A[1];
1266
            return;
1267
        }
1268
    }
1269

    
1270
    //RARE
1271
    pred_motion(h, n, 4, list, ref, mx, my);
1272
}
1273

    
1274
/**
1275
 * gets the directionally predicted 8x16 MV.
1276
 * @param n the block index
1277
 * @param mx the x component of the predicted motion vector
1278
 * @param my the y component of the predicted motion vector
1279
 */
1280
static inline void pred_8x16_motion(H264Context * const h, int n, int list, int ref, int * const mx, int * const my){
1281
    if(n==0){
1282
        const int left_ref=      h->ref_cache[list][ scan8[0] - 1 ];
1283
        const int16_t * const A=  h->mv_cache[list][ scan8[0] - 1 ];
1284

    
1285
        tprintf("pred_8x16: (%2d %2d %2d) at %2d %2d %d list %d\n", left_ref, A[0], A[1], h->s.mb_x, h->s.mb_y, n, list);
1286

    
1287
        if(left_ref == ref){
1288
            *mx= A[0];
1289
            *my= A[1];
1290
            return;
1291
        }
1292
    }else{
1293
        const int16_t * C;
1294
        int diagonal_ref;
1295

    
1296
        diagonal_ref= fetch_diagonal_mv(h, &C, scan8[4], list, 2);
1297

    
1298
        tprintf("pred_8x16: (%2d %2d %2d) at %2d %2d %d list %d\n", diagonal_ref, C[0], C[1], h->s.mb_x, h->s.mb_y, n, list);
1299

    
1300
        if(diagonal_ref == ref){
1301
            *mx= C[0];
1302
            *my= C[1];
1303
            return;
1304
        }
1305
    }
1306

    
1307
    //RARE
1308
    pred_motion(h, n, 2, list, ref, mx, my);
1309
}
1310

    
1311
static inline void pred_pskip_motion(H264Context * const h, int * const mx, int * const my){
    const int top_ref = h->ref_cache[0][ scan8[0] - 8 ];
    const int left_ref= h->ref_cache[0][ scan8[0] - 1 ];
    const int top_zero = top_ref  == 0 && *(uint32_t*)h->mv_cache[0][ scan8[0] - 8 ] == 0;
    const int left_zero= left_ref == 0 && *(uint32_t*)h->mv_cache[0][ scan8[0] - 1 ] == 0;

    tprintf("pred_pskip: (%d) (%d) at %2d %2d\n", top_ref, left_ref, h->s.mb_x, h->s.mb_y);

    /* the P-skip MV is forced to (0,0) when a neighbour is missing or when
     * either neighbour is a zero MV referencing picture 0 */
    if(top_ref == PART_NOT_AVAILABLE || left_ref == PART_NOT_AVAILABLE
       || top_zero || left_zero){
        *mx = *my = 0;
        return;
    }

    /* otherwise use the regular 16x16 MV prediction for list 0, ref 0 */
    pred_motion(h, 0, 4, 0, 0, mx, my);
}
1329

    
1330
static inline void direct_dist_scale_factor(H264Context * const h){
    const int poc  = h->s.current_picture_ptr->poc;
    const int poc1 = h->ref_list[1][0].poc;
    int i;

    /* precompute the temporal-direct distance scale factor for every
     * list0 reference, from the POC distances tb and td */
    for(i=0; i<h->ref_count[0]; i++){
        const int poc0 = h->ref_list[0][i].poc;
        const int td = clip(poc1 - poc0, -128, 127);
        if(td == 0 /* FIXME || pic0 is a long-term ref */){
            h->dist_scale_factor[i] = 256;
        }else{
            const int tb = clip(poc - poc0, -128, 127);
            const int tx = (16384 + (FFABS(td) >> 1)) / td;
            h->dist_scale_factor[i] = clip((tb*tx + 32) >> 6, -1024, 1023);
        }
    }
    if(FRAME_MBAFF){
        /* both fields of a reference pair share the frame's factor */
        for(i=0; i<h->ref_count[0]; i++){
            h->dist_scale_factor_field[2*i  ] =
            h->dist_scale_factor_field[2*i+1] = h->dist_scale_factor[i];
        }
    }
}
1352
/* Initializes the per-picture reference bookkeeping used by B-frame direct
 * prediction: copies ref counts/POCs into the current Picture and, for
 * temporal direct mode, builds the colocated-ref -> list0 index map. */
static inline void direct_ref_list_init(H264Context * const h){
    MpegEncContext * const s = &h->s;
    Picture * const ref1 = &h->ref_list[1][0];
    Picture * const cur = s->current_picture_ptr;
    int list, i, j;
    // clear the ref counts that cannot apply to this picture type
    if(cur->pict_type == I_TYPE)
        cur->ref_count[0] = 0;
    if(cur->pict_type != B_TYPE)
        cur->ref_count[1] = 0;
    // NOTE(review): this loop overwrites the counts just zeroed above —
    // presumably h->ref_count[] is already 0 for the unused lists; confirm.
    for(list=0; list<2; list++){
        cur->ref_count[list] = h->ref_count[list];
        for(j=0; j<h->ref_count[list]; j++)
            cur->ref_poc[list][j] = h->ref_list[list][j].poc;
    }
    // the map below is only needed for temporal direct mode in B slices
    if(cur->pict_type != B_TYPE || h->direct_spatial_mv_pred)
        return;
    // for each reference of the colocated picture, find the list0 entry
    // with the same POC
    for(list=0; list<2; list++){
        for(i=0; i<ref1->ref_count[list]; i++){
            const int poc = ref1->ref_poc[list][i];
            h->map_col_to_list0[list][i] = 0; /* bogus; fills in for missing frames */
            for(j=0; j<h->ref_count[list]; j++)
                if(h->ref_list[list][j].poc == poc){
                    h->map_col_to_list0[list][i] = j;
                    break;
                }
        }
    }
    if(FRAME_MBAFF){
        // duplicate the map for the two fields of each reference pair
        for(list=0; list<2; list++){
            for(i=0; i<ref1->ref_count[list]; i++){
                j = h->map_col_to_list0[list][i];
                h->map_col_to_list0_field[list][2*i] = 2*j;
                h->map_col_to_list0_field[list][2*i+1] = 2*j+1;
            }
        }
    }
}
1389

    
1390
/**
 * Derives motion vectors and reference indices for a B-slice macroblock
 * coded in direct mode (spatial or temporal prediction, per
 * h->direct_spatial_mv_pred), filling h->mv_cache / h->ref_cache /
 * h->sub_mb_type and rewriting *mb_type with the derived partitioning.
 * @param h       decoder context (reads colocated data from ref_list[1][0])
 * @param mb_type in: current mb type flags; out: derived direct-mode type
 */
static inline void pred_direct_motion(H264Context * const h, int *mb_type){
    MpegEncContext * const s = &h->s;
    const int mb_xy =   s->mb_x +   s->mb_y*s->mb_stride;
    const int b8_xy = 2*s->mb_x + 2*s->mb_y*h->b8_stride;
    const int b4_xy = 4*s->mb_x + 4*s->mb_y*h->b_stride;
    const int mb_type_col = h->ref_list[1][0].mb_type[mb_xy];
    const int16_t (*l1mv0)[2] = (const int16_t (*)[2]) &h->ref_list[1][0].motion_val[0][b4_xy];
    const int16_t (*l1mv1)[2] = (const int16_t (*)[2]) &h->ref_list[1][0].motion_val[1][b4_xy];
    const int8_t *l1ref0 = &h->ref_list[1][0].ref_index[0][b8_xy];
    const int8_t *l1ref1 = &h->ref_list[1][0].ref_index[1][b8_xy];
    const int is_b8x8 = IS_8X8(*mb_type);
    int sub_mb_type;
    int i8, i4;

#define MB_TYPE_16x16_OR_INTRA (MB_TYPE_16x16|MB_TYPE_INTRA4x4|MB_TYPE_INTRA16x16|MB_TYPE_INTRA_PCM)
    if(IS_8X8(mb_type_col) && !h->sps.direct_8x8_inference_flag){
        /* FIXME save sub mb types from previous frames (or derive from MVs)
         * so we know exactly what block size to use */
        sub_mb_type = MB_TYPE_8x8|MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2; /* B_SUB_4x4 */
        *mb_type =    MB_TYPE_8x8|MB_TYPE_L0L1;
    }else if(!is_b8x8 && (mb_type_col & MB_TYPE_16x16_OR_INTRA)){
        sub_mb_type = MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2; /* B_SUB_8x8 */
        *mb_type =    MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2; /* B_16x16 */
    }else{
        sub_mb_type = MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2; /* B_SUB_8x8 */
        *mb_type =    MB_TYPE_8x8|MB_TYPE_L0L1;
    }
    if(!is_b8x8)
        *mb_type |= MB_TYPE_DIRECT2;
    if(MB_FIELD)
        *mb_type |= MB_TYPE_INTERLACED;

    tprintf("mb_type = %08x, sub_mb_type = %08x, is_b8x8 = %d, mb_type_col = %08x\n", *mb_type, sub_mb_type, is_b8x8, mb_type_col);

    if(h->direct_spatial_mv_pred){
        int ref[2];
        int mv[2][2];
        int list;

        /* FIXME interlacing + spatial direct uses wrong colocated block positions */

        /* ref = min(neighbors) */
        for(list=0; list<2; list++){
            int refa = h->ref_cache[list][scan8[0] - 1];
            int refb = h->ref_cache[list][scan8[0] - 8];
            int refc = h->ref_cache[list][scan8[0] - 8 + 4];
            if(refc == -2)
                refc = h->ref_cache[list][scan8[0] - 8 - 1];
            ref[list] = refa;
            if(ref[list] < 0 || (refb < ref[list] && refb >= 0))
                ref[list] = refb;
            if(ref[list] < 0 || (refc < ref[list] && refc >= 0))
                ref[list] = refc;
            if(ref[list] < 0)
                ref[list] = -1;
        }

        if(ref[0] < 0 && ref[1] < 0){
            /* no usable neighbor refs: predict zero MVs against ref 0 */
            ref[0] = ref[1] = 0;
            mv[0][0] = mv[0][1] =
            mv[1][0] = mv[1][1] = 0;
        }else{
            for(list=0; list<2; list++){
                if(ref[list] >= 0)
                    pred_motion(h, 0, 4, list, ref[list], &mv[list][0], &mv[list][1]);
                else
                    mv[list][0] = mv[list][1] = 0;
            }
        }

        if(ref[1] < 0){
            *mb_type &= ~MB_TYPE_P0L1;
            sub_mb_type &= ~MB_TYPE_P0L1;
        }else if(ref[0] < 0){
            *mb_type &= ~MB_TYPE_P0L0;
            sub_mb_type &= ~MB_TYPE_P0L0;
        }

        if(IS_16X16(*mb_type)){
            fill_rectangle(&h->ref_cache[0][scan8[0]], 4, 4, 8, (uint8_t)ref[0], 1);
            fill_rectangle(&h->ref_cache[1][scan8[0]], 4, 4, 8, (uint8_t)ref[1], 1);
            /* col_zero_flag for the whole MB: colocated block is near-static */
            if(!IS_INTRA(mb_type_col)
               && (   (l1ref0[0] == 0 && FFABS(l1mv0[0][0]) <= 1 && FFABS(l1mv0[0][1]) <= 1)
                   || (l1ref0[0]  < 0 && l1ref1[0] == 0 && FFABS(l1mv1[0][0]) <= 1 && FFABS(l1mv1[0][1]) <= 1
                       && (h->x264_build>33 || !h->x264_build)))){
                if(ref[0] > 0)
                    fill_rectangle(&h->mv_cache[0][scan8[0]], 4, 4, 8, pack16to32(mv[0][0],mv[0][1]), 4);
                else
                    fill_rectangle(&h->mv_cache[0][scan8[0]], 4, 4, 8, 0, 4);
                if(ref[1] > 0)
                    fill_rectangle(&h->mv_cache[1][scan8[0]], 4, 4, 8, pack16to32(mv[1][0],mv[1][1]), 4);
                else
                    fill_rectangle(&h->mv_cache[1][scan8[0]], 4, 4, 8, 0, 4);
            }else{
                fill_rectangle(&h->mv_cache[0][scan8[0]], 4, 4, 8, pack16to32(mv[0][0],mv[0][1]), 4);
                fill_rectangle(&h->mv_cache[1][scan8[0]], 4, 4, 8, pack16to32(mv[1][0],mv[1][1]), 4);
            }
        }else{
            for(i8=0; i8<4; i8++){
                const int x8 = i8&1;
                const int y8 = i8>>1;

                if(is_b8x8 && !IS_DIRECT(h->sub_mb_type[i8]))
                    continue;
                h->sub_mb_type[i8] = sub_mb_type;

                fill_rectangle(&h->mv_cache[0][scan8[i8*4]], 2, 2, 8, pack16to32(mv[0][0],mv[0][1]), 4);
                fill_rectangle(&h->mv_cache[1][scan8[i8*4]], 2, 2, 8, pack16to32(mv[1][0],mv[1][1]), 4);
                fill_rectangle(&h->ref_cache[0][scan8[i8*4]], 2, 2, 8, (uint8_t)ref[0], 1);
                fill_rectangle(&h->ref_cache[1][scan8[i8*4]], 2, 2, 8, (uint8_t)ref[1], 1);

                /* col_zero_flag */
                if(!IS_INTRA(mb_type_col) && (   l1ref0[x8 + y8*h->b8_stride] == 0
                                              || (l1ref0[x8 + y8*h->b8_stride] < 0 && l1ref1[x8 + y8*h->b8_stride] == 0
                                                  && (h->x264_build>33 || !h->x264_build)))){
                    const int16_t (*l1mv)[2]= l1ref0[x8 + y8*h->b8_stride] == 0 ? l1mv0 : l1mv1;
                    if(IS_SUB_8X8(sub_mb_type)){
                        const int16_t *mv_col = l1mv[x8*3 + y8*3*h->b_stride];
                        if(FFABS(mv_col[0]) <= 1 && FFABS(mv_col[1]) <= 1){
                            if(ref[0] == 0)
                                fill_rectangle(&h->mv_cache[0][scan8[i8*4]], 2, 2, 8, 0, 4);
                            if(ref[1] == 0)
                                fill_rectangle(&h->mv_cache[1][scan8[i8*4]], 2, 2, 8, 0, 4);
                        }
                    }else
                    for(i4=0; i4<4; i4++){
                        const int16_t *mv_col = l1mv[x8*2 + (i4&1) + (y8*2 + (i4>>1))*h->b_stride];
                        if(FFABS(mv_col[0]) <= 1 && FFABS(mv_col[1]) <= 1){
                            if(ref[0] == 0)
                                *(uint32_t*)h->mv_cache[0][scan8[i8*4+i4]] = 0;
                            if(ref[1] == 0)
                                *(uint32_t*)h->mv_cache[1][scan8[i8*4+i4]] = 0;
                        }
                    }
                }
            }
        }
    }else{ /* direct temporal mv pred */
        const int *map_col_to_list0[2] = {h->map_col_to_list0[0], h->map_col_to_list0[1]};
        const int *dist_scale_factor = h->dist_scale_factor;

        if(FRAME_MBAFF){
            if(IS_INTERLACED(*mb_type)){
                map_col_to_list0[0] = h->map_col_to_list0_field[0];
                map_col_to_list0[1] = h->map_col_to_list0_field[1];
                dist_scale_factor = h->dist_scale_factor_field;
            }
            if(IS_INTERLACED(*mb_type) != IS_INTERLACED(mb_type_col)){
                /* FIXME assumes direct_8x8_inference == 1 */
                const int pair_xy = s->mb_x + (s->mb_y&~1)*s->mb_stride;
                int mb_types_col[2];
                int y_shift;

                *mb_type = MB_TYPE_8x8|MB_TYPE_L0L1
                         | (is_b8x8 ? 0 : MB_TYPE_DIRECT2)
                         | (*mb_type & MB_TYPE_INTERLACED);
                sub_mb_type = MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2|MB_TYPE_16x16;

                if(IS_INTERLACED(*mb_type)){
                    /* frame to field scaling */
                    mb_types_col[0] = h->ref_list[1][0].mb_type[pair_xy];
                    mb_types_col[1] = h->ref_list[1][0].mb_type[pair_xy+s->mb_stride];
                    if(s->mb_y&1){
                        l1ref0 -= 2*h->b8_stride;
                        l1ref1 -= 2*h->b8_stride;
                        l1mv0 -= 4*h->b_stride;
                        l1mv1 -= 4*h->b_stride;
                    }
                    y_shift = 0;

                    if(   (mb_types_col[0] & MB_TYPE_16x16_OR_INTRA)
                       && (mb_types_col[1] & MB_TYPE_16x16_OR_INTRA)
                       && !is_b8x8)
                        *mb_type |= MB_TYPE_16x8;
                    else
                        *mb_type |= MB_TYPE_8x8;
                }else{
                    /* field to frame scaling */
                    /* col_mb_y = (mb_y&~1) + (topAbsDiffPOC < bottomAbsDiffPOC ? 0 : 1)
                     * but in MBAFF, top and bottom POC are equal */
                    int dy = (s->mb_y&1) ? 1 : 2;
                    mb_types_col[0] =
                    mb_types_col[1] = h->ref_list[1][0].mb_type[pair_xy+s->mb_stride];
                    l1ref0 += dy*h->b8_stride;
                    l1ref1 += dy*h->b8_stride;
                    l1mv0 += 2*dy*h->b_stride;
                    l1mv1 += 2*dy*h->b_stride;
                    y_shift = 2;

                    if((mb_types_col[0] & (MB_TYPE_16x16_OR_INTRA|MB_TYPE_16x8))
                       && !is_b8x8)
                        *mb_type |= MB_TYPE_16x16;
                    else
                        *mb_type |= MB_TYPE_8x8;
                }

                for(i8=0; i8<4; i8++){
                    const int x8 = i8&1;
                    const int y8 = i8>>1;
                    int ref0, scale;
                    const int16_t (*l1mv)[2]= l1mv0;

                    if(is_b8x8 && !IS_DIRECT(h->sub_mb_type[i8]))
                        continue;
                    h->sub_mb_type[i8] = sub_mb_type;

                    fill_rectangle(&h->ref_cache[1][scan8[i8*4]], 2, 2, 8, 0, 1);
                    if(IS_INTRA(mb_types_col[y8])){
                        fill_rectangle(&h->ref_cache[0][scan8[i8*4]], 2, 2, 8, 0, 1);
                        fill_rectangle(&h-> mv_cache[0][scan8[i8*4]], 2, 2, 8, 0, 4);
                        fill_rectangle(&h-> mv_cache[1][scan8[i8*4]], 2, 2, 8, 0, 4);
                        continue;
                    }

                    ref0 = l1ref0[x8 + (y8*2>>y_shift)*h->b8_stride];
                    if(ref0 >= 0)
                        ref0 = map_col_to_list0[0][ref0*2>>y_shift];
                    else{
                        ref0 = map_col_to_list0[1][l1ref1[x8 + (y8*2>>y_shift)*h->b8_stride]*2>>y_shift];
                        l1mv= l1mv1;
                    }
                    scale = dist_scale_factor[ref0];
                    fill_rectangle(&h->ref_cache[0][scan8[i8*4]], 2, 2, 8, ref0, 1);

                    {
                        const int16_t *mv_col = l1mv[x8*3 + (y8*6>>y_shift)*h->b_stride];
                        int my_col = (mv_col[1]<<y_shift)/2;
                        int mx = (scale * mv_col[0] + 128) >> 8;
                        int my = (scale * my_col + 128) >> 8;
                        fill_rectangle(&h->mv_cache[0][scan8[i8*4]], 2, 2, 8, pack16to32(mx,my), 4);
                        fill_rectangle(&h->mv_cache[1][scan8[i8*4]], 2, 2, 8, pack16to32(mx-mv_col[0],my-my_col), 4);
                    }
                }
                return;
            }
        }

        /* one-to-one mv scaling */

        if(IS_16X16(*mb_type)){
            fill_rectangle(&h->ref_cache[1][scan8[0]], 4, 4, 8, 0, 1);
            if(IS_INTRA(mb_type_col)){
                fill_rectangle(&h->ref_cache[0][scan8[0]], 4, 4, 8, 0, 1);
                fill_rectangle(&h-> mv_cache[0][scan8[0]], 4, 4, 8, 0, 4);
                fill_rectangle(&h-> mv_cache[1][scan8[0]], 4, 4, 8, 0, 4);
            }else{
                const int ref0 = l1ref0[0] >= 0 ? map_col_to_list0[0][l1ref0[0]]
                                                : map_col_to_list0[1][l1ref1[0]];
                const int scale = dist_scale_factor[ref0];
                const int16_t *mv_col = l1ref0[0] >= 0 ? l1mv0[0] : l1mv1[0];
                int mv_l0[2];
                mv_l0[0] = (scale * mv_col[0] + 128) >> 8;
                mv_l0[1] = (scale * mv_col[1] + 128) >> 8;
                fill_rectangle(&h->ref_cache[0][scan8[0]], 4, 4, 8, ref0, 1);
                fill_rectangle(&h-> mv_cache[0][scan8[0]], 4, 4, 8, pack16to32(mv_l0[0],mv_l0[1]), 4);
                fill_rectangle(&h-> mv_cache[1][scan8[0]], 4, 4, 8, pack16to32(mv_l0[0]-mv_col[0],mv_l0[1]-mv_col[1]), 4);
            }
        }else{
            for(i8=0; i8<4; i8++){
                const int x8 = i8&1;
                const int y8 = i8>>1;
                int ref0, scale;
                const int16_t (*l1mv)[2]= l1mv0;

                if(is_b8x8 && !IS_DIRECT(h->sub_mb_type[i8]))
                    continue;
                h->sub_mb_type[i8] = sub_mb_type;
                fill_rectangle(&h->ref_cache[1][scan8[i8*4]], 2, 2, 8, 0, 1);
                if(IS_INTRA(mb_type_col)){
                    fill_rectangle(&h->ref_cache[0][scan8[i8*4]], 2, 2, 8, 0, 1);
                    fill_rectangle(&h-> mv_cache[0][scan8[i8*4]], 2, 2, 8, 0, 4);
                    fill_rectangle(&h-> mv_cache[1][scan8[i8*4]], 2, 2, 8, 0, 4);
                    continue;
                }

                ref0 = l1ref0[x8 + y8*h->b8_stride];
                if(ref0 >= 0)
                    ref0 = map_col_to_list0[0][ref0];
                else{
                    ref0 = map_col_to_list0[1][l1ref1[x8 + y8*h->b8_stride]];
                    l1mv= l1mv1;
                }
                scale = dist_scale_factor[ref0];

                fill_rectangle(&h->ref_cache[0][scan8[i8*4]], 2, 2, 8, ref0, 1);
                if(IS_SUB_8X8(sub_mb_type)){
                    const int16_t *mv_col = l1mv[x8*3 + y8*3*h->b_stride];
                    int mx = (scale * mv_col[0] + 128) >> 8;
                    int my = (scale * mv_col[1] + 128) >> 8;
                    fill_rectangle(&h->mv_cache[0][scan8[i8*4]], 2, 2, 8, pack16to32(mx,my), 4);
                    fill_rectangle(&h->mv_cache[1][scan8[i8*4]], 2, 2, 8, pack16to32(mx-mv_col[0],my-mv_col[1]), 4);
                }else
                for(i4=0; i4<4; i4++){
                    const int16_t *mv_col = l1mv[x8*2 + (i4&1) + (y8*2 + (i4>>1))*h->b_stride];
                    int16_t *mv_l0 = h->mv_cache[0][scan8[i8*4+i4]];
                    mv_l0[0] = (scale * mv_col[0] + 128) >> 8;
                    mv_l0[1] = (scale * mv_col[1] + 128) >> 8;
                    *(uint32_t*)h->mv_cache[1][scan8[i8*4+i4]] =
                        pack16to32(mv_l0[0]-mv_col[0],mv_l0[1]-mv_col[1]);
                }
            }
        }
    }
}

/**
 * Copies the per-macroblock motion data from the decoder caches
 * (mv_cache/ref_cache/mvd_cache/sub_mb_type) back into the frame-wide
 * arrays of the current picture (motion_val, ref_index, mvd_table,
 * direct_table).
 * @param h       decoder context
 * @param mb_type type flags of the macroblock being written back
 */
static inline void write_back_motion(H264Context *h, int mb_type){
    MpegEncContext * const s = &h->s;
    const int b_xy = 4*s->mb_x + 4*s->mb_y*h->b_stride;
    const int b8_xy= 2*s->mb_x + 2*s->mb_y*h->b8_stride;
    int list;

    if(!USES_LIST(mb_type, 0))
        fill_rectangle(&s->current_picture.ref_index[0][b8_xy], 2, 2, h->b8_stride, (uint8_t)LIST_NOT_USED, 1);

    for(list=0; list<2; list++){
        int y;
        if(!USES_LIST(mb_type, list))
            continue;

        /* two 64-bit stores per row copy a whole 4-MV row at once */
        for(y=0; y<4; y++){
            *(uint64_t*)s->current_picture.motion_val[list][b_xy + 0 + y*h->b_stride]= *(uint64_t*)h->mv_cache[list][scan8[0]+0 + 8*y];
            *(uint64_t*)s->current_picture.motion_val[list][b_xy + 2 + y*h->b_stride]= *(uint64_t*)h->mv_cache[list][scan8[0]+2 + 8*y];
        }
        if( h->pps.cabac ) {
            if(IS_SKIP(mb_type))
                fill_rectangle(h->mvd_table[list][b_xy], 4, 4, h->b_stride, 0, 4);
            else
            for(y=0; y<4; y++){
                *(uint64_t*)h->mvd_table[list][b_xy + 0 + y*h->b_stride]= *(uint64_t*)h->mvd_cache[list][scan8[0]+0 + 8*y];
                *(uint64_t*)h->mvd_table[list][b_xy + 2 + y*h->b_stride]= *(uint64_t*)h->mvd_cache[list][scan8[0]+2 + 8*y];
            }
        }

        {
            int8_t *ref_index = &s->current_picture.ref_index[list][b8_xy];
            ref_index[0+0*h->b8_stride]= h->ref_cache[list][scan8[0]];
            ref_index[1+0*h->b8_stride]= h->ref_cache[list][scan8[4]];
            ref_index[0+1*h->b8_stride]= h->ref_cache[list][scan8[8]];
            ref_index[1+1*h->b8_stride]= h->ref_cache[list][scan8[12]];
        }
    }

    if(h->slice_type == B_TYPE && h->pps.cabac){
        if(IS_8X8(mb_type)){
            /* sub_mb_type[0] is not stored here; only partitions 1..3 are
               written to direct_table (matches how it is read back) */
            uint8_t *direct_table = &h->direct_table[b8_xy];
            direct_table[1+0*h->b8_stride] = IS_DIRECT(h->sub_mb_type[1]) ? 1 : 0;
            direct_table[0+1*h->b8_stride] = IS_DIRECT(h->sub_mb_type[2]) ? 1 : 0;
            direct_table[1+1*h->b8_stride] = IS_DIRECT(h->sub_mb_type[3]) ? 1 : 0;
        }
    }
}

/**
1743
 * Decodes a network abstraction layer unit.
1744
 * @param consumed is the number of bytes used as input
1745
 * @param length is the length of the array
1746
 * @param dst_length is the number of decoded bytes FIXME here or a decode rbsp tailing?
1747
 * @returns decoded bytes, might be src+1 if no escapes
1748
 */
1749
static uint8_t *decode_nal(H264Context *h, uint8_t *src, int *dst_length, int *consumed, int length){
1750
    int i, si, di;
1751
    uint8_t *dst;
1752

    
1753
//    src[0]&0x80;                //forbidden bit
1754
    h->nal_ref_idc= src[0]>>5;
1755
    h->nal_unit_type= src[0]&0x1F;
1756

    
1757
    src++; length--;
1758
#if 0
1759
    for(i=0; i<length; i++)
1760
        printf("%2X ", src[i]);
1761
#endif
1762
    for(i=0; i+1<length; i+=2){
1763
        if(src[i]) continue;
1764
        if(i>0 && src[i-1]==0) i--;
1765
        if(i+2<length && src[i+1]==0 && src[i+2]<=3){
1766
            if(src[i+2]!=3){
1767
                /* startcode, so we must be past the end */
1768
                length=i;
1769
            }
1770
            break;
1771
        }
1772
    }
1773

    
1774
    if(i>=length-1){ //no escaped 0
1775
        *dst_length= length;
1776
        *consumed= length+1; //+1 for the header
1777
        return src;
1778
    }
1779

    
1780
    h->rbsp_buffer= av_fast_realloc(h->rbsp_buffer, &h->rbsp_buffer_size, length);
1781
    dst= h->rbsp_buffer;
1782

    
1783
//printf("decoding esc\n");
1784
    si=di=0;
1785
    while(si<length){
1786
        //remove escapes (very rare 1:2^22)
1787
        if(si+2<length && src[si]==0 && src[si+1]==0 && src[si+2]<=3){
1788
            if(src[si+2]==3){ //escape
1789
                dst[di++]= 0;
1790
                dst[di++]= 0;
1791
                si+=3;
1792
                continue;
1793
            }else //next start code
1794
                break;
1795
        }
1796

    
1797
        dst[di++]= src[si++];
1798
    }
1799

    
1800
    *dst_length= di;
1801
    *consumed= si + 1;//+1 for the header
1802
//FIXME store exact number of bits in the getbitcontext (its needed for decoding)
1803
    return dst;
1804
}
1805

    
1806
/**
 * identifies the exact end of the bitstream
 * by locating the rbsp_stop_one_bit in the last byte.
 * @param src pointer to the last byte of the RBSP
 * @return the length of the trailing, or 0 if damaged
 */
static int decode_rbsp_trailing(uint8_t *src){
    int v= *src;
    int r;

    tprintf("rbsp trailing %X\n", v);

    /* the stop bit is the lowest set bit; r counts bits from the LSB */
    for(r=1; r<9; r++){
        if(v&1) return r;
        v>>=1;
    }
    return 0;
}

/**
1824
 * idct tranforms the 16 dc values and dequantize them.
1825
 * @param qp quantization parameter
1826
 */
1827
static void h264_luma_dc_dequant_idct_c(DCTELEM *block, int qp, int qmul){
1828
#define stride 16
1829
    int i;
1830
    int temp[16]; //FIXME check if this is a good idea
1831
    static const int x_offset[4]={0, 1*stride, 4* stride,  5*stride};
1832
    static const int y_offset[4]={0, 2*stride, 8* stride, 10*stride};
1833

    
1834
//memset(block, 64, 2*256);
1835
//return;
1836
    for(i=0; i<4; i++){
1837
        const int offset= y_offset[i];
1838
        const int z0= block[offset+stride*0] + block[offset+stride*4];
1839
        const int z1= block[offset+stride*0] - block[offset+stride*4];
1840
        const int z2= block[offset+stride*1] - block[offset+stride*5];
1841
        const int z3= block[offset+stride*1] + block[offset+stride*5];
1842

    
1843
        temp[4*i+0]= z0+z3;
1844
        temp[4*i+1]= z1+z2;
1845
        temp[4*i+2]= z1-z2;
1846
        temp[4*i+3]= z0-z3;
1847
    }
1848

    
1849
    for(i=0; i<4; i++){
1850
        const int offset= x_offset[i];
1851
        const int z0= temp[4*0+i] + temp[4*2+i];
1852
        const int z1= temp[4*0+i] - temp[4*2+i];
1853
        const int z2= temp[4*1+i] - temp[4*3+i];
1854
        const int z3= temp[4*1+i] + temp[4*3+i];
1855

    
1856
        block[stride*0 +offset]= ((((z0 + z3)*qmul + 128 ) >> 8)); //FIXME think about merging this into decode_resdual
1857
        block[stride*2 +offset]= ((((z1 + z2)*qmul + 128 ) >> 8));
1858
        block[stride*8 +offset]= ((((z1 - z2)*qmul + 128 ) >> 8));
1859
        block[stride*10+offset]= ((((z0 - z3)*qmul + 128 ) >> 8));
1860
    }
1861
}
1862

    
1863
#if 0
/**
 * dct transforms the 16 dc values.
 * @param qp quantization parameter ??? FIXME
 */
static void h264_luma_dc_dct_c(DCTELEM *block/*, int qp*/){
//    const int qmul= dequant_coeff[qp][0];
    int i;
    int temp[16]; //FIXME check if this is a good idea
    static const int x_offset[4]={0, 1*stride, 4* stride,  5*stride};
    static const int y_offset[4]={0, 2*stride, 8* stride, 10*stride};

    for(i=0; i<4; i++){
        const int offset= y_offset[i];
        const int z0= block[offset+stride*0] + block[offset+stride*4];
        const int z1= block[offset+stride*0] - block[offset+stride*4];
        const int z2= block[offset+stride*1] - block[offset+stride*5];
        const int z3= block[offset+stride*1] + block[offset+stride*5];

        temp[4*i+0]= z0+z3;
        temp[4*i+1]= z1+z2;
        temp[4*i+2]= z1-z2;
        temp[4*i+3]= z0-z3;
    }

    for(i=0; i<4; i++){
        const int offset= x_offset[i];
        const int z0= temp[4*0+i] + temp[4*2+i];
        const int z1= temp[4*0+i] - temp[4*2+i];
        const int z2= temp[4*1+i] - temp[4*3+i];
        const int z3= temp[4*1+i] + temp[4*3+i];

        block[stride*0 +offset]= (z0 + z3)>>1;
        block[stride*2 +offset]= (z1 + z2)>>1;
        block[stride*8 +offset]= (z1 - z2)>>1;
        block[stride*10+offset]= (z0 - z3)>>1;
    }
}
#endif

#undef xStride
#undef stride

/**
 * 2x2 inverse Hadamard transform of the chroma DC coefficients,
 * followed by dequantization with qmul.
 * @param block coefficient block laid out with the strides below
 * @param qp quantization parameter (unused here; dequant is folded into qmul)
 * @param qmul dequantization multiplier
 */
static void chroma_dc_dequant_idct_c(DCTELEM *block, int qp, int qmul){
    const int stride= 16*2;
    const int xStride= 16;
    int a,b,c,d,e;

    a= block[stride*0 + xStride*0];
    b= block[stride*0 + xStride*1];
    c= block[stride*1 + xStride*0];
    d= block[stride*1 + xStride*1];

    /* 2x2 butterfly */
    e= a-b;
    a= a+b;
    b= c-d;
    c= c+d;

    block[stride*0 + xStride*0]= ((a+c)*qmul) >> 7;
    block[stride*0 + xStride*1]= ((e+b)*qmul) >> 7;
    block[stride*1 + xStride*0]= ((a-c)*qmul) >> 7;
    block[stride*1 + xStride*1]= ((e-b)*qmul) >> 7;
}

#if 0
static void chroma_dc_dct_c(DCTELEM *block){
    const int stride= 16*2;
    const int xStride= 16;
    int a,b,c,d,e;

    a= block[stride*0 + xStride*0];
    b= block[stride*0 + xStride*1];
    c= block[stride*1 + xStride*0];
    d= block[stride*1 + xStride*1];

    e= a-b;
    a= a+b;
    b= c-d;
    c= c+d;

    block[stride*0 + xStride*0]= (a+c);
    block[stride*0 + xStride*1]= (e+b);
    block[stride*1 + xStride*0]= (a-c);
    block[stride*1 + xStride*1]= (e-b);
}
#endif

/**
1951
 * gets the chroma qp.
1952
 */
1953
static inline int get_chroma_qp(int chroma_qp_index_offset, int qscale){
1954

    
1955
    return chroma_qp[clip(qscale + chroma_qp_index_offset, 0, 51)];
1956
}
1957

    
1958
//FIXME need to check that this doesnt overflow signed 32 bit for low qp, i am not sure, it's very close
1959
//FIXME check that gcc inlines this (and optimizes intra & seperate_dc stuff away)
1960
static inline int quantize_c(DCTELEM *block, uint8_t *scantable, int qscale, int intra, int seperate_dc){
1961
    int i;
1962
    const int * const quant_table= quant_coeff[qscale];
1963
    const int bias= intra ? (1<<QUANT_SHIFT)/3 : (1<<QUANT_SHIFT)/6;
1964
    const unsigned int threshold1= (1<<QUANT_SHIFT) - bias - 1;
1965
    const unsigned int threshold2= (threshold1<<1);
1966
    int last_non_zero;
1967

    
1968
    if(seperate_dc){
1969
        if(qscale<=18){
1970
            //avoid overflows
1971
            const int dc_bias= intra ? (1<<(QUANT_SHIFT-2))/3 : (1<<(QUANT_SHIFT-2))/6;
1972
            const unsigned int dc_threshold1= (1<<(QUANT_SHIFT-2)) - dc_bias - 1;
1973
            const unsigned int dc_threshold2= (dc_threshold1<<1);
1974

    
1975
            int level= block[0]*quant_coeff[qscale+18][0];
1976
            if(((unsigned)(level+dc_threshold1))>dc_threshold2){
1977
                if(level>0){
1978
                    level= (dc_bias + level)>>(QUANT_SHIFT-2);
1979
                    block[0]= level;
1980
                }else{
1981
                    level= (dc_bias - level)>>(QUANT_SHIFT-2);
1982
                    block[0]= -level;
1983
                }
1984
//                last_non_zero = i;
1985
            }else{
1986
                block[0]=0;
1987
            }
1988
        }else{
1989
            const int dc_bias= intra ? (1<<(QUANT_SHIFT+1))/3 : (1<<(QUANT_SHIFT+1))/6;
1990
            const unsigned int dc_threshold1= (1<<(QUANT_SHIFT+1)) - dc_bias - 1;
1991
            const unsigned int dc_threshold2= (dc_threshold1<<1);
1992

    
1993
            int level= block[0]*quant_table[0];
1994
            if(((unsigned)(level+dc_threshold1))>dc_threshold2){
1995
                if(level>0){
1996
                    level= (dc_bias + level)>>(QUANT_SHIFT+1);
1997
                    block[0]= level;
1998
                }else{
1999
                    level= (dc_bias - level)>>(QUANT_SHIFT+1);
2000
                    block[0]= -level;
2001
                }
2002
//                last_non_zero = i;
2003
            }else{
2004
                block[0]=0;
2005
            }
2006
        }
2007
        last_non_zero= 0;
2008
        i=1;
2009
    }else{
2010
        last_non_zero= -1;
2011
        i=0;
2012
    }
2013

    
2014
    for(; i<16; i++){
2015
        const int j= scantable[i];
2016
        int level= block[j]*quant_table[j];
2017

    
2018
//        if(   bias+level >= (1<<(QMAT_SHIFT - 3))
2019
//           || bias-level >= (1<<(QMAT_SHIFT - 3))){
2020
        if(((unsigned)(level+threshold1))>threshold2){
2021
            if(level>0){
2022
                level= (bias + level)>>QUANT_SHIFT;
2023
                block[j]= level;
2024
            }else{
2025
                level= (bias - level)>>QUANT_SHIFT;
2026
                block[j]= -level;
2027
            }
2028
            last_non_zero = i;
2029
        }else{
2030
            block[j]=0;
2031
        }
2032
    }
2033

    
2034
    return last_non_zero;
2035
}
2036

    
2037
/* 4x4 vertical intra prediction: replicate the row above into all 4 rows.
   topright is unused for this mode. */
static void pred4x4_vertical_c(uint8_t *src, uint8_t *topright, int stride){
    const uint32_t a= ((uint32_t*)(src-stride))[0];
    ((uint32_t*)(src+0*stride))[0]= a;
    ((uint32_t*)(src+1*stride))[0]= a;
    ((uint32_t*)(src+2*stride))[0]= a;
    ((uint32_t*)(src+3*stride))[0]= a;
}

/* 4x4 horizontal intra prediction: each row is the left neighbour pixel
   replicated 4 times (0x01010101 splat). topright is unused. */
static void pred4x4_horizontal_c(uint8_t *src, uint8_t *topright, int stride){
    ((uint32_t*)(src+0*stride))[0]= src[-1+0*stride]*0x01010101;
    ((uint32_t*)(src+1*stride))[0]= src[-1+1*stride]*0x01010101;
    ((uint32_t*)(src+2*stride))[0]= src[-1+2*stride]*0x01010101;
    ((uint32_t*)(src+3*stride))[0]= src[-1+3*stride]*0x01010101;
}

/* 4x4 DC intra prediction: fill the block with the rounded mean of the
   4 top and 4 left neighbour pixels. topright is unused. */
static void pred4x4_dc_c(uint8_t *src, uint8_t *topright, int stride){
    const int dc= (  src[-stride] + src[1-stride] + src[2-stride] + src[3-stride]
                   + src[-1+0*stride] + src[-1+1*stride] + src[-1+2*stride] + src[-1+3*stride] + 4) >>3;

    ((uint32_t*)(src+0*stride))[0]=
    ((uint32_t*)(src+1*stride))[0]=
    ((uint32_t*)(src+2*stride))[0]=
    ((uint32_t*)(src+3*stride))[0]= dc* 0x01010101;
}

/* 4x4 DC prediction using only the 4 left neighbours (top unavailable).
   topright is unused. */
static void pred4x4_left_dc_c(uint8_t *src, uint8_t *topright, int stride){
    const int dc= (  src[-1+0*stride] + src[-1+1*stride] + src[-1+2*stride] + src[-1+3*stride] + 2) >>2;

    ((uint32_t*)(src+0*stride))[0]=
    ((uint32_t*)(src+1*stride))[0]=
    ((uint32_t*)(src+2*stride))[0]=
    ((uint32_t*)(src+3*stride))[0]= dc* 0x01010101;
}

/* 4x4 DC prediction using only the 4 top neighbours (left unavailable).
   topright is unused. */
static void pred4x4_top_dc_c(uint8_t *src, uint8_t *topright, int stride){
    const int dc= (  src[-stride] + src[1-stride] + src[2-stride] + src[3-stride] + 2) >>2;

    ((uint32_t*)(src+0*stride))[0]=
    ((uint32_t*)(src+1*stride))[0]=
    ((uint32_t*)(src+2*stride))[0]=
    ((uint32_t*)(src+3*stride))[0]= dc* 0x01010101;
}

/* 4x4 DC prediction with no neighbours available: fill with mid-grey 128.
   topright is unused. */
static void pred4x4_128_dc_c(uint8_t *src, uint8_t *topright, int stride){
    ((uint32_t*)(src+0*stride))[0]=
    ((uint32_t*)(src+1*stride))[0]=
    ((uint32_t*)(src+2*stride))[0]=
    ((uint32_t*)(src+3*stride))[0]= 128U*0x01010101U;
}

/* Edge-loading helpers for the 4x4 intra predictors below: each expands to
   local const ints naming the neighbour pixels (expects `src`, `stride` and,
   for TOP_RIGHT, `topright` in scope at the expansion site). */
#define LOAD_TOP_RIGHT_EDGE\
    const int t4= topright[0];\
    const int t5= topright[1];\
    const int t6= topright[2];\
    const int t7= topright[3];\

#define LOAD_LEFT_EDGE\
    const int l0= src[-1+0*stride];\
    const int l1= src[-1+1*stride];\
    const int l2= src[-1+2*stride];\
    const int l3= src[-1+3*stride];\

#define LOAD_TOP_EDGE\
    const int t0= src[ 0-1*stride];\
    const int t1= src[ 1-1*stride];\
    const int t2= src[ 2-1*stride];\
    const int t3= src[ 3-1*stride];\

/* 4x4 diagonal down-right intra prediction (H.264 mode 4).
   Edge pixels are loaded inline (same expansion as LOAD_TOP_EDGE /
   LOAD_LEFT_EDGE) so the function is self-contained. topright is unused. */
static void pred4x4_down_right_c(uint8_t *src, uint8_t *topright, int stride){
    const int lt= src[-1-1*stride];
    const int t0= src[ 0-1*stride];
    const int t1= src[ 1-1*stride];
    const int t2= src[ 2-1*stride];
    const int t3= src[ 3-1*stride];
    const int l0= src[-1+0*stride];
    const int l1= src[-1+1*stride];
    const int l2= src[-1+2*stride];
    const int l3= src[-1+3*stride];

    src[0+3*stride]=(l3 + 2*l2 + l1 + 2)>>2;
    src[0+2*stride]=
    src[1+3*stride]=(l2 + 2*l1 + l0 + 2)>>2;
    src[0+1*stride]=
    src[1+2*stride]=
    src[2+3*stride]=(l1 + 2*l0 + lt + 2)>>2;
    src[0+0*stride]=
    src[1+1*stride]=
    src[2+2*stride]=
    src[3+3*stride]=(l0 + 2*lt + t0 + 2)>>2;
    src[1+0*stride]=
    src[2+1*stride]=
    src[3+2*stride]=(lt + 2*t0 + t1 + 2)>>2;
    src[2+0*stride]=
    src[3+1*stride]=(t0 + 2*t1 + t2 + 2)>>2;
    src[3+0*stride]=(t1 + 2*t2 + t3 + 2)>>2;
}

static void pred4x4_down_left_c(uint8_t *src, uint8_t *topright, int stride){
2130
    LOAD_TOP_EDGE
2131
    LOAD_TOP_RIGHT_EDGE
2132
//    LOAD_LEFT_EDGE
2133

    
2134
    src[0+0*stride]=(t0 + t2 + 2*t1 + 2)>>2;
2135
    src[1+0*stride]=
2136
    src[0+1*stride]=(t1 + t3 + 2*t2 + 2)>>2;
2137
    src[2+0*stride]=
2138
    src[1+1*stride]=
2139
    src[0+2*stride]=(t2 + t4 + 2*t3 + 2)>>2;
2140
    src[3+0*stride]=
2141
    src[2+1*stride]=
2142
    src[1+2*stride]=
2143
    src[0+3*stride]=(t3 + t5 + 2*t4 + 2)>>2;
2144
    src[3+1*stride]=
2145
    src[2+2*stride]=
2146
    src[1+3*stride]=(t4 + t6 + 2*t5 + 2)>>2;
2147
    src[3+2*stride]=
2148
    src[2+3*stride]=(t5 + t7 + 2*t6 + 2)>>2;
2149
    src[3+3*stride]=(t6 + 3*t7 + 2)>>2;
2150
}
2151

    
2152
static void pred4x4_vertical_right_c(uint8_t *src, uint8_t *topright, int stride){
2153
    const int lt= src[-1-1*stride];
2154
    LOAD_TOP_EDGE
2155
    LOAD_LEFT_EDGE
2156
    const __attribute__((unused)) int unu= l3;
2157

    
2158
    src[0+0*stride]=
2159
    src[1+2*stride]=(lt + t0 + 1)>>1;
2160
    src[1+0*stride]=
2161
    src[2+2*stride]=(t0 + t1 + 1)>>1;
2162
    src[2+0*stride]=
2163
    src[3+2*stride]=(t1 + t2 + 1)>>1;
2164
    src[3+0*stride]=(t2 + t3 + 1)>>1;
2165
    src[0+1*stride]=
2166
    src[1+3*stride]=(l0 + 2*lt + t0 + 2)>>2;
2167
    src[1+1*stride]=
2168
    src[2+3*stride]=(lt + 2*t0 + t1 + 2)>>2;
2169
    src[2+1*stride]=
2170
    src[3+3*stride]=(t0 + 2*t1 + t2 + 2)>>2;
2171
    src[3+1*stride]=(t1 + 2*t2 + t3 + 2)>>2;
2172
    src[0+2*stride]=(lt + 2*l0 + l1 + 2)>>2;
2173
    src[0+3*stride]=(l0 + 2*l1 + l2 + 2)>>2;
2174
}
2175

    
2176
static void pred4x4_vertical_left_c(uint8_t *src, uint8_t *topright, int stride){
2177
    LOAD_TOP_EDGE
2178
    LOAD_TOP_RIGHT_EDGE
2179
    const __attribute__((unused)) int unu= t7;
2180

    
2181
    src[0+0*stride]=(t0 + t1 + 1)>>1;
2182
    src[1+0*stride]=
2183
    src[0+2*stride]=(t1 + t2 + 1)>>1;
2184
    src[2+0*stride]=
2185
    src[1+2*stride]=(t2 + t3 + 1)>>1;
2186
    src[3+0*stride]=
2187
    src[2+2*stride]=(t3 + t4+ 1)>>1;
2188
    src[3+2*stride]=(t4 + t5+ 1)>>1;
2189
    src[0+1*stride]=(t0 + 2*t1 + t2 + 2)>>2;
2190
    src[1+1*stride]=
2191
    src[0+3*stride]=(t1 + 2*t2 + t3 + 2)>>2;
2192
    src[2+1*stride]=
2193
    src[1+3*stride]=(t2 + 2*t3 + t4 + 2)>>2;
2194
    src[3+1*stride]=
2195
    src[2+3*stride]=(t3 + 2*t4 + t5 + 2)>>2;
2196
    src[3+3*stride]=(t4 + 2*t5 + t6 + 2)>>2;
2197
}
2198

    
2199
static void pred4x4_horizontal_up_c(uint8_t *src, uint8_t *topright, int stride){
2200
    LOAD_LEFT_EDGE
2201

    
2202
    src[0+0*stride]=(l0 + l1 + 1)>>1;
2203
    src[1+0*stride]=(l0 + 2*l1 + l2 + 2)>>2;
2204
    src[2+0*stride]=
2205
    src[0+1*stride]=(l1 + l2 + 1)>>1;
2206
    src[3+0*stride]=
2207
    src[1+1*stride]=(l1 + 2*l2 + l3 + 2)>>2;
2208
    src[2+1*stride]=
2209
    src[0+2*stride]=(l2 + l3 + 1)>>1;
2210
    src[3+1*stride]=
2211
    src[1+2*stride]=(l2 + 2*l3 + l3 + 2)>>2;
2212
    src[3+2*stride]=
2213
    src[1+3*stride]=
2214
    src[0+3*stride]=
2215
    src[2+2*stride]=
2216
    src[2+3*stride]=
2217
    src[3+3*stride]=l3;
2218
}
2219

    
2220
static void pred4x4_horizontal_down_c(uint8_t *src, uint8_t *topright, int stride){
2221
    const int lt= src[-1-1*stride];
2222
    LOAD_TOP_EDGE
2223
    LOAD_LEFT_EDGE
2224
    const __attribute__((unused)) int unu= t3;
2225

    
2226
    src[0+0*stride]=
2227
    src[2+1*stride]=(lt + l0 + 1)>>1;
2228
    src[1+0*stride]=
2229
    src[3+1*stride]=(l0 + 2*lt + t0 + 2)>>2;
2230
    src[2+0*stride]=(lt + 2*t0 + t1 + 2)>>2;
2231
    src[3+0*stride]=(t0 + 2*t1 + t2 + 2)>>2;
2232
    src[0+1*stride]=
2233
    src[2+2*stride]=(l0 + l1 + 1)>>1;
2234
    src[1+1*stride]=
2235
    src[3+2*stride]=(lt + 2*l0 + l1 + 2)>>2;
2236
    src[0+2*stride]=
2237
    src[2+3*stride]=(l1 + l2+ 1)>>1;
2238
    src[1+2*stride]=
2239
    src[3+3*stride]=(l0 + 2*l1 + l2 + 2)>>2;
2240
    src[0+3*stride]=(l2 + l3 + 1)>>1;
2241
    src[1+3*stride]=(l1 + 2*l2 + l3 + 2)>>2;
2242
}
2243

    
2244
/**
 * 16x16 vertical intra prediction: replicate the row of pixels above
 * the macroblock into all 16 rows. The row is copied as four aligned
 * 32-bit words (src is assumed 32-bit aligned).
 */
void ff_pred16x16_vertical_c(uint8_t *src, int stride){
    int i;
    const uint32_t a= ((uint32_t*)(src-stride))[0];
    const uint32_t b= ((uint32_t*)(src-stride))[1];
    const uint32_t c= ((uint32_t*)(src-stride))[2];
    const uint32_t d= ((uint32_t*)(src-stride))[3];

    for(i=0; i<16; i++){
        ((uint32_t*)(src+i*stride))[0]= a;
        ((uint32_t*)(src+i*stride))[1]= b;
        ((uint32_t*)(src+i*stride))[2]= c;
        ((uint32_t*)(src+i*stride))[3]= d;
    }
}
/**
 * 16x16 horizontal intra prediction: each row is filled with the pixel
 * immediately to its left, splatted across the row via 0x01010101.
 */
void ff_pred16x16_horizontal_c(uint8_t *src, int stride){
    int i;

    for(i=0; i<16; i++){
        ((uint32_t*)(src+i*stride))[0]=
        ((uint32_t*)(src+i*stride))[1]=
        ((uint32_t*)(src+i*stride))[2]=
        ((uint32_t*)(src+i*stride))[3]= src[-1+i*stride]*0x01010101;
    }
}
/**
 * 16x16 DC intra prediction: fill the macroblock with the rounded
 * average of the 16 left neighbours and the 16 top neighbours
 * ((sum + 16) >> 5).
 */
void ff_pred16x16_dc_c(uint8_t *src, int stride){
    int i, dc=0;

    for(i=0;i<16; i++){
        dc+= src[-1+i*stride];      /* left column */
    }

    for(i=0;i<16; i++){
        dc+= src[i-stride];         /* top row */
    }

    dc= 0x01010101*((dc + 16)>>5);  /* splat averaged value into 4 bytes */

    for(i=0; i<16; i++){
        ((uint32_t*)(src+i*stride))[0]=
        ((uint32_t*)(src+i*stride))[1]=
        ((uint32_t*)(src+i*stride))[2]=
        ((uint32_t*)(src+i*stride))[3]= dc;
    }
}
/**
 * 16x16 DC prediction using only the left column (top unavailable):
 * fill with (sum of 16 left pixels + 8) >> 4.
 */
static void pred16x16_left_dc_c(uint8_t *src, int stride){
    int i, dc=0;

    for(i=0;i<16; i++){
        dc+= src[-1+i*stride];
    }

    dc= 0x01010101*((dc + 8)>>4);

    for(i=0; i<16; i++){
        ((uint32_t*)(src+i*stride))[0]=
        ((uint32_t*)(src+i*stride))[1]=
        ((uint32_t*)(src+i*stride))[2]=
        ((uint32_t*)(src+i*stride))[3]= dc;
    }
}
/**
 * 16x16 DC prediction using only the top row (left unavailable):
 * fill with (sum of 16 top pixels + 8) >> 4.
 */
static void pred16x16_top_dc_c(uint8_t *src, int stride){
    int i, dc=0;

    for(i=0;i<16; i++){
        dc+= src[i-stride];
    }
    dc= 0x01010101*((dc + 8)>>4);

    for(i=0; i<16; i++){
        ((uint32_t*)(src+i*stride))[0]=
        ((uint32_t*)(src+i*stride))[1]=
        ((uint32_t*)(src+i*stride))[2]=
        ((uint32_t*)(src+i*stride))[3]= dc;
    }
}
/**
 * 16x16 DC prediction when no neighbours are available:
 * fill the macroblock with mid-grey (128).
 */
void ff_pred16x16_128_dc_c(uint8_t *src, int stride){
    int i;

    for(i=0; i<16; i++){
        ((uint32_t*)(src+i*stride))[0]=
        ((uint32_t*)(src+i*stride))[1]=
        ((uint32_t*)(src+i*stride))[2]=
        ((uint32_t*)(src+i*stride))[3]= 0x01010101U*128U;
    }
}
static inline void pred16x16_plane_compat_c(uint8_t *src, int stride, const int svq3){
2336
  int i, j, k;
2337
  int a;
2338
  uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
2339
  const uint8_t * const src0 = src+7-stride;
2340
  const uint8_t *src1 = src+8*stride-1;
2341
  const uint8_t *src2 = src1-2*stride;      // == src+6*stride-1;
2342
  int H = src0[1] - src0[-1];
2343
  int V = src1[0] - src2[ 0];
2344
  for(k=2; k<=8; ++k) {
2345
    src1 += stride; src2 -= stride;
2346
    H += k*(src0[k] - src0[-k]);
2347
    V += k*(src1[0] - src2[ 0]);
2348
  }
2349
  if(svq3){
2350
    H = ( 5*(H/4) ) / 16;
2351
    V = ( 5*(V/4) ) / 16;
2352

    
2353
    /* required for 100% accuracy */
2354
    i = H; H = V; V = i;
2355
  }else{
2356
    H = ( 5*H+32 ) >> 6;
2357
    V = ( 5*V+32 ) >> 6;
2358
  }
2359

    
2360
  a = 16*(src1[0] + src2[16] + 1) - 7*(V+H);
2361
  for(j=16; j>0; --j) {
2362
    int b = a;
2363
    a += V;
2364
    for(i=-16; i<0; i+=4) {
2365
      src[16+i] = cm[ (b    ) >> 5 ];
2366
      src[17+i] = cm[ (b+  H) >> 5 ];
2367
      src[18+i] = cm[ (b+2*H) >> 5 ];
2368
      src[19+i] = cm[ (b+3*H) >> 5 ];
2369
      b += 4*H;
2370
    }
2371
    src += stride;
2372
  }
2373
}
2374

    
2375
/** 16x16 plane prediction, H.264 flavour (svq3 mode disabled). */
void ff_pred16x16_plane_c(uint8_t *src, int stride){
    pred16x16_plane_compat_c(src, stride, 0);
}
/**
 * 8x8 vertical intra prediction (chroma): replicate the row of pixels
 * above the block into all 8 rows, as two aligned 32-bit words.
 */
void ff_pred8x8_vertical_c(uint8_t *src, int stride){
    int i;
    const uint32_t a= ((uint32_t*)(src-stride))[0];
    const uint32_t b= ((uint32_t*)(src-stride))[1];

    for(i=0; i<8; i++){
        ((uint32_t*)(src+i*stride))[0]= a;
        ((uint32_t*)(src+i*stride))[1]= b;
    }
}
/**
 * 8x8 horizontal intra prediction (chroma): each row is filled with
 * the pixel immediately to its left.
 */
void ff_pred8x8_horizontal_c(uint8_t *src, int stride){
    int i;

    for(i=0; i<8; i++){
        ((uint32_t*)(src+i*stride))[0]=
        ((uint32_t*)(src+i*stride))[1]= src[-1+i*stride]*0x01010101;
    }
}
/**
 * 8x8 DC prediction when no neighbours are available:
 * fill the block with mid-grey (128).
 */
void ff_pred8x8_128_dc_c(uint8_t *src, int stride){
    int i;

    for(i=0; i<8; i++){
        ((uint32_t*)(src+i*stride))[0]=
        ((uint32_t*)(src+i*stride))[1]= 0x01010101U*128U;
    }
}
/**
 * 8x8 DC prediction using only the left column (top unavailable).
 * Per the chroma DC rules the block is split vertically: the top half
 * averages left pixels 0-3 (dc0), the bottom half pixels 4-7 (dc2).
 */
static void pred8x8_left_dc_c(uint8_t *src, int stride){
    int i;
    int dc0, dc2;

    dc0=dc2=0;
    for(i=0;i<4; i++){
        dc0+= src[-1+i*stride];
        dc2+= src[-1+(i+4)*stride];
    }
    dc0= 0x01010101*((dc0 + 2)>>2);
    dc2= 0x01010101*((dc2 + 2)>>2);

    for(i=0; i<4; i++){
        ((uint32_t*)(src+i*stride))[0]=
        ((uint32_t*)(src+i*stride))[1]= dc0;
    }
    for(i=4; i<8; i++){
        ((uint32_t*)(src+i*stride))[0]=
        ((uint32_t*)(src+i*stride))[1]= dc2;
    }
}
/**
 * 8x8 DC prediction using only the top row (left unavailable).
 * The block is split horizontally: the left half uses the average of
 * top pixels 0-3 (dc0), the right half pixels 4-7 (dc1).
 */
static void pred8x8_top_dc_c(uint8_t *src, int stride){
    int i;
    int dc0, dc1;

    dc0=dc1=0;
    for(i=0;i<4; i++){
        dc0+= src[i-stride];
        dc1+= src[4+i-stride];
    }
    dc0= 0x01010101*((dc0 + 2)>>2);
    dc1= 0x01010101*((dc1 + 2)>>2);

    for(i=0; i<4; i++){
        ((uint32_t*)(src+i*stride))[0]= dc0;
        ((uint32_t*)(src+i*stride))[1]= dc1;
    }
    for(i=4; i<8; i++){
        ((uint32_t*)(src+i*stride))[0]= dc0;
        ((uint32_t*)(src+i*stride))[1]= dc1;
    }
}
/**
 * 8x8 DC prediction with both neighbours available (chroma DC rules):
 * the block is split into four 4x4 quadrants. Top-left uses left+top
 * sums (dc0), top-right the right top half (dc1), bottom-left the lower
 * left half (dc2), and bottom-right the combination of dc1+dc2 sums (dc3).
 */
void ff_pred8x8_dc_c(uint8_t *src, int stride){
    int i;
    int dc0, dc1, dc2, dc3;

    dc0=dc1=dc2=0;
    for(i=0;i<4; i++){
        dc0+= src[-1+i*stride] + src[i-stride]; /* upper-left: left 0-3 + top 0-3 */
        dc1+= src[4+i-stride];                  /* top 4-7  */
        dc2+= src[-1+(i+4)*stride];             /* left 4-7 */
    }
    dc3= 0x01010101*((dc1 + dc2 + 4)>>3);
    dc0= 0x01010101*((dc0 + 4)>>3);
    dc1= 0x01010101*((dc1 + 2)>>2);
    dc2= 0x01010101*((dc2 + 2)>>2);

    for(i=0; i<4; i++){
        ((uint32_t*)(src+i*stride))[0]= dc0;
        ((uint32_t*)(src+i*stride))[1]= dc1;
    }
    for(i=4; i<8; i++){
        ((uint32_t*)(src+i*stride))[0]= dc2;
        ((uint32_t*)(src+i*stride))[1]= dc3;
    }
}
void ff_pred8x8_plane_c(uint8_t *src, int stride){
2479
  int j, k;
2480
  int a;
2481
  uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
2482
  const uint8_t * const src0 = src+3-stride;
2483
  const uint8_t *src1 = src+4*stride-1;
2484
  const uint8_t *src2 = src1-2*stride;      // == src+2*stride-1;
2485
  int H = src0[1] - src0[-1];
2486
  int V = src1[0] - src2[ 0];
2487
  for(k=2; k<=4; ++k) {
2488
    src1 += stride; src2 -= stride;
2489
    H += k*(src0[k] - src0[-k]);
2490
    V += k*(src1[0] - src2[ 0]);
2491
  }
2492
  H = ( 17*H+16 ) >> 5;
2493
  V = ( 17*V+16 ) >> 5;
2494

    
2495
  a = 16*(src1[0] + src2[8]+1) - 3*(V+H);
2496
  for(j=8; j>0; --j) {
2497
    int b = a;
2498
    a += V;
2499
    src[0] = cm[ (b    ) >> 5 ];
2500
    src[1] = cm[ (b+  H) >> 5 ];
2501
    src[2] = cm[ (b+2*H) >> 5 ];
2502
    src[3] = cm[ (b+3*H) >> 5 ];
2503
    src[4] = cm[ (b+4*H) >> 5 ];
2504
    src[5] = cm[ (b+5*H) >> 5 ];
2505
    src[6] = cm[ (b+6*H) >> 5 ];
2506
    src[7] = cm[ (b+7*H) >> 5 ];
2507
    src += stride;
2508
  }
2509
}
2510

    
2511
/* Helper macros for the 8x8 luma prediction functions below.
 * They expect `src`, `stride`, `has_topleft` and `has_topright` in scope
 * and load 3-tap low-pass filtered edge samples, as H.264 requires
 * filtered neighbours for 8x8 intra prediction. */
#define SRC(x,y) src[(x)+(y)*stride]
#define PL(y) \
    const int l##y = (SRC(-1,y-1) + 2*SRC(-1,y) + SRC(-1,y+1) + 2) >> 2;
#define PREDICT_8x8_LOAD_LEFT \
    const int l0 = ((has_topleft ? SRC(-1,-1) : SRC(-1,0)) \
                     + 2*SRC(-1,0) + SRC(-1,1) + 2) >> 2; \
    PL(1) PL(2) PL(3) PL(4) PL(5) PL(6) \
    const int l7 attribute_unused = (SRC(-1,6) + 3*SRC(-1,7) + 2) >> 2

#define PT(x) \
    const int t##x = (SRC(x-1,-1) + 2*SRC(x,-1) + SRC(x+1,-1) + 2) >> 2;
#define PREDICT_8x8_LOAD_TOP \
    const int t0 = ((has_topleft ? SRC(-1,-1) : SRC(0,-1)) \
                     + 2*SRC(0,-1) + SRC(1,-1) + 2) >> 2; \
    PT(1) PT(2) PT(3) PT(4) PT(5) PT(6) \
    const int t7 attribute_unused = ((has_topright ? SRC(8,-1) : SRC(7,-1)) \
                     + 2*SRC(7,-1) + SRC(6,-1) + 2) >> 2

#define PTR(x) \
    t##x = (SRC(x-1,-1) + 2*SRC(x,-1) + SRC(x+1,-1) + 2) >> 2;
#define PREDICT_8x8_LOAD_TOPRIGHT \
    int t8, t9, t10, t11, t12, t13, t14, t15; \
    if(has_topright) { \
        PTR(8) PTR(9) PTR(10) PTR(11) PTR(12) PTR(13) PTR(14) \
        t15 = (SRC(14,-1) + 3*SRC(15,-1) + 2) >> 2; \
    } else t8=t9=t10=t11=t12=t13=t14=t15= SRC(7,-1);

#define PREDICT_8x8_LOAD_TOPLEFT \
    const int lt = (SRC(-1,0) + 2*SRC(-1,-1) + SRC(0,-1) + 2) >> 2

/* Fill the whole 8x8 block with the byte-splatted 32-bit value v. */
#define PREDICT_8x8_DC(v) \
    int y; \
    for( y = 0; y < 8; y++ ) { \
        ((uint32_t*)src)[0] = \
        ((uint32_t*)src)[1] = v; \
        src += stride; \
    }
/** 8x8 luma DC prediction with no neighbours: fill with mid-grey (128). */
static void pred8x8l_128_dc_c(uint8_t *src, int has_topleft, int has_topright, int stride)
{
    PREDICT_8x8_DC(0x80808080);
}
static void pred8x8l_left_dc_c(uint8_t *src, int has_topleft, int has_topright, int stride)
2554
{
2555
    PREDICT_8x8_LOAD_LEFT;
2556
    const uint32_t dc = ((l0+l1+l2+l3+l4+l5+l6+l7+4) >> 3) * 0x01010101;
2557
    PREDICT_8x8_DC(dc);
2558
}
2559
static void pred8x8l_top_dc_c(uint8_t *src, int has_topleft, int has_topright, int stride)
2560
{
2561
    PREDICT_8x8_LOAD_TOP;
2562
    const uint32_t dc = ((t0+t1+t2+t3+t4+t5+t6+t7+4) >> 3) * 0x01010101;
2563
    PREDICT_8x8_DC(dc);
2564
}
2565
static void pred8x8l_dc_c(uint8_t *src, int has_topleft, int has_topright, int stride)
2566
{
2567
    PREDICT_8x8_LOAD_LEFT;
2568
    PREDICT_8x8_LOAD_TOP;
2569
    const uint32_t dc = ((l0+l1+l2+l3+l4+l5+l6+l7
2570
                         +t0+t1+t2+t3+t4+t5+t6+t7+8) >> 4) * 0x01010101;
2571
    PREDICT_8x8_DC(dc);
2572
}
2573
static void pred8x8l_horizontal_c(uint8_t *src, int has_topleft, int has_topright, int stride)
2574
{
2575
    PREDICT_8x8_LOAD_LEFT;
2576
#define ROW(y) ((uint32_t*)(src+y*stride))[0] =\
2577
               ((uint32_t*)(src+y*stride))[1] = 0x01010101 * l##y
2578
    ROW(0); ROW(1); ROW(2); ROW(3); ROW(4); ROW(5); ROW(6); ROW(7);
2579
#undef ROW
2580
}
2581
static void pred8x8l_vertical_c(uint8_t *src, int has_topleft, int has_topright, int stride)
2582
{
2583
    int y;
2584
    PREDICT_8x8_LOAD_TOP;
2585
    src[0] = t0;
2586
    src[1] = t1;
2587
    src[2] = t2;
2588
    src[3] = t3;
2589
    src[4] = t4;
2590
    src[5] = t5;
2591
    src[6] = t6;
2592
    src[7] = t7;
2593
    for( y = 1; y < 8; y++ )
2594
        *(uint64_t*)(src+y*stride) = *(uint64_t*)src;
2595
}
2596
static void pred8x8l_down_left_c(uint8_t *src, int has_topleft, int has_topright, int stride)
2597
{
2598
    PREDICT_8x8_LOAD_TOP;
2599
    PREDICT_8x8_LOAD_TOPRIGHT;
2600
    SRC(0,0)= (t0 + 2*t1 + t2 + 2) >> 2;
2601
    SRC(0,1)=SRC(1,0)= (t1 + 2*t2 + t3 + 2) >> 2;
2602
    SRC(0,2)=SRC(1,1)=SRC(2,0)= (t2 + 2*t3 + t4 + 2) >> 2;
2603
    SRC(0,3)=SRC(1,2)=SRC(2,1)=SRC(3,0)= (t3 + 2*t4 + t5 + 2) >> 2;
2604
    SRC(0,4)=SRC(1,3)=SRC(2,2)=SRC(3,1)=SRC(4,0)= (t4 + 2*t5 + t6 + 2) >> 2;
2605
    SRC(0,5)=SRC(1,4)=SRC(2,3)=SRC(3,2)=SRC(4,1)=SRC(5,0)= (t5 + 2*t6 + t7 + 2) >> 2;
2606
    SRC(0,6)=SRC(1,5)=SRC(2,4)=SRC(3,3)=SRC(4,2)=SRC(5,1)=SRC(6,0)= (t6 + 2*t7 + t8 + 2) >> 2;
2607
    SRC(0,7)=SRC(1,6)=SRC(2,5)=SRC(3,4)=SRC(4,3)=SRC(5,2)=SRC(6,1)=SRC(7,0)= (t7 + 2*t8 + t9 + 2) >> 2;
2608
    SRC(1,7)=SRC(2,6)=SRC(3,5)=SRC(4,4)=SRC(5,3)=SRC(6,2)=SRC(7,1)= (t8 + 2*t9 + t10 + 2) >> 2;
2609
    SRC(2,7)=SRC(3,6)=SRC(4,5)=SRC(5,4)=SRC(6,3)=SRC(7,2)= (t9 + 2*t10 + t11 + 2) >> 2;
2610
    SRC(3,7)=SRC(4,6)=SRC(5,5)=SRC(6,4)=SRC(7,3)= (t10 + 2*t11 + t12 + 2) >> 2;
2611
    SRC(4,7)=SRC(5,6)=SRC(6,5)=SRC(7,4)= (t11 + 2*t12 + t13 + 2) >> 2;
2612
    SRC(5,7)=SRC(6,6)=SRC(7,5)= (t12 + 2*t13 + t14 + 2) >> 2;
2613
    SRC(6,7)=SRC(7,6)= (t13 + 2*t14 + t15 + 2) >> 2;
2614
    SRC(7,7)= (t14 + 3*t15 + 2) >> 2;
2615
}
2616
static void pred8x8l_down_right_c(uint8_t *src, int has_topleft, int has_topright, int stride)
2617
{
2618
    PREDICT_8x8_LOAD_TOP;
2619
    PREDICT_8x8_LOAD_LEFT;
2620
    PREDICT_8x8_LOAD_TOPLEFT;
2621
    SRC(0,7)= (l7 + 2*l6 + l5 + 2) >> 2;
2622
    SRC(0,6)=SRC(1,7)= (l6 + 2*l5 + l4 + 2) >> 2;
2623
    SRC(0,5)=SRC(1,6)=SRC(2,7)= (l5 + 2*l4 + l3 + 2) >> 2;
2624
    SRC(0,4)=SRC(1,5)=SRC(2,6)=SRC(3,7)= (l4 + 2*l3 + l2 + 2) >> 2;
2625
    SRC(0,3)=SRC(1,4)=SRC(2,5)=SRC(3,6)=SRC(4,7)= (l3 + 2*l2 + l1 + 2) >> 2;
2626
    SRC(0,2)=SRC(1,3)=SRC(2,4)=SRC(3,5)=SRC(4,6)=SRC(5,7)= (l2 + 2*l1 + l0 + 2) >> 2;
2627
    SRC(0,1)=SRC(1,2)=SRC(2,3)=SRC(3,4)=SRC(4,5)=SRC(5,6)=SRC(6,7)= (l1 + 2*l0 + lt + 2) >> 2;
2628
    SRC(0,0)=SRC(1,1)=SRC(2,2)=SRC(3,3)=SRC(4,4)=SRC(5,5)=SRC(6,6)=SRC(7,7)= (l0 + 2*lt + t0 + 2) >> 2;
2629
    SRC(1,0)=SRC(2,1)=SRC(3,2)=SRC(4,3)=SRC(5,4)=SRC(6,5)=SRC(7,6)= (lt + 2*t0 + t1 + 2) >> 2;
2630
    SRC(2,0)=SRC(3,1)=SRC(4,2)=SRC(5,3)=SRC(6,4)=SRC(7,5)= (t0 + 2*t1 + t2 + 2) >> 2;
2631
    SRC(3,0)=SRC(4,1)=SRC(5,2)=SRC(6,3)=SRC(7,4)= (t1 + 2*t2 + t3 + 2) >> 2;
2632
    SRC(4,0)=SRC(5,1)=SRC(6,2)=SRC(7,3)= (t2 + 2*t3 + t4 + 2) >> 2;
2633
    SRC(5,0)=SRC(6,1)=SRC(7,2)= (t3 + 2*t4 + t5 + 2) >> 2;
2634
    SRC(6,0)=SRC(7,1)= (t4 + 2*t5 + t6 + 2) >> 2;
2635
    SRC(7,0)= (t5 + 2*t6 + t7 + 2) >> 2;
2636

    
2637
}
2638
static void pred8x8l_vertical_right_c(uint8_t *src, int has_topleft, int has_topright, int stride)
2639
{
2640
    PREDICT_8x8_LOAD_TOP;
2641
    PREDICT_8x8_LOAD_LEFT;
2642
    PREDICT_8x8_LOAD_TOPLEFT;
2643
    SRC(0,6)= (l5 + 2*l4 + l3 + 2) >> 2;
2644
    SRC(0,7)= (l6 + 2*l5 + l4 + 2) >> 2;
2645
    SRC(0,4)=SRC(1,6)= (l3 + 2*l2 + l1 + 2) >> 2;
2646
    SRC(0,5)=SRC(1,7)= (l4 + 2*l3 + l2 + 2) >> 2;
2647
    SRC(0,2)=SRC(1,4)=SRC(2,6)= (l1 + 2*l0 + lt + 2) >> 2;
2648
    SRC(0,3)=SRC(1,5)=SRC(2,7)= (l2 + 2*l1 + l0 + 2) >> 2;
2649
    SRC(0,1)=SRC(1,3)=SRC(2,5)=SRC(3,7)= (l0 + 2*lt + t0 + 2) >> 2;
2650
    SRC(0,0)=SRC(1,2)=SRC(2,4)=SRC(3,6)= (lt + t0 + 1) >> 1;
2651
    SRC(1,1)=SRC(2,3)=SRC(3,5)=SRC(4,7)= (lt + 2*t0 + t1 + 2) >> 2;
2652
    SRC(1,0)=SRC(2,2)=SRC(3,4)=SRC(4,6)= (t0 + t1 + 1) >> 1;
2653
    SRC(2,1)=SRC(3,3)=SRC(4,5)=SRC(5,7)= (t0 + 2*t1 + t2 + 2) >> 2;
2654
    SRC(2,0)=SRC(3,2)=SRC(4,4)=SRC(5,6)= (t1 + t2 + 1) >> 1;
2655
    SRC(3,1)=SRC(4,3)=SRC(5,5)=SRC(6,7)= (t1 + 2*t2 + t3 + 2) >> 2;
2656
    SRC(3,0)=SRC(4,2)=SRC(5,4)=SRC(6,6)= (t2 + t3 + 1) >> 1;
2657
    SRC(4,1)=SRC(5,3)=SRC(6,5)=SRC(7,7)= (t2 + 2*t3 + t4 + 2) >> 2;
2658
    SRC(4,0)=SRC(5,2)=SRC(6,4)=SRC(7,6)= (t3 + t4 + 1) >> 1;
2659
    SRC(5,1)=SRC(6,3)=SRC(7,5)= (t3 + 2*t4 + t5 + 2) >> 2;
2660
    SRC(5,0)=SRC(6,2)=SRC(7,4)= (t4 + t5 + 1) >> 1;
2661
    SRC(6,1)=SRC(7,3)= (t4 + 2*t5 + t6 + 2) >> 2;
2662
    SRC(6,0)=SRC(7,2)= (t5 + t6 + 1) >> 1;
2663
    SRC(7,1)= (t5 + 2*t6 + t7 + 2) >> 2;
2664
    SRC(7,0)= (t6 + t7 + 1) >> 1;
2665
}
2666
static void pred8x8l_horizontal_down_c(uint8_t *src, int has_topleft, int has_topright, int stride)
2667
{
2668
    PREDICT_8x8_LOAD_TOP;
2669
    PREDICT_8x8_LOAD_LEFT;
2670
    PREDICT_8x8_LOAD_TOPLEFT;
2671
    SRC(0,7)= (l6 + l7 + 1) >> 1;
2672
    SRC(1,7)= (l5 + 2*l6 + l7 + 2) >> 2;
2673
    SRC(0,6)=SRC(2,7)= (l5 + l6 + 1) >> 1;
2674
    SRC(1,6)=SRC(3,7)= (l4 + 2*l5 + l6 + 2) >> 2;
2675
    SRC(0,5)=SRC(2,6)=SRC(4,7)= (l4 + l5 + 1) >> 1;
2676
    SRC(1,5)=SRC(3,6)=SRC(5,7)= (l3 + 2*l4 + l5 + 2) >> 2;
2677
    SRC(0,4)=SRC(2,5)=SRC(4,6)=SRC(6,7)= (l3 + l4 + 1) >> 1;
2678
    SRC(1,4)=SRC(3,5)=SRC(5,6)=SRC(7,7)= (l2 + 2*l3 + l4 + 2) >> 2;
2679
    SRC(0,3)=SRC(2,4)=SRC(4,5)=SRC(6,6)= (l2 + l3 + 1) >> 1;
2680
    SRC(1,3)=SRC(3,4)=SRC(5,5)=SRC(7,6)= (l1 + 2*l2 + l3 + 2) >> 2;
2681
    SRC(0,2)=SRC(2,3)=SRC(4,4)=SRC(6,5)= (l1 + l2 + 1) >> 1;
2682
    SRC(1,2)=SRC(3,3)=SRC(5,4)=SRC(7,5)= (l0 + 2*l1 + l2 + 2) >> 2;
2683
    SRC(0,1)=SRC(2,2)=SRC(4,3)=SRC(6,4)= (l0 + l1 + 1) >> 1;
2684
    SRC(1,1)=SRC(3,2)=SRC(5,3)=SRC(7,4)= (lt + 2*l0 + l1 + 2) >> 2;
2685
    SRC(0,0)=SRC(2,1)=SRC(4,2)=SRC(6,3)= (lt + l0 + 1) >> 1;
2686
    SRC(1,0)=SRC(3,1)=SRC(5,2)=SRC(7,3)= (l0 + 2*lt + t0 + 2) >> 2;
2687
    SRC(2,0)=SRC(4,1)=SRC(6,2)= (t1 + 2*t0 + lt + 2) >> 2;
2688
    SRC(3,0)=SRC(5,1)=SRC(7,2)= (t2 + 2*t1 + t0 + 2) >> 2;
2689
    SRC(4,0)=SRC(6,1)= (t3 + 2*t2 + t1 + 2) >> 2;
2690
    SRC(5,0)=SRC(7,1)= (t4 + 2*t3 + t2 + 2) >> 2;
2691
    SRC(6,0)= (t5 + 2*t4 + t3 + 2) >> 2;
2692
    SRC(7,0)= (t6 + 2*t5 + t4 + 2) >> 2;
2693
}
2694
static void pred8x8l_vertical_left_c(uint8_t *src, int has_topleft, int has_topright, int stride)
2695
{
2696
    PREDICT_8x8_LOAD_TOP;
2697
    PREDICT_8x8_LOAD_TOPRIGHT;
2698
    SRC(0,0)= (t0 + t1 + 1) >> 1;
2699
    SRC(0,1)= (t0 + 2*t1 + t2 + 2) >> 2;
2700
    SRC(0,2)=SRC(1,0)= (t1 + t2 + 1) >> 1;
2701
    SRC(0,3)=SRC(1,1)= (t1 + 2*t2 + t3 + 2) >> 2;
2702
    SRC(0,4)=SRC(1,2)=SRC(2,0)= (t2 + t3 + 1) >> 1;
2703
    SRC(0,5)=SRC(1,3)=SRC(2,1)= (t2 + 2*t3 + t4 + 2) >> 2;
2704
    SRC(0,6)=SRC(1,4)=SRC(2,2)=SRC(3,0)= (t3 + t4 + 1) >> 1;
2705
    SRC(0,7)=SRC(1,5)=SRC(2,3)=SRC(3,1)= (t3 + 2*t4 + t5 + 2) >> 2;
2706
    SRC(1,6)=SRC(2,4)=SRC(3,2)=SRC(4,0)= (t4 + t5 + 1) >> 1;
2707
    SRC(1,7)=SRC(2,5)=SRC(3,3)=SRC(4,1)= (t4 + 2*t5 + t6 + 2) >> 2;
2708
    SRC(2,6)=SRC(3,4)=SRC(4,2)=SRC(5,0)= (t5 + t6 + 1) >> 1;
2709
    SRC(2,7)=SRC(3,5)=SRC(4,3)=SRC(5,1)= (t5 + 2*t6 + t7 + 2) >> 2;
2710
    SRC(3,6)=SRC(4,4)=SRC(5,2)=SRC(6,0)= (t6 + t7 + 1) >> 1;
2711
    SRC(3,7)=SRC(4,5)=SRC(5,3)=SRC(6,1)= (t6 + 2*t7 + t8 + 2) >> 2;
2712
    SRC(4,6)=SRC(5,4)=SRC(6,2)=SRC(7,0)= (t7 + t8 + 1) >> 1;
2713
    SRC(4,7)=SRC(5,5)=SRC(6,3)=SRC(7,1)= (t7 + 2*t8 + t9 + 2) >> 2;
2714
    SRC(5,6)=SRC(6,4)=SRC(7,2)= (t8 + t9 + 1) >> 1;
2715
    SRC(5,7)=SRC(6,5)=SRC(7,3)= (t8 + 2*t9 + t10 + 2) >> 2;
2716
    SRC(6,6)=SRC(7,4)= (t9 + t10 + 1) >> 1;
2717
    SRC(6,7)=SRC(7,5)= (t9 + 2*t10 + t11 + 2) >> 2;
2718
    SRC(7,6)= (t10 + t11 + 1) >> 1;
2719
    SRC(7,7)= (t10 + 2*t11 + t12 + 2) >> 2;
2720
}
2721
static void pred8x8l_horizontal_up_c(uint8_t *src, int has_topleft, int has_topright, int stride)
2722
{
2723
    PREDICT_8x8_LOAD_LEFT;
2724
    SRC(0,0)= (l0 + l1 + 1) >> 1;
2725
    SRC(1,0)= (l0 + 2*l1 + l2 + 2) >> 2;
2726
    SRC(0,1)=SRC(2,0)= (l1 + l2 + 1) >> 1;
2727
    SRC(1,1)=SRC(3,0)= (l1 + 2*l2 + l3 + 2) >> 2;
2728
    SRC(0,2)=SRC(2,1)=SRC(4,0)= (l2 + l3 + 1) >> 1;
2729
    SRC(1,2)=SRC(3,1)=SRC(5,0)= (l2 + 2*l3 + l4 + 2) >> 2;
2730
    SRC(0,3)=SRC(2,2)=SRC(4,1)=SRC(6,0)= (l3 + l4 + 1) >> 1;
2731
    SRC(1,3)=SRC(3,2)=SRC(5,1)=SRC(7,0)= (l3 + 2*l4 + l5 + 2) >> 2;
2732
    SRC(0,4)=SRC(2,3)=SRC(4,2)=SRC(6,1)= (l4 + l5 + 1) >> 1;
2733
    SRC(1,4)=SRC(3,3)=SRC(5,2)=SRC(7,1)= (l4 + 2*l5 + l6 + 2) >> 2;
2734
    SRC(0,5)=SRC(2,4)=SRC(4,3)=SRC(6,2)= (l5 + l6 + 1) >> 1;
2735
    SRC(1,5)=SRC(3,4)=SRC(5,3)=SRC(7,2)= (l5 + 2*l6 + l7 + 2) >> 2;
2736
    SRC(0,6)=SRC(2,5)=SRC(4,4)=SRC(6,3)= (l6 + l7 + 1) >> 1;
2737
    SRC(1,6)=SRC(3,5)=SRC(5,4)=SRC(7,3)= (l6 + 3*l7 + 2) >> 2;
2738
    SRC(0,7)=SRC(1,7)=SRC(2,6)=SRC(2,7)=SRC(3,6)=
2739
    SRC(3,7)=SRC(4,5)=SRC(4,6)=SRC(4,7)=SRC(5,5)=
2740
    SRC(5,6)=SRC(5,7)=SRC(6,4)=SRC(6,5)=SRC(6,6)=
2741
    SRC(6,7)=SRC(7,4)=SRC(7,5)=SRC(7,6)=SRC(7,7)= l7;
2742
}
2743
/* The 8x8 prediction helper macros are local to the functions above;
 * undefine them so they cannot leak into the rest of the file. */
#undef PREDICT_8x8_LOAD_LEFT
#undef PREDICT_8x8_LOAD_TOP
#undef PREDICT_8x8_LOAD_TOPLEFT
#undef PREDICT_8x8_LOAD_TOPRIGHT
#undef PREDICT_8x8_DC
#undef PTR
#undef PT
#undef PL
#undef SRC
static inline void mc_dir_part(H264Context *h, Picture *pic, int n, int square, int chroma_height, int delta, int list,
2754
                           uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
2755
                           int src_x_offset, int src_y_offset,
2756
                           qpel_mc_func *qpix_op, h264_chroma_mc_func chroma_op){
2757
    MpegEncContext * const s = &h->s;
2758
    const int mx= h->mv_cache[list][ scan8[n] ][0] + src_x_offset*8;
2759
    int my=       h->mv_cache[list][ scan8[n] ][1] + src_y_offset*8;
2760
    const int luma_xy= (mx&3) + ((my&3)<<2);
2761
    uint8_t * src_y = pic->data[0] + (mx>>2) + (my>>2)*h->mb_linesize;
2762
    uint8_t * src_cb, * src_cr;
2763
    int extra_width= h->emu_edge_width;
2764
    int extra_height= h->emu_edge_height;
2765
    int emu=0;
2766
    const int full_mx= mx>>2;
2767
    const int full_my= my>>2;
2768
    const int pic_width  = 16*s->mb_width;
2769
    const int pic_height = 16*s->mb_height >> MB_MBAFF;
2770

    
2771
    if(!pic->data[0])
2772
        return;
2773

    
2774
    if(mx&7) extra_width -= 3;
2775
    if(my&7) extra_height -= 3;
2776

    
2777
    if(   full_mx < 0-extra_width
2778
       || full_my < 0-extra_height
2779
       || full_mx + 16/*FIXME*/ > pic_width + extra_width
2780
       || full_my + 16/*FIXME*/ > pic_height + extra_height){
2781
        ff_emulated_edge_mc(s->edge_emu_buffer, src_y - 2 - 2*h->mb_linesize, h->mb_linesize, 16+5, 16+5/*FIXME*/, full_mx-2, full_my-2, pic_width, pic_height);
2782
            src_y= s->edge_emu_buffer + 2 + 2*h->mb_linesize;
2783
        emu=1;
2784
    }
2785

    
2786
    qpix_op[luma_xy](dest_y, src_y, h->mb_linesize); //FIXME try variable height perhaps?
2787
    if(!square){
2788
        qpix_op[luma_xy](dest_y + delta, src_y + delta, h->mb_linesize);
2789
    }
2790

    
2791
    if(s->flags&CODEC_FLAG_GRAY) return;
2792

    
2793
    if(MB_MBAFF){
2794
        // chroma offset when predicting from a field of opposite parity
2795
        my += 2 * ((s->mb_y & 1) - (h->ref_cache[list][scan8[n]] & 1));
2796
        emu |= (my>>3) < 0 || (my>>3) + 8 >= (pic_height>>1);
2797
    }
2798
    src_cb= pic->data[1] + (mx>>3) + (my>>3)*h->mb_uvlinesize;
2799
    src_cr= pic->data[2] + (mx>>3) + (my>>3)*h->mb_uvlinesize;
2800

    
2801
    if(emu){
2802
        ff_emulated_edge_mc(s->edge_emu_buffer, src_cb, h->mb_uvlinesize, 9, 9/*FIXME*/, (mx>>3), (my>>3), pic_width>>1, pic_height>>1);
2803
            src_cb= s->edge_emu_buffer;
2804
    }
2805
    chroma_op(dest_cb, src_cb, h->mb_uvlinesize, chroma_height, mx&7, my&7);
2806

    
2807
    if(emu){
2808
        ff_emulated_edge_mc(s->edge_emu_buffer, src_cr, h->mb_uvlinesize, 9, 9/*FIXME*/, (mx>>3), (my>>3), pic_width>>1, pic_height>>1);
2809
            src_cr= s->edge_emu_buffer;
2810
    }
2811
    chroma_op(dest_cr, src_cr, h->mb_uvlinesize, chroma_height, mx&7, my&7);
2812
}
2813

    
2814
static inline void mc_part_std(H264Context *h, int n, int square, int chroma_height, int delta,
2815
                           uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
2816
                           int x_offset, int y_offset,
2817
                           qpel_mc_func *qpix_put, h264_chroma_mc_func chroma_put,
2818
                           qpel_mc_func *qpix_avg, h264_chroma_mc_func chroma_avg,
2819
                           int list0, int list1){
2820
    MpegEncContext * const s = &h->s;
2821
    qpel_mc_func *qpix_op=  qpix_put;
2822
    h264_chroma_mc_func chroma_op= chroma_put;
2823

    
2824
    dest_y  += 2*x_offset + 2*y_offset*h->  mb_linesize;
2825
    dest_cb +=   x_offset +   y_offset*h->mb_uvlinesize;
2826
    dest_cr +=   x_offset +   y_offset*h->mb_uvlinesize;
2827
    x_offset += 8*s->mb_x;
2828
    y_offset += 8*(s->mb_y >> MB_MBAFF);
2829

    
2830
    if(list0){
2831
        Picture *ref= &h->ref_list[0][ h->ref_cache[0][ scan8[n] ] ];
2832
        mc_dir_part(h, ref, n, square, chroma_height, delta, 0,
2833
                           dest_y, dest_cb, dest_cr, x_offset, y_offset,
2834
                           qpix_op, chroma_op);
2835

    
2836
        qpix_op=  qpix_avg;
2837
        chroma_op= chroma_avg;
2838
    }
2839

    
2840
    if(list1){
2841
        Picture *ref= &h->ref_list[1][ h->ref_cache[1][ scan8[n] ] ];
2842
        mc_dir_part(h, ref, n, square, chroma_height, delta, 1,
2843
                           dest_y, dest_cb, dest_cr, x_offset, y_offset,
2844
                           qpix_op, chroma_op);
2845
    }
2846
}
2847

    
2848
static inline void mc_part_weighted(H264Context *h, int n, int square, int chroma_height, int delta,
2849
                           uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
2850
                           int x_offset, int y_offset,
2851
                           qpel_mc_func *qpix_put, h264_chroma_mc_func chroma_put,
2852
                           h264_weight_func luma_weight_op, h264_weight_func chroma_weight_op,
2853
                           h264_biweight_func luma_weight_avg, h264_biweight_func chroma_weight_avg,
2854
                           int list0, int list1){
2855
    MpegEncContext * const s = &h->s;
2856

    
2857
    dest_y  += 2*x_offset + 2*y_offset*h->  mb_linesize;
2858
    dest_cb +=   x_offset +   y_offset*h->mb_uvlinesize;
2859
    dest_cr +=   x_offset +   y_offset*h->mb_uvlinesize;
2860
    x_offset += 8*s->mb_x;
2861
    y_offset += 8*(s->mb_y >> MB_MBAFF);
2862

    
2863
    if(list0 && list1){
2864
        /* don't optimize for luma-only case, since B-frames usually
2865
         * use implicit weights => chroma too. */
2866
        uint8_t *tmp_cb = s->obmc_scratchpad;
2867
        uint8_t *tmp_cr = s->obmc_scratchpad + 8;
2868
        uint8_t *tmp_y  = s->obmc_scratchpad + 8*h->mb_uvlinesize;
2869
        int refn0 = h->ref_cache[0][ scan8[n] ];
2870
        int refn1 = h->ref_cache[1][ scan8[n] ];
2871

    
2872
        mc_dir_part(h, &h->ref_list[0][refn0], n, square, chroma_height, delta, 0,
2873
                    dest_y, dest_cb, dest_cr,
2874
                    x_offset, y_offset, qpix_put, chroma_put);
2875
        mc_dir_part(h, &h->ref_list[1][refn1], n, square, chroma_height, delta, 1,
2876
                    tmp_y, tmp_cb, tmp_cr,
2877
                    x_offset, y_offset, qpix_put, chroma_put);
2878

    
2879
        if(h->use_weight == 2){
2880
            int weight0 = h->implicit_weight[refn0][refn1];
2881
            int weight1 = 64 - weight0;
2882
            luma_weight_avg(  dest_y,  tmp_y,  h->  mb_linesize, 5, weight0, weight1, 0);
2883
            chroma_weight_avg(dest_cb, tmp_cb, h->mb_uvlinesize, 5, weight0, weight1, 0);
2884
            chroma_weight_avg(dest_cr, tmp_cr, h->mb_uvlinesize, 5, weight0, weight1, 0);
2885
        }else{
2886
            luma_weight_avg(dest_y, tmp_y, h->mb_linesize, h->luma_log2_weight_denom,
2887
                            h->luma_weight[0][refn0], h->luma_weight[1][refn1],
2888
                            h->luma_offset[0][refn0] + h->luma_offset[1][refn1]);
2889
            chroma_weight_avg(dest_cb, tmp_cb, h->mb_uvlinesize, h->chroma_log2_weight_denom,
2890
                            h->chroma_weight[0][refn0][0], h->chroma_weight[1][refn1][0],
2891
                            h->chroma_offset[0][refn0][0] + h->chroma_offset[1][refn1][0]);
2892
            chroma_weight_avg(dest_cr, tmp_cr, h->mb_uvlinesize, h->chroma_log2_weight_denom,
2893
                            h->chroma_weight[0][refn0][1], h->chroma_weight[1][refn1][1],
2894
                            h->chroma_offset[0][refn0][1] + h->chroma_offset[1][refn1][1]);
2895
        }
2896
    }else{
2897
        int list = list1 ? 1 : 0;
2898
        int refn = h->ref_cache[list][ scan8[n] ];
2899
        Picture *ref= &h->ref_list[list][refn];
2900
        mc_dir_part(h, ref, n, square, chroma_height, delta, list,
2901
                    dest_y, dest_cb, dest_cr, x_offset, y_offset,
2902
                    qpix_put, chroma_put);
2903

    
2904
        luma_weight_op(dest_y, h->mb_linesize, h->luma_log2_weight_denom,
2905
                       h->luma_weight[list][refn], h->luma_offset[list][refn]);
2906
        if(h->use_weight_chroma){
2907
            chroma_weight_op(dest_cb, h->mb_uvlinesize, h->chroma_log2_weight_denom,
2908
                             h->chroma_weight[list][refn][0], h->chroma_offset[list][refn][0]);
2909
            chroma_weight_op(dest_cr, h->mb_uvlinesize, h->chroma_log2_weight_denom,
2910
                             h->chroma_weight[list][refn][1], h->chroma_offset[list][refn][1]);
2911
        }
2912
    }
2913
}
2914

    
2915
/**
 * Motion-compensates one partition, selecting between the standard and
 * the weighted-prediction paths.
 *
 * The weighted path is taken when explicit weighted prediction is on
 * (use_weight==1), or when implicit bipred weighting is on
 * (use_weight==2), both reference lists are used, and the implicit
 * weight is not the trivial 32/32 split (which equals plain averaging,
 * so the cheaper standard path suffices).
 */
static inline void mc_part(H264Context *h, int n, int square, int chroma_height, int delta,
                           uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
                           int x_offset, int y_offset,
                           qpel_mc_func *qpix_put, h264_chroma_mc_func chroma_put,
                           qpel_mc_func *qpix_avg, h264_chroma_mc_func chroma_avg,
                           h264_weight_func *weight_op, h264_biweight_func *weight_avg,
                           int list0, int list1){
    if((h->use_weight==2 && list0 && list1
        && (h->implicit_weight[ h->ref_cache[0][scan8[n]] ][ h->ref_cache[1][scan8[n]] ] != 32))
       || h->use_weight==1)
        mc_part_weighted(h, n, square, chroma_height, delta, dest_y, dest_cb, dest_cr,
                         x_offset, y_offset, qpix_put, chroma_put,
                         weight_op[0], weight_op[3], weight_avg[0], weight_avg[3], list0, list1);
    else
        mc_part_std(h, n, square, chroma_height, delta, dest_y, dest_cb, dest_cr,
                    x_offset, y_offset, qpix_put, chroma_put, qpix_avg, chroma_avg, list0, list1);
}
2932

    
2933
/**
 * Issues cache prefetches into the reference picture for the motion
 * vector of block 0, roughly 4 macroblocks ahead of the current one.
 */
static inline void prefetch_motion(H264Context *h, int list){
    /* fetch pixels for estimated mv 4 macroblocks ahead
     * optimized for 64byte cache lines */
    MpegEncContext * const s = &h->s;
    const int refn = h->ref_cache[list][scan8[0]];
    if(refn >= 0){  /* negative ref index means this list is unused here */
        /* full-pel luma position (mv is in quarter-pel units) */
        const int mx= (h->mv_cache[list][scan8[0]][0]>>2) + 16*s->mb_x + 8;
        const int my= (h->mv_cache[list][scan8[0]][1]>>2) + 16*s->mb_y;
        uint8_t **src= h->ref_list[list][refn].data;
        int off= mx + (my + (s->mb_x&3)*4)*h->mb_linesize + 64;
        s->dsp.prefetch(src[0]+off, s->linesize, 4);
        /* chroma at half resolution; src[2]-src[1] is used as the stride
         * between the cb and cr planes so one call covers both */
        off= (mx>>1) + ((my>>1) + (s->mb_x&7))*s->uvlinesize + 64;
        s->dsp.prefetch(src[1]+off, src[2]-src[1], 2);
    }
}
2948

    
2949
/**
 * Performs motion compensation for one inter macroblock: splits it into
 * its partitions (16x16, 16x8, 8x16 or 8x8 with sub-partitions) and
 * hands each partition to mc_part() with the MC and weighting function
 * pointers of the matching block size.
 *
 * qpix_put/chroma_put are used for unidirectional prediction,
 * qpix_avg/chroma_avg for averaging in the bidirectional case.
 */
static void hl_motion(H264Context *h, uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
                      qpel_mc_func (*qpix_put)[16], h264_chroma_mc_func (*chroma_put),
                      qpel_mc_func (*qpix_avg)[16], h264_chroma_mc_func (*chroma_avg),
                      h264_weight_func *weight_op, h264_biweight_func *weight_avg){
    MpegEncContext * const s = &h->s;
    const int mb_xy= s->mb_x + s->mb_y*s->mb_stride;
    const int mb_type= s->current_picture.mb_type[mb_xy];

    assert(IS_INTER(mb_type));

    /* prefetch list 0 now, list 1 at the end, overlapping with the MC work */
    prefetch_motion(h, 0);

    if(IS_16X16(mb_type)){
        mc_part(h, 0, 1, 8, 0, dest_y, dest_cb, dest_cr, 0, 0,
                qpix_put[0], chroma_put[0], qpix_avg[0], chroma_avg[0],
                &weight_op[0], &weight_avg[0],
                IS_DIR(mb_type, 0, 0), IS_DIR(mb_type, 0, 1));
    }else if(IS_16X8(mb_type)){
        /* two 16x8 halves: top, then bottom (y_offset 4 in 4x4-block units) */
        mc_part(h, 0, 0, 4, 8, dest_y, dest_cb, dest_cr, 0, 0,
                qpix_put[1], chroma_put[0], qpix_avg[1], chroma_avg[0],
                &weight_op[1], &weight_avg[1],
                IS_DIR(mb_type, 0, 0), IS_DIR(mb_type, 0, 1));
        mc_part(h, 8, 0, 4, 8, dest_y, dest_cb, dest_cr, 0, 4,
                qpix_put[1], chroma_put[0], qpix_avg[1], chroma_avg[0],
                &weight_op[1], &weight_avg[1],
                IS_DIR(mb_type, 1, 0), IS_DIR(mb_type, 1, 1));
    }else if(IS_8X16(mb_type)){
        /* two 8x16 halves: left, then right */
        mc_part(h, 0, 0, 8, 8*h->mb_linesize, dest_y, dest_cb, dest_cr, 0, 0,
                qpix_put[1], chroma_put[1], qpix_avg[1], chroma_avg[1],
                &weight_op[2], &weight_avg[2],
                IS_DIR(mb_type, 0, 0), IS_DIR(mb_type, 0, 1));
        mc_part(h, 4, 0, 8, 8*h->mb_linesize, dest_y, dest_cb, dest_cr, 4, 0,
                qpix_put[1], chroma_put[1], qpix_avg[1], chroma_avg[1],
                &weight_op[2], &weight_avg[2],
                IS_DIR(mb_type, 1, 0), IS_DIR(mb_type, 1, 1));
    }else{
        int i;

        assert(IS_8X8(mb_type));

        /* four 8x8 sub-macroblocks, each with its own sub_mb_type */
        for(i=0; i<4; i++){
            const int sub_mb_type= h->sub_mb_type[i];
            const int n= 4*i;                 /* index of the first 4x4 block */
            int x_offset= (i&1)<<2;           /* offsets in 4x4-block units */
            int y_offset= (i&2)<<1;

            if(IS_SUB_8X8(sub_mb_type)){
                mc_part(h, n, 1, 4, 0, dest_y, dest_cb, dest_cr, x_offset, y_offset,
                    qpix_put[1], chroma_put[1], qpix_avg[1], chroma_avg[1],
                    &weight_op[3], &weight_avg[3],
                    IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
            }else if(IS_SUB_8X4(sub_mb_type)){
                mc_part(h, n  , 0, 2, 4, dest_y, dest_cb, dest_cr, x_offset, y_offset,
                    qpix_put[2], chroma_put[1], qpix_avg[2], chroma_avg[1],
                    &weight_op[4], &weight_avg[4],
                    IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
                mc_part(h, n+2, 0, 2, 4, dest_y, dest_cb, dest_cr, x_offset, y_offset+2,
                    qpix_put[2], chroma_put[1], qpix_avg[2], chroma_avg[1],
                    &weight_op[4], &weight_avg[4],
                    IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
            }else if(IS_SUB_4X8(sub_mb_type)){
                mc_part(h, n  , 0, 4, 4*h->mb_linesize, dest_y, dest_cb, dest_cr, x_offset, y_offset,
                    qpix_put[2], chroma_put[2], qpix_avg[2], chroma_avg[2],
                    &weight_op[5], &weight_avg[5],
                    IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
                mc_part(h, n+1, 0, 4, 4*h->mb_linesize, dest_y, dest_cb, dest_cr, x_offset+2, y_offset,
                    qpix_put[2], chroma_put[2], qpix_avg[2], chroma_avg[2],
                    &weight_op[5], &weight_avg[5],
                    IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
            }else{
                int j;
                assert(IS_SUB_4X4(sub_mb_type));
                for(j=0; j<4; j++){
                    int sub_x_offset= x_offset + 2*(j&1);
                    int sub_y_offset= y_offset +   (j&2);
                    mc_part(h, n+j, 1, 2, 0, dest_y, dest_cb, dest_cr, sub_x_offset, sub_y_offset,
                        qpix_put[2], chroma_put[2], qpix_avg[2], chroma_avg[2],
                        &weight_op[6], &weight_avg[6],
                        IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
                }
            }
        }
    }

    prefetch_motion(h, 1);
}
3035

    
3036
static void decode_init_vlc(){
3037
    static int done = 0;
3038

    
3039
    if (!done) {
3040
        int i;
3041
        done = 1;
3042

    
3043
        init_vlc(&chroma_dc_coeff_token_vlc, CHROMA_DC_COEFF_TOKEN_VLC_BITS, 4*5,
3044
                 &chroma_dc_coeff_token_len [0], 1, 1,
3045
                 &chroma_dc_coeff_token_bits[0], 1, 1, 1);
3046

    
3047
        for(i=0; i<4; i++){
3048
            init_vlc(&coeff_token_vlc[i], COEFF_TOKEN_VLC_BITS, 4*17,
3049
                     &coeff_token_len [i][0], 1, 1,
3050
                     &coeff_token_bits[i][0], 1, 1, 1);
3051
        }
3052

    
3053
        for(i=0; i<3; i++){
3054
            init_vlc(&chroma_dc_total_zeros_vlc[i], CHROMA_DC_TOTAL_ZEROS_VLC_BITS, 4,
3055
                     &chroma_dc_total_zeros_len [i][0], 1, 1,
3056
                     &chroma_dc_total_zeros_bits[i][0], 1, 1, 1);
3057
        }
3058
        for(i=0; i<15; i++){
3059
            init_vlc(&total_zeros_vlc[i], TOTAL_ZEROS_VLC_BITS, 16,
3060
                     &total_zeros_len [i][0], 1, 1,
3061
                     &total_zeros_bits[i][0], 1, 1, 1);
3062
        }
3063

    
3064
        for(i=0; i<6; i++){
3065
            init_vlc(&run_vlc[i], RUN_VLC_BITS, 7,
3066
                     &run_len [i][0], 1, 1,
3067
                     &run_bits[i][0], 1, 1, 1);
3068
        }
3069
        init_vlc(&run7_vlc, RUN7_VLC_BITS, 16,
3070
                 &run_len [6][0], 1, 1,
3071
                 &run_bits[6][0], 1, 1, 1);
3072
    }
3073
}
3074

    
3075
/**
3076
 * Sets the intra prediction function pointers.
3077
 */
3078
static void init_pred_ptrs(H264Context *h){
3079
//    MpegEncContext * const s = &h->s;
3080

    
3081
    h->pred4x4[VERT_PRED           ]= pred4x4_vertical_c;
3082
    h->pred4x4[HOR_PRED            ]= pred4x4_horizontal_c;
3083
    h->pred4x4[DC_PRED             ]= pred4x4_dc_c;
3084
    h->pred4x4[DIAG_DOWN_LEFT_PRED ]= pred4x4_down_left_c;
3085
    h->pred4x4[DIAG_DOWN_RIGHT_PRED]= pred4x4_down_right_c;
3086
    h->pred4x4[VERT_RIGHT_PRED     ]= pred4x4_vertical_right_c;
3087
    h->pred4x4[HOR_DOWN_PRED       ]= pred4x4_horizontal_down_c;
3088
    h->pred4x4[VERT_LEFT_PRED      ]= pred4x4_vertical_left_c;
3089
    h->pred4x4[HOR_UP_PRED         ]= pred4x4_horizontal_up_c;
3090
    h->pred4x4[LEFT_DC_PRED        ]= pred4x4_left_dc_c;
3091
    h->pred4x4[TOP_DC_PRED         ]= pred4x4_top_dc_c;
3092
    h->pred4x4[DC_128_PRED         ]= pred4x4_128_dc_c;
3093

    
3094
    h->pred8x8l[VERT_PRED           ]= pred8x8l_vertical_c;
3095
    h->pred8x8l[HOR_PRED            ]= pred8x8l_horizontal_c;
3096
    h->pred8x8l[DC_PRED             ]= pred8x8l_dc_c;
3097
    h->pred8x8l[DIAG_DOWN_LEFT_PRED ]= pred8x8l_down_left_c;
3098
    h->pred8x8l[DIAG_DOWN_RIGHT_PRED]= pred8x8l_down_right_c;
3099
    h->pred8x8l[VERT_RIGHT_PRED     ]= pred8x8l_vertical_right_c;
3100
    h->pred8x8l[HOR_DOWN_PRED       ]= pred8x8l_horizontal_down_c;
3101
    h->pred8x8l[VERT_LEFT_PRED      ]= pred8x8l_vertical_left_c;
3102
    h->pred8x8l[HOR_UP_PRED         ]= pred8x8l_horizontal_up_c;
3103
    h->pred8x8l[LEFT_DC_PRED        ]= pred8x8l_left_dc_c;
3104
    h->pred8x8l[TOP_DC_PRED         ]= pred8x8l_top_dc_c;
3105
    h->pred8x8l[DC_128_PRED         ]= pred8x8l_128_dc_c;
3106

    
3107
    h->pred8x8[DC_PRED8x8     ]= ff_pred8x8_dc_c;
3108
    h->pred8x8[VERT_PRED8x8   ]= ff_pred8x8_vertical_c;
3109
    h->pred8x8[HOR_PRED8x8    ]= ff_pred8x8_horizontal_c;
3110
    h->pred8x8[PLANE_PRED8x8  ]= ff_pred8x8_plane_c;
3111
    h->pred8x8[LEFT_DC_PRED8x8]= pred8x8_left_dc_c;
3112
    h->pred8x8[TOP_DC_PRED8x8 ]= pred8x8_top_dc_c;
3113
    h->pred8x8[DC_128_PRED8x8 ]= ff_pred8x8_128_dc_c;
3114

    
3115
    h->pred16x16[DC_PRED8x8     ]= ff_pred16x16_dc_c;
3116
    h->pred16x16[VERT_PRED8x8   ]= ff_pred16x16_vertical_c;
3117
    h->pred16x16[HOR_PRED8x8    ]= ff_pred16x16_horizontal_c;
3118
    h->pred16x16[PLANE_PRED8x8  ]= ff_pred16x16_plane_c;
3119
    h->pred16x16[LEFT_DC_PRED8x8]= pred16x16_left_dc_c;
3120
    h->pred16x16[TOP_DC_PRED8x8 ]= pred16x16_top_dc_c;
3121
    h->pred16x16[DC_128_PRED8x8 ]= ff_pred16x16_128_dc_c;
3122
}
3123

    
3124
/**
 * Frees every per-context table allocated in alloc_tables() and
 * NULLs the pointers (av_freep), so a partial allocation can be
 * cleaned up safely.
 */
static void free_tables(H264Context *h){
    av_freep(&h->non_zero_count);
    av_freep(&h->intra4x4_pred_mode);
    av_freep(&h->cbp_table);

    /* CABAC-only tables (may be NULL when CAVLC is used) */
    av_freep(&h->chroma_pred_mode_table);
    av_freep(&h->mvd_table[0]);
    av_freep(&h->mvd_table[1]);
    av_freep(&h->direct_table);

    av_freep(&h->top_borders[0]);
    av_freep(&h->top_borders[1]);

    /* slice_table is only an offset pointer into slice_table_base */
    av_freep(&h->slice_table_base);
    h->slice_table= NULL;

    av_freep(&h->mb2b_xy);
    av_freep(&h->mb2b8_xy);

    av_freep(&h->s.obmc_scratchpad);
}
3142

    
3143
/**
 * Builds the 8x8 dequantization tables (one row per qp 0..51) for the
 * two 8x8 scaling lists.  If both scaling lists are identical, table 1
 * aliases table 0 instead of being recomputed.
 */
static void init_dequant8_coeff_table(H264Context *h){
    int i,q,x;
    /* non-reference IDCT implementations expect transposed coefficients */
    const int transpose = (h->s.dsp.h264_idct8_add != ff_h264_idct8_add_c); //FIXME ugly
    h->dequant8_coeff[0] = h->dequant8_buffer[0];
    h->dequant8_coeff[1] = h->dequant8_buffer[1];

    for(i=0; i<2; i++ ){
        /* share buffer 0 when the second scaling list equals the first */
        if(i && !memcmp(h->pps.scaling_matrix8[0], h->pps.scaling_matrix8[1], 64*sizeof(uint8_t))){
            h->dequant8_coeff[1] = h->dequant8_buffer[0];
            break;
        }

        for(q=0; q<52; q++){
            int shift = ff_div6[q];  /* qp/6: each step of 6 doubles the scale */
            int idx = ff_rem6[q];    /* qp%6: selects the base coefficient set */
            for(x=0; x<64; x++)
                h->dequant8_coeff[i][q][transpose ? (x>>3)|((x&7)<<3) : x] =
                    ((uint32_t)dequant8_coeff_init[idx][ dequant8_coeff_init_scan[((x>>1)&12) | (x&3)] ] *
                    h->pps.scaling_matrix8[i][x]) << shift;
        }
    }
}
3165

    
3166
/**
 * Builds the 4x4 dequantization tables (one row per qp 0..51) for the
 * six 4x4 scaling lists.  A list identical to an earlier one shares the
 * earlier buffer instead of being recomputed.
 */
static void init_dequant4_coeff_table(H264Context *h){
    int i,j,q,x;
    /* non-reference IDCT implementations expect transposed coefficients */
    const int transpose = (h->s.dsp.h264_idct_add != ff_h264_idct_add_c); //FIXME ugly
    for(i=0; i<6; i++ ){
        h->dequant4_coeff[i] = h->dequant4_buffer[i];
        /* deduplicate: reuse the buffer of any earlier identical list */
        for(j=0; j<i; j++){
            if(!memcmp(h->pps.scaling_matrix4[j], h->pps.scaling_matrix4[i], 16*sizeof(uint8_t))){
                h->dequant4_coeff[i] = h->dequant4_buffer[j];
                break;
            }
        }
        if(j<i)
            continue;

        for(q=0; q<52; q++){
            int shift = ff_div6[q] + 2;  /* qp/6 scale plus fixed-point bits */
            int idx = ff_rem6[q];        /* qp%6 selects the base coefficients */
            for(x=0; x<16; x++)
                h->dequant4_coeff[i][q][transpose ? (x>>2)|((x<<2)&0xF) : x] =
                    ((uint32_t)dequant4_coeff_init[idx][(x&1) + ((x>>2)&1)] *
                    h->pps.scaling_matrix4[i][x]) << shift;
        }
    }
}
3190

    
3191
/**
 * (Re)builds all dequantization tables for the current PPS/SPS.
 * 8x8 tables are built only when 8x8 transform mode is enabled; with
 * transform bypass (lossless) the qp==0 rows are forced to the
 * identity scale (1<<6).
 */
static void init_dequant_tables(H264Context *h){
    int mat, coef;

    init_dequant4_coeff_table(h);
    if(h->pps.transform_8x8_mode)
        init_dequant8_coeff_table(h);

    if(!h->sps.transform_bypass)
        return;

    for(mat=0; mat<6; mat++)
        for(coef=0; coef<16; coef++)
            h->dequant4_coeff[mat][0][coef] = 1<<6;

    if(h->pps.transform_8x8_mode)
        for(mat=0; mat<2; mat++)
            for(coef=0; coef<64; coef++)
                h->dequant8_coeff[mat][0][coef] = 1<<6;
}
3206

    
3207

    
3208
/**
 * allocates tables.
 * needs width/height
 *
 * Allocates every per-context table whose size depends on the frame
 * geometry, fills the macroblock-to-block index maps and initializes
 * the slice table to "no slice".
 * @return 0 on success, -1 on allocation failure (all tables freed)
 */
static int alloc_tables(H264Context *h){
    MpegEncContext * const s = &h->s;
    /* one extra macroblock row of padding */
    const int big_mb_num= s->mb_stride * (s->mb_height+1);
    int x,y;

    CHECKED_ALLOCZ(h->intra4x4_pred_mode, big_mb_num * 8  * sizeof(uint8_t))

    CHECKED_ALLOCZ(h->non_zero_count    , big_mb_num * 16 * sizeof(uint8_t))
    CHECKED_ALLOCZ(h->slice_table_base  , (big_mb_num+s->mb_stride) * sizeof(uint8_t))
    CHECKED_ALLOCZ(h->top_borders[0]    , s->mb_width * (16+8+8) * sizeof(uint8_t))
    CHECKED_ALLOCZ(h->top_borders[1]    , s->mb_width * (16+8+8) * sizeof(uint8_t))
    CHECKED_ALLOCZ(h->cbp_table, big_mb_num * sizeof(uint16_t))

    /* these tables are only needed by the CABAC entropy coder */
    if( h->pps.cabac ) {
        CHECKED_ALLOCZ(h->chroma_pred_mode_table, big_mb_num * sizeof(uint8_t))
        CHECKED_ALLOCZ(h->mvd_table[0], 32*big_mb_num * sizeof(uint16_t));
        CHECKED_ALLOCZ(h->mvd_table[1], 32*big_mb_num * sizeof(uint16_t));
        CHECKED_ALLOCZ(h->direct_table, 32*big_mb_num * sizeof(uint8_t));
    }

    /* -1 marks "no slice"; slice_table points past the leading padding so
     * neighbours above/left of the picture can be addressed safely */
    memset(h->slice_table_base, -1, (big_mb_num+s->mb_stride)  * sizeof(uint8_t));
    h->slice_table= h->slice_table_base + s->mb_stride*2 + 1;

    /* map each macroblock to its first 4x4 (b) and 8x8 (b8) block index */
    CHECKED_ALLOCZ(h->mb2b_xy  , big_mb_num * sizeof(uint32_t));
    CHECKED_ALLOCZ(h->mb2b8_xy , big_mb_num * sizeof(uint32_t));
    for(y=0; y<s->mb_height; y++){
        for(x=0; x<s->mb_width; x++){
            const int mb_xy= x + y*s->mb_stride;
            const int b_xy = 4*x + 4*y*h->b_stride;
            const int b8_xy= 2*x + 2*y*h->b8_stride;

            h->mb2b_xy [mb_xy]= b_xy;
            h->mb2b8_xy[mb_xy]= b8_xy;
        }
    }

    /* allocated lazily in frame_start() once linesize is known */
    s->obmc_scratchpad = NULL;

    if(!h->dequant4_coeff[0])
        init_dequant_tables(h);

    return 0;
fail:
    free_tables(h);
    return -1;
}
3258

    
3259
/**
 * Context initialization shared by all entry points: copies the frame
 * geometry from the AVCodecContext, installs the prediction function
 * pointers and sets flat default scaling matrices.
 */
static void common_init(H264Context *h){
    MpegEncContext * const s = &h->s;

    s->width   = s->avctx->width;
    s->height  = s->avctx->height;
    s->codec_id= s->avctx->codec->id;

    init_pred_ptrs(h);

    h->dequant_coeff_pps= -1;   /* no PPS applied to the dequant tables yet */
    s->unrestricted_mv=1;
    s->decode=1; //FIXME

    /* flat scaling lists (all 16) until SPS/PPS provide real ones */
    memset(h->pps.scaling_matrix4, 16, 6*16*sizeof(uint8_t));
    memset(h->pps.scaling_matrix8, 16, 2*64*sizeof(uint8_t));
}
3275

    
3276
/**
 * AVCodec init callback: sets up the H.264 decoder context, builds the
 * static VLC tables and detects avcC (MP4-style) extradata.
 * @return 0 (always succeeds)
 */
static int decode_init(AVCodecContext *avctx){
    H264Context *h= avctx->priv_data;
    MpegEncContext * const s = &h->s;

    MPV_decode_defaults(s);

    s->avctx = avctx;
    common_init(h);

    s->out_format = FMT_H264;
    s->workaround_bugs= avctx->workaround_bugs;

    /* defaults */
    s->low_delay= 1;
    avctx->pix_fmt= PIX_FMT_YUV420P;

    decode_init_vlc();

    /* extradata beginning with byte 1 signals avcC packaging
     * (length-prefixed NALs) instead of Annex-B start codes */
    h->is_avc = 0;
    if(avctx->extradata && avctx->extradata_size > 0 &&
       *(char *)avctx->extradata == 1){
        h->is_avc = 1;
        h->got_avcC = 0;
    }

    return 0;
}
3305

    
3306
/**
 * Per-frame setup: starts the MPV / error-resilience frame, precomputes
 * the per-block pixel offsets into the picture planes and lazily
 * allocates the weighted-bipred scratch buffer.
 * block_offset[0..23] are the frame-coded offsets, [24..47] the
 * field-coded variants (line stride doubled).
 * @return 0 on success, -1 if MPV_frame_start() fails
 */
static int frame_start(H264Context *h){
    MpegEncContext * const s = &h->s;
    int i;

    if(MPV_frame_start(s, s->avctx) < 0)
        return -1;
    ff_er_frame_start(s);

    assert(s->linesize && s->uvlinesize);

    /* luma: 16 4x4 blocks; scan8[] gives the block's position in the
     * 8-wide cache grid, from which x/y pixel offsets are derived */
    for(i=0; i<16; i++){
        h->block_offset[i]= 4*((scan8[i] - scan8[0])&7) + 4*s->linesize*((scan8[i] - scan8[0])>>3);
        h->block_offset[24+i]= 4*((scan8[i] - scan8[0])&7) + 8*s->linesize*((scan8[i] - scan8[0])>>3);
    }
    /* chroma: 4 blocks each for cb (16..19) and cr (20..23) */
    for(i=0; i<4; i++){
        h->block_offset[16+i]=
        h->block_offset[20+i]= 4*((scan8[i] - scan8[0])&7) + 4*s->uvlinesize*((scan8[i] - scan8[0])>>3);
        h->block_offset[24+16+i]=
        h->block_offset[24+20+i]= 4*((scan8[i] - scan8[0])&7) + 8*s->uvlinesize*((scan8[i] - scan8[0])>>3);
    }

    /* can't be in alloc_tables because linesize isn't known there.
     * FIXME: redo bipred weight to not require extra buffer? */
    if(!s->obmc_scratchpad)
        s->obmc_scratchpad = av_malloc(16*2*s->linesize + 8*2*s->uvlinesize);

    /* some macroblocks will be accessed before they're available */
    if(FRAME_MBAFF)
        memset(h->slice_table, -1, (s->mb_height*s->mb_stride-1) * sizeof(uint8_t));

//    s->decode= (s->flags&CODEC_FLAG_PSNR) || !s->encoding || s->current_picture.reference /*|| h->contains_intra*/ || 1;
    return 0;
}
3339

    
3340
/**
 * Saves the bottom row and rightmost column of the just-decoded
 * macroblock into the border caches (top_borders[0] / left_border),
 * for use by the macroblocks to the right and below.
 */
static inline void backup_mb_border(H264Context *h, uint8_t *src_y, uint8_t *src_cb, uint8_t *src_cr, int linesize, int uvlinesize){
    MpegEncContext * const s = &h->s;
    int i;

    /* step back one line so src_* point at the row above the macroblock */
    src_y  -=   linesize;
    src_cb -= uvlinesize;
    src_cr -= uvlinesize;

    /* left border: corner pixel taken from the previously saved top
     * border, then the rightmost luma column (x==15) of this macroblock */
    h->left_border[0]= h->top_borders[0][s->mb_x][15];
    for(i=1; i<17; i++){
        h->left_border[i]= src_y[15+i*  linesize];
    }

    /* top border: the macroblock's bottom luma row (16 pixels) */
    *(uint64_t*)(h->top_borders[0][s->mb_x]+0)= *(uint64_t*)(src_y +  16*linesize);
    *(uint64_t*)(h->top_borders[0][s->mb_x]+8)= *(uint64_t*)(src_y +8+16*linesize);

    /* chroma borders; skipped for luma-only (gray) decoding */
    if(!(s->flags&CODEC_FLAG_GRAY)){
        h->left_border[17  ]= h->top_borders[0][s->mb_x][16+7];
        h->left_border[17+9]= h->top_borders[0][s->mb_x][24+7];
        for(i=1; i<9; i++){
            h->left_border[i+17  ]= src_cb[7+i*uvlinesize];
            h->left_border[i+17+9]= src_cr[7+i*uvlinesize];
        }
        *(uint64_t*)(h->top_borders[0][s->mb_x]+16)= *(uint64_t*)(src_cb+8*uvlinesize);
        *(uint64_t*)(h->top_borders[0][s->mb_x]+24)= *(uint64_t*)(src_cr+8*uvlinesize);
    }
}
3369

    
3370
/**
 * Exchanges pixels between the border caches (left_border,
 * top_borders[0]) and the edges around the current macroblock.
 * NOTE(review): apparently used to give intra prediction access to the
 * unfiltered neighbour pixels around deblocking — confirm at call sites.
 * @param xchg if nonzero, cache and picture are swapped; some spans are
 *             always exchanged (XCHG called with constant 1)
 */
static inline void xchg_mb_border(H264Context *h, uint8_t *src_y, uint8_t *src_cb, uint8_t *src_cr, int linesize, int uvlinesize, int xchg){
    MpegEncContext * const s = &h->s;
    int temp8, i;
    uint64_t temp64;
    int deblock_left = (s->mb_x > 0);  /* no left neighbour in column 0 */
    int deblock_top  = (s->mb_y > 0);  /* no top neighbour in row 0 */

    /* step back to the row above / column left of the macroblock */
    src_y  -=   linesize + 1;
    src_cb -= uvlinesize + 1;
    src_cr -= uvlinesize + 1;

/* b always receives the old a; a receives the old b only when xchg is set */
#define XCHG(a,b,t,xchg)\
t= a;\
if(xchg)\
    a= b;\
b= t;

    if(deblock_left){
        for(i = !deblock_top; i<17; i++){
            XCHG(h->left_border[i     ], src_y [i*  linesize], temp8, xchg);
        }
    }

    if(deblock_top){
        XCHG(*(uint64_t*)(h->top_borders[0][s->mb_x]+0), *(uint64_t*)(src_y +1), temp64, xchg);
        XCHG(*(uint64_t*)(h->top_borders[0][s->mb_x]+8), *(uint64_t*)(src_y +9), temp64, 1);
        if(s->mb_x+1 < s->mb_width){
            /* top-right corner, needed by diagonal intra predictors */
            XCHG(*(uint64_t*)(h->top_borders[0][s->mb_x+1]), *(uint64_t*)(src_y +17), temp64, 1);
        }
    }

    if(!(s->flags&CODEC_FLAG_GRAY)){
        if(deblock_left){
            for(i = !deblock_top; i<9; i++){
                XCHG(h->left_border[i+17  ], src_cb[i*uvlinesize], temp8, xchg);
                XCHG(h->left_border[i+17+9], src_cr[i*uvlinesize], temp8, xchg);
            }
        }
        if(deblock_top){
            XCHG(*(uint64_t*)(h->top_borders[0][s->mb_x]+16), *(uint64_t*)(src_cb+1), temp64, 1);
            XCHG(*(uint64_t*)(h->top_borders[0][s->mb_x]+24), *(uint64_t*)(src_cr+1), temp64, 1);
        }
    }
}
3414

    
3415
/**
 * MBAFF variant of backup_mb_border(): saves the borders of a whole
 * macroblock pair (two vertically stacked macroblocks, 32 luma lines).
 */
static inline void backup_pair_border(H264Context *h, uint8_t *src_y, uint8_t *src_cb, uint8_t *src_cr, int linesize, int uvlinesize){
    MpegEncContext * const s = &h->s;
    int i;

    src_y  -= 2 *   linesize;
    src_cb -= 2 * uvlinesize;
    src_cr -= 2 * uvlinesize;

    // There are two lines saved, the line above the top macroblock of a pair,
    // and the line above the bottom macroblock
    h->left_border[0]= h->top_borders[0][s->mb_x][15];
    h->left_border[1]= h->top_borders[1][s->mb_x][15];
    for(i=2; i<34; i++){
        h->left_border[i]= src_y[15+i*  linesize];
    }

    /* bottom two luma rows of the pair become the two top border lines */
    *(uint64_t*)(h->top_borders[0][s->mb_x]+0)= *(uint64_t*)(src_y +  32*linesize);
    *(uint64_t*)(h->top_borders[0][s->mb_x]+8)= *(uint64_t*)(src_y +8+32*linesize);
    *(uint64_t*)(h->top_borders[1][s->mb_x]+0)= *(uint64_t*)(src_y +  33*linesize);
    *(uint64_t*)(h->top_borders[1][s->mb_x]+8)= *(uint64_t*)(src_y +8+33*linesize);

    /* chroma borders; skipped for luma-only (gray) decoding */
    if(!(s->flags&CODEC_FLAG_GRAY)){
        h->left_border[34     ]= h->top_borders[0][s->mb_x][16+7];
        h->left_border[34+   1]= h->top_borders[1][s->mb_x][16+7];
        h->left_border[34+18  ]= h->top_borders[0][s->mb_x][24+7];
        h->left_border[34+18+1]= h->top_borders[1][s->mb_x][24+7];
        for(i=2; i<18; i++){
            h->left_border[i+34   ]= src_cb[7+i*uvlinesize];
            h->left_border[i+34+18]= src_cr[7+i*uvlinesize];
        }
        *(uint64_t*)(h->top_borders[0][s->mb_x]+16)= *(uint64_t*)(src_cb+16*uvlinesize);
        *(uint64_t*)(h->top_borders[0][s->mb_x]+24)= *(uint64_t*)(src_cr+16*uvlinesize);
        *(uint64_t*)(h->top_borders[1][s->mb_x]+16)= *(uint64_t*)(src_cb+17*uvlinesize);
        *(uint64_t*)(h->top_borders[1][s->mb_x]+24)= *(uint64_t*)(src_cr+17*uvlinesize);
    }
}
3451

    
3452
/**
 * MBAFF variant of xchg_mb_border(): exchanges the border caches with
 * the edges of a whole macroblock pair (two stacked macroblocks, hence
 * two cached top lines and a 32-line left column).
 * @param xchg if nonzero, cache and picture are swapped; some spans are
 *             always exchanged (XCHG called with constant 1)
 */
static inline void xchg_pair_border(H264Context *h, uint8_t *src_y, uint8_t *src_cb, uint8_t *src_cr, int linesize, int uvlinesize, int xchg){
    MpegEncContext * const s = &h->s;
    int temp8, i;
    uint64_t temp64;
    int deblock_left = (s->mb_x > 0);
    int deblock_top  = (s->mb_y > 1);  /* needs a full pair above */

    tprintf("xchg_pair_border: src_y:%p src_cb:%p src_cr:%p ls:%d uvls:%d\n", src_y, src_cb, src_cr, linesize, uvlinesize);

    /* step back two lines / one column to the edges of the pair */
    src_y  -= 2 *   linesize + 1;
    src_cb -= 2 * uvlinesize + 1;
    src_cr -= 2 * uvlinesize + 1;

/* b always receives the old a; a receives the old b only when xchg is set */
#define XCHG(a,b,t,xchg)\
t= a;\
if(xchg)\
    a= b;\
b= t;

    if(deblock_left){
        for(i = (!deblock_top)<<1; i<34; i++){
            XCHG(h->left_border[i     ], src_y [i*  linesize], temp8, xchg);
        }
    }

    if(deblock_top){
        XCHG(*(uint64_t*)(h->top_borders[0][s->mb_x]+0), *(uint64_t*)(src_y +1), temp64, xchg);
        XCHG(*(uint64_t*)(h->top_borders[0][s->mb_x]+8), *(uint64_t*)(src_y +9), temp64, 1);
        XCHG(*(uint64_t*)(h->top_borders[1][s->mb_x]+0), *(uint64_t*)(src_y +1 +linesize), temp64, xchg);
        XCHG(*(uint64_t*)(h->top_borders[1][s->mb_x]+8), *(uint64_t*)(src_y +9 +linesize), temp64, 1);
        if(s->mb_x+1 < s->mb_width){
            /* top-right corner, needed by diagonal intra predictors */
            XCHG(*(uint64_t*)(h->top_borders[0][s->mb_x+1]), *(uint64_t*)(src_y +17), temp64, 1);
            XCHG(*(uint64_t*)(h->top_borders[1][s->mb_x+1]), *(uint64_t*)(src_y +17 +linesize), temp64, 1);
        }
    }

    if(!(s->flags&CODEC_FLAG_GRAY)){
        if(deblock_left){
            for(i = (!deblock_top) << 1; i<18; i++){
                XCHG(h->left_border[i+34   ], src_cb[i*uvlinesize], temp8, xchg);
                XCHG(h->left_border[i+34+18], src_cr[i*uvlinesize], temp8, xchg);
            }
        }
        if(deblock_top){
            XCHG(*(uint64_t*)(h->top_borders[0][s->mb_x]+16), *(uint64_t*)(src_cb+1), temp64, 1);
            XCHG(*(uint64_t*)(h->top_borders[0][s->mb_x]+24), *(uint64_t*)(src_cr+1), temp64, 1);
            XCHG(*(uint64_t*)(h->top_borders[1][s->mb_x]+16), *(uint64_t*)(src_cb+1 +uvlinesize), temp64, 1);
            XCHG(*(uint64_t*)(h->top_borders[1][s->mb_x]+24), *(uint64_t*)(src_cr+1 +uvlinesize), temp64, 1);
        }
    }
}
3503

    
3504
static void hl_decode_mb(H264Context *h){
3505
    MpegEncContext * const s = &h->s;
3506
    const int mb_x= s->mb_x;
3507
    const int mb_y= s->mb_y;
3508
    const int mb_xy= mb_x + mb_y*s->mb_stride;
3509
    const int mb_type= s->current_picture.mb_type[mb_xy];
3510
    uint8_t  *dest_y, *dest_cb, *dest_cr;
3511
    int linesize, uvlinesize /*dct_offset*/;
3512
    int i;
3513
    int *block_offset = &h->block_offset[0];
3514
    const unsigned int bottom = mb_y & 1;
3515
    const int transform_bypass = (s->qscale == 0 && h->sps.transform_bypass);
3516
    void (*idct_add)(uint8_t *dst, DCTELEM *block, int stride);
3517
    void (*idct_dc_add)(uint8_t *dst, DCTELEM *block, int stride);
3518

    
3519
    if(!s->decode)
3520
        return;
3521

    
3522
    dest_y  = s->current_picture.data[0] + (mb_y * 16* s->linesize  ) + mb_x * 16;
3523
    dest_cb = s->current_picture.data[1] + (mb_y * 8 * s->uvlinesize) + mb_x * 8;
3524
    dest_cr = s->current_picture.data[2] + (mb_y * 8 * s->uvlinesize) + mb_x * 8;
3525

    
3526
    s->dsp.prefetch(dest_y + (s->mb_x&3)*4*s->linesize + 64, s->linesize, 4);
3527
    s->dsp.prefetch(dest_cb + (s->mb_x&7)*s->uvlinesize + 64, dest_cr - dest_cb, 2);
3528

    
3529
    if (MB_FIELD) {
3530
        linesize   = h->mb_linesize   = s->linesize * 2;
3531
        uvlinesize = h->mb_uvlinesize = s->uvlinesize * 2;
3532
        block_offset = &h->block_offset[24];
3533
        if(mb_y&1){ //FIXME move out of this func?
3534
            dest_y -= s->linesize*15;
3535
            dest_cb-= s->uvlinesize*7;
3536
            dest_cr-= s->uvlinesize*7;
3537
        }
3538
        if(FRAME_MBAFF) {
3539
            int list;
3540
            for(list=0; list<2; list++){
3541
                if(!USES_LIST(mb_type, list))
3542
                    continue;
3543
                if(IS_16X16(mb_type)){
3544
                    int8_t *ref = &h->ref_cache[list][scan8[0]];
3545
                    fill_rectangle(ref, 4, 4, 8, 16+*ref^(s->mb_y&1), 1);
3546
                }else{
3547
                    for(i=0; i<16; i+=4){
3548
                        //FIXME can refs be smaller than 8x8 when !direct_8x8_inference ?
3549
                        int ref = h->ref_cache[list][scan8[i]];
3550
                        if(ref >= 0)
3551
                            fill_rectangle(&h->ref_cache[list][scan8[i]], 2, 2, 8, 16+ref^(s->mb_y&1), 1);
3552
                    }
3553
                }
3554
            }
3555
        }
3556
    } else {
3557
        linesize   = h->mb_linesize   = s->linesize;
3558
        uvlinesize = h->mb_uvlinesize = s->uvlinesize;
3559
//        dct_offset = s->linesize * 16;
3560
    }
3561

    
3562
    if(transform_bypass){
3563
        idct_dc_add =
3564
        idct_add = IS_8x8DCT(mb_type) ? s->dsp.add_pixels8 : s->dsp.add_pixels4;
3565
    }else if(IS_8x8DCT(mb_type)){
3566
        idct_dc_add = s->dsp.h264_idct8_dc_add;
3567
        idct_add = s->dsp.h264_idct8_add;
3568
    }else{
3569
        idct_dc_add = s->dsp.h264_idct_dc_add;
3570
        idct_add = s->dsp.h264_idct_add;
3571
    }
3572

    
3573
    if(FRAME_MBAFF && h->deblocking_filter && IS_INTRA(mb_type)
3574
       && (!bottom || !IS_INTRA(s->current_picture.mb_type[mb_xy-s->mb_stride]))){
3575
        int mbt_y = mb_y&~1;
3576
        uint8_t *top_y  = s->current_picture.data[0] + (mbt_y * 16* s->linesize  ) + mb_x * 16;
3577
        uint8_t *top_cb = s->current_picture.data[1] + (mbt_y * 8 * s->uvlinesize) + mb_x * 8;
3578
        uint8_t *top_cr = s->current_picture.data[2] + (mbt_y * 8 * s->uvlinesize) + mb_x * 8;
3579
        xchg_pair_border(h, top_y, top_cb, top_cr, s->linesize, s->uvlinesize, 1);
3580
    }
3581

    
3582
    if (IS_INTRA_PCM(mb_type)) {
3583
        unsigned int x, y;
3584

    
3585
        // The pixels are stored in h->mb array in the same order as levels,
3586
        // copy them in output in the correct order.
3587
        for(i=0; i<16; i++) {
3588
            for (y=0; y<4; y++) {
3589
                for (x=0; x<4; x++) {
3590
                    *(dest_y + block_offset[i] + y*linesize + x) = h->mb[i*16+y*4+x];
3591
                }
3592
            }
3593
        }
3594
        for(i=16; i<16+4; i++) {
3595
            for (y=0; y<4; y++) {
3596
                for (x=0; x<4; x++) {
3597
                    *(dest_cb + block_offset[i] + y*uvlinesize + x) = h->mb[i*16+y*4+x];
3598
                }
3599
            }
3600
        }
3601
        for(i=20; i<20+4; i++) {
3602
            for (y=0; y<4; y++) {
3603
                for (x=0; x<4; x++) {
3604
                    *(dest_cr + block_offset[i] + y*uvlinesize + x) = h->mb[i*16+y*4+x];
3605
                }
3606
            }
3607
        }
3608
    } else {
3609
        if(IS_INTRA(mb_type)){
3610
            if(h->deblocking_filter && !FRAME_MBAFF)
3611
                xchg_mb_border(h, dest_y, dest_cb, dest_cr, linesize, uvlinesize, 1);
3612

    
3613
            if(!(s->flags&CODEC_FLAG_GRAY)){
3614
                h->pred8x8[ h->chroma_pred_mode ](dest_cb, uvlinesize);
3615
                h->pred8x8[ h->chroma_pred_mode ](dest_cr, uvlinesize);
3616
            }
3617

    
3618
            if(IS_INTRA4x4(mb_type)){
3619
                if(!s->encoding){
3620
                    if(IS_8x8DCT(mb_type)){
3621
                        for(i=0; i<16; i+=4){
3622
                            uint8_t * const ptr= dest_y + block_offset[i];
3623
                            const int dir= h->intra4x4_pred_mode_cache[ scan8[i] ];
3624
                            const int nnz = h->non_zero_count_cache[ scan8[i] ];
3625
                            h->pred8x8l[ dir ](ptr, (h->topleft_samples_available<<i)&0x8000,
3626
                                                   (h->topright_samples_available<<(i+1))&0x8000, linesize);
3627
                            if(nnz){
3628
                                if(nnz == 1 && h->mb[i*16])
3629
                                    idct_dc_add(ptr, h->mb + i*16, linesize);
3630
                                else
3631
                                    idct_add(ptr, h->mb + i*16, linesize);
3632
                            }
3633
                        }
3634
                    }else
3635
                    for(i=0; i<16; i++){
3636
                        uint8_t * const ptr= dest_y + block_offset[i];
3637
                        uint8_t *topright;
3638
                        const int dir= h->intra4x4_pred_mode_cache[ scan8[i] ];
3639
                        int nnz, tr;
3640

    
3641
                        if(dir == DIAG_DOWN_LEFT_PRED || dir == VERT_LEFT_PRED){
3642
                            const int topright_avail= (h->topright_samples_available<<i)&0x8000;
3643
                            assert(mb_y || linesize <= block_offset[i]);
3644
                            if(!topright_avail){
3645
                                tr= ptr[3 - linesize]*0x01010101;
3646
                                topright= (uint8_t*) &tr;
3647
                            }else
3648
                                topright= ptr + 4 - linesize;
3649
                        }else
3650
                            topright= NULL;
3651

    
3652
                        h->pred4x4[ dir ](ptr, topright, linesize);
3653
                        nnz = h->non_zero_count_cache[ scan8[i] ];
3654
                        if(nnz){
3655
                            if(s->codec_id == CODEC_ID_H264){
3656
                                if(nnz == 1 && h->mb[i*16])
3657
                                    idct_dc_add(ptr, h->mb + i*16, linesize);
3658
                                else
3659
                                    idct_add(ptr, h->mb + i*16, linesize);
3660
                            }else
3661
                                svq3_add_idct_c(ptr, h->mb + i*16, linesize, s->qscale, 0);
3662
                        }
3663
                    }
3664
                }
3665
            }else{
3666
                h->pred16x16[ h->intra16x16_pred_mode ](dest_y , linesize);
3667
                if(s->codec_id == CODEC_ID_H264){
3668
                    if(!transform_bypass)
3669
                        h264_luma_dc_dequant_idct_c(h->mb, s->qscale, h->dequant4_coeff[IS_INTRA(mb_type) ? 0:3][s->qscale][0]);
3670
                }else
3671
                    svq3_luma_dc_dequant_idct_c(h->mb, s->qscale);
3672
            }
3673
            if(h->deblocking_filter && !FRAME_MBAFF)
3674
                xchg_mb_border(h, dest_y, dest_cb, dest_cr, linesize, uvlinesize, 0);
3675
        }else if(s->codec_id == CODEC_ID_H264){
3676
            hl_motion(h, dest_y, dest_cb, dest_cr,
3677
                      s->me.qpel_put, s->dsp.put_h264_chroma_pixels_tab,
3678
                      s->me.qpel_avg, s->dsp.avg_h264_chroma_pixels_tab,
3679
                      s->dsp.weight_h264_pixels_tab, s->dsp.biweight_h264_pixels_tab);
3680
        }
3681

    
3682

    
3683
        if(!IS_INTRA4x4(mb_type)){
3684
            if(s->codec_id == CODEC_ID_H264){
3685
                if(IS_INTRA16x16(mb_type)){
3686
                    for(i=0; i<16; i++){
3687
                        if(h->non_zero_count_cache[ scan8[i] ])
3688
                            idct_add(dest_y + block_offset[i], h->mb + i*16, linesize);
3689
                        else if(h->mb[i*16])
3690
                            idct_dc_add(dest_y + block_offset[i], h->mb + i*16, linesize);
3691
                    }
3692
                }else{
3693
                    const int di = IS_8x8DCT(mb_type) ? 4 : 1;
3694
                    for(i=0; i<16; i+=di){
3695
                        int nnz = h->non_zero_count_cache[ scan8[i] ];
3696
                        if(nnz){
3697
                            if(nnz==1 && h->mb[i*16])
3698
                                idct_dc_add(dest_y + block_offset[i], h->mb + i*16, linesize);
3699
                            else
3700
                                idct_add(dest_y + block_offset[i], h->mb + i*16, linesize);
3701
                        }
3702
                    }
3703
                }
3704
            }else{
3705
                for(i=0; i<16; i++){
3706
                    if(h->non_zero_count_cache[ scan8[i] ] || h->mb[i*16]){ //FIXME benchmark weird rule, & below
3707
                        uint8_t * const ptr= dest_y + block_offset[i];
3708
                        svq3_add_idct_c(ptr, h->mb + i*16, linesize, s->qscale, IS_INTRA(mb_type) ? 1 : 0);
3709
                    }
3710
                }
3711
            }
3712
        }
3713

    
3714
        if(!(s->flags&CODEC_FLAG_GRAY)){
3715
            uint8_t *dest[2] = {dest_cb, dest_cr};
3716
            if(transform_bypass){
3717
                idct_add = idct_dc_add = s->dsp.add_pixels4;
3718
            }else{
3719
                idct_add = s->dsp.h264_idct_add;
3720
                idct_dc_add = s->dsp.h264_idct_dc_add;
3721
                chroma_dc_dequant_idct_c(h->mb + 16*16, h->chroma_qp, h->dequant4_coeff[IS_INTRA(mb_type) ? 1:4][h->chroma_qp][0]);
3722
                chroma_dc_dequant_idct_c(h->mb + 16*16+4*16, h->chroma_qp, h->dequant4_coeff[IS_INTRA(mb_type) ? 2:5][h->chroma_qp][0]);
3723
            }
3724
            if(s->codec_id == CODEC_ID_H264){
3725
                for(i=16; i<16+8; i++){
3726
                    if(h->non_zero_count_cache[ scan8[i] ])
3727
                        idct_add(dest[(i&4)>>2] + block_offset[i], h->mb + i*16, uvlinesize);
3728
                    else if(h->mb[i*16])
3729
                        idct_dc_add(dest[(i&4)>>2] + block_offset[i], h->mb + i*16, uvlinesize);
3730
                }
3731
            }else{
3732
                for(i=16; i<16+8; i++){
3733
                    if(h->non_zero_count_cache[ scan8[i] ] || h->mb[i*16]){
3734
                        uint8_t * const ptr= dest[(i&4)>>2] + block_offset[i];
3735
                        svq3_add_idct_c(ptr, h->mb + i*16, uvlinesize, chroma_qp[s->qscale + 12] - 12, 2);
3736
                    }
3737
                }
3738
            }
3739
        }
3740
    }
3741
    if(h->deblocking_filter) {
3742
        if (FRAME_MBAFF) {
3743
            //FIXME try deblocking one mb at a time?
3744
            // the reduction in load/storing mvs and such might outweigh the extra backup/xchg_border
3745
            const int mb_y = s->mb_y - 1;
3746
            uint8_t  *pair_dest_y, *pair_dest_cb, *pair_dest_cr;
3747
            const int mb_xy= mb_x + mb_y*s->mb_stride;
3748
            const int mb_type_top   = s->current_picture.mb_type[mb_xy];
3749
            const int mb_type_bottom= s->current_picture.mb_type[mb_xy+s->mb_stride];
3750
            if (!bottom) return;
3751
            pair_dest_y  = s->current_picture.data[0] + (mb_y * 16* s->linesize  ) + mb_x * 16;
3752
            pair_dest_cb = s->current_picture.data[1] + (mb_y * 8 * s->uvlinesize) + mb_x * 8;
3753
            pair_dest_cr = s->current_picture.data[2] + (mb_y * 8 * s->uvlinesize) + mb_x * 8;
3754

    
3755
            if(IS_INTRA(mb_type_top | mb_type_bottom))
3756
                xchg_pair_border(h, pair_dest_y, pair_dest_cb, pair_dest_cr, s->linesize, s->uvlinesize, 0);
3757

    
3758
            backup_pair_border(h, pair_dest_y, pair_dest_cb, pair_dest_cr, s->linesize, s->uvlinesize);
3759
            // deblock a pair
3760
            // top
3761
            s->mb_y--;
3762
            tprintf("call mbaff filter_mb mb_x:%d mb_y:%d pair_dest_y = %p, dest_y = %p\n", mb_x, mb_y, pair_dest_y, dest_y);
3763
            fill_caches(h, mb_type_top, 1); //FIXME don't fill stuff which isn't used by filter_mb
3764
            h->chroma_qp = get_chroma_qp(h->pps.chroma_qp_index_offset, s->current_picture.qscale_table[mb_xy]);
3765
            filter_mb(h, mb_x, mb_y, pair_dest_y, pair_dest_cb, pair_dest_cr, linesize, uvlinesize);
3766
            // bottom
3767
            s->mb_y++;
3768
            tprintf("call mbaff filter_mb\n");
3769
            fill_caches(h, mb_type_bottom, 1); //FIXME don't fill stuff which isn't used by filter_mb
3770
            h->chroma_qp = get_chroma_qp(h->pps.chroma_qp_index_offset, s->current_picture.qscale_table[mb_xy+s->mb_stride]);
3771
            filter_mb(h, mb_x, mb_y+1, dest_y, dest_cb, dest_cr, linesize, uvlinesize);
3772
        } else {
3773
            tprintf("call filter_mb\n");
3774
            backup_mb_border(h, dest_y, dest_cb, dest_cr, linesize, uvlinesize);
3775
            fill_caches(h, mb_type, 1); //FIXME don't fill stuff which isn't used by filter_mb
3776
            filter_mb_fast(h, mb_x, mb_y, dest_y, dest_cb, dest_cr, linesize, uvlinesize);
3777
        }
3778
    }
3779
}
3780

    
3781
/**
3782
 * fills the default_ref_list.
3783
 */
3784
static int fill_default_ref_list(H264Context *h){
3785
    MpegEncContext * const s = &h->s;
3786
    int i;
3787
    int smallest_poc_greater_than_current = -1;
3788
    Picture sorted_short_ref[32];
3789

    
3790
    if(h->slice_type==B_TYPE){
3791
        int out_i;
3792
        int limit= INT_MIN;
3793

    
3794
        /* sort frame according to poc in B slice */
3795
        for(out_i=0; out_i<h->short_ref_count; out_i++){
3796
            int best_i=INT_MIN;
3797
            int best_poc=INT_MAX;
3798

    
3799
            for(i=0; i<h->short_ref_count; i++){
3800
                const int poc= h->short_ref[i]->poc;
3801
                if(poc > limit && poc < best_poc){
3802
                    best_poc= poc;
3803
                    best_i= i;
3804
                }
3805
            }
3806

    
3807
            assert(best_i != INT_MIN);
3808

    
3809
            limit= best_poc;
3810
            sorted_short_ref[out_i]= *h->short_ref[best_i];
3811
            tprintf("sorted poc: %d->%d poc:%d fn:%d\n", best_i, out_i, sorted_short_ref[out_i].poc, sorted_short_ref[out_i].frame_num);
3812
            if (-1 == smallest_poc_greater_than_current) {
3813
                if (h->short_ref[best_i]->poc >= s->current_picture_ptr->poc) {
3814
                    smallest_poc_greater_than_current = out_i;
3815
                }
3816
            }
3817
        }
3818
    }
3819

    
3820
    if(s->picture_structure == PICT_FRAME){
3821
        if(h->slice_type==B_TYPE){
3822
            int list;
3823
            tprintf("current poc: %d, smallest_poc_greater_than_current: %d\n", s->current_picture_ptr->poc, smallest_poc_greater_than_current);
3824

    
3825
            // find the largest poc
3826
            for(list=0; list<2; list++){
3827
                int index = 0;
3828
                int j= -99;
3829
                int step= list ? -1 : 1;
3830

    
3831
                for(i=0; i<h->short_ref_count && index < h->ref_count[list]; i++, j+=step) {
3832
                    while(j<0 || j>= h->short_ref_count){
3833
                        if(j != -99 && step == (list ? -1 : 1))
3834
                            return -1;
3835
                        step = -step;
3836
                        j= smallest_poc_greater_than_current + (step>>1);
3837
                    }
3838
                    if(sorted_short_ref[j].reference != 3) continue;
3839
                    h->default_ref_list[list][index  ]= sorted_short_ref[j];
3840
                    h->default_ref_list[list][index++].pic_id= sorted_short_ref[j].frame_num;
3841
                }
3842

    
3843
                for(i = 0; i < 16 && index < h->ref_count[ list ]; i++){
3844
                    if(h->long_ref[i] == NULL) continue;
3845
                    if(h->long_ref[i]->reference != 3) continue;
3846

    
3847
                    h->default_ref_list[ list ][index  ]= *h->long_ref[i];
3848
                    h->default_ref_list[ list ][index++].pic_id= i;;
3849
                }
3850

    
3851
                if(list && (smallest_poc_greater_than_current<=0 || smallest_poc_greater_than_current>=h->short_ref_count) && (1 < index)){
3852
                    // swap the two first elements of L1 when
3853
                    // L0 and L1 are identical
3854
                    Picture temp= h->default_ref_list[1][0];
3855
                    h->default_ref_list[1][0] = h->default_ref_list[1][1];
3856
                    h->default_ref_list[1][1] = temp;
3857
                }
3858

    
3859
                if(index < h->ref_count[ list ])
3860
                    memset(&h->default_ref_list[list][index], 0, sizeof(Picture)*(h->ref_count[ list ] - index));
3861
            }
3862
        }else{
3863
            int index=0;
3864
            for(i=0; i<h->short_ref_count; i++){
3865
                if(h->short_ref[i]->reference != 3) continue; //FIXME refernce field shit
3866
                h->default_ref_list[0][index  ]= *h->short_ref[i];
3867
                h->default_ref_list[0][index++].pic_id= h->short_ref[i]->frame_num;
3868
            }
3869
            for(i = 0; i < 16; i++){
3870
                if(h->long_ref[i] == NULL) continue;
3871
                if(h->long_ref[i]->reference != 3) continue;
3872
                h->default_ref_list[0][index  ]= *h->long_ref[i];
3873
                h->default_ref_list[0][index++].pic_id= i;;
3874
            }
3875
            if(index < h->ref_count[0])
3876
                memset(&h->default_ref_list[0][index], 0, sizeof(Picture)*(h->ref_count[0] - index));
3877
        }
3878
    }else{ //FIELD
3879
        if(h->slice_type==B_TYPE){
3880
        }else{
3881
            //FIXME second field balh
3882
        }
3883
    }
3884
#ifdef TRACE
3885
    for (i=0; i<h->ref_count[0]; i++) {
3886
        tprintf("List0: %s fn:%d 0x%p\n", (h->default_ref_list[0][i].long_ref ? "LT" : "ST"), h->default_ref_list[0][i].pic_id, h->default_ref_list[0][i].data[0]);
3887
    }
3888
    if(h->slice_type==B_TYPE){
3889
        for (i=0; i<h->ref_count[1]; i++) {
3890
            tprintf("List1: %s fn:%d 0x%p\n", (h->default_ref_list[1][i].long_ref ? "LT" : "ST"), h->default_ref_list[1][i].pic_id, h->default_ref_list[0][i].data[0]);
3891
        }
3892
    }
3893
#endif
3894
    return 0;
3895
}
3896

    
3897
static void print_short_term(H264Context *h);
3898
static void print_long_term(H264Context *h);
3899

    
3900
/**
 * Decodes the ref_pic_list_reordering() syntax element and reorders
 * h->ref_list accordingly, starting from the default reference lists.
 *
 * @return 0 on success, -1 on a bitstream error
 */
static int decode_ref_pic_list_reordering(H264Context *h){
    MpegEncContext * const s = &h->s;
    int list, index;

    print_short_term(h);
    print_long_term(h);
    if(h->slice_type==I_TYPE || h->slice_type==SI_TYPE) return 0; //FIXME move before func

    for(list=0; list<2; list++){
        memcpy(h->ref_list[list], h->default_ref_list[list], sizeof(Picture)*h->ref_count[list]);

        if(get_bits1(&s->gb)){   /* ref_pic_list_reordering_flag */
            int pred= h->curr_pic_num;

            for(index=0; ; index++){
                int reordering_of_pic_nums_idc= get_ue_golomb(&s->gb);
                int pic_id;
                int i;
                Picture *ref = NULL;

                if(reordering_of_pic_nums_idc==3)  /* end of reordering loop */
                    break;

                if(index >= h->ref_count[list]){
                    av_log(h->s.avctx, AV_LOG_ERROR, "reference count overflow\n");
                    return -1;
                }

                if(reordering_of_pic_nums_idc<3){
                    if(reordering_of_pic_nums_idc<2){
                        /* short term: pic num is predicted, wrapped to max_pic_num */
                        const int abs_diff_pic_num= get_ue_golomb(&s->gb) + 1;

                        if(abs_diff_pic_num >= h->max_pic_num){
                            av_log(h->s.avctx, AV_LOG_ERROR, "abs_diff_pic_num overflow\n");
                            return -1;
                        }

                        if(reordering_of_pic_nums_idc == 0) pred-= abs_diff_pic_num;
                        else                                pred+= abs_diff_pic_num;
                        pred &= h->max_pic_num - 1;

                        for(i= h->short_ref_count-1; i>=0; i--){
                            ref = h->short_ref[i];
                            assert(ref->reference == 3);
                            assert(!ref->long_ref);
                            if(ref->data[0] != NULL && ref->frame_num == pred && ref->long_ref == 0) // ignore non existing pictures by testing data[0] pointer
                                break;
                        }
                        if(i>=0)
                            ref->pic_id= ref->frame_num;
                    }else{
                        pic_id= get_ue_golomb(&s->gb); //long_term_pic_idx
                        /* validate before indexing: h->long_ref has 16 slots and
                         * unused slots are NULL; a hostile stream must not crash us */
                        if(pic_id>=16 || !h->long_ref[pic_id]){
                            av_log(h->s.avctx, AV_LOG_ERROR, "illegal long_term_pic_idx %d\n", pic_id);
                            return -1;
                        }
                        ref = h->long_ref[pic_id];
                        ref->pic_id= pic_id;
                        assert(ref->reference == 3);
                        assert(ref->long_ref);
                        i=0;
                    }

                    if (i < 0) {
                        av_log(h->s.avctx, AV_LOG_ERROR, "reference picture missing during reorder\n");
                        memset(&h->ref_list[list][index], 0, sizeof(Picture)); //FIXME
                    } else {
                        /* shift the tail of the list up and insert the picked ref */
                        for(i=index; i+1<h->ref_count[list]; i++){
                            if(ref->long_ref == h->ref_list[list][i].long_ref && ref->pic_id == h->ref_list[list][i].pic_id)
                                break;
                        }
                        for(; i > index; i--){
                            h->ref_list[list][i]= h->ref_list[list][i-1];
                        }
                        h->ref_list[list][index]= *ref;
                    }
                }else{
                    av_log(h->s.avctx, AV_LOG_ERROR, "illegal reordering_of_pic_nums_idc\n");
                    return -1;
                }
            }
        }

        if(h->slice_type!=B_TYPE) break;
    }
    /* replace any missing entry with the current picture so the decoder
     * never dereferences a NULL data pointer */
    for(list=0; list<2; list++){
        for(index= 0; index < h->ref_count[list]; index++){
            if(!h->ref_list[list][index].data[0])
                h->ref_list[list][index]= s->current_picture;
        }
        if(h->slice_type!=B_TYPE) break;
    }

    if(h->slice_type==B_TYPE && !h->direct_spatial_mv_pred)
        direct_dist_scale_factor(h);
    direct_ref_list_init(h);
    return 0;
}
3994

    
3995
/**
 * Derives the per-field reference entries (slots 16..) from the frame
 * references, for MBAFF decoding, and duplicates the weight tables so the
 * field entries use the same prediction weights as their parent frame.
 */
static void fill_mbaff_ref_list(H264Context *h){
    int list, i, j;

    for(list=0; list<2; list++){
        for(i=0; i<h->ref_count[list]; i++){
            Picture *frame = &h->ref_list[list][i];
            Picture *field = &h->ref_list[list][16+2*i];
            const int fi = 16+2*i;

            /* top field: same origin as the frame, strides doubled */
            field[0] = *frame;
            for(j=0; j<3; j++)
                field[0].linesize[j] <<= 1;

            /* bottom field: same as top but shifted one frame line down */
            field[1] = field[0];
            for(j=0; j<3; j++)
                field[1].data[j] += frame->linesize[j];

            h->luma_weight[list][fi] = h->luma_weight[list][fi+1] = h->luma_weight[list][i];
            h->luma_offset[list][fi] = h->luma_offset[list][fi+1] = h->luma_offset[list][i];
            for(j=0; j<2; j++){
                h->chroma_weight[list][fi][j] = h->chroma_weight[list][fi+1][j] = h->chroma_weight[list][i][j];
                h->chroma_offset[list][fi][j] = h->chroma_offset[list][fi+1][j] = h->chroma_offset[list][i][j];
            }
        }
    }
    /* implicit weights: copy each row out to the field columns, then clone
     * whole rows for the field entries of list1 */
    for(j=0; j<h->ref_count[1]; j++){
        for(i=0; i<h->ref_count[0]; i++)
            h->implicit_weight[j][16+2*i] = h->implicit_weight[j][16+2*i+1] = h->implicit_weight[j][i];
        memcpy(h->implicit_weight[16+2*j],   h->implicit_weight[j], sizeof(*h->implicit_weight));
        memcpy(h->implicit_weight[16+2*j+1], h->implicit_weight[j], sizeof(*h->implicit_weight));
    }
}
4023

    
4024
/**
 * Parses pred_weight_table() from the slice header: explicit luma/chroma
 * weights and offsets per reference, for list0 and (in B slices) list1.
 * Sets h->use_weight / h->use_weight_chroma when any weight differs from
 * the default.
 *
 * @return 0 (always succeeds)
 */
static int pred_weight_table(H264Context *h){
    MpegEncContext * const s = &h->s;
    int lx, ref, j;
    int luma_def, chroma_def;

    h->use_weight= 0;
    h->use_weight_chroma= 0;
    h->luma_log2_weight_denom= get_ue_golomb(&s->gb);
    h->chroma_log2_weight_denom= get_ue_golomb(&s->gb);
    /* the "no-op" weight for each plane is 1 << denom */
    luma_def  = 1<<h->luma_log2_weight_denom;
    chroma_def= 1<<h->chroma_log2_weight_denom;

    for(lx=0; lx<2; lx++){
        for(ref=0; ref<h->ref_count[lx]; ref++){
            if(get_bits1(&s->gb)){  /* luma_weight_flag */
                h->luma_weight[lx][ref]= get_se_golomb(&s->gb);
                h->luma_offset[lx][ref]= get_se_golomb(&s->gb);
                if(   h->luma_weight[lx][ref] != luma_def
                   || h->luma_offset[lx][ref] != 0)
                    h->use_weight= 1;
            }else{
                h->luma_weight[lx][ref]= luma_def;
                h->luma_offset[lx][ref]= 0;
            }

            if(get_bits1(&s->gb)){  /* chroma_weight_flag */
                for(j=0; j<2; j++){
                    h->chroma_weight[lx][ref][j]= get_se_golomb(&s->gb);
                    h->chroma_offset[lx][ref][j]= get_se_golomb(&s->gb);
                    if(   h->chroma_weight[lx][ref][j] != chroma_def
                       || h->chroma_offset[lx][ref][j] != 0)
                        h->use_weight_chroma= 1;
                }
            }else{
                for(j=0; j<2; j++){
                    h->chroma_weight[lx][ref][j]= chroma_def;
                    h->chroma_offset[lx][ref][j]= 0;
                }
            }
        }
        if(h->slice_type != B_TYPE) break;  /* only B slices carry list1 */
    }
    if(h->use_weight_chroma)
        h->use_weight= 1;
    return 0;
}
4075

    
4076
/**
 * Computes the implicit bi-prediction weight table from the poc distances
 * between each (list0, list1) reference pair and the current picture.
 */
static void implicit_weight_table(H264Context *h){
    MpegEncContext * const s = &h->s;
    int i, j;
    const int cur_poc = s->current_picture_ptr->poc;

    /* a single, temporally symmetric reference pair degenerates to a plain
     * average -> no weighting needed at all */
    if(   h->ref_count[0] == 1 && h->ref_count[1] == 1
       && h->ref_list[0][0].poc + h->ref_list[1][0].poc == 2*cur_poc){
        h->use_weight= 0;
        h->use_weight_chroma= 0;
        return;
    }

    h->use_weight= 2;          /* 2 == implicit weighting */
    h->use_weight_chroma= 2;
    h->luma_log2_weight_denom= 5;
    h->chroma_log2_weight_denom= 5;

    for(i=0; i<h->ref_count[0]; i++){
        const int poc0 = h->ref_list[0][i].poc;
        for(j=0; j<h->ref_count[1]; j++){
            const int poc1 = h->ref_list[1][j].poc;
            const int td = clip(poc1 - poc0, -128, 127);
            int w = 32;        /* default: equal weighting */
            if(td){
                const int tb = clip(cur_poc - poc0, -128, 127);
                const int tx = (16384 + (FFABS(td) >> 1)) / td;
                const int dist_scale_factor = clip((tb*tx + 32) >> 6, -1024, 1023) >> 2;
                /* out-of-range scale factors fall back to equal weighting */
                if(dist_scale_factor >= -64 && dist_scale_factor <= 128)
                    w = 64 - dist_scale_factor;
            }
            h->implicit_weight[i][j] = w;
        }
    }
}
4111

    
4112
/**
 * Clears the reference flag of a picture, unless it is still pending
 * output (delayed_output_pic or one of the delayed_pic entries), in which
 * case the flag stays set so the frame buffer is not recycled.
 */
static inline void unreference_pic(H264Context *h, Picture *pic){
    int k;

    if(pic == h->delayed_output_pic){
        pic->reference= 1;
        return;
    }
    pic->reference= 0;
    for(k= 0; h->delayed_pic[k]; k++){
        if(h->delayed_pic[k] == pic){
            pic->reference= 1;
            return;
        }
    }
}
4125

    
4126
/**
4127
 * instantaneous decoder refresh.
4128
 */
4129
static void idr(H264Context *h){
4130
    int i;
4131

    
4132
    for(i=0; i<16; i++){
4133
        if (h->long_ref[i] != NULL) {
4134
            unreference_pic(h, h->long_ref[i]);
4135
            h->long_ref[i]= NULL;
4136
        }
4137
    }
4138
    h->long_ref_count=0;
4139

    
4140
    for(i=0; i<h->short_ref_count; i++){
4141
        unreference_pic(h, h->short_ref[i]);
4142
        h->short_ref[i]= NULL;
4143
    }
4144
    h->short_ref_count=0;
4145
}
4146

    
4147
/* forget old pics after a seek */
4148
static void flush_dpb(AVCodecContext *avctx){
4149
    H264Context *h= avctx->priv_data;
4150
    int i;
4151
    for(i=0; i<16; i++) {
4152
        if(h->delayed_pic[i])
4153
            h->delayed_pic[i]->reference= 0;
4154
        h->delayed_pic[i]= NULL;
4155
    }
4156
    if(h->delayed_output_pic)
4157
        h->delayed_output_pic->reference= 0;
4158
    h->delayed_output_pic= NULL;
4159
    idr(h);
4160
    if(h->s.current_picture_ptr)
4161
        h->s.current_picture_ptr->reference= 0;
4162
}
4163

    
4164
/**
4165
 *
4166
 * @return the removed picture or NULL if an error occurs
4167
 */
4168
static Picture * remove_short(H264Context *h, int frame_num){
4169
    MpegEncContext * const s = &h->s;
4170
    int i;
4171

    
4172
    if(s->avctx->debug&FF_DEBUG_MMCO)
4173
        av_log(h->s.avctx, AV_LOG_DEBUG, "remove short %d count %d\n", frame_num, h->short_ref_count);
4174

    
4175
    for(i=0; i<h->short_ref_count; i++){
4176
        Picture *pic= h->short_ref[i];
4177
        if(s->avctx->debug&FF_DEBUG_MMCO)
4178
            av_log(h->s.avctx, AV_LOG_DEBUG, "%d %d %p\n", i, pic->frame_num, pic);
4179
        if(pic->frame_num == frame_num){
4180
            h->short_ref[i]= NULL;
4181
            memmove(&h->short_ref[i], &h->short_ref[i+1], (h->short_ref_count - i - 1)*sizeof(Picture*));
4182
            h->short_ref_count--;
4183
            return pic;
4184
        }
4185
    }
4186
    return NULL;
4187
}
4188

    
4189
/**
4190
 *
4191
 * @return the removed picture or NULL if an error occurs
4192
 */
4193
static Picture * remove_long(H264Context *h, int i){
4194
    Picture *pic;
4195

    
4196
    pic= h->long_ref[i];
4197
    h->long_ref[i]= NULL;
4198
    if(pic) h->long_ref_count--;
4199

    
4200
    return pic;
4201
}
4202

    
4203
/**
4204
 * print short term list
4205
 */
4206
static void print_short_term(H264Context *h) {
4207
    uint32_t i;
4208
    if(h->s.avctx->debug&FF_DEBUG_MMCO) {
4209
        av_log(h->s.avctx, AV_LOG_DEBUG, "short term list:\n");
4210
        for(i=0; i<h->short_ref_count; i++){
4211
            Picture *pic= h->short_ref[i];
4212
            av_log(h->s.avctx, AV_LOG_DEBUG, "%d fn:%d poc:%d %p\n", i, pic->frame_num, pic->poc, pic->data[0]);
4213
        }
4214
    }
4215
}
4216

    
4217
/**
4218
 * print long term list
4219
 */
4220
static void print_long_term(H264Context *h) {
4221
    uint32_t i;
4222
    if(h->s.avctx->debug&FF_DEBUG_MMCO) {
4223
        av_log(h->s.avctx, AV_LOG_DEBUG, "long term list:\n");
4224
        for(i = 0; i < 16; i++){
4225
            Picture *pic= h->long_ref[i];
4226
            if (pic) {
4227
                av_log(h->s.avctx, AV_LOG_DEBUG, "%d fn:%d poc:%d %p\n", i, pic->frame_num, pic->poc, pic->data[0]);
4228
            }
4229
        }
4230
    }
4231
}
4232

    
4233
/**
4234
 * Executes the reference picture marking (memory management control operations).
4235
 */
4236
static int execute_ref_pic_marking(H264Context *h, MMCO *mmco, int mmco_count){
4237
    MpegEncContext * const s = &h->s;
4238
    int i, j;
4239
    int current_is_long=0;
4240
    Picture *pic;
4241

    
4242
    if((s->avctx->debug&FF_DEBUG_MMCO) && mmco_count==0)
4243
        av_log(h->s.avctx, AV_LOG_DEBUG, "no mmco here\n");
4244

    
4245
    for(i=0; i<mmco_count; i++){
4246
        if(s->avctx->debug&FF_DEBUG_MMCO)
4247
            av_log(h->s.avctx, AV_LOG_DEBUG, "mmco:%d %d %d\n", h->mmco[i].opcode, h->mmco[i].short_frame_num, h->mmco[i].long_index);
4248

    
4249
        switch(mmco[i].opcode){
4250
        case MMCO_SHORT2UNUSED:
4251
            pic= remove_short(h, mmco[i].short_frame_num);
4252
            if(pic)
4253
                unreference_pic(h, pic);
4254
            else if(s->avctx->debug&FF_DEBUG_MMCO)
4255
                av_log(h->s.avctx, AV_LOG_DEBUG, "mmco: remove_short() failure\n");
4256
            break;
4257
        case MMCO_SHORT2LONG:
4258
            pic= remove_long(h, mmco[i].long_index);
4259
            if(pic) unreference_pic(h, pic);
4260

    
4261
            h->long_ref[ mmco[i].long_index ]= remove_short(h, mmco[i].short_frame_num);
4262
            h->long_ref[ mmco[i].long_index ]->long_ref=1;
4263
            h->long_ref_count++;