Statistics
| Branch: | Revision:

ffmpeg / libavcodec / vp8.c @ e9266a2b

History | View | Annotate | Download (65.6 KB)

1
/**
2
 * VP8 compatible video decoder
3
 *
4
 * Copyright (C) 2010 David Conrad
5
 * Copyright (C) 2010 Ronald S. Bultje
6
 * Copyright (C) 2010 Jason Garrett-Glaser
7
 *
8
 * This file is part of FFmpeg.
9
 *
10
 * FFmpeg is free software; you can redistribute it and/or
11
 * modify it under the terms of the GNU Lesser General Public
12
 * License as published by the Free Software Foundation; either
13
 * version 2.1 of the License, or (at your option) any later version.
14
 *
15
 * FFmpeg is distributed in the hope that it will be useful,
16
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
18
 * Lesser General Public License for more details.
19
 *
20
 * You should have received a copy of the GNU Lesser General Public
21
 * License along with FFmpeg; if not, write to the Free Software
22
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
23
 */
24

    
25
#include "libavcore/imgutils.h"
26
#include "avcodec.h"
27
#include "vp56.h"
28
#include "vp8data.h"
29
#include "vp8dsp.h"
30
#include "h264pred.h"
31
#include "rectangle.h"
32

    
33
/**
 * Loop-filter parameters for a single macroblock.
 * update_dimensions() allocates one entry per macroblock column.
 */
typedef struct VP8FilterStrength {
    uint8_t filter_level;
    uint8_t inner_limit;
    uint8_t inner_filter;
} VP8FilterStrength;
38

    
39
typedef struct {
40
    uint8_t skip;
41
    // todo: make it possible to check for at least (i4x4 or split_mv)
42
    // in one op. are others needed?
43
    uint8_t mode;
44
    uint8_t ref_frame;
45
    uint8_t partitioning;
46
    VP56mv mv;
47
    VP56mv bmv[16];
48
} VP8Macroblock;
49

    
50
typedef struct {
51
    AVCodecContext *avctx;
52
    DSPContext dsp;
53
    VP8DSPContext vp8dsp;
54
    H264PredContext hpc;
55
    vp8_mc_func put_pixels_tab[3][3][3];
56
    AVFrame frames[4];
57
    AVFrame *framep[4];
58
    uint8_t *edge_emu_buffer;
59
    VP56RangeCoder c;   ///< header context, includes mb modes and motion vectors
60
    int profile;
61

    
62
    int mb_width;   /* number of horizontal MB */
63
    int mb_height;  /* number of vertical MB */
64
    int linesize;
65
    int uvlinesize;
66

    
67
    int keyframe;
68
    int invisible;
69
    int update_last;    ///< update VP56_FRAME_PREVIOUS with the current one
70
    int update_golden;  ///< VP56_FRAME_NONE if not updated, or which frame to copy if so
71
    int update_altref;
72
    int deblock_filter;
73

    
74
    /**
75
     * If this flag is not set, all the probability updates
76
     * are discarded after this frame is decoded.
77
     */
78
    int update_probabilities;
79

    
80
    /**
81
     * All coefficients are contained in separate arith coding contexts.
82
     * There can be 1, 2, 4, or 8 of these after the header context.
83
     */
84
    int num_coeff_partitions;
85
    VP56RangeCoder coeff_partition[8];
86

    
87
    VP8Macroblock *macroblocks;
88
    VP8Macroblock *macroblocks_base;
89
    VP8FilterStrength *filter_strength;
90

    
91
    uint8_t *intra4x4_pred_mode_top;
92
    uint8_t intra4x4_pred_mode_left[4];
93
    uint8_t *segmentation_map;
94

    
95
    /**
96
     * Cache of the top row needed for intra prediction
97
     * 16 for luma, 8 for each chroma plane
98
     */
99
    uint8_t (*top_border)[16+8+8];
100

    
101
    /**
102
     * For coeff decode, we need to know whether the above block had non-zero
103
     * coefficients. This means for each macroblock, we need data for 4 luma
104
     * blocks, 2 u blocks, 2 v blocks, and the luma dc block, for a total of 9
105
     * per macroblock. We keep the last row in top_nnz.
106
     */
107
    uint8_t (*top_nnz)[9];
108
    DECLARE_ALIGNED(8, uint8_t, left_nnz)[9];
109

    
110
    /**
111
     * This is the index plus one of the last non-zero coeff
112
     * for each of the blocks in the current macroblock.
113
     * So, 0 -> no coeffs
114
     *     1 -> dc-only (special transform)
115
     *     2+-> full transform
116
     */
117
    DECLARE_ALIGNED(16, uint8_t, non_zero_count_cache)[6][4];
118
    DECLARE_ALIGNED(16, DCTELEM, block)[6][4][16];
119
    DECLARE_ALIGNED(16, DCTELEM, block_dc)[16];
120
    uint8_t intra4x4_pred_mode_mb[16];
121

    
122
    int chroma_pred_mode;    ///< 8x8c pred mode of the current macroblock
123
    int segment;             ///< segment of the current macroblock
124

    
125
    int mbskip_enabled;
126
    int sign_bias[4]; ///< one state [0, 1] per ref frame type
127
    int ref_count[3];
128

    
129
    /**
130
     * Base parameters for segmentation, i.e. per-macroblock parameters.
131
     * These must be kept unchanged even if segmentation is not used for
132
     * a frame, since the values persist between interframes.
133
     */
134
    struct {
135
        int enabled;
136
        int absolute_vals;
137
        int update_map;
138
        int8_t base_quant[4];
139
        int8_t filter_level[4];     ///< base loop filter level
140
    } segmentation;
141

    
142
    /**
143
     * Macroblocks can have one of 4 different quants in a frame when
144
     * segmentation is enabled.
145
     * If segmentation is disabled, only the first segment's values are used.
146
     */
147
    struct {
148
        // [0] - DC qmul  [1] - AC qmul
149
        int16_t luma_qmul[2];
150
        int16_t luma_dc_qmul[2];    ///< luma dc-only block quant
151
        int16_t chroma_qmul[2];
152
    } qmat[4];
153

    
154
    struct {
155
        int simple;
156
        int level;
157
        int sharpness;
158
    } filter;
159

    
160
    struct {
161
        int enabled;    ///< whether each mb can have a different strength based on mode/ref
162

    
163
        /**
164
         * filter strength adjustment for the following macroblock modes:
165
         * [0-3] - i16x16 (always zero)
166
         * [4]   - i4x4
167
         * [5]   - zero mv
168
         * [6]   - inter modes except for zero or split mv
169
         * [7]   - split mv
170
         *  i16x16 modes never have any adjustment
171
         */
172
        int8_t mode[VP8_MVMODE_SPLIT+1];
173

    
174
        /**
175
         * filter strength adjustment for macroblocks that reference:
176
         * [0] - intra / VP56_FRAME_CURRENT
177
         * [1] - VP56_FRAME_PREVIOUS
178
         * [2] - VP56_FRAME_GOLDEN
179
         * [3] - altref / VP56_FRAME_GOLDEN2
180
         */
181
        int8_t ref[4];
182
    } lf_delta;
183

    
184
    /**
185
     * These are all of the updatable probabilities for binary decisions.
186
     * They are only implictly reset on keyframes, making it quite likely
187
     * for an interframe to desync if a prior frame's header was corrupt
188
     * or missing outright!
189
     */
190
    struct {
191
        uint8_t segmentid[3];
192
        uint8_t mbskip;
193
        uint8_t intra;
194
        uint8_t last;
195
        uint8_t golden;
196
        uint8_t pred16x16[4];
197
        uint8_t pred8x8c[3];
198
        /* Padded to allow overreads */
199
        uint8_t token[4][17][3][NUM_DCT_TOKENS-1];
200
        uint8_t mvc[2][19];
201
    } prob[2];
202
} VP8Context;
203

    
204
/**
 * Release every held frame buffer and free all per-stream buffers.
 *
 * After the call framep[] is zeroed, all buffer pointers freed here are
 * NULL (av_freep) and s->macroblocks is NULL.
 */
static void vp8_decode_flush(AVCodecContext *avctx)
{
    VP8Context *s = avctx->priv_data;
    AVFrame *frame;

    /* only frames that currently own picture data need releasing */
    for (frame = s->frames; frame < s->frames + 4; frame++) {
        if (frame->data[0])
            avctx->release_buffer(avctx, frame);
    }
    memset(s->framep, 0, sizeof(s->framep));

    av_freep(&s->macroblocks_base);
    s->macroblocks = NULL;  /* pointed into macroblocks_base */

    av_freep(&s->filter_strength);
    av_freep(&s->intra4x4_pred_mode_top);
    av_freep(&s->top_nnz);
    av_freep(&s->edge_emu_buffer);
    av_freep(&s->top_border);
    av_freep(&s->segmentation_map);
}
224

    
225
/**
 * Switch the decoder to a new frame size and (re)allocate every buffer
 * whose size depends on the macroblock dimensions.
 *
 * All previously held frames and buffers are dropped first via
 * vp8_decode_flush().
 *
 * @param s      decoder context
 * @param width  new picture width in pixels
 * @param height new picture height in pixels
 * @return 0 on success, AVERROR_INVALIDDATA if the size is rejected by
 *         av_image_check_size(), AVERROR(ENOMEM) if an allocation fails
 */
static int update_dimensions(VP8Context *s, int width, int height)
{
    if (av_image_check_size(width, height, 0, s->avctx))
        return AVERROR_INVALIDDATA;

    vp8_decode_flush(s->avctx);

    avcodec_set_dimensions(s->avctx, width, height);

    s->mb_width  = (s->avctx->coded_width +15) / 16;
    s->mb_height = (s->avctx->coded_height+15) / 16;

    s->macroblocks_base        = av_mallocz((s->mb_width+s->mb_height*2+1)*sizeof(*s->macroblocks));
    s->filter_strength         = av_mallocz(s->mb_width*sizeof(*s->filter_strength));
    s->intra4x4_pred_mode_top  = av_mallocz(s->mb_width*4);
    s->top_nnz                 = av_mallocz(s->mb_width*sizeof(*s->top_nnz));
    s->top_border              = av_mallocz((s->mb_width+1)*sizeof(*s->top_border));
    s->segmentation_map        = av_mallocz(s->mb_width*s->mb_height);

    if (!s->macroblocks_base || !s->filter_strength || !s->intra4x4_pred_mode_top ||
        !s->top_nnz || !s->top_border || !s->segmentation_map) {
        /*
         * Do not keep a partial set of buffers around: the caller decides
         * whether to reallocate by testing macroblocks_base and the stored
         * dimensions, so a half-initialized state would be reused as if it
         * were valid on the next frame. Freeing everything forces a retry.
         */
        vp8_decode_flush(s->avctx);
        return AVERROR(ENOMEM);
    }

    /* entry -1 is reachable from the first macroblock (left neighbour) */
    s->macroblocks        = s->macroblocks_base + 1;

    return 0;
}
252

    
253
/**
 * Read the segmentation part of the frame header from the header
 * range coder: the update_map flag, optionally new per-segment
 * quantizer / loop-filter values, and optionally new segment-id
 * tree probabilities.
 */
static void parse_segment_info(VP8Context *s)
{
    VP56RangeCoder *c = &s->c;
    int seg, node;

    s->segmentation.update_map = vp8_rac_get(c);

    /* update segment feature data */
    if (vp8_rac_get(c)) {
        s->segmentation.absolute_vals = vp8_rac_get(c);

        for (seg = 0; seg < 4; seg++)
            s->segmentation.base_quant[seg]   = vp8_rac_get_sint(c, 7);

        for (seg = 0; seg < 4; seg++)
            s->segmentation.filter_level[seg] = vp8_rac_get_sint(c, 6);
    }

    if (!s->segmentation.update_map)
        return;

    /* a probability that is not transmitted is set to 255 */
    for (node = 0; node < 3; node++) {
        if (vp8_rac_get(c))
            s->prob->segmentid[node] = vp8_rac_get_uint(c, 8);
        else
            s->prob->segmentid[node] = 255;
    }
}
273

    
274
/**
 * Read new loop-filter deltas from the header range coder: first the
 * four per-reference-frame deltas, then the per-mode deltas for
 * MODE_I4x4 through VP8_MVMODE_SPLIT.
 */
static void update_lf_deltas(VP8Context *s)
{
    VP56RangeCoder *c = &s->c;
    int ref, mode;

    for (ref = 0; ref < 4; ref++)
        s->lf_delta.ref[ref] = vp8_rac_get_sint(c, 6);

    /* entries below MODE_I4x4 are left untouched */
    for (mode = MODE_I4x4; mode <= VP8_MVMODE_SPLIT; mode++)
        s->lf_delta.mode[mode] = vp8_rac_get_sint(c, 6);
}
285

    
286
/**
 * Read the number of coefficient partitions from the header and set up
 * one range decoder per partition.
 *
 * The data at buf starts with a table of 3-byte little-endian sizes, one
 * for every partition except the last; the last partition takes all
 * bytes that remain.
 *
 * @param s        decoder context
 * @param buf      start of the partition size table
 * @param buf_size number of bytes available at buf
 * @return 0 on success, -1 if the table or a partition exceeds buf_size
 */
static int setup_partitions(VP8Context *s, const uint8_t *buf, int buf_size)
{
    const uint8_t *size_table = buf;
    int last, table_bytes, n;

    s->num_coeff_partitions = 1 << vp8_rac_get_uint(&s->c, 2);
    last        = s->num_coeff_partitions - 1;
    table_bytes = 3 * last;

    /* skip over the size table */
    buf      += table_bytes;
    buf_size -= table_bytes;
    if (buf_size < 0)
        return -1;

    for (n = 0; n < last; n++) {
        int part_size = AV_RL24(size_table + 3*n);

        if (part_size > buf_size)
            return -1;

        ff_vp56_init_range_decoder(&s->coeff_partition[n], buf, part_size);
        buf      += part_size;
        buf_size -= part_size;
    }

    /* the final partition has no explicit size */
    ff_vp56_init_range_decoder(&s->coeff_partition[last], buf, buf_size);

    return 0;
}
311

    
312
/**
 * Read the quantizer indices from the frame header and fill s->qmat[]
 * with the dequantization factors of all four segments.
 *
 * Every index is clipped to 0..127 before the table lookup.
 */
static void get_quants(VP8Context *s)
{
    VP56RangeCoder *c = &s->c;
    int seg;

    /* the read order below is the bitstream order */
    int yac_qi     = vp8_rac_get_uint(c, 7);
    int ydc_delta  = vp8_rac_get_sint(c, 4);
    int y2dc_delta = vp8_rac_get_sint(c, 4);
    int y2ac_delta = vp8_rac_get_sint(c, 4);
    int uvdc_delta = vp8_rac_get_sint(c, 4);
    int uvac_delta = vp8_rac_get_sint(c, 4);

    for (seg = 0; seg < 4; seg++) {
        int qi = yac_qi;

        if (s->segmentation.enabled) {
            /* per-segment value is either absolute or a delta to yac_qi */
            qi = s->segmentation.base_quant[seg];
            if (!s->segmentation.absolute_vals)
                qi += yac_qi;
        }

        s->qmat[seg].luma_qmul[0]    = vp8_dc_qlookup[av_clip(qi + ydc_delta, 0, 127)];
        s->qmat[seg].luma_qmul[1]    = vp8_ac_qlookup[av_clip(qi, 0, 127)];

        s->qmat[seg].luma_dc_qmul[0] = 2 * vp8_dc_qlookup[av_clip(qi + y2dc_delta, 0, 127)];
        s->qmat[seg].luma_dc_qmul[1] = 155 * vp8_ac_qlookup[av_clip(qi + y2ac_delta, 0, 127)] / 100;
        /* lower bound for the luma dc block AC factor */
        s->qmat[seg].luma_dc_qmul[1] = FFMAX(s->qmat[seg].luma_dc_qmul[1], 8);

        s->qmat[seg].chroma_qmul[0]  = vp8_dc_qlookup[av_clip(qi + uvdc_delta, 0, 127)];
        /* upper bound for the chroma DC factor */
        s->qmat[seg].chroma_qmul[0]  = FFMIN(s->qmat[seg].chroma_qmul[0], 132);
        s->qmat[seg].chroma_qmul[1]  = vp8_ac_qlookup[av_clip(qi + uvac_delta, 0, 127)];
    }
}
343

    
344
/**
345
 * Determine which buffers golden and altref should be updated with after this frame.
346
 * The spec isn't clear here, so I'm going by my understanding of what libvpx does
347
 *
348
 * Intra frames update all 3 references
349
 * Inter frames update VP56_FRAME_PREVIOUS if the update_last flag is set
350
 * If the update (golden|altref) flag is set, it's updated with the current frame
351
 *      if update_last is set, and VP56_FRAME_PREVIOUS otherwise.
352
 * If the flag is not set, the number read means:
353
 *      0: no update
354
 *      1: VP56_FRAME_PREVIOUS
355
 *      2: update golden with altref, or update altref with golden
356
 */
357
static VP56Frame ref_to_update(VP8Context *s, int update, VP56Frame ref)
358
{
359
    VP56RangeCoder *c = &s->c;
360

    
361
    if (update)
362
        return VP56_FRAME_CURRENT;
363

    
364
    switch (vp8_rac_get_uint(c, 2)) {
365
    case 1:
366
        return VP56_FRAME_PREVIOUS;
367
    case 2:
368
        return (ref == VP56_FRAME_GOLDEN) ? VP56_FRAME_GOLDEN2 : VP56_FRAME_GOLDEN;
369
    }
370
    return VP56_FRAME_NONE;
371
}
372

    
373
/**
 * Read the golden / altref update information of an inter frame and
 * store the resulting source frames in s->update_golden and
 * s->update_altref.
 */
static void update_refs(VP8Context *s)
{
    VP56RangeCoder *c = &s->c;

    /* both flags precede the optional copy modes in the bitstream */
    int golden_flag = vp8_rac_get(c);
    int altref_flag = vp8_rac_get(c);

    s->update_golden = ref_to_update(s, golden_flag, VP56_FRAME_GOLDEN);
    s->update_altref = ref_to_update(s, altref_flag, VP56_FRAME_GOLDEN2);
}
383

    
384
/**
 * Parse the uncompressed frame tag and the first (header) partition.
 *
 * Sets up the header range coder and the coefficient partitions, and
 * updates segmentation, loop filter, quantizer, reference-update and
 * probability state in the context. On keyframes the probabilities and
 * the segmentation state are reset to their defaults and the frame
 * dimensions are read; buffers are reallocated if the size changed.
 *
 * @param s        decoder context
 * @param buf      start of the frame data
 * @param buf_size number of bytes available at buf
 * @return 0 on success, a negative AVERROR code on failure
 */
static int decode_frame_header(VP8Context *s, const uint8_t *buf, int buf_size)
{
    VP56RangeCoder *c = &s->c;
    int header_size, hscale, vscale, i, j, k, l, m, ret;
    int width  = s->avctx->width;
    int height = s->avctx->height;

    /* the 3-byte frame tag is read unconditionally below */
    if (buf_size < 3) {
        av_log(s->avctx, AV_LOG_ERROR, "Insufficient data (%d) for header\n", buf_size);
        return AVERROR_INVALIDDATA;
    }

    s->keyframe  = !(buf[0] & 1);
    s->profile   =  (buf[0]>>1) & 7;
    s->invisible = !(buf[0] & 0x10);
    header_size  = AV_RL24(buf) >> 5;
    buf      += 3;
    buf_size -= 3;

    if (s->profile > 3)
        av_log(s->avctx, AV_LOG_WARNING, "Unknown profile %d\n", s->profile);

    if (!s->profile)
        memcpy(s->put_pixels_tab, s->vp8dsp.put_vp8_epel_pixels_tab, sizeof(s->put_pixels_tab));
    else    // profile 1-3 use bilinear, 4+ aren't defined so whatever
        memcpy(s->put_pixels_tab, s->vp8dsp.put_vp8_bilinear_pixels_tab, sizeof(s->put_pixels_tab));

    /* keyframes carry 7 extra bytes (start code + dimensions) before the header partition */
    if (header_size > buf_size - 7*s->keyframe) {
        av_log(s->avctx, AV_LOG_ERROR, "Header size larger than data provided\n");
        return AVERROR_INVALIDDATA;
    }

    if (s->keyframe) {
        if (AV_RL24(buf) != 0x2a019d) {
            av_log(s->avctx, AV_LOG_ERROR, "Invalid start code 0x%x\n", AV_RL24(buf));
            return AVERROR_INVALIDDATA;
        }
        /* 14 bits of size, the top 2 bits of each field are the scale */
        width  = AV_RL16(buf+3) & 0x3fff;
        height = AV_RL16(buf+5) & 0x3fff;
        hscale = buf[4] >> 6;
        vscale = buf[6] >> 6;
        buf      += 7;
        buf_size -= 7;

        if (hscale || vscale)
            av_log_missing_feature(s->avctx, "Upscaling", 1);

        /* keyframes replace golden and altref and reset the probabilities */
        s->update_golden = s->update_altref = VP56_FRAME_CURRENT;
        for (i = 0; i < 4; i++)
            for (j = 0; j < 16; j++)
                memcpy(s->prob->token[i][j], vp8_token_default_probs[i][vp8_coeff_band[j]],
                       sizeof(s->prob->token[i][j]));
        memcpy(s->prob->pred16x16, vp8_pred16x16_prob_inter, sizeof(s->prob->pred16x16));
        memcpy(s->prob->pred8x8c , vp8_pred8x8c_prob_inter , sizeof(s->prob->pred8x8c));
        memcpy(s->prob->mvc      , vp8_mv_default_prob     , sizeof(s->prob->mvc));
        memset(&s->segmentation, 0, sizeof(s->segmentation));
    }

    if (!s->macroblocks_base || /* first frame */
        width != s->avctx->width || height != s->avctx->height) {
        /* the comparison must apply to the stored value, not be the stored value */
        if ((ret = update_dimensions(s, width, height)) < 0)
            return ret;
    }

    ff_vp56_init_range_decoder(c, buf, header_size);
    buf      += header_size;
    buf_size -= header_size;

    if (s->keyframe) {
        if (vp8_rac_get(c))
            av_log(s->avctx, AV_LOG_WARNING, "Unspecified colorspace\n");
        vp8_rac_get(c); // whether we can skip clamping in dsp functions
    }

    if ((s->segmentation.enabled = vp8_rac_get(c)))
        parse_segment_info(s);
    else
        s->segmentation.update_map = 0; // FIXME: move this to some init function?

    s->filter.simple    = vp8_rac_get(c);
    s->filter.level     = vp8_rac_get_uint(c, 6);
    s->filter.sharpness = vp8_rac_get_uint(c, 3);

    if ((s->lf_delta.enabled = vp8_rac_get(c)))
        if (vp8_rac_get(c))
            update_lf_deltas(s);

    /* buf now points just past the header partition */
    if (setup_partitions(s, buf, buf_size)) {
        av_log(s->avctx, AV_LOG_ERROR, "Invalid partitions\n");
        return AVERROR_INVALIDDATA;
    }

    get_quants(s);

    if (!s->keyframe) {
        update_refs(s);
        s->sign_bias[VP56_FRAME_GOLDEN]               = vp8_rac_get(c);
        s->sign_bias[VP56_FRAME_GOLDEN2 /* altref */] = vp8_rac_get(c);
    }

    // if we aren't saving this frame's probabilities for future frames,
    // make a copy of the current probabilities
    if (!(s->update_probabilities = vp8_rac_get(c)))
        s->prob[1] = s->prob[0];

    s->update_last = s->keyframe || vp8_rac_get(c);

    /* token probability updates: one value per band, fanned out to every
     * coefficient position that belongs to the band */
    for (i = 0; i < 4; i++)
        for (j = 0; j < 8; j++)
            for (k = 0; k < 3; k++)
                for (l = 0; l < NUM_DCT_TOKENS-1; l++)
                    if (vp56_rac_get_prob_branchy(c, vp8_token_update_probs[i][j][k][l])) {
                        int prob = vp8_rac_get_uint(c, 8);
                        for (m = 0; vp8_coeff_band_indexes[j][m] >= 0; m++)
                            s->prob->token[i][vp8_coeff_band_indexes[j][m]][k][l] = prob;
                    }

    if ((s->mbskip_enabled = vp8_rac_get(c)))
        s->prob->mbskip = vp8_rac_get_uint(c, 8);

    if (!s->keyframe) {
        s->prob->intra  = vp8_rac_get_uint(c, 8);
        s->prob->last   = vp8_rac_get_uint(c, 8);
        s->prob->golden = vp8_rac_get_uint(c, 8);

        if (vp8_rac_get(c))
            for (i = 0; i < 4; i++)
                s->prob->pred16x16[i] = vp8_rac_get_uint(c, 8);
        if (vp8_rac_get(c))
            for (i = 0; i < 3; i++)
                s->prob->pred8x8c[i]  = vp8_rac_get_uint(c, 8);

        // 17.2 MV probability update
        for (i = 0; i < 2; i++)
            for (j = 0; j < 19; j++)
                if (vp56_rac_get_prob_branchy(c, vp8_mv_update_prob[i][j]))
                    s->prob->mvc[i][j] = vp8_rac_get_nn(c);
    }

    return 0;
}
520

    
521
static av_always_inline
522
void clamp_mv(VP8Context *s, VP56mv *dst, const VP56mv *src, int mb_x, int mb_y)
523
{
524
#define MARGIN (16 << 2)
525
    dst->x = av_clip(src->x, -((mb_x << 6) + MARGIN),
526
                     ((s->mb_width  - 1 - mb_x) << 6) + MARGIN);
527
    dst->y = av_clip(src->y, -((mb_y << 6) + MARGIN),
528
                     ((s->mb_height - 1 - mb_y) << 6) + MARGIN);
529
}
530

    
531
static av_always_inline
532
void find_near_mvs(VP8Context *s, VP8Macroblock *mb,
533
                   VP56mv near[2], VP56mv *best, uint8_t cnt[4])
534
{
535
    VP8Macroblock *mb_edge[3] = { mb + 2 /* top */,
536
                                  mb - 1 /* left */,
537
                                  mb + 1 /* top-left */ };
538
    enum { EDGE_TOP, EDGE_LEFT, EDGE_TOPLEFT };
539
    VP56mv near_mv[4]  = {{ 0 }};
540
    enum { CNT_ZERO, CNT_NEAREST, CNT_NEAR, CNT_SPLITMV };
541
    int idx = CNT_ZERO;
542
    int best_idx = CNT_ZERO;
543
    int cur_sign_bias = s->sign_bias[mb->ref_frame];
544
    int *sign_bias = s->sign_bias;
545

    
546
    /* Process MB on top, left and top-left */
547
    #define MV_EDGE_CHECK(n)\
548
    {\
549
        VP8Macroblock *edge = mb_edge[n];\
550
        int edge_ref = edge->ref_frame;\
551
        if (edge_ref != VP56_FRAME_CURRENT) {\
552
            uint32_t mv = AV_RN32A(&edge->mv);\
553
            if (mv) {\
554
                if (cur_sign_bias != sign_bias[edge_ref]) {\
555
                    /* SWAR negate of the values in mv. */\
556
                    mv = ~mv;\
557
                    mv = ((mv&0x7fff7fff) + 0x00010001) ^ (mv&0x80008000);\
558
                }\
559
                if (!n || mv != AV_RN32A(&near_mv[idx]))\
560
                    AV_WN32A(&near_mv[++idx], mv);\
561
                cnt[idx]      += 1 + (n != 2);\
562
            } else\
563
                cnt[CNT_ZERO] += 1 + (n != 2);\
564
        }\
565
    }
566
    MV_EDGE_CHECK(0)
567
    MV_EDGE_CHECK(1)
568
    MV_EDGE_CHECK(2)
569

    
570
    /* If we have three distinct MVs, merge first and last if they're the same */
571
    if (cnt[CNT_SPLITMV] && AV_RN32A(&near_mv[1+EDGE_TOP]) == AV_RN32A(&near_mv[1+EDGE_TOPLEFT]))
572
        cnt[CNT_NEAREST] += 1;
573

    
574
    cnt[CNT_SPLITMV] = ((mb_edge[EDGE_LEFT]->mode   == VP8_MVMODE_SPLIT) +
575
                        (mb_edge[EDGE_TOP]->mode    == VP8_MVMODE_SPLIT)) * 2 +
576
                       (mb_edge[EDGE_TOPLEFT]->mode == VP8_MVMODE_SPLIT);
577

    
578
    /* Swap near and nearest if necessary */
579
    if (cnt[CNT_NEAR] > cnt[CNT_NEAREST]) {
580
        FFSWAP(uint8_t,     cnt[CNT_NEAREST],     cnt[CNT_NEAR]);
581
        FFSWAP( VP56mv, near_mv[CNT_NEAREST], near_mv[CNT_NEAR]);
582
    }
583

    
584
    /* Choose the best mv out of 0,0 and the nearest mv */
585
    if (cnt[CNT_NEAREST] >= cnt[CNT_ZERO])
586
        best_idx = CNT_NEAREST;
587

    
588
    mb->mv  = near_mv[best_idx];
589
    near[0] = near_mv[CNT_NEAREST];
590
    near[1] = near_mv[CNT_NEAR];
591
}
592

    
593
/**
594
 * Motion vector coding, 17.1.
595
 */
596
static int read_mv_component(VP56RangeCoder *c, const uint8_t *p)
597
{
598
    int bit, x = 0;
599

    
600
    if (vp56_rac_get_prob_branchy(c, p[0])) {
601
        int i;
602

    
603
        for (i = 0; i < 3; i++)
604
            x += vp56_rac_get_prob(c, p[9 + i]) << i;
605
        for (i = 9; i > 3; i--)
606
            x += vp56_rac_get_prob(c, p[9 + i]) << i;
607
        if (!(x & 0xFFF0) || vp56_rac_get_prob(c, p[12]))
608
            x += 8;
609
    } else {
610
        // small_mvtree
611
        const uint8_t *ps = p+2;
612
        bit = vp56_rac_get_prob(c, *ps);
613
        ps += 1 + 3*bit;
614
        x  += 4*bit;
615
        bit = vp56_rac_get_prob(c, *ps);
616
        ps += 1 + bit;
617
        x  += 2*bit;
618
        x  += vp56_rac_get_prob(c, *ps);
619
    }
620

    
621
    return (x && vp56_rac_get_prob(c, p[1])) ? -x : x;
622
}
623

    
624
static av_always_inline
625
const uint8_t *get_submv_prob(uint32_t left, uint32_t top)
626
{
627
    if (left == top)
628
        return vp8_submv_prob[4-!!left];
629
    if (!top)
630
        return vp8_submv_prob[2];
631
    return vp8_submv_prob[1-!!left];
632
}
633

    
634
/**
635
 * Split motion vector prediction, 16.4.
636
 * @returns the number of motion vectors parsed (2, 4 or 16)
637
 */
638
static av_always_inline
639
int decode_splitmvs(VP8Context *s, VP56RangeCoder *c, VP8Macroblock *mb)
640
{
641
    int part_idx;
642
    int n, num;
643
    VP8Macroblock *top_mb  = &mb[2];
644
    VP8Macroblock *left_mb = &mb[-1];
645
    const uint8_t *mbsplits_left = vp8_mbsplits[left_mb->partitioning],
646
                  *mbsplits_top = vp8_mbsplits[top_mb->partitioning],
647
                  *mbsplits_cur, *firstidx;
648
    VP56mv *top_mv  = top_mb->bmv;
649
    VP56mv *left_mv = left_mb->bmv;
650
    VP56mv *cur_mv  = mb->bmv;
651

    
652
    if (vp56_rac_get_prob_branchy(c, vp8_mbsplit_prob[0])) {
653
        if (vp56_rac_get_prob_branchy(c, vp8_mbsplit_prob[1])) {
654
            part_idx = VP8_SPLITMVMODE_16x8 + vp56_rac_get_prob(c, vp8_mbsplit_prob[2]);
655
        } else {
656
            part_idx = VP8_SPLITMVMODE_8x8;
657
        }
658
    } else {
659
        part_idx = VP8_SPLITMVMODE_4x4;
660
    }
661

    
662
    num = vp8_mbsplit_count[part_idx];
663
    mbsplits_cur = vp8_mbsplits[part_idx],
664
    firstidx = vp8_mbfirstidx[part_idx];
665
    mb->partitioning = part_idx;
666

    
667
    for (n = 0; n < num; n++) {
668
        int k = firstidx[n];
669
        uint32_t left, above;
670
        const uint8_t *submv_prob;
671

    
672
        if (!(k & 3))
673
            left = AV_RN32A(&left_mv[mbsplits_left[k + 3]]);
674
        else
675
            left  = AV_RN32A(&cur_mv[mbsplits_cur[k - 1]]);
676
        if (k <= 3)
677
            above = AV_RN32A(&top_mv[mbsplits_top[k + 12]]);
678
        else
679
            above = AV_RN32A(&cur_mv[mbsplits_cur[k - 4]]);
680

    
681
        submv_prob = get_submv_prob(left, above);
682

    
683
        if (vp56_rac_get_prob_branchy(c, submv_prob[0])) {
684
            if (vp56_rac_get_prob_branchy(c, submv_prob[1])) {
685
                if (vp56_rac_get_prob_branchy(c, submv_prob[2])) {
686
                    mb->bmv[n].y = mb->mv.y + read_mv_component(c, s->prob->mvc[0]);
687
                    mb->bmv[n].x = mb->mv.x + read_mv_component(c, s->prob->mvc[1]);
688
                } else {
689
                    AV_ZERO32(&mb->bmv[n]);
690
                }
691
            } else {
692
                AV_WN32A(&mb->bmv[n], above);
693
            }
694
        } else {
695
            AV_WN32A(&mb->bmv[n], left);
696
        }
697
    }
698

    
699
    return num;
700
}
701

    
702
static av_always_inline
703
void decode_intra4x4_modes(VP8Context *s, VP56RangeCoder *c,
704
                           int mb_x, int keyframe)
705
{
706
    uint8_t *intra4x4 = s->intra4x4_pred_mode_mb;
707
    if (keyframe) {
708
        int x, y;
709
        uint8_t* const top = s->intra4x4_pred_mode_top + 4 * mb_x;
710
        uint8_t* const left = s->intra4x4_pred_mode_left;
711
        for (y = 0; y < 4; y++) {
712
            for (x = 0; x < 4; x++) {
713
                const uint8_t *ctx;
714
                ctx = vp8_pred4x4_prob_intra[top[x]][left[y]];
715
                *intra4x4 = vp8_rac_get_tree(c, vp8_pred4x4_tree, ctx);
716
                left[y] = top[x] = *intra4x4;
717
                intra4x4++;
718
            }
719
        }
720
    } else {
721
        int i;
722
        for (i = 0; i < 16; i++)
723
            intra4x4[i] = vp8_rac_get_tree(c, vp8_pred4x4_tree, vp8_pred4x4_prob_inter);
724
    }
725
}
726

    
727
static av_always_inline
728
void decode_mb_mode(VP8Context *s, VP8Macroblock *mb, int mb_x, int mb_y, uint8_t *segment)
729
{
730
    VP56RangeCoder *c = &s->c;
731

    
732
    if (s->segmentation.update_map)
733
        *segment = vp8_rac_get_tree(c, vp8_segmentid_tree, s->prob->segmentid);
734
    s->segment = *segment;
735

    
736
    mb->skip = s->mbskip_enabled ? vp56_rac_get_prob(c, s->prob->mbskip) : 0;
737

    
738
    if (s->keyframe) {
739
        mb->mode = vp8_rac_get_tree(c, vp8_pred16x16_tree_intra, vp8_pred16x16_prob_intra);
740

    
741
        if (mb->mode == MODE_I4x4) {
742
            decode_intra4x4_modes(s, c, mb_x, 1);
743
        } else {
744
            const uint32_t modes = vp8_pred4x4_mode[mb->mode] * 0x01010101u;
745
            AV_WN32A(s->intra4x4_pred_mode_top + 4 * mb_x, modes);
746
            AV_WN32A(s->intra4x4_pred_mode_left, modes);
747
        }
748

    
749
        s->chroma_pred_mode = vp8_rac_get_tree(c, vp8_pred8x8c_tree, vp8_pred8x8c_prob_intra);
750
        mb->ref_frame = VP56_FRAME_CURRENT;
751
    } else if (vp56_rac_get_prob_branchy(c, s->prob->intra)) {
752
        VP56mv near[2], best;
753
        uint8_t cnt[4] = { 0 };
754

    
755
        // inter MB, 16.2
756
        if (vp56_rac_get_prob_branchy(c, s->prob->last))
757
            mb->ref_frame = vp56_rac_get_prob(c, s->prob->golden) ?
758
                VP56_FRAME_GOLDEN2 /* altref */ : VP56_FRAME_GOLDEN;
759
        else
760
            mb->ref_frame = VP56_FRAME_PREVIOUS;
761
        s->ref_count[mb->ref_frame-1]++;
762

    
763
        // motion vectors, 16.3
764
        find_near_mvs(s, mb, near, &best, cnt);
765
        if (vp56_rac_get_prob_branchy(c, vp8_mode_contexts[cnt[0]][0])) {
766
            mb->mode = VP8_MVMODE_MV;
767
            if (vp56_rac_get_prob_branchy(c, vp8_mode_contexts[cnt[1]][1])) {
768
                if (vp56_rac_get_prob_branchy(c, vp8_mode_contexts[cnt[2]][2])) {
769
                    if (vp56_rac_get_prob_branchy(c, vp8_mode_contexts[cnt[3]][3])) {
770
                        mb->mode = VP8_MVMODE_SPLIT;
771
                        clamp_mv(s, &mb->mv, &mb->mv, mb_x, mb_y);
772
                        mb->mv = mb->bmv[decode_splitmvs(s, c, mb) - 1];
773
                    } else {
774
                        clamp_mv(s, &mb->mv, &mb->mv, mb_x, mb_y);
775
                        mb->mv.y += read_mv_component(c, s->prob->mvc[0]);
776
                        mb->mv.x += read_mv_component(c, s->prob->mvc[1]);
777
                    }
778
                } else
779
                    clamp_mv(s, &mb->mv, &near[1], mb_x, mb_y);
780
            } else
781
                clamp_mv(s, &mb->mv, &near[0], mb_x, mb_y);
782
        } else {
783
            mb->mode = VP8_MVMODE_ZERO;
784
            AV_ZERO32(&mb->mv);
785
        }
786
        if (mb->mode != VP8_MVMODE_SPLIT) {
787
            mb->partitioning = VP8_SPLITMVMODE_NONE;
788
            mb->bmv[0] = mb->mv;
789
        }
790
    } else {
791
        // intra MB, 16.1
792
        mb->mode = vp8_rac_get_tree(c, vp8_pred16x16_tree_inter, s->prob->pred16x16);
793

    
794
        if (mb->mode == MODE_I4x4)
795
            decode_intra4x4_modes(s, c, mb_x, 0);
796

    
797
        s->chroma_pred_mode = vp8_rac_get_tree(c, vp8_pred8x8c_tree, s->prob->pred8x8c);
798
        mb->ref_frame = VP56_FRAME_CURRENT;
799
        mb->partitioning = VP8_SPLITMVMODE_NONE;
800
        AV_ZERO32(&mb->bmv[0]);
801
    }
802
}
803

    
804
/**
 * Decode the coefficient tokens of one block, entering the token loop at the
 * DCT_0 test: the loop is entered through "goto skip_eob", so no end-of-block
 * check is made before the first token is read.
 *
 * @param c arithmetic bitstream reader context
 * @param block destination for block coefficients
 * @param probs probabilities to use when reading trees from the bitstream
 * @param i initial coeff index, 0 unless a separate DC block is coded
 * @param token_prob probabilities used for the first token; afterwards they
 *                   are re-selected from probs depending on whether the
 *                   previous token was a zero, a one or a larger value
 * @param qmul array holding the dc/ac dequant factor at position 0/1
 * @return the coefficient index at which decoding stopped: the index of the
 *         last coeff decoded plus one, or 16 if the block ran out of
 *         positions without an end-of-block token
 */
815
static int decode_block_coeffs_internal(VP56RangeCoder *c, DCTELEM block[16],
816
                                        uint8_t probs[8][3][NUM_DCT_TOKENS-1],
817
                                        int i, uint8_t *token_prob, int16_t qmul[2])
818
{
819
    // skip the end-of-block test for the first token
    goto skip_eob;
820
    do {
821
        int coeff;
822
        if (!vp56_rac_get_prob_branchy(c, token_prob[0]))   // DCT_EOB
823
            return i;
824

    
825
skip_eob:
826
        if (!vp56_rac_get_prob_branchy(c, token_prob[1])) { // DCT_0
827
            if (++i == 16)
828
                return i; // invalid input; blocks should end with EOB
829
            // after a zero token the next token is read without an EOB test
            token_prob = probs[i][0];
830
            goto skip_eob;
831
        }
832

    
833
        if (!vp56_rac_get_prob_branchy(c, token_prob[2])) { // DCT_1
834
            coeff = 1;
835
            token_prob = probs[i+1][1];
836
        } else {
837
            if (!vp56_rac_get_prob_branchy(c, token_prob[3])) { // DCT 2,3,4
838
                coeff = vp56_rac_get_prob_branchy(c, token_prob[4]);
839
                if (coeff)
840
                    coeff += vp56_rac_get_prob(c, token_prob[5]);
841
                coeff += 2;
842
            } else {
843
                // DCT_CAT*
844
                if (!vp56_rac_get_prob_branchy(c, token_prob[6])) {
845
                    if (!vp56_rac_get_prob_branchy(c, token_prob[7])) { // DCT_CAT1
846
                        coeff  = 5 + vp56_rac_get_prob(c, vp8_dct_cat1_prob[0]);
847
                    } else {                                    // DCT_CAT2
848
                        coeff  = 7;
849
                        coeff += vp56_rac_get_prob(c, vp8_dct_cat2_prob[0]) << 1;
850
                        coeff += vp56_rac_get_prob(c, vp8_dct_cat2_prob[1]);
851
                    }
852
                } else {    // DCT_CAT3 and up
853
                    int a = vp56_rac_get_prob(c, token_prob[8]);
854
                    int b = vp56_rac_get_prob(c, token_prob[9+a]);
855
                    int cat = (a<<1) + b;
856
                    // base value is 11, 19, 35 or 67 for cat = 0..3
                    coeff  = 3 + (8<<cat);
857
                    coeff += vp8_rac_get_coeff(c, vp8_dct_cat_prob[cat]);
858
                }
859
            }
860
            token_prob = probs[i+1][2];
861
        }
862
        // read the sign, then dequantize with qmul[0] at index 0 and qmul[1] elsewhere
        block[zigzag_scan[i]] = (vp8_rac_get(c) ? -coeff : coeff) * qmul[!!i];
863
    } while (++i < 16);
864

    
865
    return i;
866
}
867

    
868
static av_always_inline
869
int decode_block_coeffs(VP56RangeCoder *c, DCTELEM block[16],
870
                        uint8_t probs[8][3][NUM_DCT_TOKENS-1],
871
                        int i, int zero_nhood, int16_t qmul[2])
872
{
873
    uint8_t *token_prob = probs[i][zero_nhood];
874
    if (!vp56_rac_get_prob_branchy(c, token_prob[0]))   // DCT_EOB
875
        return 0;
876
    return decode_block_coeffs_internal(c, block, probs, i, token_prob, qmul);
877
}
878

    
879
static av_always_inline
880
void decode_mb_coeffs(VP8Context *s, VP56RangeCoder *c, VP8Macroblock *mb,
881
                      uint8_t t_nnz[9], uint8_t l_nnz[9])
882
{
883
    int i, x, y, luma_start = 0, luma_ctx = 3;
884
    int nnz_pred, nnz, nnz_total = 0;
885
    int segment = s->segment;
886
    int block_dc = 0;
887

    
888
    if (mb->mode != MODE_I4x4 && mb->mode != VP8_MVMODE_SPLIT) {
889
        nnz_pred = t_nnz[8] + l_nnz[8];
890

    
891
        // decode DC values and do hadamard
892
        nnz = decode_block_coeffs(c, s->block_dc, s->prob->token[1], 0, nnz_pred,
893
                                  s->qmat[segment].luma_dc_qmul);
894
        l_nnz[8] = t_nnz[8] = !!nnz;
895
        if (nnz) {
896
            nnz_total += nnz;
897
            block_dc = 1;
898
            if (nnz == 1)
899
                s->vp8dsp.vp8_luma_dc_wht_dc(s->block, s->block_dc);
900
            else
901
                s->vp8dsp.vp8_luma_dc_wht(s->block, s->block_dc);
902
        }
903
        luma_start = 1;
904
        luma_ctx = 0;
905
    }
906

    
907
    // luma blocks
908
    for (y = 0; y < 4; y++)
909
        for (x = 0; x < 4; x++) {
910
            nnz_pred = l_nnz[y] + t_nnz[x];
911
            nnz = decode_block_coeffs(c, s->block[y][x], s->prob->token[luma_ctx], luma_start,
912
                                      nnz_pred, s->qmat[segment].luma_qmul);
913
            // nnz+block_dc may be one more than the actual last index, but we don't care
914
            s->non_zero_count_cache[y][x] = nnz + block_dc;
915
            t_nnz[x] = l_nnz[y] = !!nnz;
916
            nnz_total += nnz;
917
        }
918

    
919
    // chroma blocks
920
    // TODO: what to do about dimensions? 2nd dim for luma is x,
921
    // but for chroma it's (y<<1)|x
922
    for (i = 4; i < 6; i++)
923
        for (y = 0; y < 2; y++)
924
            for (x = 0; x < 2; x++) {
925
                nnz_pred = l_nnz[i+2*y] + t_nnz[i+2*x];
926
                nnz = decode_block_coeffs(c, s->block[i][(y<<1)+x], s->prob->token[2], 0,
927
                                          nnz_pred, s->qmat[segment].chroma_qmul);
928
                s->non_zero_count_cache[i][(y<<1)+x] = nnz;
929
                t_nnz[i+2*x] = l_nnz[i+2*y] = !!nnz;
930
                nnz_total += nnz;
931
            }
932

    
933
    // if there were no coded coeffs despite the macroblock not being marked skip,
934
    // we MUST not do the inner loop filter and should not do IDCT
935
    // Since skip isn't used for bitstream prediction, just manually set it.
936
    if (!nnz_total)
937
        mb->skip = 1;
938
}
939

    
940
static av_always_inline
941
void backup_mb_border(uint8_t *top_border, uint8_t *src_y, uint8_t *src_cb, uint8_t *src_cr,
942
                      int linesize, int uvlinesize, int simple)
943
{
944
    AV_COPY128(top_border, src_y + 15*linesize);
945
    if (!simple) {
946
        AV_COPY64(top_border+16, src_cb + 7*uvlinesize);
947
        AV_COPY64(top_border+24, src_cr + 7*uvlinesize);
948
    }
949
}
950

    
951
/**
 * Exchange (or copy) 8-byte groups between the saved border data in
 * top_border and the pixel row directly above the macroblock in the frame.
 *
 * The byte offsets used here match the layout written by backup_mb_border():
 * luma at 0-15, Cb at 16-23, Cr at 24-31. top_border-32 is used as the entry
 * of the macroblock to the left, top_border+32 as the one to the right.
 *
 * @param top_border border entry belonging to the current macroblock
 * @param src_y,src_cb,src_cr macroblock position in the three planes
 * @param mb_x,mb_y macroblock position; mb_width is the row length
 * @param simple when set, chroma is only processed for the first row
 * @param xchg selects swap (non-zero) or copy (zero) for the groups that are
 *             not unconditionally swapped
 */
static av_always_inline
952
void xchg_mb_border(uint8_t *top_border, uint8_t *src_y, uint8_t *src_cb, uint8_t *src_cr,
953
                    int linesize, int uvlinesize, int mb_x, int mb_y, int mb_width,
954
                    int simple, int xchg)
955
{
956
    uint8_t *top_border_m1 = top_border-32;     // for TL prediction
957
    // step all three plane pointers up to the row above the macroblock
    src_y  -=   linesize;
958
    src_cb -= uvlinesize;
959
    src_cr -= uvlinesize;
960

    
961
// XCHG(a,b,xchg): AV_SWAP64(b,a) when xchg is non-zero, AV_COPY64(b,a) otherwise
// (presumably copying a into b -- confirm against the AV_COPY64 definition)
#define XCHG(a,b,xchg) do {                     \
962
        if (xchg) AV_SWAP64(b,a);               \
963
        else      AV_COPY64(b,a);               \
964
    } while (0)
965

    
966
    // luma: the 8 bytes left of the block and the first 8 bytes follow xchg,
    // the remaining ones are always swapped
    XCHG(top_border_m1+8, src_y-8, xchg);
967
    XCHG(top_border,      src_y,   xchg);
968
    XCHG(top_border+8,    src_y+8, 1);
969
    // the top-right 8 bytes come from the next entry, absent in the last column
    if (mb_x < mb_width-1)
970
        XCHG(top_border+32, src_y+16, 1);
971

    
972
    // only copy chroma for normal loop filter
973
    // or to initialize the top row to 127
974
    if (!simple || !mb_y) {
975
        XCHG(top_border_m1+16, src_cb-8, xchg);
976
        XCHG(top_border_m1+24, src_cr-8, xchg);
977
        XCHG(top_border+16,    src_cb, 1);
978
        XCHG(top_border+24,    src_cr, 1);
979
    }
980
}
981

    
982
static av_always_inline
983
int check_dc_pred8x8_mode(int mode, int mb_x, int mb_y)
984
{
985
    if (!mb_x) {
986
        return mb_y ? TOP_DC_PRED8x8 : DC_128_PRED8x8;
987
    } else {
988
        return mb_y ? mode : LEFT_DC_PRED8x8;
989
    }
990
}
991

    
992
static av_always_inline
993
int check_tm_pred8x8_mode(int mode, int mb_x, int mb_y)
994
{
995
    if (!mb_x) {
996
        return mb_y ? VERT_PRED8x8 : DC_129_PRED8x8;
997
    } else {
998
        return mb_y ? mode : HOR_PRED8x8;
999
    }
1000
}
1001

    
1002
static av_always_inline
1003
int check_intra_pred8x8_mode(int mode, int mb_x, int mb_y)
1004
{
1005
    if (mode == DC_PRED8x8) {
1006
        return check_dc_pred8x8_mode(mode, mb_x, mb_y);
1007
    } else {
1008
        return mode;
1009
    }
1010
}
1011

    
1012
static av_always_inline
1013
int check_intra_pred8x8_mode_emuedge(int mode, int mb_x, int mb_y)
1014
{
1015
    switch (mode) {
1016
    case DC_PRED8x8:
1017
        return check_dc_pred8x8_mode(mode, mb_x, mb_y);
1018
    case VERT_PRED8x8:
1019
        return !mb_y ? DC_127_PRED8x8 : mode;
1020
    case HOR_PRED8x8:
1021
        return !mb_x ? DC_129_PRED8x8 : mode;
1022
    case PLANE_PRED8x8 /*TM*/:
1023
        return check_tm_pred8x8_mode(mode, mb_x, mb_y);
1024
    }
1025
    return mode;
1026
}
1027

    
1028
static av_always_inline
1029
int check_tm_pred4x4_mode(int mode, int mb_x, int mb_y)
1030
{
1031
    if (!mb_x) {
1032
        return mb_y ? VERT_VP8_PRED : DC_129_PRED;
1033
    } else {
1034
        return mb_y ? mode : HOR_VP8_PRED;
1035
    }
1036
}
1037

    
1038
static av_always_inline
1039
int check_intra_pred4x4_mode_emuedge(int mode, int mb_x, int mb_y, int *copy_buf)
1040
{
1041
    switch (mode) {
1042
    case VERT_PRED:
1043
        if (!mb_x && mb_y) {
1044
            *copy_buf = 1;
1045
            return mode;
1046
        }
1047
        /* fall-through */
1048
    case DIAG_DOWN_LEFT_PRED:
1049
    case VERT_LEFT_PRED:
1050
        return !mb_y ? DC_127_PRED : mode;
1051
    case HOR_PRED:
1052
        if (!mb_y) {
1053
            *copy_buf = 1;
1054
            return mode;
1055
        }
1056
        /* fall-through */
1057
    case HOR_UP_PRED:
1058
        return !mb_x ? DC_129_PRED : mode;
1059
    case TM_VP8_PRED:
1060
        return check_tm_pred4x4_mode(mode, mb_x, mb_y);
1061
    case DC_PRED: // 4x4 DC doesn't use the same "H.264-style" exceptions as 16x16/8x8 DC
1062
    case DIAG_DOWN_RIGHT_PRED:
1063
    case VERT_RIGHT_PRED:
1064
    case HOR_DOWN_PRED:
1065
        if (!mb_y || !mb_x)
1066
            *copy_buf = 1;
1067
        return mode;
1068
    }
1069
    return mode;
1070
}
1071

    
1072
static av_always_inline
1073
void intra_predict(VP8Context *s, uint8_t *dst[3], VP8Macroblock *mb,
1074
                   int mb_x, int mb_y)
1075
{
1076
    AVCodecContext *avctx = s->avctx;
1077
    int x, y, mode, nnz, tr;
1078

    
1079
    // for the first row, we need to run xchg_mb_border to init the top edge to 127
1080
    // otherwise, skip it if we aren't going to deblock
1081
    if (!(avctx->flags & CODEC_FLAG_EMU_EDGE && !mb_y) && (s->deblock_filter || !mb_y))
1082
        xchg_mb_border(s->top_border[mb_x+1], dst[0], dst[1], dst[2],
1083
                       s->linesize, s->uvlinesize, mb_x, mb_y, s->mb_width,
1084
                       s->filter.simple, 1);
1085

    
1086
    if (mb->mode < MODE_I4x4) {
1087
        if (avctx->flags & CODEC_FLAG_EMU_EDGE) { // tested
1088
            mode = check_intra_pred8x8_mode_emuedge(mb->mode, mb_x, mb_y);
1089
        } else {
1090
            mode = check_intra_pred8x8_mode(mb->mode, mb_x, mb_y);
1091
        }
1092
        s->hpc.pred16x16[mode](dst[0], s->linesize);
1093
    } else {
1094
        uint8_t *ptr = dst[0];
1095
        uint8_t *intra4x4 = s->intra4x4_pred_mode_mb;
1096
        uint8_t tr_top[4] = { 127, 127, 127, 127 };
1097

    
1098
        // all blocks on the right edge of the macroblock use bottom edge
1099
        // the top macroblock for their topright edge
1100
        uint8_t *tr_right = ptr - s->linesize + 16;
1101

    
1102
        // if we're on the right edge of the frame, said edge is extended
1103
        // from the top macroblock
1104
        if (!(!mb_y && avctx->flags & CODEC_FLAG_EMU_EDGE) &&
1105
            mb_x == s->mb_width-1) {
1106
            tr = tr_right[-1]*0x01010101;
1107
            tr_right = (uint8_t *)&tr;
1108
        }
1109

    
1110
        if (mb->skip)
1111
            AV_ZERO128(s->non_zero_count_cache);
1112

    
1113
        for (y = 0; y < 4; y++) {
1114
            uint8_t *topright = ptr + 4 - s->linesize;
1115
            for (x = 0; x < 4; x++) {
1116
                int copy = 0, linesize = s->linesize;
1117
                uint8_t *dst = ptr+4*x;
1118
                DECLARE_ALIGNED(4, uint8_t, copy_dst)[5*8];
1119

    
1120
                if ((y == 0 || x == 3) && mb_y == 0 && avctx->flags & CODEC_FLAG_EMU_EDGE) {
1121
                    topright = tr_top;
1122
                } else if (x == 3)
1123
                    topright = tr_right;
1124

    
1125
                if (avctx->flags & CODEC_FLAG_EMU_EDGE) { // mb_x+x or mb_y+y is a hack but works
1126
                    mode = check_intra_pred4x4_mode_emuedge(intra4x4[x], mb_x + x, mb_y + y, &copy);
1127
                    if (copy) {
1128
                        dst = copy_dst + 12;
1129
                        linesize = 8;
1130
                        if (!(mb_y + y)) {
1131
                            copy_dst[3] = 127U;
1132
                            AV_WN32A(copy_dst+4, 127U * 0x01010101U);
1133
                        } else {
1134
                            AV_COPY32(copy_dst+4, ptr+4*x-s->linesize);
1135
                            if (!(mb_x + x)) {
1136
                                copy_dst[3] = 129U;
1137
                            } else {
1138
                                copy_dst[3] = ptr[4*x-s->linesize-1];
1139
                            }
1140
                        }
1141
                        if (!(mb_x + x)) {
1142
                            copy_dst[11] =
1143
                            copy_dst[19] =
1144
                            copy_dst[27] =
1145
                            copy_dst[35] = 129U;
1146
                        } else {
1147
                            copy_dst[11] = ptr[4*x              -1];
1148
                            copy_dst[19] = ptr[4*x+s->linesize  -1];
1149
                            copy_dst[27] = ptr[4*x+s->linesize*2-1];
1150
                            copy_dst[35] = ptr[4*x+s->linesize*3-1];
1151
                        }
1152
                    }
1153
                } else {
1154
                    mode = intra4x4[x];
1155
                }
1156
                s->hpc.pred4x4[mode](dst, topright, linesize);
1157
                if (copy) {
1158
                    AV_COPY32(ptr+4*x              , copy_dst+12);
1159
                    AV_COPY32(ptr+4*x+s->linesize  , copy_dst+20);
1160
                    AV_COPY32(ptr+4*x+s->linesize*2, copy_dst+28);
1161
                    AV_COPY32(ptr+4*x+s->linesize*3, copy_dst+36);
1162
                }
1163

    
1164
                nnz = s->non_zero_count_cache[y][x];
1165
                if (nnz) {
1166
                    if (nnz == 1)
1167
                        s->vp8dsp.vp8_idct_dc_add(ptr+4*x, s->block[y][x], s->linesize);
1168
                    else
1169
                        s->vp8dsp.vp8_idct_add(ptr+4*x, s->block[y][x], s->linesize);
1170
                }
1171
                topright += 4;
1172
            }
1173

    
1174
            ptr   += 4*s->linesize;
1175
            intra4x4 += 4;
1176
        }
1177
    }
1178

    
1179
    if (avctx->flags & CODEC_FLAG_EMU_EDGE) {
1180
        mode = check_intra_pred8x8_mode_emuedge(s->chroma_pred_mode, mb_x, mb_y);
1181
    } else {
1182
        mode = check_intra_pred8x8_mode(s->chroma_pred_mode, mb_x, mb_y);
1183
    }
1184
    s->hpc.pred8x8[mode](dst[1], s->uvlinesize);
1185
    s->hpc.pred8x8[mode](dst[2], s->uvlinesize);
1186

    
1187
    if (!(avctx->flags & CODEC_FLAG_EMU_EDGE && !mb_y) && (s->deblock_filter || !mb_y))
1188
        xchg_mb_border(s->top_border[mb_x+1], dst[0], dst[1], dst[2],
1189
                       s->linesize, s->uvlinesize, mb_x, mb_y, s->mb_width,
1190
                       s->filter.simple, 0);
1191
}
1192

    
1193
/* Per-row lookup tables indexed by a subpel position in 0..7; the MC
 * functions below index them with mx/my (both masked with &7). */
static const uint8_t subpel_idx[3][8] = {
1194
    { 0, 1, 2, 1, 2, 1, 2, 1 }, // nr. of left extra pixels,
1195
                                // also function pointer index
1196
    { 0, 3, 5, 3, 5, 3, 5, 3 }, // nr. of extra pixels required
1197
    { 0, 2, 3, 2, 3, 2, 3, 2 }, // nr. of right extra pixels
1198
};
1199

    
1200
/**
1201
 * Generic MC function.
1202
 *
1203
 * @param s VP8 decoding context
1204
 * @param luma 1 for luma (Y) planes, 0 for chroma (Cb/Cr) planes
1205
 * @param dst target buffer for block data at block position
1206
 * @param src reference picture buffer at origin (0, 0)
1207
 * @param mv motion vector (relative to block position) to get pixel data from
1208
 * @param x_off horizontal position of block from origin (0, 0)
1209
 * @param y_off vertical position of block from origin (0, 0)
1210
 * @param block_w width of block (16, 8 or 4)
1211
 * @param block_h height of block (always same as block_w)
1212
 * @param width width of src/dst plane data
1213
 * @param height height of src/dst plane data
1214
 * @param linesize size of a single line of plane data, including padding
1215
 * @param mc_func motion compensation function pointers (bilinear or sixtap MC)
1216
 */
1217
static av_always_inline
1218
void vp8_mc_luma(VP8Context *s, uint8_t *dst, uint8_t *src, const VP56mv *mv,
1219
                 int x_off, int y_off, int block_w, int block_h,
1220
                 int width, int height, int linesize,
1221
                 vp8_mc_func mc_func[3][3])
1222
{
1223
    if (AV_RN32A(mv)) {
1224

    
1225
        int mx = (mv->x << 1)&7, mx_idx = subpel_idx[0][mx];
1226
        int my = (mv->y << 1)&7, my_idx = subpel_idx[0][my];
1227

    
1228
        x_off += mv->x >> 2;
1229
        y_off += mv->y >> 2;
1230

    
1231
        // edge emulation
1232
        src += y_off * linesize + x_off;
1233
        if (x_off < mx_idx || x_off >= width  - block_w - subpel_idx[2][mx] ||
1234
            y_off < my_idx || y_off >= height - block_h - subpel_idx[2][my]) {
1235
            s->dsp.emulated_edge_mc(s->edge_emu_buffer, src - my_idx * linesize - mx_idx, linesize,
1236
                                    block_w + subpel_idx[1][mx], block_h + subpel_idx[1][my],
1237
                                    x_off - mx_idx, y_off - my_idx, width, height);
1238
            src = s->edge_emu_buffer + mx_idx + linesize * my_idx;
1239
        }
1240
        mc_func[my_idx][mx_idx](dst, linesize, src, linesize, block_h, mx, my);
1241
    } else
1242
        mc_func[0][0](dst, linesize, src + y_off * linesize + x_off, linesize, block_h, 0, 0);
1243
}
1244

    
1245
static av_always_inline
1246
void vp8_mc_chroma(VP8Context *s, uint8_t *dst1, uint8_t *dst2, uint8_t *src1,
1247
                   uint8_t *src2, const VP56mv *mv, int x_off, int y_off,
1248
                   int block_w, int block_h, int width, int height, int linesize,
1249
                   vp8_mc_func mc_func[3][3])
1250
{
1251
    if (AV_RN32A(mv)) {
1252
        int mx = mv->x&7, mx_idx = subpel_idx[0][mx];
1253
        int my = mv->y&7, my_idx = subpel_idx[0][my];
1254

    
1255
        x_off += mv->x >> 3;
1256
        y_off += mv->y >> 3;
1257

    
1258
        // edge emulation
1259
        src1 += y_off * linesize + x_off;
1260
        src2 += y_off * linesize + x_off;
1261
        if (x_off < mx_idx || x_off >= width  - block_w - subpel_idx[2][mx] ||
1262
            y_off < my_idx || y_off >= height - block_h - subpel_idx[2][my]) {
1263
            s->dsp.emulated_edge_mc(s->edge_emu_buffer, src1 - my_idx * linesize - mx_idx, linesize,
1264
                                    block_w + subpel_idx[1][mx], block_h + subpel_idx[1][my],
1265
                                    x_off - mx_idx, y_off - my_idx, width, height);
1266
            src1 = s->edge_emu_buffer + mx_idx + linesize * my_idx;
1267
            mc_func[my_idx][mx_idx](dst1, linesize, src1, linesize, block_h, mx, my);
1268

    
1269
            s->dsp.emulated_edge_mc(s->edge_emu_buffer, src2 - my_idx * linesize - mx_idx, linesize,
1270
                                    block_w + subpel_idx[1][mx], block_h + subpel_idx[1][my],
1271
                                    x_off - mx_idx, y_off - my_idx, width, height);
1272
            src2 = s->edge_emu_buffer + mx_idx + linesize * my_idx;
1273
            mc_func[my_idx][mx_idx](dst2, linesize, src2, linesize, block_h, mx, my);
1274
        } else {
1275
            mc_func[my_idx][mx_idx](dst1, linesize, src1, linesize, block_h, mx, my);
1276
            mc_func[my_idx][mx_idx](dst2, linesize, src2, linesize, block_h, mx, my);
1277
        }
1278
    } else {
1279
        mc_func[0][0](dst1, linesize, src1 + y_off * linesize + x_off, linesize, block_h, 0, 0);
1280
        mc_func[0][0](dst2, linesize, src2 + y_off * linesize + x_off, linesize, block_h, 0, 0);
1281
    }
1282
}
1283

    
1284
static av_always_inline
1285
void vp8_mc_part(VP8Context *s, uint8_t *dst[3],
1286
                 AVFrame *ref_frame, int x_off, int y_off,
1287
                 int bx_off, int by_off,
1288
                 int block_w, int block_h,
1289
                 int width, int height, VP56mv *mv)
1290
{
1291
    VP56mv uvmv = *mv;
1292

    
1293
    /* Y */
1294
    vp8_mc_luma(s, dst[0] + by_off * s->linesize + bx_off,
1295
                ref_frame->data[0], mv, x_off + bx_off, y_off + by_off,
1296
                block_w, block_h, width, height, s->linesize,
1297
                s->put_pixels_tab[block_w == 8]);
1298

    
1299
    /* U/V */
1300
    if (s->profile == 3) {
1301
        uvmv.x &= ~7;
1302
        uvmv.y &= ~7;
1303
    }
1304
    x_off   >>= 1; y_off   >>= 1;
1305
    bx_off  >>= 1; by_off  >>= 1;
1306
    width   >>= 1; height  >>= 1;
1307
    block_w >>= 1; block_h >>= 1;
1308
    vp8_mc_chroma(s, dst[1] + by_off * s->uvlinesize + bx_off,
1309
                  dst[2] + by_off * s->uvlinesize + bx_off, ref_frame->data[1],
1310
                  ref_frame->data[2], &uvmv, x_off + bx_off, y_off + by_off,
1311
                  block_w, block_h, width, height, s->uvlinesize,
1312
                  s->put_pixels_tab[1 + (block_w == 4)]);
1313
}
1314

    
1315
/* Fetch pixels for estimated mv 4 macroblocks ahead.
1316
 * Optimized for 64-byte cache lines.  Inspired by ffh264 prefetch_motion. */
1317
static av_always_inline void prefetch_motion(VP8Context *s, VP8Macroblock *mb, int mb_x, int mb_y, int mb_xy, int ref)
1318
{
1319
    /* Don't prefetch refs that haven't been used very often this frame. */
1320
    if (s->ref_count[ref-1] > (mb_xy >> 5)) {
1321
        int x_off = mb_x << 4, y_off = mb_y << 4;
1322
        int mx = (mb->mv.x>>2) + x_off + 8;
1323
        int my = (mb->mv.y>>2) + y_off;
1324
        uint8_t **src= s->framep[ref]->data;
1325
        int off= mx + (my + (mb_x&3)*4)*s->linesize + 64;
1326
        s->dsp.prefetch(src[0]+off, s->linesize, 4);
1327
        off= (mx>>1) + ((my>>1) + (mb_x&7))*s->uvlinesize + 64;
1328
        s->dsp.prefetch(src[1]+off, src[2]-src[1], 2);
1329
    }
1330
}
1331

    
1332
/**
1333
 * Apply motion vectors to prediction buffer, chapter 18.
1334
 */
1335
static av_always_inline
1336
void inter_predict(VP8Context *s, uint8_t *dst[3], VP8Macroblock *mb,
1337
                   int mb_x, int mb_y)
1338
{
1339
    int x_off = mb_x << 4, y_off = mb_y << 4;
1340
    int width = 16*s->mb_width, height = 16*s->mb_height;
1341
    AVFrame *ref = s->framep[mb->ref_frame];
1342
    VP56mv *bmv = mb->bmv;
1343

    
1344
    switch (mb->partitioning) {
1345
    case VP8_SPLITMVMODE_NONE:
1346
        vp8_mc_part(s, dst, ref, x_off, y_off,
1347
                    0, 0, 16, 16, width, height, &mb->mv);
1348
        break;
1349
    case VP8_SPLITMVMODE_4x4: {
1350
        int x, y;
1351
        VP56mv uvmv;
1352

    
1353
        /* Y */
1354
        for (y = 0; y < 4; y++) {
1355
            for (x = 0; x < 4; x++) {
1356
                vp8_mc_luma(s, dst[0] + 4*y*s->linesize + x*4,
1357
                            ref->data[0], &bmv[4*y + x],
1358
                            4*x + x_off, 4*y + y_off, 4, 4,
1359
                            width, height, s->linesize,
1360
                            s->put_pixels_tab[2]);
1361
            }
1362
        }
1363

    
1364
        /* U/V */
1365
        x_off >>= 1; y_off >>= 1; width >>= 1; height >>= 1;
1366
        for (y = 0; y < 2; y++) {
1367
            for (x = 0; x < 2; x++) {
1368
                uvmv.x = mb->bmv[ 2*y    * 4 + 2*x  ].x +
1369
                         mb->bmv[ 2*y    * 4 + 2*x+1].x +
1370
                         mb->bmv[(2*y+1) * 4 + 2*x  ].x +
1371
                         mb->bmv[(2*y+1) * 4 + 2*x+1].x;
1372
                uvmv.y = mb->bmv[ 2*y    * 4 + 2*x  ].y +
1373
                         mb->bmv[ 2*y    * 4 + 2*x+1].y +
1374
                         mb->bmv[(2*y+1) * 4 + 2*x  ].y +
1375
                         mb->bmv[(2*y+1) * 4 + 2*x+1].y;
1376
                uvmv.x = (uvmv.x + 2 + (uvmv.x >> (INT_BIT-1))) >> 2;
1377
                uvmv.y = (uvmv.y + 2 + (uvmv.y >> (INT_BIT-1))) >> 2;
1378
                if (s->profile == 3) {
1379
                    uvmv.x &= ~7;
1380
                    uvmv.y &= ~7;
1381
                }
1382
                vp8_mc_chroma(s, dst[1] + 4*y*s->uvlinesize + x*4,
1383
                              dst[2] + 4*y*s->uvlinesize + x*4,
1384
                              ref->data[1], ref->data[2], &uvmv,
1385
                              4*x + x_off, 4*y + y_off, 4, 4,
1386
                              width, height, s->uvlinesize,
1387
                              s->put_pixels_tab[2]);
1388
            }
1389
        }
1390
        break;
1391
    }
1392
    case VP8_SPLITMVMODE_16x8:
1393
        vp8_mc_part(s, dst, ref, x_off, y_off,
1394
                    0, 0, 16, 8, width, height, &bmv[0]);
1395
        vp8_mc_part(s, dst, ref, x_off, y_off,
1396
                    0, 8, 16, 8, width, height, &bmv[1]);
1397
        break;
1398
    case VP8_SPLITMVMODE_8x16:
1399
        vp8_mc_part(s, dst, ref, x_off, y_off,
1400
                    0, 0, 8, 16, width, height, &bmv[0]);
1401
        vp8_mc_part(s, dst, ref, x_off, y_off,
1402
                    8, 0, 8, 16, width, height, &bmv[1]);
1403
        break;
1404
    case VP8_SPLITMVMODE_8x8:
1405
        vp8_mc_part(s, dst, ref, x_off, y_off,
1406
                    0, 0, 8, 8, width, height, &bmv[0]);
1407
        vp8_mc_part(s, dst, ref, x_off, y_off,
1408
                    8, 0, 8, 8, width, height, &bmv[1]);
1409
        vp8_mc_part(s, dst, ref, x_off, y_off,
1410
                    0, 8, 8, 8, width, height, &bmv[2]);
1411
        vp8_mc_part(s, dst, ref, x_off, y_off,
1412
                    8, 8, 8, 8, width, height, &bmv[3]);
1413
        break;
1414
    }
1415
}
1416

    
1417
/**
 * Add the inverse-transformed residual of a macroblock to the prediction.
 *
 * Luma is skipped for MODE_I4x4 macroblocks. For each group of four blocks
 * the four non-zero counts are read as one 32-bit word so that empty groups
 * and groups where every count is 0 or 1 can be handled with a single call.
 *
 * @param s VP8 decoding context
 * @param dst macroblock position in the Y, Cb and Cr planes
 * @param mb macroblock being reconstructed
 */
static av_always_inline void idct_mb(VP8Context *s, uint8_t *dst[3], VP8Macroblock *mb)
{
    int x, y, ch;

    if (mb->mode != MODE_I4x4) {
        uint8_t *row_dst = dst[0];

        for (y = 0; y < 4; y++, row_dst += 4*s->linesize) {
            uint32_t packed = AV_RN32A(s->non_zero_count_cache[y]);

            if (!packed)
                continue;
            if (!(packed&~0x01010101)) {
                // every count in this row is 0 or 1
                s->vp8dsp.vp8_idct_dc_add4y(row_dst, s->block[y], s->linesize);
                continue;
            }
            for (x = 0; x < 4; x++) {
                int count = s->non_zero_count_cache[y][x];

                if (!count)
                    continue;
                if (count == 1)
                    s->vp8dsp.vp8_idct_dc_add(row_dst+4*x, s->block[y][x], s->linesize);
                else
                    s->vp8dsp.vp8_idct_add(row_dst+4*x, s->block[y][x], s->linesize);
            }
        }
    }

    for (ch = 0; ch < 2; ch++) {
        uint32_t packed = AV_RN32A(s->non_zero_count_cache[4+ch]);
        uint8_t *ch_dst;

        if (!packed)
            continue;
        ch_dst = dst[1+ch];
        if (!(packed&~0x01010101)) {
            // every count in this plane is 0 or 1
            s->vp8dsp.vp8_idct_dc_add4uv(ch_dst, s->block[4+ch], s->uvlinesize);
            continue;
        }
        for (y = 0; y < 2; y++, ch_dst += 4*s->uvlinesize) {
            for (x = 0; x < 2; x++) {
                int count = s->non_zero_count_cache[4+ch][(y<<1)+x];

                if (!count)
                    continue;
                if (count == 1)
                    s->vp8dsp.vp8_idct_dc_add(ch_dst+4*x, s->block[4+ch][(y<<1)+x], s->uvlinesize);
                else
                    s->vp8dsp.vp8_idct_add(ch_dst+4*x, s->block[4+ch][(y<<1)+x], s->uvlinesize);
            }
        }
    }
}
1467

    
1468
/**
 * Compute the loop filter strength of one macroblock.
 *
 * @param s VP8 decoding context (segmentation, filter and lf_delta settings
 *          are read)
 * @param mb macroblock to compute the strength for
 * @param f receives the filter level, the interior limit and whether the
 *          inner edges have to be filtered
 */
static av_always_inline void filter_level_for_mb(VP8Context *s, VP8Macroblock *mb, VP8FilterStrength *f )
{
    int interior_limit, filter_level;

    if (s->segmentation.enabled) {
        filter_level = s->segmentation.filter_level[s->segment];
        // segment values are deltas on the frame level unless marked absolute
        if (!s->segmentation.absolute_vals)
            filter_level += s->filter.level;
    } else
        filter_level = s->filter.level;

    if (s->lf_delta.enabled) {
        filter_level += s->lf_delta.ref[mb->ref_frame];
        filter_level += s->lf_delta.mode[mb->mode];
    }

/* Like av_clip for inputs 0 and max, where max is equal to (2^n-1) */
#define POW2CLIP(x,max) (((x) & ~(max)) ? (-(x))>>31 & (max) : (x))
    filter_level = POW2CLIP(filter_level, 63);

    interior_limit = filter_level;
    if (s->filter.sharpness) {
        interior_limit >>= (s->filter.sharpness + 3) >> 2;
        interior_limit = FFMIN(interior_limit, 9 - s->filter.sharpness);
    }
    interior_limit = FFMAX(interior_limit, 1);

    f->filter_level = filter_level;
    f->inner_limit = interior_limit;
    // inner edges are filtered unless the macroblock is skipped and uses
    // neither 4x4 intra prediction nor split motion vectors
    f->inner_filter = !mb->skip || mb->mode == MODE_I4x4 || mb->mode == VP8_MVMODE_SPLIT;
}
1499

    
1500
/**
 * Run the normal loop filter on one macroblock (luma and both chroma planes).
 *
 * Edges are processed in this order: left macroblock edge, inner vertical
 * edges, top macroblock edge, inner horizontal edges. Macroblock edges are
 * skipped in the first column/row, inner edges when f->inner_filter is 0.
 *
 * @param s VP8 decoding context
 * @param dst macroblock position in the Y, Cb and Cr planes
 * @param f filter strength of this macroblock
 * @param mb_x,mb_y macroblock position
 */
static av_always_inline void filter_mb(VP8Context *s, uint8_t *dst[3], VP8FilterStrength *f, int mb_x, int mb_y)
{
    // high edge variance threshold, indexed by [keyframe][filter level]
    static const uint8_t hev_thresh_lut[2][64] = {
        { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1,
          2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
          3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
          3, 3, 3, 3 },
        { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1,
          1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
          2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
          2, 2, 2, 2 }
    };
    const int level    = f->filter_level;
    const int ilimit   = f->inner_limit;
    const int do_inner = f->inner_filter;
    const int ystride  = s->linesize;
    const int cstride  = s->uvlinesize;
    int edge_mb, edge_sub, hev, k;

    if (!level)
        return;

    edge_sub = 2*level + ilimit;
    edge_mb  = edge_sub + 4;
    hev      = hev_thresh_lut[s->keyframe][level];

    // vertical edges
    if (mb_x) {
        s->vp8dsp.vp8_h_loop_filter16y(dst[0], ystride, edge_mb, ilimit, hev);
        s->vp8dsp.vp8_h_loop_filter8uv(dst[1], dst[2], cstride, edge_mb, ilimit, hev);
    }
    if (do_inner) {
        for (k = 4; k < 16; k += 4)
            s->vp8dsp.vp8_h_loop_filter16y_inner(dst[0] + k, ystride, edge_sub, ilimit, hev);
        s->vp8dsp.vp8_h_loop_filter8uv_inner(dst[1] + 4, dst[2] + 4,
                                             cstride, edge_sub, ilimit, hev);
    }

    // horizontal edges
    if (mb_y) {
        s->vp8dsp.vp8_v_loop_filter16y(dst[0], ystride, edge_mb, ilimit, hev);
        s->vp8dsp.vp8_v_loop_filter8uv(dst[1], dst[2], cstride, edge_mb, ilimit, hev);
    }
    if (do_inner) {
        for (k = 4; k < 16; k += 4)
            s->vp8dsp.vp8_v_loop_filter16y_inner(dst[0] + k*ystride, ystride, edge_sub, ilimit, hev);
        s->vp8dsp.vp8_v_loop_filter8uv_inner(dst[1] + 4 * cstride, dst[2] + 4 * cstride,
                                             cstride, edge_sub, ilimit, hev);
    }
}
1569

    
1570
/**
 * Run the "simple" loop filter over one 16x16 block starting at dst.
 *
 * Nothing is done when f->filter_level is zero. Otherwise the horizontal
 * filter is applied first and the vertical filter second. In each direction
 * the edge at offset 0 is filtered with the stronger macroblock limit (and
 * only when mb_x, respectively mb_y, is non-zero), followed by the edges at
 * offsets 4, 8 and 12 when f->inner_filter is set.
 *
 * @param s     decoder context supplying the line stride and DSP functions
 * @param dst   first pixel of the block to filter
 * @param f     per-macroblock filter strength
 * @param mb_x  macroblock column; 0 disables the offset-0 horizontal pass
 * @param mb_y  macroblock row; 0 disables the offset-0 vertical pass
 */
static av_always_inline void filter_mb_simple(VP8Context *s, uint8_t *dst, VP8FilterStrength *f, int mb_x, int mb_y)
{
    const int level    = f->filter_level;
    const int interior = f->inner_filter;
    const int stride   = s->linesize;
    int block_lim, mb_lim, off;

    if (level == 0)
        return;

    /* The macroblock-edge limit is the sub-block limit plus 4. */
    block_lim = 2 * level + f->inner_limit;
    mb_lim    = block_lim + 4;

    /* Horizontal pass: offset-0 edge, then the three interior edges. */
    if (mb_x != 0)
        s->vp8dsp.vp8_h_loop_filter_simple(dst, stride, mb_lim);
    if (interior) {
        for (off = 4; off < 16; off += 4)
            s->vp8dsp.vp8_h_loop_filter_simple(dst + off, stride, block_lim);
    }

    /* Vertical pass: offset-0 edge, then the three interior edges. */
    if (mb_y != 0)
        s->vp8dsp.vp8_v_loop_filter_simple(dst, stride, mb_lim);
    if (interior) {
        for (off = 4; off < 16; off += 4)
            s->vp8dsp.vp8_v_loop_filter_simple(dst + off * stride, stride, block_lim);
    }
}
1600

    
1601
/**
 * Loop-filter every macroblock of row mb_y in the current frame.
 *
 * For each macroblock, backup_mb_border() is called on the three plane
 * pointers before filter_mb() is applied, so the border copy is taken from
 * the not-yet-filtered macroblock.
 *
 * @param s     decoder context
 * @param mb_y  index of the macroblock row to filter
 */
static void filter_mb_row(VP8Context *s, int mb_y)
{
    AVFrame *cur = s->framep[VP56_FRAME_CURRENT];
    VP8FilterStrength *strength = s->filter_strength;
    uint8_t *plane[3];
    int mb_x;

    /* Start of the row in each plane: 16 luma lines, 8 chroma lines per MB. */
    plane[0] = cur->data[0] + 16*mb_y*s->linesize;
    plane[1] = cur->data[1] +  8*mb_y*s->uvlinesize;
    plane[2] = cur->data[2] +  8*mb_y*s->uvlinesize;

    for (mb_x = 0; mb_x < s->mb_width; mb_x++) {
        backup_mb_border(s->top_border[mb_x+1], plane[0], plane[1], plane[2],
                         s->linesize, s->uvlinesize, 0);
        filter_mb(s, plane, &strength[mb_x], mb_x, mb_y);

        /* Step one macroblock to the right in every plane. */
        plane[0] += 16;
        plane[1] += 8;
        plane[2] += 8;
    }
}
1619

    
1620
/**
 * Apply the simple loop filter to every macroblock of row mb_y.
 *
 * Only plane 0 of the current frame is touched. backup_mb_border() is called
 * with NULL for the other two planes and with its last argument set to 1,
 * and it runs before filter_mb_simple() modifies the macroblock.
 *
 * @param s     decoder context
 * @param mb_y  index of the macroblock row to filter
 */
static void filter_mb_row_simple(VP8Context *s, int mb_y)
{
    VP8FilterStrength *strength = s->filter_strength;
    uint8_t *row = s->framep[VP56_FRAME_CURRENT]->data[0] + 16*mb_y*s->linesize;
    int mb_x;

    for (mb_x = 0; mb_x < s->mb_width; mb_x++) {
        uint8_t *dst = row + 16 * mb_x;   /* each macroblock is 16 pixels wide */

        backup_mb_border(s->top_border[mb_x+1], dst, NULL, NULL, s->linesize, 0, 1);
        filter_mb_simple(s, dst, &strength[mb_x], mb_x, mb_y);
    }
}
1632

    
1633
static int vp8_decode_frame(AVCodecContext *avctx, void *data, int *data_size,
1634
                            AVPacket *avpkt)
1635
{
1636
    VP8Context *s = avctx->priv_data;
1637
    int ret, mb_x, mb_y, i, y, referenced;
1638
    enum AVDiscard skip_thresh;
1639
    AVFrame *av_uninit(curframe);
1640

    
1641
    if ((ret = decode_frame_header(s, avpkt->data, avpkt->size)) < 0)
1642
        return ret;
1643

    
1644
    referenced = s->update_last || s->update_golden == VP56_FRAME_CURRENT
1645
                                || s->update_altref == VP56_FRAME_CURRENT;
1646

    
1647
    skip_thresh = !referenced ? AVDISCARD_NONREF :
1648
                    !s->keyframe ? AVDISCARD_NONKEY : AVDISCARD_ALL;
1649

    
1650
    if (avctx->skip_frame >= skip_thresh) {
1651
        s->invisible = 1;
1652
        goto skip_decode;
1653
    }
1654
    s->deblock_filter = s->filter.level && avctx->skip_loop_filter < skip_thresh;
1655

    
1656
    for (i = 0; i < 4; i++)
1657
        if (&s->frames[i] != s->framep[VP56_FRAME_PREVIOUS] &&
1658
            &s->frames[i] != s->framep[VP56_FRAME_GOLDEN] &&
1659
            &s->frames[i] != s->framep[VP56_FRAME_GOLDEN2]) {
1660
            curframe = s->framep[VP56_FRAME_CURRENT] = &s->frames[i];
1661
            break;
1662
        }
1663
    if (curframe->data[0])
1664
        avctx->release_buffer(avctx, curframe);
1665

    
1666
    curframe->key_frame = s->keyframe;
1667
    curframe->pict_type = s->keyframe ? FF_I_TYPE : FF_P_TYPE;
1668
    curframe->reference = referenced ? 3 : 0;
1669
    if ((ret = avctx->get_buffer(avctx, curframe))) {
1670
        av_log(avctx, AV_LOG_ERROR, "get_buffer() failed!\n");
1671
        return ret;
1672
    }
1673

    
1674
    // Given that arithmetic probabilities are updated every frame, it's quite likely
1675
    // that the values we have on a random interframe are complete junk if we didn't
1676
    // start decode on a keyframe. So just don't display anything rather than junk.
1677
    if (!s->keyframe && (!s->framep[VP56_FRAME_PREVIOUS] ||
1678
                         !s->framep[VP56_FRAME_GOLDEN] ||
1679
                         !s->framep[VP56_FRAME_GOLDEN2])) {
1680
        av_log(avctx, AV_LOG_WARNING, "Discarding interframe without a prior keyframe!\n");
1681
        return AVERROR_INVALIDDATA;
1682
    }
1683

    
1684
    s->linesize   = curframe->linesize[0];
1685
    s->uvlinesize = curframe->linesize[1];
1686

    
1687
    if (!s->edge_emu_buffer)
1688
        s->edge_emu_buffer = av_malloc(21*s->linesize);
1689

    
1690
    memset(s->top_nnz, 0, s->mb_width*sizeof(*s->top_nnz));
1691

    
1692
    /* Zero macroblock structures for top/top-left prediction from outside the frame. */
1693
    memset(s->macroblocks + s->mb_height*2 - 1, 0, (s->mb_width+1)*sizeof(*s->macroblocks));
1694

    
1695
    // top edge of 127 for intra prediction
1696
    if (!(avctx->flags & CODEC_FLAG_EMU_EDGE)) {
1697
        s->top_border[0][15] = s->top_border[0][23] = 127;
1698
        memset(s->top_border[1]-1, 127, s->mb_width*sizeof(*s->top_border)+1);
1699
    }
1700
    memset(s->ref_count, 0, sizeof(s->ref_count));
1701
    if (s->keyframe)
1702
        memset(s->intra4x4_pred_mode_top, DC_PRED, s->mb_width*4);
1703

    
1704
    for (mb_y = 0; mb_y < s->mb_height; mb_y++) {
1705
        VP56RangeCoder *c = &s->coeff_partition[mb_y & (s->num_coeff_partitions-1)];
1706
        VP8Macroblock *mb = s->macroblocks + (s->mb_height - mb_y - 1)*2;
1707
        int mb_xy = mb_y*s->mb_width;
1708
        uint8_t *dst[3] = {
1709
            curframe->data[0] + 16*mb_y*s->linesize,
1710
            curframe->data[1] +  8*mb_y*s->uvlinesize,
1711
            curframe->data[2] +  8*mb_y*s->uvlinesize
1712
        };
1713

    
1714
        memset(mb - 1, 0, sizeof(*mb));   // zero left macroblock
1715
        memset(s->left_nnz, 0, sizeof(s->left_nnz));
1716
        AV_WN32A(s->intra4x4_pred_mode_left, DC_PRED*0x01010101);
1717

    
1718
        // left edge of 129 for intra prediction
1719
        if (!(avctx->flags & CODEC_FLAG_EMU_EDGE)) {
1720
            for (i = 0; i < 3; i++)
1721
                for (y = 0; y < 16>>!!i; y++)
1722
                    dst[i][y*curframe->linesize[i]-1] = 129;
1723
            if (mb_y == 1) // top left edge is also 129
1724
                s->top_border[0][15] = s->top_border[0][23] = s->top_border[0][31] = 129;
1725
        }
1726

    
1727
        for (mb_x = 0; mb_x < s->mb_width; mb_x++, mb_xy++, mb++) {
1728
            /* Prefetch the current frame, 4 MBs ahead */
1729
            s->dsp.prefetch(dst[0] + (mb_x&3)*4*s->linesize + 64, s->linesize, 4);
1730
            s->dsp.prefetch(dst[1] + (mb_x&7)*s->uvlinesize + 64, dst[2] - dst[1], 2);
1731

    
1732
            decode_mb_mode(s, mb, mb_x, mb_y, s->segmentation_map + mb_xy);
1733

    
1734
            prefetch_motion(s, mb, mb_x, mb_y, mb_xy, VP56_FRAME_PREVIOUS);
1735

    
1736
            if (!mb->skip)
1737
                decode_mb_coeffs(s, c, mb, s->top_nnz[mb_x], s->left_nnz);
1738

    
1739
            if (mb->mode <= MODE_I4x4)
1740
                intra_predict(s, dst, mb, mb_x, mb_y);
1741
            else
1742
                inter_predict(s, dst, mb, mb_x, mb_y);
1743

    
1744
            prefetch_motion(s, mb, mb_x, mb_y, mb_xy, VP56_FRAME_GOLDEN);
1745

    
1746
            if (!mb->skip) {
1747
                idct_mb(s, dst, mb);
1748
            } else {
1749
                AV_ZERO64(s->left_nnz);
1750
                AV_WN64(s->top_nnz[mb_x], 0);   // array of 9, so unaligned
1751

    
1752
                // Reset DC block predictors if they would exist if the mb had coefficients
1753
                if (mb->mode != MODE_I4x4 && mb->mode != VP8_MVMODE_SPLIT) {
1754
                    s->left_nnz[8]      = 0;
1755
                    s->top_nnz[mb_x][8] = 0;
1756
                }
1757
            }
1758

    
1759
            if (s->deblock_filter)
1760
                filter_level_for_mb(s, mb, &s->filter_strength[mb_x]);
1761

    
1762
            prefetch_motion(s, mb, mb_x, mb_y, mb_xy, VP56_FRAME_GOLDEN2);
1763

    
1764
            dst[0] += 16;
1765
            dst[1] += 8;
1766
            dst[2] += 8;
1767
        }
1768
        if (s->deblock_filter) {
1769
            if (s->filter.simple)
1770
                filter_mb_row_simple(s, mb_y);
1771
            else
1772
                filter_mb_row(s, mb_y);
1773
        }
1774
    }
1775

    
1776
skip_decode:
1777
    // if future frames don't use the updated probabilities,
1778
    // reset them to the values we saved
1779
    if (!s->update_probabilities)
1780
        s->prob[0] = s->prob[1];
1781

    
1782
    // check if golden and altref are swapped
1783
    if (s->update_altref == VP56_FRAME_GOLDEN &&
1784
        s->update_golden == VP56_FRAME_GOLDEN2)
1785
        FFSWAP(AVFrame *, s->framep[VP56_FRAME_GOLDEN], s->framep[VP56_FRAME_GOLDEN2]);
1786
    else {
1787
        if (s->update_altref != VP56_FRAME_NONE)
1788
            s->framep[VP56_FRAME_GOLDEN2] = s->framep[s->update_altref];
1789

    
1790
        if (s->update_golden != VP56_FRAME_NONE)
1791
            s->framep[VP56_FRAME_GOLDEN] = s->framep[s->update_golden];
1792
    }
1793

    
1794
    if (s->update_last) // move cur->prev
1795
        s->framep[VP56_FRAME_PREVIOUS] = s->framep[VP56_FRAME_CURRENT];
1796

    
1797
    // release no longer referenced frames
1798
    for (i = 0; i < 4; i++)
1799
        if (s->frames[i].data[0] &&
1800
            &s->frames[i] != s->framep[VP56_FRAME_CURRENT] &&
1801
            &s->frames[i] != s->framep[VP56_FRAME_PREVIOUS] &&
1802
            &s->frames[i] != s->framep[VP56_FRAME_GOLDEN] &&
1803
            &s->frames[i] != s->framep[VP56_FRAME_GOLDEN2])
1804
            avctx->release_buffer(avctx, &s->frames[i]);
1805

    
1806
    if (!s->invisible) {
1807
        *(AVFrame*)data = *s->framep[VP56_FRAME_CURRENT];
1808
        *data_size = sizeof(AVFrame);
1809
    }
1810

    
1811
    return avpkt->size;
1812
}
1813

    
1814
/**
 * Set up the VP8 decoder's private context.
 *
 * Stores a back-pointer to the codec context, sets the output pixel format
 * to PIX_FMT_YUV420P and initializes the three DSP/prediction function
 * tables held in the context.
 *
 * @param avctx  codec context whose priv_data is the VP8Context to set up
 * @return always 0
 */
static av_cold int vp8_decode_init(AVCodecContext *avctx)
{
    VP8Context *const ctx = avctx->priv_data;

    avctx->pix_fmt = PIX_FMT_YUV420P;
    ctx->avctx     = avctx;

    dsputil_init(&ctx->dsp, avctx);
    ff_h264_pred_init(&ctx->hpc, CODEC_ID_VP8);
    ff_vp8dsp_init(&ctx->vp8dsp);

    return 0;
}
1827

    
1828
/**
 * Close callback of the decoder: delegates all teardown to
 * vp8_decode_flush().
 *
 * @param avctx  codec context being closed
 * @return always 0
 */
static av_cold int vp8_decode_free(AVCodecContext *avctx)
{
    vp8_decode_flush(avctx);

    return 0;
}
1833

    
1834
AVCodec ff_vp8_decoder = {
1835
    "vp8",
1836
    AVMEDIA_TYPE_VIDEO,
1837
    CODEC_ID_VP8,
1838
    sizeof(VP8Context),
1839
    vp8_decode_init,
1840
    NULL,
1841
    vp8_decode_free,
1842
    vp8_decode_frame,
1843
    CODEC_CAP_DR1,
1844
    .flush = vp8_decode_flush,
1845
    .long_name = NULL_IF_CONFIG_SMALL("On2 VP8"),
1846
};